/*
 * kmp_affinity.h -- header for affinity management
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef KMP_AFFINITY_H
#define KMP_AFFINITY_H

#include "kmp.h"
#include "kmp_os.h"

#if KMP_AFFINITY_SUPPORTED
#if KMP_USE_HWLOC
class KMPHwlocAffinity : public KMPAffinity {
public:
  class Mask : public KMPAffinity::Mask {
    hwloc_cpuset_t mask;

  public:
    Mask() {
      mask = hwloc_bitmap_alloc();
      this->zero();
    }
    ~Mask() { hwloc_bitmap_free(mask); }
    void set(int i) override { hwloc_bitmap_set(mask, i); }
    bool is_set(int i) const override { return hwloc_bitmap_isset(mask, i); }
    void clear(int i) override { hwloc_bitmap_clr(mask, i); }
    void zero() override { hwloc_bitmap_zero(mask); }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      hwloc_bitmap_copy(mask, convert->mask);
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      hwloc_bitmap_and(mask, mask, convert->mask);
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      hwloc_bitmap_or(mask, mask, convert->mask);
    }
    void bitwise_not() override { hwloc_bitmap_not(mask, mask); }
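    // Iteration follows hwloc's convention: hwloc_bitmap_first()/_next()
    // return -1 once no set bits remain, so end() is simply -1 and a full
    // traversal looks like
    //   for (int i = m.begin(); i != m.end(); i = m.next(i)) ...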
    int begin() const override { return hwloc_bitmap_first(mask); }
    int end() const override { return -1; }
    int next(int previous) const override {
      return hwloc_bitmap_next(mask, previous);
    }
    int get_system_affinity(bool abort_on_error) override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal get affinity operation when not capable");
      long retval =
          hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
    int set_system_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set affinity operation when not capable");
      long retval =
          hwloc_set_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
#if KMP_OS_WINDOWS
    int set_process_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set process affinity operation when not capable");
      int error = 0;
      const hwloc_topology_support *support =
          hwloc_topology_get_support(__kmp_hwloc_topology);
      if (support->cpubind->set_proc_cpubind) {
        int retval;
        retval = hwloc_set_cpubind(__kmp_hwloc_topology, mask,
                                   HWLOC_CPUBIND_PROCESS);
        if (retval >= 0)
          return 0;
        error = errno;
        if (abort_on_error)
          __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
#endif
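    // Returns the index of the single Windows processor group covered by this
    // mask, -1 if the mask is empty or spans more than one group, or 1 when
    // the machine has only one processor group.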
    int get_proc_group() const override {
      int group = -1;
#if KMP_OS_WINDOWS
      if (__kmp_num_proc_groups == 1) {
        return 1;
      }
      for (int i = 0; i < __kmp_num_proc_groups; i++) {
        // On Windows, the long type is always 32 bits, so each 64-bit
        // processor-group mask is read as two of hwloc's ulong chunks.
        unsigned long first_32_bits = hwloc_bitmap_to_ith_ulong(mask, i * 2);
        unsigned long second_32_bits =
            hwloc_bitmap_to_ith_ulong(mask, i * 2 + 1);
        if (first_32_bits == 0 && second_32_bits == 0) {
          continue;
        }
        if (group >= 0) {
          return -1;
        }
        group = i;
      }
#endif /* KMP_OS_WINDOWS */
      return group;
    }
  };
  void determine_capable(const char *var) override {
    const hwloc_topology_support *topology_support;
    if (__kmp_hwloc_topology == NULL) {
      if (hwloc_topology_init(&__kmp_hwloc_topology) < 0) {
        __kmp_hwloc_error = TRUE;
        if (__kmp_affinity_verbose)
          KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_init()");
      }
      if (hwloc_topology_load(__kmp_hwloc_topology) < 0) {
        __kmp_hwloc_error = TRUE;
        if (__kmp_affinity_verbose)
          KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_load()");
      }
    }
    topology_support = hwloc_topology_get_support(__kmp_hwloc_topology);
    // Is the system capable of setting/getting this thread's affinity?
    // Also, is topology discovery possible? (pu indicates ability to discover
    // processing units). And finally, were there no errors when calling any
    // hwloc_* API functions?
    if (topology_support && topology_support->cpubind->set_thisthread_cpubind &&
        topology_support->cpubind->get_thisthread_cpubind &&
        topology_support->discovery->pu && !__kmp_hwloc_error) {
      // enables affinity according to KMP_AFFINITY_CAPABLE() macro
      KMP_AFFINITY_ENABLE(TRUE);
    } else {
      // indicate that hwloc didn't work and disable affinity
      __kmp_hwloc_error = TRUE;
      KMP_AFFINITY_DISABLE();
    }
  }
  void bind_thread(int which) override {
    KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                "Illegal set affinity operation when not capable");
    KMPAffinity::Mask *mask;
    KMP_CPU_ALLOC_ON_STACK(mask);
    KMP_CPU_ZERO(mask);
    KMP_CPU_SET(which, mask);
    __kmp_set_system_affinity(mask, TRUE);
    KMP_CPU_FREE_FROM_STACK(mask);
  }
  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *hwloc_array = static_cast<Mask *>(array);
    delete[] hwloc_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *hwloc_array = static_cast<Mask *>(array);
    return &(hwloc_array[index]);
  }
  api_type get_api_type() const override { return HWLOC; }
};
#endif /* KMP_USE_HWLOC */

#if KMP_OS_LINUX || KMP_OS_FREEBSD
#if KMP_OS_LINUX
/* On some of the older OSes that we build on, these constants aren't present
   in <asm/unistd.h> #included from <sys/syscall.h>. They must be the same on
   all systems of the same arch where they are defined, and they cannot
   change; they are set in stone forever. */
#include <sys/syscall.h>
#if KMP_ARCH_X86 || KMP_ARCH_ARM
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 241
#elif __NR_sched_setaffinity != 241
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 242
#elif __NR_sched_getaffinity != 242
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_AARCH64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_X86_64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 203
#elif __NR_sched_setaffinity != 203
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 204
#elif __NR_sched_getaffinity != 204
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_PPC64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 222
#elif __NR_sched_setaffinity != 222
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 223
#elif __NR_sched_getaffinity != 223
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_MIPS
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 4239
#elif __NR_sched_setaffinity != 4239
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 4240
#elif __NR_sched_getaffinity != 4240
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_MIPS64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 5195
#elif __NR_sched_setaffinity != 5195
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 5196
#elif __NR_sched_getaffinity != 5196
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#else
#error Unknown or unsupported architecture
#endif /* KMP_ARCH_* */
#elif KMP_OS_FREEBSD
#include <pthread.h>
#include <pthread_np.h>
#endif
class KMPNativeAffinity : public KMPAffinity {
  class Mask : public KMPAffinity::Mask {
    typedef unsigned long mask_t;
    typedef decltype(__kmp_affin_mask_size) mask_size_type;
    static const unsigned int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
    static const mask_t ONE = 1;
    mask_size_type get_num_mask_types() const {
      return __kmp_affin_mask_size / sizeof(mask_t);
    }

  public:
    mask_t *mask;
    Mask() { mask = (mask_t *)__kmp_allocate(__kmp_affin_mask_size); }
    ~Mask() {
      if (mask)
        __kmp_free(mask);
    }
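    // The mask is a flat array of mask_t words: CPU i corresponds to bit
    // (i % BITS_PER_MASK_T) of word mask[i / BITS_PER_MASK_T]; e.g. with
    // 64-bit mask_t, CPU 70 is bit 6 of mask[1].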
    void set(int i) override {
      mask[i / BITS_PER_MASK_T] |= (ONE << (i % BITS_PER_MASK_T));
    }
    bool is_set(int i) const override {
      return (mask[i / BITS_PER_MASK_T] & (ONE << (i % BITS_PER_MASK_T)));
    }
    void clear(int i) override {
      mask[i / BITS_PER_MASK_T] &= ~(ONE << (i % BITS_PER_MASK_T));
    }
    void zero() override {
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = (mask_t)0;
    }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = convert->mask[i];
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] &= convert->mask[i];
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] |= convert->mask[i];
    }
    void bitwise_not() override {
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = ~(mask[i]);
    }
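    // Iteration is a linear scan over the flat bit array; end() is the total
    // number of bits the mask can represent, not the index of a set bit.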
    int begin() const override {
      int retval = 0;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int end() const override {
      int e;
      __kmp_type_convert(get_num_mask_types() * BITS_PER_MASK_T, &e);
      return e;
    }
    int next(int previous) const override {
      int retval = previous + 1;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int get_system_affinity(bool abort_on_error) override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal get affinity operation when not capable");
#if KMP_OS_LINUX
      long retval =
          syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask);
#elif KMP_OS_FREEBSD
      int r = pthread_getaffinity_np(pthread_self(), __kmp_affin_mask_size,
                                     reinterpret_cast<cpuset_t *>(mask));
      int retval = (r == 0 ? 0 : -1);
#endif
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
    int set_system_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set affinity operation when not capable");
#if KMP_OS_LINUX
      long retval =
          syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask);
#elif KMP_OS_FREEBSD
      int r = pthread_setaffinity_np(pthread_self(), __kmp_affin_mask_size,
                                     reinterpret_cast<cpuset_t *>(mask));
      int retval = (r == 0 ? 0 : -1);
#endif
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
  };
  void determine_capable(const char *env_var) override {
    __kmp_affinity_determine_capable(env_var);
  }
  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
  KMPAffinity::Mask *allocate_mask() override {
    KMPNativeAffinity::Mask *retval = new Mask();
    return retval;
  }
  void deallocate_mask(KMPAffinity::Mask *m) override {
    KMPNativeAffinity::Mask *native_mask =
        static_cast<KMPNativeAffinity::Mask *>(m);
    delete native_mask;
  }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *linux_array = static_cast<Mask *>(array);
    delete[] linux_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *linux_array = static_cast<Mask *>(array);
    return &(linux_array[index]);
  }
  api_type get_api_type() const override { return NATIVE_OS; }
};
#endif /* KMP_OS_LINUX || KMP_OS_FREEBSD */

#if KMP_OS_WINDOWS
class KMPNativeAffinity : public KMPAffinity {
  class Mask : public KMPAffinity::Mask {
    typedef ULONG_PTR mask_t;
    static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
    mask_t *mask;

  public:
    Mask() {
      mask = (mask_t *)__kmp_allocate(sizeof(mask_t) * __kmp_num_proc_groups);
    }
    ~Mask() {
      if (mask)
        __kmp_free(mask);
    }
    void set(int i) override {
      mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T));
    }
    bool is_set(int i) const override {
      return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T)));
    }
    void clear(int i) override {
      mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T));
    }
    void zero() override {
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = 0;
    }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = convert->mask[i];
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] &= convert->mask[i];
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] |= convert->mask[i];
    }
    void bitwise_not() override {
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = ~(mask[i]);
    }
    int begin() const override {
      int retval = 0;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int end() const override { return __kmp_num_proc_groups * BITS_PER_MASK_T; }
    int next(int previous) const override {
      int retval = previous + 1;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int set_process_affinity(bool abort_on_error) const override {
      if (__kmp_num_proc_groups <= 1) {
        if (!SetProcessAffinityMask(GetCurrentProcess(), *mask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      }
      return 0;
    }
    int set_system_affinity(bool abort_on_error) const override {
      if (__kmp_num_proc_groups > 1) {
        // Check for a valid mask.
        GROUP_AFFINITY ga;
        int group = get_proc_group();
        if (group < 0) {
          if (abort_on_error) {
            KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
          }
          return -1;
        }
        // Transform the bit vector into a GROUP_AFFINITY struct
        // and make the system call to set affinity.
        ga.Group = group;
        ga.Mask = mask[group];
        ga.Reserved[0] = ga.Reserved[1] = ga.Reserved[2] = 0;

        KMP_DEBUG_ASSERT(__kmp_SetThreadGroupAffinity != NULL);
        if (__kmp_SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) == 0) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      } else {
        if (!SetThreadAffinityMask(GetCurrentThread(), *mask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      }
      return 0;
    }
    int get_system_affinity(bool abort_on_error) override {
      if (__kmp_num_proc_groups > 1) {
        this->zero();
        GROUP_AFFINITY ga;
        KMP_DEBUG_ASSERT(__kmp_GetThreadGroupAffinity != NULL);
        if (__kmp_GetThreadGroupAffinity(GetCurrentThread(), &ga) == 0) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "GetThreadGroupAffinity()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        if ((ga.Group < 0) || (ga.Group > __kmp_num_proc_groups) ||
            (ga.Mask == 0)) {
          return -1;
        }
        mask[ga.Group] = ga.Mask;
      } else {
        mask_t newMask, sysMask, retval;
        if (!GetProcessAffinityMask(GetCurrentProcess(), &newMask, &sysMask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "GetProcessAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        retval = SetThreadAffinityMask(GetCurrentThread(), newMask);
        if (!retval) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        newMask = SetThreadAffinityMask(GetCurrentThread(), retval);
        if (!newMask) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
        }
        *mask = retval;
      }
      return 0;
    }
    int get_proc_group() const override {
      int group = -1;
      if (__kmp_num_proc_groups == 1) {
        return 1;
      }
      for (int i = 0; i < __kmp_num_proc_groups; i++) {
        if (mask[i] == 0)
          continue;
        if (group >= 0)
          return -1;
        group = i;
      }
      return group;
    }
  };
  void determine_capable(const char *env_var) override {
    __kmp_affinity_determine_capable(env_var);
  }
  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *windows_array = static_cast<Mask *>(array);
    delete[] windows_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *windows_array = static_cast<Mask *>(array);
    return &(windows_array[index]);
  }
  api_type get_api_type() const override { return NATIVE_OS; }
};
#endif /* KMP_OS_WINDOWS */
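
// A minimal usage sketch (illustration only; the real call sites live
// elsewhere in the runtime and may differ). Assuming the runtime's dispatch
// pointer is the __kmp_affinity_dispatch global, pinning the calling thread
// to logical CPU `proc` through the abstract interface looks roughly like:
//
//   KMPAffinity::Mask *m = __kmp_affinity_dispatch->allocate_mask();
//   m->zero();
//   m->set(proc);
//   m->set_system_affinity(/*abort_on_error=*/TRUE);
//   __kmp_affinity_dispatch->deallocate_mask(m);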
#endif /* KMP_AFFINITY_SUPPORTED */

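// An Address records one thread's position in the machine topology: labels[i]
// is the hardware id seen at level i (package, core, thread context, ...) and
// childNums[i] the child index of that entity within its parent level; depth
// is the number of valid levels. An AddrUnsPair couples an Address with an
// unsigned value (typically the OS processor id, as in the adr2os tables used
// below).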
class Address {
public:
  static const unsigned maxDepth = 32;
  unsigned labels[maxDepth];
  unsigned childNums[maxDepth];
  unsigned depth;
  unsigned leader;
  Address(unsigned _depth) : depth(_depth), leader(FALSE) {}
  Address &operator=(const Address &b) {
    depth = b.depth;
    for (unsigned i = 0; i < depth; i++) {
      labels[i] = b.labels[i];
      childNums[i] = b.childNums[i];
    }
    leader = FALSE;
    return *this;
  }
  bool operator==(const Address &b) const {
    if (depth != b.depth)
      return false;
    for (unsigned i = 0; i < depth; i++)
      if (labels[i] != b.labels[i])
        return false;
    return true;
  }
  bool isClose(const Address &b, int level) const {
    if (depth != b.depth)
      return false;
    if ((unsigned)level >= depth)
      return true;
    for (unsigned i = 0; i < (depth - level); i++)
      if (labels[i] != b.labels[i])
        return false;
    return true;
  }
  bool operator!=(const Address &b) const { return !operator==(b); }
  void print() const {
    unsigned i;
    printf("Depth: %u --- ", depth);
    for (i = 0; i < depth; i++) {
      printf("%u ", labels[i]);
    }
  }
};

class AddrUnsPair {
public:
  Address first;
  unsigned second;
  AddrUnsPair(Address _first, unsigned _second)
      : first(_first), second(_second) {}
  AddrUnsPair &operator=(const AddrUnsPair &b) {
    first = b.first;
    second = b.second;
    return *this;
  }
  void print() const {
    printf("first = ");
    first.print();
    printf(" --- second = %u", second);
  }
  bool operator==(const AddrUnsPair &b) const {
    if (first != b.first)
      return false;
    if (second != b.second)
      return false;
    return true;
  }
  bool operator!=(const AddrUnsPair &b) const { return !operator==(b); }
};

static int __kmp_affinity_cmp_Address_labels(const void *a, const void *b) {
  const Address *aa = &(((const AddrUnsPair *)a)->first);
  const Address *bb = &(((const AddrUnsPair *)b)->first);
  unsigned depth = aa->depth;
  unsigned i;
  KMP_DEBUG_ASSERT(depth == bb->depth);
  for (i = 0; i < depth; i++) {
    if (aa->labels[i] < bb->labels[i])
      return -1;
    if (aa->labels[i] > bb->labels[i])
      return 1;
  }
  return 0;
}

/* A structure for holding machine-specific hierarchy info to be computed once
   at init. This structure represents a mapping of threads to the actual machine
   hierarchy, or to our best guess at what the hierarchy might be, for the
   purpose of performing an efficient barrier. In the worst case, when there is
   no machine hierarchy information, it produces a tree suitable for a barrier,
   similar to the tree used in the hyper barrier. */
class hierarchy_info {
public:
  /* Good default values for number of leaves and branching factor, given no
     affinity information. Behaves a bit like hyper barrier. */
  static const kmp_uint32 maxLeaves = 4;
  static const kmp_uint32 minBranch = 4;
  /** Number of levels in the hierarchy. Typical levels are threads/core,
      cores/package or socket, packages/node, nodes/machine, etc. We don't want
      to get specific with nomenclature. When the machine is oversubscribed we
      add levels to duplicate the hierarchy, doubling the thread capacity of the
      hierarchy each time we add a level. */
  kmp_uint32 maxLevels;

  /** This is specifically the depth of the machine configuration hierarchy, in
      terms of the number of levels along the longest path from root to any
      leaf. It corresponds to the number of entries in numPerLevel if we exclude
      all but one trailing 1. */
  kmp_uint32 depth;
  kmp_uint32 base_num_threads;
  enum init_status { initialized = 0, not_initialized = 1, initializing = 2 };
  volatile kmp_int8 uninitialized; // 0=initialized, 1=not initialized,
  // 2=initialization in progress
  volatile kmp_int8 resizing; // 0=not resizing, 1=resizing

  /** Level 0 corresponds to leaves. numPerLevel[i] is the number of children
      the parent of a node at level i has. For example, if we have a machine
      with 4 packages, 4 cores/package and 2 HT per core, then numPerLevel =
      {2, 4, 4, 1, 1}. All empty levels are set to 1. */
  kmp_uint32 *numPerLevel;
  kmp_uint32 *skipPerLevel;
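  // For the 4-package / 4-core / 2-HT example above, init() computes
  // skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1], i.e.
  // {1, 2, 8, 32, ...}: entry i is the number of threads covered by one
  // subtree rooted at level i.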

  void deriveLevels(AddrUnsPair *adr2os, int num_addrs) {
    int hier_depth = adr2os[0].first.depth;
    int level = 0;
    for (int i = hier_depth - 1; i >= 0; --i) {
      int max = -1;
      for (int j = 0; j < num_addrs; ++j) {
        int next = adr2os[j].first.childNums[i];
        if (next > max)
          max = next;
      }
      numPerLevel[level] = max + 1;
      ++level;
    }
  }

  hierarchy_info()
      : maxLevels(7), depth(1), uninitialized(not_initialized), resizing(0) {}

  void fini() {
    if (!uninitialized && numPerLevel) {
      __kmp_free(numPerLevel);
      numPerLevel = NULL;
      uninitialized = not_initialized;
    }
  }

  void init(AddrUnsPair *adr2os, int num_addrs) {
    kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(
        &uninitialized, not_initialized, initializing);
    if (bool_result == 0) { // Wait for initialization
      while (TCR_1(uninitialized) != initialized)
        KMP_CPU_PAUSE();
      return;
    }
    KMP_DEBUG_ASSERT(bool_result == 1);

    /* Explicitly initialize the data fields here to prevent use of dirty
       values observed when the static library is re-initialized multiple times
       (e.g. when a non-OpenMP thread repeatedly launches/joins a thread that
       uses OpenMP). */
    depth = 1;
    resizing = 0;
    maxLevels = 7;
    numPerLevel =
        (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
    skipPerLevel = &(numPerLevel[maxLevels]);
    for (kmp_uint32 i = 0; i < maxLevels;
         ++i) { // init numPerLevel[*] to 1 item per level
      numPerLevel[i] = 1;
      skipPerLevel[i] = 1;
    }

    // Sort table by physical ID
    if (adr2os) {
      qsort(adr2os, num_addrs, sizeof(*adr2os),
            __kmp_affinity_cmp_Address_labels);
      deriveLevels(adr2os, num_addrs);
    } else {
      numPerLevel[0] = maxLeaves;
      numPerLevel[1] = num_addrs / maxLeaves;
      if (num_addrs % maxLeaves)
        numPerLevel[1]++;
    }

    base_num_threads = num_addrs;
    for (int i = maxLevels - 1; i >= 0;
         --i) // count non-empty levels to get depth
      if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
        depth++;

    kmp_uint32 branch = minBranch;
    if (numPerLevel[0] == 1)
      branch = num_addrs / maxLeaves;
    if (branch < minBranch)
      branch = minBranch;
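    // Rebalance overly wide levels: while a level exceeds the branching factor
    // (or maxLeaves on level 0), halve it (rounding odd counts up) and double
    // the level above it, bumping depth whenever a new non-trivial level
    // appears.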
    for (kmp_uint32 d = 0; d < depth - 1; ++d) { // optimize hierarchy width
      while (numPerLevel[d] > branch ||
             (d == 0 && numPerLevel[d] > maxLeaves)) { // max 4 on level 0!
        if (numPerLevel[d] & 1)
          numPerLevel[d]++;
        numPerLevel[d] = numPerLevel[d] >> 1;
        if (numPerLevel[d + 1] == 1)
          depth++;
        numPerLevel[d + 1] = numPerLevel[d + 1] << 1;
      }
      if (numPerLevel[0] == 1) {
        branch = branch >> 1;
        if (branch < 4)
          branch = minBranch;
      }
    }

    for (kmp_uint32 i = 1; i < depth; ++i)
      skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1];
    // Fill in hierarchy in the case of oversubscription
    for (kmp_uint32 i = depth; i < maxLevels; ++i)
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];
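    // For example, with depth 4 and skipPerLevel[3] == 32, the remaining
    // entries become 64, 128, ..., so each extra level doubles the thread
    // capacity of the tree (see the maxLevels comment above).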

    uninitialized = initialized; // One writer
  }

  // Resize the hierarchy if nproc changes to something larger than before
  void resize(kmp_uint32 nproc) {
    kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
    while (bool_result == 0) { // someone else is trying to resize
      KMP_CPU_PAUSE();
      if (nproc <= base_num_threads) // happy with other thread's resize
        return;
      else // try to resize
        bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
    }
    KMP_DEBUG_ASSERT(bool_result != 0);
    if (nproc <= base_num_threads)
      return; // happy with other thread's resize

    // Calculate new maxLevels
    kmp_uint32 old_sz = skipPerLevel[depth - 1];
    kmp_uint32 incs = 0, old_maxLevels = maxLevels;
    // First see if old maxLevels is enough to contain new size
    for (kmp_uint32 i = depth; i < maxLevels && nproc > old_sz; ++i) {
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];
      numPerLevel[i - 1] *= 2;
      old_sz *= 2;
      depth++;
    }
    if (nproc > old_sz) { // Not enough space, need to expand hierarchy
      while (nproc > old_sz) {
        old_sz *= 2;
        incs++;
        depth++;
      }
      maxLevels += incs;

      // Resize arrays
      kmp_uint32 *old_numPerLevel = numPerLevel;
      kmp_uint32 *old_skipPerLevel = skipPerLevel;
      numPerLevel = skipPerLevel = NULL;
      numPerLevel =
          (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
      skipPerLevel = &(numPerLevel[maxLevels]);

      // Copy old elements from old arrays
      for (kmp_uint32 i = 0; i < old_maxLevels; ++i) {
        numPerLevel[i] = old_numPerLevel[i];
        skipPerLevel[i] = old_skipPerLevel[i];
      }

      // Init new elements in arrays to 1
      for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i) {
        // init numPerLevel[*] to 1 item per level
        numPerLevel[i] = 1;
        skipPerLevel[i] = 1;
      }

      // Free old arrays
      __kmp_free(old_numPerLevel);
    }

    // Fill in oversubscription levels of hierarchy
    for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i)
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];

    base_num_threads = nproc;
    resizing = 0; // One writer
  }
};
#endif // KMP_AFFINITY_H