/*
 * kmp_affinity.h -- header for affinity management
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef KMP_AFFINITY_H
#define KMP_AFFINITY_H

#include "kmp.h"
#include "kmp_os.h"

#if KMP_AFFINITY_SUPPORTED
#if KMP_USE_HWLOC
class KMPHwlocAffinity : public KMPAffinity {
public:
  class Mask : public KMPAffinity::Mask {
    hwloc_cpuset_t mask;

  public:
    Mask() {
      mask = hwloc_bitmap_alloc();
      this->zero();
    }
    ~Mask() { hwloc_bitmap_free(mask); }
    void set(int i) override { hwloc_bitmap_set(mask, i); }
    bool is_set(int i) const override { return hwloc_bitmap_isset(mask, i); }
    void clear(int i) override { hwloc_bitmap_clr(mask, i); }
    void zero() override { hwloc_bitmap_zero(mask); }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      hwloc_bitmap_copy(mask, convert->mask);
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      hwloc_bitmap_and(mask, mask, convert->mask);
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      hwloc_bitmap_or(mask, mask, convert->mask);
    }
    void bitwise_not() override { hwloc_bitmap_not(mask, mask); }
    int begin() const override { return hwloc_bitmap_first(mask); }
    int end() const override { return -1; }
    int next(int previous) const override {
      return hwloc_bitmap_next(mask, previous);
    }
    int get_system_affinity(bool abort_on_error) override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal get affinity operation when not capable");
      long retval =
          hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
    int set_system_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set affinity operation when not capable");
      long retval =
          hwloc_set_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
#if KMP_OS_WINDOWS
    int set_process_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set process affinity operation when not capable");
      int error = 0;
      const hwloc_topology_support *support =
          hwloc_topology_get_support(__kmp_hwloc_topology);
      if (support->cpubind->set_proc_cpubind) {
        int retval;
        retval = hwloc_set_cpubind(__kmp_hwloc_topology, mask,
                                   HWLOC_CPUBIND_PROCESS);
        if (retval >= 0)
          return 0;
        error = errno;
        if (abort_on_error)
          __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
#endif
    int get_proc_group() const override {
      int group = -1;
#if KMP_OS_WINDOWS
      if (__kmp_num_proc_groups == 1) {
        return 1;
      }
      for (int i = 0; i < __kmp_num_proc_groups; i++) {
        // On Windows, the long type is always 32 bits, so each 64-bit
        // processor group mask spans two consecutive ulongs of the bitmap.
        unsigned long first_32_bits = hwloc_bitmap_to_ith_ulong(mask, i * 2);
        unsigned long second_32_bits =
            hwloc_bitmap_to_ith_ulong(mask, i * 2 + 1);
        if (first_32_bits == 0 && second_32_bits == 0) {
          continue;
        }
        if (group >= 0) {
          return -1;
        }
        group = i;
      }
#endif /* KMP_OS_WINDOWS */
      return group;
    }
  };
  void determine_capable(const char *var) override {
    const hwloc_topology_support *topology_support;
    if (__kmp_hwloc_topology == NULL) {
      if (hwloc_topology_init(&__kmp_hwloc_topology) < 0) {
        __kmp_hwloc_error = TRUE;
        if (__kmp_affinity_verbose)
          KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_init()");
      }
      if (hwloc_topology_load(__kmp_hwloc_topology) < 0) {
        __kmp_hwloc_error = TRUE;
        if (__kmp_affinity_verbose)
          KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_load()");
      }
    }
    topology_support = hwloc_topology_get_support(__kmp_hwloc_topology);
    // Is the system capable of setting/getting this thread's affinity?
    // Also, is topology discovery possible? (pu indicates ability to discover
    // processing units). And finally, were there no errors when calling any
    // hwloc_* API functions?
    if (topology_support && topology_support->cpubind->set_thisthread_cpubind &&
        topology_support->cpubind->get_thisthread_cpubind &&
        topology_support->discovery->pu && !__kmp_hwloc_error) {
      // enables affinity according to KMP_AFFINITY_CAPABLE() macro
      KMP_AFFINITY_ENABLE(TRUE);
    } else {
      // indicate that hwloc didn't work and disable affinity
      __kmp_hwloc_error = TRUE;
      KMP_AFFINITY_DISABLE();
    }
  }
  void bind_thread(int which) override {
    KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                "Illegal set affinity operation when not capable");
    KMPAffinity::Mask *mask;
    KMP_CPU_ALLOC_ON_STACK(mask);
    KMP_CPU_ZERO(mask);
    KMP_CPU_SET(which, mask);
    __kmp_set_system_affinity(mask, TRUE);
    KMP_CPU_FREE_FROM_STACK(mask);
  }
  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *hwloc_array = static_cast<Mask *>(array);
    delete[] hwloc_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *hwloc_array = static_cast<Mask *>(array);
    return &(hwloc_array[index]);
  }
  api_type get_api_type() const override { return HWLOC; }
};
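
// A minimal usage sketch (illustration only, not part of the runtime): a
// caller holding a KMPHwlocAffinity instance could pin the current thread to
// logical CPU 0 through the Mask interface above, assuming affinity has
// already been determined capable:
//
//   KMPHwlocAffinity affinity;
//   KMPAffinity::Mask *m = affinity.allocate_mask();
//   m->zero();
//   m->set(0); // request logical CPU 0
//   m->set_system_affinity(/*abort_on_error=*/false);
//   affinity.deallocate_mask(m);
//
// The runtime itself normally goes through the KMP_CPU_* macros and
// __kmp_set_system_affinity() (see bind_thread() above) rather than calling
// the Mask methods directly.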
#endif /* KMP_USE_HWLOC */

#if KMP_OS_LINUX || KMP_OS_FREEBSD
#if KMP_OS_LINUX
/* On some of the older OS's that we build on, these constants aren't present
   in <asm/unistd.h> #included from <sys/syscall.h>. They must be the same on
   all systems of the same arch where they are defined, and they cannot
   change; they are carved in stone forever. */
#include <sys/syscall.h>
#if KMP_ARCH_X86 || KMP_ARCH_ARM
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 241
#elif __NR_sched_setaffinity != 241
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 242
#elif __NR_sched_getaffinity != 242
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_AARCH64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_X86_64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 203
#elif __NR_sched_setaffinity != 203
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 204
#elif __NR_sched_getaffinity != 204
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_PPC64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 222
#elif __NR_sched_setaffinity != 222
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 223
#elif __NR_sched_getaffinity != 223
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_MIPS
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 4239
#elif __NR_sched_setaffinity != 4239
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 4240
#elif __NR_sched_getaffinity != 4240
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_MIPS64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 5195
#elif __NR_sched_setaffinity != 5195
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 5196
#elif __NR_sched_getaffinity != 5196
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#else
#error Unknown or unsupported architecture
#endif /* KMP_ARCH_* */
#elif KMP_OS_FREEBSD
#include <pthread.h>
#include <pthread_np.h>
#endif
class KMPNativeAffinity : public KMPAffinity {
  class Mask : public KMPAffinity::Mask {
    typedef unsigned long mask_t;
    typedef decltype(__kmp_affin_mask_size) mask_size_type;
    static const unsigned int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
    static const mask_t ONE = 1;
    mask_size_type get_num_mask_types() const {
      return __kmp_affin_mask_size / sizeof(mask_t);
    }

  public:
    mask_t *mask;
    Mask() { mask = (mask_t *)__kmp_allocate(__kmp_affin_mask_size); }
    ~Mask() {
      if (mask)
        __kmp_free(mask);
    }
    void set(int i) override {
      mask[i / BITS_PER_MASK_T] |= (ONE << (i % BITS_PER_MASK_T));
    }
    bool is_set(int i) const override {
      return (mask[i / BITS_PER_MASK_T] & (ONE << (i % BITS_PER_MASK_T)));
    }
    void clear(int i) override {
      mask[i / BITS_PER_MASK_T] &= ~(ONE << (i % BITS_PER_MASK_T));
    }
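    // For example, with a 64-bit mask_t, set(70) touches word 70 / 64 == 1
    // and bit 70 % 64 == 6, i.e. it performs mask[1] |= (ONE << 6).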
    void zero() override {
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = (mask_t)0;
    }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = convert->mask[i];
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] &= convert->mask[i];
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] |= convert->mask[i];
    }
    void bitwise_not() override {
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = ~(mask[i]);
    }
    int begin() const override {
      int retval = 0;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int end() const override {
      int e;
      __kmp_type_convert(get_num_mask_types() * BITS_PER_MASK_T, &e);
      return e;
    }
    int next(int previous) const override {
      int retval = previous + 1;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int get_system_affinity(bool abort_on_error) override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal get affinity operation when not capable");
#if KMP_OS_LINUX
      long retval =
          syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask);
#elif KMP_OS_FREEBSD
      int r = pthread_getaffinity_np(pthread_self(), __kmp_affin_mask_size,
                                     reinterpret_cast<cpuset_t *>(mask));
      int retval = (r == 0 ? 0 : -1);
#endif
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
    int set_system_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set affinity operation when not capable");
#if KMP_OS_LINUX
      long retval =
          syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask);
#elif KMP_OS_FREEBSD
      int r = pthread_setaffinity_np(pthread_self(), __kmp_affin_mask_size,
                                     reinterpret_cast<cpuset_t *>(mask));
      int retval = (r == 0 ? 0 : -1);
#endif
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FatalSysError), KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
  };
  void determine_capable(const char *env_var) override {
    __kmp_affinity_determine_capable(env_var);
  }
  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
  KMPAffinity::Mask *allocate_mask() override {
    KMPNativeAffinity::Mask *retval = new Mask();
    return retval;
  }
  void deallocate_mask(KMPAffinity::Mask *m) override {
    KMPNativeAffinity::Mask *native_mask =
        static_cast<KMPNativeAffinity::Mask *>(m);
    delete native_mask;
  }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *linux_array = static_cast<Mask *>(array);
    delete[] linux_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *linux_array = static_cast<Mask *>(array);
    return &(linux_array[index]);
  }
  api_type get_api_type() const override { return NATIVE_OS; }
};
#endif /* KMP_OS_LINUX || KMP_OS_FREEBSD */

#if KMP_OS_WINDOWS
class KMPNativeAffinity : public KMPAffinity {
  class Mask : public KMPAffinity::Mask {
    typedef ULONG_PTR mask_t;
    static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
    mask_t *mask;

  public:
    Mask() {
      mask = (mask_t *)__kmp_allocate(sizeof(mask_t) * __kmp_num_proc_groups);
    }
    ~Mask() {
      if (mask)
        __kmp_free(mask);
    }
    void set(int i) override {
      mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T));
    }
    bool is_set(int i) const override {
      return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T)));
    }
    void clear(int i) override {
      mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T));
    }
    void zero() override {
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = 0;
    }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = convert->mask[i];
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] &= convert->mask[i];
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] |= convert->mask[i];
    }
    void bitwise_not() override {
      for (int i = 0; i < __kmp_num_proc_groups; ++i)
        mask[i] = ~(mask[i]);
    }
    int begin() const override {
      int retval = 0;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int end() const override { return __kmp_num_proc_groups * BITS_PER_MASK_T; }
    int next(int previous) const override {
      int retval = previous + 1;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int set_process_affinity(bool abort_on_error) const override {
      if (__kmp_num_proc_groups <= 1) {
        if (!SetProcessAffinityMask(GetCurrentProcess(), *mask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      }
      return 0;
    }
    int set_system_affinity(bool abort_on_error) const override {
      if (__kmp_num_proc_groups > 1) {
        // Check for a valid mask.
        GROUP_AFFINITY ga;
        int group = get_proc_group();
        if (group < 0) {
          if (abort_on_error) {
            KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity");
          }
          return -1;
        }
        // Transform the bit vector into a GROUP_AFFINITY struct
        // and make the system call to set affinity.
        ga.Group = group;
        ga.Mask = mask[group];
        ga.Reserved[0] = ga.Reserved[1] = ga.Reserved[2] = 0;

        KMP_DEBUG_ASSERT(__kmp_SetThreadGroupAffinity != NULL);
        if (__kmp_SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) == 0) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      } else {
        if (!SetThreadAffinityMask(GetCurrentThread(), *mask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error),
                        __kmp_msg_null);
          }
          return error;
        }
      }
      return 0;
    }
    int get_system_affinity(bool abort_on_error) override {
      if (__kmp_num_proc_groups > 1) {
        this->zero();
        GROUP_AFFINITY ga;
        KMP_DEBUG_ASSERT(__kmp_GetThreadGroupAffinity != NULL);
        if (__kmp_GetThreadGroupAffinity(GetCurrentThread(), &ga) == 0) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "GetThreadGroupAffinity()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        if ((ga.Group < 0) || (ga.Group > __kmp_num_proc_groups) ||
            (ga.Mask == 0)) {
          return -1;
        }
        mask[ga.Group] = ga.Mask;
      } else {
        mask_t newMask, sysMask, retval;
        if (!GetProcessAffinityMask(GetCurrentProcess(), &newMask, &sysMask)) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "GetProcessAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        retval = SetThreadAffinityMask(GetCurrentThread(), newMask);
        if (!retval) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
          return error;
        }
        newMask = SetThreadAffinityMask(GetCurrentThread(), retval);
        if (!newMask) {
          DWORD error = GetLastError();
          if (abort_on_error) {
            __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"),
                        KMP_ERR(error), __kmp_msg_null);
          }
        }
        *mask = retval;
      }
      return 0;
    }
    int get_proc_group() const override {
      int group = -1;
      if (__kmp_num_proc_groups == 1) {
        return 1;
      }
      for (int i = 0; i < __kmp_num_proc_groups; i++) {
        if (mask[i] == 0)
          continue;
        if (group >= 0)
          return -1;
        group = i;
      }
      return group;
    }
  };
  void determine_capable(const char *env_var) override {
    __kmp_affinity_determine_capable(env_var);
  }
  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *windows_array = static_cast<Mask *>(array);
    delete[] windows_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *windows_array = static_cast<Mask *>(array);
    return &(windows_array[index]);
  }
  api_type get_api_type() const override { return NATIVE_OS; }
};
#endif /* KMP_OS_WINDOWS */
#endif /* KMP_AFFINITY_SUPPORTED */

class kmp_hw_thread_t {
public:
  static const int UNKNOWN_ID = -1;
  static int compare_ids(const void *a, const void *b);
  static int compare_compact(const void *a, const void *b);
  int ids[KMP_HW_LAST];
  int sub_ids[KMP_HW_LAST];
  bool leader;
  int os_id;
  void print() const;
  void clear() {
    for (int i = 0; i < (int)KMP_HW_LAST; ++i)
      ids[i] = UNKNOWN_ID;
    leader = false;
  }
};
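
// Illustration only (hypothetical layout): in a topology whose levels are
// { socket, core, thread }, the hardware thread that is thread 1 of core 3 of
// socket 0 would carry ids[0] = 0, ids[1] = 3, ids[2] = 1, with os_id holding
// the OS-assigned processor number for that hardware thread.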

class kmp_topology_t {

  struct flags_t {
    int uniform : 1;
    int reserved : 31;
  };

  int depth;

  // The following arrays are all 'depth' long

  // Ordered array of the types in the topology
  kmp_hw_t *types;

  // Keep quick topology ratios; for non-uniform topologies,
  // this ratio holds the max number of itemAs per itemB
  // e.g., [ 4 packages | 6 cores / package | 2 threads / core ]
  int *ratio;

  // Storage containing the absolute number of each topology layer
  int *count;

  // The hardware threads array
  // hw_threads is num_hw_threads long
  // Each hw_thread's ids and sub_ids are depth deep
  int num_hw_threads;
  kmp_hw_thread_t *hw_threads;

  // Equivalence hash where the key is the hardware topology item
  // and the value is the equivalent hardware topology type in the
  // types[] array, if the value is KMP_HW_UNKNOWN, then there is no
  // known equivalence for the topology type
  kmp_hw_t equivalent[KMP_HW_LAST];

  // Flags describing the topology
  flags_t flags;

  // Count each item & get the num x's per y
  // e.g., get the number of cores and the number of threads per core
  // for each (x, y) in (KMP_HW_* , KMP_HW_*)
  void _gather_enumeration_information();

  // Remove layers that don't add information to the topology.
  // This is done by having the layer take on the id = UNKNOWN_ID (-1)
  void _remove_radix1_layers();

  // Find out if the topology is uniform
  void _discover_uniformity();

  // Set all the sub_ids for each hardware thread
  void _set_sub_ids();

  // Set global affinity variables describing the number of threads per
  // core, the number of packages, the number of cores per package, and
  // the number of cores.
  void _set_globals();

  // Set the last level cache equivalent type
  void _set_last_level_cache();

public:
  // Force use of allocate()/deallocate()
  kmp_topology_t() = delete;
  kmp_topology_t(const kmp_topology_t &t) = delete;
  kmp_topology_t(kmp_topology_t &&t) = delete;
  kmp_topology_t &operator=(const kmp_topology_t &t) = delete;
  kmp_topology_t &operator=(kmp_topology_t &&t) = delete;

  static kmp_topology_t *allocate(int nproc, int ndepth, const kmp_hw_t *types);
  static void deallocate(kmp_topology_t *);

  // Functions used in create_map() routines
  kmp_hw_thread_t &at(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
    return hw_threads[index];
  }
  const kmp_hw_thread_t &at(int index) const {
    KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
    return hw_threads[index];
  }
  int get_num_hw_threads() const { return num_hw_threads; }
  void sort_ids() {
    qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
          kmp_hw_thread_t::compare_ids);
  }
  // Check if the hardware ids are unique. Return true if they are,
  // otherwise return false.
  bool check_ids() const;

  // Function to call after the create_map() routine
  void canonicalize();
  void canonicalize(int pkgs, int cores_per_pkg, int thr_per_core, int cores);

  // Functions used after canonicalize() called
  bool filter_hw_subset();
  bool is_close(int hwt1, int hwt2, int level) const;
  bool is_uniform() const { return flags.uniform; }
  // Return the equivalent type for the given type in the topology;
  // returns KMP_HW_UNKNOWN when there is no equivalent type
  kmp_hw_t get_equivalent_type(kmp_hw_t type) const { return equivalent[type]; }
  // Set type1 = type2
  void set_equivalent_type(kmp_hw_t type1, kmp_hw_t type2) {
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type1);
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type2);
    kmp_hw_t real_type2 = equivalent[type2];
    if (real_type2 == KMP_HW_UNKNOWN)
      real_type2 = type2;
    equivalent[type1] = real_type2;
    // This loop is required since any of the types may have been set to
    // be equivalent to type1.  They all must be checked and reset to type2.
    KMP_FOREACH_HW_TYPE(type) {
      if (equivalent[type] == type1) {
        equivalent[type] = real_type2;
      }
    }
  }
  // Calculate number of types corresponding to level1
  // per types corresponding to level2 (e.g., number of threads per core)
  int calculate_ratio(int level1, int level2) const {
    KMP_DEBUG_ASSERT(level1 >= 0 && level1 < depth);
    KMP_DEBUG_ASSERT(level2 >= 0 && level2 < depth);
    int r = 1;
    for (int level = level1; level > level2; --level)
      r *= ratio[level];
    return r;
  }
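  // Worked example (hypothetical machine): with the levels ordered
  // { package, core, thread } and ratio = { 4, 6, 2 }, calculate_ratio() of
  // the thread level against the package level returns 2 * 6 = 12, i.e. at
  // most 12 threads per package.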
  int get_ratio(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return ratio[level];
  }
  int get_depth() const { return depth; }
  kmp_hw_t get_type(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return types[level];
  }
  int get_level(kmp_hw_t type) const {
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type);
    int eq_type = equivalent[type];
    if (eq_type == KMP_HW_UNKNOWN)
      return -1;
    for (int i = 0; i < depth; ++i)
      if (types[i] == eq_type)
        return i;
    return -1;
  }
  int get_count(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return count[level];
  }
#if KMP_AFFINITY_SUPPORTED
  void sort_compact() {
    qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
          kmp_hw_thread_t::compare_compact);
  }
#endif
  void print(const char *env_var = "KMP_AFFINITY") const;
  void dump() const;
};

class kmp_hw_subset_t {
public:
  struct item_t {
    int num;
    kmp_hw_t type;
    int offset;
  };

private:
  int depth;
  int capacity;
  item_t *items;
  kmp_uint64 set;
  bool absolute;
  // The set must be able to handle up to KMP_HW_LAST number of layers
  KMP_BUILD_ASSERT(sizeof(set) * 8 >= KMP_HW_LAST);

public:
  // Force use of allocate()/deallocate()
  kmp_hw_subset_t() = delete;
  kmp_hw_subset_t(const kmp_hw_subset_t &t) = delete;
  kmp_hw_subset_t(kmp_hw_subset_t &&t) = delete;
  kmp_hw_subset_t &operator=(const kmp_hw_subset_t &t) = delete;
  kmp_hw_subset_t &operator=(kmp_hw_subset_t &&t) = delete;

  static kmp_hw_subset_t *allocate() {
    int initial_capacity = 5;
    kmp_hw_subset_t *retval =
        (kmp_hw_subset_t *)__kmp_allocate(sizeof(kmp_hw_subset_t));
    retval->depth = 0;
    retval->capacity = initial_capacity;
    retval->set = 0ull;
    retval->absolute = false;
    retval->items = (item_t *)__kmp_allocate(sizeof(item_t) * initial_capacity);
    return retval;
  }
  static void deallocate(kmp_hw_subset_t *subset) {
    __kmp_free(subset->items);
    __kmp_free(subset);
  }
  void set_absolute() { absolute = true; }
  bool is_absolute() const { return absolute; }
  void push_back(int num, kmp_hw_t type, int offset) {
    if (depth == capacity - 1) {
      capacity *= 2;
      item_t *new_items = (item_t *)__kmp_allocate(sizeof(item_t) * capacity);
      for (int i = 0; i < depth; ++i)
        new_items[i] = items[i];
      __kmp_free(items);
      items = new_items;
    }
    items[depth].num = num;
    items[depth].type = type;
    items[depth].offset = offset;
    depth++;
    set |= (1ull << type);
  }
  int get_depth() const { return depth; }
  const item_t &at(int index) const {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    return items[index];
  }
  item_t &at(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    return items[index];
  }
  void remove(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < depth);
    set &= ~(1ull << items[index].type);
    for (int j = index + 1; j < depth; ++j) {
      items[j - 1] = items[j];
    }
    depth--;
  }
  bool specified(kmp_hw_t type) const { return ((set & (1ull << type)) > 0); }
  void dump() const {
    printf("**********************\n");
    printf("*** kmp_hw_subset: ***\n");
    printf("* depth: %d\n", depth);
    printf("* items:\n");
    for (int i = 0; i < depth; ++i) {
      printf("num: %d, type: %s, offset: %d\n", items[i].num,
             __kmp_hw_get_keyword(items[i].type), items[i].offset);
    }
    printf("* set: 0x%llx\n", set);
    printf("* absolute: %d\n", absolute);
    printf("**********************\n");
  }
};
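
// Illustration only: a KMP_HW_SUBSET specification such as "2s,4c,2t" would
// conceptually end up as three push_back() calls on the subset object (the
// actual parsing happens elsewhere in the runtime):
//
//   kmp_hw_subset_t *subset = kmp_hw_subset_t::allocate();
//   subset->push_back(2, KMP_HW_SOCKET, 0); // 2 sockets
//   subset->push_back(4, KMP_HW_CORE, 0); // 4 cores per socket
//   subset->push_back(2, KMP_HW_THREAD, 0); // 2 threads per core
//   kmp_hw_subset_t::deallocate(subset);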

extern kmp_topology_t *__kmp_topology;
extern kmp_hw_subset_t *__kmp_hw_subset;

/* A structure for holding machine-specific hierarchy info to be computed once
   at init. This structure represents a mapping of threads to the actual machine
   hierarchy, or to our best guess at what the hierarchy might be, for the
   purpose of performing an efficient barrier. In the worst case, when there is
   no machine hierarchy information, it produces a tree suitable for a barrier,
   similar to the tree used in the hyper barrier. */
class hierarchy_info {
public:
  /* Good default values for number of leaves and branching factor, given no
     affinity information. Behaves a bit like hyper barrier. */
  static const kmp_uint32 maxLeaves = 4;
  static const kmp_uint32 minBranch = 4;
  /** Number of levels in the hierarchy. Typical levels are threads/core,
      cores/package or socket, packages/node, nodes/machine, etc. We don't want
      to get specific with nomenclature. When the machine is oversubscribed we
      add levels to duplicate the hierarchy, doubling the thread capacity of the
      hierarchy each time we add a level. */
  kmp_uint32 maxLevels;

  /** This is specifically the depth of the machine configuration hierarchy, in
      terms of the number of levels along the longest path from root to any
      leaf. It corresponds to the number of entries in numPerLevel if we exclude
      all but one trailing 1. */
  kmp_uint32 depth;
  kmp_uint32 base_num_threads;
  enum init_status { initialized = 0, not_initialized = 1, initializing = 2 };
  volatile kmp_int8 uninitialized; // 0=initialized, 1=not initialized,
  // 2=initialization in progress
  volatile kmp_int8 resizing; // 0=not resizing, 1=resizing

  /** Level 0 corresponds to leaves. numPerLevel[i] is the number of children
      the parent of a node at level i has. For example, if we have a machine
      with 4 packages, 4 cores/package and 2 HT per core, then numPerLevel =
      {2, 4, 4, 1, 1}. All empty levels are set to 1. */
  kmp_uint32 *numPerLevel;
  kmp_uint32 *skipPerLevel;
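
  // Worked example (hypothetical machine): with numPerLevel = {2, 4, 4, 1, 1}
  // as described above, init() derives skipPerLevel = {1, 2, 8, 32, ...};
  // a node at level i covers skipPerLevel[i] leaves (threads).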

  void deriveLevels() {
    int hier_depth = __kmp_topology->get_depth();
    for (int i = hier_depth - 1, level = 0; i >= 0; --i, ++level) {
      numPerLevel[level] = __kmp_topology->get_ratio(i);
    }
  }

  hierarchy_info()
      : maxLevels(7), depth(1), uninitialized(not_initialized), resizing(0) {}

  void fini() {
    if (!uninitialized && numPerLevel) {
      __kmp_free(numPerLevel);
      numPerLevel = NULL;
      uninitialized = not_initialized;
    }
  }

  void init(int num_addrs) {
    kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(
        &uninitialized, not_initialized, initializing);
    if (bool_result == 0) { // Wait for initialization
      while (TCR_1(uninitialized) != initialized)
        KMP_CPU_PAUSE();
      return;
    }
    KMP_DEBUG_ASSERT(bool_result == 1);

    /* Added explicit initialization of the data fields here to prevent usage of
       dirty value observed when static library is re-initialized multiple times
       (e.g. when non-OpenMP thread repeatedly launches/joins thread that uses
       OpenMP). */
    depth = 1;
    resizing = 0;
    maxLevels = 7;
    numPerLevel =
        (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
    skipPerLevel = &(numPerLevel[maxLevels]);
    for (kmp_uint32 i = 0; i < maxLevels;
         ++i) { // init numPerLevel[*] to 1 item per level
      numPerLevel[i] = 1;
      skipPerLevel[i] = 1;
    }

    // Derive levels from the machine topology, if available
    if (__kmp_topology && __kmp_topology->get_depth() > 0) {
      deriveLevels();
    } else {
      numPerLevel[0] = maxLeaves;
      numPerLevel[1] = num_addrs / maxLeaves;
      if (num_addrs % maxLeaves)
        numPerLevel[1]++;
    }

    base_num_threads = num_addrs;
    for (int i = maxLevels - 1; i >= 0;
         --i) // count non-empty levels to get depth
      if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
        depth++;

    kmp_uint32 branch = minBranch;
    if (numPerLevel[0] == 1)
      branch = num_addrs / maxLeaves;
    if (branch < minBranch)
      branch = minBranch;
    for (kmp_uint32 d = 0; d < depth - 1; ++d) { // optimize hierarchy width
      while (numPerLevel[d] > branch ||
             (d == 0 && numPerLevel[d] > maxLeaves)) { // max 4 on level 0!
        if (numPerLevel[d] & 1)
          numPerLevel[d]++;
        numPerLevel[d] = numPerLevel[d] >> 1;
        if (numPerLevel[d + 1] == 1)
          depth++;
        numPerLevel[d + 1] = numPerLevel[d + 1] << 1;
      }
      if (numPerLevel[0] == 1) {
        branch = branch >> 1;
        if (branch < 4)
          branch = minBranch;
      }
    }

    for (kmp_uint32 i = 1; i < depth; ++i)
      skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1];
    // Fill in hierarchy in the case of oversubscription
    for (kmp_uint32 i = depth; i < maxLevels; ++i)
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];

    uninitialized = initialized; // One writer
  }
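
  // Worked example of the fallback path above (no topology information,
  // hypothetical num_addrs = 8): numPerLevel starts as {4, 2, 1, ...}
  // (maxLeaves leaves per node, 8 / 4 parents), depth resolves to 3, and
  // skipPerLevel becomes {1, 4, 8, 16, 32, ...}, the tail doubling to cover
  // possible oversubscription.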

  // Resize the hierarchy if nproc changes to something larger than before
  void resize(kmp_uint32 nproc) {
    kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
    while (bool_result == 0) { // someone else is trying to resize
      KMP_CPU_PAUSE();
      if (nproc <= base_num_threads) // happy with other thread's resize
        return;
      else // try to resize
        bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
    }
    KMP_DEBUG_ASSERT(bool_result != 0);
    if (nproc <= base_num_threads)
      return; // happy with other thread's resize

    // Calculate new maxLevels
    kmp_uint32 old_sz = skipPerLevel[depth - 1];
    kmp_uint32 incs = 0, old_maxLevels = maxLevels;
    // First see if old maxLevels is enough to contain new size
    for (kmp_uint32 i = depth; i < maxLevels && nproc > old_sz; ++i) {
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];
      numPerLevel[i - 1] *= 2;
      old_sz *= 2;
      depth++;
    }
    if (nproc > old_sz) { // Not enough space, need to expand hierarchy
      while (nproc > old_sz) {
        old_sz *= 2;
        incs++;
        depth++;
      }
      maxLevels += incs;

      // Resize arrays
      kmp_uint32 *old_numPerLevel = numPerLevel;
      kmp_uint32 *old_skipPerLevel = skipPerLevel;
      numPerLevel = skipPerLevel = NULL;
      numPerLevel =
          (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
      skipPerLevel = &(numPerLevel[maxLevels]);

      // Copy old elements from old arrays
      for (kmp_uint32 i = 0; i < old_maxLevels; ++i) {
        numPerLevel[i] = old_numPerLevel[i];
        skipPerLevel[i] = old_skipPerLevel[i];
      }

      // Init new elements in arrays to 1
      for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i) {
        // init numPerLevel[*] to 1 item per level
        numPerLevel[i] = 1;
        skipPerLevel[i] = 1;
      }

      // Free old arrays
      __kmp_free(old_numPerLevel);
    }

    // Fill in oversubscription levels of hierarchy
    for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i)
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];

    base_num_threads = nproc;
    resizing = 0; // One writer
  }
};
#endif // KMP_AFFINITY_H