/*
 * kmp_affinity.h -- header for affinity management
 */

//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//

#ifndef KMP_AFFINITY_H
#define KMP_AFFINITY_H

#include "kmp.h"
#include "kmp_os.h"
#include <limits>

#if KMP_AFFINITY_SUPPORTED
#if KMP_USE_HWLOC
// KMPAffinity implementation backed by the hwloc library.
class KMPHwlocAffinity : public KMPAffinity {
public:
  // Processor bitmask implemented on top of an hwloc cpuset.
  class Mask : public KMPAffinity::Mask {
    hwloc_cpuset_t mask;

  public:
    // Allocate an hwloc bitmap and start with all bits cleared.
    Mask() {
      mask = hwloc_bitmap_alloc();
      this->zero();
    }
    ~Mask() { hwloc_bitmap_free(mask); }
    void set(int i) override { hwloc_bitmap_set(mask, i); }
    bool is_set(int i) const override { return hwloc_bitmap_isset(mask, i); }
    void clear(int i) override { hwloc_bitmap_clr(mask, i); }
    void zero() override { hwloc_bitmap_zero(mask); }
    bool empty() const override { return hwloc_bitmap_iszero(mask); }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      hwloc_bitmap_copy(mask, convert->mask);
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      hwloc_bitmap_and(mask, mask, convert->mask);
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      hwloc_bitmap_or(mask, mask, convert->mask);
    }
    void bitwise_not() override { hwloc_bitmap_not(mask, mask); }
    bool is_equal(const KMPAffinity::Mask *rhs) const override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      return hwloc_bitmap_isequal(mask, convert->mask);
    }
    // Iteration protocol: begin() is the first set bit; next() advances.
    // hwloc's bitmap iterators return -1 when no further bit exists, so the
    // end sentinel is simply -1.
    int begin() const override { return hwloc_bitmap_first(mask); }
    int end() const override { return -1; }
    int next(int previous) const override {
      return hwloc_bitmap_next(mask, previous);
    }
    // Read the calling thread's current affinity into this mask.
    // Returns 0 on success, otherwise errno (after optionally aborting).
    int get_system_affinity(bool abort_on_error) override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal get affinity operation when not capable");
      long retval =
          hwloc_get_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "hwloc_get_cpubind()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
    // Bind the calling thread to the processors in this mask.
    // Returns 0 on success, otherwise errno (after optionally aborting).
    int set_system_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set affinity operation when not capable");
      long retval =
          hwloc_set_cpubind(__kmp_hwloc_topology, mask, HWLOC_CPUBIND_THREAD);
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
#if KMP_OS_WINDOWS
    // Bind the whole process, but only when hwloc reports that process-wide
    // binding is supported on this system; otherwise silently return 0.
    int set_process_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set process affinity operation when not capable");
      int error = 0;
      const hwloc_topology_support *support =
          hwloc_topology_get_support(__kmp_hwloc_topology);
      if (support->cpubind->set_proc_cpubind) {
        int retval;
        retval = hwloc_set_cpubind(__kmp_hwloc_topology, mask,
                                   HWLOC_CPUBIND_PROCESS);
        if (retval >= 0)
          return 0;
        error = errno;
        if (abort_on_error)
          __kmp_fatal(KMP_MSG(FunctionError, "hwloc_set_cpubind()"),
                      KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
#endif
    // Return the single Windows processor group this mask occupies, -1 if the
    // mask spans more than one group, or 1 when there is only one group.
    int get_proc_group() const override {
      int group = -1;
#if KMP_OS_WINDOWS
      if (__kmp_num_proc_groups == 1) {
        return 1;
      }
      for (int i = 0; i < __kmp_num_proc_groups; i++) {
        // On windows, the long type is always 32 bits, so each (64-bit)
        // processor group occupies two consecutive ulongs of the bitmap.
        unsigned long first_32_bits = hwloc_bitmap_to_ith_ulong(mask, i * 2);
        unsigned long second_32_bits =
            hwloc_bitmap_to_ith_ulong(mask, i * 2 + 1);
        if (first_32_bits == 0 && second_32_bits == 0) {
          continue;
        }
        if (group >= 0) {
          return -1; // bits set in more than one group
        }
        group = i;
      }
#endif /* KMP_OS_WINDOWS */
      return group;
    }
  };
  // Lazily initialize and load the hwloc topology, then decide whether
  // affinity can be enabled (reflected by the KMP_AFFINITY_CAPABLE() macro).
  void determine_capable(const char *var) override {
    const hwloc_topology_support *topology_support;
    if (__kmp_hwloc_topology == NULL) {
      if (hwloc_topology_init(&__kmp_hwloc_topology) < 0) {
        __kmp_hwloc_error = TRUE;
        if (__kmp_affinity.flags.verbose) {
          KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_init()");
        }
      }
      if (hwloc_topology_load(__kmp_hwloc_topology) < 0) {
        __kmp_hwloc_error = TRUE;
        if (__kmp_affinity.flags.verbose) {
          KMP_WARNING(AffHwlocErrorOccurred, var, "hwloc_topology_load()");
        }
      }
    }
    topology_support = hwloc_topology_get_support(__kmp_hwloc_topology);
    // Is the system capable of setting/getting this thread's affinity?
    // Also, is topology discovery possible? (pu indicates ability to discover
    // processing units). And finally, were there no errors when calling any
    // hwloc_* API functions?
    if (topology_support && topology_support->cpubind->set_thisthread_cpubind &&
        topology_support->cpubind->get_thisthread_cpubind &&
        topology_support->discovery->pu && !__kmp_hwloc_error) {
      // enables affinity according to KMP_AFFINITY_CAPABLE() macro
      KMP_AFFINITY_ENABLE(TRUE);
    } else {
      // indicate that hwloc didn't work and disable affinity
      __kmp_hwloc_error = TRUE;
      KMP_AFFINITY_DISABLE();
    }
  }
  // Pin the calling thread to the single processor 'which' using a
  // temporary one-bit mask allocated on the stack.
  void bind_thread(int which) override {
    KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                "Illegal set affinity operation when not capable");
    KMPAffinity::Mask *mask;
    KMP_CPU_ALLOC_ON_STACK(mask);
    KMP_CPU_ZERO(mask);
    KMP_CPU_SET(which, mask);
    __kmp_set_system_affinity(mask, TRUE);
    KMP_CPU_FREE_FROM_STACK(mask);
  }
  KMPAffinity::Mask *allocate_mask() override { return new Mask(); }
  void deallocate_mask(KMPAffinity::Mask *m) override { delete m; }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *hwloc_array = static_cast<Mask *>(array);
    delete[] hwloc_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *hwloc_array = static_cast<Mask *>(array);
    return &(hwloc_array[index]);
  }
  api_type get_api_type() const override { return HWLOC; }
};
#endif /* KMP_USE_HWLOC */

#if KMP_OS_LINUX || KMP_OS_FREEBSD
#if KMP_OS_LINUX
/* On some of the older OS's that we build on, these constants aren't present
   in <asm/unistd.h> #included from <sys/syscall.h>. They must be the same on
   all systems of the same arch where they are defined, and they cannot
   change: they are carved in stone forever.
*/
#include <sys/syscall.h>
// Fixed syscall numbers for sched_{set,get}affinity, per architecture. When
// the libc headers already define them, cross-check that they match.
#if KMP_ARCH_X86 || KMP_ARCH_ARM
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 241
#elif __NR_sched_setaffinity != 241
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 242
#elif __NR_sched_getaffinity != 242
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_AARCH64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_X86_64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 203
#elif __NR_sched_setaffinity != 203
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 204
#elif __NR_sched_getaffinity != 204
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_PPC64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 222
#elif __NR_sched_setaffinity != 222
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 223
#elif __NR_sched_getaffinity != 223
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_MIPS
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 4239
#elif __NR_sched_setaffinity != 4239
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 4240
#elif __NR_sched_getaffinity != 4240
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_MIPS64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 5195
#elif __NR_sched_setaffinity != 5195
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 5196
#elif __NR_sched_getaffinity != 5196
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_LOONGARCH64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_RISCV64
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 122
#elif __NR_sched_setaffinity != 122
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 123
#elif __NR_sched_getaffinity != 123
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_VE
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 203
#elif __NR_sched_setaffinity != 203
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 204
#elif __NR_sched_getaffinity != 204
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#elif KMP_ARCH_S390X
#ifndef __NR_sched_setaffinity
#define __NR_sched_setaffinity 239
#elif __NR_sched_setaffinity != 239
#error Wrong code for setaffinity system call.
#endif /* __NR_sched_setaffinity */
#ifndef __NR_sched_getaffinity
#define __NR_sched_getaffinity 240
#elif __NR_sched_getaffinity != 240
#error Wrong code for getaffinity system call.
#endif /* __NR_sched_getaffinity */
#else
#error Unknown or unsupported architecture
#endif /* KMP_ARCH_* */
#elif KMP_OS_FREEBSD
#include <pthread.h>
#include <pthread_np.h>
#endif
// KMPAffinity implementation using the native OS interface: raw
// sched_{set,get}affinity syscalls on Linux, pthread_np on FreeBSD.
class KMPNativeAffinity : public KMPAffinity {
  // Processor bitmask stored as an array of unsigned long words, sized by
  // the global __kmp_affin_mask_size (in bytes).
  class Mask : public KMPAffinity::Mask {
    typedef unsigned long mask_t;
    typedef decltype(__kmp_affin_mask_size) mask_size_type;
    static const unsigned int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT;
    static const mask_t ONE = 1;
    // Number of mask_t words making up the OS affinity mask.
    mask_size_type get_num_mask_types() const {
      return __kmp_affin_mask_size / sizeof(mask_t);
    }

  public:
    mask_t *mask;
    Mask() { mask = (mask_t *)__kmp_allocate(__kmp_affin_mask_size); }
    ~Mask() {
      if (mask)
        __kmp_free(mask);
    }
    void set(int i) override {
      mask[i / BITS_PER_MASK_T] |= (ONE << (i % BITS_PER_MASK_T));
    }
    bool is_set(int i) const override {
      return (mask[i / BITS_PER_MASK_T] & (ONE << (i % BITS_PER_MASK_T)));
    }
    void clear(int i) override {
      mask[i / BITS_PER_MASK_T] &= ~(ONE << (i % BITS_PER_MASK_T));
    }
    void zero() override {
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = (mask_t)0;
    }
    bool empty() const override {
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        if (mask[i] != (mask_t)0)
          return false;
      return true;
    }
    void copy(const KMPAffinity::Mask *src) override {
      const Mask *convert = static_cast<const Mask *>(src);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = convert->mask[i];
    }
    void bitwise_and(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] &= convert->mask[i];
    }
    void bitwise_or(const KMPAffinity::Mask *rhs) override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] |= convert->mask[i];
    }
    void bitwise_not() override {
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        mask[i] = ~(mask[i]);
    }
    bool is_equal(const KMPAffinity::Mask *rhs) const override {
      const Mask *convert = static_cast<const Mask *>(rhs);
      mask_size_type e = get_num_mask_types();
      for (mask_size_type i = 0; i < e; ++i)
        if (mask[i] != convert->mask[i])
          return false;
      return true;
    }
    // Iteration protocol: bit indices in [begin(), end()), skipping unset
    // bits via linear scan.
    int begin() const override {
      int retval = 0;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    int end() const override {
      int e;
      __kmp_type_convert(get_num_mask_types() * BITS_PER_MASK_T, &e);
      return e;
    }
    int next(int previous) const override {
      int retval = previous + 1;
      while (retval < end() && !is_set(retval))
        ++retval;
      return retval;
    }
    // Read the calling thread's current affinity into this mask.
    // Returns 0 on success, otherwise errno (after optionally aborting).
    int get_system_affinity(bool abort_on_error) override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal get affinity operation when not capable");
#if KMP_OS_LINUX
      long retval =
          syscall(__NR_sched_getaffinity, 0, __kmp_affin_mask_size, mask);
#elif KMP_OS_FREEBSD
      int r = pthread_getaffinity_np(pthread_self(), __kmp_affin_mask_size,
                                     reinterpret_cast<cpuset_t *>(mask));
      int retval = (r == 0 ? 0 : -1);
#endif
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "pthread_getaffinity_np()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
    // Bind the calling thread to the processors in this mask.
    // Returns 0 on success, otherwise errno (after optionally aborting).
    int set_system_affinity(bool abort_on_error) const override {
      KMP_ASSERT2(KMP_AFFINITY_CAPABLE(),
                  "Illegal set affinity operation when not capable");
#if KMP_OS_LINUX
      long retval =
          syscall(__NR_sched_setaffinity, 0, __kmp_affin_mask_size, mask);
#elif KMP_OS_FREEBSD
      int r = pthread_setaffinity_np(pthread_self(), __kmp_affin_mask_size,
                                     reinterpret_cast<cpuset_t *>(mask));
      int retval = (r == 0 ? 0 : -1);
#endif
      if (retval >= 0) {
        return 0;
      }
      int error = errno;
      if (abort_on_error) {
        __kmp_fatal(KMP_MSG(FunctionError, "pthread_setaffinity_np()"),
                    KMP_ERR(error), __kmp_msg_null);
      }
      return error;
    }
  };
  void determine_capable(const char *env_var) override {
    __kmp_affinity_determine_capable(env_var);
  }
  void bind_thread(int which) override { __kmp_affinity_bind_thread(which); }
  KMPAffinity::Mask *allocate_mask() override {
    KMPNativeAffinity::Mask *retval = new Mask();
    return retval;
  }
  void deallocate_mask(KMPAffinity::Mask *m) override {
    KMPNativeAffinity::Mask *native_mask =
        static_cast<KMPNativeAffinity::Mask *>(m);
    delete native_mask;
  }
  KMPAffinity::Mask *allocate_mask_array(int num) override {
    return new Mask[num];
  }
  void deallocate_mask_array(KMPAffinity::Mask *array) override {
    Mask *linux_array = static_cast<Mask *>(array);
    delete[] linux_array;
  }
  KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array,
                                      int index) override {
    Mask *linux_array = static_cast<Mask *>(array);
    return &(linux_array[index]);
  }
  api_type get_api_type() const override { return NATIVE_OS; }
};
#endif /* KMP_OS_LINUX || KMP_OS_FREEBSD */

#if KMP_OS_WINDOWS
KMP_OS_WINDOWS 477 class KMPNativeAffinity : public KMPAffinity { 478 class Mask : public KMPAffinity::Mask { 479 typedef ULONG_PTR mask_t; 480 static const int BITS_PER_MASK_T = sizeof(mask_t) * CHAR_BIT; 481 mask_t *mask; 482 483 public: 484 Mask() { 485 mask = (mask_t *)__kmp_allocate(sizeof(mask_t) * __kmp_num_proc_groups); 486 } 487 ~Mask() { 488 if (mask) 489 __kmp_free(mask); 490 } 491 void set(int i) override { 492 mask[i / BITS_PER_MASK_T] |= ((mask_t)1 << (i % BITS_PER_MASK_T)); 493 } 494 bool is_set(int i) const override { 495 return (mask[i / BITS_PER_MASK_T] & ((mask_t)1 << (i % BITS_PER_MASK_T))); 496 } 497 void clear(int i) override { 498 mask[i / BITS_PER_MASK_T] &= ~((mask_t)1 << (i % BITS_PER_MASK_T)); 499 } 500 void zero() override { 501 for (int i = 0; i < __kmp_num_proc_groups; ++i) 502 mask[i] = 0; 503 } 504 bool empty() const override { 505 for (size_t i = 0; i < __kmp_num_proc_groups; ++i) 506 if (mask[i]) 507 return false; 508 return true; 509 } 510 void copy(const KMPAffinity::Mask *src) override { 511 const Mask *convert = static_cast<const Mask *>(src); 512 for (int i = 0; i < __kmp_num_proc_groups; ++i) 513 mask[i] = convert->mask[i]; 514 } 515 void bitwise_and(const KMPAffinity::Mask *rhs) override { 516 const Mask *convert = static_cast<const Mask *>(rhs); 517 for (int i = 0; i < __kmp_num_proc_groups; ++i) 518 mask[i] &= convert->mask[i]; 519 } 520 void bitwise_or(const KMPAffinity::Mask *rhs) override { 521 const Mask *convert = static_cast<const Mask *>(rhs); 522 for (int i = 0; i < __kmp_num_proc_groups; ++i) 523 mask[i] |= convert->mask[i]; 524 } 525 void bitwise_not() override { 526 for (int i = 0; i < __kmp_num_proc_groups; ++i) 527 mask[i] = ~(mask[i]); 528 } 529 bool is_equal(const KMPAffinity::Mask *rhs) const override { 530 const Mask *convert = static_cast<const Mask *>(rhs); 531 for (size_t i = 0; i < __kmp_num_proc_groups; ++i) 532 if (mask[i] != convert->mask[i]) 533 return false; 534 return true; 535 } 536 int begin() 
const override { 537 int retval = 0; 538 while (retval < end() && !is_set(retval)) 539 ++retval; 540 return retval; 541 } 542 int end() const override { return __kmp_num_proc_groups * BITS_PER_MASK_T; } 543 int next(int previous) const override { 544 int retval = previous + 1; 545 while (retval < end() && !is_set(retval)) 546 ++retval; 547 return retval; 548 } 549 int set_process_affinity(bool abort_on_error) const override { 550 if (__kmp_num_proc_groups <= 1) { 551 if (!SetProcessAffinityMask(GetCurrentProcess(), *mask)) { 552 DWORD error = GetLastError(); 553 if (abort_on_error) { 554 __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error), 555 __kmp_msg_null); 556 } 557 return error; 558 } 559 } 560 return 0; 561 } 562 int set_system_affinity(bool abort_on_error) const override { 563 if (__kmp_num_proc_groups > 1) { 564 // Check for a valid mask. 565 GROUP_AFFINITY ga; 566 int group = get_proc_group(); 567 if (group < 0) { 568 if (abort_on_error) { 569 KMP_FATAL(AffinityInvalidMask, "kmp_set_affinity"); 570 } 571 return -1; 572 } 573 // Transform the bit vector into a GROUP_AFFINITY struct 574 // and make the system call to set affinity. 
575 ga.Group = group; 576 ga.Mask = mask[group]; 577 ga.Reserved[0] = ga.Reserved[1] = ga.Reserved[2] = 0; 578 579 KMP_DEBUG_ASSERT(__kmp_SetThreadGroupAffinity != NULL); 580 if (__kmp_SetThreadGroupAffinity(GetCurrentThread(), &ga, NULL) == 0) { 581 DWORD error = GetLastError(); 582 if (abort_on_error) { 583 __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error), 584 __kmp_msg_null); 585 } 586 return error; 587 } 588 } else { 589 if (!SetThreadAffinityMask(GetCurrentThread(), *mask)) { 590 DWORD error = GetLastError(); 591 if (abort_on_error) { 592 __kmp_fatal(KMP_MSG(CantSetThreadAffMask), KMP_ERR(error), 593 __kmp_msg_null); 594 } 595 return error; 596 } 597 } 598 return 0; 599 } 600 int get_system_affinity(bool abort_on_error) override { 601 if (__kmp_num_proc_groups > 1) { 602 this->zero(); 603 GROUP_AFFINITY ga; 604 KMP_DEBUG_ASSERT(__kmp_GetThreadGroupAffinity != NULL); 605 if (__kmp_GetThreadGroupAffinity(GetCurrentThread(), &ga) == 0) { 606 DWORD error = GetLastError(); 607 if (abort_on_error) { 608 __kmp_fatal(KMP_MSG(FunctionError, "GetThreadGroupAffinity()"), 609 KMP_ERR(error), __kmp_msg_null); 610 } 611 return error; 612 } 613 if ((ga.Group < 0) || (ga.Group > __kmp_num_proc_groups) || 614 (ga.Mask == 0)) { 615 return -1; 616 } 617 mask[ga.Group] = ga.Mask; 618 } else { 619 mask_t newMask, sysMask, retval; 620 if (!GetProcessAffinityMask(GetCurrentProcess(), &newMask, &sysMask)) { 621 DWORD error = GetLastError(); 622 if (abort_on_error) { 623 __kmp_fatal(KMP_MSG(FunctionError, "GetProcessAffinityMask()"), 624 KMP_ERR(error), __kmp_msg_null); 625 } 626 return error; 627 } 628 retval = SetThreadAffinityMask(GetCurrentThread(), newMask); 629 if (!retval) { 630 DWORD error = GetLastError(); 631 if (abort_on_error) { 632 __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"), 633 KMP_ERR(error), __kmp_msg_null); 634 } 635 return error; 636 } 637 newMask = SetThreadAffinityMask(GetCurrentThread(), retval); 638 if (!newMask) { 639 DWORD error = 
GetLastError(); 640 if (abort_on_error) { 641 __kmp_fatal(KMP_MSG(FunctionError, "SetThreadAffinityMask()"), 642 KMP_ERR(error), __kmp_msg_null); 643 } 644 } 645 *mask = retval; 646 } 647 return 0; 648 } 649 int get_proc_group() const override { 650 int group = -1; 651 if (__kmp_num_proc_groups == 1) { 652 return 1; 653 } 654 for (int i = 0; i < __kmp_num_proc_groups; i++) { 655 if (mask[i] == 0) 656 continue; 657 if (group >= 0) 658 return -1; 659 group = i; 660 } 661 return group; 662 } 663 }; 664 void determine_capable(const char *env_var) override { 665 __kmp_affinity_determine_capable(env_var); 666 } 667 void bind_thread(int which) override { __kmp_affinity_bind_thread(which); } 668 KMPAffinity::Mask *allocate_mask() override { return new Mask(); } 669 void deallocate_mask(KMPAffinity::Mask *m) override { delete m; } 670 KMPAffinity::Mask *allocate_mask_array(int num) override { 671 return new Mask[num]; 672 } 673 void deallocate_mask_array(KMPAffinity::Mask *array) override { 674 Mask *windows_array = static_cast<Mask *>(array); 675 delete[] windows_array; 676 } 677 KMPAffinity::Mask *index_mask_array(KMPAffinity::Mask *array, 678 int index) override { 679 Mask *windows_array = static_cast<Mask *>(array); 680 return &(windows_array[index]); 681 } 682 api_type get_api_type() const override { return NATIVE_OS; } 683 }; 684 #endif /* KMP_OS_WINDOWS */ 685 #endif /* KMP_AFFINITY_SUPPORTED */ 686 687 // Describe an attribute for a level in the machine topology 688 struct kmp_hw_attr_t { 689 int core_type : 8; 690 int core_eff : 8; 691 unsigned valid : 1; 692 unsigned reserved : 15; 693 694 static const int UNKNOWN_CORE_EFF = -1; 695 696 kmp_hw_attr_t() 697 : core_type(KMP_HW_CORE_TYPE_UNKNOWN), core_eff(UNKNOWN_CORE_EFF), 698 valid(0), reserved(0) {} 699 void set_core_type(kmp_hw_core_type_t type) { 700 valid = 1; 701 core_type = type; 702 } 703 void set_core_eff(int eff) { 704 valid = 1; 705 core_eff = eff; 706 } 707 kmp_hw_core_type_t get_core_type() const { 708 
return (kmp_hw_core_type_t)core_type; 709 } 710 int get_core_eff() const { return core_eff; } 711 bool is_core_type_valid() const { 712 return core_type != KMP_HW_CORE_TYPE_UNKNOWN; 713 } 714 bool is_core_eff_valid() const { return core_eff != UNKNOWN_CORE_EFF; } 715 operator bool() const { return valid; } 716 void clear() { 717 core_type = KMP_HW_CORE_TYPE_UNKNOWN; 718 core_eff = UNKNOWN_CORE_EFF; 719 valid = 0; 720 } 721 bool contains(const kmp_hw_attr_t &other) const { 722 if (!valid && !other.valid) 723 return true; 724 if (valid && other.valid) { 725 if (other.is_core_type_valid()) { 726 if (!is_core_type_valid() || (get_core_type() != other.get_core_type())) 727 return false; 728 } 729 if (other.is_core_eff_valid()) { 730 if (!is_core_eff_valid() || (get_core_eff() != other.get_core_eff())) 731 return false; 732 } 733 return true; 734 } 735 return false; 736 } 737 #if KMP_AFFINITY_SUPPORTED 738 bool contains(const kmp_affinity_attrs_t &attr) const { 739 if (!valid && !attr.valid) 740 return true; 741 if (valid && attr.valid) { 742 if (attr.core_type != KMP_HW_CORE_TYPE_UNKNOWN) 743 return (is_core_type_valid() && 744 (get_core_type() == (kmp_hw_core_type_t)attr.core_type)); 745 if (attr.core_eff != UNKNOWN_CORE_EFF) 746 return (is_core_eff_valid() && (get_core_eff() == attr.core_eff)); 747 return true; 748 } 749 return false; 750 } 751 #endif // KMP_AFFINITY_SUPPORTED 752 bool operator==(const kmp_hw_attr_t &rhs) const { 753 return (rhs.valid == valid && rhs.core_eff == core_eff && 754 rhs.core_type == core_type); 755 } 756 bool operator!=(const kmp_hw_attr_t &rhs) const { return !operator==(rhs); } 757 }; 758 759 #if KMP_AFFINITY_SUPPORTED 760 KMP_BUILD_ASSERT(sizeof(kmp_hw_attr_t) == sizeof(kmp_affinity_attrs_t)); 761 #endif 762 763 class kmp_hw_thread_t { 764 public: 765 static const int UNKNOWN_ID = -1; 766 static const int MULTIPLE_ID = -2; 767 static int compare_ids(const void *a, const void *b); 768 static int compare_compact(const void *a, const void 
  int ids[KMP_HW_LAST]; // id at each topology level, e.g. package/core/thread
  int sub_ids[KMP_HW_LAST]; // per-parent index at each topology level
  bool leader;
  int os_id; // OS-assigned processor number
  kmp_hw_attr_t attrs;

  void print() const;
  // Reset all level ids to UNKNOWN_ID and clear attributes.
  void clear() {
    for (int i = 0; i < (int)KMP_HW_LAST; ++i)
      ids[i] = UNKNOWN_ID;
    leader = false;
    attrs.clear();
  }
};

// The machine topology: an array of hardware threads plus per-level
// bookkeeping (types, counts, ratios, equivalences).
class kmp_topology_t {

  struct flags_t {
    int uniform : 1;
    int reserved : 31;
  };

  int depth;

  // The following arrays are all 'depth' long and have been
  // allocated to hold up to KMP_HW_LAST number of objects if
  // needed so layers can be added without reallocation of any array

  // Ordered array of the types in the topology
  kmp_hw_t *types;

  // Keep quick topology ratios, for non-uniform topologies,
  // this ratio holds the max number of itemAs per itemB
  // e.g., [ 4 packages | 6 cores / package | 2 threads / core ]
  int *ratio;

  // Storage containing the absolute number of each topology layer
  int *count;

  // The number of core efficiencies. This is only useful for hybrid
  // topologies. Core efficiencies will range from 0 to num efficiencies - 1
  int num_core_efficiencies;
  int num_core_types;
  kmp_hw_core_type_t core_types[KMP_HW_MAX_NUM_CORE_TYPES];

  // The hardware threads array
  // hw_threads is num_hw_threads long
  // Each hw_thread's ids and sub_ids are depth deep
  int num_hw_threads;
  kmp_hw_thread_t *hw_threads;

  // Equivalence hash where the key is the hardware topology item
  // and the value is the equivalent hardware topology type in the
  // types[] array, if the value is KMP_HW_UNKNOWN, then there is no
  // known equivalence for the topology type
  kmp_hw_t equivalent[KMP_HW_LAST];

  // Flags describing the topology
  flags_t flags;

  // Compact value used during sort_compact()
  int compact;

  // Insert a new topology layer after allocation
  void _insert_layer(kmp_hw_t type, const int *ids);

#if KMP_GROUP_AFFINITY
  // Insert topology information about Windows Processor groups
  void _insert_windows_proc_groups();
#endif

  // Count each item & get the num x's per y
  // e.g., get the number of cores and the number of threads per core
  // for each (x, y) in (KMP_HW_* , KMP_HW_*)
  void _gather_enumeration_information();

  // Remove layers that don't add information to the topology.
  // This is done by having the layer take on the id = UNKNOWN_ID (-1)
  void _remove_radix1_layers();

  // Find out if the topology is uniform
  void _discover_uniformity();

  // Set all the sub_ids for each hardware thread
  void _set_sub_ids();

  // Set global affinity variables describing the number of threads per
  // core, the number of packages, the number of cores per package, and
  // the number of cores.
  void _set_globals();

  // Set the last level cache equivalent type
  void _set_last_level_cache();

  // Return the number of cores with a particular attribute, 'attr'.
  // If 'find_all' is true, then find all cores on the machine, otherwise find
  // all cores per the layer 'above'
  int _get_ncores_with_attr(const kmp_hw_attr_t &attr, int above,
                            bool find_all = false) const;

public:
  // Force use of allocate()/deallocate()
  kmp_topology_t() = delete;
  kmp_topology_t(const kmp_topology_t &t) = delete;
  kmp_topology_t(kmp_topology_t &&t) = delete;
  kmp_topology_t &operator=(const kmp_topology_t &t) = delete;
  kmp_topology_t &operator=(kmp_topology_t &&t) = delete;

  static kmp_topology_t *allocate(int nproc, int ndepth, const kmp_hw_t *types);
  static void deallocate(kmp_topology_t *);

  // Functions used in create_map() routines
  kmp_hw_thread_t &at(int index) {
    KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
    return hw_threads[index];
  }
  const kmp_hw_thread_t &at(int index) const {
    KMP_DEBUG_ASSERT(index >= 0 && index < num_hw_threads);
    return hw_threads[index];
  }
  int get_num_hw_threads() const { return num_hw_threads; }
  // Sort hardware threads by their topology ids.
  void sort_ids() {
    qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
          kmp_hw_thread_t::compare_ids);
  }
  // Check if the hardware ids are unique, if they are
  // return true, otherwise return false
  bool check_ids() const;

  // Function to call after the create_map() routine
  void canonicalize();
  void canonicalize(int pkgs, int cores_per_pkg, int thr_per_core, int cores);

  // Functions used after canonicalize() called

#if KMP_AFFINITY_SUPPORTED
  // Set the granularity for affinity settings
  void set_granularity(kmp_affinity_t &stgs) const;
  bool is_close(int hwt1, int hwt2, const kmp_affinity_t &stgs) const;
  bool restrict_to_mask(const kmp_affin_mask_t *mask);
  bool filter_hw_subset();
#endif
  bool is_uniform() const { return flags.uniform; }
  // Tell whether a type is a valid type in the topology
  // returns KMP_HW_UNKNOWN when there is no equivalent type
  kmp_hw_t get_equivalent_type(kmp_hw_t type) const {
    if (type == KMP_HW_UNKNOWN)
      return KMP_HW_UNKNOWN;
    return equivalent[type];
  }
  // Set type1 = type2
  void set_equivalent_type(kmp_hw_t type1, kmp_hw_t type2) {
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type1);
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type2);
    kmp_hw_t real_type2 = equivalent[type2];
    if (real_type2 == KMP_HW_UNKNOWN)
      real_type2 = type2;
    equivalent[type1] = real_type2;
    // This loop is required since any of the types may have been set to
    // be equivalent to type1. They all must be checked and reset to type2.
    KMP_FOREACH_HW_TYPE(type) {
      if (equivalent[type] == type1) {
        equivalent[type] = real_type2;
      }
    }
  }
  // Calculate number of types corresponding to level1
  // per types corresponding to level2 (e.g., number of threads per core)
  int calculate_ratio(int level1, int level2) const {
    KMP_DEBUG_ASSERT(level1 >= 0 && level1 < depth);
    KMP_DEBUG_ASSERT(level2 >= 0 && level2 < depth);
    int r = 1;
    for (int level = level1; level > level2; --level)
      r *= ratio[level];
    return r;
  }
  int get_ratio(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return ratio[level];
  }
  int get_depth() const { return depth; };
  kmp_hw_t get_type(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return types[level];
  }
  // Map a hardware type to its level index in types[], or -1 if the type
  // (or an equivalent) is not present in the topology.
  int get_level(kmp_hw_t type) const {
    KMP_DEBUG_ASSERT_VALID_HW_TYPE(type);
    int eq_type = equivalent[type];
    if (eq_type == KMP_HW_UNKNOWN)
      return -1;
    for (int i = 0; i < depth; ++i)
      if (types[i] == eq_type)
        return i;
    return -1;
  }
  int get_count(int level) const {
    KMP_DEBUG_ASSERT(level >= 0 && level < depth);
    return count[level];
  }
  // Return the total number of cores with attribute 'attr'
  int get_ncores_with_attr(const kmp_hw_attr_t &attr) const {
    return _get_ncores_with_attr(attr, -1, true);
  }
  // Return the number of cores with attribute
  // 'attr' per topology level 'above'
  int get_ncores_with_attr_per(const kmp_hw_attr_t &attr, int above) const {
    return _get_ncores_with_attr(attr, above, false);
  }

#if KMP_AFFINITY_SUPPORTED
  friend int kmp_hw_thread_t::compare_compact(const void *a, const void *b);
  // Sort hardware threads according to the affinity 'compact' setting.
  void sort_compact(kmp_affinity_t &affinity) {
    compact = affinity.compact;
    qsort(hw_threads, num_hw_threads, sizeof(kmp_hw_thread_t),
          kmp_hw_thread_t::compare_compact);
  }
#endif
  void print(const char *env_var = "KMP_AFFINITY") const;
  void dump() const;
};
extern kmp_topology_t *__kmp_topology;

// Parsed representation of the KMP_HW_SUBSET environment variable.
class kmp_hw_subset_t {
  const static size_t MAX_ATTRS = KMP_HW_MAX_NUM_CORE_EFFS;

public:
  // Describe a machine topology item in KMP_HW_SUBSET
  struct item_t {
    kmp_hw_t type;
    int num_attrs;
    int num[MAX_ATTRS];
    int offset[MAX_ATTRS];
    kmp_hw_attr_t attr[MAX_ATTRS];
  };
  // Put parenthesis around max to avoid accidental use of Windows max macro.
1004 const static int USE_ALL = (std::numeric_limits<int>::max)(); 1005 1006 private: 1007 int depth; 1008 int capacity; 1009 item_t *items; 1010 kmp_uint64 set; 1011 bool absolute; 1012 // The set must be able to handle up to KMP_HW_LAST number of layers 1013 KMP_BUILD_ASSERT(sizeof(set) * 8 >= KMP_HW_LAST); 1014 // Sorting the KMP_HW_SUBSET items to follow topology order 1015 // All unknown topology types will be at the beginning of the subset 1016 static int hw_subset_compare(const void *i1, const void *i2) { 1017 kmp_hw_t type1 = ((const item_t *)i1)->type; 1018 kmp_hw_t type2 = ((const item_t *)i2)->type; 1019 int level1 = __kmp_topology->get_level(type1); 1020 int level2 = __kmp_topology->get_level(type2); 1021 return level1 - level2; 1022 } 1023 1024 public: 1025 // Force use of allocate()/deallocate() 1026 kmp_hw_subset_t() = delete; 1027 kmp_hw_subset_t(const kmp_hw_subset_t &t) = delete; 1028 kmp_hw_subset_t(kmp_hw_subset_t &&t) = delete; 1029 kmp_hw_subset_t &operator=(const kmp_hw_subset_t &t) = delete; 1030 kmp_hw_subset_t &operator=(kmp_hw_subset_t &&t) = delete; 1031 1032 static kmp_hw_subset_t *allocate() { 1033 int initial_capacity = 5; 1034 kmp_hw_subset_t *retval = 1035 (kmp_hw_subset_t *)__kmp_allocate(sizeof(kmp_hw_subset_t)); 1036 retval->depth = 0; 1037 retval->capacity = initial_capacity; 1038 retval->set = 0ull; 1039 retval->absolute = false; 1040 retval->items = (item_t *)__kmp_allocate(sizeof(item_t) * initial_capacity); 1041 return retval; 1042 } 1043 static void deallocate(kmp_hw_subset_t *subset) { 1044 __kmp_free(subset->items); 1045 __kmp_free(subset); 1046 } 1047 void set_absolute() { absolute = true; } 1048 bool is_absolute() const { return absolute; } 1049 void push_back(int num, kmp_hw_t type, int offset, kmp_hw_attr_t attr) { 1050 for (int i = 0; i < depth; ++i) { 1051 // Found an existing item for this layer type 1052 // Add the num, offset, and attr to this item 1053 if (items[i].type == type) { 1054 int idx = 
items[i].num_attrs++; 1055 if ((size_t)idx >= MAX_ATTRS) 1056 return; 1057 items[i].num[idx] = num; 1058 items[i].offset[idx] = offset; 1059 items[i].attr[idx] = attr; 1060 return; 1061 } 1062 } 1063 if (depth == capacity - 1) { 1064 capacity *= 2; 1065 item_t *new_items = (item_t *)__kmp_allocate(sizeof(item_t) * capacity); 1066 for (int i = 0; i < depth; ++i) 1067 new_items[i] = items[i]; 1068 __kmp_free(items); 1069 items = new_items; 1070 } 1071 items[depth].num_attrs = 1; 1072 items[depth].type = type; 1073 items[depth].num[0] = num; 1074 items[depth].offset[0] = offset; 1075 items[depth].attr[0] = attr; 1076 depth++; 1077 set |= (1ull << type); 1078 } 1079 int get_depth() const { return depth; } 1080 const item_t &at(int index) const { 1081 KMP_DEBUG_ASSERT(index >= 0 && index < depth); 1082 return items[index]; 1083 } 1084 item_t &at(int index) { 1085 KMP_DEBUG_ASSERT(index >= 0 && index < depth); 1086 return items[index]; 1087 } 1088 void remove(int index) { 1089 KMP_DEBUG_ASSERT(index >= 0 && index < depth); 1090 set &= ~(1ull << items[index].type); 1091 for (int j = index + 1; j < depth; ++j) { 1092 items[j - 1] = items[j]; 1093 } 1094 depth--; 1095 } 1096 void sort() { 1097 KMP_DEBUG_ASSERT(__kmp_topology); 1098 qsort(items, depth, sizeof(item_t), hw_subset_compare); 1099 } 1100 bool specified(kmp_hw_t type) const { return ((set & (1ull << type)) > 0); } 1101 void dump() const { 1102 printf("**********************\n"); 1103 printf("*** kmp_hw_subset: ***\n"); 1104 printf("* depth: %d\n", depth); 1105 printf("* items:\n"); 1106 for (int i = 0; i < depth; ++i) { 1107 printf(" type: %s\n", __kmp_hw_get_keyword(items[i].type)); 1108 for (int j = 0; j < items[i].num_attrs; ++j) { 1109 printf(" num: %d, offset: %d, attr: ", items[i].num[j], 1110 items[i].offset[j]); 1111 if (!items[i].attr[j]) { 1112 printf(" (none)\n"); 1113 } else { 1114 printf( 1115 " core_type = %s, core_eff = %d\n", 1116 __kmp_hw_get_core_type_string(items[i].attr[j].get_core_type()), 
              items[i].attr[j].get_core_eff());
        }
      }
    }
    printf("* set: 0x%llx\n", set);
    printf("* absolute: %d\n", absolute);
    printf("**********************\n");
  }
};
extern kmp_hw_subset_t *__kmp_hw_subset;

/* A structure for holding machine-specific hierarchy info to be computed once
   at init. This structure represents a mapping of threads to the actual machine
   hierarchy, or to our best guess at what the hierarchy might be, for the
   purpose of performing an efficient barrier. In the worst case, when there is
   no machine hierarchy information, it produces a tree suitable for a barrier,
   similar to the tree used in the hyper barrier. */
class hierarchy_info {
public:
  /* Good default values for number of leaves and branching factor, given no
     affinity information. Behaves a bit like hyper barrier. */
  static const kmp_uint32 maxLeaves = 4;
  static const kmp_uint32 minBranch = 4;
  /** Number of levels in the hierarchy. Typical levels are threads/core,
      cores/package or socket, packages/node, nodes/machine, etc. We don't want
      to get specific with nomenclature. When the machine is oversubscribed we
      add levels to duplicate the hierarchy, doubling the thread capacity of the
      hierarchy each time we add a level. */
  kmp_uint32 maxLevels;

  /** This is specifically the depth of the machine configuration hierarchy, in
      terms of the number of levels along the longest path from root to any
      leaf. It corresponds to the number of entries in numPerLevel if we exclude
      all but one trailing 1. */
  kmp_uint32 depth;
  kmp_uint32 base_num_threads; // thread count the arrays were sized for
  enum init_status { initialized = 0, not_initialized = 1, initializing = 2 };
  // State machine driven by CAS in init(); holds init_status values.
  volatile kmp_int8 uninitialized; // 0=initialized, 1=not initialized,
  // 2=initialization in progress
  volatile kmp_int8 resizing; // 0=not resizing, 1=resizing

  /** Level 0 corresponds to leaves. numPerLevel[i] is the number of children
      the parent of a node at level i has. For example, if we have a machine
      with 4 packages, 4 cores/package and 2 HT per core, then numPerLevel =
      {2, 4, 4, 1, 1}. All empty levels are set to 1. */
  kmp_uint32 *numPerLevel;
  // skipPerLevel[i] = cumulative product of numPerLevel[0..i-1]; points into
  // the second half of the single allocation backing numPerLevel.
  kmp_uint32 *skipPerLevel;

  // Fill numPerLevel from the machine topology's per-level ratios, leaf-first
  // (topology stores levels root-first, hence the reversed iteration).
  void deriveLevels() {
    int hier_depth = __kmp_topology->get_depth();
    for (int i = hier_depth - 1, level = 0; i >= 0; --i, ++level) {
      numPerLevel[level] = __kmp_topology->get_ratio(i);
    }
  }

  hierarchy_info()
      : maxLevels(7), depth(1), uninitialized(not_initialized), resizing(0) {}

  // Release the level arrays and return to the not_initialized state.
  // (numPerLevel and skipPerLevel share one allocation; freeing numPerLevel
  // frees both.)
  void fini() {
    if (!uninitialized && numPerLevel) {
      __kmp_free(numPerLevel);
      numPerLevel = NULL;
      uninitialized = not_initialized;
    }
  }

  // One-time initialization for 'num_addrs' threads. Exactly one caller wins
  // the CAS and builds the arrays; all others spin until it publishes
  // 'initialized'.
  void init(int num_addrs) {
    kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(
        &uninitialized, not_initialized, initializing);
    if (bool_result == 0) { // Wait for initialization
      while (TCR_1(uninitialized) != initialized)
        KMP_CPU_PAUSE();
      return;
    }
    KMP_DEBUG_ASSERT(bool_result == 1);

    /* Added explicit initialization of the data fields here to prevent usage of
       dirty value observed when static library is re-initialized multiple times
       (e.g. when non-OpenMP thread repeatedly launches/joins thread that uses
       OpenMP). */
    depth = 1;
    resizing = 0;
    maxLevels = 7;
    // Single allocation: first maxLevels entries are numPerLevel, the second
    // maxLevels entries are skipPerLevel.
    numPerLevel =
        (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
    skipPerLevel = &(numPerLevel[maxLevels]);
    for (kmp_uint32 i = 0; i < maxLevels;
         ++i) { // init numPerLevel[*] to 1 item per level
      numPerLevel[i] = 1;
      skipPerLevel[i] = 1;
    }

    // Derive levels from the machine topology when available; otherwise guess
    // a two-level tree of maxLeaves-wide leaf groups.
    if (__kmp_topology && __kmp_topology->get_depth() > 0) {
      deriveLevels();
    } else {
      numPerLevel[0] = maxLeaves;
      numPerLevel[1] = num_addrs / maxLeaves;
      if (num_addrs % maxLeaves)
        numPerLevel[1]++;
    }

    base_num_threads = num_addrs;
    for (int i = maxLevels - 1; i >= 0;
         --i) // count non-empty levels to get depth
      if (numPerLevel[i] != 1 || depth > 1) // only count one top-level '1'
        depth++;

    kmp_uint32 branch = minBranch;
    if (numPerLevel[0] == 1)
      branch = num_addrs / maxLeaves;
    if (branch < minBranch)
      branch = minBranch;
    // Repeatedly halve over-wide levels (rounding odd counts up), pushing the
    // excess into the level above, until each level's fan-out is within
    // 'branch' (and level 0 within maxLeaves).
    for (kmp_uint32 d = 0; d < depth - 1; ++d) { // optimize hierarchy width
      while (numPerLevel[d] > branch ||
             (d == 0 && numPerLevel[d] > maxLeaves)) { // max 4 on level 0!
        if (numPerLevel[d] & 1)
          numPerLevel[d]++;
        numPerLevel[d] = numPerLevel[d] >> 1;
        if (numPerLevel[d + 1] == 1)
          depth++;
        numPerLevel[d + 1] = numPerLevel[d + 1] << 1;
      }
      if (numPerLevel[0] == 1) {
        branch = branch >> 1;
        if (branch < 4)
          branch = minBranch;
      }
    }

    // skipPerLevel[i] = number of leaves under one node at level i.
    for (kmp_uint32 i = 1; i < depth; ++i)
      skipPerLevel[i] = numPerLevel[i - 1] * skipPerLevel[i - 1];
    // Fill in hierarchy in the case of oversubscription
    for (kmp_uint32 i = depth; i < maxLevels; ++i)
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];

    uninitialized = initialized; // One writer
  }

  // Resize the hierarchy if nproc changes to something larger than before.
  // Single-resizer protocol: losers of the CAS spin until the winner's resize
  // covers their nproc, then return.
  void resize(kmp_uint32 nproc) {
    kmp_int8 bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
    while (bool_result == 0) { // someone else is trying to resize
      KMP_CPU_PAUSE();
      if (nproc <= base_num_threads) // happy with other thread's resize
        return;
      else // try to resize
        bool_result = KMP_COMPARE_AND_STORE_ACQ8(&resizing, 0, 1);
    }
    KMP_DEBUG_ASSERT(bool_result != 0);
    if (nproc <= base_num_threads)
      return; // happy with other thread's resize

    // Calculate new maxLevels
    kmp_uint32 old_sz = skipPerLevel[depth - 1];
    kmp_uint32 incs = 0, old_maxLevels = maxLevels;
    // First see if old maxLevels is enough to contain new size
    for (kmp_uint32 i = depth; i < maxLevels && nproc > old_sz; ++i) {
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];
      numPerLevel[i - 1] *= 2;
      old_sz *= 2;
      depth++;
    }
    if (nproc > old_sz) { // Not enough space, need to expand hierarchy
      // Each extra level doubles capacity; count how many we need.
      while (nproc > old_sz) {
        old_sz *= 2;
        incs++;
        depth++;
      }
      maxLevels += incs;

      // Resize arrays (same paired-allocation layout as in init()).
      kmp_uint32 *old_numPerLevel = numPerLevel;
      kmp_uint32 *old_skipPerLevel = skipPerLevel;
      numPerLevel = skipPerLevel = NULL;
      numPerLevel =
          (kmp_uint32 *)__kmp_allocate(maxLevels * 2 * sizeof(kmp_uint32));
      skipPerLevel = &(numPerLevel[maxLevels]);

      // Copy old elements from old arrays
      for (kmp_uint32 i = 0; i < old_maxLevels; ++i) {
        numPerLevel[i] = old_numPerLevel[i];
        skipPerLevel[i] = old_skipPerLevel[i];
      }

      // Init new elements in arrays to 1
      for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i) {
        numPerLevel[i] = 1;
        skipPerLevel[i] = 1;
      }

      // Free old arrays (one allocation; skipPerLevel pointed into it)
      __kmp_free(old_numPerLevel);
    }

    // Fill in oversubscription levels of hierarchy
    for (kmp_uint32 i = old_maxLevels; i < maxLevels; ++i)
      skipPerLevel[i] = 2 * skipPerLevel[i - 1];

    base_num_threads = nproc;
    resizing = 0; // One writer
  }
};
#endif // KMP_AFFINITY_H