1 /*
2 //@HEADER
3 // ************************************************************************
4 //
5 // Kokkos v. 3.0
6 // Copyright (2020) National Technology & Engineering
7 // Solutions of Sandia, LLC (NTESS).
8 //
9 // Under the terms of Contract DE-NA0003525 with NTESS,
10 // the U.S. Government retains certain rights in this software.
11 //
12 // Redistribution and use in source and binary forms, with or without
13 // modification, are permitted provided that the following conditions are
14 // met:
15 //
16 // 1. Redistributions of source code must retain the above copyright
17 // notice, this list of conditions and the following disclaimer.
18 //
19 // 2. Redistributions in binary form must reproduce the above copyright
20 // notice, this list of conditions and the following disclaimer in the
21 // documentation and/or other materials provided with the distribution.
22 //
23 // 3. Neither the name of the Corporation nor the names of the
24 // contributors may be used to endorse or promote products derived from
25 // this software without specific prior written permission.
26 //
27 // THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
28 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
31 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
32 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
33 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
34 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 //
39 // Questions? Contact Christian R. Trott (crtrott@sandia.gov)
40 //
41 // ************************************************************************
42 //@HEADER
43 */
44
45 #define DEBUG_PRINT 0
46
47 #include <iostream>
48 #include <sstream>
49 #include <algorithm>
50
51 #include <Kokkos_Macros.hpp>
52 #include <Kokkos_Core.hpp>
53 #include <Kokkos_hwloc.hpp>
54 #include <impl/Kokkos_Error.hpp>
55
56 /*--------------------------------------------------------------------------*/
57 /*--------------------------------------------------------------------------*/
58
59 namespace Kokkos {
60 namespace hwloc {
61
62 /* Return 0 if asynchronous, 1 if synchronous and include process. */
thread_mapping(const char * const label,const bool allow_async,unsigned & thread_count,unsigned & use_numa_count,unsigned & use_cores_per_numa,std::pair<unsigned,unsigned> threads_coord[])63 unsigned thread_mapping(const char* const label, const bool allow_async,
64 unsigned& thread_count, unsigned& use_numa_count,
65 unsigned& use_cores_per_numa,
66 std::pair<unsigned, unsigned> threads_coord[]) {
67 const bool hwloc_avail = Kokkos::hwloc::available();
68 const unsigned avail_numa_count =
69 hwloc_avail ? hwloc::get_available_numa_count() : 1;
70 const unsigned avail_cores_per_numa =
71 hwloc_avail ? hwloc::get_available_cores_per_numa() : thread_count;
72 const unsigned avail_threads_per_core =
73 hwloc_avail ? hwloc::get_available_threads_per_core() : 1;
74
75 // (numa,core) coordinate of the process:
76 const std::pair<unsigned, unsigned> proc_coord =
77 Kokkos::hwloc::get_this_thread_coordinate();
78
79 //------------------------------------------------------------------------
80 // Defaults for unspecified inputs:
81
82 if (!use_numa_count) {
83 // Default to use all NUMA regions
84 use_numa_count = !thread_count
85 ? avail_numa_count
86 : (thread_count < avail_numa_count ? thread_count
87 : avail_numa_count);
88 }
89
90 if (!use_cores_per_numa) {
91 // Default to use all but one core if asynchronous, all cores if
92 // synchronous.
93 const unsigned threads_per_numa = thread_count / use_numa_count;
94
95 use_cores_per_numa =
96 !threads_per_numa
97 ? avail_cores_per_numa - (allow_async ? 1 : 0)
98 : (threads_per_numa < avail_cores_per_numa ? threads_per_numa
99 : avail_cores_per_numa);
100 }
101
102 if (!thread_count) {
103 thread_count = use_numa_count * use_cores_per_numa * avail_threads_per_core;
104 }
105
106 //------------------------------------------------------------------------
107 // Input verification:
108
109 const bool valid_numa = use_numa_count <= avail_numa_count;
110 const bool valid_cores =
111 use_cores_per_numa && use_cores_per_numa <= avail_cores_per_numa;
112 const bool valid_threads =
113 thread_count && thread_count <= use_numa_count * use_cores_per_numa *
114 avail_threads_per_core;
115 const bool balanced_numa = !(thread_count % use_numa_count);
116 const bool balanced_cores =
117 !(thread_count % (use_numa_count * use_cores_per_numa));
118
119 const bool valid_input = valid_numa && valid_cores && valid_threads &&
120 balanced_numa && balanced_cores;
121
122 if (!valid_input) {
123 std::ostringstream msg;
124
125 msg << label << " HWLOC ERROR(s)";
126
127 if (!valid_threads) {
128 msg << " : thread_count(" << thread_count << ") exceeds capacity("
129 << use_numa_count * use_cores_per_numa * avail_threads_per_core
130 << ")";
131 }
132 if (!valid_numa) {
133 msg << " : use_numa_count(" << use_numa_count << ") exceeds capacity("
134 << avail_numa_count << ")";
135 }
136 if (!valid_cores) {
137 msg << " : use_cores_per_numa(" << use_cores_per_numa
138 << ") exceeds capacity(" << avail_cores_per_numa << ")";
139 }
140 if (!balanced_numa) {
141 msg << " : thread_count(" << thread_count << ") imbalanced among numa("
142 << use_numa_count << ")";
143 }
144 if (!balanced_cores) {
145 msg << " : thread_count(" << thread_count << ") imbalanced among cores("
146 << use_numa_count * use_cores_per_numa << ")";
147 }
148
149 Kokkos::Impl::throw_runtime_exception(msg.str());
150 }
151
152 const unsigned thread_spawn_synchronous =
153 (allow_async && 1 < thread_count &&
154 (use_numa_count < avail_numa_count ||
155 use_cores_per_numa < avail_cores_per_numa))
156 ? 0 /* asyncronous */
157 : 1 /* synchronous, threads_coord[0] is process core */;
158
159 // Determine binding coordinates for to-be-spawned threads so that
160 // threads may be bound to cores as they are spawned.
161
162 const unsigned threads_per_core =
163 thread_count / (use_numa_count * use_cores_per_numa);
164
165 if (thread_spawn_synchronous) {
166 // Working synchronously and include process core as threads_coord[0].
167 // Swap the NUMA coordinate of the process core with 0
168 // Swap the CORE coordinate of the process core with 0
169 for (unsigned i = 0, inuma = avail_numa_count - use_numa_count;
170 inuma < avail_numa_count; ++inuma) {
171 const unsigned numa_coord = 0 == inuma
172 ? proc_coord.first
173 : (proc_coord.first == inuma ? 0 : inuma);
174 for (unsigned icore = avail_cores_per_numa - use_cores_per_numa;
175 icore < avail_cores_per_numa; ++icore) {
176 const unsigned core_coord =
177 0 == icore ? proc_coord.second
178 : (proc_coord.second == icore ? 0 : icore);
179 for (unsigned ith = 0; ith < threads_per_core; ++ith, ++i) {
180 threads_coord[i].first = numa_coord;
181 threads_coord[i].second = core_coord;
182 }
183 }
184 }
185 } else if (use_numa_count < avail_numa_count) {
186 // Working asynchronously and omit the process' NUMA region from the pool.
187 // Swap the NUMA coordinate of the process core with ( ( avail_numa_count -
188 // use_numa_count ) - 1 )
189 const unsigned numa_coord_swap = (avail_numa_count - use_numa_count) - 1;
190 for (unsigned i = 0, inuma = avail_numa_count - use_numa_count;
191 inuma < avail_numa_count; ++inuma) {
192 const unsigned numa_coord =
193 proc_coord.first == inuma ? numa_coord_swap : inuma;
194 for (unsigned icore = avail_cores_per_numa - use_cores_per_numa;
195 icore < avail_cores_per_numa; ++icore) {
196 const unsigned core_coord = icore;
197 for (unsigned ith = 0; ith < threads_per_core; ++ith, ++i) {
198 threads_coord[i].first = numa_coord;
199 threads_coord[i].second = core_coord;
200 }
201 }
202 }
203 } else if (use_cores_per_numa < avail_cores_per_numa) {
204 // Working asynchronously and omit the process' core from the pool.
205 // Swap the CORE coordinate of the process core with ( (
206 // avail_cores_per_numa - use_cores_per_numa ) - 1 )
207 const unsigned core_coord_swap =
208 (avail_cores_per_numa - use_cores_per_numa) - 1;
209 for (unsigned i = 0, inuma = avail_numa_count - use_numa_count;
210 inuma < avail_numa_count; ++inuma) {
211 const unsigned numa_coord = inuma;
212 for (unsigned icore = avail_cores_per_numa - use_cores_per_numa;
213 icore < avail_cores_per_numa; ++icore) {
214 const unsigned core_coord =
215 proc_coord.second == icore ? core_coord_swap : icore;
216 for (unsigned ith = 0; ith < threads_per_core; ++ith, ++i) {
217 threads_coord[i].first = numa_coord;
218 threads_coord[i].second = core_coord;
219 }
220 }
221 }
222 }
223
224 return thread_spawn_synchronous;
225 }
226
227 } /* namespace hwloc */
228 } /* namespace Kokkos */
229
230 /*--------------------------------------------------------------------------*/
231 /*--------------------------------------------------------------------------*/
232
233 #if defined(KOKKOS_ENABLE_HWLOC)
234
235 #include <iostream>
236 #include <sstream>
237 #include <stdexcept>
238
239 /*--------------------------------------------------------------------------*/
240 /* Third Party Libraries */
241
242 /* Hardware locality library: http://www.open-mpi.org/projects/hwloc/ */
243 #include <hwloc.h>
244
245 #define REQUIRED_HWLOC_API_VERSION 0x000010300
246
247 #if HWLOC_API_VERSION < REQUIRED_HWLOC_API_VERSION
248 #error \
249 "Requires http://www.open-mpi.org/projects/hwloc/ Version 1.3 or greater"
250 #endif
251
252 /*--------------------------------------------------------------------------*/
253
254 namespace Kokkos {
255 namespace hwloc {
256 namespace {
257
258 #if DEBUG_PRINT
259
print_bitmap(std::ostream & s,const hwloc_const_bitmap_t bitmap)260 inline void print_bitmap(std::ostream& s, const hwloc_const_bitmap_t bitmap) {
261 s << "{";
262 for (int i = hwloc_bitmap_first(bitmap); - 1 != i;
263 i = hwloc_bitmap_next(bitmap, i)) {
264 s << " " << i;
265 }
266 s << " }";
267 }
268
269 #endif
270
271 enum { MAX_CORE = 1024 };
272
273 std::pair<unsigned, unsigned> s_core_topology(0, 0);
274 unsigned s_core_capacity(0);
275 hwloc_topology_t s_hwloc_topology(0);
276 hwloc_bitmap_t s_hwloc_location(0);
277 hwloc_bitmap_t s_process_binding(0);
278 hwloc_bitmap_t s_core[MAX_CORE];
279 bool s_can_bind_threads(true);
280
// Ties the lifetime of the file-local hwloc state to a single object:
// the constructor discovers the topology, the destructor releases it.
struct Sentinel {
  Sentinel();
  ~Sentinel();
};
285
sentinel()286 bool sentinel() {
287 static Sentinel self;
288
289 if (0 == s_hwloc_topology) {
290 std::cerr << "Kokkos::hwloc ERROR : Called after return from main()"
291 << std::endl;
292 std::cerr.flush();
293 }
294
295 return 0 != s_hwloc_topology;
296 }
297
~Sentinel()298 Sentinel::~Sentinel() {
299 hwloc_topology_destroy(s_hwloc_topology);
300 hwloc_bitmap_free(s_process_binding);
301 hwloc_bitmap_free(s_hwloc_location);
302
303 s_core_topology.first = 0;
304 s_core_topology.second = 0;
305 s_core_capacity = 0;
306 s_hwloc_topology = 0;
307 s_hwloc_location = 0;
308 s_process_binding = 0;
309 }
310
Sentinel()311 Sentinel::Sentinel() {
312 #if defined(__MIC__)
313 static const bool remove_core_0 = true;
314 #else
315 static const bool remove_core_0 = false;
316 #endif
317
318 s_core_topology = std::pair<unsigned, unsigned>(0, 0);
319 s_core_capacity = 0;
320 s_hwloc_topology = 0;
321 s_hwloc_location = 0;
322 s_process_binding = 0;
323
324 for (unsigned i = 0; i < MAX_CORE; ++i) s_core[i] = 0;
325
326 hwloc_topology_init(&s_hwloc_topology);
327 hwloc_topology_load(s_hwloc_topology);
328
329 s_hwloc_location = hwloc_bitmap_alloc();
330 s_process_binding = hwloc_bitmap_alloc();
331
332 hwloc_get_cpubind(s_hwloc_topology, s_process_binding, HWLOC_CPUBIND_PROCESS);
333
334 if (hwloc_bitmap_iszero(s_process_binding)) {
335 if (Kokkos::show_warnings()) {
336 std::cerr << "WARNING: Cannot detect process binding -- ASSUMING ALL "
337 "processing units"
338 << std::endl;
339 }
340 const int pu_depth = hwloc_get_type_depth(s_hwloc_topology, HWLOC_OBJ_PU);
341 int num_pu = 1;
342 if (pu_depth != HWLOC_TYPE_DEPTH_UNKNOWN) {
343 num_pu = hwloc_get_nbobjs_by_depth(s_hwloc_topology, pu_depth);
344 } else {
345 if (Kokkos::show_warnings()) {
346 std::cerr << "WARNING: Cannot detect number of processing units -- "
347 "ASSUMING 1 (serial)."
348 << std::endl;
349 }
350 num_pu = 1;
351 }
352 hwloc_bitmap_set_range(s_process_binding, 0, num_pu - 1);
353 s_can_bind_threads = false;
354 }
355
356 if (remove_core_0) {
357 const hwloc_obj_t core =
358 hwloc_get_obj_by_type(s_hwloc_topology, HWLOC_OBJ_CORE, 0);
359
360 if (hwloc_bitmap_intersects(s_process_binding, core->cpuset)) {
361 hwloc_bitmap_t s_process_no_core_zero = hwloc_bitmap_alloc();
362
363 hwloc_bitmap_andnot(s_process_no_core_zero, s_process_binding,
364 core->cpuset);
365
366 bool ok =
367 0 == hwloc_set_cpubind(s_hwloc_topology, s_process_no_core_zero,
368 HWLOC_CPUBIND_PROCESS | HWLOC_CPUBIND_STRICT);
369
370 if (ok) {
371 hwloc_get_cpubind(s_hwloc_topology, s_process_binding,
372 HWLOC_CPUBIND_PROCESS);
373
374 ok = 0 !=
375 hwloc_bitmap_isequal(s_process_binding, s_process_no_core_zero);
376 }
377
378 hwloc_bitmap_free(s_process_no_core_zero);
379
380 if (Kokkos::show_warnings() && !ok) {
381 std::cerr << "WARNING: Kokkos::hwloc attempted and failed to move "
382 "process off of core #0"
383 << std::endl;
384 }
385 }
386 }
387
388 // Choose a hwloc object type for the NUMA level, which may not exist.
389
390 hwloc_obj_type_t root_type = HWLOC_OBJ_TYPE_MAX;
391
392 {
393 // Object types to search, in order.
394 static const hwloc_obj_type_t candidate_root_type[] = {
395 HWLOC_OBJ_NODE /* NUMA region */
396 ,
397 HWLOC_OBJ_SOCKET /* hardware socket */
398 ,
399 HWLOC_OBJ_MACHINE /* local machine */
400 };
401
402 enum {
403 CANDIDATE_ROOT_TYPE_COUNT =
404 sizeof(candidate_root_type) / sizeof(hwloc_obj_type_t)
405 };
406
407 for (int k = 0;
408 k < CANDIDATE_ROOT_TYPE_COUNT && HWLOC_OBJ_TYPE_MAX == root_type;
409 ++k) {
410 if (0 <
411 hwloc_get_nbobjs_by_type(s_hwloc_topology, candidate_root_type[k])) {
412 root_type = candidate_root_type[k];
413 }
414 }
415 }
416
417 // Determine which of these 'root' types are available to this process.
418 // The process may have been bound (e.g., by MPI) to a subset of these root
419 // types. Determine current location of the master (calling) process>
420
421 hwloc_bitmap_t proc_cpuset_location = hwloc_bitmap_alloc();
422
423 hwloc_get_last_cpu_location(s_hwloc_topology, proc_cpuset_location,
424 HWLOC_CPUBIND_THREAD);
425
426 const unsigned max_root =
427 hwloc_get_nbobjs_by_type(s_hwloc_topology, root_type);
428
429 unsigned root_base = max_root;
430 unsigned root_count = 0;
431 unsigned core_per_root = 0;
432 unsigned pu_per_core = 0;
433 bool symmetric = true;
434
435 for (unsigned i = 0; i < max_root; ++i) {
436 const hwloc_obj_t root =
437 hwloc_get_obj_by_type(s_hwloc_topology, root_type, i);
438
439 if (hwloc_bitmap_intersects(s_process_binding, root->cpuset)) {
440 ++root_count;
441
442 // Remember which root (NUMA) object the master thread is running on.
443 // This will be logical NUMA rank #0 for this process.
444
445 if (hwloc_bitmap_intersects(proc_cpuset_location, root->cpuset)) {
446 root_base = i;
447 }
448
449 // Count available cores:
450
451 const unsigned max_core = hwloc_get_nbobjs_inside_cpuset_by_type(
452 s_hwloc_topology, root->cpuset, HWLOC_OBJ_CORE);
453
454 unsigned core_count = 0;
455
456 for (unsigned j = 0; j < max_core; ++j) {
457 const hwloc_obj_t core = hwloc_get_obj_inside_cpuset_by_type(
458 s_hwloc_topology, root->cpuset, HWLOC_OBJ_CORE, j);
459
460 // If process' cpuset intersects core's cpuset then process can access
461 // this core. Must use intersection instead of inclusion because the
462 // Intel-Phi MPI may bind the process to only one of the core's
463 // hyperthreads.
464 //
465 // Assumption: if the process can access any hyperthread of the core
466 // then it has ownership of the entire core.
467 // This assumes that it would be performance-detrimental
468 // to spawn more than one MPI process per core and use nested threading.
469
470 if (hwloc_bitmap_intersects(s_process_binding, core->cpuset)) {
471 ++core_count;
472
473 const unsigned pu_count = hwloc_get_nbobjs_inside_cpuset_by_type(
474 s_hwloc_topology, core->cpuset, HWLOC_OBJ_PU);
475
476 if (pu_per_core == 0) pu_per_core = pu_count;
477
478 // Enforce symmetry by taking the minimum:
479
480 pu_per_core = std::min(pu_per_core, pu_count);
481
482 if (pu_count != pu_per_core) symmetric = false;
483 }
484 }
485
486 if (0 == core_per_root) core_per_root = core_count;
487
488 // Enforce symmetry by taking the minimum:
489
490 core_per_root = std::min(core_per_root, core_count);
491
492 if (core_count != core_per_root) symmetric = false;
493 }
494 }
495
496 s_core_topology.first = root_count;
497 s_core_topology.second = core_per_root;
498 s_core_capacity = pu_per_core;
499
500 // Fill the 's_core' array for fast mapping from a core coordinate to the
501 // hwloc cpuset object required for thread location querying and binding.
502
503 for (unsigned i = 0; i < max_root; ++i) {
504 const unsigned root_rank = (i + root_base) % max_root;
505
506 const hwloc_obj_t root =
507 hwloc_get_obj_by_type(s_hwloc_topology, root_type, root_rank);
508
509 if (hwloc_bitmap_intersects(s_process_binding, root->cpuset)) {
510 const unsigned max_core = hwloc_get_nbobjs_inside_cpuset_by_type(
511 s_hwloc_topology, root->cpuset, HWLOC_OBJ_CORE);
512
513 unsigned core_count = 0;
514
515 for (unsigned j = 0; j < max_core && core_count < core_per_root; ++j) {
516 const hwloc_obj_t core = hwloc_get_obj_inside_cpuset_by_type(
517 s_hwloc_topology, root->cpuset, HWLOC_OBJ_CORE, j);
518
519 if (hwloc_bitmap_intersects(s_process_binding, core->cpuset)) {
520 s_core[core_count + core_per_root * i] = core->cpuset;
521
522 ++core_count;
523 }
524 }
525 }
526 }
527
528 hwloc_bitmap_free(proc_cpuset_location);
529
530 if (Kokkos::show_warnings() && !symmetric) {
531 std::cerr << "Kokkos::hwloc WARNING: Using a symmetric subset of a "
532 "non-symmetric core topology."
533 << std::endl;
534 }
535 }
536
537 } // namespace
538
539 //----------------------------------------------------------------------------
540 //----------------------------------------------------------------------------
541
// This translation unit was built with KOKKOS_ENABLE_HWLOC, so hwloc
// queries are backed by the real library.
bool available() {
  return true;
}
543
get_available_numa_count()544 unsigned get_available_numa_count() {
545 sentinel();
546 return s_core_topology.first;
547 }
548
get_available_cores_per_numa()549 unsigned get_available_cores_per_numa() {
550 sentinel();
551 return s_core_topology.second;
552 }
553
get_available_threads_per_core()554 unsigned get_available_threads_per_core() {
555 sentinel();
556 return s_core_capacity;
557 }
558
can_bind_threads()559 bool can_bind_threads() {
560 sentinel();
561 return s_can_bind_threads;
562 }
563
564 //----------------------------------------------------------------------------
565 //----------------------------------------------------------------------------
566
bind_this_thread(const unsigned coordinate_count,std::pair<unsigned,unsigned> coordinate[])567 unsigned bind_this_thread(const unsigned coordinate_count,
568 std::pair<unsigned, unsigned> coordinate[]) {
569 unsigned i = 0;
570
571 try {
572 const std::pair<unsigned, unsigned> current = get_this_thread_coordinate();
573
574 // Match one of the requests:
575 for (i = 0; i < coordinate_count && current != coordinate[i]; ++i)
576 ;
577
578 if (coordinate_count == i) {
579 // Match the first request (typically NUMA):
580 for (i = 0; i < coordinate_count && current.first != coordinate[i].first;
581 ++i)
582 ;
583 }
584
585 if (coordinate_count == i) {
586 // Match any unclaimed request:
587 for (i = 0; i < coordinate_count && ~0u == coordinate[i].first; ++i)
588 ;
589 }
590
591 if (coordinate_count == i || !bind_this_thread(coordinate[i])) {
592 // Failed to bind:
593 i = ~0u;
594 }
595
596 if (i < coordinate_count) {
597 #if DEBUG_PRINT
598 if (current != coordinate[i]) {
599 std::cout << " bind_this_thread: rebinding from (" << current.first
600 << "," << current.second << ") to (" << coordinate[i].first
601 << "," << coordinate[i].second << ")" << std::endl;
602 }
603 #endif
604
605 coordinate[i].first = ~0u;
606 coordinate[i].second = ~0u;
607 }
608 } catch (...) {
609 i = ~0u;
610 }
611
612 return i;
613 }
614
bind_this_thread(const std::pair<unsigned,unsigned> coord)615 bool bind_this_thread(const std::pair<unsigned, unsigned> coord) {
616 if (!sentinel()) return false;
617
618 #if DEBUG_PRINT
619
620 std::cout << "Kokkos::bind_this_thread() at ";
621
622 hwloc_get_last_cpu_location(s_hwloc_topology, s_hwloc_location,
623 HWLOC_CPUBIND_THREAD);
624
625 print_bitmap(std::cout, s_hwloc_location);
626
627 std::cout << " to ";
628
629 print_bitmap(std::cout,
630 s_core[coord.second + coord.first * s_core_topology.second]);
631
632 std::cout << std::endl;
633
634 #endif
635
636 // As safe and fast as possible.
637 // Fast-lookup by caching the coordinate -> hwloc cpuset mapping in 's_core'.
638 return coord.first < s_core_topology.first &&
639 coord.second < s_core_topology.second &&
640 0 == hwloc_set_cpubind(
641 s_hwloc_topology,
642 s_core[coord.second + coord.first * s_core_topology.second],
643 HWLOC_CPUBIND_THREAD | HWLOC_CPUBIND_STRICT);
644 }
645
unbind_this_thread()646 bool unbind_this_thread() {
647 if (!sentinel()) return false;
648
649 #define HWLOC_DEBUG_PRINT 0
650
651 #if HWLOC_DEBUG_PRINT
652
653 std::cout << "Kokkos::unbind_this_thread() from ";
654
655 hwloc_get_cpubind(s_hwloc_topology, s_hwloc_location, HWLOC_CPUBIND_THREAD);
656
657 print_bitmap(std::cout, s_hwloc_location);
658
659 #endif
660
661 const bool result =
662 s_hwloc_topology &&
663 0 == hwloc_set_cpubind(s_hwloc_topology, s_process_binding,
664 HWLOC_CPUBIND_THREAD | HWLOC_CPUBIND_STRICT);
665
666 #if HWLOC_DEBUG_PRINT
667
668 std::cout << " to ";
669
670 hwloc_get_cpubind(s_hwloc_topology, s_hwloc_location, HWLOC_CPUBIND_THREAD);
671
672 print_bitmap(std::cout, s_hwloc_location);
673
674 std::cout << std::endl;
675
676 #endif
677
678 return result;
679
680 #undef HWLOC_DEBUG_PRINT
681 }
682
683 //----------------------------------------------------------------------------
684
get_this_thread_coordinate()685 std::pair<unsigned, unsigned> get_this_thread_coordinate() {
686 std::pair<unsigned, unsigned> coord(0u, 0u);
687
688 if (!sentinel()) return coord;
689
690 const unsigned n = s_core_topology.first * s_core_topology.second;
691
692 // Using the pre-allocated 's_hwloc_location' to avoid memory
693 // allocation by this thread. This call is NOT thread-safe.
694 hwloc_get_last_cpu_location(s_hwloc_topology, s_hwloc_location,
695 HWLOC_CPUBIND_THREAD);
696
697 unsigned i = 0;
698
699 while (i < n && !hwloc_bitmap_intersects(s_hwloc_location, s_core[i])) ++i;
700
701 if (i < n) {
702 coord.first = i / s_core_topology.second;
703 coord.second = i % s_core_topology.second;
704 }
705
706 return coord;
707 }
708
709 //----------------------------------------------------------------------------
710
711 } /* namespace hwloc */
712 } /* namespace Kokkos */
713
714 //----------------------------------------------------------------------------
715 //----------------------------------------------------------------------------
716
717 #else /* ! defined( KOKKOS_ENABLE_HWLOC ) */
718
namespace Kokkos {
namespace hwloc {

// Fallback implementation used when KOKKOS_ENABLE_HWLOC is not defined:
// every query reports a single NUMA region with one core and one thread,
// and every binding request is declined.

// hwloc support is not part of this build.
bool available() {
  return false;
}

// Threads cannot be bound without hwloc.
bool can_bind_threads() {
  return false;
}

// A single NUMA region is reported.
unsigned get_available_numa_count() {
  return 1;
}

// A single core per NUMA region is reported.
unsigned get_available_cores_per_numa() {
  return 1;
}

// A single thread per core is reported.
unsigned get_available_threads_per_core() {
  return 1;
}

// No request is ever claimed: always returns the failure value ~0u and
// leaves the requests untouched.
unsigned bind_this_thread(const unsigned, std::pair<unsigned, unsigned>[]) {
  const unsigned failed = ~0u;
  return failed;
}

// Binding to a specific coordinate always fails.
bool bind_this_thread(const std::pair<unsigned, unsigned>) {
  return false;
}

// Nothing was bound, so unbinding trivially succeeds.
bool unbind_this_thread() {
  return true;
}

// The calling thread is always reported at coordinate (0,0).
std::pair<unsigned, unsigned> get_this_thread_coordinate() {
  const std::pair<unsigned, unsigned> origin(0, 0);
  return origin;
}

}  // namespace hwloc
}  // namespace Kokkos
743
744 //----------------------------------------------------------------------------
745 //----------------------------------------------------------------------------
746
747 #endif
748