1 /*
2 //@HEADER
3 // ************************************************************************
4 //
5 //                        Kokkos v. 3.0
6 //       Copyright (2020) National Technology & Engineering
7 //               Solutions of Sandia, LLC (NTESS).
8 //
9 // Under the terms of Contract DE-NA0003525 with NTESS,
10 // the U.S. Government retains certain rights in this software.
11 //
12 // Redistribution and use in source and binary forms, with or without
13 // modification, are permitted provided that the following conditions are
14 // met:
15 //
16 // 1. Redistributions of source code must retain the above copyright
17 // notice, this list of conditions and the following disclaimer.
18 //
19 // 2. Redistributions in binary form must reproduce the above copyright
20 // notice, this list of conditions and the following disclaimer in the
21 // documentation and/or other materials provided with the distribution.
22 //
23 // 3. Neither the name of the Corporation nor the names of the
24 // contributors may be used to endorse or promote products derived from
25 // this software without specific prior written permission.
26 //
27 // THIS SOFTWARE IS PROVIDED BY NTESS "AS IS" AND ANY
28 // EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
29 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
30 // PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL NTESS OR THE
31 // CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
32 // EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
33 // PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
34 // PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
35 // LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
36 // NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
37 // SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
38 //
39 // Questions? Contact Christian R. Trott (crtrott@sandia.gov)
40 //
41 // ************************************************************************
42 //@HEADER
43 */
44 
45 #define DEBUG_PRINT 0
46 
47 #include <iostream>
48 #include <sstream>
49 #include <algorithm>
50 
51 #include <Kokkos_Macros.hpp>
52 #include <Kokkos_Core.hpp>
53 #include <Kokkos_hwloc.hpp>
54 #include <impl/Kokkos_Error.hpp>
55 
56 /*--------------------------------------------------------------------------*/
57 /*--------------------------------------------------------------------------*/
58 
59 namespace Kokkos {
60 namespace hwloc {
61 
62 /* Return 0 if asynchronous, 1 if synchronous and include process. */
thread_mapping(const char * const label,const bool allow_async,unsigned & thread_count,unsigned & use_numa_count,unsigned & use_cores_per_numa,std::pair<unsigned,unsigned> threads_coord[])63 unsigned thread_mapping(const char* const label, const bool allow_async,
64                         unsigned& thread_count, unsigned& use_numa_count,
65                         unsigned& use_cores_per_numa,
66                         std::pair<unsigned, unsigned> threads_coord[]) {
67   const bool hwloc_avail = Kokkos::hwloc::available();
68   const unsigned avail_numa_count =
69       hwloc_avail ? hwloc::get_available_numa_count() : 1;
70   const unsigned avail_cores_per_numa =
71       hwloc_avail ? hwloc::get_available_cores_per_numa() : thread_count;
72   const unsigned avail_threads_per_core =
73       hwloc_avail ? hwloc::get_available_threads_per_core() : 1;
74 
75   // (numa,core) coordinate of the process:
76   const std::pair<unsigned, unsigned> proc_coord =
77       Kokkos::hwloc::get_this_thread_coordinate();
78 
79   //------------------------------------------------------------------------
80   // Defaults for unspecified inputs:
81 
82   if (!use_numa_count) {
83     // Default to use all NUMA regions
84     use_numa_count = !thread_count
85                          ? avail_numa_count
86                          : (thread_count < avail_numa_count ? thread_count
87                                                             : avail_numa_count);
88   }
89 
90   if (!use_cores_per_numa) {
91     // Default to use all but one core if asynchronous, all cores if
92     // synchronous.
93     const unsigned threads_per_numa = thread_count / use_numa_count;
94 
95     use_cores_per_numa =
96         !threads_per_numa
97             ? avail_cores_per_numa - (allow_async ? 1 : 0)
98             : (threads_per_numa < avail_cores_per_numa ? threads_per_numa
99                                                        : avail_cores_per_numa);
100   }
101 
102   if (!thread_count) {
103     thread_count = use_numa_count * use_cores_per_numa * avail_threads_per_core;
104   }
105 
106   //------------------------------------------------------------------------
107   // Input verification:
108 
109   const bool valid_numa = use_numa_count <= avail_numa_count;
110   const bool valid_cores =
111       use_cores_per_numa && use_cores_per_numa <= avail_cores_per_numa;
112   const bool valid_threads =
113       thread_count && thread_count <= use_numa_count * use_cores_per_numa *
114                                           avail_threads_per_core;
115   const bool balanced_numa = !(thread_count % use_numa_count);
116   const bool balanced_cores =
117       !(thread_count % (use_numa_count * use_cores_per_numa));
118 
119   const bool valid_input = valid_numa && valid_cores && valid_threads &&
120                            balanced_numa && balanced_cores;
121 
122   if (!valid_input) {
123     std::ostringstream msg;
124 
125     msg << label << " HWLOC ERROR(s)";
126 
127     if (!valid_threads) {
128       msg << " : thread_count(" << thread_count << ") exceeds capacity("
129           << use_numa_count * use_cores_per_numa * avail_threads_per_core
130           << ")";
131     }
132     if (!valid_numa) {
133       msg << " : use_numa_count(" << use_numa_count << ") exceeds capacity("
134           << avail_numa_count << ")";
135     }
136     if (!valid_cores) {
137       msg << " : use_cores_per_numa(" << use_cores_per_numa
138           << ") exceeds capacity(" << avail_cores_per_numa << ")";
139     }
140     if (!balanced_numa) {
141       msg << " : thread_count(" << thread_count << ") imbalanced among numa("
142           << use_numa_count << ")";
143     }
144     if (!balanced_cores) {
145       msg << " : thread_count(" << thread_count << ") imbalanced among cores("
146           << use_numa_count * use_cores_per_numa << ")";
147     }
148 
149     Kokkos::Impl::throw_runtime_exception(msg.str());
150   }
151 
152   const unsigned thread_spawn_synchronous =
153       (allow_async && 1 < thread_count &&
154        (use_numa_count < avail_numa_count ||
155         use_cores_per_numa < avail_cores_per_numa))
156           ? 0 /* asyncronous */
157           : 1 /* synchronous, threads_coord[0] is process core */;
158 
159   // Determine binding coordinates for to-be-spawned threads so that
160   // threads may be bound to cores as they are spawned.
161 
162   const unsigned threads_per_core =
163       thread_count / (use_numa_count * use_cores_per_numa);
164 
165   if (thread_spawn_synchronous) {
166     // Working synchronously and include process core as threads_coord[0].
167     // Swap the NUMA coordinate of the process core with 0
168     // Swap the CORE coordinate of the process core with 0
169     for (unsigned i = 0, inuma = avail_numa_count - use_numa_count;
170          inuma < avail_numa_count; ++inuma) {
171       const unsigned numa_coord = 0 == inuma
172                                       ? proc_coord.first
173                                       : (proc_coord.first == inuma ? 0 : inuma);
174       for (unsigned icore = avail_cores_per_numa - use_cores_per_numa;
175            icore < avail_cores_per_numa; ++icore) {
176         const unsigned core_coord =
177             0 == icore ? proc_coord.second
178                        : (proc_coord.second == icore ? 0 : icore);
179         for (unsigned ith = 0; ith < threads_per_core; ++ith, ++i) {
180           threads_coord[i].first  = numa_coord;
181           threads_coord[i].second = core_coord;
182         }
183       }
184     }
185   } else if (use_numa_count < avail_numa_count) {
186     // Working asynchronously and omit the process' NUMA region from the pool.
187     // Swap the NUMA coordinate of the process core with ( ( avail_numa_count -
188     // use_numa_count ) - 1 )
189     const unsigned numa_coord_swap = (avail_numa_count - use_numa_count) - 1;
190     for (unsigned i = 0, inuma = avail_numa_count - use_numa_count;
191          inuma < avail_numa_count; ++inuma) {
192       const unsigned numa_coord =
193           proc_coord.first == inuma ? numa_coord_swap : inuma;
194       for (unsigned icore = avail_cores_per_numa - use_cores_per_numa;
195            icore < avail_cores_per_numa; ++icore) {
196         const unsigned core_coord = icore;
197         for (unsigned ith = 0; ith < threads_per_core; ++ith, ++i) {
198           threads_coord[i].first  = numa_coord;
199           threads_coord[i].second = core_coord;
200         }
201       }
202     }
203   } else if (use_cores_per_numa < avail_cores_per_numa) {
204     // Working asynchronously and omit the process' core from the pool.
205     // Swap the CORE coordinate of the process core with ( (
206     // avail_cores_per_numa - use_cores_per_numa ) - 1 )
207     const unsigned core_coord_swap =
208         (avail_cores_per_numa - use_cores_per_numa) - 1;
209     for (unsigned i = 0, inuma = avail_numa_count - use_numa_count;
210          inuma < avail_numa_count; ++inuma) {
211       const unsigned numa_coord = inuma;
212       for (unsigned icore = avail_cores_per_numa - use_cores_per_numa;
213            icore < avail_cores_per_numa; ++icore) {
214         const unsigned core_coord =
215             proc_coord.second == icore ? core_coord_swap : icore;
216         for (unsigned ith = 0; ith < threads_per_core; ++ith, ++i) {
217           threads_coord[i].first  = numa_coord;
218           threads_coord[i].second = core_coord;
219         }
220       }
221     }
222   }
223 
224   return thread_spawn_synchronous;
225 }
226 
227 } /* namespace hwloc */
228 } /* namespace Kokkos */
229 
230 /*--------------------------------------------------------------------------*/
231 /*--------------------------------------------------------------------------*/
232 
233 #if defined(KOKKOS_ENABLE_HWLOC)
234 
235 #include <iostream>
236 #include <sstream>
237 #include <stdexcept>
238 
239 /*--------------------------------------------------------------------------*/
240 /* Third Party Libraries */
241 
242 /* Hardware locality library: http://www.open-mpi.org/projects/hwloc/ */
243 #include <hwloc.h>
244 
245 #define REQUIRED_HWLOC_API_VERSION 0x000010300
246 
247 #if HWLOC_API_VERSION < REQUIRED_HWLOC_API_VERSION
248 #error \
249     "Requires  http://www.open-mpi.org/projects/hwloc/  Version 1.3 or greater"
250 #endif
251 
252 /*--------------------------------------------------------------------------*/
253 
254 namespace Kokkos {
255 namespace hwloc {
256 namespace {
257 
258 #if DEBUG_PRINT
259 
print_bitmap(std::ostream & s,const hwloc_const_bitmap_t bitmap)260 inline void print_bitmap(std::ostream& s, const hwloc_const_bitmap_t bitmap) {
261   s << "{";
262   for (int i = hwloc_bitmap_first(bitmap); - 1 != i;
263        i     = hwloc_bitmap_next(bitmap, i)) {
264     s << " " << i;
265   }
266   s << " }";
267 }
268 
269 #endif
270 
271 enum { MAX_CORE = 1024 };
272 
273 std::pair<unsigned, unsigned> s_core_topology(0, 0);
274 unsigned s_core_capacity(0);
275 hwloc_topology_t s_hwloc_topology(0);
276 hwloc_bitmap_t s_hwloc_location(0);
277 hwloc_bitmap_t s_process_binding(0);
278 hwloc_bitmap_t s_core[MAX_CORE];
279 bool s_can_bind_threads(true);
280 
// Lifetime guard for the file-local hwloc state: the constructor loads the
// topology and fills the 's_*' variables, the destructor releases them.
struct Sentinel {
  Sentinel();
  ~Sentinel();
};
285 
sentinel()286 bool sentinel() {
287   static Sentinel self;
288 
289   if (0 == s_hwloc_topology) {
290     std::cerr << "Kokkos::hwloc ERROR : Called after return from main()"
291               << std::endl;
292     std::cerr.flush();
293   }
294 
295   return 0 != s_hwloc_topology;
296 }
297 
~Sentinel()298 Sentinel::~Sentinel() {
299   hwloc_topology_destroy(s_hwloc_topology);
300   hwloc_bitmap_free(s_process_binding);
301   hwloc_bitmap_free(s_hwloc_location);
302 
303   s_core_topology.first  = 0;
304   s_core_topology.second = 0;
305   s_core_capacity        = 0;
306   s_hwloc_topology       = 0;
307   s_hwloc_location       = 0;
308   s_process_binding      = 0;
309 }
310 
Sentinel()311 Sentinel::Sentinel() {
312 #if defined(__MIC__)
313   static const bool remove_core_0 = true;
314 #else
315   static const bool remove_core_0 = false;
316 #endif
317 
318   s_core_topology   = std::pair<unsigned, unsigned>(0, 0);
319   s_core_capacity   = 0;
320   s_hwloc_topology  = 0;
321   s_hwloc_location  = 0;
322   s_process_binding = 0;
323 
324   for (unsigned i = 0; i < MAX_CORE; ++i) s_core[i] = 0;
325 
326   hwloc_topology_init(&s_hwloc_topology);
327   hwloc_topology_load(s_hwloc_topology);
328 
329   s_hwloc_location  = hwloc_bitmap_alloc();
330   s_process_binding = hwloc_bitmap_alloc();
331 
332   hwloc_get_cpubind(s_hwloc_topology, s_process_binding, HWLOC_CPUBIND_PROCESS);
333 
334   if (hwloc_bitmap_iszero(s_process_binding)) {
335     if (Kokkos::show_warnings()) {
336       std::cerr << "WARNING: Cannot detect process binding -- ASSUMING ALL "
337                    "processing units"
338                 << std::endl;
339     }
340     const int pu_depth = hwloc_get_type_depth(s_hwloc_topology, HWLOC_OBJ_PU);
341     int num_pu         = 1;
342     if (pu_depth != HWLOC_TYPE_DEPTH_UNKNOWN) {
343       num_pu = hwloc_get_nbobjs_by_depth(s_hwloc_topology, pu_depth);
344     } else {
345       if (Kokkos::show_warnings()) {
346         std::cerr << "WARNING: Cannot detect number of processing units -- "
347                      "ASSUMING 1 (serial)."
348                   << std::endl;
349       }
350       num_pu = 1;
351     }
352     hwloc_bitmap_set_range(s_process_binding, 0, num_pu - 1);
353     s_can_bind_threads = false;
354   }
355 
356   if (remove_core_0) {
357     const hwloc_obj_t core =
358         hwloc_get_obj_by_type(s_hwloc_topology, HWLOC_OBJ_CORE, 0);
359 
360     if (hwloc_bitmap_intersects(s_process_binding, core->cpuset)) {
361       hwloc_bitmap_t s_process_no_core_zero = hwloc_bitmap_alloc();
362 
363       hwloc_bitmap_andnot(s_process_no_core_zero, s_process_binding,
364                           core->cpuset);
365 
366       bool ok =
367           0 == hwloc_set_cpubind(s_hwloc_topology, s_process_no_core_zero,
368                                  HWLOC_CPUBIND_PROCESS | HWLOC_CPUBIND_STRICT);
369 
370       if (ok) {
371         hwloc_get_cpubind(s_hwloc_topology, s_process_binding,
372                           HWLOC_CPUBIND_PROCESS);
373 
374         ok = 0 !=
375              hwloc_bitmap_isequal(s_process_binding, s_process_no_core_zero);
376       }
377 
378       hwloc_bitmap_free(s_process_no_core_zero);
379 
380       if (Kokkos::show_warnings() && !ok) {
381         std::cerr << "WARNING: Kokkos::hwloc attempted and failed to move "
382                      "process off of core #0"
383                   << std::endl;
384       }
385     }
386   }
387 
388   // Choose a hwloc object type for the NUMA level, which may not exist.
389 
390   hwloc_obj_type_t root_type = HWLOC_OBJ_TYPE_MAX;
391 
392   {
393     // Object types to search, in order.
394     static const hwloc_obj_type_t candidate_root_type[] = {
395         HWLOC_OBJ_NODE /* NUMA region     */
396         ,
397         HWLOC_OBJ_SOCKET /* hardware socket */
398         ,
399         HWLOC_OBJ_MACHINE /* local machine   */
400     };
401 
402     enum {
403       CANDIDATE_ROOT_TYPE_COUNT =
404           sizeof(candidate_root_type) / sizeof(hwloc_obj_type_t)
405     };
406 
407     for (int k = 0;
408          k < CANDIDATE_ROOT_TYPE_COUNT && HWLOC_OBJ_TYPE_MAX == root_type;
409          ++k) {
410       if (0 <
411           hwloc_get_nbobjs_by_type(s_hwloc_topology, candidate_root_type[k])) {
412         root_type = candidate_root_type[k];
413       }
414     }
415   }
416 
417   // Determine which of these 'root' types are available to this process.
418   // The process may have been bound (e.g., by MPI) to a subset of these root
419   // types. Determine current location of the master (calling) process>
420 
421   hwloc_bitmap_t proc_cpuset_location = hwloc_bitmap_alloc();
422 
423   hwloc_get_last_cpu_location(s_hwloc_topology, proc_cpuset_location,
424                               HWLOC_CPUBIND_THREAD);
425 
426   const unsigned max_root =
427       hwloc_get_nbobjs_by_type(s_hwloc_topology, root_type);
428 
429   unsigned root_base     = max_root;
430   unsigned root_count    = 0;
431   unsigned core_per_root = 0;
432   unsigned pu_per_core   = 0;
433   bool symmetric         = true;
434 
435   for (unsigned i = 0; i < max_root; ++i) {
436     const hwloc_obj_t root =
437         hwloc_get_obj_by_type(s_hwloc_topology, root_type, i);
438 
439     if (hwloc_bitmap_intersects(s_process_binding, root->cpuset)) {
440       ++root_count;
441 
442       // Remember which root (NUMA) object the master thread is running on.
443       // This will be logical NUMA rank #0 for this process.
444 
445       if (hwloc_bitmap_intersects(proc_cpuset_location, root->cpuset)) {
446         root_base = i;
447       }
448 
449       // Count available cores:
450 
451       const unsigned max_core = hwloc_get_nbobjs_inside_cpuset_by_type(
452           s_hwloc_topology, root->cpuset, HWLOC_OBJ_CORE);
453 
454       unsigned core_count = 0;
455 
456       for (unsigned j = 0; j < max_core; ++j) {
457         const hwloc_obj_t core = hwloc_get_obj_inside_cpuset_by_type(
458             s_hwloc_topology, root->cpuset, HWLOC_OBJ_CORE, j);
459 
460         // If process' cpuset intersects core's cpuset then process can access
461         // this core. Must use intersection instead of inclusion because the
462         // Intel-Phi MPI may bind the process to only one of the core's
463         // hyperthreads.
464         //
465         // Assumption: if the process can access any hyperthread of the core
466         // then it has ownership of the entire core.
467         // This assumes that it would be performance-detrimental
468         // to spawn more than one MPI process per core and use nested threading.
469 
470         if (hwloc_bitmap_intersects(s_process_binding, core->cpuset)) {
471           ++core_count;
472 
473           const unsigned pu_count = hwloc_get_nbobjs_inside_cpuset_by_type(
474               s_hwloc_topology, core->cpuset, HWLOC_OBJ_PU);
475 
476           if (pu_per_core == 0) pu_per_core = pu_count;
477 
478           // Enforce symmetry by taking the minimum:
479 
480           pu_per_core = std::min(pu_per_core, pu_count);
481 
482           if (pu_count != pu_per_core) symmetric = false;
483         }
484       }
485 
486       if (0 == core_per_root) core_per_root = core_count;
487 
488       // Enforce symmetry by taking the minimum:
489 
490       core_per_root = std::min(core_per_root, core_count);
491 
492       if (core_count != core_per_root) symmetric = false;
493     }
494   }
495 
496   s_core_topology.first  = root_count;
497   s_core_topology.second = core_per_root;
498   s_core_capacity        = pu_per_core;
499 
500   // Fill the 's_core' array for fast mapping from a core coordinate to the
501   // hwloc cpuset object required for thread location querying and binding.
502 
503   for (unsigned i = 0; i < max_root; ++i) {
504     const unsigned root_rank = (i + root_base) % max_root;
505 
506     const hwloc_obj_t root =
507         hwloc_get_obj_by_type(s_hwloc_topology, root_type, root_rank);
508 
509     if (hwloc_bitmap_intersects(s_process_binding, root->cpuset)) {
510       const unsigned max_core = hwloc_get_nbobjs_inside_cpuset_by_type(
511           s_hwloc_topology, root->cpuset, HWLOC_OBJ_CORE);
512 
513       unsigned core_count = 0;
514 
515       for (unsigned j = 0; j < max_core && core_count < core_per_root; ++j) {
516         const hwloc_obj_t core = hwloc_get_obj_inside_cpuset_by_type(
517             s_hwloc_topology, root->cpuset, HWLOC_OBJ_CORE, j);
518 
519         if (hwloc_bitmap_intersects(s_process_binding, core->cpuset)) {
520           s_core[core_count + core_per_root * i] = core->cpuset;
521 
522           ++core_count;
523         }
524       }
525     }
526   }
527 
528   hwloc_bitmap_free(proc_cpuset_location);
529 
530   if (Kokkos::show_warnings() && !symmetric) {
531     std::cerr << "Kokkos::hwloc WARNING: Using a symmetric subset of a "
532                  "non-symmetric core topology."
533               << std::endl;
534   }
535 }
536 
537 }  // namespace
538 
539 //----------------------------------------------------------------------------
540 //----------------------------------------------------------------------------
541 
// This definition is compiled when KOKKOS_ENABLE_HWLOC is defined, so hwloc
// support is reported as present.
bool available() {
  return true;
}
543 
get_available_numa_count()544 unsigned get_available_numa_count() {
545   sentinel();
546   return s_core_topology.first;
547 }
548 
get_available_cores_per_numa()549 unsigned get_available_cores_per_numa() {
550   sentinel();
551   return s_core_topology.second;
552 }
553 
get_available_threads_per_core()554 unsigned get_available_threads_per_core() {
555   sentinel();
556   return s_core_capacity;
557 }
558 
can_bind_threads()559 bool can_bind_threads() {
560   sentinel();
561   return s_can_bind_threads;
562 }
563 
564 //----------------------------------------------------------------------------
565 //----------------------------------------------------------------------------
566 
bind_this_thread(const unsigned coordinate_count,std::pair<unsigned,unsigned> coordinate[])567 unsigned bind_this_thread(const unsigned coordinate_count,
568                           std::pair<unsigned, unsigned> coordinate[]) {
569   unsigned i = 0;
570 
571   try {
572     const std::pair<unsigned, unsigned> current = get_this_thread_coordinate();
573 
574     // Match one of the requests:
575     for (i = 0; i < coordinate_count && current != coordinate[i]; ++i)
576       ;
577 
578     if (coordinate_count == i) {
579       // Match the first request (typically NUMA):
580       for (i = 0; i < coordinate_count && current.first != coordinate[i].first;
581            ++i)
582         ;
583     }
584 
585     if (coordinate_count == i) {
586       // Match any unclaimed request:
587       for (i = 0; i < coordinate_count && ~0u == coordinate[i].first; ++i)
588         ;
589     }
590 
591     if (coordinate_count == i || !bind_this_thread(coordinate[i])) {
592       // Failed to bind:
593       i = ~0u;
594     }
595 
596     if (i < coordinate_count) {
597 #if DEBUG_PRINT
598       if (current != coordinate[i]) {
599         std::cout << "  bind_this_thread: rebinding from (" << current.first
600                   << "," << current.second << ") to (" << coordinate[i].first
601                   << "," << coordinate[i].second << ")" << std::endl;
602       }
603 #endif
604 
605       coordinate[i].first  = ~0u;
606       coordinate[i].second = ~0u;
607     }
608   } catch (...) {
609     i = ~0u;
610   }
611 
612   return i;
613 }
614 
bind_this_thread(const std::pair<unsigned,unsigned> coord)615 bool bind_this_thread(const std::pair<unsigned, unsigned> coord) {
616   if (!sentinel()) return false;
617 
618 #if DEBUG_PRINT
619 
620   std::cout << "Kokkos::bind_this_thread() at ";
621 
622   hwloc_get_last_cpu_location(s_hwloc_topology, s_hwloc_location,
623                               HWLOC_CPUBIND_THREAD);
624 
625   print_bitmap(std::cout, s_hwloc_location);
626 
627   std::cout << " to ";
628 
629   print_bitmap(std::cout,
630                s_core[coord.second + coord.first * s_core_topology.second]);
631 
632   std::cout << std::endl;
633 
634 #endif
635 
636   // As safe and fast as possible.
637   // Fast-lookup by caching the coordinate -> hwloc cpuset mapping in 's_core'.
638   return coord.first < s_core_topology.first &&
639          coord.second < s_core_topology.second &&
640          0 == hwloc_set_cpubind(
641                   s_hwloc_topology,
642                   s_core[coord.second + coord.first * s_core_topology.second],
643                   HWLOC_CPUBIND_THREAD | HWLOC_CPUBIND_STRICT);
644 }
645 
unbind_this_thread()646 bool unbind_this_thread() {
647   if (!sentinel()) return false;
648 
649 #define HWLOC_DEBUG_PRINT 0
650 
651 #if HWLOC_DEBUG_PRINT
652 
653   std::cout << "Kokkos::unbind_this_thread() from ";
654 
655   hwloc_get_cpubind(s_hwloc_topology, s_hwloc_location, HWLOC_CPUBIND_THREAD);
656 
657   print_bitmap(std::cout, s_hwloc_location);
658 
659 #endif
660 
661   const bool result =
662       s_hwloc_topology &&
663       0 == hwloc_set_cpubind(s_hwloc_topology, s_process_binding,
664                              HWLOC_CPUBIND_THREAD | HWLOC_CPUBIND_STRICT);
665 
666 #if HWLOC_DEBUG_PRINT
667 
668   std::cout << " to ";
669 
670   hwloc_get_cpubind(s_hwloc_topology, s_hwloc_location, HWLOC_CPUBIND_THREAD);
671 
672   print_bitmap(std::cout, s_hwloc_location);
673 
674   std::cout << std::endl;
675 
676 #endif
677 
678   return result;
679 
680 #undef HWLOC_DEBUG_PRINT
681 }
682 
683 //----------------------------------------------------------------------------
684 
get_this_thread_coordinate()685 std::pair<unsigned, unsigned> get_this_thread_coordinate() {
686   std::pair<unsigned, unsigned> coord(0u, 0u);
687 
688   if (!sentinel()) return coord;
689 
690   const unsigned n = s_core_topology.first * s_core_topology.second;
691 
692   // Using the pre-allocated 's_hwloc_location' to avoid memory
693   // allocation by this thread.  This call is NOT thread-safe.
694   hwloc_get_last_cpu_location(s_hwloc_topology, s_hwloc_location,
695                               HWLOC_CPUBIND_THREAD);
696 
697   unsigned i = 0;
698 
699   while (i < n && !hwloc_bitmap_intersects(s_hwloc_location, s_core[i])) ++i;
700 
701   if (i < n) {
702     coord.first  = i / s_core_topology.second;
703     coord.second = i % s_core_topology.second;
704   }
705 
706   return coord;
707 }
708 
709 //----------------------------------------------------------------------------
710 
711 } /* namespace hwloc */
712 } /* namespace Kokkos */
713 
714 //----------------------------------------------------------------------------
715 //----------------------------------------------------------------------------
716 
717 #else /* ! defined( KOKKOS_ENABLE_HWLOC ) */
718 
719 namespace Kokkos {
720 namespace hwloc {
721 
// Stub used when KOKKOS_ENABLE_HWLOC is not defined: hwloc is not available.
bool available() {
  return false;
}
// Stub used when KOKKOS_ENABLE_HWLOC is not defined: threads cannot be bound.
bool can_bind_threads() {
  return false;
}
724 
// Stub used when KOKKOS_ENABLE_HWLOC is not defined: reports one NUMA region.
unsigned get_available_numa_count() {
  return 1;
}
// Stub used when KOKKOS_ENABLE_HWLOC is not defined: reports one core per
// NUMA region.
unsigned get_available_cores_per_numa() {
  return 1;
}
// Stub used when KOKKOS_ENABLE_HWLOC is not defined: reports one thread per
// core.
unsigned get_available_threads_per_core() {
  return 1;
}
728 
// Stub used when KOKKOS_ENABLE_HWLOC is not defined: no request is ever
// claimed, so the "failed" index ~0u is returned and the array is untouched.
unsigned bind_this_thread(const unsigned /* coordinate_count */,
                          std::pair<unsigned, unsigned>[] /* coordinate */) {
  return ~0u;
}
732 
// Stub used when KOKKOS_ENABLE_HWLOC is not defined: binding always fails.
bool bind_this_thread(const std::pair<unsigned, unsigned> /* coord */) {
  return false;
}
734 
// Stub used when KOKKOS_ENABLE_HWLOC is not defined: there is no binding to
// undo, so success is reported.
bool unbind_this_thread() {
  return true;
}
736 
// Stub used when KOKKOS_ENABLE_HWLOC is not defined: every thread is reported
// at coordinate (0,0).
std::pair<unsigned, unsigned> get_this_thread_coordinate() {
  const std::pair<unsigned, unsigned> origin(0, 0);
  return origin;
}
740 
741 }  // namespace hwloc
742 }  // namespace Kokkos
743 
744 //----------------------------------------------------------------------------
745 //----------------------------------------------------------------------------
746 
747 #endif
748