1 #include "cado.h" // IWYU pragma: keep
2 // IWYU pragma: no_include <ext/alloc_traits.h>
3 // IWYU pragma: no_include <hwloc/bitmap.h>
4 // IWYU pragma: no_include "hwloc/bitmap.h"
5 
6 #include <errno.h>             // for EXDEV, errno
7 #include <inttypes.h>          // for PRIu64
8 #include <regex.h>             // for regmatch_t, regcomp, regexec, regfree
9 #include <sstream>      // IWYU pragma: keep
10 #include <stdint.h>            // for uint64_t
11 #include <stdio.h>             // for fprintf, stderr, size_t, fputs
12 #include <stdlib.h>            // for free, exit, EXIT_FAILURE, EXIT_SUCCESS
13 #include <strings.h>           // for strcasecmp
14 #include <mutex>               // for mutex, lock_guard
15 #include <string>              // for string, operator<<, char_traits, opera...
16 #include <tuple>               // for tie, get, make_tuple, tuple
17 #include <vector>              // for vector, vector<>::iterator
18 #ifdef HAVE_HWLOC
19 #include <hwloc.h>
20 #include "hwloc-aux.h"
21 #endif
22 #include "las-parallel.hpp"
23 
24 #include "misc.h"       // size_disp
25 #include "utils_cxx.hpp"        // call_dtor
26 #include "verbose.h"             // verbose_output_print
27 #include "macros.h"
28 #include "params.h"
29 
30 
31 
/* Binding description substituted for the "auto" alias by
 * helper::replace_aliases(). Once split by helper::parse(), it reads:
 * memory binding "node", cpu binding "fit*4", jobs "fit", threads "pu",
 * with the "loose" option. */
const char * default_placement_with_auto = "node,fit*4,fit,pu,loose";
33 
/* Parse s[pos..] as a non-negative decimal integer.
 *
 * Returns true and stores the value in x only if the tail of s starting
 * at pos is non-empty, made of decimal digits only, and fits in an int.
 * In all other cases (empty tail, pos beyond the end of s, stray
 * characters, overflow) the function returns false and leaves x
 * untouched.
 */
static bool parse_number(std::string const & s, int & x, std::string::size_type pos = 0) /*{{{*/
{
    const char * digits = "0123456789";
    /* We need at least one character to look at. This also protects
     * substr() below against pos > s.size(). */
    if (pos >= s.size())
        return false;
    if (s.find_first_not_of(digits, pos) != std::string::npos)
        return false;
    std::istringstream is(s.substr(pos));
    /* extract into a temporary, so that x is only modified on success.
     * An out-of-range value sets failbit on the stream. */
    int value;
    if (!(is >> value))
        return false;
    x = value;
    return true;
}/*}}}*/
43 
44 /* used to help with some of the job achieved by the las_parallel ctor.
45  * Otherwise no reason to expose within the public class, as it's of no
46  * real use (not even expository).
47  */
48 struct las_parallel_desc::helper {
49     std::string memory_binding_specifier_string;
50     std::string cpu_binding_specifier_string;
51     std::string jobs_within_cpu_binding_string;
52     std::string threads_per_job_string;
53 
54     /* The banner is always printed once at least. The full description
55      * only with -v */
56     std::string banner;
57     std::string full_diagnostics;
58 
59 #ifdef HAVE_HWLOC
60     hwloc_topology_t topology;
61 
62     std::string synthetic_topology_string;
63     int depth = 0;
64     std::vector<int> depth_per_level;
65 
66     /* for the "fit" keyword */
67     mutable int computed_min_pu_fit = -1;
68 
69     /* size() == number of memory binding zones */
70     std::vector<cxx_hwloc_nodeset> memory_binding_nodesets;
71 
72     /* size() == total number of subjobs */
73     std::vector<cxx_hwloc_cpuset> subjob_binding_cpusets;
74 
75     double total_ram_margin = 0;
76 
77     int memory_binding_size;
78     int cpu_binding_size;
79 #endif
80     int nsubjobs_per_cpu_binding_zone = 1;
81     int nthreads_per_subjob = 1;
82 
83     bool loose = false;
84     bool replicate = true;
85 
is_looselas_parallel_desc::helper86     bool is_loose() const { return loose; }
is_strictlas_parallel_desc::helper87     bool is_strict() const { return !loose; }
want_replicatelas_parallel_desc::helper88     bool want_replicate() const { return replicate; }
89     /*
90     bool has_option(std::string & const opt) {
91         return binding_options.find(opt) != binding_options.end();
92     }
93     */
94 
set_bannerlas_parallel_desc::helper95     void set_banner(std::string const & description_string) {
96         std::ostringstream os;
97 #ifdef HAVE_HWLOC
98         os  << "# Applying binding "
99             << description_string
100             << " on a machine with topology "
101             << synthetic_topology_string
102             << " (" << (total_ram() >> 30) << " GB RAM)\n",
103 #else
104             os  << "# Applying binding " << description_string << " with hwloc disabled\n",
105 #endif
106             banner += os.str();
107     }
108 #ifdef HAVE_HWLOC
modlas_parallel_desc::helper109     int mod(int&i) const {/*{{{*/
110         i = i % depth; if (i < 0) i += depth;
111         return i;
112     }/*}}}*/
number_atlas_parallel_desc::helper113     int number_at(int i) const {/*{{{*/
114         return hwloc_get_nbobjs_by_depth(topology, mod(i));
115     }/*}}}*/
number_oflas_parallel_desc::helper116     int number_of(int child_depth, int parent_depth) const/*{{{*/
117     {
118         mod(child_depth);
119         mod(parent_depth);
120         if (child_depth < parent_depth) return 0;
121         int n = 1;
122         for(int i = parent_depth ; i < child_depth ; ++i)
123             n *= depth_per_level[i+1];
124         return n;
125     }/*}}}*/
126 #endif
helperlas_parallel_desc::helper127     helper() {/*{{{*/
128         /* Here we need hwloc, of course */
129 #ifdef HAVE_HWLOC
130         hwloc_topology_init(&topology);
131         {/*{{{ load the topology `*/
132             unsigned long flags = 0;
133 #if HWLOC_API_VERSION >= 0x010700
134             flags = hwloc_topology_get_flags(topology);
135 #endif  /* HWLOC_API_VERSION >= 0x010700 */
136             /* we must make sure to remove these flags, but it's likely that
137              * they're off by default anyway */
138 #if HWLOC_API_VERSION < 0x020000
139             flags &= ~(HWLOC_TOPOLOGY_FLAG_IO_DEVICES | HWLOC_TOPOLOGY_FLAG_IO_BRIDGES);
140 #endif
141             hwloc_topology_set_flags(topology, flags);
142 
143             hwloc_topology_load(topology);
144 
145             hwloc_obj_t root = hwloc_get_root_obj(topology);
146             if (!root->symmetric_subtree) {
147                 fprintf(stderr, "# Topology is not symmetric,"
148                         " cannot proceed with replication"
149                         " of the las process with the current code."
150                         " No cpu/memory binding will be set.\n");
151                 /* simply stick to the default */
152                 return;
153             }
154         }/*}}}*/
155         depth = hwloc_topology_get_depth(topology);
156         depth_per_level.reserve(depth);
157         int n = 1;
158         std::ostringstream os;
159         for(int i = 0 ; i < depth ; i++) {
160             int x = hwloc_get_nbobjs_by_depth(topology, i);
161             depth_per_level.push_back(x / n);
162             n = x;
163         }
164         char buf[1024];
165         hwloc_topology_export_synthetic(topology, buf, sizeof(buf), HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_ATTRS );
166         synthetic_topology_string = buf;
167 
168         /* Form a sensible set of defaults, for one unbound thread */
169         memory_binding_size = number_of(-1,0);
170         cpu_binding_size = memory_binding_size;
171         compute_binding_bitmaps();
172 #endif
173     }/*}}}*/
174 
~helperlas_parallel_desc::helper175     ~helper() {/*{{{*/
176 #ifdef HAVE_HWLOC
177         hwloc_topology_destroy(topology);
178 #endif
179     }/*}}}*/
tokenizelas_parallel_desc::helper180     std::vector<std::string> tokenize(std::string const & s) const {/*{{{*/
181         using namespace std;
182         vector<string> tokens;
183         for(string::size_type x = 0, y; x != string::npos ; x = y) {
184             y = s.find(',',x);
185             if (y == string::npos) {
186                 tokens.push_back(s.substr(x));
187             } else {
188                 tokens.push_back(s.substr(x, y-x));
189                 y++;
190             }
191         }
192         return tokens;
193     }/*}}}*/
replace_aliaseslas_parallel_desc::helper194     void replace_aliases(std::string & desc) const {/*{{{*/
195         using namespace std;
196         int k;
197         if (parse_number(desc, k)) {
198             ostringstream os;
199             os << "machine,1," << k;
200             desc = os.str();
201             return;
202         }
203         if (desc == "single") { desc = "machine,1,pu"; return; }
204         if (desc == "auto") { desc = default_placement_with_auto; return; }
205         if (desc == "auto,no-replicate") { desc = default_placement_with_auto; desc += ",no-replicate"; return; }
206         if (desc.substr(0,7) == "single-") {
207             ostringstream os;
208             os << desc.substr(7) << ",1,pu";
209             desc = os.str();
210             return;
211         }
212     }/*}}}*/
parselas_parallel_desc::helper213     void parse(std::string & desc) { /* {{{ */
214         replace_aliases(desc);
215 
216         std::vector<std::string> tokens = tokenize(desc);
217 
218         bool loose_opt = false;
219         bool strict_opt = false;
220         bool norepl_opt = false;
221         for( ; tokens.size() >= 3 ; ) {
222             std::string const & s(tokens.back());
223             if (s == "strict") { strict_opt = true; }
224             else if (s == "loose") { loose_opt = true; }
225             else if (s == "no-replicate") { norepl_opt = true; }
226             else break;
227             tokens.erase(--tokens.end());
228         }
229         if (loose_opt && strict_opt)
230             throw bad_specification("loose and strict are incompatible");
231         if (loose_opt) loose = true;
232         if (strict_opt) loose = false;
233         if (norepl_opt) replicate = false;
234 
235         if (tokens.size() == 3) {
236             memory_binding_specifier_string = tokens[0];
237             jobs_within_cpu_binding_string = tokens[1];
238             threads_per_job_string = tokens[2];
239         } else if (tokens.size() == 4) {
240             memory_binding_specifier_string = tokens[0];
241             cpu_binding_specifier_string = tokens[1];
242             jobs_within_cpu_binding_string = tokens[2];
243             threads_per_job_string = tokens[3];
244         } else {
245             throw bad_specification("not the right number of tokens, or wrong options");
246         }
247     }/*}}}*/
248 #ifdef HAVE_HWLOC
total_ramlas_parallel_desc::helper249     uint64_t total_ram() const {/*{{{*/
250         hwloc_obj_t root = hwloc_get_root_obj(topology);
251         /* I'm not sure about how I want to get the info on the amount of
252          * available ram. hwloc-info, as such, does not seem to document
253          * a way (beyond the fact that this bit of info is output with
254          * "machine" and "Numanode" specifiers). There is an
255          * hwloc_obj_memory_s type in the hwloc C api, and that seems to
256          * be attached to hwloc_obj structures as well. But at any rate,
257          * this gives some notion of available memory. This is well and
258          * good, except that it might be misleading if a job that takes
259          * "almost" 4GB is scheduled to be placedonly 4 times on a 16GB
260          * machine when we specify --job-memory 4 (with some confidence
261          * that this is a convenient enough upper bound).
262          */
263 
264 #if HWLOC_API_VERSION < 0x020000
265         uint64_t ram = root->memory.total_memory;
266 #else
267         uint64_t ram = root->total_memory;
268 #endif
269         /* Round this up to at most 1/16-th. This will do rubbish on a
270          * machine where the 1 bits in the binary expansion of the
271          * hardware ram size spread more than 4 positions, but we find
272          * that unlikely.
273          */
274         for(uint64_t x = ram >> 4; x ; x >>= 1) ram |= x;
275         ram = ram + 1;
276         ram = ram - ((uint64_t) (total_ram_margin * (1<<30)));
277         return ram;
278     }/*}}}*/
enclosing_depthlas_parallel_desc::helper279    int enclosing_depth(int n) const {/*{{{*/
280        /* return deepest level k such that an object at depth k contains
281         * more than, or exactly N PUs.
282         */
283        int k;
284        int m = 1;
285        ASSERT_ALWAYS(n >= 1);
286        for(k = depth - 1; n > m && k >= 0 ; k--) {
287            ASSERT(number_of(-1, k) == m);
288            m *= depth_per_level[k];
289            ASSERT(k == 0 || number_of(-1, k-1) == m);
290        }
291 
292        if (k >= 0) {
293            ASSERT(number_of(-1, k) >= n);
294            if (k == depth - 1) {
295                ASSERT(n == 1);
296            } else {
297                ASSERT(number_of(-1, k + 1) < n);
298            }
299        }
300 
301        return k;
302    }/*}}}*/
flat_to_hierarchicallas_parallel_desc::helper303    std::tuple<int, int, int> flat_to_hierarchical(int n) const {/*{{{*/
304        int k = enclosing_depth(n);
305        /* a binding scope that has to meet the constraint of containing
306         * at least n PUs can do so with an object at depth
307         * enclosing_depth(n). However, a finer grain might be possible,
308         * and that may be an important optimization when we have many
309         * edges in the topology tree at this level: it is perhaps
310         * possible to divide the object at depth k into equal parts made
311         * of several objects at depth k+1, all parts meeting the
312         * constraint. We count how large these parts must be (counted in
313         * terms of number of objects at depth k+1, called "children"
314         * here.).
315         */
316        if (k == depth - 1) {
317            ASSERT(n == 1);
318            return std::make_tuple(k, 1, 1);
319        }
320        int child_depth = k + 1;
321        int child_size = number_of(-1, k + 1);
322        int part_size = iceildiv(n, child_size);
323        int v = number_of(k+1, k);
324        ASSERT(part_size <= v);
325        if (replicate)
326            for( ; v % part_size ; part_size++);
327        if (part_size == v) {
328            child_depth--;
329            child_size *= v;
330            part_size = 1;
331        }
332        return std::make_tuple(child_depth, child_size, part_size);
333    }/*}}}*/
acceptable_bindinglas_parallel_desc::helper334    int acceptable_binding(int n) const {/*{{{*/
335        auto x = flat_to_hierarchical(n);
336        return std::get<1>(x) * std::get<2>(x);
337    }/*}}}*/
textual_description_for_bindinglas_parallel_desc::helper338    std::string textual_description_for_binding(int n) const {/*{{{*/
339        int k, child_size, part_size;
340        ASSERT(n <= number_of(-1, 0));
341        std::tie(k, child_size, part_size) = flat_to_hierarchical(n);
342        for( ; k < depth - 1 && number_of(k+1, k) == 1 ; k++);
343        hwloc_obj_t obj = hwloc_get_obj_by_depth(topology, k, 0);
344        char s[256];
345        hwloc_obj_type_snprintf(s, sizeof(s), obj, 1);
346        if (part_size == 1) {
347            return std::string(s);
348        } else {
349            std::ostringstream os;
350            os << std::string(s) << ":0-" << part_size - 1;
351            return os.str();
352        }
353    }/*}}}*/
all_textual_descriptions_for_bindinglas_parallel_desc::helper354    std::vector<std::string> all_textual_descriptions_for_binding(int n) const {/*{{{*/
355        /* n is a binding granularity. Return all the subsets of the
356         * machine, all identical, that have some hardware relevance (=
357         * correspond to an integer fraction of the subtree at some depth,
358         * and made of entire subtrees at the level below), and all
359         * contain at least n PUs.
360         */
361        int k, child_size, part_size;
362        ASSERT(n <= number_of(-1, 0));
363        std::tie(k, child_size, part_size) = flat_to_hierarchical(n);
364        for( ; k < depth - 1 && number_of(k+1, k) == 1 ; k++);
365        std::vector<std::string> res;
366        int nk = number_of(k, 0);
367        for(int i = 0 ; i < nk ; i+=part_size) {
368            char s[256];
369            hwloc_obj_t obj = hwloc_get_obj_by_depth(topology, k, i);
370            hwloc_obj_type_snprintf(s, sizeof(s), obj, 0);
371            std::ostringstream os;
372            int a = obj->logical_index;
373            int b = a + part_size - 1;
374            if (part_size == 1) {
375                os << s << ":" << a;
376            } else {
377                os << s << ":" << a << '-' << b;
378            }
379            res.push_back(os.str());
380        }
381        return res;
382    }/*}}}*/
all_cpu_bitmapslas_parallel_desc::helper383     std::vector<cxx_hwloc_cpuset> all_cpu_bitmaps(int width, int multiply = 1)/*{{{*/
384     {
385         int n = width;
386         int k, child_size, part_size;
387         ASSERT(n <= number_of(-1, 0));
388         std::tie(k, child_size, part_size) = flat_to_hierarchical(n);
389         std::vector<cxx_hwloc_cpuset> res;
390         int nk = number_of(k, 0);
391         for(int i = 0 ; i < nk ; i+=part_size) {
392             cxx_hwloc_cpuset c;
393             for(int j = 0 ; j < part_size ; j++)
394                 c = c | hwloc_get_obj_by_depth(topology, k, i + j)->cpuset;
395             for(int k = 0 ; k < multiply ; k++)
396                 res.push_back(c);
397         }
398         return res;
399     }/*}}}*/
all_mem_bitmapslas_parallel_desc::helper400     std::vector<cxx_hwloc_nodeset> all_mem_bitmaps(int width, int multiply = 1)/*{{{*/
401     {
402         int n = width;
403         int k, child_size, part_size;
404         ASSERT(n <= number_of(-1, 0));
405         std::tie(k, child_size, part_size) = flat_to_hierarchical(n);
406         std::vector<cxx_hwloc_cpuset> res;
407         int nk = number_of(k, 0);
408         for(int i = 0 ; i < nk ; i+=part_size) {
409             cxx_hwloc_nodeset c;
410             for(int j = 0 ; j < part_size ; j++)
411                 c = c | hwloc_get_obj_by_depth(topology, k, i + j)->nodeset;
412             for(int k = 0 ; k < multiply ; k++)
413                 res.push_back(c);
414         }
415         return res;
416     }/*}}}*/
min_pu_fitlas_parallel_desc::helper417    int min_pu_fit(double jobram) {/*{{{*/
418        if (computed_min_pu_fit >= 0) return computed_min_pu_fit;
419        /* This computes the minimal number n of PUs such that
420         * floor(total_PUs/n) jobs, each using jobram GB, fit on the
421         * machine and do not exceed the total amount of __physical__ RAM.
422         *
423         * hwloc does give us a useful bit of information regarding
424         * available, and not physical ram. We strive to get the physical
425         * ram because that amount is generally better known to the user,
426         * and one expects the automatic placement logic to find that
427         * *four* jobs of say 12GB would fit on a 48GB machine.
428         *
429         * Note that this integer is not yet of interest to the
430         * hierarchical topology.
431         */
432        /* how many jobs can fit ? */
433        int how_many = total_ram() / (jobram * (1<<30));
434        if (how_many == 0) {
435            fprintf(stderr, "This machine does not have enough memory"
436                    " (%" PRIu64 " GB) to fit jobs that need %.2f GB\n",
437                    total_ram() >> 30, jobram);
438            exit(EXIT_FAILURE);
439        }
440        /* This means subjobs can't be smaller than this: */
441        computed_min_pu_fit = iceildiv(number_of(-1, 0), how_many);
442        std::string s = textual_description_for_binding(computed_min_pu_fit);
443        std::ostringstream os;
444        os  << "# Given our estimate of "
445            << size_disp(jobram * (1<<30)) << " RAM per subjob,"
446            << " the \"fit\" binding\n"
447            << "# corresponds to a minimum size of "
448            << computed_min_pu_fit << " covered PUs in the hwloc hierarchy,\n"
449            << "# i.e. no less than \"" << s << "\"\n";
450        full_diagnostics += os.str();
451        return computed_min_pu_fit;
452    }/*}}}*/
compute_binding_bitmapslas_parallel_desc::helper453     void compute_binding_bitmaps() {/*{{{*/
454         subjob_binding_cpusets = all_cpu_bitmaps(cpu_binding_size, nsubjobs_per_cpu_binding_zone);
455         memory_binding_nodesets = all_mem_bitmaps(memory_binding_size);
456     }/*}}}*/
current_memory_bindinglas_parallel_desc::helper457     cxx_hwloc_nodeset current_memory_binding() const {/*{{{*/
458         cxx_hwloc_nodeset nn;
459         hwloc_membind_policy_t pol;
460 #if HWLOC_API_VERSION < 0x010b03
461         /* this legacy called remained valid throughout hwloc 1.x */
462         int rc = hwloc_get_membind_nodeset(topology,  nn, &pol,
463                 HWLOC_MEMBIND_THREAD);
464 #else
465         /* newer call */
466         int rc = hwloc_get_membind(topology,  nn, &pol,
467                 HWLOC_MEMBIND_THREAD | HWLOC_MEMBIND_BYNODESET);
468 #endif
469         if (rc < 0) {
470             static std::mutex mm;
471             std::lock_guard<std::mutex> dummy(mm);
472             static int got_message = 0;
473             if (!got_message++)
474                 fprintf(stderr, "Error while attempting to get memory binding\n");
475             hwloc_bitmap_zero(nn);
476         }
477         return nn;
478     }/*}}}*/
479 #endif
interpret_memory_binding_specifierlas_parallel_desc::helper480    int interpret_memory_binding_specifier() {
481 #ifdef HAVE_HWLOC
482        if (depth) {
483            memory_binding_size = interpret_generic_binding_specifier(memory_binding_specifier_string);
484            std::ostringstream os;
485            os
486                << "# Memory binding specifier: request: "
487                << memory_binding_specifier_string
488                << " ; resolves to: "
489                << textual_description_for_binding(memory_binding_size)
490                << "\n";
491            full_diagnostics += os.str();
492            return memory_binding_size;
493        } else
494 #endif
495        {
496            /* just check for errors */
497            return interpret_generic_binding_specifier(memory_binding_specifier_string);
498        }
499    }
interpret_cpu_binding_specifierlas_parallel_desc::helper500    int interpret_cpu_binding_specifier() {
501 #ifdef HAVE_HWLOC
502        if (depth) {
503            if (cpu_binding_specifier_string.empty()) {
504                cpu_binding_size = memory_binding_size;
505            } else {
506                /* cap to memory binding size */
507                cpu_binding_size = interpret_generic_binding_specifier(cpu_binding_specifier_string, memory_binding_size);
508                if (memory_binding_size % cpu_binding_size)
509                    throw bad_specification(
510                            "\n",
511                            banner,
512                            full_diagnostics,
513                            "# ERROR: cpu binding "
514                            , cpu_binding_specifier_string,
515                            " yields a size of ", cpu_binding_size, " PUs,"
516                            " but this must be an integer divisor of",
517                            " the memory binding size of ", memory_binding_size,
518                            " PUs, which follows from the specifier ",
519                            memory_binding_specifier_string);
520            }
521            std::ostringstream os;
522            os
523                << "# Cpu binding specifier: request: "
524                << cpu_binding_specifier_string
525                << " ; resolves to: "
526                << textual_description_for_binding(cpu_binding_size)
527                << "\n";
528            full_diagnostics += os.str();
529            return cpu_binding_size;
530        } else
531 #endif
532        {
533            /* just check for errors */
534            if (cpu_binding_specifier_string.empty()) {
535                return 0;
536            } else {
537                return interpret_generic_binding_specifier(cpu_binding_specifier_string);
538            }
539        }
540    }
interpret_generic_binding_specifierlas_parallel_desc::helper541    int interpret_generic_binding_specifier(std::string const & specifier, int cap MAYBE_UNUSED = -1) {/*{{{*/
542 #ifdef HAVE_HWLOC
543        if (!depth) {
544            if (strcasecmp(specifier.c_str(), "machine") != 0)
545                throw bad_specification("hwloc detected asymmetric topology, the only accepted memory binding specifier is \"machine\"");
546            return 0;
547        }
548        int binding_size;
549        /* and apply our different calculation rules to the provided
550         * string (either memory_binding_specifier_string or
551         * cpu_binding_specifier_string if there is one). */
552        if (parse_number(specifier, binding_size))
553            return binding_size;
554        const char * binding_specifier_regexp =
555            "^("                         // 1 : non-limited part.
556                "([^*/[:space:]]*)"      // 2: object, or "fit"
557                "(\\*([0-9]+))?"          // 3,4: coarsening
558                "(/([0-9]+))?"           // 5,6: restricting
559            ")"
560            "(/([^[:space:]]*))?"        // 7,8: limiting
561            "$";
562        regex_t R;
563        regcomp(&R, binding_specifier_regexp, REG_ICASE|REG_EXTENDED);
564        auto dummy = call_dtor([&](){regfree(&R);});
565        regmatch_t m[9];
566        int r = regexec(&R, specifier.c_str(), 9, m, 0);
567        if (r != 0)
568            throw bad_specification("binding specifier ",
569                    specifier, " is invalid");
570        int objsize;
571        auto sub = [&](int i, bool want = false) {
572            std::string res;
573            if (want) ASSERT_ALWAYS(m[i].rm_so >= 0);
574            if (m[i].rm_so >= 0)
575                res = specifier.substr(m[i].rm_so, m[i].rm_eo - m[i].rm_so);
576            return res;
577        };
578        std::string base_object = sub(2, true);
579        bool is_fit = strcasecmp(base_object.c_str(), "fit") == 0;
580        if (is_fit) {
581            if (computed_min_pu_fit < 0) throw needs_job_ram();
582            objsize = computed_min_pu_fit;
583        } else {
584            int argdepth = hwloc_aux_get_depth_from_string(topology, base_object.c_str());
585 #if HWLOC_API_VERSION >= 0x020000
586            if (argdepth == HWLOC_TYPE_DEPTH_NUMANODE) {
587                argdepth = hwloc_get_memory_parents_depth(topology);
588            }
589 #endif
590            if (argdepth < 0)
591                throw bad_specification(base_object, " is invalid");
592            objsize = number_of(-1, argdepth);
593        }
594        std::string multiplier_string = sub(4);
595        if (!multiplier_string.empty()) {
596            int x;
597            bool t = parse_number(multiplier_string, x);
598            ASSERT_ALWAYS(t);
599            objsize *= x;
600        }
601        if (cap < 0) cap = number_of(-1, 0);
602        if (objsize > cap)
603            objsize = cap;
604        if (is_fit)
605            objsize = acceptable_binding(objsize);
606        std::string divisor_string = sub(6);
607        if (!divisor_string.empty()) {
608            int x;
609            bool t = parse_number(divisor_string, x);
610            ASSERT_ALWAYS(t);
611            if (!x || (objsize % x)) {
612                std::ostringstream os;
613                os << "specifier binding specifier is invalid. Cannot divide "
614                    << base_object << sub(3) << " (=" << objsize << ") into "
615                    << x << " parts";
616                if (is_strict()) {
617                    throw bad_specification(os.str());
618                } else {
619                    fprintf(stderr, "ERROR: %s\n", os.str().c_str());
620                }
621            }
622            if (x) objsize /= x;
623        }
624        std::string limiting_string = sub(8);
625        if (!limiting_string.empty()) {
626            int argdepth = hwloc_aux_get_depth_from_string(topology, limiting_string.c_str());
627            if (argdepth < 0)
628                throw bad_specification(limiting_string, " is invalid");
629            int compare = number_of(-1, argdepth);
630            if (objsize > compare) {
631                std::ostringstream os;
632                os << "automated binding "
633                    << textual_description_for_binding(objsize)
634                    << " deduced from \"" << sub(2) << sub(4) << sub(6) << "\""
635                    << " (= " << objsize << ")"
636                    << " escapes the advised binding given by " << sub(7)
637                    << " (which is "
638                    << textual_description_for_binding(compare)
639                    << ")";
640                if (is_strict()) {
641                    throw bad_specification(os.str());
642                } else {
643                    fprintf(stderr, "Warning: %s\n", os.str().c_str());
644                }
645            }
646        }
647        return objsize;
648 #else
649        if (strcasecmp(specifier.c_str(), "machine") != 0)
650            throw bad_specification("hwloc being disabled, the only accepted binding specifier is \"machine\"");
651        return 0;
652 #endif
653    }/*}}}*/
interpret_njobs_specifierlas_parallel_desc::helper654    int interpret_njobs_specifier() {/*{{{*/
655        if (parse_number(jobs_within_cpu_binding_string, nsubjobs_per_cpu_binding_zone))
656            return nsubjobs_per_cpu_binding_zone;
657 #ifdef HAVE_HWLOC
658        int objsize;
659        if (strcasecmp(jobs_within_cpu_binding_string.c_str(), "fit") == 0) {
660            if (computed_min_pu_fit < 0) throw needs_job_ram();
661            objsize = acceptable_binding(computed_min_pu_fit);
662        } else if (jobs_within_cpu_binding_string.find_first_of("*/") != std::string::npos) {
663            throw bad_specification("Only binding specifiers allow multipliers");
664        } else {
665            int argdepth = hwloc_aux_get_depth_from_string(topology, jobs_within_cpu_binding_string.c_str());
666            if (argdepth < 0)
667                throw bad_specification(jobs_within_cpu_binding_string, " is invalid");
668            objsize = number_of(-1, argdepth);
669        }
670        if (cpu_binding_size % objsize)
671            throw bad_specification(
672                    "\n",
673                    banner,
674                    full_diagnostics,
675                    "ERROR: cannot place jobs according to",
676                    " the ", jobs_within_cpu_binding_string, " rule",
677                    " (which resolves to: ",
678                    textual_description_for_binding(objsize), "),"
679                    " as there is not an integer number of these",
680                    " in the imposed binding ",
681                    textual_description_for_binding(cpu_binding_size),
682                    " [HINT: are you using -t auto and running out of RAM on NUMA nodes?]");
683        nsubjobs_per_cpu_binding_zone = cpu_binding_size / objsize;
684        std::ostringstream os;
685        os
686            << "# Njobs specifier: request: "
687            << jobs_within_cpu_binding_string
688            << " ; resolves to: "
689            << textual_description_for_binding(objsize)
690            << " (" << nsubjobs_per_cpu_binding_zone << " jobs)"
691            << "\n";
692        full_diagnostics += os.str();
693        return nsubjobs_per_cpu_binding_zone;
694 #else
695        throw bad_specification("hwloc being disabled, the only accepted specifiers for the number of jobs are integers");
696 #endif
697    }/*}}}*/
interpret_nthreads_specifierlas_parallel_desc::helper698    int interpret_nthreads_specifier() {/*{{{*/
699        if (parse_number(threads_per_job_string, nthreads_per_subjob))
700            return nthreads_per_subjob;
701 #ifdef HAVE_HWLOC
702        int objsize;
703        int argdepth = hwloc_aux_get_depth_from_string(topology, threads_per_job_string.c_str());
704        if (argdepth < 0)
705            throw bad_specification(threads_per_job_string, " is invalid");
706        objsize = number_of(-1, argdepth);
707        int loose_per_job_scale = cpu_binding_size / nsubjobs_per_cpu_binding_zone;
708        if (loose_per_job_scale % objsize)
709            throw bad_specification("cannot place threads according to",
710                    " the ", threads_per_job_string, " rule,",
711                    " as there is not an integer number of these",
712                    " in the per-job fraction (1/", nsubjobs_per_cpu_binding_zone, ")",
713                    " of the binding ",
714                    textual_description_for_binding(cpu_binding_size));
715        nthreads_per_subjob = loose_per_job_scale / objsize;
716        std::ostringstream os;
717        os
718            << "# Nthreads specifier: request: "
719            << threads_per_job_string
720            << " ; resolves to: "
721            << textual_description_for_binding(objsize)
722            << " (" << nthreads_per_subjob << " threads)"
723            << "\n";
724        full_diagnostics += os.str();
725        return nthreads_per_subjob;
726 #else
727        throw bad_specification("hwloc being disabled, the only accepted specifiers for the number of threads are integers");
728 #endif
729    }/*}}}*/
730 };
731 
extended_usage()732 static void extended_usage()/*{{{*/
733 {
734     std::ostringstream os;
735     os << R"(
736 Documentation for the --job-binding-policy (-t) option.
737 =======================================================
738 
739 Option takes either an alias, or a longer specificier.  Most of the
740 functionality depends on the hwloc library being usable. Examples are
741 provided after the syntax description
742 
743 
744 Aliases, first:
745 ===============
746 
747     [an integer N]: equivalent to Machine,1,N
748         (run one N-threaded job, not bound to anything).
749     single: equivalent to single-machine
750     single-XXX: equivalent to XXX,1,PU
751 )"
752     << "    auto: equivalent to " << default_placement_with_auto << " [see below]\n"
753     << "    auto,no-replicate: equivalent to " << default_placement_with_auto << ",no-replicate [see below]\n"
754     << R"(
755 Syntax of the specifiers
756 ========================
757 
758 [memory binding level],[cpu binding level],[number of jobs with this binding],[threads per job][,modifiers]
759 
760 The cpu binding level may be omitted, and defaults to the same value as
761 the machine binding level.
762 
763 All three (or four) main items may be set as integers. However this is
764 probably not the most useful, and certainly not the most portable way to
765 set them.
766 
767 Binding level:
768 --------------
769 
770 Both the memory binding level and the cpu binding level follow the exact
771 same syntax.
772 
773 First observe the output ow hwloc-info (may vary depending on the
774 machine).
775         localhost ~ $ hwloc-info --no-io
776         depth 0:	1 Machine (type #1)
777          depth 1:	2 NUMANode (type #2)
778           depth 2:	2 Package (type #3)
779            depth 3:	2 L3Cache (type #4)
780             depth 4:	16 L2Cache (type #4)
781              depth 5:	16 L1Cache (type #4)
782               depth 6:	16 Core (type #5)
783                depth 7:	32 PU (type #6)
784 This means 2 NUMANode per machine, 8 Core per NUMANode, 2 PU per
785 Core.
786 
787 The syntax of the binding level is
788     [integer]
789     OR [hwloc object name](\*[integer]|/[integer])?
790     OR fit(\*[integer])?(/[hwloc object name])?
791 
792 Here's how it goes to specify the binding level as an integer. As
793 mentioned, this is neither easy nor portable, look further for the
794 recommended way to set via level aliases.  An integer N defines a binding
795 level which corresponds to N of the innermost instances (here, PU is the
796 innermost object.  It is probably always so. The text below says PUs, but
797 the code means the innermost object anyway). Let k be the depth
798 such that an object at depth k contains strictly more than N PUs, while
799 an object at depth k+1 contains less than, or exactly N PUs. For example,
800 above, for N=8 we have k=3 (at depth k+1 a single L2Cache is above less
801 than 8 PUs, but at level k=3, below one L3Cache object we have 16 PUs,
802 which is more than 8). When specifying an integer N, it must be a
803 multiple of the number of PUs below an object at level k+1 (here, 8 is
804 indeed a multiple of 2, which is the number of PUs below an L2Cache), as
805 well as a divisor of the number of PUs below an object at level k (here,
806 8 divides 16). The latter requirement is waived if the "no-replicate"
807 modifier is present.
808 
809 We bind jobs to hwloc objects at a particular depth by writing simply the
810 object name (e.g. NUMANode) as a binding level. Names are never
811 case-sensitive, and match in the same way hwloc tools such as hwloc-calc
812 recognizes them. In the example above, this is equivalent to the integer
813 16, but this varies. When specifying an object name, it is possible to
814 append "*N" or "/N" to loosen or restrict the binding level, provided the
815 resulting integer satisfies the constraints above.
816 
817 The "fit" keyword is special. It corresponds to the minimum number of PUs
818 that must be grouped together so that if the machine is filled with
819 identical jobs, the memory available on the machine suffices. This relies
820 on an estimation of the amount of required memory, of course (see also
821 --job-memory). It is possible to write fit*N, which loosens this binding
822 (always subject to the same requirement). When appending /XXX, where XXX
823 denotes an object name, this ensures that the corresponding binding does
824 not escape the size of one object of that name. See also the "loose" and
825 "strict" modifiers.
826 
827 Number of jobs with same binding
828 --------------------------------
829 
830 This is an integer, too. We may also use hwloc specifiers as shortcuts.
831 However the interpretation is reversed compared to the binding level.
832 Here, "Core" indicates a number which is the number of "Core" objects
833 within the binding specification.
834 
835 The "fit" keyword echoes the functionality of the "fit" binding
836 specifier. Again, it means that within the binding specifier, we put
837 exactly the number of jobs so that if this is replicated over the whole
838 machine, the memory fits.
839 
840 Naturally, when using an object name (or "fit"), it is mandatory that the
841 binding specifier contains a positive integer number of objects of that
842 name.
843 
844 Number of threads per job
845 -------------------------
846 
847 Again, an integer. As with the number of jobs, aliases are understood
848 relative to the job size (which may occupy only a fraction of the binding
849 level, but always an integer fraction anyway).
850 
851 The two most useful settings are "PU" and "Core", which are equivalent to
852 using or not using hyperthreading.
853 
854 Modifiers
855 ---------
856 
857 'loose' / 'strict' (default: 'loose'). If a binding specifier of the form
858 fit*N/XXX or fit/XXX is used, warn (with 'loose') or error out (with
859 'strict') if the calculated binding is coarser than XXX.
860 
861 'no-replicate' : by default, we may put several jobs with the same
862 binding, and the set of jobs that are put in a single binding is
863 replicated over the whole machine (as many times as this binding fits in
864 the machine). With the "no-replicate" option, this replication is
865 disabled (however we still have the same number of sub-jobs with one
866 binding).
867 
868 Examples
869 ========
870 
871     -t 8    -> resolves to machine,1,8 = run only one job, with no
872             particular binding. There is only one job, because there is
873             only one "machine" per... machine, so whether or not we use
874             the "no-replicate" modifier makes no difference.
875 
876     -t numa,fit,pu,strict
877             -> binds jobs at the numa level. Set as many jobs as can fit,
878             and then run each on exactly the number of pus (hyperthreaded
879             cores). Abort if it is not possible to fit on one numa node.
880 
881     -t numa,core*2,fit,pu
882             -> memory-bind at the numa level, and do cpu-bind on pairs of
883             cores. Stick as many different jobs as can fit in each cpu
884             binding context, and have each of these jobs use as many
885             threads as the number of pus we have (per sub-job in the cpu
886             binding context of two cores).
887 
888     -t numa,fit,20
889             -> same, but force 20-thread jobs (no matter what).
890 
891     -t auto -> resolves to )" << default_placement_with_auto << R"( = bind fractions of NUMA
892             nodes, and put as many jobs in there as can fit. Then have
893             each run the number of threads so that all PUs are busy.
894 
895     -t package/2,core,pu
896             -> binds to half packages, run 1 job per core, and have each
897             job use as many threads as we have PUs per core.
898 )";
899     fputs(os.str().c_str(), stderr);
900 }/*}}}*/
901 
las_parallel_desc()902 las_parallel_desc::las_parallel_desc()
903     : help (std::make_shared<helper>())
904 {}
905 
las_parallel_desc(cxx_param_list & pl,double jobram_arg)906 las_parallel_desc::las_parallel_desc(cxx_param_list & pl, double jobram_arg)
907     : las_parallel_desc()
908 {
909     jobram = jobram_arg;
910     const char * desc_c0 = param_list_lookup_string(pl, "t");
911 
912     if (!desc_c0) {
913         verbose_output_start_batch();
914         verbose_output_print(0, 1, "# No -t option found, running single-threaded.");
915 #ifdef HAVE_HWLOC
916         verbose_output_print(0, 1, " See also -t help");
917 #endif
918         verbose_output_print(0, 1, "\n");
919         verbose_output_end_batch();
920         /* feed a constant string */
921         description_string = "1";
922     } else {
923         description_string = desc_c0;
924     }
925 
926     if (description_string == "help") {/*{{{*/
927         extended_usage();
928         exit(EXIT_SUCCESS);
929     }/*}}}*/
930 
931     help->replace_aliases(description_string);
932     help->set_banner(description_string);
933 
934 #ifdef HAVE_HWLOC
935     param_list_parse_double(pl, "memory-margin", &help->total_ram_margin);
936     /* The --job-memory argument will **ALWAYS** win */
937     param_list_parse_double(pl, "job-memory", &jobram);
938     if (jobram != -1)
939         help->min_pu_fit(jobram);
940 #endif
941 
942     try {
943         help->parse(description_string);
944     } catch (bad_specification & b) {
945         throw bad_specification("Cannot interpret specification -t ",
946                 description_string,
947                 ": ", b.what(), "."
948                 " See -t help for extended documentation");
949     }
950     help->interpret_memory_binding_specifier();
951     help->interpret_cpu_binding_specifier();
952     help->interpret_njobs_specifier();
953     help->interpret_nthreads_specifier();
954 
955     nsubjobs_per_cpu_binding_zone = help->nsubjobs_per_cpu_binding_zone;
956     nthreads_per_subjob = help->nthreads_per_subjob;
957 
958 #ifdef HAVE_HWLOC
959     if (!help->depth) return;
960     memory_binding_size = help->memory_binding_size;
961     nmemory_binding_zones = help->number_of(-1, 0) / memory_binding_size;
962     if (!help->replicate)
963         nmemory_binding_zones = 1;
964     cpu_binding_size = help->cpu_binding_size;
965     ncpu_binding_zones_per_memory_binding_zone = memory_binding_size / cpu_binding_size;
966 
967     /* Need to populate the arrays subjob_binding_cpusets and
968      * memory_binding_nodesets. */
969     help->compute_binding_bitmaps();
970 
971 #if 0
972     const struct hwloc_topology_support* s = hwloc_topology_get_support(help->topology);
973     /* see /usr/share/doc/libhwloc-doc/html/a00234.html */
974     /* just examples of flags we may be interested in */
975     ASSERT_ALWAYS(s->membind->get_area_membind);
976 #endif
977 
978 #endif
979 }
980 
display_binding_info() const981 void las_parallel_desc::display_binding_info() const /*{{{*/
982 {
983     verbose_output_start_batch();
984     verbose_output_print(0, 1, "%s", help->banner.c_str());
985     verbose_output_print(0, 2, "%s", help->full_diagnostics.c_str());
986 #ifdef HAVE_HWLOC
987     verbose_output_print(0, 1, "# %d memory binding zones\n", nmemory_binding_zones);
988     verbose_output_print(0, 1, "# %d cpu binding zones within each memory binding\n", ncpu_binding_zones_per_memory_binding_zone);
989     verbose_output_print(0, 1, "# %d jobs within each binding context (hence %d in total)\n",
990             number_of_subjobs_per_cpu_binding_zone(),
991             number_of_subjobs_total());
992     verbose_output_print(0, 1, "# %d threads per job\n", nthreads_per_subjob);
993     if (jobram >= 0) {
994         double all = jobram * number_of_subjobs_total();
995         int physical = help->total_ram() >> 30;
996         double ratio = 100 * all / physical;
997 
998         verbose_output_print(0, 1, "# Based on an estimate of %.2f GB per job, we use %.2f GB in total, i.e. %.1f%% of %d GB\n",
999                 jobram, all, ratio, physical);
1000     }
1001 #else
1002     verbose_output_print(0, 1, "# %d jobs in parallel\n", number_of_subjobs_total());
1003     verbose_output_print(0, 1, "# %d threads per job\n", nthreads_per_subjob);
1004 #endif
1005 
1006 
1007 #ifdef HAVE_HWLOC
1008     if (help->depth) {
1009         std::vector<std::string> tm = help->all_textual_descriptions_for_binding(memory_binding_size);
1010         std::vector<std::string> tc = help->all_textual_descriptions_for_binding(cpu_binding_size);
1011         std::vector<std::string> tj = help->all_textual_descriptions_for_binding(cpu_binding_size / number_of_subjobs_per_cpu_binding_zone());
1012         size_t m = 0, c = 0;
1013         {
1014             std::ostringstream pu_app;
1015             pu_app << "[" << memory_binding_size << " PUs]";
1016             for(auto & x : tm) { x += " "; x += pu_app.str(); if (x.size() > m) m = x.size(); }
1017         }
1018         {
1019             std::ostringstream pu_app;
1020             pu_app << "[" << cpu_binding_size << " PUs]";
1021             for(auto & x : tc) { x += " "; x += pu_app.str(); if (x.size() > c) c = x.size(); }
1022         }
1023         size_t qc = number_of_subjobs_per_cpu_binding_zone();
1024         size_t qm = qc * ncpu_binding_zones_per_memory_binding_zone;
1025         for(size_t i = 0 ; i < tj.size() ; i++) {
1026             if (!help->replicate && i >= qc) break;
1027             ASSERT_ALWAYS(i < help->subjob_binding_cpusets.size());
1028             char * sc;
1029             hwloc_bitmap_asprintf(&sc, help->subjob_binding_cpusets[i]);
1030             ASSERT_ALWAYS(i/qm < help->memory_binding_nodesets.size());
1031             char * sm;
1032             hwloc_bitmap_asprintf(&sm, help->memory_binding_nodesets[i/qm]);
1033 
1034             verbose_output_print(0, 2, "# %-*s %-*s %s [%d thread(s)] [ m:%s c:%s ]\n",
1035                     (int) m, (i % qm) ? "" : tm[i / qm].c_str(),
1036                     (int) c, (i % qc) ? "" : tc[i / qc].c_str(),
1037                     "job", // tj[i].c_str(),
1038                     nthreads_per_subjob,
1039                     sm, sc
1040                     );
1041             free(sm);
1042             free(sc);
1043         }
1044     }
1045 #endif
1046     verbose_output_end_batch();
1047 }/*}}}*/
1048 
1049 
#if 0
/* Disabled sketch, never compiled: a query function that would tell
 * which memory binding zone an address range belongs to. Only the
 * intended return convention is written down; the body unconditionally
 * returns -2. Note that the ';' which follows the parameter list below
 * would have to be removed before this could be enabled.
 */
int las_parallel_desc::query_memory_binding(void * addr, size_t len);
{
    /* tentative specification. This is most probably a debug function.
     *
     * return an integer k for something that matches precisely the k-th
     * memory binding zone that we have defined.
     *
     * return -1 for something that isn't bound.
     *
     * return -2 if none of the above.
     */
    /* base this on one of the following hwloc api calls ?

    int hwloc_get_area_membind_nodeset (hwloc_topology_t topology,
            const void *addr, size_t len,
            hwloc_nodeset_t nodeset,
            hwloc_membind_policy_t *policy,
            int flags);
    int hwloc_get_area_membind (hwloc_topology_t topology,
            const void *addr, size_t len,
            hwloc_bitmap_t set,
            hwloc_membind_policy_t *policy,
            int flags);
    int hwloc_get_area_memlocation (hwloc_topology_t topology,
            const void *addr, size_t len,
            hwloc_bitmap_t set,
            int flags);

     */
    return -2;
}
#endif
1083 
set_loose_binding() const1084 int las_parallel_desc::set_loose_binding() const
1085 {
1086 #ifdef HAVE_HWLOC
1087     if (help->depth == 0)
1088         return 0;
1089     hwloc_obj_t root = hwloc_get_root_obj(help->topology);
1090     hwloc_nodeset_t n = root->nodeset;
1091     hwloc_cpuset_t c = root->cpuset;
1092     /* This achieves a **global** binding. It's not entirely clear we
1093      * ever want this in the normal course of an execution of las, in
1094      * fact.
1095      */
1096     int rc;
1097 #if HWLOC_API_VERSION < 0x010b03
1098     /* this legacy called remained valid throughout hwloc 1.x */
1099     rc = hwloc_set_membind_nodeset(help->topology, n, HWLOC_MEMBIND_BIND,
1100             HWLOC_MEMBIND_THREAD |
1101             HWLOC_MEMBIND_STRICT);
1102 #else
1103     /* newer call */
1104     rc = hwloc_set_membind(help->topology, n, HWLOC_MEMBIND_BIND,
1105             HWLOC_MEMBIND_THREAD |
1106             HWLOC_MEMBIND_STRICT |
1107             HWLOC_MEMBIND_BYNODESET);
1108 #endif
1109     if (rc < 0) {
1110         char * s;
1111         hwloc_bitmap_asprintf(&s, n);
1112         if (errno == EXDEV) {
1113             fprintf(stderr, "Error, cannot enforce loose memory binding [ %s ]\n", s);
1114         } else {
1115             fprintf(stderr, "Error while attempting to set loose memory binding [ %s ]\n", s);
1116         }
1117         free(s);
1118         return -1;
1119     }
1120     rc = hwloc_set_cpubind(help->topology, c,
1121             HWLOC_CPUBIND_THREAD | HWLOC_CPUBIND_STRICT);
1122     if (rc < 0) {
1123         char * s;
1124         hwloc_bitmap_asprintf(&s, c);
1125         if (errno == EXDEV) {
1126             fprintf(stderr, "Error, cannot enforce loose cpu binding [ %s ]\n", s);
1127         } else {
1128             fprintf(stderr, "Error while attempting to set loose cpu binding [ %s ]\n", s);
1129         }
1130         free(s);
1131         return -1;
1132     }
1133 #endif
1134     return 0;
1135 }
1136 
set_subjob_binding(int k) const1137 int las_parallel_desc::set_subjob_binding(int k) const
1138 {
1139 #ifdef HAVE_HWLOC
1140     if (help->depth == 0)
1141         return -1;
1142 #endif
1143     set_subjob_cpu_binding(k);
1144     set_subjob_mem_binding(k);
1145     return 0;
1146 }
1147 
set_subjob_mem_binding(int k MAYBE_UNUSED) const1148 int las_parallel_desc::set_subjob_mem_binding(int k MAYBE_UNUSED) const
1149 {
1150 #ifdef HAVE_HWLOC
1151     if (help->depth == 0)
1152         return -1;
1153     ASSERT_ALWAYS(0<= k && k < (int) help->subjob_binding_cpusets.size());
1154     int m = k / number_of_subjobs_per_memory_binding_zone();
1155     ASSERT_ALWAYS(m < (int) help->memory_binding_nodesets.size());
1156 #if HWLOC_API_VERSION < 0x010b03
1157     /* this legacy called remained valid throughout hwloc 1.x */
1158     int rc = hwloc_set_membind_nodeset(help->topology,
1159             help->memory_binding_nodesets[m],
1160             HWLOC_MEMBIND_BIND,
1161             HWLOC_MEMBIND_THREAD |
1162             HWLOC_MEMBIND_STRICT);
1163 #else
1164     /* newer call */
1165     int rc = hwloc_set_membind(help->topology,
1166             help->memory_binding_nodesets[m],
1167             HWLOC_MEMBIND_BIND,
1168             HWLOC_MEMBIND_THREAD |
1169             HWLOC_MEMBIND_STRICT |
1170             HWLOC_MEMBIND_BYNODESET);
1171 #endif
1172     if (rc < 0) {
1173         char * s;
1174         hwloc_bitmap_asprintf(&s, help->memory_binding_nodesets[m]);
1175         if (errno == EXDEV) {
1176             fprintf(stderr, "Error, cannot enforce memory binding for job %d [ %s ]\n", k, s);
1177         } else {
1178             fprintf(stderr, "Error while attempting to set memory binding for job %d [ %s ]\n", k, s);
1179         }
1180         free(s);
1181         return -1;
1182     }
1183 #endif
1184     return 0;
1185 }
set_subjob_cpu_binding(int k MAYBE_UNUSED) const1186 int las_parallel_desc::set_subjob_cpu_binding(int k MAYBE_UNUSED) const
1187 {
1188 #ifdef HAVE_HWLOC
1189     if (help->depth == 0)
1190         return -1;
1191     ASSERT_ALWAYS(0<= k && k < (int) help->subjob_binding_cpusets.size());
1192     int rc = hwloc_set_cpubind(help->topology, help->subjob_binding_cpusets[k], HWLOC_CPUBIND_THREAD |  HWLOC_CPUBIND_STRICT);
1193     if (rc < 0) {
1194         char * s;
1195         hwloc_bitmap_asprintf(&s, help->subjob_binding_cpusets[k]);
1196         if (errno == EXDEV) {
1197             fprintf(stderr, "Error, cannot enforce cpu binding for job %d [ %s ]\n", k, s);
1198         } else {
1199             fprintf(stderr, "Error while attempting to set cpu binding for job %d [ %s ]\n", k, s);
1200         }
1201         free(s);
1202         return -1;
1203     }
1204 #endif
1205     return 0;
1206 }
1207 
#if 0
/* Disabled, never compiled. With hwloc and a nonzero help->depth, this
 * would return help->number_of(-1, 0) when replication is enabled and
 * help->memory_binding_size otherwise; in every other case it would
 * return nthreads_per_subjob.
 */
int las_parallel_desc::number_of_threads_loose() const {
#ifdef HAVE_HWLOC
    if (help->depth)
        return help->replicate ? help->number_of(-1, 0) : help->memory_binding_size;
    else
#endif
        return nthreads_per_subjob;
}
#endif
1218 
1219 #ifdef HAVE_HWLOC
current_memory_binding() const1220 cxx_hwloc_nodeset las_parallel_desc::current_memory_binding() const {
1221     return help->current_memory_binding();
1222 }
1223 #endif
1224 
1225