1 #include "cado.h" // IWYU pragma: keep
2 // IWYU pragma: no_include <ext/alloc_traits.h>
3 // IWYU pragma: no_include <hwloc/bitmap.h>
4 // IWYU pragma: no_include "hwloc/bitmap.h"
5
6 #include <errno.h> // for EXDEV, errno
7 #include <inttypes.h> // for PRIu64
8 #include <regex.h> // for regmatch_t, regcomp, regexec, regfree
9 #include <sstream> // IWYU pragma: keep
10 #include <stdint.h> // for uint64_t
11 #include <stdio.h> // for fprintf, stderr, size_t, fputs
12 #include <stdlib.h> // for free, exit, EXIT_FAILURE, EXIT_SUCCESS
13 #include <strings.h> // for strcasecmp
14 #include <mutex> // for mutex, lock_guard
15 #include <string> // for string, operator<<, char_traits, opera...
16 #include <tuple> // for tie, get, make_tuple, tuple
17 #include <vector> // for vector, vector<>::iterator
18 #ifdef HAVE_HWLOC
19 #include <hwloc.h>
20 #include "hwloc-aux.h"
21 #endif
22 #include "las-parallel.hpp"
23
24 #include "misc.h" // size_disp
25 #include "utils_cxx.hpp" // call_dtor
26 #include "verbose.h" // verbose_output_print
27 #include "macros.h"
28 #include "params.h"
29
30
31
/* Placement specifier that the "auto" alias expands to: it is what
 * replace_aliases() substitutes for "auto" (and, with ",no-replicate"
 * appended, for "auto,no-replicate"), and it is quoted in the text built
 * by extended_usage(). */
const char * default_placement_with_auto = "node,fit*4,fit,pu,loose";
33
/* Parse the tail of s that starts at offset pos as a non-negative
 * decimal integer, and store it in x.
 *
 * Returns true (and sets x) only when s[pos..] is a non-empty run of
 * the digits 0-9 whose value fits in an int. In every other case --
 * empty tail, pos at or past the end of s, any non-digit character,
 * value too large for an int -- the function returns false and leaves x
 * untouched.
 */
static bool parse_number(std::string const & s, int & x, std::string::size_type pos = 0) /*{{{*/
{
    /* nothing to parse (this also keeps substr() below from throwing) */
    if (pos >= s.size())
        return false;
    if (s.find_first_not_of("0123456789", pos) != std::string::npos)
        return false;
    std::istringstream is(s.substr(pos));
    int value = 0;
    /* extraction fails (failbit) when the digits overflow an int */
    if (!(is >> value))
        return false;
    x = value;
    return true;
}/*}}}*/
43
44 /* used to help with some of the job achieved by the las_parallel ctor.
45 * Otherwise no reason to expose within the public class, as it's of no
46 * real use (not even expository).
47 */
48 struct las_parallel_desc::helper {
/* The raw tokens of the placement specifier, as split by parse(). The
 * cpu binding token is left untouched by parse() when the specifier
 * has only three main tokens. */
std::string memory_binding_specifier_string;
std::string cpu_binding_specifier_string;
std::string jobs_within_cpu_binding_string;
std::string threads_per_job_string;

/* The banner is always printed once at least. The full description
 * only with -v */
std::string banner;
std::string full_diagnostics;

#ifdef HAVE_HWLOC
/* initialized by the constructor, destroyed by the destructor */
hwloc_topology_t topology;

/* synthetic description of the topology, as exported by hwloc */
std::string synthetic_topology_string;
/* number of levels of the topology; stays 0 when the constructor
 * gives up on an asymmetric topology */
int depth = 0;
/* depth_per_level[i] is the number of objects at depth i divided by
 * the number of objects at depth i-1 (1 for i == 0) */
std::vector<int> depth_per_level;

/* for the "fit" keyword */
mutable int computed_min_pu_fit = -1;

/* size() == number of memory binding zones */
std::vector<cxx_hwloc_nodeset> memory_binding_nodesets;

/* size() == total number of subjobs */
std::vector<cxx_hwloc_cpuset> subjob_binding_cpusets;

/* in GB (it is scaled by 2^30 in total_ram()) */
double total_ram_margin = 0;

/* Binding sizes, counted in PUs. They are given an initial value
 * here because the constructor may return early (asymmetric
 * topology) without ever assigning them. */
int memory_binding_size = 0;
int cpu_binding_size = 0;
#endif
int nsubjobs_per_cpu_binding_zone = 1;
int nthreads_per_subjob = 1;

/* modifiers, see parse() */
bool loose = false;
bool replicate = true;
is_looselas_parallel_desc::helper86 bool is_loose() const { return loose; }
is_strictlas_parallel_desc::helper87 bool is_strict() const { return !loose; }
want_replicatelas_parallel_desc::helper88 bool want_replicate() const { return replicate; }
89 /*
90 bool has_option(std::string & const opt) {
91 return binding_options.find(opt) != binding_options.end();
92 }
93 */
94
set_bannerlas_parallel_desc::helper95 void set_banner(std::string const & description_string) {
96 std::ostringstream os;
97 #ifdef HAVE_HWLOC
98 os << "# Applying binding "
99 << description_string
100 << " on a machine with topology "
101 << synthetic_topology_string
102 << " (" << (total_ram() >> 30) << " GB RAM)\n",
103 #else
104 os << "# Applying binding " << description_string << " with hwloc disabled\n",
105 #endif
106 banner += os.str();
107 }
108 #ifdef HAVE_HWLOC
modlas_parallel_desc::helper109 int mod(int&i) const {/*{{{*/
110 i = i % depth; if (i < 0) i += depth;
111 return i;
112 }/*}}}*/
number_atlas_parallel_desc::helper113 int number_at(int i) const {/*{{{*/
114 return hwloc_get_nbobjs_by_depth(topology, mod(i));
115 }/*}}}*/
number_oflas_parallel_desc::helper116 int number_of(int child_depth, int parent_depth) const/*{{{*/
117 {
118 mod(child_depth);
119 mod(parent_depth);
120 if (child_depth < parent_depth) return 0;
121 int n = 1;
122 for(int i = parent_depth ; i < child_depth ; ++i)
123 n *= depth_per_level[i+1];
124 return n;
125 }/*}}}*/
126 #endif
helperlas_parallel_desc::helper127 helper() {/*{{{*/
128 /* Here we need hwloc, of course */
129 #ifdef HAVE_HWLOC
130 hwloc_topology_init(&topology);
131 {/*{{{ load the topology `*/
132 unsigned long flags = 0;
133 #if HWLOC_API_VERSION >= 0x010700
134 flags = hwloc_topology_get_flags(topology);
135 #endif /* HWLOC_API_VERSION >= 0x010700 */
136 /* we must make sure to remove these flags, but it's likely that
137 * they're off by default anyway */
138 #if HWLOC_API_VERSION < 0x020000
139 flags &= ~(HWLOC_TOPOLOGY_FLAG_IO_DEVICES | HWLOC_TOPOLOGY_FLAG_IO_BRIDGES);
140 #endif
141 hwloc_topology_set_flags(topology, flags);
142
143 hwloc_topology_load(topology);
144
145 hwloc_obj_t root = hwloc_get_root_obj(topology);
146 if (!root->symmetric_subtree) {
147 fprintf(stderr, "# Topology is not symmetric,"
148 " cannot proceed with replication"
149 " of the las process with the current code."
150 " No cpu/memory binding will be set.\n");
151 /* simply stick to the default */
152 return;
153 }
154 }/*}}}*/
155 depth = hwloc_topology_get_depth(topology);
156 depth_per_level.reserve(depth);
157 int n = 1;
158 std::ostringstream os;
159 for(int i = 0 ; i < depth ; i++) {
160 int x = hwloc_get_nbobjs_by_depth(topology, i);
161 depth_per_level.push_back(x / n);
162 n = x;
163 }
164 char buf[1024];
165 hwloc_topology_export_synthetic(topology, buf, sizeof(buf), HWLOC_TOPOLOGY_EXPORT_SYNTHETIC_FLAG_NO_ATTRS );
166 synthetic_topology_string = buf;
167
168 /* Form a sensible set of defaults, for one unbound thread */
169 memory_binding_size = number_of(-1,0);
170 cpu_binding_size = memory_binding_size;
171 compute_binding_bitmaps();
172 #endif
173 }/*}}}*/
174
/* Release the hwloc topology that the constructor initialized. Without
 * hwloc there is nothing to do.
 * NOTE(review): no copy constructor or copy assignment is declared in
 * the visible part of this struct; copying a helper would presumably
 * destroy the same topology twice -- confirm that it is never copied. */
~helper() {/*{{{*/
#ifdef HAVE_HWLOC
    hwloc_topology_destroy(topology);
#endif
}/*}}}*/
tokenizelas_parallel_desc::helper180 std::vector<std::string> tokenize(std::string const & s) const {/*{{{*/
181 using namespace std;
182 vector<string> tokens;
183 for(string::size_type x = 0, y; x != string::npos ; x = y) {
184 y = s.find(',',x);
185 if (y == string::npos) {
186 tokens.push_back(s.substr(x));
187 } else {
188 tokens.push_back(s.substr(x, y-x));
189 y++;
190 }
191 }
192 return tokens;
193 }/*}}}*/
replace_aliaseslas_parallel_desc::helper194 void replace_aliases(std::string & desc) const {/*{{{*/
195 using namespace std;
196 int k;
197 if (parse_number(desc, k)) {
198 ostringstream os;
199 os << "machine,1," << k;
200 desc = os.str();
201 return;
202 }
203 if (desc == "single") { desc = "machine,1,pu"; return; }
204 if (desc == "auto") { desc = default_placement_with_auto; return; }
205 if (desc == "auto,no-replicate") { desc = default_placement_with_auto; desc += ",no-replicate"; return; }
206 if (desc.substr(0,7) == "single-") {
207 ostringstream os;
208 os << desc.substr(7) << ",1,pu";
209 desc = os.str();
210 return;
211 }
212 }/*}}}*/
parselas_parallel_desc::helper213 void parse(std::string & desc) { /* {{{ */
214 replace_aliases(desc);
215
216 std::vector<std::string> tokens = tokenize(desc);
217
218 bool loose_opt = false;
219 bool strict_opt = false;
220 bool norepl_opt = false;
221 for( ; tokens.size() >= 3 ; ) {
222 std::string const & s(tokens.back());
223 if (s == "strict") { strict_opt = true; }
224 else if (s == "loose") { loose_opt = true; }
225 else if (s == "no-replicate") { norepl_opt = true; }
226 else break;
227 tokens.erase(--tokens.end());
228 }
229 if (loose_opt && strict_opt)
230 throw bad_specification("loose and strict are incompatible");
231 if (loose_opt) loose = true;
232 if (strict_opt) loose = false;
233 if (norepl_opt) replicate = false;
234
235 if (tokens.size() == 3) {
236 memory_binding_specifier_string = tokens[0];
237 jobs_within_cpu_binding_string = tokens[1];
238 threads_per_job_string = tokens[2];
239 } else if (tokens.size() == 4) {
240 memory_binding_specifier_string = tokens[0];
241 cpu_binding_specifier_string = tokens[1];
242 jobs_within_cpu_binding_string = tokens[2];
243 threads_per_job_string = tokens[3];
244 } else {
245 throw bad_specification("not the right number of tokens, or wrong options");
246 }
247 }/*}}}*/
248 #ifdef HAVE_HWLOC
total_ramlas_parallel_desc::helper249 uint64_t total_ram() const {/*{{{*/
250 hwloc_obj_t root = hwloc_get_root_obj(topology);
251 /* I'm not sure about how I want to get the info on the amount of
252 * available ram. hwloc-info, as such, does not seem to document
253 * a way (beyond the fact that this bit of info is output with
254 * "machine" and "Numanode" specifiers). There is an
255 * hwloc_obj_memory_s type in the hwloc C api, and that seems to
256 * be attached to hwloc_obj structures as well. But at any rate,
257 * this gives some notion of available memory. This is well and
258 * good, except that it might be misleading if a job that takes
259 * "almost" 4GB is scheduled to be placedonly 4 times on a 16GB
260 * machine when we specify --job-memory 4 (with some confidence
261 * that this is a convenient enough upper bound).
262 */
263
264 #if HWLOC_API_VERSION < 0x020000
265 uint64_t ram = root->memory.total_memory;
266 #else
267 uint64_t ram = root->total_memory;
268 #endif
269 /* Round this up to at most 1/16-th. This will do rubbish on a
270 * machine where the 1 bits in the binary expansion of the
271 * hardware ram size spread more than 4 positions, but we find
272 * that unlikely.
273 */
274 for(uint64_t x = ram >> 4; x ; x >>= 1) ram |= x;
275 ram = ram + 1;
276 ram = ram - ((uint64_t) (total_ram_margin * (1<<30)));
277 return ram;
278 }/*}}}*/
enclosing_depthlas_parallel_desc::helper279 int enclosing_depth(int n) const {/*{{{*/
280 /* return deepest level k such that an object at depth k contains
281 * more than, or exactly N PUs.
282 */
283 int k;
284 int m = 1;
285 ASSERT_ALWAYS(n >= 1);
286 for(k = depth - 1; n > m && k >= 0 ; k--) {
287 ASSERT(number_of(-1, k) == m);
288 m *= depth_per_level[k];
289 ASSERT(k == 0 || number_of(-1, k-1) == m);
290 }
291
292 if (k >= 0) {
293 ASSERT(number_of(-1, k) >= n);
294 if (k == depth - 1) {
295 ASSERT(n == 1);
296 } else {
297 ASSERT(number_of(-1, k + 1) < n);
298 }
299 }
300
301 return k;
302 }/*}}}*/
flat_to_hierarchicallas_parallel_desc::helper303 std::tuple<int, int, int> flat_to_hierarchical(int n) const {/*{{{*/
304 int k = enclosing_depth(n);
305 /* a binding scope that has to meet the constraint of containing
306 * at least n PUs can do so with an object at depth
307 * enclosing_depth(n). However, a finer grain might be possible,
308 * and that may be an important optimization when we have many
309 * edges in the topology tree at this level: it is perhaps
310 * possible to divide the object at depth k into equal parts made
311 * of several objects at depth k+1, all parts meeting the
312 * constraint. We count how large these parts must be (counted in
313 * terms of number of objects at depth k+1, called "children"
314 * here.).
315 */
316 if (k == depth - 1) {
317 ASSERT(n == 1);
318 return std::make_tuple(k, 1, 1);
319 }
320 int child_depth = k + 1;
321 int child_size = number_of(-1, k + 1);
322 int part_size = iceildiv(n, child_size);
323 int v = number_of(k+1, k);
324 ASSERT(part_size <= v);
325 if (replicate)
326 for( ; v % part_size ; part_size++);
327 if (part_size == v) {
328 child_depth--;
329 child_size *= v;
330 part_size = 1;
331 }
332 return std::make_tuple(child_depth, child_size, part_size);
333 }/*}}}*/
acceptable_bindinglas_parallel_desc::helper334 int acceptable_binding(int n) const {/*{{{*/
335 auto x = flat_to_hierarchical(n);
336 return std::get<1>(x) * std::get<2>(x);
337 }/*}}}*/
textual_description_for_bindinglas_parallel_desc::helper338 std::string textual_description_for_binding(int n) const {/*{{{*/
339 int k, child_size, part_size;
340 ASSERT(n <= number_of(-1, 0));
341 std::tie(k, child_size, part_size) = flat_to_hierarchical(n);
342 for( ; k < depth - 1 && number_of(k+1, k) == 1 ; k++);
343 hwloc_obj_t obj = hwloc_get_obj_by_depth(topology, k, 0);
344 char s[256];
345 hwloc_obj_type_snprintf(s, sizeof(s), obj, 1);
346 if (part_size == 1) {
347 return std::string(s);
348 } else {
349 std::ostringstream os;
350 os << std::string(s) << ":0-" << part_size - 1;
351 return os.str();
352 }
353 }/*}}}*/
all_textual_descriptions_for_bindinglas_parallel_desc::helper354 std::vector<std::string> all_textual_descriptions_for_binding(int n) const {/*{{{*/
355 /* n is a binding granularity. Return all the subsets of the
356 * machine, all identical, that have some hardware relevance (=
357 * correspond to an integer fraction of the subtree at some depth,
358 * and made of entire subtrees at the level below), and all
359 * contain at least n PUs.
360 */
361 int k, child_size, part_size;
362 ASSERT(n <= number_of(-1, 0));
363 std::tie(k, child_size, part_size) = flat_to_hierarchical(n);
364 for( ; k < depth - 1 && number_of(k+1, k) == 1 ; k++);
365 std::vector<std::string> res;
366 int nk = number_of(k, 0);
367 for(int i = 0 ; i < nk ; i+=part_size) {
368 char s[256];
369 hwloc_obj_t obj = hwloc_get_obj_by_depth(topology, k, i);
370 hwloc_obj_type_snprintf(s, sizeof(s), obj, 0);
371 std::ostringstream os;
372 int a = obj->logical_index;
373 int b = a + part_size - 1;
374 if (part_size == 1) {
375 os << s << ":" << a;
376 } else {
377 os << s << ":" << a << '-' << b;
378 }
379 res.push_back(os.str());
380 }
381 return res;
382 }/*}}}*/
all_cpu_bitmapslas_parallel_desc::helper383 std::vector<cxx_hwloc_cpuset> all_cpu_bitmaps(int width, int multiply = 1)/*{{{*/
384 {
385 int n = width;
386 int k, child_size, part_size;
387 ASSERT(n <= number_of(-1, 0));
388 std::tie(k, child_size, part_size) = flat_to_hierarchical(n);
389 std::vector<cxx_hwloc_cpuset> res;
390 int nk = number_of(k, 0);
391 for(int i = 0 ; i < nk ; i+=part_size) {
392 cxx_hwloc_cpuset c;
393 for(int j = 0 ; j < part_size ; j++)
394 c = c | hwloc_get_obj_by_depth(topology, k, i + j)->cpuset;
395 for(int k = 0 ; k < multiply ; k++)
396 res.push_back(c);
397 }
398 return res;
399 }/*}}}*/
all_mem_bitmapslas_parallel_desc::helper400 std::vector<cxx_hwloc_nodeset> all_mem_bitmaps(int width, int multiply = 1)/*{{{*/
401 {
402 int n = width;
403 int k, child_size, part_size;
404 ASSERT(n <= number_of(-1, 0));
405 std::tie(k, child_size, part_size) = flat_to_hierarchical(n);
406 std::vector<cxx_hwloc_cpuset> res;
407 int nk = number_of(k, 0);
408 for(int i = 0 ; i < nk ; i+=part_size) {
409 cxx_hwloc_nodeset c;
410 for(int j = 0 ; j < part_size ; j++)
411 c = c | hwloc_get_obj_by_depth(topology, k, i + j)->nodeset;
412 for(int k = 0 ; k < multiply ; k++)
413 res.push_back(c);
414 }
415 return res;
416 }/*}}}*/
min_pu_fitlas_parallel_desc::helper417 int min_pu_fit(double jobram) {/*{{{*/
418 if (computed_min_pu_fit >= 0) return computed_min_pu_fit;
419 /* This computes the minimal number n of PUs such that
420 * floor(total_PUs/n) jobs, each using jobram GB, fit on the
421 * machine and do not exceed the total amount of __physical__ RAM.
422 *
423 * hwloc does give us a useful bit of information regarding
424 * available, and not physical ram. We strive to get the physical
425 * ram because that amount is generally better known to the user,
426 * and one expects the automatic placement logic to find that
427 * *four* jobs of say 12GB would fit on a 48GB machine.
428 *
429 * Note that this integer is not yet of interest to the
430 * hierarchical topology.
431 */
432 /* how many jobs can fit ? */
433 int how_many = total_ram() / (jobram * (1<<30));
434 if (how_many == 0) {
435 fprintf(stderr, "This machine does not have enough memory"
436 " (%" PRIu64 " GB) to fit jobs that need %.2f GB\n",
437 total_ram() >> 30, jobram);
438 exit(EXIT_FAILURE);
439 }
440 /* This means subjobs can't be smaller than this: */
441 computed_min_pu_fit = iceildiv(number_of(-1, 0), how_many);
442 std::string s = textual_description_for_binding(computed_min_pu_fit);
443 std::ostringstream os;
444 os << "# Given our estimate of "
445 << size_disp(jobram * (1<<30)) << " RAM per subjob,"
446 << " the \"fit\" binding\n"
447 << "# corresponds to a minimum size of "
448 << computed_min_pu_fit << " covered PUs in the hwloc hierarchy,\n"
449 << "# i.e. no less than \"" << s << "\"\n";
450 full_diagnostics += os.str();
451 return computed_min_pu_fit;
452 }/*}}}*/
compute_binding_bitmapslas_parallel_desc::helper453 void compute_binding_bitmaps() {/*{{{*/
454 subjob_binding_cpusets = all_cpu_bitmaps(cpu_binding_size, nsubjobs_per_cpu_binding_zone);
455 memory_binding_nodesets = all_mem_bitmaps(memory_binding_size);
456 }/*}}}*/
current_memory_bindinglas_parallel_desc::helper457 cxx_hwloc_nodeset current_memory_binding() const {/*{{{*/
458 cxx_hwloc_nodeset nn;
459 hwloc_membind_policy_t pol;
460 #if HWLOC_API_VERSION < 0x010b03
461 /* this legacy called remained valid throughout hwloc 1.x */
462 int rc = hwloc_get_membind_nodeset(topology, nn, &pol,
463 HWLOC_MEMBIND_THREAD);
464 #else
465 /* newer call */
466 int rc = hwloc_get_membind(topology, nn, &pol,
467 HWLOC_MEMBIND_THREAD | HWLOC_MEMBIND_BYNODESET);
468 #endif
469 if (rc < 0) {
470 static std::mutex mm;
471 std::lock_guard<std::mutex> dummy(mm);
472 static int got_message = 0;
473 if (!got_message++)
474 fprintf(stderr, "Error while attempting to get memory binding\n");
475 hwloc_bitmap_zero(nn);
476 }
477 return nn;
478 }/*}}}*/
479 #endif
interpret_memory_binding_specifierlas_parallel_desc::helper480 int interpret_memory_binding_specifier() {
481 #ifdef HAVE_HWLOC
482 if (depth) {
483 memory_binding_size = interpret_generic_binding_specifier(memory_binding_specifier_string);
484 std::ostringstream os;
485 os
486 << "# Memory binding specifier: request: "
487 << memory_binding_specifier_string
488 << " ; resolves to: "
489 << textual_description_for_binding(memory_binding_size)
490 << "\n";
491 full_diagnostics += os.str();
492 return memory_binding_size;
493 } else
494 #endif
495 {
496 /* just check for errors */
497 return interpret_generic_binding_specifier(memory_binding_specifier_string);
498 }
499 }
interpret_cpu_binding_specifierlas_parallel_desc::helper500 int interpret_cpu_binding_specifier() {
501 #ifdef HAVE_HWLOC
502 if (depth) {
503 if (cpu_binding_specifier_string.empty()) {
504 cpu_binding_size = memory_binding_size;
505 } else {
506 /* cap to memory binding size */
507 cpu_binding_size = interpret_generic_binding_specifier(cpu_binding_specifier_string, memory_binding_size);
508 if (memory_binding_size % cpu_binding_size)
509 throw bad_specification(
510 "\n",
511 banner,
512 full_diagnostics,
513 "# ERROR: cpu binding "
514 , cpu_binding_specifier_string,
515 " yields a size of ", cpu_binding_size, " PUs,"
516 " but this must be an integer divisor of",
517 " the memory binding size of ", memory_binding_size,
518 " PUs, which follows from the specifier ",
519 memory_binding_specifier_string);
520 }
521 std::ostringstream os;
522 os
523 << "# Cpu binding specifier: request: "
524 << cpu_binding_specifier_string
525 << " ; resolves to: "
526 << textual_description_for_binding(cpu_binding_size)
527 << "\n";
528 full_diagnostics += os.str();
529 return cpu_binding_size;
530 } else
531 #endif
532 {
533 /* just check for errors */
534 if (cpu_binding_specifier_string.empty()) {
535 return 0;
536 } else {
537 return interpret_generic_binding_specifier(cpu_binding_specifier_string);
538 }
539 }
540 }
interpret_generic_binding_specifierlas_parallel_desc::helper541 int interpret_generic_binding_specifier(std::string const & specifier, int cap MAYBE_UNUSED = -1) {/*{{{*/
542 #ifdef HAVE_HWLOC
543 if (!depth) {
544 if (strcasecmp(specifier.c_str(), "machine") != 0)
545 throw bad_specification("hwloc detected asymmetric topology, the only accepted memory binding specifier is \"machine\"");
546 return 0;
547 }
548 int binding_size;
549 /* and apply our different calculation rules to the provided
550 * string (either memory_binding_specifier_string or
551 * cpu_binding_specifier_string if there is one). */
552 if (parse_number(specifier, binding_size))
553 return binding_size;
554 const char * binding_specifier_regexp =
555 "^(" // 1 : non-limited part.
556 "([^*/[:space:]]*)" // 2: object, or "fit"
557 "(\\*([0-9]+))?" // 3,4: coarsening
558 "(/([0-9]+))?" // 5,6: restricting
559 ")"
560 "(/([^[:space:]]*))?" // 7,8: limiting
561 "$";
562 regex_t R;
563 regcomp(&R, binding_specifier_regexp, REG_ICASE|REG_EXTENDED);
564 auto dummy = call_dtor([&](){regfree(&R);});
565 regmatch_t m[9];
566 int r = regexec(&R, specifier.c_str(), 9, m, 0);
567 if (r != 0)
568 throw bad_specification("binding specifier ",
569 specifier, " is invalid");
570 int objsize;
571 auto sub = [&](int i, bool want = false) {
572 std::string res;
573 if (want) ASSERT_ALWAYS(m[i].rm_so >= 0);
574 if (m[i].rm_so >= 0)
575 res = specifier.substr(m[i].rm_so, m[i].rm_eo - m[i].rm_so);
576 return res;
577 };
578 std::string base_object = sub(2, true);
579 bool is_fit = strcasecmp(base_object.c_str(), "fit") == 0;
580 if (is_fit) {
581 if (computed_min_pu_fit < 0) throw needs_job_ram();
582 objsize = computed_min_pu_fit;
583 } else {
584 int argdepth = hwloc_aux_get_depth_from_string(topology, base_object.c_str());
585 #if HWLOC_API_VERSION >= 0x020000
586 if (argdepth == HWLOC_TYPE_DEPTH_NUMANODE) {
587 argdepth = hwloc_get_memory_parents_depth(topology);
588 }
589 #endif
590 if (argdepth < 0)
591 throw bad_specification(base_object, " is invalid");
592 objsize = number_of(-1, argdepth);
593 }
594 std::string multiplier_string = sub(4);
595 if (!multiplier_string.empty()) {
596 int x;
597 bool t = parse_number(multiplier_string, x);
598 ASSERT_ALWAYS(t);
599 objsize *= x;
600 }
601 if (cap < 0) cap = number_of(-1, 0);
602 if (objsize > cap)
603 objsize = cap;
604 if (is_fit)
605 objsize = acceptable_binding(objsize);
606 std::string divisor_string = sub(6);
607 if (!divisor_string.empty()) {
608 int x;
609 bool t = parse_number(divisor_string, x);
610 ASSERT_ALWAYS(t);
611 if (!x || (objsize % x)) {
612 std::ostringstream os;
613 os << "specifier binding specifier is invalid. Cannot divide "
614 << base_object << sub(3) << " (=" << objsize << ") into "
615 << x << " parts";
616 if (is_strict()) {
617 throw bad_specification(os.str());
618 } else {
619 fprintf(stderr, "ERROR: %s\n", os.str().c_str());
620 }
621 }
622 if (x) objsize /= x;
623 }
624 std::string limiting_string = sub(8);
625 if (!limiting_string.empty()) {
626 int argdepth = hwloc_aux_get_depth_from_string(topology, limiting_string.c_str());
627 if (argdepth < 0)
628 throw bad_specification(limiting_string, " is invalid");
629 int compare = number_of(-1, argdepth);
630 if (objsize > compare) {
631 std::ostringstream os;
632 os << "automated binding "
633 << textual_description_for_binding(objsize)
634 << " deduced from \"" << sub(2) << sub(4) << sub(6) << "\""
635 << " (= " << objsize << ")"
636 << " escapes the advised binding given by " << sub(7)
637 << " (which is "
638 << textual_description_for_binding(compare)
639 << ")";
640 if (is_strict()) {
641 throw bad_specification(os.str());
642 } else {
643 fprintf(stderr, "Warning: %s\n", os.str().c_str());
644 }
645 }
646 }
647 return objsize;
648 #else
649 if (strcasecmp(specifier.c_str(), "machine") != 0)
650 throw bad_specification("hwloc being disabled, the only accepted binding specifier is \"machine\"");
651 return 0;
652 #endif
653 }/*}}}*/
interpret_njobs_specifierlas_parallel_desc::helper654 int interpret_njobs_specifier() {/*{{{*/
655 if (parse_number(jobs_within_cpu_binding_string, nsubjobs_per_cpu_binding_zone))
656 return nsubjobs_per_cpu_binding_zone;
657 #ifdef HAVE_HWLOC
658 int objsize;
659 if (strcasecmp(jobs_within_cpu_binding_string.c_str(), "fit") == 0) {
660 if (computed_min_pu_fit < 0) throw needs_job_ram();
661 objsize = acceptable_binding(computed_min_pu_fit);
662 } else if (jobs_within_cpu_binding_string.find_first_of("*/") != std::string::npos) {
663 throw bad_specification("Only binding specifiers allow multipliers");
664 } else {
665 int argdepth = hwloc_aux_get_depth_from_string(topology, jobs_within_cpu_binding_string.c_str());
666 if (argdepth < 0)
667 throw bad_specification(jobs_within_cpu_binding_string, " is invalid");
668 objsize = number_of(-1, argdepth);
669 }
670 if (cpu_binding_size % objsize)
671 throw bad_specification(
672 "\n",
673 banner,
674 full_diagnostics,
675 "ERROR: cannot place jobs according to",
676 " the ", jobs_within_cpu_binding_string, " rule",
677 " (which resolves to: ",
678 textual_description_for_binding(objsize), "),"
679 " as there is not an integer number of these",
680 " in the imposed binding ",
681 textual_description_for_binding(cpu_binding_size),
682 " [HINT: are you using -t auto and running out of RAM on NUMA nodes?]");
683 nsubjobs_per_cpu_binding_zone = cpu_binding_size / objsize;
684 std::ostringstream os;
685 os
686 << "# Njobs specifier: request: "
687 << jobs_within_cpu_binding_string
688 << " ; resolves to: "
689 << textual_description_for_binding(objsize)
690 << " (" << nsubjobs_per_cpu_binding_zone << " jobs)"
691 << "\n";
692 full_diagnostics += os.str();
693 return nsubjobs_per_cpu_binding_zone;
694 #else
695 throw bad_specification("hwloc being disabled, the only accepted specifiers for the number of jobs are integers");
696 #endif
697 }/*}}}*/
interpret_nthreads_specifierlas_parallel_desc::helper698 int interpret_nthreads_specifier() {/*{{{*/
699 if (parse_number(threads_per_job_string, nthreads_per_subjob))
700 return nthreads_per_subjob;
701 #ifdef HAVE_HWLOC
702 int objsize;
703 int argdepth = hwloc_aux_get_depth_from_string(topology, threads_per_job_string.c_str());
704 if (argdepth < 0)
705 throw bad_specification(threads_per_job_string, " is invalid");
706 objsize = number_of(-1, argdepth);
707 int loose_per_job_scale = cpu_binding_size / nsubjobs_per_cpu_binding_zone;
708 if (loose_per_job_scale % objsize)
709 throw bad_specification("cannot place threads according to",
710 " the ", threads_per_job_string, " rule,",
711 " as there is not an integer number of these",
712 " in the per-job fraction (1/", nsubjobs_per_cpu_binding_zone, ")",
713 " of the binding ",
714 textual_description_for_binding(cpu_binding_size));
715 nthreads_per_subjob = loose_per_job_scale / objsize;
716 std::ostringstream os;
717 os
718 << "# Nthreads specifier: request: "
719 << threads_per_job_string
720 << " ; resolves to: "
721 << textual_description_for_binding(objsize)
722 << " (" << nthreads_per_subjob << " threads)"
723 << "\n";
724 full_diagnostics += os.str();
725 return nthreads_per_subjob;
726 #else
727 throw bad_specification("hwloc being disabled, the only accepted specifiers for the number of threads are integers");
728 #endif
729 }/*}}}*/
730 };
731
extended_usage()732 static void extended_usage()/*{{{*/
733 {
734 std::ostringstream os;
735 os << R"(
736 Documentation for the --job-binding-policy (-t) option.
737 =======================================================
738
739 Option takes either an alias, or a longer specificier. Most of the
740 functionality depends on the hwloc library being usable. Examples are
741 provided after the syntax description
742
743
744 Aliases, first:
745 ===============
746
747 [an integer N]: equivalent to Machine,1,N
748 (run one N-threaded job, not bound to anything).
749 single: equivalent to single-machine
750 single-XXX: equivalent to XXX,1,PU
751 )"
752 << " auto: equivalent to " << default_placement_with_auto << " [see below]\n"
753 << " auto,no-replicate: equivalent to " << default_placement_with_auto << ",no-replicate [see below]\n"
754 << R"(
755 Syntax of the specifiers
756 ========================
757
758 [memory binding level],[cpu binding level],[number of jobs with this binding],[threads per job][,modifiers]
759
760 The cpu binding level may be omitted, and defaults to the same value as
761 the machine binding level.
762
763 All three (or four) main items may be set as integers. However this is
764 probably not the most useful, and certainly not the most portable way to
765 set them.
766
767 Binding level:
768 --------------
769
770 Both the memory binding level and the cpu binding level follow the exact
771 same syntax.
772
773 First observe the output ow hwloc-info (may vary depending on the
774 machine).
775 localhost ~ $ hwloc-info --no-io
776 depth 0: 1 Machine (type #1)
777 depth 1: 2 NUMANode (type #2)
778 depth 2: 2 Package (type #3)
779 depth 3: 2 L3Cache (type #4)
780 depth 4: 16 L2Cache (type #4)
781 depth 5: 16 L1Cache (type #4)
782 depth 6: 16 Core (type #5)
783 depth 7: 32 PU (type #6)
784 This means 2 NUMANode per machine, 8 Core per NUMANode, 2 PU per
785 Core.
786
787 The syntax of the binding level is
788 [integer]
789 OR [hwloc object name](\*[integer]|/[integer])?
790 OR fit(\*[integer])?(/[hwloc object name])?
791
792 Here's how it goes to specify the binding level as an integer. As
793 mentioned, this is neither easy nor portable, look further for the
794 recommended way to set via level aliases. An integer N defines a binding
795 level which corresponds to N of the innermost instances (here, PU is the
796 innermost object. It is probably always so. The text below says PUs, but
797 the code means the innermost object anyway). Let k be the depth
798 such that an object at depth k contains strictly more than N PUs, while
799 an object at depth k+1 contains less than, or exactly N PUs. For example,
800 above, for N=8 we have k=3 (at depth k+1 a single L2Cache is above less
801 than 8 PUs, but at level k=3, below one L3Cache object we have 16 PUs,
802 which is more than 8). When specifying an integer N, it must be a
803 multiple of the number of PUs below an object at level k+1 (here, 8 is
804 indeed a multiple of 2, which is the number of PUs below an L2Cache), as
805 well as a divisor of the number of PUs below an object at level k (here,
806 8 divides 16). The latter requirement is waived if the "no-replicate"
807 modifier is present.
808
809 We bind jobs to hwloc objects at a particular depth by writing simply the
810 object name (e.g. NUMANode) as a binding level. Names are never
811 case-sensitive, and match in the same way hwloc tools such as hwloc-calc
812 recognizes them. In the example above, this is equivalent to the integer
813 16, but this varies. When specifying an object name, it is possible to
814 append "*N" or "/N" to loosen or restrict the binding level, provided the
815 resulting integer satisfies the constraints above.
816
817 The "fit" keyword is special. It corresponds to the minimum number of PUs
818 that must be grouped together so that if the machine is filled with
819 identical jobs, the memory available on the machine suffices. This relies
820 on an estimation of the amount of required memory, of course (see also
821 --job-memory). It is possible to write fit*N, which loosens this binding
822 (always subject to the same requirement). When appending /XXX, where XXX
823 denotes an object name, this ensures that the corresponding binding does
824 not escape the size of one object of that name. See also the "loose" and
825 "strict" modifiers.
826
827 Number of jobs with same binding
828 --------------------------------
829
830 This is an integer, too. We may also use hwloc specifiers as shortcuts.
831 However the interpretation is reversed compared to the binding level.
832 Here, "Core" indicates a number which is the number of "Core" objects
833 within the binding specification.
834
835 The "fit" keyword echoes the functionality of the "fit" binding
836 specifier. Again, it means that within the binding specifier, we put
837 exactly the number of jobs so that if this is replicated over the whole
838 machine, the memory fits.
839
840 Naturally, when using an object name (or "fit"), it is mandatory that the
841 binding specifier contains a positive integer number of objects of that
842 name.
843
844 Number of threads per job
845 -------------------------
846
847 Again, an integer. As with the number of jobs, aliases are understood
848 relative to the job size (which may occupy only a fraction of the binding
849 level, but always an integer fraction anyway).
850
851 The two most useful settings are "PU" and "Core", which are equivalent to
852 using or not using hyperthreading.
853
854 Modifiers
855 ---------
856
857 'loose' / 'strict' (default: 'loose'). If a binding specifier of the form
858 fit*N/XXX or fit/XXX is used, warn (with 'loose') or error out (with
859 'strict') if the calculated binding is coarser than XXX.
860
861 'no-replicate' : by default, we may put several jobs with the same
862 binding, and the set of jobs that are put in a single binding is
863 replicated over the whole machine (as many times as this binding fits in
864 the machine). With the "no-replicate" option, this replication is
865 disabled (however we still have the same number of sub-jobs with one
866 binding).
867
868 Examples
869 ========
870
871 -t 8 -> resolves to machine,1,8 = run only one job, with no
872 particular binding. There is only one job, because there is
873 only one "machine" per... machine, so whether or not we use
874 the "no-replicate" modifier makes no difference.
875
876 -t numa,fit,pu,strict
877 -> binds jobs at the numa level. Set as many jobs as can fit,
878 and then run each on exactly the number of pus (hyperthreaded
879 cores). Abort if it is not possible to fit on one numa node.
880
881 -t numa,core*2,fit,pu
882 -> memory-bind at the numa level, and do cpu-bind on pairs of
883 cores. Stick as many different jobs as can fit in each cpu
884 binding context, and have each of these jobs use as many
885 threads as the number of pus we have (per sub-job in the cpu
886 binding context of two cores).
887
888 -t numa,fit,20
889 -> same, but force 20-thread jobs (no matter what).
890
891 -t auto -> resolves to )" << default_placement_with_auto << R"( = bind fractions of NUMA
892 nodes, and put as many jobs in there as can fit. Then have
893 each run the number of threads so that all PUs are busy.
894
895 -t package/2,core,pu
896 -> binds to half packages, run 1 job per core, and have each
897 job use as many threads as we have PUs per core.
898 )";
899 fputs(os.str().c_str(), stderr);
900 }/*}}}*/
901
/* Default constructor: allocates a fresh helper object and stores it in
 * `help`. Nothing else is set here; the constructor taking a parameter
 * list delegates to this one before doing the actual work.
 */
las_parallel_desc::las_parallel_desc()
    : help (std::make_shared<helper>())
{}
905
las_parallel_desc(cxx_param_list & pl,double jobram_arg)906 las_parallel_desc::las_parallel_desc(cxx_param_list & pl, double jobram_arg)
907 : las_parallel_desc()
908 {
909 jobram = jobram_arg;
910 const char * desc_c0 = param_list_lookup_string(pl, "t");
911
912 if (!desc_c0) {
913 verbose_output_start_batch();
914 verbose_output_print(0, 1, "# No -t option found, running single-threaded.");
915 #ifdef HAVE_HWLOC
916 verbose_output_print(0, 1, " See also -t help");
917 #endif
918 verbose_output_print(0, 1, "\n");
919 verbose_output_end_batch();
920 /* feed a constant string */
921 description_string = "1";
922 } else {
923 description_string = desc_c0;
924 }
925
926 if (description_string == "help") {/*{{{*/
927 extended_usage();
928 exit(EXIT_SUCCESS);
929 }/*}}}*/
930
931 help->replace_aliases(description_string);
932 help->set_banner(description_string);
933
934 #ifdef HAVE_HWLOC
935 param_list_parse_double(pl, "memory-margin", &help->total_ram_margin);
936 /* The --job-memory argument will **ALWAYS** win */
937 param_list_parse_double(pl, "job-memory", &jobram);
938 if (jobram != -1)
939 help->min_pu_fit(jobram);
940 #endif
941
942 try {
943 help->parse(description_string);
944 } catch (bad_specification & b) {
945 throw bad_specification("Cannot interpret specification -t ",
946 description_string,
947 ": ", b.what(), "."
948 " See -t help for extended documentation");
949 }
950 help->interpret_memory_binding_specifier();
951 help->interpret_cpu_binding_specifier();
952 help->interpret_njobs_specifier();
953 help->interpret_nthreads_specifier();
954
955 nsubjobs_per_cpu_binding_zone = help->nsubjobs_per_cpu_binding_zone;
956 nthreads_per_subjob = help->nthreads_per_subjob;
957
958 #ifdef HAVE_HWLOC
959 if (!help->depth) return;
960 memory_binding_size = help->memory_binding_size;
961 nmemory_binding_zones = help->number_of(-1, 0) / memory_binding_size;
962 if (!help->replicate)
963 nmemory_binding_zones = 1;
964 cpu_binding_size = help->cpu_binding_size;
965 ncpu_binding_zones_per_memory_binding_zone = memory_binding_size / cpu_binding_size;
966
967 /* Need to populate the arrays subjob_binding_cpusets and
968 * memory_binding_nodesets. */
969 help->compute_binding_bitmaps();
970
971 #if 0
972 const struct hwloc_topology_support* s = hwloc_topology_get_support(help->topology);
973 /* see /usr/share/doc/libhwloc-doc/html/a00234.html */
974 /* just examples of flags we may be interested in */
975 ASSERT_ALWAYS(s->membind->get_area_membind);
976 #endif
977
978 #endif
979 }
980
display_binding_info() const981 void las_parallel_desc::display_binding_info() const /*{{{*/
982 {
983 verbose_output_start_batch();
984 verbose_output_print(0, 1, "%s", help->banner.c_str());
985 verbose_output_print(0, 2, "%s", help->full_diagnostics.c_str());
986 #ifdef HAVE_HWLOC
987 verbose_output_print(0, 1, "# %d memory binding zones\n", nmemory_binding_zones);
988 verbose_output_print(0, 1, "# %d cpu binding zones within each memory binding\n", ncpu_binding_zones_per_memory_binding_zone);
989 verbose_output_print(0, 1, "# %d jobs within each binding context (hence %d in total)\n",
990 number_of_subjobs_per_cpu_binding_zone(),
991 number_of_subjobs_total());
992 verbose_output_print(0, 1, "# %d threads per job\n", nthreads_per_subjob);
993 if (jobram >= 0) {
994 double all = jobram * number_of_subjobs_total();
995 int physical = help->total_ram() >> 30;
996 double ratio = 100 * all / physical;
997
998 verbose_output_print(0, 1, "# Based on an estimate of %.2f GB per job, we use %.2f GB in total, i.e. %.1f%% of %d GB\n",
999 jobram, all, ratio, physical);
1000 }
1001 #else
1002 verbose_output_print(0, 1, "# %d jobs in parallel\n", number_of_subjobs_total());
1003 verbose_output_print(0, 1, "# %d threads per job\n", nthreads_per_subjob);
1004 #endif
1005
1006
1007 #ifdef HAVE_HWLOC
1008 if (help->depth) {
1009 std::vector<std::string> tm = help->all_textual_descriptions_for_binding(memory_binding_size);
1010 std::vector<std::string> tc = help->all_textual_descriptions_for_binding(cpu_binding_size);
1011 std::vector<std::string> tj = help->all_textual_descriptions_for_binding(cpu_binding_size / number_of_subjobs_per_cpu_binding_zone());
1012 size_t m = 0, c = 0;
1013 {
1014 std::ostringstream pu_app;
1015 pu_app << "[" << memory_binding_size << " PUs]";
1016 for(auto & x : tm) { x += " "; x += pu_app.str(); if (x.size() > m) m = x.size(); }
1017 }
1018 {
1019 std::ostringstream pu_app;
1020 pu_app << "[" << cpu_binding_size << " PUs]";
1021 for(auto & x : tc) { x += " "; x += pu_app.str(); if (x.size() > c) c = x.size(); }
1022 }
1023 size_t qc = number_of_subjobs_per_cpu_binding_zone();
1024 size_t qm = qc * ncpu_binding_zones_per_memory_binding_zone;
1025 for(size_t i = 0 ; i < tj.size() ; i++) {
1026 if (!help->replicate && i >= qc) break;
1027 ASSERT_ALWAYS(i < help->subjob_binding_cpusets.size());
1028 char * sc;
1029 hwloc_bitmap_asprintf(&sc, help->subjob_binding_cpusets[i]);
1030 ASSERT_ALWAYS(i/qm < help->memory_binding_nodesets.size());
1031 char * sm;
1032 hwloc_bitmap_asprintf(&sm, help->memory_binding_nodesets[i/qm]);
1033
1034 verbose_output_print(0, 2, "# %-*s %-*s %s [%d thread(s)] [ m:%s c:%s ]\n",
1035 (int) m, (i % qm) ? "" : tm[i / qm].c_str(),
1036 (int) c, (i % qc) ? "" : tc[i / qc].c_str(),
1037 "job", // tj[i].c_str(),
1038 nthreads_per_subjob,
1039 sm, sc
1040 );
1041 free(sm);
1042 free(sc);
1043 }
1044 }
1045 #endif
1046 verbose_output_end_batch();
1047 }/*}}}*/
1048
1049
#if 0
/* Disabled stub, kept as a placeholder for a debugging aid. It currently
 * always returns -2; see the tentative specification below for the
 * intended meaning of the return value.
 */
int las_parallel_desc::query_memory_binding(void * addr, size_t len)
{
    /* tentative specification. This is most probably a debug function.
     *
     * return an integer k for something that matches precisely the k-th
     * memory binding zone that we have defined.
     *
     * return -1 for something that isn't bound.
     *
     * return -2 if none of the above.
     */
    /* base this on one of the following hwloc api calls ?

       int hwloc_get_area_membind_nodeset (hwloc_topology_t topology,
                                           const void *addr, size_t len,
                                           hwloc_nodeset_t nodeset,
                                           hwloc_membind_policy_t *policy,
                                           int flags);
       int hwloc_get_area_membind (hwloc_topology_t topology,
                                   const void *addr, size_t len,
                                   hwloc_bitmap_t set,
                                   hwloc_membind_policy_t *policy,
                                   int flags);
       int hwloc_get_area_memlocation (hwloc_topology_t topology,
                                       const void *addr, size_t len,
                                       hwloc_bitmap_t set,
                                       int flags);

     */
    return -2;
}
#endif
1083
set_loose_binding() const1084 int las_parallel_desc::set_loose_binding() const
1085 {
1086 #ifdef HAVE_HWLOC
1087 if (help->depth == 0)
1088 return 0;
1089 hwloc_obj_t root = hwloc_get_root_obj(help->topology);
1090 hwloc_nodeset_t n = root->nodeset;
1091 hwloc_cpuset_t c = root->cpuset;
1092 /* This achieves a **global** binding. It's not entirely clear we
1093 * ever want this in the normal course of an execution of las, in
1094 * fact.
1095 */
1096 int rc;
1097 #if HWLOC_API_VERSION < 0x010b03
1098 /* this legacy called remained valid throughout hwloc 1.x */
1099 rc = hwloc_set_membind_nodeset(help->topology, n, HWLOC_MEMBIND_BIND,
1100 HWLOC_MEMBIND_THREAD |
1101 HWLOC_MEMBIND_STRICT);
1102 #else
1103 /* newer call */
1104 rc = hwloc_set_membind(help->topology, n, HWLOC_MEMBIND_BIND,
1105 HWLOC_MEMBIND_THREAD |
1106 HWLOC_MEMBIND_STRICT |
1107 HWLOC_MEMBIND_BYNODESET);
1108 #endif
1109 if (rc < 0) {
1110 char * s;
1111 hwloc_bitmap_asprintf(&s, n);
1112 if (errno == EXDEV) {
1113 fprintf(stderr, "Error, cannot enforce loose memory binding [ %s ]\n", s);
1114 } else {
1115 fprintf(stderr, "Error while attempting to set loose memory binding [ %s ]\n", s);
1116 }
1117 free(s);
1118 return -1;
1119 }
1120 rc = hwloc_set_cpubind(help->topology, c,
1121 HWLOC_CPUBIND_THREAD | HWLOC_CPUBIND_STRICT);
1122 if (rc < 0) {
1123 char * s;
1124 hwloc_bitmap_asprintf(&s, c);
1125 if (errno == EXDEV) {
1126 fprintf(stderr, "Error, cannot enforce loose cpu binding [ %s ]\n", s);
1127 } else {
1128 fprintf(stderr, "Error while attempting to set loose cpu binding [ %s ]\n", s);
1129 }
1130 free(s);
1131 return -1;
1132 }
1133 #endif
1134 return 0;
1135 }
1136
set_subjob_binding(int k) const1137 int las_parallel_desc::set_subjob_binding(int k) const
1138 {
1139 #ifdef HAVE_HWLOC
1140 if (help->depth == 0)
1141 return -1;
1142 #endif
1143 set_subjob_cpu_binding(k);
1144 set_subjob_mem_binding(k);
1145 return 0;
1146 }
1147
set_subjob_mem_binding(int k MAYBE_UNUSED) const1148 int las_parallel_desc::set_subjob_mem_binding(int k MAYBE_UNUSED) const
1149 {
1150 #ifdef HAVE_HWLOC
1151 if (help->depth == 0)
1152 return -1;
1153 ASSERT_ALWAYS(0<= k && k < (int) help->subjob_binding_cpusets.size());
1154 int m = k / number_of_subjobs_per_memory_binding_zone();
1155 ASSERT_ALWAYS(m < (int) help->memory_binding_nodesets.size());
1156 #if HWLOC_API_VERSION < 0x010b03
1157 /* this legacy called remained valid throughout hwloc 1.x */
1158 int rc = hwloc_set_membind_nodeset(help->topology,
1159 help->memory_binding_nodesets[m],
1160 HWLOC_MEMBIND_BIND,
1161 HWLOC_MEMBIND_THREAD |
1162 HWLOC_MEMBIND_STRICT);
1163 #else
1164 /* newer call */
1165 int rc = hwloc_set_membind(help->topology,
1166 help->memory_binding_nodesets[m],
1167 HWLOC_MEMBIND_BIND,
1168 HWLOC_MEMBIND_THREAD |
1169 HWLOC_MEMBIND_STRICT |
1170 HWLOC_MEMBIND_BYNODESET);
1171 #endif
1172 if (rc < 0) {
1173 char * s;
1174 hwloc_bitmap_asprintf(&s, help->memory_binding_nodesets[m]);
1175 if (errno == EXDEV) {
1176 fprintf(stderr, "Error, cannot enforce memory binding for job %d [ %s ]\n", k, s);
1177 } else {
1178 fprintf(stderr, "Error while attempting to set memory binding for job %d [ %s ]\n", k, s);
1179 }
1180 free(s);
1181 return -1;
1182 }
1183 #endif
1184 return 0;
1185 }
set_subjob_cpu_binding(int k MAYBE_UNUSED) const1186 int las_parallel_desc::set_subjob_cpu_binding(int k MAYBE_UNUSED) const
1187 {
1188 #ifdef HAVE_HWLOC
1189 if (help->depth == 0)
1190 return -1;
1191 ASSERT_ALWAYS(0<= k && k < (int) help->subjob_binding_cpusets.size());
1192 int rc = hwloc_set_cpubind(help->topology, help->subjob_binding_cpusets[k], HWLOC_CPUBIND_THREAD | HWLOC_CPUBIND_STRICT);
1193 if (rc < 0) {
1194 char * s;
1195 hwloc_bitmap_asprintf(&s, help->subjob_binding_cpusets[k]);
1196 if (errno == EXDEV) {
1197 fprintf(stderr, "Error, cannot enforce cpu binding for job %d [ %s ]\n", k, s);
1198 } else {
1199 fprintf(stderr, "Error while attempting to set cpu binding for job %d [ %s ]\n", k, s);
1200 }
1201 free(s);
1202 return -1;
1203 }
1204 #endif
1205 return 0;
1206 }
1207
#if 0
/* Disabled code. With hwloc and a non-zero depth, this returns the
 * value of help->number_of(-1, 0) when replication is enabled, and the
 * memory binding size otherwise. In all other cases it returns the
 * number of threads per subjob.
 */
int las_parallel_desc::number_of_threads_loose() const {
#ifdef HAVE_HWLOC
    if (help->depth) {
        if (help->replicate)
            return help->number_of(-1, 0);
        return help->memory_binding_size;
    }
#endif
    return nthreads_per_subjob;
}
#endif
1218
#ifdef HAVE_HWLOC
/* Only available with hwloc. Forwards to the helper object's
 * current_memory_binding() and returns its result unchanged.
 */
cxx_hwloc_nodeset las_parallel_desc::current_memory_binding() const {
    return help->current_memory_binding();
}
#endif
1224
1225