1 /*****************************************************************************\
2 * Copyright (C) 2006-2009 Hewlett-Packard Development Company, L.P.
3 * Copyright (C) 2008-2009 Lawrence Livermore National Security.
4 * Written by Susanne M. Balle, <susanne.balle@hp.com>
5 * CODE-OCEC-09-009. All rights reserved.
6 *
7 * This file is part of Slurm, a resource management program.
8 * For details, see <https://slurm.schedmd.com/>.
9 * Please also read the included file: DISCLAIMER.
10 *
11 * Slurm is free software; you can redistribute it and/or modify it under
12 * the terms of the GNU General Public License as published by the Free
13 * Software Foundation; either version 2 of the License, or (at your option)
14 * any later version.
15 *
16 * In addition, as a special exception, the copyright holders give permission
17 * to link the code of portions of this program with the OpenSSL library under
18 * certain conditions as described in each individual source file, and
19 * distribute linked combinations including the two. You must obey the GNU
20 * General Public License in all respects for all of the code used other than
21 * OpenSSL. If you modify file(s) with this exception, you may extend this
22 * exception to your version of the file(s), but you are not obligated to do
23 * so. If you do not wish to do so, delete this exception statement from your
24 * version. If you delete this exception statement from all source files in
25 * the program, then also delete it here.
26 *
27 * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
28 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
29 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
30 * details.
31 *
32 * You should have received a copy of the GNU General Public License along
33 * with Slurm; if not, write to the Free Software Foundation, Inc.,
34 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
35 \*****************************************************************************/
36
37 #define _GNU_SOURCE
38
39 #include "affinity.h"
40 #include "dist_tasks.h"
41 #include "src/common/bitstring.h"
42 #include "src/common/log.h"
43 #include "src/common/slurm_cred.h"
44 #include "src/common/slurm_protocol_api.h"
45 #include "src/common/slurm_resource_info.h"
46 #include "src/common/strlcpy.h"
47 #include "src/common/xmalloc.h"
48 #include "src/slurmd/slurmd/slurmd.h"
49
50 #ifdef HAVE_NUMA
51 #include <numa.h>
52 #endif
53
54 static char *_alloc_mask(launch_tasks_request_msg_t *req,
55 int *whole_node_cnt, int *whole_socket_cnt,
56 int *whole_core_cnt, int *whole_thread_cnt,
57 int *part_socket_cnt, int *part_core_cnt);
58 static bitstr_t *_get_avail_map(launch_tasks_request_msg_t *req,
59 uint16_t *hw_sockets, uint16_t *hw_cores,
60 uint16_t *hw_threads);
61 static int _get_local_node_info(slurm_cred_arg_t *arg, int job_node_id,
62 uint16_t *sockets, uint16_t *cores);
63
64 static int _task_layout_lllp_block(launch_tasks_request_msg_t *req,
65 uint32_t node_id, bitstr_t ***masks_p);
66 static int _task_layout_lllp_cyclic(launch_tasks_request_msg_t *req,
67 uint32_t node_id, bitstr_t ***masks_p);
68
69 static void _lllp_map_abstract_masks(const uint32_t maxtasks,
70 bitstr_t **masks);
71 static void _lllp_generate_cpu_bind(launch_tasks_request_msg_t *req,
72 const uint32_t maxtasks,
73 bitstr_t **masks);
74
75 /* BLOCK_MAP abstract block LLLP index to physical machine LLLP index
76 * BLOCK_MAP_INV physical machine LLLP index to abstract block LLLP index
77 */
78 #define BLOCK_MAP(index) _block_map(index, conf->block_map)
79 #define BLOCK_MAP_INV(index) _block_map(index, conf->block_map_inv)
80
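/*
 * Illustrative sketch (hypothetical layout, not taken from any real node):
 * on a box with 2 cores x 2 threads whose physical CPU ids interleave the
 * threads across the cores, slurmd might build
 *	block_map     = { 0, 2, 1, 3 }	(abstract -> machine)
 *	block_map_inv = { 0, 2, 1, 3 }	(machine  -> abstract)
 * so BLOCK_MAP(1) == 2 and BLOCK_MAP_INV(2) == 1.  An index beyond
 * block_map_size is wrapped by _block_map() below, e.g. BLOCK_MAP(5)
 * behaves like BLOCK_MAP(1) when block_map_size is 4.
 */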
81
82 /* _block_map
83 *
84 * safely returns a mapped index using a provided block map
85 *
86 * IN - index to map
87 * IN - map to use
88 */
89 static uint16_t _block_map(uint16_t index, uint16_t *map)
90 {
91 if (map == NULL) {
92 return index;
93 }
94 /* make sure the index falls within the map */
95 if (index >= conf->block_map_size) {
96 debug3("wrapping index %u into block_map_size of %u",
97 index, conf->block_map_size);
98 index = index % conf->block_map_size;
99 }
100 index = map[index];
101 return(index);
102 }
103
104 static void _task_layout_display_masks(launch_tasks_request_msg_t *req,
105 const uint32_t *gtid,
106 const uint32_t maxtasks,
107 bitstr_t **masks)
108 {
109 int i;
110 char *str = NULL;
111 for(i = 0; i < maxtasks; i++) {
112 str = (char *)bit_fmt_hexmask(masks[i]);
113 debug3("_task_layout_display_masks jobid [%u:%d] %s",
114 req->job_id, gtid[i], str);
115 xfree(str);
116 }
117 }
118
119 static void _lllp_free_masks(const uint32_t maxtasks, bitstr_t **masks)
120 {
121 int i;
122 bitstr_t *bitmask;
123
124 for (i = 0; i < maxtasks; i++) {
125 bitmask = masks[i];
126 FREE_NULL_BITMAP(bitmask);
127 }
128 xfree(masks);
129 }
130
131 #ifdef HAVE_NUMA
132 /* _match_masks_to_ldom
133 *
134 * expand each mask to encompass the whole locality domain
135 * within which it currently exists
136 * NOTE: this assumes that the masks are already in logical
137 * (and not abstract) CPU order.
138 */
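/*
 * A worked example (hypothetical node with two locality domains of four
 * CPUs each): if a task mask has only logical CPU 5 set and
 * slurm_get_numa_node() reports CPUs 4-7 as NUMA node 1, the mask is
 * widened to CPUs 4-7 so the task may run anywhere in that domain.
 */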
139 static void _match_masks_to_ldom(const uint32_t maxtasks, bitstr_t **masks)
140 {
141 uint32_t i, b, size;
142
143 if (!masks || !masks[0])
144 return;
145 size = bit_size(masks[0]);
146 for(i = 0; i < maxtasks; i++) {
147 for (b = 0; b < size; b++) {
148 if (bit_test(masks[i], b)) {
149 /* get the NUMA node for this CPU, and then
150 * set all CPUs in the mask that exist in
151 * the same NUMA node */
152 int c;
153 uint16_t nnid = slurm_get_numa_node(b);
154 for (c = 0; c < size; c++) {
155 if (slurm_get_numa_node(c) == nnid)
156 bit_set(masks[i], c);
157 }
158 }
159 }
160 }
161 }
162 #endif
163
164 /*
165 * batch_bind - Set the batch request message so as to bind the shell to the
166 * proper resources
167 */
168 void batch_bind(batch_job_launch_msg_t *req)
169 {
170 bitstr_t *req_map, *hw_map;
171 slurm_cred_arg_t arg;
172 uint16_t sockets=0, cores=0, num_cpus;
173 int start, task_cnt=0;
174
175 if (slurm_cred_get_args(req->cred, &arg) != SLURM_SUCCESS) {
176 error("task/affinity: job lacks a credential");
177 return;
178 }
179 start = _get_local_node_info(&arg, 0, &sockets, &cores);
180 if (start != 0) {
181 error("task/affinity: missing node 0 in job credential");
182 slurm_cred_free_args(&arg);
183 return;
184 }
185 if ((sockets * cores) == 0) {
186 error("task/affinity: socket and core count both zero");
187 slurm_cred_free_args(&arg);
188 return;
189 }
190
191 num_cpus = MIN((sockets * cores),
192 (conf->sockets * conf->cores));
193 req_map = (bitstr_t *) bit_alloc(num_cpus);
194 hw_map = (bitstr_t *) bit_alloc(conf->block_map_size);
195
196 #ifdef HAVE_FRONT_END
197 {
198 /* Since the front-end nodes are a shared resource, we limit each job
199 * to one CPU based upon a monotonically increasing sequence number */
200 static int last_id = 0;
201 bit_set(hw_map, ((last_id++) % conf->block_map_size));
202 task_cnt = 1;
203 }
204 #else
205 {
206 char *str;
207 int t, p;
208
209 /* Transfer core_bitmap data to local req_map.
210 * The MOD function handles the case where fewer processors
211 * physically exist than are configured (slurmd is out of
212 * sync with the slurmctld daemon). */
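	/* For example (hypothetical counts): if the credential describes
	 * 16 cores but only num_cpus == 8 exist locally, credential core 11
	 * folds onto local bit 11 % 8 == 3. */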
213 for (p = 0; p < (sockets * cores); p++) {
214 if (bit_test(arg.job_core_bitmap, p))
215 bit_set(req_map, (p % num_cpus));
216 }
217
218 str = (char *)bit_fmt_hexmask(req_map);
219 debug3("task/affinity: job %u core mask from slurmctld: %s",
220 req->job_id, str);
221 xfree(str);
222
223 for (p = 0; p < num_cpus; p++) {
224 if (bit_test(req_map, p) == 0)
225 continue;
226 /* core_bitmap does not include threads, so we
227 * add them here but limit them to what the job
228 * requested */
229 for (t = 0; t < conf->threads; t++) {
230 uint16_t pos = p * conf->threads + t;
231 if (pos >= conf->block_map_size) {
232 info("more resources configured than exist");
233 p = num_cpus;
234 break;
235 }
236 bit_set(hw_map, pos);
237 task_cnt++;
238 }
239 }
240 }
241 #endif
242 if (task_cnt) {
243 req->cpu_bind_type = CPU_BIND_MASK;
244 if (conf->task_plugin_param & CPU_BIND_VERBOSE)
245 req->cpu_bind_type |= CPU_BIND_VERBOSE;
246 xfree(req->cpu_bind);
247 req->cpu_bind = (char *)bit_fmt_hexmask(hw_map);
248 info("task/affinity: job %u CPU input mask for node: %s",
249 req->job_id, req->cpu_bind);
250 /* translate abstract masks to actual hardware layout */
251 _lllp_map_abstract_masks(1, &hw_map);
252 #ifdef HAVE_NUMA
253 if (req->cpu_bind_type & CPU_BIND_TO_LDOMS) {
254 _match_masks_to_ldom(1, &hw_map);
255 }
256 #endif
257 xfree(req->cpu_bind);
258 req->cpu_bind = (char *)bit_fmt_hexmask(hw_map);
259 info("task/affinity: job %u CPU final HW mask for node: %s",
260 req->job_id, req->cpu_bind);
261 } else {
262 error("task/affinity: job %u allocated no CPUs",
263 req->job_id);
264 }
265 FREE_NULL_BITMAP(hw_map);
266 FREE_NULL_BITMAP(req_map);
267 slurm_cred_free_args(&arg);
268 }
269
270 /* The job has specialized cores, synchronize user map with available cores */
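/*
 * Hypothetical example: if the allocation yields avail_mask "0xF"
 * (CPUs 0-3), a user map of "0,2" is kept as-is, while "0,6" is rejected
 * as a whole and replaced below by a CPU_BIND_MASK binding over the
 * allocation.
 */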
271 static void _validate_map(launch_tasks_request_msg_t *req, char *avail_mask)
272 {
273 char *tmp_map, *save_ptr = NULL, *tok;
274 cpu_set_t avail_cpus;
275 bool superset = true;
276
277 CPU_ZERO(&avail_cpus);
278 (void) task_str_to_cpuset(&avail_cpus, avail_mask);
279 tmp_map = xstrdup(req->cpu_bind);
280 tok = strtok_r(tmp_map, ",", &save_ptr);
281 while (tok) {
282 int i = atoi(tok);
283 if (!CPU_ISSET(i, &avail_cpus)) {
284 /* The requested CPU map includes a CPU outside
285 * the allocation. Disable the user CPU map. */
286 superset = false;
287 break;
288 }
289 tok = strtok_r(NULL, ",", &save_ptr);
290 }
291 xfree(tmp_map);
292
293 if (!superset) {
294 info("task/affinity: Ignoring user CPU binding outside of job "
295 "step allocation");
296 req->cpu_bind_type &= (~CPU_BIND_MAP);
297 req->cpu_bind_type |= CPU_BIND_MASK;
298 xfree(req->cpu_bind);
299 req->cpu_bind = xstrdup(avail_mask);
300 }
301 }
302
303 /* The job has specialized cores, synchronize user mask with available cores */
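/*
 * Hypothetical example: with avail_mask "0xF", a user request of
 * mask_cpu:0x3,0x30 keeps the first mask, finds no overlap for the second
 * (0x30 & 0xF == 0), and therefore gives that task the whole allocation
 * (0xF) instead.
 */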
304 static void _validate_mask(launch_tasks_request_msg_t *req, char *avail_mask)
305 {
306 char *new_mask = NULL, *save_ptr = NULL, *tok;
307 cpu_set_t avail_cpus, task_cpus;
308 bool superset = true;
309
310 CPU_ZERO(&avail_cpus);
311 (void) task_str_to_cpuset(&avail_cpus, avail_mask);
312 tok = strtok_r(req->cpu_bind, ",", &save_ptr);
313 while (tok) {
314 int i, overlaps = 0;
315 char mask_str[1 + CPU_SETSIZE / 4];
316 CPU_ZERO(&task_cpus);
317 (void) task_str_to_cpuset(&task_cpus, tok);
318 for (i = 0; i < CPU_SETSIZE; i++) {
319 if (!CPU_ISSET(i, &task_cpus))
320 continue;
321 if (CPU_ISSET(i, &avail_cpus)) {
322 overlaps++;
323 } else {
324 CPU_CLR(i, &task_cpus);
325 superset = false;
326 }
327 }
328 if (overlaps == 0) {
329 /* The task's CPU mask is completely invalid.
330 * Give it all allowed CPUs. */
331 for (i = 0; i < CPU_SETSIZE; i++) {
332 if (CPU_ISSET(i, &avail_cpus))
333 CPU_SET(i, &task_cpus);
334 }
335 }
336 task_cpuset_to_str(&task_cpus, mask_str);
337 if (new_mask)
338 xstrcat(new_mask, ",");
339 xstrcat(new_mask, mask_str);
340 tok = strtok_r(NULL, ",", &save_ptr);
341 }
342
343 if (!superset) {
344 info("task/affinity: Ignoring user CPU binding outside of job "
345 "step allocation");
346 }
347
348 xfree(req->cpu_bind);
349 req->cpu_bind = new_mask;
350 }
351
352 /*
353 * lllp_distribution
354 *
355 * Note: lllp stands for Lowest Level of Logical Processors.
356 *
357 * When automatic binding is enabled:
358 * - no binding flags set >= CPU_BIND_NONE, and
359 * - an auto binding level is selected: CPU_BIND_TO_{SOCKETS,CORES,THREADS}
360 * Otherwise limit job step to the allocated CPUs
361 *
362 * generate the appropriate cpu_bind type and string which results in
363 * the specified lllp distribution.
364 *
365 * IN/OUT- job launch request (cpu_bind_type and cpu_bind updated)
366 * IN- global task id array
367 */
368 void lllp_distribution(launch_tasks_request_msg_t *req, uint32_t node_id)
369 {
370 int rc = SLURM_SUCCESS;
371 bitstr_t **masks = NULL;
372 char buf_type[100];
373 int maxtasks = req->tasks_to_launch[(int)node_id];
374 int whole_nodes, whole_sockets, whole_cores, whole_threads;
375 int part_sockets, part_cores;
376 const uint32_t *gtid = req->global_task_ids[(int)node_id];
377 static uint16_t bind_entity = CPU_BIND_TO_THREADS | CPU_BIND_TO_CORES |
378 CPU_BIND_TO_SOCKETS | CPU_BIND_TO_LDOMS;
379 static uint16_t bind_mode = CPU_BIND_NONE | CPU_BIND_MASK |
380 CPU_BIND_RANK | CPU_BIND_MAP |
381 CPU_BIND_LDMASK | CPU_BIND_LDRANK |
382 CPU_BIND_LDMAP;
383 static int only_one_thread_per_core = -1;
384
385 if (only_one_thread_per_core == -1) {
386 if (conf->cpus == (conf->sockets * conf->cores))
387 only_one_thread_per_core = 1;
388 else
389 only_one_thread_per_core = 0;
390 }
391
392 /*
393 * If the node is configured with only one usable thread per core
394 * (CPUs == sockets * cores), setting this flag is the easiest way
395 * to convey that to the rest of the affinity plugin.
396 */
397 if (only_one_thread_per_core)
398 req->cpu_bind_type |= CPU_BIND_ONE_THREAD_PER_CORE;
399
400 if (req->cpu_bind_type & bind_mode) {
401 /* Explicit step binding specified by user */
402 char *avail_mask = _alloc_mask(req,
403 &whole_nodes, &whole_sockets,
404 &whole_cores, &whole_threads,
405 &part_sockets, &part_cores);
406 if (!avail_mask) {
407 error("task/affinity: Could not determine allocated CPUs");
408 } else if ((whole_nodes == 0) &&
409 (req->job_core_spec == NO_VAL16)) {
410 info("task/affinity: entire node must be allocated, "
411 "disabling affinity");
412 xfree(req->cpu_bind);
413 req->cpu_bind = avail_mask;
414 req->cpu_bind_type &= (~bind_mode);
415 req->cpu_bind_type |= CPU_BIND_MASK;
416 } else {
417 if (req->job_core_spec == NO_VAL16) {
418 if (req->cpu_bind_type & CPU_BIND_MASK)
419 _validate_mask(req, avail_mask);
420 else if (req->cpu_bind_type & CPU_BIND_MAP)
421 _validate_map(req, avail_mask);
422 }
423 xfree(avail_mask);
424 }
425 slurm_sprint_cpu_bind_type(buf_type, req->cpu_bind_type);
426 info("lllp_distribution jobid [%u] manual binding: %s",
427 req->job_id, buf_type);
428 return;
429 }
430
431 if (!(req->cpu_bind_type & bind_entity)) {
432 /*
433 * No bind unit (sockets, cores) specified by user,
434 * pick something reasonable
435 */
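		/*
		 * Illustrative sketch (hypothetical 2 sockets x 4 cores x
		 * 2 threads, all allocated, cpus_per_task == 1): 2 tasks
		 * match whole_sockets and bind to sockets, 8 tasks match
		 * whole_cores and bind to cores, 16 tasks match
		 * whole_threads and bind to threads; anything else falls
		 * back to the TaskPluginParam default or to a plain
		 * CPU_BIND_MASK over the allocation.
		 */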
436 uint32_t task_plugin_param = slurm_get_task_plugin_param();
437 bool auto_def_set = false;
438 int spec_thread_cnt = 0;
439 int max_tasks = req->tasks_to_launch[(int)node_id] *
440 req->cpus_per_task;
441 char *avail_mask = _alloc_mask(req,
442 &whole_nodes, &whole_sockets,
443 &whole_cores, &whole_threads,
444 &part_sockets, &part_cores);
445 debug("binding tasks:%d to "
446 "nodes:%d sockets:%d:%d cores:%d:%d threads:%d",
447 max_tasks, whole_nodes, whole_sockets ,part_sockets,
448 whole_cores, part_cores, whole_threads);
449 if ((req->job_core_spec != NO_VAL16) &&
450 (req->job_core_spec & CORE_SPEC_THREAD) &&
451 (req->job_core_spec != CORE_SPEC_THREAD)) {
452 spec_thread_cnt = req->job_core_spec &
453 (~CORE_SPEC_THREAD);
454 }
455 if (((max_tasks == whole_sockets) && (part_sockets == 0)) ||
456 (spec_thread_cnt &&
457 (max_tasks == (whole_sockets + part_sockets)))) {
458 req->cpu_bind_type |= CPU_BIND_TO_SOCKETS;
459 goto make_auto;
460 }
461 if (((max_tasks == whole_cores) && (part_cores == 0)) ||
462 (spec_thread_cnt &&
463 (max_tasks == (whole_cores + part_cores)))) {
464 req->cpu_bind_type |= CPU_BIND_TO_CORES;
465 goto make_auto;
466 }
467 if (max_tasks == whole_threads) {
468 req->cpu_bind_type |= CPU_BIND_TO_THREADS;
469 goto make_auto;
470 }
471
472 if (task_plugin_param & CPU_AUTO_BIND_TO_THREADS) {
473 auto_def_set = true;
474 req->cpu_bind_type |= CPU_BIND_TO_THREADS;
475 goto make_auto;
476 } else if (task_plugin_param & CPU_AUTO_BIND_TO_CORES) {
477 auto_def_set = true;
478 req->cpu_bind_type |= CPU_BIND_TO_CORES;
479 goto make_auto;
480 } else if (task_plugin_param & CPU_AUTO_BIND_TO_SOCKETS) {
481 auto_def_set = true;
482 req->cpu_bind_type |= CPU_BIND_TO_SOCKETS;
483 goto make_auto;
484 }
485
486 if (avail_mask) {
487 xfree(req->cpu_bind);
488 req->cpu_bind = avail_mask;
489 req->cpu_bind_type |= CPU_BIND_MASK;
490 }
491
492 slurm_sprint_cpu_bind_type(buf_type, req->cpu_bind_type);
493 info("lllp_distribution jobid [%u] auto binding off: %s",
494 req->job_id, buf_type);
495 return;
496
497 make_auto: xfree(avail_mask);
498 slurm_sprint_cpu_bind_type(buf_type, req->cpu_bind_type);
499 info("lllp_distribution jobid [%u] %s auto binding: "
500 "%s, dist %d", req->job_id,
501 (auto_def_set) ? "default" : "implicit",
502 buf_type, req->task_dist);
503 } else {
504 /* Explicit bind unit (sockets, cores) specified by user */
505 slurm_sprint_cpu_bind_type(buf_type, req->cpu_bind_type);
506 info("lllp_distribution jobid [%u] binding: %s, dist %d",
507 req->job_id, buf_type, req->task_dist);
508 }
509
510 switch (req->task_dist & SLURM_DIST_NODESOCKMASK) {
511 case SLURM_DIST_BLOCK_BLOCK:
512 case SLURM_DIST_CYCLIC_BLOCK:
513 case SLURM_DIST_PLANE:
514 /* tasks are distributed in blocks within a plane */
515 rc = _task_layout_lllp_block(req, node_id, &masks);
516 break;
517 case SLURM_DIST_ARBITRARY:
518 case SLURM_DIST_BLOCK:
519 case SLURM_DIST_CYCLIC:
520 case SLURM_DIST_UNKNOWN:
521 if (slurm_get_select_type_param()
522 & CR_CORE_DEFAULT_DIST_BLOCK) {
523 rc = _task_layout_lllp_block(req, node_id, &masks);
524 break;
525 }
526 /*
527 * We want to fall through here if we aren't doing a
528 * default dist block.
529 */
530 default:
531 rc = _task_layout_lllp_cyclic(req, node_id, &masks);
532 break;
533 }
534
535 /*
536 * FIXME: I'm worried about core_bitmap with CPU_BIND_TO_SOCKETS &
537 * max_cores - does select/cons_res plugin allocate whole
538 * socket??? Maybe not. Check srun man page.
539 */
540
541 if (rc == SLURM_SUCCESS) {
542 _task_layout_display_masks(req, gtid, maxtasks, masks);
543 /* translate abstract masks to actual hardware layout */
544 _lllp_map_abstract_masks(maxtasks, masks);
545 _task_layout_display_masks(req, gtid, maxtasks, masks);
546 #ifdef HAVE_NUMA
547 if (req->cpu_bind_type & CPU_BIND_TO_LDOMS) {
548 _match_masks_to_ldom(maxtasks, masks);
549 _task_layout_display_masks(req, gtid, maxtasks, masks);
550 }
551 #endif
552 /* convert masks into cpu_bind mask string */
553 _lllp_generate_cpu_bind(req, maxtasks, masks);
554 } else {
555 char *avail_mask = _alloc_mask(req,
556 &whole_nodes, &whole_sockets,
557 &whole_cores, &whole_threads,
558 &part_sockets, &part_cores);
559 if (avail_mask) {
560 xfree(req->cpu_bind);
561 req->cpu_bind = avail_mask;
562 req->cpu_bind_type &= (~bind_mode);
563 req->cpu_bind_type |= CPU_BIND_MASK;
564 }
565 slurm_sprint_cpu_bind_type(buf_type, req->cpu_bind_type);
566 error("lllp_distribution jobid [%u] overriding binding: %s",
567 req->job_id, buf_type);
568 error("Verify socket/core/thread counts in configuration");
569 }
570 if (masks)
571 _lllp_free_masks(maxtasks, masks);
572 }
573
574
575 /*
576 * _get_local_node_info - get job allocation details for this node
577 * IN: arg - job credential arguments (allocation details)
578 * IN: job_node_id - index of the local node in the job allocation
579 * IN/OUT: sockets - pointer to socket count variable
580 * IN/OUT: cores - pointer to cores_per_socket count variable
581 * OUT: returns the core_bitmap index of the first core for this node
582 */
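/*
 * Worked example (hypothetical credential): sockets_per_node = {2},
 * cores_per_socket = {4}, sock_core_rep_count = {3} describes three
 * identical 2x4 nodes; for job_node_id == 2 the walk below returns
 * bit_start == 16 and reports sockets == 2, cores == 4.
 */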
583 static int _get_local_node_info(slurm_cred_arg_t *arg, int job_node_id,
584 uint16_t *sockets, uint16_t *cores)
585 {
586 int bit_start = 0, bit_finish = 0;
587 int i, index = -1, cur_node_id = -1;
588
589 do {
590 index++;
591 for (i = 0; i < arg->sock_core_rep_count[index] &&
592 cur_node_id < job_node_id; i++) {
593 bit_start = bit_finish;
594 bit_finish += arg->sockets_per_node[index] *
595 arg->cores_per_socket[index];
596 cur_node_id++;
597 }
598
599 } while (cur_node_id < job_node_id);
600
601 *sockets = arg->sockets_per_node[index];
602 *cores = arg->cores_per_socket[index];
603 return bit_start;
604 }
605
606 /*
607 * Determine which CPUs a job step can use.
608 * OUT whole_<entity>_count - returns count of whole <entities> in this
609 * allocation for this node
610 * OUT part_<entity>_count - returns count of partial <entities> in this
611 * allocation for this node
612 * RET - a string representation of the available mask or NULL on error
613 * NOTE: Caller must xfree() the return value.
614 */
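/*
 * Worked example (hypothetical 2 sockets x 2 cores x 2 threads with
 * CPUs 0-5 allocated): socket 0 is whole, socket 1 has one whole core and
 * one idle core, so the counts come back as whole_node 0, whole_socket 1,
 * part_socket 1, whole_core 3, part_core 0, whole_thread 6.
 */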
615 static char *_alloc_mask(launch_tasks_request_msg_t *req,
616 int *whole_node_cnt, int *whole_socket_cnt,
617 int *whole_core_cnt, int *whole_thread_cnt,
618 int *part_socket_cnt, int *part_core_cnt)
619 {
620 uint16_t sockets, cores, threads;
621 int c, s, t, i;
622 int c_miss, s_miss, t_miss, c_hit, t_hit;
623 bitstr_t *alloc_bitmap;
624 char *str_mask;
625 bitstr_t *alloc_mask;
626
627 *whole_node_cnt = 0;
628 *whole_socket_cnt = 0;
629 *whole_core_cnt = 0;
630 *whole_thread_cnt = 0;
631 *part_socket_cnt = 0;
632 *part_core_cnt = 0;
633
634 alloc_bitmap = _get_avail_map(req, &sockets, &cores, &threads);
635 if (!alloc_bitmap)
636 return NULL;
637
638 alloc_mask = bit_alloc(bit_size(alloc_bitmap));
639
640 i = 0;
641 for (s = 0, s_miss = false; s < sockets; s++) {
642 for (c = 0, c_hit = c_miss = false; c < cores; c++) {
643 for (t = 0, t_hit = t_miss = false; t < threads; t++) {
644 /*
645 * If the configuration claims a larger system
646 * than physically exists, wrap the index back
647 * to zero so we stay within the bitmap.
648 */
649 if (i >= bit_size(alloc_bitmap))
650 i = 0;
651 if (bit_test(alloc_bitmap, i)) {
652 bit_set(alloc_mask, i);
653 (*whole_thread_cnt)++;
654 t_hit = true;
655 c_hit = true;
656 } else
657 t_miss = true;
658 i++;
659 }
660 if (!t_miss)
661 (*whole_core_cnt)++;
662 else {
663 if (t_hit)
664 (*part_core_cnt)++;
665 c_miss = true;
666 }
667 }
668 if (!c_miss)
669 (*whole_socket_cnt)++;
670 else {
671 if (c_hit)
672 (*part_socket_cnt)++;
673 s_miss = true;
674 }
675 }
676 if (!s_miss)
677 (*whole_node_cnt)++;
678 FREE_NULL_BITMAP(alloc_bitmap);
679
680 if ((req->job_core_spec != NO_VAL16) &&
681 (req->job_core_spec & CORE_SPEC_THREAD) &&
682 (req->job_core_spec != CORE_SPEC_THREAD)) {
683 int spec_thread_cnt;
684 spec_thread_cnt = req->job_core_spec & (~CORE_SPEC_THREAD);
685 for (t = threads - 1;
686 ((t > 0) && (spec_thread_cnt > 0)); t--) {
687 for (c = cores - 1;
688 ((c > 0) && (spec_thread_cnt > 0)); c--) {
689 for (s = sockets - 1;
690 ((s >= 0) && (spec_thread_cnt > 0)); s--) {
691 i = s * cores + c;
692 i = (i * threads) + t;
693 bit_clear(alloc_mask, i);
694 spec_thread_cnt--;
695 }
696 }
697 }
698 }
699
700 /* translate abstract masks to actual hardware layout */
701 _lllp_map_abstract_masks(1, &alloc_mask);
702
703 #ifdef HAVE_NUMA
704 if (req->cpu_bind_type & CPU_BIND_TO_LDOMS) {
705 _match_masks_to_ldom(1, &alloc_mask);
706 }
707 #endif
708
709 str_mask = bit_fmt_hexmask(alloc_mask);
710 FREE_NULL_BITMAP(alloc_mask);
711 return str_mask;
712 }
713
714 /*
715 * Given a job step request, return an equivalent local bitmap for this node
716 * IN req - The job step launch request
717 * OUT hw_sockets - number of actual sockets on this node
718 * OUT hw_cores - number of actual cores per socket on this node
719 * OUT hw_threads - number of actual threads per core on this node
720 * RET: bitmap of processors available to this job step on this node
721 * OR NULL on error
722 */
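/*
 * Worked example (hypothetical): if the credential says this node
 * contributes 2 sockets x 2 cores starting at step_core_bitmap bit 8, all
 * four of those core bits are set, and the node runs 2 threads per core,
 * the returned hw_map has bits 0-7 set (each core expanded to both of its
 * threads).
 */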
723 static bitstr_t *_get_avail_map(launch_tasks_request_msg_t *req,
724 uint16_t *hw_sockets, uint16_t *hw_cores,
725 uint16_t *hw_threads)
726 {
727 bitstr_t *req_map, *hw_map;
728 slurm_cred_arg_t arg;
729 uint16_t p, t, new_p, num_cpus, sockets, cores;
730 int job_node_id;
731 int start;
732 char *str;
733 int spec_thread_cnt = 0;
734
735 *hw_sockets = conf->sockets;
736 *hw_cores = conf->cores;
737 *hw_threads = conf->threads;
738
739 if (slurm_cred_get_args(req->cred, &arg) != SLURM_SUCCESS) {
740 error("task/affinity: job lacks a credential");
741 return NULL;
742 }
743
744 /* we need this node's ID in relation to the whole
745 * job allocation, not just this jobstep */
746 job_node_id = nodelist_find(arg.job_hostlist, conf->node_name);
747 start = _get_local_node_info(&arg, job_node_id, &sockets, &cores);
748 if (start < 0) {
749 error("task/affinity: missing node %d in job credential",
750 job_node_id);
751 slurm_cred_free_args(&arg);
752 return NULL;
753 }
754 debug3("task/affinity: slurmctld s %u c %u; hw s %u c %u t %u",
755 sockets, cores, *hw_sockets, *hw_cores, *hw_threads);
756
757 num_cpus = MIN((sockets * cores), ((*hw_sockets)*(*hw_cores)));
758 req_map = (bitstr_t *) bit_alloc(num_cpus);
759 hw_map = (bitstr_t *) bit_alloc(conf->block_map_size);
760
761 /* Transfer core_bitmap data to local req_map.
762 * The MOD function handles the case where fewer processors
763 * physically exist than are configured (slurmd is out of
764 * sync with the slurmctld daemon). */
765 for (p = 0; p < (sockets * cores); p++) {
766 if (bit_test(arg.step_core_bitmap, start+p))
767 bit_set(req_map, (p % num_cpus));
768 }
769
770 str = (char *)bit_fmt_hexmask(req_map);
771 debug3("task/affinity: job %u.%u core mask from slurmctld: %s",
772 req->job_id, req->job_step_id, str);
773 xfree(str);
774
775 for (p = 0; p < num_cpus; p++) {
776 if (bit_test(req_map, p) == 0)
777 continue;
778 /* If the configuration claims a larger system than
779 physically exists, wrap the index with modulo so we
780 stay within the block map.
781 */
782 new_p = p % conf->block_map_size;
783 /* core_bitmap does not include threads, so we
784 * add them here but limit them to what the job
785 * requested */
786 for (t = 0; t < (*hw_threads); t++) {
787 uint16_t bit = new_p * (*hw_threads) + t;
788 bit %= conf->block_map_size;
789 bit_set(hw_map, bit);
790 }
791 }
792
793 if ((req->job_core_spec != NO_VAL16) &&
794 (req->job_core_spec & CORE_SPEC_THREAD) &&
795 (req->job_core_spec != CORE_SPEC_THREAD)) {
796 spec_thread_cnt = req->job_core_spec & (~CORE_SPEC_THREAD);
797 }
798 if (spec_thread_cnt) {
799 /* Skip specialized threads as needed */
800 int i, t, c, s;
801 for (t = conf->threads - 1;
802 ((t >= 0) && (spec_thread_cnt > 0)); t--) {
803 for (c = conf->cores - 1;
804 ((c >= 0) && (spec_thread_cnt > 0)); c--) {
805 for (s = conf->sockets - 1;
806 ((s >= 0) && (spec_thread_cnt > 0)); s--) {
807 i = s * conf->cores + c;
808 i = (i * conf->threads) + t;
809 bit_clear(hw_map, i);
810 spec_thread_cnt--;
811 }
812 }
813 }
814 }
815
816 str = (char *)bit_fmt_hexmask(hw_map);
817 debug3("task/affinity: job %u.%u CPU final mask for local node: %s",
818 req->job_id, req->job_step_id, str);
819 xfree(str);
820
821 FREE_NULL_BITMAP(req_map);
822 slurm_cred_free_args(&arg);
823 return hw_map;
824 }
825
826 /* helper function for _expand_masks() */
827 static void _blot_mask(bitstr_t *mask, bitstr_t *avail_map, uint16_t blot)
828 {
829 uint16_t i, j, size = 0;
830 int prev = -1;
831
832 if (!mask)
833 return;
834 size = bit_size(mask);
835 for (i = 0; i < size; i++) {
836 if (bit_test(mask, i)) {
837 /* fill in this blot */
838 uint16_t start = (i / blot) * blot;
839 if (start != prev) {
840 for (j = start; j < start + blot; j++) {
841 if (bit_test(avail_map, j))
842 bit_set(mask, j);
843 }
844 prev = start;
845 }
846 }
847 }
848 }
849
850 /* helper function for _expand_masks()
851 * for each task, consider which other bits are set in avail_map
852 * on the same socket */
853 static void _blot_mask_sockets(const uint32_t maxtasks, const uint32_t task,
854 bitstr_t **masks, uint16_t hw_sockets,
855 uint16_t hw_cores, uint16_t hw_threads,
856 bitstr_t *avail_map)
857 {
858 uint16_t i, j, size = 0;
859 int blot;
860
861 if (!masks[task])
862 return;
863
864 blot = bit_size(avail_map) / hw_sockets;
865 if (blot <= 0)
866 blot = 1;
867 size = bit_size(masks[task]);
868 for (i = 0; i < size; i++) {
869 if (bit_test(masks[task], i)) {
870 /* check if other bits are set in avail_map on this
871 * socket and set each corresponding bit in masks */
872 uint16_t start = (i / blot) * blot;
873 for (j = start; j < start+blot; j++) {
874 if (bit_test(avail_map, j))
875 bit_set(masks[task], j);
876 }
877 }
878 }
879 }
880
881 /* for each mask, expand the mask around the set bits to include the
882 * complete resource to which the set bits are to be bound */
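/*
 * Sketch of the effect (hypothetical 2 threads per core): a task whose
 * mask holds only thread bit 4 keeps just bit 4 when binding to threads,
 * also gains bit 5 when binding to cores, and gains every available bit
 * on its socket when binding to sockets.
 */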
883 static void _expand_masks(uint16_t cpu_bind_type, const uint32_t maxtasks,
884 bitstr_t **masks, uint16_t hw_sockets,
885 uint16_t hw_cores, uint16_t hw_threads,
886 bitstr_t *avail_map)
887 {
888 uint32_t i;
889
890 if (cpu_bind_type & CPU_BIND_TO_THREADS)
891 return;
892 if (cpu_bind_type & CPU_BIND_TO_CORES) {
893 if (hw_threads < 2)
894 return;
895 for (i = 0; i < maxtasks; i++) {
896 _blot_mask(masks[i], avail_map, hw_threads);
897 }
898 return;
899 }
900 if (cpu_bind_type & CPU_BIND_TO_SOCKETS) {
901 if (hw_threads*hw_cores < 2)
902 return;
903 for (i = 0; i < maxtasks; i++) {
904 _blot_mask_sockets(maxtasks, i, masks, hw_sockets,
905 hw_cores, hw_threads, avail_map);
906 }
907 return;
908 }
909 }
910
911 /*
912 * _task_layout_lllp_cyclic
913 *
914 * task_layout_lllp_cyclic creates a cyclic distribution at the
915 * lowest level of logical processor which is either socket, core or
916 * thread depending on the system architecture. The Cyclic algorithm
917 * is the same as the Cyclic distribution performed in srun.
918 *
919 * Distribution at the lllp:
920 * -m hostfile|block|cyclic:block|cyclic
921 *
922 * The first distribution "hostfile|block|cyclic" is computed
923 * in srun. The second distribution "block|cyclic" is computed
924 * locally by each slurmd.
925 *
926 * The input to the lllp distribution algorithms is the gids (task
927 * ids) generated for the local node.
928 *
929 * The output is a mapping of the gids onto logical processors
930 * (thread/core/socket) which is expressed as cpu_bind masks.
931 *
932 * If a task asks for more than one CPU per task, place its CPUs as
933 * close together as possible (fill the core before moving to the next
934 * socket for the extra CPUs)
935 *
936 */
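/*
 * Worked example (hypothetical 2 sockets x 2 cores x 1 thread, fully
 * allocated, 4 tasks with 1 CPU each, no per-core/per-socket task limits):
 * the tasks are dealt out socket by socket, yielding task 0 -> CPU 0,
 * task 1 -> CPU 2, task 2 -> CPU 1, task 3 -> CPU 3 in abstract order;
 * _lllp_map_abstract_masks() then remaps these to the physical layout.
 */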
937 static int _task_layout_lllp_cyclic(launch_tasks_request_msg_t *req,
938 uint32_t node_id, bitstr_t ***masks_p)
939 {
940 int last_taskcount = -1, taskcount = 0;
941 uint16_t i, s, hw_sockets = 0, hw_cores = 0, hw_threads = 0;
942 uint16_t offset = 0, p = 0;
943 int size, max_tasks = req->tasks_to_launch[(int)node_id];
944 int max_cpus = max_tasks * req->cpus_per_task;
945 bitstr_t *avail_map;
946 bitstr_t **masks = NULL;
947 int *socket_last_pu = NULL;
948 int core_inx, pu_per_core, *core_tasks = NULL;
949
950 info ("_task_layout_lllp_cyclic ");
951
952 avail_map = _get_avail_map(req, &hw_sockets, &hw_cores, &hw_threads);
953 if (!avail_map)
954 return SLURM_ERROR;
955
956 size = bit_set_count(avail_map);
957 if (size < max_tasks) {
958 error("task/affinity: only %d bits in avail_map for %d tasks!",
959 size, max_tasks);
960 FREE_NULL_BITMAP(avail_map);
961 return SLURM_ERROR;
962 }
963 if (size < max_cpus) {
964 /* Possible result of overcommit */
965 i = size / max_tasks;
966 info("task/affinity: reset cpus_per_task from %d to %d",
967 req->cpus_per_task, i);
968 req->cpus_per_task = i;
969 }
970
971 pu_per_core = hw_threads;
972 core_tasks = xmalloc(sizeof(int) * hw_sockets * hw_cores);
973 socket_last_pu = xmalloc(hw_sockets * sizeof(int));
974
975 *masks_p = xmalloc(max_tasks * sizeof(bitstr_t*));
976 masks = *masks_p;
977
978 size = bit_size(avail_map);
979
980 offset = hw_cores * hw_threads;
981 s = 0;
982 while (taskcount < max_tasks) {
983 if (taskcount == last_taskcount)
984 fatal("_task_layout_lllp_cyclic failure");
985 last_taskcount = taskcount;
986 for (i = 0; i < size; i++) {
987 bool already_switched = false;
988 uint16_t bit;
989 uint16_t orig_s = s;
990
991 while (socket_last_pu[s] >= offset) {
992 /* Switch to the next socket; we have
993 * run out of CPUs on this one. */
994
995 /* This only happens if the slurmctld
996 * gave us an allocation that made a
997 * task split sockets. Or if the
998 * entire allocation is on one socket.
999 */
1000 s = (s + 1) % hw_sockets;
1001 if (orig_s == s) {
1002 /* This should rarely happen,
1003 * but is here for sanity's sake.
1004 */
1005 debug("allocation is full, "
1006 "oversubscribing");
1007 memset(core_tasks, 0,
1008 (sizeof(int) *
1009 hw_sockets * hw_cores));
1010 memset(socket_last_pu, 0,
1011 (sizeof(int) * hw_sockets));
1012 }
1013 }
1014
1015 bit = socket_last_pu[s] + (s * offset);
1016
1017 /* In case hardware and config differ */
1018 bit %= size;
1019
1020 /* set up for the next one */
1021 socket_last_pu[s]++;
1022 /* skip unrequested threads */
1023 if (req->cpu_bind_type & CPU_BIND_ONE_THREAD_PER_CORE)
1024 socket_last_pu[s] += hw_threads - 1;
1025
1026 if (!bit_test(avail_map, bit))
1027 continue;
1028
1029 core_inx = bit / pu_per_core;
1030 if ((req->ntasks_per_core != 0) &&
1031 (core_tasks[core_inx] >= req->ntasks_per_core))
1032 continue;
1033
1034 if (!masks[taskcount])
1035 masks[taskcount] =
1036 bit_alloc(conf->block_map_size);
1037
1038 //info("setting %d %d", taskcount, bit);
1039 bit_set(masks[taskcount], bit);
1040
1041 if (!already_switched &&
1042 (((req->task_dist & SLURM_DIST_NODESOCKMASK) ==
1043 SLURM_DIST_CYCLIC_CFULL) ||
1044 ((req->task_dist & SLURM_DIST_NODESOCKMASK) ==
1045 SLURM_DIST_BLOCK_CFULL))) {
1046 /* This means we are laying out cpus
1047 * within a task cyclically as well. */
1048 s = (s + 1) % hw_sockets;
1049 already_switched = true;
1050 }
1051
1052 if (++p < req->cpus_per_task)
1053 continue;
1054
1055 core_tasks[core_inx]++;
1056
1057 /* Binding to cores, skip the remaining threads */
1058 if (!(req->cpu_bind_type & CPU_BIND_ONE_THREAD_PER_CORE)
1059 && ((req->cpu_bind_type & CPU_BIND_TO_CORES)
1060 || (req->ntasks_per_core == 1))) {
1061 int threads_not_used;
1062 if (req->cpus_per_task < hw_threads)
1063 threads_not_used =
1064 hw_threads - req->cpus_per_task;
1065 else
1066 threads_not_used =
1067 req->cpus_per_task % hw_threads;
1068 socket_last_pu[s] += threads_not_used;
1069 }
1070 p = 0;
1071
1072 if (!already_switched) {
1073 /* Now that we have finished a task, switch to
1074 * the next socket. */
1075 s = (s + 1) % hw_sockets;
1076 }
1077
1078 if (++taskcount >= max_tasks)
1079 break;
1080 }
1081 }
1082
1083 /* last step: expand the masks to bind each task
1084 * to the requested resource */
1085 _expand_masks(req->cpu_bind_type, max_tasks, masks,
1086 hw_sockets, hw_cores, hw_threads, avail_map);
1087 FREE_NULL_BITMAP(avail_map);
1088 xfree(core_tasks);
1089 xfree(socket_last_pu);
1090
1091 return SLURM_SUCCESS;
1092 }
1093
1094 /*
1095 * _task_layout_lllp_block
1096 *
1097 * task_layout_lllp_block will create a block distribution at the
1098 * lowest level of logical processor which is either socket, core or
1099 * thread depending on the system architecture. The Block algorithm
1100 * is the same as the Block distribution performed in srun.
1101 *
1102 * Distribution at the lllp:
1103 * -m hostfile|plane|block|cyclic:block|cyclic
1104 *
1105 * The first distribution "hostfile|plane|block|cyclic" is computed
1106 * in srun. The second distribution "plane|block|cyclic" is computed
1107 * locally by each slurmd.
1108 *
1109 * The input to the lllp distribution algorithms is the gids (task
1110 * ids) generated for the local node.
1111 *
1112 * The output is a mapping of the gids onto logical processors
1113 * (thread/core/socket) which is expressed as cpu_bind masks.
1114 *
1115 */
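/*
 * Worked example (hypothetical 2 sockets x 2 cores x 1 thread, fully
 * allocated, 4 tasks with 1 CPU each, no per-core/per-socket task limits):
 * the abstract map is walked in order, yielding task 0 -> CPU 0,
 * task 1 -> CPU 1, task 2 -> CPU 2, task 3 -> CPU 3, i.e. socket 0 fills
 * up before socket 1 is used.
 */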
1116 static int _task_layout_lllp_block(launch_tasks_request_msg_t *req,
1117 uint32_t node_id, bitstr_t ***masks_p)
1118 {
1119 int c, i, size, last_taskcount = -1, taskcount = 0;
1120 uint16_t hw_sockets = 0, hw_cores = 0, hw_threads = 0;
1121 int max_tasks = req->tasks_to_launch[(int)node_id];
1122 int max_cpus = max_tasks * req->cpus_per_task;
1123 bitstr_t *avail_map;
1124 bitstr_t **masks = NULL;
1125 int core_inx, pu_per_core, *core_tasks = NULL;
1126 int sock_inx, pu_per_socket, *socket_tasks = NULL;
1127
1128 info("_task_layout_lllp_block ");
1129
1130 avail_map = _get_avail_map(req, &hw_sockets, &hw_cores, &hw_threads);
1131 if (!avail_map) {
1132 return SLURM_ERROR;
1133 }
1134
1135 size = bit_set_count(avail_map);
1136 if ((req->cpu_bind_type & CPU_BIND_ONE_THREAD_PER_CORE) &&
1137 (size < (req->cpus_per_task * hw_threads))) {
1138 error("task/affinity: only %d bits in avail_map, CPU_BIND_ONE_THREAD_PER_CORE requires %d!",
1139 size, (req->cpus_per_task * hw_threads));
1140 FREE_NULL_BITMAP(avail_map);
1141 return SLURM_ERROR;
1142 }
1143 if (size < max_tasks) {
1144 error("task/affinity: only %d bits in avail_map for %d tasks!",
1145 size, max_tasks);
1146 FREE_NULL_BITMAP(avail_map);
1147 return SLURM_ERROR;
1148 }
1149 if (size < max_cpus) {
1150 /* Possible result of overcommit */
1151 i = size / max_tasks;
1152 info("task/affinity: reset cpus_per_task from %d to %d",
1153 req->cpus_per_task, i);
1154 req->cpus_per_task = i;
1155 }
1156 size = bit_size(avail_map);
1157
1158 if ((req->cpu_bind_type & CPU_BIND_ONE_THREAD_PER_CORE) &&
1159 (max_cpus > (hw_sockets * hw_cores))) {
1160 /* More CPUs requested than cores are available;
1161 * disable one-thread-per-core binding */
1162 req->cpu_bind_type &= (~CPU_BIND_ONE_THREAD_PER_CORE);
1163 }
1164
1165 *masks_p = xmalloc(max_tasks * sizeof(bitstr_t*));
1166 masks = *masks_p;
1167
1168 pu_per_core = hw_threads;
1169 core_tasks = xmalloc(sizeof(int) * hw_sockets * hw_cores);
1170 pu_per_socket = hw_cores * hw_threads;
1171 socket_tasks = xmalloc(sizeof(int) * hw_sockets);
1172
1173 /* block distribution with oversubscription */
1174 c = 0;
1175 while (taskcount < max_tasks) {
1176 if (taskcount == last_taskcount)
1177 fatal("_task_layout_lllp_block infinite loop");
1178 if (taskcount > 0) {
1179 /* Clear counters to over-subscribe, if necessary */
1180 memset(core_tasks, 0,
1181 (sizeof(int) * hw_sockets * hw_cores));
1182 memset(socket_tasks, 0,
1183 (sizeof(int) * hw_sockets));
1184 }
1185 last_taskcount = taskcount;
1186 /* the abstract map is already laid out in block order,
1187 * so just iterate over it
1188 */
1189 for (i = 0; i < size; i++) {
1190 /* skip unavailable resources */
1191 if (bit_test(avail_map, i) == 0)
1192 continue;
1193
1194 core_inx = i / pu_per_core;
1195 if ((req->ntasks_per_core != 0) &&
1196 (core_tasks[core_inx] >= req->ntasks_per_core))
1197 continue;
1198 sock_inx = i / pu_per_socket;
1199 if ((req->ntasks_per_socket != 0) &&
1200 (socket_tasks[sock_inx] >= req->ntasks_per_socket))
1201 continue;
1202
1203 if (!masks[taskcount])
1204 masks[taskcount] = bit_alloc(
1205 conf->block_map_size);
1206 //info("setting %d %d", taskcount, i);
1207 bit_set(masks[taskcount], i);
1208
1209 /* skip unrequested threads */
1210 if (req->cpu_bind_type & CPU_BIND_ONE_THREAD_PER_CORE)
1211 i += hw_threads - 1;
1212
1213 if (++c < req->cpus_per_task)
1214 continue;
1215
1216 /* We found one! Increment the count on each unit */
1217 core_tasks[core_inx]++;
1218 socket_tasks[sock_inx]++;
1219
1220 /* Binding to cores, skip the remaining threads */
1221 if (!(req->cpu_bind_type & CPU_BIND_ONE_THREAD_PER_CORE)
1222 && ((req->cpu_bind_type & CPU_BIND_TO_CORES)
1223 || (req->ntasks_per_core == 1))) {
1224 int threads_not_used;
1225 if (req->cpus_per_task < hw_threads)
1226 threads_not_used =
1227 hw_threads - req->cpus_per_task;
1228 else
1229 threads_not_used =
1230 req->cpus_per_task % hw_threads;
1231 i += threads_not_used;
1232 }
1233 c = 0;
1234 if (++taskcount >= max_tasks)
1235 break;
1236 }
1237 }
1238 xfree(core_tasks);
1239 xfree(socket_tasks);
1240
1241 /* last step: expand the masks to bind each task
1242 * to the requested resource */
1243 _expand_masks(req->cpu_bind_type, max_tasks, masks,
1244 hw_sockets, hw_cores, hw_threads, avail_map);
1245 FREE_NULL_BITMAP(avail_map);
1246
1247 return SLURM_SUCCESS;
1248 }
1249
1250 /*
1251 * _lllp_map_abstract_mask
1252 *
1253 * Map one abstract block mask to a physical machine mask
1254 *
1255 * IN - mask to map
1256 * OUT - mapped mask (storage allocated in this routine)
1257 */
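/*
 * Illustrative sketch: with the hypothetical block_map = { 0, 2, 1, 3 }
 * used earlier, an abstract mask 0x6 (bits 1 and 2) maps to physical 0x6
 * as well, while abstract 0x2 maps to physical 0x4.
 */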
1258 static bitstr_t *_lllp_map_abstract_mask(bitstr_t *bitmask)
1259 {
1260 int i, bit;
1261 int num_bits = bit_size(bitmask);
1262 bitstr_t *newmask = NULL;
1263 newmask = (bitstr_t *) bit_alloc(num_bits);
1264
1265 /* remap to physical machine */
1266 for (i = 0; i < num_bits; i++) {
1267 if (bit_test(bitmask,i)) {
1268 bit = BLOCK_MAP(i);
1269 if (bit < bit_size(newmask))
1270 bit_set(newmask, bit);
1271 else
1272 error("%s: can't go from %d -> %d since we "
1273 "only have %"BITSTR_FMT" bits",
1274 __func__, i, bit, bit_size(newmask));
1275 }
1276 }
1277 return newmask;
1278 }
1279
1280 /*
1281 * _lllp_map_abstract_masks
1282 *
1283 * Map an array of abstract block masks to physical machine masks
1284 *
1285 * IN- maximum number of tasks
1286 * IN/OUT- array of masks
1287 */
1288 static void _lllp_map_abstract_masks(const uint32_t maxtasks, bitstr_t **masks)
1289 {
1290 int i;
1291 debug3("_lllp_map_abstract_masks");
1292
1293 for (i = 0; i < maxtasks; i++) {
1294 bitstr_t *bitmask = masks[i];
1295 if (bitmask) {
1296 bitstr_t *newmask = _lllp_map_abstract_mask(bitmask);
1297 FREE_NULL_BITMAP(bitmask);
1298 masks[i] = newmask;
1299 }
1300 }
1301 }
1302
1303 /*
1304 * _lllp_generate_cpu_bind
1305 *
1306 * Generate the cpu_bind type and string given an array of bitstr_t masks
1307 *
1308 * IN/OUT- job launch request (cpu_bind_type and cpu_bind updated)
1309 * IN- maximum number of tasks
1310 * IN- array of masks
1311 */
1312 static void _lllp_generate_cpu_bind(launch_tasks_request_msg_t *req,
1313 const uint32_t maxtasks, bitstr_t **masks)
1314 {
1315 int i, num_bits=0, masks_len;
1316 bitstr_t *bitmask;
1317 bitoff_t charsize;
1318 char *masks_str = NULL;
1319 char buf_type[100];
1320
1321 for (i = 0; i < maxtasks; i++) {
1322 bitmask = masks[i];
1323 if (bitmask) {
1324 num_bits = bit_size(bitmask);
1325 break;
1326 }
1327 }
1328 charsize = (num_bits + 3) / 4; /* ASCII hex digits */
1329 charsize += 3; /* "0x" and trailing "," */
1330 masks_len = maxtasks * charsize + 1; /* number of masks + null */
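	/* e.g. (hypothetical sizes) 16-bit masks for 4 tasks: charsize is
	 * 4 + 3 == 7 and masks_len is 4 * 7 + 1 == 29, enough for
	 * "0xXXXX," per task plus the terminating NUL. */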
1331
1332 debug3("%s %d %"BITSTR_FMT" %d", __func__, maxtasks, charsize,
1333 masks_len);
1334
1335 masks_str = xmalloc(masks_len);
1336 masks_len = 0;
1337 for (i = 0; i < maxtasks; i++) {
1338 char *str;
1339 int curlen;
1340 bitmask = masks[i];
1341 if (bitmask == NULL) {
1342 continue;
1343 }
1344 str = (char *)bit_fmt_hexmask(bitmask);
1345 curlen = strlen(str) + 1;
1346
1347 if (masks_len > 0)
1348 masks_str[masks_len-1]=',';
1349 strlcpy(&masks_str[masks_len], str, curlen);
1350 masks_len += curlen;
1351 xfree(str);
1352 }
1353
1354 if (req->cpu_bind) {
1355 xfree(req->cpu_bind);
1356 }
1357 if (masks_str[0] != '\0') {
1358 req->cpu_bind = masks_str;
1359 req->cpu_bind_type |= CPU_BIND_MASK;
1360 } else {
1361 req->cpu_bind = NULL;
1362 req->cpu_bind_type &= ~CPU_BIND_VERBOSE;
1363 }
1364
1365 /* clear mask generation bits */
1366 req->cpu_bind_type &= ~CPU_BIND_TO_THREADS;
1367 req->cpu_bind_type &= ~CPU_BIND_TO_CORES;
1368 req->cpu_bind_type &= ~CPU_BIND_TO_SOCKETS;
1369 req->cpu_bind_type &= ~CPU_BIND_TO_LDOMS;
1370
1371 slurm_sprint_cpu_bind_type(buf_type, req->cpu_bind_type);
1372 info("_lllp_generate_cpu_bind jobid [%u]: %s, %s",
1373 req->job_id, buf_type, masks_str);
1374 }
1375