1 /*****************************************************************************\
2  *  Copyright (C) 2006-2009 Hewlett-Packard Development Company, L.P.
3  *  Copyright (C) 2008-2009 Lawrence Livermore National Security.
4  *  Written by Susanne M. Balle, <susanne.balle@hp.com>
5  *  CODE-OCEC-09-009. All rights reserved.
6  *
7  *  This file is part of Slurm, a resource management program.
8  *  For details, see <https://slurm.schedmd.com/>.
9  *  Please also read the included file: DISCLAIMER.
10  *
11  *  Slurm is free software; you can redistribute it and/or modify it under
12  *  the terms of the GNU General Public License as published by the Free
13  *  Software Foundation; either version 2 of the License, or (at your option)
14  *  any later version.
15  *
16  *  In addition, as a special exception, the copyright holders give permission
17  *  to link the code of portions of this program with the OpenSSL library under
18  *  certain conditions as described in each individual source file, and
19  *  distribute linked combinations including the two. You must obey the GNU
20  *  General Public License in all respects for all of the code used other than
21  *  OpenSSL. If you modify file(s) with this exception, you may extend this
22  *  exception to your version of the file(s), but you are not obligated to do
23  *  so. If you do not wish to do so, delete this exception statement from your
24  *  version.  If you delete this exception statement from all source files in
25  *  the program, then also delete it here.
26  *
27  *  Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
28  *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
29  *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
30  *  details.
31  *
32  *  You should have received a copy of the GNU General Public License along
33  *  with Slurm; if not, write to the Free Software Foundation, Inc.,
34  *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
35 \*****************************************************************************/
36 
37 #define _GNU_SOURCE
38 
39 #include "affinity.h"
40 #include "dist_tasks.h"
41 #include "src/common/bitstring.h"
42 #include "src/common/log.h"
43 #include "src/common/slurm_cred.h"
44 #include "src/common/slurm_protocol_api.h"
45 #include "src/common/slurm_resource_info.h"
46 #include "src/common/strlcpy.h"
47 #include "src/common/xmalloc.h"
48 #include "src/slurmd/slurmd/slurmd.h"
49 
50 #ifdef HAVE_NUMA
51 #include <numa.h>
52 #endif
53 
54 static char *_alloc_mask(launch_tasks_request_msg_t *req,
55 			 int *whole_node_cnt, int *whole_socket_cnt,
56 			 int *whole_core_cnt, int *whole_thread_cnt,
57 			 int *part_socket_cnt, int *part_core_cnt);
58 static bitstr_t *_get_avail_map(launch_tasks_request_msg_t *req,
59 				uint16_t *hw_sockets, uint16_t *hw_cores,
60 				uint16_t *hw_threads);
61 static int _get_local_node_info(slurm_cred_arg_t *arg, int job_node_id,
62 				uint16_t *sockets, uint16_t *cores);
63 
64 static int _task_layout_lllp_block(launch_tasks_request_msg_t *req,
65 				   uint32_t node_id, bitstr_t ***masks_p);
66 static int _task_layout_lllp_cyclic(launch_tasks_request_msg_t *req,
67 				    uint32_t node_id, bitstr_t ***masks_p);
68 
69 static void _lllp_map_abstract_masks(const uint32_t maxtasks,
70 				     bitstr_t **masks);
71 static void _lllp_generate_cpu_bind(launch_tasks_request_msg_t *req,
72 				    const uint32_t maxtasks,
73 				    bitstr_t **masks);
74 
75 /*     BLOCK_MAP     abstract block LLLP index to physical machine LLLP index
76  *     BLOCK_MAP_INV physical machine LLLP index to abstract block LLLP index
77  */
78 #define BLOCK_MAP(index)	_block_map(index, conf->block_map)
79 #define BLOCK_MAP_INV(index)	_block_map(index, conf->block_map_inv)
80 
81 
82 /* _block_map
83  *
84  * safely returns a mapped index using a provided block map
85  *
86  * IN - index to map
87  * IN - map to use
88  */
89 static uint16_t _block_map(uint16_t index, uint16_t *map)
90 {
91 	if (map == NULL) {
92 	    	return index;
93 	}
94 	/* make sure bit falls in map */
95 	if (index >= conf->block_map_size) {
96 		debug3("wrapping index %u into block_map_size of %u",
97 		       index, conf->block_map_size);
98 		index = index % conf->block_map_size;
99 	}
100 	index = map[index];
101 	return(index);
102 }
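
/*
 * Illustrative sketch (not part of Slurm): a minimal, standalone example of
 * the kind of remapping BLOCK_MAP()/_block_map() perform, assuming a
 * hypothetical 8-CPU node whose abstract (block-ordered) index is translated
 * to a physical CPU id through a lookup table, with wraparound when the
 * index exceeds the map size.  The map contents below are made up.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

static uint16_t example_block_map(uint16_t index, const uint16_t *map,
				  uint16_t map_size)
{
	if (map == NULL)
		return index;		/* identity when no map is defined */
	if (index >= map_size)
		index %= map_size;	/* wrap out-of-range indexes */
	return map[index];
}

int main(void)
{
	/* hypothetical abstract->physical map: 2 sockets x 2 cores x 2 threads */
	const uint16_t map[8] = { 0, 4, 1, 5, 2, 6, 3, 7 };
	for (uint16_t i = 0; i < 10; i++)
		printf("abstract %u -> physical %u\n", i,
		       example_block_map(i, map, 8));
	return 0;
}
#endif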
103 
104 static void _task_layout_display_masks(launch_tasks_request_msg_t *req,
105 					const uint32_t *gtid,
106 					const uint32_t maxtasks,
107 					bitstr_t **masks)
108 {
109 	int i;
110 	char *str = NULL;
111 	for(i = 0; i < maxtasks; i++) {
112 		str = (char *)bit_fmt_hexmask(masks[i]);
113 		debug3("_task_layout_display_masks jobid [%u:%d] %s",
114 		       req->job_id, gtid[i], str);
115 		xfree(str);
116 	}
117 }
118 
119 static void _lllp_free_masks(const uint32_t maxtasks, bitstr_t **masks)
120 {
121     	int i;
122 	bitstr_t *bitmask;
123 
124 	for (i = 0; i < maxtasks; i++) {
125 		bitmask = masks[i];
126 		FREE_NULL_BITMAP(bitmask);
127 	}
128 	xfree(masks);
129 }
130 
131 #ifdef HAVE_NUMA
132 /* _match_mask_to_ldom
133  *
134  * expand each mask to encompass the whole locality domain
135  * within which it currently exists
136  * NOTE: this assumes that the masks are already in logical
137  * (and not abstract) CPU order.
138  */
139 static void _match_masks_to_ldom(const uint32_t maxtasks, bitstr_t **masks)
140 {
141 	uint32_t i, b, size;
142 
143 	if (!masks || !masks[0])
144 		return;
145 	size = bit_size(masks[0]);
146 	for(i = 0; i < maxtasks; i++) {
147 		for (b = 0; b < size; b++) {
148 			if (bit_test(masks[i], b)) {
149 				/* get the NUMA node for this CPU, and then
150 				 * set all CPUs in the mask that exist in
151 				 * the same NUMA node */
152 				int c;
153 				uint16_t nnid = slurm_get_numa_node(b);
154 				for (c = 0; c < size; c++) {
155 					if (slurm_get_numa_node(c) == nnid)
156 						bit_set(masks[i], c);
157 				}
158 			}
159 		}
160 	}
161 }
162 #endif
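
/*
 * Illustrative sketch (not part of Slurm): expanding a cpu_set_t so that any
 * set CPU pulls in every CPU of the same NUMA node, analogous to what
 * _match_masks_to_ldom() does with bitstr_t masks.  This standalone version
 * calls libnuma's numa_node_of_cpu() directly instead of Slurm's
 * slurm_get_numa_node() wrapper; the CPU count of 8 is an assumption.
 */
#if 0
#define _GNU_SOURCE
#include <numa.h>
#include <sched.h>

static void expand_to_ldom(cpu_set_t *mask, int ncpus)
{
	cpu_set_t out;
	CPU_ZERO(&out);
	for (int b = 0; b < ncpus; b++) {
		if (!CPU_ISSET(b, mask))
			continue;
		int nnid = numa_node_of_cpu(b);
		for (int c = 0; c < ncpus; c++) {
			if (numa_node_of_cpu(c) == nnid)
				CPU_SET(c, &out);	/* same locality domain */
		}
	}
	*mask = out;
}

int main(void)
{
	if (numa_available() < 0)
		return 1;		/* no NUMA support on this system */
	cpu_set_t mask;
	CPU_ZERO(&mask);
	CPU_SET(1, &mask);		/* one CPU set ... */
	expand_to_ldom(&mask, 8);	/* ... grows to its whole NUMA node */
	return 0;
}
#endif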
163 
164 /*
165  * batch_bind - Set the batch request message so as to bind the shell to the
166  *	proper resources
167  */
168 void batch_bind(batch_job_launch_msg_t *req)
169 {
170 	bitstr_t *req_map, *hw_map;
171 	slurm_cred_arg_t arg;
172 	uint16_t sockets=0, cores=0, num_cpus;
173 	int start, task_cnt=0;
174 
175 	if (slurm_cred_get_args(req->cred, &arg) != SLURM_SUCCESS) {
176 		error("task/affinity: job lacks a credential");
177 		return;
178 	}
179 	start = _get_local_node_info(&arg, 0, &sockets, &cores);
180 	if (start != 0) {
181 		error("task/affinity: missing node 0 in job credential");
182 		slurm_cred_free_args(&arg);
183 		return;
184 	}
185 	if ((sockets * cores) == 0) {
186 		error("task/affinity: socket and core count both zero");
187 		slurm_cred_free_args(&arg);
188 		return;
189 	}
190 
191 	num_cpus  = MIN((sockets * cores),
192 			 (conf->sockets * conf->cores));
193 	req_map = (bitstr_t *) bit_alloc(num_cpus);
194 	hw_map  = (bitstr_t *) bit_alloc(conf->block_map_size);
195 
196 #ifdef HAVE_FRONT_END
197 {
198 	/* Since the front-end nodes are a shared resource, we limit each job
199 	 * to one CPU based upon a monotonically increasing sequence number */
200 	static int last_id = 0;
201 	bit_set(hw_map, ((last_id++) % conf->block_map_size));
202 	task_cnt = 1;
203 }
204 #else
205 {
206 	char *str;
207 	int t, p;
208 
209 	/* Transfer core_bitmap data to local req_map.
210 	 * The MOD function handles the case where fewer processes
211 	 * physically exist than are configured (slurmd is out of
212 	 * sync with the slurmctld daemon). */
213 	for (p = 0; p < (sockets * cores); p++) {
214 		if (bit_test(arg.job_core_bitmap, p))
215 			bit_set(req_map, (p % num_cpus));
216 	}
217 
218 	str = (char *)bit_fmt_hexmask(req_map);
219 	debug3("task/affinity: job %u core mask from slurmctld: %s",
220 		req->job_id, str);
221 	xfree(str);
222 
223 	for (p = 0; p < num_cpus; p++) {
224 		if (bit_test(req_map, p) == 0)
225 			continue;
226 		/* core_bitmap does not include threads, so we
227 		 * add them here but limit them to what the job
228 		 * requested */
229 		for (t = 0; t < conf->threads; t++) {
230 			uint16_t pos = p * conf->threads + t;
231 			if (pos >= conf->block_map_size) {
232 				info("more resources configured than exist");
233 				p = num_cpus;
234 				break;
235 			}
236 			bit_set(hw_map, pos);
237 			task_cnt++;
238 		}
239 	}
240 }
241 #endif
242 	if (task_cnt) {
243 		req->cpu_bind_type = CPU_BIND_MASK;
244 		if (conf->task_plugin_param & CPU_BIND_VERBOSE)
245 			req->cpu_bind_type |= CPU_BIND_VERBOSE;
246 		xfree(req->cpu_bind);
247 		req->cpu_bind = (char *)bit_fmt_hexmask(hw_map);
248 		info("task/affinity: job %u CPU input mask for node: %s",
249 		     req->job_id, req->cpu_bind);
250 		/* translate abstract masks to actual hardware layout */
251 		_lllp_map_abstract_masks(1, &hw_map);
252 #ifdef HAVE_NUMA
253 		if (req->cpu_bind_type & CPU_BIND_TO_LDOMS) {
254 			_match_masks_to_ldom(1, &hw_map);
255 		}
256 #endif
257 		xfree(req->cpu_bind);
258 		req->cpu_bind = (char *)bit_fmt_hexmask(hw_map);
259 		info("task/affinity: job %u CPU final HW mask for node: %s",
260 		     req->job_id, req->cpu_bind);
261 	} else {
262 		error("task/affinity: job %u allocated no CPUs",
263 		      req->job_id);
264 	}
265 	FREE_NULL_BITMAP(hw_map);
266 	FREE_NULL_BITMAP(req_map);
267 	slurm_cred_free_args(&arg);
268 }
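
/*
 * Illustrative sketch (not part of Slurm): the core-to-thread expansion that
 * batch_bind() performs, shown with plain arrays.  The core bitmap from the
 * controller has one bit per core; the node-level hardware mask has one bit
 * per thread, so each selected core is expanded into `threads` consecutive
 * bits.  The counts used here are hypothetical.
 */
#if 0
#include <stdbool.h>
#include <stdio.h>

int main(void)
{
	const int cores = 4, threads = 2;
	const bool core_map[4] = { true, false, true, false };
	bool hw_map[8] = { false };

	for (int c = 0; c < cores; c++) {
		if (!core_map[c])
			continue;
		for (int t = 0; t < threads; t++)
			hw_map[c * threads + t] = true;	/* add every thread of the core */
	}
	for (int i = 0; i < cores * threads; i++)
		printf("cpu %d: %d\n", i, hw_map[i]);
	return 0;
}
#endif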
269 
270 /* The job has specialized cores, synchronize user map with available cores */
271 static void _validate_map(launch_tasks_request_msg_t *req, char *avail_mask)
272 {
273 	char *tmp_map, *save_ptr = NULL, *tok;
274 	cpu_set_t avail_cpus;
275 	bool superset = true;
276 
277 	CPU_ZERO(&avail_cpus);
278 	(void) task_str_to_cpuset(&avail_cpus, avail_mask);
279 	tmp_map = xstrdup(req->cpu_bind);
280 	tok = strtok_r(tmp_map, ",", &save_ptr);
281 	while (tok) {
282 		int i = atoi(tok);
283 		if (!CPU_ISSET(i, &avail_cpus)) {
284 			/* The requested CPU map includes a CPU outside of
285 			 * the allocation; disable the CPU map. */
286 			superset = false;
287 			break;
288 		}
289 		tok = strtok_r(NULL, ",", &save_ptr);
290 	}
291 	xfree(tmp_map);
292 
293 	if (!superset) {
294 		info("task/affinity: Ignoring user CPU binding outside of job "
295 		     "step allocation");
296 		req->cpu_bind_type &= (~CPU_BIND_MAP);
297 		req->cpu_bind_type |=   CPU_BIND_MASK;
298 		xfree(req->cpu_bind);
299 		req->cpu_bind = xstrdup(avail_mask);
300 	}
301 }
302 
303 /* The job has specialized cores, synchronize user mask with available cores */
304 static void _validate_mask(launch_tasks_request_msg_t *req, char *avail_mask)
305 {
306 	char *new_mask = NULL, *save_ptr = NULL, *tok;
307 	cpu_set_t avail_cpus, task_cpus;
308 	bool superset = true;
309 
310 	CPU_ZERO(&avail_cpus);
311 	(void) task_str_to_cpuset(&avail_cpus, avail_mask);
312 	tok = strtok_r(req->cpu_bind, ",", &save_ptr);
313 	while (tok) {
314 		int i, overlaps = 0;
315 		char mask_str[1 + CPU_SETSIZE / 4];
316 		CPU_ZERO(&task_cpus);
317 		(void) task_str_to_cpuset(&task_cpus, tok);
318 		for (i = 0; i < CPU_SETSIZE; i++) {
319 			if (!CPU_ISSET(i, &task_cpus))
320 				continue;
321 			if (CPU_ISSET(i, &avail_cpus)) {
322 				overlaps++;
323 			} else {
324 				CPU_CLR(i, &task_cpus);
325 				superset = false;
326 			}
327 		}
328 		if (overlaps == 0) {
329 			/* The task's CPU mask is completely invalid.
330 			 * Give it all allowed CPUs. */
331 			for (i = 0; i < CPU_SETSIZE; i++) {
332 				if (CPU_ISSET(i, &avail_cpus))
333 					CPU_SET(i, &task_cpus);
334 			}
335 		}
336 		task_cpuset_to_str(&task_cpus, mask_str);
337 		if (new_mask)
338 			xstrcat(new_mask, ",");
339 		xstrcat(new_mask, mask_str);
340 		tok = strtok_r(NULL, ",", &save_ptr);
341 	}
342 
343 	if (!superset) {
344 		info("task/affinity: Ignoring user CPU binding outside of job "
345 		     "step allocation");
346 	}
347 
348 	xfree(req->cpu_bind);
349 	req->cpu_bind = new_mask;
350 }
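
/*
 * Illustrative sketch (not part of Slurm): clamping a user-supplied CPU mask
 * to the CPUs actually allocated to the step, falling back to the full
 * allocation when nothing overlaps -- the same policy _validate_mask()
 * applies per task.  The CPU ids used below are hypothetical.
 */
#if 0
#define _GNU_SOURCE
#include <sched.h>

static void clamp_to_alloc(cpu_set_t *task, const cpu_set_t *avail)
{
	cpu_set_t clamped;
	CPU_AND(&clamped, task, avail);		/* keep only allocated CPUs */
	if (CPU_COUNT(&clamped) == 0)
		clamped = *avail;		/* nothing valid: give all allowed CPUs */
	*task = clamped;
}

int main(void)
{
	cpu_set_t avail, task;
	CPU_ZERO(&avail);
	CPU_ZERO(&task);
	CPU_SET(2, &avail);
	CPU_SET(3, &avail);
	CPU_SET(3, &task);
	CPU_SET(7, &task);		/* outside the allocation */
	clamp_to_alloc(&task, &avail);	/* result: only CPU 3 remains set */
	return 0;
}
#endif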
351 
352 /*
353  * lllp_distribution
354  *
355  * Note: lllp stands for Lowest Level of Logical Processors.
356  *
357  * When automatic binding is enabled:
358  *      - no binding flags (>= CPU_BIND_NONE) are set, and
359  *      - an auto binding level is selected: CPU_BIND_TO_{SOCKETS,CORES,THREADS}
360  * Otherwise limit the job step to the allocated CPUs.
361  *
362  * Generate the appropriate cpu_bind type and string which results in
363  * the specified lllp distribution.
364  *
365  * IN/OUT- job launch request (cpu_bind_type and cpu_bind updated)
366  * IN- global task id array
367  */
368 void lllp_distribution(launch_tasks_request_msg_t *req, uint32_t node_id)
369 {
370 	int rc = SLURM_SUCCESS;
371 	bitstr_t **masks = NULL;
372 	char buf_type[100];
373 	int maxtasks = req->tasks_to_launch[(int)node_id];
374 	int whole_nodes, whole_sockets, whole_cores, whole_threads;
375 	int part_sockets, part_cores;
376 	const uint32_t *gtid = req->global_task_ids[(int)node_id];
377 	static uint16_t bind_entity = CPU_BIND_TO_THREADS | CPU_BIND_TO_CORES |
378 				      CPU_BIND_TO_SOCKETS | CPU_BIND_TO_LDOMS;
379 	static uint16_t bind_mode = CPU_BIND_NONE   | CPU_BIND_MASK   |
380 				    CPU_BIND_RANK   | CPU_BIND_MAP    |
381 				    CPU_BIND_LDMASK | CPU_BIND_LDRANK |
382 				    CPU_BIND_LDMAP;
383 	static int only_one_thread_per_core = -1;
384 
385 	if (only_one_thread_per_core == -1) {
386 		if (conf->cpus == (conf->sockets * conf->cores))
387 			only_one_thread_per_core = 1;
388 		else
389 			only_one_thread_per_core = 0;
390 	}
391 
392 	/*
393 	 * If the node's configured CPU count includes only one thread
394 	 * per core, setting this flag is the easiest way to convey
395 	 * that to the affinity plugin.
396 	 */
397 	if (only_one_thread_per_core)
398 		req->cpu_bind_type |= CPU_BIND_ONE_THREAD_PER_CORE;
399 
400 	if (req->cpu_bind_type & bind_mode) {
401 		/* Explicit step binding specified by user */
402 		char *avail_mask = _alloc_mask(req,
403 					       &whole_nodes,  &whole_sockets,
404 					       &whole_cores,  &whole_threads,
405 					       &part_sockets, &part_cores);
406 		if (!avail_mask) {
407 			error("task/affinity: Could not determine allocated CPUs");
408 		} else if ((whole_nodes == 0) &&
409 			   (req->job_core_spec == NO_VAL16)) {
410 			info("task/affinity: entire node must be allocated, "
411 			     "disabling affinity");
412 			xfree(req->cpu_bind);
413 			req->cpu_bind = avail_mask;
414 			req->cpu_bind_type &= (~bind_mode);
415 			req->cpu_bind_type |= CPU_BIND_MASK;
416 		} else {
417 			if (req->job_core_spec == NO_VAL16) {
418 				if (req->cpu_bind_type & CPU_BIND_MASK)
419 					_validate_mask(req, avail_mask);
420 				else if (req->cpu_bind_type & CPU_BIND_MAP)
421 					_validate_map(req, avail_mask);
422 			}
423 			xfree(avail_mask);
424 		}
425 		slurm_sprint_cpu_bind_type(buf_type, req->cpu_bind_type);
426 		info("lllp_distribution jobid [%u] manual binding: %s",
427 		     req->job_id, buf_type);
428 		return;
429 	}
430 
431 	if (!(req->cpu_bind_type & bind_entity)) {
432 		/*
433 		 * No bind unit (sockets, cores) specified by user,
434 		 * pick something reasonable
435 		 */
436 		uint32_t task_plugin_param = slurm_get_task_plugin_param();
437 		bool auto_def_set = false;
438 		int spec_thread_cnt = 0;
439 		int max_tasks = req->tasks_to_launch[(int)node_id] *
440 			req->cpus_per_task;
441 		char *avail_mask = _alloc_mask(req,
442 					       &whole_nodes,  &whole_sockets,
443 					       &whole_cores,  &whole_threads,
444 					       &part_sockets, &part_cores);
445 		debug("binding tasks:%d to "
446 		      "nodes:%d sockets:%d:%d cores:%d:%d threads:%d",
447 		      max_tasks, whole_nodes, whole_sockets ,part_sockets,
448 		      whole_cores, part_cores, whole_threads);
449 		if ((req->job_core_spec != NO_VAL16) &&
450 		    (req->job_core_spec &  CORE_SPEC_THREAD)  &&
451 		    (req->job_core_spec != CORE_SPEC_THREAD)) {
452 			spec_thread_cnt = req->job_core_spec &
453 					  (~CORE_SPEC_THREAD);
454 		}
455 		if (((max_tasks == whole_sockets) && (part_sockets == 0)) ||
456 		    (spec_thread_cnt &&
457 		     (max_tasks == (whole_sockets + part_sockets)))) {
458 			req->cpu_bind_type |= CPU_BIND_TO_SOCKETS;
459 			goto make_auto;
460 		}
461 		if (((max_tasks == whole_cores) && (part_cores == 0)) ||
462 		    (spec_thread_cnt &&
463 		     (max_tasks == (whole_cores + part_cores)))) {
464 			req->cpu_bind_type |= CPU_BIND_TO_CORES;
465 			goto make_auto;
466 		}
467 		if (max_tasks == whole_threads) {
468 			req->cpu_bind_type |= CPU_BIND_TO_THREADS;
469 			goto make_auto;
470 		}
471 
472 		if (task_plugin_param & CPU_AUTO_BIND_TO_THREADS) {
473 			auto_def_set = true;
474 			req->cpu_bind_type |= CPU_BIND_TO_THREADS;
475 			goto make_auto;
476 		} else if (task_plugin_param & CPU_AUTO_BIND_TO_CORES) {
477 			auto_def_set = true;
478 			req->cpu_bind_type |= CPU_BIND_TO_CORES;
479 			goto make_auto;
480 		} else if (task_plugin_param & CPU_AUTO_BIND_TO_SOCKETS) {
481 			auto_def_set = true;
482 			req->cpu_bind_type |= CPU_BIND_TO_SOCKETS;
483 			goto make_auto;
484 		}
485 
486 		if (avail_mask) {
487 			xfree(req->cpu_bind);
488 			req->cpu_bind = avail_mask;
489 			req->cpu_bind_type |= CPU_BIND_MASK;
490 		}
491 
492 		slurm_sprint_cpu_bind_type(buf_type, req->cpu_bind_type);
493 		info("lllp_distribution jobid [%u] auto binding off: %s",
494 		     req->job_id, buf_type);
495 		return;
496 
497   make_auto:	xfree(avail_mask);
498 		slurm_sprint_cpu_bind_type(buf_type, req->cpu_bind_type);
499 		info("lllp_distribution jobid [%u] %s auto binding: "
500 		     "%s, dist %d", req->job_id,
501 		     (auto_def_set) ? "default" : "implicit",
502 		     buf_type, req->task_dist);
503 	} else {
504 		/* Explicit bind unit (sockets, cores) specified by user */
505 		slurm_sprint_cpu_bind_type(buf_type, req->cpu_bind_type);
506 		info("lllp_distribution jobid [%u] binding: %s, dist %d",
507 		     req->job_id, buf_type, req->task_dist);
508 	}
509 
510 	switch (req->task_dist & SLURM_DIST_NODESOCKMASK) {
511 	case SLURM_DIST_BLOCK_BLOCK:
512 	case SLURM_DIST_CYCLIC_BLOCK:
513 	case SLURM_DIST_PLANE:
514 		/* tasks are distributed in blocks within a plane */
515 		rc = _task_layout_lllp_block(req, node_id, &masks);
516 		break;
517 	case SLURM_DIST_ARBITRARY:
518 	case SLURM_DIST_BLOCK:
519 	case SLURM_DIST_CYCLIC:
520 	case SLURM_DIST_UNKNOWN:
521 		if (slurm_get_select_type_param()
522 		    & CR_CORE_DEFAULT_DIST_BLOCK) {
523 			rc = _task_layout_lllp_block(req, node_id, &masks);
524 			break;
525 		}
526 		/*
527 		 * We want to fall through here if we aren't doing a
528 		 * default dist block.
529 		 */
530 	default:
531 		rc = _task_layout_lllp_cyclic(req, node_id, &masks);
532 		break;
533 	}
534 
535 	/*
536 	 * FIXME: I'm worried about core_bitmap with CPU_BIND_TO_SOCKETS &
537 	 * max_cores - does select/cons_res plugin allocate whole
538 	 * socket??? Maybe not. Check srun man page.
539 	 */
540 
541 	if (rc == SLURM_SUCCESS) {
542 		_task_layout_display_masks(req, gtid, maxtasks, masks);
543 	    	/* translate abstract masks to actual hardware layout */
544 		_lllp_map_abstract_masks(maxtasks, masks);
545 		_task_layout_display_masks(req, gtid, maxtasks, masks);
546 #ifdef HAVE_NUMA
547 		if (req->cpu_bind_type & CPU_BIND_TO_LDOMS) {
548 			_match_masks_to_ldom(maxtasks, masks);
549 			_task_layout_display_masks(req, gtid, maxtasks, masks);
550 		}
551 #endif
552 	    	 /* convert masks into cpu_bind mask string */
553 		 _lllp_generate_cpu_bind(req, maxtasks, masks);
554 	} else {
555 		char *avail_mask = _alloc_mask(req,
556 					       &whole_nodes,  &whole_sockets,
557 					       &whole_cores,  &whole_threads,
558 					       &part_sockets, &part_cores);
559 		if (avail_mask) {
560 			xfree(req->cpu_bind);
561 			req->cpu_bind = avail_mask;
562 			req->cpu_bind_type &= (~bind_mode);
563 			req->cpu_bind_type |= CPU_BIND_MASK;
564 		}
565 		slurm_sprint_cpu_bind_type(buf_type, req->cpu_bind_type);
566 		error("lllp_distribution jobid [%u] overriding binding: %s",
567 		      req->job_id, buf_type);
568 		error("Verify socket/core/thread counts in configuration");
569 	}
570 	if (masks)
571 		_lllp_free_masks(maxtasks, masks);
572 }
573 
574 
575 /*
576  * _get_local_node_info - get job allocation details for this node
577  * IN: arg         - job credential argument structure
578  * IN: job_node_id - index of the local node in the job allocation
579  * IN/OUT: sockets - pointer to socket count variable
580  * IN/OUT: cores   - pointer to cores_per_socket count variable
581  * OUT:  returns the core_bitmap index of the first core for this node
582  */
583 static int _get_local_node_info(slurm_cred_arg_t *arg, int job_node_id,
584 				uint16_t *sockets, uint16_t *cores)
585 {
586 	int bit_start = 0, bit_finish = 0;
587 	int i, index = -1, cur_node_id = -1;
588 
589 	do {
590 		index++;
591 		for (i = 0; i < arg->sock_core_rep_count[index] &&
592 			     cur_node_id < job_node_id; i++) {
593 			bit_start = bit_finish;
594 			bit_finish += arg->sockets_per_node[index] *
595 					arg->cores_per_socket[index];
596 			cur_node_id++;
597 		}
598 
599 	} while (cur_node_id < job_node_id);
600 
601 	*sockets = arg->sockets_per_node[index];
602 	*cores   = arg->cores_per_socket[index];
603 	return bit_start;
604 }
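
/*
 * Illustrative sketch (not part of Slurm): how run-length encoded
 * sockets_per_node / cores_per_socket / sock_core_rep_count arrays, like the
 * ones carried in the job credential, can be walked to find a node's first
 * core_bitmap bit and its geometry, mirroring _get_local_node_info().
 * The sample arrays below are hypothetical.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

/* returns the first core_bitmap bit for job_node_id */
static int first_core_bit(const uint16_t *sockets_per_node,
			  const uint16_t *cores_per_socket,
			  const uint32_t *sock_core_rep_count,
			  int job_node_id,
			  uint16_t *sockets, uint16_t *cores)
{
	int bit = 0, node = 0;

	for (int rec = 0; ; rec++) {
		for (uint32_t i = 0; i < sock_core_rep_count[rec]; i++) {
			if (node == job_node_id) {
				*sockets = sockets_per_node[rec];
				*cores = cores_per_socket[rec];
				return bit;
			}
			bit += sockets_per_node[rec] * cores_per_socket[rec];
			node++;
		}
	}
}

int main(void)
{
	/* two records: 3 nodes of 2x4 cores, then 1 node of 2x8 cores */
	const uint16_t spn[] = { 2, 2 }, cps[] = { 4, 8 };
	const uint32_t rep[] = { 3, 1 };
	uint16_t s, c;
	int start = first_core_bit(spn, cps, rep, 3, &s, &c);
	printf("node 3: start bit %d, %u sockets x %u cores\n", start, s, c);
	return 0;
}
#endif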
605 
606 /*
607  * Determine which CPUs a job step can use.
608  * OUT whole_<entity>_count - returns count of whole <entities> in this
609  *                            allocation for this node
610  * OUT part__<entity>_count - returns count of partial <entities> in this
611  *                            allocation for this node
612  * RET - a string representation of the available mask or NULL on error
613  * NOTE: Caller must xfree() the return value.
614  */
615 static char *_alloc_mask(launch_tasks_request_msg_t *req,
616 			 int *whole_node_cnt,  int *whole_socket_cnt,
617 			 int *whole_core_cnt,  int *whole_thread_cnt,
618 			 int *part_socket_cnt, int *part_core_cnt)
619 {
620 	uint16_t sockets, cores, threads;
621 	int c, s, t, i;
622 	int c_miss, s_miss, t_miss, c_hit, t_hit;
623 	bitstr_t *alloc_bitmap;
624 	char *str_mask;
625 	bitstr_t *alloc_mask;
626 
627 	*whole_node_cnt   = 0;
628 	*whole_socket_cnt = 0;
629 	*whole_core_cnt   = 0;
630 	*whole_thread_cnt = 0;
631 	*part_socket_cnt  = 0;
632 	*part_core_cnt    = 0;
633 
634 	alloc_bitmap = _get_avail_map(req, &sockets, &cores, &threads);
635 	if (!alloc_bitmap)
636 		return NULL;
637 
638 	alloc_mask = bit_alloc(bit_size(alloc_bitmap));
639 
640 	i = 0;
641 	for (s = 0, s_miss = false; s < sockets; s++) {
642 		for (c = 0, c_hit = c_miss = false; c < cores; c++) {
643 			for (t = 0, t_hit = t_miss = false; t < threads; t++) {
644 				/*
645 				 * If the configuration describes a larger
646 				 * system than physically exists, wrap the
647 				 * index so we stay within the bitmap.
648 				 */
649 				if (i >= bit_size(alloc_bitmap))
650 					i = 0;
651 				if (bit_test(alloc_bitmap, i)) {
652 					bit_set(alloc_mask, i);
653 					(*whole_thread_cnt)++;
654 					t_hit = true;
655 					c_hit = true;
656 				} else
657 					t_miss = true;
658 				i++;
659 			}
660 			if (!t_miss)
661 				(*whole_core_cnt)++;
662 			else {
663 				if (t_hit)
664 					(*part_core_cnt)++;
665 				c_miss = true;
666 			}
667 		}
668 		if (!c_miss)
669 			(*whole_socket_cnt)++;
670 		else {
671 			if (c_hit)
672 				(*part_socket_cnt)++;
673 			s_miss = true;
674 		}
675 	}
676 	if (!s_miss)
677 		(*whole_node_cnt)++;
678 	FREE_NULL_BITMAP(alloc_bitmap);
679 
680 	if ((req->job_core_spec != NO_VAL16) &&
681 	    (req->job_core_spec &  CORE_SPEC_THREAD)  &&
682 	    (req->job_core_spec != CORE_SPEC_THREAD)) {
683 		int spec_thread_cnt;
684 		spec_thread_cnt = req->job_core_spec & (~CORE_SPEC_THREAD);
685 		for (t = threads - 1;
686 		     ((t > 0) && (spec_thread_cnt > 0)); t--) {
687 			for (c = cores - 1;
688 			     ((c > 0) && (spec_thread_cnt > 0)); c--) {
689 				for (s = sockets - 1;
690 				     ((s >= 0) && (spec_thread_cnt > 0)); s--) {
691 					i = s * cores + c;
692 					i = (i * threads) + t;
693 					bit_clear(alloc_mask, i);
694 					spec_thread_cnt--;
695 				}
696 			}
697 		}
698 	}
699 
700 	/* translate abstract masks to actual hardware layout */
701 	_lllp_map_abstract_masks(1, &alloc_mask);
702 
703 #ifdef HAVE_NUMA
704 	if (req->cpu_bind_type & CPU_BIND_TO_LDOMS) {
705 		_match_masks_to_ldom(1, &alloc_mask);
706 	}
707 #endif
708 
709 	str_mask = bit_fmt_hexmask(alloc_mask);
710 	FREE_NULL_BITMAP(alloc_mask);
711 	return str_mask;
712 }
713 
714 /*
715  * Given a job step request, return an equivalent local bitmap for this node
716  * IN req          - The job step launch request
717  * OUT hw_sockets  - number of actual sockets on this node
718  * OUT hw_cores    - number of actual cores per socket on this node
719  * OUT hw_threads  - number of actual threads per core on this node
720  * RET: bitmap of processors available to this job step on this node
721  *      OR NULL on error
722  */
723 static bitstr_t *_get_avail_map(launch_tasks_request_msg_t *req,
724 				uint16_t *hw_sockets, uint16_t *hw_cores,
725 				uint16_t *hw_threads)
726 {
727 	bitstr_t *req_map, *hw_map;
728 	slurm_cred_arg_t arg;
729 	uint16_t p, t, new_p, num_cpus, sockets, cores;
730 	int job_node_id;
731 	int start;
732 	char *str;
733 	int spec_thread_cnt = 0;
734 
735 	*hw_sockets = conf->sockets;
736 	*hw_cores   = conf->cores;
737 	*hw_threads = conf->threads;
738 
739 	if (slurm_cred_get_args(req->cred, &arg) != SLURM_SUCCESS) {
740 		error("task/affinity: job lacks a credential");
741 		return NULL;
742 	}
743 
744 	/* we need this node's ID in relation to the whole
745 	 * job allocation, not just this jobstep */
746 	job_node_id = nodelist_find(arg.job_hostlist, conf->node_name);
747 	start = _get_local_node_info(&arg, job_node_id, &sockets, &cores);
748 	if (start < 0) {
749 		error("task/affinity: missing node %d in job credential",
750 		      job_node_id);
751 		slurm_cred_free_args(&arg);
752 		return NULL;
753 	}
754 	debug3("task/affinity: slurmctld s %u c %u; hw s %u c %u t %u",
755 	       sockets, cores, *hw_sockets, *hw_cores, *hw_threads);
756 
757 	num_cpus = MIN((sockets * cores), ((*hw_sockets)*(*hw_cores)));
758 	req_map = (bitstr_t *) bit_alloc(num_cpus);
759 	hw_map  = (bitstr_t *) bit_alloc(conf->block_map_size);
760 
761 	/* Transfer core_bitmap data to local req_map.
762 	 * The MOD function handles the case where fewer processes
763 	 * physically exist than are configured (slurmd is out of
764 	 * sync with the slurmctld daemon). */
765 	for (p = 0; p < (sockets * cores); p++) {
766 		if (bit_test(arg.step_core_bitmap, start+p))
767 			bit_set(req_map, (p % num_cpus));
768 	}
769 
770 	str = (char *)bit_fmt_hexmask(req_map);
771 	debug3("task/affinity: job %u.%u core mask from slurmctld: %s",
772 		req->job_id, req->job_step_id, str);
773 	xfree(str);
774 
775 	for (p = 0; p < num_cpus; p++) {
776 		if (bit_test(req_map, p) == 0)
777 			continue;
778 		/* If the configuration describes a larger system than
779 		   physically exists, wrap the index so we stay within
780 		   the block map.
781 		*/
782 		new_p = p % conf->block_map_size;
783 		/* core_bitmap does not include threads, so we
784 		 * add them here but limit them to what the job
785 		 * requested */
786 		for (t = 0; t < (*hw_threads); t++) {
787 			uint16_t bit = new_p * (*hw_threads) + t;
788 			bit %= conf->block_map_size;
789 			bit_set(hw_map, bit);
790 		}
791 	}
792 
793 	if ((req->job_core_spec != NO_VAL16) &&
794 	    (req->job_core_spec &  CORE_SPEC_THREAD)  &&
795 	    (req->job_core_spec != CORE_SPEC_THREAD)) {
796 		spec_thread_cnt = req->job_core_spec & (~CORE_SPEC_THREAD);
797 	}
798 	if (spec_thread_cnt) {
799 		/* Skip specialized threads as needed */
800 		int i, t, c, s;
801 		for (t = conf->threads - 1;
802 		     ((t >= 0) && (spec_thread_cnt > 0)); t--) {
803 			for (c = conf->cores - 1;
804 			     ((c >= 0) && (spec_thread_cnt > 0)); c--) {
805 				for (s = conf->sockets - 1;
806 				     ((s >= 0) && (spec_thread_cnt > 0)); s--) {
807 					i = s * conf->cores + c;
808 					i = (i * conf->threads) + t;
809 					bit_clear(hw_map, i);
810 					spec_thread_cnt--;
811 				}
812 			}
813 		}
814 	}
815 
816 	str = (char *)bit_fmt_hexmask(hw_map);
817 	debug3("task/affinity: job %u.%u CPU final mask for local node: %s",
818 		req->job_id, req->job_step_id, str);
819 	xfree(str);
820 
821 	FREE_NULL_BITMAP(req_map);
822 	slurm_cred_free_args(&arg);
823 	return hw_map;
824 }
825 
826 /* helper function for _expand_masks() */
827 static void _blot_mask(bitstr_t *mask, bitstr_t *avail_map, uint16_t blot)
828 {
829 	uint16_t i, j, size = 0;
830 	int prev = -1;
831 
832 	if (!mask)
833 		return;
834 	size = bit_size(mask);
835 	for (i = 0; i < size; i++) {
836 		if (bit_test(mask, i)) {
837 			/* fill in this blot */
838 			uint16_t start = (i / blot) * blot;
839 			if (start != prev) {
840 				for (j = start; j < start + blot; j++) {
841 					if (bit_test(avail_map, j))
842 						bit_set(mask, j);
843 				}
844 				prev = start;
845 			}
846 		}
847 	}
848 }
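
/*
 * Illustrative sketch (not part of Slurm): the index arithmetic behind
 * _blot_mask().  A set bit is expanded to the whole group ("blot") of size
 * `blot` that contains it by rounding the index down to the group boundary.
 * The numbers below are hypothetical.
 */
#if 0
#include <stdio.h>

int main(void)
{
	const int blot = 4;			/* e.g. threads per core */
	for (int i = 0; i < 12; i++) {
		int start = (i / blot) * blot;	/* first index of i's group */
		printf("bit %2d -> group [%d..%d]\n", i, start, start + blot - 1);
	}
	return 0;
}
#endif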
849 
850 /* helper function for _expand_masks()
851  * for each task, consider which other bits are set in avail_map
852  * on the same socket */
853 static void _blot_mask_sockets(const uint32_t maxtasks, const uint32_t task,
854 			       bitstr_t **masks, uint16_t hw_sockets,
855 			       uint16_t hw_cores, uint16_t hw_threads,
856 			       bitstr_t *avail_map)
857 {
858   	uint16_t i, j, size = 0;
859 	int blot;
860 
861 	if (!masks[task])
862  		return;
863 
864 	blot = bit_size(avail_map) / hw_sockets;
865 	if (blot <= 0)
866 		blot = 1;
867 	size = bit_size(masks[task]);
868 	for (i = 0; i < size; i++) {
869 		if (bit_test(masks[task], i)) {
870 			/* check if other bits are set in avail_map on this
871 			 * socket and set each corresponding bit in masks */
872 			uint16_t start = (i / blot) * blot;
873 			for (j = start; j < start+blot; j++) {
874 				if (bit_test(avail_map, j))
875 					bit_set(masks[task], j);
876 			}
877 		}
878 	}
879 }
880 
881 /* for each mask, expand the mask around the set bits to include the
882  * complete resource to which the set bits are to be bound */
883 static void _expand_masks(uint16_t cpu_bind_type, const uint32_t maxtasks,
884 			  bitstr_t **masks, uint16_t hw_sockets,
885 			  uint16_t hw_cores, uint16_t hw_threads,
886 			  bitstr_t *avail_map)
887 {
888 	uint32_t i;
889 
890 	if (cpu_bind_type & CPU_BIND_TO_THREADS)
891 		return;
892 	if (cpu_bind_type & CPU_BIND_TO_CORES) {
893 		if (hw_threads < 2)
894 			return;
895 		for (i = 0; i < maxtasks; i++) {
896 			_blot_mask(masks[i], avail_map, hw_threads);
897 		}
898 		return;
899 	}
900 	if (cpu_bind_type & CPU_BIND_TO_SOCKETS) {
901 		if (hw_threads*hw_cores < 2)
902 			return;
903 		for (i = 0; i < maxtasks; i++) {
904    			_blot_mask_sockets(maxtasks, i, masks, hw_sockets,
905 					   hw_cores, hw_threads, avail_map);
906 		}
907 		return;
908 	}
909 }
910 
911 /*
912  * _task_layout_lllp_cyclic
913  *
914  * task_layout_lllp_cyclic creates a cyclic distribution at the
915  * lowest level of logical processor which is either socket, core or
916  * thread depending on the system architecture. The Cyclic algorithm
917  * is the same as the Cyclic distribution performed in srun.
918  *
919  *  Distribution at the lllp:
920  *  -m hostfile|block|cyclic:block|cyclic
921  *
922  * The first distribution "hostfile|block|cyclic" is computed
923  * in srun. The second distribution "block|cyclic" is computed
924  * locally by each slurmd.
925  *
926  * The input to the lllp distribution algorithms is the gids (task
927  * ids) generated for the local node.
928  *
929  * The output is a mapping of the gids onto logical processors
930  * (thread/core/socket) which is expressed as cpu_bind masks.
931  *
932  * If a task asks for more than one CPU per task, place the CPUs as
933  * close as possible (fill the core rather than moving to the next
934  * socket for the extra CPU).
935  *
936  */
937 static int _task_layout_lllp_cyclic(launch_tasks_request_msg_t *req,
938 				    uint32_t node_id, bitstr_t ***masks_p)
939 {
940 	int last_taskcount = -1, taskcount = 0;
941 	uint16_t i, s, hw_sockets = 0, hw_cores = 0, hw_threads = 0;
942 	uint16_t offset = 0, p = 0;
943 	int size, max_tasks = req->tasks_to_launch[(int)node_id];
944 	int max_cpus = max_tasks * req->cpus_per_task;
945 	bitstr_t *avail_map;
946 	bitstr_t **masks = NULL;
947 	int *socket_last_pu = NULL;
948 	int core_inx, pu_per_core, *core_tasks = NULL;
949 
950 	info ("_task_layout_lllp_cyclic ");
951 
952 	avail_map = _get_avail_map(req, &hw_sockets, &hw_cores, &hw_threads);
953 	if (!avail_map)
954 		return SLURM_ERROR;
955 
956 	size = bit_set_count(avail_map);
957 	if (size < max_tasks) {
958 		error("task/affinity: only %d bits in avail_map for %d tasks!",
959 		      size, max_tasks);
960 		FREE_NULL_BITMAP(avail_map);
961 		return SLURM_ERROR;
962 	}
963 	if (size < max_cpus) {
964 		/* Possible result of overcommit */
965 		i = size / max_tasks;
966 		info("task/affinity: reset cpus_per_task from %d to %d",
967 		     req->cpus_per_task, i);
968 		req->cpus_per_task = i;
969 	}
970 
971 	pu_per_core = hw_threads;
972 	core_tasks = xmalloc(sizeof(int) * hw_sockets * hw_cores);
973 	socket_last_pu = xmalloc(hw_sockets * sizeof(int));
974 
975 	*masks_p = xmalloc(max_tasks * sizeof(bitstr_t*));
976 	masks = *masks_p;
977 
978 	size = bit_size(avail_map);
979 
980 	offset = hw_cores * hw_threads;
981 	s = 0;
982 	while (taskcount < max_tasks) {
983 		if (taskcount == last_taskcount)
984 			fatal("_task_layout_lllp_cyclic failure");
985 		last_taskcount = taskcount;
986 		for (i = 0; i < size; i++) {
987 			bool already_switched = false;
988 			uint16_t bit;
989 			uint16_t orig_s = s;
990 
991 			while (socket_last_pu[s] >= offset) {
992 				/* Switch to the next socket; we have
993 				 * run out of PUs on this one. */
994 
995 				/* This only happens if the slurmctld
996 				 * gave us an allocation that made a
997 				 * task split sockets.  Or if the
998 				 * entire allocation is on one socket.
999 				 */
1000 				s = (s + 1) % hw_sockets;
1001 				if (orig_s == s) {
1002 					/* This should rarely happen,
1003 					 * but is here for sanity's sake.
1004 					 */
1005 					debug("allocation is full, "
1006 					      "oversubscribing");
1007 					memset(core_tasks, 0,
1008 					       (sizeof(int) *
1009 					        hw_sockets * hw_cores));
1010 					memset(socket_last_pu, 0,
1011 					       (sizeof(int) * hw_sockets));
1012 				}
1013 			}
1014 
1015 			bit = socket_last_pu[s] + (s * offset);
1016 
1017 			/* In case hardware and config differ */
1018 			bit %= size;
1019 
1020 			/* set up for the next one */
1021 			socket_last_pu[s]++;
1022 			/* skip unrequested threads */
1023 			if (req->cpu_bind_type & CPU_BIND_ONE_THREAD_PER_CORE)
1024 				socket_last_pu[s] += hw_threads - 1;
1025 
1026 			if (!bit_test(avail_map, bit))
1027 				continue;
1028 
1029 			core_inx = bit / pu_per_core;
1030 			if ((req->ntasks_per_core != 0) &&
1031 			    (core_tasks[core_inx] >= req->ntasks_per_core))
1032 				continue;
1033 
1034 			if (!masks[taskcount])
1035 				masks[taskcount] =
1036 					bit_alloc(conf->block_map_size);
1037 
1038 			//info("setting %d %d", taskcount, bit);
1039 			bit_set(masks[taskcount], bit);
1040 
1041 			if (!already_switched &&
1042 			    (((req->task_dist & SLURM_DIST_NODESOCKMASK) ==
1043 			     SLURM_DIST_CYCLIC_CFULL) ||
1044 			    ((req->task_dist & SLURM_DIST_NODESOCKMASK) ==
1045 			     SLURM_DIST_BLOCK_CFULL))) {
1046 				/* This means we are laying out cpus
1047 				 * within a task cyclically as well. */
1048 				s = (s + 1) % hw_sockets;
1049 				already_switched = true;
1050 			}
1051 
1052 			if (++p < req->cpus_per_task)
1053 				continue;
1054 
1055 			core_tasks[core_inx]++;
1056 
1057 			/* Binding to cores, skip the remaining threads */
1058 			if (!(req->cpu_bind_type & CPU_BIND_ONE_THREAD_PER_CORE)
1059 			    && ((req->cpu_bind_type & CPU_BIND_TO_CORES)
1060 				|| (req->ntasks_per_core == 1))) {
1061 				int threads_not_used;
1062 				if (req->cpus_per_task < hw_threads)
1063 					threads_not_used =
1064 						hw_threads - req->cpus_per_task;
1065 				else
1066 					threads_not_used =
1067 						req->cpus_per_task % hw_threads;
1068 				socket_last_pu[s] += threads_not_used;
1069 			}
1070 			p = 0;
1071 
1072 			if (!already_switched) {
1073 				/* Now that we have finished a task, switch to
1074 				 * the next socket. */
1075 				s = (s + 1) % hw_sockets;
1076 			}
1077 
1078 			if (++taskcount >= max_tasks)
1079 				break;
1080 		}
1081 	}
1082 
1083 	/* last step: expand the masks to bind each task
1084 	 * to the requested resource */
1085 	_expand_masks(req->cpu_bind_type, max_tasks, masks,
1086 		      hw_sockets, hw_cores, hw_threads, avail_map);
1087 	FREE_NULL_BITMAP(avail_map);
1088 	xfree(core_tasks);
1089 	xfree(socket_last_pu);
1090 
1091 	return SLURM_SUCCESS;
1092 }
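
/*
 * Illustrative sketch (not part of Slurm): the core idea of the cyclic
 * layout -- keep a per-socket cursor and hand the next free PU of a
 * different socket to each successive task.  The real
 * _task_layout_lllp_cyclic() additionally honors cpus_per_task,
 * ntasks_per_core, availability bits and one-thread-per-core; the geometry
 * below is hypothetical.
 */
#if 0
#include <stdio.h>

int main(void)
{
	const int sockets = 2, pus_per_socket = 4, ntasks = 6;
	int cursor[2] = { 0, 0 };	/* next unused PU on each socket */
	int s = 0;

	for (int task = 0; task < ntasks; task++) {
		/* wrap to reuse PUs if the socket is exhausted */
		if (cursor[s] >= pus_per_socket)
			cursor[s] = 0;
		int pu = s * pus_per_socket + cursor[s]++;
		printf("task %d -> pu %d (socket %d)\n", task, pu, s);
		s = (s + 1) % sockets;	/* next task goes to the next socket */
	}
	return 0;
}
#endif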
1093 
1094 /*
1095  * _task_layout_lllp_block
1096  *
1097  * task_layout_lllp_block will create a block distribution at the
1098  * lowest level of logical processor which is either socket, core or
1099  * thread depending on the system architecture. The Block algorithm
1100  * is the same as the Block distribution performed in srun.
1101  *
1102  *  Distribution at the lllp:
1103  *  -m hostfile|plane|block|cyclic:block|cyclic
1104  *
1105  * The first distribution "hostfile|plane|block|cyclic" is computed
1106  * in srun. The second distribution "plane|block|cyclic" is computed
1107  * locally by each slurmd.
1108  *
1109  * The input to the lllp distribution algorithms is the gids (task
1110  * ids) generated for the local node.
1111  *
1112  * The output is a mapping of the gids onto logical processors
1113  * (thread/core/socket) which is expressed as cpu_bind masks.
1114  *
1115  */
1116 static int _task_layout_lllp_block(launch_tasks_request_msg_t *req,
1117 				   uint32_t node_id, bitstr_t ***masks_p)
1118 {
1119 	int c, i, size, last_taskcount = -1, taskcount = 0;
1120 	uint16_t hw_sockets = 0, hw_cores = 0, hw_threads = 0;
1121 	int max_tasks = req->tasks_to_launch[(int)node_id];
1122 	int max_cpus = max_tasks * req->cpus_per_task;
1123 	bitstr_t *avail_map;
1124 	bitstr_t **masks = NULL;
1125 	int core_inx, pu_per_core, *core_tasks = NULL;
1126 	int sock_inx, pu_per_socket, *socket_tasks = NULL;
1127 
1128 	info("_task_layout_lllp_block ");
1129 
1130 	avail_map = _get_avail_map(req, &hw_sockets, &hw_cores, &hw_threads);
1131 	if (!avail_map) {
1132 		return SLURM_ERROR;
1133 	}
1134 
1135 	size = bit_set_count(avail_map);
1136 	if ((req->cpu_bind_type & CPU_BIND_ONE_THREAD_PER_CORE) &&
1137 	    (size < (req->cpus_per_task * hw_threads))) {
1138 		error("task/affinity: only %d bits in avail_map, CPU_BIND_ONE_THREAD_PER_CORE requires %d!",
1139 		      size, (req->cpus_per_task * hw_threads));
1140 		FREE_NULL_BITMAP(avail_map);
1141 		return SLURM_ERROR;
1142 	}
1143 	if (size < max_tasks) {
1144 		error("task/affinity: only %d bits in avail_map for %d tasks!",
1145 		      size, max_tasks);
1146 		FREE_NULL_BITMAP(avail_map);
1147 		return SLURM_ERROR;
1148 	}
1149 	if (size < max_cpus) {
1150 		/* Possible result of overcommit */
1151 		i = size / max_tasks;
1152 		info("task/affinity: reset cpus_per_task from %d to %d",
1153 		     req->cpus_per_task, i);
1154 		req->cpus_per_task = i;
1155 	}
1156 	size = bit_size(avail_map);
1157 
1158 	if ((req->cpu_bind_type & CPU_BIND_ONE_THREAD_PER_CORE) &&
1159 	    (max_cpus > (hw_sockets * hw_cores))) {
1160 		/* More CPUs requested than available cores;
1161 		 * disable one-thread-per-core binding */
1162 		req->cpu_bind_type &= (~CPU_BIND_ONE_THREAD_PER_CORE);
1163 	}
1164 
1165 	*masks_p = xmalloc(max_tasks * sizeof(bitstr_t*));
1166 	masks = *masks_p;
1167 
1168 	pu_per_core = hw_threads;
1169 	core_tasks = xmalloc(sizeof(int) * hw_sockets * hw_cores);
1170 	pu_per_socket = hw_cores * hw_threads;
1171 	socket_tasks = xmalloc(sizeof(int) * hw_sockets);
1172 
1173 	/* block distribution with oversubscription */
1174 	c = 0;
1175 	while (taskcount < max_tasks) {
1176 		if (taskcount == last_taskcount)
1177 			fatal("_task_layout_lllp_block infinite loop");
1178 		if (taskcount > 0) {
1179 			/* Clear counters to over-subscribe, if necessary */
1180 			memset(core_tasks, 0,
1181 			       (sizeof(int) * hw_sockets * hw_cores));
1182 			memset(socket_tasks, 0,
1183 			       (sizeof(int) * hw_sockets));
1184 		}
1185 		last_taskcount = taskcount;
1186 		/* the abstract map is already laid out in block order,
1187 		 * so just iterate over it
1188 		 */
1189 		for (i = 0; i < size; i++) {
1190 			/* skip unavailable resources */
1191 			if (bit_test(avail_map, i) == 0)
1192 				continue;
1193 
1194 			core_inx = i / pu_per_core;
1195 			if ((req->ntasks_per_core != 0) &&
1196 			    (core_tasks[core_inx] >= req->ntasks_per_core))
1197 				continue;
1198 			sock_inx = i / pu_per_socket;
1199 			if ((req->ntasks_per_socket != 0) &&
1200 			    (socket_tasks[sock_inx] >= req->ntasks_per_socket))
1201 				continue;
1202 
1203 			if (!masks[taskcount])
1204 				masks[taskcount] = bit_alloc(
1205 					conf->block_map_size);
1206 			//info("setting %d %d", taskcount, i);
1207 			bit_set(masks[taskcount], i);
1208 
1209 			/* skip unrequested threads */
1210 			if (req->cpu_bind_type & CPU_BIND_ONE_THREAD_PER_CORE)
1211 				i += hw_threads - 1;
1212 
1213 			if (++c < req->cpus_per_task)
1214 				continue;
1215 
1216 			/* We found one! Increment the count on each unit */
1217 			core_tasks[core_inx]++;
1218 			socket_tasks[sock_inx]++;
1219 
1220 			/* Binding to cores, skip the remaining threads */
1221 			if (!(req->cpu_bind_type & CPU_BIND_ONE_THREAD_PER_CORE)
1222 			    && ((req->cpu_bind_type & CPU_BIND_TO_CORES)
1223 				|| (req->ntasks_per_core == 1))) {
1224 				int threads_not_used;
1225 				if (req->cpus_per_task < hw_threads)
1226 					threads_not_used =
1227 						hw_threads - req->cpus_per_task;
1228 				else
1229 					threads_not_used =
1230 						req->cpus_per_task % hw_threads;
1231 				i += threads_not_used;
1232 			}
1233 			c = 0;
1234 			if (++taskcount >= max_tasks)
1235 				break;
1236 		}
1237 	}
1238 	xfree(core_tasks);
1239 	xfree(socket_tasks);
1240 
1241 	/* last step: expand the masks to bind each task
1242 	 * to the requested resource */
1243 	_expand_masks(req->cpu_bind_type, max_tasks, masks,
1244 			hw_sockets, hw_cores, hw_threads, avail_map);
1245 	FREE_NULL_BITMAP(avail_map);
1246 
1247 	return SLURM_SUCCESS;
1248 }
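
/*
 * Illustrative sketch (not part of Slurm): the core idea of the block
 * layout -- walk the PUs in order and give each task the next cpus_per_task
 * of them, skipping cores that already hold ntasks_per_core tasks.  The real
 * _task_layout_lllp_block() also checks availability bits, ntasks_per_socket
 * and the one-thread-per-core flag; the geometry here is hypothetical.
 */
#if 0
#include <stdio.h>

int main(void)
{
	const int cores = 4, threads = 2, ntasks = 4;
	const int cpus_per_task = 1, ntasks_per_core = 1;
	int core_tasks[4] = { 0 };
	int task = 0, c = 0;

	for (int pu = 0; pu < cores * threads && task < ntasks; pu++) {
		int core = pu / threads;
		if (core_tasks[core] >= ntasks_per_core)
			continue;	/* core already full */
		printf("task %d -> pu %d (core %d)\n", task, pu, core);
		if (++c < cpus_per_task)
			continue;	/* same task also takes the next PU */
		core_tasks[core]++;
		c = 0;
		task++;
	}
	return 0;
}
#endif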
1249 
1250 /*
1251  * _lllp_map_abstract_mask
1252  *
1253  * Map one abstract block mask to a physical machine mask
1254  *
1255  * IN - mask to map
1256  * OUT - mapped mask (storage allocated in this routine)
1257  */
1258 static bitstr_t *_lllp_map_abstract_mask(bitstr_t *bitmask)
1259 {
1260     	int i, bit;
1261 	int num_bits = bit_size(bitmask);
1262 	bitstr_t *newmask = NULL;
1263 	newmask = (bitstr_t *) bit_alloc(num_bits);
1264 
1265 	/* remap to physical machine */
1266 	for (i = 0; i < num_bits; i++) {
1267 		if (bit_test(bitmask,i)) {
1268 			bit = BLOCK_MAP(i);
1269 			if (bit < bit_size(newmask))
1270 				bit_set(newmask, bit);
1271 			else
1272 				error("%s: can't go from %d -> %d since we "
1273 				      "only have %"BITSTR_FMT" bits",
1274 				      __func__, i, bit, bit_size(newmask));
1275 		}
1276 	}
1277 	return newmask;
1278 }
1279 
1280 /*
1281  * _lllp_map_abstract_masks
1282  *
1283  * Map an array of abstract block masks to physical machine masks
1284  *
1285  * IN- maximum number of tasks
1286  * IN/OUT- array of masks
1287  */
1288 static void _lllp_map_abstract_masks(const uint32_t maxtasks, bitstr_t **masks)
1289 {
1290     	int i;
1291 	debug3("_lllp_map_abstract_masks");
1292 
1293 	for (i = 0; i < maxtasks; i++) {
1294 		bitstr_t *bitmask = masks[i];
1295 	    	if (bitmask) {
1296 			bitstr_t *newmask = _lllp_map_abstract_mask(bitmask);
1297 			FREE_NULL_BITMAP(bitmask);
1298 			masks[i] = newmask;
1299 		}
1300 	}
1301 }
1302 
1303 /*
1304  * _lllp_generate_cpu_bind
1305  *
1306  * Generate the cpu_bind type and string given an array of bitstr_t masks
1307  *
1308  * IN/OUT- job launch request (cpu_bind_type and cpu_bind updated)
1309  * IN- maximum number of tasks
1310  * IN- array of masks
1311  */
1312 static void _lllp_generate_cpu_bind(launch_tasks_request_msg_t *req,
1313 				    const uint32_t maxtasks, bitstr_t **masks)
1314 {
1315     	int i, num_bits=0, masks_len;
1316 	bitstr_t *bitmask;
1317 	bitoff_t charsize;
1318 	char *masks_str = NULL;
1319 	char buf_type[100];
1320 
1321 	for (i = 0; i < maxtasks; i++) {
1322 		bitmask = masks[i];
1323 	    	if (bitmask) {
1324 			num_bits = bit_size(bitmask);
1325 			break;
1326 		}
1327 	}
1328 	charsize = (num_bits + 3) / 4;		/* ASCII hex digits */
1329 	charsize += 3;				/* "0x" and trailing "," */
1330 	masks_len = maxtasks * charsize + 1;	/* number of masks + null */
1331 
1332 	debug3("%s %d %"BITSTR_FMT" %d", __func__, maxtasks, charsize,
1333 		masks_len);
1334 
1335 	masks_str = xmalloc(masks_len);
1336 	masks_len = 0;
1337 	for (i = 0; i < maxtasks; i++) {
1338 	    	char *str;
1339 		int curlen;
1340 		bitmask = masks[i];
1341 	    	if (bitmask == NULL) {
1342 			continue;
1343 		}
1344 		str = (char *)bit_fmt_hexmask(bitmask);
1345 		curlen = strlen(str) + 1;
1346 
1347 		if (masks_len > 0)
1348 			masks_str[masks_len-1]=',';
1349 		strlcpy(&masks_str[masks_len], str, curlen);
1350 		masks_len += curlen;
1351 		xfree(str);
1352 	}
1353 
1354 	if (req->cpu_bind) {
1355 	    	xfree(req->cpu_bind);
1356 	}
1357 	if (masks_str[0] != '\0') {
1358 		req->cpu_bind = masks_str;
1359 		req->cpu_bind_type |= CPU_BIND_MASK;
1360 	} else {
1361 		req->cpu_bind = NULL;
1362 		req->cpu_bind_type &= ~CPU_BIND_VERBOSE;
1363 	}
1364 
1365 	/* clear mask generation bits */
1366 	req->cpu_bind_type &= ~CPU_BIND_TO_THREADS;
1367 	req->cpu_bind_type &= ~CPU_BIND_TO_CORES;
1368 	req->cpu_bind_type &= ~CPU_BIND_TO_SOCKETS;
1369 	req->cpu_bind_type &= ~CPU_BIND_TO_LDOMS;
1370 
1371 	slurm_sprint_cpu_bind_type(buf_type, req->cpu_bind_type);
1372 	info("_lllp_generate_cpu_bind jobid [%u]: %s, %s",
1373 	     req->job_id, buf_type, masks_str);
1374 }
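
/*
 * Illustrative sketch (not part of Slurm): building the comma-separated
 * "0x..." mask list that _lllp_generate_cpu_bind() stores in req->cpu_bind,
 * here from plain 64-bit masks instead of bitstr_t (so it only covers nodes
 * with at most 64 CPUs).  The per-task masks are hypothetical.
 */
#if 0
#include <inttypes.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	const uint64_t task_masks[] = { 0x3, 0xc, 0x30 };	/* 3 tasks */
	const int ntasks = 3;
	char cpu_bind[256] = "";

	for (int i = 0; i < ntasks; i++) {
		char one[32];
		snprintf(one, sizeof(one), "%s0x%" PRIx64,
			 (i ? "," : ""), task_masks[i]);
		strncat(cpu_bind, one, sizeof(cpu_bind) - strlen(cpu_bind) - 1);
	}
	/* result: "0x3,0xc,0x30", one hex mask per task */
	printf("%s\n", cpu_bind);
	return 0;
}
#endif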
1375