/*****************************************************************************\
 *  cons_common.c - Common function interface for the select/cons_* plugins
 *****************************************************************************
 *  Copyright (C) 2019 SchedMD LLC
 *  Derived in large part from select/cons_[res|tres] plugins
 *
 *  This file is part of Slurm, a resource management program.
 *  For details, see <https://slurm.schedmd.com/>.
 *  Please also read the included file: DISCLAIMER.
 *
 *  Slurm is free software; you can redistribute it and/or modify it under
 *  the terms of the GNU General Public License as published by the Free
 *  Software Foundation; either version 2 of the License, or (at your option)
 *  any later version.
 *
 *  In addition, as a special exception, the copyright holders give permission
 *  to link the code of portions of this program with the OpenSSL library under
 *  certain conditions as described in each individual source file, and
 *  distribute linked combinations including the two. You must obey the GNU
 *  General Public License in all respects for all of the code used other than
 *  OpenSSL. If you modify file(s) with this exception, you may extend this
 *  exception to your version of the file(s), but you are not obligated to do
 *  so. If you do not wish to do so, delete this exception statement from your
 *  version.  If you delete this exception statement from all source files in
 *  the program, then also delete it here.
 *
 *  Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
 *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
 *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
 *  details.
 *
 *  You should have received a copy of the GNU General Public License along
 *  with Slurm; if not, write to the Free Software Foundation, Inc.,
 *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
\*****************************************************************************/

#define _GNU_SOURCE

#include "src/common/slurm_xlator.h"

#include "cons_common.h"

#include "src/common/assoc_mgr.h"
#include "src/common/slurm_selecttype_info.h"
#include "src/common/slurm_topology.h"

/* These are defined here so when we link with something other than
 * the slurmctld we will have these symbols defined.  They will get
 * overwritten when linking with the slurmctld.
 */
#if defined (__APPLE__)
extern slurm_ctl_conf_t slurmctld_conf __attribute__((weak_import));
extern node_record_t *node_record_table_ptr __attribute__((weak_import));
extern List part_list __attribute__((weak_import));
extern List job_list __attribute__((weak_import));
extern int node_record_count __attribute__((weak_import));
extern time_t last_node_update __attribute__((weak_import));
extern switch_record_t *switch_record_table __attribute__((weak_import));
extern int switch_record_cnt __attribute__((weak_import));
extern bitstr_t *avail_node_bitmap __attribute__((weak_import));
extern uint16_t *cr_node_num_cores __attribute__((weak_import));
extern uint32_t *cr_node_cores_offset __attribute__((weak_import));
extern int slurmctld_tres_cnt __attribute__((weak_import));
extern slurmctld_config_t slurmctld_config __attribute__((weak_import));
extern bitstr_t *idle_node_bitmap __attribute__((weak_import));
#else
slurm_ctl_conf_t slurmctld_conf;
node_record_t *node_record_table_ptr;
List part_list;
List job_list;
int node_record_count;
time_t last_node_update;
switch_record_t *switch_record_table;
int switch_record_cnt;
bitstr_t *avail_node_bitmap;
uint16_t *cr_node_num_cores;
uint32_t *cr_node_cores_offset;
int slurmctld_tres_cnt = 0;
slurmctld_config_t slurmctld_config;
bitstr_t *idle_node_bitmap;
#endif

/* init common global variables */
bool     backfill_busy_nodes  = false;
int      bf_window_scale      = 0;
cons_common_callbacks_t cons_common_callbacks = {0};
int      core_array_size      = 1;
uint16_t cr_type              = CR_CPU; /* cr_type is overwritten in init() */
bool     gang_mode            = false;
bool     have_dragonfly       = false;
bool     is_cons_tres         = false;
bool     pack_serial_at_end   = false;
bool     preempt_by_part      = false;
bool     preempt_by_qos       = false;
uint16_t priority_flags       = 0;
uint64_t select_debug_flags   = 0;
int      select_node_cnt      = 0;
bool     spec_cores_first     = false;
bool     topo_optional        = false;

/* Global variables */

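/*
 * Build an empty job_resources structure for node_cnt allocated nodes.
 * The per-node arrays created here are indexed by the job's node offset
 * (0 .. nhosts-1), not by the global node index.
 */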
static job_resources_t *_create_job_resources(int node_cnt)
{
	job_resources_t *job_resrcs_ptr;

	job_resrcs_ptr = create_job_resources();
	job_resrcs_ptr->cpu_array_reps = xcalloc(node_cnt, sizeof(uint32_t));
	job_resrcs_ptr->cpu_array_value = xcalloc(node_cnt, sizeof(uint16_t));
	job_resrcs_ptr->cpus = xcalloc(node_cnt, sizeof(uint16_t));
	job_resrcs_ptr->cpus_used = xcalloc(node_cnt, sizeof(uint16_t));
	job_resrcs_ptr->memory_allocated = xcalloc(node_cnt, sizeof(uint64_t));
	job_resrcs_ptr->memory_used = xcalloc(node_cnt, sizeof(uint64_t));
	job_resrcs_ptr->nhosts = node_cnt;
	return job_resrcs_ptr;
}

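/*
 * Return the number of cores on the node that are not marked in exc_bitmap
 * (cons_tres: per-node bitmap array; cons_res: single global core bitmap).
 * E.g., tot_cores = 8 with 2 excluded cores yields 6; a NULL exc_bitmap
 * yields all 8.
 */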
static int _get_avail_cores_on_node(int node_inx, bitstr_t **exc_bitmap)
{
	int exc_cnt = 0, tot_cores;

	xassert(node_inx <= select_node_cnt);

	tot_cores = select_node_record[node_inx].tot_cores;

	if (!exc_bitmap)
		return tot_cores;

	if (is_cons_tres) {
		if (exc_bitmap[node_inx])
			exc_cnt += bit_set_count(exc_bitmap[node_inx]);
	} else if (*exc_bitmap) {
		int coff = cr_get_coremap_offset(node_inx);
		for (int i = 0; i < tot_cores; i++) {
			if (bit_test(*exc_bitmap, coff + i))
				exc_cnt++;
		}
	}
	return tot_cores - exc_cnt;
}

extern char *common_node_state_str(uint16_t node_state)
{
	if (node_state >= NODE_CR_RESERVED)
		return "reserved";	/* Exclusive allocation */
	if (node_state >= NODE_CR_ONE_ROW)
		return "one_row";	/* Dedicated core for this partition */
	return "available";		/* Idle or in-use (shared) */
}

static void _dump_job_res(struct job_resources *job)
{
	char str[64];

	if (job->core_bitmap)
		bit_fmt(str, sizeof(str), job->core_bitmap);
	else
		sprintf(str, "[no core_bitmap]");
	info("DEBUG: Dump job_resources: nhosts %u core_bitmap %s",
	     job->nhosts, str);
}

/*
 * _allocate_sc - Given the job requirements, determine which CPUs/cores
 *                from the given node can be allocated (if any) to this
 *                job. Returns structure identifying the usable resources and
 *                a bitmap of the available cores.
 *
 * IN job_ptr       - pointer to job requirements
 * IN/OUT core_map  - core_bitmap of available cores on this node
 * IN part_core_map - bitmap of cores already allocated on this partition/node
 * IN node_i        - index of node to be evaluated
 * IN/OUT cpu_alloc_size - minimum allocation size, in CPUs
 * IN entire_sockets_only - if true, allocate cores only on sockets that
 *                          have no other allocated cores.
 * IN req_sock_map - OPTIONAL bitmap of required sockets
 * RET resource availability structure, call common_free_avail_res() to free
 */
static avail_res_t *_allocate_sc(job_record_t *job_ptr, bitstr_t *core_map,
				 bitstr_t *part_core_map, const uint32_t node_i,
				 int *cpu_alloc_size, bool entire_sockets_only,
				 bitstr_t *req_sock_map)
{
	uint16_t cpu_count = 0, cpu_cnt = 0, part_cpu_limit = 0xffff;
	uint16_t si, cps, avail_cpus = 0, num_tasks = 0;
	uint32_t c;
	uint32_t core_begin;
	uint32_t core_end;
	struct job_details *details_ptr = job_ptr->details;
	uint16_t cpus_per_task = details_ptr->cpus_per_task;
	uint16_t free_core_count = 0, spec_threads = 0;
	uint16_t i, j;
	uint16_t sockets = select_node_record[node_i].tot_sockets;
	uint16_t cores_per_socket = select_node_record[node_i].cores;
	uint16_t threads_per_core = select_node_record[node_i].vpus;
	uint16_t min_cores = 1, min_sockets = 1, ntasks_per_socket = 0;
	uint16_t ncpus_per_core = 0xffff;	/* Usable CPUs per core */
	uint16_t ntasks_per_core = 0xffff;
	uint32_t free_cpu_count = 0, used_cpu_count = 0;
	int tmp_cpt = 0; /* cpus_per_task */
	uint16_t free_cores[sockets];
	uint16_t used_cores[sockets];
	uint32_t used_cpu_array[sockets];
	avail_res_t *avail_res;


	if (is_cons_tres) {
		core_begin = 0;
		core_end = select_node_record[node_i].tot_cores;
	} else {
		core_begin = cr_get_coremap_offset(node_i);
		core_end = cr_get_coremap_offset(node_i+1);
	}
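
	/*
	 * Layout note: cons_tres keeps one core bitmap per node, so indices
	 * run 0 .. tot_cores-1; cons_res uses a single system-wide core
	 * bitmap, so this node's cores occupy the half-open range
	 * [cr_get_coremap_offset(node_i), cr_get_coremap_offset(node_i+1)).
	 */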

	memset(free_cores, 0, sockets * sizeof(uint16_t));
	memset(used_cores, 0, sockets * sizeof(uint16_t));
	memset(used_cpu_array, 0, sockets * sizeof(uint32_t));

	if (entire_sockets_only && details_ptr->whole_node &&
	    (details_ptr->core_spec != NO_VAL16)) {
		/* Ignore specialized cores when allocating "entire" socket */
		entire_sockets_only = false;
	}
	if (details_ptr->mc_ptr) {
		uint32_t threads_per_socket;
		multi_core_data_t *mc_ptr = details_ptr->mc_ptr;
		if (mc_ptr->cores_per_socket != NO_VAL16) {
			min_cores = mc_ptr->cores_per_socket;
		}
		if (mc_ptr->sockets_per_node != NO_VAL16) {
			min_sockets = mc_ptr->sockets_per_node;
		}
		if ((mc_ptr->ntasks_per_core != INFINITE16) &&
		    (mc_ptr->ntasks_per_core)) {
			ntasks_per_core = mc_ptr->ntasks_per_core;
			ncpus_per_core = MIN(threads_per_core,
					     (ntasks_per_core * cpus_per_task));
		}
		if ((mc_ptr->threads_per_core != NO_VAL16) &&
		    (mc_ptr->threads_per_core <  ncpus_per_core)) {
			ncpus_per_core = mc_ptr->threads_per_core;
		}
		*cpu_alloc_size = MIN(*cpu_alloc_size, ncpus_per_core);
		ntasks_per_socket = mc_ptr->ntasks_per_socket;

		if ((ncpus_per_core != NO_VAL16) &&
		    (ncpus_per_core != INFINITE16) &&
		    (ncpus_per_core > threads_per_core)) {
			goto fini;
		}
		threads_per_socket = threads_per_core * cores_per_socket;
		if ((ntasks_per_socket != NO_VAL16) &&
		    (ntasks_per_socket != INFINITE16) &&
		    (ntasks_per_socket > threads_per_socket)) {
			goto fini;
		}
	}

	/*
	 * These are the job parameters that we must respect:
	 *
	 *   details_ptr->mc_ptr->cores_per_socket (cr_core|cr_socket)
	 *	- min # of cores per socket to allocate to this job
	 *   details_ptr->mc_ptr->sockets_per_node (cr_core|cr_socket)
	 *	- min # of sockets per node to allocate to this job
	 *   details_ptr->mc_ptr->ntasks_per_core (cr_core|cr_socket)
	 *	- number of tasks to launch per core
	 *   details_ptr->mc_ptr->ntasks_per_socket (cr_core|cr_socket)
	 *	- number of tasks to launch per socket
	 *
	 *   details_ptr->ntasks_per_node (all cr_types)
	 *	- total number of tasks to launch on this node
	 *   details_ptr->cpus_per_task (all cr_types)
	 *	- number of cpus to allocate per task
	 *
	 * These are the hardware constraints:
	 *   cpus = sockets * cores_per_socket * threads_per_core
	 *
	 * These are the cores/sockets that are available: core_map
	 *
	 * NOTE: currently we only allocate at the socket level, the core
	 *       level, or the cpu level. When hyperthreading is enabled
	 *       in the BIOS, then there can be more than one thread/cpu
	 *       per physical core.
	 *
	 * PROCEDURE:
	 *
	 * Step 1: Determine the current usage data: used_cores[],
	 *         used_core_count, free_cores[], free_core_count
	 *
	 * Step 2: For core-level and socket-level: apply sockets_per_node
	 *         and cores_per_socket to the "free" cores.
	 *
	 * Step 3: Compute task-related data: ncpus_per_core,
	 *         ntasks_per_socket, ntasks_per_node and cpus_per_task
	 *         and determine the number of tasks to run on this node
	 *
	 * Step 4: Mark the allocated resources in the job_cores bitmap
	 *         and return "num_tasks" from Step 3.
	 *
	 *
	 * For socket and core counts, start by assuming that all available
	 * resources will be given to the job. Check min_* to ensure that
	 * there are enough resources. Reduce the resource count to match max_*
	 * (if necessary). Also reduce the resource count (if necessary) to
	 * match ntasks_per_resource.
	 */
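
	/*
	 * Worked example with hypothetical numbers: a node with 2 sockets x
	 * 4 cores x 2 threads (16 CPUs) and 3 cores busy on socket 0 gives
	 * free_cores[] = {1, 4} after Step 1. With min_cores = 2, socket 0
	 * is discarded in Step 2 (1 < 2), leaving 4 free cores. A job with
	 * ntasks_per_node = 4 and cpus_per_task = 1 then yields
	 * num_tasks = MIN(8, 4) = 4 and avail_cpus = 4 in Step 3.
	 */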

	/*
	 * Step 1: create and compute core-count-per-socket
	 * arrays and total core counts
	 */
	for (c = core_begin; c < core_end; c++) {
		i = (uint16_t) ((c - core_begin) / cores_per_socket);
		if (bit_test(core_map, c)) {
			free_cores[i]++;
			free_core_count++;
		} else if (!part_core_map) {
			used_cores[i]++;
		} else if (bit_test(part_core_map, c)) {
			used_cores[i]++;
			used_cpu_array[i]++;
		}
	}

	for (i = 0; i < sockets; i++) {
		/*
		 * if a socket is already in use and entire_sockets_only is
		 * enabled, it cannot be used by this job
		 */
		if (entire_sockets_only && used_cores[i]) {
			free_core_count -= free_cores[i];
			used_cores[i] += free_cores[i];
			free_cores[i] = 0;
		}
		free_cpu_count += free_cores[i] * threads_per_core;
		if (used_cpu_array[i])
			used_cpu_count += used_cores[i] * threads_per_core;
	}

	/* Enforce partition CPU limit, but do not pick specific cores yet */
	if ((job_ptr->part_ptr->max_cpus_per_node != INFINITE) &&
	    (free_cpu_count + used_cpu_count >
	     job_ptr->part_ptr->max_cpus_per_node)) {

		if (is_cons_tres) {
			if (used_cpu_count >=
			    job_ptr->part_ptr->max_cpus_per_node) {
				/* no available CPUs on this node */
				num_tasks = 0;
				goto fini;
			}
			part_cpu_limit = job_ptr->part_ptr->max_cpus_per_node -
				used_cpu_count;
			if ((part_cpu_limit == 1) &&
			    (((ntasks_per_core != 0xffff) &&
			      (ntasks_per_core > part_cpu_limit)) ||
			     (ntasks_per_socket > part_cpu_limit) ||
			     ((ncpus_per_core != 0xffff) &&
			      (ncpus_per_core > part_cpu_limit)) ||
			     (cpus_per_task > part_cpu_limit))) {
				/* insufficient available CPUs on this node */
				num_tasks = 0;
				goto fini;
			}
		} else {
			int excess = free_cpu_count + used_cpu_count -
				job_ptr->part_ptr->max_cpus_per_node;
			int min_excess_cores = min_cores;
			int found_cores;
			excess = (excess + threads_per_core - 1) /
				threads_per_core;
			while (excess > 0) {
				int min_free_inx = -1;
				for (i = 0; i < sockets; i++) {
					if (free_cores[i] == 0)
						continue;
					if (((min_excess_cores > 1) ||
					     (min_sockets > 1)) &&
					    (free_cores[i] <= min_excess_cores))
						continue;
					if ((min_free_inx == -1) ||
					    (free_cores[i] <
					     free_cores[min_free_inx]))
						min_free_inx = i;
				}
				if (min_free_inx == -1) {
					if (min_excess_cores) {
						min_excess_cores = 0;
						continue;
					}
					break;
				}
				if (free_cores[min_free_inx] < excess)
					found_cores = free_cores[min_free_inx];
				else
					found_cores = excess;
				if ((min_excess_cores > 1) &&
				    ((free_cores[min_free_inx] - found_cores) <
				     min_excess_cores)) {
					found_cores = free_cores[min_free_inx] -
						min_excess_cores;
				}
				free_core_count -= found_cores;
				free_cpu_count -= (found_cores *
						   threads_per_core);
				free_cores[min_free_inx] -= found_cores;
				excess -= found_cores;
			}
		}
	}

	/* Step 2: check min_cores per socket and min_sockets per node */
	j = 0;
	for (i = 0; i < sockets; i++) {
		if (free_cores[i] < min_cores) {
			/* cannot use this socket */
			free_core_count -= free_cores[i];
			free_cores[i] = 0;
			continue;
		}
		/* count this socket as usable */
		j++;
	}
	if (j < min_sockets) {
		/* cannot use this node */
		num_tasks = 0;
		goto fini;
	}

	if (free_core_count < 1) {
		/* no available resources on this node */
		num_tasks = 0;
		goto fini;
	}

	/*
	 * Step 3: Compute task-related data:
	 *         ntasks_per_socket, ntasks_per_node and cpus_per_task
	 *         to determine the number of tasks to run on this node
	 *
	 * Note: cpus_per_task and ncpus_per_core need to be reconciled here;
	 *       e.g., 2 ntasks_per_core vs. 2 cpus_per_task compete for the
	 *       same threads
	 */
	avail_cpus = 0;
	num_tasks = 0;
	threads_per_core = common_cpus_per_core(details_ptr, node_i);

	for (i = 0; i < sockets; i++) {
		uint16_t tmp = free_cores[i] * threads_per_core;
		if ((tmp == 0) && req_sock_map && bit_test(req_sock_map, i)) {
			/* no available resources on required socket */
			num_tasks = 0;
			goto fini;
		}
		avail_cpus += tmp;
		if (ntasks_per_socket)
			num_tasks += MIN(tmp, ntasks_per_socket);
		else
			num_tasks += tmp;
	}

	/*
	 * If the job requested exclusive use of the node, skip the MIN()
	 * below since it would prevent us from allocating the entire node.
	 */
	if (details_ptr->ntasks_per_node && details_ptr->share_res)
		num_tasks = MIN(num_tasks, details_ptr->ntasks_per_node);

	if (cpus_per_task < 2) {
		avail_cpus = num_tasks;
	} else if ((ntasks_per_core == 1) &&
		   (cpus_per_task > threads_per_core)) {
		/* find out how many cores a task will use */
		int task_cores = (cpus_per_task + threads_per_core - 1) /
			threads_per_core;
		int task_cpus  = task_cores * threads_per_core;
		/* find out how many tasks can fit on a node */
		int tasks = avail_cpus / task_cpus;
		/* how many cpus the job would use on the node */
		avail_cpus = tasks * task_cpus;
		/* subtract out the extra cpus. */
		avail_cpus -= (tasks * (task_cpus - cpus_per_task));
	} else {
		j = avail_cpus / cpus_per_task;
		num_tasks = MIN(num_tasks, j);
		avail_cpus = num_tasks * cpus_per_task;
	}
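
	/*
	 * Worked example for the branch above (hypothetical numbers):
	 * ntasks_per_core = 1, cpus_per_task = 3, threads_per_core = 2.
	 * Each task occupies task_cores = 2 whole cores (task_cpus = 4).
	 * With avail_cpus = 8, two tasks fit and the job is charged
	 * 2 * 4 - 2 * (4 - 3) = 6 CPUs on this node.
	 */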

	if ((details_ptr->ntasks_per_node &&
	     (num_tasks < details_ptr->ntasks_per_node) &&
	     (details_ptr->overcommit == 0)) ||
	    (details_ptr->pn_min_cpus &&
	     (avail_cpus < details_ptr->pn_min_cpus))) {
		/* insufficient resources on this node */
		num_tasks = 0;
		goto fini;
	}

	/*
	 * Step 4 - make sure that ntasks_per_socket is enforced when
	 *          allocating cores
	 */
	if ((ntasks_per_socket != NO_VAL16) &&
	    (ntasks_per_socket != INFINITE16) &&
	    (ntasks_per_socket >= 1)) {
		cps = ntasks_per_socket;
		if (cpus_per_task > 1)
			cps *= cpus_per_task;
	} else
		cps = cores_per_socket * threads_per_core;

	si = 9999;
	tmp_cpt = cpus_per_task;
	for (c = core_begin; c < core_end && (avail_cpus > 0); c++) {
		if (!bit_test(core_map, c))
			continue;

		/* Socket index */
		i = (uint16_t) ((c - core_begin) / cores_per_socket);
		if (free_cores[i] > 0) {
			/*
			 * this socket has free cores, but make sure we don't
			 * use more than are needed for ntasks_per_socket
			 */
			if (si != i) {	/* Start use of next socket */
				si = i;
				cpu_cnt = threads_per_core;
			} else {	/* Continued use of same socket */
				if (cpu_cnt >= cps) {
					/* do not allocate this core */
					bit_clear(core_map, c);
					continue;
				}
				cpu_cnt += threads_per_core;
			}
			free_cores[i]--;
			/*
			 * Ensure that cpu_count does not exceed avail_cpus
			 * due to hyperthreading; otherwise the selection
			 * logic would provide more CPUs than allowed after
			 * the task-related processing of Step 3.
			 */
			if (avail_cpus >= threads_per_core) {
				int used;
				if (is_cons_tres &&
				    (slurmctld_conf.select_type_param &
				     CR_ONE_TASK_PER_CORE) &&
				    (details_ptr->min_gres_cpu > 0)) {
					used = threads_per_core;
				} else if ((ntasks_per_core == 1) &&
					   (cpus_per_task > threads_per_core)) {
					used = MIN(tmp_cpt, threads_per_core);
				} else
					used = threads_per_core;
				avail_cpus -= used;
				cpu_count  += used;
				if (tmp_cpt <= used)
					tmp_cpt = cpus_per_task;
				else
					tmp_cpt -= used;
			} else {
				cpu_count += avail_cpus;
				avail_cpus = 0;
			}

		} else
			bit_clear(core_map, c);
	}
	/* clear leftovers */
	if (c < core_end)
		bit_nclear(core_map, c, core_end - 1);

fini:
	/* if num_tasks == 0 then clear all bits on this node */
	if (num_tasks == 0) {
		bit_nclear(core_map, core_begin, core_end-1);
		cpu_count = 0;
	}

	if ((details_ptr->core_spec != NO_VAL16) &&
	    (details_ptr->core_spec & CORE_SPEC_THREAD) &&
	    ((select_node_record[node_i].threads == 1) ||
	     (select_node_record[node_i].threads ==
	      select_node_record[node_i].vpus))) {
		/*
		 * NOTE: Does not currently handle the case where Slurm
		 * allocates by core and the thread specialization count
		 * occupies a full core
		 */
		c = details_ptr->core_spec & (~CORE_SPEC_THREAD);
		if (((cpu_count + c) <= select_node_record[node_i].cpus))
			;
		else if (cpu_count > c)
			spec_threads = c;
		else
			spec_threads = cpu_count;
	}
	cpu_count -= spec_threads;

	avail_res = xmalloc(sizeof(avail_res_t));
	avail_res->max_cpus = MIN(cpu_count, part_cpu_limit);

	if (is_cons_tres) {
		avail_res->min_cpus = *cpu_alloc_size;
		avail_res->avail_cores_per_sock =
			xcalloc(sockets, sizeof(uint16_t));
		for (c = core_begin; c < core_end; c++) {
			i = (uint16_t) ((c - core_begin) / cores_per_socket);
			if (bit_test(core_map, c))
				avail_res->avail_cores_per_sock[i]++;
		}
		avail_res->sock_cnt = sockets;
		avail_res->spec_threads = spec_threads;
		avail_res->vpus = select_node_record[node_i].vpus;
	}

	return avail_res;
}

/*
 * Get configured DefCpuPerGPU information from a list
 * (either global or per partition list)
 * Returns NO_VAL64 if configuration parameter not set
 */
extern uint64_t common_get_def_cpu_per_gpu(List job_defaults_list)
{
	uint64_t cpu_per_gpu = NO_VAL64;
	ListIterator iter;
	job_defaults_t *job_defaults;

	if (!job_defaults_list)
		return cpu_per_gpu;

	iter = list_iterator_create(job_defaults_list);
	while ((job_defaults = (job_defaults_t *) list_next(iter))) {
		if (job_defaults->type == JOB_DEF_CPU_PER_GPU) {
			cpu_per_gpu = job_defaults->value;
			break;
		}
	}
	list_iterator_destroy(iter);

	return cpu_per_gpu;
}

/*
 * Get configured DefMemPerGPU information from a list
 * (either global or per partition list)
 * Returns NO_VAL64 if configuration parameter not set
 */
extern uint64_t common_get_def_mem_per_gpu(List job_defaults_list)
{
	uint64_t mem_per_gpu = NO_VAL64;
	ListIterator iter;
	job_defaults_t *job_defaults;

	if (!job_defaults_list)
		return mem_per_gpu;

	iter = list_iterator_create(job_defaults_list);
	while ((job_defaults = (job_defaults_t *) list_next(iter))) {
		if (job_defaults->type == JOB_DEF_MEM_PER_GPU) {
			mem_per_gpu = job_defaults->value;
			break;
		}
	}
	list_iterator_destroy(iter);

	return mem_per_gpu;
}
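
/*
 * Usage sketch for the two getters above (hypothetical caller; assumes
 * the per-partition list is consulted before the global one):
 *
 *	uint64_t cpg = common_get_def_cpu_per_gpu(part_ptr->job_defaults_list);
 *	if (cpg == NO_VAL64)
 *		cpg = common_get_def_cpu_per_gpu(
 *			slurmctld_conf.job_defaults_list);
 */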

extern void common_free_avail_res(avail_res_t *avail_res)
{
	if (!avail_res)
		return;

	xfree(avail_res->avail_cores_per_sock);
	FREE_NULL_LIST(avail_res->sock_gres_list);
	xfree(avail_res);
}

/*
 * Return the number of logical processors usable by a given job on
 * some specified node. Returns 0xffff if no limit.
 */
extern int common_cpus_per_core(struct job_details *details, int node_inx)
{
	uint16_t ncpus_per_core = 0xffff;	/* Usable CPUs per core */
	uint16_t threads_per_core = select_node_record[node_inx].vpus;

	if (is_cons_tres &&
	    (slurmctld_conf.select_type_param & CR_ONE_TASK_PER_CORE) &&
	    (details->min_gres_cpu > 0)) {
		/* May override default of 1 CPU per core */
		uint16_t pu_per_core = 0xffff;	/* Usable CPUs per core */
		uint16_t vpus_per_core = select_node_record[node_inx].vpus;
		return MIN(vpus_per_core, pu_per_core);
	}

	if (details && details->mc_ptr) {
		multi_core_data_t *mc_ptr = details->mc_ptr;
		if ((mc_ptr->ntasks_per_core != INFINITE16) &&
		    (mc_ptr->ntasks_per_core)) {
			ncpus_per_core = MIN(threads_per_core,
					     (mc_ptr->ntasks_per_core *
					      details->cpus_per_task));
		}
		if ((mc_ptr->threads_per_core != NO_VAL16) &&
		    (mc_ptr->threads_per_core <  ncpus_per_core)) {
			ncpus_per_core = mc_ptr->threads_per_core;
		}
	}

	threads_per_core = MIN(threads_per_core, ncpus_per_core);

	return threads_per_core;
}
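
/*
 * Example with hypothetical numbers: on a node with vpus = 2 (two threads
 * per core), a job with ntasks_per_core = 1 and cpus_per_task = 1 gets
 * MIN(2, 1 * 1) = 1 usable CPU per core, i.e. one thread per core is left
 * idle; without any multi-core constraints the full 2 is returned.
 */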

extern void common_init(void)
{
	char *topo_param;

	cr_type = slurmctld_conf.select_type_param;
	if (cr_type)
		verbose("%s loaded with argument %u", plugin_type, cr_type);

	select_debug_flags = slurm_get_debug_flags();

	topo_param = slurm_get_topology_param();
	if (topo_param) {
		if (xstrcasestr(topo_param, "dragonfly"))
			have_dragonfly = true;
		if (xstrcasestr(topo_param, "TopoOptional"))
			topo_optional = true;
		xfree(topo_param);
	}

	priority_flags = slurm_get_priority_flags();

	if (slurm_get_preempt_mode() & PREEMPT_MODE_GANG)
		gang_mode = true;
	else
		gang_mode = false;

	if (plugin_id == SELECT_PLUGIN_CONS_TRES)
		is_cons_tres = true;
}

extern void common_fini(void)
{
	if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE)
		info("%s shutting down ...", plugin_type);
	else
		verbose("%s shutting down ...", plugin_type);

	node_data_destroy(select_node_usage, select_node_record);
	select_node_record = NULL;
	select_node_usage = NULL;
	part_data_destroy_res(select_part_record);
	select_part_record = NULL;
	cr_fini_global_core_data();
}

/*
 * Build a core bitmap array of available cores
 * node_bitmap IN - Nodes available for use
 * core_spec IN - Specialized core specification, NO_VAL16 if none
 * RET core bitmap array, one per node. Use free_core_array() to release memory
 */
extern bitstr_t **common_mark_avail_cores(
	bitstr_t *node_bitmap, uint16_t core_spec)
{
	bitstr_t **avail_cores;
	int from_core, to_core, incr_core, from_sock, to_sock, incr_sock;
	int res_core, res_sock, res_off;
	int n, n_first, n_last;
	int c;
	int rem_core_spec, node_core_spec, thread_spec = 0;
	node_record_t *node_ptr;
	bitstr_t *core_map = NULL;
	uint16_t use_spec_cores = slurmctld_conf.conf_flags & CTL_CONF_ASRU;
	node_res_record_t *node_res_ptr = NULL;
	uint32_t coff;

	if (is_cons_tres) {
		avail_cores = build_core_array();
	} else {
		core_map = bit_alloc(
			cr_get_coremap_offset(bit_size(node_bitmap)));
		avail_cores = build_core_array();
		*avail_cores = core_map;
	}

	if ((core_spec != NO_VAL16) &&
	    (core_spec & CORE_SPEC_THREAD)) {	/* Reserving threads */
		thread_spec = core_spec & (~CORE_SPEC_THREAD);
		core_spec = NO_VAL16;		/* Don't remove cores */
	}

	n_first = bit_ffs(node_bitmap);
	if (n_first != -1)
		n_last = bit_fls(node_bitmap);
	else
		n_last = -2;
	for (n = n_first; n <= n_last; n++) {
		if (!bit_test(node_bitmap, n))
			continue;

		node_res_ptr = &select_node_record[n];
		node_ptr = node_res_ptr->node_ptr;

		if (is_cons_tres) {
			c    = 0;
			coff = node_res_ptr->tot_cores;
			avail_cores[n] = bit_alloc(node_res_ptr->tot_cores);
			core_map = avail_cores[n];
		} else {
			c    = cr_get_coremap_offset(n);
			coff = cr_get_coremap_offset(n+1);
		}

		if ((core_spec != NO_VAL16) &&
		    (core_spec >= node_res_ptr->tot_cores)) {
			bit_clear(node_bitmap, n);
			continue;
		}

		bit_nset(core_map, c, coff - 1);

		/* Job can't over-ride system defaults */
		if (use_spec_cores && (core_spec == 0))
			continue;

		if (thread_spec &&
		    (node_res_ptr->cpus == node_res_ptr->tot_cores))
			/* Each core has one thread, reserve cores here */
			node_core_spec = thread_spec;
		else
			node_core_spec = core_spec;

		/*
		 * remove node's specialized cores accounting toward the
		 * requested limit if allowed by configuration
		 */
		rem_core_spec = node_core_spec;
		if (node_ptr->node_spec_bitmap) {
			for (int i = 0; i < node_res_ptr->tot_cores; i++) {
				if (!bit_test(node_ptr->node_spec_bitmap, i)) {
					bit_clear(core_map, c + i);
					if (!use_spec_cores)
						continue;
					rem_core_spec--;
					if (!rem_core_spec)
						break;
				}
			}
		}

		if (!use_spec_cores || !rem_core_spec ||
		    (node_core_spec == NO_VAL16))
			continue;

		/* if more cores need to be specialized, look for
		 * them in the non-specialized cores */
		if (spec_cores_first) {
			from_core = 0;
			to_core   = node_res_ptr->cores;
			incr_core = 1;
			from_sock = 0;
			to_sock   = node_res_ptr->tot_sockets;
			incr_sock = 1;
		} else {
			from_core = node_res_ptr->cores - 1;
			to_core   = -1;
			incr_core = -1;
			from_sock = node_res_ptr->tot_sockets - 1;
			to_sock   = -1;
			incr_sock = -1;
		}
		for (res_core = from_core;
		     ((rem_core_spec > 0) && (res_core != to_core));
		     res_core += incr_core) {
			for (res_sock = from_sock;
			     ((rem_core_spec > 0) && (res_sock != to_sock));
			     res_sock += incr_sock) {
				res_off = c + res_core +
					(res_sock * node_res_ptr->cores);
				if (!bit_test(core_map, res_off))
					continue;
				bit_clear(core_map, res_off);
				rem_core_spec--;
			}
		}
	}

	return avail_cores;
}
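
/*
 * Usage sketch (hypothetical caller): mark every non-specialized core on
 * the eligible nodes, then release the per-node array when finished:
 *
 *	bitstr_t **avail_cores =
 *		common_mark_avail_cores(node_bitmap, NO_VAL16);
 *	...
 *	free_core_array(&avail_cores);
 */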

/*
 * common_allocate_cores - Given the job requirements, determine which cores
 *                   from the given node can be allocated (if any) to this
 *                   job. Returns the number of cpus that can be used by
 *                   this node AND a bitmap of the selected cores.
 *
 * IN job_ptr       - pointer to job requirements
 * IN/OUT core_map  - core_bitmap of available cores on this node
 * IN part_core_map - bitmap of cores already allocated on this partition/node
 * IN node_i        - index of node to be evaluated
 * IN/OUT cpu_alloc_size - minimum allocation size, in CPUs
 * IN cpu_type      - if true, allocate CPUs rather than cores
 * IN req_sock_map - OPTIONAL bitmap of required sockets
 * RET resource availability structure, call common_free_avail_res() to free
 */
extern avail_res_t *common_allocate_cores(job_record_t *job_ptr,
					  bitstr_t *core_map,
					  bitstr_t *part_core_map,
					  const uint32_t node_i,
					  int *cpu_alloc_size,
					  bool cpu_type,
					  bitstr_t *req_sock_map)
{
	return _allocate_sc(job_ptr, core_map, part_core_map, node_i,
			    cpu_alloc_size, false, req_sock_map);
}

/*
 * common_allocate_sockets - Given the job requirements, determine which sockets
 *                     from the given node can be allocated (if any) to this
 *                     job. Returns the number of cpus that can be used by
 *                     this node AND a core-level bitmap of the selected
 *                     sockets.
 *
 * IN job_ptr       - pointer to job requirements
 * IN/OUT core_map  - core_bitmap of available cores on this node
 * IN part_core_map - bitmap of cores already allocated on this partition/node
 * IN node_i        - index of node to be evaluated
 * IN/OUT cpu_alloc_size - minimum allocation size, in CPUs
 * IN req_sock_map - OPTIONAL bitmap of required sockets
 * RET resource availability structure, call common_free_avail_res() to free
 */
extern avail_res_t *common_allocate_sockets(job_record_t *job_ptr,
					    bitstr_t *core_map,
					    bitstr_t *part_core_map,
					    const uint32_t node_i,
					    int *cpu_alloc_size,
					    bitstr_t *req_sock_map)
{
	return _allocate_sc(job_ptr, core_map, part_core_map, node_i,
			    cpu_alloc_size, true, req_sock_map);
}
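
/*
 * Both wrappers above delegate to _allocate_sc() and differ only in the
 * entire_sockets_only flag: false for core-level allocation, true for
 * socket-level allocation. Note that the cpu_type argument of
 * common_allocate_cores() is currently unused.
 */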

extern int select_p_state_save(char *dir_name)
{
	/* nothing to save */
	return SLURM_SUCCESS;
}

/* This is Part 2 of a 4-part procedure which can be found in
 * src/slurmctld/read_config.c. See select_p_node_init for the
 * whole story.
 */
extern int select_p_state_restore(char *dir_name)
{
	/* nothing to restore */
	return SLURM_SUCCESS;
}

/* This is Part 3 of a 4-part procedure which can be found in
 * src/slurmctld/read_config.c. See select_p_node_init for the
 * whole story.
 */
extern int select_p_job_init(List job_list)
{
	/* nothing to initialize for jobs */
	return SLURM_SUCCESS;
}

/* This plugin does not generate a node ranking. */
extern bool select_p_node_ranking(node_record_t *node_ptr, int node_cnt)
{
	return false;
}

/* This is Part 1 of a 4-part procedure which can be found in
 * src/slurmctld/read_config.c. The whole story goes like this:
 *
 * Step 1: select_g_node_init          : initializes the global node arrays
 * Step 2: select_g_state_restore      : NO-OP - nothing to restore
 * Step 3: select_g_job_init           : NO-OP - nothing to initialize
 * Step 4: select_g_select_nodeinfo_set: called from reset_job_bitmaps() with
 *                                       each valid recovered job_ptr AND from
 *                                       select_nodes(), this procedure adds
 *                                       job data to the 'select_part_record'
 *                                       global array
 */
extern int select_p_node_init(node_record_t *node_ptr, int node_cnt)
{
	char *preempt_type, *sched_params, *tmp_ptr;
	uint32_t cume_cores = 0;
	int i;

	info("%s: %s", plugin_type, __func__);
	if ((cr_type & (CR_CPU | CR_CORE | CR_SOCKET)) == 0) {
		fatal("Invalid SelectTypeParameters: %s (%u), "
		      "You need at least CR_(CPU|CORE|SOCKET)*",
		      select_type_param_string(cr_type), cr_type);
	}
	if (node_ptr == NULL) {
		error("select_p_node_init: node_ptr == NULL");
		return SLURM_ERROR;
	}
	if (node_cnt < 0) {
		error("select_p_node_init: node_cnt < 0");
		return SLURM_ERROR;
	}

	sched_params = slurm_get_sched_params();
	if (xstrcasestr(sched_params, "preempt_strict_order"))
		preempt_strict_order = true;
	else
		preempt_strict_order = false;
	if ((tmp_ptr = xstrcasestr(sched_params, "preempt_reorder_count="))) {
		preempt_reorder_cnt = atoi(tmp_ptr + 22);
		if (preempt_reorder_cnt < 0) {
			error("Invalid SchedulerParameters preempt_reorder_count: %d",
			      preempt_reorder_cnt);
			preempt_reorder_cnt = 1;	/* Use default value */
		}
	}
	if ((tmp_ptr = xstrcasestr(sched_params, "bf_window_linear="))) {
		bf_window_scale = atoi(tmp_ptr + 17);
		if (bf_window_scale <= 0) {
			error("Invalid SchedulerParameters bf_window_linear: %d",
			      bf_window_scale);
			bf_window_scale = 0;		/* Use default value */
		}
	} else
		bf_window_scale = 0;

	if (xstrcasestr(sched_params, "pack_serial_at_end"))
		pack_serial_at_end = true;
	else
		pack_serial_at_end = false;
	if (xstrcasestr(sched_params, "spec_cores_first"))
		spec_cores_first = true;
	else
		spec_cores_first = false;
	if (xstrcasestr(sched_params, "bf_busy_nodes"))
		backfill_busy_nodes = true;
	else
		backfill_busy_nodes = false;
	xfree(sched_params);

	preempt_type = slurm_get_preempt_type();
	preempt_by_part = false;
	preempt_by_qos = false;
	if (preempt_type) {
		if (xstrcasestr(preempt_type, "partition"))
			preempt_by_part = true;
		if (xstrcasestr(preempt_type, "qos"))
			preempt_by_qos = true;
		xfree(preempt_type);
	}

	/* initialize the global core data structures */
	select_state_initializing = true;
	cr_init_global_core_data(node_ptr, node_cnt);

	node_data_destroy(select_node_usage, select_node_record);
	select_node_cnt = node_cnt;

	if (is_cons_tres)
		core_array_size = select_node_cnt;

	select_node_record = xcalloc(select_node_cnt,
				     sizeof(node_res_record_t));
	select_node_usage  = xcalloc(select_node_cnt,
				     sizeof(node_use_record_t));

	for (i = 0; i < select_node_cnt; i++) {
		config_record_t *config_ptr;
		select_node_record[i].node_ptr = &node_ptr[i];
		select_node_record[i].mem_spec_limit =
			node_ptr[i].mem_spec_limit;

		config_ptr = node_ptr[i].config_ptr;
		select_node_record[i].cpus    = config_ptr->cpus;
		select_node_record[i].boards  = config_ptr->boards;
		select_node_record[i].sockets = config_ptr->sockets;
		select_node_record[i].cores   = config_ptr->cores;
		select_node_record[i].threads = config_ptr->threads;
		select_node_record[i].vpus    = config_ptr->threads;
		select_node_record[i].real_memory = config_ptr->real_memory;

		select_node_record[i].tot_sockets =
			select_node_record[i].boards *
			select_node_record[i].sockets;
		select_node_record[i].tot_cores =
			select_node_record[i].tot_sockets *
			select_node_record[i].cores;
		cume_cores += select_node_record[i].tot_cores;
		select_node_record[i].cume_cores = cume_cores;
		if (select_node_record[i].tot_cores >=
		    select_node_record[i].cpus)
			select_node_record[i].vpus = 1;

		if ((select_node_record[i].cpus !=
		     select_node_record[i].tot_cores) &&
		    (select_node_record[i].cpus !=
		     select_node_record[i].tot_cores *
		     select_node_record[i].threads))
			fatal("NodeName=%s CPUs=%u matches neither Sockets(%u)*CoresPerSocket(%u)=(%u) nor Sockets(%u)*CoresPerSocket(%u)*ThreadsPerCore(%u)=(%u).  Please fix your slurm.conf.",
			      node_ptr[i].name,
			      select_node_record[i].cpus,
			      select_node_record[i].tot_sockets,
			      select_node_record[i].cores,
			      select_node_record[i].tot_cores,
			      select_node_record[i].tot_sockets,
			      select_node_record[i].cores,
			      select_node_record[i].threads,
			      select_node_record[i].tot_cores *
			      select_node_record[i].threads);

		select_node_usage[i].node_state = NODE_CR_AVAILABLE;
		gres_plugin_node_state_dealloc_all(
			select_node_record[i].node_ptr->gres_list);
	}
	part_data_create_array();
	node_data_dump();

	return SLURM_SUCCESS;
}

extern int select_p_job_begin(job_record_t *job_ptr)
{
	return SLURM_SUCCESS;
}

extern int select_p_job_ready(job_record_t *job_ptr)
{
	int i, i_first, i_last;
	node_record_t *node_ptr;

	if (!IS_JOB_RUNNING(job_ptr) && !IS_JOB_SUSPENDED(job_ptr)) {
		/* Gang scheduling might suspend job immediately */
		return 0;
	}

	if ((job_ptr->node_bitmap == NULL) ||
	    ((i_first = bit_ffs(job_ptr->node_bitmap)) == -1))
		return READY_NODE_STATE;
	i_last  = bit_fls(job_ptr->node_bitmap);
	for (i = i_first; i <= i_last; i++) {
		if (bit_test(job_ptr->node_bitmap, i) == 0)
			continue;
		node_ptr = node_record_table_ptr + i;
		if (IS_NODE_POWER_SAVE(node_ptr) || IS_NODE_POWER_UP(node_ptr))
			return 0;
	}

	return READY_NODE_STATE;
}

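/*
 * Merge the resource allocation recorded for from_job_ptr into to_job_ptr,
 * leaving from_job_ptr with no allocated CPUs, memory or nodes. Both jobs
 * must already have populated job_resources structures.
 */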
extern int select_p_job_expand(job_record_t *from_job_ptr,
			       job_record_t *to_job_ptr)
{
	job_resources_t *from_job_resrcs_ptr, *to_job_resrcs_ptr,
		*new_job_resrcs_ptr;
	node_record_t *node_ptr;
	int first_bit, last_bit, i, node_cnt;
	bool from_node_used, to_node_used;
	int from_node_offset, to_node_offset, new_node_offset;
	bitstr_t *tmp_bitmap, *tmp_bitmap2;

	xassert(from_job_ptr);
	xassert(from_job_ptr->details);
	xassert(to_job_ptr);
	xassert(to_job_ptr->details);

	if (from_job_ptr->job_id == to_job_ptr->job_id) {
		error("%s: %s: attempt to merge %pJ with self",
		      plugin_type, __func__, from_job_ptr);
		return SLURM_ERROR;
	}

	from_job_resrcs_ptr = from_job_ptr->job_resrcs;
	if ((from_job_resrcs_ptr == NULL) ||
	    (from_job_resrcs_ptr->cpus == NULL) ||
	    (from_job_resrcs_ptr->core_bitmap == NULL) ||
	    (from_job_resrcs_ptr->node_bitmap == NULL)) {
		error("%s: %s: %pJ lacks a job_resources struct",
		      plugin_type, __func__, from_job_ptr);
		return SLURM_ERROR;
	}
	to_job_resrcs_ptr = to_job_ptr->job_resrcs;
	if ((to_job_resrcs_ptr == NULL) ||
	    (to_job_resrcs_ptr->cpus == NULL) ||
	    (to_job_resrcs_ptr->core_bitmap == NULL) ||
	    (to_job_resrcs_ptr->node_bitmap == NULL)) {
		error("%s: %s: %pJ lacks a job_resources struct",
		      plugin_type, __func__, to_job_ptr);
		return SLURM_ERROR;
	}

	if (is_cons_tres) {
		if (to_job_ptr->gres_list) {
			/* Can't reset gres/mps fields today */
			error("%s: %s: %pJ has allocated GRES",
			      plugin_type, __func__, to_job_ptr);
			return SLURM_ERROR;
		}
		if (from_job_ptr->gres_list) {
			/* Can't reset gres/mps fields today */
			error("%s: %s: %pJ has allocated GRES",
			      plugin_type, __func__, from_job_ptr);
			return SLURM_ERROR;
		}
	}

	(void) job_res_rm_job(select_part_record, select_node_usage,
			      from_job_ptr, 0, true, NULL);
	(void) job_res_rm_job(select_part_record, select_node_usage,
			      to_job_ptr, 0, true, NULL);

	if (to_job_resrcs_ptr->core_bitmap_used) {
		i = bit_size(to_job_resrcs_ptr->core_bitmap_used);
		bit_nclear(to_job_resrcs_ptr->core_bitmap_used, 0, i-1);
	}

	tmp_bitmap = bit_copy(to_job_resrcs_ptr->node_bitmap);
	bit_or(tmp_bitmap, from_job_resrcs_ptr->node_bitmap);
	tmp_bitmap2 = bit_copy(to_job_ptr->node_bitmap);
	bit_or(tmp_bitmap2, from_job_ptr->node_bitmap);
	bit_and(tmp_bitmap, tmp_bitmap2);
	bit_free(tmp_bitmap2);
	node_cnt = bit_set_count(tmp_bitmap);

	new_job_resrcs_ptr = _create_job_resources(node_cnt);
	new_job_resrcs_ptr->ncpus = from_job_resrcs_ptr->ncpus +
		to_job_resrcs_ptr->ncpus;
	new_job_resrcs_ptr->node_req = to_job_resrcs_ptr->node_req;
	new_job_resrcs_ptr->node_bitmap = tmp_bitmap;
	new_job_resrcs_ptr->nodes = bitmap2node_name(new_job_resrcs_ptr->
						     node_bitmap);
	new_job_resrcs_ptr->whole_node = to_job_resrcs_ptr->whole_node;
	build_job_resources(new_job_resrcs_ptr, node_record_table_ptr);
	xfree(to_job_ptr->node_addr);
	to_job_ptr->node_addr = xcalloc(node_cnt, sizeof(slurm_addr_t));
	to_job_ptr->total_cpus = 0;

	first_bit = MIN(bit_ffs(from_job_resrcs_ptr->node_bitmap),
			bit_ffs(to_job_resrcs_ptr->node_bitmap));
	last_bit =  MAX(bit_fls(from_job_resrcs_ptr->node_bitmap),
			bit_fls(to_job_resrcs_ptr->node_bitmap));
	from_node_offset = to_node_offset = new_node_offset = -1;
	for (i = first_bit; i <= last_bit; i++) {
		from_node_used = to_node_used = false;
		if (bit_test(from_job_resrcs_ptr->node_bitmap, i)) {
			from_node_used = bit_test(from_job_ptr->node_bitmap,i);
			from_node_offset++;
		}
		if (bit_test(to_job_resrcs_ptr->node_bitmap, i)) {
			to_node_used = bit_test(to_job_ptr->node_bitmap, i);
			to_node_offset++;
		}
		if (!from_node_used && !to_node_used)
			continue;
		new_node_offset++;
		node_ptr = node_record_table_ptr + i;
		memcpy(&to_job_ptr->node_addr[new_node_offset],
		       &node_ptr->slurm_addr, sizeof(slurm_addr_t));
		if (from_node_used) {
			/*
			 * Merge alloc info from both "from" and "to" jobs,
			 * leave "from" job with no allocated CPUs or memory
			 *
			 * The following fields should be zero:
			 * from_job_resrcs_ptr->cpus_used[from_node_offset]
			 * from_job_resrcs_ptr->memory_used[from_node_offset];
			 */
			new_job_resrcs_ptr->cpus[new_node_offset] =
				from_job_resrcs_ptr->cpus[from_node_offset];
			from_job_resrcs_ptr->cpus[from_node_offset] = 0;
			new_job_resrcs_ptr->memory_allocated[new_node_offset] =
				from_job_resrcs_ptr->
				memory_allocated[from_node_offset];
			job_resources_bits_copy(new_job_resrcs_ptr,
						new_node_offset,
						from_job_resrcs_ptr,
						from_node_offset);
		}
		if (to_node_used) {
			/*
			 * Merge alloc info from both "from" and "to" jobs
			 *
			 * DO NOT double count the allocated CPUs in partition
			 * with Shared nodes
			 */
			new_job_resrcs_ptr->cpus[new_node_offset] +=
				to_job_resrcs_ptr->cpus[to_node_offset];
			new_job_resrcs_ptr->cpus_used[new_node_offset] +=
				to_job_resrcs_ptr->cpus_used[to_node_offset];
			new_job_resrcs_ptr->memory_allocated[new_node_offset]+=
				to_job_resrcs_ptr->
				memory_allocated[to_node_offset];
			new_job_resrcs_ptr->memory_used[new_node_offset] +=
				to_job_resrcs_ptr->memory_used[to_node_offset];
			job_resources_bits_copy(new_job_resrcs_ptr,
						new_node_offset,
						to_job_resrcs_ptr,
						to_node_offset);
			if (from_node_used) {
				/* Adjust CPU count for shared CPUs */
				int from_core_cnt, to_core_cnt, new_core_cnt;
				from_core_cnt = count_job_resources_node(
					from_job_resrcs_ptr,
					from_node_offset);
				to_core_cnt = count_job_resources_node(
					to_job_resrcs_ptr,
					to_node_offset);
				new_core_cnt = count_job_resources_node(
					new_job_resrcs_ptr,
					new_node_offset);
				if ((from_core_cnt + to_core_cnt) !=
				    new_core_cnt) {
					new_job_resrcs_ptr->
						cpus[new_node_offset] *=
						new_core_cnt;
					new_job_resrcs_ptr->
						cpus[new_node_offset] /=
						(from_core_cnt + to_core_cnt);
				}
			}
		}
		if (to_job_ptr->details->whole_node == 1) {
			to_job_ptr->total_cpus += select_node_record[i].cpus;
		} else {
			to_job_ptr->total_cpus += new_job_resrcs_ptr->
				cpus[new_node_offset];
		}
	}
	build_job_resources_cpu_array(new_job_resrcs_ptr);
	gres_plugin_job_merge(from_job_ptr->gres_list,
			      from_job_resrcs_ptr->node_bitmap,
			      to_job_ptr->gres_list,
			      to_job_resrcs_ptr->node_bitmap);

	/* Now swap data: "new" -> "to" and clear "from" */
	free_job_resources(&to_job_ptr->job_resrcs);
	to_job_ptr->job_resrcs = new_job_resrcs_ptr;

	to_job_ptr->cpu_cnt = to_job_ptr->total_cpus;
	to_job_ptr->details->min_cpus = to_job_ptr->total_cpus;
	to_job_ptr->details->max_cpus = to_job_ptr->total_cpus;
	from_job_ptr->total_cpus   = 0;
	from_job_resrcs_ptr->ncpus = 0;
	from_job_ptr->details->min_cpus = 0;
	from_job_ptr->details->max_cpus = 0;

	from_job_ptr->total_nodes   = 0;
	from_job_resrcs_ptr->nhosts = 0;
	from_job_ptr->node_cnt      = 0;
	from_job_ptr->details->min_nodes = 0;
	to_job_ptr->total_nodes     = new_job_resrcs_ptr->nhosts;
	to_job_ptr->node_cnt        = new_job_resrcs_ptr->nhosts;

	bit_or(to_job_ptr->node_bitmap, from_job_ptr->node_bitmap);
	bit_nclear(from_job_ptr->node_bitmap, 0, (node_record_count - 1));
	bit_nclear(from_job_resrcs_ptr->node_bitmap, 0,
		   (node_record_count - 1));

	xfree(to_job_ptr->nodes);
	to_job_ptr->nodes = xstrdup(new_job_resrcs_ptr->nodes);
	xfree(from_job_ptr->nodes);
	from_job_ptr->nodes = xstrdup("");
	xfree(from_job_resrcs_ptr->nodes);
	from_job_resrcs_ptr->nodes = xstrdup("");

	(void) job_res_add_job(to_job_ptr, 0);

	return SLURM_SUCCESS;
}

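/*
 * Remove the specified node from job_ptr's allocation: release its memory
 * and GRES, drop it from the job_resources struct, and rebuild the
 * partition row bitmaps so the node's cores become available again.
 */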
select_p_job_resized(job_record_t * job_ptr,node_record_t * node_ptr)1385 extern int select_p_job_resized(job_record_t *job_ptr, node_record_t *node_ptr)
1386 {
1387 	part_res_record_t *part_record_ptr = select_part_record;
1388 	node_use_record_t *node_usage = select_node_usage;
1389 	struct job_resources *job = job_ptr->job_resrcs;
1390 	part_res_record_t *p_ptr;
1391 	int i, i_first, i_last, node_inx, n;
1392 	List gres_list;
1393 	bool old_job = false;
1394 
1395 	xassert(job_ptr);
1396 	xassert(job_ptr->magic == JOB_MAGIC);
1397 
1398 	if (!job || !job->core_bitmap) {
1399 		error("%s: %s: %pJ has no job_resrcs info",
1400 		      plugin_type, __func__, job_ptr);
1401 		return SLURM_ERROR;
1402 	}
1403 
1404 	debug3("%s: %s: %pJ node %s",
1405 	       plugin_type, __func__, job_ptr, node_ptr->name);
1406 	if (job_ptr->start_time < slurmctld_config.boot_time)
1407 		old_job = true;
1408 	if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE)
1409 		_dump_job_res(job);
1410 
1411 	/* subtract memory */
1412 	node_inx  = node_ptr - node_record_table_ptr;
1413 	i_first = bit_ffs(job->node_bitmap);
1414 	if (i_first != -1)
1415 		i_last  = bit_fls(job->node_bitmap);
1416 	else
1417 		i_last = -2;
1418 	for (i = i_first, n = 0; i <= i_last; i++) {
1419 		if (!bit_test(job->node_bitmap, i))
1420 			continue;
1421 		if (i != node_inx) {
1422 			n++;
1423 			continue;
1424 		}
1425 
1426 		if (job->cpus[n] == 0) {
1427 			info("%s: %s: attempt to remove node %s from %pJ again",
1428 			     plugin_type, __func__, node_ptr->name, job_ptr);
1429 			return SLURM_SUCCESS;
1430 		}
1431 
1432 		if (node_usage[i].gres_list)
1433 			gres_list = node_usage[i].gres_list;
1434 		else
1435 			gres_list = node_ptr->gres_list;
1436 		gres_plugin_job_dealloc(job_ptr->gres_list, gres_list, n,
1437 					job_ptr->job_id, node_ptr->name,
1438 					old_job, job_ptr->user_id, true);
1439 		gres_plugin_node_state_log(gres_list, node_ptr->name);
1440 
1441 		if (node_usage[i].alloc_memory < job->memory_allocated[n]) {
1442 			error("%s: %s: node %s memory is underallocated (%"PRIu64"-%"PRIu64") for %pJ",
1443 			      plugin_type,
1444 			      __func__, node_ptr->name,
1445 			      node_usage[i].alloc_memory,
1446 			      job->memory_allocated[n], job_ptr);
1447 			node_usage[i].alloc_memory = 0;
1448 		} else
1449 			node_usage[i].alloc_memory -= job->memory_allocated[n];
1450 
1451 		extract_job_resources_node(job, n);
1452 
1453 		break;
1454 	}
1455 
1456 	if (IS_JOB_SUSPENDED(job_ptr))
1457 		return SLURM_SUCCESS;	/* No cores allocated to the job now */
1458 
1459 	/* subtract cores, reconstruct rows with remaining jobs */
1460 	if (!job_ptr->part_ptr) {
1461 		error("%s: %s: removed %pJ does not have a partition assigned",
1462 		      plugin_type, __func__, job_ptr);
1463 		return SLURM_ERROR;
1464 	}
1465 
1466 	for (p_ptr = part_record_ptr; p_ptr; p_ptr = p_ptr->next) {
1467 		if (p_ptr->part_ptr == job_ptr->part_ptr)
1468 			break;
1469 	}
1470 	if (!p_ptr) {
1471 		error("%s: %s: removed %pJ could not find part %s",
1472 		      plugin_type, __func__, job_ptr, job_ptr->part_ptr->name);
1473 		return SLURM_ERROR;
1474 	}
1475 
1476 	if (!p_ptr->row)
1477 		return SLURM_SUCCESS;
1478 
1479 	/* look for the job in the partition's job_list */
1480 	n = 0;
1481 	for (i = 0; i < p_ptr->num_rows; i++) {
1482 		uint32_t j;
1483 		for (j = 0; j < p_ptr->row[i].num_jobs; j++) {
1484 			if (p_ptr->row[i].job_list[j] != job)
1485 				continue;
1486 			debug3("%s: %s: found %pJ in part %s row %u",
1487 			       plugin_type, __func__, job_ptr,
1488 			       p_ptr->part_ptr->name, i);
1489 			/* found job - we're done, don't actually remove */
1490 			n = 1;
1491 			i = p_ptr->num_rows;
1492 			break;
1493 		}
1494 	}
1495 	if (n == 0) {
1496 		error("%s: %s: could not find %pJ in partition %s",
1497 		      plugin_type, __func__, job_ptr, p_ptr->part_ptr->name);
1498 		return SLURM_ERROR;
1499 	}
1500 
1501 
1502 	/* some node of job removed from core-bitmap, so rebuild core bitmaps */
1503 	part_data_build_row_bitmaps(p_ptr, NULL);
1504 
1505 	/*
1506 	 * Adjust the node_state of the node removed from this job.
1507 	 * If all cores are now available, set node_state = NODE_CR_AVAILABLE
1508 	 */
1509 	if (node_usage[node_inx].node_state >= job->node_req) {
1510 		node_usage[node_inx].node_state -= job->node_req;
1511 	} else {
1512 		error("%s: %s: node_state miscount", plugin_type, __func__);
1513 		node_usage[node_inx].node_state = NODE_CR_AVAILABLE;
1514 	}
1515 
1516 	return SLURM_SUCCESS;
1517 }
1518 
select_p_job_signal(job_record_t * job_ptr,int signal)1519 extern int select_p_job_signal(job_record_t *job_ptr, int signal)
1520 {
1521 	xassert(job_ptr);
1522 	xassert(job_ptr->magic == JOB_MAGIC);
1523 
1524 	return SLURM_SUCCESS;
1525 }
1526 
select_p_job_mem_confirm(job_record_t * job_ptr)1527 extern int select_p_job_mem_confirm(job_record_t *job_ptr)
1528 {
1529 	int i_first, i_last, i, offset;
1530 	uint64_t avail_mem, lowest_mem = 0;
1531 
1532 	xassert(job_ptr);
1533 
1534 	if (!(job_ptr->bit_flags & NODE_MEM_CALC))
1535 		return SLURM_SUCCESS;
1536 	if ((job_ptr->details == NULL) ||
1537 	    (job_ptr->job_resrcs == NULL) ||
1538 	    (job_ptr->job_resrcs->node_bitmap == NULL) ||
1539 	    (job_ptr->job_resrcs->memory_allocated == NULL))
1540 		return SLURM_ERROR;
1541 	i_first = bit_ffs(job_ptr->job_resrcs->node_bitmap);
1542 	if (i_first >= 0)
1543 		i_last = bit_fls(job_ptr->job_resrcs->node_bitmap);
1544 	else
1545 		i_last = i_first - 1;
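	/*
	 * Note: with an empty bitmap the idiom above yields i_first = -1
	 * and i_last = -2, so the loop below is skipped entirely.
	 */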
1546 	for (i = i_first, offset = 0; i <= i_last; i++) {
1547 		if (!bit_test(job_ptr->job_resrcs->node_bitmap, i))
1548 			continue;
1549 		avail_mem = select_node_record[i].real_memory -
1550 			select_node_record[i].mem_spec_limit;
1551 		job_ptr->job_resrcs->memory_allocated[offset] = avail_mem;
1552 		select_node_usage[i].alloc_memory = avail_mem;
1553 		if ((offset == 0) || (lowest_mem > avail_mem))
1554 			lowest_mem = avail_mem;
1555 		offset++;
1556 	}
1557 	job_ptr->details->pn_min_memory = lowest_mem;
1558 
1559 	return SLURM_SUCCESS;
1560 }
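
/*
 * Worked example for select_p_job_mem_confirm() above (hypothetical
 * numbers): for a job on two nodes with real_memory/mem_spec_limit of
 * 128000/2000 MB and 64000/0 MB, the job is credited 126000 MB and
 * 64000 MB respectively, and pn_min_memory is lowered to the smaller
 * figure, 64000 MB.
 */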
1561 
1562 extern int select_p_job_fini(job_record_t *job_ptr)
1563 {
1564 	xassert(job_ptr);
1565 	xassert(job_ptr->magic == JOB_MAGIC);
1566 
1567 	if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE)
1568 		info("%s: %s: %pJ", plugin_type, __func__, job_ptr);
1569 
1570 	job_res_rm_job(select_part_record, select_node_usage,
1571 		       job_ptr, 0, true, NULL);
1572 
1573 	return SLURM_SUCCESS;
1574 }
1575 
1576 /* NOTE: This function is not called with gang scheduling because the
1577  * plugin must keep tracking how many jobs are running or suspended on
1578  * each node. This sum is compared with the partition's Shared parameter */
1579 extern int select_p_job_suspend(job_record_t *job_ptr, bool indf_susp)
1580 {
1581 	xassert(job_ptr);
1582 	xassert(job_ptr->magic == JOB_MAGIC);
1583 
1584 	if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
1585 		if (indf_susp)
1586 			info("%s: %s: %pJ indf_susp", plugin_type, __func__,
1587 			     job_ptr);
1588 		else
1589 			info("%s: %s: %pJ", plugin_type, __func__, job_ptr);
1590 	}
1591 
1592 	if (!indf_susp)
1593 		return SLURM_SUCCESS;
1594 
1595 	return job_res_rm_job(select_part_record, select_node_usage,
1596 			      job_ptr, 2, false, NULL);
1597 }
1598 
1599 /* See NOTE with select_p_job_suspend() above */
1600 extern int select_p_job_resume(job_record_t *job_ptr, bool indf_susp)
1601 {
1602 	xassert(job_ptr);
1603 	xassert(job_ptr->magic == JOB_MAGIC);
1604 
1605 	if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
1606 		if (indf_susp)
1607 			info("%s: %s: %pJ indf_susp", plugin_type, __func__,
1608 			     job_ptr);
1609 		else
1610 			info("%s: %s: %pJ", plugin_type, __func__, job_ptr);
1611 	}
1612 	if (!indf_susp)
1613 		return SLURM_SUCCESS;
1614 
1615 	return job_res_add_job(job_ptr, 2);
1616 }
1617 
1618 extern bitstr_t *select_p_step_pick_nodes(job_record_t *job_ptr,
1619 					  select_jobinfo_t *jobinfo,
1620 					  uint32_t node_count,
1621 					  bitstr_t **avail_nodes)
1622 {
1623 	return NULL;
1624 }
1625 
1626 /* Unused for this plugin */
1627 extern int select_p_step_start(step_record_t *step_ptr)
1628 {
1629 	return SLURM_SUCCESS;
1630 }
1631 
1632 /* Unused for this plugin */
1633 extern int select_p_step_finish(step_record_t *step_ptr, bool killing_step)
1634 {
1635 	return SLURM_SUCCESS;
1636 }
1637 
1638 extern int select_p_select_nodeinfo_pack(select_nodeinfo_t *nodeinfo,
1639 					 Buf buffer,
1640 					 uint16_t protocol_version)
1641 {
1642 	select_nodeinfo_t *nodeinfo_empty = NULL;
1643 
1644 	if (!nodeinfo) {
1645 		/*
1646 		 * We should never get here,
1647 		 * but avoid aborting on bad data structures
1648 		 */
1649 		error("%s: nodeinfo is NULL", __func__);
1650 		nodeinfo_empty = xmalloc(sizeof(select_nodeinfo_t));
1651 		nodeinfo = nodeinfo_empty;
1652 	}
1653 
1654 	if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
1655 		pack16(nodeinfo->alloc_cpus, buffer);
1656 		pack64(nodeinfo->alloc_memory, buffer);
1657 		packstr(nodeinfo->tres_alloc_fmt_str, buffer);
1658 		packdouble(nodeinfo->tres_alloc_weighted, buffer);
1659 	}
1660 	xfree(nodeinfo_empty);
1661 
1662 	return SLURM_SUCCESS;
1663 }
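
/*
 * The pack order above (16-bit CPU count, 64-bit memory, TRES format
 * string, weighted TRES double) must stay in lock-step with
 * select_p_select_nodeinfo_unpack() below; both sides gate on
 * SLURM_MIN_PROTOCOL_VERSION, so a peer below the minimum simply gets
 * no payload rather than a misparsed one.
 */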
1664 
1665 extern select_nodeinfo_t *select_p_select_nodeinfo_alloc(void)
1666 {
1667 	select_nodeinfo_t *nodeinfo = xmalloc(sizeof(select_nodeinfo_t));
1668 
1669 	nodeinfo->magic = nodeinfo_magic;
1670 
1671 	return nodeinfo;
1672 }
1673 
1674 extern int select_p_select_nodeinfo_free(select_nodeinfo_t *nodeinfo)
1675 {
1676 	if (nodeinfo) {
1677 		if (nodeinfo->magic != nodeinfo_magic) {
1678 			error("%s: nodeinfo magic bad", __func__);
1679 			return EINVAL;
1680 		}
1681 		xfree(nodeinfo->tres_alloc_cnt);
1682 		xfree(nodeinfo->tres_alloc_fmt_str);
1683 		xfree(nodeinfo);
1684 	}
1685 	return SLURM_SUCCESS;
1686 }
1687 
1688 extern int select_p_select_nodeinfo_unpack(select_nodeinfo_t **nodeinfo,
1689 					   Buf buffer,
1690 					   uint16_t protocol_version)
1691 {
1692 	uint32_t uint32_tmp;
1693 	select_nodeinfo_t *nodeinfo_ptr = NULL;
1694 
1695 	nodeinfo_ptr = select_p_select_nodeinfo_alloc();
1696 	*nodeinfo = nodeinfo_ptr;
1697 
1698 	if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
1699 		safe_unpack16(&nodeinfo_ptr->alloc_cpus, buffer);
1700 		safe_unpack64(&nodeinfo_ptr->alloc_memory, buffer);
1701 		safe_unpackstr_xmalloc(&nodeinfo_ptr->tres_alloc_fmt_str,
1702 				       &uint32_tmp, buffer);
1703 		safe_unpackdouble(&nodeinfo_ptr->tres_alloc_weighted, buffer);
1704 	}
1705 
1706 	return SLURM_SUCCESS;
1707 
1708 unpack_error:
1709 	error("%s: error unpacking nodeinfo", __func__);
1710 	select_p_select_nodeinfo_free(nodeinfo_ptr);
1711 	*nodeinfo = NULL;
1712 
1713 	return SLURM_ERROR;
1714 }
1715 
1716 extern int select_p_select_nodeinfo_set_all(void)
1717 {
1718 	static time_t last_set_all = 0;
1719 	part_res_record_t *p_ptr;
1720 	node_record_t *node_ptr = NULL;
1721 	int i, n;
1722 	uint32_t alloc_cpus, alloc_cores, node_cores, node_cpus, node_threads;
1723 	uint32_t node_boards, node_sockets, total_node_cores;
1724 	bitstr_t **alloc_core_bitmap = NULL;
1725 	List gres_list;
1726 
1727 	/*
1728 	 * Only rebuild this data when last_node_update is newer than
1729 	 * the last time we set things up.
1730 	 */
1731 	if (last_set_all && (last_node_update < last_set_all)) {
1732 		debug2("%s: Node data hasn't changed since %ld", __func__,
1733 		       (long)last_set_all);
1734 		return SLURM_NO_CHANGE_IN_DATA;
1735 	}
1736 	last_set_all = last_node_update;
1737 
1738 	/*
1739 	 * Build core bitmap array representing all cores allocated to all
1740 	 * active jobs (running or preempted jobs)
1741 	 */
1742 	for (p_ptr = select_part_record; p_ptr; p_ptr = p_ptr->next) {
1743 		if (!p_ptr->row)
1744 			continue;
1745 		for (i = 0; i < p_ptr->num_rows; i++) {
1746 			if (!p_ptr->row[i].row_bitmap)
1747 				continue;
1748 			if (!alloc_core_bitmap) {
1749 				alloc_core_bitmap =
1750 					copy_core_array(
1751 						p_ptr->row[i].row_bitmap);
1752 			} else {
1753 				core_array_or(alloc_core_bitmap,
1754 					      p_ptr->row[i].row_bitmap);
1755 			}
1756 		}
1757 	}
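
	/*
	 * Example (hypothetical layout): if row 0 of some partition holds
	 * cores {0,1} of a node and row 1 holds core {2}, the OR above
	 * leaves the union {0,1,2} in alloc_core_bitmap: every core held
	 * by any active job in any partition row.
	 */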
1758 
1759 	for (n = 0, node_ptr = node_record_table_ptr;
1760 	     n < select_node_cnt; n++, node_ptr++) {
1761 		select_nodeinfo_t *nodeinfo = NULL;
1762 		/*
1763 		 * We have to use the '_g_' call here to make sure we get the
1764 		 * correct data to work on, e.g. select/cray calls this plugin
1765 		 * and has its own struct.
1766 		 */
1767 		select_g_select_nodeinfo_get(node_ptr->select_nodeinfo,
1768 					     SELECT_NODEDATA_PTR, 0,
1769 					     (void *)&nodeinfo);
1770 		if (!nodeinfo) {
1771 			error("%s: no nodeinfo returned from structure",
1772 			      __func__);
1773 			continue;
1774 		}
1775 
1776 		node_boards  = node_ptr->config_ptr->boards;
1777 		node_sockets = node_ptr->config_ptr->sockets;
1778 		node_cores   = node_ptr->config_ptr->cores;
1779 		node_cpus    = node_ptr->config_ptr->cpus;
1780 		node_threads = node_ptr->config_ptr->threads;
1781 
1782 		if (is_cons_tres) {
1783 			if (alloc_core_bitmap && alloc_core_bitmap[n])
1784 				alloc_cores = bit_set_count(
1785 					alloc_core_bitmap[n]);
1786 			else
1787 				alloc_cores = 0;
1788 
1789 			total_node_cores =
1790 				node_boards * node_sockets * node_cores;
1791 		} else {
1792 			int start = cr_get_coremap_offset(n);
1793 			int end = cr_get_coremap_offset(n + 1);
1794 			if (alloc_core_bitmap)
1795 				alloc_cores = bit_set_count_range(
1796 					*alloc_core_bitmap,
1797 					start, end);
1798 			else
1799 				alloc_cores = 0;
1800 
1801 			total_node_cores = end - start;
1802 		}
1803 
1804 		/*
1805 		 * An administrator could resume suspended jobs and oversubscribe
1806 		 * cores; avoid reporting more cores in use than configured
1807 		 */
1808 		if (alloc_cores > total_node_cores)
1809 			alloc_cpus = total_node_cores;
1810 		else
1811 			alloc_cpus = alloc_cores;
1812 
1813 		/*
1814 		 * The minimum allocatable unit may be a core, so scale by thread
1815 		 * count up to the proper CPU count as needed
1816 		 */
1817 		if (total_node_cores < node_cpus)
1818 			alloc_cpus *= node_threads;
1819 		nodeinfo->alloc_cpus = alloc_cpus;
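		/*
		 * Worked example (hypothetical node): with 16 cores and 2
		 * threads per core, node_cpus is 32 > 16 total cores, so 10
		 * allocated cores report as 20 allocated CPUs; on a node
		 * configured with one CPU per core the count is left as-is.
		 */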
1820 
1821 		if (select_node_record) {
1822 			nodeinfo->alloc_memory =
1823 				select_node_usage[n].alloc_memory;
1824 		} else {
1825 			nodeinfo->alloc_memory = 0;
1826 		}
1827 
1828 		/* Build allocated TRES info */
1829 		if (!nodeinfo->tres_alloc_cnt)
1830 			nodeinfo->tres_alloc_cnt = xcalloc(slurmctld_tres_cnt,
1831 							   sizeof(uint64_t));
1832 		nodeinfo->tres_alloc_cnt[TRES_ARRAY_CPU] = alloc_cpus;
1833 		nodeinfo->tres_alloc_cnt[TRES_ARRAY_MEM] =
1834 			nodeinfo->alloc_memory;
1835 		if (select_node_usage[n].gres_list)
1836 			gres_list = select_node_usage[n].gres_list;
1837 		else
1838 			gres_list = node_ptr->gres_list;
1839 		gres_set_node_tres_cnt(gres_list, nodeinfo->tres_alloc_cnt,
1840 				       false);
1841 
1842 		xfree(nodeinfo->tres_alloc_fmt_str);
1843 		nodeinfo->tres_alloc_fmt_str =
1844 			assoc_mgr_make_tres_str_from_array(
1845 				nodeinfo->tres_alloc_cnt,
1846 				TRES_STR_CONVERT_UNITS, false);
1847 		nodeinfo->tres_alloc_weighted =
1848 			assoc_mgr_tres_weighted(nodeinfo->tres_alloc_cnt,
1849 						node_ptr->config_ptr->tres_weights,
1850 						priority_flags, false);
1851 	}
1852 	free_core_array(&alloc_core_bitmap);
1853 
1854 	return SLURM_SUCCESS;
1855 }
1856 
1857 extern int select_p_select_nodeinfo_set(job_record_t *job_ptr)
1858 {
1859 	int rc;
1860 
1861 	xassert(job_ptr);
1862 	xassert(job_ptr->magic == JOB_MAGIC);
1863 
1864 	if (IS_JOB_RUNNING(job_ptr))
1865 		rc = job_res_add_job(job_ptr, 0);
1866 	else if (IS_JOB_SUSPENDED(job_ptr)) {
1867 		if (job_ptr->priority == 0)
1868 			rc = job_res_add_job(job_ptr, 1);
1869 		else	/* Gang schedule suspend */
1870 			rc = job_res_add_job(job_ptr, 0);
1871 	} else
1872 		return SLURM_SUCCESS;
1873 
1874 	gres_plugin_job_state_log(job_ptr->gres_list, job_ptr->job_id);
1875 
1876 	return rc;
1877 }
1878 
1879 extern int select_p_select_nodeinfo_get(select_nodeinfo_t *nodeinfo,
1880 					enum select_nodedata_type dinfo,
1881 					enum node_states state,
1882 					void *data)
1883 {
1884 	int rc = SLURM_SUCCESS;
1885 	uint16_t *uint16 = (uint16_t *) data;
1886 	uint64_t *uint64 = (uint64_t *) data;
1887 	char **tmp_char = (char **) data;
1888 	double *tmp_double = (double *) data;
1889 	select_nodeinfo_t **select_nodeinfo = (select_nodeinfo_t **) data;
1890 
1891 	if (nodeinfo == NULL) {
1892 		error("%s: nodeinfo not set", __func__);
1893 		return SLURM_ERROR;
1894 	}
1895 
1896 	if (nodeinfo->magic != nodeinfo_magic) {
1897 		error("%s: jobinfo magic bad", __func__);
1898 		return SLURM_ERROR;
1899 	}
1900 
1901 	switch (dinfo) {
1902 	case SELECT_NODEDATA_SUBCNT:
1903 		if (state == NODE_STATE_ALLOCATED)
1904 			*uint16 = nodeinfo->alloc_cpus;
1905 		else
1906 			*uint16 = 0;
1907 		break;
1908 	case SELECT_NODEDATA_PTR:
1909 		*select_nodeinfo = nodeinfo;
1910 		break;
1911 	case SELECT_NODEDATA_MEM_ALLOC:
1912 		*uint64 = nodeinfo->alloc_memory;
1913 		break;
1914 	case SELECT_NODEDATA_TRES_ALLOC_FMT_STR:
1915 		*tmp_char = xstrdup(nodeinfo->tres_alloc_fmt_str);
1916 		break;
1917 	case SELECT_NODEDATA_TRES_ALLOC_WEIGHTED:
1918 		*tmp_double = nodeinfo->tres_alloc_weighted;
1919 		break;
1920 	default:
1921 		error("%s: Unsupported option %d", __func__, dinfo);
1922 		rc = SLURM_ERROR;
1923 		break;
1924 	}
1925 	return rc;
1926 }
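
/*
 * Caller's-eye sketch (assumed usage, mirroring the SELECT_NODEDATA_PTR
 * query made from select_p_select_nodeinfo_set_all() above):
 *
 *	uint64_t alloc_mem = 0;
 *	select_g_select_nodeinfo_get(node_ptr->select_nodeinfo,
 *				     SELECT_NODEDATA_MEM_ALLOC,
 *				     NODE_STATE_ALLOCATED, &alloc_mem);
 *
 * The state argument is only consulted for SELECT_NODEDATA_SUBCNT; the
 * other cases ignore it.
 */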
1927 
1928 /* Unused for this plugin */
1929 extern int select_p_select_jobinfo_alloc(void)
1930 {
1931 	return SLURM_SUCCESS;
1932 }
1933 
1934 /* Unused for this plugin */
1935 extern int select_p_select_jobinfo_free(select_jobinfo_t *jobinfo)
1936 {
1937 	return SLURM_SUCCESS;
1938 }
1939 
1940 /* Unused for this plugin */
1941 extern int select_p_select_jobinfo_set(select_jobinfo_t *jobinfo,
1942 				       enum select_jobdata_type data_type,
1943 				       void *data)
1944 {
1945 	return SLURM_SUCCESS;
1946 }
1947 
1948 /* Unused for this plugin */
1949 extern int select_p_select_jobinfo_get(select_jobinfo_t *jobinfo,
1950 				       enum select_jobdata_type data_type,
1951 				       void *data)
1952 {
1953 	return SLURM_ERROR;
1954 }
1955 
1956 /* Unused for this plugin */
1957 extern select_jobinfo_t *select_p_select_jobinfo_copy(select_jobinfo_t *jobinfo)
1958 {
1959 	return NULL;
1960 }
1961 
1962 /* Unused for this plugin */
1963 extern int select_p_select_jobinfo_pack(select_jobinfo_t *jobinfo, Buf buffer,
1964 					uint16_t protocol_version)
1965 {
1966 	return SLURM_SUCCESS;
1967 }
1968 
1969 /* Unused for this plugin */
1970 extern int select_p_select_jobinfo_unpack(select_jobinfo_t *jobinfo,
1971 					  Buf buffer,
1972 					  uint16_t protocol_version)
1973 {
1974 	return SLURM_SUCCESS;
1975 }
1976 
1977 /* Unused for this plugin */
1978 extern char *select_p_select_jobinfo_sprint(select_jobinfo_t *jobinfo,
1979 					    char *buf, size_t size, int mode)
1980 {
1981 	if (buf && size) {
1982 		buf[0] = '\0';
1983 		return buf;
1984 	}
1985 	return NULL;
1986 }
1987 
1988 /* Unused for this plugin */
1989 extern char *select_p_select_jobinfo_xstrdup(select_jobinfo_t *jobinfo,
1990 					     int mode)
1991 {
1992 	return NULL;
1993 }
1994 
1995 extern int select_p_get_info_from_plugin(enum select_plugindata_info info,
1996 					 job_record_t *job_ptr,
1997 					 void *data)
1998 {
1999 	int rc = SLURM_SUCCESS;
2000 	uint32_t *tmp_32 = (uint32_t *) data;
2001 	List *tmp_list = (List *) data;
2002 
2003 	switch (info) {
2004 	case SELECT_CR_PLUGIN:
2005 		*tmp_32 = is_cons_tres ?
2006 			SELECT_TYPE_CONS_TRES : SELECT_TYPE_CONS_RES;
2007 		break;
2008 	case SELECT_CONFIG_INFO:
2009 		*tmp_list = NULL;
2010 		break;
2011 	case SELECT_SINGLE_JOB_TEST:
2012 		*tmp_32 = is_cons_tres ? 1 : 0;
2013 		break;
2014 	default:
2015 		error("%s: info type %d invalid", __func__, info);
2016 		rc = SLURM_ERROR;
2017 		break;
2018 	}
2019 	return rc;
2020 }
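
/*
 * Example query (a sketch, assuming the usual select_g_ wrapper with the
 * same argument order as this callback):
 *
 *	uint32_t cr_type = 0;
 *	select_g_get_info_from_plugin(SELECT_CR_PLUGIN, NULL, &cr_type);
 *	if (cr_type == SELECT_TYPE_CONS_TRES)
 *		... cons_tres-specific handling ...
 */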
2021 
2022 extern int select_p_update_node_config(int index)
2023 {
2024 	if (index >= select_node_cnt) {
2025 		error("%s: index too large (%d >= %d)", __func__, index,
2026 		      select_node_cnt);
2027 		return SLURM_ERROR;
2028 	}
2029 
2030 	/*
2031 	 * Socket and core count can be changed when a KNL node reboots in a
2032 	 * different NUMA configuration
2033 	 */
2034 	if (!(slurmctld_conf.conf_flags & CTL_CONF_OR) &&
2035 	    (select_node_record[index].sockets !=
2036 	     select_node_record[index].node_ptr->config_ptr->sockets) &&
2037 	    (select_node_record[index].cores !=
2038 	     select_node_record[index].node_ptr->config_ptr->cores) &&
2039 	    ((select_node_record[index].sockets *
2040 	      select_node_record[index].cores) ==
2041 	     (select_node_record[index].node_ptr->sockets *
2042 	      select_node_record[index].node_ptr->cores))) {
2043 		select_node_record[index].cores =
2044 			select_node_record[index].node_ptr->config_ptr->cores;
2045 		select_node_record[index].sockets =
2046 			select_node_record[index].node_ptr->config_ptr->sockets;
2047 		/* tot_sockets should be the same */
2048 		/* tot_cores should be the same */
2049 	}
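	/*
	 * Example of the swap above (hypothetical KNL node): booted first
	 * as 4 sockets x 16 cores and again as 1 socket x 64 cores, the
	 * node passes the check (4 * 16 == 1 * 64) and has its cached
	 * socket/core counts refreshed; the total core count, and hence
	 * all core bitmaps, are unchanged.
	 */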
2050 
2051 	return SLURM_SUCCESS;
2052 }
2053 
2054 extern int select_p_reconfigure(void)
2055 {
2056 	ListIterator job_iterator;
2057 	job_record_t *job_ptr;
2058 	int rc = SLURM_SUCCESS;
2059 
2060 	info("%s: reconfigure", plugin_type);
2061 	select_debug_flags = slurm_get_debug_flags();
2062 
2063 	if (is_cons_tres) {
2064 		def_cpu_per_gpu = 0;
2065 		def_mem_per_gpu = 0;
2066 		if (slurmctld_conf.job_defaults_list) {
2067 			def_cpu_per_gpu = common_get_def_cpu_per_gpu(
2068 				slurmctld_conf.job_defaults_list);
2069 			def_mem_per_gpu = common_get_def_mem_per_gpu(
2070 				slurmctld_conf.job_defaults_list);
2071 		}
2072 	}
2073 
2074 	rc = select_p_node_init(node_record_table_ptr, node_record_count);
2075 	if (rc != SLURM_SUCCESS)
2076 		return rc;
2077 
2078 	/* reload job data */
2079 	job_iterator = list_iterator_create(job_list);
2080 	while ((job_ptr = list_next(job_iterator))) {
2081 		if (IS_JOB_RUNNING(job_ptr)) {
2082 			/* add the job */
2083 			job_res_add_job(job_ptr, 0);
2084 		} else if (IS_JOB_SUSPENDED(job_ptr)) {
2085 			/* add the job in a suspended state */
2086 			if (job_ptr->priority == 0)
2087 				(void) job_res_add_job(job_ptr, 1);
2088 			else	/* Gang schedule suspend */
2089 				(void) job_res_add_job(job_ptr, 0);
2090 		}
2091 	}
2092 	list_iterator_destroy(job_iterator);
2093 	select_state_initializing = false;
2094 
2095 	return SLURM_SUCCESS;
2096 }
2097 
2098 extern bitstr_t *select_p_resv_test(resv_desc_msg_t *resv_desc_ptr,
2099 				    uint32_t node_cnt,
2100 				    bitstr_t *avail_node_bitmap,
2101 				    bitstr_t **core_bitmap)
2102 {
2103 	bitstr_t **switches_bitmap;		/* nodes on this switch */
2104 	bitstr_t ***switches_core_bitmap;	/* cores on this switch */
2105 	int       *switches_core_cnt;		/* total cores on switch */
2106 	int       *switches_node_cnt;		/* total nodes on switch */
2107 	int       *switches_required;		/* set if has required node */
2108 
2109 	bitstr_t *avail_nodes_bitmap = NULL;	/* nodes on any switch */
2110 	bitstr_t *picked_node_bitmap;
2111 	uint32_t *core_cnt;
2112 	bitstr_t **exc_core_bitmap = NULL, **picked_core_bitmap;
2113 	int32_t prev_rem_cores, rem_cores = 0, rem_cores_save, rem_nodes;
2114 	uint32_t cores_per_node = 1;	/* Minimum cores per node to consider */
2115 	bool aggr_core_cnt = false, clear_core, sufficient;
2116 	int c, i, i_first, i_last, j, k, n;
2117 	int best_fit_inx, best_fit_nodes;
2118 	int best_fit_location = 0, best_fit_sufficient;
2119 
2120 	xassert(avail_node_bitmap);
2121 	xassert(resv_desc_ptr);
2122 
2123 	/*
2124 	 * FIXME: core_bitmap is a full-system core bitmap to be
2125 	 * replaced with a set of per-node bitmaps in a future release
2126 	 * of Slurm.
2127 	 */
2128 	if (core_bitmap)
2129 		exc_core_bitmap = core_bitmap_to_array(*core_bitmap);
2130 
2131 	core_cnt = resv_desc_ptr->core_cnt;
2132 
2133 	if (core_cnt) {
2134 		/*
2135 		 * Run this now to set up exc_core_bitmap if needed for
2136 		 * pick_first_cores and sequential_pick.
2137 		 */
2138 		if (!exc_core_bitmap)
2139 			exc_core_bitmap = build_core_array();
2140 		(*cons_common_callbacks.spec_core_filter)(
2141 			avail_node_bitmap, exc_core_bitmap);
2142 	}
2143 
2144 	if ((resv_desc_ptr->flags & RESERVE_FLAG_FIRST_CORES) && core_cnt) {
2145 		/* Reservation request with "Flags=first_cores CoreCnt=#" */
2146 		avail_nodes_bitmap = (*cons_common_callbacks.pick_first_cores)(
2147 			avail_node_bitmap,
2148 			node_cnt, core_cnt,
2149 			&exc_core_bitmap);
2150 		if (avail_nodes_bitmap && core_bitmap && exc_core_bitmap) {
2151 			FREE_NULL_BITMAP(*core_bitmap);
2152 			*core_bitmap = core_array_to_bitmap(exc_core_bitmap);
2153 		}
2154 		free_core_array(&exc_core_bitmap);
2155 		return avail_nodes_bitmap;
2156 	}
2157 
2158 	/* When a reservation includes a nodelist we use the _sequential_pick code */
2159 	if (!switch_record_cnt || !switch_record_table || !node_cnt)  {
2160 		/* Reservation request with "Nodes=* [CoreCnt=#]" */
2161 		avail_nodes_bitmap = (*cons_common_callbacks.sequential_pick)(
2162 			avail_node_bitmap,
2163 			node_cnt, core_cnt,
2164 			&exc_core_bitmap);
2165 		if (avail_nodes_bitmap && core_bitmap && exc_core_bitmap) {
2166 			FREE_NULL_BITMAP(*core_bitmap);
2167 			*core_bitmap = core_array_to_bitmap(exc_core_bitmap);
2168 		}
2169 		free_core_array(&exc_core_bitmap);
2170 		return avail_nodes_bitmap;
2171 	}
2172 
2173 	/* Use topology state information */
2174 	if (bit_set_count(avail_node_bitmap) < node_cnt) {
2175 		free_core_array(&exc_core_bitmap);
2176 		return NULL;
2177 	}
2178 
2179 	rem_nodes = node_cnt;
2180 	if (core_cnt && core_cnt[1]) {	/* Array of core counts */
2181 		for (j = 0; core_cnt[j]; j++) {
2182 			rem_cores += core_cnt[j];
2183 			if (j == 0)
2184 				cores_per_node = core_cnt[j];
2185 			else if (cores_per_node > core_cnt[j])
2186 				cores_per_node = core_cnt[j];
2187 		}
2188 	} else if (core_cnt) {		/* Aggregate core count */
2189 		rem_cores = core_cnt[0];
2190 		cores_per_node = core_cnt[0] / MAX(node_cnt, 1);
2191 		aggr_core_cnt = true;
2192 	}
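
	/*
	 * Illustration of the two core_cnt forms (hypothetical requests):
	 * a per-node array {4, 2, 0} gives rem_cores = 6 and
	 * cores_per_node = 2 (the smallest entry), while an aggregate
	 * count {8, 0} with node_cnt = 4 gives rem_cores = 8 and
	 * cores_per_node = 8 / 4 = 2.
	 */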
2193 
2194 	rem_cores_save = rem_cores;
2195 
2196 	/*
2197 	 * Construct a set of switch array entries,
2198 	 * use the same indexes as switch_record_table in slurmctld
2199 	 */
2200 	switches_bitmap = xcalloc(switch_record_cnt, sizeof(bitstr_t *));
2201 	switches_core_bitmap = xcalloc(switch_record_cnt, sizeof(bitstr_t **));
2202 	switches_core_cnt = xcalloc(switch_record_cnt, sizeof(int));
2203 	switches_node_cnt = xcalloc(switch_record_cnt, sizeof(int));
2204 	switches_required = xcalloc(switch_record_cnt, sizeof(int));
2205 
2206 	for (i = 0; i < switch_record_cnt; i++) {
2207 		switches_bitmap[i] =
2208 			bit_copy(switch_record_table[i].node_bitmap);
2209 		bit_and(switches_bitmap[i], avail_node_bitmap);
2210 		switches_node_cnt[i] = bit_set_count(switches_bitmap[i]);
2211 		switches_core_bitmap[i] = common_mark_avail_cores(
2212 			switches_bitmap[i], NO_VAL16);
2213 		if (exc_core_bitmap) {
2214 			core_array_and_not(switches_core_bitmap[i],
2215 					   exc_core_bitmap);
2216 		}
2217 		switches_core_cnt[i] =
2218 			count_core_array_set(switches_core_bitmap[i]);
2219 		debug2("switch:%d nodes:%d cores:%d",
2220 		       i, switches_node_cnt[i], switches_core_cnt[i]);
2221 	}
2222 
2223 	/* Remove nodes with fewer available cores than needed */
2224 	if (core_cnt) {
2225 		n = 0;
2226 
2227 		for (j = 0; j < switch_record_cnt; j++) {
2228 			i_first = bit_ffs(switches_bitmap[j]);
2229 			if (i_first >= 0)
2230 				i_last = bit_fls(switches_bitmap[j]);
2231 			else
2232 				i_last = i_first - 1;
2233 			for (i = i_first; i <= i_last; i++) {
2234 				if (!bit_test(switches_bitmap[j], i))
2235 					continue;
2236 
2237 				c = _get_avail_cores_on_node(
2238 					i, exc_core_bitmap);
2239 
2240 				clear_core = false;
2241 				if (aggr_core_cnt && (c < cores_per_node)) {
2242 					clear_core = true;
2243 				} else if (aggr_core_cnt) {
2244 					;
2245 				} else if (c < core_cnt[n]) {
2246 					clear_core = true;
2247 				} else if (core_cnt[n]) {
2248 					n++;
2249 				}
2250 				if (!clear_core)
2251 					continue;
2252 				for (k = 0; k < switch_record_cnt; k++) {
2253 					if (!switches_bitmap[k] ||
2254 					    !bit_test(switches_bitmap[k], i))
2255 						continue;
2256 					bit_clear(switches_bitmap[k], i);
2257 					switches_node_cnt[k]--;
2258 					switches_core_cnt[k] -= c;
2259 				}
2260 			}
2261 		}
2262 	}
2263 
2264 #if SELECT_DEBUG
2265 	/* Normally compiled out (SELECT_DEBUG); it slows things down too much */
2266 	for (i = 0; i < switch_record_cnt; i++) {
2267 		char *node_names = NULL;
2268 		if (switches_node_cnt[i])
2269 			node_names = bitmap2node_name(switches_bitmap[i]);
2270 		info("switch=%s nodes=%u:%s cores:%d required:%u speed=%u",
2271 		     switch_record_table[i].name,
2272 		     switches_node_cnt[i], node_names,
2273 		     switches_core_cnt[i], switches_required[i],
2274 		     switch_record_table[i].link_speed);
2275 		xfree(node_names);
2276 	}
2277 #endif
2278 
2279 	/* Determine lowest level switch satisfying request with best fit */
2280 	best_fit_inx = -1;
2281 	for (j = 0; j < switch_record_cnt; j++) {
2282 		if ((switches_node_cnt[j] < rem_nodes) ||
2283 		    (core_cnt && (switches_core_cnt[j] < rem_cores)))
2284 			continue;
2285 		if ((best_fit_inx == -1) ||
2286 		    (switch_record_table[j].level <
2287 		     switch_record_table[best_fit_inx].level) ||
2288 		    ((switch_record_table[j].level ==
2289 		      switch_record_table[best_fit_inx].level) &&
2290 		     (switches_node_cnt[j] < switches_node_cnt[best_fit_inx])))
2291 			/* We should use core count by switch here as well */
2292 			best_fit_inx = j;
2293 	}
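	/*
	 * E.g. (hypothetical topology): among switches that can still satisfy
	 * the request, a leaf beats a higher-level switch outright, and
	 * between two qualifying switches at the same level the one with
	 * fewer available nodes wins, limiting fragmentation.
	 */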
2294 	if (best_fit_inx == -1) {
2295 		debug("%s: could not find resources for reservation", __func__);
2296 		goto fini;
2297 	}
2298 
2299 	/* Identify usable leaves (within the higher-level switch having the best fit) */
2300 	for (j = 0; j < switch_record_cnt; j++) {
2301 		if ((switch_record_table[j].level != 0) ||
2302 		    (!bit_super_set(switches_bitmap[j],
2303 				    switches_bitmap[best_fit_inx]))) {
2304 			switches_node_cnt[j] = 0;
2305 		}
2306 	}
2307 
2308 	/* Select resources from these leafs on a best-fit basis */
2309 	avail_nodes_bitmap = bit_alloc(node_record_count);
2310 	while (rem_nodes > 0) {
2311 		best_fit_nodes = best_fit_sufficient = 0;
2312 		for (j = 0; j < switch_record_cnt; j++) {
2313 			if (switches_node_cnt[j] == 0)
2314 				continue;
2315 			if (core_cnt) {
2316 				sufficient =
2317 					(switches_node_cnt[j] >= rem_nodes) &&
2318 					(switches_core_cnt[j] >= rem_cores);
2319 			} else
2320 				sufficient = switches_node_cnt[j] >= rem_nodes;
2321 			/*
2322 			 * If first possibility OR
2323 			 * first set large enough for request OR
2324 			 * tightest fit (less resource waste) OR
2325 			 * nothing yet large enough, but this is biggest
2326 			 */
2327 			if ((best_fit_nodes == 0) ||
2328 			    (sufficient && (best_fit_sufficient == 0)) ||
2329 			    (sufficient &&
2330 			     (switches_node_cnt[j] < best_fit_nodes)) ||
2331 			    ((sufficient == 0) &&
2332 			     (switches_node_cnt[j] > best_fit_nodes))) {
2333 				best_fit_nodes = switches_node_cnt[j];
2334 				best_fit_location = j;
2335 				best_fit_sufficient = sufficient;
2336 			}
2337 		}
2338 		if (best_fit_nodes == 0)
2339 			break;
2340 		/* Select nodes from this leaf */
2341 		i_first = bit_ffs(switches_bitmap[best_fit_location]);
2342 		if (i_first >= 0)
2343 			i_last = bit_fls(switches_bitmap[best_fit_location]);
2344 		else
2345 			i_last = i_first - 1;
2346 
2347 		for (i = i_first; i <= i_last; i++) {
2348 			if (!bit_test(switches_bitmap[best_fit_location], i))
2349 				continue;
2350 			bit_clear(switches_bitmap[best_fit_location], i);
2351 			switches_node_cnt[best_fit_location]--;
2352 
2353 			if (bit_test(avail_nodes_bitmap, i)) {
2354 				/*
2355 				 * node on multiple leaf switches
2356 				 * and already selected
2357 				 */
2358 				continue;
2359 			}
2360 
2361 			if (core_cnt) {
2362 				c = _get_avail_cores_on_node(i,
2363 				                             exc_core_bitmap);
2364 				if (c < cores_per_node)
2365 					continue;
2366 				debug2("Using node %d with %d cores available",
2367 				       i, c);
2368 				rem_cores -= c;
2369 			}
2370 			bit_set(avail_nodes_bitmap, i);
2371 			if (--rem_nodes <= 0)
2372 				break;
2373 		}
2374 		switches_node_cnt[best_fit_location] = 0;
2375 	}
2376 
2377 	if ((rem_nodes > 0) || (rem_cores > 0))	/* insufficient resources */
2378 		FREE_NULL_BITMAP(avail_nodes_bitmap);
2379 
2380 fini:	for (i = 0; i < switch_record_cnt; i++) {
2381 		FREE_NULL_BITMAP(switches_bitmap[i]);
2382 		free_core_array(&switches_core_bitmap[i]);
2383 	}
2384 	xfree(switches_bitmap);
2385 	xfree(switches_core_bitmap);
2386 	xfree(switches_core_cnt);
2387 	xfree(switches_node_cnt);
2388 	xfree(switches_required);
2389 
2390 	if (avail_nodes_bitmap && core_cnt) {
2391 		/* Reservation is using partial nodes */
2392 		picked_node_bitmap = bit_alloc(bit_size(avail_node_bitmap));
2393 		picked_core_bitmap = build_core_array();
2394 
2395 		rem_cores = rem_cores_save;
2396 		n = 0;
2397 		prev_rem_cores = -1;
2398 
2399 		while (rem_cores) {
2400 			int avail_cores_in_node, inx, coff;
2401 			bitstr_t *use_exc_bitmap = NULL,
2402 				*use_picked_bitmap = NULL;
2403 
2404 			inx = bit_ffs(avail_nodes_bitmap);
2405 			if ((inx < 0) && aggr_core_cnt && (rem_cores > 0) &&
2406 			    (rem_cores != prev_rem_cores)) {
2407 				/*
2408 				 * Make another pass over nodes to reach
2409 				 * requested aggregate core count
2410 				 */
2411 				bit_or(avail_nodes_bitmap, picked_node_bitmap);
2412 				inx = bit_ffs(avail_nodes_bitmap);
2413 				prev_rem_cores = rem_cores;
2414 				cores_per_node = 1;
2415 			}
2416 			if (inx < 0)
2417 				break;
2418 
2419 			debug2("Using node inx:%d cores_per_node:%d rem_cores:%u",
2420 			       inx, cores_per_node, rem_cores);
2421 
2422 			/* Clear this node from the initial available bitmap */
2423 			bit_clear(avail_nodes_bitmap, inx);
2424 
2425 			if (select_node_record[inx].tot_cores < cores_per_node)
2426 				continue;
2427 			avail_cores_in_node =
2428 				_get_avail_cores_on_node(inx, exc_core_bitmap);
2429 
2430 			debug2("Node inx:%d has %d available cores", inx,
2431 			       avail_cores_in_node);
2432 			if (avail_cores_in_node < cores_per_node)
2433 				continue;
2434 
2435 			xassert(exc_core_bitmap);
2436 
2437 			avail_cores_in_node = 0;
2438 
2439 			if (!is_cons_tres) {
2440 				use_exc_bitmap = *exc_core_bitmap;
2441 				coff = cr_get_coremap_offset(inx);
2442 				if (!*picked_core_bitmap)
2443 					*picked_core_bitmap = bit_alloc(
2444 						bit_size(use_exc_bitmap));
2445 				use_picked_bitmap = *picked_core_bitmap;
2446 			} else {
2447 				use_exc_bitmap = exc_core_bitmap[inx];
2448 				coff = 0;
2449 				if (!picked_core_bitmap[inx]) {
2450 					picked_core_bitmap[inx] = bit_alloc(
2451 						select_node_record[inx].
2452 						tot_cores);
2453 				}
2454 				use_picked_bitmap = picked_core_bitmap[inx];
2455 			}
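			/*
			 * Note: coff is the node's offset into the single
			 * system-wide core bitmap for cons_res, but zero for
			 * cons_tres (per-node bitmaps), so "coff + i" below
			 * indexes the correct bit in either representation.
			 */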
2456 
2457 			for (int i = 0;
2458 			     i < select_node_record[inx].tot_cores;
2459 			     i++) {
2460 				int set = coff + i;
2461 				if ((!use_exc_bitmap ||
2462 				     !bit_test(use_exc_bitmap, set)) &&
2463 				    !bit_test(use_picked_bitmap, set)) {
2464 					bit_set(use_picked_bitmap, set);
2465 					rem_cores--;
2466 					avail_cores_in_node++;
2467 				}
2468 				if (rem_cores == 0)
2469 					break;
2470 				if (aggr_core_cnt &&
2471 				    (avail_cores_in_node >= cores_per_node))
2472 					break;
2473 				if (!aggr_core_cnt &&
2474 				    (avail_cores_in_node >= core_cnt[n]))
2475 					break;
2476 			}
2477 
2478 			/* Add this node to the final node bitmap */
2479 			if (avail_cores_in_node)
2480 				bit_set(picked_node_bitmap, inx);
2481 			n++;
2482 		}
2483 		FREE_NULL_BITMAP(avail_nodes_bitmap);
2484 		free_core_array(&exc_core_bitmap);
2485 
2486 		if (rem_cores) {
2487 			info("reservation request cannot be satisfied");
2488 			FREE_NULL_BITMAP(picked_node_bitmap);
2489 			picked_node_bitmap = NULL;
2490 		} else {
2491 			*core_bitmap = core_array_to_bitmap(picked_core_bitmap);
2492 		}
2493 		free_core_array(&picked_core_bitmap);
2494 		return picked_node_bitmap;
2495 	}
2496 	free_core_array(&exc_core_bitmap);
2497 
2498 	return avail_nodes_bitmap;
2499 }
2500