1 /*****************************************************************************\
2 * cons_common.c - Common function interface for the select/cons_* plugins
3 *****************************************************************************
4 * Copyright (C) 2019 SchedMD LLC
5 * Derived in large part from select/cons_[res|tres] plugins
6 *
7 * This file is part of Slurm, a resource management program.
8 * For details, see <https://slurm.schedmd.com/>.
9 * Please also read the included file: DISCLAIMER.
10 *
11 * Slurm is free software; you can redistribute it and/or modify it under
12 * the terms of the GNU General Public License as published by the Free
13 * Software Foundation; either version 2 of the License, or (at your option)
14 * any later version.
15 *
16 * In addition, as a special exception, the copyright holders give permission
17 * to link the code of portions of this program with the OpenSSL library under
18 * certain conditions as described in each individual source file, and
19 * distribute linked combinations including the two. You must obey the GNU
20 * General Public License in all respects for all of the code used other than
21 * OpenSSL. If you modify file(s) with this exception, you may extend this
22 * exception to your version of the file(s), but you are not obligated to do
23 * so. If you do not wish to do so, delete this exception statement from your
24 * version. If you delete this exception statement from all source files in
25 * the program, then also delete it here.
26 *
27 * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
28 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
29 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
30 * details.
31 *
32 * You should have received a copy of the GNU General Public License along
33 * with Slurm; if not, write to the Free Software Foundation, Inc.,
34 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
35 \*****************************************************************************/
36
37 #define _GNU_SOURCE
38
39 #include "src/common/slurm_xlator.h"
40
41 #include "cons_common.h"
42
43 #include "src/common/assoc_mgr.h"
44 #include "src/common/slurm_selecttype_info.h"
45 #include "src/common/slurm_topology.h"
46
47 /* These are defined here so when we link with something other than
48 * the slurmctld we will have these symbols defined. They will get
49 * overwritten when linking with the slurmctld.
50 */
51 #if defined (__APPLE__)
52 extern slurm_ctl_conf_t slurmctld_conf __attribute__((weak_import));
53 extern node_record_t *node_record_table_ptr __attribute__((weak_import));
54 extern List part_list __attribute__((weak_import));
55 extern List job_list __attribute__((weak_import));
56 extern int node_record_count __attribute__((weak_import));
57 extern time_t last_node_update __attribute__((weak_import));
58 extern switch_record_t *switch_record_table __attribute__((weak_import));
59 extern int switch_record_cnt __attribute__((weak_import));
60 extern bitstr_t *avail_node_bitmap __attribute__((weak_import));
61 extern uint16_t *cr_node_num_cores __attribute__((weak_import));
62 extern uint32_t *cr_node_cores_offset __attribute__((weak_import));
63 extern int slurmctld_tres_cnt __attribute__((weak_import));
64 extern slurmctld_config_t slurmctld_config __attribute__((weak_import));
65 extern bitstr_t *idle_node_bitmap __attribute__((weak_import));
66 #else
67 slurm_ctl_conf_t slurmctld_conf;
68 node_record_t *node_record_table_ptr;
69 List part_list;
70 List job_list;
71 int node_record_count;
72 time_t last_node_update;
73 switch_record_t *switch_record_table;
74 int switch_record_cnt;
75 bitstr_t *avail_node_bitmap;
76 uint16_t *cr_node_num_cores;
77 uint32_t *cr_node_cores_offset;
78 int slurmctld_tres_cnt = 0;
79 slurmctld_config_t slurmctld_config;
80 bitstr_t *idle_node_bitmap;
81 #endif
82
83 /* init common global variables */
84 bool backfill_busy_nodes = false;
85 int bf_window_scale = 0;
86 cons_common_callbacks_t cons_common_callbacks = {0};
87 int core_array_size = 1;
88 uint16_t cr_type = CR_CPU; /* cr_type is overwritten in init() */
89 bool gang_mode = false;
90 bool have_dragonfly = false;
91 bool is_cons_tres = false;
92 bool pack_serial_at_end = false;
93 bool preempt_by_part = false;
94 bool preempt_by_qos = false;
95 uint16_t priority_flags = 0;
96 uint64_t select_debug_flags = 0;
97 int select_node_cnt = 0;
98 bool spec_cores_first = false;
99 bool topo_optional = false;
100
101 /* Global variables */
102
_create_job_resources(int node_cnt)103 static job_resources_t *_create_job_resources(int node_cnt)
104 {
105 job_resources_t *job_resrcs_ptr;
106
107 job_resrcs_ptr = create_job_resources();
108 job_resrcs_ptr->cpu_array_reps = xcalloc(node_cnt, sizeof(uint32_t));
109 job_resrcs_ptr->cpu_array_value = xcalloc(node_cnt, sizeof(uint16_t));
110 job_resrcs_ptr->cpus = xcalloc(node_cnt, sizeof(uint16_t));
111 job_resrcs_ptr->cpus_used = xcalloc(node_cnt, sizeof(uint16_t));
112 job_resrcs_ptr->memory_allocated = xcalloc(node_cnt, sizeof(uint64_t));
113 job_resrcs_ptr->memory_used = xcalloc(node_cnt, sizeof(uint64_t));
114 job_resrcs_ptr->nhosts = node_cnt;
115 return job_resrcs_ptr;
116 }
117
/*
 * Return the number of cores on a node that are not excluded.
 *
 * node_inx   IN - index of the node in select_node_record
 * exc_bitmap IN - OPTIONAL exclusion bitmap(s); for cons_tres this is a
 *                 per-node array indexed by node, otherwise a single
 *                 system-wide core bitmap addressed via the core-map offset
 * RET count of available (non-excluded) cores on the node
 */
static int _get_avail_cores_on_node(int node_inx, bitstr_t **exc_bitmap)
{
	int exc_cnt = 0, tot_cores;

	/*
	 * Valid indices are [0, select_node_cnt); the previous "<=" test
	 * would have let node_inx == select_node_cnt read past the end of
	 * select_node_record.
	 */
	xassert(node_inx < select_node_cnt);

	tot_cores = select_node_record[node_inx].tot_cores;

	if (!exc_bitmap)
		return tot_cores;	/* nothing excluded */

	if (is_cons_tres) {
		/* cons_tres: one bitmap per node */
		if (exc_bitmap[node_inx])
			exc_cnt += bit_set_count(exc_bitmap[node_inx]);
	} else if (*exc_bitmap) {
		/* cons_res: single bitmap spanning all nodes */
		int coff = cr_get_coremap_offset(node_inx);
		for (int i = 0; i < tot_cores; i++) {
			if (bit_test(*exc_bitmap, coff + i))
				exc_cnt++;
		}
	}
	return tot_cores - exc_cnt;
}
141
/* Translate a NODE_CR_* state value into a short descriptive string. */
extern char *common_node_state_str(uint16_t node_state)
{
	char *state_str = "available";	/* Idle or in-use (shared) */

	if (node_state >= NODE_CR_RESERVED)
		state_str = "reserved";	/* Exclusive allocation */
	else if (node_state >= NODE_CR_ONE_ROW)
		state_str = "one_row";	/* Dedicated core for this partition */

	return state_str;
}
150
_dump_job_res(struct job_resources * job)151 static void _dump_job_res(struct job_resources *job)
152 {
153 char str[64];
154
155 if (job->core_bitmap)
156 bit_fmt(str, sizeof(str), job->core_bitmap);
157 else
158 sprintf(str, "[no core_bitmap]");
159 info("DEBUG: Dump job_resources: nhosts %u core_bitmap %s",
160 job->nhosts, str);
161 }
162
163 /*
164 * _allocate_sc - Given the job requirements, determine which CPUs/cores
165 * from the given node can be allocated (if any) to this
166 * job. Returns structure identifying the usable resources and
167 * a bitmap of the available cores.
168 *
169 * IN job_ptr - pointer to job requirements
170 * IN/OUT core_map - core_bitmap of available cores on this node
171 * IN part_core_map - bitmap of cores already allocated on this partition/node
172 * IN node_i - index of node to be evaluated
173 * IN/OUT cpu_alloc_size - minimum allocation size, in CPUs
174 * IN entire_sockets_only - if true, allocate cores only on sockets that
175 * have no other allocated cores.
176 * IN req_sock_map - OPTIONAL bitmap of required sockets
177 * RET resource availability structure, call common_free_avail_res() to free
178 */
static avail_res_t *_allocate_sc(job_record_t *job_ptr, bitstr_t *core_map,
				 bitstr_t *part_core_map, const uint32_t node_i,
				 int *cpu_alloc_size, bool entire_sockets_only,
				 bitstr_t *req_sock_map)
{
	uint16_t cpu_count = 0, cpu_cnt = 0, part_cpu_limit = 0xffff;
	uint16_t si, cps, avail_cpus = 0, num_tasks = 0;
	uint32_t c;
	uint32_t core_begin;
	uint32_t core_end;
	struct job_details *details_ptr = job_ptr->details;
	uint16_t cpus_per_task = details_ptr->cpus_per_task;
	uint16_t free_core_count = 0, spec_threads = 0;
	uint16_t i, j;
	uint16_t sockets = select_node_record[node_i].tot_sockets;
	uint16_t cores_per_socket = select_node_record[node_i].cores;
	uint16_t threads_per_core = select_node_record[node_i].vpus;
	uint16_t min_cores = 1, min_sockets = 1, ntasks_per_socket = 0;
	uint16_t ncpus_per_core = 0xffff;	/* Usable CPUs per core */
	uint16_t ntasks_per_core = 0xffff;
	uint32_t free_cpu_count = 0, used_cpu_count = 0;
	int tmp_cpt = 0; /* cpus_per_task */
	/* Per-socket accounting (VLAs sized by this node's socket count) */
	uint16_t free_cores[sockets];
	uint16_t used_cores[sockets];
	uint32_t used_cpu_array[sockets];
	avail_res_t *avail_res;


	/*
	 * cons_tres uses per-node core bitmaps; cons_res uses one global
	 * bitmap addressed by the node's core-map offset.
	 */
	if (is_cons_tres) {
		core_begin = 0;
		core_end = select_node_record[node_i].tot_cores;
	} else {
		core_begin = cr_get_coremap_offset(node_i);
		core_end = cr_get_coremap_offset(node_i+1);
	}

	memset(free_cores, 0, sockets * sizeof(uint16_t));
	memset(used_cores, 0, sockets * sizeof(uint16_t));
	memset(used_cpu_array, 0, sockets * sizeof(uint32_t));

	if (entire_sockets_only && details_ptr->whole_node &&
	    (details_ptr->core_spec != NO_VAL16)) {
		/* Ignore specialized cores when allocating "entire" socket */
		entire_sockets_only = false;
	}
	if (details_ptr->mc_ptr) {
		uint32_t threads_per_socket;
		multi_core_data_t *mc_ptr = details_ptr->mc_ptr;
		if (mc_ptr->cores_per_socket != NO_VAL16) {
			min_cores = mc_ptr->cores_per_socket;
		}
		if (mc_ptr->sockets_per_node != NO_VAL16) {
			min_sockets = mc_ptr->sockets_per_node;
		}
		if ((mc_ptr->ntasks_per_core != INFINITE16) &&
		    (mc_ptr->ntasks_per_core)) {
			ntasks_per_core = mc_ptr->ntasks_per_core;
			ncpus_per_core = MIN(threads_per_core,
					     (ntasks_per_core * cpus_per_task));
		}
		if ((mc_ptr->threads_per_core != NO_VAL16) &&
		    (mc_ptr->threads_per_core < ncpus_per_core)) {
			ncpus_per_core = mc_ptr->threads_per_core;
		}
		*cpu_alloc_size = MIN(*cpu_alloc_size, ncpus_per_core);
		ntasks_per_socket = mc_ptr->ntasks_per_socket;

		/* Request exceeds the hardware: bail out with num_tasks = 0 */
		if ((ncpus_per_core != NO_VAL16) &&
		    (ncpus_per_core != INFINITE16) &&
		    (ncpus_per_core > threads_per_core)) {
			goto fini;
		}
		threads_per_socket = threads_per_core * cores_per_socket;
		if ((ntasks_per_socket != NO_VAL16) &&
		    (ntasks_per_socket != INFINITE16) &&
		    (ntasks_per_socket > threads_per_socket)) {
			goto fini;
		}
	}

	/*
	 * These are the job parameters that we must respect:
	 *
	 *   details_ptr->mc_ptr->cores_per_socket  (cr_core|cr_socket)
	 *	- min # of cores per socket to allocate to this job
	 *   details_ptr->mc_ptr->sockets_per_node  (cr_core|cr_socket)
	 *	- min # of sockets per node to allocate to this job
	 *   details_ptr->mc_ptr->ntasks_per_core   (cr_core|cr_socket)
	 *	- number of tasks to launch per core
	 *   details_ptr->mc_ptr->ntasks_per_socket (cr_core|cr_socket)
	 *	- number of tasks to launch per socket
	 *
	 *   details_ptr->ntasks_per_node (all cr_types)
	 *	- total number of tasks to launch on this node
	 *   details_ptr->cpus_per_task (all cr_types)
	 *	- number of cpus to allocate per task
	 *
	 * These are the hardware constraints:
	 *   cpus = sockets * cores_per_socket * threads_per_core
	 *
	 * These are the cores/sockets that are available: core_map
	 *
	 * NOTE: currently we only allocate at the socket level, the core
	 *	 level, or the cpu level. When hyperthreading is enabled
	 *	 in the BIOS, then there can be more than one thread/cpu
	 *	 per physical core.
	 *
	 * PROCEDURE:
	 *
	 * Step 1: Determine the current usage data: used_cores[],
	 *         used_core_count, free_cores[], free_core_count
	 *
	 * Step 2: For core-level and socket-level: apply sockets_per_node
	 *         and cores_per_socket to the "free" cores.
	 *
	 * Step 3: Compute task-related data: ncpus_per_core,
	 *         ntasks_per_socket, ntasks_per_node and cpus_per_task
	 *         and determine the number of tasks to run on this node
	 *
	 * Step 4: Mark the allocated resources in the job_cores bitmap
	 *         and return "num_tasks" from Step 3.
	 *
	 *
	 * For socket and core counts, start by assuming that all available
	 * resources will be given to the job. Check min_* to ensure that
	 * there's enough resources. Reduce the resource count to match max_*
	 * (if necessary). Also reduce resource count (if necessary) to
	 * match ntasks_per_resource.
	 */

	/*
	 * Step 1: create and compute core-count-per-socket
	 * arrays and total core counts
	 */
	for (c = core_begin; c < core_end; c++) {
		/* Map the core index onto its socket index */
		i = (uint16_t) ((c - core_begin) / cores_per_socket);
		if (bit_test(core_map, c)) {
			free_cores[i]++;
			free_core_count++;
		} else if (!part_core_map) {
			used_cores[i]++;
		} else if (bit_test(part_core_map, c)) {
			used_cores[i]++;
			used_cpu_array[i]++;
		}
	}

	for (i = 0; i < sockets; i++) {
		/*
		 * if a socket is already in use and entire_sockets_only is
		 * enabled, it cannot be used by this job
		 */
		if (entire_sockets_only && used_cores[i]) {
			free_core_count -= free_cores[i];
			used_cores[i] += free_cores[i];
			free_cores[i] = 0;
		}
		free_cpu_count += free_cores[i] * threads_per_core;
		if (used_cpu_array[i])
			used_cpu_count += used_cores[i] * threads_per_core;
	}

	/* Enforce partition CPU limit, but do not pick specific cores yet */
	if ((job_ptr->part_ptr->max_cpus_per_node != INFINITE) &&
	    (free_cpu_count + used_cpu_count >
	     job_ptr->part_ptr->max_cpus_per_node)) {

		if (is_cons_tres) {
			if (used_cpu_count >=
			    job_ptr->part_ptr->max_cpus_per_node) {
				/* no available CPUs on this node */
				num_tasks = 0;
				goto fini;
			}
			part_cpu_limit = job_ptr->part_ptr->max_cpus_per_node -
				used_cpu_count;
			if ((part_cpu_limit == 1) &&
			    (((ntasks_per_core != 0xffff) &&
			      (ntasks_per_core > part_cpu_limit)) ||
			     (ntasks_per_socket > part_cpu_limit) ||
			     ((ncpus_per_core != 0xffff) &&
			      (ncpus_per_core > part_cpu_limit)) ||
			     (cpus_per_task > part_cpu_limit))) {
				/* insufficient available CPUs on this node */
				num_tasks = 0;
				goto fini;
			}
		} else {
			/*
			 * cons_res: trim free cores, preferring the sockets
			 * with the fewest free cores, until the partition's
			 * per-node CPU limit is satisfied.
			 */
			int excess = free_cpu_count + used_cpu_count -
				job_ptr->part_ptr->max_cpus_per_node;
			int min_excess_cores = min_cores;
			int found_cores;
			excess = (excess + threads_per_core - 1) /
				threads_per_core;
			while (excess > 0) {
				int min_free_inx = -1;
				for (i = 0; i < sockets; i++) {
					if (free_cores[i] == 0)
						continue;
					if (((min_excess_cores > 1) ||
					     (min_sockets > 1)) &&
					    (free_cores[i] <= min_excess_cores))
						continue;
					if ((min_free_inx == -1) ||
					    (free_cores[i] <
					     free_cores[min_free_inx]))
						min_free_inx = i;
				}
				if (min_free_inx == -1) {
					if (min_excess_cores) {
						min_excess_cores = 0;
						continue;
					}
					break;
				}
				if (free_cores[min_free_inx] < excess)
					found_cores = free_cores[min_free_inx];
				else
					found_cores = excess;
				if (min_excess_cores > 1 &&
				    ((free_cores[min_free_inx] - found_cores) <
				     min_excess_cores)) {
					found_cores = free_cores[min_free_inx] -
						min_excess_cores;
				}
				free_core_count -= found_cores;
				free_cpu_count -= (found_cores *
						   threads_per_core);
				free_cores[min_free_inx] -= found_cores;
				excess -= found_cores;
			}
		}
	}

	/* Step 2: check min_cores per socket and min_sockets per node */
	j = 0;
	for (i = 0; i < sockets; i++) {
		if (free_cores[i] < min_cores) {
			/* cannot use this socket */
			free_core_count -= free_cores[i];
			free_cores[i] = 0;
			continue;
		}
		/* count this socket as usable */
		j++;
	}
	if (j < min_sockets) {
		/* cannot use this node */
		num_tasks = 0;
		goto fini;
	}

	if (free_core_count < 1) {
		/* no available resources on this node */
		num_tasks = 0;
		goto fini;
	}

	/*
	 * Step 3: Compute task-related data:
	 *         ntasks_per_socket, ntasks_per_node and cpus_per_task
	 *         to determine the number of tasks to run on this node
	 *
	 * Note: cpus_per_task and ncpus_per_core need to play nice
	 *       2 tasks_per_core vs. 2 cpus_per_task
	 */
	avail_cpus = 0;
	num_tasks = 0;
	threads_per_core = common_cpus_per_core(details_ptr, node_i);

	for (i = 0; i < sockets; i++) {
		uint16_t tmp = free_cores[i] * threads_per_core;
		if ((tmp == 0) && req_sock_map && bit_test(req_sock_map, i)) {
			/* no available resources on required socket */
			num_tasks = 0;
			goto fini;
		}
		avail_cpus += tmp;
		if (ntasks_per_socket)
			num_tasks += MIN(tmp, ntasks_per_socket);
		else
			num_tasks += tmp;
	}

	/*
	 * If job requested exclusive rights to the node don't do the min
	 * here since it will make it so we don't allocate the entire node.
	 */
	if (details_ptr->ntasks_per_node && details_ptr->share_res)
		num_tasks = MIN(num_tasks, details_ptr->ntasks_per_node);

	if (cpus_per_task < 2) {
		avail_cpus = num_tasks;
	} else if ((ntasks_per_core == 1) &&
		   (cpus_per_task > threads_per_core)) {
		/* find out how many cores a task will use */
		int task_cores = (cpus_per_task + threads_per_core - 1) /
			threads_per_core;
		int task_cpus = task_cores * threads_per_core;
		/* find out how many tasks can fit on a node */
		int tasks = avail_cpus / task_cpus;
		/* how many cpus the job would use on the node */
		avail_cpus = tasks * task_cpus;
		/* subtract out the extra cpus. */
		avail_cpus -= (tasks * (task_cpus - cpus_per_task));
	} else {
		j = avail_cpus / cpus_per_task;
		num_tasks = MIN(num_tasks, j);
		avail_cpus = num_tasks * cpus_per_task;
	}

	if ((details_ptr->ntasks_per_node &&
	     (num_tasks < details_ptr->ntasks_per_node) &&
	     (details_ptr->overcommit == 0)) ||
	    (details_ptr->pn_min_cpus &&
	     (avail_cpus < details_ptr->pn_min_cpus))) {
		/* insufficient resources on this node */
		num_tasks = 0;
		goto fini;
	}

	/*
	 * Step 4 - make sure that ntasks_per_socket is enforced when
	 *          allocating cores
	 */
	if ((ntasks_per_socket != NO_VAL16) &&
	    (ntasks_per_socket != INFINITE16) &&
	    (ntasks_per_socket >= 1)) {
		cps = ntasks_per_socket;
		if (cpus_per_task > 1)
			cps *= cpus_per_task;
	} else
		cps = cores_per_socket * threads_per_core;

	si = 9999;	/* sentinel: no socket in use yet */
	tmp_cpt = cpus_per_task;
	for (c = core_begin; c < core_end && (avail_cpus > 0); c++) {
		if (!bit_test(core_map, c))
			continue;

		/* Socket index */
		i = (uint16_t) ((c - core_begin) / cores_per_socket);
		if (free_cores[i] > 0) {
			/*
			 * this socket has free cores, but make sure we don't
			 * use more than are needed for ntasks_per_socket
			 */
			if (si != i) {	/* Start use of next socket */
				si = i;
				cpu_cnt = threads_per_core;
			} else {	/* Continued use of same socket */
				if (cpu_cnt >= cps) {
					/* do not allocate this core */
					bit_clear(core_map, c);
					continue;
				}
				cpu_cnt += threads_per_core;
			}
			free_cores[i]--;
			/*
			 * we have to ensure that cpu_count is not bigger than
			 * avail_cpus due to hyperthreading or this would break
			 * the selection logic providing more CPUs than allowed
			 * after task-related data processing of stage 3
			 */
			if (avail_cpus >= threads_per_core) {
				int used;
				if (is_cons_tres &&
				    (slurmctld_conf.select_type_param &
				     CR_ONE_TASK_PER_CORE) &&
				    (details_ptr->min_gres_cpu > 0)) {
					used = threads_per_core;
				} else if ((ntasks_per_core == 1) &&
					   (cpus_per_task > threads_per_core)) {
					used = MIN(tmp_cpt, threads_per_core);
				} else
					used = threads_per_core;
				avail_cpus -= used;
				cpu_count += used;
				if (tmp_cpt <= used)
					tmp_cpt = cpus_per_task;
				else
					tmp_cpt -= used;
			} else {
				cpu_count += avail_cpus;
				avail_cpus = 0;
			}

		} else
			bit_clear(core_map, c);
	}
	/* clear leftovers */
	if (c < core_end)
		bit_nclear(core_map, c, core_end - 1);

fini:
	/* if num_tasks == 0 then clear all bits on this node */
	if (num_tasks == 0) {
		bit_nclear(core_map, core_begin, core_end-1);
		cpu_count = 0;
	}

	if ((details_ptr->core_spec != NO_VAL16) &&
	    (details_ptr->core_spec & CORE_SPEC_THREAD) &&
	    ((select_node_record[node_i].threads == 1) ||
	     (select_node_record[node_i].threads ==
	      select_node_record[node_i].vpus))) {
		/*
		 * NOTE: Currently does not support the situation when Slurm
		 * allocates by core, the thread specialization count occupies
		 * a full core
		 */
		c = details_ptr->core_spec & (~CORE_SPEC_THREAD);
		if (((cpu_count + c) <= select_node_record[node_i].cpus))
			;
		else if (cpu_count > c)
			spec_threads = c;
		else
			spec_threads = cpu_count;
	}
	cpu_count -= spec_threads;

	/* Build the result; caller frees with common_free_avail_res() */
	avail_res = xmalloc(sizeof(avail_res_t));
	avail_res->max_cpus = MIN(cpu_count, part_cpu_limit);

	if (is_cons_tres) {
		avail_res->min_cpus = *cpu_alloc_size;
		avail_res->avail_cores_per_sock =
			xcalloc(sockets, sizeof(uint16_t));
		for (c = core_begin; c < core_end; c++) {
			i = (uint16_t) ((c - core_begin) / cores_per_socket);
			if (bit_test(core_map, c))
				avail_res->avail_cores_per_sock[i]++;
		}
		avail_res->sock_cnt = sockets;
		avail_res->spec_threads = spec_threads;
		avail_res->vpus = select_node_record[node_i].vpus;
	}

	return avail_res;
}
620
621 /*
622 * Get configured DefCpuPerGPU information from a list
623 * (either global or per partition list)
624 * Returns NO_VAL64 if configuration parameter not set
625 */
common_get_def_cpu_per_gpu(List job_defaults_list)626 extern uint64_t common_get_def_cpu_per_gpu(List job_defaults_list)
627 {
628 uint64_t cpu_per_gpu = NO_VAL64;
629 ListIterator iter;
630 job_defaults_t *job_defaults;
631
632 if (!job_defaults_list)
633 return cpu_per_gpu;
634
635 iter = list_iterator_create(job_defaults_list);
636 while ((job_defaults = (job_defaults_t *) list_next(iter))) {
637 if (job_defaults->type == JOB_DEF_CPU_PER_GPU) {
638 cpu_per_gpu = job_defaults->value;
639 break;
640 }
641 }
642 list_iterator_destroy(iter);
643
644 return cpu_per_gpu;
645 }
646
647 /*
648 * Get configured DefMemPerGPU information from a list
649 * (either global or per partition list)
650 * Returns NO_VAL64 if configuration parameter not set
651 */
common_get_def_mem_per_gpu(List job_defaults_list)652 extern uint64_t common_get_def_mem_per_gpu(List job_defaults_list)
653 {
654 uint64_t mem_per_gpu = NO_VAL64;
655 ListIterator iter;
656 job_defaults_t *job_defaults;
657
658 if (!job_defaults_list)
659 return mem_per_gpu;
660
661 iter = list_iterator_create(job_defaults_list);
662 while ((job_defaults = (job_defaults_t *) list_next(iter))) {
663 if (job_defaults->type == JOB_DEF_MEM_PER_GPU) {
664 mem_per_gpu = job_defaults->value;
665 break;
666 }
667 }
668 list_iterator_destroy(iter);
669
670 return mem_per_gpu;
671 }
672
/* Release an avail_res_t and everything it owns; NULL is a no-op. */
extern void common_free_avail_res(avail_res_t *avail_res)
{
	if (avail_res) {
		xfree(avail_res->avail_cores_per_sock);
		FREE_NULL_LIST(avail_res->sock_gres_list);
		xfree(avail_res);
	}
}
682
683 /*
684 * Return the number of usable logical processors by a given job on
685 * some specified node. Returns 0xffff if no limit.
686 */
extern int common_cpus_per_core(struct job_details *details, int node_inx)
{
	uint16_t ncpus_per_core = 0xffff;	/* Usable CPUs per core */
	uint16_t threads_per_core = select_node_record[node_inx].vpus;

	if (is_cons_tres &&
	    (slurmctld_conf.select_type_param & CR_ONE_TASK_PER_CORE) &&
	    (details->min_gres_cpu > 0)) {
		/* May override default of 1 CPU per core */
		uint16_t pu_per_core = 0xffff; /* Usable CPUs per core */
		uint16_t vpus_per_core = select_node_record[node_inx].vpus;
		/*
		 * NOTE(review): pu_per_core is a fixed no-limit sentinel, so
		 * this MIN() always yields vpus_per_core; presumably a
		 * placeholder for a future per-core limit — confirm intent.
		 */
		return MIN(vpus_per_core, pu_per_core);
	}

	if (details && details->mc_ptr) {
		multi_core_data_t *mc_ptr = details->mc_ptr;
		/* Cap usable CPUs by ntasks_per_core * cpus_per_task */
		if ((mc_ptr->ntasks_per_core != INFINITE16) &&
		    (mc_ptr->ntasks_per_core)) {
			ncpus_per_core = MIN(threads_per_core,
					     (mc_ptr->ntasks_per_core *
					      details->cpus_per_task));
		}
		/* An explicit threads_per_core request may lower it further */
		if ((mc_ptr->threads_per_core != NO_VAL16) &&
		    (mc_ptr->threads_per_core < ncpus_per_core)) {
			ncpus_per_core = mc_ptr->threads_per_core;
		}
	}

	/* Never report more than the hardware provides */
	threads_per_core = MIN(threads_per_core, ncpus_per_core);

	return threads_per_core;
}
719
common_init(void)720 extern void common_init(void)
721 {
722 char *topo_param;
723
724 cr_type = slurmctld_conf.select_type_param;
725 if (cr_type)
726 verbose("%s loaded with argument %u", plugin_type, cr_type);
727
728 select_debug_flags = slurm_get_debug_flags();
729
730 topo_param = slurm_get_topology_param();
731 if (topo_param) {
732 if (xstrcasestr(topo_param, "dragonfly"))
733 have_dragonfly = true;
734 if (xstrcasestr(topo_param, "TopoOptional"))
735 topo_optional = true;
736 xfree(topo_param);
737 }
738
739 priority_flags = slurm_get_priority_flags();
740
741 if (slurm_get_preempt_mode() & PREEMPT_MODE_GANG)
742 gang_mode = true;
743 else
744 gang_mode = false;
745
746 if (plugin_id == SELECT_PLUGIN_CONS_TRES)
747 is_cons_tres = true;
748 }
749
common_fini(void)750 extern void common_fini(void)
751 {
752 if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE)
753 info("%s shutting down ...", plugin_type);
754 else
755 verbose("%s shutting down ...", plugin_type);
756
757 node_data_destroy(select_node_usage, select_node_record);
758 select_node_record = NULL;
759 select_node_usage = NULL;
760 part_data_destroy_res(select_part_record);
761 select_part_record = NULL;
762 cr_fini_global_core_data();
763 }
764
765 /*
 * Build a core bitmap array of available cores
767 * node_bitmap IN - Nodes available for use
768 * core_spec IN - Specialized core specification, NO_VAL16 if none
769 * RET core bitmap array, one per node. Use free_core_array() to release memory
770 */
extern bitstr_t **common_mark_avail_cores(
	bitstr_t *node_bitmap, uint16_t core_spec)
{
	bitstr_t **avail_cores;
	int from_core, to_core, incr_core, from_sock, to_sock, incr_sock;
	int res_core, res_sock, res_off;
	int n, n_first, n_last;
	int c;
	int rem_core_spec, node_core_spec, thread_spec = 0;
	node_record_t *node_ptr;
	bitstr_t *core_map = NULL;
	uint16_t use_spec_cores = slurmctld_conf.conf_flags & CTL_CONF_ASRU;
	node_res_record_t *node_res_ptr = NULL;
	uint32_t coff;

	/*
	 * cons_tres: one core bitmap per node (allocated lazily below).
	 * cons_res: a single system-wide core bitmap stored in slot 0.
	 */
	if (is_cons_tres) {
		avail_cores = build_core_array();
	} else {
		core_map = bit_alloc(
			cr_get_coremap_offset(bit_size(node_bitmap)));
		avail_cores = build_core_array();
		*avail_cores = core_map;
	}

	if ((core_spec != NO_VAL16) &&
	    (core_spec & CORE_SPEC_THREAD)) {	/* Reserving threads */
		thread_spec = core_spec & (~CORE_SPEC_THREAD);
		core_spec = NO_VAL16;		/* Don't remove cores */
	}

	n_first = bit_ffs(node_bitmap);
	if (n_first != -1)
		n_last = bit_fls(node_bitmap);
	else
		n_last = -2;	/* empty bitmap: loop body never runs */
	for (n = n_first; n <= n_last; n++) {
		if (!bit_test(node_bitmap, n))
			continue;

		node_res_ptr = &select_node_record[n];
		node_ptr = node_res_ptr->node_ptr;

		/* [c, coff) is this node's core range in core_map */
		if (is_cons_tres) {
			c    = 0;
			coff = node_res_ptr->tot_cores;
			avail_cores[n] = bit_alloc(node_res_ptr->tot_cores);
			core_map = avail_cores[n];
		} else {
			c    = cr_get_coremap_offset(n);
			coff = cr_get_coremap_offset(n+1);
		}

		/* Specialization would consume the whole node: drop it */
		if ((core_spec != NO_VAL16) &&
		    (core_spec >= node_res_ptr->tot_cores)) {
			bit_clear(node_bitmap, n);
			continue;
		}

		bit_nset(core_map, c, coff - 1);

		/* Job can't over-ride system defaults */
		if (use_spec_cores && (core_spec == 0))
			continue;

		if (thread_spec &&
		    (node_res_ptr->cpus == node_res_ptr->tot_cores))
			/* Each core has one thread, reserve cores here */
			node_core_spec = thread_spec;
		else
			node_core_spec = core_spec;

		/*
		 * remove node's specialized cores accounting toward the
		 * requested limit if allowed by configuration
		 */
		rem_core_spec = node_core_spec;
		if (node_ptr->node_spec_bitmap) {
			for (int i = 0; i < node_res_ptr->tot_cores; i++) {
				if (!bit_test(node_ptr->node_spec_bitmap, i)) {
					bit_clear(core_map, c + i);
					if (!use_spec_cores)
						continue;
					rem_core_spec--;
					if (!rem_core_spec)
						break;
				}
			}
		}

		if (!use_spec_cores || !rem_core_spec ||
		    (node_core_spec == NO_VAL16))
			continue;

		/* if more cores need to be specialized, look for
		 * them in the non-specialized cores */
		if (spec_cores_first) {
			/* walk cores/sockets in ascending order */
			from_core = 0;
			to_core   = node_res_ptr->cores;
			incr_core = 1;
			from_sock = 0;
			to_sock   = node_res_ptr->tot_sockets;
			incr_sock = 1;
		} else {
			/* walk cores/sockets in descending order */
			from_core = node_res_ptr->cores - 1;
			to_core   = -1;
			incr_core = -1;
			from_sock = node_res_ptr->tot_sockets - 1;
			to_sock   = -1;
			incr_sock = -1;
		}
		for (res_core = from_core;
		     ((rem_core_spec > 0) && (res_core != to_core));
		     res_core += incr_core) {
			for (res_sock = from_sock;
			     ((rem_core_spec > 0) && (res_sock != to_sock));
			     res_sock += incr_sock) {
				res_off = c + res_core +
					(res_sock * node_res_ptr->cores);
				if (!bit_test(core_map, res_off))
					continue;
				bit_clear(core_map, res_off);
				rem_core_spec--;
			}
		}
	}

	return avail_cores;
}
899
900 /*
901 * common_allocate_cores - Given the job requirements, determine which cores
902 * from the given node can be allocated (if any) to this
903 * job. Returns the number of cpus that can be used by
904 * this node AND a bitmap of the selected cores.
905 *
906 * IN job_ptr - pointer to job requirements
907 * IN/OUT core_map - core_bitmap of available cores on this node
908 * IN part_core_map - bitmap of cores already allocated on this partition/node
909 * IN node_i - index of node to be evaluated
910 * IN/OUT cpu_alloc_size - minimum allocation size, in CPUs
911 * IN cpu_type - if true, allocate CPUs rather than cores
912 * IN req_sock_map - OPTIONAL bitmap of required sockets
913 * RET resource availability structure, call common_free_avail_res() to free
914 */
extern avail_res_t *common_allocate_cores(job_record_t *job_ptr,
					  bitstr_t *core_map,
					  bitstr_t *part_core_map,
					  const uint32_t node_i,
					  int *cpu_alloc_size,
					  bool cpu_type,
					  bitstr_t *req_sock_map)
{
	/*
	 * Core-level allocation: entire_sockets_only is always false.
	 * NOTE(review): cpu_type is not referenced in this body; it appears
	 * to be kept for interface compatibility — confirm with callers.
	 */
	return _allocate_sc(job_ptr, core_map, part_core_map, node_i,
			    cpu_alloc_size, false, req_sock_map);
}
926
927 /*
928 * common_allocate_sockets - Given the job requirements, determine which sockets
929 * from the given node can be allocated (if any) to this
930 * job. Returns the number of cpus that can be used by
931 * this node AND a core-level bitmap of the selected
932 * sockets.
933 *
934 * IN job_ptr - pointer to job requirements
935 * IN/OUT core_map - core_bitmap of available cores on this node
936 * IN part_core_map - bitmap of cores already allocated on this partition/node
937 * IN node_i - index of node to be evaluated
938 * IN/OUT cpu_alloc_size - minimum allocation size, in CPUs
939 * IN req_sock_map - OPTIONAL bitmap of required sockets
940 * RET resource availability structure, call common_free_avail_res() to free
941 */
extern avail_res_t *common_allocate_sockets(job_record_t *job_ptr,
					    bitstr_t *core_map,
					    bitstr_t *part_core_map,
					    const uint32_t node_i,
					    int *cpu_alloc_size,
					    bitstr_t *req_sock_map)
{
	/* Socket-level allocation: only sockets with no allocated cores
	 * may be used (entire_sockets_only = true). */
	return _allocate_sc(job_ptr, core_map, part_core_map, node_i,
			    cpu_alloc_size, true, req_sock_map);
}
952
/* This plugin keeps no state on disk, so saving is a no-op. */
extern int select_p_state_save(char *dir_name)
{
	/* nothing to save */
	return SLURM_SUCCESS;
}
958
/* This is Part 2 of a 4-part procedure which can be found in
 * src/slurmctld/read_config.c. See select_p_node_init for the
 * whole story.
 */
extern int select_p_state_restore(char *dir_name)
{
	/* nothing to restore -- no state was saved by this plugin */
	return SLURM_SUCCESS;
}
968
/* This is Part 3 of a 4-part procedure which can be found in
 * src/slurmctld/read_config.c. See select_p_node_init for the
 * whole story.
 */
extern int select_p_job_init(List job_list)
{
	/* nothing to initialize for jobs */
	return SLURM_SUCCESS;
}
978
/* This plugin does not generate a node ranking. */
extern bool select_p_node_ranking(node_record_t *node_ptr, int node_cnt)
{
	/* false tells the caller no ranking was produced */
	return false;
}
984
/* This is Part 1 of a 4-part procedure which can be found in
 * src/slurmctld/read_config.c. The whole story goes like this:
 *
 * Step 1: select_g_node_init          : initializes the global node arrays
 * Step 2: select_g_state_restore      : NO-OP - nothing to restore
 * Step 3: select_g_job_init           : NO-OP - nothing to initialize
 * Step 4: select_g_select_nodeinfo_set: called from reset_job_bitmaps() with
 *                                       each valid recovered job_ptr AND from
 *                                       select_nodes(), this procedure adds
 *                                       job data to the 'select_part_record'
 *                                       global array
 */
extern int select_p_node_init(node_record_t *node_ptr, int node_cnt)
{
	char *preempt_type, *sched_params, *tmp_ptr;
	uint32_t cume_cores = 0;	/* running total of cores, all nodes */
	int i;

	info("%s: %s", plugin_type, __func__);
	/* This plugin requires CPU-, core-, or socket-level allocation */
	if ((cr_type & (CR_CPU | CR_CORE | CR_SOCKET)) == 0) {
		fatal("Invalid SelectTypeParameters: %s (%u), "
		      "You need at least CR_(CPU|CORE|SOCKET)*",
		      select_type_param_string(cr_type), cr_type);
	}
	if (node_ptr == NULL) {
		error("select_p_node_init: node_ptr == NULL");
		return SLURM_ERROR;
	}
	if (node_cnt < 0) {
		error("select_p_node_init: node_cnt < 0");
		return SLURM_ERROR;
	}

	/* Parse the SchedulerParameters options this plugin honors */
	sched_params = slurm_get_sched_params();
	if (xstrcasestr(sched_params, "preempt_strict_order"))
		preempt_strict_order = true;
	else
		preempt_strict_order = false;
	if ((tmp_ptr = xstrcasestr(sched_params, "preempt_reorder_count="))) {
		/* 22 == strlen("preempt_reorder_count=") */
		preempt_reorder_cnt = atoi(tmp_ptr + 22);
		if (preempt_reorder_cnt < 0) {
			error("Invalid SchedulerParameters preempt_reorder_count: %d",
			      preempt_reorder_cnt);
			preempt_reorder_cnt = 1;	/* Use default value */
		}
	}
	if ((tmp_ptr = xstrcasestr(sched_params, "bf_window_linear="))) {
		/* 17 == strlen("bf_window_linear=") */
		bf_window_scale = atoi(tmp_ptr + 17);
		if (bf_window_scale <= 0) {
			error("Invalid SchedulerParameters bf_window_linear: %d",
			      bf_window_scale);
			bf_window_scale = 0;		/* Use default value */
		}
	} else
		bf_window_scale = 0;

	if (xstrcasestr(sched_params, "pack_serial_at_end"))
		pack_serial_at_end = true;
	else
		pack_serial_at_end = false;
	if (xstrcasestr(sched_params, "spec_cores_first"))
		spec_cores_first = true;
	else
		spec_cores_first = false;
	if (xstrcasestr(sched_params, "bf_busy_nodes"))
		backfill_busy_nodes = true;
	else
		backfill_busy_nodes = false;
	xfree(sched_params);

	/* Record which preemption mechanisms are configured */
	preempt_type = slurm_get_preempt_type();
	preempt_by_part = false;
	preempt_by_qos = false;
	if (preempt_type) {
		if (xstrcasestr(preempt_type, "partition"))
			preempt_by_part = true;
		if (xstrcasestr(preempt_type, "qos"))
			preempt_by_qos = true;
		xfree(preempt_type);
	}

	/* initial global core data structures */
	select_state_initializing = true;
	cr_init_global_core_data(node_ptr, node_cnt);

	/* Rebuild the per-node record/usage arrays from scratch */
	node_data_destroy(select_node_usage, select_node_record);
	select_node_cnt = node_cnt;

	if (is_cons_tres)
		core_array_size = select_node_cnt;

	select_node_record = xcalloc(select_node_cnt,
				     sizeof(node_res_record_t));
	select_node_usage = xcalloc(select_node_cnt,
				    sizeof(node_use_record_t));

	for (i = 0; i < select_node_cnt; i++) {
		config_record_t *config_ptr;
		select_node_record[i].node_ptr = &node_ptr[i];
		select_node_record[i].mem_spec_limit =
			node_ptr[i].mem_spec_limit;

		/* Copy hardware layout from the node's config record */
		config_ptr = node_ptr[i].config_ptr;
		select_node_record[i].cpus = config_ptr->cpus;
		select_node_record[i].boards = config_ptr->boards;
		select_node_record[i].sockets = config_ptr->sockets;
		select_node_record[i].cores = config_ptr->cores;
		select_node_record[i].threads = config_ptr->threads;
		select_node_record[i].vpus = config_ptr->threads;
		select_node_record[i].real_memory = config_ptr->real_memory;

		select_node_record[i].tot_sockets =
			select_node_record[i].boards *
			select_node_record[i].sockets;
		select_node_record[i].tot_cores =
			select_node_record[i].tot_sockets *
			select_node_record[i].cores;
		cume_cores += select_node_record[i].tot_cores;
		select_node_record[i].cume_cores = cume_cores;
		/* If CPUs are tracked at core granularity, one "vpu" per
		 * core */
		if (select_node_record[i].tot_cores >=
		    select_node_record[i].cpus)
			select_node_record[i].vpus = 1;

		/* CPUs must equal cores or cores*threads; anything else is
		 * a misconfigured slurm.conf */
		if ((select_node_record[i].cpus !=
		     select_node_record[i].tot_cores) &&
		    (select_node_record[i].cpus !=
		     select_node_record[i].tot_cores *
		     select_node_record[i].threads))
			fatal("NodeName=%s CPUs=%u doesn't match neither Sockets(%u)*CoresPerSocket(%u)=(%u) nor Sockets(%u)*CoresPerSocket(%u)*ThreadsPerCore(%u)=(%u). Please fix your slurm.conf.",
			      node_ptr[i].name,
			      select_node_record[i].cpus,
			      select_node_record[i].tot_sockets,
			      select_node_record[i].cores,
			      select_node_record[i].tot_cores,
			      select_node_record[i].tot_sockets,
			      select_node_record[i].cores,
			      select_node_record[i].threads,
			      select_node_record[i].tot_cores *
			      select_node_record[i].threads);

		select_node_usage[i].node_state = NODE_CR_AVAILABLE;
		gres_plugin_node_state_dealloc_all(
			select_node_record[i].node_ptr->gres_list);
	}
	part_data_create_array();
	node_data_dump();

	return SLURM_SUCCESS;
}
1134
/* Nothing for this plugin to do when a job begins. */
extern int select_p_job_begin(job_record_t *job_ptr)
{
	return SLURM_SUCCESS;
}
1139
/*
 * Report whether every node allocated to the job is ready for use.
 * RET READY_NODE_STATE if ready, otherwise 0.
 */
extern int select_p_job_ready(job_record_t *job_ptr)
{
	int idx, first_idx, last_idx;
	node_record_t *node_ptr;

	if (!IS_JOB_RUNNING(job_ptr) && !IS_JOB_SUSPENDED(job_ptr)) {
		/* Gang scheduling might suspend job immediately */
		return 0;
	}

	if (!job_ptr->node_bitmap)
		return READY_NODE_STATE;
	first_idx = bit_ffs(job_ptr->node_bitmap);
	if (first_idx == -1)
		return READY_NODE_STATE;
	last_idx = bit_fls(job_ptr->node_bitmap);

	for (idx = first_idx; idx <= last_idx; idx++) {
		if (!bit_test(job_ptr->node_bitmap, idx))
			continue;
		node_ptr = node_record_table_ptr + idx;
		/* A node still powered down or powering up is not ready */
		if (IS_NODE_POWER_SAVE(node_ptr) ||
		    IS_NODE_POWER_UP(node_ptr))
			return 0;
	}

	return READY_NODE_STATE;
}
1164
/*
 * Merge the resources allocated to "from" job into "to" job, leaving the
 * "from" job with no allocated CPUs, memory or nodes. Used by the job
 * expansion mechanism (e.g. "scontrol update jobid=# NumNodes=0").
 *
 * IN/OUT from_job_ptr - job donating its resources; emptied on success
 * IN/OUT to_job_ptr   - job receiving the combined resources
 * RET SLURM_SUCCESS or an error code (self-merge, missing job_resources,
 *     or allocated GRES under cons_tres)
 */
extern int select_p_job_expand(job_record_t *from_job_ptr,
			       job_record_t *to_job_ptr)
{
	job_resources_t *from_job_resrcs_ptr, *to_job_resrcs_ptr,
		*new_job_resrcs_ptr;
	node_record_t *node_ptr;
	int first_bit, last_bit, i, node_cnt;
	bool from_node_used, to_node_used;
	int from_node_offset, to_node_offset, new_node_offset;
	bitstr_t *tmp_bitmap, *tmp_bitmap2;

	xassert(from_job_ptr);
	xassert(from_job_ptr->details);
	xassert(to_job_ptr);
	xassert(to_job_ptr->details);

	if (from_job_ptr->job_id == to_job_ptr->job_id) {
		error("%s: %s: attempt to merge %pJ with self",
		      plugin_type, __func__, from_job_ptr);
		return SLURM_ERROR;
	}

	/* Both jobs must carry complete job_resources structures */
	from_job_resrcs_ptr = from_job_ptr->job_resrcs;
	if ((from_job_resrcs_ptr == NULL) ||
	    (from_job_resrcs_ptr->cpus == NULL) ||
	    (from_job_resrcs_ptr->core_bitmap == NULL) ||
	    (from_job_resrcs_ptr->node_bitmap == NULL)) {
		error("%s: %s: %pJ lacks a job_resources struct",
		      plugin_type, __func__, from_job_ptr);
		return SLURM_ERROR;
	}
	to_job_resrcs_ptr = to_job_ptr->job_resrcs;
	if ((to_job_resrcs_ptr == NULL) ||
	    (to_job_resrcs_ptr->cpus == NULL) ||
	    (to_job_resrcs_ptr->core_bitmap == NULL) ||
	    (to_job_resrcs_ptr->node_bitmap == NULL)) {
		error("%s: %s: %pJ lacks a job_resources struct",
		      plugin_type, __func__, to_job_ptr);
		return SLURM_ERROR;
	}

	if (is_cons_tres) {
		if (to_job_ptr->gres_list) {
			/* Can't reset gres/mps fields today */
			error("%s: %s: %pJ has allocated GRES",
			      plugin_type, __func__, to_job_ptr);
			return SLURM_ERROR;
		}
		if (from_job_ptr->gres_list) {
			/* Can't reset gres/mps fields today */
			error("%s: %s: %pJ has allocated GRES",
			      plugin_type, __func__, from_job_ptr);
			return SLURM_ERROR;
		}
	}

	/* Temporarily remove both jobs from the global usage tables;
	 * the merged job is re-added at the end */
	(void) job_res_rm_job(select_part_record, select_node_usage,
			      from_job_ptr, 0, true, NULL);
	(void) job_res_rm_job(select_part_record, select_node_usage,
			      to_job_ptr, 0, true, NULL);

	if (to_job_resrcs_ptr->core_bitmap_used) {
		i = bit_size(to_job_resrcs_ptr->core_bitmap_used);
		bit_nclear(to_job_resrcs_ptr->core_bitmap_used, 0, i-1);
	}

	/* Union of both jobs' node sets, restricted to nodes still in
	 * either job's current allocation */
	tmp_bitmap = bit_copy(to_job_resrcs_ptr->node_bitmap);
	bit_or(tmp_bitmap, from_job_resrcs_ptr->node_bitmap);
	tmp_bitmap2 = bit_copy(to_job_ptr->node_bitmap);
	bit_or(tmp_bitmap2, from_job_ptr->node_bitmap);
	bit_and(tmp_bitmap, tmp_bitmap2);
	bit_free(tmp_bitmap2);
	node_cnt = bit_set_count(tmp_bitmap);

	/* Build a fresh job_resources struct for the merged allocation */
	new_job_resrcs_ptr = _create_job_resources(node_cnt);
	new_job_resrcs_ptr->ncpus = from_job_resrcs_ptr->ncpus +
		to_job_resrcs_ptr->ncpus;
	new_job_resrcs_ptr->node_req = to_job_resrcs_ptr->node_req;
	new_job_resrcs_ptr->node_bitmap = tmp_bitmap;
	new_job_resrcs_ptr->nodes = bitmap2node_name(new_job_resrcs_ptr->
						     node_bitmap);
	new_job_resrcs_ptr->whole_node = to_job_resrcs_ptr->whole_node;
	build_job_resources(new_job_resrcs_ptr, node_record_table_ptr);
	xfree(to_job_ptr->node_addr);
	to_job_ptr->node_addr = xcalloc(node_cnt, sizeof(slurm_addr_t));
	to_job_ptr->total_cpus = 0;

	/* Walk all nodes in either job, tracking three independent
	 * per-node offsets into the from/to/new resource arrays */
	first_bit = MIN(bit_ffs(from_job_resrcs_ptr->node_bitmap),
			bit_ffs(to_job_resrcs_ptr->node_bitmap));
	last_bit = MAX(bit_fls(from_job_resrcs_ptr->node_bitmap),
		       bit_fls(to_job_resrcs_ptr->node_bitmap));
	from_node_offset = to_node_offset = new_node_offset = -1;
	for (i = first_bit; i <= last_bit; i++) {
		from_node_used = to_node_used = false;
		if (bit_test(from_job_resrcs_ptr->node_bitmap, i)) {
			from_node_used = bit_test(from_job_ptr->node_bitmap,i);
			from_node_offset++;
		}
		if (bit_test(to_job_resrcs_ptr->node_bitmap, i)) {
			to_node_used = bit_test(to_job_ptr->node_bitmap, i);
			to_node_offset++;
		}
		if (!from_node_used && !to_node_used)
			continue;
		new_node_offset++;
		node_ptr = node_record_table_ptr + i;
		memcpy(&to_job_ptr->node_addr[new_node_offset],
		       &node_ptr->slurm_addr, sizeof(slurm_addr_t));
		if (from_node_used) {
			/*
			 * Merge alloc info from both "from" and "to" jobs,
			 * leave "from" job with no allocated CPUs or memory
			 *
			 * The following fields should be zero:
			 * from_job_resrcs_ptr->cpus_used[from_node_offset]
			 * from_job_resrcs_ptr->memory_used[from_node_offset];
			 */
			new_job_resrcs_ptr->cpus[new_node_offset] =
				from_job_resrcs_ptr->cpus[from_node_offset];
			from_job_resrcs_ptr->cpus[from_node_offset] = 0;
			new_job_resrcs_ptr->memory_allocated[new_node_offset] =
				from_job_resrcs_ptr->
				memory_allocated[from_node_offset];
			job_resources_bits_copy(new_job_resrcs_ptr,
						new_node_offset,
						from_job_resrcs_ptr,
						from_node_offset);
		}
		if (to_node_used) {
			/*
			 * Merge alloc info from both "from" and "to" jobs
			 *
			 * DO NOT double count the allocated CPUs in partition
			 * with Shared nodes
			 */
			new_job_resrcs_ptr->cpus[new_node_offset] +=
				to_job_resrcs_ptr->cpus[to_node_offset];
			new_job_resrcs_ptr->cpus_used[new_node_offset] +=
				to_job_resrcs_ptr->cpus_used[to_node_offset];
			new_job_resrcs_ptr->memory_allocated[new_node_offset]+=
				to_job_resrcs_ptr->
				memory_allocated[to_node_offset];
			new_job_resrcs_ptr->memory_used[new_node_offset] +=
				to_job_resrcs_ptr->memory_used[to_node_offset];
			job_resources_bits_copy(new_job_resrcs_ptr,
						new_node_offset,
						to_job_resrcs_ptr,
						to_node_offset);
			if (from_node_used) {
				/* Adjust CPU count for shared CPUs */
				int from_core_cnt, to_core_cnt, new_core_cnt;
				from_core_cnt = count_job_resources_node(
					from_job_resrcs_ptr,
					from_node_offset);
				to_core_cnt = count_job_resources_node(
					to_job_resrcs_ptr,
					to_node_offset);
				new_core_cnt = count_job_resources_node(
					new_job_resrcs_ptr,
					new_node_offset);
				if ((from_core_cnt + to_core_cnt) !=
				    new_core_cnt) {
					/* Some cores overlapped: rescale the
					 * CPU count proportionally */
					new_job_resrcs_ptr->
						cpus[new_node_offset] *=
						new_core_cnt;
					new_job_resrcs_ptr->
						cpus[new_node_offset] /=
						(from_core_cnt + to_core_cnt);
				}
			}
		}
		if (to_job_ptr->details->whole_node == 1) {
			to_job_ptr->total_cpus += select_node_record[i].cpus;
		} else {
			to_job_ptr->total_cpus += new_job_resrcs_ptr->
				cpus[new_node_offset];
		}
	}
	build_job_resources_cpu_array(new_job_resrcs_ptr);
	gres_plugin_job_merge(from_job_ptr->gres_list,
			      from_job_resrcs_ptr->node_bitmap,
			      to_job_ptr->gres_list,
			      to_job_resrcs_ptr->node_bitmap);

	/* Now swap data: "new" -> "to" and clear "from" */
	free_job_resources(&to_job_ptr->job_resrcs);
	to_job_ptr->job_resrcs = new_job_resrcs_ptr;

	to_job_ptr->cpu_cnt = to_job_ptr->total_cpus;
	to_job_ptr->details->min_cpus = to_job_ptr->total_cpus;
	to_job_ptr->details->max_cpus = to_job_ptr->total_cpus;
	from_job_ptr->total_cpus = 0;
	from_job_resrcs_ptr->ncpus = 0;
	from_job_ptr->details->min_cpus = 0;
	from_job_ptr->details->max_cpus = 0;

	from_job_ptr->total_nodes = 0;
	from_job_resrcs_ptr->nhosts = 0;
	from_job_ptr->node_cnt = 0;
	from_job_ptr->details->min_nodes = 0;
	to_job_ptr->total_nodes = new_job_resrcs_ptr->nhosts;
	to_job_ptr->node_cnt = new_job_resrcs_ptr->nhosts;

	bit_or(to_job_ptr->node_bitmap, from_job_ptr->node_bitmap);
	bit_nclear(from_job_ptr->node_bitmap, 0, (node_record_count - 1));
	bit_nclear(from_job_resrcs_ptr->node_bitmap, 0,
		   (node_record_count - 1));

	xfree(to_job_ptr->nodes);
	to_job_ptr->nodes = xstrdup(new_job_resrcs_ptr->nodes);
	xfree(from_job_ptr->nodes);
	from_job_ptr->nodes = xstrdup("");
	xfree(from_job_resrcs_ptr->nodes);
	from_job_resrcs_ptr->nodes = xstrdup("");

	/* Re-register the merged allocation in the global usage tables */
	(void) job_res_add_job(to_job_ptr, 0);

	return SLURM_SUCCESS;
}
1384
/*
 * Remove a single node from a running job's allocation (job shrink),
 * releasing its memory, GRES and cores from the global usage tables and
 * rebuilding the partition row bitmaps.
 *
 * IN/OUT job_ptr - job being resized; its job_resrcs are updated in place
 * IN node_ptr    - the node being removed from the job
 * RET SLURM_SUCCESS or SLURM_ERROR
 */
extern int select_p_job_resized(job_record_t *job_ptr, node_record_t *node_ptr)
{
	part_res_record_t *part_record_ptr = select_part_record;
	node_use_record_t *node_usage = select_node_usage;
	struct job_resources *job = job_ptr->job_resrcs;
	part_res_record_t *p_ptr;
	int i, i_first, i_last, node_inx, n;
	List gres_list;
	bool old_job = false;

	xassert(job_ptr);
	xassert(job_ptr->magic == JOB_MAGIC);

	if (!job || !job->core_bitmap) {
		error("%s: %s: %pJ has no job_resrcs info",
		      plugin_type, __func__, job_ptr);
		return SLURM_ERROR;
	}

	debug3("%s: %s: %pJ node %s",
	       plugin_type, __func__, job_ptr, node_ptr->name);
	/* Jobs started before the last slurmctld restart get relaxed
	 * GRES deallocation handling */
	if (job_ptr->start_time < slurmctld_config.boot_time)
		old_job = true;
	if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE)
		_dump_job_res(job);

	/* subtract memory */
	node_inx = node_ptr - node_record_table_ptr;
	i_first = bit_ffs(job->node_bitmap);
	if (i_first != -1)
		i_last = bit_fls(job->node_bitmap);
	else
		i_last = -2;	/* empty bitmap: skip the loop entirely */
	for (i = i_first, n = 0; i <= i_last; i++) {
		if (!bit_test(job->node_bitmap, i))
			continue;
		if (i != node_inx) {
			/* n counts the job-local offset of the target node */
			n++;
			continue;
		}

		if (job->cpus[n] == 0) {
			info("%s: %s: attempt to remove node %s from %pJ again",
			     plugin_type, __func__, node_ptr->name, job_ptr);
			return SLURM_SUCCESS;
		}

		/* Release the job's GRES on this node */
		if (node_usage[i].gres_list)
			gres_list = node_usage[i].gres_list;
		else
			gres_list = node_ptr->gres_list;
		gres_plugin_job_dealloc(job_ptr->gres_list, gres_list, n,
					job_ptr->job_id, node_ptr->name,
					old_job, job_ptr->user_id, true);
		gres_plugin_node_state_log(gres_list, node_ptr->name);

		/* Release the job's memory on this node, guarding against
		 * an accounting underflow */
		if (node_usage[i].alloc_memory < job->memory_allocated[n]) {
			error("%s: %s: node %s memory is underallocated (%"PRIu64"-%"PRIu64") for %pJ",
			      plugin_type,
			      __func__, node_ptr->name,
			      node_usage[i].alloc_memory,
			      job->memory_allocated[n], job_ptr);
			node_usage[i].alloc_memory = 0;
		} else
			node_usage[i].alloc_memory -= job->memory_allocated[n];

		/* Drop this node from the job_resources struct itself */
		extract_job_resources_node(job, n);

		break;
	}

	if (IS_JOB_SUSPENDED(job_ptr))
		return SLURM_SUCCESS;	/* No cores allocated to the job now */

	/* subtract cores, reconstruct rows with remaining jobs */
	if (!job_ptr->part_ptr) {
		error("%s: %s: removed %pJ does not have a partition assigned",
		      plugin_type, __func__, job_ptr);
		return SLURM_ERROR;
	}

	for (p_ptr = part_record_ptr; p_ptr; p_ptr = p_ptr->next) {
		if (p_ptr->part_ptr == job_ptr->part_ptr)
			break;
	}
	if (!p_ptr) {
		error("%s: %s: removed %pJ could not find part %s",
		      plugin_type, __func__, job_ptr, job_ptr->part_ptr->name);
		return SLURM_ERROR;
	}

	if (!p_ptr->row)
		return SLURM_SUCCESS;

	/* look for the job in the partition's job_list */
	n = 0;	/* reused here as a "found" flag */
	for (i = 0; i < p_ptr->num_rows; i++) {
		uint32_t j;
		for (j = 0; j < p_ptr->row[i].num_jobs; j++) {
			if (p_ptr->row[i].job_list[j] != job)
				continue;
			debug3("%s: %s: found %pJ in part %s row %u",
			       plugin_type, __func__, job_ptr,
			       p_ptr->part_ptr->name, i);
			/* found job - we're done, don't actually remove */
			n = 1;
			i = p_ptr->num_rows;	/* break outer loop too */
			break;
		}
	}
	if (n == 0) {
		error("%s: %s: could not find %pJ in partition %s",
		      plugin_type, __func__, job_ptr, p_ptr->part_ptr->name);
		return SLURM_ERROR;
	}


	/* some node of job removed from core-bitmap, so rebuild core bitmaps */
	part_data_build_row_bitmaps(p_ptr, NULL);

	/*
	 * Adjust the node_state of the node removed from this job.
	 * If all cores are now available, set node_state = NODE_CR_AVAILABLE
	 */
	if (node_usage[node_inx].node_state >= job->node_req) {
		node_usage[node_inx].node_state -= job->node_req;
	} else {
		error("%s: %s: node_state miscount", plugin_type, __func__);
		node_usage[node_inx].node_state = NODE_CR_AVAILABLE;
	}

	return SLURM_SUCCESS;
}
1518
/* No plugin-specific action is required when a job is signaled. */
extern int select_p_job_signal(job_record_t *job_ptr, int signal)
{
	xassert(job_ptr);
	xassert(job_ptr->magic == JOB_MAGIC);

	return SLURM_SUCCESS;
}
1526
/*
 * For jobs whose memory was calculated at allocation time (NODE_MEM_CALC),
 * confirm/refresh per-node memory: each node contributes all of its real
 * memory less the MemSpecLimit, and the job's pn_min_memory is set to the
 * smallest such value across its nodes.
 */
extern int select_p_job_mem_confirm(job_record_t *job_ptr)
{
	int first_node, last_node, node_inx, node_offset = 0;
	uint64_t node_avail_mem, min_node_mem = 0;

	xassert(job_ptr);

	if (!(job_ptr->bit_flags & NODE_MEM_CALC))
		return SLURM_SUCCESS;	/* Not a calculated-memory job */
	if ((job_ptr->details == NULL) ||
	    (job_ptr->job_resrcs == NULL) ||
	    (job_ptr->job_resrcs->node_bitmap == NULL) ||
	    (job_ptr->job_resrcs->memory_allocated == NULL))
		return SLURM_ERROR;

	first_node = bit_ffs(job_ptr->job_resrcs->node_bitmap);
	if (first_node >= 0)
		last_node = bit_fls(job_ptr->job_resrcs->node_bitmap);
	else
		last_node = first_node - 1;	/* empty: no iterations */

	for (node_inx = first_node; node_inx <= last_node; node_inx++) {
		if (!bit_test(job_ptr->job_resrcs->node_bitmap, node_inx))
			continue;
		node_avail_mem = select_node_record[node_inx].real_memory -
				 select_node_record[node_inx].mem_spec_limit;
		job_ptr->job_resrcs->memory_allocated[node_offset] =
			node_avail_mem;
		select_node_usage[node_inx].alloc_memory = node_avail_mem;
		if ((node_offset == 0) || (min_node_mem > node_avail_mem))
			min_node_mem = node_avail_mem;
		node_offset++;
	}
	job_ptr->details->pn_min_memory = min_node_mem;

	return SLURM_SUCCESS;
}
1561
/* Release all resources held by a completed job. */
extern int select_p_job_fini(job_record_t *job_ptr)
{
	xassert(job_ptr);
	xassert(job_ptr->magic == JOB_MAGIC);

	if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE)
		info("%s: %s: %pJ", plugin_type, __func__, job_ptr);

	(void) job_res_rm_job(select_part_record, select_node_usage,
			      job_ptr, 0, true, NULL);

	return SLURM_SUCCESS;
}
1575
1576 /* NOTE: This function is not called with gang scheduling because it
1577 * needs to track how many jobs are running or suspended on each node.
1578 * This sum is compared with the partition's Shared parameter */
select_p_job_suspend(job_record_t * job_ptr,bool indf_susp)1579 extern int select_p_job_suspend(job_record_t *job_ptr, bool indf_susp)
1580 {
1581 xassert(job_ptr);
1582 xassert(job_ptr->magic == JOB_MAGIC);
1583
1584 if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
1585 if (indf_susp)
1586 info("%s: %s: %pJ indf_susp", plugin_type, __func__,
1587 job_ptr);
1588 else
1589 info("%s: %s: %pJ", plugin_type, __func__, job_ptr);
1590 }
1591
1592 if (!indf_susp)
1593 return SLURM_SUCCESS;
1594
1595 return job_res_rm_job(select_part_record, select_node_usage,
1596 job_ptr, 2, false, NULL);
1597 }
1598
1599 /* See NOTE with select_p_job_suspend() above */
select_p_job_resume(job_record_t * job_ptr,bool indf_susp)1600 extern int select_p_job_resume(job_record_t *job_ptr, bool indf_susp)
1601 {
1602 xassert(job_ptr);
1603 xassert(job_ptr->magic == JOB_MAGIC);
1604
1605 if (select_debug_flags & DEBUG_FLAG_SELECT_TYPE) {
1606 if (indf_susp)
1607 info("%s: %s: %pJ indf_susp", plugin_type, __func__,
1608 job_ptr);
1609 else
1610 info("%s: %s: %pJ", plugin_type, __func__, job_ptr);
1611 }
1612 if (!indf_susp)
1613 return SLURM_SUCCESS;
1614
1615 return job_res_add_job(job_ptr, 2);
1616 }
1617
/* Step node selection is not implemented by this plugin; always NULL. */
extern bitstr_t *select_p_step_pick_nodes(job_record_t *job_ptr,
					  select_jobinfo_t *jobinfo,
					  uint32_t node_count,
					  bitstr_t **avail_nodes)
{
	return NULL;
}
1625
/* Unused for this plugin */
extern int select_p_step_start(step_record_t *step_ptr)
{
	return SLURM_SUCCESS;
}
1631
/* Unused for this plugin */
extern int select_p_step_finish(step_record_t *step_ptr, bool killing_step)
{
	return SLURM_SUCCESS;
}
1637
/*
 * Serialize a select_nodeinfo_t into the given buffer. If nodeinfo is
 * NULL (should never happen), a zeroed placeholder is packed instead so
 * the wire format stays consistent.
 */
extern int select_p_select_nodeinfo_pack(select_nodeinfo_t *nodeinfo,
					 Buf buffer,
					 uint16_t protocol_version)
{
	select_nodeinfo_t *nodeinfo_empty = NULL;

	if (!nodeinfo) {
		/*
		 * We should never get here,
		 * but avoid abort with bad data structures
		 */
		error("%s: nodeinfo is NULL", __func__);
		nodeinfo_empty = xmalloc(sizeof(select_nodeinfo_t));
		nodeinfo = nodeinfo_empty;
	}

	if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
		pack16(nodeinfo->alloc_cpus, buffer);
		pack64(nodeinfo->alloc_memory, buffer);
		packstr(nodeinfo->tres_alloc_fmt_str, buffer);
		packdouble(nodeinfo->tres_alloc_weighted, buffer);
	}

	/* xfree() is a no-op when the placeholder was not allocated */
	xfree(nodeinfo_empty);

	return SLURM_SUCCESS;
}
1664
select_p_select_nodeinfo_alloc(void)1665 extern select_nodeinfo_t *select_p_select_nodeinfo_alloc(void)
1666 {
1667 select_nodeinfo_t *nodeinfo = xmalloc(sizeof(select_nodeinfo_t));
1668
1669 nodeinfo->magic = nodeinfo_magic;
1670
1671 return nodeinfo;
1672 }
1673
/*
 * Free a select_nodeinfo_t and its owned strings/arrays.
 * A NULL argument is a no-op; a bad magic cookie returns EINVAL.
 */
extern int select_p_select_nodeinfo_free(select_nodeinfo_t *nodeinfo)
{
	if (!nodeinfo)
		return SLURM_SUCCESS;

	if (nodeinfo->magic != nodeinfo_magic) {
		error("%s: nodeinfo magic bad", __func__);
		return EINVAL;
	}
	xfree(nodeinfo->tres_alloc_cnt);
	xfree(nodeinfo->tres_alloc_fmt_str);
	xfree(nodeinfo);

	return SLURM_SUCCESS;
}
1687
/*
 * Deserialize a select_nodeinfo_t from the buffer into a freshly
 * allocated structure. On unpack failure the partial structure is freed
 * and *nodeinfo is set to NULL.
 *
 * NOTE: the safe_unpack* macros jump to the unpack_error label on
 * malformed input, so that label must be preserved.
 */
extern int select_p_select_nodeinfo_unpack(select_nodeinfo_t **nodeinfo,
					   Buf buffer,
					   uint16_t protocol_version)
{
	uint32_t uint32_tmp;	/* receives unpacked string length */
	select_nodeinfo_t *nodeinfo_ptr = NULL;

	nodeinfo_ptr = select_p_select_nodeinfo_alloc();
	*nodeinfo = nodeinfo_ptr;

	if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
		safe_unpack16(&nodeinfo_ptr->alloc_cpus, buffer);
		safe_unpack64(&nodeinfo_ptr->alloc_memory, buffer);
		safe_unpackstr_xmalloc(&nodeinfo_ptr->tres_alloc_fmt_str,
				       &uint32_tmp, buffer);
		safe_unpackdouble(&nodeinfo_ptr->tres_alloc_weighted, buffer);
	}

	return SLURM_SUCCESS;

unpack_error:
	error("%s: error unpacking here", __func__);
	select_p_select_nodeinfo_free(nodeinfo_ptr);
	*nodeinfo = NULL;

	return SLURM_ERROR;
}
1715
/*
 * Refresh the per-node select_nodeinfo_t data (allocated CPUs, memory and
 * TRES strings) for every node, based on the current partition row
 * bitmaps. Work is skipped entirely if no node data changed since the
 * last call.
 */
extern int select_p_select_nodeinfo_set_all(void)
{
	static time_t last_set_all = 0;
	part_res_record_t *p_ptr;
	node_record_t *node_ptr = NULL;
	int i, n;
	uint32_t alloc_cpus, alloc_cores, node_cores, node_cpus, node_threads;
	uint32_t node_boards, node_sockets, total_node_cores;
	bitstr_t **alloc_core_bitmap = NULL;
	List gres_list;

	/*
	 * only set this once when the last_node_update is newer than
	 * the last time we set things up.
	 */
	if (last_set_all && (last_node_update < last_set_all)) {
		debug2("%s: Node data hasn't changed since %ld", __func__,
		       (long)last_set_all);
		return SLURM_NO_CHANGE_IN_DATA;
	}
	last_set_all = last_node_update;

	/*
	 * Build core bitmap array representing all cores allocated to all
	 * active jobs (running or preempted jobs)
	 */
	for (p_ptr = select_part_record; p_ptr; p_ptr = p_ptr->next) {
		if (!p_ptr->row)
			continue;
		for (i = 0; i < p_ptr->num_rows; i++) {
			if (!p_ptr->row[i].row_bitmap)
				continue;
			if (!alloc_core_bitmap) {
				alloc_core_bitmap =
					copy_core_array(
						p_ptr->row[i].row_bitmap);
			} else {
				core_array_or(alloc_core_bitmap,
					      p_ptr->row[i].row_bitmap);
			}
		}
	}

	for (n = 0, node_ptr = node_record_table_ptr;
	     n < select_node_cnt; n++, node_ptr++) {
		select_nodeinfo_t *nodeinfo = NULL;
		/*
		 * We have to use the '_g_' here to make sure we get the
		 * correct data to work on. i.e. select/cray calls this plugin
		 * and has it's own struct.
		 */
		select_g_select_nodeinfo_get(node_ptr->select_nodeinfo,
					     SELECT_NODEDATA_PTR, 0,
					     (void *)&nodeinfo);
		if (!nodeinfo) {
			error("%s: no nodeinfo returned from structure",
			      __func__);
			continue;
		}

		node_boards = node_ptr->config_ptr->boards;
		node_sockets = node_ptr->config_ptr->sockets;
		node_cores = node_ptr->config_ptr->cores;
		node_cpus = node_ptr->config_ptr->cpus;
		node_threads = node_ptr->config_ptr->threads;

		/* Count allocated cores on this node; cons_tres keeps a
		 * per-node bitmap array, cons_res one flat bitmap indexed
		 * by global core offsets */
		if (is_cons_tres) {
			if (alloc_core_bitmap && alloc_core_bitmap[n])
				alloc_cores = bit_set_count(
					alloc_core_bitmap[n]);
			else
				alloc_cores = 0;

			total_node_cores =
				node_boards * node_sockets * node_cores;
		} else {
			int start = cr_get_coremap_offset(n);
			int end = cr_get_coremap_offset(n + 1);
			if (alloc_core_bitmap)
				alloc_cores = bit_set_count_range(
					*alloc_core_bitmap,
					start, end);
			else
				alloc_cores = 0;

			total_node_cores = end - start;
		}

		/*
		 * Administrator could resume suspended jobs and oversubscribe
		 * cores, avoid reporting more cores in use than configured
		 */
		if (alloc_cores > total_node_cores)
			alloc_cpus = total_node_cores;
		else
			alloc_cpus = alloc_cores;

		/*
		 * The minimum allocatable unit may a core, so scale by thread
		 * count up to the proper CPU count as needed
		 */
		if (total_node_cores < node_cpus)
			alloc_cpus *= node_threads;
		nodeinfo->alloc_cpus = alloc_cpus;

		if (select_node_record) {
			nodeinfo->alloc_memory =
				select_node_usage[n].alloc_memory;
		} else {
			nodeinfo->alloc_memory = 0;
		}

		/* Build allocated TRES info */
		if (!nodeinfo->tres_alloc_cnt)
			nodeinfo->tres_alloc_cnt = xcalloc(slurmctld_tres_cnt,
							   sizeof(uint64_t));
		nodeinfo->tres_alloc_cnt[TRES_ARRAY_CPU] = alloc_cpus;
		nodeinfo->tres_alloc_cnt[TRES_ARRAY_MEM] =
			nodeinfo->alloc_memory;
		if (select_node_usage[n].gres_list)
			gres_list = select_node_usage[n].gres_list;
		else
			gres_list = node_ptr->gres_list;
		gres_set_node_tres_cnt(gres_list, nodeinfo->tres_alloc_cnt,
				       false);

		xfree(nodeinfo->tres_alloc_fmt_str);
		nodeinfo->tres_alloc_fmt_str =
			assoc_mgr_make_tres_str_from_array(
				nodeinfo->tres_alloc_cnt,
				TRES_STR_CONVERT_UNITS, false);
		nodeinfo->tres_alloc_weighted =
			assoc_mgr_tres_weighted(nodeinfo->tres_alloc_cnt,
					node_ptr->config_ptr->tres_weights,
					priority_flags, false);
	}
	free_core_array(&alloc_core_bitmap);

	return SLURM_SUCCESS;
}
1856
/*
 * Register an active job's resources in the global usage tables.
 * Running jobs and suspended jobs are added; anything else is a no-op.
 */
extern int select_p_select_nodeinfo_set(job_record_t *job_ptr)
{
	int rc;

	xassert(job_ptr);
	xassert(job_ptr->magic == JOB_MAGIC);

	if (IS_JOB_RUNNING(job_ptr)) {
		rc = job_res_add_job(job_ptr, 0);
	} else if (IS_JOB_SUSPENDED(job_ptr)) {
		/* priority == 0 means an explicit suspend; otherwise the
		 * job was suspended by gang scheduling */
		rc = job_res_add_job(job_ptr,
				     (job_ptr->priority == 0) ? 1 : 0);
	} else {
		return SLURM_SUCCESS;
	}

	gres_plugin_job_state_log(job_ptr->gres_list, job_ptr->job_id);

	return rc;
}
1878
/*
 * Fetch one field from a select_nodeinfo_t. The interpretation of the
 * opaque 'data' pointer depends on 'dinfo'; 'state' is only consulted
 * for SELECT_NODEDATA_SUBCNT.
 */
extern int select_p_select_nodeinfo_get(select_nodeinfo_t *nodeinfo,
					enum select_nodedata_type dinfo,
					enum node_states state,
					void *data)
{
	int rc = SLURM_SUCCESS;

	if (nodeinfo == NULL) {
		error("%s: nodeinfo not set", __func__);
		return SLURM_ERROR;
	}
	if (nodeinfo->magic != nodeinfo_magic) {
		error("%s: jobinfo magic bad", __func__);
		return SLURM_ERROR;
	}

	switch (dinfo) {
	case SELECT_NODEDATA_SUBCNT:
		/* Allocated CPU count; zero for any other node state */
		if (state == NODE_STATE_ALLOCATED)
			*(uint16_t *) data = nodeinfo->alloc_cpus;
		else
			*(uint16_t *) data = 0;
		break;
	case SELECT_NODEDATA_PTR:
		*(select_nodeinfo_t **) data = nodeinfo;
		break;
	case SELECT_NODEDATA_MEM_ALLOC:
		*(uint64_t *) data = nodeinfo->alloc_memory;
		break;
	case SELECT_NODEDATA_TRES_ALLOC_FMT_STR:
		/* Caller owns the duplicated string */
		*(char **) data = xstrdup(nodeinfo->tres_alloc_fmt_str);
		break;
	case SELECT_NODEDATA_TRES_ALLOC_WEIGHTED:
		*(double *) data = nodeinfo->tres_alloc_weighted;
		break;
	default:
		error("%s: Unsupported option %d", __func__, dinfo);
		rc = SLURM_ERROR;
		break;
	}

	return rc;
}
1927
/*
 * Unused for this plugin: the cons_* plugins keep no per-job "jobinfo"
 * select data, so there is nothing to allocate.
 */
extern int select_p_select_jobinfo_alloc(void)
{
	return SLURM_SUCCESS;
}
1933
1934 /* Unused for this plugin */
select_p_select_jobinfo_free(select_jobinfo_t * jobinfo)1935 extern int select_p_select_jobinfo_free(select_jobinfo_t *jobinfo)
1936 {
1937 return SLURM_SUCCESS;
1938 }
1939
1940 /* Unused for this plugin */
select_p_select_jobinfo_set(select_jobinfo_t * jobinfo,enum select_jobdata_type data_type,void * data)1941 extern int select_p_select_jobinfo_set(select_jobinfo_t *jobinfo,
1942 enum select_jobdata_type data_type,
1943 void *data)
1944 {
1945 return SLURM_SUCCESS;
1946 }
1947
1948 /* Unused for this plugin */
select_p_select_jobinfo_get(select_jobinfo_t * jobinfo,enum select_jobdata_type data_type,void * data)1949 extern int select_p_select_jobinfo_get(select_jobinfo_t *jobinfo,
1950 enum select_jobdata_type data_type,
1951 void *data)
1952 {
1953 return SLURM_ERROR;
1954 }
1955
1956 /* Unused for this plugin */
select_p_select_jobinfo_copy(select_jobinfo_t * jobinfo)1957 extern select_jobinfo_t *select_p_select_jobinfo_copy(select_jobinfo_t *jobinfo)
1958 {
1959 return NULL;
1960 }
1961
1962 /* Unused for this plugin */
select_p_select_jobinfo_pack(select_jobinfo_t * jobinfo,Buf buffer,uint16_t protocol_version)1963 extern int select_p_select_jobinfo_pack(select_jobinfo_t *jobinfo, Buf buffer,
1964 uint16_t protocol_version)
1965 {
1966 return SLURM_SUCCESS;
1967 }
1968
1969 /* Unused for this plugin */
select_p_select_jobinfo_unpack(select_jobinfo_t * jobinfo,Buf buffer,uint16_t protocol_version)1970 extern int select_p_select_jobinfo_unpack(select_jobinfo_t *jobinfo,
1971 Buf buffer,
1972 uint16_t protocol_version)
1973 {
1974 return SLURM_SUCCESS;
1975 }
1976
1977 /* Unused for this plugin */
select_p_select_jobinfo_sprint(select_jobinfo_t * jobinfo,char * buf,size_t size,int mode)1978 extern char *select_p_select_jobinfo_sprint(select_jobinfo_t *jobinfo,
1979 char *buf, size_t size, int mode)
1980 {
1981 if (buf && size) {
1982 buf[0] = '\0';
1983 return buf;
1984 }
1985 return NULL;
1986 }
1987
1988 /* Unused for this plugin */
select_p_select_jobinfo_xstrdup(select_jobinfo_t * jobinfo,int mode)1989 extern char *select_p_select_jobinfo_xstrdup(select_jobinfo_t *jobinfo,
1990 int mode)
1991 {
1992 return NULL;
1993 }
1994
/*
 * Answer generic queries about this select plugin.
 *
 * IN info    - which datum is requested
 * IN job_ptr - unused by the supported queries
 * OUT data   - result; real type depends on info (see cases below)
 * RET SLURM_SUCCESS or SLURM_ERROR for an unknown query
 */
extern int select_p_get_info_from_plugin(enum select_plugindata_info info,
					 job_record_t *job_ptr,
					 void *data)
{
	switch (info) {
	case SELECT_CR_PLUGIN:
		/* Report which of the two cons_* flavors is loaded */
		*(uint32_t *) data = is_cons_tres ?
			SELECT_TYPE_CONS_TRES : SELECT_TYPE_CONS_RES;
		return SLURM_SUCCESS;
	case SELECT_CONFIG_INFO:
		/* No plugin-specific configuration list is exported */
		*(List *) data = NULL;
		return SLURM_SUCCESS;
	case SELECT_SINGLE_JOB_TEST:
		/* Only cons_tres supports the single-job test mode */
		*(uint32_t *) data = is_cons_tres ? 1 : 0;
		return SLURM_SUCCESS;
	default:
		error("%s: info type %d invalid", __func__, info);
		return SLURM_ERROR;
	}
}
2021
select_p_update_node_config(int index)2022 extern int select_p_update_node_config(int index)
2023 {
2024 if (index >= select_node_cnt) {
2025 error("%s: index too large (%d > %d)", __func__, index,
2026 select_node_cnt);
2027 return SLURM_ERROR;
2028 }
2029
2030 /*
2031 * Socket and core count can be changed when KNL node reboots in a
2032 * different NUMA configuration
2033 */
2034 if (!(slurmctld_conf.conf_flags & CTL_CONF_OR) &&
2035 (select_node_record[index].sockets !=
2036 select_node_record[index].node_ptr->config_ptr->sockets) &&
2037 (select_node_record[index].cores !=
2038 select_node_record[index].node_ptr->config_ptr->cores) &&
2039 ((select_node_record[index].sockets *
2040 select_node_record[index].cores) ==
2041 (select_node_record[index].node_ptr->sockets *
2042 select_node_record[index].node_ptr->cores))) {
2043 select_node_record[index].cores =
2044 select_node_record[index].node_ptr->config_ptr->cores;
2045 select_node_record[index].sockets =
2046 select_node_record[index].node_ptr->config_ptr->sockets;
2047 /* tot_sockets should be the same */
2048 /* tot_cores should be the same */
2049 }
2050
2051 return SLURM_SUCCESS;
2052 }
2053
select_p_reconfigure(void)2054 extern int select_p_reconfigure(void)
2055 {
2056 ListIterator job_iterator;
2057 job_record_t *job_ptr;
2058 int rc = SLURM_SUCCESS;
2059
2060 info("%s: reconfigure", plugin_type);
2061 select_debug_flags = slurm_get_debug_flags();
2062
2063 if (is_cons_tres) {
2064 def_cpu_per_gpu = 0;
2065 def_mem_per_gpu = 0;
2066 if (slurmctld_conf.job_defaults_list) {
2067 def_cpu_per_gpu = common_get_def_cpu_per_gpu(
2068 slurmctld_conf.job_defaults_list);
2069 def_mem_per_gpu = common_get_def_mem_per_gpu(
2070 slurmctld_conf.job_defaults_list);
2071 }
2072 }
2073
2074 rc = select_p_node_init(node_record_table_ptr, node_record_count);
2075 if (rc != SLURM_SUCCESS)
2076 return rc;
2077
2078 /* reload job data */
2079 job_iterator = list_iterator_create(job_list);
2080 while ((job_ptr = list_next(job_iterator))) {
2081 if (IS_JOB_RUNNING(job_ptr)) {
2082 /* add the job */
2083 job_res_add_job(job_ptr, 0);
2084 } else if (IS_JOB_SUSPENDED(job_ptr)) {
2085 /* add the job in a suspended state */
2086 if (job_ptr->priority == 0)
2087 (void) job_res_add_job(job_ptr, 1);
2088 else /* Gang schedule suspend */
2089 (void) job_res_add_job(job_ptr, 0);
2090 }
2091 }
2092 list_iterator_destroy(job_iterator);
2093 select_state_initializing = false;
2094
2095 return SLURM_SUCCESS;
2096 }
2097
/*
 * select_p_resv_test - Identify the nodes which "best" satisfy an advanced
 *	reservation request, optionally selecting the specific cores to
 *	reserve on each node. Uses network topology (switch) information when
 *	available to keep the reservation compact.
 *
 * IN resv_desc_ptr - reservation request; resv_desc_ptr->core_cnt, when set,
 *	is either a zero-terminated per-node core count array (core_cnt[1]
 *	non-zero) or a single aggregate core count (core_cnt[1] == 0)
 * IN node_cnt - number of nodes required
 * IN avail_node_bitmap - nodes which may be used by the reservation
 * IN/OUT core_bitmap - on input, full-system bitmap of cores which can NOT
 *	be used; on success with a core count request, replaced with the
 *	selected cores (see FIXME below about the full-system layout)
 * RET bitmap of selected nodes (caller must free), or NULL on failure
 */
extern bitstr_t *select_p_resv_test(resv_desc_msg_t *resv_desc_ptr,
				    uint32_t node_cnt,
				    bitstr_t *avail_node_bitmap,
				    bitstr_t **core_bitmap)
{
	bitstr_t **switches_bitmap;		/* nodes on this switch */
	bitstr_t ***switches_core_bitmap;	/* cores on this switch */
	int       *switches_core_cnt;		/* total cores on switch */
	int       *switches_node_cnt;		/* total nodes on switch */
	int       *switches_required;		/* set if has required node */

	bitstr_t *avail_nodes_bitmap = NULL;	/* nodes on any switch */
	bitstr_t *picked_node_bitmap;
	uint32_t *core_cnt;
	bitstr_t **exc_core_bitmap = NULL, **picked_core_bitmap;
	int32_t prev_rem_cores, rem_cores = 0, rem_cores_save, rem_nodes;
	uint32_t cores_per_node = 1;	/* Minimum cores per node to consider */
	bool aggr_core_cnt = false, clear_core, sufficient;
	int c, i, i_first, i_last, j, k, n;
	int best_fit_inx, best_fit_nodes;
	int best_fit_location = 0, best_fit_sufficient;

	xassert(avail_node_bitmap);
	xassert(resv_desc_ptr);

	/*
	 * FIXME: core_bitmap is a full-system core bitmap to be
	 * replaced with a set of per-node bitmaps in a future release
	 * of Slurm.
	 */
	if (core_bitmap)
		exc_core_bitmap = core_bitmap_to_array(*core_bitmap);

	core_cnt = resv_desc_ptr->core_cnt;

	if (core_cnt) {
		/*
		 * Run this now to set up exc_core_bitmap if needed for
		 * pick_first_cores and sequential_pick.
		 */
		if (!exc_core_bitmap)
			exc_core_bitmap = build_core_array();
		(*cons_common_callbacks.spec_core_filter)(
			avail_node_bitmap, exc_core_bitmap);
	}

	if ((resv_desc_ptr->flags & RESERVE_FLAG_FIRST_CORES) && core_cnt) {
		/* Reservation request with "Flags=first_cores CoreCnt=#" */
		avail_nodes_bitmap = (*cons_common_callbacks.pick_first_cores)(
			avail_node_bitmap,
			node_cnt, core_cnt,
			&exc_core_bitmap);
		/* On success, hand the selected cores back to the caller */
		if (avail_nodes_bitmap && core_bitmap && exc_core_bitmap) {
			FREE_NULL_BITMAP(*core_bitmap);
			*core_bitmap = core_array_to_bitmap(exc_core_bitmap);
		}
		free_core_array(&exc_core_bitmap);
		return avail_nodes_bitmap;
	}

	/* When reservation includes a nodelist we use _sequential_pick code */
	if (!switch_record_cnt || !switch_record_table || !node_cnt) {
		/* Reservation request with "Nodes=* [CoreCnt=#]" */
		avail_nodes_bitmap = (*cons_common_callbacks.sequential_pick)(
			avail_node_bitmap,
			node_cnt, core_cnt,
			&exc_core_bitmap);
		if (avail_nodes_bitmap && core_bitmap && exc_core_bitmap) {
			FREE_NULL_BITMAP(*core_bitmap);
			*core_bitmap = core_array_to_bitmap(exc_core_bitmap);
		}
		free_core_array(&exc_core_bitmap);
		return avail_nodes_bitmap;
	}

	/* Use topology state information */
	if (bit_set_count(avail_node_bitmap) < node_cnt) {
		/* Fewer candidate nodes than requested: fail fast */
		free_core_array(&exc_core_bitmap);
		return NULL;
	}

	rem_nodes = node_cnt;
	if (core_cnt && core_cnt[1]) {	/* Array of core counts */
		/*
		 * Zero-terminated per-node array: total the cores and track
		 * the smallest per-node request as the usability threshold.
		 */
		for (j = 0; core_cnt[j]; j++) {
			rem_cores += core_cnt[j];
			if (j == 0)
				cores_per_node = core_cnt[j];
			else if (cores_per_node > core_cnt[j])
				cores_per_node = core_cnt[j];
		}
	} else if (core_cnt) {		/* Aggregate core count */
		rem_cores = core_cnt[0];
		cores_per_node = core_cnt[0] / MAX(node_cnt, 1);
		aggr_core_cnt = true;
	}

	rem_cores_save = rem_cores;

	/*
	 * Construct a set of switch array entries,
	 * use the same indexes as switch_record_table in slurmctld
	 */
	switches_bitmap = xcalloc(switch_record_cnt, sizeof(bitstr_t *));
	switches_core_bitmap = xcalloc(switch_record_cnt, sizeof(bitstr_t **));
	switches_core_cnt = xcalloc(switch_record_cnt, sizeof(int));
	switches_node_cnt = xcalloc(switch_record_cnt, sizeof(int));
	switches_required = xcalloc(switch_record_cnt, sizeof(int));

	for (i = 0; i < switch_record_cnt; i++) {
		/* Restrict each switch to the available nodes/cores */
		switches_bitmap[i] =
			bit_copy(switch_record_table[i].node_bitmap);
		bit_and(switches_bitmap[i], avail_node_bitmap);
		switches_node_cnt[i] = bit_set_count(switches_bitmap[i]);
		switches_core_bitmap[i] = common_mark_avail_cores(
			switches_bitmap[i], NO_VAL16);
		if (exc_core_bitmap) {
			core_array_and_not(switches_core_bitmap[i],
					   exc_core_bitmap);
		}
		switches_core_cnt[i] =
			count_core_array_set(switches_core_bitmap[i]);
		debug2("switch:%d nodes:%d cores:%d",
		       i, switches_node_cnt[i], switches_core_cnt[i]);
	}

	/* Remove nodes with fewer available cores than needed */
	if (core_cnt) {
		/*
		 * n indexes the per-node core_cnt array and advances once per
		 * node that is retained (unused in aggregate mode).
		 */
		n = 0;

		for (j = 0; j < switch_record_cnt; j++) {
			i_first = bit_ffs(switches_bitmap[j]);
			if (i_first >= 0)
				i_last = bit_fls(switches_bitmap[j]);
			else
				i_last = i_first - 1;
			for (i = i_first; i <= i_last; i++) {
				if (!bit_test(switches_bitmap[j], i))
					continue;

				c = _get_avail_cores_on_node(
					i, exc_core_bitmap);

				clear_core = false;
				if (aggr_core_cnt && (c < cores_per_node)) {
					clear_core = true;
				} else if (aggr_core_cnt) {
					;	/* node is usable as-is */
				} else if (c < core_cnt[n]) {
					clear_core = true;
				} else if (core_cnt[n]) {
					n++;
				}
				if (!clear_core)
					continue;
				/* Drop this node from every switch record */
				for (k = 0; k < switch_record_cnt; k++) {
					if (!switches_bitmap[k] ||
					    !bit_test(switches_bitmap[k], i))
						continue;
					bit_clear(switches_bitmap[k], i);
					switches_node_cnt[k]--;
					switches_core_cnt[k] -= c;
				}
			}
		}
	}

#if SELECT_DEBUG
	/* Don't compile this, it slows things down too much */
	for (i = 0; i < switch_record_cnt; i++) {
		char *node_names = NULL;
		if (switches_node_cnt[i])
			node_names = bitmap2node_name(switches_bitmap[i]);
		info("switch=%s nodes=%u:%s cores:%d required:%u speed=%u",
		     switch_record_table[i].name,
		     switches_node_cnt[i], node_names,
		     switches_core_cnt[i], switches_required[i],
		     switch_record_table[i].link_speed);
		xfree(node_names);
	}
#endif

	/* Determine lowest level switch satisfying request with best fit */
	best_fit_inx = -1;
	for (j = 0; j < switch_record_cnt; j++) {
		if ((switches_node_cnt[j] < rem_nodes) ||
		    (core_cnt && (switches_core_cnt[j] < rem_cores)))
			continue;
		if ((best_fit_inx == -1) ||
		    (switch_record_table[j].level <
		     switch_record_table[best_fit_inx].level) ||
		    ((switch_record_table[j].level ==
		      switch_record_table[best_fit_inx].level) &&
		     (switches_node_cnt[j] < switches_node_cnt[best_fit_inx])))
			/* We should use core count by switch here as well */
			best_fit_inx = j;
	}
	if (best_fit_inx == -1) {
		debug("%s: could not find resources for reservation", __func__);
		goto fini;
	}

	/* Identify usable leafs (within higher switch having best fit) */
	for (j = 0; j < switch_record_cnt; j++) {
		if ((switch_record_table[j].level != 0) ||
		    (!bit_super_set(switches_bitmap[j],
				    switches_bitmap[best_fit_inx]))) {
			/* Not a leaf under the chosen switch: disable it */
			switches_node_cnt[j] = 0;
		}
	}

	/* Select resources from these leafs on a best-fit basis */
	avail_nodes_bitmap = bit_alloc(node_record_count);
	while (rem_nodes > 0) {
		best_fit_nodes = best_fit_sufficient = 0;
		for (j = 0; j < switch_record_cnt; j++) {
			if (switches_node_cnt[j] == 0)
				continue;
			if (core_cnt) {
				sufficient =
					(switches_node_cnt[j] >= rem_nodes) &&
					(switches_core_cnt[j] >= rem_cores);
			} else
				sufficient = switches_node_cnt[j] >= rem_nodes;
			/*
			 * If first possibility OR
			 * first set large enough for request OR
			 * tightest fit (less resource waste) OR
			 * nothing yet large enough, but this is biggest
			 */
			if ((best_fit_nodes == 0) ||
			    (sufficient && (best_fit_sufficient == 0)) ||
			    (sufficient &&
			     (switches_node_cnt[j] < best_fit_nodes)) ||
			    ((sufficient == 0) &&
			     (switches_node_cnt[j] > best_fit_nodes))) {
				best_fit_nodes = switches_node_cnt[j];
				best_fit_location = j;
				best_fit_sufficient = sufficient;
			}
		}
		if (best_fit_nodes == 0)
			break;
		/* Use select nodes from this leaf */
		i_first = bit_ffs(switches_bitmap[best_fit_location]);
		if (i_first >= 0)
			i_last = bit_fls(switches_bitmap[best_fit_location]);
		else
			i_last = i_first - 1;

		for (i = i_first; i <= i_last; i++) {
			if (!bit_test(switches_bitmap[best_fit_location], i))
				continue;
			bit_clear(switches_bitmap[best_fit_location], i);
			switches_node_cnt[best_fit_location]--;

			if (bit_test(avail_nodes_bitmap, i)) {
				/*
				 * node on multiple leaf switches
				 * and already selected
				 */
				continue;
			}

			if (core_cnt) {
				c = _get_avail_cores_on_node(i,
							     exc_core_bitmap);
				if (c < cores_per_node)
					continue;
				debug2("Using node %d with %d cores available",
				       i, c);
				rem_cores -= c;
			}
			bit_set(avail_nodes_bitmap, i);
			if (--rem_nodes <= 0)
				break;
		}
		/* This leaf is exhausted for further consideration */
		switches_node_cnt[best_fit_location] = 0;
	}

	if ((rem_nodes > 0) || (rem_cores > 0))	/* insufficient resources */
		FREE_NULL_BITMAP(avail_nodes_bitmap);

fini:	for (i = 0; i < switch_record_cnt; i++) {
		FREE_NULL_BITMAP(switches_bitmap[i]);
		free_core_array(&switches_core_bitmap[i]);
	}
	xfree(switches_bitmap);
	xfree(switches_core_bitmap);
	xfree(switches_core_cnt);
	xfree(switches_node_cnt);
	xfree(switches_required);

	if (avail_nodes_bitmap && core_cnt) {
		/* Reservation is using partial nodes */
		picked_node_bitmap = bit_alloc(bit_size(avail_node_bitmap));
		picked_core_bitmap = build_core_array();

		rem_cores = rem_cores_save;
		n = 0;
		prev_rem_cores = -1;

		while (rem_cores) {
			int avail_cores_in_node, inx, coff;
			bitstr_t *use_exc_bitmap = NULL,
				*use_picked_bitmap = NULL;

			inx = bit_ffs(avail_nodes_bitmap);
			if ((inx < 0) && aggr_core_cnt && (rem_cores > 0) &&
			    (rem_cores != prev_rem_cores)) {
				/*
				 * Make another pass over nodes to reach
				 * requested aggregate core count
				 */
				bit_or(avail_nodes_bitmap, picked_node_bitmap);
				inx = bit_ffs(avail_nodes_bitmap);
				prev_rem_cores = rem_cores;
				cores_per_node = 1;
			}
			if (inx < 0)
				break;

			debug2("Using node inx:%d cores_per_node:%d rem_cores:%u",
			       inx, cores_per_node, rem_cores);

			/* Clear this node from the initial available bitmap */
			bit_clear(avail_nodes_bitmap, inx);

			if (select_node_record[inx].tot_cores < cores_per_node)
				continue;
			avail_cores_in_node =
				_get_avail_cores_on_node(inx, exc_core_bitmap);

			debug2("Node inx:%d has %d available cores", inx,
			       avail_cores_in_node);
			if (avail_cores_in_node < cores_per_node)
				continue;

			xassert(exc_core_bitmap);

			avail_cores_in_node = 0;

			/*
			 * cons_res keeps one full-system core bitmap (node
			 * offset via cr_get_coremap_offset); cons_tres keeps
			 * a per-node bitmap array indexed by node.
			 */
			if (!is_cons_tres) {
				use_exc_bitmap = *exc_core_bitmap;
				coff = cr_get_coremap_offset(inx);
				if (!*picked_core_bitmap)
					*picked_core_bitmap = bit_alloc(
						bit_size(use_exc_bitmap));
				use_picked_bitmap = *picked_core_bitmap;
			} else {
				use_exc_bitmap = exc_core_bitmap[inx];
				coff = 0;
				if (!picked_core_bitmap[inx]) {
					picked_core_bitmap[inx] = bit_alloc(
						select_node_record[inx].
						tot_cores);
				}
				use_picked_bitmap = picked_core_bitmap[inx];
			}

			/* Pick unexcluded, not-yet-picked cores on this node */
			for (int i = 0;
			     i < select_node_record[inx].tot_cores;
			     i++) {
				int set = coff + i;
				if ((!use_exc_bitmap ||
				     !bit_test(use_exc_bitmap, set)) &&
				    !bit_test(use_picked_bitmap, set)) {
					bit_set(use_picked_bitmap, set);
					rem_cores--;
					avail_cores_in_node++;
				}
				if (rem_cores == 0)
					break;
				if (aggr_core_cnt &&
				    (avail_cores_in_node >= cores_per_node))
					break;
				if (!aggr_core_cnt &&
				    (avail_cores_in_node >= core_cnt[n]))
					break;
			}

			/* Add this node to the final node bitmap */
			if (avail_cores_in_node)
				bit_set(picked_node_bitmap, inx);
			n++;
		}
		FREE_NULL_BITMAP(avail_nodes_bitmap);
		free_core_array(&exc_core_bitmap);

		if (rem_cores) {
			info("reservation request can not be satisfied");
			FREE_NULL_BITMAP(picked_node_bitmap);
			picked_node_bitmap = NULL;
		} else {
			*core_bitmap = core_array_to_bitmap(picked_core_bitmap);
		}
		free_core_array(&picked_core_bitmap);
		return picked_node_bitmap;
	}
	free_core_array(&exc_core_bitmap);

	return avail_nodes_bitmap;
}
2500