1 /*****************************************************************************\
2  *  read_config.c - read the overall slurm configuration file
3  *****************************************************************************
4  *  Copyright (C) 2002-2007 The Regents of the University of California.
5  *  Copyright (C) 2008-2010 Lawrence Livermore National Security.
6  *  Portions Copyright (C) 2010-2016 SchedMD <https://www.schedmd.com>.
7  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
8  *  Written by Morris Jette <jette1@llnl.gov>.
9  *  CODE-OCEC-09-009. All rights reserved.
10  *
11  *  This file is part of Slurm, a resource management program.
12  *  For details, see <https://slurm.schedmd.com/>.
13  *  Please also read the included file: DISCLAIMER.
14  *
15  *  Slurm is free software; you can redistribute it and/or modify it under
16  *  the terms of the GNU General Public License as published by the Free
17  *  Software Foundation; either version 2 of the License, or (at your option)
18  *  any later version.
19  *
20  *  In addition, as a special exception, the copyright holders give permission
21  *  to link the code of portions of this program with the OpenSSL library under
22  *  certain conditions as described in each individual source file, and
23  *  distribute linked combinations including the two. You must obey the GNU
24  *  General Public License in all respects for all of the code used other than
25  *  OpenSSL. If you modify file(s) with this exception, you may extend this
26  *  exception to your version of the file(s), but you are not obligated to do
27  *  so. If you do not wish to do so, delete this exception statement from your
28  *  version.  If you delete this exception statement from all source files in
29  *  the program, then also delete it here.
30  *
31  *  Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
32  *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
33  *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
34  *  details.
35  *
36  *  You should have received a copy of the GNU General Public License along
37  *  with Slurm; if not, write to the Free Software Foundation, Inc.,
38  *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
39 \*****************************************************************************/
40 
41 #include "config.h"
42 
43 #include <ctype.h>
44 #include <errno.h>
45 #include <stdio.h>
46 #include <stdlib.h>
47 #include <string.h>
48 #include <syslog.h>
49 #include <sys/stat.h>
50 #include <sys/types.h>
51 #include <time.h>
52 #include <unistd.h>
53 
54 #include "src/common/assoc_mgr.h"
55 #include "src/common/cpu_frequency.h"
56 #include "src/common/gres.h"
57 #include "src/common/hostlist.h"
58 #include "src/common/layouts_mgr.h"
59 #include "src/common/list.h"
60 #include "src/common/macros.h"
61 #include "src/common/node_features.h"
62 #include "src/common/node_select.h"
63 #include "src/common/power.h"
64 #include "src/common/prep.h"
65 #include "src/common/read_config.h"
66 #include "src/common/slurm_jobcomp.h"
67 #include "src/common/slurm_mcs.h"
68 #include "src/common/slurm_topology.h"
69 #include "src/common/slurm_rlimits_info.h"
70 #include "src/common/slurm_route.h"
71 #include "src/common/strnatcmp.h"
72 #include "src/common/switch.h"
73 #include "src/common/xstring.h"
74 #include "src/common/xcgroup_read_config.h"
75 
76 #include "src/slurmctld/acct_policy.h"
77 #include "src/slurmctld/burst_buffer.h"
78 #include "src/slurmctld/fed_mgr.h"
79 #include "src/slurmctld/front_end.h"
80 #include "src/slurmctld/gang.h"
81 #include "src/slurmctld/job_scheduler.h"
82 #include "src/slurmctld/job_submit.h"
83 #include "src/slurmctld/licenses.h"
84 #include "src/slurmctld/locks.h"
85 #include "src/slurmctld/node_scheduler.h"
86 #include "src/slurmctld/port_mgr.h"
87 #include "src/slurmctld/preempt.h"
88 #include "src/slurmctld/proc_req.h"
89 #include "src/slurmctld/read_config.h"
90 #include "src/slurmctld/reservation.h"
91 #include "src/slurmctld/sched_plugin.h"
92 #include "src/slurmctld/slurmctld.h"
93 #include "src/slurmctld/srun_comm.h"
94 #include "src/slurmctld/trigger_mgr.h"
95 
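/* Magic value stored in node_feature_t records; used to validate them */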
96 #define FEATURE_MAGIC	0x34dfd8b5
97 
98 /* Global variables */
99 List active_feature_list;	/* list of currently active feature records */
100 List avail_feature_list;	/* list of available feature records */
101 bool node_features_updated = true;
102 bool slurmctld_init_db = true;
103 
104 static void _acct_restore_active_jobs(void);
105 static void _add_config_feature(List feature_list, char *feature,
106 				bitstr_t *node_bitmap);
107 static void _add_config_feature_inx(List feature_list, char *feature,
108 				    int node_inx);
109 static void _build_bitmaps(void);
110 static void _build_bitmaps_pre_select(void);
111 static int  _compare_hostnames(node_record_t *old_node_table,
112 			       int old_node_count, node_record_t *node_table,
113 			       int node_count);
114 static void _gres_reconfig(bool reconfig);
115 static int  _init_all_slurm_conf(void);
116 static void _list_delete_feature(void *feature_entry);
117 static int  _preserve_select_type_param(slurm_ctl_conf_t * ctl_conf_ptr,
118 					uint16_t old_select_type_p);
119 static void _purge_old_node_state(node_record_t *old_node_table_ptr,
120 				  int old_node_record_count);
121 static void _purge_old_part_state(List old_part_list, char *old_def_part_name);
122 static int  _reset_node_bitmaps(void *x, void *arg);
123 static void _restore_job_accounting(void);
124 
125 static int  _restore_node_state(int recover, node_record_t *old_node_table_ptr,
126 				int old_node_record_count);
127 static int  _restore_part_state(List old_part_list, char *old_def_part_name,
128 				uint16_t flags);
129 static void _set_features(node_record_t *old_node_table_ptr,
130 			  int old_node_record_count, int recover);
131 static void _stat_slurm_dirs(void);
132 static int  _sync_nodes_to_comp_job(void);
133 static int  _sync_nodes_to_jobs(bool reconfig);
134 static int  _sync_nodes_to_active_job(job_record_t *job_ptr);
135 static void _sync_nodes_to_suspended_job(job_record_t *job_ptr);
136 static void _sync_part_prio(void);
137 static int  _update_preempt(uint16_t old_enable_preempt);
138 
139 
140 /*
141  * Set up the global response_cluster_rec
142  */
143 static void _set_response_cluster_rec(void)
144 {
145 	if (response_cluster_rec)
146 		return;
147 
148 	response_cluster_rec = xmalloc(sizeof(slurmdb_cluster_rec_t));
149 	response_cluster_rec->name = xstrdup(slurmctld_conf.cluster_name);
150 	if (slurmctld_conf.slurmctld_addr) {
151 		response_cluster_rec->control_host =
152 			xstrdup(slurmctld_conf.slurmctld_addr);
153 	} else {
154 		response_cluster_rec->control_host =
155 			xstrdup(slurmctld_conf.control_addr[0]);
156 	}
157 	response_cluster_rec->control_port = slurmctld_conf.slurmctld_port;
158 	response_cluster_rec->rpc_version = SLURM_PROTOCOL_VERSION;
159 	response_cluster_rec->plugin_id_select = select_get_plugin_id();
160 }
161 
162 /*
163  * Free the global response_cluster_rec
164  */
165 extern void cluster_rec_free(void)
166 {
167 	if (response_cluster_rec) {
168 		xfree(response_cluster_rec->control_host);
169 		xfree(response_cluster_rec->name);
170 		xfree(response_cluster_rec);
171 	}
172 }
173 
174 /* Verify that Slurm directories are secure, not world writable */
175 static void _stat_slurm_dirs(void)
176 {
177 	struct stat stat_buf;
178 	char *problem_dir = NULL;
179 
180 	/*
181 	 * PluginDir may have multiple values, and is checked by
182 	 * _is_valid_path() instead
183 	 */
184 
185 	if (slurmctld_conf.plugstack &&
186 	    (stat(slurmctld_conf.plugstack, &stat_buf) == 0) &&
187 	    (stat_buf.st_mode & S_IWOTH)) {
188 		problem_dir = "PlugStack";
189 	}
190 	if ((stat(slurmctld_conf.slurmd_spooldir, &stat_buf) == 0) &&
191 	    (stat_buf.st_mode & S_IWOTH)) {
192 		problem_dir = "SlurmdSpoolDir";
193 	}
194 	if ((stat(slurmctld_conf.state_save_location, &stat_buf) == 0) &&
195 	    (stat_buf.st_mode & S_IWOTH)) {
196 		problem_dir = "StateSaveLocation";
197 	}
198 
199 	if (problem_dir) {
200 		error("################################################");
201 		error("###      SEVERE SECURITY VULNERABILITY       ###");
202 		error("### %s DIRECTORY IS WORLD WRITABLE ###", problem_dir);
203 		error("###         CORRECT FILE PERMISSIONS         ###");
204 		error("################################################");
205 	}
206 }
207 
208 /*
209  * _reorder_nodes_by_name - order node table in ascending order of name
210  */
211 static void _reorder_nodes_by_name(void)
212 {
213 	node_record_t *node_ptr, *node_ptr2;
214 	int i, j, min_inx;
215 
216 	/* Now we need to sort the node records */
217 	for (i = 0; i < node_record_count; i++) {
218 		min_inx = i;
219 		for (j = i + 1; j < node_record_count; j++) {
220 			if (strnatcmp(node_record_table_ptr[j].name,
221 				      node_record_table_ptr[min_inx].name) < 0)
222 				min_inx = j;
223 		}
224 
225 		if (min_inx != i) {	/* swap records */
226 			node_record_t node_record_tmp;
227 
228 			j = sizeof(node_record_t);
229 			node_ptr  = node_record_table_ptr + i;
230 			node_ptr2 = node_record_table_ptr + min_inx;
231 
232 			memcpy(&node_record_tmp, node_ptr, j);
233 			memcpy(node_ptr, node_ptr2, j);
234 			memcpy(node_ptr2, &node_record_tmp, j);
235 		}
236 	}
237 
238 #if _DEBUG
239 	/* Log the results */
240 	for (i=0, node_ptr = node_record_table_ptr; i < node_record_count;
241 	     i++, node_ptr++) {
242 		info("node_rank[%d]: %s", i, node_ptr->name);
243 	}
244 #endif
245 }
246 
247 /*
248  * _reorder_nodes_by_rank - order node table in ascending order of node_rank
249  * This depends on the TopologyPlugin and/or SelectPlugin, which may generate
250  * such a ranking.
251  */
252 static void _reorder_nodes_by_rank(void)
253 {
254 	node_record_t *node_ptr, *node_ptr2;
255 	int i, j, min_inx;
256 	uint32_t min_val;
257 
258 	/* Now we need to sort the node records */
259 	for (i = 0; i < node_record_count; i++) {
260 		min_val = node_record_table_ptr[i].node_rank;
261 		min_inx = i;
262 		for (j = i + 1; j < node_record_count; j++) {
263 			if (node_record_table_ptr[j].node_rank < min_val) {
264 				min_val = node_record_table_ptr[j].node_rank;
265 				min_inx = j;
266 			}
267 		}
268 
269 		if (min_inx != i) {	/* swap records */
270 			node_record_t node_record_tmp;
271 
272 			j = sizeof(node_record_t);
273 			node_ptr  = node_record_table_ptr + i;
274 			node_ptr2 = node_record_table_ptr + min_inx;
275 
276 			memcpy(&node_record_tmp, node_ptr, j);
277 			memcpy(node_ptr, node_ptr2, j);
278 			memcpy(node_ptr2, &node_record_tmp, j);
279 		}
280 	}
281 
282 #if _DEBUG
283 	/* Log the results */
284 	for (i=0, node_ptr = node_record_table_ptr; i < node_record_count;
285 	     i++, node_ptr++) {
286 		info("node_rank[%u]: %s", node_ptr->node_rank, node_ptr->name);
287 	}
288 #endif
289 }
290 
291 /*
292  * Unfortunately the global feature bitmaps have not been set up at this point,
293  * so we'll have to scan through the node_record_table directly to locate
294  * the appropriate records.
295  */
296 static void _add_nodes_with_feature(hostlist_t hl, char *feature)
297 {
298 	for (int i = 0; i < node_record_count; i++) {
299 		char *features, *tmp, *tok, *last = NULL;
300 
301 		if (!node_record_table_ptr[i].features)
302 			continue;
303 
304 		features = tmp = xstrdup(node_record_table_ptr[i].features);
305 
306 		while ((tok = strtok_r(tmp, ",", &last))) {
307 			if (!xstrcmp(tok, feature)) {
308 				hostlist_push_host(hl, node_record_table_ptr[i].name);
309 				break;
310 			}
311 			tmp = NULL;
312 		}
313 		xfree(features);
314 	}
315 }
316 
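/*
 * Replace any NodeSet names in a partition's node specification with the
 * nodes they represent (per the nodeset's Nodes= and/or Feature= values).
 */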
317 static void _handle_nodesets(char **nodeline)
318 {
319 	int count;
320 	slurm_conf_nodeset_t *ptr, **ptr_array;
321 	hostlist_t hl;
322 
323 	count = slurm_conf_nodeset_array(&ptr_array);
324 
325 	hl = hostlist_create(*nodeline);
326 
327 	for (int i = 0; i < count; i++) {
328 		ptr = ptr_array[i];
329 
330 		/* swap the nodeset entry with the applicable nodes */
331 		if (hostlist_delete_host(hl, ptr->name)) {
332 			if (ptr->feature) {
333 				_add_nodes_with_feature(hl, ptr->feature);
334 			}
335 
336 			if (ptr->nodes)
337 				hostlist_push_host(hl, ptr->nodes);
338 		}
339 	}
340 
341 	xfree(*nodeline);
342 	*nodeline = hostlist_ranged_string_xmalloc(hl);
343 	hostlist_destroy(hl);
344 }
345 
346 /*
347  * _build_bitmaps_pre_select - recover some state for jobs and nodes prior to
348  *	calling the select_* functions
349  */
350 static void _build_bitmaps_pre_select(void)
351 {
352 	part_record_t *part_ptr;
353 	node_record_t *node_ptr;
354 	ListIterator part_iterator;
355 	int i;
356 
357 	/* scan partition table and identify nodes in each */
358 	part_iterator = list_iterator_create(part_list);
359 	while ((part_ptr = list_next(part_iterator))) {
360 		_handle_nodesets(&part_ptr->nodes);
361 		if (build_part_bitmap(part_ptr) == ESLURM_INVALID_NODE_NAME)
362 			fatal("Invalid node names in partition %s",
363 					part_ptr->name);
364 	}
365 	list_iterator_destroy(part_iterator);
366 
367 	/* initialize the configuration bitmaps */
368 	list_for_each(config_list, _reset_node_bitmaps, NULL);
369 
370 	for (i = 0, node_ptr = node_record_table_ptr;
371 	     i < node_record_count; i++, node_ptr++) {
372 		if (node_ptr->config_ptr)
373 			bit_set(node_ptr->config_ptr->node_bitmap, i);
374 	}
375 
376 	return;
377 }
378 
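/* list_for_each() callback: give a config record a fresh, cleared node_bitmap */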
379 static int _reset_node_bitmaps(void *x, void *arg)
380 {
381 	config_record_t *config_ptr = (config_record_t *) x;
382 
383 	FREE_NULL_BITMAP(config_ptr->node_bitmap);
384 	config_ptr->node_bitmap = (bitstr_t *) bit_alloc(node_record_count);
385 
386 	return 0;
387 }
388 
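/*
 * list_for_each() callback: clear share_node_bitmap bits for nodes allocated
 * to running jobs that do not permit sharing of their resources
 */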
389 static int _set_share_node_bitmap(void *x, void *arg)
390 {
391 	job_record_t *job_ptr = (job_record_t *) x;
392 
393 	if (!IS_JOB_RUNNING(job_ptr) ||
394 	    (job_ptr->node_bitmap == NULL)        ||
395 	    (job_ptr->details     == NULL)        ||
396 	    (job_ptr->details->share_res != 0))
397 		return 0;
398 
399 	bit_and_not(share_node_bitmap, job_ptr->node_bitmap);
400 
401 	return 0;
402 }
403 
404 /*
405  * _set_slurmd_addr - establish the slurm_addr_t for the slurmd on each node
406  *	Uses common data structures.
407  */
408 static void _set_slurmd_addr(void)
409 {
410 #ifndef HAVE_FRONT_END
411 	int i;
412 	node_record_t *node_ptr = node_record_table_ptr;
413 	DEF_TIMERS;
414 
415 	xassert(verify_lock(CONF_LOCK, READ_LOCK));
416 
417 	START_TIMER;
418 	for (i = 0; i < node_record_count; i++, node_ptr++) {
419 		if ((node_ptr->name == NULL) ||
420 		    (node_ptr->name[0] == '\0'))
421 			continue;
422 		if (IS_NODE_FUTURE(node_ptr))
423 			continue;
424 		if (IS_NODE_CLOUD(node_ptr)) {
425 			if ((slurmctld_conf.suspend_time < 1) ||
426 			    (slurmctld_conf.resume_program == NULL) ||
427 			    (slurmctld_conf.suspend_program == NULL))
428 				error("%s: Node %s configured with CLOUD state but "
429 				      "SuspendTime, SuspendProgram or "
430 				      "ResumeProgram is not set",
431 				      __func__, node_ptr->name);
432 			if (IS_NODE_POWER_SAVE(node_ptr))
433 				continue;
434 		}
435 		if (node_ptr->port == 0)
436 			node_ptr->port = slurmctld_conf.slurmd_port;
437 		slurm_set_addr(&node_ptr->slurm_addr, node_ptr->port,
438 			       node_ptr->comm_name);
439 		if (node_ptr->slurm_addr.sin_port)
440 			continue;
441 		error("%s: failure on %s", __func__, node_ptr->comm_name);
442 		node_ptr->node_state = NODE_STATE_FUTURE;
443 		node_ptr->port = 0;
444 		xfree(node_ptr->reason);
445 		node_ptr->reason = xstrdup("NO NETWORK ADDRESS FOUND");
446 		node_ptr->reason_time = time(NULL);
447 		node_ptr->reason_uid = slurmctld_conf.slurm_user_id;
448 	}
449 
450 	END_TIMER2("_set_slurmd_addr");
451 #endif
452 }
453 
454 /*
455  * _build_bitmaps - build node bitmaps to define which nodes are in which
456  *    1) partition  2) configuration record  3) up state  4) idle state
457  *    Also rebuilds the global node state bitmaps (idle, up, power, etc.)
458  *    and resyncs DRAINED vs. DRAINING state.
459  * Note: Operates on common variables, no arguments
460  *	node_record_count - number of nodes in the system
461  *	node_record_table_ptr - pointer to global node table
462  *	part_list - pointer to global partition list
463  */
464 static void _build_bitmaps(void)
465 {
466 	int i;
467 	node_record_t *node_ptr;
468 
469 	last_node_update = time(NULL);
470 	last_part_update = time(NULL);
471 
472 	/* initialize the idle and up bitmaps */
473 	FREE_NULL_BITMAP(avail_node_bitmap);
474 	FREE_NULL_BITMAP(bf_ignore_node_bitmap);
475 	FREE_NULL_BITMAP(booting_node_bitmap);
476 	FREE_NULL_BITMAP(cg_node_bitmap);
477 	FREE_NULL_BITMAP(future_node_bitmap);
478 	FREE_NULL_BITMAP(idle_node_bitmap);
479 	FREE_NULL_BITMAP(power_node_bitmap);
480 	FREE_NULL_BITMAP(share_node_bitmap);
481 	FREE_NULL_BITMAP(up_node_bitmap);
482 	FREE_NULL_BITMAP(rs_node_bitmap);
483 	avail_node_bitmap = (bitstr_t *) bit_alloc(node_record_count);
484 	bf_ignore_node_bitmap = bit_alloc(node_record_count);
485 	booting_node_bitmap = (bitstr_t *) bit_alloc(node_record_count);
486 	cg_node_bitmap    = (bitstr_t *) bit_alloc(node_record_count);
487 	future_node_bitmap = (bitstr_t *) bit_alloc(node_record_count);
488 	idle_node_bitmap  = (bitstr_t *) bit_alloc(node_record_count);
489 	power_node_bitmap = (bitstr_t *) bit_alloc(node_record_count);
490 	share_node_bitmap = (bitstr_t *) bit_alloc(node_record_count);
491 	up_node_bitmap    = (bitstr_t *) bit_alloc(node_record_count);
492 	rs_node_bitmap    = (bitstr_t *) bit_alloc(node_record_count);
493 
494 	/* Set all bits, all nodes initially available for sharing */
495 	bit_set_all(share_node_bitmap);
496 
497 	/* identify all nodes non-sharable due to non-sharing jobs */
498 	list_for_each(job_list, _set_share_node_bitmap, NULL);
499 
500 	/* scan all nodes and identify which are up, idle and
501 	 * their configuration, resync DRAINED vs. DRAINING state */
502 	for (i = 0, node_ptr = node_record_table_ptr;
503 	     i < node_record_count; i++, node_ptr++) {
504 		uint32_t drain_flag, job_cnt;
505 
506 		if (node_ptr->name[0] == '\0')
507 			continue;	/* defunct */
508 		drain_flag = IS_NODE_DRAIN(node_ptr) |
509 			     IS_NODE_FAIL(node_ptr);
510 		job_cnt = node_ptr->run_job_cnt + node_ptr->comp_job_cnt;
511 
512 		if ((IS_NODE_IDLE(node_ptr) && (job_cnt == 0)) ||
513 		    IS_NODE_DOWN(node_ptr))
514 			bit_set(idle_node_bitmap, i);
515 		if (IS_NODE_POWER_UP(node_ptr))
516 			bit_set(booting_node_bitmap, i);
517 		if (IS_NODE_COMPLETING(node_ptr))
518 			bit_set(cg_node_bitmap, i);
519 		if (IS_NODE_IDLE(node_ptr) ||
520 		    IS_NODE_ALLOCATED(node_ptr) ||
521 		    (IS_NODE_REBOOT(node_ptr) &&
522 		     (node_ptr->next_state == NODE_RESUME))) {
523 			if ((drain_flag == 0) &&
524 			    (!IS_NODE_NO_RESPOND(node_ptr)))
525 				make_node_avail(i);
526 			bit_set(up_node_bitmap, i);
527 		}
528 		if (IS_NODE_POWER_SAVE(node_ptr))
529 			bit_set(power_node_bitmap, i);
530 		if (IS_NODE_POWERING_DOWN(node_ptr))
531 			bit_clear(avail_node_bitmap, i);
532 		if (IS_NODE_FUTURE(node_ptr))
533 			bit_set(future_node_bitmap, i);
534 
535 		if (IS_NODE_REBOOT(node_ptr) &&
536 		    (node_ptr->next_state == NODE_RESUME))
537 			bit_set(rs_node_bitmap, i);
538 	}
539 }
540 
541 
542 /*
543  * _init_all_slurm_conf - initialize or re-initialize the slurm
544  *	configuration values.
545  * RET 0 if no error, otherwise an error code.
546  * NOTE: We leave the job table intact
547  * NOTE: Operates on common variables, no arguments
548  */
549 static int _init_all_slurm_conf(void)
550 {
551 	int error_code;
552 	char *conf_name = xstrdup(slurmctld_conf.slurm_conf);
553 
554 	slurm_conf_reinit(conf_name);
555 	xfree(conf_name);
556 
557 	if ((error_code = init_node_conf()))
558 		return error_code;
559 
560 	if ((error_code = init_part_conf()))
561 		return error_code;
562 
563 	if ((error_code = init_job_conf()))
564 		return error_code;
565 
566 	return 0;
567 }
568 
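/* Apply one DownNodes line: set the state and reason of each named node */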
569 static int _handle_downnodes_line(slurm_conf_downnodes_t *down)
570 {
571 	int error_code = 0;
572 	node_record_t *node_rec = NULL;
573 	hostlist_t alias_list = NULL;
574 	char *alias = NULL;
575 	int state_val = NODE_STATE_DOWN;
576 
577 	if (down->state != NULL) {
578 		state_val = state_str2int(down->state, down->nodenames);
579 		if (state_val == NO_VAL) {
580 			error("Invalid State \"%s\"", down->state);
581 			goto cleanup;
582 		}
583 	}
584 
585 	if ((alias_list = hostlist_create(down->nodenames)) == NULL) {
586 		error("Unable to create NodeName list from %s",
587 		      down->nodenames);
588 		error_code = errno;
589 		goto cleanup;
590 	}
591 
592 	while ((alias = hostlist_shift(alias_list))) {
593 		node_rec = find_node_record(alias);
594 		if (node_rec == NULL) {
595 			error("DownNode \"%s\" does not exist!", alias);
596 			free(alias);
597 			continue;
598 		}
599 
600 		if ((state_val != NO_VAL) &&
601 		    (state_val != NODE_STATE_UNKNOWN))
602 			node_rec->node_state = state_val;
603 		if (down->reason) {
604 			xfree(node_rec->reason);
605 			node_rec->reason = xstrdup(down->reason);
606 			node_rec->reason_time = time(NULL);
607 			node_rec->reason_uid = slurmctld_conf.slurm_user_id;
608 		}
609 		free(alias);
610 	}
611 
612 cleanup:
613 	if (alias_list)
614 		hostlist_destroy(alias_list);
615 	return error_code;
616 }
617 
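/* Process every DownNodes line found in slurm.conf */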
618 static void _handle_all_downnodes(void)
619 {
620 	slurm_conf_downnodes_t *ptr, **ptr_array;
621 	int count;
622 	int i;
623 
624 	count = slurm_conf_downnodes_array(&ptr_array);
625 	if (count == 0) {
626 		debug("No DownNodes");
627 		return;
628 	}
629 
630 	for (i = 0; i < count; i++) {
631 		ptr = ptr_array[i];
632 
633 		_handle_downnodes_line(ptr);
634 	}
635 }
636 
637 /*
638  * _build_all_nodeline_info - get an array of slurm_conf_node_t structures
639  *	from the slurm.conf reader, build table, and set values
640  * RET 0 if no error, error code otherwise
641  * Note: Operates on common variables
642  *	default_node_record - default node configuration values
643  */
644 static int _build_all_nodeline_info(void)
645 {
646 	int rc, rc2;
647 
648 	/* Load the node table here */
649 	rc = build_all_nodeline_info(false, slurmctld_tres_cnt);
650 	(void)acct_storage_g_reconfig(acct_db_conn, 0);
651 	rc2 = build_all_frontend_info(false);
652 	rc = MAX(rc, rc2);
653 
654 	return rc;
655 }
656 
657 /* Convert a comma delimited list of account names into a NULL terminated
658  * array of pointers to strings. Call accounts_list_free() to release memory */
659 extern void accounts_list_build(char *accounts, char ***accounts_array)
660 {
661 	char *tmp_accts, *one_acct_name, *name_ptr = NULL, **tmp_array = NULL;
662 	int array_len = 0, array_used = 0;
663 
664 	if (!accounts) {
665 		accounts_list_free(accounts_array);
666 		*accounts_array = NULL;
667 		return;
668 	}
669 
670 	tmp_accts = xstrdup(accounts);
671 	one_acct_name = strtok_r(tmp_accts, ",", &name_ptr);
672 	while (one_acct_name) {
673 		if (array_len < array_used + 2) {
674 			array_len += 10;
675 			xrealloc(tmp_array, sizeof(char *) * array_len);
676 		}
677 		tmp_array[array_used++] = xstrdup(one_acct_name);
678 		one_acct_name = strtok_r(NULL, ",", &name_ptr);
679 	}
680 	xfree(tmp_accts);
681 	accounts_list_free(accounts_array);
682 	*accounts_array = tmp_array;
683 }
684 /* Free memory allocated for an account array by accounts_list_build() */
685 extern void accounts_list_free(char ***accounts_array)
686 {
687 	int i;
688 
689 	if (*accounts_array == NULL)
690 		return;
691 	for (i = 0; accounts_array[0][i]; i++)
692 		xfree(accounts_array[0][i]);
693 	xfree(*accounts_array);
694 }
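/*
 * Illustrative usage sketch (the account names and part_ptr are placeholders,
 * mirroring how partition Allow/DenyAccounts are handled below):
 *
 *	accounts_list_build("acct1,acct2", &part_ptr->allow_account_array);
 *	...
 *	accounts_list_free(&part_ptr->allow_account_array);
 */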
695 
696 /* Convert a comma delimited list of QOS names into a bitmap */
697 extern void qos_list_build(char *qos, bitstr_t **qos_bits)
698 {
699 	char *tmp_qos, *one_qos_name, *name_ptr = NULL;
700 	slurmdb_qos_rec_t qos_rec, *qos_ptr = NULL;
701 	bitstr_t *tmp_qos_bitstr;
702 	int rc;
703 	assoc_mgr_lock_t locks = { .qos = READ_LOCK };
704 
705 	if (!qos) {
706 		FREE_NULL_BITMAP(*qos_bits);
707 		return;
708 	}
709 
710 	/* Lock here to avoid g_qos_count changing under us */
711 	assoc_mgr_lock(&locks);
712 	if (!g_qos_count) {
713 		error("We have no QOS on the system; ignoring invalid "
714 		      "Allow/DenyQOS value(s) %s",
715 		      qos);
716 		assoc_mgr_unlock(&locks);
717 		FREE_NULL_BITMAP(*qos_bits);
718 		*qos_bits = NULL;
719 		return;
720 	}
721 
722 	tmp_qos_bitstr = bit_alloc(g_qos_count);
723 	tmp_qos = xstrdup(qos);
724 	one_qos_name = strtok_r(tmp_qos, ",", &name_ptr);
725 	while (one_qos_name) {
726 		memset(&qos_rec, 0, sizeof(slurmdb_qos_rec_t));
727 		qos_rec.name = one_qos_name;
728 		rc = assoc_mgr_fill_in_qos(acct_db_conn, &qos_rec,
729 					   accounting_enforce,
730 					   &qos_ptr, 1);
731 		if ((rc != SLURM_SUCCESS) || (qos_rec.id >= g_qos_count)) {
732 			error("Ignoring invalid Allow/DenyQOS value: %s",
733 			      one_qos_name);
734 		} else {
735 			bit_set(tmp_qos_bitstr, qos_rec.id);
736 		}
737 		one_qos_name = strtok_r(NULL, ",", &name_ptr);
738 	}
739 	assoc_mgr_unlock(&locks);
740 	xfree(tmp_qos);
741 	FREE_NULL_BITMAP(*qos_bits);
742 	*qos_bits = tmp_qos_bitstr;
743 }
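/*
 * Illustrative usage sketch (the QOS names and part_ptr are placeholders,
 * mirroring how partition Allow/DenyQOS are handled below):
 *
 *	qos_list_build("normal,high", &part_ptr->allow_qos_bitstr);
 */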
744 
745 /*
746  * _build_single_partitionline_info - get an array of slurm_conf_partition_t
747  *	structures from the slurm.conf reader, build table, and set values
748  * RET 0 if no error, error code otherwise
749  * Note: Operates on common variables
750  * global: part_list - global partition list pointer
751  *	default_part - default parameters for a partition
752  */
753 static int _build_single_partitionline_info(slurm_conf_partition_t *part)
754 {
755 	part_record_t *part_ptr;
756 
757 	if (list_find_first(part_list, &list_find_part, part->name))
758 		fatal("%s: duplicate entry for partition %s",
759 		      __func__, part->name);
760 
761 	part_ptr = create_part_record(part->name);
762 
763 	if (part->default_flag) {
764 		if (default_part_name &&
765 		    xstrcmp(default_part_name, part->name)) {
766 			info("%s: changing default partition from %s to %s",
767 			     __func__, default_part_name, part->name);
768 			default_part_loc->flags &= (~PART_FLAG_DEFAULT);
769 		}
770 		xfree(default_part_name);
771 		default_part_name = xstrdup(part->name);
772 		default_part_loc = part_ptr;
773 		part_ptr->flags |= PART_FLAG_DEFAULT;
774 	}
775 
776 	part_ptr->cpu_bind = part->cpu_bind;
777 
778 	if (part->preempt_mode != NO_VAL16)
779 		part_ptr->preempt_mode = part->preempt_mode;
780 
781 	if (part->disable_root_jobs == NO_VAL16) {
782 		if (slurmctld_conf.conf_flags & CTL_CONF_DRJ)
783 			part_ptr->flags |= PART_FLAG_NO_ROOT;
784 	} else if (part->disable_root_jobs) {
785 		part_ptr->flags |= PART_FLAG_NO_ROOT;
786 	} else {
787 		part_ptr->flags &= (~PART_FLAG_NO_ROOT);
788 	}
789 	if (part_ptr->flags & PART_FLAG_NO_ROOT)
790 		debug2("partition %s does not allow root jobs", part_ptr->name);
791 
792 	if ((part->default_time != NO_VAL) &&
793 	    (part->default_time > part->max_time)) {
794 		info("partition %s DefaultTime exceeds MaxTime (%u > %u)",
795 		     part->name, part->default_time, part->max_time);
796 		part->default_time = NO_VAL;
797 	}
798 
799 	if (part->exclusive_user)
800 		part_ptr->flags |= PART_FLAG_EXCLUSIVE_USER;
801 	if (part->hidden_flag)
802 		part_ptr->flags |= PART_FLAG_HIDDEN;
803 	if (part->root_only_flag)
804 		part_ptr->flags |= PART_FLAG_ROOT_ONLY;
805 	if (part->req_resv_flag)
806 		part_ptr->flags |= PART_FLAG_REQ_RESV;
807 	if (part->lln_flag)
808 		part_ptr->flags |= PART_FLAG_LLN;
809 	part_ptr->max_time       = part->max_time;
810 	part_ptr->def_mem_per_cpu = part->def_mem_per_cpu;
811 	part_ptr->default_time   = part->default_time;
812 	FREE_NULL_LIST(part_ptr->job_defaults_list);
813 	part_ptr->job_defaults_list =
814 		job_defaults_copy(part->job_defaults_list);
815 	part_ptr->max_cpus_per_node = part->max_cpus_per_node;
816 	part_ptr->max_share      = part->max_share;
817 	part_ptr->max_mem_per_cpu = part->max_mem_per_cpu;
818 	part_ptr->max_nodes      = part->max_nodes;
819 	part_ptr->max_nodes_orig = part->max_nodes;
820 	part_ptr->min_nodes      = part->min_nodes;
821 	part_ptr->min_nodes_orig = part->min_nodes;
822 	part_ptr->over_time_limit = part->over_time_limit;
823 	part_ptr->preempt_mode   = part->preempt_mode;
824 	part_ptr->priority_job_factor = part->priority_job_factor;
825 	part_ptr->priority_tier  = part->priority_tier;
826 	part_ptr->qos_char       = xstrdup(part->qos_char);
827 	part_ptr->state_up       = part->state_up;
828 	part_ptr->grace_time     = part->grace_time;
829 	part_ptr->cr_type        = part->cr_type;
830 
831 	part_ptr->allow_alloc_nodes = xstrdup(part->allow_alloc_nodes);
832 	part_ptr->allow_groups = xstrdup(part->allow_groups);
833 	part_ptr->alternate = xstrdup(part->alternate);
834 	part_ptr->nodes = xstrdup(part->nodes);
835 
836 	if (part->billing_weights_str) {
837 		set_partition_billing_weights(part->billing_weights_str,
838 					      part_ptr, true);
839 	}
840 
841 	if (part->allow_accounts) {
842 		part_ptr->allow_accounts = xstrdup(part->allow_accounts);
843 		accounts_list_build(part_ptr->allow_accounts,
844 				    &part_ptr->allow_account_array);
845 	}
846 
847 	if (part->allow_qos) {
848 		part_ptr->allow_qos = xstrdup(part->allow_qos);
849 		qos_list_build(part_ptr->allow_qos,&part_ptr->allow_qos_bitstr);
850 	}
851 
852 	if (part->deny_accounts) {
853 		part_ptr->deny_accounts = xstrdup(part->deny_accounts);
854 		accounts_list_build(part_ptr->deny_accounts,
855 				    &part_ptr->deny_account_array);
856 	}
857 
858 	if (part->deny_qos) {
859 		part_ptr->deny_qos = xstrdup(part->deny_qos);
860 		qos_list_build(part_ptr->deny_qos, &part_ptr->deny_qos_bitstr);
861 	}
862 
863 	if (part->qos_char) {
864 		slurmdb_qos_rec_t qos_rec;
865 		part_ptr->qos_char = xstrdup(part->qos_char);
866 
867 		memset(&qos_rec, 0, sizeof(slurmdb_qos_rec_t));
868 		qos_rec.name = part_ptr->qos_char;
869 		if (assoc_mgr_fill_in_qos(
870 			    acct_db_conn, &qos_rec, accounting_enforce,
871 			    (slurmdb_qos_rec_t **)&part_ptr->qos_ptr, 0)
872 		    != SLURM_SUCCESS) {
873 			fatal("Partition %s has an invalid qos (%s), "
874 			      "please check your configuration",
875 			      part_ptr->name, qos_rec.name);
876 		}
877 	}
878 
879 	return 0;
880 }
881 
882 /*
883  * _build_all_partitionline_info - get an array of slurm_conf_partition_t
884  *	structures from the slurm.conf reader, build table, and set values
885  * RET 0 if no error, error code otherwise
886  * Note: Operates on common variables
887  * global: part_list - global partition list pointer
888  *	default_part - default parameters for a partition
889  */
890 static int _build_all_partitionline_info(void)
891 {
892 	slurm_conf_partition_t **ptr_array;
893 	int count;
894 	int i;
895 
896 	count = slurm_conf_partition_array(&ptr_array);
897 	if (count == 0)
898 		fatal("No PartitionName information available!");
899 
900 	for (i = 0; i < count; i++)
901 		_build_single_partitionline_info(ptr_array[i]);
902 
903 	return SLURM_SUCCESS;
904 }
905 
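/* list_for_each() callback: track the largest priority_job_factor of any partition */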
906 static int _set_max_part_prio(void *x, void *arg)
907 {
908 	part_record_t *part_ptr = (part_record_t *) x;
909 
910 	if (part_ptr->priority_job_factor > part_max_priority)
911 		part_max_priority = part_ptr->priority_job_factor;
912 
913 	return 0;
914 }
915 
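/* list_for_each() callback: normalize a partition's priority against part_max_priority */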
916 static int _reset_part_prio(void *x, void *arg)
917 {
918 	part_record_t *part_ptr = (part_record_t *) x;
919 
920 	/* protect against div0 if all partition priorities are zero */
921 	if (part_max_priority == 0) {
922 		part_ptr->norm_priority = 0;
923 		return 0;
924 	}
925 
926 	part_ptr->norm_priority = (double)part_ptr->priority_job_factor /
927 				  (double)part_max_priority;
928 
929 	return 0;
930 }
931 
932 /* _sync_part_prio - Set normalized partition priorities */
933 static void _sync_part_prio(void)
934 {
935 	/* reset global value from part list */
936 	part_max_priority = DEF_PART_MAX_PRIORITY;
937 	list_for_each(part_list, _set_max_part_prio, NULL);
938 	/* renormalize values after finding new max */
939 	list_for_each(part_list, _reset_part_prio, NULL);
940 }
941 
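/*
 * Abort a job: mark it completed with the given state and reason, log the
 * completion, and run the slurmctld epilog when appropriate
 */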
942 static void _abort_job(job_record_t *job_ptr, uint32_t job_state,
943 		       uint16_t state_reason, char *reason_string)
944 {
945 	time_t now = time(NULL);
946 
947 	job_ptr->job_state = job_state | JOB_COMPLETING;
948 	build_cg_bitmap(job_ptr);
949 	job_ptr->end_time = MIN(job_ptr->end_time, now);
950 	job_ptr->state_reason = state_reason;
951 	xfree(job_ptr->state_desc);
952 	job_ptr->state_desc = xstrdup(reason_string);
953 	job_completion_logger(job_ptr, false);
954 	if (job_ptr->job_state == JOB_NODE_FAIL) {
955 		/* build_cg_bitmap() may clear JOB_COMPLETING */
956 		epilog_slurmctld(job_ptr);
957 	}
958 }
959 
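/*
 * The next three list_for_each() callbacks support _validate_het_jobs():
 * they clear, set, and test the HET_JOB_FLAG used to detect hetjob
 * components that were never linked to a valid hetjob leader
 */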
960 static int _mark_het_job_unused(void *x, void *arg)
961 {
962 	job_record_t *job_ptr = (job_record_t *) x;
963 	job_ptr->bit_flags &= (~HET_JOB_FLAG);
964 	return 0;
965 }
966 
967 static int _mark_het_job_used(void *x, void *arg)
968 {
969 	job_record_t *job_ptr = (job_record_t *) x;
970 	job_ptr->bit_flags |= HET_JOB_FLAG;
971 	return 0;
972 }
973 
974 static int _test_het_job_used(void *x, void *arg)
975 {
976 	job_record_t *job_ptr = (job_record_t *) x;
977 
978 	if ((job_ptr->het_job_id == 0) || IS_JOB_FINISHED(job_ptr))
979 		return 0;
980 	if (job_ptr->bit_flags & HET_JOB_FLAG)
981 		return 0;
982 
983 	error("Incomplete hetjob being aborted %pJ", job_ptr);
984 	_abort_job(job_ptr, JOB_FAILED, FAIL_SYSTEM, "incomplete hetjob");
985 
986 	return 0;
987 }
988 
989 /*
990  * Validate heterogeneous jobs
991  *
992  * Make sure that every active (not yet complete) job has all of its components
993  * and they are all in the same state. Also rebuild het_job_list.
994  * If hetjob is corrupted, aborts and removes it from job_list.
995  */
996 static void _validate_het_jobs(void)
997 {
998 	ListIterator job_iterator;
999 	job_record_t *job_ptr, *het_job_ptr;
1000 	hostset_t hs;
1001 	char *job_id_str;
1002 	uint32_t job_id;
1003 	bool het_job_valid;
1004 
1005 	list_for_each(job_list, _mark_het_job_unused, NULL);
1006 
1007 	job_iterator = list_iterator_create(job_list);
1008 	while ((job_ptr = list_next(job_iterator))) {
1009 		/* Checking for corrupted hetjob components */
1010 		if (job_ptr->het_job_offset != 0) {
1011 			het_job_ptr = find_job_record(job_ptr->het_job_id);
1012 			if (!het_job_ptr) {
1013 				error("Could not find hetjob leader (JobId=%u) of %pJ. Aborting and removing job as it is corrupted.",
1014 				      job_ptr->het_job_id, job_ptr);
1015 				_abort_job(job_ptr, JOB_FAILED, FAIL_SYSTEM,
1016 					   "invalid het_job_id_set");
1017 				if (list_delete_item(job_iterator) != 1)
1018 					error("Not able to remove the job.");
1019 				continue;
1020 			}
1021 		}
1022 
1023 		if ((job_ptr->het_job_id == 0) ||
1024 		    (job_ptr->het_job_offset != 0))
1025 			continue;
1026 		/* active het job leader found */
1027 		FREE_NULL_LIST(job_ptr->het_job_list);
1028 		job_id_str = NULL;
1029 		/* Need to wrap numbers with brackets for hostset functions */
1030 		xstrfmtcat(job_id_str, "[%s]", job_ptr->het_job_id_set);
1031 		hs = hostset_create(job_id_str);
1032 		xfree(job_id_str);
1033 		if (!hs) {
1034 			error("%pJ has invalid het_job_id_set(%s). Aborting and removing job as it is corrupted.",
1035 			      job_ptr, job_ptr->het_job_id_set);
1036 			_abort_job(job_ptr, JOB_FAILED, FAIL_SYSTEM,
1037 				   "invalid het_job_id_set");
1038 			if (list_delete_item(job_iterator) != 1)
1039 				error("Not able to remove the job.");
1040 			continue;
1041 		}
1042 		job_ptr->het_job_list = list_create(NULL);
1043 		het_job_valid = true;	/* assume valid for now */
1044 		while (het_job_valid && (job_id_str = hostset_shift(hs))) {
1045 			job_id = (uint32_t) strtoll(job_id_str, NULL, 10);
1046 			het_job_ptr = find_job_record(job_id);
1047 			if (!het_job_ptr) {
1048 				error("Could not find JobId=%u, part of hetjob JobId=%u",
1049 				      job_id, job_ptr->job_id);
1050 				het_job_valid = false;
1051 			} else if (het_job_ptr->het_job_id !=
1052 				   job_ptr->job_id) {
1053 				error("Invalid state of JobId=%u, part of hetjob JobId=%u",
1054 				      job_id, job_ptr->job_id);
1055 				het_job_valid = false;
1056 			} else {
1057 				list_append(job_ptr->het_job_list,
1058 					    het_job_ptr);
1059 			}
1060 			free(job_id_str);
1061 		}
1062 		hostset_destroy(hs);
1063 		if (het_job_valid) {
1064 			list_for_each(job_ptr->het_job_list, _mark_het_job_used,
1065 				      NULL);
1066 		}
1067 	}
1068 	list_iterator_destroy(job_iterator);
1069 
1070 	list_for_each(job_list, _test_het_job_used, NULL);
1071 }
1072 
1073 /* Log an error if SlurmdUser is not root and any cgroup plugin is used */
1074 static void _test_cgroup_plugin_use(void)
1075 {
1076 	char *plugins;
1077 
1078 	plugins = slurm_get_task_plugin();
1079 	if (xstrstr(plugins, "cgroup"))
1080 		error("task/cgroup plugin will not work unless SlurmdUser is root");
1081 	xfree(plugins);
1082 
1083 	plugins = slurm_get_proctrack_type();
1084 	if (xstrstr(plugins, "cgroup"))
1085 		error("proctrack/cgroup plugin will not work unless SlurmdUser is root");
1086 	xfree(plugins);
1087 }
1088 
1089 /*
1090  * read_slurm_conf - load the slurm configuration from the configured file.
1091  * read_slurm_conf can be called more than once if so desired.
1092  * IN recover - replace job, node and/or partition data with latest
1093  *              available information depending upon value
1094  *              0 = use no saved state information, rebuild everything from
1095  *		    slurm.conf contents
1096  *              1 = recover saved job and trigger state,
1097  *                  node DOWN/DRAIN/FAIL state and reason information
1098  *              2 = recover all saved state
1099  * IN reconfig - true if SIGHUP or "scontrol reconfig" and there is state in
1100  *		 memory to preserve, otherwise recover state from disk
1101  * RET SLURM_SUCCESS if no error, otherwise an error code
1102  * Note: Operates on common variables only
1103  */
1104 int read_slurm_conf(int recover, bool reconfig)
1105 {
1106 	DEF_TIMERS;
1107 	int error_code, i, rc = 0, load_job_ret = SLURM_SUCCESS;
1108 	int old_node_record_count = 0;
1109 	node_record_t *old_node_table_ptr = NULL, *node_ptr;
1110 	bool do_reorder_nodes = false;
1111 	List old_part_list = NULL;
1112 	char *old_def_part_name = NULL;
1113 	char *old_auth_type       = xstrdup(slurmctld_conf.authtype);
1114 	char *old_bb_type         = xstrdup(slurmctld_conf.bb_type);
1115 	char *old_cred_type       = xstrdup(slurmctld_conf.cred_type);
1116 	uint16_t old_preempt_mode = slurmctld_conf.preempt_mode;
1117 	char *old_preempt_type    = xstrdup(slurmctld_conf.preempt_type);
1118 	char *old_sched_type      = xstrdup(slurmctld_conf.schedtype);
1119 	char *old_select_type     = xstrdup(slurmctld_conf.select_type);
1120 	char *old_switch_type     = xstrdup(slurmctld_conf.switch_type);
1121 	char *state_save_dir      = xstrdup(slurmctld_conf.state_save_location);
1122 	char *mpi_params;
1123 	uint16_t old_select_type_p = slurmctld_conf.select_type_param;
1124 	bool cgroup_mem_confinement = false;
1125 
1126 	/* initialization */
1127 	START_TIMER;
1128 
1129 	xfree(slurmctld_config.auth_info);
1130 	slurmctld_config.auth_info = slurm_get_auth_info();
1131 	if (reconfig) {
1132 		/*
1133 		 * In order to re-use job state information,
1134 		 * update nodes_completing string (based on node bitmaps)
1135 		 */
1136 		update_job_nodes_completing();
1137 
1138 		/* save node and partition states for reconfig RPC */
1139 		old_node_record_count = node_record_count;
1140 		old_node_table_ptr    = node_record_table_ptr;
1141 
1142 		for (i = 0, node_ptr = old_node_table_ptr;
1143 		     i < node_record_count; i++, node_ptr++) {
1144 			/*
1145 			 * Store the original configured CPU count somewhere
1146 			 * (port is reused here for that purpose) so we can
1147 			 * report changes in its configuration.
1148 			 */
1149 			node_ptr->port   = node_ptr->config_ptr->cpus;
1150 			node_ptr->weight = node_ptr->config_ptr->weight;
1151 		}
1152 		node_record_table_ptr = NULL;
1153 		node_record_count = 0;
1154 		xhash_free(node_hash_table);
1155 		old_part_list = part_list;
1156 		part_list = NULL;
1157 		old_def_part_name = default_part_name;
1158 		default_part_name = NULL;
1159 	}
1160 
1161 	if ((error_code = _init_all_slurm_conf())) {
1162 		node_record_table_ptr = old_node_table_ptr;
1163 		node_record_count = old_node_record_count;
1164 		part_list = old_part_list;
1165 		default_part_name = old_def_part_name;
1166 		old_def_part_name = NULL;
1167 		goto end_it;
1168 	}
1169 
1170 	if (reconfig)
1171 		xcgroup_reconfig_slurm_cgroup_conf();
1172 
1173 	cgroup_mem_confinement = xcgroup_mem_cgroup_job_confinement();
1174 
1175 	if (slurmctld_conf.job_acct_oom_kill && cgroup_mem_confinement)
1176 		fatal("Job memory is being constrained by both the TaskPlugin cgroup and the JobAcctGather plugin. These two memory enforcement mechanisms are incompatible; one of them must be disabled.");
1177 	else if (slurmctld_conf.job_acct_oom_kill)
1178 		info("Enforcing memory limits through the JobAcctGather mechanism is discouraged; task/cgroup is recommended where available.");
1179 	else if (!cgroup_mem_confinement)
1180 		info("No memory enforcing mechanism configured.");
1181 
1182 	if (slurm_get_slurmd_user_id() != 0)
1183 		_test_cgroup_plugin_use();
1184 
1185 	if (layouts_init() != SLURM_SUCCESS) {
1186 		if (test_config) {
1187 			error("Failed to initialize the layouts framework");
1188 			test_config_rc = 1;
1189 		} else {
1190 			fatal("Failed to initialize the layouts framework");
1191 		}
1192 	}
1193 
1194 	if (slurm_topo_init() != SLURM_SUCCESS) {
1195 		if (test_config) {
1196 			error("Failed to initialize topology plugin");
1197 			test_config_rc = 1;
1198 		} else {
1199 			fatal("Failed to initialize topology plugin");
1200 		}
1201 	}
1202 
1203 	/* Build node and partition information based upon slurm.conf file */
1204 	_build_all_nodeline_info();
1205 	if (reconfig) {
1206 		if (_compare_hostnames(old_node_table_ptr,
1207 				       old_node_record_count,
1208 				       node_record_table_ptr,
1209 				       node_record_count) < 0) {
1210 			fatal("%s: hostnames inconsistency detected", __func__);
1211 		}
1212 	}
1213 	_handle_all_downnodes();
1214 	_build_all_partitionline_info();
1215 	if (!reconfig) {
1216 		restore_front_end_state(recover);
1217 
1218 		/* currently load/dump_state_lite has to run before
1219 		 * load_all_job_state. */
1220 
1221 		/* load old config */
1222 		load_config_state_lite();
1223 
1224 		/* store new config */
1225 		if (!test_config)
1226 			dump_config_state_lite();
1227 	}
1228 	update_logging();
1229 	g_slurm_jobcomp_init(slurmctld_conf.job_comp_loc);
1230 	if (slurm_sched_init() != SLURM_SUCCESS) {
1231 		if (test_config) {
1232 			error("Failed to initialize sched plugin");
1233 			test_config_rc = 1;
1234 		} else {
1235 			fatal("Failed to initialize sched plugin");
1236 		}
1237 	}
1238 	if (!reconfig && (old_preempt_mode & PREEMPT_MODE_GANG)) {
1239 		/* gs_init() must immediately follow slurm_sched_init() */
1240 		gs_init();
1241 	}
1242 	if (switch_init(1) != SLURM_SUCCESS) {
1243 		if (test_config) {
1244 			error("Failed to initialize switch plugin");
1245 			test_config_rc = 1;
1246 		} else {
1247 			fatal("Failed to initialize switch plugin");
1248 		}
1249 	}
1250 
1251 	if (default_part_loc == NULL)
1252 		error("read_slurm_conf: default partition not set.");
1253 
1254 	if (node_record_count < 1) {
1255 		error("read_slurm_conf: no nodes configured.");
1256 		test_config_rc = 1;
1257 		_purge_old_node_state(old_node_table_ptr,
1258 				      old_node_record_count);
1259 		_purge_old_part_state(old_part_list, old_def_part_name);
1260 		error_code = EINVAL;
1261 		goto end_it;
1262 	}
1263 
1264 	/*
1265 	 * Node reordering needs to be done by the topology and/or select
1266 	 * plugin. Reordering the table must be done before hashing the
1267 	 * nodes, and before any position-relative bitmaps are created.
1268 	 */
1269 	do_reorder_nodes |= slurm_topo_generate_node_ranking();
1270 	do_reorder_nodes |= select_g_node_ranking(node_record_table_ptr,
1271 						  node_record_count);
1272 	if (do_reorder_nodes)
1273 		_reorder_nodes_by_rank();
1274 	else
1275 		_reorder_nodes_by_name();
1276 
1277 	rehash_node();
1278 	slurm_topo_build_config();
1279 	route_g_reconfigure();
1280 	if (reconfig)
1281 		power_g_reconfig();
1282 	cpu_freq_reconfig();
1283 
1284 	rehash_jobs();
1285 	_set_slurmd_addr();
1286 
1287 	_stat_slurm_dirs();
1288 
1289 	/*
1290 	 * Load the layouts configuration.
1291 	 * Only load it at init time, not during reconfiguration stages.
1292 	 * It requires a full restart to switch to a new configuration for now.
1293 	 */
1294 	if (!reconfig && (layouts_load_config(recover) != SLURM_SUCCESS)) {
1295 		if (test_config) {
1296 			error("Failed to load the layouts framework configuration");
1297 			test_config_rc = 1;
1298 		} else {
1299 			fatal("Failed to load the layouts framework configuration");
1300 		}
1301 	}
1302 
1303 	/*
1304 	 * Set standard features and preserve the plugin controlled ones.
1305  * A reconfig always implies loading the state from slurm.conf
1306 	 */
1307 	if (reconfig) {		/* Preserve state from memory */
1308 		if (old_node_table_ptr) {
1309 			info("restoring original state of nodes");
1310 			_set_features(old_node_table_ptr, old_node_record_count,
1311 				      recover);
1312 			rc = _restore_node_state(recover, old_node_table_ptr,
1313 						 old_node_record_count);
1314 			error_code = MAX(error_code, rc);  /* not fatal */
1315 		}
1316 		if (old_part_list && ((recover > 1) ||
1317 		    (slurmctld_conf.reconfig_flags & RECONFIG_KEEP_PART_INFO))) {
1318 			info("restoring original partition state");
1319 			rc = _restore_part_state(old_part_list,
1320 						 old_def_part_name,
1321 						 slurmctld_conf.reconfig_flags);
1322 			error_code = MAX(error_code, rc);  /* not fatal */
1323 		} else if (old_part_list && (slurmctld_conf.reconfig_flags &
1324 					     RECONFIG_KEEP_PART_STAT)) {
1325 			info("restoring original partition state only (up/down)");
1326 			rc = _restore_part_state(old_part_list,
1327 						 old_def_part_name,
1328 						 slurmctld_conf.reconfig_flags);
1329 			error_code = MAX(error_code, rc);  /* not fatal */
1330 		}
1331 		load_last_job_id();
1332 		reset_first_job_id();
1333 		(void) slurm_sched_g_reconfig();
1334 	} else if (recover == 0) {	/* Build everything from slurm.conf */
1335 		_set_features(node_record_table_ptr, node_record_count,
1336 			      recover);
1337 		load_last_job_id();
1338 		reset_first_job_id();
1339 		(void) slurm_sched_g_reconfig();
1340 	} else if (recover == 1) {	/* Load job & node state files */
1341 		(void) load_all_node_state(true);
1342 		_set_features(node_record_table_ptr, node_record_count,
1343 			      recover);
1344 		(void) load_all_front_end_state(true);
1345 		load_job_ret = load_all_job_state();
1346 		sync_job_priorities();
1347 	} else if (recover > 1) {	/* Load node, part & job state files */
1348 		(void) load_all_node_state(false);
1349 		_set_features(old_node_table_ptr, old_node_record_count,
1350 			      recover);
1351 		(void) load_all_front_end_state(false);
1352 		(void) load_all_part_state();
1353 		load_job_ret = load_all_job_state();
1354 		sync_job_priorities();
1355 	}
1356 
1357 	_sync_part_prio();
1358 	_build_bitmaps_pre_select();
1359 	if ((select_g_node_init(node_record_table_ptr, node_record_count)
1360 	     != SLURM_SUCCESS)						||
1361 	    (select_g_state_restore(state_save_dir) != SLURM_SUCCESS)	||
1362 	    (select_g_job_init(job_list) != SLURM_SUCCESS)) {
1363 		if (test_config) {
1364 			error("Failed to initialize node selection plugin state");
1365 			test_config_rc = 1;
1366 		} else {
1367 			fatal("Failed to initialize node selection plugin state, "
1368 			      "Clean start required.");
1369 		}
1370 	}
1371 
1372 	_gres_reconfig(reconfig);
1373 	reset_job_bitmaps();		/* must follow select_g_job_init() */
1374 
1375 	(void) _sync_nodes_to_jobs(reconfig);
1376 	(void) sync_job_files();
1377 	_purge_old_node_state(old_node_table_ptr, old_node_record_count);
1378 	_purge_old_part_state(old_part_list, old_def_part_name);
1379 
1380 	mpi_params = slurm_get_mpi_params();
1381 	reserve_port_config(mpi_params);
1382 	xfree(mpi_params);
1383 
1384 	if (license_update(slurmctld_conf.licenses) != SLURM_SUCCESS) {
1385 		if (test_config) {
1386 			error("Invalid Licenses value: %s",
1387 			      slurmctld_conf.licenses);
1388 			test_config_rc = 1;
1389 		} else {
1390 			fatal("Invalid Licenses value: %s",
1391 			      slurmctld_conf.licenses);
1392 		}
1393 	}
1394 
1395 	init_requeue_policy();
1396 	init_depend_policy();
1397 
1398 	/* NOTE: Run restore_node_features before _restore_job_accounting */
1399 	restore_node_features(recover);
1400 
1401 	if ((node_features_g_count() > 0) &&
1402 	    (node_features_g_get_node(NULL) != SLURM_SUCCESS)) {
1403 		error("failed to initialize node features");
1404 		test_config_rc = 1;
1405 	}
1406 
1407 	/*
1408 	 * _build_bitmaps() must follow node_features_g_get_node() and
1409 	 * precede build_feature_list_*()
1410 	 */
1411 	_build_bitmaps();
1412 
1413 	/* Active and available features can be different on -R */
1414 	if ((node_features_g_count() == 0) && (recover != 2))
1415 		build_feature_list_eq();
1416 	else
1417 		build_feature_list_ne();
1418 
1419 	/*
1420 	 * Must be called after nodes and partitions (e.g.
1421 	 * _build_bitmaps_pre_select()) have been created and before
1422 	 * _sync_nodes_to_comp_job().
1423 	 */
1424 	if (!test_config)
1425 		set_cluster_tres(false);
1426 
1427 	_validate_het_jobs();
1428 	(void) _sync_nodes_to_comp_job();/* must follow select_g_node_init() */
1429 	load_part_uid_allow_list(1);
1430 
1431 	/* NOTE: Run load_all_resv_state() before _restore_job_accounting */
1432 	if (reconfig) {
1433 		load_all_resv_state(0);
1434 	} else {
1435 		load_all_resv_state(recover);
1436 		if (recover >= 1) {
1437 			trigger_state_restore();
1438 			(void) slurm_sched_g_reconfig();
1439 		}
1440 	}
1441 	if (test_config)
1442 		goto end_it;
1443 
1444 	_restore_job_accounting();
1445 
1446 	/* sort config_list by weight for scheduling */
1447 	list_sort(config_list, &list_compare_config);
1448 
1449 	/* Update plugins as possible */
1450 	if (xstrcmp(old_auth_type, slurmctld_conf.authtype)) {
1451 		xfree(slurmctld_conf.authtype);
1452 		slurmctld_conf.authtype = old_auth_type;
1453 		rc =  ESLURM_INVALID_AUTHTYPE_CHANGE;
1454 	}
1455 
1456 	if (xstrcmp(old_bb_type, slurmctld_conf.bb_type)) {
1457 		xfree(slurmctld_conf.bb_type);
1458 		slurmctld_conf.bb_type = old_bb_type;
1459 		old_bb_type = NULL;
1460 		rc =  ESLURM_INVALID_BURST_BUFFER_CHANGE;
1461 	}
1462 
1463 	if (xstrcmp(old_cred_type, slurmctld_conf.cred_type)) {
1464 		xfree(slurmctld_conf.cred_type);
1465 		slurmctld_conf.cred_type = old_cred_type;
1466 		old_cred_type = NULL;
1467 		rc = ESLURM_INVALID_CRED_TYPE_CHANGE;
1468 	}
1469 
1470 	if (xstrcmp(old_sched_type, slurmctld_conf.schedtype)) {
1471 		xfree(slurmctld_conf.schedtype);
1472 		slurmctld_conf.schedtype = old_sched_type;
1473 		old_sched_type = NULL;
1474 		rc =  ESLURM_INVALID_SCHEDTYPE_CHANGE;
1475 	}
1476 
1477 	if (xstrcmp(old_select_type, slurmctld_conf.select_type)) {
1478 		xfree(slurmctld_conf.select_type);
1479 		slurmctld_conf.select_type = old_select_type;
1480 		old_select_type = NULL;
1481 		rc =  ESLURM_INVALID_SELECTTYPE_CHANGE;
1482 	}
1483 
1484 	if (xstrcmp(old_switch_type, slurmctld_conf.switch_type)) {
1485 		xfree(slurmctld_conf.switch_type);
1486 		slurmctld_conf.switch_type = old_switch_type;
1487 		old_switch_type = NULL;
1488 		rc = ESLURM_INVALID_SWITCHTYPE_CHANGE;
1489 	}
1490 
1491 	if ((slurmctld_conf.control_cnt < 2) ||
1492 	    (slurmctld_conf.control_machine[1] == NULL))
1493 		info("%s: backup_controller not specified", __func__);
1494 
1495 	error_code = MAX(error_code, rc);	/* not fatal */
1496 
1497 	if (xstrcmp(old_preempt_type, slurmctld_conf.preempt_type)) {
1498 		info("Changing PreemptType from %s to %s",
1499 		     old_preempt_type, slurmctld_conf.preempt_type);
1500 		(void) slurm_preempt_fini();
1501 		if (slurm_preempt_init() != SLURM_SUCCESS) {
1502 			if (test_config) {
1503 				error("failed to initialize preempt plugin");
1504 				test_config_rc = 1;
1505 			} else {
1506 				fatal("failed to initialize preempt plugin");
1507 			}
1508 		}
1509 	}
1510 	rc = _update_preempt(old_preempt_mode);
1511 	error_code = MAX(error_code, rc);	/* not fatal */
1512 
1513 	/* Update plugin parameters as possible */
1514 	rc = job_submit_plugin_reconfig();
1515 	error_code = MAX(error_code, rc);	/* not fatal */
1516 	rc = prep_plugin_reconfig();
1517 	error_code = MAX(error_code, rc);	/* not fatal */
1518 	rc = switch_g_reconfig();
1519 	error_code = MAX(error_code, rc);	/* not fatal */
1520 	if (reconfig) {
1521 		rc = node_features_g_reconfig();
1522 		error_code = MAX(error_code, rc); /* not fatal */
1523 	}
1524 	rc = _preserve_select_type_param(&slurmctld_conf, old_select_type_p);
1525 	error_code = MAX(error_code, rc);	/* not fatal */
1526 	if (reconfig)
1527 		rc =  bb_g_reconfig();
1528 	else
1529 		rc = bb_g_load_state(true);
1530 	error_code = MAX(error_code, rc);	/* not fatal */
1531 
1532 	/*
1533 	 * Restore job accounting info if file missing or corrupted,
1534 	 * an extremely rare situation
1535 	 */
1536 	if (load_job_ret)
1537 		_acct_restore_active_jobs();
1538 
1539 	/* Sync select plugin with synchronized job/node/part data */
1540 	gres_plugin_reconfig();		/* Clear gres/mps counters */
1541 	select_g_reconfigure();
1542 	if (reconfig && (slurm_mcs_reconfig() != SLURM_SUCCESS))
1543 		fatal("Failed to reconfigure mcs plugin");
1544 
1545 	_set_response_cluster_rec();
1546 
1547 	slurmctld_conf.last_update = time(NULL);
1548 end_it:
1549 	xfree(old_auth_type);
1550 	xfree(old_bb_type);
1551 	xfree(old_cred_type);
1552 	xfree(old_preempt_type);
1553 	xfree(old_sched_type);
1554 	xfree(old_select_type);
1555 	xfree(old_switch_type);
1556 	xfree(state_save_dir);
1557 
1558 	END_TIMER2("read_slurm_conf");
1559 	return error_code;
1560 
1561 }
1562 
1563 /* Add feature to list
1564  * feature_list IN - destination list, either active_feature_list or
1565  *	avail_feature_list
1566  * feature IN - name of the feature to add
1567  * node_bitmap IN - bitmap of nodes with named feature */
1568 static void _add_config_feature(List feature_list, char *feature,
1569 				bitstr_t *node_bitmap)
1570 {
1571 	node_feature_t *feature_ptr;
1572 	ListIterator feature_iter;
1573 	bool match = false;
1574 
1575 	/* If feature already in feature_list, just update the bitmap */
1576 	feature_iter = list_iterator_create(feature_list);
1577 	while ((feature_ptr = list_next(feature_iter))) {
1578 		if (xstrcmp(feature, feature_ptr->name))
1579 			continue;
1580 		bit_or(feature_ptr->node_bitmap, node_bitmap);
1581 		match = true;
1582 		break;
1583 	}
1584 	list_iterator_destroy(feature_iter);
1585 
1586 	if (!match) {	/* Need to create new feature_list record */
1587 		feature_ptr = xmalloc(sizeof(node_feature_t));
1588 		feature_ptr->magic = FEATURE_MAGIC;
1589 		feature_ptr->name = xstrdup(feature);
1590 		feature_ptr->node_bitmap = bit_copy(node_bitmap);
1591 		list_append(feature_list, feature_ptr);
1592 	}
1593 }
1594 
1595 /* Add feature to list
1596  * feature_list IN - destination list, either active_feature_list or
1597  *	avail_feature_list
1598  * feature IN - name of the feature to add
1599  * node_inx IN - index of the node with named feature */
1600 static void _add_config_feature_inx(List feature_list, char *feature,
1601 				    int node_inx)
1602 {
1603 	node_feature_t *feature_ptr;
1604 	ListIterator feature_iter;
1605 	bool match = false;
1606 
1607 	/* If the feature is already in feature_list, just update the bitmap */
1608 	feature_iter = list_iterator_create(feature_list);
1609 	while ((feature_ptr = list_next(feature_iter))) {
1610 		if (xstrcmp(feature, feature_ptr->name))
1611 			continue;
1612 		bit_set(feature_ptr->node_bitmap, node_inx);
1613 		match = true;
1614 		break;
1615 	}
1616 	list_iterator_destroy(feature_iter);
1617 
1618 	if (!match) {	/* Need to create a new feature_list record */
1619 		feature_ptr = xmalloc(sizeof(node_feature_t));
1620 		feature_ptr->magic = FEATURE_MAGIC;
1621 		feature_ptr->name = xstrdup(feature);
1622 		feature_ptr->node_bitmap = bit_alloc(node_record_count);
1623 		bit_set(feature_ptr->node_bitmap, node_inx);
1624 		list_append(feature_list, feature_ptr);
1625 	}
1626 }
1627 
1628 /* _list_delete_feature - delete an entry from the feature list,
1629  *	see list.h for documentation */
1630 static void _list_delete_feature(void *feature_entry)
1631 {
1632 	node_feature_t *feature_ptr = (node_feature_t *) feature_entry;
1633 
1634 	xassert(feature_ptr);
1635 	xassert(feature_ptr->magic == FEATURE_MAGIC);
1636 	xfree (feature_ptr->name);
1637 	FREE_NULL_BITMAP (feature_ptr->node_bitmap);
1638 	xfree (feature_ptr);
1639 }
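/*
 * Illustrative sketch (not compiled): _list_delete_feature() is the
 * destructor callback handed to list_create() below, so FREE_NULL_LIST()
 * on a feature list releases every node_feature_t (name, bitmap and the
 * struct itself) without any per-entry cleanup by the caller.  The feature
 * name used here is hypothetical.
 */
#if 0
	List fl = list_create(_list_delete_feature);
	node_feature_t *f = xmalloc(sizeof(node_feature_t));
	f->magic = FEATURE_MAGIC;
	f->name = xstrdup("knl");			/* hypothetical feature */
	f->node_bitmap = bit_alloc(node_record_count);
	list_append(fl, f);
	FREE_NULL_LIST(fl);	/* invokes _list_delete_feature() on "f" */
#endif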
1640 
1641 /*
1642  * For a configuration where available_features == active_features,
1643  * build new active and available feature lists
1644  */
1645 extern void build_feature_list_eq(void)
1646 {
1647 	ListIterator config_iterator;
1648 	config_record_t *config_ptr;
1649 	node_feature_t *active_feature_ptr, *avail_feature_ptr;
1650 	ListIterator feature_iter;
1651 	char *tmp_str, *token, *last = NULL;
1652 
1653 	FREE_NULL_LIST(active_feature_list);
1654 	FREE_NULL_LIST(avail_feature_list);
1655 	active_feature_list = list_create(_list_delete_feature);
1656 	avail_feature_list = list_create(_list_delete_feature);
1657 
1658 	config_iterator = list_iterator_create(config_list);
1659 	while ((config_ptr = list_next(config_iterator))) {
1660 		if (config_ptr->feature) {
1661 			tmp_str = xstrdup(config_ptr->feature);
1662 			token = strtok_r(tmp_str, ",", &last);
1663 			while (token) {
1664 				_add_config_feature(avail_feature_list, token,
1665 						    config_ptr->node_bitmap);
1666 				token = strtok_r(NULL, ",", &last);
1667 			}
1668 			xfree(tmp_str);
1669 		}
1670 	}
1671 	list_iterator_destroy(config_iterator);
1672 
1673 	/* Copy avail_feature_list to active_feature_list */
1674 	feature_iter = list_iterator_create(avail_feature_list);
1675 	while ((avail_feature_ptr = list_next(feature_iter))) {
1676 		active_feature_ptr = xmalloc(sizeof(node_feature_t));
1677 		active_feature_ptr->magic = FEATURE_MAGIC;
1678 		active_feature_ptr->name = xstrdup(avail_feature_ptr->name);
1679 		active_feature_ptr->node_bitmap =
1680 			bit_copy(avail_feature_ptr->node_bitmap);
1681 		list_append(active_feature_list, active_feature_ptr);
1682 	}
1683 	list_iterator_destroy(feature_iter);
1684 }
1685 
1686 /*
1687  * Log contents of avail_feature_list and active_feature_list
1688  */
1689 extern void log_feature_lists(void)
1690 {
1691 	node_feature_t *feature_ptr;
1692 	char *node_str;
1693 	ListIterator feature_iter;
1694 
1695 	feature_iter = list_iterator_create(avail_feature_list);
1696 	while ((feature_ptr = list_next(feature_iter))) {
1697 		node_str = bitmap2node_name(feature_ptr->node_bitmap);
1698 		info("AVAIL FEATURE:%s NODES:%s", feature_ptr->name, node_str);
1699 		xfree(node_str);
1700 	}
1701 	list_iterator_destroy(feature_iter);
1702 
1703 	feature_iter = list_iterator_create(active_feature_list);
1704 	while ((feature_ptr = list_next(feature_iter))) {
1705 		node_str = bitmap2node_name(feature_ptr->node_bitmap);
1706 		info("ACTIVE FEATURE:%s NODES:%s", feature_ptr->name, node_str);
1707 		xfree(node_str);
1708 	}
1709 	list_iterator_destroy(feature_iter);
1710 }
1711 
1712 /*
1713  * For a configuration where available_features != active_features,
1714  * build new active and available feature lists
1715  */
1716 extern void build_feature_list_ne(void)
1717 {
1718 	node_record_t *node_ptr;
1719 	char *tmp_str, *token, *last = NULL;
1720 	int i;
1721 
1722 	FREE_NULL_LIST(active_feature_list);
1723 	FREE_NULL_LIST(avail_feature_list);
1724 	active_feature_list = list_create(_list_delete_feature);
1725 	avail_feature_list = list_create(_list_delete_feature);
1726 
1727 	for (i = 0, node_ptr = node_record_table_ptr; i < node_record_count;
1728 	     i++, node_ptr++) {
1729 		if (node_ptr->features_act) {
1730 			tmp_str = xstrdup(node_ptr->features_act);
1731 			token = strtok_r(tmp_str, ",", &last);
1732 			while (token) {
1733 				_add_config_feature_inx(active_feature_list,
1734 							token, i);
1735 				token = strtok_r(NULL, ",", &last);
1736 			}
1737 			xfree(tmp_str);
1738 		}
1739 		if (node_ptr->features) {
1740 			tmp_str = xstrdup(node_ptr->features);
1741 			token = strtok_r(tmp_str, ",", &last);
1742 			while (token) {
1743 				_add_config_feature_inx(avail_feature_list,
1744 							token, i);
1745 				if (!node_ptr->features_act) {
1746 					_add_config_feature_inx(
1747 							active_feature_list,
1748 							token, i);
1749 				}
1750 				token = strtok_r(NULL, ",", &last);
1751 			}
1752 			xfree(tmp_str);
1753 		}
1754 	}
1755 }
1756 
1757 /*
1758  * Update active_feature_list or avail_feature_list
1759  * feature_list IN - List to update: active_feature_list or avail_feature_list
1760  * new_features IN - New active_features
1761  * node_bitmap IN - Nodes with the new active_features value
1762  */
1763 extern void update_feature_list(List feature_list, char *new_features,
1764 				bitstr_t *node_bitmap)
1765 {
1766 	node_feature_t *feature_ptr;
1767 	ListIterator feature_iter;
1768 	char *tmp_str, *token, *last = NULL;
1769 
1770 	/*
1771 	 * Clear these nodes from the feature_list record,
1772 	 * then restore as needed
1773 	 */
1774 	feature_iter = list_iterator_create(feature_list);
1775 	while ((feature_ptr = list_next(feature_iter))) {
1776 		bit_and_not(feature_ptr->node_bitmap, node_bitmap);
1777 	}
1778 	list_iterator_destroy(feature_iter);
1779 
1780 	if (new_features) {
1781 		tmp_str = xstrdup(new_features);
1782 		token = strtok_r(tmp_str, ",", &last);
1783 		while (token) {
1784 			_add_config_feature(feature_list, token, node_bitmap);
1785 			token = strtok_r(NULL, ",", &last);
1786 		}
1787 		xfree(tmp_str);
1788 	}
1789 	node_features_updated = true;
1790 }
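/*
 * Illustrative usage sketch (not compiled), assuming a caller that has just
 * changed the active features on a set of nodes: the node bitmap is first
 * cleared from every existing record and the new comma-separated names are
 * then re-added, exactly as implemented above.  "changed_node_bitmap" and
 * the feature strings are hypothetical.
 */
#if 0
	update_feature_list(active_feature_list, "knl,cache",
			    changed_node_bitmap);
	update_feature_list(avail_feature_list, "knl,cache,flat",
			    changed_node_bitmap);
#endif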
1791 
1792 static void _gres_reconfig(bool reconfig)
1793 {
1794 	node_record_t *node_ptr;
1795 	char *gres_name;
1796 	int i;
1797 
1798 	if (reconfig) {
1799 		gres_plugin_reconfig();
1800 	} else {
1801 		for (i = 0, node_ptr = node_record_table_ptr;
1802 		     i < node_record_count; i++, node_ptr++) {
1803 			if (node_ptr->gres)
1804 				gres_name = node_ptr->gres;
1805 			else
1806 				gres_name = node_ptr->config_ptr->gres;
1807 			gres_plugin_init_node_config(node_ptr->name, gres_name,
1808 						     &node_ptr->gres_list);
1809 			if (!IS_NODE_CLOUD(node_ptr))
1810 				continue;
1811 
1812 			/*
1813 			 * Load in GRES for node now. By default Slurm gets this
1814 			 * information when the node registers for the first
1815 			 * time, which can take a while for a node in the cloud
1816 			 * to boot.
1817 			 */
1818 			gres_plugin_node_config_load(
1819 				node_ptr->config_ptr->cpus, node_ptr->name,
1820 				node_ptr->gres_list, NULL, NULL);
1821 			gres_plugin_node_config_validate(
1822 				node_ptr->name, node_ptr->config_ptr->gres,
1823 				&node_ptr->gres, &node_ptr->gres_list,
1824 				node_ptr->config_ptr->threads,
1825 				node_ptr->config_ptr->cores,
1826 				node_ptr->config_ptr->sockets,
1827 				slurmctld_conf.conf_flags & CTL_CONF_OR, NULL);
1828 		}
1829 	}
1830 }
1831 /*
1832  * Configure node features.
1833  * IN old_node_table_ptr - Previous node records
1834  * IN old_node_record_count - Count of previous node records
1835  * IN recover - replace node features data depending upon value.
1836  *              0, 1 = use data from config record, built using slurm.conf
1837  *              2 = use data from node record, built from saved state
1838  */
1839 static void _set_features(node_record_t *old_node_table_ptr,
1840 			  int old_node_record_count, int recover)
1841 {
1842 	node_record_t *node_ptr, *old_node_ptr;
1843 	char *tmp, *tok, *sep;
1844 	int i, node_features_cnt = node_features_g_count();
1845 
1846 	for (i = 0, old_node_ptr = old_node_table_ptr;
1847 	     i < old_node_record_count;
1848 	     i++, old_node_ptr++) {
1849 
1850 		node_ptr  = find_node_record(old_node_ptr->name);
1851 
1852 		if (node_ptr == NULL)
1853 			continue;
1854 
1855 		/*
1856 		 * Load everything from saved state, ignoring what was read from
1857 		 * slurm.conf. Features in the node record are just a placeholder
1858 		 * for restore_node_features() to set up new config records.
1859 		 */
1860 		if (recover == 2) {
1861 			xfree(node_ptr->features);
1862 			xfree(node_ptr->features_act);
1863 			node_ptr->features = old_node_ptr->features;
1864 			node_ptr->features_act = old_node_ptr->features_act;
1865 			old_node_ptr->features = NULL;
1866 			old_node_ptr->features_act = NULL;
1867 			continue;
1868 		}
1869 
1870 		xfree(node_ptr->features_act);
1871 		node_ptr->features_act = xstrdup(node_ptr->features);
1872 
1873 		if (node_features_cnt == 0)
1874 			continue;
1875 
1876 		/* If we are here, there's a node_features plugin active */
1877 
1878 		/*
1879 		 * Copy the subset of plugin-controlled features_available
1880 		 * and features_active found in the old node_ptr for this node
1881 		 * into the corresponding fields of the new node record.
1882 		 * This ensures that KNL modes are preserved across a
1883 		 * reconfigure; otherwise we would have to wait until the node
1884 		 * registers to learn its available and active KNL features.
1885 		 */
1886 		if (old_node_ptr->features != NULL) {
1887 			char *save_ptr = NULL;
1888 			if (node_ptr->features)
1889 				sep = ",";
1890 			else
1891 				sep = "";
1892 			tmp = xstrdup(old_node_ptr->features);
1893 			tok = strtok_r(tmp, ",", &save_ptr);
1894 			while (tok) {
1895 				if (node_features_g_changeable_feature(tok)) {
1896 					xstrfmtcat(node_ptr->features,
1897 						   "%s%s", sep, tok);
1898 					sep = ",";
1899 				}
1900 				tok = strtok_r(NULL, ",", &save_ptr);
1901 			}
1902 			xfree(tmp);
1903 		}
1904 
1905 		if (old_node_ptr->features_act != NULL) {
1906 			char *save_ptr = NULL;
1907 			if (node_ptr->features_act)
1908 				sep = ",";
1909 			else
1910 				sep = "";
1911 			tmp = xstrdup(old_node_ptr->features_act);
1912 			tok = strtok_r(tmp, ",", &save_ptr);
1913 			while (tok) {
1914 				if (node_features_g_changeable_feature(tok)) {
1915 					xstrfmtcat(node_ptr->features_act,
1916 						   "%s%s", sep, tok);
1917 					sep = ",";
1918 				}
1919 				tok = strtok_r(NULL, ",", &save_ptr);
1920 			}
1921 			xfree(tmp);
1922 		}
1923 	}
1924 }
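/*
 * Illustrative sketch (not compiled) of the separator pattern used above to
 * append plugin-changeable feature tokens to a possibly NULL comma-separated
 * destination string: "sep" starts empty only when the destination is still
 * NULL, so no leading comma is ever produced.  The token values are
 * hypothetical.
 */
#if 0
	char *dest = NULL, *sep = "";
	xstrfmtcat(dest, "%s%s", sep, "knl");	/* dest = "knl" */
	sep = ",";
	xstrfmtcat(dest, "%s%s", sep, "flat");	/* dest = "knl,flat" */
	xfree(dest);
#endif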
1925 /* Restore node state and size information from saved records which match
1926  * the node registration message. If a node was re-configured to be down or
1927  * drained, we set those states. We only recover a node's Features if
1928  * recover==2. */
1929 static int _restore_node_state(int recover,
1930 			       node_record_t *old_node_table_ptr,
1931 			       int old_node_record_count)
1932 {
1933 	node_record_t *node_ptr, *old_node_ptr;
1934 	int i, rc = SLURM_SUCCESS;
1935 	hostset_t hs = NULL;
1936 	bool power_save_mode = false;
1937 
1938 	if (slurmctld_conf.suspend_program && slurmctld_conf.resume_program)
1939 		power_save_mode = true;
1940 
1941 	for (i=0, node_ptr=node_record_table_ptr; i<node_record_count;
1942 	     i++, node_ptr++) {
1943 		node_ptr->not_responding = true;
1944 	}
1945 
1946 	for (i=0, old_node_ptr=old_node_table_ptr; i<old_node_record_count;
1947 	     i++, old_node_ptr++) {
1948 		bool drain_flag = false, down_flag = false;
1949 		dynamic_plugin_data_t *tmp_select_nodeinfo;
1950 
1951 		node_ptr  = find_node_record(old_node_ptr->name);
1952 		if (node_ptr == NULL)
1953 			continue;
1954 
1955 		node_ptr->not_responding = false;
1956 		if (IS_NODE_DOWN(node_ptr))
1957 			down_flag = true;
1958 		if (IS_NODE_DRAIN(node_ptr))
1959 			drain_flag = true;
1960 		if ( IS_NODE_FUTURE(old_node_ptr) &&
1961 		    !IS_NODE_FUTURE(node_ptr)) {
1962 			/* Replace FUTURE state with new state, but preserve
1963 			 * state flags (e.g. POWER) */
1964 			node_ptr->node_state =
1965 				(node_ptr->node_state     & NODE_STATE_BASE) |
1966 				(old_node_ptr->node_state & NODE_STATE_FLAGS);
1967 		} else {
1968 			node_ptr->node_state = old_node_ptr->node_state;
1969 		}
1970 
1971 		if (down_flag) {
1972 			node_ptr->node_state &= NODE_STATE_FLAGS;
1973 			node_ptr->node_state |= NODE_STATE_DOWN;
1974 		}
1975 		if (drain_flag)
1976 			node_ptr->node_state |= NODE_STATE_DRAIN;
1977 		if ((!power_save_mode) &&
1978 		    (IS_NODE_POWER_SAVE(node_ptr) ||
1979 		     IS_NODE_POWER_UP(node_ptr))) {
1980 			node_ptr->node_state &= (~NODE_STATE_POWER_SAVE);
1981 			node_ptr->node_state &= (~NODE_STATE_POWER_UP);
1982 			if (hs)
1983 				hostset_insert(hs, node_ptr->name);
1984 			else
1985 				hs = hostset_create(node_ptr->name);
1986 		}
1987 
1988 		if (IS_NODE_CLOUD(node_ptr) && !IS_NODE_POWER_SAVE(node_ptr)) {
1989 			/* Preserve NodeHostname + NodeAddr set by scontrol */
1990 			xfree(node_ptr->comm_name);
1991 			node_ptr->comm_name = old_node_ptr->comm_name;
1992 			old_node_ptr->comm_name = NULL;
1993 			xfree(node_ptr->node_hostname);
1994 			node_ptr->node_hostname = old_node_ptr->node_hostname;
1995 			old_node_ptr->node_hostname = NULL;
1996 			slurm_reset_alias(node_ptr->name, node_ptr->comm_name,
1997 					  node_ptr->node_hostname);
1998 		}
1999 
2000 		node_ptr->last_response = old_node_ptr->last_response;
2001 		node_ptr->protocol_version = old_node_ptr->protocol_version;
2002 		node_ptr->cpu_load = old_node_ptr->cpu_load;
2003 
2004 		/* make sure we get the old state from the select
2005 		 * plugin, just swap it out to avoid possible memory leak */
2006 		tmp_select_nodeinfo = node_ptr->select_nodeinfo;
2007 		node_ptr->select_nodeinfo = old_node_ptr->select_nodeinfo;
2008 		old_node_ptr->select_nodeinfo = tmp_select_nodeinfo;
2009 
2010 		if (old_node_ptr->port != node_ptr->config_ptr->cpus) {
2011 			rc = ESLURM_NEED_RESTART;
2012 			error("Configured cpu count change on %s (%u to %u)",
2013 			      node_ptr->name, old_node_ptr->port,
2014 			      node_ptr->config_ptr->cpus);
2015 		}
2016 
2017 		node_ptr->boot_time     = old_node_ptr->boot_time;
2018 		node_ptr->cpus          = old_node_ptr->cpus;
2019 		node_ptr->cores         = old_node_ptr->cores;
2020 		xfree(node_ptr->cpu_spec_list);
2021 		node_ptr->cpu_spec_list = old_node_ptr->cpu_spec_list;
2022 		old_node_ptr->cpu_spec_list = NULL;
2023 		node_ptr->core_spec_cnt = old_node_ptr->core_spec_cnt;
2024 		node_ptr->last_idle     = old_node_ptr->last_idle;
2025 		node_ptr->boards        = old_node_ptr->boards;
2026 		node_ptr->sockets       = old_node_ptr->sockets;
2027 		node_ptr->threads       = old_node_ptr->threads;
2028 		node_ptr->real_memory   = old_node_ptr->real_memory;
2029 		node_ptr->mem_spec_limit = old_node_ptr->mem_spec_limit;
2030 		node_ptr->slurmd_start_time = old_node_ptr->slurmd_start_time;
2031 		node_ptr->tmp_disk      = old_node_ptr->tmp_disk;
2032 		node_ptr->weight        = old_node_ptr->weight;
2033 
2034 		node_ptr->sus_job_cnt   = old_node_ptr->sus_job_cnt;
2035 
2036 		FREE_NULL_LIST(node_ptr->gres_list);
2037 		node_ptr->gres_list = old_node_ptr->gres_list;
2038 		old_node_ptr->gres_list = NULL;
2039 
2040 		if (node_ptr->reason == NULL) {
2041 			/* Recover only if not explicitly set in slurm.conf */
2042 			node_ptr->reason = old_node_ptr->reason;
2043 			node_ptr->reason_time = old_node_ptr->reason_time;
2044 			old_node_ptr->reason = NULL;
2045 		}
2046 		if (recover == 2) {
2047 			xfree(node_ptr->gres);
2048 			node_ptr->gres = old_node_ptr->gres;
2049 			old_node_ptr->gres = NULL;
2050 		}
2051 		if (old_node_ptr->arch) {
2052 			xfree(node_ptr->arch);
2053 			node_ptr->arch = old_node_ptr->arch;
2054 			old_node_ptr->arch = NULL;
2055 		}
2056 		if (old_node_ptr->os) {
2057 			xfree(node_ptr->os);
2058 			node_ptr->os = old_node_ptr->os;
2059 			old_node_ptr->os = NULL;
2060 		}
2061 		if (old_node_ptr->node_spec_bitmap) {
2062 			FREE_NULL_BITMAP(node_ptr->node_spec_bitmap);
2063 			node_ptr->node_spec_bitmap =
2064 				old_node_ptr->node_spec_bitmap;
2065 			old_node_ptr->node_spec_bitmap = NULL;
2066 		}
2067 	}
2068 
2069 	if (hs) {
2070 		char node_names[128];
2071 		hostset_ranged_string(hs, sizeof(node_names), node_names);
2072 		info("Cleared POWER_SAVE flag from nodes %s", node_names);
2073 		hostset_destroy(hs);
2074 		hs = NULL;
2075 	}
2076 
2077 	for (i=0, node_ptr=node_record_table_ptr; i<node_record_count;
2078 	     i++, node_ptr++) {
2079 		if (!node_ptr->not_responding)
2080 			continue;
2081 		node_ptr->not_responding = false;
2082 		if (hs)
2083 			hostset_insert(hs, node_ptr->name);
2084 		else
2085 			hs = hostset_create(node_ptr->name);
2086 	}
2087 	if (hs) {
2088 		char node_names[128];
2089 		hostset_ranged_string(hs, sizeof(node_names), node_names);
2090 		error("Nodes added to configuration (%s)", node_names);
2091 		error("Reboot of all slurm daemons is recommended");
2092 		hostset_destroy(hs);
2093 	}
2094 
2095 	return rc;
2096 }
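/*
 * Note on the recovery pattern above (a summary, not new behavior): most
 * string, list and bitmap members are restored by "stealing" the old
 * record's pointer and NULLing it out, so _purge_old_node_state() can later
 * free the old table without double-freeing anything the new table now owns.
 * A minimal sketch of the pattern:
 */
#if 0
	xfree(node_ptr->arch);			/* drop value from slurm.conf */
	node_ptr->arch = old_node_ptr->arch;	/* take ownership */
	old_node_ptr->arch = NULL;		/* old record no longer owns it */
#endif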
2097 
2098 /* Purge old node state information */
2099 static void _purge_old_node_state(node_record_t *old_node_table_ptr,
2100 				  int old_node_record_count)
2101 {
2102 	int i;
2103 	node_record_t *node_ptr;
2104 
2105 	node_ptr = old_node_table_ptr;
2106 	if (old_node_table_ptr) {
2107 		for (i = 0; i< old_node_record_count; i++, node_ptr++)
2108 			purge_node_rec(node_ptr);
2109 		xfree(old_node_table_ptr);
2110 	}
2111 }
2112 
2113 /* Restore partition information from saved records */
2114 static int  _restore_part_state(List old_part_list, char *old_def_part_name,
2115 				uint16_t flags)
2116 {
2117 	int rc = SLURM_SUCCESS;
2118 	ListIterator part_iterator;
2119 	part_record_t *old_part_ptr, *part_ptr;
2120 
2121 	if (!old_part_list)
2122 		return rc;
2123 
2124 	/* For each part in list, find and update recs */
2125 	part_iterator = list_iterator_create(old_part_list);
2126 	while ((old_part_ptr = list_next(part_iterator))) {
2127 		xassert(old_part_ptr->magic == PART_MAGIC);
2128 		part_ptr = find_part_record(old_part_ptr->name);
2129 		if (part_ptr) {
2130 			if ( !(flags & RECONFIG_KEEP_PART_INFO) &&
2131 			     (flags & RECONFIG_KEEP_PART_STAT)	) {
2132 				if (part_ptr->state_up != old_part_ptr->state_up) {
2133 					info("Partition %s State differs from "
2134 					     "slurm.conf", part_ptr->name);
2135 					part_ptr->state_up = old_part_ptr->state_up;
2136 				}
2137 				continue;
2138 			}
2139 			/* Current partition found in slurm.conf,
2140 			 * report differences from slurm.conf configuration */
2141 			if (xstrcmp(part_ptr->allow_accounts,
2142 				    old_part_ptr->allow_accounts)) {
2143 				error("Partition %s AllowAccounts differs from slurm.conf",
2144 				      part_ptr->name);
2145 				xfree(part_ptr->allow_accounts);
2146 				part_ptr->allow_accounts =
2147 					xstrdup(old_part_ptr->allow_accounts);
2148 				accounts_list_build(part_ptr->allow_accounts,
2149 						&part_ptr->allow_account_array);
2150 			}
2151 			if (xstrcmp(part_ptr->allow_alloc_nodes,
2152 				    old_part_ptr->allow_alloc_nodes)) {
2153 				error("Partition %s AllowNodes differs from slurm.conf",
2154 				      part_ptr->name);
2155 				xfree(part_ptr->allow_alloc_nodes);
2156 				part_ptr->allow_alloc_nodes =
2157 					xstrdup(old_part_ptr->allow_alloc_nodes);
2158 			}
2159 			if (xstrcmp(part_ptr->allow_groups,
2160 				    old_part_ptr->allow_groups)) {
2161 				error("Partition %s AllowGroups differs from "
2162 				      "slurm.conf", part_ptr->name);
2163 				xfree(part_ptr->allow_groups);
2164 				part_ptr->allow_groups = xstrdup(old_part_ptr->
2165 								 allow_groups);
2166 			}
2167 			if (xstrcmp(part_ptr->allow_qos,
2168 				    old_part_ptr->allow_qos)) {
2169 				error("Partition %s AllowQos differs from "
2170 				      "slurm.conf", part_ptr->name);
2171 				xfree(part_ptr->allow_qos);
2172 				part_ptr->allow_qos = xstrdup(old_part_ptr->
2173 								 allow_qos);
2174 				qos_list_build(part_ptr->allow_qos,
2175 					       &part_ptr->allow_qos_bitstr);
2176 			}
2177 			if (xstrcmp(part_ptr->alternate,
2178 				    old_part_ptr->alternate)) {
2179 				error("Partition %s Alternate differs from slurm.conf",
2180 				      part_ptr->name);
2181 				xfree(part_ptr->alternate);
2182 				part_ptr->alternate =
2183 					xstrdup(old_part_ptr->alternate);
2184 			}
2185 			if (part_ptr->def_mem_per_cpu !=
2186 			    old_part_ptr->def_mem_per_cpu) {
2187 				error("Partition %s DefMemPerCPU differs from slurm.conf",
2188 				      part_ptr->name);
2189 				part_ptr->def_mem_per_cpu =
2190 					old_part_ptr->def_mem_per_cpu;
2191 			}
2192 			if (part_ptr->default_time !=
2193 			    old_part_ptr->default_time) {
2194 				error("Partition %s DefaultTime differs from slurm.conf",
2195 				      part_ptr->name);
2196 				part_ptr->default_time =
2197 					old_part_ptr->default_time;
2198 			}
2199 			if (xstrcmp(part_ptr->deny_accounts,
2200 				    old_part_ptr->deny_accounts)) {
2201 				error("Partition %s DenyAccounts differs from "
2202 				      "slurm.conf", part_ptr->name);
2203 				xfree(part_ptr->deny_accounts);
2204 				part_ptr->deny_accounts =
2205 					xstrdup(old_part_ptr->deny_accounts);
2206 				accounts_list_build(part_ptr->deny_accounts,
2207 						&part_ptr->deny_account_array);
2208 			}
2209 			if (xstrcmp(part_ptr->deny_qos,
2210 				    old_part_ptr->deny_qos)) {
2211 				error("Partition %s DenyQos differs from "
2212 				      "slurm.conf", part_ptr->name);
2213 				xfree(part_ptr->deny_qos);
2214 				part_ptr->deny_qos = xstrdup(old_part_ptr->
2215 							     deny_qos);
2216 				qos_list_build(part_ptr->deny_qos,
2217 					       &part_ptr->deny_qos_bitstr);
2218 			}
2219 			if ((part_ptr->flags & PART_FLAG_HIDDEN) !=
2220 			    (old_part_ptr->flags & PART_FLAG_HIDDEN)) {
2221 				error("Partition %s Hidden differs from "
2222 				      "slurm.conf", part_ptr->name);
2223 				if (old_part_ptr->flags & PART_FLAG_HIDDEN)
2224 					part_ptr->flags |= PART_FLAG_HIDDEN;
2225 				else
2226 					part_ptr->flags &= (~PART_FLAG_HIDDEN);
2227 			}
2228 			if ((part_ptr->flags & PART_FLAG_NO_ROOT) !=
2229 			    (old_part_ptr->flags & PART_FLAG_NO_ROOT)) {
2230 				error("Partition %s DisableRootJobs differs "
2231 				      "from slurm.conf", part_ptr->name);
2232 				if (old_part_ptr->flags & PART_FLAG_NO_ROOT)
2233 					part_ptr->flags |= PART_FLAG_NO_ROOT;
2234 				else
2235 					part_ptr->flags &= (~PART_FLAG_NO_ROOT);
2236 			}
2237 			if ((part_ptr->flags & PART_FLAG_EXCLUSIVE_USER) !=
2238 			    (old_part_ptr->flags & PART_FLAG_EXCLUSIVE_USER)) {
2239 				error("Partition %s ExclusiveUser differs "
2240 				      "from slurm.conf", part_ptr->name);
2241 				if (old_part_ptr->flags &
2242 				    PART_FLAG_EXCLUSIVE_USER) {
2243 					part_ptr->flags |=
2244 						PART_FLAG_EXCLUSIVE_USER;
2245 				} else {
2246 					part_ptr->flags &=
2247 						(~PART_FLAG_EXCLUSIVE_USER);
2248 				}
2249 			}
2250 			if ((part_ptr->flags & PART_FLAG_ROOT_ONLY) !=
2251 			    (old_part_ptr->flags & PART_FLAG_ROOT_ONLY)) {
2252 				error("Partition %s RootOnly differs from "
2253 				      "slurm.conf", part_ptr->name);
2254 				if (old_part_ptr->flags & PART_FLAG_ROOT_ONLY)
2255 					part_ptr->flags |= PART_FLAG_ROOT_ONLY;
2256 				else
2257 					part_ptr->flags &= (~PART_FLAG_ROOT_ONLY);
2258 			}
2259 			if ((part_ptr->flags & PART_FLAG_REQ_RESV) !=
2260 			    (old_part_ptr->flags & PART_FLAG_REQ_RESV)) {
2261 				error("Partition %s ReqResv differs from "
2262 				      "slurm.conf", part_ptr->name);
2263 				if (old_part_ptr->flags & PART_FLAG_REQ_RESV)
2264 					part_ptr->flags |= PART_FLAG_REQ_RESV;
2265 				else
2266 					part_ptr->flags &= (~PART_FLAG_REQ_RESV);
2267 			}
2268 			if ((part_ptr->flags & PART_FLAG_LLN) !=
2269 			    (old_part_ptr->flags & PART_FLAG_LLN)) {
2270 				error("Partition %s LLN differs from "
2271 				      "slurm.conf", part_ptr->name);
2272 				if (old_part_ptr->flags & PART_FLAG_LLN)
2273 					part_ptr->flags |= PART_FLAG_LLN;
2274 				else
2275 					part_ptr->flags &= (~PART_FLAG_LLN);
2276 			}
2277 			if (part_ptr->grace_time != old_part_ptr->grace_time) {
2278 				error("Partition %s GraceTime differs from slurm.conf",
2279 				      part_ptr->name);
2280 				part_ptr->grace_time = old_part_ptr->grace_time;
2281 			}
2282 			if (part_ptr->max_cpus_per_node !=
2283 			    old_part_ptr->max_cpus_per_node) {
2284 				error("Partition %s MaxCPUsPerNode differs from slurm.conf"
2285 				      " (%u != %u)",
2286 				      part_ptr->name,
2287 				      part_ptr->max_cpus_per_node,
2288 				      old_part_ptr->max_cpus_per_node);
2289 				part_ptr->max_cpus_per_node =
2290 					old_part_ptr->max_cpus_per_node;
2291 			}
2292 			if (part_ptr->max_mem_per_cpu !=
2293 			    old_part_ptr->max_mem_per_cpu) {
2294 				error("Partition %s MaxMemPerNode/MaxMemPerCPU differs from slurm.conf"
2295 				      " (%"PRIu64" != %"PRIu64")",
2296 				      part_ptr->name,
2297 				      part_ptr->max_mem_per_cpu,
2298 				      old_part_ptr->max_mem_per_cpu);
2299 				part_ptr->max_mem_per_cpu =
2300 					old_part_ptr->max_mem_per_cpu;
2301 			}
2302 			if (part_ptr->max_nodes_orig !=
2303 			    old_part_ptr->max_nodes_orig) {
2304 				error("Partition %s MaxNodes differs from "
2305 				      "slurm.conf (%u != %u)", part_ptr->name,
2306 				       part_ptr->max_nodes_orig,
2307 				       old_part_ptr->max_nodes_orig);
2308 				part_ptr->max_nodes = old_part_ptr->
2309 						      max_nodes_orig;
2310 				part_ptr->max_nodes_orig = old_part_ptr->
2311 							   max_nodes_orig;
2312 			}
2313 			if (part_ptr->max_share != old_part_ptr->max_share) {
2314 				error("Partition %s Shared differs from "
2315 				      "slurm.conf", part_ptr->name);
2316 				part_ptr->max_share = old_part_ptr->max_share;
2317 			}
2318 			if (part_ptr->max_time != old_part_ptr->max_time) {
2319 				error("Partition %s MaxTime differs from "
2320 				      "slurm.conf", part_ptr->name);
2321 				part_ptr->max_time = old_part_ptr->max_time;
2322 			}
2323 			if (part_ptr->min_nodes_orig !=
2324 			    old_part_ptr->min_nodes_orig) {
2325 				error("Partition %s MinNodes differs from "
2326 				      "slurm.conf (%u != %u)", part_ptr->name,
2327 				       part_ptr->min_nodes_orig,
2328 				       old_part_ptr->min_nodes_orig);
2329 				part_ptr->min_nodes = old_part_ptr->
2330 						      min_nodes_orig;
2331 				part_ptr->min_nodes_orig = old_part_ptr->
2332 							   min_nodes_orig;
2333 			}
2334 			if (xstrcmp(part_ptr->nodes, old_part_ptr->nodes)) {
2335 				error("Partition %s Nodes differs from "
2336 				      "slurm.conf", part_ptr->name);
2337 				xfree(part_ptr->nodes);
2338 				part_ptr->nodes = xstrdup(old_part_ptr->nodes);
2339 			}
2340 			if (part_ptr->over_time_limit !=
2341 			    old_part_ptr->over_time_limit) {
2342 				error("Partition %s OverTimeLimit differs from slurm.conf",
2343 				      part_ptr->name);
2344 				part_ptr->over_time_limit =
2345 					old_part_ptr->over_time_limit;
2346 			}
2347 			if (part_ptr->preempt_mode !=
2348 			    old_part_ptr->preempt_mode) {
2349 				error("Partition %s PreemptMode differs from "
2350 				      "slurm.conf", part_ptr->name);
2351 				part_ptr->preempt_mode = old_part_ptr->
2352 							 preempt_mode;
2353 			}
2354 			if (part_ptr->priority_job_factor !=
2355 			    old_part_ptr->priority_job_factor) {
2356 				error("Partition %s PriorityJobFactor differs "
2357 				      "from slurm.conf", part_ptr->name);
2358 				part_ptr->priority_job_factor =
2359 					old_part_ptr->priority_job_factor;
2360 			}
2361 			if (part_ptr->priority_tier !=
2362 			    old_part_ptr->priority_tier) {
2363 				error("Partition %s PriorityTier differs from "
2364 				      "slurm.conf", part_ptr->name);
2365 				part_ptr->priority_tier =
2366 					old_part_ptr->priority_tier;
2367 			}
2368 			if (xstrcmp(part_ptr->qos_char,
2369 				    old_part_ptr->qos_char)) {
2370 				error("Partition %s QOS differs from slurm.conf",
2371 				      part_ptr->name);
2372 				xfree(part_ptr->qos_char);
2373 				part_ptr->qos_char =
2374 					xstrdup(old_part_ptr->qos_char);
2375 				part_ptr->qos_ptr = old_part_ptr->qos_ptr;
2376 			}
2377 			if (part_ptr->state_up != old_part_ptr->state_up) {
2378 				error("Partition %s State differs from "
2379 				      "slurm.conf", part_ptr->name);
2380 				part_ptr->state_up = old_part_ptr->state_up;
2381 			}
2382 		} else {
2383 			if ( !(flags & RECONFIG_KEEP_PART_INFO) &&
2384 			     (flags & RECONFIG_KEEP_PART_STAT) ) {
2385 				info("Partition %s missing from slurm.conf, "
2386 				     "not restoring it", old_part_ptr->name);
2387 				continue;
2388 			}
2389 			error("Partition %s missing from slurm.conf, "
2390 			      "restoring it", old_part_ptr->name);
2391 			part_ptr = create_part_record(old_part_ptr->name);
2392 
2393 			part_ptr->allow_accounts =
2394 				xstrdup(old_part_ptr->allow_accounts);
2395 			accounts_list_build(part_ptr->allow_accounts,
2396 					 &part_ptr->allow_account_array);
2397 			part_ptr->allow_alloc_nodes =
2398 				xstrdup(old_part_ptr->allow_alloc_nodes);
2399 			part_ptr->allow_groups = xstrdup(old_part_ptr->
2400 							 allow_groups);
2401 			part_ptr->allow_qos = xstrdup(old_part_ptr->
2402 						      allow_qos);
2403 			qos_list_build(part_ptr->allow_qos,
2404 				       &part_ptr->allow_qos_bitstr);
2405 			part_ptr->def_mem_per_cpu =
2406 				old_part_ptr->def_mem_per_cpu;
2407 			part_ptr->default_time = old_part_ptr->default_time;
2408 			part_ptr->deny_accounts = xstrdup(old_part_ptr->
2409 							  deny_accounts);
2410 			accounts_list_build(part_ptr->deny_accounts,
2411 					 &part_ptr->deny_account_array);
2412 			part_ptr->deny_qos = xstrdup(old_part_ptr->
2413 						     deny_qos);
2414 			qos_list_build(part_ptr->deny_qos,
2415 				       &part_ptr->deny_qos_bitstr);
2416 			part_ptr->flags = old_part_ptr->flags;
2417 			part_ptr->grace_time = old_part_ptr->grace_time;
2418 			part_ptr->job_defaults_list =
2419 				job_defaults_copy(old_part_ptr->job_defaults_list);
2420 			part_ptr->max_cpus_per_node =
2421 				old_part_ptr->max_cpus_per_node;
2422 			part_ptr->max_mem_per_cpu =
2423 				old_part_ptr->max_mem_per_cpu;
2424 			part_ptr->max_nodes = old_part_ptr->max_nodes;
2425 			part_ptr->max_nodes_orig = old_part_ptr->
2426 						   max_nodes_orig;
2427 			part_ptr->max_share = old_part_ptr->max_share;
2428 			part_ptr->max_time = old_part_ptr->max_time;
2429 			part_ptr->min_nodes = old_part_ptr->min_nodes;
2430 			part_ptr->min_nodes_orig = old_part_ptr->
2431 						   min_nodes_orig;
2432 			part_ptr->nodes = xstrdup(old_part_ptr->nodes);
2433 			part_ptr->over_time_limit =
2434 				old_part_ptr->over_time_limit;
2435 			part_ptr->preempt_mode = old_part_ptr->preempt_mode;
2436 			part_ptr->priority_job_factor =
2437 				old_part_ptr->priority_job_factor;
2438 			part_ptr->priority_tier = old_part_ptr->priority_tier;
2439 			part_ptr->qos_char =
2440 				xstrdup(old_part_ptr->qos_char);
2441 			part_ptr->qos_ptr = old_part_ptr->qos_ptr;
2442 			part_ptr->state_up = old_part_ptr->state_up;
2443 		}
2444 	}
2445 	list_iterator_destroy(part_iterator);
2446 
2447 	if (old_def_part_name &&
2448 	    ((default_part_name == NULL) ||
2449 	     xstrcmp(old_def_part_name, default_part_name))) {
2450 		part_ptr = find_part_record(old_def_part_name);
2451 		if (part_ptr) {
2452 			error("Default partition reset to %s",
2453 			      old_def_part_name);
2454 			default_part_loc  = part_ptr;
2455 			xfree(default_part_name);
2456 			default_part_name = xstrdup(old_def_part_name);
2457 		}
2458 	}
2459 
2460 	return rc;
2461 }
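/*
 * Note on the flags tested above (a summary, not new behavior): when
 * RECONFIG_KEEP_PART_STAT is set and RECONFIG_KEEP_PART_INFO is not, only
 * each partition's State is carried over and partitions missing from
 * slurm.conf are not re-created; otherwise every recovered setting overrides
 * slurm.conf and each difference is logged.  These flags correspond to the
 * slurm.conf ReconfigFlags options KeepPartState and KeepPartInfo.
 */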
2462 
2463 /* Purge old partition state information */
2464 static void _purge_old_part_state(List old_part_list, char *old_def_part_name)
2465 {
2466 	xfree(old_def_part_name);
2467 
2468 	if (!old_part_list)
2469 		return;
2470 	FREE_NULL_LIST(old_part_list);
2471 }
2472 
2473 /*
2474  * _preserve_select_type_param - preserve original plugin parameters.
2475  *	Daemons and/or commands must be restarted for some
2476  *	select plugin value changes to take effect.
2477  * RET zero or error code
2478  */
2479 static int  _preserve_select_type_param(slurm_ctl_conf_t *ctl_conf_ptr,
2480 					uint16_t old_select_type_p)
2481 {
2482 	int rc = SLURM_SUCCESS;
2483 
2484 	/* SelectTypeParameters cannot change */
2485 	if (old_select_type_p) {
2486 		if (old_select_type_p != ctl_conf_ptr->select_type_param) {
2487 			ctl_conf_ptr->select_type_param = old_select_type_p;
2488 			rc = ESLURM_INVALID_SELECTTYPE_CHANGE;
2489 		}
2490 	}
2491 	return rc;
2492 }
2493 
2494 /* Start or stop the gang scheduler module as needed based upon changes in
2495  *	configuration */
2496 static int _update_preempt(uint16_t old_preempt_mode)
2497 {
2498 	uint16_t new_preempt_mode = slurm_get_preempt_mode();
2499 
2500 	if ((old_preempt_mode & PREEMPT_MODE_GANG) ==
2501 	    (new_preempt_mode & PREEMPT_MODE_GANG))
2502 		return SLURM_SUCCESS;
2503 
2504 	if (new_preempt_mode & PREEMPT_MODE_GANG) {
2505 		info("Enabling gang scheduling");
2506 		gs_init();
2507 		return SLURM_SUCCESS;
2508 	}
2509 
2510 	if (old_preempt_mode == PREEMPT_MODE_GANG) {
2511 		info("Disabling gang scheduling");
2512 		gs_wake_jobs();
2513 		gs_fini();
2514 		return SLURM_SUCCESS;
2515 	}
2516 
2517 	error("Invalid gang scheduling mode change");
2518 	return EINVAL;
2519 }
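/*
 * Illustrative sketch (not compiled): only a change of the PREEMPT_MODE_GANG
 * bit is acted on here.  Any new PreemptMode that adds "gang" starts the gang
 * scheduler; dropping "gang" is honored only when the old mode was exactly
 * PREEMPT_MODE_GANG, otherwise the change is rejected as invalid.
 */
#if 0
	/* assuming slurm.conf now sets PreemptMode=suspend,gang */
	(void) _update_preempt(PREEMPT_MODE_OFF);	/* calls gs_init() */

	/* assuming slurm.conf now sets PreemptMode=off */
	(void) _update_preempt(PREEMPT_MODE_GANG);	/* gs_wake_jobs() + gs_fini() */
#endif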
2520 
2521 /*
2522  * _sync_nodes_to_jobs - sync node state to job states on slurmctld restart.
2523  *	This routine marks nodes allocated to a job as busy no matter what
2524  *	the node's last saved state was
2525  * RET count of nodes having state changed
2526  * Note: Operates on common variables, no arguments
2527  */
2528 static int _sync_nodes_to_jobs(bool reconfig)
2529 {
2530 	job_record_t *job_ptr;
2531 	ListIterator job_iterator;
2532 	int update_cnt = 0;
2533 
2534 	job_iterator = list_iterator_create(job_list);
2535 	while ((job_ptr = list_next(job_iterator))) {
2536 		if (!reconfig &&
2537 		    job_ptr->details && job_ptr->details->prolog_running) {
2538 			job_ptr->details->prolog_running = 0;
2539 			if (IS_JOB_CONFIGURING(job_ptr)) {
2540 				prolog_slurmctld(job_ptr);
2541 				(void) bb_g_job_begin(job_ptr);
2542 			}
2543 		}
2544 
2545 		if (job_ptr->node_bitmap == NULL)
2546 			;
2547 		else if (IS_JOB_RUNNING(job_ptr) || IS_JOB_COMPLETING(job_ptr))
2548 			update_cnt += _sync_nodes_to_active_job(job_ptr);
2549 		else if (IS_JOB_SUSPENDED(job_ptr))
2550 			_sync_nodes_to_suspended_job(job_ptr);
2551 
2552 	}
2553 	list_iterator_destroy(job_iterator);
2554 
2555 	if (update_cnt) {
2556 		info("_sync_nodes_to_jobs updated state of %d nodes",
2557 		     update_cnt);
2558 	}
2559 	sync_front_end_state();
2560 	return update_cnt;
2561 }
2562 
2563 /* For jobs which are in state COMPLETING, deallocate the nodes and
2564  * issue the RPC to kill the job */
2565 static int _sync_nodes_to_comp_job(void)
2566 {
2567 	job_record_t *job_ptr;
2568 	ListIterator job_iterator;
2569 	int update_cnt = 0;
2570 
2571 	job_iterator = list_iterator_create(job_list);
2572 	while ((job_ptr = list_next(job_iterator))) {
2573 		if ((job_ptr->node_bitmap) && IS_JOB_COMPLETING(job_ptr)) {
2574 
2575 			/* If the controller is reconfiguring,
2576 			 * the job is in completing state and
2577 			 * the slurmctld epilog is already
2578 			 * running, then deallocate_nodes()
2579 			 * was already called; do not invoke it
2580 			 * again and don't start another epilog.
2581 			 */
2582 			if (job_ptr->epilog_running == true)
2583 				continue;
2584 
2585 			update_cnt++;
2586 			info("%s: %pJ in completing state", __func__, job_ptr);
2587 			if (!job_ptr->node_bitmap_cg)
2588 				build_cg_bitmap(job_ptr);
2589 
2590 			/* deallocate_nodes() will subtract this job from the
2591 			 * accounting policy limits, but after a restart it has
2592 			 * not yet been added, so add it now
2593 			 */
2594 			if (accounting_enforce & ACCOUNTING_ENFORCE_LIMITS)
2595 				acct_policy_job_begin(job_ptr);
2596 
2597 			if (job_ptr->front_end_ptr)
2598 				job_ptr->front_end_ptr->job_cnt_run++;
2599 			deallocate_nodes(job_ptr, false, false, false);
2600 			/* The job was already in completing state at slurmctld
2601 			 * restart or reconfiguration time, so do not log its
2602 			 * completion again: job_completion_logger(job_ptr, false); */
2603 		}
2604 	}
2605 	list_iterator_destroy(job_iterator);
2606 	if (update_cnt)
2607 		info("%s: completing %d jobs", __func__, update_cnt);
2608 	return update_cnt;
2609 }
2610 
2611 /* Synchronize states of nodes and active jobs (RUNNING or COMPLETING state)
2612  * RET count of nodes in this job with state changes */
2613 static int _sync_nodes_to_active_job(job_record_t *job_ptr)
2614 {
2615 	int i, cnt = 0;
2616 	uint32_t node_flags;
2617 	node_record_t *node_ptr = node_record_table_ptr;
2618 
2619 	if (job_ptr->node_bitmap_cg) /* job completing */
2620 		job_ptr->node_cnt = bit_set_count(job_ptr->node_bitmap_cg);
2621 	else
2622 		job_ptr->node_cnt = bit_set_count(job_ptr->node_bitmap);
2623 	for (i = 0; i < node_record_count; i++, node_ptr++) {
2624 		if (job_ptr->node_bitmap_cg) { /* job completing */
2625 			if (bit_test(job_ptr->node_bitmap_cg, i) == 0)
2626 				continue;
2627 		} else if (bit_test(job_ptr->node_bitmap, i) == 0)
2628 			continue;
2629 
2630 		if ((job_ptr->details &&
2631 		     (job_ptr->details->whole_node == WHOLE_NODE_USER)) ||
2632 		    (job_ptr->part_ptr &&
2633 		     (job_ptr->part_ptr->flags & PART_FLAG_EXCLUSIVE_USER))) {
2634 			node_ptr->owner_job_cnt++;
2635 			node_ptr->owner = job_ptr->user_id;
2636 		}
2637 
2638 		if (slurm_mcs_get_select(job_ptr) == 1) {
2639 			xfree(node_ptr->mcs_label);
2640 			node_ptr->mcs_label = xstrdup(job_ptr->mcs_label);
2641 		}
2642 
2643 		node_flags = node_ptr->node_state & NODE_STATE_FLAGS;
2644 
2645 		node_ptr->run_job_cnt++; /* NOTE:
2646 				* This counter moved to comp_job_cnt
2647 				* by _sync_nodes_to_comp_job() */
2648 		if ((job_ptr->details) && (job_ptr->details->share_res == 0))
2649 			node_ptr->no_share_job_cnt++;
2650 
2651 		if (IS_NODE_DOWN(node_ptr)              &&
2652 		    IS_JOB_RUNNING(job_ptr)             &&
2653 		    (job_ptr->kill_on_node_fail == 0)   &&
2654 		    (job_ptr->node_cnt > 1)) {
2655 			/* This should only happen if a job was running
2656 			 * on a node that was newly configured DOWN */
2657 			int save_accounting_enforce;
2658 			info("Removing failed node %s from %pJ",
2659 			     node_ptr->name, job_ptr);
2660 			/*
2661 			 * Disable accounting here. Accounting reset for all
2662 			 * jobs in _restore_job_accounting()
2663 			 */
2664 			save_accounting_enforce = accounting_enforce;
2665 			accounting_enforce &= (~ACCOUNTING_ENFORCE_LIMITS);
2666 			job_pre_resize_acctg(job_ptr);
2667 			srun_node_fail(job_ptr, node_ptr->name);
2668 			kill_step_on_node(job_ptr, node_ptr, true);
2669 			excise_node_from_job(job_ptr, node_ptr);
2670 			job_post_resize_acctg(job_ptr);
2671 			accounting_enforce = save_accounting_enforce;
2672 		} else if (IS_NODE_DOWN(node_ptr) && IS_JOB_RUNNING(job_ptr)) {
2673 			info("Killing %pJ on DOWN node %s",
2674 			     job_ptr, node_ptr->name);
2675 			_abort_job(job_ptr, JOB_NODE_FAIL, FAIL_DOWN_NODE,
2676 				   NULL);
2677 			cnt++;
2678 		} else if (IS_NODE_IDLE(node_ptr)) {
2679 			cnt++;
2680 			node_ptr->node_state = NODE_STATE_ALLOCATED |
2681 					       node_flags;
2682 		}
2683 	}
2684 
2685 	if ((IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr)) &&
2686 	    (job_ptr->front_end_ptr != NULL))
2687 		job_ptr->front_end_ptr->job_cnt_run++;
2688 
2689 	return cnt;
2690 }
2691 
2692 /* Synchronize states of nodes and suspended jobs */
2693 static void _sync_nodes_to_suspended_job(job_record_t *job_ptr)
2694 {
2695 	int i;
2696 	node_record_t *node_ptr = node_record_table_ptr;
2697 
2698 	for (i = 0; i < node_record_count; i++, node_ptr++) {
2699 		if (bit_test(job_ptr->node_bitmap, i) == 0)
2700 			continue;
2701 
2702 		node_ptr->sus_job_cnt++;
2703 	}
2704 	return;
2705 }
2706 
2707 /*
2708  * Build license_list for every job.
2709  * Reset accounting for every job.
2710  * Reset the running job count for scheduling policy.
2711  * This must be called after load_all_resv_state() and restore_node_features().
2712  */
2713 static void _restore_job_accounting(void)
2714 {
2715 	job_record_t *job_ptr;
2716 	ListIterator job_iterator;
2717 	bool valid = true;
2718 	List license_list;
2719 
2720 	assoc_mgr_clear_used_info();
2721 
2722 	job_iterator = list_iterator_create(job_list);
2723 	while ((job_ptr = list_next(job_iterator))) {
2724 		if (job_ptr->array_recs)
2725 			job_ptr->array_recs->tot_run_tasks = 0;
2726 	}
2727 
2728 	list_iterator_reset(job_iterator);
2729 	while ((job_ptr = list_next(job_iterator))) {
2730 		(void) build_feature_list(job_ptr);
2731 
2732 		if (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr))
2733 			job_array_start(job_ptr);
2734 
2735 		if (accounting_enforce & ACCOUNTING_ENFORCE_LIMITS) {
2736 			if (!IS_JOB_FINISHED(job_ptr))
2737 				acct_policy_add_job_submit(job_ptr);
2738 			if (IS_JOB_RUNNING(job_ptr) ||
2739 			    IS_JOB_SUSPENDED(job_ptr)) {
2740 				acct_policy_job_begin(job_ptr);
2741 				job_claim_resv(job_ptr);
2742 			} else if (IS_JOB_PENDING(job_ptr) &&
2743 				   job_ptr->details &&
2744 				   job_ptr->details->accrue_time)
2745 				acct_policy_add_accrue_time(job_ptr, true);
2746 		}
2747 
2748 		license_list = license_validate(job_ptr->licenses, false, false,
2749 						job_ptr->tres_req_cnt, &valid);
2750 		FREE_NULL_LIST(job_ptr->license_list);
2751 		if (valid) {
2752 			job_ptr->license_list = license_list;
2753 			xfree(job_ptr->licenses);
2754 			job_ptr->licenses =
2755 				license_list_to_string(license_list);
2756 		}
2757 
2758 		if (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr))
2759 			license_job_get(job_ptr);
2760 
2761 	}
2762 	list_iterator_destroy(job_iterator);
2763 }
2764 
2765 /*
2766  * NOTE: Can be removed in/after 21.08. Starting with 20.02 the
2767  * depend_list is saved as part of the job state, so the controller
2768  * no longer needs to rebuild details->depend_list from the
2769  * dependency string.
2770  */
2771 extern int restore_job_dependencies(void)
2772 {
2773 	job_record_t *job_ptr;
2774 	ListIterator job_iterator;
2775 	int error_code = SLURM_SUCCESS, rc;
2776 	char *new_depend;
2777 	slurmctld_lock_t job_fed_lock = {.job = WRITE_LOCK, .fed = READ_LOCK};
2778 
2779 	lock_slurmctld(job_fed_lock);
2780 
2781 	job_iterator = list_iterator_create(job_list);
2782 	while ((job_ptr = list_next(job_iterator))) {
2783 		if ((job_ptr->details == NULL) ||
2784 		    (job_ptr->details->dependency == NULL) ||
2785 		    job_ptr->details->depend_list)
2786 			continue;
2787 		new_depend = job_ptr->details->dependency;
2788 		job_ptr->details->dependency = NULL;
2789 		rc = update_job_dependency(job_ptr, new_depend);
2790 		if (rc != SLURM_SUCCESS) {
2791 			error("Invalid dependencies discarded for %pJ: %s",
2792 				job_ptr, new_depend);
2793 			error_code = rc;
2794 		}
2795 		xfree(new_depend);
2796 	}
2797 	list_iterator_destroy(job_iterator);
2798 	unlock_slurmctld(job_fed_lock);
2799 
2800 	return error_code;
2801 }
2802 
2803 /* Flush accounting information on this cluster, then for each running or
2804  * suspended job, restore its state in the accounting system */
2805 static void _acct_restore_active_jobs(void)
2806 {
2807 	job_record_t *job_ptr;
2808 	ListIterator job_iterator;
2809 	step_record_t *step_ptr;
2810 	ListIterator step_iterator;
2811 
2812 	info("Reinitializing job accounting state");
2813 	acct_storage_g_flush_jobs_on_cluster(acct_db_conn,
2814 					     time(NULL));
2815 	job_iterator = list_iterator_create(job_list);
2816 	while ((job_ptr = list_next(job_iterator))) {
2817 		if (IS_JOB_SUSPENDED(job_ptr))
2818 			jobacct_storage_g_job_suspend(acct_db_conn, job_ptr);
2819 		if (IS_JOB_SUSPENDED(job_ptr) || IS_JOB_RUNNING(job_ptr)) {
2820 			if (!with_slurmdbd)
2821 				jobacct_storage_g_job_start(
2822 					acct_db_conn, job_ptr);
2823 			else if (job_ptr->db_index != NO_VAL64)
2824 				job_ptr->db_index = 0;
2825 			step_iterator = list_iterator_create(
2826 				job_ptr->step_list);
2827 			while ((step_ptr = list_next(step_iterator))) {
2828 				jobacct_storage_g_step_start(acct_db_conn,
2829 							     step_ptr);
2830 			}
2831 			list_iterator_destroy (step_iterator);
2832 		}
2833 	}
2834 	list_iterator_destroy(job_iterator);
2835 }
2836 
2837 /* _compare_hostnames() - verify that the node count and set of node names
2838  * are unchanged across a reconfiguration */
2839 static int _compare_hostnames(node_record_t *old_node_table,
2840 			      int old_node_count, node_record_t *node_table,
2841 			      int node_count)
2842 {
2843 	int cc;
2844 	int set_size;
2845 	char *old_ranged;
2846 	char *ranged;
2847 	hostset_t old_set;
2848 	hostset_t set;
2849 
2850 	if (old_node_count != node_count) {
2851 		error("%s: node count has changed before reconfiguration "
2852 		      "from %d to %d. You have to restart slurmctld.",
2853 		      __func__, old_node_count, node_count);
2854 		return -1;
2855 	}
2856 
2857 	old_set = hostset_create("");
2858 	for (cc = 0; cc < old_node_count; cc++)
2859 		hostset_insert(old_set, old_node_table[cc].name);
2860 
2861 	set = hostset_create("");
2862 	for (cc = 0; cc < node_count; cc++)
2863 		hostset_insert(set, node_table[cc].name);
2864 
2865 	set_size = MAXHOSTNAMELEN * node_count + node_count + 1;
2866 
2867 	old_ranged = xmalloc(set_size);
2868 	ranged = xmalloc(set_size);
2869 
2870 	hostset_ranged_string(old_set, set_size, old_ranged);
2871 	hostset_ranged_string(set, set_size, ranged);
2872 
2873 	cc = 0;
2874 	if (xstrcmp(old_ranged, ranged) != 0) {
2875 		error("%s: node names changed before reconfiguration. "
2876 		      "You have to restart slurmctld.", __func__);
2877 		cc = -1;
2878 	}
2879 
2880 	hostset_destroy(old_set);
2881 	hostset_destroy(set);
2882 	xfree(old_ranged);
2883 	xfree(ranged);
2884 
2885 	return cc;
2886 }
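/*
 * Illustrative sketch (not compiled): both node sets are canonicalized into
 * ranged host strings before comparison, so node ordering in slurm.conf does
 * not matter.  The node names below are hypothetical.
 */
#if 0
	hostset_t hs = hostset_create("tux2,tux1,tux3");
	char ranged[64];
	hostset_ranged_string(hs, sizeof(ranged), ranged);
	/* ranged now holds "tux[1-3]" */
	hostset_destroy(hs);
#endif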
2887 
2888 extern int dump_config_state_lite(void)
2889 {
2890 	static int high_buffer_size = (1024 * 1024);
2891 	int error_code = 0, log_fd;
2892 	char *old_file = NULL, *new_file = NULL, *reg_file = NULL;
2893 	Buf buffer = init_buf(high_buffer_size);
2894 
2895 	DEF_TIMERS;
2896 
2897 	START_TIMER;
2898 	/* write header: version, time */
2899 	pack16(SLURM_PROTOCOL_VERSION, buffer);
2900 	pack_time(time(NULL), buffer);
2901 	packstr(slurmctld_conf.accounting_storage_type, buffer);
2902 
2903 	/* write the buffer to file */
2904 	reg_file = xstrdup_printf("%s/last_config_lite",
2905 				  slurmctld_conf.state_save_location);
2906 	old_file = xstrdup_printf("%s.old", reg_file);
2907 	new_file = xstrdup_printf("%s.new", reg_file);
2908 
2909 	log_fd = creat(new_file, 0600);
2910 	if (log_fd < 0) {
2911 		error("Can't save state, create file %s error %m",
2912 		      new_file);
2913 		error_code = errno;
2914 	} else {
2915 		int pos = 0, nwrite = get_buf_offset(buffer), amount;
2916 		char *data = (char *)get_buf_data(buffer);
2917 		high_buffer_size = MAX(nwrite, high_buffer_size);
2918 		while (nwrite > 0) {
2919 			amount = write(log_fd, &data[pos], nwrite);
2920 			if ((amount < 0) && (errno != EINTR)) {
2921 				error("Error writing file %s, %m", new_file);
2922 				error_code = errno;
2923 				break;
2924 			}
2925 			nwrite -= MAX(amount, 0);	/* no advance on EINTR */
2926 			pos    += MAX(amount, 0);
2927 		}
2928 		fsync(log_fd);
2929 		close(log_fd);
2930 	}
2931 	if (error_code)
2932 		(void) unlink(new_file);
2933 	else {			/* file shuffle */
2934 		(void) unlink(old_file);
2935 		if (link(reg_file, old_file))
2936 			debug4("unable to create link for %s -> %s: %m",
2937 			       reg_file, old_file);
2938 		(void) unlink(reg_file);
2939 		if (link(new_file, reg_file))
2940 			debug4("unable to create link for %s -> %s: %m",
2941 			       new_file, reg_file);
2942 		(void) unlink(new_file);
2943 	}
2944 	xfree(old_file);
2945 	xfree(reg_file);
2946 	xfree(new_file);
2947 
2948 	free_buf(buffer);
2949 
2950 	END_TIMER2("dump_config_state_lite");
2951 	return error_code;
2952 
2953 }
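/*
 * Note on the file shuffle above (a summary, not new behavior): the buffer is
 * written to "last_config_lite.new", the current file is preserved as
 * "last_config_lite.old" via link(), and only then is the new file linked
 * into place as "last_config_lite", so an interrupted save never destroys the
 * previous usable copy.
 */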
2954 
2955 extern int load_config_state_lite(void)
2956 {
2957 	uint32_t uint32_tmp = 0;
2958 	uint16_t ver = 0;
2959 	char *state_file;
2960 	Buf buffer;
2961 	time_t buf_time;
2962 	char *last_accounting_storage_type = NULL;
2963 
2964 	/* Always ignore .old file */
2965 	state_file = xstrdup_printf("%s/last_config_lite",
2966 				    slurmctld_conf.state_save_location);
2967 
2968 	//info("looking at the %s file", state_file);
2969 	if (!(buffer = create_mmap_buf(state_file))) {
2970 		debug2("No last_config_lite file (%s) to recover", state_file);
2971 		xfree(state_file);
2972 		return ENOENT;
2973 	}
2974 	xfree(state_file);
2975 
2976 	safe_unpack16(&ver, buffer);
2977 	debug3("Version in last_conf_lite header is %u", ver);
2978 	if (ver > SLURM_PROTOCOL_VERSION || ver < SLURM_MIN_PROTOCOL_VERSION) {
2979 		if (!ignore_state_errors)
2980 			fatal("Can not recover last_conf_lite, incompatible version, (%u not between %d and %d), start with '-i' to ignore this. Warning: using -i will lose the data that can't be recovered.",
2981 			      ver, SLURM_MIN_PROTOCOL_VERSION,
2982 			      SLURM_PROTOCOL_VERSION);
2983 		error("***********************************************");
2984 		error("Can not recover last_conf_lite, incompatible version, "
2985 		      "(%u not between %d and %d)",
2986 		      ver, SLURM_MIN_PROTOCOL_VERSION, SLURM_PROTOCOL_VERSION);
2987 		error("***********************************************");
2988 		free_buf(buffer);
2989 		return EFAULT;
2990 	} else {
2991 		safe_unpack_time(&buf_time, buffer);
2992 		safe_unpackstr_xmalloc(&last_accounting_storage_type,
2993 				       &uint32_tmp, buffer);
2994 	}
2995 	xassert(slurmctld_conf.accounting_storage_type);
2996 
2997 	if (last_accounting_storage_type
2998 	    && !xstrcmp(last_accounting_storage_type,
2999 		        slurmctld_conf.accounting_storage_type))
3000 		slurmctld_init_db = 0;
3001 	xfree(last_accounting_storage_type);
3002 
3003 	free_buf(buffer);
3004 	return SLURM_SUCCESS;
3005 
3006 unpack_error:
3007 	if (!ignore_state_errors)
3008 		fatal("Incomplete last_config_lite checkpoint file, start with '-i' to ignore this. Warning: using -i will lose the data that can't be recovered.");
3009 	error("Incomplete last_config_lite checkpoint file");
3010 	free_buf(buffer);
3011 
3012 	return SLURM_ERROR;
3013 }
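/*
 * Illustrative sketch (not compiled): the unpack sequence above mirrors the
 * pack order used by dump_config_state_lite(), and any truncated field makes
 * the safe_unpack* macros jump to the unpack_error label.
 */
#if 0
	/* writer (dump_config_state_lite) */
	pack16(SLURM_PROTOCOL_VERSION, buffer);
	pack_time(time(NULL), buffer);
	packstr(slurmctld_conf.accounting_storage_type, buffer);

	/* reader (load_config_state_lite) */
	safe_unpack16(&ver, buffer);
	safe_unpack_time(&buf_time, buffer);
	safe_unpackstr_xmalloc(&last_accounting_storage_type, &uint32_tmp, buffer);
#endif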
3014