1 /*****************************************************************************\
2  *  node_mgr.c - manage the node records of slurm
3  *	Note: there is a global node table (node_record_table_ptr), its
4  *	hash table (node_hash_table), time stamp (last_node_update) and
5  *	configuration list (config_list)
6  *****************************************************************************
7  *  Copyright (C) 2002-2007 The Regents of the University of California.
8  *  Copyright (C) 2008-2010 Lawrence Livermore National Security.
9  *  Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
10  *  Written by Morris Jette <jette1@llnl.gov>, et. al.
11  *  CODE-OCEC-09-009. All rights reserved.
12  *
13  *  This file is part of Slurm, a resource management program.
14  *  For details, see <https://slurm.schedmd.com/>.
15  *  Please also read the included file: DISCLAIMER.
16  *
17  *  Slurm is free software; you can redistribute it and/or modify it under
18  *  the terms of the GNU General Public License as published by the Free
19  *  Software Foundation; either version 2 of the License, or (at your option)
20  *  any later version.
21  *
22  *  In addition, as a special exception, the copyright holders give permission
23  *  to link the code of portions of this program with the OpenSSL library under
24  *  certain conditions as described in each individual source file, and
25  *  distribute linked combinations including the two. You must obey the GNU
26  *  General Public License in all respects for all of the code used other than
27  *  OpenSSL. If you modify file(s) with this exception, you may extend this
28  *  exception to your version of the file(s), but you are not obligated to do
29  *  so. If you do not wish to do so, delete this exception statement from your
30  *  version.  If you delete this exception statement from all source files in
31  *  the program, then also delete it here.
32  *
33  *  Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
34  *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
35  *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
36  *  details.
37  *
38  *  You should have received a copy of the GNU General Public License along
39  *  with Slurm; if not, write to the Free Software Foundation, Inc.,
40  *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
41 \*****************************************************************************/
42 
43 #include "config.h"
44 
45 #include <ctype.h>
46 #include <errno.h>
47 #include <fcntl.h>
48 #include <stdio.h>
49 #include <stdlib.h>
50 #include <string.h>
51 #include <sys/types.h>
52 #include <sys/stat.h>
53 #include <time.h>
54 
55 #include "src/common/bitstring.h"
56 #include "src/common/fd.h"
57 #include "src/common/fetch_config.h"
58 #include "src/common/gres.h"
59 #include "src/common/hostlist.h"
60 #include "src/common/macros.h"
61 #include "src/common/node_features.h"
62 #include "src/common/node_select.h"
63 #include "src/common/pack.h"
64 #include "src/common/parse_time.h"
65 #include "src/common/power.h"
66 #include "src/common/read_config.h"
67 #include "src/common/slurm_accounting_storage.h"
68 #include "src/common/slurm_acct_gather_energy.h"
69 #include "src/common/slurm_ext_sensors.h"
70 #include "src/common/slurm_resource_info.h"
71 #include "src/common/slurm_mcs.h"
72 #include "src/common/xassert.h"
73 #include "src/common/xstring.h"
74 
75 #include "src/slurmctld/agent.h"
76 #include "src/slurmctld/front_end.h"
77 #include "src/slurmctld/locks.h"
78 #include "src/slurmctld/ping_nodes.h"
79 #include "src/slurmctld/proc_req.h"
80 #include "src/slurmctld/read_config.h"
81 #include "src/slurmctld/reservation.h"
82 #include "src/slurmctld/slurmctld.h"
83 #include "src/slurmctld/slurmctld_plugstack.h"
84 #include "src/slurmctld/state_save.h"
85 #include "src/common/timers.h"
86 #include "src/slurmctld/trigger_mgr.h"
87 
88 /* No need to change we always pack SLURM_PROTOCOL_VERSION */
89 #define NODE_STATE_VERSION        "PROTOCOL_VERSION"
90 
91 typedef enum {
92 	FEATURE_MODE_IND,  /* Print each node change indivually */
93 	FEATURE_MODE_COMB, /* Try to combine like changes */
94 	FEATURE_MODE_PEND, /* Print any pending change message */
95 } feature_mode_t;
96 
97 /* Global variables */
98 bitstr_t *avail_node_bitmap = NULL;	/* bitmap of available nodes */
99 bitstr_t *bf_ignore_node_bitmap = NULL; /* bitmap of nodes to ignore during a
100 					 * backfill cycle */
101 bitstr_t *booting_node_bitmap = NULL;	/* bitmap of booting nodes */
102 bitstr_t *cg_node_bitmap    = NULL;	/* bitmap of completing nodes */
103 bitstr_t *future_node_bitmap = NULL;	/* bitmap of FUTURE nodes */
104 bitstr_t *idle_node_bitmap  = NULL;	/* bitmap of idle nodes */
105 bitstr_t *power_node_bitmap = NULL;	/* bitmap of powered down nodes */
106 bitstr_t *share_node_bitmap = NULL;  	/* bitmap of sharable nodes */
107 bitstr_t *up_node_bitmap    = NULL;  	/* bitmap of non-down nodes */
108 bitstr_t *rs_node_bitmap    = NULL; 	/* bitmap of resuming nodes */
109 
110 static void 	_dump_node_state(node_record_t *dump_node_ptr, Buf buffer);
111 static front_end_record_t * _front_end_reg(
112 				slurm_node_registration_status_msg_t *reg_msg);
113 static bool	_is_cloud_hidden(node_record_t *node_ptr);
114 static void 	_make_node_down(node_record_t *node_ptr,
115 				time_t event_time);
116 static bool	_node_is_hidden(node_record_t *node_ptr, uid_t uid);
117 static Buf	_open_node_state_file(char **state_file);
118 static void 	_pack_node(node_record_t *dump_node_ptr, Buf buffer,
119 			   uint16_t protocol_version, uint16_t show_flags);
120 static void	_sync_bitmaps(node_record_t *node_ptr, int job_count);
121 static void	_update_config_ptr(bitstr_t *bitmap,
122 				   config_record_t *config_ptr);
123 static int	_update_node_active_features(char *node_names,
124 				char *active_features, int mode);
125 static int	_update_node_avail_features(char *node_names,
126 				char *avail_features, int mode);
127 static int	_update_node_gres(char *node_names, char *gres);
128 static int	_update_node_weight(char *node_names, uint32_t weight);
129 static bool 	_valid_node_state_change(uint32_t old, uint32_t new);
130 
131 /* dump_all_node_state - save the state of all nodes to file */
dump_all_node_state(void)132 int dump_all_node_state ( void )
133 {
134 	/* Save high-water mark to avoid buffer growth with copies */
135 	static int high_buffer_size = (1024 * 1024);
136 	int error_code = 0, inx, log_fd;
137 	char *old_file, *new_file, *reg_file;
138 	node_record_t *node_ptr;
139 	/* Locks: Read config and node */
140 	slurmctld_lock_t node_read_lock = { READ_LOCK, NO_LOCK, READ_LOCK,
141 					    NO_LOCK, NO_LOCK };
142 	Buf buffer = init_buf(high_buffer_size);
143 	DEF_TIMERS;
144 
145 	START_TIMER;
146 	/* write header: version, time */
147 	packstr(NODE_STATE_VERSION, buffer);
148 	pack16(SLURM_PROTOCOL_VERSION, buffer);
149 	pack_time(time (NULL), buffer);
150 
151 	/* write node records to buffer */
152 	lock_slurmctld (node_read_lock);
153 	for (inx = 0, node_ptr = node_record_table_ptr; inx < node_record_count;
154 	     inx++, node_ptr++) {
155 		xassert (node_ptr->magic == NODE_MAGIC);
156 		xassert (node_ptr->config_ptr->magic == CONFIG_MAGIC);
157 		_dump_node_state (node_ptr, buffer);
158 	}
159 
160 	old_file = xstrdup (slurmctld_conf.state_save_location);
161 	xstrcat (old_file, "/node_state.old");
162 	reg_file = xstrdup (slurmctld_conf.state_save_location);
163 	xstrcat (reg_file, "/node_state");
164 	new_file = xstrdup (slurmctld_conf.state_save_location);
165 	xstrcat (new_file, "/node_state.new");
166 	unlock_slurmctld (node_read_lock);
167 
168 	/* write the buffer to file */
169 	lock_state_files();
170 	log_fd = creat (new_file, 0600);
171 	if (log_fd < 0) {
172 		error ("Can't save state, error creating file %s %m", new_file);
173 		error_code = errno;
174 	} else {
175 		int pos = 0, nwrite = get_buf_offset(buffer), amount, rc;
176 		char *data = (char *)get_buf_data(buffer);
177 		high_buffer_size = MAX(nwrite, high_buffer_size);
178 		while (nwrite > 0) {
179 			amount = write(log_fd, &data[pos], nwrite);
180 			if ((amount < 0) && (errno != EINTR)) {
181 				error("Error writing file %s, %m", new_file);
182 				error_code = errno;
183 				break;
184 			}
185 			nwrite -= amount;
186 			pos    += amount;
187 		}
188 
189 		rc = fsync_and_close(log_fd, "node");
190 		if (rc && !error_code)
191 			error_code = rc;
192 	}
193 	if (error_code)
194 		(void) unlink (new_file);
195 	else {	/* file shuffle */
196 		(void) unlink (old_file);
197 		if (link(reg_file, old_file))
198 			debug4("unable to create link for %s -> %s: %m",
199 			       reg_file, old_file);
200 		(void) unlink (reg_file);
201 		if (link(new_file, reg_file))
202 			debug4("unable to create link for %s -> %s: %m",
203 			       new_file, reg_file);
204 		(void) unlink (new_file);
205 	}
206 	xfree (old_file);
207 	xfree (reg_file);
208 	xfree (new_file);
209 	unlock_state_files ();
210 
211 	free_buf (buffer);
212 	END_TIMER2("dump_all_node_state");
213 	return error_code;
214 }
215 
216 /*
217  * _dump_node_state - dump the state of a specific node to a buffer
218  * IN dump_node_ptr - pointer to node for which information is requested
219  * IN/OUT buffer - location to store data, pointers automatically advanced
220  */
_dump_node_state(node_record_t * dump_node_ptr,Buf buffer)221 static void _dump_node_state(node_record_t *dump_node_ptr, Buf buffer)
222 {
223 	packstr (dump_node_ptr->comm_name, buffer);
224 	packstr (dump_node_ptr->name, buffer);
225 	packstr (dump_node_ptr->node_hostname, buffer);
226 	packstr (dump_node_ptr->reason, buffer);
227 	packstr (dump_node_ptr->features, buffer);
228 	packstr (dump_node_ptr->features_act, buffer);
229 	packstr (dump_node_ptr->gres, buffer);
230 	packstr (dump_node_ptr->cpu_spec_list, buffer);
231 	pack32  (dump_node_ptr->next_state, buffer);
232 	pack32  (dump_node_ptr->node_state, buffer);
233 	pack32  (dump_node_ptr->cpu_bind, buffer);
234 	pack16  (dump_node_ptr->cpus, buffer);
235 	pack16  (dump_node_ptr->boards, buffer);
236 	pack16  (dump_node_ptr->sockets, buffer);
237 	pack16  (dump_node_ptr->cores, buffer);
238 	pack16  (dump_node_ptr->core_spec_cnt, buffer);
239 	pack16  (dump_node_ptr->threads, buffer);
240 	pack64  (dump_node_ptr->real_memory, buffer);
241 	pack32  (dump_node_ptr->tmp_disk, buffer);
242 	pack32  (dump_node_ptr->reason_uid, buffer);
243 	pack_time(dump_node_ptr->reason_time, buffer);
244 	pack_time(dump_node_ptr->boot_req_time, buffer);
245 	pack_time(dump_node_ptr->last_response, buffer);
246 	pack16  (dump_node_ptr->protocol_version, buffer);
247 	packstr (dump_node_ptr->mcs_label, buffer);
248 	(void) gres_plugin_node_state_pack(dump_node_ptr->gres_list, buffer,
249 					   dump_node_ptr->name);
250 }
251 
252 
253 /* Open the node state save file, or backup if necessary.
254  * state_file IN - the name of the state save file used
255  * RET the file description to read from or error code
256  */
_open_node_state_file(char ** state_file)257 static Buf _open_node_state_file(char **state_file)
258 {
259 	Buf buf;
260 
261 	*state_file = xstrdup(slurmctld_conf.state_save_location);
262 	xstrcat(*state_file, "/node_state");
263 
264 	if (!(buf = create_mmap_buf(*state_file)))
265 		error("Could not open node state file %s: %m", *state_file);
266 	else
267 		return buf;
268 
269 	error("NOTE: Trying backup state save file. Information may be lost!");
270 	xstrcat(*state_file, ".old");
271 	return create_mmap_buf(*state_file);
272 }
273 
274 /*
275  * load_all_node_state - Load the node state from file, recover on slurmctld
276  *	restart. Execute this after loading the configuration file data.
277  *	Data goes into common storage.
278  * IN state_only - if true, overwrite only node state and reason
279  *	Use this to overwrite the "UNKNOWN state typically used in slurm.conf
280  * RET 0 or error code
281  */
load_all_node_state(bool state_only)282 extern int load_all_node_state ( bool state_only )
283 {
284 	char *comm_name = NULL, *node_hostname = NULL;
285 	char *node_name = NULL, *reason = NULL, *state_file;
286 	char *features = NULL, *features_act = NULL;
287 	char *gres = NULL, *cpu_spec_list = NULL;
288 	char *mcs_label = NULL;
289 	int error_code = 0, node_cnt = 0;
290 	uint16_t core_spec_cnt = 0;
291 	uint32_t node_state, cpu_bind = 0, next_state = NO_VAL;
292 	uint16_t cpus = 1, boards = 1, sockets = 1, cores = 1, threads = 1;
293 	uint64_t real_memory;
294 	uint32_t tmp_disk, name_len;
295 	uint32_t reason_uid = NO_VAL;
296 	time_t boot_req_time = 0, reason_time = 0, last_response = 0;
297 	List gres_list = NULL;
298 	node_record_t *node_ptr;
299 	time_t time_stamp, now = time(NULL);
300 	Buf buffer;
301 	char *ver_str = NULL;
302 	hostset_t hs = NULL;
303 	hostlist_t down_nodes = NULL;
304 	bool power_save_mode = false;
305 	uint16_t protocol_version = NO_VAL16;
306 
307 	xassert(verify_lock(CONF_LOCK, READ_LOCK));
308 
309 	if (slurmctld_conf.suspend_program && slurmctld_conf.resume_program)
310 		power_save_mode = true;
311 
312 	/* read the file */
313 	lock_state_files ();
314 	buffer = _open_node_state_file(&state_file);
315 	if (!buffer) {
316 		info("No node state file (%s) to recover", state_file);
317 		xfree(state_file);
318 		unlock_state_files();
319 		return ENOENT;
320 	}
321 	xfree(state_file);
322 	unlock_state_files();
323 
324 	safe_unpackstr_xmalloc( &ver_str, &name_len, buffer);
325 	debug3("Version string in node_state header is %s", ver_str);
326 	if (ver_str && !xstrcmp(ver_str, NODE_STATE_VERSION))
327 		safe_unpack16(&protocol_version, buffer);
328 
329 	if (!protocol_version || (protocol_version == NO_VAL16)) {
330 		if (!ignore_state_errors)
331 			fatal("Can not recover node state, data version incompatible, start with '-i' to ignore this. Warning: using -i will lose the data that can't be recovered.");
332 		error("*****************************************************");
333 		error("Can not recover node state, data version incompatible");
334 		error("*****************************************************");
335 		xfree(ver_str);
336 		free_buf(buffer);
337 		return EFAULT;
338 	}
339 	xfree(ver_str);
340 
341 	safe_unpack_time (&time_stamp, buffer);
342 
343 	while (remaining_buf (buffer) > 0) {
344 		uint32_t base_state;
345 		uint16_t obj_protocol_version = NO_VAL16;
346 		if (protocol_version >= SLURM_19_05_PROTOCOL_VERSION) {
347 			safe_unpackstr_xmalloc (&comm_name, &name_len, buffer);
348 			safe_unpackstr_xmalloc (&node_name, &name_len, buffer);
349 			safe_unpackstr_xmalloc (&node_hostname,
350 							    &name_len, buffer);
351 			safe_unpackstr_xmalloc (&reason,    &name_len, buffer);
352 			safe_unpackstr_xmalloc (&features,  &name_len, buffer);
353 			safe_unpackstr_xmalloc (&features_act,&name_len,buffer);
354 			safe_unpackstr_xmalloc (&gres,      &name_len, buffer);
355 			safe_unpackstr_xmalloc (&cpu_spec_list,
356 							    &name_len, buffer);
357 			safe_unpack32 (&next_state,  buffer);
358 			safe_unpack32 (&node_state,  buffer);
359 			safe_unpack32 (&cpu_bind,    buffer);
360 			safe_unpack16 (&cpus,        buffer);
361 			safe_unpack16 (&boards,     buffer);
362 			safe_unpack16 (&sockets,     buffer);
363 			safe_unpack16 (&cores,       buffer);
364 			safe_unpack16 (&core_spec_cnt, buffer);
365 			safe_unpack16 (&threads,     buffer);
366 			safe_unpack64 (&real_memory, buffer);
367 			safe_unpack32 (&tmp_disk,    buffer);
368 			safe_unpack32 (&reason_uid,  buffer);
369 			safe_unpack_time (&reason_time, buffer);
370 			safe_unpack_time (&boot_req_time, buffer);
371 			safe_unpack_time(&last_response, buffer);
372 			safe_unpack16 (&obj_protocol_version, buffer);
373 			safe_unpackstr_xmalloc (&mcs_label, &name_len, buffer);
374 			if (gres_plugin_node_state_unpack(
375 				    &gres_list, buffer, node_name,
376 				    protocol_version) != SLURM_SUCCESS)
377 				goto unpack_error;
378 			base_state = node_state & NODE_STATE_BASE;
379 		} else if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
380 			safe_unpackstr_xmalloc (&comm_name, &name_len, buffer);
381 			safe_unpackstr_xmalloc (&node_name, &name_len, buffer);
382 			safe_unpackstr_xmalloc (&node_hostname,
383 							    &name_len, buffer);
384 			safe_unpackstr_xmalloc (&reason,    &name_len, buffer);
385 			safe_unpackstr_xmalloc (&features,  &name_len, buffer);
386 			safe_unpackstr_xmalloc (&features_act,&name_len,buffer);
387 			safe_unpackstr_xmalloc (&gres,      &name_len, buffer);
388 			safe_unpackstr_xmalloc (&cpu_spec_list,
389 							    &name_len, buffer);
390 			safe_unpack32 (&next_state,  buffer);
391 			safe_unpack32 (&node_state,  buffer);
392 			safe_unpack32 (&cpu_bind,    buffer);
393 			safe_unpack16 (&cpus,        buffer);
394 			safe_unpack16 (&boards,     buffer);
395 			safe_unpack16 (&sockets,     buffer);
396 			safe_unpack16 (&cores,       buffer);
397 			safe_unpack16 (&core_spec_cnt, buffer);
398 			safe_unpack16 (&threads,     buffer);
399 			safe_unpack64 (&real_memory, buffer);
400 			safe_unpack32 (&tmp_disk,    buffer);
401 			safe_unpack32 (&reason_uid,  buffer);
402 			safe_unpack_time (&reason_time, buffer);
403 			safe_unpack_time (&boot_req_time, buffer);
404 			safe_unpack16 (&obj_protocol_version, buffer);
405 			safe_unpackstr_xmalloc (&mcs_label, &name_len, buffer);
406 			if (gres_plugin_node_state_unpack(
407 				    &gres_list, buffer, node_name,
408 				    protocol_version) != SLURM_SUCCESS)
409 				goto unpack_error;
410 			base_state = node_state & NODE_STATE_BASE;
411 		} else {
412 			error("%s: protocol_version %hu not supported",
413 			      __func__, protocol_version);
414 			goto unpack_error;
415 		}
416 
417 		/* validity test as possible */
418 		if ((cpus == 0) ||
419 		    (boards == 0) ||
420 		    (sockets == 0) ||
421 		    (cores == 0) ||
422 		    (threads == 0) ||
423 		    (base_state  >= NODE_STATE_END)) {
424 			error("Invalid data for node %s: procs=%u, boards=%u, "
425 			       "sockets=%u, cores=%u, threads=%u, state=%u",
426 				node_name, cpus, boards,
427 				sockets, cores, threads, node_state);
428 			error("No more node data will be processed from the checkpoint file");
429 			goto unpack_error;
430 
431 		}
432 
433 		/* find record and perform update */
434 		node_ptr = find_node_record (node_name);
435 		if (node_ptr == NULL) {
436 			error ("Node %s has vanished from configuration",
437 			       node_name);
438 		} else if (state_only) {
439 			uint32_t orig_flags;
440 			if (IS_NODE_CLOUD(node_ptr)) {
441 				if ((!power_save_mode) &&
442 				    ((node_state & NODE_STATE_POWER_SAVE) ||
443 	 			     (node_state & NODE_STATE_POWER_UP))) {
444 					node_state &= (~NODE_STATE_POWER_SAVE);
445 					node_state &= (~NODE_STATE_POWER_UP);
446 					node_state &= (~NODE_STATE_POWERING_DOWN);
447 					if (hs)
448 						hostset_insert(hs, node_name);
449 					else
450 						hs = hostset_create(node_name);
451 				}
452 				if (comm_name && node_hostname) {
453 					/* Recover NodeAddr and NodeHostName */
454 					xfree(node_ptr->comm_name);
455 					node_ptr->comm_name = comm_name;
456 					comm_name = NULL;  /* Nothing to free */
457 					xfree(node_ptr->node_hostname);
458 					node_ptr->node_hostname = node_hostname;
459 					node_hostname = NULL;  /* Nothing to free */
460 					slurm_reset_alias(node_ptr->name,
461 							  node_ptr->comm_name,
462 							  node_ptr->node_hostname);
463 				}
464 				node_ptr->node_state    = node_state;
465 			} else if (IS_NODE_UNKNOWN(node_ptr)) {
466 				if (base_state == NODE_STATE_DOWN) {
467 					orig_flags = node_ptr->node_state &
468 						     NODE_STATE_FLAGS;
469 					node_ptr->node_state = NODE_STATE_DOWN
470 						| orig_flags;
471 				}
472 				if (node_state & NODE_STATE_DRAIN)
473 					 node_ptr->node_state |=
474 						 NODE_STATE_DRAIN;
475 				if (node_state & NODE_STATE_FAIL)
476 					node_ptr->node_state |=
477 						NODE_STATE_FAIL;
478 				if (node_state & NODE_STATE_POWER_SAVE) {
479 					if (power_save_mode &&
480 					    IS_NODE_UNKNOWN(node_ptr)) {
481 						orig_flags = node_ptr->
482 							node_state &
483 							     NODE_STATE_FLAGS;
484 						node_ptr->node_state =
485 							NODE_STATE_IDLE |
486 							orig_flags |
487 							NODE_STATE_POWER_SAVE;
488 					} else if (power_save_mode) {
489 						node_ptr->node_state |=
490 							NODE_STATE_POWER_SAVE;
491 					} else if (hs)
492 						hostset_insert(hs, node_name);
493 					else
494 						hs = hostset_create(node_name);
495 					/* Recover hardware state for powered
496 					 * down nodes */
497 					node_ptr->cpus          = cpus;
498 					node_ptr->boards        = boards;
499 					node_ptr->sockets       = sockets;
500 					node_ptr->cores         = cores;
501 					node_ptr->core_spec_cnt =
502 						core_spec_cnt;
503 					xfree(node_ptr->cpu_spec_list);
504 					node_ptr->cpu_spec_list =
505 						cpu_spec_list;
506 					cpu_spec_list = NULL;/* Nothing */
507 							     /* to free */
508 					node_ptr->threads       = threads;
509 					node_ptr->real_memory   = real_memory;
510 					node_ptr->tmp_disk      = tmp_disk;
511 				}
512 				if (node_state & NODE_STATE_MAINT)
513 					node_ptr->node_state |= NODE_STATE_MAINT;
514 				if (node_state & NODE_STATE_REBOOT)
515 					node_ptr->node_state |= NODE_STATE_REBOOT;
516 				if (node_state & NODE_STATE_POWER_UP) {
517 					if (power_save_mode) {
518 						node_ptr->node_state |=
519 							NODE_STATE_POWER_UP;
520 					} else if (hs)
521 						hostset_insert(hs, node_name);
522 					else
523 						hs = hostset_create(node_name);
524 				}
525 			}
526 			if (node_ptr->reason == NULL) {
527 				node_ptr->reason = reason;
528 				reason = NULL;	/* Nothing to free */
529 				node_ptr->reason_time = reason_time;
530 				node_ptr->reason_uid = reason_uid;
531 			}
532 
533 			if (IS_NODE_POWER_UP(node_ptr) ||
534 			    IS_NODE_REBOOT(node_ptr))
535 				node_ptr->boot_req_time = boot_req_time;
536 
537 			xfree(node_ptr->features_act);
538 			node_ptr->features_act	= features_act;
539 			features_act		= NULL;	/* Nothing to free */
540 			node_ptr->gres_list	= gres_list;
541 			gres_list		= NULL;	/* Nothing to free */
542 		} else {
543 			if ((!power_save_mode) &&
544 			    ((node_state & NODE_STATE_POWER_SAVE) ||
545  			     (node_state & NODE_STATE_POWER_UP))) {
546 				node_state &= (~NODE_STATE_POWER_SAVE);
547 				node_state &= (~NODE_STATE_POWER_UP);
548 				if (hs)
549 					hostset_insert(hs, node_name);
550 				else
551 					hs = hostset_create(node_name);
552 			}
553 			if (IS_NODE_CLOUD(node_ptr) &&
554 			    comm_name && node_hostname) {
555 				/* Recover NodeAddr and NodeHostName */
556 				xfree(node_ptr->comm_name);
557 				node_ptr->comm_name = comm_name;
558 				comm_name = NULL;	/* Nothing to free */
559 				xfree(node_ptr->node_hostname);
560 				node_ptr->node_hostname = node_hostname;
561 				node_hostname = NULL;	/* Nothing to free */
562 				slurm_reset_alias(node_ptr->name,
563 						  node_ptr->comm_name,
564 						  node_ptr->node_hostname);
565 			}
566 			node_ptr->node_state    = node_state;
567 			xfree(node_ptr->reason);
568 			node_ptr->reason	= reason;
569 			reason			= NULL;	/* Nothing to free */
570 			node_ptr->reason_time	= reason_time;
571 			node_ptr->reason_uid	= reason_uid;
572 			xfree(node_ptr->features);
573 			node_ptr->features	= features;
574 			features		= NULL;	/* Nothing to free */
575 			xfree(node_ptr->features_act);
576 			node_ptr->features_act	= features_act;
577 			features_act		= NULL;	/* Nothing to free */
578 			xfree(node_ptr->gres);
579 			node_ptr->gres 		= gres;
580 			gres			= NULL;	/* Nothing to free */
581 			node_ptr->gres_list	= gres_list;
582 			gres_list		= NULL;	/* Nothing to free */
583 			xfree(node_ptr->cpu_spec_list);
584 			node_ptr->cpu_spec_list = cpu_spec_list;
585 			cpu_spec_list 		= NULL; /* Nothing to free */
586 			node_ptr->part_cnt      = 0;
587 			xfree(node_ptr->part_pptr);
588 			node_ptr->cpu_bind      = cpu_bind;
589 			node_ptr->cpus          = cpus;
590 			node_ptr->boards        = boards;
591 			node_ptr->sockets       = sockets;
592 			node_ptr->cores         = cores;
593 			node_ptr->core_spec_cnt = core_spec_cnt;
594 			node_ptr->threads       = threads;
595 			node_ptr->real_memory   = real_memory;
596 			node_ptr->tmp_disk      = tmp_disk;
597 			xfree(node_ptr->mcs_label);
598 			node_ptr->mcs_label	= mcs_label;
599 			mcs_label		= NULL; /* Nothing to free */
600 		}
601 
602 		if (node_ptr) {
603 			node_cnt++;
604 
605 			node_ptr->next_state = next_state;
606 
607 			if (IS_NODE_DOWN(node_ptr)) {
608 				if (down_nodes)
609 					hostlist_push(down_nodes, node_name);
610 				else
611 					down_nodes = hostlist_create(
612 							node_name);
613 			}
614 
615 			node_ptr->last_response = last_response;
616 			if (!node_ptr->last_response) {
617 				/*
618 				 * last_response value not saved, make best
619 				 * guess.
620 				 */
621 				if (IS_NODE_POWER_UP(node_ptr))
622 					node_ptr->last_response = now +
623 						slurmctld_conf.resume_timeout;
624 				else if (IS_NODE_POWERING_DOWN(node_ptr))
625 					node_ptr->last_response = now +
626 						slurmctld_conf.suspend_timeout;
627 			}
628 
629 			if (obj_protocol_version &&
630 			    (obj_protocol_version != NO_VAL16))
631 				node_ptr->protocol_version =
632 					obj_protocol_version;
633 			else
634 				node_ptr->protocol_version = protocol_version;
635 
636 			/* Sanity check to make sure we can take a version we
637 			 * actually understand.
638 			 */
639 			if (node_ptr->protocol_version <
640 			    SLURM_MIN_PROTOCOL_VERSION)
641 				node_ptr->protocol_version =
642 					SLURM_MIN_PROTOCOL_VERSION;
643 
644 			if (!IS_NODE_POWER_SAVE(node_ptr))
645 				node_ptr->last_idle = now;
646 		}
647 
648 		xfree(features);
649 		xfree(features_act);
650 		xfree(gres);
651 		FREE_NULL_LIST(gres_list);
652 		xfree (comm_name);
653 		xfree (node_hostname);
654 		xfree (node_name);
655 		xfree(reason);
656 		xfree(cpu_spec_list);
657 	}
658 
659 fini:	info("Recovered state of %d nodes", node_cnt);
660 	if (hs) {
661 		char node_names[128];
662 		hostset_ranged_string(hs, sizeof(node_names), node_names);
663 		info("Cleared POWER_SAVE flag from nodes %s", node_names);
664 		hostset_destroy(hs);
665 	}
666 
667 	if (down_nodes) {
668 		char *down_host_str = NULL;
669 		down_host_str = hostlist_ranged_string_xmalloc(down_nodes);
670 		info("Down nodes: %s", down_host_str);
671 		xfree(down_host_str);
672 		hostlist_destroy(down_nodes);
673 	}
674 
675 	free_buf (buffer);
676 	return error_code;
677 
678 unpack_error:
679 	if (!ignore_state_errors)
680 		fatal("Incomplete node data checkpoint file, start with '-i' to ignore this. Warning: using -i will lose the data that can't be recovered.");
681 	error("Incomplete node data checkpoint file");
682 	error_code = EFAULT;
683 	xfree(features);
684 	xfree(gres);
685 	FREE_NULL_LIST(gres_list);
686 	xfree(comm_name);
687 	xfree(node_hostname);
688 	xfree(node_name);
689 	xfree(reason);
690 	goto fini;
691 }
692 
693 
694 /* list_compare_config - compare two entry from the config list based upon
695  *	weight, see common/list.h for documentation */
list_compare_config(void * config_entry1,void * config_entry2)696 int list_compare_config (void *config_entry1, void *config_entry2)
697 {
698 	int weight1, weight2;
699 	config_record_t *c1 = *(config_record_t **) config_entry1;
700 	config_record_t *c2 = *(config_record_t **) config_entry2;
701 
702 	weight1 = c1->weight;
703 	weight2 = c2->weight;
704 
705 	return (weight1 - weight2);
706 }
707 
708 /* Return true if the node should be hidden by virtue of being powered down
709  * and in the cloud. */
_is_cloud_hidden(node_record_t * node_ptr)710 static bool _is_cloud_hidden(node_record_t *node_ptr)
711 {
712 	if (((slurmctld_conf.private_data & PRIVATE_CLOUD_NODES) == 0) &&
713 	    IS_NODE_CLOUD(node_ptr) && IS_NODE_POWER_SAVE(node_ptr))
714 		return true;
715 	return false;
716 }
717 
_node_is_hidden(node_record_t * node_ptr,uid_t uid)718 static bool _node_is_hidden(node_record_t *node_ptr, uid_t uid)
719 {
720 	int i;
721 
722 	if ((slurmctld_conf.private_data & PRIVATE_DATA_NODES)
723 	    && (slurm_mcs_get_privatedata() == 1)
724 	    && !validate_operator(uid)
725 	    && (mcs_g_check_mcs_label(uid, node_ptr->mcs_label) != 0))
726 		return true;
727 
728 	if (!node_ptr->part_cnt)
729 		return false;
730 
731 	for (i = 0; i < node_ptr->part_cnt; i++) {
732 		/* return false if the node belongs to any visible partition */
733 		if (part_is_visible(node_ptr->part_pptr[i], uid)) {
734 			return false;
735 		}
736 	}
737 
738 	return true;
739 }
740 
741 /*
742  * pack_all_node - dump all configuration and node information for all nodes
743  *	in machine independent form (for network transmission)
744  * OUT buffer_ptr - pointer to the stored data
745  * OUT buffer_size - set to size of the buffer in bytes
746  * IN show_flags - node filtering options
747  * IN uid - uid of user making request (for partition filtering)
748  * IN protocol_version - slurm protocol version of client
749  * global: node_record_table_ptr - pointer to global node table
750  * NOTE: the caller must xfree the buffer at *buffer_ptr
751  * NOTE: change slurm_load_node() in api/node_info.c when data format changes
752  */
pack_all_node(char ** buffer_ptr,int * buffer_size,uint16_t show_flags,uid_t uid,uint16_t protocol_version)753 extern void pack_all_node (char **buffer_ptr, int *buffer_size,
754 			   uint16_t show_flags, uid_t uid,
755 			   uint16_t protocol_version)
756 {
757 	int inx;
758 	uint32_t nodes_packed, tmp_offset;
759 	Buf buffer;
760 	time_t now = time(NULL);
761 	node_record_t *node_ptr = node_record_table_ptr;
762 	bool hidden;
763 
764 	xassert(verify_lock(CONF_LOCK, READ_LOCK));
765 	xassert(verify_lock(PART_LOCK, READ_LOCK));
766 
767 	buffer_ptr[0] = NULL;
768 	*buffer_size = 0;
769 
770 	buffer = init_buf (BUF_SIZE*16);
771 	nodes_packed = 0;
772 
773 	if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
774 		/* write header: count and time */
775 		pack32(nodes_packed, buffer);
776 		pack_time(now, buffer);
777 
778 		/* write node records */
779 		for (inx = 0; inx < node_record_count; inx++, node_ptr++) {
780 			xassert(node_ptr->magic == NODE_MAGIC);
781 			xassert(node_ptr->config_ptr->magic == CONFIG_MAGIC);
782 
783 			/*
784 			 * We can't avoid packing node records without breaking
785 			 * the node index pointers. So pack a node with a name
786 			 * of NULL and let the caller deal with it.
787 			 */
788 			hidden = false;
789 			if (((show_flags & SHOW_ALL) == 0) && (uid != 0) &&
790 			    (_node_is_hidden(node_ptr, uid)))
791 				hidden = true;
792 			else if (IS_NODE_FUTURE(node_ptr) &&
793 				 (!(show_flags & SHOW_FUTURE)))
794 				hidden = true;
795 			else if (_is_cloud_hidden(node_ptr))
796 				hidden = true;
797 			else if ((node_ptr->name == NULL) ||
798 				 (node_ptr->name[0] == '\0'))
799 				hidden = true;
800 
801 			if (hidden) {
802 				char *orig_name = node_ptr->name;
803 				node_ptr->name = NULL;
804 				_pack_node(node_ptr, buffer, protocol_version,
805 				           show_flags);
806 				node_ptr->name = orig_name;
807 			} else {
808 				_pack_node(node_ptr, buffer, protocol_version,
809 					   show_flags);
810 			}
811 			nodes_packed++;
812 		}
813 	} else {
814 		error("select_g_select_jobinfo_pack: protocol_version "
815 		      "%hu not supported", protocol_version);
816 	}
817 
818 	tmp_offset = get_buf_offset (buffer);
819 	set_buf_offset (buffer, 0);
820 	pack32  (nodes_packed, buffer);
821 	set_buf_offset (buffer, tmp_offset);
822 
823 	*buffer_size = get_buf_offset (buffer);
824 	buffer_ptr[0] = xfer_buf_data (buffer);
825 }
826 
827 /*
828  * pack_one_node - dump all configuration and node information for one node
829  *	in machine independent form (for network transmission)
830  * OUT buffer_ptr - pointer to the stored data
831  * OUT buffer_size - set to size of the buffer in bytes
832  * IN show_flags - node filtering options
833  * IN uid - uid of user making request (for partition filtering)
834  * IN node_name - name of node for which information is desired,
835  *		  use first node if name is NULL
836  * IN protocol_version - slurm protocol version of client
837  * global: node_record_table_ptr - pointer to global node table
838  * NOTE: the caller must xfree the buffer at *buffer_ptr
839  * NOTE: change slurm_load_node() in api/node_info.c when data format changes
840  */
pack_one_node(char ** buffer_ptr,int * buffer_size,uint16_t show_flags,uid_t uid,char * node_name,uint16_t protocol_version)841 extern void pack_one_node (char **buffer_ptr, int *buffer_size,
842 			   uint16_t show_flags, uid_t uid, char *node_name,
843 			   uint16_t protocol_version)
844 {
845 	uint32_t nodes_packed, tmp_offset;
846 	Buf buffer;
847 	time_t now = time(NULL);
848 	node_record_t *node_ptr;
849 	bool hidden;
850 
851 	xassert(verify_lock(CONF_LOCK, READ_LOCK));
852 	xassert(verify_lock(PART_LOCK, READ_LOCK));
853 
854 	buffer_ptr[0] = NULL;
855 	*buffer_size = 0;
856 
857 	buffer = init_buf (BUF_SIZE);
858 	nodes_packed = 0;
859 
860 	if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
861 		/* write header: count and time */
862 		pack32(nodes_packed, buffer);
863 		pack_time(now, buffer);
864 
865 		/* write node records */
866 		if (node_name)
867 			node_ptr = find_node_record(node_name);
868 		else
869 			node_ptr = node_record_table_ptr;
870 		if (node_ptr) {
871 			hidden = false;
872 			if (((show_flags & SHOW_ALL) == 0) && (uid != 0) &&
873 			    (_node_is_hidden(node_ptr, uid)))
874 				hidden = true;
875 			else if (IS_NODE_FUTURE(node_ptr) &&
876 				 (!(show_flags & SHOW_FUTURE)))
877 				hidden = true;
878 //			Don't hide the node if explicitly requested by name
879 //			else if (_is_cloud_hidden(node_ptr))
880 //				hidden = true;
881 			else if ((node_ptr->name == NULL) ||
882 				 (node_ptr->name[0] == '\0'))
883 				hidden = true;
884 
885 			if (!hidden) {
886 				_pack_node(node_ptr, buffer, protocol_version,
887 					   show_flags);
888 				nodes_packed++;
889 			}
890 		}
891 	} else {
892 		error("select_g_select_jobinfo_pack: protocol_version "
893 		      "%hu not supported", protocol_version);
894 	}
895 
896 	tmp_offset = get_buf_offset (buffer);
897 	set_buf_offset (buffer, 0);
898 	pack32  (nodes_packed, buffer);
899 	set_buf_offset (buffer, tmp_offset);
900 
901 	*buffer_size = get_buf_offset (buffer);
902 	buffer_ptr[0] = xfer_buf_data (buffer);
903 }
904 
905 /*
906  * _pack_node - dump all configuration information about a specific node in
907  *	machine independent form (for network transmission)
908  * IN dump_node_ptr - pointer to node for which information is requested
909  * IN/OUT buffer - buffer where data is placed, pointers automatically updated
910  * IN protocol_version - slurm protocol version of client
911  * IN show_flags -
912  * NOTE: if you make any changes here be sure to make the corresponding changes
913  * 	to _unpack_node_info_members() in common/slurm_protocol_pack.c
914  */
_pack_node(node_record_t * dump_node_ptr,Buf buffer,uint16_t protocol_version,uint16_t show_flags)915 static void _pack_node(node_record_t *dump_node_ptr, Buf buffer,
916 		       uint16_t protocol_version, uint16_t show_flags)
917 {
918 	char *gres_drain = NULL, *gres_used = NULL;
919 
920 	xassert(verify_lock(CONF_LOCK, READ_LOCK));
921 
922 
923 	if (protocol_version >= SLURM_20_02_PROTOCOL_VERSION) {
924 		packstr(dump_node_ptr->name, buffer);
925 		packstr(dump_node_ptr->node_hostname, buffer);
926 		packstr(dump_node_ptr->comm_name, buffer);
927 		packstr(dump_node_ptr->bcast_address, buffer);
928 		pack16(dump_node_ptr->port, buffer);
929 		pack32(dump_node_ptr->next_state, buffer);
930 		pack32(dump_node_ptr->node_state, buffer);
931 		packstr(dump_node_ptr->version, buffer);
932 
933 		/* Only data from config_record used for scheduling */
934 		pack16(dump_node_ptr->config_ptr->cpus, buffer);
935 		pack16(dump_node_ptr->config_ptr->boards, buffer);
936 		pack16(dump_node_ptr->config_ptr->sockets, buffer);
937 		pack16(dump_node_ptr->config_ptr->cores, buffer);
938 		pack16(dump_node_ptr->config_ptr->threads, buffer);
939 		pack64(dump_node_ptr->config_ptr->real_memory, buffer);
940 		pack32(dump_node_ptr->config_ptr->tmp_disk, buffer);
941 
942 		packstr(dump_node_ptr->mcs_label, buffer);
943 		pack32(dump_node_ptr->owner, buffer);
944 		pack16(dump_node_ptr->core_spec_cnt, buffer);
945 		pack32(dump_node_ptr->cpu_bind, buffer);
946 		pack64(dump_node_ptr->mem_spec_limit, buffer);
947 		packstr(dump_node_ptr->cpu_spec_list, buffer);
948 
949 		pack32(dump_node_ptr->cpu_load, buffer);
950 		pack64(dump_node_ptr->free_mem, buffer);
951 		pack32(dump_node_ptr->config_ptr->weight, buffer);
952 		pack32(dump_node_ptr->reason_uid, buffer);
953 
954 		pack_time(dump_node_ptr->boot_time, buffer);
955 		pack_time(dump_node_ptr->reason_time, buffer);
956 		pack_time(dump_node_ptr->slurmd_start_time, buffer);
957 
958 		select_g_select_nodeinfo_pack(dump_node_ptr->select_nodeinfo,
959 					      buffer, protocol_version);
960 
961 		packstr(dump_node_ptr->arch, buffer);
962 		packstr(dump_node_ptr->features, buffer);
963 		packstr(dump_node_ptr->features_act, buffer);
964 		if (dump_node_ptr->gres)
965 			packstr(dump_node_ptr->gres, buffer);
966 		else
967 			packstr(dump_node_ptr->config_ptr->gres, buffer);
968 
969 		/* Gathering GRES details is slow, so don't by default */
970 		if (show_flags & SHOW_DETAIL) {
971 			gres_drain =
972 				gres_get_node_drain(dump_node_ptr->gres_list);
973 			gres_used  =
974 				gres_get_node_used(dump_node_ptr->gres_list);
975 		}
976 		packstr(gres_drain, buffer);
977 		packstr(gres_used, buffer);
978 		xfree(gres_drain);
979 		xfree(gres_used);
980 
981 		packstr(dump_node_ptr->os, buffer);
982 		packstr(dump_node_ptr->reason, buffer);
983 		acct_gather_energy_pack(dump_node_ptr->energy, buffer,
984 					protocol_version);
985 		ext_sensors_data_pack(dump_node_ptr->ext_sensors, buffer,
986 				      protocol_version);
987 		power_mgmt_data_pack(dump_node_ptr->power, buffer,
988 				     protocol_version);
989 
990 		packstr(dump_node_ptr->tres_fmt_str, buffer);
991 	} else if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
992 		packstr (dump_node_ptr->name, buffer);
993 		packstr (dump_node_ptr->node_hostname, buffer);
994 		packstr (dump_node_ptr->comm_name, buffer);
995 		pack16(dump_node_ptr->port, buffer);
996 		pack32(dump_node_ptr->next_state, buffer);
997 		pack32(dump_node_ptr->node_state, buffer);
998 		packstr (dump_node_ptr->version, buffer);
999 
1000 		/* Only data from config_record used for scheduling */
1001 		pack16(dump_node_ptr->config_ptr->cpus, buffer);
1002 		pack16(dump_node_ptr->config_ptr->boards, buffer);
1003 		pack16(dump_node_ptr->config_ptr->sockets, buffer);
1004 		pack16(dump_node_ptr->config_ptr->cores, buffer);
1005 		pack16(dump_node_ptr->config_ptr->threads, buffer);
1006 		pack64(dump_node_ptr->config_ptr->real_memory, buffer);
1007 		pack32(dump_node_ptr->config_ptr->tmp_disk, buffer);
1008 
1009 		packstr(dump_node_ptr->mcs_label, buffer);
1010 		pack32(dump_node_ptr->owner, buffer);
1011 		pack16(dump_node_ptr->core_spec_cnt, buffer);
1012 		pack32(dump_node_ptr->cpu_bind, buffer);
1013 		pack64(dump_node_ptr->mem_spec_limit, buffer);
1014 		packstr(dump_node_ptr->cpu_spec_list, buffer);
1015 
1016 		pack32(dump_node_ptr->cpu_load, buffer);
1017 		pack64(dump_node_ptr->free_mem, buffer);
1018 		pack32(dump_node_ptr->config_ptr->weight, buffer);
1019 		pack32(dump_node_ptr->reason_uid, buffer);
1020 
1021 		pack_time(dump_node_ptr->boot_time, buffer);
1022 		pack_time(dump_node_ptr->reason_time, buffer);
1023 		pack_time(dump_node_ptr->slurmd_start_time, buffer);
1024 
1025 		select_g_select_nodeinfo_pack(dump_node_ptr->select_nodeinfo,
1026 					      buffer, protocol_version);
1027 
1028 		packstr(dump_node_ptr->arch, buffer);
1029 		packstr(dump_node_ptr->features, buffer);
1030 		packstr(dump_node_ptr->features_act, buffer);
1031 		if (dump_node_ptr->gres)
1032 			packstr(dump_node_ptr->gres, buffer);
1033 		else
1034 			packstr(dump_node_ptr->config_ptr->gres, buffer);
1035 
1036 		/* Gathering GRES details is slow, so don't by default */
1037 		if (show_flags & SHOW_DETAIL) {
1038 			gres_drain =
1039 				gres_get_node_drain(dump_node_ptr->gres_list);
1040 			gres_used  =
1041 				gres_get_node_used(dump_node_ptr->gres_list);
1042 		}
1043 		packstr(gres_drain, buffer);
1044 		packstr(gres_used, buffer);
1045 		xfree(gres_drain);
1046 		xfree(gres_used);
1047 
1048 		packstr(dump_node_ptr->os, buffer);
1049 		packstr(dump_node_ptr->reason, buffer);
1050 		acct_gather_energy_pack(dump_node_ptr->energy, buffer,
1051 					protocol_version);
1052 		ext_sensors_data_pack(dump_node_ptr->ext_sensors, buffer,
1053 				      protocol_version);
1054 		power_mgmt_data_pack(dump_node_ptr->power, buffer,
1055 				     protocol_version);
1056 
1057 		packstr(dump_node_ptr->tres_fmt_str,buffer);
1058 	} else {
1059 		error("_pack_node: protocol_version "
1060 		      "%hu not supported", protocol_version);
1061 	}
1062 }
1063 
1064 /* Return "true" if a node's state is already "new_state". This is more
1065  * complex than simply comparing the state values due to flags (e.g.
1066  * A node might be DOWN + NO_RESPOND or IDLE + DRAIN) */
_equivalent_node_state(node_record_t * node_ptr,uint32_t new_state)1067 static bool _equivalent_node_state(node_record_t *node_ptr, uint32_t new_state)
1068 {
1069 	if (new_state == NO_VAL)	/* No change */
1070 		return true;
1071 	if ((new_state == NODE_STATE_DOWN)  && IS_NODE_DOWN(node_ptr))
1072 		return true;
1073 	if ((new_state == NODE_STATE_DRAIN) && IS_NODE_DRAIN(node_ptr))
1074 		return true;
1075 	if ((new_state == NODE_STATE_FAIL)  && IS_NODE_FAIL(node_ptr))
1076 		return true;
1077 	/* Other states might be added here */
1078 	return false;
1079 }
1080 
1081 /* Confirm that the selected ActiveFeatures are a subset of AvailableFeatures */
_valid_features_act(char * features_act,char * features)1082 static bool _valid_features_act(char *features_act, char *features)
1083 {
1084 	bool valid_subset = true;
1085 	char *tmp_act, *last_act = NULL, *tok_act;
1086 	char *tmp_avail, *last_avail = NULL, *tok_avail;
1087 
1088 	if (!features_act || (features_act[0] == '\0'))
1089 		return true;
1090 	if (!features || (features[0] == '\0'))
1091 		return false;
1092 
1093 	tmp_act = xstrdup(features_act);
1094         tok_act = strtok_r(tmp_act, ",", &last_act);
1095         while (tok_act) {
1096 		last_avail = NULL;
1097 		tmp_avail = xstrdup(features);
1098 		tok_avail = strtok_r(tmp_avail, ",", &last_avail);
1099 		while (tok_avail) {
1100 			if (!xstrcmp(tok_act, tok_avail))
1101 				break;
1102 		        tok_avail = strtok_r(NULL, ",", &last_avail);
1103 		}
1104 		xfree(tmp_avail);
1105 		if (!tok_avail) {	/* No match found */
1106 			valid_subset = false;
1107 			break;
1108 		}
1109                 tok_act = strtok_r(NULL, ",", &last_act);
1110 	}
1111 	xfree(tmp_act);
1112 
1113 	return valid_subset;
1114 }
1115 
_undo_reboot_asap(node_record_t * node_ptr)1116 static void _undo_reboot_asap(node_record_t *node_ptr)
1117 {
1118 	node_ptr->node_state &= (~NODE_STATE_DRAIN);
1119 	xfree(node_ptr->reason);
1120 }
1121 
1122 /*
1123  * update_node - update the configuration data for one or more nodes
1124  * IN update_node_msg - update node request
1125  * RET SLURM_SUCCESS or error code
1126  * global: node_record_table_ptr - pointer to global node table
1127  */
update_node(update_node_msg_t * update_node_msg)1128 int update_node ( update_node_msg_t * update_node_msg )
1129 {
1130 	int error_code = 0, node_cnt, node_inx;
1131 	node_record_t *node_ptr = NULL;
1132 	char *this_node_name = NULL, *tmp_feature, *orig_features_act = NULL;
1133 	hostlist_t host_list, hostaddr_list = NULL, hostname_list = NULL;
1134 	uint32_t base_state = 0, node_flags, state_val;
1135 	time_t now = time(NULL);
1136 
1137 	if (update_node_msg->node_names == NULL ) {
1138 		info("%s: invalid node name", __func__);
1139 		return ESLURM_INVALID_NODE_NAME;
1140 	}
1141 
1142 	host_list = hostlist_create(update_node_msg->node_names);
1143 	if (host_list == NULL) {
1144 		info("update_node: hostlist_create error on %s: %m",
1145 		      update_node_msg->node_names);
1146 		return ESLURM_INVALID_NODE_NAME;
1147 	}
1148 	node_cnt = hostlist_count(host_list);
1149 
1150 	if (update_node_msg->node_addr) {
1151 		hostaddr_list = hostlist_create(update_node_msg->node_addr);
1152 		if (hostaddr_list == NULL) {
1153 			info("update_node: hostlist_create error on %s: %m",
1154 			     update_node_msg->node_addr);
1155 			FREE_NULL_HOSTLIST(host_list);
1156 			return ESLURM_INVALID_NODE_NAME;
1157 		}
1158 		if (node_cnt != hostlist_count(hostaddr_list)) {
1159 			info("update_node: nodecount mismatch");
1160 			FREE_NULL_HOSTLIST(host_list);
1161 			FREE_NULL_HOSTLIST(hostaddr_list);
1162 			return ESLURM_INVALID_NODE_NAME;
1163 		}
1164 	}
1165 
1166 	if (update_node_msg->node_hostname) {
1167 		hostname_list = hostlist_create(update_node_msg->node_hostname);
1168 		if (hostname_list == NULL) {
1169 			info("update_node: hostlist_create error on %s: %m",
1170 			     update_node_msg->node_hostname);
1171 			FREE_NULL_HOSTLIST(host_list);
1172 			FREE_NULL_HOSTLIST(hostaddr_list);
1173 			return ESLURM_INVALID_NODE_NAME;
1174 		}
1175 		if (node_cnt != hostlist_count(hostname_list)) {
1176 			info("update_node: nodecount mismatch");
1177 			FREE_NULL_HOSTLIST(host_list);
1178 			FREE_NULL_HOSTLIST(hostaddr_list);
1179 			FREE_NULL_HOSTLIST(hostname_list);
1180 			return ESLURM_INVALID_NODE_NAME;
1181 		}
1182 	}
1183 
1184 	while ( (this_node_name = hostlist_shift (host_list)) ) {
1185 		int err_code = 0;
1186 		bool acct_updated = false;
1187 
1188 		node_ptr = find_node_record (this_node_name);
1189 		node_inx = node_ptr - node_record_table_ptr;
1190 		if (node_ptr == NULL) {
1191 			error ("update_node: node %s does not exist",
1192 				this_node_name);
1193 			error_code = ESLURM_INVALID_NODE_NAME;
1194 			free (this_node_name);
1195 			break;
1196 		}
1197 
1198 		if (hostaddr_list) {
1199 			char *this_addr = hostlist_shift(hostaddr_list);
1200 			xfree(node_ptr->comm_name);
1201 			node_ptr->comm_name = xstrdup(this_addr);
1202 			free(this_addr);
1203 		}
1204 		if (hostname_list) {
1205 			char *this_hostname = hostlist_shift(hostname_list);
1206 			xfree(node_ptr->node_hostname);
1207 			node_ptr->node_hostname = xstrdup(this_hostname);
1208 			free(this_hostname);
1209 		}
1210 		if (hostaddr_list || hostname_list) {
1211 			/* This updates the lookup table addresses */
1212 			slurm_reset_alias(node_ptr->name, node_ptr->comm_name,
1213 					  node_ptr->node_hostname);
1214 		}
1215 
1216 		if (update_node_msg->cpu_bind) {
1217 			char tmp_str[128];
1218 			slurm_sprint_cpu_bind_type(tmp_str,
1219 						   update_node_msg->cpu_bind);
1220 			info("update_node: setting CpuBind to %s for node %s",
1221 			     tmp_str, this_node_name);
1222 			if (update_node_msg->cpu_bind == CPU_BIND_OFF)
1223 				node_ptr->cpu_bind = 0;
1224 			else
1225 				node_ptr->cpu_bind = update_node_msg->cpu_bind;
1226 		}
1227 
1228 		if (update_node_msg->features || update_node_msg->features_act) {
1229 			char *features_act = NULL, *features_avail = NULL;
1230 			if (!node_features_g_node_update_valid(node_ptr,
1231 							 update_node_msg)) {
1232 				error_code = ESLURM_INVALID_FEATURE;
1233 				xfree(update_node_msg->features);
1234 				xfree(update_node_msg->features_act);
1235 			}
1236 			if (update_node_msg->features_act)
1237 				features_act = update_node_msg->features_act;
1238 			else
1239 				features_act = node_ptr->features_act;
1240 
1241 			if (update_node_msg->features)
1242 				features_avail = update_node_msg->features;
1243 			else
1244 				features_avail = node_ptr->features;
1245 			if (!_valid_features_act(features_act, features_avail)){
1246 				info("%s: Invalid ActiveFeatures (\'%s\' not subset of \'%s\' on node %s)",
1247 				     __func__, features_act, features_avail,
1248 				     node_ptr->name);
1249 				error_code = ESLURM_ACTIVE_FEATURE_NOT_SUBSET;
1250 				xfree(update_node_msg->features);
1251 				xfree(update_node_msg->features_act);
1252 			}
1253 		}
1254 
1255 		if (update_node_msg->features_act) {
1256 			if (node_ptr->features_act)
1257 				orig_features_act =
1258 					xstrdup(node_ptr->features_act);
1259 			else
1260 				orig_features_act = xstrdup(node_ptr->features);
1261 		}
1262 		if (update_node_msg->features) {
1263 			if (!update_node_msg->features_act &&
1264 			    (node_features_g_count() == 0)) {
1265 				/*
1266 				 * If no NodeFeatures plugin and no explicit
1267 				 * active features, then make active and
1268 				 * available feature values match
1269 				 */
1270 				update_node_msg->features_act =
1271 					xstrdup(update_node_msg->features);
1272 			}
1273 			xfree(node_ptr->features);
1274 			if (update_node_msg->features[0]) {
1275 				node_ptr->features =
1276 					node_features_g_node_xlate2(
1277 						update_node_msg->features);
1278 			}
1279 			/*
1280 			 * _update_node_avail_features() logs and updates
1281 			 * avail_feature_list below
1282 			 */
1283 		}
1284 
1285 		if (update_node_msg->features_act) {
1286 			tmp_feature = node_features_g_node_xlate(
1287 					update_node_msg->features_act,
1288 					orig_features_act, node_ptr->features,
1289 					node_inx);
1290 			xfree(node_ptr->features_act);
1291 			node_ptr->features_act = tmp_feature;
1292 			error_code = _update_node_active_features(
1293 						node_ptr->name,
1294 						node_ptr->features_act,
1295 						FEATURE_MODE_COMB);
1296 			xfree(orig_features_act);
1297 		}
1298 
1299 		if (update_node_msg->gres) {
1300 			xfree(node_ptr->gres);
1301 			if (update_node_msg->gres[0])
1302 				node_ptr->gres = xstrdup(update_node_msg->gres);
1303 			/* _update_node_gres() logs and updates config */
1304 		}
1305 
1306 		/* No accounting update if node state and reason are unchange */
1307 		state_val = update_node_msg->node_state;
1308 		if (_equivalent_node_state(node_ptr, state_val) &&
1309 		    !xstrcmp(node_ptr->reason, update_node_msg->reason)) {
1310 			free(this_node_name);
1311 			continue;
1312 		}
1313 
1314 		if ((update_node_msg -> reason) &&
1315 		    (update_node_msg -> reason[0])) {
1316 			xfree(node_ptr->reason);
1317 			node_ptr->reason = xstrdup(update_node_msg->reason);
1318 			node_ptr->reason_time = now;
1319 			node_ptr->reason_uid = update_node_msg->reason_uid;
1320 			info ("update_node: node %s reason set to: %s",
1321 				this_node_name, node_ptr->reason);
1322 		}
1323 
1324 		if (state_val != NO_VAL) {
1325 			base_state = node_ptr->node_state;
1326 			if (!_valid_node_state_change(base_state, state_val)) {
1327 				info("Invalid node state transition requested "
1328 				     "for node %s from=%s to=%s",
1329 				     this_node_name,
1330 				     node_state_string(base_state),
1331 				     node_state_string(state_val));
1332 				state_val = NO_VAL;
1333 				error_code = ESLURM_INVALID_NODE_STATE;
1334 			}
1335 			base_state &= NODE_STATE_BASE;
1336 		}
1337 
1338 		if (state_val != NO_VAL) {
1339 			node_flags = node_ptr->node_state & NODE_STATE_FLAGS;
1340 			if (state_val == NODE_RESUME) {
1341 				if (IS_NODE_IDLE(node_ptr) &&
1342 				    (IS_NODE_DRAIN(node_ptr) ||
1343 				     IS_NODE_FAIL(node_ptr))) {
1344 					clusteracct_storage_g_node_up(
1345 						acct_db_conn,
1346 						node_ptr,
1347 						now);
1348 					acct_updated = true;
1349 				}
1350 				node_ptr->node_state &= (~NODE_STATE_DRAIN);
1351 				node_ptr->node_state &= (~NODE_STATE_FAIL);
1352 				node_ptr->node_state &= (~NODE_STATE_REBOOT);
1353 				node_ptr->node_state &=
1354 					(~NODE_STATE_POWERING_DOWN);
1355 				if (IS_NODE_DOWN(node_ptr)) {
1356 					state_val = NODE_STATE_IDLE;
1357 #ifndef HAVE_FRONT_END
1358 					node_ptr->node_state |=
1359 							NODE_STATE_NO_RESPOND;
1360 #endif
1361 					node_ptr->last_response = MAX(now,
1362 						node_ptr->last_response);
1363 					node_ptr->boot_time = 0;
1364 					ping_nodes_now = true;
1365 				} else if (IS_NODE_FUTURE(node_ptr)) {
1366 					if (node_ptr->port == 0) {
1367 						node_ptr->port =slurmctld_conf.
1368 								slurmd_port;
1369 					}
1370 					slurm_set_addr(	&node_ptr->slurm_addr,
1371 							node_ptr->port,
1372 							node_ptr->comm_name);
1373 					if (node_ptr->slurm_addr.sin_port) {
1374 						state_val = NODE_STATE_IDLE;
1375 #ifndef HAVE_FRONT_END
1376 						node_ptr->node_state |=
1377 							NODE_STATE_NO_RESPOND;
1378 #endif
1379 						bit_clear(future_node_bitmap,
1380 							  node_inx);
1381 						node_ptr->last_response =
1382 							MAX(now,
1383 							node_ptr->last_response);
1384 						node_ptr->boot_time = 0;
1385 						ping_nodes_now = true;
1386 					} else {
1387 						error("slurm_set_addr failure "
1388 						      "on %s",
1389 		       				      node_ptr->comm_name);
1390 						state_val = base_state;
1391 					}
1392 				} else
1393 					state_val = base_state;
1394 			} else if (state_val == NODE_STATE_UNDRAIN) {
1395 				if (IS_NODE_IDLE(node_ptr) &&
1396 				    IS_NODE_DRAIN(node_ptr)) {
1397 					clusteracct_storage_g_node_up(
1398 						acct_db_conn,
1399 						node_ptr,
1400 						now);
1401 					acct_updated = true;
1402 				}
1403 				node_ptr->node_state &= (~NODE_STATE_DRAIN);
1404 				state_val = base_state;
1405 			}
1406 
1407 			if ((state_val == NODE_STATE_DOWN) ||
1408 			    (state_val == NODE_STATE_FUTURE)) {
1409 				/* We must set node DOWN before killing
1410 				 * its jobs */
1411 				_make_node_down(node_ptr, now);
1412 				kill_running_job_by_node_name (this_node_name);
1413 				if (state_val == NODE_STATE_FUTURE) {
1414 					node_ptr->node_state = NODE_STATE_FUTURE
1415 							       | node_flags;
1416 					bit_set(future_node_bitmap, node_inx);
1417 				}
1418 			} else if (state_val == NODE_STATE_IDLE) {
1419 				/* assume they want to clear DRAIN and
1420 				 * FAIL flags too */
1421 				if (IS_NODE_DOWN(node_ptr)) {
1422 					trigger_node_up(node_ptr);
1423 					clusteracct_storage_g_node_up(
1424 						acct_db_conn,
1425 						node_ptr,
1426 						now);
1427 					acct_updated = true;
1428 				} else if (IS_NODE_IDLE(node_ptr)   &&
1429 					   (IS_NODE_DRAIN(node_ptr) ||
1430 					    IS_NODE_FAIL(node_ptr))) {
1431 					clusteracct_storage_g_node_up(
1432 						acct_db_conn,
1433 						node_ptr,
1434 						now);
1435 					acct_updated = true;
1436 				}	/* else already fully available */
1437 				node_ptr->node_state &= (~NODE_STATE_DRAIN);
1438 				node_ptr->node_state &= (~NODE_STATE_FAIL);
1439 				if (!IS_NODE_NO_RESPOND(node_ptr) ||
1440 				     IS_NODE_POWER_SAVE(node_ptr))
1441 					make_node_avail(node_inx);
1442 				bit_set (idle_node_bitmap, node_inx);
1443 				bit_set (up_node_bitmap, node_inx);
1444 				if (IS_NODE_POWER_SAVE(node_ptr))
1445 					node_ptr->last_idle = 0;
1446 				else
1447 					node_ptr->last_idle = now;
1448 			} else if (state_val == NODE_STATE_ALLOCATED) {
1449 				if (!IS_NODE_DRAIN(node_ptr) &&
1450 				    !IS_NODE_FAIL(node_ptr)  &&
1451 				    !IS_NODE_NO_RESPOND(node_ptr))
1452 					make_node_avail(node_inx);
1453 				bit_set (up_node_bitmap, node_inx);
1454 				bit_clear (idle_node_bitmap, node_inx);
1455 			} else if ((state_val == NODE_STATE_DRAIN) ||
1456 				   (state_val == NODE_STATE_FAIL)) {
1457 				uint32_t new_state = state_val;
1458 				if ((IS_NODE_ALLOCATED(node_ptr) ||
1459 				     IS_NODE_MIXED(node_ptr)) &&
1460 				    (IS_NODE_POWER_SAVE(node_ptr) ||
1461 				     IS_NODE_POWER_UP(node_ptr))) {
1462 					info("%s: DRAIN/FAIL request for node %s which is allocated and being powered up. Requeueing jobs",
1463 					     __func__, this_node_name);
1464 					kill_running_job_by_node_name(
1465 								this_node_name);
1466 				}
1467 				bit_clear (avail_node_bitmap, node_inx);
1468 				node_ptr->node_state &= (~NODE_STATE_DRAIN);
1469 				node_ptr->node_state &= (~NODE_STATE_FAIL);
1470 				state_val = node_ptr->node_state |= state_val;
1471 				if ((node_ptr->run_job_cnt  == 0) &&
1472 				    (node_ptr->comp_job_cnt == 0)) {
1473 					trigger_node_drained(node_ptr);
1474 					clusteracct_storage_g_node_down(
1475 						acct_db_conn,
1476 						node_ptr, now, NULL,
1477 						node_ptr->reason_uid);
1478 				}
1479 				if ((new_state == NODE_STATE_FAIL) &&
1480 				    (nonstop_ops.node_fail))
1481 					(nonstop_ops.node_fail)(NULL, node_ptr);
1482 			} else if (state_val == NODE_STATE_POWER_SAVE) {
1483 				if (IS_NODE_POWER_SAVE(node_ptr)) {
1484 					node_ptr->node_state &=
1485 						(~NODE_STATE_POWER_SAVE);
1486 					info("power down request repeating "
1487 					     "for node %s", this_node_name);
1488 				} else {
1489 					if (IS_NODE_DOWN(node_ptr)) {
1490 						/* Abort any power up request */
1491 						node_ptr->node_state &=
1492 							(~NODE_STATE_POWER_UP);
1493 						node_ptr->node_state =
1494 							NODE_STATE_IDLE |
1495 							(node_ptr->node_state &
1496 							 NODE_STATE_FLAGS);
1497 					} else {
1498 						node_ptr->node_state &=
1499 							(~NODE_STATE_POWER_SAVE);
1500 					}
1501 #ifndef HAVE_FRONT_END
1502 					node_ptr->node_state |=
1503 						NODE_STATE_NO_RESPOND;
1504 #endif
1505 
1506 					info("powering down node %s",
1507 					     this_node_name);
1508 				}
1509 				node_ptr->last_idle = 1;
1510 				node_ptr->next_state = NO_VAL;
1511 				bit_clear(rs_node_bitmap, node_inx);
1512 				free(this_node_name);
1513 				continue;
1514 			} else if (state_val == NODE_STATE_POWER_UP) {
1515 				if (!IS_NODE_POWER_SAVE(node_ptr)) {
1516 					if (IS_NODE_POWER_UP(node_ptr)) {
1517 						node_ptr->last_idle = now;
1518 						node_ptr->node_state |=
1519 							NODE_STATE_POWER_SAVE;
1520 						info("power up request "
1521 						     "repeating for node %s",
1522 						     this_node_name);
1523 					} else {
1524 						verbose("node %s is already "
1525 							"powered up",
1526 							this_node_name);
1527 					}
1528 				} else {
1529 					node_ptr->last_idle = now;
1530 					info("powering up node %s",
1531 					     this_node_name);
1532 				}
1533 				node_ptr->next_state = NO_VAL;
1534 				bit_clear(rs_node_bitmap, node_inx);
1535 				free(this_node_name);
1536 				continue;
1537 			} else if ((state_val & NODE_STATE_POWER_SAVE) &&
1538 				   (state_val & NODE_STATE_POWER_UP) &&
1539 				   (IS_NODE_POWER_UP(node_ptr))) {
1540 				/* Clear any reboot operation in progress */
1541 				node_ptr->node_state &= (~NODE_STATE_POWER_UP);
1542 				node_ptr->last_response = MAX(now,
1543 						node_ptr->last_response);
1544 				state_val = base_state;
1545 			} else if (state_val == NODE_STATE_NO_RESPOND) {
1546 				node_ptr->node_state |= NODE_STATE_NO_RESPOND;
1547 				state_val = base_state;
1548 				bit_clear(avail_node_bitmap, node_inx);
1549 			} else if (state_val == NODE_STATE_CANCEL_REBOOT) {
1550 				if (IS_NODE_RUNNING_JOB(node_ptr)) {
1551 					node_ptr->node_state &=
1552 						(~NODE_STATE_REBOOT);
1553 					state_val = base_state;
1554 					if (!xstrcmp(node_ptr->reason,
1555 					             "Reboot ASAP"))
1556 						_undo_reboot_asap(node_ptr);
1557 				} else {
1558 					info("REBOOT on node %s already in progress -- unable to cancel",
1559 					     this_node_name);
1560 					err_code = error_code =
1561 						ESLURM_REBOOT_IN_PROGRESS;
1562 				}
1563 			} else {
1564 				info("Invalid node state specified %u",
1565 				     state_val);
1566 				err_code = 1;
1567 				error_code = ESLURM_INVALID_NODE_STATE;
1568 			}
1569 
1570 			if (err_code == 0) {
1571 				node_ptr->node_state = state_val |
1572 						(node_ptr->node_state &
1573 						 NODE_STATE_FLAGS);
1574 
1575 				if (!IS_NODE_REBOOT(node_ptr))
1576 					node_ptr->next_state = NO_VAL;
1577 				bit_clear(rs_node_bitmap, node_inx);
1578 
1579 				info ("update_node: node %s state set to %s",
1580 					this_node_name,
1581 					node_state_string(state_val));
1582 			}
1583 		}
1584 
1585 		if (!acct_updated && !IS_NODE_DOWN(node_ptr) &&
1586 		    !IS_NODE_DRAIN(node_ptr) && !IS_NODE_FAIL(node_ptr)) {
1587 			/* reason information is handled in
1588 			   clusteracct_storage_g_node_up()
1589 			*/
1590 			clusteracct_storage_g_node_up(
1591 				acct_db_conn, node_ptr, now);
1592 		}
1593 
1594 		free (this_node_name);
1595 	}
1596 
1597 	/* Write/clear log */
1598 	(void)_update_node_active_features(NULL, NULL, FEATURE_MODE_PEND);
1599 
1600 	FREE_NULL_HOSTLIST(host_list);
1601 	FREE_NULL_HOSTLIST(hostaddr_list);
1602 	FREE_NULL_HOSTLIST(hostname_list);
1603 	last_node_update = now;
1604 
1605 	if ((error_code == SLURM_SUCCESS) && (update_node_msg->features)) {
1606 		error_code = _update_node_avail_features(
1607 					update_node_msg->node_names,
1608 					update_node_msg->features,
1609 					FEATURE_MODE_IND);
1610 	}
1611 	if ((error_code == SLURM_SUCCESS) && (update_node_msg->gres)) {
1612 		error_code = _update_node_gres(update_node_msg->node_names,
1613 					       update_node_msg->gres);
1614 	}
1615 
1616 	/*
1617 	 * Update weight. Weight is part of config_ptr,
1618 	 * hence split config records if required
1619 	 */
1620 	if ((error_code == SLURM_SUCCESS) &&
1621 	    (update_node_msg->weight != NO_VAL))	{
1622 		error_code = _update_node_weight(update_node_msg->node_names,
1623 						 update_node_msg->weight);
1624 		if (error_code == SLURM_SUCCESS) {
1625 			/* sort config_list by weight for scheduling */
1626 			list_sort(config_list, &list_compare_config);
1627 		}
1628 	}
1629 
1630 	return error_code;
1631 }
1632 
1633 /*
1634  * restore_node_features - Make node and config (from slurm.conf) fields
1635  *	consistent for Features, Gres and Weight
1636  * IN recover -
1637  *              0, 1 - use data from config record, built using slurm.conf
1638  *              2 = use data from node record, built from saved state
1639  */
restore_node_features(int recover)1640 extern void restore_node_features(int recover)
1641 {
1642 	int i, node_features_plugin_cnt;
1643 	node_record_t *node_ptr;
1644 
1645 	node_features_plugin_cnt = node_features_g_count();
1646 	for (i = 0, node_ptr = node_record_table_ptr; i < node_record_count;
1647 	     i++, node_ptr++) {
1648 		if (node_ptr->weight != node_ptr->config_ptr->weight) {
1649 			error("Node %s Weight(%u) differ from slurm.conf",
1650 			      node_ptr->name, node_ptr->weight);
1651 			if (recover == 2) {
1652 				_update_node_weight(node_ptr->name,
1653 						    node_ptr->weight);
1654 			} else {
1655 				node_ptr->weight = node_ptr->config_ptr->
1656 						   weight;
1657 			}
1658 		}
1659 		if (xstrcmp(node_ptr->config_ptr->feature, node_ptr->features)){
1660 			if (node_features_plugin_cnt == 0) {
1661 				error("Node %s Features(%s) differ from slurm.conf",
1662 				      node_ptr->name, node_ptr->features);
1663 			}
1664 			if (recover == 2) {
1665 				_update_node_avail_features(node_ptr->name,
1666 							    node_ptr->features,
1667 							    FEATURE_MODE_COMB);
1668 			}
1669 		}
1670 
1671 		/*
1672 		 * We lose the GRES information updated manually and always
1673 		 * use the information from slurm.conf
1674 		 */
1675 		(void) gres_plugin_node_reconfig(
1676 			node_ptr->name,
1677 			node_ptr->config_ptr->gres,
1678 			&node_ptr->gres,
1679 			&node_ptr->gres_list,
1680 			slurmctld_conf.conf_flags & CTL_CONF_OR,
1681 			node_ptr->cores,
1682 			(node_ptr->boards * node_ptr->sockets));
1683 		gres_plugin_node_state_log(node_ptr->gres_list, node_ptr->name);
1684 	}
1685 	_update_node_avail_features(NULL, NULL, FEATURE_MODE_PEND);
1686 }
1687 
1688 /* Duplicate a configuration record except for the node names & bitmap */
_dup_config(config_record_t * config_ptr)1689 config_record_t *_dup_config(config_record_t *config_ptr)
1690 {
1691 	config_record_t *new_config_ptr;
1692 
1693 	new_config_ptr = create_config_record();
1694 	new_config_ptr->magic       = config_ptr->magic;
1695 	new_config_ptr->cpus        = config_ptr->cpus;
1696 	new_config_ptr->cpu_spec_list = xstrdup(config_ptr->cpu_spec_list);
1697 	new_config_ptr->boards      = config_ptr->boards;
1698 	new_config_ptr->sockets     = config_ptr->sockets;
1699 	new_config_ptr->cores       = config_ptr->cores;
1700 	new_config_ptr->core_spec_cnt = config_ptr->core_spec_cnt;
1701 	new_config_ptr->threads     = config_ptr->threads;
1702 	new_config_ptr->real_memory = config_ptr->real_memory;
1703 	new_config_ptr->mem_spec_limit = config_ptr->mem_spec_limit;
1704 	new_config_ptr->tmp_disk    = config_ptr->tmp_disk;
1705 	new_config_ptr->weight      = config_ptr->weight;
1706 	new_config_ptr->feature     = xstrdup(config_ptr->feature);
1707 	new_config_ptr->gres        = xstrdup(config_ptr->gres);
1708 
1709 	return new_config_ptr;
1710 }
1711 
1712 /*
1713  * _update_node_weight - Update weight associated with nodes
1714  *	build new config list records as needed
1715  * IN node_names - List of nodes to update
1716  * IN weight - New weight value
1717  * RET: SLURM_SUCCESS or error code
1718  */
_update_node_weight(char * node_names,uint32_t weight)1719 static int _update_node_weight(char *node_names, uint32_t weight)
1720 {
1721 	bitstr_t *node_bitmap = NULL, *tmp_bitmap;
1722 	ListIterator config_iterator;
1723 	config_record_t *config_ptr, *new_config_ptr, *first_new = NULL;
1724 	int rc, config_cnt, tmp_cnt;
1725 
1726 	rc = node_name2bitmap(node_names, false, &node_bitmap);
1727 	if (rc) {
1728 		info("_update_node_weight: invalid node_name");
1729 		return rc;
1730 	}
1731 
1732 	/* For each config_record with one of these nodes,
1733 	 * update it (if all nodes updated) or split it into
1734 	 * a new entry */
1735 	config_iterator = list_iterator_create(config_list);
1736 	while ((config_ptr = list_next(config_iterator))) {
1737 		if (config_ptr == first_new)
1738 			break;	/* done with all original records */
1739 
1740 		tmp_bitmap = bit_copy(node_bitmap);
1741 		bit_and(tmp_bitmap, config_ptr->node_bitmap);
1742 		config_cnt = bit_set_count(config_ptr->node_bitmap);
1743 		tmp_cnt = bit_set_count(tmp_bitmap);
1744 		if (tmp_cnt == 0) {
1745 			/* no overlap, leave alone */
1746 		} else if (tmp_cnt == config_cnt) {
1747 			/* all nodes changed, update in situ */
1748 			config_ptr->weight = weight;
1749 		} else {
1750 			/* partial update, split config_record */
1751 			new_config_ptr = _dup_config(config_ptr);
1752 			if (first_new == NULL)
1753 				first_new = new_config_ptr;
1754 			/* Change weight for the given nodes */
1755 			new_config_ptr->weight      = weight;
1756 			new_config_ptr->node_bitmap = bit_copy(tmp_bitmap);
1757 			new_config_ptr->nodes = bitmap2node_name(tmp_bitmap);
1758 			_update_config_ptr(tmp_bitmap, new_config_ptr);
1759 
1760 			/* Update remaining records */
1761 			bit_and_not(config_ptr->node_bitmap, tmp_bitmap);
1762 			xfree(config_ptr->nodes);
1763 			config_ptr->nodes = bitmap2node_name(
1764 						config_ptr->node_bitmap);
1765 		}
1766 		FREE_NULL_BITMAP(tmp_bitmap);
1767 	}
1768 	list_iterator_destroy(config_iterator);
1769 	FREE_NULL_BITMAP(node_bitmap);
1770 
1771 	info("_update_node_weight: nodes %s weight set to: %u",
1772 		node_names, weight);
1773 	return SLURM_SUCCESS;
1774 }
1775 
_update_node_features_post(char * node_names,char ** last_features,char * features,bitstr_t ** last_node_bitmap,bitstr_t ** node_bitmap,int mode,const char * type)1776 static inline void _update_node_features_post(
1777 	char *node_names,
1778 	char **last_features, char *features,
1779 	bitstr_t **last_node_bitmap, bitstr_t **node_bitmap,
1780 	int mode, const char *type)
1781 {
1782 
1783 	xassert(last_features);
1784 	xassert(last_node_bitmap);
1785 	xassert(node_bitmap);
1786 
1787 	if (mode == FEATURE_MODE_IND) {
1788 		debug2("%s: nodes %s %s features set to: %s",
1789 		       __func__, node_names, type, features);
1790 	} else if (*last_features && *last_node_bitmap &&
1791 		   ((mode == FEATURE_MODE_PEND) ||
1792 		    xstrcmp(features, *last_features))) {
1793 		char *last_node_names = bitmap2node_name(*last_node_bitmap);
1794 		debug2("%s: nodes %s %s features set to: %s",
1795 		       __func__, last_node_names, type, *last_features);
1796 		xfree(last_node_names);
1797 		xfree(*last_features);
1798 		FREE_NULL_BITMAP(*last_node_bitmap);
1799 	}
1800 
1801 	if (mode == FEATURE_MODE_COMB) {
1802 		if (!*last_features) {
1803 			/* Start combining records */
1804 			*last_features = xstrdup(features);
1805 			*last_node_bitmap = *node_bitmap;
1806 			*node_bitmap = NULL;
1807 		} else {
1808 			/* Add this node to existing log info */
1809 			bit_or(*last_node_bitmap, *node_bitmap);
1810 		}
1811 	}
1812 }
1813 
1814 /*
1815  * _update_node_active_features - Update active features associated with nodes
1816  * IN node_names - List of nodes to update
1817  * IN active_features - New active features value
1818  * IN mode - FEATURE_MODE_IND : Print each node change indivually
1819  *           FEATURE_MODE_COMB: Try to combine like changes (SEE NOTE BELOW)
1820  *           FEATURE_MODE_PEND: Print any pending change message
1821  * RET: SLURM_SUCCESS or error code
1822  * NOTE: Use mode=FEATURE_MODE_IND in a loop with node write lock set,
1823  *	 then call with mode=FEATURE_MODE_PEND at the end of the loop
1824  */
_update_node_active_features(char * node_names,char * active_features,int mode)1825 static int _update_node_active_features(char *node_names, char *active_features,
1826 					int mode)
1827 {
1828 	static char *last_active_features = NULL;
1829 	static bitstr_t *last_node_bitmap = NULL;
1830 	bitstr_t *node_bitmap = NULL;
1831 	int rc;
1832 
1833 	if (mode < FEATURE_MODE_PEND) {
1834 		/* Perform update of node active features */
1835 		rc = node_name2bitmap(node_names, false, &node_bitmap);
1836 		if (rc) {
1837 			info("%s: invalid node_name (%s)", __func__,
1838 			     node_names);
1839 			return rc;
1840 		}
1841 		update_feature_list(active_feature_list, active_features,
1842 				    node_bitmap);
1843 		(void) node_features_g_node_update(active_features,
1844 						   node_bitmap);
1845 	}
1846 
1847 	_update_node_features_post(node_names,
1848 				   &last_active_features, active_features,
1849 				   &last_node_bitmap, &node_bitmap,
1850 				   mode, "active");
1851 	FREE_NULL_BITMAP(node_bitmap);
1852 
1853 	return SLURM_SUCCESS;
1854 }
1855 
1856 /*
1857  * _update_node_avail_features - Update available features associated with
1858  *	nodes, build new config list records as needed
1859  * IN node_names - List of nodes to update
1860  * IN avail_features - New available features value
1861  * IN mode - FEATURE_MODE_IND : Print each node change indivually
1862  *           FEATURE_MODE_COMB: Try to combine like changes (SEE NOTE BELOW)
1863  *           FEATURE_MODE_PEND: Print any pending change message
1864  * RET: SLURM_SUCCESS or error code
1865  * NOTE: Use mode=FEATURE_MODE_IND in a loop with node write lock set,
1866  *	 then call with mode=FEATURE_MODE_PEND at the end of the loop
1867  */
_update_node_avail_features(char * node_names,char * avail_features,int mode)1868 static int _update_node_avail_features(char *node_names, char *avail_features,
1869 				       int mode)
1870 {
1871 	static char *last_avail_features = NULL;
1872 	static bitstr_t *last_node_bitmap = NULL;
1873 	bitstr_t *node_bitmap = NULL, *tmp_bitmap;
1874 	ListIterator config_iterator;
1875 	config_record_t *config_ptr, *new_config_ptr, *first_new = NULL;
1876 	int rc, config_cnt, tmp_cnt;
1877 
1878 	if (mode < FEATURE_MODE_PEND) {
1879 		rc = node_name2bitmap(node_names, false, &node_bitmap);
1880 		if (rc) {
1881 			info("%s: invalid node_name (%s)",
1882 			     __func__, node_names);
1883 			return rc;
1884 		}
1885 
1886 		/*
1887 		 * For each config_record with one of these nodes, update it
1888 		 * (if all nodes updated) or split it into a new entry
1889 		 */
1890 		config_iterator = list_iterator_create(config_list);
1891 		while ((config_ptr = list_next(config_iterator))) {
1892 			if (config_ptr == first_new)
1893 				break;	/* done with all original records */
1894 
1895 			tmp_bitmap = bit_copy(node_bitmap);
1896 			bit_and(tmp_bitmap, config_ptr->node_bitmap);
1897 			config_cnt = bit_set_count(config_ptr->node_bitmap);
1898 			tmp_cnt = bit_set_count(tmp_bitmap);
1899 			if (tmp_cnt == 0) {
1900 				/* no overlap, leave alone */
1901 			} else if (tmp_cnt == config_cnt) {
1902 				/* all nodes changed, update in situ */
1903 				xfree(config_ptr->feature);
1904 				if (avail_features && avail_features[0]) {
1905 					config_ptr->feature =
1906 						xstrdup(avail_features);
1907 				}
1908 			} else {
1909 				/* partial update, split config_record */
1910 				new_config_ptr = _dup_config(config_ptr);
1911 				if (first_new == NULL)
1912 					first_new = new_config_ptr;
1913 				xfree(new_config_ptr->feature);
1914 				if (avail_features && avail_features[0]) {
1915 					new_config_ptr->feature =
1916 						xstrdup(avail_features);
1917 				}
1918 				new_config_ptr->node_bitmap =
1919 						bit_copy(tmp_bitmap);
1920 				new_config_ptr->nodes =
1921 						bitmap2node_name(tmp_bitmap);
1922 				_update_config_ptr(tmp_bitmap, new_config_ptr);
1923 
1924 				/* Update remaining records */
1925 				bit_and_not(config_ptr->node_bitmap, tmp_bitmap);
1926 				xfree(config_ptr->nodes);
1927 				config_ptr->nodes = bitmap2node_name(
1928 						    config_ptr->node_bitmap);
1929 			}
1930 			FREE_NULL_BITMAP(tmp_bitmap);
1931 		}
1932 		list_iterator_destroy(config_iterator);
1933 		if (avail_feature_list) {	/* List not set at startup */
1934 			update_feature_list(avail_feature_list, avail_features,
1935 					    node_bitmap);
1936 		}
1937 	}
1938 
1939 	_update_node_features_post(node_names,
1940 				   &last_avail_features, avail_features,
1941 				   &last_node_bitmap, &node_bitmap,
1942 				   mode, "available");
1943 	FREE_NULL_BITMAP(node_bitmap);
1944 
1945 	return SLURM_SUCCESS;
1946 }
1947 
1948 /*
1949  * _update_node_gres - Update generic resources associated with nodes
1950  *	build new config list records as needed
1951  * IN node_names - List of nodes to update
1952  * IN gres - New gres value
1953  * RET: SLURM_SUCCESS or error code
1954  */
_update_node_gres(char * node_names,char * gres)1955 static int _update_node_gres(char *node_names, char *gres)
1956 {
1957 	bitstr_t *changed_node_bitmap = NULL, *node_bitmap = NULL, *tmp_bitmap;
1958 	ListIterator config_iterator;
1959 	config_record_t *config_ptr, *new_config_ptr, *first_new = NULL;
1960 	node_record_t *node_ptr;
1961 	int rc, rc2, overlap1, overlap2;
1962 	int i, i_first, i_last;
1963 
1964 	rc = node_name2bitmap(node_names, false, &node_bitmap);
1965 	if (rc) {
1966 		info("%s: invalid node_name: %s", __func__, node_names);
1967 		return rc;
1968 	}
1969 
1970 	/*
1971 	 * For each config_record with one of these nodes,
1972 	 * update it (if all nodes updated) or split it into a new entry
1973 	 */
1974 	config_iterator = list_iterator_create(config_list);
1975 	while ((config_ptr = list_next(config_iterator))) {
1976 		if (config_ptr == first_new)
1977 			break;	/* done with all original records */
1978 
1979 		overlap1 = bit_overlap(node_bitmap, config_ptr->node_bitmap);
1980 		if (overlap1 == 0)
1981 			continue;  /* No changes to this config_record */
1982 
1983 		/* At least some nodes in this config need to change */
1984 		tmp_bitmap = bit_copy(node_bitmap);
1985 		bit_and(tmp_bitmap, config_ptr->node_bitmap);
1986 		i_first = bit_ffs(tmp_bitmap);
1987 		if (i_first >= 0)
1988 			i_last = bit_fls(tmp_bitmap);
1989 		else
1990 			i_last = i_first - 1;
1991 		for (i = i_first; i <= i_last; i++) {
1992 			if (!bit_test(tmp_bitmap, i))
1993 				continue;	/* Not this node */
1994 			node_ptr = node_record_table_ptr + i;
1995 			rc2 = gres_plugin_node_reconfig(
1996 				node_ptr->name,
1997 				gres, &node_ptr->gres,
1998 				&node_ptr->gres_list,
1999 				slurmctld_conf.conf_flags & CTL_CONF_OR,
2000 				node_ptr->cores,
2001 				(node_ptr->boards * node_ptr->sockets));
2002 			if (rc2 != SLURM_SUCCESS) {
2003 				bit_clear(tmp_bitmap, i);
2004 				overlap1--;
2005 				if (rc == SLURM_SUCCESS)
2006 					rc = rc2;
2007 			}
2008 			gres_plugin_node_state_log(node_ptr->gres_list,
2009 						   node_ptr->name);
2010 		}
2011 
2012 		overlap2 = bit_set_count(config_ptr->node_bitmap);
2013 		if (overlap1 == 0) {
2014 			/* No nodes actually changed in this configuration */
2015 			FREE_NULL_BITMAP(tmp_bitmap);
2016 		} else if (overlap1 == overlap2) {
2017 			/* All nodes changes in this configuration */
2018 			xfree(config_ptr->gres);
2019 			if (gres && gres[0])
2020 				config_ptr->gres = xstrdup(gres);
2021 			if (changed_node_bitmap) {
2022 				bit_or(changed_node_bitmap, tmp_bitmap);
2023 				FREE_NULL_BITMAP(tmp_bitmap);
2024 			} else {
2025 				changed_node_bitmap = tmp_bitmap;
2026 				tmp_bitmap = NULL;
2027 			}
2028 		} else {
2029 			/*
2030 			 * Some nodes changes in this configuration.
2031 			 * Split config_record in two.
2032 			 */
2033 			new_config_ptr = _dup_config(config_ptr);
2034 			if (!first_new)
2035 				first_new = new_config_ptr;
2036 			xfree(new_config_ptr->gres);
2037 			if (gres && gres[0])
2038 				new_config_ptr->gres = xstrdup(gres);
2039 			new_config_ptr->node_bitmap = tmp_bitmap;
2040 			new_config_ptr->nodes = bitmap2node_name(tmp_bitmap);
2041 			_update_config_ptr(tmp_bitmap, new_config_ptr);
2042 			if (changed_node_bitmap) {
2043 				bit_or(changed_node_bitmap, tmp_bitmap);
2044 			} else {
2045 				changed_node_bitmap = bit_copy(tmp_bitmap);
2046 			}
2047 
2048 			/* Update remaining config_record */
2049 			bit_and_not(config_ptr->node_bitmap, tmp_bitmap);
2050 			xfree(config_ptr->nodes);
2051 			config_ptr->nodes = bitmap2node_name(
2052 						config_ptr->node_bitmap);
2053 			tmp_bitmap = NULL;	/* Nothing left to free */
2054 		}
2055 	}
2056 	list_iterator_destroy(config_iterator);
2057 	FREE_NULL_BITMAP(node_bitmap);
2058 
2059 	/* Report changes nodes, may be subset of requested nodes */
2060 	if (changed_node_bitmap) {
2061 		char *change_node_str = bitmap2node_name(changed_node_bitmap);
2062 		info("%s: nodes %s gres set to: %s", __func__,
2063 		     change_node_str, gres);
2064 		FREE_NULL_BITMAP(changed_node_bitmap);
2065 		xfree(change_node_str);
2066 	}
2067 
2068 	return rc;
2069 }
2070 
2071 /* Reset the config pointer for updated jobs */
_update_config_ptr(bitstr_t * bitmap,config_record_t * config_ptr)2072 static void _update_config_ptr(bitstr_t *bitmap, config_record_t *config_ptr)
2073 {
2074 	int i;
2075 
2076 	for (i = 0; i < node_record_count; i++) {
2077 		if (!bit_test(bitmap, i))
2078 			continue;
2079 		node_record_table_ptr[i].config_ptr = config_ptr;
2080 	}
2081 }
2082 
2083 /*
2084  * drain_nodes - drain one or more nodes,
2085  *  no-op for nodes already drained or draining
2086  * IN nodes - nodes to drain
2087  * IN reason - reason to drain the nodes
2088  * RET SLURM_SUCCESS or error code
2089  * global: node_record_table_ptr - pointer to global node table
2090  */
drain_nodes(char * nodes,char * reason,uint32_t reason_uid)2091 extern int drain_nodes(char *nodes, char *reason, uint32_t reason_uid)
2092 {
2093 	int error_code = 0, node_inx;
2094 	node_record_t *node_ptr;
2095 	char  *this_node_name ;
2096 	hostlist_t host_list;
2097 	time_t now = time(NULL);
2098 
2099 	if ((nodes == NULL) || (nodes[0] == '\0')) {
2100 		error ("drain_nodes: invalid node name  %s", nodes);
2101 		return ESLURM_INVALID_NODE_NAME;
2102 	}
2103 
2104 	if ( (host_list = hostlist_create (nodes)) == NULL) {
2105 		error ("hostlist_create error on %s: %m", nodes);
2106 		return ESLURM_INVALID_NODE_NAME;
2107 	}
2108 
2109 	while ( (this_node_name = hostlist_shift (host_list)) ) {
2110 		node_ptr = find_node_record (this_node_name);
2111 		node_inx = node_ptr - node_record_table_ptr;
2112 		if (node_ptr == NULL) {
2113 			error ("drain_nodes: node %s does not exist",
2114 				this_node_name);
2115 			error_code = ESLURM_INVALID_NODE_NAME;
2116 			free (this_node_name);
2117 			break;
2118 		}
2119 
2120 		if (IS_NODE_DRAIN(node_ptr)) {
2121 			/* state already changed, nothing to do */
2122 			free (this_node_name);
2123 			continue;
2124 		}
2125 
2126 		node_ptr->node_state |= NODE_STATE_DRAIN;
2127 		bit_clear (avail_node_bitmap, node_inx);
2128 		info ("drain_nodes: node %s state set to DRAIN",
2129 			this_node_name);
2130 		if ((node_ptr->reason == NULL) ||
2131 		    (xstrncmp(node_ptr->reason, "Not responding", 14) == 0)) {
2132 			xfree(node_ptr->reason);
2133 			node_ptr->reason = xstrdup(reason);
2134 			node_ptr->reason_time = now;
2135 			node_ptr->reason_uid = reason_uid;
2136 		}
2137 		if ((node_ptr->run_job_cnt  == 0) &&
2138 		    (node_ptr->comp_job_cnt == 0)) {
2139 			/* no jobs, node is drained */
2140 			trigger_node_drained(node_ptr);
2141 			clusteracct_storage_g_node_down(acct_db_conn,
2142 							node_ptr, now, NULL,
2143 							reason_uid);
2144 		}
2145 
2146 		free (this_node_name);
2147 	}
2148 	last_node_update = time (NULL);
2149 
2150 	hostlist_destroy (host_list);
2151 	return error_code;
2152 }
2153 /* Return true if admin request to change node state from old to new is valid */
_valid_node_state_change(uint32_t old,uint32_t new)2154 static bool _valid_node_state_change(uint32_t old, uint32_t new)
2155 {
2156 	uint32_t base_state, node_flags;
2157 
2158 	if (old == new)
2159 		return true;
2160 
2161 	base_state = old & NODE_STATE_BASE;
2162 	node_flags = old & NODE_STATE_FLAGS;
2163 	switch (new) {
2164 		case NODE_STATE_DOWN:
2165 		case NODE_STATE_DRAIN:
2166 		case NODE_STATE_FAIL:
2167 		case NODE_STATE_NO_RESPOND:
2168 		case NODE_STATE_POWER_SAVE:
2169 		case NODE_STATE_POWER_UP:
2170 		case (NODE_STATE_POWER_SAVE | NODE_STATE_POWER_UP):
2171 		case NODE_STATE_UNDRAIN:
2172 			return true;
2173 
2174 		case NODE_RESUME:
2175 			if ((base_state == NODE_STATE_DOWN)   ||
2176 			    (base_state == NODE_STATE_FUTURE) ||
2177 			    (node_flags & NODE_STATE_DRAIN)   ||
2178 			    (node_flags & NODE_STATE_FAIL)    ||
2179 			    (node_flags & NODE_STATE_REBOOT)  ||
2180 			    (node_flags & NODE_STATE_POWERING_DOWN))
2181 				return true;
2182 			break;
2183 
2184 		case NODE_STATE_CANCEL_REBOOT:
2185 			if (node_flags & NODE_STATE_REBOOT)
2186 				return true;
2187 			break;
2188 
2189 		case NODE_STATE_FUTURE:
2190 			if ((base_state == NODE_STATE_DOWN) ||
2191 			    (base_state == NODE_STATE_IDLE))
2192 				return true;
2193 			break;
2194 
2195 		case NODE_STATE_IDLE:
2196 			if ((base_state == NODE_STATE_DOWN) ||
2197 			    (base_state == NODE_STATE_IDLE))
2198 				return true;
2199 			break;
2200 
2201 		case NODE_STATE_ALLOCATED:
2202 			if (base_state == NODE_STATE_ALLOCATED)
2203 				return true;
2204 			break;
2205 
2206 		default:	/* All others invalid */
2207 			break;
2208 	}
2209 
2210 	return false;
2211 }
2212 
_build_node_spec_bitmap(node_record_t * node_ptr)2213 static int _build_node_spec_bitmap(node_record_t *node_ptr)
2214 {
2215 	uint32_t c, coff, size;
2216 	int *cpu_spec_array;
2217 	uint i, node_inx;
2218 
2219 	if (node_ptr->threads == 0) {
2220 		error("Node %s has invalid thread per core count (%u)",
2221 		      node_ptr->name, node_ptr->threads);
2222 		return SLURM_ERROR;
2223 	}
2224 
2225 	if (!node_ptr->cpu_spec_list)
2226 		return SLURM_SUCCESS;
2227 	node_inx = node_ptr - node_record_table_ptr;
2228 	c = cr_get_coremap_offset(node_inx);
2229 	coff = cr_get_coremap_offset(node_inx+1);
2230 	size = coff - c;
2231 	FREE_NULL_BITMAP(node_ptr->node_spec_bitmap);
2232 	node_ptr->node_spec_bitmap = bit_alloc(size);
2233 	bit_nset(node_ptr->node_spec_bitmap, 0, size-1);
2234 
2235 	/* remove node's specialized cpus now */
2236 	cpu_spec_array = bitfmt2int(node_ptr->cpu_spec_list);
2237 	i = 0;
2238 	while (cpu_spec_array[i] != -1) {
2239 		bit_nclear(node_ptr->node_spec_bitmap,
2240 			   (cpu_spec_array[i] / node_ptr->threads),
2241 			   (cpu_spec_array[i + 1] / node_ptr->threads));
2242 		i += 2;
2243 	}
2244 	xfree(cpu_spec_array);
2245 	return SLURM_SUCCESS;
2246 }
2247 
update_node_record_acct_gather_data(acct_gather_node_resp_msg_t * msg)2248 extern int update_node_record_acct_gather_data(
2249 	acct_gather_node_resp_msg_t *msg)
2250 {
2251 	node_record_t *node_ptr;
2252 
2253 	node_ptr = find_node_record(msg->node_name);
2254 	if (node_ptr == NULL)
2255 		return ENOENT;
2256 
2257 	memcpy(node_ptr->energy, msg->energy, sizeof(acct_gather_energy_t));
2258 
2259 	return SLURM_SUCCESS;
2260 }
2261 
2262 /* A node's socket/core configuration has changed could be due to KNL NUMA
2263  * mode change and reboot. Update this node's config record, splitting an
2264  * existing record if needed. */
_split_node_config(node_record_t * node_ptr,slurm_node_registration_status_msg_t * reg_msg)2265 static void _split_node_config(node_record_t *node_ptr,
2266 			       slurm_node_registration_status_msg_t *reg_msg)
2267 {
2268 	config_record_t *config_ptr, *new_config_ptr;
2269 	int node_inx;
2270 
2271 	if (!node_ptr)
2272 		return;
2273 	config_ptr = node_ptr->config_ptr;
2274 	if (!config_ptr)
2275 		return;
2276 
2277 	node_inx = node_ptr - node_record_table_ptr;
2278 	if ((bit_set_count(config_ptr->node_bitmap) > 1) &&
2279 	    bit_test(config_ptr->node_bitmap, node_inx)) {
2280 		new_config_ptr = create_config_record();
2281 		memcpy(new_config_ptr, config_ptr, sizeof(config_record_t));
2282 		new_config_ptr->cpu_spec_list =
2283 			xstrdup(config_ptr->cpu_spec_list);
2284 		new_config_ptr->feature = xstrdup(config_ptr->feature);
2285 		new_config_ptr->gres = xstrdup(config_ptr->gres);
2286 		bit_clear(config_ptr->node_bitmap, node_inx);
2287 		xfree(config_ptr->nodes);
2288 		config_ptr->nodes = bitmap2node_name(config_ptr->node_bitmap);
2289 		new_config_ptr->node_bitmap = bit_alloc(node_record_count);
2290 		bit_set(new_config_ptr->node_bitmap, node_inx);
2291 		new_config_ptr->nodes = xstrdup(node_ptr->name);
2292 		node_ptr->config_ptr = new_config_ptr;
2293 		config_ptr = new_config_ptr;
2294 	}
2295 	config_ptr->cores = reg_msg->cores;
2296 	config_ptr->sockets = reg_msg->sockets;
2297 }
2298 
2299 /*
2300  * validate_node_specs - validate the node's specifications as valid,
2301  *	if not set state to down, in any case update last_response
2302  * IN reg_msg - node registration message
2303  * IN protocol_version - Version of Slurm on this node
2304  * OUT newly_up - set if node newly brought into service
2305  * RET 0 if no error, ENOENT if no such node, EINVAL if values too low
2306  */
validate_node_specs(slurm_node_registration_status_msg_t * reg_msg,uint16_t protocol_version,bool * newly_up)2307 extern int validate_node_specs(slurm_node_registration_status_msg_t *reg_msg,
2308 			       uint16_t protocol_version, bool *newly_up)
2309 {
2310 	int error_code, i, node_inx;
2311 	config_record_t *config_ptr;
2312 	node_record_t *node_ptr;
2313 	char *reason_down = NULL;
2314 	char *orig_features = NULL, *orig_features_act = NULL;
2315 	uint32_t node_flags;
2316 	time_t now = time(NULL);
2317 	bool orig_node_avail;
2318 	static uint32_t cr_flag = NO_VAL;
2319 	static int node_features_cnt = 0;
2320 	int *cpu_spec_array;
2321 	int sockets1, sockets2;	/* total sockets on node */
2322 	int cores1, cores2;	/* total cores on node */
2323 	int threads1, threads2;	/* total threads on node */
2324 
2325 	xassert(verify_lock(CONF_LOCK, READ_LOCK));
2326 
2327 	node_ptr = find_node_record(reg_msg->node_name);
2328 	if (node_ptr == NULL)
2329 		return ENOENT;
2330 	node_inx = node_ptr - node_record_table_ptr;
2331 	orig_node_avail = bit_test(avail_node_bitmap, node_inx);
2332 
2333 	config_ptr = node_ptr->config_ptr;
2334 	error_code = SLURM_SUCCESS;
2335 
2336 	node_ptr->protocol_version = protocol_version;
2337 	xfree(node_ptr->version);
2338 	node_ptr->version = reg_msg->version;
2339 	reg_msg->version = NULL;
2340 
2341 	if (waiting_for_node_boot(node_ptr))
2342 		return SLURM_SUCCESS;
2343 	bit_clear(booting_node_bitmap, node_inx);
2344 
2345 	if (cr_flag == NO_VAL) {
2346 		cr_flag = 0;  /* call is no-op for select/linear and others */
2347 		if (select_g_get_info_from_plugin(SELECT_CR_PLUGIN,
2348 						  NULL, &cr_flag)) {
2349 			cr_flag = NO_VAL;	/* error */
2350 		}
2351 		if (cr_flag == SELECT_TYPE_CONS_TRES)
2352 			cr_flag = SELECT_TYPE_CONS_RES;
2353 		node_features_cnt = node_features_g_count();
2354 	}
2355 
2356 	if (reg_msg->features_avail || reg_msg->features_active) {
2357 		char *sep = "";
2358 		orig_features = xstrdup(node_ptr->features);
2359 		if (orig_features && orig_features[0])
2360 			sep = ",";
2361 		if (reg_msg->features_avail) {
2362 			xstrfmtcat(orig_features, "%s%s", sep,
2363 				   reg_msg->features_avail);
2364 		}
2365 		if (node_ptr->features_act)
2366 			orig_features_act = xstrdup(node_ptr->features_act);
2367 		else
2368 			orig_features_act = xstrdup(node_ptr->features);
2369 	}
2370 	if (reg_msg->features_avail) {
2371 		if (reg_msg->features_active && !node_ptr->features_act) {
2372 			node_ptr->features_act = node_ptr->features;
2373 			node_ptr->features = NULL;
2374 		} else {
2375 			xfree(node_ptr->features);
2376 		}
2377 		node_ptr->features = node_features_g_node_xlate(
2378 					reg_msg->features_avail,
2379 					orig_features, orig_features,
2380 					node_inx);
2381 		(void) _update_node_avail_features(node_ptr->name,
2382 						   node_ptr->features,
2383 						   FEATURE_MODE_IND);
2384 	}
2385 	if (reg_msg->features_active) {
2386 		char *tmp_feature;
2387 		tmp_feature = node_features_g_node_xlate(
2388 						reg_msg->features_active,
2389 						orig_features_act,
2390 						orig_features,
2391 						node_inx);
2392 		xfree(node_ptr->features_act);
2393 		node_ptr->features_act = tmp_feature;
2394 		(void) _update_node_active_features(node_ptr->name,
2395 						    node_ptr->features_act,
2396 						    FEATURE_MODE_IND);
2397 	}
2398 	xfree(orig_features);
2399 	xfree(orig_features_act);
2400 
2401 	sockets1 = reg_msg->sockets;
2402 	cores1   = sockets1 * reg_msg->cores;
2403 	threads1 = cores1   * reg_msg->threads;
2404 	if (gres_plugin_node_config_unpack(reg_msg->gres_info,
2405 					   node_ptr->name) != SLURM_SUCCESS) {
2406 		error_code = SLURM_ERROR;
2407 		xstrcat(reason_down, "Could not unpack gres data");
2408 	} else if (gres_plugin_node_config_validate(
2409 				node_ptr->name, config_ptr->gres,
2410 				&node_ptr->gres, &node_ptr->gres_list,
2411 				reg_msg->threads, reg_msg->cores,
2412 				reg_msg->sockets,
2413 				slurmctld_conf.conf_flags & CTL_CONF_OR,
2414 				&reason_down)
2415 		   != SLURM_SUCCESS) {
2416 		error_code = EINVAL;
2417 		/* reason_down set in function above */
2418 	}
2419 	gres_plugin_node_state_log(node_ptr->gres_list, node_ptr->name);
2420 
2421 	if (!(slurmctld_conf.conf_flags & CTL_CONF_OR)) {
2422 		/* sockets1, cores1, and threads1 are set above */
2423 		sockets2 = config_ptr->sockets;
2424 		cores2   = sockets2 * config_ptr->cores;
2425 		threads2 = cores2   * config_ptr->threads;
2426 
2427 		if (threads1 < threads2) {
2428 			error("Node %s has low socket*core*thread count "
2429 			      "(%d < %d)",
2430 			      reg_msg->node_name, threads1, threads2);
2431 			error_code = EINVAL;
2432 			if (reason_down)
2433 				xstrcat(reason_down, ", ");
2434 			xstrcat(reason_down, "Low socket*core*thread count");
2435 		}
2436 
2437 		if (reg_msg->cpus < config_ptr->cpus) {
2438 			error("Node %s has low cpu count (%u < %u)",
2439 			      reg_msg->node_name, reg_msg->cpus,
2440 			      config_ptr->cpus);
2441 			error_code  = EINVAL;
2442 			if (reason_down)
2443 				xstrcat(reason_down, ", ");
2444 			xstrcat(reason_down, "Low CPUs");
2445 		}
2446 
2447 		if ((error_code == SLURM_SUCCESS) &&
2448 		    (cr_flag == SELECT_TYPE_CONS_RES) &&
2449 		    (node_features_cnt > 0) &&
2450 		    (reg_msg->sockets != config_ptr->sockets) &&
2451 		    (reg_msg->cores   != config_ptr->cores) &&
2452 		    ((reg_msg->sockets * reg_msg->cores) ==
2453 		     (config_ptr->sockets * config_ptr->cores))) {
2454 			_split_node_config(node_ptr, reg_msg);
2455 		}
2456 	}
2457 	if (reg_msg->boards > reg_msg->sockets) {
2458 		error("Node %s has more boards than sockets (%u > %u), setting board count to 1",
2459 		      reg_msg->node_name, reg_msg->boards, reg_msg->sockets);
2460 		reg_msg->boards = 1;
2461 	}
2462 
2463 	/* reset partition and node config (in that order) */
2464 
2465 	if (error_code == SLURM_SUCCESS) {
2466 		node_ptr->boards  = reg_msg->boards;
2467 		node_ptr->sockets = reg_msg->sockets;
2468 		node_ptr->cores   = reg_msg->cores;
2469 		node_ptr->threads = reg_msg->threads;
2470 		node_ptr->cpus    = reg_msg->cpus;
2471 	}
2472 	if (!(slurmctld_conf.conf_flags & CTL_CONF_OR)) {
2473 		if (reg_msg->real_memory < config_ptr->real_memory) {
2474 			error("Node %s has low real_memory size (%"PRIu64" < %"PRIu64")",
2475 			      reg_msg->node_name, reg_msg->real_memory,
2476 			      config_ptr->real_memory);
2477 			error_code  = EINVAL;
2478 			if (reason_down)
2479 				xstrcat(reason_down, ", ");
2480 			xstrcat(reason_down, "Low RealMemory");
2481 		}
2482 
2483 		if (reg_msg->tmp_disk < config_ptr->tmp_disk) {
2484 			error("Node %s has low tmp_disk size (%u < %u)",
2485 			      reg_msg->node_name, reg_msg->tmp_disk,
2486 			      config_ptr->tmp_disk);
2487 			error_code = EINVAL;
2488 			if (reason_down)
2489 				xstrcat(reason_down, ", ");
2490 			xstrcat(reason_down, "Low TmpDisk");
2491 		}
2492 	}
2493 
2494 	node_ptr->real_memory = reg_msg->real_memory;
2495 	node_ptr->tmp_disk = reg_msg->tmp_disk;
2496 
2497 	if (reg_msg->cpu_spec_list != NULL) {
2498 		xfree(node_ptr->cpu_spec_list);
2499 		node_ptr->cpu_spec_list = reg_msg->cpu_spec_list;
2500 		reg_msg->cpu_spec_list = NULL;	/* Nothing left to free */
2501 
2502 		cpu_spec_array = bitfmt2int(node_ptr->cpu_spec_list);
2503 		i = 0;
2504 		node_ptr->core_spec_cnt = 0;
2505 		while (cpu_spec_array[i] != -1) {
2506 			node_ptr->core_spec_cnt += (cpu_spec_array[i + 1] -
2507 				cpu_spec_array[i]) + 1;
2508 			i += 2;
2509 		}
2510 		if (node_ptr->threads)
2511 			node_ptr->core_spec_cnt /= node_ptr->threads;
2512 		xfree(cpu_spec_array);
2513 		if (_build_node_spec_bitmap(node_ptr) != SLURM_SUCCESS)
2514 			error_code = EINVAL;
2515 	}
2516 
2517 	xfree(node_ptr->arch);
2518 	node_ptr->arch = reg_msg->arch;
2519 	reg_msg->arch = NULL;	/* Nothing left to free */
2520 
2521 	xfree(node_ptr->os);
2522 	node_ptr->os = reg_msg->os;
2523 	reg_msg->os = NULL;	/* Nothing left to free */
2524 
2525 	if (node_ptr->cpu_load != reg_msg->cpu_load) {
2526 		node_ptr->cpu_load = reg_msg->cpu_load;
2527 		node_ptr->cpu_load_time = now;
2528 		last_node_update = now;
2529 	}
2530 	if (node_ptr->free_mem != reg_msg->free_mem) {
2531 		node_ptr->free_mem = reg_msg->free_mem;
2532 		node_ptr->free_mem_time = now;
2533 		last_node_update = now;
2534 	}
2535 
2536 	if (IS_NODE_NO_RESPOND(node_ptr) ||
2537 	    IS_NODE_POWER_UP(node_ptr) ||
2538 	    IS_NODE_POWER_SAVE(node_ptr)) {
2539 		info("Node %s now responding", node_ptr->name);
2540 
2541 		/*
2542 		 * Set last_idle in case that the node came up out of band or
2543 		 * came up after ResumeTimeout so that it can be suspended at a
2544 		 * later point.
2545 		 */
2546 		if (IS_NODE_POWER_UP(node_ptr) || IS_NODE_POWER_SAVE(node_ptr))
2547 			node_ptr->last_idle = now;
2548 
2549 		node_ptr->node_state &= (~NODE_STATE_NO_RESPOND);
2550 		node_ptr->node_state &= (~NODE_STATE_POWER_UP);
2551 		node_ptr->node_state &= (~NODE_STATE_POWER_SAVE);
2552 		node_ptr->node_state &= (~NODE_STATE_POWERING_DOWN);
2553 		if (!is_node_in_maint_reservation(node_inx))
2554 			node_ptr->node_state &= (~NODE_STATE_MAINT);
2555 
2556 		bit_clear(power_node_bitmap, node_inx);
2557 
2558 		last_node_update = now;
2559 	}
2560 
2561 	node_flags = node_ptr->node_state & NODE_STATE_FLAGS;
2562 
2563 	if (node_ptr->last_response &&
2564 	    (node_ptr->boot_time > node_ptr->last_response) &&
2565 	    !IS_NODE_UNKNOWN(node_ptr)) {	/* Node just rebooted */
2566 		(void) node_features_g_get_node(node_ptr->name);
2567 	}
2568 
2569 	if (error_code) {
2570 		if (!IS_NODE_DOWN(node_ptr)
2571 			&& !IS_NODE_DRAIN(node_ptr)
2572 			&& ! IS_NODE_FAIL(node_ptr)) {
2573 			error ("Setting node %s state to DRAIN",
2574 				   reg_msg->node_name);
2575 			drain_nodes(reg_msg->node_name,
2576 						reason_down,
2577 						slurmctld_conf.slurm_user_id);
2578 		}
2579 		last_node_update = time (NULL);
2580 	} else if (reg_msg->status == ESLURMD_PROLOG_FAILED
2581 		   || reg_msg->status == ESLURMD_SETUP_ENVIRONMENT_ERROR) {
2582 		if (!IS_NODE_DRAIN(node_ptr) && !IS_NODE_FAIL(node_ptr)) {
2583 			char *reason;
2584 			error("%s: Prolog or job env setup failure on node %s, "
2585 			      "draining the node",
2586 			      __func__, reg_msg->node_name);
2587 			if (reg_msg->status == ESLURMD_PROLOG_FAILED)
2588 				reason = "Prolog error";
2589 			else
2590 				reason = "Job env setup error";
2591 			drain_nodes(reg_msg->node_name, reason,
2592 				    slurmctld_conf.slurm_user_id);
2593 			last_node_update = time (NULL);
2594 		}
2595 	} else {
2596 		if (IS_NODE_UNKNOWN(node_ptr) || IS_NODE_FUTURE(node_ptr)) {
2597 			bool unknown = 0;
2598 
2599 			if (IS_NODE_UNKNOWN(node_ptr))
2600 				unknown = 1;
2601 
2602 			debug("validate_node_specs: node %s registered with "
2603 			      "%u jobs",
2604 			      reg_msg->node_name,reg_msg->job_count);
2605 			if (IS_NODE_FUTURE(node_ptr)) {
2606 				if (IS_NODE_MAINT(node_ptr) &&
2607 				    !is_node_in_maint_reservation(node_inx))
2608 					node_flags &= (~NODE_STATE_MAINT);
2609 				node_flags &= (~NODE_STATE_REBOOT);
2610 			}
2611 			if (reg_msg->job_count) {
2612 				node_ptr->node_state = NODE_STATE_ALLOCATED |
2613 					node_flags;
2614 			} else {
2615 				node_ptr->node_state = NODE_STATE_IDLE |
2616 					node_flags;
2617 				node_ptr->last_idle = now;
2618 			}
2619 			last_node_update = now;
2620 
2621 			/* don't send this on a slurmctld unless needed */
2622 			if (unknown && slurmctld_init_db
2623 			    && !IS_NODE_DRAIN(node_ptr)
2624 			    && !IS_NODE_FAIL(node_ptr)) {
2625 				/* reason information is handled in
2626 				   clusteracct_storage_g_node_up()
2627 				*/
2628 				clusteracct_storage_g_node_up(
2629 					acct_db_conn, node_ptr, now);
2630 			}
2631 		} else if (IS_NODE_DOWN(node_ptr) &&
2632 			   ((slurmctld_conf.ret2service == 2) ||
2633 			    IS_NODE_REBOOT(node_ptr) ||
2634 			    ((slurmctld_conf.ret2service == 1) &&
2635 			     !xstrcmp(node_ptr->reason, "Not responding") &&
2636 			     (node_ptr->boot_time <
2637 			      node_ptr->last_response)))) {
2638 			node_flags &= (~NODE_STATE_REBOOT);
2639 			if (xstrstr(node_ptr->reason, "Reboot ASAP") &&
2640 			    (node_ptr->next_state == NO_VAL)) {
2641 				if (node_ptr->next_state != NODE_STATE_DOWN) {
2642 					xfree(node_ptr->reason);
2643 					node_ptr->reason_time = 0;
2644 					node_ptr->reason_uid = 0;
2645 				}
2646 				node_flags &= (~NODE_STATE_DRAIN);
2647 			}
2648 			if (node_ptr->next_state != NO_VAL)
2649 				node_flags &= (~NODE_STATE_DRAIN);
2650 
2651 			if (node_ptr->next_state == NODE_STATE_DOWN) {
2652 				node_ptr->node_state = node_ptr->next_state |
2653 						       node_flags;
2654 				if (node_ptr->reason) {
2655 					xstrcat(node_ptr->reason,
2656 						" : reboot complete");
2657 				}
2658 			} else if (reg_msg->job_count) {
2659 				node_ptr->node_state = NODE_STATE_ALLOCATED |
2660 						       node_flags;
2661 			} else {
2662 				node_ptr->node_state = NODE_STATE_IDLE |
2663 						       node_flags;
2664 				node_ptr->last_idle = now;
2665 			}
2666 			node_ptr->next_state = NO_VAL;
2667 			bit_clear(rs_node_bitmap, node_inx);
2668 
2669 			info("node %s returned to service",
2670 			     reg_msg->node_name);
2671 			trigger_node_up(node_ptr);
2672 			last_node_update = now;
2673 			if (!IS_NODE_DRAIN(node_ptr)
2674 			    && !IS_NODE_DOWN(node_ptr)
2675 			    && !IS_NODE_FAIL(node_ptr)) {
2676 				/* reason information is handled in
2677 				 * clusteracct_storage_g_node_up() */
2678 				clusteracct_storage_g_node_up(
2679 					acct_db_conn, node_ptr, now);
2680 			}
2681 		} else if (node_ptr->last_response &&
2682 			   (node_ptr->boot_time > node_ptr->last_response) &&
2683 			   (slurmctld_conf.ret2service != 2)) {
2684 			if (!node_ptr->reason ||
2685 			    (node_ptr->reason &&
2686 			     !xstrcmp(node_ptr->reason, "Not responding"))) {
2687 				if (node_ptr->reason)
2688 					xfree(node_ptr->reason);
2689 				node_ptr->reason_time = now;
2690 				node_ptr->reason_uid =
2691 					slurmctld_conf.slurm_user_id;
2692 				node_ptr->reason = xstrdup(
2693 					"Node unexpectedly rebooted");
2694 			}
2695 			info("%s: Node %s unexpectedly rebooted boot_time=%u last response=%u",
2696 			     __func__, reg_msg->node_name,
2697 			     (uint32_t)node_ptr->boot_time,
2698 			     (uint32_t)node_ptr->last_response);
2699 			_make_node_down(node_ptr, now);
2700 			kill_running_job_by_node_name(reg_msg->node_name);
2701 			last_node_update = now;
2702 			reg_msg->job_count = 0;
2703 		} else if (IS_NODE_ALLOCATED(node_ptr) &&
2704 			   (reg_msg->job_count == 0)) {	/* job vanished */
2705 			node_ptr->node_state = NODE_STATE_IDLE | node_flags;
2706 			node_ptr->last_idle = now;
2707 			last_node_update = now;
2708 		} else if (IS_NODE_COMPLETING(node_ptr) &&
2709 			   (reg_msg->job_count == 0)) {	/* job already done */
2710 			node_ptr->node_state &= (~NODE_STATE_COMPLETING);
2711 			last_node_update = now;
2712 			bit_clear(cg_node_bitmap, node_inx);
2713 		} else if (IS_NODE_IDLE(node_ptr) &&
2714 			   (reg_msg->job_count != 0)) {
2715 			if (node_ptr->run_job_cnt != 0) {
2716 				node_ptr->node_state = NODE_STATE_ALLOCATED |
2717 						       node_flags;
2718 				error("Invalid state for node %s, was IDLE "
2719 			      	      "with %u running jobs",
2720 			      	      node_ptr->name, reg_msg->job_count);
2721 			}
2722 			/*
2723 			 * there must be completing job(s) on this node since
2724 			 * reg_msg->job_count was set (run_job_cnt +
2725 			 * comp_job_cnt) in validate_jobs_on_node()
2726 			 */
2727 			if (node_ptr->comp_job_cnt != 0) {
2728 				node_ptr->node_state |= NODE_STATE_COMPLETING;
2729 				bit_set(cg_node_bitmap, node_inx);
2730 			}
2731 			last_node_update = now;
2732 		}
2733 		if (IS_NODE_IDLE(node_ptr)) {
2734 			node_ptr->owner = NO_VAL;
2735 			xfree(node_ptr->mcs_label);
2736 		}
2737 
2738 		select_g_update_node_config(node_inx);
2739 		_sync_bitmaps(node_ptr, reg_msg->job_count);
2740 	}
2741 
2742 	xfree(reason_down);
2743 	if (reg_msg->energy)
2744 		memcpy(node_ptr->energy, reg_msg->energy,
2745 		       sizeof(acct_gather_energy_t));
2746 
2747 	node_ptr->last_response = MAX(now, node_ptr->last_response);
2748 	node_ptr->boot_req_time = (time_t) 0;
2749 
2750 	*newly_up = (!orig_node_avail && bit_test(avail_node_bitmap, node_inx));
2751 
2752 	return error_code;
2753 }
2754 
_front_end_reg(slurm_node_registration_status_msg_t * reg_msg)2755 static front_end_record_t * _front_end_reg(
2756 		slurm_node_registration_status_msg_t *reg_msg)
2757 {
2758 	front_end_record_t *front_end_ptr;
2759 	uint32_t state_base, state_flags;
2760 	time_t now = time(NULL);
2761 
2762 	debug2("name:%s boot_time:%u up_time:%u",
2763 	       reg_msg->node_name, (unsigned int) reg_msg->slurmd_start_time,
2764 	       reg_msg->up_time);
2765 
2766 	front_end_ptr = find_front_end_record(reg_msg->node_name);
2767 	if (front_end_ptr == NULL) {
2768 		error("Registration message from unknown node %s",
2769 		      reg_msg->node_name);
2770 		return NULL;
2771 	}
2772 
2773 	front_end_ptr->boot_time = now - reg_msg->up_time;
2774 	if (front_end_ptr->last_response &&
2775 	    (front_end_ptr->boot_time > front_end_ptr->last_response)) {
2776 		info("front end %s unexpectedly rebooted, "
2777 		     "killing all previously running jobs running on it.",
2778 		     reg_msg->node_name);
2779 		(void) kill_job_by_front_end_name(front_end_ptr->name);
2780 		reg_msg->job_count = 0;
2781 	}
2782 
2783 	front_end_ptr->last_response = MAX(now, front_end_ptr->last_response);
2784 	front_end_ptr->slurmd_start_time = reg_msg->slurmd_start_time;
2785 	state_base  = front_end_ptr->node_state & JOB_STATE_BASE;
2786 	state_flags = front_end_ptr->node_state & JOB_STATE_FLAGS;
2787 	if ((state_base == NODE_STATE_DOWN) && (front_end_ptr->reason) &&
2788 	    (!xstrncmp(front_end_ptr->reason, "Not responding", 14))) {
2789 		error("front end node %s returned to service",
2790 		      reg_msg->node_name);
2791 		state_base = NODE_STATE_IDLE;
2792 		xfree(front_end_ptr->reason);
2793 		front_end_ptr->reason_time = (time_t) 0;
2794 		front_end_ptr->reason_uid = 0;
2795 	}
2796 	if (state_base == NODE_STATE_UNKNOWN)
2797 		state_base = NODE_STATE_IDLE;
2798 
2799 	state_flags &= (~NODE_STATE_NO_RESPOND);
2800 
2801 	front_end_ptr->node_state = state_base | state_flags;
2802 	last_front_end_update = now;
2803 	return front_end_ptr;
2804 }
2805 
_build_step_id(char * buf,int buf_len,uint32_t step_id)2806 static char *_build_step_id(char *buf, int buf_len, uint32_t step_id)
2807 {
2808 	if (step_id == SLURM_BATCH_SCRIPT)
2809 		snprintf(buf, buf_len, "StepId=Batch");
2810 	else
2811 		snprintf(buf, buf_len, "StepId=%u", step_id);
2812 	return buf;
2813 }
2814 
2815 /*
2816  * validate_nodes_via_front_end - validate all nodes on a cluster as having
2817  *	a valid configuration as soon as the front-end registers. Individual
2818  *	nodes will not register with this configuration
2819  * IN reg_msg - node registration message
2820  * IN protocol_version - Version of Slurm on this node
2821  * OUT newly_up - set if node newly brought into service
2822  * RET 0 if no error, Slurm error code otherwise
2823  */
validate_nodes_via_front_end(slurm_node_registration_status_msg_t * reg_msg,uint16_t protocol_version,bool * newly_up)2824 extern int validate_nodes_via_front_end(
2825 		slurm_node_registration_status_msg_t *reg_msg,
2826 		uint16_t protocol_version, bool *newly_up)
2827 {
2828 	int error_code = 0, i, j, rc;
2829 	bool update_node_state = false;
2830 	job_record_t *job_ptr;
2831 	config_record_t *config_ptr;
2832 	node_record_t *node_ptr;
2833 	time_t now = time(NULL);
2834 	ListIterator job_iterator;
2835 	hostlist_t reg_hostlist = NULL;
2836 	char *host_str = NULL, *reason_down = NULL;
2837 	uint32_t node_flags;
2838 	front_end_record_t *front_end_ptr;
2839 	char step_str[64];
2840 
2841 	xassert(verify_lock(CONF_LOCK, READ_LOCK));
2842 	xassert(verify_lock(JOB_LOCK, WRITE_LOCK));
2843 	xassert(verify_lock(FED_LOCK, READ_LOCK));
2844 
2845 	if (reg_msg->up_time > now) {
2846 		error("Node up_time on %s is invalid: %u>%u",
2847 		      reg_msg->node_name, reg_msg->up_time, (uint32_t) now);
2848 		reg_msg->up_time = 0;
2849 	}
2850 
2851 	front_end_ptr = _front_end_reg(reg_msg);
2852 	if (front_end_ptr == NULL)
2853 		return ESLURM_INVALID_NODE_NAME;
2854 
2855 	front_end_ptr->protocol_version = protocol_version;
2856 	xfree(front_end_ptr->version);
2857 	front_end_ptr->version = reg_msg->version;
2858 	reg_msg->version = NULL;
2859 	*newly_up = false;
2860 
2861 	if (reg_msg->status == ESLURMD_PROLOG_FAILED) {
2862 		error("Prolog failed on node %s", reg_msg->node_name);
2863 		/* Do NOT set the node DOWN here. Unlike non-front-end systems,
2864 		 * this failure is likely due to some problem in the underlying
2865 		 * infrastructure (e.g. the block failed to boot). */
2866 		/* set_front_end_down(front_end_ptr, "Prolog failed"); */
2867 	}
2868 
2869 	/* First validate the job info */
2870 	for (i = 0; i < reg_msg->job_count; i++) {
2871 		if ( (reg_msg->job_id[i] >= MIN_NOALLOC_JOBID) &&
2872 		     (reg_msg->job_id[i] <= MAX_NOALLOC_JOBID) ) {
2873 			info("NoAllocate JobId=%u %s reported",
2874 			     reg_msg->job_id[i],
2875 			     _build_step_id(step_str, sizeof(step_str),
2876 					    reg_msg->step_id[i]));
2877 			continue;
2878 		}
2879 
2880 		job_ptr = find_job_record(reg_msg->job_id[i]);
2881 		node_ptr = node_record_table_ptr;
2882 		if (job_ptr && job_ptr->node_bitmap &&
2883 		    ((j = bit_ffs(job_ptr->node_bitmap)) >= 0))
2884 			node_ptr += j;
2885 
2886 		if (job_ptr == NULL) {
2887 			error("Orphan JobId=%u %s reported on node %s",
2888 			      reg_msg->job_id[i],
2889 			      _build_step_id(step_str, sizeof(step_str),
2890 					     reg_msg->step_id[i]),
2891 			      front_end_ptr->name);
2892 			abort_job_on_node(reg_msg->job_id[i],
2893 					  job_ptr, front_end_ptr->name);
2894 			continue;
2895 		} else if (job_ptr->batch_host == NULL) {
2896 			error("Resetting NULL batch_host of JobId=%u to %s",
2897 			      reg_msg->job_id[i], front_end_ptr->name);
2898 			job_ptr->batch_host = xstrdup(front_end_ptr->name);
2899 		}
2900 
2901 
2902 		if (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr)) {
2903 			debug3("Registered %pJ %s on %s",
2904 			       job_ptr,
2905 			       _build_step_id(step_str, sizeof(step_str),
2906 					     reg_msg->step_id[i]),
2907 			       front_end_ptr->name);
2908 			if (job_ptr->batch_flag) {
2909 				/* NOTE: Used for purging defunct batch jobs */
2910 				job_ptr->time_last_active = now;
2911 			}
2912 		}
2913 
2914 		else if (IS_JOB_COMPLETING(job_ptr)) {
2915 			/*
2916 			 * Re-send kill request as needed,
2917 			 * not necessarily an error
2918 			 */
2919 			kill_job_on_node(job_ptr, node_ptr);
2920 		}
2921 
2922 		else if (IS_JOB_PENDING(job_ptr)) {
2923 			/* Typically indicates a job requeue and the hung
2924 			 * slurmd that went DOWN is now responding */
2925 			error("Registered PENDING %pJ %s on %s",
2926 			      job_ptr,
2927 			      _build_step_id(step_str, sizeof(step_str),
2928 					     reg_msg->step_id[i]),
2929 			      front_end_ptr->name);
2930 			abort_job_on_node(reg_msg->job_id[i], job_ptr,
2931 					  front_end_ptr->name);
2932 		}
2933 
2934 		else if (difftime(now, job_ptr->end_time) <
2935 			 slurm_get_msg_timeout()) {	/* Race condition */
2936 			debug("Registered newly completed %pJ %s on %s",
2937 			      job_ptr,
2938 			      _build_step_id(step_str, sizeof(step_str),
2939 					     reg_msg->step_id[i]),
2940 			      front_end_ptr->name);
2941 		}
2942 
2943 		else {		/* else job is supposed to be done */
2944 			error("Registered %pJ %s in state %s on %s",
2945 			      job_ptr,
2946 			      _build_step_id(step_str, sizeof(step_str),
2947 					     reg_msg->step_id[i]),
2948 			      job_state_string(job_ptr->job_state),
2949 			      front_end_ptr->name);
2950 			kill_job_on_node(job_ptr, node_ptr);
2951 		}
2952 	}
2953 
2954 
2955 	/* purge orphan batch jobs */
2956 	job_iterator = list_iterator_create(job_list);
2957 	while ((job_ptr = list_next(job_iterator))) {
2958 		if (!IS_JOB_RUNNING(job_ptr) ||
2959 		    IS_JOB_CONFIGURING(job_ptr) ||
2960 		    (job_ptr->batch_flag == 0))
2961 			continue;
2962 		if (job_ptr->front_end_ptr != front_end_ptr)
2963 			continue;
2964 		if (difftime(now, job_ptr->time_last_active) <= 5)
2965 			continue;
2966 		info("Killing orphan batch %pJ", job_ptr);
2967 		job_complete(job_ptr->job_id, slurmctld_conf.slurm_user_id,
2968 			     false, false, 0);
2969 	}
2970 	list_iterator_destroy(job_iterator);
2971 
2972 	(void) gres_plugin_node_config_unpack(reg_msg->gres_info,
2973 					      node_record_table_ptr->name);
2974 	for (i = 0, node_ptr = node_record_table_ptr; i < node_record_count;
2975 	     i++, node_ptr++) {
2976 		bool acct_updated = false;
2977 
2978 		config_ptr = node_ptr->config_ptr;
2979 		node_ptr->last_response = MAX(now, node_ptr->last_response);
2980 
2981 		rc = gres_plugin_node_config_validate(
2982 			node_ptr->name,
2983 			config_ptr->gres,
2984 			&node_ptr->gres,
2985 			&node_ptr->gres_list,
2986 			reg_msg->threads,
2987 			reg_msg->cores,
2988 			reg_msg->sockets,
2989 			slurmctld_conf.conf_flags & CTL_CONF_OR,
2990 			&reason_down);
2991 		if (rc) {
2992 			if (!IS_NODE_DOWN(node_ptr)) {
2993 				error("Setting node %s state to DOWN",
2994 				      node_ptr->name);
2995 			}
2996 			set_node_down(node_ptr->name, reason_down);
2997 			last_node_update = now;
2998 		}
2999 		xfree(reason_down);
3000 		gres_plugin_node_state_log(node_ptr->gres_list, node_ptr->name);
3001 
3002 		if (reg_msg->up_time) {
3003 			node_ptr->up_time = reg_msg->up_time;
3004 			node_ptr->boot_time = now - reg_msg->up_time;
3005 		}
3006 		node_ptr->slurmd_start_time = reg_msg->slurmd_start_time;
3007 
3008 		if (IS_NODE_NO_RESPOND(node_ptr)) {
3009 			update_node_state = true;
3010 			/* This is handled by the select/cray plugin */
3011 			node_ptr->node_state &= (~NODE_STATE_NO_RESPOND);
3012 			node_ptr->node_state &= (~NODE_STATE_POWER_UP);
3013 		}
3014 
3015 		if (reg_msg->status != ESLURMD_PROLOG_FAILED) {
3016 			if (reg_hostlist)
3017 				(void) hostlist_push_host(reg_hostlist,
3018 							  node_ptr->name);
3019 			else
3020 				reg_hostlist = hostlist_create(node_ptr->name);
3021 
3022 			node_flags = node_ptr->node_state & NODE_STATE_FLAGS;
3023 			if (IS_NODE_UNKNOWN(node_ptr)) {
3024 				update_node_state = true;
3025 				*newly_up = true;
3026 				if (node_ptr->run_job_cnt) {
3027 					node_ptr->node_state =
3028 						NODE_STATE_ALLOCATED |
3029 						node_flags;
3030 				} else {
3031 					node_ptr->node_state =
3032 						NODE_STATE_IDLE |
3033 						node_flags;
3034 					node_ptr->last_idle = now;
3035 				}
3036 				if (!IS_NODE_DRAIN(node_ptr) &&
3037 				    !IS_NODE_FAIL(node_ptr)) {
3038 					/* reason information is handled in
3039 					 * clusteracct_storage_g_node_up() */
3040 					clusteracct_storage_g_node_up(
3041 						acct_db_conn,
3042 						node_ptr, now);
3043 					acct_updated = true;
3044 				}
3045 			} else if (IS_NODE_DOWN(node_ptr) &&
3046 				   ((slurmctld_conf.ret2service == 2) ||
3047 				    (node_ptr->boot_req_time != 0)    ||
3048 				    ((slurmctld_conf.ret2service == 1) &&
3049 				     !xstrcmp(node_ptr->reason,
3050 					      "Not responding")))) {
3051 				update_node_state = true;
3052 				*newly_up = true;
3053 				if (node_ptr->run_job_cnt) {
3054 					node_ptr->node_state =
3055 						NODE_STATE_ALLOCATED |
3056 						node_flags;
3057 				} else {
3058 					node_ptr->node_state =
3059 						NODE_STATE_IDLE |
3060 						node_flags;
3061 					node_ptr->last_idle = now;
3062 				}
3063 				trigger_node_up(node_ptr);
3064 				if (!IS_NODE_DRAIN(node_ptr) &&
3065 				    !IS_NODE_FAIL(node_ptr)) {
3066 					/* reason information is handled in
3067 					 * clusteracct_storage_g_node_up() */
3068 					clusteracct_storage_g_node_up(
3069 						acct_db_conn,
3070 						node_ptr, now);
3071 					acct_updated = true;
3072 				}
3073 			} else if (IS_NODE_ALLOCATED(node_ptr) &&
3074 				   (node_ptr->run_job_cnt == 0)) {
3075 				/* job vanished */
3076 				update_node_state = true;
3077 				node_ptr->node_state = NODE_STATE_IDLE |
3078 					node_flags;
3079 				node_ptr->last_idle = now;
3080 			} else if (IS_NODE_COMPLETING(node_ptr) &&
3081 				   (node_ptr->comp_job_cnt == 0)) {
3082 				/* job already done */
3083 				update_node_state = true;
3084 				node_ptr->node_state &=
3085 					(~NODE_STATE_COMPLETING);
3086 				bit_clear(cg_node_bitmap, i);
3087 			} else if (IS_NODE_IDLE(node_ptr) &&
3088 				   (node_ptr->run_job_cnt != 0)) {
3089 				update_node_state = true;
3090 				node_ptr->node_state = NODE_STATE_ALLOCATED |
3091 						       node_flags;
3092 				error("Invalid state for node %s, was IDLE "
3093 				      "with %u running jobs",
3094 				      node_ptr->name, reg_msg->job_count);
3095 			}
3096 			if (IS_NODE_IDLE(node_ptr)) {
3097 				node_ptr->owner = NO_VAL;
3098 				xfree(node_ptr->mcs_label);
3099 			}
3100 
3101 			select_g_update_node_config(i);
3102 			_sync_bitmaps(node_ptr,
3103 				      (node_ptr->run_job_cnt +
3104 				       node_ptr->comp_job_cnt));
3105 		}
3106 		if (reg_msg->energy)
3107 			memcpy(node_ptr->energy, reg_msg->energy,
3108 			       sizeof(acct_gather_energy_t));
3109 
3110 		if (!acct_updated && slurmctld_init_db &&
3111 		    !IS_NODE_DOWN(node_ptr) &&
3112 		    !IS_NODE_DRAIN(node_ptr) && !IS_NODE_FAIL(node_ptr)) {
3113 			/* reason information is handled in
3114 			   clusteracct_storage_g_node_up()
3115 			*/
3116 			clusteracct_storage_g_node_up(
3117 				acct_db_conn, node_ptr, now);
3118 		}
3119 
3120 	}
3121 
3122 	if (reg_hostlist) {
3123 		hostlist_uniq(reg_hostlist);
3124 		host_str = hostlist_ranged_string_xmalloc(reg_hostlist);
3125 		debug("Nodes %s have registered", host_str);
3126 		xfree(host_str);
3127 		hostlist_destroy(reg_hostlist);
3128 	}
3129 
3130 	if (update_node_state)
3131 		last_node_update = time (NULL);
3132 	return error_code;
3133 }
3134 
3135 /* Sync idle, share, and avail_node_bitmaps for a given node */
_sync_bitmaps(node_record_t * node_ptr,int job_count)3136 static void _sync_bitmaps(node_record_t *node_ptr, int job_count)
3137 {
3138 	int node_inx = node_ptr - node_record_table_ptr;
3139 
3140 	if (job_count == 0) {
3141 		bit_set (idle_node_bitmap, node_inx);
3142 		bit_set (share_node_bitmap, node_inx);
3143 	}
3144 	if (IS_NODE_DOWN(node_ptr) || IS_NODE_DRAIN(node_ptr) ||
3145 	    IS_NODE_FAIL(node_ptr) || IS_NODE_NO_RESPOND(node_ptr))
3146 		bit_clear (avail_node_bitmap, node_inx);
3147 	else
3148 		make_node_avail(node_inx);
3149 	if (IS_NODE_DOWN(node_ptr))
3150 		bit_clear (up_node_bitmap, node_inx);
3151 	else
3152 		bit_set   (up_node_bitmap, node_inx);
3153 }
3154 
3155 #ifdef HAVE_FRONT_END
_node_did_resp(front_end_record_t * fe_ptr)3156 static void _node_did_resp(front_end_record_t *fe_ptr)
3157 {
3158 	uint32_t node_flags;
3159 	time_t now = time(NULL);
3160 
3161 	fe_ptr->last_response = MAX(now, fe_ptr->last_response);
3162 
3163 	if (IS_NODE_NO_RESPOND(fe_ptr)) {
3164 		info("Node %s now responding", fe_ptr->name);
3165 		last_front_end_update = now;
3166 		fe_ptr->node_state &= (~NODE_STATE_NO_RESPOND);
3167 	}
3168 
3169 	node_flags = fe_ptr->node_state & NODE_STATE_FLAGS;
3170 	if (IS_NODE_UNKNOWN(fe_ptr)) {
3171 		last_front_end_update = now;
3172 		fe_ptr->node_state = NODE_STATE_IDLE | node_flags;
3173 	}
3174 	if (IS_NODE_DOWN(fe_ptr) &&
3175 	    ((slurmctld_conf.ret2service == 2) ||
3176 	     ((slurmctld_conf.ret2service == 1) &&
3177 	      !xstrcmp(fe_ptr->reason, "Not responding")))) {
3178 		last_front_end_update = now;
3179 		fe_ptr->node_state = NODE_STATE_IDLE | node_flags;
3180 		info("node_did_resp: node %s returned to service",
3181 		     fe_ptr->name);
3182 		trigger_front_end_up(fe_ptr);
3183 		if (!IS_NODE_DRAIN(fe_ptr) && !IS_NODE_FAIL(fe_ptr)) {
3184 			xfree(fe_ptr->reason);
3185 			fe_ptr->reason_time = 0;
3186 			fe_ptr->reason_uid = NO_VAL;
3187 		}
3188 	}
3189 	return;
3190 }
3191 #else
_node_did_resp(node_record_t * node_ptr)3192 static void _node_did_resp(node_record_t *node_ptr)
3193 {
3194 	int node_inx;
3195 	uint32_t node_flags;
3196 	time_t now = time(NULL);
3197 
3198 	node_inx = node_ptr - node_record_table_ptr;
3199 	if (waiting_for_node_boot(node_ptr))
3200 		return;
3201 	node_ptr->last_response = MAX(now, node_ptr->last_response);
3202 	if (IS_NODE_NO_RESPOND(node_ptr) || IS_NODE_POWER_UP(node_ptr)) {
3203 		info("Node %s now responding", node_ptr->name);
3204 		node_ptr->node_state &= (~NODE_STATE_NO_RESPOND);
3205 		node_ptr->node_state &= (~NODE_STATE_POWER_UP);
3206 		if (!is_node_in_maint_reservation(node_inx))
3207 			node_ptr->node_state &= (~NODE_STATE_MAINT);
3208 		last_node_update = now;
3209 	}
3210 	node_flags = node_ptr->node_state & NODE_STATE_FLAGS;
3211 	if (IS_NODE_UNKNOWN(node_ptr)) {
3212 		node_ptr->last_idle = now;
3213 		if (node_ptr->run_job_cnt) {
3214 			node_ptr->node_state = NODE_STATE_ALLOCATED |
3215 					       node_flags;
3216 		} else
3217 			node_ptr->node_state = NODE_STATE_IDLE | node_flags;
3218 		last_node_update = now;
3219 		if (!IS_NODE_DRAIN(node_ptr) && !IS_NODE_FAIL(node_ptr)) {
3220 			clusteracct_storage_g_node_up(acct_db_conn,
3221 						      node_ptr, now);
3222 		}
3223 	}
3224 	if (IS_NODE_DOWN(node_ptr) &&
3225 	    ((slurmctld_conf.ret2service == 2) ||
3226 	     (node_ptr->boot_req_time != 0)    ||
3227 	     ((slurmctld_conf.ret2service == 1) &&
3228 	      !xstrcmp(node_ptr->reason, "Not responding")))) {
3229 		node_ptr->last_idle = now;
3230 		node_ptr->node_state = NODE_STATE_IDLE | node_flags;
3231 		info("node_did_resp: node %s returned to service",
3232 		     node_ptr->name);
3233 		trigger_node_up(node_ptr);
3234 		last_node_update = now;
3235 		if (!IS_NODE_DRAIN(node_ptr) && !IS_NODE_FAIL(node_ptr)) {
3236 			/* reason information is handled in
3237 			   clusteracct_storage_g_node_up()
3238 			*/
3239 			clusteracct_storage_g_node_up(acct_db_conn,
3240 						      node_ptr, now);
3241 		}
3242 	}
3243 	if (IS_NODE_IDLE(node_ptr) && !IS_NODE_COMPLETING(node_ptr)) {
3244 		bit_set (idle_node_bitmap, node_inx);
3245 		bit_set (share_node_bitmap, node_inx);
3246 	}
3247 	if (IS_NODE_DOWN(node_ptr) || IS_NODE_DRAIN(node_ptr) ||
3248 	    IS_NODE_FAIL(node_ptr)) {
3249 		bit_clear (avail_node_bitmap, node_inx);
3250 	} else
3251 		bit_set   (avail_node_bitmap, node_inx);
3252 	if (IS_NODE_DOWN(node_ptr))
3253 		bit_clear (up_node_bitmap, node_inx);
3254 	else
3255 		bit_set   (up_node_bitmap, node_inx);
3256 	return;
3257 }
3258 #endif
3259 
3260 /*
3261  * node_did_resp - record that the specified node is responding
3262  * IN name - name of the node
3263  */
node_did_resp(char * name)3264 void node_did_resp (char *name)
3265 {
3266 #ifdef HAVE_FRONT_END
3267 	front_end_record_t *node_ptr;
3268 	node_ptr = find_front_end_record (name);
3269 #else
3270 	node_record_t *node_ptr;
3271 	node_ptr = find_node_record (name);
3272 #endif
3273 
3274 	xassert(verify_lock(CONF_LOCK, READ_LOCK));
3275 
3276 	if (node_ptr == NULL) {
3277 		error ("node_did_resp unable to find node %s", name);
3278 		return;
3279 	}
3280 	_node_did_resp(node_ptr);
3281 	debug2("node_did_resp %s",name);
3282 }
3283 
3284 /*
3285  * node_not_resp - record that the specified node is not responding
3286  * IN name - name of the node
3287  * IN msg_time - time message was sent
3288  */
node_not_resp(char * name,time_t msg_time,slurm_msg_type_t resp_type)3289 void node_not_resp (char *name, time_t msg_time, slurm_msg_type_t resp_type)
3290 {
3291 #ifdef HAVE_FRONT_END
3292 	front_end_record_t *node_ptr;
3293 
3294 	node_ptr = find_front_end_record (name);
3295 #else
3296 	node_record_t *node_ptr;
3297 
3298 	node_ptr = find_node_record (name);
3299 #endif
3300 	if (node_ptr == NULL) {
3301 		error ("node_not_resp unable to find node %s", name);
3302 		return;
3303 	}
3304 
3305 	/* If the slurmd on the node responded with something we don't
3306 	 * want to ever set the node down, so mark that the node
3307 	 * responded, but for whatever reason there was a
3308 	 * communication error.  This makes it so we don't mark the
3309 	 * node down if the slurmd really is there (Wrong protocol
3310 	 * version or munge issue or whatever) so we don't kill
3311 	 * any running jobs.  RESPONSE_FORWARD_FAILED means we
3312 	 * couldn't contact the slurmd.
3313 	 * last_response could be in the future if boot in progress.
3314 	 */
3315 	if (resp_type != RESPONSE_FORWARD_FAILED) {
3316 		node_ptr->last_response = MAX(msg_time - 1,
3317 					      node_ptr->last_response);
3318 	}
3319 
3320 	if (!IS_NODE_DOWN(node_ptr)) {
3321 		/* Logged by node_no_resp_msg() on periodic basis */
3322 		node_ptr->not_responding = true;
3323 	}
3324 
3325 	if (IS_NODE_NO_RESPOND(node_ptr) ||
3326 	    IS_NODE_POWER_SAVE(node_ptr))
3327 		return;		/* Already known to be not responding */
3328 
3329 	if (node_ptr->last_response >= msg_time) {
3330 		debug("node_not_resp: node %s responded since msg sent",
3331 		      node_ptr->name);
3332 		return;
3333 	}
3334 
3335 	if (!IS_NODE_POWER_SAVE(node_ptr)) {
3336 		node_ptr->node_state |= NODE_STATE_NO_RESPOND;
3337 #ifdef HAVE_FRONT_END
3338 		last_front_end_update = time(NULL);
3339 #else
3340 		last_node_update = time(NULL);
3341 		bit_clear (avail_node_bitmap, (node_ptr - node_record_table_ptr));
3342 #endif
3343 	}
3344 
3345 	return;
3346 }
3347 
3348 /* For every node with the "not_responding" flag set, clear the flag
3349  * and log that the node is not responding using a hostlist expression */
node_no_resp_msg(void)3350 extern void node_no_resp_msg(void)
3351 {
3352 	int i;
3353 	node_record_t *node_ptr;
3354 	char *host_str = NULL;
3355 	hostlist_t no_resp_hostlist = NULL;
3356 
3357 	for (i = 0; i < node_record_count; i++) {
3358 		node_ptr = &node_record_table_ptr[i];
3359 		if (!node_ptr->not_responding ||
3360 		    IS_NODE_POWER_SAVE(node_ptr) ||
3361 		    IS_NODE_POWER_UP(node_ptr))
3362 			continue;
3363 		if (no_resp_hostlist) {
3364 			(void) hostlist_push_host(no_resp_hostlist,
3365 						  node_ptr->name);
3366 		} else
3367 			no_resp_hostlist = hostlist_create(node_ptr->name);
3368 		node_ptr->not_responding = false;
3369  	}
3370 	if (no_resp_hostlist) {
3371 		hostlist_uniq(no_resp_hostlist);
3372 		host_str = hostlist_ranged_string_xmalloc(no_resp_hostlist);
3373 		error("Nodes %s not responding", host_str);
3374 		xfree(host_str);
3375 		hostlist_destroy(no_resp_hostlist);
3376 	}
3377 }
3378 
3379 /*
3380  * set_node_down - make the specified compute node's state DOWN and
3381  *	kill jobs as needed
3382  * IN name - name of the node
3383  * IN reason - why the node is DOWN
3384  */
set_node_down(char * name,char * reason)3385 void set_node_down (char *name, char *reason)
3386 {
3387 	node_record_t *node_ptr;
3388 
3389 	node_ptr = find_node_record (name);
3390 	if (node_ptr == NULL) {
3391 		error ("set_node_down unable to find node %s", name);
3392 		return;
3393 	}
3394 	set_node_down_ptr (node_ptr, reason);
3395 
3396 	return;
3397 }
3398 
3399 /*
3400  * set_node_down_ptr - make the specified compute node's state DOWN and
3401  *	kill jobs as needed
3402  * IN node_ptr - node_ptr to the node
3403  * IN reason - why the node is DOWN
3404  */
set_node_down_ptr(node_record_t * node_ptr,char * reason)3405 void set_node_down_ptr(node_record_t *node_ptr, char *reason)
3406 {
3407 	time_t now = time(NULL);
3408 
3409 	if ((node_ptr->reason == NULL) ||
3410 	    (xstrncmp(node_ptr->reason, "Not responding", 14) == 0)) {
3411 		xfree(node_ptr->reason);
3412 		if (reason) {
3413 			node_ptr->reason = xstrdup(reason);
3414 			node_ptr->reason_time = now;
3415 			node_ptr->reason_uid = slurmctld_conf.slurm_user_id;
3416 		} else {
3417 			node_ptr->reason_time = 0;
3418 			node_ptr->reason_uid = NO_VAL;
3419 		}
3420 	}
3421 	_make_node_down(node_ptr, now);
3422 	(void) kill_running_job_by_node_name(node_ptr->name);
3423 	_sync_bitmaps(node_ptr, 0);
3424 
3425 	return;
3426 }
3427 
3428 /*
3429  * is_node_down - determine if the specified node's state is DOWN
3430  * IN name - name of the node
3431  * RET true if node exists and is down, otherwise false
3432  */
is_node_down(char * name)3433 bool is_node_down (char *name)
3434 {
3435 	node_record_t *node_ptr;
3436 
3437 	node_ptr = find_node_record (name);
3438 	if (node_ptr == NULL) {
3439 		error ("is_node_down unable to find node %s", name);
3440 		return false;
3441 	}
3442 
3443 	if (IS_NODE_DOWN(node_ptr))
3444 		return true;
3445 	return false;
3446 }
3447 
3448 /*
3449  * is_node_resp - determine if the specified node's state is responding
3450  * IN name - name of the node
3451  * RET true if node exists and is responding, otherwise false
3452  */
is_node_resp(char * name)3453 bool is_node_resp (char *name)
3454 {
3455 #ifdef HAVE_FRONT_END
3456 	front_end_record_t *node_ptr;
3457 
3458 	node_ptr = find_front_end_record (name);
3459 #else
3460 	node_record_t *node_ptr;
3461 
3462 	node_ptr = find_node_record (name);
3463 #endif
3464 	if (node_ptr == NULL) {
3465 		error ("is_node_resp unable to find node %s", name);
3466 		return false;
3467 	}
3468 
3469 	if (IS_NODE_NO_RESPOND(node_ptr))
3470 		return false;
3471 	return true;
3472 }
3473 
3474 /*
3475  * find_first_node_record - find a record for first node in the bitmap
3476  * IN node_bitmap
3477  */
find_first_node_record(bitstr_t * node_bitmap)3478 node_record_t *find_first_node_record(bitstr_t *node_bitmap)
3479 {
3480 	int inx;
3481 
3482 	if (node_bitmap == NULL) {
3483 		error ("find_first_node_record passed null bitstring");
3484 		return NULL;
3485 	}
3486 
3487 	inx = bit_ffs (node_bitmap);
3488 	if (inx < 0)
3489 		return NULL;
3490 	else
3491 		return &node_record_table_ptr[inx];
3492 }
3493 
3494 /*
3495  * msg_to_slurmd - send given msg_type (REQUEST_RECONFIGURE or REQUEST_SHUTDOWN)
3496  * to every slurmd
3497  */
msg_to_slurmd(slurm_msg_type_t msg_type)3498 void msg_to_slurmd (slurm_msg_type_t msg_type)
3499 {
3500 	int i;
3501 	shutdown_msg_t *shutdown_req;
3502 	agent_arg_t *kill_agent_args;
3503 #ifdef HAVE_FRONT_END
3504 	front_end_record_t *front_end_ptr;
3505 #else
3506 	node_record_t *node_ptr;
3507 #endif
3508 
3509 	kill_agent_args = xmalloc (sizeof (agent_arg_t));
3510 	kill_agent_args->msg_type = msg_type;
3511 	kill_agent_args->retry = 0;
3512 	kill_agent_args->hostlist = hostlist_create(NULL);
3513 	if (msg_type == REQUEST_SHUTDOWN) {
3514  		shutdown_req = xmalloc(sizeof(shutdown_msg_t));
3515 		shutdown_req->options = 0;
3516 		kill_agent_args->msg_args = shutdown_req;
3517 	}
3518 
3519 	kill_agent_args->protocol_version = SLURM_PROTOCOL_VERSION;
3520 
3521 #ifdef HAVE_FRONT_END
3522 	for (i = 0, front_end_ptr = front_end_nodes;
3523 	     i < front_end_node_cnt; i++, front_end_ptr++) {
3524 		if (kill_agent_args->protocol_version >
3525 		    front_end_ptr->protocol_version)
3526 			kill_agent_args->protocol_version =
3527 				front_end_ptr->protocol_version;
3528 
3529 		hostlist_push_host(kill_agent_args->hostlist,
3530 				   front_end_ptr->name);
3531 		kill_agent_args->node_count++;
3532 	}
3533 #else
3534 	node_ptr = node_record_table_ptr;
3535 	for (i = 0; i < node_record_count; i++, node_ptr++) {
3536 		if (IS_NODE_FUTURE(node_ptr))
3537 			continue;
3538 		if (IS_NODE_CLOUD(node_ptr) && IS_NODE_POWER_SAVE(node_ptr))
3539 			continue;
3540 		if (kill_agent_args->protocol_version >
3541 		    node_record_table_ptr[i].protocol_version)
3542 			kill_agent_args->protocol_version =
3543 				node_record_table_ptr[i].protocol_version;
3544 		hostlist_push_host(kill_agent_args->hostlist, node_ptr->name);
3545 		kill_agent_args->node_count++;
3546 	}
3547 #endif
3548 
3549 	if (kill_agent_args->node_count == 0) {
3550 		hostlist_destroy(kill_agent_args->hostlist);
3551 		xfree (kill_agent_args);
3552 	} else {
3553 		debug ("Spawning agent msg_type=%d", msg_type);
3554 		agent_queue_request(kill_agent_args);
3555 	}
3556 }
3557 
3558 /*
3559  * Specialized version of msg_to_slurmd that handles cross-version issues
3560  * when running configless.
3561  *
3562  * Since the REQUEST_RECONFIGURE message had no body, you could get away with
3563  * sending under the oldest format of any slurmd attached to the system.
3564  *
3565  * For configless, this would mean nothing gets sent to anyone, and those
3566  * older slurmds get REQUEST_RECONFIGURE_WITH_CONFIG and ignore it.
3567  *
3568  * So explicitly split the pool into two groups.
3569  * (Note: may need to split this into three groups for future changes.)
3570  * Note: DOES NOT SUPPORT FRONTEND.
3571  */
push_reconfig_to_slurmd(void)3572 void push_reconfig_to_slurmd(void)
3573 {
3574 #ifndef HAVE_FRONT_END
3575 	agent_arg_t *new_args, *old_args;
3576 	node_record_t *node_ptr;
3577 	config_response_msg_t *config = xmalloc(sizeof(*config));
3578 
3579 	new_args = xmalloc(sizeof(*new_args));
3580 	new_args->msg_type = REQUEST_RECONFIGURE_WITH_CONFIG;
3581 	new_args->retry = 0;
3582 	new_args->hostlist = hostlist_create(NULL);
3583 	new_args->protocol_version = SLURM_PROTOCOL_VERSION;
3584 	new_args->msg_args = config;
3585 	load_config_response_msg(config, CONFIG_REQUEST_SLURMD);
3586 
3587 	old_args = xmalloc(sizeof(*old_args));
3588 	old_args->msg_type = REQUEST_RECONFIGURE;
3589 	old_args->retry = 0;
3590 	old_args->hostlist = hostlist_create(NULL);
3591 	old_args->protocol_version = SLURM_MIN_PROTOCOL_VERSION;
3592 
3593 	node_ptr = node_record_table_ptr;
3594 	for (int i = 0; i < node_record_count; i++, node_ptr++) {
3595 		if (IS_NODE_FUTURE(node_ptr))
3596 			continue;
3597 		if (IS_NODE_CLOUD(node_ptr) && IS_NODE_POWER_SAVE(node_ptr))
3598 			continue;
3599 
3600 		if (node_ptr->protocol_version == SLURM_PROTOCOL_VERSION) {
3601 			hostlist_push_host(new_args->hostlist, node_ptr->name);
3602 			new_args->node_count++;
3603 		} else {
3604 			hostlist_push_host(old_args->hostlist, node_ptr->name);
3605 			old_args->node_count++;
3606 		}
3607 	}
3608 
3609 	if (new_args->node_count == 0) {
3610 		hostlist_destroy(new_args->hostlist);
3611 		slurm_free_config_response_msg(config);
3612 		xfree(new_args);
3613 	} else {
3614 		debug("Spawning agent msg_type=%d", new_args->msg_type);
3615 		agent_queue_request(new_args);
3616 	}
3617 
3618 	if (old_args->node_count == 0) {
3619 		hostlist_destroy(old_args->hostlist);
3620 		xfree(old_args);
3621 	} else {
3622 		debug("Spawning agent msg_type=%d", old_args->msg_type);
3623 		agent_queue_request(old_args);
3624 	}
3625 #else
3626 	error("%s: Cannot use configless with FrontEnd mode! Sending normal reconfigure request.",
3627 	      __func__);
3628 	msg_to_slurmd(REQUEST_RECONFIGURE);
3629 #endif
3630 }
3631 
3632 
3633 /*
3634  * make_node_alloc - flag specified node as allocated to a job
3635  * IN node_ptr - pointer to node being allocated
3636  * IN job_ptr  - pointer to job that is starting
3637  */
make_node_alloc(node_record_t * node_ptr,job_record_t * job_ptr)3638 extern void make_node_alloc(node_record_t *node_ptr, job_record_t *job_ptr)
3639 {
3640 	int inx = node_ptr - node_record_table_ptr;
3641 	uint32_t node_flags;
3642 
3643 	(node_ptr->run_job_cnt)++;
3644 	bit_clear(idle_node_bitmap, inx);
3645 	if (job_ptr->details && (job_ptr->details->share_res == 0)) {
3646 		bit_clear(share_node_bitmap, inx);
3647 		(node_ptr->no_share_job_cnt)++;
3648 	}
3649 
3650 	if ((job_ptr->details &&
3651 	     (job_ptr->details->whole_node == WHOLE_NODE_USER)) ||
3652 	    (job_ptr->part_ptr &&
3653 	     (job_ptr->part_ptr->flags & PART_FLAG_EXCLUSIVE_USER))) {
3654 		node_ptr->owner_job_cnt++;
3655 		node_ptr->owner = job_ptr->user_id;
3656 	}
3657 
3658 	if (slurm_mcs_get_select(job_ptr) == 1) {
3659 		xfree(node_ptr->mcs_label);
3660 		node_ptr->mcs_label = xstrdup(job_ptr->mcs_label);
3661 	}
3662 
3663 	node_flags = node_ptr->node_state & NODE_STATE_FLAGS;
3664 	node_ptr->node_state = NODE_STATE_ALLOCATED | node_flags;
3665 	xfree(node_ptr->reason);
3666 	node_ptr->reason_time = 0;
3667 	node_ptr->reason_uid = NO_VAL;
3668 
3669 	last_node_update = time(NULL);
3670 }
3671 
3672 /* make_node_avail - flag specified node as available */
make_node_avail(int node_inx)3673 extern void make_node_avail(int node_inx)
3674 {
3675 	bit_set(avail_node_bitmap, node_inx);
3676 
3677 	/*
3678 	 * If we are in the middle of a backfill cycle, this bitmap is
3679 	 * used (when bf_continue is enabled) to avoid scheduling lower
3680 	 * priority jobs on to newly available resources.
3681 	 */
3682 	bit_set(bf_ignore_node_bitmap, node_inx);
3683 }
3684 
3685 /* make_node_comp - flag specified node as completing a job
3686  * IN node_ptr - pointer to node marked for completion of job
3687  * IN job_ptr - pointer to job that is completing
3688  * IN suspended - true if job was previously suspended
3689  */
make_node_comp(node_record_t * node_ptr,job_record_t * job_ptr,bool suspended)3690 extern void make_node_comp(node_record_t *node_ptr, job_record_t *job_ptr,
3691 			   bool suspended)
3692 {
3693 	int inx = node_ptr - node_record_table_ptr;
3694 	uint32_t node_flags;
3695 	time_t now = time(NULL);
3696 
3697 	xassert(node_ptr);
3698 	if (suspended) {
3699 		if (node_ptr->sus_job_cnt) {
3700 			(node_ptr->sus_job_cnt)--;
3701 		} else {
3702 			error("%s: %pJ node %s sus_job_cnt underflow", __func__,
3703 			      job_ptr, node_ptr->name);
3704 		}
3705 	} else {
3706 		if (node_ptr->run_job_cnt) {
3707 			(node_ptr->run_job_cnt)--;
3708 		} else {
3709 			error("%s: %pJ node %s run_job_cnt underflow", __func__,
3710 			      job_ptr, node_ptr->name);
3711 		}
3712 		if (job_ptr->details && (job_ptr->details->share_res == 0)) {
3713 			if (node_ptr->no_share_job_cnt) {
3714 				(node_ptr->no_share_job_cnt)--;
3715 			} else {
3716 				error("%s: %pJ node %s no_share_job_cnt underflow",
3717 				      __func__, job_ptr, node_ptr->name);
3718 			}
3719 			if (node_ptr->no_share_job_cnt == 0)
3720 				bit_set(share_node_bitmap, inx);
3721 		}
3722 	}
3723 
3724 	if (!IS_NODE_DOWN(node_ptr) && !IS_NODE_POWER_UP(node_ptr)) {
3725 		/* Don't verify RPC if node in DOWN or POWER_UP state */
3726 		(node_ptr->comp_job_cnt)++;
3727 		node_ptr->node_state |= NODE_STATE_COMPLETING;
3728 		bit_set(cg_node_bitmap, inx);
3729 	}
3730 	node_flags = node_ptr->node_state & NODE_STATE_FLAGS;
3731 
3732 	if ((node_ptr->run_job_cnt  == 0) &&
3733 	    (node_ptr->comp_job_cnt == 0)) {
3734 		node_ptr->last_idle = now;
3735 		bit_set(idle_node_bitmap, inx);
3736 		if (IS_NODE_DRAIN(node_ptr) || IS_NODE_FAIL(node_ptr)) {
3737 			trigger_node_drained(node_ptr);
3738 			clusteracct_storage_g_node_down(
3739 				acct_db_conn,
3740 				node_ptr, now, NULL,
3741 				slurmctld_conf.slurm_user_id);
3742 		}
3743 	}
3744 
3745 	if (IS_NODE_DOWN(node_ptr)) {
3746 		debug3("%s: Node %s being left DOWN", __func__, node_ptr->name);
3747 	} else if (node_ptr->run_job_cnt)
3748 		node_ptr->node_state = NODE_STATE_ALLOCATED | node_flags;
3749 	else {
3750 		node_ptr->node_state = NODE_STATE_IDLE | node_flags;
3751 		node_ptr->last_idle = now;
3752 	}
3753 	last_node_update = now;
3754 }
3755 
3756 /* _make_node_down - flag specified node as down */
_make_node_down(node_record_t * node_ptr,time_t event_time)3757 static void _make_node_down(node_record_t *node_ptr, time_t event_time)
3758 {
3759 	int inx = node_ptr - node_record_table_ptr;
3760 	uint32_t node_flags;
3761 
3762 	xassert(node_ptr);
3763 	node_flags = node_ptr->node_state & NODE_STATE_FLAGS;
3764 	node_flags &= (~NODE_STATE_COMPLETING);
3765 	node_ptr->node_state = NODE_STATE_DOWN | node_flags;
3766 	node_ptr->owner = NO_VAL;
3767 	xfree(node_ptr->mcs_label);
3768 	bit_clear (avail_node_bitmap, inx);
3769 	bit_clear (cg_node_bitmap,    inx);
3770 	bit_set   (idle_node_bitmap,  inx);
3771 	bit_set   (share_node_bitmap, inx);
3772 	bit_clear (up_node_bitmap,    inx);
3773 	trigger_node_down(node_ptr);
3774 	last_node_update = time (NULL);
3775 	clusteracct_storage_g_node_down(acct_db_conn,
3776 					node_ptr, event_time, NULL,
3777 					node_ptr->reason_uid);
3778 }
3779 
3780 /*
3781  * make_node_idle - flag specified node as having finished with a job
3782  * IN node_ptr - pointer to node reporting job completion
3783  * IN job_ptr - pointer to job that just completed or NULL if not applicable
3784  */
make_node_idle(node_record_t * node_ptr,job_record_t * job_ptr)3785 void make_node_idle(node_record_t *node_ptr, job_record_t *job_ptr)
3786 {
3787 	int inx = node_ptr - node_record_table_ptr;
3788 	uint32_t node_flags;
3789 	time_t now = time(NULL);
3790 	bitstr_t *node_bitmap = NULL;
3791 
3792 	if (job_ptr) {
3793 		if (job_ptr->node_bitmap_cg)
3794 			node_bitmap = job_ptr->node_bitmap_cg;
3795 		else
3796 			node_bitmap = job_ptr->node_bitmap;
3797 	}
3798 
3799 	trace_job(job_ptr, __func__, "enter");
3800 
3801 	xassert(node_ptr);
3802 	if (node_bitmap && (bit_test(node_bitmap, inx))) {
3803 		/* Not a replay */
3804 		last_job_update = now;
3805 		bit_clear(node_bitmap, inx);
3806 
3807 		if (!IS_JOB_FINISHED(job_ptr))
3808 			job_update_tres_cnt(job_ptr, inx);
3809 
3810 		if (job_ptr->node_cnt) {
3811 			/*
3812 			 * Clean up the JOB_COMPLETING flag
3813 			 * only if there is not the slurmctld
3814 			 * epilog running, otherwise wait
3815 			 * when it terminates then this
3816 			 * function will be invoked.
3817 			 */
3818 			job_ptr->node_cnt--;
3819 			if ((job_ptr->node_cnt == 0) &&
3820 			    !job_ptr->epilog_running)
3821 				cleanup_completing(job_ptr);
3822 		} else if ((job_ptr->total_cpus == 0) &&
3823 			   (job_ptr->total_nodes == 0)) {
3824 			/* Job resized to zero nodes (expanded another job) */
3825 		} else {
3826 			error("%s: %pJ node_cnt underflow", __func__, job_ptr);
3827 		}
3828 
3829 		if (IS_JOB_SUSPENDED(job_ptr)) {
3830 			/* Remove node from suspended job */
3831 			if (node_ptr->sus_job_cnt)
3832 				(node_ptr->sus_job_cnt)--;
3833 			else
3834 				error("%s: %pJ node %s sus_job_cnt underflow",
3835 				      __func__, job_ptr, node_ptr->name);
3836 		} else if (IS_JOB_RUNNING(job_ptr)) {
3837 			/* Remove node from running job */
3838 			if (node_ptr->run_job_cnt)
3839 				(node_ptr->run_job_cnt)--;
3840 			else
3841 				error("%s: %pJ node %s run_job_cnt underflow",
3842 				      __func__, job_ptr, node_ptr->name);
3843 		} else {
3844 			if (node_ptr->comp_job_cnt) {
3845 				(node_ptr->comp_job_cnt)--;
3846 			} else if (IS_NODE_DOWN(node_ptr)) {
3847 				/* We were not expecting this response,
3848 				 * ignore it */
3849 			} else {
3850 				error("%s: %pJ node %s comp_job_cnt underflow",
3851 				      __func__, job_ptr, node_ptr->name);
3852 			}
3853 			if (node_ptr->comp_job_cnt > 0)
3854 				goto fini;	/* More jobs completing */
3855 		}
3856 	}
3857 
3858 	if (node_ptr->comp_job_cnt == 0) {
3859 		node_ptr->node_state &= (~NODE_STATE_COMPLETING);
3860 		bit_clear(cg_node_bitmap, inx);
3861 		if (IS_NODE_IDLE(node_ptr)) {
3862 			node_ptr->owner = NO_VAL;
3863 			xfree(node_ptr->mcs_label);
3864 		}
3865 	}
3866 
3867 	node_flags = node_ptr->node_state & NODE_STATE_FLAGS;
3868 	if (IS_NODE_DOWN(node_ptr)) {
3869 		debug3("%s: %pJ node %s being left DOWN",
3870 		       __func__, job_ptr, node_ptr->name);
3871 		goto fini;
3872 	}
3873 	bit_set(up_node_bitmap, inx);
3874 
3875 	if (IS_NODE_DRAIN(node_ptr) || IS_NODE_FAIL(node_ptr) ||
3876 	    IS_NODE_NO_RESPOND(node_ptr))
3877 		bit_clear(avail_node_bitmap, inx);
3878 	else
3879 		make_node_avail(inx);
3880 
3881 	if ((IS_NODE_DRAIN(node_ptr) || IS_NODE_FAIL(node_ptr)) &&
3882 	    (node_ptr->run_job_cnt == 0) && (node_ptr->comp_job_cnt == 0)) {
3883 		node_ptr->node_state = NODE_STATE_IDLE | node_flags;
3884 		bit_set(idle_node_bitmap, inx);
3885 		debug3("%s: %pJ node %s is DRAINED",
3886 		       __func__, job_ptr, node_ptr->name);
3887 		node_ptr->last_idle = now;
3888 		trigger_node_drained(node_ptr);
3889 		if (!IS_NODE_REBOOT(node_ptr))
3890 			clusteracct_storage_g_node_down(acct_db_conn,
3891 							node_ptr, now, NULL,
3892 							slurmctld_conf.slurm_user_id);
3893 	} else if (node_ptr->run_job_cnt) {
3894 		node_ptr->node_state = NODE_STATE_ALLOCATED | node_flags;
3895 		if (!IS_NODE_NO_RESPOND(node_ptr) &&
3896 		     !IS_NODE_FAIL(node_ptr) && !IS_NODE_DRAIN(node_ptr))
3897 			make_node_avail(inx);
3898 	} else {
3899 		node_ptr->node_state = NODE_STATE_IDLE | node_flags;
3900 		if (!IS_NODE_NO_RESPOND(node_ptr) &&
3901 		     !IS_NODE_FAIL(node_ptr) && !IS_NODE_DRAIN(node_ptr))
3902 			make_node_avail(inx);
3903 		if (!IS_NODE_NO_RESPOND(node_ptr) &&
3904 		    !IS_NODE_COMPLETING(node_ptr))
3905 			bit_set(idle_node_bitmap, inx);
3906 		node_ptr->last_idle = now;
3907 	}
3908 
3909 fini:
3910 	if (job_ptr &&
3911 	    ((job_ptr->details &&
3912 	      (job_ptr->details->whole_node == WHOLE_NODE_USER)) ||
3913 	     (job_ptr->part_ptr &&
3914 	      (job_ptr->part_ptr->flags & PART_FLAG_EXCLUSIVE_USER)))) {
3915 		if (node_ptr->owner_job_cnt == 0) {
3916 			error("%s: node_ptr->owner_job_cnt underflow",
3917 			      __func__);
3918 		} else if (--node_ptr->owner_job_cnt == 0) {
3919 			node_ptr->owner = NO_VAL;
3920 			xfree(node_ptr->mcs_label);
3921 		}
3922 	}
3923 	last_node_update = now;
3924 }
3925 
send_nodes_to_accounting(time_t event_time)3926 extern int send_nodes_to_accounting(time_t event_time)
3927 {
3928 	int rc = SLURM_SUCCESS, i = 0;
3929 	node_record_t *node_ptr = NULL;
3930 	char *reason = NULL;
3931 	slurmctld_lock_t node_read_lock = {
3932 		READ_LOCK, NO_LOCK, READ_LOCK, WRITE_LOCK, NO_LOCK };
3933 
3934  	lock_slurmctld(node_read_lock);
3935 	/* send nodes not in 'up' state */
3936 	node_ptr = node_record_table_ptr;
3937 	for (i = 0; i < node_record_count; i++, node_ptr++) {
3938 		if (!node_ptr->name)
3939 			continue;
3940 		if (node_ptr->reason)
3941 			reason = node_ptr->reason;
3942 		else
3943 			reason = "First Registration";
3944 		if (IS_NODE_DRAIN(node_ptr) ||
3945 		    IS_NODE_FAIL(node_ptr) ||
3946 		    IS_NODE_DOWN(node_ptr))
3947 			rc = clusteracct_storage_g_node_down(
3948 				acct_db_conn,
3949 				node_ptr, event_time,
3950 				reason,
3951 				slurmctld_conf.slurm_user_id);
3952 		if (rc == SLURM_ERROR)
3953 			break;
3954 	}
3955 	unlock_slurmctld(node_read_lock);
3956 	return rc;
3957 }
3958 
3959 /* node_fini - free all memory associated with node records */
node_fini(void)3960 extern void node_fini (void)
3961 {
3962 	FREE_NULL_LIST(active_feature_list);
3963 	FREE_NULL_LIST(avail_feature_list);
3964 	FREE_NULL_BITMAP(avail_node_bitmap);
3965 	FREE_NULL_BITMAP(bf_ignore_node_bitmap);
3966 	FREE_NULL_BITMAP(booting_node_bitmap);
3967 	FREE_NULL_BITMAP(cg_node_bitmap);
3968 	FREE_NULL_BITMAP(future_node_bitmap);
3969 	FREE_NULL_BITMAP(idle_node_bitmap);
3970 	FREE_NULL_BITMAP(power_node_bitmap);
3971 	FREE_NULL_BITMAP(share_node_bitmap);
3972 	FREE_NULL_BITMAP(up_node_bitmap);
3973 	FREE_NULL_BITMAP(rs_node_bitmap);
3974 	node_fini2();
3975 }
3976 
3977 /* Reset a node's CPU load value */
reset_node_load(char * node_name,uint32_t cpu_load)3978 extern void reset_node_load(char *node_name, uint32_t cpu_load)
3979 {
3980 #ifdef HAVE_FRONT_END
3981 	return;
3982 #else
3983 	node_record_t *node_ptr;
3984 
3985 	node_ptr = find_node_record(node_name);
3986 	if (node_ptr) {
3987 		time_t now = time(NULL);
3988 		node_ptr->cpu_load = cpu_load;
3989 		node_ptr->cpu_load_time = now;
3990 		last_node_update = now;
3991 	} else
3992 		error("reset_node_load unable to find node %s", node_name);
3993 #endif
3994 }
3995 
3996 /* Reset a node's free memory value */
reset_node_free_mem(char * node_name,uint64_t free_mem)3997 extern void reset_node_free_mem(char *node_name, uint64_t free_mem)
3998 {
3999 #ifdef HAVE_FRONT_END
4000 	return;
4001 #else
4002 	node_record_t *node_ptr;
4003 
4004 	node_ptr = find_node_record(node_name);
4005 	if (node_ptr) {
4006 		time_t now = time(NULL);
4007 		node_ptr->free_mem = free_mem;
4008 		node_ptr->free_mem_time = now;
4009 		last_node_update = now;
4010 	} else
4011 		error("reset_node_free_mem unable to find node %s", node_name);
4012 #endif
4013 }
4014 
4015 
4016 /*
4017  * Check for nodes that haven't rebooted yet.
4018  *
4019  * If the node hasn't booted by ResumeTimeout, mark the node as down.
4020  */
check_reboot_nodes()4021 extern void check_reboot_nodes()
4022 {
4023 	int i;
4024 	node_record_t *node_ptr;
4025 	time_t now = time(NULL);
4026 	uint16_t resume_timeout = slurmctld_conf.resume_timeout;
4027 
4028 	for (i = 0; i < node_record_count; i++) {
4029 		node_ptr = &node_record_table_ptr[i];
4030 
4031 		if (IS_NODE_REBOOT(node_ptr) &&
4032 		    node_ptr->boot_req_time &&
4033 		    (node_ptr->boot_req_time + resume_timeout < now)) {
4034 			char *timeout_msg = "reboot timed out";
4035 
4036 			if ((node_ptr->next_state != NO_VAL) &&
4037 			    node_ptr->reason) {
4038 				xstrfmtcat(node_ptr->reason, " : %s",
4039 					   timeout_msg);
4040 			} else {
4041 				xfree(node_ptr->reason);
4042 				node_ptr->reason = xstrdup(timeout_msg);
4043 			}
4044 			node_ptr->reason_time = now;
4045 			node_ptr->reason_uid = slurmctld_conf.slurm_user_id;
4046 
4047 			/*
4048 			 * Remove states now so that event state shows as DOWN.
4049 			 */
4050 			node_ptr->node_state &= (~NODE_STATE_REBOOT);
4051 			node_ptr->node_state &= (~NODE_STATE_DRAIN);
4052 			node_ptr->boot_req_time = 0;
4053 			set_node_down_ptr(node_ptr, NULL);
4054 
4055 			bit_clear(rs_node_bitmap, i);
4056 		}
4057 	}
4058 }
4059 
waiting_for_node_boot(struct node_record * node_ptr)4060 extern bool waiting_for_node_boot(struct node_record *node_ptr)
4061 {
4062 	xassert(node_ptr);
4063 
4064 	if ((IS_NODE_POWER_UP(node_ptr) ||
4065 	     (IS_NODE_DOWN(node_ptr) && IS_NODE_REBOOT(node_ptr))) &&
4066 	    (node_ptr->boot_time < node_ptr->boot_req_time)) {
4067 		debug("Still waiting for boot of node %s", node_ptr->name);
4068 		return true;
4069 	}
4070 
4071 	return false;
4072 }
4073