1 /*****************************************************************************\
2 * node_mgr.c - manage the node records of slurm
3 * Note: there is a global node table (node_record_table_ptr), its
4 * hash table (node_hash_table), time stamp (last_node_update) and
5 * configuration list (config_list)
6 *****************************************************************************
7 * Copyright (C) 2002-2007 The Regents of the University of California.
8 * Copyright (C) 2008-2010 Lawrence Livermore National Security.
9 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
10 *  Written by Morris Jette <jette1@llnl.gov>, et al.
11 * CODE-OCEC-09-009. All rights reserved.
12 *
13 * This file is part of Slurm, a resource management program.
14 * For details, see <https://slurm.schedmd.com/>.
15 * Please also read the included file: DISCLAIMER.
16 *
17 * Slurm is free software; you can redistribute it and/or modify it under
18 * the terms of the GNU General Public License as published by the Free
19 * Software Foundation; either version 2 of the License, or (at your option)
20 * any later version.
21 *
22 * In addition, as a special exception, the copyright holders give permission
23 * to link the code of portions of this program with the OpenSSL library under
24 * certain conditions as described in each individual source file, and
25 * distribute linked combinations including the two. You must obey the GNU
26 * General Public License in all respects for all of the code used other than
27 * OpenSSL. If you modify file(s) with this exception, you may extend this
28 * exception to your version of the file(s), but you are not obligated to do
29 * so. If you do not wish to do so, delete this exception statement from your
30 * version. If you delete this exception statement from all source files in
31 * the program, then also delete it here.
32 *
33 * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
34 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
35 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
36 * details.
37 *
38 * You should have received a copy of the GNU General Public License along
39 * with Slurm; if not, write to the Free Software Foundation, Inc.,
40 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
41 \*****************************************************************************/
42
43 #include "config.h"
44
45 #include <ctype.h>
46 #include <errno.h>
47 #include <fcntl.h>
48 #include <stdio.h>
49 #include <stdlib.h>
50 #include <string.h>
51 #include <sys/types.h>
52 #include <sys/stat.h>
53 #include <time.h>
54
55 #include "src/common/bitstring.h"
56 #include "src/common/fd.h"
57 #include "src/common/fetch_config.h"
58 #include "src/common/gres.h"
59 #include "src/common/hostlist.h"
60 #include "src/common/macros.h"
61 #include "src/common/node_features.h"
62 #include "src/common/node_select.h"
63 #include "src/common/pack.h"
64 #include "src/common/parse_time.h"
65 #include "src/common/power.h"
66 #include "src/common/read_config.h"
67 #include "src/common/slurm_accounting_storage.h"
68 #include "src/common/slurm_acct_gather_energy.h"
69 #include "src/common/slurm_ext_sensors.h"
70 #include "src/common/slurm_resource_info.h"
71 #include "src/common/slurm_mcs.h"
72 #include "src/common/xassert.h"
73 #include "src/common/xstring.h"
74
75 #include "src/slurmctld/agent.h"
76 #include "src/slurmctld/front_end.h"
77 #include "src/slurmctld/locks.h"
78 #include "src/slurmctld/ping_nodes.h"
79 #include "src/slurmctld/proc_req.h"
80 #include "src/slurmctld/read_config.h"
81 #include "src/slurmctld/reservation.h"
82 #include "src/slurmctld/slurmctld.h"
83 #include "src/slurmctld/slurmctld_plugstack.h"
84 #include "src/slurmctld/state_save.h"
85 #include "src/common/timers.h"
86 #include "src/slurmctld/trigger_mgr.h"
87
88 /* No need to change this; we always pack SLURM_PROTOCOL_VERSION */
89 #define NODE_STATE_VERSION "PROTOCOL_VERSION"
90
91 typedef enum {
92 FEATURE_MODE_IND, /* Print each node change individually */
93 FEATURE_MODE_COMB, /* Try to combine like changes */
94 FEATURE_MODE_PEND, /* Print any pending change message */
95 } feature_mode_t;
96
97 /* Global variables */
98 bitstr_t *avail_node_bitmap = NULL; /* bitmap of available nodes */
99 bitstr_t *bf_ignore_node_bitmap = NULL; /* bitmap of nodes to ignore during a
100 * backfill cycle */
101 bitstr_t *booting_node_bitmap = NULL; /* bitmap of booting nodes */
102 bitstr_t *cg_node_bitmap = NULL; /* bitmap of completing nodes */
103 bitstr_t *future_node_bitmap = NULL; /* bitmap of FUTURE nodes */
104 bitstr_t *idle_node_bitmap = NULL; /* bitmap of idle nodes */
105 bitstr_t *power_node_bitmap = NULL; /* bitmap of powered down nodes */
106 bitstr_t *share_node_bitmap = NULL; /* bitmap of sharable nodes */
107 bitstr_t *up_node_bitmap = NULL; /* bitmap of non-down nodes */
108 bitstr_t *rs_node_bitmap = NULL; /* bitmap of resuming nodes */
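/*
 * Note: each bitmap above is indexed by the node's offset within
 * node_record_table_ptr (node_inx). They are updated as node_state
 * changes throughout this file (see update_node() below), and
 * _sync_bitmaps() keeps several of them consistent with a node's job
 * counts. Presumably the node write lock must be held when modifying
 * them.
 */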
109
110 static void _dump_node_state(node_record_t *dump_node_ptr, Buf buffer);
111 static front_end_record_t * _front_end_reg(
112 slurm_node_registration_status_msg_t *reg_msg);
113 static bool _is_cloud_hidden(node_record_t *node_ptr);
114 static void _make_node_down(node_record_t *node_ptr,
115 time_t event_time);
116 static bool _node_is_hidden(node_record_t *node_ptr, uid_t uid);
117 static Buf _open_node_state_file(char **state_file);
118 static void _pack_node(node_record_t *dump_node_ptr, Buf buffer,
119 uint16_t protocol_version, uint16_t show_flags);
120 static void _sync_bitmaps(node_record_t *node_ptr, int job_count);
121 static void _update_config_ptr(bitstr_t *bitmap,
122 config_record_t *config_ptr);
123 static int _update_node_active_features(char *node_names,
124 char *active_features, int mode);
125 static int _update_node_avail_features(char *node_names,
126 char *avail_features, int mode);
127 static int _update_node_gres(char *node_names, char *gres);
128 static int _update_node_weight(char *node_names, uint32_t weight);
129 static bool _valid_node_state_change(uint32_t old, uint32_t new);
130
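/*
 * On-disk layout of the node state file, as written by
 * dump_all_node_state()/_dump_node_state() below and read back by
 * load_all_node_state():
 *   header: NODE_STATE_VERSION string, SLURM_PROTOCOL_VERSION, dump time
 *   per node: comm_name, name, node_hostname, reason, features,
 *   features_act, gres, cpu_spec_list, next_state, node_state, cpu_bind,
 *   cpus, boards, sockets, cores, core_spec_cnt, threads, real_memory,
 *   tmp_disk, reason_uid, reason_time, boot_req_time, last_response,
 *   protocol_version, mcs_label, and the packed gres_list.
 */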
131 /* dump_all_node_state - save the state of all nodes to file */
132 int dump_all_node_state ( void )
133 {
134 /* Save high-water mark to avoid buffer growth with copies */
135 static int high_buffer_size = (1024 * 1024);
136 int error_code = 0, inx, log_fd;
137 char *old_file, *new_file, *reg_file;
138 node_record_t *node_ptr;
139 /* Locks: Read config and node */
140 slurmctld_lock_t node_read_lock = { READ_LOCK, NO_LOCK, READ_LOCK,
141 NO_LOCK, NO_LOCK };
142 Buf buffer = init_buf(high_buffer_size);
143 DEF_TIMERS;
144
145 START_TIMER;
146 /* write header: version, time */
147 packstr(NODE_STATE_VERSION, buffer);
148 pack16(SLURM_PROTOCOL_VERSION, buffer);
149 pack_time(time (NULL), buffer);
150
151 /* write node records to buffer */
152 lock_slurmctld (node_read_lock);
153 for (inx = 0, node_ptr = node_record_table_ptr; inx < node_record_count;
154 inx++, node_ptr++) {
155 xassert (node_ptr->magic == NODE_MAGIC);
156 xassert (node_ptr->config_ptr->magic == CONFIG_MAGIC);
157 _dump_node_state (node_ptr, buffer);
158 }
159
160 old_file = xstrdup (slurmctld_conf.state_save_location);
161 xstrcat (old_file, "/node_state.old");
162 reg_file = xstrdup (slurmctld_conf.state_save_location);
163 xstrcat (reg_file, "/node_state");
164 new_file = xstrdup (slurmctld_conf.state_save_location);
165 xstrcat (new_file, "/node_state.new");
166 unlock_slurmctld (node_read_lock);
167
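/*
 * The packed buffer is first written to node_state.new under the state
 * file lock. On success the current node_state is rotated to
 * node_state.old via link()/unlink() and node_state.new is linked into
 * place as node_state; on any error node_state.new is simply unlinked
 * and the previous state file is left untouched.
 */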
168 /* write the buffer to file */
169 lock_state_files();
170 log_fd = creat (new_file, 0600);
171 if (log_fd < 0) {
172 error ("Can't save state, error creating file %s %m", new_file);
173 error_code = errno;
174 } else {
175 int pos = 0, nwrite = get_buf_offset(buffer), amount, rc;
176 char *data = (char *)get_buf_data(buffer);
177 high_buffer_size = MAX(nwrite, high_buffer_size);
178 while (nwrite > 0) {
179 amount = write(log_fd, &data[pos], nwrite);
180 if ((amount < 0) && (errno != EINTR)) {
181 error("Error writing file %s, %m", new_file);
182 error_code = errno;
183 break;
184 }
185 nwrite -= amount;
186 pos += amount;
187 }
188
189 rc = fsync_and_close(log_fd, "node");
190 if (rc && !error_code)
191 error_code = rc;
192 }
193 if (error_code)
194 (void) unlink (new_file);
195 else { /* file shuffle */
196 (void) unlink (old_file);
197 if (link(reg_file, old_file))
198 debug4("unable to create link for %s -> %s: %m",
199 reg_file, old_file);
200 (void) unlink (reg_file);
201 if (link(new_file, reg_file))
202 debug4("unable to create link for %s -> %s: %m",
203 new_file, reg_file);
204 (void) unlink (new_file);
205 }
206 xfree (old_file);
207 xfree (reg_file);
208 xfree (new_file);
209 unlock_state_files ();
210
211 free_buf (buffer);
212 END_TIMER2("dump_all_node_state");
213 return error_code;
214 }
215
216 /*
217 * _dump_node_state - dump the state of a specific node to a buffer
218 * IN dump_node_ptr - pointer to node for which information is requested
219 * IN/OUT buffer - location to store data, pointers automatically advanced
220 */
221 static void _dump_node_state(node_record_t *dump_node_ptr, Buf buffer)
222 {
223 packstr (dump_node_ptr->comm_name, buffer);
224 packstr (dump_node_ptr->name, buffer);
225 packstr (dump_node_ptr->node_hostname, buffer);
226 packstr (dump_node_ptr->reason, buffer);
227 packstr (dump_node_ptr->features, buffer);
228 packstr (dump_node_ptr->features_act, buffer);
229 packstr (dump_node_ptr->gres, buffer);
230 packstr (dump_node_ptr->cpu_spec_list, buffer);
231 pack32 (dump_node_ptr->next_state, buffer);
232 pack32 (dump_node_ptr->node_state, buffer);
233 pack32 (dump_node_ptr->cpu_bind, buffer);
234 pack16 (dump_node_ptr->cpus, buffer);
235 pack16 (dump_node_ptr->boards, buffer);
236 pack16 (dump_node_ptr->sockets, buffer);
237 pack16 (dump_node_ptr->cores, buffer);
238 pack16 (dump_node_ptr->core_spec_cnt, buffer);
239 pack16 (dump_node_ptr->threads, buffer);
240 pack64 (dump_node_ptr->real_memory, buffer);
241 pack32 (dump_node_ptr->tmp_disk, buffer);
242 pack32 (dump_node_ptr->reason_uid, buffer);
243 pack_time(dump_node_ptr->reason_time, buffer);
244 pack_time(dump_node_ptr->boot_req_time, buffer);
245 pack_time(dump_node_ptr->last_response, buffer);
246 pack16 (dump_node_ptr->protocol_version, buffer);
247 packstr (dump_node_ptr->mcs_label, buffer);
248 (void) gres_plugin_node_state_pack(dump_node_ptr->gres_list, buffer,
249 dump_node_ptr->name);
250 }
251
252
253 /* Open the node state save file, or the backup file if necessary.
254 * state_file OUT - the name of the state save file actually used
255 * RET buffer containing the file contents, or NULL on error
256 */
257 static Buf _open_node_state_file(char **state_file)
258 {
259 Buf buf;
260
261 *state_file = xstrdup(slurmctld_conf.state_save_location);
262 xstrcat(*state_file, "/node_state");
263
264 if (!(buf = create_mmap_buf(*state_file)))
265 error("Could not open node state file %s: %m", *state_file);
266 else
267 return buf;
268
269 error("NOTE: Trying backup state save file. Information may be lost!");
270 xstrcat(*state_file, ".old");
271 return create_mmap_buf(*state_file);
272 }
273
274 /*
275 * load_all_node_state - Load the node state from file, recover on slurmctld
276 * restart. Execute this after loading the configuration file data.
277 * Data goes into common storage.
278 * IN state_only - if true, overwrite only node state and reason
279 * Use this to overwrite the "UNKNOWN" state typically used in slurm.conf
280 * RET 0 or error code
281 */
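/*
 * Recovery flow in outline: open node_state (or the .old backup), verify
 * the version header written by dump_all_node_state(), then unpack one
 * node record at a time and apply it to the matching entry in
 * node_record_table_ptr. With state_only set, only state, reason and a
 * few related fields are restored; otherwise the hardware, feature and
 * GRES data from the file replace the in-memory values as well.
 */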
282 extern int load_all_node_state ( bool state_only )
283 {
284 char *comm_name = NULL, *node_hostname = NULL;
285 char *node_name = NULL, *reason = NULL, *state_file;
286 char *features = NULL, *features_act = NULL;
287 char *gres = NULL, *cpu_spec_list = NULL;
288 char *mcs_label = NULL;
289 int error_code = 0, node_cnt = 0;
290 uint16_t core_spec_cnt = 0;
291 uint32_t node_state, cpu_bind = 0, next_state = NO_VAL;
292 uint16_t cpus = 1, boards = 1, sockets = 1, cores = 1, threads = 1;
293 uint64_t real_memory;
294 uint32_t tmp_disk, name_len;
295 uint32_t reason_uid = NO_VAL;
296 time_t boot_req_time = 0, reason_time = 0, last_response = 0;
297 List gres_list = NULL;
298 node_record_t *node_ptr;
299 time_t time_stamp, now = time(NULL);
300 Buf buffer;
301 char *ver_str = NULL;
302 hostset_t hs = NULL;
303 hostlist_t down_nodes = NULL;
304 bool power_save_mode = false;
305 uint16_t protocol_version = NO_VAL16;
306
307 xassert(verify_lock(CONF_LOCK, READ_LOCK));
308
309 if (slurmctld_conf.suspend_program && slurmctld_conf.resume_program)
310 power_save_mode = true;
311
312 /* read the file */
313 lock_state_files ();
314 buffer = _open_node_state_file(&state_file);
315 if (!buffer) {
316 info("No node state file (%s) to recover", state_file);
317 xfree(state_file);
318 unlock_state_files();
319 return ENOENT;
320 }
321 xfree(state_file);
322 unlock_state_files();
323
324 safe_unpackstr_xmalloc( &ver_str, &name_len, buffer);
325 debug3("Version string in node_state header is %s", ver_str);
326 if (ver_str && !xstrcmp(ver_str, NODE_STATE_VERSION))
327 safe_unpack16(&protocol_version, buffer);
328
329 if (!protocol_version || (protocol_version == NO_VAL16)) {
330 if (!ignore_state_errors)
331 fatal("Can not recover node state, data version incompatible, start with '-i' to ignore this. Warning: using -i will lose the data that can't be recovered.");
332 error("*****************************************************");
333 error("Can not recover node state, data version incompatible");
334 error("*****************************************************");
335 xfree(ver_str);
336 free_buf(buffer);
337 return EFAULT;
338 }
339 xfree(ver_str);
340
341 safe_unpack_time (&time_stamp, buffer);
342
343 while (remaining_buf (buffer) > 0) {
344 uint32_t base_state;
345 uint16_t obj_protocol_version = NO_VAL16;
346 if (protocol_version >= SLURM_19_05_PROTOCOL_VERSION) {
347 safe_unpackstr_xmalloc (&comm_name, &name_len, buffer);
348 safe_unpackstr_xmalloc (&node_name, &name_len, buffer);
349 safe_unpackstr_xmalloc (&node_hostname,
350 &name_len, buffer);
351 safe_unpackstr_xmalloc (&reason, &name_len, buffer);
352 safe_unpackstr_xmalloc (&features, &name_len, buffer);
353 safe_unpackstr_xmalloc (&features_act,&name_len,buffer);
354 safe_unpackstr_xmalloc (&gres, &name_len, buffer);
355 safe_unpackstr_xmalloc (&cpu_spec_list,
356 &name_len, buffer);
357 safe_unpack32 (&next_state, buffer);
358 safe_unpack32 (&node_state, buffer);
359 safe_unpack32 (&cpu_bind, buffer);
360 safe_unpack16 (&cpus, buffer);
361 safe_unpack16 (&boards, buffer);
362 safe_unpack16 (&sockets, buffer);
363 safe_unpack16 (&cores, buffer);
364 safe_unpack16 (&core_spec_cnt, buffer);
365 safe_unpack16 (&threads, buffer);
366 safe_unpack64 (&real_memory, buffer);
367 safe_unpack32 (&tmp_disk, buffer);
368 safe_unpack32 (&reason_uid, buffer);
369 safe_unpack_time (&reason_time, buffer);
370 safe_unpack_time (&boot_req_time, buffer);
371 safe_unpack_time(&last_response, buffer);
372 safe_unpack16 (&obj_protocol_version, buffer);
373 safe_unpackstr_xmalloc (&mcs_label, &name_len, buffer);
374 if (gres_plugin_node_state_unpack(
375 &gres_list, buffer, node_name,
376 protocol_version) != SLURM_SUCCESS)
377 goto unpack_error;
378 base_state = node_state & NODE_STATE_BASE;
379 } else if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
380 safe_unpackstr_xmalloc (&comm_name, &name_len, buffer);
381 safe_unpackstr_xmalloc (&node_name, &name_len, buffer);
382 safe_unpackstr_xmalloc (&node_hostname,
383 &name_len, buffer);
384 safe_unpackstr_xmalloc (&reason, &name_len, buffer);
385 safe_unpackstr_xmalloc (&features, &name_len, buffer);
386 safe_unpackstr_xmalloc (&features_act,&name_len,buffer);
387 safe_unpackstr_xmalloc (&gres, &name_len, buffer);
388 safe_unpackstr_xmalloc (&cpu_spec_list,
389 &name_len, buffer);
390 safe_unpack32 (&next_state, buffer);
391 safe_unpack32 (&node_state, buffer);
392 safe_unpack32 (&cpu_bind, buffer);
393 safe_unpack16 (&cpus, buffer);
394 safe_unpack16 (&boards, buffer);
395 safe_unpack16 (&sockets, buffer);
396 safe_unpack16 (&cores, buffer);
397 safe_unpack16 (&core_spec_cnt, buffer);
398 safe_unpack16 (&threads, buffer);
399 safe_unpack64 (&real_memory, buffer);
400 safe_unpack32 (&tmp_disk, buffer);
401 safe_unpack32 (&reason_uid, buffer);
402 safe_unpack_time (&reason_time, buffer);
403 safe_unpack_time (&boot_req_time, buffer);
404 safe_unpack16 (&obj_protocol_version, buffer);
405 safe_unpackstr_xmalloc (&mcs_label, &name_len, buffer);
406 if (gres_plugin_node_state_unpack(
407 &gres_list, buffer, node_name,
408 protocol_version) != SLURM_SUCCESS)
409 goto unpack_error;
410 base_state = node_state & NODE_STATE_BASE;
411 } else {
412 error("%s: protocol_version %hu not supported",
413 __func__, protocol_version);
414 goto unpack_error;
415 }
416
417 /* perform whatever validity tests are possible on the unpacked data */
418 if ((cpus == 0) ||
419 (boards == 0) ||
420 (sockets == 0) ||
421 (cores == 0) ||
422 (threads == 0) ||
423 (base_state >= NODE_STATE_END)) {
424 error("Invalid data for node %s: procs=%u, boards=%u, "
425 "sockets=%u, cores=%u, threads=%u, state=%u",
426 node_name, cpus, boards,
427 sockets, cores, threads, node_state);
428 error("No more node data will be processed from the checkpoint file");
429 goto unpack_error;
430
431 }
432
433 /* find record and perform update */
434 node_ptr = find_node_record (node_name);
435 if (node_ptr == NULL) {
436 error ("Node %s has vanished from configuration",
437 node_name);
438 } else if (state_only) {
439 uint32_t orig_flags;
440 if (IS_NODE_CLOUD(node_ptr)) {
441 if ((!power_save_mode) &&
442 ((node_state & NODE_STATE_POWER_SAVE) ||
443 (node_state & NODE_STATE_POWER_UP))) {
444 node_state &= (~NODE_STATE_POWER_SAVE);
445 node_state &= (~NODE_STATE_POWER_UP);
446 node_state &= (~NODE_STATE_POWERING_DOWN);
447 if (hs)
448 hostset_insert(hs, node_name);
449 else
450 hs = hostset_create(node_name);
451 }
452 if (comm_name && node_hostname) {
453 /* Recover NodeAddr and NodeHostName */
454 xfree(node_ptr->comm_name);
455 node_ptr->comm_name = comm_name;
456 comm_name = NULL; /* Nothing to free */
457 xfree(node_ptr->node_hostname);
458 node_ptr->node_hostname = node_hostname;
459 node_hostname = NULL; /* Nothing to free */
460 slurm_reset_alias(node_ptr->name,
461 node_ptr->comm_name,
462 node_ptr->node_hostname);
463 }
464 node_ptr->node_state = node_state;
465 } else if (IS_NODE_UNKNOWN(node_ptr)) {
466 if (base_state == NODE_STATE_DOWN) {
467 orig_flags = node_ptr->node_state &
468 NODE_STATE_FLAGS;
469 node_ptr->node_state = NODE_STATE_DOWN
470 | orig_flags;
471 }
472 if (node_state & NODE_STATE_DRAIN)
473 node_ptr->node_state |=
474 NODE_STATE_DRAIN;
475 if (node_state & NODE_STATE_FAIL)
476 node_ptr->node_state |=
477 NODE_STATE_FAIL;
478 if (node_state & NODE_STATE_POWER_SAVE) {
479 if (power_save_mode &&
480 IS_NODE_UNKNOWN(node_ptr)) {
481 orig_flags = node_ptr->
482 node_state &
483 NODE_STATE_FLAGS;
484 node_ptr->node_state =
485 NODE_STATE_IDLE |
486 orig_flags |
487 NODE_STATE_POWER_SAVE;
488 } else if (power_save_mode) {
489 node_ptr->node_state |=
490 NODE_STATE_POWER_SAVE;
491 } else if (hs)
492 hostset_insert(hs, node_name);
493 else
494 hs = hostset_create(node_name);
495 /* Recover hardware state for powered
496 * down nodes */
497 node_ptr->cpus = cpus;
498 node_ptr->boards = boards;
499 node_ptr->sockets = sockets;
500 node_ptr->cores = cores;
501 node_ptr->core_spec_cnt =
502 core_spec_cnt;
503 xfree(node_ptr->cpu_spec_list);
504 node_ptr->cpu_spec_list =
505 cpu_spec_list;
506 cpu_spec_list = NULL;
507 /* Nothing to free */
508 node_ptr->threads = threads;
509 node_ptr->real_memory = real_memory;
510 node_ptr->tmp_disk = tmp_disk;
511 }
512 if (node_state & NODE_STATE_MAINT)
513 node_ptr->node_state |= NODE_STATE_MAINT;
514 if (node_state & NODE_STATE_REBOOT)
515 node_ptr->node_state |= NODE_STATE_REBOOT;
516 if (node_state & NODE_STATE_POWER_UP) {
517 if (power_save_mode) {
518 node_ptr->node_state |=
519 NODE_STATE_POWER_UP;
520 } else if (hs)
521 hostset_insert(hs, node_name);
522 else
523 hs = hostset_create(node_name);
524 }
525 }
526 if (node_ptr->reason == NULL) {
527 node_ptr->reason = reason;
528 reason = NULL; /* Nothing to free */
529 node_ptr->reason_time = reason_time;
530 node_ptr->reason_uid = reason_uid;
531 }
532
533 if (IS_NODE_POWER_UP(node_ptr) ||
534 IS_NODE_REBOOT(node_ptr))
535 node_ptr->boot_req_time = boot_req_time;
536
537 xfree(node_ptr->features_act);
538 node_ptr->features_act = features_act;
539 features_act = NULL; /* Nothing to free */
540 node_ptr->gres_list = gres_list;
541 gres_list = NULL; /* Nothing to free */
542 } else {
543 if ((!power_save_mode) &&
544 ((node_state & NODE_STATE_POWER_SAVE) ||
545 (node_state & NODE_STATE_POWER_UP))) {
546 node_state &= (~NODE_STATE_POWER_SAVE);
547 node_state &= (~NODE_STATE_POWER_UP);
548 if (hs)
549 hostset_insert(hs, node_name);
550 else
551 hs = hostset_create(node_name);
552 }
553 if (IS_NODE_CLOUD(node_ptr) &&
554 comm_name && node_hostname) {
555 /* Recover NodeAddr and NodeHostName */
556 xfree(node_ptr->comm_name);
557 node_ptr->comm_name = comm_name;
558 comm_name = NULL; /* Nothing to free */
559 xfree(node_ptr->node_hostname);
560 node_ptr->node_hostname = node_hostname;
561 node_hostname = NULL; /* Nothing to free */
562 slurm_reset_alias(node_ptr->name,
563 node_ptr->comm_name,
564 node_ptr->node_hostname);
565 }
566 node_ptr->node_state = node_state;
567 xfree(node_ptr->reason);
568 node_ptr->reason = reason;
569 reason = NULL; /* Nothing to free */
570 node_ptr->reason_time = reason_time;
571 node_ptr->reason_uid = reason_uid;
572 xfree(node_ptr->features);
573 node_ptr->features = features;
574 features = NULL; /* Nothing to free */
575 xfree(node_ptr->features_act);
576 node_ptr->features_act = features_act;
577 features_act = NULL; /* Nothing to free */
578 xfree(node_ptr->gres);
579 node_ptr->gres = gres;
580 gres = NULL; /* Nothing to free */
581 node_ptr->gres_list = gres_list;
582 gres_list = NULL; /* Nothing to free */
583 xfree(node_ptr->cpu_spec_list);
584 node_ptr->cpu_spec_list = cpu_spec_list;
585 cpu_spec_list = NULL; /* Nothing to free */
586 node_ptr->part_cnt = 0;
587 xfree(node_ptr->part_pptr);
588 node_ptr->cpu_bind = cpu_bind;
589 node_ptr->cpus = cpus;
590 node_ptr->boards = boards;
591 node_ptr->sockets = sockets;
592 node_ptr->cores = cores;
593 node_ptr->core_spec_cnt = core_spec_cnt;
594 node_ptr->threads = threads;
595 node_ptr->real_memory = real_memory;
596 node_ptr->tmp_disk = tmp_disk;
597 xfree(node_ptr->mcs_label);
598 node_ptr->mcs_label = mcs_label;
599 mcs_label = NULL; /* Nothing to free */
600 }
601
602 if (node_ptr) {
603 node_cnt++;
604
605 node_ptr->next_state = next_state;
606
607 if (IS_NODE_DOWN(node_ptr)) {
608 if (down_nodes)
609 hostlist_push(down_nodes, node_name);
610 else
611 down_nodes = hostlist_create(
612 node_name);
613 }
614
615 node_ptr->last_response = last_response;
616 if (!node_ptr->last_response) {
617 /*
618 * last_response value not saved, make best
619 * guess.
620 */
621 if (IS_NODE_POWER_UP(node_ptr))
622 node_ptr->last_response = now +
623 slurmctld_conf.resume_timeout;
624 else if (IS_NODE_POWERING_DOWN(node_ptr))
625 node_ptr->last_response = now +
626 slurmctld_conf.suspend_timeout;
627 }
628
629 if (obj_protocol_version &&
630 (obj_protocol_version != NO_VAL16))
631 node_ptr->protocol_version =
632 obj_protocol_version;
633 else
634 node_ptr->protocol_version = protocol_version;
635
636 /* Sanity check to make sure we can take a version we
637 * actually understand.
638 */
639 if (node_ptr->protocol_version <
640 SLURM_MIN_PROTOCOL_VERSION)
641 node_ptr->protocol_version =
642 SLURM_MIN_PROTOCOL_VERSION;
643
644 if (!IS_NODE_POWER_SAVE(node_ptr))
645 node_ptr->last_idle = now;
646 }
647
648 xfree(features);
649 xfree(features_act);
650 xfree(gres);
651 FREE_NULL_LIST(gres_list);
652 xfree (comm_name);
653 xfree (node_hostname);
654 xfree (node_name);
655 xfree(reason);
656 xfree(cpu_spec_list);
657 }
658
659 fini: info("Recovered state of %d nodes", node_cnt);
660 if (hs) {
661 char node_names[128];
662 hostset_ranged_string(hs, sizeof(node_names), node_names);
663 info("Cleared POWER_SAVE flag from nodes %s", node_names);
664 hostset_destroy(hs);
665 }
666
667 if (down_nodes) {
668 char *down_host_str = NULL;
669 down_host_str = hostlist_ranged_string_xmalloc(down_nodes);
670 info("Down nodes: %s", down_host_str);
671 xfree(down_host_str);
672 hostlist_destroy(down_nodes);
673 }
674
675 free_buf (buffer);
676 return error_code;
677
678 unpack_error:
679 if (!ignore_state_errors)
680 fatal("Incomplete node data checkpoint file, start with '-i' to ignore this. Warning: using -i will lose the data that can't be recovered.");
681 error("Incomplete node data checkpoint file");
682 error_code = EFAULT;
683 xfree(features);
684 xfree(gres);
685 FREE_NULL_LIST(gres_list);
686 xfree(comm_name);
687 xfree(node_hostname);
688 xfree(node_name);
689 xfree(reason);
690 goto fini;
691 }
692
693
694 /* list_compare_config - compare two entries from the config list based upon
695 * weight, see common/list.h for documentation */
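/* Used with list_sort(config_list, ...) in update_node() below so that
 * config records are ordered by ascending weight for scheduling. */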
696 int list_compare_config (void *config_entry1, void *config_entry2)
697 {
698 int weight1, weight2;
699 config_record_t *c1 = *(config_record_t **) config_entry1;
700 config_record_t *c2 = *(config_record_t **) config_entry2;
701
702 weight1 = c1->weight;
703 weight2 = c2->weight;
704
705 return (weight1 - weight2);
706 }
707
708 /* Return true if the node should be hidden by virtue of being powered down
709 * and in the cloud. */
710 static bool _is_cloud_hidden(node_record_t *node_ptr)
711 {
712 if (((slurmctld_conf.private_data & PRIVATE_CLOUD_NODES) == 0) &&
713 IS_NODE_CLOUD(node_ptr) && IS_NODE_POWER_SAVE(node_ptr))
714 return true;
715 return false;
716 }
717
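/* Return true if the node should be hidden from this user: either MCS
 * label filtering applies (PrivateData includes nodes, MCS private data
 * is enabled, the user is not an operator and the labels do not match),
 * or the node belongs only to partitions that are not visible to the
 * user. */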
718 static bool _node_is_hidden(node_record_t *node_ptr, uid_t uid)
719 {
720 int i;
721
722 if ((slurmctld_conf.private_data & PRIVATE_DATA_NODES)
723 && (slurm_mcs_get_privatedata() == 1)
724 && !validate_operator(uid)
725 && (mcs_g_check_mcs_label(uid, node_ptr->mcs_label) != 0))
726 return true;
727
728 if (!node_ptr->part_cnt)
729 return false;
730
731 for (i = 0; i < node_ptr->part_cnt; i++) {
732 /* return false if the node belongs to any visible partition */
733 if (part_is_visible(node_ptr->part_pptr[i], uid)) {
734 return false;
735 }
736 }
737
738 return true;
739 }
740
741 /*
742 * pack_all_node - dump all configuration and node information for all nodes
743 * in machine independent form (for network transmission)
744 * OUT buffer_ptr - pointer to the stored data
745 * OUT buffer_size - set to size of the buffer in bytes
746 * IN show_flags - node filtering options
747 * IN uid - uid of user making request (for partition filtering)
748 * IN protocol_version - slurm protocol version of client
749 * global: node_record_table_ptr - pointer to global node table
750 * NOTE: the caller must xfree the buffer at *buffer_ptr
751 * NOTE: change slurm_load_node() in api/node_info.c when data format changes
752 */
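/*
 * Typical caller pattern, shown only as a sketch (in slurmctld the node
 * info RPC handler is the actual caller; see proc_req.h above):
 *
 *	char *buf = NULL;
 *	int buf_size = 0;
 *	pack_all_node(&buf, &buf_size, show_flags, uid, protocol_version);
 *	(transmit buf / buf_size to the client)
 *	xfree(buf);
 */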
753 extern void pack_all_node (char **buffer_ptr, int *buffer_size,
754 uint16_t show_flags, uid_t uid,
755 uint16_t protocol_version)
756 {
757 int inx;
758 uint32_t nodes_packed, tmp_offset;
759 Buf buffer;
760 time_t now = time(NULL);
761 node_record_t *node_ptr = node_record_table_ptr;
762 bool hidden;
763
764 xassert(verify_lock(CONF_LOCK, READ_LOCK));
765 xassert(verify_lock(PART_LOCK, READ_LOCK));
766
767 buffer_ptr[0] = NULL;
768 *buffer_size = 0;
769
770 buffer = init_buf (BUF_SIZE*16);
771 nodes_packed = 0;
772
773 if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
774 /* write header: count and time */
775 pack32(nodes_packed, buffer);
776 pack_time(now, buffer);
777
778 /* write node records */
779 for (inx = 0; inx < node_record_count; inx++, node_ptr++) {
780 xassert(node_ptr->magic == NODE_MAGIC);
781 xassert(node_ptr->config_ptr->magic == CONFIG_MAGIC);
782
783 /*
784 * We can't avoid packing node records without breaking
785 * the node index pointers. So pack a node with a name
786 * of NULL and let the caller deal with it.
787 */
788 hidden = false;
789 if (((show_flags & SHOW_ALL) == 0) && (uid != 0) &&
790 (_node_is_hidden(node_ptr, uid)))
791 hidden = true;
792 else if (IS_NODE_FUTURE(node_ptr) &&
793 (!(show_flags & SHOW_FUTURE)))
794 hidden = true;
795 else if (_is_cloud_hidden(node_ptr))
796 hidden = true;
797 else if ((node_ptr->name == NULL) ||
798 (node_ptr->name[0] == '\0'))
799 hidden = true;
800
801 if (hidden) {
802 char *orig_name = node_ptr->name;
803 node_ptr->name = NULL;
804 _pack_node(node_ptr, buffer, protocol_version,
805 show_flags);
806 node_ptr->name = orig_name;
807 } else {
808 _pack_node(node_ptr, buffer, protocol_version,
809 show_flags);
810 }
811 nodes_packed++;
812 }
813 } else {
814 error("select_g_select_jobinfo_pack: protocol_version "
815 "%hu not supported", protocol_version);
816 }
817
818 tmp_offset = get_buf_offset (buffer);
819 set_buf_offset (buffer, 0);
820 pack32 (nodes_packed, buffer);
821 set_buf_offset (buffer, tmp_offset);
822
823 *buffer_size = get_buf_offset (buffer);
824 buffer_ptr[0] = xfer_buf_data (buffer);
825 }
826
827 /*
828 * pack_one_node - dump all configuration and node information for one node
829 * in machine independent form (for network transmission)
830 * OUT buffer_ptr - pointer to the stored data
831 * OUT buffer_size - set to size of the buffer in bytes
832 * IN show_flags - node filtering options
833 * IN uid - uid of user making request (for partition filtering)
834 * IN node_name - name of node for which information is desired,
835 * use first node if name is NULL
836 * IN protocol_version - slurm protocol version of client
837 * global: node_record_table_ptr - pointer to global node table
838 * NOTE: the caller must xfree the buffer at *buffer_ptr
839 * NOTE: change slurm_load_node() in api/node_info.c when data format changes
840 */
841 extern void pack_one_node (char **buffer_ptr, int *buffer_size,
842 uint16_t show_flags, uid_t uid, char *node_name,
843 uint16_t protocol_version)
844 {
845 uint32_t nodes_packed, tmp_offset;
846 Buf buffer;
847 time_t now = time(NULL);
848 node_record_t *node_ptr;
849 bool hidden;
850
851 xassert(verify_lock(CONF_LOCK, READ_LOCK));
852 xassert(verify_lock(PART_LOCK, READ_LOCK));
853
854 buffer_ptr[0] = NULL;
855 *buffer_size = 0;
856
857 buffer = init_buf (BUF_SIZE);
858 nodes_packed = 0;
859
860 if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
861 /* write header: count and time */
862 pack32(nodes_packed, buffer);
863 pack_time(now, buffer);
864
865 /* write node records */
866 if (node_name)
867 node_ptr = find_node_record(node_name);
868 else
869 node_ptr = node_record_table_ptr;
870 if (node_ptr) {
871 hidden = false;
872 if (((show_flags & SHOW_ALL) == 0) && (uid != 0) &&
873 (_node_is_hidden(node_ptr, uid)))
874 hidden = true;
875 else if (IS_NODE_FUTURE(node_ptr) &&
876 (!(show_flags & SHOW_FUTURE)))
877 hidden = true;
878 // Don't hide the node if explicitly requested by name
879 // else if (_is_cloud_hidden(node_ptr))
880 // hidden = true;
881 else if ((node_ptr->name == NULL) ||
882 (node_ptr->name[0] == '\0'))
883 hidden = true;
884
885 if (!hidden) {
886 _pack_node(node_ptr, buffer, protocol_version,
887 show_flags);
888 nodes_packed++;
889 }
890 }
891 } else {
892 error("select_g_select_jobinfo_pack: protocol_version "
893 "%hu not supported", protocol_version);
894 }
895
896 tmp_offset = get_buf_offset (buffer);
897 set_buf_offset (buffer, 0);
898 pack32 (nodes_packed, buffer);
899 set_buf_offset (buffer, tmp_offset);
900
901 *buffer_size = get_buf_offset (buffer);
902 buffer_ptr[0] = xfer_buf_data (buffer);
903 }
904
905 /*
906 * _pack_node - dump all configuration information about a specific node in
907 * machine independent form (for network transmission)
908 * IN dump_node_ptr - pointer to node for which information is requested
909 * IN/OUT buffer - buffer where data is placed, pointers automatically updated
910 * IN protocol_version - slurm protocol version of client
911 * IN show_flags - node filtering options (SHOW_DETAIL adds GRES detail)
912 * NOTE: if you make any changes here be sure to make the corresponding changes
913 * to _unpack_node_info_members() in common/slurm_protocol_pack.c
914 */
915 static void _pack_node(node_record_t *dump_node_ptr, Buf buffer,
916 uint16_t protocol_version, uint16_t show_flags)
917 {
918 char *gres_drain = NULL, *gres_used = NULL;
919
920 xassert(verify_lock(CONF_LOCK, READ_LOCK));
921
922
923 if (protocol_version >= SLURM_20_02_PROTOCOL_VERSION) {
924 packstr(dump_node_ptr->name, buffer);
925 packstr(dump_node_ptr->node_hostname, buffer);
926 packstr(dump_node_ptr->comm_name, buffer);
927 packstr(dump_node_ptr->bcast_address, buffer);
928 pack16(dump_node_ptr->port, buffer);
929 pack32(dump_node_ptr->next_state, buffer);
930 pack32(dump_node_ptr->node_state, buffer);
931 packstr(dump_node_ptr->version, buffer);
932
933 /* Only data from config_record used for scheduling */
934 pack16(dump_node_ptr->config_ptr->cpus, buffer);
935 pack16(dump_node_ptr->config_ptr->boards, buffer);
936 pack16(dump_node_ptr->config_ptr->sockets, buffer);
937 pack16(dump_node_ptr->config_ptr->cores, buffer);
938 pack16(dump_node_ptr->config_ptr->threads, buffer);
939 pack64(dump_node_ptr->config_ptr->real_memory, buffer);
940 pack32(dump_node_ptr->config_ptr->tmp_disk, buffer);
941
942 packstr(dump_node_ptr->mcs_label, buffer);
943 pack32(dump_node_ptr->owner, buffer);
944 pack16(dump_node_ptr->core_spec_cnt, buffer);
945 pack32(dump_node_ptr->cpu_bind, buffer);
946 pack64(dump_node_ptr->mem_spec_limit, buffer);
947 packstr(dump_node_ptr->cpu_spec_list, buffer);
948
949 pack32(dump_node_ptr->cpu_load, buffer);
950 pack64(dump_node_ptr->free_mem, buffer);
951 pack32(dump_node_ptr->config_ptr->weight, buffer);
952 pack32(dump_node_ptr->reason_uid, buffer);
953
954 pack_time(dump_node_ptr->boot_time, buffer);
955 pack_time(dump_node_ptr->reason_time, buffer);
956 pack_time(dump_node_ptr->slurmd_start_time, buffer);
957
958 select_g_select_nodeinfo_pack(dump_node_ptr->select_nodeinfo,
959 buffer, protocol_version);
960
961 packstr(dump_node_ptr->arch, buffer);
962 packstr(dump_node_ptr->features, buffer);
963 packstr(dump_node_ptr->features_act, buffer);
964 if (dump_node_ptr->gres)
965 packstr(dump_node_ptr->gres, buffer);
966 else
967 packstr(dump_node_ptr->config_ptr->gres, buffer);
968
969 /* Gathering GRES details is slow, so skip it by default */
970 if (show_flags & SHOW_DETAIL) {
971 gres_drain =
972 gres_get_node_drain(dump_node_ptr->gres_list);
973 gres_used =
974 gres_get_node_used(dump_node_ptr->gres_list);
975 }
976 packstr(gres_drain, buffer);
977 packstr(gres_used, buffer);
978 xfree(gres_drain);
979 xfree(gres_used);
980
981 packstr(dump_node_ptr->os, buffer);
982 packstr(dump_node_ptr->reason, buffer);
983 acct_gather_energy_pack(dump_node_ptr->energy, buffer,
984 protocol_version);
985 ext_sensors_data_pack(dump_node_ptr->ext_sensors, buffer,
986 protocol_version);
987 power_mgmt_data_pack(dump_node_ptr->power, buffer,
988 protocol_version);
989
990 packstr(dump_node_ptr->tres_fmt_str, buffer);
991 } else if (protocol_version >= SLURM_MIN_PROTOCOL_VERSION) {
992 packstr (dump_node_ptr->name, buffer);
993 packstr (dump_node_ptr->node_hostname, buffer);
994 packstr (dump_node_ptr->comm_name, buffer);
995 pack16(dump_node_ptr->port, buffer);
996 pack32(dump_node_ptr->next_state, buffer);
997 pack32(dump_node_ptr->node_state, buffer);
998 packstr (dump_node_ptr->version, buffer);
999
1000 /* Only data from config_record used for scheduling */
1001 pack16(dump_node_ptr->config_ptr->cpus, buffer);
1002 pack16(dump_node_ptr->config_ptr->boards, buffer);
1003 pack16(dump_node_ptr->config_ptr->sockets, buffer);
1004 pack16(dump_node_ptr->config_ptr->cores, buffer);
1005 pack16(dump_node_ptr->config_ptr->threads, buffer);
1006 pack64(dump_node_ptr->config_ptr->real_memory, buffer);
1007 pack32(dump_node_ptr->config_ptr->tmp_disk, buffer);
1008
1009 packstr(dump_node_ptr->mcs_label, buffer);
1010 pack32(dump_node_ptr->owner, buffer);
1011 pack16(dump_node_ptr->core_spec_cnt, buffer);
1012 pack32(dump_node_ptr->cpu_bind, buffer);
1013 pack64(dump_node_ptr->mem_spec_limit, buffer);
1014 packstr(dump_node_ptr->cpu_spec_list, buffer);
1015
1016 pack32(dump_node_ptr->cpu_load, buffer);
1017 pack64(dump_node_ptr->free_mem, buffer);
1018 pack32(dump_node_ptr->config_ptr->weight, buffer);
1019 pack32(dump_node_ptr->reason_uid, buffer);
1020
1021 pack_time(dump_node_ptr->boot_time, buffer);
1022 pack_time(dump_node_ptr->reason_time, buffer);
1023 pack_time(dump_node_ptr->slurmd_start_time, buffer);
1024
1025 select_g_select_nodeinfo_pack(dump_node_ptr->select_nodeinfo,
1026 buffer, protocol_version);
1027
1028 packstr(dump_node_ptr->arch, buffer);
1029 packstr(dump_node_ptr->features, buffer);
1030 packstr(dump_node_ptr->features_act, buffer);
1031 if (dump_node_ptr->gres)
1032 packstr(dump_node_ptr->gres, buffer);
1033 else
1034 packstr(dump_node_ptr->config_ptr->gres, buffer);
1035
1036 /* Gathering GRES details is slow, so skip it by default */
1037 if (show_flags & SHOW_DETAIL) {
1038 gres_drain =
1039 gres_get_node_drain(dump_node_ptr->gres_list);
1040 gres_used =
1041 gres_get_node_used(dump_node_ptr->gres_list);
1042 }
1043 packstr(gres_drain, buffer);
1044 packstr(gres_used, buffer);
1045 xfree(gres_drain);
1046 xfree(gres_used);
1047
1048 packstr(dump_node_ptr->os, buffer);
1049 packstr(dump_node_ptr->reason, buffer);
1050 acct_gather_energy_pack(dump_node_ptr->energy, buffer,
1051 protocol_version);
1052 ext_sensors_data_pack(dump_node_ptr->ext_sensors, buffer,
1053 protocol_version);
1054 power_mgmt_data_pack(dump_node_ptr->power, buffer,
1055 protocol_version);
1056
1057 packstr(dump_node_ptr->tres_fmt_str,buffer);
1058 } else {
1059 error("_pack_node: protocol_version "
1060 "%hu not supported", protocol_version);
1061 }
1062 }
1063
1064 /* Return "true" if a node's state is already "new_state". This is more
1065 * complex than simply comparing the state values due to flags (e.g.
1066 * A node might be DOWN + NO_RESPOND or IDLE + DRAIN) */
1067 static bool _equivalent_node_state(node_record_t *node_ptr, uint32_t new_state)
1068 {
1069 if (new_state == NO_VAL) /* No change */
1070 return true;
1071 if ((new_state == NODE_STATE_DOWN) && IS_NODE_DOWN(node_ptr))
1072 return true;
1073 if ((new_state == NODE_STATE_DRAIN) && IS_NODE_DRAIN(node_ptr))
1074 return true;
1075 if ((new_state == NODE_STATE_FAIL) && IS_NODE_FAIL(node_ptr))
1076 return true;
1077 /* Other states might be added here */
1078 return false;
1079 }
1080
1081 /* Confirm that the selected ActiveFeatures are a subset of AvailableFeatures */
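/* Example: an ActiveFeatures list of "knl,cache" is a valid subset of
 * AvailableFeatures "knl,cache,flat", while "knl,quad" is not because
 * "quad" does not appear in the available list. */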
1082 static bool _valid_features_act(char *features_act, char *features)
1083 {
1084 bool valid_subset = true;
1085 char *tmp_act, *last_act = NULL, *tok_act;
1086 char *tmp_avail, *last_avail = NULL, *tok_avail;
1087
1088 if (!features_act || (features_act[0] == '\0'))
1089 return true;
1090 if (!features || (features[0] == '\0'))
1091 return false;
1092
1093 tmp_act = xstrdup(features_act);
1094 tok_act = strtok_r(tmp_act, ",", &last_act);
1095 while (tok_act) {
1096 last_avail = NULL;
1097 tmp_avail = xstrdup(features);
1098 tok_avail = strtok_r(tmp_avail, ",", &last_avail);
1099 while (tok_avail) {
1100 if (!xstrcmp(tok_act, tok_avail))
1101 break;
1102 tok_avail = strtok_r(NULL, ",", &last_avail);
1103 }
1104 xfree(tmp_avail);
1105 if (!tok_avail) { /* No match found */
1106 valid_subset = false;
1107 break;
1108 }
1109 tok_act = strtok_r(NULL, ",", &last_act);
1110 }
1111 xfree(tmp_act);
1112
1113 return valid_subset;
1114 }
1115
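/* Clear the DRAIN flag and reason previously set by a "Reboot ASAP"
 * request; used below when a pending reboot is cancelled. */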
1116 static void _undo_reboot_asap(node_record_t *node_ptr)
1117 {
1118 node_ptr->node_state &= (~NODE_STATE_DRAIN);
1119 xfree(node_ptr->reason);
1120 }
1121
1122 /*
1123 * update_node - update the configuration data for one or more nodes
1124 * IN update_node_msg - update node request
1125 * RET SLURM_SUCCESS or error code
1126 * global: node_record_table_ptr - pointer to global node table
1127 */
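/*
 * For illustration only: a minimal update_node_msg_t such as
 * "scontrol update NodeName=tux[1-2] State=DRAIN Reason=maint" might
 * produce. Field names follow their use below; the init helper is
 * assumed from the Slurm API:
 *
 *	update_node_msg_t msg;
 *	slurm_init_update_node_msg(&msg);
 *	msg.node_names = "tux[1-2]";
 *	msg.node_state = NODE_STATE_DRAIN;
 *	msg.reason = "maint";
 *	msg.reason_uid = uid;
 *	rc = update_node(&msg);
 */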
1128 int update_node ( update_node_msg_t * update_node_msg )
1129 {
1130 int error_code = 0, node_cnt, node_inx;
1131 node_record_t *node_ptr = NULL;
1132 char *this_node_name = NULL, *tmp_feature, *orig_features_act = NULL;
1133 hostlist_t host_list, hostaddr_list = NULL, hostname_list = NULL;
1134 uint32_t base_state = 0, node_flags, state_val;
1135 time_t now = time(NULL);
1136
1137 if (update_node_msg->node_names == NULL ) {
1138 info("%s: invalid node name", __func__);
1139 return ESLURM_INVALID_NODE_NAME;
1140 }
1141
1142 host_list = hostlist_create(update_node_msg->node_names);
1143 if (host_list == NULL) {
1144 info("update_node: hostlist_create error on %s: %m",
1145 update_node_msg->node_names);
1146 return ESLURM_INVALID_NODE_NAME;
1147 }
1148 node_cnt = hostlist_count(host_list);
1149
1150 if (update_node_msg->node_addr) {
1151 hostaddr_list = hostlist_create(update_node_msg->node_addr);
1152 if (hostaddr_list == NULL) {
1153 info("update_node: hostlist_create error on %s: %m",
1154 update_node_msg->node_addr);
1155 FREE_NULL_HOSTLIST(host_list);
1156 return ESLURM_INVALID_NODE_NAME;
1157 }
1158 if (node_cnt != hostlist_count(hostaddr_list)) {
1159 info("update_node: nodecount mismatch");
1160 FREE_NULL_HOSTLIST(host_list);
1161 FREE_NULL_HOSTLIST(hostaddr_list);
1162 return ESLURM_INVALID_NODE_NAME;
1163 }
1164 }
1165
1166 if (update_node_msg->node_hostname) {
1167 hostname_list = hostlist_create(update_node_msg->node_hostname);
1168 if (hostname_list == NULL) {
1169 info("update_node: hostlist_create error on %s: %m",
1170 update_node_msg->node_hostname);
1171 FREE_NULL_HOSTLIST(host_list);
1172 FREE_NULL_HOSTLIST(hostaddr_list);
1173 return ESLURM_INVALID_NODE_NAME;
1174 }
1175 if (node_cnt != hostlist_count(hostname_list)) {
1176 info("update_node: nodecount mismatch");
1177 FREE_NULL_HOSTLIST(host_list);
1178 FREE_NULL_HOSTLIST(hostaddr_list);
1179 FREE_NULL_HOSTLIST(hostname_list);
1180 return ESLURM_INVALID_NODE_NAME;
1181 }
1182 }
1183
1184 while ( (this_node_name = hostlist_shift (host_list)) ) {
1185 int err_code = 0;
1186 bool acct_updated = false;
1187
1188 node_ptr = find_node_record (this_node_name);
1189 node_inx = node_ptr - node_record_table_ptr;
1190 if (node_ptr == NULL) {
1191 error ("update_node: node %s does not exist",
1192 this_node_name);
1193 error_code = ESLURM_INVALID_NODE_NAME;
1194 free (this_node_name);
1195 break;
1196 }
1197
1198 if (hostaddr_list) {
1199 char *this_addr = hostlist_shift(hostaddr_list);
1200 xfree(node_ptr->comm_name);
1201 node_ptr->comm_name = xstrdup(this_addr);
1202 free(this_addr);
1203 }
1204 if (hostname_list) {
1205 char *this_hostname = hostlist_shift(hostname_list);
1206 xfree(node_ptr->node_hostname);
1207 node_ptr->node_hostname = xstrdup(this_hostname);
1208 free(this_hostname);
1209 }
1210 if (hostaddr_list || hostname_list) {
1211 /* This updates the lookup table addresses */
1212 slurm_reset_alias(node_ptr->name, node_ptr->comm_name,
1213 node_ptr->node_hostname);
1214 }
1215
1216 if (update_node_msg->cpu_bind) {
1217 char tmp_str[128];
1218 slurm_sprint_cpu_bind_type(tmp_str,
1219 update_node_msg->cpu_bind);
1220 info("update_node: setting CpuBind to %s for node %s",
1221 tmp_str, this_node_name);
1222 if (update_node_msg->cpu_bind == CPU_BIND_OFF)
1223 node_ptr->cpu_bind = 0;
1224 else
1225 node_ptr->cpu_bind = update_node_msg->cpu_bind;
1226 }
1227
1228 if (update_node_msg->features || update_node_msg->features_act) {
1229 char *features_act = NULL, *features_avail = NULL;
1230 if (!node_features_g_node_update_valid(node_ptr,
1231 update_node_msg)) {
1232 error_code = ESLURM_INVALID_FEATURE;
1233 xfree(update_node_msg->features);
1234 xfree(update_node_msg->features_act);
1235 }
1236 if (update_node_msg->features_act)
1237 features_act = update_node_msg->features_act;
1238 else
1239 features_act = node_ptr->features_act;
1240
1241 if (update_node_msg->features)
1242 features_avail = update_node_msg->features;
1243 else
1244 features_avail = node_ptr->features;
1245 if (!_valid_features_act(features_act, features_avail)){
1246 info("%s: Invalid ActiveFeatures (\'%s\' not subset of \'%s\' on node %s)",
1247 __func__, features_act, features_avail,
1248 node_ptr->name);
1249 error_code = ESLURM_ACTIVE_FEATURE_NOT_SUBSET;
1250 xfree(update_node_msg->features);
1251 xfree(update_node_msg->features_act);
1252 }
1253 }
1254
1255 if (update_node_msg->features_act) {
1256 if (node_ptr->features_act)
1257 orig_features_act =
1258 xstrdup(node_ptr->features_act);
1259 else
1260 orig_features_act = xstrdup(node_ptr->features);
1261 }
1262 if (update_node_msg->features) {
1263 if (!update_node_msg->features_act &&
1264 (node_features_g_count() == 0)) {
1265 /*
1266 * If no NodeFeatures plugin and no explicit
1267 * active features, then make active and
1268 * available feature values match
1269 */
1270 update_node_msg->features_act =
1271 xstrdup(update_node_msg->features);
1272 }
1273 xfree(node_ptr->features);
1274 if (update_node_msg->features[0]) {
1275 node_ptr->features =
1276 node_features_g_node_xlate2(
1277 update_node_msg->features);
1278 }
1279 /*
1280 * _update_node_avail_features() logs and updates
1281 * avail_feature_list below
1282 */
1283 }
1284
1285 if (update_node_msg->features_act) {
1286 tmp_feature = node_features_g_node_xlate(
1287 update_node_msg->features_act,
1288 orig_features_act, node_ptr->features,
1289 node_inx);
1290 xfree(node_ptr->features_act);
1291 node_ptr->features_act = tmp_feature;
1292 error_code = _update_node_active_features(
1293 node_ptr->name,
1294 node_ptr->features_act,
1295 FEATURE_MODE_COMB);
1296 xfree(orig_features_act);
1297 }
1298
1299 if (update_node_msg->gres) {
1300 xfree(node_ptr->gres);
1301 if (update_node_msg->gres[0])
1302 node_ptr->gres = xstrdup(update_node_msg->gres);
1303 /* _update_node_gres() logs and updates config */
1304 }
1305
1306 /* No accounting update if node state and reason are unchanged */
1307 state_val = update_node_msg->node_state;
1308 if (_equivalent_node_state(node_ptr, state_val) &&
1309 !xstrcmp(node_ptr->reason, update_node_msg->reason)) {
1310 free(this_node_name);
1311 continue;
1312 }
1313
1314 if ((update_node_msg -> reason) &&
1315 (update_node_msg -> reason[0])) {
1316 xfree(node_ptr->reason);
1317 node_ptr->reason = xstrdup(update_node_msg->reason);
1318 node_ptr->reason_time = now;
1319 node_ptr->reason_uid = update_node_msg->reason_uid;
1320 info ("update_node: node %s reason set to: %s",
1321 this_node_name, node_ptr->reason);
1322 }
1323
1324 if (state_val != NO_VAL) {
1325 base_state = node_ptr->node_state;
1326 if (!_valid_node_state_change(base_state, state_val)) {
1327 info("Invalid node state transition requested "
1328 "for node %s from=%s to=%s",
1329 this_node_name,
1330 node_state_string(base_state),
1331 node_state_string(state_val));
1332 state_val = NO_VAL;
1333 error_code = ESLURM_INVALID_NODE_STATE;
1334 }
1335 base_state &= NODE_STATE_BASE;
1336 }
1337
1338 if (state_val != NO_VAL) {
1339 node_flags = node_ptr->node_state & NODE_STATE_FLAGS;
1340 if (state_val == NODE_RESUME) {
1341 if (IS_NODE_IDLE(node_ptr) &&
1342 (IS_NODE_DRAIN(node_ptr) ||
1343 IS_NODE_FAIL(node_ptr))) {
1344 clusteracct_storage_g_node_up(
1345 acct_db_conn,
1346 node_ptr,
1347 now);
1348 acct_updated = true;
1349 }
1350 node_ptr->node_state &= (~NODE_STATE_DRAIN);
1351 node_ptr->node_state &= (~NODE_STATE_FAIL);
1352 node_ptr->node_state &= (~NODE_STATE_REBOOT);
1353 node_ptr->node_state &=
1354 (~NODE_STATE_POWERING_DOWN);
1355 if (IS_NODE_DOWN(node_ptr)) {
1356 state_val = NODE_STATE_IDLE;
1357 #ifndef HAVE_FRONT_END
1358 node_ptr->node_state |=
1359 NODE_STATE_NO_RESPOND;
1360 #endif
1361 node_ptr->last_response = MAX(now,
1362 node_ptr->last_response);
1363 node_ptr->boot_time = 0;
1364 ping_nodes_now = true;
1365 } else if (IS_NODE_FUTURE(node_ptr)) {
1366 if (node_ptr->port == 0) {
1367 node_ptr->port = slurmctld_conf.
1368 slurmd_port;
1369 }
1370 slurm_set_addr( &node_ptr->slurm_addr,
1371 node_ptr->port,
1372 node_ptr->comm_name);
1373 if (node_ptr->slurm_addr.sin_port) {
1374 state_val = NODE_STATE_IDLE;
1375 #ifndef HAVE_FRONT_END
1376 node_ptr->node_state |=
1377 NODE_STATE_NO_RESPOND;
1378 #endif
1379 bit_clear(future_node_bitmap,
1380 node_inx);
1381 node_ptr->last_response =
1382 MAX(now,
1383 node_ptr->last_response);
1384 node_ptr->boot_time = 0;
1385 ping_nodes_now = true;
1386 } else {
1387 error("slurm_set_addr failure "
1388 "on %s",
1389 node_ptr->comm_name);
1390 state_val = base_state;
1391 }
1392 } else
1393 state_val = base_state;
1394 } else if (state_val == NODE_STATE_UNDRAIN) {
1395 if (IS_NODE_IDLE(node_ptr) &&
1396 IS_NODE_DRAIN(node_ptr)) {
1397 clusteracct_storage_g_node_up(
1398 acct_db_conn,
1399 node_ptr,
1400 now);
1401 acct_updated = true;
1402 }
1403 node_ptr->node_state &= (~NODE_STATE_DRAIN);
1404 state_val = base_state;
1405 }
1406
1407 if ((state_val == NODE_STATE_DOWN) ||
1408 (state_val == NODE_STATE_FUTURE)) {
1409 /* We must set node DOWN before killing
1410 * its jobs */
1411 _make_node_down(node_ptr, now);
1412 kill_running_job_by_node_name (this_node_name);
1413 if (state_val == NODE_STATE_FUTURE) {
1414 node_ptr->node_state = NODE_STATE_FUTURE
1415 | node_flags;
1416 bit_set(future_node_bitmap, node_inx);
1417 }
1418 } else if (state_val == NODE_STATE_IDLE) {
1419 /* assume they want to clear DRAIN and
1420 * FAIL flags too */
1421 if (IS_NODE_DOWN(node_ptr)) {
1422 trigger_node_up(node_ptr);
1423 clusteracct_storage_g_node_up(
1424 acct_db_conn,
1425 node_ptr,
1426 now);
1427 acct_updated = true;
1428 } else if (IS_NODE_IDLE(node_ptr) &&
1429 (IS_NODE_DRAIN(node_ptr) ||
1430 IS_NODE_FAIL(node_ptr))) {
1431 clusteracct_storage_g_node_up(
1432 acct_db_conn,
1433 node_ptr,
1434 now);
1435 acct_updated = true;
1436 } /* else already fully available */
1437 node_ptr->node_state &= (~NODE_STATE_DRAIN);
1438 node_ptr->node_state &= (~NODE_STATE_FAIL);
1439 if (!IS_NODE_NO_RESPOND(node_ptr) ||
1440 IS_NODE_POWER_SAVE(node_ptr))
1441 make_node_avail(node_inx);
1442 bit_set (idle_node_bitmap, node_inx);
1443 bit_set (up_node_bitmap, node_inx);
1444 if (IS_NODE_POWER_SAVE(node_ptr))
1445 node_ptr->last_idle = 0;
1446 else
1447 node_ptr->last_idle = now;
1448 } else if (state_val == NODE_STATE_ALLOCATED) {
1449 if (!IS_NODE_DRAIN(node_ptr) &&
1450 !IS_NODE_FAIL(node_ptr) &&
1451 !IS_NODE_NO_RESPOND(node_ptr))
1452 make_node_avail(node_inx);
1453 bit_set (up_node_bitmap, node_inx);
1454 bit_clear (idle_node_bitmap, node_inx);
1455 } else if ((state_val == NODE_STATE_DRAIN) ||
1456 (state_val == NODE_STATE_FAIL)) {
1457 uint32_t new_state = state_val;
1458 if ((IS_NODE_ALLOCATED(node_ptr) ||
1459 IS_NODE_MIXED(node_ptr)) &&
1460 (IS_NODE_POWER_SAVE(node_ptr) ||
1461 IS_NODE_POWER_UP(node_ptr))) {
1462 info("%s: DRAIN/FAIL request for node %s which is allocated and being powered up. Requeueing jobs",
1463 __func__, this_node_name);
1464 kill_running_job_by_node_name(
1465 this_node_name);
1466 }
1467 bit_clear (avail_node_bitmap, node_inx);
1468 node_ptr->node_state &= (~NODE_STATE_DRAIN);
1469 node_ptr->node_state &= (~NODE_STATE_FAIL);
1470 state_val = node_ptr->node_state |= state_val;
1471 if ((node_ptr->run_job_cnt == 0) &&
1472 (node_ptr->comp_job_cnt == 0)) {
1473 trigger_node_drained(node_ptr);
1474 clusteracct_storage_g_node_down(
1475 acct_db_conn,
1476 node_ptr, now, NULL,
1477 node_ptr->reason_uid);
1478 }
1479 if ((new_state == NODE_STATE_FAIL) &&
1480 (nonstop_ops.node_fail))
1481 (nonstop_ops.node_fail)(NULL, node_ptr);
1482 } else if (state_val == NODE_STATE_POWER_SAVE) {
1483 if (IS_NODE_POWER_SAVE(node_ptr)) {
1484 node_ptr->node_state &=
1485 (~NODE_STATE_POWER_SAVE);
1486 info("power down request repeating "
1487 "for node %s", this_node_name);
1488 } else {
1489 if (IS_NODE_DOWN(node_ptr)) {
1490 /* Abort any power up request */
1491 node_ptr->node_state &=
1492 (~NODE_STATE_POWER_UP);
1493 node_ptr->node_state =
1494 NODE_STATE_IDLE |
1495 (node_ptr->node_state &
1496 NODE_STATE_FLAGS);
1497 } else {
1498 node_ptr->node_state &=
1499 (~NODE_STATE_POWER_SAVE);
1500 }
1501 #ifndef HAVE_FRONT_END
1502 node_ptr->node_state |=
1503 NODE_STATE_NO_RESPOND;
1504 #endif
1505
1506 info("powering down node %s",
1507 this_node_name);
1508 }
1509 node_ptr->last_idle = 1;
1510 node_ptr->next_state = NO_VAL;
1511 bit_clear(rs_node_bitmap, node_inx);
1512 free(this_node_name);
1513 continue;
1514 } else if (state_val == NODE_STATE_POWER_UP) {
1515 if (!IS_NODE_POWER_SAVE(node_ptr)) {
1516 if (IS_NODE_POWER_UP(node_ptr)) {
1517 node_ptr->last_idle = now;
1518 node_ptr->node_state |=
1519 NODE_STATE_POWER_SAVE;
1520 info("power up request "
1521 "repeating for node %s",
1522 this_node_name);
1523 } else {
1524 verbose("node %s is already "
1525 "powered up",
1526 this_node_name);
1527 }
1528 } else {
1529 node_ptr->last_idle = now;
1530 info("powering up node %s",
1531 this_node_name);
1532 }
1533 node_ptr->next_state = NO_VAL;
1534 bit_clear(rs_node_bitmap, node_inx);
1535 free(this_node_name);
1536 continue;
1537 } else if ((state_val & NODE_STATE_POWER_SAVE) &&
1538 (state_val & NODE_STATE_POWER_UP) &&
1539 (IS_NODE_POWER_UP(node_ptr))) {
1540 /* Clear any reboot operation in progress */
1541 node_ptr->node_state &= (~NODE_STATE_POWER_UP);
1542 node_ptr->last_response = MAX(now,
1543 node_ptr->last_response);
1544 state_val = base_state;
1545 } else if (state_val == NODE_STATE_NO_RESPOND) {
1546 node_ptr->node_state |= NODE_STATE_NO_RESPOND;
1547 state_val = base_state;
1548 bit_clear(avail_node_bitmap, node_inx);
1549 } else if (state_val == NODE_STATE_CANCEL_REBOOT) {
1550 if (IS_NODE_RUNNING_JOB(node_ptr)) {
1551 node_ptr->node_state &=
1552 (~NODE_STATE_REBOOT);
1553 state_val = base_state;
1554 if (!xstrcmp(node_ptr->reason,
1555 "Reboot ASAP"))
1556 _undo_reboot_asap(node_ptr);
1557 } else {
1558 info("REBOOT on node %s already in progress -- unable to cancel",
1559 this_node_name);
1560 err_code = error_code =
1561 ESLURM_REBOOT_IN_PROGRESS;
1562 }
1563 } else {
1564 info("Invalid node state specified %u",
1565 state_val);
1566 err_code = 1;
1567 error_code = ESLURM_INVALID_NODE_STATE;
1568 }
1569
1570 if (err_code == 0) {
1571 node_ptr->node_state = state_val |
1572 (node_ptr->node_state &
1573 NODE_STATE_FLAGS);
1574
1575 if (!IS_NODE_REBOOT(node_ptr))
1576 node_ptr->next_state = NO_VAL;
1577 bit_clear(rs_node_bitmap, node_inx);
1578
1579 info ("update_node: node %s state set to %s",
1580 this_node_name,
1581 node_state_string(state_val));
1582 }
1583 }
1584
1585 if (!acct_updated && !IS_NODE_DOWN(node_ptr) &&
1586 !IS_NODE_DRAIN(node_ptr) && !IS_NODE_FAIL(node_ptr)) {
1587 /* reason information is handled in
1588 clusteracct_storage_g_node_up()
1589 */
1590 clusteracct_storage_g_node_up(
1591 acct_db_conn, node_ptr, now);
1592 }
1593
1594 free (this_node_name);
1595 }
1596
1597 /* Log any pending feature change messages, then clear them */
1598 (void)_update_node_active_features(NULL, NULL, FEATURE_MODE_PEND);
1599
1600 FREE_NULL_HOSTLIST(host_list);
1601 FREE_NULL_HOSTLIST(hostaddr_list);
1602 FREE_NULL_HOSTLIST(hostname_list);
1603 last_node_update = now;
1604
1605 if ((error_code == SLURM_SUCCESS) && (update_node_msg->features)) {
1606 error_code = _update_node_avail_features(
1607 update_node_msg->node_names,
1608 update_node_msg->features,
1609 FEATURE_MODE_IND);
1610 }
1611 if ((error_code == SLURM_SUCCESS) && (update_node_msg->gres)) {
1612 error_code = _update_node_gres(update_node_msg->node_names,
1613 update_node_msg->gres);
1614 }
1615
1616 /*
1617 * Update weight. Weight is part of config_ptr,
1618 * hence split config records if required
1619 */
1620 if ((error_code == SLURM_SUCCESS) &&
1621 (update_node_msg->weight != NO_VAL)) {
1622 error_code = _update_node_weight(update_node_msg->node_names,
1623 update_node_msg->weight);
1624 if (error_code == SLURM_SUCCESS) {
1625 /* sort config_list by weight for scheduling */
1626 list_sort(config_list, &list_compare_config);
1627 }
1628 }
1629
1630 return error_code;
1631 }
1632
1633 /*
1634 * restore_node_features - Make node and config (from slurm.conf) fields
1635 * consistent for Features, Gres and Weight
1636 * IN recover -
1637 * 0, 1 = use data from config record, built using slurm.conf
1638 * 2 = use data from node record, built from saved state
1639 */
1640 extern void restore_node_features(int recover)
1641 {
1642 int i, node_features_plugin_cnt;
1643 node_record_t *node_ptr;
1644
1645 node_features_plugin_cnt = node_features_g_count();
1646 for (i = 0, node_ptr = node_record_table_ptr; i < node_record_count;
1647 i++, node_ptr++) {
1648 if (node_ptr->weight != node_ptr->config_ptr->weight) {
1649 error("Node %s Weight(%u) differ from slurm.conf",
1650 node_ptr->name, node_ptr->weight);
1651 if (recover == 2) {
1652 _update_node_weight(node_ptr->name,
1653 node_ptr->weight);
1654 } else {
1655 node_ptr->weight = node_ptr->config_ptr->
1656 weight;
1657 }
1658 }
1659 if (xstrcmp(node_ptr->config_ptr->feature, node_ptr->features)){
1660 if (node_features_plugin_cnt == 0) {
1661 error("Node %s Features(%s) differ from slurm.conf",
1662 node_ptr->name, node_ptr->features);
1663 }
1664 if (recover == 2) {
1665 _update_node_avail_features(node_ptr->name,
1666 node_ptr->features,
1667 FEATURE_MODE_COMB);
1668 }
1669 }
1670
1671 /*
1672 * We lose the GRES information updated manually and always
1673 * use the information from slurm.conf
1674 */
1675 (void) gres_plugin_node_reconfig(
1676 node_ptr->name,
1677 node_ptr->config_ptr->gres,
1678 &node_ptr->gres,
1679 &node_ptr->gres_list,
1680 slurmctld_conf.conf_flags & CTL_CONF_OR,
1681 node_ptr->cores,
1682 (node_ptr->boards * node_ptr->sockets));
1683 gres_plugin_node_state_log(node_ptr->gres_list, node_ptr->name);
1684 }
1685 _update_node_avail_features(NULL, NULL, FEATURE_MODE_PEND);
1686 }
1687
1688 /* Duplicate a configuration record except for the node names & bitmap */
1689 config_record_t *_dup_config(config_record_t *config_ptr)
1690 {
1691 config_record_t *new_config_ptr;
1692
1693 new_config_ptr = create_config_record();
1694 new_config_ptr->magic = config_ptr->magic;
1695 new_config_ptr->cpus = config_ptr->cpus;
1696 new_config_ptr->cpu_spec_list = xstrdup(config_ptr->cpu_spec_list);
1697 new_config_ptr->boards = config_ptr->boards;
1698 new_config_ptr->sockets = config_ptr->sockets;
1699 new_config_ptr->cores = config_ptr->cores;
1700 new_config_ptr->core_spec_cnt = config_ptr->core_spec_cnt;
1701 new_config_ptr->threads = config_ptr->threads;
1702 new_config_ptr->real_memory = config_ptr->real_memory;
1703 new_config_ptr->mem_spec_limit = config_ptr->mem_spec_limit;
1704 new_config_ptr->tmp_disk = config_ptr->tmp_disk;
1705 new_config_ptr->weight = config_ptr->weight;
1706 new_config_ptr->feature = xstrdup(config_ptr->feature);
1707 new_config_ptr->gres = xstrdup(config_ptr->gres);
1708
1709 return new_config_ptr;
1710 }
1711
1712 /*
1713 * _update_node_weight - Update weight associated with nodes
1714 * build new config list records as needed
1715 * IN node_names - List of nodes to update
1716 * IN weight - New weight value
1717 * RET: SLURM_SUCCESS or error code
1718 */
1719 static int _update_node_weight(char *node_names, uint32_t weight)
1720 {
1721 bitstr_t *node_bitmap = NULL, *tmp_bitmap;
1722 ListIterator config_iterator;
1723 config_record_t *config_ptr, *new_config_ptr, *first_new = NULL;
1724 int rc, config_cnt, tmp_cnt;
1725
1726 rc = node_name2bitmap(node_names, false, &node_bitmap);
1727 if (rc) {
1728 info("_update_node_weight: invalid node_name");
1729 return rc;
1730 }
1731
1732 /* For each config_record with one of these nodes,
1733 * update it (if all nodes updated) or split it into
1734 * a new entry */
1735 config_iterator = list_iterator_create(config_list);
1736 while ((config_ptr = list_next(config_iterator))) {
1737 if (config_ptr == first_new)
1738 break; /* done with all original records */
1739
1740 tmp_bitmap = bit_copy(node_bitmap);
1741 bit_and(tmp_bitmap, config_ptr->node_bitmap);
1742 config_cnt = bit_set_count(config_ptr->node_bitmap);
1743 tmp_cnt = bit_set_count(tmp_bitmap);
1744 if (tmp_cnt == 0) {
1745 /* no overlap, leave alone */
1746 } else if (tmp_cnt == config_cnt) {
1747 /* all nodes changed, update in situ */
1748 config_ptr->weight = weight;
1749 } else {
1750 /* partial update, split config_record */
1751 new_config_ptr = _dup_config(config_ptr);
1752 if (first_new == NULL)
1753 first_new = new_config_ptr;
1754 /* Change weight for the given nodes */
1755 new_config_ptr->weight = weight;
1756 new_config_ptr->node_bitmap = bit_copy(tmp_bitmap);
1757 new_config_ptr->nodes = bitmap2node_name(tmp_bitmap);
1758 _update_config_ptr(tmp_bitmap, new_config_ptr);
1759
1760 /* Update remaining records */
1761 bit_and_not(config_ptr->node_bitmap, tmp_bitmap);
1762 xfree(config_ptr->nodes);
1763 config_ptr->nodes = bitmap2node_name(
1764 config_ptr->node_bitmap);
1765 }
1766 FREE_NULL_BITMAP(tmp_bitmap);
1767 }
1768 list_iterator_destroy(config_iterator);
1769 FREE_NULL_BITMAP(node_bitmap);
1770
1771 info("_update_node_weight: nodes %s weight set to: %u",
1772 node_names, weight);
1773 return SLURM_SUCCESS;
1774 }
1775
1776 static inline void _update_node_features_post(
1777 char *node_names,
1778 char **last_features, char *features,
1779 bitstr_t **last_node_bitmap, bitstr_t **node_bitmap,
1780 int mode, const char *type)
1781 {
1782
1783 xassert(last_features);
1784 xassert(last_node_bitmap);
1785 xassert(node_bitmap);
1786
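/* FEATURE_MODE_IND logs each request immediately; FEATURE_MODE_COMB
 * accumulates like changes in the caller's static last_* variables so
 * they can be reported as one message; FEATURE_MODE_PEND flushes any
 * pending combined message. */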
1787 if (mode == FEATURE_MODE_IND) {
1788 debug2("%s: nodes %s %s features set to: %s",
1789 __func__, node_names, type, features);
1790 } else if (*last_features && *last_node_bitmap &&
1791 ((mode == FEATURE_MODE_PEND) ||
1792 xstrcmp(features, *last_features))) {
1793 char *last_node_names = bitmap2node_name(*last_node_bitmap);
1794 debug2("%s: nodes %s %s features set to: %s",
1795 __func__, last_node_names, type, *last_features);
1796 xfree(last_node_names);
1797 xfree(*last_features);
1798 FREE_NULL_BITMAP(*last_node_bitmap);
1799 }
1800
1801 if (mode == FEATURE_MODE_COMB) {
1802 if (!*last_features) {
1803 /* Start combining records */
1804 *last_features = xstrdup(features);
1805 *last_node_bitmap = *node_bitmap;
1806 *node_bitmap = NULL;
1807 } else {
1808 /* Add this node to existing log info */
1809 bit_or(*last_node_bitmap, *node_bitmap);
1810 }
1811 }
1812 }
1813
1814 /*
1815 * _update_node_active_features - Update active features associated with nodes
1816 * IN node_names - List of nodes to update
1817 * IN active_features - New active features value
1818 * IN mode - FEATURE_MODE_IND : Print each node change individually
1819 * FEATURE_MODE_COMB: Try to combine like changes (SEE NOTE BELOW)
1820 * FEATURE_MODE_PEND: Print any pending change message
1821 * RET: SLURM_SUCCESS or error code
1822 * NOTE: Use mode=FEATURE_MODE_IND in a loop with node write lock set,
1823 * then call with mode=FEATURE_MODE_PEND at the end of the loop
1824 */
1825 static int _update_node_active_features(char *node_names, char *active_features,
1826 int mode)
1827 {
1828 static char *last_active_features = NULL;
1829 static bitstr_t *last_node_bitmap = NULL;
1830 bitstr_t *node_bitmap = NULL;
1831 int rc;
1832
1833 if (mode < FEATURE_MODE_PEND) {
1834 /* Perform update of node active features */
1835 rc = node_name2bitmap(node_names, false, &node_bitmap);
1836 if (rc) {
1837 info("%s: invalid node_name (%s)", __func__,
1838 node_names);
1839 return rc;
1840 }
1841 update_feature_list(active_feature_list, active_features,
1842 node_bitmap);
1843 (void) node_features_g_node_update(active_features,
1844 node_bitmap);
1845 }
1846
1847 _update_node_features_post(node_names,
1848 &last_active_features, active_features,
1849 &last_node_bitmap, &node_bitmap,
1850 mode, "active");
1851 FREE_NULL_BITMAP(node_bitmap);
1852
1853 return SLURM_SUCCESS;
1854 }
1855
1856 /*
1857 * _update_node_avail_features - Update available features associated with
1858 * nodes, build new config list records as needed
1859 * IN node_names - List of nodes to update
1860 * IN avail_features - New available features value
1861 * IN mode - FEATURE_MODE_IND : Print each node change individually
1862 * FEATURE_MODE_COMB: Try to combine like changes (SEE NOTE BELOW)
1863 * FEATURE_MODE_PEND: Print any pending change message
1864 * RET: SLURM_SUCCESS or error code
1865 * NOTE: Use mode=FEATURE_MODE_IND in a loop with node write lock set,
1866 * then call with mode=FEATURE_MODE_PEND at the end of the loop
1867 */
1868 static int _update_node_avail_features(char *node_names, char *avail_features,
1869 int mode)
1870 {
1871 static char *last_avail_features = NULL;
1872 static bitstr_t *last_node_bitmap = NULL;
1873 bitstr_t *node_bitmap = NULL, *tmp_bitmap;
1874 ListIterator config_iterator;
1875 config_record_t *config_ptr, *new_config_ptr, *first_new = NULL;
1876 int rc, config_cnt, tmp_cnt;
1877
1878 if (mode < FEATURE_MODE_PEND) {
1879 rc = node_name2bitmap(node_names, false, &node_bitmap);
1880 if (rc) {
1881 info("%s: invalid node_name (%s)",
1882 __func__, node_names);
1883 return rc;
1884 }
1885
1886 /*
1887 * For each config_record with one of these nodes, update it
1888 * (if all nodes updated) or split it into a new entry
1889 */
1890 config_iterator = list_iterator_create(config_list);
1891 while ((config_ptr = list_next(config_iterator))) {
1892 if (config_ptr == first_new)
1893 break; /* done with all original records */
1894
1895 tmp_bitmap = bit_copy(node_bitmap);
1896 bit_and(tmp_bitmap, config_ptr->node_bitmap);
1897 config_cnt = bit_set_count(config_ptr->node_bitmap);
1898 tmp_cnt = bit_set_count(tmp_bitmap);
1899 if (tmp_cnt == 0) {
1900 /* no overlap, leave alone */
1901 } else if (tmp_cnt == config_cnt) {
1902 /* all nodes changed, update in situ */
1903 xfree(config_ptr->feature);
1904 if (avail_features && avail_features[0]) {
1905 config_ptr->feature =
1906 xstrdup(avail_features);
1907 }
1908 } else {
1909 /* partial update, split config_record */
1910 new_config_ptr = _dup_config(config_ptr);
1911 if (first_new == NULL)
1912 first_new = new_config_ptr;
1913 xfree(new_config_ptr->feature);
1914 if (avail_features && avail_features[0]) {
1915 new_config_ptr->feature =
1916 xstrdup(avail_features);
1917 }
1918 new_config_ptr->node_bitmap =
1919 bit_copy(tmp_bitmap);
1920 new_config_ptr->nodes =
1921 bitmap2node_name(tmp_bitmap);
1922 _update_config_ptr(tmp_bitmap, new_config_ptr);
1923
1924 /* Update remaining records */
1925 bit_and_not(config_ptr->node_bitmap, tmp_bitmap);
1926 xfree(config_ptr->nodes);
1927 config_ptr->nodes = bitmap2node_name(
1928 config_ptr->node_bitmap);
1929 }
1930 FREE_NULL_BITMAP(tmp_bitmap);
1931 }
1932 list_iterator_destroy(config_iterator);
1933 if (avail_feature_list) { /* List not set at startup */
1934 update_feature_list(avail_feature_list, avail_features,
1935 node_bitmap);
1936 }
1937 }
1938
1939 _update_node_features_post(node_names,
1940 &last_avail_features, avail_features,
1941 &last_node_bitmap, &node_bitmap,
1942 mode, "available");
1943 FREE_NULL_BITMAP(node_bitmap);
1944
1945 return SLURM_SUCCESS;
1946 }
1947
1948 /*
1949 * _update_node_gres - Update generic resources associated with nodes
1950 * build new config list records as needed
1951 * IN node_names - List of nodes to update
1952 * IN gres - New gres value
1953 * RET: SLURM_SUCCESS or error code
1954 */
1955 static int _update_node_gres(char *node_names, char *gres)
1956 {
1957 bitstr_t *changed_node_bitmap = NULL, *node_bitmap = NULL, *tmp_bitmap;
1958 ListIterator config_iterator;
1959 config_record_t *config_ptr, *new_config_ptr, *first_new = NULL;
1960 node_record_t *node_ptr;
1961 int rc, rc2, overlap1, overlap2;
1962 int i, i_first, i_last;
1963
1964 rc = node_name2bitmap(node_names, false, &node_bitmap);
1965 if (rc) {
1966 info("%s: invalid node_name: %s", __func__, node_names);
1967 return rc;
1968 }
1969
1970 /*
1971 * For each config_record with one of these nodes,
1972 * update it (if all nodes updated) or split it into a new entry
1973 */
1974 config_iterator = list_iterator_create(config_list);
1975 while ((config_ptr = list_next(config_iterator))) {
1976 if (config_ptr == first_new)
1977 break; /* done with all original records */
1978
1979 overlap1 = bit_overlap(node_bitmap, config_ptr->node_bitmap);
1980 if (overlap1 == 0)
1981 continue; /* No changes to this config_record */
1982
1983 /* At least some nodes in this config need to change */
1984 tmp_bitmap = bit_copy(node_bitmap);
1985 bit_and(tmp_bitmap, config_ptr->node_bitmap);
1986 i_first = bit_ffs(tmp_bitmap);
1987 if (i_first >= 0)
1988 i_last = bit_fls(tmp_bitmap);
1989 else
1990 i_last = i_first - 1;
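/* Walk only the nodes present in both the request and this
 * config record and try to reconfigure GRES on each one. */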
1991 for (i = i_first; i <= i_last; i++) {
1992 if (!bit_test(tmp_bitmap, i))
1993 continue; /* Not this node */
1994 node_ptr = node_record_table_ptr + i;
1995 rc2 = gres_plugin_node_reconfig(
1996 node_ptr->name,
1997 gres, &node_ptr->gres,
1998 &node_ptr->gres_list,
1999 slurmctld_conf.conf_flags & CTL_CONF_OR,
2000 node_ptr->cores,
2001 (node_ptr->boards * node_ptr->sockets));
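/* On failure, drop this node from the set of changed nodes and
 * remember the first error code seen. */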
2002 if (rc2 != SLURM_SUCCESS) {
2003 bit_clear(tmp_bitmap, i);
2004 overlap1--;
2005 if (rc == SLURM_SUCCESS)
2006 rc = rc2;
2007 }
2008 gres_plugin_node_state_log(node_ptr->gres_list,
2009 node_ptr->name);
2010 }
2011
2012 overlap2 = bit_set_count(config_ptr->node_bitmap);
2013 if (overlap1 == 0) {
2014 /* No nodes actually changed in this configuration */
2015 FREE_NULL_BITMAP(tmp_bitmap);
2016 } else if (overlap1 == overlap2) {
2017 /* All nodes changed in this configuration */
2018 xfree(config_ptr->gres);
2019 if (gres && gres[0])
2020 config_ptr->gres = xstrdup(gres);
2021 if (changed_node_bitmap) {
2022 bit_or(changed_node_bitmap, tmp_bitmap);
2023 FREE_NULL_BITMAP(tmp_bitmap);
2024 } else {
2025 changed_node_bitmap = tmp_bitmap;
2026 tmp_bitmap = NULL;
2027 }
2028 } else {
2029 /*
2030 * Some nodes changed in this configuration.
2031 * Split config_record in two.
2032 */
2033 new_config_ptr = _dup_config(config_ptr);
2034 if (!first_new)
2035 first_new = new_config_ptr;
2036 xfree(new_config_ptr->gres);
2037 if (gres && gres[0])
2038 new_config_ptr->gres = xstrdup(gres);
2039 new_config_ptr->node_bitmap = tmp_bitmap;
2040 new_config_ptr->nodes = bitmap2node_name(tmp_bitmap);
2041 _update_config_ptr(tmp_bitmap, new_config_ptr);
2042 if (changed_node_bitmap) {
2043 bit_or(changed_node_bitmap, tmp_bitmap);
2044 } else {
2045 changed_node_bitmap = bit_copy(tmp_bitmap);
2046 }
2047
2048 /* Update remaining config_record */
2049 bit_and_not(config_ptr->node_bitmap, tmp_bitmap);
2050 xfree(config_ptr->nodes);
2051 config_ptr->nodes = bitmap2node_name(
2052 config_ptr->node_bitmap);
2053 tmp_bitmap = NULL; /* Nothing left to free */
2054 }
2055 }
2056 list_iterator_destroy(config_iterator);
2057 FREE_NULL_BITMAP(node_bitmap);
2058
2059 /* Report changed nodes; may be a subset of the requested nodes */
2060 if (changed_node_bitmap) {
2061 char *change_node_str = bitmap2node_name(changed_node_bitmap);
2062 info("%s: nodes %s gres set to: %s", __func__,
2063 change_node_str, gres);
2064 FREE_NULL_BITMAP(changed_node_bitmap);
2065 xfree(change_node_str);
2066 }
2067
2068 return rc;
2069 }
2070
2071 /* Reset the config pointer for updated jobs */
2072 static void _update_config_ptr(bitstr_t *bitmap, config_record_t *config_ptr)
2073 {
2074 int i;
2075
2076 for (i = 0; i < node_record_count; i++) {
2077 if (!bit_test(bitmap, i))
2078 continue;
2079 node_record_table_ptr[i].config_ptr = config_ptr;
2080 }
2081 }
2082
2083 /*
2084 * drain_nodes - drain one or more nodes,
2085 * no-op for nodes already drained or draining
2086 * IN nodes - nodes to drain
2087 * IN reason - reason to drain the nodes
2088 * RET SLURM_SUCCESS or error code
2089 * global: node_record_table_ptr - pointer to global node table
2090 */
2091 extern int drain_nodes(char *nodes, char *reason, uint32_t reason_uid)
2092 {
2093 int error_code = 0, node_inx;
2094 node_record_t *node_ptr;
2095 char *this_node_name ;
2096 hostlist_t host_list;
2097 time_t now = time(NULL);
2098
2099 if ((nodes == NULL) || (nodes[0] == '\0')) {
2100 error ("drain_nodes: invalid node name %s", nodes);
2101 return ESLURM_INVALID_NODE_NAME;
2102 }
2103
2104 if ( (host_list = hostlist_create (nodes)) == NULL) {
2105 error ("hostlist_create error on %s: %m", nodes);
2106 return ESLURM_INVALID_NODE_NAME;
2107 }
2108
2109 while ( (this_node_name = hostlist_shift (host_list)) ) {
2110 node_ptr = find_node_record (this_node_name);
2111 if (node_ptr == NULL) {
2112 error ("drain_nodes: node %s does not exist",
2113 this_node_name);
2114 error_code = ESLURM_INVALID_NODE_NAME;
2115 free (this_node_name);
2116 break;
2117 }
2118 node_inx = node_ptr - node_record_table_ptr;
2119
2120 if (IS_NODE_DRAIN(node_ptr)) {
2121 /* state already changed, nothing to do */
2122 free (this_node_name);
2123 continue;
2124 }
2125
2126 node_ptr->node_state |= NODE_STATE_DRAIN;
2127 bit_clear (avail_node_bitmap, node_inx);
2128 info ("drain_nodes: node %s state set to DRAIN",
2129 this_node_name);
2130 if ((node_ptr->reason == NULL) ||
2131 (xstrncmp(node_ptr->reason, "Not responding", 14) == 0)) {
2132 xfree(node_ptr->reason);
2133 node_ptr->reason = xstrdup(reason);
2134 node_ptr->reason_time = now;
2135 node_ptr->reason_uid = reason_uid;
2136 }
2137 if ((node_ptr->run_job_cnt == 0) &&
2138 (node_ptr->comp_job_cnt == 0)) {
2139 /* no jobs, node is drained */
2140 trigger_node_drained(node_ptr);
2141 clusteracct_storage_g_node_down(acct_db_conn,
2142 node_ptr, now, NULL,
2143 reason_uid);
2144 }
2145
2146 free (this_node_name);
2147 }
2148 last_node_update = time (NULL);
2149
2150 hostlist_destroy (host_list);
2151 return error_code;
2152 }
2153 /* Return true if admin request to change node state from old to new is valid */
2154 static bool _valid_node_state_change(uint32_t old, uint32_t new)
2155 {
2156 uint32_t base_state, node_flags;
2157
2158 if (old == new)
2159 return true;
2160
2161 base_state = old & NODE_STATE_BASE;
2162 node_flags = old & NODE_STATE_FLAGS;
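/* Whether the requested new state is allowed depends on both the
 * current base state and the flag bits set on the node. */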
2163 switch (new) {
2164 case NODE_STATE_DOWN:
2165 case NODE_STATE_DRAIN:
2166 case NODE_STATE_FAIL:
2167 case NODE_STATE_NO_RESPOND:
2168 case NODE_STATE_POWER_SAVE:
2169 case NODE_STATE_POWER_UP:
2170 case (NODE_STATE_POWER_SAVE | NODE_STATE_POWER_UP):
2171 case NODE_STATE_UNDRAIN:
2172 return true;
2173
2174 case NODE_RESUME:
2175 if ((base_state == NODE_STATE_DOWN) ||
2176 (base_state == NODE_STATE_FUTURE) ||
2177 (node_flags & NODE_STATE_DRAIN) ||
2178 (node_flags & NODE_STATE_FAIL) ||
2179 (node_flags & NODE_STATE_REBOOT) ||
2180 (node_flags & NODE_STATE_POWERING_DOWN))
2181 return true;
2182 break;
2183
2184 case NODE_STATE_CANCEL_REBOOT:
2185 if (node_flags & NODE_STATE_REBOOT)
2186 return true;
2187 break;
2188
2189 case NODE_STATE_FUTURE:
2190 if ((base_state == NODE_STATE_DOWN) ||
2191 (base_state == NODE_STATE_IDLE))
2192 return true;
2193 break;
2194
2195 case NODE_STATE_IDLE:
2196 if ((base_state == NODE_STATE_DOWN) ||
2197 (base_state == NODE_STATE_IDLE))
2198 return true;
2199 break;
2200
2201 case NODE_STATE_ALLOCATED:
2202 if (base_state == NODE_STATE_ALLOCATED)
2203 return true;
2204 break;
2205
2206 default: /* All others invalid */
2207 break;
2208 }
2209
2210 return false;
2211 }
2212
2213 static int _build_node_spec_bitmap(node_record_t *node_ptr)
2214 {
2215 uint32_t c, coff, size;
2216 int *cpu_spec_array;
2217 uint i, node_inx;
2218
2219 if (node_ptr->threads == 0) {
2220 error("Node %s has invalid thread per core count (%u)",
2221 node_ptr->name, node_ptr->threads);
2222 return SLURM_ERROR;
2223 }
2224
2225 if (!node_ptr->cpu_spec_list)
2226 return SLURM_SUCCESS;
2227 node_inx = node_ptr - node_record_table_ptr;
2228 c = cr_get_coremap_offset(node_inx);
2229 coff = cr_get_coremap_offset(node_inx+1);
2230 size = coff - c;
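/* size is this node's core count, derived from consecutive core-map
 * offsets; the spec bitmap holds one bit per core, all set initially. */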
2231 FREE_NULL_BITMAP(node_ptr->node_spec_bitmap);
2232 node_ptr->node_spec_bitmap = bit_alloc(size);
2233 bit_nset(node_ptr->node_spec_bitmap, 0, size-1);
2234
2235 /* remove node's specialized cpus now */
2236 cpu_spec_array = bitfmt2int(node_ptr->cpu_spec_list);
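/* bitfmt2int() yields (first, last) CPU id pairs terminated by -1;
 * dividing by the thread count maps CPU ids onto core indexes. */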
2237 i = 0;
2238 while (cpu_spec_array[i] != -1) {
2239 bit_nclear(node_ptr->node_spec_bitmap,
2240 (cpu_spec_array[i] / node_ptr->threads),
2241 (cpu_spec_array[i + 1] / node_ptr->threads));
2242 i += 2;
2243 }
2244 xfree(cpu_spec_array);
2245 return SLURM_SUCCESS;
2246 }
2247
2248 extern int update_node_record_acct_gather_data(
2249 acct_gather_node_resp_msg_t *msg)
2250 {
2251 node_record_t *node_ptr;
2252
2253 node_ptr = find_node_record(msg->node_name);
2254 if (node_ptr == NULL)
2255 return ENOENT;
2256
2257 memcpy(node_ptr->energy, msg->energy, sizeof(acct_gather_energy_t));
2258
2259 return SLURM_SUCCESS;
2260 }
2261
2262 /* A node's socket/core configuration has changed, possibly due to a KNL NUMA
2263 * mode change and reboot. Update this node's config record, splitting an
2264 * existing record if needed. */
2265 static void _split_node_config(node_record_t *node_ptr,
2266 slurm_node_registration_status_msg_t *reg_msg)
2267 {
2268 config_record_t *config_ptr, *new_config_ptr;
2269 int node_inx;
2270
2271 if (!node_ptr)
2272 return;
2273 config_ptr = node_ptr->config_ptr;
2274 if (!config_ptr)
2275 return;
2276
2277 node_inx = node_ptr - node_record_table_ptr;
2278 if ((bit_set_count(config_ptr->node_bitmap) > 1) &&
2279 bit_test(config_ptr->node_bitmap, node_inx)) {
2280 new_config_ptr = create_config_record();
2281 memcpy(new_config_ptr, config_ptr, sizeof(config_record_t));
2282 new_config_ptr->cpu_spec_list =
2283 xstrdup(config_ptr->cpu_spec_list);
2284 new_config_ptr->feature = xstrdup(config_ptr->feature);
2285 new_config_ptr->gres = xstrdup(config_ptr->gres);
2286 bit_clear(config_ptr->node_bitmap, node_inx);
2287 xfree(config_ptr->nodes);
2288 config_ptr->nodes = bitmap2node_name(config_ptr->node_bitmap);
2289 new_config_ptr->node_bitmap = bit_alloc(node_record_count);
2290 bit_set(new_config_ptr->node_bitmap, node_inx);
2291 new_config_ptr->nodes = xstrdup(node_ptr->name);
2292 node_ptr->config_ptr = new_config_ptr;
2293 config_ptr = new_config_ptr;
2294 }
2295 config_ptr->cores = reg_msg->cores;
2296 config_ptr->sockets = reg_msg->sockets;
2297 }
2298
2299 /*
2300 * validate_node_specs - validate the node's specifications; if invalid,
2301 * set its state to DOWN; in any case update last_response
2302 * IN reg_msg - node registration message
2303 * IN protocol_version - Version of Slurm on this node
2304 * OUT newly_up - set if node newly brought into service
2305 * RET 0 if no error, ENOENT if no such node, EINVAL if values too low
2306 */
2307 extern int validate_node_specs(slurm_node_registration_status_msg_t *reg_msg,
2308 uint16_t protocol_version, bool *newly_up)
2309 {
2310 int error_code, i, node_inx;
2311 config_record_t *config_ptr;
2312 node_record_t *node_ptr;
2313 char *reason_down = NULL;
2314 char *orig_features = NULL, *orig_features_act = NULL;
2315 uint32_t node_flags;
2316 time_t now = time(NULL);
2317 bool orig_node_avail;
2318 static uint32_t cr_flag = NO_VAL;
2319 static int node_features_cnt = 0;
2320 int *cpu_spec_array;
2321 int sockets1, sockets2; /* total sockets on node */
2322 int cores1, cores2; /* total cores on node */
2323 int threads1, threads2; /* total threads on node */
2324
2325 xassert(verify_lock(CONF_LOCK, READ_LOCK));
2326
2327 node_ptr = find_node_record(reg_msg->node_name);
2328 if (node_ptr == NULL)
2329 return ENOENT;
2330 node_inx = node_ptr - node_record_table_ptr;
2331 orig_node_avail = bit_test(avail_node_bitmap, node_inx);
2332
2333 config_ptr = node_ptr->config_ptr;
2334 error_code = SLURM_SUCCESS;
2335
2336 node_ptr->protocol_version = protocol_version;
2337 xfree(node_ptr->version);
2338 node_ptr->version = reg_msg->version;
2339 reg_msg->version = NULL;
2340
2341 if (waiting_for_node_boot(node_ptr))
2342 return SLURM_SUCCESS;
2343 bit_clear(booting_node_bitmap, node_inx);
2344
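/* Cache the select plugin type on first use; cons_tres is treated the
 * same as cons_res for the socket/core checks below. */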
2345 if (cr_flag == NO_VAL) {
2346 cr_flag = 0; /* call is no-op for select/linear and others */
2347 if (select_g_get_info_from_plugin(SELECT_CR_PLUGIN,
2348 NULL, &cr_flag)) {
2349 cr_flag = NO_VAL; /* error */
2350 }
2351 if (cr_flag == SELECT_TYPE_CONS_TRES)
2352 cr_flag = SELECT_TYPE_CONS_RES;
2353 node_features_cnt = node_features_g_count();
2354 }
2355
2356 if (reg_msg->features_avail || reg_msg->features_active) {
2357 char *sep = "";
2358 orig_features = xstrdup(node_ptr->features);
2359 if (orig_features && orig_features[0])
2360 sep = ",";
2361 if (reg_msg->features_avail) {
2362 xstrfmtcat(orig_features, "%s%s", sep,
2363 reg_msg->features_avail);
2364 }
2365 if (node_ptr->features_act)
2366 orig_features_act = xstrdup(node_ptr->features_act);
2367 else
2368 orig_features_act = xstrdup(node_ptr->features);
2369 }
2370 if (reg_msg->features_avail) {
2371 if (reg_msg->features_active && !node_ptr->features_act) {
2372 node_ptr->features_act = node_ptr->features;
2373 node_ptr->features = NULL;
2374 } else {
2375 xfree(node_ptr->features);
2376 }
2377 node_ptr->features = node_features_g_node_xlate(
2378 reg_msg->features_avail,
2379 orig_features, orig_features,
2380 node_inx);
2381 (void) _update_node_avail_features(node_ptr->name,
2382 node_ptr->features,
2383 FEATURE_MODE_IND);
2384 }
2385 if (reg_msg->features_active) {
2386 char *tmp_feature;
2387 tmp_feature = node_features_g_node_xlate(
2388 reg_msg->features_active,
2389 orig_features_act,
2390 orig_features,
2391 node_inx);
2392 xfree(node_ptr->features_act);
2393 node_ptr->features_act = tmp_feature;
2394 (void) _update_node_active_features(node_ptr->name,
2395 node_ptr->features_act,
2396 FEATURE_MODE_IND);
2397 }
2398 xfree(orig_features);
2399 xfree(orig_features_act);
2400
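/* Totals reported by slurmd: sockets, sockets*cores and
 * sockets*cores*threads for this node. */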
2401 sockets1 = reg_msg->sockets;
2402 cores1 = sockets1 * reg_msg->cores;
2403 threads1 = cores1 * reg_msg->threads;
2404 if (gres_plugin_node_config_unpack(reg_msg->gres_info,
2405 node_ptr->name) != SLURM_SUCCESS) {
2406 error_code = SLURM_ERROR;
2407 xstrcat(reason_down, "Could not unpack gres data");
2408 } else if (gres_plugin_node_config_validate(
2409 node_ptr->name, config_ptr->gres,
2410 &node_ptr->gres, &node_ptr->gres_list,
2411 reg_msg->threads, reg_msg->cores,
2412 reg_msg->sockets,
2413 slurmctld_conf.conf_flags & CTL_CONF_OR,
2414 &reason_down)
2415 != SLURM_SUCCESS) {
2416 error_code = EINVAL;
2417 /* reason_down set in function above */
2418 }
2419 gres_plugin_node_state_log(node_ptr->gres_list, node_ptr->name);
2420
2421 if (!(slurmctld_conf.conf_flags & CTL_CONF_OR)) {
2422 /* sockets1, cores1, and threads1 are set above */
2423 sockets2 = config_ptr->sockets;
2424 cores2 = sockets2 * config_ptr->cores;
2425 threads2 = cores2 * config_ptr->threads;
2426
2427 if (threads1 < threads2) {
2428 error("Node %s has low socket*core*thread count "
2429 "(%d < %d)",
2430 reg_msg->node_name, threads1, threads2);
2431 error_code = EINVAL;
2432 if (reason_down)
2433 xstrcat(reason_down, ", ");
2434 xstrcat(reason_down, "Low socket*core*thread count");
2435 }
2436
2437 if (reg_msg->cpus < config_ptr->cpus) {
2438 error("Node %s has low cpu count (%u < %u)",
2439 reg_msg->node_name, reg_msg->cpus,
2440 config_ptr->cpus);
2441 error_code = EINVAL;
2442 if (reason_down)
2443 xstrcat(reason_down, ", ");
2444 xstrcat(reason_down, "Low CPUs");
2445 }
2446
2447 if ((error_code == SLURM_SUCCESS) &&
2448 (cr_flag == SELECT_TYPE_CONS_RES) &&
2449 (node_features_cnt > 0) &&
2450 (reg_msg->sockets != config_ptr->sockets) &&
2451 (reg_msg->cores != config_ptr->cores) &&
2452 ((reg_msg->sockets * reg_msg->cores) ==
2453 (config_ptr->sockets * config_ptr->cores))) {
2454 _split_node_config(node_ptr, reg_msg);
2455 }
2456 }
2457 if (reg_msg->boards > reg_msg->sockets) {
2458 error("Node %s has more boards than sockets (%u > %u), setting board count to 1",
2459 reg_msg->node_name, reg_msg->boards, reg_msg->sockets);
2460 reg_msg->boards = 1;
2461 }
2462
2463 /* reset partition and node config (in that order) */
2464
2465 if (error_code == SLURM_SUCCESS) {
2466 node_ptr->boards = reg_msg->boards;
2467 node_ptr->sockets = reg_msg->sockets;
2468 node_ptr->cores = reg_msg->cores;
2469 node_ptr->threads = reg_msg->threads;
2470 node_ptr->cpus = reg_msg->cpus;
2471 }
2472 if (!(slurmctld_conf.conf_flags & CTL_CONF_OR)) {
2473 if (reg_msg->real_memory < config_ptr->real_memory) {
2474 error("Node %s has low real_memory size (%"PRIu64" < %"PRIu64")",
2475 reg_msg->node_name, reg_msg->real_memory,
2476 config_ptr->real_memory);
2477 error_code = EINVAL;
2478 if (reason_down)
2479 xstrcat(reason_down, ", ");
2480 xstrcat(reason_down, "Low RealMemory");
2481 }
2482
2483 if (reg_msg->tmp_disk < config_ptr->tmp_disk) {
2484 error("Node %s has low tmp_disk size (%u < %u)",
2485 reg_msg->node_name, reg_msg->tmp_disk,
2486 config_ptr->tmp_disk);
2487 error_code = EINVAL;
2488 if (reason_down)
2489 xstrcat(reason_down, ", ");
2490 xstrcat(reason_down, "Low TmpDisk");
2491 }
2492 }
2493
2494 node_ptr->real_memory = reg_msg->real_memory;
2495 node_ptr->tmp_disk = reg_msg->tmp_disk;
2496
2497 if (reg_msg->cpu_spec_list != NULL) {
2498 xfree(node_ptr->cpu_spec_list);
2499 node_ptr->cpu_spec_list = reg_msg->cpu_spec_list;
2500 reg_msg->cpu_spec_list = NULL; /* Nothing left to free */
2501
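/* Convert the (first, last) CPU id pairs from cpu_spec_list into a
 * specialized core count; divide by threads per core since the list
 * is expressed in CPU ids. */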
2502 cpu_spec_array = bitfmt2int(node_ptr->cpu_spec_list);
2503 i = 0;
2504 node_ptr->core_spec_cnt = 0;
2505 while (cpu_spec_array[i] != -1) {
2506 node_ptr->core_spec_cnt += (cpu_spec_array[i + 1] -
2507 cpu_spec_array[i]) + 1;
2508 i += 2;
2509 }
2510 if (node_ptr->threads)
2511 node_ptr->core_spec_cnt /= node_ptr->threads;
2512 xfree(cpu_spec_array);
2513 if (_build_node_spec_bitmap(node_ptr) != SLURM_SUCCESS)
2514 error_code = EINVAL;
2515 }
2516
2517 xfree(node_ptr->arch);
2518 node_ptr->arch = reg_msg->arch;
2519 reg_msg->arch = NULL; /* Nothing left to free */
2520
2521 xfree(node_ptr->os);
2522 node_ptr->os = reg_msg->os;
2523 reg_msg->os = NULL; /* Nothing left to free */
2524
2525 if (node_ptr->cpu_load != reg_msg->cpu_load) {
2526 node_ptr->cpu_load = reg_msg->cpu_load;
2527 node_ptr->cpu_load_time = now;
2528 last_node_update = now;
2529 }
2530 if (node_ptr->free_mem != reg_msg->free_mem) {
2531 node_ptr->free_mem = reg_msg->free_mem;
2532 node_ptr->free_mem_time = now;
2533 last_node_update = now;
2534 }
2535
2536 if (IS_NODE_NO_RESPOND(node_ptr) ||
2537 IS_NODE_POWER_UP(node_ptr) ||
2538 IS_NODE_POWER_SAVE(node_ptr)) {
2539 info("Node %s now responding", node_ptr->name);
2540
2541 /*
2542 * Set last_idle in case that the node came up out of band or
2543 * came up after ResumeTimeout so that it can be suspended at a
2544 * later point.
2545 */
2546 if (IS_NODE_POWER_UP(node_ptr) || IS_NODE_POWER_SAVE(node_ptr))
2547 node_ptr->last_idle = now;
2548
2549 node_ptr->node_state &= (~NODE_STATE_NO_RESPOND);
2550 node_ptr->node_state &= (~NODE_STATE_POWER_UP);
2551 node_ptr->node_state &= (~NODE_STATE_POWER_SAVE);
2552 node_ptr->node_state &= (~NODE_STATE_POWERING_DOWN);
2553 if (!is_node_in_maint_reservation(node_inx))
2554 node_ptr->node_state &= (~NODE_STATE_MAINT);
2555
2556 bit_clear(power_node_bitmap, node_inx);
2557
2558 last_node_update = now;
2559 }
2560
2561 node_flags = node_ptr->node_state & NODE_STATE_FLAGS;
2562
2563 if (node_ptr->last_response &&
2564 (node_ptr->boot_time > node_ptr->last_response) &&
2565 !IS_NODE_UNKNOWN(node_ptr)) { /* Node just rebooted */
2566 (void) node_features_g_get_node(node_ptr->name);
2567 }
2568
2569 if (error_code) {
2570 if (!IS_NODE_DOWN(node_ptr)
2571 && !IS_NODE_DRAIN(node_ptr)
2572 && ! IS_NODE_FAIL(node_ptr)) {
2573 error ("Setting node %s state to DRAIN",
2574 reg_msg->node_name);
2575 drain_nodes(reg_msg->node_name,
2576 reason_down,
2577 slurmctld_conf.slurm_user_id);
2578 }
2579 last_node_update = time (NULL);
2580 } else if (reg_msg->status == ESLURMD_PROLOG_FAILED
2581 || reg_msg->status == ESLURMD_SETUP_ENVIRONMENT_ERROR) {
2582 if (!IS_NODE_DRAIN(node_ptr) && !IS_NODE_FAIL(node_ptr)) {
2583 char *reason;
2584 error("%s: Prolog or job env setup failure on node %s, "
2585 "draining the node",
2586 __func__, reg_msg->node_name);
2587 if (reg_msg->status == ESLURMD_PROLOG_FAILED)
2588 reason = "Prolog error";
2589 else
2590 reason = "Job env setup error";
2591 drain_nodes(reg_msg->node_name, reason,
2592 slurmctld_conf.slurm_user_id);
2593 last_node_update = time (NULL);
2594 }
2595 } else {
2596 if (IS_NODE_UNKNOWN(node_ptr) || IS_NODE_FUTURE(node_ptr)) {
2597 bool unknown = false;
2598 
2599 if (IS_NODE_UNKNOWN(node_ptr))
2600 unknown = true;
2601
2602 debug("validate_node_specs: node %s registered with "
2603 "%u jobs",
2604 reg_msg->node_name,reg_msg->job_count);
2605 if (IS_NODE_FUTURE(node_ptr)) {
2606 if (IS_NODE_MAINT(node_ptr) &&
2607 !is_node_in_maint_reservation(node_inx))
2608 node_flags &= (~NODE_STATE_MAINT);
2609 node_flags &= (~NODE_STATE_REBOOT);
2610 }
2611 if (reg_msg->job_count) {
2612 node_ptr->node_state = NODE_STATE_ALLOCATED |
2613 node_flags;
2614 } else {
2615 node_ptr->node_state = NODE_STATE_IDLE |
2616 node_flags;
2617 node_ptr->last_idle = now;
2618 }
2619 last_node_update = now;
2620
2621 /* don't send this on a slurmctld unless needed */
2622 if (unknown && slurmctld_init_db
2623 && !IS_NODE_DRAIN(node_ptr)
2624 && !IS_NODE_FAIL(node_ptr)) {
2625 /* reason information is handled in
2626 clusteracct_storage_g_node_up()
2627 */
2628 clusteracct_storage_g_node_up(
2629 acct_db_conn, node_ptr, now);
2630 }
2631 } else if (IS_NODE_DOWN(node_ptr) &&
2632 ((slurmctld_conf.ret2service == 2) ||
2633 IS_NODE_REBOOT(node_ptr) ||
2634 ((slurmctld_conf.ret2service == 1) &&
2635 !xstrcmp(node_ptr->reason, "Not responding") &&
2636 (node_ptr->boot_time <
2637 node_ptr->last_response)))) {
2638 node_flags &= (~NODE_STATE_REBOOT);
2639 if (xstrstr(node_ptr->reason, "Reboot ASAP") &&
2640 (node_ptr->next_state == NO_VAL)) {
2641 if (node_ptr->next_state != NODE_STATE_DOWN) {
2642 xfree(node_ptr->reason);
2643 node_ptr->reason_time = 0;
2644 node_ptr->reason_uid = 0;
2645 }
2646 node_flags &= (~NODE_STATE_DRAIN);
2647 }
2648 if (node_ptr->next_state != NO_VAL)
2649 node_flags &= (~NODE_STATE_DRAIN);
2650
2651 if (node_ptr->next_state == NODE_STATE_DOWN) {
2652 node_ptr->node_state = node_ptr->next_state |
2653 node_flags;
2654 if (node_ptr->reason) {
2655 xstrcat(node_ptr->reason,
2656 " : reboot complete");
2657 }
2658 } else if (reg_msg->job_count) {
2659 node_ptr->node_state = NODE_STATE_ALLOCATED |
2660 node_flags;
2661 } else {
2662 node_ptr->node_state = NODE_STATE_IDLE |
2663 node_flags;
2664 node_ptr->last_idle = now;
2665 }
2666 node_ptr->next_state = NO_VAL;
2667 bit_clear(rs_node_bitmap, node_inx);
2668
2669 info("node %s returned to service",
2670 reg_msg->node_name);
2671 trigger_node_up(node_ptr);
2672 last_node_update = now;
2673 if (!IS_NODE_DRAIN(node_ptr)
2674 && !IS_NODE_DOWN(node_ptr)
2675 && !IS_NODE_FAIL(node_ptr)) {
2676 /* reason information is handled in
2677 * clusteracct_storage_g_node_up() */
2678 clusteracct_storage_g_node_up(
2679 acct_db_conn, node_ptr, now);
2680 }
2681 } else if (node_ptr->last_response &&
2682 (node_ptr->boot_time > node_ptr->last_response) &&
2683 (slurmctld_conf.ret2service != 2)) {
2684 if (!node_ptr->reason ||
2685 (node_ptr->reason &&
2686 !xstrcmp(node_ptr->reason, "Not responding"))) {
2687 if (node_ptr->reason)
2688 xfree(node_ptr->reason);
2689 node_ptr->reason_time = now;
2690 node_ptr->reason_uid =
2691 slurmctld_conf.slurm_user_id;
2692 node_ptr->reason = xstrdup(
2693 "Node unexpectedly rebooted");
2694 }
2695 info("%s: Node %s unexpectedly rebooted boot_time=%u last response=%u",
2696 __func__, reg_msg->node_name,
2697 (uint32_t)node_ptr->boot_time,
2698 (uint32_t)node_ptr->last_response);
2699 _make_node_down(node_ptr, now);
2700 kill_running_job_by_node_name(reg_msg->node_name);
2701 last_node_update = now;
2702 reg_msg->job_count = 0;
2703 } else if (IS_NODE_ALLOCATED(node_ptr) &&
2704 (reg_msg->job_count == 0)) { /* job vanished */
2705 node_ptr->node_state = NODE_STATE_IDLE | node_flags;
2706 node_ptr->last_idle = now;
2707 last_node_update = now;
2708 } else if (IS_NODE_COMPLETING(node_ptr) &&
2709 (reg_msg->job_count == 0)) { /* job already done */
2710 node_ptr->node_state &= (~NODE_STATE_COMPLETING);
2711 last_node_update = now;
2712 bit_clear(cg_node_bitmap, node_inx);
2713 } else if (IS_NODE_IDLE(node_ptr) &&
2714 (reg_msg->job_count != 0)) {
2715 if (node_ptr->run_job_cnt != 0) {
2716 node_ptr->node_state = NODE_STATE_ALLOCATED |
2717 node_flags;
2718 error("Invalid state for node %s, was IDLE "
2719 "with %u running jobs",
2720 node_ptr->name, reg_msg->job_count);
2721 }
2722 /*
2723 * there must be completing job(s) on this node since
2724 * reg_msg->job_count was set (run_job_cnt +
2725 * comp_job_cnt) in validate_jobs_on_node()
2726 */
2727 if (node_ptr->comp_job_cnt != 0) {
2728 node_ptr->node_state |= NODE_STATE_COMPLETING;
2729 bit_set(cg_node_bitmap, node_inx);
2730 }
2731 last_node_update = now;
2732 }
2733 if (IS_NODE_IDLE(node_ptr)) {
2734 node_ptr->owner = NO_VAL;
2735 xfree(node_ptr->mcs_label);
2736 }
2737
2738 select_g_update_node_config(node_inx);
2739 _sync_bitmaps(node_ptr, reg_msg->job_count);
2740 }
2741
2742 xfree(reason_down);
2743 if (reg_msg->energy)
2744 memcpy(node_ptr->energy, reg_msg->energy,
2745 sizeof(acct_gather_energy_t));
2746
2747 node_ptr->last_response = MAX(now, node_ptr->last_response);
2748 node_ptr->boot_req_time = (time_t) 0;
2749
2750 *newly_up = (!orig_node_avail && bit_test(avail_node_bitmap, node_inx));
2751
2752 return error_code;
2753 }
2754
2755 static front_end_record_t * _front_end_reg(
2756 slurm_node_registration_status_msg_t *reg_msg)
2757 {
2758 front_end_record_t *front_end_ptr;
2759 uint32_t state_base, state_flags;
2760 time_t now = time(NULL);
2761
2762 debug2("name:%s boot_time:%u up_time:%u",
2763 reg_msg->node_name, (unsigned int) reg_msg->slurmd_start_time,
2764 reg_msg->up_time);
2765
2766 front_end_ptr = find_front_end_record(reg_msg->node_name);
2767 if (front_end_ptr == NULL) {
2768 error("Registration message from unknown node %s",
2769 reg_msg->node_name);
2770 return NULL;
2771 }
2772
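/* Derive the boot time from the reported up time; a boot time newer
 * than our last response means the front end rebooted behind our back,
 * so any jobs it was running are gone. */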
2773 front_end_ptr->boot_time = now - reg_msg->up_time;
2774 if (front_end_ptr->last_response &&
2775 (front_end_ptr->boot_time > front_end_ptr->last_response)) {
2776 info("front end %s unexpectedly rebooted, "
2777 "killing all previously running jobs running on it.",
2778 reg_msg->node_name);
2779 (void) kill_job_by_front_end_name(front_end_ptr->name);
2780 reg_msg->job_count = 0;
2781 }
2782
2783 front_end_ptr->last_response = MAX(now, front_end_ptr->last_response);
2784 front_end_ptr->slurmd_start_time = reg_msg->slurmd_start_time;
2785 state_base = front_end_ptr->node_state & JOB_STATE_BASE;
2786 state_flags = front_end_ptr->node_state & JOB_STATE_FLAGS;
2787 if ((state_base == NODE_STATE_DOWN) && (front_end_ptr->reason) &&
2788 (!xstrncmp(front_end_ptr->reason, "Not responding", 14))) {
2789 error("front end node %s returned to service",
2790 reg_msg->node_name);
2791 state_base = NODE_STATE_IDLE;
2792 xfree(front_end_ptr->reason);
2793 front_end_ptr->reason_time = (time_t) 0;
2794 front_end_ptr->reason_uid = 0;
2795 }
2796 if (state_base == NODE_STATE_UNKNOWN)
2797 state_base = NODE_STATE_IDLE;
2798
2799 state_flags &= (~NODE_STATE_NO_RESPOND);
2800
2801 front_end_ptr->node_state = state_base | state_flags;
2802 last_front_end_update = now;
2803 return front_end_ptr;
2804 }
2805
2806 static char *_build_step_id(char *buf, int buf_len, uint32_t step_id)
2807 {
2808 if (step_id == SLURM_BATCH_SCRIPT)
2809 snprintf(buf, buf_len, "StepId=Batch");
2810 else
2811 snprintf(buf, buf_len, "StepId=%u", step_id);
2812 return buf;
2813 }
2814
2815 /*
2816 * validate_nodes_via_front_end - validate all nodes on a cluster as having
2817 * a valid configuration as soon as the front-end registers. Individual
2818 * nodes will not register with this configuration
2819 * IN reg_msg - node registration message
2820 * IN protocol_version - Version of Slurm on this node
2821 * OUT newly_up - set if node newly brought into service
2822 * RET 0 if no error, Slurm error code otherwise
2823 */
2824 extern int validate_nodes_via_front_end(
2825 slurm_node_registration_status_msg_t *reg_msg,
2826 uint16_t protocol_version, bool *newly_up)
2827 {
2828 int error_code = 0, i, j, rc;
2829 bool update_node_state = false;
2830 job_record_t *job_ptr;
2831 config_record_t *config_ptr;
2832 node_record_t *node_ptr;
2833 time_t now = time(NULL);
2834 ListIterator job_iterator;
2835 hostlist_t reg_hostlist = NULL;
2836 char *host_str = NULL, *reason_down = NULL;
2837 uint32_t node_flags;
2838 front_end_record_t *front_end_ptr;
2839 char step_str[64];
2840
2841 xassert(verify_lock(CONF_LOCK, READ_LOCK));
2842 xassert(verify_lock(JOB_LOCK, WRITE_LOCK));
2843 xassert(verify_lock(FED_LOCK, READ_LOCK));
2844
2845 if (reg_msg->up_time > now) {
2846 error("Node up_time on %s is invalid: %u>%u",
2847 reg_msg->node_name, reg_msg->up_time, (uint32_t) now);
2848 reg_msg->up_time = 0;
2849 }
2850
2851 front_end_ptr = _front_end_reg(reg_msg);
2852 if (front_end_ptr == NULL)
2853 return ESLURM_INVALID_NODE_NAME;
2854
2855 front_end_ptr->protocol_version = protocol_version;
2856 xfree(front_end_ptr->version);
2857 front_end_ptr->version = reg_msg->version;
2858 reg_msg->version = NULL;
2859 *newly_up = false;
2860
2861 if (reg_msg->status == ESLURMD_PROLOG_FAILED) {
2862 error("Prolog failed on node %s", reg_msg->node_name);
2863 /* Do NOT set the node DOWN here. Unlike non-front-end systems,
2864 * this failure is likely due to some problem in the underlying
2865 * infrastructure (e.g. the block failed to boot). */
2866 /* set_front_end_down(front_end_ptr, "Prolog failed"); */
2867 }
2868
2869 /* First validate the job info */
2870 for (i = 0; i < reg_msg->job_count; i++) {
2871 if ( (reg_msg->job_id[i] >= MIN_NOALLOC_JOBID) &&
2872 (reg_msg->job_id[i] <= MAX_NOALLOC_JOBID) ) {
2873 info("NoAllocate JobId=%u %s reported",
2874 reg_msg->job_id[i],
2875 _build_step_id(step_str, sizeof(step_str),
2876 reg_msg->step_id[i]));
2877 continue;
2878 }
2879
2880 job_ptr = find_job_record(reg_msg->job_id[i]);
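/* Point node_ptr at the job's first allocated node; fall back to the
 * first entry of the node table if no allocation bitmap is set. */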
2881 node_ptr = node_record_table_ptr;
2882 if (job_ptr && job_ptr->node_bitmap &&
2883 ((j = bit_ffs(job_ptr->node_bitmap)) >= 0))
2884 node_ptr += j;
2885
2886 if (job_ptr == NULL) {
2887 error("Orphan JobId=%u %s reported on node %s",
2888 reg_msg->job_id[i],
2889 _build_step_id(step_str, sizeof(step_str),
2890 reg_msg->step_id[i]),
2891 front_end_ptr->name);
2892 abort_job_on_node(reg_msg->job_id[i],
2893 job_ptr, front_end_ptr->name);
2894 continue;
2895 } else if (job_ptr->batch_host == NULL) {
2896 error("Resetting NULL batch_host of JobId=%u to %s",
2897 reg_msg->job_id[i], front_end_ptr->name);
2898 job_ptr->batch_host = xstrdup(front_end_ptr->name);
2899 }
2900
2901
2902 if (IS_JOB_RUNNING(job_ptr) || IS_JOB_SUSPENDED(job_ptr)) {
2903 debug3("Registered %pJ %s on %s",
2904 job_ptr,
2905 _build_step_id(step_str, sizeof(step_str),
2906 reg_msg->step_id[i]),
2907 front_end_ptr->name);
2908 if (job_ptr->batch_flag) {
2909 /* NOTE: Used for purging defunct batch jobs */
2910 job_ptr->time_last_active = now;
2911 }
2912 }
2913
2914 else if (IS_JOB_COMPLETING(job_ptr)) {
2915 /*
2916 * Re-send kill request as needed,
2917 * not necessarily an error
2918 */
2919 kill_job_on_node(job_ptr, node_ptr);
2920 }
2921
2922 else if (IS_JOB_PENDING(job_ptr)) {
2923 /* Typically indicates a job requeue and the hung
2924 * slurmd that went DOWN is now responding */
2925 error("Registered PENDING %pJ %s on %s",
2926 job_ptr,
2927 _build_step_id(step_str, sizeof(step_str),
2928 reg_msg->step_id[i]),
2929 front_end_ptr->name);
2930 abort_job_on_node(reg_msg->job_id[i], job_ptr,
2931 front_end_ptr->name);
2932 }
2933
2934 else if (difftime(now, job_ptr->end_time) <
2935 slurm_get_msg_timeout()) { /* Race condition */
2936 debug("Registered newly completed %pJ %s on %s",
2937 job_ptr,
2938 _build_step_id(step_str, sizeof(step_str),
2939 reg_msg->step_id[i]),
2940 front_end_ptr->name);
2941 }
2942
2943 else { /* else job is supposed to be done */
2944 error("Registered %pJ %s in state %s on %s",
2945 job_ptr,
2946 _build_step_id(step_str, sizeof(step_str),
2947 reg_msg->step_id[i]),
2948 job_state_string(job_ptr->job_state),
2949 front_end_ptr->name);
2950 kill_job_on_node(job_ptr, node_ptr);
2951 }
2952 }
2953
2954
2955 /* purge orphan batch jobs */
2956 job_iterator = list_iterator_create(job_list);
2957 while ((job_ptr = list_next(job_iterator))) {
2958 if (!IS_JOB_RUNNING(job_ptr) ||
2959 IS_JOB_CONFIGURING(job_ptr) ||
2960 (job_ptr->batch_flag == 0))
2961 continue;
2962 if (job_ptr->front_end_ptr != front_end_ptr)
2963 continue;
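/* time_last_active was refreshed above for every job this front end
 * just reported; a stale value means the job was not reported and is
 * treated as an orphan. */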
2964 if (difftime(now, job_ptr->time_last_active) <= 5)
2965 continue;
2966 info("Killing orphan batch %pJ", job_ptr);
2967 job_complete(job_ptr->job_id, slurmctld_conf.slurm_user_id,
2968 false, false, 0);
2969 }
2970 list_iterator_destroy(job_iterator);
2971
2972 (void) gres_plugin_node_config_unpack(reg_msg->gres_info,
2973 node_record_table_ptr->name);
2974 for (i = 0, node_ptr = node_record_table_ptr; i < node_record_count;
2975 i++, node_ptr++) {
2976 bool acct_updated = false;
2977
2978 config_ptr = node_ptr->config_ptr;
2979 node_ptr->last_response = MAX(now, node_ptr->last_response);
2980
2981 rc = gres_plugin_node_config_validate(
2982 node_ptr->name,
2983 config_ptr->gres,
2984 &node_ptr->gres,
2985 &node_ptr->gres_list,
2986 reg_msg->threads,
2987 reg_msg->cores,
2988 reg_msg->sockets,
2989 slurmctld_conf.conf_flags & CTL_CONF_OR,
2990 &reason_down);
2991 if (rc) {
2992 if (!IS_NODE_DOWN(node_ptr)) {
2993 error("Setting node %s state to DOWN",
2994 node_ptr->name);
2995 }
2996 set_node_down(node_ptr->name, reason_down);
2997 last_node_update = now;
2998 }
2999 xfree(reason_down);
3000 gres_plugin_node_state_log(node_ptr->gres_list, node_ptr->name);
3001
3002 if (reg_msg->up_time) {
3003 node_ptr->up_time = reg_msg->up_time;
3004 node_ptr->boot_time = now - reg_msg->up_time;
3005 }
3006 node_ptr->slurmd_start_time = reg_msg->slurmd_start_time;
3007
3008 if (IS_NODE_NO_RESPOND(node_ptr)) {
3009 update_node_state = true;
3010 /* This is handled by the select/cray plugin */
3011 node_ptr->node_state &= (~NODE_STATE_NO_RESPOND);
3012 node_ptr->node_state &= (~NODE_STATE_POWER_UP);
3013 }
3014
3015 if (reg_msg->status != ESLURMD_PROLOG_FAILED) {
3016 if (reg_hostlist)
3017 (void) hostlist_push_host(reg_hostlist,
3018 node_ptr->name);
3019 else
3020 reg_hostlist = hostlist_create(node_ptr->name);
3021
3022 node_flags = node_ptr->node_state & NODE_STATE_FLAGS;
3023 if (IS_NODE_UNKNOWN(node_ptr)) {
3024 update_node_state = true;
3025 *newly_up = true;
3026 if (node_ptr->run_job_cnt) {
3027 node_ptr->node_state =
3028 NODE_STATE_ALLOCATED |
3029 node_flags;
3030 } else {
3031 node_ptr->node_state =
3032 NODE_STATE_IDLE |
3033 node_flags;
3034 node_ptr->last_idle = now;
3035 }
3036 if (!IS_NODE_DRAIN(node_ptr) &&
3037 !IS_NODE_FAIL(node_ptr)) {
3038 /* reason information is handled in
3039 * clusteracct_storage_g_node_up() */
3040 clusteracct_storage_g_node_up(
3041 acct_db_conn,
3042 node_ptr, now);
3043 acct_updated = true;
3044 }
3045 } else if (IS_NODE_DOWN(node_ptr) &&
3046 ((slurmctld_conf.ret2service == 2) ||
3047 (node_ptr->boot_req_time != 0) ||
3048 ((slurmctld_conf.ret2service == 1) &&
3049 !xstrcmp(node_ptr->reason,
3050 "Not responding")))) {
3051 update_node_state = true;
3052 *newly_up = true;
3053 if (node_ptr->run_job_cnt) {
3054 node_ptr->node_state =
3055 NODE_STATE_ALLOCATED |
3056 node_flags;
3057 } else {
3058 node_ptr->node_state =
3059 NODE_STATE_IDLE |
3060 node_flags;
3061 node_ptr->last_idle = now;
3062 }
3063 trigger_node_up(node_ptr);
3064 if (!IS_NODE_DRAIN(node_ptr) &&
3065 !IS_NODE_FAIL(node_ptr)) {
3066 /* reason information is handled in
3067 * clusteracct_storage_g_node_up() */
3068 clusteracct_storage_g_node_up(
3069 acct_db_conn,
3070 node_ptr, now);
3071 acct_updated = true;
3072 }
3073 } else if (IS_NODE_ALLOCATED(node_ptr) &&
3074 (node_ptr->run_job_cnt == 0)) {
3075 /* job vanished */
3076 update_node_state = true;
3077 node_ptr->node_state = NODE_STATE_IDLE |
3078 node_flags;
3079 node_ptr->last_idle = now;
3080 } else if (IS_NODE_COMPLETING(node_ptr) &&
3081 (node_ptr->comp_job_cnt == 0)) {
3082 /* job already done */
3083 update_node_state = true;
3084 node_ptr->node_state &=
3085 (~NODE_STATE_COMPLETING);
3086 bit_clear(cg_node_bitmap, i);
3087 } else if (IS_NODE_IDLE(node_ptr) &&
3088 (node_ptr->run_job_cnt != 0)) {
3089 update_node_state = true;
3090 node_ptr->node_state = NODE_STATE_ALLOCATED |
3091 node_flags;
3092 error("Invalid state for node %s, was IDLE "
3093 "with %u running jobs",
3094 node_ptr->name, reg_msg->job_count);
3095 }
3096 if (IS_NODE_IDLE(node_ptr)) {
3097 node_ptr->owner = NO_VAL;
3098 xfree(node_ptr->mcs_label);
3099 }
3100
3101 select_g_update_node_config(i);
3102 _sync_bitmaps(node_ptr,
3103 (node_ptr->run_job_cnt +
3104 node_ptr->comp_job_cnt));
3105 }
3106 if (reg_msg->energy)
3107 memcpy(node_ptr->energy, reg_msg->energy,
3108 sizeof(acct_gather_energy_t));
3109
3110 if (!acct_updated && slurmctld_init_db &&
3111 !IS_NODE_DOWN(node_ptr) &&
3112 !IS_NODE_DRAIN(node_ptr) && !IS_NODE_FAIL(node_ptr)) {
3113 /* reason information is handled in
3114 clusteracct_storage_g_node_up()
3115 */
3116 clusteracct_storage_g_node_up(
3117 acct_db_conn, node_ptr, now);
3118 }
3119
3120 }
3121
3122 if (reg_hostlist) {
3123 hostlist_uniq(reg_hostlist);
3124 host_str = hostlist_ranged_string_xmalloc(reg_hostlist);
3125 debug("Nodes %s have registered", host_str);
3126 xfree(host_str);
3127 hostlist_destroy(reg_hostlist);
3128 }
3129
3130 if (update_node_state)
3131 last_node_update = time (NULL);
3132 return error_code;
3133 }
3134
3135 /* Sync idle, share, and avail_node_bitmaps for a given node */
3136 static void _sync_bitmaps(node_record_t *node_ptr, int job_count)
3137 {
3138 int node_inx = node_ptr - node_record_table_ptr;
3139
3140 if (job_count == 0) {
3141 bit_set (idle_node_bitmap, node_inx);
3142 bit_set (share_node_bitmap, node_inx);
3143 }
3144 if (IS_NODE_DOWN(node_ptr) || IS_NODE_DRAIN(node_ptr) ||
3145 IS_NODE_FAIL(node_ptr) || IS_NODE_NO_RESPOND(node_ptr))
3146 bit_clear (avail_node_bitmap, node_inx);
3147 else
3148 make_node_avail(node_inx);
3149 if (IS_NODE_DOWN(node_ptr))
3150 bit_clear (up_node_bitmap, node_inx);
3151 else
3152 bit_set (up_node_bitmap, node_inx);
3153 }
3154
3155 #ifdef HAVE_FRONT_END
3156 static void _node_did_resp(front_end_record_t *fe_ptr)
3157 {
3158 uint32_t node_flags;
3159 time_t now = time(NULL);
3160
3161 fe_ptr->last_response = MAX(now, fe_ptr->last_response);
3162
3163 if (IS_NODE_NO_RESPOND(fe_ptr)) {
3164 info("Node %s now responding", fe_ptr->name);
3165 last_front_end_update = now;
3166 fe_ptr->node_state &= (~NODE_STATE_NO_RESPOND);
3167 }
3168
3169 node_flags = fe_ptr->node_state & NODE_STATE_FLAGS;
3170 if (IS_NODE_UNKNOWN(fe_ptr)) {
3171 last_front_end_update = now;
3172 fe_ptr->node_state = NODE_STATE_IDLE | node_flags;
3173 }
3174 if (IS_NODE_DOWN(fe_ptr) &&
3175 ((slurmctld_conf.ret2service == 2) ||
3176 ((slurmctld_conf.ret2service == 1) &&
3177 !xstrcmp(fe_ptr->reason, "Not responding")))) {
3178 last_front_end_update = now;
3179 fe_ptr->node_state = NODE_STATE_IDLE | node_flags;
3180 info("node_did_resp: node %s returned to service",
3181 fe_ptr->name);
3182 trigger_front_end_up(fe_ptr);
3183 if (!IS_NODE_DRAIN(fe_ptr) && !IS_NODE_FAIL(fe_ptr)) {
3184 xfree(fe_ptr->reason);
3185 fe_ptr->reason_time = 0;
3186 fe_ptr->reason_uid = NO_VAL;
3187 }
3188 }
3189 return;
3190 }
3191 #else
3192 static void _node_did_resp(node_record_t *node_ptr)
3193 {
3194 int node_inx;
3195 uint32_t node_flags;
3196 time_t now = time(NULL);
3197
3198 node_inx = node_ptr - node_record_table_ptr;
3199 if (waiting_for_node_boot(node_ptr))
3200 return;
3201 node_ptr->last_response = MAX(now, node_ptr->last_response);
3202 if (IS_NODE_NO_RESPOND(node_ptr) || IS_NODE_POWER_UP(node_ptr)) {
3203 info("Node %s now responding", node_ptr->name);
3204 node_ptr->node_state &= (~NODE_STATE_NO_RESPOND);
3205 node_ptr->node_state &= (~NODE_STATE_POWER_UP);
3206 if (!is_node_in_maint_reservation(node_inx))
3207 node_ptr->node_state &= (~NODE_STATE_MAINT);
3208 last_node_update = now;
3209 }
3210 node_flags = node_ptr->node_state & NODE_STATE_FLAGS;
3211 if (IS_NODE_UNKNOWN(node_ptr)) {
3212 node_ptr->last_idle = now;
3213 if (node_ptr->run_job_cnt) {
3214 node_ptr->node_state = NODE_STATE_ALLOCATED |
3215 node_flags;
3216 } else
3217 node_ptr->node_state = NODE_STATE_IDLE | node_flags;
3218 last_node_update = now;
3219 if (!IS_NODE_DRAIN(node_ptr) && !IS_NODE_FAIL(node_ptr)) {
3220 clusteracct_storage_g_node_up(acct_db_conn,
3221 node_ptr, now);
3222 }
3223 }
3224 if (IS_NODE_DOWN(node_ptr) &&
3225 ((slurmctld_conf.ret2service == 2) ||
3226 (node_ptr->boot_req_time != 0) ||
3227 ((slurmctld_conf.ret2service == 1) &&
3228 !xstrcmp(node_ptr->reason, "Not responding")))) {
3229 node_ptr->last_idle = now;
3230 node_ptr->node_state = NODE_STATE_IDLE | node_flags;
3231 info("node_did_resp: node %s returned to service",
3232 node_ptr->name);
3233 trigger_node_up(node_ptr);
3234 last_node_update = now;
3235 if (!IS_NODE_DRAIN(node_ptr) && !IS_NODE_FAIL(node_ptr)) {
3236 /* reason information is handled in
3237 clusteracct_storage_g_node_up()
3238 */
3239 clusteracct_storage_g_node_up(acct_db_conn,
3240 node_ptr, now);
3241 }
3242 }
3243 if (IS_NODE_IDLE(node_ptr) && !IS_NODE_COMPLETING(node_ptr)) {
3244 bit_set (idle_node_bitmap, node_inx);
3245 bit_set (share_node_bitmap, node_inx);
3246 }
3247 if (IS_NODE_DOWN(node_ptr) || IS_NODE_DRAIN(node_ptr) ||
3248 IS_NODE_FAIL(node_ptr)) {
3249 bit_clear (avail_node_bitmap, node_inx);
3250 } else
3251 bit_set (avail_node_bitmap, node_inx);
3252 if (IS_NODE_DOWN(node_ptr))
3253 bit_clear (up_node_bitmap, node_inx);
3254 else
3255 bit_set (up_node_bitmap, node_inx);
3256 return;
3257 }
3258 #endif
3259
3260 /*
3261 * node_did_resp - record that the specified node is responding
3262 * IN name - name of the node
3263 */
3264 void node_did_resp (char *name)
3265 {
3266 #ifdef HAVE_FRONT_END
3267 front_end_record_t *node_ptr;
3268 node_ptr = find_front_end_record (name);
3269 #else
3270 node_record_t *node_ptr;
3271 node_ptr = find_node_record (name);
3272 #endif
3273
3274 xassert(verify_lock(CONF_LOCK, READ_LOCK));
3275
3276 if (node_ptr == NULL) {
3277 error ("node_did_resp unable to find node %s", name);
3278 return;
3279 }
3280 _node_did_resp(node_ptr);
3281 debug2("node_did_resp %s",name);
3282 }
3283
3284 /*
3285 * node_not_resp - record that the specified node is not responding
3286 * IN name - name of the node
3287 * IN msg_time - time message was sent
3288 */
3289 void node_not_resp (char *name, time_t msg_time, slurm_msg_type_t resp_type)
3290 {
3291 #ifdef HAVE_FRONT_END
3292 front_end_record_t *node_ptr;
3293
3294 node_ptr = find_front_end_record (name);
3295 #else
3296 node_record_t *node_ptr;
3297
3298 node_ptr = find_node_record (name);
3299 #endif
3300 if (node_ptr == NULL) {
3301 error ("node_not_resp unable to find node %s", name);
3302 return;
3303 }
3304
3305 /* If the slurmd on the node responded with something, we never
3306 * want to set the node down; mark that the node responded, but
3307 * note that for whatever reason there was a communication
3308 * error. This way we don't mark the node down if the slurmd
3309 * really is there (wrong protocol version, munge issue, or
3310 * whatever), so we don't kill any running jobs.
3311 * RESPONSE_FORWARD_FAILED means we couldn't contact the slurmd
3312 * at all.
3313 * last_response could be in the future if a boot is in progress.
3314 */
3315 if (resp_type != RESPONSE_FORWARD_FAILED) {
3316 node_ptr->last_response = MAX(msg_time - 1,
3317 node_ptr->last_response);
3318 }
3319
3320 if (!IS_NODE_DOWN(node_ptr)) {
3321 /* Logged by node_no_resp_msg() on a periodic basis */
3322 node_ptr->not_responding = true;
3323 }
3324
3325 if (IS_NODE_NO_RESPOND(node_ptr) ||
3326 IS_NODE_POWER_SAVE(node_ptr))
3327 return; /* Already known to be not responding */
3328
3329 if (node_ptr->last_response >= msg_time) {
3330 debug("node_not_resp: node %s responded since msg sent",
3331 node_ptr->name);
3332 return;
3333 }
3334
3335 if (!IS_NODE_POWER_SAVE(node_ptr)) {
3336 node_ptr->node_state |= NODE_STATE_NO_RESPOND;
3337 #ifdef HAVE_FRONT_END
3338 last_front_end_update = time(NULL);
3339 #else
3340 last_node_update = time(NULL);
3341 bit_clear (avail_node_bitmap, (node_ptr - node_record_table_ptr));
3342 #endif
3343 }
3344
3345 return;
3346 }
3347
3348 /* For every node with the "not_responding" flag set, clear the flag
3349 * and log that the node is not responding using a hostlist expression */
3350 extern void node_no_resp_msg(void)
3351 {
3352 int i;
3353 node_record_t *node_ptr;
3354 char *host_str = NULL;
3355 hostlist_t no_resp_hostlist = NULL;
3356
3357 for (i = 0; i < node_record_count; i++) {
3358 node_ptr = &node_record_table_ptr[i];
3359 if (!node_ptr->not_responding ||
3360 IS_NODE_POWER_SAVE(node_ptr) ||
3361 IS_NODE_POWER_UP(node_ptr))
3362 continue;
3363 if (no_resp_hostlist) {
3364 (void) hostlist_push_host(no_resp_hostlist,
3365 node_ptr->name);
3366 } else
3367 no_resp_hostlist = hostlist_create(node_ptr->name);
3368 node_ptr->not_responding = false;
3369 }
3370 if (no_resp_hostlist) {
3371 hostlist_uniq(no_resp_hostlist);
3372 host_str = hostlist_ranged_string_xmalloc(no_resp_hostlist);
3373 error("Nodes %s not responding", host_str);
3374 xfree(host_str);
3375 hostlist_destroy(no_resp_hostlist);
3376 }
3377 }
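
/*
 * Example of the aggregated log line above (hypothetical node names):
 * because the non-responding names are pushed into a hostlist, uniqued
 * and condensed by hostlist_ranged_string_xmalloc(), one ranged
 * expression is reported rather than one line per node, e.g.
 *	"Nodes tux[001-004,007] not responding"
 */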
3378
3379 /*
3380 * set_node_down - make the specified compute node's state DOWN and
3381 * kill jobs as needed
3382 * IN name - name of the node
3383 * IN reason - why the node is DOWN
3384 */
3385 void set_node_down (char *name, char *reason)
3386 {
3387 node_record_t *node_ptr;
3388
3389 node_ptr = find_node_record (name);
3390 if (node_ptr == NULL) {
3391 error ("set_node_down unable to find node %s", name);
3392 return;
3393 }
3394 set_node_down_ptr (node_ptr, reason);
3395
3396 return;
3397 }
3398
3399 /*
3400 * set_node_down_ptr - make the specified compute node's state DOWN and
3401 * kill jobs as needed
3402 * IN node_ptr - node_ptr to the node
3403 * IN reason - why the node is DOWN
3404 */
3405 void set_node_down_ptr(node_record_t *node_ptr, char *reason)
3406 {
3407 time_t now = time(NULL);
3408
3409 if ((node_ptr->reason == NULL) ||
3410 (xstrncmp(node_ptr->reason, "Not responding", 14) == 0)) {
3411 xfree(node_ptr->reason);
3412 if (reason) {
3413 node_ptr->reason = xstrdup(reason);
3414 node_ptr->reason_time = now;
3415 node_ptr->reason_uid = slurmctld_conf.slurm_user_id;
3416 } else {
3417 node_ptr->reason_time = 0;
3418 node_ptr->reason_uid = NO_VAL;
3419 }
3420 }
3421 _make_node_down(node_ptr, now);
3422 (void) kill_running_job_by_node_name(node_ptr->name);
3423 _sync_bitmaps(node_ptr, 0);
3424
3425 return;
3426 }
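
/*
 * Usage sketch (hypothetical caller and reason string): a failure
 * detected elsewhere in the controller would typically down the node
 * through the name-based wrapper above, e.g.
 *	set_node_down(node_name, "Prolog/Epilog failure");
 * which resolves the name and calls set_node_down_ptr() to record the
 * reason, mark the node DOWN, kill its running jobs and resynchronize
 * the node bitmaps.
 */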
3427
3428 /*
3429 * is_node_down - determine if the specified node's state is DOWN
3430 * IN name - name of the node
3431 * RET true if node exists and is down, otherwise false
3432 */
3433 bool is_node_down (char *name)
3434 {
3435 node_record_t *node_ptr;
3436
3437 node_ptr = find_node_record (name);
3438 if (node_ptr == NULL) {
3439 error ("is_node_down unable to find node %s", name);
3440 return false;
3441 }
3442
3443 if (IS_NODE_DOWN(node_ptr))
3444 return true;
3445 return false;
3446 }
3447
3448 /*
3449 * is_node_resp - determine if the specified node is responding
3450 * IN name - name of the node
3451 * RET true if node exists and is responding, otherwise false
3452 */
3453 bool is_node_resp (char *name)
3454 {
3455 #ifdef HAVE_FRONT_END
3456 front_end_record_t *node_ptr;
3457
3458 node_ptr = find_front_end_record (name);
3459 #else
3460 node_record_t *node_ptr;
3461
3462 node_ptr = find_node_record (name);
3463 #endif
3464 if (node_ptr == NULL) {
3465 error ("is_node_resp unable to find node %s", name);
3466 return false;
3467 }
3468
3469 if (IS_NODE_NO_RESPOND(node_ptr))
3470 return false;
3471 return true;
3472 }
3473
3474 /*
3475 * find_first_node_record - find a record for the first node in the bitmap
3476 * IN node_bitmap
3477 */
3478 node_record_t *find_first_node_record(bitstr_t *node_bitmap)
3479 {
3480 int inx;
3481
3482 if (node_bitmap == NULL) {
3483 error ("find_first_node_record passed null bitstring");
3484 return NULL;
3485 }
3486
3487 inx = bit_ffs (node_bitmap);
3488 if (inx < 0)
3489 return NULL;
3490 else
3491 return &node_record_table_ptr[inx];
3492 }
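
/*
 * Usage sketch (job_ptr is a hypothetical caller variable): this is
 * handy for getting at the first node assigned to a job, e.g.
 *	node_record_t *first = find_first_node_record(job_ptr->node_bitmap);
 *	if (first)
 *		info("first node is %s", first->name);
 */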
3493
3494 /*
3495 * msg_to_slurmd - send given msg_type (REQUEST_RECONFIGURE or REQUEST_SHUTDOWN)
3496 * to every slurmd
3497 */
3498 void msg_to_slurmd (slurm_msg_type_t msg_type)
3499 {
3500 int i;
3501 shutdown_msg_t *shutdown_req;
3502 agent_arg_t *kill_agent_args;
3503 #ifdef HAVE_FRONT_END
3504 front_end_record_t *front_end_ptr;
3505 #else
3506 node_record_t *node_ptr;
3507 #endif
3508
3509 kill_agent_args = xmalloc (sizeof (agent_arg_t));
3510 kill_agent_args->msg_type = msg_type;
3511 kill_agent_args->retry = 0;
3512 kill_agent_args->hostlist = hostlist_create(NULL);
3513 if (msg_type == REQUEST_SHUTDOWN) {
3514 shutdown_req = xmalloc(sizeof(shutdown_msg_t));
3515 shutdown_req->options = 0;
3516 kill_agent_args->msg_args = shutdown_req;
3517 }
3518
3519 kill_agent_args->protocol_version = SLURM_PROTOCOL_VERSION;
3520
3521 #ifdef HAVE_FRONT_END
3522 for (i = 0, front_end_ptr = front_end_nodes;
3523 i < front_end_node_cnt; i++, front_end_ptr++) {
3524 if (kill_agent_args->protocol_version >
3525 front_end_ptr->protocol_version)
3526 kill_agent_args->protocol_version =
3527 front_end_ptr->protocol_version;
3528
3529 hostlist_push_host(kill_agent_args->hostlist,
3530 front_end_ptr->name);
3531 kill_agent_args->node_count++;
3532 }
3533 #else
3534 node_ptr = node_record_table_ptr;
3535 for (i = 0; i < node_record_count; i++, node_ptr++) {
3536 if (IS_NODE_FUTURE(node_ptr))
3537 continue;
3538 if (IS_NODE_CLOUD(node_ptr) && IS_NODE_POWER_SAVE(node_ptr))
3539 continue;
3540 if (kill_agent_args->protocol_version >
3541 node_record_table_ptr[i].protocol_version)
3542 kill_agent_args->protocol_version =
3543 node_record_table_ptr[i].protocol_version;
3544 hostlist_push_host(kill_agent_args->hostlist, node_ptr->name);
3545 kill_agent_args->node_count++;
3546 }
3547 #endif
3548
3549 if (kill_agent_args->node_count == 0) {
3550 hostlist_destroy(kill_agent_args->hostlist);
3551 xfree (kill_agent_args);
3552 } else {
3553 debug ("Spawning agent msg_type=%d", msg_type);
3554 agent_queue_request(kill_agent_args);
3555 }
3556 }
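
/*
 * Usage sketch: per the header comment, only the two message types are
 * expected here, e.g.
 *	msg_to_slurmd(REQUEST_RECONFIGURE);	(have every slurmd reconfigure)
 *	msg_to_slurmd(REQUEST_SHUTDOWN);	(have every slurmd shut down)
 */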
3557
3558 /*
3559 * Specialized version of msg_to_slurmd that handles cross-version issues
3560 * when running configless.
3561 *
3562 * Since the REQUEST_RECONFIGURE message had no body, you could get away with
3563 * sending under the oldest format of any slurmd attached to the system.
3564 *
3565 * For configless, this would mean nothing gets sent to anyone, and those
3566 * older slurmds get REQUEST_RECONFIGURE_WITH_CONFIG and ignore it.
3567 *
3568 * So explicitly split the pool into two groups.
3569 * (Note: may need to split this into three groups for future changes.)
3570 * Note: DOES NOT SUPPORT FRONTEND.
3571 */
3572 void push_reconfig_to_slurmd(void)
3573 {
3574 #ifndef HAVE_FRONT_END
3575 agent_arg_t *new_args, *old_args;
3576 node_record_t *node_ptr;
3577 config_response_msg_t *config = xmalloc(sizeof(*config));
3578
3579 new_args = xmalloc(sizeof(*new_args));
3580 new_args->msg_type = REQUEST_RECONFIGURE_WITH_CONFIG;
3581 new_args->retry = 0;
3582 new_args->hostlist = hostlist_create(NULL);
3583 new_args->protocol_version = SLURM_PROTOCOL_VERSION;
3584 new_args->msg_args = config;
3585 load_config_response_msg(config, CONFIG_REQUEST_SLURMD);
3586
3587 old_args = xmalloc(sizeof(*old_args));
3588 old_args->msg_type = REQUEST_RECONFIGURE;
3589 old_args->retry = 0;
3590 old_args->hostlist = hostlist_create(NULL);
3591 old_args->protocol_version = SLURM_MIN_PROTOCOL_VERSION;
3592
3593 node_ptr = node_record_table_ptr;
3594 for (int i = 0; i < node_record_count; i++, node_ptr++) {
3595 if (IS_NODE_FUTURE(node_ptr))
3596 continue;
3597 if (IS_NODE_CLOUD(node_ptr) && IS_NODE_POWER_SAVE(node_ptr))
3598 continue;
3599
3600 if (node_ptr->protocol_version == SLURM_PROTOCOL_VERSION) {
3601 hostlist_push_host(new_args->hostlist, node_ptr->name);
3602 new_args->node_count++;
3603 } else {
3604 hostlist_push_host(old_args->hostlist, node_ptr->name);
3605 old_args->node_count++;
3606 }
3607 }
3608
3609 if (new_args->node_count == 0) {
3610 hostlist_destroy(new_args->hostlist);
3611 slurm_free_config_response_msg(config);
3612 xfree(new_args);
3613 } else {
3614 debug("Spawning agent msg_type=%d", new_args->msg_type);
3615 agent_queue_request(new_args);
3616 }
3617
3618 if (old_args->node_count == 0) {
3619 hostlist_destroy(old_args->hostlist);
3620 xfree(old_args);
3621 } else {
3622 debug("Spawning agent msg_type=%d", old_args->msg_type);
3623 agent_queue_request(old_args);
3624 }
3625 #else
3626 error("%s: Cannot use configless with FrontEnd mode! Sending normal reconfigure request.",
3627 __func__);
3628 msg_to_slurmd(REQUEST_RECONFIGURE);
3629 #endif
3630 }
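
/*
 * Illustrative behavior of the split above: a slurmd already at
 * SLURM_PROTOCOL_VERSION receives REQUEST_RECONFIGURE_WITH_CONFIG along
 * with the packed config_response_msg_t payload, while any older slurmd
 * receives a plain REQUEST_RECONFIGURE and rereads its existing
 * configuration instead.
 */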
3631
3632
3633 /*
3634 * make_node_alloc - flag specified node as allocated to a job
3635 * IN node_ptr - pointer to node being allocated
3636 * IN job_ptr - pointer to job that is starting
3637 */
3638 extern void make_node_alloc(node_record_t *node_ptr, job_record_t *job_ptr)
3639 {
3640 int inx = node_ptr - node_record_table_ptr;
3641 uint32_t node_flags;
3642
3643 (node_ptr->run_job_cnt)++;
3644 bit_clear(idle_node_bitmap, inx);
3645 if (job_ptr->details && (job_ptr->details->share_res == 0)) {
3646 bit_clear(share_node_bitmap, inx);
3647 (node_ptr->no_share_job_cnt)++;
3648 }
3649
3650 if ((job_ptr->details &&
3651 (job_ptr->details->whole_node == WHOLE_NODE_USER)) ||
3652 (job_ptr->part_ptr &&
3653 (job_ptr->part_ptr->flags & PART_FLAG_EXCLUSIVE_USER))) {
3654 node_ptr->owner_job_cnt++;
3655 node_ptr->owner = job_ptr->user_id;
3656 }
3657
3658 if (slurm_mcs_get_select(job_ptr) == 1) {
3659 xfree(node_ptr->mcs_label);
3660 node_ptr->mcs_label = xstrdup(job_ptr->mcs_label);
3661 }
3662
3663 node_flags = node_ptr->node_state & NODE_STATE_FLAGS;
3664 node_ptr->node_state = NODE_STATE_ALLOCATED | node_flags;
3665 xfree(node_ptr->reason);
3666 node_ptr->reason_time = 0;
3667 node_ptr->reason_uid = NO_VAL;
3668
3669 last_node_update = time(NULL);
3670 }
3671
3672 /* make_node_avail - flag specified node as available */
3673 extern void make_node_avail(int node_inx)
3674 {
3675 bit_set(avail_node_bitmap, node_inx);
3676
3677 /*
3678 * If we are in the middle of a backfill cycle, this bitmap is
3679 * used (when bf_continue is enabled) to avoid scheduling lower
3680 * priority jobs on to newly available resources.
3681 */
3682 bit_set(bf_ignore_node_bitmap, node_inx);
3683 }
3684
3685 /* make_node_comp - flag specified node as completing a job
3686 * IN node_ptr - pointer to node marked for completion of job
3687 * IN job_ptr - pointer to job that is completing
3688 * IN suspended - true if job was previously suspended
3689 */
3690 extern void make_node_comp(node_record_t *node_ptr, job_record_t *job_ptr,
3691 bool suspended)
3692 {
3693 int inx = node_ptr - node_record_table_ptr;
3694 uint32_t node_flags;
3695 time_t now = time(NULL);
3696
3697 xassert(node_ptr);
3698 if (suspended) {
3699 if (node_ptr->sus_job_cnt) {
3700 (node_ptr->sus_job_cnt)--;
3701 } else {
3702 error("%s: %pJ node %s sus_job_cnt underflow", __func__,
3703 job_ptr, node_ptr->name);
3704 }
3705 } else {
3706 if (node_ptr->run_job_cnt) {
3707 (node_ptr->run_job_cnt)--;
3708 } else {
3709 error("%s: %pJ node %s run_job_cnt underflow", __func__,
3710 job_ptr, node_ptr->name);
3711 }
3712 if (job_ptr->details && (job_ptr->details->share_res == 0)) {
3713 if (node_ptr->no_share_job_cnt) {
3714 (node_ptr->no_share_job_cnt)--;
3715 } else {
3716 error("%s: %pJ node %s no_share_job_cnt underflow",
3717 __func__, job_ptr, node_ptr->name);
3718 }
3719 if (node_ptr->no_share_job_cnt == 0)
3720 bit_set(share_node_bitmap, inx);
3721 }
3722 }
3723
3724 if (!IS_NODE_DOWN(node_ptr) && !IS_NODE_POWER_UP(node_ptr)) {
3725 /* Don't verify RPC if node in DOWN or POWER_UP state */
3726 (node_ptr->comp_job_cnt)++;
3727 node_ptr->node_state |= NODE_STATE_COMPLETING;
3728 bit_set(cg_node_bitmap, inx);
3729 }
3730 node_flags = node_ptr->node_state & NODE_STATE_FLAGS;
3731
3732 if ((node_ptr->run_job_cnt == 0) &&
3733 (node_ptr->comp_job_cnt == 0)) {
3734 node_ptr->last_idle = now;
3735 bit_set(idle_node_bitmap, inx);
3736 if (IS_NODE_DRAIN(node_ptr) || IS_NODE_FAIL(node_ptr)) {
3737 trigger_node_drained(node_ptr);
3738 clusteracct_storage_g_node_down(
3739 acct_db_conn,
3740 node_ptr, now, NULL,
3741 slurmctld_conf.slurm_user_id);
3742 }
3743 }
3744
3745 if (IS_NODE_DOWN(node_ptr)) {
3746 debug3("%s: Node %s being left DOWN", __func__, node_ptr->name);
3747 } else if (node_ptr->run_job_cnt)
3748 node_ptr->node_state = NODE_STATE_ALLOCATED | node_flags;
3749 else {
3750 node_ptr->node_state = NODE_STATE_IDLE | node_flags;
3751 node_ptr->last_idle = now;
3752 }
3753 last_node_update = now;
3754 }
3755
3756 /* _make_node_down - flag specified node as down */
3757 static void _make_node_down(node_record_t *node_ptr, time_t event_time)
3758 {
3759 int inx = node_ptr - node_record_table_ptr;
3760 uint32_t node_flags;
3761
3762 xassert(node_ptr);
3763 node_flags = node_ptr->node_state & NODE_STATE_FLAGS;
3764 node_flags &= (~NODE_STATE_COMPLETING);
3765 node_ptr->node_state = NODE_STATE_DOWN | node_flags;
3766 node_ptr->owner = NO_VAL;
3767 xfree(node_ptr->mcs_label);
3768 bit_clear (avail_node_bitmap, inx);
3769 bit_clear (cg_node_bitmap, inx);
3770 bit_set (idle_node_bitmap, inx);
3771 bit_set (share_node_bitmap, inx);
3772 bit_clear (up_node_bitmap, inx);
3773 trigger_node_down(node_ptr);
3774 last_node_update = time (NULL);
3775 clusteracct_storage_g_node_down(acct_db_conn,
3776 node_ptr, event_time, NULL,
3777 node_ptr->reason_uid);
3778 }
3779
3780 /*
3781 * make_node_idle - flag specified node as having finished with a job
3782 * IN node_ptr - pointer to node reporting job completion
3783 * IN job_ptr - pointer to job that just completed or NULL if not applicable
3784 */
3785 void make_node_idle(node_record_t *node_ptr, job_record_t *job_ptr)
3786 {
3787 int inx = node_ptr - node_record_table_ptr;
3788 uint32_t node_flags;
3789 time_t now = time(NULL);
3790 bitstr_t *node_bitmap = NULL;
3791
3792 if (job_ptr) {
3793 if (job_ptr->node_bitmap_cg)
3794 node_bitmap = job_ptr->node_bitmap_cg;
3795 else
3796 node_bitmap = job_ptr->node_bitmap;
3797 }
3798
3799 trace_job(job_ptr, __func__, "enter");
3800
3801 xassert(node_ptr);
3802 if (node_bitmap && (bit_test(node_bitmap, inx))) {
3803 /* Not a replay */
3804 last_job_update = now;
3805 bit_clear(node_bitmap, inx);
3806
3807 if (!IS_JOB_FINISHED(job_ptr))
3808 job_update_tres_cnt(job_ptr, inx);
3809
3810 if (job_ptr->node_cnt) {
3811 /*
3812 * Clean up the JOB_COMPLETING flag
3813 * only if the slurmctld epilog is
3814 * not running; otherwise wait until
3815 * it terminates, at which point this
3816 * function will be invoked again.
3817 */
3818 job_ptr->node_cnt--;
3819 if ((job_ptr->node_cnt == 0) &&
3820 !job_ptr->epilog_running)
3821 cleanup_completing(job_ptr);
3822 } else if ((job_ptr->total_cpus == 0) &&
3823 (job_ptr->total_nodes == 0)) {
3824 /* Job resized to zero nodes (expanded another job) */
3825 } else {
3826 error("%s: %pJ node_cnt underflow", __func__, job_ptr);
3827 }
3828
3829 if (IS_JOB_SUSPENDED(job_ptr)) {
3830 /* Remove node from suspended job */
3831 if (node_ptr->sus_job_cnt)
3832 (node_ptr->sus_job_cnt)--;
3833 else
3834 error("%s: %pJ node %s sus_job_cnt underflow",
3835 __func__, job_ptr, node_ptr->name);
3836 } else if (IS_JOB_RUNNING(job_ptr)) {
3837 /* Remove node from running job */
3838 if (node_ptr->run_job_cnt)
3839 (node_ptr->run_job_cnt)--;
3840 else
3841 error("%s: %pJ node %s run_job_cnt underflow",
3842 __func__, job_ptr, node_ptr->name);
3843 } else {
3844 if (node_ptr->comp_job_cnt) {
3845 (node_ptr->comp_job_cnt)--;
3846 } else if (IS_NODE_DOWN(node_ptr)) {
3847 /* We were not expecting this response,
3848 * ignore it */
3849 } else {
3850 error("%s: %pJ node %s comp_job_cnt underflow",
3851 __func__, job_ptr, node_ptr->name);
3852 }
3853 if (node_ptr->comp_job_cnt > 0)
3854 goto fini; /* More jobs completing */
3855 }
3856 }
3857
3858 if (node_ptr->comp_job_cnt == 0) {
3859 node_ptr->node_state &= (~NODE_STATE_COMPLETING);
3860 bit_clear(cg_node_bitmap, inx);
3861 if (IS_NODE_IDLE(node_ptr)) {
3862 node_ptr->owner = NO_VAL;
3863 xfree(node_ptr->mcs_label);
3864 }
3865 }
3866
3867 node_flags = node_ptr->node_state & NODE_STATE_FLAGS;
3868 if (IS_NODE_DOWN(node_ptr)) {
3869 debug3("%s: %pJ node %s being left DOWN",
3870 __func__, job_ptr, node_ptr->name);
3871 goto fini;
3872 }
3873 bit_set(up_node_bitmap, inx);
3874
3875 if (IS_NODE_DRAIN(node_ptr) || IS_NODE_FAIL(node_ptr) ||
3876 IS_NODE_NO_RESPOND(node_ptr))
3877 bit_clear(avail_node_bitmap, inx);
3878 else
3879 make_node_avail(inx);
3880
3881 if ((IS_NODE_DRAIN(node_ptr) || IS_NODE_FAIL(node_ptr)) &&
3882 (node_ptr->run_job_cnt == 0) && (node_ptr->comp_job_cnt == 0)) {
3883 node_ptr->node_state = NODE_STATE_IDLE | node_flags;
3884 bit_set(idle_node_bitmap, inx);
3885 debug3("%s: %pJ node %s is DRAINED",
3886 __func__, job_ptr, node_ptr->name);
3887 node_ptr->last_idle = now;
3888 trigger_node_drained(node_ptr);
3889 if (!IS_NODE_REBOOT(node_ptr))
3890 clusteracct_storage_g_node_down(acct_db_conn,
3891 node_ptr, now, NULL,
3892 slurmctld_conf.slurm_user_id);
3893 } else if (node_ptr->run_job_cnt) {
3894 node_ptr->node_state = NODE_STATE_ALLOCATED | node_flags;
3895 if (!IS_NODE_NO_RESPOND(node_ptr) &&
3896 !IS_NODE_FAIL(node_ptr) && !IS_NODE_DRAIN(node_ptr))
3897 make_node_avail(inx);
3898 } else {
3899 node_ptr->node_state = NODE_STATE_IDLE | node_flags;
3900 if (!IS_NODE_NO_RESPOND(node_ptr) &&
3901 !IS_NODE_FAIL(node_ptr) && !IS_NODE_DRAIN(node_ptr))
3902 make_node_avail(inx);
3903 if (!IS_NODE_NO_RESPOND(node_ptr) &&
3904 !IS_NODE_COMPLETING(node_ptr))
3905 bit_set(idle_node_bitmap, inx);
3906 node_ptr->last_idle = now;
3907 }
3908
3909 fini:
3910 if (job_ptr &&
3911 ((job_ptr->details &&
3912 (job_ptr->details->whole_node == WHOLE_NODE_USER)) ||
3913 (job_ptr->part_ptr &&
3914 (job_ptr->part_ptr->flags & PART_FLAG_EXCLUSIVE_USER)))) {
3915 if (node_ptr->owner_job_cnt == 0) {
3916 error("%s: node_ptr->owner_job_cnt underflow",
3917 __func__);
3918 } else if (--node_ptr->owner_job_cnt == 0) {
3919 node_ptr->owner = NO_VAL;
3920 xfree(node_ptr->mcs_label);
3921 }
3922 }
3923 last_node_update = now;
3924 }
3925
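/*
 * send_nodes_to_accounting - report all nodes not in an "up" state
 *	(DOWN, DRAIN or FAIL) to the accounting storage plugin
 * IN event_time - time stamp to record with each event
 * RET SLURM_SUCCESS, or the error returned by the first failed report
 */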
3926 extern int send_nodes_to_accounting(time_t event_time)
3927 {
3928 int rc = SLURM_SUCCESS, i = 0;
3929 node_record_t *node_ptr = NULL;
3930 char *reason = NULL;
3931 slurmctld_lock_t node_read_lock = {
3932 READ_LOCK, NO_LOCK, READ_LOCK, WRITE_LOCK, NO_LOCK };
3933
3934 lock_slurmctld(node_read_lock);
3935 /* send nodes not in 'up' state */
3936 node_ptr = node_record_table_ptr;
3937 for (i = 0; i < node_record_count; i++, node_ptr++) {
3938 if (!node_ptr->name)
3939 continue;
3940 if (node_ptr->reason)
3941 reason = node_ptr->reason;
3942 else
3943 reason = "First Registration";
3944 if (IS_NODE_DRAIN(node_ptr) ||
3945 IS_NODE_FAIL(node_ptr) ||
3946 IS_NODE_DOWN(node_ptr))
3947 rc = clusteracct_storage_g_node_down(
3948 acct_db_conn,
3949 node_ptr, event_time,
3950 reason,
3951 slurmctld_conf.slurm_user_id);
3952 if (rc == SLURM_ERROR)
3953 break;
3954 }
3955 unlock_slurmctld(node_read_lock);
3956 return rc;
3957 }
3958
3959 /* node_fini - free all memory associated with node records */
3960 extern void node_fini (void)
3961 {
3962 FREE_NULL_LIST(active_feature_list);
3963 FREE_NULL_LIST(avail_feature_list);
3964 FREE_NULL_BITMAP(avail_node_bitmap);
3965 FREE_NULL_BITMAP(bf_ignore_node_bitmap);
3966 FREE_NULL_BITMAP(booting_node_bitmap);
3967 FREE_NULL_BITMAP(cg_node_bitmap);
3968 FREE_NULL_BITMAP(future_node_bitmap);
3969 FREE_NULL_BITMAP(idle_node_bitmap);
3970 FREE_NULL_BITMAP(power_node_bitmap);
3971 FREE_NULL_BITMAP(share_node_bitmap);
3972 FREE_NULL_BITMAP(up_node_bitmap);
3973 FREE_NULL_BITMAP(rs_node_bitmap);
3974 node_fini2();
3975 }
3976
3977 /* Reset a node's CPU load value */
3978 extern void reset_node_load(char *node_name, uint32_t cpu_load)
3979 {
3980 #ifdef HAVE_FRONT_END
3981 return;
3982 #else
3983 node_record_t *node_ptr;
3984
3985 node_ptr = find_node_record(node_name);
3986 if (node_ptr) {
3987 time_t now = time(NULL);
3988 node_ptr->cpu_load = cpu_load;
3989 node_ptr->cpu_load_time = now;
3990 last_node_update = now;
3991 } else
3992 error("reset_node_load unable to find node %s", node_name);
3993 #endif
3994 }
3995
3996 /* Reset a node's free memory value */
3997 extern void reset_node_free_mem(char *node_name, uint64_t free_mem)
3998 {
3999 #ifdef HAVE_FRONT_END
4000 return;
4001 #else
4002 node_record_t *node_ptr;
4003
4004 node_ptr = find_node_record(node_name);
4005 if (node_ptr) {
4006 time_t now = time(NULL);
4007 node_ptr->free_mem = free_mem;
4008 node_ptr->free_mem_time = now;
4009 last_node_update = now;
4010 } else
4011 error("reset_node_free_mem unable to find node %s", node_name);
4012 #endif
4013 }
4014
4015
4016 /*
4017 * Check for nodes that haven't rebooted yet.
4018 *
4019 * If the node hasn't booted by ResumeTimeout, mark the node as down.
4020 */
4021 extern void check_reboot_nodes()
4022 {
4023 int i;
4024 node_record_t *node_ptr;
4025 time_t now = time(NULL);
4026 uint16_t resume_timeout = slurmctld_conf.resume_timeout;
4027
4028 for (i = 0; i < node_record_count; i++) {
4029 node_ptr = &node_record_table_ptr[i];
4030
4031 if (IS_NODE_REBOOT(node_ptr) &&
4032 node_ptr->boot_req_time &&
4033 (node_ptr->boot_req_time + resume_timeout < now)) {
4034 char *timeout_msg = "reboot timed out";
4035
4036 if ((node_ptr->next_state != NO_VAL) &&
4037 node_ptr->reason) {
4038 xstrfmtcat(node_ptr->reason, " : %s",
4039 timeout_msg);
4040 } else {
4041 xfree(node_ptr->reason);
4042 node_ptr->reason = xstrdup(timeout_msg);
4043 }
4044 node_ptr->reason_time = now;
4045 node_ptr->reason_uid = slurmctld_conf.slurm_user_id;
4046
4047 /*
4048 * Remove states now so that event state shows as DOWN.
4049 */
4050 node_ptr->node_state &= (~NODE_STATE_REBOOT);
4051 node_ptr->node_state &= (~NODE_STATE_DRAIN);
4052 node_ptr->boot_req_time = 0;
4053 set_node_down_ptr(node_ptr, NULL);
4054
4055 bit_clear(rs_node_bitmap, i);
4056 }
4057 }
4058 }
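
/*
 * Example (hypothetical slurm.conf setting): with ResumeTimeout=600, a
 * node still flagged NODE_STATE_REBOOT more than 600 seconds after its
 * boot_req_time is marked DOWN with the reason "reboot timed out".
 */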
4059
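/*
 * waiting_for_node_boot - check if a node is still expected to boot
 * IN node_ptr - pointer to the node of interest
 * RET true if a power up or reboot was requested and the node has not
 *	yet reported a boot time newer than the request
 */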
4060 extern bool waiting_for_node_boot(struct node_record *node_ptr)
4061 {
4062 xassert(node_ptr);
4063
4064 if ((IS_NODE_POWER_UP(node_ptr) ||
4065 (IS_NODE_DOWN(node_ptr) && IS_NODE_REBOOT(node_ptr))) &&
4066 (node_ptr->boot_time < node_ptr->boot_req_time)) {
4067 debug("Still waiting for boot of node %s", node_ptr->name);
4068 return true;
4069 }
4070
4071 return false;
4072 }
4073