1 /*****************************************************************************\
2 * node_info.c - get/print the node state information of slurm
3 *****************************************************************************
4 * Copyright (C) 2002-2007 The Regents of the University of California.
5 * Copyright (C) 2008-2010 Lawrence Livermore National Security.
6 * Portions Copyright (C) 2010-2017 SchedMD LLC <https://www.schedmd.com>.
7 * Produced at Lawrence Livermore National Laboratory (cf, DISCLAIMER).
8 * Written by Morris Jette <jette1@llnl.gov> et. al.
9 * CODE-OCEC-09-009. All rights reserved.
10 *
11 * This file is part of Slurm, a resource management program.
12 * For details, see <https://slurm.schedmd.com/>.
13 * Please also read the included file: DISCLAIMER.
14 *
15 * Slurm is free software; you can redistribute it and/or modify it under
16 * the terms of the GNU General Public License as published by the Free
17 * Software Foundation; either version 2 of the License, or (at your option)
18 * any later version.
19 *
20 * In addition, as a special exception, the copyright holders give permission
21 * to link the code of portions of this program with the OpenSSL library under
22 * certain conditions as described in each individual source file, and
23 * distribute linked combinations including the two. You must obey the GNU
24 * General Public License in all respects for all of the code used other than
25 * OpenSSL. If you modify file(s) with this exception, you may extend this
26 * exception to your version of the file(s), but you are not obligated to do
27 * so. If you do not wish to do so, delete this exception statement from your
28 * version. If you delete this exception statement from all source files in
29 * the program, then also delete it here.
30 *
31 * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
32 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
33 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
34 * details.
35 *
36 * You should have received a copy of the GNU General Public License along
37 * with Slurm; if not, write to the Free Software Foundation, Inc.,
38 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
39 \*****************************************************************************/
40
41 #include <arpa/inet.h>
42 #include <errno.h>
43 #include <netinet/in.h>
44 #include <stdio.h>
45 #include <stdlib.h>
46 #include <string.h>
47 #include <syslog.h>
48 #include <unistd.h>
49
50 #include "slurm/slurm.h"
51
52 #include "src/common/node_select.h"
53 #include "src/common/parse_time.h"
54 #include "src/common/slurm_acct_gather_energy.h"
55 #include "src/common/slurm_auth.h"
56 #include "src/common/slurm_ext_sensors.h"
57 #include "src/common/slurm_protocol_api.h"
58 #include "src/common/slurm_resource_info.h"
59 #include "src/common/uid.h"
60 #include "src/common/xmalloc.h"
61 #include "src/common/xstring.h"
62
63 /* Data structures for pthreads used to gather node information from multiple
64 * clusters in parallel */
65 typedef struct load_node_req_struct {
66 slurmdb_cluster_rec_t *cluster;
67 int cluster_inx;
68 slurm_msg_t *req_msg;
69 List resp_msg_list;
70 uint16_t show_flags;
71 } load_node_req_struct_t;
72
73 typedef struct load_node_resp_struct {
74 int cluster_inx;
75 node_info_msg_t *new_msg;
76 } load_node_resp_struct_t;
77
78 /*
79 * slurm_print_node_info_msg - output information about all Slurm nodes
80 * based upon message as loaded using slurm_load_node
81 * IN out - file to write to
82 * IN node_info_msg_ptr - node information message pointer
83 * IN one_liner - print as a single line if true
84 */
85 void
slurm_print_node_info_msg(FILE * out,node_info_msg_t * node_info_msg_ptr,int one_liner)86 slurm_print_node_info_msg ( FILE * out, node_info_msg_t * node_info_msg_ptr,
87 int one_liner )
88 {
89 int i;
90 node_info_t * node_ptr = node_info_msg_ptr -> node_array ;
91 char time_str[32];
92
93 slurm_make_time_str ((time_t *)&node_info_msg_ptr->last_update,
94 time_str, sizeof(time_str));
95 fprintf( out, "Node data as of %s, record count %d\n",
96 time_str, node_info_msg_ptr->record_count);
97
98 for (i = 0; i < node_info_msg_ptr-> record_count; i++) {
99 slurm_print_node_table ( out, & node_ptr[i],
100 one_liner ) ;
101 }
102 }
103
104
/*
 * slurm_print_node_table - output information about a specific Slurm node
 *	based upon message as loaded using slurm_load_node
 * IN out - file to write to
 * IN node_ptr - an individual node information record pointer
 * IN one_liner - print as a single line if true
 */
void slurm_print_node_table(FILE *out, node_info_t *node_ptr, int one_liner)
{
	/* Format the record, write it, then release the buffer */
	char *buf = slurm_sprint_node_table(node_ptr, one_liner);

	fprintf(out, "%s", buf);
	xfree(buf);
}
118
119 /* Given data structures containing information about nodes and partitions,
120 * populate the node's "partitions" field */
121 void
slurm_populate_node_partitions(node_info_msg_t * node_buffer_ptr,partition_info_msg_t * part_buffer_ptr)122 slurm_populate_node_partitions(node_info_msg_t *node_buffer_ptr,
123 partition_info_msg_t *part_buffer_ptr)
124 {
125 int i, j, n, p;
126 node_info_t *node_ptr;
127 partition_info_t *part_ptr;
128
129 if (!node_buffer_ptr || (node_buffer_ptr->record_count == 0) ||
130 !part_buffer_ptr || (part_buffer_ptr->record_count == 0))
131 return;
132
133 for (n = 0, node_ptr = node_buffer_ptr->node_array;
134 n < node_buffer_ptr->record_count; n++, node_ptr++) {
135 xfree(node_ptr->partitions);
136 }
137
138 /*
139 * Iterate through the partitions in the slurm.conf using "p". The
140 * partition has an array of node index pairs to specify the range.
141 * Using "i", iterate by two's through the node list to get the
142 * begin-end node range. Using "j", interate through the node range
143 * and add the partition name to the node's partition list. If the
144 * node on the partition is a singleton (i.e. Nodes=node1), the
145 * begin-end range are both the same node index value.
146 */
147 for (p = 0, part_ptr = part_buffer_ptr->partition_array;
148 p < part_buffer_ptr->record_count; p++, part_ptr++) {
149 for (i = 0; ; i += 2) {
150 if (part_ptr->node_inx[i] == -1)
151 break;
152 for (j = part_ptr->node_inx[i];
153 j <= part_ptr->node_inx[i+1]; j++) {
154 char *sep = "";
155 if ((j < 0) ||
156 (j >= node_buffer_ptr->record_count))
157 continue;
158 node_ptr = node_buffer_ptr->node_array + j;
159 if (node_ptr->partitions)
160 sep = ",";
161 xstrfmtcat(node_ptr->partitions, "%s%s", sep,
162 part_ptr->name);
163 }
164 }
165 }
166 }
167
/*
 * slurm_sprint_node_table - output information about a specific Slurm node
 *	based upon message as loaded using slurm_load_node
 * IN node_ptr - an individual node information record pointer
 * IN one_liner - print as a single line if true
 * RET out - char * containing formatted output (must be freed after call)
 *	NULL is returned on failure.
 */
char *slurm_sprint_node_table(node_info_t *node_ptr, int one_liner)
{
	uint32_t my_state = node_ptr->node_state;
	char *cloud_str = "", *comp_str = "", *drain_str = "", *power_str = "";
	char time_str[32];
	char *out = NULL, *reason_str = NULL;
	uint16_t alloc_cpus = 0;
	int idle_cpus;
	/* Initialized in case the select plugin does not store a value,
	 * so an indeterminate number is never printed for AllocMem */
	uint64_t alloc_memory = 0;
	char *node_alloc_tres = NULL;
	char *line_end = (one_liner) ? " " : "\n ";

	/* Split the state flag bits into printable suffix strings and
	 * clear them so node_state_string() sees only the base state.
	 * Note: FAIL intentionally shares drain_str with DRAIN. */
	if (my_state & NODE_STATE_CLOUD) {
		my_state &= (~NODE_STATE_CLOUD);
		cloud_str = "+CLOUD";
	}
	if (my_state & NODE_STATE_COMPLETING) {
		my_state &= (~NODE_STATE_COMPLETING);
		comp_str = "+COMPLETING";
	}
	if (my_state & NODE_STATE_DRAIN) {
		my_state &= (~NODE_STATE_DRAIN);
		drain_str = "+DRAIN";
	}
	if (my_state & NODE_STATE_FAIL) {
		my_state &= (~NODE_STATE_FAIL);
		drain_str = "+FAIL";
	}
	if (my_state & NODE_STATE_POWER_SAVE) {
		my_state &= (~NODE_STATE_POWER_SAVE);
		power_str = "+POWER";
	}
	if (my_state & NODE_STATE_POWERING_DOWN) {
		my_state &= (~NODE_STATE_POWERING_DOWN);
		power_str = "+POWERING_DOWN";
	}
	slurm_get_select_nodeinfo(node_ptr->select_nodeinfo,
				  SELECT_NODEDATA_SUBCNT,
				  NODE_STATE_ALLOCATED,
				  &alloc_cpus);
	idle_cpus = node_ptr->cpus - alloc_cpus;

	/* Partially allocated nodes are reported as MIXED */
	if (idle_cpus && (idle_cpus != node_ptr->cpus)) {
		my_state &= NODE_STATE_FLAGS;
		my_state |= NODE_STATE_MIXED;
	}

	/****** Line 1 ******/
	xstrfmtcat(out, "NodeName=%s ", node_ptr->name);

	if (node_ptr->arch)
		xstrfmtcat(out, "Arch=%s ", node_ptr->arch);

	if (node_ptr->cpu_bind) {
		char tmp_str[128];
		slurm_sprint_cpu_bind_type(tmp_str, node_ptr->cpu_bind);
		xstrfmtcat(out, "CpuBind=%s ", tmp_str);
	}

	xstrfmtcat(out, "CoresPerSocket=%u ", node_ptr->cores);

	xstrcat(out, line_end);

	/****** Line ******/
	xstrfmtcat(out, "CPUAlloc=%u CPUTot=%u ",
		   alloc_cpus, node_ptr->cpus);

	/* cpu_load is reported in hundredths of a load unit */
	if (node_ptr->cpu_load == NO_VAL)
		xstrcat(out, "CPULoad=N/A");
	else
		xstrfmtcat(out, "CPULoad=%.2f", (node_ptr->cpu_load / 100.0));

	xstrcat(out, line_end);

	/****** Line ******/
	xstrfmtcat(out, "AvailableFeatures=%s", node_ptr->features);
	xstrcat(out, line_end);

	/****** Line ******/
	xstrfmtcat(out, "ActiveFeatures=%s", node_ptr->features_act);
	xstrcat(out, line_end);

	/****** Line ******/
	xstrfmtcat(out, "Gres=%s", node_ptr->gres);
	xstrcat(out, line_end);

	/****** Line (optional) ******/
	if (node_ptr->gres_drain) {
		xstrfmtcat(out, "GresDrain=%s", node_ptr->gres_drain);
		xstrcat(out, line_end);
	}

	/****** Line (optional) ******/
	if (node_ptr->gres_used) {
		xstrfmtcat(out, "GresUsed=%s", node_ptr->gres_used);
		xstrcat(out, line_end);
	}

	/****** Line (optional): addressing/version info, emitted only if
	 ****** at least one field differs from the defaults ******/
	{
		bool line_used = false;

		if (node_ptr->node_addr) {
			xstrfmtcat(out, "NodeAddr=%s ", node_ptr->node_addr);
			line_used = true;
		}

		if (node_ptr->node_hostname) {
			xstrfmtcat(out, "NodeHostName=%s ",
				   node_ptr->node_hostname);
			line_used = true;
		}

		if (node_ptr->bcast_address) {
			xstrfmtcat(out, "BcastAddr=%s ", node_ptr->bcast_address);
			line_used = true;
		}

		/* Only print the port if it differs from the default */
		if (node_ptr->port != slurm_get_slurmd_port()) {
			xstrfmtcat(out, "Port=%u ", node_ptr->port);
			line_used = true;
		}

		if (node_ptr->version) {
			xstrfmtcat(out, "Version=%s", node_ptr->version);
			line_used = true;
		}

		if (line_used)
			xstrcat(out, line_end);
	}

	/****** Line ******/
	if (node_ptr->os) {
		xstrfmtcat(out, "OS=%s ", node_ptr->os);
		xstrcat(out, line_end);
	}

	/****** Line ******/
	slurm_get_select_nodeinfo(node_ptr->select_nodeinfo,
				  SELECT_NODEDATA_MEM_ALLOC,
				  NODE_STATE_ALLOCATED,
				  &alloc_memory);
	xstrfmtcat(out, "RealMemory=%"PRIu64" AllocMem=%"PRIu64" ",
		   node_ptr->real_memory, alloc_memory);

	if (node_ptr->free_mem == NO_VAL64)
		xstrcat(out, "FreeMem=N/A ");
	else
		xstrfmtcat(out, "FreeMem=%"PRIu64" ", node_ptr->free_mem);

	xstrfmtcat(out, "Sockets=%u Boards=%u",
		   node_ptr->sockets, node_ptr->boards);
	xstrcat(out, line_end);

	/****** core & memory specialization Line (optional) ******/
	if (node_ptr->core_spec_cnt || node_ptr->cpu_spec_list ||
	    node_ptr->mem_spec_limit) {
		if (node_ptr->core_spec_cnt) {
			xstrfmtcat(out, "CoreSpecCount=%u ",
				   node_ptr->core_spec_cnt);
		}
		if (node_ptr->cpu_spec_list) {
			xstrfmtcat(out, "CPUSpecList=%s ",
				   node_ptr->cpu_spec_list);
		}
		if (node_ptr->mem_spec_limit) {
			xstrfmtcat(out, "MemSpecLimit=%"PRIu64"",
				   node_ptr->mem_spec_limit);
		}
		xstrcat(out, line_end);
	}

	/****** Line ******/
	xstrfmtcat(out, "State=%s%s%s%s%s ThreadsPerCore=%u TmpDisk=%u Weight=%u ",
		   node_state_string(my_state),
		   cloud_str, comp_str, drain_str, power_str,
		   node_ptr->threads, node_ptr->tmp_disk, node_ptr->weight);

	if (node_ptr->owner == NO_VAL) {
		xstrcat(out, "Owner=N/A ");
	} else {
		char *user_name = uid_to_string((uid_t) node_ptr->owner);
		xstrfmtcat(out, "Owner=%s(%u) ", user_name, node_ptr->owner);
		xfree(user_name);
	}

	xstrfmtcat(out, "MCS_label=%s",
		   (node_ptr->mcs_label == NULL) ? "N/A" : node_ptr->mcs_label);

	xstrcat(out, line_end);

	/****** Line: pending state after reboot (optional) ******/
	if ((node_ptr->next_state != NO_VAL) &&
	    (my_state & NODE_STATE_REBOOT)) {
		xstrfmtcat(out, "NextState=%s",
			   node_state_string(node_ptr->next_state));
		xstrcat(out, line_end);
	}

	/****** Line ******/
	if (node_ptr->partitions) {
		xstrfmtcat(out, "Partitions=%s ", node_ptr->partitions);
		xstrcat(out, line_end);
	}

	/****** Line ******/
	if (node_ptr->boot_time) {
		slurm_make_time_str((time_t *)&node_ptr->boot_time,
				    time_str, sizeof(time_str));
		xstrfmtcat(out, "BootTime=%s ", time_str);
	} else {
		xstrcat(out, "BootTime=None ");
	}

	if (node_ptr->slurmd_start_time) {
		slurm_make_time_str ((time_t *)&node_ptr->slurmd_start_time,
				     time_str, sizeof(time_str));
		xstrfmtcat(out, "SlurmdStartTime=%s", time_str);
	} else {
		xstrcat(out, "SlurmdStartTime=None");
	}
	xstrcat(out, line_end);

	/****** TRES Line ******/
	select_g_select_nodeinfo_get(node_ptr->select_nodeinfo,
				     SELECT_NODEDATA_TRES_ALLOC_FMT_STR,
				     NODE_STATE_ALLOCATED, &node_alloc_tres);
	xstrfmtcat(out, "CfgTRES=%s", node_ptr->tres_fmt_str);
	xstrcat(out, line_end);
	xstrfmtcat(out, "AllocTRES=%s",
		   (node_alloc_tres) ? node_alloc_tres : "");
	xfree(node_alloc_tres);
	xstrcat(out, line_end);

	/****** Power Management Line ******/
	if (!node_ptr->power || (node_ptr->power->cap_watts == NO_VAL))
		xstrcat(out, "CapWatts=n/a");
	else
		xstrfmtcat(out, "CapWatts=%u", node_ptr->power->cap_watts);

	xstrcat(out, line_end);

	/****** Power Consumption Line ******/
	if (!node_ptr->energy || node_ptr->energy->current_watts == NO_VAL)
		xstrcat(out, "CurrentWatts=n/s AveWatts=n/s");
	else
		xstrfmtcat(out, "CurrentWatts=%u AveWatts=%u",
			   node_ptr->energy->current_watts,
			   node_ptr->energy->ave_watts);

	xstrcat(out, line_end);

	/****** external sensors Line ******/
	if (!node_ptr->ext_sensors
	    || node_ptr->ext_sensors->consumed_energy == NO_VAL64)
		xstrcat(out, "ExtSensorsJoules=n/s ");
	else
		xstrfmtcat(out, "ExtSensorsJoules=%"PRIu64" ",
			   node_ptr->ext_sensors->consumed_energy);

	if (!node_ptr->ext_sensors
	    || node_ptr->ext_sensors->current_watts == NO_VAL)
		xstrcat(out, "ExtSensorsWatts=n/s ");
	else
		xstrfmtcat(out, "ExtSensorsWatts=%u ",
			   node_ptr->ext_sensors->current_watts);

	if (!node_ptr->ext_sensors
	    || node_ptr->ext_sensors->temperature == NO_VAL)
		xstrcat(out, "ExtSensorsTemp=n/s");
	else
		xstrfmtcat(out, "ExtSensorsTemp=%u",
			   node_ptr->ext_sensors->temperature);

	xstrcat(out, line_end);

	/****** Reason Line(s): the reason text may span multiple lines;
	 ****** the first line is annotated with [user@time] if known ******/
	if (node_ptr->reason && node_ptr->reason[0])
		xstrcat(reason_str, node_ptr->reason);
	if (reason_str) {
		int inx = 1;
		char *save_ptr = NULL, *tok, *user_name;
		tok = strtok_r(reason_str, "\n", &save_ptr);
		while (tok) {
			if (inx == 1) {
				xstrcat(out, "Reason=");
			} else {
				xstrcat(out, line_end);
				xstrcat(out, " ");
			}
			xstrfmtcat(out, "%s", tok);
			if ((inx++ == 1) && node_ptr->reason_time) {
				user_name = uid_to_string(node_ptr->reason_uid);
				slurm_make_time_str((time_t *)&node_ptr->reason_time,
						    time_str, sizeof(time_str));
				xstrfmtcat(out, " [%s@%s]", user_name, time_str);
				xfree(user_name);
			}
			tok = strtok_r(NULL, "\n", &save_ptr);
		}
		xfree(reason_str);
	}
	if (one_liner)
		xstrcat(out, "\n");
	else
		xstrcat(out, "\n\n");

	return out;
}
486
/* Rewrite the state of any node that is partially allocated (some, but
 * not all, of its CPUs in use) to MIXED for display purposes. */
static void _set_node_mixed(node_info_msg_t *resp)
{
	int inx;

	if (!resp)
		return;

	for (inx = 0; inx < resp->record_count; inx++) {
		node_info_t *node_ptr = resp->node_array + inx;
		uint16_t alloc_cpus = 0;

		select_g_select_nodeinfo_get(node_ptr->select_nodeinfo,
					     SELECT_NODEDATA_SUBCNT,
					     NODE_STATE_ALLOCATED,
					     &alloc_cpus);
		if (alloc_cpus && (alloc_cpus != node_ptr->cpus)) {
			node_ptr->node_state &= NODE_STATE_FLAGS;
			node_ptr->node_state |= NODE_STATE_MIXED;
		}
	}
}
507
/*
 * _load_cluster_nodes - issue the RPC in req_msg to one cluster's
 *	slurmctld and return its node table
 * IN req_msg - pre-built request message (e.g. REQUEST_NODE_INFO)
 * OUT node_info_msg_pptr - set to the node table on success, NULL
 *	otherwise; caller frees with slurm_free_node_info_msg()
 * IN cluster - cluster to contact, NULL for the local cluster
 * IN show_flags - filtering options; SHOW_MIXED rewrites partially
 *	allocated nodes to the MIXED state
 * RET SLURM_SUCCESS or a Slurm error code (errno set on failure)
 */
static int _load_cluster_nodes(slurm_msg_t *req_msg,
			       node_info_msg_t **node_info_msg_pptr,
			       slurmdb_cluster_rec_t *cluster,
			       uint16_t show_flags)
{
	slurm_msg_t resp_msg;
	int rc;

	/* Initialize the output up front so that no error path can leave
	 * the caller holding an indeterminate pointer */
	*node_info_msg_pptr = NULL;

	slurm_msg_t_init(&resp_msg);

	if (slurm_send_recv_controller_msg(req_msg, &resp_msg, cluster) < 0)
		return SLURM_ERROR;

	switch (resp_msg.msg_type) {
	case RESPONSE_NODE_INFO:
		*node_info_msg_pptr = (node_info_msg_t *) resp_msg.data;
		if (show_flags & SHOW_MIXED)
			_set_node_mixed(*node_info_msg_pptr);
		break;
	case RESPONSE_SLURM_RC:
		rc = ((return_code_msg_t *) resp_msg.data)->return_code;
		slurm_free_return_code_msg(resp_msg.data);
		if (rc)
			slurm_seterrno_ret(rc);
		break;
	default:
		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
		break;
	}

	return SLURM_SUCCESS;
}
541
542 /* Maintain a consistent ordering of records */
_sort_by_cluster_inx(void * x,void * y)543 static int _sort_by_cluster_inx(void *x, void *y)
544 {
545 load_node_resp_struct_t *resp_x = *(load_node_resp_struct_t **) x;
546 load_node_resp_struct_t *resp_y = *(load_node_resp_struct_t **) y;
547
548 if (resp_x->cluster_inx > resp_y->cluster_inx)
549 return -1;
550 if (resp_x->cluster_inx < resp_y->cluster_inx)
551 return 1;
552 return 0;
553 }
554
555 /* Thread to read node information from some cluster */
_load_node_thread(void * args)556 static void *_load_node_thread(void *args)
557 {
558 load_node_req_struct_t *load_args = (load_node_req_struct_t *) args;
559 slurmdb_cluster_rec_t *cluster = load_args->cluster;
560 node_info_msg_t *new_msg = NULL;
561 int i, rc;
562
563 if ((rc = _load_cluster_nodes(load_args->req_msg, &new_msg, cluster,
564 load_args->show_flags)) || !new_msg) {
565 verbose("Error reading node information from cluster %s: %s",
566 cluster->name, slurm_strerror(rc));
567 } else {
568 load_node_resp_struct_t *node_resp;
569 for (i = 0; i < new_msg->record_count; i++) {
570 if (!new_msg->node_array[i].cluster_name) {
571 new_msg->node_array[i].cluster_name =
572 xstrdup(cluster->name);
573 }
574 }
575 node_resp = xmalloc(sizeof(load_node_resp_struct_t));
576 node_resp->cluster_inx = load_args->cluster_inx;
577 node_resp->new_msg = new_msg;
578 list_append(load_args->resp_msg_list, node_resp);
579 }
580 xfree(args);
581
582 return (void *) NULL;
583 }
584
/*
 * _load_fed_nodes - gather node information from every active cluster in
 *	a federation (one thread per cluster) and merge the results into a
 *	single node_info_msg_t
 * IN req_msg - request to send to each cluster's slurmctld
 * OUT node_info_msg_pptr - merged node table on success, NULL otherwise;
 *	caller frees with slurm_free_node_info_msg()
 * IN show_flags - node filtering options, passed through to each thread
 * IN cluster_name - local cluster name (currently unused here)
 * IN fed - federation record listing the clusters to query
 * RET SLURM_SUCCESS or SLURM_ERROR (if no cluster responded)
 */
static int _load_fed_nodes(slurm_msg_t *req_msg,
			   node_info_msg_t **node_info_msg_pptr,
			   uint16_t show_flags, char *cluster_name,
			   slurmdb_federation_rec_t *fed)
{
	int cluster_inx = 0, i;
	load_node_resp_struct_t *node_resp;
	node_info_msg_t *orig_msg = NULL, *new_msg = NULL;
	uint32_t new_rec_cnt;
	slurmdb_cluster_rec_t *cluster;
	ListIterator iter;
	int pthread_count = 0;
	pthread_t *load_thread = 0;
	load_node_req_struct_t *load_args;
	List resp_msg_list;

	*node_info_msg_pptr = NULL;

	/* Spawn one pthread per cluster to collect node information */
	resp_msg_list = list_create(NULL);
	load_thread = xmalloc(sizeof(pthread_t) *
			      list_count(fed->cluster_list));
	iter = list_iterator_create(fed->cluster_list);
	while ((cluster = (slurmdb_cluster_rec_t *) list_next(iter))) {
		if ((cluster->control_host == NULL) ||
		    (cluster->control_host[0] == '\0'))
			continue;	/* Cluster down */

		/* load_args is freed by _load_node_thread() */
		load_args = xmalloc(sizeof(load_node_req_struct_t));
		load_args->cluster = cluster;
		load_args->cluster_inx = cluster_inx++;
		load_args->req_msg = req_msg;
		load_args->resp_msg_list = resp_msg_list;
		load_args->show_flags = show_flags;
		slurm_thread_create(&load_thread[pthread_count],
				    _load_node_thread, load_args);
		pthread_count++;
	}
	list_iterator_destroy(iter);

	/* Wait for all pthreads to complete */
	for (i = 0; i < pthread_count; i++)
		pthread_join(load_thread[i], NULL);
	xfree(load_thread);

	/* Maintain a consistent cluster/node ordering */
	list_sort(resp_msg_list, _sort_by_cluster_inx);

	/* Merge the responses into a single response message: the first
	 * response becomes the result; every subsequent response has its
	 * node records appended to it and its shell freed */
	iter = list_iterator_create(resp_msg_list);
	while ((node_resp = (load_node_resp_struct_t *) list_next(iter))) {
		new_msg = node_resp->new_msg;
		if (!orig_msg) {
			orig_msg = new_msg;
			*node_info_msg_pptr = orig_msg;
		} else {
			/* Merge the node records */
			orig_msg->last_update = MIN(orig_msg->last_update,
						    new_msg->last_update);
			new_rec_cnt = orig_msg->record_count +
				      new_msg->record_count;
			if (new_msg->record_count) {
				orig_msg->node_array =
					xrealloc(orig_msg->node_array,
						 sizeof(node_info_t) *
						 new_rec_cnt);
				/* Shallow copy: the appended records keep
				 * ownership of their inner allocations */
				(void) memcpy(orig_msg->node_array +
					      orig_msg->record_count,
					      new_msg->node_array,
					      sizeof(node_info_t) *
					      new_msg->record_count);
				orig_msg->record_count = new_rec_cnt;
			}
			xfree(new_msg->node_array);
			xfree(new_msg);
		}
		xfree(node_resp);
	}
	list_iterator_destroy(iter);
	FREE_NULL_LIST(resp_msg_list);

	if (!orig_msg)
		slurm_seterrno_ret(SLURM_ERROR);

	return SLURM_SUCCESS;
}
671
/*
 * slurm_load_node - issue RPC to get all Slurm node configuration
 *	information if changed since update_time
 * IN update_time - time of current configuration data
 * OUT resp - place to store a node configuration pointer
 * IN show_flags - node filtering options
 * RET 0 or a slurm error code
 * NOTE: free the response using slurm_free_node_info_msg
 */
extern int slurm_load_node(time_t update_time, node_info_msg_t **resp,
			   uint16_t show_flags)
{
	slurm_msg_t req_msg;
	node_info_request_msg_t req;
	char *cluster_name = NULL;
	void *ptr = NULL;
	slurmdb_federation_rec_t *fed;
	int rc;

	/* Determine the name of the cluster being worked with */
	if (working_cluster_rec)
		cluster_name = xstrdup(working_cluster_rec->name);
	else
		cluster_name = slurm_get_cluster_name();
	/* Note: short-circuit order matters below; the federation record
	 * is only loaded into "ptr" when the show flags request it */
	if ((show_flags & SHOW_FEDERATION) && !(show_flags & SHOW_LOCAL) &&
	    (slurm_load_federation(&ptr) == SLURM_SUCCESS) &&
	    cluster_in_federation(ptr, cluster_name)) {
		/* In federation. Need full info from all clusters */
		update_time = (time_t) 0;
		show_flags &= (~SHOW_LOCAL);
	} else {
		/* Report local cluster info only */
		show_flags |= SHOW_LOCAL;
		show_flags &= (~SHOW_FEDERATION);
	}

	/* Build the node information request */
	slurm_msg_t_init(&req_msg);
	memset(&req, 0, sizeof(req));
	req.last_update = update_time;
	req.show_flags = show_flags;
	req_msg.msg_type = REQUEST_NODE_INFO;
	req_msg.data = &req;

	if ((show_flags & SHOW_FEDERATION) && ptr) { /* "ptr" check for CLANG */
		fed = (slurmdb_federation_rec_t *) ptr;
		rc = _load_fed_nodes(&req_msg, resp, show_flags, cluster_name,
				     fed);
	} else {
		rc = _load_cluster_nodes(&req_msg, resp, working_cluster_rec,
					 show_flags);
	}

	if (ptr)
		slurm_destroy_federation_rec(ptr);
	xfree(cluster_name);

	return rc;
}
729
730 /*
731 * slurm_load_node2 - equivalent to slurm_load_node() with addition
732 * of cluster record for communications in a federation
733 */
extern int slurm_load_node2(time_t update_time, node_info_msg_t **resp,
			    uint16_t show_flags, slurmdb_cluster_rec_t *cluster)
{
	node_info_request_msg_t node_req;
	slurm_msg_t msg;

	/* Build a REQUEST_NODE_INFO message and send it to the
	 * specified cluster's controller */
	slurm_msg_t_init(&msg);
	memset(&node_req, 0, sizeof(node_req));
	node_req.last_update = update_time;
	node_req.show_flags = show_flags;
	msg.msg_type = REQUEST_NODE_INFO;
	msg.data = &node_req;

	return _load_cluster_nodes(&msg, resp, cluster, show_flags);
}
749
750 /*
751 * slurm_load_node_single - issue RPC to get slurm configuration information
752 * for a specific node
753 * OUT resp - place to store a node configuration pointer
754 * IN node_name - name of the node for which information is requested
755 * IN show_flags - node filtering options
756 * RET 0 or a slurm error code
757 * NOTE: free the response using slurm_free_node_info_msg
758 */
extern int slurm_load_node_single(node_info_msg_t **resp, char *node_name,
				  uint16_t show_flags)
{
	node_info_single_msg_t node_req;
	slurm_msg_t msg;

	/* Build a single-node information request and send it to the
	 * local (working) cluster's controller */
	slurm_msg_t_init(&msg);
	memset(&node_req, 0, sizeof(node_req));
	node_req.node_name = node_name;
	node_req.show_flags = show_flags;
	msg.msg_type = REQUEST_NODE_INFO_SINGLE;
	msg.data = &node_req;

	return _load_cluster_nodes(&msg, resp, working_cluster_rec,
				   show_flags);
}
775
776 /*
777 * slurm_load_node_single2 - equivalent to slurm_load_node_single() with
778 * addition of cluster record for communications in a federation
779 */
extern int slurm_load_node_single2(node_info_msg_t **resp, char *node_name,
				   uint16_t show_flags,
				   slurmdb_cluster_rec_t *cluster)
{
	node_info_single_msg_t node_req;
	slurm_msg_t msg;

	/* Build a single-node information request and send it to the
	 * specified cluster's controller */
	slurm_msg_t_init(&msg);
	memset(&node_req, 0, sizeof(node_req));
	node_req.node_name = node_name;
	node_req.show_flags = show_flags;
	msg.msg_type = REQUEST_NODE_INFO_SINGLE;
	msg.data = &node_req;

	return _load_cluster_nodes(&msg, resp, cluster, show_flags);
}
796
/*
 * slurm_get_node_energy - issue RPC to get the energy data of all
 * configured sensors on the target machine
 * IN host - name of node to query, NULL if localhost
 * IN context_id - specific plugin to query.
 * IN delta - Use cache if data is newer than this in seconds
 * OUT sensors_cnt - number of sensors
 * OUT energy - array of acct_gather_energy_t structures on success or
 *	NULL otherwise
 * RET 0 on success or a slurm error code
 * NOTE: free the response using xfree
 */
extern int slurm_get_node_energy(char *host, uint16_t context_id,
				 uint16_t delta,
				 uint16_t *sensor_cnt,
				 acct_gather_energy_t **energy)
{
	int rc;
	slurm_msg_t req_msg;
	slurm_msg_t resp_msg;
	acct_gather_energy_req_msg_t req;
	uint32_t cluster_flags = slurmdb_setup_cluster_flags();
	char *this_addr;

	xassert(sensor_cnt);
	xassert(energy);

	/* Initialize outputs so error paths leave them well defined */
	*sensor_cnt = 0;
	*energy = NULL;

	slurm_msg_t_init(&req_msg);
	slurm_msg_t_init(&resp_msg);

	/* Resolve the slurmd address to contact: an explicit host, the
	 * node named by SLURMD_NODENAME (multiple-slurmd test setups),
	 * or the local host, in that order of preference */
	if (host)
		slurm_conf_get_addr(host, &req_msg.address, req_msg.flags);
	else if (cluster_flags & CLUSTER_FLAG_MULTSD) {
		if ((this_addr = getenv("SLURMD_NODENAME"))) {
			slurm_conf_get_addr(this_addr, &req_msg.address,
					    req_msg.flags);
		} else {
			this_addr = "localhost";
			slurm_set_addr(&req_msg.address,
				       (uint16_t)slurm_get_slurmd_port(),
				       this_addr);
		}
	} else {
		char this_host[256];
		/*
		 * Set request message address to slurmd on localhost
		 */
		gethostname_short(this_host, sizeof(this_host));
		this_addr = slurm_conf_get_nodeaddr(this_host);
		if (this_addr == NULL)
			this_addr = xstrdup("localhost");
		slurm_set_addr(&req_msg.address,
			       (uint16_t)slurm_get_slurmd_port(),
			       this_addr);
		xfree(this_addr);
	}

	/* Build and send the energy request to slurmd */
	memset(&req, 0, sizeof(req));
	req.context_id = context_id;
	req.delta = delta;
	req_msg.msg_type = REQUEST_ACCT_GATHER_ENERGY;
	req_msg.data = &req;

	rc = slurm_send_recv_node_msg(&req_msg, &resp_msg, 0);

	/* The auth credential must be destroyed on every path */
	if (rc != 0 || !resp_msg.auth_cred) {
		error("slurm_get_node_energy: %m");
		if (resp_msg.auth_cred)
			g_slurm_auth_destroy(resp_msg.auth_cred);
		return SLURM_ERROR;
	}
	if (resp_msg.auth_cred)
		g_slurm_auth_destroy(resp_msg.auth_cred);
	switch (resp_msg.msg_type) {
	case RESPONSE_ACCT_GATHER_ENERGY:
		*sensor_cnt = ((acct_gather_node_resp_msg_t *)
			       resp_msg.data)->sensor_cnt;
		/* Steal the energy array from the response before freeing
		 * it, so the caller takes ownership */
		*energy = ((acct_gather_node_resp_msg_t *)
			   resp_msg.data)->energy;
		((acct_gather_node_resp_msg_t *) resp_msg.data)->energy = NULL;
		slurm_free_acct_gather_node_resp_msg(resp_msg.data);
		break;
	case RESPONSE_SLURM_RC:
		rc = ((return_code_msg_t *) resp_msg.data)->return_code;
		slurm_free_return_code_msg(resp_msg.data);
		if (rc)
			slurm_seterrno_ret(rc);
		break;
	default:
		slurm_seterrno_ret(SLURM_UNEXPECTED_MSG_ERROR);
		break;
	}

	return SLURM_SUCCESS;
}
895