1 /****************************************************************************\
2 * sdiag.c - Utility for getting information about slurmctld behaviour
3 *****************************************************************************
4 * Produced at Barcelona Supercomputing Center, December 2011
5 * Written by Alejandro Lucero <alucero@bsc.es>
6 *
7 * This file is part of Slurm, a resource management program.
8 * For details, see <https://slurm.schedmd.com/>.
9 * Please also read the included file: DISCLAIMER.
10 *
11 * Slurm is free software; you can redistribute it and/or modify it under
12 * the terms of the GNU General Public License as published by the Free
13 * Software Foundation; either version 2 of the License, or (at your option)
14 * any later version.
15 *
16 * In addition, as a special exception, the copyright holders give permission
17 * to link the code of portions of this program with the OpenSSL library under
18 * certain conditions as described in each individual source file, and
19 * distribute linked combinations including the two. You must obey the GNU
20 * General Public License in all respects for all of the code used other than
21 * OpenSSL. If you modify file(s) with this exception, you may extend this
22 * exception to your version of the file(s), but you are not obligated to do
23 * so. If you do not wish to do so, delete this exception statement from your
24 * version. If you delete this exception statement from all source files in
25 * the program, then also delete it here.
26 *
27 * Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
28 * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
29 * FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
30 * details.
31 *
32 * You should have received a copy of the GNU General Public License along
33 * with Slurm; if not, write to the Free Software Foundation, Inc.,
34 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
35 \*****************************************************************************/
36
37 #include "config.h"
38
39 #include <stdlib.h>
40 #include <unistd.h>
41
42 #include <slurm.h>
43 #include "src/common/macros.h"
44 #include "src/common/read_config.h"
45 #include "src/common/slurm_protocol_defs.h"
46 #include "src/common/slurm_time.h"
47 #include "src/common/uid.h"
48 #include "src/common/xmalloc.h"
49 #include "src/common/xstring.h"
50
51 #include "sdiag.h"
52
53 /********************
54 * Global Variables *
55 ********************/
56 struct sdiag_parameters params;
57
58 stats_info_response_msg_t *buf;
59 uint32_t *rpc_type_ave_time = NULL, *rpc_user_ave_time = NULL;
60
61 static int _print_stats(void);
62 static void _sort_rpc(void);
63
64 stats_info_request_msg_t req;
65
66 extern void parse_command_line(int argc, char **argv);
67
main(int argc,char ** argv)68 int main(int argc, char **argv)
69 {
70 int rc = 0;
71
72 slurm_conf_init(NULL);
73 parse_command_line(argc, argv);
74
75 if (params.mode == STAT_COMMAND_RESET) {
76 req.command_id = STAT_COMMAND_RESET;
77 rc = slurm_reset_statistics((stats_info_request_msg_t *)&req);
78 if (rc == SLURM_SUCCESS)
79 printf("Reset scheduling statistics\n");
80 else
81 slurm_perror("slurm_reset_statistics");
82 } else {
83 req.command_id = STAT_COMMAND_GET;
84 rc = slurm_get_statistics(&buf,
85 (stats_info_request_msg_t *)&req);
86 if (rc == SLURM_SUCCESS) {
87 _sort_rpc();
88 rc = _print_stats();
89 #ifdef MEMORY_LEAK_DEBUG
90 slurm_free_stats_response_msg(buf);
91 xfree(rpc_type_ave_time);
92 xfree(rpc_user_ave_time);
93 #endif
94 } else
95 slurm_perror("slurm_get_statistics");
96 }
97
98 exit(rc);
99 }
100
_print_stats(void)101 static int _print_stats(void)
102 {
103 int i;
104
105 if (!buf) {
106 printf("No data available. Probably slurmctld is not working\n");
107 return -1;
108 }
109
110 printf("*******************************************************\n");
111 printf("sdiag output at %s (%ld)\n",
112 slurm_ctime2(&buf->req_time), buf->req_time);
113 printf("Data since %s (%ld)\n",
114 slurm_ctime2(&buf->req_time_start), buf->req_time_start);
115 printf("*******************************************************\n");
116
117 printf("Server thread count: %d\n", buf->server_thread_count);
118 printf("Agent queue size: %d\n", buf->agent_queue_size);
119 printf("Agent count: %d\n", buf->agent_count);
120 printf("Agent thread count: %d\n", buf->agent_thread_count);
121 printf("DBD Agent queue size: %d\n\n", buf->dbd_agent_queue_size);
122
123 printf("Jobs submitted: %d\n", buf->jobs_submitted);
124 printf("Jobs started: %d\n", buf->jobs_started);
125 printf("Jobs completed: %d\n", buf->jobs_completed);
126 printf("Jobs canceled: %d\n", buf->jobs_canceled);
127 printf("Jobs failed: %d\n\n", buf->jobs_failed);
128
129 printf("Job states ts: %s (%ld)\n",
130 slurm_ctime2(&buf->job_states_ts), buf->job_states_ts);
131 printf("Jobs pending: %d\n", buf->jobs_pending);
132 printf("Jobs running: %d\n", buf->jobs_running);
133
134 printf("\nMain schedule statistics (microseconds):\n");
135 printf("\tLast cycle: %u\n", buf->schedule_cycle_last);
136 printf("\tMax cycle: %u\n", buf->schedule_cycle_max);
137 printf("\tTotal cycles: %u\n", buf->schedule_cycle_counter);
138 if (buf->schedule_cycle_counter > 0) {
139 printf("\tMean cycle: %u\n",
140 buf->schedule_cycle_sum / buf->schedule_cycle_counter);
141 printf("\tMean depth cycle: %u\n",
142 buf->schedule_cycle_depth / buf->schedule_cycle_counter);
143 }
144 if ((buf->req_time - buf->req_time_start) > 60) {
145 printf("\tCycles per minute: %u\n",
146 (uint32_t) (buf->schedule_cycle_counter /
147 ((buf->req_time - buf->req_time_start) / 60)));
148 }
149 printf("\tLast queue length: %u\n", buf->schedule_queue_len);
150
151 if (buf->bf_active) {
152 printf("\nBackfilling stats (WARNING: data obtained"
153 " in the middle of backfilling execution.)\n");
154 } else
155 printf("\nBackfilling stats\n");
156
157 printf("\tTotal backfilled jobs (since last slurm start): %u\n",
158 buf->bf_backfilled_jobs);
159 printf("\tTotal backfilled jobs (since last stats cycle start): %u\n",
160 buf->bf_last_backfilled_jobs);
161 printf("\tTotal backfilled heterogeneous job components: %u\n",
162 buf->bf_backfilled_het_jobs);
163 printf("\tTotal cycles: %u\n", buf->bf_cycle_counter);
164 if (buf->bf_when_last_cycle > 0) {
165 printf("\tLast cycle when: %s (%ld)\n",
166 slurm_ctime2(&buf->bf_when_last_cycle),
167 buf->bf_when_last_cycle);
168 } else {
169 printf("\tLast cycle when: N/A\n");
170 }
171 printf("\tLast cycle: %u\n", buf->bf_cycle_last);
172 printf("\tMax cycle: %u\n", buf->bf_cycle_max);
173 if (buf->bf_cycle_counter > 0) {
174 printf("\tMean cycle: %"PRIu64"\n",
175 buf->bf_cycle_sum / buf->bf_cycle_counter);
176 }
177 printf("\tLast depth cycle: %u\n", buf->bf_last_depth);
178 printf("\tLast depth cycle (try sched): %u\n", buf->bf_last_depth_try);
179 if (buf->bf_cycle_counter > 0) {
180 printf("\tDepth Mean: %u\n",
181 buf->bf_depth_sum / buf->bf_cycle_counter);
182 printf("\tDepth Mean (try depth): %u\n",
183 buf->bf_depth_try_sum / buf->bf_cycle_counter);
184 }
185 printf("\tLast queue length: %u\n", buf->bf_queue_len);
186 if (buf->bf_cycle_counter > 0) {
187 printf("\tQueue length mean: %u\n",
188 buf->bf_queue_len_sum / buf->bf_cycle_counter);
189 }
190 printf("\tLast table size: %u\n", buf->bf_table_size);
191 if (buf->bf_cycle_counter > 0) {
192 printf("\tMean table size: %u\n",
193 buf->bf_table_size_sum / buf->bf_cycle_counter);
194 }
195
196 printf("\nLatency for 1000 calls to gettimeofday(): %d microseconds\n",
197 buf->gettimeofday_latency);
198
199 printf("\nRemote Procedure Call statistics by message type\n");
200 for (i = 0; i < buf->rpc_type_size; i++) {
201 printf("\t%-40s(%5u) count:%-6u "
202 "ave_time:%-6u total_time:%"PRIu64"\n",
203 rpc_num2string(buf->rpc_type_id[i]),
204 buf->rpc_type_id[i], buf->rpc_type_cnt[i],
205 rpc_type_ave_time[i], buf->rpc_type_time[i]);
206 }
207
208 printf("\nRemote Procedure Call statistics by user\n");
209 for (i = 0; i < buf->rpc_user_size; i++) {
210 char *user = uid_to_string_or_null(buf->rpc_user_id[i]);
211 if (!user)
212 xstrfmtcat(user, "%u", buf->rpc_user_id[i]);
213
214 printf("\t%-16s(%8u) count:%-6u "
215 "ave_time:%-6u total_time:%"PRIu64"\n",
216 user, buf->rpc_user_id[i], buf->rpc_user_cnt[i],
217 rpc_user_ave_time[i], buf->rpc_user_time[i]);
218
219 xfree(user);
220 }
221
222 printf("\nPending RPC statistics\n");
223 if (buf->rpc_queue_type_count == 0)
224 printf("\tNo pending RPCs\n");
225 for (i = 0; i < buf->rpc_queue_type_count; i++){
226 printf("\t%-40s(%5u) count:%-6u\n",
227 rpc_num2string(buf->rpc_queue_type_id[i]),
228 buf->rpc_queue_type_id[i],
229 buf->rpc_queue_count[i]);
230 }
231
232 if (buf->rpc_dump_count > 0) {
233 printf("\nPending RPCs\n");
234 }
235
236 for (i = 0; i < buf->rpc_dump_count; i++) {
237 printf("\t%2u: %-36s %s\n",
238 i+1,
239 rpc_num2string(buf->rpc_dump_types[i]),
240 buf->rpc_dump_hostlist[i]);
241 }
242
243 return 0;
244 }
245
_sort_rpc(void)246 static void _sort_rpc(void)
247 {
248 int i, j;
249 uint16_t type_id;
250 uint32_t type_ave, type_cnt, user_ave, user_cnt, user_id;
251 uint64_t type_time, user_time;
252
253 rpc_type_ave_time = xmalloc(sizeof(uint32_t) * buf->rpc_type_size);
254 rpc_user_ave_time = xmalloc(sizeof(uint32_t) * buf->rpc_user_size);
255
256 if (params.sort == SORT_ID) {
257 for (i = 0; i < buf->rpc_type_size; i++) {
258 for (j = i+1; j < buf->rpc_type_size; j++) {
259 if (buf->rpc_type_id[i] <= buf->rpc_type_id[j])
260 continue;
261 type_id = buf->rpc_type_id[i];
262 type_cnt = buf->rpc_type_cnt[i];
263 type_time = buf->rpc_type_time[i];
264 buf->rpc_type_id[i] = buf->rpc_type_id[j];
265 buf->rpc_type_cnt[i] = buf->rpc_type_cnt[j];
266 buf->rpc_type_time[i] = buf->rpc_type_time[j];
267 buf->rpc_type_id[j] = type_id;
268 buf->rpc_type_cnt[j] = type_cnt;
269 buf->rpc_type_time[j] = type_time;
270 }
271 if (buf->rpc_type_cnt[i]) {
272 rpc_type_ave_time[i] = buf->rpc_type_time[i] /
273 buf->rpc_type_cnt[i];
274 }
275 }
276 for (i = 0; i < buf->rpc_user_size; i++) {
277 for (j = i+1; j < buf->rpc_user_size; j++) {
278 if (buf->rpc_user_id[i] <= buf->rpc_user_id[j])
279 continue;
280 user_id = buf->rpc_user_id[i];
281 user_cnt = buf->rpc_user_cnt[i];
282 user_time = buf->rpc_user_time[i];
283 buf->rpc_user_id[i] = buf->rpc_user_id[j];
284 buf->rpc_user_cnt[i] = buf->rpc_user_cnt[j];
285 buf->rpc_user_time[i] = buf->rpc_user_time[j];
286 buf->rpc_user_id[j] = user_id;
287 buf->rpc_user_cnt[j] = user_cnt;
288 buf->rpc_user_time[j] = user_time;
289 }
290 if (buf->rpc_user_cnt[i]) {
291 rpc_user_ave_time[i] = buf->rpc_user_time[i] /
292 buf->rpc_user_cnt[i];
293 }
294 }
295 } else if (params.sort == SORT_TIME) {
296 for (i = 0; i < buf->rpc_type_size; i++) {
297 for (j = i+1; j < buf->rpc_type_size; j++) {
298 if (buf->rpc_type_time[i] >= buf->rpc_type_time[j])
299 continue;
300 type_id = buf->rpc_type_id[i];
301 type_cnt = buf->rpc_type_cnt[i];
302 type_time = buf->rpc_type_time[i];
303 buf->rpc_type_id[i] = buf->rpc_type_id[j];
304 buf->rpc_type_cnt[i] = buf->rpc_type_cnt[j];
305 buf->rpc_type_time[i] = buf->rpc_type_time[j];
306 buf->rpc_type_id[j] = type_id;
307 buf->rpc_type_cnt[j] = type_cnt;
308 buf->rpc_type_time[j] = type_time;
309 }
310 if (buf->rpc_type_cnt[i]) {
311 rpc_type_ave_time[i] = buf->rpc_type_time[i] /
312 buf->rpc_type_cnt[i];
313 }
314 }
315 for (i = 0; i < buf->rpc_user_size; i++) {
316 for (j = i+1; j < buf->rpc_user_size; j++) {
317 if (buf->rpc_user_time[i] >= buf->rpc_user_time[j])
318 continue;
319 user_id = buf->rpc_user_id[i];
320 user_cnt = buf->rpc_user_cnt[i];
321 user_time = buf->rpc_user_time[i];
322 buf->rpc_user_id[i] = buf->rpc_user_id[j];
323 buf->rpc_user_cnt[i] = buf->rpc_user_cnt[j];
324 buf->rpc_user_time[i] = buf->rpc_user_time[j];
325 buf->rpc_user_id[j] = user_id;
326 buf->rpc_user_cnt[j] = user_cnt;
327 buf->rpc_user_time[j] = user_time;
328 }
329 if (buf->rpc_user_cnt[i]) {
330 rpc_user_ave_time[i] = buf->rpc_user_time[i] /
331 buf->rpc_user_cnt[i];
332 }
333 }
334 } else if (params.sort == SORT_TIME2) {
335 for (i = 0; i < buf->rpc_type_size; i++) {
336 if (buf->rpc_type_cnt[i]) {
337 rpc_type_ave_time[i] = buf->rpc_type_time[i] /
338 buf->rpc_type_cnt[i];
339 }
340 }
341 for (i = 0; i < buf->rpc_type_size; i++) {
342 for (j = i+1; j < buf->rpc_type_size; j++) {
343 if (rpc_type_ave_time[i] >= rpc_type_ave_time[j])
344 continue;
345 type_ave = rpc_type_ave_time[i];
346 type_id = buf->rpc_type_id[i];
347 type_cnt = buf->rpc_type_cnt[i];
348 type_time = buf->rpc_type_time[i];
349 rpc_type_ave_time[i] = rpc_type_ave_time[j];
350 buf->rpc_type_id[i] = buf->rpc_type_id[j];
351 buf->rpc_type_cnt[i] = buf->rpc_type_cnt[j];
352 buf->rpc_type_time[i] = buf->rpc_type_time[j];
353 rpc_type_ave_time[j] = type_ave;
354 buf->rpc_type_id[j] = type_id;
355 buf->rpc_type_cnt[j] = type_cnt;
356 buf->rpc_type_time[j] = type_time;
357 }
358 }
359 for (i = 0; i < buf->rpc_user_size; i++) {
360 if (buf->rpc_user_cnt[i]) {
361 rpc_user_ave_time[i] = buf->rpc_user_time[i] /
362 buf->rpc_user_cnt[i];
363 }
364 }
365 for (i = 0; i < buf->rpc_user_size; i++) {
366 for (j = i+1; j < buf->rpc_user_size; j++) {
367 if (rpc_user_ave_time[i] >= rpc_user_ave_time[j])
368 continue;
369 user_ave = rpc_user_ave_time[i];
370 user_id = buf->rpc_user_id[i];
371 user_cnt = buf->rpc_user_cnt[i];
372 user_time = buf->rpc_user_time[i];
373 rpc_user_ave_time[i] = rpc_user_ave_time[j];
374 buf->rpc_user_id[i] = buf->rpc_user_id[j];
375 buf->rpc_user_cnt[i] = buf->rpc_user_cnt[j];
376 buf->rpc_user_time[i] = buf->rpc_user_time[j];
377 rpc_user_ave_time[j] = user_ave;
378 buf->rpc_user_id[j] = user_id;
379 buf->rpc_user_cnt[j] = user_cnt;
380 buf->rpc_user_time[j] = user_time;
381 }
382 }
383 } else { /* sort by count */
384 for (i = 0; i < buf->rpc_type_size; i++) {
385 for (j = i+1; j < buf->rpc_type_size; j++) {
386 if (buf->rpc_type_cnt[i] >= buf->rpc_type_cnt[j])
387 continue;
388 type_id = buf->rpc_type_id[i];
389 type_cnt = buf->rpc_type_cnt[i];
390 type_time = buf->rpc_type_time[i];
391 buf->rpc_type_id[i] = buf->rpc_type_id[j];
392 buf->rpc_type_cnt[i] = buf->rpc_type_cnt[j];
393 buf->rpc_type_time[i] = buf->rpc_type_time[j];
394 buf->rpc_type_id[j] = type_id;
395 buf->rpc_type_cnt[j] = type_cnt;
396 buf->rpc_type_time[j] = type_time;
397 }
398 if (buf->rpc_type_cnt[i]) {
399 rpc_type_ave_time[i] = buf->rpc_type_time[i] /
400 buf->rpc_type_cnt[i];
401 }
402 }
403 for (i = 0; i < buf->rpc_user_size; i++) {
404 for (j = i+1; j < buf->rpc_user_size; j++) {
405 if (buf->rpc_user_cnt[i] >= buf->rpc_user_cnt[j])
406 continue;
407 user_id = buf->rpc_user_id[i];
408 user_cnt = buf->rpc_user_cnt[i];
409 user_time = buf->rpc_user_time[i];
410 buf->rpc_user_id[i] = buf->rpc_user_id[j];
411 buf->rpc_user_cnt[i] = buf->rpc_user_cnt[j];
412 buf->rpc_user_time[i] = buf->rpc_user_time[j];
413 buf->rpc_user_id[j] = user_id;
414 buf->rpc_user_cnt[j] = user_cnt;
415 buf->rpc_user_time[j] = user_time;
416 }
417 if (buf->rpc_user_cnt[i]) {
418 rpc_user_ave_time[i] = buf->rpc_user_time[i] /
419 buf->rpc_user_cnt[i];
420 }
421 }
422 }
423 }
424