1 /****************************************************************************\
2  *  sdiag.c - Utility for getting information about slurmctld behaviour
3  *****************************************************************************
4  *  Produced at Barcelona Supercomputing Center, December 2011
5  *  Written by Alejandro Lucero <alucero@bsc.es>
6  *
7  *  This file is part of Slurm, a resource management program.
8  *  For details, see <https://slurm.schedmd.com/>.
9  *  Please also read the included file: DISCLAIMER.
10  *
11  *  Slurm is free software; you can redistribute it and/or modify it under
12  *  the terms of the GNU General Public License as published by the Free
13  *  Software Foundation; either version 2 of the License, or (at your option)
14  *  any later version.
15  *
16  *  In addition, as a special exception, the copyright holders give permission
17  *  to link the code of portions of this program with the OpenSSL library under
18  *  certain conditions as described in each individual source file, and
19  *  distribute linked combinations including the two. You must obey the GNU
20  *  General Public License in all respects for all of the code used other than
21  *  OpenSSL. If you modify file(s) with this exception, you may extend this
22  *  exception to your version of the file(s), but you are not obligated to do
23  *  so. If you do not wish to do so, delete this exception statement from your
24  *  version.  If you delete this exception statement from all source files in
25  *  the program, then also delete it here.
26  *
27  *  Slurm is distributed in the hope that it will be useful, but WITHOUT ANY
28  *  WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
29  *  FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
30  *  details.
31  *
32  *  You should have received a copy of the GNU General Public License along
33  *  with Slurm; if not, write to the Free Software Foundation, Inc.,
34  *  51 Franklin Street, Fifth Floor, Boston, MA 02110-1301  USA.
35 \*****************************************************************************/
36 
37 #include "config.h"
38 
39 #include <stdlib.h>
40 #include <unistd.h>
41 
42 #include <slurm.h>
43 #include "src/common/macros.h"
44 #include "src/common/read_config.h"
45 #include "src/common/slurm_protocol_defs.h"
46 #include "src/common/slurm_time.h"
47 #include "src/common/uid.h"
48 #include "src/common/xmalloc.h"
49 #include "src/common/xstring.h"
50 
51 #include "sdiag.h"
52 
53 /********************
54  * Global Variables *
55  ********************/
56 struct sdiag_parameters params;
57 
58 stats_info_response_msg_t *buf;
59 uint32_t *rpc_type_ave_time = NULL, *rpc_user_ave_time = NULL;
60 
61 static int  _print_stats(void);
62 static void _sort_rpc(void);
63 
64 stats_info_request_msg_t req;
65 
66 extern void parse_command_line(int argc, char **argv);
67 
main(int argc,char ** argv)68 int main(int argc, char **argv)
69 {
70 	int rc = 0;
71 
72 	slurm_conf_init(NULL);
73 	parse_command_line(argc, argv);
74 
75 	if (params.mode == STAT_COMMAND_RESET) {
76 		req.command_id = STAT_COMMAND_RESET;
77 		rc = slurm_reset_statistics((stats_info_request_msg_t *)&req);
78 		if (rc == SLURM_SUCCESS)
79 			printf("Reset scheduling statistics\n");
80 		else
81 			slurm_perror("slurm_reset_statistics");
82 	} else {
83 		req.command_id = STAT_COMMAND_GET;
84 		rc = slurm_get_statistics(&buf,
85 					  (stats_info_request_msg_t *)&req);
86 		if (rc == SLURM_SUCCESS) {
87 			_sort_rpc();
88 			rc = _print_stats();
89 #ifdef MEMORY_LEAK_DEBUG
90 			slurm_free_stats_response_msg(buf);
91 			xfree(rpc_type_ave_time);
92 			xfree(rpc_user_ave_time);
93 #endif
94 		} else
95 			slurm_perror("slurm_get_statistics");
96 	}
97 
98 	exit(rc);
99 }
100 
_print_stats(void)101 static int _print_stats(void)
102 {
103 	int i;
104 
105 	if (!buf) {
106 		printf("No data available. Probably slurmctld is not working\n");
107 		return -1;
108 	}
109 
110 	printf("*******************************************************\n");
111 	printf("sdiag output at %s (%ld)\n",
112 	       slurm_ctime2(&buf->req_time), buf->req_time);
113 	printf("Data since      %s (%ld)\n",
114 	       slurm_ctime2(&buf->req_time_start), buf->req_time_start);
115 	printf("*******************************************************\n");
116 
117 	printf("Server thread count:  %d\n", buf->server_thread_count);
118 	printf("Agent queue size:     %d\n", buf->agent_queue_size);
119 	printf("Agent count:          %d\n", buf->agent_count);
120 	printf("Agent thread count:   %d\n", buf->agent_thread_count);
121 	printf("DBD Agent queue size: %d\n\n", buf->dbd_agent_queue_size);
122 
123 	printf("Jobs submitted: %d\n", buf->jobs_submitted);
124 	printf("Jobs started:   %d\n", buf->jobs_started);
125 	printf("Jobs completed: %d\n", buf->jobs_completed);
126 	printf("Jobs canceled:  %d\n", buf->jobs_canceled);
127 	printf("Jobs failed:    %d\n\n", buf->jobs_failed);
128 
129 	printf("Job states ts:  %s (%ld)\n",
130 	       slurm_ctime2(&buf->job_states_ts), buf->job_states_ts);
131 	printf("Jobs pending:   %d\n", buf->jobs_pending);
132 	printf("Jobs running:   %d\n", buf->jobs_running);
133 
134 	printf("\nMain schedule statistics (microseconds):\n");
135 	printf("\tLast cycle:   %u\n", buf->schedule_cycle_last);
136 	printf("\tMax cycle:    %u\n", buf->schedule_cycle_max);
137 	printf("\tTotal cycles: %u\n", buf->schedule_cycle_counter);
138 	if (buf->schedule_cycle_counter > 0) {
139 		printf("\tMean cycle:   %u\n",
140 		       buf->schedule_cycle_sum / buf->schedule_cycle_counter);
141 		printf("\tMean depth cycle:  %u\n",
142 		       buf->schedule_cycle_depth / buf->schedule_cycle_counter);
143 	}
144 	if ((buf->req_time - buf->req_time_start) > 60) {
145 		printf("\tCycles per minute: %u\n",
146 		       (uint32_t) (buf->schedule_cycle_counter /
147 		       ((buf->req_time - buf->req_time_start) / 60)));
148 	}
149 	printf("\tLast queue length: %u\n", buf->schedule_queue_len);
150 
151 	if (buf->bf_active) {
152 		printf("\nBackfilling stats (WARNING: data obtained"
153 		       " in the middle of backfilling execution.)\n");
154 	} else
155 		printf("\nBackfilling stats\n");
156 
157 	printf("\tTotal backfilled jobs (since last slurm start): %u\n",
158 	       buf->bf_backfilled_jobs);
159 	printf("\tTotal backfilled jobs (since last stats cycle start): %u\n",
160 	       buf->bf_last_backfilled_jobs);
161 	printf("\tTotal backfilled heterogeneous job components: %u\n",
162 	       buf->bf_backfilled_het_jobs);
163 	printf("\tTotal cycles: %u\n", buf->bf_cycle_counter);
164 	if (buf->bf_when_last_cycle > 0) {
165 		printf("\tLast cycle when: %s (%ld)\n",
166 		       slurm_ctime2(&buf->bf_when_last_cycle),
167 		       buf->bf_when_last_cycle);
168 	} else {
169 		printf("\tLast cycle when: N/A\n");
170 	}
171 	printf("\tLast cycle: %u\n", buf->bf_cycle_last);
172 	printf("\tMax cycle:  %u\n", buf->bf_cycle_max);
173 	if (buf->bf_cycle_counter > 0) {
174 		printf("\tMean cycle: %"PRIu64"\n",
175 		       buf->bf_cycle_sum / buf->bf_cycle_counter);
176 	}
177 	printf("\tLast depth cycle: %u\n", buf->bf_last_depth);
178 	printf("\tLast depth cycle (try sched): %u\n", buf->bf_last_depth_try);
179 	if (buf->bf_cycle_counter > 0) {
180 		printf("\tDepth Mean: %u\n",
181 		       buf->bf_depth_sum / buf->bf_cycle_counter);
182 		printf("\tDepth Mean (try depth): %u\n",
183 		       buf->bf_depth_try_sum / buf->bf_cycle_counter);
184 	}
185 	printf("\tLast queue length: %u\n", buf->bf_queue_len);
186 	if (buf->bf_cycle_counter > 0) {
187 		printf("\tQueue length mean: %u\n",
188 		       buf->bf_queue_len_sum / buf->bf_cycle_counter);
189 	}
190 	printf("\tLast table size: %u\n", buf->bf_table_size);
191 	if (buf->bf_cycle_counter > 0) {
192 		printf("\tMean table size: %u\n",
193 		       buf->bf_table_size_sum / buf->bf_cycle_counter);
194 	}
195 
196 	printf("\nLatency for 1000 calls to gettimeofday(): %d microseconds\n",
197 	       buf->gettimeofday_latency);
198 
199 	printf("\nRemote Procedure Call statistics by message type\n");
200 	for (i = 0; i < buf->rpc_type_size; i++) {
201 		printf("\t%-40s(%5u) count:%-6u "
202 		       "ave_time:%-6u total_time:%"PRIu64"\n",
203 		       rpc_num2string(buf->rpc_type_id[i]),
204 		       buf->rpc_type_id[i], buf->rpc_type_cnt[i],
205 		       rpc_type_ave_time[i], buf->rpc_type_time[i]);
206 	}
207 
208 	printf("\nRemote Procedure Call statistics by user\n");
209 	for (i = 0; i < buf->rpc_user_size; i++) {
210 		char *user = uid_to_string_or_null(buf->rpc_user_id[i]);
211 		if (!user)
212 			xstrfmtcat(user, "%u", buf->rpc_user_id[i]);
213 
214 		printf("\t%-16s(%8u) count:%-6u "
215 		       "ave_time:%-6u total_time:%"PRIu64"\n",
216 		       user, buf->rpc_user_id[i], buf->rpc_user_cnt[i],
217 		       rpc_user_ave_time[i], buf->rpc_user_time[i]);
218 
219 		xfree(user);
220 	}
221 
222 	printf("\nPending RPC statistics\n");
223 	if (buf->rpc_queue_type_count == 0)
224 		printf("\tNo pending RPCs\n");
225 	for (i = 0; i < buf->rpc_queue_type_count; i++){
226 		printf("\t%-40s(%5u) count:%-6u\n",
227 		       rpc_num2string(buf->rpc_queue_type_id[i]),
228 		       buf->rpc_queue_type_id[i],
229 		       buf->rpc_queue_count[i]);
230 	}
231 
232 	if (buf->rpc_dump_count > 0) {
233 		printf("\nPending RPCs\n");
234 	}
235 
236 	for (i = 0; i < buf->rpc_dump_count; i++) {
237 		printf("\t%2u: %-36s %s\n",
238 		       i+1,
239 		       rpc_num2string(buf->rpc_dump_types[i]),
240 		       buf->rpc_dump_hostlist[i]);
241 	}
242 
243 	return 0;
244 }
245 
_sort_rpc(void)246 static void _sort_rpc(void)
247 {
248 	int i, j;
249 	uint16_t type_id;
250 	uint32_t type_ave, type_cnt, user_ave, user_cnt, user_id;
251 	uint64_t type_time, user_time;
252 
253 	rpc_type_ave_time = xmalloc(sizeof(uint32_t) * buf->rpc_type_size);
254 	rpc_user_ave_time = xmalloc(sizeof(uint32_t) * buf->rpc_user_size);
255 
256 	if (params.sort == SORT_ID) {
257 		for (i = 0; i < buf->rpc_type_size; i++) {
258 			for (j = i+1; j < buf->rpc_type_size; j++) {
259 				if (buf->rpc_type_id[i] <= buf->rpc_type_id[j])
260 					continue;
261 				type_id   = buf->rpc_type_id[i];
262 				type_cnt  = buf->rpc_type_cnt[i];
263 				type_time = buf->rpc_type_time[i];
264 				buf->rpc_type_id[i]   = buf->rpc_type_id[j];
265 				buf->rpc_type_cnt[i]  = buf->rpc_type_cnt[j];
266 				buf->rpc_type_time[i] = buf->rpc_type_time[j];
267 				buf->rpc_type_id[j]   = type_id;
268 				buf->rpc_type_cnt[j]  = type_cnt;
269 				buf->rpc_type_time[j] = type_time;
270 			}
271 			if (buf->rpc_type_cnt[i]) {
272 				rpc_type_ave_time[i] = buf->rpc_type_time[i] /
273 						       buf->rpc_type_cnt[i];
274 			}
275 		}
276 		for (i = 0; i < buf->rpc_user_size; i++) {
277 			for (j = i+1; j < buf->rpc_user_size; j++) {
278 				if (buf->rpc_user_id[i] <= buf->rpc_user_id[j])
279 					continue;
280 				user_id   = buf->rpc_user_id[i];
281 				user_cnt  = buf->rpc_user_cnt[i];
282 				user_time = buf->rpc_user_time[i];
283 				buf->rpc_user_id[i]   = buf->rpc_user_id[j];
284 				buf->rpc_user_cnt[i]  = buf->rpc_user_cnt[j];
285 				buf->rpc_user_time[i] = buf->rpc_user_time[j];
286 				buf->rpc_user_id[j]   = user_id;
287 				buf->rpc_user_cnt[j]  = user_cnt;
288 				buf->rpc_user_time[j] = user_time;
289 			}
290 			if (buf->rpc_user_cnt[i]) {
291 				rpc_user_ave_time[i] = buf->rpc_user_time[i] /
292 						       buf->rpc_user_cnt[i];
293 			}
294 		}
295 	} else if (params.sort == SORT_TIME) {
296 		for (i = 0; i < buf->rpc_type_size; i++) {
297 			for (j = i+1; j < buf->rpc_type_size; j++) {
298 				if (buf->rpc_type_time[i] >= buf->rpc_type_time[j])
299 					continue;
300 				type_id   = buf->rpc_type_id[i];
301 				type_cnt  = buf->rpc_type_cnt[i];
302 				type_time = buf->rpc_type_time[i];
303 				buf->rpc_type_id[i]   = buf->rpc_type_id[j];
304 				buf->rpc_type_cnt[i]  = buf->rpc_type_cnt[j];
305 				buf->rpc_type_time[i] = buf->rpc_type_time[j];
306 				buf->rpc_type_id[j]   = type_id;
307 				buf->rpc_type_cnt[j]  = type_cnt;
308 				buf->rpc_type_time[j] = type_time;
309 			}
310 			if (buf->rpc_type_cnt[i]) {
311 				rpc_type_ave_time[i] = buf->rpc_type_time[i] /
312 						       buf->rpc_type_cnt[i];
313 			}
314 		}
315 		for (i = 0; i < buf->rpc_user_size; i++) {
316 			for (j = i+1; j < buf->rpc_user_size; j++) {
317 				if (buf->rpc_user_time[i] >= buf->rpc_user_time[j])
318 					continue;
319 				user_id   = buf->rpc_user_id[i];
320 				user_cnt  = buf->rpc_user_cnt[i];
321 				user_time = buf->rpc_user_time[i];
322 				buf->rpc_user_id[i]   = buf->rpc_user_id[j];
323 				buf->rpc_user_cnt[i]  = buf->rpc_user_cnt[j];
324 				buf->rpc_user_time[i] = buf->rpc_user_time[j];
325 				buf->rpc_user_id[j]   = user_id;
326 				buf->rpc_user_cnt[j]  = user_cnt;
327 				buf->rpc_user_time[j] = user_time;
328 			}
329 			if (buf->rpc_user_cnt[i]) {
330 				rpc_user_ave_time[i] = buf->rpc_user_time[i] /
331 						       buf->rpc_user_cnt[i];
332 			}
333 		}
334 	} else if (params.sort == SORT_TIME2) {
335 		for (i = 0; i < buf->rpc_type_size; i++) {
336 			if (buf->rpc_type_cnt[i]) {
337 				rpc_type_ave_time[i] = buf->rpc_type_time[i] /
338 						       buf->rpc_type_cnt[i];
339 			}
340 		}
341 		for (i = 0; i < buf->rpc_type_size; i++) {
342 			for (j = i+1; j < buf->rpc_type_size; j++) {
343 				if (rpc_type_ave_time[i] >= rpc_type_ave_time[j])
344 					continue;
345 				type_ave  = rpc_type_ave_time[i];
346 				type_id   = buf->rpc_type_id[i];
347 				type_cnt  = buf->rpc_type_cnt[i];
348 				type_time = buf->rpc_type_time[i];
349 				rpc_type_ave_time[i]  = rpc_type_ave_time[j];
350 				buf->rpc_type_id[i]   = buf->rpc_type_id[j];
351 				buf->rpc_type_cnt[i]  = buf->rpc_type_cnt[j];
352 				buf->rpc_type_time[i] = buf->rpc_type_time[j];
353 				rpc_type_ave_time[j]  = type_ave;
354 				buf->rpc_type_id[j]   = type_id;
355 				buf->rpc_type_cnt[j]  = type_cnt;
356 				buf->rpc_type_time[j] = type_time;
357 			}
358 		}
359 		for (i = 0; i < buf->rpc_user_size; i++) {
360 			if (buf->rpc_user_cnt[i]) {
361 				rpc_user_ave_time[i] = buf->rpc_user_time[i] /
362 						       buf->rpc_user_cnt[i];
363 			}
364 		}
365 		for (i = 0; i < buf->rpc_user_size; i++) {
366 			for (j = i+1; j < buf->rpc_user_size; j++) {
367 				if (rpc_user_ave_time[i] >= rpc_user_ave_time[j])
368 					continue;
369 				user_ave  = rpc_user_ave_time[i];
370 				user_id   = buf->rpc_user_id[i];
371 				user_cnt  = buf->rpc_user_cnt[i];
372 				user_time = buf->rpc_user_time[i];
373 				rpc_user_ave_time[i]  = rpc_user_ave_time[j];
374 				buf->rpc_user_id[i]   = buf->rpc_user_id[j];
375 				buf->rpc_user_cnt[i]  = buf->rpc_user_cnt[j];
376 				buf->rpc_user_time[i] = buf->rpc_user_time[j];
377 				rpc_user_ave_time[j]  = user_ave;
378 				buf->rpc_user_id[j]   = user_id;
379 				buf->rpc_user_cnt[j]  = user_cnt;
380 				buf->rpc_user_time[j] = user_time;
381 			}
382 		}
383 	} else { /* sort by count */
384 		for (i = 0; i < buf->rpc_type_size; i++) {
385 			for (j = i+1; j < buf->rpc_type_size; j++) {
386 				if (buf->rpc_type_cnt[i] >= buf->rpc_type_cnt[j])
387 					continue;
388 				type_id   = buf->rpc_type_id[i];
389 				type_cnt  = buf->rpc_type_cnt[i];
390 				type_time = buf->rpc_type_time[i];
391 				buf->rpc_type_id[i]   = buf->rpc_type_id[j];
392 				buf->rpc_type_cnt[i]  = buf->rpc_type_cnt[j];
393 				buf->rpc_type_time[i] = buf->rpc_type_time[j];
394 				buf->rpc_type_id[j]   = type_id;
395 				buf->rpc_type_cnt[j]  = type_cnt;
396 				buf->rpc_type_time[j] = type_time;
397 			}
398 			if (buf->rpc_type_cnt[i]) {
399 				rpc_type_ave_time[i] = buf->rpc_type_time[i] /
400 						       buf->rpc_type_cnt[i];
401 			}
402 		}
403 		for (i = 0; i < buf->rpc_user_size; i++) {
404 			for (j = i+1; j < buf->rpc_user_size; j++) {
405 				if (buf->rpc_user_cnt[i] >= buf->rpc_user_cnt[j])
406 					continue;
407 				user_id   = buf->rpc_user_id[i];
408 				user_cnt  = buf->rpc_user_cnt[i];
409 				user_time = buf->rpc_user_time[i];
410 				buf->rpc_user_id[i]   = buf->rpc_user_id[j];
411 				buf->rpc_user_cnt[i]  = buf->rpc_user_cnt[j];
412 				buf->rpc_user_time[i] = buf->rpc_user_time[j];
413 				buf->rpc_user_id[j]   = user_id;
414 				buf->rpc_user_cnt[j]  = user_cnt;
415 				buf->rpc_user_time[j] = user_time;
416 			}
417 			if (buf->rpc_user_cnt[i]) {
418 				rpc_user_ave_time[i] = buf->rpc_user_time[i] /
419 						       buf->rpc_user_cnt[i];
420 			}
421 		}
422 	}
423 }
424