1 /*
2  * Gather top-level ZFS pool and resilver/scan statistics and print using
3  * influxdb line protocol
4  * usage: [options] [pool_name]
5  * where options are:
6  *   --execd, -e           run in telegraf execd input plugin mode, [CR] on
7  *                         stdin causes a sample to be printed and wait for
8  *                         the next [CR]
9  *   --no-histograms, -n   don't print histogram data (reduces cardinality
10  *                         if you don't care about histograms)
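 *   --sum-histogram-buckets, -s sum histogram bucket values
 *   --signed-int, -i      print integer fields as signed ('i' suffix)
 *                         instead of unsigned ('u' suffix)
 *   --tags, -t string     add the given string of tags to every data point
 *   --help, -h            print the usage message and exit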
12  *
13  * To integrate into telegraf use one of:
14  * 1. the `inputs.execd` plugin with the `--execd` option
15  * 2. the `inputs.exec` plugin to simply run with no options
16  *
17  * NOTE: libzfs is an unstable interface. YMMV.
18  *
19  * The design goals of this software include:
20  * + be as lightweight as possible
21  * + reduce the number of external dependencies as far as possible, hence
22  *   there is no dependency on a client library for managing the metric
23  *   collection -- info is printed, KISS
24  * + broken pools or kernel bugs can cause this process to hang in an
25  *   unkillable state. For this reason, it is best to keep the damage limited
26  *   to a small process like zpool_influxdb rather than a larger collector.
27  *
28  * Copyright 2018-2020 Richard Elling
29  *
30  * This software is dual-licensed MIT and CDDL.
31  *
32  * The MIT License (MIT)
33  *
34  * Permission is hereby granted, free of charge, to any person obtaining a copy
35  * of this software and associated documentation files (the "Software"), to deal
36  * in the Software without restriction, including without limitation the rights
37  * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
38  * copies of the Software, and to permit persons to whom the Software is
39  * furnished to do so, subject to the following conditions:
40  *
41  * The above copyright notice and this permission notice shall be included in
42  * all copies or substantial portions of the Software.
43  *
44  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
45  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
46  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
47  * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
48  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
49  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
50  * SOFTWARE.
51  *
52  * CDDL HEADER START
53  *
54  * The contents of this file are subject to the terms of the
55  * Common Development and Distribution License (the "License").
56  * You may not use this file except in compliance with the License.
57  *
58  * The contents of this file are subject to the terms of the
59  * Common Development and Distribution License Version 1.0 (CDDL-1.0).
60  * You can obtain a copy of the license from the top-level file
61  * "OPENSOLARIS.LICENSE" or at <http://opensource.org/licenses/CDDL-1.0>.
62  * You may not use this file except in compliance with the license.
63  *
64  * See the License for the specific language governing permissions
65  * and limitations under the License.
66  *
67  * CDDL HEADER END
68  */
69 #include <string.h>
70 #include <getopt.h>
71 #include <stdio.h>
72 #include <stdint.h>
73 #include <inttypes.h>
74 #include <libzfs.h>
75 
/* influxdb measurement names, one for each kind of record that is printed */
#define	POOL_MEASUREMENT	"zpool_stats"
#define	SCAN_MEASUREMENT	"zpool_scan_stats"
#define	VDEV_MEASUREMENT	"zpool_vdev_stats"
#define	POOL_LATENCY_MEASUREMENT	"zpool_latency"
#define	POOL_QUEUE_MEASUREMENT	"zpool_vdev_queue"
#define	MIN_LAT_INDEX	10  /* minimum latency index 10 = 1024ns */
#define	POOL_IO_SIZE_MEASUREMENT	"zpool_io_size"
#define	MIN_SIZE_INDEX	9  /* minimum size index 9 = 512 bytes */

/* global options */
/* set by --execd: print one sample for each line read from stdin */
int execd_mode = 0;
/* set by --no-histograms: skip latency, size and queue records */
int no_histograms = 0;
/* set by --sum-histogram-buckets: printed buckets accumulate lower buckets */
int sum_histogram_buckets = 0;
/* type suffix appended to integer fields; 'i' when --signed-int is given */
char metric_data_type = 'u';
/* ANDed with every integer field; INT64_MAX when --signed-int is given */
uint64_t metric_value_mask = UINT64_MAX;
/* nanosecond timestamp of the current sample, set in print_stats() */
uint64_t timestamp = 0;
/* counter used by print_scan_status() to limit its decode complaint */
int complained_about_sync = 0;
/* extra tags from --tags, stored with a leading ','; empty by default */
char *tags = "";

/*
 * per-vdev printer used with print_recursive_stats(); the arguments are
 * the vdev nvlist, the escaped pool name and the parent vdev name (or NULL)
 */
typedef int (*stat_printer_f)(nvlist_t *, const char *, const char *);
96 
/*
 * influxdb line protocol rules for escaping are important because the
 * zpool name can include characters that need to be escaped
 *
 * A backslash is inserted in front of every space, comma, equals sign
 * and backslash found in s. The result buffer is sized from the input,
 * so inputs longer than a dataset name (vdev paths, nested vdev names)
 * are handled without overrunning the allocation.
 *
 * The process exits with status 1 if memory cannot be allocated.
 *
 * caller is responsible for freeing result
 */
static char *
escape_string(const char *s)
{
	const char *c;
	char *d;
	/* worst case: every character is escaped, plus the terminating NUL */
	char *t = malloc(strlen(s) * 2 + 1);

	if (t == NULL) {
		fprintf(stderr, "error: cannot allocate memory\n");
		exit(1);
	}

	for (c = s, d = t; *c != '\0'; c++) {
		if (*c == ' ' || *c == ',' || *c == '=' || *c == '\\')
			*d++ = '\\';
		*d++ = *c;
	}
	*d = '\0';
	return (t);
}
129 
130 /*
131  * print key=value where value is a uint64_t
132  */
133 static void
134 print_kv(char *key, uint64_t value)
135 {
136 	printf("%s=%llu%c", key,
137 	    (u_longlong_t)value & metric_value_mask, metric_data_type);
138 }
139 
140 /*
141  * print_scan_status() prints the details as often seen in the "zpool status"
142  * output. However, unlike the zpool command, which is intended for humans,
143  * this output is suitable for long-term tracking in influxdb.
144  * TODO: update to include issued scan data
145  */
146 static int
147 print_scan_status(nvlist_t *nvroot, const char *pool_name)
148 {
149 	uint_t c;
150 	int64_t elapsed;
151 	uint64_t examined, pass_exam, paused_time, paused_ts, rate;
152 	uint64_t remaining_time;
153 	pool_scan_stat_t *ps = NULL;
154 	double pct_done;
155 	char *state[DSS_NUM_STATES] = {
156 	    "none", "scanning", "finished", "canceled"};
157 	char *func;
158 
159 	(void) nvlist_lookup_uint64_array(nvroot,
160 	    ZPOOL_CONFIG_SCAN_STATS,
161 	    (uint64_t **)&ps, &c);
162 
163 	/*
164 	 * ignore if there are no stats
165 	 */
166 	if (ps == NULL)
167 		return (0);
168 
169 	/*
170 	 * return error if state is bogus
171 	 */
172 	if (ps->pss_state >= DSS_NUM_STATES ||
173 	    ps->pss_func >= POOL_SCAN_FUNCS) {
174 		if (complained_about_sync % 1000 == 0) {
175 			fprintf(stderr, "error: cannot decode scan stats: "
176 			    "ZFS is out of sync with compiled zpool_influxdb");
177 			complained_about_sync++;
178 		}
179 		return (1);
180 	}
181 
182 	switch (ps->pss_func) {
183 	case POOL_SCAN_NONE:
184 		func = "none_requested";
185 		break;
186 	case POOL_SCAN_SCRUB:
187 		func = "scrub";
188 		break;
189 	case POOL_SCAN_RESILVER:
190 		func = "resilver";
191 		break;
192 #ifdef POOL_SCAN_REBUILD
193 	case POOL_SCAN_REBUILD:
194 		func = "rebuild";
195 		break;
196 #endif
197 	default:
198 		func = "scan";
199 	}
200 
201 	/* overall progress */
202 	examined = ps->pss_examined ? ps->pss_examined : 1;
203 	pct_done = 0.0;
204 	if (ps->pss_to_examine > 0)
205 		pct_done = 100.0 * examined / ps->pss_to_examine;
206 
207 #ifdef EZFS_SCRUB_PAUSED
208 	paused_ts = ps->pss_pass_scrub_pause;
209 	paused_time = ps->pss_pass_scrub_spent_paused;
210 #else
211 	paused_ts = 0;
212 	paused_time = 0;
213 #endif
214 
215 	/* calculations for this pass */
216 	if (ps->pss_state == DSS_SCANNING) {
217 		elapsed = (int64_t)time(NULL) - (int64_t)ps->pss_pass_start -
218 		    (int64_t)paused_time;
219 		elapsed = (elapsed > 0) ? elapsed : 1;
220 		pass_exam = ps->pss_pass_exam ? ps->pss_pass_exam : 1;
221 		rate = pass_exam / elapsed;
222 		rate = (rate > 0) ? rate : 1;
223 		remaining_time = ps->pss_to_examine - examined / rate;
224 	} else {
225 		elapsed =
226 		    (int64_t)ps->pss_end_time - (int64_t)ps->pss_pass_start -
227 		    (int64_t)paused_time;
228 		elapsed = (elapsed > 0) ? elapsed : 1;
229 		pass_exam = ps->pss_pass_exam ? ps->pss_pass_exam : 1;
230 		rate = pass_exam / elapsed;
231 		remaining_time = 0;
232 	}
233 	rate = rate ? rate : 1;
234 
235 	/* influxdb line protocol format: "tags metrics timestamp" */
236 	printf("%s%s,function=%s,name=%s,state=%s ",
237 	    SCAN_MEASUREMENT, tags, func, pool_name, state[ps->pss_state]);
238 	print_kv("end_ts", ps->pss_end_time);
239 	print_kv(",errors", ps->pss_errors);
240 	print_kv(",examined", examined);
241 	print_kv(",issued", ps->pss_issued);
242 	print_kv(",pass_examined", pass_exam);
243 	print_kv(",pass_issued", ps->pss_pass_issued);
244 	print_kv(",paused_ts", paused_ts);
245 	print_kv(",paused_t", paused_time);
246 	printf(",pct_done=%.2f", pct_done);
247 	print_kv(",processed", ps->pss_processed);
248 	print_kv(",rate", rate);
249 	print_kv(",remaining_t", remaining_time);
250 	print_kv(",start_ts", ps->pss_start_time);
251 	print_kv(",to_examine", ps->pss_to_examine);
252 	print_kv(",to_process", ps->pss_to_process);
253 	printf(" %llu\n", (u_longlong_t)timestamp);
254 	return (0);
255 }
256 
257 /*
258  * get a vdev name that corresponds to the top-level vdev names
259  * printed by `zpool status`
260  */
261 static char *
262 get_vdev_name(nvlist_t *nvroot, const char *parent_name)
263 {
264 	static char vdev_name[256];
265 	char *vdev_type = NULL;
266 	uint64_t vdev_id = 0;
267 
268 	if (nvlist_lookup_string(nvroot, ZPOOL_CONFIG_TYPE,
269 	    &vdev_type) != 0) {
270 		vdev_type = "unknown";
271 	}
272 	if (nvlist_lookup_uint64(
273 	    nvroot, ZPOOL_CONFIG_ID, &vdev_id) != 0) {
274 		vdev_id = UINT64_MAX;
275 	}
276 	if (parent_name == NULL) {
277 		(void) snprintf(vdev_name, sizeof (vdev_name), "%s",
278 		    vdev_type);
279 	} else {
280 		(void) snprintf(vdev_name, sizeof (vdev_name),
281 		    "%.220s/%s-%llu",
282 		    parent_name, vdev_type, (u_longlong_t)vdev_id);
283 	}
284 	return (vdev_name);
285 }
286 
287 /*
288  * get a string suitable for an influxdb tag that describes this vdev
289  *
290  * By default only the vdev hierarchical name is shown, separated by '/'
291  * If the vdev has an associated path, which is typical of leaf vdevs,
292  * then the path is added.
293  * It would be nice to have the devid instead of the path, but under
294  * Linux we cannot be sure a devid will exist and we'd rather have
295  * something than nothing, so we'll use path instead.
296  */
297 static char *
298 get_vdev_desc(nvlist_t *nvroot, const char *parent_name)
299 {
300 	static char vdev_desc[2 * MAXPATHLEN];
301 	char *vdev_type = NULL;
302 	uint64_t vdev_id = 0;
303 	char vdev_value[MAXPATHLEN];
304 	char *vdev_path = NULL;
305 	char *s, *t;
306 
307 	if (nvlist_lookup_string(nvroot, ZPOOL_CONFIG_TYPE, &vdev_type) != 0) {
308 		vdev_type = "unknown";
309 	}
310 	if (nvlist_lookup_uint64(nvroot, ZPOOL_CONFIG_ID, &vdev_id) != 0) {
311 		vdev_id = UINT64_MAX;
312 	}
313 	if (nvlist_lookup_string(
314 	    nvroot, ZPOOL_CONFIG_PATH, &vdev_path) != 0) {
315 		vdev_path = NULL;
316 	}
317 
318 	if (parent_name == NULL) {
319 		s = escape_string(vdev_type);
320 		(void) snprintf(vdev_value, sizeof (vdev_value), "vdev=%s", s);
321 		free(s);
322 	} else {
323 		s = escape_string((char *)parent_name);
324 		t = escape_string(vdev_type);
325 		(void) snprintf(vdev_value, sizeof (vdev_value),
326 		    "vdev=%s/%s-%llu", s, t, (u_longlong_t)vdev_id);
327 		free(s);
328 		free(t);
329 	}
330 	if (vdev_path == NULL) {
331 		(void) snprintf(vdev_desc, sizeof (vdev_desc), "%s",
332 		    vdev_value);
333 	} else {
334 		s = escape_string(vdev_path);
335 		(void) snprintf(vdev_desc, sizeof (vdev_desc), "path=%s,%s",
336 		    s, vdev_value);
337 		free(s);
338 	}
339 	return (vdev_desc);
340 }
341 
342 /*
343  * vdev summary stats are a combination of the data shown by
344  * `zpool status` and `zpool list -v`
345  */
346 static int
347 print_summary_stats(nvlist_t *nvroot, const char *pool_name,
348     const char *parent_name)
349 {
350 	uint_t c;
351 	vdev_stat_t *vs;
352 	char *vdev_desc = NULL;
353 	vdev_desc = get_vdev_desc(nvroot, parent_name);
354 	if (nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS,
355 	    (uint64_t **)&vs, &c) != 0) {
356 		return (1);
357 	}
358 	printf("%s%s,name=%s,state=%s,%s ", POOL_MEASUREMENT, tags,
359 	    pool_name, zpool_state_to_name((vdev_state_t)vs->vs_state,
360 	    (vdev_aux_t)vs->vs_aux), vdev_desc);
361 	print_kv("alloc", vs->vs_alloc);
362 	print_kv(",free", vs->vs_space - vs->vs_alloc);
363 	print_kv(",size", vs->vs_space);
364 	print_kv(",read_bytes", vs->vs_bytes[ZIO_TYPE_READ]);
365 	print_kv(",read_errors", vs->vs_read_errors);
366 	print_kv(",read_ops", vs->vs_ops[ZIO_TYPE_READ]);
367 	print_kv(",write_bytes", vs->vs_bytes[ZIO_TYPE_WRITE]);
368 	print_kv(",write_errors", vs->vs_write_errors);
369 	print_kv(",write_ops", vs->vs_ops[ZIO_TYPE_WRITE]);
370 	print_kv(",checksum_errors", vs->vs_checksum_errors);
371 	print_kv(",fragmentation", vs->vs_fragmentation);
372 	printf(" %llu\n", (u_longlong_t)timestamp);
373 	return (0);
374 }
375 
376 /*
377  * vdev latency stats are histograms stored as nvlist arrays of uint64.
378  * Latency stats include the ZIO scheduler classes plus lower-level
379  * vdev latencies.
380  *
381  * In many cases, the top-level "root" view obscures the underlying
382  * top-level vdev operations. For example, if a pool has a log, special,
383  * or cache device, then each can behave very differently. It is useful
384  * to see how each is responding.
385  */
386 static int
387 print_vdev_latency_stats(nvlist_t *nvroot, const char *pool_name,
388     const char *parent_name)
389 {
390 	uint_t c, end = 0;
391 	nvlist_t *nv_ex;
392 	char *vdev_desc = NULL;
393 
394 	/* short_names become part of the metric name and are influxdb-ready */
395 	struct lat_lookup {
396 	    char *name;
397 	    char *short_name;
398 	    uint64_t sum;
399 	    uint64_t *array;
400 	};
401 	struct lat_lookup lat_type[] = {
402 	    {ZPOOL_CONFIG_VDEV_TOT_R_LAT_HISTO,   "total_read", 0},
403 	    {ZPOOL_CONFIG_VDEV_TOT_W_LAT_HISTO,   "total_write", 0},
404 	    {ZPOOL_CONFIG_VDEV_DISK_R_LAT_HISTO,  "disk_read", 0},
405 	    {ZPOOL_CONFIG_VDEV_DISK_W_LAT_HISTO,  "disk_write", 0},
406 	    {ZPOOL_CONFIG_VDEV_SYNC_R_LAT_HISTO,  "sync_read", 0},
407 	    {ZPOOL_CONFIG_VDEV_SYNC_W_LAT_HISTO,  "sync_write", 0},
408 	    {ZPOOL_CONFIG_VDEV_ASYNC_R_LAT_HISTO, "async_read", 0},
409 	    {ZPOOL_CONFIG_VDEV_ASYNC_W_LAT_HISTO, "async_write", 0},
410 	    {ZPOOL_CONFIG_VDEV_SCRUB_LAT_HISTO,   "scrub", 0},
411 #ifdef ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO
412 	    {ZPOOL_CONFIG_VDEV_TRIM_LAT_HISTO,    "trim", 0},
413 #endif
414 	    {ZPOOL_CONFIG_VDEV_REBUILD_LAT_HISTO,    "rebuild", 0},
415 	    {NULL,	NULL}
416 	};
417 
418 	if (nvlist_lookup_nvlist(nvroot,
419 	    ZPOOL_CONFIG_VDEV_STATS_EX, &nv_ex) != 0) {
420 		return (6);
421 	}
422 
423 	vdev_desc = get_vdev_desc(nvroot, parent_name);
424 
425 	for (int i = 0; lat_type[i].name; i++) {
426 		if (nvlist_lookup_uint64_array(nv_ex,
427 		    lat_type[i].name, &lat_type[i].array, &c) != 0) {
428 			fprintf(stderr, "error: can't get %s\n",
429 			    lat_type[i].name);
430 			return (3);
431 		}
432 		/* end count count, all of the arrays are the same size */
433 		end = c - 1;
434 	}
435 
436 	for (int bucket = 0; bucket <= end; bucket++) {
437 		if (bucket < MIN_LAT_INDEX) {
438 			/* don't print, but collect the sum */
439 			for (int i = 0; lat_type[i].name; i++) {
440 				lat_type[i].sum += lat_type[i].array[bucket];
441 			}
442 			continue;
443 		}
444 		if (bucket < end) {
445 			printf("%s%s,le=%0.6f,name=%s,%s ",
446 			    POOL_LATENCY_MEASUREMENT, tags,
447 			    (float)(1ULL << bucket) * 1e-9,
448 			    pool_name, vdev_desc);
449 		} else {
450 			printf("%s%s,le=+Inf,name=%s,%s ",
451 			    POOL_LATENCY_MEASUREMENT, tags, pool_name,
452 			    vdev_desc);
453 		}
454 		for (int i = 0; lat_type[i].name; i++) {
455 			if (bucket <= MIN_LAT_INDEX || sum_histogram_buckets) {
456 				lat_type[i].sum += lat_type[i].array[bucket];
457 			} else {
458 				lat_type[i].sum = lat_type[i].array[bucket];
459 			}
460 			print_kv(lat_type[i].short_name, lat_type[i].sum);
461 			if (lat_type[i + 1].name != NULL) {
462 				printf(",");
463 			}
464 		}
465 		printf(" %llu\n", (u_longlong_t)timestamp);
466 	}
467 	return (0);
468 }
469 
470 /*
471  * vdev request size stats are histograms stored as nvlist arrays of uint64.
472  * Request size stats include the ZIO scheduler classes plus lower-level
473  * vdev sizes. Both independent (ind) and aggregated (agg) sizes are reported.
474  *
475  * In many cases, the top-level "root" view obscures the underlying
476  * top-level vdev operations. For example, if a pool has a log, special,
477  * or cache device, then each can behave very differently. It is useful
478  * to see how each is responding.
479  */
480 static int
481 print_vdev_size_stats(nvlist_t *nvroot, const char *pool_name,
482     const char *parent_name)
483 {
484 	uint_t c, end = 0;
485 	nvlist_t *nv_ex;
486 	char *vdev_desc = NULL;
487 
488 	/* short_names become the field name */
489 	struct size_lookup {
490 	    char *name;
491 	    char *short_name;
492 	    uint64_t sum;
493 	    uint64_t *array;
494 	};
495 	struct size_lookup size_type[] = {
496 	    {ZPOOL_CONFIG_VDEV_SYNC_IND_R_HISTO,   "sync_read_ind"},
497 	    {ZPOOL_CONFIG_VDEV_SYNC_IND_W_HISTO,   "sync_write_ind"},
498 	    {ZPOOL_CONFIG_VDEV_ASYNC_IND_R_HISTO,  "async_read_ind"},
499 	    {ZPOOL_CONFIG_VDEV_ASYNC_IND_W_HISTO,  "async_write_ind"},
500 	    {ZPOOL_CONFIG_VDEV_IND_SCRUB_HISTO,    "scrub_read_ind"},
501 	    {ZPOOL_CONFIG_VDEV_SYNC_AGG_R_HISTO,   "sync_read_agg"},
502 	    {ZPOOL_CONFIG_VDEV_SYNC_AGG_W_HISTO,   "sync_write_agg"},
503 	    {ZPOOL_CONFIG_VDEV_ASYNC_AGG_R_HISTO,  "async_read_agg"},
504 	    {ZPOOL_CONFIG_VDEV_ASYNC_AGG_W_HISTO,  "async_write_agg"},
505 	    {ZPOOL_CONFIG_VDEV_AGG_SCRUB_HISTO,    "scrub_read_agg"},
506 #ifdef ZPOOL_CONFIG_VDEV_IND_TRIM_HISTO
507 	    {ZPOOL_CONFIG_VDEV_IND_TRIM_HISTO,    "trim_write_ind"},
508 	    {ZPOOL_CONFIG_VDEV_AGG_TRIM_HISTO,    "trim_write_agg"},
509 #endif
510 	    {ZPOOL_CONFIG_VDEV_IND_REBUILD_HISTO,    "rebuild_write_ind"},
511 	    {ZPOOL_CONFIG_VDEV_AGG_REBUILD_HISTO,    "rebuild_write_agg"},
512 	    {NULL,	NULL}
513 	};
514 
515 	if (nvlist_lookup_nvlist(nvroot,
516 	    ZPOOL_CONFIG_VDEV_STATS_EX, &nv_ex) != 0) {
517 		return (6);
518 	}
519 
520 	vdev_desc = get_vdev_desc(nvroot, parent_name);
521 
522 	for (int i = 0; size_type[i].name; i++) {
523 		if (nvlist_lookup_uint64_array(nv_ex, size_type[i].name,
524 		    &size_type[i].array, &c) != 0) {
525 			fprintf(stderr, "error: can't get %s\n",
526 			    size_type[i].name);
527 			return (3);
528 		}
529 		/* end count count, all of the arrays are the same size */
530 		end = c - 1;
531 	}
532 
533 	for (int bucket = 0; bucket <= end; bucket++) {
534 		if (bucket < MIN_SIZE_INDEX) {
535 			/* don't print, but collect the sum */
536 			for (int i = 0; size_type[i].name; i++) {
537 				size_type[i].sum += size_type[i].array[bucket];
538 			}
539 			continue;
540 		}
541 
542 		if (bucket < end) {
543 			printf("%s%s,le=%llu,name=%s,%s ",
544 			    POOL_IO_SIZE_MEASUREMENT, tags, 1ULL << bucket,
545 			    pool_name, vdev_desc);
546 		} else {
547 			printf("%s%s,le=+Inf,name=%s,%s ",
548 			    POOL_IO_SIZE_MEASUREMENT, tags, pool_name,
549 			    vdev_desc);
550 		}
551 		for (int i = 0; size_type[i].name; i++) {
552 			if (bucket <= MIN_SIZE_INDEX || sum_histogram_buckets) {
553 				size_type[i].sum += size_type[i].array[bucket];
554 			} else {
555 				size_type[i].sum = size_type[i].array[bucket];
556 			}
557 			print_kv(size_type[i].short_name, size_type[i].sum);
558 			if (size_type[i + 1].name != NULL) {
559 				printf(",");
560 			}
561 		}
562 		printf(" %llu\n", (u_longlong_t)timestamp);
563 	}
564 	return (0);
565 }
566 
567 /*
568  * ZIO scheduler queue stats are stored as gauges. This is unfortunate
569  * because the values can change very rapidly and any point-in-time
570  * value will quickly be obsoleted. It is also not easy to downsample.
571  * Thus only the top-level queue stats might be beneficial... maybe.
572  */
573 static int
574 print_queue_stats(nvlist_t *nvroot, const char *pool_name,
575     const char *parent_name)
576 {
577 	nvlist_t *nv_ex;
578 	uint64_t value;
579 
580 	/* short_names are used for the field name */
581 	struct queue_lookup {
582 	    char *name;
583 	    char *short_name;
584 	};
585 	struct queue_lookup queue_type[] = {
586 	    {ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE,	"sync_r_active"},
587 	    {ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE,	"sync_w_active"},
588 	    {ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE,	"async_r_active"},
589 	    {ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE,	"async_w_active"},
590 	    {ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE,	"async_scrub_active"},
591 	    {ZPOOL_CONFIG_VDEV_REBUILD_ACTIVE_QUEUE,	"rebuild_active"},
592 	    {ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE,	"sync_r_pend"},
593 	    {ZPOOL_CONFIG_VDEV_SYNC_W_PEND_QUEUE,	"sync_w_pend"},
594 	    {ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE,	"async_r_pend"},
595 	    {ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE,	"async_w_pend"},
596 	    {ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE,	"async_scrub_pend"},
597 	    {ZPOOL_CONFIG_VDEV_REBUILD_PEND_QUEUE,	"rebuild_pend"},
598 	    {NULL,	NULL}
599 	};
600 
601 	if (nvlist_lookup_nvlist(nvroot,
602 	    ZPOOL_CONFIG_VDEV_STATS_EX, &nv_ex) != 0) {
603 		return (6);
604 	}
605 
606 	printf("%s%s,name=%s,%s ", POOL_QUEUE_MEASUREMENT, tags, pool_name,
607 	    get_vdev_desc(nvroot, parent_name));
608 	for (int i = 0; queue_type[i].name; i++) {
609 		if (nvlist_lookup_uint64(nv_ex,
610 		    queue_type[i].name, &value) != 0) {
611 			fprintf(stderr, "error: can't get %s\n",
612 			    queue_type[i].name);
613 			return (3);
614 		}
615 		print_kv(queue_type[i].short_name, value);
616 		if (queue_type[i + 1].name != NULL) {
617 			printf(",");
618 		}
619 	}
620 	printf(" %llu\n", (u_longlong_t)timestamp);
621 	return (0);
622 }
623 
624 /*
625  * top-level vdev stats are at the pool level
626  */
627 static int
628 print_top_level_vdev_stats(nvlist_t *nvroot, const char *pool_name)
629 {
630 	nvlist_t *nv_ex;
631 	uint64_t value;
632 
633 	/* short_names become part of the metric name */
634 	struct queue_lookup {
635 	    char *name;
636 	    char *short_name;
637 	};
638 	struct queue_lookup queue_type[] = {
639 	    {ZPOOL_CONFIG_VDEV_SYNC_R_ACTIVE_QUEUE, "sync_r_active_queue"},
640 	    {ZPOOL_CONFIG_VDEV_SYNC_W_ACTIVE_QUEUE, "sync_w_active_queue"},
641 	    {ZPOOL_CONFIG_VDEV_ASYNC_R_ACTIVE_QUEUE, "async_r_active_queue"},
642 	    {ZPOOL_CONFIG_VDEV_ASYNC_W_ACTIVE_QUEUE, "async_w_active_queue"},
643 	    {ZPOOL_CONFIG_VDEV_SCRUB_ACTIVE_QUEUE, "async_scrub_active_queue"},
644 	    {ZPOOL_CONFIG_VDEV_REBUILD_ACTIVE_QUEUE, "rebuild_active_queue"},
645 	    {ZPOOL_CONFIG_VDEV_SYNC_R_PEND_QUEUE, "sync_r_pend_queue"},
646 	    {ZPOOL_CONFIG_VDEV_SYNC_W_PEND_QUEUE, "sync_w_pend_queue"},
647 	    {ZPOOL_CONFIG_VDEV_ASYNC_R_PEND_QUEUE, "async_r_pend_queue"},
648 	    {ZPOOL_CONFIG_VDEV_ASYNC_W_PEND_QUEUE, "async_w_pend_queue"},
649 	    {ZPOOL_CONFIG_VDEV_SCRUB_PEND_QUEUE, "async_scrub_pend_queue"},
650 	    {ZPOOL_CONFIG_VDEV_REBUILD_PEND_QUEUE, "rebuild_pend_queue"},
651 	    {NULL, NULL}
652 	};
653 
654 	if (nvlist_lookup_nvlist(nvroot,
655 	    ZPOOL_CONFIG_VDEV_STATS_EX, &nv_ex) != 0) {
656 		return (6);
657 	}
658 
659 	printf("%s%s,name=%s,vdev=root ", VDEV_MEASUREMENT, tags,
660 	    pool_name);
661 	for (int i = 0; queue_type[i].name; i++) {
662 		if (nvlist_lookup_uint64(nv_ex,
663 		    queue_type[i].name, &value) != 0) {
664 			fprintf(stderr, "error: can't get %s\n",
665 			    queue_type[i].name);
666 			return (3);
667 		}
668 		if (i > 0)
669 			printf(",");
670 		print_kv(queue_type[i].short_name, value);
671 	}
672 
673 	printf(" %llu\n", (u_longlong_t)timestamp);
674 	return (0);
675 }
676 
677 /*
678  * recursive stats printer
679  */
680 static int
681 print_recursive_stats(stat_printer_f func, nvlist_t *nvroot,
682     const char *pool_name, const char *parent_name, int descend)
683 {
684 	uint_t c, children;
685 	nvlist_t **child;
686 	char vdev_name[256];
687 	int err;
688 
689 	err = func(nvroot, pool_name, parent_name);
690 	if (err)
691 		return (err);
692 
693 	if (descend && nvlist_lookup_nvlist_array(nvroot, ZPOOL_CONFIG_CHILDREN,
694 	    &child, &children) == 0) {
695 		(void) strlcpy(vdev_name, get_vdev_name(nvroot, parent_name),
696 		    sizeof (vdev_name));
697 
698 		for (c = 0; c < children; c++) {
699 			print_recursive_stats(func, child[c], pool_name,
700 			    vdev_name, descend);
701 		}
702 	}
703 	return (0);
704 }
705 
706 /*
707  * call-back to print the stats from the pool config
708  *
709  * Note: if the pool is broken, this can hang indefinitely and perhaps in an
710  * unkillable state.
711  */
712 static int
713 print_stats(zpool_handle_t *zhp, void *data)
714 {
715 	uint_t c;
716 	int err;
717 	boolean_t missing;
718 	nvlist_t *config, *nvroot;
719 	vdev_stat_t *vs;
720 	struct timespec tv;
721 	char *pool_name;
722 
723 	/* if not this pool return quickly */
724 	if (data &&
725 	    strncmp(data, zpool_get_name(zhp), ZFS_MAX_DATASET_NAME_LEN) != 0) {
726 		zpool_close(zhp);
727 		return (0);
728 	}
729 
730 	if (zpool_refresh_stats(zhp, &missing) != 0) {
731 		zpool_close(zhp);
732 		return (1);
733 	}
734 
735 	config = zpool_get_config(zhp, NULL);
736 	if (clock_gettime(CLOCK_REALTIME, &tv) != 0)
737 		timestamp = (uint64_t)time(NULL) * 1000000000;
738 	else
739 		timestamp =
740 		    ((uint64_t)tv.tv_sec * 1000000000) + (uint64_t)tv.tv_nsec;
741 
742 	if (nvlist_lookup_nvlist(
743 	    config, ZPOOL_CONFIG_VDEV_TREE, &nvroot) != 0) {
744 	zpool_close(zhp);
745 		return (2);
746 	}
747 	if (nvlist_lookup_uint64_array(nvroot, ZPOOL_CONFIG_VDEV_STATS,
748 	    (uint64_t **)&vs, &c) != 0) {
749 	zpool_close(zhp);
750 		return (3);
751 	}
752 
753 	pool_name = escape_string(zpool_get_name(zhp));
754 	err = print_recursive_stats(print_summary_stats, nvroot,
755 	    pool_name, NULL, 1);
756 	/* if any of these return an error, skip the rest */
757 	if (err == 0)
758 	err = print_top_level_vdev_stats(nvroot, pool_name);
759 
760 	if (no_histograms == 0) {
761 	if (err == 0)
762 		err = print_recursive_stats(print_vdev_latency_stats, nvroot,
763 		    pool_name, NULL, 1);
764 	if (err == 0)
765 		err = print_recursive_stats(print_vdev_size_stats, nvroot,
766 		    pool_name, NULL, 1);
767 	if (err == 0)
768 		err = print_recursive_stats(print_queue_stats, nvroot,
769 		    pool_name, NULL, 0);
770 	}
771 	if (err == 0)
772 		err = print_scan_status(nvroot, pool_name);
773 
774 	free(pool_name);
775 	zpool_close(zhp);
776 	return (err);
777 }
778 
/*
 * print the command line synopsis to stderr and exit with EXIT_FAILURE
 *
 * name is the program name shown in the message. Every option accepted
 * by main() is listed, including --tags and --help.
 */
static void
usage(char *name)
{
	fprintf(stderr, "usage: %s [--execd] [--help] [--no-histograms] "
	    "[--sum-histogram-buckets] [--signed-int] "
	    "[--tags key=value[,key=value...]] [poolname]\n", name);
	exit(EXIT_FAILURE);
}
786 
787 int
788 main(int argc, char *argv[])
789 {
790 	int opt;
791 	int ret = 8;
792 	char *line = NULL;
793 	size_t len, tagslen = 0;
794 	struct option long_options[] = {
795 	    {"execd", no_argument, NULL, 'e'},
796 	    {"help", no_argument, NULL, 'h'},
797 	    {"no-histograms", no_argument, NULL, 'n'},
798 	    {"signed-int", no_argument, NULL, 'i'},
799 	    {"sum-histogram-buckets", no_argument, NULL, 's'},
800 	    {"tags", required_argument, NULL, 't'},
801 	    {0, 0, 0, 0}
802 	};
803 	while ((opt = getopt_long(
804 	    argc, argv, "ehinst:", long_options, NULL)) != -1) {
805 		switch (opt) {
806 		case 'e':
807 			execd_mode = 1;
808 			break;
809 		case 'i':
810 			metric_data_type = 'i';
811 			metric_value_mask = INT64_MAX;
812 			break;
813 		case 'n':
814 			no_histograms = 1;
815 			break;
816 		case 's':
817 			sum_histogram_buckets = 1;
818 			break;
819 		case 't':
820 			tagslen = strlen(optarg) + 2;
821 			tags = calloc(1, tagslen);
822 			if (tags == NULL) {
823 				fprintf(stderr,
824 				    "error: cannot allocate memory "
825 				    "for tags\n");
826 				exit(1);
827 			}
828 			(void) snprintf(tags, tagslen, ",%s", optarg);
829 			break;
830 		default:
831 			usage(argv[0]);
832 		}
833 	}
834 
835 	libzfs_handle_t *g_zfs;
836 	if ((g_zfs = libzfs_init()) == NULL) {
837 		fprintf(stderr,
838 		    "error: cannot initialize libzfs. "
839 		    "Is the zfs module loaded or zrepl running?\n");
840 		exit(EXIT_FAILURE);
841 	}
842 	if (execd_mode == 0) {
843 		ret = zpool_iter(g_zfs, print_stats, argv[optind]);
844 		return (ret);
845 	}
846 	while (getline(&line, &len, stdin) != -1) {
847 		ret = zpool_iter(g_zfs, print_stats, argv[optind]);
848 		fflush(stdout);
849 	}
850 	return (ret);
851 }
852