1#!/bin/sh
2#
3# plugin for munin to monitor usage of unbound servers.
4# To install copy this to /usr/local/share/munin/plugins/unbound_munin_
5# and use munin-node-configure (--suggest, --shell).
6#
7# (C) 2008 W.C.A. Wijngaards.  BSD Licensed.
8#
# To set up unbound: enable statistics and unbound-control in unbound.conf
10#	server:		extended-statistics: yes
11#			statistics-cumulative: no
12#			statistics-interval: 0
13#	remote-control:	control-enable: yes
14# Run the command unbound-control-setup to generate the key files.
15#
16# Environment variables for this script
17#	unbound_conf	- where the unbound.conf file is located.
18#	unbound_control	- where to find unbound-control executable.
19#	spoof_warn	- what level to warn about spoofing
20#	spoof_crit	- what level to crit about spoofing
21#
22# You can set them in your munin/plugin-conf.d/plugins.conf file
23# with:
24# [unbound*]
25# user root
26# env.unbound_conf /usr/local/etc/unbound/unbound.conf
27# env.unbound_control /usr/local/sbin/unbound-control
28# env.spoof_warn 1000
29# env.spoof_crit 100000
30#
31# This plugin can create different graphs depending on what name
32# you link it as (with ln -s) into the plugins directory
33# You can link it multiple times.
34# If you are only a casual user, the _hits and _by_type are most interesting,
35# possibly followed by _by_rcode.
36#
37#	unbound_munin_hits	- base volume, cache hits, unwanted traffic
38#	unbound_munin_queue	- to monitor the internal requestlist
39#	unbound_munin_memory	- memory usage
40#	unbound_munin_by_type	- incoming queries by type
41#	unbound_munin_by_class	- incoming queries by class
42#	unbound_munin_by_opcode	- incoming queries by opcode
43#	unbound_munin_by_rcode	- answers by rcode, validation status
44#	unbound_munin_by_flags	- incoming queries by flags
45#	unbound_munin_histogram	- histogram of query resolving times
46#
47# Magic markers - optional - used by installation scripts and
48# munin-config:  (originally contrib family but munin-node-configure ignores it)
49#
50#%# family=auto
51#%# capabilities=autoconf suggest
52
53# POD documentation
54: <<=cut
55=head1 NAME
56
57unbound_munin_ - Munin plugin to monitor the Unbound DNS resolver.
58
59=head1 APPLICABLE SYSTEMS
60
61System with unbound daemon.
62
63=head1 CONFIGURATION
64
65  [unbound*]
66  user root
67  env.unbound_conf /usr/local/etc/unbound/unbound.conf
68  env.unbound_control /usr/local/sbin/unbound-control
69  env.spoof_warn 1000
70  env.spoof_crit 100000
71
72Use the .env settings to override the defaults.
73
74=head1 USAGE
75
76Can be used to present different graphs. Use ln -s for that name in
77the plugins directory to enable the graph.
78unbound_munin_hits	- base volume, cache hits, unwanted traffic
79unbound_munin_queue	- to monitor the internal requestlist
80unbound_munin_memory	- memory usage
81unbound_munin_by_type	- incoming queries by type
82unbound_munin_by_class	- incoming queries by class
83unbound_munin_by_opcode	- incoming queries by opcode
84unbound_munin_by_rcode	- answers by rcode, validation status
85unbound_munin_by_flags	- incoming queries by flags
86unbound_munin_histogram - histogram of query resolving times
87
88=head1 AUTHOR
89
90Copyright 2008 W.C.A. Wijngaards
91
92=head1 LICENSE
93
94BSD
95
96=cut
97
# Files shared by every link of this plugin, kept under $MUNIN_PLUGSTATE:
#   state    - output of the last "unbound-control stats" call
#   seentags - names of all num.* counters seen so far
#   lock     - holds the pid of the instance that is refreshing $state
state="${MUNIN_PLUGSTATE}/unbound.state"
seentags="${MUNIN_PLUGSTATE}/unbound-seentags.state"
lock="${state}.lock"

# Settings that can be overridden from the plugin environment
# (unbound_conf, unbound_control, spoof_warn, spoof_crit).
conf="${unbound_conf:-/usr/local/etc/unbound/unbound.conf}"
ctrl="${unbound_control:-/usr/local/sbin/unbound-control}"
warn="${spoof_warn:-1000}"
crit="${spoof_crit:-100000}"

# number of seconds between polling attempts.
# makes the statefile hang around for at least this many seconds,
# so that multiple links of this script can share the results.
lee=55

# sed arguments that shorten statistic names into field names,
# to keep things within 19 characters.  Expanded unquoted on purpose
# so that it splits into separate -e arguments.
ABBREV="-e s/total/t/ -e s/thread/t/ -e s/num/n/ -e s/query/q/ \
-e s/answer/a/ -e s/unwanted/u/ -e s/requestlist/ql/ -e s/type/t/ \
-e s/class/c/ -e s/opcode/o/ -e s/rcode/r/ -e s/edns/e/ \
-e s/mem/m/ -e s/cache/c/ -e s/mod/m/"
113
# get value from $1 into return variable $value
#
#   $1      - statistic name exactly as it appears before the '=' in $state
#   $value  - (output) text after the first '=' on the first line whose key
#             is exactly $1, or "0" when there is no such line or the
#             value is empty
#
# The key is compared as a literal string, so the dots in statistic names
# cannot act as regex wildcards and match a different key.
get_value ( ) {
	value=$(awk -v key="$1" '
		{ eq = index($0, "=") }
		eq > 0 && substr($0, 1, eq - 1) == key {
			print substr($0, eq + 1)
			exit
		}
	' "$state")
	if test -z "$value"; then
		value="0"
	fi
}
121
# Update list of seen query types etc to seentags file. This is run while
# holding the lock, after the state file is updated.
#
# The new list is the union of: the previous contents of $seentags, four
# names that are always listed, and the key (text before '=') of every
# line of $state that starts with "num".  It is written back sorted and
# without duplicates.  Both paths are quoted so that a state directory
# containing whitespace works.
update_seentags() {
    # Read the old list completely before the redirection below
    # truncates the file.
    tmplist="$(cat "${seentags}" 2> /dev/null)
num.query.type.A
num.query.class.IN
num.query.opcode.QUERY
num.answer.rcode.NOERROR
"
    {
        printf '%s\n' "${tmplist}"
        grep '^num' "${state}" | sed -e 's/=.*//'
    } | sort -u > "${seentags}"
}
133
# download the state from the unbound server.
#
# Takes the lock file $lock, then refreshes $state with the output of
# "$ctrl -c $conf stats" unless $state already exists and its time.now
# entry is less than $lee seconds old.  After a refresh it calls
# update_seentags.  The lock is removed on every way out.
#
# Exits the plugin with status 1 when the lock cannot be obtained or the
# statistics cannot be fetched.  A failed fetch leaves an existing $state
# untouched: the new data is written to a temporary file next to $state
# and only moved into place once the command has succeeded.
#
# Uses globals: lock, state, conf, ctrl, lee; overwrites pid, i, now,
# value and tmpstate.
get_state ( ) {
	# obtain lock for fetching the state
	# because there is a race condition in fetching and writing to file

	# see if the lock is stale, if so, take it
	if test -f "$lock" ; then
		pid=$(cat "$lock" 2>&1)
		if ! kill -0 "$pid" >/dev/null 2>&1 && test "$pid" != "$$"; then
			echo $$ >"$lock"
		fi
	fi

	i=0
	while test ! -f "$lock" || test "$(cat "$lock" 2>&1)" != "$$"; do
		while test -f "$lock"; do
			# wait: the first 1000 polls are back to back, after
			# that one poll per second; give up after 1500 polls.
			i=$((i + 1))
			if test "$i" -gt 1000; then
				sleep 1
			fi
			if test "$i" -gt 1500; then
				# the substitution is left unquoted so that the
				# message is formatted exactly as before
				echo "error locking $lock" "=" $(cat "$lock")
				rm -f "$lock"
				exit 1
			fi
		done
		# try to get it; when the lock file cannot be written the
		# loop is left and the fetch continues without a lock
		if echo $$ >"$lock" ; then : ; else break; fi
	done

	# do not refetch if the file exists and only LEE seconds old
	if test -f "$state"; then
		now=$(date +%s)
		get_value "time.now"
		# keep the integer seconds only
		value=${value%%.*}
		# anything that is not a plain number counts as "very old"
		case "$value" in
		''|*[!0-9]*) value=0 ;;
		esac
		if test "$now" -lt "$((value + lee))"; then
			rm -f "$lock"
			return
		fi
	fi

	# $ctrl is expanded unquoted so that a value made of a command
	# plus arguments keeps working.
	tmpstate="$state.$$"
	if $ctrl -c "$conf" stats > "$tmpstate" && mv -f "$tmpstate" "$state"; then
		:
	else
		rm -f "$tmpstate"
		echo "error retrieving data from unbound server"
		rm -f "$lock"
		exit 1
	fi
	update_seentags
	rm -f "$lock"
}
184
# Answer the two munin-node-configure requests; both end the script.
case "$1" in
autoconf)
	# "yes" needs the unbound config file and the directory that will
	# hold the state file; otherwise print "no" with the reason.
	if test ! -f "$conf"; then
		echo no "($conf does not exist)"
		exit 0
	fi
	statedir=$(dirname "$state")
	if test ! -d "$statedir"; then
		echo no "($statedir directory does not exist)"
		exit 0
	fi
	echo yes
	exit 0
	;;
suggest)
	# the graph names this plugin can be linked as
	printf '%s\n' hits queue memory by_type by_class by_opcode \
		by_rcode by_flags histogram
	exit 0
	;;
esac

# determine my type, by name: everything after the last "unbound_munin_"
# in the name this script was started as
id=${0##*unbound_munin_}
if test -z "$id"; then
	# some default to keep people sane.
	id="hits"
fi
217
# if $1 exists in statefile, config is echoed with label $2
#
#   $1 - statistic name as it appears before the '=' in $state
#   $2 - label text for the field
#
# Prints the .label, .min and .type (ABSOLUTE) lines for the abbreviated
# field name, or nothing when $state has no line starting with "$1=".
# The name is matched as a literal string, so its dots cannot match
# other characters.
exist_config ( ) {
	# $ABBREV is left unquoted so it splits into separate sed arguments
	mn=$(printf '%s\n' "$1" | sed $ABBREV | tr . _)
	if awk -v key="$1" '
		index($0, key "=") == 1 { found = 1; exit }
		END { exit !found }
	' "$state" >/dev/null 2>&1; then
		echo "$mn.label $2"
		echo "$mn.min 0"
		echo "$mn.type ABSOLUTE"
	fi
}
227
# print label and min 0 for a name $1 in unbound format
#
#   $1 - statistic name; abbreviated with $ABBREV, dots become underscores
#   $2 - label text
#   $3 - value for the field's .type line
p_config ( ) {
	mn=$(echo $1 | sed $ABBREV | tr . _)
	printf '%s\n' "$mn.label $2" "$mn.min 0" "$mn.type $3"
}
235
# graph_header TITLE VLABEL
#   print the graph attributes shared by all graphs except "memory"
graph_header ( ) {
	echo "graph_title $1"
	echo "graph_args --base 1000 -l 0"
	echo "graph_vlabel $2"
	echo "graph_scale no"
	echo "graph_category dns"
}

# seen_config PREFIX
#   print an ABSOLUTE field config for every line of $seentags that starts
#   with PREFIX; the label is the name with the leading "PREFIX." removed
seen_config ( ) {
	for nm in $(grep "^$1" "$seentags"); do
		tp=${nm#"$1".}
		p_config "$nm" "$tp" "ABSOLUTE"
	done
}

# h_config FIELD LABEL DRAW COLOUR
#   print the config of one series of the histogram graph
h_config ( ) {
	echo "$1.label $2"
	echo "$1.min 0"
	echo "$1.type ABSOLUTE"
	echo "$1.draw $3"
	echo "$1.colour $4"
}

# "config" request: describe the graph selected by $id, then exit 0.
# An $id that is none of the known graphs prints nothing.
if test "$1" = "config" ; then
	# the state is only fetched here when there is no state file at
	# all; an existing one is used as it is
	if test ! -f "$state"; then
		get_state
	fi
	case "$id" in
	hits)
		graph_header "Unbound DNS traffic and cache hits" "queries / \${graph_period}"
		# one field per threadN.num.queries line in the state file
		for x in $(grep "^thread[0-9][0-9]*\.num\.queries=" "$state" |
			sed -e 's/=.*//'); do
			exist_config "$x" "queries handled by ${x%.num.queries}"
		done
		p_config "total.num.queries" "total queries from clients" "ABSOLUTE"
		p_config "total.num.cachehits" "cache hits" "ABSOLUTE"
		p_config "total.num.prefetch" "cache prefetch" "ABSOLUTE"
		p_config "num.query.tcp" "TCP queries" "ABSOLUTE"
		p_config "num.query.tcpout" "TCP out queries" "ABSOLUTE"
		p_config "num.query.tls" "TLS queries" "ABSOLUTE"
		p_config "num.query.tls.resume" "TLS resumes" "ABSOLUTE"
		p_config "num.query.ipv6" "IPv6 queries" "ABSOLUTE"
		p_config "unwanted.queries" "queries that failed acl" "ABSOLUTE"
		p_config "unwanted.replies" "unwanted or unsolicited replies" "ABSOLUTE"
		# u_replies is the abbreviated field name of unwanted.replies
		echo "u_replies.warning $warn"
		echo "u_replies.critical $crit"
		echo "graph_info DNS queries to the recursive resolver. The unwanted replies could be innocent duplicate packets, late replies, or spoof threats."
		;;
	queue)
		graph_header "Unbound requestlist size" "number of queries"
		p_config "total.requestlist.avg" "Average size of queue on insert" "GAUGE"
		p_config "total.requestlist.max" "Max size of queue (in 5 min)" "GAUGE"
		p_config "total.requestlist.overwritten" "Number of queries replaced by new ones" "GAUGE"
		p_config "total.requestlist.exceeded" "Number of queries dropped due to lack of space" "GAUGE"
		echo "graph_info The queries that did not hit the cache and need recursion service take up space in the requestlist. If there are too many queries, first queries get overwritten, and at last resort dropped."
		;;
	memory)
		# base 1024 and no graph_scale line, unlike the other graphs
		echo "graph_title Unbound memory usage"
		echo "graph_args --base 1024 -l 0"
		echo "graph_vlabel memory used in bytes"
		echo "graph_category dns"
		p_config "mem.cache.rrset" "RRset cache memory" "GAUGE"
		p_config "mem.cache.message" "Message cache memory" "GAUGE"
		p_config "mem.mod.iterator" "Iterator module memory" "GAUGE"
		p_config "mem.mod.validator" "Validator module and key cache memory" "GAUGE"
		p_config "msg.cache.count" "msg cache count" "GAUGE"
		p_config "rrset.cache.count" "rrset cache count" "GAUGE"
		p_config "infra.cache.count" "infra cache count" "GAUGE"
		p_config "key.cache.count" "key cache count" "GAUGE"
		echo "graph_info The memory used by unbound."
		;;
	by_type)
		graph_header "Unbound DNS queries by type" "queries / \${graph_period}"
		seen_config "num.query.type"
		echo "graph_info queries by DNS RR type queried for"
		;;
	by_class)
		graph_header "Unbound DNS queries by class" "queries / \${graph_period}"
		seen_config "num.query.class"
		echo "graph_info queries by DNS RR class queried for."
		;;
	by_opcode)
		graph_header "Unbound DNS queries by opcode" "queries / \${graph_period}"
		seen_config "num.query.opcode"
		echo "graph_info queries by opcode in the query packet."
		;;
	by_rcode)
		graph_header "Unbound DNS answers by return code" "answer packets / \${graph_period}"
		seen_config "num.answer.rcode"
		p_config "num.answer.secure" "answer secure" "ABSOLUTE"
		p_config "num.answer.bogus" "answer bogus" "ABSOLUTE"
		p_config "num.rrset.bogus" "num rrsets marked bogus" "ABSOLUTE"
		echo "graph_info answers sorted by return value. rrsets bogus is the number of rrsets marked bogus per \${graph_period} by the validator"
		;;
	by_flags)
		graph_header "Unbound DNS incoming queries by flags" "queries / \${graph_period}"
		p_config "num.query.flags.QR" "QR (query reply) flag" "ABSOLUTE"
		p_config "num.query.flags.AA" "AA (auth answer) flag" "ABSOLUTE"
		p_config "num.query.flags.TC" "TC (truncated) flag" "ABSOLUTE"
		p_config "num.query.flags.RD" "RD (recursion desired) flag" "ABSOLUTE"
		p_config "num.query.flags.RA" "RA (rec avail) flag" "ABSOLUTE"
		p_config "num.query.flags.Z" "Z (zero) flag" "ABSOLUTE"
		p_config "num.query.flags.AD" "AD (auth data) flag" "ABSOLUTE"
		p_config "num.query.flags.CD" "CD (check disabled) flag" "ABSOLUTE"
		p_config "num.query.edns.present" "EDNS OPT present" "ABSOLUTE"
		p_config "num.query.edns.DO" "DO (DNSSEC OK) flag" "ABSOLUTE"
		echo "graph_info This graphs plots the flags inside incoming queries. For example, if QR, AA, TC, RA, Z flags are set, the query can be rejected. RD, AD, CD and DO are legitimately set by some software."
		;;
	histogram)
		graph_header "Unbound DNS histogram of reply time" "queries / \${graph_period}"
		# cache hits form the base area, the time buckets are stacked
		# on top of it
		h_config hcache "cache hits" AREA 999999
		h_config h64ms "0 msec - 66 msec" STACK 0000FF
		h_config h128ms "66 msec - 131 msec" STACK 1F00DF
		h_config h256ms "131 msec - 262 msec" STACK 3F00BF
		h_config h512ms "262 msec - 524 msec" STACK 5F009F
		h_config h1s "524 msec - 1 sec" STACK 7F007F
		h_config h2s "1 sec - 2 sec" STACK 9F005F
		h_config h4s "2 sec - 4 sec" STACK BF003F
		h_config h8s "4 sec - 8 sec" STACK DF001F
		h_config h16s "8 sec - ..." STACK FF0000
		echo "graph_info Histogram of the reply times for queries."
		;;
	esac

	exit 0
fi
423
# do the stats itself: make sure the state file is there and recent
get_state

# get the time elapsed; a value of zero (which is also what get_value
# reports for a missing entry) is treated as an error
get_value "time.elapsed"
case "$value" in
0|0.000000)
	echo "error: time elapsed 0 or could not retrieve data"
	exit 1
	;;
esac
elapsed="$value"
434
# print value for $1
#
#   $1 - statistic name; looked up with get_value and printed as
#        "<abbreviated name>.value <value>"
print_value ( ) {
	get_value $1
	mn=$(echo $1 | sed $ABBREV | tr . _)
	echo "$mn.value" $value
}
441
# print value if line already found in $2
#
#   $1 - statistic name, abbreviated into the field name
#   $2 - a "name=value" line; everything after its last '=' is printed
#        as the value (and left in $value)
print_value_line ( ) {
	value=${2##*=}
	mn=$(echo $1 | sed $ABBREV | tr . _)
	echo "$mn.value" $value
}
448
449
# print_seen PREFIX
#   print the value of every statistic listed in $seentags whose name
#   starts with PREFIX
print_seen ( ) {
	for nm in $(grep "^$1" "$seentags"); do
		print_value "$nm"
	done
}

# sum_buckets NAME...
#   add up the values of the given statistics; the total is left in $r
sum_buckets ( ) {
	r=0
	for x in "$@"; do
		get_value "$x"
		r=$(( $r + $value ))
	done
}

# Print the current values for the graph selected by $id.
# An $id that is none of the known graphs prints nothing.
case "$id" in
hits)
	# the per-thread counters found in the state file, followed by the
	# fixed list; a value is printed only for names present in the file
	for x in $(grep "^thread[0-9][0-9]*\.num\.queries=" "$state" |
		sed -e 's/=.*//') total.num.queries \
		total.num.cachehits total.num.prefetch num.query.tcp \
		num.query.tcpout num.query.tls num.query.tls.resume \
		num.query.ipv6 unwanted.queries \
		unwanted.replies; do
		if grep "^$x=" "$state" >/dev/null 2>&1; then
			print_value "$x"
		fi
	done
	;;
queue)
	for x in total.requestlist.avg total.requestlist.max \
		total.requestlist.overwritten total.requestlist.exceeded; do
		print_value "$x"
	done
	;;
memory)
	for x in mem.cache.rrset mem.cache.message mem.mod.iterator \
		mem.mod.validator msg.cache.count rrset.cache.count \
		infra.cache.count key.cache.count; do
		print_value "$x"
	done
	;;
by_type)
	print_seen "num.query.type"
	;;
by_class)
	print_seen "num.query.class"
	;;
by_opcode)
	print_seen "num.query.opcode"
	;;
by_rcode)
	print_seen "num.answer.rcode"
	print_value "num.answer.secure"
	print_value "num.answer.bogus"
	print_value "num.rrset.bogus"
	;;
by_flags)
	for x in num.query.flags.QR num.query.flags.AA num.query.flags.TC \
		num.query.flags.RD num.query.flags.RA num.query.flags.Z \
		num.query.flags.AD num.query.flags.CD \
		num.query.edns.present num.query.edns.DO; do
		print_value "$x"
	done
	;;
histogram)
	get_value total.num.cachehits
	echo hcache.value $value
	# every bucket below 0.065536 sec is folded into h64ms
	sum_buckets histogram.000000.000000.to.000000.000001 \
		histogram.000000.000001.to.000000.000002 \
		histogram.000000.000002.to.000000.000004 \
		histogram.000000.000004.to.000000.000008 \
		histogram.000000.000008.to.000000.000016 \
		histogram.000000.000016.to.000000.000032 \
		histogram.000000.000032.to.000000.000064 \
		histogram.000000.000064.to.000000.000128 \
		histogram.000000.000128.to.000000.000256 \
		histogram.000000.000256.to.000000.000512 \
		histogram.000000.000512.to.000000.001024 \
		histogram.000000.001024.to.000000.002048 \
		histogram.000000.002048.to.000000.004096 \
		histogram.000000.004096.to.000000.008192 \
		histogram.000000.008192.to.000000.016384 \
		histogram.000000.016384.to.000000.032768 \
		histogram.000000.032768.to.000000.065536
	echo h64ms.value $r
	# the buckets in between are reported one to one
	get_value histogram.000000.065536.to.000000.131072
	echo h128ms.value $value
	get_value histogram.000000.131072.to.000000.262144
	echo h256ms.value $value
	get_value histogram.000000.262144.to.000000.524288
	echo h512ms.value $value
	get_value histogram.000000.524288.to.000001.000000
	echo h1s.value $value
	get_value histogram.000001.000000.to.000002.000000
	echo h2s.value $value
	get_value histogram.000002.000000.to.000004.000000
	echo h4s.value $value
	get_value histogram.000004.000000.to.000008.000000
	echo h8s.value $value
	# every bucket from 8 sec upwards is folded into h16s
	sum_buckets histogram.000008.000000.to.000016.000000 \
		histogram.000016.000000.to.000032.000000 \
		histogram.000032.000000.to.000064.000000 \
		histogram.000064.000000.to.000128.000000 \
		histogram.000128.000000.to.000256.000000 \
		histogram.000256.000000.to.000512.000000 \
		histogram.000512.000000.to.001024.000000 \
		histogram.001024.000000.to.002048.000000 \
		histogram.002048.000000.to.004096.000000 \
		histogram.004096.000000.to.008192.000000 \
		histogram.008192.000000.to.016384.000000 \
		histogram.016384.000000.to.032768.000000 \
		histogram.032768.000000.to.065536.000000 \
		histogram.065536.000000.to.131072.000000 \
		histogram.131072.000000.to.262144.000000 \
		histogram.262144.000000.to.524288.000000
	echo h16s.value $r
	;;
esac
566