1#!/usr/local/bin/perl
2# -*- perl -*-
3
4=head1 NAME
5
http_load_ - Munin multigraph plugin to monitor websites' HTTP responses and performance
7
8=head1 DESCRIPTION
9
10The purpose of this plugin is to monitor several properties of a web page.
11All measurements are done for the complete web page, including images, css
12and other content a standard browser would download automatically.
13
14This version supports monitoring:
15 - loadtime: total time to download a complete web page (using serial GET requests)
16 - size: total size of a web page
17 - response: different response codes (200, 404, 500, etc)
18 - tags: HTML tags (img src, a href, etc)
 - type: content types (image/png, text/css, etc)
20 - elements: source of elements loaded by the web page
21
22=head1 REQUIREMENTS
23
 - The server running this plugin must be allowed to connect to the web
25   server(s) you are going to monitor.
26 - Some perl modules:
27   Time::HiRes, LWP::UserAgent, HTML::LinkExtor, LWP::ConnCache
28
29=head1 CONFIGURATION
30
31=head2 INITIAL SETUP
32
331. Copy this file to /usr/share/munin/plugins/
34
2. Create a file (/etc/munin/http_load_urls.txt) with one
   full url per line, as many as you want, e.g.:
    $ echo "http://www.dn.no/" >> /etc/munin/http_load_urls.txt
    $ echo "http://www.intrafish.no/" >> /etc/munin/http_load_urls.txt
39
403. Add a cron job running the plugin with cron as the argument:
41   */15 * * * * <user> /usr/sbin/munin-run http_load_<site>_loadtime cron
42   <user> should be the user that has write permission to the $cachedir
43   directory set below. <site> should be any of the configured sites (all
44   sites will get updated), likewise, you should replace loadtime by any
45   metric that is enabled for that site (all metrics will get updated).
46   Set the intervals to whatever you want.
47
48   For verbose output (for debugging) you can do:
49   sudo -u <user> /usr/share/munin/plugins/http_load_ cron verbose
50
4. Run munin-node-configure --suggest --shell and run the symlink
   commands manually to update the munin-node plugin list.
53
5. If you want to change the filter which the plugin uses to select which
   tags to follow in a web page, edit the subroutine called "filter" below.
56
57=head2 SPECIFY URLS TO MONITOR
58
1. Add a new line in /etc/munin/http_load_urls.txt with the full URL, e.g.:
    $ echo "http://www.linpro.no/" >> /etc/munin/http_load_urls.txt
61
622. Run munin-node-configure --suggest --shell and manually
63   add the new symlink(s)
64
653. /etc/init.d/munin-node restart
66
67=head2 REMOVE A URL
68
691. Remove it from /etc/munin/http_load_urls.txt
70
712. Remove ${cachedir}/http_load_<url_id>*
72
733. Remove /etc/munin/plugins/http_load_<url_id>*
74
754. /etc/init.d/munin-node restart
76
77=head2 SINGLE GRAPH SUPPORT
78
79The default behaviour is the multigraph mode: only the loadtime will be shown
80on the Munin summary page. The graphs there are linked to a second-level
81summary page that list all other metrics. It is also possible to create
82single graphs, that would show immediately on the summary page, by using
83symlinks with a different name, postfixed with the name of the metric:
84
85 - http_load_hostname:		multigraph (default)
86 - http_load_hostname_loadtime:	loadtime only
87 - http_load_hostname_size:	total page size
88 - http_load_hostname_response:	response code
89 - http_load_hostname_tags:	HTML tags summary
90 - http_load_hostname_type:	Content-Types
91 - http_load_hostname_elements:	source site of the loaded elements
92
Note that hostname is not the FQDN of the host, but rather the id printed by
munin-node-configure --suggest --shell (the URL stripped of every character
that is not a letter or a digit).
95
96=head1 MAGIC MARKERS
97
98  #%# family=auto
99  #%# capabilities=autoconf suggest
100
101=head1 TODO
102
103 - Specify URLs from a standard Munin plugins configuration file (e.g., env.urls)
104 - Add support for forking to simulate real browsers
105
106=head1 AUTHORS
107
108 - Espen Braastad / Linpro AS <espen@linpro.no>, initial implementation
109 - Olivier Mehani <shtrom+munin@ssji.net>, multigraph support
110
111=cut
112
113use strict;
114use Time::HiRes qw( gettimeofday tv_interval );
115use LWP::UserAgent;
116use HTML::LinkExtor;
117use LWP::ConnCache;
118
# File listing the URLs to monitor, one full URL per line
my $url_file="/etc/munin/http_load_urls.txt";
# Directory holding the per-URL cache files, taken from the environment
# NOTE(review): presumably always set by munin-node/munin-run -- confirm
my $cachedir=$ENV{MUNIN_PLUGSTATE};

# A true value enables the debug messages printed throughout the plugin
my $debug=$ENV{MUNIN_DEBUG};
# Handed to LWP::UserAgent->timeout for every request
my $timeout=10;
# Handed to LWP::UserAgent->max_redirect
my $max_redirects=10;
# Plugin name prefix; also the prefix of the cache file names
my $scriptname="http_load_";
# User-Agent string handed to LWP::UserAgent->agent
my $useragent="Mozilla/5.0 (Munin; $scriptname)";
127
# Read the list of URLs to monitor.
#
# Arguments:
#   $_[0]  path of a text file containing one full URL per line
# Returns:
#   a hash mapping the id of each URL (as computed by get_id) to the URL.
#   Lines whose id is empty (e.g. blank lines) are skipped. A file that is
#   not readable or cannot be opened yields an empty hash.
sub read_urls{
	my $file=$_[0];
	my %urls=();

	return %urls unless -r $file;

	# Three-argument open on a lexical handle: the file name can never be
	# taken for an open mode, and the handle is private to this call.
	my $fh;
	if(!open($fh, '<', $file)){
		warn "http_load_: cannot open $file: $!\n";
		return %urls;
	}

	while (my $url = <$fh>) {
		# Drop the line ending (including a stray CR) and surrounding blanks
		# so they do not end up in the requested URL.
		$url =~ s/^\s+|\s+$//g;

		my $id=get_id($url);
		if(length($id)>0){
			$urls{$id}=$url;
		}
	}
	close($fh);

	return %urls;
}
146
# Read a cache file written by the "cron" mode of this plugin.
#
# Arguments:
#   $_[0]  path of the cache file; each line is "<key> <value>"
# Returns:
#   a hash of key => value. The key is the first whitespace-free word of the
#   line, the value is the rest of the line after the separating whitespace.
#   Lines that do not have this shape (blank lines, a key without a value)
#   are ignored. A file that is not readable or cannot be opened yields an
#   empty hash.
sub read_cache{
	my $file=$_[0];
	my %cache=();

	return %cache unless -r $file;

	my $fh;
	if(!open($fh, '<', $file)){
		warn "http_load_: cannot open $file: $!\n";
		return %cache;
	}

	while (my $line = <$fh>) {
		chomp($line);
		# Only use the captures of a match that actually succeeded;
		# otherwise $1/$2 would still hold the values of an earlier line.
		next unless $line =~ m/^(\S+)\s+(.*)$/;
		$cache{ $1 } = $2;
	}
	close($fh);

	return %cache;
}
161
# Decide whether the target of an extracted HTML link should be downloaded.
#
# Arguments:
#   $_[0]  "<tag> <attribute>" as built by the cron mode, for instance
#          "img src", "script src", "a href", "form action"; for <link>
#          elements the second word is derived from the rel attribute,
#          e.g. "link stylesheet"
# Returns:
#   1 => do download (the default)
#   0 => do not download
sub filter{
	my $tag=$_[0];

	# Navigation targets and metadata are never fetched
	my %never_fetched = map { $_ => 1 } (
		"form action",
		"a href",
		"area href",
		"meta content",
	);
	return 0 if $never_fetched{"$tag"};

	# For links, the 'rel' is what matters: only stylesheets are fetched
	if("$tag" =~ /^link/){
		return ("$tag" =~ /stylesheet$/) ? 1 : 0;
	}

	return 1;
}
198
# Build the name (without directory) of the cache file of one URL.
#
# Arguments:
#   $_[0]  plugin name prefix
#   $_[1]  id of the URL
# Returns:
#   "<prefix><id>.cache"; the name is also printed when debugging is enabled.
sub get_cache_file_name{
	my ($scriptname, $id) = @_;

	my $file = join("", $scriptname, $id, ".cache");
	$debug && print "Cache file: " . $file . "\n";

	return $file;
}
210
# Turn a string into a munin-1.0 "compatible" field name.
#
# 1. All non-word characters are removed.
# 2. The result is limited to 19 characters:
#    2.1 if it contains an underscore, the part after the last underscore
#        (e.g. an HTTP response status) is kept and the part before it is
#        shortened instead;
#    2.2 if there is no underscore, or the part after it leaves no room at
#        all, the name is simply cut after 19 characters.
#
# Arguments:
#   $_[0]  string to convert (host name, "host_status", content type, ...)
# Returns:
#   the field name, never longer than 19 characters
sub get_fieldname{
	my $url=$_[0];
	$url =~ s/\W//g;

	return $url if length($url) <= 19;

	# Plain match instead of a substitution: the name is left intact and the
	# captures are only read when the match succeeded, so stale $1/$2 of the
	# caller can never leak into the result.
	if($url =~ m/^(\S+)_(\S+)$/){
		my ($host, $info) = ($1, $2);
		my $room = 19 - (length($info) + 1);
		if($room >= 0){
			return substr($host, 0, $room) . '_' . $info;
		}
	}

	return substr($url, 0, 19);
}
231
# Compute the id of a URL: the URL with every non-word character and every
# underscore removed. Unlike get_fieldname, the result is not shortened.
#
# Arguments:
#   $_[0]  the URL
# Returns:
#   the id
sub get_id{
	(my $id = $_[0]) =~ s/[\W_]//g;
	return $id;
}
238
# Print the configuration lines shared by all graphs of this plugin:
# title, graph arguments and category.
#
# Arguments:
#   $_[0]  id of the URL (a key of the hash referenced by $_[1])
#   $_[1]  reference to the id => URL hash
#   $_[2]  name of the metric, appended to the URL in the graph title
sub graph_title_config{
	my ($id, $urls_ref, $type) = @_;
	my %urls = %{$urls_ref};

	print "graph_title $urls{$id} ${type}\n"
	    . "graph_args -l 0 --base 1000\n"
	    . "graph_category webserver\n";
}
248
# Print the Munin configuration of the "size" graph of one URL.
#
# Arguments:
#   $_[0]  id of the URL (a key of the hash referenced by $_[1])
#   $_[1]  reference to the id => URL hash
#   $_[2]  reference to the cache hash (cache key => value)
#
# One field is declared per cache key of the form "size_<host>". The first
# field printed is drawn as AREA, every following one as STACK.
sub size_config{
	my ($id, $urls_ref, $cache_ref) = @_;
	my %urls = %{$urls_ref};
	my %cache = %{$cache_ref};

	graph_title_config($id, \%urls, "size");

	print "graph_vlabel Bytes\n"
	    . "graph_total Total\n"
	    . "graph_info This graph is generated by a set of serial GETs to calculate the total size of $urls{$id}.\n";

	# NOTE(review): "sort reverse keys" looks like it parses as
	# sort(reverse(keys ...)), i.e. ascending order -- confirm before changing.
	my $fields = 0;
	for my $key ( sort reverse keys %cache ){
		next unless $key =~ m/^size_(\S+)$/;

		my $host = $1;
		my $name = get_fieldname($host);

		print "$name.label from $host\n";
		print "$name.min 0\n";
		print "$name.max 20000000\n";
		print "$name.draw " . ($fields == 0 ? "AREA" : "STACK") . "\n";
		$fields++;
	}
}
286
# Print the Munin configuration of the "loadtime" graph of one URL.
#
# Arguments:
#   $_[0]  id of the URL (a key of the hash referenced by $_[1])
#   $_[1]  reference to the id => URL hash
#   $_[2]  reference to the cache hash (cache key => value)
#
# One field is declared per cache key of the form "loadtime_<host>". The
# first field printed is drawn as AREA, every following one as STACK.
sub loadtime_config{
	my ($id, $urls_ref, $cache_ref) = @_;
	my %urls = %{$urls_ref};
	my %cache = %{$cache_ref};

	graph_title_config($id, \%urls, "loadtime");

	print "graph_vlabel Seconds\n"
	    . "graph_total Total\n"
	    . "graph_info This graph is generated by a set of serial GETs to calculate the total time to load $urls{$id}. "
	    . "Note that browsers usually fork() the GET requests, resulting in a shorter total loading time.\n";

	# NOTE(review): "sort reverse keys" looks like it parses as
	# sort(reverse(keys ...)), i.e. ascending order -- confirm before changing.
	my $fields = 0;
	for my $key ( sort reverse keys %cache ){
		next unless $key =~ m/^loadtime_(\S+)$/;

		my $host = $1;
		my $name = get_fieldname($host);

		print "$name.label from $host\n";
		print "$name.min 0\n";
		print "$name.max 400\n";
		print "$name.draw " . ($fields == 0 ? "AREA" : "STACK") . "\n";
		$fields++;
	}
}
325
# Print the Munin configuration of the "elements" graph of one URL.
#
# Arguments:
#   $_[0]  id of the URL (a key of the hash referenced by $_[1])
#   $_[1]  reference to the id => URL hash
#   $_[2]  reference to the cache hash (cache key => value)
#
# One field is declared per cache key of the form "elements_<host>". The
# first field printed is drawn as AREA, every following one as STACK.
sub elements_config{
	my ($id, $urls_ref, $cache_ref) = @_;
	my %urls = %{$urls_ref};
	my %cache = %{$cache_ref};

	graph_title_config($id, \%urls, "elements");

	print "graph_vlabel Number of elements\n"
	    . "graph_total Total\n"
	    . "graph_info This graph is generated by a set of serial GETs to count the number of elements (images, CSS files, etc) from $urls{$id}.\n";

	# NOTE(review): "sort reverse keys" looks like it parses as
	# sort(reverse(keys ...)), i.e. ascending order -- confirm before changing.
	my $fields = 0;
	for my $key ( sort reverse keys %cache ){
		next unless $key =~ m/^elements_(\S+)$/;

		my $host = $1;
		my $name = get_fieldname($host);

		print "$name.label from $host\n";
		print "$name.min 0\n";
		print "$name.max 10000\n";
		print "$name.draw " . ($fields == 0 ? "AREA" : "STACK") . "\n";
		$fields++;
	}
}
363
# Print the Munin configuration of the "response" graph of one URL.
#
# Arguments:
#   $_[0]  id of the URL (a key of the hash referenced by $_[1])
#   $_[1]  reference to the id => URL hash
#   $_[2]  reference to the cache hash (cache key => value)
#
# One field is declared per cache key of the form "response_<host>_<code>";
# its label is "<host> (<code>)". A key without a trailing numeric code is
# labelled with everything after "response_". The first field printed is
# drawn as AREA, every following one as STACK.
sub response_config{
	my ($id, $urls_ref, $cache_ref) = @_;
	my %urls = %{$urls_ref};
	my %cache = %{$cache_ref};

	graph_title_config($id, \%urls, "response");

	print "graph_vlabel Server response code count\n";
	print "graph_total Total\n";
	print "graph_info This graph is generated by a set of serial GETs to visualize the server response codes received while loading $urls{$id}.\n";

	# NOTE(review): "sort reverse keys" looks like it parses as
	# sort(reverse(keys ...)), i.e. ascending order -- confirm before changing.
	my $count = 0;
	for my $key ( sort reverse keys %cache ){
		next unless $key =~ m/^response_(\S+)$/;

		my $field = $1;
		my $name = get_fieldname($field);

		# Split "<host>_<code>" at the last underscore with a checked match:
		# underscores inside the host are preserved and the captures are
		# only used when the key really ends in a numeric code.
		my $label = $field;
		if($field =~ m/^(.*)_(\d+)$/){
			$label = "$1 ($2)";
		}

		print "$name.label $label\n";
		print "$name.min 0\n";
		print "$name.max 10000\n";
		if($count == 0){
			print "$name.draw AREA\n";
		} else {
			print "$name.draw STACK\n";
		}
		$count+=1;
	}
}
406
# Print the Munin configuration of the "type" (content type) graph of one URL.
#
# Arguments:
#   $_[0]  id of the URL (a key of the hash referenced by $_[1])
#   $_[1]  reference to the id => URL hash
#   $_[2]  reference to the cache hash (cache key => value)
#
# One field is declared per cache key of the form "type_<content type>",
# labelled with the content type. The first field printed is drawn as AREA,
# every following one as STACK.
sub type_config{
	my ($id, $urls_ref, $cache_ref) = @_;
	my %urls = %{$urls_ref};
	my %cache = %{$cache_ref};

	graph_title_config($id, \%urls, "type");

	print "graph_vlabel Content type count\n"
	    . "graph_total Total\n"
	    . "graph_info This graph is generated by a set of serial GETs to visualize the different content types $urls{$id} consists of.\n";

	# NOTE(review): "sort reverse keys" looks like it parses as
	# sort(reverse(keys ...)), i.e. ascending order -- confirm before changing.
	my $fields = 0;
	for my $key ( sort reverse keys %cache ){
		next unless $key =~ m/^type_(\S+)$/;

		my $type = $1;
		my $name = get_fieldname($type);

		print "$name.label $type\n";
		print "$name.min 0\n";
		print "$name.max 100000\n";
		print "$name.draw " . ($fields == 0 ? "AREA" : "STACK") . "\n";
		$fields++;
	}
}
449
# Print the Munin configuration of the "tags" (HTML tag) graph of one URL.
#
# Arguments:
#   $_[0]  id of the URL (a key of the hash referenced by $_[1])
#   $_[1]  reference to the id => URL hash
#   $_[2]  reference to the cache hash (cache key => value)
#
# One field is declared per cache key of the form "tags_<tag>-<attribute>".
# The label is that suffix with every non-word character turned into a
# blank. The first field printed is drawn as AREA, every following one as
# STACK.
sub tags_config{
	my ($id, $urls_ref, $cache_ref) = @_;
	my %urls = %{$urls_ref};
	my %cache = %{$cache_ref};

	graph_title_config($id, \%urls, "tags");

	print "graph_vlabel HTML tag count\n"
	    . "graph_total Total\n"
	    . "graph_info This graph is generated by a set of serial GETs to visualize the different tags $urls{$id} consists of.\n";

	# NOTE(review): "sort reverse keys" looks like it parses as
	# sort(reverse(keys ...)), i.e. ascending order -- confirm before changing.
	my $fields = 0;
	for my $key ( sort reverse keys %cache ){
		next unless $key =~ m/^tags_(\S+)$/;

		my $label = $1;
		my $name = get_fieldname($label);
		$label =~ s/\W/ /g;

		print "$name.label $label\n";
		print "$name.min 0\n";
		print "$name.max 100000\n";
		print "$name.draw " . ($fields == 0 ? "AREA" : "STACK") . "\n";
		$fields++;
	}
}
489
# Print the "<field>.value <value>" lines of one metric.
#
# Arguments:
#   $_[0]  reference to the cache hash (cache key => value)
#   $_[1]  name of the metric (e.g. "loadtime", "size", "response")
#
# Keys are visited in sorted order. A key of the form "<metric>_<name>" is
# printed under the field name get_fieldname(<name>) when <metric> equals
# $_[1]. A key without that shape is printed under its own name when the
# whole key equals $_[1].
sub cache_values{
	my %cache = %{$_[0]};
	my $type = $_[1];

	for my $key ( sort keys %cache ){
		my $value=$cache{$key};

		if($key =~ m/^([A-Za-z]+)\_(\S+)$/){
			my ($metric, $name) = ($1, $2);
			if ($metric eq $type){
				print get_fieldname($name) . ".value " . $value . "\n";
			}
		} elsif($key eq $type){
			# Compare the key itself: the former pattern match here ran
			# against the unrelated global $_ instead of the cache entry.
			print $key . ".value " . $value . "\n";
		}
	}
}
512
# Print the Munin configuration of all graphs of one URL in multigraph mode.
#
# Arguments:
#   $_[0]  id of the URL (a key of the hash referenced by $_[1])
#   $_[1]  reference to the id => URL hash
#   $_[2]  reference to the cache hash (cache key => value)
#
# The top-level graph "http_load_<id>" repeats the loadtime graph; it is
# followed by one sub-graph "http_load_<id>.<metric>" per metric.
sub multi_config{
	my ($id, $urls_ref, $cache_ref) = @_;
	my %urls = %{$urls_ref};
	my %cache = %{$cache_ref};

	print "multigraph http_load_$id\n";
	loadtime_config($id, \%urls, \%cache);

	# Sub-graphs, in the order in which they are printed
	my @subgraphs = (
		[ "loadtime", \&loadtime_config ],
		[ "size",     \&size_config     ],
		[ "elements", \&elements_config ],
		[ "response", \&response_config ],
		[ "type",     \&type_config     ],
		[ "tags",     \&tags_config     ],
	);

	for my $subgraph (@subgraphs){
		my ($metric, $print_config) = @{$subgraph};
		print "\nmultigraph http_load_$id.$metric\n";
		$print_config->($id, \%urls, \%cache);
	}
}
543
# Print the values of all graphs of one URL in multigraph mode.
#
# Arguments:
#   $_[0]  id of the URL
#   $_[1]  reference to the cache hash (cache key => value)
#
# The top-level graph "http_load_<id>" repeats the loadtime values; it is
# followed by one sub-graph "http_load_<id>.<metric>" per metric.
sub multi_values{
	my ($id, $cache_ref) = @_;
	my %cache = %{$cache_ref};

	print "multigraph http_load_$id\n";
	cache_values(\%cache, "loadtime");

	for my $metric (qw(loadtime size elements response type tags)){
		print "\nmultigraph http_load_$id.$metric\n";
		cache_values(\%cache, $metric);
	}
}
$debug && print "Scriptname: " . $scriptname . "\n";

# Derive the url id and the type of the graph from the name this plugin was
# started under.
#
# The name has the form http_load_X or http_load_X_Y where
# X: the id of the URL (it cannot contain an underscore)
# Y: the type of graph (elements, size, loadtime, ..); when it is missing
#    the type is "multi"

my ($id,$type);
if($0 =~ /http_load(?:_([^_]+)|)(_(.+))?\s*$/){
	($id, $type) = ($1, $3);
}

$type = "multi" if $type eq "";

$debug && print "Id: $id, Type: $type\n";
591
# Dispatch on the first command-line argument:
#   autoconf  tell whether at least one URL is configured
#   suggest   print the id of every configured URL
#   cron      fetch every configured URL with the elements it references and
#             rewrite the cache file of each URL
#   config    print the graph configuration for the id/type of this symlink
#   (other)   print the cached values for the id/type of this symlink
if($ARGV[0] and $ARGV[0] eq "autoconf") {
	my %urls=read_urls($url_file);
	if(keys(%urls) > 0){
		print "yes\n";
	} else {
		# Munin's autoconf convention is to answer "no (reason)" and still
		# exit successfully; a non-zero status is reserved for real errors.
		print "no (no URLs found in $url_file)\n";
	}
	exit(0);

} elsif($ARGV[0] and $ARGV[0] eq "suggest") {
	# get the url list, print suggestions for usage
	my %urls=read_urls($url_file);
	while ( my ($id, $url) = each(%urls) ) {
		$debug && print "id: $id => url: $url\n";
		print $id . "\n";
	}
	exit(0);

} elsif($ARGV[0] and $ARGV[0] eq "cron") {
	# This thing is run by cron and should write a cache file for munin-node to
	# read from

	my $verbose=0;
	if(
		$ENV{MUNIN_DEBUG} eq "1" or
		$ARGV[1] and $ARGV[1] eq "verbose"
	) {
		$verbose=1;
		print "Verbose output\n";
	}

	my %urls=read_urls($url_file);

	while ( my ($id, $url) = each(%urls) ) {
		$verbose && print "Fetching $url (id: $id)... \n";

		# Values collected for this URL, keyed like the cache file
		my %output=();

		my $host="";
		if($url =~ m/\w+\:\/\/([^\/]+).*/){
			$host=$1;
			$verbose && print " Host: $host\n";
		}

		$output{"url"}=$url;
		$output{"timestamp"}=time();
		$verbose && print " Timestamp: " . $output{"timestamp"} . "\n";

		my $browser = LWP::UserAgent->new();

		$browser->agent($useragent);
		$browser->timeout(${timeout});
		$browser->max_redirect( $max_redirects );
		$browser->conn_cache(LWP::ConnCache->new());

		# Start the clock before the request is sent, exactly as for the
		# elements below, so that the page itself is part of the load time.
		my $t0 = [gettimeofday];
		my $response = $browser->get($url);
		$output{"loadtime_" . $host} += sprintf("%.6f",tv_interval ( $t0, [gettimeofday]));

		if ($response->is_success()) {
			$output{"elements_" . $host}+=1;
		}

		my $contents = $response->content();
		$output{"size_" . $host}+=length($contents);
		$output{"response_" . $host . "_" . $response->code}+=1;
		$output{"type_" . $response->content_type}+=1;

		# For <link />s, also capture the rel attribute
		$HTML::Tagset::linkElements{'link'} = [ qw( href rel ) ];
		my $page_parser = HTML::LinkExtor->new(undef, $url);
		$page_parser->parse($contents)->eof;
		my @links = $page_parser->links;
		$verbose && print " Processing links:\n";

		foreach my $link (@links){
			# Each link is [ tag, attribute => value, ... ]
			my (undef, %attrs) = @{$link};

			# The text after the last slash of the rel value (when it has
			# one) replaces the attribute name in the tag handed to filter()
			my $tag;
			if ($attrs{rel} =~ /.*\/([^\/]+)/) {
				$tag=$link->[0] . " " . $1;
			} else {
				$tag=$link->[0] . " " . $link->[1];
			}
			$output{"tags_" . $link->[0] . "-" . $link->[1]}+=1;

			if(filter($tag)){
				$verbose && print "  Processing: " . $link->[0] . " " . $link->[1] . " " . $link->[2] . "\n";

				# Extract the hostname and add it to the hash.
				# NOTE(review): a link that is not http(s) keeps the host of
				# the previous request -- confirm this is intended.
				if($link->[2] =~ m/https?\:\/\/([^\/]+).*/){
					$host=$1;
					$output{"elements_" . $host}+=1;
				}

				my $suburl=$link->[2];

				$t0 = [gettimeofday];
				$response = $browser->get($suburl);
				$output{"loadtime_" . $host} += sprintf("%.6f",tv_interval ( $t0, [gettimeofday]));

				$contents = $response->content();
				$output{"size_" . $host}+=length($contents);
				$output{"response_" . $host . "_" . $response->code}+=1;
				$output{"type_" . $response->content_type}+=1;

				$verbose && print "              Response: " . $response->code . " Size: " . length($contents) . "\n";
			} else {
				$verbose && print "  Skipping:   " . $link->[0] . " " . $link->[1] . " " . $link->[2] . "\n";
			}
		}

		my $cachefile=$cachedir . "/" . get_cache_file_name($scriptname,$id);
		$debug && print "Reading cache file: " . $cachefile . "... ";

		my %input=read_cache($cachefile);

		$debug && print "done\n";

		# Every field known from the previous run is first marked unknown
		# ("U"), so fields that vanished from the page stay declared.
		for my $key (keys %input) {
			$input{$key}="U";
		}

		# Adding new values
		while ( my ($key, $value) = each(%output) ) {
			$input{$key}=$value;
			$verbose && print " Result: " . $key . " -> " . $value . "\n";
		}

		# Writing the cache
		$verbose && print "Writing cache file: " . $cachefile . "... ";
		my $cache_fh;
		if(!open($cache_fh, '>', $cachefile)){
			warn "$scriptname: cannot write $cachefile: $!\n";
			next;
		}
		while ( my ($key, $value) = each(%input) ) {
			print {$cache_fh} $key . " " . $value . "\n";
		}
		# Buffered write errors only show up when the handle is closed
		close($cache_fh) or warn "$scriptname: cannot close $cachefile: $!\n";
		$verbose && print "done\n";
	}
	exit(0);
}elsif($ARGV[0] and $ARGV[0] eq "config") {
	my %urls=read_urls($url_file);

	$debug && print "Reading cache file\n";
	my $cachefile=$cachedir . "/" . get_cache_file_name($scriptname,$id);
	my %cache=read_cache($cachefile);

	$debug && print "The cache file contains " . keys(%cache) . " lines\n";

	# Graph type => sub printing its configuration; unknown types print nothing
	my %config_for = (
		size     => \&size_config,
		loadtime => \&loadtime_config,
		elements => \&elements_config,
		response => \&response_config,
		type     => \&type_config,
		tags     => \&tags_config,
		multi    => \&multi_config,
	);
	if(my $print_config = $config_for{$type}){
		$print_config->($id, \%urls, \%cache);
	}
	exit(0);
} else {
	my $cachefile=$cachedir . "/" . get_cache_file_name($scriptname,$id);
	$debug && print "Reading cache file: " . $cachefile . "\n";
	my %cache=read_cache($cachefile);
	$debug && print "Number of lines in cache file: " . keys(%cache) . "\n";

	if($type eq "multi"){
		multi_values($id, \%cache);
	} else {
		cache_values(\%cache, $type);
	}
}
777
778# vim:syntax=perl
779