1#!/usr/bin/perl -w
2
3#
4# This script reads access log in Squid format and prints "good" entries
5# Good entries are defined based on the --profile option.
6#
7# "country" profile (for building request interarrival distributions):
8#	- US-based client IP addresses
9#
10# "server" profile (for building most server-side parameters)
11#	- HTTP protocol
12#	- 2xx and 3xx status codes
13#	- GET, POST, and HEAD request methods
14#
15# "content" profile (for building content databases):
16#	- HTTP protocol
17#	- 200 status code
18#	- GET request methods
19#	- no query terms in request-URI
20#
21# The script also dumps statistics related to the above filtering choices
22#
23# "country" profile prerequisite, an IP::Country::Fast Perl module, can be
24# found at http://search.cpan.org/~nwetters/IP-Country-2.17/
25#
26
27use strict;
28use integer;
29
30# grok profile
31my ($option, $Profile) = @ARGV or
32	die("usage: $0 --profile <country|server|content>\n");
33die("unsupported option '$option'\n") unless $option eq '--profile';
34die("unsupported profile '$Profile'\n") unless
35	($Profile eq 'country' || $Profile eq 'server' || $Profile eq 'content');
36shift @ARGV; shift @ARGV;
37
38my %Ips = ();
39my %Bads = ();
40my %Countries = ();
41my %Statuses = ();
42my %Protos = ();
43my %Methods = ();
44my ($cntEntry, $cntGoodEntry, $cntBad,
45	$cntIp, $cntGoodIp,
46	$cntStatus, $cntGoodStatus,
47	$cntUri, $cntGoodUri,
48	$cntCountry, $cntGoodCountry,
49	$cntMethod, $cntGoodMethod,
50	$cntProto, $cntGoodProto) = (0) x 15;
51
52my %GoodCountries = map { ($_ => 1) } qw(US);
53my %GoodMethods = map { ($_ => 1) } qw(GET HEAD POST);
54
55my $Registry;
56
57select(STDERR);
58
59while (<>) {
60	chomp;
61	++$cntEntry;
62	&reportProgress() if $cntEntry % 1000 == 0;
63
64	my @fields = (split);
65	my @bad = ();
66
67	push @bad, 'FC' if @fields < 10;
68
69	# check response status code
70	++$cntStatus;
71	my ($sc) = ($fields[3] =~ m|\w+/(\d+)|);
72	$sc = '??' unless defined $sc;
73	if (defined $Statuses{$sc}) {
74		++$Statuses{$sc};
75	} else {
76		$Statuses{$sc} = 1;
77	}
78	my $goodStatus = $Profile eq 'country';
79	if ($Profile eq 'server') {
80		$goodStatus = $sc ne '??' && ($sc/100 == 2 || $sc/100 == 3);
81	}
82	elsif ($Profile eq 'content') {
83		$goodStatus = $sc eq '200';
84	}
85	if ($goodStatus) {
86		++$cntGoodStatus;
87	} else {
88		push @bad, 'SC';
89	}
90
91	# check protocol
92	++$cntProto;
93	my $uri = $fields[6];
94	my ($proto) = ($uri =~ m|(\w+)://|);
95	$proto = '??' unless defined $proto;
96	if (defined $Protos{$proto}) {
97		++$Protos{$proto};
98	} else {
99		$Protos{$proto} = 1;
100	}
101	my $goodProto = $Profile eq 'country' || $proto eq 'http';
102	if ($goodProto) {
103		++$cntGoodProto;
104	} else {
105		push @bad, 'PRT';
106	}
107
108	# check URI for query terms
109	++$cntUri;
110	if ($Profile ne 'content' || $uri !~ /[\?\&]/) {
111		++$cntGoodUri;
112	} else {
113		push @bad, 'URI';
114	}
115
116	# check request method
117	++$cntMethod;
118	my $method = $fields[5];
119	$method = '??' unless defined $method;
120	if (defined $Methods{$method}) {
121		++$Methods{$method};
122	} else {
123		$Methods{$method} = 1;
124	}
125	my $goodMethod = $Profile eq 'country';
126	if ($Profile eq 'server') {
127		$goodMethod = exists $GoodMethods{$method};
128	}
129	elsif ($Profile eq 'content') {
130		$goodMethod = $method eq 'GET'
131	}
132	if ($goodMethod) {
133		++$cntGoodMethod;
134	} else {
135		push @bad, 'MT';
136	}
137
138	# check client country code
139	++$cntCountry;
140	my ($ip, $cc) = ($fields[2] =~ m|([\-\.\d]+)/?(\w+)?|);
141	if (!defined $cc && $Profile eq 'country') {
142		# init IP registry if needed
143		require IP::Country::Fast;
144		$Registry = IP::Country::Fast->new() unless $Registry;
145		$cc = $Registry ? $Registry->inet_atocc($ip) : '??';
146	}
147	$cc = '??' unless defined $cc;
148	if (defined $Countries{$cc}) {
149		++$Countries{$cc};
150	} else {
151		$Countries{$cc} = 1;
152	}
153	my $goodCC = $Profile ne 'country';
154	if ($Profile eq 'country') {
155		$goodCC = !defined(%GoodCountries) || $GoodCountries{$cc};
156	}
157	if ($goodCC) {
158		++$cntGoodCountry;
159	} else {
160		push @bad, 'CC';
161	}
162
163	# maintain an IP:quality map
164	if (exists $Ips{$ip}) {
165		if ($Ips{$ip}) {
166			if (@bad) {
167				$Ips{$ip} = 0;
168			} else {
169				$Ips{$ip} = $fields[0];
170			}
171		}
172		# enable to support good IPs
173		# else {
174		# 	push @bad, 'IP';
175		# }
176	} else {
177		$Ips{$ip} = @bad ? 0 : $fields[0];
178		++$cntIp;
179	}
180
181	if (@bad) {
182		&recordBads(\@bad);
183	} else {
184		++$cntGoodEntry;
185	}
186
187	# skip bad entries
188	next if @bad;
189
190	# skip bad IPs
191	# next unless $Ips{$ip};
192
193	print(STDOUT $_, "\n");
194}
195&reportProgress();
196
197foreach my $sc (sort { $Statuses{$b} <=> $Statuses{$a} } keys %Statuses) {
198	printf("SC: %-3s %6d %6.2f\n", $sc, $Statuses{$sc},
199		&percent($Statuses{$sc}, $cntStatus));
200}
201print("\n");
202
203foreach my $proto (sort { $Protos{$b} <=> $Protos{$a} } keys %Protos) {
204	printf("PRT: %-15s %6d %6.2f\n", $proto, $Protos{$proto},
205		&percent($Protos{$proto}, $cntProto));
206}
207print("\n");
208
209printf("URI: %-5s %6d %6.2f\n", 'good', $cntGoodUri,
210	&percent($cntGoodUri, $cntUri));
211printf("URI: %-5s %6d %6.2f\n", 'bad', $cntUri-$cntGoodUri,
212	&percent($cntUri-$cntGoodUri, $cntUri));
213print("\n");
214
215foreach my $method (sort { $Methods{$b} <=> $Methods{$a} } keys %Methods) {
216	printf("MT: %-10s %6d %6.2f\n", $method, $Methods{$method},
217		&percent($Methods{$method}, $cntMethod));
218}
219print("\n");
220
221foreach my $cc (sort { $Countries{$b} <=> $Countries{$a} } keys %Countries) {
222	printf("CC: %2s %6d %6.2f\n", $cc, $Countries{$cc},
223		&percent($Countries{$cc}, $cntCountry));
224}
225print("\n");
226
227printf("entry: %-5s %6d %6.2f\n", 'good', $cntGoodEntry,
228	&percent($cntGoodEntry, $cntEntry));
229printf("entry: %-5s %6d %6.2f\n", 'bad', $cntEntry-$cntGoodEntry,
230	&percent($cntEntry-$cntGoodEntry, $cntEntry));
231print("\n");
232
233$cntGoodIp = scalar grep { $_ } values %Ips;
234printf("IPs: %-5s %6d %6.2f\n", 'good', $cntGoodIp,
235	&percent($cntGoodIp, $cntIp));
236printf("IPs: %-5s %6d %6.2f\n", 'bad', $cntIp-$cntGoodIp,
237	&percent($cntIp-$cntGoodIp, $cntIp));
238print("\n");
239
240foreach my $bas (sort { $Bads{$b} <=> $Bads{$a} } keys %Bads) {
241	printf("Bads: %-3s %6d %6.2f\n", $bas, $Bads{$bas},
242		&percent($Bads{$bas}, $cntBad));
243}
244
245
246
247
248
249
250exit(0);
251
252sub recordBads {
253	my $bads = shift;
254	foreach my $b (@{$bads}) {
255		$Bads{$b} = 0 unless defined $Bads{$b};
256		++$Bads{$b};
257		++$cntBad;
258	}
259}
260
261sub reportProgress {
262	printf("#Klines: %03d IPs: %3d  SC: %6.2f PRT: %6.2f URI: %6.2f MT: %6.2f CC: %6.2f\n",
263		$cntEntry/1000,
264		$cntIp,
265		&percent($cntGoodStatus, $cntStatus),
266		&percent($cntGoodProto, $cntProto),
267		&percent($cntGoodUri, $cntUri),
268		&percent($cntGoodMethod, $cntMethod),
269		&percent($cntGoodCountry, $cntCountry));
270}
271
272sub percent {
273	my ($part, $whole) = @_;
274	$whole = $cntEntry unless defined $whole;
275	return -1 unless $whole && defined($part);
276	no integer;
277	return 100. * $part/$whole;
278}
279