1#!/usr/bin/perl -w 2 3# 4# This script reads access log in Squid format and prints "good" entries 5# Good entries are defined based on the --profile option. 6# 7# "country" profile (for building request interarrival distributions): 8# - US-based client IP addresses 9# 10# "server" profile (for building most server-side parameters) 11# - HTTP protocol 12# - 2xx and 3xx status codes 13# - GET, POST, and HEAD request methods 14# 15# "content" profile (for building content databases): 16# - HTTP protocol 17# - 200 status code 18# - GET request methods 19# - no query terms in request-URI 20# 21# The script also dumps statistics related to the above filtering choices 22# 23# "country" profile prerequisite, an IP::Country::Fast Perl module, can be 24# found at http://search.cpan.org/~nwetters/IP-Country-2.17/ 25# 26 27use strict; 28use integer; 29 30# grok profile 31my ($option, $Profile) = @ARGV or 32 die("usage: $0 --profile <country|server|content>\n"); 33die("unsupported option '$option'\n") unless $option eq '--profile'; 34die("unsupported profile '$Profile'\n") unless 35 ($Profile eq 'country' || $Profile eq 'server' || $Profile eq 'content'); 36shift @ARGV; shift @ARGV; 37 38my %Ips = (); 39my %Bads = (); 40my %Countries = (); 41my %Statuses = (); 42my %Protos = (); 43my %Methods = (); 44my ($cntEntry, $cntGoodEntry, $cntBad, 45 $cntIp, $cntGoodIp, 46 $cntStatus, $cntGoodStatus, 47 $cntUri, $cntGoodUri, 48 $cntCountry, $cntGoodCountry, 49 $cntMethod, $cntGoodMethod, 50 $cntProto, $cntGoodProto) = (0) x 15; 51 52my %GoodCountries = map { ($_ => 1) } qw(US); 53my %GoodMethods = map { ($_ => 1) } qw(GET HEAD POST); 54 55my $Registry; 56 57select(STDERR); 58 59while (<>) { 60 chomp; 61 ++$cntEntry; 62 &reportProgress() if $cntEntry % 1000 == 0; 63 64 my @fields = (split); 65 my @bad = (); 66 67 push @bad, 'FC' if @fields < 10; 68 69 # check response status code 70 ++$cntStatus; 71 my ($sc) = ($fields[3] =~ m|\w+/(\d+)|); 72 $sc = '??' unless defined $sc; 73 if (defined $Statuses{$sc}) { 74 ++$Statuses{$sc}; 75 } else { 76 $Statuses{$sc} = 1; 77 } 78 my $goodStatus = $Profile eq 'country'; 79 if ($Profile eq 'server') { 80 $goodStatus = $sc ne '??' && ($sc/100 == 2 || $sc/100 == 3); 81 } 82 elsif ($Profile eq 'content') { 83 $goodStatus = $sc eq '200'; 84 } 85 if ($goodStatus) { 86 ++$cntGoodStatus; 87 } else { 88 push @bad, 'SC'; 89 } 90 91 # check protocol 92 ++$cntProto; 93 my $uri = $fields[6]; 94 my ($proto) = ($uri =~ m|(\w+)://|); 95 $proto = '??' unless defined $proto; 96 if (defined $Protos{$proto}) { 97 ++$Protos{$proto}; 98 } else { 99 $Protos{$proto} = 1; 100 } 101 my $goodProto = $Profile eq 'country' || $proto eq 'http'; 102 if ($goodProto) { 103 ++$cntGoodProto; 104 } else { 105 push @bad, 'PRT'; 106 } 107 108 # check URI for query terms 109 ++$cntUri; 110 if ($Profile ne 'content' || $uri !~ /[\?\&]/) { 111 ++$cntGoodUri; 112 } else { 113 push @bad, 'URI'; 114 } 115 116 # check request method 117 ++$cntMethod; 118 my $method = $fields[5]; 119 $method = '??' unless defined $method; 120 if (defined $Methods{$method}) { 121 ++$Methods{$method}; 122 } else { 123 $Methods{$method} = 1; 124 } 125 my $goodMethod = $Profile eq 'country'; 126 if ($Profile eq 'server') { 127 $goodMethod = exists $GoodMethods{$method}; 128 } 129 elsif ($Profile eq 'content') { 130 $goodMethod = $method eq 'GET' 131 } 132 if ($goodMethod) { 133 ++$cntGoodMethod; 134 } else { 135 push @bad, 'MT'; 136 } 137 138 # check client country code 139 ++$cntCountry; 140 my ($ip, $cc) = ($fields[2] =~ m|([\-\.\d]+)/?(\w+)?|); 141 if (!defined $cc && $Profile eq 'country') { 142 # init IP registry if needed 143 require IP::Country::Fast; 144 $Registry = IP::Country::Fast->new() unless $Registry; 145 $cc = $Registry ? $Registry->inet_atocc($ip) : '??'; 146 } 147 $cc = '??' unless defined $cc; 148 if (defined $Countries{$cc}) { 149 ++$Countries{$cc}; 150 } else { 151 $Countries{$cc} = 1; 152 } 153 my $goodCC = $Profile ne 'country'; 154 if ($Profile eq 'country') { 155 $goodCC = !defined(%GoodCountries) || $GoodCountries{$cc}; 156 } 157 if ($goodCC) { 158 ++$cntGoodCountry; 159 } else { 160 push @bad, 'CC'; 161 } 162 163 # maintain an IP:quality map 164 if (exists $Ips{$ip}) { 165 if ($Ips{$ip}) { 166 if (@bad) { 167 $Ips{$ip} = 0; 168 } else { 169 $Ips{$ip} = $fields[0]; 170 } 171 } 172 # enable to support good IPs 173 # else { 174 # push @bad, 'IP'; 175 # } 176 } else { 177 $Ips{$ip} = @bad ? 0 : $fields[0]; 178 ++$cntIp; 179 } 180 181 if (@bad) { 182 &recordBads(\@bad); 183 } else { 184 ++$cntGoodEntry; 185 } 186 187 # skip bad entries 188 next if @bad; 189 190 # skip bad IPs 191 # next unless $Ips{$ip}; 192 193 print(STDOUT $_, "\n"); 194} 195&reportProgress(); 196 197foreach my $sc (sort { $Statuses{$b} <=> $Statuses{$a} } keys %Statuses) { 198 printf("SC: %-3s %6d %6.2f\n", $sc, $Statuses{$sc}, 199 &percent($Statuses{$sc}, $cntStatus)); 200} 201print("\n"); 202 203foreach my $proto (sort { $Protos{$b} <=> $Protos{$a} } keys %Protos) { 204 printf("PRT: %-15s %6d %6.2f\n", $proto, $Protos{$proto}, 205 &percent($Protos{$proto}, $cntProto)); 206} 207print("\n"); 208 209printf("URI: %-5s %6d %6.2f\n", 'good', $cntGoodUri, 210 &percent($cntGoodUri, $cntUri)); 211printf("URI: %-5s %6d %6.2f\n", 'bad', $cntUri-$cntGoodUri, 212 &percent($cntUri-$cntGoodUri, $cntUri)); 213print("\n"); 214 215foreach my $method (sort { $Methods{$b} <=> $Methods{$a} } keys %Methods) { 216 printf("MT: %-10s %6d %6.2f\n", $method, $Methods{$method}, 217 &percent($Methods{$method}, $cntMethod)); 218} 219print("\n"); 220 221foreach my $cc (sort { $Countries{$b} <=> $Countries{$a} } keys %Countries) { 222 printf("CC: %2s %6d %6.2f\n", $cc, $Countries{$cc}, 223 &percent($Countries{$cc}, $cntCountry)); 224} 225print("\n"); 226 227printf("entry: %-5s %6d %6.2f\n", 'good', $cntGoodEntry, 228 &percent($cntGoodEntry, $cntEntry)); 229printf("entry: %-5s %6d %6.2f\n", 'bad', $cntEntry-$cntGoodEntry, 230 &percent($cntEntry-$cntGoodEntry, $cntEntry)); 231print("\n"); 232 233$cntGoodIp = scalar grep { $_ } values %Ips; 234printf("IPs: %-5s %6d %6.2f\n", 'good', $cntGoodIp, 235 &percent($cntGoodIp, $cntIp)); 236printf("IPs: %-5s %6d %6.2f\n", 'bad', $cntIp-$cntGoodIp, 237 &percent($cntIp-$cntGoodIp, $cntIp)); 238print("\n"); 239 240foreach my $bas (sort { $Bads{$b} <=> $Bads{$a} } keys %Bads) { 241 printf("Bads: %-3s %6d %6.2f\n", $bas, $Bads{$bas}, 242 &percent($Bads{$bas}, $cntBad)); 243} 244 245 246 247 248 249 250exit(0); 251 252sub recordBads { 253 my $bads = shift; 254 foreach my $b (@{$bads}) { 255 $Bads{$b} = 0 unless defined $Bads{$b}; 256 ++$Bads{$b}; 257 ++$cntBad; 258 } 259} 260 261sub reportProgress { 262 printf("#Klines: %03d IPs: %3d SC: %6.2f PRT: %6.2f URI: %6.2f MT: %6.2f CC: %6.2f\n", 263 $cntEntry/1000, 264 $cntIp, 265 &percent($cntGoodStatus, $cntStatus), 266 &percent($cntGoodProto, $cntProto), 267 &percent($cntGoodUri, $cntUri), 268 &percent($cntGoodMethod, $cntMethod), 269 &percent($cntGoodCountry, $cntCountry)); 270} 271 272sub percent { 273 my ($part, $whole) = @_; 274 $whole = $cntEntry unless defined $whole; 275 return -1 unless $whole && defined($part); 276 no integer; 277 return 100. * $part/$whole; 278} 279