#!/usr/local/bin/perl
#
# LightSquid Project (c) 2004-2005 Sergey Erokhin aka ESL
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# detail see in gnugpl.txt

# Parse access.log and
# make a per-user report in the 'report' directory.
#
# usage: lightparse.pl {param}
#   param omitted            - parse the full access.log file
#   today                    - only the current day
#   yesterday                - only yesterday
#   date in format YYYYMMDD  - parse that day
#   access.log.{\d}.{gz|bz2} - parse that file (for processing archives)

# function prototypes
sub MakeReport();
sub InitSkipUser();
sub getLPS($$);
sub LockLSQ();
sub UnLockLSQ();
sub LOCKREMOVER();

use File::Basename;
use Time::Local;

push (@INC,(fileparse($0))[1]);

require "/usr/local/etc/lightsquid/lightsquid.cfg";
require "common.pl";

#include ip2name function
require "$ip2namepath/ip2name.$ip2name";

$SIG{INT} = \&LOCKREMOVER;              # traps keyboard interrupt
my $lockfilepath = "$lockpath/lockfile";

# True only while THIS process owns the lock file.  The END block uses it
# so that a fatal error (die) after LockLSQ() does not leave a stale lock
# behind, while an instance that failed to get the lock never removes the
# lock of the instance that is still running.
my $lockheld = 0;
END { unlink $lockfilepath if ($lockheld); }

my $skipurlcntr        = 0;
my $skip4xxcntr        = 0;
my $skipfilterdatecntr = 0;

my $firstrun    = 1;
my $totallines  = 0;
my $parsedlines = 0;
my $daylines    = 0;

my $catname  = "cat";
my $filename = "access.log";

undef $workday;

exit unless (LockLSQ());                #Lock LSQ (block multiple instance)

if ($skipurl eq "") {
    $skipurl = "skipurl MUST be defined!!!";
    print "WARNING !!! \$skipurl is empty\n";
}

($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime;
$month = sprintf("%02d",$mon+1);

# Date window (epoch seconds) applied to native squid logs.
# The default is "everything": the upper bound used to be hard-coded to
# 2020-12-31, which silently dropped every newer record in full-log mode.
my $filterdatestart = 0;
my $filterdatestop  = 1e15;

$fToday=1 if ($ARGV[0] eq "today");
$fToday=1 if ($ARGV[0] eq "yesterday");

if ($fToday) {
    ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime;

    $filterdate      = sprintf("%04d%02d%02d",$year+1900,$mon+1,$mday);
    $filterdatestart = timelocal( 0, 0, 0,$mday,$mon,$year);
    $filterdatestop  = timelocal(59,59,23,$mday,$mon,$year);
    print ">>> filter today: $filterdate\n" if ($debug);
}

if ($ARGV[0] eq "yesterday") {
    # Step back 12 hours from today's midnight: that always lands inside
    # yesterday, even when a DST switch makes the day 23 or 25 hours long.
    # The window is then rebuilt from that calendar date instead of
    # subtracting a fixed 24*60*60 seconds.
    ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime($filterdatestart-(12*60*60));
    $filterdate      = sprintf("%04d%02d%02d",$year+1900,$mon+1,$mday);
    $filterdatestart = timelocal( 0, 0, 0,$mday,$mon,$year);
    $filterdatestop  = timelocal(59,59,23,$mday,$mon,$year);
    print ">>> filter yesterday: $filterdate\n" if ($debug);
}

if ($ARGV[0] =~ m/^(\d\d\d\d)(\d\d)(\d\d)$/) {
    $filterdate      = $ARGV[0];
    $filterdatestart = timelocal( 0, 0, 0,$3,$2-1,$1);
    $filterdatestop  = timelocal(59,59,23,$3,$2-1,$1);
    print ">>> filter date: $filterdate\n" if ($debug);
}

if ($ARGV[0] =~ m/access\.log\.(\d)/) {
    $filename = $ARGV[0];
    $catname  = "zcat"  if ($ARGV[0] =~ m/\.gz$/);
    $catname  = "bzcat" if ($ARGV[0] =~ m/\.bz2$/);
}

print ">>> use file :: $logpath/$filename\n" if ($debug);

# Read the log through cat/zcat/bzcat.  The list form of the pipe open
# runs the command without a shell, so a file name taken from @ARGV cannot
# inject shell syntax.  The old `open ... || die` could never die because
# `||` bound to the string; check readability explicitly and use `or`.
my $logfile = "$logpath/$filename";
die "can't access log file\n" unless (-r $logfile);
open(my $logfh, "-|", $catname, $logfile) or die "can't access log file\n";

InitSkipUser();

StartIp2Name();

# Banner/counter/file-hosting systems whose many sub-domains are folded
# into a single "www.<domain>" site.  Patterns are compiled once and are
# applied in this order for every log line.
my @aggregators = map { qr/([a-z]+:\/\/)??.*\.($_)/ } (
    'spylog\.com',
    'yimg\.com',
    'adriver\.ru',
    'bannerbank\.ru',
    'mail\.ru',
    'adnet\.ru',
    'rapidshare\.de',
    'rapidshare\.com',
    'vkontakte\.ru',
    'odnoklasniki\.ru',
);

# big-file records are kept in the ARRAY @bigfile (see ".bigfile support")
@bigfile = (); $bigfilecnt=0;
while (<$logfh>) {
    chomp;
    $totallines++;

    if (0 == $squidlogtype) {
        #squid native log
        #970313965.619 1249 denis.local TCP_MISS/200 2598 GET http://www.emalecentral.com/tasha/thm_4374x013.jpg - DIRECT/www.emalecentral.com image/jpeg
        # timestamp elapsed host type size method url user hierarchy type

        #speed optimization for FILTERDATE mode: test the raw prefix before split
        $Ltimestamp=substr $_,0,11;
        if ($Ltimestamp<$filterdatestart or $Ltimestamp>$filterdatestop) {
            print ">>>> skipDafteFilter URL $Lurl\n$_" if ($debug2 >= 2 );
            $skipfilterdatecntr++;
            next;
        };

        ($Ltimestamp,$Lelapsed,$Lhost,$Ltype,$Lsize,$Lmethod,$Lurl,$Luser,$Lhierarchy,$Lconttype,@Lrest)=split;

        #check row with invalid record
        if ( ($#Lrest >= 0) && ($#Lrest < 4) ) {
            $str=$_;
            #maybe two concatenated record (first - truncated)
            if ($str =~ m/(\d+\.\d+\s+\d+\s+(\d{1,3}\.){3}\d{1,3}\s+\w+\/\d+\s+\d+\w+\s+\S+\s+\S+\s+\S+\s+\w+\/\S+\s(-|([a-zA-Z\-]+\/[a-zA-Z\-]+)))$/) {
                $newstr=$1;
                ($Ltimestamp,$Lelapsed,$Lhost,$Ltype,$Lsize,$Lmethod,$Lurl,$Luser,$Lhierarchy,$Lconttype)=split /\s+/,$newstr;
            } else {
                # maybe source url contain SPACES, try concatenate ...
                while ($#Lrest != -1) {
                    $Lurl.="_$Luser";$Luser=$Lhierarchy;$Lhierarchy=$Lconttype;$Lconttype=shift @Lrest;
                }
                #do some sanity check
                unless (($Lhierarchy =~ m/\w+\/\S+/) and ($Lconttype =~ m/-|([a-zA-Z\-]+\/[a-zA-Z\-]+)/)) {
                    $notrecoveredlines++;
                    next;
                }
            }
            $recoveredlines++;
        }

        # Derive the date only after recovery, so that a record recovered
        # from two concatenated lines is dated by its own timestamp.
        ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime($Ltimestamp);
        $mon++;                         #fix, month start from 0
        $date = sprintf("%04d%02d%02d",$year+1900,$mon,$mday);
    } else {
        #emulated httpd log
        #192.168.3.40 - - [15/Apr/2005:11:46:35 +0300] "GET http://mail.yandex.ru/mboxjscript? HTTP/1.0" 200 2262 TCP_MISS :DIRECT
        #192.168.3.40 - - [15/Apr/2005:11:46:35 +0300] "GET http://css.yandex.ru/css/mail/search.js HTTP/1.0" 200 4199 TCP_HIT:NONE
        #192.168.3.12 - - [15/Apr/2005:11:46:35 +0300] "CONNECT aero.lufthansa.com:443 HTTP/1.0" 200 35992 TCP_MISS:DIRE
        # ($Lhost, $Luser,$Luser2,$Ldate, $u2, $Lmethod,$Lurl, $u3, $Ltype,$Lsize,$u4)=split

        ($Lhost,$Luser,$Luser2,$Ldate,$u2,$Lmethod,$Lurl,$u3,$Ltype,$Lsize,$u4)=split;

        # Without this check a malformed line would silently reuse the
        # capture variables left over from an earlier successful match.
        unless ($Ldate =~ m#^\[(\d\d)/(...)/(\d\d\d\d):(\d\d):(\d\d):(\d\d)#) {
            $notrecoveredlines++;
            next;
        }
        # %month2dec is defined outside this file; $mon is used as-is in the
        # date below, so it presumably maps "Apr" -> "04" (TODO confirm).
        $mday=$1;$mon=$month2dec{$2};$year=$3-1900;
        $hour=$4;$min=$5;$sec=$6;

        $date = sprintf("%04d%02d%02d",$year+1900,$mon,$mday);
        if ($filterdate) {
            if ($date ne $filterdate) {
                print ">>>> skipDafteFilter URL $Lurl\n$_" if ($debug2 >= 2 );
                $skipfilterdatecntr++;
                next;
            };
        }

        if (($Luser eq "-") && ($Luser2 ne "-")) {
            $Luser = $Luser2;
        }

        # $u4 is normally "RESULT:HIERARCHY".  When the log has a space
        # before the colon, split leaves just "RESULT" in $u4; use it whole
        # instead of a stale $1.
        my $result = ($u4 =~ m/(.*?)\s?:(.*)/) ? $1 : $u4;
        $Ltype = "$result/$Ltype";
    } #if ($squidlogtype)

    if ($year < 2000-1900) {            #invalid record
        print ">>>> skipn Bad Year $Lurl\n$_" if ($debug2 >= 1 );
        $skipbadyear++;
        next;
    }

    #skip intranet
    if ($Lurl =~ m/$skipurl/o) {
        print ">>>> skipURL $Lurl\n$_" if ($debug2 >= 2 );
        $skipurlcntr++;
        next;
    };

    # skip Access denied records (TODO: report)
    if ($Ltype =~ m#DENIED#io) {
        $skipDenied++;
        print ">>>> skipDenied $Ltype\n$_" if ($debug2 >= 2);
        next;
    };

    if ($Ltype =~ m/(HIT|UNMODIFIED)/) {
        $CacheHIT+=$Lsize;
    } else {
        $CacheMISS+=$Lsize;
    }

    $parsedlines++;

    if ($date ne $workday) {            # close prev day, prepare for new
        if ($firstrun) {
            undef $firstrun;
            $workday=$date;
        } else {
            MakeReport();
            undef %totalsize; undef %sitesize; undef %sitehit;undef %totalhit;undef %totalputpost;
            undef %hashhost;undef %hashname;
            @bigfile = (); $bigfilecnt=0;
            undef %sitetime;undef %sitetimesize;
            $daylines=0;
            $workday=$date;
            $sqlreq=0;
            $CacheHIT=0;$CacheMISS=0;
        }
    }
    $daylines++;

    $user=lc $Luser;

    $user = Ip2Name($Lhost,$user,$Ltimestamp);

    next if (defined $hSkipUser{$user});

    #simplified some common banner system & counters
    $url=$Lurl;
    foreach my $aggregator (@aggregators) {
        $url =~ s/$aggregator/$1www.$2/;
    }

    #extract site name
    if ($url =~ m/([a-z]+:\/\/)??([a-z0-9\-]+\.){1}(([a-z0-9\-]+\.){0,})([a-z0-9\-]+){1}(:[0-9]+)?\/(.*)/o) {
        $site=$2.$3.$5;
    } else {
        $site=$Lurl;
    }

    $site=$Lurl if ($site eq "");

    $totalsize    {$user} +=$Lsize;
    $totalhit     {$user} ++;
    $totalputpost {$user} +=$Lsize if (($Lmethod eq "PUT") or ($Lmethod eq "POST"));
    $sitesize     {$user}{$site}+=$Lsize;
    $sitehit      {$user}{$site}++;

    $sitetime     {$user}{$site}[$hour]+=$Lelapsed;
    $sitetimesize {$user}{$site}[$hour]+=$Lsize;

    #.bigfile support
    if ($Lsize > $bigfilelimit) {
        $bigfile [$bigfilecnt]{date}=sprintf("%02d:%02d:%02d",$hour,$min,$sec);
        $bigfile [$bigfilecnt]{link}=$Lurl;
        $bigfile [$bigfilecnt]{size}=$Lsize;
        $bigfile [$bigfilecnt]{user}=$user;
        $bigfilecnt++;
    }
}
close $logfh;

MakeReport();
StopIp2Name();
UnLockLSQ();

if ($debug) {
    $worktime = ( time() - $^T );
    print "run TIME: $worktime sec\n";
    print "LightSquid parser statistic report\n\n";
    printf( " %10u lines processed (average %.2f lines per second)\n",
            $totallines, getLPS( $worktime, $totallines ) );
    printf( " %10u lines parsed\n", $parsedlines );
    printf( " %10u lines recovered\n", $recoveredlines );
    printf( " %10u lines notrecovered\n", $notrecoveredlines );
    printf( " %10u lines skiped by bad year\n", $skipbadyear );
    printf( " %10u lines skiped by date filter\n", $skipfilterdatecntr );
    printf( " %10u lines skiped by Denied filter\n", $skipDenied );
    printf( " %10u lines skiped by skipURL filter\n", $skipurlcntr );

    if ( $parsedlines == 0 ) {
        print "\nWARNING !!!!, parsed 0 lines from total : $totallines\n";
        print "please check confiuration !!!!\n";
        print "may be wrong log format selected ?\n";
    }

}



# The END ---------------------------------------------------------

##Subroutines

# getLPS(time, lines): return the lines-per-second value.
# A zero (or empty) time or line count is replaced by 1, so the division
# can never fail.
sub getLPS($$) {
    my $time=shift;
    my $lines=shift;
    $time||=1;
    $lines||=1;
    return ($lines/$time);
}

# MakeReport(): write the report for the day in $workday from the global
# accumulators filled by the main loop.
# Does nothing when fewer than 2 lines were counted for the day.
# Writes into "$reportpath/$workday":
#   .total     - user count, total size and one line per user
#   <user>     - per-user list of sites (plus 24 hourly columns if $timereport)
#   .features  - over-limit user count, cache statistics, modification time
#   .bigfiles  - requests larger than the big-file limit (only if any)
#   .overuser  - users at or above $perusertrafficlimit (only if any)
# Then calls CreateGroupFile and CreateRealnameFile (defined outside this file).
# Dies when the directory or one of the files cannot be created.
sub MakeReport() {
    #generate report
    #use global var

    return if ($daylines < 2);

    print ">>> Make Report $workday ($daylines - log line parsed)\n" if ($debug);

    $reppath="$reportpath/$workday";

    unless ( -d $reppath ) {
        mkdir $reppath, 0755 or die "Can't create dir '$reppath': $!";
    }

    # NOTE: every open below uses `or die`; with `||` the die was attached
    # to the file name string and could never be reached.
    open(my $totalfh, ">", "$reppath/.total") or die "can't create file $reppath/.total - $!";

    $tmp="";$tmpsize=0;$tmpuser=0;$tmpoveruser=0;

    foreach my $tuser (sort {$totalsize{$b} <=> $totalsize{$a}} keys %totalsize) {
        $totalputpost{$tuser}+=0;       #prevent empty value
        $tmp.=sprintf("%-20s %15s %15s %15s\n",$tuser,$totalsize{$tuser},$totalhit{$tuser},$totalputpost{$tuser});
        $tmpuser++;
        $tmpsize+=$totalsize{$tuser};
        $tmpoveruser++ if ($totalsize{$tuser} >= $perusertrafficlimit);

        open(my $repfh, ">", "$reppath/$tuser") or die "can't create file $reppath/$tuser - $!";

        print {$repfh} "total: $totalsize{$tuser}\n";

        foreach my $tsite (sort {$sitesize{$tuser}{$b} <=> $sitesize{$tuser}{$a}} keys %{$sitesize{$tuser}} ) {
            printf {$repfh} ("%-29s %12s %10s\t",$tsite,$sitesize{$tuser}{$tsite},$sitehit{$tuser}{$tsite});
            if ($timereport != 0) {
                # Lexical counter: the global $hour belongs to the log line
                # currently being parsed and must not be left at 24 here.
                for (my $h=0;$h<24;$h++) {
                    printf {$repfh} ("%d-%s ",int($sitetime{$tuser}{$tsite}[$h]/3600),$sitetimesize{$tuser}{$tsite}[$h]+0);
                }
            }
            print {$repfh} "\n";
        }
        close $repfh;
    }

    $CacheMISS=1 if ($CacheMISS == 0);  # keeps the cachehit% division defined

    print {$totalfh} "user: $tmpuser\n";
    print {$totalfh} "size: $tmpsize\n";

    print {$totalfh} "$tmp";
    close $totalfh;

    my ($sec_,$min_,$hour_,$mday_,$mon_,$year_,$wday_,$yday_,$isdst_) = localtime;$mon_++;$year_+=1900;
    my $moddate=sprintf("%02d:%02d",$hour_,$min_)." :: $mday_ $MonthName[$mon_] $year_";

    open(my $featfh, ">", "$reppath/.features") or die "can't create file $reppath/.features - $!";
    print {$featfh} "overuser: $tmpoveruser\n";
    print {$featfh} "cachehit%: ".sprintf("%3.2f",($CacheHIT*100)/($CacheHIT+$CacheMISS))."\n";
    print {$featfh} "cachehit: $CacheHIT\n";
    print {$featfh} "cachemiss: $CacheMISS\n";
    print {$featfh} "cacheall: ".($CacheHIT+$CacheMISS)."\n";
    print {$featfh} "modification: $moddate\n";
    close $featfh;

    unlink "$reppath/.bigfiles";
    if ($bigfilecnt != 0) {
        open(my $bigfh, ">", "$reppath/.bigfiles") or die "can't create file $reppath/.bigfiles - $!";
        for (my $i=0;$i<$bigfilecnt;$i++) {
            print {$bigfh} "$bigfile[$i]{user}\t$bigfile[$i]{date}\t$bigfile[$i]{size}\t$bigfile[$i]{link}\n";
        }
        close $bigfh;
    }

    #create list of user that use more than $perusertrafficlimit bytes
    unlink "$reppath/.overuser";
    if ($tmpoveruser) {
        open(my $overfh, ">", "$reppath/.overuser") or die "can't create file $reppath/.overuser - $!";
        foreach my $tuser (sort {$totalsize{$b} <=> $totalsize{$a}} keys %totalsize) {
            print {$overfh} "$tuser\t$totalsize{$tuser}\n" if ($totalsize{$tuser} >= $perusertrafficlimit);
        }
        close $overfh;
    }

    CreateGroupFile($reppath);
    CreateRealnameFile($reppath);
}

# InitSkipUser(): load "$cfgpath/skipuser.cfg" into %hSkipUser.
# Every line that does not start with '#' becomes a key; users found in
# that hash are left out of the report by the main loop.
# The file is optional: when it cannot be opened nothing is loaded.
sub InitSkipUser() {
    open(my $skipfh, "<", "$cfgpath/skipuser.cfg") or return;
    while (my $line = <$skipfh>) {
        chomp $line;
        next if ($line =~ /^#/);
        $hSkipUser{$line}=1;
    }
    close $skipfh;
}

# Lock support

# LockLSQ(): take the lock file that prevents several parser instances.
# Returns 1 when the lock was created by this process.
# Returns 0 when a lock file younger than $maxlocktime seconds exists.
# An older lock file is reported, removed and replaced.
# Dies when the lock file cannot be read or created.
sub LockLSQ() {
    if (-f "$lockfilepath") {
        #read data from `lockfile`
        print STDERR "Warning, `$lockfilepath` exist, maybe anoter process running !\n";
        open(my $oldlock, "<", "$lockfilepath") or die "can't read lock file `$lockfilepath`\n";
        my $pid=<$oldlock>;chomp $pid;$pid =~ s/PID: //;
        my $ts =<$oldlock>;chomp $ts ;$ts =~ s/Timestamp: //;
        close $oldlock;
        #check timedelta
        my $tsdelta=time - $ts;
        print STDERR "LockPID : $pid\n" ;
        print STDERR "tsdelta : $tsdelta second(s) (maxlocktime: $maxlocktime)\n";

        return 0 if ($tsdelta<$maxlocktime);

        print STDERR "OLD lock file ignored and removed!\n";
        UnLockLSQ();
    }

    open(my $newlock, ">", "$lockfilepath") or die "can't create lock file `$lockfilepath`\n";
    print {$newlock} "PID: $$\n";
    my $now=time;
    print {$newlock} "Timestamp: $now\n";
    print {$newlock} "Creation time: ".localtime($now)."\n";
    close $newlock;

    $lockheld = 1;                      # from now on END cleans up after a die

    return 1;
}

# UnLockLSQ(): remove the lock file; dies when it cannot be removed.
sub UnLockLSQ() {
    unlink $lockfilepath or die "can't remove lock file `$lockfilepath`\n";
    $lockheld = 0;
}

# LOCKREMOVER(): SIGINT handler - remove the lock file and exit.
sub LOCKREMOVER() {
    print "INT happents, remove LOCK\n";
    UnLockLSQ();
    exit;
}

__END__
2004-04-23     : initial version
2004-09-01 FIX : error in parse invalid file
2004-09-09 ADD : add create .bigfile file contain links greater $bigfilelimit
2004-11-08 ADD : skip 4xx records (dirty :-() TODO: do error report
2004-11-09 ADD : use DB only if not define user name...
2005-04-13 ADD : LightSquid publication cleanup
2005-04-14 ADD : $debug and $debug2 variable for generate statistic
               : if parsed lines = 0 print WARNING
2005-04-17 ADD : add support for HTTPD-like log file
2005-04-19 ADD : add .bz2 support
               : add cache hit calculation (if Ltype contain HIT - hit else - MISS), wrong ??
               : add oversized user calculation
2005-04-20 ADD : .features file added, with additional info
2005-04-22 FIX : httpdlike parser bug;
           FIX : mkdir 655 -> mkdir 755
2005-04-30 ADD : Rewrite archive support, now support access.log.{D},access.log.{D}.gz,access.log.{D}.bz2
           ADD : time report
2005-05-03 FIX : fix wrong .features file output
2005-05-12 FIX : empty line report only if $debug
           FIX : date filter now ^\d\d\d\d\d\d\d\d$ ...
2005-11-21 FIX : cosmetic changes
2006-07-02 ADD : try recovery some type of broken log record (url contain spaces, two concatenated record)
               : fix negative number in user file (printf -%d <2g $u <4g), now use simple print
2006-07-05 ADD : Put & Post added into .total file
2006-07-10 ADD : SkipUser support
               : GetNameByIP -> IP2NAME (see doc)
               : $cfgpath in config
               : .features modification: parameter support
2006-07-29 ADD : add LOCKing, to prevent multiple LightSquid parser instances ...
           ADD : improve SKIP speed for native squid log format (more than 3 times !!!!)
           ADD : report line per second speed LPS in debug report
2006-11-23 FIX : Yet another printf trouble in time report fixed
2007-01-05 FIX : Wrong modification data written in .features
2008-11-28 NEW : Odnoklasniki & Vkontakte aggregator added
           FIX : Perl 5.10 fix. in several cases incorrect name was used, but size calculated correctly.
2009-06-30 NEW : .overuser support