1#!/usr/local/bin/perl
2#
3# LightSquid Project (c) 2004-2005 Sergey Erokhin aka ESL
4#
5# This program is free software; you can redistribute it and/or
6# modify it under the terms of the GNU General Public License
7# as published by the Free Software Foundation; either version 2
8# of the License, or (at your option) any later version.
9#
10# detail see in gnugpl.txt
11
# Parse access.log and
# make a per-user report in the 'report' directory.

# usage: lightparse.pl {param}
# if param is omitted		 - parse the full access.log file
#	today					 - only the current day
#	yesterday				 - only yesterday
#	date in format YYYYMMDD	 - parse that day
#	access.log.{\d}.{gz|bz2} - parse that file (for processing archived logs)
21
22# function prototypes
23sub MakeReport();
24sub InitSkipUser();
25sub getLPS($$);
26sub LockLSQ();
27sub UnLockLSQ();
28sub LOCKREMOVER();
29
30use File::Basename;
31use Time::Local;
32
33push (@INC,(fileparse($0))[1]);
34
35require "/usr/local/etc/lightsquid/lightsquid.cfg";
36require "common.pl";
37
38#include ip2name function
39require "$ip2namepath/ip2name.$ip2name";
40
# ---------------------------------------------------------------------------
# Start-up: take the lock, work out the date filter from the command line,
# open the log stream and initialise the helpers used by the parse loop.
# ---------------------------------------------------------------------------

# Remove the lock file when the run is interrupted from the keyboard.
$SIG{INT} = \&LOCKREMOVER;
my $lockfilepath = "$lockpath/lockfile";

# Counters printed in the debug statistics at the end of the run.
my $skipurlcntr        = 0;
my $skip4xxcntr        = 0;
my $skipfilterdatecntr = 0;

my $firstrun    = 1;    # true until the first accepted record sets $workday
my $totallines  = 0;
my $parsedlines = 0;
my $daylines    = 0;

# Command used to stream the log, and the log file name below $logpath.
my $catname  = "cat";
my $filename = "access.log";

undef $workday;

exit unless (LockLSQ());    # lock LSQ (blocks multiple instances)

if ($skipurl eq "") {
    $skipurl = "skipurl MUST be defined!!!";
    print "WARNING !!! \$skipurl is empty\n";
}

($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime;
$month = sprintf("%02d", $mon+1);

# Timestamp window accepted by the native-log branch of the parse loop.
# Without a date argument every record must pass, so the upper bound is +Inf;
# a fixed calendar date here silently drops every record logged after it.
my $filterdatestart = 0;
my $filterdatestop  = 9**9**9;

# "today" and "yesterday" both start from today's window.
$fToday = 1 if ($ARGV[0] eq "today");
$fToday = 1 if ($ARGV[0] eq "yesterday");

if ($fToday) {
    ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime;

    $filterdate      = sprintf("%04d%02d%02d", $year+1900, $mon+1, $mday);
    $filterdatestart = timelocal( 0,  0,  0, $mday, $mon, $year);
    $filterdatestop  = timelocal(59, 59, 23, $mday, $mon, $year);
    print ">>> filter today: $filterdate\n" if ($debug);
}

if ($ARGV[0] eq "yesterday") {
    # Shift today's window back by 24 hours.
    $filterdatestart = $filterdatestart - (24*60*60);
    $filterdatestop  = $filterdatestop  - (24*60*60);
    ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime($filterdatestart);
    $filterdate = sprintf("%04d%02d%02d", $year+1900, $mon+1, $mday);
    print ">>> filter yesterday: $filterdate\n" if ($debug);
}

if ($ARGV[0] =~ m/^(\d\d\d\d)(\d\d)(\d\d)$/) {
    # Explicit day given as YYYYMMDD.
    $filterdate      = $ARGV[0];
    $filterdatestart = timelocal( 0,  0,  0, $3, $2-1, $1);
    $filterdatestop  = timelocal(59, 59, 23, $3, $2-1, $1);
    print ">>> filter date:	$filterdate\n" if ($debug);
}

if ($ARGV[0] =~ m/access\.log\.(\d)/) {
    # Archived log: pick the decompressor from the file extension.
    $filename = $ARGV[0];
    $catname  = "zcat"  if ($ARGV[0] =~ m/\.gz$/);
    $catname  = "bzcat" if ($ARGV[0] =~ m/\.bz2$/);
}

print ">>> use file :: $logpath/$filename\n" if ($debug);

# Stream the log through cat/zcat/bzcat.  The list form of the piped open
# runs the command without a shell, so a file name taken from the command
# line cannot inject shell syntax.  A piped open succeeds even when the file
# is missing (only the child fails), hence the explicit readability check.
# The lock taken above is released before dying so that the next run is not
# blocked by a stale lock file.
my $logfile = "$logpath/$filename";
unless (-r $logfile) {
    UnLockLSQ();
    die "can't access log file\n";
}
unless (open(FF, '-|', $catname, $logfile)) {
    UnLockLSQ();
    die "can't access log file\n";
}

InitSkipUser();

StartIp2Name();
111
# ---------------------------------------------------------------------------
# Main loop: read the log line by line, accumulate per-user / per-site
# counters and call MakeReport() every time the record date changes.
# ---------------------------------------------------------------------------

# Reset the big-file counter (entries live in @bigfile and are overwritten
# by index, so resetting the counter is what empties the list).
undef %bigfile; $bigfilecnt=0;
while (<FF>) {
    chomp;
    $totallines++;

    if (0 == $squidlogtype) {
        # squid native log
        #970313965.619 1249	  denis.local TCP_MISS/200 2598 GET	   http://www.emalecentral.com/tasha/thm_4374x013.jpg -		DIRECT/www.emalecentral.com image/jpeg
        # timestamp	  elapsed host		  type		   size method url													user  hierarchy					type

        # Speed optimization for the date filter: look only at the leading
        # timestamp characters and reject the line before splitting it.
        $Ltimestamp=substr $_,0,11;
        if ($Ltimestamp<$filterdatestart or $Ltimestamp>$filterdatestop) {
            # The line has not been split yet, so only the raw line is shown
            # ($Lurl would still belong to the previous record).
            print ">>>> skipDafteFilter\n$_\n" if ($debug2 >= 2 );
            $skipfilterdatecntr++;
            next;
        }

        ($Ltimestamp,$Lelapsed,$Lhost,$Ltype,$Lsize,$Lmethod,$Lurl,$Luser,$Lhierarchy,$Lconttype,@Lrest)=split;
        ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime($Ltimestamp);
        $mon++; # month starts from 0
        $date  =sprintf("%04d%02d%02d",$year+1900,$mon,$mday);

        # A few extra fields after the content type mark a damaged record.
        if ( ($#Lrest >= 0) && ($#Lrest < 4) ) {
            $str=$_;
            # Maybe two concatenated records (the first one truncated):
            # try to pick a complete record off the end of the line.
            if ($str =~ m/(\d+\.\d+\s+\d+\s+(\d{1,3}\.){3}\d{1,3}\s+\w+\/\d+\s+\d+\w+\s+\S+\s+\S+\s+\S+\s+\w+\/\S+\s(-|([a-zA-Z\-]+\/[a-zA-Z\-]+)))$/) {
                $newstr=$1;
                ($Ltimestamp,$Lelapsed,$Lhost,$Ltype,$Lsize,$Lmethod,$Lurl,$Luser,$Lhierarchy,$Lconttype)=split /\s+/,$newstr;
                # The recovered record has its own timestamp; recompute the
                # date/time fields so it is not filed under the truncated one.
                ($sec,$min,$hour,$mday,$mon,$year,$wday,$yday,$isdst) = localtime($Ltimestamp);
                $mon++;
                $date  =sprintf("%04d%02d%02d",$year+1900,$mon,$mday);
            } else {
                # Maybe the source url contains SPACES: glue the pieces back
                # together and shift the remaining fields into place.
                while ($#Lrest != -1) {
                    $Lurl.="_$Luser";$Luser=$Lhierarchy;$Lhierarchy=$Lconttype;$Lconttype=shift @Lrest;
                }
                # sanity check of the shifted fields
                unless (($Lhierarchy =~ m/\w+\/\S+/) and ($Lconttype =~ m/-|([a-zA-Z\-]+\/[a-zA-Z\-]+)/)) {
                    $notrecoveredlines++;
                    next;
                }
            }
            $recoveredlines++;
        }
    } else {
        # emulated httpd log
        #192.168.3.40 - - [15/Apr/2005:11:46:35 +0300] "GET http://mail.yandex.ru/mboxjscript? HTTP/1.0" 200 2262 TCP_MISS :DIRECT
        #192.168.3.40 - - [15/Apr/2005:11:46:35 +0300] "GET http://css.yandex.ru/css/mail/search.js HTTP/1.0" 200 4199 TCP_HIT:NONE
        #192.168.3.12 -		 -		 [15/Apr/2005:11:46:35 +0300] "CONNECT aero.lufthansa.com:443 HTTP/1.0" 200 35992 TCP_MISS:DIRE
        # ($Lhost,	  $Luser,$Luser2,$Ldate,			   $u2,	  $Lmethod,$Lurl,				  $u3,	   $Ltype,$Lsize,$u4)=split

        ($Lhost,$Luser,$Luser2,$Ldate,$u2,$Lmethod,$Lurl,$u3,$Ltype,$Lsize,$u4)=split;

        $Ldate =~ m#^\[(\d\d)/(...)/(\d\d\d\d):(\d\d):(\d\d):(\d\d)#;
        $mday=$1;$mon=$month2dec{$2};$year=$3-1900;
        $hour=$4;$min=$5;$sec=$6;

        $date  =sprintf("%04d%02d%02d",$year+1900,$mon,$mday);
        if ($filterdate) {
            if ($date ne $filterdate) {
                print ">>>> skipDafteFilter URL $Lurl\n$_\n" if ($debug2 >= 2 );
                $skipfilterdatecntr++;
                next;
            }
        }

        # Prefer the second user column when the first one is empty ("-").
        if (($Luser eq "-") && ($Luser2 ne "-")) {
            $Luser = $Luser2;
        }

        # Rebuild a native-style "TYPE/code" value from the trailing field.
        $u4 =~ m/(.*?)\s?:(.*)/;
        $Ltype = "$1/$Ltype";
    }  #if ($squidlogtype)

    if ($year < 2000-1900) { #invalid record
        print ">>>> skipn Bad Year  $Lurl\n$_\n" if ($debug2 >= 1 );
        $skipbadyear++;
        next;
    }

    #skip intranet
    if ($Lurl =~ m/$skipurl/o) {
        print ">>>> skipURL $Lurl\n$_\n" if ($debug2 >= 2 );
        $skipurlcntr++;
        next;
    }

    # skip Access denied records (TODO: report)
    if ($Ltype =~ m#DENIED#io) {
        $skipDenied++;
        print ">>>> skipDenied $Ltype\n$_\n" if ($debug2 >= 2);
        next;
    }

    $parsedlines++;

    if ($date ne $workday) { # close prev day, prepare for new
        if ($firstrun) {
            undef $firstrun;
            $workday=$date;
        } else {
            {
                # MakeReport() may use the global $hour as a loop counter;
                # keep the current record's hour intact for the code below.
                local $hour = $hour;
                MakeReport();
            }
            undef %totalsize; undef %sitesize; undef %sitehit;undef %totalhit;undef %totalputpost;
            undef %hashhost;undef %hashname;
            undef %bigfile; $bigfilecnt=0;
            undef %sitetime;undef %sitetimesize;
            $daylines=0;
            $workday=$date;
            $sqlreq=0;
            $CacheHIT=0;$CacheMISS=0;
        }
    }

    # Cache accounting happens after the day rollover so that the first
    # record of a new day is counted for that day and not reset away.
    if ($Ltype =~ m/(HIT|UNMODIFIED)/) {
        $CacheHIT+=$Lsize;
    } else {
        $CacheMISS+=$Lsize;
    }

    $daylines++;

    $user=lc $Luser;

    # NOTE(review): $Ltimestamp is only assigned in the native-log branch;
    # in httpd-log mode Ip2Name() receives whatever value it held before.
    $user = Ip2Name($Lhost,$user,$Ltimestamp);

    next if (defined $hSkipUser{$user});

    #simplify some common banner systems & counters to one host name each
    $url=$Lurl;
    $url =~ s/([a-z]+:\/\/)??.*\.(spylog\.com)/$1www.$2/o;
    $url =~ s/([a-z]+:\/\/)??.*\.(yimg\.com)/$1www.$2/o;
    $url =~ s/([a-z]+:\/\/)??.*\.(adriver\.ru)/$1www.$2/o;
    $url =~ s/([a-z]+:\/\/)??.*\.(bannerbank\.ru)/$1www.$2/o;
    $url =~ s/([a-z]+:\/\/)??.*\.(mail\.ru)/$1www.$2/o;
    $url =~ s/([a-z]+:\/\/)??.*\.(adnet\.ru)/$1www.$2/o;
    $url =~ s/([a-z]+:\/\/)??.*\.(rapidshare\.de)/$1www.$2/o;
    $url =~ s/([a-z]+:\/\/)??.*\.(rapidshare\.com)/$1www.$2/o;

    $url =~ s/([a-z]+:\/\/)??.*\.(vkontakte\.ru)/$1www.$2/o;
    $url =~ s/([a-z]+:\/\/)??.*\.(odnoklasniki\.ru)/$1www.$2/o;

    #extract site name; fall back to the raw url when the pattern fails
    if ($url =~ m/([a-z]+:\/\/)??([a-z0-9\-]+\.){1}(([a-z0-9\-]+\.){0,})([a-z0-9\-]+){1}(:[0-9]+)?\/(.*)/o) {
        $site=$2.$3.$5;
    } else {
        $site=$Lurl;
    }

    $site=$Lurl if ($site eq "");

    # per-user and per-user/site totals
    $totalsize    {$user}        +=$Lsize;
    $totalhit     {$user}        ++;
    $totalputpost {$user}        +=$Lsize if (($Lmethod eq "PUT") or ($Lmethod eq "POST"));
    $sitesize     {$user}{$site} +=$Lsize;
    $sitehit      {$user}{$site} ++;

    # per-hour breakdown used by the time report
    $sitetime     {$user}{$site}[$hour]+=$Lelapsed;
    $sitetimesize {$user}{$site}[$hour]+=$Lsize;

    #.bigfile support: remember every transfer larger than $bigfilelimit
    if ($Lsize > $bigfilelimit) {
        $bigfile [$bigfilecnt]{date}=sprintf("%02d:%02d:%02d",$hour,$min,$sec);
        $bigfile [$bigfilecnt]{link}=$Lurl;
        $bigfile [$bigfilecnt]{size}=$Lsize;
        $bigfile [$bigfilecnt]{user}=$user;
        $bigfilecnt++;
    }
}
280
# Write the report for the last day seen, call the ip2name stop hook and
# release the lock.
MakeReport();
StopIp2Name();
UnLockLSQ();

# Optional run statistics.
if ($debug) {
    $worktime = time() - $^T;    # $^T holds the script start time

    print "run TIME: $worktime sec\n"
        . "LightSquid parser statistic report\n\n";

    printf("	   %10u lines processed (average %.2f lines per second)\n",
           $totallines, getLPS($worktime, $totallines));

    # One [format, counter] pair per remaining statistics row.
    my @rows = (
        [ "	   %10u lines parsed\n",                  $parsedlines        ],
        [ "	   %10u lines recovered\n",               $recoveredlines     ],
        [ "	   %10u lines notrecovered\n",            $notrecoveredlines  ],
        [ "	   %10u lines skiped by bad year\n",      $skipbadyear        ],
        [ "	   %10u lines skiped by date filter\n",   $skipfilterdatecntr ],
        [ "	   %10u lines skiped by Denied filter\n", $skipDenied         ],
        [ "	   %10u lines skiped by skipURL filter\n", $skipurlcntr       ],
    );
    foreach my $row (@rows) {
        my ($format, $count) = @$row;
        printf($format, $count);
    }

    # Nothing parsed at all usually points at a configuration problem.
    if ($parsedlines == 0) {
        print "\nWARNING !!!!, parsed 0 lines from total : $totallines\n"
            . "please check confiuration !!!!\n"
            . "may be wrong log format selected ?\n";
    }
}
306
307
308
309# The END ---------------------------------------------------------
310
311##Subroutines
312# return Line Per Second value (check 0 values and correct)
# Return the average number of lines processed per second.
#
#   $time  - elapsed run time in seconds; 0 or undef is treated as 1 so that
#            sub-second runs do not divide by zero
#   $lines - number of lines processed; 0 or undef yields a rate of 0
#
# The ($$) prototype is kept because it must match the forward declaration
# at the top of the file.
sub getLPS($$) {
    my ($time, $lines) = @_;
    $time  ||= 1;
    $lines ||= 0;    # an empty log has a rate of 0, not 1/$time
    return ($lines / $time);
}
320
# Write the report files for the day held in $workday.
#
# Reads the global accumulators filled by the parse loop (%totalsize,
# %totalhit, %totalputpost, %sitesize, %sitehit, %sitetime, %sitetimesize,
# @bigfile/$bigfilecnt, $CacheHIT, $CacheMISS) and creates, below
# "$reportpath/$workday":
#   .total     - user count, total size and one summary line per user
#   <user>     - per-site totals for that user (plus 24 hourly columns when
#                $timereport is non-zero)
#   .features  - over-limit user count, cache statistics, modification time
#   .bigfiles  - transfers larger than the big-file limit (only if any)
#   .overuser  - users at or above $perusertrafficlimit (only if any)
# and finally calls CreateGroupFile() and CreateRealnameFile().
#
# Does nothing when fewer than two lines were parsed for the day.
# Dies when the directory or any of the files cannot be created.
#
# All loop variables are lexical: the caller is in the middle of processing
# a record and still needs its own global $hour after this sub returns.
sub MakeReport() {
    return if ($daylines < 2);

    print ">>> Make Report $workday ($daylines - log line parsed)\n" if ($debug);

    $reppath = "$reportpath/$workday";    # left global, as before

    unless (-d $reppath) {
        mkdir($reppath, 0755) or die "Can't create dir '$reppath': $!";
    }

    open(my $total_fh, '>', "$reppath/.total")
        or die "can't create file	 $reppath/.total - $!";

    # Users ordered by traffic, biggest first (used for .total and .overuser).
    my @users = sort { $totalsize{$b} <=> $totalsize{$a} } keys %totalsize;

    my $summary   = "";
    my $allsize   = 0;
    my $usercount = 0;
    my $overcount = 0;

    foreach my $tuser (@users) {
        $totalputpost{$tuser} += 0;    # prevent empty value
        $summary .= sprintf("%-20s %15s %15s %15s\n",
                            $tuser, $totalsize{$tuser}, $totalhit{$tuser}, $totalputpost{$tuser});
        $usercount++;
        $allsize += $totalsize{$tuser};
        $overcount++ if ($totalsize{$tuser} >= $perusertrafficlimit);

        # NOTE(review): the user name is used verbatim as a file name.
        open(my $rep_fh, '>', "$reppath/$tuser")
            or die "can't create file	 $reppath/$tuser - $!";

        print {$rep_fh} "total: $totalsize{$tuser}\n";

        # Sites ordered by traffic, biggest first.
        my $sizes = $sitesize{$tuser};
        foreach my $tsite (sort { $sizes->{$b} <=> $sizes->{$a} } keys %$sizes) {
            printf {$rep_fh} ("%-29s %12s %10s\t", $tsite, $sizes->{$tsite}, $sitehit{$tuser}{$tsite});
            if ($timereport != 0) {
                # one "time-size" pair per hour of the day
                foreach my $h (0 .. 23) {
                    printf {$rep_fh} ("%d-%s ",
                                      int($sitetime{$tuser}{$tsite}[$h]/3600),
                                      $sitetimesize{$tuser}{$tsite}[$h]+0);
                }
            }
            print {$rep_fh} "\n";
        }
        close $rep_fh;
    }

    $CacheMISS = 1 if ($CacheMISS == 0);    # keeps the percentage below from dividing by zero

    print {$total_fh} "user: $usercount\n";
    print {$total_fh} "size: $allsize\n";
    print {$total_fh} "$summary";
    close $total_fh;

    # Time stamp of this report run.
    my ($min_, $hour_, $mday_, $mon_, $year_) = (localtime)[1 .. 5];
    $mon_++; $year_ += 1900;
    my $moddate = sprintf("%02d:%02d", $hour_, $min_)." ::	$mday_ $MonthName[$mon_] $year_";

    open(my $feat_fh, '>', "$reppath/.features")
        or die "can't create file  $reppath/.features - $!";
    print {$feat_fh} "overuser: $overcount\n";
    print {$feat_fh} "cachehit%: ".sprintf("%3.2f", ($CacheHIT*100)/($CacheHIT+$CacheMISS))."\n";
    print {$feat_fh} "cachehit: $CacheHIT\n";
    print {$feat_fh} "cachemiss: $CacheMISS\n";
    print {$feat_fh} "cacheall: ".($CacheHIT+$CacheMISS)."\n";
    print {$feat_fh} "modification: $moddate\n";
    close $feat_fh;

    # .bigfiles is rewritten from scratch; absent when there is nothing to list.
    unlink "$reppath/.bigfiles";
    if ($bigfilecnt != 0) {
        open(my $big_fh, '>', "$reppath/.bigfiles")
            or die "can't create file	$reppath/.bigfiles - $!";
        foreach my $idx (0 .. $bigfilecnt-1) {
            print {$big_fh} "$bigfile[$idx]{user}\t$bigfile[$idx]{date}\t$bigfile[$idx]{size}\t$bigfile[$idx]{link}\n";
        }
        close $big_fh;
    }

    #create list of users that used at least $perusertrafficlimit bytes
    unlink "$reppath/.overuser";
    if ($overcount) {
        open(my $over_fh, '>', "$reppath/.overuser")
            or die "can't create file  $reppath/.overuser - $!";
        foreach my $tuser (@users) {
            print {$over_fh} "$tuser\t$totalsize{$tuser}\n" if ($totalsize{$tuser} >= $perusertrafficlimit);
        }
        close $over_fh;
    }

    CreateGroupFile($reppath);
    CreateRealnameFile($reppath);
}
406
# Load the list of users to leave out of the reports.
#
# Reads "$cfgpath/skipuser.cfg", one user name per line, and sets
# $hSkipUser{name} = 1 for each of them.  Lines starting with '#' are
# comments.  Leading/trailing whitespace (including a DOS "\r") is removed,
# and blank lines are ignored so they cannot register the empty user name.
#
# The file is optional: when it cannot be opened the skip list simply stays
# as it is, which matches the previous unchecked open.
sub InitSkipUser() {
    my $cfgfile = "$cfgpath/skipuser.cfg";

    open(my $cfg_fh, '<', $cfgfile) or return;

    while (my $entry = <$cfg_fh>) {
        $entry =~ s/^\s+//;
        $entry =~ s/\s+$//;            # also removes the line ending
        next if ($entry eq '');
        next if ($entry =~ /^#/);
        $hSkipUser{$entry} = 1;
    }
    close $cfg_fh;
    return;
}
416# Lock support
# Take the parser lock.
#
# Returns 1 when the lock file $lockfilepath was created for this process,
# 0 when an existing lock is younger than $maxlocktime seconds (another
# parser is presumably still running).  An older lock is reported on STDERR,
# removed through UnLockLSQ() and replaced.
# Dies when the lock file cannot be read or created.
#
# The lock file holds three lines: "PID: <pid>", "Timestamp: <epoch>" and a
# human-readable "Creation time: ...".
#
# Lexical handles and variables are used so that the global FF handle (also
# the log stream handle) and package variables are not touched.
# NOTE(review): the -f test followed by a plain create is not atomic; two
# instances started at the same instant can both pass.
sub LockLSQ() {
    if (-f "$lockfilepath") {
        print STDERR "Warning, `$lockfilepath` exist, maybe anoter process running !\n";

        open(my $lock_in, '<', $lockfilepath) or die "can't read lock file `$lockfilepath`\n";
        my $pid = <$lock_in>;
        my $ts  = <$lock_in>;
        close $lock_in;

        # An empty or truncated lock file gives undef here; treat it as
        # "no pid, timestamp 0", i.e. as a very old lock.
        $pid = '' unless defined $pid;
        $ts  = '' unless defined $ts;
        chomp $pid; $pid =~ s/PID: //;
        chomp $ts;  $ts  =~ s/Timestamp: //;
        $ts = 0 if ($ts eq '');

        #check timedelta
        my $tsdelta = time - $ts;
        print STDERR "LockPID : $pid\n";
        print STDERR "tsdelta : $tsdelta second(s) (maxlocktime: $maxlocktime)\n";

        return 0 if ($tsdelta < $maxlocktime);

        print STDERR "OLD lock file ignored and removed!\n";
        UnLockLSQ();
    }

    open(my $lock_out, '>', $lockfilepath) or die "can't create lock file `$lockfilepath`\n";
    my $now = time;
    print {$lock_out} "PID: $$\n";
    print {$lock_out} "Timestamp: $now\n";
    print {$lock_out} "Creation time: ".localtime($now)."\n";
    close $lock_out;

    return 1;
}
445
# Release the parser lock by deleting the lock file $lockfilepath.
# Returns the (true) result of unlink; dies when the file cannot be removed.
sub UnLockLSQ() {
    my $removed = unlink($lockfilepath);
    die "can't remove lock file `$lockfilepath`\n" unless ($removed);
    return $removed;
}
449
# Handler installed for SIGINT: announce the interrupt, delete the lock
# file through UnLockLSQ() and end the program.
sub LOCKREMOVER() {
    print("INT happents, remove LOCK\n");
    UnLockLSQ();
    exit();
}
455
456__END__
2004-04-23		: initial version
2004-09-01 FIX	: error in parsing an invalid file
2004-09-09 ADD	: create .bigfile file containing links greater than $bigfilelimit
2004-11-08 ADD	: skip 4xx records (dirty :-() TODO: do error report
2004-11-09 ADD	: use DB only if the user name is not defined...
2005-04-13 ADD	: LightSquid publication cleanup
2005-04-14 ADD	: $debug and $debug2 variables for generating statistics
				: if parsed lines = 0 print WARNING
2005-04-17 ADD	: add support for HTTPDlike log file
2005-04-19 ADD	: add .bz2 support
				: add cache hit calculation (if Ltype contains HIT - hit else - MISS), wrong ??
				: add oversized user calculation
2005-04-20 ADD	: .features file added, with additional info
2005-04-22 FIX	: httpdlike parser bug;
		   FIX	: mkdir 655 -> mkdir 755
2005-04-30 ADD	: Rewrite archive support, now supports access.log.{D},access.log.{D}.gz,access.log.{D}.bz2
		   ADD	: time report
2005-05-03 FIX	: fix wrong .features file output
2005-05-12 FIX	: empty line report only if $debug
		   FIX	: date filter now ^\d\d\d\d\d\d\d\d$ ...
2005-11-21 FIX	: cosmetic changes
2006-07-02 ADD	: try to recover some types of broken log record (url contains spaces, two concatenated records)
				: fix negative number in user file (printf -%d <2g $u <4g), now use simple print
2006-07-05 ADD	: Put & Post added into .total file
2006-07-10 ADD	: SkipUser support
				: GetNameByIP -> IP2NAME (see doc)
				: $cfgpath in config
				: .features modification: parameter support
2006-07-29 ADD	: add LOCKing, to prevent multiple LightSquid parser instances ...
		   ADD	: improve SKIP speed for native squid log format (more than 3 times !!!!)
		   ADD	: report lines per second speed (LPS) in debug report
2006-11-23 FIX	: Yet another printf trouble in time report fixed
2007-01-05 FIX	: Wrong modification date written in .features
2008-11-28 NEW	: Odnoklasniki & Vkontakte aggregator added
		   FIX	: Perl 5.10 fix. in several cases incorrect name was used, but size calculated correctly.
2009-06-30 NEW	: .overuser support