#!/usr/local/bin/perl
#****************************************************************************
#****************************************************************************
#
#   AWFFull - A Webalizer Fork, Full o' features
#
#   awffull_history_regen.pl
#       Pre-processing an old webalizer install prior to an upgrade
#       to AWFFull.
#
#   Copyright (C) 2005, 2008 by Stephen McInerney
#       (spm@stedee.id.au)
#
#   This file is part of AWFFull.
#
#   AWFFull is free software: you can redistribute it and/or modify
#   it under the terms of the GNU General Public License as published by
#   the Free Software Foundation, either version 3 of the License, or
#   (at your option) any later version.
#
#   AWFFull is distributed in the hope that it will be useful,
#   but WITHOUT ANY WARRANTY; without even the implied warranty of
#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#   GNU General Public License for more details.
#
#   You should have received a copy of the GNU General Public License
#   along with AWFFull.  If not, see <http://www.gnu.org/licenses/>.
#
#****************************************************************************
#****************************************************************************
#
#  awffull_history_regen.pl
#
#  DESCRIPTION
#  --------------
#  Given a directory, this script will parse all old webalizer html (per month)
#  files and spit out a complete history file (via STDOUT).
#  This new history file will contain all years/months from all the
#  webalizer html files.
#
#  Designed for pre-processing an old webalizer install prior to an
#  upgrade to AWFFull.
#
#****************************************************************************
#****************************************************************************
#  Modification History
# 11-Sep-2005 steve     Initial Creation
# 17-Sep-2005 steve     major tidy and functionalise
#****************************************************************************
#****************************************************************************
#
### *** Sample text to parse for
#
# <TR><TH COLSPAN=3 ALIGN=center BGCOLOR="#C0C0C0">Monthly Statistics for July 2005</TH></TR>
# <TR><TH HEIGHT=4></TH></TR>
# <TR><TD WIDTH=380><FONT SIZE="-1">Total Hits</FONT></TD>
# <TD ALIGN=right COLSPAN=2><FONT SIZE="-1"><B>12217843</B></FONT></TD></TR>
# <TR><TD WIDTH=380><FONT SIZE="-1">Total Files</FONT></TD>
# <TD ALIGN=right COLSPAN=2><FONT SIZE="-1"><B>5384438</B></FONT></TD></TR>
# <TR><TD WIDTH=380><FONT SIZE="-1">Total Pages</FONT></TD>
# <TD ALIGN=right COLSPAN=2><FONT SIZE="-1"><B>1031846</B></FONT></TD></TR>
# <TR><TD WIDTH=380><FONT SIZE="-1">Total Visits</FONT></TD>
# <TD ALIGN=right COLSPAN=2><FONT SIZE="-1"><B>226836</B></FONT></TD></TR>
# <TR><TD WIDTH=380><FONT SIZE="-1">Total KBytes</FONT></TD>
# <TD ALIGN=right COLSPAN=2><FONT SIZE="-1"><B>39965939</B></FONT></TD></TR>
# <TR><TH HEIGHT=4></TH></TR>
# <TR><TD WIDTH=380><FONT SIZE="-1">Total Unique Sites</FONT></TD>
# <TD ALIGN=right COLSPAN=2><FONT SIZE="-1"><B>120135</B></FONT></TD></TR>
# <TR><TD WIDTH=380><FONT SIZE="-1">Total Unique URLs</FONT></TD>
#****************************************************************************

use strict;               # die on all bad programming
use warnings;             # ... and complain about the merely suspicious
use Getopt::Long 2.33;    # Command Line Option Processing
use Pod::Usage;           # For inline documentation

###########################
## Global Variables
###########################
my $exit_status = 0;      # Script Return. 0 = success!

## Options
my $opt_UsageDir = ".";   # Directory to look for webalizer usage files

# Accepted only so that existing invocations using --date keep working.
# The value is never used: leap years are computed in Perl (see IsLeapYear).
my $opt_DateCommand = '/bin/date';

## Parsing tables

# Last day of each month in a non-leap year, indexed by (month - 1).
my @DAYS_IN_MONTH = (31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31);

# Start of the monthly summary table (English, German, Dutch).
# The u-umlaut in the German text is matched in both its ISO-8859-1 (\xFC)
# and UTF-8 (\xC3\xBC) byte forms, so the match does not depend on the
# encoding this script happens to be saved in.
my $MONTHLY_HEADER_RE =
  qr/>(?:Monthly Statistics for|Monats-Statistik f(?:\xFC|\xC3\xBC)r|Maandoverzicht ) /;

# First row after the statistics we want; nothing useful follows it.
my $END_OF_STATS_RE =
  qr/>(?:Total Unique URLs|Summe unterschiedlicher URLs|Totaal verschillende URL\'s)</;

# [ history key, label regex ].  The number belonging to a label is on the
# line FOLLOWING the label line (see the sample text in the file header).
my @STAT_LABELS = (
    [HITS   => qr/>(?:Total Hits|Summe Anfragen|Totaal hits)</],
    [FILES  => qr/>(?:Total Files|Summe Dateien|Totaal bestanden)</],
    [PAGES  => qr/>(?:Total Pages|Summe Seiten|Totaal Pagina\'s)</],
    [VISITS => qr/>(?:Total Visits|Summe Besuche|Totaal Bezoeken)</],
    [KBYTES => qr/>(?:Total KBytes|Summe kb|Total kB Files)</],
    [SITES  => qr/>(?:Total Unique Sites|Summe unterschiedlicher Rechner|Totaal verschillende hosts)</],
);

###########################
###########################
## MAIN
###########################
###########################

# Only run when executed as a script.  When the file is loaded with
# require/do (e.g. by a test), just the subroutines are defined.
if (!caller()) {
    ProcessCommandLine();
    $exit_status = RegenerateHistory();

    if ($exit_status == 2) {
        printf(STDERR "Failed to find any Webalizer usage_YYYYMM.html files.\n");
    }

    exit($exit_status);
}

##########################################################################
##########################################################################
####            END OF MAIN
##########################################################################
##########################################################################


####  SUBROUTINES

##########################################################################
##########################################################################
## ProcessCommandLine
##      Parse the Commandline Arguments
##
##      Sets the file-level $opt_UsageDir (and the ignored
##      $opt_DateCommand).  Unknown options print the synopsis via
##      pod2usage; --help and --man print the longer help and exit.
##########################################################################
sub ProcessCommandLine {
    my $opt_Help;    # Local options
    my $opt_Man;     # use for man page, or help screen

    Getopt::Long::Configure("gnu_getopt");    # Configure to use GNU style Options

    GetOptions("dir|d:s" => \$opt_UsageDir,
               "help|\?" => \$opt_Help,
               "man"     => \$opt_Man,
               "date:s"  => \$opt_DateCommand,
              )
      || pod2usage(-verbose => 0);
    if ($opt_Help) { pod2usage(-verbose => 1); }
    if ($opt_Man)  { pod2usage(-verbose => 2); }

    # "dir|d:s" takes an OPTIONAL value, so a bare --dir yields "".
    # Treat that the same as not giving the option at all.
    if (!defined($opt_UsageDir) || $opt_UsageDir eq "") {
        $opt_UsageDir = ".";
    }
    return;
} ## end sub ProcessCommandLine


##########################################################################
##########################################################################
## RegenerateHistory
##      Do the hard work - process the data, generate the output
##
##      Scans $opt_UsageDir for usage_YYYYMM.html files (MM in 01..12),
##      parses each one and prints one history line per month to STDOUT,
##      ordered by year then month:
##
##        month year hits files sites kbytes 1 last-day pages visits
##
##      Returns 0 if at least one file contained a monthly statistics
##      table, 2 otherwise.  Dies if the directory or a file can't be opened.
##########################################################################
sub RegenerateHistory {
    my %History;        # {year}{month} => { HITS => n, FILES => n, ... }
    my $nofiles = 2;    # Return the value 2 if no files are found

    opendir(my $dir_handle, $opt_UsageDir)
      or die "Cannot open directory $opt_UsageDir: $!";
    my @usagefiles = grep { /^usage_[0-9]{4}(?:0[1-9]|1[0-2])\.html$/ } readdir($dir_handle);
    closedir($dir_handle);

    foreach my $usagefile (@usagefiles) {
        my ($year, $month) = $usagefile =~ /^usage_([0-9]{4})([0-9]{2})\.html$/;
        my ($found_header, $stats) = ParseUsageFile("$opt_UsageDir/$usagefile");

        next if (!$found_header);
        $nofiles = 0;

        # A header with no statistics rows produces no history line.
        if (%{$stats}) {
            $History{$year}{$month} = $stats;
        }
    }

    foreach my $key_year (sort (keys %History)) {
        foreach my $key_month (sort numerically (keys %{$History{$key_year}})) {
            my $stats = $History{$key_year}{$key_month};

            # A statistic missing from the page is reported as 0.
            my ($hits, $files, $sites, $kbytes, $pages, $visits) =
              map { defined($stats->{$_}) ? $stats->{$_} : 0 }
              qw(HITS FILES SITES KBYTES PAGES VISITS);

            printf("%d %d %d %d %d %d 1 %d %d %d\n",
                   $key_month, $key_year,
                   $hits, $files,
                   $sites, $kbytes,
                   DaysInMonth($key_month, $key_year), $pages,
                   $visits
                  );
        } ## foreach $key_month
    } ## foreach $key_year

    return ($nofiles);
} ## end sub RegenerateHistory


##########################################################################
##########################################################################
## ParseUsageFile
##      Extract the monthly totals from one webalizer usage_YYYYMM.html
##
##      Argument: path of the file to read.
##      Returns the list ($found_header, \%stats): $found_header is 1 if
##      the monthly statistics header was seen, else 0; %stats maps
##      HITS/FILES/PAGES/VISITS/KBYTES/SITES to the text found for each.
##      All parser state is local to this call, so one malformed file
##      can't affect the parsing of the next.
##########################################################################
sub ParseUsageFile {
    my ($path) = @_;

    my %stats;
    my $found_header = 0;    # Have we reached the monthly statistics table?
    my $pending_key;         # Statistic whose value is on the NEXT line

    open(my $file_handle, '<', $path) or die "Cannot open file $path: $!";

  FILELINE:
    while (my $line = <$file_handle>) {
        if (!$found_header) {
            next FILELINE if ($line !~ $MONTHLY_HEADER_RE);
            $found_header = 1;
        }

        # Exit this file, end of useful info
        last FILELINE if ($line =~ $END_OF_STATS_RE);

        # The previous line was a label: this line carries its value.
        if (defined($pending_key)) {
            $stats{$pending_key} = ExtractStatValue($line);
            $pending_key = undef;
            next FILELINE;
        }

        # Else, check whether this line is the label of a statistic.
        foreach my $label (@STAT_LABELS) {
            my ($key, $label_re) = @{$label};
            if ($line =~ $label_re) {
                $pending_key = $key;
                last;
            }
        }
    } ## while (<FILE>)
    close($file_handle);

    return ($found_header, \%stats);
} ## end sub ParseUsageFile


##########################################################################
##########################################################################
## ExtractStatValue
##      Pull the number out of a webalizer value line, e.g.
##        <TD ...><FONT ...><B>12217843</B></FONT></TD></TR>
##
##      Splitting on '<' and '>' (keeping the delimiters) puts the text
##      inside the third tag at index 12.  Returns undef if the line has
##      fewer fields than that.
##########################################################################
sub ExtractStatValue {
    my ($line) = @_;
    my @fields = split(/(<|>)/, $line);
    return $fields[12];
}


##########################################################################
##########################################################################
## DaysInMonth
##      Number of days in the given month (1..12, leading zero allowed)
##      of the given year.
##########################################################################
sub DaysInMonth {
    my ($month, $year) = @_;
    if ($month == 2 && IsLeapYear($year)) {
        return 29;
    }
    return $DAYS_IN_MONTH[$month - 1];
}


##########################################################################
##########################################################################
## IsLeapYear
##      Gregorian leap year test.  Returns 1 or 0.
##########################################################################
sub IsLeapYear {
    my ($year) = @_;
    return ($year % 4 == 0 && ($year % 100 != 0 || $year % 400 == 0)) ? 1 : 0;
}


##########################################################################
##########################################################################
## numerically
##      Do a numerical sort
##########################################################################
sub numerically { $a <=> $b }


##########################################################################
##########################################################################
##########################################################################

1;    # So the file can be loaded with require.

__END__

=pod

=head1 NAME

awffull_history_regen.pl - Generate a history file from old Webalizer usage files

=head1 SYNOPSIS

awffull_history_regen.pl [options]

=head1 OPTIONS

=over 8

=item B<--help>

Print a brief help message and exit.

=item B<--man>

Print the manual page and exit.

=item B<--dir directory>

The directory to use, looking for old webalizer usage_YYYYMM.html files. If
not present will use the current directory.

=item B<--date gnu-date-location>

Deprecated. Accepted for compatibility with existing invocations and
ignored: the GNU date command is no longer needed, as the length of
February is now calculated internally.

=back

=head1 DESCRIPTION

Generate a history file from old Webalizer usage files.

The resulting history file is sent only to STDOUT.

=cut