1#!/usr/local/bin/perl
2#****************************************************************************
3#****************************************************************************
4#
5#   AWFFull - A Webalizer Fork, Full o' features
6#
7#   awffull_history_regen.pl
8#       Pre-processing an old webalizer install prior to an upgrade
9#       to AWFFull.
10#
11#   Copyright (C) 2005, 2008 by Stephen McInerney
12#       (spm@stedee.id.au)
13#
14#   This file is part of AWFFull.
15#
16#   AWFFull is free software: you can redistribute it and/or modify
17#   it under the terms of the GNU General Public License as published by
18#   the Free Software Foundation, either version 3 of the License, or
19#   (at your option) any later version.
20#
21#   AWFFull is distributed in the hope that it will be useful,
22#   but WITHOUT ANY WARRANTY; without even the implied warranty of
23#   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
24#   GNU General Public License for more details.
25#
26#   You should have received a copy of the GNU General Public License
27#   along with AWFFull.  If not, see <http://www.gnu.org/licenses/>.
28#
29#****************************************************************************
30#****************************************************************************
31#
32#                     awffull_history_regen.pl
33#
34# DESCRIPTION
35# --------------
# Given a directory, this script will parse all old webalizer html (per month)
# files and write a complete history file to STDOUT.
38# This new history file will contain all years/months from all the
39# webalizer html files.
40#
41# Designed for pre-processing an old webalizer install prior to an
42# upgrade to AWFFull.
43#
44#****************************************************************************
45#****************************************************************************
46#  Modification History
47# 11-Sep-2005 steve     Initial Creation
48# 17-Sep-2005 steve     major tidy and functionalise
49#****************************************************************************
50#****************************************************************************
51#
52###  *** Sample text to parse for
53#
54# <TR><TH COLSPAN=3 ALIGN=center BGCOLOR="#C0C0C0">Monthly Statistics for July 2005</TH></TR>
55# <TR><TH HEIGHT=4></TH></TR>
56# <TR><TD WIDTH=380><FONT SIZE="-1">Total Hits</FONT></TD>
57# <TD ALIGN=right COLSPAN=2><FONT SIZE="-1"><B>12217843</B></FONT></TD></TR>
58# <TR><TD WIDTH=380><FONT SIZE="-1">Total Files</FONT></TD>
59# <TD ALIGN=right COLSPAN=2><FONT SIZE="-1"><B>5384438</B></FONT></TD></TR>
60# <TR><TD WIDTH=380><FONT SIZE="-1">Total Pages</FONT></TD>
61# <TD ALIGN=right COLSPAN=2><FONT SIZE="-1"><B>1031846</B></FONT></TD></TR>
62# <TR><TD WIDTH=380><FONT SIZE="-1">Total Visits</FONT></TD>
63# <TD ALIGN=right COLSPAN=2><FONT SIZE="-1"><B>226836</B></FONT></TD></TR>
64# <TR><TD WIDTH=380><FONT SIZE="-1">Total KBytes</FONT></TD>
65# <TD ALIGN=right COLSPAN=2><FONT SIZE="-1"><B>39965939</B></FONT></TD></TR>
66# <TR><TH HEIGHT=4></TH></TR>
67# <TR><TD WIDTH=380><FONT SIZE="-1">Total Unique Sites</FONT></TD>
68# <TD ALIGN=right COLSPAN=2><FONT SIZE="-1"><B>120135</B></FONT></TD></TR>
69# <TR><TD WIDTH=380><FONT SIZE="-1">Total Unique URLs</FONT></TD>
70#****************************************************************************
71
72use strict;               # die on all bad programming
73use Getopt::Long 2.33;    # Command Line Option Processing
74use Pod::Usage;           # For inline documentation
75
76###########################
77## Global Variables
78###########################
# Default location of the GNU date command; --date may override it.
my $DATE = '/bin/date';

# Value handed to exit() at the end of the run. 0 = success.
my $exit_status = 0;

## Options (assigned by ProcessCommandLine from --dir and --date)
my $opt_UsageDir    = '.';      # Directory searched for webalizer usage files
my $opt_DateCommand = $DATE;    # Location of the GNU date command

###########################
##         MAIN
###########################

# Parse the command line first, then let RegenerateHistory do the work;
# its return value becomes the exit status of the script.
ProcessCommandLine();
$exit_status = RegenerateHistory();

# RegenerateHistory reports 2 when it found no usable monthly statistics.
print STDERR "Failed to find any Webalizer usage_YYYYMM.html files.\n"
    if $exit_status == 2;

exit $exit_status;
100
101##########################################################################
102##########################################################################
103####                          END OF MAIN
104##########################################################################
105##########################################################################
106
107
108####             SUBROUTINES
109
110##########################################################################
111##########################################################################
112## ProcessCommandLine
113##       Parse the Commandline Arguments
114##########################################################################
sub ProcessCommandLine {
    # Parse the command line into the file-level option variables
    # $opt_UsageDir (--dir/-d) and $opt_DateCommand (--date).
    #
    # Takes no arguments and returns nothing useful. Exits the script:
    #   * via pod2usage() on a bad option, --help/-? or --man;
    #   * with status 1 if the date command is not an executable file.

    my $opt_Help;    # Local options
    my $opt_Man;     #  use for man page, or help screen

    Getopt::Long::Configure("gnu_getopt");    # Configure to use GNU style Options

    GetOptions("dir|d:s" => \$opt_UsageDir,
               "help|\?" => \$opt_Help,
               "man"     => \$opt_Man,
               "date:s"  => \$opt_DateCommand,
              )
        || pod2usage(-verbose => 0);
    if ($opt_Help) { pod2usage(-verbose => 1); }
    if ($opt_Man)  { pod2usage(-verbose => 2); }

    # "dir" takes an optional value (":s"), so a bare --dir leaves an empty
    # string behind. Treat that the same as not giving the option at all.
    if (!defined($opt_UsageDir) || $opt_UsageDir eq "") {
        $opt_UsageDir = ".";
    }

    # Require a regular, executable file: a plain -x test would also accept
    # a directory. The message goes to STDERR because STDOUT carries the
    # generated history file and must not be polluted with diagnostics.
    if (!(-f $opt_DateCommand && -x _)) {
        printf(STDERR "Invalid Date command: %s\n", $opt_DateCommand);
        exit(1);
    }
} ## end sub ProcessCommandLine
137
138
139##########################################################################
140##########################################################################
141## RegenerateHistory
142##      Do the hard work - process the data, generate the output
143##########################################################################
sub RegenerateHistory {
    # Scan $opt_UsageDir for usage_YYYYMM.html files, pull the monthly totals
    # out of each one and print one history line per year/month to STDOUT.
    #
    # Takes no arguments; reads the file-level $opt_UsageDir.
    # Returns 0 if at least one file contained a monthly statistics header,
    # otherwise 2.
    # Dies if the directory or one of the usage files cannot be opened.

    my %History;        # {year}{month}{FIELD} => value ripped from the html
    my $nofiles = 2;    # Return the value 2 if no files are found

    ### Month stuff: days per month in a non-leap year, indexed by month - 1
    my @DinM = (31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31);

    # Start and end of the interesting part of a page. Each pattern lists
    # the English, German and Dutch wording.
    my $re_section_start = qr/>(Monthly Statistics for|Monats-Statistik f&uuml;r|Maandoverzicht ) /;
    my $re_section_end   = qr/>(Total Unique URLs|Summe unterschiedlicher URLs|Totaal verschillende URL\'s)</;

    # A label line announces a statistic; its value sits on the NEXT line.
    # Order matters: the first matching pattern wins.
    my @stat_labels = (
        [HITS   => qr/>(Total Hits|Summe Anfragen|Totaal hits)</],
        [FILES  => qr/>(Total Files|Summe Dateien|Totaal bestanden)</],
        [PAGES  => qr/>(Total Pages|Summe Seiten|Totaal Pagina\'s)</],
        [VISITS => qr/>(Total Visits|Summe Besuche|Totaal Bezoeken)</],
        [KBYTES => qr/>(Total KBytes|Summe kb|Total kB Files)</],
        [SITES  => qr/>(Total Unique Sites|Summe unterschiedlicher Rechner|Totaal verschillende hosts)</],
    );

    opendir(my $dir_handle, $opt_UsageDir)
        or die "Cannot open directory $opt_UsageDir: $!";

  USAGEFILE:
    while (defined(my $usagefile = readdir $dir_handle)) {

        # Only usage_YYYYMM.html with a real month (01..12) is accepted, so
        # the @DinM lookup further down can never run out of range.
        next USAGEFILE
            unless $usagefile =~ /^usage_([0-9]{4})(0[1-9]|1[0-2])\.html$/;
        my ($cur_year, $cur_month) = ($1, $2);

        open(my $file_handle, '<', "$opt_UsageDir/$usagefile")
            or die "Cannot open file $usagefile: $!";

        # Parser state is per file, so nothing leaks from one page into
        # the next one.
        my $in_MonthlyStats = 0;    # Seen the monthly statistics header yet?
        my $pending_stat;           # Field whose value is due on the next line

      FILELINE:
        while (my $text = <$file_handle>) {
            if (!$in_MonthlyStats) {
                next FILELINE unless $text =~ $re_section_start;
                $in_MonthlyStats = 1;
                $nofiles         = 0;
            }

            # Exit this file, end of useful info
            last FILELINE if $text =~ $re_section_end;

            if (defined $pending_stat) {
                # Splitting on the angle brackets (and keeping them) puts
                # the number between <B> and </B> at index 12 for lines
                # shaped like the sample in the file header.
                my @line = split /(<|>)/, $text;
                $History{$cur_year}{$cur_month}{$pending_stat} = $line[12];
                undef $pending_stat;
                next FILELINE;
            }

            # Else, all the checks for a next section
          LABEL:
            foreach my $label (@stat_labels) {
                my ($field, $pattern) = @{$label};
                if ($text =~ $pattern) {
                    $pending_stat = $field;
                    last LABEL;
                }
            }
        }    ## while (<$file_handle>)
        close($file_handle);
    }    ## while (readdir)
    closedir($dir_handle);

    foreach my $key_year (sort (keys %History)) {
        foreach my $key_month (sort numerically (keys %{$History{$key_year}})) {
            my $stats       = $History{$key_year}{$key_month};
            my $DaysInMonth = $DinM[$key_month - 1];

            # Gregorian leap year rule, computed directly. The external date
            # command is not consulted: the earlier probe sent the command's
            # output to /dev/null, so it could never report a leap year.
            if ($key_month == 2
                && (($key_year % 4 == 0 && $key_year % 100 != 0) || $key_year % 400 == 0))
            {
                $DaysInMonth = 29;
            }

            # Fields: month year hits files sites kbytes 1 days pages visits.
            # Missing or non-numeric values are printed as 0 by %d.
            printf("%d %d %d %d %d %d 1 %d %d %d\n",
                   $key_month,      $key_year,
                   $stats->{HITS},  $stats->{FILES},
                   $stats->{SITES}, $stats->{KBYTES},
                   $DaysInMonth,    $stats->{PAGES},
                   $stats->{VISITS}
                  );
        }    ## foreach $key_month
    }    ## foreach $key_year

    return ($nofiles);
} ## end sub RegenerateHistory
272
273
274##########################################################################
275##########################################################################
276## numerically
277##      Do a numerical sort
278##########################################################################
sub numerically {
    # Comparator for sort(): compares sort's $a and $b as numbers,
    # giving ascending numeric order.
    return $a <=> $b;
}
280
281
282##########################################################################
283##########################################################################
284##########################################################################
285
286__END__
287
288=pod
289
290=head1 NAME
291
292awffull_history_regen.pl - Generate a history file from old Webalizer usage files
293
294=head1 SYNOPSIS
295
296awffull_history_regen.pl [options]
297
298NB! Must have the GNU Date command!
299
300=head1 OPTIONS
301
302=over 8
303
304=item B<--help>
305
306Print a brief help message and exit.
307
308=item B<--man>
309
310Print the manual page and exit.
311
312=item B<--dir directory>
313
314The directory to use, looking for old webalizer usage_YYYYMM.html files. If
315not present will use the current directory.
316
317=item B<--date gnu-date-location>
318
This program requires the GNU date command. Use this option if it is
installed in a non-standard location.

=back

321=head1 DESCRIPTION
322
323Generate a history file from old Webalizer usage files.
324
325The resulting history file is sent only to STDOUT.
326
327=cut
328
329