1package Regexp::Log::Common;
2
3use warnings;
4use strict;
5use base qw( Regexp::Log );
6use vars qw( $VERSION %DEFAULT %FORMAT %REGEXP );
7
8$VERSION = '0.10';
9
10=head1 NAME
11
12Regexp::Log::Common - A regular expression parser for the Common Log Format
13
14=head1 SYNOPSIS
15
16    my $foo = Regexp::Log::Common->new(
17        format  => '%date %request',
18        capture => [qw( ts request )],
19    );
20
21    # the format() and capture() methods can be used to set or get
22    $foo->format('%date %request %status %bytes');
23    $foo->capture(qw( ts req ));
24
25    # this is necessary to know in which order
26    # we will receive the captured fields from the regexp
27    my @fields = $foo->capture;
28
29    # the all-powerful capturing regexp :-)
30    my $re = $foo->regexp;
31
32    while (<>) {
33        my %data;
34        @data{@fields} = /$re/;    # no need for /o, it's a compiled regexp
35
36        # now munge the fields
37        ...
38    }
39
40=head1 DESCRIPTION
41
42Regexp::Log::Common uses Regexp::Log as a base class, to generate regular
43expressions for performing the usual data munging tasks on log files that
44cannot be simply split().
45
46This specific module enables the computation of regular expressions for
parsing the log files created using the Common Log Format. Examples of
this format are the logs generated by the httpd web server using the
keyword 'common'.
50
51The module also allows for the use of the Extended Common Log Format.
52
53For more information on how to use this module, please see Regexp::Log.
54
55=head1 ABSTRACT
56
57Enables simple parsing of log files created using the Common Log Format or the
58Extended Common Log Format, such as the logs generated by the httpd/Apache web
59server using the keyword 'common'.
60
61=cut
62
# Default settings for this class: the format string and the list of
# capture fields. The format is the nine-field (extended) layout; the
# capture list names every field those tokens expose in %REGEXP, including
# the inner variants (ts, req, ref, ua) that omit the surrounding
# brackets or quotes.
# NOTE(review): presumably consumed by the Regexp::Log base class when no
# explicit format/capture is supplied - confirm against Regexp::Log.
%DEFAULT = (
    capture => [ qw(
        host rfc authuser
        date ts
        request req
        status bytes
        referer ref
        useragent ua
    ) ],
    format  => '%host %rfc %authuser %date %request %status %bytes %referer %useragent',
);
69
# Predefined format strings, selected by name (for example
# format => ':common' or format => ':extended').
%FORMAT = ();

# Common Log Format: the seven basic fields.
$FORMAT{':common'}   = '%host %rfc %authuser %date %request %status %bytes';

# ':default' is simply another name for the Common Log Format.
$FORMAT{':default'}  = $FORMAT{':common'};

# Extended Common Log Format: the common fields plus referer and user agent.
$FORMAT{':extended'} = '%host %rfc %authuser %date %request %status %bytes %referer %useragent';
76
# The regexps that match the various fields.
#
# Each key is a token that may appear in a format string; each value is the
# pattern fragment substituted for it. A fragment is wrapped in a pair of
# regexp comments, (?#=name) ... (?#!name), which name the capture field it
# provides. Pairs may be nested: '%date' offers both 'date' (with the
# brackets) and 'ts' (without them).
# NOTE(review): the markers are presumably interpreted by the Regexp::Log
# base class - see its documentation for the exact rules.
#
# Every opening marker must have a matching closing marker with the same
# name, and the name of a single-letter token is that letter.
#
# The misspelled keys '%protcol' and '%queryatring' are retained so that
# existing format strings keep working; new code should use '%protocol' and
# '%querystring'.
%REGEXP = (
#   %a  Remote IP-address
#   %A  Local IP-address
    '%a'            => '(?#=a)\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}(?#!a)',
    '%A'            => '(?#=A)\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}(?#!A)',
    '%remoteip'     => '(?#=remoteip)\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}(?#!remoteip)',
    '%localip'      => '(?#=localip)\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}(?#!localip)',

#   %B  Size of response in bytes, excluding HTTP headers.
#   %b  Size of response in bytes, excluding HTTP headers. In CLF format, i.e. a '-' rather than a 0 when no bytes are sent.
    '%B'            => '(?#=B)\d+(?#!B)',                                   # bytes (non-CLF format)
    '%b'            => '(?#=b)-|\d+(?#!b)',                                 # bytes (CLF format)
    '%bytes'        => '(?#=bytes)-|\d+(?#!bytes)',                         # bytes (CLF and non-CLF format)

#   %D  The time taken to serve the request, in microseconds.
    '%D'            => '(?#=D)\d+(?#!D)',                                   # response time (in microseconds)
    '%time'         => '(?#=time)\d+(?#!time)',                             # response time (in microseconds)

#   %f  Filename
#   ('%F' is the historical spelling used by this module; '%f' is the
#   directive as Apache names it. Both are accepted.)
    '%F'            => '(?#=F)\S+(?#!F)',                                   # filename
    '%f'            => '(?#=f)\S+(?#!f)',                                   # filename
    '%filename'     => '(?#=filename)\S+(?#!filename)',                     # filename

#   %h  Remote host
    '%h'            => '(?#=h)\S+(?#!h)',                                   # numeric or name of remote host
    '%host'         => '(?#=host)\S+(?#!host)',                             # numeric or name of remote host
    '%remotehost'   => '(?#=remotehost)\S+(?#!remotehost)',                 # numeric or name of remote host

#   %H  The request protocol
    '%H'            => '(?#=H)\S+(?#!H)',                                   # protocol
    '%protocol'     => '(?#=protocol)\S+(?#!protocol)',                     # protocol
    '%protcol'      => '(?#=protocol)\S+(?#!protocol)',                     # protocol (legacy misspelling)

#   %{Foobar}i  The contents of Foobar: header line(s) in the request sent to the server.
    '%referer'      => '(?#=referer)\"(?#=ref).*?(?#!ref)\"(?#!referer)',   # "referer"     from \"%{Referer}i\"
    '%useragent'    => '(?#=useragent)\"(?#=ua).*?(?#!ua)\"(?#!useragent)', # "user_agent"  from \"%{User-Agent}i\"

#   %k  Number of keepalive requests handled on this connection. Interesting if KeepAlive is being used, so that, for example, a '1' means the first keepalive request after the initial one, '2' the second, etc...; otherwise this is always 0 (indicating the initial request). Available in versions 2.2.11 and later.
    '%k'            => '(?#=k)\d+(?#!k)',                                   # keep alive requests
    '%keepalive'    => '(?#=keepalive)\d+(?#!keepalive)',                   # keep alive requests

#   %l  Remote logname (from identd, if supplied). This will return a dash unless mod_ident is present and IdentityCheck is set On.
    '%l'            => '(?#=l)\S+(?#!l)',                                   # logname
    '%logname'      => '(?#=logname)\S+(?#!logname)',                       # logname
    '%rfc'          => '(?#=rfc)\S+(?#!rfc)',                               # rfc931

#   %m  The request method
    '%m'            => '(?#=m)\S+(?#!m)',                                   # request method
    '%method'       => '(?#=method)\S+(?#!method)',                         # request method

#   %p  The canonical port of the server serving the request
    '%p'            => '(?#=p)\d+(?#!p)',                                   # port
    '%port'         => '(?#=port)\d+(?#!port)',                             # port

#   %P  The process ID of the child that serviced the request.
    '%P'            => '(?#=P)\d+(?#!P)',                                   # process id
    '%pid'          => '(?#=pid)\d+(?#!pid)',                               # process id

#   %q  The query string (prepended with a ? if a query string exists, otherwise an empty string)
    '%q'            => '(?#=q)\".*?\"(?#!q)',                                   # "query string"
    '%querystring'  => '(?#=querystring)\"(?#=qs).*?(?#!qs)\"(?#!querystring)', # "query string"
    '%queryatring'  => '(?#=queryatring)\"(?#=qs).*?(?#!qs)\"(?#!queryatring)', # "query string" (legacy misspelling)

#   %r  First line of request
    '%r'            => '(?#=r)\".*?\"(?#!r)',                               # "request"
    '%request'      => '(?#=request)\"(?#=req).*?(?#!req)\"(?#!request)',   # "request"

#   %s  Status. For requests that got internally redirected, this is the status of the *original* request --- %>s for the last.
    '%s'            => '(?#=s)\d+(?#!s)',                                   # status
    '%status'       => '(?#=status)\d+(?#!status)',                         # status

#   %t  Time the request was received (standard english format)
    '%t'            => '(?#=t)\[\d{2}\/\w{3}\/\d{4}(?::\d{2}){3} [-+]\d{4}\](?#!t)',                        # [date] (see note 1)
    '%date'         => '(?#=date)\[(?#=ts)\d{2}\/\w{3}\/\d{4}(?::\d{2}){3} [-+]\d{4}(?#!ts)\](?#!date)',    # [date] (see note 1)

#   %T  The time taken to serve the request, in seconds.
    '%T'            => '(?#=T)\d+(?#!T)',                                   # response time (in seconds)
    '%seconds'      => '(?#=seconds)\d+(?#!seconds)',                       # response time (in seconds)

#   %u  Remote user (from auth; may be bogus if return status (%s) is 401)
    '%u'            => '(?#=u)\S+(?#!u)',                                   # authuser
    '%authuser'     => '(?#=authuser)\S+(?#!authuser)',                     # authuser

#   %U  The URL path requested, not including any query string.
#   (The long form for this field is '%request', defined under %r above.)
    '%U'            => '(?#=U)\".*?\"(?#!U)',                               # request

#   %v  The canonical ServerName of the server serving the request.
#   %V  The server name according to the UseCanonicalName setting.
    '%v'            => '(?#=v)\S+(?#!v)',                                   # server name
    '%V'            => '(?#=V)\S+(?#!V)',                                   # server name
    '%servername'   => '(?#=servername)\S+(?#!servername)',                 # server name


#   %X  Connection status when response is completed:
    '%X'            => '(?#=X)\S+(?#!X)',                                   # connection status (X, + or -)
    '%connection'   => '(?#=connection)\S+(?#!connection)',                 # connection status (X, + or -)

#   %I  Bytes received, including request and headers, cannot be zero. You need to enable mod_logio to use this.
#   %O  Bytes sent, including headers, cannot be zero. You need to enable mod_logio to use this.
    '%I'            => '(?#=I)\S+(?#!I)',                                   # Bytes received
    '%O'            => '(?#=O)\S+(?#!O)',                                   # Bytes sent
);
178
179# note 1: date is in the format [01/Jan/1997:13:07:21 -0600]
180
1811;
182
183__END__
184
185=head1 LOG FORMATS
186
187=head2 Common Log Format
188
189The Common Log Format is made up of several fields, each delimited by a single
190space.
191
192=over 4
193
194=item * Apache LogFormat:
195
    LogFormat "%h %l %u %t \"%r\" %>s %b" common
197
Note that the name at the end, in this case 'common', is purely to identify
the format locally, so that you can create a different LogFormat for different
purposes. You then define in your virtual host a log line such as:
201
202    CustomLog /var/www/logs/mysite-access.log common
203
204=item * Fields:
205
206  remotehost rfc931 authuser [date] "request" status bytes
207
208=item * Example:
209
210  127.0.0.1 - - [19/Jan/2005:21:47:11 +0000] "GET /brum.css HTTP/1.1" 304 0
211
212  For the above example:
213  remotehost: 127.0.0.1
214  rfc931: -
215  authuser: -
216  [date]: [19/Jan/2005:21:47:11 +0000]
217  "request": "GET /brum.css HTTP/1.1"
218  status: 304
219  bytes: 0
220
221=item * Available Capture Fields
222
223  * host
224  * rfc
225  * authuser
226  * date
227  ** ts (date without the [])
228  * request
229  ** req (request without the quotes)
230  * status
231  * bytes
232
233=item * Method Call
234
235    my $foo = Regexp::Log::Common->new( format  => ':common' );
236
237=back
238
239=head2 Extended Common Log Format
240
241The Extended Common Log Format is made up of several fields, each delimited by
242a single space.
243
244=over 4
245
246=item * Apache LogFormat:
247
    LogFormat "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-agent}i\"" extended
249
250=item * Fields:
251
252  remotehost rfc931 authuser [date] "request" status bytes "referer" "user_agent"
253
254=item * Example:
255
256  127.0.0.1 - - [19/Jan/2005:21:47:11 +0000] "GET /brum.css HTTP/1.1" 304 0 "http://birmingham.pm.org/" "Mozilla/2.0GoldB1 (Win95; I)"
257
258  For the above example:
259  remotehost: 127.0.0.1
260  rfc931: -
261  authuser: -
262  [date]: [19/Jan/2005:21:47:11 +0000]
263  "request": "GET /brum.css HTTP/1.1"
264  status: 304
265  bytes: 0
266  "referer": "http://birmingham.pm.org/"
267  "user_agent": "Mozilla/2.0GoldB1 (Win95; I)"
268
269=item * Available Capture Fields
270
271  * host
272  * rfc
273  * authuser
274  * date
275  ** ts (date without the [])
276  * request
277  ** req (request without the quotes)
278  * status
279  * bytes
280  * referer
281  ** ref (referer without the quotes)
282  * useragent
283  ** ua (useragent without the quotes)
284
285=item * Method Call
286
287    my $foo = Regexp::Log::Common->new( format  => ':extended' );
288
289=back
290
291=head2 Custom Log Formats
292
There are any number of LogFormat lines you can define, and although this
module doesn't define all the formats, you can specify your own custom format
to extract fields as necessary.
296
297=over 4
298
299=item * Apache LogFormat:
300
301Perhaps, you need to extend the 'extended' format:
302
303    LogFormat "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-agent}i\" %D %v" custom
304
305=item * Example:
306
307This can produce a log line such as:
308
309    103.245.44.14 - - [23/May/2014:21:38:01 +0100] "GET /volume/201109 HTTP/1.0" 200 37748 "-" "binlar_2.6.3 test@mgmt.mic" 2259292 blog.cpantesters.org
310
311=item * Available Capture Fields
312
313Depending on how you define the capture, this can be broken down into fields in
314a few different ways.
315
316  host rfc authuser [date] "request" status bytes "referer" "useragent" time servername
317
or a shorthand variety
319
320  h l u t "r" s b "referer" "useragent" D v
321
322Note that referer and useragent don't have single letter counterparts, as both
323the %{xxx}i and %{xxx}e format fields need to be defined explicitly. Currently
324only referer and useragent are defined from the %{xxx}i field set, and none are
325defined for the %{xxx}e field set. This may be expanded in the future.
326
327=item * Method Call
328
329To define these you would call the constructor, or the individual methods as:
330
    my $foo = Regexp::Log::Common->new(
        format  => '%host %rfc %authuser %date %request %status %bytes ' .
                   '%referer %useragent %time %servername',
        capture => [qw( host rfc authuser ts request status bytes
                        referer useragent time servername)],
    );
337
338or
339
    my $foo = Regexp::Log::Common->new(
        format  => '%h %l %u %t %r %s %b %referer %useragent %D %v',
        capture => [qw( h l u t r s b referer useragent D v)],
    );
344
345=back
346
347=head1 FORMAT FIELDS
348
349There are several format fields available, although this module does not
350support them all. The ones it does currently support are as follows:
351
352    shorthand       => longhand (if applicable)
353
354    '%a'            => '%remoteip'
355    '%A'            => '%localip'
356    '%B'            => '%bytes'
357    '%b'            => '%bytes'
358    '%D'            => '%time'
359    '%F'            => '%filename'
360    '%h'            => '%host' or '%remotehost'
361    '%H'            => '%protcol'
362    '%k'            => '%keepalive'
363    '%l'            => '%logname' or '%rfc'
364    '%m'            => '%method'
365    '%p'            => '%port'
366    '%P'            => '%pid'
367    '%q'            => '%queryatring'
368    '%r'            => '%request'
369    '%s'            => '%status'
370    '%t'            => '%date', also '%ts' (excluding surrounding '[]')
371    '%T'            => '%seconds'
372    '%u'            => '%authuser'
373    '%U'            => '%request' or '%req' (excluding surrounding '"')
374    '%v'            => '%servername'
375    '%V'            => '%servername'
376    '%X'            => '%connection'
377    '%I'
378    '%O'
379
380    %{Foobar}i fields
381
382    '%referer'      => or '%ref' (excluding surrounding '"')
383    '%useragent'    => or '%ua' (excluding surrounding '"')
384
For a more detailed explanation, please see the Apache Log Formats documentation
at L<http://httpd.apache.org/docs/2.2/mod/mod_log_config.html#formats>.
387
388=head1 BUGS, PATCHES & FIXES
389
390There are no known bugs at the time of this release. However, if you spot a
391bug or are experiencing difficulties that are not explained within the POD
392documentation, please submit a bug to the RT system (see link below). However,
393it would help greatly if you are able to pinpoint problems or even supply a
394patch.
395
396Fixes are dependent upon their severity and my availability. Should a fix not
397be forthcoming, please feel free to (politely) remind me by sending an email
398to barbie@cpan.org .
399
400RT: L<http://rt.cpan.org/Public/Dist/Display.html?Name=Regexp-Log-Common>
401
402=head1 SEE ALSO
403
404L<Regexp::Log>
405
406=head1 CREDITS
407
408BooK for initially putting the idea into my head, and the thread on a perl
409message board, that wanted the help that was solved with this exact module.
410
411=head1 AUTHOR
412
413  Barbie <barbie@cpan.org>
414  for Miss Barbell Productions, L<http://www.missbarbell.co.uk>
415
416=head1 COPYRIGHT AND LICENSE
417
418  Copyright (C) 2005-2014 Barbie for Miss Barbell Productions.
419
420  This distribution is free software; you can redistribute it and/or
421  modify it under the Artistic License v2.
422
423=cut
424