package Regexp::Log::Common;

use warnings;
use strict;
use base qw( Regexp::Log );

# Package globals: the module version plus the three tables this class
# populates further down in the file.
# NOTE(review): presumably the Regexp::Log base class looks these tables up
# by package name -- confirm against Regexp::Log before renaming any of them.
our $VERSION = '0.10';
our ( %DEFAULT, %FORMAT, %REGEXP );

=head1 NAME

Regexp::Log::Common - A regular expression parser for the Common Log Format

=head1 SYNOPSIS

  my $foo = Regexp::Log::Common->new(
      format  => '%date %request',
      capture => [qw( ts request )],
  );

  # the format() and capture() methods can be used to set or get
  $foo->format('%date %request %status %bytes');
  $foo->capture(qw( ts req ));

  # this is necessary to know in which order
  # we will receive the captured fields from the regexp
  my @fields = $foo->capture;

  # the all-powerful capturing regexp :-)
  my $re = $foo->regexp;

  while (<>) {
      my %data;
      @data{@fields} = /$re/;    # no need for /o, it's a compiled regexp

      # now munge the fields
      ...
  }

=head1 DESCRIPTION

Regexp::Log::Common uses Regexp::Log as a base class, to generate regular
expressions for performing the usual data munging tasks on log files that
cannot be simply split().

This specific module enables the computation of regular expressions for
parsing the log files created using the Common Log Format. Examples of
this format are the logs generated by the httpd web server using the
keyword 'common'.

The module also allows for the use of the Extended Common Log Format.

For more information on how to use this module, please see Regexp::Log.

=head1 ABSTRACT

Enables simple parsing of log files created using the Common Log Format or the
Extended Common Log Format, such as the logs generated by the httpd/Apache web
server using the keyword 'common'.

=cut

# Default values: the extended format, with every outer and inner capture
# field that format defines.
%DEFAULT = (
    format  => '%host %rfc %authuser %date %request %status %bytes %referer %useragent',
    capture => [ 'host', 'rfc', 'authuser', 'date', 'ts', 'request', 'req',
                 'status', 'bytes', 'referer', 'ref', 'useragent', 'ua' ],
);

# Predefined format strings, selectable by name.
%FORMAT = (
    ':default'  => '%host %rfc %authuser %date %request %status %bytes',
    ':common'   => '%host %rfc %authuser %date %request %status %bytes',
    ':extended' => '%host %rfc %authuser %date %request %status %bytes %referer %useragent',
);

# The regexps that match the various fields.
#
# Each value wraps its pattern in a '(?#=name)' ... '(?#!name)' marker pair.
# The name inside the pair is the capture field name, so it must be spelt
# identically in the opening and closing marker and should match the key it
# belongs to (minus the leading '%').
# NOTE(review): the markers are consumed by the Regexp::Log base class; the
# un-grouped alternation in the '%b' / '%bytes' patterns presumably relies on
# the group Regexp::Log puts around each marked field -- confirm.
%REGEXP = (
# %a    Remote IP-address
# %A    Local IP-address
    '%a'          => '(?#=a)\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}(?#!a)',
    '%A'          => '(?#=A)\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}(?#!A)',
    '%remoteip'   => '(?#=remoteip)\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}(?#!remoteip)',
    '%localip'    => '(?#=localip)\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}(?#!localip)',

# %B    Size of response in bytes, excluding HTTP headers.
# %b    Size of response in bytes, excluding HTTP headers. In CLF format,
#       i.e. a '-' rather than a 0 when no bytes are sent.
    '%B'          => '(?#=B)\d+(?#!B)',                     # bytes (non-CLF format)
    '%b'          => '(?#=b)-|\d+(?#!b)',                   # bytes (CLF format)
    '%bytes'      => '(?#=bytes)-|\d+(?#!bytes)',           # bytes (CLF and non-CLF format)

# %D    The time taken to serve the request, in microseconds.
    '%D'          => '(?#=D)\d+(?#!D)',                     # response time (in microseconds)
    '%time'       => '(?#=time)\d+(?#!time)',               # response time (in microseconds)

# %f    Filename
#       '%f' is the Apache directive; '%F' is the spelling this module has
#       always shipped and is kept so existing formats continue to work.
    '%f'          => '(?#=f)\S+(?#!f)',                     # filename
    '%F'          => '(?#=F)\S+(?#!F)',                     # filename (legacy spelling)
    '%filename'   => '(?#=filename)\S+(?#!filename)',       # filename

# %h    Remote host
    '%h'          => '(?#=h)\S+(?#!h)',                     # numeric or name of remote host
    '%host'       => '(?#=host)\S+(?#!host)',               # numeric or name of remote host
    '%remotehost' => '(?#=remotehost)\S+(?#!remotehost)',   # numeric or name of remote host

# %H    The request protocol
#       '%protcol' is a historical misspelling, kept as an alias so that
#       existing callers do not break.
    '%H'          => '(?#=H)\S+(?#!H)',                     # protocol
    '%protocol'   => '(?#=protocol)\S+(?#!protocol)',       # protocol
    '%protcol'    => '(?#=protocol)\S+(?#!protocol)',       # protocol (legacy alias)

# %{Foobar}i    The contents of Foobar: header line(s) in the request sent
#               to the server.
    '%referer'    => '(?#=referer)\"(?#=ref).*?(?#!ref)\"(?#!referer)',     # "referer" from \"%{Referer}i\"
    '%useragent'  => '(?#=useragent)\"(?#=ua).*?(?#!ua)\"(?#!useragent)',   # "user_agent" from \"%{User-Agent}i\"

# %k    Number of keepalive requests handled on this connection. Interesting
#       if KeepAlive is being used, so that, for example, a '1' means the
#       first keepalive request after the initial one, '2' the second, etc...;
#       otherwise this is always 0 (indicating the initial request).
#       Available in versions 2.2.11 and later.
    '%k'          => '(?#=k)\d+(?#!k)',                     # keep alive requests
    '%keepalive'  => '(?#=keepalive)\d+(?#!keepalive)',     # keep alive requests

# %l    Remote logname (from identd, if supplied). This will return a dash
#       unless mod_ident is present and IdentityCheck is set On.
    '%l'          => '(?#=l)\S+(?#!l)',                     # logname
    '%logname'    => '(?#=logname)\S+(?#!logname)',         # logname
    '%rfc'        => '(?#=rfc)\S+(?#!rfc)',                 # rfc931

# %m    The request method
    '%m'          => '(?#=m)\S+(?#!m)',                     # request method
    '%method'     => '(?#=method)\S+(?#!method)',           # request method

# %p    The canonical port of the server serving the request
    '%p'          => '(?#=p)\d+(?#!p)',                     # port
    '%port'       => '(?#=port)\d+(?#!port)',               # port

# %P    The process ID of the child that serviced the request.
    '%P'          => '(?#=P)\d+(?#!P)',                     # process id
    '%pid'        => '(?#=pid)\d+(?#!pid)',                 # process id

# %q    The query string (prepended with a ? if a query string exists,
#       otherwise an empty string)
#       '%queryatring' is a historical misspelling, kept as an alias so that
#       existing callers do not break.
    '%q'           => '(?#=q)\".*?\"(?#!q)',                                         # "query string"
    '%querystring' => '(?#=querystring)\"(?#=qs).*?(?#!qs)\"(?#!querystring)',       # "query string"
    '%queryatring' => '(?#=queryatring)\"(?#=qs).*?(?#!qs)\"(?#!queryatring)',       # "query string" (legacy alias)

# %r    First line of request
    '%r'          => '(?#=r)\".*?\"(?#!r)',                                 # "request"
    '%request'    => '(?#=request)\"(?#=req).*?(?#!req)\"(?#!request)',     # "request"

# %s    Status. For requests that got internally redirected, this is the
#       status of the *original* request --- %>s for the last.
    '%s'          => '(?#=s)\d+(?#!s)',                     # status
    '%status'     => '(?#=status)\d+(?#!status)',           # status

# %t    Time the request was received (standard english format)
    '%t'          => '(?#=t)\[\d{2}\/\w{3}\/\d{4}(?::\d{2}){3} [-+]\d{4}\](?#!t)',                    # [date] (see note 1)
    '%date'       => '(?#=date)\[(?#=ts)\d{2}\/\w{3}\/\d{4}(?::\d{2}){3} [-+]\d{4}(?#!ts)\](?#!date)', # [date] (see note 1)

# %T    The time taken to serve the request, in seconds.
    '%T'          => '(?#=T)\d+(?#!T)',                     # response time (in seconds)
    '%seconds'    => '(?#=seconds)\d+(?#!seconds)',         # response time (in seconds)

# %u    Remote user (from auth; may be bogus if return status (%s) is 401)
    '%u'          => '(?#=u)\S+(?#!u)',                     # authuser
    '%authuser'   => '(?#=authuser)\S+(?#!authuser)',       # authuser

# %U    The URL path requested, not including any query string.
#       The long form documented for this field is '%request', which is
#       already defined under %r above.
    '%U'          => '(?#=U)\".*?\"(?#!U)',                 # request

# %v    The canonical ServerName of the server serving the request.
# %V    The server name according to the UseCanonicalName setting.
    '%v'          => '(?#=v)\S+(?#!v)',                     # server name
    '%V'          => '(?#=V)\S+(?#!V)',                     # server name
    '%servername' => '(?#=servername)\S+(?#!servername)',   # server name

# %X    Connection status when response is completed:
    '%X'          => '(?#=X)\S+(?#!X)',                     # connection status (X, + or -)
    '%connection' => '(?#=connection)\S+(?#!connection)',   # connection status (X, + or -)

# %I    Bytes received, including request and headers, cannot be zero.
#       You need to enable mod_logio to use this.
# %O    Bytes sent, including headers, cannot be zero.
#       You need to enable mod_logio to use this.
    '%I'          => '(?#=I)\S+(?#!I)',                     # Bytes received
    '%O'          => '(?#=O)\S+(?#!O)',                     # Bytes sent
);

# note 1: date is in the format [01/Jan/1997:13:07:21 -0600]

1;

__END__

=head1 LOG FORMATS

=head2 Common Log Format

The Common Log Format is made up of several fields, each delimited by a single
space.

=over 4

=item * Apache LogFormat:

  LogFormat "%h %l %u %t \"%r\" %>s %b" common

Note that the name at the end, in this case 'common', is purely to identify
the format locally, so that you can create a different LogFormat for different
purposes.
You then define in your virtual host a log line such as:

  CustomLog /var/www/logs/mysite-access.log common

=item * Fields:

  remotehost rfc931 authuser [date] "request" status bytes

=item * Example:

  127.0.0.1 - - [19/Jan/2005:21:47:11 +0000] "GET /brum.css HTTP/1.1" 304 0

  For the above example:
  remotehost: 127.0.0.1
  rfc931: -
  authuser: -
  [date]: [19/Jan/2005:21:47:11 +0000]
  "request": "GET /brum.css HTTP/1.1"
  status: 304
  bytes: 0

=item * Available Capture Fields

  * host
  * rfc
  * authuser
  * date
  ** ts (date without the [])
  * request
  ** req (request without the quotes)
  * status
  * bytes

=item * Method Call

  my $foo = Regexp::Log::Common->new( format => ':common' );

=back

=head2 Extended Common Log Format

The Extended Common Log Format is made up of several fields, each delimited by
a single space.

=over 4

=item * Apache LogFormat:

  LogFormat "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-agent}i\"" extended

=item * Fields:

  remotehost rfc931 authuser [date] "request" status bytes "referer" "user_agent"

=item * Example:

  127.0.0.1 - - [19/Jan/2005:21:47:11 +0000] "GET /brum.css HTTP/1.1" 304 0 "http://birmingham.pm.org/" "Mozilla/2.0GoldB1 (Win95; I)"

  For the above example:
  remotehost: 127.0.0.1
  rfc931: -
  authuser: -
  [date]: [19/Jan/2005:21:47:11 +0000]
  "request": "GET /brum.css HTTP/1.1"
  status: 304
  bytes: 0
  "referer": "http://birmingham.pm.org/"
  "user_agent": "Mozilla/2.0GoldB1 (Win95; I)"

=item * Available Capture Fields

  * host
  * rfc
  * authuser
  * date
  ** ts (date without the [])
  * request
  ** req (request without the quotes)
  * status
  * bytes
  * referer
  ** ref (referer without the quotes)
  * useragent
  ** ua (useragent without the quotes)

=item * Method Call

  my $foo = Regexp::Log::Common->new( format => ':extended' );

=back

=head2 Custom Log Formats

There are any number of LogFormat lines you can define, and although this
module doesn't define all the formats, you can specify your own custom format
to extract fields as necessary.

=over 4

=item * Apache LogFormat:

Perhaps you need to extend the 'extended' format:

  LogFormat "%h %l %u %t \"%r\" %>s %b \"%{Referer}i\" \"%{User-agent}i\" %D %v" custom

=item * Example:

This can produce a log line such as:

  103.245.44.14 - - [23/May/2014:21:38:01 +0100] "GET /volume/201109 HTTP/1.0" 200 37748 "-" "binlar_2.6.3 test@mgmt.mic" 2259292 blog.cpantesters.org

=item * Available Capture Fields

Depending on how you define the capture, this can be broken down into fields in
a few different ways.

  host rfc authuser [date] "request" status bytes "referer" "useragent" time servername

or a shorthand variety

  h l u t "r" s b "referer" "useragent" D v

Note that referer and useragent don't have single letter counterparts, as both
the %{xxx}i and %{xxx}e format fields need to be defined explicitly. Currently
only referer and useragent are defined from the %{xxx}i field set, and none are
defined for the %{xxx}e field set. This may be expanded in the future.

=item * Method Call

To define these you would call the constructor, or the individual methods as:

  my $foo = Regexp::Log::Common->new(
      format  => '%host %rfc %authuser %date %request %status %bytes ' .
                 '%referer %useragent %time %servername',
      capture => [qw( host rfc authuser ts request status bytes
                      referer useragent time servername)],
  );

or

  my $foo = Regexp::Log::Common->new(
      format  => '%h %l %u %t %r %s %b %referer %useragent %D %v',
      capture => [qw( h l u t r s b referer useragent D v)],
  );

=back

=head1 FORMAT FIELDS

There are several format fields available, although this module does not
support them all. The ones it does currently support are as follows:

  shorthand => longhand (if applicable)

  '%a' => '%remoteip'
  '%A' => '%localip'
  '%B' => '%bytes'
  '%b' => '%bytes'
  '%D' => '%time'
  '%F' => '%filename'
  '%h' => '%host' or '%remotehost'
  '%H' => '%protcol'
  '%k' => '%keepalive'
  '%l' => '%logname' or '%rfc'
  '%m' => '%method'
  '%p' => '%port'
  '%P' => '%pid'
  '%q' => '%queryatring'
  '%r' => '%request'
  '%s' => '%status'
  '%t' => '%date', also '%ts' (excluding surrounding '[]')
  '%T' => '%seconds'
  '%u' => '%authuser'
  '%U' => '%request' or '%req' (excluding surrounding '"')
  '%v' => '%servername'
  '%V' => '%servername'
  '%X' => '%connection'
  '%I'
  '%O'

  %{Foobar}i fields

  '%referer',   or '%ref' (excluding surrounding '"')
  '%useragent', or '%ua'  (excluding surrounding '"')

For a more detailed explanation, please see the Apache Log Formats
documentation at
L<http://httpd.apache.org/docs/2.2/mod/mod_log_config.html#formats>.

=head1 BUGS, PATCHES & FIXES

There are no known bugs at the time of this release. However, if you spot a
bug or are experiencing difficulties that are not explained within the POD
documentation, please submit a bug to the RT system (see link below). However,
it would help greatly if you are able to pinpoint problems or even supply a
patch.

Fixes are dependent upon their severity and my availability. Should a fix not
be forthcoming, please feel free to (politely) remind me by sending an email
to barbie@cpan.org .

RT: L<http://rt.cpan.org/Public/Dist/Display.html?Name=Regexp-Log-Common>

=head1 SEE ALSO

L<Regexp::Log>

=head1 CREDITS

BooK for initially putting the idea into my head, and the thread on a perl
message board, that wanted the help that was solved with this exact module.

=head1 AUTHOR

  Barbie <barbie@cpan.org>
  for Miss Barbell Productions, L<http://www.missbarbell.co.uk>

=head1 COPYRIGHT AND LICENSE

  Copyright (C) 2005-2014 Barbie for Miss Barbell Productions.

  This distribution is free software; you can redistribute it and/or
  modify it under the Artistic License v2.

=cut
