# Sequin v1.1.2
#
# by Peter Sergeant <cpan@clueball.com>
#
# A module for extracting and parsing search engine URLs from
# server referrer files. Proper usage information is in the
# README file


# Magic Package Stuff
require 5.005;
use strict;
require Exporter;
package URI::Sequin;
use vars qw(@ISA $VERSION @EXPORT_OK %log_types);
@ISA       = qw(Exporter);
@EXPORT_OK = qw(se_extract log_extract %log_types key_extract);
$VERSION   = '1.2';



# &log_extract v1.0
# =-=-=-=-=-=- =-=-
#
#
# The purpose of this subroutine is to allow raw log file lines to be
# handled. The subroutine accepts a log line, plus the name of the log
# type describing how it should be analysed, and returns a scalar
# value: the referring URL. Nothing (undef / the empty list) is
# returned when the log type is unknown, its regex does not compile,
# or the line does not match.
#
# The subroutine knows a certain number of log types, and keeps regexes
# with which to handle these logs in a globally accessible hash below,
# called '%log_types'. If your log type is not already in the hash it
# can be added and used.
#
# Examples:
# ---------
#
# Adding a new regex to %log_types:
# => $log_types{'MyWebServer'} = '.+? Referer:(.+?) ';
#
#    > It's worth pointing out that the subroutine uses $1 straight
#    > after the match has taken place to get the referrer. Because
#    > of this, you should make sure the part of string to be taken
#    > is enclosed in ()'s. If you're still unsure, this is clearly
#    > demonstrated below, where %log_types is set.
#
# Parsing a Log Entry
# => $referrer = &log_extract($log_line, 'NCSA');
#
#    > As I hope is clear, $log_line is the log-file line that needs
#    > to be parsed, and 'NCSA' refers to the relevant regex below
#    > in the %log_types hash. When no type is given, 'General' is
#    > used.
#

%log_types = (

    # Microsoft IIS 3.0 and 2.0
    'IIS1'    => '(https?:.+?),',

    # Microsoft IIS4.0 (W3SVC format)
    'IIS2'    => '(https?:.+?)$',

    # NCSA (Apache, Netscape)
    'NCSA'    => '"(https?:.+?)"',

    # O'Reilly WebSite format
    # NOTE(review): the exact delimiter whitespace of this pattern could
    # not be recovered from the source text; \s accepts either a space
    # or a tab on both sides - confirm against real WebSite logs.
    'ORW'     => '\s(https?:.+?)\s',

    # General (works for most logtypes)
    'General' => '(?:\s|"|,|^)(https?:.+?)(\s|"|,|$)',

);



sub log_extract {

    my $log_file_line = $_[0];
    my $log_file_type = $_[1] || 'General';

    return unless defined $log_file_line;
    chomp($log_file_line);

    # Refuse unknown (or empty) log types up front. Matching against an
    # empty pattern would make Perl silently reuse the last regex that
    # matched successfully, and so return an unrelated $1.

    my $pattern = $log_types{$log_file_type};

    unless (defined $pattern && $pattern ne '') {
        warn "Unknown Logtype - \"$log_file_type\"\n";
        return;
    }

    # Compile inside eval so that a broken user-supplied regex produces
    # a warning and an empty result instead of killing the caller.

    my $re = eval { qr/$pattern/i };

    if ($@ || !defined $re) {
        warn "Bad re: '$pattern' ($@)\n";
        return;
    }

    # Return what we found

    if ($log_file_line =~ $re) { return $1 }

    return;

}



# &se_extract v1.1
# =-=-=-=-=-= =-=-
#
# The purpose of this subroutine is to break down the referring URL in
# to an array, containing the $search_engine_name and the
# $search_engine_url. A reference to that array is returned; a
# reference to an empty array is returned when the URL carries no
# query string.
#
# Example:
# => ($name, $url) = @{&se_extract($url)};
#

# Search engines that we know by name, so we can provide even more
# information than the generic host-name guess and make sure it's
# correct. Each entry is [ name, canonical URL, host-label pattern ].
# The patterns are compiled once, at load time, and are anchored on
# host-name label boundaries so that e.g. 'av' matches www.av.com but
# not search.naver.com.

my @known_search_engines = (
    [ 'Altavista',   'http://www.av.com',          'altavista|av' ],
    [ 'HotBot',      'http://www.hotbot.com',      'hotbot\.lycos' ],
    [ 'Infoseek',    'http://www.infoseek.com',    'infoseek\.go' ],
    [ 'Magellan',    'http://magellan.excite.com', 'magellan\.excite' ],
    [ 'Ask Jeeves',  'http://www.aj.com',          'aj|askjeeves' ],
    [ 'CNET Search', 'http://www.search.com',      'cnet|search\.com|savysearch' ],
);

for my $engine (@known_search_engines) {
    my $labels = $engine->[2];
    $engine->[2] = qr/(?:\A|\.)(?:$labels)(?:\.|\z)/i;
}

sub se_extract {

    my $input_url = $_[0];
    return [] unless defined $input_url;
    chomp($input_url);

    # Break down the $input_url into two more useful variables, so
    # that we can check if there is information in the query
    # string, and if there is, we just get on with life. The limit
    # of 2 keeps any further '?' characters inside the query string.

    my ($location, $query_string) = split(/\?/, $input_url, 2);
    return [] unless defined $query_string && length $query_string;

    my $search_engine_name;
    my $search_engine_url;

    # Scheme to put back in front of a URL that was captured without
    # one; replaced by the scheme actually seen in $location.

    my $scheme = 'http://';

    # A bare IP address as the host. Anchored at the start so that an
    # IP-looking path segment is not mistaken for the host.

    if ($location =~ m!^(?:\w+://)?(\d+\.\d+\.\d+\.\d+(?::\d+)?)/!) {
        return ["Unknown (IP)", $1];
    }

    # This is a scary regex. It picks out with surprising accuracy
    # the main part of a URL - the 'MSN' part of:
    # http://biteme15.search.cgi.msn.com.uk/?asdfasdf

    if ($location =~ m!^(.+?\.
                         ([^\.]+)
                         \.
                         (?:com|net|org|int|mil|\w\w|
                            (?:gov|mil|com|net|org|\w\w)\.\w\w
                         )
                         (?:/|:\d+/)
                       )!x) {
        $search_engine_url  = $1;
        $search_engine_name = "\u$2";

    } elsif ($location =~ m!^(https?://)?((\w+)\.\w+(?::\d+/?)?)!i) {
        # Two-label host such as google.com, with or without a scheme.
        $scheme             = $1 if defined $1;
        $search_engine_url  = $2;
        $search_engine_name = "\u$3";

    } elsif ($location =~ m!^(https?://)?((\w+)(?:[^\.\w:]|(?::\d+/?)?))!i) {
        # Single-label host such as localhost.
        $scheme             = $1 if defined $1;
        $search_engine_url  = $2;
        $search_engine_name = "\u$3";
    }

    # This has allowed us to quite accurately get the name and URL
    # of any given search-engine. However, in the interests of
    # total accuracy, check the host name against the list of
    # search-engines that we know. As before, a later entry in the
    # list overrides an earlier one.

    my ($host) = $location =~ m!^(?:\w+://)?([^/:]+)!;

    if (defined $host) {
        for my $engine (@known_search_engines) {
            my ($se_name, $se_url, $se_regex) = @{$engine};

            if ($host =~ $se_regex) {
                $search_engine_url  = $se_url;
                $search_engine_name = $se_name;
            }
        }
    }

    # Return what we know.
    # jm: allow HTTPS search engines too ;)

    if (defined $search_engine_url && $search_engine_url !~ m!^https?://!i) {
        $search_engine_url = $scheme . $search_engine_url;
    }

    return [$search_engine_name, $search_engine_url];

}


# &key_extract v1.1
# =-=-=-=-=-= =-=-
#
# The purpose of this subroutine is to break down the referring URL in
# to a string containing the search terms.
#
# Example:
# => $terms = &key_extract($url);
#

# Matches the first field whose value contains an encoded space ('+'
# or '%2b'), provided the field name does not end in 'next', 'col',
# 'btnG' (blame google), 'submit', 'rfr', 'WILDCARD' or 'METAENGINE'.
# $1 is the raw field value.

my $spaced_field_re = qr/(?<!next)(?<!col)(?<!btnG)(?<!submit)
                         (?<!rfr)(?<!WILDCARD)(?<!METAENGINE)=
                         ([^&]*(?:\+|%2b)[^&]*)/xi;

# Many many possible field names (as regex fragments) for a term that
# might contain our data.
#
# NB: This isn't quite finished. If you're finding that the
# wrong prefix is being used, please email me and tell me
# at pete_sergeant@hotmail.com
#
# If you're wondering why they're ordered in this slightly
# bizarre and seemingly random order, it's because some
# search engines have decided to use more than one of these
# variables, and the order these are in hopefully picks the
# right one first.

my @prefix_array = (

    '\w*query\w*',          # CNET Search, Netscape
    '\w*search(?!Type)\w*',
    '\w*term\w*',
    'ask',                  # Ask Jeeves
    '.\w?key.\w?',
    'palabras',
    'DTqb1',
    'request',
    'ShowMatch',            # syndic8
    'keywords?',            # Snap, overture.com
    'general',              # MetaCrawler, Go2Net
    'key',                  # Looksmart
    'MetaTopic',            # AJ
    'query0',               # elf8888.at, thx to http://www.tnl.net/
    'queryString',          # blogdigger.com
    'serachfor',            # mysearch.com dyslexia ;)
    'terms',                # abcsearch.com
    'word',                 # baidu.com
    'rn',
    'mt',                   # MSN, HotBot
    'qt',                   # Go, Infoseek, search.com
    'oq',
    'dom',                  # Domainsurfer
    's',                    # Excite, blogsphere.us
    'q',                    # Altavista, Google, Dogpile, Evreka, Metafind
    'p',                    # Yahoo
    't',
    'qry',
    'qkw',                  # dpxml, msxml
    'qr',                   # northernlight.com
    'qu',
    'kw',                   # Sapo
    'general',
    'B1',
    'sc',                   # Gohip
    'szukaj',
    'PA',
    'MT',                   # goo.ne.jp
    'req',                  # dir.com
    'k',                    # galaxy.com
    'cat',                  # Dmoz
    'u',                    # Google translation
    'va',                   # search.yahoo.com
    'K',                    # srd.yahoo.com
    'as_epq'                # Google, sometimes. Advanced query maybe?

);

# One compiled regex per prefix, in the same order; $1 is the raw
# field value. Built once at load time rather than on every call.

my @prefix_res = map { qr/(?:^|&)${_}=(.+?)(?:&|$)/i } @prefix_array;

# Clean a raw field value from those nasty escape sequences: turn '+'
# into a space, decode %XX escapes and squeeze runs of whitespace into
# a single space. Returns the cleaned copy.

sub _clean_terms {

    my $text = $_[0];

    $text =~ tr/+/ /;
    $text =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;
    $text =~ s/\s+/ /g;

    return $text;
}

# key_extract($url)
#
# Accepts a referring URL and returns the search terms found in its
# query string as a cleaned scalar. Returns nothing (undef / the
# empty list) when the URL has no query string or no terms could be
# identified. The caller's $_ is left untouched.

sub key_extract {

    my $input_url = $_[0];
    return unless defined $input_url;

    chomp($input_url);

    # Break down the $input_url in to two more useful variables. The
    # limit of 2 keeps any further '?' characters (for example one
    # typed into the search box) inside the query string.

    my ($location, $query_string) = split(/\?/, $input_url, 2);
    return unless defined $query_string && length $query_string;

    # Google Caching ... This will deal with it, how Google
    # currently works: the terms follow the last '/' of the cached
    # address inside the q= field. The match is confined to that one
    # field, and the field may be the last one in the query string.

    if ($query_string =~ m!q=cache\:[^&]+/([^&]+)!i) {
        my $cached_terms = _clean_terms("$1");
        $cached_terms =~ s/^\s+//;
        $cached_terms =~ s/\s+$//;
        return $cached_terms;
    }

    # There are a number of ways in which we now try and determine
    # what the search terms are. The first is quite clever, IMHO.
    # We search for spaces in any of the submitted fields that
    # isn't one of the known non-search fields.

    if ($query_string =~ $spaced_field_re) {

        my $key_string = $1;

        # Some search engines are determined to try and fool us
        # :). Therefore, we skip pseudo-matches containing %02 or
        # %7C%7C (two encoded pipes: || ) and carry on with the
        # prefix search below instead.

        my $is_pseudo_match = $key_string =~ m/(?:%02|%7C%7C)/;

        return _clean_terms($key_string) unless $is_pseudo_match;

    }

    # Okay. If that failed, then we need to take a closer look.
    # Cycle through each prefix and see if it's contained in the
    # query_string. If it is, we extract the field, clean it, and
    # return it, provided it contains at least one word character.

    for my $prefix_re (@prefix_res) {

        next unless $query_string =~ $prefix_re;

        my $key_string = _clean_terms("$1");

        return $key_string if $key_string =~ /\w/;

    }

    # Failing all that, some Search-Engines don't overload the
    # query_string with values, and just make the query_string
    # the search terms. The next part looks for that, and returns
    # the whole query_string (cleaned) if this appears to be the
    # case.

    if ($query_string !~ /\=/) {
        return _clean_terms($query_string);
    }

    return;
}

1;

__END__

=head1 NAME

URI::Sequin - Extract information from the URLs of Search-Engines

=head1 SYNOPSIS


  use URI::Sequin qw/se_extract key_extract log_extract %log_types/;

  $url = &log_extract($line_from_log_file, 'NCSA');

  $log_types{'MyLogType'} = '^(.+?) -> .+$';
  $url = &log_extract($line_from_log_file, 'MyLogType');

  $keyword_string = &key_extract($url);

  ($search_engine_name, $search_engine_url) = @{&se_extract($url)};


=head1 DESCRIPTION

This module provides three tools to aid people trying to analyse
Search-Engine URLs. It's meant mainly for those who want to analyse
referrer logs and pick out key information about site visitors, such as
which Search-Engine and keywords they used to find the site.

The functions and globals provided (and available for export on request)
from this module are:

=over

=item log_extract($log_line, 'Type')

This will pick out the referring URL from a line of a logfile. The 'type' can
be one of the built in types or can be a user-created one. For more
information, see %log_types below.
This subroutine accepts a log line and an optional type name (defaulting to
'General'), and returns a scalar.

=item key_extract($url)

This will try and determine the keywords used in $url. It accepts a scalar
and returns a scalar. Should nothing be found, it returns an undefined value.

=item se_extract($url)

This will try and determine the name of the Search-Engine used and its URL.
It accepts a scalar, and returns a reference to an array containing firstly
the Search-Engine's name and secondly the Search-Engine's URL. Should the URL
appear not to be from a Search Query, it returns a reference to an empty array.

=item %log_types

There are five built-in logfile types already in this hash. They are:

=over 4

=item * IIS1 - Microsoft IIS 3.0 and 2.0

=item * IIS2 - Microsoft IIS4.0 (W3SVC format)

=item * NCSA - For APACHE, NETSCAPE and any other NCSA format logs

=item * ORW - O'Reilly WebSite format

=item * General - A generalised one that will work with most logfiles

=back

It's easy to add another one. Simply add a key to the hash, with a value that
is a regex. Parenthesise the part that is the referring URL, as the script
uses $1 to obtain the URL. (see the example in the Synopsis section).

I have only one request for people who use this module. *Please* tell me where
and how you've used it, and if you have any thoughts or suggestions on it, tell
me!

=back

=head1 BUGS

Doesn't like the Amnesi Search Engine. But then, neither do I. Also,
the 'General' log type needs to be used with discretion ... be sure
that none of the URLs contain literal " if you use it.

=head1 AUTHOR

Peter Sergeant E<lt>pete@grou.chE<gt>

=head1 COPYRIGHT

Copyright 2001 Peter Sergeant.

This program is free software; you can redistribute it and/or modify it under
the same terms as Perl itself.

=cut