1# Sequin v1.1.2
2#
3# by Peter Sergeant <cpan@clueball.com>
4#
5# A module for extracting and parsing search engine URLs from
6# server referrer files. Proper usage information is in the
7# README file
8
9
10# Magic Package Stuff
11require 5.005;
12use strict;
13require Exporter;
14package URI::Sequin;
15use vars qw(@ISA $VERSION @EXPORT_OK %log_types);
16@ISA = qw(Exporter);
17@EXPORT_OK = qw(se_extract log_extract %log_types key_extract);
18$VERSION = '1.2';
19
20
21
22# &log_extract v1.0
23# =-=-=-=-=-=- =-=-
24#
25#
26# The purpose of this subroutine is to allow raw log files lines to be
27# handled. The subroutine accepts a log line, plus some information on
28# how it should be analysed, and returns a scalar value: the referring
29# URL.
30#
# The subroutine knows a certain number of log types, and keeps regexes
# with which to handle these logs in a globally accessible hash below,
# called '%log_types'. If your log type is not already in the hash it
# can be added and used.
35#
36# Examples:
37# ---------
38#
39# Adding a new regex to %log_types:
40# => $log_types{'MyWebServer'} = '.+? Referer:(.+?) ';
41#
42#	> It's worth pointing out that the subroutine uses $1 straight
43#	> after the match has taken place to get the referrer. Because
44#	> of this, you should make sure the part of string to be taken
45#	> is enclosed in ()'s. If you're still unsure, this is clearly
46#	> demonstrated below, where %log_types is set.
47#
48# Parsing a Log Entry
49# => $referrer = &log_extract($log_line, 'NCSA');
50#
#	> As I hope is clear, $log_line is the log-file line that needs
#	> to be parsed, and 'NCSA' refers to the relevant regex below
#	> in the %log_types hash.
54#
55
# Patterns used by log_extract() to find the referring URL in a log line,
# keyed by log-format name. Every pattern MUST capture the URL in its
# first set of parentheses, because log_extract() returns $1. Patterns
# are applied case-insensitively and accept both http: and https: URLs.

%log_types = (
	# Microsoft IIS 3.0 and 2.0
	'IIS1'		=>	'(https?:.+?),',

	# Microsoft IIS4.0 (W3SVC format)
	'IIS2'		=>	'(https?:.+?)$',

	# NCSA (Apache, Netscape)
	'NCSA'		=>	'"(https?:.+?)"',

	# O'Reilly WebSite format
	'ORW'		=>	' (https?:.+?) ',

	# General (works for most logtypes)
	'General'	=>	'(?:\s|"|,|^)(https?:.+?)(\s|"|,|$)',

);



sub log_extract {

	# Arguments:
	#   $_[0] - one raw line from a server log
	#   $_[1] - a key of %log_types (defaults to 'General')
	#
	# Returns the text captured by the first group of the selected
	# pattern (the referring URL). Returns nothing (undef in scalar
	# context, the empty list in list context) when the line is
	# undefined, the log type is unknown, its pattern does not compile,
	# or the line does not match.

	my $log_file_line	=	$_[0];
	my $log_file_type	=	$_[1] || 'General';

	return unless defined $log_file_line;
	chomp($log_file_line);

	my $pattern = $log_types{$log_file_type};

	# An undefined or empty pattern must never reach the match operator:
	# Perl treats an empty pattern as "reuse the last successful regex",
	# which would hand back a stale and unrelated $1.

	unless (defined $pattern && length $pattern) {
		warn "Unknown Logtype - \"$log_file_type\"\n";
		return;
	}

	# Compile once inside eval so that a malformed user-supplied pattern
	# produces a warning instead of killing the caller, then match with
	# the compiled object so the raw string is never recompiled outside
	# the eval.

	my $re = eval { qr/$pattern/i };
	if ($@ || !defined $re) {
		warn "Bad re: '$pattern' ($@)\n";
		return;
	}

	# Return what we found

	if ($log_file_line =~ $re) { return $1 }

	return;

}
102
103
104
105# &se_extract v1.1
106# =-=-=-=-=-= =-=-
107#
108# The purpose of this subroutine is to break down the referring URL in
109# to an array, containing the $search_engine_name and the
110# $search_engine_url.
111#
112# Example:
113# => ($name, $url) = @{&se_extract($url)};
114#
115
sub se_extract {

	# Argument:
	#   $_[0] - the referring URL
	#
	# Returns a reference to an array:
	#   []                        the URL has no query string
	#   ['Unknown (IP)', $addr]   the host is a dotted-quad address
	#                             ($addr keeps any :port, no scheme)
	#   [$name, $url]             otherwise; both are undef when no
	#                             host pattern matched at all

	my $input_url = $_[0];
	return [] unless defined $input_url;
	chomp($input_url);

	# Break the URL into the part before the query string and the
	# query string itself. The limit of 2 splits on the FIRST '?'
	# only, so a literal '?' inside the query is not lost.

	my ($location, $query_string) = split(/\?/, $input_url, 2);
	return [] unless $query_string;

	my $search_engine_name;
	my $search_engine_url;

	# Dotted-quad hosts carry no name. The pattern is anchored so that
	# an IP-looking path segment further along is not mistaken for
	# the host.

	if ($location =~ m!^(?:https?://)?(\d+\.\d+\.\d+\.\d+(?::\d+)?)(?:/|$)!) {
		return ["Unknown (IP)", $1];
	}

	# This is a scary regex. It picks out with surprising accuracy
	# the main part of a URL - the 'MSN' part of:
	# http://biteme15.search.cgi.msn.com.uk/?asdfasdf
	#
	# [^/] keeps the whole match inside the host: it can never wander
	# past the first '/' and pick a "name" out of the path.

	if ($location =~ m!^((?:\w+://)?[^/]+?\.
				([^\./]+)
				\.
				(?:com|net|org|int|mil|\w\w|
					(?:gov|mil|com|net|org|\w\w)\.\w\w
				)
				(?:/|:\d+/)
			  )!x) {
		$search_engine_url	= $1;
		$search_engine_name	= "\u$2";

	# Fallbacks for two-label hosts (name.tld) and single-label
	# hosts. The scheme, when present, is captured and kept so that
	# https:// referrers are not rewritten to http://.

	} elsif ($location =~ m!^(https?://)?((\w+)\.\w+(\:\d+/?)?)!) {
		$search_engine_url	= (defined $1 ? $1 : '') . $2;
		$search_engine_name	= "\u$3";
	} elsif ($location =~ m!^(https?://)?((\w+)([^\.\w:]|(\:\d+/?)?))!) {
		$search_engine_url	= (defined $1 ? $1 : '') . $2;
		$search_engine_name	= "\u$3";
	}


	# This has allowed us to quite accurately get the name and URL
	# of any given search-engine. However, in the interests of
	# total accuracy, we have a list of search-engines that we know
	# so we can provide even more information, and make sure it's
	# correct.
	#
	# Each entry is [ name, canonical URL, host-label pattern ].

	my @search_engine_array = (
		['Altavista',	'http://www.av.com',
			'altavista|av'],
		['HotBot',	'http://www.hotbot.com',
			'hotbot\.lycos'],
		['Infoseek',	'http://www.infoseek.com',
			'infoseek\.go'],
		['Magellan',	'http://magellan.excite.com',
			'magellan\.excite'],
		['Ask Jeeves',	'http://www.aj.com',
			'aj|askjeeves'],
		['CNET Search',	'http://www.search.com',
			'cnet|search\.com|savv?ysearch'],
	);

	# The patterns are tested against the host only, and must cover
	# whole dot-separated labels. Matching bare substrings anywhere
	# in the URL used to turn e.g. www.java.com into Altavista
	# ('av') and any path containing 'aj' into Ask Jeeves.

	my ($host) = $location =~ m!^(?:\w+://)?([^/:]+)!;

	if (defined $host) {

		for my $engine (@search_engine_array) {

			my ($se_name, $se_url, $se_regex) = @{$engine};

			# No 'last': as before, a later entry wins
			# when several match.

			if ($host =~ m/(?:^|\.)(?:$se_regex)(?:\.|$)/i) {
				$search_engine_url = $se_url;
				$search_engine_name = $se_name;
			}

		}

	}

	# Return what we know.
	# jm: allow HTTPS search engines too ;)

	if (defined $search_engine_url && $search_engine_url !~ m!^https?://!) {
		$search_engine_url =~ s!^!http://!;
	}

	return [$search_engine_name, $search_engine_url];

}
205
206
207# &key_extract v1.1
208# =-=-=-=-=-= =-=-
209#
210# The purpose of this subroutine is to break down the referring URL in
211# to a string containing the search terms.
212#
213# Example:
214# => $terms = &key_extract($url);
215#
216
sub key_extract {

	# Argument:
	#   $_[0] - the referring URL
	#
	# Returns the search terms as a decoded string, or nothing (undef
	# in scalar context, the empty list in list context) when the URL
	# has no query string or no terms could be identified.
	#
	# The global $_ is never assigned to, so calling this from inside
	# a "for (@urls)" loop cannot overwrite the caller's data.

	my $input_url = $_[0];
	return unless defined $input_url;

	chomp($input_url);

	# Break down the $input_url in to two more useful variables.
	# The limit of 2 splits on the FIRST '?' only, so search terms
	# that contain a literal '?' are kept whole.

	my ($location, $query_string) = split(/\?/, $input_url, 2);
	return unless $query_string;

	# Google cache URLs look like q=cache:ID:host/path+term+term.
	# Whatever follows the last '/' of that one parameter is taken.
	# [^&] confines the match to the q= parameter, and the parameter
	# may be the last one in the query string.

	if ($query_string =~ m!q=cache\:[^&]+/([^&]+?)(?:&|$)!i) {
		my $cached_terms = _clean_search_terms($1);
		$cached_terms =~ s/^\s+//;
		$cached_terms =~ s/\s+$//;
		return $cached_terms;
	}

	# There are a number of ways in which we now try and determine
	# what the search terms are. The first: look for any field whose
	# value contains an encoded space ('+' or %2b), unless the field
	# name ends in one of the names excluded by the look-behinds
	# below (next, col, btnG, submit, ...).

	if ($query_string =~ m/(?<!next)(?<!col)(?<!btnG)(?<!submit)
				(?<!rfr)(?<!WILDCARD)(?<!METAENGINE)=
				([^&]*(?:\+|%2b)[^&]*)/xi) {

		my $raw_terms = $1;

		# Some search engines are determined to try and fool us
		# :). Values containing %02 or a doubled pipe (%7C%7C)
		# are pseudo-matches; skip them and fall through to the
		# prefix list. Hex digits in escapes may be either case.

		unless ($raw_terms =~ m/(?:%02|%7C%7C)/i) {
			return _clean_search_terms($raw_terms);
		}

	}

	# Okay. If that failed, then we need to take a closer look.
	# Below are many possible field names that might hold the
	# terms. They are tried in this order because some search
	# engines use more than one of them, and this order hopefully
	# picks the right one first. Names are matched
	# case-insensitively.

	my @prefix_array = (

		'\w*query\w*',   # CNET Search, Netscape
		'\w*search(?!Type)\w*',
		'\w*term\w*',
		'ask',           # Ask Jeeves
		'.\w?key.\w?',
		'palabras',
		'DTqb1',
		'request',
		'ShowMatch',     # syndic8
		'keywords?',     # Snap, overture.com
		'general',       # MetaCrawler, Go2Net
		'key',           # Looksmart
		'MetaTopic',     # AJ
		'query0',        # elf8888.at, thx to http://www.tnl.net/
		'queryString',   # blogdigger.com
		'serachfor',     # mysearch.com dyslexia ;)
		'terms',         # abcsearch.com
		'word',          # baidu.com
		'rn',
		'mt',            # MSN, HotBot
		'qt',            # Go, Infoseek, search.com
		'oq',
		'dom',           # Domainsurfer
		's',             # Excite, blogsphere.us
		'q',             # Altavista, Google, Dogpile, Evreka, Metafind
		'p',             # Yahoo
		't',
		'qry',
		'qkw',           # dpxml, msxml
		'qr',            # northernlight.com
		'qu',
		'kw',            # Sapo
		'general',
		'B1',
		'sc',            # Gohip
		'szukaj',
		'PA',
		'MT',            # goo.ne.jp
		'req',           # dir.com
		'k',             # galaxy.com
		'cat',           # Dmoz
		'u',             # Google translation
		'va',            # search.yahoo.com
		'K',             # srd.yahoo.com
		'as_epq'         # Google, sometimes. Advanced query maybe?

	);

	# Cycle through each prefix and see if it names a field in the
	# query_string. If it does, clean the value and return it,
	# provided it contains at least one word character.

	for my $prefix (@prefix_array) {

		next unless $query_string =~ m/(?:^|&)$prefix=(.+?)(?:&|$)/i;

		my $key_string = _clean_search_terms($1);

		return $key_string if $key_string =~ /\w/;

	}

	# Failing all that, some Search-Engines don't overload the
	# query_string with values, and just make the query_string
	# the search terms. If there is no '=' at all, return the
	# whole query_string (cleaned).

	if ($query_string !~ /\=/) {
		return _clean_search_terms($query_string);
	}

	return;
}

# Private helper for key_extract(): decodes one URL-encoded value.
# Turns '+' into a space, expands %XX escapes, and collapses each run
# of whitespace into a single space. Leading and trailing whitespace
# is left for the caller to deal with. Works on a copy of its argument.

sub _clean_search_terms {

	my $text = $_[0];

	$text =~ tr/+/ /;
	$text =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;
	$text =~ s/\s+/ /g;

	return $text;
}
392
3931;
394
395__END__
396
397=head1 NAME
398
399URI::Sequin - Extract information from the URLs of Search-Engines
400
401=head1 SYNOPSIS
402
403
404	use URI::Sequin qw/se_extract key_extract log_extract %log_types/;
405
406	$url = &log_extract($line_from_log_file, 'NCSA');
407
408	$log_types{'MyLogType'} = '^(.+?) -> .+$';
409	$url = &log_extract($line_from_log_file, 'MyLogType');
410
411	$keyword_string = &key_extract($url);
412
413	($search_engine_name, $search_engine_url) = @{&se_extract($url)};
414
415
416=head1 DESCRIPTION
417
This module provides three tools to aid people trying to analyse
Search-Engine URLs. It's meant mainly for those who want to analyse
referrer logs and pick out key information about site visitors, such as
which Search-Engine and keywords they used to find the site.
422
The functions and globals provided by this module (each can be imported on
request; nothing is exported by default) are:
425
426=over
427
428=item log_extract($log_line, 'Type')
429
430This will pick out the referring URL from a line of a logfile. The 'type' can
431be one of the built in types or can be a user-created one. For more
432information, see %log_types below. This subroutine accepts a scalar, and
433returns a scalar.
434
435=item key_extract($url)
436
437This will try and determine the keywords used in $url. It accepts a scalar
438and returns a scalar. Should nothing be found, it returns an undefined value.
439
440=item se_extract($url)
441
This will try and determine the name of the Search-Engine used and its URL.
It accepts a scalar, and returns a reference to an array containing firstly
the Search-Engine's name and secondly the Search-Engine's URL. Should the URL
appear not to be from a Search Query, it returns a reference to an empty array.
446
447=item %log_types
448
449There are five built-in logfile types already in this hash. They are:
450
451=over 4
452
453=item * IIS1 - Microsoft IIS 3.0 and 2.0
454
455=item * IIS2 - Microsoft IIS4.0 (W3SVC format)
456
457=item * NCSA - For APACHE, NETSCAPE and any other NCSA format logs
458
459=item * ORW - O'Reilly WebSite format
460
461=item * General - A generalised one that will work with most logfiles
462
463=back
464
It's easy to add another one. Simply add a key to the hash, with a value that
is a regex. Parenthesise the part that is the referring URL, as the script
uses $1 to obtain the URL. (see the example in the Synopsis section).
468
469I have only one request for people who use this module. *Please* tell me where
470and how you've used it, and if you have any thoughts or suggestions on it, tell
471me!
472
473=back
474
475=head1 BUGS
476
477Doesn't like the Amnesi Search Engine. But then, neither do I. Also,
478the 'General' log type needs to be used with discretion ... be sure
479that none of the URLs contain literal " if you use it.
480
481=head1 AUTHOR
482
483Peter Sergeant E<lt>pete@grou.chE<gt>
484
485=head1 COPYRIGHT
486
487Copyright 2001 Peter Sergeant.
488
489This program is free software; you can redistribute it and/or modify it under
490the same terms as Perl itself.
491
492=cut
493