1package URI::ParseSearchString;
2
3require Exporter;
4@ISA = (Exporter);
5@EXPORT = ( qw (parse_search_string findEngine se_host se_name se_term) );
6
7use warnings;
8use strict;
9use URI;
10use Data::Dumper;
11
12=encoding utf8
13
14=head1 NAME
15
16URI::ParseSearchString - parse search engine referrer URLs and extract keywords used
17
18=head1 VERSION
19
20Version 3.51  (Diablo 3 edition)
21
22=cut
23
24our $VERSION = '3.51';
25
26=head1 SYNOPSIS
27
28  use URI::ParseSearchString ;
29
30  my $uparse = new URI::ParseSearchString();
31  my $ref    = 'http://www.google.com/search?hl=en&q=a+simple+test&btnG=Google+Search';
32
33  my $query_terms = $uparse->se_term( $ref );
34  my $canonical   = $uparse->se_name( $ref );
35  my $hostname    = $uparse->se_host( $ref );
36
37=head1 FUNCTIONS
38
39=head2 new
40
41  Creates a new instance object of the module.
42
43  my $uparse = new URI::ParseSearchString() ;
44
45=cut
46
## Lookup table of the search engines this module knows about.
##
## Keys are lower-case host names without a leading "www."; _uri() strips
## leading labels from a referrer's host until one of these keys matches.
## Each value is a hash reference with two entries:
##
##   name - canonical engine name, as returned by findEngine() / se_name()
##   q    - name of the query-string parameter that carries the search terms
##
## new() stores a reference to this table (not a copy) in every object, so
## the table is shared by all instances.

my $RH_LOOKUPS = {

    'answers.yahoo.com'     => { name => 'Yahoo Answers', q=>'p' },

    ## Portuguese engines and portals
    'sapo.pt'                => { name => 'Pesquisa SAPO', q => 'q'},
    'iol.pt'                 => { name => 'Pesquisa Iol',  q => 'q'},
    'pesquisa.clix.pt'       => { name => 'Pesquisa Clix', q => 'question'},
    'aeiou.pt'               => { name => 'Aeiou',         q => 'q'},
    'cuil.pt'                => { name => 'Cuil PT',       q => 'q' },


    'fotos.sapo.pt'          => { name => 'SAPO fotos',    q => 'word'},
    'videos.sapo.pt'         => { name => 'SAPO videos',   q => 'word'},
    'sabores.sapo.pt'        => { name => 'SAPO sabores',  q => 'cxSearch'},
    'jn.sapo.pt'             => { name => 'Jornal Noticias', q => 'Pesquisa'},
    'dn.sapo.pt'             => { name => 'Diario Noticias', q => 'Pesquisa'},


    'rtp.pt'                 => { name => 'Rtp',           q => 'search'},
    'record.pt'              => { name => 'Jornal Record', q => 'q'},
    'correiodamanha.pt'      => { name => 'Correio da Manha',        q => 'pesquisa'},
    'correiomanha.pt'        => { name => 'Correio Manha',        q => 'pesquisa'},
    'publico.clix.pt'        => { name => 'Publico',       q => 'q'},
    'xl.pt'                  => { name => 'XL',            q => 'pesquisa'},

    ## General, UK, Italian, Spanish, German and Greek engines
    'abacho.com'             => { name => 'Abacho',        q => 'q'},
    'alice.it'               => { name => 'Alice.it',      q => 'qs' },
    'altavista.com'          => { name => 'Altavista',     q => 'q' },
    'aolsearch.aol.com'      => { name => 'AOL Search',    q => 'query' },
    'as.starware.com'        => { name => 'Starware',      q => 'qry' },
    'blogs.icerocket.com'    => { name => 'IceRocket',     q => 'q' },
    'blogsearch.google.com'  => { name => 'Google Blogsearch', q => 'q' },
    'busca.orange.es'        => { name => 'Orange ES',     q => 'buscar' },
    'buscador.lycos.es'      => { name => 'Lycos ES',      q => 'query' },
    'buscador.terra.es'      => { name => 'Terra ES',      q => 'query' },
    'buscar.ozu.es'          => { name => 'Ozu ES',        q => 'q' },
    'categorico.it'          => { name => 'Categorico IT', q => 'q' },
    'cuil.com'               => { name => 'Cuil',          q => 'q' },
    'clusty.com'             => { name => 'Clusty',        q => 'query' },
    'excite.com'             => { name => 'Excite',        q => 'q' },
    'excite.it'              => { name => 'Excite IT',     q => 'q' },
    'fastweb.it'             => { name => 'Fastweb IT',    q => 'q' },
    'fastbrowsersearch.com'  => { name => 'Fastbrowsersearch', q=> 'q' },
    'godado.com'             => { name => 'Godado',        q => 'key' },
    'godado.it'              => { name => 'Godado (IT)',   q => 'key' },
    'gps.virgin.net'         => { name => 'Virgin Search', q => 'q' },
    'ilmotore.com'           => { name => 'ilMotore',      q => 'query' },
    'ithaki.net'             => { name => 'Ithaki',        q => 'query' },
    'kataweb.it'             => { name => 'Kataweb IT',    q => 'q' },
    'libero.it'              => { name => 'Libero IT',     q => 'query' },
    'lycos.it'               => { name => 'Lycos IT',      q => 'query' },
    'search.aol.co.uk'       => { name => 'AOL UK',        q => 'query' },
    'search.arabia.msn.com'  => { name => 'MSN Arabia',    q => 'q' },
    'search.bbc.co.uk'       => { name => 'BBC Search',    q => 'q' },
    'search.conduit.com'     => { name => 'Conduit',       q => 'q' },
    'search.icq.com'         => { name => 'ICQ dot com',   q => 'q' },
    'search.live.com'        => { name => 'Live.com',      q => 'q' },
    'search.lycos.co.uk'     => { name => 'Lycos UK',      q => 'query' },
    'search.lycos.com'       => { name => 'Lycos',         q => 'query' },
    'search.msn.co.uk'       => { name => 'MSN UK',        q => 'q' },
    'search.msn.com'         => { name => 'MSN',           q => 'q' },
    'search.myway.com'       => { name => 'MyWay',         q => 'searchfor' },
    'search.mywebsearch.com' => { name => 'My Web Search', q => 'searchfor' },
    'search.ntlworld.com'    => { name => 'NTLWorld',      q => 'q' },
    'search.orange.co.uk'    => { name => 'Orange Search', q => 'q' },
    'search.prodigy.msn.com' => { name => 'MSN Prodigy',   q => 'q' },
    'search.sweetim.com'     => { name => 'Sweetim',       q => 'q' },
    'search.virginmedia.com' => { name => 'VirginMedia',   q => 'q' },
    'search.yahoo.co.jp'     => { name => 'Yahoo Japan',   q => 'p' },
    'search.yahoo.com'       => { name => 'Yahoo!',        q => 'p' },
    'search.yahoo.jp'        => { name => 'Yahoo! Japan',  q => 'p' },
    'simpatico.ws'           => { name => 'Simpatico IT',  q => 'query' },
    'soso.com'               => { name => 'Soso',          q => 'w' },
    'suche.fireball.de'      => { name => 'Fireball DE',   q => 'query' },
    'suche.web.de'           => { name => 'Suche DE',      q => 'su' },
    'suche.t-online.de'      => { name => 'T-Online',      q => 'q' },
    'thespider.it'           => { name => 'TheSpider IT',  q => 'q' },
    'uk.altavista.com'       => { name => 'Altavista UK',  q => 'q' },
    'uk.ask.com'             => { name => 'Ask UK',        q => 'q' },
    'uk.search.yahoo.com'    => { name => 'Yahoo! UK',     q => 'p' },
    'alltheweb.com'          => { name => 'AllTheWeb',     q => 'q' },
    'ask.com'                => { name => 'Ask dot com',   q => 'q' },
    'blueyonder.co.uk'       => { name => 'Blueyonder',    q => 'q' },
    'feedster.com'           => { name => 'Feedster',      q => 'q' },

    ## Google country and generic domains
    'google.ad'              => { name => 'Google Andorra',q => 'q' },
    'google.ae'              => { name => 'Google United Arab Emirates', q => 'q' },
    'google.af'              => { name => 'Google Afghanistan',          q => 'q' },
    'google.ag'              => { name => 'Google Antiqua and Barbuda',  q => 'q' },
    'google.am'              => { name => 'Google Armenia',              q => 'q' },
    'google.as'              => { name => 'Google American Samoa',       q => 'q' },
    'google.at'              => { name => 'Google Austria',    q => 'q' },
    'google.az'              => { name => 'Google Azerbaijan', q => 'q' },
    'google.ba'              => { name => 'Google Bosnia and Herzegovina', q => 'q' },
    'google.be'              => { name => 'Google Belgium', q => 'q' },
    'google.bg'              => { name => 'Google Bulgaria',q => 'q' },
    'google.bi'              => { name => 'Google Burundi', q => 'q' },
    'google.biz'             => { name => 'Google dot biz', q => 'q' },
    'google.bo'              => { name => 'Google Bolivia', q => 'q' },
    'google.bs'              => { name => 'Google Bahamas', q => 'q' },
    'google.bz'              => { name => 'Google Belize',  q => 'q' },
    'google.ca'              => { name => 'Google Canada',  q => 'q' },
    'google.cc'              => { name => 'Google Cocos Islands',    q => 'q' },
    'google.cd'              => { name => 'Google Dem Rep of Congo', q => 'q' },
    'google.cg'              => { name => 'Google Rep of Congo',     q => 'q' },
    'google.ch'              => { name => 'Google Switzerland',      q => 'q' },
    'google.ci'              => { name => 'Google Cote dIvoire',     q => 'q' },
    'google.cl'              => { name => 'Google Chile',    q => 'q' },
    'google.cn'              => { name => 'Google China',    q => 'q' },
    'google.co.at'           => { name => 'Google Austria',  q => 'q' },
    'google.co.bi'           => { name => 'Google Burundi',  q => 'q' },
    'google.co.bw'           => { name => 'Google Botswana', q => 'q' },
    'google.co.ci'           => { name => 'Google Ivory Coast',  q => 'q' },
    'google.co.ck'           => { name => 'Google Cook Islands', q => 'q' },
    'google.co.cr'           => { name => 'Google Costa Rica',   q => 'q' },
    'google.co.gg'           => { name => 'Google Guernsey',     q => 'q' },
    'google.co.gl'           => { name => 'Google Greenland',    q => 'q' },
    'google.co.gy'           => { name => 'Google Guyana',       q => 'q' },
    'google.co.hu'           => { name => 'Google Hungary',      q => 'q' },
    'google.co.id'           => { name => 'Google Indonesia',    q => 'q' },
    'google.co.il'           => { name => 'Google Israel',       q => 'q' },
    'google.co.im'           => { name => 'Google Isle of Man',  q => 'q' },
    'google.co.in'           => { name => 'Google India',        q => 'q' },
    'google.co.it'           => { name => 'Google Italy',        q => 'q' },
    'google.co.je'           => { name => 'Google Jersey',       q => 'q' },
    'google.co.jp'           => { name => 'Google Japan',        q => 'q' },
    'google.co.ke'           => { name => 'Google Kenya',        q => 'q' },
    'google.co.kr'           => { name => 'Google South Korea',  q => 'q' },
    'google.co.ls'           => { name => 'Google Lesotho',      q => 'q' },
    'google.co.ma'           => { name => 'Google Morocco',      q => 'q' },
    'google.co.mu'           => { name => 'Google Mauritius',    q => 'q' },
    'google.co.mw'           => { name => 'Google Malawi',       q => 'q' },
    'google.co.nz'           => { name => 'Google New Zeland',   q => 'q' },
    'google.co.pn'           => { name => 'Google Pitcairn Islands',    q => 'q' },
    'google.co.th'           => { name => 'Google Thailand',            q => 'q' },
    'google.co.tt'           => { name => 'Google Trinidad and Tobago', q => 'q' },
    'google.co.ug'           => { name => 'Google Uganda',       q => 'q' },
    'google.co.uk'           => { name => 'Google UK',           q => 'q' },
    'google.co.uz'           => { name => 'Google Uzbekistan',   q => 'q' },
    'google.co.ve'           => { name => 'Google Venezuela',    q => 'q' },
    'google.co.vi'           => { name => 'Google US Virgin Islands', q => 'q' },
    'google.co.za'           => { name => 'Google  South Africa',q => 'q' },
    'google.co.zm'           => { name => 'Google Zambia',       q => 'q' },
    'google.co.zw'           => { name => 'Google Zimbabwe',     q => 'q' },
    'google.com'             => { name => 'Google',              q => 'q' },
    'google.com.af'          => { name => 'Google Afghanistan',  q => 'q' },
    'google.com.ag'          => { name => 'Google Antiqua and Barbuda', q => 'q' },
    'google.com.ai'          => { name => 'Google Anguilla',    q => 'q' },
    'google.com.ar'          => { name => 'Google Argentina',   q => 'q' },
    'google.com.au'          => { name => 'Google Australia',   q => 'q' },
    'google.com.az'          => { name => 'Google Azerbaijan',  q => 'q' },
    'google.com.bd'          => { name => 'Google Bangladesh',  q => 'q' },
    'google.com.bh'          => { name => 'Google Bahrain',     q => 'q' },
    'google.com.bi'          => { name => 'Google Burundi',     q => 'q' },
    'google.com.bn'          => { name => 'Google Brunei Darussalam', q => 'q' },
    'google.com.bo'          => { name => 'Google Bolivia',     q => 'q' },
    'google.com.br'          => { name => 'Google Brazil',      q => 'q' },
    'google.com.bs'          => { name => 'Google Bahamas',     q => 'q' },
    'google.com.bz'          => { name => 'Google Belize',      q => 'q' },
    'google.com.cn'          => { name => 'Google China',       q => 'q' },
    'google.com.co'          => { name => 'Google',             q => 'q' },
    'google.com.cu'          => { name => 'Google Cuba',        q => 'q' },
    'google.com.do'          => { name => 'Google Dominican Rep', q => 'q' },
    'google.com.ec'          => { name => 'Google Ecuador',     q => 'q' },
    'google.com.eg'          => { name => 'Google Egypt',       q => 'q' },
    'google.com.et'          => { name => 'Google Ethiopia',    q => 'q' },
    'google.com.fj'          => { name => 'Google Fiji',        q => 'q' },
    'google.com.ge'          => { name => 'Google Georgia',     q => 'q' },
    'google.com.gh'          => { name => 'Google Ghana',       q => 'q' },
    'google.com.gi'          => { name => 'Google Gibraltar',   q => 'q' },
    'google.com.gl'          => { name => 'Google Greenland',   q => 'q' },
    'google.com.gp'          => { name => 'Google Guadeloupe',  q => 'q' },
    'google.com.gr'          => { name => 'Google Greece',      q => 'q' },
    'google.com.gt'          => { name => 'Google Guatemala',   q => 'q' },
    'google.com.gy'          => { name => 'Google Guyana',      q => 'q' },
    'google.com.hk'          => { name => 'Google Hong Kong',   q => 'q' },
    'google.com.hn'          => { name => 'Google Honduras',    q => 'q' },
    'google.com.hr'          => { name => 'Google Croatia',     q => 'q' },
    'google.com.jm'          => { name => 'Google Jamaica',     q => 'q' },
    'google.com.jo'          => { name => 'Google Jordan',      q => 'q' },
    'google.com.kg'          => { name => 'Google Kyrgyzstan',  q => 'q' },
    'google.com.kh'          => { name => 'Google Cambodia',    q => 'q' },
    'google.com.ki'          => { name => 'Google Kiribati',    q => 'q' },
    'google.com.kz'          => { name => 'Google Kazakhstan',  q => 'q' },
    'google.com.lk'          => { name => 'Google Sri Lanka',   q => 'q' },
    'google.com.lv'          => { name => 'Google Latvia',      q => 'q' },
    'google.com.ly'          => { name => 'Google Libya',       q => 'q' },
    'google.com.mt'          => { name => 'Google Malta',       q => 'q' },
    'google.com.mu'          => { name => 'Google Mauritius',   q => 'q' },
    'google.com.mw'          => { name => 'Google Malawi',      q => 'q' },
    'google.com.mx'          => { name => 'Google Mexico',      q => 'q' },
    'google.com.my'          => { name => 'Google Malaysia',    q => 'q' },
    'google.com.na'          => { name => 'Google Namibia',     q => 'q' },
    'google.com.nf'          => { name => 'Google Norfolk Island', q => 'q' },
    'google.com.ng'          => { name => 'Google Nigeria',        q => 'q' },
    'google.com.ni'          => { name => 'Google Nicaragua',   q => 'q' },
    'google.com.np'          => { name => 'Google Nepal',       q => 'q' },
    'google.com.nr'          => { name => 'Google Nauru',       q => 'q' },
    'google.com.om'          => { name => 'Google Oman',        q => 'q' },
    'google.com.pa'          => { name => 'Google Panama',      q => 'q' },
    'google.com.pe'          => { name => 'Google Peru',        q => 'q' },
    'google.com.ph'          => { name => 'Google Philipines',  q => 'q' },
    'google.com.pk'          => { name => 'Google Pakistan',    q => 'q' },
    'google.com.pl'          => { name => 'Google Poland',      q => 'q' },
    'google.com.pr'          => { name => 'Google Puerto Rico', q => 'q' },
    'google.com.pt'          => { name => 'Google Portugal',    q => 'q' },
    'google.com.py'          => { name => 'Google Paraguay',    q => 'q' },
    'google.com.qa'          => { name => 'Google',             q => 'q' },
    'google.com.ru'          => { name => 'Google Russia',      q => 'q' },
    'google.com.sa'          => { name => 'Google Saudi Arabia',    q => 'q' },
    'google.com.sb'          => { name => 'Google Solomon Islands', q => 'q' },
    'google.com.sc'          => { name => 'Google Seychelles',      q => 'q' },
    'google.com.sg'          => { name => 'Google Singapore',   q => 'q' },
    'google.com.sv'          => { name => 'Google El Savador',  q => 'q' },
    'google.com.tj'          => { name => 'Google Tajikistan',  q => 'q' },
    'google.com.tr'          => { name => 'Google Turkey',      q => 'q' },
    'google.com.tt'          => { name => 'Google Trinidad and Tobago', q => 'q' },
    'google.com.tw'          => { name => 'Google Taiwan',      q => 'q' },
    'google.com.ua'          => { name => 'Google Ukraine',      q => 'q' },
    'google.com.uy'          => { name => 'Google Uruguay',     q => 'q' },
    'google.com.uz'          => { name => 'Google Uzbekistan',  q => 'q' },
    'google.com.ve'          => { name => 'Google Venezuela',   q => 'q' },
    'google.com.vi'          => { name => 'Google US Virgin Islands', q => 'q' },
    'google.com.vn'          => { name => 'Google Vietnam',     q => 'q' },
    'google.com.ws'          => { name => 'Google Samoa',       q => 'q' },
    'google.cz'              => { name => 'Google Czech Rep',   q => 'q' },
    'google.de'              => { name => 'Google Germany',     q => 'q' },
    'google.dj'              => { name => 'Google Djubouti',    q => 'q' },
    'google.dk'              => { name => 'Google Denmark',     q => 'q' },
    'google.dm'              => { name => 'Google Dominica',    q => 'q' },
    'google.ec'              => { name => 'Google Ecuador',     q => 'q' },
    'google.ee'              => { name => 'Google Estonia',     q => 'q' },
    'google.es'              => { name => 'Google Spain',       q => 'q' },
    'google.fi'              => { name => 'Google Finland',     q => 'q' },
    'google.fm'              => { name => 'Google Micronesia',  q => 'q' },
    'google.fr'              => { name => 'Google France',      q => 'q' },
    'google.gd'              => { name => 'Google Grenada',     q => 'q' },
    'google.ge'              => { name => 'Google Georgia',     q => 'q' },
    'google.gf'              => { name => 'Google French Guiana', q => 'q' },
    'google.gg'              => { name => 'Google Guernsey',      q => 'q' },
    'google.gl'              => { name => 'Google Greenland',     q => 'q' },
    'google.gm'              => { name => 'Google Gambia',        q => 'q' },
    'google.gp'              => { name => 'Google Guadeloupe',    q => 'q' },
    'google.gr'              => { name => 'Google Greece',        q => 'q' },
    'google.gy'              => { name => 'Google Guyana',        q => 'q' },
    'google.hk'              => { name => 'Google Hong Kong',     q => 'q' },
    'google.hn'              => { name => 'Google Honduras',      q => 'q' },
    'google.hr'              => { name => 'Google Croatia',       q => 'q' },
    'google.ht'              => { name => 'Google Haiti',         q => 'q' },
    'google.hu'              => { name => 'Google Hungary',       q => 'q' },
    'google.ie'              => { name => 'Google Ireland',       q => 'q' },
    'google.im'              => { name => 'Google Isle of Man',   q => 'q' },
    'google.in'              => { name => 'Google India',         q => 'q' },
    'google.info'            => { name => 'Google dot info',      q => 'q' },
    'google.is'              => { name => 'Google Iceland',       q => 'q' },
    'google.it'              => { name => 'Google Italy',         q => 'q' },
    'google.je'              => { name => 'Google Jersey',        q => 'q' },
    'google.jo'              => { name => 'Google Jordan',        q => 'q' },
    'google.jobs'            => { name => 'Google dot jobs',      q => 'q' },
    'google.jp'              => { name => 'Google Japan',         q => 'q' },
    'google.kg'              => { name => 'Google Kyrgyzstan',    q => 'q' },
    'google.ki'              => { name => 'Google Kiribati',      q => 'q' },
    'google.kz'              => { name => 'Google Kazakhstan',    q => 'q' },
    'google.la'              => { name => 'Google Laos',          q => 'q' },
    'google.li'              => { name => 'Google Liechtenstein', q => 'q' },
    'google.lk'              => { name => 'Google Sri Lanka',     q => 'q' },
    'google.lt'              => { name => 'Google Lithuania',     q => 'q' },
    'google.lu'              => { name => 'Google Luxembourg',    q => 'q' },
    'google.lv'              => { name => 'Google Latvia',        q => 'q' },
    'google.ma'              => { name => 'Google Morocco',       q => 'q' },
    'google.md'              => { name => 'Google Moldova',       q => 'q' },
    'google.mn'              => { name => 'Google Mongolia',      q => 'q' },
    'google.mobi'            => { name => 'Google dot mobi',      q => 'q' },
    'google.ms'              => { name => 'Google Montserrat',    q => 'q' },
    'google.mu'              => { name => 'Google Mauritius',     q => 'q' },
    'google.mv'              => { name => 'Google Maldives',      q => 'q' },
    'google.mw'              => { name => 'Google Malawi',        q => 'q' },
    'google.net'             => { name => 'Google dot net',       q => 'q' },
    'google.nf'              => { name => 'Google Norfolk Island', q => 'q' },
    'google.nl'              => { name => 'Google Netherlands',    q => 'q' },
    'google.no'              => { name => 'Google Norway',        q => 'q' },
    'google.nr'              => { name => 'Google Nauru',         q => 'q' },
    'google.nu'              => { name => 'Google Niue',          q => 'q' },
    'google.off.ai'          => { name => 'Google Anguilla',      q => 'q' },
    'google.ph'              => { name => 'Google Philipines',    q => 'q' },
    'google.pk'              => { name => 'Google Pakistan',      q => 'q' },
    'google.pl'              => { name => 'Google Poland',        q => 'q' },
    'google.pn'              => { name => 'Google Pitcairn Islands', q => 'q' },
    'google.pr'              => { name => 'Google Puerto Rico',   q => 'q' },
    'google.pt'              => { name => 'Google Portugal',      q => 'q' },
    'google.ro'              => { name => 'Google Romania',       q => 'q' },
    'google.ru'              => { name => 'Google Russia',        q => 'q' },
    'google.rw'              => { name => 'Google Rwanda',        q => 'q' },
    'google.sc'              => { name => 'Google Seychelles',    q => 'q' },
    'google.se'              => { name => 'Google Sweden',        q => 'q' },
    'google.sg'              => { name => 'Google Singapore',     q => 'q' },
    'google.sh'              => { name => 'Google Saint Helena',  q => 'q' },
    'google.si'              => { name => 'Google Slovenia',      q => 'q' },
    'google.sk'              => { name => 'Google Slovakia',      q => 'q' },
    'google.sm'              => { name => 'Google San Marino',    q => 'q' },
    'google.sn'              => { name => 'Google Senegal',       q => 'q' },
    'google.sr'              => { name => 'Google Suriname',      q => 'q' },
    'google.st'              => { name => 'Google Sao Tome',      q => 'q' },
    'google.tk'              => { name => 'Google Tokelau',       q => 'q' },
    'google.tm'              => { name => 'Google Turkmenistan',  q => 'q' },
    'google.to'              => { name => 'Google Tonga',        q => 'q' },
    'google.tp'              => { name => 'Google East Timor',   q => 'q' },
    'google.tt'              => { name => 'Google Trinidad and Tobago', q => 'q' },
    'google.tv'              => { name => 'Google Tuvalu', q => 'q' },
    'google.tw'              => { name => 'Google Taiwan', q => 'q' },
    'google.ug'              => { name => 'Google Uganda', q => 'q' },
    'google.us'              => { name => 'Google US',     q => 'q' },
    'google.uz'              => { name => 'Google Uzbekistan',             q => 'q' },
    'google.vg'              => { name => 'Google British Virgin Islands', q => 'q' },
    'google.vn'              => { name => 'Google Vietnam', q => 'q' },
    'google.vu'              => { name => 'Google Vanuatu', q => 'q' },
    'google.ws'              => { name => 'Google Samoa',  q => 'q' },
    'hotbot.com'             => { name => 'HotBot',        q => 'query' },
    'in.gr'                  => { name => 'In GR',         q => 'q' },
    'mamma.com'              => { name => 'Mamma',         q => 'query' },
    'mahalo.com'             => { name => 'Mahalo',        q => 'search' },
    'megasearching.net'      => { name => 'Megasearching', q => 's' },
    'mirago.co.uk'           => { name => 'Mirago UK',     q => 'qry' },
    'netscape.com'           => { name => 'Netscape',      q => 's' },
    'community.paglo.com'    => { name => 'Paglo',         q => 'q' },
    'pathfinder.gr'          => { name => 'Pathfinder GR', q => 'q' },
    'phantis.com'            => { name => 'Phantis GR' ,   q => 'q'},
    'robby.gr'               => { name => 'Robby GR'     , q => 'searchstr' },
    'sproose.com'            => { name => 'Sproose',       q => 'query' },
    'technorati.com'         => { name => 'Technorati',    q => 'q' },
    'tesco.net'              => { name => 'Tesco Search',  q => 'q' },
    'tiscali.co.uk'          => { name => 'Tiscali UK',    q => 'query' },
    'bing.com'               => { name => 'Bing',          q => 'q' },

    ## Brazilian engines
    'acbusca.com'            => { name => 'ACBusca',          q => 'query' },
    'atalhocerto.com.br'     => { name => 'Atalho Certo',     q => 'keyword' },
    'bastaclicar.com.br'     => { name => 'Basta Clicar',     q => 'search' },
    'bemrapido.com.br'       => { name => 'Bem Rapido',       q => 'chave' },
    'br.altavista.com'       => { name => 'AltaVista Brasil', q => 'q' },
    'br.search.yahoo.com'    => { name => 'Yahoo Brazil',     q => 'p' },
    'busca.uol.com.br'       => { name => 'Radar UOL',        q => 'q' },
    'buscaaqui.com.br'       => { name => 'Busca Aqui',       q => 'q' },
    'buscador.terra.com.br'  => { name => 'Terra Busca',      q => 'query' },
    'cade.search.yahoo.com'  => { name => 'Cadê',             q => 'p' },
    'clickgratis.com.br'     => { name => 'Click Gratis',     q => 'query' },
    'entrada.com.br'         => { name => 'Entrada',          q => 'q' },
    'gigabusca.com.br'       => { name => 'Giga Busca',       q => 'what' },
    'internetica.com.br'     => { name => 'Internetica',      q => 'busca' },
    'katatudo.com.br'        => { name => 'KataTudo',         q => 'q' },
    'minasplanet.com.br'     => { name => 'Minas Planet',     q => 'term' },
    'speedybusca.com.br'     => { name => 'SpeedyBusca',      q => 'q' },
    'vaibuscar.com.br'       => { name => 'Vai Busca',        q => 'q' },

    ## Indian engines
    'in.search.yahoo.com'    => { name => 'Yahoo India',      q => 'p'  },
    'rediff.com'             => { name => 'Rediff',           q => 'MT' },
    'guruji.com'             => { name => 'Guruji',           q => 'q'  },

    ## Torrent search sites
    'isohunt.com'            => { name => 'Isohunt',          q => 'ihq' },
    'btjunkie.org'           => { name => 'BT Junkie',        q => 'q' },
    'torrentz.eu'            => { name => 'Torrentz',         q => 'f' },

};
409
## Constructor. Returns a new object blessed into the invoking class whose
## 'engines' slot refers to the module-wide lookup table; the table is
## stored by reference, not copied.
sub new {
    my ($class) = @_;

    return bless { engines => $RH_LOOKUPS }, $class;
}
416
417=head2 parse_search_string
418
This module provides a simple function to parse and extract search engine query strings. It was designed and tested with
Apache referrer logs in mind. It can be used for a wide range of purposes, including tracking down which keywords people use
on popular search engines before they land on a site. Although a number of modules and scripts already exist for this purpose,
the majority of them are outdated and rely on obsolete query strings for each engine.
423
The default function exported is "parse_search_string", which accepts an unquoted referrer string as input and returns the
search engine query contained within. It works with both escaped and unescaped queries; escaped search terms are decoded
before they are returned. The function returns undef in all other cases and on errors.
427
428for example:
429
430   my $ref   = 'http://www.google.com/search?hl=en&q=a+simple+test&btnG=Google+Search';
431   my $terms =
432      $uparse->parse_search_string( $ref );
433
434would return I<'a simple test'>
435
436whereas
437
438   my $ref   = 'http://www.mamma.com/Mamma?utfout=1&qtype=0&query=a+more%21+complex_+search%24&Submit=%C2%A0%C2%A0Search%C2%A0%C2%A0';
439   my $terms =
440      $uparse->parse_search_string( $ref );
441
442would return I<'a more! complex_ search$'>
443
444=cut
445
446=head2 se_term
447
448Same as parse_search_string().
449
450=cut
451
## Alias for parse_search_string(): hands the referrer string straight to
## it and passes its result back. Returns nothing when no string is given.
sub se_term {
    my ($self, $string) = @_;

    if ( !defined $string ) {
        return;
    }

    return $self->parse_search_string($string);
}
458
## Internal method: build a URI object from a referrer string and work out
## which entry of the engine table its host corresponds to.
##
## Argument: the referrer string.
## Returns:  the list ($uri, $host), where $uri is the URI object and $host
##           is the lower-cased host reduced to the first suffix that is a
##           key of $self->{engines}. When no suffix matches, $host is
##           whatever is left once every leading label has been stripped
##           (the last label only), so callers must still check the table.
##           Returns nothing when the string is undefined, is not an http or
##           https URL, or has no host.

sub _uri {
    my ($self, $string) = @_;

    return unless defined $string;

    ## create a new URI object
    ## and return unless it is http or https

    my $uri = URI->new($string);
    return
        unless defined $uri
        && ( ref($uri) eq 'URI::http' || ref($uri) eq 'URI::https' );

    my $host = $uri->host;
    return unless defined($host) && $host;

    ## host names are case-insensitive but the table keys are lower case,
    ## and URI only lower-cases the host in its canonical() form

    $host = lc $host;

    ## feedster and technorati do not follow the usual search patterns,
    ## so we extract the query terms by taking the last element of the
    ## path segments and storing it as a synthetic "q" parameter.
    ## The pattern is anchored on a label boundary so that unrelated
    ## hosts that merely end in the same letters are left alone.

    if ( $host =~ m/(?:\A|\.)(?:feedster|technorati)\.com\z/ ) {
        $uri->query_form( q => ( $uri->path_segments )[-1] );
    }

    ## strip leading labels from the host until it matches
    ## something we already know about, or no dot is left

    while ( !defined $self->{'engines'}{$host} ) {
        my $dot = index( $host, '.' );
        last if $dot < 0;
        $host = substr( $host, $dot + 1 );
    }

    return ( $uri, $host );
}
499
500
## Extract the search terms from a search engine referrer URL.
##
## Argument: the referrer string.
## Returns:  the value of the engine's query parameter as produced by the
##           URI object's query_form(), or undef when the string is
##           undefined, cannot be turned into a URI by _uri(), comes from a
##           host that is not in the engine table, or lacks the parameter.

sub parse_search_string {
    my ($self, $string) = @_;
    return unless defined $string;

    my ( $uri, $host ) = $self->_uri($string);
    return unless defined $uri;

    ## _uri() has already reduced the host to the matching table key (any
    ## "www." prefix included), so no further clean-up is needed here.

    ## Fetch the engine entry first and only then its 'q' slot: chaining
    ## both subscripts in one expression would autovivify an empty entry
    ## for every unknown host in the table shared by all objects.
    my $engine = $self->{'engines'}{$host};
    return unless defined $engine;

    ## find the query parameter the engine uses
    my $q = $engine->{'q'};
    return unless defined $q;

    ## return the string passed to the query parameter
    my %h_query = $uri->query_form;

    return $h_query{$q};
}
521
=head2 findEngine

Returns a list with the hostname of the search engine as the first element and
the canonical name as the second element.

  my $ref = 'http://www.google.com/search?hl=en&q=a+simple+test&btnG=Google+Search';
  my ($hostname, $canonical) = $uparse->findEngine( $ref ) ;

This will return 'google.com' as the search engine hostname and 'Google' as the name.
This function will return I<undef> (an empty list in list context) on error,
which includes URLs that do not belong to a supported search engine.

=cut

sub findEngine {
  my $self    = shift ;
  my $string  = shift ;

  return unless defined($string);

  ## parse the string and reduce its host to a candidate engine key
  my ($uri,$hostname) = $self->_uri( $string );
  return unless defined($uri) && $uri;
  return unless defined($hostname) && $hostname;

  ## _uri hands back whatever remains of the host (e.g. 'com') when no
  ## engine matched, so confirm the entry really exists. Fetching it on
  ## its own also avoids autovivifying an empty entry in the table.
  my $engine = $self->{'engines'}{$hostname};
  return unless defined $engine;

  return ($hostname, $engine->{'name'});
}
551
=head2 se_host

Convenience wrapper around findEngine that returns only the hostname of the
search engine.
This function will return I<undef> on error.

=cut

sub se_host {
  my ( $self, $string ) = @_;

  ## nothing to look up without a referrer string
  return if !defined $string;

  ## findEngine yields ( hostname, canonical name ); only the first is wanted
  my ($engine_host) = $self->findEngine($string);

  return $engine_host;
}
566
=head2 se_name

Convenience wrapper around findEngine that returns only the canonical name of
the search engine.
This function will return I<undef> on error.

=cut

sub se_name {
  my ( $self, $string ) = @_;

  ## nothing to look up without a referrer string
  return if !defined $string;

  ## findEngine yields ( hostname, canonical name ); only the second is wanted
  my ( undef, $engine_name ) = $self->findEngine($string);

  return $engine_name;
}
581
582=head1 SUPPORTED ENGINES
583
584Currently supported search engines include: Sproose, Google Namibia, Google Ivory Coast, Google Oman, Technorati, Google Ecuador,
585Google Norfolk Island, Mahalo, Google UK, Yahoo! UK, Google Micronesia, Google Bahrain, Basta Clicar,
586Giga Busca, Google Greece, Google Belgium, Google Egypt, Google Chile, Godado (IT), Google Australia,
587Google Uruguay, Google India, Google Taiwan, Google Ukraine, Google US, Terra ES,
588Tesco Search, Megasearching, SAPO videos, Google Nepal, Google Israel, Google US Virgin Islands, Google Hungary,
589Google San Marino, Google Croatia, Google dot jobs, Google Panama, Google Malaysia, Internetica, Google Brunei Darussalam,
590Google Denmark, Google Pakistan, Google Solomon Islands, Google dot biz, Google Lesotho, IceRocket, Google Greenland, Fireball DE,
591Rtp, Google Portugal, Google Samoa, Google Kazakhstan, Google Blogsearch, Google Thailand, Google, Google Antiqua and Barbuda,
592Google Germany, Google Moldova, Google Zambia, Google Greece, Google Sri Lanka, Google Ireland, Google Austria,
593Google Peru, Google Guatemala, ICQ dot com, AOL UK, Google Guyana, In GR, Google dot info, MyWay, Pathfinder GR, Google Costa Rica,
594KataTudo, Google Jamaica, Google Vietnam, Google Morocco, Google Gambia, Google Singapore, Google Mauritius, Altavista, Google Afghanistan,
595Google Cote dIvoire, Google Kazakhstan, Google Czech Rep, Phantis GR, Google Bahamas, Google United Arab Emirates, Google East Timor, Ozu ES,
596Google Venezuela, Google Puerto Rico, Google Armenia, Google Croatia, Google Botswana, Google Tuvalu, Ask UK, Google Singapore, Mirago UK,
597Google Greenland, MSN Arabia, Google Nauru, Publico, Robby GR, Minas Planet, Pesquisa Iol, Google Romania, Google South Korea, Google Jersey,
598Netscape, Busca Aqui, Google Bulgaria, Google Uzbekistan, Tiscali UK, Ithaki, Cadê, Lycos IT, Google Suriname, Excite IT, Google Hong Kong,
599Kataweb IT, Google Burundi, Click Gratis, Google Vietnam, MSN, Alice.it, Google Honduras, Google Trinidad and Tobago, Google Uganda, XL,
600Jornal Noticias, Google Cook Islands, Google Japan, Google Ecuador, Google Ghana, Google Guadeloupe, Google Libya, Google Kenya, Fastbrowsersearch,
601Aeiou, Google Niue, Jornal Record, HotBot, Google Honduras, Google Georgia, Google Fiji, Google Philipines, BBC Search, Google, Google Laos,
602Soso, AltaVista Brasil, Lycos UK, SAPO fotos, Ask dot com, Google Netherlands, Google Philipines, Google Trinidad and Tobago, Google Turkey,
603AllTheWeb, Google Japan, Google Argentina, Google Vanuatu, Blueyonder, Google Greenland, Google Samoa, Google Georgia, Google Slovakia,
604Google Sri Lanka, Pesquisa SAPO, Google Latvia, Google Latvia, Correio Manha, Terra Busca, Google El Savador, Google Cambodia,
605Google Mauritius, Google China, AOL Search, Google Tokelau, Google Tonga, Correio da Manha, Radar UOL, Google Jordan, Godado, Google Jordan,
606Google Pitcairn Islands, Categorico IT, Google Morocco, Google Dominican Rep, Google France, Abacho, Google Azerbaijan, Google Andorra, Google Belize,
607Google Paraguay, Simpatico IT, Google Ethiopia, Google Uganda, Google Poland, Google Bolivia, Google Hungary, Google Russia, Diario Noticias,
608Google Puerto Rico, Google Montserrat, Yahoo! Japan, Google Seychelles, Mamma, Google Pitcairn Islands, Google  South Africa, Paglo, Google Malta,
609Google Azerbaijan, Google New Zeland, Google China, Google Norway, Google Bosnia and Herzegovina, Google Indonesia, SpeedyBusca, Entrada, Google Anguilla,
610Google Rep of Congo, Google Dominica, Google Finland, Altavista UK, Google Guyana, MSN UK, Yahoo Answers, Google British Virgin Islands, Google Guadeloupe,
611Google Lithuania, Google Antiqua and Barbuda, Google Bahamas, Google Malawi, MSN Prodigy, Bing, Google Bolivia, Google Djubouti, Google Uzbekistan, Fastweb IT,
612Google Tajikistan, Virgin Search, Google Nigeria, Yahoo Japan, Pesquisa Clix, Google Grenada, Google Haiti, Google American Samoa, Google Pakistan,
613Google Cocos Islands, Google Hong Kong, NTLWorld, ilMotore, Google Belize, Google Guernsey, Google Sweden, Google Anguilla, Google Bangladesh, Google Isle of Man,
614Google Guernsey, Google Kyrgyzstan, Google Dem Rep of Congo, Google Malawi, Orange Search, Google Seychelles, Google Guyana, Google Gibraltar,
615oogle Italy, Google Kiribati, TheSpider IT, Google Nicaragua, Google Russia, Google Venezuela, Google Poland, Google Brazil, Google Senegal, Conduit, Lycos,
616Google Isle of Man, Live.com, Google Italy, Libero IT, Google Canada, Google Nauru, Google Liechtenstein, Google Afghanistan, Cuil, Google Zimbabwe, Google Mauritius,
617Orange ES, Google Burundi, Google Portugal, ACBusca, Bem Rapido, Atalho Certo, Excite, Clusty, Yahoo Brazil, My Web Search, Google Spain, Google Uzbekistan, Google,
618Google Mexico, T-Online, Google dot mobi, Google Luxembourg, Google Austria, Yahoo!, Google Kiribati, Sweetim, Vai Busca, Google Mongolia, Google Saudi Arabia, Google dot net,
619Google Maldives, Google Trinidad and Tobago, Google Jersey, Feedster, Google Turkmenistan, Google Switzerland, Google Norfolk Island, Suche DE, Google Malawi, Google Rwanda,
620Lycos ES, Google Burundi, Google French Guiana, Google Kyrgyzstan, Google Saint Helena, VirginMedia, Google Iceland, SAPO sabores, Google India, Google Cuba,
621Google US Virgin Islands, Google Taiwan, Google Sao Tome, Google Slovenia, Starware, Google Estonia, Conduit, Yahoo India, Rediff, Guruji
622
623=head1 AUTHOR
624
625Spiros Denaxas, C<< <s.denaxas at gmail.com> >>
626
627=head1 SOURCE CODE
628
629The source code can be found on github L<https://github.com/spiros/URI-ParseSearchString>
630
631=head1 BUGS
632
This is my first CPAN module, so I encourage you to send all comments, especially
critical ones, to my email address.
635
This would not have been possible without the support of my co-workers at
http://nestoria.co.uk - the easiest way of finding UK property.
638
639=head1 SUPPORT
640
For more information, you can also visit my blog:

	http://blog.ffffruit.com
648
649=head1 COPYRIGHT & LICENSE
650
651Copyright 2011 Spiros Denaxas, all rights reserved.
652
653This program is free software; you can redistribute it and/or modify it
654under the same terms as Perl itself.
655
656=cut
657
6581; # End of URI::ParseSearchString
659