package URI::ParseSearchString;

use strict;
use warnings;

require Exporter;
use URI;
use Data::Dumper;

# Declared with "our" so that the assignments compile under strict. The
# module still inherits from Exporter and exports the same names as before.
our @ISA    = ('Exporter');
our @EXPORT = qw(parse_search_string findEngine se_host se_name se_term);

=encoding utf8

=head1 NAME

URI::ParseSearchString - parse search engine referrer URLs and extract keywords used

=head1 VERSION

Version 3.51 (Diablo 3 edition)

=cut

our $VERSION = '3.51';

=head1 SYNOPSIS

  use URI::ParseSearchString ;

  my $uparse = URI::ParseSearchString->new();
  my $ref = 'http://www.google.com/search?hl=en&q=a+simple+test&btnG=Google+Search';

  my $query_terms = $uparse->se_term( $ref );
  my $canonical   = $uparse->se_name( $ref );
  my $hostname    = $uparse->se_host( $ref );

=head1 FUNCTIONS

=head2 new

Creates a new instance object of the module.

  my $uparse = URI::ParseSearchString->new();

=cut

# Lookup table keyed by referrer host name. Each record holds:
#   name - the canonical engine name reported by findEngine / se_name
#   q    - the query-string parameter that carries the search terms
my $RH_LOOKUPS = {

    'answers.yahoo.com' => { name => 'Yahoo Answers', q => 'p' },

    'sapo.pt'          => { name => 'Pesquisa SAPO', q => 'q' },
    'iol.pt'           => { name => 'Pesquisa Iol',  q => 'q' },
    'pesquisa.clix.pt' => { name => 'Pesquisa Clix', q => 'question' },
    'aeiou.pt'         => { name => 'Aeiou',         q => 'q' },
    'cuil.pt'          => { name => 'Cuil PT',       q => 'q' },

    'fotos.sapo.pt'   => { name => 'SAPO fotos',      q => 'word' },
    'videos.sapo.pt'  => { name => 'SAPO videos',     q => 'word' },
    'sabores.sapo.pt' => { name => 'SAPO sabores',    q => 'cxSearch' },
    'jn.sapo.pt'      => { name => 'Jornal Noticias', q => 'Pesquisa' },
    'dn.sapo.pt'      => { name => 'Diario Noticias', q => 'Pesquisa' },

    'rtp.pt'            => { name => 'Rtp',              q => 'search' },
    'record.pt'         => { name => 'Jornal Record',    q => 'q' },
    'correiodamanha.pt' => { name => 'Correio da Manha', q => 'pesquisa' },
    'correiomanha.pt'   => { name => 'Correio Manha',    q => 'pesquisa' },
    'publico.clix.pt'   => { name => 'Publico',          q => 'q' },
    'xl.pt'             => { name => 'XL',               q => 'pesquisa' },

    'abacho.com'             => { name => 'Abacho',            q => 'q' },
    'alice.it'               => { name => 'Alice.it',          q => 'qs' },
    'altavista.com'          => { name => 'Altavista',         q => 'q' },
    'aolsearch.aol.com'      => { name => 'AOL Search',        q => 'query' },
    'as.starware.com'        => { name => 'Starware',          q => 'qry' },
    'blogs.icerocket.com'    => { name => 'IceRocket',         q => 'q' },
    'blogsearch.google.com'  => { name => 'Google Blogsearch', q => 'q' },
    'busca.orange.es'        => { name => 'Orange ES',         q => 'buscar' },
    'buscador.lycos.es'      => { name => 'Lycos ES',          q => 'query' },
    'buscador.terra.es'      => { name => 'Terra ES',          q => 'query' },
    'buscar.ozu.es'          => { name => 'Ozu ES',            q => 'q' },
    'categorico.it'          => { name => 'Categorico IT',     q => 'q' },
    'cuil.com'               => { name => 'Cuil',              q => 'q' },
    'clusty.com'             => { name => 'Clusty',            q => 'query' },
    'excite.com'             => { name => 'Excite',            q => 'q' },
    'excite.it'              => { name => 'Excite IT',         q => 'q' },
    'fastweb.it'             => { name => 'Fastweb IT',        q => 'q' },
    'fastbrowsersearch.com'  => { name => 'Fastbrowsersearch', q => 'q' },
    'godado.com'             => { name => 'Godado',            q => 'key' },
    'godado.it'              => { name => 'Godado (IT)',       q => 'key' },
    'gps.virgin.net'         => { name => 'Virgin Search',     q => 'q' },
    'ilmotore.com'           => { name => 'ilMotore',          q => 'query' },
    'ithaki.net'             => { name => 'Ithaki',            q => 'query' },
    'kataweb.it'             => { name => 'Kataweb IT',        q => 'q' },
    'libero.it'              => { name => 'Libero IT',         q => 'query' },
    'lycos.it'               => { name => 'Lycos IT',          q => 'query' },
    'search.aol.co.uk'       => { name => 'AOL UK',            q => 'query' },
    'search.arabia.msn.com'  => { name => 'MSN Arabia',        q => 'q' },
    'search.bbc.co.uk'       => { name => 'BBC Search',        q => 'q' },
    'search.conduit.com'     => { name => 'Conduit',           q => 'q' },
    'search.icq.com'         => { name => 'ICQ dot com',       q => 'q' },
    'search.live.com'        => { name => 'Live.com',          q => 'q' },
    'search.lycos.co.uk'     => { name => 'Lycos UK',          q => 'query' },
    'search.lycos.com'       => { name => 'Lycos',             q => 'query' },
    'search.msn.co.uk'       => { name => 'MSN UK',            q => 'q' },
    'search.msn.com'         => { name => 'MSN',               q => 'q' },
    'search.myway.com'       => { name => 'MyWay',             q => 'searchfor' },
    'search.mywebsearch.com' => { name => 'My Web Search',     q => 'searchfor' },
    'search.ntlworld.com'    => { name => 'NTLWorld',          q => 'q' },
    'search.orange.co.uk'    => { name => 'Orange Search',     q => 'q' },
    'search.prodigy.msn.com' => { name => 'MSN Prodigy',       q => 'q' },
    'search.sweetim.com'     => { name => 'Sweetim',           q => 'q' },
    'search.virginmedia.com' => { name => 'VirginMedia',       q => 'q' },
    'search.yahoo.co.jp'     => { name => 'Yahoo Japan',       q => 'p' },
    'search.yahoo.com'       => { name => 'Yahoo!',            q => 'p' },
    'search.yahoo.jp'        => { name => 'Yahoo! Japan',      q => 'p' },
    'simpatico.ws'           => { name => 'Simpatico IT',      q => 'query' },
    'soso.com'               => { name => 'Soso',              q => 'w' },
    'suche.fireball.de'      => { name => 'Fireball DE',       q => 'query' },
    'suche.web.de'           => { name => 'Suche DE',          q => 'su' },
    'suche.t-online.de'      => { name => 'T-Online',          q => 'q' },
    'thespider.it'           => { name => 'TheSpider IT',      q => 'q' },
    'uk.altavista.com'       => { name => 'Altavista UK',      q => 'q' },
    'uk.ask.com'             => { name => 'Ask UK',            q => 'q' },
    'uk.search.yahoo.com'    => { name => 'Yahoo! UK',         q => 'p' },
    'alltheweb.com'          => { name => 'AllTheWeb',         q => 'q' },
    'ask.com'                => { name => 'Ask dot com',       q => 'q' },
    'blueyonder.co.uk'       => { name => 'Blueyonder',        q => 'q' },
    'feedster.com'           => { name => 'Feedster',          q => 'q' },

    'google.ad'     => { name => 'Google Andorra',                q => 'q' },
    'google.ae'     => { name => 'Google United Arab Emirates',   q => 'q' },
    'google.af'     => { name => 'Google Afghanistan',            q => 'q' },
    'google.ag'     => { name => 'Google Antiqua and Barbuda',    q => 'q' },
    'google.am'     => { name => 'Google Armenia',                q => 'q' },
    'google.as'     => { name => 'Google American Samoa',         q => 'q' },
    'google.at'     => { name => 'Google Austria',                q => 'q' },
    'google.az'     => { name => 'Google Azerbaijan',             q => 'q' },
    'google.ba'     => { name => 'Google Bosnia and Herzegovina', q => 'q' },
    'google.be'     => { name => 'Google Belgium',                q => 'q' },
    'google.bg'     => { name => 'Google Bulgaria',               q => 'q' },
    'google.bi'     => { name => 'Google Burundi',                q => 'q' },
    'google.biz'    => { name => 'Google dot biz',                q => 'q' },
    'google.bo'     => { name => 'Google Bolivia',                q => 'q' },
    'google.bs'     => { name => 'Google Bahamas',                q => 'q' },
    'google.bz'     => { name => 'Google Belize',                 q => 'q' },
    'google.ca'     => { name => 'Google Canada',                 q => 'q' },
    'google.cc'     => { name => 'Google Cocos Islands',          q => 'q' },
    'google.cd'     => { name => 'Google Dem Rep of Congo',       q => 'q' },
    'google.cg'     => { name => 'Google Rep of Congo',           q => 'q' },
    'google.ch'     => { name => 'Google Switzerland',            q => 'q' },
    'google.ci'     => { name => 'Google Cote dIvoire',           q => 'q' },
    'google.cl'     => { name => 'Google Chile',                  q => 'q' },
    'google.cn'     => { name => 'Google China',                  q => 'q' },
    'google.co.at'  => { name => 'Google Austria',                q => 'q' },
    'google.co.bi'  => { name => 'Google Burundi',                q => 'q' },
    'google.co.bw'  => { name => 'Google Botswana',               q => 'q' },
    'google.co.ci'  => { name => 'Google Ivory Coast',            q => 'q' },
    'google.co.ck'  => { name => 'Google Cook Islands',           q => 'q' },
    'google.co.cr'  => { name => 'Google Costa Rica',             q => 'q' },
    'google.co.gg'  => { name => 'Google Guernsey',               q => 'q' },
    'google.co.gl'  => { name => 'Google Greenland',              q => 'q' },
    'google.co.gy'  => { name => 'Google Guyana',                 q => 'q' },
    'google.co.hu'  => { name => 'Google Hungary',                q => 'q' },
    'google.co.id'  => { name => 'Google Indonesia',              q => 'q' },
    'google.co.il'  => { name => 'Google Israel',                 q => 'q' },
    'google.co.im'  => { name => 'Google Isle of Man',            q => 'q' },
    'google.co.in'  => { name => 'Google India',                  q => 'q' },
    'google.co.it'  => { name => 'Google Italy',                  q => 'q' },
    'google.co.je'  => { name => 'Google Jersey',                 q => 'q' },
    'google.co.jp'  => { name => 'Google Japan',                  q => 'q' },
    'google.co.ke'  => { name => 'Google Kenya',                  q => 'q' },
    'google.co.kr'  => { name => 'Google South Korea',            q => 'q' },
    'google.co.ls'  => { name => 'Google Lesotho',                q => 'q' },
    'google.co.ma'  => { name => 'Google Morocco',                q => 'q' },
    'google.co.mu'  => { name => 'Google Mauritius',              q => 'q' },
    'google.co.mw'  => { name => 'Google Malawi',                 q => 'q' },
    'google.co.nz'  => { name => 'Google New Zeland',             q => 'q' },
    'google.co.pn'  => { name => 'Google Pitcairn Islands',       q => 'q' },
    'google.co.th'  => { name => 'Google Thailand',               q => 'q' },
    'google.co.tt'  => { name => 'Google Trinidad and Tobago',    q => 'q' },
    'google.co.ug'  => { name => 'Google Uganda',                 q => 'q' },
    'google.co.uk'  => { name => 'Google UK',                     q => 'q' },
    'google.co.uz'  => { name => 'Google Uzbekistan',             q => 'q' },
    'google.co.ve'  => { name => 'Google Venezuela',              q => 'q' },
    'google.co.vi'  => { name => 'Google US Virgin Islands',      q => 'q' },
    'google.co.za'  => { name => 'Google South Africa',           q => 'q' },
    'google.co.zm'  => { name => 'Google Zambia',                 q => 'q' },
    'google.co.zw'  => { name => 'Google Zimbabwe',               q => 'q' },
    'google.com'    => { name => 'Google',                        q => 'q' },
    'google.com.af' => { name => 'Google Afghanistan',            q => 'q' },
    'google.com.ag' => { name => 'Google Antiqua and Barbuda',    q => 'q' },
    'google.com.ai' => { name => 'Google Anguilla',               q => 'q' },
    'google.com.ar' => { name => 'Google Argentina',              q => 'q' },
    'google.com.au' => { name => 'Google Australia',              q => 'q' },
    'google.com.az' => { name => 'Google Azerbaijan',             q => 'q' },
    'google.com.bd' => { name => 'Google Bangladesh',             q => 'q' },
    'google.com.bh' => { name => 'Google Bahrain',                q => 'q' },
    'google.com.bi' => { name => 'Google Burundi',                q => 'q' },
    'google.com.bn' => { name => 'Google Brunei Darussalam',      q => 'q' },
    'google.com.bo' => { name => 'Google Bolivia',                q => 'q' },
    'google.com.br' => { name => 'Google Brazil',                 q => 'q' },
    'google.com.bs' => { name => 'Google Bahamas',                q => 'q' },
    'google.com.bz' => { name => 'Google Belize',                 q => 'q' },
    'google.com.cn' => { name => 'Google China',                  q => 'q' },
    'google.com.co' => { name => 'Google',                        q => 'q' },
    'google.com.cu' => { name => 'Google Cuba',                   q => 'q' },
    'google.com.do' => { name => 'Google Dominican Rep',          q => 'q' },
    'google.com.ec' => { name => 'Google Ecuador',                q => 'q' },
    'google.com.eg' => { name => 'Google Egypt',                  q => 'q' },
    'google.com.et' => { name => 'Google Ethiopia',               q => 'q' },
    'google.com.fj' => { name => 'Google Fiji',                   q => 'q' },
    'google.com.ge' => { name => 'Google Georgia',                q => 'q' },
    'google.com.gh' => { name => 'Google Ghana',                  q => 'q' },
    'google.com.gi' => { name => 'Google Gibraltar',              q => 'q' },
    'google.com.gl' => { name => 'Google Greenland',              q => 'q' },
    'google.com.gp' => { name => 'Google Guadeloupe',             q => 'q' },
    'google.com.gr' => { name => 'Google Greece',                 q => 'q' },
    'google.com.gt' => { name => 'Google Guatemala',              q => 'q' },
    'google.com.gy' => { name => 'Google Guyana',                 q => 'q' },
    'google.com.hk' => { name => 'Google Hong Kong',              q => 'q' },
    'google.com.hn' => { name => 'Google Honduras',               q => 'q' },
    'google.com.hr' => { name => 'Google Croatia',                q => 'q' },
    'google.com.jm' => { name => 'Google Jamaica',                q => 'q' },
    'google.com.jo' => { name => 'Google Jordan',                 q => 'q' },
    'google.com.kg' => { name => 'Google Kyrgyzstan',             q => 'q' },
    'google.com.kh' => { name => 'Google Cambodia',               q => 'q' },
    'google.com.ki' => { name => 'Google Kiribati',               q => 'q' },
    'google.com.kz' => { name => 'Google Kazakhstan',             q => 'q' },
    'google.com.lk' => { name => 'Google Sri Lanka',              q => 'q' },
    'google.com.lv' => { name => 'Google Latvia',                 q => 'q' },
    'google.com.ly' => { name => 'Google Libya',                  q => 'q' },
    'google.com.mt' => { name => 'Google Malta',                  q => 'q' },
    'google.com.mu' => { name => 'Google Mauritius',              q => 'q' },
    'google.com.mw' => { name => 'Google Malawi',                 q => 'q' },
    'google.com.mx' => { name => 'Google Mexico',                 q => 'q' },
    'google.com.my' => { name => 'Google Malaysia',               q => 'q' },
    'google.com.na' => { name => 'Google Namibia',                q => 'q' },
    'google.com.nf' => { name => 'Google Norfolk Island',         q => 'q' },
    'google.com.ng' => { name => 'Google Nigeria',                q => 'q' },
    'google.com.ni' => { name => 'Google Nicaragua',              q => 'q' },
    'google.com.np' => { name => 'Google Nepal',                  q => 'q' },
    'google.com.nr' => { name => 'Google Nauru',                  q => 'q' },
    'google.com.om' => { name => 'Google Oman',                   q => 'q' },
    'google.com.pa' => { name => 'Google Panama',                 q => 'q' },
    'google.com.pe' => { name => 'Google Peru',                   q => 'q' },
    'google.com.ph' => { name => 'Google Philipines',             q => 'q' },
    'google.com.pk' => { name => 'Google Pakistan',               q => 'q' },
    'google.com.pl' => { name => 'Google Poland',                 q => 'q' },
    'google.com.pr' => { name => 'Google Puerto Rico',            q => 'q' },
    'google.com.pt' => { name => 'Google Portugal',               q => 'q' },
    'google.com.py' => { name => 'Google Paraguay',               q => 'q' },
    'google.com.qa' => { name => 'Google',                        q => 'q' },
    'google.com.ru' => { name => 'Google Russia',                 q => 'q' },
    'google.com.sa' => { name => 'Google Saudi Arabia',           q => 'q' },
    'google.com.sb' => { name => 'Google Solomon Islands',        q => 'q' },
    'google.com.sc' => { name => 'Google Seychelles',             q => 'q' },
    'google.com.sg' => { name => 'Google Singapore',              q => 'q' },
    'google.com.sv' => { name => 'Google El Savador',             q => 'q' },
    'google.com.tj' => { name => 'Google Tajikistan',             q => 'q' },
    'google.com.tr' => { name => 'Google Turkey',                 q => 'q' },
    'google.com.tt' => { name => 'Google Trinidad and Tobago',    q => 'q' },
    'google.com.tw' => { name => 'Google Taiwan',                 q => 'q' },
    'google.com.ua' => { name => 'Google Ukraine',                q => 'q' },
    'google.com.uy' => { name => 'Google Uruguay',                q => 'q' },
    'google.com.uz' => { name => 'Google Uzbekistan',             q => 'q' },
    'google.com.ve' => { name => 'Google Venezuela',              q => 'q' },
    'google.com.vi' => { name => 'Google US Virgin Islands',      q => 'q' },
    'google.com.vn' => { name => 'Google Vietnam',                q => 'q' },
    'google.com.ws' => { name => 'Google Samoa',                  q => 'q' },
    'google.cz'     => { name => 'Google Czech Rep',              q => 'q' },
    'google.de'     => { name => 'Google Germany',                q => 'q' },
    'google.dj'     => { name => 'Google Djubouti',               q => 'q' },
    'google.dk'     => { name => 'Google Denmark',                q => 'q' },
    'google.dm'     => { name => 'Google Dominica',               q => 'q' },
    'google.ec'     => { name => 'Google Ecuador',                q => 'q' },
    'google.ee'     => { name => 'Google Estonia',                q => 'q' },
    'google.es'     => { name => 'Google Spain',                  q => 'q' },
    'google.fi'     => { name => 'Google Finland',                q => 'q' },
    'google.fm'     => { name => 'Google Micronesia',             q => 'q' },
    'google.fr'     => { name => 'Google France',                 q => 'q' },
    'google.gd'     => { name => 'Google Grenada',                q => 'q' },
    'google.ge'     => { name => 'Google Georgia',                q => 'q' },
    'google.gf'     => { name => 'Google French Guiana',          q => 'q' },
    'google.gg'     => { name => 'Google Guernsey',               q => 'q' },
    'google.gl'     => { name => 'Google Greenland',              q => 'q' },
    'google.gm'     => { name => 'Google Gambia',                 q => 'q' },
    'google.gp'     => { name => 'Google Guadeloupe',             q => 'q' },
    'google.gr'     => { name => 'Google Greece',                 q => 'q' },
    'google.gy'     => { name => 'Google Guyana',                 q => 'q' },
    'google.hk'     => { name => 'Google Hong Kong',              q => 'q' },
    'google.hn'     => { name => 'Google Honduras',               q => 'q' },
    'google.hr'     => { name => 'Google Croatia',                q => 'q' },
    'google.ht'     => { name => 'Google Haiti',                  q => 'q' },
    'google.hu'     => { name => 'Google Hungary',                q => 'q' },
    'google.ie'     => { name => 'Google Ireland',                q => 'q' },
    'google.im'     => { name => 'Google Isle of Man',            q => 'q' },
    'google.in'     => { name => 'Google India',                  q => 'q' },
    'google.info'   => { name => 'Google dot info',               q => 'q' },
    'google.is'     => { name => 'Google Iceland',                q => 'q' },
    'google.it'     => { name => 'Google Italy',                  q => 'q' },
    'google.je'     => { name => 'Google Jersey',                 q => 'q' },
    'google.jo'     => { name => 'Google Jordan',                 q => 'q' },
    'google.jobs'   => { name => 'Google dot jobs',               q => 'q' },
    'google.jp'     => { name => 'Google Japan',                  q => 'q' },
    'google.kg'     => { name => 'Google Kyrgyzstan',             q => 'q' },
    'google.ki'     => { name => 'Google Kiribati',               q => 'q' },
    'google.kz'     => { name => 'Google Kazakhstan',             q => 'q' },
    'google.la'     => { name => 'Google Laos',                   q => 'q' },
    'google.li'     => { name => 'Google Liechtenstein',          q => 'q' },
    'google.lk'     => { name => 'Google Sri Lanka',              q => 'q' },
    'google.lt'     => { name => 'Google Lithuania',              q => 'q' },
    'google.lu'     => { name => 'Google Luxembourg',             q => 'q' },
    'google.lv'     => { name => 'Google Latvia',                 q => 'q' },
    'google.ma'     => { name => 'Google Morocco',                q => 'q' },
    'google.md'     => { name => 'Google Moldova',                q => 'q' },
    'google.mn'     => { name => 'Google Mongolia',               q => 'q' },
    'google.mobi'   => { name => 'Google dot mobi',               q => 'q' },
    'google.ms'     => { name => 'Google Montserrat',             q => 'q' },
    'google.mu'     => { name => 'Google Mauritius',              q => 'q' },
    'google.mv'     => { name => 'Google Maldives',               q => 'q' },
    'google.mw'     => { name => 'Google Malawi',                 q => 'q' },
    'google.net'    => { name => 'Google dot net',                q => 'q' },
    'google.nf'     => { name => 'Google Norfolk Island',         q => 'q' },
    'google.nl'     => { name => 'Google Netherlands',            q => 'q' },
    'google.no'     => { name => 'Google Norway',                 q => 'q' },
    'google.nr'     => { name => 'Google Nauru',                  q => 'q' },
    'google.nu'     => { name => 'Google Niue',                   q => 'q' },
    'google.off.ai' => { name => 'Google Anguilla',               q => 'q' },
    'google.ph'     => { name => 'Google Philipines',             q => 'q' },
    'google.pk'     => { name => 'Google Pakistan',               q => 'q' },
    'google.pl'     => { name => 'Google Poland',                 q => 'q' },
    'google.pn'     => { name => 'Google Pitcairn Islands',       q => 'q' },
    'google.pr'     => { name => 'Google Puerto Rico',            q => 'q' },
    'google.pt'     => { name => 'Google Portugal',               q => 'q' },
    'google.ro'     => { name => 'Google Romania',                q => 'q' },
    'google.ru'     => { name => 'Google Russia',                 q => 'q' },
    'google.rw'     => { name => 'Google Rwanda',                 q => 'q' },
    'google.sc'     => { name => 'Google Seychelles',             q => 'q' },
    'google.se'     => { name => 'Google Sweden',                 q => 'q' },
    'google.sg'     => { name => 'Google Singapore',              q => 'q' },
    'google.sh'     => { name => 'Google Saint Helena',           q => 'q' },
    'google.si'     => { name => 'Google Slovenia',               q => 'q' },
    'google.sk'     => { name => 'Google Slovakia',               q => 'q' },
    'google.sm'     => { name => 'Google San Marino',             q => 'q' },
    'google.sn'     => { name => 'Google Senegal',                q => 'q' },
    'google.sr'     => { name => 'Google Suriname',               q => 'q' },
    'google.st'     => { name => 'Google Sao Tome',               q => 'q' },
    'google.tk'     => { name => 'Google Tokelau',                q => 'q' },
    'google.tm'     => { name => 'Google Turkmenistan',           q => 'q' },
    'google.to'     => { name => 'Google Tonga',                  q => 'q' },
    'google.tp'     => { name => 'Google East Timor',             q => 'q' },
    'google.tt'     => { name => 'Google Trinidad and Tobago',    q => 'q' },
    'google.tv'     => { name => 'Google Tuvalu',                 q => 'q' },
    'google.tw'     => { name => 'Google Taiwan',                 q => 'q' },
    'google.ug'     => { name => 'Google Uganda',                 q => 'q' },
    'google.us'     => { name => 'Google US',                     q => 'q' },
    'google.uz'     => { name => 'Google Uzbekistan',             q => 'q' },
    'google.vg'     => { name => 'Google British Virgin Islands', q => 'q' },
    'google.vn'     => { name => 'Google Vietnam',                q => 'q' },
    'google.vu'     => { name => 'Google Vanuatu',                q => 'q' },
    'google.ws'     => { name => 'Google Samoa',                  q => 'q' },

    'hotbot.com'          => { name => 'HotBot',        q => 'query' },
    'in.gr'               => { name => 'In GR',         q => 'q' },
    'mamma.com'           => { name => 'Mamma',         q => 'query' },
    'mahalo.com'          => { name => 'Mahalo',        q => 'search' },
    'megasearching.net'   => { name => 'Megasearching', q => 's' },
    'mirago.co.uk'        => { name => 'Mirago UK',     q => 'qry' },
    'netscape.com'        => { name => 'Netscape',      q => 's' },
    'community.paglo.com' => { name => 'Paglo',         q => 'q' },
    'pathfinder.gr'       => { name => 'Pathfinder GR', q => 'q' },
    'phantis.com'         => { name => 'Phantis GR',    q => 'q' },
    'robby.gr'            => { name => 'Robby GR',      q => 'searchstr' },
    'sproose.com'         => { name => 'Sproose',       q => 'query' },
    'technorati.com'      => { name => 'Technorati',    q => 'q' },
    'tesco.net'           => { name => 'Tesco Search',  q => 'q' },
    'tiscali.co.uk'       => { name => 'Tiscali UK',    q => 'query' },
    'bing.com'            => { name => 'Bing',          q => 'q' },

    'acbusca.com'           => { name => 'ACBusca',          q => 'query' },
    'atalhocerto.com.br'    => { name => 'Atalho Certo',     q => 'keyword' },
    'bastaclicar.com.br'    => { name => 'Basta Clicar',     q => 'search' },
    'bemrapido.com.br'      => { name => 'Bem Rapido',       q => 'chave' },
    'br.altavista.com'      => { name => 'AltaVista Brasil', q => 'q' },
    'br.search.yahoo.com'   => { name => 'Yahoo Brazil',     q => 'p' },
    'busca.uol.com.br'      => { name => 'Radar UOL',        q => 'q' },
    'buscaaqui.com.br'      => { name => 'Busca Aqui',       q => 'q' },
    'buscador.terra.com.br' => { name => 'Terra Busca',      q => 'query' },
    'cade.search.yahoo.com' => { name => 'Cadê',             q => 'p' },
    'clickgratis.com.br'    => { name => 'Click Gratis',     q => 'query' },
    'entrada.com.br'        => { name => 'Entrada',          q => 'q' },
    'gigabusca.com.br'      => { name => 'Giga Busca',       q => 'what' },
    'internetica.com.br'    => { name => 'Internetica',      q => 'busca' },
    'katatudo.com.br'       => { name => 'KataTudo',         q => 'q' },
    'minasplanet.com.br'    => { name => 'Minas Planet',     q => 'term' },
    'speedybusca.com.br'    => { name => 'SpeedyBusca',      q => 'q' },
    'vaibuscar.com.br'      => { name => 'Vai Busca',        q => 'q' },

    # 'search.conduit.com' used to be listed a second time here with the
    # same value; the duplicate key has been dropped.
    'in.search.yahoo.com' => { name => 'Yahoo India', q => 'p' },
    'rediff.com'          => { name => 'Rediff',      q => 'MT' },
    'guruji.com'          => { name => 'Guruji',      q => 'q' },

    'isohunt.com'  => { name => 'Isohunt',   q => 'ihq' },
    'btjunkie.org' => { name => 'BT Junkie', q => 'q' },
    'torrentz.eu'  => { name => 'Torrentz',  q => 'f' },

};

# Constructor. Takes no arguments beyond the class name and returns a blessed
# hash whose 'engines' slot points at the lookup table above. The table is
# stored by reference, so every instance shares the same underlying hash.
sub new {
    my ($class) = @_;
    my $self = { engines => $RH_LOOKUPS };
    return bless $self, $class;
}

=head2 parse_search_string

This module provides a simple function to parse and extract search engine query strings. It was designed and tested having
Apache referrer logs in mind. It can be used for a wide number of purposes, including tracking down what keywords people use
on popular search engines before they land on a site. Although a number of modules and scripts already exist for this purpose,
the majority of them are outdated and rely on obsolete search strings for each engine.

The default function exported is "parse_search_string" which accepts an unquoted referrer string as input and returns the
search engine query contained within. It currently works with both escaped and un-escaped queries and will decode the search
terms before returning them in the former case. The function returns undef in all other cases and errors.

for example:

  my $ref = 'http://www.google.com/search?hl=en&q=a+simple+test&btnG=Google+Search';
  my $terms = $uparse->parse_search_string( $ref );

would return I<'a simple test'>

whereas

  my $ref = 'http://www.mamma.com/Mamma?utfout=1&qtype=0&query=a+more%21+complex_+search%24&Submit=%C2%A0%C2%A0Search%C2%A0%C2%A0';
  my $terms = $uparse->parse_search_string( $ref );

would return I<'a more! complex_ search$'>

=cut

=head2 se_term

Same as parse_search_string().

=cut

# Alias for parse_search_string(): returns the search terms found in the
# referrer $string, or nothing when $string is undefined.
sub se_term {
    my ( $self, $string ) = @_;
    return unless defined $string;
    return $self->parse_search_string($string);
}

## internal method for creating a URI object
##
## Returns the list ($uri, $host), or an empty list when $string is
## undefined, is not an http/https URL, or has no host. $host is the
## lower-cased host with leading labels stripped until it matches a key of
## the engines table. When nothing matches it is the last label that
## remained (e.g. 'com'), so callers must still check it against the table.

sub _uri {
    my ( $self, $string ) = @_;

    return unless defined $string;

    ## create a new URI object
    ## and return unless its http or https

    my $uri = URI->new($string);
    return
        unless defined $uri
        && ( ref($uri) eq 'URI::http' || ref($uri) eq 'URI::https' );

    my $host = $uri->host;
    return unless defined $host && length $host;

    ## the lookup keys are all lower case, so compare case-insensitively
    $host = lc $host;

    ## feedster and technorati do not follow the usual search patterns,
    ## so the query terms are taken from the last path segment.
    ## The pattern is anchored on a label boundary so that hosts which
    ## merely end in those names (e.g. notfeedster.com) are left alone.

    if ( $host =~ m{ (?: \A | [.] ) (?: feedster | technorati ) [.] com \z }xms ) {
        my $last_segment = ( $uri->path_segments )[-1];

        ## an empty path has no segments; passing an empty list here would
        ## make query_form() treat 'q' as a delimiter and wipe the query
        $uri->query_form( q => $last_segment ) if defined $last_segment;
    }

    ## clean up the host until it matches
    ## something we already know about

    while ( !defined $self->{'engines'}{$host} ) {
        my $dot = index( $host, '.' );
        last if $dot < 0;
        $host = substr( $host, $dot + 1 );
    }

    return ( $uri, $host );
}

# Returns the search terms contained in the referrer $string, i.e. the value
# of the query parameter the matching engine uses. Returns nothing when the
# string is undefined, is not an http/https URL or belongs to no known
# engine, and undef when the engine's parameter is absent from the URL.
sub parse_search_string {
    my ( $self, $string ) = @_;
    return unless defined $string;

    my ( $uri, $host ) = $self->_uri($string);
    return unless defined $uri;

    ## No separate "www." stripping is needed: _uri() already removes
    ## leading labels until the host matches a known engine. (The old code
    ## had a pattern match here whose result was discarded.)

    ## fetch the record first so that an unknown host does not autovivify
    ## an empty entry in the shared lookup table
    my $engine = $self->{'engines'}{$host};
    return unless defined $engine;

    ## find the query parameter the engine uses
    my $param = $engine->{'q'};
    return unless defined $param;

    ## return the string passed to the query parameter
    my %value_of = $uri->query_form;

    return $value_of{$param};
}

=head2 findEngine

Returns a list with the hostname of the search engine as the first element and
the canonical name as the second element.

  my $ref = 'http://www.google.com/search?hl=en&q=a+simple+test&btnG=Google+Search';
  my ($hostname, $canonical) = $uparse->findEngine( $ref ) ;

This will return 'google.com' as the search engine hostname and 'Google' as the name.
This function will return I<undef> on error, including when the referrer does
not belong to a known search engine.

=cut

# Returns ($hostname, $canonical_name) for the engine matching the referrer
# $string, or an empty list when the string is undefined, unparsable or does
# not match any known engine.
sub findEngine {
    my ( $self, $string ) = @_;

    return unless defined $string;

    ## create a URI object

    my ( $uri, $hostname ) = $self->_uri($string);
    return unless defined $uri      && $uri;
    return unless defined $hostname && $hostname;

    ## Look the record up without autovivifying it. For an unknown host
    ## _uri() hands back whatever label was left (e.g. 'com'), which is not
    ## a search engine and must not be reported as one.
    my $engine = $self->{'engines'}{$hostname};
    return unless defined $engine;

    return ( $hostname, $engine->{'name'} );
}

=head2 se_host

Wrapper around findEngine - returns just the hostname.
This function will return I<undef> on error.

=cut

# Returns only the hostname element of findEngine()'s result.
sub se_host {
    my ( $self, $string ) = @_;
    return unless defined $string;
    my ($host) = $self->findEngine($string);
    return $host;
}

=head2 se_name

Wrapper around findEngine - returns just the canonical name;
This function will return I<undef> on error.

=cut

# Returns only the canonical-name element of findEngine()'s result.
sub se_name {
    my ( $self, $string ) = @_;
    return unless defined $string;
    my ( undef, $name ) = $self->findEngine($string);
    return $name;
}

=head1 SUPPORTED ENGINES

Currently supported search engines include: Sproose, Google Namibia, Google Ivory Coast, Google Oman, Technorati, Google Ecuador,
Google Norfolk Island, Mahalo, Google UK, Yahoo! UK, Google Micronesia, Google Bahrain, Basta Clicar,
Giga Busca, Google Greece, Google Belgium, Google Egypt, Google Chile, Godado (IT), Google Australia,
Google Uruguay, Google India, Google Taiwan, Google Ukraine, Google US, Terra ES,
Tesco Search, Megasearching, SAPO videos, Google Nepal, Google Israel, Google US Virgin Islands, Google Hungary,
Google San Marino, Google Croatia, Google dot jobs, Google Panama, Google Malaysia, Internetica, Google Brunei Darussalam,
Google Denmark, Google Pakistan, Google Solomon Islands, Google dot biz, Google Lesotho, IceRocket, Google Greenland, Fireball DE,
Rtp, Google Portugal, Google Samoa, Google Kazakhstan, Google Blogsearch, Google Thailand, Google, Google Antiqua and Barbuda,
Google Germany, Google Moldova, Google Zambia, Google Greece, Google Sri Lanka, Google Ireland, Google Austria,
Google Peru, Google Guatemala, ICQ dot com, AOL UK, Google Guyana, In GR, Google dot info, MyWay, Pathfinder GR, Google Costa Rica,
KataTudo, Google Jamaica, Google Vietnam, Google Morocco, Google Gambia, Google Singapore, Google Mauritius, Altavista, Google Afghanistan,
Google Cote dIvoire, Google Kazakhstan, Google Czech Rep, Phantis GR, Google Bahamas, Google United Arab
Emirates, Google East Timor, Ozu ES, 596Google Venezuela, Google Puerto Rico, Google Armenia, Google Croatia, Google Botswana, Google Tuvalu, Ask UK, Google Singapore, Mirago UK, 597Google Greenland, MSN Arabia, Google Nauru, Publico, Robby GR, Minas Planet, Pesquisa Iol, Google Romania, Google South Korea, Google Jersey, 598Netscape, Busca Aqui, Google Bulgaria, Google Uzbekistan, Tiscali UK, Ithaki, Cadê, Lycos IT, Google Suriname, Excite IT, Google Hong Kong, 599Kataweb IT, Google Burundi, Click Gratis, Google Vietnam, MSN, Alice.it, Google Honduras, Google Trinidad and Tobago, Google Uganda, XL, 600Jornal Noticias, Google Cook Islands, Google Japan, Google Ecuador, Google Ghana, Google Guadeloupe, Google Libya, Google Kenya, Fastbrowsersearch, 601Aeiou, Google Niue, Jornal Record, HotBot, Google Honduras, Google Georgia, Google Fiji, Google Philipines, BBC Search, Google, Google Laos, 602Soso, AltaVista Brasil, Lycos UK, SAPO fotos, Ask dot com, Google Netherlands, Google Philipines, Google Trinidad and Tobago, Google Turkey, 603AllTheWeb, Google Japan, Google Argentina, Google Vanuatu, Blueyonder, Google Greenland, Google Samoa, Google Georgia, Google Slovakia, 604Google Sri Lanka, Pesquisa SAPO, Google Latvia, Google Latvia, Correio Manha, Terra Busca, Google El Savador, Google Cambodia, 605Google Mauritius, Google China, AOL Search, Google Tokelau, Google Tonga, Correio da Manha, Radar UOL, Google Jordan, Godado, Google Jordan, 606Google Pitcairn Islands, Categorico IT, Google Morocco, Google Dominican Rep, Google France, Abacho, Google Azerbaijan, Google Andorra, Google Belize, 607Google Paraguay, Simpatico IT, Google Ethiopia, Google Uganda, Google Poland, Google Bolivia, Google Hungary, Google Russia, Diario Noticias, 608Google Puerto Rico, Google Montserrat, Yahoo! 
Japan, Google Seychelles, Mamma, Google Pitcairn Islands, Google South Africa, Paglo, Google Malta,
Google Azerbaijan, Google New Zeland, Google China, Google Norway, Google Bosnia and Herzegovina, Google Indonesia, SpeedyBusca, Entrada, Google Anguilla,
Google Rep of Congo, Google Dominica, Google Finland, Altavista UK, Google Guyana, MSN UK, Yahoo Answers, Google British Virgin Islands, Google Guadeloupe,
Google Lithuania, Google Antiqua and Barbuda, Google Bahamas, Google Malawi, MSN Prodigy, Bing, Google Bolivia, Google Djubouti, Google Uzbekistan, Fastweb IT,
Google Tajikistan, Virgin Search, Google Nigeria, Yahoo Japan, Pesquisa Clix, Google Grenada, Google Haiti, Google American Samoa, Google Pakistan,
Google Cocos Islands, Google Hong Kong, NTLWorld, ilMotore, Google Belize, Google Guernsey, Google Sweden, Google Anguilla, Google Bangladesh, Google Isle of Man,
Google Guernsey, Google Kyrgyzstan, Google Dem Rep of Congo, Google Malawi, Orange Search, Google Seychelles, Google Guyana, Google Gibraltar,
Google Italy, Google Kiribati, TheSpider IT, Google Nicaragua, Google Russia, Google Venezuela, Google Poland, Google Brazil, Google Senegal, Conduit, Lycos,
Google Isle of Man, Live.com, Google Italy, Libero IT, Google Canada, Google Nauru, Google Liechtenstein, Google Afghanistan, Cuil, Google Zimbabwe, Google Mauritius,
Orange ES, Google Burundi, Google Portugal, ACBusca, Bem Rapido, Atalho Certo, Excite, Clusty, Yahoo Brazil, My Web Search, Google Spain, Google Uzbekistan, Google,
Google Mexico, T-Online, Google dot mobi, Google Luxembourg, Google Austria, Yahoo!, Google Kiribati, Sweetim, Vai Busca, Google Mongolia, Google Saudi Arabia, Google dot net,
Google Maldives, Google Trinidad and Tobago, Google Jersey, Feedster, Google Turkmenistan, Google Switzerland, Google Norfolk Island, Suche DE, Google Malawi, Google Rwanda,
Lycos ES, Google Burundi, Google French Guiana, Google Kyrgyzstan, Google Saint Helena,
VirginMedia, Google Iceland, SAPO sabores, Google India, Google Cuba, 621Google US Virgin Islands, Google Taiwan, Google Sao Tome, Google Slovenia, Starware, Google Estonia, Conduit, Yahoo India, Rediff, Guruji 622 623=head1 AUTHOR 624 625Spiros Denaxas, C<< <s.denaxas at gmail.com> >> 626 627=head1 SOURCE CODE 628 629The source code can be found on github L<https://github.com/spiros/URI-ParseSearchString> 630 631=head1 BUGS 632 633This is my first CPAN module so I encourage you to send all comments, especially bad, 634to my email address. 635 636This could not have been possible without the support of my co-workers at 637http://nestoria.co.uk - the easiest way of finding UK property. 638 639=head1 SUPPORT 640 641For more information, you could also visit my blog: 642 643 http://blog.ffffruit.com 644 645=over 4 646 647=back 648 649=head1 COPYRIGHT & LICENSE 650 651Copyright 2011 Spiros Denaxas, all rights reserved. 652 653This program is free software; you can redistribute it and/or modify it 654under the same terms as Perl itself. 655 656=cut 657 6581; # End of URI::ParseSearchString 659