1# AltaVista.pm 2# by John Heidemann 3# Copyright (C) 1996-1998 by USC/ISI 4# $Id: AltaVista.pm,v 2.905 2008/01/21 02:04:11 Daddy Exp $ 5# 6# Complete copyright notice follows below. 7 8=head1 NAME 9 10WWW::Search::AltaVista - class for searching www.altavista.com 11 12 13=head1 SYNOPSIS 14 15 require WWW::Search; 16 $search = new WWW::Search('AltaVista'); 17 18 19=head1 DESCRIPTION 20 21This class is an AltaVista specialization of WWW::Search. 22It handles making and interpreting AltaVista searches 23F<http://www.altavista.com>. 24 25This class exports no public interface; all interaction should 26be done through WWW::Search objects. 27 28 29=head1 OPTIONS 30 31The default is "any of these words" (OR of query terms). 32 33=over 8 34 35=item aqa=all+of+these+words 36 37Add the AND of these words to the query. 38 39=item aqp=this+exact+phrase 40 41Add "this exact phrase" to the query. 42 43=item aqo=any+of+these+words 44 45Add the OR of these words to the query. 46This is where the query is placed by default. 47 48=item aqn=none+of+these+words 49 50Add NOT these words to the query. 51 52=item aqb=(boolean+AND+expression)+NEAR+entry 53 54Add a boolean expression to the query. 55Operators are AND, OR, AND NOT, and NEAR. 56In the browser interface, the boolean expression can not be combined with any other query types listed above. 57You should probably build the boolean expression with parentheses and spaces and urlescape it. 58 59=item aqs=these+words 60 61Pages containing "these words" will be ranked highest. 62 63=item kgs=[0,1] 64 65To restrict the search to U.S. websites, set kgs=1. 66The default is world-wide, kgs=0. 67 68=item kls=[0,1] 69 70To restrict the search to pages in English and Spanish, set kls=1. 71The default is no language restrictions, kls=0. 72 73=item filetype=[html,pdf] 74 75To restrict the search to HTML pages only, set filetype=html. 76To restrict the search to PDF pages only, set filetype=pdf. 77The default is no restriction on page type, filetype=. 78 79=item rc=dmn&swd=net+org+or.jp 80 81To restrict the search to pages from certain domains, 82set rc=dmn and set swd to a list of desired toplevel domains. 83 84=item rc=url&lh=www.sandcrawler.com/SWB 85 86To restrict the search to pages from a particular site, 87set rc=url and set lh to the site name and path. 88Leave off the http:// from the site. 89 90=back 91 92=head1 PUBLIC METHODS 93 94=cut 95 96##################################################################### 97 98package WWW::Search::AltaVista; 99 100use strict; 101use warnings; 102 103use Carp (); 104use Date::Manip; 105use WWW::Search qw( generic_option strip_tags unescape_query ); 106use WWW::Search::Result; 107 108use base 'WWW::Search'; 109 110our $MAINTAINER = 'Martin Thurn <mthurn@cpan.org>'; 111our 112$VERSION = do { my @r = (q$Revision: 2.905 $ =~ /\d+/g); sprintf "%d."."%03d" x $#r, @r }; 113 114sub _undef_to_emptystring 115 { 116 return defined($_[0]) ? $_[0] : ""; 117 } # _undef_to_emptystring 118 119=head2 gui_query 120 121Call this instead of native_query() if you want to get the same results as your average Joe web surfer. 122 123=cut 124 125sub gui_query 126 { 127 my ($self, $sQuery, $rh) = @_; 128 $self->{'_options'} = { 129 'search_host' => 'http://www.altavista.com', 130 'search_path' => '/web/results', 131 'q' => $sQuery, 132 'kls' => 0, 133 avkw => 'qtrp', 134 }; 135 return $self->native_query($sQuery, $rh); 136 } # gui_query 137 138=head1 PRIVATE METHODS 139 140=head2 native_setup_search 141 142This private method does the heavy lifting after you call native_query() 143or gui_query(). 144 145=cut 146 147sub native_setup_search 148 { 149 my ($self, $native_query, $native_options_ref) = @_; 150 $self->user_agent('user'); 151 $self->{_next_to_retrieve} = 0; 152 if (!defined($self->{_options})) 153 { 154 $self->{_options} = { 155 'pg' => 'aq', 156 'avkw' => 'qtrp', 157 'aqmode' => 's', 158 'aqo' => $native_query, 159 'kgs' => 0, 160 'kls' => 0, 161 # 'dt' => 'dtrange', 162 'rc' => 'dmn', 163 'nbq' => '50', 164 'search_host' => 'http://www.altavista.com', 165 'search_path' => '/web/results', 166 }; 167 if ((my $s = $self->date_from) ne '') 168 { 169 $s = &UnixDate($s, '%m/%d/%y'); 170 $self->{_options}->{d0} = $s; 171 $self->{_options}->{dt} = 'dtrange'; 172 } # if 173 if ((my $s = $self->date_to) ne '') 174 { 175 $s = &UnixDate($s, '%m/%d/%y'); 176 $self->{_options}->{d1} = $s; 177 $self->{_options}->{dt} = 'dtrange'; 178 } # if 179 } # if 180 my($options_ref) = $self->{_options}; 181 if (defined($native_options_ref)) 182 { 183 # Copy in new options. 184 foreach (keys %$native_options_ref) 185 { 186 $options_ref->{$_} = $native_options_ref->{$_}; 187 } # foreach 188 } # if 189 # Process the options. 190 my $options = ''; 191 # For Intranet search to work, mss option must be first: 192 if (exists $options_ref->{'mss'}) 193 { 194 $options .= 'mss=' . $options_ref->{'mss'} . '&'; 195 } # if 196 foreach my $key (keys %$options_ref) 197 { 198 # printf STDERR "option: $_ is " . $options_ref->{$_} . "\n"; 199 next if (generic_option($key)); 200 next if $key eq 'mss'; 201 $options .= $key . '=' . $options_ref->{$key} . '&'; 202 } # foreach 203 chop $options; 204 $self->{_debug} = $options_ref->{'search_debug'}; 205 $self->{_debug} = 2 if ($options_ref->{'search_parse_debug'}); 206 $self->{_debug} = 0 if (!defined($self->{_debug})); 207 208 # Finally figure out the url. 209 $self->{_base_url} = 210 $self->{_next_url} = 211 $self->{_options}{'search_host'} . $self->{_options}{'search_path'} .'?'. $options; 212 # print STDERR $self->{_base_url} . "\n" if ($self->{_debug}); 213 } # native_setup_search 214 215sub _count_pattern 216 { 217 # Pattern for matching result-count in many languages. 218 # Language-specific subclasses might need to override this. 219 return qr{\b(?:found|fand) 220 \s+ 221 ([0-9.,]+) 222 \s+ 223 # This covers English and German: 224 (?:result|headline|Ergebnisse) 225 }x; 226 } # _count_pattern 227 228 229sub _preprocess_results_page 230 { 231 my $self = shift; 232 my $sPage = shift; 233 # return $sPage; 234 # For debugging only. Print the page contents and abort. 235 print STDERR '='x 25, "\n\n", $sPage, "\n\n", '='x 25; 236 exit 88; 237 } # _preprocess_results_page 238 239=head2 parse_tree 240 241This private method does the hard work of parsing the results out of the HTML. 242 243=cut 244 245sub parse_tree 246 { 247 my $self = shift; 248 my $tree = shift; 249 my $iHits = 0; 250 my $iCountSpoof = 0; 251 my $WS = q{[\t\r\n\240\ ]}; 252 # Only try to parse the hit count if we haven't done so already: 253 print STDERR " + start, approx_h_c is ==", $self->approximate_hit_count(), "==\n" if 2 <= $self->{_debug}; 254 if ($self->approximate_hit_count() < 1) 255 { 256 # Sometimes the hit count is inside a <DIV> tag: 257 my @aoDIV = $tree->look_down('_tag' => 'div', 258 'class' => 'xs', 259 ); 260 # Sometimes the hit count is inside a <SPAN> tag: 261 push @aoDIV, $tree->look_down('_tag' => 'span', 262 'class' => 'y', 263 ); 264 my $qrCount = $self->_count_pattern; 265 DIV_TAG: 266 foreach my $oDIV (@aoDIV) 267 { 268 next unless ref $oDIV; 269 print STDERR " + try DIV ==", $oDIV->as_HTML if 2 <= $self->{_debug}; 270 my $s = $oDIV->as_text; 271 print STDERR " + TEXT ==$s==\n" if 2 <= $self->{_debug}; 272 if ($s =~ m!$qrCount!i) 273 { 274 my $iCount = $1 || ''; 275 $iCount =~ tr!.,!!d; 276 $self->approximate_result_count($iCount); 277 print STDERR " + found approx_h_c is ==", $self->approximate_hit_count(), "==\n" if (2 <= $self->{_debug}); 278 last DIV_TAG; 279 } # if 280 } # foreach DIV_TAG 281 } # if 282 # Get the hits: 283 my @aoA = $tree->look_down( 284 '_tag' => 'a', 285 'class' => 'res', 286 ); 287 A_TAG: 288 foreach my $oA (@aoA) 289 { 290 # <a class="res" href="/r?ck_sm=4bf6b336&ci=4939&av_tc=null&q=%7Cvirus+%7Cprotease&rpos=1&rpge=1&rsrc=U&ref=200020080&uid=1da8cd3e47b05cd0&r=http%3A%2F%2Fwww.mcafee.com%2F" onmouseout="status=''; return true;" onmouseover="status='http://www.mcafee.com/'; return true;">McAfee Security - Computer Virus Software and Internet Security For Your PC</a> 291 next unless ref $oA; 292 my $sA = $oA->as_HTML; 293 print STDERR " + found A==$sA==\n" if (2 <= $self->{_debug}); 294 my $sURL = $self->absurl($self->{'_prev_url'}, $oA->attr('href')); 295 print STDERR " + the URL is ==$sURL==\n" if (2 <= $self->{_debug}); 296 # Ignore advertising links: 297 next if ($sURL =~ m!//rc10\.overture\.com!); 298 my $sTitle = $oA->as_text; 299 print STDERR " + the title is ==$sTitle==\n" if (2 <= $self->{_debug}); 300 my $oSPAN = $oA; 301 FIND_SPAN: 302 while (1) 303 { 304 last FIND_SPAN if ! ref $oSPAN; 305 last FIND_SPAN if ($oSPAN->tag eq 'span'); 306 $oSPAN = $oSPAN->right; 307 } # while 308 if (ref $oSPAN) 309 { 310 # $oSPAN now is <span class=s> which contains the description 311 # and the URL: 312 print STDERR " + found SPAN==", $oSPAN->as_HTML, "==\n" if (2 <= $self->{_debug}); 313 my $oHit = new WWW::Search::Result; 314 $oHit->add_url($sURL); 315 $oHit->title($sTitle); 316 $oHit->description(&WWW::Search::strip_tags($oSPAN->as_text)); 317 push(@{$self->{cache}}, $oHit); 318 $self->{'_num_hits'}++; 319 $iHits++; 320 } # if 321 $oA->detach; 322 $oA->delete; 323 } # foreach A_TAG 324 # Find the 'next page' link: 325 @aoA = $tree->look_down('_tag' => 'a', 326 ); 327 NEXT_TAG: 328 foreach my $oA (@aoA) 329 { 330 next NEXT_TAG unless ref $oA; 331 # Multilingual version: 332 next NEXT_TAG unless $oA->as_text =~ m!\s>>\Z!; 333 # English-only version: 334 # next NEXT_TAG unless $oA->as_text eq q{Next >>}; 335 $self->{_next_url} = $self->absurl($self->{'_prev_url'}, $oA->attr('href')); 336 last NEXT_TAG; 337 } # foreach 338 return $iHits; 339 } # parse_tree 340 341 342=head1 BUGS 343 344=over 345 346=item Not all of the above options have been tested. 347 348=item Please report bugs and send feature requests via email to 349C<bug-WWW-Search-AltaVista@rt.cpan.org>, or via the web interface at 350L<https://rt.cpan.org/NoAuth/Bugs.html?Dist=WWW-Search-AltaVista>. 351 352=back 353 354=head1 SEE ALSO 355 356To make new back-ends, see L<WWW::Search>, 357or the specialized AltaVista searches described in options. 358 359=head1 AUTHOR 360 361Written by John Heidemann, C<johnh@isi.edu>; 362maintained by Martin Thurn, C<mthurn@cpan.org>. 363 364=head1 LICENSE 365 366This software is released under the same license as Perl itself. 367 368=head1 COPYRIGHT 369 370Copyright (c) 1996-1998 University of Southern California. 371All rights reserved. 372 373Redistribution and use in source and binary forms are permitted 374provided that the above copyright notice and this paragraph are 375duplicated in all such forms and that any documentation, advertising 376materials, and other materials related to such distribution and use 377acknowledge that the software was developed by the University of 378Southern California, Information Sciences Institute. The name of the 379University may not be used to endorse or promote products derived from 380this software without specific prior written permission. 381 382THIS SOFTWARE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED 383WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF 384MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. 385 386=cut 387 3881; 389 390__END__ 391 392advanced search results: 393 394full URL, as of 2005-12 = http://www.altavista.com/web/results?itag=ody&pg=aq&aqmode=s&aqa=all&aqp=this+phrase&aqo=any&aqn=none&aqb=&kgs=1&kls=0&d2=0&dt=dtrange&dfr%5Bd%5D=1&dfr%5Bm%5D=1&dfr%5By%5D=1980&dto%5Bd%5D=14&dto%5Bm%5D=12&dto%5By%5D=1995&filetype=&rc=dmn&swd=&lh=&nbq=10 395 396http://www.altavista.com/web/results?pg=aq&avkw=qtrp&aqmode=s&aqa=&aqp=&aqo=martin+thurn&aqn=&aqb=&aqs=&kgs=0&kls=0&dt=tmperiod&d2=0&d0=&d1=&filetype=&rc=dmn&swd=&lh=&nbq=50 397 398gui query results: 399http://www.altavista.com/web/results?q=Rhonda+Thurn&kgs=0&kls=0&avkw=qtrp 400