1# AltaVista.pm
2# by John Heidemann
3# Copyright (C) 1996-1998 by USC/ISI
4# $Id: AltaVista.pm,v 2.905 2008/01/21 02:04:11 Daddy Exp $
5#
6# Complete copyright notice follows below.
7
8=head1 NAME
9
10WWW::Search::AltaVista - class for searching www.altavista.com
11
12
13=head1 SYNOPSIS
14
15    require WWW::Search;
16    $search = new WWW::Search('AltaVista');
17
18
19=head1 DESCRIPTION
20
21This class is an AltaVista specialization of WWW::Search.
22It handles making and interpreting AltaVista searches
23F<http://www.altavista.com>.
24
25This class exports no public interface; all interaction should
26be done through WWW::Search objects.
27
28
29=head1 OPTIONS
30
31The default is "any of these words" (OR of query terms).
32
33=over 8
34
35=item aqa=all+of+these+words
36
37Add the AND of these words to the query.
38
39=item aqp=this+exact+phrase
40
41Add "this exact phrase" to the query.
42
43=item aqo=any+of+these+words
44
45Add the OR of these words to the query.
46This is where the query is placed by default.
47
48=item aqn=none+of+these+words
49
50Add NOT these words to the query.
51
52=item aqb=(boolean+AND+expression)+NEAR+entry
53
54Add a boolean expression to the query.
55Operators are AND, OR, AND NOT, and NEAR.
56In the browser interface, the boolean expression can not be combined with any other query types listed above.
57You should probably build the boolean expression with parentheses and spaces and urlescape it.
58
59=item aqs=these+words
60
61Pages containing "these words" will be ranked highest.
62
63=item kgs=[0,1]
64
65To restrict the search to U.S. websites, set kgs=1.
66The default is world-wide, kgs=0.
67
68=item kls=[0,1]
69
70To restrict the search to pages in English and Spanish, set kls=1.
71The default is no language restrictions, kls=0.
72
73=item filetype=[html,pdf]
74
75To restrict the search to HTML pages only, set filetype=html.
76To restrict the search to PDF pages only, set filetype=pdf.
77The default is no restriction on page type, filetype=.
78
79=item rc=dmn&swd=net+org+or.jp
80
81To restrict the search to pages from certain domains,
82set rc=dmn and set swd to a list of desired toplevel domains.
83
84=item rc=url&lh=www.sandcrawler.com/SWB
85
86To restrict the search to pages from a particular site,
87set rc=url and set lh to the site name and path.
88Leave off the http:// from the site.
89
90=back
91
92=head1 PUBLIC METHODS
93
94=cut
95
96#####################################################################
97
98package WWW::Search::AltaVista;
99
100use strict;
101use warnings;
102
103use Carp ();
104use Date::Manip;
105use WWW::Search qw( generic_option strip_tags unescape_query );
106use WWW::Search::Result;
107
108use base 'WWW::Search';
109
110our $MAINTAINER = 'Martin Thurn <mthurn@cpan.org>';
111our
112$VERSION = do { my @r = (q$Revision: 2.905 $ =~ /\d+/g); sprintf "%d."."%03d" x $#r, @r };
113
114sub _undef_to_emptystring
115  {
116  return defined($_[0]) ? $_[0] : "";
117  } # _undef_to_emptystring
118
119=head2 gui_query
120
121Call this instead of native_query() if you want to get the same results as your average Joe web surfer.
122
123=cut
124
125sub gui_query
126  {
127  my ($self, $sQuery, $rh) = @_;
128  $self->{'_options'} = {
129                         'search_host' => 'http://www.altavista.com',
130                         'search_path' => '/web/results',
131                         'q' => $sQuery,
132                         'kls' => 0,
133                         avkw => 'qtrp',
134                        };
135  return $self->native_query($sQuery, $rh);
136  } # gui_query
137
138=head1 PRIVATE METHODS
139
140=head2 native_setup_search
141
142This private method does the heavy lifting after you call native_query()
143or gui_query().
144
145=cut
146
147sub native_setup_search
148  {
149  my ($self, $native_query, $native_options_ref) = @_;
150  $self->user_agent('user');
151  $self->{_next_to_retrieve} = 0;
152  if (!defined($self->{_options}))
153    {
154    $self->{_options} = {
155                         'pg' => 'aq',
156                         'avkw' => 'qtrp',
157                         'aqmode' => 's',
158                         'aqo' => $native_query,
159                         'kgs' => 0,
160                         'kls' => 0,
161                         # 'dt' => 'dtrange',
162                         'rc' => 'dmn',
163                         'nbq' => '50',
164                         'search_host' => 'http://www.altavista.com',
165                         'search_path' => '/web/results',
166                        };
167    if ((my $s = $self->date_from) ne '')
168      {
169      $s = &UnixDate($s, '%m/%d/%y');
170      $self->{_options}->{d0} = $s;
171      $self->{_options}->{dt} = 'dtrange';
172      } # if
173    if ((my $s = $self->date_to) ne '')
174      {
175      $s = &UnixDate($s, '%m/%d/%y');
176      $self->{_options}->{d1} = $s;
177      $self->{_options}->{dt} = 'dtrange';
178      } # if
179    } # if
180  my($options_ref) = $self->{_options};
181  if (defined($native_options_ref))
182    {
183    # Copy in new options.
184    foreach (keys %$native_options_ref)
185      {
186      $options_ref->{$_} = $native_options_ref->{$_};
187      } # foreach
188    } # if
189  # Process the options.
190  my $options = '';
191  # For Intranet search to work, mss option must be first:
192  if (exists $options_ref->{'mss'})
193    {
194    $options .= 'mss=' . $options_ref->{'mss'} . '&';
195    } # if
196  foreach my $key (keys %$options_ref)
197    {
198    # printf STDERR "option: $_ is " . $options_ref->{$_} . "\n";
199    next if (generic_option($key));
200    next if $key eq 'mss';
201    $options .= $key . '=' . $options_ref->{$key} . '&';
202    } # foreach
203  chop $options;
204  $self->{_debug} = $options_ref->{'search_debug'};
205  $self->{_debug} = 2 if ($options_ref->{'search_parse_debug'});
206  $self->{_debug} = 0 if (!defined($self->{_debug}));
207
208  # Finally figure out the url.
209  $self->{_base_url} =
210  $self->{_next_url} =
211  $self->{_options}{'search_host'} . $self->{_options}{'search_path'} .'?'. $options;
212  # print STDERR $self->{_base_url} . "\n" if ($self->{_debug});
213  } # native_setup_search
214
215sub _count_pattern
216  {
217  # Pattern for matching result-count in many languages.
218  # Language-specific subclasses might need to override this.
219  return qr{\b(?:found|fand)
220            \s+
221            ([0-9.,]+)
222            \s+
223            # This covers English and German:
224            (?:result|headline|Ergebnisse)
225            }x;
226  } # _count_pattern
227
228
229sub _preprocess_results_page
230  {
231  my $self = shift;
232  my $sPage = shift;
233  # return $sPage;
234  # For debugging only.  Print the page contents and abort.
235  print STDERR '='x 25, "\n\n", $sPage, "\n\n", '='x 25;
236  exit 88;
237  } # _preprocess_results_page
238
239=head2 parse_tree
240
241This private method does the hard work of parsing the results out of the HTML.
242
243=cut
244
245sub parse_tree
246  {
247  my $self = shift;
248  my $tree = shift;
249  my $iHits = 0;
250  my $iCountSpoof = 0;
251  my $WS = q{[\t\r\n\240\ ]};
252  # Only try to parse the hit count if we haven't done so already:
253  print STDERR " + start, approx_h_c is ==", $self->approximate_hit_count(), "==\n" if 2 <= $self->{_debug};
254  if ($self->approximate_hit_count() < 1)
255    {
256    # Sometimes the hit count is inside a <DIV> tag:
257    my @aoDIV = $tree->look_down('_tag' => 'div',
258                                  'class' => 'xs',
259                                 );
260    # Sometimes the hit count is inside a <SPAN> tag:
261    push @aoDIV, $tree->look_down('_tag' => 'span',
262                                  'class' => 'y',
263                                 );
264    my $qrCount = $self->_count_pattern;
265 DIV_TAG:
266    foreach my $oDIV (@aoDIV)
267      {
268      next unless ref $oDIV;
269      print STDERR " + try DIV ==", $oDIV->as_HTML if 2 <= $self->{_debug};
270      my $s = $oDIV->as_text;
271      print STDERR " +   TEXT ==$s==\n" if 2 <= $self->{_debug};
272      if ($s =~ m!$qrCount!i)
273        {
274        my $iCount = $1 || '';
275        $iCount =~ tr!.,!!d;
276        $self->approximate_result_count($iCount);
277        print STDERR " + found approx_h_c is ==", $self->approximate_hit_count(), "==\n" if (2 <= $self->{_debug});
278        last DIV_TAG;
279        } # if
280      } # foreach DIV_TAG
281    } # if
282  # Get the hits:
283  my @aoA = $tree->look_down(
284                             '_tag' => 'a',
285                             'class' => 'res',
286                            );
287 A_TAG:
288  foreach my $oA (@aoA)
289    {
290    # <a class="res" href="/r?ck_sm=4bf6b336&amp;ci=4939&amp;av_tc=null&amp;q=%7Cvirus+%7Cprotease&amp;rpos=1&amp;rpge=1&amp;rsrc=U&amp;ref=200020080&amp;uid=1da8cd3e47b05cd0&amp;r=http%3A%2F%2Fwww.mcafee.com%2F" onmouseout="status=''; return true;" onmouseover="status='http://www.mcafee.com/'; return true;">McAfee Security - Computer Virus Software and Internet Security For Your PC</a>
291    next unless ref $oA;
292    my $sA = $oA->as_HTML;
293    print STDERR " +   found A==$sA==\n" if (2 <= $self->{_debug});
294    my $sURL = $self->absurl($self->{'_prev_url'}, $oA->attr('href'));
295    print STDERR " +     the URL   is ==$sURL==\n" if (2 <= $self->{_debug});
296    # Ignore advertising links:
297    next if ($sURL =~ m!//rc10\.overture\.com!);
298    my $sTitle = $oA->as_text;
299    print STDERR " +     the title is ==$sTitle==\n" if (2 <= $self->{_debug});
300    my $oSPAN = $oA;
301 FIND_SPAN:
302    while (1)
303      {
304      last FIND_SPAN if ! ref $oSPAN;
305      last FIND_SPAN if ($oSPAN->tag eq 'span');
306      $oSPAN = $oSPAN->right;
307      } # while
308    if (ref $oSPAN)
309      {
310      # $oSPAN now is <span class=s> which contains the description
311      # and the URL:
312      print STDERR " +     found SPAN==", $oSPAN->as_HTML, "==\n" if (2 <= $self->{_debug});
313      my $oHit = new WWW::Search::Result;
314      $oHit->add_url($sURL);
315      $oHit->title($sTitle);
316      $oHit->description(&WWW::Search::strip_tags($oSPAN->as_text));
317      push(@{$self->{cache}}, $oHit);
318      $self->{'_num_hits'}++;
319      $iHits++;
320      } # if
321    $oA->detach;
322    $oA->delete;
323    } # foreach A_TAG
324  # Find the 'next page' link:
325  @aoA = $tree->look_down('_tag' => 'a',
326                         );
327 NEXT_TAG:
328  foreach my $oA (@aoA)
329    {
330    next NEXT_TAG unless ref $oA;
331    # Multilingual version:
332    next NEXT_TAG unless $oA->as_text =~ m!\s>>\Z!;
333    # English-only version:
334    # next NEXT_TAG unless $oA->as_text eq q{Next >>};
335    $self->{_next_url} = $self->absurl($self->{'_prev_url'}, $oA->attr('href'));
336    last NEXT_TAG;
337    } # foreach
338  return $iHits;
339  } # parse_tree
340
341
342=head1 BUGS
343
344=over
345
346=item Not all of the above options have been tested.
347
348=item Please report bugs and send feature requests via email to
349C<bug-WWW-Search-AltaVista@rt.cpan.org>, or via the web interface at
350L<https://rt.cpan.org/NoAuth/Bugs.html?Dist=WWW-Search-AltaVista>.
351
352=back
353
354=head1 SEE ALSO
355
356To make new back-ends, see L<WWW::Search>,
357or the specialized AltaVista searches described in options.
358
359=head1 AUTHOR
360
361Written by John Heidemann, C<johnh@isi.edu>;
362maintained by Martin Thurn, C<mthurn@cpan.org>.
363
364=head1 LICENSE
365
366This software is released under the same license as Perl itself.
367
368=head1 COPYRIGHT
369
370Copyright (c) 1996-1998 University of Southern California.
371All rights reserved.
372
373Redistribution and use in source and binary forms are permitted
374provided that the above copyright notice and this paragraph are
375duplicated in all such forms and that any documentation, advertising
376materials, and other materials related to such distribution and use
377acknowledge that the software was developed by the University of
378Southern California, Information Sciences Institute.  The name of the
379University may not be used to endorse or promote products derived from
380this software without specific prior written permission.
381
382THIS SOFTWARE IS PROVIDED "AS IS" AND WITHOUT ANY EXPRESS OR IMPLIED
383WARRANTIES, INCLUDING, WITHOUT LIMITATION, THE IMPLIED WARRANTIES OF
384MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
385
386=cut
387
3881;
389
390__END__
391
392advanced search results:
393
394full URL, as of 2005-12 = http://www.altavista.com/web/results?itag=ody&pg=aq&aqmode=s&aqa=all&aqp=this+phrase&aqo=any&aqn=none&aqb=&kgs=1&kls=0&d2=0&dt=dtrange&dfr%5Bd%5D=1&dfr%5Bm%5D=1&dfr%5By%5D=1980&dto%5Bd%5D=14&dto%5Bm%5D=12&dto%5By%5D=1995&filetype=&rc=dmn&swd=&lh=&nbq=10
395
396http://www.altavista.com/web/results?pg=aq&avkw=qtrp&aqmode=s&aqa=&aqp=&aqo=martin+thurn&aqn=&aqb=&aqs=&kgs=0&kls=0&dt=tmperiod&d2=0&d0=&d1=&filetype=&rc=dmn&swd=&lh=&nbq=50
397
398gui query results:
399http://www.altavista.com/web/results?q=Rhonda+Thurn&kgs=0&kls=0&avkw=qtrp
400