1
2#############################################################################
3## $Id: WaybackMachine.pm 6702 2006-07-25 01:43:27Z spadkins $
4#############################################################################
5
6use strict;
7
8package WWW::WebArchive::WaybackMachine;
9
10use WWW::WebArchive::Agent;
11
12use vars qw($VERSION @ISA);
13$VERSION = "0.50";
14@ISA = ("WWW::WebArchive::Agent");
15
16use WWW::Mechanize;
17
18sub restore {
19    &App::sub_entry if ($App::trace);
20    my ($self, $options) = @_;
21
22    my $dir = $options->{dir};
23    $dir = $self->{dir} if (!defined $dir);
24    $dir = "." if (!defined $dir);
25    $dir .= "/$self->{name}";
26
27    my $url = $options->{url} || die "restore(): URL not provided";
28    $url =~ s!/$!!;
29    if ($url !~ /^[a-z]+:/) {
30        $url = "http://$url";
31    }
32    my $domain = $url;
33    $domain =~ s!^[a-z]+://!!;
34    $domain =~ s!/.*!!;
35    my $seclvl_domain = $domain;
36    if ($seclvl_domain =~ /([^\.]+\.[^\.]+)$/) {
37        $seclvl_domain = $1;
38    }
39
40    my $verbose = $options->{verbose};
41    $verbose = $self->{verbose} if (!defined $verbose);
42    $verbose = 0 if (!defined $verbose);
43
44    ###################################################################
45    # Initialize User Agent
46    ###################################################################
47    my $ua = WWW::Mechanize->new();
48    $ua->agent_alias("Windows IE 6");
49    $ua->stack_depth(1);    # limit the number of pages we remember to 1 (one back() allowed)
50
51    ###################################################################
52    # Search Internet Archive Wayback Machine for cached documents
53    ###################################################################
54    my (%link, @links, $link);
55    my ($done, $next_url, $link_text, $link_url);
56    my ($link_text2, $link_url2);
57    $done = 0;
58    print "Restoring [$url]\n" if ($verbose);
59    $ua->get("http://web.archive.org/web/*sr_1nr_100/$url*");
60    $self->check_status($ua);
61    while (!$done) {
62        @links = $ua->links();
63        $done = 1;
64        foreach $link (@links) {
65            $link_text = $link->text();
66            $link_url = $link->url_abs();
67            printf("> Link: %-40s %s\n", $link_text, $link_url) if ($verbose >= 3);
68            if ($link_url =~ m!^http://web.archive.org/web/.*$seclvl_domain! &&
69                $link_text =~ m!$seclvl_domain!) {
70                printf(">> Archived Document Found: http://%s\n", $link_text) if ($verbose);
71                $link{$link_text} = $link;
72            }
73
74            if ($link_text eq "Next") {
75                $next_url = $link->url_abs();
76            }
77        }
78        if ($next_url) {
79            #print "Next: $next_url\n";
80            $ua->get($next_url);
81            $self->check_status($ua);
82            $done = 0;
83            $next_url = "";
84        }
85    }
86
87    ###################################################################
88    # Mirror cached documents to local file system
89    ###################################################################
90    my ($action, $file);
91    foreach $link_text (sort keys %link) {
92        $link = $link{$link_text};
93        $link_url = $link->url_abs();
94
95        if ($link_url =~ m!^http://web.archive.org/web/([^/]+)/(.*)$!) {
96            $action = $1;
97            $file   = $2;
98            if ($file =~ m!/$!) {
99                print "Probably a directory index [$file] : not retrieving\n" if ($verbose >= 2);
100            }
101            elsif ($file =~ m!/[^/\\\.]+$!) {
102                print "Probably a directory index [$file] : not retrieving\n" if ($verbose >= 2);
103            }
104            elsif ($file =~ m!/\?[DMNS]=[DA]$!) {
105                print "Probably a directory index [$file] : not retrieving\n" if ($verbose >= 2);
106            }
107            else {
108                if ($action eq "*hh_") {
109                    $self->mirror($ua, "http://web.archive.org/http://$file", $file, $dir, $domain);
110                    #print "Getting historical versions [$link_url] ...\n" if ($verbose >= 1);
111                    #$ua->get($link_url);
112                    #$self->check_status($ua);
113                    #if ($ua->success()) {
114                    #    @links = $ua->links();
115                    #    foreach $link (@links) {
116                    #        $link_text2 = $link->text();
117                    #        $link_url2 = $link->url_abs();
118                    #        if ($link_url2 =~ m!^http://web.archive.org/web/.*$domain! &&
119                    #            $link_text2 =~ m!$domain!) {
120                    #            #printf(">> Archived Document Found: http://%s\n", $link_text) if ($verbose);
121                    #            printf("> Link: %-40s %s\n", $link_text2, $link_url2);
122                    #            #$link{$link_text} = $link;
123                    #        }
124                    #    }
125                    #}
126                    #else {
127                    #    print "Can't get URL [$link_url]\n";
128                    #}
129                }
130                elsif ($action =~ /^[0-9]+$/) {
131                    $self->mirror($ua, $link_url, $file, $dir, $domain);
132                }
133                else {
134                    print "Unknown link type [$link_url]\n";
135                }
136            }
137        }
138        else {
139            print "Unknown link type [$link_url]\n";
140        }
141    }
142
143    &App::sub_exit() if ($App::trace);
144}
145
146sub mirror {
147    &App::sub_entry if ($App::trace);
148    my ($self, $ua, $url, $file, $basedir, $domain) = @_;
149    if (! -f "$basedir/$file" || $App::options{clobber}) {
150        $ua->get($url);
151        $self->check_status($ua);
152        if ($ua->success()) {
153            my $content = $ua->content();
154            my $content_type = $ua->ct();
155            if ($content_type eq "text/html") {
156                $content = $self->clean_html($content, $file, $domain);
157            }
158            my $len = length($content);
159            $self->write_file("$basedir/$file", $content);
160            print "Wrote file [$file] ($len bytes)\n";
161        }
162        else {
163            print "Missed file [$file]\n";
164        }
165    }
166    else {
167        print "File exists [$file]\n";
168    }
169    &App::sub_exit() if ($App::trace);
170}
171
172sub clean_html {
173    &App::sub_entry if ($App::trace);
174    my ($self, $html, $file, $domain) = @_;
175
176    # Unix files. No CR's allowed.
177    $html =~ s/\r//g;
178
179    # clean up weird additions to <BASE>. Unfortunately, this wipes out real uses of the <BASE> tag in the original doc.
180    $html =~ s#<!-- base href="[^"<>]*" -->##;
181    $html =~ s#<BASE [^<>]*>\s*##si;  # the first one was put in by Internet Archive
182    $html =~ s#<(BASE [^<>]*)>\s*#<!-- $1 -->#si;  # there may be a real <BASE> tag. keep in comment. all URL's must be relative.
183    #$html =~ s#<link rel="stylesheet" type="text/css" href="[^"]*/style.css">#<link rel="stylesheet" type="text/css" href="style.css">\n#;
184
185    # clean up the spacing to get rid of extraneous lines
186    $html =~ s#<html>\s*#<html>\n#si;
187    $html =~ s#<head>\s*#<head>\n#si;
188    $html =~ s#</title>\s*#</title>\n#si;
189    $html =~ s#</head>\s*#</head>\n#;
190
191    # remove a really odd background="foo.html" attribute from the <body>
192    $html =~ s#<body([^<>]*) background="[^"]*.html?"#<body$1#si;
193
194    # try to rewrite web archive links (which have been made absolute)
195    if ($html =~ s#http://web.archive.org/[^"]*(http://[^"]*)#$1#g) {
196        # if we succeeded and we know the filename and domain ...
197        if ($file && $domain && $html =~ m#http://$domain#) {
198            my $reldir  = "";         # compute a relative root from the filename
199            my $absdir  = $file;
200            $absdir =~ s#^[^/]+##;   # trim off domain part
201            $absdir =~ s#[^/]+$##;    # trim off file part
202            $absdir = "/" if (!$absdir);
203            while (1) {
204                # print "Substituting [$domain$absdir] for [$reldir]\n";
205                $html =~ s#http://$domain$absdir#$reldir#g;  # substitute absolute links to file in the domain with relative paths
206                last if ($absdir eq "/");
207                $absdir =~ s#[^/]+/$##;    # trim off file part
208                $absdir = "/" if (!$absdir);
209                $reldir .= "../";
210            }
211            # print "Substituting [$domain] for [$reldir]\n";
212            $html =~ s#http://$domain#$reldir#g;  # substitute absolute links to file in the domain with relative paths
213        }
214    }
215
216    # get rid of a comment and some javascript added by the Internet Archive
217    $html =~ s#<!--\s*SOME\s*LINK\s*HREF[^<>]>\s*##s;
218    $html =~ s#<!-- SOME LINK HREF'S ON THIS PAGE HAVE BEEN REWRITTEN BY THE WAYBACK MACHINE\s*##s;
219    $html =~ s#OF THE INTERNET ARCHIVE IN ORDER TO PRESERVE THE TEMPORAL INTEGRITY OF THE SESSION. -->\s*##s;
220    $html =~ s#<script[^<>]*>\s*<!--\s*// FILE ARCHIVED[^>]*>\n</script>\s*##si;
221    $html =~ s#<!-- SOME FRAME SRC'S ON THIS PAGE HAVE BEEN REWRITTEN BY THE WAYBACK MACHINE\s*##s;
222    $html =~ s#<!--\s*// FILE ARCHIVED ON [^<>]>\s*##s;
223
224    &App::sub_exit($html) if ($App::trace);
225    return($html);
226}
227
228=head1 NAME
229
230WWW::WebArchive::WaybackMachine - An agent to retrieve files from Internet Archive's Wayback Machine (www.archive.org)
231
232=head1 SYNOPSIS
233
234    NOTE: You probably want to use this module through the WWW::WebArchive API.
235    If not, it's up to you to read the code and figure out how to use this module.
236
237=head1 DESCRIPTION
238
239An agent to retrieve files from Internet Archive's Wayback Machine (www.archive.org)
240
241=head1 ACKNOWLEDGEMENTS
242
243 * Author:  Stephen Adkins <spadkins@gmail.com>
244 * License: This is free software. It is licensed under the same terms as Perl itself.
245
246=head1 SEE ALSO
247
248L<WWW::WebArchive::Agent>, L<WWW::WebArchive>, L<WWW::Mechanize>
249
250=cut
251
2521;
253
254