1 2############################################################################# 3## $Id: WaybackMachine.pm 6702 2006-07-25 01:43:27Z spadkins $ 4############################################################################# 5 6use strict; 7 8package WWW::WebArchive::WaybackMachine; 9 10use WWW::WebArchive::Agent; 11 12use vars qw($VERSION @ISA); 13$VERSION = "0.50"; 14@ISA = ("WWW::WebArchive::Agent"); 15 16use WWW::Mechanize; 17 18sub restore { 19 &App::sub_entry if ($App::trace); 20 my ($self, $options) = @_; 21 22 my $dir = $options->{dir}; 23 $dir = $self->{dir} if (!defined $dir); 24 $dir = "." if (!defined $dir); 25 $dir .= "/$self->{name}"; 26 27 my $url = $options->{url} || die "restore(): URL not provided"; 28 $url =~ s!/$!!; 29 if ($url !~ /^[a-z]+:/) { 30 $url = "http://$url"; 31 } 32 my $domain = $url; 33 $domain =~ s!^[a-z]+://!!; 34 $domain =~ s!/.*!!; 35 my $seclvl_domain = $domain; 36 if ($seclvl_domain =~ /([^\.]+\.[^\.]+)$/) { 37 $seclvl_domain = $1; 38 } 39 40 my $verbose = $options->{verbose}; 41 $verbose = $self->{verbose} if (!defined $verbose); 42 $verbose = 0 if (!defined $verbose); 43 44 ################################################################### 45 # Initialize User Agent 46 ################################################################### 47 my $ua = WWW::Mechanize->new(); 48 $ua->agent_alias("Windows IE 6"); 49 $ua->stack_depth(1); # limit the number of pages we remember to 1 (one back() allowed) 50 51 ################################################################### 52 # Search Internet Archive Wayback Machine for cached documents 53 ################################################################### 54 my (%link, @links, $link); 55 my ($done, $next_url, $link_text, $link_url); 56 my ($link_text2, $link_url2); 57 $done = 0; 58 print "Restoring [$url]\n" if ($verbose); 59 $ua->get("http://web.archive.org/web/*sr_1nr_100/$url*"); 60 $self->check_status($ua); 61 while (!$done) { 62 @links = $ua->links(); 63 $done = 1; 64 foreach $link (@links) { 65 $link_text = $link->text(); 66 $link_url = $link->url_abs(); 67 printf("> Link: %-40s %s\n", $link_text, $link_url) if ($verbose >= 3); 68 if ($link_url =~ m!^http://web.archive.org/web/.*$seclvl_domain! && 69 $link_text =~ m!$seclvl_domain!) { 70 printf(">> Archived Document Found: http://%s\n", $link_text) if ($verbose); 71 $link{$link_text} = $link; 72 } 73 74 if ($link_text eq "Next") { 75 $next_url = $link->url_abs(); 76 } 77 } 78 if ($next_url) { 79 #print "Next: $next_url\n"; 80 $ua->get($next_url); 81 $self->check_status($ua); 82 $done = 0; 83 $next_url = ""; 84 } 85 } 86 87 ################################################################### 88 # Mirror cached documents to local file system 89 ################################################################### 90 my ($action, $file); 91 foreach $link_text (sort keys %link) { 92 $link = $link{$link_text}; 93 $link_url = $link->url_abs(); 94 95 if ($link_url =~ m!^http://web.archive.org/web/([^/]+)/(.*)$!) { 96 $action = $1; 97 $file = $2; 98 if ($file =~ m!/$!) { 99 print "Probably a directory index [$file] : not retrieving\n" if ($verbose >= 2); 100 } 101 elsif ($file =~ m!/[^/\\\.]+$!) { 102 print "Probably a directory index [$file] : not retrieving\n" if ($verbose >= 2); 103 } 104 elsif ($file =~ m!/\?[DMNS]=[DA]$!) { 105 print "Probably a directory index [$file] : not retrieving\n" if ($verbose >= 2); 106 } 107 else { 108 if ($action eq "*hh_") { 109 $self->mirror($ua, "http://web.archive.org/http://$file", $file, $dir, $domain); 110 #print "Getting historical versions [$link_url] ...\n" if ($verbose >= 1); 111 #$ua->get($link_url); 112 #$self->check_status($ua); 113 #if ($ua->success()) { 114 # @links = $ua->links(); 115 # foreach $link (@links) { 116 # $link_text2 = $link->text(); 117 # $link_url2 = $link->url_abs(); 118 # if ($link_url2 =~ m!^http://web.archive.org/web/.*$domain! && 119 # $link_text2 =~ m!$domain!) { 120 # #printf(">> Archived Document Found: http://%s\n", $link_text) if ($verbose); 121 # printf("> Link: %-40s %s\n", $link_text2, $link_url2); 122 # #$link{$link_text} = $link; 123 # } 124 # } 125 #} 126 #else { 127 # print "Can't get URL [$link_url]\n"; 128 #} 129 } 130 elsif ($action =~ /^[0-9]+$/) { 131 $self->mirror($ua, $link_url, $file, $dir, $domain); 132 } 133 else { 134 print "Unknown link type [$link_url]\n"; 135 } 136 } 137 } 138 else { 139 print "Unknown link type [$link_url]\n"; 140 } 141 } 142 143 &App::sub_exit() if ($App::trace); 144} 145 146sub mirror { 147 &App::sub_entry if ($App::trace); 148 my ($self, $ua, $url, $file, $basedir, $domain) = @_; 149 if (! -f "$basedir/$file" || $App::options{clobber}) { 150 $ua->get($url); 151 $self->check_status($ua); 152 if ($ua->success()) { 153 my $content = $ua->content(); 154 my $content_type = $ua->ct(); 155 if ($content_type eq "text/html") { 156 $content = $self->clean_html($content, $file, $domain); 157 } 158 my $len = length($content); 159 $self->write_file("$basedir/$file", $content); 160 print "Wrote file [$file] ($len bytes)\n"; 161 } 162 else { 163 print "Missed file [$file]\n"; 164 } 165 } 166 else { 167 print "File exists [$file]\n"; 168 } 169 &App::sub_exit() if ($App::trace); 170} 171 172sub clean_html { 173 &App::sub_entry if ($App::trace); 174 my ($self, $html, $file, $domain) = @_; 175 176 # Unix files. No CR's allowed. 177 $html =~ s/\r//g; 178 179 # clean up weird additions to <BASE>. Unfortunately, this wipes out real uses of the <BASE> tag in the original doc. 180 $html =~ s#<!-- base href="[^"<>]*" -->##; 181 $html =~ s#<BASE [^<>]*>\s*##si; # the first one was put in by Internet Archive 182 $html =~ s#<(BASE [^<>]*)>\s*#<!-- $1 -->#si; # there may be a real <BASE> tag. keep in comment. all URL's must be relative. 183 #$html =~ s#<link rel="stylesheet" type="text/css" href="[^"]*/style.css">#<link rel="stylesheet" type="text/css" href="style.css">\n#; 184 185 # clean up the spacing to get rid of extraneous lines 186 $html =~ s#<html>\s*#<html>\n#si; 187 $html =~ s#<head>\s*#<head>\n#si; 188 $html =~ s#</title>\s*#</title>\n#si; 189 $html =~ s#</head>\s*#</head>\n#; 190 191 # remove a really odd background="foo.html" attribute from the <body> 192 $html =~ s#<body([^<>]*) background="[^"]*.html?"#<body$1#si; 193 194 # try to rewrite web archive links (which have been made absolute) 195 if ($html =~ s#http://web.archive.org/[^"]*(http://[^"]*)#$1#g) { 196 # if we succeeded and we know the filename and domain ... 197 if ($file && $domain && $html =~ m#http://$domain#) { 198 my $reldir = ""; # compute a relative root from the filename 199 my $absdir = $file; 200 $absdir =~ s#^[^/]+##; # trim off domain part 201 $absdir =~ s#[^/]+$##; # trim off file part 202 $absdir = "/" if (!$absdir); 203 while (1) { 204 # print "Substituting [$domain$absdir] for [$reldir]\n"; 205 $html =~ s#http://$domain$absdir#$reldir#g; # substitute absolute links to file in the domain with relative paths 206 last if ($absdir eq "/"); 207 $absdir =~ s#[^/]+/$##; # trim off file part 208 $absdir = "/" if (!$absdir); 209 $reldir .= "../"; 210 } 211 # print "Substituting [$domain] for [$reldir]\n"; 212 $html =~ s#http://$domain#$reldir#g; # substitute absolute links to file in the domain with relative paths 213 } 214 } 215 216 # get rid of a comment and some javascript added by the Internet Archive 217 $html =~ s#<!--\s*SOME\s*LINK\s*HREF[^<>]>\s*##s; 218 $html =~ s#<!-- SOME LINK HREF'S ON THIS PAGE HAVE BEEN REWRITTEN BY THE WAYBACK MACHINE\s*##s; 219 $html =~ s#OF THE INTERNET ARCHIVE IN ORDER TO PRESERVE THE TEMPORAL INTEGRITY OF THE SESSION. -->\s*##s; 220 $html =~ s#<script[^<>]*>\s*<!--\s*// FILE ARCHIVED[^>]*>\n</script>\s*##si; 221 $html =~ s#<!-- SOME FRAME SRC'S ON THIS PAGE HAVE BEEN REWRITTEN BY THE WAYBACK MACHINE\s*##s; 222 $html =~ s#<!--\s*// FILE ARCHIVED ON [^<>]>\s*##s; 223 224 &App::sub_exit($html) if ($App::trace); 225 return($html); 226} 227 228=head1 NAME 229 230WWW::WebArchive::WaybackMachine - An agent to retrieve files from Internet Archive's Wayback Machine (www.archive.org) 231 232=head1 SYNOPSIS 233 234 NOTE: You probably want to use this module through the WWW::WebArchive API. 235 If not, it's up to you to read the code and figure out how to use this module. 236 237=head1 DESCRIPTION 238 239An agent to retrieve files from Internet Archive's Wayback Machine (www.archive.org) 240 241=head1 ACKNOWLEDGEMENTS 242 243 * Author: Stephen Adkins <spadkins@gmail.com> 244 * License: This is free software. It is licensed under the same terms as Perl itself. 245 246=head1 SEE ALSO 247 248L<WWW::WebArchive::Agent>, L<WWW::WebArchive>, L<WWW::Mechanize> 249 250=cut 251 2521; 253 254