# -*-perl-*-

use Config;

&read_makefile;
$fullperl = resolve_make_var('FULLPERL') || $Config{'perlpath'};
$islib = resolve_make_var('INSTALLSITELIB');

$name = $0;
$name =~ s~^.*/~~;
$name =~ s~\.PL$~~;

open(OUT,"> $name") ||
  die "Could not open $name for writing: $!\n";

print "writing $name\n";

while (<DATA>) {
  if (m~^\#!/.*/perl.*$~o) {
    # This substitutes the path perl was installed at on this system
    # _and_ removes any (-w) options.
    print OUT "#!",$fullperl,"\n";
    next;
  }
  if (/^use lib/o) {
    # This substitutes the actual library install path
    print OUT "use lib '$islib';\n";
    next;
  }
  print OUT;
}

close(OUT);

# Make it executable too, and writable
chmod 0755, $name;

#### The library

sub resolve_make_var ($) {
  # Expand make variable references of the form $(VAR) in the value of
  # $var, using the assignments collected by read_makefile, until none
  # remain.

  my($var) = shift @_;
  my($val) = $make{$var};

#  print "Resolving: ",$var,"=",$val,"\n";

  while ($val =~ s~\$\((\S+)\)~$make{$1}~g) {}
#  print "Resolved: $var: $make{$var} -> $val\n";
  $val;
}


sub read_makefile {
  # Read simple VAR = value assignments from the Makefile into %make.

  open(MAKEFILE, 'Makefile') ||
    die "Could not open Makefile for reading: $!\n";

  while (<MAKEFILE>) {
    chomp;
    next unless m/^([A-Z]+)\s*=\s*(\S+)$/;
    $make{$1}=$2;
#    print "Makevar: $1 = $2\n";
  }

  close(MAKEFILE);
}
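# A minimal illustration of how resolve_make_var chases nested
# references (the Makefile lines below are hypothetical, not part of
# this distribution):
#
#   # Given a Makefile containing:
#   #   PREFIX = /usr/local
#   #   INSTALLSITELIB = $(PREFIX)/lib/perl5/site_perl
#   &read_makefile;
#   print resolve_make_var('INSTALLSITELIB'), "\n";
#   # prints: /usr/local/lib/perl5/site_perl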
__END__
#!/usr/bin/perl -w
# Perl 5.002 or later.  w3mir is mostly tested with perl 5.004
#
use lib '/hom/janl/lib/perl';
#
# Once upon a long time ago this was Oscar Nierstrasz's
# <oscar@cui.unige.ch> htget script.
#
# Retrieves HTML pages, creating local copies in the _current_
# directory.  The script will check for the last-modified stamp on the
# document, and will not fetch it if the document isn't changed.
#
# Bug list is in w3mir-README.
#
# Test cases for janl to use:
#   w3mir -r -fs http://www.eff.org/ - infinite recursion!
#   --- but cursory examination seems to indicate confused server...
#   http://java.sun.com/progGuide/index.html check out the img things.
#
# Copyright Holders:
#   Nicolai Langfeldt, janl@ifi.uio.no
#   Gorm Haug Eriksen, gorm@usit.uio.no
#   Chris Szurgot, szurgot@itribe.net
#   Ed Jordan, ed@olympus.itl.net
#   Alex Knowles, aknowles@avs.com aka ark.
# Copying and modification is governed by the "Artistic License" enclosed
# in the w3mir distribution.
#
# History (European format date: dd/mm/yy):
#  oscar 25/03/94 -- added -s option to send output to stdout
#  oscar 28/03/94 -- made HTTP 1.0 the default
#  oscar 30/05/94 -- special handling of directory URLs missing a trailing "/"
#  gorm  20/02/95 -- added mirror capacity + fixed a couple of bugs
#  janl  28/03/95 -- added a working commandline parser.
#  janl  18/09/95 -- Changed to use a net http library.  Removed dependency
#                    on url.pl.
#  janl  19/09/95 -- Extensive rewrite.  Simplified a lot, works better.
#                    HTML files are now saved in a new and improved manner,
#                    which means they can be recognized as such w/o fancy
#                    filename extension type rules.
#  szurgot 27/01/96-- Added "Plaintextmode" wrapper to binmode PAGE.
#                    binmode PAGE is required under Win32, but broke modified
#                    checking.
#          -- Minor change: added ; to "# '" strings for Emacs cperl-mode
#  szurgot 07/02/96-- When reading in local file for checking of URLs, changed
#                    local ($/) =0; to equal undef;
#  janl  08/02/96 -- Added szurgot's changes and changed them :-)
#  szurgot 09/02/96-- Added code to strip /#.*$/ from urls when reading from
#                    local file
#          -- Added hasAlarm variable to w3http.pl.  Set to 1 if you have
#             alarm(), 0 otherwise.
#          -- Moved code setting up the valid extensions list into the
#             args processing where it belonged
#  janl  20/02/96 -- Added szurgot changes again.
#          -- Made timeout code work.
#          -- and made another win32 test.
#  janl  19/03/96 -- Worked through the code for handling not-modified
#                    documents, it was a bit shabby after htmlop was intro'ed.
#  janl  20/03/96 -- -l fix
#  janl  23/04/96 -- Added -fs by request (by Rik Faith)
#  janl  16/05/96 -- Made -R mandatory, added use and support for
#                    w3http::SAVEBIN
#  szurgot 19/05/96-- Win95 adaptations.
#  janl  19/05/96 -- -C did not exactly work as expected.  Thanks to Petr
#                    Novak for bug descriptions.
#  janl  19/05/96 -- Changed logic for @didntget, @got and so on to use
#                    @queue and %urlstat.
#  janl  09/09/96 -- Removed -R switch.
#  janl  14/09/96 -- Added ir (initial referer) switch
#  janl  21/09/96 -- Made retry code saner.  There probably needs to be a
#                    sleep-before-retry-commencing switch.  When no tty is
#                    present it should be fairly long.
#  gorm  15/09/96 -- Added cr (check robot) switch.  Default to 1 (on)
#  janl  22/09/96 -- Modified gorm's patch to use WWW::RobotRules.  Changed
#                    robot switch to be consistent with current w3mir
#                    practice.
#  janl  27/09/96 -- Spelling corrections from charles.curran@ox.ac.uk
#          -- Folded in manual diffs from ark.
#  ark   24/09/96 -- Simple facilities to edit the incoming file(s)
#  janl  27/09/96 -- Added switch to enable <!--NOMIRROR--> editing and
#                    foolproofed ark's patch a bit.
#  janl  02/10/96 -- Added -umask switch.
#          -- Redirected documents did not have a meaningful referer
#             value (it was undefined).
#          -- Got w3mir into strict discipline, found some typos...
#  janl  20/10/96 -- Mtime is preserved
#  janl  21/10/96 -- -lc switch added.  Mtime preservation works better.
#  janl  06/11/96 -- Treat 301 like 302.
#  janl  02/12/96 -- Added config file code, fetch/ignore rules, apply
#  janl  04/12/96 -- Better checking of config input.
#  janl  06/12/96 -- Putting together the URL selection/editing brains.
#  janl  07/12/96 -- Checking out some bugs.  Adding multiscope options.
#  janl  12/12/96 -- Adding to and defeaturing the multiscope options.
#  janl  13/12/96 -- Continuing work on multiscope stuff.
#          -- Unreferenced file and empty directory removal works.
#  janl  19/02/97 -- Can extract urls from adobe acrobat pdf files :-)
#                    Important: It does _not_ edit urls, so they still
#                    point at the original site(s).
#  janl  21/02/97 -- Fix -lc bug related to case and the apply things.
#          -- only use SAVEURL if needed
#  janl  11/03/97 -- Finish work on SAVEURL conditional.
#          -- Fixed directory removal code.
#          -- parse_args did not abort when unknown option/argument
#             was specified.
#  janl  12/03/97 -- Made test case for -lc.  Didn't work.  Fixed it.  I think.
#                    Realized we have a bug w.r.t. hostname casing.
#  janl  13/03/97 -- All redirected-to URLs within scope are now queued.
#                    That should make the mirror more complete, but it won't
#                    help browsability when it comes to the redirected doc.
#          -- Moved robot retrieval to the inside of the mirror loop
#             since we now possibly mirror several sites.
#          -- Changed 'fetch-options' to 'options'.
#          -- Added 'proxy-options'/-pflush to control proxy server(s).
#  janl  09/04/97 -- Started using URI::URL.
#  janl  11/04/97 -- Debugging and using URI::URL more correctly various places
#  janl  09/05/97 -- Added --agent switch
#  janl  12/05/97 -- Simplified scope checks for root URL, changed URL 'apply'
#                    processing.
#          -- Small output formatting fix in the robot rules code.
#          -- Version is now 0.99
#  janl  14/05/97 -- htmlop no longer puts '<!DOCTYPE...' into doc, so check
#                    for '<HTML' instead
#  janl  11/06/97 -- Made :port optional in server part of auth-domain.
#                    Always removing :80 from server part to match netloc.
#  janl  22/07/97 -- More debugging of rewrite for new features -B, -I.
#  janl  01/08/97 -- Fixed bug in RE quoting for Ignore/Fetch
#  janl  04/08/97 -- s/writepage/write_page/g
#  janl  07/09/97 -- 0.99b1 is released
#  janl  19/09/97 -- Kaj Hejer discovers omissions in non-html-url-mining code.
#          -- 0.99b2 is released
#  janl  24/09/97 -- Matt Chapman found bug in realm-name extraction.
#  janl  10/10/97 -- Referer: header suppression suppressed User: header instead
#          -- Added fixup handling, writes .redirs and .referers
#             (no dot in win32)
#          -- Read .w3mirc (w3mir.ini on win32) if present
#          -- Stop file removal code from removing these files
#  janl  16/10/97 -- process_tag was mangling url attributes in tags with more
#                    than one of them.  Problem found by Robert L. Binkley
#  janl  04/12/97 -- Fixed problem with authentication, misplaced +
#          -- default inter-document pause is 0.  I figure it's better
#             to keep one httpd occupied in a steady stream than to
#             wait for it to die before we talk to it again.
#  janl  13/12/97 -- The code handling arguments to index.html in the form
#                    of index.html/foo was incomplete.  To make it complete
#                    would have been hard, so it was removed.
#          -- If a URL changes from file to directory or vice versa
#             this is now handled.
#  janl  11/01/98 -- PDF files with no URLs do not cause warnings now.
#          -- Close REFERERS and REDIRECTS before calling w3mfix
#  janl  22/01/98 -- Proxy authentication as outlined by Christian Geuer
#  janl  04/02/98 -- Version 1pre1
#  janl  18/02/98 -- Fixed wild_re after tip by Prentiss Riddle.
#          -- Version 1pre2
#  janl  20/02/98 -- w3http updated to handle complex content-types.
#          -- Fix wild_re more, bug noted by James Dumser
#          -- 1.0pre3
#  janl  18/03/98 -- Version 1.0 is released
#  janl  09/04/98 -- Added feature so user can disable newline conversion.
#  janl  20/04/98 -- Only convert newlines in HTML files. -> 1.0.2
#  janl  09/05/98 -- More careful clean_disk code.
#          -- Check if the redirected URL was a root url, if so
#             issue a warning and exit.
#  janl  12/05/98 -- use ->unix_path instead of ->as_string to derive local
#                    filename.
#  janl  25/05/98 -- -B didn't work too well.
#  janl  09/07/98 -- Redirect to fragment broke us, less broken now -> 1.0.4
#  janl  24/09/98 -- Better error messages on errors -> 1.0.5
#  janl  21/11/98 -- Fix error messages better.
#  janl  05/01/99 -- Drop 'Referer: (commandline)'
#  janl  13/04/99 -- Add initial referer to root urls in batch mode.
#  janl  15/01/00 -- Remove some leftover print statements
#          -- Fix also-queue problem as suggested by Sven Koch
#  janl  04/02/01 -- Use epath instead of path quite often -> 1.0.10
#
# Variable name discipline:
#  - remote, unmodified URL.  Variables prefixed 'rum_'
#  - local, filesystem.  Variables prefixed 'lf_'.
# Use these prefixes so we know what we're working with at all times.
# Also, URL objects are postfixed _o
#
# The apply rules and scope rules work this way:
#  - First apply the user rules to the remote url.
#  - Check if document is within scope after this.
#  - Then apply w3mir's rules to the result.  The result is the local,
#    filesystem, name.
#
# We use features introduced in 5.002.
require 5.002;

# win32 and $nulldevice need to be globals, other modules use them.
use vars qw($win32 $nulldevice);

# To figure out what kind of system this is
BEGIN {
  use Config;
  $win32 = ( $Config{'osname'} eq 'MSWin32' );
}
# More ways to die:
use Carp;
# Http module:
use w3http;
# html url extraction and manipulation:
use htmlop;
# Extract urls from adobe acrobat pdf files:
use w3pdfuri;
# Date computer:
use HTTP::Date;
# URLs:
use URI::URL;
# For flush method
use FileHandle;

eval '
use URI;
$URI::ABS_ALLOW_RELATIVE_SCHEME=1;
$URI::ABS_REMOTE_LEADING_DOTS=1;
';

# Full discipline:
use strict;

# Set params in the http package, HTTP protocol version:
$w3http::version="1.0";

# The defaults should be for a robotic http agent on good behaviour.
my $debug=0;            # Debug level
my $verbose=0;          # Verbosity level, -1 = quiet, 0 = normal, 1...
my $pause=0;            # Pause between http requests
my $retryPause=600;     # Pause between retries.  10 minutes.
my $retry=3;            # Max 3 stabs per url.
my $r=0;                # Recurse?  no recursion = absolutify links
my $remove=0;           # Remove files that are not there?
my $s=0;                # 0: save on disk  1: stdout  2: just forget 'em
my $useauth=0;          # Use authorization
my %authdata;           # Authorization data
my $check_robottxt = 1; # Check robots.txt
my $do_referer = 1;     # Send referer header
my $do_user = 1;        # Send user header
my $cache_header = '';  # The cache-control/pragma: no-cache header
my $using_proxy = 0;    # Using proxy server or not?
my $batch=0;            # Batch get URLs?
my $read_urls=0;        # Get urls from STDIN?
my $abs=0;              # Absolutify URLs?
my $immediate_redir=0;  # Immediately follow a redirect?
my @root_urls;          # This is where we start, the root documents
my @root_dirs;          # The corresponding directories, for remove
my $chdirto='';         # Place to chdir to after reading config file
my %nodelete=();        # Files that should not be deleted
my $numarg=0;           # Number of arguments accepted.
my $list_nomir=0;       # List files not mirrored?

# Fixup related things
my $fixrc='';           # Name of w3mfix config file
my $fixup=1;            # Do things needed to run fixup
my $runfix=0;           # Run w3mfix for user?
my $fixopen=0;          # Fixup files open?

my $indexname='index.html';

my $VERSION;
$VERSION='1.0.10';
$w3http::agent = my $w3mir_agent = "w3mir/$VERSION-2001-01-20";
my $iref='';            # Initial referer.  Must evaluate to false

# Derived settings
my $mine_urls=0;        # Mine URLs from documents?
my $process_urls=0;     # Perform (URL) processing of documents?

# Queue of urls to get.
my @rum_queue = ();
my @urls = ();
# URL status map.
my %rum_urlstat = ();
# Status codes:
my $QUEUED   = 0;       # Queued but not gotten yet.
my $TERROR   = 100;     # Transient error, retry later
my $HLERR    = 101;     # Permanent error, give up
my $GOTIT    = 200;     # Gotten.  Note similarity to http result code
my $NOTMOD   = 304;     # Not modified.
# Negative codes for nonexistent files, easier to check.
my $NEVERMIND= -1;      # Don't want it
my $REDIR    = -302;    # Does not exist, redirected
my $ENOTFND  = -404;    # Does not exist.
my $OTHERERR = -600;    # Some other error happened
my $FROBOTS  = -601;    # Forbidden by robots.txt rule

# Directory/files survey:
my %lf_file;    # What files are present in FS?  Disposition?  One of:
my $FILEDEL=0;  # Delete file
my $FILEHERE=1; # File present in filesystem only
my $FILETHERE=2;# File present on server too.
my %lf_dir;     # Number of files/dirs in dir.  If 0 the dir is
                # eligible for deletion.

my %fiddled=(); # If a file becomes a directory or a directory
                # becomes a file it is considered fiddled and
                # w3mir will not fiddle with it again in this
                # run.

# Bitbucket device, very OS dependent.
$nulldevice='/dev/null';
$nulldevice='nul:' if ($win32);

# What to get, and not.
# Text of user supplied fetch/ignore rules
my $rule_text="  # User defined fetch/ignore rules\n";
# Code ref to the rule procedure
my $rule_code;

# Code to prefix and postfix the generated code.  Prefix should make
# $_ contain the url to match.  Postfix should return 1, the default
# is to get the url/file.
my $rule_prefix='$rule_code = sub { local($_) = shift;'."\n";
my $rule_postfix="  return 1;\n}";

# Scope tests generated by URL/Also directives in cfg.  The scope code
# is just like the rule code, but used for program generated
# fetch/ignore rules related to multiscope retrieval.
my $scope_fetch="  # Automatic fetch rules for multiscope retrieval\n";
my $scope_ignore="  # Automatic ignore rules for multiscope retrieval\n";
my $scope_code;

my $scope_prefix='$scope_code = sub { local($_) = shift;'."\n";
my $scope_postfix="  return 0;\n}";

# Function to apply to urls, see rule comments.
my $user_apply_code;    # User specified apply code
my $apply_code;         # w3mir's apply code
my $apply_prefix='$apply_code = sub { local($_) = @_;'."\n";
my $apply_lc='  $_ = lc $_; ';
my $apply_postfix='  return $_;'."\n}";
my @user_apply;         # List of user's apply rules.
my @internal_apply;     # List of w3mir's apply rules.

my $infoloss=0;         # 1 if any URL translations (which cause
                        # information loss) are in effect.  If this is
                        # true we use the SAVEURL operation.
my $list;               # List url on STDOUT?
my $edit;               # Edit doc?  Remove <!--NOMIRROR-->...<!--/NOMIRROR-->
my $header;             # Text to insert in header
my $lc=0;               # Convert urls/filenames to lowercase?
my $fetch=0;            # What to fetch: -1: Some, 0: not modified, 1: all
my $convertnl=1;        # Convert newlines?

# Non text/html formats we can extract urls from.  Function must take
# one argument: the filename.
my %knownformats = ( 'application/pdf',   \&w3pdfuri::list,
                     'application/x-pdf', \&w3pdfuri::list,
                   );
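# For illustration only: a hypothetical extractor for another binary
# format would be wired in by adding a parallel entry to both maps
# (no such w3psuri module ships with w3mir):
#
#   # $knownformats{'application/postscript'} = \&w3psuri::list;
#   # $knownmagic{'%!PS-'} = 'application/postscript';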
# Known 'magic numbers' of the known formats.  The value is used as a
# key in %knownformats.  The key is an exact match for the string
# beginning at the first byte of the file.  This should probably be
# made more flexible, but not until we need it.

my %knownmagic = ( '%PDF-', 'application/pdf' );

my $iinline='';         # inline RE code to make RE case insensitive
my $ipost='';           # RE postfix to make it case insensitive

usage() unless parse_args(@ARGV);

{
  my $w3mirc='.w3mirc';

  $w3mirc='w3mir.ini' if $win32;

  if (-f $w3mirc) {
    parse_cfg_file($w3mirc);
    $nodelete{$w3mirc}=1;
  }
}

# Check arguments and options
if ($#root_urls<0) {
  usage("No URLs given");
}

# Are we converting newlines today?
$w3http::convert=0 unless $convertnl;

if ($chdirto) {
  &mkdir($chdirto.'/this-is-not-created-odd-or-what');
  chdir($chdirto) ||
    die "w3mir: Can't change working directory to '$chdirto': $!\n";
}

$SIG{'INT'}=sub { print STDERR "\nCaught SIGINT!\n"; exit 1; };
$SIG{'QUIT'}=sub { print STDERR "\nCaught SIGQUIT!\n"; exit 1; };
$SIG{'HUP'}=sub { print STDERR "\nCaught SIGHUP!\n"; exit 1; };

&open_fixup if $fixup;

# Derive how much document processing we should do.
$mine_urls=( $r || $list );
$process_urls=(!$batch && !$edit && !$header);
# $abs can be set explicitly with -abs, and implicitly if not recursing
$abs = 1 unless $r;
print "Absolute references\n" if $abs && $debug;

# Cache-control specified but proxy not in use?
die "w3mir: If you want to control a cache, use a proxy server!\n"
  if ($cache_header && !$using_proxy);

# Compile the second order code

# - The rum scope tests
my $full_rules=$scope_prefix.$scope_fetch.$scope_ignore.$scope_postfix;
# warn "Scope rules:\n-------------\n$full_rules\n---------------\n";
eval $full_rules;
die "$@" if $@;

die "w3mir: Program generated rules did not compile.\nPlease report to w3mir-core\@usit.uio.no.  The code is:\n----\n".
  $full_rules."\n----\n"
    if !defined($scope_code);

$full_rules=$rule_prefix.$rule_text.$rule_postfix;
# warn "Fetch rules:\n-------------\n$full_rules\n---------------\n";
eval $full_rules;
die "$@" if $@;

# - The user specified rum tests
die "w3mir: Ignore/Fetch rules did not compile.\nPlease report to w3mir-core\@usit.uio.no.  The code is:\n----\n".
  $full_rules."\n----\n"
    if !defined($rule_code);

# - The user specified apply rules

# $SIG{__WARN__} = sub { print "$_[0]\n"; confess ""; };

my $full_apply=$apply_prefix.($lc?$apply_lc:'').
  join($ipost.";\n",@user_apply).(($#user_apply>=0)?$ipost:"").";\n".
  $apply_postfix;

eval $full_apply;
die "$@" if $@;

die "w3mir: User apply rules did not compile.\nPlease report to w3mir-core\@usit.uio.no.  The code is:
----
".$full_apply."
----\n" if !defined($apply_code);

# print "user apply: $full_apply\n";
$user_apply_code=$apply_code;

# - The w3mir generated apply rules

$full_apply=$apply_prefix.($lc?$apply_lc:'').
  join($ipost.";\n",@internal_apply).(($#internal_apply>=0)?$ipost:"").";\n".
  $apply_postfix;
eval $full_apply;
die "$@" if $@;

die "Internal apply rules did not compile.  The code is:
----
".$full_apply."
----\n" if !defined($apply_code);
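# To make the second-order code above concrete: with -lc off and a
# single, hypothetical apply rule 's~^http://www\.example\.com/docs/~~'
# in @user_apply, the string assembled in $full_apply and eval'ed above
# would read
#
#   $apply_code = sub { local($_) = @_;
#   s~^http://www\.example\.com/docs/~~;
#     return $_;
#   }
#
# i.e. a sub taking a rum URL and returning the (possibly rewritten)
# URL; that is how both $user_apply_code and $apply_code get defined.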
# - Information loss via -lc?  There are other sources as well.
$infoloss=1 if $lc;

warn "Infoloss is $infoloss\n" if $debug;

# More setup:

$w3http::debug=$debug;

$w3http::verbose=$verbose;

my %rum_referers=();    # Array of referers, key: rum_url
my $Robot_Blob;         # WWW::RobotRules object, decides if rum_url is
                        # forbidden to access for us.
my $rum_url_o;          # rum url, mostly the current, the one we're getting
my %gotrobots;          # Did I get robots.txt from site? key: url->netloc
my($authuser,$authpass);# Username and password for authentication with server
my @rum_newurls;        # List of rum_urls in document

if ($check_robottxt) {
  # Eval is the only way to defer loading of a module until we know
  # it's needed?
  eval 'use WWW::RobotRules;';

  die "Could not load WWW::RobotRules, try -drr switch\n"
    unless defined(&WWW::RobotRules::parse);

  $Robot_Blob = new WWW::RobotRules $w3mir_agent;
}

# We have several main modes of operation.  Here we select one.
if ($r) {

  die "w3mir: No URLs?  Try 'w3mir -h' for help.\n"
    if $#root_urls==-1;

  warn "Recursive retrieval commencing\n" if $debug;

  die "w3mir: Sorry, you cannot combine -r/recurse with -I/read_urls\n"
    if $read_urls;

  # Recursive
  my $url;
  foreach $url (@root_urls) {
    warn "Root url dequeued: $url\n" if $debug;
    if (want_this($url)) {
      queue($url);
      &add_referer($url,$iref);
    } else {
      die "w3mir: Inconsistent configuration: Specified $url is not inside retrieval scope\n";
    }
  }
  mirror();

} else {
  if ($batch) {
    warn "Batch retrieval commencing\n" if $debug;
    # Batch get
    if ($read_urls) {
      # Get URLs from <STDIN>
      while (<STDIN>) {
        chomp;
        &add_referer($_,$iref);
        batch_get($_);
      }
    } else {
      # Get URLs from commandline
      my $url;
      foreach $url (@root_urls) {
        &add_referer($url,$iref);
      }
      foreach $url (@root_urls) {
        batch_get($url);
      }
    }
  } else {
    warn "Single url retrieval commencing\n" if $debug;

    # A single URL, with all processing on
    die "w3mir: You specified several URLs and not -B/batch\n"
      if $#root_urls>0;
    queue($root_urls[0]);
    &add_referer($root_urls[0],$iref);
    mirror();
  }
}

&close_fixup if $fixup;

# This should clean up files:
&clean_disk if $remove;

warn "w3mir: That's all (".$w3http::xfbytes.'+'.$w3http::headbytes.
     " bytes of it).\n" unless $verbose<0;

if ($runfix) {
  eval 'use Config;';
  warn "Running w3mfix\n";
  if ($win32) {
    CORE::system($Config{'perlpath'}." w3mfix $fixrc");
  } else {
    CORE::system("w3mfix $fixrc");
  }
}

exit 0;
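# A sketch of the %authdata lookup structure that get_document (below)
# consults for basic authentication: keyed on netloc, then on
# lowercased realm, with '*' as a wildcard on either level.  The names
# and values here are invented, in whatever form w3http::query expects
# for its AUTHORIZ argument:
#
#   # $authdata{'www.example.com:80'}{'secret files'} = 'user:password';
#   # $authdata{'*'}{'*'}                             = 'guest:guest';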
sub get_document {
  # Get one document by HTTP (arg 1/rum_url_o).  Save in given filename
  # (arg 2/lf_url).  Possibly returning references found in the
  # document.  Caller must set up referer array, check wantedness and
  # everything else.  We handle authentication here though.

  my($rum_url_o)=shift;
  my($lf_url)=shift;
  croak("\$rum_url_o is empty") if !defined($rum_url_o) || !$rum_url_o;
  croak("\$lf_url is empty") if !defined($lf_url) || !$lf_url;

  # Make sure it's an object
  $rum_url_o = url $rum_url_o
    unless ref $rum_url_o;

  my($slash)=($lf_url =~ /^\//);
  # Derive a filename from the url, the filename contains no URL-quoting
  my($lf_name) = (url "file:$lf_url")->unix_path;
  $lf_name =~ s~^/~~ if (!$slash);

  # Make all intermediate directories
  &mkdir($lf_name) if $s==0;

  my($rum_as_string) = $rum_url_o->as_string;

  print STDERR "GET_DOCUMENT: '",$rum_as_string,"' -> '",$lf_name,"'\n"
    if $debug;

  my $hostport;
  my $www_auth='';      # Value of that http reply header
  my $page_ref;
  my @rum_newurls;      # List of URLs extracted
  my $url_extractor;
  my $do_query;         # Do query or not?

  if (defined($rum_urlstat{$rum_as_string}) &&
      $rum_urlstat{$rum_as_string}>0) {
    warn "w3mir: Internal error, ".$rum_as_string.
      " queued several times\n";
    return ();
  }

  # Goto here if we want to retry b/c of authentication
 try_again:

  # Start building the extra http::query arguments again
  my @EXTRASTUFF=();

  # We'll start by assuming that we're doing the query.
  $do_query=1;

  # If we're not checking the timestamp, or the file does not exist,
  # then we get the file unconditionally.  Otherwise we only want it
  # if it's updated.

  if ($fetch==1) {
    # Nothing to do?
  } else {
    if (-f $lf_name) {
      if ($fetch==-1) {
        print STDERR "w3mir: ".($infoloss?$rum_as_string:$lf_name).
          ", already have it" if $verbose>=0;
        if (!$mine_urls) {
          # If -fs and the file exists and we don't need to mine URLs
          # we're finished!
          warn "Already have it, no mining, returning!\n" if $debug;
          print STDERR "\n" if $verbose>=0;
          return;
        }
        $w3http::result=1304;   # Pretend it was 'not modified'
        $do_query=0;
      } else {
        push(@EXTRASTUFF,$w3http::IFMODF,$lf_name);
      }
    }
  }

  if ($do_query) {

    # Does the server want authorization for this file?  $www_auth is
    # only set if authentication was requested the first time around.

    # For testing:
    # $www_auth='Basic realm="foo"';

    if ($www_auth) {
      my($authdata,$method,$realm);

      ($method,$realm)= $www_auth =~ m/^(\S+)\s+realm=\"([^\"]+)\"/i;
      $method=lc $method;
      $realm=lc $realm;
      die "w3mir: '$method' authentication needed, don't know that.\n"
        if ($method ne 'basic');

      $hostport = $rum_url_o->netloc;
      $authdata=$authdata{$hostport}{$realm} || $authdata{$hostport}{'*'} ||
        $authdata{'*'}{$realm} || $authdata{'*'}{'*'};

      if ($authdata) {
        push(@EXTRASTUFF,$w3http::AUTHORIZ,$authdata);
      } else {
        print STDERR "w3mir: No authorization data for $hostport/$realm\n";
        $rum_urlstat{$rum_as_string}=$NEVERMIND;
        return ();
      }
    }

    push(@EXTRASTUFF,$w3http::FREEHEAD,$cache_header)
      if ($cache_header);

    # Insert referer header data if at all
    push(@EXTRASTUFF,$w3http::REFERER,$rum_referers{$rum_as_string}[0])
      if ($do_referer && exists($rum_referers{$rum_as_string}));

    push(@EXTRASTUFF,$w3http::NOUSER)
      unless ($do_user);
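    # At this point @EXTRASTUFF holds the optional key/value arguments
    # for w3http::query.  For a revisit of an already-mirrored page
    # with referer headers enabled it might look like (values invented
    # for illustration):
    #
    #   ($w3http::IFMODF, 'docs/page.html',
    #    $w3http::REFERER, 'http://www.example.com/')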
    # YES, $lf_url is right, w3http::query handles this like an url so
    # the quoting must all be in place.
    my $binfile=$lf_url;
    $binfile='-' if $s==1;
    $binfile=$nulldevice if $s==2;

    if ($pause) {
      print STDERR "w3mir: sleeping\n" if $verbose>0;
      sleep($pause);
    }

    print STDERR "w3mir: ".($infoloss?$rum_as_string:$lf_name)
      unless $verbose<0;
    print STDERR "\nFile: $lf_name\n" if $debug;

    &w3http::query($w3http::GETURL,$rum_as_string,
                   $w3http::SAVEBIN,$binfile,
                   @EXTRASTUFF);

    print STDERR "w3http::result: '",$w3http::result,
      "' doc size: ", length($w3http::document),
      " doc type: -",$w3http::headval{'CONTENT-TYPE'},
      "- plaintexthtml: ",$w3http::plaintexthtml,"\n"
        if $debug;

    print "Result: ",$w3http::result," Recurse: $r, html: ",
      $w3http::plaintexthtml,"\n"
        if $debug;

  } # if $do_query

  if ($w3http::result==200) {   # 200 OK
    $rum_urlstat{$rum_as_string}=$GOTIT;

    if ($mine_urls || $process_urls) {

      if ($w3http::plaintexthtml) {
        # Only do URL manipulations if this is a html document with no
        # special content-encoding.  We do not handle encodings, yet.

        my $page;

        print STDERR ($process_urls?", processing":", url mining")
          if $verbose>0;

        print STDERR "\nurl:'$lf_url'\n"
          if $debug;

        print "\nMining URLs: $mine_urls, Process: $process_urls\n"
          if $debug;

        ($page,@rum_newurls) =
          &htmlop::process($w3http::document,
                           # Only get a new document if wanted
                           $process_urls?():($htmlop::NODOC),
                           $htmlop::CANON,
                           $htmlop::ABS,$rum_url_o,
                           # Only list urls if wanted
                           $mine_urls?($htmlop::LIST):(),

                           # If user wants absolute URLs do not
                           # relativize them

                           $abs?
                           ():
                           (
                            $htmlop::TAGCALLBACK,\&process_tag,$lf_url,
                           )
                          );

#       print "URL: ",join("\nURL: ",@rum_newurls),"\n";

        if ($process_urls) {
          $page_ref=\$page;
          $w3http::document='';
        } else {
          $page_ref=\$w3http::document;
        }

      } elsif ($s == 0 &&
               ($url_extractor =
                $knownformats{$w3http::headval{'CONTENT-TYPE'}})) {

        # The knownformats extractors only work on disk files so write
        # doc to disk if not there already (non-html text will not be)
        write_page($lf_name,$w3http::document,1);

        # Now we try our hand at fetching URIs from non-html files.
        print STDERR ", mining URLs" if $verbose>=1;
        @rum_newurls = &$url_extractor($lf_name);
        # warn "URLs from PDF: ",join(', ',@rum_newurls),"\n";
      }


    } # if ($mine_urls || $process_urls)

#    print "page_ref defined: ",defined($page_ref),"\n";
#    print "plaintext: ",$w3http::plaintext,"\n";

    $page_ref=\$w3http::document
      if !defined($page_ref) && $w3http::plaintexthtml;

    if ($w3http::plaintexthtml) {
      # ark: this is where I want to do my changes to the page: strip
      # out the <!--NOMIRROR-->...<!--/NOMIRROR--> stuff.
      $$page_ref=~ s/<(!--)?\s*NO\s*MIRROR\s*(--)?>[^\000]*?<(!--)?\s*\/NO\s*MIRROR\s*(--)?>//g
        if $edit;

      if ($header) {
        # ark: insert a header string at the start of the page
        my $mirrorstr=$header;
        $mirrorstr =~ s/\$url/$rum_as_string/g;
        insert_at_start( $mirrorstr, $page_ref );
      }
    }

    write_page($lf_name,$page_ref,0);

    # print "New urls: ",join("\n",@rum_newurls),"\n";

    return @rum_newurls;
  }

  if ($w3http::result==304 ||   # 304 Not modified
      $w3http::result==1304) {  # 1304 Have it

    {
      # last = out of nesting

      my $rum_newurls;

      @rum_newurls=();

      print STDERR ", not modified"
        if $verbose>=0 && $w3http::result==304;

      $rum_urlstat{$rum_as_string}=$NOTMOD;

      last unless $mine_urls;

      $rum_newurls=get_references($lf_name);

      # print "New urls: ",ref($rum_newurls),"\n";

      if (!ref($rum_newurls)) {
        last;
      } elsif (ref($rum_newurls) eq 'SCALAR') {
        $page_ref=$rum_newurls;
      } elsif (ref($rum_newurls) eq 'ARRAY') {
        @rum_newurls=@$rum_newurls;
        last;
      } else {
        die "\nw3mir: internal error: Unknown return type from get_references\n";
      }

      # Check if it's a html file.  I know this tag is in all html
      # files, because I put it there as I pull them in.
      last unless $$page_ref =~ /<HTML/i;

      warn "$lf_name is a html file\n" if $debug;

      # It's a html document
      print STDERR ", mining URLs" if $verbose>=1;

      # This will give us a list of absolute urls
      (undef,@rum_newurls) =
        &htmlop::process($$page_ref,$htmlop::NODOC,
                         $htmlop::ABS,$rum_as_string,
                         $htmlop::USESAVED,'W3MIR',
                         $htmlop::LIST);
    }

    print STDERR "\n" if $verbose>=0;
    return @rum_newurls;
  }

  if ($w3http::result==302 || $w3http::result==301) { # Redirect
    # Cern and NCSA httpd send 302 'redirect' if an ending / is
    # forgotten on a url.  More recent httpds send 301 'permanent
    # redirect' in this case.  Here we check if the difference in URLs
    # is just a / and if so push the url again with the / added.  This
    # code only works if the http server has the right idea about its
    # own name.
    #
    # 18/3/97: Added code to queue redirected-to-URLs that are within
    # the scope of the retrieval.
    my $new_rum_url;

    $rum_urlstat{$rum_as_string}=$REDIR;

    # Absolutify the new url, it might be relative to the requested
    # document.  That's an ugly wart on some servers/admins.
    $new_rum_url=url $w3http::headval{'location'};
    $new_rum_url=$new_rum_url->abs($rum_url_o);

    print REDIRS $rum_as_string,' -> ',$new_rum_url->as_string,"\n"
      if $fixup;

    if ($immediate_redir) {
      print STDERR " =>> ",$new_rum_url->as_string,", getting that instead\n";
      return get_document($new_rum_url,$lf_url);
    }

    # Some redirect to a fragment of another doc...
    $new_rum_url->frag(undef);
    $new_rum_url=$new_rum_url->as_string;

    if ($rum_as_string.'/' eq $new_rum_url) {
      if (grep { $rum_as_string eq $_; } @root_urls) {
        print STDERR "\nw3mir: missing / in a start URL detected. Please fix commandline/config file.\n";
        exit(1);
      }
      print STDERR ", missing /\n";
      queue($new_rum_url);
      # Initialize referer to something meaningful
      $rum_referers{$new_rum_url}=$rum_referers{$rum_as_string};
    } else {
      print STDERR " =>> $new_rum_url";
      if (want_this($new_rum_url)) {
        print STDERR ", getting that\n";
        queue($new_rum_url);
        $rum_referers{$new_rum_url}=$rum_referers{$rum_as_string};
      } else {
        print STDERR ", don't want it\n";
      }
    }
    return ();
  }

  if ($w3http::result==403 ||   # Forbidden
      $w3http::result==404 ||   # Not found
      $w3http::result==406 ||   # Not Acceptable, hmm, belongs here?
      $w3http::result==410) {   # Gone - no forwarding address known

    $rum_urlstat{$rum_as_string}=$ENOTFND;
    &handleerror;
    print STDERR "Was referred from: ",
      join(',',@{$rum_referers{$rum_as_string}}),
      "\n" if exists($rum_referers{$rum_as_string});
    return ();
  }

  if ($w3http::result==407) {
    # Proxy authentication requested
    die "Proxy server requests authentication but failed to return the\n".
      "REQUIRED Proxy-Authenticate header for this condition\n"
        unless exists($w3http::headval{'proxy-authenticate'});

    die "Proxy authentication is required for ".
      $w3http::headval{'proxy-authenticate'}."\n";
  }

  if ($w3http::result==401) {
    # A www-authenticate reply header should accompany a 401 message.
    if (!exists($w3http::headval{'www-authenticate'})) {
      warn "w3mir: Server indicated authentication failure but gave no www-authenticate reply\n";
      $rum_urlstat{$rum_as_string}=$NEVERMIND;
    } else {
      # Unauthorized
      if ($www_auth) {
        # Failed when authorization data was supplied.
        $rum_urlstat{$rum_as_string}=$NEVERMIND;
        print STDERR ", authorization failed, data needed for ",
          $w3http::headval{'www-authenticate'},"\n"
            if ($verbose>=0);
      } else {
        if ($useauth) {
          # First time failure, send back and retry at once with some
          # known user/passwd.
          $www_auth=$w3http::headval{'www-authenticate'};
          print STDERR ", retrying with authorization\n" unless $verbose<0;
          goto try_again;
        } else {
          print ", authorization needed: ",
            $w3http::headval{'www-authenticate'},"\n";
          $rum_urlstat{$rum_as_string}=$NEVERMIND;
        }
      }
    }
    return ();
  }

  # Something else.
  &handleerror;
}


sub robot_check {
  # Check if URL is allowed by robots.txt, if we respect it at all
  # that is.  Return 1 if allowed, 0 otherwise.

  my($rum_url_o)=shift;
  my $hostport;

  if ($check_robottxt) {

    $hostport = $rum_url_o->netloc;
    if (!exists($gotrobots{$hostport})) {
      # Get robots.txt from the server
      $gotrobots{$hostport}=1;
      my $robourl="http://$hostport/robots.txt";
      print STDERR "w3mir: $robourl" if ($verbose>=0);
      &w3http::query($w3http::GETURL,$robourl);
      $w3http::document='' if ($w3http::result != 200);
      print STDERR ", processing" if $verbose>=1;
      print STDERR "\n" if ($verbose>=0);
      $Robot_Blob->parse($robourl,$w3http::document);
    }

    if (!$Robot_Blob->allowed($rum_url_o->as_string)) {
      # It is forbidden
      $rum_urlstat{$rum_url_o->as_string}=$FROBOTS;
      warn "w3mir: ",$rum_url_o->as_string,": forbidden by robots.txt\n";
      return 0;
    }
  }
  return 1;
}
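# A note on batch_get's local-name derivation (behaviour read off the
# code below, example.com URLs are illustrative only): the local file
# is the last path component of the URL, and a URL ending in '/'
# borrows the component before it plus "-$indexname", e.g.
#
#   http://www.example.com/docs/page.html  ->  page.html
#   http://www.example.com/docs/           ->  docs-index.html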
sub batch_get {
  # Batch get _one_ document.
  my $rum_url=shift;
  my $lf_url;

  $rum_url_o = url $rum_url;

  return unless robot_check($rum_url_o);

  ($lf_url=$rum_url) =~ s~.*/~~;
  if (!defined($lf_url) || $lf_url eq '') {
    ($lf_url=$rum_url) =~ s~/$~~;
    $lf_url =~ s~.*/~~;
    $lf_url .= "-$indexname";
  }

  warn "Batch get: $rum_url -> $lf_url\n" if $debug;

  $immediate_redir=1;   # Do follow redirects immediately

  get_document($rum_url,$lf_url);
}



sub mirror {
  # Mirror (or get) the requested url(s).  Possibly recursively.
  # Working from whatever cwd is at invocation we'll retrieve all
  # files under it in the file hierarchy.

  my $rum_url;  # URL of the document we're getting now, defined at main level
  my $lf_url;   # rum_url after apply, i.e., the local name
  my $new_lf_url;
  my @new_rum_urls;
  my $rum_ref;

  while (defined($rum_url = pop(@rum_queue))) {

    warn "mirror: Popped $rum_url from queue\n" if $debug;

    # Unwanted URLs should not be queued
    die "Found url $rum_url that I don't want in queue!\n"
      unless defined($lf_url=apply($rum_url));

    $rum_url_o = url $rum_url;

    next unless robot_check($rum_url_o);

    # Figure out the filename for our local filesystem.
    $lf_url.=$indexname if $lf_url =~ m~/$~ || $lf_url eq '';

    @new_rum_urls = get_document($rum_url_o,$lf_url);

    print join("\n",@new_rum_urls),"\n" if ($list);

    if ($r) {
      foreach $rum_ref (@new_rum_urls) {
        # warn "Recursive url: $rum_ref\n";
        $new_lf_url=apply($rum_ref);
        next unless $new_lf_url;

        # warn "Want it\n";
        $rum_ref =~ s/\#.*$//s; # Clip off section marks

        add_referer($rum_ref,$rum_url_o->as_string);
        queue($rum_ref);
      }
    }

    @new_rum_urls=();

    # Is the URL queue empty?  Are there outstanding retries?  Refill
    # the queue from the retry list.
    if ($#rum_queue<0 && $retry-->0) {
      foreach $rum_url_o (keys %rum_urlstat) {
        $rum_url_o = url $rum_url_o;
        if ($rum_urlstat{$rum_url_o->as_string}==$TERROR) {
          push(@rum_queue,$rum_url_o->as_string);
          $rum_urlstat{$rum_url_o->as_string}=$QUEUED;
        }
      }
      if ($#rum_queue>=0) {
        warn "w3mir: Sleeping before retrying. $retry more times left\n"
          if $verbose>=0;
        sleep($retryPause);
      }
    }

  }
}


sub get_references {
  # Get references from a non-html-on-disk file.  Return references if
  # we know how to find them.  Return a reference to the complete page
  # if it's html.  Return a single numerical 0 if unknown format.

  my($lf_url)=shift;
  my($url_extractor)=shift;     # Optional, otherwise found by magic number

  my $read;     # Buffer of stuff read from file to check filetype
  my $magic;
  my $rum_ref;
  my $page;

  warn "w3mir: Looking at local $lf_url\n" if $debug;

  # Open file and read the first 10 kilobytes for file-type-test
  # purposes.
  if (!open(TMPF,$lf_url)) {
    warn "Unable to open $lf_url for reading: $!\n";
    return 0;
  }

  $page=' 'x10240;
  $read=sysread(TMPF,$page,length($page),0);
  close(TMPF);

  die "Error reading $lf_url: $!\n" if (!defined($read));

  if (!defined($url_extractor)) {
    $url_extractor=0;

    # Check file against list of magic numbers.
    foreach $magic (keys %knownmagic) {
      if (substr($page,0,length($magic)) eq $magic) {
        $url_extractor = $knownformats{$knownmagic{$magic}};
        last;
      }
    }
  }

  # Found an extraction method, apply.
  if ($url_extractor) {
    print STDERR ", mining URLs" if $verbose>=1;
    return [&$url_extractor($lf_url)];
  }

  if ($page =~ /<HTML/i) {
    open(TMPF,$lf_url) ||
      die "Could not open $lf_url for reading: $!\n";
    # read the whole file.
    local($/)=undef;
    $page = <TMPF>;
    close(TMPF);
    return \$page;
  }

  return 0;
}


sub open_fixup {
  # Open the referers and redirects files

  my $reffile='.referers';
  my $redirfile='.redirs';
  my $removedfile='.notmirrored';

  if ($win32) {
    $reffile="referers";
    $redirfile="redirs";
    $removedfile="notmir";
  }

  $nodelete{$reffile} = $nodelete{$redirfile} = $nodelete{$removedfile} = 1;

  $removedfile=$nulldevice unless $list_nomir;

  open(REDIRS,"> $redirfile") ||
    die "Could not open $redirfile for writing: $!\n";

  autoflush REDIRS 1;

  open(REFERERS,"> $reffile") ||
    die "Could not open $reffile for writing: $!\n";

  $fixopen=1;

  open(REMOVED,"> $removedfile") ||
    die "Could not open $removedfile for writing: $!\n";

  autoflush REMOVED 1;

  eval 'END { close_fixup; 0; }';
}


sub close_fixup {
  # Close the fixup data files.  In the case of the referer file also
  # write the entire content.

  return unless $fixopen;

  my $referer;

  foreach $referer (keys %rum_referers) {
    print REFERERS $referer," <- ",join(' ',@{$rum_referers{$referer}}),"\n";
  }

  close(REFERERS) || warn "Error closing referers file: $!\n";
  close(REDIRS) || warn "Error closing redirects file: $!\n";
  close(REMOVED) || warn "Error closing 'removed' file: $!\n";
  $fixopen=0;
}


sub clean_disk {
  # This procedure removes files that are not present on the server(s)
  # anymore.

  # - To avoid removing files that were not fetched due to network
  #   problems we only do blanket removal IFF all documents were
  #   fetched w/o problems, eventually.
  # - In any case we can remove files the server said were not found.

  # The strategy has three main parts:
  # 1. Find all files we have
  # 2. Find what files we ought to have
  # 3. Remove the difference

  my $complete_retrival=1;  # Flag saying IFF all documents were fetched
  my $urlstat;              # Tmp storage
  my $rum_url;
  my $lf_url;
  my $lf_dir;
  my $dirs_to_remove;

  # For fileremoval code
  eval "use File::Find;" unless defined(&find);

  die "w3mir: Could not load File::Find module. Don't use -R switch.\n"
    unless defined(&find);

  # This to shut up -w
  $lf_dir=$File::Find::dir;

  # ***** 1. Find out what files we have *****
  #
  # This does two things.  For each file or directory found:
  #  - Increases the entry count for the containing directory
  #  - If it's a file: $lf_file{relative_path}=$FILEHERE;

  chop(@root_dirs);
  print STDERR "Looking in: ",join(", ",@root_dirs),"\n" if $debug;

  find(\&find_files,@root_dirs);
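  # To illustrate the survey pass (hypothetical names): after find()
  # has run over a mirror rooted at ./w3mir-example, %lf_file maps each
  # regular file to its disposition, initially $FILEHERE, e.g.
  #
  #   './w3mir-example/index.html'    => $FILEHERE,
  #   './w3mir-example/pics/logo.gif' => $FILEHERE,
  #
  # while %lf_dir counts entries per directory so step 3 can spot
  # directories that end up empty.  Step 2 below flips %lf_file entries
  # to $FILETHERE or $FILEDEL according to %rum_urlstat.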
  # ***** 2. Find out what files we ought to have *****
  #
  # First we loop over %rum_urlstat to determine what files are not
  # present on the server(s).

  foreach $rum_url (keys %rum_urlstat) {
    # Figure out name of local file from rum_url
    next unless defined($lf_url=apply($rum_url));

    $lf_url.=$indexname if $lf_url =~ m~/$~ || $lf_url eq '';

    # find prefixes ./, we must too.
    $lf_url="./".$lf_url unless substr($lf_url,0,1) eq '/';

    # Ignore if file does not exist here.
    next unless exists($lf_file{$lf_url});

    # The apply rules can map several remote files to the same local
    # file.  If we decided to keep the file already we stay with that.
    next if $lf_file{$lf_url}==$FILETHERE;

    $urlstat=$rum_urlstat{$rum_url};

    # Figure out the status code.
    if ($urlstat==$GOTIT || $urlstat==$NOTMOD) {
      # Present on server.  Keep.
      $lf_file{$lf_url}=$FILETHERE;
      next;
    } elsif ($urlstat==$ENOTFND || $urlstat==$NEVERMIND ) {
      # One of: not on server, can't get, don't want, access forbidden:
      # Schedule for removal.
      $lf_file{$lf_url}=$FILEDEL if exists($lf_file{$lf_url});
      next;
    } elsif ($urlstat==$OTHERERR || $urlstat==$TERROR) {
      # Some error occurred transferring.
      $complete_retrival=0;     # The retrieval was not complete.  Delete less.
    } elsif ($urlstat==$QUEUED) {
      warn "w3mir: Internal inconsistency, $rum_url marked as queued after retrieval terminated\n";
      $complete_retrival=0;     # Fishy.  Be conservative about removing
    } else {
      $complete_retrival=0;
      warn "w3mir: Warning: $rum_url is marked as $urlstat.\n".
        "w3mir: Please report to w3mir-core\@usit.uio.no.\n";
    }
  } # foreach %rum_urlstat

  # ***** 3. Remove the difference *****

  # Loop over all found files:
  #  - Should we have this file?
  #  - If not: Remove file and decrease directory entry count
  # Loop as long as there are directories with 0 entry count:
  #  - Loop over all directories with 0 entry count:
  #    - Remove directory
  #    - Decrease entry count of parent

  warn "w3mir: Some error occurred, conservative file removal\n"
    if !$complete_retrival && $verbose>=0;

  # Remove all files we don't want removed from list of files present:
  foreach $lf_url (keys %nodelete) {
    print STDERR "Not deleting: $lf_url\n" if $verbose>=1;
    delete $lf_file{$lf_url} || delete $lf_file{'./'.$lf_url};
  }

  # Remove files
  foreach $lf_url (keys %lf_file) {
    if (($complete_retrival && $lf_file{$lf_url}==$FILEHERE) ||
        ($lf_file{$lf_url} == $FILEDEL)) {
      if (unlink $lf_url) {
        ($lf_dir)= $lf_url =~ m~^(.+)/~;
        $lf_dir{$lf_dir}--;
        $dirs_to_remove=1 if ($lf_dir{$lf_dir}==0);
        warn "w3mir: removed file $lf_url\n" if $verbose>=0;
      } else {
        warn "w3mir: removal of file $lf_url failed: $!\n";
      }
    }
  }

  # Remove empty directories
  while ($dirs_to_remove) {
    $dirs_to_remove=0;
    foreach $lf_url (keys %lf_dir) {
      next if $lf_url eq '.';
      if ($lf_dir{$lf_url}==0) {
        if (rmdir($lf_url)) {
          warn "w3mir: removed directory $lf_url\n" if $verbose>=0;
          delete $lf_dir{$lf_url};
          ($lf_dir)= $lf_url =~ m~^(.+)/~;
          $lf_dir{$lf_dir}--;
          $dirs_to_remove=1 if ($lf_dir{$lf_dir}==0);
        } else {
          warn "w3mir: removal of directory $lf_url failed: $!\n";
        }
      }
    }
  }
}


sub find_files {
  # This is called by the find procedure for every file/dir found.
  # This builds two hashes:
  #  lf_file{<file>}: 1: file exists
  #  lf_dir{<dir>}: Number of files/dirs in directory.

  lstat($_);

  $lf_dir{$File::Find::dir}++;

  if (-f _) {
    $lf_file{$File::Find::name}=$FILEHERE;
  } elsif (-d _) {
    # null
    # Bug: If an empty directory exists it will not be removed
  } else {
    warn "w3mir: File $File::Find::name has unknown type. Ignoring.\n";
  }
  return 0;

}


sub handleerror {
  # Handle error status of last http connection; will set the
  # rum_urlstat appropriately and print an error message.

  my $msg;

  if ($verbose<0) {
    $msg="w3mir: ".$rum_url_o->as_string.": ";
  } else {
    $msg=": ";
  }

  if ($w3http::result == 98) {
    # OS/Network error
    $msg .= "$!";
    $rum_urlstat{$rum_url_o->as_string}=$OTHERERR;
  } elsif ($w3http::result == 100) {
    # Some kind of error connecting or sending request
    $msg .= $w3http::restext || "Timeout";
    $rum_urlstat{$rum_url_o->as_string}=$TERROR;
  } else {
    # Other HTTP error
    $rum_urlstat{$rum_url_o->as_string}=$OTHERERR;
    $msg .= " ".$w3http::result." ".$w3http::restext;
    $msg .= " =>> ".$w3http::headval{'location'}
      if (defined($w3http::headval{'location'}));
  }
  print STDERR "$msg\n";
}


sub queue {
  # Queue given url if appropriate and create a status entry for it
  my($rum_url_o)=url $_[0];

  croak("BUG: undefined \$rum_url_o")
    if !defined($rum_url_o);

  croak("BUG: undefined \$rum_url_o->as_string")
    if !defined($rum_url_o->as_string);

  croak("BUG: ".$rum_url_o->as_string." (fragment) queued")
    if $rum_url_o->as_string =~ /\#/;

  return if exists($rum_urlstat{$rum_url_o->as_string});
  return unless want_this($rum_url_o->as_string);

  warn "QUEUED: ",$rum_url_o->as_string,"\n" if $debug;

  # Note lack of scope checks.
  $rum_urlstat{$rum_url_o->as_string}=$QUEUED;
  push(@rum_queue,$rum_url_o->as_string);
}


sub root_queue {
  # Queue function for root urls and directories.  One or the other
  # might be boolean false; in that case, don't queue it.

  my $root_url_o;

  my($root_url)=shift;
  my($root_dir)=shift;

  die "w3mir: No fragments in start URLs: ".$root_url."\n"
    if $root_url =~ /\#/;

  if ($root_dir) {
    print "Root dir: $root_dir\n" if $debug;
    $root_dir="./$root_dir" unless substr($root_dir,0,1) eq '/' or
      substr($root_dir,0,2) eq './';
    push(@root_dirs,$root_dir);
  }


  if ($root_url) {
    $root_url_o=url $root_url;

    # URL canonification, or what we do of it at least.
    $root_url_o->host($root_url_o->host);

    warn "Root queue: ".$root_url_o->as_string."\n" if $debug;

    push(@root_urls,$root_url_o->as_string);

    return $root_url_o;
  }

}


sub write_page {
  # Write a retrieved page to wherever it's supposed to be written.
  # Added difficulty: all files but plaintext files have already been
  # written to disk in w3http.
  # $s == 0 save to disk
  # $s == 1 dump to stdout
  # $s == 2 forget

  my($lf_name,$page_ref,$silent) = @_;
  my($verb);

  if ($silent) {
    $verb=-1;
  } else {
    $verb=$verbose;
  }

#  confess("\n\$page_ref undefined") if !defined($page_ref);

  if ($w3http::plaintexthtml) {
    # I have it in memory
    if ($s==0) {
      print STDERR ", saving" if $verb>0;

      while (-d $lf_name) {
        # This will run once, maybe twice; $fiddled will be changed the
        # first time
        if (exists($fiddled{$lf_name})) {
          warn "Cannot save $lf_name, there is a directory in the way\n";
          return;
        }

        $fiddled{$lf_name}=1;

        rm_rf($lf_name);
        print STDERR "w3mir: $lf_name" if $verbose>=0;
      }

      if (!open(PAGE,">$lf_name")) {
        warn "\nw3mir: can't open $lf_name for writing: $!\n";
        return;
      }
      if (!$convertnl) {
        binmode PAGE;
        warn "BINMODE\n" if $debug;
      }
      if ($$page_ref ne '') {
        print PAGE $$page_ref
          or die "w3mir: Error writing $lf_name: $!\n";
      }
      close(PAGE) || die "w3mir: Error closing $lf_name: $!\n";
      print STDERR ": ", length($$page_ref), " bytes\n"
        if $verb>=0;
      setmtime($lf_name,$w3http::headval{'last-modified'})
        if exists($w3http::headval{'last-modified'});
    } elsif ($s==1) {
      print $$page_ref;
    } elsif ($s==2) {
      print STDERR ", got and forgot it.\n" unless $verb<0;
    }
  } else {
    # Already written by http module, just emit a message if wanted
    if ($s==0) {
      print STDERR ": ",$w3http::doclen," bytes\n"
        if $verb>=0;
      setmtime($lf_name,$w3http::headval{'last-modified'})
        if exists($w3http::headval{'last-modified'});
    } elsif ($s==2) {
      print STDERR ", got and forgot it.\n" if $verb>=0;
    }
  }
}


sub setmtime {
  # Set mtime of the given file
  my($file,$time)=@_;

  carp("\$time is undefined"),return if !defined($time);

  my $tics=str2time($time);
  utime(time, $tics, $file) ||
    warn "Could not change mtime of $file: $!\n";
}


sub movefile {
  # Rename a file.  Note that copy is not a good alternative, since
  # copying over NFS is something we want to Avoid.

  # Returns 0 in case of failure and 1 in case of success.

  (my $old,my $new) = @_;

  # Remove anything that might have the name already.
  if (-d $new) {
    print STDERR "\n" if $verbose>=0;
    rm_rf($new);
    $fiddled{$new}=1;
    print STDERR "w3mir: $new" if $verbose>=0;
  } elsif (-e $new) {
    $fiddled{$new}=1;
    if (unlink($new)) {
      print STDERR "\nw3mir: removed $new\nw3mir: $new"
        if $verbose>=0;
    } else {
      return 0;
    }

  }

  if ($new ne '-' && $new ne $nulldevice) {
    warn "MOVING $old -> $new\n" if $debug;
    unless (rename($old,$new)) {
      warn "Could not rename $old to $new: $!\n";
      return 0;
    }
  }
  return 1;
}
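# Usage note for the &mkdir helper that follows (the path is made up):
#
#   # Creates ./pub and ./pub/docs as needed; the last component,
#   # page.html, is treated as a filename and NOT created:
#   &mkdir('pub/docs/page.html');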
sub mkdir {
  # Make all intermediate directories needed for a file, the file name
  # is expected to be included in the argument!

  # Reasons for not using File::Path::mkpath:
  # - I already wrote this.
  # - I get to be able to produce as good and precise error messages as
  #   unix and perl will allow me.  mkpath will not.
  # - It's easier to find out if it worked or not.

  my($file) = @_;
  my(@dirs) = split("/",$file);
  my $path;
  my $dir;

  if (!$dirs[0]) {
    shift @dirs;
    $path='';
  } else {
    $path = '.';
  }

  # This removes the last element of the array; it's meant to shave
  # off the file name, leaving only the directory name, as a
  # convenience for the caller.
  pop @dirs;
  foreach $dir (@dirs) {
    $path .= "/$dir";
    stat($path);
    # only make if it isn't already there
    next if -d _;

    while (!-d _) {
      if (exists($fiddled{$path})) {
        warn "Cannot make directory $path, there is a file in the way.\n";
        return;
      }

      $fiddled{$path}=1;

      if (!-e _) {
        mkdir($path,0777);      # the builtin mkdir, not this sub
        last;
      }

      if (unlink($path)) {
        warn "w3mir: removed file $path\n" if $verbose>=0;
      } else {
        warn "Unable to remove $path: $!\n";
        next;
      }

      warn "mkdir $path\n" if $debug;
      mkdir($path,0777) ||
        warn "Unable to create directory $path: $!\n";

      stat($path);
    }
  }
}


sub add_referer {
  # Add a referer to the list of referers of a document.  Unless it's
  # already there.
  # Don't mail me if you (only) think this is a bit of a tongue-twister:

  # Don't remember referers if BOTH fixup and referer header are disabled.
  return if $fixup==0 && $do_referer==0;

  my($rum_referee,$rum_referer) = @_ ;
  my $re_rum_referer;

  if (exists($rum_referers{$rum_referee})) {
    $re_rum_referer=quotemeta $rum_referer;
    if (!grep(m/^$re_rum_referer$/,@{$rum_referers{$rum_referee}})) {
      push(@{$rum_referers{$rum_referee}},$rum_referer);
      # warn "$rum_referee <- $rum_referer pushed\n";
    } else {
      # warn "$rum_referee <- $rum_referer NOT pushed\n";
    }
  } else {
    $rum_referers{$rum_referee}=[$rum_referer];
    # warn "$rum_referee <- $rum_referer pushed\n";
  }
}


sub user_apply {
  # Apply the user apply rules

  return &$user_apply_code(shift);

# Debug version:
#  my ($foo,$bar);
#  $foo=shift;
#  $bar=&$apply_code($foo);
#  print STDERR "Apply: $foo -> $bar\n";
#  return $bar;
}

sub internal_apply {
  # Apply the w3mir generated apply rules

  return &$apply_code(shift);
}


sub apply {
  # Apply the user apply rules.  Then, if the URL is wanted, return
  # the result of the w3mir apply rules.  Return the undefined value
  # otherwise.

  my $url = user_apply(shift);

  return internal_apply($url)
    if want_this($url);

  # print REMOVED $url,"\n";
  return undef;
}


sub want_this {
  # Find out if we want the url passed.  Just pass it on to the
  # generated functions.
  my($rum_url)=shift;

  # What about robot rules?

  # Does scope rule want this?
  return &$scope_code($rum_url) &&
    # Does user rule want this too?
    &$rule_code($rum_url);

}
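# A worked example of the apply/scope pipeline above, with an invented
# root URL: after 'w3mir -r http://www.example.com/docs/', parse_args
# (below) generates an internal apply rule roughly like
# 's/^http:\/\/www\.example\.com\/docs\///', so
#
#   apply('http://www.example.com/docs/a/b.html')
#
# passes the (empty) user rules, is accepted by the scope code, and
# comes back as the local name 'a/b.html'.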
sub add_referer {
  # Add a referer to the list of referers of a document. Unless it's
  # already there.
  # Don't mail me if you (only) think this is a bit like a tongue-twister:

  # Don't remember referers if BOTH fixup and referer header are disabled.
  return if $fixup==0 && $do_referer==0;

  my($rum_referee,$rum_referer) = @_ ;
  my $re_rum_referer;

  if (exists($rum_referers{$rum_referee})) {
    $re_rum_referer=quotemeta $rum_referer;
    if (!grep(m/^$re_rum_referer$/,@{$rum_referers{$rum_referee}})) {
      push(@{$rum_referers{$rum_referee}},$rum_referer);
      # warn "$rum_referee <- $rum_referer pushed\n";
    } else {
      # warn "$rum_referee <- $rum_referer NOT pushed\n";
    }
  } else {
    $rum_referers{$rum_referee}=[$rum_referer];
    # warn "$rum_referee <- $rum_referer pushed\n";
  }
}


sub user_apply {
  # Apply the user apply rules

  return &$user_apply_code(shift);

# Debug version:
#  my ($foo,$bar);
#  $foo=shift;
#  $bar=&$apply_code($foo);
#  print STDERR "Apply: $foo -> $bar\n";
#  return $bar;
}

sub internal_apply {
  # Apply the w3mir generated apply rules

  return &$apply_code(shift);
}


sub apply {
  # Apply the user apply rules. Then if URL is wanted return result of
  # w3mir apply rules. Return the undefined value otherwise.

  my $url = user_apply(shift);

  return internal_apply($url)
    if want_this($url);

  # print REMOVED $url,"\n";
  return undef;
}


sub want_this {
  # Find out if we want the url passed. Just pass it on to the
  # generated functions.
  my($rum_url)=shift;

  # What about robot rules?

  # Does scope rule want this?
  return &$scope_code($rum_url) &&
    # Does user rule want this too?
    &$rule_code($rum_url)

}

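# Example (a sketch, hypothetical URL): apply() runs the user Apply
# rules first, then asks want_this() whether the result is in scope,
# and only then maps it from rum (remote) space to lf (local file)
# space:
#   my $lf = apply('http://www.foo.org/gazonk/doc.html');
#   # e.g. 'gazonk/doc.html' inside the mirror, or undef if unwanted
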
sub process_tag {
  # Process a tag in an HTML file
  my $lf_referer = shift;	# User argument
  my $base_url = shift;		# Not used... why not?
  my $tag_name = shift;
  my $url_attrs = shift;

  # Return quickly if no URL attributes
  return unless defined($url_attrs);

  my $attrs = shift;

  my $rum_url;			# The absolute URL
  my $lf_url;			# The local filesystem url
  my $lf_url_o;			# ... and its object
  my $key;

  print STDERR "\nProcess Tag: $tag_name, URL attributes: ",
    join(', ',@{$url_attrs}),"\nbase_url: ",$base_url,"\nlf_referer: ",
    $lf_referer,"\n"
      if $debug>2;

  $lf_referer =~ s~^/~~;
  $lf_referer = "file:/$lf_referer";

  foreach $key (@{$url_attrs}) {
    if (defined($$attrs{$key})) {
      $rum_url=$$attrs{$key};
      print STDERR "$key = $rum_url\n" if $debug;
      $lf_url=apply($rum_url);
      if (defined($lf_url)) {

        print STDERR "Transformed to $lf_url\n" if $debug>2;

        $lf_url =~ s~^/~~;	# Remove leading / to avoid doubling
        $lf_url_o=url "file:/$lf_url";

        # Save new value in the hash
        $$attrs{$key}=($lf_url_o->rel($lf_referer))->as_string;
        print STDERR "New value: ",$$attrs{$key},"\n" if $debug>2;

        # If there is potential information loss save the old value too
        $$attrs{"W3MIR".$key}=$rum_url if $infoloss;
      }
    }
  }
}


sub version {
  eval 'require LWP;';
  print $w3mir_agent,"\n";
  print "LWP version ",$LWP::VERSION,"\n" if defined $LWP::VERSION;
  print "Perl version: ",$],"\n";
  exit(0);
}


sub parse_args {
  my $f;
  my $i;

  $i=0;

  while ($f=shift) {
    $i++;
    $numarg++;
    # This is a demonstration against Getopts::Long.
    if ($f =~ s/^-+//) {
      $s=1,next if $f eq 's';	# Stdout
      $r=1,next if $f eq 'r';	# Recurse
      $fetch=1,next if $f eq 'fa';  # Fetch all, no date test
      $fetch=-1,next if $f eq 'fs'; # Fetch those we don't already have.
      $verbose=-1,next if $f eq 'q'; # Quiet
      $verbose=1,next if $f eq 'c'; # Chatty
      &version,next if $f eq 'v';   # Version
      $pause=shift,next if $f eq 'p'; # Pause between requests
      $retryPause=shift,next if $f eq 'rp'; # Pause between retries.
      $s=2,$convertnl=0,next if $f eq 'f'; # Forget
      $retry=shift,next if $f eq 't'; # reTry
      $list=1,next if $f eq 'l';      # List urls
      $iref=shift,next if $f eq 'ir'; # Initial referer
      $check_robottxt = 0,next if $f eq 'drr'; # Disable robots.txt rules.
      umask(oct(shift)),next if $f eq 'umask';
      parse_cfg_file(shift),next if $f eq 'cfgfile';
      usage(),exit 0 if ($f eq 'help' || $f eq 'h' || $f eq '?');
      $remove=1,next if $f eq 'R';
      $cache_header = 'Pragma: no-cache',next if $f eq 'pflush';
      $w3http::agent=$w3mir_agent=shift,next if $f eq 'agent';
      $abs=1,next if $f eq 'abs';
      $convertnl=0,$batch=1,next if $f eq 'B';
      $read_urls = 1,next if $f eq 'I';
      $convertnl=0,next if $f eq 'nnc';

      if ($f eq 'lc') {
        if ($i == 1) {
          $lc=1;
          $iinline=($lc?"(?i)":"");
          $ipost=($lc?"i":"");
          next;
        } else {
          die "w3mir: -lc must be the first argument on the commandline.\n";
        }
      }

      if ($f eq 'P') {		# Proxy
        ($w3http::proxyserver,$w3http::proxyport)=
          shift =~ /([^:]+):?(\d+)?/;
        $w3http::proxyport=80 unless $w3http::proxyport;
        $using_proxy=1;
        next;
      }

      if ($f eq 'd') {		# Debugging level
        $f=shift;
        unless (($debug = $f) > 0) {
          die "w3mir: debug level must be a number greater than zero.\n";
        }
        next;
      }

      # Those were all the options...
      warn "w3mir: Unknown option: -$f. Use -h for usage info.\n";
      exit(1);

    } elsif ($f =~ /^http:/) {
      my ($rum_url_o,$rum_reurl,$rum_rebase,$server);

      $rum_url_o=root_queue($f,'./');

      $rum_rebase = quotemeta( ($rum_url_o->as_string =~ m~^(.*/)~)[0] );

      push(@internal_apply,"s/^".$rum_rebase."//");
      $scope_fetch.="return 1 if m/^".$rum_rebase."/".$ipost.";\n";
      $scope_ignore.="return 0 if m/^".
        quotemeta("http://".$rum_url_o->netloc."/")."/".$ipost.";\n";

    } else {
      # If we get this far then the commandline is broken
      warn "Unknown commandline argument: $f. Use -h for usage info.\n";
      $numarg--;
      exit(1);
    }
  }
  return 1;
}


sub parse_cfg_file {
  # Read the configuration file. Aborts on errors. Not good to
  # mirror something using the wrong config.

  my ( $file ) = @_ ;
  my ($key, $value, $authserver,$authrealm,$authuser,$authpasswd);
  my $i;

  die "w3mir: config file $file is not a file.\n" unless -f $file;
  open(CFGF, $file) || die "Could not open config file $file: $!\n";

  $i=0;

  while (<CFGF>) {
    # Trim off various junk
    chomp;
    s/^#.*//;
    s/^\s+|\s+$//g;
    # Anything left?
    next if $_ eq '';
    # Examine remains
    $i++;
    $numarg++;

    ($key, $value) = split(/\s*:\s*/,$_,2);
    $key = lc $key;

    $iref=$value,next if ( $key eq 'initial-referer' );
    $header=$value,next if ( $key eq 'header' );
    $pause=numeric($value),next if ( $key eq 'pause' );
    $retryPause=numeric($value),next if ( $key eq 'retry-pause' );
    $debug=numeric($value),next if ( $key eq 'debug' );
    $retry=numeric($value),next if ( $key eq 'retries' );
    umask(numeric($value)),next if ( $key eq 'umask' );
    $check_robottxt=boolean($value),next if ( $key eq 'robot-rules' );
    $edit=boolean($value),next if ($key eq 'remove-nomirror');
    $indexname=$value,next if ($key eq 'index-name');
    $s=nway($value,'save','stdout','forget'),next
      if ( $key eq 'file-disposition' );
    $verbose=nway($value,'quiet','brief','chatty')-1,next
      if ( $key eq 'verbosity' );
    $w3http::proxyuser=$value,next if $key eq 'http-proxy-user';
    $w3http::proxypasswd=$value,next if $key eq 'http-proxy-passwd';

    if ( $key eq 'cd' ) {
      $chdirto=$value;
      warn "Use of 'cd' is discouraged\n" unless $verbose==-1;
      next;
    }

    if ($key eq 'http-proxy') {
      ($w3http::proxyserver,$w3http::proxyport)=
        $value =~ /([^:]+):?(\d+)?/;
      $w3http::proxyport=80 unless $w3http::proxyport;
      $using_proxy=1;
      next;
    }

    if ($key eq 'proxy-options') {
      my($val,$nval,@popts,$pragma);
      $pragma=1;
      foreach $val (split(/\s*,\s*/,lc $value)) {
        $nval=nway($val,'no-pragma','revalidate','refresh','no-store',);
        # Force use of Cache-control: header
        $pragma=0 if ($nval==0);
        # use to force proxy to revalidate
        $pragma=0,push(@popts,'max-age=0') if ($nval==1);
        # use to force proxy to refresh
        push(@popts,'no-cache') if ($nval==2);
        # use if information transfered is sensitive
        $pragma=0,push(@popts,'no-store') if ($nval==3);
      }
      $cache_header=($pragma?'Pragma: ':'Cache-control: ').join(', ',@popts);
      next;
    }


    if ($key eq 'url') {
      my ($rum_url_o,$lf_dir,$rum_reurl,$rum_rebase);

      # A two argument URL: line?
      if ($value =~ m/^(.+)\s+(.+)/i) {
        # Two arguments.
        # The last is a directory, it must end in /
        $lf_dir=$2;
        $lf_dir.='/' unless $lf_dir =~ m~/$~;

        $rum_url_o=root_queue($1,$lf_dir);

        # The first is a URL, make it more canonical, find the base.
        # The namespace confusion in this section is correct.(??)
        $rum_rebase = quotemeta( ($rum_url_o->as_string =~ m~^(.*/)~)[0] );

        # print "URL: ",$rum_url_o->as_string,"\n";
        # print "Base: $rum_rebase\n";

        # Translate from rum space to lf space:
        push(@internal_apply,"s/^".$rum_rebase."/".quotemeta($lf_dir)."/");

        # That translation could lead to information loss.
        $infoloss=1;

        # Fetch rules test the rum_url_o->as_string. Fetch whatever
        # matches the base.
        $scope_fetch.="return 1 if m/^".$rum_rebase."/".$ipost.";\n";

        # Ignore whatever did not match the base.
        $scope_ignore.="return 0 if m/^".
          quotemeta("http://".$rum_url_o->netloc."/")."/".$ipost.";\n";

      } else {
        $rum_url_o=root_queue($value,'./');

        $rum_rebase = quotemeta( ($rum_url_o->as_string =~ m~^(.*/)~)[0] );

        # Translate from rum space to lf space:
        push(@internal_apply,"s/^".$rum_rebase."//");

        $scope_fetch.="return 1 if m/^".$rum_rebase."/".$ipost.";\n";
        $scope_ignore.="return 0 if m/^".
          quotemeta("http://".$rum_url_o->netloc."/")."/".$ipost.";\n";
      }
      next;
    }

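    # Example (a sketch, hypothetical config line): for
    #   URL: http://www.foo.org/gazonk/ gaz/
    # the base becomes quotemeta('http://www.foo.org/gazonk/'), the
    # generated apply rule rewrites that prefix to 'gaz/' (rum space
    # to lf space), and $scope_fetch/$scope_ignore gain matching
    # return statements for the generated scope function.
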
    if ($key eq 'also-quene') {
      print STDERR
        "Found 'also-quene' keyword, please replace with 'also-queue'\n";
      $key='also-queue';
    }

    if ($key eq 'also' || $key eq 'also-queue') {
      if ($value =~ m/^(.+)\s+(.+)/i) {
        my ($rum_url_o,$rum_url,$lf_dir,$rum_reurl,$rum_rebase);
        # Two arguments.
        # The last is a directory, it must end in /
        # print STDERR "URL ",$1," DIR ",$2,"\n";
        $rum_url=$1;
        $lf_dir=$2;
        $lf_dir.='/' unless $lf_dir =~ m~/$~;
        die "w3mir: The target path in Also: and Also-queue: directives must ".
          "be relative\n"
            if substr($lf_dir,0,1) eq '/';

        if ($key eq 'also-queue') {
          $rum_url_o=root_queue($rum_url,$lf_dir);
        } else {
          root_queue("",$lf_dir);
          $rum_url_o=url $rum_url;
          $rum_url_o->host(lc $rum_url_o->host);
        }

        # The first is a URL, find the base
        $rum_rebase = quotemeta( ($rum_url_o->as_string =~ m~^(.*/)~)[0] );

#	print "URL: $rum_url_o->as_string\n";
#	print "Base: $rum_rebase\n";
#	print "Server: $server\n";

        # Ok, now we can transform and select stuff the right way
        push(@internal_apply,"s/^".$rum_rebase."/".quotemeta($lf_dir)."/");
        $infoloss=1;

        # Fetch rules test the rum_url_o->as_string. Fetch whatever
        # matches the base.
        $scope_fetch.="return 1 if m/^".$rum_rebase."/".$ipost.";\n";

        # Ignore whatever did not match the base. This cures a problem
        # with '..' from the base in rum space pointing within the
        # scope in ra space. We introduced an extra level (or more) of
        # directories with the apply above. Must do the same with
        # 'Also:' directives.
        $scope_ignore.="return 0 if m/^".
          quotemeta("http://".$rum_url_o->netloc."/")."/".$ipost.";\n";
      } else {
        die "Also: requires 2 arguments\n";
      }
      next;
    }

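    # Example (a sketch, hypothetical config lines):
    #   Also:       http://img.foo.org/pics/ pics/
    #   Also-queue: http://img.foo.org/pics/ pics/
    # both widen the scope to everything under pics/ on that server
    # and map it into the local pics/ directory; only Also-queue also
    # queues the URL itself, so plain Also: fetches nothing until some
    # already mirrored document refers into the new scope.
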
    if ($key eq 'quene') {
      print STDERR "Found 'quene' keyword, please replace with 'queue'\n";
      $key='queue';
    }

    if ($key eq 'queue') {
      root_queue($value,"");
      next;
    }

    if ($key eq 'ignore-re' || $key eq 'fetch-re') {
      # Check that it's a RE; better that I am strict here than that
      # Perl fails with compilation errors later.
      unless ($value =~ /^m(.).*\1[gimosx]*$/) {
        print STDERR "w3mir: $value is not a recognized regular expression\n";
        exit 1;
      }
      # Fall through to the next cases!
    }

    if ($key eq 'fetch' || $key eq 'fetch-re') {
      my $expr=$value;
      $expr = wild_re($expr).$ipost if ($key eq 'fetch');
      $rule_text.=' return 1 if '.$expr.";\n";
      next;
    }

    if ($key eq 'ignore' || $key eq 'ignore-re') {
      my $expr=$value;
      $expr = wild_re($expr).$ipost if ($key eq 'ignore');
      # print STDERR "Ignore expression: $expr\n";
      $rule_text.=' return 0 if '.$expr.";\n";
      next;
    }


    if ($key eq 'apply') {
      unless ($value =~ /^s(.).*\1.*\1[gimosxe]*$/) {
        print STDERR
          "w3mir: '$value' is not a recognized regular expression\n";
        exit 1;
      }
      push(@user_apply,$value);
      $infoloss=1;
      next;
    }

    if ($key eq 'agent') {
      $w3http::agent=$w3mir_agent=$value;
      next;
    }

    # The authorization stuff:
    if ($key eq 'auth-domain') {
      $useauth=1;
      ($authserver, $authrealm) = split('/',$value,2);
      die "w3mir: server part of auth-domain has format server[:port]\n"
        unless $authserver =~ /^(\S+(:\d+)?)$|^\*$/;
      $authserver =~ s/:80$//;
      die "w3mir: auth-domain '$value' is not valid\n"
        if !defined($authserver) || !defined($authrealm);
      $authrealm=lc $authrealm;
    }

    $authuser=$value if ($key eq 'auth-user');
    $authpasswd=$value if ($key eq 'auth-passwd');

    # Got a full authentication spec?
    if ($authserver && $authrealm && $authuser && $authpasswd) {
      $authdata{$authserver}{$authrealm}=$authuser.":".$authpasswd;
      print "Authentication for $authserver/$authrealm is ".
        "$authuser/$authpasswd\n" if $verbose>=0;
      # Invalidate tmp vars
      $authserver=$authrealm=$authuser=$authpasswd=undef;
      next;
    }

    next if $key eq 'auth-user' || $key eq 'auth-passwd' ||
      $key eq 'auth-domain';

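    # Example (a sketch, hypothetical config cluster):
    #   Auth-domain: www.foo.org/hidden
    #   Auth-user:   me
    #   Auth-passwd: secret
    # fills $authdata{'www.foo.org'}{'hidden'} with 'me:secret'. A
    # ':80' suffix on the server is stripped above so both equivalent
    # URL forms match the same entry.
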
    if ($key eq 'fetch-options') {
      warn "w3mir: The 'fetch-options' directive has been renamed to 'options'\nw3mir: Please change your configuration file.\n";
      $key='options';
      # Fall through to 'options'!
    }

    if ($key eq 'options') {

      my($val,$nval);
      foreach $val (split(/\s*,\s*/,lc $value)) {
        if ($i==1) {
          $nval=nway($val,'recurse','no-date-check','only-nonexistent',
                     'list-urls','lowercase','remove','batch','read-urls',
                     'abs','no-newline-conv','list-nonmirrored');
          $r=1,next if $nval==0;
          $fetch=1,next if $nval==1;
          $fetch=-1,next if $nval==2;
          $list=1,next if $nval==3;
          if ($nval==4) {
            $lc=1;
            $iinline=($lc?"(?i)":"");
            $ipost=($lc?"i":"");
            next;
          }
          $remove=1,next if $nval==5;
          $convertnl=0,$batch=1,next if $nval==6;
          $read_urls=1,next if $nval==7;
          $abs=1,next if $nval==8;
          $convertnl=0,next if $nval==9;
          $list_nomir=1,next if $nval==10;
        } else {
          die "w3mir: options must be the first directive in the config file.\n";
        }
      }
      next;
    }

    if ($key eq 'disable-headers') {
      my($val,$nval);
      foreach $val (split(/\s*,\s*/,lc $value)) {
        $nval=nway($val,'referer','user');
        $do_referer=0,next if $nval==0;
        $do_user=0,next if $nval==1;
      }
      next;
    }


    if ($key eq 'fixup') {

      $fixrc="$file";
      # warn "Fixrc: $fixrc\n";

      my($val,$nval);
      foreach $val (split(/\s*,\s*/,lc $value)) {
        $nval=nway($val,'on','run','noindex','off');
        $runfix=1,next if $nval==1;
        # Disable fixup
        $fixup=0,next if $nval==3;
        # Ignore everything else
      }
      next;
    }

    die "w3mir: Unrecognized directive ('$key') in config file $file at line $.\n";

  }
  close(CFGF);

  if (defined($w3http::proxypasswd) && $w3http::proxyuser) {
    warn "Proxy authentication: ".$w3http::proxyuser.":".
      $w3http::proxypasswd."\n" if $verbose>=0;
  }

}


sub wild_re {
  # Here we translate a Unix wildcard subset to perlre
  local($_) = shift;

  # Quote anything that's RE and not wildcard: / ( ) \ | { } + $ ^
  s~([\/\(\)\\\|\{\}\+\$\^])~\\$1~g;
  # . -> \.
  s~\.~\\.~g;
  # * -> .*
  s~\*~\.\*~g;
  # ? -> .
  s~\?~\.~g;

  # print STDERR "wild_re: $_\n";

  return $_ = '/'.$_.'/';
}


sub numeric {
  # Check that the argument is a number, return its numeric value
  my ( $number ) = @_ ;
  return oct($number) if ($number =~ /^\d+$/ || $number =~ /^\d+\.\d+$/);
  die "Expected a number, got \"$number\"\n";
}


sub boolean {
  my ( $boolean ) = @_ ;

  $boolean = lc $boolean;

  return 0 if ($boolean eq 'false' || $boolean eq 'off' || $boolean eq '0');
  return 1 if ($boolean eq 'true' || $boolean eq 'on' || $boolean eq '1');
  die "Expected a boolean, got \"$boolean\"\n";
}


sub nway {
  my ( $value ) = shift;
  my ( @values ) = @_;
  my ( $val ) = 0;

  $value = lc $value;
  while (@_) {
    return $val if $value eq shift;
    $val++;
  }
  die "Expected one of ".join(", ",@values).", got \"$value\"\n";
}

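# Example (a sketch): nway('Chatty','quiet','brief','chatty') returns
# 2 (matching is case-insensitive), and parse_cfg_file subtracts 1 to
# get $verbose=1; an unrecognized value dies listing the legal ones.
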
sub insert_at_start {
  # ark: inserts the first arg at the top of the html in the second arg
  # janl: The second arg must be a reference to a scalar.
  my( $str, $text_ref ) = @_;
  my( @possible ) =("<BODY.*?>", "</HEAD.*?>", "</TITLE.*?>", "<HTML.*?>" );
  my( $f, $done );

  $done=0;
  @_=@possible;

  while( $done!=1 && ($f=shift) ){
    # print "Searching for: $f\n";
    if( $$text_ref =~ /$f/i ){
      # print "found it!\n";
      $$text_ref =~ s/($f)/$1\n$str/i;
      $done=1;
    }
  }
}



sub rm_rf {
  # Recursively remove directories and other files
  # File::Path::rmtree does a similar thing but the messages are wrong

  my($remove)=shift;

  eval "use File::Find;" unless defined(&finddepth);

  die "w3mir: Could not load File::Find module when trying to remove $remove\n"
    unless defined(&finddepth);

  die "w3mir: Removal safeguard triggered on '$remove'"
    if $remove =~ m~/\.\./~ || $remove =~ m~/\.\.$~ || $remove =~ m~/\.$~;

  finddepth(\&remove_everything,$remove);

  if (rmdir($remove)) {
    print STDERR "\nw3mir: removed directory $remove\n" if $verbose>=0;
  } else {
    print STDERR "w3mir: could not remove $remove: $!\n";
  }
}


sub remove_everything {
  # This does the removal
  ((-d && rmdir($_)) || unlink($_)) && $verbose>=0 &&
    print STDERR "w3mir: removed $File::Find::name\n";
}



sub usage {
  my($message)=shift @_;

  print STDERR "w3mir: $message\n" if $message;

  die 'w3mir: usage: w3mir [options] <single-http-url>
   or: w3mir -B [-I] [options] [<http-urls>]

 Options :
  -agent <agent>  - Set the agent name. Default is w3mir
  -abs            - Force all URLs to be absolute.
  -B              - Batch-get documents.
  -I              - The URLs to get are read from standard input.
  -c              - be more Chatty.
  -cfgfile <file> - Read config from file.
  -d <debug-level>- Set debug level to 1 or 2.
  -drr            - Disable robots.txt rules.
  -f              - Forget all files, nothing is saved to disk.
  -fa             - Fetch All, will not check timestamps.
  -fs             - Fetch Some, do not fetch the files we already have.
  -ir <referer>   - Initial referer. For picky servers.
  -l              - List URLs in the documents retrieved.
  -lc             - Convert all URLs (and filenames) to lowercase.
                    This does not work reliably.
  -p <n>          - Pause n seconds before retrieving each doc.
  -q              - Quiet, error-messages only.
  -rp <n>         - Retry Pause in seconds.
  -P <server:port>- Use host/port for proxy http requests.
  -pflush         - Flush proxy server.
  -r              - Recursive mirroring.
  -R              - Remove files not referenced or not present on server.
  -s              - Send output to stdout instead of file.
  -t <n>          - How many times to (re)try getting a failed doc?
  -umask <umask>  - Set umask for mirroring, must be usual octal format.
  -nnc            - No Newline Conversion. Disable newline conversions.
  -v              - Show w3mir version.
';
}
__END__
# -*- perl -*- There must be a blank line here

=head1 NAME

w3mir - all-purpose HTTP copying and mirroring tool

=head1 SYNOPSIS

B<w3mir> [B<options>] [I<HTTP-URL>]

B<w3mir> B<-B> [B<options>] <I<HTTP-URLS>>

B<w3mir> is an all-purpose HTTP copying and mirroring tool. The
main focus of B<w3mir> is to create and maintain a browsable copy of
one, or several, remote WWW site(s).

Used to the max w3mir can retrieve the contents of several related
sites and leave the mirror browseable via a local web server, or from
a filesystem, such as directly from a CDROM.

B<w3mir> has options for all operations that are simple enough for
options. For authentication and passwords, multiple site retrievals
and such you will have to resort to a L</CONFIGURATION-FILE>. If
browsing from a filesystem, references ending in '/' need to be
rewritten to end in '/index.html', and in any case URLs that are
redirected will need to be changed to make the mirror browseable; see
the documentation of B<Fixup> in the L</CONFIGURATION-FILE> section.

B<w3mir>'s default behavior is to do as little as possible and to be
as nice as possible to the server(s) it is getting documents from.
You will need to read through the options list to make B<w3mir> do
more complex, and, useful things. Most of the things B<w3mir> can do
are also documented in the w3mir-HOWTO which is available at the
B<w3mir> home-page (F<http://www.math.uio.no/~janl/w3mir/>) as well
as in the w3mir distribution bundle.

=head1 DESCRIPTION

You may specify many options and one HTTP-URL on the w3mir
command line.

A single HTTP URL I<must> be specified either on the command line or
in a B<URL> directive in a configuration file. If the URL refers to a
directory it I<must> end with a "/", otherwise you might get surprised
at what gets retrieved (e.g. rather more than you expect).

Options must be prefixed with at least one - as shown below, you can
use more if you want to. B<-cfgfile> is equivalent to B<--cfgfile> or
even B<------cfgfile>. Options cannot be I<clustered>, i.e., B<-r -R>
is not equivalent to B<-rR>.

=over 4

=item B<-h> | B<-help> | B<-?>

prints a brief summary of all command line options and exits.

=item B<-cfgfile> F<file>

Makes B<w3mir> read the given configuration file. See the next section
for how to write such a file.

=item B<-r>

Puts B<w3mir> into recursive mode. The default is to fetch only one
document and then quit. 'I<Recursive>' mode means that all documents
linked to from the given document are fetched, and all they link to
in turn, and so on; but only if they are in the same directory or
under the same directory as the start document. Any document that is
in or under the starting document's directory is said to be within
the I<scope of retrieval>.

=item B<-fa>

Fetch All. Normally B<w3mir> will only get the document if it has been
updated since the last time it was fetched. This switch turns that
check off.

=item B<-fs>

Fetch Some. Not the opposite of B<-fa>, but rather, fetch the ones we
don't have already. This is handy to restart copying of a site
incompletely copied by earlier, interrupted, runs of B<w3mir>.

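For example, to resume an interrupted recursive copy (a hypothetical
URL):

  w3mir -r -fs http://www.foo.org/
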
=item B<-p> I<n>

Pause for I<n> seconds between getting each document. The default is
30 seconds.

=item B<-rp> I<n>

Retry Pause, in seconds. When B<w3mir> fails to get a document for
some technical reason (timeout mainly) the document will be queued
for a later retry. The retry pause is how long B<w3mir> waits between
finishing a mirror pass and starting a new one to get the still
missing documents. This should be a long time, so network conditions
have a chance to get better. The default is 600 seconds (10 minutes),
which might be a bit too short; for batch running of B<w3mir> I would
suggest an hour (3600 seconds) or more.

=item B<-t> I<n>

Number of reTries. If B<w3mir> cannot get all the documents by the
I<n>th retry B<w3mir> gives up. The default is 3.

=item B<-drr>

Disable Robot Rules. The robot exclusion standard is described in
http://info.webcrawler.com/mak/projects/robots/norobots.html. By
default B<w3mir> honors this standard. This option causes B<w3mir> to
ignore it.

=item B<-nnc>

No Newline Conversion. Normally w3mir converts the newline format of
all files that the web server says are text files. However, not all
web servers are reliable, and so binary files may become corrupted
due to the newline conversion w3mir performs. Use this option to stop
w3mir from converting newlines. This also causes the file to be
regarded as binary when written to disk, to disable the implicit
newline conversion when saving text files on most non-Unix systems.

This will probably be on by default in version 1.1 of w3mir, but not
in version 1.0.

=item B<-R>

Remove files. Normally B<w3mir> will not remove files that are no
longer on the server/part of the retrieved web of files. When this
option is specified all files no longer needed or found on the
servers will be removed. If B<w3mir> fails to get a document for
I<any> other reason the file will not be removed.

=item B<-B>

Batch fetch documents whose URLs are given on the commandline.

In combination with the B<-r> and/or B<-l> switch all HTML and PDF
documents will be mined for URLs, but the documents will be saved on
disk unchanged. When used with the B<-r> switch only one single URL
is allowed. When not used with the B<-r> switch no HTML/URL
processing will be performed at all. When the B<-B> switch is used
with B<-r>, w3mir will not do repeated mirrorings reliably, since the
changes w3mir needs to make in the documents to work reliably are not
made. In any case it's best not to use B<-R> in combination with
B<-B> since that can result in deleting rather more documents than
expected. However, if the person writing the documents being copied
is good about making references relative and placing the <HTML> tag
at the beginning of documents there is a fair chance that things will
work even so. But I wouldn't bet on it. It will, however, work
reliably for repeated mirroring if the B<-r> switch is not used.

When the B<-B> switch is specified redirects for a given document
will be followed no matter where they point. The redirected-to
document will be retrieved in the place of the original document.
This is a potential weakness, since w3mir can be directed to fetch
any document anywhere on the web.

Unless used with B<-r> all retrieved files will be stored in one
directory using the remote filename as the local filename. I.e.,
F<http://foo/bar/gazonk.html> will be saved as F<gazonk.html>.
F<http://foo/bar/> will be saved as F<bar-index.html> so as to avoid
name collisions for the common case of URLs ending in /.

=item B<-I>

This switch can only be used with the B<-B> switch, and only after it
on the commandline or in the configuration file. When given, w3mir
will get URLs from standard input (i.e., w3mir can be used at the end
of a pipe that produces URLs). There should only be one URL per line
of input.

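For example, a hypothetical pipe feeding a list of URLs to w3mir:

  cat urls.txt | w3mir -B -I
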
=item B<-q>

Quiet. Turns off all informational messages, only errors will be
output.

=item B<-c>

Chatty. B<w3mir> will output more progress information. This can be
used if you're watching B<w3mir> work.

=item B<-v>

Version. Output B<w3mir>'s version.

=item B<-s>

Copy the given document(s) to STDOUT.

=item B<-f>

Forget. The retrieved documents are not saved on disk, they are just
forgotten. This can be used to prime the cache in proxy servers, or
not save documents you just want to list the URLs in (see B<-l>).

=item B<-l>

List the URLs referred to in the retrieved document(s) on STDOUT.

=item B<-umask> I<n>

Sets the umask, i.e., the permission bits of all retrieved files. The
number is taken as octal unless it starts with a 0x, in which case
it's taken as hexadecimal. No matter what you set this to, make sure
you get write as well as read access to created files and
directories.

Typical values are:

=over 8

=item 022

let everyone read the files (and directories), only you can change
them.

=item 027

you and everyone in the same file-group as you can read, only you can
change them.

=item 077

only you can read the files, only you can change them.

=item 0

everyone can read, write and change everything.

=back

The default is whatever was set when B<w3mir> was invoked. 022 is a
reasonable value.

This option has no meaning, or effect, on Win32 platforms.

=item B<-P> I<server:port>

Use the given server and port as an HTTP proxy server. If no port is
given port 80 is assumed (this is the normal HTTP port). This is
useful if you are inside a firewall, or use a proxy server to save
bandwidth.

=item B<-pflush>

Proxy flush, force the proxy server to flush its cache and re-get the
document from the source. The I<Pragma: no-cache> HTTP/1.0 header is
used to implement this.

=item B<-ir> I<referrer>

Initial Referrer. Set the referrer of the first retrieved document.
Some servers are reluctant to serve certain documents unless this is
set right.

=item B<-agent> I<agent>

Set the HTTP User-Agent field's value. Some servers will serve
different documents according to the WWW browser's capabilities.
B<w3mir> normally has B<w3mir>/I<version> in this header field.
Netscape uses things like B<Mozilla/3.01 (X11; I; Linux 2.0.30 i586)>
and MSIE uses things like B<Mozilla/2.0 (compatible; MSIE 3.02;
Windows NT)> (remember to enclose agent strings containing spaces in
double quotes (")).

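For example, to masquerade as Netscape when fetching a single page (a
hypothetical URL):

  w3mir -agent "Mozilla/3.01 (X11; I; Linux 2.0.30 i586)" http://www.foo.org/
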
=item B<-lc>

Lower Case URLs. Some OSes, like W95 and NT, are not case sensitive
when it comes to filenames. Thus web masters using such OSes can case
filenames differently in different places (apps.html, Apps.html,
APPS.HTML). If you mirror to a Unix machine this can result in one
file on the server becoming many in the mirror. This option
lowercases all filenames so the mirror corresponds better with the
server.

If given it must be the first option on the command line.

This option does not work perfectly; most especially it fails for
mixed-case host-names.

=item B<-d> I<n>

Set the debug level. A debug level higher than 0 will produce lots of
extra output for debugging purposes.

=item B<-abs>

Force all URLs to be absolute. If you retrieve
F<http://www.ifi.uio.no/~janl/index.html> and it references foo.html,
the reference is absolutified into
F<http://www.ifi.uio.no/~janl/foo.html>. In other words, you get
absolute references to the origin site if you use this option.

=back

=head1 CONFIGURATION-FILE

Most things can be mirrored with a (long) command line. But multi
server mirroring, authentication and some other things are only
available through a configuration file. A configuration file can be
specified with the B<-cfgfile> switch, but w3mir also looks for
.w3mirc (w3mir.ini on Win32 platforms) in the directory where w3mir
is started from.

The configuration file consists of lines of comments and directives.
A directive consists of a keyword followed by a colon (:) and then
one or several arguments.

  # This is a comment. And the next line is a directive:
  Options: recurse, remove

A comment can only start at the beginning of a line. The directive
keywords are not case-sensitive, but the arguments I<might> be.

=over 4

=item Options: I<recurse> | I<no-date-check> | I<only-nonexistent> | I<list-urls> | I<lowercase> | I<remove> | I<batch> | I<input-urls> | I<no-newline-conv> | I<list-nonmirrored>

This must be the first directive in a configuration file.

=over 8

=item I<recurse>

see B<-r> switch.

=item I<no-date-check>

see B<-fa> switch.

=item I<only-nonexistent>

see B<-fs> switch.

=item I<list-urls>

see B<-l> option.

=item I<lowercase>

see B<-lc> option.

=item I<remove>

see B<-R> option.

=item I<batch>

see B<-B> option.

=item I<input-urls>

see B<-I> option.

=item I<no-newline-conv>

see B<-nnc> option.

=item I<list-nonmirrored>

List URLs not mirrored in a file called .notmirrored ('notmir' on
win32). It will contain a lot of duplicate lines and quite possibly
be quite large.

=back

=item URL: I<HTTP-URL> [I<target-directory>]

The URL directive may only appear once in any configuration file.

Without the optional target directory argument it corresponds
directly to the I<single-HTTP-URL> argument on the command line.

If the optional target directory is given all documents from under
the given URL will be stored in that directory, and under. The target
directory is most likely only specified if the B<Also> directive is
also specified.

If the URL given refers to a directory it I<must> end in a "/",
otherwise you might get quite surprised at what gets retrieved.

Either one URL: directive or the single-HTTP-URL at the command-line
I<must> be given.

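For example (a hypothetical site), this stores everything from under
the given URL in, and under, the directory F<foo>:

  URL: http://www.foo.org/gazonk/ foo
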
=item Also: I<HTTP-URL directory>

This directive is only meaningful if the I<recurse> (or B<-r>)
option is given.

The directive enlarges the scope of a recursive retrieval to contain
the given HTTP-URL and all documents in the same directory or under.
Any documents retrieved because of this directive will be stored in
the given directory of the mirror.

In practice this means that if the documents to be retrieved are
stored on several servers, or in several hierarchies on one server,
or any combination of those, then the B<Also> directive ensures that
we get everything into one single mirror.

This also means that if you're retrieving

  URL: http://www.foo.org/gazonk/

but it has inline icons or images stored in http://www.foo.org/icons/
which you will also want to get, then that will be retrieved as well
by entering

  Also: http://www.foo.org/icons/ icons

As with the URL directive, if the URL refers to a directory it
I<must> end in a "/".

Another use for it is when mirroring sites that have several names
that all refer to the same (logical) server:

  URL: http://www.midifest.com/
  Also: http://midifest.com/ .

At this point in time B<w3mir> has no mechanism to easily enlarge the
scope of a mirror after it has been established. That means that you
should survey the documents you are going to retrieve to find out
what icons, graphics and other things they refer to that you want.
And what other sites you might like to retrieve. If you find out that
something is missing you will have to delete the whole mirror, add
the needed B<Also> directives and then reestablish the mirror. This
lack of flexibility in what to retrieve will be addressed at a later
date.

See also the B<Also-queue> directive.

=item Also-queue: I<HTTP-URL directory>

This is like Also, except that the URL itself is also queued. The
Also directive will not cause any documents to be retrieved UNLESS
they are referenced by some other document w3mir has already
retrieved.

=item Queue: I<HTTP-URL>

This queues the URL for retrieval, but does not enlarge the scope of
the retrieval. If the URL is outside the scope of retrieval it will
not be retrieved anyway.

The observant reader will see that B<Also-queue> is like B<Also>
combined with B<Queue>.

=item Initial-referer: I<referer>

see B<-ir> option.

=item Ignore: F<wildcard>

=item Fetch: F<wildcard>

=item Ignore-RE: F<regular-expression>

=item Fetch-RE: F<regular-expression>

These four are used to set up rules about which documents, within the
scope of retrieval, should be fetched and which not. The default is
to get I<anything> that is within the scope of retrieval. That may
not be practical though. This goes for CGI scripts, and especially
server side image maps and other things that are executed/evaluated
on the server. There might be other things you want unfetched as
well.

B<w3mir> stores the I<Ignore>/I<Fetch> rules in a list. When a
document is considered for retrieval the URL is checked against the
list in the same order that the rules appeared in the configuration
file. If the URL matches any rule the search stops at once. If it
matched an I<Ignore> rule the document is not fetched and any URLs in
other documents pointing to it will point to the document at the
original server (not inside the mirror). If it matched a I<Fetch>
rule the document is fetched. If not matched by any rules the
document is fetched.

The F<wildcard>s are a very limited subset of Unix-wildcards.
B<w3mir> understands only 'I<?>', 'I<*>', and 'I<[x-y]>' ranges.

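For example, the hypothetical rule pair

  Fetch: *thumb-[0-9].jpg
  Ignore: *.jpg

fetches thumbnail images numbered 0 through 9 but ignores all other
JPEG files; the I<Fetch> rule must come first since the first
matching rule wins.
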
The F<perl-regular-expression>s are Perl's superset of the normal
Unix regular expression syntax. They must be completely specified,
including the prefixed m, a delimiter of your choice (except the
paired delimiters: parenthesis, brackets and braces), and any of the
RE modifiers. E.g.,

  Ignore-RE: m/.gif$/i

or

  Ignore-RE: m~/.*/.*/.*/~

and so on. "#" cannot be used as delimiter as it is the comment
character in the configuration file. This also has the bad
side-effect of making you unable to match fragment names (#foobar)
directly. Fortunately Perl allows writing ``#'' as ``\043''.

You must be very careful when using the RE anchors (``^'' and ``$'')
with the RE versions of these directives and with the I<Apply>
directive. Given the rules:

  Fetch-RE: m/foobar.cgi$/
  Ignore: *.cgi

then all files called ``foobar.cgi'' will be fetched. However, if the
file is referenced as ``foobar.cgi?query=mp3'' it will I<not> be
fetched since the ``$'' anchor will prevent it from matching the
I<Fetch-RE> directive and then it will match the I<Ignore> directive
instead. If you want to match ``foobar.cgi'' but not ``foobar.cgifu''
you can use Perl's ``\b'' character class which matches a word
boundary:

  Fetch-RE: m/foobar.cgi\b/
  Ignore: *.cgi

which will get ``foobar.cgi'' as well as ``foobar.cgi?query=mp3'' but
not ``foobar.cgifu''. BUT, you must keep in mind that a lot of
different characters make a word boundary, maybe something more
subtle is needed.

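If the word-boundary behavior turns out to be too broad, one more
subtle hypothetical variant is to anchor on either the end of the URL
or a following query separator:

  Fetch-RE: m/foobar.cgi($|\?)/
  Ignore: *.cgi

which matches ``foobar.cgi'' and ``foobar.cgi?query=mp3'' but not
``foobar.cgifu''.
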
=item Apply: I<regular-expression>

This is used to change a URL into another URL. It is a potentially
I<very> powerful feature, and it also provides ample chance for you
to shoot yourself in the foot. The whole apparatus is somewhat
tentative; if you find there is a need for changes in how Apply rules
work please E-mail. If you are going to use this feature please read
the documentation for I<Fetch-RE> and I<Ignore-RE> first.

The B<Apply> expressions are applied, in sequence, to the URLs in
their absolute form. I.e., with the whole
http://host:port/dir/ec/tory/file URL. It is only after this that
B<w3mir> checks if a document is within the scope of retrieval or
not. That means that B<Apply> rules can be used to change certain
URLs to fall inside the scope of retrieval, and vice versa.

The I<regular-expression> is Perl's superset of the usual Unix
regular expressions for substitution. As with I<Fetch> and I<Ignore>
rules it must be specified fully, with the I<s> and delimiting
character. It has the same restrictions with regards to delimiters.
E.g.,

  Apply: s~/foo/~/bar/~i

to translate the path element I<foo> to I<bar> in all URLs.

"#" cannot be used as delimiter as it is the comment character in the
configuration file.

Please note that w3mir expects that URLs identifying 'directories'
keep identifying directories after application of Apply rules. Ditto
for files.

=item Agent: I<agent>

see B<-agent> option.

=item Pause: I<n>

see B<-p> option.

=item Retry-Pause: I<n>

see B<-rp> option.

=item Retries: I<n>

see B<-t> option.

=item debug: I<n>

see B<-d> option.

=item umask I<n>

see B<-umask> option.

=item Robot-Rules: I<on> | I<off>

Turn robot rules on or off. See B<-drr> option.

=item Remove-Nomirror: I<on> | I<off>

If this is enabled sections between two consecutive

  <!--NO MIRROR-->

comments in a mirrored document will be removed. This editing is
performed even if batch getting is specified.

=item Header: I<html/text>

Insert this I<complete> html/text into the start of the document.
This will be done even if batch is specified.

=item File-Disposition: I<save> | I<stdout> | I<forget>

What to do with a retrieved file. The I<save> alternative is the
default. The two others correspond to the B<-s> and B<-f> options.
Only one may be specified.

=item Verbosity: I<quiet> | I<brief> | I<chatty>

How much B<w3mir> informs you of its progress. I<Brief> is the
default. The two others correspond to the B<-q> and B<-c> switches.

=item Cd: I<directory>

Change to the given directory before starting work. If it does not
exist it will be quietly created. Using this option breaks the
'fixup' code so consider not using it, ever.

=item HTTP-Proxy: I<server:port>

see the B<-P> switch.

=item HTTP-Proxy-user: I<username>

=item HTTP-Proxy-passwd: I<password>

These two are used to activate authentication with the proxy server.
B<w3mir> only supports I<basic> proxy authentication, and is quite
simpleminded about it: if proxy authentication is on, B<w3mir> will
always give it to the proxy. The domain concept is not supported with
proxy-authentication.

=item Proxy-Options: I<no-pragma> | I<revalidate> | I<refresh> | I<no-store>

Set proxy options. There are two ways to pass proxy options, HTTP/1.0
compatible and HTTP/1.1 compatible. Newer proxy-servers will
understand the 1.1 way as well as 1.0. With old proxy-servers only
the 1.0 way will work. B<w3mir> will prefer the 1.0 way.

The only 1.0 compatible proxy-option is I<refresh>; it corresponds to
the B<-pflush> option and forces the proxy server to pass the request
to an upstream server to retrieve a I<fresh> copy of the document.

The I<no-pragma> option forces w3mir to use the HTTP/1.1 proxy
control header; use this only with servers you know to be new,
otherwise it won't work at all. Use of any option but I<refresh> will
also cause HTTP/1.1 to be used.

I<revalidate> forces the proxy server to contact the upstream server
to validate that it has a fresh copy of the document. This is nicer
to the net than the I<refresh> option, which forces a re-get of the
document no matter if the server has a fresh copy already.

I<no-store> forbids the proxy from storing the document in anything
other than transient storage. This can be used when transferring
sensitive documents, but is by no means any warranty that the
document can't be found on any storage device on the proxy-server
after the transfer. Cryptography, if legal in your country, is the
solution if you want the contents to be secret.

I<refresh> corresponds to the HTTP/1.0 header I<Pragma: no-cache> or
the identical HTTP/1.1 I<Cache-control> option. I<revalidate> and
I<no-store> correspond to I<max-age=0> and I<no-store> respectively.

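For example, the hypothetical line

  Proxy-Options: revalidate

makes B<w3mir> send the HTTP/1.1 I<Cache-control: max-age=0> header
with its requests, asking the proxy to revalidate its copy of each
document.
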
The "server" is the server. 3092The I<realm> is a HTTP concept. It is simply a grouping of files and 3093documents. One file or a whole directory hierarchy can belong to a 3094realm. One server may have many realms. A user may have separate 3095passwords for each realm, or the same password for all the realms the 3096user has access to. A combination of a server and a realm is called a 3097I<domain>. 3098 3099=over 8 3100 3101=item Auth-Domain: I<server:port/realm> 3102 3103Give the server and port, and the belonging realm (making a domain) 3104that the following authentication data holds for. You may specify "*" 3105wildcard for either of I<server:port> and I<realm>, this will work 3106well if you only have one usernme and password on all the servers 3107mirrored. 3108 3109=item Auth-User: I<user> 3110 3111Your user-name. 3112 3113=item Auth-Passwd: I<password> 3114 3115Your password. 3116 3117=back 3118 3119These three directives may be repeated, in clusters, as many times as 3120needed to give the necessary authentication information 3121 3122=item Disable-Headers: I<referer> | I<user> 3123 3124Stop B<w3mir> from sending the given headers. This can be used for 3125anonymity, making your retrievals harder to track. It will be even 3126harder if you specify a generic B<Agent>, like Netscape. 3127 3128=item Fixup: I<...> 3129 3130This directive controls some aspects of the separate program w3mfix. 3131w3mfix uses the same configuration file as w3mir since it needs a lot 3132of the information in the B<w3mir> configuration file to do it's work 3133correctly. B<w3mfix> is used to make mirrors more browseable on 3134filesystems (disk or CDROM), and to fix redirected URLs and some other 3135URL editing. If you want a mirror to be browseable of disk or CDROM 3136you almost certainly need to run w3mfix. In many cases it is not 3137necessary when you run a mirror to be used through a WWW server. 3138 3139To make B<w3mir> write the data files B<w3mfix> needs, and do nothing 3140else, simply put 3141 3142=over 8 3143 3144 Fixup: on 3145 3146=back 3147 3148in the configuration file. To make B<w3mir> run B<w3mfix> 3149automatically after each time B<w3mir> has completed a mirror run 3150specify 3151 3152=over 8 3153 3154 Fixup: run 3155 3156=back 3157 3158L<w3mfix> is documented in a separate man page in a effort to not 3159prolong I<this> manpage unnecessarily. 3160 3161=item Index-name: I<name-of-index-file> 3162 3163When retriving URLs ending in '/' w3mir needs to append a filename to 3164store it localy. The default value for this is 'index.html' (this is 3165the most used, its use originated in the NCSA HTTPD as far as I know). 3166Some WWW servers use the filename 'Welcome.html' or 'welcome.html' 3167instead (this was the default in the old CERN HTTPD). And servers 3168running on limited OSes frequently use 'index.htm'. To keep things 3169consistent and sane w3mir and the server should use the same name. 3170Put 3171 3172 Index-name: welcome.html 3173 3174when mirroring from a site that uses that convention. 3175 3176When doing a multiserver retrival where the servers use two or more 3177different names for this you should use B<Apply> rules to make the 3178names consistent within the mirror. 3179 3180When making a mirror for use with a WWW server, the mirror should use 3181the same name as the new server for this, to acomplish that 3182B<Index-name> should be combined with B<Apply>. 
=item Fixup: I<...>

This directive controls some aspects of the separate program w3mfix.
w3mfix uses the same configuration file as w3mir since it needs a lot
of the information in the B<w3mir> configuration file to do its work
correctly. B<w3mfix> is used to make mirrors more browseable on
filesystems (disk or CDROM), and to fix redirected URLs and some
other URL editing. If you want a mirror to be browseable off disk or
CDROM you almost certainly need to run w3mfix. In many cases it is
not necessary when you run a mirror to be used through a WWW server.

To make B<w3mir> write the data files B<w3mfix> needs, and do nothing
else, simply put

=over 8

  Fixup: on

=back

in the configuration file. To make B<w3mir> run B<w3mfix>
automatically after each time B<w3mir> has completed a mirror run
specify

=over 8

  Fixup: run

=back

B<w3mfix> is documented in a separate man page in an effort to not
prolong I<this> manpage unnecessarily.

=item Index-name: I<name-of-index-file>

When retrieving URLs ending in '/' w3mir needs to append a filename
to store them locally. The default value for this is 'index.html'
(this is the most used; its use originated in the NCSA HTTPD as far
as I know). Some WWW servers use the filename 'Welcome.html' or
'welcome.html' instead (this was the default in the old CERN HTTPD).
And servers running on limited OSes frequently use 'index.htm'. To
keep things consistent and sane, w3mir and the server should use the
same name. Put

  Index-name: welcome.html

when mirroring from a site that uses that convention.

When doing a multiserver retrieval where the servers use two or more
different names for this you should use B<Apply> rules to make the
names consistent within the mirror.

When making a mirror for use with a WWW server, the mirror should use
the same name as the new server for this; to accomplish that,
B<Index-name> should be combined with B<Apply>.

Here is an example of use in the two latter cases, when Welcome.html
is the preferred I<index> name:

  Index-name: Welcome.html
  Apply: s~/index.html$~/Welcome.html~

Similarly, if index.html is the preferred I<index> name:

  Apply: s~/Welcome.html~/index.html~

I<Index-name> is not needed since index.html is the default index
name.

=back

=head1 EXAMPLES

=over 4

=item * Just get the latest Dr-Fun if it has been changed since the
last time

  w3mir http://sunsite.unc.edu/Dave/Dr-Fun/latest.jpg

=item * Recursively fetch everything on the Star Wars site, remove
what is no longer at the server from the mirror:

  w3mir -R -r http://www.starwars.com/

=item * Fetch the contents of the Sega site through a proxy, pausing
for 30 seconds between each document

  w3mir -r -p 30 -P www.foo.org:4321 http://www.sega.com/

=item * Do everything according to F<w3mir.cfg>

  w3mir -cfgfile w3mir.cfg

=item * A simple configuration file

  # Remember, options first, as many as you like, comma separated
  Options: recurse, remove
  #
  # Start here:
  URL: http://www.starwars.com/
  #
  # Speed things up
  Pause: 0
  #
  # Don't get junk
  Ignore: *.cgi
  Ignore: *-cgi
  Ignore: *.map
  #
  # Proxy:
  HTTP-Proxy: www.foo.org:4321
  #
  # You _should_ cd away from the directory where the config file is.
  cd: starwars
  #
  # Authentication:
  Auth-domain: server:port/realm
  Auth-user: me
  Auth-passwd: my_password
  #
  # You can use '*' in place of server:port and/or realm:
  Auth-domain: */*
  Auth-user: otherme
  Auth-passwd: otherpassword

=item Also:

  # Retrieve all of janl's home pages:
  Options: recurse
  #
  # This is the two argument form of URL:. It fetches the first into
  # the second
  URL: http://www.math.uio.no/~janl/ math/janl
  #
  # These say that any documents referred to that live under these
  # places should be fetched too. Into the named directories. Two
  # arguments are required for 'Also:'.
  Also: http://www.math.uio.no/drift/personer/ math/drift
  Also: http://www.ifi.uio.no/~janl/ ifi/janl
  Also: http://www.mi.uib.no/~nicolai/ math-uib/nicolai
  #
  # The options above will result in this directory hierarchy under
  # where you started w3mir:
  # w3mir/math/janl          files from http://www.math.uio.no/~janl
  # w3mir/math/drift         from http://www.math.uio.no/drift/personer/
  # w3mir/ifi/janl           from http://www.ifi.uio.no/~janl/
  # w3mir/math-uib/nicolai   from http://www.mi.uib.no/~nicolai/

=item Ignore-RE and Fetch-RE

  # Get only jpeg/jpg files, no gifs
  Fetch-RE: m/\.jp(e)?g$/
  Ignore-RE: m/\.gif$/

=item Apply

As I said earlier, B<Apply> has not been used for Real Work yet, that
I know of. But B<Apply> I<could> be used to map all web servers at
the University of Oslo inside the scope of retrieval very easily:

  # Start at the main server
  URL: http://www.uio.no/
  # Change http://*.uio.no and http://129.240.* to be a subdirectory
  # of http://www.uio.no/.
  Apply: s~^http://(.*\.uio\.no(?::\d+)?)/~http://www.uio.no/$1/~i
  Apply: s~^http://(129\.240\.[^:]*(?::\d+)?)/~http://www.uio.no/$1/~i


=back

There are two rather extensive example files in the B<w3mir>
distribution.

=head1 BUGS

=over 4

=item The -lc switch does not work too well.

=back

=head1 FEATURES

These are not bugs.

=over 4

=item URLs with two /es ('//') in the path component do not work as
some might expect. According to my reading of the URL spec. it is an
illegal construct, which is a Good Thing, because I don't know how to
handle it if it's legal.

=item If you start at http://foo/bar/ then index.html might be gotten
twice.

=item Some documents point to a point above the server root, i.e.,
http://some.server/../stuff.html. Netscape, and other browsers, in
defiance of the URL standard documents, will change the URL to
http://some.server/stuff.html. W3mir will not.

=item Authentication is I<only> tried if the server requests it. This
might lead to a lot of extra connections going up and down, but
that's the way it's gotta work for now.

=back

=head1 SEE ALSO

L<w3mfix>

=head1 AUTHORS

B<w3mir>'s authors can be reached at I<w3mir-core@usit.uio.no>.
B<w3mir>'s home page is at http://www.math.uio.no/~janl/w3mir/