1# -*-perl-*-
2
3use Config;
4
5&read_makefile;
6$fullperl = resolve_make_var('FULLPERL') || $Config{'perlpath'};
7$islib = resolve_make_var('INSTALLSITELIB');
8
9$name = $0;
10$name =~ s~^.*/~~;
11$name =~ s~.PL$~~;
12
13open(OUT,"> $name") ||
14  die "Could open $name for writing: $!\n";
15
16print "writing $name\n";
17
18while (<DATA>) {
19  if (m~^\#!/.*/perl.*$~o) {
20    # This substitutes the path perl was installed at on this system
21    # _and_ removed any (-w) options.
22    print OUT "#!",$fullperl,$1,"\n";
23    next;
24  }
25  if (/^use lib/o) {
26    # This substitutes the actuall library install path
27    print OUT "use lib '$islib';\n";
28    next;
29  }
30  print OUT;
31}
32
33close(OUT);
34
35# Make it executable too, and writeable
36chmod 0755, $name;
37
38#### The library
39
40sub resolve_make_var ($) {
41
42  my($var) = shift @_;
43  my($val) = $make{$var};
44
45#  print "Resolving: ",$var,"=",$val,"\n";
46
47  while ($val =~ s~\$\((\S+)\)~$make{$1}~g) {}
48#  print "Resolved: $var: $make{$var} -> $val\n";
49  $val;
50}
51
52
53sub read_makefile {
54
55  open(MAKEFILE, 'Makefile') ||
56    die "Could not open Makefile for reading: $!\n";
57
58  while (<MAKEFILE>) {
59    chomp;
60    next unless m/^([A-Z]+)\s*=\s*(\S+)$/;
61    $make{$1}=$2;
62#    print "Makevar: $1 = $2\n";
63  }
64
65  close(MAKEFILE)
66}
67
68__END__
69#!/usr/bin/perl -w
70# Perl 5.002 or later.  w3mir is mostly tested with perl 5.004
71#
72use lib '/hom/janl/lib/perl';
73#
74# Once upon a long time ago this was Oscar Nierstrasz's
75# <oscar@cui.unige.ch> htget script.
76#
77# Retrieves HTML pages, creating local copies in the _current_
78# directory. The script will check for the last-modified stamp on the
79# document, and will not fetch it if the document isn't changed.
80#
81# Bug list is in w3mir-README.
82#
83# Test cases for janl to use:
84#   w3mir -r -fs http://www.eff.org/ - infinite recursion!
85#     --- but cursory examination seems to indicate confused server...
86#   http://java.sun.com/progGuide/index.html check out the img things.
87#
88# Copyright Holders:
89#   Nicolai Langfeldt, janl@ifi.uio.no
90#   Gorm Haug Eriksen, gorm@usit.uio.no
91#   Chris Szurgot, szurgot@itribe.net
92#   Ed Jordan, ed@olympus.itl.net
93#   Alex Knowles, aknowles@avs.com aka ark.
94# Copying and modification is governed by the "Artistic License" enclosed in
95# the w3mir distribution
96#
97# History (European format date: dd/mm/yy):
98#  oscar 25/03/94 -- added -s option to send output to stdout
99#  oscar 28/03/94 -- made HTTP 1.0 the default
100#  oscar 30/05/94 -- special handling of directory URLs missing a trailing "/"
101#   gorm 20/02/95 -- added mirror capacity + fixed a couple of bugs
102#   janl 28/03/95 -- added a working commandline parser.
103#   janl 18/09/95 -- Changed to use a net http library.  Removed dependency of
104#		     url.pl.
105#   janl 19/09/95 -- Extensive rewrite. Simplified a lot, works better.
106#		     HTML files are now saved in a new and improved manner,
107#		     which means they can be recognized as such w/o fancy
108#		     filename extention type rules.
109# szurgot 27/01/96-- Added "Plaintextmode" wrapper to binmode PAGE.
110#                    binmode page is required under Win32, but broke modified
111#                    checking
112#                 -- Minor change added ; to "# '" strings for Emacs cperl-mode
113# szurgot 07/02/96-- When reading in local file for checking of URLs changed
114#                    local ($/) =0; to equal undef;
115#   janl 08/02/96 -- Added szurgot's changes and changed them :-)
116# szurgot 09/02/96-- Added code to strip /#.*$/ from urls when reading from
117#                    local file
118#                 -- Added hasAlarm variable to w3http.pl. Set to 1 if you have
119#                    alarm(). 0 otherwise.
120#                 -- Moved code setting up the valid extensions list into the
121#                    args processing where it belonged
122#   janl 20/02/96 -- Added szurgot changes again.
123#		  -- Make timeout code work.
124#   		  -- and made another win32 test.
125#   janl 19/03/96 -- Worked through the code for handling not-modified
126#    		     documents, it was a bit shabby after htmlop was intro'ed.
127#   janl 20/03/96 -- -l fix
128#   janl 23/04/96 -- Added -fs by request (by Rik Faith)
129#   janl 16/05/96 -- Made -R mandatory, added use and support for
130#			w3http::SAVEBIN
131# szurgot 19/05/96-- Win95 adaptions.
132#   janl 19/05/96 -- -C did not exactly work as expected.  Thanks to Petr
133#		     Novak for bug descriptions.
134#   janl 19/05/96 -- Changed logic for @didntget, @got and so on to use
135#		     @queue and %urlstat.
136#   janl 09/09/96 -- Removed -R switch.
137#   janl 14/09/96 -- Added ir (initial referer) switch
138#   janl 21/09/96 -- Made retry code saner.  There probably needs to be a
139#                    sleep before retry comencing switch.  When no tty is
140#                    present it should be fairly long.
141#   gorm 15/09/96 -- Added cr (check robot) switch. Default to 1 (on)
142#   janl 22/09/96 -- Modified gorms patch to use WWW::RobotRules.  Changed
143#		     robot switch to be consistent with current w3mir
144#                    practice.
145#   janl 27/09/96 -- Spelling corrections from charles.curran@ox.ac.uk
146#                 -- Folded in manual diffs from ark.
147#    ark 24/09/96 -- Simple facilities to edit the incomming file(s)
148#   janl 27/09/96 -- Added switch to enable <!--NOMIRROR--> editing and
149#		     foolproofed ark's patch a bit.
150#   janl 02/10/96 -- Added -umask switch.
151#		  -- Redirected documents did not have a meaningful referer
152#		     value (it was undefined).
153#		  -- Got w3mir into strict discipline, found some typos...
154#   janl 20/10/96 -- Mtime is preserved
155#   janl 21/10/96 -- -lc switch added.  Mtime preservation works better.
156#   janl 06/11/96 -- Treat 301 like 302.
157#   janl 02/12/96 -- Added config file code, fetch/ignore rules, apply
158#   janl 04/12/96 -- Better checking of config input.
159#   janl 06/12/96 -- Putting together the URL selection/editing brains.
160#   janl 07/12/96 -- Checking out some bugs.  Adding multiscope options.
161#   janl 12/12/96 -- Adding to and defeaturing the multiscope options.
162#   janl 13/12/96 -- Continuing work in multiscope stuff
163#                 -- Unreferenced file and empty directory removal works.
164#   janl 19/02/97 -- Can extract urls from adobe acrobat pdf files :-)
165#		     Important: It does _not_ edit urls, so they still
166#		     point at the original site(s).
167#   janl 21/02/97 -- Fix -lc bug related to case and the apply things.
168#		  -- only use SAVEURL if needed
169#   janl 11/03/97 -- Finish work on SAVEURL conditional.
170#		  -- Fixed directory removal code.
171#                 -- parse_args did not abort when unknown option/argument
172#		     was specified.
173#   janl 12/03/97 -- Made test case for -lc.  Didn't work.  Fixed it.  I think.
174#		     Realized we have bug w.r.t. hostname caseing.
175#   janl 13/03/97 -- All redirected to URLs within scope are now queued.
176#		     That should make the mirror more complete, but it won't
177#		     help browsability when it comes to the redirected doc.
178#		  -- Moved robot retrival to the inside of the mirror loop
179#		     since we now possebly mirror several sites.
180#		  -- Changed 'fetch-options' to 'options'.
181#                 -- Added 'proxy-options'/-pflush to controll proxy server(s).
182#   janl 09/04/97 -- Started using URI::URL.
183#   janl 11/04/97 -- Debugging and using URI::URL more correctly various places
184#   janl 09/05/97 -- Added --agent switch
185#   janl 12/05/97 -- Simplified scope checks for root URL, changed URL 'apply'
186#		     processing.
187#		  -- Small output formating fix in the robot rules code.
188#		  -- Version is now 0.99
189#   janl 14/05/97 -- htmpop no-longer puts '<!DOCTYPE...' into doc, so check
190#		     for '<HTML' instead
191#   janl 11/06/97 -- Made :port optional in server part of auth-domain.
192#			Always removing :80 from server part to match netloc.
193#   janl 22/07/97 -- More debugging of rewrite for new features -B, -I.
194#   janl 01/08/97 -- Fixed bug in RE quoting for Ignore/Fetch
195#   janl 04/08/97 -- s/writepage/write_page/g
196#   janl 07/09/97 -- 0.99b1 is released
197#   janl 19/09/97 -- Kaj Hejer discovers omissions in non-html-url-mining code.
198#                 -- 0.99b2 is released
199#   janl 24/09/97 -- Matt Chapman found bug in realm-name extraction.
200#   janl 10/10/97 -- Referer: header supression supressed User: header instead
201#		  -- Added fixup handling, writes .redirs and .referers
202#		     (no dot in win32)
203#  		  -- Read .w3mirc (w3mir.ini on win32) if present
204#		  -- Stop file removal code from removing these files
205#   janl 16/10/97 -- process_tag was mangling url attributes in tags with more
206#		     than one of them.  Problem found by Robert L. Binkley
207#   janl 04/12/97 -- Fixed problem with authentication, misplaced +
208#		  -- default inter-docuent pause is 0.  I figure it's better
209#		     to keep one httpd occupied in a steady stream than to
210#		     wait for it to die before we talk to it again.
211#   janl 13/12/97 -- The arguments to index.html in the form of index.html/foo
212#		     handling code was incomplete.  To make it complete would
213#		     have been hard, so it was removed.
214#		  -- If a URL changes from file to directory or vice versa
215#		     this is now handled.
216#   janl 11/01/98 -- PDF files with no URLs does not cause warnings now.
217#		  -- Close REFERERS and REDIRECTS before calling w3mfix
218#   janl 22/01/98 -- Proxy authentication as outlined by Christian Geuer
219#   janl 04/02/98 -- Version 1pre1
220#   janl 18/02/98 -- Fixed wild_re after tip by Prentiss Riddle.
221#		  -- Version 1pre2
222#   janl 20/02/98 -- w3http updated to handle complex content-types.
223#		  -- Fix wild_re more, bug noted by James Dumser
224#		  -- 1.0pre3
225#   janl 18/03/98 -- Version 1.0 is released
226#   janl 09/04/98 -- Added feature so user can disable newline conversion.
227#   janl 20/04/98 -- Only convert newlines in HTML files. -> 1.0.2
228#   janl 09/05/98 -- More carefull clean_disk code.
229#		  -- Check if the redirected URL was a root url, if so
230#		     issue a warning and exit.
231#   janl 12/05/98 -- use ->unix_path instead of ->as_string to derive local
232#		     filename.
233#   janl 25/05/98 -- -B didn't work too well.
234#   janl 09/07/98 -- Redirect to fragment broke us, less broken now -> 1.0.4
235#   janl 24/09/98 -- Better errormessages on errors -> 1.0.5
236#   janl 21/11/98 -- Fix errormessages better.
237#   janl 05/01/99 -- Drop 'Referer: (commandline)'
238#   janl 13/04/99 -- Add initial referer to root urls in batch mode.
239#   janl 15/01/00 -- Remove some leftover print statements
240#                 -- Fix also-queue problem as suggested by Sven Koch
241#   janl 04/02/01 -- Use epath instead of path quite often -> 1.0.10
242#
243# Variable name discipline:
244# - remote, umodified URL.  Variables prefixed 'rum_'
245# - local, filesystem.  Variables prefixed 'lf_'.
246# Use these prefixes so we know what we're working with at all times.
247# Also, URL objects are postfixed _o
248#
249# The apply rules and scope rules work this way:
250# - First apply the user rules to the remote url.
251# - Check if document is within scope after this.
252# - Then apply w3mir's rules to the result.  This results is the local,
253#   filesystem, name.
254#
255# We use features introduced in 5.002.
256require 5.002;
257
258# win32 and $nulldevice need to be globals, other modules use them.
259use vars qw($win32 $nulldevice);
260
261# To figure out what kind of system this is
262BEGIN {
263  use Config;
264  $win32 = ( $Config{'osname'} eq 'MSWin32' );
265}
266# More ways to die:
267use Carp;
268# Http module:
269use w3http;
270# html url extraction and manupulation:
271use htmlop;
272# Extract urls from adobe acrobat pdf files:
273use w3pdfuri;
274# Date computer:
275use HTTP::Date;
276# URLs:
277use URI::URL;
278# For flush method
279use FileHandle;
280
281eval '
282use URI;
283$URI::ABS_ALLOW_RELATIVE_SCHEME=1;
284$URI::ABS_REMOTE_LEADING_DOTS=1;
285';
286
287# Full discipline:
288use strict;
289
290# Set params in the http package, HTTP protocol version:
291$w3http::version="1.0";
292
293# The defaults should be for a robotic http agent on good behaviour.
294my $debug=0;			# Debug level
295my $verbose=0;			# Verbosity level, -1 = quiet, 0 = normal, 1...
296my $pause=0;			# Pause between http requests
297my $retryPause=600;		# Pause between retries. 10 minutes.
298my $retry=3;			# Max 3 stabs pr. url.
299my $r=0;			# Recurse? no recursion = absolutify links
300my $remove=0;			# Remove files that are not there?
301my $s=0;			# 0: save on disk 1: stdout 2: just forget 'em
302my $useauth=0;			# Use authorization
303my %authdata;			# Authorization data
304my $check_robottxt = 1;		# Check robots.txt
305my $do_referer = 1;		# Send referers header
306my $do_user = 1;		# Send user header
307my $cache_header = '';		# The cache-control/pragma: no-cache header
308my $using_proxy = 0;		# Using proxy server or not?
309my $batch=0;			# Batch get URLs?
310my $read_urls=0;		# Get urls from STDIN?
311my $abs=0;			# Absolutify URLs?
312my $immediate_redir=0;		# Immediately follow a redirect?
313my @root_urls;			# This is where we start, the root documents
314my @root_dirs;			# The corresponding directories.  for remove
315my $chdirto='';			# Place to chdir to after reading config file
316my %nodelete=();		# Files that should not be deleted
317my $numarg=0;			# Number of arguments accepted.
318my $list_nomir=0;		# List of files not mirrored
319
320# Fixup related things
321my $fixrc='';			# Name of w3mfix config file
322my $fixup=1;			# Do things needed to run fixup
323my $runfix=0;			# Run w3mfix for user?
324my $fixopen=0;			# Fixup files open?
325
326my $indexname='index.html';
327
328my $VERSION;
329$VERSION='1.0.10';
330$w3http::agent = my $w3mir_agent = "w3mir/$VERSION-2001-01-20";
331my $iref='';			# Initial referer.  Must evaluate to false
332
333# Derived settings
334my $mine_urls=0;		# Mine URLs from documents?
335my $process_urls=0;		# Perform (URL) processing of documents?
336
337# Queue of urls to get.
338my @rum_queue = ();
339my @urls = ();
340# URL status map.
341my %rum_urlstat = ();
342# Status codes:
343my $QUEUED   = 0;		# Queued but not gotten yet.
344my $TERROR   = 100;		# Transient error, retry later
345my $HLERR    = 101;		# Permanent error, give up
346my $GOTIT    = 200;		# Gotten. Note similarity to http result code
347my $NOTMOD   = 304;		# Not modified.
348# Negative codes for nonexistent files, easier to check.
349my $NEVERMIND= -1;		# Don't want it
350my $REDIR    = -302;		# Does not exist, redirected
351my $ENOTFND  = -404;		# Does not exist.
352my $OTHERERR = -600;		# Some other error happened
353my $FROBOTS  = -601;		# Forbidden by robots.txt rule
354
355# Directory/files survey:
356my %lf_file;		# What files are present in FS?  Disposition?  One of:
357my $FILEDEL=0;		# Delete file
358my $FILEHERE=1;		# File present in filesystem only
359my $FILETHERE=2;	# File present on server too.
360my %lf_dir;		# Number of files/dirs in dir.  If 0 dir is
361			# eligible for deletion.
362
363my %fiddled=();		# If a file becomes a directory or a directory
364                        # becomes a file it is considered fiddled and
365                        # w3mir will not fiddle with it again in this
366                        # run.
367
368# Bitbucket device, very OS dependent.
369$nulldevice='/dev/null';
370$nulldevice='nul:' if ($win32);
371
372# What to get, and not.
373# Text of user supplied fetch/ignore rules
374my $rule_text=" # User defined fetch/ignore rules\n";
375# Code ref to the rule procedure
376my $rule_code;
377
378# Code to prefix and postfix the generated code.  Prefix should make
379# $_ contain the url to match.  Postfix should return 1, the default
380# is to get the url/file.
381my $rule_prefix='$rule_code = sub { local($_) = shift;'."\n";
382my $rule_postfix=" return 1;\n}";
383
384# Scope tests generated by URL/Also directives in cfg. The scope code
385# is just like the rule code, but used for program generated
386# fetch/ignore rules related to multiscope retrival.
387my $scope_fetch=" # Automatic fetch rules for multiscope retrival\n";
388my $scope_ignore=" # Automatic ignore rules for multiscope retrival\n";
389my $scope_code;
390
391my $scope_prefix='$scope_code = sub { local($_) = shift;'."\n";
392my $scope_postfix=" return 0;\n}";
393
394# Function to apply to urls, se rule comments.
395my $user_apply_code;	# User specified apply code
396my $apply_code;		# w3mirs apply code
397my $apply_prefix='$apply_code = sub { local($_) = @_;'."\n";
398my $apply_lc=' $_ = lc $_; ';
399my $apply_postfix='  return $_;'."\n}";
400my @user_apply;		# List of users apply rules.
401my @internal_apply;	# List of w3mirs apply rules.
402
403my $infoloss=0;		# 1 if any URL translations (which cause
404                        # information loss) are in effect.  If this is
405                        # true we use the SAVEURL operation.
406my $list;		# List url on STDOUT?
407my $edit;		# Edit doc? Remove <!--NOMIRROR>...<!--/NOMIRROR-->
408my $header;		# Text to insert in header
409my $lc=0;		# Convert urls/filenames to lowercase?
410my $fetch=0;		# What to fetch: -1: Some, 0: not modified 1: all
411my $convertnl=1;	# Convert newlines?
412
413# Non text/html formats we can extract urls from.  Function must take one
414# argument: the filename.
415my %knownformats = ( 'application/pdf', \&w3pdfuri::list,
416		     'application/x-pdf', \&w3pdfuri::list,
417		   );
418
419# Known 'magic numbers' of the known formats.  The value is used as
420# key in %knownformats.  the key part is a exact match for the
421# following <string> beginning at the first byte of the file.
422# This should probably be made more flexible, but not until we need it.
423
424my %knownmagic = ( '%PDF-', 'application/pdf' );
425
426my $iinline='';	# inline RE code to make RE caseinsensitive
427my $ipost='';	# RE postfix to make it caseinsensitive
428
429usage() unless parse_args(@ARGV);
430
431{
432  my $w3mirc='.w3mirc';
433
434  $w3mirc='w3mir.ini' if $win32;
435
436  if (-f $w3mirc) {
437    parse_cfg_file($w3mirc);
438    $nodelete{$w3mirc}=1;
439  }
440}
441
442# Check arguments and options
443if ($#root_urls>=0) {
444  # OK
445} else {
446  print "URLs: $#rum_queue\n";
447  usage("No URLs given");
448}
449
450# Are we converting newlines today?
451$w3http::convert=0 unless $convertnl;
452
453if ($chdirto) {
454  &mkdir($chdirto.'/this-is-not-created-odd-or-what');
455  chdir($chdirto) ||
456    die "w3mir: Can't change working directory to '$chdirto': $!\n";
457}
458
459$SIG{'INT'}=sub { print STDERR "\nCaught SIGINT!\n"; exit 1; };
460$SIG{'QUIT'}=sub { print STDERR "\nCaught SIGQUIT!\n"; exit 1; };
461$SIG{'HUP'}=sub { print STDERR "\nCaught SIGHUP!\n"; exit 1; };
462
463&open_fixup if $fixup;
464
465# Derive how much document processing we should do.
466$mine_urls=( $r || $list );
467$process_urls=(!$batch && !$edit && !$header);
468# $abs can be set explicitly with -abs, and implicitly if not recursing
469$abs = 1 unless $r;
470print "Absolute references\n" if $abs && $debug;
471
472# Cache_controll specified but proxy not in use?
473die "w3mir: If you want to control a cache, use a proxy server!\n"
474  if ($cache_header && !$using_proxy);
475
476# Compile the second order code
477
478# - The rum scope tests
479my $full_rules=$scope_prefix.$scope_fetch.$scope_ignore.$scope_postfix;
480# warn "Scope rules:\n-------------\n$full_rules\n---------------\n";
481eval $full_rules;
482die "$@" if $@;
483
484die "w3mir: Program generated rules did not compile.\nPlease report to w3mir-core\@usit.uio.no.  The code is:\n----\n".
485  $full_rules."\n----\n"
486  if !defined($scope_code);
487
488$full_rules=$rule_prefix.$rule_text.$rule_postfix;
489# warn "Fetch rules:\n-------------\n$full_rules\n---------------\n";
490eval $full_rules;
491die "$@!" if $@;
492
493# - The user specified rum tests
494die "w3mir: Ignore/Fetch rules did not compile.\nPlease report to w3mir-core\@usit.uio.no.  The code is:\n----\n".
495  $full_rules."\n----\n"
496  if !defined($rule_code);
497
498# - The user specified apply rules
499
500# $SIG{__WARN__} = sub { print "$_[0]\n"; confess ""; };
501
502my $full_apply=$apply_prefix.($lc?$apply_lc:'').
503  join($ipost.";\n",@user_apply).(($#user_apply>=0)?$ipost:"").";\n".
504  $apply_postfix;
505
506eval $full_apply;
507die "$@!" if $@;
508
509die "w3mir: User apply rules did not compile.\nPlease report to w3mir-core\@usit.uio.no. The code is:
510----
511".$full_apply."
512----\n" if !defined($apply_code);
513
514# print "user apply: $full_apply\n";
515$user_apply_code=$apply_code;
516
517# - The w3mir generated apply rules
518
519$full_apply=$apply_prefix.($lc?$apply_lc:'').
520  join($ipost.";\n",@internal_apply).(($#internal_apply>=0)?$ipost:"").";\n".
521  $apply_postfix;
522eval $full_apply;
523die "$@!" if $@;
524
525die "Internal apply rules did not compile.  The code is:
526----
527".$full_apply."
528----\n" if !defined($apply_code);
529
530# - Information loss via -lc?  There are other sources as well.
531$infoloss=1 if $lc;
532
533warn "Infoloss is $infoloss\n" if $debug;
534
535# More setup:
536
537$w3http::debug=$debug;
538
539$w3http::verbose=$verbose;
540
541my %rum_referers=();	# Array of referers, key: rum_url
542my $Robot_Blob;		# WWW::RobotsRules object, decides if rum_url is
543			# forbidden to access for us.
544my $rum_url_o;		# rum url, mostly the current, the one we're getting
545my %gotrobots;		# Did I get robots.txt from site? key: url->netloc
546my($authuser,$authpass);# Username and password for authentication with server
547my @rum_newurls;	# List of rum_urls in document
548
549if ($check_robottxt) {
550  # Eval is only way to defer loading of module until we know it's needed?
551  eval 'use WWW::RobotRules;';
552
553  die "Could not load WWW::RobotRules, try -drr switch\n"
554    unless defined(&WWW::RobotRules::parse);
555
556  $Robot_Blob = new WWW::RobotRules $w3mir_agent;
557}
558
559# We have several main-modes of operation.  Here we select one
560if ($r) {
561
562  die "w3mir: No URLs?  Try 'w3mir -h' for help.\n"
563    if $#root_urls==-1;
564
565  warn "Recursive retrival comencing\n" if $debug;
566
567  die "w3mir: Sorry, you cannot combine -r/recurse with -I/read_urls\n"
568    if $read_urls;
569
570  # Recursive
571  my $url;
572  foreach $url (@root_urls) {
573    warn "Root url dequeued: $url\n" if $debug;
574    if (want_this($url)) {
575      queue($url);
576      &add_referer($url,$iref);
577    } else {
578      die "w3mir: Inconsistent configuration: Specified $url is not inside retrival scope\n";
579    }
580  }
581  mirror();
582
583} else {
584  if ($batch) {
585    warn "Batch retrival commencing\n" if $debug;
586    # Batch get
587    if ($read_urls) {
588      # Get URLs from <STDIN>
589      while (<STDIN>) {
590	chomp;
591	&add_referer($_,$iref);
592	batch_get($_);
593      }
594    } else {
595      # Get URLs from commandline
596      my $url;
597      foreach $url (@root_urls) {
598	&add_referer($url,$iref);
599      }
600      foreach $url (@root_urls) {
601	batch_get($url);
602      }
603    }
604  } else {
605    warn "Single url retrival commencing\n" if $debug;
606
607    # A single URL, with all processing on
608    die "w3mir: You specified several URLs and not -B/batch\n"
609      if $#root_urls>0;
610    queue($root_urls[0]);
611    &add_referer($root_urls[0],$iref);
612    mirror();
613  }
614}
615
616&close_fixup if $fixup;
617
618# This should clean up files:
619&clean_disk if $remove;
620
621warn "w3mir: That's all (".$w3http::xfbytes.'+',$w3http::headbytes.
622  " bytes of it).\n" unless $verbose<0;
623
624if ($runfix) {
625  eval 'use Config;';
626  warn "Running w3mfix\n";
627  if ($win32) {
628    CORE::system($Config{'perlpath'}." w3mfix $fixrc");
629  } else {
630    CORE::system("w3mfix $fixrc");
631  }
632}
633
634exit 0;
635
636sub get_document {
637  # Get one document by HTTP ($1/rum_url_o).  Save in given filename ($2).
638  # Possebly returning references found in the document.  Caller must
639  # set up referer array, check wantedness and everything else.  We
640  # handle authentication here though.
641
642  my($rum_url_o)=shift;
643  my($lf_url)=shift;
644  croak("\$rum_url_o is empty") if !defined($rum_url_o) || !$rum_url_o;
645  croak("$lf_url is empty") if !defined($lf_url) || !$lf_url;
646
647  # Make sure it's an object
648  $rum_url_o = url $rum_url_o
649    unless ref $rum_url_o;
650
651  my($slash)=($lf_url =~ /^\//);
652  # Derive a filename from the url, the filename contains no URL-quoting
653  my($lf_name) = (url "file:$lf_url")->unix_path;
654  $lf_name =~ s~^/~~ if (!$slash);
655
656  # Make all intermediate directories
657  &mkdir($lf_name) if $s==0;
658
659  my($rum_as_string) = $rum_url_o->as_string;
660
661  print STDERR "GET_DOCUMENT: '",$rum_as_string,"' -> '",$lf_name,"'\n"
662    if $debug;
663
664  my $hostport;
665  my $www_auth='';	# Value of that http reply header
666  my $page_ref;
667  my @rum_newurls;	# List of URLs extracted
668  my $url_extractor;
669  my $do_query;		# Do query or not?
670
671  if (defined($rum_urlstat{$rum_as_string}) &&
672      $rum_urlstat{$rum_as_string}>0) {
673    warn "w3mir: Internal error, ".$rum_as_string.
674      " queued several times\n";
675    next;
676  }
677
678  # Goto here if we want to retry b/c of authentication
679  try_again:
680
681  # Start building the extra http::query arguments again
682  my @EXTRASTUFF=();
683
684  # We'll start by assuming that we're doing the query.
685  $do_query=1;
686
687  # If we're not checking the timestamp, or the file does not exist
688  # then we get the file unconditionally.  Otherwise we only want it
689  # if it's updated.
690
691  if ($fetch==1) {
692    # Nothing do do?
693  } else {
694    if (-f $lf_name) {
695      if ($fetch==-1) {
696	print STDERR "w3mir: ".($infoloss?$rum_as_string:$lf_name).
697	  ", already have it" if $verbose>=0;
698	if (!$mine_urls) {
699	  # If -fs and the file exists and we don't need to mine URLs
700	  # we're finished!
701	  warn "Already have it, no mining, returning!\n" if $debug;
702	  print STDERR "\n" if $verbose>=0;
703	  return;
704	}
705	$w3http::result=1304;	# Pretend it was 'not modified'
706	$do_query=0;
707      } else {
708	push(@EXTRASTUFF,$w3http::IFMODF,$lf_name);
709      }
710    }
711  }
712
713  if ($do_query) {
714
715    # Does the server want authorization for this file?  $www_auth is
716    # only set if authentication was requested the first time around.
717
718    # For testing:
719    # $www_auth='Basic realm="foo"';
720
721    if ($www_auth) {
722      my($authdata,$method,$realm);
723
724      ($method,$realm)= $www_auth =~ m/^(\S+)\s+realm=\"([^\"]+)\"/i;
725      $method=lc $method;
726      $realm=lc $realm;
727      die "w3mir: '$method' authentication needed, don't know that.\n"
728	if ($method ne 'basic');
729
730      $hostport = $rum_url_o->netloc;
731      $authdata=$authdata{$hostport}{$realm} || $authdata{$hostport}{'*'} ||
732	$authdata{'*'}{$realm} || $authdata{'*'}{'*'};
733
734      if ($authdata) {
735	push(@EXTRASTUFF,$w3http::AUTHORIZ,$authdata);
736      } else {
737	print STDERR "w3mir: No authorization data for $hostport/$realm\n";
738	$rum_urlstat{$rum_as_string}=$NEVERMIND;
739	next;
740      }
741    }
742
743    push(@EXTRASTUFF,$w3http::FREEHEAD,$cache_header)
744      if ($cache_header);
745
746    # Insert referer header data if at all
747    push(@EXTRASTUFF,$w3http::REFERER,$rum_referers{$rum_as_string}[0])
748      if ($do_referer && exists($rum_referers{$rum_as_string}));
749
750    push(@EXTRASTUFF,$w3http::NOUSER)
751      unless ($do_user);
752
753    # YES, $lf_url is right, w3http::query handles this like an url so
754    # the quoting must all be in place.
755    my $binfile=$lf_url;
756    $binfile='-' if $s==1;
757    $binfile=$nulldevice if $s==2;
758
759    if ($pause) {
760      print STDERR "w3mir: sleeping\n" if $verbose>0;
761      sleep($pause);
762    }
763
764    print STDERR "w3mir: ".($infoloss?$rum_as_string:$lf_name)
765      unless $verbose<0;
766    print STDERR "\nFile: $lf_name\n" if $debug;
767
768    &w3http::query($w3http::GETURL,$rum_as_string,
769		   $w3http::SAVEBIN,$binfile,
770		   @EXTRASTUFF);
771
772    print STDERR "w3http::result: '",$w3http::result,
773    "' doc size: ", length($w3http::document),
774    " doc type: -",$w3http::headval{'CONTENT-TYPE'},
775    "- plaintexthtml: ",$w3http::plaintexthtml,"\n"
776      if $debug;
777
778    print "Result: ",$w3http::result," Recurse: $r, html: ",
779    $w3http::plaintexthtml,"\n"
780      if $debug;
781
782  } # if $do_query
783
784  if ($w3http::result==200) { # 200 OK
785    $rum_urlstat{$rum_as_string}=$GOTIT;
786
787    if ($mine_urls || $process_urls) {
788
789      if ($w3http::plaintexthtml) {
790	# Only do URL manipulations if this is a html document with no
791	# special content-encoding.  We do not handle encodings, yet.
792
793	my $page;
794
795	print STDERR ($process_urls)?", processing":", url mining"
796	  if $verbose>0;
797
798	print STDERR "\nurl:'$lf_url'\n"
799	  if $debug;
800
801	print "\nMining URLs: $mine_urls, Process: $process_urls\n"
802	  if $debug;
803
804	($page,@rum_newurls) =
805	  &htmlop::process($w3http::document,
806			   # Only get a new document if wanted
807			   $process_urls?():($htmlop::NODOC),
808			   $htmlop::CANON,
809			   $htmlop::ABS,$rum_url_o,
810			   # Only list urls if wanted
811			   $mine_urls?($htmlop::LIST):(),
812
813			   # If user wants absolute URLs do not
814			   # relativize them
815
816			   $abs?
817			   ():
818			   (
819			    $htmlop::TAGCALLBACK,\&process_tag,$lf_url,
820			    )
821			  );
822
823#	print "URL: ",join("\nURL: ",@rum_newurls),"\n";
824
825	if ($process_urls) {
826	  $page_ref=\$page;
827	  $w3http::document='';
828	} else {
829	  $page_ref=\$w3http::document;
830	}
831
832      } elsif ($s == 0 &&
833	       ($url_extractor =
834		$knownformats{$w3http::headval{'CONTENT-TYPE'}})) {
835
836	# The knownformats extractors only work on disk files so write
837	# doc to disk if not there already (non-html text will not be)
838	write_page($lf_name,$w3http::document,1);
839
840	# Now we try our hand at fetching URIs from non-html files.
841	print STDERR ", mining URLs" if $verbose>=1;
842	@rum_newurls = &$url_extractor($lf_name);
843	# warn "URLs from PDF: ",join(', ',@rum_newurls),"\n";
844      }
845
846
847    } # if ($mine_urls || $process_urls)
848
849#    print "page_ref defined: ",defined($page_ref),"\n";
850#    print "plaintext: ",$w3http::plaintext,"\n";
851
852    $page_ref=\$w3http::document
853      if !defined($page_ref) && $w3http::plaintexthtml;
854
855    if ($w3http::plaintexthtml) {
856      # ark: this is where I want to do my changes to the page strip
857      # out the <!--NOMIRROR-->...<!--/NOMIRROR--> Stuff.
858      $$page_ref=~ s/<(!--)?\s*NO\s*MIRROR\s*(--)?>[^\000]*?<(!--)?\s*\/NO\s*MIRROR\s*(--)?>//g
859	if $edit;
860
861      if ($header) {
862	# ark: insert a header string at the start of the page
863	my $mirrorstr=$header;
864	$mirrorstr =~ s/\$url/$rum_as_string/g;
865	insert_at_start( $mirrorstr, $page_ref );
866      }
867    }
868
869    write_page($lf_name,$page_ref,0);
870
871    # print "New urls: ",join("\n",@rum_newurls),"\n";
872
873    return @rum_newurls;
874  }
875
876  if ($w3http::result==304 || #  304 Not modified
877      $w3http::result==1304) { # 1304 Have it
878
879    {
880      # last = out of nesting
881
882      my $rum_urlstat;
883      my $rum_newurls;
884
885      @rum_newurls=();
886
887      print STDERR ", not modified"
888	if $verbose>=0 && $w3http::result==304;
889
890      $rum_urlstat{$rum_as_string}=$NOTMOD;
891
892      last unless $mine_urls;
893
894      $rum_newurls=get_references($lf_name);
895
896      # print "New urls: ",ref($rum_newurls),"\n";
897
898      if (!ref($rum_newurls)) {
899	last;
900      } elsif (ref($rum_newurls) eq 'SCALAR') {
901	$page_ref=$rum_newurls;
902      } elsif (ref($rum_newurls) eq 'ARRAY') {
903	@rum_newurls=@$rum_newurls;
904	last;
905      } else {
906	die "\nw3mir: internal error: Unknown return type from get_references\n";
907      }
908
909      # Check if it's a html file.  I know this tag is in all html
910      # files, because I put it there as I pull them in.
911      last unless $$page_ref =~ /<HTML/i;
912
913      warn "$lf_name is a html file\n" if $debug;
914
915      # It's a html document
916      print STDERR ", mining URLs" if $verbose>=1;
917
918      # This will give us a list of absolute urls
919      (undef,@rum_newurls) =
920	&htmlop::process($$page_ref,$htmlop::NODOC,
921			 $htmlop::ABS,$rum_as_string,
922			 $htmlop::USESAVED,'W3MIR',
923			 $htmlop::LIST);
924    }
925
926    print STDERR "\n" if $verbose>=0;
927    return @rum_newurls;
928  }
929
930  if ($w3http::result==302 || $w3http::result==301) { # Redirect
931    # Cern and NCSA httpd sends 302 'redirect' if a ending / is
932    # forgotten on a url.  More recent httpds send 301 'permanent
933    # redirect' in this case.  Here we check if the difference in URLs
934    # is just a / and if so push the url again with the / added.  This
935    # code only works if the http server has the right idea about its
936    # own name.
937    #
938    # 18/3/97: Added code to queue redirected-to-URLs that are within
939    # the scope of the retrival.
940    my $new_rum_url;
941
942    $rum_urlstat{$rum_as_string}=$REDIR;
943
944    # Absolutify the new url, it might be relative to the requested
945    # document.  That's a ugly wart on some servers/admins.
946    $new_rum_url=url $w3http::headval{'location'};
947    $new_rum_url=$new_rum_url->abs($rum_url_o);
948
949    print REDIRS $rum_as_string,' -> ',$new_rum_url->as_string,"\n"
950      if $fixup;
951
952    if ($immediate_redir) {
953      print STDERR " =>> ",$new_rum_url->as_string,", getting that instead\n";
954      return get_document($new_rum_url,$lf_url);
955    }
956
957    # Some redirect to a fragment of another doc...
958    $new_rum_url->frag(undef);
959    $new_rum_url=$new_rum_url->as_string;
960
961    if ($rum_as_string.'/' eq $new_rum_url) {
962      if (grep { $rum_as_string eq $_; } @root_urls) {
963	print STDERR "\nw3mir: missing / in a start URL detected.  Please fix commandline/config file.\n";
964	exit(1);
965      }
966      print STDERR ", missing /\n";
967      queue($new_rum_url);
968      # Initialize referer to something meaningful
969      $rum_referers{$new_rum_url}=$rum_referers{$rum_as_string};
970    } else {
971      print STDERR " =>> $new_rum_url";
972      if (want_this($new_rum_url)) {
973	print STDERR ", getting that\n";
974	queue($new_rum_url);
975	$rum_referers{$new_rum_url}=$rum_referers{$rum_as_string};
976      } else {
977	print STDERR ", don't want it\n";
978      }
979    }
980    return ();
981  }
982
983  if ($w3http::result==403 || # Forbidden
984      $w3http::result==404 || # Not found
985      $w3http::result==406 || # Not Acceptable, hmm, belongs here?
986      $w3http::result==410) { # Gone - no forwarding address known
987
988    $rum_urlstat{$rum_as_string}=$ENOTFND;
989    &handleerror;
990    print STDERR "Was refered from: ",
991      join(',',@{$rum_referers{$rum_as_string}}),
992      "\n" if defined(@{$rum_referers{$rum_as_string}});
993    return ();
994  }
995
996  if ($w3http::result==407) {
997    # Proxy authentication requested
998    die "Proxy server requests authentication but failed to return the\n".
999      "REQUIRED Proxy-Authenticate header for this condition\n"
1000	unless exists($w3http::headval{'proxy-authenticate'});
1001
1002    die "Proxy authentication is required for ".$w3http::headval{'proxy-authenticate'}."\n";
1003  }
1004
1005  if ($w3http::result==401) {
1006    # A www-authenticate reply header should acompany a 401 message.
1007    if (!exists($w3http::headval{'www-authenticate'})) {
1008      warn "w3mir: Server indicated authentication failure but gave no www-authenticate reply\n";
1009      $rum_urlstat{$rum_as_string}=$NEVERMIND;
1010    } else {
1011      # Unauthorized
1012      if ($www_auth) {
1013	# Failed when authorization data was supplied.
1014	$rum_urlstat{$rum_as_string}=$NEVERMIND;
1015	print STDERR ", authorization failed data needed for ",
1016	$w3http::headval{'www-authenticate'},"\n"
1017	  if ($verbose>=0);
1018      } else {
1019	if ($useauth) {
1020	  # First time failure, send back and retry at once with some known
1021	  # user/passwd.
1022	  $www_auth=$w3http::headval{'www-authenticate'};
1023	  print STDERR ", retrying with authorization\n" unless $verbose<0;
1024	  goto try_again;
1025	} else {
1026	  print ", authorization needed: ",
1027	    $w3http::headval{'www-authenticate'},"\n";
1028	  $rum_urlstat{$rum_as_string}=$NEVERMIND;
1029	}
1030      }
1031    }
1032    return ();
1033  }
1034
1035  # Something else.
1036  &handleerror;
1037}
1038
1039
1040sub robot_check {
1041  # Check if URL is allowed by robots.txt, if we respect it at all
1042  # that is.  Return 1 it allowed, 0 otherwise.
1043
1044  my($rum_url_o)=shift;
1045  my $hostport;
1046
1047  if ($check_robottxt) {
1048
1049    $hostport = $rum_url_o->netloc;
1050    if (!exists($gotrobots{$hostport})) {
1051      # Get robots.txt from the server
1052      $gotrobots{$hostport}=1;
1053      my $robourl="http://$hostport/robots.txt";
1054      print STDERR "w3mir: $robourl" if ($verbose>=0);
1055      &w3http::query($w3http::GETURL,$robourl);
1056      $w3http::document='' if ($w3http::result != 200);
1057      print STDERR ", processing" if $verbose>=1;
1058      print STDERR "\n" if ($verbose>=0);
1059      $Robot_Blob->parse($robourl,$w3http::document);
1060    }
1061
1062    if (!$Robot_Blob->allowed($rum_url_o->as_string)) {
1063      # It is forbidden
1064      $rum_urlstat{$rum_url_o->as_string}=$FROBOTS;
1065      warn "w3mir: ",$rum_url_o->as_string,": forbidden by robots.txt\n";
1066      return 0;
1067    }
1068  }
1069  return 1;
1070}
1071
1072
1073
1074sub batch_get {
1075  # Batch get _one_ document.
1076  my $rum_url=shift;
1077  my $lf_url;
1078
1079  $rum_url_o = url $rum_url;
1080
1081  return unless robot_check($rum_url_o);
1082
1083  ($lf_url=$rum_url) =~ s~.*/~~;
1084  if (!defined($lf_url) || $lf_url eq '') {
1085    ($lf_url=$rum_url) =~ s~/$~~;
1086    $lf_url =~ s~.*/~~;
1087    $lf_url .= "-$indexname";
1088  }
1089
1090  warn "Batch get: $rum_url -> $lf_url\n" if $debug;
1091
1092  $immediate_redir=1; # Do follow redirects immediately
1093
1094  get_document($rum_url,$lf_url);
1095}
1096
1097
1098
1099sub mirror {
1100  # Mirror (or get) the requested url(s). Possibly recursively.
1101  # Working from whatever cwd is at invocation we'll retrieve all
1102  # files under it in the file hierarchy.
1103
1104  my $rum_url;	# URL of the document we're getting now, defined at main level
1105  my $lf_url;	# rum_url after apply - and
1106  my $new_lf_url;
1107  my @new_rum_urls;
1108  my $rum_ref;
1109
1110  while (defined($rum_url = pop(@rum_queue))) {
1111
1112    warn "mirror: Poped $rum_url from queue\n" if $debug;
1113
1114    # Unwanted URLs should not be queued
1115    die "Found url $rum_url that I don't want in queue!\n"
1116      unless defined($lf_url=apply($rum_url));
1117
1118    $rum_url_o = url $rum_url;
1119
1120    next unless robot_check($rum_url_o);
1121
1122    # Figure out the filename for our local filesystem.
1123    $lf_url.=$indexname if $lf_url =~ m~/$~ || $lf_url eq '';
1124
1125    @new_rum_urls = get_document($rum_url_o,$lf_url);
1126
1127    print join("\n",@new_rum_urls),"\n" if ($list);
1128
1129    if ($r) {
1130      foreach $rum_ref (@new_rum_urls) {
1131	# warn "Recursive url: $rum_ref\n";
1132	$new_lf_url=apply($rum_ref);
1133	next unless $new_lf_url;
1134
1135	# warn "Want it\n";
1136	$rum_ref =~ s/\#.*$//s;		# Clip off section marks
1137
1138	add_referer($rum_ref,$rum_url_o->as_string);
1139	queue($rum_ref);
1140      }
1141    }
1142
1143    @new_rum_urls=();
1144
1145    # Is the URL queue empty? Are there outstanding retries?  Refill
1146    # the queue from the retry list.
1147    if ($#rum_queue<0 && $retry-->0) {
1148      foreach $rum_url_o (keys %rum_urlstat) {
1149	$rum_url_o = url $rum_url_o;
1150	if ($rum_urlstat{$rum_url_o->as_string}==100) {
1151	  push(@rum_queue,$rum_url_o->as_string);
1152	  $rum_urlstat{$rum_url_o->as_string}=0;
1153	}
1154      }
1155      if ($#rum_queue>=0) {
1156	warn "w3mir: Sleeping before retrying.  $retry more times left\n"
1157	  if $verbose>=0;
1158	sleep($retryPause);
1159      }
1160    }
1161
1162  }
1163}
1164
1165
1166sub get_references {
1167  # Get references from a non-html-on-disk file.  Return references if
1168  # we know how to find them.  Return reference do the complete page
1169  # if it's html.  Return single numerical 0 if unknown format.
1170
1171  my($lf_url)=shift;
1172  my($urlextractor)=shift;
1173
1174  my $read; # Buffer of stuff read from file to check filetype
1175  my $magic;
1176  my $url_extractor;
1177  my $rum_ref;
1178  my $page;
1179
1180  warn "w3mir: Looking at local $lf_url\n" if $debug;
1181
1182  # Open file and read the first 10kilobytes for file-type-test
1183  # purposes.
1184  if (!open(TMPF,$lf_url)) {
1185    warn "Unable to open $lf_url for reading: $!\n";
1186    last;
1187  }
1188
1189  $page=' 'x10240;
1190  $read=sysread(TMPF,$page,length($page),0);
1191  close(TMPF);
1192
1193  die "Error reading $lf_url: $!\n" if (!defined($read));
1194
1195  if (!defined($url_extractor)) {
1196    $url_extractor=0;
1197
1198    # Check file against list of magic numbers.
1199    foreach $magic (keys %knownmagic) {
1200      if (substr($page,0,length($magic)) eq $magic) {
1201	$url_extractor = $knownformats{$knownmagic{$magic}};
1202	last;
1203      }
1204    }
1205  }
1206
1207  # Found a extraction method, apply.
1208  if ($url_extractor) {
1209    print STDERR ", mining URLs" if $verbose>=1;
1210    return [&$url_extractor($lf_url)];
1211  }
1212
1213  if ($page =~ /<HTML/i) {
1214    open(TMPF,$lf_url) ||
1215      die "Could not open $lf_url for reading: $!\n";
1216    # read the whole file.
1217    local($/)=undef;
1218    $page = <TMPF>;
1219    close(TMPF);
1220    return \$page;
1221  }
1222
1223  return 0;
1224}
1225
1226
1227sub open_fixup {
1228  # Open the referers and redirects files
1229
1230  my $reffile='.referers';
1231  my $redirfile='.redirs';
1232  my $removedfile='.notmirrored';
1233
1234  if ($win32) {
1235    $reffile="referers";
1236    $redirfile="redirs";
1237    $removedfile="notmir";
1238  }
1239
1240  $nodelete{$reffile} = $nodelete{$redirfile} = $nodelete{$removedfile} = 1;
1241
1242  $removedfile=$nulldevice unless $list_nomir;
1243
1244  open(REDIRS,"> $redirfile") ||
1245    die "Could not open $redirfile for writing: $!\n";
1246
1247  autoflush REDIRS 1;
1248
1249  open(REFERERS,"> $reffile") ||
1250    die "Could not open $reffile for writing: $!\n";
1251
1252  $fixopen=1;
1253
1254  open(REMOVED,"> $removedfile") ||
1255    die "Could not open $removedfile for writing: $!\n";
1256
1257  autoflush REMOVED 1;
1258
1259  eval 'END { close_fixup; 0; }';
1260}
1261
1262
1263sub close_fixup {
1264  # Close the fixup data files.  In the case of the referer file also
1265  # write the entire content
1266
1267  return unless $fixopen;
1268
1269  my $referer;
1270
1271  foreach $referer (keys %rum_referers) {
1272    print REFERERS $referer," <- ",join(' ',@{$rum_referers{$referer}}),"\n";
1273  }
1274
1275  close(REFERERS) || warn "Error closing referers file: $!\n";
1276  close(REDIRS) || warn "Error closing redirects file: $!\n";
1277  close(REMOVED) || warn "Error closing 'removed' file: $!\n";
1278  $fixopen=0;
1279}
1280
1281
1282sub clean_disk {
1283  # This procedure removes files that are not present on the server(s)
1284  # anymore.
1285
1286  # - To avoid removing files that were not fetched due to network
1287  #   problems we only do blanket removal IFF all documents were
1288  #   fetched w/o problems, eventually.
1289  # - In any case we can remove files the server said were not found
1290
1291  # The strategy has three main parts:
1292  # 1. Find all files we have
1293  # 2. Find what files we ought to have
1294  # 3. Remove the difference
1295
1296  my $complete_retrival=1;	# Flag saying IFF all documents were fetched
1297  my $urlstat;			# Tmp storage
1298  my $rum_url;
1299  my $lf_url;
1300  my $lf_dir;
1301  my $dirs_to_remove;
1302
1303  # For fileremoval code
1304  eval "use File::Find;" unless defined(&find);
1305
1306  die "w3mir: Could not load File::Find module.  Don't use -R switch.\n"
1307    unless defined(&find);
1308
1309  # This to shut up -w
1310  $lf_dir=$File::Find::dir;
1311
1312  # ***** 1. Find out what files we have *****
1313  #
1314  # This does two things:  For each file or directory found:
1315  # - Increases entry count for the container directory
1316  # - If it's a file: $lf_file{relative_path}=$FILEHERE;
1317
1318  chop(@root_dirs);
1319  print STDERR "Looking in: ",join(", ",@root_dirs),"\n" if $debug;
1320
1321  find(\&find_files,@root_dirs);
1322
1323  # ***** 2. Find out what files we ought to have *****
1324  #
1325  # First we loop over %rum_urlstat to determine what files are not
1326  # present on the server(s).
1327  foreach $rum_url (keys %rum_urlstat) {
1328    # Figure out name of local file from rum_url
1329    next unless defined($lf_url=apply($rum_url));
1330
1331    $lf_url.=$indexname if $lf_url =~ m~/$~ || $lf_url eq '';
1332
1333    # find prefixes ./, we must too.
1334    $lf_url="./".$lf_url unless substr($lf_url,0,1) eq '/';
1335
1336    # Ignore if file does not exist here.
1337    next unless exists($lf_file{$lf_url});
1338
1339    # The apply rules can map several remote files to same local
1340    # file.  If we decided to keep file already we stay with that.
1341    next if $lf_file{$lf_url}==$FILETHERE;
1342
1343    $urlstat=$rum_urlstat{$rum_url};
1344
1345    # Figure out the status code.
1346    if ($urlstat==$GOTIT || $urlstat==$NOTMOD) {
1347      # Present on server.  Keep.
1348      $lf_file{$lf_url}=$FILETHERE;
1349      next;
1350    } elsif ($urlstat==$ENOTFND || $urlstat==$NEVERMIND ) {
1351      # One of: not on server, can't get, don't want, access forbiden:
1352      # Schedule for removal.
1353      $lf_file{$lf_url}=$FILEDEL if exists($lf_file{$lf_url});
1354      next;
1355    } elsif ($urlstat==$OTHERERR || $urlstat==$TERROR) {
1356      # Some error occured transfering.
1357      $complete_retrival=0;	# The retrival was not complete. Delete less
1358    } elsif ($urlstat==$QUEUED) {
1359      warn "w3mir: Internal inconsistency, $rum_url marked as queued after retrival terminated\n";
1360      $complete_retrival=0;	# Fishy. Be conservative about removing
1361    } else {
1362      $complete_retrival=0;
1363      warn "w3mir: Warning: $rum_url is marked as $urlstat.\n".
1364	"w3mir: Please report to w3mir-core\@usit.uio.no.\n";
1365    }
1366  }				# foreach %rum_urlstat
1367
1368  # ***** 3. Remove the difference *****
1369
1370  # Loop over all found files:
1371  # - Should we have this file?
1372  # - If not: Remove file and decrease directory entry count
1373  # Loop as long as there are directories with 0 entry count:
1374  # - Loop over all directories with 0 entry count:
1375  #   - Remove directory
1376  #   - Decrease entry count of parent
1377
1378  warn "w3mir: Some error occured, conservative file removal\n"
1379    if !$complete_retrival && $verbose>=0;
1380
1381  # Remove all files we don't want removed from list of files present:
1382  foreach $lf_url (keys %nodelete) {
1383    print STDERR "Not deleting: $lf_url\n" if $verbose>=1;
1384    delete $lf_file{$lf_url} || delete $lf_file{'./'.$lf_url};
1385  }
1386
1387  # Remove files
1388  foreach $lf_url (keys %lf_file) {
1389    if (($complete_retrival && $lf_file{$lf_url}==$FILEHERE) ||
1390	($lf_file{$lf_url} == $FILEDEL)) {
1391      if (unlink $lf_url) {
1392	($lf_dir)= $lf_url =~ m~^(.+)/~;
1393	$lf_dir{$lf_dir}--;
1394	$dirs_to_remove=1 if ($lf_dir{$lf_dir}==0);
1395	warn "w3mir: removed file $lf_url\n" if $verbose>=0;
1396      } else {
1397	warn "w3mir: removal of file $lf_url failed: $!\n";
1398      }
1399    }
1400  }
1401
1402  # Remove empty directories
1403  while ($dirs_to_remove) {
1404    $dirs_to_remove=0;
1405    foreach $lf_url (keys %lf_dir) {
1406      next if $lf_url eq '.';
1407      if ($lf_dir{$lf_url}==0) {
1408	if (rmdir($lf_url)) {
1409	  warn "w3mir: removed directory $lf_dir\n" if $verbose>=0;
1410	  delete $lf_dir{$lf_url};
1411	  ($lf_dir)= $lf_url =~ m~^(.+)/~;
1412	  $lf_dir{$lf_dir}--;
1413	  $dirs_to_remove=1 if ($lf_dir{$lf_dir}==0);
1414	} else {
1415	  warn "w3mir: removal of directory $lf_dir failed: $!\n";
1416	}
1417      }
1418    }
1419  }
1420}
1421
1422
1423sub find_files {
1424  # This is called by the find procedure for every file/dir found.
1425
1426  # This builds two hashes:
1427  # lf_file{<file>}: 1: file exists
1428  # lf_dir{<dir>): Number of files in directory.
1429
1430  lstat($_);
1431
1432  $lf_dir{$File::Find::dir}++;
1433
1434  if (-f _) {
1435    $lf_file{$File::Find::name}=$FILEHERE;
1436  } elsif (-d _) {
1437    # null
1438    # Bug: If an empty directory exists it will not be removed
1439  } else {
1440    warn "w3mir: File $File::Find::name has unknown type.  Ignoring.\n";
1441  }
1442  return 0;
1443
1444}
1445
1446
1447sub handleerror {
1448  # Handle error status of last http connection, will set the rum_urlstat
1449  # appropriately and print a error message.
1450
1451  my $msg;
1452
1453  if ($verbose<0) {
1454    $msg="w3mir: ".$rum_url_o->as_string.": ";
1455  } else {
1456    $msg=": ";
1457  }
1458
1459  if ($w3http::result == 98) {
1460    # OS/Network error
1461    $msg .= "$!";
1462    $rum_urlstat{$rum_url_o->as_string}=$OTHERERR;
1463  } elsif ($w3http::result == 100) {
1464    # Some kind of error connecting or sending request
1465    $msg .= $w3http::restext || "Timeout";
1466    $rum_urlstat{$rum_url_o->as_string}=$TERROR;
1467  } else {
1468    # Other HTTP error
1469    $rum_urlstat{$rum_url_o->as_string}=$OTHERERR;
1470    $msg .= " ".$w3http::result." ".$w3http::restext;
1471    $msg .= " =>> ".$w3http::headval{'location'}
1472    if (defined($w3http::headval{'location'}));
1473  }
1474  print STDERR "$msg\n";
1475}
1476
1477
1478sub queue {
1479  # Queue given url if appropriate and create a status entry for it
1480  my($rum_url_o)=url $_[0];
1481
1482  croak("BUG: undefined \$rum_url_o")
1483    if !defined($rum_url_o);
1484
1485  croak("BUG: undefined \$rum_url_o->as_string")
1486    if !defined($rum_url_o->as_string);
1487
1488  croak("BUG: ".$rum_url_o->as_string." (fragnent) queued")
1489    if $rum_url_o->as_string =~ /\#/;
1490
1491  return if exists($rum_urlstat{$rum_url_o->as_string});
1492  return unless want_this($rum_url_o->as_string);
1493
1494  warn "QUEUED: ",$rum_url_o->as_string,"\n" if $debug;
1495
1496  # Note lack of scope checks.
1497  $rum_urlstat{$rum_url_o->as_string}=$QUEUED;
1498  push(@rum_queue,$rum_url_o->as_string);
1499}
1500
1501
1502sub root_queue {
1503  # Queue function for root urls and directories.  One or the other might
1504  # be boolean false, in that case, don't queue it.
1505
1506  my $root_url_o;
1507
1508  my($root_url)=shift;
1509  my($root_dir)=shift;
1510
1511  die "w3mir: No fragments in start URLs :".$root_url."\n"
1512    if $root_url =~ /\#/;
1513
1514  if ($root_dir) {
1515    print "Root dir: $root_dir\n" if $debug;
1516    $root_dir="./$root_dir" unless substr($root_dir,0,1) eq '/' or
1517      substr($root_dir,0,2) eq './';
1518    push(@root_dirs,$root_dir);
1519  }
1520
1521
1522  if ($root_url) {
1523    $root_url_o=url $root_url;
1524
1525    # URL canonification, or what we do of it at least.
1526    $root_url_o->host($root_url_o->host);
1527
1528    warn "Root queue: ".$root_url_o->as_string."\n" if $debug;
1529
1530    push(@root_urls,$root_url_o->as_string);
1531
1532    return $root_url_o;
1533  }
1534
1535}
1536
1537
1538sub write_page {
1539  # write a retrieved page to wherever it's supposed to be written.
1540  # Added difficulty: all files but plaintext files have already been
1541  # written to disk in w3http.
1542
1543  # $s == 0 save to disk
1544  # $s == 1 dump to stdout
1545  # $s == 2 forget
1546
1547  my($lf_name,$page_ref,$silent) = @_;
1548  my($verb);
1549
1550  if ($silent) {
1551    $verb=-1;
1552  } else {
1553    $verb=$verbose;
1554  }
1555
1556#  confess("\n\$page_ref undefined") if !defined($page_ref);
1557
1558  if ($w3http::plaintexthtml) {
1559    # I have it in memory
1560    if ($s==0) {
1561      print STDERR ", saving" if $verb>0;
1562
1563      while (-d $lf_name) {
1564	# This will run once, maybe twice, $fiddled will be canged the
1565	# first time
1566	if (exists($fiddled{$lf_name})) {
1567	  warn "Cannot save $lf_name, there is a directory in the way\n";
1568	  return;
1569	}
1570
1571	$fiddled{$lf_name}=1;
1572
1573	rm_rf($lf_name);
1574	print STDERR "w3mir: $lf_name" if $verbose>=0;
1575      }
1576
1577      if (!open(PAGE,">$lf_name")) {
1578	warn "\nw3mir: can't open $lf_name for writing: $!\n";
1579	return;
1580      }
1581      if (!$convertnl) {
1582	binmode PAGE;
1583	warn "BINMODE\n" if $debug;
1584      }
1585      if ($$page_ref ne '') {
1586	print PAGE $$page_ref || die "w3mir: Error writing $lf_name: $!\n";
1587      }
1588      close(PAGE) || die "w3mir: Error closing $lf_name: $!\n";
1589      print STDERR ": ", length($$page_ref), " bytes\n"
1590	if $verb>=0;
1591      setmtime($lf_name,$w3http::headval{'last-modified'})
1592	if exists($w3http::headval{'last-modified'});
1593    } elsif ($s==1) {
1594      print $$page_ref ;
1595    } elsif ($s==2) {
1596      print STDERR ", got and forgot it.\n" unless $verb<0;
1597    }
1598  } else {
1599    # Already written by http module, just emit a message if wanted
1600    if ($s==0) {
1601      print STDERR ": ",$w3http::doclen," bytes\n"
1602	if $verb>=0;
1603      setmtime($lf_name,$w3http::headval{'last-modified'})
1604	  if exists($w3http::headval{'last-modified'});
1605    } elsif ($s==2) {
1606      print STDERR ", got and forgot it.\n" if $verb>=0;
1607    }
1608  }
1609}
1610
1611
1612sub setmtime {
1613  # Set mtime of the given file
1614  my($file,$time)=@_;
1615  my($tm_sec,$tm_min,$tm_hour,$tm_mday,$tm_mon,$tm_year,$tm_wday,$tm_yday,
1616     $tm_isdst,$tics);
1617
1618  $tm_isdst=0;
1619  $tm_yday=-1;
1620
1621  carp("\$time is undefined"),return if !defined($time);
1622
1623  $tics=str2time($time);
1624  utime(time, $tics, $file) ||
1625      warn "Could not change mtime of $file: $!\n";
1626}
1627
1628
1629sub movefile {
1630  # Rename a file.  Note that copy is not a good alternative, since
1631  # copying over NFS is something we want to Avoid.
1632
1633  # Returns 0 if failure and 1 in case of sucess.
1634
1635  (my $old,my $new) = @_;
1636
1637  # Remove anything that might have the name already.
1638  if (-d $new) {
1639    print STDERR "\n" if $verbose>=0;
1640    rm_rf($new);
1641    $fiddled{$new}=1;
1642    print STDERR "w3mir: $new" if $verbose>=0;
1643  } elsif (-e $new) {
1644    $fiddled{$new}=1;
1645    if (unlink($new)) {
1646      print STDERR "\nw3mir: removed $new\nw3mir: $new"
1647	if $verbose>=0;
1648    } else {
1649      return 0;
1650    }
1651
1652  }
1653
1654  if ($new ne '-' && $new ne $nulldevice) {
1655    warn "MOVING $old -> $new\n" if $debug;
1656    rename($old,$new) ||
1657      warn "Could not rename $old to $new: $!\n",return 0;
1658  }
1659  return 1;
1660}
1661
1662
1663sub mkdir {
1664  # Make all intermediate directories needed for a file, the file name
1665  # is expected to be included in the argument!
1666
1667  # Reasons for not using File::Path::mkpath:
1668  # - I already wrote this.
1669  # - I get to be able to produce as good and precise errormessages as
1670  #   unix and perl will allow me.  mkpath will not.
1671  # - It's easier to find out if it worked or not.
1672
1673  my($file) = @_;
1674  my(@dirs) = split("/",$file);
1675  my $path;
1676  my $dir;
1677  my $moved=0;
1678
1679  if (!$dirs[0]) {
1680    shift @dirs;
1681    $path='';
1682  } else {
1683    $path = '.';
1684  }
1685
1686  # This removes the last element of the array, it's meant to shave
1687  # off the file name leaving only the directory name, as a
1688  # convenience, for the caller.
1689  pop @dirs;
1690  foreach $dir (@dirs) {
1691    $path .= "/$dir";
1692    stat($path);
1693    # only make if it isn't already there
1694    next if -d _;
1695
1696    while (!-d _) {
1697      if (exists($fiddled{$path})) {
1698	warn "Cannot make directory $path, there is a file in the way.\n";
1699	return;
1700      }
1701
1702      $fiddled{$path}=1;
1703
1704      if (!-e _) {
1705	mkdir($path,0777);
1706	last;
1707      }
1708
1709      if (unlink($path)) {
1710	warn "w3mir: removed file $path\n" if $verbose>=0;
1711      } else {
1712	warn "Unable to remove $path: $!\n";
1713	next;
1714      }
1715
1716      warn "mkdir $path\n" if $debug;
1717      mkdir($path,0777) ||
1718	warn "Unable to create directory $path: $!\n";
1719
1720      stat($path);
1721    }
1722  }
1723}
1724
1725
1726sub add_referer {
1727  # Add a referer to the list of referers of a document.  Unless it's
1728  # already there.
1729  # Don't mail me if you (only) think this is a bit like a toungetwiser:
1730
1731  # Don't remember referers if BOTH fixup and referer header is disabled.
1732  return if $fixup==0 && $do_referer==0;
1733
1734  my($rum_referee,$rum_referer) = @_ ;
1735  my $re_rum_referer;
1736
1737  if (exists($rum_referers{$rum_referee})) {
1738    $re_rum_referer=quotemeta $rum_referer;
1739    if (!grep(m/^$re_rum_referer$/,@{$rum_referers{$rum_referee}})) {
1740      push(@{$rum_referers{$rum_referee}},$rum_referer);
1741      # warn "$rum_referee <- $rum_referer pushed\n";
1742    } else {
1743      # warn "$rum_referee <- $rum_referer NOT pushed\n";
1744    }
1745  } else {
1746    $rum_referers{$rum_referee}=[$rum_referer];
1747    # warn "$rum_referee <- $rum_referer pushed\n";
1748  }
1749}
1750
1751
1752sub user_apply {
1753  # Apply the user apply rules
1754
1755  return &$user_apply_code(shift);
1756
1757# Debug version:
1758#  my ($foo,$bar);
1759#  $foo=shift;
1760#  $bar=&$apply_code($foo);
1761#  print STDERR "Apply: $foo -> $bar\n";
1762#  return $bar;
1763}
1764
1765sub internal_apply {
1766  # Apply the w3mir generated apply rules
1767
1768  return &$apply_code(shift);
1769}
1770
1771
1772sub apply {
1773    # Apply the user apply rules.  Then if URL is wanted return result of
1774    # w3mir apply rules.  Return the undefined value otherwise.
1775
1776    my $url = user_apply(shift);
1777
1778    return internal_apply($url)
1779	if want_this($url);
1780
1781    # print REMOVED $url,"\n";
1782    return undef;
1783}
1784
1785
1786sub want_this {
1787  # Find out if we want the url passed.  Just pass it on to the
1788  # generated functions.
1789  my($rum_url)=shift;
1790
1791  # What about robot rules?
1792
1793  # Does scope rule want this?
1794  return &$scope_code($rum_url) &&
1795    # Does user rule want this too?
1796    &$rule_code($rum_url)
1797
1798}
1799
1800
1801sub process_tag {
1802  # Process a tag in html file
1803  my $lf_referer = shift;	# User argument
1804  my $base_url = shift;		# Not used... why not?
1805  my $tag_name = shift;
1806  my $url_attrs = shift;
1807
1808  # Retrun quickly if no URL attributes
1809  return unless defined($url_attrs);
1810
1811  my $attrs = shift;
1812
1813  my $rum_url;	# The absolute URL
1814  my $lf_url;	# The local filesystem url
1815  my $lf_url_o; # ... and it's object
1816  my $key;
1817
1818  print STDERR "\nProcess Tag: $tag_name, URL attributes: ",
1819  join(', ',@{$url_attrs}),"\nbase_url: ",$base_url,"\nlf_referer: ",
1820  $lf_referer,"\n"
1821    if $debug>2;
1822
1823  $lf_referer =~ s~^/~~;
1824  $lf_referer = "file:/$lf_referer";
1825
1826  foreach $key (@{$url_attrs}) {
1827    if (defined($$attrs{$key})) {
1828      $rum_url=$$attrs{$key};
1829      print STDERR "$key = $rum_url\n" if $debug;
1830      $lf_url=apply($rum_url);
1831      if (defined($lf_url)) {
1832
1833	print STDERR "Transformed to $lf_url\n" if $debug>2;
1834
1835	$lf_url =~ s~^/~~;  # Remove leading / to avoid doubeling
1836	$lf_url_o=url "file:/$lf_url";
1837
1838	# Save new value in the hash
1839	$$attrs{$key}=($lf_url_o->rel($lf_referer))->as_string;
1840	print STDERR "New value: ",$$attrs{$key},"\n" if $debug>2;
1841
1842	# If there is potential information loss save the old value too
1843	$$attrs{"W3MIR".$key}=$rum_url if $infoloss;
1844      }
1845    }
1846  }
1847}
1848
1849
1850sub version {
1851  eval 'require LWP;';
1852  print $w3mir_agent,"\n";
1853  print "LWP version ",$LWP::VERSION,"\n" if defined $LWP::VERSION;
1854  print "Perl version: ",$],"\n";
1855  exit(0);
1856}
1857
1858
1859sub parse_args {
1860  my $f;
1861  my $i;
1862
1863  $i=0;
1864
1865  while ($f=shift) {
1866    $i++;
1867    $numarg++;
1868    # This is a demonstration against Getopts::Long.
1869    if ($f =~ s/^-+//) {
1870      $s=1,next if $f eq 's';			# Stdout
1871      $r=1,next if $f eq 'r';			# Recurse
1872      $fetch=1,next if $f eq 'fa';		# Fetch all, no date test
1873      $fetch=-1,next if $f eq 'fs';	# Fetch those we don't already have.
1874      $verbose=-1,next if $f eq 'q'; 		# Quiet
1875      $verbose=1,next if $f eq 'c';		# Chatty
1876      &version,next if $f eq 'v';		# Version
1877      $pause=shift,next if $f eq 'p';		# Pause between requests
1878      $retryPause=shift,next if $f eq 'rp';	# Pause between retries.
1879      $s=2,$convertnl=0,next if $f eq 'f';	# Forget
1880      $retry=shift,next if $f eq 't';		# reTry
1881      $list=1,next if $f eq 'l';		# List urls
1882      $iref=shift,next if $f eq 'ir';		# Initial referer
1883      $check_robottxt = 0,next if $f eq 'drr';	# Disable robots.txt rules.
1884      umask(oct(shift)),next if $f eq 'umask';
1885      parse_cfg_file(shift),next if $f eq 'cfgfile';
1886      usage(),exit 0 if ($f eq 'help' || $f eq 'h' || $f eq '?');
1887      $remove=1,next if $f eq 'R';
1888      $cache_header = 'Pragma: no-cache',next if $f eq 'pflush';
1889      $w3http::agent=$w3mir_agent=shift,next if $f eq 'agent';
1890      $abs=1,next if $f eq 'abs';
1891      $convertnl=0,$batch=1,next if $f eq 'B';
1892      $read_urls = 1,next if $f eq 'I';
1893      $convertnl=0,next if $f eq 'nnc';
1894
1895      if ($f eq 'lc') {
1896	if ($i == 1) {
1897	  $lc=1;
1898	  $iinline=($lc?"(?i)":"");
1899	  $ipost=($lc?"i":"");
1900	  next;
1901	} else {
1902	  die "w3mir: -lc must be the first argument on the commandline.\n";
1903	}
1904      }
1905
1906      if ($f eq 'P') {				# Proxy
1907	($w3http::proxyserver,$w3http::proxyport)=
1908	  shift =~ /([^:]+):?(\d+)?/;
1909	$w3http::proxyport=80 unless $w3http::proxyport;
1910	$using_proxy=1;
1911	next;
1912      }
1913
1914      if ($f eq 'd') {		# Debugging level
1915	$f=shift;
1916	unless (($debug = $f) > 0) {
1917	  die "w3mir: debug level must be a number greater than zero.\n";
1918	}
1919	next;
1920      }
1921
1922      # Those were all the options...
1923      warn "w3mir: Unknown option: -$f.  Use -h for usage info.\n";
1924      exit(1);
1925
1926    } elsif ($f =~ /^http:/) {
1927      my ($rum_url_o,$rum_reurl,$rum_rebase,$server);
1928
1929      $rum_url_o=root_queue($f,'./');
1930
1931      $rum_rebase = quotemeta( ($rum_url_o->as_string =~ m~^(.*/)~)[0] );
1932
1933      push(@internal_apply,"s/^".$rum_rebase."//");
1934      $scope_fetch.="return 1 if m/^".$rum_rebase."/".$ipost.";\n";
1935      $scope_ignore.="return 0 if m/^".
1936	quotemeta("http://".$rum_url_o->netloc."/")."/".$ipost.";\n";
1937
1938    } else {
1939      # If we get this far then the commandline is broken
1940      warn "Unknown commandline argument: $f.  Use -h for usage info.\n";
1941      $numarg--;
1942      exit(1);
1943    }
1944  }
1945  return 1;
1946}
1947
1948
1949sub parse_cfg_file {
1950  # Read the configuration file.  Aborts on errors.  Not good to
1951  # mirror something using the wrong config.
1952
1953  my ( $file ) = @_ ;
1954  my ($key, $value, $authserver,$authrealm,$authuser,$authpasswd);
1955  my $i;
1956
1957  die "w3mir: config file $file is not a file.\n" unless -f $file;
1958  open(CFGF, $file) || die "Could not open config file $file: $!\n";
1959
1960  $i=0;
1961
1962  while (<CFGF>) {
1963    # Trim off various junk
1964    chomp;
1965    s/^#.*//;
1966    s/^\s+|\s$//g;
1967    # Anything left?
1968    next if $_ eq '';
1969    # Examine remains
1970    $i++;
1971    $numarg++;
1972
1973    ($key, $value) = split(/\s*:\s*/,$_,2);
1974    $key = lc $key;
1975
1976    $iref=$value,next if ( $key eq 'initial-referer' );
1977    $header=$value,next if ( $key eq 'header' );
1978    $pause=numeric($value),next if ( $key eq 'pause' );
1979    $retryPause=numeric($value),next if ( $key eq 'retry-pause' );
1980    $debug=numeric($value),next if ( $key eq 'debug' );
1981    $retry=numeric($value),next if ( $key eq 'retries' );
1982    umask(numeric($value)),next if ( $key eq 'umask' );
1983    $check_robottxt=boolean($value),next if ( $key eq 'robot-rules' );
1984    $edit=boolean($value),next if ($key eq 'remove-nomirror');
1985    $indexname=$value,next if ($key eq 'index-name');
1986    $s=nway($value,'save','stdout','forget'),next
1987      if ( $key eq 'file-disposition' );
1988    $verbose=nway($value,'quiet','brief','chatty')-1,next
1989      if ( $key eq 'verbosity' );
1990    $w3http::proxyuser=$value,next if $key eq 'http-proxy-user';
1991    $w3http::proxypasswd=$value,next if $key eq 'http-proxy-passwd';
1992
1993    if ( $key eq 'cd' ) {
1994      $chdirto=$value;
1995      warn "Use of 'cd' is discouraged\n" unless $verbose==-1;
1996      next;
1997    }
1998
1999    if ($key eq 'http-proxy') {
2000      ($w3http::proxyserver,$w3http::proxyport)=
2001	$value =~ /([^:]+):?(\d+)?/;
2002      $w3http::proxyport=80 unless $w3http::proxyport;
2003      $using_proxy=1;
2004      next;
2005    }
2006
2007    if ($key eq 'proxy-options') {
2008      my($val,$nval,@popts,$pragma);
2009      $pragma=1;
2010      foreach $val (split(/\s*,\*/,lc $value)) {
2011	$nval=nway($val,'no-pragma','revalidate','refresh','no-store',);
2012	# Force use of Cache-control: header
2013	$pragma=0 if ($nval==0);
2014	# use to force proxy to revalidate
2015	$pragma=0,push(@popts,'max-age=0') if ($nval==1);
2016	# use to force proxy to refresh
2017	push(@popts,'no-cache') if ($nval==2);
2018	# use if information transfered is sensitive
2019	$pragma=0,push(@popts,'no-store') if ($nval==3);
2020      }
2021      $cache_header=($pragma?'Pragma: ':'Cache-control: ').join(', ',@popts);
2022      next;
2023    }
2024
2025
2026    if ($key eq 'url') {
2027      my ($rum_url_o,$lf_dir,$rum_reurl,$rum_rebase);
2028
2029      # A two argument URL: line?
2030      if ($value =~ m/^(.+)\s+(.+)/i) {
2031	# Two arguments.
2032	# The last is a directory, it must end in /
2033	$lf_dir=$2;
2034	$lf_dir.='/' unless $lf_dir =~ m~/$~;
2035
2036	$rum_url_o=root_queue($1,$lf_dir);
2037
2038	# The first is a URL, make it more canonical, find the base.
2039	# The namespace confusion in this section is correct.(??)
2040	$rum_rebase = quotemeta( ($rum_url_o->as_string =~ m~^(.*/)~)[0] );
2041
2042	# print "URL: ",$rum_url_o->as_string,"\n";
2043	# print "Base: $rum_rebase\n";
2044
2045	# Translate from rum space to lf space:
2046	push(@internal_apply,"s/^".$rum_rebase."/".quotemeta($lf_dir)."/");
2047
2048	# That translation could lead to information loss.
2049	$infoloss=1;
2050
2051	# Fetch rules tests the rum_url_o->as_string.  Fetch whatever
2052	# matches the base.
2053	$scope_fetch.="return 1 if m/^".$rum_rebase."/".$ipost.";\n";
2054
2055	# Ignore whatever did not match the base.
2056	$scope_ignore.="return 0 if m/^".
2057	  quotemeta("http://".$rum_url_o->netloc."/")."/".$ipost.";\n";
2058
2059      } else {
2060	$rum_url_o=root_queue($value,'./');
2061
2062	$rum_rebase = quotemeta( ($rum_url_o->as_string =~ m~^(.*/)~)[0] );
2063
2064	# Translate from rum space to lf space:
2065	push(@internal_apply,"s/^".$rum_rebase."//");
2066
2067	$scope_fetch.="return 1 if m/^".$rum_rebase."/".$ipost.";\n";
2068	$scope_ignore.="return 0 if m/^".
2069	  quotemeta("http://".$rum_url_o->netloc."/")."/".$ipost.";\n";
2070      }
2071      next;
2072    }
2073
2074    if ($key eq 'also-quene') {
2075      print STDERR
2076	"Found 'also-quene' keyword, please replace with 'also-queue'\n";
2077      $key='also-queue';
2078    }
2079
2080    if ($key eq 'also' || $key eq 'also-queue') {
2081      if ($value =~ m/^(.+)\s+(.+)/i) {
2082	my ($rum_url_o,$rum_url,$lf_dir,$rum_reurl,$rum_rebase);
2083	# Two arguments.
2084	# The last is a directory, it must end in /
2085	# print STDERR "URL ",$1," DIR ",$2,"\n";
2086	$rum_url=$1;
2087	$lf_dir=$2;
2088	$lf_dir.='/' unless $lf_dir =~ m~/$~;
2089	die "w3mir: The target path in Also: and Also-queue: directives must ".
2090	    "be relative\n"
2091		if substr($lf_dir,0,1) eq '/';
2092
2093	if ($key eq 'also-queue') {
2094	  $rum_url_o=root_queue($rum_url,$lf_dir);
2095	} else {
2096	  root_queue("",$lf_dir);
2097	  $rum_url_o=url $rum_url;
2098	  $rum_url_o->host(lc $rum_url_o->host);
2099	}
2100
2101	# The first is a URL, find the base
2102	$rum_rebase = quotemeta( ($rum_url_o->as_string =~ m~^(.*/)~)[0] );
2103
2104#	print "URL: $rum_url_o->as_string\n";
2105#	print "Base: $rum_rebase\n";
2106#	print "Server: $server\n";
2107
2108	# Ok, now we can transform and select stuff the right way
2109	push(@internal_apply,"s/^".$rum_rebase."/".quotemeta($lf_dir)."/");
2110	$infoloss=1;
2111
2112	# Fetch rules tests the rum_url_o->as_string.  Fetch whatever
2113	# matches the base.
2114	$scope_fetch.="return 1 if m/^".$rum_rebase."/".$ipost.";\n";
2115
2116	# Ignore whatever did not match the base.  This cures problem
2117	# with '..' from base in in rum space pointing within the the
2118	# scope in ra space.  We introduced a extra level (or more) of
2119	# directories with the apply above.  Must do same with 'Also:'
2120	# directives.
2121	$scope_ignore.="return 0 if m/^".
2122	  quotemeta("http://".$rum_url_o->netloc."/")."/".$ipost.";\n";
2123      } else {
2124	die "Also: requires 2 arguments\n";
2125      }
2126      next;
2127    }
2128
2129    if ($key eq 'quene') {
2130      print STDERR "Found 'quene' keyword, please replace with 'queue'\n";
2131      $key='queue';
2132    }
2133
2134    if ($key eq 'queue') {
2135      root_queue($value,"");
2136      next;
2137    }
2138
2139    if ($key eq 'ignore-re' || $key eq 'fetch-re') {
2140      # Check that it's a re, better that I am strict than for perl to
2141      # make compilation errors.
2142      unless ($value =~ /^m(.).*\1[gimosx]*$/) {
2143	print STDERR "w3mir: $value is not a recognized regular expression\n";
2144	exit 1;
2145      }
2146      # Fall-through to next cases!
2147    }
2148
2149    if ($key eq 'fetch' || $key eq 'fetch-re') {
2150      my $expr=$value;
2151      $expr = wild_re($expr).$ipost if ($key eq 'fetch');
2152      $rule_text.=' return 1 if '.$expr.";\n";
2153      next;
2154    }
2155
2156    if ($key eq 'ignore' || $key eq 'ignore-re') {
2157      my $expr=$value;
2158      $expr = wild_re($expr).$ipost if ($key eq 'ignore');
2159      # print STDERR "Ignore expression: $expr\n";
2160      $rule_text.=' return 0 if '.$expr.";\n";
2161      next;
2162    }
2163
2164
2165    if ($key eq 'apply') {
2166      unless ($value =~ /^s(.).*\1.*\1[gimosxe]*$/) {
2167	print STDERR
2168	  "w3mir: '$value' is not a recognized regular expression\n";
2169	exit 1;
2170      }
2171      push(@user_apply,$value) ;
2172      $infoloss=1;
2173      next;
2174    }
2175
2176    if ($key eq 'agent') {
2177      $w3http::agent=$w3mir_agent=$value;
2178      next;
2179    }
2180
2181    # The authorization stuff:
2182    if ($key eq 'auth-domain') {
2183      $useauth=1;
2184      ($authserver, $authrealm) = split('/',$value,2);
2185      die "w3mir: server part of auth-domain has format server[:port]\n"
2186	unless $authserver =~ /^(\S+(:\d+)?)$|^\*$/;
2187      $authserver =~ s/:80$//;
2188      die "w3mir: auth-domain '$value' is not valid\n"
2189	if !defined($authserver) || !defined($authrealm);
2190      $authrealm=lc $authrealm;
2191    }
2192
2193    $authuser=$value if ($key eq 'auth-user');
2194    $authpasswd=$value if ($key eq 'auth-passwd');
2195
2196    # Got a full authentication spec?
2197    if ($authserver && $authrealm && $authuser && $authpasswd) {
2198      $authdata{$authserver}{$authrealm}=$authuser.":".$authpasswd;
2199      print "Authentication for $authserver/$authrealm is ".
2200	"$authuser/$authpasswd\n" if $verbose>=0;
2201      # exit;
2202      # Invalidate tmp vars
2203      $authserver=$authrealm=$authuser=$authpasswd=undef;
2204      next;
2205    }
2206
2207    next if $key eq 'auth-user' || $key eq 'auth-passwd' ||
2208      $key eq 'auth-domain';
2209
2210    if ($key eq 'fetch-options') {
2211      warn "w3mir: The 'fetch-options' directive has been renamed to 'options'\nw3mir: Please change your configuration file.\n";
2212      $key='options';
2213      # Fall through to 'options'!
2214    }
2215
2216    if ($key eq 'options') {
2217
2218      my($val,$nval);
2219      foreach $val (split(/\s*,\s*/,lc $value)) {
2220	if ($i==1) {
2221	  $nval=nway($val,'recurse','no-date-check','only-nonexistent',
2222		     'list-urls','lowercase','remove','batch','read-urls',
2223		     'abs','no-newline-conv','list-nonmirrored');
2224	  $r=1,next if $nval==0;
2225	  $fetch=1,next if $nval==1;
2226	  $fetch=-1,next if $nval==2;
2227	  $list=1,next if $nval==3;
2228	  if ($nval==4) {
2229	    $lc=1;
2230	    $iinline=($lc?"(?i)":"");
2231	    $ipost=($lc?"i":"");
2232	    next ;
2233	  }
2234	  $remove=1,next if $nval==5;
2235	  $convertnl=0,$batch=1,next if $nval==6;
2236	  $read_urls=1,next if $nval==7;
2237	  $abs=1,next if $nval==8;
2238	  $convertnl=0,next if $nval==9;
2239	  $list_nomir=1,next if $nval==10;
2240        } else {
2241	    die "w3mir: options must be the first directive in the config file.\n";
2242	}
2243      }
2244      next;
2245    }
2246
2247    if ($key eq 'disable-headers') {
2248      my($val,$nval);
2249      foreach $val (split(/\s*,\s*/,lc $value)) {
2250	$nval=nway($val,'referer','user');
2251	$do_referer=0,next if $nval==0;
2252	$do_user=0,next if $nval==1;
2253      }
2254      next;
2255    }
2256
2257
2258    if ($key eq 'fixup') {
2259
2260      $fixrc="$file";
2261      # warn "Fixrc: $fixrc\n";
2262
2263      my($val,$nval);
2264      foreach $val (split(/\s*,\s*/,lc $value)) {
2265	$nval=nway($val,'on','run','noindex','off');
2266	$runfix=1,next if $nval==1;
2267	# Disable fixup
2268	$fixup=0,next if $nval==3;
2269	# Ignore everyting else
2270      }
2271      next;
2272    }
2273
2274    die "w3mir: Unrecognized directive ('$key') in config file $file at line $.\n";
2275
2276  }
2277  close(CFGF);
2278
2279  if (defined($w3http::proxypasswd) && $w3http::proxyuser) {
2280    warn "Proxy authentication: ".$w3http::proxyuser.":".
2281      $w3http::proxypasswd."\n" if $verbose>=0;
2282  }
2283
2284}
2285
2286
2287sub wild_re {
2288  # Here we translate unix wildcard subset to to perlre
2289  local($_) = shift;
2290
2291  # Quote anything that's RE and not wildcard: / ( ) \ | { } + $ ^
2292  s~([\/\(\)\\\|\{\}\+)\$\^])~\\$1~g;
2293  # . -> \.
2294  s~\.~\\.~g;
2295  # * -> .*
2296  s~\*~\.\*~g;
2297  # ? -> .
2298  s~\?~\.~g;
2299
2300  # print STDERR "wild_re: $_\n";
2301
2302  return $_ = '/'.$_.'/';
2303}
2304
2305
2306sub numeric {
2307  # Check if argument is numeric?
2308  my ( $number ) = @_ ;
2309  return oct($number) if ($number =~ /\d+/ || $number =~ /\d+.\d+/);
2310  die "Expected a number, got \"$number\"\n";
2311}
2312
2313
2314sub boolean {
2315  my ( $boolean ) = @_ ;
2316
2317  $boolean = lc $boolean;
2318
2319  return 0 if ($boolean eq 'false' || $boolean eq 'off' || $boolean eq '0');
2320  return 1 if ($boolean eq 'true' || $boolean eq 'on' || $boolean eq '1');
2321  die "Expected a boolean, got \"$boolean\"\n";
2322}
2323
2324
2325sub nway {
2326  my ( $value ) = shift;
2327  my ( @values ) = @_;
2328  my ( $val ) = 0;
2329
2330  $value = lc $value;
2331  while (@_) {
2332    return $val if $value eq shift;
2333    $val++;
2334  }
2335  die "Expected one of ".join(", ",@values).", got \"$value\"\n";
2336}
2337
2338
2339sub insert_at_start {
2340  # ark: inserts the first arg at the top of the html in the second arg
2341  # janl: The second arg must be a reference to a scalar.
2342  my( $str, $text_ref ) = @_;
2343  my( @possible ) =("<BODY.*?>", "</HEAD.*?>", "</TITLE.*?>", "<HTML.*?>" );
2344  my( $f, $done );
2345
2346  $done=0;
2347  @_=@possible;
2348
2349  while( $done!=1 && ($f=shift) ){
2350    # print "Searching for: $f\n";
2351    if( $$text_ref =~ /$f/i ){
2352      # print "found it!\n";
2353      $$text_ref =~ s/($f)/$1\n$str/i;
2354      $done=1;
2355    }
2356  }
2357}
2358
2359
2360
2361sub rm_rf {
2362  # Recursively remove directories and other files
2363  # File::Path::rmtree does a similar thing but the messages are wrong
2364
2365  my($remove)=shift;
2366
2367  eval "use File::Find;" unless defined(&finddepth);
2368
2369  die "w3mir: Could not load File::Find module when trying to remove $remove\n"
2370    unless defined(&find);
2371
2372  die "w3mir: Removal safeguard triggered on '$remove'"
2373    if $remove =~ m~/\.\./~ || $remove =~ m~/\.\.$~ || $remove =~ m~/\.$~;
2374
2375  die "rm_rf($remove); ";
2376
2377  finddepth(\&remove_everything,$remove);
2378
2379  if (rmdir($remove)) {
2380    print STDERR "\nw3mir: removed directory $remove\n" if $verbose>=0;
2381  } else {
2382    print STDERR "w3mir: could not remove $remove: $!\n";
2383  }
2384}
2385
2386
2387sub remove_everything {
2388  # This does the removal
2389  ((-d && rmdir($_)) || unlink($_)) && $verbose>=0 &&
2390    print STDERR "w3mir: removed $File::Find::name\n";
2391}
2392
2393
2394
2395sub usage {
2396  my($message)=shift @_;
2397
2398  print STDERR "w3mir: $message\n" if $message;
2399
2400  die 'w3mir: usage: w3mir [options] <single-http-url>
2401          or: w3mir -B [-I] [options] [<http-urls>]
2402
2403    Options :
2404	-agent <agent>	- Set the agent name.  Default is w3mir
2405	-abs		- Force all URLs to be absolute.
2406	-B		- Batch-get documents.
2407	-I		- The URLs to get are read from standard input.
2408	-c		- be more Chatty.
2409	-cfgfile <file> - Read config from file
2410	-d <debug-level>- set debug level to 1 or 2
2411	-drr		- Disable robots.txt rules.
2412	-f		- Forget all files, nothing is saved to disk.
2413	-fa		- Fetch All, will not check timestamps.
2414	-fs		- Fetch Some, do not fetch the files we already have.
2415	-ir <referer>	- Initial referer.  For picky servers.
2416	-l		- List URLs in the documents retrived.
2417	-lc		- Convert all URLs (and filenames) to lowercase.
2418				This does not work reliably.
2419	-p <n>		- Pause n seconds before retriving each doc.
2420	-q		- Quiet, error-messages only
2421        -rp <n>		- Retry Pause in seconds.
2422	-P <server:port>- Use host/port for proxy http requests
2423        -pflush		- Flush proxy server.
2424	-r		- Recursive mirroring.
2425        -R		- Remove files not referenced or not present on server.
2426	-s		- Send output to stdout instead of file
2427	-t <n>		- How many times to (re)try getting a failed doc?
2428        -umask <umask>  - Set umask for mirroring, must be usual octal format.
2429	-nnc		- No Newline Conversion.  Disable newline conversions.
2430	-v		- Show w3mir version.
2431';
2432}
2433__END__
2434# -*- perl -*- There must be a blank line here
2435
2436=head1 NAME
2437
2438w3mir - all purpose HTTP-copying and mirroring tool
2439
2440=head1 SYNOPSIS
2441
2442B<w3mir> [B<options>] [I<HTTP-URL>]
2443
2444B<w3mir> B<-B> [B<options>] <I<HTTP-URLS>>
2445
2446B<w3mir> is a all purpose HTTP copying and mirroring tool.  The
2447main focus of B<w3mir> is to create and maintain a browsable copy of
2448one, or several, remote WWW site(s).
2449
2450Used to the max w3mir can retrive the contents of several related
2451sites and leave the mirror browseable via a local web server, or from
2452a filesystem, such as directly from a CDROM.
2453
2454B<w3mir> has options for all operations that are simple enough for
2455options.  For authentication and passwords, multiple site retrievals
2456and such you will have to resort to a L</CONFIGURATION-FILE>.  If
2457browsing from a filesystem references ending in '/' needs to be
2458rewritten to end in '/index.html', and in any case, if there are URLs
2459that are redirected will need to be changed to make the mirror
2460browseable, see the documentation of B<Fixup> in the
2461L</CONFIGURATION-FILE> secton.
2462
2463B<w3mir>s default behavior is to do as little as possible and to be as
2464nice as possible to the server(s) it is getting documents from.  You
2465will need to read through the options list to make B<w3mir> do more
2466complex, and, useful things.  Most of the things B<w3mir> can do is
2467also documented in the w3mir-HOWTO which is available at the B<w3mir>
2468home-page (F<http://www.math.uio.no/~janl/w3mir/>) as well as in the
2469w3mir distribution bundle.
2470
2471=head1 DESCRIPTION
2472
2473You may specify many options and one HTTP-URL on the w3mir
2474command line.
2475
2476A single HTTP URL I<must> be specified either on the command line or
2477in a B<URL> directive in a configuration file.  If the URL refers to a
2478directory it I<must> end with a "/", otherwise you might get surprised
2479at what gets retrieved (e.g. rather more than you expect).
2480
2481Options must be prefixed with at least one - as shown below, you can
2482use more if you want to. B<-cfgfile> is equivalent to B<--cfgfile> or
2483even B<------cfgfile>.  Options cannot be I<clustered>, i.e., B<-r -R>
2484is not equivalent to B<-rR>.
2485
2486=over 4
2487
2488=item B<-h> | B<-help> | B<-?>
2489
2490prints a brief summary of all command line options and exits.
2491
2492=item B<-cfgfile> F<file>
2493
2494Makes B<w3mir> read the given configuration file.  See the next section
2495for how to write such a file.
2496
2497=item B<-r>
2498
2499Puts B<w3mir> into recursive mode.  The default is to fetch only one
2500document and then quit.  'I<recursive>' mode means that all the
2501documents linked to the given document that are fetched, and all they
2502link to in turn and so on.  But only I<Iff> they are in the same
2503directory or under the same directory as the start document.  Any
2504document that is in or under the starting documents directory is said
2505to be within the I<scope of retrieval>.
2506
2507=item B<-fa>
2508
2509Fetch All.  Normally B<w3mir> will only get the document if it has been
2510updated since the last time it was fetched.  This switch turns that
2511check off.
2512
2513=item B<-fs>
2514
2515Fetch Some.  Not the opposite of B<-fa>, but rather, fetch the ones we
2516don't have already.  This is handy to restart copying of a site
2517incompletely copied by earlier, interrupted, runs of B<w3mir>.
2518
2519=item B<-p> I<n>
2520
2521Pause for I<n> seconds between getting each document.  The default is
252230 seconds.
2523
2524=item B<-rp> I<n>
2525
2526Retry Pause, in seconds.  When B<w3mir> fails to get a document for some
2527technical reason (timeout mainly) the document will be queued for a
2528later retry.  The retry pause is how long B<w3mir> waits between
2529finishing a mirror pass before starting a new one to get the still
2530missing documents.  This should be a long time, so network conditions
2531have a chance to get better.  The default is 600 seconds (10 minutes),
2532which might be a bit too short, for batch running B<w3mir> I would
2533suggest an hour (3600 seconds) or more.
2534
2535=item B<-t> I<n>
2536
2537Number of reTries.  If B<w3mir> cannot get all the documents by the
2538I<n>th retry B<w3mir> gives up.  The default is 3.
2539
2540=item B<-drr>
2541
2542Disable Robot Rules.  The robot exclusion standard is described in
2543http://info.webcrawler.com/mak/projects/robots/norobots.html.  By
2544default B<w3mir> honors this standard.  This option causes B<w3mir> to
2545ignore it.
2546
2547=item B<-nnc>
2548
2549No Newline Conversion.  Normally w3mir converts the newline format of
2550all files that the web server says is a text file.  However, not all
2551web servers are reliable, and so binary files may become corrupted due
2552to the newline conversion w3mir performs.  Use this option to stop
2553w3mir from converting newlines.  This also causes the file to be
2554regarded as binary when written to disk, to disable the implicit
2555newline conversion when saving text files on most non-Unix systems.
2556
2557This will probably be on by default in version 1.1 of w3mir, but not
2558in version 1.0.
2559
2560=item B<-R>
2561
2562Remove files.  Normally B<w3mir> will not remove files that are no
2563longer on the server/part of the retrieved web of files.  When this
2564option is specified all files no longer needed or found on the servers
2565will be removed.  If B<w3mir> fails to get a document for I<any> other
2566reason the file will not be removed.
2567
2568=item B<-B>
2569
2570Batch fetch documents whose URLs are given on the commandline.
2571
2572In combination with the B<-r> and/or B<-l> switch all HTML and PDF
2573documents will be mined for URLs, but the documents will be saved on
2574disk unchanged.  When used with the B<-r> switch only one single URL
2575is allowed.  When not used with the B<-r> switch no HTML/URL
2576processing will be performed at all.  When the B<-B> switch is used
2577with B<-r> w3mir will not do repeated mirrorings reliably since the
2578changes w3mir needs to do, in the documents, to work reliably are not
2579done.  In any case it's best not to use B<-R> in combination with
2580B<-B> since that can result in deleting rather more documents than
2581expected.  Hwowever, if the person writing the documents being copied
2582is good about making references relative and placing the <HTML> tag at
2583the beginning of documents there is a fair chance that things will
2584work even so.  But I wouln't bet on it.  It will, however, work
2585reliably for repeated mirroring if the B<-r> switch is not used.
2586
2587When the B<-B> switch is specified redirects for a given document will
2588be followed no matter where they point.  The redirected-to document
2589will be retrieved in the place of the original document.  This is a
2590potential weakness, since w3mir can be directed to fetch any document
2591anywhere on the web.
2592
2593Unless used with B<-r> all retrived files will be stored in one
2594directory using the remote filename as the local filename.  I.e.,
2595F<http://foo/bar/gazonk.html> will be saved as F<gazonk.html>.
2596F<http://foo/bar/> will be saved as F<bar-index.html> so as to avoid
2597name colitions for the common case of URLs ending in /.
2598
2599=item B<-I>
2600
2601This switch can only be used with the B<-B> switch, and only after it
2602on the commandline or configuration file.  When given w3mir will get
2603URLs from standard input (i.e., w3mir can be used as the end of a pipe
2604that produces URLs.)  There should only be one URL pr. line of input.
2605
2606=item B<-q>
2607
2608Quiet.  Turns off all informational messages, only errors will be
2609output.
2610
2611=item B<-c>
2612
2613Chatty.  B<w3mir> will output more progress information.  This can be
2614used if you're watching B<w3mir> work.
2615
2616=item B<-v>
2617
2618Version.  Output B<w3mir>s version.
2619
2620=item B<-s>
2621
2622Copy the given document(s) to STDOUT.
2623
2624=item B<-f>
2625
2626Forget.  The retrieved documents are not saved on disk, they are just
2627forgotten.  This can be used to prime the cache in proxy servers, or
2628not save documents you just want to list the URLs in (see B<-l>).
2629
2630=item B<-l>
2631
2632List the URLs referred to in the retrieved document(s) on STDOUT.
2633
2634=item B<-umask> I<n>
2635
2636Sets the umask, i.e., the permission bits of all retrieved files.  The
2637number is taken as octal unless it starts with a 0x, in which case
2638it's taken as hexadecimal.  No matter what you set this to make sure
2639you get write as well as read access to created files and directories.
2640
2641Typical values are:
2642
2643=over 8
2644
2645=item 022
2646
2647let everyone read the files (and directories), only you can change
2648them.
2649
2650=item 027
2651
2652you and everyone in the same file-group as you can read, only you can
2653change them.
2654
2655=item 077
2656
2657only you can read the files, only you can change them.
2658
2659=item 0
2660
2661everyone can read, write and change everything.
2662
2663=back
2664
2665The default is whatever was set when B<w3mir> was invoked.  022 is a
2666reasonable value.
2667
2668This option has no meaning, or effect, on Win32 platforms.
2669
2670=item B<-P> I<server:port>
2671
2672Use the given server and port is a HTTP proxy server.  If no port is
2673given port 80 is assumed (this is the normal HTTP port).  This is
2674useful if you are inside a firewall, or use a proxy server to save
2675bandwidth.
2676
2677=item B<-pflush>
2678
2679Proxy flush, force the proxy server to flush it's cache and re-get the
2680document from the source.  The I<Pragma: no-cache> HTTP/1.0 header is
2681used to implement this.
2682
2683=item B<-ir> I<referrer>
2684
2685Initial Referrer.  Set the referrer of the first retrieved document.
2686Some servers are reluctant to serve certain documents unless this is
2687set right.
2688
2689=item B<-agent> I<agent>
2690
2691Set the HTTP User-Agent fields value.  Some servers will serve
2692different documents according to the WWW browsers capabilities.
2693B<w3mir> normally has B<w3mir>/I<version> in this header field.
2694Netscape uses things like B<Mozilla/3.01 (X11; I; Linux 2.0.30 i586)>
2695and MSIE uses things like B<Mozilla/2.0 (compatible; MSIE 3.02;
2696Windows NT)> (remember to enclose agent strings with spaces in with
2697double quotes ("))
2698
2699=item B<-lc>
2700
2701Lower Case URLs. Some OSes, like W95 and NT, are not case sensitive
2702when it comes to filenames.  Thus web masters using such OSes can case
2703filenames differently in different places (apps.html, Apps.html,
2704APPS.HTML).  If you mirror to a Unix machine this can result in one
2705file on the server becoming many in the mirror.  This option
2706lowercases all filenames so the mirror corresponds better with the
2707server.
2708
2709If given it must be the first option on the command line.
2710
2711This option does not work perfectly.  Most especially for mixed case
2712host-names.
2713
2714=item B<-d> I<n>
2715
2716Set the debug level.  A debug level higher than 0 will produce lots of
2717extra output for debugging purposes.
2718
2719=item B<-abs>
2720
2721Force all URLs to be absolute.  If you retrive
2722F<http://www.ifi.uio.no/~janl/index.html> and it references foo.html
2723the referense is absolutified into
2724F<http://www.ifi.uio.no/~janl/foo.html>.  In other words, you get
2725absolute references to the origin site if you use this option.
2726
2727=back
2728
2729=head1 CONFIGURATION-FILE
2730
2731Most things can be mirrored with a (long) command line.  But multi
2732server mirroring, authentication and some other things are only
2733available through a configuration file.  A configuration file can
2734either be specified with the B<-cfgfile> switch, but w3mir also looks
2735for .w3mirc (w3mir.ini on Win32 platforms) in the directory where
2736w3mir is started from.
2737
2738The configuration file consists of lines of comments and directives.
2739A directive consists of a keyword followed by a colon (:) and then one
2740or several arguments.
2741
2742 # This is a comment.  And the next line is a directive:
2743 Options: recurse, remove
2744
2745A comment can only start at the beginning of a line.  The directive
2746keywords are not case-sensitive, but the arguments I<might> be.
2747
2748=over 4
2749
2750=item Options: I<recurse> | I<no-date-check> | I<only-nonexistent> | I<list-urls> | I<lowercase> | I<remove> | I<batch> | I<input-urls> | I<no-newline-conv> | I<list-nonmirrored>
2751
2752This must be the first directive in a configuration file.
2753
2754=over 8
2755
2756=item I<recurse>
2757
2758see B<-r> switch.
2759
2760=item I<no-date-check>
2761
2762see B<-fa> switch.
2763
2764=item I<only-nonexistent>
2765
2766see B<-fs> switch.
2767
2768=item I<list-urls>
2769
2770see B<-l> option.
2771
2772=item I<lowercase>
2773
2774see B<-lc> option.
2775
2776=item I<remove>
2777
2778see B<-R> option.
2779
2780=item I<batch>
2781
2782see B<-B> option.
2783
2784=item I<input-urls>
2785
2786see B<-I> option.
2787
2788=item I<no-newline-conv>
2789
2790see B<-nnc> option.
2791
2792=item I<list-nonmirrored>
2793
2794List URLs not mirrored in a file called .notmirrored ('notmir' on
2795win32).  It will contain a lot of duplicate lines and quite possebly
2796be quite large.
2797
2798=back
2799
2800=item URL: I<HTTP-URL> [I<target-directory>]
2801
2802The URL directive may only appear once in any configuration file.
2803
2804Without the optional target directory argument it corresponds directly
2805to the I<single-HTTP-URL> argument on the command line.
2806
2807If the optional target directory is given all documents from under the
2808given URL will be stored in that directory, and under.  The target
2809directory is most likely only specified if the B<Also> directive is
2810also specified.
2811
2812If the URL given refers to a directory it I<must> end in a "/",
2813otherwise you might get quite surprised at what gets retrieved.
2814
2815Either one URL: directive or the single-HTTP-URL at the command-line
2816I<must> be given.
2817
2818=item Also: I<HTTP-URL directory>
2819
2820This directive is only meaningful if the I<recurse> (or B<-r>)
2821option is given.
2822
2823The directive enlarges the scope of a recursive retrieval to contain
2824the given HTTP-URL and all documents in the same directory or under.
2825Any documents retrieved because of this directive will be stored in the
2826given directory of the mirror.
2827
2828In practice this means that if the documents to be retrieved are stored
2829on several servers, or in several hierarchies on one server or any
2830combination of those.  Then the B<Also> directive ensures that we get
2831everything into one single mirror.
2832
2833This also means that if you're retrieving
2834
2835  URL: http://www.foo.org/gazonk/
2836
2837but it has inline icons or images stored in http://www.foo.org/icons/
2838which you will also want to get, then that will be retrieved as well by
2839entering
2840
2841  Also: http://www.foo.org/icons/ icons
2842
2843As with the URL directive, if the URL refers to a directory it I<must>
2844end in a "/".
2845
2846Another use for it is when mirroring sites that have several names
2847that all refer to the same (logical) server:
2848
2849  URL: http://www.midifest.com/
2850  Also: http://midifest.com/ .
2851
2852At this point in time B<w3mir> has no mechanism to easily enlarge the
2853scope of a mirror after it has been established.  That means that you
2854should survey the documents you are going to retrieve to find out what
2855icons, graphics and other things they refer to that you want.  And
2856what other sites you might like to retrieve.  If you find out that
2857something is missing you will have to delete the whole mirror, add the
2858needed B<Also> directives and then reestablish the mirror.  This lack
2859of flexibility in what to retrieve will be addressed at a later date.
2860
2861See also the B<Also-quene> directive.
2862
2863=item Also-quene: I<HTTP-URL directory>
2864
2865This is like Also, except that the URL itself is also quened.  The
2866Also directive will not cause any documents to be retrived UNLESS they
2867are referenced by some other document w3mir has already retrived.
2868
2869=item Quene: I<HTTP-URL>
2870
2871This is quenes the URL for retrival, but does not enlarge the scope of
2872the retrival.  If the URL is outside the scope of retrival it will not
2873be retrived anyway.
2874
2875The observant reader will see that B<Also-quene> is like B<Also>
2876combined with B<Quene>.
2877
2878=item Initial-referer: I<referer>
2879
2880see B<-ir> option.
2881
2882=item Ignore: F<wildcard>
2883
2884=item Fetch: F<wildcard>
2885
2886=item Ignore-RE: F<regular-expression>
2887
2888=item Fetch-RE: F<regular-expression>
2889
2890These four are used to set up rules about which documents, within the
2891scope of retrieval, should be gotten and which not.  The default is to
2892get I<anything> that is within the scope of retrieval.  That may not
2893be practical though.  This goes for CGI scripts, and especially server
2894side image maps and other things that are executed/evaluated on the
2895server.  There might be other things you want unfetched as well.
2896
2897B<w3mir> stores the I<Ignore>/I<Fetch> rules in a list.  When a
2898document is considered for retrieval the URL is checked against the
2899list in the same order that the rules appeared in the configuration
2900file.  If the URL matches any rule the search stops at once.  If it
2901matched a I<Ignore> rule the document is not fetched and any URLs in
2902other documents pointing to it will point to the document at the
2903original server (not inside the mirror).  If it matched a I<Fetch>
2904rule the document is gotten.  If not matched by any ru�es the document
2905is gotten.
2906
2907The F<wildcard>s are a very limited subset of Unix-wildcards.
2908B<w3mir> understands only 'I<?>', 'I<*>', and 'I<[x-y]>' ranges.
2909
2910The F<perl-regular-expression> is perls superset of the normal Unix
2911regular expression syntax.  They must be completely specified,
2912including the prefixed m, a delimiter of your choice (except the
2913paired delimiters: parenthesis, brackets and braces), and any of the
2914RE modifiers. E.g.,
2915
2916  Ignore-RE: m/.gif$/i
2917
2918or
2919
2920  Ignore-RE: m~/.*/.*/.*/~
2921
2922and so on.  "#" cannot be used as delimiter as it is the comment
2923character in the configuration file.  This also has the bad
2924side-effect of making you unable to match fragment names (#foobar)
2925directly.  Fortunately perl allows writing ``#'' as ``\043''.
2926
2927You must be very carefull of using the RE anchors (``^'' and ``$''
2928with the RE versions of these and the I<Apply> directive. Given the
2929rules:
2930
2931  Fetch-RE: m/foobar.cgi$/
2932  Ignore: *.cgi
2933
2934the all files called ``foobar.cgi'' will be fetched. However, if the
2935file is referenced as ``foobar.cgi?query=mp3'' it will I<not> be
2936fetched since the ``$'' anchor will prevent it from matching the
2937I<Fetch-RE> directive and then it will match the I<Ignore> directive
2938instead. If you want to match ``foobar.cgi'' but not ``foobar.cgifu''
2939you can use perls ``\b'' character class which matches a word
2940boundrary:
2941
2942  Fetch-RE: m/foobar.cgi\b/
2943  Ignore: *.cgi
2944
2945which will get ``foobar.cgi'' as well as ``foobar.cgi?query=mp3'' but
2946not ``foobar.cgifu''. BUT, you must keep in mind that a lot of
2947diffetent characters make a word boundrary, maybe something more
2948subtle is needed.
2949
2950=item Apply: I<regular-expression>
2951
2952This is used to change a URL into another URL.  It is a potentially
2953I<very> powerful feature, and it also provides ample chance for you to
2954shoot your own foot. The whole aparatus is somewhat tenative, if you
2955find there is a need for changes in how Apply rules work please
2956E-mail. If you are going to use this feature please read the
2957documentation for I<Fetch-RE> and I<Ignore-RE> first.
2958
2959The B<Apply> expressions are applied, in sequence, to the URLs in
2960their absolute form. I.e., with the whole
2961http://host:port/dir/ec/tory/file URL. It is only after this B<w3mir>
2962checks if a document is within the scope of retrieval or not. That
2963means that B<Apply> rules can be used to change certain URLs to fall
2964inside the scope of retrieval, and vice versa.
2965
2966The I<regular-expression> is perls superset of the usual Unix regular
2967expressions for substitution.  As with I<Fetch> and I<Ignore> rules it
2968must be specified fully, with the I<s> and delimiting character.  It
2969has the same restrictions with regards to delimiters. E.g.,
2970
2971  Apply: s~/foo/~/bar/~i
2972
2973to translate the path element I<foo> to I<bar> in all URLs.
2974
2975"#" cannot be used as delimiter as it is the comment character in the
2976configuration file.
2977
2978Please note that w3mir expects that URLs identifying 'directories'
2979keep idenfifying directories after application of Apply rules.  Ditto
2980for files.
2981
2982=item Agent: I<agent>
2983
2984see B<-agent> option.
2985
2986=item Pause: I<n>
2987
2988see B<-p> option.
2989
2990=item Retry-Pause: I<n>
2991
2992see B<-rp> option.
2993
2994=item Retries: I<n>
2995
2996see B<-t> option.
2997
2998=item debug: I<n>
2999
3000see B<-d> option.
3001
3002=item umask I<n>
3003
3004see B<-umask> option.
3005
3006=item Robot-Rules: I<on> | I<off>
3007
3008Turn robot rules on of off.  See B<-drr> option.
3009
3010=item Remove-Nomirror: I<on> | I<off>
3011
3012If this is enabled sections between two consecutive
3013
3014  <!--NO MIRROR-->
3015
3016comments in a mirrored document will be removed.  This editing is
3017performed even if batch getting is specified.
3018
3019=item Header: I<html/text>
3020
3021Insert this I<complete> html/text into the start of the document.
3022This will be done even if batch is specified.
3023
3024=item File-Disposition: I<save> | I<stdout> | I<forget>
3025
3026What to do with a retrieved file.  The I<save> alternative is default.
3027The two others correspond to the B<-s> and B<-f> options.  Only one
3028may be specified.
3029
3030=item Verbosity: I<quiet> | I<brief> | I<chatty>
3031
3032How much B<w3mir> informs you of it's progress.  I<Brief> is the
3033default.  The two others correspond to the B<-q> and B<-c> switches.
3034
3035=item Cd: I<directory>
3036
3037Change to given directory before starting work.  If it does not exist
3038it will be quietly created.  Using this option breaks the 'fixup'
3039code so consider not using it, ever.
3040
3041=item HTTP-Proxy: I<server:port>
3042
3043see the B<-P> switch.
3044
3045=item HTTP-Proxy-user: I<username>
3046
3047=item HTTP-Proxy-passwd: I<password>
3048
3049These two are is used to activate authentication with the proxy
3050server.  L<w3mir> only supports I<basic> proxy autentication, and is
3051quite simpleminded about it, if proxy authentication is on L<w3mir>
3052will always give it to the proxy.  The domain concept is not supported
3053with proxy-authentication.
3054
3055=item Proxy-Options: I<no-pragma> | I<revalidate> | I<refresh> | I<no-store>
3056
3057Set proxy options.  There are two ways to pass proxy options, HTTP/1.0
3058compatible and HTTP/1.1 compatible.  Newer proxy-servers will
3059understand the 1.1 way as well as 1.0.  With old proxy-servers only
3060the 1.0 way will work.  L<w3mir> will prefer the 1.0 way.
3061
3062The only 1.0 compatible proxy-option is I<refresh>, it corresponds to
3063the B<-pflush> option and forces the proxy server to pass the request
3064to a upstream server to retrieve a I<fresh> copy of the document.
3065
3066The I<no-pragma> option forces w3mir to use the HTTP/1.1 proxy
3067control header, use this only with servers you know to be new,
3068otherwise it won't work at all.  Use of any option but I<refresh> will
3069also cause HTTP/1.1 to be used.
3070
3071I<revalidate> forces the proxy server to contact the upstream server
3072to validate that it has a fresh copy of the document.  This is nicer
3073to the net than I<refresh> option which forces re-get of the document
3074no matter if the server has a fresh copy already.
3075
3076I<no-store> forbids the proxy from storing the document in other than
3077in transient storage.  This can be used when transferring sensitive
3078documents, but is by no means any warranty that the document can't be
3079found on any storage device on the proxy-server after the transfer.
3080Cryptography, if legal in your contry, is the solution if you want the
3081contents to be secret.
3082
3083I<refresh> corresponds to the HTTP/1.0 header I<Pragma: no-cache> or
3084the identical HTTP/1.1 I<Cache-control> option.  I<revalidate> and
3085I<no-store> corresponds to I<max-age=0> and I<no-store> respectively.
3086
3087=item Authorization
3088
3089B<w3mir> supports only the I<basic> authentication of HTTP/1.0.  This
3090method can assign a password to a given user/server/I<realm>.  The
3091"user" is your user-name on the server.  The "server" is the server.
3092The I<realm> is a HTTP concept.  It is simply a grouping of files and
3093documents.  One file or a whole directory hierarchy can belong to a
3094realm.  One server may have many realms.  A user may have separate
3095passwords for each realm, or the same password for all the realms the
3096user has access to.  A combination of a server and a realm is called a
3097I<domain>.
3098
3099=over 8
3100
3101=item Auth-Domain: I<server:port/realm>
3102
3103Give the server and port, and the belonging realm (making a domain)
3104that the following authentication data holds for.  You may specify "*"
3105wildcard for either of I<server:port> and I<realm>, this will work
3106well if you only have one usernme and password on all the servers
3107mirrored.
3108
3109=item Auth-User: I<user>
3110
3111Your user-name.
3112
3113=item Auth-Passwd: I<password>
3114
3115Your password.
3116
3117=back
3118
3119These three directives may be repeated, in clusters, as many times as
3120needed to give the necessary authentication information
3121
3122=item Disable-Headers: I<referer> | I<user>
3123
3124Stop B<w3mir> from sending the given headers.  This can be used for
3125anonymity, making your retrievals harder to track.  It will be even
3126harder if you specify a generic B<Agent>, like Netscape.
3127
3128=item Fixup: I<...>
3129
3130This directive controls some aspects of the separate program w3mfix.
3131w3mfix uses the same configuration file as w3mir since it needs a lot
3132of the information in the B<w3mir> configuration file to do it's work
3133correctly.  B<w3mfix> is used to make mirrors more browseable on
3134filesystems (disk or CDROM), and to fix redirected URLs and some other
3135URL editing.  If you want a mirror to be browseable of disk or CDROM
3136you almost certainly need to run w3mfix.  In many cases it is not
3137necessary when you run a mirror to be used through a WWW server.
3138
3139To make B<w3mir> write the data files B<w3mfix> needs, and do nothing
3140else, simply put
3141
3142=over 8
3143
3144  Fixup: on
3145
3146=back
3147
3148in the configuration file.  To make B<w3mir> run B<w3mfix>
3149automatically after each time B<w3mir> has completed a mirror run
3150specify
3151
3152=over 8
3153
3154  Fixup: run
3155
3156=back
3157
3158L<w3mfix> is documented in a separate man page in a effort to not
3159prolong I<this> manpage unnecessarily.
3160
3161=item Index-name: I<name-of-index-file>
3162
3163When retriving URLs ending in '/' w3mir needs to append a filename to
3164store it localy.  The default value for this is 'index.html' (this is
3165the most used, its use originated in the NCSA HTTPD as far as I know).
3166Some WWW servers use the filename 'Welcome.html' or 'welcome.html'
3167instead (this was the default in the old CERN HTTPD).  And servers
3168running on limited OSes frequently use 'index.htm'.  To keep things
3169consistent and sane w3mir and the server should use the same name.
3170Put
3171
3172  Index-name: welcome.html
3173
3174when mirroring from a site that uses that convention.
3175
3176When doing a multiserver retrival where the servers use two or more
3177different names for this you should use B<Apply> rules to make the
3178names consistent within the mirror.
3179
3180When making a mirror for use with a WWW server, the mirror should use
3181the same name as the new server for this, to acomplish that
3182B<Index-name> should be combined with B<Apply>.
3183
3184Here is an example of use in the to latter cases when Welcome.html is
3185the prefered I<index> name:
3186
3187  Index-name: Welcome.html
3188  Apply: s~/index.html$~/Welcome.html~
3189
3190Similarly, if index.html is the prefered I<index> name.
3191
3192  Apply: s~/Welcome.html~/index.html~
3193
3194I<Index-name> is not needed since index.html is the default index name.
3195
3196=back
3197
3198=head1 EXAMPLES
3199
3200=over 4
3201
3202=item * Just get the latest Dr-Fun if it has been changed since the last
3203time
3204
3205 w3mir http://sunsite.unc.edu/Dave/Dr-Fun/latest.jpg
3206
3207=item * Recursively fetch everything on the Star Wars site, remove
3208what is no longer at the server from the mirror:
3209
3210 w3mir -R -r http://www.starwars.com/
3211
3212=item * Fetch the contents of the Sega site through a proxy, pausing
3213for 30 seconds between each document
3214
3215 w3mir -r -p 30 -P www.foo.org:4321 http://www.sega.com/
3216
3217=item * Do everything according to F<w3mir.cfg>
3218
3219 w3mir -cfgfile w3mir.cfg
3220
3221=item * A simple configuration file
3222
3223 # Remember, options first, as many as you like, comma separated
3224 Options: recurse, remove
3225 #
3226 # Start here:
3227 URL: http://www.starwars.com/
3228 #
3229 # Speed things up
3230 Pause: 0
3231 #
3232 # Don't get junk
3233 Ignore: *.cgi
3234 Ignore: *-cgi
3235 Ignore: *.map
3236 #
3237 # Proxy:
3238 HTTP-Proxy: www.foo.org:4321
3239 #
3240 # You _should_ cd away from the directory where the config file is.
3241 cd: starwars
3242 #
3243 # Authentication:
3244 Auth-domain: server:port/realm
3245 Auth-user: me
3246 Auth-passwd: my_password
3247 #
3248 # You can use '*' in place of server:port and/or realm:
3249 Auth-domain: */*
3250 Auth-user: otherme
3251 Auth-user: otherpassword
3252
3253=item Also:
3254
3255 # Retrive all of janl's home pages:
3256 Options: recurse
3257 #
3258 # This is the two argument form of URL:.  It fetches the first into the second
3259 URL: http://www.math.uio.no/~janl/ math/janl
3260 #
3261 # These says that any documents refered to that lives under these places
3262 # should be gotten too.  Into the named directories.  Two arguments are
3263 # required for 'Also:'.
3264 Also: http://www.math.uio.no/drift/personer/ math/drift
3265 Also: http://www.ifi.uio.no/~janl/ ifi/janl
3266 Also: http://www.mi.uib.no/~nicolai/ math-uib/nicolai
3267 #
3268 # The options above will result in this directory hierarchy under
3269 # where you started w3mir:
3270 # w3mir/math/janl		files from http://www.math.uio.no/~janl
3271 # w3mir/math/drift		from http://www.math.uio.no/drift/personer/
3272 # w3mir/ifi/janl		from http://www.ifi.uio.no/~janl/
3273 # w3mir/math-uib/nicolai	from http://www.mi.uib.no/~nicolai/
3274
3275=item Ignore-RE and Fetch-RE
3276
3277 # Get only jpeg/jpg files, no gifs
3278 Fetch-RE: m/\.jp(e)?g$/
3279 Ignore-RE: m/\.gif$/
3280
3281=item Apply
3282
3283As I said earlier, B<Apply> has not been used for Real Work yet, that
3284I know of.  But B<Apply> I<could>, be used to map all web servers at
3285the university of Oslo inside the scope of retrieval very easily:
3286
3287  # Start at the main server
3288  URL: http://www.uio.no/
3289  # Change http://*.uio.no and http://129.240.* to be a subdirectory
3290  # of http://www.uio.no/.
3291  Apply: s~^http://(.*\.uio\.no(?:\d+)?)/~http://www.uio.no/$1/~i
3292  Apply: s~^http://(129\.240\.[^:]*(?:\d+)?)/~http://www.uio.no/$1/~i
3293
3294
3295=back
3296
3297There are two rather extensive example files in the B<w3mir> distribution.
3298
3299=head1 BUGS
3300
3301=over 4
3302
3303=item The -lc switch does not work too well.
3304
3305=back
3306
3307=head1 FEATURES
3308
3309These are not bugs.
3310
3311=over 4
3312
3313=item URLs with two /es ('//') in the path component does not work as
3314some might expect.  According to my reading of the URL spec. it is an
3315illegal construct, which is a Good Thing, because I don't know how to
3316handle it if it's legal.
3317
3318=item If you start at http://foo/bar/ then index.html might be gotten
3319twice.
3320
3321=item Some documents point to a point above the server root, i.e.,
3322http://some.server/../stuff.html.  Netscape, and other browsers, in
3323defiance of the URL standard documents will change the URL to
3324http://some.server/stuff.html.  W3mir will not.
3325
3326=item Authentication is I<only> tried if the server requests it.  This
3327might lead to a lot of extra connections going up and down, but that's
3328the way it's gotta work for now.
3329
3330=back
3331
3332=head1 SEE ALSO
3333
3334L<w3mfix>
3335
3336=head1 AUTHORS
3337
3338B<w3mir>s authors can be reached at I<w3mir-core@usit.uio.no>.
3339B<w3mir>s home page is at http://www.math.uio.no/~janl/w3mir/
3340