1# -*- perl -*-
2# w3http.pm	--- send http requests, janl's 12" mix for w3mir
3#
4$VERSION=1.0.23;
5#
6# This implements http/1.0 requests.  We'll have problems with http/0.9
7# This is in no way specific to w3mir.
8#
9# IMPORTANT: The caller should initialize the C locale for some of the
10#   things here to work correctly (specifically the strftime function).
11#
12# This is a rewrite of http.pl by Oscar Nierstrasz; I copied the code he he
13# copied from the camel book.  Some functions written by Gorm Haug Eriksen
14# (gorm@usit.uio.no) has been used as is.
15#
16# Contributors:
17#   Nicolai Langfeldt, janl@ifi.uio.no
18#   Gorm Haug Eriksen, gorm@usit.uio.no
19#   Chris Szurgot, szurgot@itribe.net
20#   Bernhard Weisshuh, bkw@weisshuhn.de
21# Copying and modification is governed by the "Artistic License" enclosed in
22# the w3mir distribution
23#
24# gorm :
25# &w3http::get_last_modified  return the last modified stamp on a file in
26#                         the right format for use with http
27#
28# janl:
29# &http::query: Send a http query.  A completely general function to send a
30#   http query.  Will extract header values, http response code and, optionaly,
31#   convert text files to local linefeed format.
32#
33# Variables to examine after a query
34# $w3http::document: The document returned by the query, if any.
35# $w3http::doclen: The length of the document
36# $w3http::result: The numerical http result code.  It may take these values:
37# - Normal HTTP reply codes
38# -  98: OS error, permanent, errormessage in $!
39# -  99: Other permanent error: see $restext.
40# - 100: Transient error: Timeout/connection broken
41# $w3http::restext: The english(?) HTTP result or w3http generated message
42# $w3http::header: The http header returned.
43# $w3http::plaintexthtml: 1 if this doc is non-content-encoded text/html
44#	(as opposed to content-encoding: compressed content-type: text/html
45#       which needs decompression before we can inspect the html)
46#	The tests are somewhat longwinded so I do it just once here.
47# %w3http::headval: Associative array of header values
48# $w3http::headval{'CONTENT-TYPE'}: Derived content type, stripped of charset
49#		qualifiers and other distractions.
50# $w3http::xfbytes: Transfered bytes, cumulative.  Document part only.
51# $w3http::headbytes: Bytes of headers received, cumulative.
52#
53# Variables that change http's behaviour/requests:
54# $w3http::agent: User agent, default is basename of $0
55# $w3http::from: Request is from, default is user@host
56# $w3http::version: The http version to use, only 1.0 is known to me.
57# $w3http::timeout: How long to wait for new data to arrive, default is 600sec
58# $w3http::buflen: Network read buffer size, default is 4096.  It might give a
59#	speedup to tune this for specific servers' so it matches their send
60#	size.  This size can be detected if we want to, I think.
61# $w3http::debug: 1 debuging output, 2, more, 3 queries and replies
62# $w3http::verbose: 0: say nothing, 1: print progress info
63# $w3http::convert: Convert text/* documents to local newline convention?
64#	The default is to do it.
65# $w3http::proxyserver: The name of the proxyserver to use.
66# $w3http::proxyport: The port of the proxyserver to use. 0 if no proxyserver.
67# $w3http::proxyuser: If this is set proxy authentication will be used.
68# $w3http::proxypasswd: The password for proxy authentication
69#
70# Things gotten from main:
71# - $main::win32: 1 if win32 restrictions apply to this system
72# - $main::nulldevice: Bit sink file/device on this system.
73#
74# History (european date format dd/mm/yy):
75#     janl ??/??/95 -- Rewrite finished
76#  szurgot ??/??/95 -- Win32 compatability
77#     janl 16/05/96 -- Added SAVEBIN option, based on idea by szurgot
78#  szurgot 03/05/96 -- Corrected typo in check for content-length against
79#                      retreive document length. Added test for zero-length
80#		       documents (Not retreived because not-modified)
81#  szurgot 19/05/96 -- Win32 adaptions, fixes.
82#     janl 19/05/96 -- Chris won an argument, and janl simplified http
83#		       retrival loop (-> version 1.0.4)
84#     janl 09/09/96 -- Incorporated a patch submited by Michael Kriby -> 1.0.5
85#     janl 16/09/96 -- Support for authorization. -> 1.0.6
86#     janl 27/09/96 -- Support for Accept header, lack pointed out by
87# 			charles@ermine.ox.ac.uk: ... HTTP/1.1 (�14.1) says
88#			``If no Accept header field is present, then it is
89#			assumed that the client accepts all media types,
90#			earlier versions of the protocol suggest that only
91#			text/plain and text/html will be offered by default.''
92#			This contradicts my memory of a http/1.0 draft.
93#			Also added $ACCEPT option.
94#     janl 20/10/96 -- Now uses HTTP::Date to produce HTTP timestamps -> 1.0.7
95#     janl 27/10/96 -- Didn't use to check if gethostbyname worked -> 1.0.8
96#     janl 02/12/96 -- Forgot a unlink when renaming temporary files.
97#     janl 21/02/97 -- Multipele $ACCEPT options work. -> 1.0.9
98#     janl 19/03/97 -- Now issues Host: header -> 1.0.10
99#     janl 10/04/97 -- Changed from wwwurl to URI::URL, and various related
100#                      changes. -> 1.0.11
101#     janl 09/05/97 -- Microsoft ISS servers are _so_ broken -> 1.0.12
102#                      (don't close the write end of the HTTP socket after
103#			sending a query to them)
104#     janl 12/05/97 -- New version of perl caught some typos, fixed
105#			longstanding bug in the newline conversion bit.
106#			-> 1.0.13
107#     janl 06/06/97 -- Demand Loading of MIME::BASE64 -> 1.0.14
108#     janl 01/12/97 -- FAT filesystems drops LSB of modtime.  Patch from
109#			Greg Lindhorst (gregli@microsoft.com)
110#                   -- whoami does not exist on win32, hardwire a default
111#			value (unknown) (also Greg L.) -> 1.0.15
112#     janl 01/22/98 -- Proxy authentication as outlined by Christian Geuer
113#     janl 02/20/98 -- Complex 'content-type' headers handled. -> 1.0.17
114#     janl 04/20/98 -- Only newline convert text/html, everything else is
115#			handled as binary. -> 1.0.18
116#     janl 12/05/98 -- Store tmpfile in its final destination directory
117#			avoiding asking movefile move it across filesystems.
118#			-> 1.0.19
119#     janl 01/08/98 -- Timeout fix from Michael Gusev, also flag short doc
120#			as error.
121#     janl 24/09/98 -- Better error handling -> 1.0.20
122#     bkw  17/12/98 -- Fixed problem with tempfile-generation when
123#                      running in forget-mode (-f)
124#     janl 05/01/99 -- Referer: dropped if argument not true -> 1.0.21
125#     janl 13/04/99 -- Added workaround for broken win32 perl resolving.
126#     janl 15/01/00 -- Patch to adapt to URI 1.0 from Takuya Tsumura and
127#                      Andrey A. Chernov
128#     ams  02/02/01 -- Handle URLs with spaces better (use epath)
129
130package w3http;
131
132require 5.002;
133use Socket;
134use HTTP::Date;
135use Sys::Hostname;
136use URI::URL;
137
138# Suplementary libwww-perl:
139sub URI::URL::_generic::basename {
140  my $self = shift;
141  my @p = $self->path_components;
142  my $old = $p[-1];
143  if (@_) {
144    splice(@p, -1, 1, shift);
145    $self->path_components(@p)
146  }
147  $old;
148}
149
150# The URI 1.0 library changed the internal organization a bit
151# Thanks to Andrey A. Chernov for the patch!
152
153sub URI::_generic::basename {
154  my $self = shift;
155  my @p = $self->path_segments;
156  my $old = $p[-1];
157  if (@_) {
158    splice(@p, -1, 1, shift);
159    $self->path_segments(@p)
160  }
161  $old;
162}
163
164
165END {
166  # Remove tmp file and such in here.  That means that main:: gotta catch
167  # interrupt signals and exit on them, so ENDs are executed.
168}
169
170use strict;
171# Global variables, we want to share them:
172use vars qw($GET $HEAD $GETURL $HEADURL $IFMOD $IFMODF $AUTHORIZ $REFERER);
173use vars qw($SAVEBIN $ACCEPT $NOUSER $FREEHEAD $agent $version $timeout);
174use vars qw($debug $convert $proxyserver $proxyport $xfbytes $headbytes);
175use vars qw($verbose $result $restext $header $document);
176use vars qw($plaintexthtml %headval $progress $doclen $proxyuser);
177use vars qw($proxypasswd);
178
179my $hasAlarm;   # Win32 does not have any alarm
180my $chime;	# Has the alarm gone off yet?
181my %address;	# My own DNS cache
182my $savALRM;	# Saved ALRM handler
183my $savPIPE;	# Saved PIPE handler
184
185# The main:: program should detect if we're running on win32 or not,
186# somehow
187if ($main::win32) {
188  warn "win32\n";
189  # Compensate for lacks of win32 perl.
190  $hasAlarm=0;
191  # Seems to be unavailable in win32/perl5.001.  It has to be in 5.003!
192#  eval "sub sockaddr_in {
193#	($port, $thataddr) = @_;
194#	$sockaddr = 'S n a4 x8';
195#	return pack($sockaddr, &AF_INET, $port, $thataddr);
196#    }";
197} else {
198  $hasAlarm=1;
199}
200
201
202# Find out some things
203my $thishost = hostname();
204my $proto = getprotobyname("tcp");
205
206(my $name, undef) = gethostbyname($thishost);
207chomp(my $user = $ENV{'LOGNAME'} || $ENV{'USER'} || `whoami` || 'unknown');
208my $from   = "$user\@$name";
209
210my $nl = "\r\n";
211# Default values, change by assignment in using-program.
212$agent  = $0; $agent =~ s~.*/~~; # Basename
213$version= "1.0";
214$timeout= 600;			# Timeout while waiting for data/connection
215my $buflen = 4096;		# recv buffer length
216$debug = 0;			# Debuging output?
217$convert = 1;			# Convert newlines of text docs to local format
218$proxyserver='';		# Proxy server.
219$proxyport=0;			# Proxy server port. 0 if no proxy.
220$proxyuser='';			# Username for proxy authentication
221$proxypasswd='';		# Password for proxy authentication
222$xfbytes=0;			# 0 bytes transfered, cumulative
223$headbytes=0;			# 0 bytes of headers, cumulative
224$doclen=0;			# 0 bytes in doc, pr. document
225my $tmpfile="w3mir$$.tmp";	# Temporary filename
226$verbose=0;			# Verbosenes, 0: silent, 1: progress info
227
228# Query opcodes
229$GET = 1;			# GET query. Arg: host,port,path
230$HEAD = 2;			# HEAD query. Arg: host,port,path
231$GETURL = 3;			# GET query. Arg: url
232$HEADURL = 4;			# HEAD query. Arg: url
233# Here we lack PUT, which is not implemented
234# Modify query thus:
235$IFMOD = 101;			# If-modified after: Arg: HTTP-date-str
236$IFMODF = 102;			# If-modified after file: Arg: local-file-name
237$AUTHORIZ= 103;			# Basic authorization. Arg: 'user:password'
238$REFERER = 104;			# Referer: Arg: Referer
239$SAVEBIN = 105;			# Write binary files to disk. Arg: File name
240				# If this opcode is used then main must provide
241				# a &main::movefile(oldname,newname) procedure
242				# that handles moving the tmp file to the
243				# final name/location.
244$ACCEPT  = 106;			# Accept header value: Arg: value
245$NOUSER  = 107;			# Don't insert user header.  Arg: none
246$FREEHEAD= 999;			# Freeform header, one line.  Arg: header
247
248sub query {
249  # Build and send a HTTP query.  And also receive response - janl 95/09/18
250  #
251  # Return codes: 0 if it didn't work.  1 if it did work.
252  # HTTP style result code in w3http::$result and message in w3http::$restext
253
254  # We do next to no argument type checking btw.
255
256  my($host,$port,$request,$query,$method,$inp,$linp,$saveto,$save,$arg);
257  my($start,$wantbytes,$thataddr,$err,$headb,$tmpf,$ldoc,$nouser,$q,$accept);
258  my($origreq,$req_o,$plaintext);
259
260  # Something ought to be said
261  $result=99;
262  $restext='w3http: internal error';
263  $nouser=0;
264
265  if ($version ne '1.0') {
266    warn "Unknown HTTP version $version, no request sent\n";
267    return 0;
268  }
269
270  $accept=$saveto=$query='';
271
272  # Find out what to ask for
273
274  while (defined($arg=shift)) {
275    if ($arg == $GET) {
276      $host=shift;
277      $port=shift;
278      $request=shift;
279      $req_o=url 'http://'.$host.':'.$port.$request;
280      if ($proxyport) {
281	$query.='GET http://'.$req_o->as_string;
282      } else {
283	$query.='GET '.$req_o->epath;
284      }
285      $query.=' HTTP/'.$version.$nl;
286    } elsif ($arg == $HEAD) {
287      $host=shift;
288      $port=shift;
289      $request=shift;
290      $req_o=url 'http://'.$host.':'.$port.$request;
291      if ($proxyport) {
292	$query.='HEAD '.$req_o->as_string;
293      } else {
294	$query.='HEAD '.$req_o->epath;
295      }
296      $query.=' HTTP/'.$version.$nl;
297    } elsif ($arg == $GETURL) {
298      $req_o=shift;
299      $req_o=url $req_o unless ref $req_o;
300      ($method,undef,undef,$host,$port,$request,undef,$q) = $req_o->crack;
301      if ($proxyport) {
302	$query.='GET '.$req_o->as_string;
303      } else {
304	$q=$req_o->equery;
305	$query.='GET '.($req_o->epath).($q?"?$q":'');
306      }
307      $query.=' HTTP/'.$version.$nl;
308    } elsif ($arg == $HEADURL) {
309      $req_o=shift;
310      $req_o=url $req_o unless ref $req_o;
311      if ($proxyport) {
312	$query.='HEAD '.$req_o->as_string;
313      } else {
314	$q=$req_o->equery;
315	$query.='HEAD '.$req_o->epath.($q?"?$q":'');
316      }
317      $query.=' HTTP/'.$version.$nl;
318    } elsif ($arg == $IFMOD) {
319      $query.='If-Modified-Since: '.(shift).$nl;
320    } elsif ($arg == $IFMODF) {
321      $query.='If-Modified-Since: '.&last_modified(shift).$nl;
322    } elsif ($arg == $AUTHORIZ) {
323      # Demand-load MIME::Base64
324      if (!defined(&MIME::Base64::encode)) {
325	eval "use MIME::Base64;";
326	die "w3http: Could not load MIME::Base64 module necessary for authentication\n"
327	  unless defined(&MIME::Base64::encode);
328      }
329      $query.='Authorization: Basic '.MIME::Base64::encode(shift,'').$nl;
330    } elsif ($arg == $REFERER) {
331      my($referer)=shift;
332      $query.='Referer: '.$referer.$nl if $referer;
333    } elsif ($arg == $SAVEBIN) {
334      $saveto=shift;
335    } elsif ($arg == $ACCEPT) {
336      $accept.='Accept: '.(shift).$nl;
337    } elsif ($arg == $NOUSER) {
338      $nouser=1;
339    } elsif ($arg == $FREEHEAD) {
340      $query.=(shift).$nl;
341    } else {
342      warn "Unknown http query opcode: $arg\n";
343    }
344    # Insert the last parts of the query:
345  }
346
347  $query.='Host: '.$req_o->netloc.$nl;
348  $query.='From: '.$from.$nl unless $nouser;
349
350  $accept='Accept: */*'.$nl unless $accept;
351
352  if ($proxyport) {
353    # Use proxy instead of originserver
354    $host=$proxyserver;
355    $port=$proxyport;
356
357    # Add authentication stuff to query
358    if ($proxyuser) {
359      # Demand-load MIME::Base64
360      if (!defined(&MIME::Base64::encode)) {
361	eval "use MIME::Base64;";
362	die "w3http: Could not load MIME::Base64 module necessary for authentication\n"
363	  unless defined(&MIME::Base64::encode);
364      }
365
366      $query.='Proxy-Authorization: Basic '.
367	MIME::Base64::encode($proxyuser.':'.$proxypasswd);
368
369      print STDERR "\nProxyuser: [$proxyuser]\nProxypasswd: [$proxypasswd]\n"
370	if $debug>=2;
371    }
372  }
373
374  $query.='User-Agent: '.$agent.$nl.$accept.$nl;
375
376  # If we're using proxy then set up things...
377  print STDERR "\nQUERY:\n",$query,"---\n" if $debug>=2;
378
379  # win32 fix: this should be added in case of troubles with
380  # gethostbyname. possible reason: nameserver down?
381  if ($host =~ /^\d+(\.\d+){3}$/) {
382    # in case gethostbyname will not work ... ;-)
383    $address{$host} = pack 'C4', (split /\./, $host);
384  }
385
386  # Find out who to ask, check if we know already
387  if (exists($address{$host})) {
388    # We know
389    $thataddr=$address{$host};
390  } else {
391    # Cache miss, get and remember.
392    (my $fqdn, undef, undef, undef, $thataddr) = gethostbyname($host);
393    # Hostname lookup failure?  Cache even misses.
394    if (defined($fqdn)) {
395      print STDERR "Lookup of $host:\nFQDN: $fqdn\n"
396	if $debug;
397      $address{$host}=$thataddr;
398      $address{$fqdn}=$thataddr if $fqdn ne $host;
399    } else {
400      $thataddr=$address{$host}=undef;
401    }
402  }
403
404  # Check if lookup failure, return
405  if (!defined($thataddr)) {
406    $restext='Host lookup failure';
407    return;
408  }
409
410  $port=80 unless defined($port) && $port;
411
412  # When connected we might receive SIGPIPE.  I'm not sure if the
413  # default behaviour of dying is beneficial in that case.  If we get
414  # alarm a timeout has expired.
415  $savPIPE = $SIG{'PIPE'};
416  $savALRM = $SIG{'ALRM'};
417
418  $chime=0;			# There has been no alarm yet
419  $SIG{'ALRM'} = \&timeout;
420  $SIG{'PIPE'} = \&ignore;
421
422  # Close the socket, just in case, and ignore error returns
423  close(FS);
424
425  socket(FS, AF_INET, SOCK_STREAM, $proto) or return &oserror;
426  warn "Got my socks on\n" if $debug;
427
428  my $paddr = sockaddr_in($port, $thataddr);
429  connect(FS, $paddr) or return &oserror;
430  warn "Connected\n" if $debug;
431
432  # Arrange timeout
433  alarm($timeout) if $hasAlarm;
434
435  # We have, in fact, received SIGPIPE on this line:
436  send(FS,$query,0) or return &oserror;
437
438  if ($chime) {
439    $result=100;
440    $restext='timeout sending query';
441    return &resetsign;
442  }
443
444  $header='';
445  $document='';
446  $inp=' 'x$buflen;
447  $doclen=$chime=$plaintext=$plaintexthtml=$save=0;
448
449  # Breaks some M$ ISS servers:
450  # shutdown(FS,1);  # Half-close socket, sending now not allowed
451
452  print STDERR ", receiving header" if $verbose>0;
453
454  # Retrive HTTP response HEADER.  Why do I use recv and not <FS>?
455  # Because then the timeout can work correctly!
456  while (1) {
457    # Set up alarm to ensure recv returns within a reasonable timeframe
458    alarm($timeout) if $hasAlarm;
459    $err = recv(FS,$inp,$buflen,0);
460    # recv returned, cancel alarm.
461    alarm(0) if $hasAlarm;
462
463    # If there has been a timeout, then we quit now.  The recv man page
464    # does not seem to allow recv to return the bytes received up to
465    # the timeout.
466    if ($chime) {
467      $result=100;
468      $restext='timeout fetching document';
469      $!=0;
470      if ($save) {
471	unlink($tmpf) ||
472	  warn "Could not unlink $tmpf: $!\n";
473      }
474      return &resetsign;
475    }
476
477    # recv returnes the undefined value on error
478    if (!defined($err)) {
479      warn "Error in recv: $!\n";
480      last;
481    }
482
483    $linp=length($inp);
484
485    # If the returned input was 0 in length then we've gotten to the
486    # end of the response.
487    last unless $linp;
488
489    # Accounting
490    $xfbytes += $linp;
491    $doclen += $linp;
492
493    # Accumulate input
494    $header.=$inp;
495
496    # eof(SOCKET) has strange semantics it seems
497    # last if eof(FS);
498
499    # Check if header is complete
500    last if ($header =~ m/(\r?\n\r?\n)/);
501  }
502
503  my $orighead = $header;
504
505  if (length($header)==0) {
506    $restext='the HTTP reply header is empty!';
507    return &resetsign;
508  }
509
510  if ($header =~ m/(\r?\n\r?\n)/) {
511    if ($`) {
512      $header=$`;
513      $document=$';
514    }
515  }
516
517  # Adjust accounting
518  $headb = length($header)+length($1);
519  $headbytes += $headb;
520  $xfbytes -= $headb;
521  $doclen -= $headb;
522
523  # Pick headers to pieces
524  ($result,$restext,%headval)=&analyze_header($header);
525
526  if (!$result) {
527    print "\n\nw3mir: BOGUS HTTP REPLY:\n-----\n$header\n-----\n";
528    print "\n\nw3mir: UNPROCESSED REPLY:\n-----\n$orighead\n-----\n";
529    print "\nw3mir: QUERY WAS:\n-----\n$query\n-----\n";
530    die;
531  }
532
533  print STDERR "REPLY:\n",$header,"\n---\n" if $debug>=2;
534
535  # Check if the document is a non-encoded text document. The contents
536  # could be (x-)?compress or (x-)gzip coded (compressed in other
537  # words).
538
539  $plaintext=defined($headval{'CONTENT-TYPE'}) &&
540    (substr($headval{'CONTENT-TYPE'},0,5) eq 'text/' || 0) &&
541      !defined($headval{'content-encoding'});
542  $plaintexthtml=$plaintext &&
543    ($headval{'CONTENT-TYPE'} eq 'text/html');
544
545  if ($result==200) {
546
547    # Save this to a file, or not?  Never save html files.
548    if ($saveto && !$plaintexthtml) {
549      # We're going to save this document directly into a file.  This
550      # stresses the VM less when getting the large binares so often
551      # found at cool sites.
552      $save=1;
553
554      my($slash)=($saveto =~ /^\//);
555      # Find a temporary filename
556      $tmpf=url "file:$saveto";
557      $tmpf->basename($tmpfile);
558      $tmpf=$tmpf->unix_path;
559      $tmpf =~ s~^/~~ if (!$slash);
560
561      # Find suitable final filename, one with no URL escapes
562      $saveto=(url "file:$saveto")->unix_path;
563      $saveto =~ s~^/~~ if (!$slash);
564
565      # If output to stdout then send it directly there rather than
566      # using disk unnecesarily.
567      $tmpf='-' if ($saveto eq '-');
568
569      # If output is nulldevice (running -f), use it also for tmpfile,
570      # since it would otherwise try to create it in /dev under unix.
571      $tmpf=$main::nulldevice if ($saveto eq $main::nulldevice);
572
573      warn "USING TMPFILE: $tmpf\n" if $debug;
574
575      open(SAVE,">$tmpf") ||
576	die "Could not open tmp file: $tmpf: $!\n";
577      binmode SAVE;		# It's a binary file...
578    }
579
580    if ($verbose>0) {
581      print STDERR ", document";
582      print STDERR "->disk" if $save;
583    }
584
585    # Now retrive document itself.  Se comments in header loop
586    $start=time;
587    $wantbytes = defined($headval{'content-length'})?
588      $headval{'content-length'}:0;
589
590    $ldoc=length($document);
591
592    while (1) {
593      alarm($timeout) if $hasAlarm;
594      recv(FS,$inp,$buflen,0);
595      alarm(0) if $hasAlarm;
596
597      if ($chime) {
598	$result=100;
599	$restext='timeout fetching document';
600	$!=0;
601	if ($save) {
602	  unlink($tmpf) ||
603	    warn "Could not unlink $tmpf: $!\n";
604	}
605	return &resetsign;
606      }
607
608      $linp=length($inp);
609
610      last unless $linp || $ldoc;
611      $ldoc = 0;
612
613      $xfbytes += $linp;
614      $doclen += $linp;
615
616      if ($verbose>0 && time-$start>5) {
617	# Write progress info ...
618	if ($wantbytes) {
619	  $progress = sprintf " %3d%%", $doclen/$wantbytes*100;
620	} else {
621	  $progress = sprintf " %d", $doclen;
622	}
623	print STDERR $progress, "\ch"x(length($progress));
624	# ...every 5 seconds
625	$start=time;
626      }
627
628      $document.=$inp;
629
630      if ($save) {
631	$err = print SAVE $document;
632	die "Error writing $tmpf: $!\n" unless $err;
633	$document='';
634      }
635
636      # The eof test seems to work very oddly for sockets.
637      # last if eof(FS);
638    }
639
640    close(FS);  # Close socket completely
641
642    print STDERR "DOCUMENT:\n----\n",$document,"\n----\n" if $debug>=255;
643
644    if ($wantbytes &&
645	$wantbytes != $doclen) {
646      $result=100;
647      $restext='transfer error; too many bytes in document';
648      $restext='document was incomplete' if ($wantbytes > $doclen) ;
649      print STDERR "SHORT DOCUMENT" if $debug>=16;
650      if ($save) {
651	unlink($tmpf) || warn "Could not unlink $tmpf: $!\n";
652      }
653      return &resetsign;
654    }
655
656    # warn "XFB: $xfbytes, DL: $doclen\n";
657    if ($save) {
658      close(SAVE);
659      &main::movefile($tmpf,$saveto);
660    }
661
662    # If this is a non-encoded text file and we're supposed to convert
663    # foreign newlines then we do it. It would be faster to do this
664    # with each chunk of input in the input loop, but this gives us
665    # two problems:
666    # - A \r\n newline could be split into two chunks.  Thus escaping
667    #   newline conversion.
668    # - It messes up the received bytes accounting rather badly.
669    #
670    # This used to be a test for $plaintext, the problem is that too
671    # many documents were typed as text/plain and so we corrupted
672    # binary files. This is bad. So now we're more paranoid about it:
673    # Only HTML gets converted.
674    if ($convert && $plaintexthtml) {
675      # Change non unix newlines to unix newlines. bare \r is known
676      # from macintosh (they hadta be different didn't they?), \r\n is
677      # known as 'network format' and from numerous systems, among
678      # them ms-dos.
679      $document =~ s~\r~\n~g unless $document =~ s~\r\n~\n~g;
680      warn "Newlines converted(?)\n" if $debug;
681    }
682
683  }				# if $result == 200
684
685  &resetsign;
686  return 1;
687}
688
689
690sub analyze_header {
691  my($header)=@_;
692  my($result,$restext,%headval,$hdln,$key,$value);
693
694  # Summary of the http spec on headers (with my comments):
695  # - Each header line ends in CRLF (or just LF, or maybe even just CR,
696  #   anyways, it's easier if all is LF).
697  $header =~ s/\r/\n/mg unless $header =~ s/\r\n/\n/mg;
698  # - If a line starts with space then it's a continuation of the previous
699  #   line (these I fold into one line).
700  $header =~ s/\n\s/ /mg;
701  # - The header field names are case insensitive (so I convert them to
702  #   lowercase)
703  # - A field may appear twice, that is equivalent to listing the values
704  #   in a comma separated list (so I fold them into a comma separated list)n
705  # - The field name and the field value are separated by ': '
706  ($result,$restext) = $header =~ m~^HTTP/\d\.\d (\d\d\d) (.*)~;
707  # Shave off http result code from the header
708  $header =~ s~^.*\n~~;
709
710  warn "Header:\n$header\n---\n" if $debug>=3;
711
712  warn "Result: $result, Text: $restext\n" if $debug>=2;
713
714  %headval=();
715
716  foreach $hdln (split(/\r?\n/m,$header)) {
717    ($key,$value)=split(': ',$hdln,2);
718    $key="\L$key";
719    # Strip leading&trailing space off the reply, some servers use
720    # copious space after.
721    $value =~ s/^\s+|\s+$//g;
722    print STDERR "K: '$key', V: '$value'\n" if $debug>=2;
723    if (defined($headval{$key})) {
724      $headval{$key}.=", ".$value;
725    } else {
726      $headval{$key}=$value;
727    }
728  }
729
730  # See if there are any type parameters in the content-type header
731  # and if so remove them.
732  if (defined($headval{'content-type'})) {
733    my $val=$headval{'content-type'};
734    ($val,undef)=split(';',$val,2) if ($val =~ /;/);
735    $headval{'CONTENT-TYPE'}=$val;
736  }
737
738  return ($result,$restext,%headval);
739}
740
741
742sub last_modified {
743  # will return the last modified time for a local file as a HTTP
744  # timestamp.
745
746  my(@tmp) = stat($_[0]);	# file doesn't exist ok to fetch
747
748  # FAT file systems strip the LSB of the file time.  Add it back in
749  # here before asking the server about a modified file.  The only way
750  # this can fail is if the newer server file was saved one second
751  # after the first version (very unlikely).  This isn't needed for
752  # NTFS file systems, but there is no good portable Perl way to
753  # determine the file system type.
754  $tmp[9] = $tmp[9] | 1 if ( $main::win32 );
755
756  # now we got the last modified in a 32 bit integer.  time to convert
757  # it and return
758  return time2str($tmp[9]);
759}
760
761
762sub timeout {
763  # Set timeout flag.  The using procedure has to set other result codes.
764  $chime=1;			# When this is 1 then the alarm has gone off
765  print STDERR "TIMEOUT!!!!\n" if $debug>=16;
766}
767
768
769sub ignore {
770  warn "I got SIGPIPE, ignoring it...\n";
771}
772
773
774sub resetsign {
775  return 0 if !defined($savALRM);
776  $SIG{'ALRM'}=$savALRM;
777  undef $savALRM;
778#  $SIG{'PIPE'}=$savPIPE;
779  return 0;
780}
781
782
783sub oserror {
784
785  resetsign;
786
787  $result=98;
788  $restext='w3http: OS error';
789  return 0;
790
791}
792
793
7941;
795