1# -*- perl -*- 2# w3http.pm --- send http requests, janl's 12" mix for w3mir 3# 4$VERSION=1.0.23; 5# 6# This implements http/1.0 requests. We'll have problems with http/0.9 7# This is in no way specific to w3mir. 8# 9# IMPORTANT: The caller should initialize the C locale for some of the 10# things here to work correctly (specifically the strftime function). 11# 12# This is a rewrite of http.pl by Oscar Nierstrasz; I copied the code he he 13# copied from the camel book. Some functions written by Gorm Haug Eriksen 14# (gorm@usit.uio.no) has been used as is. 15# 16# Contributors: 17# Nicolai Langfeldt, janl@ifi.uio.no 18# Gorm Haug Eriksen, gorm@usit.uio.no 19# Chris Szurgot, szurgot@itribe.net 20# Bernhard Weisshuh, bkw@weisshuhn.de 21# Copying and modification is governed by the "Artistic License" enclosed in 22# the w3mir distribution 23# 24# gorm : 25# &w3http::get_last_modified return the last modified stamp on a file in 26# the right format for use with http 27# 28# janl: 29# &http::query: Send a http query. A completely general function to send a 30# http query. Will extract header values, http response code and, optionaly, 31# convert text files to local linefeed format. 32# 33# Variables to examine after a query 34# $w3http::document: The document returned by the query, if any. 35# $w3http::doclen: The length of the document 36# $w3http::result: The numerical http result code. It may take these values: 37# - Normal HTTP reply codes 38# - 98: OS error, permanent, errormessage in $! 39# - 99: Other permanent error: see $restext. 40# - 100: Transient error: Timeout/connection broken 41# $w3http::restext: The english(?) HTTP result or w3http generated message 42# $w3http::header: The http header returned. 43# $w3http::plaintexthtml: 1 if this doc is non-content-encoded text/html 44# (as opposed to content-encoding: compressed content-type: text/html 45# which needs decompression before we can inspect the html) 46# The tests are somewhat longwinded so I do it just once here. 47# %w3http::headval: Associative array of header values 48# $w3http::headval{'CONTENT-TYPE'}: Derived content type, stripped of charset 49# qualifiers and other distractions. 50# $w3http::xfbytes: Transfered bytes, cumulative. Document part only. 51# $w3http::headbytes: Bytes of headers received, cumulative. 52# 53# Variables that change http's behaviour/requests: 54# $w3http::agent: User agent, default is basename of $0 55# $w3http::from: Request is from, default is user@host 56# $w3http::version: The http version to use, only 1.0 is known to me. 57# $w3http::timeout: How long to wait for new data to arrive, default is 600sec 58# $w3http::buflen: Network read buffer size, default is 4096. It might give a 59# speedup to tune this for specific servers' so it matches their send 60# size. This size can be detected if we want to, I think. 61# $w3http::debug: 1 debuging output, 2, more, 3 queries and replies 62# $w3http::verbose: 0: say nothing, 1: print progress info 63# $w3http::convert: Convert text/* documents to local newline convention? 64# The default is to do it. 65# $w3http::proxyserver: The name of the proxyserver to use. 66# $w3http::proxyport: The port of the proxyserver to use. 0 if no proxyserver. 67# $w3http::proxyuser: If this is set proxy authentication will be used. 68# $w3http::proxypasswd: The password for proxy authentication 69# 70# Things gotten from main: 71# - $main::win32: 1 if win32 restrictions apply to this system 72# - $main::nulldevice: Bit sink file/device on this system. 73# 74# History (european date format dd/mm/yy): 75# janl ??/??/95 -- Rewrite finished 76# szurgot ??/??/95 -- Win32 compatability 77# janl 16/05/96 -- Added SAVEBIN option, based on idea by szurgot 78# szurgot 03/05/96 -- Corrected typo in check for content-length against 79# retreive document length. Added test for zero-length 80# documents (Not retreived because not-modified) 81# szurgot 19/05/96 -- Win32 adaptions, fixes. 82# janl 19/05/96 -- Chris won an argument, and janl simplified http 83# retrival loop (-> version 1.0.4) 84# janl 09/09/96 -- Incorporated a patch submited by Michael Kriby -> 1.0.5 85# janl 16/09/96 -- Support for authorization. -> 1.0.6 86# janl 27/09/96 -- Support for Accept header, lack pointed out by 87# charles@ermine.ox.ac.uk: ... HTTP/1.1 (�14.1) says 88# ``If no Accept header field is present, then it is 89# assumed that the client accepts all media types, 90# earlier versions of the protocol suggest that only 91# text/plain and text/html will be offered by default.'' 92# This contradicts my memory of a http/1.0 draft. 93# Also added $ACCEPT option. 94# janl 20/10/96 -- Now uses HTTP::Date to produce HTTP timestamps -> 1.0.7 95# janl 27/10/96 -- Didn't use to check if gethostbyname worked -> 1.0.8 96# janl 02/12/96 -- Forgot a unlink when renaming temporary files. 97# janl 21/02/97 -- Multipele $ACCEPT options work. -> 1.0.9 98# janl 19/03/97 -- Now issues Host: header -> 1.0.10 99# janl 10/04/97 -- Changed from wwwurl to URI::URL, and various related 100# changes. -> 1.0.11 101# janl 09/05/97 -- Microsoft ISS servers are _so_ broken -> 1.0.12 102# (don't close the write end of the HTTP socket after 103# sending a query to them) 104# janl 12/05/97 -- New version of perl caught some typos, fixed 105# longstanding bug in the newline conversion bit. 106# -> 1.0.13 107# janl 06/06/97 -- Demand Loading of MIME::BASE64 -> 1.0.14 108# janl 01/12/97 -- FAT filesystems drops LSB of modtime. Patch from 109# Greg Lindhorst (gregli@microsoft.com) 110# -- whoami does not exist on win32, hardwire a default 111# value (unknown) (also Greg L.) -> 1.0.15 112# janl 01/22/98 -- Proxy authentication as outlined by Christian Geuer 113# janl 02/20/98 -- Complex 'content-type' headers handled. -> 1.0.17 114# janl 04/20/98 -- Only newline convert text/html, everything else is 115# handled as binary. -> 1.0.18 116# janl 12/05/98 -- Store tmpfile in its final destination directory 117# avoiding asking movefile move it across filesystems. 118# -> 1.0.19 119# janl 01/08/98 -- Timeout fix from Michael Gusev, also flag short doc 120# as error. 121# janl 24/09/98 -- Better error handling -> 1.0.20 122# bkw 17/12/98 -- Fixed problem with tempfile-generation when 123# running in forget-mode (-f) 124# janl 05/01/99 -- Referer: dropped if argument not true -> 1.0.21 125# janl 13/04/99 -- Added workaround for broken win32 perl resolving. 126# janl 15/01/00 -- Patch to adapt to URI 1.0 from Takuya Tsumura and 127# Andrey A. Chernov 128# ams 02/02/01 -- Handle URLs with spaces better (use epath) 129 130package w3http; 131 132require 5.002; 133use Socket; 134use HTTP::Date; 135use Sys::Hostname; 136use URI::URL; 137 138# Suplementary libwww-perl: 139sub URI::URL::_generic::basename { 140 my $self = shift; 141 my @p = $self->path_components; 142 my $old = $p[-1]; 143 if (@_) { 144 splice(@p, -1, 1, shift); 145 $self->path_components(@p) 146 } 147 $old; 148} 149 150# The URI 1.0 library changed the internal organization a bit 151# Thanks to Andrey A. Chernov for the patch! 152 153sub URI::_generic::basename { 154 my $self = shift; 155 my @p = $self->path_segments; 156 my $old = $p[-1]; 157 if (@_) { 158 splice(@p, -1, 1, shift); 159 $self->path_segments(@p) 160 } 161 $old; 162} 163 164 165END { 166 # Remove tmp file and such in here. That means that main:: gotta catch 167 # interrupt signals and exit on them, so ENDs are executed. 168} 169 170use strict; 171# Global variables, we want to share them: 172use vars qw($GET $HEAD $GETURL $HEADURL $IFMOD $IFMODF $AUTHORIZ $REFERER); 173use vars qw($SAVEBIN $ACCEPT $NOUSER $FREEHEAD $agent $version $timeout); 174use vars qw($debug $convert $proxyserver $proxyport $xfbytes $headbytes); 175use vars qw($verbose $result $restext $header $document); 176use vars qw($plaintexthtml %headval $progress $doclen $proxyuser); 177use vars qw($proxypasswd); 178 179my $hasAlarm; # Win32 does not have any alarm 180my $chime; # Has the alarm gone off yet? 181my %address; # My own DNS cache 182my $savALRM; # Saved ALRM handler 183my $savPIPE; # Saved PIPE handler 184 185# The main:: program should detect if we're running on win32 or not, 186# somehow 187if ($main::win32) { 188 warn "win32\n"; 189 # Compensate for lacks of win32 perl. 190 $hasAlarm=0; 191 # Seems to be unavailable in win32/perl5.001. It has to be in 5.003! 192# eval "sub sockaddr_in { 193# ($port, $thataddr) = @_; 194# $sockaddr = 'S n a4 x8'; 195# return pack($sockaddr, &AF_INET, $port, $thataddr); 196# }"; 197} else { 198 $hasAlarm=1; 199} 200 201 202# Find out some things 203my $thishost = hostname(); 204my $proto = getprotobyname("tcp"); 205 206(my $name, undef) = gethostbyname($thishost); 207chomp(my $user = $ENV{'LOGNAME'} || $ENV{'USER'} || `whoami` || 'unknown'); 208my $from = "$user\@$name"; 209 210my $nl = "\r\n"; 211# Default values, change by assignment in using-program. 212$agent = $0; $agent =~ s~.*/~~; # Basename 213$version= "1.0"; 214$timeout= 600; # Timeout while waiting for data/connection 215my $buflen = 4096; # recv buffer length 216$debug = 0; # Debuging output? 217$convert = 1; # Convert newlines of text docs to local format 218$proxyserver=''; # Proxy server. 219$proxyport=0; # Proxy server port. 0 if no proxy. 220$proxyuser=''; # Username for proxy authentication 221$proxypasswd=''; # Password for proxy authentication 222$xfbytes=0; # 0 bytes transfered, cumulative 223$headbytes=0; # 0 bytes of headers, cumulative 224$doclen=0; # 0 bytes in doc, pr. document 225my $tmpfile="w3mir$$.tmp"; # Temporary filename 226$verbose=0; # Verbosenes, 0: silent, 1: progress info 227 228# Query opcodes 229$GET = 1; # GET query. Arg: host,port,path 230$HEAD = 2; # HEAD query. Arg: host,port,path 231$GETURL = 3; # GET query. Arg: url 232$HEADURL = 4; # HEAD query. Arg: url 233# Here we lack PUT, which is not implemented 234# Modify query thus: 235$IFMOD = 101; # If-modified after: Arg: HTTP-date-str 236$IFMODF = 102; # If-modified after file: Arg: local-file-name 237$AUTHORIZ= 103; # Basic authorization. Arg: 'user:password' 238$REFERER = 104; # Referer: Arg: Referer 239$SAVEBIN = 105; # Write binary files to disk. Arg: File name 240 # If this opcode is used then main must provide 241 # a &main::movefile(oldname,newname) procedure 242 # that handles moving the tmp file to the 243 # final name/location. 244$ACCEPT = 106; # Accept header value: Arg: value 245$NOUSER = 107; # Don't insert user header. Arg: none 246$FREEHEAD= 999; # Freeform header, one line. Arg: header 247 248sub query { 249 # Build and send a HTTP query. And also receive response - janl 95/09/18 250 # 251 # Return codes: 0 if it didn't work. 1 if it did work. 252 # HTTP style result code in w3http::$result and message in w3http::$restext 253 254 # We do next to no argument type checking btw. 255 256 my($host,$port,$request,$query,$method,$inp,$linp,$saveto,$save,$arg); 257 my($start,$wantbytes,$thataddr,$err,$headb,$tmpf,$ldoc,$nouser,$q,$accept); 258 my($origreq,$req_o,$plaintext); 259 260 # Something ought to be said 261 $result=99; 262 $restext='w3http: internal error'; 263 $nouser=0; 264 265 if ($version ne '1.0') { 266 warn "Unknown HTTP version $version, no request sent\n"; 267 return 0; 268 } 269 270 $accept=$saveto=$query=''; 271 272 # Find out what to ask for 273 274 while (defined($arg=shift)) { 275 if ($arg == $GET) { 276 $host=shift; 277 $port=shift; 278 $request=shift; 279 $req_o=url 'http://'.$host.':'.$port.$request; 280 if ($proxyport) { 281 $query.='GET http://'.$req_o->as_string; 282 } else { 283 $query.='GET '.$req_o->epath; 284 } 285 $query.=' HTTP/'.$version.$nl; 286 } elsif ($arg == $HEAD) { 287 $host=shift; 288 $port=shift; 289 $request=shift; 290 $req_o=url 'http://'.$host.':'.$port.$request; 291 if ($proxyport) { 292 $query.='HEAD '.$req_o->as_string; 293 } else { 294 $query.='HEAD '.$req_o->epath; 295 } 296 $query.=' HTTP/'.$version.$nl; 297 } elsif ($arg == $GETURL) { 298 $req_o=shift; 299 $req_o=url $req_o unless ref $req_o; 300 ($method,undef,undef,$host,$port,$request,undef,$q) = $req_o->crack; 301 if ($proxyport) { 302 $query.='GET '.$req_o->as_string; 303 } else { 304 $q=$req_o->equery; 305 $query.='GET '.($req_o->epath).($q?"?$q":''); 306 } 307 $query.=' HTTP/'.$version.$nl; 308 } elsif ($arg == $HEADURL) { 309 $req_o=shift; 310 $req_o=url $req_o unless ref $req_o; 311 if ($proxyport) { 312 $query.='HEAD '.$req_o->as_string; 313 } else { 314 $q=$req_o->equery; 315 $query.='HEAD '.$req_o->epath.($q?"?$q":''); 316 } 317 $query.=' HTTP/'.$version.$nl; 318 } elsif ($arg == $IFMOD) { 319 $query.='If-Modified-Since: '.(shift).$nl; 320 } elsif ($arg == $IFMODF) { 321 $query.='If-Modified-Since: '.&last_modified(shift).$nl; 322 } elsif ($arg == $AUTHORIZ) { 323 # Demand-load MIME::Base64 324 if (!defined(&MIME::Base64::encode)) { 325 eval "use MIME::Base64;"; 326 die "w3http: Could not load MIME::Base64 module necessary for authentication\n" 327 unless defined(&MIME::Base64::encode); 328 } 329 $query.='Authorization: Basic '.MIME::Base64::encode(shift,'').$nl; 330 } elsif ($arg == $REFERER) { 331 my($referer)=shift; 332 $query.='Referer: '.$referer.$nl if $referer; 333 } elsif ($arg == $SAVEBIN) { 334 $saveto=shift; 335 } elsif ($arg == $ACCEPT) { 336 $accept.='Accept: '.(shift).$nl; 337 } elsif ($arg == $NOUSER) { 338 $nouser=1; 339 } elsif ($arg == $FREEHEAD) { 340 $query.=(shift).$nl; 341 } else { 342 warn "Unknown http query opcode: $arg\n"; 343 } 344 # Insert the last parts of the query: 345 } 346 347 $query.='Host: '.$req_o->netloc.$nl; 348 $query.='From: '.$from.$nl unless $nouser; 349 350 $accept='Accept: */*'.$nl unless $accept; 351 352 if ($proxyport) { 353 # Use proxy instead of originserver 354 $host=$proxyserver; 355 $port=$proxyport; 356 357 # Add authentication stuff to query 358 if ($proxyuser) { 359 # Demand-load MIME::Base64 360 if (!defined(&MIME::Base64::encode)) { 361 eval "use MIME::Base64;"; 362 die "w3http: Could not load MIME::Base64 module necessary for authentication\n" 363 unless defined(&MIME::Base64::encode); 364 } 365 366 $query.='Proxy-Authorization: Basic '. 367 MIME::Base64::encode($proxyuser.':'.$proxypasswd); 368 369 print STDERR "\nProxyuser: [$proxyuser]\nProxypasswd: [$proxypasswd]\n" 370 if $debug>=2; 371 } 372 } 373 374 $query.='User-Agent: '.$agent.$nl.$accept.$nl; 375 376 # If we're using proxy then set up things... 377 print STDERR "\nQUERY:\n",$query,"---\n" if $debug>=2; 378 379 # win32 fix: this should be added in case of troubles with 380 # gethostbyname. possible reason: nameserver down? 381 if ($host =~ /^\d+(\.\d+){3}$/) { 382 # in case gethostbyname will not work ... ;-) 383 $address{$host} = pack 'C4', (split /\./, $host); 384 } 385 386 # Find out who to ask, check if we know already 387 if (exists($address{$host})) { 388 # We know 389 $thataddr=$address{$host}; 390 } else { 391 # Cache miss, get and remember. 392 (my $fqdn, undef, undef, undef, $thataddr) = gethostbyname($host); 393 # Hostname lookup failure? Cache even misses. 394 if (defined($fqdn)) { 395 print STDERR "Lookup of $host:\nFQDN: $fqdn\n" 396 if $debug; 397 $address{$host}=$thataddr; 398 $address{$fqdn}=$thataddr if $fqdn ne $host; 399 } else { 400 $thataddr=$address{$host}=undef; 401 } 402 } 403 404 # Check if lookup failure, return 405 if (!defined($thataddr)) { 406 $restext='Host lookup failure'; 407 return; 408 } 409 410 $port=80 unless defined($port) && $port; 411 412 # When connected we might receive SIGPIPE. I'm not sure if the 413 # default behaviour of dying is beneficial in that case. If we get 414 # alarm a timeout has expired. 415 $savPIPE = $SIG{'PIPE'}; 416 $savALRM = $SIG{'ALRM'}; 417 418 $chime=0; # There has been no alarm yet 419 $SIG{'ALRM'} = \&timeout; 420 $SIG{'PIPE'} = \&ignore; 421 422 # Close the socket, just in case, and ignore error returns 423 close(FS); 424 425 socket(FS, AF_INET, SOCK_STREAM, $proto) or return &oserror; 426 warn "Got my socks on\n" if $debug; 427 428 my $paddr = sockaddr_in($port, $thataddr); 429 connect(FS, $paddr) or return &oserror; 430 warn "Connected\n" if $debug; 431 432 # Arrange timeout 433 alarm($timeout) if $hasAlarm; 434 435 # We have, in fact, received SIGPIPE on this line: 436 send(FS,$query,0) or return &oserror; 437 438 if ($chime) { 439 $result=100; 440 $restext='timeout sending query'; 441 return &resetsign; 442 } 443 444 $header=''; 445 $document=''; 446 $inp=' 'x$buflen; 447 $doclen=$chime=$plaintext=$plaintexthtml=$save=0; 448 449 # Breaks some M$ ISS servers: 450 # shutdown(FS,1); # Half-close socket, sending now not allowed 451 452 print STDERR ", receiving header" if $verbose>0; 453 454 # Retrive HTTP response HEADER. Why do I use recv and not <FS>? 455 # Because then the timeout can work correctly! 456 while (1) { 457 # Set up alarm to ensure recv returns within a reasonable timeframe 458 alarm($timeout) if $hasAlarm; 459 $err = recv(FS,$inp,$buflen,0); 460 # recv returned, cancel alarm. 461 alarm(0) if $hasAlarm; 462 463 # If there has been a timeout, then we quit now. The recv man page 464 # does not seem to allow recv to return the bytes received up to 465 # the timeout. 466 if ($chime) { 467 $result=100; 468 $restext='timeout fetching document'; 469 $!=0; 470 if ($save) { 471 unlink($tmpf) || 472 warn "Could not unlink $tmpf: $!\n"; 473 } 474 return &resetsign; 475 } 476 477 # recv returnes the undefined value on error 478 if (!defined($err)) { 479 warn "Error in recv: $!\n"; 480 last; 481 } 482 483 $linp=length($inp); 484 485 # If the returned input was 0 in length then we've gotten to the 486 # end of the response. 487 last unless $linp; 488 489 # Accounting 490 $xfbytes += $linp; 491 $doclen += $linp; 492 493 # Accumulate input 494 $header.=$inp; 495 496 # eof(SOCKET) has strange semantics it seems 497 # last if eof(FS); 498 499 # Check if header is complete 500 last if ($header =~ m/(\r?\n\r?\n)/); 501 } 502 503 my $orighead = $header; 504 505 if (length($header)==0) { 506 $restext='the HTTP reply header is empty!'; 507 return &resetsign; 508 } 509 510 if ($header =~ m/(\r?\n\r?\n)/) { 511 if ($`) { 512 $header=$`; 513 $document=$'; 514 } 515 } 516 517 # Adjust accounting 518 $headb = length($header)+length($1); 519 $headbytes += $headb; 520 $xfbytes -= $headb; 521 $doclen -= $headb; 522 523 # Pick headers to pieces 524 ($result,$restext,%headval)=&analyze_header($header); 525 526 if (!$result) { 527 print "\n\nw3mir: BOGUS HTTP REPLY:\n-----\n$header\n-----\n"; 528 print "\n\nw3mir: UNPROCESSED REPLY:\n-----\n$orighead\n-----\n"; 529 print "\nw3mir: QUERY WAS:\n-----\n$query\n-----\n"; 530 die; 531 } 532 533 print STDERR "REPLY:\n",$header,"\n---\n" if $debug>=2; 534 535 # Check if the document is a non-encoded text document. The contents 536 # could be (x-)?compress or (x-)gzip coded (compressed in other 537 # words). 538 539 $plaintext=defined($headval{'CONTENT-TYPE'}) && 540 (substr($headval{'CONTENT-TYPE'},0,5) eq 'text/' || 0) && 541 !defined($headval{'content-encoding'}); 542 $plaintexthtml=$plaintext && 543 ($headval{'CONTENT-TYPE'} eq 'text/html'); 544 545 if ($result==200) { 546 547 # Save this to a file, or not? Never save html files. 548 if ($saveto && !$plaintexthtml) { 549 # We're going to save this document directly into a file. This 550 # stresses the VM less when getting the large binares so often 551 # found at cool sites. 552 $save=1; 553 554 my($slash)=($saveto =~ /^\//); 555 # Find a temporary filename 556 $tmpf=url "file:$saveto"; 557 $tmpf->basename($tmpfile); 558 $tmpf=$tmpf->unix_path; 559 $tmpf =~ s~^/~~ if (!$slash); 560 561 # Find suitable final filename, one with no URL escapes 562 $saveto=(url "file:$saveto")->unix_path; 563 $saveto =~ s~^/~~ if (!$slash); 564 565 # If output to stdout then send it directly there rather than 566 # using disk unnecesarily. 567 $tmpf='-' if ($saveto eq '-'); 568 569 # If output is nulldevice (running -f), use it also for tmpfile, 570 # since it would otherwise try to create it in /dev under unix. 571 $tmpf=$main::nulldevice if ($saveto eq $main::nulldevice); 572 573 warn "USING TMPFILE: $tmpf\n" if $debug; 574 575 open(SAVE,">$tmpf") || 576 die "Could not open tmp file: $tmpf: $!\n"; 577 binmode SAVE; # It's a binary file... 578 } 579 580 if ($verbose>0) { 581 print STDERR ", document"; 582 print STDERR "->disk" if $save; 583 } 584 585 # Now retrive document itself. Se comments in header loop 586 $start=time; 587 $wantbytes = defined($headval{'content-length'})? 588 $headval{'content-length'}:0; 589 590 $ldoc=length($document); 591 592 while (1) { 593 alarm($timeout) if $hasAlarm; 594 recv(FS,$inp,$buflen,0); 595 alarm(0) if $hasAlarm; 596 597 if ($chime) { 598 $result=100; 599 $restext='timeout fetching document'; 600 $!=0; 601 if ($save) { 602 unlink($tmpf) || 603 warn "Could not unlink $tmpf: $!\n"; 604 } 605 return &resetsign; 606 } 607 608 $linp=length($inp); 609 610 last unless $linp || $ldoc; 611 $ldoc = 0; 612 613 $xfbytes += $linp; 614 $doclen += $linp; 615 616 if ($verbose>0 && time-$start>5) { 617 # Write progress info ... 618 if ($wantbytes) { 619 $progress = sprintf " %3d%%", $doclen/$wantbytes*100; 620 } else { 621 $progress = sprintf " %d", $doclen; 622 } 623 print STDERR $progress, "\ch"x(length($progress)); 624 # ...every 5 seconds 625 $start=time; 626 } 627 628 $document.=$inp; 629 630 if ($save) { 631 $err = print SAVE $document; 632 die "Error writing $tmpf: $!\n" unless $err; 633 $document=''; 634 } 635 636 # The eof test seems to work very oddly for sockets. 637 # last if eof(FS); 638 } 639 640 close(FS); # Close socket completely 641 642 print STDERR "DOCUMENT:\n----\n",$document,"\n----\n" if $debug>=255; 643 644 if ($wantbytes && 645 $wantbytes != $doclen) { 646 $result=100; 647 $restext='transfer error; too many bytes in document'; 648 $restext='document was incomplete' if ($wantbytes > $doclen) ; 649 print STDERR "SHORT DOCUMENT" if $debug>=16; 650 if ($save) { 651 unlink($tmpf) || warn "Could not unlink $tmpf: $!\n"; 652 } 653 return &resetsign; 654 } 655 656 # warn "XFB: $xfbytes, DL: $doclen\n"; 657 if ($save) { 658 close(SAVE); 659 &main::movefile($tmpf,$saveto); 660 } 661 662 # If this is a non-encoded text file and we're supposed to convert 663 # foreign newlines then we do it. It would be faster to do this 664 # with each chunk of input in the input loop, but this gives us 665 # two problems: 666 # - A \r\n newline could be split into two chunks. Thus escaping 667 # newline conversion. 668 # - It messes up the received bytes accounting rather badly. 669 # 670 # This used to be a test for $plaintext, the problem is that too 671 # many documents were typed as text/plain and so we corrupted 672 # binary files. This is bad. So now we're more paranoid about it: 673 # Only HTML gets converted. 674 if ($convert && $plaintexthtml) { 675 # Change non unix newlines to unix newlines. bare \r is known 676 # from macintosh (they hadta be different didn't they?), \r\n is 677 # known as 'network format' and from numerous systems, among 678 # them ms-dos. 679 $document =~ s~\r~\n~g unless $document =~ s~\r\n~\n~g; 680 warn "Newlines converted(?)\n" if $debug; 681 } 682 683 } # if $result == 200 684 685 &resetsign; 686 return 1; 687} 688 689 690sub analyze_header { 691 my($header)=@_; 692 my($result,$restext,%headval,$hdln,$key,$value); 693 694 # Summary of the http spec on headers (with my comments): 695 # - Each header line ends in CRLF (or just LF, or maybe even just CR, 696 # anyways, it's easier if all is LF). 697 $header =~ s/\r/\n/mg unless $header =~ s/\r\n/\n/mg; 698 # - If a line starts with space then it's a continuation of the previous 699 # line (these I fold into one line). 700 $header =~ s/\n\s/ /mg; 701 # - The header field names are case insensitive (so I convert them to 702 # lowercase) 703 # - A field may appear twice, that is equivalent to listing the values 704 # in a comma separated list (so I fold them into a comma separated list)n 705 # - The field name and the field value are separated by ': ' 706 ($result,$restext) = $header =~ m~^HTTP/\d\.\d (\d\d\d) (.*)~; 707 # Shave off http result code from the header 708 $header =~ s~^.*\n~~; 709 710 warn "Header:\n$header\n---\n" if $debug>=3; 711 712 warn "Result: $result, Text: $restext\n" if $debug>=2; 713 714 %headval=(); 715 716 foreach $hdln (split(/\r?\n/m,$header)) { 717 ($key,$value)=split(': ',$hdln,2); 718 $key="\L$key"; 719 # Strip leading&trailing space off the reply, some servers use 720 # copious space after. 721 $value =~ s/^\s+|\s+$//g; 722 print STDERR "K: '$key', V: '$value'\n" if $debug>=2; 723 if (defined($headval{$key})) { 724 $headval{$key}.=", ".$value; 725 } else { 726 $headval{$key}=$value; 727 } 728 } 729 730 # See if there are any type parameters in the content-type header 731 # and if so remove them. 732 if (defined($headval{'content-type'})) { 733 my $val=$headval{'content-type'}; 734 ($val,undef)=split(';',$val,2) if ($val =~ /;/); 735 $headval{'CONTENT-TYPE'}=$val; 736 } 737 738 return ($result,$restext,%headval); 739} 740 741 742sub last_modified { 743 # will return the last modified time for a local file as a HTTP 744 # timestamp. 745 746 my(@tmp) = stat($_[0]); # file doesn't exist ok to fetch 747 748 # FAT file systems strip the LSB of the file time. Add it back in 749 # here before asking the server about a modified file. The only way 750 # this can fail is if the newer server file was saved one second 751 # after the first version (very unlikely). This isn't needed for 752 # NTFS file systems, but there is no good portable Perl way to 753 # determine the file system type. 754 $tmp[9] = $tmp[9] | 1 if ( $main::win32 ); 755 756 # now we got the last modified in a 32 bit integer. time to convert 757 # it and return 758 return time2str($tmp[9]); 759} 760 761 762sub timeout { 763 # Set timeout flag. The using procedure has to set other result codes. 764 $chime=1; # When this is 1 then the alarm has gone off 765 print STDERR "TIMEOUT!!!!\n" if $debug>=16; 766} 767 768 769sub ignore { 770 warn "I got SIGPIPE, ignoring it...\n"; 771} 772 773 774sub resetsign { 775 return 0 if !defined($savALRM); 776 $SIG{'ALRM'}=$savALRM; 777 undef $savALRM; 778# $SIG{'PIPE'}=$savPIPE; 779 return 0; 780} 781 782 783sub oserror { 784 785 resetsign; 786 787 $result=98; 788 $restext='w3http: OS error'; 789 return 0; 790 791} 792 793 7941; 795