1package File::Fetch; 2 3use strict; 4use FileHandle; 5use File::Temp; 6use File::Copy; 7use File::Spec; 8use File::Spec::Unix; 9use File::Basename qw[dirname]; 10 11use Cwd qw[cwd]; 12use Carp qw[carp]; 13use IPC::Cmd qw[can_run run QUOTE]; 14use File::Path qw[mkpath]; 15use File::Temp qw[tempdir]; 16use Params::Check qw[check]; 17use Module::Load::Conditional qw[can_load]; 18use Locale::Maketext::Simple Style => 'gettext'; 19 20use vars qw[ $VERBOSE $PREFER_BIN $FROM_EMAIL $USER_AGENT 21 $BLACKLIST $METHOD_FAIL $VERSION $METHODS 22 $FTP_PASSIVE $TIMEOUT $DEBUG $WARN 23 ]; 24 25$VERSION = '0.24'; 26$VERSION = eval $VERSION; # avoid warnings with development releases 27$PREFER_BIN = 0; # XXX TODO implement 28$FROM_EMAIL = 'File-Fetch@example.com'; 29$USER_AGENT = "File::Fetch/$VERSION"; 30$BLACKLIST = [qw|ftp|]; 31$METHOD_FAIL = { }; 32$FTP_PASSIVE = 1; 33$TIMEOUT = 0; 34$DEBUG = 0; 35$WARN = 1; 36 37### methods available to fetch the file depending on the scheme 38$METHODS = { 39 http => [ qw|lwp wget curl lftp lynx iosock| ], 40 ftp => [ qw|lwp netftp wget curl lftp ncftp ftp| ], 41 file => [ qw|lwp lftp file| ], 42 rsync => [ qw|rsync| ] 43}; 44 45### silly warnings ### 46local $Params::Check::VERBOSE = 1; 47local $Params::Check::VERBOSE = 1; 48local $Module::Load::Conditional::VERBOSE = 0; 49local $Module::Load::Conditional::VERBOSE = 0; 50 51### see what OS we are on, important for file:// uris ### 52use constant ON_WIN => ($^O eq 'MSWin32'); 53use constant ON_VMS => ($^O eq 'VMS'); 54use constant ON_UNIX => (!ON_WIN); 55use constant HAS_VOL => (ON_WIN); 56use constant HAS_SHARE => (ON_WIN); 57 58 59=pod 60 61=head1 NAME 62 63File::Fetch - A generic file fetching mechanism 64 65=head1 SYNOPSIS 66 67 use File::Fetch; 68 69 ### build a File::Fetch object ### 70 my $ff = File::Fetch->new(uri => 'http://some.where.com/dir/a.txt'); 71 72 ### fetch the uri to cwd() ### 73 my $where = $ff->fetch() or die $ff->error; 74 75 ### fetch the uri to /tmp ### 76 my $where = $ff->fetch( to => '/tmp' ); 77 78 ### parsed bits from the uri ### 79 $ff->uri; 80 $ff->scheme; 81 $ff->host; 82 $ff->path; 83 $ff->file; 84 85=head1 DESCRIPTION 86 87File::Fetch is a generic file fetching mechanism. 88 89It allows you to fetch any file pointed to by a C<ftp>, C<http>, 90C<file>, or C<rsync> uri by a number of different means. 91 92See the C<HOW IT WORKS> section further down for details. 93 94=head1 ACCESSORS 95 96A C<File::Fetch> object has the following accessors 97 98=over 4 99 100=item $ff->uri 101 102The uri you passed to the constructor 103 104=item $ff->scheme 105 106The scheme from the uri (like 'file', 'http', etc) 107 108=item $ff->host 109 110The hostname in the uri. Will be empty if host was originally 111'localhost' for a 'file://' url. 112 113=item $ff->vol 114 115On operating systems with the concept of a volume the second element 116of a file:// is considered to the be volume specification for the file. 117Thus on Win32 this routine returns the volume, on other operating 118systems this returns nothing. 119 120On Windows this value may be empty if the uri is to a network share, in 121which case the 'share' property will be defined. Additionally, volume 122specifications that use '|' as ':' will be converted on read to use ':'. 123 124On VMS, which has a volume concept, this field will be empty because VMS 125file specifications are converted to absolute UNIX format and the volume 126information is transparently included. 127 128=item $ff->share 129 130On systems with the concept of a network share (currently only Windows) returns 131the sharename from a file://// url. On other operating systems returns empty. 132 133=item $ff->path 134 135The path from the uri, will be at least a single '/'. 136 137=item $ff->file 138 139The name of the remote file. For the local file name, the 140result of $ff->output_file will be used. 141 142=cut 143 144 145########################## 146### Object & Accessors ### 147########################## 148 149{ 150 ### template for autogenerated accessors ### 151 my $Tmpl = { 152 scheme => { default => 'http' }, 153 host => { default => 'localhost' }, 154 path => { default => '/' }, 155 file => { required => 1 }, 156 uri => { required => 1 }, 157 vol => { default => '' }, # windows for file:// uris 158 share => { default => '' }, # windows for file:// uris 159 _error_msg => { no_override => 1 }, 160 _error_msg_long => { no_override => 1 }, 161 }; 162 163 for my $method ( keys %$Tmpl ) { 164 no strict 'refs'; 165 *$method = sub { 166 my $self = shift; 167 $self->{$method} = $_[0] if @_; 168 return $self->{$method}; 169 } 170 } 171 172 sub _create { 173 my $class = shift; 174 my %hash = @_; 175 176 my $args = check( $Tmpl, \%hash ) or return; 177 178 bless $args, $class; 179 180 if( lc($args->scheme) ne 'file' and not $args->host ) { 181 return $class->_error(loc( 182 "Hostname required when fetching from '%1'",$args->scheme)); 183 } 184 185 for (qw[path file]) { 186 unless( $args->$_() ) { # 5.5.x needs the () 187 return $class->_error(loc("No '%1' specified",$_)); 188 } 189 } 190 191 return $args; 192 } 193} 194 195=item $ff->output_file 196 197The name of the output file. This is the same as $ff->file, 198but any query parameters are stripped off. For example: 199 200 http://example.com/index.html?x=y 201 202would make the output file be C<index.html> rather than 203C<index.html?x=y>. 204 205=back 206 207=cut 208 209sub output_file { 210 my $self = shift; 211 my $file = $self->file; 212 213 $file =~ s/\?.*$//g; 214 215 return $file; 216} 217 218### XXX do this or just point to URI::Escape? 219# =head2 $esc_uri = $ff->escaped_uri 220# 221# =cut 222# 223# ### most of this is stolen straight from URI::escape 224# { ### Build a char->hex map 225# my %escapes = map { chr($_) => sprintf("%%%02X", $_) } 0..255; 226# 227# sub escaped_uri { 228# my $self = shift; 229# my $uri = $self->uri; 230# 231# ### Default unsafe characters. RFC 2732 ^(uric - reserved) 232# $uri =~ s/([^A-Za-z0-9\-_.!~*'()])/ 233# $escapes{$1} || $self->_fail_hi($1)/ge; 234# 235# return $uri; 236# } 237# 238# sub _fail_hi { 239# my $self = shift; 240# my $char = shift; 241# 242# $self->_error(loc( 243# "Can't escape '%1', try using the '%2' module instead", 244# sprintf("\\x{%04X}", ord($char)), 'URI::Escape' 245# )); 246# } 247# 248# sub output_file { 249# 250# } 251# 252# 253# } 254 255=head1 METHODS 256 257=head2 $ff = File::Fetch->new( uri => 'http://some.where.com/dir/file.txt' ); 258 259Parses the uri and creates a corresponding File::Fetch::Item object, 260that is ready to be C<fetch>ed and returns it. 261 262Returns false on failure. 263 264=cut 265 266sub new { 267 my $class = shift; 268 my %hash = @_; 269 270 my ($uri); 271 my $tmpl = { 272 uri => { required => 1, store => \$uri }, 273 }; 274 275 check( $tmpl, \%hash ) or return; 276 277 ### parse the uri to usable parts ### 278 my $href = $class->_parse_uri( $uri ) or return; 279 280 ### make it into a FFI object ### 281 my $ff = $class->_create( %$href ) or return; 282 283 284 ### return the object ### 285 return $ff; 286} 287 288### parses an uri to a hash structure: 289### 290### $class->_parse_uri( 'ftp://ftp.cpan.org/pub/mirror/index.txt' ) 291### 292### becomes: 293### 294### $href = { 295### scheme => 'ftp', 296### host => 'ftp.cpan.org', 297### path => '/pub/mirror', 298### file => 'index.html' 299### }; 300### 301### In the case of file:// urls there maybe be additional fields 302### 303### For systems with volume specifications such as Win32 there will be 304### a volume specifier provided in the 'vol' field. 305### 306### 'vol' => 'volumename' 307### 308### For windows file shares there may be a 'share' key specified 309### 310### 'share' => 'sharename' 311### 312### Note that the rules of what a file:// url means vary by the operating system 313### of the host being addressed. Thus file:///d|/foo/bar.txt means the obvious 314### 'D:\foo\bar.txt' on windows, but on unix it means '/d|/foo/bar.txt' and 315### not '/foo/bar.txt' 316### 317### Similarly if the host interpreting the url is VMS then 318### file:///disk$user/my/notes/note12345.txt' means 319### 'DISK$USER:[MY.NOTES]NOTE123456.TXT' but will be returned the same as 320### if it is unix where it means /disk$user/my/notes/note12345.txt'. 321### Except for some cases in the File::Spec methods, Perl on VMS will generally 322### handle UNIX format file specifications. 323### 324### This means it is impossible to serve certain file:// urls on certain systems. 325### 326### Thus are the problems with a protocol-less specification. :-( 327### 328 329sub _parse_uri { 330 my $self = shift; 331 my $uri = shift or return; 332 333 my $href = { uri => $uri }; 334 335 ### find the scheme ### 336 $uri =~ s|^(\w+)://||; 337 $href->{scheme} = $1; 338 339 ### See rfc 1738 section 3.10 340 ### http://www.faqs.org/rfcs/rfc1738.html 341 ### And wikipedia for more on windows file:// urls 342 ### http://en.wikipedia.org/wiki/File:// 343 if( $href->{scheme} eq 'file' ) { 344 345 my @parts = split '/',$uri; 346 347 ### file://hostname/... 348 ### file://hostname/... 349 ### normalize file://localhost with file:/// 350 $href->{host} = $parts[0] || ''; 351 352 ### index in @parts where the path components begin; 353 my $index = 1; 354 355 ### file:////hostname/sharename/blah.txt 356 if ( HAS_SHARE and not length $parts[0] and not length $parts[1] ) { 357 358 $href->{host} = $parts[2] || ''; # avoid warnings 359 $href->{share} = $parts[3] || ''; # avoid warnings 360 361 $index = 4 # index after the share 362 363 ### file:///D|/blah.txt 364 ### file:///D:/blah.txt 365 } elsif (HAS_VOL) { 366 367 ### this code comes from dmq's patch, but: 368 ### XXX if volume is empty, wouldn't that be an error? --kane 369 ### if so, our file://localhost test needs to be fixed as wel 370 $href->{vol} = $parts[1] || ''; 371 372 ### correct D| style colume descriptors 373 $href->{vol} =~ s/\A([A-Z])\|\z/$1:/i if ON_WIN; 374 375 $index = 2; # index after the volume 376 } 377 378 ### rebuild the path from the leftover parts; 379 $href->{path} = join '/', '', splice( @parts, $index, $#parts ); 380 381 } else { 382 ### using anything but qw() in hash slices may produce warnings 383 ### in older perls :-( 384 @{$href}{ qw(host path) } = $uri =~ m|([^/]*)(/.*)$|s; 385 } 386 387 ### split the path into file + dir ### 388 { my @parts = File::Spec::Unix->splitpath( delete $href->{path} ); 389 $href->{path} = $parts[1]; 390 $href->{file} = $parts[2]; 391 } 392 393 ### host will be empty if the target was 'localhost' and the 394 ### scheme was 'file' 395 $href->{host} = '' if ($href->{host} eq 'localhost') and 396 ($href->{scheme} eq 'file'); 397 398 return $href; 399} 400 401=head2 $where = $ff->fetch( [to => /my/output/dir/ | \$scalar] ) 402 403Fetches the file you requested and returns the full path to the file. 404 405By default it writes to C<cwd()>, but you can override that by specifying 406the C<to> argument: 407 408 ### file fetch to /tmp, full path to the file in $where 409 $where = $ff->fetch( to => '/tmp' ); 410 411 ### file slurped into $scalar, full path to the file in $where 412 ### file is downloaded to a temp directory and cleaned up at exit time 413 $where = $ff->fetch( to => \$scalar ); 414 415Returns the full path to the downloaded file on success, and false 416on failure. 417 418=cut 419 420sub fetch { 421 my $self = shift or return; 422 my %hash = @_; 423 424 my $target; 425 my $tmpl = { 426 to => { default => cwd(), store => \$target }, 427 }; 428 429 check( $tmpl, \%hash ) or return; 430 431 my ($to, $fh); 432 ### you want us to slurp the contents 433 if( ref $target and UNIVERSAL::isa( $target, 'SCALAR' ) ) { 434 $to = tempdir( 'FileFetch.XXXXXX', CLEANUP => 1 ); 435 436 ### plain old fetch 437 } else { 438 $to = $target; 439 440 ### On VMS force to VMS format so File::Spec will work. 441 $to = VMS::Filespec::vmspath($to) if ON_VMS; 442 443 ### create the path if it doesn't exist yet ### 444 unless( -d $to ) { 445 eval { mkpath( $to ) }; 446 447 return $self->_error(loc("Could not create path '%1'",$to)) if $@; 448 } 449 } 450 451 ### set passive ftp if required ### 452 local $ENV{FTP_PASSIVE} = $FTP_PASSIVE; 453 454 ### we dont use catfile on win32 because if we are using a cygwin tool 455 ### under cmd.exe they wont understand windows style separators. 456 my $out_to = ON_WIN ? $to.'/'.$self->output_file 457 : File::Spec->catfile( $to, $self->output_file ); 458 459 for my $method ( @{ $METHODS->{$self->scheme} } ) { 460 my $sub = '_'.$method.'_fetch'; 461 462 unless( __PACKAGE__->can($sub) ) { 463 $self->_error(loc("Cannot call method for '%1' -- WEIRD!", 464 $method)); 465 next; 466 } 467 468 ### method is blacklisted ### 469 next if grep { lc $_ eq $method } @$BLACKLIST; 470 471 ### method is known to fail ### 472 next if $METHOD_FAIL->{$method}; 473 474 ### there's serious issues with IPC::Run and quoting of command 475 ### line arguments. using quotes in the wrong place breaks things, 476 ### and in the case of say, 477 ### C:\cygwin\bin\wget.EXE --quiet --passive-ftp --output-document 478 ### "index.html" "http://www.cpan.org/index.html?q=1&y=2" 479 ### it doesn't matter how you quote, it always fails. 480 local $IPC::Cmd::USE_IPC_RUN = 0; 481 482 if( my $file = $self->$sub( 483 to => $out_to 484 )){ 485 486 unless( -e $file && -s _ ) { 487 $self->_error(loc("'%1' said it fetched '%2', ". 488 "but it was not created",$method,$file)); 489 490 ### mark the failure ### 491 $METHOD_FAIL->{$method} = 1; 492 493 next; 494 495 } else { 496 497 ### slurp mode? 498 if( ref $target and UNIVERSAL::isa( $target, 'SCALAR' ) ) { 499 500 ### open the file 501 open my $fh, $file or do { 502 $self->_error( 503 loc("Could not open '%1': %2", $file, $!)); 504 return; 505 }; 506 507 ### slurp 508 $$target = do { local $/; <$fh> }; 509 510 } 511 512 my $abs = File::Spec->rel2abs( $file ); 513 return $abs; 514 515 } 516 } 517 } 518 519 520 ### if we got here, we looped over all methods, but we weren't able 521 ### to fetch it. 522 return; 523} 524 525######################## 526### _*_fetch methods ### 527######################## 528 529### LWP fetching ### 530sub _lwp_fetch { 531 my $self = shift; 532 my %hash = @_; 533 534 my ($to); 535 my $tmpl = { 536 to => { required => 1, store => \$to } 537 }; 538 check( $tmpl, \%hash ) or return; 539 540 ### modules required to download with lwp ### 541 my $use_list = { 542 LWP => '0.0', 543 'LWP::UserAgent' => '0.0', 544 'HTTP::Request' => '0.0', 545 'HTTP::Status' => '0.0', 546 URI => '0.0', 547 548 }; 549 550 if( can_load(modules => $use_list) ) { 551 552 ### setup the uri object 553 my $uri = URI->new( File::Spec::Unix->catfile( 554 $self->path, $self->file 555 ) ); 556 557 ### special rules apply for file:// uris ### 558 $uri->scheme( $self->scheme ); 559 $uri->host( $self->scheme eq 'file' ? '' : $self->host ); 560 $uri->userinfo("anonymous:$FROM_EMAIL") if $self->scheme ne 'file'; 561 562 ### set up the useragent object 563 my $ua = LWP::UserAgent->new(); 564 $ua->timeout( $TIMEOUT ) if $TIMEOUT; 565 $ua->agent( $USER_AGENT ); 566 $ua->from( $FROM_EMAIL ); 567 $ua->env_proxy; 568 569 my $res = $ua->mirror($uri, $to) or return; 570 571 ### uptodate or fetched ok ### 572 if ( $res->code == 304 or $res->code == 200 ) { 573 return $to; 574 575 } else { 576 return $self->_error(loc("Fetch failed! HTTP response: %1 %2 [%3]", 577 $res->code, HTTP::Status::status_message($res->code), 578 $res->status_line)); 579 } 580 581 } else { 582 $METHOD_FAIL->{'lwp'} = 1; 583 return; 584 } 585} 586 587### Simple IO::Socket::INET fetching ### 588sub _iosock_fetch { 589 my $self = shift; 590 my %hash = @_; 591 592 my ($to); 593 my $tmpl = { 594 to => { required => 1, store => \$to } 595 }; 596 check( $tmpl, \%hash ) or return; 597 598 my $use_list = { 599 'IO::Socket::INET' => '0.0', 600 'IO::Select' => '0.0', 601 }; 602 603 if( can_load(modules => $use_list) ) { 604 my $sock = IO::Socket::INET->new( 605 PeerHost => $self->host, 606 ( $self->host =~ /:/ ? () : ( PeerPort => 80 ) ), 607 ); 608 609 unless ( $sock ) { 610 return $self->_error(loc("Could not open socket to '%1', '%2'",$self->host,$!)); 611 } 612 613 my $fh = FileHandle->new; 614 615 # Check open() 616 617 unless ( $fh->open($to,'>') ) { 618 return $self->_error(loc( 619 "Could not open '%1' for writing: %2",$to,$!)); 620 } 621 622 my $path = File::Spec::Unix->catfile( $self->path, $self->file ); 623 my $req = "GET $path HTTP/1.0\x0d\x0aHost: " . $self->host . "\x0d\x0a\x0d\x0a"; 624 $sock->send( $req ); 625 626 my $select = IO::Select->new( $sock ); 627 628 my $resp = ''; 629 my $normal = 0; 630 while ( $select->can_read( $TIMEOUT || 60 ) ) { 631 my $ret = $sock->sysread( $resp, 4096, length($resp) ); 632 if ( !defined $ret or $ret == 0 ) { 633 $select->remove( $sock ); 634 $normal++; 635 } 636 } 637 close $sock; 638 639 unless ( $normal ) { 640 return $self->_error(loc("Socket timed out after '%1' seconds", ( $TIMEOUT || 60 ))); 641 } 642 643 # Check the "response" 644 # Strip preceeding blank lines apparently they are allowed (RFC 2616 4.1) 645 $resp =~ s/^(\x0d?\x0a)+//; 646 # Check it is an HTTP response 647 unless ( $resp =~ m!^HTTP/(\d+)\.(\d+)!i ) { 648 return $self->_error(loc("Did not get a HTTP response from '%1'",$self->host)); 649 } 650 651 # Check for OK 652 my ($code) = $resp =~ m!^HTTP/\d+\.\d+\s+(\d+)!i; 653 unless ( $code eq '200' ) { 654 return $self->_error(loc("Got a '%1' from '%2' expected '200'",$code,$self->host)); 655 } 656 657 print $fh +($resp =~ m/\x0d\x0a\x0d\x0a(.*)$/s )[0]; 658 close $fh; 659 return $to; 660 661 } else { 662 $METHOD_FAIL->{'iosock'} = 1; 663 return; 664 } 665} 666 667### Net::FTP fetching 668sub _netftp_fetch { 669 my $self = shift; 670 my %hash = @_; 671 672 my ($to); 673 my $tmpl = { 674 to => { required => 1, store => \$to } 675 }; 676 check( $tmpl, \%hash ) or return; 677 678 ### required modules ### 679 my $use_list = { 'Net::FTP' => 0 }; 680 681 if( can_load( modules => $use_list ) ) { 682 683 ### make connection ### 684 my $ftp; 685 my @options = ($self->host); 686 push(@options, Timeout => $TIMEOUT) if $TIMEOUT; 687 unless( $ftp = Net::FTP->new( @options ) ) { 688 return $self->_error(loc("Ftp creation failed: %1",$@)); 689 } 690 691 ### login ### 692 unless( $ftp->login( anonymous => $FROM_EMAIL ) ) { 693 return $self->_error(loc("Could not login to '%1'",$self->host)); 694 } 695 696 ### set binary mode, just in case ### 697 $ftp->binary; 698 699 ### create the remote path 700 ### remember remote paths are unix paths! [#11483] 701 my $remote = File::Spec::Unix->catfile( $self->path, $self->file ); 702 703 ### fetch the file ### 704 my $target; 705 unless( $target = $ftp->get( $remote, $to ) ) { 706 return $self->_error(loc("Could not fetch '%1' from '%2'", 707 $remote, $self->host)); 708 } 709 710 ### log out ### 711 $ftp->quit; 712 713 return $target; 714 715 } else { 716 $METHOD_FAIL->{'netftp'} = 1; 717 return; 718 } 719} 720 721### /bin/wget fetch ### 722sub _wget_fetch { 723 my $self = shift; 724 my %hash = @_; 725 726 my ($to); 727 my $tmpl = { 728 to => { required => 1, store => \$to } 729 }; 730 check( $tmpl, \%hash ) or return; 731 732 ### see if we have a wget binary ### 733 if( my $wget = can_run('wget') ) { 734 735 ### no verboseness, thanks ### 736 my $cmd = [ $wget, '--quiet' ]; 737 738 ### if a timeout is set, add it ### 739 push(@$cmd, '--timeout=' . $TIMEOUT) if $TIMEOUT; 740 741 ### run passive if specified ### 742 push @$cmd, '--passive-ftp' if $FTP_PASSIVE; 743 744 ### set the output document, add the uri ### 745 push @$cmd, '--output-document', $to, $self->uri; 746 747 ### with IPC::Cmd > 0.41, this is fixed in teh library, 748 ### and there's no need for special casing any more. 749 ### DO NOT quote things for IPC::Run, it breaks stuff. 750 # $IPC::Cmd::USE_IPC_RUN 751 # ? ($to, $self->uri) 752 # : (QUOTE. $to .QUOTE, QUOTE. $self->uri .QUOTE); 753 754 ### shell out ### 755 my $captured; 756 unless(run( command => $cmd, 757 buffer => \$captured, 758 verbose => $DEBUG 759 )) { 760 ### wget creates the output document always, even if the fetch 761 ### fails.. so unlink it in that case 762 1 while unlink $to; 763 764 return $self->_error(loc( "Command failed: %1", $captured || '' )); 765 } 766 767 return $to; 768 769 } else { 770 $METHOD_FAIL->{'wget'} = 1; 771 return; 772 } 773} 774 775### /bin/lftp fetch ### 776sub _lftp_fetch { 777 my $self = shift; 778 my %hash = @_; 779 780 my ($to); 781 my $tmpl = { 782 to => { required => 1, store => \$to } 783 }; 784 check( $tmpl, \%hash ) or return; 785 786 ### see if we have a wget binary ### 787 if( my $lftp = can_run('lftp') ) { 788 789 ### no verboseness, thanks ### 790 my $cmd = [ $lftp, '-f' ]; 791 792 my $fh = File::Temp->new; 793 794 my $str; 795 796 ### if a timeout is set, add it ### 797 $str .= "set net:timeout $TIMEOUT;\n" if $TIMEOUT; 798 799 ### run passive if specified ### 800 $str .= "set ftp:passive-mode 1;\n" if $FTP_PASSIVE; 801 802 ### set the output document, add the uri ### 803 ### quote the URI, because lftp supports certain shell 804 ### expansions, most notably & for backgrounding. 805 ### ' quote does nto work, must be " 806 $str .= q[get ']. $self->uri .q[' -o ]. $to . $/; 807 808 if( $DEBUG ) { 809 my $pp_str = join ' ', split $/, $str; 810 print "# lftp command: $pp_str\n"; 811 } 812 813 ### write straight to the file. 814 $fh->autoflush(1); 815 print $fh $str; 816 817 ### the command needs to be 1 string to be executed 818 push @$cmd, $fh->filename; 819 820 ### with IPC::Cmd > 0.41, this is fixed in teh library, 821 ### and there's no need for special casing any more. 822 ### DO NOT quote things for IPC::Run, it breaks stuff. 823 # $IPC::Cmd::USE_IPC_RUN 824 # ? ($to, $self->uri) 825 # : (QUOTE. $to .QUOTE, QUOTE. $self->uri .QUOTE); 826 827 828 ### shell out ### 829 my $captured; 830 unless(run( command => $cmd, 831 buffer => \$captured, 832 verbose => $DEBUG 833 )) { 834 ### wget creates the output document always, even if the fetch 835 ### fails.. so unlink it in that case 836 1 while unlink $to; 837 838 return $self->_error(loc( "Command failed: %1", $captured || '' )); 839 } 840 841 return $to; 842 843 } else { 844 $METHOD_FAIL->{'lftp'} = 1; 845 return; 846 } 847} 848 849 850 851### /bin/ftp fetch ### 852sub _ftp_fetch { 853 my $self = shift; 854 my %hash = @_; 855 856 my ($to); 857 my $tmpl = { 858 to => { required => 1, store => \$to } 859 }; 860 check( $tmpl, \%hash ) or return; 861 862 ### see if we have a ftp binary ### 863 if( my $ftp = can_run('ftp') ) { 864 865 my $fh = FileHandle->new; 866 867 local $SIG{CHLD} = 'IGNORE'; 868 869 unless ($fh->open("|$ftp -n")) { 870 return $self->_error(loc("%1 creation failed: %2", $ftp, $!)); 871 } 872 873 my @dialog = ( 874 "lcd " . dirname($to), 875 "open " . $self->host, 876 "user anonymous $FROM_EMAIL", 877 "cd /", 878 "cd " . $self->path, 879 "binary", 880 "get " . $self->file . " " . $self->output_file, 881 "quit", 882 ); 883 884 foreach (@dialog) { $fh->print($_, "\n") } 885 $fh->close or return; 886 887 return $to; 888 } 889} 890 891### lynx is stupid - it decompresses any .gz file it finds to be text 892### use /bin/lynx to fetch files 893sub _lynx_fetch { 894 my $self = shift; 895 my %hash = @_; 896 897 my ($to); 898 my $tmpl = { 899 to => { required => 1, store => \$to } 900 }; 901 check( $tmpl, \%hash ) or return; 902 903 ### see if we have a lynx binary ### 904 if( my $lynx = can_run('lynx') ) { 905 906 unless( IPC::Cmd->can_capture_buffer ) { 907 $METHOD_FAIL->{'lynx'} = 1; 908 909 return $self->_error(loc( 910 "Can not capture buffers. Can not use '%1' to fetch files", 911 'lynx' )); 912 } 913 914 ### check if the HTTP resource exists ### 915 if ($self->uri =~ /^https?:\/\//i) { 916 my $cmd = [ 917 $lynx, 918 '-head', 919 '-source', 920 "-auth=anonymous:$FROM_EMAIL", 921 ]; 922 923 push @$cmd, "-connect_timeout=$TIMEOUT" if $TIMEOUT; 924 925 push @$cmd, $self->uri; 926 927 ### shell out ### 928 my $head; 929 unless(run( command => $cmd, 930 buffer => \$head, 931 verbose => $DEBUG ) 932 ) { 933 return $self->_error(loc("Command failed: %1", $head || '')); 934 } 935 936 unless($head =~ /^HTTP\/\d+\.\d+ 200\b/) { 937 return $self->_error(loc("Command failed: %1", $head || '')); 938 } 939 } 940 941 ### write to the output file ourselves, since lynx ass_u_mes to much 942 my $local = FileHandle->new(">$to") 943 or return $self->_error(loc( 944 "Could not open '%1' for writing: %2",$to,$!)); 945 946 ### dump to stdout ### 947 my $cmd = [ 948 $lynx, 949 '-source', 950 "-auth=anonymous:$FROM_EMAIL", 951 ]; 952 953 push @$cmd, "-connect_timeout=$TIMEOUT" if $TIMEOUT; 954 955 ### DO NOT quote things for IPC::Run, it breaks stuff. 956 push @$cmd, $self->uri; 957 958 ### with IPC::Cmd > 0.41, this is fixed in teh library, 959 ### and there's no need for special casing any more. 960 ### DO NOT quote things for IPC::Run, it breaks stuff. 961 # $IPC::Cmd::USE_IPC_RUN 962 # ? $self->uri 963 # : QUOTE. $self->uri .QUOTE; 964 965 966 ### shell out ### 967 my $captured; 968 unless(run( command => $cmd, 969 buffer => \$captured, 970 verbose => $DEBUG ) 971 ) { 972 return $self->_error(loc("Command failed: %1", $captured || '')); 973 } 974 975 ### print to local file ### 976 ### XXX on a 404 with a special error page, $captured will actually 977 ### hold the contents of that page, and make it *appear* like the 978 ### request was a success, when really it wasn't :( 979 ### there doesn't seem to be an option for lynx to change the exit 980 ### code based on a 4XX status or so. 981 ### the closest we can come is using --error_file and parsing that, 982 ### which is very unreliable ;( 983 $local->print( $captured ); 984 $local->close or return; 985 986 return $to; 987 988 } else { 989 $METHOD_FAIL->{'lynx'} = 1; 990 return; 991 } 992} 993 994### use /bin/ncftp to fetch files 995sub _ncftp_fetch { 996 my $self = shift; 997 my %hash = @_; 998 999 my ($to); 1000 my $tmpl = { 1001 to => { required => 1, store => \$to } 1002 }; 1003 check( $tmpl, \%hash ) or return; 1004 1005 ### we can only set passive mode in interactive sesssions, so bail out 1006 ### if $FTP_PASSIVE is set 1007 return if $FTP_PASSIVE; 1008 1009 ### see if we have a ncftp binary ### 1010 if( my $ncftp = can_run('ncftp') ) { 1011 1012 my $cmd = [ 1013 $ncftp, 1014 '-V', # do not be verbose 1015 '-p', $FROM_EMAIL, # email as password 1016 $self->host, # hostname 1017 dirname($to), # local dir for the file 1018 # remote path to the file 1019 ### DO NOT quote things for IPC::Run, it breaks stuff. 1020 $IPC::Cmd::USE_IPC_RUN 1021 ? File::Spec::Unix->catdir( $self->path, $self->file ) 1022 : QUOTE. File::Spec::Unix->catdir( 1023 $self->path, $self->file ) .QUOTE 1024 1025 ]; 1026 1027 ### shell out ### 1028 my $captured; 1029 unless(run( command => $cmd, 1030 buffer => \$captured, 1031 verbose => $DEBUG ) 1032 ) { 1033 return $self->_error(loc("Command failed: %1", $captured || '')); 1034 } 1035 1036 return $to; 1037 1038 } else { 1039 $METHOD_FAIL->{'ncftp'} = 1; 1040 return; 1041 } 1042} 1043 1044### use /bin/curl to fetch files 1045sub _curl_fetch { 1046 my $self = shift; 1047 my %hash = @_; 1048 1049 my ($to); 1050 my $tmpl = { 1051 to => { required => 1, store => \$to } 1052 }; 1053 check( $tmpl, \%hash ) or return; 1054 1055 if (my $curl = can_run('curl')) { 1056 1057 ### these long opts are self explanatory - I like that -jmb 1058 my $cmd = [ $curl, '-q' ]; 1059 1060 push(@$cmd, '--connect-timeout', $TIMEOUT) if $TIMEOUT; 1061 1062 push(@$cmd, '--silent') unless $DEBUG; 1063 1064 ### curl does the right thing with passive, regardless ### 1065 if ($self->scheme eq 'ftp') { 1066 push(@$cmd, '--user', "anonymous:$FROM_EMAIL"); 1067 } 1068 1069 ### curl doesn't follow 302 (temporarily moved) etc automatically 1070 ### so we add --location to enable that. 1071 push @$cmd, '--fail', '--location', '--output', $to, $self->uri; 1072 1073 ### with IPC::Cmd > 0.41, this is fixed in teh library, 1074 ### and there's no need for special casing any more. 1075 ### DO NOT quote things for IPC::Run, it breaks stuff. 1076 # $IPC::Cmd::USE_IPC_RUN 1077 # ? ($to, $self->uri) 1078 # : (QUOTE. $to .QUOTE, QUOTE. $self->uri .QUOTE); 1079 1080 1081 my $captured; 1082 unless(run( command => $cmd, 1083 buffer => \$captured, 1084 verbose => $DEBUG ) 1085 ) { 1086 1087 return $self->_error(loc("Command failed: %1", $captured || '')); 1088 } 1089 1090 return $to; 1091 1092 } else { 1093 $METHOD_FAIL->{'curl'} = 1; 1094 return; 1095 } 1096} 1097 1098 1099### use File::Copy for fetching file:// urls ### 1100### 1101### See section 3.10 of RFC 1738 (http://www.faqs.org/rfcs/rfc1738.html) 1102### Also see wikipedia on file:// (http://en.wikipedia.org/wiki/File://) 1103### 1104 1105sub _file_fetch { 1106 my $self = shift; 1107 my %hash = @_; 1108 1109 my ($to); 1110 my $tmpl = { 1111 to => { required => 1, store => \$to } 1112 }; 1113 check( $tmpl, \%hash ) or return; 1114 1115 1116 1117 ### prefix a / on unix systems with a file uri, since it would 1118 ### look somewhat like this: 1119 ### file:///home/kane/file 1120 ### wheras windows file uris for 'c:\some\dir\file' might look like: 1121 ### file:///C:/some/dir/file 1122 ### file:///C|/some/dir/file 1123 ### or for a network share '\\host\share\some\dir\file': 1124 ### file:////host/share/some/dir/file 1125 ### 1126 ### VMS file uri's for 'DISK$USER:[MY.NOTES]NOTE123456.TXT' might look like: 1127 ### file://vms.host.edu/disk$user/my/notes/note12345.txt 1128 ### 1129 1130 my $path = $self->path; 1131 my $vol = $self->vol; 1132 my $share = $self->share; 1133 1134 my $remote; 1135 if (!$share and $self->host) { 1136 return $self->_error(loc( 1137 "Currently %1 cannot handle hosts in %2 urls", 1138 'File::Fetch', 'file://' 1139 )); 1140 } 1141 1142 if( $vol ) { 1143 $path = File::Spec->catdir( split /\//, $path ); 1144 $remote = File::Spec->catpath( $vol, $path, $self->file); 1145 1146 } elsif( $share ) { 1147 ### win32 specific, and a share name, so we wont bother with File::Spec 1148 $path =~ s|/+|\\|g; 1149 $remote = "\\\\".$self->host."\\$share\\$path"; 1150 1151 } else { 1152 ### File::Spec on VMS can not currently handle UNIX syntax. 1153 my $file_class = ON_VMS 1154 ? 'File::Spec::Unix' 1155 : 'File::Spec'; 1156 1157 $remote = $file_class->catfile( $path, $self->file ); 1158 } 1159 1160 ### File::Copy is littered with 'die' statements :( ### 1161 my $rv = eval { File::Copy::copy( $remote, $to ) }; 1162 1163 ### something went wrong ### 1164 if( !$rv or $@ ) { 1165 return $self->_error(loc("Could not copy '%1' to '%2': %3 %4", 1166 $remote, $to, $!, $@)); 1167 } 1168 1169 return $to; 1170} 1171 1172### use /usr/bin/rsync to fetch files 1173sub _rsync_fetch { 1174 my $self = shift; 1175 my %hash = @_; 1176 1177 my ($to); 1178 my $tmpl = { 1179 to => { required => 1, store => \$to } 1180 }; 1181 check( $tmpl, \%hash ) or return; 1182 1183 if (my $rsync = can_run('rsync')) { 1184 1185 my $cmd = [ $rsync ]; 1186 1187 ### XXX: rsync has no I/O timeouts at all, by default 1188 push(@$cmd, '--timeout=' . $TIMEOUT) if $TIMEOUT; 1189 1190 push(@$cmd, '--quiet') unless $DEBUG; 1191 1192 ### DO NOT quote things for IPC::Run, it breaks stuff. 1193 push @$cmd, $self->uri, $to; 1194 1195 ### with IPC::Cmd > 0.41, this is fixed in teh library, 1196 ### and there's no need for special casing any more. 1197 ### DO NOT quote things for IPC::Run, it breaks stuff. 1198 # $IPC::Cmd::USE_IPC_RUN 1199 # ? ($to, $self->uri) 1200 # : (QUOTE. $to .QUOTE, QUOTE. $self->uri .QUOTE); 1201 1202 my $captured; 1203 unless(run( command => $cmd, 1204 buffer => \$captured, 1205 verbose => $DEBUG ) 1206 ) { 1207 1208 return $self->_error(loc("Command %1 failed: %2", 1209 "@$cmd" || '', $captured || '')); 1210 } 1211 1212 return $to; 1213 1214 } else { 1215 $METHOD_FAIL->{'rsync'} = 1; 1216 return; 1217 } 1218} 1219 1220################################# 1221# 1222# Error code 1223# 1224################################# 1225 1226=pod 1227 1228=head2 $ff->error([BOOL]) 1229 1230Returns the last encountered error as string. 1231Pass it a true value to get the C<Carp::longmess()> output instead. 1232 1233=cut 1234 1235### error handling the way Archive::Extract does it 1236sub _error { 1237 my $self = shift; 1238 my $error = shift; 1239 1240 $self->_error_msg( $error ); 1241 $self->_error_msg_long( Carp::longmess($error) ); 1242 1243 if( $WARN ) { 1244 carp $DEBUG ? $self->_error_msg_long : $self->_error_msg; 1245 } 1246 1247 return; 1248} 1249 1250sub error { 1251 my $self = shift; 1252 return shift() ? $self->_error_msg_long : $self->_error_msg; 1253} 1254 1255 12561; 1257 1258=pod 1259 1260=head1 HOW IT WORKS 1261 1262File::Fetch is able to fetch a variety of uris, by using several 1263external programs and modules. 1264 1265Below is a mapping of what utilities will be used in what order 1266for what schemes, if available: 1267 1268 file => LWP, lftp, file 1269 http => LWP, wget, curl, lftp, lynx, iosock 1270 ftp => LWP, Net::FTP, wget, curl, lftp, ncftp, ftp 1271 rsync => rsync 1272 1273If you'd like to disable the use of one or more of these utilities 1274and/or modules, see the C<$BLACKLIST> variable further down. 1275 1276If a utility or module isn't available, it will be marked in a cache 1277(see the C<$METHOD_FAIL> variable further down), so it will not be 1278tried again. The C<fetch> method will only fail when all options are 1279exhausted, and it was not able to retrieve the file. 1280 1281C<iosock> is a very limited L<IO::Socket::INET> based mechanism for 1282retrieving C<http> schemed urls. It doesn't follow redirects for instance. 1283 1284A special note about fetching files from an ftp uri: 1285 1286By default, all ftp connections are done in passive mode. To change 1287that, see the C<$FTP_PASSIVE> variable further down. 1288 1289Furthermore, ftp uris only support anonymous connections, so no 1290named user/password pair can be passed along. 1291 1292C</bin/ftp> is blacklisted by default; see the C<$BLACKLIST> variable 1293further down. 1294 1295=head1 GLOBAL VARIABLES 1296 1297The behaviour of File::Fetch can be altered by changing the following 1298global variables: 1299 1300=head2 $File::Fetch::FROM_EMAIL 1301 1302This is the email address that will be sent as your anonymous ftp 1303password. 1304 1305Default is C<File-Fetch@example.com>. 1306 1307=head2 $File::Fetch::USER_AGENT 1308 1309This is the useragent as C<LWP> will report it. 1310 1311Default is C<File::Fetch/$VERSION>. 1312 1313=head2 $File::Fetch::FTP_PASSIVE 1314 1315This variable controls whether the environment variable C<FTP_PASSIVE> 1316and any passive switches to commandline tools will be set to true. 1317 1318Default value is 1. 1319 1320Note: When $FTP_PASSIVE is true, C<ncftp> will not be used to fetch 1321files, since passive mode can only be set interactively for this binary 1322 1323=head2 $File::Fetch::TIMEOUT 1324 1325When set, controls the network timeout (counted in seconds). 1326 1327Default value is 0. 1328 1329=head2 $File::Fetch::WARN 1330 1331This variable controls whether errors encountered internally by 1332C<File::Fetch> should be C<carp>'d or not. 1333 1334Set to false to silence warnings. Inspect the output of the C<error()> 1335method manually to see what went wrong. 1336 1337Defaults to C<true>. 1338 1339=head2 $File::Fetch::DEBUG 1340 1341This enables debugging output when calling commandline utilities to 1342fetch files. 1343This also enables C<Carp::longmess> errors, instead of the regular 1344C<carp> errors. 1345 1346Good for tracking down why things don't work with your particular 1347setup. 1348 1349Default is 0. 1350 1351=head2 $File::Fetch::BLACKLIST 1352 1353This is an array ref holding blacklisted modules/utilities for fetching 1354files with. 1355 1356To disallow the use of, for example, C<LWP> and C<Net::FTP>, you could 1357set $File::Fetch::BLACKLIST to: 1358 1359 $File::Fetch::BLACKLIST = [qw|lwp netftp|] 1360 1361The default blacklist is [qw|ftp|], as C</bin/ftp> is rather unreliable. 1362 1363See the note on C<MAPPING> below. 1364 1365=head2 $File::Fetch::METHOD_FAIL 1366 1367This is a hashref registering what modules/utilities were known to fail 1368for fetching files (mostly because they weren't installed). 1369 1370You can reset this cache by assigning an empty hashref to it, or 1371individually remove keys. 1372 1373See the note on C<MAPPING> below. 1374 1375=head1 MAPPING 1376 1377 1378Here's a quick mapping for the utilities/modules, and their names for 1379the $BLACKLIST, $METHOD_FAIL and other internal functions. 1380 1381 LWP => lwp 1382 Net::FTP => netftp 1383 wget => wget 1384 lynx => lynx 1385 ncftp => ncftp 1386 ftp => ftp 1387 curl => curl 1388 rsync => rsync 1389 lftp => lftp 1390 IO::Socket => iosock 1391 1392=head1 FREQUENTLY ASKED QUESTIONS 1393 1394=head2 So how do I use a proxy with File::Fetch? 1395 1396C<File::Fetch> currently only supports proxies with LWP::UserAgent. 1397You will need to set your environment variables accordingly. For 1398example, to use an ftp proxy: 1399 1400 $ENV{ftp_proxy} = 'foo.com'; 1401 1402Refer to the LWP::UserAgent manpage for more details. 1403 1404=head2 I used 'lynx' to fetch a file, but its contents is all wrong! 1405 1406C<lynx> can only fetch remote files by dumping its contents to C<STDOUT>, 1407which we in turn capture. If that content is a 'custom' error file 1408(like, say, a C<404 handler>), you will get that contents instead. 1409 1410Sadly, C<lynx> doesn't support any options to return a different exit 1411code on non-C<200 OK> status, giving us no way to tell the difference 1412between a 'successfull' fetch and a custom error page. 1413 1414Therefor, we recommend to only use C<lynx> as a last resort. This is 1415why it is at the back of our list of methods to try as well. 1416 1417=head2 Files I'm trying to fetch have reserved characters or non-ASCII characters in them. What do I do? 1418 1419C<File::Fetch> is relatively smart about things. When trying to write 1420a file to disk, it removes the C<query parameters> (see the 1421C<output_file> method for details) from the file name before creating 1422it. In most cases this suffices. 1423 1424If you have any other characters you need to escape, please install 1425the C<URI::Escape> module from CPAN, and pre-encode your URI before 1426passing it to C<File::Fetch>. You can read about the details of URIs 1427and URI encoding here: 1428 1429 http://www.faqs.org/rfcs/rfc2396.html 1430 1431=head1 TODO 1432 1433=over 4 1434 1435=item Implement $PREFER_BIN 1436 1437To indicate to rather use commandline tools than modules 1438 1439=back 1440 1441=head1 BUG REPORTS 1442 1443Please report bugs or other issues to E<lt>bug-file-fetch@rt.cpan.org<gt>. 1444 1445=head1 AUTHOR 1446 1447This module by Jos Boumans E<lt>kane@cpan.orgE<gt>. 1448 1449=head1 COPYRIGHT 1450 1451This library is free software; you may redistribute and/or modify it 1452under the same terms as Perl itself. 1453 1454 1455=cut 1456 1457# Local variables: 1458# c-indentation-style: bsd 1459# c-basic-offset: 4 1460# indent-tabs-mode: nil 1461# End: 1462# vim: expandtab shiftwidth=4: 1463 1464 1465 1466 1467