1package File::Fetch;
2
3use strict;
4use FileHandle;
5use File::Temp;
6use File::Copy;
7use File::Spec;
8use File::Spec::Unix;
9use File::Basename              qw[dirname];
10
11use Cwd                         qw[cwd];
12use Carp                        qw[carp];
13use IPC::Cmd                    qw[can_run run QUOTE];
14use File::Path                  qw[mkpath];
15use File::Temp                  qw[tempdir];
16use Params::Check               qw[check];
17use Module::Load::Conditional   qw[can_load];
18use Locale::Maketext::Simple    Style => 'gettext';
19
20use vars    qw[ $VERBOSE $PREFER_BIN $FROM_EMAIL $USER_AGENT
21                $BLACKLIST $METHOD_FAIL $VERSION $METHODS
22                $FTP_PASSIVE $TIMEOUT $DEBUG $WARN
23            ];
24
25$VERSION        = '0.24';
26$VERSION        = eval $VERSION;    # avoid warnings with development releases
27$PREFER_BIN     = 0;                # XXX TODO implement
28$FROM_EMAIL     = 'File-Fetch@example.com';
29$USER_AGENT     = "File::Fetch/$VERSION";
30$BLACKLIST      = [qw|ftp|];
31$METHOD_FAIL    = { };
32$FTP_PASSIVE    = 1;
33$TIMEOUT        = 0;
34$DEBUG          = 0;
35$WARN           = 1;
36
37### methods available to fetch the file depending on the scheme
38$METHODS = {
39    http    => [ qw|lwp wget curl lftp lynx iosock| ],
40    ftp     => [ qw|lwp netftp wget curl lftp ncftp ftp| ],
41    file    => [ qw|lwp lftp file| ],
42    rsync   => [ qw|rsync| ]
43};
44
45### silly warnings ###
46local $Params::Check::VERBOSE               = 1;
47local $Params::Check::VERBOSE               = 1;
48local $Module::Load::Conditional::VERBOSE   = 0;
49local $Module::Load::Conditional::VERBOSE   = 0;
50
51### see what OS we are on, important for file:// uris ###
52use constant ON_WIN     => ($^O eq 'MSWin32');
53use constant ON_VMS     => ($^O eq 'VMS');
54use constant ON_UNIX    => (!ON_WIN);
55use constant HAS_VOL    => (ON_WIN);
56use constant HAS_SHARE  => (ON_WIN);
57
58
59=pod
60
61=head1 NAME
62
63File::Fetch - A generic file fetching mechanism
64
65=head1 SYNOPSIS
66
67    use File::Fetch;
68
69    ### build a File::Fetch object ###
70    my $ff = File::Fetch->new(uri => 'http://some.where.com/dir/a.txt');
71
72    ### fetch the uri to cwd() ###
73    my $where = $ff->fetch() or die $ff->error;
74
75    ### fetch the uri to /tmp ###
76    my $where = $ff->fetch( to => '/tmp' );
77
78    ### parsed bits from the uri ###
79    $ff->uri;
80    $ff->scheme;
81    $ff->host;
82    $ff->path;
83    $ff->file;
84
85=head1 DESCRIPTION
86
87File::Fetch is a generic file fetching mechanism.
88
89It allows you to fetch any file pointed to by a C<ftp>, C<http>,
90C<file>, or C<rsync> uri by a number of different means.
91
92See the C<HOW IT WORKS> section further down for details.
93
94=head1 ACCESSORS
95
96A C<File::Fetch> object has the following accessors
97
98=over 4
99
100=item $ff->uri
101
102The uri you passed to the constructor
103
104=item $ff->scheme
105
106The scheme from the uri (like 'file', 'http', etc)
107
108=item $ff->host
109
110The hostname in the uri.  Will be empty if host was originally
111'localhost' for a 'file://' url.
112
113=item $ff->vol
114
115On operating systems with the concept of a volume the second element
116of a file:// is considered to the be volume specification for the file.
117Thus on Win32 this routine returns the volume, on other operating
118systems this returns nothing.
119
120On Windows this value may be empty if the uri is to a network share, in
121which case the 'share' property will be defined. Additionally, volume
122specifications that use '|' as ':' will be converted on read to use ':'.
123
124On VMS, which has a volume concept, this field will be empty because VMS
125file specifications are converted to absolute UNIX format and the volume
126information is transparently included.
127
128=item $ff->share
129
130On systems with the concept of a network share (currently only Windows) returns
131the sharename from a file://// url.  On other operating systems returns empty.
132
133=item $ff->path
134
135The path from the uri, will be at least a single '/'.
136
137=item $ff->file
138
139The name of the remote file. For the local file name, the
140result of $ff->output_file will be used.
141
142=cut
143
144
145##########################
146### Object & Accessors ###
147##########################
148
149{
150    ### template for autogenerated accessors ###
151    my $Tmpl = {
152        scheme          => { default => 'http' },
153        host            => { default => 'localhost' },
154        path            => { default => '/' },
155        file            => { required => 1 },
156        uri             => { required => 1 },
157        vol             => { default => '' }, # windows for file:// uris
158        share           => { default => '' }, # windows for file:// uris
159        _error_msg      => { no_override => 1 },
160        _error_msg_long => { no_override => 1 },
161    };
162
163    for my $method ( keys %$Tmpl ) {
164        no strict 'refs';
165        *$method = sub {
166                        my $self = shift;
167                        $self->{$method} = $_[0] if @_;
168                        return $self->{$method};
169                    }
170    }
171
172    sub _create {
173        my $class = shift;
174        my %hash  = @_;
175
176        my $args = check( $Tmpl, \%hash ) or return;
177
178        bless $args, $class;
179
180        if( lc($args->scheme) ne 'file' and not $args->host ) {
181            return $class->_error(loc(
182                "Hostname required when fetching from '%1'",$args->scheme));
183        }
184
185        for (qw[path file]) {
186            unless( $args->$_() ) { # 5.5.x needs the ()
187                return $class->_error(loc("No '%1' specified",$_));
188            }
189        }
190
191        return $args;
192    }
193}
194
195=item $ff->output_file
196
197The name of the output file. This is the same as $ff->file,
198but any query parameters are stripped off. For example:
199
200    http://example.com/index.html?x=y
201
202would make the output file be C<index.html> rather than
203C<index.html?x=y>.
204
205=back
206
207=cut
208
209sub output_file {
210    my $self = shift;
211    my $file = $self->file;
212
213    $file =~ s/\?.*$//g;
214
215    return $file;
216}
217
218### XXX do this or just point to URI::Escape?
219# =head2 $esc_uri = $ff->escaped_uri
220#
221# =cut
222#
223# ### most of this is stolen straight from URI::escape
224# {   ### Build a char->hex map
225#     my %escapes = map { chr($_) => sprintf("%%%02X", $_) } 0..255;
226#
227#     sub escaped_uri {
228#         my $self = shift;
229#         my $uri  = $self->uri;
230#
231#         ### Default unsafe characters.  RFC 2732 ^(uric - reserved)
232#         $uri =~ s/([^A-Za-z0-9\-_.!~*'()])/
233#                     $escapes{$1} || $self->_fail_hi($1)/ge;
234#
235#         return $uri;
236#     }
237#
238#     sub _fail_hi {
239#         my $self = shift;
240#         my $char = shift;
241#
242#         $self->_error(loc(
243#             "Can't escape '%1', try using the '%2' module instead",
244#             sprintf("\\x{%04X}", ord($char)), 'URI::Escape'
245#         ));
246#     }
247#
248#     sub output_file {
249#
250#     }
251#
252#
253# }
254
255=head1 METHODS
256
257=head2 $ff = File::Fetch->new( uri => 'http://some.where.com/dir/file.txt' );
258
259Parses the uri and creates a corresponding File::Fetch::Item object,
260that is ready to be C<fetch>ed and returns it.
261
262Returns false on failure.
263
264=cut
265
266sub new {
267    my $class = shift;
268    my %hash  = @_;
269
270    my ($uri);
271    my $tmpl = {
272        uri => { required => 1, store => \$uri },
273    };
274
275    check( $tmpl, \%hash ) or return;
276
277    ### parse the uri to usable parts ###
278    my $href    = $class->_parse_uri( $uri ) or return;
279
280    ### make it into a FFI object ###
281    my $ff      = $class->_create( %$href ) or return;
282
283
284    ### return the object ###
285    return $ff;
286}
287
288### parses an uri to a hash structure:
289###
290### $class->_parse_uri( 'ftp://ftp.cpan.org/pub/mirror/index.txt' )
291###
292### becomes:
293###
294### $href = {
295###     scheme  => 'ftp',
296###     host    => 'ftp.cpan.org',
297###     path    => '/pub/mirror',
298###     file    => 'index.html'
299### };
300###
301### In the case of file:// urls there maybe be additional fields
302###
303### For systems with volume specifications such as Win32 there will be
304### a volume specifier provided in the 'vol' field.
305###
306###   'vol' => 'volumename'
307###
308### For windows file shares there may be a 'share' key specified
309###
310###   'share' => 'sharename'
311###
312### Note that the rules of what a file:// url means vary by the operating system
313### of the host being addressed. Thus file:///d|/foo/bar.txt means the obvious
314### 'D:\foo\bar.txt' on windows, but on unix it means '/d|/foo/bar.txt' and
315### not '/foo/bar.txt'
316###
317### Similarly if the host interpreting the url is VMS then
318### file:///disk$user/my/notes/note12345.txt' means
319### 'DISK$USER:[MY.NOTES]NOTE123456.TXT' but will be returned the same as
320### if it is unix where it means /disk$user/my/notes/note12345.txt'.
321### Except for some cases in the File::Spec methods, Perl on VMS will generally
322### handle UNIX format file specifications.
323###
324### This means it is impossible to serve certain file:// urls on certain systems.
325###
326### Thus are the problems with a protocol-less specification. :-(
327###
328
329sub _parse_uri {
330    my $self = shift;
331    my $uri  = shift or return;
332
333    my $href = { uri => $uri };
334
335    ### find the scheme ###
336    $uri            =~ s|^(\w+)://||;
337    $href->{scheme} = $1;
338
339    ### See rfc 1738 section 3.10
340    ### http://www.faqs.org/rfcs/rfc1738.html
341    ### And wikipedia for more on windows file:// urls
342    ### http://en.wikipedia.org/wiki/File://
343    if( $href->{scheme} eq 'file' ) {
344
345        my @parts = split '/',$uri;
346
347        ### file://hostname/...
348        ### file://hostname/...
349        ### normalize file://localhost with file:///
350        $href->{host} = $parts[0] || '';
351
352        ### index in @parts where the path components begin;
353        my $index = 1;
354
355        ### file:////hostname/sharename/blah.txt
356        if ( HAS_SHARE and not length $parts[0] and not length $parts[1] ) {
357
358            $href->{host}   = $parts[2] || '';  # avoid warnings
359            $href->{share}  = $parts[3] || '';  # avoid warnings
360
361            $index          = 4         # index after the share
362
363        ### file:///D|/blah.txt
364        ### file:///D:/blah.txt
365        } elsif (HAS_VOL) {
366
367            ### this code comes from dmq's patch, but:
368            ### XXX if volume is empty, wouldn't that be an error? --kane
369            ### if so, our file://localhost test needs to be fixed as wel
370            $href->{vol}    = $parts[1] || '';
371
372            ### correct D| style colume descriptors
373            $href->{vol}    =~ s/\A([A-Z])\|\z/$1:/i if ON_WIN;
374
375            $index          = 2;        # index after the volume
376        }
377
378        ### rebuild the path from the leftover parts;
379        $href->{path} = join '/', '', splice( @parts, $index, $#parts );
380
381    } else {
382        ### using anything but qw() in hash slices may produce warnings
383        ### in older perls :-(
384        @{$href}{ qw(host path) } = $uri =~ m|([^/]*)(/.*)$|s;
385    }
386
387    ### split the path into file + dir ###
388    {   my @parts = File::Spec::Unix->splitpath( delete $href->{path} );
389        $href->{path} = $parts[1];
390        $href->{file} = $parts[2];
391    }
392
393    ### host will be empty if the target was 'localhost' and the
394    ### scheme was 'file'
395    $href->{host} = '' if   ($href->{host}      eq 'localhost') and
396                            ($href->{scheme}    eq 'file');
397
398    return $href;
399}
400
401=head2 $where = $ff->fetch( [to => /my/output/dir/ | \$scalar] )
402
403Fetches the file you requested and returns the full path to the file.
404
405By default it writes to C<cwd()>, but you can override that by specifying
406the C<to> argument:
407
408    ### file fetch to /tmp, full path to the file in $where
409    $where = $ff->fetch( to => '/tmp' );
410
411    ### file slurped into $scalar, full path to the file in $where
412    ### file is downloaded to a temp directory and cleaned up at exit time
413    $where = $ff->fetch( to => \$scalar );
414
415Returns the full path to the downloaded file on success, and false
416on failure.
417
418=cut
419
420sub fetch {
421    my $self = shift or return;
422    my %hash = @_;
423
424    my $target;
425    my $tmpl = {
426        to  => { default => cwd(), store => \$target },
427    };
428
429    check( $tmpl, \%hash ) or return;
430
431    my ($to, $fh);
432    ### you want us to slurp the contents
433    if( ref $target and UNIVERSAL::isa( $target, 'SCALAR' ) ) {
434        $to = tempdir( 'FileFetch.XXXXXX', CLEANUP => 1 );
435
436    ### plain old fetch
437    } else {
438        $to = $target;
439
440        ### On VMS force to VMS format so File::Spec will work.
441        $to = VMS::Filespec::vmspath($to) if ON_VMS;
442
443        ### create the path if it doesn't exist yet ###
444        unless( -d $to ) {
445            eval { mkpath( $to ) };
446
447            return $self->_error(loc("Could not create path '%1'",$to)) if $@;
448        }
449    }
450
451    ### set passive ftp if required ###
452    local $ENV{FTP_PASSIVE} = $FTP_PASSIVE;
453
454    ### we dont use catfile on win32 because if we are using a cygwin tool
455    ### under cmd.exe they wont understand windows style separators.
456    my $out_to = ON_WIN ? $to.'/'.$self->output_file
457                        : File::Spec->catfile( $to, $self->output_file );
458
459    for my $method ( @{ $METHODS->{$self->scheme} } ) {
460        my $sub =  '_'.$method.'_fetch';
461
462        unless( __PACKAGE__->can($sub) ) {
463            $self->_error(loc("Cannot call method for '%1' -- WEIRD!",
464                        $method));
465            next;
466        }
467
468        ### method is blacklisted ###
469        next if grep { lc $_ eq $method } @$BLACKLIST;
470
471        ### method is known to fail ###
472        next if $METHOD_FAIL->{$method};
473
474        ### there's serious issues with IPC::Run and quoting of command
475        ### line arguments. using quotes in the wrong place breaks things,
476        ### and in the case of say,
477        ### C:\cygwin\bin\wget.EXE --quiet --passive-ftp --output-document
478        ### "index.html" "http://www.cpan.org/index.html?q=1&y=2"
479        ### it doesn't matter how you quote, it always fails.
480        local $IPC::Cmd::USE_IPC_RUN = 0;
481
482        if( my $file = $self->$sub(
483                        to => $out_to
484        )){
485
486            unless( -e $file && -s _ ) {
487                $self->_error(loc("'%1' said it fetched '%2', ".
488                     "but it was not created",$method,$file));
489
490                ### mark the failure ###
491                $METHOD_FAIL->{$method} = 1;
492
493                next;
494
495            } else {
496
497                ### slurp mode?
498                if( ref $target and UNIVERSAL::isa( $target, 'SCALAR' ) ) {
499
500                    ### open the file
501                    open my $fh, $file or do {
502                        $self->_error(
503                            loc("Could not open '%1': %2", $file, $!));
504                        return;
505                    };
506
507                    ### slurp
508                    $$target = do { local $/; <$fh> };
509
510                }
511
512                my $abs = File::Spec->rel2abs( $file );
513                return $abs;
514
515            }
516        }
517    }
518
519
520    ### if we got here, we looped over all methods, but we weren't able
521    ### to fetch it.
522    return;
523}
524
525########################
526### _*_fetch methods ###
527########################
528
529### LWP fetching ###
530sub _lwp_fetch {
531    my $self = shift;
532    my %hash = @_;
533
534    my ($to);
535    my $tmpl = {
536        to  => { required => 1, store => \$to }
537    };
538    check( $tmpl, \%hash ) or return;
539
540    ### modules required to download with lwp ###
541    my $use_list = {
542        LWP                 => '0.0',
543        'LWP::UserAgent'    => '0.0',
544        'HTTP::Request'     => '0.0',
545        'HTTP::Status'      => '0.0',
546        URI                 => '0.0',
547
548    };
549
550    if( can_load(modules => $use_list) ) {
551
552        ### setup the uri object
553        my $uri = URI->new( File::Spec::Unix->catfile(
554                                    $self->path, $self->file
555                        ) );
556
557        ### special rules apply for file:// uris ###
558        $uri->scheme( $self->scheme );
559        $uri->host( $self->scheme eq 'file' ? '' : $self->host );
560        $uri->userinfo("anonymous:$FROM_EMAIL") if $self->scheme ne 'file';
561
562        ### set up the useragent object
563        my $ua = LWP::UserAgent->new();
564        $ua->timeout( $TIMEOUT ) if $TIMEOUT;
565        $ua->agent( $USER_AGENT );
566        $ua->from( $FROM_EMAIL );
567        $ua->env_proxy;
568
569        my $res = $ua->mirror($uri, $to) or return;
570
571        ### uptodate or fetched ok ###
572        if ( $res->code == 304 or $res->code == 200 ) {
573            return $to;
574
575        } else {
576            return $self->_error(loc("Fetch failed! HTTP response: %1 %2 [%3]",
577                        $res->code, HTTP::Status::status_message($res->code),
578                        $res->status_line));
579        }
580
581    } else {
582        $METHOD_FAIL->{'lwp'} = 1;
583        return;
584    }
585}
586
587### Simple IO::Socket::INET fetching ###
588sub _iosock_fetch {
589    my $self = shift;
590    my %hash = @_;
591
592    my ($to);
593    my $tmpl = {
594        to  => { required => 1, store => \$to }
595    };
596    check( $tmpl, \%hash ) or return;
597
598    my $use_list = {
599        'IO::Socket::INET' => '0.0',
600        'IO::Select'       => '0.0',
601    };
602
603    if( can_load(modules => $use_list) ) {
604        my $sock = IO::Socket::INET->new(
605            PeerHost => $self->host,
606            ( $self->host =~ /:/ ? () : ( PeerPort => 80 ) ),
607        );
608
609        unless ( $sock ) {
610            return $self->_error(loc("Could not open socket to '%1', '%2'",$self->host,$!));
611        }
612
613        my $fh = FileHandle->new;
614
615        # Check open()
616
617        unless ( $fh->open($to,'>') ) {
618            return $self->_error(loc(
619                 "Could not open '%1' for writing: %2",$to,$!));
620        }
621
622        my $path = File::Spec::Unix->catfile( $self->path, $self->file );
623        my $req = "GET $path HTTP/1.0\x0d\x0aHost: " . $self->host . "\x0d\x0a\x0d\x0a";
624        $sock->send( $req );
625
626        my $select = IO::Select->new( $sock );
627
628        my $resp = '';
629        my $normal = 0;
630        while ( $select->can_read( $TIMEOUT || 60 ) ) {
631          my $ret = $sock->sysread( $resp, 4096, length($resp) );
632          if ( !defined $ret or $ret == 0 ) {
633            $select->remove( $sock );
634            $normal++;
635          }
636        }
637        close $sock;
638
639        unless ( $normal ) {
640            return $self->_error(loc("Socket timed out after '%1' seconds", ( $TIMEOUT || 60 )));
641        }
642
643        # Check the "response"
644        # Strip preceeding blank lines apparently they are allowed (RFC 2616 4.1)
645        $resp =~ s/^(\x0d?\x0a)+//;
646        # Check it is an HTTP response
647        unless ( $resp =~ m!^HTTP/(\d+)\.(\d+)!i ) {
648            return $self->_error(loc("Did not get a HTTP response from '%1'",$self->host));
649        }
650
651        # Check for OK
652        my ($code) = $resp =~ m!^HTTP/\d+\.\d+\s+(\d+)!i;
653        unless ( $code eq '200' ) {
654            return $self->_error(loc("Got a '%1' from '%2' expected '200'",$code,$self->host));
655        }
656
657        print $fh +($resp =~ m/\x0d\x0a\x0d\x0a(.*)$/s )[0];
658        close $fh;
659        return $to;
660
661    } else {
662        $METHOD_FAIL->{'iosock'} = 1;
663        return;
664    }
665}
666
667### Net::FTP fetching
668sub _netftp_fetch {
669    my $self = shift;
670    my %hash = @_;
671
672    my ($to);
673    my $tmpl = {
674        to  => { required => 1, store => \$to }
675    };
676    check( $tmpl, \%hash ) or return;
677
678    ### required modules ###
679    my $use_list = { 'Net::FTP' => 0 };
680
681    if( can_load( modules => $use_list ) ) {
682
683        ### make connection ###
684        my $ftp;
685        my @options = ($self->host);
686        push(@options, Timeout => $TIMEOUT) if $TIMEOUT;
687        unless( $ftp = Net::FTP->new( @options ) ) {
688            return $self->_error(loc("Ftp creation failed: %1",$@));
689        }
690
691        ### login ###
692        unless( $ftp->login( anonymous => $FROM_EMAIL ) ) {
693            return $self->_error(loc("Could not login to '%1'",$self->host));
694        }
695
696        ### set binary mode, just in case ###
697        $ftp->binary;
698
699        ### create the remote path
700        ### remember remote paths are unix paths! [#11483]
701        my $remote = File::Spec::Unix->catfile( $self->path, $self->file );
702
703        ### fetch the file ###
704        my $target;
705        unless( $target = $ftp->get( $remote, $to ) ) {
706            return $self->_error(loc("Could not fetch '%1' from '%2'",
707                        $remote, $self->host));
708        }
709
710        ### log out ###
711        $ftp->quit;
712
713        return $target;
714
715    } else {
716        $METHOD_FAIL->{'netftp'} = 1;
717        return;
718    }
719}
720
721### /bin/wget fetch ###
722sub _wget_fetch {
723    my $self = shift;
724    my %hash = @_;
725
726    my ($to);
727    my $tmpl = {
728        to  => { required => 1, store => \$to }
729    };
730    check( $tmpl, \%hash ) or return;
731
732    ### see if we have a wget binary ###
733    if( my $wget = can_run('wget') ) {
734
735        ### no verboseness, thanks ###
736        my $cmd = [ $wget, '--quiet' ];
737
738        ### if a timeout is set, add it ###
739        push(@$cmd, '--timeout=' . $TIMEOUT) if $TIMEOUT;
740
741        ### run passive if specified ###
742        push @$cmd, '--passive-ftp' if $FTP_PASSIVE;
743
744        ### set the output document, add the uri ###
745        push @$cmd, '--output-document', $to, $self->uri;
746
747        ### with IPC::Cmd > 0.41, this is fixed in teh library,
748        ### and there's no need for special casing any more.
749        ### DO NOT quote things for IPC::Run, it breaks stuff.
750        # $IPC::Cmd::USE_IPC_RUN
751        #    ? ($to, $self->uri)
752        #    : (QUOTE. $to .QUOTE, QUOTE. $self->uri .QUOTE);
753
754        ### shell out ###
755        my $captured;
756        unless(run( command => $cmd,
757                    buffer  => \$captured,
758                    verbose => $DEBUG
759        )) {
760            ### wget creates the output document always, even if the fetch
761            ### fails.. so unlink it in that case
762            1 while unlink $to;
763
764            return $self->_error(loc( "Command failed: %1", $captured || '' ));
765        }
766
767        return $to;
768
769    } else {
770        $METHOD_FAIL->{'wget'} = 1;
771        return;
772    }
773}
774
775### /bin/lftp fetch ###
776sub _lftp_fetch {
777    my $self = shift;
778    my %hash = @_;
779
780    my ($to);
781    my $tmpl = {
782        to  => { required => 1, store => \$to }
783    };
784    check( $tmpl, \%hash ) or return;
785
786    ### see if we have a wget binary ###
787    if( my $lftp = can_run('lftp') ) {
788
789        ### no verboseness, thanks ###
790        my $cmd = [ $lftp, '-f' ];
791
792        my $fh = File::Temp->new;
793
794        my $str;
795
796        ### if a timeout is set, add it ###
797        $str .= "set net:timeout $TIMEOUT;\n" if $TIMEOUT;
798
799        ### run passive if specified ###
800        $str .= "set ftp:passive-mode 1;\n" if $FTP_PASSIVE;
801
802        ### set the output document, add the uri ###
803        ### quote the URI, because lftp supports certain shell
804        ### expansions, most notably & for backgrounding.
805        ### ' quote does nto work, must be "
806        $str .= q[get ']. $self->uri .q[' -o ]. $to . $/;
807
808        if( $DEBUG ) {
809            my $pp_str = join ' ', split $/, $str;
810            print "# lftp command: $pp_str\n";
811        }
812
813        ### write straight to the file.
814        $fh->autoflush(1);
815        print $fh $str;
816
817        ### the command needs to be 1 string to be executed
818        push @$cmd, $fh->filename;
819
820        ### with IPC::Cmd > 0.41, this is fixed in teh library,
821        ### and there's no need for special casing any more.
822        ### DO NOT quote things for IPC::Run, it breaks stuff.
823        # $IPC::Cmd::USE_IPC_RUN
824        #    ? ($to, $self->uri)
825        #    : (QUOTE. $to .QUOTE, QUOTE. $self->uri .QUOTE);
826
827
828        ### shell out ###
829        my $captured;
830        unless(run( command => $cmd,
831                    buffer  => \$captured,
832                    verbose => $DEBUG
833        )) {
834            ### wget creates the output document always, even if the fetch
835            ### fails.. so unlink it in that case
836            1 while unlink $to;
837
838            return $self->_error(loc( "Command failed: %1", $captured || '' ));
839        }
840
841        return $to;
842
843    } else {
844        $METHOD_FAIL->{'lftp'} = 1;
845        return;
846    }
847}
848
849
850
851### /bin/ftp fetch ###
852sub _ftp_fetch {
853    my $self = shift;
854    my %hash = @_;
855
856    my ($to);
857    my $tmpl = {
858        to  => { required => 1, store => \$to }
859    };
860    check( $tmpl, \%hash ) or return;
861
862    ### see if we have a ftp binary ###
863    if( my $ftp = can_run('ftp') ) {
864
865        my $fh = FileHandle->new;
866
867        local $SIG{CHLD} = 'IGNORE';
868
869        unless ($fh->open("|$ftp -n")) {
870            return $self->_error(loc("%1 creation failed: %2", $ftp, $!));
871        }
872
873        my @dialog = (
874            "lcd " . dirname($to),
875            "open " . $self->host,
876            "user anonymous $FROM_EMAIL",
877            "cd /",
878            "cd " . $self->path,
879            "binary",
880            "get " . $self->file . " " . $self->output_file,
881            "quit",
882        );
883
884        foreach (@dialog) { $fh->print($_, "\n") }
885        $fh->close or return;
886
887        return $to;
888    }
889}
890
891### lynx is stupid - it decompresses any .gz file it finds to be text
892### use /bin/lynx to fetch files
893sub _lynx_fetch {
894    my $self = shift;
895    my %hash = @_;
896
897    my ($to);
898    my $tmpl = {
899        to  => { required => 1, store => \$to }
900    };
901    check( $tmpl, \%hash ) or return;
902
903    ### see if we have a lynx binary ###
904    if( my $lynx = can_run('lynx') ) {
905
906        unless( IPC::Cmd->can_capture_buffer ) {
907            $METHOD_FAIL->{'lynx'} = 1;
908
909            return $self->_error(loc(
910                "Can not capture buffers. Can not use '%1' to fetch files",
911                'lynx' ));
912        }
913
914        ### check if the HTTP resource exists ###
915        if ($self->uri =~ /^https?:\/\//i) {
916            my $cmd = [
917                $lynx,
918                '-head',
919                '-source',
920                "-auth=anonymous:$FROM_EMAIL",
921            ];
922
923            push @$cmd, "-connect_timeout=$TIMEOUT" if $TIMEOUT;
924
925            push @$cmd, $self->uri;
926
927            ### shell out ###
928            my $head;
929            unless(run( command => $cmd,
930                        buffer  => \$head,
931                        verbose => $DEBUG )
932            ) {
933                return $self->_error(loc("Command failed: %1", $head || ''));
934            }
935
936            unless($head =~ /^HTTP\/\d+\.\d+ 200\b/) {
937                return $self->_error(loc("Command failed: %1", $head || ''));
938            }
939        }
940
941        ### write to the output file ourselves, since lynx ass_u_mes to much
942        my $local = FileHandle->new(">$to")
943                        or return $self->_error(loc(
944                            "Could not open '%1' for writing: %2",$to,$!));
945
946        ### dump to stdout ###
947        my $cmd = [
948            $lynx,
949            '-source',
950            "-auth=anonymous:$FROM_EMAIL",
951        ];
952
953        push @$cmd, "-connect_timeout=$TIMEOUT" if $TIMEOUT;
954
955        ### DO NOT quote things for IPC::Run, it breaks stuff.
956        push @$cmd, $self->uri;
957
958        ### with IPC::Cmd > 0.41, this is fixed in teh library,
959        ### and there's no need for special casing any more.
960        ### DO NOT quote things for IPC::Run, it breaks stuff.
961        # $IPC::Cmd::USE_IPC_RUN
962        #    ? $self->uri
963        #    : QUOTE. $self->uri .QUOTE;
964
965
966        ### shell out ###
967        my $captured;
968        unless(run( command => $cmd,
969                    buffer  => \$captured,
970                    verbose => $DEBUG )
971        ) {
972            return $self->_error(loc("Command failed: %1", $captured || ''));
973        }
974
975        ### print to local file ###
976        ### XXX on a 404 with a special error page, $captured will actually
977        ### hold the contents of that page, and make it *appear* like the
978        ### request was a success, when really it wasn't :(
979        ### there doesn't seem to be an option for lynx to change the exit
980        ### code based on a 4XX status or so.
981        ### the closest we can come is using --error_file and parsing that,
982        ### which is very unreliable ;(
983        $local->print( $captured );
984        $local->close or return;
985
986        return $to;
987
988    } else {
989        $METHOD_FAIL->{'lynx'} = 1;
990        return;
991    }
992}
993
994### use /bin/ncftp to fetch files
995sub _ncftp_fetch {
996    my $self = shift;
997    my %hash = @_;
998
999    my ($to);
1000    my $tmpl = {
1001        to  => { required => 1, store => \$to }
1002    };
1003    check( $tmpl, \%hash ) or return;
1004
1005    ### we can only set passive mode in interactive sesssions, so bail out
1006    ### if $FTP_PASSIVE is set
1007    return if $FTP_PASSIVE;
1008
1009    ### see if we have a ncftp binary ###
1010    if( my $ncftp = can_run('ncftp') ) {
1011
1012        my $cmd = [
1013            $ncftp,
1014            '-V',                   # do not be verbose
1015            '-p', $FROM_EMAIL,      # email as password
1016            $self->host,            # hostname
1017            dirname($to),           # local dir for the file
1018                                    # remote path to the file
1019            ### DO NOT quote things for IPC::Run, it breaks stuff.
1020            $IPC::Cmd::USE_IPC_RUN
1021                        ? File::Spec::Unix->catdir( $self->path, $self->file )
1022                        : QUOTE. File::Spec::Unix->catdir(
1023                                        $self->path, $self->file ) .QUOTE
1024
1025        ];
1026
1027        ### shell out ###
1028        my $captured;
1029        unless(run( command => $cmd,
1030                    buffer  => \$captured,
1031                    verbose => $DEBUG )
1032        ) {
1033            return $self->_error(loc("Command failed: %1", $captured || ''));
1034        }
1035
1036        return $to;
1037
1038    } else {
1039        $METHOD_FAIL->{'ncftp'} = 1;
1040        return;
1041    }
1042}
1043
1044### use /bin/curl to fetch files
1045sub _curl_fetch {
1046    my $self = shift;
1047    my %hash = @_;
1048
1049    my ($to);
1050    my $tmpl = {
1051        to  => { required => 1, store => \$to }
1052    };
1053    check( $tmpl, \%hash ) or return;
1054
1055    if (my $curl = can_run('curl')) {
1056
1057        ### these long opts are self explanatory - I like that -jmb
1058	    my $cmd = [ $curl, '-q' ];
1059
1060	    push(@$cmd, '--connect-timeout', $TIMEOUT) if $TIMEOUT;
1061
1062	    push(@$cmd, '--silent') unless $DEBUG;
1063
1064        ### curl does the right thing with passive, regardless ###
1065    	if ($self->scheme eq 'ftp') {
1066    		push(@$cmd, '--user', "anonymous:$FROM_EMAIL");
1067    	}
1068
1069        ### curl doesn't follow 302 (temporarily moved) etc automatically
1070        ### so we add --location to enable that.
1071        push @$cmd, '--fail', '--location', '--output', $to, $self->uri;
1072
1073        ### with IPC::Cmd > 0.41, this is fixed in teh library,
1074        ### and there's no need for special casing any more.
1075        ### DO NOT quote things for IPC::Run, it breaks stuff.
1076        # $IPC::Cmd::USE_IPC_RUN
1077        #    ? ($to, $self->uri)
1078        #    : (QUOTE. $to .QUOTE, QUOTE. $self->uri .QUOTE);
1079
1080
1081        my $captured;
1082        unless(run( command => $cmd,
1083                    buffer  => \$captured,
1084                    verbose => $DEBUG )
1085        ) {
1086
1087            return $self->_error(loc("Command failed: %1", $captured || ''));
1088        }
1089
1090        return $to;
1091
1092    } else {
1093        $METHOD_FAIL->{'curl'} = 1;
1094        return;
1095    }
1096}
1097
1098
1099### use File::Copy for fetching file:// urls ###
1100###
1101### See section 3.10 of RFC 1738 (http://www.faqs.org/rfcs/rfc1738.html)
1102### Also see wikipedia on file:// (http://en.wikipedia.org/wiki/File://)
1103###
1104
1105sub _file_fetch {
1106    my $self = shift;
1107    my %hash = @_;
1108
1109    my ($to);
1110    my $tmpl = {
1111        to  => { required => 1, store => \$to }
1112    };
1113    check( $tmpl, \%hash ) or return;
1114
1115
1116
1117    ### prefix a / on unix systems with a file uri, since it would
1118    ### look somewhat like this:
1119    ###     file:///home/kane/file
1120    ### wheras windows file uris for 'c:\some\dir\file' might look like:
1121    ###     file:///C:/some/dir/file
1122    ###     file:///C|/some/dir/file
1123    ### or for a network share '\\host\share\some\dir\file':
1124    ###     file:////host/share/some/dir/file
1125    ###
1126    ### VMS file uri's for 'DISK$USER:[MY.NOTES]NOTE123456.TXT' might look like:
1127    ###     file://vms.host.edu/disk$user/my/notes/note12345.txt
1128    ###
1129
1130    my $path    = $self->path;
1131    my $vol     = $self->vol;
1132    my $share   = $self->share;
1133
1134    my $remote;
1135    if (!$share and $self->host) {
1136        return $self->_error(loc(
1137            "Currently %1 cannot handle hosts in %2 urls",
1138            'File::Fetch', 'file://'
1139        ));
1140    }
1141
1142    if( $vol ) {
1143        $path   = File::Spec->catdir( split /\//, $path );
1144        $remote = File::Spec->catpath( $vol, $path, $self->file);
1145
1146    } elsif( $share ) {
1147        ### win32 specific, and a share name, so we wont bother with File::Spec
1148        $path   =~ s|/+|\\|g;
1149        $remote = "\\\\".$self->host."\\$share\\$path";
1150
1151    } else {
1152        ### File::Spec on VMS can not currently handle UNIX syntax.
1153        my $file_class = ON_VMS
1154            ? 'File::Spec::Unix'
1155            : 'File::Spec';
1156
1157        $remote  = $file_class->catfile( $path, $self->file );
1158    }
1159
1160    ### File::Copy is littered with 'die' statements :( ###
1161    my $rv = eval { File::Copy::copy( $remote, $to ) };
1162
1163    ### something went wrong ###
1164    if( !$rv or $@ ) {
1165        return $self->_error(loc("Could not copy '%1' to '%2': %3 %4",
1166                             $remote, $to, $!, $@));
1167    }
1168
1169    return $to;
1170}
1171
1172### use /usr/bin/rsync to fetch files
1173sub _rsync_fetch {
1174    my $self = shift;
1175    my %hash = @_;
1176
1177    my ($to);
1178    my $tmpl = {
1179        to  => { required => 1, store => \$to }
1180    };
1181    check( $tmpl, \%hash ) or return;
1182
1183    if (my $rsync = can_run('rsync')) {
1184
1185        my $cmd = [ $rsync ];
1186
1187        ### XXX: rsync has no I/O timeouts at all, by default
1188        push(@$cmd, '--timeout=' . $TIMEOUT) if $TIMEOUT;
1189
1190        push(@$cmd, '--quiet') unless $DEBUG;
1191
1192        ### DO NOT quote things for IPC::Run, it breaks stuff.
1193        push @$cmd, $self->uri, $to;
1194
1195        ### with IPC::Cmd > 0.41, this is fixed in teh library,
1196        ### and there's no need for special casing any more.
1197        ### DO NOT quote things for IPC::Run, it breaks stuff.
1198        # $IPC::Cmd::USE_IPC_RUN
1199        #    ? ($to, $self->uri)
1200        #    : (QUOTE. $to .QUOTE, QUOTE. $self->uri .QUOTE);
1201
1202        my $captured;
1203        unless(run( command => $cmd,
1204                    buffer  => \$captured,
1205                    verbose => $DEBUG )
1206        ) {
1207
1208            return $self->_error(loc("Command %1 failed: %2",
1209                "@$cmd" || '', $captured || ''));
1210        }
1211
1212        return $to;
1213
1214    } else {
1215        $METHOD_FAIL->{'rsync'} = 1;
1216        return;
1217    }
1218}
1219
1220#################################
1221#
1222# Error code
1223#
1224#################################
1225
1226=pod
1227
1228=head2 $ff->error([BOOL])
1229
1230Returns the last encountered error as string.
1231Pass it a true value to get the C<Carp::longmess()> output instead.
1232
1233=cut
1234
1235### error handling the way Archive::Extract does it
1236sub _error {
1237    my $self    = shift;
1238    my $error   = shift;
1239
1240    $self->_error_msg( $error );
1241    $self->_error_msg_long( Carp::longmess($error) );
1242
1243    if( $WARN ) {
1244        carp $DEBUG ? $self->_error_msg_long : $self->_error_msg;
1245    }
1246
1247    return;
1248}
1249
1250sub error {
1251    my $self = shift;
1252    return shift() ? $self->_error_msg_long : $self->_error_msg;
1253}
1254
1255
12561;
1257
1258=pod
1259
1260=head1 HOW IT WORKS
1261
1262File::Fetch is able to fetch a variety of uris, by using several
1263external programs and modules.
1264
1265Below is a mapping of what utilities will be used in what order
1266for what schemes, if available:
1267
1268    file    => LWP, lftp, file
1269    http    => LWP, wget, curl, lftp, lynx, iosock
1270    ftp     => LWP, Net::FTP, wget, curl, lftp, ncftp, ftp
1271    rsync   => rsync
1272
1273If you'd like to disable the use of one or more of these utilities
1274and/or modules, see the C<$BLACKLIST> variable further down.
1275
1276If a utility or module isn't available, it will be marked in a cache
1277(see the C<$METHOD_FAIL> variable further down), so it will not be
1278tried again. The C<fetch> method will only fail when all options are
1279exhausted, and it was not able to retrieve the file.
1280
1281C<iosock> is a very limited L<IO::Socket::INET> based mechanism for
1282retrieving C<http> schemed urls. It doesn't follow redirects for instance.
1283
1284A special note about fetching files from an ftp uri:
1285
1286By default, all ftp connections are done in passive mode. To change
1287that, see the C<$FTP_PASSIVE> variable further down.
1288
1289Furthermore, ftp uris only support anonymous connections, so no
1290named user/password pair can be passed along.
1291
1292C</bin/ftp> is blacklisted by default; see the C<$BLACKLIST> variable
1293further down.
1294
1295=head1 GLOBAL VARIABLES
1296
1297The behaviour of File::Fetch can be altered by changing the following
1298global variables:
1299
1300=head2 $File::Fetch::FROM_EMAIL
1301
1302This is the email address that will be sent as your anonymous ftp
1303password.
1304
1305Default is C<File-Fetch@example.com>.
1306
1307=head2 $File::Fetch::USER_AGENT
1308
1309This is the useragent as C<LWP> will report it.
1310
1311Default is C<File::Fetch/$VERSION>.
1312
1313=head2 $File::Fetch::FTP_PASSIVE
1314
1315This variable controls whether the environment variable C<FTP_PASSIVE>
1316and any passive switches to commandline tools will be set to true.
1317
1318Default value is 1.
1319
1320Note: When $FTP_PASSIVE is true, C<ncftp> will not be used to fetch
1321files, since passive mode can only be set interactively for this binary
1322
1323=head2 $File::Fetch::TIMEOUT
1324
1325When set, controls the network timeout (counted in seconds).
1326
1327Default value is 0.
1328
1329=head2 $File::Fetch::WARN
1330
1331This variable controls whether errors encountered internally by
1332C<File::Fetch> should be C<carp>'d or not.
1333
1334Set to false to silence warnings. Inspect the output of the C<error()>
1335method manually to see what went wrong.
1336
1337Defaults to C<true>.
1338
1339=head2 $File::Fetch::DEBUG
1340
1341This enables debugging output when calling commandline utilities to
1342fetch files.
1343This also enables C<Carp::longmess> errors, instead of the regular
1344C<carp> errors.
1345
1346Good for tracking down why things don't work with your particular
1347setup.
1348
1349Default is 0.
1350
1351=head2 $File::Fetch::BLACKLIST
1352
1353This is an array ref holding blacklisted modules/utilities for fetching
1354files with.
1355
1356To disallow the use of, for example, C<LWP> and C<Net::FTP>, you could
1357set $File::Fetch::BLACKLIST to:
1358
1359    $File::Fetch::BLACKLIST = [qw|lwp netftp|]
1360
1361The default blacklist is [qw|ftp|], as C</bin/ftp> is rather unreliable.
1362
1363See the note on C<MAPPING> below.
1364
1365=head2 $File::Fetch::METHOD_FAIL
1366
1367This is a hashref registering what modules/utilities were known to fail
1368for fetching files (mostly because they weren't installed).
1369
1370You can reset this cache by assigning an empty hashref to it, or
1371individually remove keys.
1372
1373See the note on C<MAPPING> below.
1374
1375=head1 MAPPING
1376
1377
1378Here's a quick mapping for the utilities/modules, and their names for
1379the $BLACKLIST, $METHOD_FAIL and other internal functions.
1380
1381    LWP         => lwp
1382    Net::FTP    => netftp
1383    wget        => wget
1384    lynx        => lynx
1385    ncftp       => ncftp
1386    ftp         => ftp
1387    curl        => curl
1388    rsync       => rsync
1389    lftp        => lftp
1390    IO::Socket  => iosock
1391
1392=head1 FREQUENTLY ASKED QUESTIONS
1393
1394=head2 So how do I use a proxy with File::Fetch?
1395
1396C<File::Fetch> currently only supports proxies with LWP::UserAgent.
1397You will need to set your environment variables accordingly. For
1398example, to use an ftp proxy:
1399
1400    $ENV{ftp_proxy} = 'foo.com';
1401
1402Refer to the LWP::UserAgent manpage for more details.
1403
1404=head2 I used 'lynx' to fetch a file, but its contents is all wrong!
1405
1406C<lynx> can only fetch remote files by dumping its contents to C<STDOUT>,
1407which we in turn capture. If that content is a 'custom' error file
1408(like, say, a C<404 handler>), you will get that contents instead.
1409
1410Sadly, C<lynx> doesn't support any options to return a different exit
1411code on non-C<200 OK> status, giving us no way to tell the difference
1412between a 'successfull' fetch and a custom error page.
1413
1414Therefor, we recommend to only use C<lynx> as a last resort. This is
1415why it is at the back of our list of methods to try as well.
1416
1417=head2 Files I'm trying to fetch have reserved characters or non-ASCII characters in them. What do I do?
1418
1419C<File::Fetch> is relatively smart about things. When trying to write
1420a file to disk, it removes the C<query parameters> (see the
1421C<output_file> method for details) from the file name before creating
1422it. In most cases this suffices.
1423
1424If you have any other characters you need to escape, please install
1425the C<URI::Escape> module from CPAN, and pre-encode your URI before
1426passing it to C<File::Fetch>. You can read about the details of URIs
1427and URI encoding here:
1428
1429  http://www.faqs.org/rfcs/rfc2396.html
1430
1431=head1 TODO
1432
1433=over 4
1434
1435=item Implement $PREFER_BIN
1436
1437To indicate to rather use commandline tools than modules
1438
1439=back
1440
1441=head1 BUG REPORTS
1442
1443Please report bugs or other issues to E<lt>bug-file-fetch@rt.cpan.org<gt>.
1444
1445=head1 AUTHOR
1446
1447This module by Jos Boumans E<lt>kane@cpan.orgE<gt>.
1448
1449=head1 COPYRIGHT
1450
1451This library is free software; you may redistribute and/or modify it
1452under the same terms as Perl itself.
1453
1454
1455=cut
1456
1457# Local variables:
1458# c-indentation-style: bsd
1459# c-basic-offset: 4
1460# indent-tabs-mode: nil
1461# End:
1462# vim: expandtab shiftwidth=4:
1463
1464
1465
1466
1467