1# Copyright 2008-2010 Tim Rayner
2#
3# This file is part of Bio::MAGETAB.
4#
5# Bio::MAGETAB is free software: you can redistribute it and/or modify
6# it under the terms of the GNU General Public License as published by
7# the Free Software Foundation, either version 2 of the License, or
8# (at your option) any later version.
9#
10# Bio::MAGETAB is distributed in the hope that it will be useful,
11# but WITHOUT ANY WARRANTY; without even the implied warranty of
12# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13# GNU General Public License for more details.
14#
15# You should have received a copy of the GNU General Public License
16# along with Bio::MAGETAB.  If not, see <http://www.gnu.org/licenses/>.
17#
18# $Id: Tabfile.pm 340 2010-07-23 13:19:27Z tfrayner $
19
20package Bio::MAGETAB::Util::Reader::Tabfile;
21
22use Moose;
23use MooseX::FollowPBP;
24
25use MooseX::Types::Moose qw( Str FileHandle );
26use Bio::MAGETAB::Types qw( Uri );
27
28use Carp;
29use charnames qw( :full );
30use Text::CSV_XS;
31
32use Bio::MAGETAB::Util::Builder;
33
34has 'uri'                => ( is         => 'rw',
35                              isa        => Uri,
36                              coerce     => 1,
37                              required   => 1 );
38
39has 'eol_char'           => ( is         => 'rw',
40                              isa        => Str,
41                              required   => 0 );
42
43has 'filehandle'         => ( is         => 'rw',
44                              isa        => FileHandle,
45                              required   => 0 );
46
47has 'csv_parser'         => ( is         => 'rw',
48                              isa        => 'Text::CSV_XS',
49                              required   => 0,
50                              handles    => [ qw(print) ]);
51
52has 'builder'            => ( is         => 'ro',
53                              isa        => 'Bio::MAGETAB::Util::Builder',
54                              default    => sub { Bio::MAGETAB::Util::Builder->new() },
55                              required   => 1 );
56
57# Define some standard regexps:
58my $RE_EMPTY_STRING             = qr{\A \s* \z}xms;
59my $RE_COMMENTED_STRING         = qr{\A [\"\s]* \#}xms;
60my $RE_SURROUNDED_BY_WHITESPACE = qr{\A [\"\s]* (.*?) [\"\s]* \z}xms;
61
62sub BUILD {
63
64    my ( $self, $params ) = @_;
65
66    $self->_calculate_eol_char();
67    $self->_construct_csv_parser();
68    $self->_cache_filehandle();
69
70    return;
71}
72
73sub getline {
74
75    my ( $self, $fh ) = @_;
76
77    $fh ||= $self->get_filehandle();
78
79    return $self->get_csv_parser()->getline($fh);
80}
81
82sub can_ignore {
83
84    my ( $self, $larry ) = @_;
85
86    # Skip empty lines.
87    my $line = join( q{}, @$larry );
88    return 1 if ( $line =~ $RE_EMPTY_STRING );
89
90    # Allow hash comments.
91    return 1 if ( $line =~ $RE_COMMENTED_STRING );
92
93    return;
94}
95
96sub strip_whitespace {
97
98    my ( $self, $larry ) = @_;
99
100    # Strip surrounding whitespace from each element.
101    foreach my $element ( @$larry ) {
102        $element =~ s/$RE_SURROUNDED_BY_WHITESPACE/$1/xms;
103    }
104
105    return $larry;
106}
107
108sub confirm_full_parse {
109
110    my ( $self, $nextline ) = @_;
111
112    # $nextline is an optional argument used to check for correct
113    # parsing in the middle of a file (where $error != 2012, but we
114    # don't want to throw an exception if we have a real $nextline).
115    my $csv_parser = $self->get_csv_parser();
116
117    # Check we've parsed to the end of the file.
118    my ( $error, $mess ) = $csv_parser->error_diag();
119    unless ( $nextline || $error == 2012 ) {    # 2012 is the Text::CSV_XS EOF code.
120	croak(
121	    sprintf(
122		"Error in tab-delimited format: %s. Bad input was:\n\n%s\n",
123		$mess,
124		$csv_parser->error_input(),
125	    ),
126	);
127    }
128}
129
130###################
131# PRIVATE METHODS #
132###################
133
134sub _calculate_eol_char {
135
136    my ( $self ) = @_;
137
138    unless ( $self->get_eol_char() ) {
139	my ($eols, $eol_char) = $self->_check_linebreaks();
140	unless ( $eol_char ) {
141	    croak(
142		sprintf(
143		    "Error: Cannot correctly parse linebreaks in file %s"
144			. " (%s unix, %s dos, %s mac)\n",
145		    $self->_get_filepath(),
146		    $eols->{unix},
147		    $eols->{dos},
148		    $eols->{mac},
149		)
150	    );
151	}
152	$self->set_eol_char( $eol_char );
153    }
154
155    if (    ( $self->get_eol_char() eq "\015" )
156         && ( $Text::CSV_XS::VERSION < 0.27 ) ) {
157
158	# Mac linebreaks not supported by older versions of Text::CSV_XS.
159	die("Error: Mac linebreaks not supported by this version"
160	  . " of Text::CSV_XS. Please upgrade to version 0.27 or higher.\n");
161    }
162
163    return $self->get_eol_char();
164}
165
166sub _construct_csv_parser {
167
168    my ( $self ) = @_;
169
170    # We cache this in a private attribute so each file only gets one
171    # parser (better for error trackage).
172    unless ( $self->get_csv_parser() ) {
173        my $csv_parser = Text::CSV_XS->new(
174            {   sep_char    => qq{\t},
175                quote_char  => qq{"},                   # default
176                escape_char => qq{"},                   # default
177                binary      => 1,
178                eol         => ( $self->_calculate_eol_char() || "\n" ),
179                allow_loose_quotes => 1,
180            }
181        );
182        $self->set_csv_parser( $csv_parser );
183    }
184
185    return $self->get_csv_parser();
186}
187
188sub _get_filepath {
189
190    my ( $self, $dir ) = @_;
191
192    my $uri = $self->get_uri();
193
194    # Assume file as default URI scheme.
195    my $path;
196    if ( ! $uri->scheme() || $uri->scheme() eq 'file' ) {
197
198	$uri->scheme('file');
199
200	# URI::File specific, this avoids quoting e.g. spaces in filenames.
201	my $uri_path = $uri->file();
202
203	if ( $dir ) {
204	    $path = File::Spec->file_name_is_absolute( $uri_path )
205		  ? $uri_path
206		  : File::Spec->catfile( $dir, $uri_path );
207	}
208	else {
209	    $path = File::Spec->rel2abs( $uri_path );
210	}
211    }
212    # Add the common network URI schemes.
213    elsif ( $uri->scheme() eq 'http' || $uri->scheme() eq 'ftp' ) {
214	$path = $self->_cache_network_file( $uri, $dir );
215    }
216    else {
217	croak(sprintf(
218	    "ERROR: Unsupported URI scheme: %s\n", $uri->scheme(),
219	));
220    }
221
222    return $path;
223}
224
225sub _cache_filehandle {
226
227    my ( $self ) = @_;
228
229    my $fh;
230    unless ( $fh = $self->get_filehandle ) {
231        my $path = $self->_get_filepath();
232        open( $fh, '<', $path )
233            or croak(qq{Error: Unable to open file "$path": $!});
234        $self->set_filehandle( $fh );
235    }
236
237    return $fh;
238}
239
240
241sub _cache_network_file {
242
243    my ( $self, $uri, $dir ) = @_;
244
245    require LWP::UserAgent;
246
247    # N.B. we don't handle URI fragments, just the path.
248    my ( $basename ) = ( $uri->path() =~ m!/([^/]+) \z!xms );
249
250    my $target;
251    if ( $dir ) {
252	$target = File::Spec->catfile( $dir, $basename );
253    }
254    else {
255	$target = $basename;
256    }
257
258    # Only download the file once.
259    unless ( -f $target ) {
260
261	printf STDOUT (
262	    qq{Downloading network file "%s"...\n},
263	    $uri->as_string(),
264	);
265
266	# Download the $uri->as_string()
267	my $ua = LWP::UserAgent->new();
268
269	my $response = $ua->get(
270	    $uri->as_string(),
271	    ':content_file' => $target,
272	);
273
274	unless ( $response->is_success() ) {
275	    croak(sprintf(
276		qq{Error downloading network file "%s" : %s\n},
277		$uri->as_string(),
278		$response->status_line(),
279	    ));
280	}
281    }
282
283    return $target;
284}
285
286sub _check_linebreaks {
287
288    # Checks for Mac, Unix or Dos line endings by reading the whole
289    # file in chunks, and regexp matching the various linebreak types.
290    # Returns the appropriate linebreak for acceptable line breaks
291    # (N.B. line breaks *must* be unanimous), undef for not.
292
293    my ( $self ) = @_;
294
295    my $path = $self->_get_filepath();
296
297    my $bytelength = -s $path;
298
299    my $fh = $self->_cache_filehandle();
300
301    # Count all the line endings. This can get memory intensive
302    # (implicit list generation, can be over 1,000,000 entries for
303    # Affy CEL). We read the file in defined chunks to address this.
304    my ( $unix_count, $mac_count, $dos_count );
305    my $chunk_size          = 3_000_000;    # ~10 chunks to a big CEL file.
306    my $previous_final_char = q{};
307    for ( my $offset = 0; $offset < $bytelength; $offset += $chunk_size ) {
308
309        my $chunk;
310
311	my $bytes_read = read( $fh, $chunk, $chunk_size );
312
313	unless ( defined($bytes_read) ) {
314	    croak("Error reading file chunk at offset $offset ($path): $!\n");
315	}
316
317	# Lists generated implicitly here.
318        $unix_count += () = ( $chunk =~ m{\N{LINE FEED}}g );
319        $mac_count  += () = ( $chunk =~ m{\N{CARRIAGE RETURN}}g );
320        $dos_count  += () = ( $chunk =~ m{\N{CARRIAGE RETURN}\N{LINE FEED}}g );
321
322        # DOS line endings could conceivably be split between chunks.
323	if ( $bytes_read ) {    # Skip if at end of file.
324	    if (   ( substr( $chunk, 0, 1 ) eq "\N{LINE FEED}" )
325		&& ( $previous_final_char eq "\N{CARRIAGE RETURN}" ) ) {
326		$dos_count++;
327	    }
328	    $previous_final_char = substr( $chunk, -1, 1 );
329	}
330    }
331
332    seek($fh, 0, 0)
333        or croak("Error rewinding file $path in sub _check_linebreaks: $!\n");
334
335    my $dos  = $dos_count;
336    my $mac  = $mac_count  - $dos_count;
337    my $unix = $unix_count - $dos_count;
338
339    # Set to undef on failure.
340    my $line_ending = undef;
341
342    # Determine the file line endings format, return the "standard" line
343    # ending to use
344    if ( $unix && !$mac && !$dos ) {    # Unix
345        $line_ending = "\N{LINE FEED}";
346    }
347    elsif ( $mac && !$unix && !$dos ) {    # Mac
348        $line_ending = "\N{CARRIAGE RETURN}";
349    }
350    elsif ( $dos && !$mac && !$unix ) {    # DOS
351        $line_ending = "\N{CARRIAGE RETURN}\N{LINE FEED}";
352    }
353
354    # Calling in scalar context just gives $line_ending.
355    my $counts = {
356        unix => $unix,
357        dos  => $dos,
358        mac  => $mac,
359    };
360
361    return wantarray ? ( $counts, $line_ending ) : $line_ending;
362}
363
364# Make the classes immutable. In theory this speeds up object
365# instantiation for a small compilation time cost.
366__PACKAGE__->meta->make_immutable();
367
368no Moose;
369
370=head1 NAME
371
372Bio::MAGETAB::Util::Reader::Tabfile - An abstract class providing methods for
373handling tab-delimited files.
374
375=head1 SYNOPSIS
376
377 use base qw(Bio::MAGETAB::Util::Reader::Tabfile);
378
379=head1 DESCRIPTION
380
381This abstract class acts as a wrapper for the Text::CSV_XS module and
382line ending detection code used by the rest of the
383Bio::MAGETAB::Util::Reader modules. It is not designed to be used
384directly.
385
386=head1 ATTRIBUTES
387
388=over 2
389
390=item uri
391
392Required URI path to the file to be parsed.
393
394=item eol_char
395
396The end-of-line character to use while parsing. Typically this is set
397by the Reader subclasses.
398
399=item filehandle
400
401The filehandle for the file being parsed.
402
403=item csv_parser
404
405A Text::CSV_XS parser object.
406
407=item builder
408
409A Bio::MAGETAB::Util::Builder object, used by subclasses to track
410MAGE-TAB object creation.
411
412=back
413
414=head1 METHODS
415
416=over 2
417
418=item getline
419
420A simple wrapper for the Text::CSV_XS C<getline()> method which takes
421an optional filehandle argument, using the cached filehandle returned
422by C<get_filehandle()> as the default. This filehandle argument can be
423useful when explicitly controlling the read position of the script
424within the file (e.g. as in ADF parsing).
425
426=item can_ignore
427
428When passed an arrayref of column values for a given line, returns 1
429if the line is ignorable (typically blank or commented lines fall into
430this category) or undef if not.
431
432=item strip_whitespace
433
434This method strips any whitespace surrounding the string values passed
435to it in an arrayref.
436
437=item confirm_full_parse
438
439Raises an exception if the file has not been parsed to completion
440(i.e., EOF). Takes a line arrayref as returned by
441C<$self-E<gt>getline()> as an optional argument to allow testing
442for either (a) the existence of a next line in the file, or (b)
443EOF. This is useful when pausing parsing partway through a file,
444e.g. after parsing the ADF header section.
445
446=back
447
448In addition, each attribute has accessor (get_*) and mutator (set_*) methods.
449
450=head1 SEE ALSO
451
452L<Bio::MAGETAB::Util::Reader>
453L<Bio::MAGETAB::Util::Reader::TagValueFile>
454
455=head1 AUTHOR
456
457Tim F. Rayner <tfrayner@gmail.com>
458
459=head1 LICENSE
460
461This library is released under version 2 of the GNU General Public
462License (GPL).
463
464=cut
465
4661;
467