1# Copyright 2008-2010 Tim Rayner 2# 3# This file is part of Bio::MAGETAB. 4# 5# Bio::MAGETAB is free software: you can redistribute it and/or modify 6# it under the terms of the GNU General Public License as published by 7# the Free Software Foundation, either version 2 of the License, or 8# (at your option) any later version. 9# 10# Bio::MAGETAB is distributed in the hope that it will be useful, 11# but WITHOUT ANY WARRANTY; without even the implied warranty of 12# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 13# GNU General Public License for more details. 14# 15# You should have received a copy of the GNU General Public License 16# along with Bio::MAGETAB. If not, see <http://www.gnu.org/licenses/>. 17# 18# $Id: Tabfile.pm 340 2010-07-23 13:19:27Z tfrayner $ 19 20package Bio::MAGETAB::Util::Reader::Tabfile; 21 22use Moose; 23use MooseX::FollowPBP; 24 25use MooseX::Types::Moose qw( Str FileHandle ); 26use Bio::MAGETAB::Types qw( Uri ); 27 28use Carp; 29use charnames qw( :full ); 30use Text::CSV_XS; 31 32use Bio::MAGETAB::Util::Builder; 33 34has 'uri' => ( is => 'rw', 35 isa => Uri, 36 coerce => 1, 37 required => 1 ); 38 39has 'eol_char' => ( is => 'rw', 40 isa => Str, 41 required => 0 ); 42 43has 'filehandle' => ( is => 'rw', 44 isa => FileHandle, 45 required => 0 ); 46 47has 'csv_parser' => ( is => 'rw', 48 isa => 'Text::CSV_XS', 49 required => 0, 50 handles => [ qw(print) ]); 51 52has 'builder' => ( is => 'ro', 53 isa => 'Bio::MAGETAB::Util::Builder', 54 default => sub { Bio::MAGETAB::Util::Builder->new() }, 55 required => 1 ); 56 57# Define some standard regexps: 58my $RE_EMPTY_STRING = qr{\A \s* \z}xms; 59my $RE_COMMENTED_STRING = qr{\A [\"\s]* \#}xms; 60my $RE_SURROUNDED_BY_WHITESPACE = qr{\A [\"\s]* (.*?) [\"\s]* \z}xms; 61 62sub BUILD { 63 64 my ( $self, $params ) = @_; 65 66 $self->_calculate_eol_char(); 67 $self->_construct_csv_parser(); 68 $self->_cache_filehandle(); 69 70 return; 71} 72 73sub getline { 74 75 my ( $self, $fh ) = @_; 76 77 $fh ||= $self->get_filehandle(); 78 79 return $self->get_csv_parser()->getline($fh); 80} 81 82sub can_ignore { 83 84 my ( $self, $larry ) = @_; 85 86 # Skip empty lines. 87 my $line = join( q{}, @$larry ); 88 return 1 if ( $line =~ $RE_EMPTY_STRING ); 89 90 # Allow hash comments. 91 return 1 if ( $line =~ $RE_COMMENTED_STRING ); 92 93 return; 94} 95 96sub strip_whitespace { 97 98 my ( $self, $larry ) = @_; 99 100 # Strip surrounding whitespace from each element. 101 foreach my $element ( @$larry ) { 102 $element =~ s/$RE_SURROUNDED_BY_WHITESPACE/$1/xms; 103 } 104 105 return $larry; 106} 107 108sub confirm_full_parse { 109 110 my ( $self, $nextline ) = @_; 111 112 # $nextline is an optional argument used to check for correct 113 # parsing in the middle of a file (where $error != 2012, but we 114 # don't want to throw an exception if we have a real $nextline). 115 my $csv_parser = $self->get_csv_parser(); 116 117 # Check we've parsed to the end of the file. 118 my ( $error, $mess ) = $csv_parser->error_diag(); 119 unless ( $nextline || $error == 2012 ) { # 2012 is the Text::CSV_XS EOF code. 120 croak( 121 sprintf( 122 "Error in tab-delimited format: %s. Bad input was:\n\n%s\n", 123 $mess, 124 $csv_parser->error_input(), 125 ), 126 ); 127 } 128} 129 130################### 131# PRIVATE METHODS # 132################### 133 134sub _calculate_eol_char { 135 136 my ( $self ) = @_; 137 138 unless ( $self->get_eol_char() ) { 139 my ($eols, $eol_char) = $self->_check_linebreaks(); 140 unless ( $eol_char ) { 141 croak( 142 sprintf( 143 "Error: Cannot correctly parse linebreaks in file %s" 144 . " (%s unix, %s dos, %s mac)\n", 145 $self->_get_filepath(), 146 $eols->{unix}, 147 $eols->{dos}, 148 $eols->{mac}, 149 ) 150 ); 151 } 152 $self->set_eol_char( $eol_char ); 153 } 154 155 if ( ( $self->get_eol_char() eq "\015" ) 156 && ( $Text::CSV_XS::VERSION < 0.27 ) ) { 157 158 # Mac linebreaks not supported by older versions of Text::CSV_XS. 159 die("Error: Mac linebreaks not supported by this version" 160 . " of Text::CSV_XS. Please upgrade to version 0.27 or higher.\n"); 161 } 162 163 return $self->get_eol_char(); 164} 165 166sub _construct_csv_parser { 167 168 my ( $self ) = @_; 169 170 # We cache this in a private attribute so each file only gets one 171 # parser (better for error trackage). 172 unless ( $self->get_csv_parser() ) { 173 my $csv_parser = Text::CSV_XS->new( 174 { sep_char => qq{\t}, 175 quote_char => qq{"}, # default 176 escape_char => qq{"}, # default 177 binary => 1, 178 eol => ( $self->_calculate_eol_char() || "\n" ), 179 allow_loose_quotes => 1, 180 } 181 ); 182 $self->set_csv_parser( $csv_parser ); 183 } 184 185 return $self->get_csv_parser(); 186} 187 188sub _get_filepath { 189 190 my ( $self, $dir ) = @_; 191 192 my $uri = $self->get_uri(); 193 194 # Assume file as default URI scheme. 195 my $path; 196 if ( ! $uri->scheme() || $uri->scheme() eq 'file' ) { 197 198 $uri->scheme('file'); 199 200 # URI::File specific, this avoids quoting e.g. spaces in filenames. 201 my $uri_path = $uri->file(); 202 203 if ( $dir ) { 204 $path = File::Spec->file_name_is_absolute( $uri_path ) 205 ? $uri_path 206 : File::Spec->catfile( $dir, $uri_path ); 207 } 208 else { 209 $path = File::Spec->rel2abs( $uri_path ); 210 } 211 } 212 # Add the common network URI schemes. 213 elsif ( $uri->scheme() eq 'http' || $uri->scheme() eq 'ftp' ) { 214 $path = $self->_cache_network_file( $uri, $dir ); 215 } 216 else { 217 croak(sprintf( 218 "ERROR: Unsupported URI scheme: %s\n", $uri->scheme(), 219 )); 220 } 221 222 return $path; 223} 224 225sub _cache_filehandle { 226 227 my ( $self ) = @_; 228 229 my $fh; 230 unless ( $fh = $self->get_filehandle ) { 231 my $path = $self->_get_filepath(); 232 open( $fh, '<', $path ) 233 or croak(qq{Error: Unable to open file "$path": $!}); 234 $self->set_filehandle( $fh ); 235 } 236 237 return $fh; 238} 239 240 241sub _cache_network_file { 242 243 my ( $self, $uri, $dir ) = @_; 244 245 require LWP::UserAgent; 246 247 # N.B. we don't handle URI fragments, just the path. 248 my ( $basename ) = ( $uri->path() =~ m!/([^/]+) \z!xms ); 249 250 my $target; 251 if ( $dir ) { 252 $target = File::Spec->catfile( $dir, $basename ); 253 } 254 else { 255 $target = $basename; 256 } 257 258 # Only download the file once. 259 unless ( -f $target ) { 260 261 printf STDOUT ( 262 qq{Downloading network file "%s"...\n}, 263 $uri->as_string(), 264 ); 265 266 # Download the $uri->as_string() 267 my $ua = LWP::UserAgent->new(); 268 269 my $response = $ua->get( 270 $uri->as_string(), 271 ':content_file' => $target, 272 ); 273 274 unless ( $response->is_success() ) { 275 croak(sprintf( 276 qq{Error downloading network file "%s" : %s\n}, 277 $uri->as_string(), 278 $response->status_line(), 279 )); 280 } 281 } 282 283 return $target; 284} 285 286sub _check_linebreaks { 287 288 # Checks for Mac, Unix or Dos line endings by reading the whole 289 # file in chunks, and regexp matching the various linebreak types. 290 # Returns the appropriate linebreak for acceptable line breaks 291 # (N.B. line breaks *must* be unanimous), undef for not. 292 293 my ( $self ) = @_; 294 295 my $path = $self->_get_filepath(); 296 297 my $bytelength = -s $path; 298 299 my $fh = $self->_cache_filehandle(); 300 301 # Count all the line endings. This can get memory intensive 302 # (implicit list generation, can be over 1,000,000 entries for 303 # Affy CEL). We read the file in defined chunks to address this. 304 my ( $unix_count, $mac_count, $dos_count ); 305 my $chunk_size = 3_000_000; # ~10 chunks to a big CEL file. 306 my $previous_final_char = q{}; 307 for ( my $offset = 0; $offset < $bytelength; $offset += $chunk_size ) { 308 309 my $chunk; 310 311 my $bytes_read = read( $fh, $chunk, $chunk_size ); 312 313 unless ( defined($bytes_read) ) { 314 croak("Error reading file chunk at offset $offset ($path): $!\n"); 315 } 316 317 # Lists generated implicitly here. 318 $unix_count += () = ( $chunk =~ m{\N{LINE FEED}}g ); 319 $mac_count += () = ( $chunk =~ m{\N{CARRIAGE RETURN}}g ); 320 $dos_count += () = ( $chunk =~ m{\N{CARRIAGE RETURN}\N{LINE FEED}}g ); 321 322 # DOS line endings could conceivably be split between chunks. 323 if ( $bytes_read ) { # Skip if at end of file. 324 if ( ( substr( $chunk, 0, 1 ) eq "\N{LINE FEED}" ) 325 && ( $previous_final_char eq "\N{CARRIAGE RETURN}" ) ) { 326 $dos_count++; 327 } 328 $previous_final_char = substr( $chunk, -1, 1 ); 329 } 330 } 331 332 seek($fh, 0, 0) 333 or croak("Error rewinding file $path in sub _check_linebreaks: $!\n"); 334 335 my $dos = $dos_count; 336 my $mac = $mac_count - $dos_count; 337 my $unix = $unix_count - $dos_count; 338 339 # Set to undef on failure. 340 my $line_ending = undef; 341 342 # Determine the file line endings format, return the "standard" line 343 # ending to use 344 if ( $unix && !$mac && !$dos ) { # Unix 345 $line_ending = "\N{LINE FEED}"; 346 } 347 elsif ( $mac && !$unix && !$dos ) { # Mac 348 $line_ending = "\N{CARRIAGE RETURN}"; 349 } 350 elsif ( $dos && !$mac && !$unix ) { # DOS 351 $line_ending = "\N{CARRIAGE RETURN}\N{LINE FEED}"; 352 } 353 354 # Calling in scalar context just gives $line_ending. 355 my $counts = { 356 unix => $unix, 357 dos => $dos, 358 mac => $mac, 359 }; 360 361 return wantarray ? ( $counts, $line_ending ) : $line_ending; 362} 363 364# Make the classes immutable. In theory this speeds up object 365# instantiation for a small compilation time cost. 366__PACKAGE__->meta->make_immutable(); 367 368no Moose; 369 370=head1 NAME 371 372Bio::MAGETAB::Util::Reader::Tabfile - An abstract class providing methods for 373handling tab-delimited files. 374 375=head1 SYNOPSIS 376 377 use base qw(Bio::MAGETAB::Util::Reader::Tabfile); 378 379=head1 DESCRIPTION 380 381This abstract class acts as a wrapper for the Text::CSV_XS module and 382line ending detection code used by the rest of the 383Bio::MAGETAB::Util::Reader modules. It is not designed to be used 384directly. 385 386=head1 ATTRIBUTES 387 388=over 2 389 390=item uri 391 392Required URI path to the file to be parsed. 393 394=item eol_char 395 396The end-of-line character to use while parsing. Typically this is set 397by the Reader subclasses. 398 399=item filehandle 400 401The filehandle for the file being parsed. 402 403=item csv_parser 404 405A Text::CSV_XS parser object. 406 407=item builder 408 409A Bio::MAGETAB::Util::Builder object, used by subclasses to track 410MAGE-TAB object creation. 411 412=back 413 414=head1 METHODS 415 416=over 2 417 418=item getline 419 420A simple wrapper for the Text::CSV_XS C<getline()> method which takes 421an optional filehandle argument, using the cached filehandle returned 422by C<get_filehandle()> as the default. This filehandle argument can be 423useful when explicitly controlling the read position of the script 424within the file (e.g. as in ADF parsing). 425 426=item can_ignore 427 428When passed an arrayref of column values for a given line, returns 1 429if the line is ignorable (typically blank or commented lines fall into 430this category) or undef if not. 431 432=item strip_whitespace 433 434This method strips any whitespace surrounding the string values passed 435to it in an arrayref. 436 437=item confirm_full_parse 438 439Raises an exception if the file has not been parsed to completion 440(i.e., EOF). Takes a line arrayref as returned by 441C<$self-E<gt>getline()> as an optional argument to allow testing 442for either (a) the existence of a next line in the file, or (b) 443EOF. This is useful when pausing parsing partway through a file, 444e.g. after parsing the ADF header section. 445 446=back 447 448In addition, each attribute has accessor (get_*) and mutator (set_*) methods. 449 450=head1 SEE ALSO 451 452L<Bio::MAGETAB::Util::Reader> 453L<Bio::MAGETAB::Util::Reader::TagValueFile> 454 455=head1 AUTHOR 456 457Tim F. Rayner <tfrayner@gmail.com> 458 459=head1 LICENSE 460 461This library is released under version 2 of the GNU General Public 462License (GPL). 463 464=cut 465 4661; 467