1# 2# BioPerl module for Bio::AlignIO 3# 4# based on the Bio::SeqIO module 5# by Ewan Birney <birney@ebi.ac.uk> 6# and Lincoln Stein <lstein@cshl.org> 7# 8# Copyright Peter Schattner 9# 10# You may distribute this module under the same terms as perl itself 11# 12# History 13# September, 2000 AlignIO written by Peter Schattner 14 15# POD documentation - main docs before the code 16 17=head1 NAME 18 19Bio::AlignIO - Handler for AlignIO Formats 20 21=head1 SYNOPSIS 22 23 use Bio::AlignIO; 24 25 $inputfilename = "testaln.fasta"; 26 $in = Bio::AlignIO->new(-file => $inputfilename , 27 -format => 'fasta'); 28 $out = Bio::AlignIO->new(-file => ">out.aln.pfam" , 29 -format => 'pfam'); 30 31 while ( my $aln = $in->next_aln() ) { 32 $out->write_aln($aln); 33 } 34 35 # OR 36 37 use Bio::AlignIO; 38 39 open MYIN, '<', 'testaln.fasta' or die "Could not read file 'testaln.fasta': $!\n"; 40 $in = Bio::AlignIO->newFh(-fh => \*MYIN, 41 -format => 'fasta'); 42 open my $MYOUT, '>', 'testaln.pfam' or die "Could not write file 'testaln.pfam': $!\n"; 43 $out = Bio::AlignIO->newFh(-fh => $MYOUT, 44 -format => 'pfam'); 45 46 # World's smallest Fasta<->pfam format converter: 47 print $out $_ while <$in>; 48 49=head1 DESCRIPTION 50 51L<Bio::AlignIO> is a handler module for the formats in the AlignIO set, 52for example, L<Bio::AlignIO::fasta>. It is the officially sanctioned way 53of getting at the alignment objects. The resulting alignment is a 54L<Bio::Align::AlignI>-compliant object. 55 56The idea is that you request an object for a particular format. 57All the objects have a notion of an internal file that is read 58from or written to. A particular AlignIO object instance is configured 59for either input or output, you can think of it as a stream object. 
60 61Each object has functions: 62 63 $stream->next_aln(); 64 65And: 66 67 $stream->write_aln($aln); 68 69Also: 70 71 $stream->type() # returns 'INPUT' or 'OUTPUT' 72 73As an added bonus, you can recover a filehandle that is tied to the 74AlignIO object, allowing you to use the standard E<lt>E<gt> and print 75operations to read and write alignment objects: 76 77 use Bio::AlignIO; 78 79 # read from standard input 80 $stream = Bio::AlignIO->newFh(-format => 'Fasta'); 81 82 while ( $aln = <$stream> ) { 83 # do something with $aln 84 } 85 86And: 87 88 print $stream $aln; # when stream is in output mode 89 90L<Bio::AlignIO> is patterned on the L<Bio::SeqIO> module and shares 91most of its features. One significant difference is that 92L<Bio::AlignIO> usually handles IO for only a single alignment at a time, 93whereas L<Bio::SeqIO> handles IO for multiple sequences in a single stream. 94The principal reason for this is that whereas simultaneously handling 95multiple sequences is a common requirement, simultaneous handling of 96multiple alignments is not. The only current exception is format 97C<bl2seq> which parses results of the BLAST C<bl2seq> program and which 98may produce several alignment pairs. This set of alignment pairs can 99be read using multiple calls to L<next_aln>. 100 101=head1 CONSTRUCTORS 102 103=head2 Bio::AlignIO-E<gt>new() 104 105 $seqIO = Bio::AlignIO->new(-file => 'filename', -format=>$format); 106 $seqIO = Bio::AlignIO->new(-fh => \*FILEHANDLE, -format=>$format); 107 $seqIO = Bio::AlignIO->new(-format => $format); 108 $seqIO = Bio::AlignIO->new(-fh => \*STDOUT, -format => $format); 109 110The L<new> class method constructs a new L<Bio::AlignIO> object. 111The returned object can be used to retrieve or print alignment 112objects. L<new> accepts the following parameters: 113 114=over 4 115 116=item -file 117 118A file path to be opened for reading or writing. 
The usual Perl 119conventions apply: 120 121 'file' # open file for reading 122 '>file' # open file for writing 123 '>>file' # open file for appending 124 '+<file' # open file read/write 125 'command |' # open a pipe from the command 126 '| command' # open a pipe to the command 127 128=item -fh 129 130You may provide new() with a previously-opened filehandle. For 131example, to read from STDIN: 132 133 $seqIO = Bio::AlignIO->new(-fh => \*STDIN); 134 135Note that you must pass filehandles as references to globs. 136 137If neither a filehandle nor a filename is specified, then the module 138will read from the @ARGV array or STDIN, using the familiar E<lt>E<gt> 139semantics. 140 141=item -format 142 143Specify the format of the file. Supported formats include: 144 145 bl2seq Bl2seq Blast output 146 clustalw clustalw (.aln) format 147 emboss EMBOSS water and needle format 148 fasta FASTA format 149 maf Multiple Alignment Format 150 mase mase (seaview) format 151 mega MEGA format 152 meme MEME format 153 msf msf (GCG) format 154 nexus Swofford et al NEXUS format 155 pfam Pfam sequence alignment format 156 phylip Felsenstein PHYLIP format 157 prodom prodom (protein domain) format 158 psi PSI-BLAST format 159 selex selex (hmmer) format 160 stockholm stockholm format 161 162Currently only those formats which were implemented in L<Bio::SimpleAlign> 163have been incorporated into L<Bio::AlignIO>. Specifically, C<mase>, C<stockholm> 164and C<prodom> have only been implemented for input. See the specific module 165(e.g. L<Bio::AlignIO::prodom>) for notes on supported versions. 166 167If no format is specified and a filename is given, then the module 168will attempt to deduce it from the filename suffix. If this is unsuccessful, 169C<fasta> format is assumed. 170 171The format name is case insensitive; C<FASTA>, C<Fasta> and C<fasta> are 172all treated equivalently. 
173 174=back 175 176=head2 Bio::AlignIO-E<gt>newFh() 177 178 $fh = Bio::AlignIO->newFh(-fh => \*FILEHANDLE, -format=>$format); 179 # read from STDIN or use @ARGV: 180 $fh = Bio::AlignIO->newFh(-format => $format); 181 182This constructor behaves like L<new>, but returns a tied filehandle 183rather than a L<Bio::AlignIO> object. You can read sequences from this 184object using the familiar E<lt>E<gt> operator, and write to it using 185L<print>. The usual array and $_ semantics work. For example, you can 186read all sequence objects into an array like this: 187 188 @sequences = <$fh>; 189 190Other operations, such as read(), sysread(), write(), close(), and printf() 191are not supported. 192 193=over 1 194 195=item -flush 196 197By default, all files (or filehandles) opened for writing alignments 198will be flushed after each write_aln() making the file immediately 199usable. If you do not need this facility and would like to marginally 200improve the efficiency of writing multiple sequences to the same file 201(or filehandle), pass the -flush option '0' or any other value that 202evaluates as defined but false: 203 204 my $clustal = Bio::AlignIO->new( -file => "<prot.aln", 205 -format => "clustalw" ); 206 my $msf = Bio::AlignIO->new(-file => ">prot.msf", 207 -format => "msf", 208 -flush => 0 ); # go as fast as we can! 209 while($seq = $clustal->next_aln) { $msf->write_aln($seq) } 210 211=back 212 213=head1 OBJECT METHODS 214 215See below for more detailed summaries. The main methods are: 216 217=head2 $alignment = $AlignIO-E<gt>next_aln() 218 219Fetch an alignment from a formatted file. 220 221=head2 $AlignIO-E<gt>write_aln($aln) 222 223Write the specified alignment to a file.. 224 225=head2 TIEHANDLE(), READLINE(), PRINT() 226 227These provide the tie interface. See L<perltie> for more details. 228 229=head1 FEEDBACK 230 231=head2 Mailing Lists 232 233User feedback is an integral part of the evolution of this and other 234Bioperl modules. 
Send your comments and suggestions preferably to one 235of the Bioperl mailing lists. Your participation is much appreciated. 236 237 bioperl-l@bioperl.org - General discussion 238 http://bioperl.org/wiki/Mailing_lists - About the mailing lists 239 240=head2 Support 241 242Please direct usage questions or support issues to the mailing list: 243 244I<bioperl-l@bioperl.org> 245 246rather than to the module maintainer directly. Many experienced and 247responsive experts will be able to look at the problem and quickly 248address it. Please include a thorough description of the problem 249with code and data examples if at all possible. 250 251=head2 Reporting Bugs 252 253Report bugs to the Bioperl bug tracking system to help us keep track 254of the bugs and their resolution. Bug reports can be submitted via the 255web: 256 257 https://github.com/bioperl/bioperl-live/issues 258 259=head1 AUTHOR - Peter Schattner 260 261Email: schattner@alum.mit.edu 262 263=head1 CONTRIBUTORS 264 265Jason Stajich, jason@bioperl.org 266 267=head1 APPENDIX 268 269The rest of the documentation details each of the object 270methods. Internal methods are usually preceded with a _ 271 272=cut 273 274# 'Let the code begin... 
package Bio::AlignIO;
$Bio::AlignIO::VERSION = '1.7.7';
use strict;
use warnings;

use Symbol ();    # gensym() is used by fh() to build the tied filehandle
use Bio::Seq;
use Bio::LocatableSeq;
use Bio::SimpleAlign;
use Bio::Tools::GuessSeqFormat;
use base qw(Bio::Root::Root Bio::Root::IO);

=head2 new

 Title   : new
 Usage   : $stream = Bio::AlignIO->new(-file   => $filename,
                                       -format => 'Format')
 Function: Returns a new alignment stream. When called on Bio::AlignIO
           itself, the format is taken from -format, else guessed from
           the suffix of -file (or of $ARGV[0]), else guessed from the
           content of -file or -fh by Bio::Tools::GuessSeqFormat. The
           matching Bio::AlignIO::<format> module is then loaded and its
           new() is called with the original arguments.
 Returns : A Bio::AlignIO::<format> object, or nothing if the format
           module could not be loaded
 Throws  : if no format was given and none could be determined
 Args    : -file             => $filename
           -format           => format
           -fh               => filehandle to attach to
           -displayname_flat => 1 [optional]
                                to force the displayname to not show
                                start/end information
           -alphabet         => 'dna', 'rna' or 'protein' [optional]
           -width            => passed to the width() method of the
                                format module [optional]

=cut

sub new {
    my ($caller, @args) = @_;
    my $class = ref($caller) || $caller;

    # or do we want to call SUPER on an object if $caller is an
    # object?

    # Called on a format-specific subclass: build the real object.
    if ( $class =~ /Bio::AlignIO::(\S+)/ ) {
        my ($self) = $class->SUPER::new(@args);
        $self->_initialize(@args);
        return $self;
    }

    # Called on the generic class: work out the format and delegate.
    my %param = @args;
    @param{ map { lc $_ } keys %param } = values %param;    # lowercase keys

    my $format = $param{'-format'}
        || $class->_guess_format( $param{'-file'} || $ARGV[0] );

    # Suffix gave no answer: fall back to sniffing the content.
    unless ($format) {
        if ( $param{'-file'} ) {
            $format = Bio::Tools::GuessSeqFormat->new(
                -file => $param{'-file'} )->guess;
        }
        elsif ( $param{'-fh'} ) {
            $format = Bio::Tools::GuessSeqFormat->new(
                -fh => $param{'-fh'} )->guess;
        }
    }

    # normalize capitalization to lower case (an undefined guess becomes '')
    $format = defined $format ? lc $format : '';
    $class->throw("Unknown format given or could not determine it [$format]")
        unless $format;

    return unless $class->_load_format_module($format);
    return "Bio::AlignIO::$format"->new(@args);
}


=head2 newFh

 Title   : newFh
 Usage   : $fh = Bio::AlignIO->newFh(-file=>$filename,-format=>'Format')
 Function: does a new() followed by an fh()
 Example : $fh = Bio::AlignIO->newFh(-file=>$filename,-format=>'Format')
           $aln = <$fh>;     # read an alignment object
           print $fh $aln;   # write an alignment object
 Returns : filehandle tied to the Bio::AlignIO::Fh class, or nothing if
           new() did not return an object
 Args    : same as new()

=cut

sub newFh {
    my $class = shift;
    my $self  = $class->new(@_);
    return unless $self;
    return $self->fh;
}

=head2 fh

 Title   : fh
 Usage   : $obj->fh
 Function: Creates an anonymous glob and ties it to this object's class,
           handing the object itself to TIEHANDLE
 Example : $fh = $obj->fh;   # make a tied filehandle
           $aln = <$fh>;     # read an alignment object
           print $fh $aln;   # write an alignment object
 Returns : filehandle tied to the Bio::AlignIO::Fh class
 Args    : none

=cut


sub fh {
    my $self   = shift;
    my $class  = ref($self) || $self;
    my $symbol = Symbol::gensym();
    tie $$symbol, $class, $self;
    return $symbol;
}


=head2 format

 Title   : format
 Usage   : $format = $stream->format()
 Function: Get the alignment format
 Returns : alignment format
 Args    : none

=cut

# format() method inherited from Bio::Root::IO


# _initialize is where the heavy stuff will happen when new is called.
# It picks -displayname_flat, -alphabet and -width out of the argument
# list, applies them through the corresponding accessors, then hands the
# full argument list to _initialize_io().
# NOTE(review): width() is not defined in this package; it is presumably
# supplied by the format modules that accept -width -- confirm.

sub _initialize {
    my ($self, @args) = @_;
    my ($flat, $alphabet, $width) =
        $self->_rearrange([qw(DISPLAYNAME_FLAT ALPHABET WIDTH)], @args);
    $self->force_displayname_flat($flat) if defined $flat;
    $self->alphabet($alphabet);
    $self->width($width) if defined $width;
    $self->_initialize_io(@args);
    return 1;
}

=head2 _load_format_module

 Title   : _load_format_module
 Usage   : *INTERNAL AlignIO stuff*
 Function: Loads up (like use) a module at run time on demand
 Example : $class->_load_format_module('fasta')
 Returns : 1 on success; nothing (after printing a diagnostic to STDERR)
           if loading Bio::AlignIO::<format> raised an exception
 Args    : format name (string)

=cut

sub _load_format_module {
    my ($self, $format) = @_;
    my $module = "Bio::AlignIO::" . $format;

    eval {
        $self->_load_module($module);
    };
    my $error = $@;    # capture at once, before anything can reset $@
    if ($error) {
        print STDERR <<END;
$self: $format cannot be found
Exception $error
For more information about the AlignIO system please see the AlignIO docs.
This includes ways of checking for formats at compile time, not run time
END
        return;
    }
    return 1;
}

=head2 next_aln

 Title   : next_aln
 Usage   : $aln = stream->next_aln
 Function: reads the next $aln object from the stream
 Returns : a Bio::Align::AlignI compliant object
 Throws  : always, in this generic class; format modules override it
 Args    : none

=cut

sub next_aln {
    my ($self) = @_;
    $self->throw("Sorry, you cannot read from a generic Bio::AlignIO object.");
}

=head2 write_aln

 Title   : write_aln
 Usage   : $stream->write_aln($aln)
 Function: writes the $aln object into the stream
 Returns : 1 for success and 0 for error
 Throws  : always, in this generic class; format modules override it
 Args    : alignment object to write

=cut

sub write_aln {
    my ($self, $aln) = @_;
    $self->throw("Sorry, you cannot write to a generic Bio::AlignIO object.");
}

=head2 _guess_format

 Title   : _guess_format
 Usage   : $obj->_guess_format($filename)
 Function: Maps a filename suffix to an alignment format name. The
           rules are tried in order and the first match wins.
 Example : Bio::AlignIO->_guess_format('pair.aln')   # 'clustalw'
 Returns : guessed format of filename (lower case), or nothing if the
           filename is false or no rule matches
 Args    : filename (string)

=cut

sub _guess_format {
    my ($class, $filename) = @_;
    return unless $filename;

    # A lexical is matched here on purpose: assigning to the global $_
    # would overwrite the caller's data whenever the caller has $_
    # aliased (for/map/grep).
    my @suffix_rules = (
        [ clustalw  => qr/\.aln$/i ],
        [ emboss    => qr/\.(?:water|needle)$/i ],
        [ metafasta => qr/\.metafasta$/i ],
        [ fasta     => qr/\.(?:fasta|fast|seq|fa|fsa|nt|aa)$/i ],
        # Not anchored at the end of the name, unlike the other rules;
        # kept that way so existing callers see no change.
        [ maf       => qr/\.maf/i ],
        [ mega      => qr/\.(?:meg|mega)$/i ],
        [ meme      => qr/\.meme$/i ],
        [ msf       => qr/\.(?:msf|pileup|gcg)$/i ],
        [ nexus     => qr/\.(?:nexus|nex)$/i ],
        [ pfam      => qr/\.(?:pfam|pfm)$/i ],
        [ phylip    => qr/\.(?:phylip|phlp|phyl|phy|ph)$/i ],
        [ psi       => qr/\.psi$/i ],
        [ stockholm => qr/\.stk$/i ],
        [ selex     => qr/\.(?:selex|slx|selx|slex|sx)$/i ],
        [ xmfa      => qr/\.xmfa$/i ],
    );

    for my $rule (@suffix_rules) {
        my ($format, $pattern) = @{$rule};
        return $format if $filename =~ $pattern;
    }
    return;
}

# Closes the stream when the object goes out of scope.
sub DESTROY {
    my $self = shift;
    $self->close();
}

# Tie interface: wraps the AlignIO object handed over by fh().
sub TIEHANDLE {
    my $class = shift;
    return bless { 'alignio' => shift }, $class;
}

# Tie interface for <$fh>: one alignment (or undef) in scalar context,
# every remaining alignment in list context.
sub READLINE {
    my $self   = shift;
    my $stream = $self->{'alignio'};

    unless (wantarray) {
        return $stream->next_aln() || undef;
    }

    my @alns;
    while ( my $aln = $stream->next_aln() ) {
        push @alns, $aln;
    }
    return @alns;
}

# Tie interface for print $fh ...: forwards the arguments to write_aln().
sub PRINT {
    my $self = shift;
    $self->{'alignio'}->write_aln(@_);
}


=head2 force_displayname_flat

 Title   : force_displayname_flat
 Usage   : $obj->force_displayname_flat($newval)
 Function: Get/Set the flag that forces the displayname to not show
           start/end information
 Example : $obj->force_displayname_flat(1);
 Returns : value of force_displayname_flat (a scalar); 0 if it is unset
           or false on get
 Args    : on set, new value (a scalar or undef, optional)


=cut

sub force_displayname_flat {
    my $self = shift;
    return $self->{'_force_displayname_flat'} = shift if @_;
    return $self->{'_force_displayname_flat'} || 0;
}

=head2 alphabet

 Title   : alphabet
 Usage   : $obj->alphabet($newval)
 Function: Get/Set alphabet for purpose of passing to Bio::LocatableSeq creation
 Example : $obj->alphabet('dna');
 Returns : value of alphabet (a scalar)
 Throws  : if the new value is not 'dna', 'rna' or 'protein'
 Args    : on set, new value (a scalar, optional); an undefined value
           leaves the current setting untouched


=cut

sub alphabet {
    my $self  = shift;
    my $value = shift;
    if ( defined $value ) {
        $self->throw("Invalid alphabet $value")
            unless $value eq 'rna' || $value eq 'protein' || $value eq 'dna';
        $self->{'_alphabet'} = $value;
    }
    return $self->{'_alphabet'};
}


1;