1#
2# BioPerl module for Bio::AlignIO
3#
4#	based on the Bio::SeqIO module
5#       by Ewan Birney <birney@ebi.ac.uk>
6#       and Lincoln Stein  <lstein@cshl.org>
7#
8# Copyright Peter Schattner
9#
10# You may distribute this module under the same terms as perl itself
11#
12# History
13# September, 2000 AlignIO written by Peter Schattner
14
15# POD documentation - main docs before the code
16
17=head1 NAME
18
19Bio::AlignIO - Handler for AlignIO Formats
20
21=head1 SYNOPSIS
22
23    use Bio::AlignIO;
24
25    $inputfilename = "testaln.fasta";
26    $in  = Bio::AlignIO->new(-file   => $inputfilename ,
27                             -format => 'fasta');
28    $out = Bio::AlignIO->new(-file   => ">out.aln.pfam" ,
29                             -format => 'pfam');
30
31    while ( my $aln = $in->next_aln() ) {
32        $out->write_aln($aln);
33    }
34
35    # OR
36
37    use Bio::AlignIO;
38
39    open MYIN, '<', 'testaln.fasta' or die "Could not read file 'testaln.fasta': $!\n";
40    $in  = Bio::AlignIO->newFh(-fh     => \*MYIN,
41                               -format => 'fasta');
42    open my $MYOUT, '>', 'testaln.pfam' or die "Could not write file 'testaln.pfam': $!\n";
43    $out = Bio::AlignIO->newFh(-fh     =>  $MYOUT,
44                               -format => 'pfam');
45
46    # World's smallest Fasta<->pfam format converter:
47    print $out $_ while <$in>;
48
49=head1 DESCRIPTION
50
51L<Bio::AlignIO> is a handler module for the formats in the AlignIO set,
52for example, L<Bio::AlignIO::fasta>. It is the officially sanctioned way
53of getting at the alignment objects. The resulting alignment is a
54L<Bio::Align::AlignI>-compliant object.
55
56The idea is that you request an object for a particular format.
57All the objects have a notion of an internal file that is read
58from or written to. A particular AlignIO object instance is configured
59for either input or output, you can think of it as a stream object.
60
61Each object has functions:
62
63   $stream->next_aln();
64
65And:
66
67   $stream->write_aln($aln);
68
69Also:
70
71   $stream->type() # returns 'INPUT' or 'OUTPUT'
72
73As an added bonus, you can recover a filehandle that is tied to the
74AlignIO object, allowing you to use the standard E<lt>E<gt> and print
75operations to read and write alignment objects:
76
77    use Bio::AlignIO;
78
79    # read from standard input
80    $stream = Bio::AlignIO->newFh(-format => 'Fasta');
81
82    while ( $aln = <$stream> ) {
83	     # do something with $aln
84    }
85
86And:
87
88    print $stream $aln; # when stream is in output mode
89
90L<Bio::AlignIO> is patterned on the L<Bio::SeqIO> module and shares
91most of its features.  One significant difference is that
92L<Bio::AlignIO> usually handles IO for only a single alignment at a time,
93whereas L<Bio::SeqIO> handles IO for multiple sequences in a single stream.
94The principal reason for this is that whereas simultaneously handling
95multiple sequences is a common requirement, simultaneous handling of
96multiple alignments is not. The only current exception is format
97C<bl2seq> which parses results of the BLAST C<bl2seq> program and which
98may produce several alignment pairs.  This set of alignment pairs can
99be read using multiple calls to L<next_aln>.
100
101=head1 CONSTRUCTORS
102
103=head2 Bio::AlignIO-E<gt>new()
104
105   $seqIO = Bio::AlignIO->new(-file => 'filename',   -format=>$format);
106   $seqIO = Bio::AlignIO->new(-fh   => \*FILEHANDLE, -format=>$format);
107   $seqIO = Bio::AlignIO->new(-format => $format);
108   $seqIO = Bio::AlignIO->new(-fh => \*STDOUT, -format => $format);
109
110The L<new> class method constructs a new L<Bio::AlignIO> object.
111The returned object can be used to retrieve or print alignment
112objects. L<new> accepts the following parameters:
113
114=over 4
115
116=item -file
117
118A file path to be opened for reading or writing.  The usual Perl
119conventions apply:
120
121   'file'       # open file for reading
122   '>file'      # open file for writing
123   '>>file'     # open file for appending
124   '+<file'     # open file read/write
125   'command |'  # open a pipe from the command
126   '| command'  # open a pipe to the command
127
128=item -fh
129
130You may provide new() with a previously-opened filehandle.  For
131example, to read from STDIN:
132
133   $seqIO = Bio::AlignIO->new(-fh => \*STDIN);
134
135Note that you must pass filehandles as references to globs.
136
137If neither a filehandle nor a filename is specified, then the module
138will read from the @ARGV array or STDIN, using the familiar E<lt>E<gt>
139semantics.
140
141=item -format
142
143Specify the format of the file.  Supported formats include:
144
145   bl2seq      Bl2seq Blast output
146   clustalw    clustalw (.aln) format
147   emboss      EMBOSS water and needle format
148   fasta       FASTA format
149   maf         Multiple Alignment Format
150   mase        mase (seaview) format
151   mega        MEGA format
152   meme        MEME format
153   msf         msf (GCG) format
154   nexus       Swofford et al NEXUS format
155   pfam        Pfam sequence alignment format
156   phylip      Felsenstein PHYLIP format
157   prodom      prodom (protein domain) format
158   psi         PSI-BLAST format
159   selex       selex (hmmer) format
160   stockholm   stockholm format
161
162Currently only those formats which were implemented in L<Bio::SimpleAlign>
163have been incorporated into L<Bio::AlignIO>.  Specifically, C<mase>, C<stockholm>
164and C<prodom> have only been implemented for input. See the specific module
165(e.g. L<Bio::AlignIO::prodom>) for notes on supported versions.
166
167If no format is specified and a filename is given, then the module
168will attempt to deduce it from the filename suffix.  If this is unsuccessful,
169C<fasta> format is assumed.
170
171The format name is case insensitive; C<FASTA>, C<Fasta> and C<fasta> are
172all treated equivalently.
173
174=back
175
176=head2 Bio::AlignIO-E<gt>newFh()
177
178   $fh = Bio::AlignIO->newFh(-fh   => \*FILEHANDLE, -format=>$format);
179   # read from STDIN or use @ARGV:
180   $fh = Bio::AlignIO->newFh(-format => $format);
181
182This constructor behaves like L<new>, but returns a tied filehandle
183rather than a L<Bio::AlignIO> object.  You can read sequences from this
184object using the familiar E<lt>E<gt> operator, and write to it using
185L<print>. The usual array and $_ semantics work.  For example, you can
186read all sequence objects into an array like this:
187
188  @sequences = <$fh>;
189
190Other operations, such as read(), sysread(), write(), close(), and printf()
191are not supported.
192
193=over 1
194
195=item -flush
196
197By default, all files (or filehandles) opened for writing alignments
198will be flushed after each write_aln() making the file immediately
199usable.  If you do not need this facility and would like to marginally
200improve the efficiency of writing multiple sequences to the same file
201(or filehandle), pass the -flush option '0' or any other value that
202evaluates as defined but false:
203
204  my $clustal = Bio::AlignIO->new( -file   => "<prot.aln",
205                                   -format => "clustalw" );
206  my $msf = Bio::AlignIO->new(-file   => ">prot.msf",
207                              -format => "msf",
208                              -flush  => 0 ); # go as fast as we can!
209  while($seq = $clustal->next_aln) { $msf->write_aln($seq) }
210
211=back
212
213=head1 OBJECT METHODS
214
215See below for more detailed summaries.  The main methods are:
216
217=head2 $alignment = $AlignIO-E<gt>next_aln()
218
219Fetch an alignment from a formatted file.
220
221=head2 $AlignIO-E<gt>write_aln($aln)
222
223Write the specified alignment to a file..
224
225=head2 TIEHANDLE(), READLINE(), PRINT()
226
227These provide the tie interface.  See L<perltie> for more details.
228
229=head1 FEEDBACK
230
231=head2 Mailing Lists
232
233User feedback is an integral part of the evolution of this and other
234Bioperl modules. Send your comments and suggestions preferably to one
235of the Bioperl mailing lists.  Your participation is much appreciated.
236
237  bioperl-l@bioperl.org                  - General discussion
238  http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
239
240=head2 Support
241
242Please direct usage questions or support issues to the mailing list:
243
244I<bioperl-l@bioperl.org>
245
246rather than to the module maintainer directly. Many experienced and
247reponsive experts will be able look at the problem and quickly
248address it. Please include a thorough description of the problem
249with code and data examples if at all possible.
250
251=head2 Reporting Bugs
252
253Report bugs to the Bioperl bug tracking system to help us keep track
254the bugs and their resolution.  Bug reports can be submitted via the
255web:
256
257  https://github.com/bioperl/bioperl-live/issues
258
259=head1 AUTHOR - Peter Schattner
260
261Email: schattner@alum.mit.edu
262
263=head1 CONTRIBUTORS
264
265Jason Stajich, jason@bioperl.org
266
267=head1 APPENDIX
268
269The rest of the documentation details each of the object
270methods. Internal methods are usually preceded with a _
271
272=cut
273
274# 'Let the code begin...
275
276package Bio::AlignIO;
277$Bio::AlignIO::VERSION = '1.7.7';
278use strict;
279
280use Bio::Seq;
281use Bio::LocatableSeq;
282use Bio::SimpleAlign;
283use Bio::Tools::GuessSeqFormat;
284use base qw(Bio::Root::Root Bio::Root::IO);
285
286=head2 new
287
288 Title   : new
289 Usage   : $stream = Bio::AlignIO->new(-file => $filename,
290                                       -format => 'Format')
291 Function: Returns a new seqstream
292 Returns : A Bio::AlignIO::Handler initialised with
293           the appropriate format
294 Args    : -file => $filename
295           -format => format
296           -fh => filehandle to attach to
297           -displayname_flat => 1 [optional]
298                                to force the displayname to not show start/end
299                                information
300
301=cut
302
303sub new {
304    my ($caller,@args) = @_;
305    my $class = ref($caller) || $caller;
306
307    # or do we want to call SUPER on an object if $caller is an
308    # object?
309    if( $class =~ /Bio::AlignIO::(\S+)/ ) {
310	my ($self) = $class->SUPER::new(@args);
311	$self->_initialize(@args);
312	return $self;
313    } else {
314
315	my %param = @args;
316	@param{ map { lc $_ } keys %param } = values %param; # lowercase keys
317	my $format = $param{'-format'} ||
318	    $class->_guess_format( $param{-file} || $ARGV[0] );
319        unless ($format) {
320            if ($param{-file}) {
321                $format = Bio::Tools::GuessSeqFormat->new(-file => $param{-file}||$ARGV[0] )->guess;
322            }
323            elsif ($param{-fh}) {
324                $format = Bio::Tools::GuessSeqFormat->new(-fh => $param{-fh}||$ARGV[0] )->guess;
325            }
326        }
327	$format = "\L$format";	# normalize capitalization to lower case
328        $class->throw("Unknown format given or could not determine it [$format]")
329            unless $format;
330
331	return unless( $class->_load_format_module($format) );
332	return "Bio::AlignIO::$format"->new(@args);
333    }
334}
335
336
337=head2 newFh
338
339 Title   : newFh
340 Usage   : $fh = Bio::AlignIO->newFh(-file=>$filename,-format=>'Format')
341 Function: does a new() followed by an fh()
342 Example : $fh = Bio::AlignIO->newFh(-file=>$filename,-format=>'Format')
343           $sequence = <$fh>;   # read a sequence object
344           print $fh $sequence; # write a sequence object
345 Returns : filehandle tied to the Bio::AlignIO::Fh class
346 Args    :
347
348=cut
349
350sub newFh {
351  my $class = shift;
352  return unless my $self = $class->new(@_);
353  return $self->fh;
354}
355
356=head2 fh
357
358 Title   : fh
359 Usage   : $obj->fh
360 Function:
361 Example : $fh = $obj->fh;      # make a tied filehandle
362           $sequence = <$fh>;   # read a sequence object
363           print $fh $sequence; # write a sequence object
364 Returns : filehandle tied to the Bio::AlignIO::Fh class
365 Args    :
366
367=cut
368
369
370sub fh {
371  my $self = shift;
372  my $class = ref($self) || $self;
373  my $s = Symbol::gensym;
374  tie $$s,$class,$self;
375  return $s;
376}
377
378
379=head2 format
380
381 Title   : format
382 Usage   : $format = $stream->format()
383 Function: Get the alignment format
384 Returns : alignment format
385 Args    : none
386
387=cut
388
389# format() method inherited from Bio::Root::IO
390
391
392# _initialize is where the heavy stuff will happen when new is called
393
394sub _initialize {
395  my($self,@args) = @_;
396  my ($flat,$alphabet,$width) = $self->_rearrange([qw(DISPLAYNAME_FLAT ALPHABET WIDTH)],
397				 @args);
398  $self->force_displayname_flat($flat) if defined $flat;
399  $self->alphabet($alphabet);
400  $self->width($width) if defined $width;
401  $self->_initialize_io(@args);
402  1;
403}
404
405=head2 _load_format_module
406
407 Title   : _load_format_module
408 Usage   : *INTERNAL AlignIO stuff*
409 Function: Loads up (like use) a module at run time on demand
410 Example :
411 Returns :
412 Args    :
413
414=cut
415
416sub _load_format_module {
417  my ($self,$format) = @_;
418  my $module = "Bio::AlignIO::" . $format;
419  my $ok;
420
421  eval {
422      $ok = $self->_load_module($module);
423  };
424  if ( $@ ) {
425    print STDERR <<END;
426$self: $format cannot be found
427Exception $@
428For more information about the AlignIO system please see the AlignIO docs.
429This includes ways of checking for formats at compile time, not run time
430END
431  ;
432    return;
433  }
434  return 1;
435}
436
437=head2 next_aln
438
439 Title   : next_aln
440 Usage   : $aln = stream->next_aln
441 Function: reads the next $aln object from the stream
442 Returns : a Bio::Align::AlignI compliant object
443 Args    :
444
445=cut
446
447sub next_aln {
448   my ($self,$aln) = @_;
449   $self->throw("Sorry, you cannot read from a generic Bio::AlignIO object.");
450}
451
452=head2 write_aln
453
454 Title   : write_aln
455 Usage   : $stream->write_aln($aln)
456 Function: writes the $aln object into the stream
457 Returns : 1 for success and 0 for error
458 Args    : Bio::Seq object
459
460=cut
461
462sub write_aln {
463    my ($self,$aln) = @_;
464    $self->throw("Sorry, you cannot write to a generic Bio::AlignIO object.");
465}
466
467=head2 _guess_format
468
469 Title   : _guess_format
470 Usage   : $obj->_guess_format($filename)
471 Function:
472 Example :
473 Returns : guessed format of filename (lower case)
474 Args    :
475
476=cut
477
478sub _guess_format {
479my $class = shift;
480   return unless $_ = shift;
481   return 'clustalw'    if /\.aln$/i;
482   return 'emboss'      if /\.(water|needle)$/i;
483   return 'metafasta'   if /\.metafasta$/;
484   return 'fasta'       if /\.(fasta|fast|seq|fa|fsa|nt|aa)$/i;
485   return 'maf'         if /\.maf/i;
486   return 'mega'        if /\.(meg|mega)$/i;
487   return 'meme'        if /\.meme$/i;
488   return 'msf'         if /\.(msf|pileup|gcg)$/i;
489   return 'nexus'       if /\.(nexus|nex)$/i;
490   return 'pfam'        if /\.(pfam|pfm)$/i;
491   return 'phylip'      if /\.(phylip|phlp|phyl|phy|ph)$/i;
492   return 'psi'         if /\.psi$/i;
493   return 'stockholm'   if /\.stk$/i;
494   return 'selex'       if /\.(selex|slx|selx|slex|sx)$/i;
495   return 'xmfa'        if /\.xmfa$/i;
496}
497
498sub DESTROY {
499    my $self = shift;
500    $self->close();
501}
502
503sub TIEHANDLE {
504  my $class = shift;
505  return bless {'alignio' => shift},$class;
506}
507
508sub READLINE {
509  my $self = shift;
510  return $self->{'alignio'}->next_aln() || undef unless wantarray;
511  my (@list,$obj);
512  push @list,$obj  while $obj = $self->{'alignio'}->next_aln();
513  return @list;
514}
515
516sub PRINT {
517  my $self = shift;
518  $self->{'alignio'}->write_aln(@_);
519}
520
521
522=head2 force_displayname_flat
523
524 Title   : force_displayname_flat
525 Usage   : $obj->force_displayname_flat($newval)
526 Function:
527 Example :
528 Returns : value of force_displayname_flat (a scalar)
529 Args    : on set, new value (a scalar or undef, optional)
530
531
532=cut
533
534sub force_displayname_flat{
535    my $self = shift;
536    return $self->{'_force_displayname_flat'} = shift if @_;
537    return $self->{'_force_displayname_flat'} || 0;
538}
539
540=head2 alphabet
541
542 Title   : alphabet
543 Usage   : $obj->alphabet($newval)
544 Function: Get/Set alphabet for purpose of passing to Bio::LocatableSeq creation
545 Example : $obj->alphabet('dna');
546 Returns : value of alphabet (a scalar)
547 Args    : on set, new value (a scalar or undef, optional)
548
549
550=cut
551
552sub alphabet {
553    my $self = shift;
554    my $value = shift;
555    if ( defined $value ) {
556        $self->throw("Invalid alphabet $value") unless $value eq 'rna' || $value eq 'protein' || $value eq 'dna';
557        $self->{'_alphabet'} = $value;
558    }
559    return $self->{'_alphabet'};
560}
561
562
5631;
564