1#
2# BioPerl module for Bio::Tools::Run::Alignment::MAFFT
3#
4# Please direct questions and support issues to <bioperl-l@bioperl.org>
5#
6# Cared for by Jason Stajich
7#
8# Copyright Jason Stajich
9#
10# You may distribute this module under the same terms as perl itself
11#
12# POD documentation - main docs before the code
13
14=head1 NAME
15
16Bio::Tools::Run::Alignment::MAFFT - run the MAFFT alignment tools
17
18=head1 SYNOPSIS
19
20  # Build a MAFFT alignment factory
21  $factory = Bio::Tools::Run::Alignment::MAFFT->new(@params);
22
23  # Pass the factory a list of sequences to be aligned.
24  $inputfilename = 't/cysprot.fa';
25  # $aln is a SimpleAlign object.
26  $aln = $factory->align($inputfilename);
27
28  # or where @seq_array is an array of Bio::Seq objects
29  $seq_array_ref = \@seq_array;
30  $aln = $factory->align($seq_array_ref);
31
32  #There are various additional options available.
33
34=head1 DESCRIPTION
35
36You can get MAFFT from L<http://mafft.cbrc.jp/alignment/software/>.
37"fftnsi" is the default method for Mafft version 4 in this
38implementation.
39
40See Bio::Tools::Run::Alignment::Clustalw for a description on how to
41specify parameters to the underlying alignment program. See the MAFFT
42manual page for a description of the MAFFT parameters.
43
44=head1 FEEDBACK
45
46=head2 Mailing Lists
47
48User feedback is an integral part of the evolution of this and other
49Bioperl modules. Send your comments and suggestions preferably to one
50of the Bioperl mailing lists.  Your participation is much appreciated.
51
52  bioperl-l@bioperl.org                  - General discussion
53  http://bioperl.org/MailList.html - About the mailing lists
54
55=head2 Support
56
57Please direct usage questions or support issues to the mailing list:
58
59I<bioperl-l@bioperl.org>
60
61rather than to the module maintainer directly. Many experienced and
62reponsive experts will be able look at the problem and quickly
63address it. Please include a thorough description of the problem
64with code and data examples if at all possible.
65
66=head2 Reporting Bugs
67
68Report bugs to the Bioperl bug tracking system to help us keep track
69the bugs and their resolution.  Bug reports can be submitted via the web:
70
71  http://redmine.open-bio.org/projects/bioperl/
72
73=head1 AUTHOR -  Jason Stajich
74
75Email jason-at-bioperl.org
76
77=head1 APPENDIX
78
79The rest of the documentation details each of the object
80methods. Internal methods are usually preceded with a _
81
82=cut
83
84package Bio::Tools::Run::Alignment::MAFFT;
85
86use vars qw($AUTOLOAD @ISA $PROGRAMNAME $PROGRAM %DEFAULTS
87            @MAFFT4_PARAMS @MAFFT4_SWITCHES @OTHER_SWITCHES %OK_FIELD
88            @MAFFT_ALN_METHODS @MAFFT6_PARAMS @MAFFT6_SWITCHES %OK_FIELD6
89            );
90use strict;
91use Bio::Seq;
92use Bio::SeqIO;
93use Bio::SimpleAlign;
94use Bio::AlignIO;
95
96use base qw(Bio::Root::Root Bio::Tools::Run::WrapperBase
97          Bio::Factory::ApplicationFactoryI);
98
99BEGIN {
100    %DEFAULTS = ( 'OUTPUT' => 'fasta',
101                  'METHOD' => 'fftnsi',
102                  'CYCLES' => 2);
103    @MAFFT4_PARAMS =qw( METHOD CYCLES );
104    @MAFFT4_SWITCHES = qw( NJ ALL_POSITIVE);
105
106    # NB: Mafft6 options are case-sensitive (eg. --lop and --LOP is different)
107    @MAFFT6_PARAMS   = qw( weighti retree maxiterate partsize groupsize
108        op ep lop lep lexp LOP LEXP bl jtt tm aamatrix fmodel seed );
109    @MAFFT6_SWITCHES = qw( auto 6merpair globalpair localpair genafpair
110        fastapair fft nofft noscore memsave parttree dpparttree fastaparttree
111        clustalout inputorder reorder treeout nuc amino
112        );
113
114    @OTHER_SWITCHES = qw(QUIET ALIGN OUTPUT OUTFILE);
115    @MAFFT_ALN_METHODS = qw(fftnsi fftns nwnsi nwns fftnsrough nwnsrough);
116    #@MAFFT6_ALN_METHODS = qw(linsi ginsi einsi fftnsi fftns nwnsi nwns)
117    # Authorize attribute fields
118    foreach my $attr ( @MAFFT4_SWITCHES,@MAFFT4_PARAMS,@OTHER_SWITCHES ) {
119        $OK_FIELD{$attr}++;
120    }
121    foreach my $attr ( @MAFFT6_PARAMS, @MAFFT6_SWITCHES ) {
122        $OK_FIELD6{$attr}++
123    }
124}
125
126=head2 program_name
127
128 Title   : program_name
129 Usage   : $factory->program_name()
130 Function: holds the program name
131 Returns:  string
132 Args    : None
133
134=cut
135
136sub program_name {
137    return 'mafft';
138}
139
140=head2 executable
141
142 Title   : executable
143 Usage   : my $exe = $blastfactory->executable('blastall');
144 Function: Finds the full path to the 'codeml' executable
145 Returns : string representing the full path to the exe
146 Args    : [optional] name of executable to set path to
147           [optional] boolean flag whether or not warn when exe is not found
148
149
150=cut
151
152sub executable {
153    my ($self, $exename, $exe,$warn) = @_;
154    $exename = $self->program_name unless (defined $exename );
155
156    if( defined $exe && -x $exe ) {
157        $self->{'_pathtoexe'}->{$exename} = $exe;
158    }
159    unless( defined $self->{'_pathtoexe'}->{$exename} ) {
160        my $f = $self->program_path($exename);
161        $exe = $self->{'_pathtoexe'}->{$exename} = $f if(-e $f && -x $f );
162
163        #  This is how I meant to split up these conditionals --jason
164        # if exe is null we will execute this (handle the case where
165        # PROGRAMDIR pointed to something invalid)
166        unless( $exe )  {  # we didn't find it in that last conditional
167        if( ($exe = $self->io->exists_exe($exename)) && -x $exe ) {
168            $self->{'_pathtoexe'}->{$exename} = $exe;
169        } else {
170            $self->warn("Cannot find executable for $exename") if $warn;
171            $self->{'_pathtoexe'}->{$exename} = undef;
172        }
173        }
174    }
175    return $self->{'_pathtoexe'}->{$exename};
176}
177
178
179=head2 program_path
180
181 Title   : program_path
182 Usage   : my $path = $factory->program_path();
183 Function: Builds path for executable
184 Returns : string representing the full path to the exe
185 Args    : none
186
187=cut
188
189sub program_path {
190    my ($self,$program_name) = @_;
191    my @path;
192    push @path, $self->program_dir if $self->program_dir;
193    push @path, $program_name .($^O =~ /mswin/i ?'.exe':'');
194
195    return Bio::Root::IO->catfile(@path);
196}
197
198=head2 program_dir
199
200 Title   : program_dir
201 Usage   : $factory->program_dir(@params)
202 Function: returns the program directory, obtained from ENV variable.
203 Returns:  string
204 Args    :
205
206=cut
207
208sub program_dir {
209        return File::Spec->rel2abs($ENV{MAFFTDIR}) if $ENV{MAFFTDIR};
210}
211
212sub new {
213    my ($class,@args) = @_;
214    my $self = $class->SUPER::new(@args);
215    my ($attr, $value);
216
217    while (@args)  {
218        $attr =   shift @args;
219        $value =  shift @args;
220        next if( $attr =~ /^-/); # don't want named parameters
221        $self->$attr($value);
222    }
223
224    $self->output($DEFAULTS{'OUTPUT'}) unless( $self->output );
225    if ( ! $self->_version6 ) {
226        $self->method($DEFAULTS{'METHOD'}) unless( $self->method );
227    }
228    return $self;
229}
230
231sub AUTOLOAD {
232    my $self = shift;
233    my $attr = $AUTOLOAD;
234    $attr =~ s/.*:://;
235
236    # NB: Mafft6 options are case-sensitive
237    if ( $self->_version6 ) {
238        if ( $OK_FIELD6{ $attr } ) {
239            # Don't want the attrs to clash with bioperl attributes
240            $self->{version6attrs}{$attr} = shift if @_;
241            return $self->{version6attrs}{$attr};
242        }
243    }
244    $attr = uc $attr;
245    # aliasing
246    $attr = 'OUTFILE' if $attr eq 'OUTFILE_NAME';
247    $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr};
248
249    $self->{$attr} = shift if @_;
250    return $self->{$attr};
251}
252
253=head2 error_string
254
255 Title   : error_string
256 Usage   : $obj->error_string($newval)
257 Function: Where the output from the last analysis run is stored.
258 Returns : value of error_string
259 Args    : newvalue (optional)
260
261
262=cut
263
264sub error_string{
265   my ($self,$value) = @_;
266   if( defined $value) {
267      $self->{'error_string'} = $value;
268    }
269    return $self->{'error_string'};
270
271}
272
273=head2  version
274
275 Title   : version
276 Usage   : exit if $prog->version() < 1.8
277 Function: Determine the version number of the program
278 Example :
279 Returns : float or undef
280 Args    : none
281
282=cut
283
284sub version {
285    my ($self) = @_;
286    my $exe;
287    return unless $exe = $self->executable;
288    # this is a bit of a hack, but MAFFT is just a gawk script
289    # so we are actually grepping the scriptfile
290    # UPDATE (Torsten Seemann)
291    # it now seems to be a 'sh' script and the format has changed
292    # slightly. i've tried to make the change compatible with both...
293    # version="v5.860 (2006/06/12)"; export version
294
295    if( open(my $NAME, "grep 'export version' $exe | ") ) {
296        while(<$NAME>) {
297            if( /version.*?([\d.a-z]+)\s+/ ) {
298                return $1;
299            }
300        }
301        $self->warn("No version found");
302        close($NAME);
303    } else {
304        $self->warn("$!");
305    }
306    return;
307}
308
309=head2 run
310
311 Title   : run
312 Usage   : my $output = $application->run(\@seqs);
313 Function: Generic run of an application
314 Returns : Bio::SimpleAlign object
315 Args    : array ref of Bio::PrimarySeqI objects OR
316           filename of sequences to run with
317
318=cut
319
320sub run {
321   my ($self,$seqs) = @_;
322   return $self->align($seqs);
323}
324
325=head2  align
326
327 Title   : align
328 Usage   :
329    $inputfilename = 't/data/cysprot.fa';
330    $aln = $factory->align($inputfilename);
331or
332    $seq_array_ref = \@seq_array;
333    # @seq_array is an array of Seq objs
334    $aln = $factory->align($seq_array_ref);
335 Function: Perform a multiple sequence alignment
336 Returns : Reference to a SimpleAlign object containing the
337           sequence alignment.
338 Args    : Name of a file containing a set of unaligned fasta sequences
339           or else an array of references to Bio::Seq objects.
340
341 Throws an exception if argument is not either a string (eg a
342 filename) or a reference to an array of Bio::Seq objects.  If
343 argument is string, throws exception if file corresponding to string
344 name can not be found. If argument is Bio::Seq array, throws
345 exception if less than two sequence objects are in array.
346
347=cut
348
349sub align {
350    my ($self,$input) = @_;
351    # Create input file pointer
352    $self->io->_io_cleanup();
353    my ($infilename,$type) = $self->_setinput($input);
354    if (! $infilename) {
355        $self->throw("Bad input data or less than 2 sequences in $input !");
356    }
357
358    my ($param_string,$outstr) = $self->_setparams();
359
360    # run mafft
361    return $self->_run($infilename, $param_string,$outstr);
362}
363
364=head2  _run
365
366 Title   :  _run
367 Usage   :  Internal function, not to be called directly
368 Function:  makes actual system call to tcoffee program
369 Example :
370 Returns : nothing; tcoffee output is written to a
371           temporary file OR specified output file
372 Args    : Name of a file containing a set of unaligned fasta sequences
373           and hash of parameters to be passed to tcoffee
374
375
376=cut
377
378sub _run {
379    my ($self,$infilename,$paramstr,$outstr) = @_;
380     my $commandstring = $self->executable()." $paramstr $infilename $outstr";
381
382    $self->debug( "mafft command = $commandstring \n");
383
384    my $status = system($commandstring);
385    my $outfile = $self->outfile();
386    if( !-e $outfile || -z $outfile ) {
387        $self->warn( "MAFFT call crashed: $? [command $commandstring]\n");
388        return;
389    }
390
391    my $in  = Bio::AlignIO->new('-file' => $outfile,
392                                '-format' => $self->output);
393    my $aln = $in->next_aln();
394    return $aln;
395}
396
397
398=head2  _setinput
399
400 Title   :  _setinput
401 Usage   :  Internal function, not to be called directly
402 Function:  Create input file for mafft programs
403 Example :
404 Returns : name of file containing mafft data input
405 Args    : Seq or Align object reference or input file name
406
407
408=cut
409
410sub _setinput {
411    my ($self,$input) = @_;
412    my ($infilename, $seq, $temp, $tfh);
413    if (! ref $input) {
414        # check that file exists or throw
415        $infilename = $input;
416        unless (-e $input) {return 0;}
417        return ($infilename);
418    } elsif (ref($input) =~ /ARRAY/i ) { #  $input may be an
419                                         #  array of BioSeq objects...
420        #  Open temporary file for both reading & writing of array
421        ($tfh,$infilename) = $self->io->tempfile();
422        if( ! ref($input->[0]) ) {
423            $self->warn("passed an array ref which did not contain objects to _setinput");
424            return;
425        } elsif ( $input->[0]->isa('Bio::PrimarySeqI') ) {
426            $temp =  Bio::SeqIO->new('-fh' => $tfh,
427                                     '-format' => 'fasta');
428            my $ct = 1;
429            foreach $seq (@$input) {
430                return 0 unless ( ref($seq) &&
431                                  $seq->isa("Bio::PrimarySeqI") );
432                if( ! defined $seq->display_id ||
433                    $seq->display_id =~ /^\s+$/
434                    ) {
435                    $seq->display_id( "Seq".$ct++);
436                }
437                $temp->write_seq($seq);
438            }
439            $temp->close();
440            undef $temp;
441            close($tfh);
442            $tfh = undef;
443        }  else {
444            $self->warn( "got an array ref with 1st entry ".
445                         $input->[0].
446                         " and don't know what to do with it\n");
447        }
448
449        return ($infilename);
450    } else {
451        $self->warn("Got $input and don't know what to do with it\n");
452    }
453    return 0;
454}
455
456
457=head2  _setparams
458
459 Title   :  _setparams
460 Usage   :  Internal function, not to be called directly
461 Function:  Create parameter inputs for mafft program
462 Example :
463 Returns : parameter string to be passed to mafft program
464 Args    : name of calling object
465
466=cut
467
468sub _setparams {
469    my ($self) = @_;
470    my ($outfile,$param_string) = ('','');
471
472    # Set default output file if no explicit output file selected
473    unless (defined($outfile = $self->outfile) ) {
474        my $tfh;
475        ($tfh, $outfile) = $self->io->tempfile(-dir=>$self->tempdir());
476        close($tfh);
477        undef $tfh;
478        $self->outfile($outfile);
479    }
480    my ($attr,$value);
481
482    if ( $self->_version6 ) {
483        for  $attr ( @MAFFT6_SWITCHES) {
484            $value = $self->$attr();
485            next unless defined $value;
486            my $attr_key = lc $attr; #put switches in format expected by mafft
487            $attr_key = ' --'.$attr_key;
488            $param_string .= $attr_key ;
489        }
490        for  $attr ( @MAFFT6_PARAMS ) {
491            $value = $self->$attr();
492            next unless (defined $value);
493            my $attr_key = lc $attr;
494            $attr_key = ' --'.$attr_key;
495            $param_string .= $attr_key .' '.$value;
496        }
497        if ( ! $self->no_param_checks ) {
498            my @incompatible = qw/auto 6merpair globalpair localpair genafpair
499                fastapair/;
500            my @set = grep { $self->$_ } @incompatible;
501            if ( @set > 1 ) {
502                $self->throw("You can't specify more than one of @set");
503            }
504        }
505    }
506    else {
507        for  $attr ( @MAFFT4_SWITCHES) {
508            $value = $self->$attr();
509            next unless defined $value;
510            my $attr_key = lc $attr; #put switches in format expected by mafft
511            $attr_key = ' --'.$attr_key;
512            $param_string .= $attr_key ;
513        }
514        # Method is a version 4 option
515        my $method = $self->method;
516        $self->throw("no method ") unless defined $method;
517        if( $method !~ /(rough|nsi)$/ &&
518            defined $self->cycles) {
519            $param_string .= " ".$self->cycles;
520        }
521    }
522
523    my $outputstr = " 1>$outfile" ;
524
525    if ($self->quiet() || $self->verbose < 0) {
526        $param_string .= " --quiet";
527        my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null';
528        $outputstr .= " 2> $null";
529    }
530    return ($param_string, $outputstr);
531}
532
533=head2 methods
534
535 Title   : methods
536 Usage   : my @methods = $self->methods()
537 Function: Get/Set Alignment methods - NOT VALIDATED
538 Returns : array of strings
539 Args    : arrayref of strings
540
541
542=cut
543
544sub methods {
545   my ($self) = shift;
546   return @MAFFT_ALN_METHODS;
547}
548
549
550=head2  _version6
551
552 Title   : _version6
553 Usage   : Internal function, not to be called directly
554 Function: Check if the version of MAFFT is 6
555 Example :
556 Returns : Boolean
557 Args    : None
558
559=cut
560
561sub _version6 {
562    my $self = shift;
563    if ( ! defined $self->{_version6} ) {
564        my $version = $self->version || '';
565        if ( $version =~ /^v6/ ) {
566            $self->{_version6} = 1;
567        }
568        else {
569            $self->{_version6} = '';
570        }
571    }
572    return $self->{_version6};
573}
574
575
576=head1 Bio::Tools::Run::BaseWrapper methods
577
578=cut
579
580=head2 no_param_checks
581
582 Title   : no_param_checks
583 Usage   : $obj->no_param_checks($newval)
584 Function: Boolean flag as to whether or not we should
585           trust the sanity checks for parameter values
586 Returns : value of no_param_checks
587 Args    : newvalue (optional)
588
589
590=cut
591
592=head2 save_tempfiles
593
594 Title   : save_tempfiles
595 Usage   : $obj->save_tempfiles($newval)
596 Function:
597 Returns : value of save_tempfiles
598 Args    : newvalue (optional)
599
600
601=cut
602
603=head2 outfile_name
604
605 Title   : outfile_name
606 Usage   : my $outfile = $mafft->outfile_name();
607 Function: Get/Set the name of the output file for this run
608           (if you wanted to do something special)
609 Returns : string
610 Args    : [optional] string to set value to
611
612
613=cut
614
615
616=head2 tempdir
617
618 Title   : tempdir
619 Usage   : my $tmpdir = $self->tempdir();
620 Function: Retrieve a temporary directory name (which is created)
621 Returns : string which is the name of the temporary directory
622 Args    : none
623
624
625=cut
626
627=head2 cleanup
628
629 Title   : cleanup
630 Usage   : $mafft->cleanup();
631 Function: Will cleanup the tempdir directory
632 Returns : none
633 Args    : none
634
635
636=cut
637
638=head2 io
639
640 Title   : io
641 Usage   : $obj->io($newval)
642 Function:  Gets a L<Bio::Root::IO> object
643 Returns : L<Bio::Root::IO>
644 Args    : none
645
646
647=cut
648
6491; # Needed to keep compiler happy
650