1# 2# BioPerl module for Bio::Tools::Run::Alignment::MAFFT 3# 4# Please direct questions and support issues to <bioperl-l@bioperl.org> 5# 6# Cared for by Jason Stajich 7# 8# Copyright Jason Stajich 9# 10# You may distribute this module under the same terms as perl itself 11# 12# POD documentation - main docs before the code 13 14=head1 NAME 15 16Bio::Tools::Run::Alignment::MAFFT - run the MAFFT alignment tools 17 18=head1 SYNOPSIS 19 20 # Build a MAFFT alignment factory 21 $factory = Bio::Tools::Run::Alignment::MAFFT->new(@params); 22 23 # Pass the factory a list of sequences to be aligned. 24 $inputfilename = 't/cysprot.fa'; 25 # $aln is a SimpleAlign object. 26 $aln = $factory->align($inputfilename); 27 28 # or where @seq_array is an array of Bio::Seq objects 29 $seq_array_ref = \@seq_array; 30 $aln = $factory->align($seq_array_ref); 31 32 #There are various additional options available. 33 34=head1 DESCRIPTION 35 36You can get MAFFT from L<http://mafft.cbrc.jp/alignment/software/>. 37"fftnsi" is the default method for Mafft version 4 in this 38implementation. 39 40See Bio::Tools::Run::Alignment::Clustalw for a description on how to 41specify parameters to the underlying alignment program. See the MAFFT 42manual page for a description of the MAFFT parameters. 43 44=head1 FEEDBACK 45 46=head2 Mailing Lists 47 48User feedback is an integral part of the evolution of this and other 49Bioperl modules. Send your comments and suggestions preferably to one 50of the Bioperl mailing lists. Your participation is much appreciated. 51 52 bioperl-l@bioperl.org - General discussion 53 http://bioperl.org/MailList.html - About the mailing lists 54 55=head2 Support 56 57Please direct usage questions or support issues to the mailing list: 58 59I<bioperl-l@bioperl.org> 60 61rather than to the module maintainer directly. Many experienced and 62reponsive experts will be able look at the problem and quickly 63address it. Please include a thorough description of the problem 64with code and data examples if at all possible. 65 66=head2 Reporting Bugs 67 68Report bugs to the Bioperl bug tracking system to help us keep track 69the bugs and their resolution. Bug reports can be submitted via the web: 70 71 http://redmine.open-bio.org/projects/bioperl/ 72 73=head1 AUTHOR - Jason Stajich 74 75Email jason-at-bioperl.org 76 77=head1 APPENDIX 78 79The rest of the documentation details each of the object 80methods. Internal methods are usually preceded with a _ 81 82=cut 83 84package Bio::Tools::Run::Alignment::MAFFT; 85 86use vars qw($AUTOLOAD @ISA $PROGRAMNAME $PROGRAM %DEFAULTS 87 @MAFFT4_PARAMS @MAFFT4_SWITCHES @OTHER_SWITCHES %OK_FIELD 88 @MAFFT_ALN_METHODS @MAFFT6_PARAMS @MAFFT6_SWITCHES %OK_FIELD6 89 ); 90use strict; 91use Bio::Seq; 92use Bio::SeqIO; 93use Bio::SimpleAlign; 94use Bio::AlignIO; 95 96use base qw(Bio::Root::Root Bio::Tools::Run::WrapperBase 97 Bio::Factory::ApplicationFactoryI); 98 99BEGIN { 100 %DEFAULTS = ( 'OUTPUT' => 'fasta', 101 'METHOD' => 'fftnsi', 102 'CYCLES' => 2); 103 @MAFFT4_PARAMS =qw( METHOD CYCLES ); 104 @MAFFT4_SWITCHES = qw( NJ ALL_POSITIVE); 105 106 # NB: Mafft6 options are case-sensitive (eg. --lop and --LOP is different) 107 @MAFFT6_PARAMS = qw( weighti retree maxiterate partsize groupsize 108 op ep lop lep lexp LOP LEXP bl jtt tm aamatrix fmodel seed ); 109 @MAFFT6_SWITCHES = qw( auto 6merpair globalpair localpair genafpair 110 fastapair fft nofft noscore memsave parttree dpparttree fastaparttree 111 clustalout inputorder reorder treeout nuc amino 112 ); 113 114 @OTHER_SWITCHES = qw(QUIET ALIGN OUTPUT OUTFILE); 115 @MAFFT_ALN_METHODS = qw(fftnsi fftns nwnsi nwns fftnsrough nwnsrough); 116 #@MAFFT6_ALN_METHODS = qw(linsi ginsi einsi fftnsi fftns nwnsi nwns) 117 # Authorize attribute fields 118 foreach my $attr ( @MAFFT4_SWITCHES,@MAFFT4_PARAMS,@OTHER_SWITCHES ) { 119 $OK_FIELD{$attr}++; 120 } 121 foreach my $attr ( @MAFFT6_PARAMS, @MAFFT6_SWITCHES ) { 122 $OK_FIELD6{$attr}++ 123 } 124} 125 126=head2 program_name 127 128 Title : program_name 129 Usage : $factory->program_name() 130 Function: holds the program name 131 Returns: string 132 Args : None 133 134=cut 135 136sub program_name { 137 return 'mafft'; 138} 139 140=head2 executable 141 142 Title : executable 143 Usage : my $exe = $blastfactory->executable('blastall'); 144 Function: Finds the full path to the 'codeml' executable 145 Returns : string representing the full path to the exe 146 Args : [optional] name of executable to set path to 147 [optional] boolean flag whether or not warn when exe is not found 148 149 150=cut 151 152sub executable { 153 my ($self, $exename, $exe,$warn) = @_; 154 $exename = $self->program_name unless (defined $exename ); 155 156 if( defined $exe && -x $exe ) { 157 $self->{'_pathtoexe'}->{$exename} = $exe; 158 } 159 unless( defined $self->{'_pathtoexe'}->{$exename} ) { 160 my $f = $self->program_path($exename); 161 $exe = $self->{'_pathtoexe'}->{$exename} = $f if(-e $f && -x $f ); 162 163 # This is how I meant to split up these conditionals --jason 164 # if exe is null we will execute this (handle the case where 165 # PROGRAMDIR pointed to something invalid) 166 unless( $exe ) { # we didn't find it in that last conditional 167 if( ($exe = $self->io->exists_exe($exename)) && -x $exe ) { 168 $self->{'_pathtoexe'}->{$exename} = $exe; 169 } else { 170 $self->warn("Cannot find executable for $exename") if $warn; 171 $self->{'_pathtoexe'}->{$exename} = undef; 172 } 173 } 174 } 175 return $self->{'_pathtoexe'}->{$exename}; 176} 177 178 179=head2 program_path 180 181 Title : program_path 182 Usage : my $path = $factory->program_path(); 183 Function: Builds path for executable 184 Returns : string representing the full path to the exe 185 Args : none 186 187=cut 188 189sub program_path { 190 my ($self,$program_name) = @_; 191 my @path; 192 push @path, $self->program_dir if $self->program_dir; 193 push @path, $program_name .($^O =~ /mswin/i ?'.exe':''); 194 195 return Bio::Root::IO->catfile(@path); 196} 197 198=head2 program_dir 199 200 Title : program_dir 201 Usage : $factory->program_dir(@params) 202 Function: returns the program directory, obtained from ENV variable. 203 Returns: string 204 Args : 205 206=cut 207 208sub program_dir { 209 return File::Spec->rel2abs($ENV{MAFFTDIR}) if $ENV{MAFFTDIR}; 210} 211 212sub new { 213 my ($class,@args) = @_; 214 my $self = $class->SUPER::new(@args); 215 my ($attr, $value); 216 217 while (@args) { 218 $attr = shift @args; 219 $value = shift @args; 220 next if( $attr =~ /^-/); # don't want named parameters 221 $self->$attr($value); 222 } 223 224 $self->output($DEFAULTS{'OUTPUT'}) unless( $self->output ); 225 if ( ! $self->_version6 ) { 226 $self->method($DEFAULTS{'METHOD'}) unless( $self->method ); 227 } 228 return $self; 229} 230 231sub AUTOLOAD { 232 my $self = shift; 233 my $attr = $AUTOLOAD; 234 $attr =~ s/.*:://; 235 236 # NB: Mafft6 options are case-sensitive 237 if ( $self->_version6 ) { 238 if ( $OK_FIELD6{ $attr } ) { 239 # Don't want the attrs to clash with bioperl attributes 240 $self->{version6attrs}{$attr} = shift if @_; 241 return $self->{version6attrs}{$attr}; 242 } 243 } 244 $attr = uc $attr; 245 # aliasing 246 $attr = 'OUTFILE' if $attr eq 'OUTFILE_NAME'; 247 $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; 248 249 $self->{$attr} = shift if @_; 250 return $self->{$attr}; 251} 252 253=head2 error_string 254 255 Title : error_string 256 Usage : $obj->error_string($newval) 257 Function: Where the output from the last analysis run is stored. 258 Returns : value of error_string 259 Args : newvalue (optional) 260 261 262=cut 263 264sub error_string{ 265 my ($self,$value) = @_; 266 if( defined $value) { 267 $self->{'error_string'} = $value; 268 } 269 return $self->{'error_string'}; 270 271} 272 273=head2 version 274 275 Title : version 276 Usage : exit if $prog->version() < 1.8 277 Function: Determine the version number of the program 278 Example : 279 Returns : float or undef 280 Args : none 281 282=cut 283 284sub version { 285 my ($self) = @_; 286 my $exe; 287 return unless $exe = $self->executable; 288 # this is a bit of a hack, but MAFFT is just a gawk script 289 # so we are actually grepping the scriptfile 290 # UPDATE (Torsten Seemann) 291 # it now seems to be a 'sh' script and the format has changed 292 # slightly. i've tried to make the change compatible with both... 293 # version="v5.860 (2006/06/12)"; export version 294 295 if( open(my $NAME, "grep 'export version' $exe | ") ) { 296 while(<$NAME>) { 297 if( /version.*?([\d.a-z]+)\s+/ ) { 298 return $1; 299 } 300 } 301 $self->warn("No version found"); 302 close($NAME); 303 } else { 304 $self->warn("$!"); 305 } 306 return; 307} 308 309=head2 run 310 311 Title : run 312 Usage : my $output = $application->run(\@seqs); 313 Function: Generic run of an application 314 Returns : Bio::SimpleAlign object 315 Args : array ref of Bio::PrimarySeqI objects OR 316 filename of sequences to run with 317 318=cut 319 320sub run { 321 my ($self,$seqs) = @_; 322 return $self->align($seqs); 323} 324 325=head2 align 326 327 Title : align 328 Usage : 329 $inputfilename = 't/data/cysprot.fa'; 330 $aln = $factory->align($inputfilename); 331or 332 $seq_array_ref = \@seq_array; 333 # @seq_array is an array of Seq objs 334 $aln = $factory->align($seq_array_ref); 335 Function: Perform a multiple sequence alignment 336 Returns : Reference to a SimpleAlign object containing the 337 sequence alignment. 338 Args : Name of a file containing a set of unaligned fasta sequences 339 or else an array of references to Bio::Seq objects. 340 341 Throws an exception if argument is not either a string (eg a 342 filename) or a reference to an array of Bio::Seq objects. If 343 argument is string, throws exception if file corresponding to string 344 name can not be found. If argument is Bio::Seq array, throws 345 exception if less than two sequence objects are in array. 346 347=cut 348 349sub align { 350 my ($self,$input) = @_; 351 # Create input file pointer 352 $self->io->_io_cleanup(); 353 my ($infilename,$type) = $self->_setinput($input); 354 if (! $infilename) { 355 $self->throw("Bad input data or less than 2 sequences in $input !"); 356 } 357 358 my ($param_string,$outstr) = $self->_setparams(); 359 360 # run mafft 361 return $self->_run($infilename, $param_string,$outstr); 362} 363 364=head2 _run 365 366 Title : _run 367 Usage : Internal function, not to be called directly 368 Function: makes actual system call to tcoffee program 369 Example : 370 Returns : nothing; tcoffee output is written to a 371 temporary file OR specified output file 372 Args : Name of a file containing a set of unaligned fasta sequences 373 and hash of parameters to be passed to tcoffee 374 375 376=cut 377 378sub _run { 379 my ($self,$infilename,$paramstr,$outstr) = @_; 380 my $commandstring = $self->executable()." $paramstr $infilename $outstr"; 381 382 $self->debug( "mafft command = $commandstring \n"); 383 384 my $status = system($commandstring); 385 my $outfile = $self->outfile(); 386 if( !-e $outfile || -z $outfile ) { 387 $self->warn( "MAFFT call crashed: $? [command $commandstring]\n"); 388 return; 389 } 390 391 my $in = Bio::AlignIO->new('-file' => $outfile, 392 '-format' => $self->output); 393 my $aln = $in->next_aln(); 394 return $aln; 395} 396 397 398=head2 _setinput 399 400 Title : _setinput 401 Usage : Internal function, not to be called directly 402 Function: Create input file for mafft programs 403 Example : 404 Returns : name of file containing mafft data input 405 Args : Seq or Align object reference or input file name 406 407 408=cut 409 410sub _setinput { 411 my ($self,$input) = @_; 412 my ($infilename, $seq, $temp, $tfh); 413 if (! ref $input) { 414 # check that file exists or throw 415 $infilename = $input; 416 unless (-e $input) {return 0;} 417 return ($infilename); 418 } elsif (ref($input) =~ /ARRAY/i ) { # $input may be an 419 # array of BioSeq objects... 420 # Open temporary file for both reading & writing of array 421 ($tfh,$infilename) = $self->io->tempfile(); 422 if( ! ref($input->[0]) ) { 423 $self->warn("passed an array ref which did not contain objects to _setinput"); 424 return; 425 } elsif ( $input->[0]->isa('Bio::PrimarySeqI') ) { 426 $temp = Bio::SeqIO->new('-fh' => $tfh, 427 '-format' => 'fasta'); 428 my $ct = 1; 429 foreach $seq (@$input) { 430 return 0 unless ( ref($seq) && 431 $seq->isa("Bio::PrimarySeqI") ); 432 if( ! defined $seq->display_id || 433 $seq->display_id =~ /^\s+$/ 434 ) { 435 $seq->display_id( "Seq".$ct++); 436 } 437 $temp->write_seq($seq); 438 } 439 $temp->close(); 440 undef $temp; 441 close($tfh); 442 $tfh = undef; 443 } else { 444 $self->warn( "got an array ref with 1st entry ". 445 $input->[0]. 446 " and don't know what to do with it\n"); 447 } 448 449 return ($infilename); 450 } else { 451 $self->warn("Got $input and don't know what to do with it\n"); 452 } 453 return 0; 454} 455 456 457=head2 _setparams 458 459 Title : _setparams 460 Usage : Internal function, not to be called directly 461 Function: Create parameter inputs for mafft program 462 Example : 463 Returns : parameter string to be passed to mafft program 464 Args : name of calling object 465 466=cut 467 468sub _setparams { 469 my ($self) = @_; 470 my ($outfile,$param_string) = ('',''); 471 472 # Set default output file if no explicit output file selected 473 unless (defined($outfile = $self->outfile) ) { 474 my $tfh; 475 ($tfh, $outfile) = $self->io->tempfile(-dir=>$self->tempdir()); 476 close($tfh); 477 undef $tfh; 478 $self->outfile($outfile); 479 } 480 my ($attr,$value); 481 482 if ( $self->_version6 ) { 483 for $attr ( @MAFFT6_SWITCHES) { 484 $value = $self->$attr(); 485 next unless defined $value; 486 my $attr_key = lc $attr; #put switches in format expected by mafft 487 $attr_key = ' --'.$attr_key; 488 $param_string .= $attr_key ; 489 } 490 for $attr ( @MAFFT6_PARAMS ) { 491 $value = $self->$attr(); 492 next unless (defined $value); 493 my $attr_key = lc $attr; 494 $attr_key = ' --'.$attr_key; 495 $param_string .= $attr_key .' '.$value; 496 } 497 if ( ! $self->no_param_checks ) { 498 my @incompatible = qw/auto 6merpair globalpair localpair genafpair 499 fastapair/; 500 my @set = grep { $self->$_ } @incompatible; 501 if ( @set > 1 ) { 502 $self->throw("You can't specify more than one of @set"); 503 } 504 } 505 } 506 else { 507 for $attr ( @MAFFT4_SWITCHES) { 508 $value = $self->$attr(); 509 next unless defined $value; 510 my $attr_key = lc $attr; #put switches in format expected by mafft 511 $attr_key = ' --'.$attr_key; 512 $param_string .= $attr_key ; 513 } 514 # Method is a version 4 option 515 my $method = $self->method; 516 $self->throw("no method ") unless defined $method; 517 if( $method !~ /(rough|nsi)$/ && 518 defined $self->cycles) { 519 $param_string .= " ".$self->cycles; 520 } 521 } 522 523 my $outputstr = " 1>$outfile" ; 524 525 if ($self->quiet() || $self->verbose < 0) { 526 $param_string .= " --quiet"; 527 my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; 528 $outputstr .= " 2> $null"; 529 } 530 return ($param_string, $outputstr); 531} 532 533=head2 methods 534 535 Title : methods 536 Usage : my @methods = $self->methods() 537 Function: Get/Set Alignment methods - NOT VALIDATED 538 Returns : array of strings 539 Args : arrayref of strings 540 541 542=cut 543 544sub methods { 545 my ($self) = shift; 546 return @MAFFT_ALN_METHODS; 547} 548 549 550=head2 _version6 551 552 Title : _version6 553 Usage : Internal function, not to be called directly 554 Function: Check if the version of MAFFT is 6 555 Example : 556 Returns : Boolean 557 Args : None 558 559=cut 560 561sub _version6 { 562 my $self = shift; 563 if ( ! defined $self->{_version6} ) { 564 my $version = $self->version || ''; 565 if ( $version =~ /^v6/ ) { 566 $self->{_version6} = 1; 567 } 568 else { 569 $self->{_version6} = ''; 570 } 571 } 572 return $self->{_version6}; 573} 574 575 576=head1 Bio::Tools::Run::BaseWrapper methods 577 578=cut 579 580=head2 no_param_checks 581 582 Title : no_param_checks 583 Usage : $obj->no_param_checks($newval) 584 Function: Boolean flag as to whether or not we should 585 trust the sanity checks for parameter values 586 Returns : value of no_param_checks 587 Args : newvalue (optional) 588 589 590=cut 591 592=head2 save_tempfiles 593 594 Title : save_tempfiles 595 Usage : $obj->save_tempfiles($newval) 596 Function: 597 Returns : value of save_tempfiles 598 Args : newvalue (optional) 599 600 601=cut 602 603=head2 outfile_name 604 605 Title : outfile_name 606 Usage : my $outfile = $mafft->outfile_name(); 607 Function: Get/Set the name of the output file for this run 608 (if you wanted to do something special) 609 Returns : string 610 Args : [optional] string to set value to 611 612 613=cut 614 615 616=head2 tempdir 617 618 Title : tempdir 619 Usage : my $tmpdir = $self->tempdir(); 620 Function: Retrieve a temporary directory name (which is created) 621 Returns : string which is the name of the temporary directory 622 Args : none 623 624 625=cut 626 627=head2 cleanup 628 629 Title : cleanup 630 Usage : $mafft->cleanup(); 631 Function: Will cleanup the tempdir directory 632 Returns : none 633 Args : none 634 635 636=cut 637 638=head2 io 639 640 Title : io 641 Usage : $obj->io($newval) 642 Function: Gets a L<Bio::Root::IO> object 643 Returns : L<Bio::Root::IO> 644 Args : none 645 646 647=cut 648 6491; # Needed to keep compiler happy 650