1#
2# BioPerl module for Bio::Tools::Run::Alignment::Kalign
3#
4# Please direct questions and support issues to <bioperl-l@bioperl.org>
5#
6# Cared for by Albert Vilella
7#
8# Copyright Albert Vilella
9#
10# You may distribute this module under the same terms as perl itself
11#
12# POD documentation - main docs before the code
13
14=head1 NAME
15
16Bio::Tools::Run::Alignment::Kalign - Object for the calculation of an
17iterative multiple sequence alignment from a set of unaligned
18sequences or alignments using the KALIGN program
19
20=head1 SYNOPSIS
21
22  # Build a kalign alignment factory
23  $factory = Bio::Tools::Run::Alignment::Kalign->new(@params);
24
25  # Pass the factory a list of sequences to be aligned.
26  $inputfilename = 't/cysprot.fa';
27  # $aln is a SimpleAlign object.
28  $aln = $factory->align($inputfilename);
29
30  # or where @seq_array is an array of Bio::Seq objects
31  $seq_array_ref = \@seq_array;
32  $aln = $factory->align($seq_array_ref);
33
34  # Or one can pass the factory a pair of (sub)alignments
35  #to be aligned against each other, e.g.:
36
37  #There are various additional options and input formats available.
38  #See the DESCRIPTION section that follows for additional details.
39
40=head1 DESCRIPTION
41
42Please cite:
43
44        Timo Lassmann and Erik L.L. Sonnhammer (2005)
45        Kalign - an accurate and fast multiple sequence alignment algorithm.
46        BMC Bioinformatics 6:298
47
48http://msa.cgb.ki.se/downloads/kalign/current.tar.gz
49
50
51=head2 Helping the module find your executable
52
53You will need to enable Kalign to find the kalign program. This can be
54done in (at least) three ways:
55
56  1. Make sure the kalign executable is in your path (i.e.
57     'which kalign' returns a valid program
58  2. define an environmental variable KALIGNDIR which points to a
59     directory containing the 'kalign' app:
60   In bash
61	export KALIGNDIR=/home/progs/kalign   or
62   In csh/tcsh
63        setenv KALIGNDIR /home/progs/kalign
64
65  3. include a definition of an environmental variable KALIGNDIR
66      in every script that will
67     BEGIN {$ENV{KALIGNDIR} = '/home/progs/kalign'; }
68     use Bio::Tools::Run::Alignment::Kalign;
69
70=head1 FEEDBACK
71
72=head2 Mailing Lists
73
74User feedback is an integral part of the evolution of this and other
75Bioperl modules. Send your comments and suggestions preferably to one
76of the Bioperl mailing lists.  Your participation is much appreciated.
77
78  bioperl-l@bioperl.org                  - General discussion
79  http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
80
81=head2 Support
82
83Please direct usage questions or support issues to the mailing list:
84
85I<bioperl-l@bioperl.org>
86
87rather than to the module maintainer directly. Many experienced and
88reponsive experts will be able look at the problem and quickly
89address it. Please include a thorough description of the problem
90with code and data examples if at all possible.
91
92=head2 Reporting Bugs
93
94Report bugs to the Bioperl bug tracking system to help us keep track
95the bugs and their resolution.  Bug reports can be submitted via the web:
96
97 http://redmine.open-bio.org/projects/bioperl/
98
99=head1 AUTHOR -  Albert Vilella
100
101Email idontlikespam@hotmail.com
102
103=head1 APPENDIX
104
105The rest of the documentation details each of the object
106methods. Internal methods are usually preceded with a _
107
108=cut
109
110package Bio::Tools::Run::Alignment::Kalign;
111
112use vars qw($AUTOLOAD @ISA $PROGRAMNAME $PROGRAM %DEFAULTS
113            @KALIGN_PARAMS @KALIGN_SWITCHES %OK_FIELD
114            );
115use strict;
116use Bio::Seq;
117use Bio::SeqIO;
118use Bio::SimpleAlign;
119use Bio::AlignIO;
120use Bio::Root::Root;
121use Bio::Root::IO;
122use Bio::Factory::ApplicationFactoryI;
123use  Bio::Tools::Run::WrapperBase;
124@ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase
125          Bio::Factory::ApplicationFactoryI);
126
127
128BEGIN {
129    %DEFAULTS = ( 'AFORMAT' => 'fasta' );
130    @KALIGN_PARAMS = qw(IN OUT GAPOPEN GAPEXTENSION TERMINAL_GAP_EXTENSION_PENALTY MATRIX_BONUS
131                        SORT FEATURE DISTANCE TREE ZCUTOFF FORMAT
132			MAXMB MAXHOURS MAXITERS);
133    @KALIGN_SWITCHES = qw(QUIET);
134
135# Authorize attribute fields
136    foreach my $attr ( @KALIGN_PARAMS, @KALIGN_SWITCHES ) {
137	$OK_FIELD{$attr}++; }
138}
139
140=head2 program_name
141
142 Title   : program_name
143 Usage   : $factory->program_name()
144 Function: holds the program name
145 Returns:  string
146 Args    : None
147
148=cut
149
150sub program_name {
151        return 'kalign';
152}
153
154=head2 program_dir
155
156 Title   : program_dir
157 Usage   : $factory->program_dir(@params)
158 Function: returns the program directory, obtained from ENV variable.
159 Returns:  string
160 Args    :
161
162=cut
163
164sub program_dir {
165        return Bio::Root::IO->catfile($ENV{KALIGNDIR}) if $ENV{KALIGNDIR};
166}
167
168=head2 new
169
170 Title   : new
171 Usage   : my $kalign = Bio::Tools::Run::Alignment::Kalign->new();
172 Function: Constructor
173 Returns : Bio::Tools::Run::Alignment::Kalign
174 Args    : -outfile_name => $outname
175
176
177=cut
178
179sub new {
180    my ($class,@args) = @_;
181    my( @kalign_args, @obj_args);
182    while( my $arg = shift @args ) {
183	if( $arg =~ /^-/ ) {
184	    push @obj_args, $arg, shift @args;
185	} else {
186	    push @kalign_args,$arg, shift @args;
187	}
188    }
189    my $self = $class->SUPER::new(@obj_args);
190
191    my ($on) = $self->_rearrange([qw(OUTFILE_NAME)],@obj_args);
192
193    $self->outfile_name($on || '');
194    my ($attr, $value);
195    # FIXME: only tested with fasta output format right now...
196    $self->aformat($DEFAULTS{'AFORMAT'});
197
198    while ( @kalign_args)  {
199	$attr =   shift @kalign_args;
200	$value =  shift @kalign_args;
201	next if( $attr =~ /^-/); # don't want named parameters
202	$self->$attr($value);
203    }
204
205    if( defined $self->out ) {
206	$self->outfile_name($self->out);
207    }
208    return $self;
209}
210
211sub AUTOLOAD {
212    my $self = shift;
213    my $attr = $AUTOLOAD;
214    $attr =~ s/.*:://;
215    $attr = uc $attr;
216    # aliasing
217    $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr};
218
219    $self->{$attr} = shift if @_;
220    return $self->{$attr};
221}
222
223=head2 error_string
224
225 Title   : error_string
226 Usage   : $obj->error_string($newval)
227 Function: Where the output from the last analysus run is stored.
228 Returns : value of error_string
229 Args    : newvalue (optional)
230
231
232=cut
233
234sub error_string{
235   my ($self,$value) = @_;
236   if( defined $value) {
237      $self->{'error_string'} = $value;
238    }
239    return $self->{'error_string'};
240
241}
242
243=head2  version
244
245 Title   : version
246 Usage   : exit if $prog->version() < 2
247 Function: Determine the version number of the program
248 Example :
249 Returns : float or undef
250 Args    : none
251
252=cut
253
254sub version {
255    my ($self) = @_;
256    my $exe;
257    # Kalign version 2.01, Copyright (C) 2004, 2005, 2006 Timo Lassmann
258    return undef unless $exe = $self->executable;
259    my $string = `$exe 2>&1` ;
260    $string =~ /Kalign\s+version\s+(\d+\.\d+)/m;
261    return $1 || undef;
262}
263
264=head2 run
265
266 Title   : run
267 Usage   : my $output = $application->run(\@seqs);
268 Function: Generic run of an application
269 Returns : Bio::SimpleAlign object
270 Args    : Arrayref of Bio::PrimarySeqI objects or
271           a filename to run on
272
273=cut
274
275sub run {
276    my $self = shift;
277    return $self->align(shift);
278}
279
280=head2  align
281
282 Title   : align
283 Usage   :
284	$inputfilename = 't/data/cysprot.fa';
285	$aln = $factory->align($inputfilename);
286or
287	$seq_array_ref = \@seq_array;
288        # @seq_array is array of Seq objs
289	$aln = $factory->align($seq_array_ref);
290 Function: Perform a multiple sequence alignment
291 Returns : Reference to a SimpleAlign object containing the
292           sequence alignment.
293 Args    : Name of a file containing a set of unaligned fasta sequences
294           or else an array of references to Bio::Seq objects.
295
296 Throws an exception if argument is not either a string (eg a
297 filename) or a reference to an array of Bio::Seq objects.  If
298 argument is string, throws exception if file corresponding to string
299 name can not be found. If argument is Bio::Seq array, throws
300 exception if less than two sequence objects are in array.
301
302=cut
303
304sub align {
305    my ($self,$input) = @_;
306    # Create input file pointer
307    $self->io->_io_cleanup();
308    my $infilename;
309    if( defined $input ) {
310	$infilename = $self->_setinput($input);
311    } elsif( defined $self->in ) {
312	$infilename = $self->_setinput($self->in);
313    } else {
314	$self->throw("No inputdata provided\n");
315    }
316    if (! $infilename) {
317	$self->throw("Bad input data or less than 2 sequences in $input !");
318    }
319
320    my $param_string = $self->_setparams();
321
322    # run kalign
323    return &_run($self, $infilename, $param_string);
324}
325
326=head2  _run
327
328 Title   :  _run
329 Usage   :  Internal function, not to be called directly
330 Function:  makes actual system call to kalign program
331 Example :
332 Returns : nothing; kalign output is written to a
333           temporary file OR specified output file
334 Args    : Name of a file containing a set of unaligned fasta sequences
335           and hash of parameters to be passed to kalign
336
337
338=cut
339
340sub _run {
341    my ($self,$infilename,$params) = @_;
342    my $commandstring = $self->executable." -in $infilename $params";
343
344    $self->debug( "kalign command = $commandstring \n");
345
346    my $status = system($commandstring);
347    my $outfile = $self->outfile_name();
348    if( !-e $outfile || -z $outfile ) {
349	$self->warn( "Kalign call crashed: $? [command $commandstring]\n");
350	return undef;
351    }
352
353    my $in  = Bio::AlignIO->new('-file'   => $outfile,
354				'-format' => $self->aformat);
355    my $aln = $in->next_aln();
356    return $aln;
357}
358
359
360=head2  _setinput
361
362 Title   :  _setinput
363 Usage   :  Internal function, not to be called directly
364 Function:  Create input file for kalign program
365 Example :
366 Returns : name of file containing kalign data input AND
367 Args    : Arrayref of Seqs or input file name
368
369
370=cut
371
372sub _setinput {
373    my ($self,$input) = @_;
374    my ($infilename, $seq, $temp, $tfh);
375    if (! ref $input) {
376	# check that file exists or throw
377	$infilename = $input;
378	unless (-e $input) {return 0;}
379	# let's peek and guess
380	open(IN,$infilename) || $self->throw("Cannot open $infilename");
381	my $header;
382	while( defined ($header = <IN>) ) {
383	    last if $header !~ /^\s+$/;
384	}
385	close(IN);
386	if ( $header !~ /^>\s*\S+/ ){
387	    $self->throw("Need to provide a FASTA format file to kalign!");
388	}
389	return ($infilename);
390    } elsif (ref($input) =~ /ARRAY/i ) { #  $input may be an
391	#  array of BioSeq objects...
392        #  Open temporary file for both reading & writing of array
393	($tfh,$infilename) = $self->io->tempfile();
394	if( ! ref($input->[0]) ) {
395	    $self->warn("passed an array ref which did not contain objects to _setinput");
396	    return undef;
397	} elsif( $input->[0]->isa('Bio::PrimarySeqI') ) {
398	    $temp =  Bio::SeqIO->new('-fh' => $tfh,
399				     '-format' => 'fasta');
400	    my $ct = 1;
401	    foreach $seq (@$input) {
402		return 0 unless ( ref($seq) &&
403				  $seq->isa("Bio::PrimarySeqI") );
404		if( ! defined $seq->display_id ||
405		    $seq->display_id =~ /^\s+$/) {
406		    $seq->display_id( "Seq".$ct++);
407		}
408		$temp->write_seq($seq);
409	    }
410	    $temp->close();
411	    undef $temp;
412	    close($tfh);
413	    $tfh = undef;
414	} else {
415	    $self->warn( "got an array ref with 1st entry ".
416			 $input->[0].
417			 " and don't know what to do with it\n");
418	}
419	return ($infilename);
420    } else {
421	$self->warn("Got $input and don't know what to do with it\n");
422    }
423    return 0;
424}
425
426
427=head2  _setparams
428
429 Title   :  _setparams
430 Usage   :  Internal function, not to be called directly
431 Function:  Create parameter inputs for kalign program
432 Example :
433 Returns : parameter string to be passed to kalign
434           during align or profile_align
435 Args    : name of calling object
436
437=cut
438
439sub _setparams {
440    my ($self) = @_;
441    my ($attr, $value,$param_string);
442    $param_string = '';
443    my $laststr;
444    for  $attr ( @KALIGN_PARAMS ) {
445	$value = $self->$attr();
446	next unless (defined $value);
447	my $attr_key = lc $attr;
448        $attr_key = ' -'.$attr_key;
449        $param_string .= $attr_key .' '.$value;
450
451    }
452    for  $attr ( @KALIGN_SWITCHES) {
453 	$value = $self->$attr();
454 	next unless ($value);
455 	my $attr_key = lc $attr; #put switches in format expected by tcoffee
456 	$attr_key = ' -'.$attr_key;
457 	$param_string .= $attr_key ;
458    }
459
460    # Set default output file if no explicit output file selected
461    unless ($self->outfile_name ) {
462	my ($tfh, $outfile) = $self->io->tempfile(-dir=>$self->tempdir());
463	close($tfh);
464	undef $tfh;
465	$self->outfile_name($outfile);
466    }
467    $param_string .= " -out ".$self->outfile_name;
468
469    if ($self->quiet() || $self->verbose < 0) {
470	my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null';
471	$param_string .= " 2> $null";
472    }
473    return $param_string;
474}
475
476=head2 aformat
477
478 Title   : aformat
479 Usage   : my $alignmentformat = $self->aformat();
480 Function: Get/Set alignment format
481 Returns : string
482 Args    : string
483
484
485=cut
486
487sub aformat{
488    my $self = shift;
489    $self->{'_aformat'} = shift if @_;
490    return $self->{'_aformat'};
491}
492
493=head1 Bio::Tools::Run::BaseWrapper methods
494
495=cut
496
497=head2 no_param_checks
498
499 Title   : no_param_checks
500 Usage   : $obj->no_param_checks($newval)
501 Function: Boolean flag as to whether or not we should
502           trust the sanity checks for parameter values
503 Returns : value of no_param_checks
504 Args    : newvalue (optional)
505
506
507=cut
508
509=head2 save_tempfiles
510
511 Title   : save_tempfiles
512 Usage   : $obj->save_tempfiles($newval)
513 Function:
514 Returns : value of save_tempfiles
515 Args    : newvalue (optional)
516
517
518=cut
519
520=head2 outfile_name
521
522 Title   : outfile_name
523 Usage   : my $outfile = $kalign->outfile_name();
524 Function: Get/Set the name of the output file for this run
525           (if you wanted to do something special)
526 Returns : string
527 Args    : [optional] string to set value to
528
529
530=cut
531
532
533=head2 tempdir
534
535 Title   : tempdir
536 Usage   : my $tmpdir = $self->tempdir();
537 Function: Retrieve a temporary directory name (which is created)
538 Returns : string which is the name of the temporary directory
539 Args    : none
540
541
542=cut
543
544=head2 cleanup
545
546 Title   : cleanup
547 Usage   : $kalign->cleanup();
548 Function: Will cleanup the tempdir directory
549 Returns : none
550 Args    : none
551
552
553=cut
554
555=head2 io
556
557 Title   : io
558 Usage   : $obj->io($newval)
559 Function:  Gets a L<Bio::Root::IO> object
560 Returns : L<Bio::Root::IO>
561 Args    : none
562
563
564=cut
565
5661; # Needed to keep compiler happy
567