1#
2# BioPerl module for Bio::Tools::Run::Alignment::DBA
3#
4# Copyright Shawn Hoon
5#
6# You may distribute this module under the same terms as perl itself
7# POD documentation - main docs before the code
8
9=head1 NAME
10
11Bio::Tools::Run::Alignment::DBA - Object for the alignment of two
12sequences using the DNA Block Aligner program.
13
14=head1 SYNOPSIS
15
16  use Bio::Tools::Run::Alignment::DBA;
17
18  #  Build a dba alignment factory
19  my @params = ('matchA' => 0.75,
20                 'matchB' => '0.55',
21                 'dymem' =>'linear');
22  my $factory = Bio::Tools::Run::Alignment::DBA->new(@params);
23
24  #  Pass the factory a filename with 2 sequences to be aligned.
25  $inputfilename = 't/data/dbaseq.fa';
26  # @hsps is an array of GenericHSP objects
27  my @hsps = $factory->align($inputfilename);
28
29  # or
30  my @files = ('t/data/dbaseq1.fa','t/data/dbaseq2.fa');
31  my @hsps = $factory->align(\@files);
32
33  # or where @seq_array is an array of Bio::Seq objects
34  $seq_array_ref = \@seq_array;
35  my @hsps = $factory->align($seq_array_ref);
36
37=head1 DESCRIPTION
38
39DNA Block Aligner program (DBA) was developed by Ewan Birney. DBA
40is part of the Wise package available at
41L<http://www.sanger.ac.uk/software/wise2>.
42
43You will need to enable dba to find the dba program. This can
44be done in a few different ways:
45
461. Define an environmental variable WISEDIR:
47
48  export WISEDIR =/usr/local/share/wise2.2.0
49
502. Include a definition of an environmental variable WISEDIR in
51every script that will use DBA.pm:
52
53  $ENV{WISEDIR} = '/usr/local/share/wise2.2.20';
54
553. Make sure that the dba application is in your PATH.
56
57=head1 FEEDBACK
58
59=head2 Mailing Lists
60
61User feedback is an integral part of the evolution of this and other
62Bioperl modules. Send your comments and suggestions preferably to one
63of the Bioperl mailing lists.  Your participation is much appreciated.
64
65  bioperl-l@bioperl.org                  - General discussion
66  http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
67
68=head2 Support
69
70Please direct usage questions or support issues to the mailing list:
71
72I<bioperl-l@bioperl.org>
73
74rather than to the module maintainer directly. Many experienced and
75reponsive experts will be able look at the problem and quickly
76address it. Please include a thorough description of the problem
77with code and data examples if at all possible.
78
79=head2 Reporting Bugs
80
81Report bugs to the Bioperl bug tracking system to help us keep track
82the bugs and their resolution.  Bug reports can be submitted via the
83web:
84
85  http://redmine.open-bio.org/projects/bioperl/
86
87=head1 AUTHOR - Shawn Hoon
88
89Email shawnh@fugu-sg.org
90
91=head1 APPENDIX
92
93The rest of the documentation details each of the object
94methods. Internal methods are usually preceded with a _
95
96=cut
97
98package Bio::Tools::Run::Alignment::DBA;
99use vars qw($AUTOLOAD @ISA $PROGRAM $PROGRAMDIR $PROGRAMNAME
100            @DBA_SWITCHES @DBA_PARAMS @OTHER_SWITCHES %OK_FIELD);
101use strict;
102use Bio::SeqIO;
103use Bio::SimpleAlign;
104use Bio::AlignIO;
105use Bio::Root::Root;
106use Bio::Root::IO;
107use Bio::Factory::ApplicationFactoryI;
108use Bio::Search::HSP::GenericHSP;
109use Bio::Tools::Run::WrapperBase;
110
111@ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase);
112
113BEGIN {
114
115    @DBA_PARAMS = qw(MATCHA MATCHB MATCHC MATCHD GAP BLOCKOPEN UMATCH SINGLE
116                     NOMATCHN PARAMS KBYTE DYMEM DYDEBUG ERRORLOG);
117    @OTHER_SWITCHES = qw(OUTFILE);
118    @DBA_SWITCHES = qw(HELP SILENT QUIET ERROROFFSTD ALIGN LABEL);
119
120    # Authorize attribute fields
121    foreach my $attr ( @DBA_PARAMS, @DBA_SWITCHES,
122                       @OTHER_SWITCHES) { $OK_FIELD{$attr}++; }
123}
124
125=head2 program_name
126
127 Title   : program_name
128 Usage   : $factory>program_name()
129 Function: holds the program name
130 Returns:  string
131 Args    : None
132
133=cut
134
135sub program_name {
136  return 'dba';
137}
138
139=head2 program_dir
140
141 Title   : program_dir
142 Usage   : $factory->program_dir(@params)
143 Function: returns the program directory, obtained from ENV variable.
144 Returns:  string
145 Args    :
146
147=cut
148
149sub program_dir {
150  return Bio::Root::IO->catfile($ENV{WISEDIR},"/src/bin") if $ENV{WISEDIR};
151}
152
153sub new {
154  my ($class, @args) = @_;
155  my $self = $class->SUPER::new(@args);
156
157  my ($attr, $value);
158
159  while (@args) {
160    $attr =   shift @args;
161    $value =  shift @args;
162    next if( $attr =~ /^-/ ); # don't want named parameters
163    if ($attr =~/'PROGRAM'/i )  {
164      $self->executable($value);
165      next;
166    }
167    $self->$attr($value);
168  }
169  return $self;
170}
171
172sub AUTOLOAD {
173    my $self = shift;
174    my $attr = $AUTOLOAD;
175    $attr =~ s/.*:://;
176    $attr = uc $attr;
177    $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr};
178    $self->{$attr} = shift if @_;
179    return $self->{$attr};
180}
181
182=head2  version
183
184 Title   : version
185 Usage   : exit if $prog->version() < 1.8
186 Function: Determine the version number of the program
187 Example :
188 Returns : float or undef
189 Args    : none
190
191=cut
192
193sub version {
194    my ($self) = @_;
195
196    my $exe = $self->executable();
197    return undef unless defined $exe;
198    my $string = `$exe -- ` ;
199    $string =~ /\(([\d.]+)\)/;
200    return $1 || undef;
201}
202
203=head2  align
204
205 Title   : align
206 Usage   :
207            $inputfilename = 't/data/seq.fa';
208            @hsps = $factory->align($inputfilename);
209          or
210            #@seq_array is array of Seq objs
211            $seq_array_ref = \@seq_array;
212            @hsps = $factory->align($seq_array_ref);
213          or
214            my @files = ('t/data/seq1.fa','t/data/seq2.fa');
215            @hsps = $factory->align(\@files);
216 Function: Perform a DBA  alignment
217
218
219 Returns : An array of Bio::Search::HSP::GenericHSP objects
220 Args    : Name of a file containing a set of 2 fasta sequences
221           or else a reference to an array  to 2  Bio::Seq objects.
222           or else a reference to an array of 2 file
223              names containing 1 fasta sequence each
224
225 Throws an exception if argument is not either a string (eg a
226 filename) or a reference to an array of 2 Bio::Seq objects.  If
227 argument is string, throws exception if file corresponding to string
228 name can not be found. If argument is Bio::Seq array, throws
229 exception if less than two sequence objects are in array.
230
231=cut
232
233sub align {
234
235    my ($self,$input) = @_;
236    my ($temp,$infile1, $infile2, $seq);
237    my ($attr, $value, $switch);
238
239# Create input file pointer
240    ($infile1,$infile2)= $self->_setinput($input);
241    if (!($infile1 && $infile2)) {$self->throw("Bad input data (sequences need an id ) or less than 2 sequences in $input !");}
242
243# Create parameter string to pass to dba program
244    my $param_string = $self->_setparams();
245
246# run dba
247    my @hsps = $self->_run($infile1,$infile2,$param_string);
248    return @hsps;
249}
250
251#################################################
252
253=head2  _run
254
255 Title   :  _run
256 Usage   :  Internal function, not to be called directly
257 Function:   makes actual system call to dba program
258 Example :
259 Returns : nothing; dba  output is written to a temp file
260 Args    : Name of a file containing a set of unaligned fasta sequences
261           and hash of parameters to be passed to dba
262
263=cut
264
265sub _run {
266    my ($self,$infile1,$infile2,$param_string) = @_;
267    my $instring;
268    $self->debug( "Program ".$self->executable."\n");
269    unless( $self->outfile){
270	  my ($tfh, $outfile) = $self->io->tempfile(-dir=>$self->tempdir);
271    close($tfh);
272  	undef $tfh;
273  	$self->outfile($outfile);
274    }
275    my $outfile = $self->outfile();
276    my $commandstring = $self->executable." $param_string -pff $infile1 $infile2 > $outfile";
277    $self->debug( "dba command = $commandstring");
278    my $status = system($commandstring);
279    $self->throw( "DBA call ($commandstring) crashed: $? \n") unless $status==0;
280    #parse pff format and return a Bio::Search::HSP::GenericHSP array
281    my $hsps   = $self->_parse_results($outfile);
282
283    return @{$hsps};
284}
285
286=head2  _parse_results
287
288 Title   :  __parse_results
289 Usage   :  Internal function, not to be called directly
290 Function:  Parses dba output
291 Example :
292 Returns : an reference to an array of GenericHSPs
293 Args    : the name of the output file
294
295=cut
296
297sub _parse_results {
298    my ($self,$outfile) = @_;
299    $outfile||$self->throw("No outfile specified");
300    my ($start,$end,$name,$seqname,$seq,$seqchar,$tempname,%align);
301    my $count = 0;
302    my @hsps;
303    open(OUT,$outfile);
304    my (%query,%subject);
305    while(my $entry = <OUT>){
306      if($entry =~ /^>(.+)/ ) {
307        $tempname = $1;
308        if( defined $name ) {
309          if($count == 0){
310            my @parse = split("\t",$name);
311            $query{seqname}  = $parse[0];
312            $query{start}    = $parse[3];
313            $query{end}      = $parse[4];
314            $query{score}    = $parse[5];
315            $query{strand}   = ($parse[6] eq '+') ? 1 : -1;
316            my @tags         = split(";",$parse[8]);
317            foreach my $tag(@tags){
318              $tag =~/(\S+)\s+(\S+)/;
319              $query{$1} = $2;
320            }
321            $query{seq}      = $seqchar;
322            $count++;
323          }
324          elsif ($count == 1){
325            my @parse = split("\t",$name);
326            $subject{seqname}  = $parse[0];
327            $subject{start}    = $parse[3];
328            $subject{end}      = $parse[4];
329            $subject{score}    = $parse[5];
330            $subject{strand}   = ($parse[6] eq '+') ? 1:-1;
331            my @tags         = split(";",$parse[8]);
332            foreach my $tag(@tags){
333              $tag =~/(\S+)\s+(\S+)/;
334              $subject{$1} = $2;
335            }
336            $subject{seq}   = $seqchar;
337            #create homology string
338            my $xor = $query{seq}^$subject{seq};
339            my $identical = $xor=~tr/\c@/*/;
340            $xor=~tr/*/ /c;
341            my $hsp= Bio::Search::HSP::GenericHSP->new(-algorithm     =>'DBA',
342                                                       -score         =>$query{score},
343                                                       -hsp_length    =>length($query{seq}),
344                                                       -query_gaps    =>$query{gaps},
345                                                       -hit_gaps      =>$subject{gaps},
346                                                       -query_name    =>$query{seqname},
347                                                       -query_start   =>$query{start},
348                                                       -query_end     =>$query{end},
349                                                       -hit_name      =>$subject{seqname},
350                                                       -hit_start     =>$subject{start},
351                                                       -hit_end       =>$subject{end},
352                                                       -hit_length    =>length($self->_subject_seq->seq),
353                                                       -query_length  =>length($self->_query_seq->seq),
354                                                       -query_seq     =>$query{seq},
355                                                       -hit_seq       =>$subject{seq},
356                                                       -conserved     =>$identical,
357                                                       -identical   =>$identical,
358                                                       -homology_seq  =>$xor);
359            push @hsps, $hsp;
360            $count  = 0;
361          }
362      }
363          $name = $tempname;
364          $seqchar  = "";
365          next;
366      }
367      $entry =~ s/[^A-Za-z\.\-]//g;
368      $seqchar .= $entry;
369  }
370  #do for the last entry
371  if($count == 1){
372    my @parse = split("\t",$name);
373    $subject{seqname}  = $parse[1];
374    $subject{start}    = $parse[3];
375    $subject{end}      = $parse[4];
376    $subject{score}    = $parse[5];
377    $subject{strand}   = ($parse[6] eq '+') ? 1:-1;
378    my @tags         = split(";",$parse[8]);
379    foreach my $tag(@tags){
380      $tag =~/(\S+)\s+(\S+)/;
381      $subject{$1} = $2;
382    }
383    $subject{seq}   = $seqchar;
384
385    #create homology string
386
387    my $xor = $query{seq}^$subject{seq};
388    my $identical = $xor=~tr/\c@/*/;
389    $xor=~tr/*/ /c;
390    my $hsp= Bio::Search::HSP::GenericHSP->new(-algorithm     =>'DBA',
391                                               -score         =>$query{score},
392                                               -hsp_length    =>length($query{seq}),
393                                               -query_gaps    =>$query{gaps},
394                                               -hit_gaps      =>$subject{gaps},
395                                               -query_name    =>$query{seqname},
396                                               -query_start   =>$query{start},
397                                               -query_end     =>$query{end},
398                                               -hit_name      =>$subject{seqname},
399                                               -hit_start     =>$subject{start},
400                                               -hit_end       =>$subject{end},
401                                               -hit_length    =>length($self->_subject_seq->seq),
402                                               -query_length  =>length($self->_query_seq->seq),
403                                               -query_seq     =>$query{seq},
404                                               -hit_seq       =>$subject{seq},
405                                               -conserved     =>$identical,
406                                               -identical     =>$identical,
407                                               -homology_seq  =>$xor);
408     push @hsps, $hsp;
409  }
410
411
412  return \@hsps;
413}
414
415=head2  _setinput()
416
417 Title   :  _setinput
418 Usage   :  Internal function, not to be called directly
419 Function:   Create input file for dba program
420 Example :
421 Returns : name of file containing dba data input
422 Args    : Seq or Align object reference or input file name
423
424=cut
425
426sub _setinput {
427  my ($self, $input, $suffix) = @_;
428  my ($infilename, $seq, $temp, $tfh1,$tfh2,$outfile1,$outfile2);
429
430  #there is gotta be some repetition here...need to clean up
431
432  if (ref($input) ne "ARRAY"){ #a single file containg 2 seqeunces
433    $infilename = $input;
434    unless(-e $input){return 0;}
435    my  $in  = Bio::SeqIO->new(-file => $infilename , '-format' => 'Fasta');
436    ($tfh1,$outfile1) = $self->io->tempfile(-dir=>$self->tempdir);
437    ($tfh2,$outfile2) = $self->io->tempfile(-dir=>$self->tempdir);
438
439    my $out1 = Bio::SeqIO->new(-fh=> $tfh1 , '-format' => 'Fasta','-flush'=>1);
440    my $out2 = Bio::SeqIO->new(-fh=> $tfh2 , '-format' => 'Fasta','-flush'=>1);
441
442    my $seq1 = $in->next_seq() || return 0;
443    my $seq2 = $in->next_seq() || return 0;
444    $out1->write_seq($seq1);
445    $out2->write_seq($seq2);
446    $self->_query_seq($seq1);
447    $self->_subject_seq($seq2);
448    $out1->close();
449    $out2->close();
450    close($tfh1);
451    close($tfh2);
452    undef $tfh1;
453    undef $tfh2;
454    return $outfile1,$outfile2;
455  }
456  else {
457    scalar(@{$input}) == 2 || $self->throw("dba alignment can only be run  on 2 sequences not.");
458    if(ref($input->[0]) eq ""){#passing in two file names
459      my $in1  = Bio::SeqIO->new(-file => $input->[0], '-format' => 'fasta');
460      my $in2  = Bio::SeqIO->new(-file => $input->[1], '-format' => 'fasta');
461
462      ($tfh1,$outfile1) = $self->io->tempfile(-dir=>$self->tempdir);
463      ($tfh2,$outfile2) = $self->io->tempfile(-dir=>$self->tempdir);
464
465      my $out1 = Bio::SeqIO->new(-fh=> $tfh1 , '-format' => 'fasta');
466      my $out2 = Bio::SeqIO->new(-fh=> $tfh2 , '-format' => 'fasta');
467
468      my $seq1 = $in1->next_seq() || return 0;
469      my $seq2 = $in2->next_seq() || return 0;
470      $out1->write_seq($seq1);
471      $out2->write_seq($seq2);
472      $self->_query_seq($seq1);
473      $self->_subject_seq($seq2);
474      close($tfh1);
475      close($tfh2);
476      undef $tfh1;
477      undef $tfh2;
478      return $outfile1,$outfile2;
479    }
480    elsif($input->[0]->isa("Bio::PrimarySeqI") && $input->[1]->isa("Bio::PrimarySeqI")) {
481      ($tfh1,$outfile1) = $self->io->tempfile(-dir=>$self->tempdir);
482      ($tfh2,$outfile2) = $self->io->tempfile(-dir=>$self->tempdir);
483
484      my $out1 = Bio::SeqIO->new(-fh=> $tfh1 , '-format' => 'fasta');
485      my $out2 = Bio::SeqIO->new(-fh=> $tfh2 , '-format' => 'fasta');
486
487      $out1->write_seq($input->[0]);
488      $out2->write_seq($input->[1]);
489      $self->_query_seq($input->[0]);
490      $self->_subject_seq($input->[1]);
491
492      close($tfh1);
493      close($tfh2);
494      undef $tfh1;
495      undef $tfh2;
496      return $outfile1,$outfile2;
497    }
498    else {
499       return 0;
500    }
501  }
502  return 0;
503}
504
505=head2  _setparams()
506
507 Title   :  _setparams
508 Usage   :  Internal function, not to be called directly
509 Function:   Create parameter inputs for dba program
510 Example :
511 Returns : parameter string to be passed to dba
512           during align or profile_align
513 Args    : name of calling object
514
515=cut
516
517sub _setparams {
518    my ($attr, $value, $self);
519
520    $self = shift;
521
522    my $param_string = "";
523    for  $attr ( @DBA_PARAMS ) {
524	$value = $self->$attr();
525	next unless (defined $value);
526#      next if $attr =~/outfile/i;
527
528	my $attr_key = lc $attr; #put params in format expected by dba
529	if($attr_key =~ /match([ABCDabcd])/i){
530	    $attr_key = "match".uc($1);
531	}
532	$attr_key = ' -'.$attr_key;
533	$param_string .= $attr_key.' '.$value;
534    }
535    for  $attr ( @DBA_SWITCHES) {
536	$value = $self->$attr();
537	next unless ($value);
538	my $attr_key = lc $attr; #put switches in format expected by dba
539	$attr_key = ' -'.$attr_key;
540	$param_string .= $attr_key ;
541    }
542    return $param_string;
543}
544
545=head2  _query_seq()
546
547 Title   :  _query_seq
548 Usage   :  Internal function, not to be called directly
549 Function:  get/set for the query sequence
550 Example :
551 Returns :
552 Args    :
553
554=cut
555
556sub _query_seq {
557  my ($self,$seq) = @_;
558  if(defined $seq){
559    $self->{'_query_seq'} = $seq;
560  }
561  return $self->{'_query_seq'};
562}
563
564=head2  _subject_seq()
565
566 Title   :  _subject_seq
567 Usage   :  Internal function, not to be called directly
568 Function:  get/set for the subject sequence
569 Example :
570 Returns :
571
572 Args    :
573
574=cut
575
576sub _subject_seq {
577  my ($self,$seq) = @_;
578  if(defined $seq){
579    $self->{'_subject_seq'} = $seq;
580  }
581  return $self->{'_subject_seq'};
582}
5831; # Needed to keep compiler happy
584
585