1# 2# BioPerl module for Bio::Tools::Run::Alignment::DBA 3# 4# Copyright Shawn Hoon 5# 6# You may distribute this module under the same terms as perl itself 7# POD documentation - main docs before the code 8 9=head1 NAME 10 11Bio::Tools::Run::Alignment::DBA - Object for the alignment of two 12sequences using the DNA Block Aligner program. 13 14=head1 SYNOPSIS 15 16 use Bio::Tools::Run::Alignment::DBA; 17 18 # Build a dba alignment factory 19 my @params = ('matchA' => 0.75, 20 'matchB' => '0.55', 21 'dymem' =>'linear'); 22 my $factory = Bio::Tools::Run::Alignment::DBA->new(@params); 23 24 # Pass the factory a filename with 2 sequences to be aligned. 25 $inputfilename = 't/data/dbaseq.fa'; 26 # @hsps is an array of GenericHSP objects 27 my @hsps = $factory->align($inputfilename); 28 29 # or 30 my @files = ('t/data/dbaseq1.fa','t/data/dbaseq2.fa'); 31 my @hsps = $factory->align(\@files); 32 33 # or where @seq_array is an array of Bio::Seq objects 34 $seq_array_ref = \@seq_array; 35 my @hsps = $factory->align($seq_array_ref); 36 37=head1 DESCRIPTION 38 39DNA Block Aligner program (DBA) was developed by Ewan Birney. DBA 40is part of the Wise package available at 41L<http://www.sanger.ac.uk/software/wise2>. 42 43You will need to enable dba to find the dba program. This can 44be done in a few different ways: 45 461. Define an environmental variable WISEDIR: 47 48 export WISEDIR =/usr/local/share/wise2.2.0 49 502. Include a definition of an environmental variable WISEDIR in 51every script that will use DBA.pm: 52 53 $ENV{WISEDIR} = '/usr/local/share/wise2.2.20'; 54 553. Make sure that the dba application is in your PATH. 56 57=head1 FEEDBACK 58 59=head2 Mailing Lists 60 61User feedback is an integral part of the evolution of this and other 62Bioperl modules. Send your comments and suggestions preferably to one 63of the Bioperl mailing lists. Your participation is much appreciated. 64 65 bioperl-l@bioperl.org - General discussion 66 http://bioperl.org/wiki/Mailing_lists - About the mailing lists 67 68=head2 Support 69 70Please direct usage questions or support issues to the mailing list: 71 72I<bioperl-l@bioperl.org> 73 74rather than to the module maintainer directly. Many experienced and 75reponsive experts will be able look at the problem and quickly 76address it. Please include a thorough description of the problem 77with code and data examples if at all possible. 78 79=head2 Reporting Bugs 80 81Report bugs to the Bioperl bug tracking system to help us keep track 82the bugs and their resolution. Bug reports can be submitted via the 83web: 84 85 http://redmine.open-bio.org/projects/bioperl/ 86 87=head1 AUTHOR - Shawn Hoon 88 89Email shawnh@fugu-sg.org 90 91=head1 APPENDIX 92 93The rest of the documentation details each of the object 94methods. Internal methods are usually preceded with a _ 95 96=cut 97 98package Bio::Tools::Run::Alignment::DBA; 99use vars qw($AUTOLOAD @ISA $PROGRAM $PROGRAMDIR $PROGRAMNAME 100 @DBA_SWITCHES @DBA_PARAMS @OTHER_SWITCHES %OK_FIELD); 101use strict; 102use Bio::SeqIO; 103use Bio::SimpleAlign; 104use Bio::AlignIO; 105use Bio::Root::Root; 106use Bio::Root::IO; 107use Bio::Factory::ApplicationFactoryI; 108use Bio::Search::HSP::GenericHSP; 109use Bio::Tools::Run::WrapperBase; 110 111@ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); 112 113BEGIN { 114 115 @DBA_PARAMS = qw(MATCHA MATCHB MATCHC MATCHD GAP BLOCKOPEN UMATCH SINGLE 116 NOMATCHN PARAMS KBYTE DYMEM DYDEBUG ERRORLOG); 117 @OTHER_SWITCHES = qw(OUTFILE); 118 @DBA_SWITCHES = qw(HELP SILENT QUIET ERROROFFSTD ALIGN LABEL); 119 120 # Authorize attribute fields 121 foreach my $attr ( @DBA_PARAMS, @DBA_SWITCHES, 122 @OTHER_SWITCHES) { $OK_FIELD{$attr}++; } 123} 124 125=head2 program_name 126 127 Title : program_name 128 Usage : $factory>program_name() 129 Function: holds the program name 130 Returns: string 131 Args : None 132 133=cut 134 135sub program_name { 136 return 'dba'; 137} 138 139=head2 program_dir 140 141 Title : program_dir 142 Usage : $factory->program_dir(@params) 143 Function: returns the program directory, obtained from ENV variable. 144 Returns: string 145 Args : 146 147=cut 148 149sub program_dir { 150 return Bio::Root::IO->catfile($ENV{WISEDIR},"/src/bin") if $ENV{WISEDIR}; 151} 152 153sub new { 154 my ($class, @args) = @_; 155 my $self = $class->SUPER::new(@args); 156 157 my ($attr, $value); 158 159 while (@args) { 160 $attr = shift @args; 161 $value = shift @args; 162 next if( $attr =~ /^-/ ); # don't want named parameters 163 if ($attr =~/'PROGRAM'/i ) { 164 $self->executable($value); 165 next; 166 } 167 $self->$attr($value); 168 } 169 return $self; 170} 171 172sub AUTOLOAD { 173 my $self = shift; 174 my $attr = $AUTOLOAD; 175 $attr =~ s/.*:://; 176 $attr = uc $attr; 177 $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; 178 $self->{$attr} = shift if @_; 179 return $self->{$attr}; 180} 181 182=head2 version 183 184 Title : version 185 Usage : exit if $prog->version() < 1.8 186 Function: Determine the version number of the program 187 Example : 188 Returns : float or undef 189 Args : none 190 191=cut 192 193sub version { 194 my ($self) = @_; 195 196 my $exe = $self->executable(); 197 return undef unless defined $exe; 198 my $string = `$exe -- ` ; 199 $string =~ /\(([\d.]+)\)/; 200 return $1 || undef; 201} 202 203=head2 align 204 205 Title : align 206 Usage : 207 $inputfilename = 't/data/seq.fa'; 208 @hsps = $factory->align($inputfilename); 209 or 210 #@seq_array is array of Seq objs 211 $seq_array_ref = \@seq_array; 212 @hsps = $factory->align($seq_array_ref); 213 or 214 my @files = ('t/data/seq1.fa','t/data/seq2.fa'); 215 @hsps = $factory->align(\@files); 216 Function: Perform a DBA alignment 217 218 219 Returns : An array of Bio::Search::HSP::GenericHSP objects 220 Args : Name of a file containing a set of 2 fasta sequences 221 or else a reference to an array to 2 Bio::Seq objects. 222 or else a reference to an array of 2 file 223 names containing 1 fasta sequence each 224 225 Throws an exception if argument is not either a string (eg a 226 filename) or a reference to an array of 2 Bio::Seq objects. If 227 argument is string, throws exception if file corresponding to string 228 name can not be found. If argument is Bio::Seq array, throws 229 exception if less than two sequence objects are in array. 230 231=cut 232 233sub align { 234 235 my ($self,$input) = @_; 236 my ($temp,$infile1, $infile2, $seq); 237 my ($attr, $value, $switch); 238 239# Create input file pointer 240 ($infile1,$infile2)= $self->_setinput($input); 241 if (!($infile1 && $infile2)) {$self->throw("Bad input data (sequences need an id ) or less than 2 sequences in $input !");} 242 243# Create parameter string to pass to dba program 244 my $param_string = $self->_setparams(); 245 246# run dba 247 my @hsps = $self->_run($infile1,$infile2,$param_string); 248 return @hsps; 249} 250 251################################################# 252 253=head2 _run 254 255 Title : _run 256 Usage : Internal function, not to be called directly 257 Function: makes actual system call to dba program 258 Example : 259 Returns : nothing; dba output is written to a temp file 260 Args : Name of a file containing a set of unaligned fasta sequences 261 and hash of parameters to be passed to dba 262 263=cut 264 265sub _run { 266 my ($self,$infile1,$infile2,$param_string) = @_; 267 my $instring; 268 $self->debug( "Program ".$self->executable."\n"); 269 unless( $self->outfile){ 270 my ($tfh, $outfile) = $self->io->tempfile(-dir=>$self->tempdir); 271 close($tfh); 272 undef $tfh; 273 $self->outfile($outfile); 274 } 275 my $outfile = $self->outfile(); 276 my $commandstring = $self->executable." $param_string -pff $infile1 $infile2 > $outfile"; 277 $self->debug( "dba command = $commandstring"); 278 my $status = system($commandstring); 279 $self->throw( "DBA call ($commandstring) crashed: $? \n") unless $status==0; 280 #parse pff format and return a Bio::Search::HSP::GenericHSP array 281 my $hsps = $self->_parse_results($outfile); 282 283 return @{$hsps}; 284} 285 286=head2 _parse_results 287 288 Title : __parse_results 289 Usage : Internal function, not to be called directly 290 Function: Parses dba output 291 Example : 292 Returns : an reference to an array of GenericHSPs 293 Args : the name of the output file 294 295=cut 296 297sub _parse_results { 298 my ($self,$outfile) = @_; 299 $outfile||$self->throw("No outfile specified"); 300 my ($start,$end,$name,$seqname,$seq,$seqchar,$tempname,%align); 301 my $count = 0; 302 my @hsps; 303 open(OUT,$outfile); 304 my (%query,%subject); 305 while(my $entry = <OUT>){ 306 if($entry =~ /^>(.+)/ ) { 307 $tempname = $1; 308 if( defined $name ) { 309 if($count == 0){ 310 my @parse = split("\t",$name); 311 $query{seqname} = $parse[0]; 312 $query{start} = $parse[3]; 313 $query{end} = $parse[4]; 314 $query{score} = $parse[5]; 315 $query{strand} = ($parse[6] eq '+') ? 1 : -1; 316 my @tags = split(";",$parse[8]); 317 foreach my $tag(@tags){ 318 $tag =~/(\S+)\s+(\S+)/; 319 $query{$1} = $2; 320 } 321 $query{seq} = $seqchar; 322 $count++; 323 } 324 elsif ($count == 1){ 325 my @parse = split("\t",$name); 326 $subject{seqname} = $parse[0]; 327 $subject{start} = $parse[3]; 328 $subject{end} = $parse[4]; 329 $subject{score} = $parse[5]; 330 $subject{strand} = ($parse[6] eq '+') ? 1:-1; 331 my @tags = split(";",$parse[8]); 332 foreach my $tag(@tags){ 333 $tag =~/(\S+)\s+(\S+)/; 334 $subject{$1} = $2; 335 } 336 $subject{seq} = $seqchar; 337 #create homology string 338 my $xor = $query{seq}^$subject{seq}; 339 my $identical = $xor=~tr/\c@/*/; 340 $xor=~tr/*/ /c; 341 my $hsp= Bio::Search::HSP::GenericHSP->new(-algorithm =>'DBA', 342 -score =>$query{score}, 343 -hsp_length =>length($query{seq}), 344 -query_gaps =>$query{gaps}, 345 -hit_gaps =>$subject{gaps}, 346 -query_name =>$query{seqname}, 347 -query_start =>$query{start}, 348 -query_end =>$query{end}, 349 -hit_name =>$subject{seqname}, 350 -hit_start =>$subject{start}, 351 -hit_end =>$subject{end}, 352 -hit_length =>length($self->_subject_seq->seq), 353 -query_length =>length($self->_query_seq->seq), 354 -query_seq =>$query{seq}, 355 -hit_seq =>$subject{seq}, 356 -conserved =>$identical, 357 -identical =>$identical, 358 -homology_seq =>$xor); 359 push @hsps, $hsp; 360 $count = 0; 361 } 362 } 363 $name = $tempname; 364 $seqchar = ""; 365 next; 366 } 367 $entry =~ s/[^A-Za-z\.\-]//g; 368 $seqchar .= $entry; 369 } 370 #do for the last entry 371 if($count == 1){ 372 my @parse = split("\t",$name); 373 $subject{seqname} = $parse[1]; 374 $subject{start} = $parse[3]; 375 $subject{end} = $parse[4]; 376 $subject{score} = $parse[5]; 377 $subject{strand} = ($parse[6] eq '+') ? 1:-1; 378 my @tags = split(";",$parse[8]); 379 foreach my $tag(@tags){ 380 $tag =~/(\S+)\s+(\S+)/; 381 $subject{$1} = $2; 382 } 383 $subject{seq} = $seqchar; 384 385 #create homology string 386 387 my $xor = $query{seq}^$subject{seq}; 388 my $identical = $xor=~tr/\c@/*/; 389 $xor=~tr/*/ /c; 390 my $hsp= Bio::Search::HSP::GenericHSP->new(-algorithm =>'DBA', 391 -score =>$query{score}, 392 -hsp_length =>length($query{seq}), 393 -query_gaps =>$query{gaps}, 394 -hit_gaps =>$subject{gaps}, 395 -query_name =>$query{seqname}, 396 -query_start =>$query{start}, 397 -query_end =>$query{end}, 398 -hit_name =>$subject{seqname}, 399 -hit_start =>$subject{start}, 400 -hit_end =>$subject{end}, 401 -hit_length =>length($self->_subject_seq->seq), 402 -query_length =>length($self->_query_seq->seq), 403 -query_seq =>$query{seq}, 404 -hit_seq =>$subject{seq}, 405 -conserved =>$identical, 406 -identical =>$identical, 407 -homology_seq =>$xor); 408 push @hsps, $hsp; 409 } 410 411 412 return \@hsps; 413} 414 415=head2 _setinput() 416 417 Title : _setinput 418 Usage : Internal function, not to be called directly 419 Function: Create input file for dba program 420 Example : 421 Returns : name of file containing dba data input 422 Args : Seq or Align object reference or input file name 423 424=cut 425 426sub _setinput { 427 my ($self, $input, $suffix) = @_; 428 my ($infilename, $seq, $temp, $tfh1,$tfh2,$outfile1,$outfile2); 429 430 #there is gotta be some repetition here...need to clean up 431 432 if (ref($input) ne "ARRAY"){ #a single file containg 2 seqeunces 433 $infilename = $input; 434 unless(-e $input){return 0;} 435 my $in = Bio::SeqIO->new(-file => $infilename , '-format' => 'Fasta'); 436 ($tfh1,$outfile1) = $self->io->tempfile(-dir=>$self->tempdir); 437 ($tfh2,$outfile2) = $self->io->tempfile(-dir=>$self->tempdir); 438 439 my $out1 = Bio::SeqIO->new(-fh=> $tfh1 , '-format' => 'Fasta','-flush'=>1); 440 my $out2 = Bio::SeqIO->new(-fh=> $tfh2 , '-format' => 'Fasta','-flush'=>1); 441 442 my $seq1 = $in->next_seq() || return 0; 443 my $seq2 = $in->next_seq() || return 0; 444 $out1->write_seq($seq1); 445 $out2->write_seq($seq2); 446 $self->_query_seq($seq1); 447 $self->_subject_seq($seq2); 448 $out1->close(); 449 $out2->close(); 450 close($tfh1); 451 close($tfh2); 452 undef $tfh1; 453 undef $tfh2; 454 return $outfile1,$outfile2; 455 } 456 else { 457 scalar(@{$input}) == 2 || $self->throw("dba alignment can only be run on 2 sequences not."); 458 if(ref($input->[0]) eq ""){#passing in two file names 459 my $in1 = Bio::SeqIO->new(-file => $input->[0], '-format' => 'fasta'); 460 my $in2 = Bio::SeqIO->new(-file => $input->[1], '-format' => 'fasta'); 461 462 ($tfh1,$outfile1) = $self->io->tempfile(-dir=>$self->tempdir); 463 ($tfh2,$outfile2) = $self->io->tempfile(-dir=>$self->tempdir); 464 465 my $out1 = Bio::SeqIO->new(-fh=> $tfh1 , '-format' => 'fasta'); 466 my $out2 = Bio::SeqIO->new(-fh=> $tfh2 , '-format' => 'fasta'); 467 468 my $seq1 = $in1->next_seq() || return 0; 469 my $seq2 = $in2->next_seq() || return 0; 470 $out1->write_seq($seq1); 471 $out2->write_seq($seq2); 472 $self->_query_seq($seq1); 473 $self->_subject_seq($seq2); 474 close($tfh1); 475 close($tfh2); 476 undef $tfh1; 477 undef $tfh2; 478 return $outfile1,$outfile2; 479 } 480 elsif($input->[0]->isa("Bio::PrimarySeqI") && $input->[1]->isa("Bio::PrimarySeqI")) { 481 ($tfh1,$outfile1) = $self->io->tempfile(-dir=>$self->tempdir); 482 ($tfh2,$outfile2) = $self->io->tempfile(-dir=>$self->tempdir); 483 484 my $out1 = Bio::SeqIO->new(-fh=> $tfh1 , '-format' => 'fasta'); 485 my $out2 = Bio::SeqIO->new(-fh=> $tfh2 , '-format' => 'fasta'); 486 487 $out1->write_seq($input->[0]); 488 $out2->write_seq($input->[1]); 489 $self->_query_seq($input->[0]); 490 $self->_subject_seq($input->[1]); 491 492 close($tfh1); 493 close($tfh2); 494 undef $tfh1; 495 undef $tfh2; 496 return $outfile1,$outfile2; 497 } 498 else { 499 return 0; 500 } 501 } 502 return 0; 503} 504 505=head2 _setparams() 506 507 Title : _setparams 508 Usage : Internal function, not to be called directly 509 Function: Create parameter inputs for dba program 510 Example : 511 Returns : parameter string to be passed to dba 512 during align or profile_align 513 Args : name of calling object 514 515=cut 516 517sub _setparams { 518 my ($attr, $value, $self); 519 520 $self = shift; 521 522 my $param_string = ""; 523 for $attr ( @DBA_PARAMS ) { 524 $value = $self->$attr(); 525 next unless (defined $value); 526# next if $attr =~/outfile/i; 527 528 my $attr_key = lc $attr; #put params in format expected by dba 529 if($attr_key =~ /match([ABCDabcd])/i){ 530 $attr_key = "match".uc($1); 531 } 532 $attr_key = ' -'.$attr_key; 533 $param_string .= $attr_key.' '.$value; 534 } 535 for $attr ( @DBA_SWITCHES) { 536 $value = $self->$attr(); 537 next unless ($value); 538 my $attr_key = lc $attr; #put switches in format expected by dba 539 $attr_key = ' -'.$attr_key; 540 $param_string .= $attr_key ; 541 } 542 return $param_string; 543} 544 545=head2 _query_seq() 546 547 Title : _query_seq 548 Usage : Internal function, not to be called directly 549 Function: get/set for the query sequence 550 Example : 551 Returns : 552 Args : 553 554=cut 555 556sub _query_seq { 557 my ($self,$seq) = @_; 558 if(defined $seq){ 559 $self->{'_query_seq'} = $seq; 560 } 561 return $self->{'_query_seq'}; 562} 563 564=head2 _subject_seq() 565 566 Title : _subject_seq 567 Usage : Internal function, not to be called directly 568 Function: get/set for the subject sequence 569 Example : 570 Returns : 571 572 Args : 573 574=cut 575 576sub _subject_seq { 577 my ($self,$seq) = @_; 578 if(defined $seq){ 579 $self->{'_subject_seq'} = $seq; 580 } 581 return $self->{'_subject_seq'}; 582} 5831; # Needed to keep compiler happy 584 585