1# BioPerl module for TribeMCL 2# 3# Please direct questions and support issues to <bioperl-l@bioperl.org> 4# 5# Cared for by Shawn Hoon <shawnh@fugu-sg.org> 6# 7# Copyright Shawn Hoon 8# 9# You may distribute this module under the same terms as perl itself 10 11# POD documentation - main docs before the code 12 13=head1 NAME 14 15Bio::Tools::Run::TribeMCL 16 17=head1 SYNOPSIS 18 19 use Bio::Tools::Run::TribeMCL; 20 use Bio::SearchIO; 21 22 # 3 methods to input the blast results 23 24 # straight forward raw blast output (NCBI or WU-BLAST) 25 my @params = ('inputtype'=>'blastfile'); 26 27 # OR 28 29 # markov program format 30 # protein_id1 protein_id2 evalue_magnitude evalue_factor 31 # for example: 32 # proteins ENSP00000257547 and ENSP00000261659 33 # with a blast score evalue of 1e-50 34 # and proteins O42187 and ENSP00000257547 35 # with a blast score evalue of 1e-119 36 # entry would be 37 38 my @array = [[qw(ENSP00000257547 ENSP00000261659 1 50)], 39 [qw(O42187 ENSP00000257547 1 119)]]; 40 my @params = ('pairs'=>\@array,I=>'2.0'); 41 42 # OR 43 44 # pass in a searchio object 45 # slowest of the 3 methods as it does more rigourous parsing 46 # than required for us here 47 48 my $sio = Bio::SearchIO->new(-format=>'blast', 49 -file=>'blast.out'); 50 my @params=('inputtype'=>'searchio',I=>'2.0'); 51 52 53 # you can specify the path to the executable manually in the following way 54 my @params=('inputtype'=>'blastfile',I=>'2.0', 55 'mcl'=>'/home/shawn/software/mcl-02-150/src/shmcl/mcl', 56 'matrix'=>'/home/shawn/software/mcl-02-150/src/contrib/tribe/tribe-matrix'); 57 my $fact = Bio::Tools::Run::TribeMCL->new(@params); 58 59 # OR 60 61 $fact->matrix_executable('/home/shawn/software/mcl-02-150/src/contrib/tribe/tribe-matrix'); 62 $fact->mcl_executable('/home/shawn/software/mcl-02-150/src/shmcl/mcl'); 63 64 # to run 65 66 my $fact = Bio::Tools::Run::TribeMCL->new(@params); 67 68 # Run the program 69 # returns an array reference to clusters where members are the ids 70 # for example :2 clusters with 3 members per cluster: 71 # $fam = [ [mem1 mem2 mem3],[mem1 mem2 mem3]] 72 73 # pass in either the blastfile path/searchio obj/the array ref to scores 74 my $fam = $fact->run($sio); 75 76 # print out your clusters 77 78 for (my $i = 0; $i <scalar(@{$fam}); $i++){ 79 print "Cluster $i \t ".scalar(@{$fam->[$i]})." members\n"; 80 foreach my $member (@{$fam->[$i]}){ 81 print "\t$member\n"; 82 } 83 } 84 85=head1 DESCRIPTION 86 87TribeMCL is a method for clustering proteins into related groups, 88which are termed 'protein families'. This clustering is achieved by 89analysing similarity patterns between proteins in a given dataset, and 90using these patterns to assign proteins into related groups. In many 91cases, proteins in the same protein family will have similar 92functional properties. 93 94TribeMCL uses a novel clustering method (Markov Clustering or MCL) 95which solves problems which normally hinder protein sequence 96clustering. 97 98Reference: 99 100 Enright A.J., Van Dongen S., Ouzounis C.A; Nucleic Acids 101 Res. 30(7):1575-1584 (2002) 102 103You will need tribe-matrix (the program used to generate the matrix 104for input into mcl) and mcl (the clustering software) available at: 105 106 http://www.ebi.ac.uk/research/cgg/tribe/ or 107 http://micans.org/mcl/. 108 109Future Work in this module: Port the tribe-matrix program into perl so 110that we can enable have a SearchIO kinda module for reading and 111writing mcl data format 112 113=head1 FEEDBACK 114 115=head2 Mailing Lists 116 117User feedback is an integral part of the evolution of this and other 118Bioperl modules. Send your comments and suggestions preferably to one 119of the Bioperl mailing lists. Your participation is much appreciated. 120 121 bioperl-l@bioperl.org - General discussion 122 http://bioperl.org/wiki/Mailing_lists - About the mailing lists 123 124=head2 Support 125 126Please direct usage questions or support issues to the mailing list: 127 128I<bioperl-l@bioperl.org> 129 130rather than to the module maintainer directly. Many experienced and 131reponsive experts will be able look at the problem and quickly 132address it. Please include a thorough description of the problem 133with code and data examples if at all possible. 134 135=head2 Reporting Bugs 136 137Report bugs to the Bioperl bug tracking system to help us keep track 138the bugs and their resolution. Bug reports can be submitted via the 139web: 140 141 http://redmine.open-bio.org/projects/bioperl/ 142 143=head1 AUTHOR - Shawn Hoon 144 145Email shawnh@fugu-sg.org 146 147=head1 APPENDIX 148 149The rest of the documentation details each of the object 150methods. Internal methods are usually preceded with a "_". 151 152=cut 153 154 155# Let the code begin... 156package Bio::Tools::Run::TribeMCL; 157use vars qw($AUTOLOAD @ISA $PROGRAMDIR 158 @TRIBEMCL_PARAMS @MATRIXPROGRAM_PARAMS @MCLPROGRAM_PARAMS 159 @OTHER_SWITCHES %OK_FIELD 160 $MATRIXPROGRAM_NAME $MCLPROGRAM_NAME 161 $MCLPROGRAM $MATRIXPROGRAM 162 ); 163use strict; 164 165use Bio::SeqIO; 166use Bio::Root::Root; 167use Bio::Root::IO; 168use Bio::Cluster::SequenceFamily; 169use Bio::Factory::ApplicationFactoryI; 170use Bio::Tools::Run::WrapperBase; 171use Bio::Annotation::DBLink; 172use Bio::Seq; 173use Bio::Species; 174 175@ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); 176 177# You will need to enable mcl and tribe-matrix to use this wrapper. This 178# can be done in (at least) two ways: 179# 180# 1. define an environmental variable TRIBEDIR 181# export TRIBEDIR =/usr/local/share/mclbin/ 182# where the tribe-matrix and mcl programs are located. 183#you probably need to copy them individually to the same directory 184#if the installation puts them in different directories. or simply put them in 185#your standard /usr/local/bin 186# 187# 2. include a definition of an environmental variable TRIBEDIR in 188# every script that will use TRIBEMCL.pm 189# $ENV{TRIBEDIR} = '/usr/local/share/mclbin/; 190# 191#3. Manually set the path to the executabes in your code: 192# 193 194#my @params=('inputtype'=>'blastfile',I=>'2.0',' 195# mcl'=>'/home/shawn/software/mcl-02-150/src/shmcl/mcl',' 196# matrix'=>'/home/shawn/software/mcl-02-150/src/contrib/tribe/tribe-matrix'); 197#my $fact = Bio::Tools::Run::TribeMCL->new(@params); 198 199#or 200#$fact->matrix_executable('/home/shawn/software/mcl-02-150/src/contrib/tribe/tribe-matrix'); 201#$fact->mcl_executable('/home/shawn/software/mcl-02-150/src/shmcl/mcl'); 202 203 204BEGIN { 205 $MATRIXPROGRAM_NAME = 'tribe-matrix'; 206 $MCLPROGRAM_NAME = 'mcl'; 207 if (defined $ENV{TRIBEDIR}) { 208 $PROGRAMDIR = $ENV{TRIBEDIR} || ''; 209 $MCLPROGRAM = Bio::Root::IO->catfile($PROGRAMDIR,$MCLPROGRAM_NAME.($^O =~ /mswin/i ?'.exe':'')); 210 $MATRIXPROGRAM = Bio::Root::IO->catfile($PROGRAMDIR,$MATRIXPROGRAM_NAME.($^O =~ /mswin/i ?'.exe':'')); 211 } 212 213 @TRIBEMCL_PARAMS = qw(inputtype hsp hit scorefile blastfile description_file searchio pairs mcl matrix weight description family_tag use_db); 214 215 @MATRIXPROGRAM_PARAMS = qw(ind out chunk); 216 @MCLPROGRAM_PARAMS = qw(I t P R pct o); 217 218 @OTHER_SWITCHES = qw(verbose quiet); 219 220 # Authorize attribute fields 221 foreach my $attr (@TRIBEMCL_PARAMS, @MATRIXPROGRAM_PARAMS, 222 @MCLPROGRAM_PARAMS, @OTHER_SWITCHES) { 223 $OK_FIELD{$attr}++; 224 } 225} 226 227sub new { 228 my ($class, @args) = @_; 229 my $self = $class->SUPER::new(@args); 230 231 my ($attr, $value); 232 while (@args) { 233 $attr = shift @args; 234 $value = shift @args; 235 next if( $attr =~ /^-/ ); # don't want named parameters 236 if ($attr =~/MCL/i) { 237 $self->mcl_executable($value); 238 next; 239 } 240 if ($attr =~ /MATRIX/i){ 241 $self->matrix_executable($value); 242 next; 243 } 244 $self->$attr($value); 245 } 246 defined($self->weight) || $self->weight(200); 247 248 return $self; 249} 250 251sub AUTOLOAD { 252 my $self = shift; 253 my $attr = $AUTOLOAD; 254 $attr =~ s/.*:://; 255 $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; 256 $self->{$attr} = shift if @_; 257 return $self->{$attr}; 258} 259 260=head2 mcl_executable 261 262 Title : mcl_executable 263 Usage : $self->mcl_executable() 264 Function: get set for path to mcl executable 265 Returns : String or undef if not installed 266 Args : [optional] string of path to executable 267 [optional] boolean to warn on missing executable status 268 269=cut 270 271sub mcl_executable{ 272 my ($self,$exe,$warn) = @_; 273 274 if( defined $exe ) { 275 $self->{'_mcl_exe'} = $exe; 276 } 277 unless( defined $self->{'_mcl_exe'} ) { 278 # this would be the name set in the BEGIN block 279 if( $MCLPROGRAM && -e $MCLPROGRAM && -x $MCLPROGRAM ) { 280 $self->{'_mcl_exe'} = $MCLPROGRAM; 281 } else { 282 my $exe; 283 if( ( $exe = $self->io->exists_exe($MCLPROGRAM_NAME) ) && 284 -x $exe ) { 285 $self->{'_mcl_exe'} = $exe; 286 } else { 287 $self->warn("Cannot find executable for $MCLPROGRAM_NAME") if $warn; 288 $self->{'_mcl_exe'} = undef; 289 } 290 } 291 } 292 $self->{'_mcl_exe'}; 293} 294 295=head2 matrix_executable 296 297 Title : matrix_executable 298 Usage : $self->matrix_executable() 299 Function: get set for path to tribe-matrix executable 300 Returns : String or undef if not installed 301 Args : [optional] string of path to executable 302 [optional] boolean to warn on missing executable status 303 304=cut 305 306sub matrix_executable{ 307 my ($self,$exe,$warn) = @_; 308 309 if( defined $exe ) { 310 $self->{'_matrix_exe'} = $exe; 311 } 312 unless( defined $self->{'_matrix_exe'} ) { 313 # this would be the name set in the BEGIN block 314 if( $MATRIXPROGRAM && -e $MATRIXPROGRAM && -x $MATRIXPROGRAM ) { 315 $self->{'_matrix_exe'} = $MATRIXPROGRAM; 316 } else { 317 my $exe; 318 if( ( $exe = $self->io->exists_exe($MATRIXPROGRAM_NAME) ) && 319 -x $exe ) { 320 $self->{'_matrix_exe'} = $exe; 321 } else { 322 $self->warn("Cannot find executable for $MATRIXPROGRAM_NAME") 323 if $warn; 324 $self->{'_matrix_exe'} = undef; 325 } 326 } 327 } 328 $self->{'_matrix_exe'}; 329} 330 331=head2 run 332 333 Title : run 334 Usage : $self->run() 335 Function: runs the clustering 336 Returns : Array Ref of clustered Ids 337 Args : 338 339=cut 340 341sub run { 342 my ($self,$input) = @_; 343 if($self->description_file){ 344 $self->_setup_description($self->description_file); 345 } 346 my $file = $self->_setup_input($input); 347 defined($file) || $self->throw("Error setting up input "); 348 #run tribe-matrix to generate matrix for mcl 349 my ($index_file, $mcl_infile) = $self->_run_matrix($file); 350 $self->throw("tribe-matrix not run properly as index file is missing") 351 unless (-e $index_file); 352 $self->throw("tribe-matrix not run properly as matrix file is missing") 353 unless (-e $mcl_infile); 354 #run mcl 355 my $clusters = $self->_run_mcl($index_file,$mcl_infile); 356 my $families; 357 if($self->description){ 358 my %consensus = $self->_consensifier($clusters); 359 $families = $self->_generate_families($clusters,\%consensus); 360 } 361 else { 362 $families = $self->_generate_families($clusters); 363 } 364 365 return @{$families}; 366} 367 368sub _generate_families { 369 my ($self,$clusters,$consensus) = @_; 370 my $family_tag = $self->family_tag || "TribeFamily"; 371 my @fam; 372 if($consensus){ 373 my %description = %{$self->description}; 374 my %consensus = %{$consensus}; 375 for(my $i = 0; $i < scalar(@{$clusters}); $i++){ 376 my @mem; 377 foreach my $member (@{$clusters->[$i]}){ 378 my $mem = Bio::Seq->new(-id=>$member, 379 -alphabet=>"protein", 380 -desc=>$description{$member}->[1]); 381 my $annot = Bio::Annotation::DBLink->new(-database=>$description{$member}->[0], 382 -primary_id=>$member); 383 384 $mem->annotation->add_Annotation('dblink',$annot); 385 386 #store species information 387 my $taxon_str = $description{$member}->[2]; 388 #parse taxon info into hash 389 my %taxon; 390 $taxon_str=~s/=;/=undef;/g if $taxon_str; 391 %taxon = map{split '=',$_}split';',$taxon_str if $taxon_str; 392 my $name = $taxon{'taxon_common_name'}; 393 my @classification = $taxon{'taxon_classification'} ? split(':',$taxon{'taxon_classification'}) : (); 394 my $tax_id = $taxon{'taxon_id'}; 395 my $sub_species = $taxon{'taxon_sub_species'}; 396 397 my $species = Bio::Species->new(); 398 $species->common_name($name) if $name; #*** should this actually be scientific_name() ? 399 $species->sub_species($sub_species) if $sub_species; 400 $species->ncbi_taxid($tax_id) if $tax_id; 401 $species->classification(@classification) if @classification; 402 $mem->species($species); 403 404 push @mem, $mem; 405 } 406 my $id = $family_tag."_".$i; 407 my $fam = Bio::Cluster::SequenceFamily->new(-family_id=>$id, 408 -description=>$consensus{$i}{desc}, 409 -annotation_score=>$consensus{$i}{conf}, 410 -members=>\@mem); 411 push @fam, $fam; 412 } 413 return \@fam; 414 } 415 else { 416 for(my $i = 0; $i < scalar(@{$clusters}); $i++){ 417 my @mem; 418 foreach my $member (@{$clusters->[$i]}){ 419 my $mem = Bio::Seq->new(-id=>$member, 420 -alphabet=>"protein"); 421 push @mem, $mem; 422 } 423 my $id = $family_tag."_".$i; 424 my $fam = Bio::Cluster::SequenceFamily->new(-family_id=>$id, 425 -members=>\@mem); 426 push @fam, $fam; 427 } 428 return \@fam; 429 } 430 431} 432 433 434sub _consensifier { 435 my ($self,$clusters) = @_; 436 eval { 437 require "Algorithm/Diff.pm"; 438 }; 439 if($@){ 440 $self->warn("Algorith::Diff is needed to run TribeMCL"); 441 return undef; 442 } 443 my %description = %{$self->description}; 444 my %consensus; 445 my $best_annotation; 446 my %use_db; 447 if($self->use_db){ 448 foreach my $key(split(',',$self->use_db)){ 449 $use_db{$key}++; 450 } 451 } 452CLUSTER: 453 for(my $i = 0; $i < scalar(@{$clusters}); $i++){ 454 my @desc; 455 my @orig_desc; 456 my $total_members = scalar(@{$clusters->[$i]}); 457 458 foreach my $member(@{$clusters->[$i]}){ 459 #if specify which dbs to use for consensifying 460 if($self->use_db){ 461 if($use_db{$description{$member}->[0]}){ 462 push @desc, $description{$member}->[1] if $description{$member}->[1]; 463 push @orig_desc, $description{$member}->[1] if $description{$member}->[1]; 464 } 465 } 466 else { 467 push @desc, $description{$member}->[1] if $description{$member}->[1]; 468 push @orig_desc, $description{$member}->[1] if $description{$member}->[1]; 469 } 470 471 } 472 if($#desc < 0){ #truly unknown 473 $consensus{$i}{desc} = "UNKNOWN"; 474 $consensus{$i}{conf} = 0; 475 next CLUSTER; 476 } 477 if($#desc == 0){#only a single description 478 $consensus{$i}{desc} = grep(/S+/,@desc); 479 $consensus{$i}{desc} = $consensus{$i}{desc} || "UNKNOWN"; 480 if ($consensus{$i}{desc} eq "UNKNOWN") { 481 $consensus{$i}{conf} = 0; 482 } else { 483 $consensus{$i}{conf} = 100 * int(1/$total_members); 484 } 485 next CLUSTER; 486 } 487 488 #all the same desc 489 my %desc = (); 490 foreach my $desc (@desc) { $desc{$desc}++; } 491 if ( (keys %desc) == 1 ) { 492 my ($best_annotation,) = keys %desc; 493 my $n = grep($_ eq $best_annotation, @desc); 494 my $perc= int( 100*($n/$total_members) ); 495 $consensus{$i}{desc} = $best_annotation; 496 $consensus{$i}{conf} = $perc; 497 next CLUSTER; 498 } 499 500 my %lcshash = (); 501 my %lcnext = (); 502 while (@desc) { 503 # do an all-against-all LCS (longest commong substring) of the 504 # descriptions of all members; take the resulting strings, and 505 # again do an all-against-all LCS on them, until we have nothing 506 # left. The LCS's found along the way are in lcshash. 507 # 508 # Incidentally, longest common substring is a misnomer, since it 509 # is not guaranteed to occur in either of the original strings. It 510 # is more like the common parts of a Unix diff ... 511 for (my $i=0;$i<@desc;$i++) { 512 for (my $j=$i+1;$j<@desc;$j++){ 513 my @list1=split(" ",$desc[$i]); 514 my @list2=split(" ",$desc[$j]); 515 my @lcs=Algorithm::Diff::LCS(\@list1,\@list2); 516 my $lcs=join(" ",@lcs); 517 $lcshash{$lcs}=1; 518 $lcnext{$lcs}=1; 519 } 520 } 521 @desc=keys(%lcnext); 522 undef %lcnext; 523 } 524 my ($best_score, $best_perc)=(0, 0); 525 my @all_cands=sort {length($b) <=>length($a)} keys %lcshash ; 526 foreach my $candidate_consensus (@all_cands) { 527 my @temp=split(" ",$candidate_consensus); 528 my $length=@temp; # num of words in annotation 529 530 # see how many members of cluster contain this LCS: 531 532 my ($lcs_count)=0; 533 foreach my $orig_desc (@orig_desc) { 534 my @list1=split(" ",$candidate_consensus); 535 my @list2=split(" ",$orig_desc); 536 my @lcs=Algorithm::Diff::LCS(\@list1,\@list2); 537 my $lcs=join(" ",@lcs); 538 539 if ($lcs eq $candidate_consensus || 540 index($orig_desc,$candidate_consensus) != -1 # addition; 541 # many good (single word) annotations fall out otherwise 542 ) { 543 $lcs_count++; 544 545 # Following is occurs frequently, as LCS is _not_ the longest 546 # common substring ... so we can't use the shortcut either 547 548 # if ( index($orig_desc,$candidate_consensus) == -1 ) { 549 # warn "lcs:'$lcs' eq cons:'$candidate_consensus' and 550 # orig:'$orig_desc', but index == -1\n" 551 # } 552 } 553 } 554 my $perc_with_desc=(($lcs_count/$total_members))*100; 555 my $perc=($lcs_count/$total_members)*100; 556 my $score=$perc + ($length*14); # take length into account as well 557 $score = 0 if $length==0; 558 if (($perc_with_desc >= 40) && ($length >= 1)) { 559 if ($score > $best_score) { 560 $best_score=$score; 561 $best_perc=$perc; 562 $best_annotation=$candidate_consensus; 563 } 564 } 565 } 566 if ($best_perc==0 || $best_perc >= 100 ) { 567 $best_perc='NA'; 568 } 569 if ($best_annotation eq '') { 570 $best_annotation = 'AMBIGUOUS'; 571 } 572 $consensus{$i}{desc} = $best_annotation; 573 $consensus{$i}{conf} = $best_perc; 574 } 575 return %consensus; 576} 577 578sub _setup_description { 579 my ($self,$file) = @_; 580 my $goners='().-'; 581 my $spaces= ' ' x length($goners); 582 my $filter = "tr '$goners' '$spaces' < $file"; 583 open (FILE,"$filter | ") || die "$filter: $!"; 584 my %description; 585 while(<FILE>){ 586 chomp; 587 my ($db,$acc,$description,$taxon_str) = split("\t",$_); 588 $description || $self->throw("Wrongly formated description file"); 589 $description = $self->_apply_edits($description); 590 591 if($description{$acc}){ 592 $self->warn("Duplicated entry $acc in description file, overwriting.."); 593 } 594 $description{$acc} = [$db,$description,$taxon_str]; 595 } 596 $self->description(\%description); 597} 598 599sub as_words { 600 #add ^ and $ to regexp 601 my (@words); 602 my @newwords=(); 603 604 foreach my $word (@words) { push @newwords, "^$word\$" }; 605} 606 607sub _apply_edits { 608 my ($self,$desc) = @_; 609 my @deletes = ( 'FOR\$', 'SIMILAR TO\$', 'SIMILAR TO PROTEIN\$', 610 'RIKEN.*FULL.*LENGTH.*ENRICHED.*LIBRARY', 611 '\w*\d{4,}','HYPOTHETICAL PROTEIN' 612 ); 613 my @newwords = &as_words(qw(NOVEL PUTATIVE PREDICTED 614 UNNAMED UNNMAED ORF CLONE MRNA 615 CDNA EST RIKEN FIS KIAA\d+ \S+RIK IMAGE HSPC\d+ 616 FOR HYPOTETICAL HYPOTHETICAL)); 617 push @deletes, @newwords; 618 619 foreach my $re ( @deletes ) { $desc=~s/$re//g; } 620 621 #Apply some fixes to the annotation: 622 $desc=~s/EC (\d+) (\d+) (\d+) (\d+)/EC $1.$2.$3.$4/; 623 $desc=~s/EC (\d+) (\d+) (\d+)/EC $1.$2.$3.-/; 624 $desc=~s/EC (\d+) (\d+)/EC $1\.$2.-.-/; 625 $desc=~s/(\d+) (\d+) KDA/$1.$2 KDA/; 626 return $desc; 627} 628 629=head2 _run_mcl 630 631 Title : _run_mcl 632 Usage : $self->_run_mcl() 633 Function: internal function for running the mcl program 634 Returns : Array Ref of clustered Ids 635 Args : Index_file name, matrix input file name 636 637=cut 638 639sub _run_mcl { 640 my ($self,$ind_file,$infile) = @_; 641 my $exe = $self->mcl_executable || $self->throw("mcl not found."); 642 my $cmd = $exe . " $infile"; 643 unless (defined $self->o) { 644 my ($fh,$o) = $self->io->tempfile(-dir=>$self->tempdir); 645 $self->o($o); 646 # file handle not use later so deleted 647 close($fh); 648 undef $fh; 649 } 650 unless (defined $self->I) { 651 $self->I(3.0); 652 } 653 654 foreach my $param (@MCLPROGRAM_PARAMS) { 655 if (defined $self->$param) { 656 $cmd .= " -$param ".$self->$param; 657 } 658 } 659 if($self->quiet || 660 ($self->verbose < 0)){ 661 $cmd .= " -V all"; 662 if( $^O !~ /Mac/) { 663 my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; 664 $cmd .= " 2> $null"; 665 } 666 } 667 668 my $status = system($cmd); 669 670 $self->throw( "mcl call ($cmd) crashed: $? \n") unless $status==0; 671 my $families = $self->_parse_mcl($ind_file,$self->o); 672 return $families; 673} 674 675=head2 _run_matrix 676 677 Title : _run_matrix 678 Usage : $self->_run_matrix() 679 Function: internal function for running the tribe-matrix program 680 Returns : index filepath and matrix file path 681 Args : filepath of parsed ids and scores 682 683=cut 684 685sub _run_matrix { 686 my ($self,$parse_file) = @_; 687 my $exe = $self->matrix_executable || $self->throw("tribe-matrix not found."); 688 my $cmd = $exe . " $parse_file"; 689 unless (defined $self->ind) { 690 my ($fh,$indexfile) = $self->io->tempfile(-dir=>$self->tempdir); 691 $self->ind($indexfile); 692 # file handle not use later so deleted 693 close($fh); 694 undef $fh; 695 } 696 unless (defined $self->out) { 697 my ($fh,$matrixfile) = $self->io->tempfile(-dir=>$self->tempdir); 698 $self->out($matrixfile); 699 # file handle not use later so deleted 700 close($fh); 701 undef $fh; 702 } 703 foreach my $param (@MATRIXPROGRAM_PARAMS) { 704 if (defined $self->$param) { 705 $cmd .= " -$param ".$self->$param; 706 } 707 } 708 my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; 709 $cmd .= " > $null"; 710 my $status = system($cmd); 711 712 $self->throw( "tribe-matrix call ($cmd) crashed: $? \n") unless $status==0; 713 714 return ($self->ind,$self->out); 715} 716 717=head2 _setup_input 718 719 Title : _setup_input 720 Usage : $self->_setup_input() 721 Function: internal function for running setting up the inputs 722 needed for running mcl 723 Returns : filepath of parsed ids and scores 724 Args : 725 726=cut 727 728sub _setup_input { 729 my ($self,$input) = @_; 730 my ($type,$rc); 731 732 my ($tfh,$outfile) = $self->io->tempfile(-dir=>$self->tempdir); 733 734 $type = $self->inputtype(); 735 if($type=~/scorefile/i){ 736 -e $self->scorefile || 737 $self->throw("Scorefile doesn't seem to exist or accessible"); 738 return $self->scorefile; 739 } 740 if($type =~/blastfile/i){ 741 $self->blastfile($input); 742 $rc = $self->_parse_blastfile($self->blastfile,$tfh); 743 } 744 elsif($type=~/searchio/i){ 745 $self->searchio($input); 746 $rc = $self->_get_from_searchio($self->searchio,$tfh); 747 } 748 elsif($type=~/pairs/i) { 749 $self->pairs($input); 750 for my $line (@{ $self->pairs }){ 751 print $tfh join("\t",@{$line}), "\n"; 752 $rc++; 753 } 754 } 755 elsif($type =~/hsp/i) { 756 $self->hsp($input); 757 $rc = $self->_get_from_hsp($self->hsp,$tfh); 758 } 759 elsif($type=~/hit/i){ 760 $self->hit($input); 761 $rc = $self->_get_from_hit($self->hit,$tfh); 762 } 763 else { 764 $self->throw("Must set inputtype to either blastfile,searchio or ". 765 "paris using \$fact->blastfile |\$fact->searchio| \$fact->pairs"); 766 } 767 unless ( $rc ) { 768 $self->throw("Need inputs for running tribe mcl, nothing provided"); 769 } 770 close($tfh); 771 $tfh= undef; 772 return $outfile; 773} 774 775=head2 _get_from_hsp 776 777 Title : _get_from_hsp 778 Usage : $self->_get_from_hsp() 779 Function: internal function for getting blast scores from hsp 780 Returns : array ref to ids and score [protein1 protein2 magnitude factor] 781 Args : L<Bio::Search::HSP::GenericHSP> 782 783=cut 784 785sub _get_from_hsp { 786 my ($self,$hsp,$tfh) = @_; 787 my @array; 788 my $count; 789 foreach my $pair (@{$hsp}){ 790 my $sig = $pair->score; 791 $sig =~ s/^e-/1e-/g; 792 my $expect=sprintf("%e",$sig); 793 if ($expect==0){ 794 my $wt = $self->weight; 795 $expect=sprintf("%e","1e-$wt"); 796 } 797 my $first=(split("e-",$expect))[0]; 798 my $second=(split("e-",$expect))[1]; 799 800 print $tfh join("\t", $pair->feature1->seq_id, 801 $pair->feature2->seq_id,int($first), 802 int($second) ), "\n"; 803 $count++; 804 } 805 return $count; 806} 807 808sub _get_from_hit { 809 my ($self,$hit,$tfh) = @_; 810 my $count; 811 foreach my $pair(@{$hit}){ 812 my $sig = $pair->raw_score; 813 $sig =~s/^e-/1e-/g; 814 my $expect = sprintf("%e",$sig); 815 if ($expect==0){ 816 my $wt = $self->weight; 817 $expect=sprintf("%e","1e-$wt"); 818 } 819 my $first=(split("e-",$expect))[0]; 820 my $second=(split("e-",$expect))[1]; 821 print $tfh join("\t",$pair->name,$pair->description,int($first),int($second)),"\n"; 822 $count++; 823 } 824 return $count; 825} 826 827 828=head2 _get_from_searchio 829 830 Title : _get_from_searchio 831 Usage : $self->_get_from_searchio() 832 Function: internal function for parsing blast scores from searchio object 833 Returns : array ref to ids and score [protein1 protein2 magnitude factor] 834 Args : L<Bio::Tools::SearchIO> 835 836=cut 837 838sub _get_from_searchio { 839 my ($self,$sio,$tfh) = @_; 840 my @array; 841 my $count; 842 while( my $result = $sio->next_result ) { 843 while( my $hit = $result->next_hit ) { 844 while( my $hsp = $hit->next_hsp ) { 845 my $sig = $hsp->evalue; 846 $sig =~ s/^e-/1e-/g; 847 my $expect=sprintf("%e",$sig); 848 if ($expect==0){ 849 my $wt = $self->weight; 850 $expect=sprintf("%e","1e-$wt"); 851 } 852 my $first=(split("e-",$expect))[0]; 853 my $second=(split("e-",$expect))[1]; 854 print $tfh join("\t", 855 $hsp->feature1->seq_id, 856 $hsp->feature2->seq_id, 857 int($first), 858 int($second) ), "\n"; 859 860 $count++; 861 } 862 } 863 } 864 return $count; 865} 866 867=head2 _parse_blastfile 868 869 Title : _parse_blastfile 870 Usage : $self->_parse_blastfile() 871 Function: internal function for quickly parsing blast evalue 872 scores from raw blast output file 873 Returns : array ref to ids and score [protein1 protein2 magnitude factor] 874 Args : file path 875 876=cut 877 878sub _parse_blastfile { 879 my ($self, $file,$tfh) = @_; 880 open(FILE,$file) || $self->throw("Cannot open Blast Output File"); 881 my ($query,$reference,$first,$second); 882 my @array; 883 my $count; 884 my $weight = $self->weight; 885 while(<FILE>){ 886 if(/Query=\s+(\S+)/){ 887 $query = $1; 888 } 889 if(/^>(\S+)/){ 890 $reference = $1; 891 } 892 if (/Expect = (\S+)/){ 893 my $raw=$1; 894 $raw=~s/^e-/1e-/g; 895 my $expect=sprintf("%e",$raw); 896 if ($expect==0){ 897 $expect=sprintf("%e","1e-$weight"); 898 } 899 $first=(split("e-",$expect))[0]; 900 $second=(split("e-",$expect))[1]; 901 print $tfh join("\t", $query, 902 $reference, 903 int($first), 904 int($second)), "\n"; 905 $count++; 906 } 907 } 908 return $count; 909} 910 911=head2 _parse_mcl 912 913 Title : _parse_mcl 914 Usage : $self->_parse_mcl() 915 Function: internal function for quickly parsing mcl output and 916 generating the array of clusters 917 Returns : Array Ref of clustered Ids 918 Args : index file path, mcl output file path 919 920=cut 921 922sub _parse_mcl { 923 my ($self,$ind,$mcl) = @_; 924 open (MCL,$mcl) || $self->throw("Error, cannot open $mcl for parsing"); 925 my $i =-1; 926 my $start; 927 my (@cluster,@out); 928 while(<MCL>) { 929 if ($start) { 930 chomp($_); 931 $cluster[$i] = join(" ",$cluster[$i],"$_"); 932 } 933 if(/^\d+/){ 934 $start = 1; 935 $i++; 936 $cluster[$i] = join(" ",$cluster[$i] || '',"$_"); 937 } 938 if (/\$$/){ 939 $start = 0; 940 } 941 last if /^\(mclruninfo/; 942 } 943 open (IND,$ind) || $self->throw("Cannot open $ind for parsing"); 944 my %hash; 945 while(<IND>){ 946 /^(\S+)\s+(\S+)/; 947 $hash{$1}=$2; 948 } 949 950 for (my $j=0;$j<$i+1;$j++) { 951 my @array=split(" ",$cluster[$j]); 952 for (my $p=1;$p<$#array;$p++){ 953 push @{$out[$array[0]]}, $hash{$array[$p]}; 954 } 955 } 956 return \@out; 957} 958 959 9601; 961