1# BioPerl module for TribeMCL
2#
3# Please direct questions and support issues to <bioperl-l@bioperl.org>
4#
5# Cared for by Shawn Hoon <shawnh@fugu-sg.org>
6#
7# Copyright Shawn Hoon
8#
9# You may distribute this module under the same terms as perl itself
10
11# POD documentation - main docs before the code
12
13=head1 NAME
14
15Bio::Tools::Run::TribeMCL
16
17=head1 SYNOPSIS
18
19  use Bio::Tools::Run::TribeMCL;
20  use Bio::SearchIO;
21
22  # 3 methods to input the blast results
23
24  # straight forward raw blast output (NCBI or WU-BLAST)
25  my @params = ('inputtype'=>'blastfile');
26
27  # OR
28
29  # markov program format
30  # protein_id1 protein_id2 evalue_magnitude evalue_factor
31  # for example:
32  # proteins ENSP00000257547  and ENSP00000261659
33  # with a blast score evalue of 1e-50
34  # and proteins O42187 and ENSP00000257547
35  # with a blast score evalue of 1e-119
36  # entry would be
37
38  my @array  = [[qw(ENSP00000257547 ENSP00000261659 1 50)],
39		[qw(O42187 ENSP00000257547 1 119)]];
40  my @params = ('pairs'=>\@array,I=>'2.0');
41
42  # OR
43
44  # pass in a searchio object
45  # slowest of the 3 methods as it does more rigourous parsing
46  # than required for us here
47
48  my $sio = Bio::SearchIO->new(-format=>'blast',
49                               -file=>'blast.out');
50  my @params=('inputtype'=>'searchio',I=>'2.0');
51
52
53  # you can specify the path to the executable manually in the following way
54  my @params=('inputtype'=>'blastfile',I=>'2.0',
55	      'mcl'=>'/home/shawn/software/mcl-02-150/src/shmcl/mcl',
56	      'matrix'=>'/home/shawn/software/mcl-02-150/src/contrib/tribe/tribe-matrix');
57  my $fact = Bio::Tools::Run::TribeMCL->new(@params);
58
59  # OR
60
61  $fact->matrix_executable('/home/shawn/software/mcl-02-150/src/contrib/tribe/tribe-matrix');
62  $fact->mcl_executable('/home/shawn/software/mcl-02-150/src/shmcl/mcl');
63
64  # to run
65
66  my $fact = Bio::Tools::Run::TribeMCL->new(@params);
67
68  # Run the program
69  # returns an array reference to clusters where members are the ids
70  # for example :2 clusters with 3 members per cluster:
71  #  $fam = [ [mem1 mem2 mem3],[mem1 mem2 mem3]]
72
73  # pass in either the blastfile path/searchio obj/the array ref to scores
74  my $fam = $fact->run($sio);
75
76  # print out your clusters
77
78  for (my $i = 0; $i <scalar(@{$fam}); $i++){
79    print "Cluster $i \t ".scalar(@{$fam->[$i]})." members\n";
80    foreach my $member (@{$fam->[$i]}){
81      print "\t$member\n";
82    }
83  }
84
85=head1 DESCRIPTION
86
87TribeMCL is a method for clustering proteins into related groups,
88which are termed 'protein families'. This clustering is achieved by
89analysing similarity patterns between proteins in a given dataset, and
90using these patterns to assign proteins into related groups. In many
91cases, proteins in the same protein family will have similar
92functional properties.
93
94TribeMCL uses a novel clustering method (Markov Clustering or MCL)
95which solves problems which normally hinder protein sequence
96clustering.
97
98Reference:
99
100  Enright A.J., Van Dongen S., Ouzounis C.A; Nucleic Acids
101  Res. 30(7):1575-1584 (2002)
102
103You will need tribe-matrix (the program used to generate the matrix
104for input into mcl) and mcl (the clustering software) available at:
105
106  http://www.ebi.ac.uk/research/cgg/tribe/ or
107  http://micans.org/mcl/.
108
109Future Work in this module: Port the tribe-matrix program into perl so
110that we can enable have a SearchIO kinda module for reading and
111writing mcl data format
112
113=head1 FEEDBACK
114
115=head2 Mailing Lists
116
117User feedback is an integral part of the evolution of this and other
118Bioperl modules. Send your comments and suggestions preferably to one
119of the Bioperl mailing lists. Your participation is much appreciated.
120
121  bioperl-l@bioperl.org                  - General discussion
122  http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
123
124=head2 Support
125
126Please direct usage questions or support issues to the mailing list:
127
128I<bioperl-l@bioperl.org>
129
130rather than to the module maintainer directly. Many experienced and
131reponsive experts will be able look at the problem and quickly
132address it. Please include a thorough description of the problem
133with code and data examples if at all possible.
134
135=head2 Reporting Bugs
136
137Report bugs to the Bioperl bug tracking system to help us keep track
138the bugs and their resolution.  Bug reports can be submitted via the
139web:
140
141  http://redmine.open-bio.org/projects/bioperl/
142
143=head1 AUTHOR - Shawn Hoon
144
145Email shawnh@fugu-sg.org
146
147=head1 APPENDIX
148
149The rest of the documentation details each of the object
150methods. Internal methods are usually preceded with a "_".
151
152=cut
153
154
155# Let the code begin...
156package Bio::Tools::Run::TribeMCL;
157use vars qw($AUTOLOAD @ISA  $PROGRAMDIR
158            @TRIBEMCL_PARAMS @MATRIXPROGRAM_PARAMS @MCLPROGRAM_PARAMS
159            @OTHER_SWITCHES %OK_FIELD
160      	    $MATRIXPROGRAM_NAME $MCLPROGRAM_NAME
161      	    $MCLPROGRAM $MATRIXPROGRAM
162	    );
163use strict;
164
165use Bio::SeqIO;
166use Bio::Root::Root;
167use Bio::Root::IO;
168use Bio::Cluster::SequenceFamily;
169use Bio::Factory::ApplicationFactoryI;
170use Bio::Tools::Run::WrapperBase;
171use Bio::Annotation::DBLink;
172use Bio::Seq;
173use Bio::Species;
174
175@ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase);
176
177# You will need to enable mcl and tribe-matrix to use this wrapper. This
178# can be done in (at least) two ways:
179#
180# 1. define an environmental variable TRIBEDIR
181# export TRIBEDIR =/usr/local/share/mclbin/
182# where the tribe-matrix and mcl programs are located.
183#you probably need to copy them individually to the same directory
184#if the installation puts them in different directories. or simply put them in
185#your standard /usr/local/bin
186#
187# 2. include a definition of an environmental variable TRIBEDIR in
188# every script that will use TRIBEMCL.pm
189# $ENV{TRIBEDIR} = '/usr/local/share/mclbin/;
190#
191#3. Manually set the path to the executabes in your code:
192#
193
194#my @params=('inputtype'=>'blastfile',I=>'2.0','
195#                       mcl'=>'/home/shawn/software/mcl-02-150/src/shmcl/mcl','
196#                       matrix'=>'/home/shawn/software/mcl-02-150/src/contrib/tribe/tribe-matrix');
197#my $fact = Bio::Tools::Run::TribeMCL->new(@params);
198
199#or
200#$fact->matrix_executable('/home/shawn/software/mcl-02-150/src/contrib/tribe/tribe-matrix');
201#$fact->mcl_executable('/home/shawn/software/mcl-02-150/src/shmcl/mcl');
202
203
204BEGIN {
205    $MATRIXPROGRAM_NAME = 'tribe-matrix';
206    $MCLPROGRAM_NAME    = 'mcl';
207    if (defined $ENV{TRIBEDIR}) {
208        $PROGRAMDIR = $ENV{TRIBEDIR} || '';
209        $MCLPROGRAM = Bio::Root::IO->catfile($PROGRAMDIR,$MCLPROGRAM_NAME.($^O =~ /mswin/i ?'.exe':''));
210        $MATRIXPROGRAM = Bio::Root::IO->catfile($PROGRAMDIR,$MATRIXPROGRAM_NAME.($^O =~ /mswin/i ?'.exe':''));
211    }
212
213    @TRIBEMCL_PARAMS = qw(inputtype hsp hit scorefile blastfile description_file searchio pairs mcl matrix weight description family_tag use_db);
214
215    @MATRIXPROGRAM_PARAMS = qw(ind out chunk);
216    @MCLPROGRAM_PARAMS = qw(I t P R pct o);
217
218    @OTHER_SWITCHES = qw(verbose quiet);
219
220    # Authorize attribute fields
221    foreach my $attr (@TRIBEMCL_PARAMS, @MATRIXPROGRAM_PARAMS,
222                      @MCLPROGRAM_PARAMS, @OTHER_SWITCHES) {
223      $OK_FIELD{$attr}++;
224    }
225}
226
227sub new {
228  my ($class, @args) = @_;
229  my $self = $class->SUPER::new(@args);
230
231  my ($attr, $value);
232  while (@args) {
233    $attr =   shift @args;
234    $value =  shift @args;
235    next if( $attr =~ /^-/ ); # don't want named parameters
236    if ($attr =~/MCL/i) {
237      $self->mcl_executable($value);
238      next;
239    }
240    if ($attr =~ /MATRIX/i){
241      $self->matrix_executable($value);
242      next;
243    }
244    $self->$attr($value);
245  }
246  defined($self->weight) || $self->weight(200);
247
248  return $self;
249}
250
251sub AUTOLOAD {
252    my $self = shift;
253    my $attr = $AUTOLOAD;
254    $attr =~ s/.*:://;
255    $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr};
256    $self->{$attr} = shift if @_;
257    return $self->{$attr};
258}
259
260=head2 mcl_executable
261
262 Title   : mcl_executable
263 Usage   : $self->mcl_executable()
264 Function: get set for path to mcl executable
265 Returns : String or undef if not installed
266 Args    : [optional] string of path to executable
267           [optional] boolean to warn on missing executable status
268
269=cut
270
271sub mcl_executable{
272   my ($self,$exe,$warn) = @_;
273
274   if( defined $exe ) {
275       $self->{'_mcl_exe'} = $exe;
276   }
277   unless( defined $self->{'_mcl_exe'} ) {
278       # this would be the name set in the BEGIN block
279       if( $MCLPROGRAM && -e $MCLPROGRAM && -x $MCLPROGRAM ) {
280	   $self->{'_mcl_exe'} = $MCLPROGRAM;
281       } else {
282	   my $exe;
283	   if( ( $exe = $self->io->exists_exe($MCLPROGRAM_NAME) ) &&
284	       -x $exe ) {
285	       $self->{'_mcl_exe'} = $exe;
286	   } else {
287	       $self->warn("Cannot find executable for $MCLPROGRAM_NAME") if $warn;
288	       $self->{'_mcl_exe'} = undef;
289	   }
290       }
291   }
292   $self->{'_mcl_exe'};
293}
294
295=head2 matrix_executable
296
297 Title   : matrix_executable
298 Usage   : $self->matrix_executable()
299 Function: get set for path to tribe-matrix executable
300 Returns : String or undef if not installed
301 Args    : [optional] string of path to executable
302           [optional] boolean to warn on missing executable status
303
304=cut
305
306sub matrix_executable{
307   my ($self,$exe,$warn) = @_;
308
309   if( defined $exe ) {
310       $self->{'_matrix_exe'} = $exe;
311   }
312   unless( defined $self->{'_matrix_exe'} ) {
313       # this would be the name set in the BEGIN block
314       if( $MATRIXPROGRAM && -e $MATRIXPROGRAM && -x $MATRIXPROGRAM ) {
315	   $self->{'_matrix_exe'} = $MATRIXPROGRAM;
316       } else {
317	   my $exe;
318	   if( ( $exe = $self->io->exists_exe($MATRIXPROGRAM_NAME) ) &&
319	       -x $exe ) {
320	       $self->{'_matrix_exe'} = $exe;
321	   } else {
322	       $self->warn("Cannot find executable for $MATRIXPROGRAM_NAME")
323		   if $warn;
324	       $self->{'_matrix_exe'} = undef;
325	   }
326       }
327   }
328   $self->{'_matrix_exe'};
329}
330
331=head2 run
332
333 Title   : run
334 Usage   : $self->run()
335 Function: runs the clustering
336 Returns : Array Ref of clustered Ids
337 Args    :
338
339=cut
340
341sub run {
342  my ($self,$input) = @_;
343  if($self->description_file){
344      $self->_setup_description($self->description_file);
345  }
346  my $file = $self->_setup_input($input);
347  defined($file) || $self->throw("Error setting up input ");
348  #run tribe-matrix to generate matrix for mcl
349  my ($index_file, $mcl_infile) = $self->_run_matrix($file);
350  $self->throw("tribe-matrix not run properly as index file is missing")
351      unless (-e $index_file);
352  $self->throw("tribe-matrix not run properly as matrix file is missing")
353      unless (-e $mcl_infile);
354  #run mcl
355  my $clusters = $self->_run_mcl($index_file,$mcl_infile);
356  my $families;
357  if($self->description){
358    my %consensus = $self->_consensifier($clusters);
359    $families = $self->_generate_families($clusters,\%consensus);
360  }
361  else {
362    $families = $self->_generate_families($clusters);
363  }
364
365  return @{$families};
366}
367
368sub _generate_families {
369    my ($self,$clusters,$consensus) = @_;
370    my $family_tag = $self->family_tag || "TribeFamily";
371    my @fam;
372    if($consensus){
373      my %description = %{$self->description};
374      my %consensus = %{$consensus};
375      for(my $i = 0; $i < scalar(@{$clusters}); $i++){
376          my @mem;
377          foreach my $member (@{$clusters->[$i]}){
378              my $mem = Bio::Seq->new(-id=>$member,
379                                      -alphabet=>"protein",
380                                      -desc=>$description{$member}->[1]);
381              my $annot = Bio::Annotation::DBLink->new(-database=>$description{$member}->[0],
382                                                       -primary_id=>$member);
383
384              $mem->annotation->add_Annotation('dblink',$annot);
385
386              #store species information
387              my $taxon_str = $description{$member}->[2];
388              #parse taxon info into hash
389              my %taxon;
390              $taxon_str=~s/=;/=undef;/g if $taxon_str;
391              %taxon = map{split '=',$_}split';',$taxon_str if $taxon_str;
392              my $name = $taxon{'taxon_common_name'};
393              my @classification = $taxon{'taxon_classification'} ? split(':',$taxon{'taxon_classification'}) : ();
394              my $tax_id = $taxon{'taxon_id'};
395              my $sub_species = $taxon{'taxon_sub_species'};
396
397              my $species = Bio::Species->new();
398              $species->common_name($name) if $name; #*** should this actually be scientific_name() ?
399              $species->sub_species($sub_species) if $sub_species;
400              $species->ncbi_taxid($tax_id) if $tax_id;
401              $species->classification(@classification) if @classification;
402              $mem->species($species);
403
404              push @mem, $mem;
405          }
406          my $id = $family_tag."_".$i;
407          my $fam = Bio::Cluster::SequenceFamily->new(-family_id=>$id,
408                                              -description=>$consensus{$i}{desc},
409                                              -annotation_score=>$consensus{$i}{conf},
410                                              -members=>\@mem);
411          push @fam, $fam;
412      }
413      return \@fam;
414    }
415    else {
416        for(my $i = 0; $i < scalar(@{$clusters}); $i++){
417          my @mem;
418          foreach my $member (@{$clusters->[$i]}){
419              my $mem = Bio::Seq->new(-id=>$member,
420                                      -alphabet=>"protein");
421              push @mem, $mem;
422          }
423          my $id = $family_tag."_".$i;
424          my $fam = Bio::Cluster::SequenceFamily->new(-family_id=>$id,
425                                                      -members=>\@mem);
426          push @fam, $fam;
427      }
428     return \@fam;
429    }
430
431}
432
433
434sub _consensifier {
435    my ($self,$clusters) = @_;
436    eval {
437      require "Algorithm/Diff.pm";
438    };
439    if($@){
440      $self->warn("Algorith::Diff is needed to run TribeMCL");
441      return undef;
442    }
443    my %description = %{$self->description};
444    my %consensus;
445    my $best_annotation;
446    my %use_db;
447    if($self->use_db){
448      foreach my $key(split(',',$self->use_db)){
449        $use_db{$key}++;
450      }
451    }
452CLUSTER:
453    for(my $i = 0; $i < scalar(@{$clusters}); $i++){
454        my @desc;
455        my @orig_desc;
456        my $total_members = scalar(@{$clusters->[$i]});
457
458        foreach my $member(@{$clusters->[$i]}){
459          #if specify which dbs to use for consensifying
460          if($self->use_db){
461              if($use_db{$description{$member}->[0]}){
462                push @desc, $description{$member}->[1] if $description{$member}->[1];
463                push @orig_desc, $description{$member}->[1] if $description{$member}->[1];
464              }
465          }
466          else {
467            push @desc, $description{$member}->[1] if $description{$member}->[1];
468            push @orig_desc, $description{$member}->[1] if $description{$member}->[1];
469          }
470
471        }
472        if($#desc < 0){ #truly unknown
473            $consensus{$i}{desc} = "UNKNOWN";
474            $consensus{$i}{conf} = 0;
475            next CLUSTER;
476        }
477        if($#desc == 0){#only a single description
478            $consensus{$i}{desc} = grep(/S+/,@desc);
479            $consensus{$i}{desc} = $consensus{$i}{desc} || "UNKNOWN";
480	    if ($consensus{$i}{desc} eq "UNKNOWN") {
481	      $consensus{$i}{conf} = 0;
482	    } else {
483	      $consensus{$i}{conf} = 100 * int(1/$total_members);
484	    }
485            next CLUSTER;
486        }
487
488        #all the same desc
489        my %desc = ();
490        foreach my $desc (@desc) {        $desc{$desc}++;     }
491        if ( (keys %desc) == 1 ) {
492          my ($best_annotation,) = keys %desc;
493          my $n = grep($_ eq $best_annotation, @desc);
494          my $perc= int( 100*($n/$total_members) );
495          $consensus{$i}{desc} = $best_annotation;
496          $consensus{$i}{conf} = $perc;
497          next CLUSTER;
498        }
499
500        my %lcshash = ();
501        my %lcnext = ();
502        while (@desc) {
503          # do an all-against-all LCS (longest commong substring) of the
504          # descriptions of all members; take the resulting strings, and
505          # again do an all-against-all LCS on them, until we have nothing
506          # left. The LCS's found along the way are in lcshash.
507          #
508          # Incidentally, longest common substring is a misnomer, since it
509          # is not guaranteed to occur in either of the original strings. It
510          # is more like the common parts of a Unix diff ...
511          for (my $i=0;$i<@desc;$i++) {
512            for (my $j=$i+1;$j<@desc;$j++){
513                my @list1=split(" ",$desc[$i]);
514                my @list2=split(" ",$desc[$j]);
515                my @lcs=Algorithm::Diff::LCS(\@list1,\@list2);
516                my $lcs=join(" ",@lcs);
517                $lcshash{$lcs}=1;
518                $lcnext{$lcs}=1;
519            }
520          }
521          @desc=keys(%lcnext);
522          undef %lcnext;
523        }
524        my ($best_score, $best_perc)=(0, 0);
525        my @all_cands=sort {length($b) <=>length($a)} keys %lcshash ;
526        foreach my $candidate_consensus (@all_cands) {
527          my @temp=split(" ",$candidate_consensus);
528          my $length=@temp;               # num of words in annotation
529
530          # see how many members of cluster contain this LCS:
531
532          my ($lcs_count)=0;
533          foreach my $orig_desc (@orig_desc) {
534            my @list1=split(" ",$candidate_consensus);
535            my @list2=split(" ",$orig_desc);
536            my @lcs=Algorithm::Diff::LCS(\@list1,\@list2);
537            my $lcs=join(" ",@lcs);
538
539            if ($lcs eq $candidate_consensus ||
540                index($orig_desc,$candidate_consensus) != -1 # addition;
541                # many good (single word) annotations fall out otherwise
542               ) {
543                $lcs_count++;
544
545                # Following is occurs frequently, as LCS is _not_ the longest
546                # common substring ... so we can't use the shortcut either
547
548                # if ( index($orig_desc,$candidate_consensus) == -1 ) {
549                #   warn "lcs:'$lcs' eq cons:'$candidate_consensus' and
550                # orig:'$orig_desc', but index == -1\n"
551                # }
552            }
553          }
554          my $perc_with_desc=(($lcs_count/$total_members))*100;
555          my $perc=($lcs_count/$total_members)*100;
556          my $score=$perc + ($length*14); # take length into account as well
557          $score = 0 if $length==0;
558          if (($perc_with_desc >= 40) && ($length >= 1)) {
559            if ($score > $best_score) {
560                $best_score=$score;
561                $best_perc=$perc;
562                $best_annotation=$candidate_consensus;
563            }
564          }
565      }
566      if ($best_perc==0 || $best_perc >= 100 )  {
567        $best_perc='NA';
568      }
569      if  ($best_annotation eq  '')  {
570        $best_annotation = 'AMBIGUOUS';
571      }
572      $consensus{$i}{desc} = $best_annotation;
573      $consensus{$i}{conf} = $best_perc;
574  }
575  return %consensus;
576}
577
578sub _setup_description {
579    my ($self,$file) = @_;
580    my $goners='().-';
581    my $spaces= ' ' x length($goners);
582    my $filter = "tr '$goners' '$spaces' < $file";
583    open (FILE,"$filter | ") || die "$filter: $!";
584    my %description;
585    while(<FILE>){
586        chomp;
587        my ($db,$acc,$description,$taxon_str) = split("\t",$_);
588        $description || $self->throw("Wrongly formated description file");
589        $description = $self->_apply_edits($description);
590
591        if($description{$acc}){
592            $self->warn("Duplicated entry $acc in description file, overwriting..");
593        }
594        $description{$acc} = [$db,$description,$taxon_str];
595    }
596    $self->description(\%description);
597}
598
599sub as_words {
600    #add ^ and $ to regexp
601    my (@words);
602    my @newwords=();
603
604    foreach my $word (@words) { push @newwords, "^$word\$" };
605}
606
607sub _apply_edits {
608  my ($self,$desc) = @_;
609  my @deletes = ( 'FOR\$',  'SIMILAR TO\$', 'SIMILAR TO PROTEIN\$',
610               'RIKEN.*FULL.*LENGTH.*ENRICHED.*LIBRARY',
611               '\w*\d{4,}','HYPOTHETICAL PROTEIN'
612               );
613  my @newwords =  &as_words(qw(NOVEL PUTATIVE PREDICTED
614                               UNNAMED UNNMAED ORF CLONE MRNA
615                               CDNA EST RIKEN FIS KIAA\d+ \S+RIK IMAGE HSPC\d+
616                               FOR HYPOTETICAL HYPOTHETICAL));
617  push @deletes, @newwords;
618
619  foreach my $re ( @deletes ) { $desc=~s/$re//g; }
620
621  #Apply some fixes to the annotation:
622  $desc=~s/EC (\d+) (\d+) (\d+) (\d+)/EC $1.$2.$3.$4/;
623  $desc=~s/EC (\d+) (\d+) (\d+)/EC $1.$2.$3.-/;
624  $desc=~s/EC (\d+) (\d+)/EC $1\.$2.-.-/;
625  $desc=~s/(\d+) (\d+) KDA/$1.$2 KDA/;
626  return $desc;
627}
628
629=head2 _run_mcl
630
631 Title   : _run_mcl
632 Usage   : $self->_run_mcl()
633 Function: internal function for running the mcl program
634 Returns : Array Ref of clustered Ids
635 Args    : Index_file name, matrix input file name
636
637=cut
638
639sub _run_mcl {
640  my ($self,$ind_file,$infile) = @_;
641  my $exe = $self->mcl_executable || $self->throw("mcl not found.");
642  my $cmd = $exe . " $infile";
643  unless (defined $self->o) {
644    my ($fh,$o) = $self->io->tempfile(-dir=>$self->tempdir);
645    $self->o($o);
646    # file handle not use later so deleted
647    close($fh);
648    undef $fh;
649  }
650  unless (defined $self->I) {
651    $self->I(3.0);
652  }
653
654  foreach my $param (@MCLPROGRAM_PARAMS) {
655    if (defined $self->$param) {
656      $cmd .= " -$param ".$self->$param;
657    }
658  }
659  if($self->quiet ||
660     ($self->verbose < 0)){
661      $cmd .= " -V all";
662      if( $^O !~ /Mac/) {
663	  my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null';
664	  $cmd .= " 2> $null";
665      }
666  }
667
668  my $status = system($cmd);
669
670  $self->throw( "mcl  call ($cmd) crashed: $? \n") unless $status==0;
671  my $families = $self->_parse_mcl($ind_file,$self->o);
672  return $families;
673}
674
675=head2 _run_matrix
676
677 Title   : _run_matrix
678 Usage   : $self->_run_matrix()
679 Function: internal function for running the tribe-matrix program
680 Returns : index filepath and matrix file path
681 Args    : filepath of parsed ids and scores
682
683=cut
684
685sub _run_matrix {
686  my ($self,$parse_file) = @_;
687  my $exe = $self->matrix_executable || $self->throw("tribe-matrix not found.");
688  my $cmd = $exe . " $parse_file";
689  unless (defined $self->ind) {
690    my ($fh,$indexfile) = $self->io->tempfile(-dir=>$self->tempdir);
691    $self->ind($indexfile);
692    # file handle not use later so deleted
693    close($fh);
694    undef $fh;
695  }
696  unless (defined $self->out) {
697    my ($fh,$matrixfile) = $self->io->tempfile(-dir=>$self->tempdir);
698    $self->out($matrixfile);
699    # file handle not use later so deleted
700    close($fh);
701    undef $fh;
702  }
703  foreach my $param (@MATRIXPROGRAM_PARAMS) {
704    if (defined $self->$param) {
705      $cmd .= " -$param ".$self->$param;
706    }
707  }
708  my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null';
709  $cmd .= " > $null";
710  my $status = system($cmd);
711
712  $self->throw( "tribe-matrix call ($cmd) crashed: $? \n") unless $status==0;
713
714  return ($self->ind,$self->out);
715}
716
717=head2 _setup_input
718
719 Title   : _setup_input
720 Usage   : $self->_setup_input()
721 Function: internal function for running setting up the inputs
722            needed for running mcl
723 Returns : filepath of parsed ids and scores
724 Args    :
725
726=cut
727
728sub _setup_input {
729    my ($self,$input) = @_;
730    my ($type,$rc);
731
732    my ($tfh,$outfile) = $self->io->tempfile(-dir=>$self->tempdir);
733
734    $type = $self->inputtype();
735    if($type=~/scorefile/i){
736        -e $self->scorefile ||
737            $self->throw("Scorefile doesn't seem to exist or accessible");
738        return $self->scorefile;
739    }
740    if($type =~/blastfile/i){
741	    $self->blastfile($input);
742	    $rc = $self->_parse_blastfile($self->blastfile,$tfh);
743    }
744    elsif($type=~/searchio/i){
745	    $self->searchio($input);
746	    $rc = $self->_get_from_searchio($self->searchio,$tfh);
747    }
748    elsif($type=~/pairs/i) {
749	    $self->pairs($input);
750	    for my $line (@{ $self->pairs }){
751	      print $tfh join("\t",@{$line}), "\n";
752	      $rc++;
753	    }
754    }
755    elsif($type =~/hsp/i) {
756	    $self->hsp($input);
757	    $rc = $self->_get_from_hsp($self->hsp,$tfh);
758    }
759    elsif($type=~/hit/i){
760      $self->hit($input);
761      $rc = $self->_get_from_hit($self->hit,$tfh);
762    }
763    else {
764        $self->throw("Must set inputtype to either blastfile,searchio or ".
765                     "paris using \$fact->blastfile |\$fact->searchio| \$fact->pairs");
766    }
767    unless ( $rc ) {
768	    $self->throw("Need inputs for running tribe mcl, nothing provided");
769    }
770    close($tfh);
771    $tfh= undef;
772    return $outfile;
773}
774
775=head2 _get_from_hsp
776
777 Title   : _get_from_hsp
778 Usage   : $self->_get_from_hsp()
779 Function: internal function for getting blast scores from hsp
780 Returns : array ref to ids and score [protein1 protein2 magnitude factor]
781 Args    : L<Bio::Search::HSP::GenericHSP>
782
783=cut
784
785sub _get_from_hsp {
786    my ($self,$hsp,$tfh) = @_;
787    my @array;
788    my $count;
789    foreach my $pair (@{$hsp}){
790        my $sig = $pair->score;
791        $sig =~ s/^e-/1e-/g;
792        my $expect=sprintf("%e",$sig);
793        if ($expect==0){
794          my $wt = $self->weight;
795          $expect=sprintf("%e","1e-$wt");
796        }
797        my $first=(split("e-",$expect))[0];
798        my $second=(split("e-",$expect))[1];
799
800	print $tfh join("\t", $pair->feature1->seq_id,
801			$pair->feature2->seq_id,int($first),
802			int($second) ), "\n";
803	$count++;
804    }
805    return $count;
806}
807
808sub _get_from_hit {
809    my ($self,$hit,$tfh) = @_;
810    my $count;
811    foreach my $pair(@{$hit}){
812        my $sig = $pair->raw_score;
813        $sig =~s/^e-/1e-/g;
814        my $expect = sprintf("%e",$sig);
815        if ($expect==0){
816          my $wt = $self->weight;
817          $expect=sprintf("%e","1e-$wt");
818        }
819        my $first=(split("e-",$expect))[0];
820        my $second=(split("e-",$expect))[1];
821        print $tfh join("\t",$pair->name,$pair->description,int($first),int($second)),"\n";
822        $count++;
823    }
824    return $count;
825}
826
827
828=head2 _get_from_searchio
829
830 Title   : _get_from_searchio
831 Usage   : $self->_get_from_searchio()
832 Function: internal function for parsing blast scores from searchio object
833 Returns : array ref to ids and score [protein1 protein2 magnitude factor]
834 Args    :  L<Bio::Tools::SearchIO>
835
836=cut
837
838sub _get_from_searchio {
839  my ($self,$sio,$tfh) = @_;
840  my @array;
841  my $count;
842  while( my $result = $sio->next_result ) {
843    while( my $hit = $result->next_hit ) {
844      while( my $hsp = $hit->next_hsp ) {
845        my $sig = $hsp->evalue;
846        $sig =~ s/^e-/1e-/g;
847        my $expect=sprintf("%e",$sig);
848        if ($expect==0){
849          my $wt = $self->weight;
850          $expect=sprintf("%e","1e-$wt");
851        }
852        my $first=(split("e-",$expect))[0];
853        my $second=(split("e-",$expect))[1];
854        print $tfh join("\t",
855			$hsp->feature1->seq_id,
856			$hsp->feature2->seq_id,
857			int($first),
858			int($second) ), "\n";
859
860	$count++;
861      }
862    }
863  }
864  return $count;
865}
866
867=head2 _parse_blastfile
868
869 Title   : _parse_blastfile
870 Usage   : $self->_parse_blastfile()
871 Function: internal function for quickly parsing blast evalue
872           scores from raw blast output file
873 Returns : array ref to ids and score [protein1 protein2 magnitude factor]
874 Args    :  file path
875
876=cut
877
878sub _parse_blastfile {
879  my ($self, $file,$tfh) = @_;
880  open(FILE,$file) || $self->throw("Cannot open Blast Output File");
881  my ($query,$reference,$first,$second);
882  my @array;
883  my $count;
884  my $weight = $self->weight;
885  while(<FILE>){
886    if(/Query=\s+(\S+)/){
887      $query = $1;
888    }
889    if(/^>(\S+)/){
890      $reference = $1;
891    }
892    if (/Expect = (\S+)/){
893      my $raw=$1;
894      $raw=~s/^e-/1e-/g;
895      my $expect=sprintf("%e",$raw);
896      if ($expect==0){
897        $expect=sprintf("%e","1e-$weight");
898			}
899      $first=(split("e-",$expect))[0];
900      $second=(split("e-",$expect))[1];
901      print $tfh join("\t", $query,
902		      $reference,
903		      int($first),
904		      int($second)), "\n";
905      $count++;
906    }
907  }
908  return $count;
909}
910
911=head2 _parse_mcl
912
913 Title   : _parse_mcl
914 Usage   : $self->_parse_mcl()
915 Function: internal function for quickly parsing mcl output and
916           generating the array of clusters
917 Returns : Array Ref of clustered Ids
918 Args    :  index file path, mcl output file path
919
920=cut
921
922sub _parse_mcl {
923  my ($self,$ind,$mcl) = @_;
924  open (MCL,$mcl) || $self->throw("Error, cannot open $mcl for parsing");
925  my $i =-1;
926  my $start;
927  my (@cluster,@out);
928  while(<MCL>) {
929      if ($start) {
930	  chomp($_);
931	  $cluster[$i] = join(" ",$cluster[$i],"$_");
932      }
933      if(/^\d+/){
934	  $start = 1;
935	  $i++;
936	  $cluster[$i] = join(" ",$cluster[$i] || '',"$_");
937      }
938      if (/\$$/){
939	  $start = 0;
940      }
941      last if /^\(mclruninfo/;
942  }
943  open (IND,$ind) || $self->throw("Cannot open $ind for parsing");
944  my %hash;
945  while(<IND>){
946      /^(\S+)\s+(\S+)/;
947      $hash{$1}=$2;
948  }
949
950  for (my $j=0;$j<$i+1;$j++)	{
951      my @array=split(" ",$cluster[$j]);
952      for (my $p=1;$p<$#array;$p++){
953	  push @{$out[$array[0]]}, $hash{$array[$p]};
954      }
955  }
956  return \@out;
957}
958
959
9601;
961