1#
2# BioPerl module for Bio::Tree::DistanceFactory
3#
4# Please direct questions and support issues to <bioperl-l@bioperl.org>
5#
6# Cared for by Jason Stajich <jason-at-bioperl.org>
7#
8# Copyright Jason Stajich
9#
10# You may distribute this module under the same terms as perl itself
11
12# POD documentation - main docs before the code
13
14=head1 NAME
15
16Bio::Tree::DistanceFactory - Construct a tree using distance based methods
17
18=head1 SYNOPSIS
19
20  use Bio::Tree::DistanceFactory;
21  use Bio::AlignIO;
22  use Bio::Align::DNAStatistics;
23  my $tfactory = Bio::Tree::DistanceFactory->new(-method => "NJ");
24  my $stats    = Bio::Align::DNAStatistics->new();
25
26  my $alnin    = Bio::AlignIO->new(-format => 'clustalw',
27                                   -file   => 'file.aln');
28  my $aln = $alnin->next_aln;
29  # Of course matrix can come from a different place
30  # like PHYLIP if you prefer, Bio::Matrix::IO should be able
31  # to parse many things
32  my $jcmatrix = $stats->distance(-align => $aln,
33                                  -method => 'Jukes-Cantor');
34  my $tree = $tfactory->make_tree($jcmatrix);
35
36
37=head1 DESCRIPTION
38
39This is a factory which will construct a phylogenetic tree based on
40the pairwise sequence distances for a set of sequences.  Currently
41UPGMA (Sokal and Michener 1958) and NJ (Saitou and Nei 1987) tree
42construction methods are implemented.
43
44=head1 REFERENCES
45
46Eddy SR, Durbin R, Krogh A, Mitchison G, (1998) "Biological Sequence Analysis",
47Cambridge Univ Press, Cambridge, UK.
48
49Howe K, Bateman A, Durbin R, (2002) "QuickTree: building huge
50Neighbour-Joining trees of protein sequences." Bioinformatics
5118(11):1546-1547.
52
53Saitou N and Nei M, (1987) "The neighbor-joining method: a new method
54for reconstructing phylogenetic trees." Mol Biol Evol 4(4):406-25.
55
56=head1 FEEDBACK
57
58=head2 Mailing Lists
59
60User feedback is an integral part of the evolution of this and other
61Bioperl modules. Send your comments and suggestions preferably to
62the Bioperl mailing list.  Your participation is much appreciated.
63
64  bioperl-l@bioperl.org                  - General discussion
65  http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
66
67=head2 Support
68
69Please direct usage questions or support issues to the mailing list:
70
71I<bioperl-l@bioperl.org>
72
73rather than to the module maintainer directly. Many experienced and
74reponsive experts will be able look at the problem and quickly
75address it. Please include a thorough description of the problem
76with code and data examples if at all possible.
77
78=head2 Reporting Bugs
79
80Report bugs to the Bioperl bug tracking system to help us keep track
81of the bugs and their resolution. Bug reports can be submitted the web:
82
83  https://github.com/bioperl/bioperl-live/issues
84
85=head1 AUTHOR - Jason Stajich
86
87Email jason-at-bioperl.org
88
89=head1 APPENDIX
90
91The rest of the documentation details each of the object methods.
92Internal methods are usually preceded with a _
93
94=cut
95
96package Bio::Tree::DistanceFactory;
97$Bio::Tree::DistanceFactory::VERSION = '1.7.7';
98use vars qw($DefaultMethod $Precision);
99use strict;
100
101# some defaults
102$DefaultMethod = 'UPGMA';
103$Precision = 5;
104
105use Bio::Tree::Node;
106use Bio::Tree::Tree;
107
108use base qw(Bio::Root::Root);
109
110=head2 new
111
112 Title   : new
113 Usage   : my $obj = Bio::Tree::DistanceFactory->new();
114 Function: Builds a new Bio::Tree::DistanceFactory object
115 Returns : an instance of Bio::Tree::DistanceFactory
116 Args    : -method => 'NJ' or 'UPGMA'
117
118
119=cut
120
121sub new {
122  my($class,@args) = @_;
123  my $self = $class->SUPER::new(@args);
124
125  my ($method) = $self->_rearrange([qw(METHOD)],
126				   @args);
127  $self->method($method || $DefaultMethod);
128  return $self;
129}
130
131=head2 make_tree
132
133 Title   : make_tree
134 Usage   : my $tree = $disttreefact->make_tree($matrix);
135 Function: Build a Tree based on a distance matrix
136 Returns : L<Bio::Tree::TreeI>
137 Args    : L<Bio::Matrix::MatrixI> object
138
139
140=cut
141
142sub make_tree{
143   my ($self,$matrix) = @_;
144   if( ! defined $matrix || !ref($matrix) ||
145       ! $matrix->isa('Bio::Matrix::MatrixI') ) {
146       $self->warn("Need to provide a valid Bio::Matrix::MatrixI object to make_tree");
147       return;
148   }
149
150   my $method = uc ($self->method);
151   if( $method =~ /NJ/i ) {
152       return $self->_nj($matrix);
153   } elsif( $method =~ /UPGMA/i ) {
154       return $self->_upgma($matrix);
155   } else {
156       $self->warn("Unknown tree construction method '$method'.  Cannot run.");
157       return;
158   }
159
160}
161
162
163=head2 _nj
164
165 Title   : _nj
166 Usage   : my $tree = $disttreefact->_nj($matrix);
167 Function: Construct a tree based on distance matrix using the
168           Neighbor Joining algorithm (Saitou and Nei, 1987)
169           Implementation based on Kevin Howe's Quicktree implementation
170           and uses his tricks (some based on Bill Bruno's work) to eliminate
171           negative branch lengths
172 Returns : L<Bio::Tree::TreeI>
173 Args    : L<Bio::Matrix::MatrixI> object
174
175=cut
176
177sub _nj {
178   my ($self,$distmat) = @_;
179
180   # we assume type checking of $aln has already been done
181   # client shouldn't be calling this directly anyways, using the
182   # make_tree method is preferred
183
184   # so that we can trim the number of digits shown as the branch length
185   my $precisionstr = "%.$Precision"."f";
186
187   my @names =  $distmat->column_names;
188   my $N = scalar @names;
189   my ($i,$j,$m,@nodes,$mat,@r);
190   my $L = $N;
191
192   if( $N < 2 ) {
193       $self->warn("Can only perform NJ treebuilding on sets of 2 or more species\n");
194       return;
195   } elsif( $N == 2 ) {
196       $i = 0;
197       my $d = sprintf($precisionstr,
198		       $distmat->get_entry($names[0],$names[1]) / 2);
199       my $root = Bio::Tree::Node->new();
200       for my $nm ( @names ) {
201	   $root->add_Descendent( Bio::Tree::Node->new(-id => $nm,
202							-branch_length => $d));
203       }
204       return Bio::Tree::Tree->new(-root => $root);
205   }
206   my $c = 0;
207
208   for ( $i = 0; $i < $N; $i++ ) {
209       push @nodes, Bio::Tree::Node->new(-id => $names[$i]);
210       my $ri = 0;
211       for( $j = 0; $j < $N; $j++ ) {
212	   $mat->[$i][$j] = $distmat->get_entry($names[$i],$names[$j]);
213	   $ri += $mat->[$i][$j];
214       }
215       $r[$i] = $ri / ($L -2);
216   }
217
218   for( my $nodecount = 0; $nodecount < $N-3; $nodecount++) {
219       my ($mini,$minj,$min);
220       for($i = 0; $i < $N; $i++ ) {
221	   next unless defined $nodes[$i];
222	   for( $j = 0; $j < $i; $j++ ) {
223	       next unless defined $nodes[$j];
224	       my $dist = $mat->[$i][$j] - ($r[$i] + $r[$j]);
225	       if( ! defined $min ||
226		   $dist <= $min) {
227		   ($mini,$minj,$min) = ($i,$j,$dist);
228	       }
229	   }
230       }
231       my $dij    = $mat->[$mini][$minj];
232       my $dist_i = ($dij + $r[$mini] - $r[$minj]) / 2;
233       my $dist_j = $dij - $dist_i;
234
235       # deal with negative branch lengths
236       # per code in K.Howe's quicktree
237       if( $dist_i < 0 ) {
238	   $dist_i = 0;
239	   $dist_j = $dij;
240	   $dist_j = 0 if( $dist_j < 0 );
241       } elsif( $dist_j < 0 ) {
242	   $dist_j = 0;
243	   $dist_i = $dij;
244	   $dist_i = 0 if( $dist_i < 0 );
245       }
246
247       $nodes[$mini]->branch_length(sprintf($precisionstr,$dist_i));
248       $nodes[$minj]->branch_length(sprintf($precisionstr,$dist_j));
249
250       my $newnode = Bio::Tree::Node->new(-descendents => [ $nodes[$mini],
251							    $nodes[$minj] ]);
252
253       $nodes[$mini] = $newnode;
254       delete $nodes[$minj];
255
256       # update the distance matrix
257       $r[$mini] = 0;
258       my ($dmi,$dmj);
259       for( $m = 0; $m < $N; $m++ ) {
260	   next unless defined $nodes[$m];
261	   if( $m != $mini ) {
262	       $dmj = $mat->[$m][$minj];
263
264	       my ($row,$col);
265	       ($row,$col) = ($m,$mini);
266	       $dmi = $mat->[$row][$col];
267
268	       # from K.Howe's notes in quicktree
269	       # we can actually adjust r[m] here, by using the form:
270	       # rm = ((rm * numseqs) - dmi - dmj + dmk) / (numseqs-1)
271
272	       # Note: in Bill Bruno's method for negative branch
273	       # elimination, then if either dist_i is positive and
274	       # dist_j is 0, or dist_i is zero and dist_j is positive
275	       # (after adjustment) then the matrix entry is formed
276	       # from the distance to the node in question (m) to the
277	       # node with the zero branch length (whichever it was).
278	       # I think my code already has the same effect; this is
279	       # certainly true if dij is equal to dist_i + dist_j,
280	       # which it should have been fixed to
281
282	       my $dmk = $mat->[$row][$col] = $mat->[$col][$row] =
283		   ($dmi + $dmj - $dij) / 2;
284
285	       # If we don't want to try and correct negative brlens
286	       # this is essentially what is in Edddy et al, BSA book.
287	       # $r[$m] = (($r[$m] * $L) - $dmi - $dmj + $dmk) / ($L-1);
288	       #
289	       $r[$m] = (($r[$m] * ($L - 2)) - $dmi - $dmj +
290			 $mat->[$row][$col]) / ( $L - 3);
291	       $r[$mini] += $dmk;
292	   }
293       }
294       $L--;
295       $r[$mini] /= $L - 2;
296   }
297
298   # should be 3 nodes left
299   my (@leftovernodes,@leftovers);
300   for( my $k = 0; $k < $N; $k++ ) {
301       if( defined $nodes[$k] ) {
302	   push @leftovers, $k;
303	   push @leftovernodes, $nodes[$k];
304       }
305   }
306   my ($l_0,$l_1,$l_2) = @leftovers;
307
308   my $dist_i = ( $mat->[$l_1][$l_0] + $mat->[$l_2][$l_0] -
309		  $mat->[$l_2][$l_1] ) / 2;
310
311   my $dist_j = ( $mat->[$l_1][$l_0] - $dist_i);
312   my $dist_k = ( $mat->[$l_2][$l_0] - $dist_i);
313
314   # This is Kev's code to get rid of negative branch lengths
315   if( $dist_i < 0 ) {
316       $dist_i = 0;
317       $dist_j = $mat->[$l_1][$l_0];
318       $dist_k = $mat->[$l_2][$l_0];
319       if( $dist_j < 0 ) {
320	   $dist_j = 0;
321	   $dist_k = ( $mat->[$l_2][$l_0] + $mat->[$l_2][$l_1] ) / 2;
322	   $dist_k = 0 if( $dist_k < 0 );
323       } elsif( $dist_k < 0 ) {
324	   $dist_k = 0;
325	   $dist_j = ($mat->[$l_1][$l_0] + $mat->[$l_2][$l_1]) / 2;
326	   $dist_j = 0 if( $dist_j < 0 );
327       }
328   } elsif( $dist_j < 0 ) {
329       $dist_j = 0;
330       $dist_i = $mat->[$l_1][$l_0];
331       $dist_k = $mat->[$l_2][$l_1];
332       if( $dist_i < 0 ) {
333	   $dist_i = 0;
334	   $dist_k = ( $mat->[$l_2][$l_0] + $mat->[$l_2][$l_1]) / 2;
335	   $dist_k = 0 if( $dist_k  < 0 );
336       } elsif( $dist_k < 0 ) {
337	   $dist_k = 0;
338	   $dist_i = ( $mat->[$l_1][$l_0] + $mat->[$l_2][$l_0]) / 2;
339	   $dist_i = 0 if( $dist_i < 0 );
340       }
341   } elsif( $dist_k < 0 ) {
342       $dist_k = 0;
343       $dist_i = $mat->[$l_2][$l_0];
344       $dist_j = $mat->[$l_2][$l_1];
345       if( $dist_i < 0 ) {
346	   $dist_i = 0;
347	   $dist_j = ( $mat->[$l_1][$l_0] + $mat->[$l_2][$l_1] ) / 2;
348	   $dist_j = 0 if $dist_j < 0;
349       } elsif( $dist_j < 0  ) {
350	   $dist_j = 0;
351	   $dist_i = ($mat->[$l_1][$l_0] + $mat->[$l_2][$l_0]) / 2;
352	   $dist_i = 0 if $dist_i < 0;
353       }
354   }
355   $leftovernodes[0]->branch_length(sprintf($precisionstr,$dist_i));
356   $leftovernodes[1]->branch_length(sprintf($precisionstr,$dist_j));
357   $leftovernodes[2]->branch_length(sprintf($precisionstr,$dist_k));
358
359   Bio::Tree::Tree->new(-root => Bio::Tree::Node->new
360			(-descendents => \@leftovernodes));
361}
362
363=head2 _upgma
364
365 Title   : _upgma
366 Usage   : my $tree = $disttreefact->_upgma($matrix);
367 Function: Construct a tree based on alignment using UPGMA
368 Returns : L<Bio::Tree::TreeI>
369 Args    : L<Bio::Matrix::MatrixI> object
370
371
372=cut
373
374sub _upgma{
375   my ($self,$distmat) = @_;
376   # we assume type checking of $matrix has already been done
377   # client shouldn't be calling this directly anyways, using the
378   # make_tree method is preferred
379
380   # algorithm, from Eddy, Durbin, Krogh, Mitchison, 1998
381   # originally by Sokal and Michener 1956
382
383   my $precisionstr = "%.$Precision"."f";
384
385   my ($i,$j,$x,$y,@dmat,@orig,@nodes);
386
387   my @names = $distmat->column_names;
388   my $c = 0;
389   my @clusters = map {
390       my $r = { 'id'        => $c,
391		 'height'    => 0,
392		 'contains'  => [$c],
393	     };
394       $c++;
395       $r;
396   } @names;
397
398   my $K = scalar @clusters;
399   my (@mins,$min);
400   for ( $i = 0; $i < $K; $i++ ) {
401       for( $j = $i+1; $j < $K; $j++ ) {
402	   my $d =  $distmat->get_entry($names[$i],$names[$j]);
403	   # get Min here on first time around, save 1 cycle
404	   $dmat[$j][$i] = $dmat[$i][$j] = $d;
405	   $orig[$i][$j] = $orig[$j][$i] = $d;
406	   if ( ! defined $min || $d <= $min ) {
407	       if( defined $min && $min == $d ) {
408		   push @mins, [$i,$j];
409	       } else {
410		   @mins = [$i,$j];
411		   $min  = $d;
412	       }
413	   }
414       }
415   }
416   # distance between each cluster is avg distance
417   # between pairs of sequences from each cluster
418   while( $K > 1 ) {
419       # fencepost - we already have found the $min
420       # so very first time loop is executed we can skip checking
421       unless( defined $min ) {
422	   for($i = 0; $i < $K; $i++ ) {
423	       for( $j = $i+1; $j < $K; $j++ ) {
424		   my $dij = $dmat[$i][$j];
425		   if( ! defined $min ||
426		       $dij <= $min) {
427		       if( defined $min &&
428			   $min == $dij ) {
429			   push @mins, [$i,$j];
430		       } else {
431			   @mins = [ $i,$j ];
432			   $min = $dij;
433		       }
434		   }
435	       }
436	   }
437       }
438       # randomly break ties
439       ($x,$y) = @{ $mins[int(rand(scalar @mins))] };
440
441       # now we are going to join clusters x and y, make a new cluster
442
443       my $node = Bio::Tree::Node->new();
444       my @subids;
445       for my $cid ( $x,$y ) {
446	   my $nid = $clusters[$cid]->{'id'};
447	   if( ! defined $nodes[$nid] ) {
448	       $nodes[$nid] = Bio::Tree::Node->new(-id => $names[$nid]);
449	   }
450	   $nodes[$nid]->branch_length
451	       (sprintf($precisionstr,$min/2 - $clusters[$cid]->{'height'}));
452	   $node->add_Descendent($nodes[$nid]);
453	   push @subids, @{ $clusters[$cid]->{'contains'} };
454       }
455       my $cluster = { 'id'       => $c++,
456		       'height'   => $min / 2,
457		       'contains' => [@subids],
458		   };
459
460       $K--; # we are going to drop the last node so go ahead and decrement K
461       $nodes[$cluster->{'id'}] = $node;
462       if ( $y != $K ) {
463	   $clusters[$y] = $clusters[$K];
464	   $dmat[$y] = $dmat[$K];
465	   for ( $i = 0; $i < $K; $i++ ) {
466	       $dmat[$i][$y] = $dmat[$y][$i];
467	   }
468       }
469       delete $clusters[$K];
470       $clusters[$x] = $cluster;
471       # now recalculate @dmat
472       for( $i = 0; $i < $K; $i++ ) {
473	   if( $i != $x) {
474	       $dmat[$i][$x] = $dmat[$x][$i] =
475		   &_upgma_distance($clusters[$i],$clusters[$x],\@orig);
476	   } else {
477	       $dmat[$i][$i] = 0;
478	   }
479       }
480       # reset so next loop iteration
481       # we will find minimum distance
482       @mins = ();
483       $min = undef;
484   }
485   Bio::Tree::Tree->new(-root => $nodes[-1]);
486}
487
488# calculate avg distance between clusters - be they
489# single sequences or the combination of multiple seqences
490# $cluster_i and $cluster_j are the clusters to operate on
491# and $distances is a matrix (arrayref of arrayrefs) of pairwise
492# differences indexed on the sequence ids -
493# so $distances->[0][1] is the distance between sequences 0 and 1
494
495sub _upgma_distance {
496    my ($cluster_i, $cluster_j, $distances) = @_;
497    my $ilen = scalar @{ $cluster_i->{'contains'} };
498    my $jlen = scalar @{ $cluster_j->{'contains'} };
499    my ($d,$count);
500    for( my $i = 0; $i < $ilen; $i++ ) {
501	my $i_id = $cluster_i->{'contains'}->[$i];
502	for( my $j = 0; $j < $jlen; $j++) {
503	    my $j_id = $cluster_j->{'contains'}->[$j];
504	    if( ! defined $distances->[$i_id][$j_id] ) {
505		warn("no value for $i_id $j_id\n");
506	    } else {
507		$d += $distances->[$i_id][$j_id];
508	    }
509	    $count++;
510	}
511    }
512    return $d / $count;
513}
514
515=head2 method
516
517 Title   : method
518 Usage   : $obj->method($newval)
519 Function:
520 Example :
521 Returns : value of method (a scalar)
522 Args    : on set, new value (a scalar or undef, optional)
523
524
525=cut
526
527sub method{
528    my $self = shift;
529    return $self->{'_method'} = shift if @_;
530    return $self->{'_method'};
531}
532
533
534=head2 check_additivity
535
536 Title     : check_additivity
537 Usage     : if( $distance->check_additivity($matrix) ) {
538             }
539 Function  : See if matrix obeys additivity principal
540 Returns   : boolean
541 Args      : Bio::Matrix::MatrixI
542 References: Based on a Java implementation by
543             Peter Sestoft, sestoft@dina.kvl.dk 1999-12-07 version 0.3
544             http://www.dina.kvl.dk/~sestoft/bsa.html
545             which in turn is based on algorithms described in
546             R. Durbin, S. Eddy, A. Krogh, G. Mitchison.
547             Biological Sequence Analysis CUP 1998, Chapter 7.
548
549=cut
550
551sub check_additivity{
552   my ($self,$matrix) = @_;
553   my @names = $matrix->column_names;
554   my $len = scalar @names;
555   return unless $len >= 4;
556   # look at all sets of 4
557   for( my $i = 0; $i < $len; $i++ ) {
558       for( my $j = $i+1; $j< $len; $j++) {
559	   for( my $k = $j+1; $k < $len; $k ++ ) {
560	       for( my $m = $k +1; $m < $len; $m++ ) {
561		   my $DijDkm = $matrix->get_entry($names[$i],$names[$j]) +
562		       $matrix->get_entry($names[$k],$names[$m]);
563		   my $DikDjm = $matrix->get_entry($names[$i],$names[$k]) +
564		       $matrix->get_entry($names[$j],$names[$m]);
565		   my $DimDjk = $matrix->get_entry($names[$i],$names[$m]) +
566		       $matrix->get_entry($names[$j],$names[$k]);
567		   if( !( ( $DijDkm == $DikDjm && $DijDkm >= $DimDjk)
568			  || ( $DijDkm == $DimDjk && $DijDkm >= $DikDjm)
569			  || ( $DikDjm == $DimDjk && $DikDjm >= $DijDkm) )) {
570		       return 0;
571		   }
572	       }
573	   }
574       }
575   }
576   return 1;
577}
578
5791;
580