1# 2# BioPerl module for Bio::Tree::DistanceFactory 3# 4# Please direct questions and support issues to <bioperl-l@bioperl.org> 5# 6# Cared for by Jason Stajich <jason-at-bioperl.org> 7# 8# Copyright Jason Stajich 9# 10# You may distribute this module under the same terms as perl itself 11 12# POD documentation - main docs before the code 13 14=head1 NAME 15 16Bio::Tree::DistanceFactory - Construct a tree using distance based methods 17 18=head1 SYNOPSIS 19 20 use Bio::Tree::DistanceFactory; 21 use Bio::AlignIO; 22 use Bio::Align::DNAStatistics; 23 my $tfactory = Bio::Tree::DistanceFactory->new(-method => "NJ"); 24 my $stats = Bio::Align::DNAStatistics->new(); 25 26 my $alnin = Bio::AlignIO->new(-format => 'clustalw', 27 -file => 'file.aln'); 28 my $aln = $alnin->next_aln; 29 # Of course matrix can come from a different place 30 # like PHYLIP if you prefer, Bio::Matrix::IO should be able 31 # to parse many things 32 my $jcmatrix = $stats->distance(-align => $aln, 33 -method => 'Jukes-Cantor'); 34 my $tree = $tfactory->make_tree($jcmatrix); 35 36 37=head1 DESCRIPTION 38 39This is a factory which will construct a phylogenetic tree based on 40the pairwise sequence distances for a set of sequences. Currently 41UPGMA (Sokal and Michener 1958) and NJ (Saitou and Nei 1987) tree 42construction methods are implemented. 43 44=head1 REFERENCES 45 46Eddy SR, Durbin R, Krogh A, Mitchison G, (1998) "Biological Sequence Analysis", 47Cambridge Univ Press, Cambridge, UK. 48 49Howe K, Bateman A, Durbin R, (2002) "QuickTree: building huge 50Neighbour-Joining trees of protein sequences." Bioinformatics 5118(11):1546-1547. 52 53Saitou N and Nei M, (1987) "The neighbor-joining method: a new method 54for reconstructing phylogenetic trees." Mol Biol Evol 4(4):406-25. 55 56=head1 FEEDBACK 57 58=head2 Mailing Lists 59 60User feedback is an integral part of the evolution of this and other 61Bioperl modules. Send your comments and suggestions preferably to 62the Bioperl mailing list. Your participation is much appreciated. 63 64 bioperl-l@bioperl.org - General discussion 65 http://bioperl.org/wiki/Mailing_lists - About the mailing lists 66 67=head2 Support 68 69Please direct usage questions or support issues to the mailing list: 70 71I<bioperl-l@bioperl.org> 72 73rather than to the module maintainer directly. Many experienced and 74reponsive experts will be able look at the problem and quickly 75address it. Please include a thorough description of the problem 76with code and data examples if at all possible. 77 78=head2 Reporting Bugs 79 80Report bugs to the Bioperl bug tracking system to help us keep track 81of the bugs and their resolution. Bug reports can be submitted the web: 82 83 https://github.com/bioperl/bioperl-live/issues 84 85=head1 AUTHOR - Jason Stajich 86 87Email jason-at-bioperl.org 88 89=head1 APPENDIX 90 91The rest of the documentation details each of the object methods. 92Internal methods are usually preceded with a _ 93 94=cut 95 96package Bio::Tree::DistanceFactory; 97$Bio::Tree::DistanceFactory::VERSION = '1.7.7'; 98use vars qw($DefaultMethod $Precision); 99use strict; 100 101# some defaults 102$DefaultMethod = 'UPGMA'; 103$Precision = 5; 104 105use Bio::Tree::Node; 106use Bio::Tree::Tree; 107 108use base qw(Bio::Root::Root); 109 110=head2 new 111 112 Title : new 113 Usage : my $obj = Bio::Tree::DistanceFactory->new(); 114 Function: Builds a new Bio::Tree::DistanceFactory object 115 Returns : an instance of Bio::Tree::DistanceFactory 116 Args : -method => 'NJ' or 'UPGMA' 117 118 119=cut 120 121sub new { 122 my($class,@args) = @_; 123 my $self = $class->SUPER::new(@args); 124 125 my ($method) = $self->_rearrange([qw(METHOD)], 126 @args); 127 $self->method($method || $DefaultMethod); 128 return $self; 129} 130 131=head2 make_tree 132 133 Title : make_tree 134 Usage : my $tree = $disttreefact->make_tree($matrix); 135 Function: Build a Tree based on a distance matrix 136 Returns : L<Bio::Tree::TreeI> 137 Args : L<Bio::Matrix::MatrixI> object 138 139 140=cut 141 142sub make_tree{ 143 my ($self,$matrix) = @_; 144 if( ! defined $matrix || !ref($matrix) || 145 ! $matrix->isa('Bio::Matrix::MatrixI') ) { 146 $self->warn("Need to provide a valid Bio::Matrix::MatrixI object to make_tree"); 147 return; 148 } 149 150 my $method = uc ($self->method); 151 if( $method =~ /NJ/i ) { 152 return $self->_nj($matrix); 153 } elsif( $method =~ /UPGMA/i ) { 154 return $self->_upgma($matrix); 155 } else { 156 $self->warn("Unknown tree construction method '$method'. Cannot run."); 157 return; 158 } 159 160} 161 162 163=head2 _nj 164 165 Title : _nj 166 Usage : my $tree = $disttreefact->_nj($matrix); 167 Function: Construct a tree based on distance matrix using the 168 Neighbor Joining algorithm (Saitou and Nei, 1987) 169 Implementation based on Kevin Howe's Quicktree implementation 170 and uses his tricks (some based on Bill Bruno's work) to eliminate 171 negative branch lengths 172 Returns : L<Bio::Tree::TreeI> 173 Args : L<Bio::Matrix::MatrixI> object 174 175=cut 176 177sub _nj { 178 my ($self,$distmat) = @_; 179 180 # we assume type checking of $aln has already been done 181 # client shouldn't be calling this directly anyways, using the 182 # make_tree method is preferred 183 184 # so that we can trim the number of digits shown as the branch length 185 my $precisionstr = "%.$Precision"."f"; 186 187 my @names = $distmat->column_names; 188 my $N = scalar @names; 189 my ($i,$j,$m,@nodes,$mat,@r); 190 my $L = $N; 191 192 if( $N < 2 ) { 193 $self->warn("Can only perform NJ treebuilding on sets of 2 or more species\n"); 194 return; 195 } elsif( $N == 2 ) { 196 $i = 0; 197 my $d = sprintf($precisionstr, 198 $distmat->get_entry($names[0],$names[1]) / 2); 199 my $root = Bio::Tree::Node->new(); 200 for my $nm ( @names ) { 201 $root->add_Descendent( Bio::Tree::Node->new(-id => $nm, 202 -branch_length => $d)); 203 } 204 return Bio::Tree::Tree->new(-root => $root); 205 } 206 my $c = 0; 207 208 for ( $i = 0; $i < $N; $i++ ) { 209 push @nodes, Bio::Tree::Node->new(-id => $names[$i]); 210 my $ri = 0; 211 for( $j = 0; $j < $N; $j++ ) { 212 $mat->[$i][$j] = $distmat->get_entry($names[$i],$names[$j]); 213 $ri += $mat->[$i][$j]; 214 } 215 $r[$i] = $ri / ($L -2); 216 } 217 218 for( my $nodecount = 0; $nodecount < $N-3; $nodecount++) { 219 my ($mini,$minj,$min); 220 for($i = 0; $i < $N; $i++ ) { 221 next unless defined $nodes[$i]; 222 for( $j = 0; $j < $i; $j++ ) { 223 next unless defined $nodes[$j]; 224 my $dist = $mat->[$i][$j] - ($r[$i] + $r[$j]); 225 if( ! defined $min || 226 $dist <= $min) { 227 ($mini,$minj,$min) = ($i,$j,$dist); 228 } 229 } 230 } 231 my $dij = $mat->[$mini][$minj]; 232 my $dist_i = ($dij + $r[$mini] - $r[$minj]) / 2; 233 my $dist_j = $dij - $dist_i; 234 235 # deal with negative branch lengths 236 # per code in K.Howe's quicktree 237 if( $dist_i < 0 ) { 238 $dist_i = 0; 239 $dist_j = $dij; 240 $dist_j = 0 if( $dist_j < 0 ); 241 } elsif( $dist_j < 0 ) { 242 $dist_j = 0; 243 $dist_i = $dij; 244 $dist_i = 0 if( $dist_i < 0 ); 245 } 246 247 $nodes[$mini]->branch_length(sprintf($precisionstr,$dist_i)); 248 $nodes[$minj]->branch_length(sprintf($precisionstr,$dist_j)); 249 250 my $newnode = Bio::Tree::Node->new(-descendents => [ $nodes[$mini], 251 $nodes[$minj] ]); 252 253 $nodes[$mini] = $newnode; 254 delete $nodes[$minj]; 255 256 # update the distance matrix 257 $r[$mini] = 0; 258 my ($dmi,$dmj); 259 for( $m = 0; $m < $N; $m++ ) { 260 next unless defined $nodes[$m]; 261 if( $m != $mini ) { 262 $dmj = $mat->[$m][$minj]; 263 264 my ($row,$col); 265 ($row,$col) = ($m,$mini); 266 $dmi = $mat->[$row][$col]; 267 268 # from K.Howe's notes in quicktree 269 # we can actually adjust r[m] here, by using the form: 270 # rm = ((rm * numseqs) - dmi - dmj + dmk) / (numseqs-1) 271 272 # Note: in Bill Bruno's method for negative branch 273 # elimination, then if either dist_i is positive and 274 # dist_j is 0, or dist_i is zero and dist_j is positive 275 # (after adjustment) then the matrix entry is formed 276 # from the distance to the node in question (m) to the 277 # node with the zero branch length (whichever it was). 278 # I think my code already has the same effect; this is 279 # certainly true if dij is equal to dist_i + dist_j, 280 # which it should have been fixed to 281 282 my $dmk = $mat->[$row][$col] = $mat->[$col][$row] = 283 ($dmi + $dmj - $dij) / 2; 284 285 # If we don't want to try and correct negative brlens 286 # this is essentially what is in Edddy et al, BSA book. 287 # $r[$m] = (($r[$m] * $L) - $dmi - $dmj + $dmk) / ($L-1); 288 # 289 $r[$m] = (($r[$m] * ($L - 2)) - $dmi - $dmj + 290 $mat->[$row][$col]) / ( $L - 3); 291 $r[$mini] += $dmk; 292 } 293 } 294 $L--; 295 $r[$mini] /= $L - 2; 296 } 297 298 # should be 3 nodes left 299 my (@leftovernodes,@leftovers); 300 for( my $k = 0; $k < $N; $k++ ) { 301 if( defined $nodes[$k] ) { 302 push @leftovers, $k; 303 push @leftovernodes, $nodes[$k]; 304 } 305 } 306 my ($l_0,$l_1,$l_2) = @leftovers; 307 308 my $dist_i = ( $mat->[$l_1][$l_0] + $mat->[$l_2][$l_0] - 309 $mat->[$l_2][$l_1] ) / 2; 310 311 my $dist_j = ( $mat->[$l_1][$l_0] - $dist_i); 312 my $dist_k = ( $mat->[$l_2][$l_0] - $dist_i); 313 314 # This is Kev's code to get rid of negative branch lengths 315 if( $dist_i < 0 ) { 316 $dist_i = 0; 317 $dist_j = $mat->[$l_1][$l_0]; 318 $dist_k = $mat->[$l_2][$l_0]; 319 if( $dist_j < 0 ) { 320 $dist_j = 0; 321 $dist_k = ( $mat->[$l_2][$l_0] + $mat->[$l_2][$l_1] ) / 2; 322 $dist_k = 0 if( $dist_k < 0 ); 323 } elsif( $dist_k < 0 ) { 324 $dist_k = 0; 325 $dist_j = ($mat->[$l_1][$l_0] + $mat->[$l_2][$l_1]) / 2; 326 $dist_j = 0 if( $dist_j < 0 ); 327 } 328 } elsif( $dist_j < 0 ) { 329 $dist_j = 0; 330 $dist_i = $mat->[$l_1][$l_0]; 331 $dist_k = $mat->[$l_2][$l_1]; 332 if( $dist_i < 0 ) { 333 $dist_i = 0; 334 $dist_k = ( $mat->[$l_2][$l_0] + $mat->[$l_2][$l_1]) / 2; 335 $dist_k = 0 if( $dist_k < 0 ); 336 } elsif( $dist_k < 0 ) { 337 $dist_k = 0; 338 $dist_i = ( $mat->[$l_1][$l_0] + $mat->[$l_2][$l_0]) / 2; 339 $dist_i = 0 if( $dist_i < 0 ); 340 } 341 } elsif( $dist_k < 0 ) { 342 $dist_k = 0; 343 $dist_i = $mat->[$l_2][$l_0]; 344 $dist_j = $mat->[$l_2][$l_1]; 345 if( $dist_i < 0 ) { 346 $dist_i = 0; 347 $dist_j = ( $mat->[$l_1][$l_0] + $mat->[$l_2][$l_1] ) / 2; 348 $dist_j = 0 if $dist_j < 0; 349 } elsif( $dist_j < 0 ) { 350 $dist_j = 0; 351 $dist_i = ($mat->[$l_1][$l_0] + $mat->[$l_2][$l_0]) / 2; 352 $dist_i = 0 if $dist_i < 0; 353 } 354 } 355 $leftovernodes[0]->branch_length(sprintf($precisionstr,$dist_i)); 356 $leftovernodes[1]->branch_length(sprintf($precisionstr,$dist_j)); 357 $leftovernodes[2]->branch_length(sprintf($precisionstr,$dist_k)); 358 359 Bio::Tree::Tree->new(-root => Bio::Tree::Node->new 360 (-descendents => \@leftovernodes)); 361} 362 363=head2 _upgma 364 365 Title : _upgma 366 Usage : my $tree = $disttreefact->_upgma($matrix); 367 Function: Construct a tree based on alignment using UPGMA 368 Returns : L<Bio::Tree::TreeI> 369 Args : L<Bio::Matrix::MatrixI> object 370 371 372=cut 373 374sub _upgma{ 375 my ($self,$distmat) = @_; 376 # we assume type checking of $matrix has already been done 377 # client shouldn't be calling this directly anyways, using the 378 # make_tree method is preferred 379 380 # algorithm, from Eddy, Durbin, Krogh, Mitchison, 1998 381 # originally by Sokal and Michener 1956 382 383 my $precisionstr = "%.$Precision"."f"; 384 385 my ($i,$j,$x,$y,@dmat,@orig,@nodes); 386 387 my @names = $distmat->column_names; 388 my $c = 0; 389 my @clusters = map { 390 my $r = { 'id' => $c, 391 'height' => 0, 392 'contains' => [$c], 393 }; 394 $c++; 395 $r; 396 } @names; 397 398 my $K = scalar @clusters; 399 my (@mins,$min); 400 for ( $i = 0; $i < $K; $i++ ) { 401 for( $j = $i+1; $j < $K; $j++ ) { 402 my $d = $distmat->get_entry($names[$i],$names[$j]); 403 # get Min here on first time around, save 1 cycle 404 $dmat[$j][$i] = $dmat[$i][$j] = $d; 405 $orig[$i][$j] = $orig[$j][$i] = $d; 406 if ( ! defined $min || $d <= $min ) { 407 if( defined $min && $min == $d ) { 408 push @mins, [$i,$j]; 409 } else { 410 @mins = [$i,$j]; 411 $min = $d; 412 } 413 } 414 } 415 } 416 # distance between each cluster is avg distance 417 # between pairs of sequences from each cluster 418 while( $K > 1 ) { 419 # fencepost - we already have found the $min 420 # so very first time loop is executed we can skip checking 421 unless( defined $min ) { 422 for($i = 0; $i < $K; $i++ ) { 423 for( $j = $i+1; $j < $K; $j++ ) { 424 my $dij = $dmat[$i][$j]; 425 if( ! defined $min || 426 $dij <= $min) { 427 if( defined $min && 428 $min == $dij ) { 429 push @mins, [$i,$j]; 430 } else { 431 @mins = [ $i,$j ]; 432 $min = $dij; 433 } 434 } 435 } 436 } 437 } 438 # randomly break ties 439 ($x,$y) = @{ $mins[int(rand(scalar @mins))] }; 440 441 # now we are going to join clusters x and y, make a new cluster 442 443 my $node = Bio::Tree::Node->new(); 444 my @subids; 445 for my $cid ( $x,$y ) { 446 my $nid = $clusters[$cid]->{'id'}; 447 if( ! defined $nodes[$nid] ) { 448 $nodes[$nid] = Bio::Tree::Node->new(-id => $names[$nid]); 449 } 450 $nodes[$nid]->branch_length 451 (sprintf($precisionstr,$min/2 - $clusters[$cid]->{'height'})); 452 $node->add_Descendent($nodes[$nid]); 453 push @subids, @{ $clusters[$cid]->{'contains'} }; 454 } 455 my $cluster = { 'id' => $c++, 456 'height' => $min / 2, 457 'contains' => [@subids], 458 }; 459 460 $K--; # we are going to drop the last node so go ahead and decrement K 461 $nodes[$cluster->{'id'}] = $node; 462 if ( $y != $K ) { 463 $clusters[$y] = $clusters[$K]; 464 $dmat[$y] = $dmat[$K]; 465 for ( $i = 0; $i < $K; $i++ ) { 466 $dmat[$i][$y] = $dmat[$y][$i]; 467 } 468 } 469 delete $clusters[$K]; 470 $clusters[$x] = $cluster; 471 # now recalculate @dmat 472 for( $i = 0; $i < $K; $i++ ) { 473 if( $i != $x) { 474 $dmat[$i][$x] = $dmat[$x][$i] = 475 &_upgma_distance($clusters[$i],$clusters[$x],\@orig); 476 } else { 477 $dmat[$i][$i] = 0; 478 } 479 } 480 # reset so next loop iteration 481 # we will find minimum distance 482 @mins = (); 483 $min = undef; 484 } 485 Bio::Tree::Tree->new(-root => $nodes[-1]); 486} 487 488# calculate avg distance between clusters - be they 489# single sequences or the combination of multiple seqences 490# $cluster_i and $cluster_j are the clusters to operate on 491# and $distances is a matrix (arrayref of arrayrefs) of pairwise 492# differences indexed on the sequence ids - 493# so $distances->[0][1] is the distance between sequences 0 and 1 494 495sub _upgma_distance { 496 my ($cluster_i, $cluster_j, $distances) = @_; 497 my $ilen = scalar @{ $cluster_i->{'contains'} }; 498 my $jlen = scalar @{ $cluster_j->{'contains'} }; 499 my ($d,$count); 500 for( my $i = 0; $i < $ilen; $i++ ) { 501 my $i_id = $cluster_i->{'contains'}->[$i]; 502 for( my $j = 0; $j < $jlen; $j++) { 503 my $j_id = $cluster_j->{'contains'}->[$j]; 504 if( ! defined $distances->[$i_id][$j_id] ) { 505 warn("no value for $i_id $j_id\n"); 506 } else { 507 $d += $distances->[$i_id][$j_id]; 508 } 509 $count++; 510 } 511 } 512 return $d / $count; 513} 514 515=head2 method 516 517 Title : method 518 Usage : $obj->method($newval) 519 Function: 520 Example : 521 Returns : value of method (a scalar) 522 Args : on set, new value (a scalar or undef, optional) 523 524 525=cut 526 527sub method{ 528 my $self = shift; 529 return $self->{'_method'} = shift if @_; 530 return $self->{'_method'}; 531} 532 533 534=head2 check_additivity 535 536 Title : check_additivity 537 Usage : if( $distance->check_additivity($matrix) ) { 538 } 539 Function : See if matrix obeys additivity principal 540 Returns : boolean 541 Args : Bio::Matrix::MatrixI 542 References: Based on a Java implementation by 543 Peter Sestoft, sestoft@dina.kvl.dk 1999-12-07 version 0.3 544 http://www.dina.kvl.dk/~sestoft/bsa.html 545 which in turn is based on algorithms described in 546 R. Durbin, S. Eddy, A. Krogh, G. Mitchison. 547 Biological Sequence Analysis CUP 1998, Chapter 7. 548 549=cut 550 551sub check_additivity{ 552 my ($self,$matrix) = @_; 553 my @names = $matrix->column_names; 554 my $len = scalar @names; 555 return unless $len >= 4; 556 # look at all sets of 4 557 for( my $i = 0; $i < $len; $i++ ) { 558 for( my $j = $i+1; $j< $len; $j++) { 559 for( my $k = $j+1; $k < $len; $k ++ ) { 560 for( my $m = $k +1; $m < $len; $m++ ) { 561 my $DijDkm = $matrix->get_entry($names[$i],$names[$j]) + 562 $matrix->get_entry($names[$k],$names[$m]); 563 my $DikDjm = $matrix->get_entry($names[$i],$names[$k]) + 564 $matrix->get_entry($names[$j],$names[$m]); 565 my $DimDjk = $matrix->get_entry($names[$i],$names[$m]) + 566 $matrix->get_entry($names[$j],$names[$k]); 567 if( !( ( $DijDkm == $DikDjm && $DijDkm >= $DimDjk) 568 || ( $DijDkm == $DimDjk && $DijDkm >= $DikDjm) 569 || ( $DikDjm == $DimDjk && $DikDjm >= $DijDkm) )) { 570 return 0; 571 } 572 } 573 } 574 } 575 } 576 return 1; 577} 578 5791; 580