1#!/usr/bin/perl 2use strict; 3use warnings; 4use Test::More 'no_plan'; 5use Bio::Phylo::IO 'parse_tree'; 6use List::Util qw[sum min max]; 7use constant True => !undef; 8use constant False => undef; 9 10# # # # # # # # # # # 11# Rank functions Daniel Ford, Tanja Gernhard 2006 12# Functions: 13# rankprob(t,u) - returns the probability distribution 14# of the rank of vertex "u" in tree "t" 15# expectedrank(t,u) - returns the expected rank 16# of vertex "u" and the variance 17# compare(t,u,v) - returns the probability that "u" 18# is below "v" in tree "t" 19 20# This is a more or less literal port to Perl of the functions developed by 21# Daniel Ford and Tanja Stadler (nee Gernhard) during Tanja's dissertation 22# research. Some minimal changes were made to make the code more idiomatic. 23 24# Bio::Phylo contains a port of this functionality, mostly implemented by 25# calc_* methods in Bio::Phylo::Forest::NodeRole. To verify that this port 26# has the same outputs as the original code we first verified that the code 27# below matches that of the python code (it does), and then we compare the 28# output of this literal port with that of the methods that integrate the 29# functionality in the Bio::Phylo API. Those tests are here at the bottom 30# of the file. 31 32# Calculation of n choose j 33# This version saves partial results for use later 34my @nc_matrix; #stores the values of nchoose(n,j) 35# -- note: order of indices is reversed 36sub nchoose_static { 37 my ( $n, $j, @nc_matrix ) = @_; 38 return 0 if $j > $n; 39 if ( @nc_matrix < $j + 1 ) { 40 push @nc_matrix, [] for @nc_matrix .. $j; 41 } 42 if ( @{ $nc_matrix[$j] } < $n + 1 ) { 43 push @{ $nc_matrix[$j] }, 0 for @{ $nc_matrix[$j] } .. $j - 1; 44 } 45 push @{ $nc_matrix[$j] }, 1 if @{ $nc_matrix[$j] } == $j; 46 for my $i ( @{ $nc_matrix[$j] } .. $n ) { 47 push @{ $nc_matrix[$j] }, $nc_matrix[$j]->[$i-1] * $i / ( $i - $j ); 48 } 49 return $nc_matrix[$j]->[$n]; 50} 51 52# dynamic programming version 53sub nchoose { 54 my ( $n, $j ) = @_; 55 return nchoose_static($n,$j,@nc_matrix); 56} 57 58# GCD - assumes positive integers as input 59# (subroutine for compare(t,u,v)) 60sub gcd { 61 my ( $n, $m ) = @_; 62 return $n if $n == $m; 63 ( $n, $m ) = ( $m, $n ) if $m > $n; 64 my $i = int($n / $m); 65 $n = $n - $m * $i; 66 return $m if $n == 0; 67 68 # recurse 69 return gcd($m,$n); 70} 71 72# Takes two large integers and attempts to divide them and give 73# the float answer without overflowing 74# (subroutine for compare(t,u,v)) 75# does this by first taking out the gcd 76sub gcd_divide { 77 my ( $n, $m ) = @_; 78 my $x = gcd($n,$m); 79 $n /= $x; 80 $m /= $x; 81 return $n/$m; 82} 83 84# get the number of descendants of u and of all vertices on the 85# path to the root (subroutine for rankprob(t,u)) 86sub numDescendants { 87 my ($tree,$u) = @_; 88 89 # focal node (subtree) is empty, i.e. a leaf 90 return [False,False] unless @{ $tree }; 91 92 # focal node is u 93 return [True,[$tree->[2]->{"leaves_below"}-1]] if $tree->[2]->{"label"}==$u; 94 95 # recurse left 96 my $x = numDescendants( $tree->[0], $u ); 97 if ( $x->[0] ) { 98 my $n; 99 100 # focal node has no sibling 101 if ( not @{ $tree->[1] } ) { 102 $n = 0; 103 } 104 else { 105 $n = $tree->[1]->[2]->{"leaves_below"} - 1; 106 } 107 return [ True, [ @{ $x->[1] }, $n ] ]; 108 } 109 110 # recurse right 111 my $y = numDescendants( $tree->[1], $u ); 112 if ( $y->[0] ) { 113 my $n; 114 115 # focal node has no sibling 116 if ( not @{ $tree->[0] } ) { 117 $n = 0; 118 } 119 else { 120 $n = $tree->[0]->[2]->{"leaves_below"} - 1; 121 } 122 return [ True, [ @{ $y->[1] }, $n ] ]; 123 } 124 else { 125 return [False,False]; 126 } 127} 128 129# returns the subtree rooted at the common ancestor of u and v 130# (subroutine for compare(t,u,v)) 131# return 132# True/False - have we found u yet 133# True/False - have we found v yet 134# the subtree - if we have found u and v 135# the u half of the subtree 136# the v half of the subtree 137sub subtree { 138 my ($tree,$u,$v) = @_; 139 140 # tree is empty 141 if ( not @{ $tree } ) { 142 return False, False, False, False, False; 143 } 144 145 # recurse left and right 146 my ( $found_ul, $found_vl, $subtree_l, $subtree_ul, $subtree_vl ) = subtree( $tree->[0], $u, $v ); 147 my ( $found_ur, $found_vr, $subtree_r, $subtree_ur, $subtree_vr ) = subtree( $tree->[1], $u, $v ); 148 149 # both were left descendants of focal node, return result 150 if ( $found_ul and $found_vl ) { 151 return $found_ul, $found_vl, $subtree_l, $subtree_ul, $subtree_vl; 152 } 153 154 # both were right descendants of focal node, return result 155 if ( $found_ur and $found_vr ) { 156 return $found_ur, $found_vr, $subtree_r, $subtree_ur, $subtree_vr; 157 } 158 159 # have we found either? 160 my $found_u = ( $found_ul or $found_ur or $tree->[2]->{"label"} == $u ); 161 my $found_v = ( $found_vl or $found_vr or $tree->[2]->{"label"} == $v ); 162 163 # initialize and assign subtrees 164 my ( $subtree_u, $subtree_v ); 165 $subtree_u = $subtree_ul if $found_ul; 166 $subtree_v = $subtree_vl if $found_vl; 167 $subtree_u = $subtree_ur if $found_ur; 168 $subtree_v = $subtree_vr if $found_vr; 169 if ( $found_u and (not $found_v) ) { 170 $subtree_u = $tree; 171 } 172 elsif ( $found_v and (not $found_u) ) { 173 $subtree_v = $tree; 174 } 175 $subtree_u = $tree if $tree->[2]->{"label"} == $u; 176 $subtree_v = $tree if $tree->[2]->{"label"} == $v; 177 178 # return results 179 return $found_u, $found_v, $tree, $subtree_u, $subtree_v; 180} 181 182# A version of rankprob which uses the function numDescendants 183sub rankprob { 184 my ($t,$u) = @_; 185 my $x = numDescendants($t,$u); 186 $x = $x->[1]; 187 my $lhsm = $x->[0]; 188 my $k = scalar(@$x); 189 my $start = 1; 190 my $end = 1; 191 my $rp = [0,1]; 192 my $step = 1; 193 while ( $step < $k ) { 194 my $rhsm = $x->[$step]; 195 my $newstart = $start+1; 196 my $newend = $end + $rhsm + 1; 197 my $rp2 = []; 198 for my $i ( 0 .. $newend ) { 199 push @$rp2, 0; 200 } 201 for my $i ( $newstart .. $newend ) { 202 my $q = max( 0, $i - 1 - $end ); 203 for my $j ( $q .. min( $rhsm, $i - 2 ) ) { 204 my $a = $rp->[$i-$j-1] * nchoose($lhsm + $rhsm - ($i-1),$rhsm-$j) * nchoose($i-2,$j); 205 $rp2->[$i]+=$a; 206 } 207 } 208 $rp = $rp2; 209 $start = $newstart; 210 $end = $newend; 211 $lhsm = $lhsm+$rhsm+1; 212 $step += 1; 213 } 214 my $tot = sum( @{ $rp } ); 215 for my $i ( 0..$#{ $rp } ) { 216 $rp->[$i] = $rp->[$i] / $tot; 217 } 218 return $rp; 219} 220 221# For tree "t" and vertex "u" calculate the 222# expected rank and variance 223sub expectedrank { 224 my ( $t, $u ) = @_; 225 my $rp = rankprob( $t, $u ); 226 my $mu = 0; 227 my $sigma = 0; 228 for my $i ( 0 .. $#{ $rp } ) { 229 $mu += $i * $rp->[$i]; 230 $sigma += $i * $i * $rp->[$i]; 231 } 232 return $mu, $sigma - $mu * $mu; 233} 234 235# Gives the probability that vertex labeled v is 236# below vertex labeled u 237sub compare { 238 my ($t,$u,$v) = @_; 239 my ($found_u,$found_v,$subtree,$subtree_u,$subtree_v) = subtree($t,$u,$v); 240 241 # both vertices need to occur in the same tree, of course 242 if ( not ($found_u and $found_v) ) { 243 print "This tree does not have those vertices!"; 244 return 0; 245 } 246 247 # $u is the root of the subtree, 248 # hence $v MUST be below $u 249 if ( $subtree->[2]->{"label"} == $u ) { 250 return 1.0; 251 } 252 253 # $v is the root of the subtree, 254 # so it can't be below $u 255 if ( $subtree->[2]->{"label"} == $v ) { 256 return 0.0; 257 } 258 259 # calculate rank probabilities in 260 # respective subtrees 261 my $x = rankprob($subtree_u,$u); 262 my $y = rankprob($subtree_v,$v); 263 my $usize = $subtree_u->[2]->{"leaves_below"} - 1; 264 my $vsize = $subtree_v->[2]->{"leaves_below"} - 1; 265 266 for my $i ( scalar(@$x) .. $usize + 1 ) { 267 push @$x, 0; 268 } 269 my $xcumulative = [0]; 270 for my $i ( 1 .. $#{ $x } ) { 271 push @$xcumulative, $xcumulative->[$i-1] + $x->[$i]; 272 } 273 my $rp = [0]; 274 for my $i ( 1 .. $#{ $y } ) { 275 push @$rp, 0; 276 for my $j ( 1 .. $usize) { 277 my $a = $y->[$i] * nchoose($i-1+$j,$j) * nchoose($vsize-$i+$usize-$j, $usize-$j) * $xcumulative->[$j]; 278 $rp->[$i] += $a; 279 } 280 } 281 my $tot = nchoose($usize+$vsize,$vsize); 282 return sum(@$rp)/$tot; 283} 284 285my $t1 = [ 286 [ 287 [], 288 [], 289 { 'leaves_below' => 2, 'label' => 4 } 290 ], 291 [], 292 { 'leaves_below' => 3, 'label'=> 3 } 293]; 294my $t2 = [ 295 [ 296 [], 297 [], 298 { 'leaves_below' => 2, 'label'=> 7 } 299 ], 300 [ 301 [], 302 [], 303 { 'leaves_below' => 2, 'label' => 8 } 304 ], 305 { 'leaves_below' => 4, 'label' => 6 } 306]; 307my $t3 = [ 308 [], 309 [], 310 {'leaves_below' => 2, 'label' => 5 } 311]; 312my $t4 = [ $t1, $t3, { 'leaves_below' => 5, 'label' => 2 } ]; 313 314# Newick: (((l6,l7)7,(l8,l9)8)6,(((l1,l2)4,l3)3,(l4,l5)5)2)1; 315my $t = [ $t2, $t4, { 'leaves_below' => 9, 'label' => 1 } ]; 316my $tree = parse_tree( 317 '-format' => 'newick', 318 '-string' => '(((l6,l7)7,(l8,l9)8)6,(((l1,l2)4,l3)3,(l4,l5)5)2)1;', 319); 320 321 322# test numDescendants 323{ 324 for my $i ( 1 .. 8 ) { 325 my $stadler_result = numDescendants($t,4); 326 my $port_result = $tree->get_root->calc_rankprob_tipcounts(4); 327 is_deeply($stadler_result,$port_result); 328 } 329} 330 331# test subtree 332{ 333 for my $i ( 1 .. 8 ) { 334 for my $j ( 1 .. 8 ) { 335 next if $i == $j; 336 my ( $sb1, $sb2, $sn1, $sn2, $sn3 ) = subtree($t,$i,$j); 337 my ( $pb1, $pb2, $pn1, $pn2, $pn3 ) = $tree->get_root->get_subtrees($i,$j); 338 339 # the booleans must be the same 340 ok( not ( $sb1 ^ $pb1 ) and not ( $sb2 ^ $pb2 ) ); 341 342 # if we have a subtree for one we must have one for the other and 343 # the root node must have the same label 344 if ( $sn1 ) { 345 ok( $pn1 ); 346 ok( $sn1->[2]->{'label'} == $pn1->get_name ); 347 } 348 else { 349 ok( ! $pn1 ); 350 } 351 if ( $sn2 ) { 352 ok( $pn2 ); 353 ok( $sn2->[2]->{'label'} == $pn2->get_name ); 354 } 355 else { 356 ok( ! $pn2 ); 357 } 358 if ( $sn3 ) { 359 ok( $pn3 ); 360 ok( $sn3->[2]->{'label'} == $pn3->get_name ); 361 } 362 else { 363 ok( ! $pn3 ); 364 } 365 } 366 } 367} 368 369# test rankprob 370{ 371 for my $i ( 1 .. 8 ) { 372 my $stadler_rankprob = rankprob($t,$i); 373 my $port_rankprob = $tree->get_root->calc_rankprob($i); 374 is_deeply($stadler_rankprob,$port_rankprob); 375 } 376} 377 378# test expectedrank 379{ 380 for my $i ( 1 .. 8 ) { 381 my @stadler_exp = expectedrank($t,$i); 382 my @port_exp = $tree->get_root->calc_expected_rank($i); 383 is_deeply(\@stadler_exp,\@port_exp); 384 } 385} 386 387# test compare 388{ 389 for my $i ( 1 .. 8 ) { 390 for my $j ( 1 .. 8 ) { 391 next if $i == $j; 392 my $stadler_p = compare($t,$i,$j); 393 my $port_p = $tree->get_root->calc_rankprob_compare($i,$j); 394 ok( $stadler_p == $port_p ); 395 } 396 } 397}