Bio-Phylo-v2.0.1/t/54-rankprob.t

#!/usr/bin/perl
use strict;
use warnings;
use Test::More 'no_plan';
use Bio::Phylo::IO 'parse_tree';
use List::Util qw[sum min max];
use constant True  => !undef;
use constant False => undef;

# # # # # # # # # # #
# Rank functions Daniel Ford, Tanja Gernhard 2006
# Functions:
# rankprob(t,u)     - returns the probability distribution
#                     of the rank of vertex "u" in tree "t"
# expectedrank(t,u) - returns the expected rank
#                     of vertex "u" and the variance
# compare(t,u,v)    - returns the probability that "u"
#                     is below "v" in tree "t"

# This is a more or less literal port to Perl of the functions developed by
# Daniel Ford and Tanja Stadler (nee Gernhard) during Tanja's dissertation
# research. Some minimal changes were made to make the code more idiomatic.

# Bio::Phylo contains a port of this functionality, mostly implemented by
# calc_* methods in Bio::Phylo::Forest::NodeRole. To verify that this port
# has the same outputs as the original code we first verified that the code
# below matches that of the python code (it does), and then we compare the
# output of this literal port with that of the methods that integrate the
# functionality in the Bio::Phylo API. Those tests are here at the bottom
# of the file.

# Calculation of n choose j
# This version saves partial results for use later
my @nc_matrix; #stores the values of nchoose(n,j)
# -- note: order of indices is reversed
sub nchoose_static {
	my ( $n, $j, @nc_matrix ) = @_;
	return 0 if $j > $n;
	if ( @nc_matrix < $j + 1 ) {
		push @nc_matrix, [] for @nc_matrix .. $j;
	}
	if ( @{ $nc_matrix[$j] } < $n + 1 ) {
		push @{ $nc_matrix[$j] }, 0 for @{ $nc_matrix[$j] } .. $j - 1;
	}
	push @{ $nc_matrix[$j] }, 1 if @{ $nc_matrix[$j] } == $j;
	for my $i ( @{ $nc_matrix[$j] } .. $n ) {
		push @{ $nc_matrix[$j] }, $nc_matrix[$j]->[$i-1] * $i / ( $i - $j );
	}
	return $nc_matrix[$j]->[$n];
}

# dynamic programming version
sub nchoose {
	my ( $n, $j ) = @_;
	return nchoose_static($n,$j,@nc_matrix);
}

# GCD - assumes positive integers as input
# (subroutine for compare(t,u,v))
sub gcd {
	my ( $n, $m ) = @_;
	return $n if $n == $m;
	( $n, $m ) =  ( $m, $n ) if $m > $n;
	my $i = int($n / $m);
	$n = $n - $m * $i;
	return $m if $n == 0;

	# recurse
	return gcd($m,$n);
}

# Takes two large integers and attempts to divide them and give
# the float answer without overflowing
# (subroutine for compare(t,u,v))
# does this by first taking out the gcd
sub gcd_divide {
	my ( $n, $m ) = @_;
	my $x = gcd($n,$m);
	$n /= $x;
	$m /= $x;
	return $n/$m;
}

# get the number of descendants of u and of all vertices on the
# path to the root (subroutine for rankprob(t,u))
sub numDescendants {
	my ($tree,$u) = @_;

	# focal node (subtree) is empty, i.e. a leaf
	return [False,False] unless @{ $tree };

	# focal node is u
	return [True,[$tree->[2]->{"leaves_below"}-1]] if $tree->[2]->{"label"}==$u;

	# recurse left
	my $x = numDescendants( $tree->[0], $u );
	if ( $x->[0] ) {
		my $n;

		# focal node has no sibling
		if ( not @{ $tree->[1] } ) {
			$n = 0;
		}
		else {
			$n = $tree->[1]->[2]->{"leaves_below"} - 1;
		}
		return [ True, [ @{ $x->[1] }, $n ] ];
	}

	# recurse right
	my $y = numDescendants( $tree->[1], $u );
	if ( $y->[0] ) {
		my $n;

		# focal node has no sibling
		if ( not @{ $tree->[0] } ) {
			$n = 0;
		}
		else {
			$n = $tree->[0]->[2]->{"leaves_below"} - 1;
		}
		return [ True, [ @{ $y->[1] }, $n ] ];
	}
	else {
		return [False,False];
	}
}

# returns the subtree rooted at the common ancestor of u and v
# (subroutine for compare(t,u,v))
# return
# True/False - have we found u yet
# True/False - have we found v yet
# the subtree - if we have found u and v
# the u half of the subtree
# the v half of the subtree
sub subtree {
	my ($tree,$u,$v) = @_;

	# tree is empty
	if ( not @{ $tree } ) {
		return False, False, False, False, False;
	}

	# recurse left and right
	my ( $found_ul, $found_vl, $subtree_l, $subtree_ul, $subtree_vl ) = subtree( $tree->[0], $u, $v );
	my ( $found_ur, $found_vr, $subtree_r, $subtree_ur, $subtree_vr ) = subtree( $tree->[1], $u, $v );

	# both were left descendants of focal node, return result
	if ( $found_ul and $found_vl ) {
		return $found_ul, $found_vl, $subtree_l, $subtree_ul, $subtree_vl;
	}

	# both were right descendants of focal node, return result
	if ( $found_ur and $found_vr ) {
		return $found_ur, $found_vr, $subtree_r, $subtree_ur, $subtree_vr;
	}

	# have we found either?
	my $found_u = ( $found_ul or $found_ur or $tree->[2]->{"label"} == $u );
	my $found_v = ( $found_vl or $found_vr or $tree->[2]->{"label"} == $v );

	# initialize and assign subtrees
	my ( $subtree_u, $subtree_v );
	$subtree_u = $subtree_ul if $found_ul;
	$subtree_v = $subtree_vl if $found_vl;
	$subtree_u = $subtree_ur if $found_ur;
	$subtree_v = $subtree_vr if $found_vr;
	if ( $found_u and (not $found_v) ) {
		$subtree_u = $tree;
	}
	elsif ( $found_v and (not $found_u) ) {
		$subtree_v = $tree;
	}
	$subtree_u = $tree if $tree->[2]->{"label"} == $u;
	$subtree_v = $tree if $tree->[2]->{"label"} == $v;

	# return results
	return $found_u, $found_v, $tree, $subtree_u, $subtree_v;
}

# A version of rankprob which uses the function numDescendants
sub rankprob {
	my ($t,$u) = @_;
	my $x = numDescendants($t,$u);
	$x = $x->[1];
	my $lhsm = $x->[0];
	my $k = scalar(@$x);
	my $start = 1;
	my $end = 1;
	my $rp = [0,1];
	my $step = 1;
	while ( $step < $k ) {
		my $rhsm = $x->[$step];
		my $newstart = $start+1;
		my $newend = $end + $rhsm + 1;
		my $rp2 = [];
		for my $i ( 0 .. $newend ) {
			push @$rp2, 0;
		}
		for my $i ( $newstart .. $newend ) {
			my $q = max( 0, $i - 1 - $end );
			for my $j ( $q .. min( $rhsm, $i - 2 ) ) {
				my $a = $rp->[$i-$j-1] * nchoose($lhsm + $rhsm - ($i-1),$rhsm-$j) * nchoose($i-2,$j);
				$rp2->[$i]+=$a;
			}
		}
		$rp = $rp2;
		$start = $newstart;
		$end = $newend;
		$lhsm = $lhsm+$rhsm+1;
		$step += 1;
	}
	my $tot = sum( @{ $rp } );
	for my $i ( 0..$#{ $rp } ) {
		$rp->[$i] = $rp->[$i] / $tot;
	}
	return $rp;
}

# For tree "t" and vertex "u" calculate the
# expected rank and variance
sub expectedrank {
	my ( $t, $u ) = @_;
	my $rp = rankprob( $t, $u );
	my $mu = 0;
	my $sigma = 0;
	for my $i ( 0 .. $#{ $rp } ) {
		$mu += $i * $rp->[$i];
		$sigma += $i * $i * $rp->[$i];
	}
	return $mu, $sigma - $mu * $mu;
}

# Gives the probability that vertex labeled v is
# below vertex labeled u
sub compare {
	my ($t,$u,$v) = @_;
	my ($found_u,$found_v,$subtree,$subtree_u,$subtree_v) = subtree($t,$u,$v);

	# both vertices need to occur in the same tree, of course
	if ( not ($found_u and $found_v) ) {
		print "This tree does not have those vertices!";
		return 0;
	}

	# $u is the root of the subtree,
	# hence $v MUST be below $u
	if ( $subtree->[2]->{"label"} == $u ) {
		return 1.0;
	}

	# $v is the root of the subtree,
	# so it can't be below $u
	if ( $subtree->[2]->{"label"} == $v ) {
		return 0.0;
	}

	# calculate rank probabilities in
	# respective subtrees
	my $x = rankprob($subtree_u,$u);
	my $y = rankprob($subtree_v,$v);
	my $usize = $subtree_u->[2]->{"leaves_below"} - 1;
	my $vsize = $subtree_v->[2]->{"leaves_below"} - 1;

	for my $i ( scalar(@$x) .. $usize + 1 ) {
		push @$x, 0;
	}
	my $xcumulative = [0];
	for my $i ( 1 .. $#{ $x } ) {
		push @$xcumulative, $xcumulative->[$i-1] + $x->[$i];
	}
	my $rp = [0];
	for my $i ( 1 .. $#{ $y } ) {
		push @$rp, 0;
		for my $j ( 1 .. $usize) {
			my $a = $y->[$i] * nchoose($i-1+$j,$j) * nchoose($vsize-$i+$usize-$j, $usize-$j) * $xcumulative->[$j];
			$rp->[$i] += $a;
		}
	}
	my $tot = nchoose($usize+$vsize,$vsize);
	return sum(@$rp)/$tot;
}

my $t1 = [
		[
			[],
			[],
			{ 'leaves_below' => 2, 'label' => 4 }
		],
		[],
		{ 'leaves_below' => 3, 'label'=> 3 }
];
my $t2 = [
		[
			[],
			[],
			{ 'leaves_below' => 2, 'label'=> 7 }
		],
		[
			[],
			[],
			{ 'leaves_below' => 2, 'label' => 8 }
		],
	{ 'leaves_below' => 4, 'label' => 6 }
];
my $t3 = [
	[],
	[],
	{'leaves_below' => 2, 'label' => 5 }
];
my $t4 = [ $t1, $t3, { 'leaves_below' => 5, 'label' => 2 } ];

# Newick: (((l6,l7)7,(l8,l9)8)6,(((l1,l2)4,l3)3,(l4,l5)5)2)1;
my $t  = [ $t2, $t4, { 'leaves_below' => 9, 'label' => 1 } ];
my $tree = parse_tree(
	'-format' => 'newick',
	'-string' => '(((l6,l7)7,(l8,l9)8)6,(((l1,l2)4,l3)3,(l4,l5)5)2)1;',
);


# test numDescendants
{
	for my $i ( 1 .. 8 ) {
		my $stadler_result = numDescendants($t,4);
		my $port_result = $tree->get_root->calc_rankprob_tipcounts(4);
		is_deeply($stadler_result,$port_result);
	}
}

# test subtree
{
	for my $i ( 1 .. 8 ) {
		for my $j ( 1 .. 8 ) {
			next if $i == $j;
			my ( $sb1, $sb2, $sn1, $sn2, $sn3 ) = subtree($t,$i,$j);
			my ( $pb1, $pb2, $pn1, $pn2, $pn3 ) = $tree->get_root->get_subtrees($i,$j);

			# the booleans must be the same
			ok( not ( $sb1 ^ $pb1 ) and not ( $sb2 ^ $pb2 ) );

			# if we have a subtree for one we must have one for the other and
			# the root node must have the same label
			if ( $sn1 ) {
				ok( $pn1 );
				ok( $sn1->[2]->{'label'} == $pn1->get_name );
			}
			else {
				ok( ! $pn1 );
			}
			if ( $sn2 ) {
				ok( $pn2 );
				ok( $sn2->[2]->{'label'} == $pn2->get_name );
			}
			else {
				ok( ! $pn2 );
			}
			if ( $sn3 ) {
				ok( $pn3 );
				ok( $sn3->[2]->{'label'} == $pn3->get_name );
			}
			else {
				ok( ! $pn3 );
			}
		}
	}
}

# test rankprob
{
	for my $i ( 1 .. 8 ) {
		my $stadler_rankprob = rankprob($t,$i);
		my $port_rankprob = $tree->get_root->calc_rankprob($i);
		is_deeply($stadler_rankprob,$port_rankprob);
	}
}

# test expectedrank
{
	for my $i ( 1 .. 8 ) {
		my @stadler_exp = expectedrank($t,$i);
		my @port_exp = $tree->get_root->calc_expected_rank($i);
		is_deeply(\@stadler_exp,\@port_exp);
	}
}

# test compare
{
	for my $i ( 1 .. 8 ) {
		for my $j ( 1 .. 8 ) {
			next if $i == $j;
			my $stadler_p = compare($t,$i,$j);
			my $port_p = $tree->get_root->calc_rankprob_compare($i,$j);
			ok( $stadler_p == $port_p );
		}
	}
}