1#!/usr/bin/perl
2use strict;
3use warnings;
4use Test::More 'no_plan';
5use Bio::Phylo::IO 'parse_tree';
6use List::Util qw[sum min max];
7use constant True  => !undef;
8use constant False => undef;
9
10# # # # # # # # # # #
11# Rank functions Daniel Ford, Tanja Gernhard 2006
12# Functions:
13# rankprob(t,u)     - returns the probability distribution
14#                     of the rank of vertex "u" in tree "t"
15# expectedrank(t,u) - returns the expected rank
16#                     of vertex "u" and the variance
17# compare(t,u,v)    - returns the probability that "u"
18#                     is below "v" in tree "t"
19
20# This is a more or less literal port to Perl of the functions developed by
21# Daniel Ford and Tanja Stadler (nee Gernhard) during Tanja's dissertation
22# research. Some minimal changes were made to make the code more idiomatic.
23
24# Bio::Phylo contains a port of this functionality, mostly implemented by
25# calc_* methods in Bio::Phylo::Forest::NodeRole. To verify that this port
26# has the same outputs as the original code we first verified that the code
27# below matches that of the python code (it does), and then we compare the
28# output of this literal port with that of the methods that integrate the
29# functionality in the Bio::Phylo API. Those tests are here at the bottom
30# of the file.
31
32# Calculation of n choose j
33# This version saves partial results for use later
34my @nc_matrix; #stores the values of nchoose(n,j)
35# -- note: order of indices is reversed
36sub nchoose_static {
37	my ( $n, $j, @nc_matrix ) = @_;
38	return 0 if $j > $n;
39	if ( @nc_matrix < $j + 1 ) {
40		push @nc_matrix, [] for @nc_matrix .. $j;
41	}
42	if ( @{ $nc_matrix[$j] } < $n + 1 ) {
43		push @{ $nc_matrix[$j] }, 0 for @{ $nc_matrix[$j] } .. $j - 1;
44	}
45	push @{ $nc_matrix[$j] }, 1 if @{ $nc_matrix[$j] } == $j;
46	for my $i ( @{ $nc_matrix[$j] } .. $n ) {
47		push @{ $nc_matrix[$j] }, $nc_matrix[$j]->[$i-1] * $i / ( $i - $j );
48	}
49	return $nc_matrix[$j]->[$n];
50}
51
52# dynamic programming version
53sub nchoose {
54	my ( $n, $j ) = @_;
55	return nchoose_static($n,$j,@nc_matrix);
56}
57
58# GCD - assumes positive integers as input
59# (subroutine for compare(t,u,v))
60sub gcd {
61	my ( $n, $m ) = @_;
62	return $n if $n == $m;
63	( $n, $m ) =  ( $m, $n ) if $m > $n;
64	my $i = int($n / $m);
65	$n = $n - $m * $i;
66	return $m if $n == 0;
67
68	# recurse
69	return gcd($m,$n);
70}
71
72# Takes two large integers and attempts to divide them and give
73# the float answer without overflowing
74# (subroutine for compare(t,u,v))
75# does this by first taking out the gcd
76sub gcd_divide {
77	my ( $n, $m ) = @_;
78	my $x = gcd($n,$m);
79	$n /= $x;
80	$m /= $x;
81	return $n/$m;
82}
83
84# get the number of descendants of u and of all vertices on the
85# path to the root (subroutine for rankprob(t,u))
86sub numDescendants {
87	my ($tree,$u) = @_;
88
89	# focal node (subtree) is empty, i.e. a leaf
90	return [False,False] unless @{ $tree };
91
92	# focal node is u
93	return [True,[$tree->[2]->{"leaves_below"}-1]] if $tree->[2]->{"label"}==$u;
94
95	# recurse left
96	my $x = numDescendants( $tree->[0], $u );
97	if ( $x->[0] ) {
98		my $n;
99
100		# focal node has no sibling
101		if ( not @{ $tree->[1] } ) {
102			$n = 0;
103		}
104		else {
105			$n = $tree->[1]->[2]->{"leaves_below"} - 1;
106		}
107		return [ True, [ @{ $x->[1] }, $n ] ];
108	}
109
110	# recurse right
111	my $y = numDescendants( $tree->[1], $u );
112	if ( $y->[0] ) {
113		my $n;
114
115		# focal node has no sibling
116		if ( not @{ $tree->[0] } ) {
117			$n = 0;
118		}
119		else {
120			$n = $tree->[0]->[2]->{"leaves_below"} - 1;
121		}
122		return [ True, [ @{ $y->[1] }, $n ] ];
123	}
124	else {
125		return [False,False];
126	}
127}
128
129# returns the subtree rooted at the common ancestor of u and v
130# (subroutine for compare(t,u,v))
131# return
132# True/False - have we found u yet
133# True/False - have we found v yet
134# the subtree - if we have found u and v
135# the u half of the subtree
136# the v half of the subtree
137sub subtree {
138	my ($tree,$u,$v) = @_;
139
140	# tree is empty
141	if ( not @{ $tree } ) {
142		return False, False, False, False, False;
143	}
144
145	# recurse left and right
146	my ( $found_ul, $found_vl, $subtree_l, $subtree_ul, $subtree_vl ) = subtree( $tree->[0], $u, $v );
147	my ( $found_ur, $found_vr, $subtree_r, $subtree_ur, $subtree_vr ) = subtree( $tree->[1], $u, $v );
148
149	# both were left descendants of focal node, return result
150	if ( $found_ul and $found_vl ) {
151		return $found_ul, $found_vl, $subtree_l, $subtree_ul, $subtree_vl;
152	}
153
154	# both were right descendants of focal node, return result
155	if ( $found_ur and $found_vr ) {
156		return $found_ur, $found_vr, $subtree_r, $subtree_ur, $subtree_vr;
157	}
158
159	# have we found either?
160	my $found_u = ( $found_ul or $found_ur or $tree->[2]->{"label"} == $u );
161	my $found_v = ( $found_vl or $found_vr or $tree->[2]->{"label"} == $v );
162
163	# initialize and assign subtrees
164	my ( $subtree_u, $subtree_v );
165	$subtree_u = $subtree_ul if $found_ul;
166	$subtree_v = $subtree_vl if $found_vl;
167	$subtree_u = $subtree_ur if $found_ur;
168	$subtree_v = $subtree_vr if $found_vr;
169	if ( $found_u and (not $found_v) ) {
170		$subtree_u = $tree;
171	}
172	elsif ( $found_v and (not $found_u) ) {
173		$subtree_v = $tree;
174	}
175	$subtree_u = $tree if $tree->[2]->{"label"} == $u;
176	$subtree_v = $tree if $tree->[2]->{"label"} == $v;
177
178	# return results
179	return $found_u, $found_v, $tree, $subtree_u, $subtree_v;
180}
181
182# A version of rankprob which uses the function numDescendants
183sub rankprob {
184	my ($t,$u) = @_;
185	my $x = numDescendants($t,$u);
186	$x = $x->[1];
187	my $lhsm = $x->[0];
188	my $k = scalar(@$x);
189	my $start = 1;
190	my $end = 1;
191	my $rp = [0,1];
192	my $step = 1;
193	while ( $step < $k ) {
194		my $rhsm = $x->[$step];
195		my $newstart = $start+1;
196		my $newend = $end + $rhsm + 1;
197		my $rp2 = [];
198		for my $i ( 0 .. $newend ) {
199			push @$rp2, 0;
200		}
201		for my $i ( $newstart .. $newend ) {
202			my $q = max( 0, $i - 1 - $end );
203			for my $j ( $q .. min( $rhsm, $i - 2 ) ) {
204				my $a = $rp->[$i-$j-1] * nchoose($lhsm + $rhsm - ($i-1),$rhsm-$j) * nchoose($i-2,$j);
205				$rp2->[$i]+=$a;
206			}
207		}
208		$rp = $rp2;
209		$start = $newstart;
210		$end = $newend;
211		$lhsm = $lhsm+$rhsm+1;
212		$step += 1;
213	}
214	my $tot = sum( @{ $rp } );
215	for my $i ( 0..$#{ $rp } ) {
216		$rp->[$i] = $rp->[$i] / $tot;
217	}
218	return $rp;
219}
220
221# For tree "t" and vertex "u" calculate the
222# expected rank and variance
223sub expectedrank {
224	my ( $t, $u ) = @_;
225	my $rp = rankprob( $t, $u );
226	my $mu = 0;
227	my $sigma = 0;
228	for my $i ( 0 .. $#{ $rp } ) {
229		$mu += $i * $rp->[$i];
230		$sigma += $i * $i * $rp->[$i];
231	}
232	return $mu, $sigma - $mu * $mu;
233}
234
235# Gives the probability that vertex labeled v is
236# below vertex labeled u
237sub compare {
238	my ($t,$u,$v) = @_;
239	my ($found_u,$found_v,$subtree,$subtree_u,$subtree_v) = subtree($t,$u,$v);
240
241	# both vertices need to occur in the same tree, of course
242	if ( not ($found_u and $found_v) ) {
243		print "This tree does not have those vertices!";
244		return 0;
245	}
246
247	# $u is the root of the subtree,
248	# hence $v MUST be below $u
249	if ( $subtree->[2]->{"label"} == $u ) {
250		return 1.0;
251	}
252
253	# $v is the root of the subtree,
254	# so it can't be below $u
255	if ( $subtree->[2]->{"label"} == $v ) {
256		return 0.0;
257	}
258
259	# calculate rank probabilities in
260	# respective subtrees
261	my $x = rankprob($subtree_u,$u);
262	my $y = rankprob($subtree_v,$v);
263	my $usize = $subtree_u->[2]->{"leaves_below"} - 1;
264	my $vsize = $subtree_v->[2]->{"leaves_below"} - 1;
265
266	for my $i ( scalar(@$x) .. $usize + 1 ) {
267		push @$x, 0;
268	}
269	my $xcumulative = [0];
270	for my $i ( 1 .. $#{ $x } ) {
271		push @$xcumulative, $xcumulative->[$i-1] + $x->[$i];
272	}
273	my $rp = [0];
274	for my $i ( 1 .. $#{ $y } ) {
275		push @$rp, 0;
276		for my $j ( 1 .. $usize) {
277			my $a = $y->[$i] * nchoose($i-1+$j,$j) * nchoose($vsize-$i+$usize-$j, $usize-$j) * $xcumulative->[$j];
278			$rp->[$i] += $a;
279		}
280	}
281	my $tot = nchoose($usize+$vsize,$vsize);
282	return sum(@$rp)/$tot;
283}
284
285my $t1 = [
286		[
287			[],
288			[],
289			{ 'leaves_below' => 2, 'label' => 4 }
290		],
291		[],
292		{ 'leaves_below' => 3, 'label'=> 3 }
293];
294my $t2 = [
295		[
296			[],
297			[],
298			{ 'leaves_below' => 2, 'label'=> 7 }
299		],
300		[
301			[],
302			[],
303			{ 'leaves_below' => 2, 'label' => 8 }
304		],
305	{ 'leaves_below' => 4, 'label' => 6 }
306];
307my $t3 = [
308	[],
309	[],
310	{'leaves_below' => 2, 'label' => 5 }
311];
312my $t4 = [ $t1, $t3, { 'leaves_below' => 5, 'label' => 2 } ];
313
314# Newick: (((l6,l7)7,(l8,l9)8)6,(((l1,l2)4,l3)3,(l4,l5)5)2)1;
315my $t  = [ $t2, $t4, { 'leaves_below' => 9, 'label' => 1 } ];
316my $tree = parse_tree(
317	'-format' => 'newick',
318	'-string' => '(((l6,l7)7,(l8,l9)8)6,(((l1,l2)4,l3)3,(l4,l5)5)2)1;',
319);
320
321
322# test numDescendants
323{
324	for my $i ( 1 .. 8 ) {
325		my $stadler_result = numDescendants($t,4);
326		my $port_result = $tree->get_root->calc_rankprob_tipcounts(4);
327		is_deeply($stadler_result,$port_result);
328	}
329}
330
331# test subtree
332{
333	for my $i ( 1 .. 8 ) {
334		for my $j ( 1 .. 8 ) {
335			next if $i == $j;
336			my ( $sb1, $sb2, $sn1, $sn2, $sn3 ) = subtree($t,$i,$j);
337			my ( $pb1, $pb2, $pn1, $pn2, $pn3 ) = $tree->get_root->get_subtrees($i,$j);
338
339			# the booleans must be the same
340			ok( not ( $sb1 ^ $pb1 ) and not ( $sb2 ^ $pb2 ) );
341
342			# if we have a subtree for one we must have one for the other and
343			# the root node must have the same label
344			if ( $sn1 ) {
345				ok( $pn1 );
346				ok( $sn1->[2]->{'label'} == $pn1->get_name );
347			}
348			else {
349				ok( ! $pn1 );
350			}
351			if ( $sn2 ) {
352				ok( $pn2 );
353				ok( $sn2->[2]->{'label'} == $pn2->get_name );
354			}
355			else {
356				ok( ! $pn2 );
357			}
358			if ( $sn3 ) {
359				ok( $pn3 );
360				ok( $sn3->[2]->{'label'} == $pn3->get_name );
361			}
362			else {
363				ok( ! $pn3 );
364			}
365		}
366	}
367}
368
369# test rankprob
370{
371	for my $i ( 1 .. 8 ) {
372		my $stadler_rankprob = rankprob($t,$i);
373		my $port_rankprob = $tree->get_root->calc_rankprob($i);
374		is_deeply($stadler_rankprob,$port_rankprob);
375	}
376}
377
378# test expectedrank
379{
380	for my $i ( 1 .. 8 ) {
381		my @stadler_exp = expectedrank($t,$i);
382		my @port_exp = $tree->get_root->calc_expected_rank($i);
383		is_deeply(\@stadler_exp,\@port_exp);
384	}
385}
386
387# test compare
388{
389	for my $i ( 1 .. 8 ) {
390		for my $j ( 1 .. 8 ) {
391			next if $i == $j;
392			my $stadler_p = compare($t,$i,$j);
393			my $port_p = $tree->get_root->calc_rankprob_compare($i,$j);
394			ok( $stadler_p == $port_p );
395		}
396	}
397}