1 /*
2 * phylotesting.cpp
3 *
4 * Created on: Sep 21, 2019
5 * Author: minh
6 */
7
8
9
10 #ifdef HAVE_CONFIG_H
11 #include <config.h>
12 #endif
13 #include <iqtree_config.h>
14
15 #include "treetesting.h"
16 #include "tree/phylotree.h"
17 #include "tree/phylosupertree.h"
18 #include "gsl/mygsl.h"
19 #include "utils/timeutil.h"
20
21
printSiteLh(const char * filename,PhyloTree * tree,double * ptn_lh,bool append,const char * linename)22 void printSiteLh(const char*filename, PhyloTree *tree, double *ptn_lh,
23 bool append, const char *linename) {
24 int i;
25 double *pattern_lh;
26 if (!ptn_lh) {
27 pattern_lh = new double[tree->getAlnNPattern()];
28 tree->computePatternLikelihood(pattern_lh);
29 } else
30 pattern_lh = ptn_lh;
31
32 try {
33 ofstream out;
34 out.exceptions(ios::failbit | ios::badbit);
35 if (append) {
36 out.open(filename, ios::out | ios::app);
37 } else {
38 out.open(filename);
39 out << 1 << " " << tree->getAlnNSite() << endl;
40 }
41 IntVector pattern_index;
42 tree->aln->getSitePatternIndex(pattern_index);
43 if (!linename)
44 out << "Site_Lh ";
45 else {
46 out.width(10);
47 out << left << linename;
48 }
49 for (i = 0; i < tree->getAlnNSite(); i++)
50 out << " " << pattern_lh[pattern_index[i]];
51 out << endl;
52 out.close();
53 if (!append)
54 cout << "Site log-likelihoods printed to " << filename << endl;
55 } catch (ios::failure) {
56 outError(ERR_WRITE_OUTPUT, filename);
57 }
58
59 if (!ptn_lh)
60 delete[] pattern_lh;
61 }
62
printPartitionLh(const char * filename,PhyloTree * tree,double * ptn_lh,bool append,const char * linename)63 void printPartitionLh(const char*filename, PhyloTree *tree, double *ptn_lh,
64 bool append, const char *linename) {
65
66 ASSERT(tree->isSuperTree());
67 PhyloSuperTree *stree = (PhyloSuperTree*)tree;
68 int i;
69 double *pattern_lh;
70 if (!ptn_lh) {
71 pattern_lh = new double[tree->getAlnNPattern()];
72 tree->computePatternLikelihood(pattern_lh);
73 } else
74 pattern_lh = ptn_lh;
75
76 double partition_lh[stree->size()];
77 int part;
78 double *pattern_lh_ptr = pattern_lh;
79 for (part = 0; part < stree->size(); part++) {
80 size_t nptn = stree->at(part)->getAlnNPattern();
81 partition_lh[part] = 0.0;
82 for (i = 0; i < nptn; i++)
83 partition_lh[part] += pattern_lh_ptr[i] * stree->at(part)->ptn_freq[i];
84 pattern_lh_ptr += nptn;
85 }
86
87 try {
88 ofstream out;
89 out.exceptions(ios::failbit | ios::badbit);
90 if (append) {
91 out.open(filename, ios::out | ios::app);
92 } else {
93 out.open(filename);
94 out << 1 << " " << stree->size() << endl;
95 }
96 if (!linename)
97 out << "Part_Lh ";
98 else {
99 out.width(10);
100 out << left << linename;
101 }
102 for (i = 0; i < stree->size(); i++)
103 out << " " << partition_lh[i];
104 out << endl;
105 out.close();
106 if (!append)
107 cout << "Partition log-likelihoods printed to " << filename << endl;
108 } catch (ios::failure) {
109 outError(ERR_WRITE_OUTPUT, filename);
110 }
111
112 if (!ptn_lh)
113 delete[] pattern_lh;
114 }
115
printSiteLhCategory(const char * filename,PhyloTree * tree,SiteLoglType wsl)116 void printSiteLhCategory(const char*filename, PhyloTree *tree, SiteLoglType wsl) {
117
118 if (wsl == WSL_NONE || wsl == WSL_SITE)
119 return;
120 int ncat = tree->getNumLhCat(wsl);
121 if (tree->isSuperTree()) {
122 PhyloSuperTree *stree = (PhyloSuperTree*)tree;
123 for (auto it = stree->begin(); it != stree->end(); it++) {
124 int part_ncat = (*it)->getNumLhCat(wsl);
125 if (part_ncat > ncat)
126 ncat = part_ncat;
127 }
128 }
129 int i;
130
131
132 try {
133 ofstream out;
134 out.exceptions(ios::failbit | ios::badbit);
135 out.open(filename);
136 out << "# Site likelihood per rate/mixture category" << endl
137 << "# This file can be read in MS Excel or in R with command:" << endl
138 << "# tab=read.table('" << filename << "',header=TRUE,fill=TRUE)" << endl
139 << "# Columns are tab-separated with following meaning:" << endl;
140 if (tree->isSuperTree()) {
141 out << "# Part: Partition ID (1=" << ((PhyloSuperTree*)tree)->at(0)->aln->name << ", etc)" << endl
142 << "# Site: Site ID within partition (starting from 1 for each partition)" << endl;
143 } else
144 out << "# Site: Alignment site ID" << endl;
145
146 out << "# LnL: Logarithm of site likelihood" << endl
147 << "# Thus, sum of LnL is equal to tree log-likelihood" << endl
148 << "# LnLW_k: Logarithm of (category-k site likelihood times category-k weight)" << endl
149 << "# Thus, sum of exp(LnLW_k) is equal to exp(LnL)" << endl;
150
151 if (tree->isSuperTree()) {
152 out << "Part\tSite\tLnL";
153 } else
154 out << "Site\tLnL";
155 for (i = 0; i < ncat; i++)
156 out << "\tLnLW_" << i+1;
157 out << endl;
158 out.precision(4);
159 out.setf(ios::fixed);
160
161 tree->writeSiteLh(out, wsl);
162
163 out.close();
164 cout << "Site log-likelihoods per category printed to " << filename << endl;
165 /*
166 if (!tree->isSuperTree()) {
167 cout << "Log-likelihood of constant sites: " << endl;
168 double const_prob = 0.0;
169 for (i = 0; i < tree->aln->getNPattern(); i++)
170 if (tree->aln->at(i).isConst()) {
171 Pattern pat = tree->aln->at(i);
172 for (Pattern::iterator it = pat.begin(); it != pat.end(); it++)
173 cout << tree->aln->convertStateBackStr(*it);
174 cout << ": " << pattern_lh[i] << endl;
175 const_prob += exp(pattern_lh[i]);
176 }
177 cout << "Probability of const sites: " << const_prob << endl;
178 }
179 */
180 } catch (ios::failure) {
181 outError(ERR_WRITE_OUTPUT, filename);
182 }
183
184 }
185
printAncestralSequences(const char * out_prefix,PhyloTree * tree,AncestralSeqType ast)186 void printAncestralSequences(const char *out_prefix, PhyloTree *tree, AncestralSeqType ast) {
187
188 // int *joint_ancestral = NULL;
189 //
190 // if (tree->params->print_ancestral_sequence == AST_JOINT) {
191 // joint_ancestral = new int[nptn*tree->leafNum];
192 // tree->computeJointAncestralSequences(joint_ancestral);
193 // }
194
195 string filename = (string)out_prefix + ".state";
196 // string filenameseq = (string)out_prefix + ".stateseq";
197
198 try {
199 ofstream out;
200 out.exceptions(ios::failbit | ios::badbit);
201 out.open(filename.c_str());
202 out.setf(ios::fixed, ios::floatfield);
203 out.precision(5);
204
205 // ofstream outseq;
206 // outseq.exceptions(ios::failbit | ios::badbit);
207 // outseq.open(filenameseq.c_str());
208
209 NodeVector nodes;
210 tree->getInternalNodes(nodes);
211
212 double *marginal_ancestral_prob;
213 int *marginal_ancestral_seq;
214
215 // if (tree->params->print_ancestral_sequence == AST_JOINT)
216 // outseq << 2*(tree->nodeNum-tree->leafNum) << " " << nsites << endl;
217 // else
218 // outseq << (tree->nodeNum-tree->leafNum) << " " << nsites << endl;
219 //
220 // int name_width = max(tree->aln->getMaxSeqNameLength(),6)+10;
221
222 out << "# Ancestral state reconstruction for all nodes in " << tree->params->out_prefix << ".treefile" << endl
223 << "# This file can be read in MS Excel or in R with command:" << endl
224 << "# tab=read.table('" << tree->params->out_prefix << ".state',header=TRUE)" << endl
225 << "# Columns are tab-separated with following meaning:" << endl
226 << "# Node: Node name in the tree" << endl;
227 if (tree->isSuperTree()) {
228 PhyloSuperTree *stree = (PhyloSuperTree*)tree;
229 out << "# Part: Partition ID (1=" << stree->at(0)->aln->name << ", etc)" << endl
230 << "# Site: Site ID within partition (starting from 1 for each partition)" << endl;
231 } else
232 out << "# Site: Alignment site ID" << endl;
233
234 out << "# State: Most likely state assignment" << endl
235 << "# p_X: Posterior probability for state X (empirical Bayesian method)" << endl;
236
237 if (tree->isSuperTree()) {
238 PhyloSuperTree *stree = (PhyloSuperTree*)tree;
239 out << "Node\tPart\tSite\tState";
240 for (size_t i = 0; i < stree->front()->aln->num_states; i++)
241 out << "\tp_" << stree->front()->aln->convertStateBackStr(i);
242 } else {
243 out << "Node\tSite\tState";
244 for (size_t i = 0; i < tree->aln->num_states; i++)
245 out << "\tp_" << tree->aln->convertStateBackStr(i);
246 }
247 out << endl;
248
249
250 bool orig_kernel_nonrev;
251 tree->initMarginalAncestralState(out, orig_kernel_nonrev, marginal_ancestral_prob, marginal_ancestral_seq);
252
253 for (NodeVector::iterator it = nodes.begin(); it != nodes.end(); it++) {
254 PhyloNode *node = (PhyloNode*)(*it);
255 PhyloNode *dad = (PhyloNode*)node->neighbors[0]->node;
256
257 tree->computeMarginalAncestralState((PhyloNeighbor*)dad->findNeighbor(node), dad,
258 marginal_ancestral_prob, marginal_ancestral_seq);
259
260 // int *joint_ancestral_node = joint_ancestral + (node->id - tree->leafNum)*nptn;
261
262 // set node name if neccessary
263 if (node->name.empty() || !isalpha(node->name[0])) {
264 node->name = "Node" + convertIntToString(node->id-tree->leafNum+1);
265 }
266
267 // print ancestral state probabilities
268 tree->writeMarginalAncestralState(out, node, marginal_ancestral_prob, marginal_ancestral_seq);
269
270 // print ancestral sequences
271 // outseq.width(name_width);
272 // outseq << left << node->name << " ";
273 // for (i = 0; i < nsites; i++)
274 // outseq << tree->aln->convertStateBackStr(marginal_ancestral_seq[pattern_index[i]]);
275 // outseq << endl;
276 //
277 // if (tree->params->print_ancestral_sequence == AST_JOINT) {
278 // outseq.width(name_width);
279 // outseq << left << (node->name+"_joint") << " ";
280 // for (i = 0; i < nsites; i++)
281 // outseq << tree->aln->convertStateBackStr(joint_ancestral_node[pattern_index[i]]);
282 // outseq << endl;
283 // }
284 }
285
286 tree->endMarginalAncestralState(orig_kernel_nonrev, marginal_ancestral_prob, marginal_ancestral_seq);
287
288 out.close();
289 // outseq.close();
290 cout << "Ancestral state probabilities printed to " << filename << endl;
291 // cout << "Ancestral sequences printed to " << filenameseq << endl;
292
293 } catch (ios::failure) {
294 outError(ERR_WRITE_OUTPUT, filename);
295 }
296
297 // if (joint_ancestral)
298 // delete[] joint_ancestral;
299
300 }
301
printSiteProbCategory(const char * filename,PhyloTree * tree,SiteLoglType wsl)302 void printSiteProbCategory(const char*filename, PhyloTree *tree, SiteLoglType wsl) {
303
304 if (wsl == WSL_NONE || wsl == WSL_SITE)
305 return;
306 // error checking
307 if (!tree->getModel()->isMixture()) {
308 if (wsl != WSL_RATECAT) {
309 outWarning("Switch now to '-wspr' as it is the only option for non-mixture model");
310 wsl = WSL_RATECAT;
311 }
312 } else {
313 // mixture model
314 if (wsl == WSL_MIXTURE_RATECAT && tree->getModelFactory()->fused_mix_rate) {
315 outWarning("-wspmr is not suitable for fused mixture model, switch now to -wspm");
316 wsl = WSL_MIXTURE;
317 }
318 }
319 size_t cat, ncat = tree->getNumLhCat(wsl);
320 double *ptn_prob_cat = new double[((size_t)tree->getAlnNPattern())*ncat];
321 tree->computePatternProbabilityCategory(ptn_prob_cat, wsl);
322
323 try {
324 ofstream out;
325 out.exceptions(ios::failbit | ios::badbit);
326 out.open(filename);
327 if (tree->isSuperTree())
328 out << "Set\t";
329 out << "Site";
330 for (cat = 0; cat < ncat; cat++)
331 out << "\tp" << cat+1;
332 out << endl;
333 IntVector pattern_index;
334 if (tree->isSuperTree()) {
335 PhyloSuperTree *super_tree = (PhyloSuperTree*)tree;
336 size_t offset = 0;
337 for (PhyloSuperTree::iterator it = super_tree->begin(); it != super_tree->end(); it++) {
338 size_t part_ncat = (*it)->getNumLhCat(wsl);
339 (*it)->aln->getSitePatternIndex(pattern_index);
340 size_t site, nsite = (*it)->aln->getNSite();
341 for (site = 0; site < nsite; site++) {
342 out << (it-super_tree->begin())+1 << "\t" << site+1;
343 double *prob_cat = ptn_prob_cat + (offset+pattern_index[site]*part_ncat);
344 for (cat = 0; cat < part_ncat; cat++)
345 out << "\t" << prob_cat[cat];
346 out << endl;
347 }
348 offset += (*it)->aln->getNPattern()*(*it)->getNumLhCat(wsl);
349 }
350 } else {
351 tree->aln->getSitePatternIndex(pattern_index);
352 int nsite = tree->getAlnNSite();
353 for (int site = 0; site < nsite; site++) {
354 out << site+1;
355 double *prob_cat = ptn_prob_cat + pattern_index[site]*ncat;
356 for (cat = 0; cat < ncat; cat++) {
357 out << "\t" << prob_cat[cat];
358 }
359 out << endl;
360 }
361 }
362 out.close();
363 cout << "Site probabilities per category printed to " << filename << endl;
364 } catch (ios::failure) {
365 outError(ERR_WRITE_OUTPUT, filename);
366 }
367
368 }
369
370
printSiteStateFreq(const char * filename,PhyloTree * tree,double * state_freqs)371 void printSiteStateFreq(const char*filename, PhyloTree *tree, double *state_freqs) {
372
373 int i, j, nsites = tree->getAlnNSite(), nstates = tree->aln->num_states;
374 double *ptn_state_freq;
375 if (state_freqs) {
376 ptn_state_freq = state_freqs;
377 } else {
378 ptn_state_freq = new double[((size_t)tree->getAlnNPattern()) * nstates];
379 tree->computePatternStateFreq(ptn_state_freq);
380 }
381
382 try {
383 ofstream out;
384 out.exceptions(ios::failbit | ios::badbit);
385 out.open(filename);
386 IntVector pattern_index;
387 tree->aln->getSitePatternIndex(pattern_index);
388 for (i = 0; i < nsites; i++) {
389 out.width(6);
390 out << left << i+1 << " ";
391 double *state_freq = &ptn_state_freq[pattern_index[i]*nstates];
392 for (j = 0; j < nstates; j++) {
393 out.width(15);
394 out << state_freq[j] << " ";
395 }
396 out << endl;
397 }
398 out.close();
399 cout << "Site state frequency vectors printed to " << filename << endl;
400 } catch (ios::failure) {
401 outError(ERR_WRITE_OUTPUT, filename);
402 }
403 if (!state_freqs)
404 delete [] ptn_state_freq;
405 }
406
printSiteStateFreq(const char * filename,Alignment * aln)407 void printSiteStateFreq(const char* filename, Alignment *aln) {
408 if (aln->site_state_freq.empty())
409 return;
410 int i, j, nsites = aln->getNSite(), nstates = aln->num_states;
411 try {
412 ofstream out;
413 out.exceptions(ios::failbit | ios::badbit);
414 out.open(filename);
415 IntVector pattern_index;
416 aln->getSitePatternIndex(pattern_index);
417 for (i = 0; i < nsites; i++) {
418 out.width(6);
419 out << left << i+1 << " ";
420 double *state_freq = aln->site_state_freq[pattern_index[i]];
421 for (j = 0; j < nstates; j++) {
422 out.width(15);
423 out << state_freq[j] << " ";
424 }
425 out << endl;
426 }
427 out.close();
428 cout << "Site state frequency vectors printed to " << filename << endl;
429 } catch (ios::failure) {
430 outError(ERR_WRITE_OUTPUT, filename);
431 }
432 }
433
countDistinctTrees(const char * filename,bool rooted,IQTree * tree,IntVector & distinct_ids,bool exclude_duplicate)434 int countDistinctTrees(const char *filename, bool rooted, IQTree *tree, IntVector &distinct_ids, bool exclude_duplicate) {
435 StringIntMap treels;
436 try {
437 ifstream in;
438 in.exceptions(ios::failbit | ios::badbit);
439 in.open(filename);
440 // remove the failbit
441 in.exceptions(ios::badbit);
442 int tree_id;
443 for (tree_id = 0; !in.eof(); tree_id++) {
444 if (exclude_duplicate) {
445 tree->freeNode();
446 tree->readTree(in, rooted);
447 tree->setAlignment(tree->aln);
448 tree->setRootNode(tree->params->root);
449 StringIntMap::iterator it = treels.end();
450 ostringstream ostr;
451 tree->printTree(ostr, WT_TAXON_ID | WT_SORT_TAXA);
452 it = treels.find(ostr.str());
453 if (it != treels.end()) { // already in treels
454 distinct_ids.push_back(it->second);
455 } else {
456 distinct_ids.push_back(-1);
457 treels[ostr.str()] = tree_id;
458 }
459 } else {
460 // ignore tree
461 char ch;
462 do {
463 in >> ch;
464 } while (!in.eof() && ch != ';');
465 distinct_ids.push_back(-1);
466 }
467 char ch;
468 in.exceptions(ios::goodbit);
469 (in) >> ch;
470 if (in.eof()) break;
471 in.unget();
472 in.exceptions(ios::failbit | ios::badbit);
473
474 }
475 in.close();
476 } catch (ios::failure) {
477 outError("Cannot read file ", filename);
478 }
479 if (exclude_duplicate)
480 return treels.size();
481 else
482 return distinct_ids.size();
483 }
484
485 //const double TOL_RELL_SCORE = 0.01;
486
487 /*
488 Problem: solve the following linear system equation:
489 a_1*x + b_1*y = c_1
490 a_2*x + b_2*y = c_2
491 ....
492 a_n*x + b_n*y = c_n
493
494 becomes minimizing weighted least square:
495
496 sum_k { w_k*[ c_k - (a_k*x + b_k*y) ]^2 }
497
498
499 the solution is:
500
501 x = [(sum_k w_k*b_k*c_k)*(sum_k w_k*a_k*b_k) - (sum_k w_k*a_k*c_k)(sum_k w_k*b_k^2)] /
502 [ (sum_k w_k*a_k*b_k)^2 - (sum_k w_k*a_k^2)*(sum_k w_k*b_k^2) ]
503
504 y = [(sum_k w_k*a_k*c_k)*(sum_k w_k*a_k*b_k) - (sum_k w_k*b_k*c_k)(sum_k w_k*a_k^2)] /
505 [ (sum_k w_k*a_k*b_k)^2 - (sum_k w_k*a_k^2)*(sum_k w*k*b_k^2) ]
506
507 @param n number of data points
508 @param w weight vector of length n
509 @param a a value vector of length n
510 @param b b value vector of length n
511 @param c c value vector of length n
512 @param[out] x x-value
513 @param[out] y y-value
514 @return least square value
515 */
doWeightedLeastSquare(int n,double * w,double * a,double * b,double * c,double & x,double & y,double & se)516 void doWeightedLeastSquare(int n, double *w, double *a, double *b, double *c, double &x, double &y, double &se) {
517 int k;
518 double BC = 0.0, AB = 0.0, AC = 0.0, A2 = 0.0, B2 = 0.0;
519 double denom;
520 for (k = 0; k < n; k++) {
521 double wa = w[k]*a[k];
522 double wb = w[k]*b[k];
523 AB += wa*b[k];
524 BC += wb*c[k];
525 AC += wa*c[k];
526 A2 += wa*a[k];
527 B2 += wb*b[k];
528 }
529 denom = 1.0/(AB*AB - A2*B2);
530 x = (BC*AB - AC*B2) * denom;
531 y = (AC*AB - BC*A2) * denom;
532
533 se = -denom*(B2+A2+2*AB);
534 ASSERT(se >= 0.0);
535 }
536
537 /**
538 MLE estimates for AU test
539 */
540 class OptimizationAUTest : public Optimization {
541
542 public:
543
OptimizationAUTest(double d,double c,int nscales,double * bp,double * rr,double * rr_inv)544 OptimizationAUTest(double d, double c, int nscales, double *bp, double *rr, double *rr_inv) {
545 this->d = d;
546 this->c = c;
547 this->bp = bp;
548 this->rr = rr;
549 this->rr_inv = rr_inv;
550 this->nscales = nscales;
551
552 }
553
554 /**
555 return the number of dimensions
556 */
getNDim()557 virtual int getNDim() { return 2; }
558
559
560 /**
561 the target function which needs to be optimized
562 @param x the input vector x
563 @return the function value at x
564 */
targetFunk(double x[])565 virtual double targetFunk(double x[]) {
566 d = x[1];
567 c = x[2];
568 double res = 0.0;
569 for (int k = 0; k < nscales; k++) {
570 double cdf = gsl_cdf_ugaussian_P(d*rr[k] + c*rr_inv[k]);
571 res += bp[k] * log(1.0 - cdf) + (1.0-bp[k])*log(cdf);
572 }
573 return res;
574 }
575
optimizeDC()576 void optimizeDC() {
577 double x[3], lower[3], upper[3];
578 bool bound_check[3];
579 x[1] = d;
580 x[2] = c;
581 lower[1] = lower[2] = 1e-4;
582 upper[1] = upper[2] = 100.0;
583 bound_check[1] = bound_check[2] = false;
584 minimizeMultiDimen(x, 2, lower, upper, bound_check, 1e-4);
585 d = x[1];
586 c = x[2];
587 }
588
589 double d, c;
590 int nscales;
591 double *bp;
592 double *rr;
593 double *rr_inv;
594 };
595
596
597 /* BEGIN CODE WAS TAKEN FROM CONSEL PROGRAM */
598
599 /* binary search for a sorted vector
600 find k s.t. vec[k-1] <= t < vec[k]
601 */
cntdist2(double * vec,int bb,double t)602 int cntdist2(double *vec, int bb, double t)
603 {
604 int i,i0,i1;
605
606 i0=0; i1=bb-1;
607 if(t < vec[0]) return 0;
608 else if(vec[bb-1] <= t) return bb;
609
610 while(i1-i0>1) {
611 i=(i0+i1)/2;
612 if(vec[i] <= t) i0=i;
613 else i1=i;
614 }
615
616 return i1;
617 }
618
619 /*
620 smoothing the counting for a sorted vector
621 the piecewise linear function connecting
622 F(v[i]) = 1/(2n) + i/n, for i=0,...,n-1
623 F(1.5v[0]-0.5v[1]) = 0
624 F(1.5v[n-1]-0.5v[n-2]) = 1.
625
626 1. F(x)=0 for x<=1.5v[0]-0.5v[1]
627
628 2. F(x)=1/(2n) + (1/n)*(x-v[0])/(v[1]-v[0])
629 for 1.5v[0]-0.5v[1] < x <= v[0]
630
631 3. F(x)=1/(2n) + i/n + (1/n)*(x-v[i])/(v[i]-v[i+1])
632 for v[i] < x <= v[i+1], i=0,...,
633
634 4. F(x)=1-(1/2n) + (1/n)*(x-v[n-1])/(v[n-1]-v[n-2])
635 for v[n-1] < x <= 1.5v[n-1]-0.5v[n-2]
636
637 5. F(x)=1 for x > 1.5v[n-1]-0.5v[n-2]
638 */
cntdist3(double * vec,int bb,double t)639 double cntdist3(double *vec, int bb, double t)
640 {
641 double p,n;
642 int i;
643 i=cntdist2(vec,bb,t)-1; /* to find vec[i] <= t < vec[i+1] */
644 n=(double)bb;
645 if(i<0) {
646 if(vec[1]>vec[0]) p=0.5+(t-vec[0])/(vec[1]-vec[0]);
647 else p=0.0;
648 } else if(i<bb-1) {
649 if(vec[i+1]>vec[i]) p=0.5+(double)i+(t-vec[i])/(vec[i+1]-vec[i]);
650 else p=0.5+(double)i; /* <- should never happen */
651 } else {
652 if(vec[bb-1]-vec[bb-2]>0) p=n-0.5+(t-vec[bb-1])/(vec[bb-1]-vec[bb-2]);
653 else p=n;
654 }
655 if(p>n) p=n; else if(p<0.0) p=0.0;
656 return p;
657 }
658
log3(double x)659 double log3(double x)
660 {
661 double y,z1,z2,z3,z4,z5;
662 if(fabs(x)>1.0e-3) {
663 y=-log(1.0-x);
664 } else {
665 z1=x; z2=z1*x; z3=z2*x; z4=z3*x; z5=z4*x;
666 y=((((z5/5.0)+z4/4.0)+z3/3.0)+z2/2.0)+z1;
667 }
668 return y;
669 }
670
671 int mleloopmax=30;
672 double mleeps=1e-10;
mlecoef(double * cnts,double * rr,double bb,int kk,double * coef0,double * lrt,int * df,double * se)673 int mlecoef(double *cnts, double *rr, double bb, int kk,
674 double *coef0, /* set initinal value (size=2) */
675 double *lrt, int *df, /* LRT statistic */
676 double *se
677 )
678 {
679 int i,m,loop;
680 double coef[2], update[2];
681 double d1f, d2f, d11f, d12f, d22f; /* derivatives */
682 double v11, v12, v22; /* inverse of -d??f */
683 double a,e;
684 double s[kk], r[kk],c[kk], b[kk],z[kk],p[kk],d[kk],g[kk],h[kk];
685
686 m=0;
687 for(i=0;i<kk;i++)
688 {
689 r[m]=rr[i]; s[m]=sqrt(rr[i]); c[m]=cnts[i]*bb; b[m]=bb;
690 m++;
691 }
692 if(m<2) return 1;
693
694 coef[0]=coef0[0]; /* signed distance */
695 coef[1]=coef0[1]; /* curvature */
696
697 for(loop=0;loop<mleloopmax;loop++) {
698 d1f=d2f=d11f=d12f=d22f=0.0;
699 for(i=0;i<m;i++) {
700 z[i]=coef[0]*s[i]+coef[1]/s[i];
701 p[i]=gsl_cdf_ugaussian_P(-z[i]);
702 d[i]=gsl_ran_ugaussian_pdf(z[i]);
703 if(p[i]>0.0 && p[i]<1.0) {
704 g[i]=d[i]*( d[i]*(-c[i]+2.0*c[i]*p[i]-b[i]*p[i]*p[i])/
705 (p[i]*p[i]*(1.0-p[i])*(1.0-p[i]))
706 + z[i]*(c[i]-b[i]*p[i])/(p[i]*(1.0-p[i])) );
707 h[i]=d[i]*(c[i]-b[i]*p[i])/(p[i]*(1.0-p[i]));
708 } else { g[i]=h[i]=0.0; }
709 d1f+= -h[i]*s[i]; d2f+= -h[i]/s[i];
710 d11f+= g[i]*r[i]; d12f+= g[i]; d22f+= g[i]/r[i];
711 }
712
713 a=d11f*d22f-d12f*d12f;
714 if(a==0.0) {
715 return 2;
716 }
717 v11=-d22f/a; v12=d12f/a; v22=-d11f/a;
718
719 /* Newton-Raphson update */
720 update[0]=v11*d1f+v12*d2f; update[1]=v12*d1f+v22*d2f;
721 coef[0]+=update[0]; coef[1]+=update[1];
722
723 /* check convergence */
724 e=-d11f*update[0]*update[0]-2.0*d12f*update[0]*update[1]
725 -d22f*update[1]*update[1];
726
727 if(e<mleeps) break;
728 }
729
730 /* calc log-likelihood */
731 *lrt=0.0; *df=0;
732 for(i=0;i<m;i++) {
733 if(p[i]>0.0 && p[i]<1.0) {
734 *df+=1;
735 if(c[i]>0.0) a=c[i]*log(c[i]/b[i]/p[i]); else a=0.0;
736 if(c[i]<b[i]) a+=(b[i]-c[i])*(log3(p[i])-log3(c[i]/b[i]));
737 *lrt += a;
738 }
739 }
740 *lrt *= 2.0; *df -= 2;
741
742 /* write back the results */
743 coef0[0]=coef[0]; coef0[1]=coef[1];
744 *se = v11 + v22 - 2*v12;
745 // vmat[0][0]=v11;vmat[0][1]=vmat[1][0]=v12;vmat[1][1]=v22;
746 if(loop==mleloopmax || *df< 0) i=1; else i=0;
747 return i;
748 }
749
750 /* END CODE WAS TAKEN FROM CONSEL PROGRAM */
751
752 /**
753 @param tree_lhs RELL score matrix of size #trees x #replicates
754 */
performAUTest(Params & params,PhyloTree * tree,double * pattern_lhs,vector<TreeInfo> & info)755 void performAUTest(Params ¶ms, PhyloTree *tree, double *pattern_lhs, vector<TreeInfo> &info) {
756
757 if (params.topotest_replicates < 10000)
758 outWarning("Too few replicates for AU test. At least -zb 10000 for reliable results!");
759
760 /* STEP 1: specify scale factors */
761 size_t nscales = 10;
762 double r[] = {0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4};
763 double rr[] = {sqrt(0.5), sqrt(0.6), sqrt(0.7), sqrt(0.8), sqrt(0.9), 1.0,
764 sqrt(1.1), sqrt(1.2), sqrt(1.3), sqrt(1.4)};
765 double rr_inv[] = {sqrt(1/0.5), sqrt(1/0.6), sqrt(1/0.7), sqrt(1/0.8), sqrt(1/0.9), 1.0,
766 sqrt(1/1.1), sqrt(1/1.2), sqrt(1/1.3), sqrt(1/1.4)};
767
768 /* STEP 2: compute bootstrap proportion */
769 size_t ntrees = info.size();
770 size_t nboot = params.topotest_replicates;
771 // double nboot_inv = 1.0 / nboot;
772
773 size_t nptn = tree->getAlnNPattern();
774 size_t maxnptn = get_safe_upper_limit(nptn);
775
776 // double *bp = new double[ntrees*nscales];
777 // memset(bp, 0, sizeof(double)*ntrees*nscales);
778
779 double *treelhs;
780 cout << (ntrees*nscales*nboot*sizeof(double) >> 20) << " MB required for AU test" << endl;
781 treelhs = new double[ntrees*nscales*nboot];
782 if (!treelhs)
783 outError("Not enough memory to perform AU test!");
784
785 size_t k, tid, ptn;
786
787 double start_time = getRealTime();
788
789 cout << "Generating " << nscales << " x " << nboot << " multiscale bootstrap replicates... ";
790
791 #ifdef _OPENMP
792 #pragma omp parallel private(k, tid, ptn)
793 {
794 int *rstream;
795 init_random(params.ran_seed + omp_get_thread_num(), false, &rstream);
796 #else
797 int *rstream = randstream;
798 #endif
799 size_t boot;
800 int *boot_sample = aligned_alloc<int>(maxnptn);
801 memset(boot_sample, 0, maxnptn*sizeof(int));
802
803 double *boot_sample_dbl = aligned_alloc<double>(maxnptn);
804
805 #ifdef _OPENMP
806 #pragma omp for schedule(dynamic)
807 #endif
808 for (k = 0; k < nscales; k++) {
809 string str = "SCALE=" + convertDoubleToString(r[k]);
810 for (boot = 0; boot < nboot; boot++) {
811 if (r[k] == 1.0 && boot == 0)
812 // 2018-10-23: get one of the bootstrap sample as the original alignment
813 tree->aln->getPatternFreq(boot_sample);
814 else
815 tree->aln->createBootstrapAlignment(boot_sample, str.c_str(), rstream);
816 for (ptn = 0; ptn < maxnptn; ptn++)
817 boot_sample_dbl[ptn] = boot_sample[ptn];
818 double max_lh = -DBL_MAX, second_max_lh = -DBL_MAX;
819 int max_tid = -1;
820 for (tid = 0; tid < ntrees; tid++) {
821 double *pattern_lh = pattern_lhs + (tid*maxnptn);
822 double tree_lh;
823 if (params.SSE == LK_386) {
824 tree_lh = 0.0;
825 for (ptn = 0; ptn < nptn; ptn++)
826 tree_lh += pattern_lh[ptn] * boot_sample_dbl[ptn];
827 } else {
828 tree_lh = tree->dotProductDoubleCall(pattern_lh, boot_sample_dbl, nptn);
829 }
830 // rescale lh
831 tree_lh /= r[k];
832
833 // find the max and second max
834 if (tree_lh > max_lh) {
835 second_max_lh = max_lh;
836 max_lh = tree_lh;
837 max_tid = tid;
838 } else if (tree_lh > second_max_lh)
839 second_max_lh = tree_lh;
840
841 treelhs[(tid*nscales+k)*nboot + boot] = tree_lh;
842 }
843
844 // compute difference from max_lh
845 for (tid = 0; tid < ntrees; tid++)
846 if (tid != max_tid)
847 treelhs[(tid*nscales+k)*nboot + boot] = max_lh - treelhs[(tid*nscales+k)*nboot + boot];
848 else
849 treelhs[(tid*nscales+k)*nboot + boot] = second_max_lh - max_lh;
850 // bp[k*ntrees+max_tid] += nboot_inv;
851 } // for boot
852
853 // sort the replicates
854 for (tid = 0; tid < ntrees; tid++) {
855 quicksort<double,int>(treelhs + (tid*nscales+k)*nboot, 0, nboot-1);
856 }
857
858 } // for scale
859
860 aligned_free(boot_sample_dbl);
861 aligned_free(boot_sample);
862
863 #ifdef _OPENMP
864 finish_random(rstream);
865 }
866 #endif
867
868 // if (verbose_mode >= VB_MED) {
869 // cout << "scale";
870 // for (k = 0; k < nscales; k++)
871 // cout << "\t" << r[k];
872 // cout << endl;
873 // for (tid = 0; tid < ntrees; tid++) {
874 // cout << tid;
875 // for (k = 0; k < nscales; k++) {
876 // cout << "\t" << bp[tid+k*ntrees];
877 // }
878 // cout << endl;
879 // }
880 // }
881
882 cout << getRealTime() - start_time << " seconds" << endl;
883
884 /* STEP 3: weighted least square fit */
885
886 double *cc = new double[nscales];
887 double *w = new double[nscales];
888 double *this_bp = new double[nscales];
889 cout << "TreeID\tAU\tRSS\td\tc" << endl;
890 for (tid = 0; tid < ntrees; tid++) {
891 double *this_stat = treelhs + tid*nscales*nboot;
892 double xn = this_stat[(nscales/2)*nboot + nboot/2], x;
893 double c, d; // c, d in original paper
894 int idf0 = -2;
895 double z = 0.0, z0 = 0.0, thp = 0.0, th = 0.0, ze = 0.0, ze0 = 0.0;
896 double pval, se;
897 int df;
898 double rss = 0.0;
899 int step;
900 const int max_step = 30;
901 bool failed = false;
902 for (step = 0; step < max_step; step++) {
903 x = xn;
904 int num_k = 0;
905 for (k = 0; k < nscales; k++) {
906 this_bp[k] = cntdist3(this_stat + k*nboot, nboot, x) / nboot;
907 if (this_bp[k] <= 0 || this_bp[k] >= 1) {
908 cc[k] = w[k] = 0.0;
909 } else {
910 double bp_val = this_bp[k];
911 cc[k] = -gsl_cdf_ugaussian_Pinv(bp_val);
912 double bp_pdf = gsl_ran_ugaussian_pdf(cc[k]);
913 w[k] = bp_pdf*bp_pdf*nboot / (bp_val*(1.0-bp_val));
914 num_k++;
915 }
916 }
917 df = num_k-2;
918 if (num_k >= 2) {
919 // first obtain d and c by weighted least square
920 doWeightedLeastSquare(nscales, w, rr, rr_inv, cc, d, c, se);
921
922 // maximum likelhood fit
923 double coef0[2] = {d, c};
924 int mlefail = mlecoef(this_bp, r, nboot, nscales, coef0, &rss, &df, &se);
925
926 if (!mlefail) {
927 d = coef0[0];
928 c = coef0[1];
929 }
930
931 se = gsl_ran_ugaussian_pdf(d-c)*sqrt(se);
932
933 // second, perform MLE estimate of d and c
934 // OptimizationAUTest mle(d, c, nscales, this_bp, rr, rr_inv);
935 // mle.optimizeDC();
936 // d = mle.d;
937 // c = mle.c;
938
939 /* STEP 4: compute p-value according to Eq. 11 */
940 pval = gsl_cdf_ugaussian_Q(d-c);
941 z = -pval;
942 ze = se;
943 // compute sum of squared difference
944 rss = 0.0;
945 for (k = 0; k < nscales; k++) {
946 double diff = cc[k] - (rr[k]*d + rr_inv[k]*c);
947 rss += w[k] * diff * diff;
948 }
949
950 } else {
951 // not enough data for WLS
952 int num0 = 0;
953 for (k = 0; k < nscales; k++)
954 if (this_bp[k] <= 0.0) num0++;
955 if (num0 > nscales/2)
956 pval = 0.0;
957 else
958 pval = 1.0;
959 se = 0.0;
960 d = c = 0.0;
961 rss = 0.0;
962 if (verbose_mode >= VB_MED)
963 cout << " error in wls" << endl;
964 //info[tid].au_pvalue = pval;
965 //break;
966 }
967
968
969 if (verbose_mode >= VB_MED) {
970 cout.unsetf(ios::fixed);
971 cout << "\t" << step << "\t" << th << "\t" << x << "\t" << pval << "\t" << se << "\t" << nscales-2 << "\t" << d << "\t" << c << "\t" << z << "\t" << ze << "\t" << rss << endl;
972 }
973
974 if(df < 0 && idf0 < 0) { failed = true; break;} /* degenerated */
975
976 if ((df < 0) || (idf0 >= 0 && (z-z0)*(x-thp) > 0.0 && fabs(z-z0)>0.1*ze0)) {
977 if (verbose_mode >= VB_MED)
978 cout << " non-monotone" << endl;
979 th=x;
980 xn=0.5*x+0.5*thp;
981 continue;
982 }
983 if(idf0 >= 0 && (fabs(z-z0)<0.01*ze0)) {
984 if(fabs(th)<1e-10)
985 xn=th;
986 else th=x;
987 } else
988 xn=0.5*th+0.5*x;
989 info[tid].au_pvalue = pval;
990 thp=x;
991 z0=z;
992 ze0=ze;
993 idf0 = df;
994 if(fabs(x-th)<1e-10) break;
995 } // for step
996
997 if (failed && verbose_mode >= VB_MED)
998 cout << " degenerated" << endl;
999
1000 if (step == max_step) {
1001 if (verbose_mode >= VB_MED)
1002 cout << " non-convergence" << endl;
1003 failed = true;
1004 }
1005
1006 double pchi2 = (failed) ? 0.0 : computePValueChiSquare(rss, df);
1007 cout << tid+1 << "\t" << info[tid].au_pvalue << "\t" << rss << "\t" << d << "\t" << c;
1008
1009 // warning if p-value of chi-square < 0.01 (rss too high)
1010 if (pchi2 < 0.01)
1011 cout << " !!!";
1012 cout << endl;
1013 }
1014
1015 delete [] this_bp;
1016 delete [] w;
1017 delete [] cc;
1018
1019 cout << "Time for AU test: " << getRealTime() - start_time << " seconds" << endl;
1020 // delete [] bp;
1021 }
1022
1023
evaluateTrees(string treeset_file,Params & params,IQTree * tree,vector<TreeInfo> & info,IntVector & distinct_ids)1024 void evaluateTrees(string treeset_file, Params ¶ms, IQTree *tree, vector<TreeInfo> &info, IntVector &distinct_ids)
1025 {
1026 if (treeset_file.empty())
1027 return;
1028 cout << endl;
1029 //MTreeSet trees(treeset_file, params.is_rooted, params.tree_burnin, params.tree_max_count);
1030 cout << "Reading trees in " << treeset_file << " ..." << endl;
1031 size_t ntrees = countDistinctTrees(treeset_file.c_str(), params.is_rooted, tree, distinct_ids, params.distinct_trees);
1032 if (ntrees < distinct_ids.size()) {
1033 cout << "WARNING: " << distinct_ids.size() << " trees detected but only " << ntrees << " distinct trees will be evaluated" << endl;
1034 } else {
1035 cout << ntrees << (params.distinct_trees ? " distinct" : "") << " trees detected" << endl;
1036 }
1037 if (ntrees == 0) return;
1038 ifstream in(treeset_file);
1039
1040 //if (trees.size() == 1) return;
1041 //string tree_file = treeset_file;
1042 string tree_file = params.out_prefix;
1043 tree_file += ".trees";
1044 ofstream treeout;
1045 //if (!params.fixed_branch_length) {
1046 treeout.open(tree_file.c_str());
1047 //}
1048 string score_file = params.out_prefix;
1049 score_file += ".treelh";
1050 ofstream scoreout;
1051 if (params.print_tree_lh)
1052 scoreout.open(score_file.c_str());
1053 string site_lh_file = params.out_prefix;
1054 site_lh_file += ".sitelh";
1055 if (params.print_site_lh) {
1056 ofstream site_lh_out(site_lh_file.c_str());
1057 site_lh_out << ntrees << " " << tree->getAlnNSite() << endl;
1058 site_lh_out.close();
1059 }
1060
1061 if (params.print_partition_lh && !tree->isSuperTree()) {
1062 outWarning("-wpl does not work with non-partition model");
1063 params.print_partition_lh = false;
1064 }
1065 string part_lh_file = params.out_prefix;
1066 part_lh_file += ".partlh";
1067 if (params.print_partition_lh) {
1068 ofstream part_lh_out(part_lh_file.c_str());
1069 part_lh_out << ntrees << " " << ((PhyloSuperTree*)tree)->size() << endl;
1070 part_lh_out.close();
1071 }
1072
1073 double time_start = getRealTime();
1074
1075 int *boot_samples = NULL;
1076 size_t boot;
1077 //double *saved_tree_lhs = NULL;
1078 double *tree_lhs = NULL; // RELL score matrix of size #trees x #replicates
1079 double *pattern_lh = NULL;
1080 double *pattern_lhs = NULL;
1081 double *orig_tree_lh = NULL; // Original tree log-likelihoods
1082 double *max_lh = NULL;
1083 double *lhdiff_weights = NULL;
1084 size_t nptn = tree->getAlnNPattern();
1085 size_t maxnptn = get_safe_upper_limit(nptn);
1086
1087 if (params.topotest_replicates && ntrees > 1) {
1088 size_t mem_size = (size_t)params.topotest_replicates*nptn*sizeof(int) +
1089 ntrees*params.topotest_replicates*sizeof(double) +
1090 (nptn + ntrees*3 + params.topotest_replicates*2)*sizeof(double) +
1091 ntrees*sizeof(TreeInfo) +
1092 params.do_weighted_test*(ntrees * nptn * sizeof(double) + ntrees*ntrees*sizeof(double));
1093 cout << "Note: " << ((double)mem_size/1024)/1024 << " MB of RAM required!" << endl;
1094 if (mem_size > getMemorySize()-100000)
1095 outWarning("The required memory does not fit in RAM!");
1096 cout << "Creating " << params.topotest_replicates << " bootstrap replicates..." << endl;
1097 if (!(boot_samples = new int [params.topotest_replicates*nptn]))
1098 outError(ERR_NO_MEMORY);
1099 #ifdef _OPENMP
1100 #pragma omp parallel private(boot) if(nptn > 10000)
1101 {
1102 int *rstream;
1103 init_random(params.ran_seed + omp_get_thread_num(), false, &rstream);
1104 #pragma omp for schedule(static)
1105 #else
1106 int *rstream = randstream;
1107 #endif
1108 for (boot = 0; boot < params.topotest_replicates; boot++)
1109 if (boot == 0)
1110 tree->aln->getPatternFreq(boot_samples + (boot*nptn));
1111 else
1112 tree->aln->createBootstrapAlignment(boot_samples + (boot*nptn), params.bootstrap_spec, rstream);
1113 #ifdef _OPENMP
1114 finish_random(rstream);
1115 }
1116 #endif
1117 cout << "done" << endl;
1118 //if (!(saved_tree_lhs = new double [ntrees * params.topotest_replicates]))
1119 // outError(ERR_NO_MEMORY);
1120 if (!(tree_lhs = new double [ntrees * params.topotest_replicates]))
1121 outError(ERR_NO_MEMORY);
1122 if (params.do_weighted_test || params.do_au_test) {
1123 if (!(lhdiff_weights = new double [ntrees * ntrees]))
1124 outError(ERR_NO_MEMORY);
1125 pattern_lhs = aligned_alloc<double>(ntrees*maxnptn);
1126 // if (!(pattern_lhs = new double[ntrees* nptn]))
1127 // outError(ERR_NO_MEMORY);
1128 }
1129 pattern_lh = aligned_alloc<double>(maxnptn);
1130 // if (!(pattern_lh = new double[nptn]))
1131 // outError(ERR_NO_MEMORY);
1132 if (!(orig_tree_lh = new double[ntrees]))
1133 outError(ERR_NO_MEMORY);
1134 if (!(max_lh = new double[params.topotest_replicates]))
1135 outError(ERR_NO_MEMORY);
1136 }
1137 int tree_index, tid, tid2;
1138 info.resize(ntrees);
1139 //for (MTreeSet::iterator it = trees.begin(); it != trees.end(); it++, tree_index++) {
1140 for (tree_index = 0, tid = 0; tree_index < distinct_ids.size(); tree_index++) {
1141
1142 cout << "Tree " << tree_index + 1;
1143 if (distinct_ids[tree_index] >= 0) {
1144 cout << " / identical to tree " << distinct_ids[tree_index]+1 << endl;
1145 // ignore tree
1146 char ch;
1147 do {
1148 in >> ch;
1149 } while (!in.eof() && ch != ';');
1150 continue;
1151 }
1152 tree->freeNode();
1153 tree->readTree(in, tree->rooted);
1154 if (!tree->findNodeName(tree->aln->getSeqName(0))) {
1155 outError("Taxon " + tree->aln->getSeqName(0) + " not found in tree");
1156 }
1157
1158 if (tree->rooted && tree->getModel()->isReversible()) {
1159 if (tree->leafNum != tree->aln->getNSeq()+1)
1160 outError("Tree does not have same number of taxa as alignment");
1161 tree->convertToUnrooted();
1162 } else if (!tree->rooted && !tree->getModel()->isReversible()) {
1163 if (tree->leafNum != tree->aln->getNSeq())
1164 outError("Tree does not have same number of taxa as alignment");
1165 tree->convertToRooted();
1166 }
1167 tree->setAlignment(tree->aln);
1168 tree->setRootNode(params.root);
1169 if (tree->isSuperTree())
1170 ((PhyloSuperTree*) tree)->mapTrees();
1171
1172 tree->initializeAllPartialLh();
1173 tree->fixNegativeBranch(false);
1174 if (params.fixed_branch_length) {
1175 tree->setCurScore(tree->computeLikelihood());
1176 } else if (params.topotest_optimize_model) {
1177 tree->getModelFactory()->optimizeParameters(BRLEN_OPTIMIZE, false, params.modelEps);
1178 tree->setCurScore(tree->computeLikelihood());
1179 } else {
1180 tree->setCurScore(tree->optimizeAllBranches(100, 0.001));
1181 }
1182 treeout << "[ tree " << tree_index+1 << " lh=" << tree->getCurScore() << " ]";
1183 tree->printTree(treeout);
1184 treeout << endl;
1185 if (params.print_tree_lh)
1186 scoreout << tree->getCurScore() << endl;
1187
1188 cout << " / LogL: " << tree->getCurScore() << endl;
1189
1190 if (pattern_lh) {
1191 double curScore = tree->getCurScore();
1192 memset(pattern_lh, 0, maxnptn*sizeof(double));
1193 tree->computePatternLikelihood(pattern_lh, &curScore);
1194 if (params.do_weighted_test || params.do_au_test)
1195 memcpy(pattern_lhs + tid*maxnptn, pattern_lh, maxnptn*sizeof(double));
1196 }
1197 if (params.print_site_lh) {
1198 string tree_name = "Tree" + convertIntToString(tree_index+1);
1199 printSiteLh(site_lh_file.c_str(), tree, pattern_lh, true, tree_name.c_str());
1200 }
1201 if (params.print_partition_lh) {
1202 string tree_name = "Tree" + convertIntToString(tree_index+1);
1203 printPartitionLh(part_lh_file.c_str(), tree, pattern_lh, true, tree_name.c_str());
1204 }
1205 info[tid].logl = tree->getCurScore();
1206
1207 if (!params.topotest_replicates || ntrees <= 1) {
1208 tid++;
1209 continue;
1210 }
1211 // now compute RELL scores
1212 orig_tree_lh[tid] = tree->getCurScore();
1213 double *tree_lhs_offset = tree_lhs + (tid*params.topotest_replicates);
1214 for (boot = 0; boot < params.topotest_replicates; boot++) {
1215 double lh = 0.0;
1216 int *this_boot_sample = boot_samples + (boot*nptn);
1217 for (size_t ptn = 0; ptn < nptn; ptn++)
1218 lh += pattern_lh[ptn] * this_boot_sample[ptn];
1219 tree_lhs_offset[boot] = lh;
1220 }
1221 tid++;
1222 }
1223
1224 ASSERT(tid == ntrees);
1225
1226 if (params.topotest_replicates && ntrees > 1) {
1227 double *tree_probs = new double[ntrees];
1228 memset(tree_probs, 0, ntrees*sizeof(double));
1229 int *tree_ranks = new int[ntrees];
1230
1231 /* perform RELL BP method */
1232 cout << "Performing RELL-BP test..." << endl;
1233 int *maxtid = new int[params.topotest_replicates];
1234 double *maxL = new double[params.topotest_replicates];
1235 int *maxcount = new int[params.topotest_replicates];
1236 memset(maxtid, 0, params.topotest_replicates*sizeof(int));
1237 memcpy(maxL, tree_lhs, params.topotest_replicates*sizeof(double));
1238 for (boot = 0; boot < params.topotest_replicates; boot++)
1239 maxcount[boot] = 1;
1240 for (tid = 1; tid < ntrees; tid++) {
1241 double *tree_lhs_offset = tree_lhs + (tid * params.topotest_replicates);
1242 for (boot = 0; boot < params.topotest_replicates; boot++)
1243 if (tree_lhs_offset[boot] > maxL[boot] + params.ufboot_epsilon) {
1244 maxL[boot] = tree_lhs_offset[boot];
1245 maxtid[boot] = tid;
1246 maxcount[boot] = 1;
1247 } else if (tree_lhs_offset[boot] > maxL[boot] - params.ufboot_epsilon &&
1248 random_double() <= 1.0/(maxcount[boot]+1)) {
1249 maxL[boot] = max(maxL[boot],tree_lhs_offset[boot]);
1250 maxtid[boot] = tid;
1251 maxcount[boot]++;
1252 }
1253 }
1254 for (boot = 0; boot < params.topotest_replicates; boot++)
1255 tree_probs[maxtid[boot]] += 1.0;
1256 for (tid = 0; tid < ntrees; tid++) {
1257 tree_probs[tid] /= params.topotest_replicates;
1258 info[tid].rell_confident = false;
1259 info[tid].rell_bp = tree_probs[tid];
1260 }
1261 sort_index(tree_probs, tree_probs + ntrees, tree_ranks);
1262 double prob_sum = 0.0;
1263 // obtain the confidence set
1264 for (tid = ntrees-1; tid >= 0; tid--) {
1265 info[tree_ranks[tid]].rell_confident = true;
1266 prob_sum += tree_probs[tree_ranks[tid]];
1267 if (prob_sum > 0.95) break;
1268 }
1269
1270 // sanity check
1271 for (tid = 0, prob_sum = 0.0; tid < ntrees; tid++)
1272 prob_sum += tree_probs[tid];
1273 if (fabs(prob_sum-1.0) > 0.01)
1274 outError("Internal error: Wrong ", __func__);
1275
1276 delete [] maxcount;
1277 delete [] maxL;
1278 delete [] maxtid;
1279
1280 /* now do the SH test */
1281 cout << "Performing KH and SH test..." << endl;
1282 // SH centering step
1283 for (boot = 0; boot < params.topotest_replicates; boot++)
1284 max_lh[boot] = -DBL_MAX;
1285 double *avg_lh = new double[ntrees];
1286 for (tid = 0; tid < ntrees; tid++) {
1287 avg_lh[tid] = 0.0;
1288 double *tree_lhs_offset = tree_lhs + (tid * params.topotest_replicates);
1289 for (boot = 0; boot < params.topotest_replicates; boot++)
1290 avg_lh[tid] += tree_lhs_offset[boot];
1291 avg_lh[tid] /= params.topotest_replicates;
1292 for (boot = 0; boot < params.topotest_replicates; boot++) {
1293 max_lh[boot] = max(max_lh[boot], tree_lhs_offset[boot] - avg_lh[tid]);
1294 }
1295 }
1296
1297 double orig_max_lh = orig_tree_lh[0];
1298 size_t orig_max_id = 0;
1299 double orig_2ndmax_lh = -DBL_MAX;
1300 size_t orig_2ndmax_id = -1;
1301 // find the max tree ID
1302 for (tid = 1; tid < ntrees; tid++)
1303 if (orig_max_lh < orig_tree_lh[tid]) {
1304 orig_max_lh = orig_tree_lh[tid];
1305 orig_max_id = tid;
1306 }
1307 // find the 2nd max tree ID
1308 for (tid = 0; tid < ntrees; tid++)
1309 if (tid != orig_max_id && orig_2ndmax_lh < orig_tree_lh[tid]) {
1310 orig_2ndmax_lh = orig_tree_lh[tid];
1311 orig_2ndmax_id = tid;
1312 }
1313
1314
1315 // SH compute p-value
1316 for (tid = 0; tid < ntrees; tid++) {
1317 double *tree_lhs_offset = tree_lhs + (tid * params.topotest_replicates);
1318 // SH compute original deviation from max_lh
1319 info[tid].kh_pvalue = 0.0;
1320 info[tid].sh_pvalue = 0.0;
1321 size_t max_id = (tid != orig_max_id) ? orig_max_id : orig_2ndmax_id;
1322 double orig_diff = orig_tree_lh[max_id] - orig_tree_lh[tid] - avg_lh[tid];
1323 double *max_kh = tree_lhs + (max_id * params.topotest_replicates);
1324 for (boot = 0; boot < params.topotest_replicates; boot++) {
1325 if (max_lh[boot] - tree_lhs_offset[boot] > orig_diff)
1326 info[tid].sh_pvalue += 1.0;
1327 //double max_kh_here = max(max_kh[boot]-avg_lh[max_id], tree_lhs_offset[boot]-avg_lh[tid]);
1328 double max_kh_here = (max_kh[boot]-avg_lh[max_id]);
1329 if (max_kh_here - tree_lhs_offset[boot] > orig_diff)
1330 info[tid].kh_pvalue += 1.0;
1331 }
1332 info[tid].sh_pvalue /= params.topotest_replicates;
1333 info[tid].kh_pvalue /= params.topotest_replicates;
1334 }
1335
1336 if (params.do_weighted_test) {
1337
1338 cout << "Computing pairwise logl difference variance ..." << endl;
1339 /* computing lhdiff_weights as 1/sqrt(lhdiff_variance) */
1340 for (tid = 0; tid < ntrees; tid++) {
1341 double *pattern_lh1 = pattern_lhs + (tid * maxnptn);
1342 lhdiff_weights[tid*ntrees+tid] = 0.0;
1343 for (tid2 = tid+1; tid2 < ntrees; tid2++) {
1344 double lhdiff_variance = tree->computeLogLDiffVariance(pattern_lh1, pattern_lhs + (tid2*maxnptn));
1345 lhdiff_weights[tid*ntrees+tid2] = 1.0/sqrt(lhdiff_variance);
1346 lhdiff_weights[tid2*ntrees+tid] = lhdiff_weights[tid*ntrees+tid2];
1347 }
1348 }
1349
1350 // Weighted KH and SH test
1351 cout << "Performing WKH and WSH test..." << endl;
1352 for (tid = 0; tid < ntrees; tid++) {
1353 double *tree_lhs_offset = tree_lhs + (tid * params.topotest_replicates);
1354 info[tid].wkh_pvalue = 0.0;
1355 info[tid].wsh_pvalue = 0.0;
1356 double worig_diff = -DBL_MAX;
1357 size_t max_id = -1;
1358 for (tid2 = 0; tid2 < ntrees; tid2++)
1359 if (tid2 != tid) {
1360 double wdiff = (orig_tree_lh[tid2] - orig_tree_lh[tid])*lhdiff_weights[tid*ntrees+tid2];
1361 if (wdiff > worig_diff) {
1362 worig_diff = wdiff;
1363 max_id = tid2;
1364 }
1365 }
1366 for (boot = 0; boot < params.topotest_replicates; boot++) {
1367 double wmax_diff = -DBL_MAX;
1368 for (tid2 = 0; tid2 < ntrees; tid2++)
1369 if (tid2 != tid)
1370 wmax_diff = max(wmax_diff,
1371 (tree_lhs[tid2*params.topotest_replicates+boot] - avg_lh[tid2] -
1372 tree_lhs_offset[boot] + avg_lh[tid]) * lhdiff_weights[tid*ntrees+tid2]);
1373 if (wmax_diff > worig_diff)
1374 info[tid].wsh_pvalue += 1.0;
1375 wmax_diff = (tree_lhs[max_id*params.topotest_replicates+boot] - avg_lh[max_id] -
1376 tree_lhs_offset[boot] + avg_lh[tid]);
1377 if (wmax_diff > orig_tree_lh[max_id] - orig_tree_lh[tid])
1378 info[tid].wkh_pvalue += 1.0;
1379 }
1380 info[tid].wsh_pvalue /= params.topotest_replicates;
1381 info[tid].wkh_pvalue /= params.topotest_replicates;
1382 }
1383 }
1384
1385 delete [] avg_lh;
1386
1387 /* now to ELW - Expected Likelihood Weight method */
1388 cout << "Performing ELW test..." << endl;
1389
1390 for (boot = 0; boot < params.topotest_replicates; boot++)
1391 max_lh[boot] = -DBL_MAX;
1392 for (tid = 0; tid < ntrees; tid++) {
1393 double *tree_lhs_offset = tree_lhs + (tid * params.topotest_replicates);
1394 for (boot = 0; boot < params.topotest_replicates; boot++)
1395 max_lh[boot] = max(max_lh[boot], tree_lhs_offset[boot]);
1396 }
1397 double *sumL = new double[params.topotest_replicates];
1398 memset(sumL, 0, sizeof(double) * params.topotest_replicates);
1399 for (tid = 0; tid < ntrees; tid++) {
1400 double *tree_lhs_offset = tree_lhs + (tid * params.topotest_replicates);
1401 for (boot = 0; boot < params.topotest_replicates; boot++) {
1402 tree_lhs_offset[boot] = exp(tree_lhs_offset[boot] - max_lh[boot]);
1403 sumL[boot] += tree_lhs_offset[boot];
1404 }
1405 }
1406 for (tid = 0; tid < ntrees; tid++) {
1407 double *tree_lhs_offset = tree_lhs + (tid * params.topotest_replicates);
1408 tree_probs[tid] = 0.0;
1409 for (boot = 0; boot < params.topotest_replicates; boot++) {
1410 tree_probs[tid] += (tree_lhs_offset[boot] / sumL[boot]);
1411 }
1412 tree_probs[tid] /= params.topotest_replicates;
1413 info[tid].elw_confident = false;
1414 info[tid].elw_value = tree_probs[tid];
1415 }
1416
1417 sort_index(tree_probs, tree_probs + ntrees, tree_ranks);
1418 prob_sum = 0.0;
1419 // obtain the confidence set
1420 for (tid = ntrees-1; tid >= 0; tid--) {
1421 info[tree_ranks[tid]].elw_confident = true;
1422 prob_sum += tree_probs[tree_ranks[tid]];
1423 if (prob_sum > 0.95) break;
1424 }
1425
1426 // sanity check
1427 for (tid = 0, prob_sum = 0.0; tid < ntrees; tid++)
1428 prob_sum += tree_probs[tid];
1429 if (fabs(prob_sum-1.0) > 0.01)
1430 outError("Internal error: Wrong ", __func__);
1431 delete [] sumL;
1432
1433 if (params.do_au_test) {
1434 cout << "Performing approximately unbiased (AU) test..." << endl;
1435 performAUTest(params, tree, pattern_lhs, info);
1436 }
1437
1438 delete [] tree_ranks;
1439 delete [] tree_probs;
1440
1441 }
1442 if (max_lh)
1443 delete [] max_lh;
1444 if (orig_tree_lh)
1445 delete [] orig_tree_lh;
1446 if (pattern_lh)
1447 aligned_free(pattern_lh);
1448 if (pattern_lhs)
1449 aligned_free(pattern_lhs);
1450 if (lhdiff_weights)
1451 delete [] lhdiff_weights;
1452 if (tree_lhs)
1453 delete [] tree_lhs;
1454 //if (saved_tree_lhs)
1455 // delete [] saved_tree_lhs;
1456 if (boot_samples)
1457 delete [] boot_samples;
1458
1459 if (params.print_tree_lh) {
1460 scoreout.close();
1461 }
1462
1463 treeout.close();
1464 in.close();
1465
1466 cout << "Time for evaluating all trees: " << getRealTime() - time_start << " sec." << endl;
1467
1468 }
1469
1470
evaluateTrees(string treeset_file,Params & params,IQTree * tree)1471 void evaluateTrees(string treeset_file, Params ¶ms, IQTree *tree) {
1472 vector<TreeInfo> info;
1473 IntVector distinct_ids;
1474 evaluateTrees(treeset_file, params, tree, info, distinct_ids);
1475 }
1476
1477
1478
1479