1 /*
2  * phylotesting.cpp
3  *
4  *  Created on: Sep 21, 2019
5  *      Author: minh
6  */
7 
8 
9 
10 #ifdef HAVE_CONFIG_H
11 #include <config.h>
12 #endif
13 #include <iqtree_config.h>
14 
15 #include "treetesting.h"
16 #include "tree/phylotree.h"
17 #include "tree/phylosupertree.h"
18 #include "gsl/mygsl.h"
19 #include "utils/timeutil.h"
20 
21 
printSiteLh(const char * filename,PhyloTree * tree,double * ptn_lh,bool append,const char * linename)22 void printSiteLh(const char*filename, PhyloTree *tree, double *ptn_lh,
23                  bool append, const char *linename) {
24     int i;
25     double *pattern_lh;
26     if (!ptn_lh) {
27         pattern_lh = new double[tree->getAlnNPattern()];
28         tree->computePatternLikelihood(pattern_lh);
29     } else
30         pattern_lh = ptn_lh;
31 
32     try {
33         ofstream out;
34         out.exceptions(ios::failbit | ios::badbit);
35         if (append) {
36             out.open(filename, ios::out | ios::app);
37         } else {
38             out.open(filename);
39             out << 1 << " " << tree->getAlnNSite() << endl;
40         }
41         IntVector pattern_index;
42         tree->aln->getSitePatternIndex(pattern_index);
43         if (!linename)
44             out << "Site_Lh   ";
45         else {
46             out.width(10);
47             out << left << linename;
48         }
49         for (i = 0; i < tree->getAlnNSite(); i++)
50             out << " " << pattern_lh[pattern_index[i]];
51         out << endl;
52         out.close();
53         if (!append)
54             cout << "Site log-likelihoods printed to " << filename << endl;
55     } catch (ios::failure) {
56         outError(ERR_WRITE_OUTPUT, filename);
57     }
58 
59     if (!ptn_lh)
60         delete[] pattern_lh;
61 }
62 
printPartitionLh(const char * filename,PhyloTree * tree,double * ptn_lh,bool append,const char * linename)63 void printPartitionLh(const char*filename, PhyloTree *tree, double *ptn_lh,
64                       bool append, const char *linename) {
65 
66     ASSERT(tree->isSuperTree());
67     PhyloSuperTree *stree = (PhyloSuperTree*)tree;
68     int i;
69     double *pattern_lh;
70     if (!ptn_lh) {
71         pattern_lh = new double[tree->getAlnNPattern()];
72         tree->computePatternLikelihood(pattern_lh);
73     } else
74         pattern_lh = ptn_lh;
75 
76     double partition_lh[stree->size()];
77     int part;
78     double *pattern_lh_ptr = pattern_lh;
79     for (part = 0; part < stree->size(); part++) {
80         size_t nptn = stree->at(part)->getAlnNPattern();
81         partition_lh[part] = 0.0;
82         for (i = 0; i < nptn; i++)
83             partition_lh[part] += pattern_lh_ptr[i] * stree->at(part)->ptn_freq[i];
84         pattern_lh_ptr += nptn;
85     }
86 
87     try {
88         ofstream out;
89         out.exceptions(ios::failbit | ios::badbit);
90         if (append) {
91             out.open(filename, ios::out | ios::app);
92         } else {
93             out.open(filename);
94             out << 1 << " " << stree->size() << endl;
95         }
96         if (!linename)
97             out << "Part_Lh   ";
98         else {
99             out.width(10);
100             out << left << linename;
101         }
102         for (i = 0; i < stree->size(); i++)
103             out << " " << partition_lh[i];
104         out << endl;
105         out.close();
106         if (!append)
107             cout << "Partition log-likelihoods printed to " << filename << endl;
108     } catch (ios::failure) {
109         outError(ERR_WRITE_OUTPUT, filename);
110     }
111 
112     if (!ptn_lh)
113         delete[] pattern_lh;
114 }
115 
printSiteLhCategory(const char * filename,PhyloTree * tree,SiteLoglType wsl)116 void printSiteLhCategory(const char*filename, PhyloTree *tree, SiteLoglType wsl) {
117 
118     if (wsl == WSL_NONE || wsl == WSL_SITE)
119         return;
120     int ncat = tree->getNumLhCat(wsl);
121     if (tree->isSuperTree()) {
122         PhyloSuperTree *stree = (PhyloSuperTree*)tree;
123         for (auto it = stree->begin(); it != stree->end(); it++) {
124             int part_ncat = (*it)->getNumLhCat(wsl);
125             if (part_ncat > ncat)
126                 ncat = part_ncat;
127         }
128     }
129     int i;
130 
131 
132     try {
133         ofstream out;
134         out.exceptions(ios::failbit | ios::badbit);
135         out.open(filename);
136         out << "# Site likelihood per rate/mixture category" << endl
137         << "# This file can be read in MS Excel or in R with command:" << endl
138         << "#   tab=read.table('" <<  filename << "',header=TRUE,fill=TRUE)" << endl
139         << "# Columns are tab-separated with following meaning:" << endl;
140         if (tree->isSuperTree()) {
141             out << "#   Part:   Partition ID (1=" << ((PhyloSuperTree*)tree)->at(0)->aln->name << ", etc)" << endl
142             << "#   Site:   Site ID within partition (starting from 1 for each partition)" << endl;
143         } else
144             out << "#   Site:   Alignment site ID" << endl;
145 
146         out << "#   LnL:    Logarithm of site likelihood" << endl
147         << "#           Thus, sum of LnL is equal to tree log-likelihood" << endl
148         << "#   LnLW_k: Logarithm of (category-k site likelihood times category-k weight)" << endl
149         << "#           Thus, sum of exp(LnLW_k) is equal to exp(LnL)" << endl;
150 
151         if (tree->isSuperTree()) {
152             out << "Part\tSite\tLnL";
153         } else
154             out << "Site\tLnL";
155         for (i = 0; i < ncat; i++)
156             out << "\tLnLW_" << i+1;
157         out << endl;
158         out.precision(4);
159         out.setf(ios::fixed);
160 
161         tree->writeSiteLh(out, wsl);
162 
163         out.close();
164         cout << "Site log-likelihoods per category printed to " << filename << endl;
165         /*
166          if (!tree->isSuperTree()) {
167          cout << "Log-likelihood of constant sites: " << endl;
168          double const_prob = 0.0;
169          for (i = 0; i < tree->aln->getNPattern(); i++)
170          if (tree->aln->at(i).isConst()) {
171          Pattern pat = tree->aln->at(i);
172          for (Pattern::iterator it = pat.begin(); it != pat.end(); it++)
173          cout << tree->aln->convertStateBackStr(*it);
174          cout << ": " << pattern_lh[i] << endl;
175          const_prob += exp(pattern_lh[i]);
176          }
177          cout << "Probability of const sites: " << const_prob << endl;
178          }
179          */
180     } catch (ios::failure) {
181         outError(ERR_WRITE_OUTPUT, filename);
182     }
183 
184 }
185 
printAncestralSequences(const char * out_prefix,PhyloTree * tree,AncestralSeqType ast)186 void printAncestralSequences(const char *out_prefix, PhyloTree *tree, AncestralSeqType ast) {
187 
188     //    int *joint_ancestral = NULL;
189     //
190     //    if (tree->params->print_ancestral_sequence == AST_JOINT) {
191     //        joint_ancestral = new int[nptn*tree->leafNum];
192     //        tree->computeJointAncestralSequences(joint_ancestral);
193     //    }
194 
195     string filename = (string)out_prefix + ".state";
196     //    string filenameseq = (string)out_prefix + ".stateseq";
197 
198     try {
199         ofstream out;
200         out.exceptions(ios::failbit | ios::badbit);
201         out.open(filename.c_str());
202         out.setf(ios::fixed, ios::floatfield);
203         out.precision(5);
204 
205         //        ofstream outseq;
206         //        outseq.exceptions(ios::failbit | ios::badbit);
207         //        outseq.open(filenameseq.c_str());
208 
209         NodeVector nodes;
210         tree->getInternalNodes(nodes);
211 
212         double *marginal_ancestral_prob;
213         int *marginal_ancestral_seq;
214 
215         //        if (tree->params->print_ancestral_sequence == AST_JOINT)
216         //            outseq << 2*(tree->nodeNum-tree->leafNum) << " " << nsites << endl;
217         //        else
218         //            outseq << (tree->nodeNum-tree->leafNum) << " " << nsites << endl;
219         //
220         //        int name_width = max(tree->aln->getMaxSeqNameLength(),6)+10;
221 
222         out << "# Ancestral state reconstruction for all nodes in " << tree->params->out_prefix << ".treefile" << endl
223         << "# This file can be read in MS Excel or in R with command:" << endl
224         << "#   tab=read.table('" <<  tree->params->out_prefix << ".state',header=TRUE)" << endl
225         << "# Columns are tab-separated with following meaning:" << endl
226         << "#   Node:  Node name in the tree" << endl;
227         if (tree->isSuperTree()) {
228             PhyloSuperTree *stree = (PhyloSuperTree*)tree;
229             out << "#   Part:  Partition ID (1=" << stree->at(0)->aln->name << ", etc)" << endl
230             << "#   Site:  Site ID within partition (starting from 1 for each partition)" << endl;
231         } else
232             out << "#   Site:  Alignment site ID" << endl;
233 
234         out << "#   State: Most likely state assignment" << endl
235         << "#   p_X:   Posterior probability for state X (empirical Bayesian method)" << endl;
236 
237         if (tree->isSuperTree()) {
238             PhyloSuperTree *stree = (PhyloSuperTree*)tree;
239             out << "Node\tPart\tSite\tState";
240             for (size_t i = 0; i < stree->front()->aln->num_states; i++)
241                 out << "\tp_" << stree->front()->aln->convertStateBackStr(i);
242         } else {
243             out << "Node\tSite\tState";
244             for (size_t i = 0; i < tree->aln->num_states; i++)
245                 out << "\tp_" << tree->aln->convertStateBackStr(i);
246         }
247         out << endl;
248 
249 
250         bool orig_kernel_nonrev;
251         tree->initMarginalAncestralState(out, orig_kernel_nonrev, marginal_ancestral_prob, marginal_ancestral_seq);
252 
253         for (NodeVector::iterator it = nodes.begin(); it != nodes.end(); it++) {
254             PhyloNode *node = (PhyloNode*)(*it);
255             PhyloNode *dad = (PhyloNode*)node->neighbors[0]->node;
256 
257             tree->computeMarginalAncestralState((PhyloNeighbor*)dad->findNeighbor(node), dad,
258                                                 marginal_ancestral_prob, marginal_ancestral_seq);
259 
260             //            int *joint_ancestral_node = joint_ancestral + (node->id - tree->leafNum)*nptn;
261 
262             // set node name if neccessary
263             if (node->name.empty() || !isalpha(node->name[0])) {
264                 node->name = "Node" + convertIntToString(node->id-tree->leafNum+1);
265             }
266 
267             // print ancestral state probabilities
268             tree->writeMarginalAncestralState(out, node, marginal_ancestral_prob, marginal_ancestral_seq);
269 
270             // print ancestral sequences
271             //            outseq.width(name_width);
272             //            outseq << left << node->name << " ";
273             //            for (i = 0; i < nsites; i++)
274             //                outseq << tree->aln->convertStateBackStr(marginal_ancestral_seq[pattern_index[i]]);
275             //            outseq << endl;
276             //
277             //            if (tree->params->print_ancestral_sequence == AST_JOINT) {
278             //                outseq.width(name_width);
279             //                outseq << left << (node->name+"_joint") << " ";
280             //                for (i = 0; i < nsites; i++)
281             //                    outseq << tree->aln->convertStateBackStr(joint_ancestral_node[pattern_index[i]]);
282             //                outseq << endl;
283             //            }
284         }
285 
286         tree->endMarginalAncestralState(orig_kernel_nonrev, marginal_ancestral_prob, marginal_ancestral_seq);
287 
288         out.close();
289         //        outseq.close();
290         cout << "Ancestral state probabilities printed to " << filename << endl;
291         //        cout << "Ancestral sequences printed to " << filenameseq << endl;
292 
293     } catch (ios::failure) {
294         outError(ERR_WRITE_OUTPUT, filename);
295     }
296 
297     //    if (joint_ancestral)
298     //        delete[] joint_ancestral;
299 
300 }
301 
printSiteProbCategory(const char * filename,PhyloTree * tree,SiteLoglType wsl)302 void printSiteProbCategory(const char*filename, PhyloTree *tree, SiteLoglType wsl) {
303 
304     if (wsl == WSL_NONE || wsl == WSL_SITE)
305         return;
306     // error checking
307     if (!tree->getModel()->isMixture()) {
308         if (wsl != WSL_RATECAT) {
309             outWarning("Switch now to '-wspr' as it is the only option for non-mixture model");
310             wsl = WSL_RATECAT;
311         }
312     } else {
313         // mixture model
314         if (wsl == WSL_MIXTURE_RATECAT && tree->getModelFactory()->fused_mix_rate) {
315             outWarning("-wspmr is not suitable for fused mixture model, switch now to -wspm");
316             wsl = WSL_MIXTURE;
317         }
318     }
319     size_t cat, ncat = tree->getNumLhCat(wsl);
320     double *ptn_prob_cat = new double[((size_t)tree->getAlnNPattern())*ncat];
321     tree->computePatternProbabilityCategory(ptn_prob_cat, wsl);
322 
323     try {
324         ofstream out;
325         out.exceptions(ios::failbit | ios::badbit);
326         out.open(filename);
327         if (tree->isSuperTree())
328             out << "Set\t";
329         out << "Site";
330         for (cat = 0; cat < ncat; cat++)
331             out << "\tp" << cat+1;
332         out << endl;
333         IntVector pattern_index;
334         if (tree->isSuperTree()) {
335             PhyloSuperTree *super_tree = (PhyloSuperTree*)tree;
336             size_t offset = 0;
337             for (PhyloSuperTree::iterator it = super_tree->begin(); it != super_tree->end(); it++) {
338                 size_t part_ncat = (*it)->getNumLhCat(wsl);
339                 (*it)->aln->getSitePatternIndex(pattern_index);
340                 size_t site, nsite = (*it)->aln->getNSite();
341                 for (site = 0; site < nsite; site++) {
342                     out << (it-super_tree->begin())+1 << "\t" << site+1;
343                     double *prob_cat = ptn_prob_cat + (offset+pattern_index[site]*part_ncat);
344                     for (cat = 0; cat < part_ncat; cat++)
345                         out << "\t" << prob_cat[cat];
346                     out << endl;
347                 }
348                 offset += (*it)->aln->getNPattern()*(*it)->getNumLhCat(wsl);
349             }
350         } else {
351             tree->aln->getSitePatternIndex(pattern_index);
352             int nsite = tree->getAlnNSite();
353             for (int site = 0; site < nsite; site++) {
354                 out << site+1;
355                 double *prob_cat = ptn_prob_cat + pattern_index[site]*ncat;
356                 for (cat = 0; cat < ncat; cat++) {
357                     out << "\t" << prob_cat[cat];
358                 }
359                 out << endl;
360             }
361         }
362         out.close();
363         cout << "Site probabilities per category printed to " << filename << endl;
364     } catch (ios::failure) {
365         outError(ERR_WRITE_OUTPUT, filename);
366     }
367 
368 }
369 
370 
printSiteStateFreq(const char * filename,PhyloTree * tree,double * state_freqs)371 void printSiteStateFreq(const char*filename, PhyloTree *tree, double *state_freqs) {
372 
373     int i, j, nsites = tree->getAlnNSite(), nstates = tree->aln->num_states;
374     double *ptn_state_freq;
375     if (state_freqs) {
376         ptn_state_freq = state_freqs;
377     } else {
378         ptn_state_freq = new double[((size_t)tree->getAlnNPattern()) * nstates];
379         tree->computePatternStateFreq(ptn_state_freq);
380     }
381 
382     try {
383         ofstream out;
384         out.exceptions(ios::failbit | ios::badbit);
385         out.open(filename);
386         IntVector pattern_index;
387         tree->aln->getSitePatternIndex(pattern_index);
388         for (i = 0; i < nsites; i++) {
389             out.width(6);
390             out << left << i+1 << " ";
391             double *state_freq = &ptn_state_freq[pattern_index[i]*nstates];
392             for (j = 0; j < nstates; j++) {
393                 out.width(15);
394                 out << state_freq[j] << " ";
395             }
396             out << endl;
397         }
398         out.close();
399         cout << "Site state frequency vectors printed to " << filename << endl;
400     } catch (ios::failure) {
401         outError(ERR_WRITE_OUTPUT, filename);
402     }
403     if (!state_freqs)
404         delete [] ptn_state_freq;
405 }
406 
printSiteStateFreq(const char * filename,Alignment * aln)407 void printSiteStateFreq(const char* filename, Alignment *aln) {
408     if (aln->site_state_freq.empty())
409         return;
410     int i, j, nsites = aln->getNSite(), nstates = aln->num_states;
411     try {
412         ofstream out;
413         out.exceptions(ios::failbit | ios::badbit);
414         out.open(filename);
415         IntVector pattern_index;
416         aln->getSitePatternIndex(pattern_index);
417         for (i = 0; i < nsites; i++) {
418             out.width(6);
419             out << left << i+1 << " ";
420             double *state_freq = aln->site_state_freq[pattern_index[i]];
421             for (j = 0; j < nstates; j++) {
422                 out.width(15);
423                 out << state_freq[j] << " ";
424             }
425             out << endl;
426         }
427         out.close();
428         cout << "Site state frequency vectors printed to " << filename << endl;
429     } catch (ios::failure) {
430         outError(ERR_WRITE_OUTPUT, filename);
431     }
432 }
433 
countDistinctTrees(const char * filename,bool rooted,IQTree * tree,IntVector & distinct_ids,bool exclude_duplicate)434 int countDistinctTrees(const char *filename, bool rooted, IQTree *tree, IntVector &distinct_ids, bool exclude_duplicate) {
435     StringIntMap treels;
436     try {
437         ifstream in;
438         in.exceptions(ios::failbit | ios::badbit);
439         in.open(filename);
440         // remove the failbit
441         in.exceptions(ios::badbit);
442         int tree_id;
443         for (tree_id = 0; !in.eof(); tree_id++) {
444             if (exclude_duplicate) {
445                 tree->freeNode();
446                 tree->readTree(in, rooted);
447                 tree->setAlignment(tree->aln);
448                 tree->setRootNode(tree->params->root);
449                 StringIntMap::iterator it = treels.end();
450                 ostringstream ostr;
451                 tree->printTree(ostr, WT_TAXON_ID | WT_SORT_TAXA);
452                 it = treels.find(ostr.str());
453                 if (it != treels.end()) { // already in treels
454                     distinct_ids.push_back(it->second);
455                 } else {
456                     distinct_ids.push_back(-1);
457                     treels[ostr.str()] = tree_id;
458                 }
459             } else {
460                 // ignore tree
461                 char ch;
462                 do {
463                     in >> ch;
464                 } while (!in.eof() && ch != ';');
465                 distinct_ids.push_back(-1);
466             }
467             char ch;
468             in.exceptions(ios::goodbit);
469             (in) >> ch;
470             if (in.eof()) break;
471             in.unget();
472             in.exceptions(ios::failbit | ios::badbit);
473 
474         }
475         in.close();
476     } catch (ios::failure) {
477         outError("Cannot read file ", filename);
478     }
479     if (exclude_duplicate)
480         return treels.size();
481     else
482         return distinct_ids.size();
483 }
484 
485 //const double TOL_RELL_SCORE = 0.01;
486 
487 /*
488  Problem: solve the following linear system equation:
489  a_1*x + b_1*y = c_1
490  a_2*x + b_2*y = c_2
491  ....
492  a_n*x + b_n*y = c_n
493 
494  becomes minimizing weighted least square:
495 
496  sum_k { w_k*[ c_k - (a_k*x + b_k*y) ]^2 }
497 
498 
499  the solution is:
500 
501  x = [(sum_k w_k*b_k*c_k)*(sum_k w_k*a_k*b_k) - (sum_k w_k*a_k*c_k)(sum_k w_k*b_k^2)] /
502  [ (sum_k w_k*a_k*b_k)^2 - (sum_k w_k*a_k^2)*(sum_k w_k*b_k^2) ]
503 
504  y = [(sum_k w_k*a_k*c_k)*(sum_k w_k*a_k*b_k) - (sum_k w_k*b_k*c_k)(sum_k w_k*a_k^2)] /
505  [ (sum_k w_k*a_k*b_k)^2 - (sum_k w_k*a_k^2)*(sum_k w*k*b_k^2) ]
506 
507  @param n number of data points
508  @param w weight vector of length n
509  @param a a value vector of length n
510  @param b b value vector of length n
511  @param c c value vector of length n
512  @param[out] x x-value
513  @param[out] y y-value
514  @return least square value
515  */
doWeightedLeastSquare(int n,double * w,double * a,double * b,double * c,double & x,double & y,double & se)516 void doWeightedLeastSquare(int n, double *w, double *a, double *b, double *c, double &x, double &y, double &se) {
517     int k;
518     double BC = 0.0, AB = 0.0, AC = 0.0, A2 = 0.0, B2 = 0.0;
519     double denom;
520     for (k = 0; k < n; k++) {
521         double wa = w[k]*a[k];
522         double wb = w[k]*b[k];
523         AB += wa*b[k];
524         BC += wb*c[k];
525         AC += wa*c[k];
526         A2 += wa*a[k];
527         B2 += wb*b[k];
528     }
529     denom = 1.0/(AB*AB - A2*B2);
530     x = (BC*AB - AC*B2) * denom;
531     y = (AC*AB - BC*A2) * denom;
532 
533     se = -denom*(B2+A2+2*AB);
534     ASSERT(se >= 0.0);
535 }
536 
537 /**
538  MLE estimates for AU test
539  */
540 class OptimizationAUTest : public Optimization {
541 
542 public:
543 
OptimizationAUTest(double d,double c,int nscales,double * bp,double * rr,double * rr_inv)544     OptimizationAUTest(double d, double c, int nscales, double *bp, double *rr, double *rr_inv) {
545         this->d = d;
546         this->c = c;
547         this->bp = bp;
548         this->rr = rr;
549         this->rr_inv = rr_inv;
550         this->nscales = nscales;
551 
552     }
553 
554     /**
555      return the number of dimensions
556      */
getNDim()557     virtual int getNDim() { return 2; }
558 
559 
560     /**
561      the target function which needs to be optimized
562      @param x the input vector x
563      @return the function value at x
564      */
targetFunk(double x[])565     virtual double targetFunk(double x[]) {
566         d = x[1];
567         c = x[2];
568         double res = 0.0;
569         for (int k = 0; k < nscales; k++) {
570             double cdf = gsl_cdf_ugaussian_P(d*rr[k] + c*rr_inv[k]);
571             res += bp[k] * log(1.0 - cdf) + (1.0-bp[k])*log(cdf);
572         }
573         return res;
574     }
575 
optimizeDC()576     void optimizeDC() {
577         double x[3], lower[3], upper[3];
578         bool bound_check[3];
579         x[1] = d;
580         x[2] = c;
581         lower[1] = lower[2] = 1e-4;
582         upper[1] = upper[2] = 100.0;
583         bound_check[1] = bound_check[2] = false;
584         minimizeMultiDimen(x, 2, lower, upper, bound_check, 1e-4);
585         d = x[1];
586         c = x[2];
587     }
588 
589     double d, c;
590     int nscales;
591     double *bp;
592     double *rr;
593     double *rr_inv;
594 };
595 
596 
597 /* BEGIN CODE WAS TAKEN FROM CONSEL PROGRAM */
598 
599 /* binary search for a sorted vector
600  find k s.t. vec[k-1] <= t < vec[k]
601  */
cntdist2(double * vec,int bb,double t)602 int cntdist2(double *vec, int bb, double t)
603 {
604     int i,i0,i1;
605 
606     i0=0; i1=bb-1;
607     if(t < vec[0]) return 0;
608     else if(vec[bb-1] <= t) return bb;
609 
610     while(i1-i0>1) {
611         i=(i0+i1)/2;
612         if(vec[i] <= t) i0=i;
613         else i1=i;
614     }
615 
616     return i1;
617 }
618 
619 /*
620  smoothing the counting for a sorted vector
621  the piecewise linear function connecting
622  F(v[i]) =  1/(2n) + i/n, for i=0,...,n-1
623  F(1.5v[0]-0.5v[1]) = 0
624  F(1.5v[n-1]-0.5v[n-2]) = 1.
625 
626  1. F(x)=0 for x<=1.5v[0]-0.5v[1]
627 
628  2. F(x)=1/(2n) + (1/n)*(x-v[0])/(v[1]-v[0])
629  for 1.5v[0]-0.5v[1] < x <= v[0]
630 
631  3. F(x)=1/(2n) + i/n + (1/n)*(x-v[i])/(v[i]-v[i+1])
632  for v[i] < x <= v[i+1], i=0,...,
633 
634  4. F(x)=1-(1/2n) + (1/n)*(x-v[n-1])/(v[n-1]-v[n-2])
635  for v[n-1] < x <= 1.5v[n-1]-0.5v[n-2]
636 
637  5. F(x)=1 for x > 1.5v[n-1]-0.5v[n-2]
638  */
cntdist3(double * vec,int bb,double t)639 double cntdist3(double *vec, int bb, double t)
640 {
641     double p,n;
642     int i;
643     i=cntdist2(vec,bb,t)-1; /* to find vec[i] <= t < vec[i+1] */
644     n=(double)bb;
645     if(i<0) {
646         if(vec[1]>vec[0]) p=0.5+(t-vec[0])/(vec[1]-vec[0]);
647         else p=0.0;
648     } else if(i<bb-1) {
649         if(vec[i+1]>vec[i]) p=0.5+(double)i+(t-vec[i])/(vec[i+1]-vec[i]);
650         else p=0.5+(double)i; /* <- should never happen */
651     } else {
652         if(vec[bb-1]-vec[bb-2]>0) p=n-0.5+(t-vec[bb-1])/(vec[bb-1]-vec[bb-2]);
653         else p=n;
654     }
655     if(p>n) p=n; else if(p<0.0) p=0.0;
656     return p;
657 }
658 
log3(double x)659 double log3(double x)
660 {
661     double y,z1,z2,z3,z4,z5;
662     if(fabs(x)>1.0e-3) {
663         y=-log(1.0-x);
664     } else {
665         z1=x; z2=z1*x; z3=z2*x; z4=z3*x; z5=z4*x;
666         y=((((z5/5.0)+z4/4.0)+z3/3.0)+z2/2.0)+z1;
667     }
668     return y;
669 }
670 
671 int mleloopmax=30;
672 double mleeps=1e-10;
mlecoef(double * cnts,double * rr,double bb,int kk,double * coef0,double * lrt,int * df,double * se)673 int mlecoef(double *cnts, double *rr, double bb, int kk,
674             double *coef0, /* set initinal value (size=2) */
675             double *lrt, int *df, /* LRT statistic */
676             double *se
677             )
678 {
679     int i,m,loop;
680     double coef[2], update[2];
681     double d1f, d2f, d11f, d12f, d22f; /* derivatives */
682     double v11, v12, v22; /* inverse of -d??f */
683     double a,e;
684     double s[kk], r[kk],c[kk], b[kk],z[kk],p[kk],d[kk],g[kk],h[kk];
685 
686     m=0;
687     for(i=0;i<kk;i++)
688         {
689         r[m]=rr[i]; s[m]=sqrt(rr[i]); c[m]=cnts[i]*bb; b[m]=bb;
690         m++;
691         }
692     if(m<2) return 1;
693 
694     coef[0]=coef0[0]; /* signed distance */
695     coef[1]=coef0[1]; /* curvature */
696 
697     for(loop=0;loop<mleloopmax;loop++) {
698         d1f=d2f=d11f=d12f=d22f=0.0;
699         for(i=0;i<m;i++) {
700             z[i]=coef[0]*s[i]+coef[1]/s[i];
701             p[i]=gsl_cdf_ugaussian_P(-z[i]);
702             d[i]=gsl_ran_ugaussian_pdf(z[i]);
703             if(p[i]>0.0 && p[i]<1.0) {
704                 g[i]=d[i]*( d[i]*(-c[i]+2.0*c[i]*p[i]-b[i]*p[i]*p[i])/
705                            (p[i]*p[i]*(1.0-p[i])*(1.0-p[i]))
706                            + z[i]*(c[i]-b[i]*p[i])/(p[i]*(1.0-p[i])) );
707                 h[i]=d[i]*(c[i]-b[i]*p[i])/(p[i]*(1.0-p[i]));
708             } else { g[i]=h[i]=0.0; }
709             d1f+= -h[i]*s[i]; d2f+= -h[i]/s[i];
710             d11f+= g[i]*r[i]; d12f+= g[i]; d22f+= g[i]/r[i];
711         }
712 
713         a=d11f*d22f-d12f*d12f;
714         if(a==0.0) {
715             return 2;
716         }
717         v11=-d22f/a; v12=d12f/a; v22=-d11f/a;
718 
719         /* Newton-Raphson update */
720         update[0]=v11*d1f+v12*d2f; update[1]=v12*d1f+v22*d2f;
721         coef[0]+=update[0]; coef[1]+=update[1];
722 
723         /* check convergence */
724         e=-d11f*update[0]*update[0]-2.0*d12f*update[0]*update[1]
725         -d22f*update[1]*update[1];
726 
727         if(e<mleeps) break;
728     }
729 
730     /* calc log-likelihood */
731     *lrt=0.0; *df=0;
732     for(i=0;i<m;i++) {
733         if(p[i]>0.0 && p[i]<1.0) {
734             *df+=1;
735             if(c[i]>0.0) a=c[i]*log(c[i]/b[i]/p[i]); else a=0.0;
736             if(c[i]<b[i]) a+=(b[i]-c[i])*(log3(p[i])-log3(c[i]/b[i]));
737             *lrt += a;
738         }
739     }
740     *lrt *= 2.0; *df -= 2;
741 
742     /* write back the results */
743     coef0[0]=coef[0]; coef0[1]=coef[1];
744     *se = v11 + v22 - 2*v12;
745     //  vmat[0][0]=v11;vmat[0][1]=vmat[1][0]=v12;vmat[1][1]=v22;
746     if(loop==mleloopmax || *df< 0) i=1; else i=0;
747     return i;
748 }
749 
750 /* END CODE WAS TAKEN FROM CONSEL PROGRAM */
751 
752 /**
753  @param tree_lhs RELL score matrix of size #trees x #replicates
754  */
performAUTest(Params & params,PhyloTree * tree,double * pattern_lhs,vector<TreeInfo> & info)755 void performAUTest(Params &params, PhyloTree *tree, double *pattern_lhs, vector<TreeInfo> &info) {
756 
757     if (params.topotest_replicates < 10000)
758         outWarning("Too few replicates for AU test. At least -zb 10000 for reliable results!");
759 
760     /* STEP 1: specify scale factors */
761     size_t nscales = 10;
762     double r[] = {0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4};
763     double rr[] = {sqrt(0.5), sqrt(0.6), sqrt(0.7), sqrt(0.8), sqrt(0.9), 1.0,
764         sqrt(1.1), sqrt(1.2), sqrt(1.3), sqrt(1.4)};
765     double rr_inv[] = {sqrt(1/0.5), sqrt(1/0.6), sqrt(1/0.7), sqrt(1/0.8), sqrt(1/0.9), 1.0,
766         sqrt(1/1.1), sqrt(1/1.2), sqrt(1/1.3), sqrt(1/1.4)};
767 
768     /* STEP 2: compute bootstrap proportion */
769     size_t ntrees = info.size();
770     size_t nboot = params.topotest_replicates;
771     //    double nboot_inv = 1.0 / nboot;
772 
773     size_t nptn = tree->getAlnNPattern();
774     size_t maxnptn = get_safe_upper_limit(nptn);
775 
776     //    double *bp = new double[ntrees*nscales];
777     //    memset(bp, 0, sizeof(double)*ntrees*nscales);
778 
779     double *treelhs;
780     cout << (ntrees*nscales*nboot*sizeof(double) >> 20) << " MB required for AU test" << endl;
781     treelhs = new double[ntrees*nscales*nboot];
782     if (!treelhs)
783         outError("Not enough memory to perform AU test!");
784 
785     size_t k, tid, ptn;
786 
787     double start_time = getRealTime();
788 
789     cout << "Generating " << nscales << " x " << nboot << " multiscale bootstrap replicates... ";
790 
791 #ifdef _OPENMP
792 #pragma omp parallel private(k, tid, ptn)
793     {
794     int *rstream;
795     init_random(params.ran_seed + omp_get_thread_num(), false, &rstream);
796 #else
797     int *rstream = randstream;
798 #endif
799     size_t boot;
800     int *boot_sample = aligned_alloc<int>(maxnptn);
801     memset(boot_sample, 0, maxnptn*sizeof(int));
802 
803     double *boot_sample_dbl = aligned_alloc<double>(maxnptn);
804 
805 #ifdef _OPENMP
806 #pragma omp for schedule(dynamic)
807 #endif
808     for (k = 0; k < nscales; k++) {
809         string str = "SCALE=" + convertDoubleToString(r[k]);
810         for (boot = 0; boot < nboot; boot++) {
811             if (r[k] == 1.0 && boot == 0)
812                 // 2018-10-23: get one of the bootstrap sample as the original alignment
813                 tree->aln->getPatternFreq(boot_sample);
814             else
815                 tree->aln->createBootstrapAlignment(boot_sample, str.c_str(), rstream);
816             for (ptn = 0; ptn < maxnptn; ptn++)
817                 boot_sample_dbl[ptn] = boot_sample[ptn];
818             double max_lh = -DBL_MAX, second_max_lh = -DBL_MAX;
819             int max_tid = -1;
820             for (tid = 0; tid < ntrees; tid++) {
821                 double *pattern_lh = pattern_lhs + (tid*maxnptn);
822                 double tree_lh;
823                 if (params.SSE == LK_386) {
824                     tree_lh = 0.0;
825                     for (ptn = 0; ptn < nptn; ptn++)
826                         tree_lh += pattern_lh[ptn] * boot_sample_dbl[ptn];
827                 } else {
828                     tree_lh = tree->dotProductDoubleCall(pattern_lh, boot_sample_dbl, nptn);
829                 }
830                 // rescale lh
831                 tree_lh /= r[k];
832 
833                 // find the max and second max
834                 if (tree_lh > max_lh) {
835                     second_max_lh = max_lh;
836                     max_lh = tree_lh;
837                     max_tid = tid;
838                 } else if (tree_lh > second_max_lh)
839                     second_max_lh = tree_lh;
840 
841                 treelhs[(tid*nscales+k)*nboot + boot] = tree_lh;
842             }
843 
844             // compute difference from max_lh
845             for (tid = 0; tid < ntrees; tid++)
846                 if (tid != max_tid)
847                     treelhs[(tid*nscales+k)*nboot + boot] = max_lh - treelhs[(tid*nscales+k)*nboot + boot];
848                 else
849                     treelhs[(tid*nscales+k)*nboot + boot] = second_max_lh - max_lh;
850             //            bp[k*ntrees+max_tid] += nboot_inv;
851         } // for boot
852 
853         // sort the replicates
854         for (tid = 0; tid < ntrees; tid++) {
855             quicksort<double,int>(treelhs + (tid*nscales+k)*nboot, 0, nboot-1);
856         }
857 
858     } // for scale
859 
860     aligned_free(boot_sample_dbl);
861     aligned_free(boot_sample);
862 
863 #ifdef _OPENMP
864     finish_random(rstream);
865     }
866 #endif
867 
868     //    if (verbose_mode >= VB_MED) {
869     //        cout << "scale";
870     //        for (k = 0; k < nscales; k++)
871     //            cout << "\t" << r[k];
872     //        cout << endl;
873     //        for (tid = 0; tid < ntrees; tid++) {
874     //            cout << tid;
875     //            for (k = 0; k < nscales; k++) {
876     //                cout << "\t" << bp[tid+k*ntrees];
877     //            }
878     //            cout << endl;
879     //        }
880     //    }
881 
882     cout << getRealTime() - start_time << " seconds" << endl;
883 
884     /* STEP 3: weighted least square fit */
885 
886     double *cc = new double[nscales];
887     double *w = new double[nscales];
888     double *this_bp = new double[nscales];
889     cout << "TreeID\tAU\tRSS\td\tc" << endl;
890     for (tid = 0; tid < ntrees; tid++) {
891         double *this_stat = treelhs + tid*nscales*nboot;
892         double xn = this_stat[(nscales/2)*nboot + nboot/2], x;
893         double c, d; // c, d in original paper
894         int idf0 = -2;
895         double z = 0.0, z0 = 0.0, thp = 0.0, th = 0.0, ze = 0.0, ze0 = 0.0;
896         double pval, se;
897         int df;
898         double rss = 0.0;
899         int step;
900         const int max_step = 30;
901         bool failed = false;
902         for (step = 0; step < max_step; step++) {
903             x = xn;
904             int num_k = 0;
905             for (k = 0; k < nscales; k++) {
906                 this_bp[k] = cntdist3(this_stat + k*nboot, nboot, x) / nboot;
907                 if (this_bp[k] <= 0 || this_bp[k] >= 1) {
908                     cc[k] = w[k] = 0.0;
909                 } else {
910                     double bp_val = this_bp[k];
911                     cc[k] = -gsl_cdf_ugaussian_Pinv(bp_val);
912                     double bp_pdf = gsl_ran_ugaussian_pdf(cc[k]);
913                     w[k] = bp_pdf*bp_pdf*nboot / (bp_val*(1.0-bp_val));
914                     num_k++;
915                 }
916             }
917             df = num_k-2;
918             if (num_k >= 2) {
919                 // first obtain d and c by weighted least square
920                 doWeightedLeastSquare(nscales, w, rr, rr_inv, cc, d, c, se);
921 
922                 // maximum likelhood fit
923                 double coef0[2] = {d, c};
924                 int mlefail = mlecoef(this_bp, r, nboot, nscales, coef0, &rss, &df, &se);
925 
926                 if (!mlefail) {
927                     d = coef0[0];
928                     c = coef0[1];
929                 }
930 
931                 se = gsl_ran_ugaussian_pdf(d-c)*sqrt(se);
932 
933                 // second, perform MLE estimate of d and c
934                 //            OptimizationAUTest mle(d, c, nscales, this_bp, rr, rr_inv);
935                 //            mle.optimizeDC();
936                 //            d = mle.d;
937                 //            c = mle.c;
938 
939                 /* STEP 4: compute p-value according to Eq. 11 */
940                 pval = gsl_cdf_ugaussian_Q(d-c);
941                 z = -pval;
942                 ze = se;
943                 // compute sum of squared difference
944                 rss = 0.0;
945                 for (k = 0; k < nscales; k++) {
946                     double diff = cc[k] - (rr[k]*d + rr_inv[k]*c);
947                     rss += w[k] * diff * diff;
948                 }
949 
950             } else {
951                 // not enough data for WLS
952                 int num0 = 0;
953                 for (k = 0; k < nscales; k++)
954                     if (this_bp[k] <= 0.0) num0++;
955                 if (num0 > nscales/2)
956                     pval = 0.0;
957                 else
958                     pval = 1.0;
959                 se = 0.0;
960                 d = c = 0.0;
961                 rss = 0.0;
962                 if (verbose_mode >= VB_MED)
963                     cout << "   error in wls" << endl;
964                 //info[tid].au_pvalue = pval;
965                 //break;
966             }
967 
968 
969             if (verbose_mode >= VB_MED) {
970                 cout.unsetf(ios::fixed);
971                 cout << "\t" << step << "\t" << th << "\t" << x << "\t" << pval << "\t" << se << "\t" << nscales-2 << "\t" << d << "\t" << c << "\t" << z << "\t" << ze << "\t" << rss << endl;
972             }
973 
974             if(df < 0 && idf0 < 0) { failed = true; break;} /* degenerated */
975 
976             if ((df < 0) || (idf0 >= 0 && (z-z0)*(x-thp) > 0.0 && fabs(z-z0)>0.1*ze0)) {
977                 if (verbose_mode >= VB_MED)
978                     cout << "   non-monotone" << endl;
979                 th=x;
980                 xn=0.5*x+0.5*thp;
981                 continue;
982             }
983             if(idf0 >= 0 && (fabs(z-z0)<0.01*ze0)) {
984                 if(fabs(th)<1e-10)
985                     xn=th;
986                 else th=x;
987             } else
988                 xn=0.5*th+0.5*x;
989             info[tid].au_pvalue = pval;
990             thp=x;
991             z0=z;
992             ze0=ze;
993             idf0 = df;
994             if(fabs(x-th)<1e-10) break;
995         } // for step
996 
997         if (failed && verbose_mode >= VB_MED)
998             cout << "   degenerated" << endl;
999 
1000         if (step == max_step) {
1001             if (verbose_mode >= VB_MED)
1002                 cout << "   non-convergence" << endl;
1003             failed = true;
1004         }
1005 
1006         double pchi2 = (failed) ? 0.0 : computePValueChiSquare(rss, df);
1007         cout << tid+1 << "\t" << info[tid].au_pvalue << "\t" << rss << "\t" << d << "\t" << c;
1008 
1009         // warning if p-value of chi-square < 0.01 (rss too high)
1010         if (pchi2 < 0.01)
1011             cout << " !!!";
1012         cout << endl;
1013     }
1014 
1015     delete [] this_bp;
1016     delete [] w;
1017     delete [] cc;
1018 
1019     cout << "Time for AU test: " << getRealTime() - start_time << " seconds" << endl;
1020     //    delete [] bp;
1021 }
1022 
1023 
evaluateTrees(string treeset_file,Params & params,IQTree * tree,vector<TreeInfo> & info,IntVector & distinct_ids)1024 void evaluateTrees(string treeset_file, Params &params, IQTree *tree, vector<TreeInfo> &info, IntVector &distinct_ids)
1025 {
1026     if (treeset_file.empty())
1027         return;
1028     cout << endl;
1029     //MTreeSet trees(treeset_file, params.is_rooted, params.tree_burnin, params.tree_max_count);
1030     cout << "Reading trees in " << treeset_file << " ..." << endl;
1031     size_t ntrees = countDistinctTrees(treeset_file.c_str(), params.is_rooted, tree, distinct_ids, params.distinct_trees);
1032     if (ntrees < distinct_ids.size()) {
1033         cout << "WARNING: " << distinct_ids.size() << " trees detected but only " << ntrees << " distinct trees will be evaluated" << endl;
1034     } else {
1035         cout << ntrees << (params.distinct_trees ? " distinct" : "") << " trees detected" << endl;
1036     }
1037     if (ntrees == 0) return;
1038     ifstream in(treeset_file);
1039 
1040     //if (trees.size() == 1) return;
1041     //string tree_file = treeset_file;
1042     string tree_file = params.out_prefix;
1043     tree_file += ".trees";
1044     ofstream treeout;
1045     //if (!params.fixed_branch_length) {
1046     treeout.open(tree_file.c_str());
1047     //}
1048     string score_file = params.out_prefix;
1049     score_file += ".treelh";
1050     ofstream scoreout;
1051     if (params.print_tree_lh)
1052         scoreout.open(score_file.c_str());
1053     string site_lh_file = params.out_prefix;
1054     site_lh_file += ".sitelh";
1055     if (params.print_site_lh) {
1056         ofstream site_lh_out(site_lh_file.c_str());
1057         site_lh_out << ntrees << " " << tree->getAlnNSite() << endl;
1058         site_lh_out.close();
1059     }
1060 
1061     if (params.print_partition_lh && !tree->isSuperTree()) {
1062         outWarning("-wpl does not work with non-partition model");
1063         params.print_partition_lh = false;
1064     }
1065     string part_lh_file = params.out_prefix;
1066     part_lh_file += ".partlh";
1067     if (params.print_partition_lh) {
1068         ofstream part_lh_out(part_lh_file.c_str());
1069         part_lh_out << ntrees << " " << ((PhyloSuperTree*)tree)->size() << endl;
1070         part_lh_out.close();
1071     }
1072 
1073     double time_start = getRealTime();
1074 
1075     int *boot_samples = NULL;
1076     size_t boot;
1077     //double *saved_tree_lhs = NULL;
1078     double *tree_lhs = NULL; // RELL score matrix of size #trees x #replicates
1079     double *pattern_lh = NULL;
1080     double *pattern_lhs = NULL;
1081     double *orig_tree_lh = NULL; // Original tree log-likelihoods
1082     double *max_lh = NULL;
1083     double *lhdiff_weights = NULL;
1084     size_t nptn = tree->getAlnNPattern();
1085     size_t maxnptn = get_safe_upper_limit(nptn);
1086 
1087     if (params.topotest_replicates && ntrees > 1) {
1088         size_t mem_size = (size_t)params.topotest_replicates*nptn*sizeof(int) +
1089         ntrees*params.topotest_replicates*sizeof(double) +
1090         (nptn + ntrees*3 + params.topotest_replicates*2)*sizeof(double) +
1091         ntrees*sizeof(TreeInfo) +
1092         params.do_weighted_test*(ntrees * nptn * sizeof(double) + ntrees*ntrees*sizeof(double));
1093         cout << "Note: " << ((double)mem_size/1024)/1024 << " MB of RAM required!" << endl;
1094         if (mem_size > getMemorySize()-100000)
1095             outWarning("The required memory does not fit in RAM!");
1096         cout << "Creating " << params.topotest_replicates << " bootstrap replicates..." << endl;
1097         if (!(boot_samples = new int [params.topotest_replicates*nptn]))
1098             outError(ERR_NO_MEMORY);
1099 #ifdef _OPENMP
1100 #pragma omp parallel private(boot) if(nptn > 10000)
1101         {
1102         int *rstream;
1103         init_random(params.ran_seed + omp_get_thread_num(), false, &rstream);
1104 #pragma omp for schedule(static)
1105 #else
1106         int *rstream = randstream;
1107 #endif
1108         for (boot = 0; boot < params.topotest_replicates; boot++)
1109             if (boot == 0)
1110                 tree->aln->getPatternFreq(boot_samples + (boot*nptn));
1111             else
1112                 tree->aln->createBootstrapAlignment(boot_samples + (boot*nptn), params.bootstrap_spec, rstream);
1113 #ifdef _OPENMP
1114         finish_random(rstream);
1115         }
1116 #endif
1117         cout << "done" << endl;
1118         //if (!(saved_tree_lhs = new double [ntrees * params.topotest_replicates]))
1119         //    outError(ERR_NO_MEMORY);
1120         if (!(tree_lhs = new double [ntrees * params.topotest_replicates]))
1121             outError(ERR_NO_MEMORY);
1122         if (params.do_weighted_test || params.do_au_test) {
1123             if (!(lhdiff_weights = new double [ntrees * ntrees]))
1124                 outError(ERR_NO_MEMORY);
1125             pattern_lhs = aligned_alloc<double>(ntrees*maxnptn);
1126             //            if (!(pattern_lhs = new double[ntrees* nptn]))
1127             //                outError(ERR_NO_MEMORY);
1128         }
1129         pattern_lh = aligned_alloc<double>(maxnptn);
1130         //        if (!(pattern_lh = new double[nptn]))
1131         //            outError(ERR_NO_MEMORY);
1132         if (!(orig_tree_lh = new double[ntrees]))
1133             outError(ERR_NO_MEMORY);
1134         if (!(max_lh = new double[params.topotest_replicates]))
1135             outError(ERR_NO_MEMORY);
1136     }
1137     int tree_index, tid, tid2;
1138     info.resize(ntrees);
1139     //for (MTreeSet::iterator it = trees.begin(); it != trees.end(); it++, tree_index++) {
1140     for (tree_index = 0, tid = 0; tree_index < distinct_ids.size(); tree_index++) {
1141 
1142         cout << "Tree " << tree_index + 1;
1143         if (distinct_ids[tree_index] >= 0) {
1144             cout << " / identical to tree " << distinct_ids[tree_index]+1 << endl;
1145             // ignore tree
1146             char ch;
1147             do {
1148                 in >> ch;
1149             } while (!in.eof() && ch != ';');
1150             continue;
1151         }
1152         tree->freeNode();
1153         tree->readTree(in, tree->rooted);
1154         if (!tree->findNodeName(tree->aln->getSeqName(0))) {
1155             outError("Taxon " + tree->aln->getSeqName(0) + " not found in tree");
1156         }
1157 
1158         if (tree->rooted && tree->getModel()->isReversible()) {
1159             if (tree->leafNum != tree->aln->getNSeq()+1)
1160                 outError("Tree does not have same number of taxa as alignment");
1161             tree->convertToUnrooted();
1162         } else if (!tree->rooted && !tree->getModel()->isReversible()) {
1163             if (tree->leafNum != tree->aln->getNSeq())
1164                 outError("Tree does not have same number of taxa as alignment");
1165             tree->convertToRooted();
1166         }
1167         tree->setAlignment(tree->aln);
1168         tree->setRootNode(params.root);
1169         if (tree->isSuperTree())
1170             ((PhyloSuperTree*) tree)->mapTrees();
1171 
1172         tree->initializeAllPartialLh();
1173         tree->fixNegativeBranch(false);
1174         if (params.fixed_branch_length) {
1175             tree->setCurScore(tree->computeLikelihood());
1176         } else if (params.topotest_optimize_model) {
1177             tree->getModelFactory()->optimizeParameters(BRLEN_OPTIMIZE, false, params.modelEps);
1178             tree->setCurScore(tree->computeLikelihood());
1179         } else {
1180             tree->setCurScore(tree->optimizeAllBranches(100, 0.001));
1181         }
1182         treeout << "[ tree " << tree_index+1 << " lh=" << tree->getCurScore() << " ]";
1183         tree->printTree(treeout);
1184         treeout << endl;
1185         if (params.print_tree_lh)
1186             scoreout << tree->getCurScore() << endl;
1187 
1188         cout << " / LogL: " << tree->getCurScore() << endl;
1189 
1190         if (pattern_lh) {
1191             double curScore = tree->getCurScore();
1192             memset(pattern_lh, 0, maxnptn*sizeof(double));
1193             tree->computePatternLikelihood(pattern_lh, &curScore);
1194             if (params.do_weighted_test || params.do_au_test)
1195                 memcpy(pattern_lhs + tid*maxnptn, pattern_lh, maxnptn*sizeof(double));
1196         }
1197         if (params.print_site_lh) {
1198             string tree_name = "Tree" + convertIntToString(tree_index+1);
1199             printSiteLh(site_lh_file.c_str(), tree, pattern_lh, true, tree_name.c_str());
1200         }
1201         if (params.print_partition_lh) {
1202             string tree_name = "Tree" + convertIntToString(tree_index+1);
1203             printPartitionLh(part_lh_file.c_str(), tree, pattern_lh, true, tree_name.c_str());
1204         }
1205         info[tid].logl = tree->getCurScore();
1206 
1207         if (!params.topotest_replicates || ntrees <= 1) {
1208             tid++;
1209             continue;
1210         }
1211         // now compute RELL scores
1212         orig_tree_lh[tid] = tree->getCurScore();
1213         double *tree_lhs_offset = tree_lhs + (tid*params.topotest_replicates);
1214         for (boot = 0; boot < params.topotest_replicates; boot++) {
1215             double lh = 0.0;
1216             int *this_boot_sample = boot_samples + (boot*nptn);
1217             for (size_t ptn = 0; ptn < nptn; ptn++)
1218                 lh += pattern_lh[ptn] * this_boot_sample[ptn];
1219             tree_lhs_offset[boot] = lh;
1220         }
1221         tid++;
1222     }
1223 
1224     ASSERT(tid == ntrees);
1225 
1226     if (params.topotest_replicates && ntrees > 1) {
1227         double *tree_probs = new double[ntrees];
1228         memset(tree_probs, 0, ntrees*sizeof(double));
1229         int *tree_ranks = new int[ntrees];
1230 
1231         /* perform RELL BP method */
1232         cout << "Performing RELL-BP test..." << endl;
1233         int *maxtid = new int[params.topotest_replicates];
1234         double *maxL = new double[params.topotest_replicates];
1235         int *maxcount = new int[params.topotest_replicates];
1236         memset(maxtid, 0, params.topotest_replicates*sizeof(int));
1237         memcpy(maxL, tree_lhs, params.topotest_replicates*sizeof(double));
1238         for (boot = 0; boot < params.topotest_replicates; boot++)
1239             maxcount[boot] = 1;
1240         for (tid = 1; tid < ntrees; tid++) {
1241             double *tree_lhs_offset = tree_lhs + (tid * params.topotest_replicates);
1242             for (boot = 0; boot < params.topotest_replicates; boot++)
1243                 if (tree_lhs_offset[boot] > maxL[boot] + params.ufboot_epsilon) {
1244                     maxL[boot] = tree_lhs_offset[boot];
1245                     maxtid[boot] = tid;
1246                     maxcount[boot] = 1;
1247                 } else if (tree_lhs_offset[boot] > maxL[boot] - params.ufboot_epsilon &&
1248                            random_double() <= 1.0/(maxcount[boot]+1)) {
1249                     maxL[boot] = max(maxL[boot],tree_lhs_offset[boot]);
1250                     maxtid[boot] = tid;
1251                     maxcount[boot]++;
1252                 }
1253         }
1254         for (boot = 0; boot < params.topotest_replicates; boot++)
1255             tree_probs[maxtid[boot]] += 1.0;
1256         for (tid = 0; tid < ntrees; tid++) {
1257             tree_probs[tid] /= params.topotest_replicates;
1258             info[tid].rell_confident = false;
1259             info[tid].rell_bp = tree_probs[tid];
1260         }
1261         sort_index(tree_probs, tree_probs + ntrees, tree_ranks);
1262         double prob_sum = 0.0;
1263         // obtain the confidence set
1264         for (tid = ntrees-1; tid >= 0; tid--) {
1265             info[tree_ranks[tid]].rell_confident = true;
1266             prob_sum += tree_probs[tree_ranks[tid]];
1267             if (prob_sum > 0.95) break;
1268         }
1269 
1270         // sanity check
1271         for (tid = 0, prob_sum = 0.0; tid < ntrees; tid++)
1272             prob_sum += tree_probs[tid];
1273         if (fabs(prob_sum-1.0) > 0.01)
1274             outError("Internal error: Wrong ", __func__);
1275 
1276         delete [] maxcount;
1277         delete [] maxL;
1278         delete [] maxtid;
1279 
1280         /* now do the SH test */
1281         cout << "Performing KH and SH test..." << endl;
1282         // SH centering step
1283         for (boot = 0; boot < params.topotest_replicates; boot++)
1284             max_lh[boot] = -DBL_MAX;
1285         double *avg_lh = new double[ntrees];
1286         for (tid = 0; tid < ntrees; tid++) {
1287             avg_lh[tid] = 0.0;
1288             double *tree_lhs_offset = tree_lhs + (tid * params.topotest_replicates);
1289             for (boot = 0; boot < params.topotest_replicates; boot++)
1290                 avg_lh[tid] += tree_lhs_offset[boot];
1291             avg_lh[tid] /= params.topotest_replicates;
1292             for (boot = 0; boot < params.topotest_replicates; boot++) {
1293                 max_lh[boot] = max(max_lh[boot], tree_lhs_offset[boot] - avg_lh[tid]);
1294             }
1295         }
1296 
1297         double orig_max_lh = orig_tree_lh[0];
1298         size_t orig_max_id = 0;
1299         double orig_2ndmax_lh = -DBL_MAX;
1300         size_t orig_2ndmax_id = -1;
1301         // find the max tree ID
1302         for (tid = 1; tid < ntrees; tid++)
1303             if (orig_max_lh < orig_tree_lh[tid]) {
1304                 orig_max_lh = orig_tree_lh[tid];
1305                 orig_max_id = tid;
1306             }
1307         // find the 2nd max tree ID
1308         for (tid = 0; tid < ntrees; tid++)
1309             if (tid != orig_max_id && orig_2ndmax_lh < orig_tree_lh[tid]) {
1310                 orig_2ndmax_lh = orig_tree_lh[tid];
1311                 orig_2ndmax_id = tid;
1312             }
1313 
1314 
1315         // SH compute p-value
1316         for (tid = 0; tid < ntrees; tid++) {
1317             double *tree_lhs_offset = tree_lhs + (tid * params.topotest_replicates);
1318             // SH compute original deviation from max_lh
1319             info[tid].kh_pvalue = 0.0;
1320             info[tid].sh_pvalue = 0.0;
1321             size_t max_id = (tid != orig_max_id) ? orig_max_id : orig_2ndmax_id;
1322             double orig_diff = orig_tree_lh[max_id] - orig_tree_lh[tid] - avg_lh[tid];
1323             double *max_kh = tree_lhs + (max_id * params.topotest_replicates);
1324             for (boot = 0; boot < params.topotest_replicates; boot++) {
1325                 if (max_lh[boot] - tree_lhs_offset[boot] > orig_diff)
1326                     info[tid].sh_pvalue += 1.0;
1327                 //double max_kh_here = max(max_kh[boot]-avg_lh[max_id], tree_lhs_offset[boot]-avg_lh[tid]);
1328                 double max_kh_here = (max_kh[boot]-avg_lh[max_id]);
1329                 if (max_kh_here - tree_lhs_offset[boot] > orig_diff)
1330                     info[tid].kh_pvalue += 1.0;
1331             }
1332             info[tid].sh_pvalue /= params.topotest_replicates;
1333             info[tid].kh_pvalue /= params.topotest_replicates;
1334         }
1335 
1336         if (params.do_weighted_test) {
1337 
1338             cout << "Computing pairwise logl difference variance ..." << endl;
1339             /* computing lhdiff_weights as 1/sqrt(lhdiff_variance) */
1340             for (tid = 0; tid < ntrees; tid++) {
1341                 double *pattern_lh1 = pattern_lhs + (tid * maxnptn);
1342                 lhdiff_weights[tid*ntrees+tid] = 0.0;
1343                 for (tid2 = tid+1; tid2 < ntrees; tid2++) {
1344                     double lhdiff_variance = tree->computeLogLDiffVariance(pattern_lh1, pattern_lhs + (tid2*maxnptn));
1345                     lhdiff_weights[tid*ntrees+tid2] = 1.0/sqrt(lhdiff_variance);
1346                     lhdiff_weights[tid2*ntrees+tid] = lhdiff_weights[tid*ntrees+tid2];
1347                 }
1348             }
1349 
1350             // Weighted KH and SH test
1351             cout << "Performing WKH and WSH test..." << endl;
1352             for (tid = 0; tid < ntrees; tid++) {
1353                 double *tree_lhs_offset = tree_lhs + (tid * params.topotest_replicates);
1354                 info[tid].wkh_pvalue = 0.0;
1355                 info[tid].wsh_pvalue = 0.0;
1356                 double worig_diff = -DBL_MAX;
1357                 size_t max_id = -1;
1358                 for (tid2 = 0; tid2 < ntrees; tid2++)
1359                     if (tid2 != tid) {
1360                         double wdiff = (orig_tree_lh[tid2] - orig_tree_lh[tid])*lhdiff_weights[tid*ntrees+tid2];
1361                         if (wdiff > worig_diff) {
1362                             worig_diff = wdiff;
1363                             max_id = tid2;
1364                         }
1365                     }
1366                 for (boot = 0; boot < params.topotest_replicates; boot++) {
1367                     double wmax_diff = -DBL_MAX;
1368                     for (tid2 = 0; tid2 < ntrees; tid2++)
1369                         if (tid2 != tid)
1370                             wmax_diff = max(wmax_diff,
1371                                             (tree_lhs[tid2*params.topotest_replicates+boot] - avg_lh[tid2] -
1372                                              tree_lhs_offset[boot] + avg_lh[tid]) * lhdiff_weights[tid*ntrees+tid2]);
1373                     if (wmax_diff > worig_diff)
1374                         info[tid].wsh_pvalue += 1.0;
1375                     wmax_diff = (tree_lhs[max_id*params.topotest_replicates+boot] - avg_lh[max_id] -
1376                                  tree_lhs_offset[boot] + avg_lh[tid]);
1377                     if (wmax_diff >  orig_tree_lh[max_id] - orig_tree_lh[tid])
1378                         info[tid].wkh_pvalue += 1.0;
1379                 }
1380                 info[tid].wsh_pvalue /= params.topotest_replicates;
1381                 info[tid].wkh_pvalue /= params.topotest_replicates;
1382             }
1383         }
1384 
1385         delete [] avg_lh;
1386 
1387         /* now to ELW - Expected Likelihood Weight method */
1388         cout << "Performing ELW test..." << endl;
1389 
1390         for (boot = 0; boot < params.topotest_replicates; boot++)
1391             max_lh[boot] = -DBL_MAX;
1392         for (tid = 0; tid < ntrees; tid++) {
1393             double *tree_lhs_offset = tree_lhs + (tid * params.topotest_replicates);
1394             for (boot = 0; boot < params.topotest_replicates; boot++)
1395                 max_lh[boot] = max(max_lh[boot], tree_lhs_offset[boot]);
1396         }
1397         double *sumL = new double[params.topotest_replicates];
1398         memset(sumL, 0, sizeof(double) * params.topotest_replicates);
1399         for (tid = 0; tid < ntrees; tid++) {
1400             double *tree_lhs_offset = tree_lhs + (tid * params.topotest_replicates);
1401             for (boot = 0; boot < params.topotest_replicates; boot++) {
1402                 tree_lhs_offset[boot] = exp(tree_lhs_offset[boot] - max_lh[boot]);
1403                 sumL[boot] += tree_lhs_offset[boot];
1404             }
1405         }
1406         for (tid = 0; tid < ntrees; tid++) {
1407             double *tree_lhs_offset = tree_lhs + (tid * params.topotest_replicates);
1408             tree_probs[tid] = 0.0;
1409             for (boot = 0; boot < params.topotest_replicates; boot++) {
1410                 tree_probs[tid] += (tree_lhs_offset[boot] / sumL[boot]);
1411             }
1412             tree_probs[tid] /= params.topotest_replicates;
1413             info[tid].elw_confident = false;
1414             info[tid].elw_value = tree_probs[tid];
1415         }
1416 
1417         sort_index(tree_probs, tree_probs + ntrees, tree_ranks);
1418         prob_sum = 0.0;
1419         // obtain the confidence set
1420         for (tid = ntrees-1; tid >= 0; tid--) {
1421             info[tree_ranks[tid]].elw_confident = true;
1422             prob_sum += tree_probs[tree_ranks[tid]];
1423             if (prob_sum > 0.95) break;
1424         }
1425 
1426         // sanity check
1427         for (tid = 0, prob_sum = 0.0; tid < ntrees; tid++)
1428             prob_sum += tree_probs[tid];
1429         if (fabs(prob_sum-1.0) > 0.01)
1430             outError("Internal error: Wrong ", __func__);
1431         delete [] sumL;
1432 
1433         if (params.do_au_test) {
1434             cout << "Performing approximately unbiased (AU) test..." << endl;
1435             performAUTest(params, tree, pattern_lhs, info);
1436         }
1437 
1438         delete [] tree_ranks;
1439         delete [] tree_probs;
1440 
1441     }
1442     if (max_lh)
1443         delete [] max_lh;
1444     if (orig_tree_lh)
1445         delete [] orig_tree_lh;
1446     if (pattern_lh)
1447         aligned_free(pattern_lh);
1448     if (pattern_lhs)
1449         aligned_free(pattern_lhs);
1450     if (lhdiff_weights)
1451         delete [] lhdiff_weights;
1452     if (tree_lhs)
1453         delete [] tree_lhs;
1454     //if (saved_tree_lhs)
1455     //    delete [] saved_tree_lhs;
1456     if (boot_samples)
1457         delete [] boot_samples;
1458 
1459     if (params.print_tree_lh) {
1460         scoreout.close();
1461     }
1462 
1463     treeout.close();
1464     in.close();
1465 
1466     cout << "Time for evaluating all trees: " << getRealTime() - time_start << " sec." << endl;
1467 
1468 }
1469 
1470 
evaluateTrees(string treeset_file,Params & params,IQTree * tree)1471 void evaluateTrees(string treeset_file, Params &params, IQTree *tree) {
1472     vector<TreeInfo> info;
1473     IntVector distinct_ids;
1474     evaluateTrees(treeset_file, params, tree, info, distinct_ids);
1475 }
1476 
1477 
1478 
1479