1 /*
2 * upperbounds.cpp
3 *
4 * Created on: Aug 13, 2014
5 * Author: olga
6 */
7 #include "upperbounds.h"
8 #include "phylonode.h"
9 #include <string.h>
10 #include "timeutil.h"
11
UpperBounds(Params * params,Alignment * alignment,IQTree * tree)12 void UpperBounds(Params *params, Alignment* alignment, IQTree* tree){
13
14 // Output details --------------------------------------------------
15 // UpperBounds File
16 string out_file = params->out_prefix;
17 //out_file += ".ub";
18 out_file = "results.trueSplits.ub";
19 ofstream out;
20 out.exceptions(ios::failbit | ios::badbit);
21 out.open((char*)out_file.c_str(),std::ofstream::out | std::ofstream::app);
22
23 // Details on Split: A|B
24 string out_file_split = params->out_prefix;
25 //out_file_split += ".split.ub";
26 out_file_split = "results.trueSplits.ub.splits";
27 ofstream out_split;
28 out_split.exceptions(ios::failbit | ios::badbit);
29 out_split.open((char*)out_file_split.c_str(),std::ofstream::out | std::ofstream::app);
30
31 // Within Family Info: A|B
32 string out_file_within = params->out_prefix;
33 //out_file_within += ".within.ub";
34 out_file_within = "results.within.ub";
35 ofstream out_within;
36 out_within.exceptions(ios::failbit | ios::badbit);
37 out_within.open((char*)out_file_within.c_str(),std::ofstream::out | std::ofstream::app);
38
39 // Between Families Info: A|B
40 string out_file_between = params->out_prefix;
41 //out_file_between += ".between.ub";
42 out_file_between = "results.between.ub";
43 ofstream out_between;
44 out_between.exceptions(ios::failbit | ios::badbit);
45 out_between.open((char*)out_file_between.c_str(),std::ofstream::out | std::ofstream::app);
46
47 /* ------------------------------------------------------------------------------------------------------
48 * All output files:
49 * out -> "results.trueSplits.ub" -> upper bounds for all splits from an input tree
50 * out_split -> "results.trueSplits.ub.splits" -> list of all splits from an input tree
51 * out_within -> "results.within.ub" -> comparison of upper bounds within Split Families
52 * out_between -> "results.between.ub" -> comparison of upper bounds between Split Families
53 * ------------------------------------------------------------------------------------------------------
54 * FORMAT:
55 * general info (first columns of every file)...........................................................................
56 * leafNum getNSite() min(taxaA,taxaB) brLen
57 *
58 * out .................................................................................................................
59 * L(A|B) L(A)L(B) cN*L(A)L(B) L(A|B)/L(A)L(B) L(A|B)/cN*L(A)L(B) coef
60 *
61 *[4],[5] - the difference between likelihood and UB normalized by likelihood value
62 * if >1, the inequality is true.
63 * if <1, false.
64 *
65 * out_within ..........................................................................................................
66 * UB_true [1..N] UB_random_AB/UB_true (how smaller is the bound for random tree)
67 *
68 * out_between..........................................................................................................
69 * UB_true [1..N] UB_random_CD/UB_true (how smaller is the bound for random tree)
70 *
71 * ------------------------------------------------------------------------------------------------------ */
72
73 int i=0;//, h=0;
74
75 // Printing info about the TreeLogL changes during the tree search
76 /* cout<<"mlInitial = "<<tree->mlInitial<<endl;
77 cout<<"mlFirstOpt = "<<tree->mlFirstOpt<<endl;
78 cout<<"mlBestTree = "<<tree->getBestScore()<<endl;
79 cout<<"mlUnConstr = "<<alignment->computeUnconstrainedLogL()<<endl;*/
80
81 //double mlQuestionary = tree->mlInitial; //or tree->mlFirstOpt for example
82
83 /* ------------------------------------------------------------------------------------------------------
84 * Main PART
85 * ------------------------------------------------------------------------------------------------------ */
86 cout<<"Starting Upper Bounds analysis.."<<endl;
87
88 NodeVector branch1, branch2;
89 tree->getBranches(branch1, branch2);
90 int allSplits = 0;
91 // int R=10; // R is the number of random trees we will generate
92
93 // A loop over all A|B present on tree T
94 for(i = 0; i != branch1.size(); i++){
95 vector<int> taxaA, taxaB;
96 vector<string> taxaAname, taxaBname;
97 tree->getTaxaID(taxaA,branch1[i],branch2[i]);
98 tree->getTaxaID(taxaB,branch2[i],branch1[i]);
99
100 /* ------------------------------------------------------------------------------------------------------------
101 * TEST 1: This is the part for tests on [ai/(ai+bi)] and [bi/(ai+bi)] fractions
102 */
103 int test1 = 1;
104 if(test1 == 1){
105 if(taxaA.size() > 3 && taxaB.size() > 3){ // IQTree does not compute lh of tree with less than 4 taxa.
106 allSplits++;
107 sumFraction(((PhyloNode*) branch1[i]), ((PhyloNode*) branch2[i]), tree);
108 }
109 }
110
111 /* ------------------------------------------------------------------------------------------------------------
112 * TEST 2: This is the part for tests on random trees and evaluation of Upper Bounds for each split on the input tree
113 */
114 int test2 = 0;
115 if(test2 == 1){
116 if(taxaA.size() > 3 && taxaB.size() > 3){ // IQTree does not compute lh of tree with less than 4 taxa.
117 allSplits++;
118
119 // Dealing with subtrees T_A and T_B
120 PhyloTree *treeA, *treeB;
121 treeA = extractSubtreeUB(taxaA,tree,params,1);
122 treeB = extractSubtreeUB(taxaB,tree,params,1);
123
124 // Upper Bound for a given split from the input tree
125 double brLen = branch1[i]->findNeighbor(branch2[i])->length;
126 double coef = tree->aln->getNSite()*(log(1+3*exp(-brLen)) - log(1-exp(-brLen)));
127 double coef2 = tree->aln->getNSite()*log(1+3*exp(-brLen));
128 double UB_true = coef + treeA->getCurScore() + treeB->getCurScore();
129 double UB_true2 = coef2 + treeA->getCurScore() + treeB->getCurScore();
130
131 //cout<<"UB_true = "<<UB_true<<endl;
132 out<<tree->leafNum<<"\t"<<tree->aln->getNSite()<<"\t"<<min(taxaA.size(),taxaB.size())<<"\t"<<brLen<<"\t"
133 <<tree->getCurScore()<<"\t"<<treeA->getCurScore() + treeB->getCurScore()<<"\t"<<UB_true<<"\t"<<UB_true2<<"\t"
134 <<tree->getCurScore()/(treeA->getCurScore() + treeB->getCurScore())<<"\t"
135 <<tree->getCurScore()/UB_true<<"\t"<<tree->getCurScore()/UB_true2<<"\t"<<coef<<"\t"<<coef2<<endl;
136
137 /*
138 // Comparison of Upper Bounds within Split Family ----------------------------------------
139 out_within<<tree->leafNum<<"\t"<<tree->aln->getNSite()<<"\t"<<min(taxaA.size(),taxaB.size())<<"\t"<<brLen<<"\t"<<UB_true<<"\t";
140
141 cout<<"comparison within family...."<<endl;
142 double UB_random_AB;
143 for(j=0; j<30; j++){
144 //cout<<"generating "<<j<<" random_AB tree..."<<endl;
145 UB_random_AB = RandomTreeAB(tree, treeA, treeB, taxaA, taxaB, params,brLen);
146 //cout<<"The upper bound for random tree: "<<UB_random_AB<<endl;
147 out_within<<UB_random_AB/UB_true<<"\t";
148 }
149 out_within<<endl;
150
151
152 // --------------------------------------------------------------------------------------
153
154 // Comparison of Upper Bounds between Split Families ------------------------------------
155 out_between<<tree->leafNum<<"\t"<<tree->aln->getNSite()<<"\t"<<min(taxaA.size(),taxaB.size())<<"\t"<<brLen<<"\t"<<UB_true<<"\t";
156
157 cout<<"comparison between families...."<<endl;
158 // creating split C|D which conflicts with A|B
159 int n=0;
160 n=int(min(taxaA.size(),taxaB.size())/2.);
161 //cout<<"taxaA.size() = "<<taxaA.size()<<", taxaB.size() = "<<taxaB.size()<<", n = "<<n<<endl;
162
163 vector<int> taxaC, taxaD;
164 PhyloTree *treeC, *treeD;
165
166 // ContraSplit1: changing 1/2 of taxa
167 taxaC = taxaA;
168 taxaD = taxaB;
169 for(h=0; h<n; h++){
170 taxaC[h]=taxaB[h];
171 taxaD[h]=taxaA[h];
172 }
173 treeC = extractSubtreeUB(taxaC,tree,params);
174 treeD = extractSubtreeUB(taxaD,tree,params);
175 double UB_random_CD;
176 for(j=0; j<R; j++){
177 //cout<<"generating "<<j<<" random_CD1 tree..."<<endl;
178 UB_random_CD = RandomTreeAB(tree, treeC, treeD, taxaC, taxaD, params);
179 out_between<<UB_random_CD/UB_true<<"\t";
180 }
181
182 // ContraSplit 2: changing 1/4 of taxa
183 taxaC = taxaA;
184 taxaD = taxaB;
185 for(h=0; h<int(n/2.); h++){
186 taxaC[h]=taxaB[h];
187 taxaD[h]=taxaA[h];
188 }
189 treeC = extractSubtreeUB(taxaC,tree,params);
190 treeD = extractSubtreeUB(taxaD,tree,params);
191 for(j=0; j<R; j++){
192 //cout<<"generating "<<j<<" random_CD2 tree..."<<endl;
193 UB_random_CD = RandomTreeAB(tree, treeC, treeD, taxaC, taxaD, params);
194 out_between<<UB_random_CD/UB_true<<"\t";
195 }
196 out_between<<endl;
197 */
198
199 /*
200
201 // Printing Tree and its subtrees. This was just for check.
202 cout<<"Tree T(A|B)"<<endl;
203 tree->printTree(cout,2);
204 cout<<endl<<"Tree T(A)"<<endl;
205 treeA->printTree(cout,2);
206 cout<<endl<<"Tree T(B)"<<endl;
207 treeB->printTree(cout,2);
208 cout<<endl;
209
210 cout<<"Tree T(A|B)"<<endl;
211 printTreeUB(tree);
212 cout<<endl<<endl<<"Tree T(A)"<<endl;
213 printTreeUB(treeA);
214 cout<<endl<<"Tree T(B)"<<endl;
215 printTreeUB(treeB);
216
217 // Printing out the results ----------------------------------------------------------
218 // Split A|B ------------------------------------------
219 out_split<<min(taxaA.size(),taxaB.size())<<"|"<<((double) max(taxaA.size(),taxaB.size()))<<"\t";
220 if(min(taxaA.size(),taxaB.size()) == taxaA.size()){
221 for(int f = 0; f < taxaA.size()-1; f++)
222 out_split<<taxaA[f]<<",";
223 out_split<<taxaA[taxaA.size()-1]<<"\t|\t";
224 for(int f = 0; f < taxaB.size()-1; f++)
225 out_split<<taxaB[f]<<",";
226 out_split<<taxaB[taxaB.size()-1];
227 } else {
228 for(int f = 0; f < taxaB.size()-1; f++)
229 out_split<<taxaB[f]<<",";
230 out_split<<taxaB[taxaB.size()-1]<<"\t|\t";
231 for(int f = 0; f < taxaA.size()-1; f++)
232 out_split<<taxaA[f]<<",";
233 out_split<<taxaA[taxaA.size()-1];
234 }
235 out_split<<endl;
236 // ----------------------------------------------------
237
238 //out<<min(taxaA.size(),taxaB.size())<<"|"<<((double) max(taxaA.size(),taxaB.size()))<<"\t"<<br_len<<"\t"<<tree->curScore<<"\t";
239 out<<params->aln_file<<"\t";
240 if(tree->mlInitial == 0)
241 out<<"0"<<"\t";
242 else
243 out<<"1"<<"\t";
244 out<<tree->leafNum<<"\t"<<tree->aln->getNSite()<<"\t"<<min(taxaA.size(),taxaB.size())<<"\t"<<br_len<<"\t"<<tree->curScore<<"\t";
245
246
247 if(min(taxaA.size(),taxaB.size()) == taxaA.size()){
248 out<<treeA->curScore<<"\t"<<treeB->curScore<<"\t";
249 }
250 else{
251 out<<treeB->curScore<<"\t"<<treeA->curScore<<"\t";
252 }
253
254 out<<tree->aln->size()*(log(1+3*exp(-br_len)) - log(1-exp(-br_len)))<<"\t"<<diff_1<<"\t"<<diff_2;
255
256 if(diff_1>0){
257 out<<"\t"<<"FALSE\t0";
258 BadSplits1++;
259 }else{
260 out<<"\t"<<"TRUE\t1";
261 }
262
263 if(diff_2>0){
264 out<<"\t"<<"FALSE\t0";
265 BadSplits2++;
266 }else{
267 out<<"\t"<<"TRUE\t1";
268 }
269 out<<endl;
270
271 // END: Printing out the results -----------------------------------------------------
272 */
273 }} // END: if taxaA.size() and taxaB.size() >3
274
275 }
276 // END: the loop over all A|B present on tree T
277
278 out_within.close();
279 out_between.close();
280 out.close();
281 out_split.close();
282 }
283
extractSubtreeUB(IntVector & ids,MTree * tree,Params * params,int sw)284 PhyloTree* extractSubtreeUB(IntVector &ids, MTree* tree, Params *params, int sw) {
285 string taxa_set;
286 int i;
287 for(i = 0; i < tree->leafNum; i++)
288 taxa_set.push_back(0);
289 for (i = 0; i < ids.size(); i++)
290 taxa_set[ids[i]]=1;
291
292 PhyloTree *treeCopy = new PhyloTree(); // this will be a new subtree
293 Alignment *alignment = new Alignment();
294 alignment->extractSubAlignment(((PhyloTree*)tree)->aln,ids,0);
295
296 treeCopy->copyTree(tree, taxa_set);
297 treeCopy->setAlignment(alignment);
298 if(sw == 1){
299 treeCopy->setModel(((PhyloTree*)tree)->getModel());
300 treeCopy->setRate(((PhyloTree*)tree)->getRate());
301 treeCopy->setModelFactory(((PhyloTree*)tree)->getModelFactory());
302 treeCopy->initializeAllPartialLh();
303 treeCopy->setCurScore(treeCopy->computeLikelihood());
304 }
305
306 return treeCopy;
307 }
308
printTreeUB(MTree * tree)309 void printTreeUB(MTree *tree){
310 int i=0, j=0;
311
312 NodeVector nodeLeaves;
313 tree->getTaxa(nodeLeaves);
314 cout<<"Taxa nodes:"<<endl;
315 for(i=0; i<nodeLeaves.size(); i++){
316 cout<<nodeLeaves[i]->name<<":"<<nodeLeaves[i]->id<<"(";
317 for(j=0; j<nodeLeaves[i]->neighbors.size(); j++)
318 cout<<nodeLeaves[i]->neighbors[j]->node->name<<":"<<nodeLeaves[i]->neighbors[j]->node->id<<"["<<nodeLeaves[i]->neighbors[j]->length<<"]"<<",";
319 cout<<")"<<endl;
320 }
321
322 NodeVector nodeInternal;
323 tree->getInternalNodes(nodeInternal);
324 cout<<"Internal nodes:"<<endl;
325 if(nodeInternal.size() == 0)
326 cout<<"no internal nodes"<<endl;
327 else{
328 for(i=0; i<nodeInternal.size(); i++){
329 cout<<nodeInternal[i]->name<<":"<<nodeInternal[i]->id<<"(";
330 for(j=0; j<nodeInternal[i]->neighbors.size(); j++)
331 cout<<nodeInternal[i]->neighbors[j]->node->name<<":"<<nodeInternal[i]->neighbors[j]->node->id<<"["<<nodeInternal[i]->neighbors[j]->length<<"]"<<",";
332 cout<<")"<<endl;
333 }
334 }
335 }
336
generateRandomYH_UB(Params & params,PhyloTree * tree)337 MTree* generateRandomYH_UB(Params ¶ms, PhyloTree *tree){
338 MExtTree* treeR = new MExtTree();
339 bool binary = TRUE;
340
341 int size = tree->leafNum;
342 if (size < 3)
343 outError(ERR_FEW_TAXA);
344
345 treeR->root = treeR->newNode();
346 int i;
347 NodeVector myleaves;
348 NodeVector innodes;
349 Node *node;
350 double len;
351
352 innodes.push_back(treeR->root);
353 // create initial tree with 3 leaves
354 for (i = 0; i < 3; i++) {
355 node = treeR->newNode();
356 len = randomLen(params);
357 treeR->root->addNeighbor(node, len);
358 node->addNeighbor(treeR->root, len);
359 myleaves.push_back(node);
360 }
361
362 // additionally add a leaf
363 for (i = 3; i < size; i++)
364 {
365 int index;
366 if (binary) {
367 index = random_int(i);
368 } else {
369 index = random_int(i + innodes.size());
370 }
371 if (index < i) {
372 node = myleaves[index];
373 innodes.push_back(node);
374 // add the first leaf
375 Node *newleaf = treeR->newNode();
376 len = randomLen(params);
377 node->addNeighbor(newleaf, len);
378 newleaf->addNeighbor(node, len);
379 myleaves[index] = newleaf;
380
381 // add the second leaf
382 newleaf = treeR->newNode();
383 len = randomLen(params);
384 node->addNeighbor(newleaf, len);
385 newleaf->addNeighbor(node, len);
386 myleaves.push_back(newleaf);
387 }
388 else {
389 node = innodes[index-i];
390 // add only 1 new leaf
391 Node *newleaf = treeR->newNode();
392 len = randomLen(params);
393 node->addNeighbor(newleaf, len);
394 newleaf->addNeighbor(node, len);
395 myleaves.push_back(newleaf);
396 }
397 }
398
399 treeR->root = myleaves[0];
400 // indexing the leaves
401 treeR->setLeavesName(myleaves);
402 treeR->leafNum = myleaves.size();
403 treeR->nodeNum = treeR->leafNum;
404 treeR->initializeTree();
405
406 NodeVector taxa;
407 treeR->getTaxa(taxa);
408 ASSERT(taxa.size() == size);
409 for (NodeVector::iterator it = taxa.begin(); it != taxa.end(); it++)
410 (*it)->name = tree->aln->getSeqName((*it)->id);
411
412 return (MTree*)treeR;
413 }
414
RandomTreeAB(PhyloTree * treeORGN,PhyloTree * treeAorgn,PhyloTree * treeBorgn,IntVector & taxaA,IntVector & taxaB,Params * params,double brLen)415 double RandomTreeAB(PhyloTree* treeORGN, PhyloTree* treeAorgn, PhyloTree* treeBorgn, IntVector &taxaA, IntVector &taxaB, Params* params, double brLen){
416 PhyloTree *tree = new PhyloTree();
417 MTree *treeA = new MTree();
418 MTree *treeB = new MTree();
419
420 treeA = generateRandomYH_UB(*params,treeAorgn);
421 treeB = generateRandomYH_UB(*params,treeBorgn);
422
423 /*
424 // PrintTree ---------------
425 cout<<"TreeA.root:"<<treeA->root->name<<treeA->root->id<<endl;
426 cout<<"TreeB.root:"<<treeB->root->name<<treeB->root->id<<endl;
427 cout<<"TreeA:"<<endl;
428 treeA->printTree(cout);
429 cout<<endl<<"TreeB:"<<endl;
430 treeB->printTree(cout);
431 cout<<endl;
432 // -------------------------
433 */
434
435 extendingTree(treeA,params);
436 extendingTree(treeB,params);
437
438 /*
439 // PrintTree ---------------
440 cout<<"TreeA.root:"<<treeA->root->name<<treeA->root->id<<endl;
441 cout<<"TreeB.root:"<<treeB->root->name<<treeB->root->id<<endl;
442
443 cout<<"extended TreeA:"<<endl;
444 treeA->printTree(cout);
445 cout<<endl<<"extended TreeB:"<<endl;
446 treeB->printTree(cout);
447 cout<<endl;
448 // -------------------------
449 */
450
451 treeA->root->name = "NewNodeA";
452 treeB->root->name = "NewNodeB";
453 treeA->root->addNeighbor(treeB->root,0.0,tree->branchNum);
454 treeB->root->addNeighbor(treeA->root,0.0,tree->branchNum);
455
456 tree->copyTree(treeA);
457 /* cout<<"Leaves number = "<<tree->leafNum<<endl;
458 cout<<"Nodes number = "<<tree->nodeNum<<endl;
459 cout<<"Branch number:"<<tree->branchNum<<endl;
460 */
461 //tree->printTree(cout);
462 //cout<<endl;
463
464 NodeVector brID;
465 //brID= getBranchABid(brLen, tree);
466 brID.push_back(tree->findNodeName(treeA->root->name));
467 brID.push_back(tree->findNodeName(treeB->root->name));
468
469 if(brLen == 0){
470 brLen = randomLen(*params);
471 }
472 //tree->findNodeName(treeA->root->name)->findNeighbor(treeB->root)->length = brLen;
473 //tree->findNodeName(treeB->root->name)->findNeighbor(treeA->root)->length = brLen;
474
475
476 tree->findNodeID(brID[0]->id)->findNeighbor(brID[1])->length = brLen;
477 tree->findNodeID(brID[1]->id)->findNeighbor(brID[0])->length = brLen;
478
479
480 tree->setAlignment(treeORGN->aln);
481 tree->setModel(((PhyloTree*)treeORGN)->getModel());
482 tree->setRate(((PhyloTree*)treeORGN)->getRate());
483 tree->setModelFactory(((PhyloTree*)treeORGN)->getModelFactory());
484 tree->initializeAllPartialLh();
485
486 tree->setCurScore(tree->computeLikelihood());
487 //cout<<"LogLh score before optimization: "<<tree->curScore<<endl;
488 tree->params = params;
489 //tree->curScore = tree->optimizeAllBranches(50);
490 //cout<<"LogLh score after optimization: "<<tree->curScore<<endl;
491
492 //double len = tree->findNodeName(treeA->root->name)->findNeighbor(treeB->root)->length;
493 double len = tree->findNodeID(brID[0]->id)->findNeighbor(brID[1])->length;
494 //cout<<"The length of corresponding branch after optimization: "<<len<<endl;
495 //cout<<"before it was equal to "<<brLen<<endl;
496
497 string out_file = "results.branches.ub";
498 ofstream out;
499 out.exceptions(ios::failbit | ios::badbit);
500 out.open((char*)out_file.c_str(),std::ofstream::out | std::ofstream::app);
501
502 //len = 1;
503
504 double coef = tree->aln->getNSite()*(log(1+3*exp(-len)) - log(1-exp(-len)));
505 double U = coef + UpperBoundAB(taxaA, taxaB, tree, params);
506
507 // leafNum alnLen brLen (before opt) brLen (after opt) coef UB
508 //out<<treeORGN->leafNum<<"\t"<<treeORGN->aln->getNSite()<<"\t"<<brLen<<"\t"<<len<<"\t"<<coef<<"\t"<<U<<endl;
509
510 out.close();
511 return U;
512 }
513
UpperBoundAB(IntVector & taxaA,IntVector & taxaB,PhyloTree * tree,Params * params)514 double UpperBoundAB(IntVector &taxaA, IntVector &taxaB, PhyloTree* tree, Params *params){
515 double U = 0.0;
516
517 PhyloTree *treeA, *treeB;
518 treeA = extractSubtreeUB(taxaA,tree,params,1);
519 treeB = extractSubtreeUB(taxaB,tree,params,1);
520
521 U = treeA->getCurScore() + treeB->getCurScore();
522
523 return U;
524 }
525
getBranchABid(double brLen,PhyloTree * tree)526 NodeVector getBranchABid(double brLen, PhyloTree* tree){
527 NodeVector branch1, branch2;
528 NodeVector branch;
529 tree->getBranches(branch1, branch2);
530 for(int i = 0; i != branch1.size(); i++){
531 if(branch1[i]->findNeighbor(branch2[i])->length == 0.0){
532 branch.push_back(branch1[i]);
533 branch.push_back(branch2[i]);
534 return branch;
535 }
536 }
537 outError("UpperBounds: did not find matching branch:(");
538 return branch;
539 }
540
extendingTree(MTree * tree,Params * params)541 void extendingTree(MTree *tree, Params* params){
542
543 // Choose random internal node
544 int maxR = tree->nodeNum-1;
545 int randomNodeID = rand() % maxR;
546 //cout<<"randomNodeID = "<<randomNodeID<<endl;
547 if(randomNodeID<tree->leafNum){
548 if(randomNodeID+tree->leafNum > tree->nodeNum-1){
549 randomNodeID += tree->nodeNum - tree->leafNum;
550 //cout<<"adding nodeNum-leafNum"<<endl;
551 }
552 else{
553 randomNodeID+=tree->leafNum;
554 //cout<<"adding leafNum"<<endl;
555 }
556 }
557
558 //cout<<"leafNum = "<<tree->leafNum-1<<" < random = "<<randomNodeID<<" < nodeNum = "<<tree->nodeNum-1<<endl;
559
560 ASSERT(randomNodeID < tree->nodeNum && randomNodeID > tree->leafNum-1);
561
562 // Choose random neighbor
563 int randomNeiID = rand() % 2;
564 //cout<<"randomNeiID = "<<randomNeiID<<endl;
565
566
567 Node *randomNode;
568 randomNode = tree->findNodeID(randomNodeID);
569 Node *randomNeiNode = tree->findNodeID(randomNodeID)->neighbors[randomNeiID]->node;
570
571 // Create new node ----------------------------
572 string str;
573 str = "NewNode";
574 const char *ch = str.c_str();
575 Node* newNode1 = tree->newNode(tree->nodeNum,ch);
576 tree->nodeNum++;
577
578 // Add new node as a neighbor to randomNei->node
579 double len = tree->findNodeID(randomNodeID)->neighbors[randomNeiID]->length;
580 int id = tree->findNodeID(randomNodeID)->neighbors[randomNeiID]->id;
581
582 randomNeiNode->findNeighbor(randomNode)->node = newNode1;
583 newNode1->addNeighbor(randomNeiNode,len,id);
584
585
586 //Change randomNei with this new node for randomNode. Create new branch.
587 randomNode->neighbors[randomNeiID]->node = newNode1;
588 randomNode->neighbors[randomNeiID]->id = tree->branchNum;
589 tree->branchNum++;
590 randomNode->neighbors[randomNeiID]->length = randomLen(*params);
591
592 newNode1->addNeighbor(randomNode,randomNode->neighbors[randomNeiID]->length,randomNode->neighbors[randomNeiID]->id);
593
594 tree->root = newNode1;
595
596 }
597
getBestNNIForBranUB(PhyloNode * node1,PhyloNode * node2,PhyloTree * tree)598 NNIMove getBestNNIForBranUB(PhyloNode *node1, PhyloNode *node2, PhyloTree *tree){
599
600 NNIMove nniMoves[2];
601
602 // Initialize node1 and node2 in nniMoves
603 nniMoves[0].node1 = nniMoves[1].node1 = node1;
604 nniMoves[0].node2 = nniMoves[1].node2 = node2;
605
606 // Initialize two NNIs
607 int cnt;
608 double t[4];
609 FOR_NEIGHBOR_IT(node1, node2, node1_it) {
610 cnt = 0;
611 t[cnt]=(*node1_it)->length;
612 FOR_NEIGHBOR_IT(node2, node1, node2_it) {
613 // Initialize the 2 NNI moves
614 nniMoves[cnt].node1Nei_it = node1_it; // for both cnt = 0,1 this is the same neighbor of node1,
615 // which will be swapped with nei1 and nei2 of node2
616 nniMoves[cnt].node2Nei_it = node2_it;
617 t[cnt+2] = (*node2_it)->length;
618 cnt++;
619 }
620 break;
621 }
622
623 NeighborVec::iterator node1Nei2_it;
624
625 FOR_NEIGHBOR_IT(node1, node2, node1_it){
626 if ((*node1_it)->node != (*nniMoves[0].node1Nei_it)->node){
627 t[cnt]=(*node1_it)->length;
628 node1Nei2_it = node1_it;
629 break;
630 }
631 }
632
633 /*
634 * Correspondence:
635 *
636 * Nodes, incident to node1 with corresponding branches:
637 * nniMoves[0].node1Nei_it | t[0]
638 * node1Nei2_it | t[1]
639 *
640 * Nodes, incident to node2 with corresponding branches:
641 * nniMoves[0].node2Nei_it | t[2]
642 * nniMoves[1].node2Nei_it | t[3]
643 *
644 * NNIs:
645 * nniMoves[0] -> swapping (nniMoves[0].node1Nei_it | t[0]) with (nniMoves[0].node2Nei_it | t[2])
646 * corresponding coef: q1
647 *
648 * nniMoves[1] -> swapping (nniMoves[1].node1Nei_it | t[0]) with (nniMoves[1].node2Nei_it | t[3])
649 * corresponding coef: q2
650 */
651
652 double L[4]; // likelihoods of 4 subtrees
653 double score[4];
654 L[0] = L[1] = L[2] = L[3] = 0.0;
655 score[0] = score[1] = score[2] = score[3] = 0.0;
656
657 double UB = 0.0; // in log terms
658 int nsite = tree->aln->getNSite();
659 UB = nsite*logC(node1->findNeighbor(node2)->length,tree); // coefficient c
660
661 //int ncat = tree->site_rate->getNDiscreteRate();
662 int nptn = tree->aln->getNPattern();
663 int nstates = tree->aln->num_states;
664 int i,x;
665 //int cat;
666 IntVector ptnFreq;
667 tree->aln->getPatternFreq(ptnFreq);
668
669 int clear_pl_lh[4]; // if equals to 1, partial likelihoods were computed, don't clear.
670 clear_pl_lh[0] = clear_pl_lh[1] = clear_pl_lh[2] = clear_pl_lh[3] = 1;
671
672 double* T1_partial_lh;
673 if(((PhyloNeighbor*) (*nniMoves[0].node1Nei_it))->get_partial_lh_computed() == 0){
674 tree->computePartialLikelihood((PhyloNeighbor*) (*nniMoves[0].node1Nei_it), node1);
675 clear_pl_lh[0] = 0;
676 }
677 T1_partial_lh = ((PhyloNeighbor*) (*nniMoves[0].node1Nei_it))->get_partial_lh();
678
679 double* T2_partial_lh;
680 if(((PhyloNeighbor*) (*node1Nei2_it))->get_partial_lh_computed() == 0){
681 tree->computePartialLikelihood(((PhyloNeighbor*) (*node1Nei2_it)), node1);
682 clear_pl_lh[1] = 0;
683 }
684 T2_partial_lh = ((PhyloNeighbor*) (*node1Nei2_it))->get_partial_lh();
685
686 double* T3_partial_lh;
687 if(((PhyloNeighbor*) (*nniMoves[0].node2Nei_it))->get_partial_lh_computed() == 0){
688 tree->computePartialLikelihood(((PhyloNeighbor*) (*nniMoves[0].node2Nei_it)), node1);
689 clear_pl_lh[2] = 0;
690 }
691 T3_partial_lh = ((PhyloNeighbor*) (*nniMoves[0].node2Nei_it))->get_partial_lh();
692
693 double* T4_partial_lh;
694 if(((PhyloNeighbor*) (*nniMoves[1].node2Nei_it))->get_partial_lh_computed() == 0){
695 tree->computePartialLikelihood(((PhyloNeighbor*) (*nniMoves[1].node2Nei_it)), node1);
696 clear_pl_lh[3] = 0;
697 }
698 T4_partial_lh = ((PhyloNeighbor*) (*nniMoves[1].node2Nei_it))->get_partial_lh();
699
700 for(i = 0; i<nptn; i++){
701 score[0] = score[1] = score[2] = score[3] = 0.0;
702 // Sum over Gamma categories and over states
703 //for(cat = 0; cat < ncat; cat++){
704 for(x = 0; x < nstates; x++){
705 // First subtree --------------------------
706 score[0] += tree->getModel()->state_freq[x]*T1_partial_lh[i*nstates+x];
707 // Second subtree --------------------------
708 score[1] += tree->getModel()->state_freq[x]*T2_partial_lh[i*nstates+x];
709 // Third subtree --------------------------
710 score[2] += tree->getModel()->state_freq[x]*T3_partial_lh[i*nstates+x];
711 // Fourth subtree --------------------------
712 score[3] += tree->getModel()->state_freq[x]*T4_partial_lh[i*nstates+x];
713 }
714 //}
715 L[0] += log(score[0])*ptnFreq[i];
716 L[1] += log(score[1])*ptnFreq[i];
717 L[2] += log(score[2])*ptnFreq[i];
718 L[3] += log(score[3])*ptnFreq[i];
719
720 ASSERT(isnormal(L[0] + L[1] + L[2] + L[3]));
721
722 }
723
724 /*
725 if(clear_pl_lh[0] == 0){
726 ((PhyloNeighbor*) (*nniMoves[0].node1Nei_it))->clearPartialLh();
727 }
728 if(clear_pl_lh[1] == 0){
729 ((PhyloNeighbor*) (*node1Nei2_it))->clearPartialLh();
730 }
731 if(clear_pl_lh[2] == 0){
732 ((PhyloNeighbor*) (*nniMoves[0].node2Nei_it))->clearPartialLh();
733 }
734 if(clear_pl_lh[3] == 0){
735 ((PhyloNeighbor*) (*nniMoves[1].node2Nei_it))->clearPartialLh();
736 }*/
737 //cout<<"Clear_pl_lh:"<<clear_pl_lh[0]<<" "<<clear_pl_lh[1]<<" "<<clear_pl_lh[2]<<" "<<clear_pl_lh[3]<<endl;
738
739 //double logNcat = log(((double)ncat));
740 L[0] = L[0] + ((PhyloNeighbor*) (*nniMoves[0].node1Nei_it))->get_lh_scale_factor();
741 L[1] = L[1] + ((PhyloNeighbor*) (*node1Nei2_it))->get_lh_scale_factor();
742 L[2] = L[2] + ((PhyloNeighbor*) (*nniMoves[0].node2Nei_it))->get_lh_scale_factor();
743 L[3] = L[3] + ((PhyloNeighbor*) (*nniMoves[1].node2Nei_it))->get_lh_scale_factor();
744
745 /* // Print some info:
746 cout<<"The log likelihood of the parent tree T:"<<tree->computeLikelihood()<<endl;
747 cout<<"The log likelihoods of the four subtrees:"<<endl;
748 cout<<"Node"<<(*nniMoves[0].node1Nei_it)->node->id<<": L[0] = "<<L[0]<<endl;
749 cout<<"Node"<<(*node1Nei2_it)->node->id<<": L[1] = "<<L[1]<<endl;
750 cout<<"Node"<<(*nniMoves[0].node2Nei_it)->node->id<<": L[2] = "<<L[2]<<endl;
751 cout<<"Node"<<(*nniMoves[1].node2Nei_it)->node->id<<": L[3] = "<<L[3]<<endl;*/
752
753 UB += L[0] + L[1] + L[2] + L[3];
754
755 double q1 = logC(t[0]+t[3],tree) + logC(t[1]+t[2],tree);
756 double q2 = logC(t[0]+t[2],tree) + logC(t[1]+t[3],tree);
757 //cout<<"Coefficients q1 and q2:"<<endl<<q1<<endl<<q2<<endl;
758
759 double UBq1 = UB + nsite*q1;
760 double UBq2 = UB + nsite*q2;
761
762 string out_file_UB = tree->params->out_prefix;
763 out_file_UB += ".UB.NNI.upperBounds";
764 ofstream out_UB;
765 out_UB.exceptions(ios::failbit | ios::badbit);
766 out_UB.open((char*)out_file_UB.c_str(),std::ofstream::out | std::ofstream::app);
767
768 out_UB << tree->getCurScore() << "\t" << UBq1 << "\t" << UBq2 << "\t" << tree->getCurScore() - UBq1 << "\t" << tree->getCurScore() - UBq2 << endl;
769
770 out_UB.close();
771
772 if(UBq1 < tree->getCurScore()){
773 tree->skippedNNIub += 1;
774 /* tree->meanUB += UBq1;
775 if(UBq1 < tree->minUB){
776 tree->minUB = UBq1;
777 } else if(UBq1 > tree->maxUB){
778 tree->maxUB = UBq1;
779 }*/
780 //cout<<"----------------- UBq1 < L !!!"<<endl;
781 }
782 if(UBq2 < tree->getCurScore()){
783 tree->skippedNNIub += 1;
784 /*tree->meanUB += UBq2;
785 if(UBq2 < tree->minUB){
786 tree->minUB = UBq2;
787 } else if(UBq2 > tree->maxUB){
788 tree->maxUB = UBq2;
789 }*/
790 //cout<<"----------------- UBq2 < L !!!"<<endl;
791 }
792
793 // Decide which NNI has a larger UB (we base our decision on q coefficients)
794 if(q1 > q2){
795 // NNI 1:
796 //nniMoves[0].newLen[0] = NULL;
797 nniMoves[0].newloglh = UBq1;
798 //cout<<"q1 and NNI1 is chosen with UB "<<UBq1<<endl;
799 return nniMoves[0];
800 } else {
801 // NNI 2:
802 //nniMoves[1].newLen[0] = NULL;
803 nniMoves[1].newloglh = UBq2;
804 //cout<<"q2 and NNI2 is chosen with UB "<<UBq2<<endl;
805 return nniMoves[1];
806 }
807 }
808
logC(double t,PhyloTree * tree)809 double logC(double t, PhyloTree* tree){
810 //double c = log((1+3*exp(-t)))-log(1-exp(-t));
811
812 int i, m = tree->aln->num_states*tree->aln->num_states, n = tree->aln->num_states;
813 double* TransMatrix = new double[m];
814 tree->getModelFactory()->computeTransMatrix(t,TransMatrix);
815 double maxTransProb = 0.0;
816 for(i = 0; i < m; i++)
817 if(TransMatrix[i]>maxTransProb)
818 maxTransProb = TransMatrix[i];
819 //maxTransProb = 0.25*(1+3*exp(-3*t/4));
820 //maxTransProb = 1;
821
822 if(tree->minStateFreq == 0.0){
823 tree->minStateFreq = tree->getModel()->state_freq[0];
824 for(i = 1; i < n; i++){
825 if(tree->minStateFreq > tree->getModel()->state_freq[i])
826 tree->minStateFreq = tree->getModel()->state_freq[i];
827 }
828
829 }
830 //cout<<tree->minStateFreq<<endl;
831 //tree->minStateFreq = 0.25;
832 //assert(isnormal(log(maxTransProb/tree->minStateFreq)));
833 return log(maxTransProb/tree->minStateFreq);
834 }
835
sumFraction(PhyloNode * node1,PhyloNode * node2,PhyloTree * tree)836 void sumFraction(PhyloNode *node1, PhyloNode *node2, PhyloTree *tree){
837 PhyloNeighbor* nei1 = (PhyloNeighbor*) node1->findNeighbor(node2);
838 PhyloNeighbor* nei2 = (PhyloNeighbor*) node2->findNeighbor(node1);
839
840 // int loglh = tree->computeLikelihood();
841
842 double* T1_partial_lh;
843 if(nei1->get_partial_lh_computed() == 0){
844 tree->computePartialLikelihood(nei1, node1);
845 }
846 T1_partial_lh = nei1->get_partial_lh();
847
848 double* T2_partial_lh;
849 if(nei2->get_partial_lh_computed() == 0){
850 tree->computePartialLikelihood(nei2, node2);
851 }
852 T2_partial_lh = nei2->get_partial_lh();
853
854 double score[3];
855 score[0] = score[1] = score[2] = 0.0;
856
857 double plh[3];
858 plh[0]=plh[1]=plh[2]=0.0;
859
860 int nptn = tree->aln->getNPattern();
861 int nstates = tree->aln->num_states;
862 int j,i,x,y;
863
864 double plhx[nstates];
865 double plhy[nstates];
866
867 double *eigen = tree->getModel()->getEigenvectors();
868
869 for(i = 0; i<nptn; i++){
870 score[0] = score[1] = score[2] = 0.0;
871
872 // computing partial likelihoods
873 for(x = 0; x < nstates; x++){
874 plhx[x] = 0.0;
875 plhy[x] = 0.0;
876 for(j = 0; j<nstates; j++){
877 plhx[x]+= T1_partial_lh[i*nstates+j]*eigen[x*nstates+j];
878 plhy[x]+= T2_partial_lh[i*nstates+j]*eigen[x*nstates+j];
879 }
880 }
881
882 for(x = 0; x < nstates; x++){
883 for(y = 0; y < nstates; y++){
884 if(x == y){
885 // Term for a pair of matching nucleotides --------------------------
886 score[0] += plhx[x]*plhy[y];
887 } else {
888 // Term for a pair of non-matching nucleotides ----------------------
889 score[1] += plhx[x]*plhy[y];
890 }
891 // Full sum ---------------------------------------------------------
892 score[2] += plhx[x]*plhy[y];
893 }
894 }
895
896 ASSERT(isnormal(score[0] + score[1] + score[2]));
897
898 cout<<"BranchLEN |"<< nei1->length
899 <<"| FRACTION of ai (sum over matching pairs) |"<<score[0]/score[2]
900 <<"| FRACTION of bi (sum over non-matching pairs) |"<<score[1]/score[2]
901 <<"| likelihood |"<<score[2]
902 <<endl;
903
904 }
905
906
907 }
908