1 /*
2  * upperbounds.cpp
3  *
4  *  Created on: Aug 13, 2014
5  *      Author: olga
6  */
7 #include "upperbounds.h"
8 #include "phylonode.h"
9 #include <string.h>
10 #include "timeutil.h"
11 
UpperBounds(Params * params,Alignment * alignment,IQTree * tree)12 void UpperBounds(Params *params, Alignment* alignment, IQTree* tree){
13 
14 // Output details --------------------------------------------------
15 // UpperBounds File
16 	string out_file = params->out_prefix;
17 	//out_file += ".ub";
18 	out_file = "results.trueSplits.ub";
19 	ofstream out;
20 	out.exceptions(ios::failbit | ios::badbit);
21 	out.open((char*)out_file.c_str(),std::ofstream::out | std::ofstream::app);
22 
23 // Details on Split: A|B
24 	string out_file_split = params->out_prefix;
25 	//out_file_split += ".split.ub";
26 	out_file_split = "results.trueSplits.ub.splits";
27 	ofstream out_split;
28 	out_split.exceptions(ios::failbit | ios::badbit);
29 	out_split.open((char*)out_file_split.c_str(),std::ofstream::out | std::ofstream::app);
30 
31 // Within Family Info: A|B
32 	string out_file_within = params->out_prefix;
33 	//out_file_within += ".within.ub";
34 	out_file_within = "results.within.ub";
35 	ofstream out_within;
36 	out_within.exceptions(ios::failbit | ios::badbit);
37 	out_within.open((char*)out_file_within.c_str(),std::ofstream::out | std::ofstream::app);
38 
39 // Between Families Info: A|B
40 	string out_file_between = params->out_prefix;
41 	//out_file_between += ".between.ub";
42 	out_file_between = "results.between.ub";
43 	ofstream out_between;
44 	out_between.exceptions(ios::failbit | ios::badbit);
45 	out_between.open((char*)out_file_between.c_str(),std::ofstream::out | std::ofstream::app);
46 
47 	/* ------------------------------------------------------------------------------------------------------
48 	 * All output files:
49 	 * 	out 		-> "results.trueSplits.ub"			-> upper bounds for all splits from an input tree
50 	 * 	out_split   -> "results.trueSplits.ub.splits"	-> list of 			all splits from an input tree
51 	 * 	out_within	-> "results.within.ub"				-> comparison of upper bounds within  Split Families
52 	 * 	out_between	-> "results.between.ub"				-> comparison of upper bounds between Split Families
53 	 * ------------------------------------------------------------------------------------------------------
54 	 * FORMAT:
55 	 * general info (first columns of every file)...........................................................................
56 	 * 		leafNum		getNSite()	min(taxaA,taxaB)	brLen
57 	 *
58 	 * out .................................................................................................................
59 	 * 		L(A|B)		L(A)L(B)	cN*L(A)L(B)		L(A|B)/L(A)L(B)		L(A|B)/cN*L(A)L(B)	coef
60 	 *
61 	 *[4],[5] - the difference between likelihood and UB normalized by likelihood value
62 	 * if >1, the inequality is true.
63 	 * if <1, false.
64 	 *
65 	 * out_within ..........................................................................................................
66 	 * 		UB_true		[1..N] UB_random_AB/UB_true (how smaller is the bound for random tree)
67 	 *
68 	 * out_between..........................................................................................................
69 	 * 		UB_true		[1..N] UB_random_CD/UB_true (how smaller is the bound for random tree)
70 	 *
71 	 * ------------------------------------------------------------------------------------------------------ */
72 
73 	int i=0;//, h=0;
74 
75 	// Printing info about the TreeLogL changes during the tree search
76 /*	cout<<"mlInitial  = "<<tree->mlInitial<<endl;
77 	cout<<"mlFirstOpt = "<<tree->mlFirstOpt<<endl;
78 	cout<<"mlBestTree = "<<tree->getBestScore()<<endl;
79 	cout<<"mlUnConstr = "<<alignment->computeUnconstrainedLogL()<<endl;*/
80 
81 	//double mlQuestionary = tree->mlInitial; //or tree->mlFirstOpt for example
82 
83 	/* ------------------------------------------------------------------------------------------------------
84 	 * Main PART
85 	 * ------------------------------------------------------------------------------------------------------ */
86 	cout<<"Starting Upper Bounds analysis.."<<endl;
87 
88 	NodeVector branch1, branch2;
89 	tree->getBranches(branch1, branch2);
90 	int allSplits = 0;
91 //	int R=10; // R is the number of random trees we will generate
92 
93 // A loop over all A|B present on tree T
94 	for(i = 0; i != branch1.size(); i++){
95 		vector<int> taxaA, taxaB;
96 		vector<string> taxaAname, taxaBname;
97 		tree->getTaxaID(taxaA,branch1[i],branch2[i]);
98 		tree->getTaxaID(taxaB,branch2[i],branch1[i]);
99 
100 		/* ------------------------------------------------------------------------------------------------------------
101 		 * TEST 1: This is the part for tests on [ai/(ai+bi)] and [bi/(ai+bi)] fractions
102 		 */
103 		int test1 = 1;
104 		if(test1 == 1){
105 			if(taxaA.size() > 3 && taxaB.size() > 3){ // IQTree does not compute lh of tree with less than 4 taxa.
106 				allSplits++;
107 				sumFraction(((PhyloNode*) branch1[i]), ((PhyloNode*) branch2[i]), tree);
108 		}
109 		}
110 
111 		/* ------------------------------------------------------------------------------------------------------------
112 		 * TEST 2: This is the part for tests on random trees and evaluation of Upper Bounds for each split on the input tree
113 		 */
114 		int test2 = 0;
115 		if(test2 == 1){
116 		if(taxaA.size() > 3 && taxaB.size() > 3){ // IQTree does not compute lh of tree with less than 4 taxa.
117 			allSplits++;
118 
119 			// Dealing with subtrees T_A and T_B
120 			PhyloTree *treeA, *treeB;
121 			treeA = extractSubtreeUB(taxaA,tree,params,1);
122 			treeB = extractSubtreeUB(taxaB,tree,params,1);
123 
124 			// Upper Bound for a given split from the input tree
125 			double brLen = branch1[i]->findNeighbor(branch2[i])->length;
126 			double coef  = tree->aln->getNSite()*(log(1+3*exp(-brLen)) - log(1-exp(-brLen)));
127 			double coef2  = tree->aln->getNSite()*log(1+3*exp(-brLen));
128 			double UB_true  = coef + treeA->getCurScore() + treeB->getCurScore();
129 			double UB_true2 = coef2 + treeA->getCurScore() + treeB->getCurScore();
130 
131 			//cout<<"UB_true = "<<UB_true<<endl;
132 			out<<tree->leafNum<<"\t"<<tree->aln->getNSite()<<"\t"<<min(taxaA.size(),taxaB.size())<<"\t"<<brLen<<"\t"
133 					<<tree->getCurScore()<<"\t"<<treeA->getCurScore() + treeB->getCurScore()<<"\t"<<UB_true<<"\t"<<UB_true2<<"\t"
134 					<<tree->getCurScore()/(treeA->getCurScore() + treeB->getCurScore())<<"\t"
135 					<<tree->getCurScore()/UB_true<<"\t"<<tree->getCurScore()/UB_true2<<"\t"<<coef<<"\t"<<coef2<<endl;
136 
137 /*
138 			// Comparison of Upper Bounds within Split Family ----------------------------------------
139 			out_within<<tree->leafNum<<"\t"<<tree->aln->getNSite()<<"\t"<<min(taxaA.size(),taxaB.size())<<"\t"<<brLen<<"\t"<<UB_true<<"\t";
140 
141 			cout<<"comparison within family...."<<endl;
142 			double UB_random_AB;
143 			for(j=0; j<30; j++){
144 				//cout<<"generating "<<j<<" random_AB tree..."<<endl;
145 				UB_random_AB = RandomTreeAB(tree, treeA, treeB, taxaA, taxaB, params,brLen);
146 				//cout<<"The upper bound for random tree: "<<UB_random_AB<<endl;
147 				out_within<<UB_random_AB/UB_true<<"\t";
148 			}
149 			out_within<<endl;
150 
151 
152 			// --------------------------------------------------------------------------------------
153 
154 			// Comparison of Upper Bounds between Split Families ------------------------------------
155 			out_between<<tree->leafNum<<"\t"<<tree->aln->getNSite()<<"\t"<<min(taxaA.size(),taxaB.size())<<"\t"<<brLen<<"\t"<<UB_true<<"\t";
156 
157 			cout<<"comparison between families...."<<endl;
158 			// creating split C|D which conflicts with A|B
159 			int n=0;
160 			n=int(min(taxaA.size(),taxaB.size())/2.);
161 			//cout<<"taxaA.size() = "<<taxaA.size()<<", taxaB.size() = "<<taxaB.size()<<", n = "<<n<<endl;
162 
163 			vector<int> taxaC, taxaD;
164 			PhyloTree *treeC, *treeD;
165 
166 			// ContraSplit1: changing 1/2 of taxa
167 			taxaC = taxaA;
168 			taxaD = taxaB;
169 			for(h=0; h<n; h++){
170 				taxaC[h]=taxaB[h];
171 				taxaD[h]=taxaA[h];
172 			}
173 			treeC = extractSubtreeUB(taxaC,tree,params);
174 			treeD = extractSubtreeUB(taxaD,tree,params);
175 			double UB_random_CD;
176 			for(j=0; j<R; j++){
177 				//cout<<"generating "<<j<<" random_CD1 tree..."<<endl;
178 				UB_random_CD = RandomTreeAB(tree, treeC, treeD, taxaC, taxaD, params);
179 				out_between<<UB_random_CD/UB_true<<"\t";
180 			}
181 
182 			// ContraSplit 2: changing 1/4 of taxa
183 			taxaC = taxaA;
184 			taxaD = taxaB;
185 			for(h=0; h<int(n/2.); h++){
186 				taxaC[h]=taxaB[h];
187 				taxaD[h]=taxaA[h];
188 			}
189 			treeC = extractSubtreeUB(taxaC,tree,params);
190 			treeD = extractSubtreeUB(taxaD,tree,params);
191 			for(j=0; j<R; j++){
192 				//cout<<"generating "<<j<<" random_CD2 tree..."<<endl;
193 				UB_random_CD = RandomTreeAB(tree, treeC, treeD, taxaC, taxaD, params);
194 				out_between<<UB_random_CD/UB_true<<"\t";
195 			}
196 			out_between<<endl;
197 */
198 
199 /*
200 
201 	// Printing Tree and its subtrees. This was just for check.
202 			cout<<"Tree T(A|B)"<<endl;
203 			tree->printTree(cout,2);
204 			cout<<endl<<"Tree T(A)"<<endl;
205 			treeA->printTree(cout,2);
206 			cout<<endl<<"Tree T(B)"<<endl;
207 			treeB->printTree(cout,2);
208 			cout<<endl;
209 
210 			cout<<"Tree T(A|B)"<<endl;
211 			printTreeUB(tree);
212 			cout<<endl<<endl<<"Tree T(A)"<<endl;
213 			printTreeUB(treeA);
214 			cout<<endl<<"Tree T(B)"<<endl;
215 			printTreeUB(treeB);
216 
217 	// Printing out the results ----------------------------------------------------------
218 			// Split A|B ------------------------------------------
219 			out_split<<min(taxaA.size(),taxaB.size())<<"|"<<((double) max(taxaA.size(),taxaB.size()))<<"\t";
220 			if(min(taxaA.size(),taxaB.size()) == taxaA.size()){
221 				for(int f = 0; f < taxaA.size()-1; f++)
222 					out_split<<taxaA[f]<<",";
223 				out_split<<taxaA[taxaA.size()-1]<<"\t|\t";
224 				for(int f = 0; f < taxaB.size()-1; f++)
225 					out_split<<taxaB[f]<<",";
226 				out_split<<taxaB[taxaB.size()-1];
227 			} else {
228 				for(int f = 0; f < taxaB.size()-1; f++)
229 					out_split<<taxaB[f]<<",";
230 				out_split<<taxaB[taxaB.size()-1]<<"\t|\t";
231 				for(int f = 0; f < taxaA.size()-1; f++)
232 					out_split<<taxaA[f]<<",";
233 				out_split<<taxaA[taxaA.size()-1];
234 			}
235 			out_split<<endl;
236 			// ----------------------------------------------------
237 
238 			//out<<min(taxaA.size(),taxaB.size())<<"|"<<((double) max(taxaA.size(),taxaB.size()))<<"\t"<<br_len<<"\t"<<tree->curScore<<"\t";
239 			out<<params->aln_file<<"\t";
240 			if(tree->mlInitial == 0)
241 				out<<"0"<<"\t";
242 			else
243 				out<<"1"<<"\t";
244 			out<<tree->leafNum<<"\t"<<tree->aln->getNSite()<<"\t"<<min(taxaA.size(),taxaB.size())<<"\t"<<br_len<<"\t"<<tree->curScore<<"\t";
245 
246 
247 			if(min(taxaA.size(),taxaB.size()) == taxaA.size()){
248 				out<<treeA->curScore<<"\t"<<treeB->curScore<<"\t";
249 			}
250 			else{
251 				out<<treeB->curScore<<"\t"<<treeA->curScore<<"\t";
252 			}
253 
254 			out<<tree->aln->size()*(log(1+3*exp(-br_len)) - log(1-exp(-br_len)))<<"\t"<<diff_1<<"\t"<<diff_2;
255 
256 			if(diff_1>0){
257 				out<<"\t"<<"FALSE\t0";
258 				BadSplits1++;
259 			}else{
260 				out<<"\t"<<"TRUE\t1";
261 			}
262 
263 			if(diff_2>0){
264 				out<<"\t"<<"FALSE\t0";
265 				BadSplits2++;
266 			}else{
267 				out<<"\t"<<"TRUE\t1";
268 			}
269 			out<<endl;
270 
271 	// END: Printing out the results -----------------------------------------------------
272 */
273 		}} // END: if taxaA.size() and taxaB.size() >3
274 
275 	}
276 	// END: the loop over all A|B present on tree T
277 
278 	out_within.close();
279 	out_between.close();
280 	out.close();
281 	out_split.close();
282 }
283 
extractSubtreeUB(IntVector & ids,MTree * tree,Params * params,int sw)284 PhyloTree* extractSubtreeUB(IntVector &ids, MTree* tree, Params *params, int sw) {
285 	string taxa_set;
286 	int i;
287 	for(i = 0; i < tree->leafNum; i++)
288 		taxa_set.push_back(0);
289 	for (i = 0; i < ids.size(); i++)
290 		taxa_set[ids[i]]=1;
291 
292 	PhyloTree *treeCopy = new PhyloTree(); // this will be a new subtree
293 	Alignment *alignment = new Alignment();
294 	alignment->extractSubAlignment(((PhyloTree*)tree)->aln,ids,0);
295 
296 	treeCopy->copyTree(tree, taxa_set);
297 	treeCopy->setAlignment(alignment);
298 	if(sw == 1){
299 		treeCopy->setModel(((PhyloTree*)tree)->getModel());
300 		treeCopy->setRate(((PhyloTree*)tree)->getRate());
301 		treeCopy->setModelFactory(((PhyloTree*)tree)->getModelFactory());
302 		treeCopy->initializeAllPartialLh();
303 		treeCopy->setCurScore(treeCopy->computeLikelihood());
304 	}
305 
306 	return treeCopy;
307 }
308 
printTreeUB(MTree * tree)309 void printTreeUB(MTree *tree){
310 	int i=0, j=0;
311 
312 	NodeVector nodeLeaves;
313 	tree->getTaxa(nodeLeaves);
314 	cout<<"Taxa nodes:"<<endl;
315 	for(i=0; i<nodeLeaves.size(); i++){
316 		cout<<nodeLeaves[i]->name<<":"<<nodeLeaves[i]->id<<"(";
317 		for(j=0; j<nodeLeaves[i]->neighbors.size(); j++)
318 			cout<<nodeLeaves[i]->neighbors[j]->node->name<<":"<<nodeLeaves[i]->neighbors[j]->node->id<<"["<<nodeLeaves[i]->neighbors[j]->length<<"]"<<",";
319 		cout<<")"<<endl;
320 	}
321 
322 	NodeVector nodeInternal;
323 	tree->getInternalNodes(nodeInternal);
324 	cout<<"Internal nodes:"<<endl;
325 	if(nodeInternal.size() == 0)
326 		cout<<"no internal nodes"<<endl;
327 	else{
328 		for(i=0; i<nodeInternal.size(); i++){
329 			cout<<nodeInternal[i]->name<<":"<<nodeInternal[i]->id<<"(";
330 			for(j=0; j<nodeInternal[i]->neighbors.size(); j++)
331 				cout<<nodeInternal[i]->neighbors[j]->node->name<<":"<<nodeInternal[i]->neighbors[j]->node->id<<"["<<nodeInternal[i]->neighbors[j]->length<<"]"<<",";
332 			cout<<")"<<endl;
333 		}
334 	}
335 }
336 
generateRandomYH_UB(Params & params,PhyloTree * tree)337 MTree* generateRandomYH_UB(Params &params, PhyloTree *tree){
338 	MExtTree* treeR = new MExtTree();
339 	bool binary = TRUE;
340 
341 	int size = tree->leafNum;
342 	if (size < 3)
343 		outError(ERR_FEW_TAXA);
344 
345 	treeR->root = treeR->newNode();
346 	int i;
347 	NodeVector myleaves;
348 	NodeVector innodes;
349 	Node *node;
350 	double len;
351 
352 	innodes.push_back(treeR->root);
353 	// create initial tree with 3 leaves
354 	for (i = 0; i < 3; i++) {
355 		node = treeR->newNode();
356 		len = randomLen(params);
357 		treeR->root->addNeighbor(node, len);
358 		node->addNeighbor(treeR->root, len);
359 		myleaves.push_back(node);
360 	}
361 
362 	// additionally add a leaf
363 	for (i = 3; i < size; i++)
364 	{
365 		int index;
366 		if (binary) {
367 			index = random_int(i);
368 		} else {
369  			index = random_int(i + innodes.size());
370 		}
371 		if (index < i) {
372 			node = myleaves[index];
373 			innodes.push_back(node);
374 			// add the first leaf
375 			Node *newleaf = treeR->newNode();
376 			len = randomLen(params);
377 			node->addNeighbor(newleaf, len);
378 			newleaf->addNeighbor(node, len);
379 			myleaves[index] = newleaf;
380 
381 			// add the second leaf
382 			newleaf = treeR->newNode();
383 			len = randomLen(params);
384 			node->addNeighbor(newleaf, len);
385 			newleaf->addNeighbor(node, len);
386 			myleaves.push_back(newleaf);
387 		}
388 		else {
389 			node = innodes[index-i];
390 			// add only 1 new leaf
391 			Node *newleaf = treeR->newNode();
392 			len = randomLen(params);
393 			node->addNeighbor(newleaf, len);
394 			newleaf->addNeighbor(node, len);
395 			myleaves.push_back(newleaf);
396 		}
397 	}
398 
399 	treeR->root = myleaves[0];
400 	// indexing the leaves
401 	treeR->setLeavesName(myleaves);
402 	treeR->leafNum = myleaves.size();
403 	treeR->nodeNum = treeR->leafNum;
404 	treeR->initializeTree();
405 
406 	NodeVector taxa;
407 	treeR->getTaxa(taxa);
408 	ASSERT(taxa.size() == size);
409 	for (NodeVector::iterator it = taxa.begin(); it != taxa.end(); it++)
410 		(*it)->name = tree->aln->getSeqName((*it)->id);
411 
412 	return (MTree*)treeR;
413 }
414 
RandomTreeAB(PhyloTree * treeORGN,PhyloTree * treeAorgn,PhyloTree * treeBorgn,IntVector & taxaA,IntVector & taxaB,Params * params,double brLen)415 double RandomTreeAB(PhyloTree* treeORGN, PhyloTree* treeAorgn, PhyloTree* treeBorgn, IntVector &taxaA, IntVector &taxaB, Params* params, double brLen){
416 	PhyloTree *tree  = new PhyloTree();
417 	MTree *treeA = new MTree();
418 	MTree *treeB = new MTree();
419 
420 	treeA = generateRandomYH_UB(*params,treeAorgn);
421 	treeB = generateRandomYH_UB(*params,treeBorgn);
422 
423 /*
424 	// PrintTree ---------------
425 	cout<<"TreeA.root:"<<treeA->root->name<<treeA->root->id<<endl;
426 	cout<<"TreeB.root:"<<treeB->root->name<<treeB->root->id<<endl;
427 	cout<<"TreeA:"<<endl;
428 	treeA->printTree(cout);
429 	cout<<endl<<"TreeB:"<<endl;
430 	treeB->printTree(cout);
431 	cout<<endl;
432 	// -------------------------
433 */
434 
435 	extendingTree(treeA,params);
436 	extendingTree(treeB,params);
437 
438 /*
439 	// PrintTree ---------------
440 	cout<<"TreeA.root:"<<treeA->root->name<<treeA->root->id<<endl;
441 	cout<<"TreeB.root:"<<treeB->root->name<<treeB->root->id<<endl;
442 
443 	cout<<"extended TreeA:"<<endl;
444 	treeA->printTree(cout);
445 	cout<<endl<<"extended TreeB:"<<endl;
446 	treeB->printTree(cout);
447 	cout<<endl;
448 	// -------------------------
449 */
450 
451 	treeA->root->name = "NewNodeA";
452 	treeB->root->name = "NewNodeB";
453 	treeA->root->addNeighbor(treeB->root,0.0,tree->branchNum);
454 	treeB->root->addNeighbor(treeA->root,0.0,tree->branchNum);
455 
456 	tree->copyTree(treeA);
457 /*	cout<<"Leaves number = "<<tree->leafNum<<endl;
458 	cout<<"Nodes  number = "<<tree->nodeNum<<endl;
459 	cout<<"Branch number:"<<tree->branchNum<<endl;
460 	*/
461 	//tree->printTree(cout);
462 	//cout<<endl;
463 
464 	NodeVector brID;
465 	//brID= getBranchABid(brLen, tree);
466 	brID.push_back(tree->findNodeName(treeA->root->name));
467 	brID.push_back(tree->findNodeName(treeB->root->name));
468 
469 	if(brLen == 0){
470 		brLen = randomLen(*params);
471 	}
472 	//tree->findNodeName(treeA->root->name)->findNeighbor(treeB->root)->length = brLen;
473 	//tree->findNodeName(treeB->root->name)->findNeighbor(treeA->root)->length = brLen;
474 
475 
476 	tree->findNodeID(brID[0]->id)->findNeighbor(brID[1])->length = brLen;
477 	tree->findNodeID(brID[1]->id)->findNeighbor(brID[0])->length = brLen;
478 
479 
480 	tree->setAlignment(treeORGN->aln);
481 	tree->setModel(((PhyloTree*)treeORGN)->getModel());
482 	tree->setRate(((PhyloTree*)treeORGN)->getRate());
483 	tree->setModelFactory(((PhyloTree*)treeORGN)->getModelFactory());
484 	tree->initializeAllPartialLh();
485 
486 	tree->setCurScore(tree->computeLikelihood());
487 	//cout<<"LogLh score before optimization: "<<tree->curScore<<endl;
488 	tree->params = params;
489 	//tree->curScore = tree->optimizeAllBranches(50);
490 	//cout<<"LogLh score after  optimization: "<<tree->curScore<<endl;
491 
492 	//double len = tree->findNodeName(treeA->root->name)->findNeighbor(treeB->root)->length;
493 	double len = tree->findNodeID(brID[0]->id)->findNeighbor(brID[1])->length;
494 	//cout<<"The length of corresponding branch after optimization: "<<len<<endl;
495 	//cout<<"before it was equal to "<<brLen<<endl;
496 
497 	string out_file = "results.branches.ub";
498 	ofstream out;
499 	out.exceptions(ios::failbit | ios::badbit);
500 	out.open((char*)out_file.c_str(),std::ofstream::out | std::ofstream::app);
501 
502 	//len = 1;
503 
504 	double coef = tree->aln->getNSite()*(log(1+3*exp(-len)) - log(1-exp(-len)));
505 	double U = coef + UpperBoundAB(taxaA, taxaB, tree, params);
506 
507 	// leafNum		alnLen		brLen (before opt)		brLen (after opt)		coef 		UB
508 	//out<<treeORGN->leafNum<<"\t"<<treeORGN->aln->getNSite()<<"\t"<<brLen<<"\t"<<len<<"\t"<<coef<<"\t"<<U<<endl;
509 
510 	out.close();
511 	return U;
512 }
513 
UpperBoundAB(IntVector & taxaA,IntVector & taxaB,PhyloTree * tree,Params * params)514 double UpperBoundAB(IntVector &taxaA, IntVector &taxaB, PhyloTree* tree, Params *params){
515 	double U = 0.0;
516 
517 	PhyloTree *treeA, *treeB;
518 	treeA = extractSubtreeUB(taxaA,tree,params,1);
519 	treeB = extractSubtreeUB(taxaB,tree,params,1);
520 
521 	U = treeA->getCurScore() + treeB->getCurScore();
522 
523 	return U;
524 }
525 
getBranchABid(double brLen,PhyloTree * tree)526 NodeVector getBranchABid(double brLen, PhyloTree* tree){
527 	NodeVector branch1, branch2;
528 	NodeVector branch;
529 	tree->getBranches(branch1, branch2);
530 	for(int i = 0; i != branch1.size(); i++){
531 		if(branch1[i]->findNeighbor(branch2[i])->length == 0.0){
532 			branch.push_back(branch1[i]);
533 			branch.push_back(branch2[i]);
534 			return branch;
535 		}
536 	}
537 	outError("UpperBounds: did not find matching branch:(");
538 	return branch;
539 }
540 
extendingTree(MTree * tree,Params * params)541 void extendingTree(MTree *tree, Params* params){
542 
543 	// Choose random internal node
544 	int maxR = tree->nodeNum-1;
545 	int randomNodeID = rand() % maxR;
546 	//cout<<"randomNodeID = "<<randomNodeID<<endl;
547 	if(randomNodeID<tree->leafNum){
548 		if(randomNodeID+tree->leafNum > tree->nodeNum-1){
549 			randomNodeID += tree->nodeNum - tree->leafNum;
550 			//cout<<"adding nodeNum-leafNum"<<endl;
551 		}
552 		else{
553 			randomNodeID+=tree->leafNum;
554 			//cout<<"adding leafNum"<<endl;
555 		}
556 	}
557 
558 	//cout<<"leafNum = "<<tree->leafNum-1<<" < random = "<<randomNodeID<<" < nodeNum = "<<tree->nodeNum-1<<endl;
559 
560 	ASSERT(randomNodeID < tree->nodeNum && randomNodeID > tree->leafNum-1);
561 
562 	// Choose random neighbor
563 	int randomNeiID = rand() % 2;
564 	//cout<<"randomNeiID = "<<randomNeiID<<endl;
565 
566 
567 	Node *randomNode;
568 	randomNode = tree->findNodeID(randomNodeID);
569 	Node *randomNeiNode = tree->findNodeID(randomNodeID)->neighbors[randomNeiID]->node;
570 
571 	// Create new node ----------------------------
572 	string str;
573 	str = "NewNode";
574 	const char *ch = str.c_str();
575 	Node* newNode1 = tree->newNode(tree->nodeNum,ch);
576 	tree->nodeNum++;
577 
578 	// Add new node as a neighbor to randomNei->node
579 	double len = tree->findNodeID(randomNodeID)->neighbors[randomNeiID]->length;
580 	int    id  = tree->findNodeID(randomNodeID)->neighbors[randomNeiID]->id;
581 
582 	randomNeiNode->findNeighbor(randomNode)->node = newNode1;
583 	newNode1->addNeighbor(randomNeiNode,len,id);
584 
585 
586 	//Change randomNei with this new node for randomNode. Create new branch.
587 	randomNode->neighbors[randomNeiID]->node = newNode1;
588 	randomNode->neighbors[randomNeiID]->id = tree->branchNum;
589 	tree->branchNum++;
590 	randomNode->neighbors[randomNeiID]->length = randomLen(*params);
591 
592 	newNode1->addNeighbor(randomNode,randomNode->neighbors[randomNeiID]->length,randomNode->neighbors[randomNeiID]->id);
593 
594 	tree->root = newNode1;
595 
596 }
597 
getBestNNIForBranUB(PhyloNode * node1,PhyloNode * node2,PhyloTree * tree)598 NNIMove getBestNNIForBranUB(PhyloNode *node1, PhyloNode *node2, PhyloTree *tree){
599 
600 	NNIMove nniMoves[2];
601 
602     // Initialize node1 and node2 in nniMoves
603 	nniMoves[0].node1 = nniMoves[1].node1 = node1;
604 	nniMoves[0].node2 = nniMoves[1].node2 = node2;
605 
606 	// Initialize two NNIs
607 	int cnt;
608 	double t[4];
609     FOR_NEIGHBOR_IT(node1, node2, node1_it) {
610 			cnt = 0;
611 			t[cnt]=(*node1_it)->length;
612 			FOR_NEIGHBOR_IT(node2, node1, node2_it) {
613 				//   Initialize the 2 NNI moves
614 				nniMoves[cnt].node1Nei_it = node1_it; // for both cnt = 0,1 this is the same neighbor of node1,
615 													  // which will be swapped with nei1 and nei2 of node2
616 				nniMoves[cnt].node2Nei_it = node2_it;
617 				t[cnt+2] = (*node2_it)->length;
618 				cnt++;
619 			}
620 			break;
621     }
622 
623     NeighborVec::iterator node1Nei2_it;
624 
625     FOR_NEIGHBOR_IT(node1, node2, node1_it){
626     	if ((*node1_it)->node != (*nniMoves[0].node1Nei_it)->node){
627     		t[cnt]=(*node1_it)->length;
628     		node1Nei2_it = node1_it;
629     		break;
630     	}
631     }
632 
633     /*
634      * Correspondence:
635      *
636      * Nodes, incident to node1 with corresponding branches:
637      * nniMoves[0].node1Nei_it	| t[0]
638      * node1Nei2_it				| t[1]
639      *
640      * Nodes, incident to node2 with corresponding branches:
641      * nniMoves[0].node2Nei_it	| t[2]
642      * nniMoves[1].node2Nei_it	| t[3]
643      *
644      * NNIs:
645      * nniMoves[0] -> swapping (nniMoves[0].node1Nei_it	| t[0]) with (nniMoves[0].node2Nei_it	| t[2])
646      * corresponding coef: q1
647      *
648      * nniMoves[1] -> swapping (nniMoves[1].node1Nei_it	| t[0]) with (nniMoves[1].node2Nei_it	| t[3])
649      * corresponding coef: q2
650      */
651 
652     double L[4]; // likelihoods of 4 subtrees
653     double score[4];
654     L[0] = L[1] = L[2] = L[3] = 0.0;
655     score[0] = score[1] = score[2] = score[3] = 0.0;
656 
657     double UB = 0.0; // in log terms
658     int nsite = tree->aln->getNSite();
659     UB = nsite*logC(node1->findNeighbor(node2)->length,tree); // coefficient c
660 
661     //int ncat = tree->site_rate->getNDiscreteRate();
662     int nptn = tree->aln->getNPattern();
663     int nstates = tree->aln->num_states;
664     int i,x;
665     //int cat;
666     IntVector ptnFreq;
667     tree->aln->getPatternFreq(ptnFreq);
668 
669     int clear_pl_lh[4]; // if equals to 1, partial likelihoods were computed, don't clear.
670     clear_pl_lh[0] = clear_pl_lh[1] = clear_pl_lh[2] = clear_pl_lh[3] = 1;
671 
672     double* T1_partial_lh;
673     if(((PhyloNeighbor*) (*nniMoves[0].node1Nei_it))->get_partial_lh_computed() == 0){
674     	tree->computePartialLikelihood((PhyloNeighbor*) (*nniMoves[0].node1Nei_it), node1);
675     	clear_pl_lh[0] = 0;
676     }
677     T1_partial_lh = ((PhyloNeighbor*) (*nniMoves[0].node1Nei_it))->get_partial_lh();
678 
679     double* T2_partial_lh;
680     if(((PhyloNeighbor*) (*node1Nei2_it))->get_partial_lh_computed() == 0){
681     	tree->computePartialLikelihood(((PhyloNeighbor*) (*node1Nei2_it)), node1);
682     	clear_pl_lh[1] = 0;
683     }
684     T2_partial_lh = ((PhyloNeighbor*) (*node1Nei2_it))->get_partial_lh();
685 
686     double* T3_partial_lh;
687     if(((PhyloNeighbor*) (*nniMoves[0].node2Nei_it))->get_partial_lh_computed() == 0){
688     	tree->computePartialLikelihood(((PhyloNeighbor*) (*nniMoves[0].node2Nei_it)), node1);
689     	clear_pl_lh[2] = 0;
690     }
691     T3_partial_lh = ((PhyloNeighbor*) (*nniMoves[0].node2Nei_it))->get_partial_lh();
692 
693     double* T4_partial_lh;
694     if(((PhyloNeighbor*) (*nniMoves[1].node2Nei_it))->get_partial_lh_computed() == 0){
695     	tree->computePartialLikelihood(((PhyloNeighbor*) (*nniMoves[1].node2Nei_it)), node1);
696     	clear_pl_lh[3] = 0;
697     }
698     T4_partial_lh = ((PhyloNeighbor*) (*nniMoves[1].node2Nei_it))->get_partial_lh();
699 
700     for(i = 0; i<nptn; i++){
701     	score[0] = score[1] = score[2] = score[3] = 0.0;
702     	// Sum over Gamma categories and over states
703     	//for(cat = 0; cat < ncat; cat++){
704     		for(x = 0; x < nstates; x++){
705     		// First  subtree --------------------------
706     			score[0] += tree->getModel()->state_freq[x]*T1_partial_lh[i*nstates+x];
707     		// Second subtree --------------------------
708     			score[1] += tree->getModel()->state_freq[x]*T2_partial_lh[i*nstates+x];
709     	   	// Third  subtree --------------------------
710     			score[2] += tree->getModel()->state_freq[x]*T3_partial_lh[i*nstates+x];
711     	   	// Fourth subtree --------------------------
712     			score[3] += tree->getModel()->state_freq[x]*T4_partial_lh[i*nstates+x];
713     		}
714    	//}
715     	L[0] += log(score[0])*ptnFreq[i];
716     	L[1] += log(score[1])*ptnFreq[i];
717     	L[2] += log(score[2])*ptnFreq[i];
718     	L[3] += log(score[3])*ptnFreq[i];
719 
720         ASSERT(isnormal(L[0] + L[1] + L[2] + L[3]));
721 
722     }
723 
724 /*
725     if(clear_pl_lh[0] == 0){
726     	((PhyloNeighbor*) (*nniMoves[0].node1Nei_it))->clearPartialLh();
727     }
728     if(clear_pl_lh[1] == 0){
729     	((PhyloNeighbor*) (*node1Nei2_it))->clearPartialLh();
730     }
731     if(clear_pl_lh[2] == 0){
732     	((PhyloNeighbor*) (*nniMoves[0].node2Nei_it))->clearPartialLh();
733     }
734     if(clear_pl_lh[3] == 0){
735     	((PhyloNeighbor*) (*nniMoves[1].node2Nei_it))->clearPartialLh();
736     }*/
737     //cout<<"Clear_pl_lh:"<<clear_pl_lh[0]<<" "<<clear_pl_lh[1]<<" "<<clear_pl_lh[2]<<" "<<clear_pl_lh[3]<<endl;
738 
739     //double logNcat = log(((double)ncat));
740     L[0] = L[0] + ((PhyloNeighbor*) (*nniMoves[0].node1Nei_it))->get_lh_scale_factor();
741     L[1] = L[1] + ((PhyloNeighbor*) (*node1Nei2_it))->get_lh_scale_factor();
742     L[2] = L[2] + ((PhyloNeighbor*) (*nniMoves[0].node2Nei_it))->get_lh_scale_factor();
743     L[3] = L[3] + ((PhyloNeighbor*) (*nniMoves[1].node2Nei_it))->get_lh_scale_factor();
744 
745 /*   // Print some info:
746     cout<<"The log likelihood  of the parent tree T:"<<tree->computeLikelihood()<<endl;
747     cout<<"The log likelihoods of the four subtrees:"<<endl;
748     cout<<"Node"<<(*nniMoves[0].node1Nei_it)->node->id<<": L[0] = "<<L[0]<<endl;
749     cout<<"Node"<<(*node1Nei2_it)->node->id<<": L[1] = "<<L[1]<<endl;
750     cout<<"Node"<<(*nniMoves[0].node2Nei_it)->node->id<<": L[2] = "<<L[2]<<endl;
751     cout<<"Node"<<(*nniMoves[1].node2Nei_it)->node->id<<": L[3] = "<<L[3]<<endl;*/
752 
753     UB += L[0] + L[1] + L[2] + L[3];
754 
755     double q1 = logC(t[0]+t[3],tree) + logC(t[1]+t[2],tree);
756     double q2 = logC(t[0]+t[2],tree) + logC(t[1]+t[3],tree);
757     //cout<<"Coefficients q1 and q2:"<<endl<<q1<<endl<<q2<<endl;
758 
759     double UBq1 = UB + nsite*q1;
760     double UBq2 = UB + nsite*q2;
761 
762 	string out_file_UB = tree->params->out_prefix;
763 	out_file_UB += ".UB.NNI.upperBounds";
764 	ofstream out_UB;
765 	out_UB.exceptions(ios::failbit | ios::badbit);
766 	out_UB.open((char*)out_file_UB.c_str(),std::ofstream::out | std::ofstream::app);
767 
768 	out_UB << tree->getCurScore() << "\t" << UBq1 << "\t" << UBq2 << "\t" << tree->getCurScore() - UBq1 << "\t" << tree->getCurScore() - UBq2 << endl;
769 
770 	out_UB.close();
771 
772     if(UBq1 < tree->getCurScore()){
773     	tree->skippedNNIub += 1;
774   /*  	tree->meanUB += UBq1;
775     	if(UBq1 < tree->minUB){
776     		tree->minUB = UBq1;
777     	} else if(UBq1 > tree->maxUB){
778     		tree->maxUB = UBq1;
779     	}*/
780     	//cout<<"----------------- UBq1 < L !!!"<<endl;
781     }
782     if(UBq2 < tree->getCurScore()){
783     	tree->skippedNNIub += 1;
784     	/*tree->meanUB += UBq2;
785     	if(UBq2 < tree->minUB){
786     		tree->minUB = UBq2;
787     	} else if(UBq2 > tree->maxUB){
788     		tree->maxUB = UBq2;
789     	}*/
790     	//cout<<"----------------- UBq2 < L !!!"<<endl;
791     }
792 
793 	// Decide which NNI has a larger UB (we base our decision on q coefficients)
794 	if(q1 > q2){
795 		// NNI 1:
796 		//nniMoves[0].newLen[0] = NULL;
797 		nniMoves[0].newloglh = UBq1;
798 		//cout<<"q1 and NNI1 is chosen with UB "<<UBq1<<endl;
799 		return nniMoves[0];
800 	} else {
801 		// NNI 2:
802 		//nniMoves[1].newLen[0] = NULL;
803 		nniMoves[1].newloglh = UBq2;
804 		//cout<<"q2 and NNI2 is chosen with UB "<<UBq2<<endl;
805 		return nniMoves[1];
806 	}
807 }
808 
logC(double t,PhyloTree * tree)809 double logC(double t, PhyloTree* tree){
810 	//double c = log((1+3*exp(-t)))-log(1-exp(-t));
811 
812 	int i, m = tree->aln->num_states*tree->aln->num_states, n = tree->aln->num_states;
813 	double* TransMatrix = new double[m];
814 	tree->getModelFactory()->computeTransMatrix(t,TransMatrix);
815 	double maxTransProb = 0.0;
816 	for(i = 0; i < m; i++)
817 		if(TransMatrix[i]>maxTransProb)
818 			maxTransProb = TransMatrix[i];
819 	//maxTransProb = 0.25*(1+3*exp(-3*t/4));
820 	//maxTransProb = 1;
821 
822 	if(tree->minStateFreq == 0.0){
823 		tree->minStateFreq = tree->getModel()->state_freq[0];
824 		for(i = 1; i < n; i++){
825 			if(tree->minStateFreq > tree->getModel()->state_freq[i])
826 				tree->minStateFreq = tree->getModel()->state_freq[i];
827 		}
828 
829 	}
830 	//cout<<tree->minStateFreq<<endl;
831 	//tree->minStateFreq = 0.25;
832 	//assert(isnormal(log(maxTransProb/tree->minStateFreq)));
833 	return log(maxTransProb/tree->minStateFreq);
834 }
835 
sumFraction(PhyloNode * node1,PhyloNode * node2,PhyloTree * tree)836 void sumFraction(PhyloNode *node1, PhyloNode *node2, PhyloTree *tree){
837 	PhyloNeighbor* nei1 = (PhyloNeighbor*) node1->findNeighbor(node2);
838 	PhyloNeighbor* nei2 = (PhyloNeighbor*) node2->findNeighbor(node1);
839 
840 //	int loglh = tree->computeLikelihood();
841 
842     double* T1_partial_lh;
843     if(nei1->get_partial_lh_computed() == 0){
844     	tree->computePartialLikelihood(nei1, node1);
845     }
846     T1_partial_lh = nei1->get_partial_lh();
847 
848     double* T2_partial_lh;
849     if(nei2->get_partial_lh_computed() == 0){
850     	tree->computePartialLikelihood(nei2, node2);
851     }
852     T2_partial_lh = nei2->get_partial_lh();
853 
854     double score[3];
855     score[0] = score[1] = score[2] = 0.0;
856 
857     double plh[3];
858     plh[0]=plh[1]=plh[2]=0.0;
859 
860     int nptn = tree->aln->getNPattern();
861     int nstates = tree->aln->num_states;
862     int j,i,x,y;
863 
864     double plhx[nstates];
865     double plhy[nstates];
866 
867     double *eigen = tree->getModel()->getEigenvectors();
868 
869     for(i = 0; i<nptn; i++){
870     	score[0] = score[1] = score[2] = 0.0;
871 
872     	// computing partial likelihoods
873 		for(x = 0; x < nstates; x++){
874 			plhx[x] = 0.0;
875 			plhy[x] = 0.0;
876 			for(j = 0; j<nstates; j++){
877 				plhx[x]+= T1_partial_lh[i*nstates+j]*eigen[x*nstates+j];
878 				plhy[x]+= T2_partial_lh[i*nstates+j]*eigen[x*nstates+j];
879 			}
880 		}
881 
882 		for(x = 0; x < nstates; x++){
883 			for(y = 0; y < nstates; y++){
884 				if(x == y){
885 				// Term for a pair of matching nucleotides --------------------------
886 					score[0] += plhx[x]*plhy[y];
887 				} else {
888 				// Term for a pair of non-matching nucleotides ----------------------
889 					score[1] += plhx[x]*plhy[y];
890 				}
891 			// Full sum ---------------------------------------------------------
892 			score[2] += plhx[x]*plhy[y];
893 			}
894 		}
895 
896         ASSERT(isnormal(score[0] + score[1] + score[2]));
897 
898 		cout<<"BranchLEN |"<< nei1->length
899 			<<"| FRACTION of ai (sum over matching pairs) |"<<score[0]/score[2]
900 		    <<"| FRACTION of bi (sum over non-matching pairs) |"<<score[1]/score[2]
901 		    <<"| likelihood |"<<score[2]
902 		    <<endl;
903 
904     }
905 
906 
907 }
908