1 /*
2  * phylotreepars.cpp
3  *
4  * Fast implementation of parsimony kernel
5  *
6  *  Created on: May 18, 2015
7  *      Author: minh
8  */
9 
10 #include "phylotree.h"
11 //#include "vectorclass/vectorclass.h"
12 #include "phylosupertree.h"
13 
14 #if defined (__GNUC__) || defined(__clang__)
15 #define vml_popcnt __builtin_popcount
16 #else
17 // taken from vectorclass library
vml_popcnt(uint32_t a)18 static inline uint32_t vml_popcnt (uint32_t a) {
19     // popcnt instruction not available
20     uint32_t b = a - ((a >> 1) & 0x55555555);
21     uint32_t c = (b & 0x33333333) + ((b >> 2) & 0x33333333);
22     uint32_t d = (c + (c >> 4)) & 0x0F0F0F0F;
23     uint32_t e = d * 0x01010101;
24     return   e >> 24;
25 }
26 #endif
27 
28 /***********************************************************/
29 /****** optimized version of parsimony kernel **************/
30 /***********************************************************/
31 
computePartialParsimonyFast(PhyloNeighbor * dad_branch,PhyloNode * dad)32 void PhyloTree::computePartialParsimonyFast(PhyloNeighbor *dad_branch, PhyloNode *dad) {
33     if (dad_branch->partial_lh_computed & 2)
34         return;
35     Node *node = dad_branch->node;
36     int nstates = aln->getMaxNumStates();
37     int site = 0;
38 
39     dad_branch->partial_lh_computed |= 2;
40 
41     vector<Alignment*> *partitions = NULL;
42     if (aln->isSuperAlignment())
43         partitions = &((SuperAlignment*)aln)->partitions;
44     else {
45         partitions = new vector<Alignment*>;
46         partitions->push_back(aln);
47     }
48 
49     if (node->name == ROOT_NAME) {
50         ASSERT(dad);
51         int pars_size = getBitsBlockSize();
52         memset(dad_branch->partial_pars, 255, pars_size*sizeof(UINT));
53         size_t nsites = (aln->num_parsimony_sites+UINT_BITS-1)/UINT_BITS;
54         dad_branch->partial_pars[nstates*nsites] = 0;
55     } else if (node->isLeaf() && dad) {
56         // external node
57         int leafid = node->id;
58         memset(dad_branch->partial_pars, 0, getBitsBlockSize()*sizeof(UINT));
59         int max_sites = ((aln->num_parsimony_sites+UINT_BITS-1)/UINT_BITS)*UINT_BITS;
60         int ambi_aa[] = {2, 3, 5, 6, 9, 10}; // {4+8, 32+64, 512+1024};
61 //        if (aln->ordered_pattern.empty())
62 //            aln->orderPatternByNumChars();
63         ASSERT(!aln->ordered_pattern.empty());
64         int start_pos = 0;
65         for (vector<Alignment*>::iterator alnit = partitions->begin(); alnit != partitions->end(); alnit++) {
66             int end_pos = start_pos + (*alnit)->ordered_pattern.size();
67             switch ((*alnit)->seq_type) {
68             case SEQ_DNA:
69                 for (int patid = start_pos; patid != end_pos; patid++) {
70                     Alignment::iterator pat = aln->ordered_pattern.begin()+ patid;
71                     int state = pat->at(leafid);
72                     int freq = pat->frequency;
73                     if (state < 4) {
74                         for (int j = 0; j < freq; j++, site++) {
75                             dad_branch->partial_pars[(site/UINT_BITS)*nstates+state] |= (1 << (site % UINT_BITS));
76                         }
77                     } else if (state == (*alnit)->STATE_UNKNOWN) {
78                         for (int j = 0; j < freq; j++, site++) {
79                             UINT *p = dad_branch->partial_pars+((site/UINT_BITS)*nstates);
80                             UINT bit1 = (1 << (site%UINT_BITS));
81                             p[0] |= bit1;
82                             p[1] |= bit1;
83                             p[2] |= bit1;
84                             p[3] |= bit1;
85                         }
86                     } else {
87                         state -= 3;
88                         ASSERT(state < 15);
89                         for (int j = 0; j < freq; j++, site++) {
90                             UINT *p = dad_branch->partial_pars+((site/UINT_BITS)*nstates);
91                             UINT bit1 = (1 << (site%UINT_BITS));
92                             for (int i = 0; i < 4; i++)
93                                 if (state & (1<<i))
94                                     p[i] |= bit1;
95                         }
96                     }
97                 }
98                 //assert(site == aln->num_informative_sites);
99                 // add dummy states
100                 //if (site < max_sites)
101                 //    dad_branch->partial_pars[(site/UINT_BITS)*4] |= ~((1<<(site%UINT_BITS)) - 1);
102                 break;
103             case SEQ_PROTEIN:
104                 for (int patid = start_pos; patid != end_pos; patid++) {
105                     Alignment::iterator pat = aln->ordered_pattern.begin()+ patid;
106                     int state = pat->at(leafid);
107                     int freq = pat->frequency;
108                     if (state < 20) {
109                         for (int j = 0; j < freq; j++, site++) {
110                             dad_branch->partial_pars[(site/UINT_BITS)*nstates+state] |= (1 << (site % UINT_BITS));
111                         }
112                     } else if (state == (*alnit)->STATE_UNKNOWN) {
113                         for (int j = 0; j < freq; j++, site++) {
114                             UINT *p = dad_branch->partial_pars+((site/UINT_BITS)*nstates);
115                             UINT bit1 = (1 << (site%UINT_BITS));
116                             for (int i = 0; i < 20; i++)
117                                     p[i] |= bit1;
118                         }
119                     } else {
120                         ASSERT(state < 23);
121                         state = (state-20)*2;
122                         for (int j = 0; j < freq; j++, site++) {
123                             UINT *p = dad_branch->partial_pars+((site/UINT_BITS)*nstates);
124                             UINT bit1 = (1 << (site%UINT_BITS));
125                             p[ambi_aa[state]] |= bit1;
126                             p[ambi_aa[state+1]] |= bit1;
127                         }
128                     }
129                 }
130                 //assert(site == aln->num_informative_sites);
131                 // add dummy states
132                 //if (site < max_sites)
133                 //    dad_branch->partial_pars[(site/UINT_BITS)*20] |= ~((1<<(site%UINT_BITS)) - 1);
134                 break;
135             default:
136                 for (int patid = start_pos; patid != end_pos; patid++) {
137                     Alignment::iterator pat = aln->ordered_pattern.begin()+ patid;
138                     int state = pat->at(leafid);
139                     int freq = pat->frequency;
140                     if (aln->seq_type == SEQ_POMO && state >= (*alnit)->num_states && state < (*alnit)->STATE_UNKNOWN) {
141                         state = (*alnit)->convertPomoState(state);
142                     }
143                      if (state < (*alnit)->num_states) {
144                         for (int j = 0; j < freq; j++, site++) {
145                             dad_branch->partial_pars[(site/UINT_BITS)*nstates+state] |= (1 << (site % UINT_BITS));
146                         }
147                     } else if (state == (*alnit)->STATE_UNKNOWN) {
148                         for (int j = 0; j < freq; j++, site++) {
149                             UINT *p = dad_branch->partial_pars+((site/UINT_BITS)*nstates);
150                             UINT bit1 = (1 << (site%UINT_BITS));
151                             for (int i = 0; i < (*alnit)->num_states; i++)
152                                     p[i] |= bit1;
153                         }
154                     } else {
155                         ASSERT(0);
156                     }
157                 }
158                 break;
159             } // end of switch
160 
161             start_pos = end_pos;
162         } // FOR LOOP
163 
164         ASSERT(site == aln->num_parsimony_sites);
165         // add dummy states
166         if (site < max_sites)
167             dad_branch->partial_pars[(site/UINT_BITS)*nstates] |= ~((1<<(site%UINT_BITS)) - 1);
168     } else {
169         // internal node
170         ASSERT(node->degree() == 3); // it works only for strictly bifurcating tree
171         PhyloNeighbor *left = NULL, *right = NULL; // left & right are two neighbors leading to 2 subtrees
172         FOR_NEIGHBOR_IT(node, dad, it) {
173             PhyloNeighbor* pit = (PhyloNeighbor*) (*it);
174             if ((*it)->node->name != ROOT_NAME && (pit->partial_lh_computed & 2) == 0) {
175                 computePartialParsimonyFast(pit, (PhyloNode*) node);
176             }
177             if (!left) left = pit; else right = pit;
178         }
179 //        UINT score = left->partial_pars[0] + right->partial_pars[0];
180         UINT score = 0;
181         int nsites = aln->num_parsimony_sites;
182         nsites = (nsites+UINT_BITS-1)/UINT_BITS;
183 
184         switch (nstates) {
185         case 4:
186             #ifdef _OPENMP
187             #pragma omp parallel for private (site) reduction(+: score) if(nsites>200)
188             #endif
189 			for (site = 0; site<nsites; site++) {
190 				UINT w;
191                 size_t offset = nstates*site;
192                 UINT *x = left->partial_pars + offset;
193                 UINT *y = right->partial_pars + offset;
194                 UINT *z = dad_branch->partial_pars + offset;
195 				z[0] = x[0] & y[0];
196 				z[1] = x[1] & y[1];
197 				z[2] = x[2] & y[2];
198 				z[3] = x[3] & y[3];
199 				w = z[0] | z[1] | z[2] | z[3];
200 				w = ~w;
201 				score += __builtin_popcount(w);
202 				z[0] |= w & (x[0] | y[0]);
203 				z[1] |= w & (x[1] | y[1]);
204 				z[2] |= w & (x[2] | y[2]);
205 				z[3] |= w & (x[3] | y[3]);
206 			}
207 			break;
208         default:
209             #ifdef _OPENMP
210             #pragma omp parallel for private (site) reduction(+: score) if(nsites > 800/nstates)
211             #endif
212 			for (site = 0; site<nsites; site++) {
213 				int i;
214 				UINT w = 0;
215                 size_t offset = nstates*site;
216                 UINT *x = left->partial_pars + offset;
217                 UINT *y = right->partial_pars + offset;
218                 UINT *z = dad_branch->partial_pars + offset;
219 
220 				for (i = 0; i < nstates; i++) {
221 					z[i] = x[i] & y[i];
222 					w |= z[i];
223 				}
224 				w = ~w;
225 				score += vml_popcnt(w);
226 				for (i = 0; i < nstates; i++) {
227 					z[i] |= w & (x[i] | y[i]);
228 				}
229 			}
230 			break;
231         }
232         dad_branch->partial_pars[nstates*nsites] = score + left->partial_pars[nstates*nsites] + right->partial_pars[nstates*nsites];
233 //        dad_branch->partial_pars[0] = score;
234     }
235 
236     if (!aln->isSuperAlignment())
237         delete partitions;
238 }
239 
240 
computeParsimonyBranchFast(PhyloNeighbor * dad_branch,PhyloNode * dad,int * branch_subst)241 int PhyloTree::computeParsimonyBranchFast(PhyloNeighbor *dad_branch, PhyloNode *dad, int *branch_subst) {
242     PhyloNode *node = (PhyloNode*) dad_branch->node;
243     PhyloNeighbor *node_branch = (PhyloNeighbor*) node->findNeighbor(dad);
244     ASSERT(node_branch);
245     if (!central_partial_pars)
246         initializeAllPartialPars();
247     if ((dad_branch->partial_lh_computed & 2) == 0)
248         computePartialParsimonyFast(dad_branch, dad);
249     if ((node_branch->partial_lh_computed & 2) == 0)
250         computePartialParsimonyFast(node_branch, node);
251     int site;
252     int nsites = (aln->num_parsimony_sites + UINT_BITS-1) / UINT_BITS;
253     int nstates = aln->getMaxNumStates();
254 
255     int scoreid = ((aln->num_parsimony_sites+UINT_BITS-1)/UINT_BITS)*nstates;
256     UINT sum_end_node = (dad_branch->partial_pars[scoreid] + node_branch->partial_pars[scoreid]);
257     UINT score = sum_end_node;
258 
259     UINT lower_bound = best_pars_score;
260     if (branch_subst) lower_bound = INT_MAX;
261     switch (nstates) {
262     case 4:
263         #ifdef _OPENMP
264         #pragma omp parallel for private (site) reduction(+: score) if(nsites>200)
265         #endif
266 		for (site = 0; site < nsites; site++) {
267             size_t offset = 4*site;
268             UINT *x = dad_branch->partial_pars + offset;
269             UINT *y = node_branch->partial_pars + offset;
270 			UINT w = (x[0] & y[0]) | (x[1] & y[1]) | (x[2] & y[2]) | (x[3] & y[3]);
271 			w = ~w;
272 			score += vml_popcnt(w);
273 //            #ifndef _OPENMP
274 //            if (score >= lower_bound)
275 //                break;
276 //            #endif
277 		}
278 		break;
279     default:
280         #ifdef _OPENMP
281         #pragma omp parallel for private (site) reduction(+: score) if(nsites > 800/nstates)
282         #endif
283 		for (site = 0; site < nsites; site++) {
284             size_t offset = nstates*site;
285             UINT *x = dad_branch->partial_pars + offset;
286             UINT *y = node_branch->partial_pars + offset;
287 			int i;
288 			UINT w = x[0] & y[0];
289 			for (i = 1; i < nstates; i++) {
290 				w |= x[i] & y[i];
291 			}
292 			w = ~w;
293 			score += vml_popcnt(w);
294 //            #ifndef _OPENMP
295 //            if (score >= lower_bound)
296 //                break;
297 //            #endif
298 		}
299 		break;
300     }
301     if (branch_subst)
302         *branch_subst = score - sum_end_node;
303 //    score += sum_end_node;
304     return score;
305 }
306 
computeAllPartialPars(PhyloNode * node,PhyloNode * dad)307 void PhyloTree::computeAllPartialPars(PhyloNode *node, PhyloNode *dad) {
308 	if (!node) node = (PhyloNode*)root;
309 	FOR_NEIGHBOR_IT(node, dad, it) {
310 		if ((((PhyloNeighbor*)*it)->partial_lh_computed & 1) == 0)
311 			computePartialParsimony((PhyloNeighbor*)*it, node);
312 		PhyloNeighbor *rev = (PhyloNeighbor*) (*it)->node->findNeighbor(node);
313 		if ((rev->partial_lh_computed & 1) == 0)
314 			computePartialParsimony(rev, (PhyloNode*)(*it)->node);
315 		computeAllPartialPars((PhyloNode*)(*it)->node, node);
316 	}
317 }
318 
JukesCantorCorrection(double dist,double alpha)319 double PhyloTree::JukesCantorCorrection(double dist, double alpha) {
320     double z = (double) aln->num_states / (aln->num_states - 1);
321     double x = 1.0 - (z * dist);
322     if (x > 0) {
323         if (alpha <= 0.0) {
324             dist = -log(x) / z;
325         } else {
326             //if (verbose_mode >= VB_MAX) cout << "alpha: " << alpha << endl;
327             dist = alpha * (pow(x, -1.0/alpha) - 1) / z;
328         }
329     }
330     // Branch lengths under PoMo are #events, which is ~N^2 * #substitutions
331     if (aln->seq_type == SEQ_POMO)
332         dist *= aln->virtual_pop_size * aln->virtual_pop_size;
333     if (dist < Params::getInstance().min_branch_length)
334         dist = Params::getInstance().min_branch_length;
335     return dist;
336 }
337 
setParsimonyBranchLengths()338 int PhyloTree::setParsimonyBranchLengths() {
339 
340     NodeVector nodes1, nodes2;
341     getBranches(nodes1, nodes2);
342     clearAllPartialLH();
343 
344     int sum_score = 0;
345     double persite = 1.0/getAlnNSite();
346     double alpha = (site_rate) ? site_rate->getGammaShape() : 1.0;
347 //    int pars_score;
348     //int i, state;
349 
350     PhyloNeighbor *dad_branch = (PhyloNeighbor*)nodes1[0]->findNeighbor(nodes2[0]);
351     PhyloNeighbor *node_branch = (PhyloNeighbor*)nodes2[0]->findNeighbor(nodes1[0]);
352     PhyloNode *dad =  (PhyloNode*) nodes1[0];
353     PhyloNode *node =  (PhyloNode*) nodes2[0];
354 
355     // determine state of the root
356     int branch_subst = 0;
357     int pars_score = computeParsimonyBranchFast(dad_branch, dad, &branch_subst);
358     int site, real_site;
359     int nsites = (aln->num_parsimony_sites + UINT_BITS-1) / UINT_BITS;
360     int nstates = aln->getMaxNumStates();
361 
362     vector<vector<StateType> > sequences;
363     sequences.resize(nodeNum, vector<StateType>(aln->num_parsimony_sites, aln->STATE_UNKNOWN));
364     vector<bool> done;
365     done.resize(nodeNum, false);
366     done[node->id] = done[dad->id] = true;
367 
368     int subst = 0;
369 
370     for (site = 0, real_site = 0; site < nsites; site++) {
371         size_t offset = nstates*site;
372         UINT *x = dad_branch->partial_pars + offset;
373         UINT *y = node_branch->partial_pars + offset;
374         UINT w = x[0] & y[0];
375         int state;
376         for (state = 1; state < nstates; state++) {
377             w |= x[state] & y[state];
378         }
379         UINT bit = 1;
380         for (int s = 0; s < UINT_BITS && real_site < aln->num_parsimony_sites; s++, bit = bit << 1, real_site++)
381         if (w & bit) {
382             // intersection is non-empty
383             for (state = 0; state < nstates; state++)
384                 if ((x[state] & bit) && (y[state] & bit)) {
385                     // assign the first state in the intersection
386                     sequences[node->id][real_site] = sequences[dad->id][real_site] = state;
387                     break;
388                 }
389         } else {
390             // intersection is empty
391             subst++;
392             for (state = 0; state < nstates; state++)
393                 if (x[state] & bit) {
394                     // assign the first admissible state
395                     sequences[node->id][real_site] = state;
396                     break;
397                 }
398             for (state = 0; state < nstates; state++)
399                 if (y[state] & bit) {
400                     // assign the first admissible state
401                     sequences[dad->id][real_site] = state;
402                     break;
403                 }
404         }
405     }
406 
407     ASSERT(subst == branch_subst);
408     sum_score += subst;
409     fixOneNegativeBranch(correctBranchLengthF81(subst*persite, alpha), dad_branch, dad);
410 
411 
412     // walking down the tree to assign node states
413     for (int id = 1; id < nodes1.size(); id++) {
414         // arrange such that states of dad are known
415         if (done[nodes1[id]->id]) {
416             dad = (PhyloNode*)nodes1[id];
417             node = (PhyloNode*)nodes2[id];
418         } else {
419             ASSERT(done[nodes2[id]->id]);
420             dad = (PhyloNode*)nodes2[id];
421             node = (PhyloNode*)nodes1[id];
422         }
423         done[node->id] = true;
424         // now determine states of node
425         dad_branch = (PhyloNeighbor*)dad->findNeighbor(node);
426         node_branch = (PhyloNeighbor*)node->findNeighbor(dad);
427         subst = 0;
428         for (site = 0, real_site = 0; site < nsites; site++) {
429             size_t offset = nstates*site;
430             UINT *x = dad_branch->partial_pars + offset;
431             //UINT *y = node_branch->partial_pars + offset;
432             int state;
433             UINT bit = 1;
434             for (int s = 0; s < UINT_BITS && real_site < aln->num_parsimony_sites; s++, bit = bit << 1, real_site++) {
435                 StateType dad_state = sequences[dad->id][real_site];
436                 ASSERT(dad_state < nstates);
437                 //ASSERT(y[dad_state] & bit);
438                 if (x[dad_state] & bit) {
439                     // same state as dad
440                     sequences[node->id][real_site] = dad_state;
441                 } else {
442                     // different state from dad
443                     subst++;
444                     for (state = 0; state < nstates; state++)
445                         if (x[state] & bit) {
446                             // assign the first admissible state
447                             sequences[node->id][real_site] = state;
448                             break;
449                         }
450                 }
451             }
452         }
453         fixOneNegativeBranch(correctBranchLengthF81(subst*persite, alpha), dad_branch, dad);
454 //        computeParsimonyBranchFast(dad_branch, dad, &branch_subst);
455 //        ASSERT(subst <= branch_subst);
456         sum_score += subst;
457     }
458 
459     ASSERT(pars_score == sum_score);
460 
461     return nodes1.size();
462 }
463 
464 
465 /****************************************************************************
466  Sankoff parsimony function
467  ****************************************************************************/
468 
469 
initCostMatrix(CostMatrixType cost_type)470 void PhyloTree::initCostMatrix(CostMatrixType cost_type) {
471     if(cost_matrix){
472         aligned_free(cost_matrix);
473         cost_matrix = NULL;
474     }
475     ASSERT(aln);
476     int cost_nstates = aln->num_states;
477     // allocate memory for cost_matrix
478     cost_matrix = aligned_alloc<unsigned int>(cost_nstates * cost_nstates);
479 
480     switch (cost_type) {
481         case CM_LINEAR:
482             for(int i = 0; i < cost_nstates; i++){
483                 for(int j = 0; j < cost_nstates; j++)
484                     cost_matrix[i * cost_nstates + j] = abs(i-j);
485             }
486             break;
487         case CM_UNIFORM:
488             for(int i = 0; i < cost_nstates; i++){
489                 for(int j = 0; j < cost_nstates; j++)
490                     cost_matrix[i * cost_nstates + j] = ((i==j) ? 0 : 1);
491             }
492             break;
493     }
494     clearAllPartialLH();
495 }
496 
loadCostMatrixFile(char * file_name)497 void PhyloTree::loadCostMatrixFile(char * file_name){
498     if(cost_matrix){
499         aligned_free(cost_matrix);
500         cost_matrix = NULL;
501     }
502     //    if(strcmp(file_name, "fitch") == 0)
503     ////    if(file_name == NULL)
504     //        cost_matrix = new SankoffCostMatrix(aln->num_states);
505     //    else
506     //        cost_matrix = new SankoffCostMatrix(file_name);
507 
508     int cost_nstates;
509     if(strcmp(file_name, "fitch") == 0 || strcmp(file_name, "e") == 0) { // uniform cost
510         cost_nstates = aln->num_states;
511         cost_matrix = aligned_alloc<unsigned int>(cost_nstates * cost_nstates);
512         for(int i = 0; i < cost_nstates; i++)
513             for(int j = 0; j < cost_nstates; j++){
514                 if(j == i) cost_matrix[i * cost_nstates + j] = 0;
515                 else cost_matrix[i * cost_nstates + j] = 1;
516             }
517     } else{ // Sankoff cost
518         cout << "Loading cost matrix from " << file_name << "..." << endl;
519         ifstream fin(file_name);
520         if(!fin.is_open()){
521             outError("Reading cost matrix file cannot perform. Please check your input file!");
522         }
523         fin >> cost_nstates;
524         if (cost_nstates != aln->num_states)
525             outError("Cost matrix file does not have the same size as number of alignment states");
526         // allocate memory for cost_matrix
527         cost_matrix = aligned_alloc<unsigned int>(cost_nstates * cost_nstates);
528 
529         // read numbers from file
530         for(int i = 0; i < cost_nstates; i++){
531             for(int j = 0; j < cost_nstates; j++)
532                 fin >> cost_matrix[i * cost_nstates + j];
533         }
534 
535         fin.close();
536 
537     }
538 
539     int i, j, k;
540     bool changed = false;
541 
542     for (k = 0; k < cost_nstates; k++)
543         for (i = 0; i < cost_nstates; i++)
544             for (j = 0; j < cost_nstates; j++)
545                 if (cost_matrix[i*cost_nstates+j] > cost_matrix[i*cost_nstates+k] + cost_matrix[k*cost_nstates+j]) {
546                     changed = true;
547                     cost_matrix[i*cost_nstates+j] = cost_matrix[i*cost_nstates+k] + cost_matrix[k*cost_nstates+j];
548                 }
549 
550     if (changed) {
551         cout << "WARING: Cost matrix does not satisfy triangular inenquality and is automatically fixed to:" << endl;
552         cout << cost_nstates << endl;
553         for (i = 0; i < cost_nstates; i++) {
554             for (j = 0; j < cost_nstates; j++)
555                 cout << "  " << cost_matrix[i*cost_nstates+j];
556             cout << endl;
557         }
558     } else {
559         cout << "Cost matrix satisfies triangular inenquality" << endl;
560     }
561 }
562 
computeTipPartialParsimony()563 void PhyloTree::computeTipPartialParsimony() {
564     if ((tip_partial_lh_computed & 2) != 0)
565         return;
566     tip_partial_lh_computed |= 2;
567 
568     int i, state, nstates = aln->num_states;
569 
570     size_t nptn = aln->ordered_pattern.size();
571     size_t maxptn = get_safe_upper_limit_float(nptn);
572     int ptn;
573     for (ptn = 0; ptn < nptn; ptn++)
574         ptn_freq_pars[ptn] = aln->ordered_pattern[ptn].frequency;
575     for (ptn = nptn; ptn < maxptn; ptn++)
576         ptn_freq_pars[ptn] = 0;
577 
578 
579     ASSERT(tip_partial_pars);
580     // ambiguous characters
581     int ambi_aa[] = {
582         4+8, // B = N or D
583         32+64, // Z = Q or E
584         512+1024 // U = I or L
585     };
586 
587     memset(tip_partial_pars, 0, (aln->STATE_UNKNOWN+1)*nstates*sizeof(UINT));
588 
589     // initialize real states with cost_matrix
590     memcpy(tip_partial_pars, cost_matrix, nstates*nstates*sizeof(UINT));
591 
592     UINT *this_tip_partial_pars;
593 
594     switch (aln->seq_type) {
595         case SEQ_DNA:
596             for (state = 4; state < 18; state++) {
597                 int cstate = state-nstates+1;
598                 this_tip_partial_pars = &tip_partial_pars[state*nstates];
599                 for (i = 0; i < nstates; i++) {
600                     if ((cstate) & (1 << i))
601                         this_tip_partial_pars[i] = 0;
602                     else {
603                         this_tip_partial_pars[i] = UINT_MAX;
604                         for (int j = 0; j < nstates; j++)
605                             if ((cstate) & (1 << j))
606                                 this_tip_partial_pars[i] = min(this_tip_partial_pars[i], cost_matrix[i*nstates+j]);
607                     }
608                 }
609             }
610             break;
611         case SEQ_PROTEIN:
612             for (state = 0; state < sizeof(ambi_aa)/sizeof(int); state++) {
613                 this_tip_partial_pars = &tip_partial_pars[(state+20)*nstates];
614                 for (i = 0; i < nstates; i++) {
615                     if (ambi_aa[state] & (1 << i))
616                         this_tip_partial_pars[i] = 0;
617                     else {
618                         this_tip_partial_pars[i] = UINT_MAX;
619                         for (int j = 0; j < nstates; j++)
620                             if (ambi_aa[state] & (1 << j))
621                                 this_tip_partial_pars[i] = min(this_tip_partial_pars[i], cost_matrix[i*nstates+j]);
622                     }
623                 }
624             }
625             break;
626         case SEQ_POMO:
627             ASSERT(0 && "POMO not handled with Sankoff parsimony");
628             break;
629         default:
630             break;
631     }
632 }
633 /**
634  compute partial parsimony score of the subtree rooted at dad
635  @param dad_branch the branch leading to the subtree
636  @param dad its dad, used to direct the traversal
637  */
computePartialParsimonySankoff(PhyloNeighbor * dad_branch,PhyloNode * dad)638 void PhyloTree::computePartialParsimonySankoff(PhyloNeighbor *dad_branch, PhyloNode *dad){
639     // don't recompute the parsimony
640     if (dad_branch->partial_lh_computed & 2)
641         return;
642 
643     Node *node = dad_branch->node;
644     //assert(node->degree() <= 3);
645     /*
646      if(aln->num_states != cost_nstates){
647      cout << "Your cost matrix is not compatible with the alignment"
648      << " in terms of number of states. Please check!" << endl;
649      exit(1);
650      }
651      */
652     int nstates = aln->num_states;
653     assert(dad_branch->partial_pars);
654 
655     int pars_block_size = getBitsBlockSize();
656 
657 
658     // internal node
659     UINT i, j, ptn, min_child_ptn_pars;
660 
661     UINT * partial_pars = dad_branch->partial_pars;
662     memset(partial_pars, 0, sizeof(UINT)*pars_block_size);
663 
664     PhyloNeighbor *left = NULL, *right = NULL;
665 
666     FOR_NEIGHBOR_IT(node, dad, it)
667         if ((*it)->node->name != ROOT_NAME) {
668             if (!(*it)->node->isLeaf())
669                 computePartialParsimonySankoff((PhyloNeighbor*) (*it), (PhyloNode*) node);
670             if (!left)
671                 left = ((PhyloNeighbor*)*it);
672             else
673                 right = ((PhyloNeighbor*)*it);
674         }
675 
676     if (!left->node->isLeaf() && right->node->isLeaf()) {
677         // swap leaf and internal node
678         PhyloNeighbor *tmp = left;
679         left = right;
680         right = tmp;
681     }
682     ASSERT(node->degree() >= 3);
683 
684     if (node->degree() > 3) {
685         // multifurcating node
686         for (ptn = 0; ptn < aln->ordered_pattern.size(); ptn++) {
687             int ptn_start_index = ptn*nstates;
688             UINT *partial_pars_ptr = &partial_pars[ptn_start_index];
689 
690             FOR_NEIGHBOR_IT(node, dad, it) if ((*it)->node->name != ROOT_NAME) {
691                 if ((*it)->node->isLeaf()) {
692                     // leaf node
693                     UINT *partial_pars_child_ptr = &tip_partial_pars[aln->ordered_pattern[ptn][(*it)->node->id]*nstates];
694 
695                     for(i = 0; i < nstates; i++){
696                         partial_pars_ptr[i] += partial_pars_child_ptr[i];
697                     }
698                 } else {
699                     // internal node
700                     UINT *partial_pars_child_ptr = &((PhyloNeighbor*) (*it))->partial_pars[ptn_start_index];
701                     UINT *cost_matrix_ptr = cost_matrix;
702 
703                     for (i = 0; i < nstates; i++){
704                         // min(j->i) from child_branch
705                         min_child_ptn_pars = partial_pars_child_ptr[0] + cost_matrix_ptr[0];
706                         for(j = 1; j < nstates; j++) {
707                             UINT value = partial_pars_child_ptr[j] + cost_matrix_ptr[j];
708                             min_child_ptn_pars = min(value, min_child_ptn_pars);
709                         }
710                         partial_pars_ptr[i] += min_child_ptn_pars;
711                         cost_matrix_ptr += nstates;
712                     }
713                 }
714             }
715         }
716     } else if (left->node->isLeaf() && right->node->isLeaf()) {
717         // tip-tip case
718         for (ptn = 0; ptn < aln->ordered_pattern.size(); ptn++){
719             // ignore const ptn because it does not affect pars score
720             //if (aln->at(ptn).isConst()) continue;
721             int ptn_start_index = ptn*nstates;
722 
723             UINT *left_ptr = &tip_partial_pars[aln->ordered_pattern[ptn][left->node->id]*nstates];
724             UINT *right_ptr = &tip_partial_pars[aln->ordered_pattern[ptn][right->node->id]*nstates];
725             UINT *partial_pars_ptr = &partial_pars[ptn_start_index];
726 
727             for (i = 0; i < nstates; i++){
728                 // min(j->i) from child_branch
729                 partial_pars_ptr[i] = left_ptr[i] + right_ptr[i];
730             }
731         }
732     } else if (left->node->isLeaf() && !right->node->isLeaf()) {
733         // tip-inner case
734         for (ptn = 0; ptn < aln->ordered_pattern.size(); ptn++){
735             // ignore const ptn because it does not affect pars score
736             //if (aln->at(ptn).isConst()) continue;
737             int ptn_start_index = ptn*nstates;
738 
739             UINT *left_ptr = &tip_partial_pars[aln->ordered_pattern[ptn][left->node->id]*nstates];
740             UINT *right_ptr = &right->partial_pars[ptn_start_index];
741             UINT *partial_pars_ptr = &partial_pars[ptn_start_index];
742             UINT *cost_matrix_ptr = cost_matrix;
743             UINT right_contrib;
744 
745             for(i = 0; i < nstates; i++){
746                 // min(j->i) from child_branch
747                 right_contrib = right_ptr[0] + cost_matrix_ptr[0];
748                 for(j = 1; j < nstates; j++) {
749                     right_contrib = min(right_ptr[j] + cost_matrix_ptr[j], right_contrib);
750                 }
751                 partial_pars_ptr[i] = left_ptr[i] + right_contrib;
752                 cost_matrix_ptr += nstates;
753             }
754         }
755     } else {
756         // inner-inner case
757         for (ptn = 0; ptn < aln->ordered_pattern.size(); ptn++){
758             // ignore const ptn because it does not affect pars score
759             //if (aln->at(ptn).isConst()) continue;
760             int ptn_start_index = ptn*nstates;
761 
762             UINT *left_ptr = &left->partial_pars[ptn_start_index];
763             UINT *right_ptr = &right->partial_pars[ptn_start_index];
764             UINT *partial_pars_ptr = &partial_pars[ptn_start_index];
765             UINT *cost_matrix_ptr = cost_matrix;
766             UINT left_contrib, right_contrib;
767 
768             for(i = 0; i < nstates; i++){
769                 // min(j->i) from child_branch
770                 left_contrib = left_ptr[0] + cost_matrix_ptr[0];
771                 right_contrib = right_ptr[0] + cost_matrix_ptr[0];
772                 for(j = 1; j < nstates; j++) {
773                     left_contrib = min(left_ptr[j] + cost_matrix_ptr[j], left_contrib);
774                     right_contrib = min(right_ptr[j] + cost_matrix_ptr[j], right_contrib);
775                 }
776                 partial_pars_ptr[i] = left_contrib+right_contrib;
777                 cost_matrix_ptr += nstates;
778             }
779         }
780 
781     }
782 
783     dad_branch->partial_lh_computed |= 2;
784 }
785 
786 /**
787  compute tree parsimony score based on a particular branch
788  @param dad_branch the branch leading to the subtree
789  @param dad its dad, used to direct the traversal
790  @param branch_subst (OUT) if not NULL, the number of substitutions on this branch
791  @return parsimony score of the tree
792  */
computeParsimonyBranchSankoff(PhyloNeighbor * dad_branch,PhyloNode * dad,int * branch_subst)793 int PhyloTree::computeParsimonyBranchSankoff(PhyloNeighbor *dad_branch, PhyloNode *dad, int *branch_subst) {
794 
795     if ((tip_partial_lh_computed & 2) == 0)
796         computeTipPartialParsimony();
797 
798     PhyloNode *node = (PhyloNode*) dad_branch->node;
799     PhyloNeighbor *node_branch = (PhyloNeighbor*) node->findNeighbor(dad);
800     assert(node_branch);
801 
802     if (!central_partial_pars)
803         initializeAllPartialPars();
804 
805     // DTH: I don't really understand what this is for. ###########
806     // swap node and dad if dad is a leaf
807     if (node->isLeaf()) {
808         PhyloNode *tmp_node = dad;
809         dad = node;
810         node = tmp_node;
811         PhyloNeighbor *tmp_nei = dad_branch;
812         dad_branch = node_branch;
813         node_branch = tmp_nei;
814         //        cout << "swapped\n";
815     }
816 
817     //int nptn = aln->size();
818     //    if(!_pattern_pars) _pattern_pars = aligned_alloc<BootValTypePars>(nptn+VCSIZE_USHORT);
819     //    memset(_pattern_pars, 0, sizeof(BootValTypePars) * (nptn+VCSIZE_USHORT));
820 
821     if ((dad_branch->partial_lh_computed & 2) == 0 && !node->isLeaf())
822         computePartialParsimonySankoff(dad_branch, dad);
823     if ((node_branch->partial_lh_computed & 2) == 0 && !dad->isLeaf())
824         computePartialParsimonySankoff(node_branch, node);
825 
826     // now combine likelihood at the branch
827     UINT tree_pars = 0;
828     int nstates = aln->num_states;
829     UINT i, j, ptn;
830     UINT branch_pars = 0;
831 
832     if (dad->isLeaf()) {
833         // external node
834         for (ptn = 0; ptn < aln->ordered_pattern.size(); ptn++){
835             int ptn_start_index = ptn * nstates;
836             UINT *node_branch_ptr = &tip_partial_pars[aln->ordered_pattern[ptn][dad->id]*nstates];
837             UINT *dad_branch_ptr = &dad_branch->partial_pars[ptn_start_index];
838             UINT min_ptn_pars = node_branch_ptr[0] + dad_branch_ptr[0];
839             UINT br_ptn_pars = node_branch_ptr[0];
840             for (i = 1; i < nstates; i++){
841                 // min(j->i) from node_branch
842                 UINT min_score = node_branch_ptr[i] + dad_branch_ptr[i];
843                 if (min_score < min_ptn_pars) {
844                     min_ptn_pars = min_score;
845                     br_ptn_pars = node_branch_ptr[i];
846                 }
847             }
848             //_pattern_pars[ptn] = min_ptn_pars;
849             tree_pars += min_ptn_pars * aln->ordered_pattern[ptn].frequency;
850             branch_pars += br_ptn_pars * aln->ordered_pattern[ptn].frequency;
851         }
852     }  else {
853         // internal node
854         for (ptn = 0; ptn < aln->ordered_pattern.size(); ptn++){
855             int ptn_start_index = ptn * nstates;
856             UINT *node_branch_ptr = &node_branch->partial_pars[ptn_start_index];
857             UINT *dad_branch_ptr = &dad_branch->partial_pars[ptn_start_index];
858             UINT *cost_matrix_ptr = cost_matrix;
859             UINT min_ptn_pars = UINT_MAX;
860             UINT br_ptn_pars = UINT_MAX;
861             for(i = 0; i < nstates; i++){
862                 // min(j->i) from node_branch
863                 UINT min_score = node_branch_ptr[0] + cost_matrix_ptr[0];
864                 UINT branch_score = cost_matrix_ptr[0];
865                 for(j = 1; j < nstates; j++) {
866                     UINT value = node_branch_ptr[j] + cost_matrix_ptr[j];
867                     if (value < min_score) {
868                         min_score = value;
869                         branch_score = cost_matrix_ptr[j];
870                     }
871 
872                 }
873                 min_score = min_score + dad_branch_ptr[i];
874                 if (min_score < min_ptn_pars) {
875                     min_ptn_pars = min_score;
876                     br_ptn_pars = branch_score;
877                 }
878                 cost_matrix_ptr += nstates;
879             }
880             //_pattern_pars[ptn] = min_ptn_pars;
881             tree_pars += min_ptn_pars * aln->ordered_pattern[ptn].frequency;
882             branch_pars += br_ptn_pars * aln->ordered_pattern[ptn].frequency;
883         }
884     }
885     if (branch_subst)
886         *branch_subst = branch_pars;
887     //    cout << endl;
888     return tree_pars;
889 }
890 
891 /****************************************************************************
892  Stepwise addition (greedy) by maximum parsimony
893  ****************************************************************************/
894 
895 // random generator function:
896 //ptrdiff_t myrandom(ptrdiff_t i) {
897 //    return random_int(i);
898 //}
899 
900 // pointer object to it:
901 //ptrdiff_t (*p_myrandom)(ptrdiff_t) = myrandom;
902 
create3TaxonTree(IntVector & taxon_order,int * rand_stream)903 void PhyloTree::create3TaxonTree(IntVector &taxon_order, int *rand_stream) {
904     freeNode();
905     int nseq = aln->getNSeq();
906     taxon_order.resize(nseq);
907     for (int i = 0; i < nseq; i++)
908         taxon_order[i] = i;
909     // randomize the addition order
910     my_random_shuffle(taxon_order.begin(), taxon_order.end(), rand_stream);
911 
912     root = newNode(nseq);
913 
914     // create star tree
915     for (leafNum = 0; leafNum < 3; leafNum++) {
916         if (leafNum < 3 && verbose_mode >= VB_MAX)
917             cout << "Add " << aln->getSeqName(taxon_order[leafNum]) << " to the tree" << endl;
918         Node *new_taxon = newNode(taxon_order[leafNum], aln->getSeqName(taxon_order[leafNum]).c_str());
919         root->addNeighbor(new_taxon, -1.0);
920         new_taxon->addNeighbor(root, -1.0);
921     }
922     root = root->neighbors[0]->node;
923 }
924 
copyConstraintTree(MTree * tree,IntVector & taxon_order,int * rand_stream)925 void PhyloTree::copyConstraintTree(MTree *tree, IntVector &taxon_order, int *rand_stream) {
926     MTree::copyTree(tree);
927     // assign proper taxon IDs
928     NodeVector nodes;
929     NodeVector::iterator it;
930     getTaxa(nodes);
931     leafNum = nodes.size();
932     vector<int> pushed;
933     pushed.resize(aln->getNSeq(), 0);
934 
935     // name map for fast lookup
936     StrVector seq_names = aln->getSeqNames();
937     StringIntMap name2id;
938     for (auto sit = seq_names.begin(); sit != seq_names.end(); sit++)
939         name2id[*sit] = sit - seq_names.begin();
940 
941     // reindex taxon ID from alignment
942     for (it = nodes.begin(); it != nodes.end(); it++) {
943         (*it)->id = name2id[(*it)->name];
944         ASSERT((*it)->id >= 0);
945         taxon_order.push_back((*it)->id);
946         pushed[(*it)->id] = 1;
947     }
948     ASSERT(taxon_order.size() == constraintTree.leafNum);
949 
950     // reindex internal nodes properly
951     nodes.clear();
952     getInternalNodes(nodes);
953     for (it = nodes.begin(); it != nodes.end(); it++)
954         (*it)->id = aln->getNSeq() + (it - nodes.begin());
955 
956     // add the remaining taxa
957     for (int i = 0; i < aln->getNSeq(); i++)
958         if (!pushed[i]) {
959             taxon_order.push_back(i);
960         }
961     // randomize the addition order
962     my_random_shuffle(taxon_order.begin()+constraintTree.leafNum, taxon_order.end(), rand_stream);
963 }
964 
965 /**
966  get all neighboring branches to a removed node
967  */
getNeiBranches(NeighborVec & removed_nei,NodeVector & attached_node,NodeVector & added_nodes,int i,NodeVector & nodes1,NodeVector & nodes2)968 void getNeiBranches(NeighborVec &removed_nei, NodeVector &attached_node, NodeVector &added_nodes, int i,
969                     NodeVector &nodes1, NodeVector &nodes2)
970 {
971     // get target branches surrounding attached_node
972     FOR_NEIGHBOR_IT(attached_node[i], NULL, it) {
973         if (attached_node[i]->id < (*it)->node->id) {
974             nodes1.push_back(attached_node[i]);
975             nodes2.push_back((*it)->node);
976         } else {
977             nodes2.push_back(attached_node[i]);
978             nodes1.push_back((*it)->node);
979         }
980     }
981     // get target branches surrounding previous added_nodes
982     int j;
983     for (j = i-1; j >= 0; j--) {
984         if (attached_node[j] != attached_node[i])
985             break;
986         Node *node = added_nodes[j];
987         FOR_NEIGHBOR_IT(node, NULL, it) {
988             if (node->id < (*it)->node->id) {
989                 bool present = false;
990                 for (int k = 0; k < nodes1.size(); k++)
991                     if (node == nodes1[k] && (*it)->node == nodes2[k]) {
992                         present = true;
993                         break;
994                     }
995                 if (present) continue;
996                 nodes1.push_back(node);
997                 nodes2.push_back((*it)->node);
998             } else {
999                 bool present = false;
1000                 for (int k = 0; k < nodes1.size(); k++)
1001                     if (node == nodes2[k] && (*it)->node == nodes1[k]) {
1002                         present = true;
1003                         break;
1004                     }
1005                 if (present) continue;
1006                 nodes2.push_back(node);
1007                 nodes1.push_back((*it)->node);
1008             }
1009         }
1010         // check that exactly two branches are added
1011     }
1012     ASSERT(nodes1.size() == 3 + (i-j-1)*2);
1013 
1014 }
1015 
insertNode2Branch(Node * added_node,Node * target_node,Node * target_dad)1016 void PhyloTree::insertNode2Branch(Node* added_node, Node* target_node, Node* target_dad) {
1017     target_node->updateNeighbor(target_dad, added_node, -1.0);
1018     target_dad->updateNeighbor(target_node, added_node, -1.0);
1019     added_node->updateNeighbor((Node*) 1, target_node, -1.0);
1020     added_node->updateNeighbor((Node*) 2, target_dad, -1.0);
1021     ((PhyloNeighbor*) added_node->findNeighbor(target_node))->partial_pars =
1022     ((PhyloNeighbor*) target_dad->findNeighbor(added_node))->partial_pars;
1023     ((PhyloNeighbor*) added_node->findNeighbor(target_dad))->partial_pars =
1024     ((PhyloNeighbor*) target_node->findNeighbor(added_node))->partial_pars;
1025 
1026     ((PhyloNeighbor*) added_node->findNeighbor(target_node))->partial_lh_computed =
1027     ((PhyloNeighbor*) target_dad->findNeighbor(added_node))->partial_lh_computed;
1028     ((PhyloNeighbor*) added_node->findNeighbor(target_dad))->partial_lh_computed =
1029     ((PhyloNeighbor*) target_node->findNeighbor(added_node))->partial_lh_computed;
1030 
1031     PhyloNode *ass_node = (PhyloNode*)added_node->neighbors[0]->node;
1032     ((PhyloNeighbor*)ass_node->findNeighbor(added_node))->clearPartialLh();
1033     ass_node->clearReversePartialLh((PhyloNode*)added_node);
1034 }
1035 
computeParsimonyTree(const char * out_prefix,Alignment * alignment,int * rand_stream)1036 int PhyloTree::computeParsimonyTree(const char *out_prefix, Alignment *alignment, int *rand_stream) {
1037     aln = alignment;
1038     int nseq = aln->getNSeq();
1039     if (nseq < 3)
1040         outError(ERR_FEW_TAXA);
1041 
1042     IntVector taxon_order;
1043     taxon_order.reserve(aln->getNSeq());
1044 
1045     NeighborVec removed_nei; // removed Neighbor
1046     NodeVector attached_node; // node attached to removed Neighbor
1047     NodeVector added_nodes; // newly added nodes
1048     int newNodeID;
1049     size_t index;
1050     size_t pars_block_size = getBitsBlockSize();
1051 
1052     if (constraintTree.empty()) {
1053         create3TaxonTree(taxon_order, rand_stream);
1054         ASSERT(leafNum == 3);
1055         initializeAllPartialPars();
1056         index = (2*leafNum-3)*2;
1057         newNodeID = nseq + leafNum - 2;
1058     } else {
1059         // first copy the constraint tree
1060         copyConstraintTree(&constraintTree, taxon_order, rand_stream);
1061         newNodeID = nodeNum - leafNum + nseq;
1062         index = (branchNum)*2;
1063 
1064         // initialize partial_pars to reuse later
1065         initializeAllPartialPars();
1066 
1067         // extract a bifurcating subtree and get removed nodes to insert later
1068         extractBifurcatingSubTree(removed_nei, attached_node, rand_stream);
1069 
1070         added_nodes.reserve(removed_nei.size());
1071     }
1072     if (verbose_mode >= VB_MAX)
1073         cout << "computeParsimony: " << computeParsimony() << endl;
1074 
1075     //UINT *tmp_partial_pars;
1076     //tmp_partial_pars = newBitsBlock();
1077     if (nseq == 3)
1078         best_pars_score = computeParsimony();
1079 
1080     best_pars_score = 0;
1081     if (leafNum == nseq) {
1082         outWarning("Constraint tree has all taxa and is bifurcating, which strictly enforces final tree!");
1083     }
1084 
1085     // stepwise adding the next taxon for the remaining taxa
1086     for (int step = 0; leafNum < nseq; step++) {
1087         NodeVector nodes1, nodes2;
1088         PhyloNode *target_node = NULL;
1089         PhyloNode *target_dad = NULL;
1090         best_pars_score = UINT_MAX;
1091 
1092         // create a new node attached to new taxon or removed node
1093         PhyloNode *added_node = (PhyloNode*)newNode(newNodeID++);
1094         PhyloNode *new_taxon;
1095 
1096         if (step < removed_nei.size()) {
1097             // add the removed_nei (from constraint tree) back to the tree
1098             getNeiBranches(removed_nei, attached_node, added_nodes, step, nodes1, nodes2);
1099             new_taxon = (PhyloNode*)removed_nei[step]->node;
1100             added_node->neighbors.push_back(removed_nei[step]);
1101             new_taxon->updateNeighbor(attached_node[step], added_node);
1102             added_nodes.push_back(added_node);
1103         } else {
1104             // add new taxon to the tree
1105             if (verbose_mode >= VB_MAX)
1106                 cout << "Adding " << aln->getSeqName(taxon_order[leafNum]) << " to the tree..." << endl;
1107             getBranches(nodes1, nodes2);
1108 
1109             // allocate a new taxon
1110             new_taxon = (PhyloNode*)newNode(taxon_order[leafNum], aln->getSeqName(taxon_order[leafNum]).c_str());
1111 
1112             // link new_taxon and added_node
1113             added_node->addNeighbor(new_taxon, -1.0);
1114             new_taxon->addNeighbor(added_node, -1.0);
1115 
1116             // allocate memory
1117             ((PhyloNeighbor*)new_taxon->findNeighbor(added_node))->partial_pars = central_partial_pars + ((index++) * pars_block_size);
1118             ((PhyloNeighbor*)added_node->findNeighbor(new_taxon))->partial_pars = central_partial_pars + ((index++) * pars_block_size);
1119         }
1120         // preserve two neighbors
1121         added_node->addNeighbor((Node*) 1, -1.0);
1122         added_node->addNeighbor((Node*) 2, -1.0);
1123 
1124         for (int nodeid = 0; nodeid < nodes1.size(); nodeid++) {
1125 
1126             int score = addTaxonMPFast(new_taxon, added_node, nodes1[nodeid], nodes2[nodeid]);
1127             if (score < best_pars_score) {
1128                 best_pars_score = score;
1129                 target_node = (PhyloNode*)nodes1[nodeid];
1130                 target_dad = (PhyloNode*)nodes2[nodeid];
1131             }
1132         }
1133 
1134         if (verbose_mode >= VB_MAX)
1135             cout << ", score = " << best_pars_score << endl;
1136         // now insert the new node in the middle of the branch node-dad
1137         insertNode2Branch(added_node, target_node, target_dad);
1138 
1139         // assign partial_pars storage
1140         ((PhyloNeighbor*)target_dad->findNeighbor(added_node))->clearPartialLh();
1141         ((PhyloNeighbor*)target_dad->findNeighbor(added_node))->partial_pars = central_partial_pars + ((index++) * pars_block_size);
1142         ((PhyloNeighbor*)target_node->findNeighbor(added_node))->clearPartialLh();
1143         ((PhyloNeighbor*)target_node->findNeighbor(added_node))->partial_pars = central_partial_pars + ((index++) * pars_block_size);
1144 
1145         target_dad->clearReversePartialLh(added_node);
1146         target_node->clearReversePartialLh(added_node);
1147 
1148         // increase number of taxa
1149         leafNum += getNumTaxa(new_taxon, added_node);
1150     }
1151 
1152     ASSERT(index == 4*leafNum-6);
1153 
1154     nodeNum = 2 * leafNum - 2;
1155     initializeTree();
1156     // parsimony tree is always unrooted
1157     bool orig_rooted = rooted;
1158     rooted = false;
1159     setAlignment(alignment);
1160 //    initializeAllPartialPars();
1161 //    clearAllPartialLH();
1162     fixNegativeBranch(true);
1163     // convert to rooted tree if originally so
1164     if (orig_rooted)
1165         convertToRooted();
1166     if (out_prefix) {
1167 		string file_name = out_prefix;
1168 		file_name += ".parstree";
1169 		printTree(file_name.c_str(), WT_NEWLINE + WT_BR_LEN);
1170     }
1171 //    if (isSuperTree())
1172 //        ((PhyloSuperTree*)this)->mapTrees();
1173 
1174     return best_pars_score;
1175 }
1176 
addTaxonMPFast(Node * added_taxon,Node * added_node,Node * node,Node * dad)1177 int PhyloTree::addTaxonMPFast(Node *added_taxon, Node* added_node, Node* node, Node* dad) {
1178 
1179     // now insert the new node in the middle of the branch node-dad
1180     insertNode2Branch(added_node, node, dad);
1181 
1182     // compute the likelihood
1183     int score = computeParsimonyBranch((PhyloNeighbor*)added_taxon->findNeighbor(added_node), (PhyloNode*)added_taxon);
1184 
1185     // remove the added node
1186     node->updateNeighbor(added_node, dad);
1187     dad->updateNeighbor(added_node, node);
1188     added_node->updateNeighbor(node, (Node*) 1);
1189     added_node->updateNeighbor(dad, (Node*) 2);
1190 
1191     // set partial_pars to COMPUTED
1192     ((PhyloNeighbor*)node->findNeighbor(dad))->partial_lh_computed |= 2;
1193     ((PhyloNeighbor*)dad->findNeighbor(node))->partial_lh_computed |= 2;
1194 
1195     // now tranverse the tree downwards
1196 
1197 //    FOR_NEIGHBOR_IT(node, dad, it){
1198 //        addTaxonMPFast(added_node, target_node, target_dad, target_partial_pars, (*it)->node, node);
1199 //    }
1200     return score;
1201 
1202 }
1203 
extractBifurcatingSubTree(NeighborVec & removed_nei,NodeVector & attached_node,int * rand_stream)1204 void PhyloTree::extractBifurcatingSubTree(NeighborVec &removed_nei, NodeVector &attached_node, int *rand_stream) {
1205     NodeVector nodes;
1206     getMultifurcatingNodes(nodes);
1207     if (nodes.empty())
1208         return;
1209 
1210     int i;
1211 
1212     computeBranchDirection();
1213 
1214     // firstly make bifurcating tree
1215     for (NodeVector::iterator it = nodes.begin(); it != nodes.end(); it++)
1216     {
1217         Node *node = (*it);
1218         int id[3];
1219         id[0] = -1;
1220         // find the neighbor toward root to preserve root
1221         for (i = 0; i < node->neighbors.size(); i++)
1222             if (((PhyloNeighbor*)node->neighbors[i])->direction == TOWARD_ROOT) {
1223                 id[0] = i;
1224                 break;
1225             }
1226         ASSERT(id[0] >= 0);
1227         // randomly choose 2 neighbors to reserve
1228         do {
1229             id[1] = random_int(node->degree(), rand_stream);
1230         } while (id[1] == id[0]);
1231 
1232         do {
1233             id[2] = random_int(node->degree(), rand_stream);
1234         } while (id[2] == id[0] || id[2] == id[1]);
1235 
1236         std::sort(id, id+3);
1237 
1238         // remove taxa
1239         int cur_size = removed_nei.size();
1240         for (i = 0; i < node->degree(); i++)
1241             if (i != id[0] && i != id[1] && i != id[2]) {
1242                 removed_nei.push_back(node->neighbors[i]);
1243                 attached_node.push_back(node);
1244             }
1245         // randomize removed_nei
1246         my_random_shuffle(removed_nei.begin() + cur_size, removed_nei.end(), rand_stream);
1247 
1248         // remove neigbors to make bifurcating tree
1249         node->neighbors[0] = node->neighbors[id[0]];
1250         node->neighbors[1] = node->neighbors[id[1]];
1251         node->neighbors[2] = node->neighbors[id[2]];
1252         node->neighbors.resize(3);
1253     }
1254 
1255     leafNum = getNumTaxa();
1256 
1257 }
1258