1 /*
2 * phylotreepars.cpp
3 *
4 * Fast implementation of parsimony kernel
5 *
6 * Created on: May 18, 2015
7 * Author: minh
8 */
9
10 #include "phylotree.h"
11 //#include "vectorclass/vectorclass.h"
12 #include "phylosupertree.h"
13
14 #if defined (__GNUC__) || defined(__clang__)
15 #define vml_popcnt __builtin_popcount
16 #else
17 // taken from vectorclass library
vml_popcnt(uint32_t a)18 static inline uint32_t vml_popcnt (uint32_t a) {
19 // popcnt instruction not available
20 uint32_t b = a - ((a >> 1) & 0x55555555);
21 uint32_t c = (b & 0x33333333) + ((b >> 2) & 0x33333333);
22 uint32_t d = (c + (c >> 4)) & 0x0F0F0F0F;
23 uint32_t e = d * 0x01010101;
24 return e >> 24;
25 }
26 #endif
27
28 /***********************************************************/
29 /****** optimized version of parsimony kernel **************/
30 /***********************************************************/
31
computePartialParsimonyFast(PhyloNeighbor * dad_branch,PhyloNode * dad)32 void PhyloTree::computePartialParsimonyFast(PhyloNeighbor *dad_branch, PhyloNode *dad) {
33 if (dad_branch->partial_lh_computed & 2)
34 return;
35 Node *node = dad_branch->node;
36 int nstates = aln->getMaxNumStates();
37 int site = 0;
38
39 dad_branch->partial_lh_computed |= 2;
40
41 vector<Alignment*> *partitions = NULL;
42 if (aln->isSuperAlignment())
43 partitions = &((SuperAlignment*)aln)->partitions;
44 else {
45 partitions = new vector<Alignment*>;
46 partitions->push_back(aln);
47 }
48
49 if (node->name == ROOT_NAME) {
50 ASSERT(dad);
51 int pars_size = getBitsBlockSize();
52 memset(dad_branch->partial_pars, 255, pars_size*sizeof(UINT));
53 size_t nsites = (aln->num_parsimony_sites+UINT_BITS-1)/UINT_BITS;
54 dad_branch->partial_pars[nstates*nsites] = 0;
55 } else if (node->isLeaf() && dad) {
56 // external node
57 int leafid = node->id;
58 memset(dad_branch->partial_pars, 0, getBitsBlockSize()*sizeof(UINT));
59 int max_sites = ((aln->num_parsimony_sites+UINT_BITS-1)/UINT_BITS)*UINT_BITS;
60 int ambi_aa[] = {2, 3, 5, 6, 9, 10}; // {4+8, 32+64, 512+1024};
61 // if (aln->ordered_pattern.empty())
62 // aln->orderPatternByNumChars();
63 ASSERT(!aln->ordered_pattern.empty());
64 int start_pos = 0;
65 for (vector<Alignment*>::iterator alnit = partitions->begin(); alnit != partitions->end(); alnit++) {
66 int end_pos = start_pos + (*alnit)->ordered_pattern.size();
67 switch ((*alnit)->seq_type) {
68 case SEQ_DNA:
69 for (int patid = start_pos; patid != end_pos; patid++) {
70 Alignment::iterator pat = aln->ordered_pattern.begin()+ patid;
71 int state = pat->at(leafid);
72 int freq = pat->frequency;
73 if (state < 4) {
74 for (int j = 0; j < freq; j++, site++) {
75 dad_branch->partial_pars[(site/UINT_BITS)*nstates+state] |= (1 << (site % UINT_BITS));
76 }
77 } else if (state == (*alnit)->STATE_UNKNOWN) {
78 for (int j = 0; j < freq; j++, site++) {
79 UINT *p = dad_branch->partial_pars+((site/UINT_BITS)*nstates);
80 UINT bit1 = (1 << (site%UINT_BITS));
81 p[0] |= bit1;
82 p[1] |= bit1;
83 p[2] |= bit1;
84 p[3] |= bit1;
85 }
86 } else {
87 state -= 3;
88 ASSERT(state < 15);
89 for (int j = 0; j < freq; j++, site++) {
90 UINT *p = dad_branch->partial_pars+((site/UINT_BITS)*nstates);
91 UINT bit1 = (1 << (site%UINT_BITS));
92 for (int i = 0; i < 4; i++)
93 if (state & (1<<i))
94 p[i] |= bit1;
95 }
96 }
97 }
98 //assert(site == aln->num_informative_sites);
99 // add dummy states
100 //if (site < max_sites)
101 // dad_branch->partial_pars[(site/UINT_BITS)*4] |= ~((1<<(site%UINT_BITS)) - 1);
102 break;
103 case SEQ_PROTEIN:
104 for (int patid = start_pos; patid != end_pos; patid++) {
105 Alignment::iterator pat = aln->ordered_pattern.begin()+ patid;
106 int state = pat->at(leafid);
107 int freq = pat->frequency;
108 if (state < 20) {
109 for (int j = 0; j < freq; j++, site++) {
110 dad_branch->partial_pars[(site/UINT_BITS)*nstates+state] |= (1 << (site % UINT_BITS));
111 }
112 } else if (state == (*alnit)->STATE_UNKNOWN) {
113 for (int j = 0; j < freq; j++, site++) {
114 UINT *p = dad_branch->partial_pars+((site/UINT_BITS)*nstates);
115 UINT bit1 = (1 << (site%UINT_BITS));
116 for (int i = 0; i < 20; i++)
117 p[i] |= bit1;
118 }
119 } else {
120 ASSERT(state < 23);
121 state = (state-20)*2;
122 for (int j = 0; j < freq; j++, site++) {
123 UINT *p = dad_branch->partial_pars+((site/UINT_BITS)*nstates);
124 UINT bit1 = (1 << (site%UINT_BITS));
125 p[ambi_aa[state]] |= bit1;
126 p[ambi_aa[state+1]] |= bit1;
127 }
128 }
129 }
130 //assert(site == aln->num_informative_sites);
131 // add dummy states
132 //if (site < max_sites)
133 // dad_branch->partial_pars[(site/UINT_BITS)*20] |= ~((1<<(site%UINT_BITS)) - 1);
134 break;
135 default:
136 for (int patid = start_pos; patid != end_pos; patid++) {
137 Alignment::iterator pat = aln->ordered_pattern.begin()+ patid;
138 int state = pat->at(leafid);
139 int freq = pat->frequency;
140 if (aln->seq_type == SEQ_POMO && state >= (*alnit)->num_states && state < (*alnit)->STATE_UNKNOWN) {
141 state = (*alnit)->convertPomoState(state);
142 }
143 if (state < (*alnit)->num_states) {
144 for (int j = 0; j < freq; j++, site++) {
145 dad_branch->partial_pars[(site/UINT_BITS)*nstates+state] |= (1 << (site % UINT_BITS));
146 }
147 } else if (state == (*alnit)->STATE_UNKNOWN) {
148 for (int j = 0; j < freq; j++, site++) {
149 UINT *p = dad_branch->partial_pars+((site/UINT_BITS)*nstates);
150 UINT bit1 = (1 << (site%UINT_BITS));
151 for (int i = 0; i < (*alnit)->num_states; i++)
152 p[i] |= bit1;
153 }
154 } else {
155 ASSERT(0);
156 }
157 }
158 break;
159 } // end of switch
160
161 start_pos = end_pos;
162 } // FOR LOOP
163
164 ASSERT(site == aln->num_parsimony_sites);
165 // add dummy states
166 if (site < max_sites)
167 dad_branch->partial_pars[(site/UINT_BITS)*nstates] |= ~((1<<(site%UINT_BITS)) - 1);
168 } else {
169 // internal node
170 ASSERT(node->degree() == 3); // it works only for strictly bifurcating tree
171 PhyloNeighbor *left = NULL, *right = NULL; // left & right are two neighbors leading to 2 subtrees
172 FOR_NEIGHBOR_IT(node, dad, it) {
173 PhyloNeighbor* pit = (PhyloNeighbor*) (*it);
174 if ((*it)->node->name != ROOT_NAME && (pit->partial_lh_computed & 2) == 0) {
175 computePartialParsimonyFast(pit, (PhyloNode*) node);
176 }
177 if (!left) left = pit; else right = pit;
178 }
179 // UINT score = left->partial_pars[0] + right->partial_pars[0];
180 UINT score = 0;
181 int nsites = aln->num_parsimony_sites;
182 nsites = (nsites+UINT_BITS-1)/UINT_BITS;
183
184 switch (nstates) {
185 case 4:
186 #ifdef _OPENMP
187 #pragma omp parallel for private (site) reduction(+: score) if(nsites>200)
188 #endif
189 for (site = 0; site<nsites; site++) {
190 UINT w;
191 size_t offset = nstates*site;
192 UINT *x = left->partial_pars + offset;
193 UINT *y = right->partial_pars + offset;
194 UINT *z = dad_branch->partial_pars + offset;
195 z[0] = x[0] & y[0];
196 z[1] = x[1] & y[1];
197 z[2] = x[2] & y[2];
198 z[3] = x[3] & y[3];
199 w = z[0] | z[1] | z[2] | z[3];
200 w = ~w;
201 score += __builtin_popcount(w);
202 z[0] |= w & (x[0] | y[0]);
203 z[1] |= w & (x[1] | y[1]);
204 z[2] |= w & (x[2] | y[2]);
205 z[3] |= w & (x[3] | y[3]);
206 }
207 break;
208 default:
209 #ifdef _OPENMP
210 #pragma omp parallel for private (site) reduction(+: score) if(nsites > 800/nstates)
211 #endif
212 for (site = 0; site<nsites; site++) {
213 int i;
214 UINT w = 0;
215 size_t offset = nstates*site;
216 UINT *x = left->partial_pars + offset;
217 UINT *y = right->partial_pars + offset;
218 UINT *z = dad_branch->partial_pars + offset;
219
220 for (i = 0; i < nstates; i++) {
221 z[i] = x[i] & y[i];
222 w |= z[i];
223 }
224 w = ~w;
225 score += vml_popcnt(w);
226 for (i = 0; i < nstates; i++) {
227 z[i] |= w & (x[i] | y[i]);
228 }
229 }
230 break;
231 }
232 dad_branch->partial_pars[nstates*nsites] = score + left->partial_pars[nstates*nsites] + right->partial_pars[nstates*nsites];
233 // dad_branch->partial_pars[0] = score;
234 }
235
236 if (!aln->isSuperAlignment())
237 delete partitions;
238 }
239
240
computeParsimonyBranchFast(PhyloNeighbor * dad_branch,PhyloNode * dad,int * branch_subst)241 int PhyloTree::computeParsimonyBranchFast(PhyloNeighbor *dad_branch, PhyloNode *dad, int *branch_subst) {
242 PhyloNode *node = (PhyloNode*) dad_branch->node;
243 PhyloNeighbor *node_branch = (PhyloNeighbor*) node->findNeighbor(dad);
244 ASSERT(node_branch);
245 if (!central_partial_pars)
246 initializeAllPartialPars();
247 if ((dad_branch->partial_lh_computed & 2) == 0)
248 computePartialParsimonyFast(dad_branch, dad);
249 if ((node_branch->partial_lh_computed & 2) == 0)
250 computePartialParsimonyFast(node_branch, node);
251 int site;
252 int nsites = (aln->num_parsimony_sites + UINT_BITS-1) / UINT_BITS;
253 int nstates = aln->getMaxNumStates();
254
255 int scoreid = ((aln->num_parsimony_sites+UINT_BITS-1)/UINT_BITS)*nstates;
256 UINT sum_end_node = (dad_branch->partial_pars[scoreid] + node_branch->partial_pars[scoreid]);
257 UINT score = sum_end_node;
258
259 UINT lower_bound = best_pars_score;
260 if (branch_subst) lower_bound = INT_MAX;
261 switch (nstates) {
262 case 4:
263 #ifdef _OPENMP
264 #pragma omp parallel for private (site) reduction(+: score) if(nsites>200)
265 #endif
266 for (site = 0; site < nsites; site++) {
267 size_t offset = 4*site;
268 UINT *x = dad_branch->partial_pars + offset;
269 UINT *y = node_branch->partial_pars + offset;
270 UINT w = (x[0] & y[0]) | (x[1] & y[1]) | (x[2] & y[2]) | (x[3] & y[3]);
271 w = ~w;
272 score += vml_popcnt(w);
273 // #ifndef _OPENMP
274 // if (score >= lower_bound)
275 // break;
276 // #endif
277 }
278 break;
279 default:
280 #ifdef _OPENMP
281 #pragma omp parallel for private (site) reduction(+: score) if(nsites > 800/nstates)
282 #endif
283 for (site = 0; site < nsites; site++) {
284 size_t offset = nstates*site;
285 UINT *x = dad_branch->partial_pars + offset;
286 UINT *y = node_branch->partial_pars + offset;
287 int i;
288 UINT w = x[0] & y[0];
289 for (i = 1; i < nstates; i++) {
290 w |= x[i] & y[i];
291 }
292 w = ~w;
293 score += vml_popcnt(w);
294 // #ifndef _OPENMP
295 // if (score >= lower_bound)
296 // break;
297 // #endif
298 }
299 break;
300 }
301 if (branch_subst)
302 *branch_subst = score - sum_end_node;
303 // score += sum_end_node;
304 return score;
305 }
306
computeAllPartialPars(PhyloNode * node,PhyloNode * dad)307 void PhyloTree::computeAllPartialPars(PhyloNode *node, PhyloNode *dad) {
308 if (!node) node = (PhyloNode*)root;
309 FOR_NEIGHBOR_IT(node, dad, it) {
310 if ((((PhyloNeighbor*)*it)->partial_lh_computed & 1) == 0)
311 computePartialParsimony((PhyloNeighbor*)*it, node);
312 PhyloNeighbor *rev = (PhyloNeighbor*) (*it)->node->findNeighbor(node);
313 if ((rev->partial_lh_computed & 1) == 0)
314 computePartialParsimony(rev, (PhyloNode*)(*it)->node);
315 computeAllPartialPars((PhyloNode*)(*it)->node, node);
316 }
317 }
318
JukesCantorCorrection(double dist,double alpha)319 double PhyloTree::JukesCantorCorrection(double dist, double alpha) {
320 double z = (double) aln->num_states / (aln->num_states - 1);
321 double x = 1.0 - (z * dist);
322 if (x > 0) {
323 if (alpha <= 0.0) {
324 dist = -log(x) / z;
325 } else {
326 //if (verbose_mode >= VB_MAX) cout << "alpha: " << alpha << endl;
327 dist = alpha * (pow(x, -1.0/alpha) - 1) / z;
328 }
329 }
330 // Branch lengths under PoMo are #events, which is ~N^2 * #substitutions
331 if (aln->seq_type == SEQ_POMO)
332 dist *= aln->virtual_pop_size * aln->virtual_pop_size;
333 if (dist < Params::getInstance().min_branch_length)
334 dist = Params::getInstance().min_branch_length;
335 return dist;
336 }
337
setParsimonyBranchLengths()338 int PhyloTree::setParsimonyBranchLengths() {
339
340 NodeVector nodes1, nodes2;
341 getBranches(nodes1, nodes2);
342 clearAllPartialLH();
343
344 int sum_score = 0;
345 double persite = 1.0/getAlnNSite();
346 double alpha = (site_rate) ? site_rate->getGammaShape() : 1.0;
347 // int pars_score;
348 //int i, state;
349
350 PhyloNeighbor *dad_branch = (PhyloNeighbor*)nodes1[0]->findNeighbor(nodes2[0]);
351 PhyloNeighbor *node_branch = (PhyloNeighbor*)nodes2[0]->findNeighbor(nodes1[0]);
352 PhyloNode *dad = (PhyloNode*) nodes1[0];
353 PhyloNode *node = (PhyloNode*) nodes2[0];
354
355 // determine state of the root
356 int branch_subst = 0;
357 int pars_score = computeParsimonyBranchFast(dad_branch, dad, &branch_subst);
358 int site, real_site;
359 int nsites = (aln->num_parsimony_sites + UINT_BITS-1) / UINT_BITS;
360 int nstates = aln->getMaxNumStates();
361
362 vector<vector<StateType> > sequences;
363 sequences.resize(nodeNum, vector<StateType>(aln->num_parsimony_sites, aln->STATE_UNKNOWN));
364 vector<bool> done;
365 done.resize(nodeNum, false);
366 done[node->id] = done[dad->id] = true;
367
368 int subst = 0;
369
370 for (site = 0, real_site = 0; site < nsites; site++) {
371 size_t offset = nstates*site;
372 UINT *x = dad_branch->partial_pars + offset;
373 UINT *y = node_branch->partial_pars + offset;
374 UINT w = x[0] & y[0];
375 int state;
376 for (state = 1; state < nstates; state++) {
377 w |= x[state] & y[state];
378 }
379 UINT bit = 1;
380 for (int s = 0; s < UINT_BITS && real_site < aln->num_parsimony_sites; s++, bit = bit << 1, real_site++)
381 if (w & bit) {
382 // intersection is non-empty
383 for (state = 0; state < nstates; state++)
384 if ((x[state] & bit) && (y[state] & bit)) {
385 // assign the first state in the intersection
386 sequences[node->id][real_site] = sequences[dad->id][real_site] = state;
387 break;
388 }
389 } else {
390 // intersection is empty
391 subst++;
392 for (state = 0; state < nstates; state++)
393 if (x[state] & bit) {
394 // assign the first admissible state
395 sequences[node->id][real_site] = state;
396 break;
397 }
398 for (state = 0; state < nstates; state++)
399 if (y[state] & bit) {
400 // assign the first admissible state
401 sequences[dad->id][real_site] = state;
402 break;
403 }
404 }
405 }
406
407 ASSERT(subst == branch_subst);
408 sum_score += subst;
409 fixOneNegativeBranch(correctBranchLengthF81(subst*persite, alpha), dad_branch, dad);
410
411
412 // walking down the tree to assign node states
413 for (int id = 1; id < nodes1.size(); id++) {
414 // arrange such that states of dad are known
415 if (done[nodes1[id]->id]) {
416 dad = (PhyloNode*)nodes1[id];
417 node = (PhyloNode*)nodes2[id];
418 } else {
419 ASSERT(done[nodes2[id]->id]);
420 dad = (PhyloNode*)nodes2[id];
421 node = (PhyloNode*)nodes1[id];
422 }
423 done[node->id] = true;
424 // now determine states of node
425 dad_branch = (PhyloNeighbor*)dad->findNeighbor(node);
426 node_branch = (PhyloNeighbor*)node->findNeighbor(dad);
427 subst = 0;
428 for (site = 0, real_site = 0; site < nsites; site++) {
429 size_t offset = nstates*site;
430 UINT *x = dad_branch->partial_pars + offset;
431 //UINT *y = node_branch->partial_pars + offset;
432 int state;
433 UINT bit = 1;
434 for (int s = 0; s < UINT_BITS && real_site < aln->num_parsimony_sites; s++, bit = bit << 1, real_site++) {
435 StateType dad_state = sequences[dad->id][real_site];
436 ASSERT(dad_state < nstates);
437 //ASSERT(y[dad_state] & bit);
438 if (x[dad_state] & bit) {
439 // same state as dad
440 sequences[node->id][real_site] = dad_state;
441 } else {
442 // different state from dad
443 subst++;
444 for (state = 0; state < nstates; state++)
445 if (x[state] & bit) {
446 // assign the first admissible state
447 sequences[node->id][real_site] = state;
448 break;
449 }
450 }
451 }
452 }
453 fixOneNegativeBranch(correctBranchLengthF81(subst*persite, alpha), dad_branch, dad);
454 // computeParsimonyBranchFast(dad_branch, dad, &branch_subst);
455 // ASSERT(subst <= branch_subst);
456 sum_score += subst;
457 }
458
459 ASSERT(pars_score == sum_score);
460
461 return nodes1.size();
462 }
463
464
465 /****************************************************************************
466 Sankoff parsimony function
467 ****************************************************************************/
468
469
initCostMatrix(CostMatrixType cost_type)470 void PhyloTree::initCostMatrix(CostMatrixType cost_type) {
471 if(cost_matrix){
472 aligned_free(cost_matrix);
473 cost_matrix = NULL;
474 }
475 ASSERT(aln);
476 int cost_nstates = aln->num_states;
477 // allocate memory for cost_matrix
478 cost_matrix = aligned_alloc<unsigned int>(cost_nstates * cost_nstates);
479
480 switch (cost_type) {
481 case CM_LINEAR:
482 for(int i = 0; i < cost_nstates; i++){
483 for(int j = 0; j < cost_nstates; j++)
484 cost_matrix[i * cost_nstates + j] = abs(i-j);
485 }
486 break;
487 case CM_UNIFORM:
488 for(int i = 0; i < cost_nstates; i++){
489 for(int j = 0; j < cost_nstates; j++)
490 cost_matrix[i * cost_nstates + j] = ((i==j) ? 0 : 1);
491 }
492 break;
493 }
494 clearAllPartialLH();
495 }
496
loadCostMatrixFile(char * file_name)497 void PhyloTree::loadCostMatrixFile(char * file_name){
498 if(cost_matrix){
499 aligned_free(cost_matrix);
500 cost_matrix = NULL;
501 }
502 // if(strcmp(file_name, "fitch") == 0)
503 //// if(file_name == NULL)
504 // cost_matrix = new SankoffCostMatrix(aln->num_states);
505 // else
506 // cost_matrix = new SankoffCostMatrix(file_name);
507
508 int cost_nstates;
509 if(strcmp(file_name, "fitch") == 0 || strcmp(file_name, "e") == 0) { // uniform cost
510 cost_nstates = aln->num_states;
511 cost_matrix = aligned_alloc<unsigned int>(cost_nstates * cost_nstates);
512 for(int i = 0; i < cost_nstates; i++)
513 for(int j = 0; j < cost_nstates; j++){
514 if(j == i) cost_matrix[i * cost_nstates + j] = 0;
515 else cost_matrix[i * cost_nstates + j] = 1;
516 }
517 } else{ // Sankoff cost
518 cout << "Loading cost matrix from " << file_name << "..." << endl;
519 ifstream fin(file_name);
520 if(!fin.is_open()){
521 outError("Reading cost matrix file cannot perform. Please check your input file!");
522 }
523 fin >> cost_nstates;
524 if (cost_nstates != aln->num_states)
525 outError("Cost matrix file does not have the same size as number of alignment states");
526 // allocate memory for cost_matrix
527 cost_matrix = aligned_alloc<unsigned int>(cost_nstates * cost_nstates);
528
529 // read numbers from file
530 for(int i = 0; i < cost_nstates; i++){
531 for(int j = 0; j < cost_nstates; j++)
532 fin >> cost_matrix[i * cost_nstates + j];
533 }
534
535 fin.close();
536
537 }
538
539 int i, j, k;
540 bool changed = false;
541
542 for (k = 0; k < cost_nstates; k++)
543 for (i = 0; i < cost_nstates; i++)
544 for (j = 0; j < cost_nstates; j++)
545 if (cost_matrix[i*cost_nstates+j] > cost_matrix[i*cost_nstates+k] + cost_matrix[k*cost_nstates+j]) {
546 changed = true;
547 cost_matrix[i*cost_nstates+j] = cost_matrix[i*cost_nstates+k] + cost_matrix[k*cost_nstates+j];
548 }
549
550 if (changed) {
551 cout << "WARING: Cost matrix does not satisfy triangular inenquality and is automatically fixed to:" << endl;
552 cout << cost_nstates << endl;
553 for (i = 0; i < cost_nstates; i++) {
554 for (j = 0; j < cost_nstates; j++)
555 cout << " " << cost_matrix[i*cost_nstates+j];
556 cout << endl;
557 }
558 } else {
559 cout << "Cost matrix satisfies triangular inenquality" << endl;
560 }
561 }
562
computeTipPartialParsimony()563 void PhyloTree::computeTipPartialParsimony() {
564 if ((tip_partial_lh_computed & 2) != 0)
565 return;
566 tip_partial_lh_computed |= 2;
567
568 int i, state, nstates = aln->num_states;
569
570 size_t nptn = aln->ordered_pattern.size();
571 size_t maxptn = get_safe_upper_limit_float(nptn);
572 int ptn;
573 for (ptn = 0; ptn < nptn; ptn++)
574 ptn_freq_pars[ptn] = aln->ordered_pattern[ptn].frequency;
575 for (ptn = nptn; ptn < maxptn; ptn++)
576 ptn_freq_pars[ptn] = 0;
577
578
579 ASSERT(tip_partial_pars);
580 // ambiguous characters
581 int ambi_aa[] = {
582 4+8, // B = N or D
583 32+64, // Z = Q or E
584 512+1024 // U = I or L
585 };
586
587 memset(tip_partial_pars, 0, (aln->STATE_UNKNOWN+1)*nstates*sizeof(UINT));
588
589 // initialize real states with cost_matrix
590 memcpy(tip_partial_pars, cost_matrix, nstates*nstates*sizeof(UINT));
591
592 UINT *this_tip_partial_pars;
593
594 switch (aln->seq_type) {
595 case SEQ_DNA:
596 for (state = 4; state < 18; state++) {
597 int cstate = state-nstates+1;
598 this_tip_partial_pars = &tip_partial_pars[state*nstates];
599 for (i = 0; i < nstates; i++) {
600 if ((cstate) & (1 << i))
601 this_tip_partial_pars[i] = 0;
602 else {
603 this_tip_partial_pars[i] = UINT_MAX;
604 for (int j = 0; j < nstates; j++)
605 if ((cstate) & (1 << j))
606 this_tip_partial_pars[i] = min(this_tip_partial_pars[i], cost_matrix[i*nstates+j]);
607 }
608 }
609 }
610 break;
611 case SEQ_PROTEIN:
612 for (state = 0; state < sizeof(ambi_aa)/sizeof(int); state++) {
613 this_tip_partial_pars = &tip_partial_pars[(state+20)*nstates];
614 for (i = 0; i < nstates; i++) {
615 if (ambi_aa[state] & (1 << i))
616 this_tip_partial_pars[i] = 0;
617 else {
618 this_tip_partial_pars[i] = UINT_MAX;
619 for (int j = 0; j < nstates; j++)
620 if (ambi_aa[state] & (1 << j))
621 this_tip_partial_pars[i] = min(this_tip_partial_pars[i], cost_matrix[i*nstates+j]);
622 }
623 }
624 }
625 break;
626 case SEQ_POMO:
627 ASSERT(0 && "POMO not handled with Sankoff parsimony");
628 break;
629 default:
630 break;
631 }
632 }
633 /**
634 compute partial parsimony score of the subtree rooted at dad
635 @param dad_branch the branch leading to the subtree
636 @param dad its dad, used to direct the traversal
637 */
computePartialParsimonySankoff(PhyloNeighbor * dad_branch,PhyloNode * dad)638 void PhyloTree::computePartialParsimonySankoff(PhyloNeighbor *dad_branch, PhyloNode *dad){
639 // don't recompute the parsimony
640 if (dad_branch->partial_lh_computed & 2)
641 return;
642
643 Node *node = dad_branch->node;
644 //assert(node->degree() <= 3);
645 /*
646 if(aln->num_states != cost_nstates){
647 cout << "Your cost matrix is not compatible with the alignment"
648 << " in terms of number of states. Please check!" << endl;
649 exit(1);
650 }
651 */
652 int nstates = aln->num_states;
653 assert(dad_branch->partial_pars);
654
655 int pars_block_size = getBitsBlockSize();
656
657
658 // internal node
659 UINT i, j, ptn, min_child_ptn_pars;
660
661 UINT * partial_pars = dad_branch->partial_pars;
662 memset(partial_pars, 0, sizeof(UINT)*pars_block_size);
663
664 PhyloNeighbor *left = NULL, *right = NULL;
665
666 FOR_NEIGHBOR_IT(node, dad, it)
667 if ((*it)->node->name != ROOT_NAME) {
668 if (!(*it)->node->isLeaf())
669 computePartialParsimonySankoff((PhyloNeighbor*) (*it), (PhyloNode*) node);
670 if (!left)
671 left = ((PhyloNeighbor*)*it);
672 else
673 right = ((PhyloNeighbor*)*it);
674 }
675
676 if (!left->node->isLeaf() && right->node->isLeaf()) {
677 // swap leaf and internal node
678 PhyloNeighbor *tmp = left;
679 left = right;
680 right = tmp;
681 }
682 ASSERT(node->degree() >= 3);
683
684 if (node->degree() > 3) {
685 // multifurcating node
686 for (ptn = 0; ptn < aln->ordered_pattern.size(); ptn++) {
687 int ptn_start_index = ptn*nstates;
688 UINT *partial_pars_ptr = &partial_pars[ptn_start_index];
689
690 FOR_NEIGHBOR_IT(node, dad, it) if ((*it)->node->name != ROOT_NAME) {
691 if ((*it)->node->isLeaf()) {
692 // leaf node
693 UINT *partial_pars_child_ptr = &tip_partial_pars[aln->ordered_pattern[ptn][(*it)->node->id]*nstates];
694
695 for(i = 0; i < nstates; i++){
696 partial_pars_ptr[i] += partial_pars_child_ptr[i];
697 }
698 } else {
699 // internal node
700 UINT *partial_pars_child_ptr = &((PhyloNeighbor*) (*it))->partial_pars[ptn_start_index];
701 UINT *cost_matrix_ptr = cost_matrix;
702
703 for (i = 0; i < nstates; i++){
704 // min(j->i) from child_branch
705 min_child_ptn_pars = partial_pars_child_ptr[0] + cost_matrix_ptr[0];
706 for(j = 1; j < nstates; j++) {
707 UINT value = partial_pars_child_ptr[j] + cost_matrix_ptr[j];
708 min_child_ptn_pars = min(value, min_child_ptn_pars);
709 }
710 partial_pars_ptr[i] += min_child_ptn_pars;
711 cost_matrix_ptr += nstates;
712 }
713 }
714 }
715 }
716 } else if (left->node->isLeaf() && right->node->isLeaf()) {
717 // tip-tip case
718 for (ptn = 0; ptn < aln->ordered_pattern.size(); ptn++){
719 // ignore const ptn because it does not affect pars score
720 //if (aln->at(ptn).isConst()) continue;
721 int ptn_start_index = ptn*nstates;
722
723 UINT *left_ptr = &tip_partial_pars[aln->ordered_pattern[ptn][left->node->id]*nstates];
724 UINT *right_ptr = &tip_partial_pars[aln->ordered_pattern[ptn][right->node->id]*nstates];
725 UINT *partial_pars_ptr = &partial_pars[ptn_start_index];
726
727 for (i = 0; i < nstates; i++){
728 // min(j->i) from child_branch
729 partial_pars_ptr[i] = left_ptr[i] + right_ptr[i];
730 }
731 }
732 } else if (left->node->isLeaf() && !right->node->isLeaf()) {
733 // tip-inner case
734 for (ptn = 0; ptn < aln->ordered_pattern.size(); ptn++){
735 // ignore const ptn because it does not affect pars score
736 //if (aln->at(ptn).isConst()) continue;
737 int ptn_start_index = ptn*nstates;
738
739 UINT *left_ptr = &tip_partial_pars[aln->ordered_pattern[ptn][left->node->id]*nstates];
740 UINT *right_ptr = &right->partial_pars[ptn_start_index];
741 UINT *partial_pars_ptr = &partial_pars[ptn_start_index];
742 UINT *cost_matrix_ptr = cost_matrix;
743 UINT right_contrib;
744
745 for(i = 0; i < nstates; i++){
746 // min(j->i) from child_branch
747 right_contrib = right_ptr[0] + cost_matrix_ptr[0];
748 for(j = 1; j < nstates; j++) {
749 right_contrib = min(right_ptr[j] + cost_matrix_ptr[j], right_contrib);
750 }
751 partial_pars_ptr[i] = left_ptr[i] + right_contrib;
752 cost_matrix_ptr += nstates;
753 }
754 }
755 } else {
756 // inner-inner case
757 for (ptn = 0; ptn < aln->ordered_pattern.size(); ptn++){
758 // ignore const ptn because it does not affect pars score
759 //if (aln->at(ptn).isConst()) continue;
760 int ptn_start_index = ptn*nstates;
761
762 UINT *left_ptr = &left->partial_pars[ptn_start_index];
763 UINT *right_ptr = &right->partial_pars[ptn_start_index];
764 UINT *partial_pars_ptr = &partial_pars[ptn_start_index];
765 UINT *cost_matrix_ptr = cost_matrix;
766 UINT left_contrib, right_contrib;
767
768 for(i = 0; i < nstates; i++){
769 // min(j->i) from child_branch
770 left_contrib = left_ptr[0] + cost_matrix_ptr[0];
771 right_contrib = right_ptr[0] + cost_matrix_ptr[0];
772 for(j = 1; j < nstates; j++) {
773 left_contrib = min(left_ptr[j] + cost_matrix_ptr[j], left_contrib);
774 right_contrib = min(right_ptr[j] + cost_matrix_ptr[j], right_contrib);
775 }
776 partial_pars_ptr[i] = left_contrib+right_contrib;
777 cost_matrix_ptr += nstates;
778 }
779 }
780
781 }
782
783 dad_branch->partial_lh_computed |= 2;
784 }
785
786 /**
787 compute tree parsimony score based on a particular branch
788 @param dad_branch the branch leading to the subtree
789 @param dad its dad, used to direct the traversal
790 @param branch_subst (OUT) if not NULL, the number of substitutions on this branch
791 @return parsimony score of the tree
792 */
computeParsimonyBranchSankoff(PhyloNeighbor * dad_branch,PhyloNode * dad,int * branch_subst)793 int PhyloTree::computeParsimonyBranchSankoff(PhyloNeighbor *dad_branch, PhyloNode *dad, int *branch_subst) {
794
795 if ((tip_partial_lh_computed & 2) == 0)
796 computeTipPartialParsimony();
797
798 PhyloNode *node = (PhyloNode*) dad_branch->node;
799 PhyloNeighbor *node_branch = (PhyloNeighbor*) node->findNeighbor(dad);
800 assert(node_branch);
801
802 if (!central_partial_pars)
803 initializeAllPartialPars();
804
805 // DTH: I don't really understand what this is for. ###########
806 // swap node and dad if dad is a leaf
807 if (node->isLeaf()) {
808 PhyloNode *tmp_node = dad;
809 dad = node;
810 node = tmp_node;
811 PhyloNeighbor *tmp_nei = dad_branch;
812 dad_branch = node_branch;
813 node_branch = tmp_nei;
814 // cout << "swapped\n";
815 }
816
817 //int nptn = aln->size();
818 // if(!_pattern_pars) _pattern_pars = aligned_alloc<BootValTypePars>(nptn+VCSIZE_USHORT);
819 // memset(_pattern_pars, 0, sizeof(BootValTypePars) * (nptn+VCSIZE_USHORT));
820
821 if ((dad_branch->partial_lh_computed & 2) == 0 && !node->isLeaf())
822 computePartialParsimonySankoff(dad_branch, dad);
823 if ((node_branch->partial_lh_computed & 2) == 0 && !dad->isLeaf())
824 computePartialParsimonySankoff(node_branch, node);
825
826 // now combine likelihood at the branch
827 UINT tree_pars = 0;
828 int nstates = aln->num_states;
829 UINT i, j, ptn;
830 UINT branch_pars = 0;
831
832 if (dad->isLeaf()) {
833 // external node
834 for (ptn = 0; ptn < aln->ordered_pattern.size(); ptn++){
835 int ptn_start_index = ptn * nstates;
836 UINT *node_branch_ptr = &tip_partial_pars[aln->ordered_pattern[ptn][dad->id]*nstates];
837 UINT *dad_branch_ptr = &dad_branch->partial_pars[ptn_start_index];
838 UINT min_ptn_pars = node_branch_ptr[0] + dad_branch_ptr[0];
839 UINT br_ptn_pars = node_branch_ptr[0];
840 for (i = 1; i < nstates; i++){
841 // min(j->i) from node_branch
842 UINT min_score = node_branch_ptr[i] + dad_branch_ptr[i];
843 if (min_score < min_ptn_pars) {
844 min_ptn_pars = min_score;
845 br_ptn_pars = node_branch_ptr[i];
846 }
847 }
848 //_pattern_pars[ptn] = min_ptn_pars;
849 tree_pars += min_ptn_pars * aln->ordered_pattern[ptn].frequency;
850 branch_pars += br_ptn_pars * aln->ordered_pattern[ptn].frequency;
851 }
852 } else {
853 // internal node
854 for (ptn = 0; ptn < aln->ordered_pattern.size(); ptn++){
855 int ptn_start_index = ptn * nstates;
856 UINT *node_branch_ptr = &node_branch->partial_pars[ptn_start_index];
857 UINT *dad_branch_ptr = &dad_branch->partial_pars[ptn_start_index];
858 UINT *cost_matrix_ptr = cost_matrix;
859 UINT min_ptn_pars = UINT_MAX;
860 UINT br_ptn_pars = UINT_MAX;
861 for(i = 0; i < nstates; i++){
862 // min(j->i) from node_branch
863 UINT min_score = node_branch_ptr[0] + cost_matrix_ptr[0];
864 UINT branch_score = cost_matrix_ptr[0];
865 for(j = 1; j < nstates; j++) {
866 UINT value = node_branch_ptr[j] + cost_matrix_ptr[j];
867 if (value < min_score) {
868 min_score = value;
869 branch_score = cost_matrix_ptr[j];
870 }
871
872 }
873 min_score = min_score + dad_branch_ptr[i];
874 if (min_score < min_ptn_pars) {
875 min_ptn_pars = min_score;
876 br_ptn_pars = branch_score;
877 }
878 cost_matrix_ptr += nstates;
879 }
880 //_pattern_pars[ptn] = min_ptn_pars;
881 tree_pars += min_ptn_pars * aln->ordered_pattern[ptn].frequency;
882 branch_pars += br_ptn_pars * aln->ordered_pattern[ptn].frequency;
883 }
884 }
885 if (branch_subst)
886 *branch_subst = branch_pars;
887 // cout << endl;
888 return tree_pars;
889 }
890
891 /****************************************************************************
892 Stepwise addition (greedy) by maximum parsimony
893 ****************************************************************************/
894
895 // random generator function:
896 //ptrdiff_t myrandom(ptrdiff_t i) {
897 // return random_int(i);
898 //}
899
900 // pointer object to it:
901 //ptrdiff_t (*p_myrandom)(ptrdiff_t) = myrandom;
902
create3TaxonTree(IntVector & taxon_order,int * rand_stream)903 void PhyloTree::create3TaxonTree(IntVector &taxon_order, int *rand_stream) {
904 freeNode();
905 int nseq = aln->getNSeq();
906 taxon_order.resize(nseq);
907 for (int i = 0; i < nseq; i++)
908 taxon_order[i] = i;
909 // randomize the addition order
910 my_random_shuffle(taxon_order.begin(), taxon_order.end(), rand_stream);
911
912 root = newNode(nseq);
913
914 // create star tree
915 for (leafNum = 0; leafNum < 3; leafNum++) {
916 if (leafNum < 3 && verbose_mode >= VB_MAX)
917 cout << "Add " << aln->getSeqName(taxon_order[leafNum]) << " to the tree" << endl;
918 Node *new_taxon = newNode(taxon_order[leafNum], aln->getSeqName(taxon_order[leafNum]).c_str());
919 root->addNeighbor(new_taxon, -1.0);
920 new_taxon->addNeighbor(root, -1.0);
921 }
922 root = root->neighbors[0]->node;
923 }
924
copyConstraintTree(MTree * tree,IntVector & taxon_order,int * rand_stream)925 void PhyloTree::copyConstraintTree(MTree *tree, IntVector &taxon_order, int *rand_stream) {
926 MTree::copyTree(tree);
927 // assign proper taxon IDs
928 NodeVector nodes;
929 NodeVector::iterator it;
930 getTaxa(nodes);
931 leafNum = nodes.size();
932 vector<int> pushed;
933 pushed.resize(aln->getNSeq(), 0);
934
935 // name map for fast lookup
936 StrVector seq_names = aln->getSeqNames();
937 StringIntMap name2id;
938 for (auto sit = seq_names.begin(); sit != seq_names.end(); sit++)
939 name2id[*sit] = sit - seq_names.begin();
940
941 // reindex taxon ID from alignment
942 for (it = nodes.begin(); it != nodes.end(); it++) {
943 (*it)->id = name2id[(*it)->name];
944 ASSERT((*it)->id >= 0);
945 taxon_order.push_back((*it)->id);
946 pushed[(*it)->id] = 1;
947 }
948 ASSERT(taxon_order.size() == constraintTree.leafNum);
949
950 // reindex internal nodes properly
951 nodes.clear();
952 getInternalNodes(nodes);
953 for (it = nodes.begin(); it != nodes.end(); it++)
954 (*it)->id = aln->getNSeq() + (it - nodes.begin());
955
956 // add the remaining taxa
957 for (int i = 0; i < aln->getNSeq(); i++)
958 if (!pushed[i]) {
959 taxon_order.push_back(i);
960 }
961 // randomize the addition order
962 my_random_shuffle(taxon_order.begin()+constraintTree.leafNum, taxon_order.end(), rand_stream);
963 }
964
965 /**
966 get all neighboring branches to a removed node
967 */
getNeiBranches(NeighborVec & removed_nei,NodeVector & attached_node,NodeVector & added_nodes,int i,NodeVector & nodes1,NodeVector & nodes2)968 void getNeiBranches(NeighborVec &removed_nei, NodeVector &attached_node, NodeVector &added_nodes, int i,
969 NodeVector &nodes1, NodeVector &nodes2)
970 {
971 // get target branches surrounding attached_node
972 FOR_NEIGHBOR_IT(attached_node[i], NULL, it) {
973 if (attached_node[i]->id < (*it)->node->id) {
974 nodes1.push_back(attached_node[i]);
975 nodes2.push_back((*it)->node);
976 } else {
977 nodes2.push_back(attached_node[i]);
978 nodes1.push_back((*it)->node);
979 }
980 }
981 // get target branches surrounding previous added_nodes
982 int j;
983 for (j = i-1; j >= 0; j--) {
984 if (attached_node[j] != attached_node[i])
985 break;
986 Node *node = added_nodes[j];
987 FOR_NEIGHBOR_IT(node, NULL, it) {
988 if (node->id < (*it)->node->id) {
989 bool present = false;
990 for (int k = 0; k < nodes1.size(); k++)
991 if (node == nodes1[k] && (*it)->node == nodes2[k]) {
992 present = true;
993 break;
994 }
995 if (present) continue;
996 nodes1.push_back(node);
997 nodes2.push_back((*it)->node);
998 } else {
999 bool present = false;
1000 for (int k = 0; k < nodes1.size(); k++)
1001 if (node == nodes2[k] && (*it)->node == nodes1[k]) {
1002 present = true;
1003 break;
1004 }
1005 if (present) continue;
1006 nodes2.push_back(node);
1007 nodes1.push_back((*it)->node);
1008 }
1009 }
1010 // check that exactly two branches are added
1011 }
1012 ASSERT(nodes1.size() == 3 + (i-j-1)*2);
1013
1014 }
1015
insertNode2Branch(Node * added_node,Node * target_node,Node * target_dad)1016 void PhyloTree::insertNode2Branch(Node* added_node, Node* target_node, Node* target_dad) {
1017 target_node->updateNeighbor(target_dad, added_node, -1.0);
1018 target_dad->updateNeighbor(target_node, added_node, -1.0);
1019 added_node->updateNeighbor((Node*) 1, target_node, -1.0);
1020 added_node->updateNeighbor((Node*) 2, target_dad, -1.0);
1021 ((PhyloNeighbor*) added_node->findNeighbor(target_node))->partial_pars =
1022 ((PhyloNeighbor*) target_dad->findNeighbor(added_node))->partial_pars;
1023 ((PhyloNeighbor*) added_node->findNeighbor(target_dad))->partial_pars =
1024 ((PhyloNeighbor*) target_node->findNeighbor(added_node))->partial_pars;
1025
1026 ((PhyloNeighbor*) added_node->findNeighbor(target_node))->partial_lh_computed =
1027 ((PhyloNeighbor*) target_dad->findNeighbor(added_node))->partial_lh_computed;
1028 ((PhyloNeighbor*) added_node->findNeighbor(target_dad))->partial_lh_computed =
1029 ((PhyloNeighbor*) target_node->findNeighbor(added_node))->partial_lh_computed;
1030
1031 PhyloNode *ass_node = (PhyloNode*)added_node->neighbors[0]->node;
1032 ((PhyloNeighbor*)ass_node->findNeighbor(added_node))->clearPartialLh();
1033 ass_node->clearReversePartialLh((PhyloNode*)added_node);
1034 }
1035
computeParsimonyTree(const char * out_prefix,Alignment * alignment,int * rand_stream)1036 int PhyloTree::computeParsimonyTree(const char *out_prefix, Alignment *alignment, int *rand_stream) {
1037 aln = alignment;
1038 int nseq = aln->getNSeq();
1039 if (nseq < 3)
1040 outError(ERR_FEW_TAXA);
1041
1042 IntVector taxon_order;
1043 taxon_order.reserve(aln->getNSeq());
1044
1045 NeighborVec removed_nei; // removed Neighbor
1046 NodeVector attached_node; // node attached to removed Neighbor
1047 NodeVector added_nodes; // newly added nodes
1048 int newNodeID;
1049 size_t index;
1050 size_t pars_block_size = getBitsBlockSize();
1051
1052 if (constraintTree.empty()) {
1053 create3TaxonTree(taxon_order, rand_stream);
1054 ASSERT(leafNum == 3);
1055 initializeAllPartialPars();
1056 index = (2*leafNum-3)*2;
1057 newNodeID = nseq + leafNum - 2;
1058 } else {
1059 // first copy the constraint tree
1060 copyConstraintTree(&constraintTree, taxon_order, rand_stream);
1061 newNodeID = nodeNum - leafNum + nseq;
1062 index = (branchNum)*2;
1063
1064 // initialize partial_pars to reuse later
1065 initializeAllPartialPars();
1066
1067 // extract a bifurcating subtree and get removed nodes to insert later
1068 extractBifurcatingSubTree(removed_nei, attached_node, rand_stream);
1069
1070 added_nodes.reserve(removed_nei.size());
1071 }
1072 if (verbose_mode >= VB_MAX)
1073 cout << "computeParsimony: " << computeParsimony() << endl;
1074
1075 //UINT *tmp_partial_pars;
1076 //tmp_partial_pars = newBitsBlock();
1077 if (nseq == 3)
1078 best_pars_score = computeParsimony();
1079
1080 best_pars_score = 0;
1081 if (leafNum == nseq) {
1082 outWarning("Constraint tree has all taxa and is bifurcating, which strictly enforces final tree!");
1083 }
1084
1085 // stepwise adding the next taxon for the remaining taxa
1086 for (int step = 0; leafNum < nseq; step++) {
1087 NodeVector nodes1, nodes2;
1088 PhyloNode *target_node = NULL;
1089 PhyloNode *target_dad = NULL;
1090 best_pars_score = UINT_MAX;
1091
1092 // create a new node attached to new taxon or removed node
1093 PhyloNode *added_node = (PhyloNode*)newNode(newNodeID++);
1094 PhyloNode *new_taxon;
1095
1096 if (step < removed_nei.size()) {
1097 // add the removed_nei (from constraint tree) back to the tree
1098 getNeiBranches(removed_nei, attached_node, added_nodes, step, nodes1, nodes2);
1099 new_taxon = (PhyloNode*)removed_nei[step]->node;
1100 added_node->neighbors.push_back(removed_nei[step]);
1101 new_taxon->updateNeighbor(attached_node[step], added_node);
1102 added_nodes.push_back(added_node);
1103 } else {
1104 // add new taxon to the tree
1105 if (verbose_mode >= VB_MAX)
1106 cout << "Adding " << aln->getSeqName(taxon_order[leafNum]) << " to the tree..." << endl;
1107 getBranches(nodes1, nodes2);
1108
1109 // allocate a new taxon
1110 new_taxon = (PhyloNode*)newNode(taxon_order[leafNum], aln->getSeqName(taxon_order[leafNum]).c_str());
1111
1112 // link new_taxon and added_node
1113 added_node->addNeighbor(new_taxon, -1.0);
1114 new_taxon->addNeighbor(added_node, -1.0);
1115
1116 // allocate memory
1117 ((PhyloNeighbor*)new_taxon->findNeighbor(added_node))->partial_pars = central_partial_pars + ((index++) * pars_block_size);
1118 ((PhyloNeighbor*)added_node->findNeighbor(new_taxon))->partial_pars = central_partial_pars + ((index++) * pars_block_size);
1119 }
1120 // preserve two neighbors
1121 added_node->addNeighbor((Node*) 1, -1.0);
1122 added_node->addNeighbor((Node*) 2, -1.0);
1123
1124 for (int nodeid = 0; nodeid < nodes1.size(); nodeid++) {
1125
1126 int score = addTaxonMPFast(new_taxon, added_node, nodes1[nodeid], nodes2[nodeid]);
1127 if (score < best_pars_score) {
1128 best_pars_score = score;
1129 target_node = (PhyloNode*)nodes1[nodeid];
1130 target_dad = (PhyloNode*)nodes2[nodeid];
1131 }
1132 }
1133
1134 if (verbose_mode >= VB_MAX)
1135 cout << ", score = " << best_pars_score << endl;
1136 // now insert the new node in the middle of the branch node-dad
1137 insertNode2Branch(added_node, target_node, target_dad);
1138
1139 // assign partial_pars storage
1140 ((PhyloNeighbor*)target_dad->findNeighbor(added_node))->clearPartialLh();
1141 ((PhyloNeighbor*)target_dad->findNeighbor(added_node))->partial_pars = central_partial_pars + ((index++) * pars_block_size);
1142 ((PhyloNeighbor*)target_node->findNeighbor(added_node))->clearPartialLh();
1143 ((PhyloNeighbor*)target_node->findNeighbor(added_node))->partial_pars = central_partial_pars + ((index++) * pars_block_size);
1144
1145 target_dad->clearReversePartialLh(added_node);
1146 target_node->clearReversePartialLh(added_node);
1147
1148 // increase number of taxa
1149 leafNum += getNumTaxa(new_taxon, added_node);
1150 }
1151
1152 ASSERT(index == 4*leafNum-6);
1153
1154 nodeNum = 2 * leafNum - 2;
1155 initializeTree();
1156 // parsimony tree is always unrooted
1157 bool orig_rooted = rooted;
1158 rooted = false;
1159 setAlignment(alignment);
1160 // initializeAllPartialPars();
1161 // clearAllPartialLH();
1162 fixNegativeBranch(true);
1163 // convert to rooted tree if originally so
1164 if (orig_rooted)
1165 convertToRooted();
1166 if (out_prefix) {
1167 string file_name = out_prefix;
1168 file_name += ".parstree";
1169 printTree(file_name.c_str(), WT_NEWLINE + WT_BR_LEN);
1170 }
1171 // if (isSuperTree())
1172 // ((PhyloSuperTree*)this)->mapTrees();
1173
1174 return best_pars_score;
1175 }
1176
addTaxonMPFast(Node * added_taxon,Node * added_node,Node * node,Node * dad)1177 int PhyloTree::addTaxonMPFast(Node *added_taxon, Node* added_node, Node* node, Node* dad) {
1178
1179 // now insert the new node in the middle of the branch node-dad
1180 insertNode2Branch(added_node, node, dad);
1181
1182 // compute the likelihood
1183 int score = computeParsimonyBranch((PhyloNeighbor*)added_taxon->findNeighbor(added_node), (PhyloNode*)added_taxon);
1184
1185 // remove the added node
1186 node->updateNeighbor(added_node, dad);
1187 dad->updateNeighbor(added_node, node);
1188 added_node->updateNeighbor(node, (Node*) 1);
1189 added_node->updateNeighbor(dad, (Node*) 2);
1190
1191 // set partial_pars to COMPUTED
1192 ((PhyloNeighbor*)node->findNeighbor(dad))->partial_lh_computed |= 2;
1193 ((PhyloNeighbor*)dad->findNeighbor(node))->partial_lh_computed |= 2;
1194
1195 // now tranverse the tree downwards
1196
1197 // FOR_NEIGHBOR_IT(node, dad, it){
1198 // addTaxonMPFast(added_node, target_node, target_dad, target_partial_pars, (*it)->node, node);
1199 // }
1200 return score;
1201
1202 }
1203
extractBifurcatingSubTree(NeighborVec & removed_nei,NodeVector & attached_node,int * rand_stream)1204 void PhyloTree::extractBifurcatingSubTree(NeighborVec &removed_nei, NodeVector &attached_node, int *rand_stream) {
1205 NodeVector nodes;
1206 getMultifurcatingNodes(nodes);
1207 if (nodes.empty())
1208 return;
1209
1210 int i;
1211
1212 computeBranchDirection();
1213
1214 // firstly make bifurcating tree
1215 for (NodeVector::iterator it = nodes.begin(); it != nodes.end(); it++)
1216 {
1217 Node *node = (*it);
1218 int id[3];
1219 id[0] = -1;
1220 // find the neighbor toward root to preserve root
1221 for (i = 0; i < node->neighbors.size(); i++)
1222 if (((PhyloNeighbor*)node->neighbors[i])->direction == TOWARD_ROOT) {
1223 id[0] = i;
1224 break;
1225 }
1226 ASSERT(id[0] >= 0);
1227 // randomly choose 2 neighbors to reserve
1228 do {
1229 id[1] = random_int(node->degree(), rand_stream);
1230 } while (id[1] == id[0]);
1231
1232 do {
1233 id[2] = random_int(node->degree(), rand_stream);
1234 } while (id[2] == id[0] || id[2] == id[1]);
1235
1236 std::sort(id, id+3);
1237
1238 // remove taxa
1239 int cur_size = removed_nei.size();
1240 for (i = 0; i < node->degree(); i++)
1241 if (i != id[0] && i != id[1] && i != id[2]) {
1242 removed_nei.push_back(node->neighbors[i]);
1243 attached_node.push_back(node);
1244 }
1245 // randomize removed_nei
1246 my_random_shuffle(removed_nei.begin() + cur_size, removed_nei.end(), rand_stream);
1247
1248 // remove neigbors to make bifurcating tree
1249 node->neighbors[0] = node->neighbors[id[0]];
1250 node->neighbors[1] = node->neighbors[id[1]];
1251 node->neighbors[2] = node->neighbors[id[2]];
1252 node->neighbors.resize(3);
1253 }
1254
1255 leafNum = getNumTaxa();
1256
1257 }
1258