1 /***************************************************************************
2 * Copyright (C) 2006 by BUI Quang Minh, Steffen Klaere, Arndt von Haeseler *
3 * minh.bui@univie.ac.at *
4 * *
5 * This program is free software; you can redistribute it and/or modify *
6 * it under the terms of the GNU General Public License as published by *
7 * the Free Software Foundation; either version 2 of the License, or *
8 * (at your option) any later version. *
9 * *
10 * This program is distributed in the hope that it will be useful, *
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
13 * GNU General Public License for more details. *
14 * *
15 * You should have received a copy of the GNU General Public License *
16 * along with this program; if not, write to the *
17 * Free Software Foundation, Inc., *
18 * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
19 ***************************************************************************/
20
21 #ifdef HAVE_CONFIG_H
22 #include <config.h>
23 #endif
24
25 #include <iqtree_config.h>
26
27 #if defined WIN32 || defined _WIN32 || defined __WIN32__
28 //#include <winsock2.h>
29 //#include <windows.h>
30 //extern __declspec(dllexport) int gethostname(char *name, int namelen);
31 #else
32 #include <sys/resource.h>
33 #endif
34
35 //#include "Eigen/Core"
36 #include <stdio.h>
37 #include "tree/phylotree.h"
38 #include <signal.h>
39 #include <cstdio>
40 #include <streambuf>
41 #include <iostream>
42 #include <cstdlib>
43 #include <errno.h>
44 #include "pda/greedy.h"
45 #include "pda/pruning.h"
46 //#include "naivegreedy.h"
47 #include "pda/splitgraph.h"
48 #include "pda/circularnetwork.h"
49 #include "tree/mtreeset.h"
50 #include "tree/mexttree.h"
51 #include "ncl/ncl.h"
52 #include "nclextra/msetsblock.h"
53 #include "nclextra/myreader.h"
54 #include "phyloanalysis.h"
55 #include "tree/matree.h"
56 //#include "ngs.h"
57 #include "obsolete/parsmultistate.h"
58 //#include "gss.h"
59 #include "alignment/maalignment.h" //added by MA
60 #include "tree/ncbitree.h"
61 #include "pda/ecopd.h"
62 #include "tree/upperbounds.h"
63 #include "pda/ecopdmtreeset.h"
64 #include "pda/gurobiwrapper.h"
65 #include "utils/timeutil.h"
66 //#include <unistd.h>
67 #include <stdlib.h>
68 #include "vectorclass/instrset.h"
69
70 #include "utils/MPIHelper.h"
71
72 #ifdef _OPENMP
73 #include <omp.h>
74 #endif
75
76 using namespace std;
77
78
79
generateRandomTree(Params & params)80 void generateRandomTree(Params ¶ms)
81 {
82 if (params.sub_size < 3 && !params.aln_file) {
83 outError(ERR_FEW_TAXA);
84 }
85
86 if (!params.user_file) {
87 outError("Please specify an output tree file name");
88 }
89 ////cout << "Random number seed: " << params.ran_seed << endl << endl;
90
91 SplitGraph sg;
92
93 try {
94
95 if (params.tree_gen == YULE_HARDING || params.tree_gen == CATERPILLAR ||
96 params.tree_gen == BALANCED || params.tree_gen == UNIFORM || params.tree_gen == STAR_TREE) {
97 if (!overwriteFile(params.user_file)) return;
98 ofstream out;
99 out.open(params.user_file);
100 MTree itree;
101
102 if (params.second_tree) {
103 cout << "Generating random branch lengths on tree " << params.second_tree << " ..." << endl;
104 itree.readTree(params.second_tree, params.is_rooted);
105 } else
106 switch (params.tree_gen) {
107 case YULE_HARDING:
108 cout << "Generating random Yule-Harding tree..." << endl;
109 break;
110 case UNIFORM:
111 cout << "Generating random uniform tree..." << endl;
112 break;
113 case CATERPILLAR:
114 cout << "Generating random caterpillar tree..." << endl;
115 break;
116 case BALANCED:
117 cout << "Generating random balanced tree..." << endl;
118 break;
119 case STAR_TREE:
120 cout << "Generating star tree with random external branch lengths..." << endl;
121 break;
122 default: break;
123 }
124 ofstream out2;
125 if (params.num_zero_len) {
126 cout << "Setting " << params.num_zero_len << " internal branches to zero length..." << endl;
127 string str = params.user_file;
128 str += ".collapsed";
129 out2.open(str.c_str());
130 }
131 for (int i = 0; i < params.repeated_time; i++) {
132 MExtTree mtree;
133 if (itree.root) {
134 mtree.copyTree(&itree);
135 mtree.generateRandomBranchLengths(params);
136 } else {
137 mtree.generateRandomTree(params.tree_gen, params);
138 }
139 if (params.num_zero_len) {
140 mtree.setZeroInternalBranches(params.num_zero_len);
141 MExtTree collapsed_tree;
142 collapsed_tree.copyTree(&mtree);
143 collapsed_tree.collapseZeroBranches();
144 collapsed_tree.printTree(out2);
145 out2 << endl;
146 }
147 mtree.printTree(out);
148 out << endl;
149 }
150 out.close();
151 cout << params.repeated_time << " tree(s) printed to " << params.user_file << endl;
152 if (params.num_zero_len) {
153 out2.close();
154 cout << params.repeated_time << " collapsed tree(s) printed to " << params.user_file << ".collapsed" << endl;
155 }
156 }
157 // Generate random trees if optioned
158 else if (params.tree_gen == CIRCULAR_SPLIT_GRAPH) {
159 cout << "Generating random circular split network..." << endl;
160 if (!overwriteFile(params.user_file)) return;
161 sg.generateCircular(params);
162 } else if (params.tree_gen == TAXA_SET) {
163 sg.init(params);
164 cout << "Generating random taxa set of size " << params.sub_size <<
165 " overlap " << params.overlap << " with " << params.repeated_time << " times..." << endl;
166 if (!overwriteFile(params.pdtaxa_file)) return;
167 sg.generateTaxaSet(params.pdtaxa_file, params.sub_size, params.overlap, params.repeated_time);
168 }
169 } catch (bad_alloc) {
170 outError(ERR_NO_MEMORY);
171 } catch (ios::failure) {
172 outError(ERR_WRITE_OUTPUT, params.user_file);
173 }
174
175 // calculate the distance
176 if (params.run_mode == CALC_DIST) {
177 if (params.tree_gen == CIRCULAR_SPLIT_GRAPH) {
178 cout << "Calculating distance matrix..." << endl;
179 sg.calcDistance(params.dist_file);
180 cout << "Distances printed to " << params.dist_file << endl;
181 }// else {
182 //mtree.calcDist(params.dist_file);
183 //}
184 }
185
186 }
187
separator(ostream & out,int type=0)188 inline void separator(ostream &out, int type = 0) {
189 switch (type) {
190 case 0:
191 out << endl << "==============================================================================" << endl;
192 break;
193 case 1:
194 out << endl << "-----------------------------------------------------------" << endl;
195 break;
196 default:
197 break;
198 }
199 }
200
201
printCopyright(ostream & out)202 void printCopyright(ostream &out) {
203 #ifdef IQ_TREE
204 out << "IQ-TREE";
205 #ifdef _IQTREE_MPI
206 out << " MPI";
207 #endif
208 #ifdef _OPENMP
209 out << " multicore";
210 #endif
211 #ifdef __AVX512KNL
212 out << " Xeon Phi KNL";
213 #endif
214 out << " version ";
215 #else
216 out << "PDA - Phylogenetic Diversity Analyzer version ";
217 #endif
218 out << iqtree_VERSION_MAJOR << "." << iqtree_VERSION_MINOR << iqtree_VERSION_PATCH;
219
220 #if defined _WIN32 || defined WIN32
221 out << " for Windows";
222 #elif defined __APPLE__ || defined __MACH__
223 out << " for Mac OS X";
224 #elif defined __linux__
225 out << " for Linux";
226 #elif defined __unix__ || defined __unix
227 out << " for Unix";
228 #else
229 out << " for unknown platform"
230 #endif
231
232 out << " " << 8*sizeof(void*) << "-bit" << " built " << __DATE__;
233 #if defined DEBUG
234 out << " - debug mode";
235 #endif
236
237 #ifdef IQ_TREE
238 out << endl << "Developed by Bui Quang Minh, Nguyen Lam Tung, Olga Chernomor,"
239 << endl << "Heiko Schmidt, Dominik Schrempf, Michael Woodhams." << endl << endl;
240 #else
241 out << endl << "Copyright (c) 2006-2014 Olga Chernomor, Arndt von Haeseler and Bui Quang Minh." << endl << endl;
242 #endif
243 }
244
printRunMode(ostream & out,RunMode run_mode)245 void printRunMode(ostream &out, RunMode run_mode) {
246 switch (run_mode) {
247 case DETECTED: out << "Detected"; break;
248 case GREEDY: out << "Greedy"; break;
249 case PRUNING: out << "Pruning"; break;
250 case BOTH_ALG: out << "Greedy and Pruning"; break;
251 case EXHAUSTIVE: out << "Exhaustive"; break;
252 case DYNAMIC_PROGRAMMING: out << "Dynamic Programming"; break;
253 case LINEAR_PROGRAMMING: out << "Integer Linear Programming"; break;
254 default: outError(ERR_INTERNAL);
255 }
256 }
257
258 /**
259 summarize the running with header
260 */
summarizeHeader(ostream & out,Params & params,bool budget_constraint,InputType analysis_type)261 void summarizeHeader(ostream &out, Params ¶ms, bool budget_constraint, InputType analysis_type) {
262 printCopyright(out);
263 out << "Input tree/split network file name: " << params.user_file << endl;
264 if(params.eco_dag_file)
265 out << "Input food web file name: "<<params.eco_dag_file<<endl;
266 out << "Input file format: " << ((params.intype == IN_NEWICK) ? "Newick" : ( (params.intype == IN_NEXUS) ? "Nexus" : "Unknown" )) << endl;
267 if (params.initial_file != NULL)
268 out << "Initial taxa file: " << params.initial_file << endl;
269 if (params.param_file != NULL)
270 out << "Parameter file: " << params.param_file << endl;
271 out << endl;
272 out << "Type of measure: " << ((params.root != NULL || params.is_rooted) ? "Rooted": "Unrooted") <<
273 (analysis_type== IN_NEWICK ? " phylogenetic diversity (PD)" : " split diversity (SD)");
274 if (params.root != NULL) out << " at " << params.root;
275 out << endl;
276 if (params.run_mode != CALC_DIST && params.run_mode != PD_USER_SET) {
277 out << "Search objective: " << ((params.find_pd_min) ? "Minimum" : "Maximum") << endl;
278 out << "Search algorithm: ";
279 printRunMode(out, params.run_mode);
280 if (params.run_mode == DETECTED) {
281 out << " -> ";
282 printRunMode(out, params.detected_mode);
283 }
284 out << endl;
285 out << "Search option: " << ((params.find_all) ? "Multiple optimal sets" : "Single optimal set") << endl;
286 }
287 out << endl;
288 out << "Type of analysis: ";
289 switch (params.run_mode) {
290 case PD_USER_SET: out << "PD/SD of user sets";
291 if (params.pdtaxa_file) out << " (" << params.pdtaxa_file << ")"; break;
292 case CALC_DIST: out << "Distance matrix computation"; break;
293 default:
294 out << ((budget_constraint) ? "Budget constraint " : "Subset size k ");
295 if (params.intype == IN_NEWICK)
296 out << ((analysis_type == IN_NEWICK) ? "on tree" : "on tree -> split network");
297 else
298 out << "on split network";
299 }
300 out << endl;
301 //out << "Random number seed: " << params.ran_seed << endl;
302 }
303
summarizeFooter(ostream & out,Params & params)304 void summarizeFooter(ostream &out, Params ¶ms) {
305 separator(out);
306 time_t beginTime;
307 time (&beginTime);
308 char *date;
309 date = ctime(&beginTime);
310
311 out << "Time used: " << params.run_time << " seconds." << endl;
312 out << "Finished time: " << date << endl;
313 }
314
315
getMaxNameLen(vector<string> & setName)316 int getMaxNameLen(vector<string> &setName) {
317 int len = 0;
318 for (vector<string>::iterator it = setName.begin(); it != setName.end(); it++)
319 if (len < (*it).length())
320 len = (*it).length();
321 return len;
322 }
323
printPDUser(ostream & out,Params & params,PDRelatedMeasures & pd_more)324 void printPDUser(ostream &out, Params ¶ms, PDRelatedMeasures &pd_more) {
325 out << "List of user-defined sets of taxa with PD score computed" << endl << endl;
326 int maxlen = getMaxNameLen(pd_more.setName)+2;
327 out.width(maxlen);
328 out << "Name" << " PD";
329 if (params.exclusive_pd) out << " excl.-PD";
330 if (params.endemic_pd) out << " PD-Endem.";
331 if (params.complement_area) out << " PD-Compl. given area " << params.complement_area;
332 out << endl;
333 int cnt;
334 for (cnt = 0; cnt < pd_more.setName.size(); cnt++) {
335 out.width(maxlen);
336 out << pd_more.setName[cnt] << " ";
337 out.width(7);
338 out << pd_more.PDScore[cnt] << " ";
339 if (params.exclusive_pd) {
340 out.width(7);
341 out << pd_more.exclusivePD[cnt] << " ";
342 }
343 if (params.endemic_pd) {
344 out.width(7);
345 out << pd_more.PDEndemism[cnt] << " ";
346 }
347 if (params.complement_area) {
348 out.width(8);
349 out << pd_more.PDComplementarity[cnt];
350 }
351 out << endl;
352 }
353 separator(out, 1);
354 }
355
summarizeTree(Params & params,PDTree & tree,vector<PDTaxaSet> & taxa_set,PDRelatedMeasures & pd_more)356 void summarizeTree(Params ¶ms, PDTree &tree, vector<PDTaxaSet> &taxa_set,
357 PDRelatedMeasures &pd_more) {
358 string filename;
359 if (params.out_file == NULL) {
360 filename = params.out_prefix;
361 filename += ".pda";
362 } else
363 filename = params.out_file;
364
365 try {
366 ofstream out;
367 out.exceptions(ios::failbit | ios::badbit);
368 out.open(filename.c_str());
369
370 summarizeHeader(out, params, false, IN_NEWICK);
371 out << "Tree size: " << tree.leafNum-params.is_rooted << " taxa, " <<
372 tree.nodeNum-1-params.is_rooted << " branches" << endl;
373 separator(out);
374
375 vector<PDTaxaSet>::iterator tid;
376
377 if (params.run_mode == PD_USER_SET) {
378 printPDUser(out, params, pd_more);
379 }
380 else if (taxa_set.size() > 1)
381 out << "Optimal PD-sets with k = " << params.min_size-params.is_rooted <<
382 " to " << params.sub_size-params.is_rooted << endl << endl;
383
384
385 int subsize = params.min_size-params.is_rooted;
386 if (params.run_mode == PD_USER_SET) subsize = 1;
387 for (tid = taxa_set.begin(); tid != taxa_set.end(); tid++, subsize++) {
388 if (tid != taxa_set.begin())
389 separator(out, 1);
390 if (params.run_mode == PD_USER_SET) {
391 out << "Set " << subsize << " has PD score of " << tid->score << endl;
392 }
393 else {
394 out << "For k = " << subsize << " the optimal PD score is " << (*tid).score << endl;
395 out << "The optimal PD set has " << subsize << " taxa:" << endl;
396 }
397 for (NodeVector::iterator it = (*tid).begin(); it != (*tid).end(); it++)
398 if ((*it)->name != ROOT_NAME){
399 out << (*it)->name << endl;
400 }
401 if (!tid->tree_str.empty()) {
402 out << endl << "Corresponding sub-tree: " << endl;
403 out << tid->tree_str << endl;
404 }
405 tid->clear();
406 }
407 taxa_set.clear();
408
409 summarizeFooter(out, params);
410 out.close();
411 cout << endl << "Results are summarized in " << filename << endl << endl;
412
413 } catch (ios::failure) {
414 outError(ERR_WRITE_OUTPUT, filename);
415 }
416 }
417
418
printTaxaSet(Params & params,vector<PDTaxaSet> & taxa_set,RunMode cur_mode)419 void printTaxaSet(Params ¶ms, vector<PDTaxaSet> &taxa_set, RunMode cur_mode) {
420 int subsize = params.min_size-params.is_rooted;
421 ofstream out;
422 ofstream scoreout;
423 string filename;
424 filename = params.out_prefix;
425 filename += ".score";
426 scoreout.open(filename.c_str());
427 if (!scoreout.is_open())
428 outError(ERR_WRITE_OUTPUT, filename);
429 cout << "PD scores printed to " << filename << endl;
430
431 if (params.nr_output == 1) {
432 filename = params.out_prefix;
433 filename += ".pdtaxa";
434 out.open(filename.c_str());
435 if (!out.is_open())
436 outError(ERR_WRITE_OUTPUT, filename);
437 }
438 for (vector<PDTaxaSet>::iterator tid = taxa_set.begin(); tid != taxa_set.end(); tid++, subsize++) {
439 if (params.nr_output > 10) {
440 filename = params.out_prefix;
441 filename += ".";
442 filename += subsize;
443 if (params.run_mode == BOTH_ALG) {
444 if (cur_mode == GREEDY)
445 filename += ".greedy";
446 else
447 filename += ".pruning";
448 } else {
449 filename += ".pdtree";
450 }
451 (*tid).printTree((char*)filename.c_str());
452
453 filename = params.out_prefix;
454 filename += ".";
455 filename += subsize;
456 filename += ".pdtaxa";
457 (*tid).printTaxa((char*)filename.c_str());
458 } else {
459 out << subsize << " " << (*tid).score << endl;
460 scoreout << subsize << " " << (*tid).score << endl;
461 (*tid).printTaxa(out);
462 }
463 }
464
465 if (params.nr_output == 1) {
466 out.close();
467 cout << "All taxa list(s) printed to " << filename << endl;
468 }
469
470 scoreout.close();
471 }
472
473
474 /**
475 run PD algorithm on trees
476 */
runPDTree(Params & params)477 void runPDTree(Params ¶ms)
478 {
479
480 if (params.run_mode == CALC_DIST) {
481 bool is_rooted = false;
482 MExtTree tree(params.user_file, is_rooted);
483 cout << "Tree contains " << tree.leafNum << " taxa." << endl;
484 cout << "Calculating distance matrix..." << endl;
485 tree.calcDist(params.dist_file);
486 cout << "Distances printed to " << params.dist_file << endl;
487 return;
488 }
489
490 double t_begin, t_end;
491 //char filename[300];
492 //int idx;
493
494 vector<PDTaxaSet> taxa_set;
495
496 if (params.run_mode == PD_USER_SET) {
497 // compute score of user-defined sets
498 t_begin = getCPUTime();
499 cout << "Computing PD score for user-defined set of taxa..." << endl;
500 PDTree tree(params);
501 PDRelatedMeasures pd_more;
502 tree.computePD(params, taxa_set, pd_more);
503
504 if (params.endemic_pd)
505 tree.calcPDEndemism(taxa_set, pd_more.PDEndemism);
506 if (params.complement_area != NULL)
507 tree.calcPDComplementarity(taxa_set, params.complement_area, pd_more.PDComplementarity);
508
509 t_end = getCPUTime();
510 params.run_time = (t_end-t_begin);
511 summarizeTree(params, tree, taxa_set, pd_more);
512 return;
513 }
514
515
516 /*********************************************
517 run greedy algorithm
518 *********************************************/
519
520 if (params.sub_size < 2) {
521 outError(ERR_NO_K);
522 }
523
524 bool detected_greedy = (params.run_mode != PRUNING);
525
526 Greedy test_greedy;
527
528 test_greedy.init(params);
529
530 if (params.root == NULL && !params.is_rooted)
531 cout << endl << "Running PD algorithm on UNROOTED tree..." << endl;
532 else
533 cout << endl << "Running PD algorithm on ROOTED tree..." << endl;
534
535 if (verbose_mode >= VB_DEBUG)
536 test_greedy.drawTree(cout, WT_INT_NODE + WT_BR_SCALE + WT_BR_LEN);
537
538 if (params.run_mode == GREEDY || params.run_mode == BOTH_ALG ||
539 (params.run_mode == DETECTED)) {
540
541 if (params.run_mode == DETECTED && params.sub_size >= test_greedy.leafNum * 7 / 10
542 && params.min_size < 2)
543 detected_greedy = false;
544
545 if (detected_greedy) {
546 params.detected_mode = GREEDY;
547 t_begin=getCPUTime();
548 cout << endl << "Greedy Algorithm..." << endl;
549
550 taxa_set.clear();
551 test_greedy.run(params, taxa_set);
552
553 t_end=getCPUTime();
554 params.run_time = (t_end-t_begin);
555 cout << "Time used: " << params.run_time << " seconds." << endl;
556 if (params.min_size == params.sub_size)
557 cout << "Resulting tree length = " << taxa_set[0].score << endl;
558
559 if (params.nr_output > 0)
560 printTaxaSet(params, taxa_set, GREEDY);
561
562 PDRelatedMeasures pd_more;
563
564 summarizeTree(params, test_greedy, taxa_set, pd_more);
565 }
566 }
567
568 /*********************************************
569 run pruning algorithm
570 *********************************************/
571 if (params.run_mode == PRUNING || params.run_mode == BOTH_ALG ||
572 (params.run_mode == DETECTED)) {
573
574 Pruning test_pruning;
575
576 if (params.run_mode == PRUNING || params.run_mode == BOTH_ALG) {
577 //Pruning test_pruning(params);
578 test_pruning.init(params);
579 } else if (!detected_greedy) {
580 test_pruning.init(test_greedy);
581 } else {
582 return;
583 }
584 params.detected_mode = PRUNING;
585 t_begin=getCPUTime();
586 cout << endl << "Pruning Algorithm..." << endl;
587 taxa_set.clear();
588 test_pruning.run(params, taxa_set);
589
590 t_end=getCPUTime();
591 params.run_time = (t_end-t_begin) ;
592 cout << "Time used: " << params.run_time << " seconds.\n";
593 if (params.min_size == params.sub_size)
594 cout << "Resulting tree length = " << taxa_set[0].score << endl;
595
596 if (params.nr_output > 0)
597 printTaxaSet(params, taxa_set, PRUNING);
598
599 PDRelatedMeasures pd_more;
600
601 summarizeTree(params, test_pruning, taxa_set, pd_more);
602
603 }
604
605 }
606
checkSplitDistance(ostream & out,PDNetwork & sg)607 void checkSplitDistance(ostream &out, PDNetwork &sg) {
608 mmatrix(double) dist;
609 sg.calcDistance(dist);
610 int ntaxa = sg.getNTaxa();
611 int i, j;
612 bool found = false;
613 for (i = 0; i < ntaxa-1; i++) {
614 bool first = true;
615 for (j = i+1; j < ntaxa; j++)
616 if (abs(dist[i][j]) <= 1e-5) {
617 if (!found) {
618 out << "The following sets of taxa (each set in a line) have very small split-distance" << endl;
619 out << "( <= 1e-5) as computed from the split system. To avoid a lot of multiple" << endl;
620 out << "optimal PD sets to be reported, one should only keep one taxon from each set" << endl;
621 out << "and exclude the rest from the analysis." << endl << endl;
622 }
623 if (first)
624 out << sg.getTaxa()->GetTaxonLabel(i);
625 found = true;
626 first = false;
627 out << ", " << sg.getTaxa()->GetTaxonLabel(j);
628 }
629 if (!first) out << endl;
630 }
631 if (found)
632 separator(out);
633 }
634
635
636
637 /**
638 check if the set are nested and there are no multiple optimal sets.
639 If yes, return the ranking as could be produced by a greedy algorithm
640 */
makeRanking(vector<SplitSet> & pd_set,IntVector & indices,IntVector & ranking)641 bool makeRanking(vector<SplitSet> &pd_set, IntVector &indices, IntVector &ranking) {
642 vector<SplitSet>::iterator it;
643 IntVector::iterator inti;
644 ranking.clear();
645 bool nested = true;
646 Split *cur_sp = NULL;
647 int id = 1;
648 for (it = pd_set.begin(); it != pd_set.end(); it++) {
649 if ((*it).empty()) continue;
650 if ((*it).size() > 1) {
651 nested = false;
652 ranking.push_back(-10);
653 indices.push_back(0);
654 }
655 Split *sp = (*it)[0];
656
657 if (!cur_sp) {
658 IntVector sp_tax;
659 sp->getTaxaList(sp_tax);
660 ranking.insert(ranking.end(), sp_tax.begin(), sp_tax.end());
661 for (inti = sp_tax.begin(); inti != sp_tax.end(); inti++)
662 indices.push_back(id++);
663 } else {
664 if ( !cur_sp->subsetOf(*sp)) {
665 ranking.push_back(-1);
666 indices.push_back(0);
667 nested = false;
668 }
669 Split sp_diff(*sp);
670 sp_diff -= *cur_sp;
671 Split sp_diff2(*cur_sp);
672 sp_diff2 -= *sp;
673 IntVector sp_tax;
674 sp_diff2.getTaxaList(sp_tax);
675 ranking.insert(ranking.end(), sp_tax.begin(), sp_tax.end());
676 for (inti = sp_tax.begin(); inti != sp_tax.end(); inti++)
677 indices.push_back(-id);
678 sp_diff.getTaxaList(sp_tax);
679 ranking.insert(ranking.end(), sp_tax.begin(), sp_tax.end());
680 for (inti = sp_tax.begin(); inti != sp_tax.end(); inti++)
681 indices.push_back(id);
682 if ( !cur_sp->subsetOf(*sp)) {
683 ranking.push_back(-2);
684 indices.push_back(0);
685 }
686 id++;
687 }
688 cur_sp = sp;
689 }
690 return nested;
691 }
692
693
printNexusSets(const char * filename,PDNetwork & sg,vector<SplitSet> & pd_set)694 void printNexusSets(const char *filename, PDNetwork &sg, vector<SplitSet> &pd_set) {
695 try {
696 ofstream out;
697 out.open(filename);
698 out << "#NEXUS" << endl << "BEGIN Sets;" << endl;
699 vector<SplitSet>::iterator it;
700 for (it = pd_set.begin(); it != pd_set.end(); it++) {
701 int id = 1;
702 for (SplitSet::iterator sit = (*it).begin(); sit != (*it).end(); sit++, id++) {
703 IntVector taxa;
704 (*sit)->getTaxaList(taxa);
705 out << " TAXSET Opt_" << taxa.size() << "_" << id << " =";
706 for (IntVector::iterator iit = taxa.begin(); iit != taxa.end(); iit++) {
707 if (sg.isPDArea())
708 out << " '" << sg.getSetsBlock()->getSet(*iit)->name << "'";
709 else
710 out << " '" << sg.getTaxa()->GetTaxonLabel(*iit) << "'";
711 }
712 out << ";" << endl;
713 }
714 }
715 out << "END; [Sets]" << endl;
716 out.close();
717 cout << endl << "Optimal sets are written to nexus file " << filename << endl;
718 } catch (ios::failure) {
719 outError(ERR_WRITE_OUTPUT, filename);
720 }
721
722 }
723
724
725
computeTaxaFrequency(SplitSet & taxa_set,DoubleVector & freq)726 void computeTaxaFrequency(SplitSet &taxa_set, DoubleVector &freq) {
727 ASSERT(taxa_set.size());
728 int ntaxa = taxa_set[0]->getNTaxa();
729 int i;
730
731 freq.resize(ntaxa, 0);
732 for (SplitSet::iterator it2 = taxa_set.begin(); it2 != taxa_set.end(); it2++) {
733 for ( i = 0; i < ntaxa; i++)
734 if ((*it2)->containTaxon(i)) freq[i] += 1.0;
735 }
736
737 for ( i = 0; i < ntaxa; i++)
738 freq[i] /= taxa_set.size();
739
740 }
741
742 /**
743 summarize the running results
744 */
summarizeSplit(Params & params,PDNetwork & sg,vector<SplitSet> & pd_set,PDRelatedMeasures & pd_more,bool full_report)745 void summarizeSplit(Params ¶ms, PDNetwork &sg, vector<SplitSet> &pd_set, PDRelatedMeasures &pd_more, bool full_report) {
746 int i;
747
748
749 if (params.nexus_output) {
750 string nex_file = params.out_prefix;
751 nex_file += ".pdsets.nex";
752 printNexusSets(nex_file.c_str(), sg, pd_set);
753 }
754 string filename;
755 if (params.out_file == NULL) {
756 filename = params.out_prefix;
757 filename += ".pda";
758 } else
759 filename = params.out_file;
760
761 try {
762 ofstream out;
763 out.open(filename.c_str());
764 /****************************/
765 /********** HEADER **********/
766 /****************************/
767 summarizeHeader(out, params, sg.isBudgetConstraint(), IN_NEXUS);
768
769 out << "Network size: " << sg.getNTaxa()-params.is_rooted << " taxa, " <<
770 sg.getNSplits()-params.is_rooted << " splits (of which " <<
771 sg.getNTrivialSplits() << " are trivial splits)" << endl;
772 out << "Network type: " << ((sg.isCircular()) ? "Circular" : "General") << endl;
773
774 separator(out);
775
776 checkSplitDistance(out, sg);
777
778 int c_num = 0;
779 //int subsize = (sg.isBudgetConstraint()) ? params.budget : (params.sub_size-params.is_rooted);
780 //subsize -= pd_set.size()-1;
781 int subsize = (sg.isBudgetConstraint()) ? params.min_budget : params.min_size-params.is_rooted;
782 int stepsize = (sg.isBudgetConstraint()) ? params.step_budget : params.step_size;
783 if (params.detected_mode != LINEAR_PROGRAMMING) stepsize = 1;
784 vector<SplitSet>::iterator it;
785 SplitSet::iterator it2;
786
787
788 if (params.run_mode == PD_USER_SET) {
789 printPDUser(out, params, pd_more);
790 }
791
792 /****************************/
793 /********** SUMMARY *********/
794 /****************************/
795
796 if (params.run_mode != PD_USER_SET && !params.num_bootstrap_samples) {
797 out << "Summary of the PD-score and the number of optimal PD-sets with the same " << endl << "optimal PD-score found." << endl;
798
799 if (sg.isBudgetConstraint())
800 out << endl << "Budget PD-score %PD-score #PD-sets" << endl;
801 else
802 out << endl << "Size-k PD-score %PD-score #PD-sets" << endl;
803
804 int sizex = subsize;
805 double total = sg.calcWeight();
806
807 for (it = pd_set.begin(); it != pd_set.end(); it++, sizex+=stepsize) {
808 out.width(6);
809 out << right << sizex << " ";
810 out.width(10);
811 out << right << (*it).getWeight() << " ";
812 out.width(10);
813 out << right << ((*it).getWeight()/total)*100.0 << " ";
814 out.width(6);
815 out << right << (*it).size();
816 out << endl;
817 }
818
819 out << endl;
820 if (!params.find_all)
821 out << "Note: You did not choose the option to find multiple optimal PD sets." << endl <<
822 "That's why we only reported one PD-set per size-k or budget. If you want" << endl <<
823 "to determine all multiple PD-sets, use the '-a' option.";
824 else {
825 out << "Note: The number of multiple optimal PD sets to be reported is limited to " << params.pd_limit << "." << endl <<
826 "There might be cases where the actual #PD-sets exceeds that upper-limit but" << endl <<
827 "won't be listed here. Please refer to the above list to identify such cases." << endl <<
828 "To increase the upper-limit, use the '-lim <limit_number>' option.";
829 }
830 out << endl;
831 separator(out);
832 }
833
834 if (!full_report) {
835 out.close();
836 return;
837 }
838
839
840 /****************************/
841 /********* BOOTSTRAP ********/
842 /****************************/
843 if (params.run_mode != PD_USER_SET && params.num_bootstrap_samples) {
844 out << "Summary of the bootstrap analysis " << endl;
845 for (it = pd_set.begin(); it != pd_set.end(); it++) {
846 DoubleVector freq;
847 computeTaxaFrequency((*it), freq);
848 out << "For k/budget = " << subsize << " the " << ((sg.isPDArea()) ? "areas" : "taxa")
849 << " supports are: " << endl;
850 for (i = 0; i < freq.size(); i++)
851 out << ((sg.isPDArea()) ? sg.getSetsBlock()->getSet(i)->name : sg.getTaxa()->GetTaxonLabel(i))
852 << "\t" << freq[i] << endl;
853 if ((it+1) != pd_set.end()) separator(out, 1);
854 }
855 out << endl;
856 separator(out);
857 }
858
859 /****************************/
860 /********** RANKING *********/
861 /****************************/
862
863 if (params.run_mode != PD_USER_SET && !params.num_bootstrap_samples) {
864
865
866 IntVector ranking;
867 IntVector index;
868
869 out << "Ranking based on the optimal sets" << endl;
870
871
872 if (!makeRanking(pd_set, index, ranking)) {
873 out << "WARNING: Optimal sets are not nested, so ranking should not be considered stable" << endl;
874 }
875 if (subsize > 1) {
876 out << "WARNING: The first " << subsize << " ranks should be treated equal" << endl;
877 }
878 out << endl << "Rank* ";
879 if (!sg.isPDArea())
880 out << "Taxon names" << endl;
881 else
882 out << "Area names" << endl;
883
884
885 for (IntVector::iterator intv = ranking.begin(), intid = index.begin(); intv != ranking.end(); intv ++, intid++) {
886 if (*intv == -10)
887 out << "<--- multiple optimal set here --->" << endl;
888 else if (*intv == -1)
889 out << "<--- BEGIN: greedy does not work --->" << endl;
890 else if (*intv == -2)
891 out << "<--- END --->" << endl;
892 else {
893 out.width(5);
894 out << right << *intid << " ";
895 if (sg.isPDArea())
896 out << sg.getSetsBlock()->getSet(*intv)->name << endl;
897 else
898 out << sg.getTaxa()->GetTaxonLabel(*intv) << endl;
899 }
900 }
901 out << endl;
902 out << "(*) Negative ranks indicate the point at which the greedy algorithm" << endl <<
903 " does not work. In that case, the corresponding taxon/area names" << endl <<
904 " should be deleted from the optimal set of the same size" << endl;
905 separator(out);
906 }
907
908 int max_len = sg.getTaxa()->GetMaxTaxonLabelLength();
909
910 /****************************/
911 /***** DETAILED SETS ********/
912 /****************************/
913
914 if (params.run_mode != PD_USER_SET)
915 out << "Detailed information of all taxa found in the optimal PD-sets" << endl;
916
917 if (pd_set.size() > 1) {
918 if (sg.isBudgetConstraint())
919 out << "with budget = " << params.min_budget <<
920 " to " << params.budget << endl << endl;
921 else
922 out << "with k = " << params.min_size-params.is_rooted <<
923 " to " << params.sub_size-params.is_rooted << endl << endl;
924 }
925
926 if (params.run_mode != PD_USER_SET)
927 separator(out,1);
928
929 for (it = pd_set.begin(); it != pd_set.end(); it++, subsize+=stepsize) {
930
931 // check if the pd-sets are the same as previous one
932 if (sg.isBudgetConstraint() && it != pd_set.begin()) {
933 vector<SplitSet>::iterator prev, next;
934 for (next=it, prev=it-1; next != pd_set.end() && next->getWeight() == (*prev).getWeight() &&
935 next->size() == (*prev).size(); next++ ) ;
936 if (next != it) {
937 // found something in between!
938 out << endl;
939 //out << endl << "**************************************************************" << endl;
940 out << "For budget = " << subsize << " -> " << subsize+(next-it-1)*stepsize <<
941 " the optimal PD score and PD sets" << endl;
942 out << "are identical to the case when budget = " << subsize-stepsize << endl;
943 //out << "**************************************************************" << endl;
944 subsize += (next-it)*stepsize;
945 it = next;
946 if (it == pd_set.end()) break;
947 }
948 }
949
950 if (it != pd_set.begin()) separator(out, 1);
951
952 int num_sets = (*it).size();
953 double weight = (*it).getWeight();
954
955 if (params.run_mode != PD_USER_SET) {
956 out << "For " << ((sg.isBudgetConstraint()) ? "budget" : "k") << " = " << subsize;
957 out << " the optimal PD score is " << weight << endl;
958
959 if (num_sets == 1) {
960 if (!sg.isBudgetConstraint())
961 out << "The optimal PD set has " << (*it)[0]->countTaxa()-params.is_rooted <<
962 ((sg.isPDArea()) ? " areas" : " taxa");
963 else
964 out << "The optimal PD set has " << (*it)[0]->countTaxa()-params.is_rooted <<
965 ((sg.isPDArea()) ? " areas" : " taxa") << " and requires " << sg.calcCost(*(*it)[0]) << " budget";
966 if (!sg.isPDArea()) out << " and covers " << sg.countSplits(*(*it)[0]) <<
967 " splits (of which " << sg.countInternalSplits(*(*it)[0]) << " are internal splits)";
968 out << endl;
969 }
970 else
971 out << "Found " << num_sets << " PD sets with the same optimal score." << endl;
972 }
973 for (it2 = (*it).begin(), c_num=1; it2 != (*it).end(); it2++, c_num++){
974 Split *this_set = *it2;
975
976 if (params.run_mode == PD_USER_SET && it2 != (*it).begin())
977 separator(out, 1);
978
979 if (params.run_mode == PD_USER_SET) {
980 if (!sg.isBudgetConstraint())
981 out << "Set " << c_num << " has PD score of " << this_set->getWeight();
982 else
983 out << "Set " << c_num << " has PD score of " << this_set->getWeight() <<
984 " and requires " << sg.calcCost(*this_set) << " budget";
985 } else if (num_sets > 1) {
986 if (!sg.isBudgetConstraint())
987 out << endl << "PD set " << c_num;
988 else
989 out << endl << "PD set " << c_num << " has " << this_set->countTaxa()-params.is_rooted <<
990 " taxa and requires " << sg.calcCost(*this_set) << " budget";
991 }
992
993 if (!sg.isPDArea() && (num_sets > 1 || params.run_mode == PD_USER_SET ))
994 out << " and covers " << sg.countSplits(*(*it)[0]) << " splits (of which "
995 << sg.countInternalSplits(*(*it)[0]) << " are internal splits)";
996 out << endl;
997
998 if (params.run_mode != PD_USER_SET && sg.isPDArea()) {
999 for (i = 0; i < sg.getSetsBlock()->getNSets(); i++)
1000 if (this_set->containTaxon(i)) {
1001 if (sg.isBudgetConstraint()) {
1002 out.width(max_len);
1003 out << left << sg.getSetsBlock()->getSet(i)->name << "\t";
1004 out.width(10);
1005 out << right << sg.getPdaBlock()->getCost(i);
1006 out << endl;
1007
1008 } else {
1009 out << sg.getSetsBlock()->getSet(i)->name << endl;
1010 }
1011 }
1012
1013 Split sp(sg.getNTaxa());
1014 for (i = 0; i < sg.getSetsBlock()->getNSets(); i++)
1015 if (this_set->containTaxon(i))
1016 sp += *(sg.area_taxa[i]);
1017 out << endl << "which contains " << sp.countTaxa() - params.is_rooted << " taxa: " << endl;
1018 for (i = 0; i < sg.getNTaxa(); i++)
1019 if (sg.getTaxa()->GetTaxonLabel(i) != ROOT_NAME && sp.containTaxon(i))
1020 out << sg.getTaxa()->GetTaxonLabel(i) << endl;
1021
1022 } else
1023 for ( i = 0; i < sg.getNTaxa(); i++)
1024 if (sg.getTaxa()->GetTaxonLabel(i) != ROOT_NAME && this_set->containTaxon(i)) {
1025 if (sg.isBudgetConstraint()) {
1026 out.width(max_len);
1027 out << left << sg.getTaxa()->GetTaxonLabel(i) << "\t";
1028 out.width(10);
1029 out << right << sg.getPdaBlock()->getCost(i);
1030 out << endl;
1031
1032 } else {
1033 out << sg.getTaxa()->GetTaxonLabel(i) << endl;
1034 }
1035 }
1036 }
1037 }
1038
1039 /****************************/
1040 /********** FOOTER **********/
1041 /****************************/
1042
1043 summarizeFooter(out, params);
1044
1045 out.close();
1046 cout << endl << "Results are summarized in " << filename << endl << endl;
1047 } catch (ios::failure) {
1048 outError(ERR_WRITE_OUTPUT, filename);
1049 }
1050 }
1051
printGainMatrix(char * filename,mmatrix (double)& delta_gain,int start_k)1052 void printGainMatrix(char *filename, mmatrix(double) &delta_gain, int start_k) {
1053 try {
1054 ofstream out;
1055 out.exceptions(ios::failbit | ios::badbit);
1056 out.open(filename);
1057 int k = start_k;
1058 for (mmatrix(double)::iterator it = delta_gain.begin(); it != delta_gain.end(); it++, k++) {
1059 out << k;
1060 for (int i = 0; i < (*it).size(); i++)
1061 out << " " << (*it)[i];
1062 out << endl;
1063 }
1064 out.close();
1065 cout << "PD gain matrix printed to " << filename << endl;
1066 } catch (ios::failure) {
1067 outError(ERR_WRITE_OUTPUT, filename);
1068 }
1069 }
1070
1071 /**
1072 run PD algorithm on split networks
1073 */
runPDSplit(Params & params)1074 void runPDSplit(Params ¶ms) {
1075
1076 cout << "Using NCL - Nexus Class Library" << endl << endl;
1077
1078 // init a split graph class from the parameters
1079 CircularNetwork sg(params);
1080 int i;
1081
1082 // this vector of SplitSet store all the optimal PD sets
1083 vector<SplitSet> pd_set;
1084 // this define an order of taxa (circular order in case of circular networks)
1085 vector<int> taxa_order;
1086 // this store a particular taxa set
1087 Split taxa_set;
1088
1089
1090 if (sg.isCircular()) {
1091 // is a circular network, get circular order
1092 for (i = 0; i < sg.getNTaxa(); i++)
1093 taxa_order.push_back(sg.getCircleId(i));
1094 } else
1095 // otherwise, get the incremental order
1096 for (i = 0; i < sg.getNTaxa(); i++)
1097 taxa_order.push_back(i);
1098
1099 PDRelatedMeasures pd_more;
1100
1101 // begining time of the algorithm run
1102 double time_begin = getCPUTime();
1103 //time(&time_begin);
1104 // check parameters
1105 if (sg.isPDArea()) {
1106 if (sg.isBudgetConstraint()) {
1107 int budget = (params.budget >= 0) ? params.budget : sg.getPdaBlock()->getBudget();
1108 if (budget < 0 && params.pd_proportion == 0.0) params.run_mode = PD_USER_SET;
1109 } else {
1110 int sub_size = (params.sub_size >= 1) ? params.sub_size : sg.getPdaBlock()->getSubSize();
1111 if (sub_size < 1 && params.pd_proportion == 0.0) params.run_mode = PD_USER_SET;
1112
1113 }
1114 }
1115
1116 if (params.run_mode == PD_USER_SET) {
1117 // compute score of user-defined sets
1118 cout << "Computing PD score for user-defined set of taxa..." << endl;
1119 pd_set.resize(1);
1120 sg.computePD(params, pd_set[0], pd_more);
1121 if (params.endemic_pd)
1122 sg.calcPDEndemism(pd_set[0], pd_more.PDEndemism);
1123
1124 if (params.complement_area != NULL)
1125 sg.calcPDComplementarity(pd_set[0], params.complement_area, pd_more.setName, pd_more.PDComplementarity);
1126
1127 } else {
1128 // otherwise, call the main function
1129 if (params.num_bootstrap_samples) {
1130 cout << endl << "======= START BOOTSTRAP ANALYSIS =======" << endl;
1131 MTreeSet *mtrees = sg.getMTrees();
1132 if (mtrees->size() < 100)
1133 cout << "WARNING: bootstrap may be unstable with less than 100 trees" << endl;
1134 vector<string> taxname;
1135 sg.getTaxaName(taxname);
1136 i = 1;
1137 for (MTreeSet::iterator it = mtrees->begin(); it != mtrees->end(); it++, i++) {
1138 cout << "---------- TREE " << i << " ----------" << endl;
1139 // convert tree into split sytem
1140 SplitGraph sg2;
1141 (*it)->convertSplits(taxname, sg2);
1142 // change the current split system
1143 for (SplitGraph::reverse_iterator it = sg.rbegin(); it != sg.rend(); it++) {
1144 delete *it;
1145 }
1146 sg.clear();
1147 sg.insert(sg.begin(), sg2.begin(), sg2.end());
1148 sg2.clear();
1149
1150 // now findPD on the converted tree-split system
1151 sg.findPD(params, pd_set, taxa_order);
1152 }
1153 cout << "======= DONE BOOTSTRAP ANALYSIS =======" << endl << endl;
1154 } else {
1155 sg.findPD(params, pd_set, taxa_order);
1156 }
1157 }
1158
1159 // ending time
1160 double time_end = getCPUTime();
1161 //time(&time_end);
1162 params.run_time = time_end - time_begin;
1163
1164 cout << "Time used: " << (double) (params.run_time) << " seconds." << endl;
1165
1166 if (verbose_mode >= VB_DEBUG && !sg.isPDArea()) {
1167 cout << "PD set(s) with score(s): " << endl;
1168 for (vector<SplitSet>::iterator it = pd_set.begin(); it != pd_set.end(); it++)
1169 for (SplitSet::iterator it2 = (*it).begin(); it2 != (*it).end(); it2++ ){
1170 //(*it)->report(cout);
1171 cout << " " << (*it2)->getWeight() << " ";
1172 for (i = 0; i < sg.getNTaxa(); i++)
1173 if ((*it2)->containTaxon(i))
1174 cout << sg.getTaxa()->GetTaxonLabel(i) << " ";
1175 if (sg.isBudgetConstraint())
1176 cout << " (budget = " << sg.calcCost(*(*it2)) << ")";
1177 cout << endl;
1178 }
1179 }
1180
1181 sg.printOutputSetScore(params, pd_set);
1182
1183
1184 summarizeSplit(params, sg, pd_set, pd_more, true);
1185
1186 if (params.calc_pdgain) {
1187 mmatrix(double) delta_gain;
1188 sg.calcPDGain(pd_set, delta_gain);
1189 string filename = params.out_prefix;
1190 filename += ".pdgain";
1191 printGainMatrix((char*)filename.c_str(), delta_gain, pd_set.front().front()->countTaxa());
1192 //cout << delta_gain;
1193 }
1194
1195
1196 //for (i = pd_set.size()-1; i >= 0; i--)
1197 // delete pd_set[i];
1198
1199 }
1200
printSplitSet(SplitGraph & sg,SplitIntMap & hash_ss)1201 void printSplitSet(SplitGraph &sg, SplitIntMap &hash_ss) {
1202 /*
1203 for (SplitIntMap::iterator it = hash_ss.begin(); it != hash_ss.end(); it++) {
1204 if ((*it)->getWeight() > 50 && (*it)->countTaxa() > 1)
1205 (*it)->report(cout);
1206 }*/
1207 sg.getTaxa()->Report(cout);
1208 for (SplitGraph::iterator it = sg.begin(); it != sg.end(); it++) {
1209 if ((*it)->getWeight() > 50 && (*it)->countTaxa() > 1)
1210 (*it)->report(cout);
1211 }
1212 }
1213
readTaxaOrder(char * taxa_order_file,StrVector & taxa_order)1214 void readTaxaOrder(char *taxa_order_file, StrVector &taxa_order) {
1215
1216 }
1217
calcTreeCluster(Params & params)1218 void calcTreeCluster(Params ¶ms) {
1219 ASSERT(params.taxa_order_file);
1220 MExtTree tree(params.user_file, params.is_rooted);
1221 // StrVector taxa_order;
1222 //readTaxaOrder(params.taxa_order_file, taxa_order);
1223 NodeVector taxa;
1224 mmatrix(int) clusters;
1225 clusters.reserve(tree.leafNum - 3);
1226 tree.getTaxa(taxa);
1227 sort(taxa.begin(), taxa.end(), nodenamecmp);
1228 tree.createCluster(taxa, clusters);
1229 int cnt = 1;
1230
1231 string treename = params.out_prefix;
1232 treename += ".clu-id";
1233 tree.printTree(treename.c_str());
1234
1235 for (mmatrix(int)::iterator it = clusters.begin(); it != clusters.end(); it++, cnt++) {
1236 ostringstream filename;
1237 filename << params.out_prefix << "." << cnt << ".clu";
1238 ofstream out(filename.str().c_str());
1239
1240 ostringstream filename2;
1241 filename2 << params.out_prefix << "." << cnt << ".name-clu";
1242 ofstream out2(filename2.str().c_str());
1243
1244 out << "w" << endl << "c" << endl << "4" << endl << "b" << endl << "g" << endl << 4-params.is_rooted << endl;
1245 IntVector::iterator it2;
1246 NodeVector::iterator it3;
1247 for (it2 = (*it).begin(), it3 = taxa.begin(); it2 != (*it).end(); it2++, it3++)
1248 if ((*it3)->name != ROOT_NAME) {
1249 out << char((*it2)+'a') << endl;
1250 out2 << (*it3)->name << " " << char((*it2)+'a') << endl;
1251 }
1252 out << "y" << endl;
1253 out.close();
1254 out2.close();
1255 cout << "Cluster " << cnt << " printed to " << filename.rdbuf() << " and " << filename2.rdbuf() << endl;
1256 }
1257 }
1258
1259
printTaxa(Params & params)1260 void printTaxa(Params ¶ms) {
1261 MTree mytree(params.user_file, params.is_rooted);
1262 vector<string> taxname;
1263 taxname.resize(mytree.leafNum);
1264 mytree.getTaxaName(taxname);
1265 sort(taxname.begin(), taxname.end());
1266
1267 string filename = params.out_prefix;
1268 filename += ".taxa";
1269
1270 try {
1271 ofstream out;
1272 out.exceptions(ios::failbit | ios::badbit);
1273 out.open(filename.c_str());
1274 for (vector<string>::iterator it = taxname.begin(); it != taxname.end(); it++) {
1275 if ((*it) != ROOT_NAME) out << (*it);
1276 out << endl;
1277 }
1278 out.close();
1279 cout << "All taxa names printed to " << filename << endl;
1280 } catch (ios::failure) {
1281 outError(ERR_WRITE_OUTPUT, filename);
1282 }
1283 }
1284
printAreaList(Params & params)1285 void printAreaList(Params ¶ms) {
1286 MSetsBlock *sets;
1287 sets = new MSetsBlock();
1288 cout << "Reading input file " << params.user_file << "..." << endl;
1289
1290 MyReader nexus(params.user_file);
1291
1292 nexus.Add(sets);
1293
1294 MyToken token(nexus.inf);
1295 nexus.Execute(token);
1296
1297 //sets->Report(cout);
1298
1299 TaxaSetNameVector *allsets = sets->getSets();
1300
1301 string filename = params.out_prefix;
1302 filename += ".names";
1303
1304 try {
1305 ofstream out;
1306 out.exceptions(ios::failbit | ios::badbit);
1307 out.open(filename.c_str());
1308 for (TaxaSetNameVector::iterator it = allsets->begin(); it != allsets->end(); it++) {
1309 out << (*it)->name;
1310 out << endl;
1311 }
1312 out.close();
1313 cout << "All area names printed to " << filename << endl;
1314 } catch (ios::failure) {
1315 outError(ERR_WRITE_OUTPUT, filename);
1316 }
1317
1318 delete sets;
1319 }
1320
scaleBranchLength(Params & params)1321 void scaleBranchLength(Params ¶ms) {
1322 params.is_rooted = true;
1323 PDTree tree(params);
1324 if (params.run_mode == SCALE_BRANCH_LEN) {
1325 cout << "Scaling branch length with a factor of " << params.scaling_factor << " ..." << endl;
1326 tree.scaleLength(params.scaling_factor, false);
1327 } else {
1328 cout << "Scaling clade support with a factor of " << params.scaling_factor << " ..." << endl;
1329 tree.scaleCladeSupport(params.scaling_factor, false);
1330 }
1331 if (params.out_file != NULL)
1332 tree.printTree(params.out_file);
1333 else {
1334 tree.printTree(cout);
1335 cout << endl;
1336 }
1337 }
1338
calcDistribution(Params & params)1339 void calcDistribution(Params ¶ms) {
1340
1341 PDTree mytree(params);
1342
1343 string filename = params.out_prefix;
1344 filename += ".randompd";
1345
1346 try {
1347 ofstream out;
1348 out.exceptions(ios::failbit | ios::badbit);
1349 out.open(filename.c_str());
1350 for (int size = params.min_size; size <= params.sub_size; size += params.step_size) {
1351 out << size;
1352 for (int sample = 0; sample < params.sample_size; sample++) {
1353 Split taxset(mytree.leafNum);
1354 taxset.randomize(size);
1355 mytree.calcPD(taxset);
1356 out << " " << taxset.getWeight();
1357 }
1358 out << endl;
1359 }
1360 out.close();
1361 cout << "PD distribution is printed to " << filename << endl;
1362 } catch (ios::failure) {
1363 outError(ERR_WRITE_OUTPUT, filename);
1364 }
1365 }
1366
printRFDist(string filename,double * rfdist,int n,int m,int rf_dist_mode,bool print_msg=true)1367 void printRFDist(string filename, double *rfdist, int n, int m, int rf_dist_mode, bool print_msg = true) {
1368 int i, j;
1369
1370 try {
1371 ofstream out;
1372 out.exceptions(ios::failbit | ios::badbit);
1373 out.open(filename);
1374 if (Params::getInstance().output_format == FORMAT_CSV) {
1375 out << "# Robinson-Foulds distances" << endl
1376 << "# This file can be read in MS Excel or in R with command:" << endl
1377 << "# dat=read.csv('" << filename << "',comment.char='#')" << endl
1378 << "# Columns are comma-separated with following meanings:" << endl
1379 << "# ID1: Tree 1 ID" << endl
1380 << "# ID2: Tree 2 ID" << endl
1381 << "# Dist: Robinson-Foulds distance" << endl
1382 << "ID1,ID2,Dist" << endl;
1383 if (rf_dist_mode == RF_ADJACENT_PAIR) {
1384 for (i = 0; i < n; i++)
1385 out << i+1 << ',' << i+2 << ',' << rfdist[i] << endl;
1386 } else if (Params::getInstance().rf_same_pair) {
1387 for (i = 0; i < n; i++)
1388 out << i+1 << ',' << i+1 << ',' << rfdist[i] << endl;
1389 } else {
1390 for (i = 0; i < n; i++) {
1391 for (j = 0; j < m; j++)
1392 out << i+1 << ',' << j+1 << ',' << rfdist[i*m+j] << endl;
1393 }
1394 }
1395 } else if (rf_dist_mode == RF_ADJACENT_PAIR || Params::getInstance().rf_same_pair) {
1396 out << "XXX ";
1397 out << 1 << " " << n << endl;
1398 for (i = 0; i < n; i++)
1399 out << " " << rfdist[i];
1400 out << endl;
1401 } else {
1402 // all pairs
1403 out << n << " " << m << endl;
1404 for (i = 0; i < n; i++) {
1405 out << "Tree" << i << " ";
1406 for (j = 0; j < m; j++)
1407 out << " " << rfdist[i*m+j];
1408 out << endl;
1409 }
1410 }
1411 out.close();
1412 if (print_msg)
1413 cout << "Robinson-Foulds distances printed to " << filename << endl;
1414 } catch (ios::failure) {
1415 outError(ERR_WRITE_OUTPUT, filename);
1416 }
1417 }
1418
computeRFDistExtended(const char * trees1,const char * trees2,const char * filename)1419 void computeRFDistExtended(const char *trees1, const char *trees2, const char *filename) {
1420 cout << "Reading input trees 1 file " << trees1 << endl;
1421 int ntrees = 0, ntrees2 = 0;
1422 double *rfdist_raw = NULL;
1423 try {
1424 ifstream in;
1425 in.exceptions(ios::failbit | ios::badbit);
1426 in.open(trees1);
1427 IntVector rfdist;
1428 for (ntrees = 1; !in.eof(); ntrees++) {
1429 MTree tree;
1430 bool is_rooted = false;
1431
1432 // read in the tree and convert into split system for indexing
1433 tree.readTree(in, is_rooted);
1434 if (verbose_mode >= VB_DEBUG)
1435 cout << ntrees << " " << endl;
1436 DoubleVector dist;
1437 tree.computeRFDist(trees2, dist);
1438 ntrees2 = dist.size();
1439 rfdist.insert(rfdist.end(), dist.begin(), dist.end());
1440 char ch;
1441 in.exceptions(ios::goodbit);
1442 (in) >> ch;
1443 if (in.eof()) break;
1444 in.unget();
1445 in.exceptions(ios::failbit | ios::badbit);
1446
1447 }
1448
1449 in.close();
1450 ASSERT(ntrees * ntrees2 == rfdist.size());
1451 rfdist_raw = new double[rfdist.size()];
1452 copy(rfdist.begin(), rfdist.end(), rfdist_raw);
1453
1454 } catch (ios::failure) {
1455 outError(ERR_READ_INPUT, trees1);
1456 }
1457
1458 printRFDist(filename, rfdist_raw, ntrees, ntrees2, RF_TWO_TREE_SETS_EXTENDED);
1459 delete [] rfdist_raw;
1460 }
1461
computeRFDistSamePair(const char * trees1,const char * trees2,const char * filename)1462 void computeRFDistSamePair(const char *trees1, const char *trees2, const char *filename) {
1463 cout << "Reading input trees 1 file " << trees1 << endl;
1464 int ntrees = 0, ntrees2 = 0;
1465 double *rfdist_raw = NULL;
1466 try {
1467 ifstream in;
1468 in.exceptions(ios::failbit | ios::badbit);
1469 in.open(trees1);
1470
1471 ifstream in2;
1472 in2.exceptions(ios::failbit | ios::badbit);
1473 in2.open(trees2);
1474
1475 DoubleVector rfdist;
1476 for (ntrees = 1; !in.eof() && !in2.eof(); ntrees++) {
1477 MTree tree;
1478 bool is_rooted = false;
1479 // read in the tree and convert into split system for indexing
1480 tree.readTree(in, is_rooted);
1481
1482 if (verbose_mode >= VB_DEBUG)
1483 cout << ntrees << " " << endl;
1484 DoubleVector dist;
1485 tree.computeRFDist(in2, dist, 0, true);
1486 ntrees2 = dist.size();
1487 rfdist.insert(rfdist.end(), dist.begin(), dist.end());
1488 char ch;
1489 in.exceptions(ios::goodbit);
1490 (in) >> ch;
1491 if (in.eof()) break;
1492 in.unget();
1493 in.exceptions(ios::failbit | ios::badbit);
1494
1495 }
1496
1497 in.close();
1498 in2.close();
1499 ASSERT(ntrees * ntrees2 == rfdist.size());
1500 rfdist_raw = new double[rfdist.size()];
1501 copy(rfdist.begin(), rfdist.end(), rfdist_raw);
1502
1503 } catch (ios::failure) {
1504 outError(ERR_READ_INPUT, trees1);
1505 }
1506
1507 printRFDist(filename, rfdist_raw, ntrees, ntrees2, RF_TWO_TREE_SETS_EXTENDED);
1508
1509 delete [] rfdist_raw;
1510 }
1511
computeRFDist(Params & params)1512 void computeRFDist(Params ¶ms) {
1513
1514 if (!params.user_file) outError("User tree file not provided");
1515
1516 string filename = params.out_prefix;
1517 filename += ".rfdist";
1518
1519 if (params.rf_dist_mode == RF_TWO_TREE_SETS_EXTENDED) {
1520 computeRFDistExtended(params.user_file, params.second_tree, filename.c_str());
1521 return;
1522 }
1523
1524 if (params.rf_same_pair) {
1525 computeRFDistSamePair(params.user_file, params.second_tree, filename.c_str());
1526 return;
1527 }
1528
1529 MTreeSet trees(params.user_file, params.is_rooted, params.tree_burnin, params.tree_max_count);
1530 int n = trees.size(), m = trees.size();
1531 double *rfdist;
1532 double *incomp_splits = NULL;
1533 string infoname = params.out_prefix;
1534 infoname += ".rfinfo";
1535 string treename = params.out_prefix;
1536 treename += ".rftree";
1537 if (params.rf_dist_mode == RF_TWO_TREE_SETS) {
1538 MTreeSet treeset2(params.second_tree, params.is_rooted, params.tree_burnin, params.tree_max_count);
1539 cout << "Computing Robinson-Foulds distances between two sets of trees" << endl;
1540 m = treeset2.size();
1541 size_t size = n*m;
1542 if (params.rf_same_pair) {
1543 if (m != n)
1544 outError("Tree sets has different number of trees");
1545 size = n;
1546 }
1547 rfdist = new double [size];
1548 memset(rfdist, 0, size*sizeof(double));
1549 if (verbose_mode >= VB_MAX) {
1550 incomp_splits = new double [size];
1551 memset(incomp_splits, 0, size*sizeof(double));
1552 }
1553 if (verbose_mode >= VB_MED)
1554 trees.computeRFDist(rfdist, &treeset2, params.rf_same_pair,
1555 infoname.c_str(),treename.c_str(), incomp_splits);
1556 else
1557 trees.computeRFDist(rfdist, &treeset2, params.rf_same_pair);
1558 } else {
1559 rfdist = new double [n*n];
1560 memset(rfdist, 0, n*n* sizeof(double));
1561 trees.computeRFDist(rfdist, params.rf_dist_mode, params.split_weight_threshold);
1562 }
1563
1564 //if (verbose_mode >= VB_MED) printRFDist(cout, rfdist, n, m, params.rf_dist_mode);
1565
1566 printRFDist(filename, rfdist, n, m, params.rf_dist_mode);
1567
1568 if (incomp_splits) {
1569 filename = params.out_prefix;
1570 filename += ".incomp";
1571 printRFDist(filename, incomp_splits, n, m, params.rf_dist_mode, false);
1572 cout << "Number of incompatible splits in printed to " << filename << endl;
1573 }
1574
1575 if (incomp_splits) delete [] incomp_splits;
1576 delete [] rfdist;
1577 }
1578
1579
testInputFile(Params & params)1580 void testInputFile(Params ¶ms) {
1581 SplitGraph sg(params);
1582 if (sg.isWeaklyCompatible())
1583 cout << "The split system is weakly compatible." << endl;
1584 else
1585 cout << "The split system is NOT weakly compatible." << endl;
1586
1587 }
1588
1589 /**MINH ANH: for some statistics about the branches on the input tree*/
branchStats(Params & params)1590 void branchStats(Params ¶ms){
1591 MaTree mytree(params.user_file, params.is_rooted);
1592 mytree.drawTree(cout,WT_TAXON_ID + WT_INT_NODE);
1593 //report to output file
1594 string output;
1595 if (params.out_file)
1596 output = params.out_file;
1597 else {
1598 if (params.out_prefix)
1599 output = params.out_prefix;
1600 else
1601 output = params.user_file;
1602 output += ".stats";
1603 }
1604
1605 try {
1606 ofstream out;
1607 out.exceptions(ios::failbit | ios::badbit);
1608 out.open(output.c_str());
1609 mytree.printBrInfo(out);
1610 } catch (ios::failure) {
1611 outError(ERR_WRITE_OUTPUT, output);
1612 }
1613 cout << "Information about branch lengths of the tree is printed to: " << output << endl;
1614
1615 /***** Following added by BQM to print internal branch lengths */
1616 NodeVector nodes1, nodes2;
1617 mytree.generateNNIBraches(nodes1, nodes2);
1618 output = params.out_prefix;
1619 output += ".inlen";
1620 try {
1621 ofstream out;
1622 out.exceptions(ios::failbit | ios::badbit);
1623 out.open(output.c_str());
1624 for (int i = 0; i < nodes1.size(); i++)
1625 out << nodes1[i]->findNeighbor(nodes2[i])->length << " ";
1626 out << endl;
1627 } catch (ios::failure) {
1628 outError(ERR_WRITE_OUTPUT, output);
1629 }
1630 cout << "Internal branch lengths printed to: " << output << endl;
1631 }
1632
1633 /**MINH ANH: for comparison between the input tree and each tree in a given set of trees*/
compare(Params & params)1634 void compare(Params ¶ms){
1635 MaTree mytree(params.second_tree, params.is_rooted);
1636 //sort taxon names and update nodeID, to be consistent with MTreeSet
1637 NodeVector taxa;
1638 mytree.getTaxa(taxa);
1639 sort(taxa.begin(), taxa.end(), nodenamecmp);
1640 int i;
1641 NodeVector::iterator it;
1642 for (it = taxa.begin(), i = 0; it != taxa.end(); it++, i++)
1643 (*it)->id = i;
1644
1645 string drawFile = params.second_tree;
1646 drawFile += ".draw";
1647 try {
1648 ofstream out1;
1649 out1.exceptions(ios::failbit | ios::badbit);
1650 out1.open(drawFile.c_str());
1651 mytree.drawTree(out1,WT_TAXON_ID + WT_INT_NODE);
1652 } catch (ios::failure) {
1653 outError(ERR_WRITE_OUTPUT, drawFile);
1654 }
1655 cout << "Tree with branchID (nodeID) was printed to: " << drawFile << endl;
1656
1657
1658 MTreeSet trees(params.user_file,params.is_rooted, params.tree_burnin, params.tree_max_count);
1659 DoubleMatrix brMatrix;
1660 DoubleVector BSDs;
1661 IntVector RFs;
1662 mytree.comparedTo(trees, brMatrix, RFs, BSDs);
1663 int numTree = trees.size();
1664 int numNode = mytree.nodeNum;
1665
1666 string output;
1667 if (params.out_file)
1668 output = params.out_file;
1669 else {
1670 if (params.out_prefix)
1671 output = params.out_prefix;
1672 else
1673 output = params.user_file;
1674 output += ".compare";
1675 }
1676
1677 try {
1678 ofstream out;
1679 out.exceptions(ios::failbit | ios::badbit);
1680 out.open(output.c_str());
1681 //print the header
1682 out << "tree " ;
1683 for (int nodeID = 0; nodeID < numNode; nodeID++ )
1684 if ( brMatrix[0][nodeID] != -2 )
1685 out << "br_" << nodeID << " ";
1686 out << "RF BSD" << endl;
1687 for ( int treeID = 0; treeID < numTree; treeID++ )
1688 {
1689 out << treeID << " ";
1690 for (int nodeID = 0; nodeID < numNode; nodeID++ )
1691 if ( brMatrix[treeID][nodeID] != -2 )
1692 out << brMatrix[treeID][nodeID] << " ";
1693 out << RFs[treeID] << " " << BSDs[treeID] << endl;
1694 }
1695 } catch (ios::failure) {
1696 outError(ERR_WRITE_OUTPUT, output);
1697 }
1698 cout << "Comparison with the given set of trees is printed to: " << output << endl;
1699 }
1700
1701 /**MINH ANH: to compute 'guided bootstrap' alignment*/
guidedBootstrap(Params & params)1702 void guidedBootstrap(Params ¶ms)
1703 {
1704 MaAlignment inputAlign(params.aln_file,params.sequence_type, params.intype, params.model_name);
1705 inputAlign.readLogLL(params.siteLL_file);
1706
1707 string outFre_name = params.out_prefix;
1708 outFre_name += ".patInfo";
1709
1710 inputAlign.printPatObsExpFre(outFre_name.c_str());
1711
1712 string gboAln_name = params.out_prefix;
1713 gboAln_name += ".gbo";
1714
1715 MaAlignment gboAlign;
1716 double prob;
1717 gboAlign.generateExpectedAlignment(&inputAlign, prob);
1718 gboAlign.printAlignment(IN_PHYLIP, gboAln_name.c_str());
1719
1720
1721 string outProb_name = params.out_prefix;
1722 outProb_name += ".gbo.logP";
1723 try {
1724 ofstream outProb;
1725 outProb.exceptions(ios::failbit | ios::badbit);
1726 outProb.open(outProb_name.c_str());
1727 outProb.precision(10);
1728 outProb << prob << endl;
1729 outProb.close();
1730 } catch (ios::failure) {
1731 outError(ERR_WRITE_OUTPUT, outProb_name);
1732 }
1733
1734 cout << "Information about patterns in the input alignment is printed to: " << outFre_name << endl;
1735 cout << "A 'guided bootstrap' alignment is printed to: " << gboAln_name << endl;
1736 cout << "Log of the probability of the new alignment is printed to: " << outProb_name << endl;
1737 }
1738
1739 /**MINH ANH: to compute the probability of an alignment given the multinomial distribution of patterns frequencies derived from a reference alignment*/
computeMulProb(Params & params)1740 void computeMulProb(Params ¶ms)
1741 {
1742 Alignment refAlign(params.second_align, params.sequence_type, params.intype, params.model_name);
1743 Alignment inputAlign(params.aln_file, params.sequence_type, params.intype, params.model_name);
1744 double prob;
1745 inputAlign.multinomialProb(refAlign,prob);
1746 //Printing
1747 string outProb_name = params.out_prefix;
1748 outProb_name += ".mprob";
1749 try {
1750 ofstream outProb;
1751 outProb.exceptions(ios::failbit | ios::badbit);
1752 outProb.open(outProb_name.c_str());
1753 outProb.precision(10);
1754 outProb << prob << endl;
1755 outProb.close();
1756 } catch (ios::failure) {
1757 outError(ERR_WRITE_OUTPUT, outProb_name);
1758 }
1759 cout << "Probability of alignment " << params.aln_file << " given alignment " << params.second_align << " is: " << prob << endl;
1760 cout << "The probability is printed to: " << outProb_name << endl;
1761 }
1762
processNCBITree(Params & params)1763 void processNCBITree(Params ¶ms) {
1764 NCBITree tree;
1765 Node *dad = tree.readNCBITree(params.user_file, params.ncbi_taxid, params.ncbi_taxon_level, params.ncbi_ignore_level);
1766 if (params.ncbi_names_file) tree.readNCBINames(params.ncbi_names_file);
1767
1768 cout << "Dad ID: " << dad->name << " Root ID: " << tree.root->name << endl;
1769 string str = params.user_file;
1770 str += ".tree";
1771 if (params.out_file) str = params.out_file;
1772 //tree.printTree(str.c_str(), WT_SORT_TAXA | WT_BR_LEN);
1773 cout << "NCBI tree printed to " << str << endl;
1774 try {
1775 ofstream out;
1776 out.exceptions(ios::failbit | ios::badbit);
1777 out.open(str.c_str());
1778 tree.printTree(out, WT_SORT_TAXA | WT_BR_LEN | WT_TAXON_ID, tree.root, dad);
1779 out << ";" << endl;
1780 out.close();
1781 } catch (ios::failure) {
1782 outError(ERR_WRITE_OUTPUT, str);
1783 }
1784 }
1785
1786 /* write simultaneously to cout/cerr and a file */
1787 class outstreambuf : public streambuf {
1788 public:
1789 outstreambuf* open( const char* name, ios::openmode mode = ios::out);
1790 bool is_open();
1791 outstreambuf* close();
~outstreambuf()1792 ~outstreambuf() { close(); }
get_fout_buf()1793 streambuf *get_fout_buf() {
1794 return fout_buf;
1795 }
get_cout_buf()1796 streambuf *get_cout_buf() {
1797 return cout_buf;
1798 }
get_fout()1799 ofstream *get_fout() {
1800 return &fout;
1801 }
1802
1803 protected:
1804 ofstream fout;
1805 streambuf *cout_buf;
1806 streambuf *fout_buf;
1807 virtual int overflow( int c = EOF);
1808 virtual int sync();
1809 };
1810
open(const char * name,ios::openmode mode)1811 outstreambuf* outstreambuf::open( const char* name, ios::openmode mode) {
1812 if (!(Params::getInstance().suppress_output_flags & OUT_LOG) && MPIHelper::getInstance().isMaster()) {
1813 fout.open(name, mode);
1814 if (!fout.is_open()) {
1815 cerr << "ERROR: Could not open " << name << " for logging" << endl;
1816 exit(EXIT_FAILURE);
1817 return NULL;
1818 }
1819 fout_buf = fout.rdbuf();
1820 }
1821 cout_buf = cout.rdbuf();
1822 cout.rdbuf(this);
1823 return this;
1824 }
1825
is_open()1826 bool outstreambuf::is_open() {
1827 return fout.is_open();
1828 }
1829
close()1830 outstreambuf* outstreambuf::close() {
1831 cout.rdbuf(cout_buf);
1832 if ( fout.is_open()) {
1833 sync();
1834 fout.close();
1835 return this;
1836 }
1837 return NULL;
1838 }
1839
overflow(int c)1840 int outstreambuf::overflow( int c) { // used for output buffer only
1841 if ((verbose_mode >= VB_MIN && MPIHelper::getInstance().isMaster()) || verbose_mode >= VB_MED)
1842 if (cout_buf->sputc(c) == EOF) return EOF;
1843 if (Params::getInstance().suppress_output_flags & OUT_LOG)
1844 return c;
1845 if (!MPIHelper::getInstance().isMaster())
1846 return c;
1847 if (fout_buf->sputc(c) == EOF) return EOF;
1848 return c;
1849 }
1850
1851
1852
sync()1853 int outstreambuf::sync() { // used for output buffer only
1854 if ((verbose_mode >= VB_MIN && MPIHelper::getInstance().isMaster()) || verbose_mode >= VB_MED)
1855 cout_buf->pubsync();
1856 if ((Params::getInstance().suppress_output_flags & OUT_LOG) || !MPIHelper::getInstance().isMaster())
1857 return 0;
1858 return fout_buf->pubsync();
1859 }
1860
1861 class errstreambuf : public streambuf {
1862 public:
init(streambuf * fout_buf)1863 void init(streambuf *fout_buf) {
1864 this->fout_buf = fout_buf;
1865 cerr_buf = cerr.rdbuf();
1866 cerr.rdbuf(this);
1867 new_line = true;
1868 }
1869
~errstreambuf()1870 ~errstreambuf() {
1871 cerr.rdbuf(cerr_buf);
1872 }
1873
1874 protected:
1875 streambuf *cerr_buf;
1876 streambuf *fout_buf;
1877 bool new_line;
1878
overflow(int c=EOF)1879 virtual int overflow( int c = EOF) {
1880 if (new_line)
1881 cerr_buf->sputn("ERROR: ", 7);
1882 if (cerr_buf->sputc(c) == EOF) {
1883 new_line = false;
1884 if (c == '\n') new_line = true;
1885 return EOF;
1886 }
1887 if ((Params::getInstance().suppress_output_flags & OUT_LOG)) {
1888 new_line = false;
1889 if (c == '\n') new_line = true;
1890 return c;
1891 }
1892 if (new_line)
1893 fout_buf->sputn("ERROR: ", 7);
1894 new_line = false;
1895 if (c == '\n') new_line = true;
1896 if (fout_buf->sputc(c) == EOF) return EOF;
1897 return c;
1898 }
1899
sync()1900 virtual int sync() {
1901 cerr_buf->pubsync();
1902 if (Params::getInstance().suppress_output_flags & OUT_LOG)
1903 return 0;
1904 return fout_buf->pubsync();
1905 }
1906 };
1907
1908 class muststreambuf : public streambuf {
1909 public:
init(streambuf * cout_buf,streambuf * fout_buf)1910 void init(streambuf *cout_buf, streambuf *fout_buf) {
1911 this->fout_buf = fout_buf;
1912 this->cout_buf = cout_buf;
1913 }
1914
1915 protected:
1916 streambuf *cout_buf;
1917 streambuf *fout_buf;
1918
overflow(int c=EOF)1919 virtual int overflow( int c = EOF) {
1920 if (cout_buf->sputc(c) == EOF) {
1921 return EOF;
1922 }
1923 if (fout_buf->sputc(c) == EOF) return EOF;
1924 return c;
1925 }
1926
sync()1927 virtual int sync() {
1928 cout_buf->pubsync();
1929 return fout_buf->pubsync();
1930 }
1931 };
1932
1933
1934 /*********************************************************************************
1935 * GLOBAL VARIABLES
1936 *********************************************************************************/
1937 outstreambuf _out_buf;
1938 errstreambuf _err_buf;
1939 muststreambuf _must_buf;
1940 ostream cmust(&_must_buf);
1941
1942 string _log_file;
1943 int _exit_wait_optn = FALSE;
1944
startLogFile(bool append_log)1945 extern "C" void startLogFile(bool append_log) {
1946 if (append_log)
1947 _out_buf.open(_log_file.c_str(), ios::app);
1948 else
1949 _out_buf.open(_log_file.c_str());
1950 _err_buf.init(_out_buf.get_fout_buf());
1951 _must_buf.init(_out_buf.get_cout_buf(), _out_buf.get_fout_buf());
1952 }
1953
endLogFile()1954 extern "C" void endLogFile() {
1955 if (_out_buf.is_open())
1956 _out_buf.close();
1957 }
1958
funcExit(void)1959 void funcExit(void) {
1960 if(_exit_wait_optn) {
1961 printf("\npress [return] to finish: ");
1962 fflush(stdout);
1963 while (getchar() != '\n');
1964 }
1965
1966 endLogFile();
1967 MPIHelper::getInstance().finalize();
1968 }
1969
funcAbort(int signal_number)1970 extern "C" void funcAbort(int signal_number)
1971 {
1972 /*Your code goes here. You can output debugging info.
1973 If you return from this function, and it was called
1974 because abort() was called, your program will exit or crash anyway
1975 (with a dialog box on Windows).
1976 */
1977 #if (defined(__GNUC__) || defined(__clang__)) && !defined(WIN32) && !defined(__CYGWIN__)
1978 print_stacktrace(cerr);
1979 #endif
1980
1981 cerr << endl << "*** IQ-TREE CRASHES WITH SIGNAL ";
1982 switch (signal_number) {
1983 case SIGABRT: cerr << "ABORTED"; break;
1984 case SIGFPE: cerr << "ERRONEOUS NUMERIC"; break;
1985 case SIGILL: cerr << "ILLEGAL INSTRUCTION"; break;
1986 case SIGSEGV: cerr << "SEGMENTATION FAULT"; break;
1987 #if !defined WIN32 && !defined _WIN32 && !defined __WIN32__
1988 case SIGBUS: cerr << "BUS ERROR"; break;
1989 #endif
1990 }
1991 cerr << endl;
1992 cerr << "*** For bug report please send to developers:" << endl << "*** Log file: " << _log_file;
1993 cerr << endl << "*** Alignment files (if possible)" << endl;
1994 funcExit();
1995 signal(signal_number, SIG_DFL);
1996 }
1997
getintargv(int * argc,char ** argv[])1998 extern "C" void getintargv(int *argc, char **argv[])
1999 {
2000 int done;
2001 int count;
2002 int n;
2003 int l;
2004 char ch;
2005 char *argtmp;
2006 char **argstr;
2007
2008 argtmp = (char *)calloc(10100, sizeof(char));
2009 argstr = (char **)calloc(100, sizeof(char*));
2010 for(n=0; n<100; n++) {
2011 argstr[n] = &(argtmp[n * 100]);
2012 }
2013 n=1;
2014
2015 fprintf(stdout, "\nYou seem to have click-started this program,");
2016 fprintf(stdout, "\ndo you want to enter commandline parameters: [y]es, [n]o: ");
2017 fflush(stdout);
2018
2019 /* read one char */
2020 ch = getc(stdin);
2021 if (ch != '\n') {
2022 do ;
2023 while (getc(stdin) != '\n');
2024 }
2025 ch = (char) tolower((int) ch);
2026
2027 if (ch == 'y') {
2028 done=FALSE;
2029
2030 fprintf(stdout, "\nEnter single parameter [! for none]: ");
2031 fflush(stdout);
2032 count = fscanf(stdin, "%s", argstr[n]);
2033 do ;
2034 while (getc(stdin) != '\n');
2035
2036 if(argstr[0][0] == '!') {
2037 count = 0;
2038 } else {
2039 if (strlen(argstr[n]) > 100) {
2040 fprintf(stdout, "\nParameter too long!!!\n");
2041 } else {
2042 n++;
2043 }
2044 }
2045
2046 while(!done) {
2047 fprintf(stdout, "\nCurrent commandline: ");
2048 for(l=1; l<n; l++) {
2049 fprintf(stdout, "%s ", argstr[l]);
2050 }
2051 fprintf(stdout, "\nQuit [q]; confirm [y]%s%s%s: ",
2052 (n<99 ? ", extend [e]" : ""),
2053 (n>1 ? ", delete last [l]" : ""),
2054 (n>1 ? ", delete all [a]" : ""));
2055 fflush(stdout);
2056
2057 /* read one char */
2058 ch = getc(stdin);
2059 /* ch = getchar(); */
2060 if (ch != '\n') {
2061 do ;
2062 while (getc(stdin) != '\n');
2063 /* while (getchar() != '\n'); */
2064 }
2065 ch = (char) tolower((int) ch);
2066
2067 switch (ch) {
2068 case 'y':
2069 done=TRUE;
2070 break;
2071 case 'e':
2072 fprintf(stdout, "\nEnter single parameter [! for none]: ");
2073 fflush(stdout);
2074 count = fscanf(stdin, "%s", argstr[n]);
2075 do ;
2076 while (getc(stdin) != '\n');
2077
2078 if(argstr[0][0] == '!') {
2079 count = 0;
2080 } else {
2081 if (strlen(argstr[n]) > 100) {
2082 fprintf(stdout, "\nParameter too long!!!\n");
2083 } else {
2084 n++;
2085 }
2086 }
2087 break;
2088 case 'l':
2089 if (n>1) n--;
2090 break;
2091 case 'a':
2092 n=1;
2093 break;
2094 case 'q':
2095 // tp_exit(0, NULL, FALSE, __FILE__, __LINE__, _exit_wait_optn);
2096 if(_exit_wait_optn) {
2097 printf("\npress [return] to finish: ");
2098 fflush(stdout);
2099 while (getchar() != '\n');
2100 }
2101 exit(0);
2102 break;
2103 }
2104 }
2105 }
2106
2107 *argc = n;
2108 *argv = argstr;
2109 } /* getintargv */
2110
2111 /*********************************************************************************************************************************
2112 Olga: ECOpd - phylogenetic diversity with ecological constraint: choosing a viable subset of species which maximizes PD/SD
2113 *********************************************************************************************************************************/
2114
processECOpd(Params & params)2115 void processECOpd(Params ¶ms) {
2116 double startTime = getCPUTime();
2117 params.detected_mode = LINEAR_PROGRAMMING;
2118 cout<<"----------------------------------------------------------------------------------------"<<endl;
2119 int i;
2120 double score;
2121 double *variables;
2122 int threads = params.gurobi_threads;
2123 params.gurobi_format=true;
2124
2125 string model_file,subFoodWeb,outFile;
2126
2127 model_file = params.out_prefix;
2128 model_file += ".lp";
2129
2130 subFoodWeb = params.out_prefix;
2131 subFoodWeb += ".subFoodWeb";
2132
2133 outFile = params.out_prefix;
2134 outFile += ".pda";
2135
2136 //Defining the input phylo type: t - rooted/unrooted tree, n - split network
2137 params.intype=detectInputFile(params.user_file);
2138 if(params.intype == IN_NEWICK){
2139 params.eco_type = "t";
2140 } else if(params.intype == IN_NEXUS){
2141 params.eco_type = "n";
2142 }
2143
2144 // Checking whether to treat the food web as weighted or non weighted
2145 if(params.diet_max == 0){
2146 params.eco_weighted = false;
2147 }else if(params.diet_max > 100 || params.diet_max < 0){
2148 cout<<"The minimum percentage of the diet to be conserved for each predator"<<endl;
2149 cout<<"d = "<<params.diet_max<<endl;
2150 cout<<"ERROR: Wrong value of parameter d. It must be within the range 0 <= d <= 100"<<endl;
2151 exit(0);
2152 }else{
2153 params.eco_weighted = true;
2154 }
2155
2156 if(strcmp(params.eco_type,"t")==0){
2157 /*--------------------------------- EcoPD Trees ---------------------------------*/
2158 ECOpd tree(params.user_file,params.is_rooted);
2159
2160 // Setting all the information-----------------
2161 tree.phyloType = "t";
2162 tree.TaxaNUM = tree.leafNum;
2163 if(verbose_mode == VB_MAX){
2164 cout<<"TaxaNUM = "<<tree.TaxaNUM<<endl;
2165 cout<<"LeafNUM = "<<tree.leafNum<<endl;
2166 cout<<"root_id = "<<tree.root->id<<" root_name = "<<tree.root->name<<endl;
2167
2168 for(i=0; i<tree.leafNum; i++){
2169 cout<<i<<" "<<tree.findNodeID(i)->name <<endl;
2170 }
2171 }
2172
2173 //Getting Species Names from tree
2174 for(i = 0; i < tree.TaxaNUM; i++)
2175 (tree.phyloNames).push_back(tree.findNodeID(i)->name);
2176 //for(i=0;i<tree.phyloNames.size();i++)
2177 // cout<<"["<<i<<"] "<<tree.phyloNames[i]<<endl;
2178
2179 // Full species list including info from tree and food web. Here adding names from phyloInput.
2180 for(i=0; i<tree.TaxaNUM; i++)
2181 tree.names.push_back(&(tree.phyloNames[i]));
2182
2183 // Read the taxa to be included in the final optimal subset
2184 if(params.initial_file)
2185 tree.readInitialTaxa(params.initial_file);
2186
2187 // Read the DAG file, Synchronize species on the Tree and in the Food Web
2188 tree.weighted = params.eco_weighted;
2189 tree.T = params.diet_max*0.01;
2190 tree.readDAG(params.eco_dag_file);
2191 tree.defineK(params);
2192
2193 // IP formulation
2194 cout<<"Formulating an IP problem..."<<endl;
2195 if(tree.rooted){
2196 tree.printECOlpRooted(model_file.c_str(),tree);
2197 } else {
2198 tree.printECOlpUnrooted(model_file.c_str(),tree);
2199 }
2200
2201 // Solve IP problem
2202 cout<<"Solving the problem..."<<endl;
2203 variables = new double[tree.nvar];
2204 int g_return = gurobi_solve((char*)model_file.c_str(), tree.nvar, &score, variables, verbose_mode, threads);
2205 if(verbose_mode == VB_MAX){
2206 cout<<"GUROBI finished with "<<g_return<<" return."<<endl;
2207 for(i=0; i<tree.nvar; i++)
2208 cout<<"x"<<i<<" = "<<variables[i]<<endl;
2209 cout<<"score = "<<score<<endl;
2210 }
2211 tree.dietConserved(variables);
2212 params.run_time = getCPUTime() - startTime;
2213 tree.printResults((char*)outFile.c_str(),variables,score,params);
2214 tree.printSubFoodWeb((char*)subFoodWeb.c_str(),variables);
2215 delete[] variables;
2216
2217 } else if(strcmp(params.eco_type,"n")==0){
2218 /*----------------------------- EcoPD SplitNetwork ------------------------------*/
2219 params.intype=detectInputFile(params.user_file);
2220 PDNetwork splitSYS(params);
2221 ECOpd ecoInfDAG;
2222
2223 // Get the species names from SplitNetwork
2224 splitSYS.speciesList(&(ecoInfDAG.phyloNames));
2225 //for(i=0;i<ecoInfDAG.phyloNames.size();i++)
2226 // cout<<"["<<i<<"] "<<ecoInfDAG.phyloNames[i]<<endl;
2227
2228 ecoInfDAG.phyloType = "n";
2229 ecoInfDAG.TaxaNUM = splitSYS.getNTaxa();
2230
2231 // Full species list including info from tree and food web
2232 for(i=0; i<ecoInfDAG.TaxaNUM; i++)
2233 ecoInfDAG.names.push_back(&(ecoInfDAG.phyloNames[i]));
2234
2235 ecoInfDAG.weighted = params.eco_weighted;
2236 // Read the taxa to be included in the final optimal subset
2237 if(params.initial_file)
2238 ecoInfDAG.readInitialTaxa(params.initial_file);
2239 ecoInfDAG.T = params.diet_max*0.01;
2240 ecoInfDAG.readDAG(params.eco_dag_file);
2241 ecoInfDAG.defineK(params);
2242
2243 cout<<"Formulating an IP problem..."<<endl;
2244 splitSYS.transformEcoLP(params, model_file.c_str(), 0);
2245 /**
2246 * (subset_size-4) - influences constraints for conserved splits.
2247 * should be less than taxaNUM in the split system.
2248 * With 0 prints all the constraints.
2249 * Values different of 0 reduce the # of constraints.
2250 **/
2251
2252 ecoInfDAG.printInfDAG(model_file.c_str(),splitSYS,params);
2253 cout<<"Solving the problem..."<<endl;
2254 variables = new double[ecoInfDAG.nvar];
2255 int g_return = gurobi_solve((char*)model_file.c_str(), ecoInfDAG.nvar, &score, variables, verbose_mode, threads);
2256 if(verbose_mode == VB_MAX){
2257 cout<<"GUROBI finished with "<<g_return<<" return."<<endl;
2258 for(i=0; i<ecoInfDAG.nvar; i++)
2259 cout<<"x"<<i<<" = "<<variables[i]<<endl;
2260 cout<<"score = "<<score<<endl;
2261 }
2262 ecoInfDAG.splitsNUM = splitSYS.getNSplits();
2263 ecoInfDAG.totalSD = splitSYS.calcWeight();
2264 ecoInfDAG.dietConserved(variables);
2265 params.run_time = getCPUTime() - startTime;
2266 ecoInfDAG.printResults((char*)outFile.c_str(),variables, score,params);
2267 ecoInfDAG.printSubFoodWeb((char*)subFoodWeb.c_str(),variables);
2268 delete[] variables;
2269 }
2270 }
2271
collapseLowBranchSupport(char * user_file,char * split_threshold_str)2272 void collapseLowBranchSupport(char *user_file, char *split_threshold_str) {
2273 DoubleVector minsup;
2274 convert_double_vec(split_threshold_str, minsup, '/');
2275 if (minsup.empty())
2276 outError("wrong -minsupnew argument, please use back-slash separated string");
2277 MExtTree tree;
2278 bool isrooted = false;
2279 tree.readTree(user_file, isrooted);
2280 tree.collapseLowBranchSupport(minsup);
2281 tree.collapseZeroBranches(NULL, NULL, -1.0);
2282 if (verbose_mode >= VB_MED)
2283 tree.drawTree(cout);
2284 string outfile = (string)user_file + ".collapsed";
2285 tree.printTree(outfile.c_str());
2286 cout << "Tree with collapsed branches written to " << outfile << endl;
2287 }
2288
2289
2290 /********************************************************
2291 main function
2292 ********************************************************/
2293 /*
2294 int main(){
2295 IQTree tree;
2296 char * str = "(1, (2, 345));";
2297 string k;
2298 tree.pllConvertTaxaID2IQTreeForm(str, k);
2299 cout << str << endl;
2300 cout << k << endl;
2301 cout << "WHAT" << endl;
2302 return 0;
2303 }
2304 */
2305
2306
main(int argc,char * argv[])2307 int main(int argc, char *argv[]) {
2308
2309 /*
2310 Instruction set ID reported by vectorclass::instrset_detect
2311 0 = 80386 instruction set
2312 1 or above = SSE (XMM) supported by CPU (not testing for O.S. support)
2313 2 or above = SSE2
2314 3 or above = SSE3
2315 4 or above = Supplementary SSE3 (SSSE3)
2316 5 or above = SSE4.1
2317 6 or above = SSE4.2
2318 7 or above = AVX supported by CPU and operating system
2319 8 or above = AVX2
2320 9 or above = AVX512F
2321 */
2322 int instruction_set;
2323
2324 MPIHelper::getInstance().init(argc, argv);
2325
2326 atexit(funcExit);
2327
2328 /*************************/
2329 { /* local scope */
2330 int found = FALSE; /* "click" found in cmd name? */
2331 int n, dummyint;
2332 char *tmpstr;
2333 int intargc;
2334 char **intargv;
2335 intargc = 0;
2336 intargv = NULL;
2337
2338 for (n = strlen(argv[0]) - 5;
2339 (n >= 0) && !found && (argv[0][n] != '/')
2340 && (argv[0][n] != '\\'); n--) {
2341
2342 tmpstr = &(argv[0][n]);
2343 dummyint = 0;
2344 (void) sscanf(tmpstr, "click%n", &dummyint);
2345 if (dummyint == 5) found = TRUE;
2346 else {
2347 dummyint = 0;
2348 (void) sscanf(tmpstr, "CLICK%n", &dummyint);
2349 if (dummyint == 5) found = TRUE;
2350 else {
2351 dummyint = 0;
2352 (void) sscanf(tmpstr, "Click%n", &dummyint);
2353 if (dummyint == 5) found = TRUE;
2354 }
2355 }
2356 }
2357 if (found) _exit_wait_optn = TRUE;
2358
2359 if (_exit_wait_optn) { // get commandline parameters from keyboard
2360 getintargv(&intargc, &intargv);
2361 fprintf(stdout, "\n\n");
2362 if (intargc > 1) { // if there were option entered, use them as argc/argv
2363 argc = intargc;
2364 argv = intargv;
2365 }
2366 }
2367 } /* local scope */
2368 /*************************/
2369
2370 parseArg(argc, argv, Params::getInstance());
2371
2372 // 2015-12-05
2373 Checkpoint *checkpoint = new Checkpoint;
2374 string filename = (string)Params::getInstance().out_prefix +".ckp.gz";
2375 checkpoint->setFileName(filename);
2376
2377 bool append_log = false;
2378
2379 if (!Params::getInstance().ignore_checkpoint && fileExists(filename)) {
2380 checkpoint->load();
2381 if (checkpoint->hasKey("finished")) {
2382 if (checkpoint->getBool("finished")) {
2383 if (Params::getInstance().force_unfinished) {
2384 if (MPIHelper::getInstance().isMaster())
2385 cout << "NOTE: Continue analysis although a previous run already finished" << endl;
2386 } else {
2387 delete checkpoint;
2388 if (MPIHelper::getInstance().isMaster())
2389 outError("Checkpoint (" + filename + ") indicates that a previous run successfully finished\n" +
2390 "Use `-redo` option if you really want to redo the analysis and overwrite all output files.\n" +
2391 "Use `--redo-tree` option if you want to restore ModelFinder and only redo tree search.\n" +
2392 "Use `--undo` option if you want to continue previous run when changing/adding options."
2393 );
2394 else
2395 exit(EXIT_SUCCESS);
2396 exit(EXIT_FAILURE);
2397 }
2398 } else {
2399 append_log = true;
2400 }
2401 } else {
2402 if (MPIHelper::getInstance().isMaster())
2403 outWarning("Ignore invalid checkpoint file " + filename);
2404 checkpoint->clear();
2405 }
2406 }
2407
2408 // after loading, workers are not allowed to write checkpoint anymore
2409 if (MPIHelper::getInstance().isWorker())
2410 checkpoint->setFileName("");
2411
2412 _log_file = Params::getInstance().out_prefix;
2413 _log_file += ".log";
2414 startLogFile(append_log);
2415 time_t start_time;
2416
2417 if (append_log) {
2418 cout << endl << "******************************************************"
2419 << endl << "CHECKPOINT: Resuming analysis from " << filename << endl << endl;
2420 }
2421
2422 MPIHelper::getInstance().syncRandomSeed();
2423
2424 signal(SIGABRT, &funcAbort);
2425 signal(SIGFPE, &funcAbort);
2426 signal(SIGILL, &funcAbort);
2427 signal(SIGSEGV, &funcAbort);
2428 #if !defined WIN32 && !defined _WIN32 && !defined __WIN32__
2429 signal(SIGBUS, &funcAbort);
2430 #endif
2431 printCopyright(cout);
2432
2433 /*
2434 double x=1e-100;
2435 double y=1e-101;
2436 if (x > y) cout << "ok!" << endl;
2437 else cout << "shit!" << endl;
2438 */
2439 //FILE *pfile = popen("hostname","r");
2440 char hostname[100];
2441 #if defined WIN32 || defined _WIN32 || defined __WIN32__
2442 WSADATA wsaData;
2443 WSAStartup(MAKEWORD(2, 2), &wsaData);
2444 gethostname(hostname, sizeof(hostname));
2445 WSACleanup();
2446 #else
2447 gethostname(hostname, sizeof(hostname));
2448 #endif
2449 //fgets(hostname, sizeof(hostname), pfile);
2450 //pclose(pfile);
2451
2452 instruction_set = instrset_detect();
2453 #if defined(BINARY32) || defined(__NOAVX__)
2454 instruction_set = min(instruction_set, (int)LK_SSE42);
2455 #endif
2456 if (instruction_set < LK_SSE2) outError("Your CPU does not support SSE2!");
2457 bool has_fma3 = (instruction_set >= LK_AVX) && hasFMA3();
2458
2459 #ifdef __FMA__
2460 bool has_fma = has_fma3;
2461 if (!has_fma) {
2462 outError("Your CPU does not support FMA instruction, quiting now...");
2463 }
2464 #endif
2465
2466 cout << "Host: " << hostname << " (";
2467 switch (instruction_set) {
2468 case 0: cout << "x86, "; break;
2469 case 1: cout << "SSE, "; break;
2470 case 2: cout << "SSE2, "; break;
2471 case 3: cout << "SSE3, "; break;
2472 case 4: cout << "SSSE3, "; break;
2473 case 5: cout << "SSE4.1, "; break;
2474 case 6: cout << "SSE4.2, "; break;
2475 case 7: cout << "AVX, "; break;
2476 case 8: cout << "AVX2, "; break;
2477 default: cout << "AVX512, "; break;
2478 }
2479 if (has_fma3) cout << "FMA3, ";
2480 // if (has_fma4) cout << "FMA4, ";
2481 //#if defined __APPLE__ || defined __MACH__
2482 cout << (int)(((getMemorySize()/1024.0)/1024)/1024) << " GB RAM)" << endl;
2483 //#else
2484 // cout << (int)(((getMemorySize()/1000.0)/1000)/1000) << " GB RAM)" << endl;
2485 //#endif
2486
2487 cout << "Command:";
2488 int i;
2489 for (i = 0; i < argc; i++)
2490 cout << " " << argv[i];
2491 cout << endl;
2492
2493 checkpoint->get("iqtree.seed", Params::getInstance().ran_seed);
2494 cout << "Seed: " << Params::getInstance().ran_seed << " ";
2495 init_random(Params::getInstance().ran_seed + MPIHelper::getInstance().getProcessID(), true);
2496
2497 time(&start_time);
2498 cout << "Time: " << ctime(&start_time);
2499
2500 // increase instruction set level with FMA
2501 if (has_fma3 && instruction_set < LK_AVX_FMA)
2502 instruction_set = LK_AVX_FMA;
2503
2504 Params::getInstance().SSE = min(Params::getInstance().SSE, (LikelihoodKernel)instruction_set);
2505
2506 cout << "Kernel: ";
2507
2508 if (Params::getInstance().lk_safe_scaling) {
2509 cout << "Safe ";
2510 }
2511
2512 if (Params::getInstance().pll) {
2513 #ifdef __AVX__
2514 cout << "PLL-AVX";
2515 #else
2516 cout << "PLL-SSE3";
2517 #endif
2518 } else {
2519 if (Params::getInstance().SSE >= LK_AVX512)
2520 cout << "AVX-512";
2521 else if (Params::getInstance().SSE >= LK_AVX_FMA) {
2522 cout << "AVX+FMA";
2523 } else if (Params::getInstance().SSE >= LK_AVX) {
2524 cout << "AVX";
2525 } else if (Params::getInstance().SSE >= LK_SSE2) {
2526 cout << "SSE2";
2527 } else
2528 cout << "x86";
2529 }
2530
2531 #ifdef _OPENMP
2532 if (Params::getInstance().num_threads >= 1) {
2533 omp_set_num_threads(Params::getInstance().num_threads);
2534 Params::getInstance().num_threads = omp_get_max_threads();
2535 }
2536 // int max_threads = omp_get_max_threads();
2537 int max_procs = countPhysicalCPUCores();
2538 cout << " - ";
2539 if (Params::getInstance().num_threads > 0)
2540 cout << Params::getInstance().num_threads << " threads";
2541 else
2542 cout << "auto-detect threads";
2543 cout << " (" << max_procs << " CPU cores detected)";
2544 if (Params::getInstance().num_threads > max_procs) {
2545 cout << endl;
2546 outError("You have specified more threads than CPU cores available");
2547 }
2548 omp_set_nested(false); // don't allow nested OpenMP parallelism
2549 #else
2550 if (Params::getInstance().num_threads != 1) {
2551 cout << endl << endl;
2552 outError("Number of threads must be 1 for sequential version.");
2553 }
2554 #endif
2555
2556 #ifdef _IQTREE_MPI
2557 cout << endl << "MPI: " << MPIHelper::getInstance().getNumProcesses() << " processes";
2558 #endif
2559
2560 int num_procs = countPhysicalCPUCores();
2561 #ifdef _OPENMP
2562 if (num_procs > 1 && Params::getInstance().num_threads == 1) {
2563 cout << endl << endl << "HINT: Use -nt option to specify number of threads because your CPU has " << num_procs << " cores!";
2564 cout << endl << "HINT: -nt AUTO will automatically determine the best number of threads to use.";
2565 }
2566 #else
2567 if (num_procs > 1)
2568 cout << endl << endl << "NOTE: Consider using the multicore version because your CPU has " << num_procs << " cores!";
2569 #endif
2570
2571 //cout << "sizeof(int)=" << sizeof(int) << endl;
2572 cout << endl << endl;
2573
2574 cout.precision(3);
2575 cout.setf(ios::fixed);
2576
2577 // checkpoint general run information
2578 checkpoint->startStruct("iqtree");
2579 string command;
2580
2581 if (CKP_RESTORE_STRING(command)) {
2582 // compare command between saved and current commands
2583 stringstream ss(command);
2584 string str;
2585 bool mismatch = false;
2586 for (i = 1; i < argc; i++) {
2587 if (!(ss >> str)) {
2588 outWarning("Number of command-line arguments differs from checkpoint");
2589 mismatch = true;
2590 break;
2591 }
2592 if (str != argv[i]) {
2593 outWarning((string)"Command-line argument `" + argv[i] + "` differs from checkpoint `" + str + "`");
2594 mismatch = true;
2595 }
2596 }
2597 if (mismatch) {
2598 outWarning("Command-line differs from checkpoint!");
2599 }
2600 command = "";
2601 }
2602
2603 for (i = 1; i < argc; i++)
2604 command += string(" ") + argv[i];
2605 CKP_SAVE(command);
2606 int seed = Params::getInstance().ran_seed;
2607 CKP_SAVE(seed);
2608 CKP_SAVE(start_time);
2609
2610 // check for incompatible version
2611 string version;
2612 stringstream sversion;
2613 sversion << iqtree_VERSION_MAJOR << "." << iqtree_VERSION_MINOR << iqtree_VERSION_PATCH;
2614 version = sversion.str();
2615 CKP_SAVE(version);
2616 checkpoint->endStruct();
2617
2618 if (MPIHelper::getInstance().getNumProcesses() > 1) {
2619 if (Params::getInstance().aln_file || Params::getInstance().partition_file) {
2620 runPhyloAnalysis(Params::getInstance(), checkpoint);
2621 } else {
2622 outError("Please use one MPI process! The feature you wanted does not need parallelization.");
2623 }
2624 } else
2625 // call the main function
2626 if (Params::getInstance().tree_gen != NONE) {
2627 generateRandomTree(Params::getInstance());
2628 } else if (Params::getInstance().do_pars_multistate) {
2629 doParsMultiState(Params::getInstance());
2630 } else if (Params::getInstance().rf_dist_mode != 0) {
2631 computeRFDist(Params::getInstance());
2632 } else if (Params::getInstance().test_input != TEST_NONE) {
2633 Params::getInstance().intype = detectInputFile(Params::getInstance().user_file);
2634 testInputFile(Params::getInstance());
2635 } else if (Params::getInstance().run_mode == PRINT_TAXA) {
2636 printTaxa(Params::getInstance());
2637 } else if (Params::getInstance().run_mode == PRINT_AREA) {
2638 printAreaList(Params::getInstance());
2639 } else if (Params::getInstance().run_mode == SCALE_BRANCH_LEN || Params::getInstance().run_mode == SCALE_NODE_NAME) {
2640 scaleBranchLength(Params::getInstance());
2641 } else if (Params::getInstance().run_mode == PD_DISTRIBUTION) {
2642 calcDistribution(Params::getInstance());
2643 } else if (Params::getInstance().run_mode == STATS){ /**MINH ANH: for some statistics on the input tree*/
2644 branchStats(Params::getInstance()); // MA
2645 } else if (Params::getInstance().branch_cluster > 0) {
2646 calcTreeCluster(Params::getInstance());
2647 } else if (Params::getInstance().ncbi_taxid) {
2648 processNCBITree(Params::getInstance());
2649 } else if (Params::getInstance().user_file && Params::getInstance().eco_dag_file) { /**ECOpd analysis*/
2650 processECOpd(Params::getInstance());
2651 } else if ((Params::getInstance().aln_file || Params::getInstance().partition_file) &&
2652 Params::getInstance().consensus_type != CT_ASSIGN_SUPPORT_EXTENDED)
2653 {
2654 if ((Params::getInstance().siteLL_file || Params::getInstance().second_align) && !Params::getInstance().gbo_replicates)
2655 {
2656 if (Params::getInstance().siteLL_file)
2657 guidedBootstrap(Params::getInstance());
2658 if (Params::getInstance().second_align)
2659 computeMulProb(Params::getInstance());
2660 } else {
2661 runPhyloAnalysis(Params::getInstance(), checkpoint);
2662 }
2663 // } else if (Params::getInstance().ngs_file || Params::getInstance().ngs_mapped_reads) {
2664 // runNGSAnalysis(Params::getInstance());
2665 // } else if (Params::getInstance().pdtaxa_file && Params::getInstance().gene_scale_factor >=0.0 && Params::getInstance().gene_pvalue_file) {
2666 // runGSSAnalysis(Params::getInstance());
2667 } else if (Params::getInstance().consensus_type != CT_NONE) {
2668 MExtTree tree;
2669 switch (Params::getInstance().consensus_type) {
2670 case CT_CONSENSUS_TREE:
2671 computeConsensusTree(Params::getInstance().user_file, Params::getInstance().tree_burnin, Params::getInstance().tree_max_count, Params::getInstance().split_threshold,
2672 Params::getInstance().split_weight_threshold, Params::getInstance().out_file, Params::getInstance().out_prefix, Params::getInstance().tree_weight_file, &Params::getInstance());
2673 break;
2674 case CT_CONSENSUS_NETWORK:
2675 computeConsensusNetwork(Params::getInstance().user_file, Params::getInstance().tree_burnin, Params::getInstance().tree_max_count, Params::getInstance().split_threshold,
2676 Params::getInstance().split_weight_summary, Params::getInstance().split_weight_threshold, Params::getInstance().out_file, Params::getInstance().out_prefix, Params::getInstance().tree_weight_file);
2677 break;
2678 case CT_ASSIGN_SUPPORT:
2679 assignBootstrapSupport(Params::getInstance().user_file, Params::getInstance().tree_burnin, Params::getInstance().tree_max_count,
2680 Params::getInstance().second_tree, Params::getInstance().is_rooted, Params::getInstance().out_file,
2681 Params::getInstance().out_prefix, tree, Params::getInstance().tree_weight_file, &Params::getInstance());
2682 break;
2683 case CT_ASSIGN_SUPPORT_EXTENDED:
2684 assignBranchSupportNew(Params::getInstance());
2685 break;
2686 case CT_NONE: break;
2687 /**MINH ANH: for some comparison*/
2688 case COMPARE: compare(Params::getInstance()); break; //MA
2689 }
2690 } else if (Params::getInstance().split_threshold_str) {
2691 // for Ricardo: keep those splits from input tree above given support threshold
2692 collapseLowBranchSupport(Params::getInstance().user_file, Params::getInstance().split_threshold_str);
2693 } else {
2694 Params::getInstance().intype = detectInputFile(Params::getInstance().user_file);
2695 if (Params::getInstance().intype == IN_NEWICK && Params::getInstance().pdtaxa_file && Params::getInstance().tree_gen == NONE) {
2696 if (Params::getInstance().budget_file) {
2697 //if (Params::getInstance().budget < 0) Params::getInstance().run_mode = PD_USER_SET;
2698 } else {
2699 if (Params::getInstance().sub_size < 1 && Params::getInstance().pd_proportion == 0.0)
2700 Params::getInstance().run_mode = PD_USER_SET;
2701 }
2702 // input is a tree, check if it is a reserve selection -> convert to splits
2703 if (Params::getInstance().run_mode != PD_USER_SET) Params::getInstance().multi_tree = true;
2704 }
2705
2706
2707 if (Params::getInstance().intype == IN_NEWICK && !Params::getInstance().find_all && Params::getInstance().budget_file == NULL &&
2708 Params::getInstance().find_pd_min == false && Params::getInstance().calc_pdgain == false &&
2709 Params::getInstance().run_mode != LINEAR_PROGRAMMING && Params::getInstance().multi_tree == false)
2710 runPDTree(Params::getInstance());
2711 else if (Params::getInstance().intype == IN_NEXUS || Params::getInstance().intype == IN_NEWICK) {
2712 if (Params::getInstance().run_mode == LINEAR_PROGRAMMING && Params::getInstance().find_pd_min)
2713 outError("Current linear programming does not support finding minimal PD sets!");
2714 if (Params::getInstance().find_all && Params::getInstance().run_mode == LINEAR_PROGRAMMING)
2715 Params::getInstance().binary_programming = true;
2716 runPDSplit(Params::getInstance());
2717 } else {
2718 outError("Unknown file input format");
2719 }
2720 }
2721
2722 time(&start_time);
2723 cout << "Date and Time: " << ctime(&start_time);
2724 delete checkpoint;
2725
2726 finish_random();
2727
2728 return EXIT_SUCCESS;
2729 }
2730