1 /***************************************************************************
2  *   Copyright (C) 2009 by BUI Quang Minh   *
3  *   minh.bui@univie.ac.at   *
4  *                                                                         *
5  *   This program is free software; you can redistribute it and/or modify  *
6  *   it under the terms of the GNU General Public License as published by  *
7  *   the Free Software Foundation; either version 2 of the License, or     *
8  *   (at your option) any later version.                                   *
9  *                                                                         *
10  *   This program is distributed in the hope that it will be useful,       *
11  *   but WITHOUT ANY WARRANTY; without even the implied warranty of        *
12  *   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the         *
13  *   GNU General Public License for more details.                          *
14  *                                                                         *
15  *   You should have received a copy of the GNU General Public License     *
16  *   along with this program; if not, write to the                         *
17  *   Free Software Foundation, Inc.,                                       *
18  *   59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.             *
19  ***************************************************************************/
20 
21 #include <stdarg.h>
22 #include "superalignment.h"
23 #include "nclextra/msetsblock.h"
24 #include "nclextra/myreader.h"
25 #include "main/phylotesting.h"
26 
createAlignment(string aln_file,const char * sequence_type,InputType intype,string model_name)27 Alignment *createAlignment(string aln_file, const char *sequence_type, InputType intype, string model_name) {
28     bool is_dir = isDirectory(aln_file.c_str());
29 
30     if (!is_dir && aln_file.find(',') == string::npos)
31         return new Alignment((char*)aln_file.c_str(), (char*)sequence_type, intype, model_name);
32 
33     SuperAlignment *super_aln = new SuperAlignment;
34     if (is_dir)
35         super_aln->readPartitionDir(aln_file, (char*)sequence_type, intype, model_name, true);
36     else
37         super_aln->readPartitionList(aln_file, (char*)sequence_type, intype, model_name, true);
38     super_aln->init();
39     Alignment *aln = super_aln->concatenateAlignments();
40     if (aln->isSuperAlignment())
41         outError("Cannot concatenate alignments of different data type ", aln_file);
42     delete super_aln;
43     return aln;
44 }
45 
SuperAlignment()46 SuperAlignment::SuperAlignment() : Alignment() {
47     max_num_states = 0;
48 }
49 
SuperAlignment(Params & params)50 SuperAlignment::SuperAlignment(Params &params) : Alignment()
51 {
52     readFromParams(params);
53 
54     init();
55 
56     cout << "Degree of missing data: " << computeMissingData() << endl;
57 
58 #ifdef _OPENMP
59     if (params.num_threads > partitions.size()) {
60         cout << "Info: multi-threading strategy over alignment sites" << endl;
61     } else {
62         cout << "Info: multi-threading strategy over partitions" << endl;
63     }
64 #endif
65     cout << endl;
66 
67 }
68 
readFromParams(Params & params)69 void SuperAlignment::readFromParams(Params &params) {
70     if (isDirectory(params.partition_file)) {
71         // reading all files in the directory
72         readPartitionDir(params.partition_file, params.sequence_type, params.intype, params.model_name, params.remove_empty_seq);
73     } else if (strstr(params.partition_file, ",") != nullptr) {
74         // reading all files in a comma-separated list
75         readPartitionList(params.partition_file, params.sequence_type, params.intype, params.model_name, params.remove_empty_seq);
76     } else {
77         cout << "Reading partition model file " << params.partition_file << " ..." << endl;
78         if (detectInputFile(params.partition_file) == IN_NEXUS) {
79             readPartitionNexus(params);
80             if (partitions.empty()) {
81                 outError("No partition found in SETS block. An example syntax looks like: \n#nexus\nbegin sets;\n  charset part1=1-100;\n  charset part2=101-300;\nend;");
82             }
83         } else
84             readPartitionRaxml(params);
85     }
86     if (partitions.empty())
87         outError("No partition found");
88 
89     // check for duplicated partition names
90     unordered_set<string> part_names;
91     for (auto pit = partitions.begin(); pit != partitions.end(); pit++) {
92         if (part_names.find((*pit)->name) != part_names.end())
93             outError("Duplicated partition name ", (*pit)->name);
94         part_names.insert((*pit)->name);
95     }
96 
97     if (params.subsampling != 0) {
98         // sumsample a number of partitions
99         int subsample = params.subsampling;
100         if (abs(subsample) >= partitions.size())
101             outError("--subsample must be between -" + convertIntToString(partitions.size()-1) + " and " + convertIntToString(partitions.size()-1));
102         cout << "Random subsampling " << ((subsample > 0) ? subsample : partitions.size() + subsample)
103              << " partitions (seed: " << params.subsampling_seed <<  ")..." << endl;
104         int *rstream;
105         init_random(params.subsampling_seed, false, &rstream);
106         // make sure to sub-sample exact number
107         vector<bool> sample;
108         int i;
109         sample.resize(partitions.size(), false);
110         for (int num = 0; num < abs(subsample); ) {
111             i = random_int(sample.size(), rstream);
112             if (!sample[i]) {
113                 sample[i] = true;
114                 num++;
115             }
116         }
117         finish_random(rstream);
118         if (subsample < 0) {
119             // reverse sampling
120             for (i = 0; i < sample.size(); i++)
121                 sample[i] = !sample[i];
122         }
123         vector<Alignment*> keep_partitions;
124         for (i = 0; i < sample.size(); i++)
125             if (sample[i])
126                 keep_partitions.push_back(partitions[i]);
127         // now replace partitions
128         partitions = keep_partitions;
129     }
130 
131     // Initialize the counter for evaluated NNIs on subtrees
132     cout << "Subset\tType\tSeqs\tSites\tInfor\tInvar\tModel\tName" << endl;
133     int part = 0;
134     for (auto it = partitions.begin(); it != partitions.end(); it++, part++) {
135         cout << part+1 << "\t" << (*it)->sequence_type << "\t" << (*it)->getNSeq()
136         << "\t" << (*it)->getNSite() << "\t" << (*it)->num_informative_sites
137         << "\t" << (*it)->getNSite()-(*it)->num_variant_sites << "\t"
138         << (*it)->model_name << "\t" << (*it)->name << endl;
139         if ((*it)->num_variant_sites == 0) {
140             outWarning("No variant sites in partition " + (*it)->name);
141         } else if ((*it)->num_informative_sites == 0) {
142             outWarning("No parsimony-informative sites in partition " + (*it)->name);
143         }
144     }
145 }
146 
init(StrVector * sequence_names)147 void SuperAlignment::init(StrVector *sequence_names) {
148     // start original code
149 
150     max_num_states = 0;
151 	// first build taxa_index and partitions
152 	int site, seq, nsite = partitions.size();
153 
154     // BUG FIX 2016-11-29: when merging partitions with -m TESTMERGE, sequence order is changed
155     // get the taxa names from existing tree
156     if (sequence_names && !sequence_names->empty()) {
157         seq_names = *sequence_names;
158         taxa_index.resize(seq_names.size());
159         for (auto i = taxa_index.begin(); i != taxa_index.end(); i++)
160             i->resize(nsite, -1);
161     }
162 
163     site = 0;
164 	for (auto it = partitions.begin(); it != partitions.end(); it++, site++) {
165 //        partitions.push_back((*it)->aln);
166 		int nseq = (*it)->getNSeq();
167 		//cout << "nseq  = " << nseq << endl;
168 		for (seq = 0; seq < nseq; seq++) {
169 			int id = getSeqID((*it)->getSeqName(seq));
170 			if (id < 0) {
171 				seq_names.push_back((*it)->getSeqName(seq));
172 				id = seq_names.size()-1;
173 				IntVector vec(nsite, -1);
174 				vec[site] = seq;
175 				taxa_index.push_back(vec);
176 			} else
177 				taxa_index[id][site] = seq;
178 		}
179 	}
180 	// now the patterns of sequence-genes presence/absence
181 	buildPattern();
182 }
183 
buildPattern()184 void SuperAlignment::buildPattern() {
185 	int site, seq, nsite = partitions.size();
186 
187 	seq_type = SEQ_BINARY;
188 	num_states = 2; // binary type because the super alignment presents the presence/absence of taxa in the partitions
189 	STATE_UNKNOWN = 2;
190 	site_pattern.resize(nsite, -1);
191 	clear();
192 	pattern_index.clear();
193 	VerboseMode save_mode = verbose_mode;
194 	verbose_mode = min(verbose_mode, VB_MIN); // to avoid printing gappy sites in addPattern
195 	int nseq = getNSeq();
196 	for (site = 0; site < nsite; site++) {
197  		Pattern pat;
198  		pat.resize(nseq, 0);
199 		for (seq = 0; seq < nseq; seq++)
200 			pat[seq] = (taxa_index[seq][site] >= 0)? 1 : 0;
201 		addPattern(pat, site);
202 	}
203 	verbose_mode = save_mode;
204 	countConstSite();
205 //    buildSeqStates();
206 }
207 
readPartition(Params & params)208 void SuperAlignment::readPartition(Params &params) {
209     try {
210         ifstream in;
211         in.exceptions(ios::failbit | ios::badbit);
212         in.open(params.partition_file);
213         in.exceptions(ios::badbit);
214 
215         while (!in.eof()) {
216             CharSet info;
217             getline(in, info.name, ',');
218             if (in.eof()) break;
219             getline(in, info.model_name, ',');
220             if (model_name == "") info.model_name = params.model_name;
221             getline(in, info.aln_file, ',');
222             if (info.aln_file == "" && params.aln_file) info.aln_file = params.aln_file;
223             getline(in, info.sequence_type, ',');
224             if (info.sequence_type=="" && params.sequence_type)
225                 info.sequence_type = params.sequence_type;
226             safeGetline(in, info.position_spec);
227             trimString(info.sequence_type);
228             //            cout << endl << "Reading partition " << info.name << " (model=" << info.model_name << ", aln=" <<
229             //                    info.aln_file << ", seq=" << info.sequence_type << ", pos=" << ((info.position_spec.length() >= 20) ? info.position_spec.substr(0,20)+"..." : info.position_spec) << ") ..." << endl;
230 
231             // TODO move this to supertree
232 //            info.nniMoves[0].ptnlh = NULL;
233 //            info.nniMoves[1].ptnlh = NULL;
234 //            info.cur_ptnlh = NULL;
235 //            part_info.push_back(info);
236             Alignment *part_aln = createAlignment(info.aln_file, info.sequence_type.c_str(), params.intype, info.model_name);
237             if (!info.position_spec.empty()) {
238                 Alignment *new_aln = new Alignment();
239                 new_aln->extractSites(part_aln, info.position_spec.c_str());
240                 delete part_aln;
241                 part_aln = new_aln;
242             }
243             part_aln->name = info.name;
244             part_aln->model_name = info.model_name;
245             part_aln->position_spec = info.position_spec;
246             part_aln->aln_file = info.aln_file;
247             part_aln->sequence_type = info.sequence_type;
248             partitions.push_back(part_aln);
249             // TODO move this to supertree
250 //            PhyloTree *tree = new PhyloTree(part_aln);
251 //            push_back(tree);
252         }
253 
254         in.clear();
255         // set the failbit again
256         in.exceptions(ios::failbit | ios::badbit);
257         in.close();
258     } catch(ios::failure) {
259         outError(ERR_READ_INPUT);
260     } catch (string str) {
261         outError(str);
262     }
263 
264 
265 }
266 
readPartitionRaxml(Params & params)267 void SuperAlignment::readPartitionRaxml(Params &params) {
268     try {
269         ifstream in;
270         in.exceptions(ios::failbit | ios::badbit);
271         in.open(params.partition_file);
272         in.exceptions(ios::badbit);
273 //        PartitionInfo info;
274         Alignment *input_aln = NULL;
275         if (!params.aln_file)
276             outError("Please supply an alignment with -s option");
277 
278         input_aln = createAlignment(params.aln_file, params.sequence_type, params.intype, params.model_name);
279 
280         cout << endl << "Partition file is not in NEXUS format, assuming RAxML-style partition file..." << endl;
281 
282         size_t pos = params.model_name.find_first_of("+*");
283         string rate_type = "";
284         if (pos != string::npos) rate_type = params.model_name.substr(pos);
285 
286         while (!in.eof()) {
287             CharSet info;
288             getline(in, info.model_name, ',');
289             if (in.eof()) break;
290             trimString(info.model_name);
291             //            std::transform(info.model_name.begin(), info.model_name.end(), info.model_name.begin(), ::toupper);
292 
293             bool is_ASC = info.model_name.substr(0,4) == "ASC_";
294             if (is_ASC) info.model_name.erase(0, 4);
295             StateFreqType freq = FREQ_UNKNOWN;
296             if (info.model_name.find_first_of("*+{") == string::npos ) {
297                 if (*info.model_name.rbegin() == 'F' && info.model_name != "DAYHOFF") {
298                     freq = FREQ_EMPIRICAL;
299                     info.model_name.erase(info.model_name.length()-1);
300                 } else if (*info.model_name.rbegin() == 'X' && info.model_name != "LG4X") {
301                     freq = FREQ_ESTIMATE;
302                     info.model_name.erase(info.model_name.length()-1);
303                 }
304             }
305 
306             if (info.model_name.empty())
307                 outError("Please give model names in partition file!");
308             if (info.model_name == "BIN") {
309                 info.sequence_type = "BIN";
310                 info.model_name = "GTR2";
311             } else if (info.model_name == "DNA") {
312                 info.sequence_type = "DNA";
313                 info.model_name = "GTR";
314             } else if (info.model_name == "MULTI") {
315                 info.sequence_type = "MORPH";
316                 info.model_name = "MK";
317             } else if (info.model_name.substr(0,5) == "CODON") {
318                 info.sequence_type = info.model_name;
319                 info.model_name = "GY";
320             } else {
321                 info.sequence_type = "AA";
322                 if (*info.model_name.begin() == '[') {
323                     if (*info.model_name.rbegin() != ']')
324                         outError("User-defined protein model should be [myProtenSubstitutionModelFileName]");
325                     info.model_name = info.model_name.substr(1, info.model_name.length()-2);
326                 }
327             }
328 
329             if (freq == FREQ_EMPIRICAL)
330                 info.model_name += "+F";
331             else if (freq == FREQ_ESTIMATE)
332                 info.model_name += "+FO";
333             if (is_ASC)
334                 info.model_name += "+ASC";
335             info.model_name += rate_type;
336 
337             getline(in, info.name, '=');
338             trimString(info.name);
339             if (info.name.empty())
340                 outError("Please give partition names in partition file!");
341 
342             safeGetline(in, info.position_spec);
343             trimString(info.position_spec);
344             if (info.position_spec.empty())
345                 outError("Please specify alignment positions for partition" + info.name);
346             std::replace(info.position_spec.begin(), info.position_spec.end(), ',', ' ');
347 
348             //            cout << "Reading partition " << info.name << " (model=" << info.model_name << ", seq=" << info.sequence_type << ", pos=" << ((info.position_spec.length() >= 20) ? info.position_spec.substr(0,20)+"..." : info.position_spec) << ") ..." << endl;
349 
350             // TODO to supertree
351 //            info.nniMoves[0].ptnlh = NULL;
352 //            info.nniMoves[1].ptnlh = NULL;
353 //            info.cur_ptnlh = NULL;
354 //            part_info.push_back(info);
355             Alignment *part_aln = new Alignment();
356             part_aln->extractSites(input_aln, info.position_spec.c_str());
357 
358             Alignment *new_aln;
359             if (params.remove_empty_seq)
360                 new_aln = part_aln->removeGappySeq();
361             else
362                 new_aln = part_aln;
363             // also rebuild states set of each sequence for likelihood computation
364 //            new_aln->buildSeqStates();
365 
366             if (part_aln != new_aln) delete part_aln;
367 
368             new_aln->name = info.name;
369             new_aln->model_name = info.model_name;
370             new_aln->position_spec = info.position_spec;
371             new_aln->aln_file = info.aln_file;
372             new_aln->sequence_type = info.sequence_type;
373             partitions.push_back(new_aln);
374             // TODO move to supertree
375 //            PhyloTree *tree = new PhyloTree(new_aln);
376 //            push_back(tree);
377             //            cout << new_aln->getNSeq() << " sequences and " << new_aln->getNSite() << " sites extracted" << endl;
378             //            params = origin_params;
379         }
380 
381         in.clear();
382         // set the failbit again
383         in.exceptions(ios::failbit | ios::badbit);
384         in.close();
385     } catch(ios::failure) {
386         outError(ERR_READ_INPUT);
387     } catch (string str) {
388         outError(str);
389     }
390 
391 
392 }
393 
readPartitionNexus(Params & params)394 void SuperAlignment::readPartitionNexus(Params &params) {
395 //    Params origin_params = params;
396     MSetsBlock *sets_block = new MSetsBlock();
397     NxsTaxaBlock *taxa_block = NULL;
398     NxsAssumptionsBlock *assumptions_block = NULL;
399     NxsDataBlock *data_block = NULL;
400     MyReader nexus(params.partition_file);
401     nexus.Add(sets_block);
402 
403     if (!params.aln_file) {
404         taxa_block = new NxsTaxaBlock();
405         assumptions_block = new NxsAssumptionsBlock(taxa_block);
406         data_block = new NxsDataBlock(taxa_block, assumptions_block);
407         nexus.Add(taxa_block);
408         nexus.Add(assumptions_block);
409         nexus.Add(data_block);
410     }
411 
412     MyToken token(nexus.inf);
413     nexus.Execute(token);
414 
415     Alignment *input_aln = NULL;
416     if (params.aln_file) {
417         input_aln = createAlignment(params.aln_file, params.sequence_type, params.intype, params.model_name);
418     } else {
419         if (data_block->GetNTax() > 0) {
420             input_aln = new Alignment(data_block, params.sequence_type, params.model_name);
421         }
422         delete data_block;
423         delete assumptions_block;
424         delete taxa_block;
425     }
426 
427     bool empty_partition = true;
428     vector<CharSet*>::iterator it;
429     for (it = sets_block->charsets.begin(); it != sets_block->charsets.end(); it++)
430         if ((*it)->model_name != "") {
431             empty_partition = false;
432             break;
433         }
434     if (empty_partition) {
435         cout << "NOTE: No CharPartition defined, use all CharSets" << endl;
436     }
437 
438     cout << endl << "Loading " << sets_block->charsets.size() << " partitions..." << endl;
439 
440     for (it = sets_block->charsets.begin(); it != sets_block->charsets.end(); it++)
441         if (empty_partition || (*it)->char_partition != "") {
442             if ((*it)->model_name == "")
443                 (*it)->model_name = params.model_name;
444             if ((*it)->aln_file == "" && !input_aln) {
445                 if (!(*it)->position_spec.empty()) {
446                     (*it)->aln_file = (*it)->position_spec;
447                     (*it)->position_spec = "";
448                 } else
449                     outError("No input data for partition ", (*it)->name);
450             }
451             if ((*it)->sequence_type=="" && params.sequence_type)
452                 (*it)->sequence_type = params.sequence_type;
453 
454             if ((*it)->sequence_type == "" && !(*it)->model_name.empty()) {
455                 // try to get sequence type from model
456             //TODO: why compile error?
457                 (*it)->sequence_type = detectSeqTypeName((*it)->model_name.substr(0, (*it)->model_name.find_first_of("+*")));
458             }
459             if ((*it)->aln_file == "" && ((*it)->position_spec == "" || (*it)->position_spec == "*"))
460                 outError("Empty position range for partition ", (*it)->name);
461             trimString((*it)->sequence_type);
462             //            cout << endl << "Reading partition " << info.name << " (model=" << info.model_name << ", aln=" <<
463             //                info.aln_file << ", seq=" << info.sequence_type << ", pos=" << ((info.position_spec.length() >= 20) ? info.position_spec.substr(0,20)+"..." : info.position_spec) << ") ..." << endl;
464             if ((*it)->sequence_type != "" && Alignment::getSeqType((*it)->sequence_type.c_str()) == SEQ_UNKNOWN)
465                 outError("Unknown sequence type " + (*it)->sequence_type);
466 
467             // TODO move to supertree
468 //            info.nniMoves[0].ptnlh = NULL;
469 //            info.nniMoves[1].ptnlh = NULL;
470 //            info.cur_ptnlh = NULL;
471 //            part_info.push_back(info);
472             Alignment *part_aln;
473             if ((*it)->aln_file != "") {
474                 part_aln = createAlignment((*it)->aln_file, (*it)->sequence_type.c_str(), params.intype, (*it)->model_name);
475             } else {
476                 part_aln = input_aln;
477             }
478             if (!(*it)->position_spec.empty() && (*it)->position_spec != "*") {
479                 Alignment *new_aln = new Alignment();
480                 new_aln->extractSites(part_aln, (*it)->position_spec.c_str());
481                 if (part_aln != input_aln) delete part_aln;
482                 part_aln = new_aln;
483             }
484             if (part_aln->seq_type == SEQ_DNA && ((*it)->sequence_type.substr(0, 5) == "CODON" || (*it)->sequence_type.substr(0, 5) == "NT2AA")) {
485                 Alignment *new_aln = new Alignment();
486                 new_aln->convertToCodonOrAA(part_aln, &(*it)->sequence_type[5], (*it)->sequence_type.substr(0, 5) == "NT2AA");
487                 if (part_aln != input_aln) delete part_aln;
488                 part_aln = new_aln;
489             }
490             Alignment *new_aln;
491             if (params.remove_empty_seq)
492                 new_aln = part_aln->removeGappySeq();
493             else
494                 new_aln = part_aln;
495             // also rebuild states set of each sequence for likelihood computation
496 //            new_aln->buildSeqStates();
497 
498             if (part_aln != new_aln && part_aln != input_aln) delete part_aln;
499             new_aln->name = (*it)->name;
500             new_aln->model_name = (*it)->model_name;
501             new_aln->aln_file = (*it)->aln_file;
502             new_aln->position_spec = (*it)->position_spec;
503             new_aln->sequence_type = (*it)->sequence_type;
504             new_aln->tree_len = (*it)->tree_len;
505             partitions.push_back(new_aln);
506 //            PhyloTree *tree = new PhyloTree(new_aln);
507 //            push_back(tree);
508 //            params = origin_params;
509             //            cout << new_aln->getNSeq() << " sequences and " << new_aln->getNSite() << " sites extracted" << endl;
510         }
511 
512     if (input_aln)
513         delete input_aln;
514     delete sets_block;
515 }
516 
readPartitionDir(string partition_dir,char * sequence_type,InputType & intype,string model,bool remove_empty_seq)517 void SuperAlignment::readPartitionDir(string partition_dir, char *sequence_type,
518                                       InputType &intype, string model, bool remove_empty_seq) {
519     //    Params origin_params = params;
520 
521     StrVector filenames;
522     string dir = partition_dir;
523     if (dir.back() != '/')
524         dir.append("/");
525     getFilesInDir(partition_dir.c_str(), filenames);
526     if (filenames.empty())
527         outError("No file found in ", partition_dir);
528     std::sort(filenames.begin(), filenames.end());
529     cout << "Reading " << filenames.size() << " alignment files in directory " << partition_dir << endl;
530 
531     for (auto it = filenames.begin(); it != filenames.end(); it++)
532     {
533         Alignment *part_aln;
534         part_aln = createAlignment(dir+*it, sequence_type, intype, model_name);
535 //        if (part_aln->seq_type == SEQ_DNA && (strncmp(params.sequence_type, "CODON", 5) == 0 || strncmp(params.sequence_type, "NT2AA", 5) == 0)) {
536 //            Alignment *new_aln = new Alignment();
537 //            new_aln->convertToCodonOrAA(part_aln, params.sequence_type+5, strncmp(params.sequence_type, "NT2AA", 5) == 0);
538 //            delete part_aln;
539 //            part_aln = new_aln;
540 //        }
541         Alignment *new_aln;
542         if (remove_empty_seq)
543             new_aln = part_aln->removeGappySeq();
544         else
545             new_aln = part_aln;
546         // also rebuild states set of each sequence for likelihood computation
547 //        new_aln->buildSeqStates();
548 
549         if (part_aln != new_aln) delete part_aln;
550         new_aln->name = *it;
551         new_aln->model_name = model_name;
552         new_aln->aln_file = dir + *it;
553         new_aln->position_spec = "";
554         if (sequence_type)
555             new_aln->sequence_type = sequence_type;
556         partitions.push_back(new_aln);
557     }
558 }
559 
readPartitionList(string file_list,char * sequence_type,InputType & intype,string model,bool remove_empty_seq)560 void SuperAlignment::readPartitionList(string file_list, char *sequence_type,
561     InputType &intype, string model, bool remove_empty_seq)
562 {
563     //    Params origin_params = params;
564 
565     StrVector filenames;
566     stringstream ss(file_list);
567     string token;
568     while (getline(ss, token, ','))
569         filenames.push_back(token);
570     if (filenames.empty())
571         outError("No file found in ", file_list);
572     cout << "Reading " << filenames.size() << " alignment files..." << endl;
573 
574     for (auto it = filenames.begin(); it != filenames.end(); it++)
575         {
576         Alignment *part_aln;
577         part_aln = createAlignment(*it, sequence_type, intype, model_name);
578         //        if (part_aln->seq_type == SEQ_DNA && (strncmp(params.sequence_type, "CODON", 5) == 0 || strncmp(params.sequence_type, "NT2AA", 5) == 0)) {
579         //            Alignment *new_aln = new Alignment();
580         //            new_aln->convertToCodonOrAA(part_aln, params.sequence_type+5, strncmp(params.sequence_type, "NT2AA", 5) == 0);
581         //            delete part_aln;
582         //            part_aln = new_aln;
583         //        }
584         Alignment *new_aln;
585         if (remove_empty_seq)
586             new_aln = part_aln->removeGappySeq();
587         else
588             new_aln = part_aln;
589         // also rebuild states set of each sequence for likelihood computation
590 //        new_aln->buildSeqStates();
591 
592         if (part_aln != new_aln) delete part_aln;
593         new_aln->name = *it;
594         new_aln->model_name = model_name;
595         new_aln->aln_file = *it;
596         new_aln->position_spec = "";
597         if (sequence_type)
598             new_aln->sequence_type = sequence_type;
599         partitions.push_back(new_aln);
600         }
601 }
602 
printPartition(const char * filename,const char * aln_file)603 void SuperAlignment::printPartition(const char *filename, const char *aln_file) {
604     try {
605         ofstream out;
606         out.exceptions(ios::failbit | ios::badbit);
607         out.open(filename);
608         printPartition(out, aln_file);
609         out.close();
610         cout << "Partition information was printed to " << filename << endl;
611     } catch (ios::failure &) {
612         outError(ERR_WRITE_OUTPUT, filename);
613     }
614 
615 }
616 
printPartition(ostream & out,const char * aln_file,bool append)617 void SuperAlignment::printPartition(ostream &out, const char *aln_file, bool append) {
618     if (append)
619         out << endl;
620     else
621         out << "#nexus" << endl;
622     if (aln_file)
623         out << "[ partition information for alignment written in " << aln_file <<" file ]" << endl;
624     out << "begin sets;" << endl;
625     int part; int start_site;
626     for (part = 0, start_site = 1; part < partitions.size(); part++) {
627         string name = partitions[part]->name;
628         replace(name.begin(), name.end(), '+', '_');
629         int end_site = start_site + partitions[part]->getNSite();
630         out << "  charset " << name << " = " << start_site << "-" << end_site-1 << ";" << endl;
631         start_site = end_site;
632     }
633     bool ok_model = true;
634     for (part = 0; part < partitions.size(); part++)
635         if (partitions[part]->model_name.empty()) {
636             ok_model = false;
637             break;
638         }
639     if (ok_model) {
640         out << "  charpartition mymodels =" << endl;
641         for (part = 0; part < partitions.size(); part++) {
642             string name = partitions[part]->name;
643             replace(name.begin(), name.end(), '+', '_');
644             if (part > 0) out << "," << endl;
645 //            out << "    " << at(part)->getModelNameParams() << ":" << name;
646             out << "    " << partitions[part]->model_name << ":" << name;
647         }
648         out << ";" << endl;
649     }
650     out << "end;" << endl;
651 }
652 
printBestPartition(const char * filename)653 void SuperAlignment::printBestPartition(const char *filename) {
654     try {
655         ofstream out;
656         out.exceptions(ios::failbit | ios::badbit);
657         out.open(filename);
658         out << "#nexus" << endl
659         << "begin sets;" << endl;
660         int part;
661         for (part = 0; part < partitions.size(); part++) {
662             string name = partitions[part]->name;
663             replace(name.begin(), name.end(), '+', '_');
664             out << "  charset " << name << " = ";
665             if (!partitions[part]->aln_file.empty()) out << partitions[part]->aln_file << ": ";
666             if (partitions[part]->seq_type == SEQ_CODON)
667                 out << "CODON, ";
668             string pos = partitions[part]->position_spec;
669             replace(pos.begin(), pos.end(), ',' , ' ');
670             out << pos << ";" << endl;
671         }
672         bool ok_model = true;
673         for (part = 0; part < partitions.size(); part++)
674             if (partitions[part]->model_name.empty()) {
675                 ok_model = false;
676                 break;
677             }
678         if (ok_model) {
679             out << "  charpartition mymodels =" << endl;
680             for (part = 0; part < partitions.size(); part++) {
681                 string name = partitions[part]->name;
682                 replace(name.begin(), name.end(), '+', '_');
683                 if (part > 0) out << "," << endl;
684                 out << "    " << partitions[part]->model_name << ": " << name;
685             }
686             out << ";" << endl;
687         }
688         out << "end;" << endl;
689         out.close();
690         cout << "Partition information was printed to " << filename << endl;
691     } catch (ios::failure &) {
692         outError(ERR_WRITE_OUTPUT, filename);
693     }
694 
695 }
696 
697 
printPartitionRaxml(const char * filename)698 void SuperAlignment::printPartitionRaxml(const char *filename) {
699     int part;
700 //    for (part = 0; part < partitions.size(); part++) {
701 //        if (partitions[part]->aln_file != "") {
702 //            cout << "INFO: Printing partition in RAxML format is not possible" << endl;
703 //            return;
704 //        }
705 //    }
706     try {
707         ofstream out;
708         out.exceptions(ios::failbit | ios::badbit);
709         out.open(filename);
710         int start_site;
711         for (part = 0, start_site = 1; part < partitions.size(); part++) {
712             string name = partitions[part]->name;
713             replace(name.begin(), name.end(), '+', '_');
714             int end_site = start_site + partitions[part]->getNSite();
715             switch (partitions[part]->seq_type) {
716                 case SEQ_DNA: out << "DNA, "; break;
717                 case SEQ_BINARY: out << "BIN, "; break;
718                 case SEQ_MORPH: out << "MULTI, "; break;
719                 default: out << partitions[part]->model_name << ","; break;
720             }
721             out << name << " = " << start_site << "-" << end_site-1 << endl;
722             start_site = end_site;
723         }
724         out.close();
725         cout << "Partition information in Raxml format was printed to " << filename << endl;
726     } catch (ios::failure &) {
727         outError(ERR_WRITE_OUTPUT, filename);
728     }
729 
730 }
731 
printBestPartitionRaxml(const char * filename)732 void SuperAlignment::printBestPartitionRaxml(const char *filename) {
733     int part;
734 //    for (part = 0; part < partitions.size(); part++) {
735 //        if (partitions[part]->aln_file != "") {
736 //            cout << "INFO: Printing partition in RAxML format is not possible" << endl;
737 //            return;
738 //        }
739 //    }
740     try {
741         ofstream out;
742         out.exceptions(ios::failbit | ios::badbit);
743         out.open(filename);
744         for (part = 0; part < partitions.size(); part++) {
745             string name = partitions[part]->name;
746             replace(name.begin(), name.end(), '+', '_');
747             if (partitions[part]->model_name.find("+ASC") != string::npos)
748                 out << "ASC_";
749             switch (partitions[part]->seq_type) {
750                 case SEQ_DNA: out << "DNA"; break;
751                 case SEQ_BINARY: out << "BIN"; break;
752                 case SEQ_MORPH: out << "MULTI"; break;
753                 case SEQ_PROTEIN:
754                     out << partitions[part]->model_name.substr(0, partitions[part]->model_name.find_first_of("*{+"));
755                     break;
756                 case SEQ_CODON:
757                     out << "CODON_" << partitions[part]->model_name.substr(0, partitions[part]->model_name.find_first_of("*{+"));
758                     break;
759                 default: out << partitions[part]->model_name; break;
760             }
761             if (partitions[part]->model_name.find("+FO") != string::npos)
762                 out << "X";
763             else if (partitions[part]->model_name.find("+F") != string::npos)
764                 out << "F";
765 
766             out << ", " << name << " = " << partitions[part]->position_spec << endl;
767         }
768         out.close();
769         cout << "Partition information in Raxml format was printed to " << filename << endl;
770     } catch (ios::failure &) {
771         outError(ERR_WRITE_OUTPUT, filename);
772     }
773 
774 }
775 
776 
linkSubAlignment(int part)777 void SuperAlignment::linkSubAlignment(int part) {
778 	ASSERT(taxa_index.size() == getNSeq());
779 	int nseq = getNSeq(), seq;
780 	vector<bool> checked;
781 	checked.resize(partitions[part]->getNSeq(), false);
782 	for (seq = 0; seq < nseq; seq++) {
783 		int id = partitions[part]->getSeqID(getSeqName(seq));
784 		if (id < 0)
785 			taxa_index[seq][part] = -1;
786 		else {
787 			taxa_index[seq][part] = id;
788 			checked[id] = true;
789 		}
790 	}
791 	if (verbose_mode >= VB_MED) {
792 
793 	}
794 	// sanity check that all seqnames in partition must be present in superalignment
795 	for (seq = 0; seq < checked.size(); seq++) {
796 		ASSERT(checked[seq]);
797 	}
798 }
799 
extractSubAlignment(Alignment * aln,IntVector & seq_id,int min_true_char,int min_taxa,IntVector * kept_partitions)800 void SuperAlignment::extractSubAlignment(Alignment *aln, IntVector &seq_id, int min_true_char, int min_taxa, IntVector *kept_partitions) {
801 	ASSERT(aln->isSuperAlignment());
802 	SuperAlignment *saln = (SuperAlignment*)aln;
803     name = aln->name;
804     model_name = aln->model_name;
805     sequence_type = aln->sequence_type;
806     position_spec = aln->position_spec;
807     aln_file = aln->aln_file;
808 
809     int i;
810     IntVector::iterator it;
811     for (it = seq_id.begin(); it != seq_id.end(); it++) {
812         ASSERT(*it >= 0 && *it < aln->getNSeq());
813         seq_names.push_back(aln->getSeqName(*it));
814     }
815 
816 	// BUG HERE!
817 	//Alignment::extractSubAlignment(aln, seq_id, 0);
818 
819 	taxa_index.resize(getNSeq());
820 	for (i = 0; i < getNSeq(); i++)
821 		taxa_index[i].resize(saln->partitions.size(), -1);
822 
823 	int part = 0;
824 //	partitions.resize(saln->partitions.size());
825     partitions.resize(0);
826 	for (vector<Alignment*>::iterator ait = saln->partitions.begin(); ait != saln->partitions.end(); ait++, part++) {
827 		IntVector sub_seq_id;
828 		for (IntVector::iterator it = seq_id.begin(); it != seq_id.end(); it++)
829 			if (saln->taxa_index[*it][part] >= 0)
830 				sub_seq_id.push_back(saln->taxa_index[*it][part]);
831         if (sub_seq_id.size() < min_taxa)
832             continue;
833 		Alignment *subaln = new Alignment;
834 		subaln->extractSubAlignment(*ait, sub_seq_id, 0);
835 		partitions.push_back(subaln);
836 		linkSubAlignment(partitions.size()-1);
837         if (kept_partitions) kept_partitions->push_back(part);
838 //		cout << subaln->getNSeq() << endl;
839 //		subaln->printPhylip(cout);
840 	}
841 
842     if (partitions.size() < saln->partitions.size()) {
843         for (i = 0; i < getNSeq(); i++)
844             taxa_index[i].resize(partitions.size());
845     }
846 
847 	// now build the patterns based on taxa_index
848 	buildPattern();
849 }
850 
extractPartitions(IntVector & part_id)851 SuperAlignment *SuperAlignment::extractPartitions(IntVector &part_id) {
852     SuperAlignment *newaln = new SuperAlignment;
853     newaln->name = name;
854     newaln->model_name = model_name;
855     newaln->sequence_type = sequence_type;
856     newaln->position_spec = position_spec;
857     newaln->aln_file = aln_file;
858 
859     unordered_set<string> seq_names_set;
860     IntVector::iterator it;
861     for (it = part_id.begin(); it != part_id.end(); it++) {
862         for (auto seq = partitions[*it]->seq_names.begin(); seq != partitions[*it]->seq_names.end(); seq++)
863             if (seq_names_set.find(*seq) == seq_names_set.end()) {
864                 newaln->seq_names.push_back(*seq);
865                 seq_names_set.insert(*seq);
866             }
867     }
868 
869     int i;
870     newaln->taxa_index.resize(newaln->getNSeq());
871     for (i = 0; i < newaln->getNSeq(); i++)
872         newaln->taxa_index[i].resize(part_id.size(), -1);
873 
874     int part = 0;
875     for (auto ait = part_id.begin(); ait != part_id.end(); ait++, part++) {
876         newaln->partitions.push_back(partitions[*ait]);
877         newaln->linkSubAlignment(newaln->partitions.size()-1);
878     }
879 
880     // now build the patterns based on taxa_index
881     newaln->buildPattern();
882     return newaln;
883 }
884 
removePartitions(set<int> & removed_id)885 void SuperAlignment::removePartitions(set<int> &removed_id) {
886 
887     // remove part_id from partitions
888     vector<Alignment*> new_partitions;
889     int i;
890     for (i = 0; i < partitions.size(); i++)
891         if (removed_id.find(i) == removed_id.end()) {
892             // not found in the removed set
893             new_partitions.push_back(partitions[i]);
894         } else {
895             delete partitions[i];
896             partitions[i] = NULL;
897         }
898 
899     ASSERT(new_partitions.size() + removed_id.size() == partitions.size());
900     partitions = new_partitions;
901 
902     // get the union seq_names of remaining partitions
903     unordered_set<string> seq_names_set;
904     seq_names.clear();
905     for (auto it = partitions.begin(); it != partitions.end(); it++) {
906         for (auto seq = (*it)->seq_names.begin(); seq != (*it)->seq_names.end(); seq++)
907             if (seq_names_set.find(*seq) == seq_names_set.end()) {
908                 seq_names.push_back(*seq);
909                 seq_names_set.insert(*seq);
910             }
911     }
912 
913 
914     // build the taxa_index
915     taxa_index.resize(getNSeq());
916     for (i = 0; i < getNSeq(); i++)
917         taxa_index[i].resize(partitions.size(), -1);
918     for (i = 0; i < partitions.size(); i++)
919         linkSubAlignment(i);
920 
921     // now build the patterns based on taxa_index
922     buildPattern();
923 }
924 
removeIdenticalSeq(string not_remove,bool keep_two,StrVector & removed_seqs,StrVector & target_seqs)925 Alignment *SuperAlignment::removeIdenticalSeq(string not_remove, bool keep_two, StrVector &removed_seqs, StrVector &target_seqs) {
926     IntVector checked;
927     vector<bool> removed;
928     checked.resize(getNSeq(), 0);
929     removed.resize(getNSeq(), false);
930     int seq1;
931 
932 	for (seq1 = 0; seq1 < getNSeq(); seq1++) {
933         if (checked[seq1]) continue;
934         bool first_ident_seq = true;
935 		for (int seq2 = seq1+1; seq2 < getNSeq(); seq2++) {
936 			if (getSeqName(seq2) == not_remove || removed[seq2]) continue;
937 			bool equal_seq = true;
938 			int part = 0;
939 			// check if seq1 and seq2 are identical over all partitions
940 			for (vector<Alignment*>::iterator ait = partitions.begin(); ait != partitions.end(); ait++, part++) {
941 				int subseq1 = taxa_index[seq1][part];
942 				int subseq2 = taxa_index[seq2][part];
943 				if (subseq1 < 0 && subseq2 < 0) // continue if both seqs are absent in this partition
944 					continue;
945 				if (subseq1 < 0 && subseq2 > 0) {
946 					// if one sequence is present and the other is absent for a gene, we conclude that they are not identical
947 					equal_seq = false;
948 					break;
949 				}
950 				if (subseq1 > 0 && subseq2 < 0) {
951 					// if one sequence is present and the other is absent for a gene, we conclude that they are not identical
952 					equal_seq = false;
953 					break;
954 				}
955 				// now if both seqs are present, check sequence content
956 				for (iterator it = (*ait)->begin(); it != (*ait)->end(); it++)
957 					if  ((*it)[subseq1] != (*it)[subseq2]) {
958 						equal_seq = false;
959 						break;
960 					}
961 			}
962 			if (equal_seq) {
963 				if (removed_seqs.size() < getNSeq()-3 && (!keep_two || !first_ident_seq)) {
964 					removed_seqs.push_back(getSeqName(seq2));
965 					target_seqs.push_back(getSeqName(seq1));
966 					removed[seq2] = true;
967 				} else {
968                     cout << "NOTE: " << getSeqName(seq2) << " is identical to " << getSeqName(seq1) << " but kept for subsequent analysis" << endl;
969                 }
970 				checked[seq2] = 1;
971 				first_ident_seq = false;
972 			}
973 		}
974 		checked[seq1] = 1;
975 	}
976 
977 	if (removed_seqs.empty()) return this; // do nothing if the list is empty
978 
979     if (removed_seqs.size() >= getNSeq()-3)
980         outWarning("Your alignment contains too many identical sequences!");
981 
982 	// now remove identical sequences
983 	IntVector keep_seqs;
984 	for (seq1 = 0; seq1 < getNSeq(); seq1++)
985 		if (!removed[seq1]) keep_seqs.push_back(seq1);
986 	SuperAlignment *aln;
987 	aln = new SuperAlignment;
988 	aln->extractSubAlignment(this, keep_seqs, 0);
989 	return aln;
990 }
991 
checkAbsentStates(string msg)992 int SuperAlignment::checkAbsentStates(string msg) {
993     int count = 0;
994     for (auto it = partitions.begin(); it != partitions.end(); it++)
995         count += (*it)->checkAbsentStates("partition " + convertIntToString((it-partitions.begin())+1));
996     return count;
997 }
998 
999 /*
1000 void SuperAlignment::checkGappySeq() {
1001 	int nseq = getNSeq(), part = 0, i;
1002 	IntVector gap_only_seq;
1003 	gap_only_seq.resize(nseq, 1);
1004 	//cout << "Checking gaps..." << endl;
1005 	for (vector<Alignment*>::iterator it = partitions.begin(); it != partitions.end(); it++, part++) {
1006 		IntVector keep_seqs;
1007 		for (i = 0; i < nseq; i++)
1008 			if (taxa_index[i][part] >= 0)
1009 			if (!(*it)->isGapOnlySeq(taxa_index[i][part])) {
1010 				keep_seqs.push_back(taxa_index[i][part]);
1011 				gap_only_seq[i] = 0;
1012 			}
1013 		if (keep_seqs.size() < (*it)->getNSeq()) {
1014 			cout << "Discard " << (*it)->getNSeq() - keep_seqs.size()
1015 				 << " sequences from partition number " << part+1 << endl;
1016 			Alignment *aln = new Alignment;
1017 			aln->extractSubAlignment((*it), keep_seqs, 0);
1018 			delete (*it);
1019 			(*it) = aln;
1020 			linkSubAlignment(part);
1021 		}
1022 		cout << __func__ << " num_states = " << (*it)->num_states << endl;
1023 	}
1024 	int wrong_seq = 0;
1025 	for (i = 0; i < nseq; i++)
1026 		if (gap_only_seq[i]) {
1027 			cout << "ERROR: Sequence " << getSeqName(i) << " contains only gaps or missing data" << endl;
1028 			wrong_seq++;
1029 		}
1030 	if (wrong_seq) {
1031 		outError("Some sequences (see above) are problematic, please check your alignment again");
1032 		}
1033 }
1034 */
getSitePatternIndex(IntVector & pattern_index)1035 void SuperAlignment::getSitePatternIndex(IntVector &pattern_index) {
1036 	int nptn = 0;
1037 	for (vector<Alignment*>::iterator it = partitions.begin(); it != partitions.end(); it++) {
1038 		int nsite = pattern_index.size();
1039 		pattern_index.insert(pattern_index.end(), (*it)->site_pattern.begin(), (*it)->site_pattern.end());
1040 		for (int i = nsite; i < pattern_index.size(); i++)
1041 			pattern_index[i] += nptn;
1042 		nptn += (*it)->getNPattern();
1043 	}
1044 }
1045 
getPatternFreq(IntVector & pattern_freq)1046 void SuperAlignment::getPatternFreq(IntVector &pattern_freq) {
1047 	ASSERT(isSuperAlignment());
1048 	size_t offset = 0;
1049 	if (!pattern_freq.empty()) pattern_freq.resize(0);
1050 	for (vector<Alignment*>::iterator it = partitions.begin(); it != partitions.end(); it++) {
1051 		IntVector freq;
1052 		(*it)->getPatternFreq(freq);
1053 		pattern_freq.insert(pattern_freq.end(), freq.begin(), freq.end());
1054 		offset += freq.size();
1055 	}
1056 }
1057 
getPatternFreq(int * pattern_freq)1058 void SuperAlignment::getPatternFreq(int *pattern_freq) {
1059     ASSERT(isSuperAlignment());
1060     size_t offset = 0;
1061     for (vector<Alignment*>::iterator it = partitions.begin(); it != partitions.end(); it++) {
1062         (*it)->getPatternFreq(pattern_freq + offset);
1063         offset += (*it)->getNPattern();
1064     }
1065 }
1066 
printSiteInfo(const char * filename)1067 void SuperAlignment::printSiteInfo(const char* filename) {
1068     try {
1069         ofstream out(filename);
1070         printSiteInfoHeader(out, filename, true);
1071         int id = 1;
1072         for (auto it = partitions.begin(); it != partitions.end(); it++, id++)
1073             (*it)->printSiteInfo(out, id);
1074         out.close();
1075     } catch (...) {
1076         outError(ERR_WRITE_OUTPUT, filename);
1077     }
1078 }
1079 
computeDivergenceMatrix(double * pair_freq,double * state_freq,bool normalize)1080 void SuperAlignment::computeDivergenceMatrix(double *pair_freq, double *state_freq, bool normalize) {
1081     int nstates = partitions[0]->num_states;
1082     int nstates2 = nstates*nstates;
1083     memset(pair_freq, 0, sizeof(double)*nstates2);
1084     memset(state_freq, 0, sizeof(double)*nstates);
1085 
1086     double *part_pair_freq = new double[nstates2];
1087     double *part_state_freq = new double[nstates];
1088     int i, j;
1089 
1090     for (auto it = partitions.begin(); it != partitions.end(); it++) {
1091         (*it)->computeDivergenceMatrix(part_pair_freq, part_state_freq, false);
1092         for (i = 0; i < nstates2; i++)
1093             pair_freq[i] += part_pair_freq[i];
1094         for (i = 0; i < nstates; i++)
1095             state_freq[i] += part_state_freq[i];
1096     }
1097     if (normalize) {
1098         double sum = 0.0;
1099         for (i = 0; i < nstates; i++)
1100             sum += state_freq[i];
1101         sum = 1.0/sum;
1102         for (i = 0; i < nstates; i++)
1103             state_freq[i] *= sum;
1104         for (i = 0; i < nstates; i++) {
1105             sum = 0.0;
1106             double *pair_freq_ptr = pair_freq + (i*nstates);
1107             for (j = 0; j < nstates; j++)
1108                 sum += pair_freq_ptr[j];
1109             sum = 1.0/sum;
1110             for (j = 0; j < nstates; j++)
1111                 pair_freq_ptr[j] *= sum;
1112         }
1113     }
1114     delete [] part_state_freq;
1115     delete [] part_pair_freq;
1116 }
1117 
doSymTest(size_t vecid,vector<SymTestResult> & vec_sym,vector<SymTestResult> & vec_marsym,vector<SymTestResult> & vec_intsym,int * rstream,vector<SymTestStat> * stats)1118 void SuperAlignment::doSymTest(size_t vecid, vector<SymTestResult> &vec_sym, vector<SymTestResult> &vec_marsym,
1119                                vector<SymTestResult> &vec_intsym, int *rstream, vector<SymTestStat> *stats) {
1120 
1121     vector<vector<SymTestStat> >all_stats;
1122     if (stats)
1123         all_stats.resize(partitions.size());
1124 
1125     int i, nparts = partitions.size();
1126     #ifdef _OPENMP
1127     #pragma omp parallel for private(i)
1128     #endif
1129     for (i = 0; i < nparts; i++) {
1130         if (stats) {
1131             partitions[i]->doSymTest(vecid + i, vec_sym, vec_marsym, vec_intsym, rstream, &all_stats[i]);
1132             for (auto it = all_stats[i].begin(); it != all_stats[i].end(); it++)
1133                 it->part = i;
1134         } else
1135             partitions[i]->doSymTest(vecid + i, vec_sym, vec_marsym, vec_intsym, rstream);
1136     }
1137     if (stats) {
1138         for (i = 0; i < nparts; i++)
1139             stats->insert(stats->end(), all_stats[i].begin(), all_stats[i].end());
1140     }
1141 }
1142 
1143 /*
1144 void SuperAlignment::createBootstrapAlignment(Alignment *aln, IntVector* pattern_freq, const char *spec) {
1145 	ASSERT(aln->isSuperAlignment());
1146 	Alignment::copyAlignment(aln);
1147 	SuperAlignment *super_aln = (SuperAlignment*) aln;
1148 	ASSERT(partitions.empty());
1149     name = aln->name;
1150     model_name = aln->model_name;
1151     sequence_type = aln->sequence_type;
1152     position_spec = aln->position_spec;
1153     aln_file = aln->aln_file;
1154 
1155 	if (spec && strncmp(spec, "GENE", 4) == 0) {
1156 		// resampling whole genes
1157         partitions.resize(super_aln->partitions.size(), NULL);
1158         int i, ptn;
1159         for (i = 0; i < super_aln->partitions.size(); i++) {
1160 
1161             // get a random gene
1162 			int part = random_int(super_aln->partitions.size());
1163 
1164             // ptn_freq stores pattern frequency of bootstrap aln
1165 
1166             IntVector ptn_freq;
1167             if (strncmp(spec,"GENESITE",8) == 0) {
1168                 // resample sites of this gene
1169                 super_aln->partitions[part]->createBootstrapAlignment(ptn_freq);
1170                 ASSERT(ptn_freq.size() == super_aln->partitions[part]->size());
1171             } else {
1172                 // copy ptn_freq from this gene
1173                 for (ptn = 0; ptn < super_aln->partitions[part]->size(); ptn++)
1174                     ptn_freq.push_back(super_aln->partitions[part]->at(ptn).frequency);
1175             }
1176 
1177             if (!partitions[part]) {
1178                 // allocate the partition
1179                 partitions[part] = new Alignment;
1180                 partitions[part]->copyAlignment(super_aln->partitions[part]);
1181                 for (ptn = 0; ptn < super_aln->partitions[part]->size(); ptn++)
1182                     partitions[part]->at(ptn).frequency = ptn_freq[ptn];
1183             } else {
1184                 // increase frequency if already existed
1185                 for (ptn = 0; ptn < super_aln->partitions[part]->size(); ptn++)
1186                     partitions[part]->at(ptn).frequency += ptn_freq[ptn];
1187             }
1188         }
1189 
1190         // fulfill genes that are missing
1191         for (i = 0; i < partitions.size(); i++)
1192             if (!partitions[i]) {
1193                 partitions[i] = new Alignment;
1194                 partitions[i]->copyAlignment(super_aln->partitions[i]);
1195                 // reset all frequency
1196                 for (ptn = 0; ptn < partitions[i]->size(); ptn++)
1197                     partitions[i]->at(ptn).frequency = 0;
1198             }
1199 
1200         // fill up pattern_freq vector
1201         if (pattern_freq) {
1202             pattern_freq->resize(0);
1203             for (i = 0; i < partitions.size(); i++)
1204                 for (ptn = 0; ptn < partitions[i]->size(); ptn++)
1205                     pattern_freq->push_back(partitions[i]->at(ptn).frequency);
1206         }
1207     } else if (!spec) {
1208 		// resampling sites within genes
1209         for (vector<Alignment*>::iterator it = super_aln->partitions.begin(); it != super_aln->partitions.end(); it++) {
1210             Alignment *boot_aln = new Alignment;
1211             if (pattern_freq) {
1212                 IntVector part_pattern_freq;
1213                 boot_aln->createBootstrapAlignment(*it, &part_pattern_freq);
1214                 pattern_freq->insert(pattern_freq->end(), part_pattern_freq.begin(), part_pattern_freq.end());
1215             } else {
1216                 boot_aln->createBootstrapAlignment(*it);
1217             }
1218             partitions.push_back(boot_aln);
1219         }
1220     } else {
1221         outError("Wrong -bsam, either -bsam GENE or -bsam GENESITE");
1222     }
1223 	taxa_index = super_aln->taxa_index;
1224     countConstSite();
1225 }
1226 */
1227 
createBootstrapAlignment(Alignment * aln,IntVector * pattern_freq,const char * spec)1228 void SuperAlignment::createBootstrapAlignment(Alignment *aln, IntVector* pattern_freq, const char *spec) {
1229     ASSERT(aln->isSuperAlignment());
1230     SuperAlignment *super_aln = (SuperAlignment*) aln;
1231     ASSERT(partitions.empty());
1232     name = aln->name;
1233     model_name = aln->model_name;
1234     sequence_type = aln->sequence_type;
1235     position_spec = aln->position_spec;
1236     aln_file = aln->aln_file;
1237 
1238     if (!spec) {
1239         // resampling sites within genes
1240         Alignment::copyAlignment(aln);
1241         partitions.reserve(super_aln->partitions.size());
1242         for (vector<Alignment*>::iterator it = super_aln->partitions.begin(); it != super_aln->partitions.end(); it++) {
1243             Alignment *boot_aln = new Alignment;
1244             if (pattern_freq) {
1245                 IntVector part_pattern_freq;
1246                 boot_aln->createBootstrapAlignment(*it, &part_pattern_freq);
1247                 pattern_freq->insert(pattern_freq->end(), part_pattern_freq.begin(), part_pattern_freq.end());
1248             } else {
1249                 boot_aln->createBootstrapAlignment(*it);
1250             }
1251             partitions.push_back(boot_aln);
1252         }
1253         taxa_index = super_aln->taxa_index;
1254         countConstSite();
1255     } else if (strcmp(spec, "GENE") == 0) {
1256         ASSERT(!pattern_freq);
1257         // resampling whole genes
1258         IntVector gene_freq;
1259         random_resampling(super_aln->partitions.size(), gene_freq);
1260         for (int i = 0; i < gene_freq.size(); i++)
1261             if (gene_freq[i] > 0) {
1262                 Alignment *boot_aln = new Alignment;
1263                 boot_aln->copyAlignment(super_aln->partitions[i]);
1264                 if (gene_freq[i] > 1) {
1265                     for (auto it = boot_aln->begin(); it != boot_aln->end(); it++)
1266                         it->frequency *= gene_freq[i];
1267                     auto site_pattern = boot_aln->site_pattern;
1268                     for (int j = 1; j < gene_freq[i]; j++)
1269                         boot_aln->site_pattern.insert(boot_aln->site_pattern.end(), site_pattern.begin(), site_pattern.end());
1270                     boot_aln->countConstSite();
1271                 }
1272                 partitions.push_back(boot_aln);
1273             }
1274         init();
1275     } else if (strcmp(spec, "GENESITE") == 0) {
1276         ASSERT(!pattern_freq);
1277         // resampling whole genes then sites within resampled genes
1278         IntVector gene_freq;
1279         random_resampling(super_aln->partitions.size(), gene_freq);
1280         for (int i = 0; i < gene_freq.size(); i++)
1281             for (int rep = 0; rep < gene_freq[i]; rep++) {
1282             Alignment *boot_aln = new Alignment;
1283             boot_aln->createBootstrapAlignment(super_aln->partitions[i]);
1284             boot_aln->name = boot_aln->name + "." + convertIntToString(rep);
1285             partitions.push_back(boot_aln);
1286         }
1287         init();
1288     } else {
1289         outError("Wrong -bsam, either -bsam GENE or -bsam GENESITE");
1290     }
1291 }
1292 
createBootstrapAlignment(IntVector & pattern_freq,const char * spec)1293 void SuperAlignment::createBootstrapAlignment(IntVector &pattern_freq, const char *spec) {
1294 	ASSERT(isSuperAlignment());
1295 	int nptn = 0;
1296 	for (vector<Alignment*>::iterator it = partitions.begin(); it != partitions.end(); it++) {
1297 		nptn += (*it)->getNPattern();
1298 	}
1299 	pattern_freq.resize(0);
1300 	int *internal_freq = new int[nptn];
1301 	createBootstrapAlignment(internal_freq, spec);
1302 	pattern_freq.insert(pattern_freq.end(), internal_freq, internal_freq + nptn);
1303 	delete [] internal_freq;
1304 
1305 }
1306 
1307 
createBootstrapAlignment(int * pattern_freq,const char * spec,int * rstream)1308 void SuperAlignment::createBootstrapAlignment(int *pattern_freq, const char *spec, int *rstream) {
1309 	ASSERT(isSuperAlignment());
1310 //	if (spec && strncmp(spec, "GENE", 4) != 0) outError("Unsupported yet. ", __func__);
1311 
1312 	if (spec && strncmp(spec, "GENE", 4) == 0) {
1313 		// resampling whole genes
1314 		int nptn = 0;
1315 		IntVector part_pos;
1316 		for (vector<Alignment*>::iterator it = partitions.begin(); it != partitions.end(); it++) {
1317 			part_pos.push_back(nptn);
1318 			nptn += (*it)->getNPattern();
1319 		}
1320 		memset(pattern_freq, 0, nptn * sizeof(int));
1321         IntVector gene_freq;
1322         random_resampling(partitions.size(), gene_freq, rstream);
1323 		for (int part = 0; part < partitions.size(); part++)
1324         for (int rep = 0; rep < gene_freq[part]; rep++){
1325 			Alignment *aln = partitions[part];
1326 			if (strncmp(spec,"GENESITE",8) == 0) {
1327 				// then resampling sites in resampled gene
1328                 IntVector sample;
1329                 random_resampling(aln->getNSite(), sample, rstream);
1330 				for (int site = 0; site < sample.size(); site++)
1331                 for (int rep2 = 0; rep2 < sample[site]; rep2++) {
1332 					int ptn_id = aln->getPatternID(site);
1333 					pattern_freq[ptn_id + part_pos[part]]++;
1334 				}
1335 			} else {
1336 				for (int ptn = 0; ptn < aln->getNPattern(); ptn++)
1337 					pattern_freq[ptn + part_pos[part]] += aln->at(ptn).frequency;
1338 			}
1339 		}
1340 	} else {
1341 		// resampling sites within genes
1342 		int offset = 0;
1343 		for (vector<Alignment*>::iterator it = partitions.begin(); it != partitions.end(); it++) {
1344             if (spec && strncmp(spec, "SCALE=", 6) == 0)
1345                 (*it)->createBootstrapAlignment(pattern_freq + offset, spec, rstream);
1346             else
1347                 (*it)->createBootstrapAlignment(pattern_freq + offset, NULL, rstream);
1348 			offset += (*it)->getNPattern();
1349 		}
1350 	}
1351 }
1352 
1353 /**
1354  * shuffle alignment by randomizing the order of sites
1355  */
shuffleAlignment()1356 void SuperAlignment::shuffleAlignment() {
1357 	ASSERT(isSuperAlignment());
1358 	for (vector<Alignment*>::iterator it = partitions.begin(); it != partitions.end(); it++) {
1359 		(*it)->shuffleAlignment();
1360 	}
1361 }
1362 
1363 
computeObsDist(int seq1,int seq2)1364 double SuperAlignment::computeObsDist(int seq1, int seq2) {
1365 	int site;
1366 	int diff_pos = 0, total_pos = 0;
1367 	for (site = 0; site < getNSite(); site++) {
1368 		int id1 = taxa_index[seq1][site];
1369 		int id2 = taxa_index[seq2][site];
1370 		if (id1 < 0 || id2 < 0) continue;
1371 		int num_states = partitions[site]->num_states;
1372 		for (Alignment::iterator it = partitions[site]->begin(); it != partitions[site]->end(); it++)
1373 			if  ((*it)[id1] < num_states && (*it)[id2] < num_states) {
1374 				total_pos += (*it).frequency;
1375 				if ((*it)[id1] != (*it)[id2] )
1376 					diff_pos += (*it).frequency;
1377 			}
1378 	}
1379 	if (!total_pos)
1380 		return MAX_GENETIC_DIST; // return +INF if no overlap between two sequences
1381 	return ((double)diff_pos) / total_pos;
1382 }
1383 
1384 
computeDist(int seq1,int seq2)1385 double SuperAlignment::computeDist(int seq1, int seq2) {
1386 	if (partitions.empty()) return 0.0;
1387 	double obs_dist = computeObsDist(seq1, seq2);
1388     int num_states = partitions[0]->num_states;
1389     double z = (double)num_states / (num_states-1);
1390     double x = 1.0 - (z * obs_dist);
1391 
1392     if (x <= 0) {
1393         // string str = "Too long distance between two sequences ";
1394         // str += getSeqName(seq1);
1395         // str += " and ";
1396         // str += getSeqName(seq2);
1397         // outWarning(str);
1398         return MAX_GENETIC_DIST;
1399     }
1400 
1401     return -log(x) / z;
1402     //return computeObsDist(seq1, seq2);
1403 	//  AVERAGE DISTANCE
1404 
1405 	double dist = 0;
1406 	int part = 0, num = 0;
1407 	for (vector<Alignment*>::iterator it = partitions.begin(); it != partitions.end(); it++, part++) {
1408 		int id1 = taxa_index[seq1][part];
1409 		int id2 = taxa_index[seq2][part];
1410 		if (id1 < 0 || id2 < 0) continue;
1411 		dist += (*it)->computeDist(id1, id2);
1412 	}
1413 	if (num == 0) // two sequences are not overlapping at all!
1414 		return MAX_GENETIC_DIST;
1415 	return dist / num;
1416 }
1417 
~SuperAlignment()1418 SuperAlignment::~SuperAlignment()
1419 {
1420 	for (vector<Alignment*>::reverse_iterator it = partitions.rbegin(); it != partitions.rend(); it++)
1421 		delete (*it);
1422 	partitions.clear();
1423 }
1424 
printAlignment(InputType format,ostream & out,bool append,const char * aln_site_list,int exclude_sites,const char * ref_seq_name)1425 void SuperAlignment::printAlignment(InputType format, ostream &out, bool append,
1426                                     const char *aln_site_list, int exclude_sites,
1427                                     const char *ref_seq_name)
1428 {
1429     Alignment *concat = concatenateAlignments();
1430     concat->printAlignment(format, out, append, aln_site_list, exclude_sites, ref_seq_name);
1431     delete concat;
1432     if (format == IN_NEXUS)
1433         printPartition(out, NULL, true);
1434 }
1435 
printSubAlignments(Params & params)1436 void SuperAlignment::printSubAlignments(Params &params) {
1437 	vector<Alignment*>::iterator pit;
1438 	string filename;
1439 	int part;
1440 	for (pit = partitions.begin(), part = 0; pit != partitions.end(); pit++, part++) {
1441 		if (params.aln_output)
1442 			filename = params.aln_output;
1443 		else
1444 			filename = params.out_prefix;
1445 		filename += "." + (*pit)->name;
1446         int exclude_sites = (params.aln_nogaps) ? EXCLUDE_GAP : 0;
1447         (*pit)->printAlignment(params.aln_output_format, filename.c_str(), false, NULL, exclude_sites, NULL);
1448 	}
1449 }
1450 
computeUnconstrainedLogL()1451 double SuperAlignment::computeUnconstrainedLogL() {
1452 	double logl = 0.0;
1453 	vector<Alignment*>::iterator pit;
1454 	for (pit = partitions.begin(); pit != partitions.end(); pit++)
1455 		logl += (*pit)->computeUnconstrainedLogL();
1456 	return logl;
1457 }
1458 
computeMissingData()1459 double SuperAlignment::computeMissingData() {
1460 	double ret = 0.0;
1461 	size_t len = 0;
1462 	vector<Alignment*>::iterator pit;
1463 	for (pit = partitions.begin(); pit != partitions.end(); pit++) {
1464 		ret += (*pit)->getNSeq() * (*pit)->getNSite();
1465 		len += (*pit)->getNSite();
1466 	}
1467 	ret /= getNSeq() * len;
1468 	return 1.0 - ret;
1469 
1470 }
1471 
concatenateAlignments(set<int> & ids)1472 Alignment *SuperAlignment::concatenateAlignments(set<int> &ids) {
1473 	string union_taxa;
1474 	int nsites = 0, nstates = 0;
1475     set<int>::iterator it;
1476 	SeqType sub_type = SEQ_UNKNOWN;
1477 	for (it = ids.begin(); it != ids.end(); it++) {
1478 		int id = *it;
1479 		ASSERT(id >= 0 && id < partitions.size());
1480 		if (nstates == 0) nstates = partitions[id]->num_states;
1481 		if (sub_type == SEQ_UNKNOWN) sub_type = partitions[id]->seq_type;
1482 		if (sub_type != partitions[id]->seq_type)
1483 			outError("Cannot concatenate sub-alignments of different type");
1484 		if (nstates != partitions[id]->num_states)
1485 			outError("Cannot concatenate sub-alignments of different #states");
1486 
1487 		string taxa_set;
1488         Pattern taxa_pat = getPattern(id);
1489         taxa_set.insert(taxa_set.begin(), taxa_pat.begin(), taxa_pat.end());
1490 		nsites += partitions[id]->getNSite();
1491 		if (it == ids.begin()) union_taxa = taxa_set; else {
1492 			for (int j = 0; j < union_taxa.length(); j++)
1493 				if (taxa_set[j] == 1) union_taxa[j] = 1;
1494 		}
1495 	}
1496 
1497 	Alignment *aln = new Alignment;
1498 	for (int i = 0; i < union_taxa.length(); i++)
1499 		if (union_taxa[i] == 1) {
1500 			aln->seq_names.push_back(getSeqName(i));
1501 		}
1502 	aln->num_states = nstates;
1503 	aln->seq_type = sub_type;
1504 	aln->site_pattern.resize(nsites, -1);
1505     aln->clear();
1506     aln->pattern_index.clear();
1507     aln->STATE_UNKNOWN = partitions[*ids.begin()]->STATE_UNKNOWN;
1508     aln->genetic_code = partitions[*ids.begin()]->genetic_code;
1509     if (aln->seq_type == SEQ_CODON) {
1510     	aln->codon_table = new char[aln->num_states];
1511     	memcpy(aln->codon_table, partitions[*ids.begin()]->codon_table, aln->num_states);
1512     	aln->non_stop_codon = new char[strlen(aln->genetic_code)];
1513     	memcpy(aln->non_stop_codon, partitions[*ids.begin()]->non_stop_codon, strlen(aln->genetic_code));
1514     }
1515 
1516     int site = 0;
1517     for (it = ids.begin(); it != ids.end(); it++) {
1518     	int id = *it;
1519         // 2018-08-23: important bugfix in v1.6: taxa_set has wrong correspondance
1520 		//string taxa_set;
1521         //Pattern taxa_pat = getPattern(id);
1522         //taxa_set.insert(taxa_set.begin(), taxa_pat.begin(), taxa_pat.end());
1523     	for (Alignment::iterator it = partitions[id]->begin(); it != partitions[id]->end(); it++) {
1524     		Pattern pat;
1525     		//int part_seq = 0;
1526     		for (int seq = 0; seq < union_taxa.size(); seq++)
1527     			if (union_taxa[seq] == 1) {
1528     				char ch = aln->STATE_UNKNOWN;
1529                     int seq_part = taxa_index[seq][id];
1530                     if (seq_part >= 0)
1531                         ch = (*it)[seq_part];
1532                     //if (taxa_set[seq] == 1) {
1533                     //    ch = (*it)[part_seq++];
1534                     //}
1535     				pat.push_back(ch);
1536     			}
1537     		//ASSERT(part_seq == partitions[id]->getNSeq());
1538     		aln->addPattern(pat, site, (*it).frequency);
1539     		// IMPORTANT BUG FIX FOLLOW
1540     		int ptnindex = aln->pattern_index[pat];
1541             for (int j = 0; j < (*it).frequency; j++)
1542                 aln->site_pattern[site++] = ptnindex;
1543 
1544     	}
1545     }
1546     aln->countConstSite();
1547 //    aln->buildSeqStates();
1548 
1549 	return aln;
1550 }
1551 
concatenateAlignments()1552 Alignment *SuperAlignment::concatenateAlignments() {
1553     vector<SeqType> seq_types;
1554     vector<char*> genetic_codes;
1555     vector<set<int> > ids;
1556     for (int i = 0; i < partitions.size(); i++) {
1557         bool found = false;
1558         for (int j = 0; j < seq_types.size(); j++)
1559             if (partitions[i]->seq_type == seq_types[j] && partitions[i]->genetic_code == genetic_codes[j]) {
1560                 ids[j].insert(i);
1561                 found = true;
1562                 break;
1563             }
1564         if (found)
1565             continue;
1566         // create a new partition
1567         seq_types.push_back(partitions[i]->seq_type);
1568         genetic_codes.push_back(partitions[i]->genetic_code);
1569         ids.push_back(set<int>());
1570         ids.back().insert(i);
1571     }
1572     if (seq_types.size() == 1)
1573         return concatenateAlignments(ids[0]);
1574 
1575     // mixed data with >= 2 partitions
1576     SuperAlignment *saln = new SuperAlignment();
1577     saln->max_num_states = 0;
1578     // first build taxa_index and partitions
1579     int site, seq, nsite = ids.size();
1580 
1581     // BUG FIX 2016-11-29: when merging partitions with -m TESTMERGE, sequence order is changed
1582     // get the taxa names from existing tree
1583 
1584     saln->seq_names = seq_names;
1585     saln->taxa_index.resize(saln->seq_names.size());
1586     for (auto it = saln->taxa_index.begin(); it != saln->taxa_index.end(); it++)
1587         it->resize(nsite, -1);
1588 
1589     for (site = 0; site != nsite; site++) {
1590         Alignment *part_aln = concatenateAlignments(ids[site]);
1591         saln->partitions.push_back(part_aln);
1592         int nseq = part_aln->getNSeq();
1593         //cout << "nseq  = " << nseq << endl;
1594         for (seq = 0; seq < nseq; seq++) {
1595             int id = saln->getSeqID(part_aln->getSeqName(seq));
1596             ASSERT(id >= 0);
1597             saln->taxa_index[id][site] = seq;
1598         }
1599     }
1600     // now the patterns of sequence-genes presence/absence
1601     saln->buildPattern();
1602     return saln;
1603 }
1604 
countConstSite()1605 void SuperAlignment::countConstSite() {
1606     num_informative_sites = 0;
1607     num_variant_sites = 0;
1608     max_num_states = 0;
1609     frac_const_sites = 0;
1610     frac_invariant_sites = 0;
1611     num_parsimony_sites = 0;
1612     size_t nsites = 0;
1613     for (vector<Alignment*>::iterator it = partitions.begin(); it != partitions.end(); it++) {
1614         (*it)->countConstSite();
1615         num_informative_sites += (*it)->num_informative_sites;
1616         num_variant_sites += (*it)->num_variant_sites;
1617         if ((*it)->num_states > max_num_states)
1618             max_num_states = (*it)->num_states;
1619         nsites += (*it)->getNSite();
1620         frac_const_sites += (*it)->frac_const_sites * (*it)->getNSite();
1621         frac_invariant_sites += (*it)->frac_invariant_sites * (*it)->getNSite();
1622     }
1623     frac_const_sites /= nsites;
1624     frac_invariant_sites /= nsites;
1625 }
1626 
orderPatternByNumChars(int pat_type)1627 void SuperAlignment::orderPatternByNumChars(int pat_type) {
1628     const int UINT_BITS = sizeof(UINT)*8;
1629     if (pat_type == PAT_INFORMATIVE)
1630         num_parsimony_sites = num_informative_sites;
1631     else
1632         num_parsimony_sites = num_variant_sites;
1633 
1634     int maxi = (num_parsimony_sites+UINT_BITS-1)/UINT_BITS;
1635     pars_lower_bound = new UINT[maxi+1];
1636     memset(pars_lower_bound, 0, (maxi+1)*sizeof(UINT));
1637     int part, nseq = getNSeq();
1638 
1639     // compute ordered_pattern
1640     ordered_pattern.clear();
1641 //    UINT sum_scores[npart];
1642     for (part  = 0; part != partitions.size(); part++) {
1643         partitions[part]->orderPatternByNumChars(pat_type);
1644         // partial_partition
1645         if (Params::getInstance().partition_type == TOPO_UNLINKED)
1646             continue;
1647         for (vector<Pattern>::iterator pit = partitions[part]->ordered_pattern.begin(); pit != partitions[part]->ordered_pattern.end(); pit++) {
1648             Pattern pattern(*pit);
1649             pattern.resize(nseq); // maximal unknown states
1650             for (int j = 0; j < nseq; j++)
1651                 if (taxa_index[j][part] >= 0)
1652                     pattern[j] = (*pit)[taxa_index[j][part]];
1653                 else
1654                     pattern[j] = partitions[part]->STATE_UNKNOWN;
1655             ordered_pattern.push_back(pattern);
1656         }
1657 //        sum_scores[part] = partitions[part]->pars_lower_bound[0];
1658     }
1659     // TODO compute pars_lower_bound (lower bound of pars score for remaining patterns)
1660 }
1661