1 /***************************************************************************
2 * Copyright (C) 2009 by BUI Quang Minh *
3 * minh.bui@univie.ac.at *
4 * *
5 * This program is free software; you can redistribute it and/or modify *
6 * it under the terms of the GNU General Public License as published by *
7 * the Free Software Foundation; either version 2 of the License, or *
8 * (at your option) any later version. *
9 * *
10 * This program is distributed in the hope that it will be useful, *
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of *
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the *
13 * GNU General Public License for more details. *
14 * *
15 * You should have received a copy of the GNU General Public License *
16 * along with this program; if not, write to the *
17 * Free Software Foundation, Inc., *
18 * 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. *
19 ***************************************************************************/
20
21 #include <stdarg.h>
22 #include "superalignment.h"
23 #include "nclextra/msetsblock.h"
24 #include "nclextra/myreader.h"
25 #include "main/phylotesting.h"
26
createAlignment(string aln_file,const char * sequence_type,InputType intype,string model_name)27 Alignment *createAlignment(string aln_file, const char *sequence_type, InputType intype, string model_name) {
28 bool is_dir = isDirectory(aln_file.c_str());
29
30 if (!is_dir && aln_file.find(',') == string::npos)
31 return new Alignment((char*)aln_file.c_str(), (char*)sequence_type, intype, model_name);
32
33 SuperAlignment *super_aln = new SuperAlignment;
34 if (is_dir)
35 super_aln->readPartitionDir(aln_file, (char*)sequence_type, intype, model_name, true);
36 else
37 super_aln->readPartitionList(aln_file, (char*)sequence_type, intype, model_name, true);
38 super_aln->init();
39 Alignment *aln = super_aln->concatenateAlignments();
40 if (aln->isSuperAlignment())
41 outError("Cannot concatenate alignments of different data type ", aln_file);
42 delete super_aln;
43 return aln;
44 }
45
SuperAlignment()46 SuperAlignment::SuperAlignment() : Alignment() {
47 max_num_states = 0;
48 }
49
SuperAlignment(Params & params)50 SuperAlignment::SuperAlignment(Params ¶ms) : Alignment()
51 {
52 readFromParams(params);
53
54 init();
55
56 cout << "Degree of missing data: " << computeMissingData() << endl;
57
58 #ifdef _OPENMP
59 if (params.num_threads > partitions.size()) {
60 cout << "Info: multi-threading strategy over alignment sites" << endl;
61 } else {
62 cout << "Info: multi-threading strategy over partitions" << endl;
63 }
64 #endif
65 cout << endl;
66
67 }
68
readFromParams(Params & params)69 void SuperAlignment::readFromParams(Params ¶ms) {
70 if (isDirectory(params.partition_file)) {
71 // reading all files in the directory
72 readPartitionDir(params.partition_file, params.sequence_type, params.intype, params.model_name, params.remove_empty_seq);
73 } else if (strstr(params.partition_file, ",") != nullptr) {
74 // reading all files in a comma-separated list
75 readPartitionList(params.partition_file, params.sequence_type, params.intype, params.model_name, params.remove_empty_seq);
76 } else {
77 cout << "Reading partition model file " << params.partition_file << " ..." << endl;
78 if (detectInputFile(params.partition_file) == IN_NEXUS) {
79 readPartitionNexus(params);
80 if (partitions.empty()) {
81 outError("No partition found in SETS block. An example syntax looks like: \n#nexus\nbegin sets;\n charset part1=1-100;\n charset part2=101-300;\nend;");
82 }
83 } else
84 readPartitionRaxml(params);
85 }
86 if (partitions.empty())
87 outError("No partition found");
88
89 // check for duplicated partition names
90 unordered_set<string> part_names;
91 for (auto pit = partitions.begin(); pit != partitions.end(); pit++) {
92 if (part_names.find((*pit)->name) != part_names.end())
93 outError("Duplicated partition name ", (*pit)->name);
94 part_names.insert((*pit)->name);
95 }
96
97 if (params.subsampling != 0) {
98 // sumsample a number of partitions
99 int subsample = params.subsampling;
100 if (abs(subsample) >= partitions.size())
101 outError("--subsample must be between -" + convertIntToString(partitions.size()-1) + " and " + convertIntToString(partitions.size()-1));
102 cout << "Random subsampling " << ((subsample > 0) ? subsample : partitions.size() + subsample)
103 << " partitions (seed: " << params.subsampling_seed << ")..." << endl;
104 int *rstream;
105 init_random(params.subsampling_seed, false, &rstream);
106 // make sure to sub-sample exact number
107 vector<bool> sample;
108 int i;
109 sample.resize(partitions.size(), false);
110 for (int num = 0; num < abs(subsample); ) {
111 i = random_int(sample.size(), rstream);
112 if (!sample[i]) {
113 sample[i] = true;
114 num++;
115 }
116 }
117 finish_random(rstream);
118 if (subsample < 0) {
119 // reverse sampling
120 for (i = 0; i < sample.size(); i++)
121 sample[i] = !sample[i];
122 }
123 vector<Alignment*> keep_partitions;
124 for (i = 0; i < sample.size(); i++)
125 if (sample[i])
126 keep_partitions.push_back(partitions[i]);
127 // now replace partitions
128 partitions = keep_partitions;
129 }
130
131 // Initialize the counter for evaluated NNIs on subtrees
132 cout << "Subset\tType\tSeqs\tSites\tInfor\tInvar\tModel\tName" << endl;
133 int part = 0;
134 for (auto it = partitions.begin(); it != partitions.end(); it++, part++) {
135 cout << part+1 << "\t" << (*it)->sequence_type << "\t" << (*it)->getNSeq()
136 << "\t" << (*it)->getNSite() << "\t" << (*it)->num_informative_sites
137 << "\t" << (*it)->getNSite()-(*it)->num_variant_sites << "\t"
138 << (*it)->model_name << "\t" << (*it)->name << endl;
139 if ((*it)->num_variant_sites == 0) {
140 outWarning("No variant sites in partition " + (*it)->name);
141 } else if ((*it)->num_informative_sites == 0) {
142 outWarning("No parsimony-informative sites in partition " + (*it)->name);
143 }
144 }
145 }
146
init(StrVector * sequence_names)147 void SuperAlignment::init(StrVector *sequence_names) {
148 // start original code
149
150 max_num_states = 0;
151 // first build taxa_index and partitions
152 int site, seq, nsite = partitions.size();
153
154 // BUG FIX 2016-11-29: when merging partitions with -m TESTMERGE, sequence order is changed
155 // get the taxa names from existing tree
156 if (sequence_names && !sequence_names->empty()) {
157 seq_names = *sequence_names;
158 taxa_index.resize(seq_names.size());
159 for (auto i = taxa_index.begin(); i != taxa_index.end(); i++)
160 i->resize(nsite, -1);
161 }
162
163 site = 0;
164 for (auto it = partitions.begin(); it != partitions.end(); it++, site++) {
165 // partitions.push_back((*it)->aln);
166 int nseq = (*it)->getNSeq();
167 //cout << "nseq = " << nseq << endl;
168 for (seq = 0; seq < nseq; seq++) {
169 int id = getSeqID((*it)->getSeqName(seq));
170 if (id < 0) {
171 seq_names.push_back((*it)->getSeqName(seq));
172 id = seq_names.size()-1;
173 IntVector vec(nsite, -1);
174 vec[site] = seq;
175 taxa_index.push_back(vec);
176 } else
177 taxa_index[id][site] = seq;
178 }
179 }
180 // now the patterns of sequence-genes presence/absence
181 buildPattern();
182 }
183
buildPattern()184 void SuperAlignment::buildPattern() {
185 int site, seq, nsite = partitions.size();
186
187 seq_type = SEQ_BINARY;
188 num_states = 2; // binary type because the super alignment presents the presence/absence of taxa in the partitions
189 STATE_UNKNOWN = 2;
190 site_pattern.resize(nsite, -1);
191 clear();
192 pattern_index.clear();
193 VerboseMode save_mode = verbose_mode;
194 verbose_mode = min(verbose_mode, VB_MIN); // to avoid printing gappy sites in addPattern
195 int nseq = getNSeq();
196 for (site = 0; site < nsite; site++) {
197 Pattern pat;
198 pat.resize(nseq, 0);
199 for (seq = 0; seq < nseq; seq++)
200 pat[seq] = (taxa_index[seq][site] >= 0)? 1 : 0;
201 addPattern(pat, site);
202 }
203 verbose_mode = save_mode;
204 countConstSite();
205 // buildSeqStates();
206 }
207
readPartition(Params & params)208 void SuperAlignment::readPartition(Params ¶ms) {
209 try {
210 ifstream in;
211 in.exceptions(ios::failbit | ios::badbit);
212 in.open(params.partition_file);
213 in.exceptions(ios::badbit);
214
215 while (!in.eof()) {
216 CharSet info;
217 getline(in, info.name, ',');
218 if (in.eof()) break;
219 getline(in, info.model_name, ',');
220 if (model_name == "") info.model_name = params.model_name;
221 getline(in, info.aln_file, ',');
222 if (info.aln_file == "" && params.aln_file) info.aln_file = params.aln_file;
223 getline(in, info.sequence_type, ',');
224 if (info.sequence_type=="" && params.sequence_type)
225 info.sequence_type = params.sequence_type;
226 safeGetline(in, info.position_spec);
227 trimString(info.sequence_type);
228 // cout << endl << "Reading partition " << info.name << " (model=" << info.model_name << ", aln=" <<
229 // info.aln_file << ", seq=" << info.sequence_type << ", pos=" << ((info.position_spec.length() >= 20) ? info.position_spec.substr(0,20)+"..." : info.position_spec) << ") ..." << endl;
230
231 // TODO move this to supertree
232 // info.nniMoves[0].ptnlh = NULL;
233 // info.nniMoves[1].ptnlh = NULL;
234 // info.cur_ptnlh = NULL;
235 // part_info.push_back(info);
236 Alignment *part_aln = createAlignment(info.aln_file, info.sequence_type.c_str(), params.intype, info.model_name);
237 if (!info.position_spec.empty()) {
238 Alignment *new_aln = new Alignment();
239 new_aln->extractSites(part_aln, info.position_spec.c_str());
240 delete part_aln;
241 part_aln = new_aln;
242 }
243 part_aln->name = info.name;
244 part_aln->model_name = info.model_name;
245 part_aln->position_spec = info.position_spec;
246 part_aln->aln_file = info.aln_file;
247 part_aln->sequence_type = info.sequence_type;
248 partitions.push_back(part_aln);
249 // TODO move this to supertree
250 // PhyloTree *tree = new PhyloTree(part_aln);
251 // push_back(tree);
252 }
253
254 in.clear();
255 // set the failbit again
256 in.exceptions(ios::failbit | ios::badbit);
257 in.close();
258 } catch(ios::failure) {
259 outError(ERR_READ_INPUT);
260 } catch (string str) {
261 outError(str);
262 }
263
264
265 }
266
readPartitionRaxml(Params & params)267 void SuperAlignment::readPartitionRaxml(Params ¶ms) {
268 try {
269 ifstream in;
270 in.exceptions(ios::failbit | ios::badbit);
271 in.open(params.partition_file);
272 in.exceptions(ios::badbit);
273 // PartitionInfo info;
274 Alignment *input_aln = NULL;
275 if (!params.aln_file)
276 outError("Please supply an alignment with -s option");
277
278 input_aln = createAlignment(params.aln_file, params.sequence_type, params.intype, params.model_name);
279
280 cout << endl << "Partition file is not in NEXUS format, assuming RAxML-style partition file..." << endl;
281
282 size_t pos = params.model_name.find_first_of("+*");
283 string rate_type = "";
284 if (pos != string::npos) rate_type = params.model_name.substr(pos);
285
286 while (!in.eof()) {
287 CharSet info;
288 getline(in, info.model_name, ',');
289 if (in.eof()) break;
290 trimString(info.model_name);
291 // std::transform(info.model_name.begin(), info.model_name.end(), info.model_name.begin(), ::toupper);
292
293 bool is_ASC = info.model_name.substr(0,4) == "ASC_";
294 if (is_ASC) info.model_name.erase(0, 4);
295 StateFreqType freq = FREQ_UNKNOWN;
296 if (info.model_name.find_first_of("*+{") == string::npos ) {
297 if (*info.model_name.rbegin() == 'F' && info.model_name != "DAYHOFF") {
298 freq = FREQ_EMPIRICAL;
299 info.model_name.erase(info.model_name.length()-1);
300 } else if (*info.model_name.rbegin() == 'X' && info.model_name != "LG4X") {
301 freq = FREQ_ESTIMATE;
302 info.model_name.erase(info.model_name.length()-1);
303 }
304 }
305
306 if (info.model_name.empty())
307 outError("Please give model names in partition file!");
308 if (info.model_name == "BIN") {
309 info.sequence_type = "BIN";
310 info.model_name = "GTR2";
311 } else if (info.model_name == "DNA") {
312 info.sequence_type = "DNA";
313 info.model_name = "GTR";
314 } else if (info.model_name == "MULTI") {
315 info.sequence_type = "MORPH";
316 info.model_name = "MK";
317 } else if (info.model_name.substr(0,5) == "CODON") {
318 info.sequence_type = info.model_name;
319 info.model_name = "GY";
320 } else {
321 info.sequence_type = "AA";
322 if (*info.model_name.begin() == '[') {
323 if (*info.model_name.rbegin() != ']')
324 outError("User-defined protein model should be [myProtenSubstitutionModelFileName]");
325 info.model_name = info.model_name.substr(1, info.model_name.length()-2);
326 }
327 }
328
329 if (freq == FREQ_EMPIRICAL)
330 info.model_name += "+F";
331 else if (freq == FREQ_ESTIMATE)
332 info.model_name += "+FO";
333 if (is_ASC)
334 info.model_name += "+ASC";
335 info.model_name += rate_type;
336
337 getline(in, info.name, '=');
338 trimString(info.name);
339 if (info.name.empty())
340 outError("Please give partition names in partition file!");
341
342 safeGetline(in, info.position_spec);
343 trimString(info.position_spec);
344 if (info.position_spec.empty())
345 outError("Please specify alignment positions for partition" + info.name);
346 std::replace(info.position_spec.begin(), info.position_spec.end(), ',', ' ');
347
348 // cout << "Reading partition " << info.name << " (model=" << info.model_name << ", seq=" << info.sequence_type << ", pos=" << ((info.position_spec.length() >= 20) ? info.position_spec.substr(0,20)+"..." : info.position_spec) << ") ..." << endl;
349
350 // TODO to supertree
351 // info.nniMoves[0].ptnlh = NULL;
352 // info.nniMoves[1].ptnlh = NULL;
353 // info.cur_ptnlh = NULL;
354 // part_info.push_back(info);
355 Alignment *part_aln = new Alignment();
356 part_aln->extractSites(input_aln, info.position_spec.c_str());
357
358 Alignment *new_aln;
359 if (params.remove_empty_seq)
360 new_aln = part_aln->removeGappySeq();
361 else
362 new_aln = part_aln;
363 // also rebuild states set of each sequence for likelihood computation
364 // new_aln->buildSeqStates();
365
366 if (part_aln != new_aln) delete part_aln;
367
368 new_aln->name = info.name;
369 new_aln->model_name = info.model_name;
370 new_aln->position_spec = info.position_spec;
371 new_aln->aln_file = info.aln_file;
372 new_aln->sequence_type = info.sequence_type;
373 partitions.push_back(new_aln);
374 // TODO move to supertree
375 // PhyloTree *tree = new PhyloTree(new_aln);
376 // push_back(tree);
377 // cout << new_aln->getNSeq() << " sequences and " << new_aln->getNSite() << " sites extracted" << endl;
378 // params = origin_params;
379 }
380
381 in.clear();
382 // set the failbit again
383 in.exceptions(ios::failbit | ios::badbit);
384 in.close();
385 } catch(ios::failure) {
386 outError(ERR_READ_INPUT);
387 } catch (string str) {
388 outError(str);
389 }
390
391
392 }
393
readPartitionNexus(Params & params)394 void SuperAlignment::readPartitionNexus(Params ¶ms) {
395 // Params origin_params = params;
396 MSetsBlock *sets_block = new MSetsBlock();
397 NxsTaxaBlock *taxa_block = NULL;
398 NxsAssumptionsBlock *assumptions_block = NULL;
399 NxsDataBlock *data_block = NULL;
400 MyReader nexus(params.partition_file);
401 nexus.Add(sets_block);
402
403 if (!params.aln_file) {
404 taxa_block = new NxsTaxaBlock();
405 assumptions_block = new NxsAssumptionsBlock(taxa_block);
406 data_block = new NxsDataBlock(taxa_block, assumptions_block);
407 nexus.Add(taxa_block);
408 nexus.Add(assumptions_block);
409 nexus.Add(data_block);
410 }
411
412 MyToken token(nexus.inf);
413 nexus.Execute(token);
414
415 Alignment *input_aln = NULL;
416 if (params.aln_file) {
417 input_aln = createAlignment(params.aln_file, params.sequence_type, params.intype, params.model_name);
418 } else {
419 if (data_block->GetNTax() > 0) {
420 input_aln = new Alignment(data_block, params.sequence_type, params.model_name);
421 }
422 delete data_block;
423 delete assumptions_block;
424 delete taxa_block;
425 }
426
427 bool empty_partition = true;
428 vector<CharSet*>::iterator it;
429 for (it = sets_block->charsets.begin(); it != sets_block->charsets.end(); it++)
430 if ((*it)->model_name != "") {
431 empty_partition = false;
432 break;
433 }
434 if (empty_partition) {
435 cout << "NOTE: No CharPartition defined, use all CharSets" << endl;
436 }
437
438 cout << endl << "Loading " << sets_block->charsets.size() << " partitions..." << endl;
439
440 for (it = sets_block->charsets.begin(); it != sets_block->charsets.end(); it++)
441 if (empty_partition || (*it)->char_partition != "") {
442 if ((*it)->model_name == "")
443 (*it)->model_name = params.model_name;
444 if ((*it)->aln_file == "" && !input_aln) {
445 if (!(*it)->position_spec.empty()) {
446 (*it)->aln_file = (*it)->position_spec;
447 (*it)->position_spec = "";
448 } else
449 outError("No input data for partition ", (*it)->name);
450 }
451 if ((*it)->sequence_type=="" && params.sequence_type)
452 (*it)->sequence_type = params.sequence_type;
453
454 if ((*it)->sequence_type == "" && !(*it)->model_name.empty()) {
455 // try to get sequence type from model
456 //TODO: why compile error?
457 (*it)->sequence_type = detectSeqTypeName((*it)->model_name.substr(0, (*it)->model_name.find_first_of("+*")));
458 }
459 if ((*it)->aln_file == "" && ((*it)->position_spec == "" || (*it)->position_spec == "*"))
460 outError("Empty position range for partition ", (*it)->name);
461 trimString((*it)->sequence_type);
462 // cout << endl << "Reading partition " << info.name << " (model=" << info.model_name << ", aln=" <<
463 // info.aln_file << ", seq=" << info.sequence_type << ", pos=" << ((info.position_spec.length() >= 20) ? info.position_spec.substr(0,20)+"..." : info.position_spec) << ") ..." << endl;
464 if ((*it)->sequence_type != "" && Alignment::getSeqType((*it)->sequence_type.c_str()) == SEQ_UNKNOWN)
465 outError("Unknown sequence type " + (*it)->sequence_type);
466
467 // TODO move to supertree
468 // info.nniMoves[0].ptnlh = NULL;
469 // info.nniMoves[1].ptnlh = NULL;
470 // info.cur_ptnlh = NULL;
471 // part_info.push_back(info);
472 Alignment *part_aln;
473 if ((*it)->aln_file != "") {
474 part_aln = createAlignment((*it)->aln_file, (*it)->sequence_type.c_str(), params.intype, (*it)->model_name);
475 } else {
476 part_aln = input_aln;
477 }
478 if (!(*it)->position_spec.empty() && (*it)->position_spec != "*") {
479 Alignment *new_aln = new Alignment();
480 new_aln->extractSites(part_aln, (*it)->position_spec.c_str());
481 if (part_aln != input_aln) delete part_aln;
482 part_aln = new_aln;
483 }
484 if (part_aln->seq_type == SEQ_DNA && ((*it)->sequence_type.substr(0, 5) == "CODON" || (*it)->sequence_type.substr(0, 5) == "NT2AA")) {
485 Alignment *new_aln = new Alignment();
486 new_aln->convertToCodonOrAA(part_aln, &(*it)->sequence_type[5], (*it)->sequence_type.substr(0, 5) == "NT2AA");
487 if (part_aln != input_aln) delete part_aln;
488 part_aln = new_aln;
489 }
490 Alignment *new_aln;
491 if (params.remove_empty_seq)
492 new_aln = part_aln->removeGappySeq();
493 else
494 new_aln = part_aln;
495 // also rebuild states set of each sequence for likelihood computation
496 // new_aln->buildSeqStates();
497
498 if (part_aln != new_aln && part_aln != input_aln) delete part_aln;
499 new_aln->name = (*it)->name;
500 new_aln->model_name = (*it)->model_name;
501 new_aln->aln_file = (*it)->aln_file;
502 new_aln->position_spec = (*it)->position_spec;
503 new_aln->sequence_type = (*it)->sequence_type;
504 new_aln->tree_len = (*it)->tree_len;
505 partitions.push_back(new_aln);
506 // PhyloTree *tree = new PhyloTree(new_aln);
507 // push_back(tree);
508 // params = origin_params;
509 // cout << new_aln->getNSeq() << " sequences and " << new_aln->getNSite() << " sites extracted" << endl;
510 }
511
512 if (input_aln)
513 delete input_aln;
514 delete sets_block;
515 }
516
readPartitionDir(string partition_dir,char * sequence_type,InputType & intype,string model,bool remove_empty_seq)517 void SuperAlignment::readPartitionDir(string partition_dir, char *sequence_type,
518 InputType &intype, string model, bool remove_empty_seq) {
519 // Params origin_params = params;
520
521 StrVector filenames;
522 string dir = partition_dir;
523 if (dir.back() != '/')
524 dir.append("/");
525 getFilesInDir(partition_dir.c_str(), filenames);
526 if (filenames.empty())
527 outError("No file found in ", partition_dir);
528 std::sort(filenames.begin(), filenames.end());
529 cout << "Reading " << filenames.size() << " alignment files in directory " << partition_dir << endl;
530
531 for (auto it = filenames.begin(); it != filenames.end(); it++)
532 {
533 Alignment *part_aln;
534 part_aln = createAlignment(dir+*it, sequence_type, intype, model_name);
535 // if (part_aln->seq_type == SEQ_DNA && (strncmp(params.sequence_type, "CODON", 5) == 0 || strncmp(params.sequence_type, "NT2AA", 5) == 0)) {
536 // Alignment *new_aln = new Alignment();
537 // new_aln->convertToCodonOrAA(part_aln, params.sequence_type+5, strncmp(params.sequence_type, "NT2AA", 5) == 0);
538 // delete part_aln;
539 // part_aln = new_aln;
540 // }
541 Alignment *new_aln;
542 if (remove_empty_seq)
543 new_aln = part_aln->removeGappySeq();
544 else
545 new_aln = part_aln;
546 // also rebuild states set of each sequence for likelihood computation
547 // new_aln->buildSeqStates();
548
549 if (part_aln != new_aln) delete part_aln;
550 new_aln->name = *it;
551 new_aln->model_name = model_name;
552 new_aln->aln_file = dir + *it;
553 new_aln->position_spec = "";
554 if (sequence_type)
555 new_aln->sequence_type = sequence_type;
556 partitions.push_back(new_aln);
557 }
558 }
559
readPartitionList(string file_list,char * sequence_type,InputType & intype,string model,bool remove_empty_seq)560 void SuperAlignment::readPartitionList(string file_list, char *sequence_type,
561 InputType &intype, string model, bool remove_empty_seq)
562 {
563 // Params origin_params = params;
564
565 StrVector filenames;
566 stringstream ss(file_list);
567 string token;
568 while (getline(ss, token, ','))
569 filenames.push_back(token);
570 if (filenames.empty())
571 outError("No file found in ", file_list);
572 cout << "Reading " << filenames.size() << " alignment files..." << endl;
573
574 for (auto it = filenames.begin(); it != filenames.end(); it++)
575 {
576 Alignment *part_aln;
577 part_aln = createAlignment(*it, sequence_type, intype, model_name);
578 // if (part_aln->seq_type == SEQ_DNA && (strncmp(params.sequence_type, "CODON", 5) == 0 || strncmp(params.sequence_type, "NT2AA", 5) == 0)) {
579 // Alignment *new_aln = new Alignment();
580 // new_aln->convertToCodonOrAA(part_aln, params.sequence_type+5, strncmp(params.sequence_type, "NT2AA", 5) == 0);
581 // delete part_aln;
582 // part_aln = new_aln;
583 // }
584 Alignment *new_aln;
585 if (remove_empty_seq)
586 new_aln = part_aln->removeGappySeq();
587 else
588 new_aln = part_aln;
589 // also rebuild states set of each sequence for likelihood computation
590 // new_aln->buildSeqStates();
591
592 if (part_aln != new_aln) delete part_aln;
593 new_aln->name = *it;
594 new_aln->model_name = model_name;
595 new_aln->aln_file = *it;
596 new_aln->position_spec = "";
597 if (sequence_type)
598 new_aln->sequence_type = sequence_type;
599 partitions.push_back(new_aln);
600 }
601 }
602
printPartition(const char * filename,const char * aln_file)603 void SuperAlignment::printPartition(const char *filename, const char *aln_file) {
604 try {
605 ofstream out;
606 out.exceptions(ios::failbit | ios::badbit);
607 out.open(filename);
608 printPartition(out, aln_file);
609 out.close();
610 cout << "Partition information was printed to " << filename << endl;
611 } catch (ios::failure &) {
612 outError(ERR_WRITE_OUTPUT, filename);
613 }
614
615 }
616
printPartition(ostream & out,const char * aln_file,bool append)617 void SuperAlignment::printPartition(ostream &out, const char *aln_file, bool append) {
618 if (append)
619 out << endl;
620 else
621 out << "#nexus" << endl;
622 if (aln_file)
623 out << "[ partition information for alignment written in " << aln_file <<" file ]" << endl;
624 out << "begin sets;" << endl;
625 int part; int start_site;
626 for (part = 0, start_site = 1; part < partitions.size(); part++) {
627 string name = partitions[part]->name;
628 replace(name.begin(), name.end(), '+', '_');
629 int end_site = start_site + partitions[part]->getNSite();
630 out << " charset " << name << " = " << start_site << "-" << end_site-1 << ";" << endl;
631 start_site = end_site;
632 }
633 bool ok_model = true;
634 for (part = 0; part < partitions.size(); part++)
635 if (partitions[part]->model_name.empty()) {
636 ok_model = false;
637 break;
638 }
639 if (ok_model) {
640 out << " charpartition mymodels =" << endl;
641 for (part = 0; part < partitions.size(); part++) {
642 string name = partitions[part]->name;
643 replace(name.begin(), name.end(), '+', '_');
644 if (part > 0) out << "," << endl;
645 // out << " " << at(part)->getModelNameParams() << ":" << name;
646 out << " " << partitions[part]->model_name << ":" << name;
647 }
648 out << ";" << endl;
649 }
650 out << "end;" << endl;
651 }
652
printBestPartition(const char * filename)653 void SuperAlignment::printBestPartition(const char *filename) {
654 try {
655 ofstream out;
656 out.exceptions(ios::failbit | ios::badbit);
657 out.open(filename);
658 out << "#nexus" << endl
659 << "begin sets;" << endl;
660 int part;
661 for (part = 0; part < partitions.size(); part++) {
662 string name = partitions[part]->name;
663 replace(name.begin(), name.end(), '+', '_');
664 out << " charset " << name << " = ";
665 if (!partitions[part]->aln_file.empty()) out << partitions[part]->aln_file << ": ";
666 if (partitions[part]->seq_type == SEQ_CODON)
667 out << "CODON, ";
668 string pos = partitions[part]->position_spec;
669 replace(pos.begin(), pos.end(), ',' , ' ');
670 out << pos << ";" << endl;
671 }
672 bool ok_model = true;
673 for (part = 0; part < partitions.size(); part++)
674 if (partitions[part]->model_name.empty()) {
675 ok_model = false;
676 break;
677 }
678 if (ok_model) {
679 out << " charpartition mymodels =" << endl;
680 for (part = 0; part < partitions.size(); part++) {
681 string name = partitions[part]->name;
682 replace(name.begin(), name.end(), '+', '_');
683 if (part > 0) out << "," << endl;
684 out << " " << partitions[part]->model_name << ": " << name;
685 }
686 out << ";" << endl;
687 }
688 out << "end;" << endl;
689 out.close();
690 cout << "Partition information was printed to " << filename << endl;
691 } catch (ios::failure &) {
692 outError(ERR_WRITE_OUTPUT, filename);
693 }
694
695 }
696
697
printPartitionRaxml(const char * filename)698 void SuperAlignment::printPartitionRaxml(const char *filename) {
699 int part;
700 // for (part = 0; part < partitions.size(); part++) {
701 // if (partitions[part]->aln_file != "") {
702 // cout << "INFO: Printing partition in RAxML format is not possible" << endl;
703 // return;
704 // }
705 // }
706 try {
707 ofstream out;
708 out.exceptions(ios::failbit | ios::badbit);
709 out.open(filename);
710 int start_site;
711 for (part = 0, start_site = 1; part < partitions.size(); part++) {
712 string name = partitions[part]->name;
713 replace(name.begin(), name.end(), '+', '_');
714 int end_site = start_site + partitions[part]->getNSite();
715 switch (partitions[part]->seq_type) {
716 case SEQ_DNA: out << "DNA, "; break;
717 case SEQ_BINARY: out << "BIN, "; break;
718 case SEQ_MORPH: out << "MULTI, "; break;
719 default: out << partitions[part]->model_name << ","; break;
720 }
721 out << name << " = " << start_site << "-" << end_site-1 << endl;
722 start_site = end_site;
723 }
724 out.close();
725 cout << "Partition information in Raxml format was printed to " << filename << endl;
726 } catch (ios::failure &) {
727 outError(ERR_WRITE_OUTPUT, filename);
728 }
729
730 }
731
printBestPartitionRaxml(const char * filename)732 void SuperAlignment::printBestPartitionRaxml(const char *filename) {
733 int part;
734 // for (part = 0; part < partitions.size(); part++) {
735 // if (partitions[part]->aln_file != "") {
736 // cout << "INFO: Printing partition in RAxML format is not possible" << endl;
737 // return;
738 // }
739 // }
740 try {
741 ofstream out;
742 out.exceptions(ios::failbit | ios::badbit);
743 out.open(filename);
744 for (part = 0; part < partitions.size(); part++) {
745 string name = partitions[part]->name;
746 replace(name.begin(), name.end(), '+', '_');
747 if (partitions[part]->model_name.find("+ASC") != string::npos)
748 out << "ASC_";
749 switch (partitions[part]->seq_type) {
750 case SEQ_DNA: out << "DNA"; break;
751 case SEQ_BINARY: out << "BIN"; break;
752 case SEQ_MORPH: out << "MULTI"; break;
753 case SEQ_PROTEIN:
754 out << partitions[part]->model_name.substr(0, partitions[part]->model_name.find_first_of("*{+"));
755 break;
756 case SEQ_CODON:
757 out << "CODON_" << partitions[part]->model_name.substr(0, partitions[part]->model_name.find_first_of("*{+"));
758 break;
759 default: out << partitions[part]->model_name; break;
760 }
761 if (partitions[part]->model_name.find("+FO") != string::npos)
762 out << "X";
763 else if (partitions[part]->model_name.find("+F") != string::npos)
764 out << "F";
765
766 out << ", " << name << " = " << partitions[part]->position_spec << endl;
767 }
768 out.close();
769 cout << "Partition information in Raxml format was printed to " << filename << endl;
770 } catch (ios::failure &) {
771 outError(ERR_WRITE_OUTPUT, filename);
772 }
773
774 }
775
776
linkSubAlignment(int part)777 void SuperAlignment::linkSubAlignment(int part) {
778 ASSERT(taxa_index.size() == getNSeq());
779 int nseq = getNSeq(), seq;
780 vector<bool> checked;
781 checked.resize(partitions[part]->getNSeq(), false);
782 for (seq = 0; seq < nseq; seq++) {
783 int id = partitions[part]->getSeqID(getSeqName(seq));
784 if (id < 0)
785 taxa_index[seq][part] = -1;
786 else {
787 taxa_index[seq][part] = id;
788 checked[id] = true;
789 }
790 }
791 if (verbose_mode >= VB_MED) {
792
793 }
794 // sanity check that all seqnames in partition must be present in superalignment
795 for (seq = 0; seq < checked.size(); seq++) {
796 ASSERT(checked[seq]);
797 }
798 }
799
extractSubAlignment(Alignment * aln,IntVector & seq_id,int min_true_char,int min_taxa,IntVector * kept_partitions)800 void SuperAlignment::extractSubAlignment(Alignment *aln, IntVector &seq_id, int min_true_char, int min_taxa, IntVector *kept_partitions) {
801 ASSERT(aln->isSuperAlignment());
802 SuperAlignment *saln = (SuperAlignment*)aln;
803 name = aln->name;
804 model_name = aln->model_name;
805 sequence_type = aln->sequence_type;
806 position_spec = aln->position_spec;
807 aln_file = aln->aln_file;
808
809 int i;
810 IntVector::iterator it;
811 for (it = seq_id.begin(); it != seq_id.end(); it++) {
812 ASSERT(*it >= 0 && *it < aln->getNSeq());
813 seq_names.push_back(aln->getSeqName(*it));
814 }
815
816 // BUG HERE!
817 //Alignment::extractSubAlignment(aln, seq_id, 0);
818
819 taxa_index.resize(getNSeq());
820 for (i = 0; i < getNSeq(); i++)
821 taxa_index[i].resize(saln->partitions.size(), -1);
822
823 int part = 0;
824 // partitions.resize(saln->partitions.size());
825 partitions.resize(0);
826 for (vector<Alignment*>::iterator ait = saln->partitions.begin(); ait != saln->partitions.end(); ait++, part++) {
827 IntVector sub_seq_id;
828 for (IntVector::iterator it = seq_id.begin(); it != seq_id.end(); it++)
829 if (saln->taxa_index[*it][part] >= 0)
830 sub_seq_id.push_back(saln->taxa_index[*it][part]);
831 if (sub_seq_id.size() < min_taxa)
832 continue;
833 Alignment *subaln = new Alignment;
834 subaln->extractSubAlignment(*ait, sub_seq_id, 0);
835 partitions.push_back(subaln);
836 linkSubAlignment(partitions.size()-1);
837 if (kept_partitions) kept_partitions->push_back(part);
838 // cout << subaln->getNSeq() << endl;
839 // subaln->printPhylip(cout);
840 }
841
842 if (partitions.size() < saln->partitions.size()) {
843 for (i = 0; i < getNSeq(); i++)
844 taxa_index[i].resize(partitions.size());
845 }
846
847 // now build the patterns based on taxa_index
848 buildPattern();
849 }
850
extractPartitions(IntVector & part_id)851 SuperAlignment *SuperAlignment::extractPartitions(IntVector &part_id) {
852 SuperAlignment *newaln = new SuperAlignment;
853 newaln->name = name;
854 newaln->model_name = model_name;
855 newaln->sequence_type = sequence_type;
856 newaln->position_spec = position_spec;
857 newaln->aln_file = aln_file;
858
859 unordered_set<string> seq_names_set;
860 IntVector::iterator it;
861 for (it = part_id.begin(); it != part_id.end(); it++) {
862 for (auto seq = partitions[*it]->seq_names.begin(); seq != partitions[*it]->seq_names.end(); seq++)
863 if (seq_names_set.find(*seq) == seq_names_set.end()) {
864 newaln->seq_names.push_back(*seq);
865 seq_names_set.insert(*seq);
866 }
867 }
868
869 int i;
870 newaln->taxa_index.resize(newaln->getNSeq());
871 for (i = 0; i < newaln->getNSeq(); i++)
872 newaln->taxa_index[i].resize(part_id.size(), -1);
873
874 int part = 0;
875 for (auto ait = part_id.begin(); ait != part_id.end(); ait++, part++) {
876 newaln->partitions.push_back(partitions[*ait]);
877 newaln->linkSubAlignment(newaln->partitions.size()-1);
878 }
879
880 // now build the patterns based on taxa_index
881 newaln->buildPattern();
882 return newaln;
883 }
884
removePartitions(set<int> & removed_id)885 void SuperAlignment::removePartitions(set<int> &removed_id) {
886
887 // remove part_id from partitions
888 vector<Alignment*> new_partitions;
889 int i;
890 for (i = 0; i < partitions.size(); i++)
891 if (removed_id.find(i) == removed_id.end()) {
892 // not found in the removed set
893 new_partitions.push_back(partitions[i]);
894 } else {
895 delete partitions[i];
896 partitions[i] = NULL;
897 }
898
899 ASSERT(new_partitions.size() + removed_id.size() == partitions.size());
900 partitions = new_partitions;
901
902 // get the union seq_names of remaining partitions
903 unordered_set<string> seq_names_set;
904 seq_names.clear();
905 for (auto it = partitions.begin(); it != partitions.end(); it++) {
906 for (auto seq = (*it)->seq_names.begin(); seq != (*it)->seq_names.end(); seq++)
907 if (seq_names_set.find(*seq) == seq_names_set.end()) {
908 seq_names.push_back(*seq);
909 seq_names_set.insert(*seq);
910 }
911 }
912
913
914 // build the taxa_index
915 taxa_index.resize(getNSeq());
916 for (i = 0; i < getNSeq(); i++)
917 taxa_index[i].resize(partitions.size(), -1);
918 for (i = 0; i < partitions.size(); i++)
919 linkSubAlignment(i);
920
921 // now build the patterns based on taxa_index
922 buildPattern();
923 }
924
removeIdenticalSeq(string not_remove,bool keep_two,StrVector & removed_seqs,StrVector & target_seqs)925 Alignment *SuperAlignment::removeIdenticalSeq(string not_remove, bool keep_two, StrVector &removed_seqs, StrVector &target_seqs) {
926 IntVector checked;
927 vector<bool> removed;
928 checked.resize(getNSeq(), 0);
929 removed.resize(getNSeq(), false);
930 int seq1;
931
932 for (seq1 = 0; seq1 < getNSeq(); seq1++) {
933 if (checked[seq1]) continue;
934 bool first_ident_seq = true;
935 for (int seq2 = seq1+1; seq2 < getNSeq(); seq2++) {
936 if (getSeqName(seq2) == not_remove || removed[seq2]) continue;
937 bool equal_seq = true;
938 int part = 0;
939 // check if seq1 and seq2 are identical over all partitions
940 for (vector<Alignment*>::iterator ait = partitions.begin(); ait != partitions.end(); ait++, part++) {
941 int subseq1 = taxa_index[seq1][part];
942 int subseq2 = taxa_index[seq2][part];
943 if (subseq1 < 0 && subseq2 < 0) // continue if both seqs are absent in this partition
944 continue;
945 if (subseq1 < 0 && subseq2 > 0) {
946 // if one sequence is present and the other is absent for a gene, we conclude that they are not identical
947 equal_seq = false;
948 break;
949 }
950 if (subseq1 > 0 && subseq2 < 0) {
951 // if one sequence is present and the other is absent for a gene, we conclude that they are not identical
952 equal_seq = false;
953 break;
954 }
955 // now if both seqs are present, check sequence content
956 for (iterator it = (*ait)->begin(); it != (*ait)->end(); it++)
957 if ((*it)[subseq1] != (*it)[subseq2]) {
958 equal_seq = false;
959 break;
960 }
961 }
962 if (equal_seq) {
963 if (removed_seqs.size() < getNSeq()-3 && (!keep_two || !first_ident_seq)) {
964 removed_seqs.push_back(getSeqName(seq2));
965 target_seqs.push_back(getSeqName(seq1));
966 removed[seq2] = true;
967 } else {
968 cout << "NOTE: " << getSeqName(seq2) << " is identical to " << getSeqName(seq1) << " but kept for subsequent analysis" << endl;
969 }
970 checked[seq2] = 1;
971 first_ident_seq = false;
972 }
973 }
974 checked[seq1] = 1;
975 }
976
977 if (removed_seqs.empty()) return this; // do nothing if the list is empty
978
979 if (removed_seqs.size() >= getNSeq()-3)
980 outWarning("Your alignment contains too many identical sequences!");
981
982 // now remove identical sequences
983 IntVector keep_seqs;
984 for (seq1 = 0; seq1 < getNSeq(); seq1++)
985 if (!removed[seq1]) keep_seqs.push_back(seq1);
986 SuperAlignment *aln;
987 aln = new SuperAlignment;
988 aln->extractSubAlignment(this, keep_seqs, 0);
989 return aln;
990 }
991
checkAbsentStates(string msg)992 int SuperAlignment::checkAbsentStates(string msg) {
993 int count = 0;
994 for (auto it = partitions.begin(); it != partitions.end(); it++)
995 count += (*it)->checkAbsentStates("partition " + convertIntToString((it-partitions.begin())+1));
996 return count;
997 }
998
999 /*
1000 void SuperAlignment::checkGappySeq() {
1001 int nseq = getNSeq(), part = 0, i;
1002 IntVector gap_only_seq;
1003 gap_only_seq.resize(nseq, 1);
1004 //cout << "Checking gaps..." << endl;
1005 for (vector<Alignment*>::iterator it = partitions.begin(); it != partitions.end(); it++, part++) {
1006 IntVector keep_seqs;
1007 for (i = 0; i < nseq; i++)
1008 if (taxa_index[i][part] >= 0)
1009 if (!(*it)->isGapOnlySeq(taxa_index[i][part])) {
1010 keep_seqs.push_back(taxa_index[i][part]);
1011 gap_only_seq[i] = 0;
1012 }
1013 if (keep_seqs.size() < (*it)->getNSeq()) {
1014 cout << "Discard " << (*it)->getNSeq() - keep_seqs.size()
1015 << " sequences from partition number " << part+1 << endl;
1016 Alignment *aln = new Alignment;
1017 aln->extractSubAlignment((*it), keep_seqs, 0);
1018 delete (*it);
1019 (*it) = aln;
1020 linkSubAlignment(part);
1021 }
1022 cout << __func__ << " num_states = " << (*it)->num_states << endl;
1023 }
1024 int wrong_seq = 0;
1025 for (i = 0; i < nseq; i++)
1026 if (gap_only_seq[i]) {
1027 cout << "ERROR: Sequence " << getSeqName(i) << " contains only gaps or missing data" << endl;
1028 wrong_seq++;
1029 }
1030 if (wrong_seq) {
1031 outError("Some sequences (see above) are problematic, please check your alignment again");
1032 }
1033 }
1034 */
getSitePatternIndex(IntVector & pattern_index)1035 void SuperAlignment::getSitePatternIndex(IntVector &pattern_index) {
1036 int nptn = 0;
1037 for (vector<Alignment*>::iterator it = partitions.begin(); it != partitions.end(); it++) {
1038 int nsite = pattern_index.size();
1039 pattern_index.insert(pattern_index.end(), (*it)->site_pattern.begin(), (*it)->site_pattern.end());
1040 for (int i = nsite; i < pattern_index.size(); i++)
1041 pattern_index[i] += nptn;
1042 nptn += (*it)->getNPattern();
1043 }
1044 }
1045
getPatternFreq(IntVector & pattern_freq)1046 void SuperAlignment::getPatternFreq(IntVector &pattern_freq) {
1047 ASSERT(isSuperAlignment());
1048 size_t offset = 0;
1049 if (!pattern_freq.empty()) pattern_freq.resize(0);
1050 for (vector<Alignment*>::iterator it = partitions.begin(); it != partitions.end(); it++) {
1051 IntVector freq;
1052 (*it)->getPatternFreq(freq);
1053 pattern_freq.insert(pattern_freq.end(), freq.begin(), freq.end());
1054 offset += freq.size();
1055 }
1056 }
1057
getPatternFreq(int * pattern_freq)1058 void SuperAlignment::getPatternFreq(int *pattern_freq) {
1059 ASSERT(isSuperAlignment());
1060 size_t offset = 0;
1061 for (vector<Alignment*>::iterator it = partitions.begin(); it != partitions.end(); it++) {
1062 (*it)->getPatternFreq(pattern_freq + offset);
1063 offset += (*it)->getNPattern();
1064 }
1065 }
1066
printSiteInfo(const char * filename)1067 void SuperAlignment::printSiteInfo(const char* filename) {
1068 try {
1069 ofstream out(filename);
1070 printSiteInfoHeader(out, filename, true);
1071 int id = 1;
1072 for (auto it = partitions.begin(); it != partitions.end(); it++, id++)
1073 (*it)->printSiteInfo(out, id);
1074 out.close();
1075 } catch (...) {
1076 outError(ERR_WRITE_OUTPUT, filename);
1077 }
1078 }
1079
computeDivergenceMatrix(double * pair_freq,double * state_freq,bool normalize)1080 void SuperAlignment::computeDivergenceMatrix(double *pair_freq, double *state_freq, bool normalize) {
1081 int nstates = partitions[0]->num_states;
1082 int nstates2 = nstates*nstates;
1083 memset(pair_freq, 0, sizeof(double)*nstates2);
1084 memset(state_freq, 0, sizeof(double)*nstates);
1085
1086 double *part_pair_freq = new double[nstates2];
1087 double *part_state_freq = new double[nstates];
1088 int i, j;
1089
1090 for (auto it = partitions.begin(); it != partitions.end(); it++) {
1091 (*it)->computeDivergenceMatrix(part_pair_freq, part_state_freq, false);
1092 for (i = 0; i < nstates2; i++)
1093 pair_freq[i] += part_pair_freq[i];
1094 for (i = 0; i < nstates; i++)
1095 state_freq[i] += part_state_freq[i];
1096 }
1097 if (normalize) {
1098 double sum = 0.0;
1099 for (i = 0; i < nstates; i++)
1100 sum += state_freq[i];
1101 sum = 1.0/sum;
1102 for (i = 0; i < nstates; i++)
1103 state_freq[i] *= sum;
1104 for (i = 0; i < nstates; i++) {
1105 sum = 0.0;
1106 double *pair_freq_ptr = pair_freq + (i*nstates);
1107 for (j = 0; j < nstates; j++)
1108 sum += pair_freq_ptr[j];
1109 sum = 1.0/sum;
1110 for (j = 0; j < nstates; j++)
1111 pair_freq_ptr[j] *= sum;
1112 }
1113 }
1114 delete [] part_state_freq;
1115 delete [] part_pair_freq;
1116 }
1117
doSymTest(size_t vecid,vector<SymTestResult> & vec_sym,vector<SymTestResult> & vec_marsym,vector<SymTestResult> & vec_intsym,int * rstream,vector<SymTestStat> * stats)1118 void SuperAlignment::doSymTest(size_t vecid, vector<SymTestResult> &vec_sym, vector<SymTestResult> &vec_marsym,
1119 vector<SymTestResult> &vec_intsym, int *rstream, vector<SymTestStat> *stats) {
1120
1121 vector<vector<SymTestStat> >all_stats;
1122 if (stats)
1123 all_stats.resize(partitions.size());
1124
1125 int i, nparts = partitions.size();
1126 #ifdef _OPENMP
1127 #pragma omp parallel for private(i)
1128 #endif
1129 for (i = 0; i < nparts; i++) {
1130 if (stats) {
1131 partitions[i]->doSymTest(vecid + i, vec_sym, vec_marsym, vec_intsym, rstream, &all_stats[i]);
1132 for (auto it = all_stats[i].begin(); it != all_stats[i].end(); it++)
1133 it->part = i;
1134 } else
1135 partitions[i]->doSymTest(vecid + i, vec_sym, vec_marsym, vec_intsym, rstream);
1136 }
1137 if (stats) {
1138 for (i = 0; i < nparts; i++)
1139 stats->insert(stats->end(), all_stats[i].begin(), all_stats[i].end());
1140 }
1141 }
1142
1143 /*
1144 void SuperAlignment::createBootstrapAlignment(Alignment *aln, IntVector* pattern_freq, const char *spec) {
1145 ASSERT(aln->isSuperAlignment());
1146 Alignment::copyAlignment(aln);
1147 SuperAlignment *super_aln = (SuperAlignment*) aln;
1148 ASSERT(partitions.empty());
1149 name = aln->name;
1150 model_name = aln->model_name;
1151 sequence_type = aln->sequence_type;
1152 position_spec = aln->position_spec;
1153 aln_file = aln->aln_file;
1154
1155 if (spec && strncmp(spec, "GENE", 4) == 0) {
1156 // resampling whole genes
1157 partitions.resize(super_aln->partitions.size(), NULL);
1158 int i, ptn;
1159 for (i = 0; i < super_aln->partitions.size(); i++) {
1160
1161 // get a random gene
1162 int part = random_int(super_aln->partitions.size());
1163
1164 // ptn_freq stores pattern frequency of bootstrap aln
1165
1166 IntVector ptn_freq;
1167 if (strncmp(spec,"GENESITE",8) == 0) {
1168 // resample sites of this gene
1169 super_aln->partitions[part]->createBootstrapAlignment(ptn_freq);
1170 ASSERT(ptn_freq.size() == super_aln->partitions[part]->size());
1171 } else {
1172 // copy ptn_freq from this gene
1173 for (ptn = 0; ptn < super_aln->partitions[part]->size(); ptn++)
1174 ptn_freq.push_back(super_aln->partitions[part]->at(ptn).frequency);
1175 }
1176
1177 if (!partitions[part]) {
1178 // allocate the partition
1179 partitions[part] = new Alignment;
1180 partitions[part]->copyAlignment(super_aln->partitions[part]);
1181 for (ptn = 0; ptn < super_aln->partitions[part]->size(); ptn++)
1182 partitions[part]->at(ptn).frequency = ptn_freq[ptn];
1183 } else {
1184 // increase frequency if already existed
1185 for (ptn = 0; ptn < super_aln->partitions[part]->size(); ptn++)
1186 partitions[part]->at(ptn).frequency += ptn_freq[ptn];
1187 }
1188 }
1189
1190 // fulfill genes that are missing
1191 for (i = 0; i < partitions.size(); i++)
1192 if (!partitions[i]) {
1193 partitions[i] = new Alignment;
1194 partitions[i]->copyAlignment(super_aln->partitions[i]);
1195 // reset all frequency
1196 for (ptn = 0; ptn < partitions[i]->size(); ptn++)
1197 partitions[i]->at(ptn).frequency = 0;
1198 }
1199
1200 // fill up pattern_freq vector
1201 if (pattern_freq) {
1202 pattern_freq->resize(0);
1203 for (i = 0; i < partitions.size(); i++)
1204 for (ptn = 0; ptn < partitions[i]->size(); ptn++)
1205 pattern_freq->push_back(partitions[i]->at(ptn).frequency);
1206 }
1207 } else if (!spec) {
1208 // resampling sites within genes
1209 for (vector<Alignment*>::iterator it = super_aln->partitions.begin(); it != super_aln->partitions.end(); it++) {
1210 Alignment *boot_aln = new Alignment;
1211 if (pattern_freq) {
1212 IntVector part_pattern_freq;
1213 boot_aln->createBootstrapAlignment(*it, &part_pattern_freq);
1214 pattern_freq->insert(pattern_freq->end(), part_pattern_freq.begin(), part_pattern_freq.end());
1215 } else {
1216 boot_aln->createBootstrapAlignment(*it);
1217 }
1218 partitions.push_back(boot_aln);
1219 }
1220 } else {
1221 outError("Wrong -bsam, either -bsam GENE or -bsam GENESITE");
1222 }
1223 taxa_index = super_aln->taxa_index;
1224 countConstSite();
1225 }
1226 */
1227
createBootstrapAlignment(Alignment * aln,IntVector * pattern_freq,const char * spec)1228 void SuperAlignment::createBootstrapAlignment(Alignment *aln, IntVector* pattern_freq, const char *spec) {
1229 ASSERT(aln->isSuperAlignment());
1230 SuperAlignment *super_aln = (SuperAlignment*) aln;
1231 ASSERT(partitions.empty());
1232 name = aln->name;
1233 model_name = aln->model_name;
1234 sequence_type = aln->sequence_type;
1235 position_spec = aln->position_spec;
1236 aln_file = aln->aln_file;
1237
1238 if (!spec) {
1239 // resampling sites within genes
1240 Alignment::copyAlignment(aln);
1241 partitions.reserve(super_aln->partitions.size());
1242 for (vector<Alignment*>::iterator it = super_aln->partitions.begin(); it != super_aln->partitions.end(); it++) {
1243 Alignment *boot_aln = new Alignment;
1244 if (pattern_freq) {
1245 IntVector part_pattern_freq;
1246 boot_aln->createBootstrapAlignment(*it, &part_pattern_freq);
1247 pattern_freq->insert(pattern_freq->end(), part_pattern_freq.begin(), part_pattern_freq.end());
1248 } else {
1249 boot_aln->createBootstrapAlignment(*it);
1250 }
1251 partitions.push_back(boot_aln);
1252 }
1253 taxa_index = super_aln->taxa_index;
1254 countConstSite();
1255 } else if (strcmp(spec, "GENE") == 0) {
1256 ASSERT(!pattern_freq);
1257 // resampling whole genes
1258 IntVector gene_freq;
1259 random_resampling(super_aln->partitions.size(), gene_freq);
1260 for (int i = 0; i < gene_freq.size(); i++)
1261 if (gene_freq[i] > 0) {
1262 Alignment *boot_aln = new Alignment;
1263 boot_aln->copyAlignment(super_aln->partitions[i]);
1264 if (gene_freq[i] > 1) {
1265 for (auto it = boot_aln->begin(); it != boot_aln->end(); it++)
1266 it->frequency *= gene_freq[i];
1267 auto site_pattern = boot_aln->site_pattern;
1268 for (int j = 1; j < gene_freq[i]; j++)
1269 boot_aln->site_pattern.insert(boot_aln->site_pattern.end(), site_pattern.begin(), site_pattern.end());
1270 boot_aln->countConstSite();
1271 }
1272 partitions.push_back(boot_aln);
1273 }
1274 init();
1275 } else if (strcmp(spec, "GENESITE") == 0) {
1276 ASSERT(!pattern_freq);
1277 // resampling whole genes then sites within resampled genes
1278 IntVector gene_freq;
1279 random_resampling(super_aln->partitions.size(), gene_freq);
1280 for (int i = 0; i < gene_freq.size(); i++)
1281 for (int rep = 0; rep < gene_freq[i]; rep++) {
1282 Alignment *boot_aln = new Alignment;
1283 boot_aln->createBootstrapAlignment(super_aln->partitions[i]);
1284 boot_aln->name = boot_aln->name + "." + convertIntToString(rep);
1285 partitions.push_back(boot_aln);
1286 }
1287 init();
1288 } else {
1289 outError("Wrong -bsam, either -bsam GENE or -bsam GENESITE");
1290 }
1291 }
1292
createBootstrapAlignment(IntVector & pattern_freq,const char * spec)1293 void SuperAlignment::createBootstrapAlignment(IntVector &pattern_freq, const char *spec) {
1294 ASSERT(isSuperAlignment());
1295 int nptn = 0;
1296 for (vector<Alignment*>::iterator it = partitions.begin(); it != partitions.end(); it++) {
1297 nptn += (*it)->getNPattern();
1298 }
1299 pattern_freq.resize(0);
1300 int *internal_freq = new int[nptn];
1301 createBootstrapAlignment(internal_freq, spec);
1302 pattern_freq.insert(pattern_freq.end(), internal_freq, internal_freq + nptn);
1303 delete [] internal_freq;
1304
1305 }
1306
1307
createBootstrapAlignment(int * pattern_freq,const char * spec,int * rstream)1308 void SuperAlignment::createBootstrapAlignment(int *pattern_freq, const char *spec, int *rstream) {
1309 ASSERT(isSuperAlignment());
1310 // if (spec && strncmp(spec, "GENE", 4) != 0) outError("Unsupported yet. ", __func__);
1311
1312 if (spec && strncmp(spec, "GENE", 4) == 0) {
1313 // resampling whole genes
1314 int nptn = 0;
1315 IntVector part_pos;
1316 for (vector<Alignment*>::iterator it = partitions.begin(); it != partitions.end(); it++) {
1317 part_pos.push_back(nptn);
1318 nptn += (*it)->getNPattern();
1319 }
1320 memset(pattern_freq, 0, nptn * sizeof(int));
1321 IntVector gene_freq;
1322 random_resampling(partitions.size(), gene_freq, rstream);
1323 for (int part = 0; part < partitions.size(); part++)
1324 for (int rep = 0; rep < gene_freq[part]; rep++){
1325 Alignment *aln = partitions[part];
1326 if (strncmp(spec,"GENESITE",8) == 0) {
1327 // then resampling sites in resampled gene
1328 IntVector sample;
1329 random_resampling(aln->getNSite(), sample, rstream);
1330 for (int site = 0; site < sample.size(); site++)
1331 for (int rep2 = 0; rep2 < sample[site]; rep2++) {
1332 int ptn_id = aln->getPatternID(site);
1333 pattern_freq[ptn_id + part_pos[part]]++;
1334 }
1335 } else {
1336 for (int ptn = 0; ptn < aln->getNPattern(); ptn++)
1337 pattern_freq[ptn + part_pos[part]] += aln->at(ptn).frequency;
1338 }
1339 }
1340 } else {
1341 // resampling sites within genes
1342 int offset = 0;
1343 for (vector<Alignment*>::iterator it = partitions.begin(); it != partitions.end(); it++) {
1344 if (spec && strncmp(spec, "SCALE=", 6) == 0)
1345 (*it)->createBootstrapAlignment(pattern_freq + offset, spec, rstream);
1346 else
1347 (*it)->createBootstrapAlignment(pattern_freq + offset, NULL, rstream);
1348 offset += (*it)->getNPattern();
1349 }
1350 }
1351 }
1352
1353 /**
1354 * shuffle alignment by randomizing the order of sites
1355 */
shuffleAlignment()1356 void SuperAlignment::shuffleAlignment() {
1357 ASSERT(isSuperAlignment());
1358 for (vector<Alignment*>::iterator it = partitions.begin(); it != partitions.end(); it++) {
1359 (*it)->shuffleAlignment();
1360 }
1361 }
1362
1363
computeObsDist(int seq1,int seq2)1364 double SuperAlignment::computeObsDist(int seq1, int seq2) {
1365 int site;
1366 int diff_pos = 0, total_pos = 0;
1367 for (site = 0; site < getNSite(); site++) {
1368 int id1 = taxa_index[seq1][site];
1369 int id2 = taxa_index[seq2][site];
1370 if (id1 < 0 || id2 < 0) continue;
1371 int num_states = partitions[site]->num_states;
1372 for (Alignment::iterator it = partitions[site]->begin(); it != partitions[site]->end(); it++)
1373 if ((*it)[id1] < num_states && (*it)[id2] < num_states) {
1374 total_pos += (*it).frequency;
1375 if ((*it)[id1] != (*it)[id2] )
1376 diff_pos += (*it).frequency;
1377 }
1378 }
1379 if (!total_pos)
1380 return MAX_GENETIC_DIST; // return +INF if no overlap between two sequences
1381 return ((double)diff_pos) / total_pos;
1382 }
1383
1384
computeDist(int seq1,int seq2)1385 double SuperAlignment::computeDist(int seq1, int seq2) {
1386 if (partitions.empty()) return 0.0;
1387 double obs_dist = computeObsDist(seq1, seq2);
1388 int num_states = partitions[0]->num_states;
1389 double z = (double)num_states / (num_states-1);
1390 double x = 1.0 - (z * obs_dist);
1391
1392 if (x <= 0) {
1393 // string str = "Too long distance between two sequences ";
1394 // str += getSeqName(seq1);
1395 // str += " and ";
1396 // str += getSeqName(seq2);
1397 // outWarning(str);
1398 return MAX_GENETIC_DIST;
1399 }
1400
1401 return -log(x) / z;
1402 //return computeObsDist(seq1, seq2);
1403 // AVERAGE DISTANCE
1404
1405 double dist = 0;
1406 int part = 0, num = 0;
1407 for (vector<Alignment*>::iterator it = partitions.begin(); it != partitions.end(); it++, part++) {
1408 int id1 = taxa_index[seq1][part];
1409 int id2 = taxa_index[seq2][part];
1410 if (id1 < 0 || id2 < 0) continue;
1411 dist += (*it)->computeDist(id1, id2);
1412 }
1413 if (num == 0) // two sequences are not overlapping at all!
1414 return MAX_GENETIC_DIST;
1415 return dist / num;
1416 }
1417
~SuperAlignment()1418 SuperAlignment::~SuperAlignment()
1419 {
1420 for (vector<Alignment*>::reverse_iterator it = partitions.rbegin(); it != partitions.rend(); it++)
1421 delete (*it);
1422 partitions.clear();
1423 }
1424
printAlignment(InputType format,ostream & out,bool append,const char * aln_site_list,int exclude_sites,const char * ref_seq_name)1425 void SuperAlignment::printAlignment(InputType format, ostream &out, bool append,
1426 const char *aln_site_list, int exclude_sites,
1427 const char *ref_seq_name)
1428 {
1429 Alignment *concat = concatenateAlignments();
1430 concat->printAlignment(format, out, append, aln_site_list, exclude_sites, ref_seq_name);
1431 delete concat;
1432 if (format == IN_NEXUS)
1433 printPartition(out, NULL, true);
1434 }
1435
printSubAlignments(Params & params)1436 void SuperAlignment::printSubAlignments(Params ¶ms) {
1437 vector<Alignment*>::iterator pit;
1438 string filename;
1439 int part;
1440 for (pit = partitions.begin(), part = 0; pit != partitions.end(); pit++, part++) {
1441 if (params.aln_output)
1442 filename = params.aln_output;
1443 else
1444 filename = params.out_prefix;
1445 filename += "." + (*pit)->name;
1446 int exclude_sites = (params.aln_nogaps) ? EXCLUDE_GAP : 0;
1447 (*pit)->printAlignment(params.aln_output_format, filename.c_str(), false, NULL, exclude_sites, NULL);
1448 }
1449 }
1450
computeUnconstrainedLogL()1451 double SuperAlignment::computeUnconstrainedLogL() {
1452 double logl = 0.0;
1453 vector<Alignment*>::iterator pit;
1454 for (pit = partitions.begin(); pit != partitions.end(); pit++)
1455 logl += (*pit)->computeUnconstrainedLogL();
1456 return logl;
1457 }
1458
computeMissingData()1459 double SuperAlignment::computeMissingData() {
1460 double ret = 0.0;
1461 size_t len = 0;
1462 vector<Alignment*>::iterator pit;
1463 for (pit = partitions.begin(); pit != partitions.end(); pit++) {
1464 ret += (*pit)->getNSeq() * (*pit)->getNSite();
1465 len += (*pit)->getNSite();
1466 }
1467 ret /= getNSeq() * len;
1468 return 1.0 - ret;
1469
1470 }
1471
concatenateAlignments(set<int> & ids)1472 Alignment *SuperAlignment::concatenateAlignments(set<int> &ids) {
1473 string union_taxa;
1474 int nsites = 0, nstates = 0;
1475 set<int>::iterator it;
1476 SeqType sub_type = SEQ_UNKNOWN;
1477 for (it = ids.begin(); it != ids.end(); it++) {
1478 int id = *it;
1479 ASSERT(id >= 0 && id < partitions.size());
1480 if (nstates == 0) nstates = partitions[id]->num_states;
1481 if (sub_type == SEQ_UNKNOWN) sub_type = partitions[id]->seq_type;
1482 if (sub_type != partitions[id]->seq_type)
1483 outError("Cannot concatenate sub-alignments of different type");
1484 if (nstates != partitions[id]->num_states)
1485 outError("Cannot concatenate sub-alignments of different #states");
1486
1487 string taxa_set;
1488 Pattern taxa_pat = getPattern(id);
1489 taxa_set.insert(taxa_set.begin(), taxa_pat.begin(), taxa_pat.end());
1490 nsites += partitions[id]->getNSite();
1491 if (it == ids.begin()) union_taxa = taxa_set; else {
1492 for (int j = 0; j < union_taxa.length(); j++)
1493 if (taxa_set[j] == 1) union_taxa[j] = 1;
1494 }
1495 }
1496
1497 Alignment *aln = new Alignment;
1498 for (int i = 0; i < union_taxa.length(); i++)
1499 if (union_taxa[i] == 1) {
1500 aln->seq_names.push_back(getSeqName(i));
1501 }
1502 aln->num_states = nstates;
1503 aln->seq_type = sub_type;
1504 aln->site_pattern.resize(nsites, -1);
1505 aln->clear();
1506 aln->pattern_index.clear();
1507 aln->STATE_UNKNOWN = partitions[*ids.begin()]->STATE_UNKNOWN;
1508 aln->genetic_code = partitions[*ids.begin()]->genetic_code;
1509 if (aln->seq_type == SEQ_CODON) {
1510 aln->codon_table = new char[aln->num_states];
1511 memcpy(aln->codon_table, partitions[*ids.begin()]->codon_table, aln->num_states);
1512 aln->non_stop_codon = new char[strlen(aln->genetic_code)];
1513 memcpy(aln->non_stop_codon, partitions[*ids.begin()]->non_stop_codon, strlen(aln->genetic_code));
1514 }
1515
1516 int site = 0;
1517 for (it = ids.begin(); it != ids.end(); it++) {
1518 int id = *it;
1519 // 2018-08-23: important bugfix in v1.6: taxa_set has wrong correspondance
1520 //string taxa_set;
1521 //Pattern taxa_pat = getPattern(id);
1522 //taxa_set.insert(taxa_set.begin(), taxa_pat.begin(), taxa_pat.end());
1523 for (Alignment::iterator it = partitions[id]->begin(); it != partitions[id]->end(); it++) {
1524 Pattern pat;
1525 //int part_seq = 0;
1526 for (int seq = 0; seq < union_taxa.size(); seq++)
1527 if (union_taxa[seq] == 1) {
1528 char ch = aln->STATE_UNKNOWN;
1529 int seq_part = taxa_index[seq][id];
1530 if (seq_part >= 0)
1531 ch = (*it)[seq_part];
1532 //if (taxa_set[seq] == 1) {
1533 // ch = (*it)[part_seq++];
1534 //}
1535 pat.push_back(ch);
1536 }
1537 //ASSERT(part_seq == partitions[id]->getNSeq());
1538 aln->addPattern(pat, site, (*it).frequency);
1539 // IMPORTANT BUG FIX FOLLOW
1540 int ptnindex = aln->pattern_index[pat];
1541 for (int j = 0; j < (*it).frequency; j++)
1542 aln->site_pattern[site++] = ptnindex;
1543
1544 }
1545 }
1546 aln->countConstSite();
1547 // aln->buildSeqStates();
1548
1549 return aln;
1550 }
1551
concatenateAlignments()1552 Alignment *SuperAlignment::concatenateAlignments() {
1553 vector<SeqType> seq_types;
1554 vector<char*> genetic_codes;
1555 vector<set<int> > ids;
1556 for (int i = 0; i < partitions.size(); i++) {
1557 bool found = false;
1558 for (int j = 0; j < seq_types.size(); j++)
1559 if (partitions[i]->seq_type == seq_types[j] && partitions[i]->genetic_code == genetic_codes[j]) {
1560 ids[j].insert(i);
1561 found = true;
1562 break;
1563 }
1564 if (found)
1565 continue;
1566 // create a new partition
1567 seq_types.push_back(partitions[i]->seq_type);
1568 genetic_codes.push_back(partitions[i]->genetic_code);
1569 ids.push_back(set<int>());
1570 ids.back().insert(i);
1571 }
1572 if (seq_types.size() == 1)
1573 return concatenateAlignments(ids[0]);
1574
1575 // mixed data with >= 2 partitions
1576 SuperAlignment *saln = new SuperAlignment();
1577 saln->max_num_states = 0;
1578 // first build taxa_index and partitions
1579 int site, seq, nsite = ids.size();
1580
1581 // BUG FIX 2016-11-29: when merging partitions with -m TESTMERGE, sequence order is changed
1582 // get the taxa names from existing tree
1583
1584 saln->seq_names = seq_names;
1585 saln->taxa_index.resize(saln->seq_names.size());
1586 for (auto it = saln->taxa_index.begin(); it != saln->taxa_index.end(); it++)
1587 it->resize(nsite, -1);
1588
1589 for (site = 0; site != nsite; site++) {
1590 Alignment *part_aln = concatenateAlignments(ids[site]);
1591 saln->partitions.push_back(part_aln);
1592 int nseq = part_aln->getNSeq();
1593 //cout << "nseq = " << nseq << endl;
1594 for (seq = 0; seq < nseq; seq++) {
1595 int id = saln->getSeqID(part_aln->getSeqName(seq));
1596 ASSERT(id >= 0);
1597 saln->taxa_index[id][site] = seq;
1598 }
1599 }
1600 // now the patterns of sequence-genes presence/absence
1601 saln->buildPattern();
1602 return saln;
1603 }
1604
countConstSite()1605 void SuperAlignment::countConstSite() {
1606 num_informative_sites = 0;
1607 num_variant_sites = 0;
1608 max_num_states = 0;
1609 frac_const_sites = 0;
1610 frac_invariant_sites = 0;
1611 num_parsimony_sites = 0;
1612 size_t nsites = 0;
1613 for (vector<Alignment*>::iterator it = partitions.begin(); it != partitions.end(); it++) {
1614 (*it)->countConstSite();
1615 num_informative_sites += (*it)->num_informative_sites;
1616 num_variant_sites += (*it)->num_variant_sites;
1617 if ((*it)->num_states > max_num_states)
1618 max_num_states = (*it)->num_states;
1619 nsites += (*it)->getNSite();
1620 frac_const_sites += (*it)->frac_const_sites * (*it)->getNSite();
1621 frac_invariant_sites += (*it)->frac_invariant_sites * (*it)->getNSite();
1622 }
1623 frac_const_sites /= nsites;
1624 frac_invariant_sites /= nsites;
1625 }
1626
orderPatternByNumChars(int pat_type)1627 void SuperAlignment::orderPatternByNumChars(int pat_type) {
1628 const int UINT_BITS = sizeof(UINT)*8;
1629 if (pat_type == PAT_INFORMATIVE)
1630 num_parsimony_sites = num_informative_sites;
1631 else
1632 num_parsimony_sites = num_variant_sites;
1633
1634 int maxi = (num_parsimony_sites+UINT_BITS-1)/UINT_BITS;
1635 pars_lower_bound = new UINT[maxi+1];
1636 memset(pars_lower_bound, 0, (maxi+1)*sizeof(UINT));
1637 int part, nseq = getNSeq();
1638
1639 // compute ordered_pattern
1640 ordered_pattern.clear();
1641 // UINT sum_scores[npart];
1642 for (part = 0; part != partitions.size(); part++) {
1643 partitions[part]->orderPatternByNumChars(pat_type);
1644 // partial_partition
1645 if (Params::getInstance().partition_type == TOPO_UNLINKED)
1646 continue;
1647 for (vector<Pattern>::iterator pit = partitions[part]->ordered_pattern.begin(); pit != partitions[part]->ordered_pattern.end(); pit++) {
1648 Pattern pattern(*pit);
1649 pattern.resize(nseq); // maximal unknown states
1650 for (int j = 0; j < nseq; j++)
1651 if (taxa_index[j][part] >= 0)
1652 pattern[j] = (*pit)[taxa_index[j][part]];
1653 else
1654 pattern[j] = partitions[part]->STATE_UNKNOWN;
1655 ordered_pattern.push_back(pattern);
1656 }
1657 // sum_scores[part] = partitions[part]->pars_lower_bound[0];
1658 }
1659 // TODO compute pars_lower_bound (lower bound of pars score for remaining patterns)
1660 }
1661