1 /*
2 * timetree.cpp
3 * Interface to call dating method incl. LSD2
4 * Created on: Apr 4, 2020
5 * Author: minh
6 */
7
8 #include "timetree.h"
9
10 #ifdef USE_LSD2
11 #include "lsd2/src/lsd.h"
12 #endif
13
14 /** map from taxon name to date */
15 typedef unordered_map<string, string> TaxonDateMap;
16 #define YEAR_SCALE 100000
17
18 /**
19 @param[in] date date string
20 @return converted date as a float or YYYY-MM[-DD] format
21 */
convertDate(string date)22 string convertDate(string date) {
23 // check for range in x:y format
24 if (date.find(':') != string::npos) {
25 StrVector vec;
26 convert_string_vec(date.c_str(), vec, ':');
27 if (vec.size() != 2)
28 outError("Invalid date range " + date);
29 if (vec[0].empty() || vec[0] == "NA")
30 return "u(" + vec[1] + ")";
31 if (vec[1].empty() || vec[1] == "NA")
32 return "l(" + vec[0] + ")";
33
34 return "b(" + vec[0] + "," + vec[1] + ")";
35 }
36 if (date.empty() || !isdigit(date[0]) || date[0] == '-')
37 return date;
38 DoubleVector vec;
39 try {
40 convert_double_vec(date.c_str(), vec, '-');
41 } catch (...) {
42 outError("Invalid date " + date);
43 }
44 // otherwise, return the original date string
45 return date;
46 }
47
48 /**
49 read a date file. Each line has two strings: name and date
50 */
readDateFile(string date_file,set<string> & node_names,TaxonDateMap & dates)51 void readDateFile(string date_file, set<string> &node_names, TaxonDateMap &dates) {
52 try {
53 cout << "Reading date file " << date_file << " ..." << endl;
54 ifstream in;
55 in.exceptions(ios::failbit | ios::badbit);
56 in.open(date_file);
57 int line_num;
58 for (line_num = 1; !in.eof(); line_num++) {
59 string line_out = "Line " + convertIntToString(line_num) + ": ";
60 string line;
61 if (!safeGetline(in, line))
62 break;
63 // ignore comment
64 if (line.find('#') != string::npos)
65 line = line.substr(0, line.find('#'));
66 trimString(line);
67 if (line.empty()) // ignore empty line
68 continue;
69 string name, date;
70 istringstream line_in(line);
71 if (!(line_in >> name >> date))
72 throw line_out + "'" + line + "' does not contain name and date";
73 // error checking, make sure that name appear in tree
74 StrVector name_vec;
75 convert_string_vec(name.c_str(), name_vec);
76 for (auto s : name_vec)
77 if (node_names.find(s) == node_names.end())
78 throw line_out + "'" + s + "' does not appear in tree";
79 // error checking, make sure is date is valid
80 if (date.empty())
81 throw line_out + "date is empty";
82 try {
83 int end_pos;
84 convert_double(date.c_str(), end_pos);
85 } catch (string str) {
86 throw line_out + str;
87 }
88 dates[name] = date;
89 }
90 in.clear();
91 // set the failbit again
92 in.exceptions(ios::failbit | ios::badbit);
93 in.close();
94 } catch (ios::failure) {
95 outError(ERR_READ_INPUT, date_file);
96 } catch (string str) {
97 outError(str);
98 } catch (...) {
99 outError("Error reading date file " + date_file);
100 }
101 }
102
103 /** read the date information from the alignment taxon names */
readDateTaxName(set<string> & nodenames,TaxonDateMap & dates)104 void readDateTaxName(set<string> &nodenames, TaxonDateMap &dates) {
105 cout << "Extracting date from node names..." << endl;
106 for (string name : nodenames) {
107 // get the date in the taxon name after the '|' sign
108 auto pos = name.rfind('|');
109 if (pos == string::npos)
110 continue;
111 string date = name.substr(pos+1);
112 try {
113 // try to parse
114 int end_pos;
115 convert_double(date.c_str(), end_pos);
116 // it works! so get the date
117 dates[name] = date;
118 } catch (...) {
119 // does not work, ignore the taxon name
120 continue;
121 }
122 }
123 }
124
writeOutgroup(ostream & out,const char * outgroup)125 void writeOutgroup(ostream &out, const char *outgroup) {
126 StrVector outgroup_names;
127 convert_string_vec(outgroup, outgroup_names);
128 try {
129 out << outgroup_names.size() << endl;
130 for (auto outgroup : outgroup_names) {
131 out << outgroup << endl;
132 }
133 } catch (...) {
134 ASSERT(0 && "Error writing outgroup stream");
135 }
136 }
137
writeDate(string date_file,ostream & out,set<string> & nodenames)138 void writeDate(string date_file, ostream &out, set<string> &nodenames) {
139 TaxonDateMap dates;
140 if (date_file == "TAXNAME") {
141 // read the dates from alignment taxon names
142 readDateTaxName(nodenames, dates);
143 } else {
144 readDateFile(date_file, nodenames, dates);
145 }
146 // only retain taxon appearing in alignment
147 TaxonDateMap retained_dates;
148 set<string> outgroup_set;
149 if (Params::getInstance().root) {
150 StrVector outgroup_names;
151 convert_string_vec(Params::getInstance().root, outgroup_names);
152 for (auto name : outgroup_names)
153 outgroup_set.insert(name);
154 }
155 if (verbose_mode >= VB_MED)
156 cout << "Node\tDate" << endl;
157 for (auto name: nodenames) {
158 string date = "NA";
159 if (dates.find(name) == dates.end()) {
160 // taxon present in the dates
161 // if (!Params::getInstance().date_tip.empty())
162 // date = Params::getInstance().date_tip;
163 } else if (outgroup_set.find(name) == outgroup_set.end() || Params::getInstance().date_with_outgroup) {
164 // ignore the date of the outgroup
165 date = dates[name];
166 }
167 if (date != "NA") {
168 retained_dates[name] = date;
169 dates.erase(name);
170 }
171 if (verbose_mode >= VB_MED)
172 cout << name << "\t" << date << endl;
173 }
174
175 // add remaining ancestral dates
176 for (auto date : dates) {
177 if (date.first.substr(0,4) == "mrca" || date.first.substr(0,8) == "ancestor")
178 retained_dates[date.first] = date.second;
179 else if (date.first.find(',') != string::npos) {
180 retained_dates["ancestor(" + date.first + ")"] = date.second;
181 } else if (outgroup_set.find(date.first) == outgroup_set.end() || Params::getInstance().date_with_outgroup) {
182 retained_dates[date.first] = date.second;
183 }
184 }
185
186 // if (!Params::getInstance().date_root.empty()) {
187 // retained_dates["root"] = Params::getInstance().date_root;
188 // }
189
190 cout << retained_dates.size() << " dates extracted" << endl;
191 try {
192 out << retained_dates.size() << endl;
193 for (auto date : retained_dates) {
194 out << date.first << " " << convertDate(date.second) << endl;
195 }
196 } catch (...) {
197 ASSERT(0 && "Error writing date stream");
198 }
199 }
200
201 #ifdef USE_LSD2
runLSD2(PhyloTree * tree)202 void runLSD2(PhyloTree *tree) {
203 string basename = (string)Params::getInstance().out_prefix + ".timetree";
204 string treefile = basename + ".subst";
205 stringstream tree_stream, outgroup_stream, date_stream;
206 tree->printTree(tree_stream);
207 StrVector arg = {"lsd", "-i", treefile, "-s", convertIntToString(tree->getAlnNSite()), "-o", basename};
208 if (Params::getInstance().date_debug) {
209 ofstream out(treefile);
210 out << tree_stream.str();
211 out.close();
212 cout << "Tree printed to " << treefile << endl;
213 }
214
215 if (Params::getInstance().date_replicates > 0) {
216 arg.push_back("-f");
217 arg.push_back(convertIntToString(Params::getInstance().date_replicates));
218 if (Params::getInstance().clock_stddev >= 0) {
219 arg.push_back("-q");
220 arg.push_back(convertDoubleToString(Params::getInstance().clock_stddev));
221 }
222 }
223
224 if (Params::getInstance().date_outlier >= 0) {
225 arg.push_back("-e");
226 arg.push_back(convertIntToString(Params::getInstance().date_outlier));
227 }
228
229 if (Params::getInstance().root) {
230 // print outgroup file for LSD
231 writeOutgroup(outgroup_stream, Params::getInstance().root);
232 string outgroup_file = basename + ".outgroup";
233 arg.push_back("-g");
234 arg.push_back(outgroup_file); // only fake file
235 if (!Params::getInstance().date_with_outgroup)
236 arg.push_back("-G");
237 if (Params::getInstance().date_debug) {
238 ofstream out(outgroup_file);
239 out << outgroup_stream.str();
240 out.close();
241 cout << "Outgroup printed to " << outgroup_file << endl;
242 }
243 } else {
244 // search for all possible rootings
245 arg.push_back("-r");
246 arg.push_back("a");
247 }
248
249 if (Params::getInstance().date_file != "") {
250 // parse the date file
251 set<string> nodenames;
252 tree->getNodeName(nodenames);
253 writeDate(Params::getInstance().date_file, date_stream, nodenames);
254 string date_file = basename + ".date";
255 arg.push_back("-d");
256 arg.push_back(date_file); // only fake file
257 if (Params::getInstance().date_debug) {
258 ofstream out(date_file);
259 out << date_stream.str();
260 out.close();
261 cout << "Date file printed to " << date_file << endl;
262 }
263 }
264 // input tip and root date
265 if (Params::getInstance().date_root != "") {
266 arg.push_back("-a");
267 arg.push_back(convertDate(Params::getInstance().date_root));
268 }
269
270 if (Params::getInstance().date_tip != "") {
271 arg.push_back("-z");
272 arg.push_back(convertDate(Params::getInstance().date_tip));
273 }
274
275 lsd::InputOutputStream io(tree_stream.str(), outgroup_stream.str(), date_stream.str(), "", "");
276
277 if (Params::getInstance().dating_options != "") {
278 // extra options for LSD
279 StrVector options;
280 convert_string_vec(Params::getInstance().dating_options.c_str(), options, ' ');
281 for (auto opt : options)
282 if (!opt.empty())
283 arg.push_back(opt);
284 }
285
286 cout << "Building time tree by least-square dating (LSD) with command:" << endl;
287
288 int argc = arg.size();
289 char *argv[argc];
290 for (int i = 0; i < argc; i++)
291 argv[i] = (char*)arg[i].c_str();
292 std::copy(arg.begin(), arg.end(), std::ostream_iterator<string>(std::cout, " "));
293 cout << endl;
294
295 // main call to LSD!
296 lsd::buildTimeTree(argc, argv, &io);
297
298 // fetch the output
299 string report_file = basename + ".lsd";
300 //string tree1_file = basename + ".raw";
301 string tree2_file = basename + ".nex";
302 string tree3_file = basename + ".nwk";
303 try {
304 ofstream out;
305 out.open(report_file);
306 out << ((ostringstream*)io.outResult)->str();
307 out.close();
308 // out.open(tree1_file);
309 // out << ((ostringstream*)io.outTree1)->str();
310 // out.close();
311 out.open(tree2_file);
312 out << ((ostringstream*)io.outTree2)->str();
313 out.close();
314 out.open(tree3_file);
315 out << ((stringstream*)io.outTree3)->str();
316 out.close();
317 } catch (...) {
318 outError("Couldn't write LSD output files");
319 }
320
321 if (((stringstream*)io.outTree3)->str().empty()) {
322 outError("Something went wrong, LSD could not date the tree");
323 } else {
324 cout << "LSD results written to:" << endl;
325 cout << " LSD report: " << report_file << endl;
326 // cout << " Time tree in nexus format: " << tree1_file << endl;
327 cout << " Time tree in nexus format: " << tree2_file << endl;
328 cout << " Time tree in newick format: " << tree3_file << endl;
329 cout << endl;
330 }
331 }
332 #endif
333
doTimeTree(PhyloTree * tree)334 void doTimeTree(PhyloTree *tree) {
335
336 cout << "--- Start phylogenetic dating ---" << endl;
337 cout.unsetf(ios::fixed);
338
339 #ifdef USE_LSD2
340 if (Params::getInstance().dating_method == "LSD") {
341 runLSD2(tree);
342 cout << "--- End phylogenetic dating ---" << endl;
343 return;
344 }
345 #endif
346 // This line shouldn't be reached
347 outError("Unsupported " + Params::getInstance().dating_method + " dating method");
348 }
349