1 /*
2  * timetree.cpp
3  * Interface to call dating method incl. LSD2
4  *  Created on: Apr 4, 2020
5  *      Author: minh
6  */
7 
8 #include "timetree.h"
9 
10 #ifdef USE_LSD2
11 #include "lsd2/src/lsd.h"
12 #endif
13 
14 /** map from taxon name to date */
15 typedef unordered_map<string, string> TaxonDateMap;
16 #define YEAR_SCALE 100000
17 
18 /**
19  @param[in] date date string
20  @return converted date as a float or YYYY-MM[-DD] format
21  */
convertDate(string date)22 string convertDate(string date) {
23     // check for range in x:y format
24     if (date.find(':') != string::npos) {
25         StrVector vec;
26         convert_string_vec(date.c_str(), vec, ':');
27         if (vec.size() != 2)
28             outError("Invalid date range " + date);
29         if (vec[0].empty() || vec[0] == "NA")
30             return "u(" + vec[1] + ")";
31         if (vec[1].empty() || vec[1] == "NA")
32             return "l(" + vec[0] + ")";
33 
34         return "b(" + vec[0] + "," + vec[1] + ")";
35     }
36     if (date.empty() || !isdigit(date[0]) || date[0] == '-')
37         return date;
38     DoubleVector vec;
39     try {
40         convert_double_vec(date.c_str(), vec, '-');
41     } catch (...) {
42         outError("Invalid date " + date);
43     }
44     // otherwise, return the original date string
45     return date;
46 }
47 
48 /**
49  read a date file. Each line has two strings: name and date
50  */
readDateFile(string date_file,set<string> & node_names,TaxonDateMap & dates)51 void readDateFile(string date_file, set<string> &node_names, TaxonDateMap &dates) {
52     try {
53         cout << "Reading date file " << date_file << " ..." << endl;
54         ifstream in;
55         in.exceptions(ios::failbit | ios::badbit);
56         in.open(date_file);
57         int line_num;
58         for (line_num = 1; !in.eof(); line_num++) {
59             string line_out = "Line " + convertIntToString(line_num) + ": ";
60             string line;
61             if (!safeGetline(in, line))
62                 break;
63             // ignore comment
64             if (line.find('#') != string::npos)
65                 line = line.substr(0, line.find('#'));
66             trimString(line);
67             if (line.empty()) // ignore empty line
68                 continue;
69             string name, date;
70             istringstream line_in(line);
71             if (!(line_in >> name >> date))
72                 throw line_out + "'" + line + "' does not contain name and date";
73             // error checking, make sure that name appear in tree
74             StrVector name_vec;
75             convert_string_vec(name.c_str(), name_vec);
76             for (auto s : name_vec)
77                 if (node_names.find(s) == node_names.end())
78                     throw line_out + "'" + s + "' does not appear in tree";
79             // error checking, make sure is date is valid
80             if (date.empty())
81                 throw line_out + "date is empty";
82             try {
83                 int end_pos;
84                 convert_double(date.c_str(), end_pos);
85             } catch (string str) {
86                 throw line_out + str;
87             }
88             dates[name] = date;
89         }
90         in.clear();
91         // set the failbit again
92         in.exceptions(ios::failbit | ios::badbit);
93         in.close();
94     } catch (ios::failure) {
95         outError(ERR_READ_INPUT, date_file);
96     } catch (string str) {
97         outError(str);
98     } catch (...) {
99         outError("Error reading date file " + date_file);
100     }
101 }
102 
103 /** read the date information from the alignment taxon names */
readDateTaxName(set<string> & nodenames,TaxonDateMap & dates)104 void readDateTaxName(set<string> &nodenames, TaxonDateMap &dates) {
105     cout << "Extracting date from node names..." << endl;
106     for (string name : nodenames) {
107         // get the date in the taxon name after the '|' sign
108         auto pos = name.rfind('|');
109         if (pos == string::npos)
110             continue;
111         string date = name.substr(pos+1);
112         try {
113             // try to parse
114             int end_pos;
115             convert_double(date.c_str(), end_pos);
116             // it works! so get the date
117             dates[name] = date;
118         } catch (...) {
119             // does not work, ignore the taxon name
120             continue;
121         }
122     }
123 }
124 
writeOutgroup(ostream & out,const char * outgroup)125 void writeOutgroup(ostream &out, const char *outgroup) {
126     StrVector outgroup_names;
127     convert_string_vec(outgroup, outgroup_names);
128     try {
129         out << outgroup_names.size() << endl;
130         for (auto outgroup : outgroup_names) {
131             out << outgroup << endl;
132         }
133     } catch (...) {
134         ASSERT(0 && "Error writing outgroup stream");
135     }
136 }
137 
writeDate(string date_file,ostream & out,set<string> & nodenames)138 void writeDate(string date_file, ostream &out, set<string> &nodenames) {
139     TaxonDateMap dates;
140     if (date_file == "TAXNAME") {
141         // read the dates from alignment taxon names
142         readDateTaxName(nodenames, dates);
143     } else {
144         readDateFile(date_file, nodenames, dates);
145     }
146     // only retain taxon appearing in alignment
147     TaxonDateMap retained_dates;
148     set<string> outgroup_set;
149     if (Params::getInstance().root) {
150         StrVector outgroup_names;
151         convert_string_vec(Params::getInstance().root, outgroup_names);
152         for (auto name : outgroup_names)
153             outgroup_set.insert(name);
154     }
155     if (verbose_mode >= VB_MED)
156         cout << "Node\tDate" << endl;
157     for (auto name: nodenames) {
158         string date = "NA";
159         if (dates.find(name) == dates.end()) {
160             // taxon present in the dates
161 //            if (!Params::getInstance().date_tip.empty())
162 //                date = Params::getInstance().date_tip;
163         } else if (outgroup_set.find(name) == outgroup_set.end() || Params::getInstance().date_with_outgroup) {
164             // ignore the date of the outgroup
165             date = dates[name];
166         }
167         if (date != "NA") {
168             retained_dates[name] = date;
169             dates.erase(name);
170         }
171         if (verbose_mode >= VB_MED)
172             cout << name << "\t" << date << endl;
173     }
174 
175     // add remaining ancestral dates
176     for (auto date : dates) {
177         if (date.first.substr(0,4) == "mrca" || date.first.substr(0,8) == "ancestor")
178             retained_dates[date.first] = date.second;
179         else if (date.first.find(',') != string::npos) {
180             retained_dates["ancestor(" + date.first + ")"] = date.second;
181         } else if (outgroup_set.find(date.first) == outgroup_set.end() || Params::getInstance().date_with_outgroup) {
182             retained_dates[date.first] = date.second;
183         }
184     }
185 
186 //    if (!Params::getInstance().date_root.empty()) {
187 //        retained_dates["root"] = Params::getInstance().date_root;
188 //    }
189 
190     cout << retained_dates.size() << " dates extracted" << endl;
191     try {
192         out << retained_dates.size() << endl;
193         for (auto date : retained_dates) {
194             out << date.first << " " << convertDate(date.second) << endl;
195         }
196     } catch (...) {
197         ASSERT(0 && "Error writing date stream");
198     }
199 }
200 
201 #ifdef USE_LSD2
runLSD2(PhyloTree * tree)202 void runLSD2(PhyloTree *tree) {
203     string basename = (string)Params::getInstance().out_prefix + ".timetree";
204     string treefile = basename + ".subst";
205     stringstream tree_stream, outgroup_stream, date_stream;
206     tree->printTree(tree_stream);
207     StrVector arg = {"lsd", "-i", treefile, "-s", convertIntToString(tree->getAlnNSite()), "-o", basename};
208     if (Params::getInstance().date_debug) {
209         ofstream out(treefile);
210         out << tree_stream.str();
211         out.close();
212         cout << "Tree printed to " << treefile << endl;
213     }
214 
215     if (Params::getInstance().date_replicates > 0) {
216         arg.push_back("-f");
217         arg.push_back(convertIntToString(Params::getInstance().date_replicates));
218         if (Params::getInstance().clock_stddev >= 0) {
219             arg.push_back("-q");
220             arg.push_back(convertDoubleToString(Params::getInstance().clock_stddev));
221         }
222     }
223 
224     if (Params::getInstance().date_outlier >= 0) {
225         arg.push_back("-e");
226         arg.push_back(convertIntToString(Params::getInstance().date_outlier));
227     }
228 
229     if (Params::getInstance().root) {
230         // print outgroup file for LSD
231         writeOutgroup(outgroup_stream, Params::getInstance().root);
232         string outgroup_file = basename + ".outgroup";
233         arg.push_back("-g");
234         arg.push_back(outgroup_file); // only fake file
235         if (!Params::getInstance().date_with_outgroup)
236             arg.push_back("-G");
237         if (Params::getInstance().date_debug) {
238             ofstream out(outgroup_file);
239             out << outgroup_stream.str();
240             out.close();
241             cout << "Outgroup printed to " << outgroup_file << endl;
242         }
243     } else {
244         // search for all possible rootings
245         arg.push_back("-r");
246         arg.push_back("a");
247     }
248 
249     if (Params::getInstance().date_file != "") {
250         // parse the date file
251         set<string> nodenames;
252         tree->getNodeName(nodenames);
253         writeDate(Params::getInstance().date_file, date_stream, nodenames);
254         string date_file = basename + ".date";
255         arg.push_back("-d");
256         arg.push_back(date_file);  // only fake file
257         if (Params::getInstance().date_debug) {
258             ofstream out(date_file);
259             out << date_stream.str();
260             out.close();
261             cout << "Date file printed to " << date_file << endl;
262         }
263     }
264     // input tip and root date
265     if (Params::getInstance().date_root != "") {
266         arg.push_back("-a");
267         arg.push_back(convertDate(Params::getInstance().date_root));
268     }
269 
270     if (Params::getInstance().date_tip != "") {
271         arg.push_back("-z");
272         arg.push_back(convertDate(Params::getInstance().date_tip));
273     }
274 
275     lsd::InputOutputStream io(tree_stream.str(), outgroup_stream.str(), date_stream.str(), "", "");
276 
277     if (Params::getInstance().dating_options != "") {
278         // extra options for LSD
279         StrVector options;
280         convert_string_vec(Params::getInstance().dating_options.c_str(), options, ' ');
281         for (auto opt : options)
282             if (!opt.empty())
283                 arg.push_back(opt);
284     }
285 
286     cout << "Building time tree by least-square dating (LSD) with command:" << endl;
287 
288     int argc = arg.size();
289     char *argv[argc];
290     for (int i = 0; i < argc; i++)
291         argv[i] = (char*)arg[i].c_str();
292     std::copy(arg.begin(), arg.end(), std::ostream_iterator<string>(std::cout, " "));
293     cout << endl;
294 
295     // main call to LSD!
296     lsd::buildTimeTree(argc, argv, &io);
297 
298     // fetch the output
299     string report_file = basename + ".lsd";
300     //string tree1_file = basename + ".raw";
301     string tree2_file = basename + ".nex";
302     string tree3_file = basename + ".nwk";
303     try {
304         ofstream out;
305         out.open(report_file);
306         out << ((ostringstream*)io.outResult)->str();
307         out.close();
308 //        out.open(tree1_file);
309 //        out << ((ostringstream*)io.outTree1)->str();
310 //        out.close();
311         out.open(tree2_file);
312         out << ((ostringstream*)io.outTree2)->str();
313         out.close();
314         out.open(tree3_file);
315         out << ((stringstream*)io.outTree3)->str();
316         out.close();
317     } catch (...) {
318         outError("Couldn't write LSD output files");
319     }
320 
321     if (((stringstream*)io.outTree3)->str().empty()) {
322         outError("Something went wrong, LSD could not date the tree");
323     } else {
324         cout << "LSD results written to:" << endl;
325         cout << "  LSD report:                  " << report_file << endl;
326     //    cout << "  Time tree in nexus format:   " << tree1_file << endl;
327         cout << "  Time tree in nexus format:   " << tree2_file << endl;
328         cout << "  Time tree in newick format:  " << tree3_file << endl;
329         cout << endl;
330     }
331 }
332 #endif
333 
doTimeTree(PhyloTree * tree)334 void doTimeTree(PhyloTree *tree) {
335 
336     cout << "--- Start phylogenetic dating ---" << endl;
337     cout.unsetf(ios::fixed);
338 
339 #ifdef USE_LSD2
340     if (Params::getInstance().dating_method == "LSD") {
341         runLSD2(tree);
342         cout << "--- End phylogenetic dating ---" << endl;
343         return;
344     }
345 #endif
346     // This line shouldn't be reached
347     outError("Unsupported " + Params::getInstance().dating_method + " dating method");
348 }
349