1 //
2 // C++ Implementation: StateSpace
3 //
4 // Description:
5 //
6 //
7 // Author: BUI Quang Minh(C) 2018
8 //
9 // Copyright: See COPYING file that comes with this distribution
10 //
11 //
12 #include "statespace.h"
13 
14 namespace PML {
15 
16 const char* const ERR_NOT_A_LIST = "list '[...]' expected";
17 const char* const ERR_NOT_A_MAP = "'key: value' pairs expected";
18 const char* const ERR_UNDEFINED_STATE = "undefined state";
19 const char* const ERR_STRING_LIST = "string or list [...] expected";
20 const char* const ERR_TRANSLATE_LENGTH = "translate length different from #states";
21 
22 const char* const KEY_DATATYPE = "datatype";
23 const char* const KEY_STATE = "state";
24 const char* const KEY_MISSING = "missing";
25 const char* const KEY_GAP = "gap";
26 const char* const KEY_EQUATE = "equate";
27 const char* const KEY_TRANSLATE = "translate";
28 
29 const char* builtin_state_spaces = R"(
30 ### DNA data definition ###
31 - datatype: DNA
32   state: &Nucleotide [ A, C, G, T ]  # anchor to Nucleotide
33   missing: &NTmissing [ N, "?" ]
34   gap: &NTgap "-"
35   equate:
36     U: T      # T and U are the same
37     R: [A, G] # R is interpreted as A or G
38     Y: [C, T]
39     W: [A, T]
40     S: [G, C]
41     M: [A, C]
42     K: [G, T]
43     B: [C, G, T]
44     H: [A, C, T]
45     D: [A, G, T]
46     V: [A, G, C]
47 
48 ### Amino-acid data definition ###
49 - datatype: AA
50   state: [ A, R, N, D, C, Q, E, G, H, I, L, K, M, F, P, S, T, W, Y, V ]
51   missing: [ X, "?", "*" ]
52   gap: "-"
53   equate:
54     B: [ N, D ]
55     Z: [ Q, E ]
56     J: [ I, L ]
57 
58 ### Binary (0/1) data ###
59 - datatype: BIN
60   state: [ 0, 1 ]
61   missing: "?"
62   gap: "-"
63 
64 ### RY data definition ###
65 - datatype: RY
66   state: [ R, Y ] # R=AG, Y=CT
67   missing: [ N, "?", W, S, M, K, B, H, D, V ]
68   gap: "-"
69   equate:
70     A: R
71     C: Y
72     G: R
73     T: Y
74     U: Y
75 
76 ### Morphological data ###
77 - datatype: MORPH
78   state: [ 0..9, A..Z ]
79   missing: "?"
80   gap: "-"
81 
82 ### Codon data with standard genetic code ###
83 - datatype: CODON
84   state: [ *Nucleotide, *Nucleotide, *Nucleotide ]  # reference to Nucleotide
85   missing: [ *NTmissing, *NTmissing, *NTmissing ]
86   gap: [ *NTgap, *NTgap, *NTgap ]
87   translate: [ K, N, K, N, T, T, T, T, R, S, R, S, I, I, M, I,
88                Q, H, Q, H, P, P, P, P, R, R, R, R, L, L, L, L,
89                E, D, E, D, A, A, A, A, G, G, G, G, V, V, V, V,
90                X, Y, X, Y, S, S, S, S, X, C, W, C, L, F, L, F ]
91 
92 ### Codon data with Vertebrate Mitochondrial code ###
93 - datatype: CODON2
94   state: [ *Nucleotide, *Nucleotide, *Nucleotide ]  # reference to Nucleotide
95   missing: [ *NTmissing, *NTmissing, *NTmissing ]
96   gap: [ *NTgap, *NTgap, *NTgap ]
97   translate: KNKNTTTT*S*SMIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF
98 
99 ### Codon data with Vertebrate Mitochondrial code ###
100 - datatype: CODON2
101   state: [ *Nucleotide, *Nucleotide, *Nucleotide ]  # reference to Nucleotide
102   missing: [ *NTmissing, *NTmissing, *NTmissing ]
103   gap: [ *NTgap, *NTgap, *NTgap ]
104   translate: KNKNTTTT*S*SMIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF
105 
106 ### Codon data with Yeast Mitochondrial code ###
107 - datatype: CODON3
108   state: [ *Nucleotide, *Nucleotide, *Nucleotide ]  # reference to Nucleotide
109   missing: [ *NTmissing, *NTmissing, *NTmissing ]
110   gap: [ *NTgap, *NTgap, *NTgap ]
111   translate: KNKNTTTTRSRSMIMIQHQHPPPPRRRRTTTTEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF
112 
113 ### Codon data with Mold, Protozoan code ###
114 - datatype: CODON4
115   state: [ *Nucleotide, *Nucleotide, *Nucleotide ]  # reference to Nucleotide
116   missing: [ *NTmissing, *NTmissing, *NTmissing ]
117   gap: [ *NTgap, *NTgap, *NTgap ]
118   translate: KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF
119 
120 ### Codon data with Invertebrate Mitochondrial code ###
121 - datatype: CODON5
122   state: [ *Nucleotide, *Nucleotide, *Nucleotide ]  # reference to Nucleotide
123   missing: [ *NTmissing, *NTmissing, *NTmissing ]
124   gap: [ *NTgap, *NTgap, *NTgap ]
125   translate: KNKNTTTTSSSSMIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF
126 
127 ### Codon data with Ciliate, Dasycladacean and Hexamita Nuclear code ###
128 - datatype: CODON6
129   state: [ *Nucleotide, *Nucleotide, *Nucleotide ]  # reference to Nucleotide
130   missing: [ *NTmissing, *NTmissing, *NTmissing ]
131   gap: [ *NTgap, *NTgap, *NTgap ]
132   translate: KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVVQYQYSSSS*CWCLFLF
133 
134 ### Codon data with Echinoderm and Flatworm Mitochondrial code ###
135 - datatype: CODON9
136   state: [ *Nucleotide, *Nucleotide, *Nucleotide ]  # reference to Nucleotide
137   missing: [ *NTmissing, *NTmissing, *NTmissing ]
138   gap: [ *NTgap, *NTgap, *NTgap ]
139   translate: NNKNTTTTSSSSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF
140 
141 ### Codon data with Euplotid Nuclear code ###
142 - datatype: CODON10
143   state: [ *Nucleotide, *Nucleotide, *Nucleotide ]  # reference to Nucleotide
144   missing: [ *NTmissing, *NTmissing, *NTmissing ]
145   gap: [ *NTgap, *NTgap, *NTgap ]
146   translate: KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSCCWCLFLF
147 
148 ### Codon data with Bacterial, Archaeal and Plant Plastid code ###
149 - datatype: CODON11
150   state: [ *Nucleotide, *Nucleotide, *Nucleotide ]  # reference to Nucleotide
151   missing: [ *NTmissing, *NTmissing, *NTmissing ]
152   gap: [ *NTgap, *NTgap, *NTgap ]
153   translate: KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF
154 
155 ### Codon data with Alternative Yeast Nuclear code ###
156 - datatype: CODON12
157   state: [ *Nucleotide, *Nucleotide, *Nucleotide ]  # reference to Nucleotide
158   missing: [ *NTmissing, *NTmissing, *NTmissing ]
159   gap: [ *NTgap, *NTgap, *NTgap ]
160   translate: KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLSLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF
161 
162 ### Codon data with Ascidian Mitochondrial code ###
163 - datatype: CODON13
164   state: [ *Nucleotide, *Nucleotide, *Nucleotide ]  # reference to Nucleotide
165   missing: [ *NTmissing, *NTmissing, *NTmissing ]
166   gap: [ *NTgap, *NTgap, *NTgap ]
167   translate: KNKNTTTTGSGSMIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF
168 
169 ### Codon data with Alternative Flatworm Mitochondrial code ###
170 - datatype: CODON14
171   state: [ *Nucleotide, *Nucleotide, *Nucleotide ]  # reference to Nucleotide
172   missing: [ *NTmissing, *NTmissing, *NTmissing ]
173   gap: [ *NTgap, *NTgap, *NTgap ]
174   translate: NNKNTTTTSSSSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVVYY*YSSSSWCWCLFLF
175 
176 ### Codon data with Blepharisma Nuclear code ###
177 - datatype: CODON15
178   state: [ *Nucleotide, *Nucleotide, *Nucleotide ]  # reference to Nucleotide
179   missing: [ *NTmissing, *NTmissing, *NTmissing ]
180   gap: [ *NTgap, *NTgap, *NTgap ]
181   translate: KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*YQYSSSS*CWCLFLF
182 
183 ### Codon data with Chlorophycean Mitochondrial code ###
184 - datatype: CODON16
185   state: [ *Nucleotide, *Nucleotide, *Nucleotide ]  # reference to Nucleotide
186   missing: [ *NTmissing, *NTmissing, *NTmissing ]
187   gap: [ *NTgap, *NTgap, *NTgap ]
188   translate: KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*YLYSSSS*CWCLFLF
189 
190 ### Codon data with Trematode Mitochondrial code ###
191 - datatype: CODON21
192   state: [ *Nucleotide, *Nucleotide, *Nucleotide ]  # reference to Nucleotide
193   missing: [ *NTmissing, *NTmissing, *NTmissing ]
194   gap: [ *NTgap, *NTgap, *NTgap ]
195   translate: NNKNTTTTSSSSMIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF
196 
197 ### Codon data with Scenedesmus obliquus mitochondrial code ###
198 - datatype: CODON22
199   state: [ *Nucleotide, *Nucleotide, *Nucleotide ]  # reference to Nucleotide
200   missing: [ *NTmissing, *NTmissing, *NTmissing ]
201   gap: [ *NTgap, *NTgap, *NTgap ]
202   translate: KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*YLY*SSS*CWCLFLF
203 
204 ### Codon data with Thraustochytrium Mitochondrial code ###
205 - datatype: CODON23
206   state: [ *Nucleotide, *Nucleotide, *Nucleotide ]  # reference to Nucleotide
207   missing: [ *NTmissing, *NTmissing, *NTmissing ]
208   gap: [ *NTgap, *NTgap, *NTgap ]
209   translate: KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWC*FLF
210 
211 ### Codon data with Pterobranchia mitochondrial code ###
212 - datatype: CODON24
213   state: [ *Nucleotide, *Nucleotide, *Nucleotide ]  # reference to Nucleotide
214   missing: [ *NTmissing, *NTmissing, *NTmissing ]
215   gap: [ *NTgap, *NTgap, *NTgap ]
216   translate: KNKNTTTTSSKSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF
217 
218 ### Codon data with Candidate Division SR1 and Gracilibacteria code ###
219 - datatype: CODON25
220   state: [ *Nucleotide, *Nucleotide, *Nucleotide ]  # reference to Nucleotide
221   missing: [ *NTmissing, *NTmissing, *NTmissing ]
222   gap: [ *NTgap, *NTgap, *NTgap ]
223   translate: KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSGCWCLFLF
224 
225 
226 )";
227 
StateSpace()228 StateSpace::StateSpace() {
229     num_states = 0;
230     num_all_states = 0;
231 }
232 
~StateSpace()233 StateSpace::~StateSpace() {
234 
235 }
236 
isUnknown(StateType state)237 bool StateSpace::isUnknown(StateType state) {
238     return (state == num_states);
239 }
240 
toState(string str)241 StateType StateSpace::toState(string str) {
242     StringStateMap::iterator it;
243     it = states.find(str);
244     if (it == states.end())
245         throw str + " is not a valid state symbol";
246     return it->second;
247 }
248 
toState(string & str,StateVector & str_states)249 void StateSpace::toState(string &str, StateVector &str_states) {
250     size_t pos;
251     for (pos = 0; pos < str.length();) {
252         bool found = false;
253         for (int len = min_state_len; len <= max_state_len; len++) {
254             auto it = states.find(str.substr(pos, len));
255             if (it == states.end())
256                 continue;
257             found = true;
258             str_states.push_back(it->second);
259             pos += len;
260             break;
261         }
262         if (!found)
263             throw str.substr(pos, max_state_len) + " is not a valid state symbol";
264     }
265 }
266 
toString(StateType state)267 string StateSpace::toString(StateType state) {
268     auto it = raw_states.find(state);
269     ASSERT(it != raw_states.end());
270     return it->second;
271 }
272 
273 /**
274  parse a string with range (e.g. 1..5) to a vector of string
275  */
parseRange(string str,StrVector & list)276 void parseRange(string str, StrVector &list) {
277     size_t pos;
278     if ((pos = str.find("..")) == string::npos) {
279         list.push_back(str);
280         return;
281     }
282     string first = str.substr(0, pos);
283     string last = str.substr(pos+2);
284     trimString(first);
285     trimString(last);
286     if (first.length() == 1 && last.length() == 1 && first[0] < last[0]) {
287         for (char ch = first[0]; ch <= last[0]; ch++)
288             list.push_back(string(1,ch));
289     } else {
290         list.push_back(str);
291     }
292 }
293 
294 /**
295  parse a list into a vector of string
296  */
parseList(YAML::const_iterator first,YAML::const_iterator last,StrVector & list)297 void parseList(YAML::const_iterator first, YAML::const_iterator last, StrVector &list) {
298     StrVector this_list;
299     if (first->IsScalar())
300         parseRange(first->Scalar(), this_list);
301     else if (first->IsSequence()) {
302         for (auto it = first->begin(); it != first->end(); it++) {
303             parseRange(it->Scalar(), this_list);
304         }
305     } else {
306         throw YAML::Exception(first->Mark(), ERR_STRING_LIST);
307     }
308     StrVector last_list;
309     first++;
310     if (first != last)
311         parseList(first, last, last_list);
312     else
313         last_list = { "" };
314     for (auto sit = this_list.begin(); sit != this_list.end(); sit++)
315         for (auto sit2 = last_list.begin(); sit2 != last_list.end(); sit2++ )
316             list.push_back(*sit + *sit2);
317 }
318 
319 /**
320  parse a YAML::Node into a list of strings
321  @param extend_length TRUE to make vector of characters if list has length 1
322  */
parseStringList(YAML::Node node,StrVector & list,bool extend_length=false)323 void parseStringList(YAML::Node node, StrVector &list, bool extend_length = false) {
324     if (node.IsScalar()) {
325         // scalar assumed to be string
326         parseRange(node.Scalar(), list);
327     } else if (node.IsSequence()) {
328         YAML::const_iterator it;
329         // check if a sequence of scalars
330         bool all_scalars = true;
331         for (it = node.begin(); it != node.end(); it++)
332             if (!it->IsScalar()) {
333                 all_scalars = false;
334                 break;
335             }
336 
337         if (all_scalars) {
338             for (it = node.begin(); it != node.end(); it++)
339                 parseRange(it->Scalar(), list);
340         } else {
341             // now it can be a sequence of sequences, merge them together
342             parseList(node.begin(), node.end(), list);
343         }
344     } else {
345         throw YAML::Exception(node.Mark(), ERR_STRING_LIST);
346     }
347 
348     if (list.size() == 1 && extend_length) {
349         // single list, convert to vector of characters
350         for (auto i = list[0].begin()+1; i != list[0].end(); i++)
351             list.push_back(string(1,*i));
352         list[0] = list[0].substr(0,1);
353     }
354 }
355 
resetStateSpace()356 void StateSpace::resetStateSpace() {
357     space_name = "";
358     num_states = 0;
359     num_all_states = 0;
360     states.clear();
361     raw_states.clear();
362     equate.clear();
363     translate.clear();
364     min_state_len = max_state_len = 0;
365 }
366 
parseStateSpace(YAML::Node datatype)367 void StateSpace::parseStateSpace(YAML::Node datatype) {
368     if (!datatype.IsMap())
369         throw YAML::Exception(datatype.Mark(), ERR_NOT_A_MAP);
370     if (!datatype[KEY_DATATYPE])
371         throw YAML::Exception(datatype.Mark(), "'datatype: XXX' declaration not found");
372     resetStateSpace();
373     space_name = datatype[KEY_DATATYPE].Scalar();
374     // definition found
375     // parse state: symbols
376     if (!datatype[KEY_STATE])
377         throw YAML::Exception(datatype.Mark(), "datatype does not have 'state: [...]'");
378     StrVector allstates;
379     parseStringList(datatype[KEY_STATE], allstates);
380     if (allstates.size() < 2)
381         throw YAML::Exception(datatype[KEY_STATE].Mark(), "state space must have at least 2 states");
382     StateType stateID = 0;
383     for (auto sit = allstates.begin(); sit != allstates.end(); sit++, stateID++) {
384         states[*sit] = stateID;
385         raw_states[stateID] = *sit;
386     }
387     num_states = stateID;
388 
389     if (verbose_mode >= VB_MED)
390         cout << states.size() << " " << KEY_STATE << endl;
391 
392     // parse missing: symbols
393     if (datatype[KEY_MISSING]) {
394         StrVector list;
395         parseStringList(datatype[KEY_MISSING], list);
396         for (auto i = list.begin(); i != list.end(); i++) {
397             states[*i] = stateID;
398             raw_states[stateID] = *i;
399         }
400     }
401 
402     // parse gap: symbols
403     if (datatype[KEY_GAP]) {
404         StrVector list;
405         parseStringList(datatype[KEY_GAP], list);
406         for (auto i = list.begin(); i != list.end(); i++) {
407             states[*i] = stateID;
408             raw_states[stateID] = *i;
409         }
410     }
411 
412     stateID++;
413 
414     // parse equate: symbols
415     YAML::Node node_equate;
416     if ((node_equate = datatype[KEY_EQUATE])) {
417         if (!node_equate.IsMap())
418             throw YAML::Exception(node_equate.Mark(), ERR_NOT_A_MAP);
419         for (auto nit = node_equate.begin(); nit != node_equate.end(); nit++) {
420             string key = nit->first.Scalar();
421             states[key] = stateID;
422             auto value = nit->second;
423             StrVector values;
424             parseStringList(value, values);
425             for (auto i = values.begin(); i != values.end(); i++) {
426                 if (states.find(*i) == states.end())
427                     throw YAML::Exception(value.Mark(), ERR_UNDEFINED_STATE);
428                 if (equate.find(stateID) == equate.end())
429                     equate[stateID] = { states[*i] };
430                 else
431                     equate[stateID].push_back(states[*i]);
432             }
433             if (equate[stateID].size() == 1) {
434                 // map to just one state, so it's not an ambiguous state
435                 states[key] = equate[stateID][0];
436                 equate.erase(stateID);
437             } else {
438                 // increase number of states
439                 raw_states[stateID] = key;
440                 stateID++;
441             }
442         } // for Node
443         if (verbose_mode >= VB_MED)
444             cout << equate.size() << " ambiguous states" << endl;
445     } // equate
446 
447     // parse translate
448     if (datatype[KEY_TRANSLATE]) {
449         parseStringList(datatype[KEY_TRANSLATE], translate, true);
450         if (translate.size() != num_states)
451             throw YAML::Exception(datatype[KEY_TRANSLATE].Mark(), ERR_TRANSLATE_LENGTH);
452     }
453 
454     num_all_states = stateID;
455     min_state_len = max_state_len = states.begin()->first.length();
456     for (auto i = states.begin(); i != states.end(); i++) {
457         if (min_state_len > i->first.length())
458             min_state_len = i->first.length();
459         if (max_state_len < i->first.length())
460             max_state_len = i->first.length();
461     }
462 }
463 
initStateSpace(SeqType seqtype)464 void StateSpace::initStateSpace(SeqType seqtype) {
465 
466     string name;
467     switch (seqtype) {
468     case SEQ_DNA: name = "DNA"; break;
469     case SEQ_CODON: name = "CODON"; break;
470     case SEQ_MORPH: name = "MORPH"; break;
471     case SEQ_BINARY: name = "BIN"; break;
472     case SEQ_PROTEIN: name = "AA"; break;
473     case SEQ_MULTISTATE: name = "MULTI"; break;
474     case SEQ_POMO: outError("Unhandled POMO state space"); break;
475     case SEQ_UNKNOWN: ASSERT(0);
476     }
477 
478     try {
479         YAML::Node spaceDef = YAML::Load(builtin_state_spaces);
480         if (!spaceDef.IsSequence())
481             throw YAML::Exception(spaceDef.Mark(), ERR_NOT_A_LIST);
482         for (auto it = spaceDef.begin(); it != spaceDef.end(); it++)
483         {
484             auto datatype = *it;
485             if (!(datatype[KEY_DATATYPE]))
486                 continue;
487             if (datatype[KEY_DATATYPE].Scalar() == name) {
488                 parseStateSpace(datatype);
489                 break;
490             }
491         }
492     } catch (YAML::Exception &e) {
493         outError(e.what());
494     }
495 }
496 
497 } // namespace PML
498