1 //
2 // C++ Implementation: StateSpace
3 //
4 // Description:
5 //
6 //
7 // Author: BUI Quang Minh(C) 2018
8 //
9 // Copyright: See COPYING file that comes with this distribution
10 //
11 //
12 #include "statespace.h"
13
14 namespace PML {
15
16 const char* const ERR_NOT_A_LIST = "list '[...]' expected";
17 const char* const ERR_NOT_A_MAP = "'key: value' pairs expected";
18 const char* const ERR_UNDEFINED_STATE = "undefined state";
19 const char* const ERR_STRING_LIST = "string or list [...] expected";
20 const char* const ERR_TRANSLATE_LENGTH = "translate length different from #states";
21
22 const char* const KEY_DATATYPE = "datatype";
23 const char* const KEY_STATE = "state";
24 const char* const KEY_MISSING = "missing";
25 const char* const KEY_GAP = "gap";
26 const char* const KEY_EQUATE = "equate";
27 const char* const KEY_TRANSLATE = "translate";
28
29 const char* builtin_state_spaces = R"(
30 ### DNA data definition ###
31 - datatype: DNA
32 state: &Nucleotide [ A, C, G, T ] # anchor to Nucleotide
33 missing: &NTmissing [ N, "?" ]
34 gap: &NTgap "-"
35 equate:
36 U: T # T and U are the same
37 R: [A, G] # R is interpreted as A or G
38 Y: [C, T]
39 W: [A, T]
40 S: [G, C]
41 M: [A, C]
42 K: [G, T]
43 B: [C, G, T]
44 H: [A, C, T]
45 D: [A, G, T]
46 V: [A, G, C]
47
48 ### Amino-acid data definition ###
49 - datatype: AA
50 state: [ A, R, N, D, C, Q, E, G, H, I, L, K, M, F, P, S, T, W, Y, V ]
51 missing: [ X, "?", "*" ]
52 gap: "-"
53 equate:
54 B: [ N, D ]
55 Z: [ Q, E ]
56 J: [ I, L ]
57
58 ### Binary (0/1) data ###
59 - datatype: BIN
60 state: [ 0, 1 ]
61 missing: "?"
62 gap: "-"
63
64 ### RY data definition ###
65 - datatype: RY
66 state: [ R, Y ] # R=AG, Y=CT
67 missing: [ N, "?", W, S, M, K, B, H, D, V ]
68 gap: "-"
69 equate:
70 A: R
71 C: Y
72 G: R
73 T: Y
74 U: Y
75
76 ### Morphological data ###
77 - datatype: MORPH
78 state: [ 0..9, A..Z ]
79 missing: "?"
80 gap: "-"
81
82 ### Codon data with standard genetic code ###
83 - datatype: CODON
84 state: [ *Nucleotide, *Nucleotide, *Nucleotide ] # reference to Nucleotide
85 missing: [ *NTmissing, *NTmissing, *NTmissing ]
86 gap: [ *NTgap, *NTgap, *NTgap ]
87 translate: [ K, N, K, N, T, T, T, T, R, S, R, S, I, I, M, I,
88 Q, H, Q, H, P, P, P, P, R, R, R, R, L, L, L, L,
89 E, D, E, D, A, A, A, A, G, G, G, G, V, V, V, V,
90 X, Y, X, Y, S, S, S, S, X, C, W, C, L, F, L, F ]
91
92 ### Codon data with Vertebrate Mitochondrial code ###
93 - datatype: CODON2
94 state: [ *Nucleotide, *Nucleotide, *Nucleotide ] # reference to Nucleotide
95 missing: [ *NTmissing, *NTmissing, *NTmissing ]
96 gap: [ *NTgap, *NTgap, *NTgap ]
97 translate: KNKNTTTT*S*SMIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF
98
99 ### Codon data with Vertebrate Mitochondrial code ###
100 - datatype: CODON2
101 state: [ *Nucleotide, *Nucleotide, *Nucleotide ] # reference to Nucleotide
102 missing: [ *NTmissing, *NTmissing, *NTmissing ]
103 gap: [ *NTgap, *NTgap, *NTgap ]
104 translate: KNKNTTTT*S*SMIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF
105
106 ### Codon data with Yeast Mitochondrial code ###
107 - datatype: CODON3
108 state: [ *Nucleotide, *Nucleotide, *Nucleotide ] # reference to Nucleotide
109 missing: [ *NTmissing, *NTmissing, *NTmissing ]
110 gap: [ *NTgap, *NTgap, *NTgap ]
111 translate: KNKNTTTTRSRSMIMIQHQHPPPPRRRRTTTTEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF
112
113 ### Codon data with Mold, Protozoan code ###
114 - datatype: CODON4
115 state: [ *Nucleotide, *Nucleotide, *Nucleotide ] # reference to Nucleotide
116 missing: [ *NTmissing, *NTmissing, *NTmissing ]
117 gap: [ *NTgap, *NTgap, *NTgap ]
118 translate: KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF
119
120 ### Codon data with Invertebrate Mitochondrial code ###
121 - datatype: CODON5
122 state: [ *Nucleotide, *Nucleotide, *Nucleotide ] # reference to Nucleotide
123 missing: [ *NTmissing, *NTmissing, *NTmissing ]
124 gap: [ *NTgap, *NTgap, *NTgap ]
125 translate: KNKNTTTTSSSSMIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF
126
127 ### Codon data with Ciliate, Dasycladacean and Hexamita Nuclear code ###
128 - datatype: CODON6
129 state: [ *Nucleotide, *Nucleotide, *Nucleotide ] # reference to Nucleotide
130 missing: [ *NTmissing, *NTmissing, *NTmissing ]
131 gap: [ *NTgap, *NTgap, *NTgap ]
132 translate: KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVVQYQYSSSS*CWCLFLF
133
134 ### Codon data with Echinoderm and Flatworm Mitochondrial code ###
135 - datatype: CODON9
136 state: [ *Nucleotide, *Nucleotide, *Nucleotide ] # reference to Nucleotide
137 missing: [ *NTmissing, *NTmissing, *NTmissing ]
138 gap: [ *NTgap, *NTgap, *NTgap ]
139 translate: NNKNTTTTSSSSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF
140
141 ### Codon data with Euplotid Nuclear code ###
142 - datatype: CODON10
143 state: [ *Nucleotide, *Nucleotide, *Nucleotide ] # reference to Nucleotide
144 missing: [ *NTmissing, *NTmissing, *NTmissing ]
145 gap: [ *NTgap, *NTgap, *NTgap ]
146 translate: KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSCCWCLFLF
147
148 ### Codon data with Bacterial, Archaeal and Plant Plastid code ###
149 - datatype: CODON11
150 state: [ *Nucleotide, *Nucleotide, *Nucleotide ] # reference to Nucleotide
151 missing: [ *NTmissing, *NTmissing, *NTmissing ]
152 gap: [ *NTgap, *NTgap, *NTgap ]
153 translate: KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF
154
155 ### Codon data with Alternative Yeast Nuclear code ###
156 - datatype: CODON12
157 state: [ *Nucleotide, *Nucleotide, *Nucleotide ] # reference to Nucleotide
158 missing: [ *NTmissing, *NTmissing, *NTmissing ]
159 gap: [ *NTgap, *NTgap, *NTgap ]
160 translate: KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLSLEDEDAAAAGGGGVVVV*Y*YSSSS*CWCLFLF
161
162 ### Codon data with Ascidian Mitochondrial code ###
163 - datatype: CODON13
164 state: [ *Nucleotide, *Nucleotide, *Nucleotide ] # reference to Nucleotide
165 missing: [ *NTmissing, *NTmissing, *NTmissing ]
166 gap: [ *NTgap, *NTgap, *NTgap ]
167 translate: KNKNTTTTGSGSMIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF
168
169 ### Codon data with Alternative Flatworm Mitochondrial code ###
170 - datatype: CODON14
171 state: [ *Nucleotide, *Nucleotide, *Nucleotide ] # reference to Nucleotide
172 missing: [ *NTmissing, *NTmissing, *NTmissing ]
173 gap: [ *NTgap, *NTgap, *NTgap ]
174 translate: NNKNTTTTSSSSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVVYY*YSSSSWCWCLFLF
175
176 ### Codon data with Blepharisma Nuclear code ###
177 - datatype: CODON15
178 state: [ *Nucleotide, *Nucleotide, *Nucleotide ] # reference to Nucleotide
179 missing: [ *NTmissing, *NTmissing, *NTmissing ]
180 gap: [ *NTgap, *NTgap, *NTgap ]
181 translate: KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*YQYSSSS*CWCLFLF
182
183 ### Codon data with Chlorophycean Mitochondrial code ###
184 - datatype: CODON16
185 state: [ *Nucleotide, *Nucleotide, *Nucleotide ] # reference to Nucleotide
186 missing: [ *NTmissing, *NTmissing, *NTmissing ]
187 gap: [ *NTgap, *NTgap, *NTgap ]
188 translate: KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*YLYSSSS*CWCLFLF
189
190 ### Codon data with Trematode Mitochondrial code ###
191 - datatype: CODON21
192 state: [ *Nucleotide, *Nucleotide, *Nucleotide ] # reference to Nucleotide
193 missing: [ *NTmissing, *NTmissing, *NTmissing ]
194 gap: [ *NTgap, *NTgap, *NTgap ]
195 translate: NNKNTTTTSSSSMIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF
196
197 ### Codon data with Scenedesmus obliquus mitochondrial code ###
198 - datatype: CODON22
199 state: [ *Nucleotide, *Nucleotide, *Nucleotide ] # reference to Nucleotide
200 missing: [ *NTmissing, *NTmissing, *NTmissing ]
201 gap: [ *NTgap, *NTgap, *NTgap ]
202 translate: KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*YLY*SSS*CWCLFLF
203
204 ### Codon data with Thraustochytrium Mitochondrial code ###
205 - datatype: CODON23
206 state: [ *Nucleotide, *Nucleotide, *Nucleotide ] # reference to Nucleotide
207 missing: [ *NTmissing, *NTmissing, *NTmissing ]
208 gap: [ *NTgap, *NTgap, *NTgap ]
209 translate: KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSS*CWC*FLF
210
211 ### Codon data with Pterobranchia mitochondrial code ###
212 - datatype: CODON24
213 state: [ *Nucleotide, *Nucleotide, *Nucleotide ] # reference to Nucleotide
214 missing: [ *NTmissing, *NTmissing, *NTmissing ]
215 gap: [ *NTgap, *NTgap, *NTgap ]
216 translate: KNKNTTTTSSKSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSWCWCLFLF
217
218 ### Codon data with Candidate Division SR1 and Gracilibacteria code ###
219 - datatype: CODON25
220 state: [ *Nucleotide, *Nucleotide, *Nucleotide ] # reference to Nucleotide
221 missing: [ *NTmissing, *NTmissing, *NTmissing ]
222 gap: [ *NTgap, *NTgap, *NTgap ]
223 translate: KNKNTTTTRSRSIIMIQHQHPPPPRRRRLLLLEDEDAAAAGGGGVVVV*Y*YSSSSGCWCLFLF
224
225
226 )";
227
StateSpace()228 StateSpace::StateSpace() {
229 num_states = 0;
230 num_all_states = 0;
231 }
232
~StateSpace()233 StateSpace::~StateSpace() {
234
235 }
236
isUnknown(StateType state)237 bool StateSpace::isUnknown(StateType state) {
238 return (state == num_states);
239 }
240
toState(string str)241 StateType StateSpace::toState(string str) {
242 StringStateMap::iterator it;
243 it = states.find(str);
244 if (it == states.end())
245 throw str + " is not a valid state symbol";
246 return it->second;
247 }
248
toState(string & str,StateVector & str_states)249 void StateSpace::toState(string &str, StateVector &str_states) {
250 size_t pos;
251 for (pos = 0; pos < str.length();) {
252 bool found = false;
253 for (int len = min_state_len; len <= max_state_len; len++) {
254 auto it = states.find(str.substr(pos, len));
255 if (it == states.end())
256 continue;
257 found = true;
258 str_states.push_back(it->second);
259 pos += len;
260 break;
261 }
262 if (!found)
263 throw str.substr(pos, max_state_len) + " is not a valid state symbol";
264 }
265 }
266
toString(StateType state)267 string StateSpace::toString(StateType state) {
268 auto it = raw_states.find(state);
269 ASSERT(it != raw_states.end());
270 return it->second;
271 }
272
273 /**
274 parse a string with range (e.g. 1..5) to a vector of string
275 */
parseRange(string str,StrVector & list)276 void parseRange(string str, StrVector &list) {
277 size_t pos;
278 if ((pos = str.find("..")) == string::npos) {
279 list.push_back(str);
280 return;
281 }
282 string first = str.substr(0, pos);
283 string last = str.substr(pos+2);
284 trimString(first);
285 trimString(last);
286 if (first.length() == 1 && last.length() == 1 && first[0] < last[0]) {
287 for (char ch = first[0]; ch <= last[0]; ch++)
288 list.push_back(string(1,ch));
289 } else {
290 list.push_back(str);
291 }
292 }
293
294 /**
295 parse a list into a vector of string
296 */
parseList(YAML::const_iterator first,YAML::const_iterator last,StrVector & list)297 void parseList(YAML::const_iterator first, YAML::const_iterator last, StrVector &list) {
298 StrVector this_list;
299 if (first->IsScalar())
300 parseRange(first->Scalar(), this_list);
301 else if (first->IsSequence()) {
302 for (auto it = first->begin(); it != first->end(); it++) {
303 parseRange(it->Scalar(), this_list);
304 }
305 } else {
306 throw YAML::Exception(first->Mark(), ERR_STRING_LIST);
307 }
308 StrVector last_list;
309 first++;
310 if (first != last)
311 parseList(first, last, last_list);
312 else
313 last_list = { "" };
314 for (auto sit = this_list.begin(); sit != this_list.end(); sit++)
315 for (auto sit2 = last_list.begin(); sit2 != last_list.end(); sit2++ )
316 list.push_back(*sit + *sit2);
317 }
318
319 /**
320 parse a YAML::Node into a list of strings
321 @param extend_length TRUE to make vector of characters if list has length 1
322 */
parseStringList(YAML::Node node,StrVector & list,bool extend_length=false)323 void parseStringList(YAML::Node node, StrVector &list, bool extend_length = false) {
324 if (node.IsScalar()) {
325 // scalar assumed to be string
326 parseRange(node.Scalar(), list);
327 } else if (node.IsSequence()) {
328 YAML::const_iterator it;
329 // check if a sequence of scalars
330 bool all_scalars = true;
331 for (it = node.begin(); it != node.end(); it++)
332 if (!it->IsScalar()) {
333 all_scalars = false;
334 break;
335 }
336
337 if (all_scalars) {
338 for (it = node.begin(); it != node.end(); it++)
339 parseRange(it->Scalar(), list);
340 } else {
341 // now it can be a sequence of sequences, merge them together
342 parseList(node.begin(), node.end(), list);
343 }
344 } else {
345 throw YAML::Exception(node.Mark(), ERR_STRING_LIST);
346 }
347
348 if (list.size() == 1 && extend_length) {
349 // single list, convert to vector of characters
350 for (auto i = list[0].begin()+1; i != list[0].end(); i++)
351 list.push_back(string(1,*i));
352 list[0] = list[0].substr(0,1);
353 }
354 }
355
resetStateSpace()356 void StateSpace::resetStateSpace() {
357 space_name = "";
358 num_states = 0;
359 num_all_states = 0;
360 states.clear();
361 raw_states.clear();
362 equate.clear();
363 translate.clear();
364 min_state_len = max_state_len = 0;
365 }
366
parseStateSpace(YAML::Node datatype)367 void StateSpace::parseStateSpace(YAML::Node datatype) {
368 if (!datatype.IsMap())
369 throw YAML::Exception(datatype.Mark(), ERR_NOT_A_MAP);
370 if (!datatype[KEY_DATATYPE])
371 throw YAML::Exception(datatype.Mark(), "'datatype: XXX' declaration not found");
372 resetStateSpace();
373 space_name = datatype[KEY_DATATYPE].Scalar();
374 // definition found
375 // parse state: symbols
376 if (!datatype[KEY_STATE])
377 throw YAML::Exception(datatype.Mark(), "datatype does not have 'state: [...]'");
378 StrVector allstates;
379 parseStringList(datatype[KEY_STATE], allstates);
380 if (allstates.size() < 2)
381 throw YAML::Exception(datatype[KEY_STATE].Mark(), "state space must have at least 2 states");
382 StateType stateID = 0;
383 for (auto sit = allstates.begin(); sit != allstates.end(); sit++, stateID++) {
384 states[*sit] = stateID;
385 raw_states[stateID] = *sit;
386 }
387 num_states = stateID;
388
389 if (verbose_mode >= VB_MED)
390 cout << states.size() << " " << KEY_STATE << endl;
391
392 // parse missing: symbols
393 if (datatype[KEY_MISSING]) {
394 StrVector list;
395 parseStringList(datatype[KEY_MISSING], list);
396 for (auto i = list.begin(); i != list.end(); i++) {
397 states[*i] = stateID;
398 raw_states[stateID] = *i;
399 }
400 }
401
402 // parse gap: symbols
403 if (datatype[KEY_GAP]) {
404 StrVector list;
405 parseStringList(datatype[KEY_GAP], list);
406 for (auto i = list.begin(); i != list.end(); i++) {
407 states[*i] = stateID;
408 raw_states[stateID] = *i;
409 }
410 }
411
412 stateID++;
413
414 // parse equate: symbols
415 YAML::Node node_equate;
416 if ((node_equate = datatype[KEY_EQUATE])) {
417 if (!node_equate.IsMap())
418 throw YAML::Exception(node_equate.Mark(), ERR_NOT_A_MAP);
419 for (auto nit = node_equate.begin(); nit != node_equate.end(); nit++) {
420 string key = nit->first.Scalar();
421 states[key] = stateID;
422 auto value = nit->second;
423 StrVector values;
424 parseStringList(value, values);
425 for (auto i = values.begin(); i != values.end(); i++) {
426 if (states.find(*i) == states.end())
427 throw YAML::Exception(value.Mark(), ERR_UNDEFINED_STATE);
428 if (equate.find(stateID) == equate.end())
429 equate[stateID] = { states[*i] };
430 else
431 equate[stateID].push_back(states[*i]);
432 }
433 if (equate[stateID].size() == 1) {
434 // map to just one state, so it's not an ambiguous state
435 states[key] = equate[stateID][0];
436 equate.erase(stateID);
437 } else {
438 // increase number of states
439 raw_states[stateID] = key;
440 stateID++;
441 }
442 } // for Node
443 if (verbose_mode >= VB_MED)
444 cout << equate.size() << " ambiguous states" << endl;
445 } // equate
446
447 // parse translate
448 if (datatype[KEY_TRANSLATE]) {
449 parseStringList(datatype[KEY_TRANSLATE], translate, true);
450 if (translate.size() != num_states)
451 throw YAML::Exception(datatype[KEY_TRANSLATE].Mark(), ERR_TRANSLATE_LENGTH);
452 }
453
454 num_all_states = stateID;
455 min_state_len = max_state_len = states.begin()->first.length();
456 for (auto i = states.begin(); i != states.end(); i++) {
457 if (min_state_len > i->first.length())
458 min_state_len = i->first.length();
459 if (max_state_len < i->first.length())
460 max_state_len = i->first.length();
461 }
462 }
463
initStateSpace(SeqType seqtype)464 void StateSpace::initStateSpace(SeqType seqtype) {
465
466 string name;
467 switch (seqtype) {
468 case SEQ_DNA: name = "DNA"; break;
469 case SEQ_CODON: name = "CODON"; break;
470 case SEQ_MORPH: name = "MORPH"; break;
471 case SEQ_BINARY: name = "BIN"; break;
472 case SEQ_PROTEIN: name = "AA"; break;
473 case SEQ_MULTISTATE: name = "MULTI"; break;
474 case SEQ_POMO: outError("Unhandled POMO state space"); break;
475 case SEQ_UNKNOWN: ASSERT(0);
476 }
477
478 try {
479 YAML::Node spaceDef = YAML::Load(builtin_state_spaces);
480 if (!spaceDef.IsSequence())
481 throw YAML::Exception(spaceDef.Mark(), ERR_NOT_A_LIST);
482 for (auto it = spaceDef.begin(); it != spaceDef.end(); it++)
483 {
484 auto datatype = *it;
485 if (!(datatype[KEY_DATATYPE]))
486 continue;
487 if (datatype[KEY_DATATYPE].Scalar() == name) {
488 parseStateSpace(datatype);
489 break;
490 }
491 }
492 } catch (YAML::Exception &e) {
493 outError(e.what());
494 }
495 }
496
497 } // namespace PML
498