1 #include <list>
2 #include <string>
3 #include <cstring>
4 #include <iostream>
5 #include <algorithm>
6 namespace edn {
7   using std::cout;
8   using std::endl;
9   using std::string;
10   using std::list;
11 
12   string validSymbolChars = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ.*+!-_?$%&=:#/";
13   enum TokenType {
14     TokenString,
15     TokenAtom,
16     TokenParen
17   };
18 
19   struct EdnToken {
20     TokenType type;
21     int line;
22     string value;
23   };
24 
25   enum NodeType {
26     EdnNil,
27     EdnSymbol,
28     EdnKeyword,
29     EdnBool,
30     EdnInt,
31     EdnFloat,
32     EdnString,
33     EdnChar,
34 
35     EdnList,
36     EdnVector,
37     EdnMap,
38     EdnSet,
39 
40     EdnDiscard,
41     EdnTagged
42   };
43 
44   struct EdnNode {
45     NodeType type;
46     int line;
47     string value;
48     list<EdnNode> values;
49   };
50 
createToken(TokenType type,int line,string value,list<EdnToken> & tokens)51   void createToken(TokenType type, int line, string value, list<EdnToken> &tokens) {
52     EdnToken token;
53     token.type = type;
54     token.line = line;
55     token.value = value;
56     tokens.push_back(token);
57   }
58 
typeToString(NodeType type)59   string typeToString(NodeType type) {
60     string output;
61     switch (type) {
62       case EdnSymbol:  output = "EdnSymbol";   break;
63       case EdnKeyword: output = "EdnKeyword";  break;
64       case EdnInt:     output = "EdnInt";      break;
65       case EdnFloat:   output = "EdnFloat";    break;
66       case EdnChar:    output = "EdnChar";     break;
67       case EdnBool:    output = "EdnBool";     break;
68       case EdnNil:     output = "EdnNil";      break;
69       case EdnString:  output = "EdnString";   break;
70       case EdnTagged:  output = "EdnTagged";   break;
71       case EdnList:    output = "EdnList";     break;
72       case EdnVector:  output = "EdnVector";   break;
73       case EdnSet:     output = "EdnSet";      break;
74       case EdnMap:     output = "EdnMap";      break;
75       case EdnDiscard: output = "EdnDiscard";  break;
76     }
77     return output;
78   }
79 
80   //by default checks if first char is in range of chars
strRangeIn(string str,const char * range,int start=0,int stop=1)81   bool strRangeIn(string str, const char* range, int start = 0, int stop = 1) {
82     string strRange = str.substr(start, stop);
83     return (std::strspn(strRange.c_str(), range) == strRange.length());
84   }
85 
lex(string edn)86   list<EdnToken> lex(string edn) {
87     string::iterator it;
88     int line = 1;
89     char escapeChar = '\\';
90     bool escaping = false;
91     bool inString = false;
92     string stringContent = "";
93     bool inComment = false;
94     string token = "";
95     string paren = "";
96     list<EdnToken> tokens;
97 
98     for (it = edn.begin(); it != edn.end(); ++it) {
99       if (*it == '\n' || *it == '\r') line++;
100 
101       if (!inString && *it == ';' && !escaping) inComment = true;
102 
103       if (inComment) {
104         if (*it == '\n') {
105           inComment = false;
106           if (token != "") {
107               createToken(TokenAtom, line, token, tokens);
108               token = "";
109           }
110           continue;
111         }
112       }
113 
114       if (*it == '"' && !escaping) {
115         if (inString) {
116           createToken(TokenString, line, stringContent, tokens);
117           inString = false;
118         } else {
119           stringContent = "";
120           inString = true;
121         }
122         continue;
123       }
124 
125       if (inString) {
126         if (*it == escapeChar && !escaping) {
127           escaping = true;
128           continue;
129         }
130 
131         if (escaping) {
132           escaping = false;
133           if (*it == 't' || *it == 'n' || *it == 'f' || *it == 'r') stringContent += escapeChar;
134         }
135         stringContent += *it;
136       } else if (*it == '(' || *it == ')' || *it == '[' || *it == ']' || *it == '{' ||
137                  *it == '}' || *it == '\t' || *it == '\n' || *it == '\r' || *it == ' ' || *it == ',') {
138         if (token != "") {
139           createToken(TokenAtom, line, token, tokens);
140           token = "";
141         }
142         if (*it == '(' || *it == ')' || *it == '[' || *it == ']' || *it == '{' || *it == '}') {
143           paren = "";
144           paren += *it;
145           createToken(TokenParen, line, paren, tokens);
146         }
147       } else {
148         if (escaping) {
149           escaping = false;
150         } else if (*it == escapeChar) {
151           escaping = true;
152         }
153 
154         if (token == "#_" || (token.length() == 2 && token[0] == escapeChar)) {
155           createToken(TokenAtom, line, token, tokens);
156           token = "";
157         }
158         token += *it;
159       }
160     }
161     if (token != "") {
162       createToken(TokenAtom, line, token, tokens);
163     }
164 
165     return tokens;
166   }
167 
uppercase(string & str)168   void uppercase(string &str) {
169     std::transform(str.begin(), str.end(), str.begin(), ::toupper);
170   }
171 
validSymbol(string value)172   bool validSymbol(string value) {
173     //first we uppercase the value
174     uppercase(value);
175 
176     if (std::strspn(value.c_str(), validSymbolChars.c_str()) != value.length())
177       return false;
178 
179     //if the value starts with a number that is not ok
180     if (strRangeIn(value, "0123456789"))
181       return false;
182 
183     //first char can not start with : # or / - but / by itself is valid
184     if (strRangeIn(value, ":#/") && !(value.length() == 1 && value[0] == '/'))
185       return false;
186 
187     //if the first car is - + or . then the next char must NOT be numeric, by by themselves they are valid
188     if (strRangeIn(value, "-+.") && value.length() > 1 && strRangeIn(value, "0123456789", 1))
189       return false;
190 
191     if (std::count(value.begin(), value.end(), '/') > 1)
192       return false;
193 
194     return true;
195   }
196 
validKeyword(string value)197   bool validKeyword(string value) {
198     return (value[0] == ':' && validSymbol(value.substr(1,value.length() - 1)));
199   }
200 
validNil(string value)201   bool validNil(string value) {
202     return (value == "nil");
203   }
204 
validBool(string value)205   bool validBool(string value) {
206     return (value == "true" || value == "false");
207   }
208 
validInt(string value,bool allowSign=true)209   bool validInt(string value, bool allowSign = true) {
210     //if we have a positive or negative symbol that is ok but remove it for testing
211     if (strRangeIn(value, "-+") && value.length() > 1 && allowSign)
212       value = value.substr(1, value.length() - 1);
213 
214     //if string ends with N or M that is ok, but remove it for testing
215     if (strRangeIn(value, "NM", value.length() - 1, 1))
216       value = value.substr(0, value.length() - 2);
217 
218     if (std::strspn(value.c_str(), "0123456789") != value.length())
219       return false;
220 
221     return true;
222   }
223 
validFloat(string value)224   bool validFloat(string value) {
225     uppercase(value);
226 
227     string front;
228     string back;
229     int epos;
230     int periodPos = value.find_first_of('.');
231     if (periodPos) {
232       front = value.substr(0, periodPos);
233       back = value.substr(periodPos + 1);
234     } else {
235       front = "";
236       back = value;
237     }
238 
239     if(front == "" || validInt(front)) {
240       epos = back.find_first_of('E');
241       if(epos > -1) {
242         //ends with E which is invalid
243         if ((unsigned)epos == back.length() - 1) return false;
244 
245         //both the decimal and exponent should be valid - do not allow + or - on dec (pass false as arg to validInt)
246         if (!validInt(back.substr(0, epos), false) || !validInt(back.substr(epos + 1))) return false;
247       } else {
248         //if back ends with M remove for validation
249         if (strRangeIn(back, "M", back.length() - 1, 1))
250           back = back.substr(0, back.length() - 1);
251 
252         if (!validInt(back, false)) return false;
253       }
254       return true;
255     }
256 
257     return false;
258   }
259 
validChar(string value)260   bool validChar(string value) {
261     return (value.at(0) == '\\' && value.length() == 2);
262   }
263 
handleAtom(EdnToken token)264   EdnNode handleAtom(EdnToken token) {
265     EdnNode node;
266     node.line = token.line;
267     node.value = token.value;
268 
269     if (validNil(token.value))
270       node.type = EdnNil;
271     else if (token.type == TokenString)
272       node.type = EdnString;
273     else if (validChar(token.value))
274       node.type = EdnChar;
275     else if (validBool(token.value))
276       node.type = EdnBool;
277     else if (validInt(token.value))
278       node.type = EdnInt;
279     else if (validFloat(token.value))
280       node.type = EdnFloat;
281     else if (validKeyword(token.value))
282       node.type = EdnKeyword;
283     else if (validSymbol(token.value))
284       node.type = EdnSymbol;
285     else
286       throw string("Could not parse atom");
287 
288     return node;
289   }
290 
handleCollection(EdnToken token,list<EdnNode> values)291   EdnNode handleCollection(EdnToken token, list<EdnNode> values) {
292     EdnNode node;
293     node.line = token.line;
294     node.values = values;
295 
296     if (token.value == "(") {
297       node.type = EdnList;
298     }
299     else if (token.value == "[") {
300       node.type = EdnVector;
301     }
302     if (token.value == "{") {
303       node.type = EdnMap;
304     }
305     return node;
306   }
307 
handleTagged(EdnToken token,EdnNode value)308   EdnNode handleTagged(EdnToken token, EdnNode value) {
309     EdnNode node;
310     node.line = token.line;
311 
312     string tagName = token.value.substr(1, token.value.length() - 1);
313     if (tagName == "_") {
314       node.type = EdnDiscard;
315     } else if (tagName == "") {
316       //special case where we return early as # { is a set - thus tagname is empty
317       node.type = EdnSet;
318       if (value.type != EdnMap) {
319         throw string("Was expection a { } after hash to build set");
320       }
321       node.values = value.values;
322       return node;
323     } else {
324       node.type = EdnTagged;
325     }
326 
327     if (!validSymbol(tagName)) {
328       throw string("Invalid tag name");
329     }
330 
331     EdnToken symToken;
332     symToken.type = TokenAtom;
333     symToken.line = token.line;
334     symToken.value = tagName;
335 
336     list<EdnNode> values;
337     values.push_back(handleAtom(symToken));
338     values.push_back(value);
339 
340     node.values = values;
341     return node;
342   }
343 
shiftToken(list<EdnToken> & tokens)344   EdnToken shiftToken(list<EdnToken> &tokens) {
345     EdnToken nextToken = tokens.front();
346     tokens.pop_front();
347     return nextToken;
348   }
349 
readAhead(EdnToken token,list<EdnToken> & tokens)350   EdnNode readAhead(EdnToken token, list<EdnToken> &tokens) {
351     if (token.type == TokenParen) {
352 
353       EdnToken nextToken;
354       list<EdnNode> L;
355       string closeParen;
356       if (token.value == "(") closeParen = ")";
357       if (token.value == "[") closeParen = "]";
358       if (token.value == "{") closeParen = "}";
359 
360       while (true) {
361         if (tokens.empty()) throw string("unexpected end of list");
362 
363         nextToken = shiftToken(tokens);
364 
365         if (nextToken.value == closeParen) {
366           return handleCollection(token, L);
367         } else {
368           L.push_back(readAhead(nextToken, tokens));
369         }
370       }
371     } else if (token.value == ")" || token.value == "]" || token.value == "}") {
372       throw string("Unexpected " + token.value);
373     } else {
374       if (token.value.size() && token.value.at(0) == '#') {
375         return handleTagged(token, readAhead(shiftToken(tokens), tokens));
376       } else {
377         return handleAtom(token);
378       }
379     }
380   }
381 
escapeQuotes(const string & before)382   string escapeQuotes(const string &before) {
383     string after;
384     after.reserve(before.length() + 4);
385 
386     for (string::size_type i = 0; i < before.length(); ++i) {
387       if (before[i] == '"' || before[i] == '\\') {
388           after += '\\';
389       }
390       after += before[i];
391     }
392     return after;
393   }
394 
pprint(EdnNode & node,int indent=1)395   string pprint(EdnNode &node, int indent = 1) {
396     string prefix("");
397     if (indent) {
398       prefix.insert(0, indent, ' ');
399     }
400 
401     string output;
402     if (node.type == EdnList || node.type == EdnSet || node.type == EdnVector || node.type == EdnMap) {
403       string vals = "";
404       for (list<EdnNode>::iterator it=node.values.begin(); it != node.values.end(); ++it) {
405         if (vals.length() > 0) vals += prefix;
406         vals += pprint(*it, indent + 1);
407         if (node.type == EdnMap) {
408           ++it;
409           vals += " " + pprint(*it, 1);
410         }
411         if (std::distance(it, node.values.end()) != 1) vals += "\n";
412       }
413 
414       if (node.type == EdnList) output = "(" + vals + ")";
415       else if (node.type == EdnVector) output = "[" + vals + "]";
416       else if (node.type == EdnMap) output = "{" + vals + "}";
417       else if (node.type == EdnSet) output = "#{" + vals + "}";
418 
419       #ifdef DEBUG
420         return "<" + typeToString(node.type) + " " + output + ">";
421       #endif
422     } else if (node.type == EdnTagged) {
423       output = "#" + pprint(node.values.front()) + " " + pprint(node.values.back());
424       #ifdef DEBUG
425         return "<EdnTagged " + output + ">";
426       #endif
427     } else if (node.type == EdnString) {
428       output = "\"" + escapeQuotes(node.value) + "\"";
429       #ifdef DEBUG
430         return "<EdnString " + output + ">";
431       #endif
432     } else {
433       #ifdef DEBUG
434         return "<" + typeToString(node.type) + " " + node.value + ">";
435       #endif
436 
437       output = node.value;
438     }
439     return output;
440   }
441 
read(string edn)442   EdnNode read(string edn) {
443     list<EdnToken> tokens = lex(edn);
444 
445     if (tokens.size() == 0) {
446       throw string("No parsable tokens found in string");
447     }
448 
449     return readAhead(shiftToken(tokens), tokens);
450   }
451 }
452