1 #include <list> 2 #include <string> 3 #include <cstring> 4 #include <iostream> 5 #include <algorithm> 6 namespace edn { 7 using std::cout; 8 using std::endl; 9 using std::string; 10 using std::list; 11 12 string validSymbolChars = "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ.*+!-_?$%&=:#/"; 13 enum TokenType { 14 TokenString, 15 TokenAtom, 16 TokenParen 17 }; 18 19 struct EdnToken { 20 TokenType type; 21 int line; 22 string value; 23 }; 24 25 enum NodeType { 26 EdnNil, 27 EdnSymbol, 28 EdnKeyword, 29 EdnBool, 30 EdnInt, 31 EdnFloat, 32 EdnString, 33 EdnChar, 34 35 EdnList, 36 EdnVector, 37 EdnMap, 38 EdnSet, 39 40 EdnDiscard, 41 EdnTagged 42 }; 43 44 struct EdnNode { 45 NodeType type; 46 int line; 47 string value; 48 list<EdnNode> values; 49 }; 50 createToken(TokenType type,int line,string value,list<EdnToken> & tokens)51 void createToken(TokenType type, int line, string value, list<EdnToken> &tokens) { 52 EdnToken token; 53 token.type = type; 54 token.line = line; 55 token.value = value; 56 tokens.push_back(token); 57 } 58 typeToString(NodeType type)59 string typeToString(NodeType type) { 60 string output; 61 switch (type) { 62 case EdnSymbol: output = "EdnSymbol"; break; 63 case EdnKeyword: output = "EdnKeyword"; break; 64 case EdnInt: output = "EdnInt"; break; 65 case EdnFloat: output = "EdnFloat"; break; 66 case EdnChar: output = "EdnChar"; break; 67 case EdnBool: output = "EdnBool"; break; 68 case EdnNil: output = "EdnNil"; break; 69 case EdnString: output = "EdnString"; break; 70 case EdnTagged: output = "EdnTagged"; break; 71 case EdnList: output = "EdnList"; break; 72 case EdnVector: output = "EdnVector"; break; 73 case EdnSet: output = "EdnSet"; break; 74 case EdnMap: output = "EdnMap"; break; 75 case EdnDiscard: output = "EdnDiscard"; break; 76 } 77 return output; 78 } 79 80 //by default checks if first char is in range of chars strRangeIn(string str,const char * range,int start=0,int stop=1)81 bool strRangeIn(string str, const char* range, int start = 0, int stop = 1) { 82 string strRange = str.substr(start, stop); 83 return (std::strspn(strRange.c_str(), range) == strRange.length()); 84 } 85 lex(string edn)86 list<EdnToken> lex(string edn) { 87 string::iterator it; 88 int line = 1; 89 char escapeChar = '\\'; 90 bool escaping = false; 91 bool inString = false; 92 string stringContent = ""; 93 bool inComment = false; 94 string token = ""; 95 string paren = ""; 96 list<EdnToken> tokens; 97 98 for (it = edn.begin(); it != edn.end(); ++it) { 99 if (*it == '\n' || *it == '\r') line++; 100 101 if (!inString && *it == ';' && !escaping) inComment = true; 102 103 if (inComment) { 104 if (*it == '\n') { 105 inComment = false; 106 if (token != "") { 107 createToken(TokenAtom, line, token, tokens); 108 token = ""; 109 } 110 continue; 111 } 112 } 113 114 if (*it == '"' && !escaping) { 115 if (inString) { 116 createToken(TokenString, line, stringContent, tokens); 117 inString = false; 118 } else { 119 stringContent = ""; 120 inString = true; 121 } 122 continue; 123 } 124 125 if (inString) { 126 if (*it == escapeChar && !escaping) { 127 escaping = true; 128 continue; 129 } 130 131 if (escaping) { 132 escaping = false; 133 if (*it == 't' || *it == 'n' || *it == 'f' || *it == 'r') stringContent += escapeChar; 134 } 135 stringContent += *it; 136 } else if (*it == '(' || *it == ')' || *it == '[' || *it == ']' || *it == '{' || 137 *it == '}' || *it == '\t' || *it == '\n' || *it == '\r' || *it == ' ' || *it == ',') { 138 if (token != "") { 139 createToken(TokenAtom, line, token, tokens); 140 token = ""; 141 } 142 if (*it == '(' || *it == ')' || *it == '[' || *it == ']' || *it == '{' || *it == '}') { 143 paren = ""; 144 paren += *it; 145 createToken(TokenParen, line, paren, tokens); 146 } 147 } else { 148 if (escaping) { 149 escaping = false; 150 } else if (*it == escapeChar) { 151 escaping = true; 152 } 153 154 if (token == "#_" || (token.length() == 2 && token[0] == escapeChar)) { 155 createToken(TokenAtom, line, token, tokens); 156 token = ""; 157 } 158 token += *it; 159 } 160 } 161 if (token != "") { 162 createToken(TokenAtom, line, token, tokens); 163 } 164 165 return tokens; 166 } 167 uppercase(string & str)168 void uppercase(string &str) { 169 std::transform(str.begin(), str.end(), str.begin(), ::toupper); 170 } 171 validSymbol(string value)172 bool validSymbol(string value) { 173 //first we uppercase the value 174 uppercase(value); 175 176 if (std::strspn(value.c_str(), validSymbolChars.c_str()) != value.length()) 177 return false; 178 179 //if the value starts with a number that is not ok 180 if (strRangeIn(value, "0123456789")) 181 return false; 182 183 //first char can not start with : # or / - but / by itself is valid 184 if (strRangeIn(value, ":#/") && !(value.length() == 1 && value[0] == '/')) 185 return false; 186 187 //if the first car is - + or . then the next char must NOT be numeric, by by themselves they are valid 188 if (strRangeIn(value, "-+.") && value.length() > 1 && strRangeIn(value, "0123456789", 1)) 189 return false; 190 191 if (std::count(value.begin(), value.end(), '/') > 1) 192 return false; 193 194 return true; 195 } 196 validKeyword(string value)197 bool validKeyword(string value) { 198 return (value[0] == ':' && validSymbol(value.substr(1,value.length() - 1))); 199 } 200 validNil(string value)201 bool validNil(string value) { 202 return (value == "nil"); 203 } 204 validBool(string value)205 bool validBool(string value) { 206 return (value == "true" || value == "false"); 207 } 208 validInt(string value,bool allowSign=true)209 bool validInt(string value, bool allowSign = true) { 210 //if we have a positive or negative symbol that is ok but remove it for testing 211 if (strRangeIn(value, "-+") && value.length() > 1 && allowSign) 212 value = value.substr(1, value.length() - 1); 213 214 //if string ends with N or M that is ok, but remove it for testing 215 if (strRangeIn(value, "NM", value.length() - 1, 1)) 216 value = value.substr(0, value.length() - 2); 217 218 if (std::strspn(value.c_str(), "0123456789") != value.length()) 219 return false; 220 221 return true; 222 } 223 validFloat(string value)224 bool validFloat(string value) { 225 uppercase(value); 226 227 string front; 228 string back; 229 int epos; 230 int periodPos = value.find_first_of('.'); 231 if (periodPos) { 232 front = value.substr(0, periodPos); 233 back = value.substr(periodPos + 1); 234 } else { 235 front = ""; 236 back = value; 237 } 238 239 if(front == "" || validInt(front)) { 240 epos = back.find_first_of('E'); 241 if(epos > -1) { 242 //ends with E which is invalid 243 if ((unsigned)epos == back.length() - 1) return false; 244 245 //both the decimal and exponent should be valid - do not allow + or - on dec (pass false as arg to validInt) 246 if (!validInt(back.substr(0, epos), false) || !validInt(back.substr(epos + 1))) return false; 247 } else { 248 //if back ends with M remove for validation 249 if (strRangeIn(back, "M", back.length() - 1, 1)) 250 back = back.substr(0, back.length() - 1); 251 252 if (!validInt(back, false)) return false; 253 } 254 return true; 255 } 256 257 return false; 258 } 259 validChar(string value)260 bool validChar(string value) { 261 return (value.at(0) == '\\' && value.length() == 2); 262 } 263 handleAtom(EdnToken token)264 EdnNode handleAtom(EdnToken token) { 265 EdnNode node; 266 node.line = token.line; 267 node.value = token.value; 268 269 if (validNil(token.value)) 270 node.type = EdnNil; 271 else if (token.type == TokenString) 272 node.type = EdnString; 273 else if (validChar(token.value)) 274 node.type = EdnChar; 275 else if (validBool(token.value)) 276 node.type = EdnBool; 277 else if (validInt(token.value)) 278 node.type = EdnInt; 279 else if (validFloat(token.value)) 280 node.type = EdnFloat; 281 else if (validKeyword(token.value)) 282 node.type = EdnKeyword; 283 else if (validSymbol(token.value)) 284 node.type = EdnSymbol; 285 else 286 throw string("Could not parse atom"); 287 288 return node; 289 } 290 handleCollection(EdnToken token,list<EdnNode> values)291 EdnNode handleCollection(EdnToken token, list<EdnNode> values) { 292 EdnNode node; 293 node.line = token.line; 294 node.values = values; 295 296 if (token.value == "(") { 297 node.type = EdnList; 298 } 299 else if (token.value == "[") { 300 node.type = EdnVector; 301 } 302 if (token.value == "{") { 303 node.type = EdnMap; 304 } 305 return node; 306 } 307 handleTagged(EdnToken token,EdnNode value)308 EdnNode handleTagged(EdnToken token, EdnNode value) { 309 EdnNode node; 310 node.line = token.line; 311 312 string tagName = token.value.substr(1, token.value.length() - 1); 313 if (tagName == "_") { 314 node.type = EdnDiscard; 315 } else if (tagName == "") { 316 //special case where we return early as # { is a set - thus tagname is empty 317 node.type = EdnSet; 318 if (value.type != EdnMap) { 319 throw string("Was expection a { } after hash to build set"); 320 } 321 node.values = value.values; 322 return node; 323 } else { 324 node.type = EdnTagged; 325 } 326 327 if (!validSymbol(tagName)) { 328 throw string("Invalid tag name"); 329 } 330 331 EdnToken symToken; 332 symToken.type = TokenAtom; 333 symToken.line = token.line; 334 symToken.value = tagName; 335 336 list<EdnNode> values; 337 values.push_back(handleAtom(symToken)); 338 values.push_back(value); 339 340 node.values = values; 341 return node; 342 } 343 shiftToken(list<EdnToken> & tokens)344 EdnToken shiftToken(list<EdnToken> &tokens) { 345 EdnToken nextToken = tokens.front(); 346 tokens.pop_front(); 347 return nextToken; 348 } 349 readAhead(EdnToken token,list<EdnToken> & tokens)350 EdnNode readAhead(EdnToken token, list<EdnToken> &tokens) { 351 if (token.type == TokenParen) { 352 353 EdnToken nextToken; 354 list<EdnNode> L; 355 string closeParen; 356 if (token.value == "(") closeParen = ")"; 357 if (token.value == "[") closeParen = "]"; 358 if (token.value == "{") closeParen = "}"; 359 360 while (true) { 361 if (tokens.empty()) throw string("unexpected end of list"); 362 363 nextToken = shiftToken(tokens); 364 365 if (nextToken.value == closeParen) { 366 return handleCollection(token, L); 367 } else { 368 L.push_back(readAhead(nextToken, tokens)); 369 } 370 } 371 } else if (token.value == ")" || token.value == "]" || token.value == "}") { 372 throw string("Unexpected " + token.value); 373 } else { 374 if (token.value.size() && token.value.at(0) == '#') { 375 return handleTagged(token, readAhead(shiftToken(tokens), tokens)); 376 } else { 377 return handleAtom(token); 378 } 379 } 380 } 381 escapeQuotes(const string & before)382 string escapeQuotes(const string &before) { 383 string after; 384 after.reserve(before.length() + 4); 385 386 for (string::size_type i = 0; i < before.length(); ++i) { 387 if (before[i] == '"' || before[i] == '\\') { 388 after += '\\'; 389 } 390 after += before[i]; 391 } 392 return after; 393 } 394 pprint(EdnNode & node,int indent=1)395 string pprint(EdnNode &node, int indent = 1) { 396 string prefix(""); 397 if (indent) { 398 prefix.insert(0, indent, ' '); 399 } 400 401 string output; 402 if (node.type == EdnList || node.type == EdnSet || node.type == EdnVector || node.type == EdnMap) { 403 string vals = ""; 404 for (list<EdnNode>::iterator it=node.values.begin(); it != node.values.end(); ++it) { 405 if (vals.length() > 0) vals += prefix; 406 vals += pprint(*it, indent + 1); 407 if (node.type == EdnMap) { 408 ++it; 409 vals += " " + pprint(*it, 1); 410 } 411 if (std::distance(it, node.values.end()) != 1) vals += "\n"; 412 } 413 414 if (node.type == EdnList) output = "(" + vals + ")"; 415 else if (node.type == EdnVector) output = "[" + vals + "]"; 416 else if (node.type == EdnMap) output = "{" + vals + "}"; 417 else if (node.type == EdnSet) output = "#{" + vals + "}"; 418 419 #ifdef DEBUG 420 return "<" + typeToString(node.type) + " " + output + ">"; 421 #endif 422 } else if (node.type == EdnTagged) { 423 output = "#" + pprint(node.values.front()) + " " + pprint(node.values.back()); 424 #ifdef DEBUG 425 return "<EdnTagged " + output + ">"; 426 #endif 427 } else if (node.type == EdnString) { 428 output = "\"" + escapeQuotes(node.value) + "\""; 429 #ifdef DEBUG 430 return "<EdnString " + output + ">"; 431 #endif 432 } else { 433 #ifdef DEBUG 434 return "<" + typeToString(node.type) + " " + node.value + ">"; 435 #endif 436 437 output = node.value; 438 } 439 return output; 440 } 441 read(string edn)442 EdnNode read(string edn) { 443 list<EdnToken> tokens = lex(edn); 444 445 if (tokens.size() == 0) { 446 throw string("No parsable tokens found in string"); 447 } 448 449 return readAhead(shiftToken(tokens), tokens); 450 } 451 } 452