// A fast YAML 1.2 parser and writer
// Written by Robert van Engelen
//
// https://yaml.org/spec/1.2/spec.html
//
// YAML doesn't define a formal grammar but instead defines over 200 rules.
// YAML uses indent to define structure.  This RE/flex lexer and parser uses
// indentation anchors \i, \j, and \k to parse YAML structures.
//
// This YAML parser follows the specification but does not generate errors for
// invalid YAML syntax, just tries to make sense of it all (YAML is complex!).
//
// Note:
// - directives are ignored
// - anchors (&id and *id) are stored with the YAML structure, but not resolved
// - tags (!tag) are parsed and stored with the YAML structure, but not written
// - scalars are always stored as strings (not converted to numbers/Booleans)
//
// YAML tokens generated by the lexer:
// - 'S' YAML document start marker ---
// - 'E' YAML document end marker ...
// - ';' newline, i.e. \r?\n
// - '=' one or more empty or blank lines
// - '$' string (a YAML scalar, quoted scalar, or block scalar)
// - '-' sequence dash
// - '?' map key
// - ':' map colon
// - '>' indent after ';' or '='
// - '<' dedent after ';' or '='
// - '[' flow sequence open bracket
// - ']' flow sequence close bracket
// - '{' flow map open brace
// - '}' flow map close brace
// - ',' flow sequence or map comma
// - '!' tag, '&' anchor, '*' alias
//
// YAML test files:
// - https://www.genivia.com/files/yamltests.zip

%top{
#include <stdlib.h>   // strtoul()
#include <iostream>   // std::cout etc.
#include <iomanip>    // std::setw
#include <vector>     // to store YAML containers
}

%{
// #define SHOW_TOKENS // to output tokens for debugging
%}

// Lexer class members
%class{

 public:

  // wide string to accumulate YAML scalars
  std::wstring string;

 protected:

  // count number of newlines matched: when the match starts with \r the
  // newlines are taken to be \r\n pairs, so half the match size is returned
  size_t newlines()
  {
    return chr() == '\r' ? size()/2 : size();
  }

  // parse the indent value given after a '|' or '>', if present; offset is
  // the number of indicator characters to skip in the matched text (strtoul
  // yields 0 when no digits follow)
  void parse_indent(size_t offset)
  {
    indent = strtoul(text() + offset, NULL, 10);
  }

  // use the parsed indent value given after a '|' or '>' to adjust the indent:
  // the last indent stop is replaced by the enclosing stop plus the explicit
  // indent and the surplus leading spaces are added to the scalar
  void adjust_indent()
  {
    if (indent == 0)
      return;
    std::vector<size_t>& stops = matcher().stops();
    if (stops.empty())
      return;
    size_t spaces = stops.back();
    if (spaces <= indent)
      return;
    stops.pop_back();
    if (stops.empty())
    {
      stops.push_back(indent);
    }
    else
    {
      // make spaces relative to the enclosing indent stop
      spaces -= stops.back();
      stops.push_back(stops.back() + indent);
    }
    // the relative indent can be smaller than the explicit indent when the
    // content is indented less than indicated; spaces - indent would then wrap
    // around and append() would throw, so only the surplus (if any) is added
    if (spaces > indent)
      string.append(spaces - indent, L' ');
  }

  // clear the string scalar before accumulating a new scalar
  void clear()
  {
    string.clear();
  }

  // add one or n chars c to the string
  void add(wchar_t c, size_t n = 1)
  {
    while (n-- > 0)
      string.push_back(c);
  }

  // add indent to the string, prefixed with a \n if nl is true; the indent
  // added is the part of the match that extends beyond the last indent stop
  void add_indent()
  {
    if (nl)
      string.push_back(L'\n');
    size_t stop = matcher().last_stop();
    if (size() > stop)
      string.append(size() - stop, L' ');
  }

  // if nl is true, add a \n to the string then reset nl
  void add_newline()
  {
    if (nl)
    {
      string.push_back(L'\n');
      nl = false;
    }
  }

  // add sp spaces to the string, then reset sp
  void add_space()
  {
    string.append(sp, L' ');
    sp = 0;
  }

  // chomp the string according to the chomp mode: CLIP leaves exactly one
  // trailing \n, STRIP removes all trailing \n, KEEP leaves the string as is
  void chomp()
  {
    switch (mode)
    {
      case CLIP:
        while (!string.empty() && string.back() == L'\n')
          string.pop_back();
        string.push_back(L'\n');
        break;
      case STRIP:
        while (!string.empty() && string.back() == L'\n')
          string.pop_back();
        break;
      case KEEP:
        break;
    }
  }

  unsigned long indent;            // block scalar indent value
  size_t sp;                       // insert spaces in folded block scalar
  bool nl;                         // insert newline in folded block scalar
  enum { CLIP, STRIP, KEEP } mode; // chomp mode
}

// Lexer class initialization at construction
%init{
  indent = 0;
  nl = false;
  sp = 0;
  mode = CLIP;
}

%o fast freespace dotall unicode

%x APOS QUOT PRES FOLD PBLK FBLK

direct          \h* % [^\n]* \n
comment         \h* # [^\n]*
ic              [^-?:\\\[\]{},!&*#'"@`[:space:]]
rc              [^\\\[\]{}:,[:space:]]
rd              [^-\\\[\]{}:,[:space:]]
rh              [^#\\\[\]{}:,[:space:]]
scalar          ({ic} | [-:?] {rd} | --- {rc}) ({rc} | :+ {rc} | \h+ {rh})*
tag             [!&*] {rc}+
h2              [[:xdigit:]]{2}
h4              [[:xdigit:]]{4}
h8              [[:xdigit:]]{8}
nl              \h* (# [^\n]* | \r)? \n
lf              \r? \n
bl              {lf} (\h* {lf})+
br              \h+ | (\h* {lf})+

%%

{direct}        { /* ignore directive */ }
{comment}       { /* ignore comment */ }
\h* {lf}        { return ';'; }
\h* {bl}        { return '='; }
^ \h+ \i        { return '>'; }
^ \h+ \j        |
\j              { return '<'; }
\h+             { /* ignore spaces and tabs */ }
"---" {br}      { return 'S'; }
"..." {br}      { return 'E'; }
"-"             { return '-'; }
"?"             { return '?'; }
":"             { return ':'; }
","             { return ','; }
"["             { return '['; }
"]"             { return ']'; }
"{"             { return '{'; }
"}"             { return '}'; }
"'"             { clear(); start(APOS); }
\"              { clear(); start(QUOT); }
"|" \d* {nl}    { clear(); parse_indent(1); mode = CLIP;  start(PRES); }
"|-" \d* {nl}   { clear(); parse_indent(2); mode = STRIP; start(PRES); }
"|+" \d* {nl}   { clear(); parse_indent(2); mode = KEEP;  start(PRES); }
">" \d* {nl}    { clear(); parse_indent(1); mode = CLIP;  start(FOLD); }
">-" \d* {nl}   { clear(); parse_indent(2); mode = STRIP; start(FOLD); }
">+" \d* {nl}   { clear(); parse_indent(2); mode = KEEP;  start(FOLD); }
{tag}           { string = wstr(); /* the parser copies string into tag/ref */ return chr(); }
{scalar}        { string = wstr(); return '$'; }

<APOS>{
'               { start(INITIAL); return '$'; }
''              { add(L'\''); }
}

<QUOT>{
\\ {lf}         { /* ignore \LF */ }
\"              { start(INITIAL); return '$'; }
\\ 0            { add(L'\0'); }
\\ a            { add(L'\a'); }
\\ b            { add(L'\b'); }
\\ t            { add(L'\t'); }
\\ n            { add(L'\n'); }
\\ v            { add(L'\v'); }
\\ f            { add(L'\f'); }
\\ r            { add(L'\r'); }
\\ e            { add(0x1b); }
\\ N            { add(0x85); }
\\ _            { add(0xa0); }
\\ L            { add(0x2028); }
\\ P            { add(0x2029); }
\\ x {h2}       { add(strtoul(text() + 2, NULL, 16)); }
\\ u {h4}       { add(strtoul(text() + 2, NULL, 16)); }
\\ U {h8}       { add(strtoul(text() + 2, NULL, 16)); }
\\ .            { add(wstr()[1]); }
}

<APOS,QUOT>{
^ \h+ \k?       { /* ignore nodent/undent */ }
\h* {lf}        { add(L' '); }
{bl}            { add(L'\n', newlines() - 1); }
.               { add(wchr()); }
}

<PRES>{
^ \h+ \i        { adjust_indent(); start(PBLK); }
}

<FOLD>{
^ \h+ \i        { adjust_indent(); sp = 0; nl = false; start(FBLK); }
}

<PBLK>{
{lf}            { add(L'\n'); }
{bl}            { add(L'\n', newlines()); }
^ \h* \j        |
\j              { chomp(); start(INITIAL); return '$'; }
^ \h+           { }
^ \h+ \k        { add_indent(); }
.               { add(wchr()); }
}

<FBLK>{
\h+ {lf}        { sp = size() - 1 - (*(matcher().end() - 2) == '\r'); }
{lf}            { sp = 1; }
{bl}            { add(L'\n', newlines() - 1); }
^ \h* \j        |
\j              { chomp(); start(INITIAL); return '$'; }
^ \h+           { add_newline(); }
^ \h+ \k        { sp = 0; nl = true; add_indent(); }
.               { add_space(); add(wchr()); }
}

%%

// YAML 1.2 value with YAML writer
class YAML {

 public:

  typedef std::wstring         Str; // YAML string (scalars)
  typedef std::vector<YAML>    Seq; // YAML sequence
  typedef std::pair<YAML,YAML> Duo; // YAML map entry (key, value)
  typedef std::vector<Duo>     Map; // YAML map

  YAML() { }

  // write this value to os: a non-empty str is written as a quoted string,
  // otherwise a non-empty seq as "- " items, otherwise a non-empty map as
  // "key: value" entries; a value with all three empty writes nothing.
  // - indent is the nesting depth used for the newline + indentation written
  //   before each item or entry
  // - key is true when this value is itself a map key; the first item or
  //   entry is then written after "? " on the current line instead of on a
  //   new indented line
  void write(std::ostream& os, size_t indent, bool key) const
  {
    if (!str.empty())
    {
      write_string(os, str);
    }
    else if (!seq.empty())
    {
      for (YAML::Seq::const_iterator i = seq.begin(); i != seq.end(); ++i)
      {
        if (key)
        {
          os << "? ";
          key = false;
          ++indent;
        }
        else
        {
          write_indent(os, indent);
        }
        os << "- ";
        i->write(os, indent + 1, false);
      }
    }
    else if (!map.empty())
    {
      for (YAML::Map::const_iterator i = map.begin(); i != map.end(); ++i)
      {
        if (key)
        {
          os << "? ";
          key = false;
          ++indent;
        }
        else
        {
          write_indent(os, indent);
        }
        // a key that is a sequence or map is written in "? key" form and the
        // ": " that follows it is put on a new indented line
        i->first.write(os, indent, !i->first.seq.empty() || !i->first.map.empty());
        if (!i->first.seq.empty() || !i->first.map.empty())
          write_indent(os, indent);
        os << ": ";
        i->second.write(os, indent + 1, false);
      }
    }
  }

  Str tag; // YAML tag, starts with '!'
  Str ref; // YAML anchor, starts with '&' or '*'
  Str str; // YAML string (scalar)
  Seq seq; // YAML sequence
  Map map; // YAML map

 protected:

  // write newline with indent (two spaces per indent level)
  static void write_indent(std::ostream& os, size_t indent)
  {
    os << '\n';
    while (indent-- > 0)
      os << "  ";
  }

  // write YAML quoted string: '"' and '\\' are backslash-escaped, common
  // control characters use their short escapes, other codes below 0x20 are
  // written as \xXX, 0x20 to 0x7f are written as is, and everything else is
  // written as UTF-8
  static void write_string(std::ostream& os, const std::wstring& s)
  {
    // hex digits are written by hand so that no formatting state (fill
    // character, number base) is changed on the caller's stream
    static const char hex[] = "0123456789abcdef";
    os << '"';
    for (std::wstring::const_iterator i = s.begin(); i != s.end(); ++i)
    {
      switch (*i)
      {
        case '"' :
        case '\\': os << '\\' << static_cast<char>(*i); break;
        case '\0': os << "\\0"; break;
        case '\a': os << "\\a"; break;
        case '\b': os << "\\b"; break;
        case '\t': os << "\\t"; break;
        case '\n': os << "\\n"; break;
        case '\v': os << "\\v"; break;
        case '\f': os << "\\f"; break;
        case '\r': os << "\\r"; break;
        default  :
          if (*i >= '\x20' && *i <= '\x7f')
          { // emit printable char
            os << static_cast<char>(*i);
          }
          else if (*i < 0x20)
          { // emit \xXX for control codes
            os << "\\x" << hex[(*i >> 4) & 0xf] << hex[*i & 0xf];
          }
          else if (*i >= 0xD800 && *i < 0xDC00 && i + 1 != s.end() && *(i + 1) >= 0xDC00 && *(i + 1) < 0xE000)
          { // UTF-16 surrogate pair: combine high and low surrogate
            char buf[8];
            int c = 0x010000 + ((*i - 0xD800) << 10);
            c += *++i - 0xDC00;
            buf[reflex::utf8(c, buf)] = '\0'; // convert to UTF-8 and make \0-terminated
            os << buf;
          }
          else if (*i >= 0xD800 && *i < 0xE000)
          { // unpaired surrogate (no partner to combine with): emit \uXXXX
            os << "\\u" << hex[(*i >> 12) & 0xf] << hex[(*i >> 8) & 0xf] << hex[(*i >> 4) & 0xf] << hex[*i & 0xf];
          }
          else
          { // else emit UTF-8
            char buf[8];
            buf[reflex::utf8(*i, buf)] = '\0'; // convert to UTF-8 and make \0-terminated
            os << buf;
          }
      }
    }
    os << '"';
  }
};

// write a YAML value at indent level 0
std::ostream& operator<<(std::ostream& os, const YAML& data)
{
  data.write(os, 0, false);
  return os;
}

// YAML 1.2 parser derived from the lexer
class YAMLParser : public Lexer {

 public:

  // construct the parser on input fd and read the first token
  YAMLParser(FILE *fd = NULL) : Lexer(fd), token(lex()) { }

  // parse YAML documents into doc; a document start marker is skipped and
  // parsing stops at end of input or at a document end marker
  void parse()
  {
    while (true)
    {
      YAML data;
      if (token == 'S')
        next();
      else if (token == 0 || token == 'E')
        break;
      parse(data);
      doc.push_back(data);
    }
  }

  // write YAML documents, each preceded by "--- ", and a final "..."
  void write(std::ostream &os) const
  {
    for (YAML::Seq::const_iterator i = doc.begin(); i != doc.end(); ++i)
      os << "--- " << *i << '\n';
    os << "...\n";
  }

  YAML::Seq doc; // sequence of YAML documents parsed

 protected:

  // parse YAML data: skips one leading newline token, stores an optional tag
  // and an optional anchor, then dispatches on the current token; a token
  // that cannot start a value is skipped
  void parse(YAML& data)
  {
    if (token == '=' || token == ';')
      next();
    if (token == '!')
    {
      data.tag = string;
      next();
    }
    if (token == '&' || token == '*')
    {
      data.ref = string;
      next();
    }
    switch (token)
    {
      case '-': parse_seq(data);        break;
      case '>': parse_ind(data);        break;
      case '$': parse_str_or_map(data); break;
      case '[': parse_flow_seq(data);   break;
      case '{': parse_flow_map(data);   break;
      case '?': parse_key(data);        break;
      case ':': parse_map(data);        break;
      default:
#ifdef SHOW_TOKENS
        std::cout << "skipping " << (char)token << "\n";
#endif
        next();
        break;
    }
  }

  // parse "? key : val ..."
  void parse_key(YAML& data)
  {
    next();
    if (token == ';' || token == '=')
    {
      // key starts on the next line
      next();
      if (token == '>')
        parse_ind(data);
    }
    else
    {
      // key follows the '?' on the same line: insert an indent stop at the
      // current column before parsing it, then consume the matching dedent
      matcher().insert_stop(matcher().columno());
      parse(data);
      if (token == '<')
        next();
    }
    if (token == ':')
      parse_map(data);
  }

  // parse "- val ..."
501 void parse_seq(YAML& data) 502 { 503 if (data.tag.empty()) 504 data.tag = L"!!seq"; 505 size_t level = 0; 506 while (true) 507 { 508 if (token == '>') 509 { 510 next(); 511 ++level; 512 } 513 while (token == '<') 514 { 515 if (level == 0) 516 return; 517 next(); 518 if (level == 1) 519 return; 520 --level; 521 } 522 if (token != '-') 523 break; 524 next(); 525 YAML val; 526 parse(val); 527 data.seq.push_back(val); 528 } 529 } 530 531 // parse indented value (string, nested sequence, or nested map) 532 void parse_ind(YAML& data) 533 { 534 next(); 535 if (token == '-') 536 { 537 parse_seq(data); 538 } 539 else 540 { 541 bool sp = true; 542 size_t level = 0; 543 while (token == '$') 544 { 545 if (data.str.empty()) 546 data.str = string; 547 else if (sp) 548 data.str.append(L" ").append(string); 549 else 550 data.str.append(string); 551 sp = true; 552 next(); 553 if (token == ';') 554 { 555 next(); 556 } 557 else if (token == '=') 558 { 559 data.str.append(newlines() - 1, L'\n'); 560 sp = false; 561 next(); 562 } 563 else 564 { 565 break; 566 } 567 if (token == '>') 568 { 569 ++level; 570 data.str.push_back(L'\n'); 571 sp = false; 572 next(); 573 } 574 while (token == '<') 575 { 576 if (level == 0) 577 break; 578 next(); 579 --level; 580 } 581 } 582 } 583 if (token == ':') 584 parse_map(data); 585 if (token == '<') 586 next(); 587 } 588 589 // parse string 590 void parse_str(YAML& data) 591 { 592 if (data.tag.empty()) 593 data.tag = L"!!str"; 594 data.str = string; 595 next(); 596 if (token == ';') 597 next(); 598 } 599 600 // parse string of "key: val ..." 601 void parse_str_or_map(YAML& data) 602 { 603 if (data.tag.empty()) 604 data.tag = L"!!str"; 605 data.str = string; 606 next(); 607 if (token == ':') 608 parse_map(data); 609 else if (token == ';' || token == '=') 610 next(); 611 } 612 613 // key is given in data, now parse ": val ..." 
614 void parse_map(YAML& data) 615 { 616 next(); 617 YAML val1; 618 if (token == ';' || token == '=') 619 { 620 next(); 621 if (token != 0 && token != 'S' && token != 'E' && token != '$' && token != '?' && token != ':' && token != '<') 622 parse(val1); 623 } 624 else if (token != 0 && token != 'S' && token != 'E' && token != '?' && token != ':' && token != '<') 625 { 626 parse(val1); 627 } 628 YAML::Duo duo(data, val1); 629 data.tag = L"!!map"; 630 data.str.clear(); 631 data.seq.clear(); 632 data.map.clear(); 633 data.map.push_back(duo); 634 while (token != 0 && token != 'S' && token != 'E' && token != '<') 635 { 636 YAML key, val; 637 if (token == ';' || token == '=') 638 next(); 639 if (token == '$') 640 { 641 parse_str(key); 642 } 643 else if (token == '?') 644 { 645 next(); 646 if (token == ';' || token == '=') 647 { 648 next(); 649 if (token == '-') 650 parse_seq(val); 651 else if (token == '>') 652 parse_ind(key); 653 } 654 else 655 { 656 matcher().insert_stop(matcher().columno()); 657 parse(key); 658 if (token == '<') 659 next(); 660 } 661 } 662 if (token == ':') 663 { 664 next(); 665 if (token == ';' || token == '=') 666 { 667 next(); 668 if (token == '-') 669 parse_seq(val); 670 else if (token == '>') 671 parse_ind(val); 672 } 673 else 674 { 675 matcher().insert_stop(matcher().columno()); 676 parse(val); 677 if (token == '<') 678 next(); 679 } 680 } 681 else 682 { 683 break; 684 } 685 data.map.push_back(YAML::Duo(key, val)); 686 } 687 } 688 689 // parse "[ val, ... 
]" 690 void parse_flow_seq(YAML& data) 691 { 692 if (data.tag.empty()) 693 data.tag = L"!!seq"; 694 size_t level = 0; 695 next(); 696 while (token != 0 && token != 'S' && token != 'E' && token != ']') 697 { 698 YAML val; 699 if (token == ';' || token == '=') 700 next(); 701 if (token == '>') 702 { 703 ++level; 704 next(); 705 } 706 else 707 { 708 while (token == '<') 709 { 710 if (level > 0) 711 --level; 712 next(); 713 } 714 } 715 parse(val); 716 data.seq.push_back(val); 717 if (token == ';' || token == '=') 718 next(); 719 if (token == '>') 720 { 721 ++level; 722 next(); 723 } 724 else 725 { 726 while (token == '<') 727 { 728 if (level > 0) 729 --level; 730 next(); 731 } 732 } 733 if (token == ',') 734 next(); 735 if (token == ';' || token == '=') 736 next(); 737 if (token == '>') 738 { 739 ++level; 740 next(); 741 } 742 else 743 { 744 while (token == '<') 745 { 746 if (level > 0) 747 --level; 748 next(); 749 } 750 } 751 } 752 if (token == ']') 753 next(); 754 if (token == ';' || token == '=') 755 next(); 756 while (token == '<' && level-- > 0) 757 next(); 758 } 759 760 // parse "{ key:val, ... 
}" 761 void parse_flow_map(YAML& data) 762 { 763 if (data.tag.empty()) 764 data.tag = L"!!map"; 765 size_t level = 0; 766 next(); 767 while (token != 0 && token != 'S' && token != 'E' && token != '}') 768 { 769 YAML key, val; 770 if (token == ';' || token == '=') 771 next(); 772 if (token == '>') 773 { 774 ++level; 775 next(); 776 } 777 else 778 { 779 while (token == '<') 780 { 781 if (level > 0) 782 --level; 783 next(); 784 } 785 } 786 if (token == '$') 787 parse_str(key); 788 else if (token == '[') 789 parse_flow_seq(key); 790 else if (token == '{') 791 parse_flow_map(key); 792 if (token == ':') 793 { 794 next(); 795 if (token != ';' && token != '=' && token != ',') 796 parse(val); 797 } 798 data.map.push_back(YAML::Duo(key, val)); 799 if (token == ';' || token == '=') 800 next(); 801 if (token == '>') 802 { 803 ++level; 804 next(); 805 } 806 else 807 { 808 while (token == '<') 809 { 810 if (level > 0) 811 --level; 812 next(); 813 } 814 } 815 if (token == ',') 816 next(); 817 if (token == ';' || token == '=') 818 next(); 819 if (token == '>') 820 { 821 ++level; 822 next(); 823 } 824 else if (token == '<') 825 { 826 if (level > 0) 827 --level; 828 next(); 829 } 830 } 831 if (token == '}') 832 next(); 833 if (token == ';' || token == '=') 834 next(); 835 while (token == '<' && level-- > 0) 836 next(); 837 } 838 839 // get next token, zero when EOF 840 int next() 841 { 842 #ifdef SHOW_TOKENS // produce token sequence for debugging 843 std::cout << "token " << (char)token << '\n'; 844 if (token == '$') 845 std::wcout << L">>>" << string << L"<<<\n"; 846 #endif 847 if (token != 0) 848 token = lex(); 849 return token; 850 } 851 852 int token; 853 }; 854 855 std::ostream& operator<<(std::ostream& os, const YAMLParser& parser) 856 { 857 parser.write(os); 858 return os; 859 } 860 861 // The main program parses YAML from a file or from stdin and writes it 862 int main(int argc, char **argv) 863 { 864 FILE *fd = stdin; 865 // open file if a file name is given on the command 
line 866 if (argc > 1 && (fd = fopen(argv[1], "r")) == NULL) 867 exit(EXIT_FAILURE); 868 YAMLParser yaml(fd); 869 yaml.parse(); 870 std::cout << yaml; 871 } 872