1 //===- YAMLParser.h - Simple YAML parser ------------------------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This is a YAML 1.2 parser. 10 // 11 // See http://www.yaml.org/spec/1.2/spec.html for the full standard. 12 // 13 // This currently does not implement the following: 14 // * Tag resolution. 15 // * UTF-16. 16 // * BOMs anywhere other than the first Unicode scalar value in the file. 17 // 18 // The most important class here is Stream. This represents a YAML stream with 19 // 0, 1, or many documents. 20 // 21 // SourceMgr sm; 22 // StringRef input = getInput(); 23 // yaml::Stream stream(input, sm); 24 // 25 // for (yaml::document_iterator di = stream.begin(), de = stream.end(); 26 // di != de; ++di) { 27 // yaml::Node *n = di->getRoot(); 28 // if (n) { 29 // // Do something with n... 30 // } else 31 // break; 32 // } 33 // 34 //===----------------------------------------------------------------------===// 35 36 #ifndef LLVM_SUPPORT_YAMLPARSER_H 37 #define LLVM_SUPPORT_YAMLPARSER_H 38 39 #include "llvm/ADT/StringRef.h" 40 #include "llvm/Support/Allocator.h" 41 #include "llvm/Support/SMLoc.h" 42 #include "llvm/Support/SourceMgr.h" 43 #include <cassert> 44 #include <cstddef> 45 #include <iterator> 46 #include <map> 47 #include <memory> 48 #include <string> 49 #include <system_error> 50 51 namespace llvm { 52 53 class MemoryBufferRef; 54 class raw_ostream; 55 class Twine; 56 57 namespace yaml { 58 59 class Document; 60 class document_iterator; 61 class Node; 62 class Scanner; 63 struct Token; 64 65 /// Dump all the tokens in this stream to OS. 66 /// \returns true if there was an error, false otherwise. 67 bool dumpTokens(StringRef Input, raw_ostream &); 68 69 /// Scans all tokens in input without outputting anything. This is used 70 /// for benchmarking the tokenizer. 71 /// \returns true if there was an error, false otherwise. 72 bool scanTokens(StringRef Input); 73 74 /// Escape \a Input for a double quoted scalar; if \p EscapePrintable 75 /// is true, all UTF8 sequences will be escaped, if \p EscapePrintable is 76 /// false, those UTF8 sequences encoding printable unicode scalars will not be 77 /// escaped, but emitted verbatim. 78 std::string escape(StringRef Input, bool EscapePrintable = true); 79 80 /// Parse \p S as a bool according to https://yaml.org/type/bool.html. 81 llvm::Optional<bool> parseBool(StringRef S); 82 83 /// This class represents a YAML stream potentially containing multiple 84 /// documents. 85 class Stream { 86 public: 87 /// This keeps a reference to the string referenced by \p Input. 88 Stream(StringRef Input, SourceMgr &, bool ShowColors = true, 89 std::error_code *EC = nullptr); 90 91 Stream(MemoryBufferRef InputBuffer, SourceMgr &, bool ShowColors = true, 92 std::error_code *EC = nullptr); 93 ~Stream(); 94 95 document_iterator begin(); 96 document_iterator end(); 97 void skip(); 98 bool failed(); 99 100 bool validate() { 101 skip(); 102 return !failed(); 103 } 104 105 void printError(Node *N, const Twine &Msg, 106 SourceMgr::DiagKind Kind = SourceMgr::DK_Error); 107 void printError(const SMRange &Range, const Twine &Msg, 108 SourceMgr::DiagKind Kind = SourceMgr::DK_Error); 109 110 private: 111 friend class Document; 112 113 std::unique_ptr<Scanner> scanner; 114 std::unique_ptr<Document> CurrentDoc; 115 }; 116 117 /// Abstract base class for all Nodes. 118 class Node { 119 virtual void anchor(); 120 121 public: 122 enum NodeKind { 123 NK_Null, 124 NK_Scalar, 125 NK_BlockScalar, 126 NK_KeyValue, 127 NK_Mapping, 128 NK_Sequence, 129 NK_Alias 130 }; 131 132 Node(unsigned int Type, std::unique_ptr<Document> &, StringRef Anchor, 133 StringRef Tag); 134 135 // It's not safe to copy YAML nodes; the document is streamed and the position 136 // is part of the state. 137 Node(const Node &) = delete; 138 void operator=(const Node &) = delete; 139 140 void *operator new(size_t Size, BumpPtrAllocator &Alloc, 141 size_t Alignment = 16) noexcept { 142 return Alloc.Allocate(Size, Alignment); 143 } 144 145 void operator delete(void *Ptr, BumpPtrAllocator &Alloc, 146 size_t Size) noexcept { 147 Alloc.Deallocate(Ptr, Size, 0); 148 } 149 150 void operator delete(void *) noexcept = delete; 151 152 /// Get the value of the anchor attached to this node. If it does not 153 /// have one, getAnchor().size() will be 0. 154 StringRef getAnchor() const { return Anchor; } 155 156 /// Get the tag as it was written in the document. This does not 157 /// perform tag resolution. 158 StringRef getRawTag() const { return Tag; } 159 160 /// Get the verbatium tag for a given Node. This performs tag resoluton 161 /// and substitution. 162 std::string getVerbatimTag() const; 163 164 SMRange getSourceRange() const { return SourceRange; } 165 void setSourceRange(SMRange SR) { SourceRange = SR; } 166 167 // These functions forward to Document and Scanner. 168 Token &peekNext(); 169 Token getNext(); 170 Node *parseBlockNode(); 171 BumpPtrAllocator &getAllocator(); 172 void setError(const Twine &Message, Token &Location) const; 173 bool failed() const; 174 175 virtual void skip() {} 176 177 unsigned int getType() const { return TypeID; } 178 179 protected: 180 std::unique_ptr<Document> &Doc; 181 SMRange SourceRange; 182 183 ~Node() = default; 184 185 private: 186 unsigned int TypeID; 187 StringRef Anchor; 188 /// The tag as typed in the document. 189 StringRef Tag; 190 }; 191 192 /// A null value. 193 /// 194 /// Example: 195 /// !!null null 196 class NullNode final : public Node { 197 void anchor() override; 198 199 public: 200 NullNode(std::unique_ptr<Document> &D) 201 : Node(NK_Null, D, StringRef(), StringRef()) {} 202 203 static bool classof(const Node *N) { return N->getType() == NK_Null; } 204 }; 205 206 /// A scalar node is an opaque datum that can be presented as a 207 /// series of zero or more Unicode scalar values. 208 /// 209 /// Example: 210 /// Adena 211 class ScalarNode final : public Node { 212 void anchor() override; 213 214 public: 215 ScalarNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag, 216 StringRef Val) 217 : Node(NK_Scalar, D, Anchor, Tag), Value(Val) { 218 SMLoc Start = SMLoc::getFromPointer(Val.begin()); 219 SMLoc End = SMLoc::getFromPointer(Val.end()); 220 SourceRange = SMRange(Start, End); 221 } 222 223 // Return Value without any escaping or folding or other fun YAML stuff. This 224 // is the exact bytes that are contained in the file (after conversion to 225 // utf8). 226 StringRef getRawValue() const { return Value; } 227 228 /// Gets the value of this node as a StringRef. 229 /// 230 /// \param Storage is used to store the content of the returned StringRef if 231 /// it requires any modification from how it appeared in the source. 232 /// This happens with escaped characters and multi-line literals. 233 StringRef getValue(SmallVectorImpl<char> &Storage) const; 234 235 static bool classof(const Node *N) { 236 return N->getType() == NK_Scalar; 237 } 238 239 private: 240 StringRef Value; 241 242 StringRef unescapeDoubleQuoted(StringRef UnquotedValue, 243 StringRef::size_type Start, 244 SmallVectorImpl<char> &Storage) const; 245 }; 246 247 /// A block scalar node is an opaque datum that can be presented as a 248 /// series of zero or more Unicode scalar values. 249 /// 250 /// Example: 251 /// | 252 /// Hello 253 /// World 254 class BlockScalarNode final : public Node { 255 void anchor() override; 256 257 public: 258 BlockScalarNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag, 259 StringRef Value, StringRef RawVal) 260 : Node(NK_BlockScalar, D, Anchor, Tag), Value(Value) { 261 SMLoc Start = SMLoc::getFromPointer(RawVal.begin()); 262 SMLoc End = SMLoc::getFromPointer(RawVal.end()); 263 SourceRange = SMRange(Start, End); 264 } 265 266 /// Gets the value of this node as a StringRef. 267 StringRef getValue() const { return Value; } 268 269 static bool classof(const Node *N) { 270 return N->getType() == NK_BlockScalar; 271 } 272 273 private: 274 StringRef Value; 275 }; 276 277 /// A key and value pair. While not technically a Node under the YAML 278 /// representation graph, it is easier to treat them this way. 279 /// 280 /// TODO: Consider making this not a child of Node. 281 /// 282 /// Example: 283 /// Section: .text 284 class KeyValueNode final : public Node { 285 void anchor() override; 286 287 public: 288 KeyValueNode(std::unique_ptr<Document> &D) 289 : Node(NK_KeyValue, D, StringRef(), StringRef()) {} 290 291 /// Parse and return the key. 292 /// 293 /// This may be called multiple times. 294 /// 295 /// \returns The key, or nullptr if failed() == true. 296 Node *getKey(); 297 298 /// Parse and return the value. 299 /// 300 /// This may be called multiple times. 301 /// 302 /// \returns The value, or nullptr if failed() == true. 303 Node *getValue(); 304 305 void skip() override { 306 if (Node *Key = getKey()) { 307 Key->skip(); 308 if (Node *Val = getValue()) 309 Val->skip(); 310 } 311 } 312 313 static bool classof(const Node *N) { 314 return N->getType() == NK_KeyValue; 315 } 316 317 private: 318 Node *Key = nullptr; 319 Node *Value = nullptr; 320 }; 321 322 /// This is an iterator abstraction over YAML collections shared by both 323 /// sequences and maps. 324 /// 325 /// BaseT must have a ValueT* member named CurrentEntry and a member function 326 /// increment() which must set CurrentEntry to 0 to create an end iterator. 327 template <class BaseT, class ValueT> class basic_collection_iterator { 328 public: 329 using iterator_category = std::input_iterator_tag; 330 using value_type = ValueT; 331 using difference_type = std::ptrdiff_t; 332 using pointer = value_type *; 333 using reference = value_type &; 334 335 basic_collection_iterator() = default; 336 basic_collection_iterator(BaseT *B) : Base(B) {} 337 338 ValueT *operator->() const { 339 assert(Base && Base->CurrentEntry && "Attempted to access end iterator!"); 340 return Base->CurrentEntry; 341 } 342 343 ValueT &operator*() const { 344 assert(Base && Base->CurrentEntry && 345 "Attempted to dereference end iterator!"); 346 return *Base->CurrentEntry; 347 } 348 349 operator ValueT *() const { 350 assert(Base && Base->CurrentEntry && "Attempted to access end iterator!"); 351 return Base->CurrentEntry; 352 } 353 354 /// Note on EqualityComparable: 355 /// 356 /// The iterator is not re-entrant, 357 /// it is meant to be used for parsing YAML on-demand 358 /// Once iteration started - it can point only to one entry at a time 359 /// hence Base.CurrentEntry and Other.Base.CurrentEntry are equal 360 /// iff Base and Other.Base are equal. 361 bool operator==(const basic_collection_iterator &Other) const { 362 if (Base && (Base == Other.Base)) { 363 assert((Base->CurrentEntry == Other.Base->CurrentEntry) 364 && "Equal Bases expected to point to equal Entries"); 365 } 366 367 return Base == Other.Base; 368 } 369 370 bool operator!=(const basic_collection_iterator &Other) const { 371 return !(Base == Other.Base); 372 } 373 374 basic_collection_iterator &operator++() { 375 assert(Base && "Attempted to advance iterator past end!"); 376 Base->increment(); 377 // Create an end iterator. 378 if (!Base->CurrentEntry) 379 Base = nullptr; 380 return *this; 381 } 382 383 private: 384 BaseT *Base = nullptr; 385 }; 386 387 // The following two templates are used for both MappingNode and Sequence Node. 388 template <class CollectionType> 389 typename CollectionType::iterator begin(CollectionType &C) { 390 assert(C.IsAtBeginning && "You may only iterate over a collection once!"); 391 C.IsAtBeginning = false; 392 typename CollectionType::iterator ret(&C); 393 ++ret; 394 return ret; 395 } 396 397 template <class CollectionType> void skip(CollectionType &C) { 398 // TODO: support skipping from the middle of a parsed collection ;/ 399 assert((C.IsAtBeginning || C.IsAtEnd) && "Cannot skip mid parse!"); 400 if (C.IsAtBeginning) 401 for (typename CollectionType::iterator i = begin(C), e = C.end(); i != e; 402 ++i) 403 i->skip(); 404 } 405 406 /// Represents a YAML map created from either a block map for a flow map. 407 /// 408 /// This parses the YAML stream as increment() is called. 409 /// 410 /// Example: 411 /// Name: _main 412 /// Scope: Global 413 class MappingNode final : public Node { 414 void anchor() override; 415 416 public: 417 enum MappingType { 418 MT_Block, 419 MT_Flow, 420 MT_Inline ///< An inline mapping node is used for "[key: value]". 421 }; 422 423 MappingNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag, 424 MappingType MT) 425 : Node(NK_Mapping, D, Anchor, Tag), Type(MT) {} 426 427 friend class basic_collection_iterator<MappingNode, KeyValueNode>; 428 429 using iterator = basic_collection_iterator<MappingNode, KeyValueNode>; 430 431 template <class T> friend typename T::iterator yaml::begin(T &); 432 template <class T> friend void yaml::skip(T &); 433 434 iterator begin() { return yaml::begin(*this); } 435 436 iterator end() { return iterator(); } 437 438 void skip() override { yaml::skip(*this); } 439 440 static bool classof(const Node *N) { 441 return N->getType() == NK_Mapping; 442 } 443 444 private: 445 MappingType Type; 446 bool IsAtBeginning = true; 447 bool IsAtEnd = false; 448 KeyValueNode *CurrentEntry = nullptr; 449 450 void increment(); 451 }; 452 453 /// Represents a YAML sequence created from either a block sequence for a 454 /// flow sequence. 455 /// 456 /// This parses the YAML stream as increment() is called. 457 /// 458 /// Example: 459 /// - Hello 460 /// - World 461 class SequenceNode final : public Node { 462 void anchor() override; 463 464 public: 465 enum SequenceType { 466 ST_Block, 467 ST_Flow, 468 // Use for: 469 // 470 // key: 471 // - val1 472 // - val2 473 // 474 // As a BlockMappingEntry and BlockEnd are not created in this case. 475 ST_Indentless 476 }; 477 478 SequenceNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag, 479 SequenceType ST) 480 : Node(NK_Sequence, D, Anchor, Tag), SeqType(ST) {} 481 482 friend class basic_collection_iterator<SequenceNode, Node>; 483 484 using iterator = basic_collection_iterator<SequenceNode, Node>; 485 486 template <class T> friend typename T::iterator yaml::begin(T &); 487 template <class T> friend void yaml::skip(T &); 488 489 void increment(); 490 491 iterator begin() { return yaml::begin(*this); } 492 493 iterator end() { return iterator(); } 494 495 void skip() override { yaml::skip(*this); } 496 497 static bool classof(const Node *N) { 498 return N->getType() == NK_Sequence; 499 } 500 501 private: 502 SequenceType SeqType; 503 bool IsAtBeginning = true; 504 bool IsAtEnd = false; 505 bool WasPreviousTokenFlowEntry = true; // Start with an imaginary ','. 506 Node *CurrentEntry = nullptr; 507 }; 508 509 /// Represents an alias to a Node with an anchor. 510 /// 511 /// Example: 512 /// *AnchorName 513 class AliasNode final : public Node { 514 void anchor() override; 515 516 public: 517 AliasNode(std::unique_ptr<Document> &D, StringRef Val) 518 : Node(NK_Alias, D, StringRef(), StringRef()), Name(Val) {} 519 520 StringRef getName() const { return Name; } 521 522 static bool classof(const Node *N) { return N->getType() == NK_Alias; } 523 524 private: 525 StringRef Name; 526 }; 527 528 /// A YAML Stream is a sequence of Documents. A document contains a root 529 /// node. 530 class Document { 531 public: 532 Document(Stream &ParentStream); 533 534 /// Root for parsing a node. Returns a single node. 535 Node *parseBlockNode(); 536 537 /// Finish parsing the current document and return true if there are 538 /// more. Return false otherwise. 539 bool skip(); 540 541 /// Parse and return the root level node. 542 Node *getRoot() { 543 if (Root) 544 return Root; 545 return Root = parseBlockNode(); 546 } 547 548 const std::map<StringRef, StringRef> &getTagMap() const { return TagMap; } 549 550 private: 551 friend class Node; 552 friend class document_iterator; 553 554 /// Stream to read tokens from. 555 Stream &stream; 556 557 /// Used to allocate nodes to. All are destroyed without calling their 558 /// destructor when the document is destroyed. 559 BumpPtrAllocator NodeAllocator; 560 561 /// The root node. Used to support skipping a partially parsed 562 /// document. 563 Node *Root; 564 565 /// Maps tag prefixes to their expansion. 566 std::map<StringRef, StringRef> TagMap; 567 568 Token &peekNext(); 569 Token getNext(); 570 void setError(const Twine &Message, Token &Location) const; 571 bool failed() const; 572 573 /// Parse %BLAH directives and return true if any were encountered. 574 bool parseDirectives(); 575 576 /// Parse %YAML 577 void parseYAMLDirective(); 578 579 /// Parse %TAG 580 void parseTAGDirective(); 581 582 /// Consume the next token and error if it is not \a TK. 583 bool expectToken(int TK); 584 }; 585 586 /// Iterator abstraction for Documents over a Stream. 587 class document_iterator { 588 public: 589 document_iterator() = default; 590 document_iterator(std::unique_ptr<Document> &D) : Doc(&D) {} 591 592 bool operator==(const document_iterator &Other) const { 593 if (isAtEnd() || Other.isAtEnd()) 594 return isAtEnd() && Other.isAtEnd(); 595 596 return Doc == Other.Doc; 597 } 598 bool operator!=(const document_iterator &Other) const { 599 return !(*this == Other); 600 } 601 602 document_iterator operator++() { 603 assert(Doc && "incrementing iterator past the end."); 604 if (!(*Doc)->skip()) { 605 Doc->reset(nullptr); 606 } else { 607 Stream &S = (*Doc)->stream; 608 Doc->reset(new Document(S)); 609 } 610 return *this; 611 } 612 613 Document &operator*() { return *Doc->get(); } 614 615 std::unique_ptr<Document> &operator->() { return *Doc; } 616 617 private: 618 bool isAtEnd() const { return !Doc || !*Doc; } 619 620 std::unique_ptr<Document> *Doc = nullptr; 621 }; 622 623 } // end namespace yaml 624 625 } // end namespace llvm 626 627 #endif // LLVM_SUPPORT_YAMLPARSER_H 628