1 //===- YAMLParser.h - Simple YAML parser ------------------------*- C++ -*-===// 2 // 3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 4 // See https://llvm.org/LICENSE.txt for license information. 5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception 6 // 7 //===----------------------------------------------------------------------===// 8 // 9 // This is a YAML 1.2 parser. 10 // 11 // See http://www.yaml.org/spec/1.2/spec.html for the full standard. 12 // 13 // This currently does not implement the following: 14 // * Multi-line literal folding. 15 // * Tag resolution. 16 // * UTF-16. 17 // * BOMs anywhere other than the first Unicode scalar value in the file. 18 // 19 // The most important class here is Stream. This represents a YAML stream with 20 // 0, 1, or many documents. 21 // 22 // SourceMgr sm; 23 // StringRef input = getInput(); 24 // yaml::Stream stream(input, sm); 25 // 26 // for (yaml::document_iterator di = stream.begin(), de = stream.end(); 27 // di != de; ++di) { 28 // yaml::Node *n = di->getRoot(); 29 // if (n) { 30 // // Do something with n... 31 // } else 32 // break; 33 // } 34 // 35 //===----------------------------------------------------------------------===// 36 37 #ifndef LLVM_SUPPORT_YAMLPARSER_H 38 #define LLVM_SUPPORT_YAMLPARSER_H 39 40 #include "llvm/ADT/StringRef.h" 41 #include "llvm/Support/Allocator.h" 42 #include "llvm/Support/SMLoc.h" 43 #include <cassert> 44 #include <cstddef> 45 #include <iterator> 46 #include <map> 47 #include <memory> 48 #include <string> 49 #include <system_error> 50 51 namespace llvm { 52 53 class MemoryBufferRef; 54 class SourceMgr; 55 class raw_ostream; 56 class Twine; 57 58 namespace yaml { 59 60 class Document; 61 class document_iterator; 62 class Node; 63 class Scanner; 64 struct Token; 65 66 /// Dump all the tokens in this stream to OS. 67 /// \returns true if there was an error, false otherwise. 68 bool dumpTokens(StringRef Input, raw_ostream &); 69 70 /// Scans all tokens in input without outputting anything. This is used 71 /// for benchmarking the tokenizer. 72 /// \returns true if there was an error, false otherwise. 73 bool scanTokens(StringRef Input); 74 75 /// Escape \a Input for a double quoted scalar; if \p EscapePrintable 76 /// is true, all UTF8 sequences will be escaped, if \p EscapePrintable is 77 /// false, those UTF8 sequences encoding printable unicode scalars will not be 78 /// escaped, but emitted verbatim. 79 std::string escape(StringRef Input, bool EscapePrintable = true); 80 81 /// This class represents a YAML stream potentially containing multiple 82 /// documents. 83 class Stream { 84 public: 85 /// This keeps a reference to the string referenced by \p Input. 86 Stream(StringRef Input, SourceMgr &, bool ShowColors = true, 87 std::error_code *EC = nullptr); 88 89 Stream(MemoryBufferRef InputBuffer, SourceMgr &, bool ShowColors = true, 90 std::error_code *EC = nullptr); 91 ~Stream(); 92 93 document_iterator begin(); 94 document_iterator end(); 95 void skip(); 96 bool failed(); 97 98 bool validate() { 99 skip(); 100 return !failed(); 101 } 102 103 void printError(Node *N, const Twine &Msg); 104 105 private: 106 friend class Document; 107 108 std::unique_ptr<Scanner> scanner; 109 std::unique_ptr<Document> CurrentDoc; 110 }; 111 112 /// Abstract base class for all Nodes. 113 class Node { 114 virtual void anchor(); 115 116 public: 117 enum NodeKind { 118 NK_Null, 119 NK_Scalar, 120 NK_BlockScalar, 121 NK_KeyValue, 122 NK_Mapping, 123 NK_Sequence, 124 NK_Alias 125 }; 126 127 Node(unsigned int Type, std::unique_ptr<Document> &, StringRef Anchor, 128 StringRef Tag); 129 130 // It's not safe to copy YAML nodes; the document is streamed and the position 131 // is part of the state. 132 Node(const Node &) = delete; 133 void operator=(const Node &) = delete; 134 135 void *operator new(size_t Size, BumpPtrAllocator &Alloc, 136 size_t Alignment = 16) noexcept { 137 return Alloc.Allocate(Size, Alignment); 138 } 139 140 void operator delete(void *Ptr, BumpPtrAllocator &Alloc, 141 size_t Size) noexcept { 142 Alloc.Deallocate(Ptr, Size, 0); 143 } 144 145 void operator delete(void *) noexcept = delete; 146 147 /// Get the value of the anchor attached to this node. If it does not 148 /// have one, getAnchor().size() will be 0. 149 StringRef getAnchor() const { return Anchor; } 150 151 /// Get the tag as it was written in the document. This does not 152 /// perform tag resolution. 153 StringRef getRawTag() const { return Tag; } 154 155 /// Get the verbatium tag for a given Node. This performs tag resoluton 156 /// and substitution. 157 std::string getVerbatimTag() const; 158 159 SMRange getSourceRange() const { return SourceRange; } 160 void setSourceRange(SMRange SR) { SourceRange = SR; } 161 162 // These functions forward to Document and Scanner. 163 Token &peekNext(); 164 Token getNext(); 165 Node *parseBlockNode(); 166 BumpPtrAllocator &getAllocator(); 167 void setError(const Twine &Message, Token &Location) const; 168 bool failed() const; 169 170 virtual void skip() {} 171 172 unsigned int getType() const { return TypeID; } 173 174 protected: 175 std::unique_ptr<Document> &Doc; 176 SMRange SourceRange; 177 178 ~Node() = default; 179 180 private: 181 unsigned int TypeID; 182 StringRef Anchor; 183 /// The tag as typed in the document. 184 StringRef Tag; 185 }; 186 187 /// A null value. 188 /// 189 /// Example: 190 /// !!null null 191 class NullNode final : public Node { 192 void anchor() override; 193 194 public: 195 NullNode(std::unique_ptr<Document> &D) 196 : Node(NK_Null, D, StringRef(), StringRef()) {} 197 198 static bool classof(const Node *N) { return N->getType() == NK_Null; } 199 }; 200 201 /// A scalar node is an opaque datum that can be presented as a 202 /// series of zero or more Unicode scalar values. 203 /// 204 /// Example: 205 /// Adena 206 class ScalarNode final : public Node { 207 void anchor() override; 208 209 public: 210 ScalarNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag, 211 StringRef Val) 212 : Node(NK_Scalar, D, Anchor, Tag), Value(Val) { 213 SMLoc Start = SMLoc::getFromPointer(Val.begin()); 214 SMLoc End = SMLoc::getFromPointer(Val.end()); 215 SourceRange = SMRange(Start, End); 216 } 217 218 // Return Value without any escaping or folding or other fun YAML stuff. This 219 // is the exact bytes that are contained in the file (after conversion to 220 // utf8). 221 StringRef getRawValue() const { return Value; } 222 223 /// Gets the value of this node as a StringRef. 224 /// 225 /// \param Storage is used to store the content of the returned StringRef iff 226 /// it requires any modification from how it appeared in the source. 227 /// This happens with escaped characters and multi-line literals. 228 StringRef getValue(SmallVectorImpl<char> &Storage) const; 229 230 static bool classof(const Node *N) { 231 return N->getType() == NK_Scalar; 232 } 233 234 private: 235 StringRef Value; 236 237 StringRef unescapeDoubleQuoted(StringRef UnquotedValue, 238 StringRef::size_type Start, 239 SmallVectorImpl<char> &Storage) const; 240 }; 241 242 /// A block scalar node is an opaque datum that can be presented as a 243 /// series of zero or more Unicode scalar values. 244 /// 245 /// Example: 246 /// | 247 /// Hello 248 /// World 249 class BlockScalarNode final : public Node { 250 void anchor() override; 251 252 public: 253 BlockScalarNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag, 254 StringRef Value, StringRef RawVal) 255 : Node(NK_BlockScalar, D, Anchor, Tag), Value(Value) { 256 SMLoc Start = SMLoc::getFromPointer(RawVal.begin()); 257 SMLoc End = SMLoc::getFromPointer(RawVal.end()); 258 SourceRange = SMRange(Start, End); 259 } 260 261 /// Gets the value of this node as a StringRef. 262 StringRef getValue() const { return Value; } 263 264 static bool classof(const Node *N) { 265 return N->getType() == NK_BlockScalar; 266 } 267 268 private: 269 StringRef Value; 270 }; 271 272 /// A key and value pair. While not technically a Node under the YAML 273 /// representation graph, it is easier to treat them this way. 274 /// 275 /// TODO: Consider making this not a child of Node. 276 /// 277 /// Example: 278 /// Section: .text 279 class KeyValueNode final : public Node { 280 void anchor() override; 281 282 public: 283 KeyValueNode(std::unique_ptr<Document> &D) 284 : Node(NK_KeyValue, D, StringRef(), StringRef()) {} 285 286 /// Parse and return the key. 287 /// 288 /// This may be called multiple times. 289 /// 290 /// \returns The key, or nullptr if failed() == true. 291 Node *getKey(); 292 293 /// Parse and return the value. 294 /// 295 /// This may be called multiple times. 296 /// 297 /// \returns The value, or nullptr if failed() == true. 298 Node *getValue(); 299 300 void skip() override { 301 if (Node *Key = getKey()) { 302 Key->skip(); 303 if (Node *Val = getValue()) 304 Val->skip(); 305 } 306 } 307 308 static bool classof(const Node *N) { 309 return N->getType() == NK_KeyValue; 310 } 311 312 private: 313 Node *Key = nullptr; 314 Node *Value = nullptr; 315 }; 316 317 /// This is an iterator abstraction over YAML collections shared by both 318 /// sequences and maps. 319 /// 320 /// BaseT must have a ValueT* member named CurrentEntry and a member function 321 /// increment() which must set CurrentEntry to 0 to create an end iterator. 322 template <class BaseT, class ValueT> 323 class basic_collection_iterator 324 : public std::iterator<std::input_iterator_tag, ValueT> { 325 public: 326 basic_collection_iterator() = default; 327 basic_collection_iterator(BaseT *B) : Base(B) {} 328 329 ValueT *operator->() const { 330 assert(Base && Base->CurrentEntry && "Attempted to access end iterator!"); 331 return Base->CurrentEntry; 332 } 333 334 ValueT &operator*() const { 335 assert(Base && Base->CurrentEntry && 336 "Attempted to dereference end iterator!"); 337 return *Base->CurrentEntry; 338 } 339 340 operator ValueT *() const { 341 assert(Base && Base->CurrentEntry && "Attempted to access end iterator!"); 342 return Base->CurrentEntry; 343 } 344 345 /// Note on EqualityComparable: 346 /// 347 /// The iterator is not re-entrant, 348 /// it is meant to be used for parsing YAML on-demand 349 /// Once iteration started - it can point only to one entry at a time 350 /// hence Base.CurrentEntry and Other.Base.CurrentEntry are equal 351 /// iff Base and Other.Base are equal. 352 bool operator==(const basic_collection_iterator &Other) const { 353 if (Base && (Base == Other.Base)) { 354 assert((Base->CurrentEntry == Other.Base->CurrentEntry) 355 && "Equal Bases expected to point to equal Entries"); 356 } 357 358 return Base == Other.Base; 359 } 360 361 bool operator!=(const basic_collection_iterator &Other) const { 362 return !(Base == Other.Base); 363 } 364 365 basic_collection_iterator &operator++() { 366 assert(Base && "Attempted to advance iterator past end!"); 367 Base->increment(); 368 // Create an end iterator. 369 if (!Base->CurrentEntry) 370 Base = nullptr; 371 return *this; 372 } 373 374 private: 375 BaseT *Base = nullptr; 376 }; 377 378 // The following two templates are used for both MappingNode and Sequence Node. 379 template <class CollectionType> 380 typename CollectionType::iterator begin(CollectionType &C) { 381 assert(C.IsAtBeginning && "You may only iterate over a collection once!"); 382 C.IsAtBeginning = false; 383 typename CollectionType::iterator ret(&C); 384 ++ret; 385 return ret; 386 } 387 388 template <class CollectionType> void skip(CollectionType &C) { 389 // TODO: support skipping from the middle of a parsed collection ;/ 390 assert((C.IsAtBeginning || C.IsAtEnd) && "Cannot skip mid parse!"); 391 if (C.IsAtBeginning) 392 for (typename CollectionType::iterator i = begin(C), e = C.end(); i != e; 393 ++i) 394 i->skip(); 395 } 396 397 /// Represents a YAML map created from either a block map for a flow map. 398 /// 399 /// This parses the YAML stream as increment() is called. 400 /// 401 /// Example: 402 /// Name: _main 403 /// Scope: Global 404 class MappingNode final : public Node { 405 void anchor() override; 406 407 public: 408 enum MappingType { 409 MT_Block, 410 MT_Flow, 411 MT_Inline ///< An inline mapping node is used for "[key: value]". 412 }; 413 414 MappingNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag, 415 MappingType MT) 416 : Node(NK_Mapping, D, Anchor, Tag), Type(MT) {} 417 418 friend class basic_collection_iterator<MappingNode, KeyValueNode>; 419 420 using iterator = basic_collection_iterator<MappingNode, KeyValueNode>; 421 422 template <class T> friend typename T::iterator yaml::begin(T &); 423 template <class T> friend void yaml::skip(T &); 424 425 iterator begin() { return yaml::begin(*this); } 426 427 iterator end() { return iterator(); } 428 429 void skip() override { yaml::skip(*this); } 430 431 static bool classof(const Node *N) { 432 return N->getType() == NK_Mapping; 433 } 434 435 private: 436 MappingType Type; 437 bool IsAtBeginning = true; 438 bool IsAtEnd = false; 439 KeyValueNode *CurrentEntry = nullptr; 440 441 void increment(); 442 }; 443 444 /// Represents a YAML sequence created from either a block sequence for a 445 /// flow sequence. 446 /// 447 /// This parses the YAML stream as increment() is called. 448 /// 449 /// Example: 450 /// - Hello 451 /// - World 452 class SequenceNode final : public Node { 453 void anchor() override; 454 455 public: 456 enum SequenceType { 457 ST_Block, 458 ST_Flow, 459 // Use for: 460 // 461 // key: 462 // - val1 463 // - val2 464 // 465 // As a BlockMappingEntry and BlockEnd are not created in this case. 466 ST_Indentless 467 }; 468 469 SequenceNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag, 470 SequenceType ST) 471 : Node(NK_Sequence, D, Anchor, Tag), SeqType(ST) {} 472 473 friend class basic_collection_iterator<SequenceNode, Node>; 474 475 using iterator = basic_collection_iterator<SequenceNode, Node>; 476 477 template <class T> friend typename T::iterator yaml::begin(T &); 478 template <class T> friend void yaml::skip(T &); 479 480 void increment(); 481 482 iterator begin() { return yaml::begin(*this); } 483 484 iterator end() { return iterator(); } 485 486 void skip() override { yaml::skip(*this); } 487 488 static bool classof(const Node *N) { 489 return N->getType() == NK_Sequence; 490 } 491 492 private: 493 SequenceType SeqType; 494 bool IsAtBeginning = true; 495 bool IsAtEnd = false; 496 bool WasPreviousTokenFlowEntry = true; // Start with an imaginary ','. 497 Node *CurrentEntry = nullptr; 498 }; 499 500 /// Represents an alias to a Node with an anchor. 501 /// 502 /// Example: 503 /// *AnchorName 504 class AliasNode final : public Node { 505 void anchor() override; 506 507 public: 508 AliasNode(std::unique_ptr<Document> &D, StringRef Val) 509 : Node(NK_Alias, D, StringRef(), StringRef()), Name(Val) {} 510 511 StringRef getName() const { return Name; } 512 Node *getTarget(); 513 514 static bool classof(const Node *N) { return N->getType() == NK_Alias; } 515 516 private: 517 StringRef Name; 518 }; 519 520 /// A YAML Stream is a sequence of Documents. A document contains a root 521 /// node. 522 class Document { 523 public: 524 Document(Stream &ParentStream); 525 526 /// Root for parsing a node. Returns a single node. 527 Node *parseBlockNode(); 528 529 /// Finish parsing the current document and return true if there are 530 /// more. Return false otherwise. 531 bool skip(); 532 533 /// Parse and return the root level node. 534 Node *getRoot() { 535 if (Root) 536 return Root; 537 return Root = parseBlockNode(); 538 } 539 540 const std::map<StringRef, StringRef> &getTagMap() const { return TagMap; } 541 542 private: 543 friend class Node; 544 friend class document_iterator; 545 546 /// Stream to read tokens from. 547 Stream &stream; 548 549 /// Used to allocate nodes to. All are destroyed without calling their 550 /// destructor when the document is destroyed. 551 BumpPtrAllocator NodeAllocator; 552 553 /// The root node. Used to support skipping a partially parsed 554 /// document. 555 Node *Root; 556 557 /// Maps tag prefixes to their expansion. 558 std::map<StringRef, StringRef> TagMap; 559 560 Token &peekNext(); 561 Token getNext(); 562 void setError(const Twine &Message, Token &Location) const; 563 bool failed() const; 564 565 /// Parse %BLAH directives and return true if any were encountered. 566 bool parseDirectives(); 567 568 /// Parse %YAML 569 void parseYAMLDirective(); 570 571 /// Parse %TAG 572 void parseTAGDirective(); 573 574 /// Consume the next token and error if it is not \a TK. 575 bool expectToken(int TK); 576 }; 577 578 /// Iterator abstraction for Documents over a Stream. 579 class document_iterator { 580 public: 581 document_iterator() = default; 582 document_iterator(std::unique_ptr<Document> &D) : Doc(&D) {} 583 584 bool operator==(const document_iterator &Other) const { 585 if (isAtEnd() || Other.isAtEnd()) 586 return isAtEnd() && Other.isAtEnd(); 587 588 return Doc == Other.Doc; 589 } 590 bool operator!=(const document_iterator &Other) const { 591 return !(*this == Other); 592 } 593 594 document_iterator operator++() { 595 assert(Doc && "incrementing iterator past the end."); 596 if (!(*Doc)->skip()) { 597 Doc->reset(nullptr); 598 } else { 599 Stream &S = (*Doc)->stream; 600 Doc->reset(new Document(S)); 601 } 602 return *this; 603 } 604 605 Document &operator*() { return *Doc->get(); } 606 607 std::unique_ptr<Document> &operator->() { return *Doc; } 608 609 private: 610 bool isAtEnd() const { return !Doc || !*Doc; } 611 612 std::unique_ptr<Document> *Doc = nullptr; 613 }; 614 615 } // end namespace yaml 616 617 } // end namespace llvm 618 619 #endif // LLVM_SUPPORT_YAMLPARSER_H 620