1 //===- YAMLParser.h - Simple YAML parser ------------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 //  This is a YAML 1.2 parser.
10 //
11 //  See http://www.yaml.org/spec/1.2/spec.html for the full standard.
12 //
13 //  This currently does not implement the following:
14 //    * Tag resolution.
15 //    * UTF-16.
16 //    * BOMs anywhere other than the first Unicode scalar value in the file.
17 //
18 //  The most important class here is Stream. This represents a YAML stream with
19 //  0, 1, or many documents.
20 //
21 //  SourceMgr sm;
22 //  StringRef input = getInput();
23 //  yaml::Stream stream(input, sm);
24 //
25 //  for (yaml::document_iterator di = stream.begin(), de = stream.end();
26 //       di != de; ++di) {
27 //    yaml::Node *n = di->getRoot();
28 //    if (n) {
29 //      // Do something with n...
30 //    } else
31 //      break;
32 //  }
33 //
34 //===----------------------------------------------------------------------===//
35 
36 #ifndef LLVM_SUPPORT_YAMLPARSER_H
37 #define LLVM_SUPPORT_YAMLPARSER_H
38 
39 #include "llvm/ADT/StringRef.h"
40 #include "llvm/Support/Allocator.h"
41 #include "llvm/Support/SMLoc.h"
42 #include "llvm/Support/SourceMgr.h"
43 #include <cassert>
44 #include <cstddef>
45 #include <iterator>
46 #include <map>
47 #include <memory>
48 #include <optional>
49 #include <string>
50 #include <system_error>
51 
52 namespace llvm {
53 
54 class MemoryBufferRef;
55 class raw_ostream;
56 class Twine;
57 
58 namespace yaml {
59 
60 class Document;
61 class document_iterator;
62 class Node;
63 class Scanner;
64 struct Token;
65 
66 /// Dump all the tokens in this stream to OS.
67 /// \returns true if there was an error, false otherwise.
68 bool dumpTokens(StringRef Input, raw_ostream &);
69 
70 /// Scans all tokens in input without outputting anything. This is used
71 ///        for benchmarking the tokenizer.
72 /// \returns true if there was an error, false otherwise.
73 bool scanTokens(StringRef Input);
74 
75 /// Escape \a Input for a double quoted scalar; if \p EscapePrintable
76 /// is true, all UTF8 sequences will be escaped, if \p EscapePrintable is
77 /// false, those UTF8 sequences encoding printable unicode scalars will not be
78 /// escaped, but emitted verbatim.
79 std::string escape(StringRef Input, bool EscapePrintable = true);
80 
81 /// Parse \p S as a bool according to https://yaml.org/type/bool.html.
82 std::optional<bool> parseBool(StringRef S);
83 
84 /// This class represents a YAML stream potentially containing multiple
85 ///        documents.
86 class Stream {
87 public:
88   /// This keeps a reference to the string referenced by \p Input.
89   Stream(StringRef Input, SourceMgr &, bool ShowColors = true,
90          std::error_code *EC = nullptr);
91 
92   Stream(MemoryBufferRef InputBuffer, SourceMgr &, bool ShowColors = true,
93          std::error_code *EC = nullptr);
94   ~Stream();
95 
96   document_iterator begin();
97   document_iterator end();
98   void skip();
99   bool failed();
100 
validate()101   bool validate() {
102     skip();
103     return !failed();
104   }
105 
106   void printError(Node *N, const Twine &Msg,
107                   SourceMgr::DiagKind Kind = SourceMgr::DK_Error);
108   void printError(const SMRange &Range, const Twine &Msg,
109                   SourceMgr::DiagKind Kind = SourceMgr::DK_Error);
110 
111 private:
112   friend class Document;
113 
114   std::unique_ptr<Scanner> scanner;
115   std::unique_ptr<Document> CurrentDoc;
116 };
117 
118 /// Abstract base class for all Nodes.
119 class Node {
120   virtual void anchor();
121 
122 public:
123   enum NodeKind {
124     NK_Null,
125     NK_Scalar,
126     NK_BlockScalar,
127     NK_KeyValue,
128     NK_Mapping,
129     NK_Sequence,
130     NK_Alias
131   };
132 
133   Node(unsigned int Type, std::unique_ptr<Document> &, StringRef Anchor,
134        StringRef Tag);
135 
136   // It's not safe to copy YAML nodes; the document is streamed and the position
137   // is part of the state.
138   Node(const Node &) = delete;
139   void operator=(const Node &) = delete;
140 
141   void *operator new(size_t Size, BumpPtrAllocator &Alloc,
142                      size_t Alignment = 16) noexcept {
143     return Alloc.Allocate(Size, Alignment);
144   }
145 
delete(void * Ptr,BumpPtrAllocator & Alloc,size_t Size)146   void operator delete(void *Ptr, BumpPtrAllocator &Alloc,
147                        size_t Size) noexcept {
148     Alloc.Deallocate(Ptr, Size, 0);
149   }
150 
151   void operator delete(void *) noexcept = delete;
152 
153   /// Get the value of the anchor attached to this node. If it does not
154   ///        have one, getAnchor().size() will be 0.
getAnchor()155   StringRef getAnchor() const { return Anchor; }
156 
157   /// Get the tag as it was written in the document. This does not
158   ///   perform tag resolution.
getRawTag()159   StringRef getRawTag() const { return Tag; }
160 
161   /// Get the verbatium tag for a given Node. This performs tag resoluton
162   ///   and substitution.
163   std::string getVerbatimTag() const;
164 
getSourceRange()165   SMRange getSourceRange() const { return SourceRange; }
setSourceRange(SMRange SR)166   void setSourceRange(SMRange SR) { SourceRange = SR; }
167 
168   // These functions forward to Document and Scanner.
169   Token &peekNext();
170   Token getNext();
171   Node *parseBlockNode();
172   BumpPtrAllocator &getAllocator();
173   void setError(const Twine &Message, Token &Location) const;
174   bool failed() const;
175 
skip()176   virtual void skip() {}
177 
getType()178   unsigned int getType() const { return TypeID; }
179 
180 protected:
181   std::unique_ptr<Document> &Doc;
182   SMRange SourceRange;
183 
184   ~Node() = default;
185 
186 private:
187   unsigned int TypeID;
188   StringRef Anchor;
189   /// The tag as typed in the document.
190   StringRef Tag;
191 };
192 
193 /// A null value.
194 ///
195 /// Example:
196 ///   !!null null
197 class NullNode final : public Node {
198   void anchor() override;
199 
200 public:
NullNode(std::unique_ptr<Document> & D)201   NullNode(std::unique_ptr<Document> &D)
202       : Node(NK_Null, D, StringRef(), StringRef()) {}
203 
classof(const Node * N)204   static bool classof(const Node *N) { return N->getType() == NK_Null; }
205 };
206 
207 /// A scalar node is an opaque datum that can be presented as a
208 ///        series of zero or more Unicode scalar values.
209 ///
210 /// Example:
211 ///   Adena
212 class ScalarNode final : public Node {
213   void anchor() override;
214 
215 public:
ScalarNode(std::unique_ptr<Document> & D,StringRef Anchor,StringRef Tag,StringRef Val)216   ScalarNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
217              StringRef Val)
218       : Node(NK_Scalar, D, Anchor, Tag), Value(Val) {
219     SMLoc Start = SMLoc::getFromPointer(Val.begin());
220     SMLoc End = SMLoc::getFromPointer(Val.end());
221     SourceRange = SMRange(Start, End);
222   }
223 
224   // Return Value without any escaping or folding or other fun YAML stuff. This
225   // is the exact bytes that are contained in the file (after conversion to
226   // utf8).
getRawValue()227   StringRef getRawValue() const { return Value; }
228 
229   /// Gets the value of this node as a StringRef.
230   ///
231   /// \param Storage is used to store the content of the returned StringRef if
232   ///        it requires any modification from how it appeared in the source.
233   ///        This happens with escaped characters and multi-line literals.
234   StringRef getValue(SmallVectorImpl<char> &Storage) const;
235 
classof(const Node * N)236   static bool classof(const Node *N) {
237     return N->getType() == NK_Scalar;
238   }
239 
240 private:
241   StringRef Value;
242 
243   StringRef getDoubleQuotedValue(StringRef UnquotedValue,
244                                  SmallVectorImpl<char> &Storage) const;
245 
246   static StringRef getSingleQuotedValue(StringRef RawValue,
247                                         SmallVectorImpl<char> &Storage);
248 
249   static StringRef getPlainValue(StringRef RawValue,
250                                  SmallVectorImpl<char> &Storage);
251 };
252 
253 /// A block scalar node is an opaque datum that can be presented as a
254 ///        series of zero or more Unicode scalar values.
255 ///
256 /// Example:
257 ///   |
258 ///     Hello
259 ///     World
260 class BlockScalarNode final : public Node {
261   void anchor() override;
262 
263 public:
BlockScalarNode(std::unique_ptr<Document> & D,StringRef Anchor,StringRef Tag,StringRef Value,StringRef RawVal)264   BlockScalarNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
265                   StringRef Value, StringRef RawVal)
266       : Node(NK_BlockScalar, D, Anchor, Tag), Value(Value) {
267     SMLoc Start = SMLoc::getFromPointer(RawVal.begin());
268     SMLoc End = SMLoc::getFromPointer(RawVal.end());
269     SourceRange = SMRange(Start, End);
270   }
271 
272   /// Gets the value of this node as a StringRef.
getValue()273   StringRef getValue() const { return Value; }
274 
classof(const Node * N)275   static bool classof(const Node *N) {
276     return N->getType() == NK_BlockScalar;
277   }
278 
279 private:
280   StringRef Value;
281 };
282 
283 /// A key and value pair. While not technically a Node under the YAML
284 ///        representation graph, it is easier to treat them this way.
285 ///
286 /// TODO: Consider making this not a child of Node.
287 ///
288 /// Example:
289 ///   Section: .text
290 class KeyValueNode final : public Node {
291   void anchor() override;
292 
293 public:
KeyValueNode(std::unique_ptr<Document> & D)294   KeyValueNode(std::unique_ptr<Document> &D)
295       : Node(NK_KeyValue, D, StringRef(), StringRef()) {}
296 
297   /// Parse and return the key.
298   ///
299   /// This may be called multiple times.
300   ///
301   /// \returns The key, or nullptr if failed() == true.
302   Node *getKey();
303 
304   /// Parse and return the value.
305   ///
306   /// This may be called multiple times.
307   ///
308   /// \returns The value, or nullptr if failed() == true.
309   Node *getValue();
310 
skip()311   void skip() override {
312     if (Node *Key = getKey()) {
313       Key->skip();
314       if (Node *Val = getValue())
315         Val->skip();
316     }
317   }
318 
classof(const Node * N)319   static bool classof(const Node *N) {
320     return N->getType() == NK_KeyValue;
321   }
322 
323 private:
324   Node *Key = nullptr;
325   Node *Value = nullptr;
326 };
327 
328 /// This is an iterator abstraction over YAML collections shared by both
329 ///        sequences and maps.
330 ///
331 /// BaseT must have a ValueT* member named CurrentEntry and a member function
332 /// increment() which must set CurrentEntry to 0 to create an end iterator.
333 template <class BaseT, class ValueT> class basic_collection_iterator {
334 public:
335   using iterator_category = std::input_iterator_tag;
336   using value_type = ValueT;
337   using difference_type = std::ptrdiff_t;
338   using pointer = value_type *;
339   using reference = value_type &;
340 
341   basic_collection_iterator() = default;
basic_collection_iterator(BaseT * B)342   basic_collection_iterator(BaseT *B) : Base(B) {}
343 
344   ValueT *operator->() const {
345     assert(Base && Base->CurrentEntry && "Attempted to access end iterator!");
346     return Base->CurrentEntry;
347   }
348 
349   ValueT &operator*() const {
350     assert(Base && Base->CurrentEntry &&
351            "Attempted to dereference end iterator!");
352     return *Base->CurrentEntry;
353   }
354 
355   operator ValueT *() const {
356     assert(Base && Base->CurrentEntry && "Attempted to access end iterator!");
357     return Base->CurrentEntry;
358   }
359 
360   /// Note on EqualityComparable:
361   ///
362   /// The iterator is not re-entrant,
363   /// it is meant to be used for parsing YAML on-demand
364   /// Once iteration started - it can point only to one entry at a time
365   /// hence Base.CurrentEntry and Other.Base.CurrentEntry are equal
366   /// iff Base and Other.Base are equal.
367   bool operator==(const basic_collection_iterator &Other) const {
368     if (Base && (Base == Other.Base)) {
369       assert((Base->CurrentEntry == Other.Base->CurrentEntry)
370              && "Equal Bases expected to point to equal Entries");
371     }
372 
373     return Base == Other.Base;
374   }
375 
376   bool operator!=(const basic_collection_iterator &Other) const {
377     return !(Base == Other.Base);
378   }
379 
380   basic_collection_iterator &operator++() {
381     assert(Base && "Attempted to advance iterator past end!");
382     Base->increment();
383     // Create an end iterator.
384     if (!Base->CurrentEntry)
385       Base = nullptr;
386     return *this;
387   }
388 
389 private:
390   BaseT *Base = nullptr;
391 };
392 
393 // The following two templates are used for both MappingNode and Sequence Node.
394 template <class CollectionType>
begin(CollectionType & C)395 typename CollectionType::iterator begin(CollectionType &C) {
396   assert(C.IsAtBeginning && "You may only iterate over a collection once!");
397   C.IsAtBeginning = false;
398   typename CollectionType::iterator ret(&C);
399   ++ret;
400   return ret;
401 }
402 
skip(CollectionType & C)403 template <class CollectionType> void skip(CollectionType &C) {
404   // TODO: support skipping from the middle of a parsed collection ;/
405   assert((C.IsAtBeginning || C.IsAtEnd) && "Cannot skip mid parse!");
406   if (C.IsAtBeginning)
407     for (typename CollectionType::iterator i = begin(C), e = C.end(); i != e;
408          ++i)
409       i->skip();
410 }
411 
412 /// Represents a YAML map created from either a block map for a flow map.
413 ///
414 /// This parses the YAML stream as increment() is called.
415 ///
416 /// Example:
417 ///   Name: _main
418 ///   Scope: Global
419 class MappingNode final : public Node {
420   void anchor() override;
421 
422 public:
423   enum MappingType {
424     MT_Block,
425     MT_Flow,
426     MT_Inline ///< An inline mapping node is used for "[key: value]".
427   };
428 
MappingNode(std::unique_ptr<Document> & D,StringRef Anchor,StringRef Tag,MappingType MT)429   MappingNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
430               MappingType MT)
431       : Node(NK_Mapping, D, Anchor, Tag), Type(MT) {}
432 
433   friend class basic_collection_iterator<MappingNode, KeyValueNode>;
434 
435   using iterator = basic_collection_iterator<MappingNode, KeyValueNode>;
436 
437   template <class T> friend typename T::iterator yaml::begin(T &);
438   template <class T> friend void yaml::skip(T &);
439 
begin()440   iterator begin() { return yaml::begin(*this); }
441 
end()442   iterator end() { return iterator(); }
443 
skip()444   void skip() override { yaml::skip(*this); }
445 
classof(const Node * N)446   static bool classof(const Node *N) {
447     return N->getType() == NK_Mapping;
448   }
449 
450 private:
451   MappingType Type;
452   bool IsAtBeginning = true;
453   bool IsAtEnd = false;
454   KeyValueNode *CurrentEntry = nullptr;
455 
456   void increment();
457 };
458 
459 /// Represents a YAML sequence created from either a block sequence for a
460 ///        flow sequence.
461 ///
462 /// This parses the YAML stream as increment() is called.
463 ///
464 /// Example:
465 ///   - Hello
466 ///   - World
467 class SequenceNode final : public Node {
468   void anchor() override;
469 
470 public:
471   enum SequenceType {
472     ST_Block,
473     ST_Flow,
474     // Use for:
475     //
476     // key:
477     // - val1
478     // - val2
479     //
480     // As a BlockMappingEntry and BlockEnd are not created in this case.
481     ST_Indentless
482   };
483 
SequenceNode(std::unique_ptr<Document> & D,StringRef Anchor,StringRef Tag,SequenceType ST)484   SequenceNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
485                SequenceType ST)
486       : Node(NK_Sequence, D, Anchor, Tag), SeqType(ST) {}
487 
488   friend class basic_collection_iterator<SequenceNode, Node>;
489 
490   using iterator = basic_collection_iterator<SequenceNode, Node>;
491 
492   template <class T> friend typename T::iterator yaml::begin(T &);
493   template <class T> friend void yaml::skip(T &);
494 
495   void increment();
496 
begin()497   iterator begin() { return yaml::begin(*this); }
498 
end()499   iterator end() { return iterator(); }
500 
skip()501   void skip() override { yaml::skip(*this); }
502 
classof(const Node * N)503   static bool classof(const Node *N) {
504     return N->getType() == NK_Sequence;
505   }
506 
507 private:
508   SequenceType SeqType;
509   bool IsAtBeginning = true;
510   bool IsAtEnd = false;
511   bool WasPreviousTokenFlowEntry = true; // Start with an imaginary ','.
512   Node *CurrentEntry = nullptr;
513 };
514 
515 /// Represents an alias to a Node with an anchor.
516 ///
517 /// Example:
518 ///   *AnchorName
519 class AliasNode final : public Node {
520   void anchor() override;
521 
522 public:
AliasNode(std::unique_ptr<Document> & D,StringRef Val)523   AliasNode(std::unique_ptr<Document> &D, StringRef Val)
524       : Node(NK_Alias, D, StringRef(), StringRef()), Name(Val) {}
525 
getName()526   StringRef getName() const { return Name; }
527 
classof(const Node * N)528   static bool classof(const Node *N) { return N->getType() == NK_Alias; }
529 
530 private:
531   StringRef Name;
532 };
533 
534 /// A YAML Stream is a sequence of Documents. A document contains a root
535 ///        node.
536 class Document {
537 public:
538   Document(Stream &ParentStream);
539 
540   /// Root for parsing a node. Returns a single node.
541   Node *parseBlockNode();
542 
543   /// Finish parsing the current document and return true if there are
544   ///        more. Return false otherwise.
545   bool skip();
546 
547   /// Parse and return the root level node.
getRoot()548   Node *getRoot() {
549     if (Root)
550       return Root;
551     return Root = parseBlockNode();
552   }
553 
getTagMap()554   const std::map<StringRef, StringRef> &getTagMap() const { return TagMap; }
555 
556 private:
557   friend class Node;
558   friend class document_iterator;
559 
560   /// Stream to read tokens from.
561   Stream &stream;
562 
563   /// Used to allocate nodes to. All are destroyed without calling their
564   ///        destructor when the document is destroyed.
565   BumpPtrAllocator NodeAllocator;
566 
567   /// The root node. Used to support skipping a partially parsed
568   ///        document.
569   Node *Root;
570 
571   /// Maps tag prefixes to their expansion.
572   std::map<StringRef, StringRef> TagMap;
573 
574   Token &peekNext();
575   Token getNext();
576   void setError(const Twine &Message, Token &Location) const;
577   bool failed() const;
578 
579   /// Parse %BLAH directives and return true if any were encountered.
580   bool parseDirectives();
581 
582   /// Parse %YAML
583   void parseYAMLDirective();
584 
585   /// Parse %TAG
586   void parseTAGDirective();
587 
588   /// Consume the next token and error if it is not \a TK.
589   bool expectToken(int TK);
590 };
591 
592 /// Iterator abstraction for Documents over a Stream.
593 class document_iterator {
594 public:
595   document_iterator() = default;
document_iterator(std::unique_ptr<Document> & D)596   document_iterator(std::unique_ptr<Document> &D) : Doc(&D) {}
597 
598   bool operator==(const document_iterator &Other) const {
599     if (isAtEnd() || Other.isAtEnd())
600       return isAtEnd() && Other.isAtEnd();
601 
602     return Doc == Other.Doc;
603   }
604   bool operator!=(const document_iterator &Other) const {
605     return !(*this == Other);
606   }
607 
608   document_iterator operator++() {
609     assert(Doc && "incrementing iterator past the end.");
610     if (!(*Doc)->skip()) {
611       Doc->reset(nullptr);
612     } else {
613       Stream &S = (*Doc)->stream;
614       Doc->reset(new Document(S));
615     }
616     return *this;
617   }
618 
619   Document &operator*() { return **Doc; }
620 
621   std::unique_ptr<Document> &operator->() { return *Doc; }
622 
623 private:
isAtEnd()624   bool isAtEnd() const { return !Doc || !*Doc; }
625 
626   std::unique_ptr<Document> *Doc = nullptr;
627 };
628 
629 } // end namespace yaml
630 
631 } // end namespace llvm
632 
633 #endif // LLVM_SUPPORT_YAMLPARSER_H
634