1 //===- YAMLParser.h - Simple YAML parser ------------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 //  This is a YAML 1.2 parser.
10 //
11 //  See http://www.yaml.org/spec/1.2/spec.html for the full standard.
12 //
13 //  This currently does not implement the following:
14 //    * Multi-line literal folding.
15 //    * Tag resolution.
16 //    * UTF-16.
17 //    * BOMs anywhere other than the first Unicode scalar value in the file.
18 //
19 //  The most important class here is Stream. This represents a YAML stream with
20 //  0, 1, or many documents.
21 //
22 //  SourceMgr sm;
23 //  StringRef input = getInput();
24 //  yaml::Stream stream(input, sm);
25 //
26 //  for (yaml::document_iterator di = stream.begin(), de = stream.end();
27 //       di != de; ++di) {
28 //    yaml::Node *n = di->getRoot();
29 //    if (n) {
30 //      // Do something with n...
31 //    } else
32 //      break;
33 //  }
34 //
35 //===----------------------------------------------------------------------===//
36 
37 #ifndef LLVM_SUPPORT_YAMLPARSER_H
38 #define LLVM_SUPPORT_YAMLPARSER_H
39 
40 #include "llvm/ADT/StringRef.h"
41 #include "llvm/Support/Allocator.h"
42 #include "llvm/Support/SMLoc.h"
43 #include "llvm/Support/SourceMgr.h"
44 #include <cassert>
45 #include <cstddef>
46 #include <iterator>
47 #include <map>
48 #include <memory>
49 #include <string>
50 #include <system_error>
51 
52 namespace llvm {
53 
54 class MemoryBufferRef;
55 class raw_ostream;
56 class Twine;
57 
58 namespace yaml {
59 
60 class Document;
61 class document_iterator;
62 class Node;
63 class Scanner;
64 struct Token;
65 
66 /// Dump all the tokens in this stream to OS.
67 /// \returns true if there was an error, false otherwise.
68 bool dumpTokens(StringRef Input, raw_ostream &);
69 
70 /// Scans all tokens in input without outputting anything. This is used
71 ///        for benchmarking the tokenizer.
72 /// \returns true if there was an error, false otherwise.
73 bool scanTokens(StringRef Input);
74 
75 /// Escape \a Input for a double quoted scalar; if \p EscapePrintable
76 /// is true, all UTF8 sequences will be escaped, if \p EscapePrintable is
77 /// false, those UTF8 sequences encoding printable unicode scalars will not be
78 /// escaped, but emitted verbatim.
79 std::string escape(StringRef Input, bool EscapePrintable = true);
80 
81 /// Parse \p S as a bool according to https://yaml.org/type/bool.html.
82 llvm::Optional<bool> parseBool(StringRef S);
83 
84 /// This class represents a YAML stream potentially containing multiple
85 ///        documents.
86 class Stream {
87 public:
88   /// This keeps a reference to the string referenced by \p Input.
89   Stream(StringRef Input, SourceMgr &, bool ShowColors = true,
90          std::error_code *EC = nullptr);
91 
92   Stream(MemoryBufferRef InputBuffer, SourceMgr &, bool ShowColors = true,
93          std::error_code *EC = nullptr);
94   ~Stream();
95 
96   document_iterator begin();
97   document_iterator end();
98   void skip();
99   bool failed();
100 
101   bool validate() {
102     skip();
103     return !failed();
104   }
105 
106   void printError(Node *N, const Twine &Msg,
107                   SourceMgr::DiagKind Kind = SourceMgr::DK_Error);
108   void printError(const SMRange &Range, const Twine &Msg,
109                   SourceMgr::DiagKind Kind = SourceMgr::DK_Error);
110 
111 private:
112   friend class Document;
113 
114   std::unique_ptr<Scanner> scanner;
115   std::unique_ptr<Document> CurrentDoc;
116 };
117 
118 /// Abstract base class for all Nodes.
119 class Node {
120   virtual void anchor();
121 
122 public:
123   enum NodeKind {
124     NK_Null,
125     NK_Scalar,
126     NK_BlockScalar,
127     NK_KeyValue,
128     NK_Mapping,
129     NK_Sequence,
130     NK_Alias
131   };
132 
133   Node(unsigned int Type, std::unique_ptr<Document> &, StringRef Anchor,
134        StringRef Tag);
135 
136   // It's not safe to copy YAML nodes; the document is streamed and the position
137   // is part of the state.
138   Node(const Node &) = delete;
139   void operator=(const Node &) = delete;
140 
141   void *operator new(size_t Size, BumpPtrAllocator &Alloc,
142                      size_t Alignment = 16) noexcept {
143     return Alloc.Allocate(Size, Alignment);
144   }
145 
146   void operator delete(void *Ptr, BumpPtrAllocator &Alloc,
147                        size_t Size) noexcept {
148     Alloc.Deallocate(Ptr, Size, 0);
149   }
150 
151   void operator delete(void *) noexcept = delete;
152 
153   /// Get the value of the anchor attached to this node. If it does not
154   ///        have one, getAnchor().size() will be 0.
155   StringRef getAnchor() const { return Anchor; }
156 
157   /// Get the tag as it was written in the document. This does not
158   ///   perform tag resolution.
159   StringRef getRawTag() const { return Tag; }
160 
161   /// Get the verbatium tag for a given Node. This performs tag resoluton
162   ///   and substitution.
163   std::string getVerbatimTag() const;
164 
165   SMRange getSourceRange() const { return SourceRange; }
166   void setSourceRange(SMRange SR) { SourceRange = SR; }
167 
168   // These functions forward to Document and Scanner.
169   Token &peekNext();
170   Token getNext();
171   Node *parseBlockNode();
172   BumpPtrAllocator &getAllocator();
173   void setError(const Twine &Message, Token &Location) const;
174   bool failed() const;
175 
176   virtual void skip() {}
177 
178   unsigned int getType() const { return TypeID; }
179 
180 protected:
181   std::unique_ptr<Document> &Doc;
182   SMRange SourceRange;
183 
184   ~Node() = default;
185 
186 private:
187   unsigned int TypeID;
188   StringRef Anchor;
189   /// The tag as typed in the document.
190   StringRef Tag;
191 };
192 
193 /// A null value.
194 ///
195 /// Example:
196 ///   !!null null
197 class NullNode final : public Node {
198   void anchor() override;
199 
200 public:
201   NullNode(std::unique_ptr<Document> &D)
202       : Node(NK_Null, D, StringRef(), StringRef()) {}
203 
204   static bool classof(const Node *N) { return N->getType() == NK_Null; }
205 };
206 
207 /// A scalar node is an opaque datum that can be presented as a
208 ///        series of zero or more Unicode scalar values.
209 ///
210 /// Example:
211 ///   Adena
212 class ScalarNode final : public Node {
213   void anchor() override;
214 
215 public:
216   ScalarNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
217              StringRef Val)
218       : Node(NK_Scalar, D, Anchor, Tag), Value(Val) {
219     SMLoc Start = SMLoc::getFromPointer(Val.begin());
220     SMLoc End = SMLoc::getFromPointer(Val.end());
221     SourceRange = SMRange(Start, End);
222   }
223 
224   // Return Value without any escaping or folding or other fun YAML stuff. This
225   // is the exact bytes that are contained in the file (after conversion to
226   // utf8).
227   StringRef getRawValue() const { return Value; }
228 
229   /// Gets the value of this node as a StringRef.
230   ///
231   /// \param Storage is used to store the content of the returned StringRef if
232   ///        it requires any modification from how it appeared in the source.
233   ///        This happens with escaped characters and multi-line literals.
234   StringRef getValue(SmallVectorImpl<char> &Storage) const;
235 
236   static bool classof(const Node *N) {
237     return N->getType() == NK_Scalar;
238   }
239 
240 private:
241   StringRef Value;
242 
243   StringRef unescapeDoubleQuoted(StringRef UnquotedValue,
244                                  StringRef::size_type Start,
245                                  SmallVectorImpl<char> &Storage) const;
246 };
247 
248 /// A block scalar node is an opaque datum that can be presented as a
249 ///        series of zero or more Unicode scalar values.
250 ///
251 /// Example:
252 ///   |
253 ///     Hello
254 ///     World
255 class BlockScalarNode final : public Node {
256   void anchor() override;
257 
258 public:
259   BlockScalarNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
260                   StringRef Value, StringRef RawVal)
261       : Node(NK_BlockScalar, D, Anchor, Tag), Value(Value) {
262     SMLoc Start = SMLoc::getFromPointer(RawVal.begin());
263     SMLoc End = SMLoc::getFromPointer(RawVal.end());
264     SourceRange = SMRange(Start, End);
265   }
266 
267   /// Gets the value of this node as a StringRef.
268   StringRef getValue() const { return Value; }
269 
270   static bool classof(const Node *N) {
271     return N->getType() == NK_BlockScalar;
272   }
273 
274 private:
275   StringRef Value;
276 };
277 
278 /// A key and value pair. While not technically a Node under the YAML
279 ///        representation graph, it is easier to treat them this way.
280 ///
281 /// TODO: Consider making this not a child of Node.
282 ///
283 /// Example:
284 ///   Section: .text
285 class KeyValueNode final : public Node {
286   void anchor() override;
287 
288 public:
289   KeyValueNode(std::unique_ptr<Document> &D)
290       : Node(NK_KeyValue, D, StringRef(), StringRef()) {}
291 
292   /// Parse and return the key.
293   ///
294   /// This may be called multiple times.
295   ///
296   /// \returns The key, or nullptr if failed() == true.
297   Node *getKey();
298 
299   /// Parse and return the value.
300   ///
301   /// This may be called multiple times.
302   ///
303   /// \returns The value, or nullptr if failed() == true.
304   Node *getValue();
305 
306   void skip() override {
307     if (Node *Key = getKey()) {
308       Key->skip();
309       if (Node *Val = getValue())
310         Val->skip();
311     }
312   }
313 
314   static bool classof(const Node *N) {
315     return N->getType() == NK_KeyValue;
316   }
317 
318 private:
319   Node *Key = nullptr;
320   Node *Value = nullptr;
321 };
322 
323 /// This is an iterator abstraction over YAML collections shared by both
324 ///        sequences and maps.
325 ///
326 /// BaseT must have a ValueT* member named CurrentEntry and a member function
327 /// increment() which must set CurrentEntry to 0 to create an end iterator.
328 template <class BaseT, class ValueT> class basic_collection_iterator {
329 public:
330   using iterator_category = std::input_iterator_tag;
331   using value_type = ValueT;
332   using difference_type = std::ptrdiff_t;
333   using pointer = value_type *;
334   using reference = value_type &;
335 
336   basic_collection_iterator() = default;
337   basic_collection_iterator(BaseT *B) : Base(B) {}
338 
339   ValueT *operator->() const {
340     assert(Base && Base->CurrentEntry && "Attempted to access end iterator!");
341     return Base->CurrentEntry;
342   }
343 
344   ValueT &operator*() const {
345     assert(Base && Base->CurrentEntry &&
346            "Attempted to dereference end iterator!");
347     return *Base->CurrentEntry;
348   }
349 
350   operator ValueT *() const {
351     assert(Base && Base->CurrentEntry && "Attempted to access end iterator!");
352     return Base->CurrentEntry;
353   }
354 
355   /// Note on EqualityComparable:
356   ///
357   /// The iterator is not re-entrant,
358   /// it is meant to be used for parsing YAML on-demand
359   /// Once iteration started - it can point only to one entry at a time
360   /// hence Base.CurrentEntry and Other.Base.CurrentEntry are equal
361   /// iff Base and Other.Base are equal.
362   bool operator==(const basic_collection_iterator &Other) const {
363     if (Base && (Base == Other.Base)) {
364       assert((Base->CurrentEntry == Other.Base->CurrentEntry)
365              && "Equal Bases expected to point to equal Entries");
366     }
367 
368     return Base == Other.Base;
369   }
370 
371   bool operator!=(const basic_collection_iterator &Other) const {
372     return !(Base == Other.Base);
373   }
374 
375   basic_collection_iterator &operator++() {
376     assert(Base && "Attempted to advance iterator past end!");
377     Base->increment();
378     // Create an end iterator.
379     if (!Base->CurrentEntry)
380       Base = nullptr;
381     return *this;
382   }
383 
384 private:
385   BaseT *Base = nullptr;
386 };
387 
388 // The following two templates are used for both MappingNode and Sequence Node.
389 template <class CollectionType>
390 typename CollectionType::iterator begin(CollectionType &C) {
391   assert(C.IsAtBeginning && "You may only iterate over a collection once!");
392   C.IsAtBeginning = false;
393   typename CollectionType::iterator ret(&C);
394   ++ret;
395   return ret;
396 }
397 
398 template <class CollectionType> void skip(CollectionType &C) {
399   // TODO: support skipping from the middle of a parsed collection ;/
400   assert((C.IsAtBeginning || C.IsAtEnd) && "Cannot skip mid parse!");
401   if (C.IsAtBeginning)
402     for (typename CollectionType::iterator i = begin(C), e = C.end(); i != e;
403          ++i)
404       i->skip();
405 }
406 
407 /// Represents a YAML map created from either a block map for a flow map.
408 ///
409 /// This parses the YAML stream as increment() is called.
410 ///
411 /// Example:
412 ///   Name: _main
413 ///   Scope: Global
414 class MappingNode final : public Node {
415   void anchor() override;
416 
417 public:
418   enum MappingType {
419     MT_Block,
420     MT_Flow,
421     MT_Inline ///< An inline mapping node is used for "[key: value]".
422   };
423 
424   MappingNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
425               MappingType MT)
426       : Node(NK_Mapping, D, Anchor, Tag), Type(MT) {}
427 
428   friend class basic_collection_iterator<MappingNode, KeyValueNode>;
429 
430   using iterator = basic_collection_iterator<MappingNode, KeyValueNode>;
431 
432   template <class T> friend typename T::iterator yaml::begin(T &);
433   template <class T> friend void yaml::skip(T &);
434 
435   iterator begin() { return yaml::begin(*this); }
436 
437   iterator end() { return iterator(); }
438 
439   void skip() override { yaml::skip(*this); }
440 
441   static bool classof(const Node *N) {
442     return N->getType() == NK_Mapping;
443   }
444 
445 private:
446   MappingType Type;
447   bool IsAtBeginning = true;
448   bool IsAtEnd = false;
449   KeyValueNode *CurrentEntry = nullptr;
450 
451   void increment();
452 };
453 
454 /// Represents a YAML sequence created from either a block sequence for a
455 ///        flow sequence.
456 ///
457 /// This parses the YAML stream as increment() is called.
458 ///
459 /// Example:
460 ///   - Hello
461 ///   - World
462 class SequenceNode final : public Node {
463   void anchor() override;
464 
465 public:
466   enum SequenceType {
467     ST_Block,
468     ST_Flow,
469     // Use for:
470     //
471     // key:
472     // - val1
473     // - val2
474     //
475     // As a BlockMappingEntry and BlockEnd are not created in this case.
476     ST_Indentless
477   };
478 
479   SequenceNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
480                SequenceType ST)
481       : Node(NK_Sequence, D, Anchor, Tag), SeqType(ST) {}
482 
483   friend class basic_collection_iterator<SequenceNode, Node>;
484 
485   using iterator = basic_collection_iterator<SequenceNode, Node>;
486 
487   template <class T> friend typename T::iterator yaml::begin(T &);
488   template <class T> friend void yaml::skip(T &);
489 
490   void increment();
491 
492   iterator begin() { return yaml::begin(*this); }
493 
494   iterator end() { return iterator(); }
495 
496   void skip() override { yaml::skip(*this); }
497 
498   static bool classof(const Node *N) {
499     return N->getType() == NK_Sequence;
500   }
501 
502 private:
503   SequenceType SeqType;
504   bool IsAtBeginning = true;
505   bool IsAtEnd = false;
506   bool WasPreviousTokenFlowEntry = true; // Start with an imaginary ','.
507   Node *CurrentEntry = nullptr;
508 };
509 
510 /// Represents an alias to a Node with an anchor.
511 ///
512 /// Example:
513 ///   *AnchorName
514 class AliasNode final : public Node {
515   void anchor() override;
516 
517 public:
518   AliasNode(std::unique_ptr<Document> &D, StringRef Val)
519       : Node(NK_Alias, D, StringRef(), StringRef()), Name(Val) {}
520 
521   StringRef getName() const { return Name; }
522 
523   static bool classof(const Node *N) { return N->getType() == NK_Alias; }
524 
525 private:
526   StringRef Name;
527 };
528 
529 /// A YAML Stream is a sequence of Documents. A document contains a root
530 ///        node.
531 class Document {
532 public:
533   Document(Stream &ParentStream);
534 
535   /// Root for parsing a node. Returns a single node.
536   Node *parseBlockNode();
537 
538   /// Finish parsing the current document and return true if there are
539   ///        more. Return false otherwise.
540   bool skip();
541 
542   /// Parse and return the root level node.
543   Node *getRoot() {
544     if (Root)
545       return Root;
546     return Root = parseBlockNode();
547   }
548 
549   const std::map<StringRef, StringRef> &getTagMap() const { return TagMap; }
550 
551 private:
552   friend class Node;
553   friend class document_iterator;
554 
555   /// Stream to read tokens from.
556   Stream &stream;
557 
558   /// Used to allocate nodes to. All are destroyed without calling their
559   ///        destructor when the document is destroyed.
560   BumpPtrAllocator NodeAllocator;
561 
562   /// The root node. Used to support skipping a partially parsed
563   ///        document.
564   Node *Root;
565 
566   /// Maps tag prefixes to their expansion.
567   std::map<StringRef, StringRef> TagMap;
568 
569   Token &peekNext();
570   Token getNext();
571   void setError(const Twine &Message, Token &Location) const;
572   bool failed() const;
573 
574   /// Parse %BLAH directives and return true if any were encountered.
575   bool parseDirectives();
576 
577   /// Parse %YAML
578   void parseYAMLDirective();
579 
580   /// Parse %TAG
581   void parseTAGDirective();
582 
583   /// Consume the next token and error if it is not \a TK.
584   bool expectToken(int TK);
585 };
586 
587 /// Iterator abstraction for Documents over a Stream.
588 class document_iterator {
589 public:
590   document_iterator() = default;
591   document_iterator(std::unique_ptr<Document> &D) : Doc(&D) {}
592 
593   bool operator==(const document_iterator &Other) const {
594     if (isAtEnd() || Other.isAtEnd())
595       return isAtEnd() && Other.isAtEnd();
596 
597     return Doc == Other.Doc;
598   }
599   bool operator!=(const document_iterator &Other) const {
600     return !(*this == Other);
601   }
602 
603   document_iterator operator++() {
604     assert(Doc && "incrementing iterator past the end.");
605     if (!(*Doc)->skip()) {
606       Doc->reset(nullptr);
607     } else {
608       Stream &S = (*Doc)->stream;
609       Doc->reset(new Document(S));
610     }
611     return *this;
612   }
613 
614   Document &operator*() { return *Doc->get(); }
615 
616   std::unique_ptr<Document> &operator->() { return *Doc; }
617 
618 private:
619   bool isAtEnd() const { return !Doc || !*Doc; }
620 
621   std::unique_ptr<Document> *Doc = nullptr;
622 };
623 
624 } // end namespace yaml
625 
626 } // end namespace llvm
627 
628 #endif // LLVM_SUPPORT_YAMLPARSER_H
629