1 //===- YAMLParser.h - Simple YAML parser ------------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 //  This is a YAML 1.2 parser.
10 //
11 //  See http://www.yaml.org/spec/1.2/spec.html for the full standard.
12 //
13 //  This currently does not implement the following:
14 //    * Multi-line literal folding.
15 //    * Tag resolution.
16 //    * UTF-16.
17 //    * BOMs anywhere other than the first Unicode scalar value in the file.
18 //
19 //  The most important class here is Stream. This represents a YAML stream with
20 //  0, 1, or many documents.
21 //
22 //  SourceMgr sm;
23 //  StringRef input = getInput();
24 //  yaml::Stream stream(input, sm);
25 //
26 //  for (yaml::document_iterator di = stream.begin(), de = stream.end();
27 //       di != de; ++di) {
28 //    yaml::Node *n = di->getRoot();
29 //    if (n) {
30 //      // Do something with n...
31 //    } else
32 //      break;
33 //  }
34 //
35 //===----------------------------------------------------------------------===//
36 
37 #ifndef LLVM_SUPPORT_YAMLPARSER_H
38 #define LLVM_SUPPORT_YAMLPARSER_H
39 
40 #include "llvm/ADT/StringRef.h"
41 #include "llvm/Support/Allocator.h"
42 #include "llvm/Support/SMLoc.h"
43 #include <cassert>
44 #include <cstddef>
45 #include <iterator>
46 #include <map>
47 #include <memory>
48 #include <string>
49 #include <system_error>
50 
51 namespace llvm {
52 
53 class MemoryBufferRef;
54 class SourceMgr;
55 class raw_ostream;
56 class Twine;
57 
58 namespace yaml {
59 
60 class Document;
61 class document_iterator;
62 class Node;
63 class Scanner;
64 struct Token;
65 
66 /// Dump all the tokens in this stream to OS.
67 /// \returns true if there was an error, false otherwise.
68 bool dumpTokens(StringRef Input, raw_ostream &);
69 
70 /// Scans all tokens in input without outputting anything. This is used
71 ///        for benchmarking the tokenizer.
72 /// \returns true if there was an error, false otherwise.
73 bool scanTokens(StringRef Input);
74 
75 /// Escape \a Input for a double quoted scalar; if \p EscapePrintable
76 /// is true, all UTF8 sequences will be escaped, if \p EscapePrintable is
77 /// false, those UTF8 sequences encoding printable unicode scalars will not be
78 /// escaped, but emitted verbatim.
79 std::string escape(StringRef Input, bool EscapePrintable = true);
80 
81 /// This class represents a YAML stream potentially containing multiple
82 ///        documents.
83 class Stream {
84 public:
85   /// This keeps a reference to the string referenced by \p Input.
86   Stream(StringRef Input, SourceMgr &, bool ShowColors = true,
87          std::error_code *EC = nullptr);
88 
89   Stream(MemoryBufferRef InputBuffer, SourceMgr &, bool ShowColors = true,
90          std::error_code *EC = nullptr);
91   ~Stream();
92 
93   document_iterator begin();
94   document_iterator end();
95   void skip();
96   bool failed();
97 
98   bool validate() {
99     skip();
100     return !failed();
101   }
102 
103   void printError(Node *N, const Twine &Msg);
104 
105 private:
106   friend class Document;
107 
108   std::unique_ptr<Scanner> scanner;
109   std::unique_ptr<Document> CurrentDoc;
110 };
111 
112 /// Abstract base class for all Nodes.
113 class Node {
114   virtual void anchor();
115 
116 public:
117   enum NodeKind {
118     NK_Null,
119     NK_Scalar,
120     NK_BlockScalar,
121     NK_KeyValue,
122     NK_Mapping,
123     NK_Sequence,
124     NK_Alias
125   };
126 
127   Node(unsigned int Type, std::unique_ptr<Document> &, StringRef Anchor,
128        StringRef Tag);
129 
130   // It's not safe to copy YAML nodes; the document is streamed and the position
131   // is part of the state.
132   Node(const Node &) = delete;
133   void operator=(const Node &) = delete;
134 
135   void *operator new(size_t Size, BumpPtrAllocator &Alloc,
136                      size_t Alignment = 16) noexcept {
137     return Alloc.Allocate(Size, Alignment);
138   }
139 
140   void operator delete(void *Ptr, BumpPtrAllocator &Alloc,
141                        size_t Size) noexcept {
142     Alloc.Deallocate(Ptr, Size);
143   }
144 
145   void operator delete(void *) noexcept = delete;
146 
147   /// Get the value of the anchor attached to this node. If it does not
148   ///        have one, getAnchor().size() will be 0.
149   StringRef getAnchor() const { return Anchor; }
150 
151   /// Get the tag as it was written in the document. This does not
152   ///   perform tag resolution.
153   StringRef getRawTag() const { return Tag; }
154 
155   /// Get the verbatium tag for a given Node. This performs tag resoluton
156   ///   and substitution.
157   std::string getVerbatimTag() const;
158 
159   SMRange getSourceRange() const { return SourceRange; }
160   void setSourceRange(SMRange SR) { SourceRange = SR; }
161 
162   // These functions forward to Document and Scanner.
163   Token &peekNext();
164   Token getNext();
165   Node *parseBlockNode();
166   BumpPtrAllocator &getAllocator();
167   void setError(const Twine &Message, Token &Location) const;
168   bool failed() const;
169 
170   virtual void skip() {}
171 
172   unsigned int getType() const { return TypeID; }
173 
174 protected:
175   std::unique_ptr<Document> &Doc;
176   SMRange SourceRange;
177 
178   ~Node() = default;
179 
180 private:
181   unsigned int TypeID;
182   StringRef Anchor;
183   /// The tag as typed in the document.
184   StringRef Tag;
185 };
186 
187 /// A null value.
188 ///
189 /// Example:
190 ///   !!null null
191 class NullNode final : public Node {
192   void anchor() override;
193 
194 public:
195   NullNode(std::unique_ptr<Document> &D)
196       : Node(NK_Null, D, StringRef(), StringRef()) {}
197 
198   static bool classof(const Node *N) { return N->getType() == NK_Null; }
199 };
200 
201 /// A scalar node is an opaque datum that can be presented as a
202 ///        series of zero or more Unicode scalar values.
203 ///
204 /// Example:
205 ///   Adena
206 class ScalarNode final : public Node {
207   void anchor() override;
208 
209 public:
210   ScalarNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
211              StringRef Val)
212       : Node(NK_Scalar, D, Anchor, Tag), Value(Val) {
213     SMLoc Start = SMLoc::getFromPointer(Val.begin());
214     SMLoc End = SMLoc::getFromPointer(Val.end());
215     SourceRange = SMRange(Start, End);
216   }
217 
218   // Return Value without any escaping or folding or other fun YAML stuff. This
219   // is the exact bytes that are contained in the file (after conversion to
220   // utf8).
221   StringRef getRawValue() const { return Value; }
222 
223   /// Gets the value of this node as a StringRef.
224   ///
225   /// \param Storage is used to store the content of the returned StringRef iff
226   ///        it requires any modification from how it appeared in the source.
227   ///        This happens with escaped characters and multi-line literals.
228   StringRef getValue(SmallVectorImpl<char> &Storage) const;
229 
230   static bool classof(const Node *N) {
231     return N->getType() == NK_Scalar;
232   }
233 
234 private:
235   StringRef Value;
236 
237   StringRef unescapeDoubleQuoted(StringRef UnquotedValue,
238                                  StringRef::size_type Start,
239                                  SmallVectorImpl<char> &Storage) const;
240 };
241 
242 /// A block scalar node is an opaque datum that can be presented as a
243 ///        series of zero or more Unicode scalar values.
244 ///
245 /// Example:
246 ///   |
247 ///     Hello
248 ///     World
249 class BlockScalarNode final : public Node {
250   void anchor() override;
251 
252 public:
253   BlockScalarNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
254                   StringRef Value, StringRef RawVal)
255       : Node(NK_BlockScalar, D, Anchor, Tag), Value(Value) {
256     SMLoc Start = SMLoc::getFromPointer(RawVal.begin());
257     SMLoc End = SMLoc::getFromPointer(RawVal.end());
258     SourceRange = SMRange(Start, End);
259   }
260 
261   /// Gets the value of this node as a StringRef.
262   StringRef getValue() const { return Value; }
263 
264   static bool classof(const Node *N) {
265     return N->getType() == NK_BlockScalar;
266   }
267 
268 private:
269   StringRef Value;
270 };
271 
272 /// A key and value pair. While not technically a Node under the YAML
273 ///        representation graph, it is easier to treat them this way.
274 ///
275 /// TODO: Consider making this not a child of Node.
276 ///
277 /// Example:
278 ///   Section: .text
279 class KeyValueNode final : public Node {
280   void anchor() override;
281 
282 public:
283   KeyValueNode(std::unique_ptr<Document> &D)
284       : Node(NK_KeyValue, D, StringRef(), StringRef()) {}
285 
286   /// Parse and return the key.
287   ///
288   /// This may be called multiple times.
289   ///
290   /// \returns The key, or nullptr if failed() == true.
291   Node *getKey();
292 
293   /// Parse and return the value.
294   ///
295   /// This may be called multiple times.
296   ///
297   /// \returns The value, or nullptr if failed() == true.
298   Node *getValue();
299 
300   void skip() override {
301     if (Node *Key = getKey()) {
302       Key->skip();
303       if (Node *Val = getValue())
304         Val->skip();
305     }
306   }
307 
308   static bool classof(const Node *N) {
309     return N->getType() == NK_KeyValue;
310   }
311 
312 private:
313   Node *Key = nullptr;
314   Node *Value = nullptr;
315 };
316 
317 /// This is an iterator abstraction over YAML collections shared by both
318 ///        sequences and maps.
319 ///
320 /// BaseT must have a ValueT* member named CurrentEntry and a member function
321 /// increment() which must set CurrentEntry to 0 to create an end iterator.
322 template <class BaseT, class ValueT>
323 class basic_collection_iterator
324     : public std::iterator<std::input_iterator_tag, ValueT> {
325 public:
326   basic_collection_iterator() = default;
327   basic_collection_iterator(BaseT *B) : Base(B) {}
328 
329   ValueT *operator->() const {
330     assert(Base && Base->CurrentEntry && "Attempted to access end iterator!");
331     return Base->CurrentEntry;
332   }
333 
334   ValueT &operator*() const {
335     assert(Base && Base->CurrentEntry &&
336            "Attempted to dereference end iterator!");
337     return *Base->CurrentEntry;
338   }
339 
340   operator ValueT *() const {
341     assert(Base && Base->CurrentEntry && "Attempted to access end iterator!");
342     return Base->CurrentEntry;
343   }
344 
345   /// Note on EqualityComparable:
346   ///
347   /// The iterator is not re-entrant,
348   /// it is meant to be used for parsing YAML on-demand
349   /// Once iteration started - it can point only to one entry at a time
350   /// hence Base.CurrentEntry and Other.Base.CurrentEntry are equal
351   /// iff Base and Other.Base are equal.
352   bool operator==(const basic_collection_iterator &Other) const {
353     if (Base && (Base == Other.Base)) {
354       assert((Base->CurrentEntry == Other.Base->CurrentEntry)
355              && "Equal Bases expected to point to equal Entries");
356     }
357 
358     return Base == Other.Base;
359   }
360 
361   bool operator!=(const basic_collection_iterator &Other) const {
362     return !(Base == Other.Base);
363   }
364 
365   basic_collection_iterator &operator++() {
366     assert(Base && "Attempted to advance iterator past end!");
367     Base->increment();
368     // Create an end iterator.
369     if (!Base->CurrentEntry)
370       Base = nullptr;
371     return *this;
372   }
373 
374 private:
375   BaseT *Base = nullptr;
376 };
377 
378 // The following two templates are used for both MappingNode and Sequence Node.
379 template <class CollectionType>
380 typename CollectionType::iterator begin(CollectionType &C) {
381   assert(C.IsAtBeginning && "You may only iterate over a collection once!");
382   C.IsAtBeginning = false;
383   typename CollectionType::iterator ret(&C);
384   ++ret;
385   return ret;
386 }
387 
388 template <class CollectionType> void skip(CollectionType &C) {
389   // TODO: support skipping from the middle of a parsed collection ;/
390   assert((C.IsAtBeginning || C.IsAtEnd) && "Cannot skip mid parse!");
391   if (C.IsAtBeginning)
392     for (typename CollectionType::iterator i = begin(C), e = C.end(); i != e;
393          ++i)
394       i->skip();
395 }
396 
397 /// Represents a YAML map created from either a block map for a flow map.
398 ///
399 /// This parses the YAML stream as increment() is called.
400 ///
401 /// Example:
402 ///   Name: _main
403 ///   Scope: Global
404 class MappingNode final : public Node {
405   void anchor() override;
406 
407 public:
408   enum MappingType {
409     MT_Block,
410     MT_Flow,
411     MT_Inline ///< An inline mapping node is used for "[key: value]".
412   };
413 
414   MappingNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
415               MappingType MT)
416       : Node(NK_Mapping, D, Anchor, Tag), Type(MT) {}
417 
418   friend class basic_collection_iterator<MappingNode, KeyValueNode>;
419 
420   using iterator = basic_collection_iterator<MappingNode, KeyValueNode>;
421 
422   template <class T> friend typename T::iterator yaml::begin(T &);
423   template <class T> friend void yaml::skip(T &);
424 
425   iterator begin() { return yaml::begin(*this); }
426 
427   iterator end() { return iterator(); }
428 
429   void skip() override { yaml::skip(*this); }
430 
431   static bool classof(const Node *N) {
432     return N->getType() == NK_Mapping;
433   }
434 
435 private:
436   MappingType Type;
437   bool IsAtBeginning = true;
438   bool IsAtEnd = false;
439   KeyValueNode *CurrentEntry = nullptr;
440 
441   void increment();
442 };
443 
444 /// Represents a YAML sequence created from either a block sequence for a
445 ///        flow sequence.
446 ///
447 /// This parses the YAML stream as increment() is called.
448 ///
449 /// Example:
450 ///   - Hello
451 ///   - World
452 class SequenceNode final : public Node {
453   void anchor() override;
454 
455 public:
456   enum SequenceType {
457     ST_Block,
458     ST_Flow,
459     // Use for:
460     //
461     // key:
462     // - val1
463     // - val2
464     //
465     // As a BlockMappingEntry and BlockEnd are not created in this case.
466     ST_Indentless
467   };
468 
469   SequenceNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
470                SequenceType ST)
471       : Node(NK_Sequence, D, Anchor, Tag), SeqType(ST) {}
472 
473   friend class basic_collection_iterator<SequenceNode, Node>;
474 
475   using iterator = basic_collection_iterator<SequenceNode, Node>;
476 
477   template <class T> friend typename T::iterator yaml::begin(T &);
478   template <class T> friend void yaml::skip(T &);
479 
480   void increment();
481 
482   iterator begin() { return yaml::begin(*this); }
483 
484   iterator end() { return iterator(); }
485 
486   void skip() override { yaml::skip(*this); }
487 
488   static bool classof(const Node *N) {
489     return N->getType() == NK_Sequence;
490   }
491 
492 private:
493   SequenceType SeqType;
494   bool IsAtBeginning = true;
495   bool IsAtEnd = false;
496   bool WasPreviousTokenFlowEntry = true; // Start with an imaginary ','.
497   Node *CurrentEntry = nullptr;
498 };
499 
500 /// Represents an alias to a Node with an anchor.
501 ///
502 /// Example:
503 ///   *AnchorName
504 class AliasNode final : public Node {
505   void anchor() override;
506 
507 public:
508   AliasNode(std::unique_ptr<Document> &D, StringRef Val)
509       : Node(NK_Alias, D, StringRef(), StringRef()), Name(Val) {}
510 
511   StringRef getName() const { return Name; }
512   Node *getTarget();
513 
514   static bool classof(const Node *N) { return N->getType() == NK_Alias; }
515 
516 private:
517   StringRef Name;
518 };
519 
520 /// A YAML Stream is a sequence of Documents. A document contains a root
521 ///        node.
522 class Document {
523 public:
524   Document(Stream &ParentStream);
525 
526   /// Root for parsing a node. Returns a single node.
527   Node *parseBlockNode();
528 
529   /// Finish parsing the current document and return true if there are
530   ///        more. Return false otherwise.
531   bool skip();
532 
533   /// Parse and return the root level node.
534   Node *getRoot() {
535     if (Root)
536       return Root;
537     return Root = parseBlockNode();
538   }
539 
540   const std::map<StringRef, StringRef> &getTagMap() const { return TagMap; }
541 
542 private:
543   friend class Node;
544   friend class document_iterator;
545 
546   /// Stream to read tokens from.
547   Stream &stream;
548 
549   /// Used to allocate nodes to. All are destroyed without calling their
550   ///        destructor when the document is destroyed.
551   BumpPtrAllocator NodeAllocator;
552 
553   /// The root node. Used to support skipping a partially parsed
554   ///        document.
555   Node *Root;
556 
557   /// Maps tag prefixes to their expansion.
558   std::map<StringRef, StringRef> TagMap;
559 
560   Token &peekNext();
561   Token getNext();
562   void setError(const Twine &Message, Token &Location) const;
563   bool failed() const;
564 
565   /// Parse %BLAH directives and return true if any were encountered.
566   bool parseDirectives();
567 
568   /// Parse %YAML
569   void parseYAMLDirective();
570 
571   /// Parse %TAG
572   void parseTAGDirective();
573 
574   /// Consume the next token and error if it is not \a TK.
575   bool expectToken(int TK);
576 };
577 
578 /// Iterator abstraction for Documents over a Stream.
579 class document_iterator {
580 public:
581   document_iterator() = default;
582   document_iterator(std::unique_ptr<Document> &D) : Doc(&D) {}
583 
584   bool operator==(const document_iterator &Other) const {
585     if (isAtEnd() || Other.isAtEnd())
586       return isAtEnd() && Other.isAtEnd();
587 
588     return Doc == Other.Doc;
589   }
590   bool operator!=(const document_iterator &Other) const {
591     return !(*this == Other);
592   }
593 
594   document_iterator operator++() {
595     assert(Doc && "incrementing iterator past the end.");
596     if (!(*Doc)->skip()) {
597       Doc->reset(nullptr);
598     } else {
599       Stream &S = (*Doc)->stream;
600       Doc->reset(new Document(S));
601     }
602     return *this;
603   }
604 
605   Document &operator*() { return *Doc->get(); }
606 
607   std::unique_ptr<Document> &operator->() { return *Doc; }
608 
609 private:
610   bool isAtEnd() const { return !Doc || !*Doc; }
611 
612   std::unique_ptr<Document> *Doc = nullptr;
613 };
614 
615 } // end namespace yaml
616 
617 } // end namespace llvm
618 
619 #endif // LLVM_SUPPORT_YAMLPARSER_H
620