1 //===- YAMLParser.h - Simple YAML parser ------------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 //  This is a YAML 1.2 parser.
10 //
11 //  See http://www.yaml.org/spec/1.2/spec.html for the full standard.
12 //
13 //  This currently does not implement the following:
14 //    * Tag resolution.
15 //    * UTF-16.
16 //    * BOMs anywhere other than the first Unicode scalar value in the file.
17 //
18 //  The most important class here is Stream. This represents a YAML stream with
19 //  0, 1, or many documents.
20 //
21 //  SourceMgr sm;
22 //  StringRef input = getInput();
23 //  yaml::Stream stream(input, sm);
24 //
25 //  for (yaml::document_iterator di = stream.begin(), de = stream.end();
26 //       di != de; ++di) {
27 //    yaml::Node *n = di->getRoot();
28 //    if (n) {
29 //      // Do something with n...
30 //    } else
31 //      break;
32 //  }
33 //
34 //===----------------------------------------------------------------------===//
35 
36 #ifndef LLVM_SUPPORT_YAMLPARSER_H
37 #define LLVM_SUPPORT_YAMLPARSER_H
38 
39 #include "llvm/ADT/StringRef.h"
40 #include "llvm/Support/Allocator.h"
41 #include "llvm/Support/SMLoc.h"
42 #include "llvm/Support/SourceMgr.h"
43 #include <cassert>
44 #include <cstddef>
45 #include <iterator>
46 #include <map>
47 #include <memory>
48 #include <string>
49 #include <system_error>
50 
51 namespace llvm {
52 
53 class MemoryBufferRef;
54 class raw_ostream;
55 class Twine;
56 
57 namespace yaml {
58 
59 class Document;
60 class document_iterator;
61 class Node;
62 class Scanner;
63 struct Token;
64 
65 /// Dump all the tokens in this stream to OS.
66 /// \returns true if there was an error, false otherwise.
67 bool dumpTokens(StringRef Input, raw_ostream &);
68 
69 /// Scans all tokens in input without outputting anything. This is used
70 ///        for benchmarking the tokenizer.
71 /// \returns true if there was an error, false otherwise.
72 bool scanTokens(StringRef Input);
73 
74 /// Escape \a Input for a double quoted scalar; if \p EscapePrintable
75 /// is true, all UTF8 sequences will be escaped, if \p EscapePrintable is
76 /// false, those UTF8 sequences encoding printable unicode scalars will not be
77 /// escaped, but emitted verbatim.
78 std::string escape(StringRef Input, bool EscapePrintable = true);
79 
80 /// Parse \p S as a bool according to https://yaml.org/type/bool.html.
81 llvm::Optional<bool> parseBool(StringRef S);
82 
83 /// This class represents a YAML stream potentially containing multiple
84 ///        documents.
85 class Stream {
86 public:
87   /// This keeps a reference to the string referenced by \p Input.
88   Stream(StringRef Input, SourceMgr &, bool ShowColors = true,
89          std::error_code *EC = nullptr);
90 
91   Stream(MemoryBufferRef InputBuffer, SourceMgr &, bool ShowColors = true,
92          std::error_code *EC = nullptr);
93   ~Stream();
94 
95   document_iterator begin();
96   document_iterator end();
97   void skip();
98   bool failed();
99 
100   bool validate() {
101     skip();
102     return !failed();
103   }
104 
105   void printError(Node *N, const Twine &Msg,
106                   SourceMgr::DiagKind Kind = SourceMgr::DK_Error);
107   void printError(const SMRange &Range, const Twine &Msg,
108                   SourceMgr::DiagKind Kind = SourceMgr::DK_Error);
109 
110 private:
111   friend class Document;
112 
113   std::unique_ptr<Scanner> scanner;
114   std::unique_ptr<Document> CurrentDoc;
115 };
116 
117 /// Abstract base class for all Nodes.
118 class Node {
119   virtual void anchor();
120 
121 public:
122   enum NodeKind {
123     NK_Null,
124     NK_Scalar,
125     NK_BlockScalar,
126     NK_KeyValue,
127     NK_Mapping,
128     NK_Sequence,
129     NK_Alias
130   };
131 
132   Node(unsigned int Type, std::unique_ptr<Document> &, StringRef Anchor,
133        StringRef Tag);
134 
135   // It's not safe to copy YAML nodes; the document is streamed and the position
136   // is part of the state.
137   Node(const Node &) = delete;
138   void operator=(const Node &) = delete;
139 
140   void *operator new(size_t Size, BumpPtrAllocator &Alloc,
141                      size_t Alignment = 16) noexcept {
142     return Alloc.Allocate(Size, Alignment);
143   }
144 
145   void operator delete(void *Ptr, BumpPtrAllocator &Alloc,
146                        size_t Size) noexcept {
147     Alloc.Deallocate(Ptr, Size, 0);
148   }
149 
150   void operator delete(void *) noexcept = delete;
151 
152   /// Get the value of the anchor attached to this node. If it does not
153   ///        have one, getAnchor().size() will be 0.
154   StringRef getAnchor() const { return Anchor; }
155 
156   /// Get the tag as it was written in the document. This does not
157   ///   perform tag resolution.
158   StringRef getRawTag() const { return Tag; }
159 
160   /// Get the verbatium tag for a given Node. This performs tag resoluton
161   ///   and substitution.
162   std::string getVerbatimTag() const;
163 
164   SMRange getSourceRange() const { return SourceRange; }
165   void setSourceRange(SMRange SR) { SourceRange = SR; }
166 
167   // These functions forward to Document and Scanner.
168   Token &peekNext();
169   Token getNext();
170   Node *parseBlockNode();
171   BumpPtrAllocator &getAllocator();
172   void setError(const Twine &Message, Token &Location) const;
173   bool failed() const;
174 
175   virtual void skip() {}
176 
177   unsigned int getType() const { return TypeID; }
178 
179 protected:
180   std::unique_ptr<Document> &Doc;
181   SMRange SourceRange;
182 
183   ~Node() = default;
184 
185 private:
186   unsigned int TypeID;
187   StringRef Anchor;
188   /// The tag as typed in the document.
189   StringRef Tag;
190 };
191 
192 /// A null value.
193 ///
194 /// Example:
195 ///   !!null null
196 class NullNode final : public Node {
197   void anchor() override;
198 
199 public:
200   NullNode(std::unique_ptr<Document> &D)
201       : Node(NK_Null, D, StringRef(), StringRef()) {}
202 
203   static bool classof(const Node *N) { return N->getType() == NK_Null; }
204 };
205 
206 /// A scalar node is an opaque datum that can be presented as a
207 ///        series of zero or more Unicode scalar values.
208 ///
209 /// Example:
210 ///   Adena
211 class ScalarNode final : public Node {
212   void anchor() override;
213 
214 public:
215   ScalarNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
216              StringRef Val)
217       : Node(NK_Scalar, D, Anchor, Tag), Value(Val) {
218     SMLoc Start = SMLoc::getFromPointer(Val.begin());
219     SMLoc End = SMLoc::getFromPointer(Val.end());
220     SourceRange = SMRange(Start, End);
221   }
222 
223   // Return Value without any escaping or folding or other fun YAML stuff. This
224   // is the exact bytes that are contained in the file (after conversion to
225   // utf8).
226   StringRef getRawValue() const { return Value; }
227 
228   /// Gets the value of this node as a StringRef.
229   ///
230   /// \param Storage is used to store the content of the returned StringRef if
231   ///        it requires any modification from how it appeared in the source.
232   ///        This happens with escaped characters and multi-line literals.
233   StringRef getValue(SmallVectorImpl<char> &Storage) const;
234 
235   static bool classof(const Node *N) {
236     return N->getType() == NK_Scalar;
237   }
238 
239 private:
240   StringRef Value;
241 
242   StringRef unescapeDoubleQuoted(StringRef UnquotedValue,
243                                  StringRef::size_type Start,
244                                  SmallVectorImpl<char> &Storage) const;
245 };
246 
247 /// A block scalar node is an opaque datum that can be presented as a
248 ///        series of zero or more Unicode scalar values.
249 ///
250 /// Example:
251 ///   |
252 ///     Hello
253 ///     World
254 class BlockScalarNode final : public Node {
255   void anchor() override;
256 
257 public:
258   BlockScalarNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
259                   StringRef Value, StringRef RawVal)
260       : Node(NK_BlockScalar, D, Anchor, Tag), Value(Value) {
261     SMLoc Start = SMLoc::getFromPointer(RawVal.begin());
262     SMLoc End = SMLoc::getFromPointer(RawVal.end());
263     SourceRange = SMRange(Start, End);
264   }
265 
266   /// Gets the value of this node as a StringRef.
267   StringRef getValue() const { return Value; }
268 
269   static bool classof(const Node *N) {
270     return N->getType() == NK_BlockScalar;
271   }
272 
273 private:
274   StringRef Value;
275 };
276 
277 /// A key and value pair. While not technically a Node under the YAML
278 ///        representation graph, it is easier to treat them this way.
279 ///
280 /// TODO: Consider making this not a child of Node.
281 ///
282 /// Example:
283 ///   Section: .text
284 class KeyValueNode final : public Node {
285   void anchor() override;
286 
287 public:
288   KeyValueNode(std::unique_ptr<Document> &D)
289       : Node(NK_KeyValue, D, StringRef(), StringRef()) {}
290 
291   /// Parse and return the key.
292   ///
293   /// This may be called multiple times.
294   ///
295   /// \returns The key, or nullptr if failed() == true.
296   Node *getKey();
297 
298   /// Parse and return the value.
299   ///
300   /// This may be called multiple times.
301   ///
302   /// \returns The value, or nullptr if failed() == true.
303   Node *getValue();
304 
305   void skip() override {
306     if (Node *Key = getKey()) {
307       Key->skip();
308       if (Node *Val = getValue())
309         Val->skip();
310     }
311   }
312 
313   static bool classof(const Node *N) {
314     return N->getType() == NK_KeyValue;
315   }
316 
317 private:
318   Node *Key = nullptr;
319   Node *Value = nullptr;
320 };
321 
322 /// This is an iterator abstraction over YAML collections shared by both
323 ///        sequences and maps.
324 ///
325 /// BaseT must have a ValueT* member named CurrentEntry and a member function
326 /// increment() which must set CurrentEntry to 0 to create an end iterator.
327 template <class BaseT, class ValueT> class basic_collection_iterator {
328 public:
329   using iterator_category = std::input_iterator_tag;
330   using value_type = ValueT;
331   using difference_type = std::ptrdiff_t;
332   using pointer = value_type *;
333   using reference = value_type &;
334 
335   basic_collection_iterator() = default;
336   basic_collection_iterator(BaseT *B) : Base(B) {}
337 
338   ValueT *operator->() const {
339     assert(Base && Base->CurrentEntry && "Attempted to access end iterator!");
340     return Base->CurrentEntry;
341   }
342 
343   ValueT &operator*() const {
344     assert(Base && Base->CurrentEntry &&
345            "Attempted to dereference end iterator!");
346     return *Base->CurrentEntry;
347   }
348 
349   operator ValueT *() const {
350     assert(Base && Base->CurrentEntry && "Attempted to access end iterator!");
351     return Base->CurrentEntry;
352   }
353 
354   /// Note on EqualityComparable:
355   ///
356   /// The iterator is not re-entrant,
357   /// it is meant to be used for parsing YAML on-demand
358   /// Once iteration started - it can point only to one entry at a time
359   /// hence Base.CurrentEntry and Other.Base.CurrentEntry are equal
360   /// iff Base and Other.Base are equal.
361   bool operator==(const basic_collection_iterator &Other) const {
362     if (Base && (Base == Other.Base)) {
363       assert((Base->CurrentEntry == Other.Base->CurrentEntry)
364              && "Equal Bases expected to point to equal Entries");
365     }
366 
367     return Base == Other.Base;
368   }
369 
370   bool operator!=(const basic_collection_iterator &Other) const {
371     return !(Base == Other.Base);
372   }
373 
374   basic_collection_iterator &operator++() {
375     assert(Base && "Attempted to advance iterator past end!");
376     Base->increment();
377     // Create an end iterator.
378     if (!Base->CurrentEntry)
379       Base = nullptr;
380     return *this;
381   }
382 
383 private:
384   BaseT *Base = nullptr;
385 };
386 
387 // The following two templates are used for both MappingNode and Sequence Node.
388 template <class CollectionType>
389 typename CollectionType::iterator begin(CollectionType &C) {
390   assert(C.IsAtBeginning && "You may only iterate over a collection once!");
391   C.IsAtBeginning = false;
392   typename CollectionType::iterator ret(&C);
393   ++ret;
394   return ret;
395 }
396 
397 template <class CollectionType> void skip(CollectionType &C) {
398   // TODO: support skipping from the middle of a parsed collection ;/
399   assert((C.IsAtBeginning || C.IsAtEnd) && "Cannot skip mid parse!");
400   if (C.IsAtBeginning)
401     for (typename CollectionType::iterator i = begin(C), e = C.end(); i != e;
402          ++i)
403       i->skip();
404 }
405 
406 /// Represents a YAML map created from either a block map for a flow map.
407 ///
408 /// This parses the YAML stream as increment() is called.
409 ///
410 /// Example:
411 ///   Name: _main
412 ///   Scope: Global
413 class MappingNode final : public Node {
414   void anchor() override;
415 
416 public:
417   enum MappingType {
418     MT_Block,
419     MT_Flow,
420     MT_Inline ///< An inline mapping node is used for "[key: value]".
421   };
422 
423   MappingNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
424               MappingType MT)
425       : Node(NK_Mapping, D, Anchor, Tag), Type(MT) {}
426 
427   friend class basic_collection_iterator<MappingNode, KeyValueNode>;
428 
429   using iterator = basic_collection_iterator<MappingNode, KeyValueNode>;
430 
431   template <class T> friend typename T::iterator yaml::begin(T &);
432   template <class T> friend void yaml::skip(T &);
433 
434   iterator begin() { return yaml::begin(*this); }
435 
436   iterator end() { return iterator(); }
437 
438   void skip() override { yaml::skip(*this); }
439 
440   static bool classof(const Node *N) {
441     return N->getType() == NK_Mapping;
442   }
443 
444 private:
445   MappingType Type;
446   bool IsAtBeginning = true;
447   bool IsAtEnd = false;
448   KeyValueNode *CurrentEntry = nullptr;
449 
450   void increment();
451 };
452 
453 /// Represents a YAML sequence created from either a block sequence for a
454 ///        flow sequence.
455 ///
456 /// This parses the YAML stream as increment() is called.
457 ///
458 /// Example:
459 ///   - Hello
460 ///   - World
461 class SequenceNode final : public Node {
462   void anchor() override;
463 
464 public:
465   enum SequenceType {
466     ST_Block,
467     ST_Flow,
468     // Use for:
469     //
470     // key:
471     // - val1
472     // - val2
473     //
474     // As a BlockMappingEntry and BlockEnd are not created in this case.
475     ST_Indentless
476   };
477 
478   SequenceNode(std::unique_ptr<Document> &D, StringRef Anchor, StringRef Tag,
479                SequenceType ST)
480       : Node(NK_Sequence, D, Anchor, Tag), SeqType(ST) {}
481 
482   friend class basic_collection_iterator<SequenceNode, Node>;
483 
484   using iterator = basic_collection_iterator<SequenceNode, Node>;
485 
486   template <class T> friend typename T::iterator yaml::begin(T &);
487   template <class T> friend void yaml::skip(T &);
488 
489   void increment();
490 
491   iterator begin() { return yaml::begin(*this); }
492 
493   iterator end() { return iterator(); }
494 
495   void skip() override { yaml::skip(*this); }
496 
497   static bool classof(const Node *N) {
498     return N->getType() == NK_Sequence;
499   }
500 
501 private:
502   SequenceType SeqType;
503   bool IsAtBeginning = true;
504   bool IsAtEnd = false;
505   bool WasPreviousTokenFlowEntry = true; // Start with an imaginary ','.
506   Node *CurrentEntry = nullptr;
507 };
508 
509 /// Represents an alias to a Node with an anchor.
510 ///
511 /// Example:
512 ///   *AnchorName
513 class AliasNode final : public Node {
514   void anchor() override;
515 
516 public:
517   AliasNode(std::unique_ptr<Document> &D, StringRef Val)
518       : Node(NK_Alias, D, StringRef(), StringRef()), Name(Val) {}
519 
520   StringRef getName() const { return Name; }
521 
522   static bool classof(const Node *N) { return N->getType() == NK_Alias; }
523 
524 private:
525   StringRef Name;
526 };
527 
528 /// A YAML Stream is a sequence of Documents. A document contains a root
529 ///        node.
530 class Document {
531 public:
532   Document(Stream &ParentStream);
533 
534   /// Root for parsing a node. Returns a single node.
535   Node *parseBlockNode();
536 
537   /// Finish parsing the current document and return true if there are
538   ///        more. Return false otherwise.
539   bool skip();
540 
541   /// Parse and return the root level node.
542   Node *getRoot() {
543     if (Root)
544       return Root;
545     return Root = parseBlockNode();
546   }
547 
548   const std::map<StringRef, StringRef> &getTagMap() const { return TagMap; }
549 
550 private:
551   friend class Node;
552   friend class document_iterator;
553 
554   /// Stream to read tokens from.
555   Stream &stream;
556 
557   /// Used to allocate nodes to. All are destroyed without calling their
558   ///        destructor when the document is destroyed.
559   BumpPtrAllocator NodeAllocator;
560 
561   /// The root node. Used to support skipping a partially parsed
562   ///        document.
563   Node *Root;
564 
565   /// Maps tag prefixes to their expansion.
566   std::map<StringRef, StringRef> TagMap;
567 
568   Token &peekNext();
569   Token getNext();
570   void setError(const Twine &Message, Token &Location) const;
571   bool failed() const;
572 
573   /// Parse %BLAH directives and return true if any were encountered.
574   bool parseDirectives();
575 
576   /// Parse %YAML
577   void parseYAMLDirective();
578 
579   /// Parse %TAG
580   void parseTAGDirective();
581 
582   /// Consume the next token and error if it is not \a TK.
583   bool expectToken(int TK);
584 };
585 
586 /// Iterator abstraction for Documents over a Stream.
587 class document_iterator {
588 public:
589   document_iterator() = default;
590   document_iterator(std::unique_ptr<Document> &D) : Doc(&D) {}
591 
592   bool operator==(const document_iterator &Other) const {
593     if (isAtEnd() || Other.isAtEnd())
594       return isAtEnd() && Other.isAtEnd();
595 
596     return Doc == Other.Doc;
597   }
598   bool operator!=(const document_iterator &Other) const {
599     return !(*this == Other);
600   }
601 
602   document_iterator operator++() {
603     assert(Doc && "incrementing iterator past the end.");
604     if (!(*Doc)->skip()) {
605       Doc->reset(nullptr);
606     } else {
607       Stream &S = (*Doc)->stream;
608       Doc->reset(new Document(S));
609     }
610     return *this;
611   }
612 
613   Document &operator*() { return *Doc->get(); }
614 
615   std::unique_ptr<Document> &operator->() { return *Doc; }
616 
617 private:
618   bool isAtEnd() const { return !Doc || !*Doc; }
619 
620   std::unique_ptr<Document> *Doc = nullptr;
621 };
622 
623 } // end namespace yaml
624 
625 } // end namespace llvm
626 
627 #endif // LLVM_SUPPORT_YAMLPARSER_H
628