1 //===- Markup.h -------------------------------------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// This file declares the log symbolizer markup data model and parser.
11 ///
12 /// See https://llvm.org/docs/SymbolizerMarkupFormat.html
13 ///
14 //===----------------------------------------------------------------------===//
15 
16 #ifndef LLVM_DEBUGINFO_SYMBOLIZE_MARKUP_H
17 #define LLVM_DEBUGINFO_SYMBOLIZE_MARKUP_H
18 
19 #include <iostream>
20 
21 #include "llvm/ADT/Optional.h"
22 #include "llvm/ADT/SmallVector.h"
23 #include "llvm/ADT/StringRef.h"
24 #include "llvm/ADT/StringSet.h"
25 #include "llvm/Support/Regex.h"
26 
27 namespace llvm {
28 namespace symbolize {
29 
30 /// A node of symbolizer markup.
31 ///
32 /// If only the Text field is set, this represents a region of text outside a
33 /// markup element. ANSI SGR control codes are also reported this way; if
34 /// detected, then the control code will be the entirety of the Text field, and
35 /// any surrounding text will be reported as preceding and following nodes.
36 struct MarkupNode {
37   /// The full text of this node in the input.
38   StringRef Text;
39 
40   /// If this represents an element, the tag. Otherwise, empty.
41   StringRef Tag;
42 
43   /// If this represents an element with fields, a list of the field contents.
44   /// Otherwise, empty.
45   SmallVector<StringRef> Fields;
46 
47   bool operator==(const MarkupNode &Other) const {
48     return Text == Other.Text && Tag == Other.Tag && Fields == Other.Fields;
49   }
50   bool operator!=(const MarkupNode &Other) const { return !(*this == Other); }
51 };
52 
53 /// Parses a log containing symbolizer markup into a sequence of nodes.
54 class MarkupParser {
55 public:
56   MarkupParser(StringSet<> MultilineTags = {});
57 
58   /// Parses an individual \p Line of input.
59   ///
60   /// Nodes from the previous parseLine() call that haven't yet been extracted
61   /// by nextNode() are discarded. The nodes returned by nextNode() may
62   /// reference the input string, so it must be retained by the caller until the
63   /// last use.
64   ///
65   /// Note that some elements may span multiple lines. If a line ends with the
66   /// start of one of these elements, then no nodes will be produced until the
67   /// either the end or something that cannot be part of an element is
68   /// encountered. This may only occur after multiple calls to parseLine(),
69   /// corresponding to the lines of the multi-line element.
70   void parseLine(StringRef Line);
71 
72   /// Inform the parser of that the input stream has ended.
73   ///
74   /// This allows the parser to finish any deferred processing (e.g., an
75   /// in-progress multi-line element) and may cause nextNode() to return
76   /// additional nodes.
77   void flush();
78 
79   /// Returns the next node in the input sequence.
80   ///
81   /// Calling nextNode() may invalidate the contents of the node returned by the
82   /// previous call.
83   ///
84   /// \returns the next markup node or None if none remain.
85   Optional<MarkupNode> nextNode();
86 
87   bool isSGR(const MarkupNode &Node) const {
88     return SGRSyntax.match(Node.Text);
89   }
90 
91 private:
92   Optional<MarkupNode> parseElement(StringRef Line);
93   void parseTextOutsideMarkup(StringRef Text);
94   Optional<StringRef> parseMultiLineBegin(StringRef Line);
95   Optional<StringRef> parseMultiLineEnd(StringRef Line);
96 
97   // Tags of elements that can span multiple lines.
98   const StringSet<> MultilineTags;
99 
100   // Contents of a multi-line element that has finished being parsed. Retained
101   // to keep returned StringRefs for the contents valid.
102   std::string FinishedMultiline;
103 
104   // Contents of a multi-line element that is still in the process of receiving
105   // lines.
106   std::string InProgressMultiline;
107 
108   // The line currently being parsed.
109   StringRef Line;
110 
111   // Buffer for nodes parsed from the current line.
112   SmallVector<MarkupNode> Buffer;
113 
114   // Next buffer index to return.
115   size_t NextIdx;
116 
117   // Regular expression matching supported ANSI SGR escape sequences.
118   const Regex SGRSyntax;
119 };
120 
121 } // end namespace symbolize
122 } // end namespace llvm
123 
124 #endif // LLVM_DEBUGINFO_SYMBOLIZE_MARKUP_H
125