1 //===- Markup.h -------------------------------------------------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 ///
9 /// \file
10 /// This file declares the log symbolizer markup data model and parser.
11 ///
12 /// See https://llvm.org/docs/SymbolizerMarkupFormat.html
13 ///
14 //===----------------------------------------------------------------------===//
15 
16 #ifndef LLVM_DEBUGINFO_SYMBOLIZE_MARKUP_H
17 #define LLVM_DEBUGINFO_SYMBOLIZE_MARKUP_H
18 
19 #include "llvm/ADT/SmallVector.h"
20 #include "llvm/ADT/StringRef.h"
21 #include "llvm/ADT/StringSet.h"
22 #include "llvm/Support/Regex.h"
23 
24 namespace llvm {
25 namespace symbolize {
26 
27 /// A node of symbolizer markup.
28 ///
29 /// If only the Text field is set, this represents a region of text outside a
30 /// markup element. ANSI SGR control codes are also reported this way; if
31 /// detected, then the control code will be the entirety of the Text field, and
32 /// any surrounding text will be reported as preceding and following nodes.
33 struct MarkupNode {
34   /// The full text of this node in the input.
35   StringRef Text;
36 
37   /// If this represents an element, the tag. Otherwise, empty.
38   StringRef Tag;
39 
40   /// If this represents an element with fields, a list of the field contents.
41   /// Otherwise, empty.
42   SmallVector<StringRef> Fields;
43 
44   bool operator==(const MarkupNode &Other) const {
45     return Text == Other.Text && Tag == Other.Tag && Fields == Other.Fields;
46   }
47   bool operator!=(const MarkupNode &Other) const { return !(*this == Other); }
48 };
49 
50 /// Parses a log containing symbolizer markup into a sequence of nodes.
51 class MarkupParser {
52 public:
53   MarkupParser(StringSet<> MultilineTags = {});
54 
55   /// Parses an individual \p Line of input.
56   ///
57   /// Nodes from the previous parseLine() call that haven't yet been extracted
58   /// by nextNode() are discarded. The nodes returned by nextNode() may
59   /// reference the input string, so it must be retained by the caller until the
60   /// last use.
61   ///
62   /// Note that some elements may span multiple lines. If a line ends with the
63   /// start of one of these elements, then no nodes will be produced until the
64   /// either the end or something that cannot be part of an element is
65   /// encountered. This may only occur after multiple calls to parseLine(),
66   /// corresponding to the lines of the multi-line element.
67   void parseLine(StringRef Line);
68 
69   /// Inform the parser of that the input stream has ended.
70   ///
71   /// This allows the parser to finish any deferred processing (e.g., an
72   /// in-progress multi-line element) and may cause nextNode() to return
73   /// additional nodes.
74   void flush();
75 
76   /// Returns the next node in the input sequence.
77   ///
78   /// Calling nextNode() may invalidate the contents of the node returned by the
79   /// previous call.
80   ///
81   /// \returns the next markup node or std::nullopt if none remain.
82   std::optional<MarkupNode> nextNode();
83 
84   bool isSGR(const MarkupNode &Node) const {
85     return SGRSyntax.match(Node.Text);
86   }
87 
88 private:
89   std::optional<MarkupNode> parseElement(StringRef Line);
90   void parseTextOutsideMarkup(StringRef Text);
91   std::optional<StringRef> parseMultiLineBegin(StringRef Line);
92   std::optional<StringRef> parseMultiLineEnd(StringRef Line);
93 
94   // Tags of elements that can span multiple lines.
95   const StringSet<> MultilineTags;
96 
97   // Contents of a multi-line element that has finished being parsed. Retained
98   // to keep returned StringRefs for the contents valid.
99   std::string FinishedMultiline;
100 
101   // Contents of a multi-line element that is still in the process of receiving
102   // lines.
103   std::string InProgressMultiline;
104 
105   // The line currently being parsed.
106   StringRef Line;
107 
108   // Buffer for nodes parsed from the current line.
109   SmallVector<MarkupNode> Buffer;
110 
111   // Next buffer index to return.
112   size_t NextIdx;
113 
114   // Regular expression matching supported ANSI SGR escape sequences.
115   const Regex SGRSyntax;
116 };
117 
118 } // end namespace symbolize
119 } // end namespace llvm
120 
121 #endif // LLVM_DEBUGINFO_SYMBOLIZE_MARKUP_H
122