//===--- CommentLexer.h - Lexer for structured comments ---------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
//  This file defines the lexer for structured comments and the supporting
//  token class.
//
//===----------------------------------------------------------------------===//
12 
13 #ifndef LLVM_CLANG_AST_COMMENTLEXER_H
14 #define LLVM_CLANG_AST_COMMENTLEXER_H
15 
#include "clang/Basic/Diagnostic.h"
#include "clang/Basic/SourceManager.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
22 
23 namespace clang {
24 namespace comments {
25 
// Forward declarations.  Within this header these names are needed before
// their definitions: as friends of Token, and as pointer or reference types in
// Lexer's members and parameters.
class Lexer;
class TextTokenRetokenizer;
struct CommandInfo;
class CommandTraits;
30 
namespace tok {
/// Kinds of comment tokens.
///
/// The kind determines which payload accessors of Token may be used; every
/// typed accessor asserts that the token has the matching kind.
enum TokenKind {
  eof,
  newline,
  text,              // Carries text (Token::getText).
  unknown_command,   // Command that does not have an ID.
  backslash_command, // Command with an ID, that used backslash marker.
  at_command,        // Command with an ID, that used 'at' marker.
  verbatim_block_begin, // Carries a verbatim block ID.
  verbatim_block_line,  // Carries the text of one verbatim block line.
  verbatim_block_end,   // Carries a verbatim block ID.
  verbatim_line_name,   // Carries a verbatim line ID.
  verbatim_line_text,   // Carries the text of a verbatim line.
  html_start_tag,     // <tag
  html_ident,         // attr
  html_equals,        // =
  html_quoted_string, // "blah\"blah" or 'blah\'blah'
  html_greater,       // >
  html_slash_greater, // />
  html_end_tag        // </tag
};
} // end namespace tok
53 
54 /// Comment token.
55 class Token {
56   friend class Lexer;
57   friend class TextTokenRetokenizer;
58 
59   /// The location of the token.
60   SourceLocation Loc;
61 
62   /// The actual kind of the token.
63   tok::TokenKind Kind;
64 
65   /// Length of the token spelling in comment.  Can be 0 for synthenized
66   /// tokens.
67   unsigned Length;
68 
69   /// Contains text value associated with a token.
70   const char *TextPtr;
71 
72   /// Integer value associated with a token.
73   ///
74   /// If the token is a known command, contains command ID and TextPtr is
75   /// unused (command spelling can be found with CommandTraits).  Otherwise,
76   /// contains the length of the string that starts at TextPtr.
77   unsigned IntVal;
78 
79 public:
getLocation()80   SourceLocation getLocation() const LLVM_READONLY { return Loc; }
setLocation(SourceLocation SL)81   void setLocation(SourceLocation SL) { Loc = SL; }
82 
getEndLocation()83   SourceLocation getEndLocation() const LLVM_READONLY {
84     if (Length == 0 || Length == 1)
85       return Loc;
86     return Loc.getLocWithOffset(Length - 1);
87   }
88 
getKind()89   tok::TokenKind getKind() const LLVM_READONLY { return Kind; }
setKind(tok::TokenKind K)90   void setKind(tok::TokenKind K) { Kind = K; }
91 
is(tok::TokenKind K)92   bool is(tok::TokenKind K) const LLVM_READONLY { return Kind == K; }
isNot(tok::TokenKind K)93   bool isNot(tok::TokenKind K) const LLVM_READONLY { return Kind != K; }
94 
getLength()95   unsigned getLength() const LLVM_READONLY { return Length; }
setLength(unsigned L)96   void setLength(unsigned L) { Length = L; }
97 
getText()98   StringRef getText() const LLVM_READONLY {
99     assert(is(tok::text));
100     return StringRef(TextPtr, IntVal);
101   }
102 
setText(StringRef Text)103   void setText(StringRef Text) {
104     assert(is(tok::text));
105     TextPtr = Text.data();
106     IntVal = Text.size();
107   }
108 
getUnknownCommandName()109   StringRef getUnknownCommandName() const LLVM_READONLY {
110     assert(is(tok::unknown_command));
111     return StringRef(TextPtr, IntVal);
112   }
113 
setUnknownCommandName(StringRef Name)114   void setUnknownCommandName(StringRef Name) {
115     assert(is(tok::unknown_command));
116     TextPtr = Name.data();
117     IntVal = Name.size();
118   }
119 
getCommandID()120   unsigned getCommandID() const LLVM_READONLY {
121     assert(is(tok::backslash_command) || is(tok::at_command));
122     return IntVal;
123   }
124 
setCommandID(unsigned ID)125   void setCommandID(unsigned ID) {
126     assert(is(tok::backslash_command) || is(tok::at_command));
127     IntVal = ID;
128   }
129 
getVerbatimBlockID()130   unsigned getVerbatimBlockID() const LLVM_READONLY {
131     assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
132     return IntVal;
133   }
134 
setVerbatimBlockID(unsigned ID)135   void setVerbatimBlockID(unsigned ID) {
136     assert(is(tok::verbatim_block_begin) || is(tok::verbatim_block_end));
137     IntVal = ID;
138   }
139 
getVerbatimBlockText()140   StringRef getVerbatimBlockText() const LLVM_READONLY {
141     assert(is(tok::verbatim_block_line));
142     return StringRef(TextPtr, IntVal);
143   }
144 
setVerbatimBlockText(StringRef Text)145   void setVerbatimBlockText(StringRef Text) {
146     assert(is(tok::verbatim_block_line));
147     TextPtr = Text.data();
148     IntVal = Text.size();
149   }
150 
getVerbatimLineID()151   unsigned getVerbatimLineID() const LLVM_READONLY {
152     assert(is(tok::verbatim_line_name));
153     return IntVal;
154   }
155 
setVerbatimLineID(unsigned ID)156   void setVerbatimLineID(unsigned ID) {
157     assert(is(tok::verbatim_line_name));
158     IntVal = ID;
159   }
160 
getVerbatimLineText()161   StringRef getVerbatimLineText() const LLVM_READONLY {
162     assert(is(tok::verbatim_line_text));
163     return StringRef(TextPtr, IntVal);
164   }
165 
setVerbatimLineText(StringRef Text)166   void setVerbatimLineText(StringRef Text) {
167     assert(is(tok::verbatim_line_text));
168     TextPtr = Text.data();
169     IntVal = Text.size();
170   }
171 
getHTMLTagStartName()172   StringRef getHTMLTagStartName() const LLVM_READONLY {
173     assert(is(tok::html_start_tag));
174     return StringRef(TextPtr, IntVal);
175   }
176 
setHTMLTagStartName(StringRef Name)177   void setHTMLTagStartName(StringRef Name) {
178     assert(is(tok::html_start_tag));
179     TextPtr = Name.data();
180     IntVal = Name.size();
181   }
182 
getHTMLIdent()183   StringRef getHTMLIdent() const LLVM_READONLY {
184     assert(is(tok::html_ident));
185     return StringRef(TextPtr, IntVal);
186   }
187 
setHTMLIdent(StringRef Name)188   void setHTMLIdent(StringRef Name) {
189     assert(is(tok::html_ident));
190     TextPtr = Name.data();
191     IntVal = Name.size();
192   }
193 
getHTMLQuotedString()194   StringRef getHTMLQuotedString() const LLVM_READONLY {
195     assert(is(tok::html_quoted_string));
196     return StringRef(TextPtr, IntVal);
197   }
198 
setHTMLQuotedString(StringRef Str)199   void setHTMLQuotedString(StringRef Str) {
200     assert(is(tok::html_quoted_string));
201     TextPtr = Str.data();
202     IntVal = Str.size();
203   }
204 
getHTMLTagEndName()205   StringRef getHTMLTagEndName() const LLVM_READONLY {
206     assert(is(tok::html_end_tag));
207     return StringRef(TextPtr, IntVal);
208   }
209 
setHTMLTagEndName(StringRef Name)210   void setHTMLTagEndName(StringRef Name) {
211     assert(is(tok::html_end_tag));
212     TextPtr = Name.data();
213     IntVal = Name.size();
214   }
215 
216   void dump(const Lexer &L, const SourceManager &SM) const;
217 };
218 
219 /// Comment lexer.
220 class Lexer {
221 private:
222   Lexer(const Lexer &) = delete;
223   void operator=(const Lexer &) = delete;
224 
225   /// Allocator for strings that are semantic values of tokens and have to be
226   /// computed (for example, resolved decimal character references).
227   llvm::BumpPtrAllocator &Allocator;
228 
229   DiagnosticsEngine &Diags;
230 
231   const CommandTraits &Traits;
232 
233   const char *const BufferStart;
234   const char *const BufferEnd;
235   SourceLocation FileLoc;
236 
237   const char *BufferPtr;
238 
239   /// One past end pointer for the current comment.  For BCPL comments points
240   /// to newline or BufferEnd, for C comments points to star in '*/'.
241   const char *CommentEnd;
242 
243   enum LexerCommentState {
244     LCS_BeforeComment,
245     LCS_InsideBCPLComment,
246     LCS_InsideCComment,
247     LCS_BetweenComments
248   };
249 
250   /// Low-level lexer state, track if we are inside or outside of comment.
251   LexerCommentState CommentState;
252 
253   enum LexerState {
254     /// Lexing normal comment text
255     LS_Normal,
256 
257     /// Finished lexing verbatim block beginning command, will lex first body
258     /// line.
259     LS_VerbatimBlockFirstLine,
260 
261     /// Lexing verbatim block body line-by-line, skipping line-starting
262     /// decorations.
263     LS_VerbatimBlockBody,
264 
265     /// Finished lexing verbatim line beginning command, will lex text (one
266     /// line).
267     LS_VerbatimLineText,
268 
269     /// Finished lexing \verbatim <TAG \endverbatim part, lexing tag attributes.
270     LS_HTMLStartTag,
271 
272     /// Finished lexing \verbatim </TAG \endverbatim part, lexing '>'.
273     LS_HTMLEndTag
274   };
275 
276   /// Current lexing mode.
277   LexerState State;
278 
279   /// If State is LS_VerbatimBlock, contains the name of verbatim end
280   /// command, including command marker.
281   SmallString<16> VerbatimBlockEndCommandName;
282 
283   /// If true, the commands, html tags, etc will be parsed and reported as
284   /// separate tokens inside the comment body. If false, the comment text will
285   /// be parsed into text and newline tokens.
286   bool ParseCommands;
287 
288   /// Given a character reference name (e.g., "lt"), return the character that
289   /// it stands for (e.g., "<").
290   StringRef resolveHTMLNamedCharacterReference(StringRef Name) const;
291 
292   /// Given a Unicode codepoint as base-10 integer, return the character.
293   StringRef resolveHTMLDecimalCharacterReference(StringRef Name) const;
294 
295   /// Given a Unicode codepoint as base-16 integer, return the character.
296   StringRef resolveHTMLHexCharacterReference(StringRef Name) const;
297 
298   void formTokenWithChars(Token &Result, const char *TokEnd,
299                           tok::TokenKind Kind);
300 
formTextToken(Token & Result,const char * TokEnd)301   void formTextToken(Token &Result, const char *TokEnd) {
302     StringRef Text(BufferPtr, TokEnd - BufferPtr);
303     formTokenWithChars(Result, TokEnd, tok::text);
304     Result.setText(Text);
305   }
306 
getSourceLocation(const char * Loc)307   SourceLocation getSourceLocation(const char *Loc) const {
308     assert(Loc >= BufferStart && Loc <= BufferEnd &&
309            "Location out of range for this buffer!");
310 
311     const unsigned CharNo = Loc - BufferStart;
312     return FileLoc.getLocWithOffset(CharNo);
313   }
314 
Diag(SourceLocation Loc,unsigned DiagID)315   DiagnosticBuilder Diag(SourceLocation Loc, unsigned DiagID) {
316     return Diags.Report(Loc, DiagID);
317   }
318 
319   /// Eat string matching regexp \code \s*\* \endcode.
320   void skipLineStartingDecorations();
321 
322   /// Lex comment text, including commands if ParseCommands is set to true.
323   void lexCommentText(Token &T);
324 
325   void setupAndLexVerbatimBlock(Token &T, const char *TextBegin, char Marker,
326                                 const CommandInfo *Info);
327 
328   void lexVerbatimBlockFirstLine(Token &T);
329 
330   void lexVerbatimBlockBody(Token &T);
331 
332   void setupAndLexVerbatimLine(Token &T, const char *TextBegin,
333                                const CommandInfo *Info);
334 
335   void lexVerbatimLineText(Token &T);
336 
337   void lexHTMLCharacterReference(Token &T);
338 
339   void setupAndLexHTMLStartTag(Token &T);
340 
341   void lexHTMLStartTag(Token &T);
342 
343   void setupAndLexHTMLEndTag(Token &T);
344 
345   void lexHTMLEndTag(Token &T);
346 
347 public:
348   Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
349         const CommandTraits &Traits, SourceLocation FileLoc,
350         const char *BufferStart, const char *BufferEnd,
351         bool ParseCommands = true);
352 
353   void lex(Token &T);
354 
355   StringRef getSpelling(const Token &Tok, const SourceManager &SourceMgr) const;
356 };
357 
358 } // end namespace comments
359 } // end namespace clang
360 
361 #endif
362 
363