1 //===--- RawCommentList.cpp - Processing raw comments -----------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 
9 #include "clang/AST/RawCommentList.h"
10 #include "clang/AST/ASTContext.h"
11 #include "clang/AST/Comment.h"
12 #include "clang/AST/CommentBriefParser.h"
13 #include "clang/AST/CommentCommandTraits.h"
14 #include "clang/AST/CommentLexer.h"
15 #include "clang/AST/CommentParser.h"
16 #include "clang/AST/CommentSema.h"
17 #include "clang/Basic/CharInfo.h"
18 #include "llvm/ADT/STLExtras.h"
19 #include "llvm/ADT/StringExtras.h"
20 #include "llvm/Support/Allocator.h"
21 
22 using namespace clang;
23 
24 namespace {
25 /// Get comment kind and bool describing if it is a trailing comment.
26 std::pair<RawComment::CommentKind, bool> getCommentKind(StringRef Comment,
27                                                         bool ParseAllComments) {
28   const size_t MinCommentLength = ParseAllComments ? 2 : 3;
29   if ((Comment.size() < MinCommentLength) || Comment[0] != '/')
30     return std::make_pair(RawComment::RCK_Invalid, false);
31 
32   RawComment::CommentKind K;
33   if (Comment[1] == '/') {
34     if (Comment.size() < 3)
35       return std::make_pair(RawComment::RCK_OrdinaryBCPL, false);
36 
37     if (Comment[2] == '/')
38       K = RawComment::RCK_BCPLSlash;
39     else if (Comment[2] == '!')
40       K = RawComment::RCK_BCPLExcl;
41     else
42       return std::make_pair(RawComment::RCK_OrdinaryBCPL, false);
43   } else {
44     assert(Comment.size() >= 4);
45 
46     // Comment lexer does not understand escapes in comment markers, so pretend
47     // that this is not a comment.
48     if (Comment[1] != '*' ||
49         Comment[Comment.size() - 2] != '*' ||
50         Comment[Comment.size() - 1] != '/')
51       return std::make_pair(RawComment::RCK_Invalid, false);
52 
53     if (Comment[2] == '*')
54       K = RawComment::RCK_JavaDoc;
55     else if (Comment[2] == '!')
56       K = RawComment::RCK_Qt;
57     else
58       return std::make_pair(RawComment::RCK_OrdinaryC, false);
59   }
60   const bool TrailingComment = (Comment.size() > 3) && (Comment[3] == '<');
61   return std::make_pair(K, TrailingComment);
62 }
63 
64 bool mergedCommentIsTrailingComment(StringRef Comment) {
65   return (Comment.size() > 3) && (Comment[3] == '<');
66 }
67 
68 /// Returns true if R1 and R2 both have valid locations that start on the same
69 /// column.
70 bool commentsStartOnSameColumn(const SourceManager &SM, const RawComment &R1,
71                                const RawComment &R2) {
72   SourceLocation L1 = R1.getBeginLoc();
73   SourceLocation L2 = R2.getBeginLoc();
74   bool Invalid = false;
75   unsigned C1 = SM.getPresumedColumnNumber(L1, &Invalid);
76   if (!Invalid) {
77     unsigned C2 = SM.getPresumedColumnNumber(L2, &Invalid);
78     return !Invalid && (C1 == C2);
79   }
80   return false;
81 }
82 } // unnamed namespace
83 
84 /// Determines whether there is only whitespace in `Buffer` between `P`
85 /// and the previous line.
86 /// \param Buffer The buffer to search in.
87 /// \param P The offset from the beginning of `Buffer` to start from.
88 /// \return true if all of the characters in `Buffer` ranging from the closest
89 /// line-ending character before `P` (or the beginning of `Buffer`) to `P - 1`
90 /// are whitespace.
91 static bool onlyWhitespaceOnLineBefore(const char *Buffer, unsigned P) {
92   // Search backwards until we see linefeed or carriage return.
93   for (unsigned I = P; I != 0; --I) {
94     char C = Buffer[I - 1];
95     if (isVerticalWhitespace(C))
96       return true;
97     if (!isHorizontalWhitespace(C))
98       return false;
99   }
100   // We hit the beginning of the buffer.
101   return true;
102 }
103 
104 /// Returns whether `K` is an ordinary comment kind.
105 static bool isOrdinaryKind(RawComment::CommentKind K) {
106   return (K == RawComment::RCK_OrdinaryBCPL) ||
107          (K == RawComment::RCK_OrdinaryC);
108 }
109 
110 RawComment::RawComment(const SourceManager &SourceMgr, SourceRange SR,
111                        const CommentOptions &CommentOpts, bool Merged) :
112     Range(SR), RawTextValid(false), BriefTextValid(false),
113     IsAttached(false), IsTrailingComment(false),
114     IsAlmostTrailingComment(false) {
115   // Extract raw comment text, if possible.
116   if (SR.getBegin() == SR.getEnd() || getRawText(SourceMgr).empty()) {
117     Kind = RCK_Invalid;
118     return;
119   }
120 
121   // Guess comment kind.
122   std::pair<CommentKind, bool> K =
123       getCommentKind(RawText, CommentOpts.ParseAllComments);
124 
125   // Guess whether an ordinary comment is trailing.
126   if (CommentOpts.ParseAllComments && isOrdinaryKind(K.first)) {
127     FileID BeginFileID;
128     unsigned BeginOffset;
129     std::tie(BeginFileID, BeginOffset) =
130         SourceMgr.getDecomposedLoc(Range.getBegin());
131     if (BeginOffset != 0) {
132       bool Invalid = false;
133       const char *Buffer =
134           SourceMgr.getBufferData(BeginFileID, &Invalid).data();
135       IsTrailingComment |=
136           (!Invalid && !onlyWhitespaceOnLineBefore(Buffer, BeginOffset));
137     }
138   }
139 
140   if (!Merged) {
141     Kind = K.first;
142     IsTrailingComment |= K.second;
143 
144     IsAlmostTrailingComment = RawText.startswith("//<") ||
145                                  RawText.startswith("/*<");
146   } else {
147     Kind = RCK_Merged;
148     IsTrailingComment =
149         IsTrailingComment || mergedCommentIsTrailingComment(RawText);
150   }
151 }
152 
153 StringRef RawComment::getRawTextSlow(const SourceManager &SourceMgr) const {
154   FileID BeginFileID;
155   FileID EndFileID;
156   unsigned BeginOffset;
157   unsigned EndOffset;
158 
159   std::tie(BeginFileID, BeginOffset) =
160       SourceMgr.getDecomposedLoc(Range.getBegin());
161   std::tie(EndFileID, EndOffset) = SourceMgr.getDecomposedLoc(Range.getEnd());
162 
163   const unsigned Length = EndOffset - BeginOffset;
164   if (Length < 2)
165     return StringRef();
166 
167   // The comment can't begin in one file and end in another.
168   assert(BeginFileID == EndFileID);
169 
170   bool Invalid = false;
171   const char *BufferStart = SourceMgr.getBufferData(BeginFileID,
172                                                     &Invalid).data();
173   if (Invalid)
174     return StringRef();
175 
176   return StringRef(BufferStart + BeginOffset, Length);
177 }
178 
179 const char *RawComment::extractBriefText(const ASTContext &Context) const {
180   // Lazily initialize RawText using the accessor before using it.
181   (void)getRawText(Context.getSourceManager());
182 
183   // Since we will be copying the resulting text, all allocations made during
184   // parsing are garbage after resulting string is formed.  Thus we can use
185   // a separate allocator for all temporary stuff.
186   llvm::BumpPtrAllocator Allocator;
187 
188   comments::Lexer L(Allocator, Context.getDiagnostics(),
189                     Context.getCommentCommandTraits(),
190                     Range.getBegin(),
191                     RawText.begin(), RawText.end());
192   comments::BriefParser P(L, Context.getCommentCommandTraits());
193 
194   const std::string Result = P.Parse();
195   const unsigned BriefTextLength = Result.size();
196   char *BriefTextPtr = new (Context) char[BriefTextLength + 1];
197   memcpy(BriefTextPtr, Result.c_str(), BriefTextLength + 1);
198   BriefText = BriefTextPtr;
199   BriefTextValid = true;
200 
201   return BriefTextPtr;
202 }
203 
204 comments::FullComment *RawComment::parse(const ASTContext &Context,
205                                          const Preprocessor *PP,
206                                          const Decl *D) const {
207   // Lazily initialize RawText using the accessor before using it.
208   (void)getRawText(Context.getSourceManager());
209 
210   comments::Lexer L(Context.getAllocator(), Context.getDiagnostics(),
211                     Context.getCommentCommandTraits(),
212                     getSourceRange().getBegin(),
213                     RawText.begin(), RawText.end());
214   comments::Sema S(Context.getAllocator(), Context.getSourceManager(),
215                    Context.getDiagnostics(),
216                    Context.getCommentCommandTraits(),
217                    PP);
218   S.setDecl(D);
219   comments::Parser P(L, S, Context.getAllocator(), Context.getSourceManager(),
220                      Context.getDiagnostics(),
221                      Context.getCommentCommandTraits());
222 
223   return P.parseFullComment();
224 }
225 
226 static bool onlyWhitespaceBetween(SourceManager &SM,
227                                   SourceLocation Loc1, SourceLocation Loc2,
228                                   unsigned MaxNewlinesAllowed) {
229   std::pair<FileID, unsigned> Loc1Info = SM.getDecomposedLoc(Loc1);
230   std::pair<FileID, unsigned> Loc2Info = SM.getDecomposedLoc(Loc2);
231 
232   // Question does not make sense if locations are in different files.
233   if (Loc1Info.first != Loc2Info.first)
234     return false;
235 
236   bool Invalid = false;
237   const char *Buffer = SM.getBufferData(Loc1Info.first, &Invalid).data();
238   if (Invalid)
239     return false;
240 
241   unsigned NumNewlines = 0;
242   assert(Loc1Info.second <= Loc2Info.second && "Loc1 after Loc2!");
243   // Look for non-whitespace characters and remember any newlines seen.
244   for (unsigned I = Loc1Info.second; I != Loc2Info.second; ++I) {
245     switch (Buffer[I]) {
246     default:
247       return false;
248     case ' ':
249     case '\t':
250     case '\f':
251     case '\v':
252       break;
253     case '\r':
254     case '\n':
255       ++NumNewlines;
256 
257       // Check if we have found more than the maximum allowed number of
258       // newlines.
259       if (NumNewlines > MaxNewlinesAllowed)
260         return false;
261 
262       // Collapse \r\n and \n\r into a single newline.
263       if (I + 1 != Loc2Info.second &&
264           (Buffer[I + 1] == '\n' || Buffer[I + 1] == '\r') &&
265           Buffer[I] != Buffer[I + 1])
266         ++I;
267       break;
268     }
269   }
270 
271   return true;
272 }
273 
274 void RawCommentList::addComment(const RawComment &RC,
275                                 const CommentOptions &CommentOpts,
276                                 llvm::BumpPtrAllocator &Allocator) {
277   if (RC.isInvalid())
278     return;
279 
280   // Ordinary comments are not interesting for us.
281   if (RC.isOrdinary() && !CommentOpts.ParseAllComments)
282     return;
283 
284   std::pair<FileID, unsigned> Loc =
285       SourceMgr.getDecomposedLoc(RC.getBeginLoc());
286 
287   const FileID CommentFile = Loc.first;
288   const unsigned CommentOffset = Loc.second;
289 
290   // If this is the first Doxygen comment, save it (because there isn't
291   // anything to merge it with).
292   if (OrderedComments[CommentFile].empty()) {
293     OrderedComments[CommentFile][CommentOffset] =
294         new (Allocator) RawComment(RC);
295     return;
296   }
297 
298   const RawComment &C1 = *OrderedComments[CommentFile].rbegin()->second;
299   const RawComment &C2 = RC;
300 
301   // Merge comments only if there is only whitespace between them.
302   // Can't merge trailing and non-trailing comments unless the second is
303   // non-trailing ordinary in the same column, as in the case:
304   //   int x; // documents x
305   //          // more text
306   // versus:
307   //   int x; // documents x
308   //   int y; // documents y
309   // or:
310   //   int x; // documents x
311   //   // documents y
312   //   int y;
313   // Merge comments if they are on same or consecutive lines.
314   if ((C1.isTrailingComment() == C2.isTrailingComment() ||
315        (C1.isTrailingComment() && !C2.isTrailingComment() &&
316         isOrdinaryKind(C2.getKind()) &&
317         commentsStartOnSameColumn(SourceMgr, C1, C2))) &&
318       onlyWhitespaceBetween(SourceMgr, C1.getEndLoc(), C2.getBeginLoc(),
319                             /*MaxNewlinesAllowed=*/1)) {
320     SourceRange MergedRange(C1.getBeginLoc(), C2.getEndLoc());
321     *OrderedComments[CommentFile].rbegin()->second =
322         RawComment(SourceMgr, MergedRange, CommentOpts, true);
323   } else {
324     OrderedComments[CommentFile][CommentOffset] =
325         new (Allocator) RawComment(RC);
326   }
327 }
328 
329 const std::map<unsigned, RawComment *> *
330 RawCommentList::getCommentsInFile(FileID File) const {
331   auto CommentsInFile = OrderedComments.find(File);
332   if (CommentsInFile == OrderedComments.end())
333     return nullptr;
334 
335   return &CommentsInFile->second;
336 }
337 
/// Returns true when OrderedComments holds no per-file entry at all.
bool RawCommentList::empty() const { return OrderedComments.empty(); }
339 
340 unsigned RawCommentList::getCommentBeginLine(RawComment *C, FileID File,
341                                              unsigned Offset) const {
342   auto Cached = CommentBeginLine.find(C);
343   if (Cached != CommentBeginLine.end())
344     return Cached->second;
345   const unsigned Line = SourceMgr.getLineNumber(File, Offset);
346   CommentBeginLine[C] = Line;
347   return Line;
348 }
349 
350 unsigned RawCommentList::getCommentEndOffset(RawComment *C) const {
351   auto Cached = CommentEndOffset.find(C);
352   if (Cached != CommentEndOffset.end())
353     return Cached->second;
354   const unsigned Offset =
355       SourceMgr.getDecomposedLoc(C->getSourceRange().getEnd()).second;
356   CommentEndOffset[C] = Offset;
357   return Offset;
358 }
359 
360 std::string RawComment::getFormattedText(const SourceManager &SourceMgr,
361                                          DiagnosticsEngine &Diags) const {
362   llvm::StringRef CommentText = getRawText(SourceMgr);
363   if (CommentText.empty())
364     return "";
365 
366   std::string Result;
367   for (const RawComment::CommentLine &Line :
368        getFormattedLines(SourceMgr, Diags))
369     Result += Line.Text + "\n";
370 
371   auto LastChar = Result.find_last_not_of('\n');
372   Result.erase(LastChar + 1, Result.size());
373 
374   return Result;
375 }
376 
/// Splits the comment into lines with comment markers and common indentation
/// removed.
///
/// The raw text is run through comments::Lexer with command parsing
/// disabled. Each resulting CommentLine is built from the line's text plus
/// the presumed locations of its (indent-adjusted) beginning and of its end.
/// All leading spaces and tabs are dropped from the first line; on later
/// lines they are dropped only up to the column where the first line's text
/// started.
///
/// \param SourceMgr Used to obtain the raw text, token spellings and
/// presumed/spelling locations.
/// \param Diags Diagnostics engine handed to the lexer.
/// \returns The formatted lines, or an empty vector when the comment has no
/// raw text.
std::vector<RawComment::CommentLine>
RawComment::getFormattedLines(const SourceManager &SourceMgr,
                              DiagnosticsEngine &Diags) const {
  llvm::StringRef CommentText = getRawText(SourceMgr);
  if (CommentText.empty())
    return {};

  llvm::BumpPtrAllocator Allocator;
  // We do not parse any commands, so CommentOptions are ignored by
  // comments::Lexer. Therefore, we just use default-constructed options.
  CommentOptions DefOpts;
  comments::CommandTraits EmptyTraits(Allocator, DefOpts);
  comments::Lexer L(Allocator, Diags, EmptyTraits, getSourceRange().getBegin(),
                    CommentText.begin(), CommentText.end(),
                    /*ParseCommands=*/false);

  std::vector<RawComment::CommentLine> Result;
  // A column number of the first non-whitespace token in the comment text.
  // We skip whitespace up to this column, but keep the whitespace after this
  // column. IndentColumn is calculated when lexing the first line and reused
  // for the rest of lines.
  unsigned IndentColumn = 0;

  // Record the line number of the last processed comment line.
  // For block-style comments, an extra newline token will be produced after
  // the end-comment marker, e.g.:
  //   /** This is a multi-line comment block.
  //       The lexer will produce two newline tokens here > */
  // PreviousLine will record the line number when we previously saw a newline
  // token and recorded a comment line. If we see another newline token on the
  // same line, don't record anything in between.
  unsigned PreviousLine = 0;

  // Processes one line of the comment and adds it to the result.
  // Handles skipping the indent at the start of the line.
  // Returns false when eof is reached and true otherwise.
  auto LexLine = [&](bool IsFirstLine) -> bool {
    comments::Token Tok;
    // Lex the first token on the line. We handle it separately, because we
    // need to fix up its indentation.
    L.lex(Tok);
    if (Tok.is(comments::tok::eof))
      return false;
    if (Tok.is(comments::tok::newline)) {
      // The line has no content. Record it as an empty line unless a line
      // was already recorded for this source line.
      PresumedLoc Loc = SourceMgr.getPresumedLoc(Tok.getLocation());
      if (Loc.getLine() != PreviousLine) {
        Result.emplace_back("", Loc, Loc);
        PreviousLine = Loc.getLine();
      }
      return true;
    }
    SmallString<124> Line;
    llvm::StringRef TokText = L.getSpelling(Tok, SourceMgr);
    bool LocInvalid = false;
    unsigned TokColumn =
        SourceMgr.getSpellingColumnNumber(Tok.getLocation(), &LocInvalid);
    assert(!LocInvalid && "getFormattedText for invalid location");

    // Amount of leading whitespace in TokText.
    size_t WhitespaceLen = TokText.find_first_not_of(" \t");
    if (WhitespaceLen == StringRef::npos)
      WhitespaceLen = TokText.size();
    // Remember the amount of whitespace we skipped in the first line to remove
    // indent up to that column in the following lines.
    if (IsFirstLine)
      IndentColumn = TokColumn + WhitespaceLen;

    // Amount of leading whitespace we actually want to skip.
    // For the first line we skip all the whitespace.
    // For the rest of the lines, we skip whitespace up to IndentColumn.
    unsigned SkipLen =
        IsFirstLine
            ? WhitespaceLen
            : std::min<size_t>(
                  WhitespaceLen,
                  std::max<int>(static_cast<int>(IndentColumn) - TokColumn, 0));
    llvm::StringRef Trimmed = TokText.drop_front(SkipLen);
    Line += Trimmed;
    // Get the beginning location of the adjusted comment line.
    PresumedLoc Begin =
        SourceMgr.getPresumedLoc(Tok.getLocation().getLocWithOffset(SkipLen));

    // Lex all tokens in the rest of the line.
    for (L.lex(Tok); Tok.isNot(comments::tok::eof); L.lex(Tok)) {
      if (Tok.is(comments::tok::newline)) {
        // Get the ending location of the comment line.
        PresumedLoc End = SourceMgr.getPresumedLoc(Tok.getLocation());
        if (End.getLine() != PreviousLine) {
          Result.emplace_back(Line, Begin, End);
          PreviousLine = End.getLine();
        }
        return true;
      }
      Line += L.getSpelling(Tok, SourceMgr);
    }
    // The text ended without a final newline token; record what was
    // collected, using the eof token's location as the end of the line.
    PresumedLoc End = SourceMgr.getPresumedLoc(Tok.getLocation());
    Result.emplace_back(Line, Begin, End);
    // We've reached the end of file token.
    return false;
  };

  // Process first line separately to remember indent for the following lines.
  if (!LexLine(/*IsFirstLine=*/true))
    return Result;
  // Process the rest of the lines.
  while (LexLine(/*IsFirstLine=*/false))
    ;
  return Result;
}
486