1 //===--- RawCommentList.cpp - Processing raw comments -----------*- C++ -*-===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8
9 #include "clang/AST/RawCommentList.h"
10 #include "clang/AST/ASTContext.h"
11 #include "clang/AST/Comment.h"
12 #include "clang/AST/CommentBriefParser.h"
13 #include "clang/AST/CommentCommandTraits.h"
14 #include "clang/AST/CommentLexer.h"
15 #include "clang/AST/CommentParser.h"
16 #include "clang/AST/CommentSema.h"
17 #include "clang/Basic/CharInfo.h"
18 #include "llvm/ADT/STLExtras.h"
19
20 using namespace clang;
21
22 namespace {
23 /// Get comment kind and bool describing if it is a trailing comment.
getCommentKind(StringRef Comment,bool ParseAllComments)24 std::pair<RawComment::CommentKind, bool> getCommentKind(StringRef Comment,
25 bool ParseAllComments) {
26 const size_t MinCommentLength = ParseAllComments ? 2 : 3;
27 if ((Comment.size() < MinCommentLength) || Comment[0] != '/')
28 return std::make_pair(RawComment::RCK_Invalid, false);
29
30 RawComment::CommentKind K;
31 if (Comment[1] == '/') {
32 if (Comment.size() < 3)
33 return std::make_pair(RawComment::RCK_OrdinaryBCPL, false);
34
35 if (Comment[2] == '/')
36 K = RawComment::RCK_BCPLSlash;
37 else if (Comment[2] == '!')
38 K = RawComment::RCK_BCPLExcl;
39 else
40 return std::make_pair(RawComment::RCK_OrdinaryBCPL, false);
41 } else {
42 assert(Comment.size() >= 4);
43
44 // Comment lexer does not understand escapes in comment markers, so pretend
45 // that this is not a comment.
46 if (Comment[1] != '*' ||
47 Comment[Comment.size() - 2] != '*' ||
48 Comment[Comment.size() - 1] != '/')
49 return std::make_pair(RawComment::RCK_Invalid, false);
50
51 if (Comment[2] == '*')
52 K = RawComment::RCK_JavaDoc;
53 else if (Comment[2] == '!')
54 K = RawComment::RCK_Qt;
55 else
56 return std::make_pair(RawComment::RCK_OrdinaryC, false);
57 }
58 const bool TrailingComment = (Comment.size() > 3) && (Comment[3] == '<');
59 return std::make_pair(K, TrailingComment);
60 }
61
mergedCommentIsTrailingComment(StringRef Comment)62 bool mergedCommentIsTrailingComment(StringRef Comment) {
63 return (Comment.size() > 3) && (Comment[3] == '<');
64 }
65
66 /// Returns true if R1 and R2 both have valid locations that start on the same
67 /// column.
commentsStartOnSameColumn(const SourceManager & SM,const RawComment & R1,const RawComment & R2)68 bool commentsStartOnSameColumn(const SourceManager &SM, const RawComment &R1,
69 const RawComment &R2) {
70 SourceLocation L1 = R1.getBeginLoc();
71 SourceLocation L2 = R2.getBeginLoc();
72 bool Invalid = false;
73 unsigned C1 = SM.getPresumedColumnNumber(L1, &Invalid);
74 if (!Invalid) {
75 unsigned C2 = SM.getPresumedColumnNumber(L2, &Invalid);
76 return !Invalid && (C1 == C2);
77 }
78 return false;
79 }
80 } // unnamed namespace
81
82 /// Determines whether there is only whitespace in `Buffer` between `P`
83 /// and the previous line.
84 /// \param Buffer The buffer to search in.
85 /// \param P The offset from the beginning of `Buffer` to start from.
86 /// \return true if all of the characters in `Buffer` ranging from the closest
87 /// line-ending character before `P` (or the beginning of `Buffer`) to `P - 1`
88 /// are whitespace.
onlyWhitespaceOnLineBefore(const char * Buffer,unsigned P)89 static bool onlyWhitespaceOnLineBefore(const char *Buffer, unsigned P) {
90 // Search backwards until we see linefeed or carriage return.
91 for (unsigned I = P; I != 0; --I) {
92 char C = Buffer[I - 1];
93 if (isVerticalWhitespace(C))
94 return true;
95 if (!isHorizontalWhitespace(C))
96 return false;
97 }
98 // We hit the beginning of the buffer.
99 return true;
100 }
101
102 /// Returns whether `K` is an ordinary comment kind.
isOrdinaryKind(RawComment::CommentKind K)103 static bool isOrdinaryKind(RawComment::CommentKind K) {
104 return (K == RawComment::RCK_OrdinaryBCPL) ||
105 (K == RawComment::RCK_OrdinaryC);
106 }
107
RawComment(const SourceManager & SourceMgr,SourceRange SR,const CommentOptions & CommentOpts,bool Merged)108 RawComment::RawComment(const SourceManager &SourceMgr, SourceRange SR,
109 const CommentOptions &CommentOpts, bool Merged) :
110 Range(SR), RawTextValid(false), BriefTextValid(false),
111 IsAttached(false), IsTrailingComment(false),
112 IsAlmostTrailingComment(false) {
113 // Extract raw comment text, if possible.
114 if (SR.getBegin() == SR.getEnd() || getRawText(SourceMgr).empty()) {
115 Kind = RCK_Invalid;
116 return;
117 }
118
119 // Guess comment kind.
120 std::pair<CommentKind, bool> K =
121 getCommentKind(RawText, CommentOpts.ParseAllComments);
122
123 // Guess whether an ordinary comment is trailing.
124 if (CommentOpts.ParseAllComments && isOrdinaryKind(K.first)) {
125 FileID BeginFileID;
126 unsigned BeginOffset;
127 std::tie(BeginFileID, BeginOffset) =
128 SourceMgr.getDecomposedLoc(Range.getBegin());
129 if (BeginOffset != 0) {
130 bool Invalid = false;
131 const char *Buffer =
132 SourceMgr.getBufferData(BeginFileID, &Invalid).data();
133 IsTrailingComment |=
134 (!Invalid && !onlyWhitespaceOnLineBefore(Buffer, BeginOffset));
135 }
136 }
137
138 if (!Merged) {
139 Kind = K.first;
140 IsTrailingComment |= K.second;
141
142 IsAlmostTrailingComment = RawText.startswith("//<") ||
143 RawText.startswith("/*<");
144 } else {
145 Kind = RCK_Merged;
146 IsTrailingComment =
147 IsTrailingComment || mergedCommentIsTrailingComment(RawText);
148 }
149 }
150
getRawTextSlow(const SourceManager & SourceMgr) const151 StringRef RawComment::getRawTextSlow(const SourceManager &SourceMgr) const {
152 FileID BeginFileID;
153 FileID EndFileID;
154 unsigned BeginOffset;
155 unsigned EndOffset;
156
157 std::tie(BeginFileID, BeginOffset) =
158 SourceMgr.getDecomposedLoc(Range.getBegin());
159 std::tie(EndFileID, EndOffset) = SourceMgr.getDecomposedLoc(Range.getEnd());
160
161 const unsigned Length = EndOffset - BeginOffset;
162 if (Length < 2)
163 return StringRef();
164
165 // The comment can't begin in one file and end in another.
166 assert(BeginFileID == EndFileID);
167
168 bool Invalid = false;
169 const char *BufferStart = SourceMgr.getBufferData(BeginFileID,
170 &Invalid).data();
171 if (Invalid)
172 return StringRef();
173
174 return StringRef(BufferStart + BeginOffset, Length);
175 }
176
extractBriefText(const ASTContext & Context) const177 const char *RawComment::extractBriefText(const ASTContext &Context) const {
178 // Lazily initialize RawText using the accessor before using it.
179 (void)getRawText(Context.getSourceManager());
180
181 // Since we will be copying the resulting text, all allocations made during
182 // parsing are garbage after resulting string is formed. Thus we can use
183 // a separate allocator for all temporary stuff.
184 llvm::BumpPtrAllocator Allocator;
185
186 comments::Lexer L(Allocator, Context.getDiagnostics(),
187 Context.getCommentCommandTraits(),
188 Range.getBegin(),
189 RawText.begin(), RawText.end());
190 comments::BriefParser P(L, Context.getCommentCommandTraits());
191
192 const std::string Result = P.Parse();
193 const unsigned BriefTextLength = Result.size();
194 char *BriefTextPtr = new (Context) char[BriefTextLength + 1];
195 memcpy(BriefTextPtr, Result.c_str(), BriefTextLength + 1);
196 BriefText = BriefTextPtr;
197 BriefTextValid = true;
198
199 return BriefTextPtr;
200 }
201
parse(const ASTContext & Context,const Preprocessor * PP,const Decl * D) const202 comments::FullComment *RawComment::parse(const ASTContext &Context,
203 const Preprocessor *PP,
204 const Decl *D) const {
205 // Lazily initialize RawText using the accessor before using it.
206 (void)getRawText(Context.getSourceManager());
207
208 comments::Lexer L(Context.getAllocator(), Context.getDiagnostics(),
209 Context.getCommentCommandTraits(),
210 getSourceRange().getBegin(),
211 RawText.begin(), RawText.end());
212 comments::Sema S(Context.getAllocator(), Context.getSourceManager(),
213 Context.getDiagnostics(),
214 Context.getCommentCommandTraits(),
215 PP);
216 S.setDecl(D);
217 comments::Parser P(L, S, Context.getAllocator(), Context.getSourceManager(),
218 Context.getDiagnostics(),
219 Context.getCommentCommandTraits());
220
221 return P.parseFullComment();
222 }
223
onlyWhitespaceBetween(SourceManager & SM,SourceLocation Loc1,SourceLocation Loc2,unsigned MaxNewlinesAllowed)224 static bool onlyWhitespaceBetween(SourceManager &SM,
225 SourceLocation Loc1, SourceLocation Loc2,
226 unsigned MaxNewlinesAllowed) {
227 std::pair<FileID, unsigned> Loc1Info = SM.getDecomposedLoc(Loc1);
228 std::pair<FileID, unsigned> Loc2Info = SM.getDecomposedLoc(Loc2);
229
230 // Question does not make sense if locations are in different files.
231 if (Loc1Info.first != Loc2Info.first)
232 return false;
233
234 bool Invalid = false;
235 const char *Buffer = SM.getBufferData(Loc1Info.first, &Invalid).data();
236 if (Invalid)
237 return false;
238
239 unsigned NumNewlines = 0;
240 assert(Loc1Info.second <= Loc2Info.second && "Loc1 after Loc2!");
241 // Look for non-whitespace characters and remember any newlines seen.
242 for (unsigned I = Loc1Info.second; I != Loc2Info.second; ++I) {
243 switch (Buffer[I]) {
244 default:
245 return false;
246 case ' ':
247 case '\t':
248 case '\f':
249 case '\v':
250 break;
251 case '\r':
252 case '\n':
253 ++NumNewlines;
254
255 // Check if we have found more than the maximum allowed number of
256 // newlines.
257 if (NumNewlines > MaxNewlinesAllowed)
258 return false;
259
260 // Collapse \r\n and \n\r into a single newline.
261 if (I + 1 != Loc2Info.second &&
262 (Buffer[I + 1] == '\n' || Buffer[I + 1] == '\r') &&
263 Buffer[I] != Buffer[I + 1])
264 ++I;
265 break;
266 }
267 }
268
269 return true;
270 }
271
addComment(const RawComment & RC,const CommentOptions & CommentOpts,llvm::BumpPtrAllocator & Allocator)272 void RawCommentList::addComment(const RawComment &RC,
273 const CommentOptions &CommentOpts,
274 llvm::BumpPtrAllocator &Allocator) {
275 if (RC.isInvalid())
276 return;
277
278 // Check if the comments are not in source order.
279 while (!Comments.empty() &&
280 !SourceMgr.isBeforeInTranslationUnit(Comments.back()->getBeginLoc(),
281 RC.getBeginLoc())) {
282 // If they are, just pop a few last comments that don't fit.
283 // This happens if an \#include directive contains comments.
284 Comments.pop_back();
285 }
286
287 // Ordinary comments are not interesting for us.
288 if (RC.isOrdinary() && !CommentOpts.ParseAllComments)
289 return;
290
291 // If this is the first Doxygen comment, save it (because there isn't
292 // anything to merge it with).
293 if (Comments.empty()) {
294 Comments.push_back(new (Allocator) RawComment(RC));
295 return;
296 }
297
298 const RawComment &C1 = *Comments.back();
299 const RawComment &C2 = RC;
300
301 // Merge comments only if there is only whitespace between them.
302 // Can't merge trailing and non-trailing comments unless the second is
303 // non-trailing ordinary in the same column, as in the case:
304 // int x; // documents x
305 // // more text
306 // versus:
307 // int x; // documents x
308 // int y; // documents y
309 // or:
310 // int x; // documents x
311 // // documents y
312 // int y;
313 // Merge comments if they are on same or consecutive lines.
314 if ((C1.isTrailingComment() == C2.isTrailingComment() ||
315 (C1.isTrailingComment() && !C2.isTrailingComment() &&
316 isOrdinaryKind(C2.getKind()) &&
317 commentsStartOnSameColumn(SourceMgr, C1, C2))) &&
318 onlyWhitespaceBetween(SourceMgr, C1.getEndLoc(), C2.getBeginLoc(),
319 /*MaxNewlinesAllowed=*/1)) {
320 SourceRange MergedRange(C1.getBeginLoc(), C2.getEndLoc());
321 *Comments.back() = RawComment(SourceMgr, MergedRange, CommentOpts, true);
322 } else {
323 Comments.push_back(new (Allocator) RawComment(RC));
324 }
325 }
326
addDeserializedComments(ArrayRef<RawComment * > DeserializedComments)327 void RawCommentList::addDeserializedComments(ArrayRef<RawComment *> DeserializedComments) {
328 std::vector<RawComment *> MergedComments;
329 MergedComments.reserve(Comments.size() + DeserializedComments.size());
330
331 std::merge(Comments.begin(), Comments.end(),
332 DeserializedComments.begin(), DeserializedComments.end(),
333 std::back_inserter(MergedComments),
334 BeforeThanCompare<RawComment>(SourceMgr));
335 std::swap(Comments, MergedComments);
336 }
337
getFormattedText(const SourceManager & SourceMgr,DiagnosticsEngine & Diags) const338 std::string RawComment::getFormattedText(const SourceManager &SourceMgr,
339 DiagnosticsEngine &Diags) const {
340 llvm::StringRef CommentText = getRawText(SourceMgr);
341 if (CommentText.empty())
342 return "";
343
344 llvm::BumpPtrAllocator Allocator;
345 // We do not parse any commands, so CommentOptions are ignored by
346 // comments::Lexer. Therefore, we just use default-constructed options.
347 CommentOptions DefOpts;
348 comments::CommandTraits EmptyTraits(Allocator, DefOpts);
349 comments::Lexer L(Allocator, Diags, EmptyTraits, getSourceRange().getBegin(),
350 CommentText.begin(), CommentText.end(),
351 /*ParseCommands=*/false);
352
353 std::string Result;
354 // A column number of the first non-whitespace token in the comment text.
355 // We skip whitespace up to this column, but keep the whitespace after this
356 // column. IndentColumn is calculated when lexing the first line and reused
357 // for the rest of lines.
358 unsigned IndentColumn = 0;
359
360 // Processes one line of the comment and adds it to the result.
361 // Handles skipping the indent at the start of the line.
362 // Returns false when eof is reached and true otherwise.
363 auto LexLine = [&](bool IsFirstLine) -> bool {
364 comments::Token Tok;
365 // Lex the first token on the line. We handle it separately, because we to
366 // fix up its indentation.
367 L.lex(Tok);
368 if (Tok.is(comments::tok::eof))
369 return false;
370 if (Tok.is(comments::tok::newline)) {
371 Result += "\n";
372 return true;
373 }
374 llvm::StringRef TokText = L.getSpelling(Tok, SourceMgr);
375 bool LocInvalid = false;
376 unsigned TokColumn =
377 SourceMgr.getSpellingColumnNumber(Tok.getLocation(), &LocInvalid);
378 assert(!LocInvalid && "getFormattedText for invalid location");
379
380 // Amount of leading whitespace in TokText.
381 size_t WhitespaceLen = TokText.find_first_not_of(" \t");
382 if (WhitespaceLen == StringRef::npos)
383 WhitespaceLen = TokText.size();
384 // Remember the amount of whitespace we skipped in the first line to remove
385 // indent up to that column in the following lines.
386 if (IsFirstLine)
387 IndentColumn = TokColumn + WhitespaceLen;
388
389 // Amount of leading whitespace we actually want to skip.
390 // For the first line we skip all the whitespace.
391 // For the rest of the lines, we skip whitespace up to IndentColumn.
392 unsigned SkipLen =
393 IsFirstLine
394 ? WhitespaceLen
395 : std::min<size_t>(
396 WhitespaceLen,
397 std::max<int>(static_cast<int>(IndentColumn) - TokColumn, 0));
398 llvm::StringRef Trimmed = TokText.drop_front(SkipLen);
399 Result += Trimmed;
400 // Lex all tokens in the rest of the line.
401 for (L.lex(Tok); Tok.isNot(comments::tok::eof); L.lex(Tok)) {
402 if (Tok.is(comments::tok::newline)) {
403 Result += "\n";
404 return true;
405 }
406 Result += L.getSpelling(Tok, SourceMgr);
407 }
408 // We've reached the end of file token.
409 return false;
410 };
411
412 auto DropTrailingNewLines = [](std::string &Str) {
413 while (Str.back() == '\n')
414 Str.pop_back();
415 };
416
417 // Process first line separately to remember indent for the following lines.
418 if (!LexLine(/*IsFirstLine=*/true)) {
419 DropTrailingNewLines(Result);
420 return Result;
421 }
422 // Process the rest of the lines.
423 while (LexLine(/*IsFirstLine=*/false))
424 ;
425 DropTrailingNewLines(Result);
426 return Result;
427 }
428