1 #include "clang/AST/CommentLexer.h"
2 #include "clang/AST/CommentCommandTraits.h"
3 #include "clang/AST/CommentDiagnostic.h"
4 #include "clang/Basic/CharInfo.h"
5 #include "llvm/ADT/StringExtras.h"
6 #include "llvm/ADT/StringSwitch.h"
7 #include "llvm/Support/ConvertUTF.h"
8 #include "llvm/Support/ErrorHandling.h"
9 
10 namespace clang {
11 namespace comments {
12 
dump(const Lexer & L,const SourceManager & SM) const13 void Token::dump(const Lexer &L, const SourceManager &SM) const {
14   llvm::errs() << "comments::Token Kind=" << Kind << " ";
15   Loc.dump(SM);
16   llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
17 }
18 
/// Returns true if \p C may appear in the name of a named HTML character
/// reference (the text between '&' and ';'), as decided by isLetter().
static inline bool isHTMLNamedCharacterReferenceCharacter(char C) {
  return isLetter(C);
}
22 
/// Returns true if \p C may appear in the numeric part of a decimal HTML
/// character reference ("&#NNN;"), as decided by isDigit().
static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) {
  return isDigit(C);
}
26 
/// Returns true if \p C may appear in the numeric part of a hexadecimal HTML
/// character reference ("&#xHHH;"), as decided by isHexDigit().
static inline bool isHTMLHexCharacterReferenceCharacter(char C) {
  return isHexDigit(C);
}
30 
convertCodePointToUTF8(llvm::BumpPtrAllocator & Allocator,unsigned CodePoint)31 static inline StringRef convertCodePointToUTF8(
32                                       llvm::BumpPtrAllocator &Allocator,
33                                       unsigned CodePoint) {
34   char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
35   char *ResolvedPtr = Resolved;
36   if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
37     return StringRef(Resolved, ResolvedPtr - Resolved);
38   else
39     return StringRef();
40 }
41 
42 namespace {
43 
44 #include "clang/AST/CommentHTMLTags.inc"
45 #include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
46 
47 } // unnamed namespace
48 
resolveHTMLNamedCharacterReference(StringRef Name) const49 StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
50   // Fast path, first check a few most widely used named character references.
51   return llvm::StringSwitch<StringRef>(Name)
52       .Case("amp", "&")
53       .Case("lt", "<")
54       .Case("gt", ">")
55       .Case("quot", "\"")
56       .Case("apos", "\'")
57       // Slow path.
58       .Default(translateHTMLNamedCharacterReferenceToUTF8(Name));
59 }
60 
resolveHTMLDecimalCharacterReference(StringRef Name) const61 StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
62   unsigned CodePoint = 0;
63   for (unsigned i = 0, e = Name.size(); i != e; ++i) {
64     assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
65     CodePoint *= 10;
66     CodePoint += Name[i] - '0';
67   }
68   return convertCodePointToUTF8(Allocator, CodePoint);
69 }
70 
resolveHTMLHexCharacterReference(StringRef Name) const71 StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
72   unsigned CodePoint = 0;
73   for (unsigned i = 0, e = Name.size(); i != e; ++i) {
74     CodePoint *= 16;
75     const char C = Name[i];
76     assert(isHTMLHexCharacterReferenceCharacter(C));
77     CodePoint += llvm::hexDigitValue(C);
78   }
79   return convertCodePointToUTF8(Allocator, CodePoint);
80 }
81 
skipLineStartingDecorations()82 void Lexer::skipLineStartingDecorations() {
83   // This function should be called only for C comments
84   assert(CommentState == LCS_InsideCComment);
85 
86   if (BufferPtr == CommentEnd)
87     return;
88 
89   switch (*BufferPtr) {
90   case ' ':
91   case '\t':
92   case '\f':
93   case '\v': {
94     const char *NewBufferPtr = BufferPtr;
95     NewBufferPtr++;
96     if (NewBufferPtr == CommentEnd)
97       return;
98 
99     char C = *NewBufferPtr;
100     while (isHorizontalWhitespace(C)) {
101       NewBufferPtr++;
102       if (NewBufferPtr == CommentEnd)
103         return;
104       C = *NewBufferPtr;
105     }
106     if (C == '*')
107       BufferPtr = NewBufferPtr + 1;
108     break;
109   }
110   case '*':
111     BufferPtr++;
112     break;
113   }
114 }
115 
116 namespace {
117 /// Returns pointer to the first newline character in the string.
findNewline(const char * BufferPtr,const char * BufferEnd)118 const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
119   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
120     if (isVerticalWhitespace(*BufferPtr))
121       return BufferPtr;
122   }
123   return BufferEnd;
124 }
125 
/// Skips one newline sequence ("\n", "\r" or "\r\n") starting at BufferPtr.
///
/// BufferPtr must either equal BufferEnd (in which case nothing is skipped) or
/// point at '\n' or '\r'.
///
/// \returns a pointer just past the newline sequence.
const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  if (BufferPtr == BufferEnd)
    return BufferEnd;

  const char First = *BufferPtr;
  ++BufferPtr;
  if (First == '\n')
    return BufferPtr;

  assert(First == '\r');
  // A '\r' immediately followed by '\n' counts as a single newline.
  if (BufferPtr != BufferEnd && *BufferPtr == '\n')
    ++BufferPtr;
  return BufferPtr;
}
140 
skipNamedCharacterReference(const char * BufferPtr,const char * BufferEnd)141 const char *skipNamedCharacterReference(const char *BufferPtr,
142                                         const char *BufferEnd) {
143   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
144     if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
145       return BufferPtr;
146   }
147   return BufferEnd;
148 }
149 
skipDecimalCharacterReference(const char * BufferPtr,const char * BufferEnd)150 const char *skipDecimalCharacterReference(const char *BufferPtr,
151                                           const char *BufferEnd) {
152   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
153     if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
154       return BufferPtr;
155   }
156   return BufferEnd;
157 }
158 
skipHexCharacterReference(const char * BufferPtr,const char * BufferEnd)159 const char *skipHexCharacterReference(const char *BufferPtr,
160                                       const char *BufferEnd) {
161   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
162     if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
163       return BufferPtr;
164   }
165   return BufferEnd;
166 }
167 
/// Returns true if \p C may begin an HTML identifier (a tag or attribute
/// name), as decided by isLetter().
bool isHTMLIdentifierStartingCharacter(char C) {
  return isLetter(C);
}
171 
/// Returns true if \p C may appear inside an HTML identifier, as decided by
/// isAlphanumeric().
bool isHTMLIdentifierCharacter(char C) {
  return isAlphanumeric(C);
}
175 
skipHTMLIdentifier(const char * BufferPtr,const char * BufferEnd)176 const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
177   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
178     if (!isHTMLIdentifierCharacter(*BufferPtr))
179       return BufferPtr;
180   }
181   return BufferEnd;
182 }
183 
/// Skips an HTML string enclosed in single or double quotes.  BufferPtr must
/// point at the opening quote.  A quote character directly preceded by a
/// backslash does not terminate the string.
///
/// \returns a pointer to the closing quote, or BufferEnd if the string is not
/// terminated.
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
{
  const char Quote = *BufferPtr;
  assert(Quote == '\"' || Quote == '\'');

  const char *Cursor = BufferPtr + 1;
  while (Cursor != BufferEnd) {
    const bool IsEscaped = *(Cursor - 1) == '\\';
    if (*Cursor == Quote && !IsEscaped)
      return Cursor;
    ++Cursor;
  }
  return BufferEnd;
}
201 
skipWhitespace(const char * BufferPtr,const char * BufferEnd)202 const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
203   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
204     if (!isWhitespace(*BufferPtr))
205       return BufferPtr;
206   }
207   return BufferEnd;
208 }
209 
isWhitespace(const char * BufferPtr,const char * BufferEnd)210 bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
211   return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
212 }
213 
/// Returns true if \p C may begin a command name (the text following '\\' or
/// '@'), as decided by isLetter().
bool isCommandNameStartCharacter(char C) {
  return isLetter(C);
}
217 
/// Returns true if \p C may appear inside a command name, as decided by
/// isAlphanumeric().
bool isCommandNameCharacter(char C) {
  return isAlphanumeric(C);
}
221 
skipCommandName(const char * BufferPtr,const char * BufferEnd)222 const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
223   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
224     if (!isCommandNameCharacter(*BufferPtr))
225       return BufferPtr;
226   }
227   return BufferEnd;
228 }
229 
230 /// Return the one past end pointer for BCPL comments.
231 /// Handles newlines escaped with backslash or trigraph for backslahs.
findBCPLCommentEnd(const char * BufferPtr,const char * BufferEnd)232 const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
233   const char *CurPtr = BufferPtr;
234   while (CurPtr != BufferEnd) {
235     while (!isVerticalWhitespace(*CurPtr)) {
236       CurPtr++;
237       if (CurPtr == BufferEnd)
238         return BufferEnd;
239     }
240     // We found a newline, check if it is escaped.
241     const char *EscapePtr = CurPtr - 1;
242     while(isHorizontalWhitespace(*EscapePtr))
243       EscapePtr--;
244 
245     if (*EscapePtr == '\\' ||
246         (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
247          EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
248       // We found an escaped newline.
249       CurPtr = skipNewline(CurPtr, BufferEnd);
250     } else
251       return CurPtr; // Not an escaped newline.
252   }
253   return BufferEnd;
254 }
255 
256 /// Return the one past end pointer for C comments.
257 /// Very dumb, does not handle escaped newlines or trigraphs.
findCCommentEnd(const char * BufferPtr,const char * BufferEnd)258 const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
259   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
260     if (*BufferPtr == '*') {
261       assert(BufferPtr + 1 != BufferEnd);
262       if (*(BufferPtr + 1) == '/')
263         return BufferPtr;
264     }
265   }
266   llvm_unreachable("buffer end hit before '*/' was seen");
267 }
268 
269 } // unnamed namespace
270 
formTokenWithChars(Token & Result,const char * TokEnd,tok::TokenKind Kind)271 void Lexer::formTokenWithChars(Token &Result, const char *TokEnd,
272                                tok::TokenKind Kind) {
273   const unsigned TokLen = TokEnd - BufferPtr;
274   Result.setLocation(getSourceLocation(BufferPtr));
275   Result.setKind(Kind);
276   Result.setLength(TokLen);
277 #ifndef NDEBUG
278   Result.TextPtr = "<UNSET>";
279   Result.IntVal = 7;
280 #endif
281   BufferPtr = TokEnd;
282 }
283 
/// Lexes the next token inside a comment body.
///
/// When the lexer is in one of the verbatim or HTML-tag states, the work is
/// delegated to the matching state-specific routine.  In the normal state the
/// character at BufferPtr selects the token: a backslash or '@' begins a
/// command or an escape sequence, '&' an HTML character reference, '<' an HTML
/// tag, a newline character a tok::newline, and anything else a run of plain
/// text.
///
/// Must only be called while inside a comment and with BufferPtr strictly
/// before CommentEnd (both are asserted).
void Lexer::lexCommentText(Token &T) {
  assert(CommentState == LCS_InsideBCPLComment ||
         CommentState == LCS_InsideCComment);

  // Non-normal states have dedicated lexing routines.
  switch (State) {
  case LS_Normal:
    break;
  case LS_VerbatimBlockFirstLine:
    lexVerbatimBlockFirstLine(T);
    return;
  case LS_VerbatimBlockBody:
    lexVerbatimBlockBody(T);
    return;
  case LS_VerbatimLineText:
    lexVerbatimLineText(T);
    return;
  case LS_HTMLStartTag:
    lexHTMLStartTag(T);
    return;
  case LS_HTMLEndTag:
    lexHTMLEndTag(T);
    return;
  }

  assert(State == LS_Normal);

  const char *TokenPtr = BufferPtr;
  assert(TokenPtr < CommentEnd);
  // Every case below returns, so this loop body runs at most once.
  while (TokenPtr != CommentEnd) {
    switch(*TokenPtr) {
      case '\\':
      case '@': {
        // Commands that start with a backslash and commands that start with
        // 'at' have equivalent semantics.  But we keep information about the
        // exact syntax in AST for comments.
        tok::TokenKind CommandKind =
            (*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
        TokenPtr++;
        if (TokenPtr == CommentEnd) {
          // A lone marker at the very end of the comment is plain text.
          formTextToken(T, TokenPtr);
          return;
        }
        char C = *TokenPtr;
        switch (C) {
        default:
          break;

        case '\\': case '@': case '&': case '$':
        case '#':  case '<': case '>': case '%':
        case '\"': case '.': case ':':
          // This is one of \\ \@ \& \$ etc escape sequences.
          TokenPtr++;
          if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
            // This is the \:: escape sequence.
            TokenPtr++;
          }
          // The token covers the marker too, but its text is only the escaped
          // character(s).
          StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
          formTokenWithChars(T, TokenPtr, tok::text);
          T.setText(UnescapedText);
          return;
        }

        // Don't make zero-length commands.
        if (!isCommandNameStartCharacter(*TokenPtr)) {
          formTextToken(T, TokenPtr);
          return;
        }

        TokenPtr = skipCommandName(TokenPtr, CommentEnd);
        unsigned Length = TokenPtr - (BufferPtr + 1);

        // Hardcoded support for lexing LaTeX formula commands
        // \f$ \f[ \f] \f{ \f} as a single command.
        if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
          C = *TokenPtr;
          if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
            TokenPtr++;
            Length++;
          }
        }

        // The command name excludes the leading marker character.
        StringRef CommandName(BufferPtr + 1, Length);

        const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
        if (!Info) {
          if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {
            // A typo correction was found: warn, offer a fix-it, and continue
            // lexing as if the corrected command had been written.
            StringRef CorrectedName = Info->Name;
            SourceLocation Loc = getSourceLocation(BufferPtr);
            SourceRange CommandRange(Loc.getLocWithOffset(1),
                                     getSourceLocation(TokenPtr));
            Diag(Loc, diag::warn_correct_comment_command_name)
              << CommandName << CorrectedName
              << FixItHint::CreateReplacement(CommandRange, CorrectedName);
          } else {
            // No known command and no correction: emit an unknown-command
            // token and warn about it.
            formTokenWithChars(T, TokenPtr, tok::unknown_command);
            T.setUnknownCommandName(CommandName);
            Diag(T.getLocation(), diag::warn_unknown_comment_command_name);
            return;
          }
        }
        if (Info->IsVerbatimBlockCommand) {
          // *BufferPtr is the marker that introduced the command.
          setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
          return;
        }
        if (Info->IsVerbatimLineCommand) {
          setupAndLexVerbatimLine(T, TokenPtr, Info);
          return;
        }
        formTokenWithChars(T, TokenPtr, CommandKind);
        T.setCommandID(Info->getID());
        return;
      }

      case '&':
        lexHTMLCharacterReference(T);
        return;

      case '<': {
        TokenPtr++;
        if (TokenPtr == CommentEnd) {
          formTextToken(T, TokenPtr);
          return;
        }
        // Decide between a start tag, an end tag, and a literal '<'.
        const char C = *TokenPtr;
        if (isHTMLIdentifierStartingCharacter(C))
          setupAndLexHTMLStartTag(T);
        else if (C == '/')
          setupAndLexHTMLEndTag(T);
        else
          formTextToken(T, TokenPtr);

        return;
      }

      case '\n':
      case '\r':
        TokenPtr = skipNewline(TokenPtr, CommentEnd);
        formTokenWithChars(T, TokenPtr, tok::newline);

        // In C comments the next line may start with a decoration to skip.
        if (CommentState == LCS_InsideCComment)
          skipLineStartingDecorations();
        return;

      default: {
        // Plain text extends up to the next character that starts a different
        // kind of token, or to the end of the comment.
        size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr).
                         find_first_of("\n\r\\@&<");
        if (End != StringRef::npos)
          TokenPtr += End;
        else
          TokenPtr = CommentEnd;
        formTextToken(T, TokenPtr);
        return;
      }
    }
  }
}
440 
setupAndLexVerbatimBlock(Token & T,const char * TextBegin,char Marker,const CommandInfo * Info)441 void Lexer::setupAndLexVerbatimBlock(Token &T,
442                                      const char *TextBegin,
443                                      char Marker, const CommandInfo *Info) {
444   assert(Info->IsVerbatimBlockCommand);
445 
446   VerbatimBlockEndCommandName.clear();
447   VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
448   VerbatimBlockEndCommandName.append(Info->EndCommandName);
449 
450   formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
451   T.setVerbatimBlockID(Info->getID());
452 
453   // If there is a newline following the verbatim opening command, skip the
454   // newline so that we don't create an tok::verbatim_block_line with empty
455   // text content.
456   if (BufferPtr != CommentEnd &&
457       isVerticalWhitespace(*BufferPtr)) {
458     BufferPtr = skipNewline(BufferPtr, CommentEnd);
459     State = LS_VerbatimBlockBody;
460     return;
461   }
462 
463   State = LS_VerbatimBlockFirstLine;
464 }
465 
lexVerbatimBlockFirstLine(Token & T)466 void Lexer::lexVerbatimBlockFirstLine(Token &T) {
467 again:
468   assert(BufferPtr < CommentEnd);
469 
470   // FIXME: It would be better to scan the text once, finding either the block
471   // end command or newline.
472   //
473   // Extract current line.
474   const char *Newline = findNewline(BufferPtr, CommentEnd);
475   StringRef Line(BufferPtr, Newline - BufferPtr);
476 
477   // Look for end command in current line.
478   size_t Pos = Line.find(VerbatimBlockEndCommandName);
479   const char *TextEnd;
480   const char *NextLine;
481   if (Pos == StringRef::npos) {
482     // Current line is completely verbatim.
483     TextEnd = Newline;
484     NextLine = skipNewline(Newline, CommentEnd);
485   } else if (Pos == 0) {
486     // Current line contains just an end command.
487     const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
488     StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
489     formTokenWithChars(T, End, tok::verbatim_block_end);
490     T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
491     State = LS_Normal;
492     return;
493   } else {
494     // There is some text, followed by end command.  Extract text first.
495     TextEnd = BufferPtr + Pos;
496     NextLine = TextEnd;
497     // If there is only whitespace before end command, skip whitespace.
498     if (isWhitespace(BufferPtr, TextEnd)) {
499       BufferPtr = TextEnd;
500       goto again;
501     }
502   }
503 
504   StringRef Text(BufferPtr, TextEnd - BufferPtr);
505   formTokenWithChars(T, NextLine, tok::verbatim_block_line);
506   T.setVerbatimBlockText(Text);
507 
508   State = LS_VerbatimBlockBody;
509 }
510 
lexVerbatimBlockBody(Token & T)511 void Lexer::lexVerbatimBlockBody(Token &T) {
512   assert(State == LS_VerbatimBlockBody);
513 
514   if (CommentState == LCS_InsideCComment)
515     skipLineStartingDecorations();
516 
517   lexVerbatimBlockFirstLine(T);
518 }
519 
/// Emits the tok::verbatim_line_name token for a verbatim line command and
/// switches the lexer into the state that lexes the rest of the line as
/// verbatim text.
///
/// \param TextBegin end of the command name; the token covers
///        [BufferPtr, TextBegin).
/// \param Info description of the verbatim line command.
void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
                                    const CommandInfo *Info) {
  assert(Info->IsVerbatimLineCommand);
  formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
  T.setVerbatimLineID(Info->getID());

  // The next token is the remainder of the line, see lexVerbatimLineText().
  State = LS_VerbatimLineText;
}
528 
lexVerbatimLineText(Token & T)529 void Lexer::lexVerbatimLineText(Token &T) {
530   assert(State == LS_VerbatimLineText);
531 
532   // Extract current line.
533   const char *Newline = findNewline(BufferPtr, CommentEnd);
534   StringRef Text(BufferPtr, Newline - BufferPtr);
535   formTokenWithChars(T, Newline, tok::verbatim_line_text);
536   T.setVerbatimLineText(Text);
537 
538   State = LS_Normal;
539 }
540 
lexHTMLCharacterReference(Token & T)541 void Lexer::lexHTMLCharacterReference(Token &T) {
542   const char *TokenPtr = BufferPtr;
543   assert(*TokenPtr == '&');
544   TokenPtr++;
545   if (TokenPtr == CommentEnd) {
546     formTextToken(T, TokenPtr);
547     return;
548   }
549   const char *NamePtr;
550   bool isNamed = false;
551   bool isDecimal = false;
552   char C = *TokenPtr;
553   if (isHTMLNamedCharacterReferenceCharacter(C)) {
554     NamePtr = TokenPtr;
555     TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
556     isNamed = true;
557   } else if (C == '#') {
558     TokenPtr++;
559     if (TokenPtr == CommentEnd) {
560       formTextToken(T, TokenPtr);
561       return;
562     }
563     C = *TokenPtr;
564     if (isHTMLDecimalCharacterReferenceCharacter(C)) {
565       NamePtr = TokenPtr;
566       TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
567       isDecimal = true;
568     } else if (C == 'x' || C == 'X') {
569       TokenPtr++;
570       NamePtr = TokenPtr;
571       TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
572     } else {
573       formTextToken(T, TokenPtr);
574       return;
575     }
576   } else {
577     formTextToken(T, TokenPtr);
578     return;
579   }
580   if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
581       *TokenPtr != ';') {
582     formTextToken(T, TokenPtr);
583     return;
584   }
585   StringRef Name(NamePtr, TokenPtr - NamePtr);
586   TokenPtr++; // Skip semicolon.
587   StringRef Resolved;
588   if (isNamed)
589     Resolved = resolveHTMLNamedCharacterReference(Name);
590   else if (isDecimal)
591     Resolved = resolveHTMLDecimalCharacterReference(Name);
592   else
593     Resolved = resolveHTMLHexCharacterReference(Name);
594 
595   if (Resolved.empty()) {
596     formTextToken(T, TokenPtr);
597     return;
598   }
599   formTokenWithChars(T, TokenPtr, tok::text);
600   T.setText(Resolved);
601   return;
602 }
603 
setupAndLexHTMLStartTag(Token & T)604 void Lexer::setupAndLexHTMLStartTag(Token &T) {
605   assert(BufferPtr[0] == '<' &&
606          isHTMLIdentifierStartingCharacter(BufferPtr[1]));
607   const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
608   StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
609   if (!isHTMLTagName(Name)) {
610     formTextToken(T, TagNameEnd);
611     return;
612   }
613 
614   formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
615   T.setHTMLTagStartName(Name);
616 
617   BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
618 
619   const char C = *BufferPtr;
620   if (BufferPtr != CommentEnd &&
621       (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
622     State = LS_HTMLStartTag;
623 }
624 
lexHTMLStartTag(Token & T)625 void Lexer::lexHTMLStartTag(Token &T) {
626   assert(State == LS_HTMLStartTag);
627 
628   const char *TokenPtr = BufferPtr;
629   char C = *TokenPtr;
630   if (isHTMLIdentifierCharacter(C)) {
631     TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
632     StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
633     formTokenWithChars(T, TokenPtr, tok::html_ident);
634     T.setHTMLIdent(Ident);
635   } else {
636     switch (C) {
637     case '=':
638       TokenPtr++;
639       formTokenWithChars(T, TokenPtr, tok::html_equals);
640       break;
641     case '\"':
642     case '\'': {
643       const char *OpenQuote = TokenPtr;
644       TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
645       const char *ClosingQuote = TokenPtr;
646       if (TokenPtr != CommentEnd) // Skip closing quote.
647         TokenPtr++;
648       formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
649       T.setHTMLQuotedString(StringRef(OpenQuote + 1,
650                                       ClosingQuote - (OpenQuote + 1)));
651       break;
652     }
653     case '>':
654       TokenPtr++;
655       formTokenWithChars(T, TokenPtr, tok::html_greater);
656       State = LS_Normal;
657       return;
658     case '/':
659       TokenPtr++;
660       if (TokenPtr != CommentEnd && *TokenPtr == '>') {
661         TokenPtr++;
662         formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
663       } else
664         formTextToken(T, TokenPtr);
665 
666       State = LS_Normal;
667       return;
668     }
669   }
670 
671   // Now look ahead and return to normal state if we don't see any HTML tokens
672   // ahead.
673   BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
674   if (BufferPtr == CommentEnd) {
675     State = LS_Normal;
676     return;
677   }
678 
679   C = *BufferPtr;
680   if (!isHTMLIdentifierStartingCharacter(C) &&
681       C != '=' && C != '\"' && C != '\'' && C != '>') {
682     State = LS_Normal;
683     return;
684   }
685 }
686 
setupAndLexHTMLEndTag(Token & T)687 void Lexer::setupAndLexHTMLEndTag(Token &T) {
688   assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
689 
690   const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
691   const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
692   StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
693   if (!isHTMLTagName(Name)) {
694     formTextToken(T, TagNameEnd);
695     return;
696   }
697 
698   const char *End = skipWhitespace(TagNameEnd, CommentEnd);
699 
700   formTokenWithChars(T, End, tok::html_end_tag);
701   T.setHTMLTagEndName(Name);
702 
703   if (BufferPtr != CommentEnd && *BufferPtr == '>')
704     State = LS_HTMLEndTag;
705 }
706 
/// Lexes the '>' that closes an HTML end tag as a tok::html_greater token and
/// returns the lexer to the normal state.  BufferPtr must point at that '>'.
void Lexer::lexHTMLEndTag(Token &T) {
  assert(BufferPtr != CommentEnd && *BufferPtr == '>');

  formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
  State = LS_Normal;
}
713 
/// Constructs a comment lexer over the text in [BufferStart, BufferEnd).
///
/// Lexing starts at \p BufferStart, before any comment (LCS_BeforeComment) and
/// in the normal lexing state (LS_Normal).  CommentEnd is not set in this
/// initializer list; lex() assigns it when it enters a comment.
///
/// \param Allocator provides storage for text produced while resolving HTML
///        character references.
/// \param Diags stored for later use (presumably by Diag() -- not shown here).
/// \param Traits used to look up command names.
/// \param FileLoc stored for later use (presumably the source location that
///        corresponds to \p BufferStart -- confirm against
///        getSourceLocation()).
Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
             const CommandTraits &Traits,
             SourceLocation FileLoc,
             const char *BufferStart, const char *BufferEnd):
    Allocator(Allocator), Diags(Diags), Traits(Traits),
    BufferStart(BufferStart), BufferEnd(BufferEnd),
    FileLoc(FileLoc), BufferPtr(BufferStart),
    CommentState(LCS_BeforeComment), State(LS_Normal) {
}
723 
/// Produces the next token from the buffer into \p T.
///
/// The lexer moves between three kinds of position: before a comment (where
/// the comment opener and any Doxygen marker are consumed and CommentEnd is
/// computed), inside a BCPL or C comment (where lexCommentText() does the
/// work), and between comments (where everything up to the next '/' is folded
/// into a single tok::newline).  tok::eof is returned once the buffer is
/// exhausted.
void Lexer::lex(Token &T) {
again:
  switch (CommentState) {
  case LCS_BeforeComment:
    if (BufferPtr == BufferEnd) {
      formTokenWithChars(T, BufferPtr, tok::eof);
      return;
    }

    assert(*BufferPtr == '/');
    BufferPtr++; // Skip first slash.
    switch(*BufferPtr) {
    case '/': { // BCPL comment.
      BufferPtr++; // Skip second slash.

      if (BufferPtr != BufferEnd) {
        // Skip Doxygen magic marker, if it is present.
        // It might be missing because of a typo //< or /*<, or because we
        // merged this non-Doxygen comment into a bunch of Doxygen comments
        // around it: /** ... */ /* ... */ /** ... */
        const char C = *BufferPtr;
        if (C == '/' || C == '!')
          BufferPtr++;
      }

      // Skip less-than symbol that marks trailing comments.
      // Skip it even if the comment is not a Doxygen one, because //< and /*<
      // are frequent typos.
      if (BufferPtr != BufferEnd && *BufferPtr == '<')
        BufferPtr++;

      CommentState = LCS_InsideBCPLComment;
      // A verbatim block state is kept as-is when a new BCPL comment starts;
      // any other state is reset to normal.
      if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
        State = LS_Normal;
      CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
      goto again;
    }
    case '*': { // C comment.
      BufferPtr++; // Skip star.

      // Skip Doxygen magic marker.  A '*' that is immediately followed by '/'
      // is the start of the comment terminator, not a marker.
      const char C = *BufferPtr;
      if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
        BufferPtr++;

      // Skip less-than symbol that marks trailing comments.
      if (BufferPtr != BufferEnd && *BufferPtr == '<')
        BufferPtr++;

      CommentState = LCS_InsideCComment;
      State = LS_Normal;
      CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
      goto again;
    }
    default:
      llvm_unreachable("second character of comment should be '/' or '*'");
    }

  case LCS_BetweenComments: {
    // Consecutive comments are extracted only if there is only whitespace
    // between them.  So we can search for the start of the next comment.
    const char *EndWhitespace = BufferPtr;
    while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
      EndWhitespace++;

    // Turn any whitespace between comments (and there is only whitespace
    // between them -- guaranteed by comment extraction) into a newline.  We
    // have two newlines between C comments in total (first one was synthesized
    // after a comment).
    formTokenWithChars(T, EndWhitespace, tok::newline);

    CommentState = LCS_BeforeComment;
    break;
  }

  case LCS_InsideBCPLComment:
  case LCS_InsideCComment:
    if (BufferPtr != CommentEnd) {
      lexCommentText(T);
      break;
    } else {
      // Skip C comment closing sequence.
      if (CommentState == LCS_InsideCComment) {
        assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
        BufferPtr += 2;
        assert(BufferPtr <= BufferEnd);

        // Synthesize a newline just after the C comment, regardless of
        // whether there actually is a newline.
        formTokenWithChars(T, BufferPtr, tok::newline);

        CommentState = LCS_BetweenComments;
        break;
      } else {
        // Don't synthesize a newline after a BCPL comment.
        CommentState = LCS_BetweenComments;
        goto again;
      }
    }
  }
}
825 
getSpelling(const Token & Tok,const SourceManager & SourceMgr,bool * Invalid) const826 StringRef Lexer::getSpelling(const Token &Tok,
827                              const SourceManager &SourceMgr,
828                              bool *Invalid) const {
829   SourceLocation Loc = Tok.getLocation();
830   std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
831 
832   bool InvalidTemp = false;
833   StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
834   if (InvalidTemp) {
835     *Invalid = true;
836     return StringRef();
837   }
838 
839   const char *Begin = File.data() + LocInfo.second;
840   return StringRef(Begin, Tok.getLength());
841 }
842 
843 } // end namespace comments
844 } // end namespace clang
845 
846