1 #include "clang/AST/CommentLexer.h"
2 #include "clang/AST/CommentCommandTraits.h"
3 #include "clang/AST/CommentDiagnostic.h"
4 #include "clang/Basic/CharInfo.h"
5 #include "llvm/ADT/StringExtras.h"
6 #include "llvm/ADT/StringSwitch.h"
7 #include "llvm/Support/ConvertUTF.h"
8 #include "llvm/Support/ErrorHandling.h"
9 
10 namespace clang {
11 namespace comments {
12 
13 void Token::dump(const Lexer &L, const SourceManager &SM) const {
14   llvm::errs() << "comments::Token Kind=" << Kind << " ";
15   Loc.dump(SM);
16   llvm::errs() << " " << Length << " \"" << L.getSpelling(*this, SM) << "\"\n";
17 }
18 
19 static inline bool isHTMLNamedCharacterReferenceCharacter(char C) {
20   return isLetter(C);
21 }
22 
23 static inline bool isHTMLDecimalCharacterReferenceCharacter(char C) {
24   return isDigit(C);
25 }
26 
27 static inline bool isHTMLHexCharacterReferenceCharacter(char C) {
28   return isHexDigit(C);
29 }
30 
31 static inline StringRef convertCodePointToUTF8(
32                                       llvm::BumpPtrAllocator &Allocator,
33                                       unsigned CodePoint) {
34   char *Resolved = Allocator.Allocate<char>(UNI_MAX_UTF8_BYTES_PER_CODE_POINT);
35   char *ResolvedPtr = Resolved;
36   if (llvm::ConvertCodePointToUTF8(CodePoint, ResolvedPtr))
37     return StringRef(Resolved, ResolvedPtr - Resolved);
38   else
39     return StringRef();
40 }
41 
42 namespace {
43 
44 #include "clang/AST/CommentHTMLTags.inc"
45 #include "clang/AST/CommentHTMLNamedCharacterReferences.inc"
46 
47 } // unnamed namespace
48 
49 StringRef Lexer::resolveHTMLNamedCharacterReference(StringRef Name) const {
50   // Fast path, first check a few most widely used named character references.
51   return llvm::StringSwitch<StringRef>(Name)
52       .Case("amp", "&")
53       .Case("lt", "<")
54       .Case("gt", ">")
55       .Case("quot", "\"")
56       .Case("apos", "\'")
57       // Slow path.
58       .Default(translateHTMLNamedCharacterReferenceToUTF8(Name));
59 }
60 
61 StringRef Lexer::resolveHTMLDecimalCharacterReference(StringRef Name) const {
62   unsigned CodePoint = 0;
63   for (unsigned i = 0, e = Name.size(); i != e; ++i) {
64     assert(isHTMLDecimalCharacterReferenceCharacter(Name[i]));
65     CodePoint *= 10;
66     CodePoint += Name[i] - '0';
67   }
68   return convertCodePointToUTF8(Allocator, CodePoint);
69 }
70 
71 StringRef Lexer::resolveHTMLHexCharacterReference(StringRef Name) const {
72   unsigned CodePoint = 0;
73   for (unsigned i = 0, e = Name.size(); i != e; ++i) {
74     CodePoint *= 16;
75     const char C = Name[i];
76     assert(isHTMLHexCharacterReferenceCharacter(C));
77     CodePoint += llvm::hexDigitValue(C);
78   }
79   return convertCodePointToUTF8(Allocator, CodePoint);
80 }
81 
82 void Lexer::skipLineStartingDecorations() {
83   // This function should be called only for C comments
84   assert(CommentState == LCS_InsideCComment);
85 
86   if (BufferPtr == CommentEnd)
87     return;
88 
89   switch (*BufferPtr) {
90   case ' ':
91   case '\t':
92   case '\f':
93   case '\v': {
94     const char *NewBufferPtr = BufferPtr;
95     NewBufferPtr++;
96     if (NewBufferPtr == CommentEnd)
97       return;
98 
99     char C = *NewBufferPtr;
100     while (isHorizontalWhitespace(C)) {
101       NewBufferPtr++;
102       if (NewBufferPtr == CommentEnd)
103         return;
104       C = *NewBufferPtr;
105     }
106     if (C == '*')
107       BufferPtr = NewBufferPtr + 1;
108     break;
109   }
110   case '*':
111     BufferPtr++;
112     break;
113   }
114 }
115 
116 namespace {
117 /// Returns pointer to the first newline character in the string.
118 const char *findNewline(const char *BufferPtr, const char *BufferEnd) {
119   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
120     if (isVerticalWhitespace(*BufferPtr))
121       return BufferPtr;
122   }
123   return BufferEnd;
124 }
125 
/// Advances past one line terminator at \p BufferPtr: "\n", "\r" or "\r\n".
/// Returns \p BufferPtr unchanged when it already equals \p BufferEnd.  The
/// caller must point at '\n' or '\r' otherwise.
const char *skipNewline(const char *BufferPtr, const char *BufferEnd) {
  if (BufferPtr == BufferEnd)
    return BufferPtr;

  const char First = *BufferPtr++;
  if (First == '\n')
    return BufferPtr;

  assert(First == '\r');
  // Treat "\r\n" as a single terminator.
  if (BufferPtr != BufferEnd && *BufferPtr == '\n')
    ++BufferPtr;
  return BufferPtr;
}
140 
141 const char *skipNamedCharacterReference(const char *BufferPtr,
142                                         const char *BufferEnd) {
143   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
144     if (!isHTMLNamedCharacterReferenceCharacter(*BufferPtr))
145       return BufferPtr;
146   }
147   return BufferEnd;
148 }
149 
150 const char *skipDecimalCharacterReference(const char *BufferPtr,
151                                           const char *BufferEnd) {
152   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
153     if (!isHTMLDecimalCharacterReferenceCharacter(*BufferPtr))
154       return BufferPtr;
155   }
156   return BufferEnd;
157 }
158 
159 const char *skipHexCharacterReference(const char *BufferPtr,
160                                       const char *BufferEnd) {
161   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
162     if (!isHTMLHexCharacterReferenceCharacter(*BufferPtr))
163       return BufferPtr;
164   }
165   return BufferEnd;
166 }
167 
168 bool isHTMLIdentifierStartingCharacter(char C) {
169   return isLetter(C);
170 }
171 
172 bool isHTMLIdentifierCharacter(char C) {
173   return isAlphanumeric(C);
174 }
175 
176 const char *skipHTMLIdentifier(const char *BufferPtr, const char *BufferEnd) {
177   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
178     if (!isHTMLIdentifierCharacter(*BufferPtr))
179       return BufferPtr;
180   }
181   return BufferEnd;
182 }
183 
/// Scans an HTML attribute value quoted with single or double quotes.
/// \p BufferPtr must point at the opening quote.  A quote character that is
/// immediately preceded by a backslash does not terminate the string.
///
/// Returns a pointer to the closing quote, or \p BufferEnd if the string is
/// not terminated.
const char *skipHTMLQuotedString(const char *BufferPtr, const char *BufferEnd)
{
  const char Quote = *BufferPtr;
  assert(Quote == '\"' || Quote == '\'');

  const char *Cur = BufferPtr + 1;
  while (Cur != BufferEnd) {
    if (*Cur == Quote && Cur[-1] != '\\')
      return Cur;
    ++Cur;
  }
  return BufferEnd;
}
201 
202 const char *skipWhitespace(const char *BufferPtr, const char *BufferEnd) {
203   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
204     if (!isWhitespace(*BufferPtr))
205       return BufferPtr;
206   }
207   return BufferEnd;
208 }
209 
210 bool isWhitespace(const char *BufferPtr, const char *BufferEnd) {
211   return skipWhitespace(BufferPtr, BufferEnd) == BufferEnd;
212 }
213 
214 bool isCommandNameStartCharacter(char C) {
215   return isLetter(C);
216 }
217 
218 bool isCommandNameCharacter(char C) {
219   return isAlphanumeric(C);
220 }
221 
222 const char *skipCommandName(const char *BufferPtr, const char *BufferEnd) {
223   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
224     if (!isCommandNameCharacter(*BufferPtr))
225       return BufferPtr;
226   }
227   return BufferEnd;
228 }
229 
230 /// Return the one past end pointer for BCPL comments.
231 /// Handles newlines escaped with backslash or trigraph for backslahs.
232 const char *findBCPLCommentEnd(const char *BufferPtr, const char *BufferEnd) {
233   const char *CurPtr = BufferPtr;
234   while (CurPtr != BufferEnd) {
235     while (!isVerticalWhitespace(*CurPtr)) {
236       CurPtr++;
237       if (CurPtr == BufferEnd)
238         return BufferEnd;
239     }
240     // We found a newline, check if it is escaped.
241     const char *EscapePtr = CurPtr - 1;
242     while(isHorizontalWhitespace(*EscapePtr))
243       EscapePtr--;
244 
245     if (*EscapePtr == '\\' ||
246         (EscapePtr - 2 >= BufferPtr && EscapePtr[0] == '/' &&
247          EscapePtr[-1] == '?' && EscapePtr[-2] == '?')) {
248       // We found an escaped newline.
249       CurPtr = skipNewline(CurPtr, BufferEnd);
250     } else
251       return CurPtr; // Not an escaped newline.
252   }
253   return BufferEnd;
254 }
255 
256 /// Return the one past end pointer for C comments.
257 /// Very dumb, does not handle escaped newlines or trigraphs.
258 const char *findCCommentEnd(const char *BufferPtr, const char *BufferEnd) {
259   for ( ; BufferPtr != BufferEnd; ++BufferPtr) {
260     if (*BufferPtr == '*') {
261       assert(BufferPtr + 1 != BufferEnd);
262       if (*(BufferPtr + 1) == '/')
263         return BufferPtr;
264     }
265   }
266   llvm_unreachable("buffer end hit before '*/' was seen");
267 }
268 
269 } // unnamed namespace
270 
/// Lexes one token from the text of the current comment, starting at
/// BufferPtr (which must be before CommentEnd).
///
/// When the lexer is in a non-normal state (verbatim block/line or HTML tag)
/// the call is forwarded to the matching state-specific routine.  Otherwise
/// the first character decides what is produced: a command or escape sequence
/// ('\\' or '@'), an HTML character reference ('&'), an HTML tag ('<'), a
/// newline, or a run of plain text.
void Lexer::lexCommentText(Token &T) {
  assert(CommentState == LCS_InsideBCPLComment ||
         CommentState == LCS_InsideCComment);

  // Delegate to the specialised lexers for every state except LS_Normal.
  switch (State) {
  case LS_Normal:
    break;
  case LS_VerbatimBlockFirstLine:
    lexVerbatimBlockFirstLine(T);
    return;
  case LS_VerbatimBlockBody:
    lexVerbatimBlockBody(T);
    return;
  case LS_VerbatimLineText:
    lexVerbatimLineText(T);
    return;
  case LS_HTMLStartTag:
    lexHTMLStartTag(T);
    return;
  case LS_HTMLEndTag:
    lexHTMLEndTag(T);
    return;
  }

  assert(State == LS_Normal);

  const char *TokenPtr = BufferPtr;
  assert(TokenPtr < CommentEnd);
  // Every case below forms a token and returns, so the loop body is executed
  // at most once per call.
  while (TokenPtr != CommentEnd) {
    switch(*TokenPtr) {
      case '\\':
      case '@': {
        // Commands that start with a backslash and commands that start with
        // 'at' have equivalent semantics.  But we keep information about the
        // exact syntax in AST for comments.
        tok::TokenKind CommandKind =
            (*TokenPtr == '@') ? tok::at_command : tok::backslash_command;
        TokenPtr++;
        // A lone marker at the very end of the comment is plain text.
        if (TokenPtr == CommentEnd) {
          formTextToken(T, TokenPtr);
          return;
        }
        char C = *TokenPtr;
        switch (C) {
        default:
          break;

        case '\\': case '@': case '&': case '$':
        case '#':  case '<': case '>': case '%':
        case '\"': case '.': case ':':
          // This is one of \\ \@ \& \$ etc escape sequences.
          TokenPtr++;
          if (C == ':' && TokenPtr != CommentEnd && *TokenPtr == ':') {
            // This is the \:: escape sequence.
            TokenPtr++;
          }
          // The token covers the marker too, but its text is only the escaped
          // character(s) after the marker.
          StringRef UnescapedText(BufferPtr + 1, TokenPtr - (BufferPtr + 1));
          formTokenWithChars(T, TokenPtr, tok::text);
          T.setText(UnescapedText);
          return;
        }

        // Don't make zero-length commands.
        if (!isCommandNameStartCharacter(*TokenPtr)) {
          formTextToken(T, TokenPtr);
          return;
        }

        TokenPtr = skipCommandName(TokenPtr, CommentEnd);
        // Length of the command name, not counting the leading marker.
        unsigned Length = TokenPtr - (BufferPtr + 1);

        // Hardcoded support for lexing LaTeX formula commands
        // \f$ \f[ \f] \f{ \f} as a single command.
        if (Length == 1 && TokenPtr[-1] == 'f' && TokenPtr != CommentEnd) {
          C = *TokenPtr;
          if (C == '$' || C == '[' || C == ']' || C == '{' || C == '}') {
            TokenPtr++;
            Length++;
          }
        }

        const StringRef CommandName(BufferPtr + 1, Length);

        const CommandInfo *Info = Traits.getCommandInfoOrNULL(CommandName);
        if (!Info) {
          // Unknown name: form an unknown_command token first, then try typo
          // correction.  Without a correction the unknown_command token is
          // what gets returned.
          formTokenWithChars(T, TokenPtr, tok::unknown_command);
          T.setUnknownCommandName(CommandName);
          if ((Info = Traits.getTypoCorrectCommandInfo(CommandName))) {
            StringRef CorrectedName = Info->Name;
            // The replacement range excludes the leading '\\' or '@'.
            SourceRange CommandRange(T.getLocation().getLocWithOffset(1),
                                     T.getEndLocation());
            Diag(T.getLocation(), diag::warn_correct_comment_command_name)
              << CommandName << CorrectedName
              << FixItHint::CreateReplacement(CommandRange, CorrectedName);
            // NOTE(review): on this path control falls through to the code
            // below, which forms the token a second time (directly or via the
            // setupAndLexVerbatim* helpers) and reads *BufferPtr as the
            // marker.  Confirm that formTokenWithChars can safely be applied
            // twice and that BufferPtr still points at the marker here.
          } else {
            Diag(T.getLocation(), diag::warn_unknown_comment_command_name);
            return;
          }
        }
        // Verbatim commands switch the lexer into a dedicated state.
        if (Info->IsVerbatimBlockCommand) {
          setupAndLexVerbatimBlock(T, TokenPtr, *BufferPtr, Info);
          return;
        }
        if (Info->IsVerbatimLineCommand) {
          setupAndLexVerbatimLine(T, TokenPtr, Info);
          return;
        }
        // Ordinary command: the token kind records which marker was used.
        formTokenWithChars(T, TokenPtr, CommandKind);
        T.setCommandID(Info->getID());
        return;
      }

      case '&':
        lexHTMLCharacterReference(T);
        return;

      case '<': {
        TokenPtr++;
        if (TokenPtr == CommentEnd) {
          formTextToken(T, TokenPtr);
          return;
        }
        // "<" + letter starts a start tag, "</" starts an end tag; any other
        // "<" is plain text.
        const char C = *TokenPtr;
        if (isHTMLIdentifierStartingCharacter(C))
          setupAndLexHTMLStartTag(T);
        else if (C == '/')
          setupAndLexHTMLEndTag(T);
        else
          formTextToken(T, TokenPtr);

        return;
      }

      case '\n':
      case '\r':
        TokenPtr = skipNewline(TokenPtr, CommentEnd);
        formTokenWithChars(T, TokenPtr, tok::newline);

        // In C comments, drop the decoration that starts the next line.
        if (CommentState == LCS_InsideCComment)
          skipLineStartingDecorations();
        return;

      default: {
        // Plain text extends up to the next character that could start a
        // different kind of token, or to the end of the comment.
        size_t End = StringRef(TokenPtr, CommentEnd - TokenPtr).
                         find_first_of("\n\r\\@&<");
        if (End != StringRef::npos)
          TokenPtr += End;
        else
          TokenPtr = CommentEnd;
        formTextToken(T, TokenPtr);
        return;
      }
    }
  }
}
426 
427 void Lexer::setupAndLexVerbatimBlock(Token &T,
428                                      const char *TextBegin,
429                                      char Marker, const CommandInfo *Info) {
430   assert(Info->IsVerbatimBlockCommand);
431 
432   VerbatimBlockEndCommandName.clear();
433   VerbatimBlockEndCommandName.append(Marker == '\\' ? "\\" : "@");
434   VerbatimBlockEndCommandName.append(Info->EndCommandName);
435 
436   formTokenWithChars(T, TextBegin, tok::verbatim_block_begin);
437   T.setVerbatimBlockID(Info->getID());
438 
439   // If there is a newline following the verbatim opening command, skip the
440   // newline so that we don't create an tok::verbatim_block_line with empty
441   // text content.
442   if (BufferPtr != CommentEnd &&
443       isVerticalWhitespace(*BufferPtr)) {
444     BufferPtr = skipNewline(BufferPtr, CommentEnd);
445     State = LS_VerbatimBlockBody;
446     return;
447   }
448 
449   State = LS_VerbatimBlockFirstLine;
450 }
451 
452 void Lexer::lexVerbatimBlockFirstLine(Token &T) {
453 again:
454   assert(BufferPtr < CommentEnd);
455 
456   // FIXME: It would be better to scan the text once, finding either the block
457   // end command or newline.
458   //
459   // Extract current line.
460   const char *Newline = findNewline(BufferPtr, CommentEnd);
461   StringRef Line(BufferPtr, Newline - BufferPtr);
462 
463   // Look for end command in current line.
464   size_t Pos = Line.find(VerbatimBlockEndCommandName);
465   const char *TextEnd;
466   const char *NextLine;
467   if (Pos == StringRef::npos) {
468     // Current line is completely verbatim.
469     TextEnd = Newline;
470     NextLine = skipNewline(Newline, CommentEnd);
471   } else if (Pos == 0) {
472     // Current line contains just an end command.
473     const char *End = BufferPtr + VerbatimBlockEndCommandName.size();
474     StringRef Name(BufferPtr + 1, End - (BufferPtr + 1));
475     formTokenWithChars(T, End, tok::verbatim_block_end);
476     T.setVerbatimBlockID(Traits.getCommandInfo(Name)->getID());
477     State = LS_Normal;
478     return;
479   } else {
480     // There is some text, followed by end command.  Extract text first.
481     TextEnd = BufferPtr + Pos;
482     NextLine = TextEnd;
483     // If there is only whitespace before end command, skip whitespace.
484     if (isWhitespace(BufferPtr, TextEnd)) {
485       BufferPtr = TextEnd;
486       goto again;
487     }
488   }
489 
490   StringRef Text(BufferPtr, TextEnd - BufferPtr);
491   formTokenWithChars(T, NextLine, tok::verbatim_block_line);
492   T.setVerbatimBlockText(Text);
493 
494   State = LS_VerbatimBlockBody;
495 }
496 
497 void Lexer::lexVerbatimBlockBody(Token &T) {
498   assert(State == LS_VerbatimBlockBody);
499 
500   if (CommentState == LCS_InsideCComment)
501     skipLineStartingDecorations();
502 
503   lexVerbatimBlockFirstLine(T);
504 }
505 
506 void Lexer::setupAndLexVerbatimLine(Token &T, const char *TextBegin,
507                                     const CommandInfo *Info) {
508   assert(Info->IsVerbatimLineCommand);
509   formTokenWithChars(T, TextBegin, tok::verbatim_line_name);
510   T.setVerbatimLineID(Info->getID());
511 
512   State = LS_VerbatimLineText;
513 }
514 
515 void Lexer::lexVerbatimLineText(Token &T) {
516   assert(State == LS_VerbatimLineText);
517 
518   // Extract current line.
519   const char *Newline = findNewline(BufferPtr, CommentEnd);
520   const StringRef Text(BufferPtr, Newline - BufferPtr);
521   formTokenWithChars(T, Newline, tok::verbatim_line_text);
522   T.setVerbatimLineText(Text);
523 
524   State = LS_Normal;
525 }
526 
527 void Lexer::lexHTMLCharacterReference(Token &T) {
528   const char *TokenPtr = BufferPtr;
529   assert(*TokenPtr == '&');
530   TokenPtr++;
531   if (TokenPtr == CommentEnd) {
532     formTextToken(T, TokenPtr);
533     return;
534   }
535   const char *NamePtr;
536   bool isNamed = false;
537   bool isDecimal = false;
538   char C = *TokenPtr;
539   if (isHTMLNamedCharacterReferenceCharacter(C)) {
540     NamePtr = TokenPtr;
541     TokenPtr = skipNamedCharacterReference(TokenPtr, CommentEnd);
542     isNamed = true;
543   } else if (C == '#') {
544     TokenPtr++;
545     if (TokenPtr == CommentEnd) {
546       formTextToken(T, TokenPtr);
547       return;
548     }
549     C = *TokenPtr;
550     if (isHTMLDecimalCharacterReferenceCharacter(C)) {
551       NamePtr = TokenPtr;
552       TokenPtr = skipDecimalCharacterReference(TokenPtr, CommentEnd);
553       isDecimal = true;
554     } else if (C == 'x' || C == 'X') {
555       TokenPtr++;
556       NamePtr = TokenPtr;
557       TokenPtr = skipHexCharacterReference(TokenPtr, CommentEnd);
558     } else {
559       formTextToken(T, TokenPtr);
560       return;
561     }
562   } else {
563     formTextToken(T, TokenPtr);
564     return;
565   }
566   if (NamePtr == TokenPtr || TokenPtr == CommentEnd ||
567       *TokenPtr != ';') {
568     formTextToken(T, TokenPtr);
569     return;
570   }
571   StringRef Name(NamePtr, TokenPtr - NamePtr);
572   TokenPtr++; // Skip semicolon.
573   StringRef Resolved;
574   if (isNamed)
575     Resolved = resolveHTMLNamedCharacterReference(Name);
576   else if (isDecimal)
577     Resolved = resolveHTMLDecimalCharacterReference(Name);
578   else
579     Resolved = resolveHTMLHexCharacterReference(Name);
580 
581   if (Resolved.empty()) {
582     formTextToken(T, TokenPtr);
583     return;
584   }
585   formTokenWithChars(T, TokenPtr, tok::text);
586   T.setText(Resolved);
587   return;
588 }
589 
590 void Lexer::setupAndLexHTMLStartTag(Token &T) {
591   assert(BufferPtr[0] == '<' &&
592          isHTMLIdentifierStartingCharacter(BufferPtr[1]));
593   const char *TagNameEnd = skipHTMLIdentifier(BufferPtr + 2, CommentEnd);
594   StringRef Name(BufferPtr + 1, TagNameEnd - (BufferPtr + 1));
595   if (!isHTMLTagName(Name)) {
596     formTextToken(T, TagNameEnd);
597     return;
598   }
599 
600   formTokenWithChars(T, TagNameEnd, tok::html_start_tag);
601   T.setHTMLTagStartName(Name);
602 
603   BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
604 
605   const char C = *BufferPtr;
606   if (BufferPtr != CommentEnd &&
607       (C == '>' || C == '/' || isHTMLIdentifierStartingCharacter(C)))
608     State = LS_HTMLStartTag;
609 }
610 
611 void Lexer::lexHTMLStartTag(Token &T) {
612   assert(State == LS_HTMLStartTag);
613 
614   const char *TokenPtr = BufferPtr;
615   char C = *TokenPtr;
616   if (isHTMLIdentifierCharacter(C)) {
617     TokenPtr = skipHTMLIdentifier(TokenPtr, CommentEnd);
618     StringRef Ident(BufferPtr, TokenPtr - BufferPtr);
619     formTokenWithChars(T, TokenPtr, tok::html_ident);
620     T.setHTMLIdent(Ident);
621   } else {
622     switch (C) {
623     case '=':
624       TokenPtr++;
625       formTokenWithChars(T, TokenPtr, tok::html_equals);
626       break;
627     case '\"':
628     case '\'': {
629       const char *OpenQuote = TokenPtr;
630       TokenPtr = skipHTMLQuotedString(TokenPtr, CommentEnd);
631       const char *ClosingQuote = TokenPtr;
632       if (TokenPtr != CommentEnd) // Skip closing quote.
633         TokenPtr++;
634       formTokenWithChars(T, TokenPtr, tok::html_quoted_string);
635       T.setHTMLQuotedString(StringRef(OpenQuote + 1,
636                                       ClosingQuote - (OpenQuote + 1)));
637       break;
638     }
639     case '>':
640       TokenPtr++;
641       formTokenWithChars(T, TokenPtr, tok::html_greater);
642       State = LS_Normal;
643       return;
644     case '/':
645       TokenPtr++;
646       if (TokenPtr != CommentEnd && *TokenPtr == '>') {
647         TokenPtr++;
648         formTokenWithChars(T, TokenPtr, tok::html_slash_greater);
649       } else
650         formTextToken(T, TokenPtr);
651 
652       State = LS_Normal;
653       return;
654     }
655   }
656 
657   // Now look ahead and return to normal state if we don't see any HTML tokens
658   // ahead.
659   BufferPtr = skipWhitespace(BufferPtr, CommentEnd);
660   if (BufferPtr == CommentEnd) {
661     State = LS_Normal;
662     return;
663   }
664 
665   C = *BufferPtr;
666   if (!isHTMLIdentifierStartingCharacter(C) &&
667       C != '=' && C != '\"' && C != '\'' && C != '>') {
668     State = LS_Normal;
669     return;
670   }
671 }
672 
673 void Lexer::setupAndLexHTMLEndTag(Token &T) {
674   assert(BufferPtr[0] == '<' && BufferPtr[1] == '/');
675 
676   const char *TagNameBegin = skipWhitespace(BufferPtr + 2, CommentEnd);
677   const char *TagNameEnd = skipHTMLIdentifier(TagNameBegin, CommentEnd);
678   StringRef Name(TagNameBegin, TagNameEnd - TagNameBegin);
679   if (!isHTMLTagName(Name)) {
680     formTextToken(T, TagNameEnd);
681     return;
682   }
683 
684   const char *End = skipWhitespace(TagNameEnd, CommentEnd);
685 
686   formTokenWithChars(T, End, tok::html_end_tag);
687   T.setHTMLTagEndName(Name);
688 
689   if (BufferPtr != CommentEnd && *BufferPtr == '>')
690     State = LS_HTMLEndTag;
691 }
692 
693 void Lexer::lexHTMLEndTag(Token &T) {
694   assert(BufferPtr != CommentEnd && *BufferPtr == '>');
695 
696   formTokenWithChars(T, BufferPtr + 1, tok::html_greater);
697   State = LS_Normal;
698 }
699 
/// Creates a comment lexer over the character range
/// [\p BufferStart, \p BufferEnd).
///
/// Lexing starts at \p BufferStart in the LCS_BeforeComment / LS_Normal
/// state.  CommentEnd is not initialized here; lex() assigns it when it
/// enters a comment.
///
/// \param Allocator  used for the storage of resolved character references.
/// \param Diags      stored in the Diags member.
/// \param Traits     used to look up command names.
/// \param FileLoc    stored in the FileLoc member.
Lexer::Lexer(llvm::BumpPtrAllocator &Allocator, DiagnosticsEngine &Diags,
             const CommandTraits &Traits,
             SourceLocation FileLoc,
             const char *BufferStart, const char *BufferEnd):
    Allocator(Allocator), Diags(Diags), Traits(Traits),
    BufferStart(BufferStart), BufferEnd(BufferEnd),
    FileLoc(FileLoc), BufferPtr(BufferStart),
    CommentState(LCS_BeforeComment), State(LS_Normal) {
}
709 
/// Produces the next token into \p T.
///
/// The buffer is expected to hold one or more comments separated only by
/// whitespace.  CommentState tracks where the lexer is relative to those
/// comments: before one, inside a BCPL ("//") or C ("/* */") comment, or
/// between two comments.  At the end of the buffer an eof token is formed.
void Lexer::lex(Token &T) {
again:
  switch (CommentState) {
  case LCS_BeforeComment:
    if (BufferPtr == BufferEnd) {
      formTokenWithChars(T, BufferPtr, tok::eof);
      return;
    }

    assert(*BufferPtr == '/');
    BufferPtr++; // Skip first slash.
    switch(*BufferPtr) {
    case '/': { // BCPL comment.
      BufferPtr++; // Skip second slash.

      if (BufferPtr != BufferEnd) {
        // Skip Doxygen magic marker, if it is present.
        // It might be missing because of a typo //< or /*<, or because we
        // merged this non-Doxygen comment into a bunch of Doxygen comments
        // around it: /** ... */ /* ... */ /** ... */
        const char C = *BufferPtr;
        if (C == '/' || C == '!')
          BufferPtr++;
      }

      // Skip less-than symbol that marks trailing comments.
      // Skip it even if the comment is not a Doxygen one, because //< and /*<
      // are frequent typos.
      if (BufferPtr != BufferEnd && *BufferPtr == '<')
        BufferPtr++;

      CommentState = LCS_InsideBCPLComment;
      // A verbatim block state is kept across consecutive BCPL comments; any
      // other state is reset.
      if (State != LS_VerbatimBlockBody && State != LS_VerbatimBlockFirstLine)
        State = LS_Normal;
      CommentEnd = findBCPLCommentEnd(BufferPtr, BufferEnd);
      goto again;
    }
    case '*': { // C comment.
      BufferPtr++; // Skip star.

      // Skip Doxygen magic marker.  A '*' directly followed by '/' is the
      // start of the closing sequence rather than a marker.
      const char C = *BufferPtr;
      if ((C == '*' && *(BufferPtr + 1) != '/') || C == '!')
        BufferPtr++;

      // Skip less-than symbol that marks trailing comments.
      if (BufferPtr != BufferEnd && *BufferPtr == '<')
        BufferPtr++;

      CommentState = LCS_InsideCComment;
      State = LS_Normal;
      CommentEnd = findCCommentEnd(BufferPtr, BufferEnd);
      goto again;
    }
    default:
      llvm_unreachable("second character of comment should be '/' or '*'");
    }

  // Not reached by fallthrough: every path through the inner switch above
  // either jumps back to 'again' or is unreachable.
  case LCS_BetweenComments: {
    // Consecutive comments are extracted only if there is only whitespace
    // between them.  So we can search for the start of the next comment.
    const char *EndWhitespace = BufferPtr;
    while(EndWhitespace != BufferEnd && *EndWhitespace != '/')
      EndWhitespace++;

    // Turn any whitespace between comments (and there is only whitespace
    // between them -- guaranteed by comment extraction) into a newline.  We
    // have two newlines between C comments in total (first one was synthesized
    // after a comment).
    formTokenWithChars(T, EndWhitespace, tok::newline);

    CommentState = LCS_BeforeComment;
    break;
  }

  case LCS_InsideBCPLComment:
  case LCS_InsideCComment:
    if (BufferPtr != CommentEnd) {
      lexCommentText(T);
      break;
    } else {
      // Skip C comment closing sequence.
      if (CommentState == LCS_InsideCComment) {
        assert(BufferPtr[0] == '*' && BufferPtr[1] == '/');
        BufferPtr += 2;
        assert(BufferPtr <= BufferEnd);

        // Synthesize a newline just after the C comment, regardless of
        // whether there is actually a newline.
        formTokenWithChars(T, BufferPtr, tok::newline);

        CommentState = LCS_BetweenComments;
        break;
      } else {
        // Don't synthesize a newline after a BCPL comment.
        CommentState = LCS_BetweenComments;
        goto again;
      }
    }
  }
}
811 
812 StringRef Lexer::getSpelling(const Token &Tok,
813                              const SourceManager &SourceMgr,
814                              bool *Invalid) const {
815   SourceLocation Loc = Tok.getLocation();
816   std::pair<FileID, unsigned> LocInfo = SourceMgr.getDecomposedLoc(Loc);
817 
818   bool InvalidTemp = false;
819   StringRef File = SourceMgr.getBufferData(LocInfo.first, &InvalidTemp);
820   if (InvalidTemp) {
821     *Invalid = true;
822     return StringRef();
823   }
824 
825   const char *Begin = File.data() + LocInfo.second;
826   return StringRef(Begin, Tok.getLength());
827 }
828 
829 } // end namespace comments
830 } // end namespace clang
831 
832