1 //===- Lexer.cpp - C Language Family Lexer --------------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 //  This file implements the Lexer and Token interfaces.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "clang/Lex/Lexer.h"
14 #include "UnicodeCharSets.h"
15 #include "clang/Basic/CharInfo.h"
16 #include "clang/Basic/Diagnostic.h"
17 #include "clang/Basic/IdentifierTable.h"
18 #include "clang/Basic/LLVM.h"
19 #include "clang/Basic/LangOptions.h"
20 #include "clang/Basic/SourceLocation.h"
21 #include "clang/Basic/SourceManager.h"
22 #include "clang/Basic/TokenKinds.h"
23 #include "clang/Lex/LexDiagnostic.h"
24 #include "clang/Lex/LiteralSupport.h"
25 #include "clang/Lex/MultipleIncludeOpt.h"
26 #include "clang/Lex/Preprocessor.h"
27 #include "clang/Lex/PreprocessorOptions.h"
28 #include "clang/Lex/Token.h"
29 #include "llvm/ADT/STLExtras.h"
30 #include "llvm/ADT/StringExtras.h"
31 #include "llvm/ADT/StringRef.h"
32 #include "llvm/ADT/StringSwitch.h"
33 #include "llvm/Support/Compiler.h"
34 #include "llvm/Support/ConvertUTF.h"
35 #include "llvm/Support/MathExtras.h"
36 #include "llvm/Support/MemoryBufferRef.h"
37 #include "llvm/Support/NativeFormatting.h"
38 #include "llvm/Support/Unicode.h"
39 #include "llvm/Support/UnicodeCharRanges.h"
40 #include <algorithm>
41 #include <cassert>
42 #include <cstddef>
43 #include <cstdint>
44 #include <cstring>
45 #include <optional>
46 #include <string>
47 #include <tuple>
48 #include <utility>
49 
50 #ifdef __SSE4_2__
51 #include <nmmintrin.h>
52 #endif
53 
54 using namespace clang;
55 
56 //===----------------------------------------------------------------------===//
57 // Token Class Implementation
58 //===----------------------------------------------------------------------===//
59 
60 /// isObjCAtKeyword - Return true if we have an ObjC keyword identifier.
isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const61 bool Token::isObjCAtKeyword(tok::ObjCKeywordKind objcKey) const {
62   if (isAnnotation())
63     return false;
64   if (const IdentifierInfo *II = getIdentifierInfo())
65     return II->getObjCKeywordID() == objcKey;
66   return false;
67 }
68 
69 /// getObjCKeywordID - Return the ObjC keyword kind.
getObjCKeywordID() const70 tok::ObjCKeywordKind Token::getObjCKeywordID() const {
71   if (isAnnotation())
72     return tok::objc_not_keyword;
73   const IdentifierInfo *specId = getIdentifierInfo();
74   return specId ? specId->getObjCKeywordID() : tok::objc_not_keyword;
75 }
76 
77 //===----------------------------------------------------------------------===//
78 // Lexer Class Implementation
79 //===----------------------------------------------------------------------===//
80 
anchor()81 void Lexer::anchor() {}
82 
InitLexer(const char * BufStart,const char * BufPtr,const char * BufEnd)83 void Lexer::InitLexer(const char *BufStart, const char *BufPtr,
84                       const char *BufEnd) {
85   BufferStart = BufStart;
86   BufferPtr = BufPtr;
87   BufferEnd = BufEnd;
88 
89   assert(BufEnd[0] == 0 &&
90          "We assume that the input buffer has a null character at the end"
91          " to simplify lexing!");
92 
93   // Check whether we have a BOM in the beginning of the buffer. If yes - act
94   // accordingly. Right now we support only UTF-8 with and without BOM, so, just
95   // skip the UTF-8 BOM if it's present.
96   if (BufferStart == BufferPtr) {
97     // Determine the size of the BOM.
98     StringRef Buf(BufferStart, BufferEnd - BufferStart);
99     size_t BOMLength = llvm::StringSwitch<size_t>(Buf)
100       .StartsWith("\xEF\xBB\xBF", 3) // UTF-8 BOM
101       .Default(0);
102 
103     // Skip the BOM.
104     BufferPtr += BOMLength;
105   }
106 
107   Is_PragmaLexer = false;
108   CurrentConflictMarkerState = CMK_None;
109 
110   // Start of the file is a start of line.
111   IsAtStartOfLine = true;
112   IsAtPhysicalStartOfLine = true;
113 
114   HasLeadingSpace = false;
115   HasLeadingEmptyMacro = false;
116 
117   // We are not after parsing a #.
118   ParsingPreprocessorDirective = false;
119 
120   // We are not after parsing #include.
121   ParsingFilename = false;
122 
123   // We are not in raw mode.  Raw mode disables diagnostics and interpretation
124   // of tokens (e.g. identifiers, thus disabling macro expansion).  It is used
125   // to quickly lex the tokens of the buffer, e.g. when handling a "#if 0" block
126   // or otherwise skipping over tokens.
127   LexingRawMode = false;
128 
129   // Default to not keeping comments.
130   ExtendedTokenMode = 0;
131 
132   NewLinePtr = nullptr;
133 }
134 
135 /// Lexer constructor - Create a new lexer object for the specified buffer
136 /// with the specified preprocessor managing the lexing process.  This lexer
137 /// assumes that the associated file buffer and Preprocessor objects will
138 /// outlive it, so it doesn't take ownership of either of them.
Lexer(FileID FID,const llvm::MemoryBufferRef & InputFile,Preprocessor & PP,bool IsFirstIncludeOfFile)139 Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &InputFile,
140              Preprocessor &PP, bool IsFirstIncludeOfFile)
141     : PreprocessorLexer(&PP, FID),
142       FileLoc(PP.getSourceManager().getLocForStartOfFile(FID)),
143       LangOpts(PP.getLangOpts()), LineComment(LangOpts.LineComment),
144       IsFirstTimeLexingFile(IsFirstIncludeOfFile) {
145   InitLexer(InputFile.getBufferStart(), InputFile.getBufferStart(),
146             InputFile.getBufferEnd());
147 
148   resetExtendedTokenMode();
149 }
150 
151 /// Lexer constructor - Create a new raw lexer object.  This object is only
152 /// suitable for calls to 'LexFromRawLexer'.  This lexer assumes that the text
153 /// range will outlive it, so it doesn't take ownership of it.
Lexer(SourceLocation fileloc,const LangOptions & langOpts,const char * BufStart,const char * BufPtr,const char * BufEnd,bool IsFirstIncludeOfFile)154 Lexer::Lexer(SourceLocation fileloc, const LangOptions &langOpts,
155              const char *BufStart, const char *BufPtr, const char *BufEnd,
156              bool IsFirstIncludeOfFile)
157     : FileLoc(fileloc), LangOpts(langOpts), LineComment(LangOpts.LineComment),
158       IsFirstTimeLexingFile(IsFirstIncludeOfFile) {
159   InitLexer(BufStart, BufPtr, BufEnd);
160 
161   // We *are* in raw mode.
162   LexingRawMode = true;
163 }
164 
165 /// Lexer constructor - Create a new raw lexer object.  This object is only
166 /// suitable for calls to 'LexFromRawLexer'.  This lexer assumes that the text
167 /// range will outlive it, so it doesn't take ownership of it.
Lexer(FileID FID,const llvm::MemoryBufferRef & FromFile,const SourceManager & SM,const LangOptions & langOpts,bool IsFirstIncludeOfFile)168 Lexer::Lexer(FileID FID, const llvm::MemoryBufferRef &FromFile,
169              const SourceManager &SM, const LangOptions &langOpts,
170              bool IsFirstIncludeOfFile)
171     : Lexer(SM.getLocForStartOfFile(FID), langOpts, FromFile.getBufferStart(),
172             FromFile.getBufferStart(), FromFile.getBufferEnd(),
173             IsFirstIncludeOfFile) {}
174 
resetExtendedTokenMode()175 void Lexer::resetExtendedTokenMode() {
176   assert(PP && "Cannot reset token mode without a preprocessor");
177   if (LangOpts.TraditionalCPP)
178     SetKeepWhitespaceMode(true);
179   else
180     SetCommentRetentionState(PP->getCommentRetentionState());
181 }
182 
183 /// Create_PragmaLexer: Lexer constructor - Create a new lexer object for
184 /// _Pragma expansion.  This has a variety of magic semantics that this method
185 /// sets up.  It returns a new'd Lexer that must be delete'd when done.
186 ///
187 /// On entrance to this routine, TokStartLoc is a macro location which has a
188 /// spelling loc that indicates the bytes to be lexed for the token and an
189 /// expansion location that indicates where all lexed tokens should be
190 /// "expanded from".
191 ///
192 /// TODO: It would really be nice to make _Pragma just be a wrapper around a
193 /// normal lexer that remaps tokens as they fly by.  This would require making
194 /// Preprocessor::Lex virtual.  Given that, we could just dump in a magic lexer
195 /// interface that could handle this stuff.  This would pull GetMappedTokenLoc
196 /// out of the critical path of the lexer!
197 ///
Create_PragmaLexer(SourceLocation SpellingLoc,SourceLocation ExpansionLocStart,SourceLocation ExpansionLocEnd,unsigned TokLen,Preprocessor & PP)198 Lexer *Lexer::Create_PragmaLexer(SourceLocation SpellingLoc,
199                                  SourceLocation ExpansionLocStart,
200                                  SourceLocation ExpansionLocEnd,
201                                  unsigned TokLen, Preprocessor &PP) {
202   SourceManager &SM = PP.getSourceManager();
203 
204   // Create the lexer as if we were going to lex the file normally.
205   FileID SpellingFID = SM.getFileID(SpellingLoc);
206   llvm::MemoryBufferRef InputFile = SM.getBufferOrFake(SpellingFID);
207   Lexer *L = new Lexer(SpellingFID, InputFile, PP);
208 
209   // Now that the lexer is created, change the start/end locations so that we
210   // just lex the subsection of the file that we want.  This is lexing from a
211   // scratch buffer.
212   const char *StrData = SM.getCharacterData(SpellingLoc);
213 
214   L->BufferPtr = StrData;
215   L->BufferEnd = StrData+TokLen;
216   assert(L->BufferEnd[0] == 0 && "Buffer is not nul terminated!");
217 
218   // Set the SourceLocation with the remapping information.  This ensures that
219   // GetMappedTokenLoc will remap the tokens as they are lexed.
220   L->FileLoc = SM.createExpansionLoc(SM.getLocForStartOfFile(SpellingFID),
221                                      ExpansionLocStart,
222                                      ExpansionLocEnd, TokLen);
223 
224   // Ensure that the lexer thinks it is inside a directive, so that end \n will
225   // return an EOD token.
226   L->ParsingPreprocessorDirective = true;
227 
228   // This lexer really is for _Pragma.
229   L->Is_PragmaLexer = true;
230   return L;
231 }
232 
seek(unsigned Offset,bool IsAtStartOfLine)233 void Lexer::seek(unsigned Offset, bool IsAtStartOfLine) {
234   this->IsAtPhysicalStartOfLine = IsAtStartOfLine;
235   this->IsAtStartOfLine = IsAtStartOfLine;
236   assert((BufferStart + Offset) <= BufferEnd);
237   BufferPtr = BufferStart + Offset;
238 }
239 
/// Escape \p Str in place so that it can be embedded in a literal delimited
/// by \p Quote.
///
/// Every backslash and every occurrence of \p Quote gets a backslash inserted
/// in front of it. Every line break becomes the two characters '\\' 'n': the
/// pairs "\r\n" and "\n\r" count as a single break, while a lone '\n' or '\r'
/// (including a repeated one such as "\n\n") is one break each.
///
/// \p T must provide size_type, size(), operator[], begin() and single
/// character insert().
template <typename T> static void StringifyImpl(T &Str, char Quote) {
  using SizeT = typename T::size_type;

  const auto IsLineBreak = [](char C) { return C == '\n' || C == '\r'; };

  SizeT Pos = 0;
  SizeT Len = Str.size(); // Kept in sync with Str as characters are inserted.

  while (Pos < Len) {
    const char Cur = Str[Pos];

    if (Cur == '\\' || Cur == Quote) {
      // Insert the escaping backslash, then step over it and the character
      // it protects.
      Str.insert(Str.begin() + Pos, '\\');
      ++Len;
      Pos += 2;
      continue;
    }

    if (!IsLineBreak(Cur)) {
      ++Pos;
      continue;
    }

    // A two-character break is a line break followed by the *other* line
    // break character; it is rewritten in place without growing the string.
    const bool IsTwoCharBreak =
        Pos + 1 < Len && IsLineBreak(Str[Pos + 1]) && Str[Pos + 1] != Cur;

    Str[Pos] = '\\';
    if (IsTwoCharBreak) {
      Str[Pos + 1] = 'n';
    } else {
      Str.insert(Str.begin() + Pos + 1, 'n');
      ++Len;
    }
    Pos += 2;
  }
}
264 
Stringify(StringRef Str,bool Charify)265 std::string Lexer::Stringify(StringRef Str, bool Charify) {
266   std::string Result = std::string(Str);
267   char Quote = Charify ? '\'' : '"';
268   StringifyImpl(Result, Quote);
269   return Result;
270 }
271 
Stringify(SmallVectorImpl<char> & Str)272 void Lexer::Stringify(SmallVectorImpl<char> &Str) { StringifyImpl(Str, '"'); }
273 
274 //===----------------------------------------------------------------------===//
275 // Token Spelling
276 //===----------------------------------------------------------------------===//
277 
278 /// Slow case of getSpelling. Extract the characters comprising the
279 /// spelling of this token from the provided input buffer.
getSpellingSlow(const Token & Tok,const char * BufPtr,const LangOptions & LangOpts,char * Spelling)280 static size_t getSpellingSlow(const Token &Tok, const char *BufPtr,
281                               const LangOptions &LangOpts, char *Spelling) {
282   assert(Tok.needsCleaning() && "getSpellingSlow called on simple token");
283 
284   size_t Length = 0;
285   const char *BufEnd = BufPtr + Tok.getLength();
286 
287   if (tok::isStringLiteral(Tok.getKind())) {
288     // Munch the encoding-prefix and opening double-quote.
289     while (BufPtr < BufEnd) {
290       auto CharAndSize = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts);
291       Spelling[Length++] = CharAndSize.Char;
292       BufPtr += CharAndSize.Size;
293 
294       if (Spelling[Length - 1] == '"')
295         break;
296     }
297 
298     // Raw string literals need special handling; trigraph expansion and line
299     // splicing do not occur within their d-char-sequence nor within their
300     // r-char-sequence.
301     if (Length >= 2 &&
302         Spelling[Length - 2] == 'R' && Spelling[Length - 1] == '"') {
303       // Search backwards from the end of the token to find the matching closing
304       // quote.
305       const char *RawEnd = BufEnd;
306       do --RawEnd; while (*RawEnd != '"');
307       size_t RawLength = RawEnd - BufPtr + 1;
308 
309       // Everything between the quotes is included verbatim in the spelling.
310       memcpy(Spelling + Length, BufPtr, RawLength);
311       Length += RawLength;
312       BufPtr += RawLength;
313 
314       // The rest of the token is lexed normally.
315     }
316   }
317 
318   while (BufPtr < BufEnd) {
319     auto CharAndSize = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts);
320     Spelling[Length++] = CharAndSize.Char;
321     BufPtr += CharAndSize.Size;
322   }
323 
324   assert(Length < Tok.getLength() &&
325          "NeedsCleaning flag set on token that didn't need cleaning!");
326   return Length;
327 }
328 
329 /// getSpelling() - Return the 'spelling' of this token.  The spelling of a
330 /// token are the characters used to represent the token in the source file
331 /// after trigraph expansion and escaped-newline folding.  In particular, this
332 /// wants to get the true, uncanonicalized, spelling of things like digraphs
333 /// UCNs, etc.
getSpelling(SourceLocation loc,SmallVectorImpl<char> & buffer,const SourceManager & SM,const LangOptions & options,bool * invalid)334 StringRef Lexer::getSpelling(SourceLocation loc,
335                              SmallVectorImpl<char> &buffer,
336                              const SourceManager &SM,
337                              const LangOptions &options,
338                              bool *invalid) {
339   // Break down the source location.
340   std::pair<FileID, unsigned> locInfo = SM.getDecomposedLoc(loc);
341 
342   // Try to the load the file buffer.
343   bool invalidTemp = false;
344   StringRef file = SM.getBufferData(locInfo.first, &invalidTemp);
345   if (invalidTemp) {
346     if (invalid) *invalid = true;
347     return {};
348   }
349 
350   const char *tokenBegin = file.data() + locInfo.second;
351 
352   // Lex from the start of the given location.
353   Lexer lexer(SM.getLocForStartOfFile(locInfo.first), options,
354               file.begin(), tokenBegin, file.end());
355   Token token;
356   lexer.LexFromRawLexer(token);
357 
358   unsigned length = token.getLength();
359 
360   // Common case:  no need for cleaning.
361   if (!token.needsCleaning())
362     return StringRef(tokenBegin, length);
363 
364   // Hard case, we need to relex the characters into the string.
365   buffer.resize(length);
366   buffer.resize(getSpellingSlow(token, tokenBegin, options, buffer.data()));
367   return StringRef(buffer.data(), buffer.size());
368 }
369 
370 /// getSpelling() - Return the 'spelling' of this token.  The spelling of a
371 /// token are the characters used to represent the token in the source file
372 /// after trigraph expansion and escaped-newline folding.  In particular, this
373 /// wants to get the true, uncanonicalized, spelling of things like digraphs
374 /// UCNs, etc.
getSpelling(const Token & Tok,const SourceManager & SourceMgr,const LangOptions & LangOpts,bool * Invalid)375 std::string Lexer::getSpelling(const Token &Tok, const SourceManager &SourceMgr,
376                                const LangOptions &LangOpts, bool *Invalid) {
377   assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");
378 
379   bool CharDataInvalid = false;
380   const char *TokStart = SourceMgr.getCharacterData(Tok.getLocation(),
381                                                     &CharDataInvalid);
382   if (Invalid)
383     *Invalid = CharDataInvalid;
384   if (CharDataInvalid)
385     return {};
386 
387   // If this token contains nothing interesting, return it directly.
388   if (!Tok.needsCleaning())
389     return std::string(TokStart, TokStart + Tok.getLength());
390 
391   std::string Result;
392   Result.resize(Tok.getLength());
393   Result.resize(getSpellingSlow(Tok, TokStart, LangOpts, &*Result.begin()));
394   return Result;
395 }
396 
397 /// getSpelling - This method is used to get the spelling of a token into a
398 /// preallocated buffer, instead of as an std::string.  The caller is required
399 /// to allocate enough space for the token, which is guaranteed to be at least
400 /// Tok.getLength() bytes long.  The actual length of the token is returned.
401 ///
402 /// Note that this method may do two possible things: it may either fill in
403 /// the buffer specified with characters, or it may *change the input pointer*
404 /// to point to a constant buffer with the data already in it (avoiding a
405 /// copy).  The caller is not allowed to modify the returned buffer pointer
406 /// if an internal buffer is returned.
getSpelling(const Token & Tok,const char * & Buffer,const SourceManager & SourceMgr,const LangOptions & LangOpts,bool * Invalid)407 unsigned Lexer::getSpelling(const Token &Tok, const char *&Buffer,
408                             const SourceManager &SourceMgr,
409                             const LangOptions &LangOpts, bool *Invalid) {
410   assert((int)Tok.getLength() >= 0 && "Token character range is bogus!");
411 
412   const char *TokStart = nullptr;
413   // NOTE: this has to be checked *before* testing for an IdentifierInfo.
414   if (Tok.is(tok::raw_identifier))
415     TokStart = Tok.getRawIdentifier().data();
416   else if (!Tok.hasUCN()) {
417     if (const IdentifierInfo *II = Tok.getIdentifierInfo()) {
418       // Just return the string from the identifier table, which is very quick.
419       Buffer = II->getNameStart();
420       return II->getLength();
421     }
422   }
423 
424   // NOTE: this can be checked even after testing for an IdentifierInfo.
425   if (Tok.isLiteral())
426     TokStart = Tok.getLiteralData();
427 
428   if (!TokStart) {
429     // Compute the start of the token in the input lexer buffer.
430     bool CharDataInvalid = false;
431     TokStart = SourceMgr.getCharacterData(Tok.getLocation(), &CharDataInvalid);
432     if (Invalid)
433       *Invalid = CharDataInvalid;
434     if (CharDataInvalid) {
435       Buffer = "";
436       return 0;
437     }
438   }
439 
440   // If this token contains nothing interesting, return it directly.
441   if (!Tok.needsCleaning()) {
442     Buffer = TokStart;
443     return Tok.getLength();
444   }
445 
446   // Otherwise, hard case, relex the characters into the string.
447   return getSpellingSlow(Tok, TokStart, LangOpts, const_cast<char*>(Buffer));
448 }
449 
450 /// MeasureTokenLength - Relex the token at the specified location and return
451 /// its length in bytes in the input file.  If the token needs cleaning (e.g.
452 /// includes a trigraph or an escaped newline) then this count includes bytes
453 /// that are part of that.
MeasureTokenLength(SourceLocation Loc,const SourceManager & SM,const LangOptions & LangOpts)454 unsigned Lexer::MeasureTokenLength(SourceLocation Loc,
455                                    const SourceManager &SM,
456                                    const LangOptions &LangOpts) {
457   Token TheTok;
458   if (getRawToken(Loc, TheTok, SM, LangOpts))
459     return 0;
460   return TheTok.getLength();
461 }
462 
463 /// Relex the token at the specified location.
464 /// \returns true if there was a failure, false on success.
getRawToken(SourceLocation Loc,Token & Result,const SourceManager & SM,const LangOptions & LangOpts,bool IgnoreWhiteSpace)465 bool Lexer::getRawToken(SourceLocation Loc, Token &Result,
466                         const SourceManager &SM,
467                         const LangOptions &LangOpts,
468                         bool IgnoreWhiteSpace) {
469   // TODO: this could be special cased for common tokens like identifiers, ')',
470   // etc to make this faster, if it mattered.  Just look at StrData[0] to handle
471   // all obviously single-char tokens.  This could use
472   // Lexer::isObviouslySimpleCharacter for example to handle identifiers or
473   // something.
474 
475   // If this comes from a macro expansion, we really do want the macro name, not
476   // the token this macro expanded to.
477   Loc = SM.getExpansionLoc(Loc);
478   std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
479   bool Invalid = false;
480   StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
481   if (Invalid)
482     return true;
483 
484   const char *StrData = Buffer.data()+LocInfo.second;
485 
486   if (!IgnoreWhiteSpace && isWhitespace(StrData[0]))
487     return true;
488 
489   // Create a lexer starting at the beginning of this token.
490   Lexer TheLexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts,
491                  Buffer.begin(), StrData, Buffer.end());
492   TheLexer.SetCommentRetentionState(true);
493   TheLexer.LexFromRawLexer(Result);
494   return false;
495 }
496 
497 /// Returns the pointer that points to the beginning of line that contains
498 /// the given offset, or null if the offset if invalid.
findBeginningOfLine(StringRef Buffer,unsigned Offset)499 static const char *findBeginningOfLine(StringRef Buffer, unsigned Offset) {
500   const char *BufStart = Buffer.data();
501   if (Offset >= Buffer.size())
502     return nullptr;
503 
504   const char *LexStart = BufStart + Offset;
505   for (; LexStart != BufStart; --LexStart) {
506     if (isVerticalWhitespace(LexStart[0]) &&
507         !Lexer::isNewLineEscaped(BufStart, LexStart)) {
508       // LexStart should point at first character of logical line.
509       ++LexStart;
510       break;
511     }
512   }
513   return LexStart;
514 }
515 
getBeginningOfFileToken(SourceLocation Loc,const SourceManager & SM,const LangOptions & LangOpts)516 static SourceLocation getBeginningOfFileToken(SourceLocation Loc,
517                                               const SourceManager &SM,
518                                               const LangOptions &LangOpts) {
519   assert(Loc.isFileID());
520   std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
521   if (LocInfo.first.isInvalid())
522     return Loc;
523 
524   bool Invalid = false;
525   StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
526   if (Invalid)
527     return Loc;
528 
529   // Back up from the current location until we hit the beginning of a line
530   // (or the buffer). We'll relex from that point.
531   const char *StrData = Buffer.data() + LocInfo.second;
532   const char *LexStart = findBeginningOfLine(Buffer, LocInfo.second);
533   if (!LexStart || LexStart == StrData)
534     return Loc;
535 
536   // Create a lexer starting at the beginning of this token.
537   SourceLocation LexerStartLoc = Loc.getLocWithOffset(-LocInfo.second);
538   Lexer TheLexer(LexerStartLoc, LangOpts, Buffer.data(), LexStart,
539                  Buffer.end());
540   TheLexer.SetCommentRetentionState(true);
541 
542   // Lex tokens until we find the token that contains the source location.
543   Token TheTok;
544   do {
545     TheLexer.LexFromRawLexer(TheTok);
546 
547     if (TheLexer.getBufferLocation() > StrData) {
548       // Lexing this token has taken the lexer past the source location we're
549       // looking for. If the current token encompasses our source location,
550       // return the beginning of that token.
551       if (TheLexer.getBufferLocation() - TheTok.getLength() <= StrData)
552         return TheTok.getLocation();
553 
554       // We ended up skipping over the source location entirely, which means
555       // that it points into whitespace. We're done here.
556       break;
557     }
558   } while (TheTok.getKind() != tok::eof);
559 
560   // We've passed our source location; just return the original source location.
561   return Loc;
562 }
563 
GetBeginningOfToken(SourceLocation Loc,const SourceManager & SM,const LangOptions & LangOpts)564 SourceLocation Lexer::GetBeginningOfToken(SourceLocation Loc,
565                                           const SourceManager &SM,
566                                           const LangOptions &LangOpts) {
567   if (Loc.isFileID())
568     return getBeginningOfFileToken(Loc, SM, LangOpts);
569 
570   if (!SM.isMacroArgExpansion(Loc))
571     return Loc;
572 
573   SourceLocation FileLoc = SM.getSpellingLoc(Loc);
574   SourceLocation BeginFileLoc = getBeginningOfFileToken(FileLoc, SM, LangOpts);
575   std::pair<FileID, unsigned> FileLocInfo = SM.getDecomposedLoc(FileLoc);
576   std::pair<FileID, unsigned> BeginFileLocInfo =
577       SM.getDecomposedLoc(BeginFileLoc);
578   assert(FileLocInfo.first == BeginFileLocInfo.first &&
579          FileLocInfo.second >= BeginFileLocInfo.second);
580   return Loc.getLocWithOffset(BeginFileLocInfo.second - FileLocInfo.second);
581 }
582 
namespace {

/// How Lexer::ComputePreamble treats a preprocessor directive, as decided
/// from the directive's keyword.
enum PreambleDirectiveKind {
  /// A recognized directive; it is skipped over and stays in the preamble.
  PDK_Skipped,
  /// An unrecognized directive; the preamble ends at its '#'.
  PDK_Unknown
};

} // namespace
591 
ComputePreamble(StringRef Buffer,const LangOptions & LangOpts,unsigned MaxLines)592 PreambleBounds Lexer::ComputePreamble(StringRef Buffer,
593                                       const LangOptions &LangOpts,
594                                       unsigned MaxLines) {
595   // Create a lexer starting at the beginning of the file. Note that we use a
596   // "fake" file source location at offset 1 so that the lexer will track our
597   // position within the file.
598   const SourceLocation::UIntTy StartOffset = 1;
599   SourceLocation FileLoc = SourceLocation::getFromRawEncoding(StartOffset);
600   Lexer TheLexer(FileLoc, LangOpts, Buffer.begin(), Buffer.begin(),
601                  Buffer.end());
602   TheLexer.SetCommentRetentionState(true);
603 
604   bool InPreprocessorDirective = false;
605   Token TheTok;
606   SourceLocation ActiveCommentLoc;
607 
608   unsigned MaxLineOffset = 0;
609   if (MaxLines) {
610     const char *CurPtr = Buffer.begin();
611     unsigned CurLine = 0;
612     while (CurPtr != Buffer.end()) {
613       char ch = *CurPtr++;
614       if (ch == '\n') {
615         ++CurLine;
616         if (CurLine == MaxLines)
617           break;
618       }
619     }
620     if (CurPtr != Buffer.end())
621       MaxLineOffset = CurPtr - Buffer.begin();
622   }
623 
624   do {
625     TheLexer.LexFromRawLexer(TheTok);
626 
627     if (InPreprocessorDirective) {
628       // If we've hit the end of the file, we're done.
629       if (TheTok.getKind() == tok::eof) {
630         break;
631       }
632 
633       // If we haven't hit the end of the preprocessor directive, skip this
634       // token.
635       if (!TheTok.isAtStartOfLine())
636         continue;
637 
638       // We've passed the end of the preprocessor directive, and will look
639       // at this token again below.
640       InPreprocessorDirective = false;
641     }
642 
643     // Keep track of the # of lines in the preamble.
644     if (TheTok.isAtStartOfLine()) {
645       unsigned TokOffset = TheTok.getLocation().getRawEncoding() - StartOffset;
646 
647       // If we were asked to limit the number of lines in the preamble,
648       // and we're about to exceed that limit, we're done.
649       if (MaxLineOffset && TokOffset >= MaxLineOffset)
650         break;
651     }
652 
653     // Comments are okay; skip over them.
654     if (TheTok.getKind() == tok::comment) {
655       if (ActiveCommentLoc.isInvalid())
656         ActiveCommentLoc = TheTok.getLocation();
657       continue;
658     }
659 
660     if (TheTok.isAtStartOfLine() && TheTok.getKind() == tok::hash) {
661       // This is the start of a preprocessor directive.
662       Token HashTok = TheTok;
663       InPreprocessorDirective = true;
664       ActiveCommentLoc = SourceLocation();
665 
666       // Figure out which directive this is. Since we're lexing raw tokens,
667       // we don't have an identifier table available. Instead, just look at
668       // the raw identifier to recognize and categorize preprocessor directives.
669       TheLexer.LexFromRawLexer(TheTok);
670       if (TheTok.getKind() == tok::raw_identifier && !TheTok.needsCleaning()) {
671         StringRef Keyword = TheTok.getRawIdentifier();
672         PreambleDirectiveKind PDK
673           = llvm::StringSwitch<PreambleDirectiveKind>(Keyword)
674               .Case("include", PDK_Skipped)
675               .Case("__include_macros", PDK_Skipped)
676               .Case("define", PDK_Skipped)
677               .Case("undef", PDK_Skipped)
678               .Case("line", PDK_Skipped)
679               .Case("error", PDK_Skipped)
680               .Case("pragma", PDK_Skipped)
681               .Case("import", PDK_Skipped)
682               .Case("include_next", PDK_Skipped)
683               .Case("warning", PDK_Skipped)
684               .Case("ident", PDK_Skipped)
685               .Case("sccs", PDK_Skipped)
686               .Case("assert", PDK_Skipped)
687               .Case("unassert", PDK_Skipped)
688               .Case("if", PDK_Skipped)
689               .Case("ifdef", PDK_Skipped)
690               .Case("ifndef", PDK_Skipped)
691               .Case("elif", PDK_Skipped)
692               .Case("elifdef", PDK_Skipped)
693               .Case("elifndef", PDK_Skipped)
694               .Case("else", PDK_Skipped)
695               .Case("endif", PDK_Skipped)
696               .Default(PDK_Unknown);
697 
698         switch (PDK) {
699         case PDK_Skipped:
700           continue;
701 
702         case PDK_Unknown:
703           // We don't know what this directive is; stop at the '#'.
704           break;
705         }
706       }
707 
708       // We only end up here if we didn't recognize the preprocessor
709       // directive or it was one that can't occur in the preamble at this
710       // point. Roll back the current token to the location of the '#'.
711       TheTok = HashTok;
712     } else if (TheTok.isAtStartOfLine() &&
713                TheTok.getKind() == tok::raw_identifier &&
714                TheTok.getRawIdentifier() == "module" &&
715                LangOpts.CPlusPlusModules) {
716       // The initial global module fragment introducer "module;" is part of
717       // the preamble, which runs up to the module declaration "module foo;".
718       Token ModuleTok = TheTok;
719       do {
720         TheLexer.LexFromRawLexer(TheTok);
721       } while (TheTok.getKind() == tok::comment);
722       if (TheTok.getKind() != tok::semi) {
723         // Not global module fragment, roll back.
724         TheTok = ModuleTok;
725         break;
726       }
727       continue;
728     }
729 
730     // We hit a token that we don't recognize as being in the
731     // "preprocessing only" part of the file, so we're no longer in
732     // the preamble.
733     break;
734   } while (true);
735 
736   SourceLocation End;
737   if (ActiveCommentLoc.isValid())
738     End = ActiveCommentLoc; // don't truncate a decl comment.
739   else
740     End = TheTok.getLocation();
741 
742   return PreambleBounds(End.getRawEncoding() - FileLoc.getRawEncoding(),
743                         TheTok.isAtStartOfLine());
744 }
745 
getTokenPrefixLength(SourceLocation TokStart,unsigned CharNo,const SourceManager & SM,const LangOptions & LangOpts)746 unsigned Lexer::getTokenPrefixLength(SourceLocation TokStart, unsigned CharNo,
747                                      const SourceManager &SM,
748                                      const LangOptions &LangOpts) {
749   // Figure out how many physical characters away the specified expansion
750   // character is.  This needs to take into consideration newlines and
751   // trigraphs.
752   bool Invalid = false;
753   const char *TokPtr = SM.getCharacterData(TokStart, &Invalid);
754 
755   // If they request the first char of the token, we're trivially done.
756   if (Invalid || (CharNo == 0 && Lexer::isObviouslySimpleCharacter(*TokPtr)))
757     return 0;
758 
759   unsigned PhysOffset = 0;
760 
761   // The usual case is that tokens don't contain anything interesting.  Skip
762   // over the uninteresting characters.  If a token only consists of simple
763   // chars, this method is extremely fast.
764   while (Lexer::isObviouslySimpleCharacter(*TokPtr)) {
765     if (CharNo == 0)
766       return PhysOffset;
767     ++TokPtr;
768     --CharNo;
769     ++PhysOffset;
770   }
771 
772   // If we have a character that may be a trigraph or escaped newline, use a
773   // lexer to parse it correctly.
774   for (; CharNo; --CharNo) {
775     auto CharAndSize = Lexer::getCharAndSizeNoWarn(TokPtr, LangOpts);
776     TokPtr += CharAndSize.Size;
777     PhysOffset += CharAndSize.Size;
778   }
779 
780   // Final detail: if we end up on an escaped newline, we want to return the
781   // location of the actual byte of the token.  For example foo\<newline>bar
782   // advanced by 3 should return the location of b, not of \\.  One compounding
783   // detail of this is that the escape may be made by a trigraph.
784   if (!Lexer::isObviouslySimpleCharacter(*TokPtr))
785     PhysOffset += Lexer::SkipEscapedNewLines(TokPtr)-TokPtr;
786 
787   return PhysOffset;
788 }
789 
790 /// Computes the source location just past the end of the
791 /// token at this source location.
792 ///
793 /// This routine can be used to produce a source location that
794 /// points just past the end of the token referenced by \p Loc, and
795 /// is generally used when a diagnostic needs to point just after a
796 /// token where it expected something different that it received. If
797 /// the returned source location would not be meaningful (e.g., if
798 /// it points into a macro), this routine returns an invalid
799 /// source location.
800 ///
801 /// \param Offset an offset from the end of the token, where the source
802 /// location should refer to. The default offset (0) produces a source
803 /// location pointing just past the end of the token; an offset of 1 produces
804 /// a source location pointing to the last character in the token, etc.
getLocForEndOfToken(SourceLocation Loc,unsigned Offset,const SourceManager & SM,const LangOptions & LangOpts)805 SourceLocation Lexer::getLocForEndOfToken(SourceLocation Loc, unsigned Offset,
806                                           const SourceManager &SM,
807                                           const LangOptions &LangOpts) {
808   if (Loc.isInvalid())
809     return {};
810 
811   if (Loc.isMacroID()) {
812     if (Offset > 0 || !isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
813       return {}; // Points inside the macro expansion.
814   }
815 
816   unsigned Len = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
817   if (Len > Offset)
818     Len = Len - Offset;
819   else
820     return Loc;
821 
822   return Loc.getLocWithOffset(Len);
823 }
824 
825 /// Returns true if the given MacroID location points at the first
826 /// token of the macro expansion.
isAtStartOfMacroExpansion(SourceLocation loc,const SourceManager & SM,const LangOptions & LangOpts,SourceLocation * MacroBegin)827 bool Lexer::isAtStartOfMacroExpansion(SourceLocation loc,
828                                       const SourceManager &SM,
829                                       const LangOptions &LangOpts,
830                                       SourceLocation *MacroBegin) {
831   assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");
832 
833   SourceLocation expansionLoc;
834   if (!SM.isAtStartOfImmediateMacroExpansion(loc, &expansionLoc))
835     return false;
836 
837   if (expansionLoc.isFileID()) {
838     // No other macro expansions, this is the first.
839     if (MacroBegin)
840       *MacroBegin = expansionLoc;
841     return true;
842   }
843 
844   return isAtStartOfMacroExpansion(expansionLoc, SM, LangOpts, MacroBegin);
845 }
846 
847 /// Returns true if the given MacroID location points at the last
848 /// token of the macro expansion.
isAtEndOfMacroExpansion(SourceLocation loc,const SourceManager & SM,const LangOptions & LangOpts,SourceLocation * MacroEnd)849 bool Lexer::isAtEndOfMacroExpansion(SourceLocation loc,
850                                     const SourceManager &SM,
851                                     const LangOptions &LangOpts,
852                                     SourceLocation *MacroEnd) {
853   assert(loc.isValid() && loc.isMacroID() && "Expected a valid macro loc");
854 
855   SourceLocation spellLoc = SM.getSpellingLoc(loc);
856   unsigned tokLen = MeasureTokenLength(spellLoc, SM, LangOpts);
857   if (tokLen == 0)
858     return false;
859 
860   SourceLocation afterLoc = loc.getLocWithOffset(tokLen);
861   SourceLocation expansionLoc;
862   if (!SM.isAtEndOfImmediateMacroExpansion(afterLoc, &expansionLoc))
863     return false;
864 
865   if (expansionLoc.isFileID()) {
866     // No other macro expansions.
867     if (MacroEnd)
868       *MacroEnd = expansionLoc;
869     return true;
870   }
871 
872   return isAtEndOfMacroExpansion(expansionLoc, SM, LangOpts, MacroEnd);
873 }
874 
makeRangeFromFileLocs(CharSourceRange Range,const SourceManager & SM,const LangOptions & LangOpts)875 static CharSourceRange makeRangeFromFileLocs(CharSourceRange Range,
876                                              const SourceManager &SM,
877                                              const LangOptions &LangOpts) {
878   SourceLocation Begin = Range.getBegin();
879   SourceLocation End = Range.getEnd();
880   assert(Begin.isFileID() && End.isFileID());
881   if (Range.isTokenRange()) {
882     End = Lexer::getLocForEndOfToken(End, 0, SM,LangOpts);
883     if (End.isInvalid())
884       return {};
885   }
886 
887   // Break down the source locations.
888   FileID FID;
889   unsigned BeginOffs;
890   std::tie(FID, BeginOffs) = SM.getDecomposedLoc(Begin);
891   if (FID.isInvalid())
892     return {};
893 
894   unsigned EndOffs;
895   if (!SM.isInFileID(End, FID, &EndOffs) ||
896       BeginOffs > EndOffs)
897     return {};
898 
899   return CharSourceRange::getCharRange(Begin, End);
900 }
901 
902 // Assumes that `Loc` is in an expansion.
isInExpansionTokenRange(const SourceLocation Loc,const SourceManager & SM)903 static bool isInExpansionTokenRange(const SourceLocation Loc,
904                                     const SourceManager &SM) {
905   return SM.getSLocEntry(SM.getFileID(Loc))
906       .getExpansion()
907       .isExpansionTokenRange();
908 }
909 
makeFileCharRange(CharSourceRange Range,const SourceManager & SM,const LangOptions & LangOpts)910 CharSourceRange Lexer::makeFileCharRange(CharSourceRange Range,
911                                          const SourceManager &SM,
912                                          const LangOptions &LangOpts) {
913   SourceLocation Begin = Range.getBegin();
914   SourceLocation End = Range.getEnd();
915   if (Begin.isInvalid() || End.isInvalid())
916     return {};
917 
918   if (Begin.isFileID() && End.isFileID())
919     return makeRangeFromFileLocs(Range, SM, LangOpts);
920 
921   if (Begin.isMacroID() && End.isFileID()) {
922     if (!isAtStartOfMacroExpansion(Begin, SM, LangOpts, &Begin))
923       return {};
924     Range.setBegin(Begin);
925     return makeRangeFromFileLocs(Range, SM, LangOpts);
926   }
927 
928   if (Begin.isFileID() && End.isMacroID()) {
929     if (Range.isTokenRange()) {
930       if (!isAtEndOfMacroExpansion(End, SM, LangOpts, &End))
931         return {};
932       // Use the *original* end, not the expanded one in `End`.
933       Range.setTokenRange(isInExpansionTokenRange(Range.getEnd(), SM));
934     } else if (!isAtStartOfMacroExpansion(End, SM, LangOpts, &End))
935       return {};
936     Range.setEnd(End);
937     return makeRangeFromFileLocs(Range, SM, LangOpts);
938   }
939 
940   assert(Begin.isMacroID() && End.isMacroID());
941   SourceLocation MacroBegin, MacroEnd;
942   if (isAtStartOfMacroExpansion(Begin, SM, LangOpts, &MacroBegin) &&
943       ((Range.isTokenRange() && isAtEndOfMacroExpansion(End, SM, LangOpts,
944                                                         &MacroEnd)) ||
945        (Range.isCharRange() && isAtStartOfMacroExpansion(End, SM, LangOpts,
946                                                          &MacroEnd)))) {
947     Range.setBegin(MacroBegin);
948     Range.setEnd(MacroEnd);
949     // Use the *original* `End`, not the expanded one in `MacroEnd`.
950     if (Range.isTokenRange())
951       Range.setTokenRange(isInExpansionTokenRange(End, SM));
952     return makeRangeFromFileLocs(Range, SM, LangOpts);
953   }
954 
955   bool Invalid = false;
956   const SrcMgr::SLocEntry &BeginEntry = SM.getSLocEntry(SM.getFileID(Begin),
957                                                         &Invalid);
958   if (Invalid)
959     return {};
960 
961   if (BeginEntry.getExpansion().isMacroArgExpansion()) {
962     const SrcMgr::SLocEntry &EndEntry = SM.getSLocEntry(SM.getFileID(End),
963                                                         &Invalid);
964     if (Invalid)
965       return {};
966 
967     if (EndEntry.getExpansion().isMacroArgExpansion() &&
968         BeginEntry.getExpansion().getExpansionLocStart() ==
969             EndEntry.getExpansion().getExpansionLocStart()) {
970       Range.setBegin(SM.getImmediateSpellingLoc(Begin));
971       Range.setEnd(SM.getImmediateSpellingLoc(End));
972       return makeFileCharRange(Range, SM, LangOpts);
973     }
974   }
975 
976   return {};
977 }
978 
getSourceText(CharSourceRange Range,const SourceManager & SM,const LangOptions & LangOpts,bool * Invalid)979 StringRef Lexer::getSourceText(CharSourceRange Range,
980                                const SourceManager &SM,
981                                const LangOptions &LangOpts,
982                                bool *Invalid) {
983   Range = makeFileCharRange(Range, SM, LangOpts);
984   if (Range.isInvalid()) {
985     if (Invalid) *Invalid = true;
986     return {};
987   }
988 
989   // Break down the source location.
990   std::pair<FileID, unsigned> beginInfo = SM.getDecomposedLoc(Range.getBegin());
991   if (beginInfo.first.isInvalid()) {
992     if (Invalid) *Invalid = true;
993     return {};
994   }
995 
996   unsigned EndOffs;
997   if (!SM.isInFileID(Range.getEnd(), beginInfo.first, &EndOffs) ||
998       beginInfo.second > EndOffs) {
999     if (Invalid) *Invalid = true;
1000     return {};
1001   }
1002 
1003   // Try to the load the file buffer.
1004   bool invalidTemp = false;
1005   StringRef file = SM.getBufferData(beginInfo.first, &invalidTemp);
1006   if (invalidTemp) {
1007     if (Invalid) *Invalid = true;
1008     return {};
1009   }
1010 
1011   if (Invalid) *Invalid = false;
1012   return file.substr(beginInfo.second, EndOffs - beginInfo.second);
1013 }
1014 
getImmediateMacroName(SourceLocation Loc,const SourceManager & SM,const LangOptions & LangOpts)1015 StringRef Lexer::getImmediateMacroName(SourceLocation Loc,
1016                                        const SourceManager &SM,
1017                                        const LangOptions &LangOpts) {
1018   assert(Loc.isMacroID() && "Only reasonable to call this on macros");
1019 
1020   // Find the location of the immediate macro expansion.
1021   while (true) {
1022     FileID FID = SM.getFileID(Loc);
1023     const SrcMgr::SLocEntry *E = &SM.getSLocEntry(FID);
1024     const SrcMgr::ExpansionInfo &Expansion = E->getExpansion();
1025     Loc = Expansion.getExpansionLocStart();
1026     if (!Expansion.isMacroArgExpansion())
1027       break;
1028 
1029     // For macro arguments we need to check that the argument did not come
1030     // from an inner macro, e.g: "MAC1( MAC2(foo) )"
1031 
1032     // Loc points to the argument id of the macro definition, move to the
1033     // macro expansion.
1034     Loc = SM.getImmediateExpansionRange(Loc).getBegin();
1035     SourceLocation SpellLoc = Expansion.getSpellingLoc();
1036     if (SpellLoc.isFileID())
1037       break; // No inner macro.
1038 
1039     // If spelling location resides in the same FileID as macro expansion
1040     // location, it means there is no inner macro.
1041     FileID MacroFID = SM.getFileID(Loc);
1042     if (SM.isInFileID(SpellLoc, MacroFID))
1043       break;
1044 
1045     // Argument came from inner macro.
1046     Loc = SpellLoc;
1047   }
1048 
1049   // Find the spelling location of the start of the non-argument expansion
1050   // range. This is where the macro name was spelled in order to begin
1051   // expanding this macro.
1052   Loc = SM.getSpellingLoc(Loc);
1053 
1054   // Dig out the buffer where the macro name was spelled and the extents of the
1055   // name so that we can render it into the expansion note.
1056   std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
1057   unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
1058   StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
1059   return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
1060 }
1061 
getImmediateMacroNameForDiagnostics(SourceLocation Loc,const SourceManager & SM,const LangOptions & LangOpts)1062 StringRef Lexer::getImmediateMacroNameForDiagnostics(
1063     SourceLocation Loc, const SourceManager &SM, const LangOptions &LangOpts) {
1064   assert(Loc.isMacroID() && "Only reasonable to call this on macros");
1065   // Walk past macro argument expansions.
1066   while (SM.isMacroArgExpansion(Loc))
1067     Loc = SM.getImmediateExpansionRange(Loc).getBegin();
1068 
1069   // If the macro's spelling isn't FileID or from scratch space, then it's
1070   // actually a token paste or stringization (or similar) and not a macro at
1071   // all.
1072   SourceLocation SpellLoc = SM.getSpellingLoc(Loc);
1073   if (!SpellLoc.isFileID() || SM.isWrittenInScratchSpace(SpellLoc))
1074     return {};
1075 
1076   // Find the spelling location of the start of the non-argument expansion
1077   // range. This is where the macro name was spelled in order to begin
1078   // expanding this macro.
1079   Loc = SM.getSpellingLoc(SM.getImmediateExpansionRange(Loc).getBegin());
1080 
1081   // Dig out the buffer where the macro name was spelled and the extents of the
1082   // name so that we can render it into the expansion note.
1083   std::pair<FileID, unsigned> ExpansionInfo = SM.getDecomposedLoc(Loc);
1084   unsigned MacroTokenLength = Lexer::MeasureTokenLength(Loc, SM, LangOpts);
1085   StringRef ExpansionBuffer = SM.getBufferData(ExpansionInfo.first);
1086   return ExpansionBuffer.substr(ExpansionInfo.second, MacroTokenLength);
1087 }
1088 
isAsciiIdentifierContinueChar(char c,const LangOptions & LangOpts)1089 bool Lexer::isAsciiIdentifierContinueChar(char c, const LangOptions &LangOpts) {
1090   return isAsciiIdentifierContinue(c, LangOpts.DollarIdents);
1091 }
1092 
isNewLineEscaped(const char * BufferStart,const char * Str)1093 bool Lexer::isNewLineEscaped(const char *BufferStart, const char *Str) {
1094   assert(isVerticalWhitespace(Str[0]));
1095   if (Str - 1 < BufferStart)
1096     return false;
1097 
1098   if ((Str[0] == '\n' && Str[-1] == '\r') ||
1099       (Str[0] == '\r' && Str[-1] == '\n')) {
1100     if (Str - 2 < BufferStart)
1101       return false;
1102     --Str;
1103   }
1104   --Str;
1105 
1106   // Rewind to first non-space character:
1107   while (Str > BufferStart && isHorizontalWhitespace(*Str))
1108     --Str;
1109 
1110   return *Str == '\\';
1111 }
1112 
getIndentationForLine(SourceLocation Loc,const SourceManager & SM)1113 StringRef Lexer::getIndentationForLine(SourceLocation Loc,
1114                                        const SourceManager &SM) {
1115   if (Loc.isInvalid() || Loc.isMacroID())
1116     return {};
1117   std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
1118   if (LocInfo.first.isInvalid())
1119     return {};
1120   bool Invalid = false;
1121   StringRef Buffer = SM.getBufferData(LocInfo.first, &Invalid);
1122   if (Invalid)
1123     return {};
1124   const char *Line = findBeginningOfLine(Buffer, LocInfo.second);
1125   if (!Line)
1126     return {};
1127   StringRef Rest = Buffer.substr(Line - Buffer.data());
1128   size_t NumWhitespaceChars = Rest.find_first_not_of(" \t");
1129   return NumWhitespaceChars == StringRef::npos
1130              ? ""
1131              : Rest.take_front(NumWhitespaceChars);
1132 }
1133 
1134 //===----------------------------------------------------------------------===//
1135 // Diagnostics forwarding code.
1136 //===----------------------------------------------------------------------===//
1137 
1138 /// GetMappedTokenLoc - If lexing out of a 'mapped buffer', where we pretend the
1139 /// lexer buffer was all expanded at a single point, perform the mapping.
1140 /// This is currently only used for _Pragma implementation, so it is the slow
1141 /// path of the hot getSourceLocation method.  Do not allow it to be inlined.
1142 static LLVM_ATTRIBUTE_NOINLINE SourceLocation GetMappedTokenLoc(
1143     Preprocessor &PP, SourceLocation FileLoc, unsigned CharNo, unsigned TokLen);
GetMappedTokenLoc(Preprocessor & PP,SourceLocation FileLoc,unsigned CharNo,unsigned TokLen)1144 static SourceLocation GetMappedTokenLoc(Preprocessor &PP,
1145                                         SourceLocation FileLoc,
1146                                         unsigned CharNo, unsigned TokLen) {
1147   assert(FileLoc.isMacroID() && "Must be a macro expansion");
1148 
1149   // Otherwise, we're lexing "mapped tokens".  This is used for things like
1150   // _Pragma handling.  Combine the expansion location of FileLoc with the
1151   // spelling location.
1152   SourceManager &SM = PP.getSourceManager();
1153 
1154   // Create a new SLoc which is expanded from Expansion(FileLoc) but whose
1155   // characters come from spelling(FileLoc)+Offset.
1156   SourceLocation SpellingLoc = SM.getSpellingLoc(FileLoc);
1157   SpellingLoc = SpellingLoc.getLocWithOffset(CharNo);
1158 
1159   // Figure out the expansion loc range, which is the range covered by the
1160   // original _Pragma(...) sequence.
1161   CharSourceRange II = SM.getImmediateExpansionRange(FileLoc);
1162 
1163   return SM.createExpansionLoc(SpellingLoc, II.getBegin(), II.getEnd(), TokLen);
1164 }
1165 
1166 /// getSourceLocation - Return a source location identifier for the specified
1167 /// offset in the current file.
getSourceLocation(const char * Loc,unsigned TokLen) const1168 SourceLocation Lexer::getSourceLocation(const char *Loc,
1169                                         unsigned TokLen) const {
1170   assert(Loc >= BufferStart && Loc <= BufferEnd &&
1171          "Location out of range for this buffer!");
1172 
1173   // In the normal case, we're just lexing from a simple file buffer, return
1174   // the file id from FileLoc with the offset specified.
1175   unsigned CharNo = Loc-BufferStart;
1176   if (FileLoc.isFileID())
1177     return FileLoc.getLocWithOffset(CharNo);
1178 
1179   // Otherwise, this is the _Pragma lexer case, which pretends that all of the
1180   // tokens are lexed from where the _Pragma was defined.
1181   assert(PP && "This doesn't work on raw lexers");
1182   return GetMappedTokenLoc(*PP, FileLoc, CharNo, TokLen);
1183 }
1184 
1185 /// Diag - Forwarding function for diagnostics.  This translate a source
1186 /// position in the current buffer into a SourceLocation object for rendering.
Diag(const char * Loc,unsigned DiagID) const1187 DiagnosticBuilder Lexer::Diag(const char *Loc, unsigned DiagID) const {
1188   return PP->Diag(getSourceLocation(Loc), DiagID);
1189 }
1190 
1191 //===----------------------------------------------------------------------===//
1192 // Trigraph and Escaped Newline Handling Code.
1193 //===----------------------------------------------------------------------===//
1194 
/// GetTrigraphCharForLetter - Maps the third character of a trigraph (the
/// one following the "??" pair) to the character the trigraph stands for.
/// Returns '\0' if \p Letter does not complete a trigraph.
static char GetTrigraphCharForLetter(char Letter) {
  switch (Letter) {
  // Brackets and braces.
  case '(':  return '[';
  case ')':  return ']';
  case '<':  return '{';
  case '>':  return '}';
  // Remaining punctuation.
  case '=':  return '#';
  case '/':  return '\\';
  case '\'': return '^';
  case '!':  return '|';
  case '-':  return '~';
  default:   return '\0';
  }
}
1211 
1212 /// DecodeTrigraphChar - If the specified character is a legal trigraph when
1213 /// prefixed with ??, emit a trigraph warning.  If trigraphs are enabled,
1214 /// return the result character.  Finally, emit a warning about trigraph use
1215 /// whether trigraphs are enabled or not.
DecodeTrigraphChar(const char * CP,Lexer * L,bool Trigraphs)1216 static char DecodeTrigraphChar(const char *CP, Lexer *L, bool Trigraphs) {
1217   char Res = GetTrigraphCharForLetter(*CP);
1218   if (!Res)
1219     return Res;
1220 
1221   if (!Trigraphs) {
1222     if (L && !L->isLexingRawMode())
1223       L->Diag(CP-2, diag::trigraph_ignored);
1224     return 0;
1225   }
1226 
1227   if (L && !L->isLexingRawMode())
1228     L->Diag(CP-2, diag::trigraph_converted) << StringRef(&Res, 1);
1229   return Res;
1230 }
1231 
1232 /// getEscapedNewLineSize - Return the size of the specified escaped newline,
1233 /// or 0 if it is not an escaped newline. P[-1] is known to be a "\" or a
1234 /// trigraph equivalent on entry to this function.
getEscapedNewLineSize(const char * Ptr)1235 unsigned Lexer::getEscapedNewLineSize(const char *Ptr) {
1236   unsigned Size = 0;
1237   while (isWhitespace(Ptr[Size])) {
1238     ++Size;
1239 
1240     if (Ptr[Size-1] != '\n' && Ptr[Size-1] != '\r')
1241       continue;
1242 
1243     // If this is a \r\n or \n\r, skip the other half.
1244     if ((Ptr[Size] == '\r' || Ptr[Size] == '\n') &&
1245         Ptr[Size-1] != Ptr[Size])
1246       ++Size;
1247 
1248     return Size;
1249   }
1250 
1251   // Not an escaped newline, must be a \t or something else.
1252   return 0;
1253 }
1254 
1255 /// SkipEscapedNewLines - If P points to an escaped newline (or a series of
1256 /// them), skip over them and return the first non-escaped-newline found,
1257 /// otherwise return P.
SkipEscapedNewLines(const char * P)1258 const char *Lexer::SkipEscapedNewLines(const char *P) {
1259   while (true) {
1260     const char *AfterEscape;
1261     if (*P == '\\') {
1262       AfterEscape = P+1;
1263     } else if (*P == '?') {
1264       // If not a trigraph for escape, bail out.
1265       if (P[1] != '?' || P[2] != '/')
1266         return P;
1267       // FIXME: Take LangOpts into account; the language might not
1268       // support trigraphs.
1269       AfterEscape = P+3;
1270     } else {
1271       return P;
1272     }
1273 
1274     unsigned NewLineSize = Lexer::getEscapedNewLineSize(AfterEscape);
1275     if (NewLineSize == 0) return P;
1276     P = AfterEscape+NewLineSize;
1277   }
1278 }
1279 
findNextToken(SourceLocation Loc,const SourceManager & SM,const LangOptions & LangOpts)1280 std::optional<Token> Lexer::findNextToken(SourceLocation Loc,
1281                                           const SourceManager &SM,
1282                                           const LangOptions &LangOpts) {
1283   if (Loc.isMacroID()) {
1284     if (!Lexer::isAtEndOfMacroExpansion(Loc, SM, LangOpts, &Loc))
1285       return std::nullopt;
1286   }
1287   Loc = Lexer::getLocForEndOfToken(Loc, 0, SM, LangOpts);
1288 
1289   // Break down the source location.
1290   std::pair<FileID, unsigned> LocInfo = SM.getDecomposedLoc(Loc);
1291 
1292   // Try to load the file buffer.
1293   bool InvalidTemp = false;
1294   StringRef File = SM.getBufferData(LocInfo.first, &InvalidTemp);
1295   if (InvalidTemp)
1296     return std::nullopt;
1297 
1298   const char *TokenBegin = File.data() + LocInfo.second;
1299 
1300   // Lex from the start of the given location.
1301   Lexer lexer(SM.getLocForStartOfFile(LocInfo.first), LangOpts, File.begin(),
1302                                       TokenBegin, File.end());
1303   // Find the token.
1304   Token Tok;
1305   lexer.LexFromRawLexer(Tok);
1306   return Tok;
1307 }
1308 
1309 /// Checks that the given token is the first token that occurs after the
1310 /// given location (this excludes comments and whitespace). Returns the location
1311 /// immediately after the specified token. If the token is not found or the
1312 /// location is inside a macro, the returned source location will be invalid.
findLocationAfterToken(SourceLocation Loc,tok::TokenKind TKind,const SourceManager & SM,const LangOptions & LangOpts,bool SkipTrailingWhitespaceAndNewLine)1313 SourceLocation Lexer::findLocationAfterToken(
1314     SourceLocation Loc, tok::TokenKind TKind, const SourceManager &SM,
1315     const LangOptions &LangOpts, bool SkipTrailingWhitespaceAndNewLine) {
1316   std::optional<Token> Tok = findNextToken(Loc, SM, LangOpts);
1317   if (!Tok || Tok->isNot(TKind))
1318     return {};
1319   SourceLocation TokenLoc = Tok->getLocation();
1320 
1321   // Calculate how much whitespace needs to be skipped if any.
1322   unsigned NumWhitespaceChars = 0;
1323   if (SkipTrailingWhitespaceAndNewLine) {
1324     const char *TokenEnd = SM.getCharacterData(TokenLoc) + Tok->getLength();
1325     unsigned char C = *TokenEnd;
1326     while (isHorizontalWhitespace(C)) {
1327       C = *(++TokenEnd);
1328       NumWhitespaceChars++;
1329     }
1330 
1331     // Skip \r, \n, \r\n, or \n\r
1332     if (C == '\n' || C == '\r') {
1333       char PrevC = C;
1334       C = *(++TokenEnd);
1335       NumWhitespaceChars++;
1336       if ((C == '\n' || C == '\r') && C != PrevC)
1337         NumWhitespaceChars++;
1338     }
1339   }
1340 
1341   return TokenLoc.getLocWithOffset(Tok->getLength() + NumWhitespaceChars);
1342 }
1343 
1344 /// getCharAndSizeSlow - Peek a single 'character' from the specified buffer,
1345 /// get its size, and return it.  This is tricky in several cases:
1346 ///   1. If currently at the start of a trigraph, we warn about the trigraph,
1347 ///      then either return the trigraph (skipping 3 chars) or the '?',
1348 ///      depending on whether trigraphs are enabled or not.
1349 ///   2. If this is an escaped newline (potentially with whitespace between
1350 ///      the backslash and newline), implicitly skip the newline and return
1351 ///      the char after it.
1352 ///
1353 /// This handles the slow/uncommon case of the getCharAndSize method.  Here we
1354 /// know that we can accumulate into Size, and that we have already incremented
1355 /// Ptr by Size bytes.
1356 ///
1357 /// NOTE: When this method is updated, getCharAndSizeSlowNoWarn (below) should
1358 /// be updated to match.
getCharAndSizeSlow(const char * Ptr,Token * Tok)1359 Lexer::SizedChar Lexer::getCharAndSizeSlow(const char *Ptr, Token *Tok) {
1360   unsigned Size = 0;
1361   // If we have a slash, look for an escaped newline.
1362   if (Ptr[0] == '\\') {
1363     ++Size;
1364     ++Ptr;
1365 Slash:
1366     // Common case, backslash-char where the char is not whitespace.
1367     if (!isWhitespace(Ptr[0]))
1368       return {'\\', Size};
1369 
1370     // See if we have optional whitespace characters between the slash and
1371     // newline.
1372     if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
1373       // Remember that this token needs to be cleaned.
1374       if (Tok) Tok->setFlag(Token::NeedsCleaning);
1375 
1376       // Warn if there was whitespace between the backslash and newline.
1377       if (Ptr[0] != '\n' && Ptr[0] != '\r' && Tok && !isLexingRawMode())
1378         Diag(Ptr, diag::backslash_newline_space);
1379 
1380       // Found backslash<whitespace><newline>.  Parse the char after it.
1381       Size += EscapedNewLineSize;
1382       Ptr  += EscapedNewLineSize;
1383 
1384       // Use slow version to accumulate a correct size field.
1385       auto CharAndSize = getCharAndSizeSlow(Ptr, Tok);
1386       CharAndSize.Size += Size;
1387       return CharAndSize;
1388     }
1389 
1390     // Otherwise, this is not an escaped newline, just return the slash.
1391     return {'\\', Size};
1392   }
1393 
1394   // If this is a trigraph, process it.
1395   if (Ptr[0] == '?' && Ptr[1] == '?') {
1396     // If this is actually a legal trigraph (not something like "??x"), emit
1397     // a trigraph warning.  If so, and if trigraphs are enabled, return it.
1398     if (char C = DecodeTrigraphChar(Ptr + 2, Tok ? this : nullptr,
1399                                     LangOpts.Trigraphs)) {
1400       // Remember that this token needs to be cleaned.
1401       if (Tok) Tok->setFlag(Token::NeedsCleaning);
1402 
1403       Ptr += 3;
1404       Size += 3;
1405       if (C == '\\') goto Slash;
1406       return {C, Size};
1407     }
1408   }
1409 
1410   // If this is neither, return a single character.
1411   return {*Ptr, Size + 1u};
1412 }
1413 
1414 /// getCharAndSizeSlowNoWarn - Handle the slow/uncommon case of the
1415 /// getCharAndSizeNoWarn method.  Here we know that we can accumulate into Size,
1416 /// and that we have already incremented Ptr by Size bytes.
1417 ///
1418 /// NOTE: When this method is updated, getCharAndSizeSlow (above) should
1419 /// be updated to match.
getCharAndSizeSlowNoWarn(const char * Ptr,const LangOptions & LangOpts)1420 Lexer::SizedChar Lexer::getCharAndSizeSlowNoWarn(const char *Ptr,
1421                                                  const LangOptions &LangOpts) {
1422 
1423   unsigned Size = 0;
1424   // If we have a slash, look for an escaped newline.
1425   if (Ptr[0] == '\\') {
1426     ++Size;
1427     ++Ptr;
1428 Slash:
1429     // Common case, backslash-char where the char is not whitespace.
1430     if (!isWhitespace(Ptr[0]))
1431       return {'\\', Size};
1432 
1433     // See if we have optional whitespace characters followed by a newline.
1434     if (unsigned EscapedNewLineSize = getEscapedNewLineSize(Ptr)) {
1435       // Found backslash<whitespace><newline>.  Parse the char after it.
1436       Size += EscapedNewLineSize;
1437       Ptr  += EscapedNewLineSize;
1438 
1439       // Use slow version to accumulate a correct size field.
1440       auto CharAndSize = getCharAndSizeSlowNoWarn(Ptr, LangOpts);
1441       CharAndSize.Size += Size;
1442       return CharAndSize;
1443     }
1444 
1445     // Otherwise, this is not an escaped newline, just return the slash.
1446     return {'\\', Size};
1447   }
1448 
1449   // If this is a trigraph, process it.
1450   if (LangOpts.Trigraphs && Ptr[0] == '?' && Ptr[1] == '?') {
1451     // If this is actually a legal trigraph (not something like "??x"), return
1452     // it.
1453     if (char C = GetTrigraphCharForLetter(Ptr[2])) {
1454       Ptr += 3;
1455       Size += 3;
1456       if (C == '\\') goto Slash;
1457       return {C, Size};
1458     }
1459   }
1460 
1461   // If this is neither, return a single character.
1462   return {*Ptr, Size + 1u};
1463 }
1464 
1465 //===----------------------------------------------------------------------===//
1466 // Helper methods for lexing.
1467 //===----------------------------------------------------------------------===//
1468 
1469 /// Routine that indiscriminately sets the offset into the source file.
SetByteOffset(unsigned Offset,bool StartOfLine)1470 void Lexer::SetByteOffset(unsigned Offset, bool StartOfLine) {
1471   BufferPtr = BufferStart + Offset;
1472   if (BufferPtr > BufferEnd)
1473     BufferPtr = BufferEnd;
1474   // FIXME: What exactly does the StartOfLine bit mean?  There are two
1475   // possible meanings for the "start" of the line: the first token on the
1476   // unexpanded line, or the first token on the expanded line.
1477   IsAtStartOfLine = StartOfLine;
1478   IsAtPhysicalStartOfLine = StartOfLine;
1479 }
1480 
isUnicodeWhitespace(uint32_t Codepoint)1481 static bool isUnicodeWhitespace(uint32_t Codepoint) {
1482   static const llvm::sys::UnicodeCharSet UnicodeWhitespaceChars(
1483       UnicodeWhitespaceCharRanges);
1484   return UnicodeWhitespaceChars.contains(Codepoint);
1485 }
1486 
codepointAsHexString(uint32_t C)1487 static llvm::SmallString<5> codepointAsHexString(uint32_t C) {
1488   llvm::SmallString<5> CharBuf;
1489   llvm::raw_svector_ostream CharOS(CharBuf);
1490   llvm::write_hex(CharOS, C, llvm::HexPrintStyle::Upper, 4);
1491   return CharBuf;
1492 }
1493 
1494 // To mitigate https://github.com/llvm/llvm-project/issues/54732,
1495 // we allow "Mathematical Notation Characters" in identifiers.
1496 // This is a proposed profile that extends the XID_Start/XID_continue
1497 // with mathematical symbols, superscipts and subscripts digits
1498 // found in some production software.
1499 // https://www.unicode.org/L2/L2022/22230-math-profile.pdf
isMathematicalExtensionID(uint32_t C,const LangOptions & LangOpts,bool IsStart,bool & IsExtension)1500 static bool isMathematicalExtensionID(uint32_t C, const LangOptions &LangOpts,
1501                                       bool IsStart, bool &IsExtension) {
1502   static const llvm::sys::UnicodeCharSet MathStartChars(
1503       MathematicalNotationProfileIDStartRanges);
1504   static const llvm::sys::UnicodeCharSet MathContinueChars(
1505       MathematicalNotationProfileIDContinueRanges);
1506   if (MathStartChars.contains(C) ||
1507       (!IsStart && MathContinueChars.contains(C))) {
1508     IsExtension = true;
1509     return true;
1510   }
1511   return false;
1512 }
1513 
isAllowedIDChar(uint32_t C,const LangOptions & LangOpts,bool & IsExtension)1514 static bool isAllowedIDChar(uint32_t C, const LangOptions &LangOpts,
1515                             bool &IsExtension) {
1516   if (LangOpts.AsmPreprocessor) {
1517     return false;
1518   } else if (LangOpts.DollarIdents && '$' == C) {
1519     return true;
1520   } else if (LangOpts.CPlusPlus || LangOpts.C23) {
1521     // A non-leading codepoint must have the XID_Continue property.
1522     // XIDContinueRanges doesn't contains characters also in XIDStartRanges,
1523     // so we need to check both tables.
1524     // '_' doesn't have the XID_Continue property but is allowed in C and C++.
1525     static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);
1526     static const llvm::sys::UnicodeCharSet XIDContinueChars(XIDContinueRanges);
1527     if (C == '_' || XIDStartChars.contains(C) || XIDContinueChars.contains(C))
1528       return true;
1529     return isMathematicalExtensionID(C, LangOpts, /*IsStart=*/false,
1530                                      IsExtension);
1531   } else if (LangOpts.C11) {
1532     static const llvm::sys::UnicodeCharSet C11AllowedIDChars(
1533         C11AllowedIDCharRanges);
1534     return C11AllowedIDChars.contains(C);
1535   } else {
1536     static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
1537         C99AllowedIDCharRanges);
1538     return C99AllowedIDChars.contains(C);
1539   }
1540 }
1541 
isAllowedInitiallyIDChar(uint32_t C,const LangOptions & LangOpts,bool & IsExtension)1542 static bool isAllowedInitiallyIDChar(uint32_t C, const LangOptions &LangOpts,
1543                                      bool &IsExtension) {
1544   assert(C > 0x7F && "isAllowedInitiallyIDChar called with an ASCII codepoint");
1545   IsExtension = false;
1546   if (LangOpts.AsmPreprocessor) {
1547     return false;
1548   }
1549   if (LangOpts.CPlusPlus || LangOpts.C23) {
1550     static const llvm::sys::UnicodeCharSet XIDStartChars(XIDStartRanges);
1551     if (XIDStartChars.contains(C))
1552       return true;
1553     return isMathematicalExtensionID(C, LangOpts, /*IsStart=*/true,
1554                                      IsExtension);
1555   }
1556   if (!isAllowedIDChar(C, LangOpts, IsExtension))
1557     return false;
1558   if (LangOpts.C11) {
1559     static const llvm::sys::UnicodeCharSet C11DisallowedInitialIDChars(
1560         C11DisallowedInitialIDCharRanges);
1561     return !C11DisallowedInitialIDChars.contains(C);
1562   }
1563   static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
1564       C99DisallowedInitialIDCharRanges);
1565   return !C99DisallowedInitialIDChars.contains(C);
1566 }
1567 
diagnoseExtensionInIdentifier(DiagnosticsEngine & Diags,uint32_t C,CharSourceRange Range)1568 static void diagnoseExtensionInIdentifier(DiagnosticsEngine &Diags, uint32_t C,
1569                                           CharSourceRange Range) {
1570 
1571   static const llvm::sys::UnicodeCharSet MathStartChars(
1572       MathematicalNotationProfileIDStartRanges);
1573   static const llvm::sys::UnicodeCharSet MathContinueChars(
1574       MathematicalNotationProfileIDContinueRanges);
1575 
1576   (void)MathStartChars;
1577   (void)MathContinueChars;
1578   assert((MathStartChars.contains(C) || MathContinueChars.contains(C)) &&
1579          "Unexpected mathematical notation codepoint");
1580   Diags.Report(Range.getBegin(), diag::ext_mathematical_notation)
1581       << codepointAsHexString(C) << Range;
1582 }
1583 
makeCharRange(Lexer & L,const char * Begin,const char * End)1584 static inline CharSourceRange makeCharRange(Lexer &L, const char *Begin,
1585                                             const char *End) {
1586   return CharSourceRange::getCharRange(L.getSourceLocation(Begin),
1587                                        L.getSourceLocation(End));
1588 }
1589 
maybeDiagnoseIDCharCompat(DiagnosticsEngine & Diags,uint32_t C,CharSourceRange Range,bool IsFirst)1590 static void maybeDiagnoseIDCharCompat(DiagnosticsEngine &Diags, uint32_t C,
1591                                       CharSourceRange Range, bool IsFirst) {
1592   // Check C99 compatibility.
1593   if (!Diags.isIgnored(diag::warn_c99_compat_unicode_id, Range.getBegin())) {
1594     enum {
1595       CannotAppearInIdentifier = 0,
1596       CannotStartIdentifier
1597     };
1598 
1599     static const llvm::sys::UnicodeCharSet C99AllowedIDChars(
1600         C99AllowedIDCharRanges);
1601     static const llvm::sys::UnicodeCharSet C99DisallowedInitialIDChars(
1602         C99DisallowedInitialIDCharRanges);
1603     if (!C99AllowedIDChars.contains(C)) {
1604       Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
1605         << Range
1606         << CannotAppearInIdentifier;
1607     } else if (IsFirst && C99DisallowedInitialIDChars.contains(C)) {
1608       Diags.Report(Range.getBegin(), diag::warn_c99_compat_unicode_id)
1609         << Range
1610         << CannotStartIdentifier;
1611     }
1612   }
1613 }
1614 
1615 /// After encountering UTF-8 character C and interpreting it as an identifier
1616 /// character, check whether it's a homoglyph for a common non-identifier
1617 /// source character that is unlikely to be an intentional identifier
1618 /// character and warn if so.
maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine & Diags,uint32_t C,CharSourceRange Range)1619 static void maybeDiagnoseUTF8Homoglyph(DiagnosticsEngine &Diags, uint32_t C,
1620                                        CharSourceRange Range) {
1621   // FIXME: Handle Unicode quotation marks (smart quotes, fullwidth quotes).
1622   struct HomoglyphPair {
1623     uint32_t Character;
1624     char LooksLike;
1625     bool operator<(HomoglyphPair R) const { return Character < R.Character; }
1626   };
1627   static constexpr HomoglyphPair SortedHomoglyphs[] = {
1628     {U'\u00ad', 0},   // SOFT HYPHEN
1629     {U'\u01c3', '!'}, // LATIN LETTER RETROFLEX CLICK
1630     {U'\u037e', ';'}, // GREEK QUESTION MARK
1631     {U'\u200b', 0},   // ZERO WIDTH SPACE
1632     {U'\u200c', 0},   // ZERO WIDTH NON-JOINER
1633     {U'\u200d', 0},   // ZERO WIDTH JOINER
1634     {U'\u2060', 0},   // WORD JOINER
1635     {U'\u2061', 0},   // FUNCTION APPLICATION
1636     {U'\u2062', 0},   // INVISIBLE TIMES
1637     {U'\u2063', 0},   // INVISIBLE SEPARATOR
1638     {U'\u2064', 0},   // INVISIBLE PLUS
1639     {U'\u2212', '-'}, // MINUS SIGN
1640     {U'\u2215', '/'}, // DIVISION SLASH
1641     {U'\u2216', '\\'}, // SET MINUS
1642     {U'\u2217', '*'}, // ASTERISK OPERATOR
1643     {U'\u2223', '|'}, // DIVIDES
1644     {U'\u2227', '^'}, // LOGICAL AND
1645     {U'\u2236', ':'}, // RATIO
1646     {U'\u223c', '~'}, // TILDE OPERATOR
1647     {U'\ua789', ':'}, // MODIFIER LETTER COLON
1648     {U'\ufeff', 0},   // ZERO WIDTH NO-BREAK SPACE
1649     {U'\uff01', '!'}, // FULLWIDTH EXCLAMATION MARK
1650     {U'\uff03', '#'}, // FULLWIDTH NUMBER SIGN
1651     {U'\uff04', '$'}, // FULLWIDTH DOLLAR SIGN
1652     {U'\uff05', '%'}, // FULLWIDTH PERCENT SIGN
1653     {U'\uff06', '&'}, // FULLWIDTH AMPERSAND
1654     {U'\uff08', '('}, // FULLWIDTH LEFT PARENTHESIS
1655     {U'\uff09', ')'}, // FULLWIDTH RIGHT PARENTHESIS
1656     {U'\uff0a', '*'}, // FULLWIDTH ASTERISK
1657     {U'\uff0b', '+'}, // FULLWIDTH ASTERISK
1658     {U'\uff0c', ','}, // FULLWIDTH COMMA
1659     {U'\uff0d', '-'}, // FULLWIDTH HYPHEN-MINUS
1660     {U'\uff0e', '.'}, // FULLWIDTH FULL STOP
1661     {U'\uff0f', '/'}, // FULLWIDTH SOLIDUS
1662     {U'\uff1a', ':'}, // FULLWIDTH COLON
1663     {U'\uff1b', ';'}, // FULLWIDTH SEMICOLON
1664     {U'\uff1c', '<'}, // FULLWIDTH LESS-THAN SIGN
1665     {U'\uff1d', '='}, // FULLWIDTH EQUALS SIGN
1666     {U'\uff1e', '>'}, // FULLWIDTH GREATER-THAN SIGN
1667     {U'\uff1f', '?'}, // FULLWIDTH QUESTION MARK
1668     {U'\uff20', '@'}, // FULLWIDTH COMMERCIAL AT
1669     {U'\uff3b', '['}, // FULLWIDTH LEFT SQUARE BRACKET
1670     {U'\uff3c', '\\'}, // FULLWIDTH REVERSE SOLIDUS
1671     {U'\uff3d', ']'}, // FULLWIDTH RIGHT SQUARE BRACKET
1672     {U'\uff3e', '^'}, // FULLWIDTH CIRCUMFLEX ACCENT
1673     {U'\uff5b', '{'}, // FULLWIDTH LEFT CURLY BRACKET
1674     {U'\uff5c', '|'}, // FULLWIDTH VERTICAL LINE
1675     {U'\uff5d', '}'}, // FULLWIDTH RIGHT CURLY BRACKET
1676     {U'\uff5e', '~'}, // FULLWIDTH TILDE
1677     {0, 0}
1678   };
1679   auto Homoglyph =
1680       std::lower_bound(std::begin(SortedHomoglyphs),
1681                        std::end(SortedHomoglyphs) - 1, HomoglyphPair{C, '\0'});
1682   if (Homoglyph->Character == C) {
1683     if (Homoglyph->LooksLike) {
1684       const char LooksLikeStr[] = {Homoglyph->LooksLike, 0};
1685       Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_homoglyph)
1686           << Range << codepointAsHexString(C) << LooksLikeStr;
1687     } else {
1688       Diags.Report(Range.getBegin(), diag::warn_utf8_symbol_zero_width)
1689           << Range << codepointAsHexString(C);
1690     }
1691   }
1692 }
1693 
diagnoseInvalidUnicodeCodepointInIdentifier(DiagnosticsEngine & Diags,const LangOptions & LangOpts,uint32_t CodePoint,CharSourceRange Range,bool IsFirst)1694 static void diagnoseInvalidUnicodeCodepointInIdentifier(
1695     DiagnosticsEngine &Diags, const LangOptions &LangOpts, uint32_t CodePoint,
1696     CharSourceRange Range, bool IsFirst) {
1697   if (isASCII(CodePoint))
1698     return;
1699 
1700   bool IsExtension;
1701   bool IsIDStart = isAllowedInitiallyIDChar(CodePoint, LangOpts, IsExtension);
1702   bool IsIDContinue =
1703       IsIDStart || isAllowedIDChar(CodePoint, LangOpts, IsExtension);
1704 
1705   if ((IsFirst && IsIDStart) || (!IsFirst && IsIDContinue))
1706     return;
1707 
1708   bool InvalidOnlyAtStart = IsFirst && !IsIDStart && IsIDContinue;
1709 
1710   if (!IsFirst || InvalidOnlyAtStart) {
1711     Diags.Report(Range.getBegin(), diag::err_character_not_allowed_identifier)
1712         << Range << codepointAsHexString(CodePoint) << int(InvalidOnlyAtStart)
1713         << FixItHint::CreateRemoval(Range);
1714   } else {
1715     Diags.Report(Range.getBegin(), diag::err_character_not_allowed)
1716         << Range << codepointAsHexString(CodePoint)
1717         << FixItHint::CreateRemoval(Range);
1718   }
1719 }
1720 
tryConsumeIdentifierUCN(const char * & CurPtr,unsigned Size,Token & Result)1721 bool Lexer::tryConsumeIdentifierUCN(const char *&CurPtr, unsigned Size,
1722                                     Token &Result) {
1723   const char *UCNPtr = CurPtr + Size;
1724   uint32_t CodePoint = tryReadUCN(UCNPtr, CurPtr, /*Token=*/nullptr);
1725   if (CodePoint == 0) {
1726     return false;
1727   }
1728   bool IsExtension = false;
1729   if (!isAllowedIDChar(CodePoint, LangOpts, IsExtension)) {
1730     if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint))
1731       return false;
1732     if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
1733         !PP->isPreprocessedOutput())
1734       diagnoseInvalidUnicodeCodepointInIdentifier(
1735           PP->getDiagnostics(), LangOpts, CodePoint,
1736           makeCharRange(*this, CurPtr, UCNPtr),
1737           /*IsFirst=*/false);
1738 
1739     // We got a unicode codepoint that is neither a space nor a
1740     // a valid identifier part.
1741     // Carry on as if the codepoint was valid for recovery purposes.
1742   } else if (!isLexingRawMode()) {
1743     if (IsExtension)
1744       diagnoseExtensionInIdentifier(PP->getDiagnostics(), CodePoint,
1745                                     makeCharRange(*this, CurPtr, UCNPtr));
1746 
1747     maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
1748                               makeCharRange(*this, CurPtr, UCNPtr),
1749                               /*IsFirst=*/false);
1750   }
1751 
1752   Result.setFlag(Token::HasUCN);
1753   if ((UCNPtr - CurPtr ==  6 && CurPtr[1] == 'u') ||
1754       (UCNPtr - CurPtr == 10 && CurPtr[1] == 'U'))
1755     CurPtr = UCNPtr;
1756   else
1757     while (CurPtr != UCNPtr)
1758       (void)getAndAdvanceChar(CurPtr, Result);
1759   return true;
1760 }
1761 
tryConsumeIdentifierUTF8Char(const char * & CurPtr,Token & Result)1762 bool Lexer::tryConsumeIdentifierUTF8Char(const char *&CurPtr, Token &Result) {
1763   llvm::UTF32 CodePoint;
1764 
1765   // If a UTF-8 codepoint appears immediately after an escaped new line,
1766   // CurPtr may point to the splicing \ on the preceding line,
1767   // so we need to skip it.
1768   unsigned FirstCodeUnitSize;
1769   getCharAndSize(CurPtr, FirstCodeUnitSize);
1770   const char *CharStart = CurPtr + FirstCodeUnitSize - 1;
1771   const char *UnicodePtr = CharStart;
1772 
1773   llvm::ConversionResult ConvResult = llvm::convertUTF8Sequence(
1774       (const llvm::UTF8 **)&UnicodePtr, (const llvm::UTF8 *)BufferEnd,
1775       &CodePoint, llvm::strictConversion);
1776   if (ConvResult != llvm::conversionOK)
1777     return false;
1778 
1779   bool IsExtension = false;
1780   if (!isAllowedIDChar(static_cast<uint32_t>(CodePoint), LangOpts,
1781                        IsExtension)) {
1782     if (isASCII(CodePoint) || isUnicodeWhitespace(CodePoint))
1783       return false;
1784 
1785     if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
1786         !PP->isPreprocessedOutput())
1787       diagnoseInvalidUnicodeCodepointInIdentifier(
1788           PP->getDiagnostics(), LangOpts, CodePoint,
1789           makeCharRange(*this, CharStart, UnicodePtr), /*IsFirst=*/false);
1790     // We got a unicode codepoint that is neither a space nor a
1791     // a valid identifier part. Carry on as if the codepoint was
1792     // valid for recovery purposes.
1793   } else if (!isLexingRawMode()) {
1794     if (IsExtension)
1795       diagnoseExtensionInIdentifier(
1796           PP->getDiagnostics(), CodePoint,
1797           makeCharRange(*this, CharStart, UnicodePtr));
1798     maybeDiagnoseIDCharCompat(PP->getDiagnostics(), CodePoint,
1799                               makeCharRange(*this, CharStart, UnicodePtr),
1800                               /*IsFirst=*/false);
1801     maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), CodePoint,
1802                                makeCharRange(*this, CharStart, UnicodePtr));
1803   }
1804 
1805   // Once we sucessfully parsed some UTF-8,
1806   // calling ConsumeChar ensures the NeedsCleaning flag is set on the token
1807   // being lexed, and that warnings about trailing spaces are emitted.
1808   ConsumeChar(CurPtr, FirstCodeUnitSize, Result);
1809   CurPtr = UnicodePtr;
1810   return true;
1811 }
1812 
LexUnicodeIdentifierStart(Token & Result,uint32_t C,const char * CurPtr)1813 bool Lexer::LexUnicodeIdentifierStart(Token &Result, uint32_t C,
1814                                       const char *CurPtr) {
1815   bool IsExtension = false;
1816   if (isAllowedInitiallyIDChar(C, LangOpts, IsExtension)) {
1817     if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
1818         !PP->isPreprocessedOutput()) {
1819       if (IsExtension)
1820         diagnoseExtensionInIdentifier(PP->getDiagnostics(), C,
1821                                       makeCharRange(*this, BufferPtr, CurPtr));
1822       maybeDiagnoseIDCharCompat(PP->getDiagnostics(), C,
1823                                 makeCharRange(*this, BufferPtr, CurPtr),
1824                                 /*IsFirst=*/true);
1825       maybeDiagnoseUTF8Homoglyph(PP->getDiagnostics(), C,
1826                                  makeCharRange(*this, BufferPtr, CurPtr));
1827     }
1828 
1829     MIOpt.ReadToken();
1830     return LexIdentifierContinue(Result, CurPtr);
1831   }
1832 
1833   if (!isLexingRawMode() && !ParsingPreprocessorDirective &&
1834       !PP->isPreprocessedOutput() && !isASCII(*BufferPtr) &&
1835       !isUnicodeWhitespace(C)) {
1836     // Non-ASCII characters tend to creep into source code unintentionally.
1837     // Instead of letting the parser complain about the unknown token,
1838     // just drop the character.
1839     // Note that we can /only/ do this when the non-ASCII character is actually
1840     // spelled as Unicode, not written as a UCN. The standard requires that
1841     // we not throw away any possible preprocessor tokens, but there's a
1842     // loophole in the mapping of Unicode characters to basic character set
1843     // characters that allows us to map these particular characters to, say,
1844     // whitespace.
1845     diagnoseInvalidUnicodeCodepointInIdentifier(
1846         PP->getDiagnostics(), LangOpts, C,
1847         makeCharRange(*this, BufferPtr, CurPtr), /*IsStart*/ true);
1848     BufferPtr = CurPtr;
1849     return false;
1850   }
1851 
1852   // Otherwise, we have an explicit UCN or a character that's unlikely to show
1853   // up by accident.
1854   MIOpt.ReadToken();
1855   FormTokenWithChars(Result, CurPtr, tok::unknown);
1856   return true;
1857 }
1858 
1859 static const char *
fastParseASCIIIdentifier(const char * CurPtr,const char * BufferEnd)1860 fastParseASCIIIdentifier(const char *CurPtr,
1861                          [[maybe_unused]] const char *BufferEnd) {
1862 #ifdef __SSE4_2__
1863   alignas(16) static constexpr char AsciiIdentifierRange[16] = {
1864       '_', '_', 'A', 'Z', 'a', 'z', '0', '9',
1865   };
1866   constexpr ssize_t BytesPerRegister = 16;
1867 
1868   __m128i AsciiIdentifierRangeV =
1869       _mm_load_si128((const __m128i *)AsciiIdentifierRange);
1870 
1871   while (LLVM_LIKELY(BufferEnd - CurPtr >= BytesPerRegister)) {
1872     __m128i Cv = _mm_loadu_si128((const __m128i *)(CurPtr));
1873 
1874     int Consumed = _mm_cmpistri(AsciiIdentifierRangeV, Cv,
1875                                 _SIDD_LEAST_SIGNIFICANT | _SIDD_CMP_RANGES |
1876                                     _SIDD_UBYTE_OPS | _SIDD_NEGATIVE_POLARITY);
1877     CurPtr += Consumed;
1878     if (Consumed == BytesPerRegister)
1879       continue;
1880     return CurPtr;
1881   }
1882 #endif
1883 
1884   unsigned char C = *CurPtr;
1885   while (isAsciiIdentifierContinue(C))
1886     C = *++CurPtr;
1887   return CurPtr;
1888 }
1889 
LexIdentifierContinue(Token & Result,const char * CurPtr)1890 bool Lexer::LexIdentifierContinue(Token &Result, const char *CurPtr) {
1891   // Match [_A-Za-z0-9]*, we have already matched an identifier start.
1892 
1893   while (true) {
1894 
1895     CurPtr = fastParseASCIIIdentifier(CurPtr, BufferEnd);
1896 
1897     unsigned Size;
1898     // Slow path: handle trigraph, unicode codepoints, UCNs.
1899     unsigned char C = getCharAndSize(CurPtr, Size);
1900     if (isAsciiIdentifierContinue(C)) {
1901       CurPtr = ConsumeChar(CurPtr, Size, Result);
1902       continue;
1903     }
1904     if (C == '$') {
1905       // If we hit a $ and they are not supported in identifiers, we are done.
1906       if (!LangOpts.DollarIdents)
1907         break;
1908       // Otherwise, emit a diagnostic and continue.
1909       if (!isLexingRawMode())
1910         Diag(CurPtr, diag::ext_dollar_in_identifier);
1911       CurPtr = ConsumeChar(CurPtr, Size, Result);
1912       continue;
1913     }
1914     if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
1915       continue;
1916     if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
1917       continue;
1918     // Neither an expected Unicode codepoint nor a UCN.
1919     break;
1920   }
1921 
1922   const char *IdStart = BufferPtr;
1923   FormTokenWithChars(Result, CurPtr, tok::raw_identifier);
1924   Result.setRawIdentifierData(IdStart);
1925 
1926   // If we are in raw mode, return this identifier raw.  There is no need to
1927   // look up identifier information or attempt to macro expand it.
1928   if (LexingRawMode)
1929     return true;
1930 
1931   // Fill in Result.IdentifierInfo and update the token kind,
1932   // looking up the identifier in the identifier table.
1933   const IdentifierInfo *II = PP->LookUpIdentifierInfo(Result);
1934   // Note that we have to call PP->LookUpIdentifierInfo() even for code
1935   // completion, it writes IdentifierInfo into Result, and callers rely on it.
1936 
1937   // If the completion point is at the end of an identifier, we want to treat
1938   // the identifier as incomplete even if it resolves to a macro or a keyword.
1939   // This allows e.g. 'class^' to complete to 'classifier'.
1940   if (isCodeCompletionPoint(CurPtr)) {
1941     // Return the code-completion token.
1942     Result.setKind(tok::code_completion);
1943     // Skip the code-completion char and all immediate identifier characters.
1944     // This ensures we get consistent behavior when completing at any point in
1945     // an identifier (i.e. at the start, in the middle, at the end). Note that
1946     // only simple cases (i.e. [a-zA-Z0-9_]) are supported to keep the code
1947     // simpler.
1948     assert(*CurPtr == 0 && "Completion character must be 0");
1949     ++CurPtr;
1950     // Note that code completion token is not added as a separate character
1951     // when the completion point is at the end of the buffer. Therefore, we need
1952     // to check if the buffer has ended.
1953     if (CurPtr < BufferEnd) {
1954       while (isAsciiIdentifierContinue(*CurPtr))
1955         ++CurPtr;
1956     }
1957     BufferPtr = CurPtr;
1958     return true;
1959   }
1960 
1961   // Finally, now that we know we have an identifier, pass this off to the
1962   // preprocessor, which may macro expand it or something.
1963   if (II->isHandleIdentifierCase())
1964     return PP->HandleIdentifier(Result);
1965 
1966   return true;
1967 }
1968 
1969 /// isHexaLiteral - Return true if Start points to a hex constant.
1970 /// in microsoft mode (where this is supposed to be several different tokens).
isHexaLiteral(const char * Start,const LangOptions & LangOpts)1971 bool Lexer::isHexaLiteral(const char *Start, const LangOptions &LangOpts) {
1972   auto CharAndSize1 = Lexer::getCharAndSizeNoWarn(Start, LangOpts);
1973   char C1 = CharAndSize1.Char;
1974   if (C1 != '0')
1975     return false;
1976 
1977   auto CharAndSize2 =
1978       Lexer::getCharAndSizeNoWarn(Start + CharAndSize1.Size, LangOpts);
1979   char C2 = CharAndSize2.Char;
1980   return (C2 == 'x' || C2 == 'X');
1981 }
1982 
1983 /// LexNumericConstant - Lex the remainder of a integer or floating point
1984 /// constant. From[-1] is the first character lexed.  Return the end of the
1985 /// constant.
LexNumericConstant(Token & Result,const char * CurPtr)1986 bool Lexer::LexNumericConstant(Token &Result, const char *CurPtr) {
1987   unsigned Size;
1988   char C = getCharAndSize(CurPtr, Size);
1989   char PrevCh = 0;
1990   while (isPreprocessingNumberBody(C)) {
1991     CurPtr = ConsumeChar(CurPtr, Size, Result);
1992     PrevCh = C;
1993     if (LangOpts.HLSL && C == '.' && (*CurPtr == 'x' || *CurPtr == 'r')) {
1994       CurPtr -= Size;
1995       break;
1996     }
1997     C = getCharAndSize(CurPtr, Size);
1998   }
1999 
2000   // If we fell out, check for a sign, due to 1e+12.  If we have one, continue.
2001   if ((C == '-' || C == '+') && (PrevCh == 'E' || PrevCh == 'e')) {
2002     // If we are in Microsoft mode, don't continue if the constant is hex.
2003     // For example, MSVC will accept the following as 3 tokens: 0x1234567e+1
2004     if (!LangOpts.MicrosoftExt || !isHexaLiteral(BufferPtr, LangOpts))
2005       return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
2006   }
2007 
2008   // If we have a hex FP constant, continue.
2009   if ((C == '-' || C == '+') && (PrevCh == 'P' || PrevCh == 'p')) {
2010     // Outside C99 and C++17, we accept hexadecimal floating point numbers as a
2011     // not-quite-conforming extension. Only do so if this looks like it's
2012     // actually meant to be a hexfloat, and not if it has a ud-suffix.
2013     bool IsHexFloat = true;
2014     if (!LangOpts.C99) {
2015       if (!isHexaLiteral(BufferPtr, LangOpts))
2016         IsHexFloat = false;
2017       else if (!LangOpts.CPlusPlus17 &&
2018                std::find(BufferPtr, CurPtr, '_') != CurPtr)
2019         IsHexFloat = false;
2020     }
2021     if (IsHexFloat)
2022       return LexNumericConstant(Result, ConsumeChar(CurPtr, Size, Result));
2023   }
2024 
2025   // If we have a digit separator, continue.
2026   if (C == '\'' && (LangOpts.CPlusPlus14 || LangOpts.C23)) {
2027     auto [Next, NextSize] = getCharAndSizeNoWarn(CurPtr + Size, LangOpts);
2028     if (isAsciiIdentifierContinue(Next)) {
2029       if (!isLexingRawMode())
2030         Diag(CurPtr, LangOpts.CPlusPlus
2031                          ? diag::warn_cxx11_compat_digit_separator
2032                          : diag::warn_c23_compat_digit_separator);
2033       CurPtr = ConsumeChar(CurPtr, Size, Result);
2034       CurPtr = ConsumeChar(CurPtr, NextSize, Result);
2035       return LexNumericConstant(Result, CurPtr);
2036     }
2037   }
2038 
2039   // If we have a UCN or UTF-8 character (perhaps in a ud-suffix), continue.
2040   if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
2041     return LexNumericConstant(Result, CurPtr);
2042   if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
2043     return LexNumericConstant(Result, CurPtr);
2044 
2045   // Update the location of token as well as BufferPtr.
2046   const char *TokStart = BufferPtr;
2047   FormTokenWithChars(Result, CurPtr, tok::numeric_constant);
2048   Result.setLiteralData(TokStart);
2049   return true;
2050 }
2051 
2052 /// LexUDSuffix - Lex the ud-suffix production for user-defined literal suffixes
2053 /// in C++11, or warn on a ud-suffix in C++98.
LexUDSuffix(Token & Result,const char * CurPtr,bool IsStringLiteral)2054 const char *Lexer::LexUDSuffix(Token &Result, const char *CurPtr,
2055                                bool IsStringLiteral) {
2056   assert(LangOpts.CPlusPlus);
2057 
2058   // Maximally munch an identifier.
2059   unsigned Size;
2060   char C = getCharAndSize(CurPtr, Size);
2061   bool Consumed = false;
2062 
2063   if (!isAsciiIdentifierStart(C)) {
2064     if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result))
2065       Consumed = true;
2066     else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result))
2067       Consumed = true;
2068     else
2069       return CurPtr;
2070   }
2071 
2072   if (!LangOpts.CPlusPlus11) {
2073     if (!isLexingRawMode())
2074       Diag(CurPtr,
2075            C == '_' ? diag::warn_cxx11_compat_user_defined_literal
2076                     : diag::warn_cxx11_compat_reserved_user_defined_literal)
2077         << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
2078     return CurPtr;
2079   }
2080 
2081   // C++11 [lex.ext]p10, [usrlit.suffix]p1: A program containing a ud-suffix
2082   // that does not start with an underscore is ill-formed. As a conforming
2083   // extension, we treat all such suffixes as if they had whitespace before
2084   // them. We assume a suffix beginning with a UCN or UTF-8 character is more
2085   // likely to be a ud-suffix than a macro, however, and accept that.
2086   if (!Consumed) {
2087     bool IsUDSuffix = false;
2088     if (C == '_')
2089       IsUDSuffix = true;
2090     else if (IsStringLiteral && LangOpts.CPlusPlus14) {
2091       // In C++1y, we need to look ahead a few characters to see if this is a
2092       // valid suffix for a string literal or a numeric literal (this could be
2093       // the 'operator""if' defining a numeric literal operator).
2094       const unsigned MaxStandardSuffixLength = 3;
2095       char Buffer[MaxStandardSuffixLength] = { C };
2096       unsigned Consumed = Size;
2097       unsigned Chars = 1;
2098       while (true) {
2099         auto [Next, NextSize] =
2100             getCharAndSizeNoWarn(CurPtr + Consumed, LangOpts);
2101         if (!isAsciiIdentifierContinue(Next)) {
2102           // End of suffix. Check whether this is on the allowed list.
2103           const StringRef CompleteSuffix(Buffer, Chars);
2104           IsUDSuffix =
2105               StringLiteralParser::isValidUDSuffix(LangOpts, CompleteSuffix);
2106           break;
2107         }
2108 
2109         if (Chars == MaxStandardSuffixLength)
2110           // Too long: can't be a standard suffix.
2111           break;
2112 
2113         Buffer[Chars++] = Next;
2114         Consumed += NextSize;
2115       }
2116     }
2117 
2118     if (!IsUDSuffix) {
2119       if (!isLexingRawMode())
2120         Diag(CurPtr, LangOpts.MSVCCompat
2121                          ? diag::ext_ms_reserved_user_defined_literal
2122                          : diag::ext_reserved_user_defined_literal)
2123             << FixItHint::CreateInsertion(getSourceLocation(CurPtr), " ");
2124       return CurPtr;
2125     }
2126 
2127     CurPtr = ConsumeChar(CurPtr, Size, Result);
2128   }
2129 
2130   Result.setFlag(Token::HasUDSuffix);
2131   while (true) {
2132     C = getCharAndSize(CurPtr, Size);
2133     if (isAsciiIdentifierContinue(C)) {
2134       CurPtr = ConsumeChar(CurPtr, Size, Result);
2135     } else if (C == '\\' && tryConsumeIdentifierUCN(CurPtr, Size, Result)) {
2136     } else if (!isASCII(C) && tryConsumeIdentifierUTF8Char(CurPtr, Result)) {
2137     } else
2138       break;
2139   }
2140 
2141   return CurPtr;
2142 }
2143 
2144 /// LexStringLiteral - Lex the remainder of a string literal, after having lexed
2145 /// either " or L" or u8" or u" or U".
LexStringLiteral(Token & Result,const char * CurPtr,tok::TokenKind Kind)2146 bool Lexer::LexStringLiteral(Token &Result, const char *CurPtr,
2147                              tok::TokenKind Kind) {
2148   const char *AfterQuote = CurPtr;
2149   // Does this string contain the \0 character?
2150   const char *NulCharacter = nullptr;
2151 
2152   if (!isLexingRawMode() &&
2153       (Kind == tok::utf8_string_literal ||
2154        Kind == tok::utf16_string_literal ||
2155        Kind == tok::utf32_string_literal))
2156     Diag(BufferPtr, LangOpts.CPlusPlus ? diag::warn_cxx98_compat_unicode_literal
2157                                        : diag::warn_c99_compat_unicode_literal);
2158 
2159   char C = getAndAdvanceChar(CurPtr, Result);
2160   while (C != '"') {
2161     // Skip escaped characters.  Escaped newlines will already be processed by
2162     // getAndAdvanceChar.
2163     if (C == '\\')
2164       C = getAndAdvanceChar(CurPtr, Result);
2165 
2166     if (C == '\n' || C == '\r' ||             // Newline.
2167         (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
2168       if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
2169         Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 1;
2170       FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2171       return true;
2172     }
2173 
2174     if (C == 0) {
2175       if (isCodeCompletionPoint(CurPtr-1)) {
2176         if (ParsingFilename)
2177           codeCompleteIncludedFile(AfterQuote, CurPtr - 1, /*IsAngled=*/false);
2178         else
2179           PP->CodeCompleteNaturalLanguage();
2180         FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
2181         cutOffLexing();
2182         return true;
2183       }
2184 
2185       NulCharacter = CurPtr-1;
2186     }
2187     C = getAndAdvanceChar(CurPtr, Result);
2188   }
2189 
2190   // If we are in C++11, lex the optional ud-suffix.
2191   if (LangOpts.CPlusPlus)
2192     CurPtr = LexUDSuffix(Result, CurPtr, true);
2193 
2194   // If a nul character existed in the string, warn about it.
2195   if (NulCharacter && !isLexingRawMode())
2196     Diag(NulCharacter, diag::null_in_char_or_string) << 1;
2197 
2198   // Update the location of the token as well as the BufferPtr instance var.
2199   const char *TokStart = BufferPtr;
2200   FormTokenWithChars(Result, CurPtr, Kind);
2201   Result.setLiteralData(TokStart);
2202   return true;
2203 }
2204 
2205 /// LexRawStringLiteral - Lex the remainder of a raw string literal, after
2206 /// having lexed R", LR", u8R", uR", or UR".
LexRawStringLiteral(Token & Result,const char * CurPtr,tok::TokenKind Kind)2207 bool Lexer::LexRawStringLiteral(Token &Result, const char *CurPtr,
2208                                 tok::TokenKind Kind) {
2209   // This function doesn't use getAndAdvanceChar because C++0x [lex.pptoken]p3:
2210   //  Between the initial and final double quote characters of the raw string,
2211   //  any transformations performed in phases 1 and 2 (trigraphs,
2212   //  universal-character-names, and line splicing) are reverted.
2213 
2214   if (!isLexingRawMode())
2215     Diag(BufferPtr, diag::warn_cxx98_compat_raw_string_literal);
2216 
2217   unsigned PrefixLen = 0;
2218 
2219   while (PrefixLen != 16 && isRawStringDelimBody(CurPtr[PrefixLen]))
2220     ++PrefixLen;
2221 
2222   // If the last character was not a '(', then we didn't lex a valid delimiter.
2223   if (CurPtr[PrefixLen] != '(') {
2224     if (!isLexingRawMode()) {
2225       const char *PrefixEnd = &CurPtr[PrefixLen];
2226       if (PrefixLen == 16) {
2227         Diag(PrefixEnd, diag::err_raw_delim_too_long);
2228       } else {
2229         Diag(PrefixEnd, diag::err_invalid_char_raw_delim)
2230           << StringRef(PrefixEnd, 1);
2231       }
2232     }
2233 
2234     // Search for the next '"' in hopes of salvaging the lexer. Unfortunately,
2235     // it's possible the '"' was intended to be part of the raw string, but
2236     // there's not much we can do about that.
2237     while (true) {
2238       char C = *CurPtr++;
2239 
2240       if (C == '"')
2241         break;
2242       if (C == 0 && CurPtr-1 == BufferEnd) {
2243         --CurPtr;
2244         break;
2245       }
2246     }
2247 
2248     FormTokenWithChars(Result, CurPtr, tok::unknown);
2249     return true;
2250   }
2251 
2252   // Save prefix and move CurPtr past it
2253   const char *Prefix = CurPtr;
2254   CurPtr += PrefixLen + 1; // skip over prefix and '('
2255 
2256   while (true) {
2257     char C = *CurPtr++;
2258 
2259     if (C == ')') {
2260       // Check for prefix match and closing quote.
2261       if (strncmp(CurPtr, Prefix, PrefixLen) == 0 && CurPtr[PrefixLen] == '"') {
2262         CurPtr += PrefixLen + 1; // skip over prefix and '"'
2263         break;
2264       }
2265     } else if (C == 0 && CurPtr-1 == BufferEnd) { // End of file.
2266       if (!isLexingRawMode())
2267         Diag(BufferPtr, diag::err_unterminated_raw_string)
2268           << StringRef(Prefix, PrefixLen);
2269       FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2270       return true;
2271     }
2272   }
2273 
2274   // If we are in C++11, lex the optional ud-suffix.
2275   if (LangOpts.CPlusPlus)
2276     CurPtr = LexUDSuffix(Result, CurPtr, true);
2277 
2278   // Update the location of token as well as BufferPtr.
2279   const char *TokStart = BufferPtr;
2280   FormTokenWithChars(Result, CurPtr, Kind);
2281   Result.setLiteralData(TokStart);
2282   return true;
2283 }
2284 
2285 /// LexAngledStringLiteral - Lex the remainder of an angled string literal,
2286 /// after having lexed the '<' character.  This is used for #include filenames.
LexAngledStringLiteral(Token & Result,const char * CurPtr)2287 bool Lexer::LexAngledStringLiteral(Token &Result, const char *CurPtr) {
2288   // Does this string contain the \0 character?
2289   const char *NulCharacter = nullptr;
2290   const char *AfterLessPos = CurPtr;
2291   char C = getAndAdvanceChar(CurPtr, Result);
2292   while (C != '>') {
2293     // Skip escaped characters.  Escaped newlines will already be processed by
2294     // getAndAdvanceChar.
2295     if (C == '\\')
2296       C = getAndAdvanceChar(CurPtr, Result);
2297 
2298     if (isVerticalWhitespace(C) ||               // Newline.
2299         (C == 0 && (CurPtr - 1 == BufferEnd))) { // End of file.
2300       // If the filename is unterminated, then it must just be a lone <
2301       // character.  Return this as such.
2302       FormTokenWithChars(Result, AfterLessPos, tok::less);
2303       return true;
2304     }
2305 
2306     if (C == 0) {
2307       if (isCodeCompletionPoint(CurPtr - 1)) {
2308         codeCompleteIncludedFile(AfterLessPos, CurPtr - 1, /*IsAngled=*/true);
2309         cutOffLexing();
2310         FormTokenWithChars(Result, CurPtr - 1, tok::unknown);
2311         return true;
2312       }
2313       NulCharacter = CurPtr-1;
2314     }
2315     C = getAndAdvanceChar(CurPtr, Result);
2316   }
2317 
2318   // If a nul character existed in the string, warn about it.
2319   if (NulCharacter && !isLexingRawMode())
2320     Diag(NulCharacter, diag::null_in_char_or_string) << 1;
2321 
2322   // Update the location of token as well as BufferPtr.
2323   const char *TokStart = BufferPtr;
2324   FormTokenWithChars(Result, CurPtr, tok::header_name);
2325   Result.setLiteralData(TokStart);
2326   return true;
2327 }
2328 
codeCompleteIncludedFile(const char * PathStart,const char * CompletionPoint,bool IsAngled)2329 void Lexer::codeCompleteIncludedFile(const char *PathStart,
2330                                      const char *CompletionPoint,
2331                                      bool IsAngled) {
2332   // Completion only applies to the filename, after the last slash.
2333   StringRef PartialPath(PathStart, CompletionPoint - PathStart);
2334   llvm::StringRef SlashChars = LangOpts.MSVCCompat ? "/\\" : "/";
2335   auto Slash = PartialPath.find_last_of(SlashChars);
2336   StringRef Dir =
2337       (Slash == StringRef::npos) ? "" : PartialPath.take_front(Slash);
2338   const char *StartOfFilename =
2339       (Slash == StringRef::npos) ? PathStart : PathStart + Slash + 1;
2340   // Code completion filter range is the filename only, up to completion point.
2341   PP->setCodeCompletionIdentifierInfo(&PP->getIdentifierTable().get(
2342       StringRef(StartOfFilename, CompletionPoint - StartOfFilename)));
2343   // We should replace the characters up to the closing quote or closest slash,
2344   // if any.
2345   while (CompletionPoint < BufferEnd) {
2346     char Next = *(CompletionPoint + 1);
2347     if (Next == 0 || Next == '\r' || Next == '\n')
2348       break;
2349     ++CompletionPoint;
2350     if (Next == (IsAngled ? '>' : '"'))
2351       break;
2352     if (SlashChars.contains(Next))
2353       break;
2354   }
2355 
2356   PP->setCodeCompletionTokenRange(
2357       FileLoc.getLocWithOffset(StartOfFilename - BufferStart),
2358       FileLoc.getLocWithOffset(CompletionPoint - BufferStart));
2359   PP->CodeCompleteIncludedFile(Dir, IsAngled);
2360 }
2361 
2362 /// LexCharConstant - Lex the remainder of a character constant, after having
2363 /// lexed either ' or L' or u8' or u' or U'.
LexCharConstant(Token & Result,const char * CurPtr,tok::TokenKind Kind)2364 bool Lexer::LexCharConstant(Token &Result, const char *CurPtr,
2365                             tok::TokenKind Kind) {
2366   // Does this character contain the \0 character?
2367   const char *NulCharacter = nullptr;
2368 
2369   if (!isLexingRawMode()) {
2370     if (Kind == tok::utf16_char_constant || Kind == tok::utf32_char_constant)
2371       Diag(BufferPtr, LangOpts.CPlusPlus
2372                           ? diag::warn_cxx98_compat_unicode_literal
2373                           : diag::warn_c99_compat_unicode_literal);
2374     else if (Kind == tok::utf8_char_constant)
2375       Diag(BufferPtr, diag::warn_cxx14_compat_u8_character_literal);
2376   }
2377 
2378   char C = getAndAdvanceChar(CurPtr, Result);
2379   if (C == '\'') {
2380     if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
2381       Diag(BufferPtr, diag::ext_empty_character);
2382     FormTokenWithChars(Result, CurPtr, tok::unknown);
2383     return true;
2384   }
2385 
2386   while (C != '\'') {
2387     // Skip escaped characters.
2388     if (C == '\\')
2389       C = getAndAdvanceChar(CurPtr, Result);
2390 
2391     if (C == '\n' || C == '\r' ||             // Newline.
2392         (C == 0 && CurPtr-1 == BufferEnd)) {  // End of file.
2393       if (!isLexingRawMode() && !LangOpts.AsmPreprocessor)
2394         Diag(BufferPtr, diag::ext_unterminated_char_or_string) << 0;
2395       FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2396       return true;
2397     }
2398 
2399     if (C == 0) {
2400       if (isCodeCompletionPoint(CurPtr-1)) {
2401         PP->CodeCompleteNaturalLanguage();
2402         FormTokenWithChars(Result, CurPtr-1, tok::unknown);
2403         cutOffLexing();
2404         return true;
2405       }
2406 
2407       NulCharacter = CurPtr-1;
2408     }
2409     C = getAndAdvanceChar(CurPtr, Result);
2410   }
2411 
2412   // If we are in C++11, lex the optional ud-suffix.
2413   if (LangOpts.CPlusPlus)
2414     CurPtr = LexUDSuffix(Result, CurPtr, false);
2415 
2416   // If a nul character existed in the character, warn about it.
2417   if (NulCharacter && !isLexingRawMode())
2418     Diag(NulCharacter, diag::null_in_char_or_string) << 0;
2419 
2420   // Update the location of token as well as BufferPtr.
2421   const char *TokStart = BufferPtr;
2422   FormTokenWithChars(Result, CurPtr, Kind);
2423   Result.setLiteralData(TokStart);
2424   return true;
2425 }
2426 
2427 /// SkipWhitespace - Efficiently skip over a series of whitespace characters.
2428 /// Update BufferPtr to point to the next non-whitespace character and return.
2429 ///
2430 /// This method forms a token and returns true if KeepWhitespaceMode is enabled.
SkipWhitespace(Token & Result,const char * CurPtr,bool & TokAtPhysicalStartOfLine)2431 bool Lexer::SkipWhitespace(Token &Result, const char *CurPtr,
2432                            bool &TokAtPhysicalStartOfLine) {
2433   // Whitespace - Skip it, then return the token after the whitespace.
2434   bool SawNewline = isVerticalWhitespace(CurPtr[-1]);
2435 
2436   unsigned char Char = *CurPtr;
2437 
2438   const char *lastNewLine = nullptr;
2439   auto setLastNewLine = [&](const char *Ptr) {
2440     lastNewLine = Ptr;
2441     if (!NewLinePtr)
2442       NewLinePtr = Ptr;
2443   };
2444   if (SawNewline)
2445     setLastNewLine(CurPtr - 1);
2446 
2447   // Skip consecutive spaces efficiently.
2448   while (true) {
2449     // Skip horizontal whitespace very aggressively.
2450     while (isHorizontalWhitespace(Char))
2451       Char = *++CurPtr;
2452 
2453     // Otherwise if we have something other than whitespace, we're done.
2454     if (!isVerticalWhitespace(Char))
2455       break;
2456 
2457     if (ParsingPreprocessorDirective) {
2458       // End of preprocessor directive line, let LexTokenInternal handle this.
2459       BufferPtr = CurPtr;
2460       return false;
2461     }
2462 
2463     // OK, but handle newline.
2464     if (*CurPtr == '\n')
2465       setLastNewLine(CurPtr);
2466     SawNewline = true;
2467     Char = *++CurPtr;
2468   }
2469 
2470   // If the client wants us to return whitespace, return it now.
2471   if (isKeepWhitespaceMode()) {
2472     FormTokenWithChars(Result, CurPtr, tok::unknown);
2473     if (SawNewline) {
2474       IsAtStartOfLine = true;
2475       IsAtPhysicalStartOfLine = true;
2476     }
2477     // FIXME: The next token will not have LeadingSpace set.
2478     return true;
2479   }
2480 
2481   // If this isn't immediately after a newline, there is leading space.
2482   char PrevChar = CurPtr[-1];
2483   bool HasLeadingSpace = !isVerticalWhitespace(PrevChar);
2484 
2485   Result.setFlagValue(Token::LeadingSpace, HasLeadingSpace);
2486   if (SawNewline) {
2487     Result.setFlag(Token::StartOfLine);
2488     TokAtPhysicalStartOfLine = true;
2489 
2490     if (NewLinePtr && lastNewLine && NewLinePtr != lastNewLine && PP) {
2491       if (auto *Handler = PP->getEmptylineHandler())
2492         Handler->HandleEmptyline(SourceRange(getSourceLocation(NewLinePtr + 1),
2493                                              getSourceLocation(lastNewLine)));
2494     }
2495   }
2496 
2497   BufferPtr = CurPtr;
2498   return false;
2499 }
2500 
/// We have just read the // characters from input.  Skip until we find the
/// newline character that terminates the comment.  Then update BufferPtr and
/// return.
///
/// If we're in KeepCommentMode or any CommentHandler has inserted
/// some tokens, this will store the first token and return true.
///
/// \param Result  Token that receives the comment (or a handler-inserted
///                token) when true is returned; otherwise only its
///                StartOfLine / LeadingSpace flags are updated.
/// \param CurPtr  Points just past the "//" that introduced the comment.
/// \param TokAtPhysicalStartOfLine  Set to true when the terminating newline
///                is consumed here.
/// \returns true if \p Result holds a token the caller must return, false if
///          the comment was skipped (or lexing was cut off for code
///          completion).
bool Lexer::SkipLineComment(Token &Result, const char *CurPtr,
                            bool &TokAtPhysicalStartOfLine) {
  // If Line comments aren't explicitly enabled for this language, emit an
  // extension warning.
  if (!LineComment) {
    if (!isLexingRawMode()) // There's no PP in raw mode, so can't emit diags.
      Diag(BufferPtr, diag::ext_line_comment);

    // Mark them enabled so we only emit one warning for this translation
    // unit.
    LineComment = true;
  }

  // Scan over the body of the comment.  The common case, when scanning, is that
  // the comment contains normal ascii characters with nothing interesting in
  // them.  As such, optimize for this case with the inner loop.
  //
  // This loop terminates with CurPtr pointing at the newline (or end of buffer)
  // character that ends the line comment.

  // C++23 [lex.phases] p1
  // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
  // diagnostic only once per entire ill-formed subsequence to avoid
  // emitting too many diagnostics (see http://unicode.org/review/pr-121.html).
  bool UnicodeDecodingAlreadyDiagnosed = false;

  char C;
  while (true) {
    C = *CurPtr;
    // Skip over characters in the fast loop.
    while (isASCII(C) && C != 0 &&   // Potentially EOF.
           C != '\n' && C != '\r') { // Newline or DOS-style newline.
      C = *++CurPtr;
      UnicodeDecodingAlreadyDiagnosed = false;
    }

    // A non-ASCII byte: step over the whole UTF-8 sequence if it is valid, or
    // over a single byte (diagnosing once per ill-formed run) if it is not,
    // then go back to the fast loop.
    if (!isASCII(C)) {
      unsigned Length = llvm::getUTF8SequenceSize(
          (const llvm::UTF8 *)CurPtr, (const llvm::UTF8 *)BufferEnd);
      if (Length == 0) {
        if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
          Diag(CurPtr, diag::warn_invalid_utf8_in_comment);
        UnicodeDecodingAlreadyDiagnosed = true;
        ++CurPtr;
      } else {
        UnicodeDecodingAlreadyDiagnosed = false;
        CurPtr += Length;
      }
      continue;
    }

    // Here C is a newline or a NUL.  Remember where the fast loop stopped so
    // we can return to it after the slow-path probe below.
    const char *NextLine = CurPtr;
    if (C != 0) {
      // We found a newline, see if it's escaped.
      const char *EscapePtr = CurPtr-1;
      bool HasSpace = false;
      while (isHorizontalWhitespace(*EscapePtr)) { // Skip whitespace.
        --EscapePtr;
        HasSpace = true;
      }

      // On an escaped newline, back CurPtr up to the start of the escape so
      // that getAndAdvanceChar below decodes the whole splice.
      if (*EscapePtr == '\\')
        // Escaped newline.
        CurPtr = EscapePtr;
      else if (EscapePtr[0] == '/' && EscapePtr[-1] == '?' &&
               EscapePtr[-2] == '?' && LangOpts.Trigraphs)
        // Trigraph-escaped newline.
        CurPtr = EscapePtr-2;
      else
        break; // This is a newline, we're done.

      // If there was space between the backslash and newline, warn about it.
      if (HasSpace && !isLexingRawMode())
        Diag(EscapePtr, diag::backslash_newline_space);
    }

    // Otherwise, this is a hard case.  Fall back on getAndAdvanceChar to
    // properly decode the character.  Read it in raw mode to avoid emitting
    // diagnostics about things like trigraphs.  If we see an escaped newline,
    // we'll handle it below.
    const char *OldPtr = CurPtr;
    bool OldRawMode = isLexingRawMode();
    LexingRawMode = true;
    C = getAndAdvanceChar(CurPtr, Result);
    LexingRawMode = OldRawMode;

    // If we only read only one character, then no special handling is needed.
    // We're done and can skip forward to the newline.
    if (C != 0 && CurPtr == OldPtr+1) {
      CurPtr = NextLine;
      break;
    }

    // If we read multiple characters, and one of those characters was a \r or
    // \n, then we had an escaped newline within the comment.  Emit diagnostic
    // unless the next line is also a // comment.
    if (CurPtr != OldPtr + 1 && C != '/' &&
        (CurPtr == BufferEnd + 1 || CurPtr[0] != '/')) {
      for (; OldPtr != CurPtr; ++OldPtr)
        if (OldPtr[0] == '\n' || OldPtr[0] == '\r') {
          // Okay, we found a // comment that ends in a newline, if the next
          // line is also a // comment, but has spaces, don't emit a diagnostic.
          if (isWhitespace(C)) {
            const char *ForwardPtr = CurPtr;
            while (isWhitespace(*ForwardPtr))  // Skip whitespace.
              ++ForwardPtr;
            if (ForwardPtr[0] == '/' && ForwardPtr[1] == '/')
              break;
          }

          if (!isLexingRawMode())
            Diag(OldPtr-1, diag::ext_multi_line_line_comment);
          break;
        }
    }

    // The decoded character ended the comment (a newline, or the read consumed
    // the byte at BufferEnd).  Un-consume it so CurPtr points at the
    // terminator.
    if (C == '\r' || C == '\n' || CurPtr == BufferEnd + 1) {
      --CurPtr;
      break;
    }

    // A NUL inside the buffer may be the code-completion marker.
    if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
      PP->CodeCompleteNaturalLanguage();
      cutOffLexing();
      return false;
    }
  }

  // Found but did not consume the newline.  Notify comment handlers about the
  // comment unless we're in a #if 0 block.
  if (PP && !isLexingRawMode() &&
      PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
                                            getSourceLocation(CurPtr)))) {
    BufferPtr = CurPtr;
    return true; // A token has to be returned.
  }

  // If we are returning comments as tokens, return this comment as a token.
  if (inKeepCommentMode())
    return SaveLineComment(Result, CurPtr);

  // If we are inside a preprocessor directive and we see the end of line,
  // return immediately, so that the lexer can return this as an EOD token.
  if (ParsingPreprocessorDirective || CurPtr == BufferEnd) {
    BufferPtr = CurPtr;
    return false;
  }

  // Otherwise, eat the \n character.  We don't care if this is a \n\r or
  // \r\n sequence.  This is an efficiency hack (because we know the \n can't
  // contribute to another token), it isn't needed for correctness.  Note that
  // this is ok even in KeepWhitespaceMode, because we would have returned the
  // comment above in that mode.
  NewLinePtr = CurPtr++;

  // The next returned token is at the start of the line.
  Result.setFlag(Token::StartOfLine);
  TokAtPhysicalStartOfLine = true;
  // No leading whitespace seen so far.
  Result.clearFlag(Token::LeadingSpace);
  BufferPtr = CurPtr;
  return false;
}
2670 
2671 /// If in save-comment mode, package up this Line comment in an appropriate
2672 /// way and return it.
SaveLineComment(Token & Result,const char * CurPtr)2673 bool Lexer::SaveLineComment(Token &Result, const char *CurPtr) {
2674   // If we're not in a preprocessor directive, just return the // comment
2675   // directly.
2676   FormTokenWithChars(Result, CurPtr, tok::comment);
2677 
2678   if (!ParsingPreprocessorDirective || LexingRawMode)
2679     return true;
2680 
2681   // If this Line-style comment is in a macro definition, transmogrify it into
2682   // a C-style block comment.
2683   bool Invalid = false;
2684   std::string Spelling = PP->getSpelling(Result, &Invalid);
2685   if (Invalid)
2686     return true;
2687 
2688   assert(Spelling[0] == '/' && Spelling[1] == '/' && "Not line comment?");
2689   Spelling[1] = '*';   // Change prefix to "/*".
2690   Spelling += "*/";    // add suffix.
2691 
2692   Result.setKind(tok::comment);
2693   PP->CreateString(Spelling, Result,
2694                    Result.getLocation(), Result.getLocation());
2695   return true;
2696 }
2697 
/// isEndOfBlockCommentWithEscapedNewLine - Return true if the specified newline
/// character (either \\n or \\r) is the last of one or more escaped newlines
/// that are immediately preceded by a '*'.  The caller invokes this for a
/// newline that directly precedes a '/', so a true result means that after
/// line splicing the '*' and '/' form the '*/' that ends the block comment.
/// Issues diagnostics if so.  We know that the newline is inside of a block
/// comment.
///
/// \param CurPtr     Points at the newline character to examine.
/// \param L          Lexer used to emit diagnostics (suppressed in raw mode).
/// \param Trigraphs  Whether trigraphs are enabled; if a ??/ trigraph was used
///                   as the escape and this is false, the sequence does not
///                   end the comment and false is returned.
static bool isEndOfBlockCommentWithEscapedNewLine(const char *CurPtr, Lexer *L,
                                                  bool Trigraphs) {
  assert(CurPtr[0] == '\n' || CurPtr[0] == '\r');

  // Position of the first trigraph in the ending sequence.
  const char *TrigraphPos = nullptr;
  // Position of the first whitespace after a '\' in the ending sequence.
  const char *SpacePos = nullptr;

  // Walk backwards over a chain of escaped newlines.  Each iteration consumes
  // one newline together with the escape in front of it.
  while (true) {
    // Back up off the newline.
    --CurPtr;

    // If this is a two-character newline sequence, skip the other character.
    if (CurPtr[0] == '\n' || CurPtr[0] == '\r') {
      // \n\n or \r\r -> not escaped newline.
      if (CurPtr[0] == CurPtr[1])
        return false;
      // \n\r or \r\n -> skip the newline.
      --CurPtr;
    }

    // If we have horizontal whitespace, skip over it.  We allow whitespace
    // between the slash and newline.  NUL bytes are skipped here as well.
    while (isHorizontalWhitespace(*CurPtr) || *CurPtr == 0) {
      SpacePos = CurPtr;
      --CurPtr;
    }

    // If we have a slash, this is an escaped newline.
    if (*CurPtr == '\\') {
      --CurPtr;
    } else if (CurPtr[0] == '/' && CurPtr[-1] == '?' && CurPtr[-2] == '?') {
      // This is a trigraph encoding of a slash.
      TrigraphPos = CurPtr - 2;
      CurPtr -= 3;
    } else {
      return false;
    }

    // If the character preceding the escaped newline is a '*', then after line
    // splicing we have a '*/' ending the comment.
    if (*CurPtr == '*')
      break;

    // Otherwise the chain only continues if another newline precedes the
    // escape.
    if (*CurPtr != '\n' && *CurPtr != '\r')
      return false;
  }

  if (TrigraphPos) {
    // If no trigraphs are enabled, warn that we ignored this trigraph and
    // ignore this * character.
    if (!Trigraphs) {
      if (!L->isLexingRawMode())
        L->Diag(TrigraphPos, diag::trigraph_ignored_block_comment);
      return false;
    }
    if (!L->isLexingRawMode())
      L->Diag(TrigraphPos, diag::trigraph_ends_block_comment);
  }

  // Warn about having an escaped newline between the */ characters.
  // CurPtr points at the '*', so CurPtr + 1 is the start of the escape.
  if (!L->isLexingRawMode())
    L->Diag(CurPtr + 1, diag::escaped_newline_block_comment_end);

  // If there was space between the backslash and newline, warn about it.
  if (SpacePos && !L->isLexingRawMode())
    L->Diag(SpacePos, diag::backslash_newline_space);

  return true;
}
2772 
2773 #ifdef __SSE2__
2774 #include <emmintrin.h>
2775 #elif __ALTIVEC__
2776 #include <altivec.h>
2777 #undef bool
2778 #endif
2779 
2780 /// We have just read from input the / and * characters that started a comment.
2781 /// Read until we find the * and / characters that terminate the comment.
2782 /// Note that we don't bother decoding trigraphs or escaped newlines in block
2783 /// comments, because they cannot cause the comment to end.  The only thing
2784 /// that can happen is the comment could end with an escaped newline between
2785 /// the terminating * and /.
2786 ///
2787 /// If we're in KeepCommentMode or any CommentHandler has inserted
2788 /// some tokens, this will store the first token and return true.
SkipBlockComment(Token & Result,const char * CurPtr,bool & TokAtPhysicalStartOfLine)2789 bool Lexer::SkipBlockComment(Token &Result, const char *CurPtr,
2790                              bool &TokAtPhysicalStartOfLine) {
2791   // Scan one character past where we should, looking for a '/' character.  Once
2792   // we find it, check to see if it was preceded by a *.  This common
2793   // optimization helps people who like to put a lot of * characters in their
2794   // comments.
2795 
2796   // The first character we get with newlines and trigraphs skipped to handle
2797   // the degenerate /*/ case below correctly if the * has an escaped newline
2798   // after it.
2799   unsigned CharSize;
2800   unsigned char C = getCharAndSize(CurPtr, CharSize);
2801   CurPtr += CharSize;
2802   if (C == 0 && CurPtr == BufferEnd+1) {
2803     if (!isLexingRawMode())
2804       Diag(BufferPtr, diag::err_unterminated_block_comment);
2805     --CurPtr;
2806 
2807     // KeepWhitespaceMode should return this broken comment as a token.  Since
2808     // it isn't a well formed comment, just return it as an 'unknown' token.
2809     if (isKeepWhitespaceMode()) {
2810       FormTokenWithChars(Result, CurPtr, tok::unknown);
2811       return true;
2812     }
2813 
2814     BufferPtr = CurPtr;
2815     return false;
2816   }
2817 
2818   // Check to see if the first character after the '/*' is another /.  If so,
2819   // then this slash does not end the block comment, it is part of it.
2820   if (C == '/')
2821     C = *CurPtr++;
2822 
2823   // C++23 [lex.phases] p1
2824   // Diagnose invalid UTF-8 if the corresponding warning is enabled, emitting a
2825   // diagnostic only once per entire ill-formed subsequence to avoid
  // emitting too many diagnostics (see http://unicode.org/review/pr-121.html).
2827   bool UnicodeDecodingAlreadyDiagnosed = false;
2828 
2829   while (true) {
2830     // Skip over all non-interesting characters until we find end of buffer or a
2831     // (probably ending) '/' character.
2832     if (CurPtr + 24 < BufferEnd &&
2833         // If there is a code-completion point avoid the fast scan because it
2834         // doesn't check for '\0'.
2835         !(PP && PP->getCodeCompletionFileLoc() == FileLoc)) {
2836       // While not aligned to a 16-byte boundary.
2837       while (C != '/' && (intptr_t)CurPtr % 16 != 0) {
2838         if (!isASCII(C))
2839           goto MultiByteUTF8;
2840         C = *CurPtr++;
2841       }
2842       if (C == '/') goto FoundSlash;
2843 
2844 #ifdef __SSE2__
2845       __m128i Slashes = _mm_set1_epi8('/');
2846       while (CurPtr + 16 < BufferEnd) {
2847         int Mask = _mm_movemask_epi8(*(const __m128i *)CurPtr);
2848         if (LLVM_UNLIKELY(Mask != 0)) {
2849           goto MultiByteUTF8;
2850         }
2851         // look for slashes
2852         int cmp = _mm_movemask_epi8(_mm_cmpeq_epi8(*(const __m128i*)CurPtr,
2853                                     Slashes));
2854         if (cmp != 0) {
2855           // Adjust the pointer to point directly after the first slash. It's
2856           // not necessary to set C here, it will be overwritten at the end of
2857           // the outer loop.
2858           CurPtr += llvm::countr_zero<unsigned>(cmp) + 1;
2859           goto FoundSlash;
2860         }
2861         CurPtr += 16;
2862       }
2863 #elif __ALTIVEC__
2864       __vector unsigned char LongUTF = {0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
2865                                         0x80, 0x80, 0x80, 0x80, 0x80, 0x80,
2866                                         0x80, 0x80, 0x80, 0x80};
2867       __vector unsigned char Slashes = {
2868         '/', '/', '/', '/',  '/', '/', '/', '/',
2869         '/', '/', '/', '/',  '/', '/', '/', '/'
2870       };
2871       while (CurPtr + 16 < BufferEnd) {
2872         if (LLVM_UNLIKELY(
2873                 vec_any_ge(*(const __vector unsigned char *)CurPtr, LongUTF)))
2874           goto MultiByteUTF8;
2875         if (vec_any_eq(*(const __vector unsigned char *)CurPtr, Slashes)) {
2876           break;
2877         }
2878         CurPtr += 16;
2879       }
2880 
2881 #else
2882       while (CurPtr + 16 < BufferEnd) {
2883         bool HasNonASCII = false;
2884         for (unsigned I = 0; I < 16; ++I)
2885           HasNonASCII |= !isASCII(CurPtr[I]);
2886 
2887         if (LLVM_UNLIKELY(HasNonASCII))
2888           goto MultiByteUTF8;
2889 
2890         bool HasSlash = false;
2891         for (unsigned I = 0; I < 16; ++I)
2892           HasSlash |= CurPtr[I] == '/';
2893         if (HasSlash)
2894           break;
2895         CurPtr += 16;
2896       }
2897 #endif
2898 
2899       // It has to be one of the bytes scanned, increment to it and read one.
2900       C = *CurPtr++;
2901     }
2902 
2903     // Loop to scan the remainder, warning on invalid UTF-8
2904     // if the corresponding warning is enabled, emitting a diagnostic only once
2905     // per sequence that cannot be decoded.
2906     while (C != '/' && C != '\0') {
2907       if (isASCII(C)) {
2908         UnicodeDecodingAlreadyDiagnosed = false;
2909         C = *CurPtr++;
2910         continue;
2911       }
2912     MultiByteUTF8:
2913       // CurPtr is 1 code unit past C, so to decode
2914       // the codepoint, we need to read from the previous position.
2915       unsigned Length = llvm::getUTF8SequenceSize(
2916           (const llvm::UTF8 *)CurPtr - 1, (const llvm::UTF8 *)BufferEnd);
2917       if (Length == 0) {
2918         if (!UnicodeDecodingAlreadyDiagnosed && !isLexingRawMode())
2919           Diag(CurPtr - 1, diag::warn_invalid_utf8_in_comment);
2920         UnicodeDecodingAlreadyDiagnosed = true;
2921       } else {
2922         UnicodeDecodingAlreadyDiagnosed = false;
2923         CurPtr += Length - 1;
2924       }
2925       C = *CurPtr++;
2926     }
2927 
2928     if (C == '/') {
2929   FoundSlash:
2930       if (CurPtr[-2] == '*')  // We found the final */.  We're done!
2931         break;
2932 
2933       if ((CurPtr[-2] == '\n' || CurPtr[-2] == '\r')) {
2934         if (isEndOfBlockCommentWithEscapedNewLine(CurPtr - 2, this,
2935                                                   LangOpts.Trigraphs)) {
2936           // We found the final */, though it had an escaped newline between the
2937           // * and /.  We're done!
2938           break;
2939         }
2940       }
2941       if (CurPtr[0] == '*' && CurPtr[1] != '/') {
2942         // If this is a /* inside of the comment, emit a warning.  Don't do this
2943         // if this is a /*/, which will end the comment.  This misses cases with
2944         // embedded escaped newlines, but oh well.
2945         if (!isLexingRawMode())
2946           Diag(CurPtr-1, diag::warn_nested_block_comment);
2947       }
2948     } else if (C == 0 && CurPtr == BufferEnd+1) {
2949       if (!isLexingRawMode())
2950         Diag(BufferPtr, diag::err_unterminated_block_comment);
2951       // Note: the user probably forgot a */.  We could continue immediately
2952       // after the /*, but this would involve lexing a lot of what really is the
2953       // comment, which surely would confuse the parser.
2954       --CurPtr;
2955 
2956       // KeepWhitespaceMode should return this broken comment as a token.  Since
2957       // it isn't a well formed comment, just return it as an 'unknown' token.
2958       if (isKeepWhitespaceMode()) {
2959         FormTokenWithChars(Result, CurPtr, tok::unknown);
2960         return true;
2961       }
2962 
2963       BufferPtr = CurPtr;
2964       return false;
2965     } else if (C == '\0' && isCodeCompletionPoint(CurPtr-1)) {
2966       PP->CodeCompleteNaturalLanguage();
2967       cutOffLexing();
2968       return false;
2969     }
2970 
2971     C = *CurPtr++;
2972   }
2973 
2974   // Notify comment handlers about the comment unless we're in a #if 0 block.
2975   if (PP && !isLexingRawMode() &&
2976       PP->HandleComment(Result, SourceRange(getSourceLocation(BufferPtr),
2977                                             getSourceLocation(CurPtr)))) {
2978     BufferPtr = CurPtr;
2979     return true; // A token has to be returned.
2980   }
2981 
2982   // If we are returning comments as tokens, return this comment as a token.
2983   if (inKeepCommentMode()) {
2984     FormTokenWithChars(Result, CurPtr, tok::comment);
2985     return true;
2986   }
2987 
2988   // It is common for the tokens immediately after a /**/ comment to be
2989   // whitespace.  Instead of going through the big switch, handle it
2990   // efficiently now.  This is safe even in KeepWhitespaceMode because we would
2991   // have already returned above with the comment as a token.
2992   if (isHorizontalWhitespace(*CurPtr)) {
2993     SkipWhitespace(Result, CurPtr+1, TokAtPhysicalStartOfLine);
2994     return false;
2995   }
2996 
2997   // Otherwise, just return so that the next character will be lexed as a token.
2998   BufferPtr = CurPtr;
2999   Result.setFlag(Token::LeadingSpace);
3000   return false;
3001 }
3002 
3003 //===----------------------------------------------------------------------===//
3004 // Primary Lexing Entry Points
3005 //===----------------------------------------------------------------------===//
3006 
3007 /// ReadToEndOfLine - Read the rest of the current preprocessor line as an
3008 /// uninterpreted string.  This switches the lexer out of directive mode.
ReadToEndOfLine(SmallVectorImpl<char> * Result)3009 void Lexer::ReadToEndOfLine(SmallVectorImpl<char> *Result) {
3010   assert(ParsingPreprocessorDirective && ParsingFilename == false &&
3011          "Must be in a preprocessing directive!");
3012   Token Tmp;
3013   Tmp.startToken();
3014 
3015   // CurPtr - Cache BufferPtr in an automatic variable.
3016   const char *CurPtr = BufferPtr;
3017   while (true) {
3018     char Char = getAndAdvanceChar(CurPtr, Tmp);
3019     switch (Char) {
3020     default:
3021       if (Result)
3022         Result->push_back(Char);
3023       break;
3024     case 0:  // Null.
3025       // Found end of file?
3026       if (CurPtr-1 != BufferEnd) {
3027         if (isCodeCompletionPoint(CurPtr-1)) {
3028           PP->CodeCompleteNaturalLanguage();
3029           cutOffLexing();
3030           return;
3031         }
3032 
3033         // Nope, normal character, continue.
3034         if (Result)
3035           Result->push_back(Char);
3036         break;
3037       }
3038       // FALL THROUGH.
3039       [[fallthrough]];
3040     case '\r':
3041     case '\n':
3042       // Okay, we found the end of the line. First, back up past the \0, \r, \n.
3043       assert(CurPtr[-1] == Char && "Trigraphs for newline?");
3044       BufferPtr = CurPtr-1;
3045 
3046       // Next, lex the character, which should handle the EOD transition.
3047       Lex(Tmp);
3048       if (Tmp.is(tok::code_completion)) {
3049         if (PP)
3050           PP->CodeCompleteNaturalLanguage();
3051         Lex(Tmp);
3052       }
3053       assert(Tmp.is(tok::eod) && "Unexpected token!");
3054 
3055       // Finally, we're done;
3056       return;
3057     }
3058   }
3059 }
3060 
3061 /// LexEndOfFile - CurPtr points to the end of this file.  Handle this
3062 /// condition, reporting diagnostics and handling other edge cases as required.
3063 /// This returns true if Result contains a token, false if PP.Lex should be
3064 /// called again.
LexEndOfFile(Token & Result,const char * CurPtr)3065 bool Lexer::LexEndOfFile(Token &Result, const char *CurPtr) {
3066   // If we hit the end of the file while parsing a preprocessor directive,
3067   // end the preprocessor directive first.  The next token returned will
3068   // then be the end of file.
3069   if (ParsingPreprocessorDirective) {
3070     // Done parsing the "line".
3071     ParsingPreprocessorDirective = false;
3072     // Update the location of token as well as BufferPtr.
3073     FormTokenWithChars(Result, CurPtr, tok::eod);
3074 
3075     // Restore comment saving mode, in case it was disabled for directive.
3076     if (PP)
3077       resetExtendedTokenMode();
3078     return true;  // Have a token.
3079   }
3080 
3081   // If we are in raw mode, return this event as an EOF token.  Let the caller
3082   // that put us in raw mode handle the event.
3083   if (isLexingRawMode()) {
3084     Result.startToken();
3085     BufferPtr = BufferEnd;
3086     FormTokenWithChars(Result, BufferEnd, tok::eof);
3087     return true;
3088   }
3089 
3090   if (PP->isRecordingPreamble() && PP->isInPrimaryFile()) {
3091     PP->setRecordedPreambleConditionalStack(ConditionalStack);
3092     // If the preamble cuts off the end of a header guard, consider it guarded.
3093     // The guard is valid for the preamble content itself, and for tools the
3094     // most useful answer is "yes, this file has a header guard".
3095     if (!ConditionalStack.empty())
3096       MIOpt.ExitTopLevelConditional();
3097     ConditionalStack.clear();
3098   }
3099 
3100   // Issue diagnostics for unterminated #if and missing newline.
3101 
3102   // If we are in a #if directive, emit an error.
3103   while (!ConditionalStack.empty()) {
3104     if (PP->getCodeCompletionFileLoc() != FileLoc)
3105       PP->Diag(ConditionalStack.back().IfLoc,
3106                diag::err_pp_unterminated_conditional);
3107     ConditionalStack.pop_back();
3108   }
3109 
3110   // C99 5.1.1.2p2: If the file is non-empty and didn't end in a newline, issue
3111   // a pedwarn.
3112   if (CurPtr != BufferStart && (CurPtr[-1] != '\n' && CurPtr[-1] != '\r')) {
3113     DiagnosticsEngine &Diags = PP->getDiagnostics();
3114     SourceLocation EndLoc = getSourceLocation(BufferEnd);
3115     unsigned DiagID;
3116 
3117     if (LangOpts.CPlusPlus11) {
3118       // C++11 [lex.phases] 2.2 p2
3119       // Prefer the C++98 pedantic compatibility warning over the generic,
3120       // non-extension, user-requested "missing newline at EOF" warning.
3121       if (!Diags.isIgnored(diag::warn_cxx98_compat_no_newline_eof, EndLoc)) {
3122         DiagID = diag::warn_cxx98_compat_no_newline_eof;
3123       } else {
3124         DiagID = diag::warn_no_newline_eof;
3125       }
3126     } else {
3127       DiagID = diag::ext_no_newline_eof;
3128     }
3129 
3130     Diag(BufferEnd, DiagID)
3131       << FixItHint::CreateInsertion(EndLoc, "\n");
3132   }
3133 
3134   BufferPtr = CurPtr;
3135 
3136   // Finally, let the preprocessor handle this.
3137   return PP->HandleEndOfFile(Result, isPragmaLexer());
3138 }
3139 
3140 /// isNextPPTokenLParen - Return 1 if the next unexpanded token lexed from
3141 /// the specified lexer will return a tok::l_paren token, 0 if it is something
3142 /// else and 2 if there are no more tokens in the buffer controlled by the
3143 /// lexer.
isNextPPTokenLParen()3144 unsigned Lexer::isNextPPTokenLParen() {
3145   assert(!LexingRawMode && "How can we expand a macro from a skipping buffer?");
3146 
3147   if (isDependencyDirectivesLexer()) {
3148     if (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size())
3149       return 2;
3150     return DepDirectives.front().Tokens[NextDepDirectiveTokenIndex].is(
3151         tok::l_paren);
3152   }
3153 
3154   // Switch to 'skipping' mode.  This will ensure that we can lex a token
3155   // without emitting diagnostics, disables macro expansion, and will cause EOF
3156   // to return an EOF token instead of popping the include stack.
3157   LexingRawMode = true;
3158 
3159   // Save state that can be changed while lexing so that we can restore it.
3160   const char *TmpBufferPtr = BufferPtr;
3161   bool inPPDirectiveMode = ParsingPreprocessorDirective;
3162   bool atStartOfLine = IsAtStartOfLine;
3163   bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
3164   bool leadingSpace = HasLeadingSpace;
3165 
3166   Token Tok;
3167   Lex(Tok);
3168 
3169   // Restore state that may have changed.
3170   BufferPtr = TmpBufferPtr;
3171   ParsingPreprocessorDirective = inPPDirectiveMode;
3172   HasLeadingSpace = leadingSpace;
3173   IsAtStartOfLine = atStartOfLine;
3174   IsAtPhysicalStartOfLine = atPhysicalStartOfLine;
3175 
3176   // Restore the lexer back to non-skipping mode.
3177   LexingRawMode = false;
3178 
3179   if (Tok.is(tok::eof))
3180     return 2;
3181   return Tok.is(tok::l_paren);
3182 }
3183 
3184 /// Find the end of a version control conflict marker.
FindConflictEnd(const char * CurPtr,const char * BufferEnd,ConflictMarkerKind CMK)3185 static const char *FindConflictEnd(const char *CurPtr, const char *BufferEnd,
3186                                    ConflictMarkerKind CMK) {
3187   const char *Terminator = CMK == CMK_Perforce ? "<<<<\n" : ">>>>>>>";
3188   size_t TermLen = CMK == CMK_Perforce ? 5 : 7;
3189   auto RestOfBuffer = StringRef(CurPtr, BufferEnd - CurPtr).substr(TermLen);
3190   size_t Pos = RestOfBuffer.find(Terminator);
3191   while (Pos != StringRef::npos) {
3192     // Must occur at start of line.
3193     if (Pos == 0 ||
3194         (RestOfBuffer[Pos - 1] != '\r' && RestOfBuffer[Pos - 1] != '\n')) {
3195       RestOfBuffer = RestOfBuffer.substr(Pos+TermLen);
3196       Pos = RestOfBuffer.find(Terminator);
3197       continue;
3198     }
3199     return RestOfBuffer.data()+Pos;
3200   }
3201   return nullptr;
3202 }
3203 
3204 /// IsStartOfConflictMarker - If the specified pointer is the start of a version
3205 /// control conflict marker like '<<<<<<<', recognize it as such, emit an error
3206 /// and recover nicely.  This returns true if it is a conflict marker and false
3207 /// if not.
IsStartOfConflictMarker(const char * CurPtr)3208 bool Lexer::IsStartOfConflictMarker(const char *CurPtr) {
3209   // Only a conflict marker if it starts at the beginning of a line.
3210   if (CurPtr != BufferStart &&
3211       CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
3212     return false;
3213 
3214   // Check to see if we have <<<<<<< or >>>>.
3215   if (!StringRef(CurPtr, BufferEnd - CurPtr).starts_with("<<<<<<<") &&
3216       !StringRef(CurPtr, BufferEnd - CurPtr).starts_with(">>>> "))
3217     return false;
3218 
3219   // If we have a situation where we don't care about conflict markers, ignore
3220   // it.
3221   if (CurrentConflictMarkerState || isLexingRawMode())
3222     return false;
3223 
3224   ConflictMarkerKind Kind = *CurPtr == '<' ? CMK_Normal : CMK_Perforce;
3225 
3226   // Check to see if there is an ending marker somewhere in the buffer at the
3227   // start of a line to terminate this conflict marker.
3228   if (FindConflictEnd(CurPtr, BufferEnd, Kind)) {
3229     // We found a match.  We are really in a conflict marker.
3230     // Diagnose this, and ignore to the end of line.
3231     Diag(CurPtr, diag::err_conflict_marker);
3232     CurrentConflictMarkerState = Kind;
3233 
3234     // Skip ahead to the end of line.  We know this exists because the
3235     // end-of-conflict marker starts with \r or \n.
3236     while (*CurPtr != '\r' && *CurPtr != '\n') {
3237       assert(CurPtr != BufferEnd && "Didn't find end of line");
3238       ++CurPtr;
3239     }
3240     BufferPtr = CurPtr;
3241     return true;
3242   }
3243 
3244   // No end of conflict marker found.
3245   return false;
3246 }
3247 
3248 /// HandleEndOfConflictMarker - If this is a '====' or '||||' or '>>>>', or if
3249 /// it is '<<<<' and the conflict marker started with a '>>>>' marker, then it
3250 /// is the end of a conflict marker.  Handle it by ignoring up until the end of
3251 /// the line.  This returns true if it is a conflict marker and false if not.
HandleEndOfConflictMarker(const char * CurPtr)3252 bool Lexer::HandleEndOfConflictMarker(const char *CurPtr) {
3253   // Only a conflict marker if it starts at the beginning of a line.
3254   if (CurPtr != BufferStart &&
3255       CurPtr[-1] != '\n' && CurPtr[-1] != '\r')
3256     return false;
3257 
3258   // If we have a situation where we don't care about conflict markers, ignore
3259   // it.
3260   if (!CurrentConflictMarkerState || isLexingRawMode())
3261     return false;
3262 
3263   // Check to see if we have the marker (4 characters in a row).
3264   for (unsigned i = 1; i != 4; ++i)
3265     if (CurPtr[i] != CurPtr[0])
3266       return false;
3267 
3268   // If we do have it, search for the end of the conflict marker.  This could
3269   // fail if it got skipped with a '#if 0' or something.  Note that CurPtr might
3270   // be the end of conflict marker.
3271   if (const char *End = FindConflictEnd(CurPtr, BufferEnd,
3272                                         CurrentConflictMarkerState)) {
3273     CurPtr = End;
3274 
3275     // Skip ahead to the end of line.
3276     while (CurPtr != BufferEnd && *CurPtr != '\r' && *CurPtr != '\n')
3277       ++CurPtr;
3278 
3279     BufferPtr = CurPtr;
3280 
3281     // No longer in the conflict marker.
3282     CurrentConflictMarkerState = CMK_None;
3283     return true;
3284   }
3285 
3286   return false;
3287 }
3288 
/// Locate the "#>" sequence that closes an editor placeholder.
///
/// Searches [CurPtr, BufferEnd) for the first '#' immediately followed by '>'
/// (both characters inside the range).  Returns a pointer just past the '>',
/// or nullptr if the range is empty or holds no such sequence.
static const char *findPlaceholderEnd(const char *CurPtr,
                                      const char *BufferEnd) {
  constexpr char Closer[2] = {'#', '>'};
  const char *Hit = std::search(CurPtr, BufferEnd, Closer, Closer + 2);
  // std::search yields the end of the range when there is no match.
  return Hit == BufferEnd ? nullptr : Hit + 2;
}
3300 
lexEditorPlaceholder(Token & Result,const char * CurPtr)3301 bool Lexer::lexEditorPlaceholder(Token &Result, const char *CurPtr) {
3302   assert(CurPtr[-1] == '<' && CurPtr[0] == '#' && "Not a placeholder!");
3303   if (!PP || !PP->getPreprocessorOpts().LexEditorPlaceholders || LexingRawMode)
3304     return false;
3305   const char *End = findPlaceholderEnd(CurPtr + 1, BufferEnd);
3306   if (!End)
3307     return false;
3308   const char *Start = CurPtr - 1;
3309   if (!LangOpts.AllowEditorPlaceholders)
3310     Diag(Start, diag::err_placeholder_in_source);
3311   Result.startToken();
3312   FormTokenWithChars(Result, End, tok::raw_identifier);
3313   Result.setRawIdentifierData(Start);
3314   PP->LookUpIdentifierInfo(Result);
3315   Result.setFlag(Token::IsEditorPlaceholder);
3316   BufferPtr = End;
3317   return true;
3318 }
3319 
isCodeCompletionPoint(const char * CurPtr) const3320 bool Lexer::isCodeCompletionPoint(const char *CurPtr) const {
3321   if (PP && PP->isCodeCompletionEnabled()) {
3322     SourceLocation Loc = FileLoc.getLocWithOffset(CurPtr-BufferStart);
3323     return Loc == PP->getCodeCompletionLoc();
3324   }
3325 
3326   return false;
3327 }
3328 
/// Try to read a numeric universal character name: \uXXXX, \UXXXXXXXX, or
/// the brace-delimited form \u{X...}.
///
/// \param StartPtr Points at the 'u' or 'U' kind character.  It is advanced
///        past the whole UCN only when a code point is returned; every
///        failure path leaves it untouched.
/// \param SlashLoc Location used for most diagnostics (presumably the
///        backslash that introduced the escape - confirm against callers).
/// \param Result If non-null, receives the Token::HasUCN flag on success.
///        Most diagnostics are emitted only when it is non-null and the lexer
///        is not in raw mode.
/// \returns The decoded code point, or std::nullopt if the text is not a
///          well-formed numeric UCN.
std::optional<uint32_t> Lexer::tryReadNumericUCN(const char *&StartPtr,
                                                 const char *SlashLoc,
                                                 Token *Result) {
  unsigned CharSize;
  char Kind = getCharAndSize(StartPtr, CharSize);
  assert((Kind == 'u' || Kind == 'U') && "expected a UCN");

  // \u takes four hex digits and \U takes eight.  Note that NumHexDigits is
  // only assigned for the two kinds the assertion above permits.
  unsigned NumHexDigits;
  if (Kind == 'u')
    NumHexDigits = 4;
  else if (Kind == 'U')
    NumHexDigits = 8;

  bool Delimited = false;
  bool FoundEndDelimiter = false;
  unsigned Count = 0;
  bool Diagnose = Result && !isLexingRawMode();

  // UCNs are rejected when neither C++ nor C99 is enabled.
  if (!LangOpts.CPlusPlus && !LangOpts.C99) {
    if (Diagnose)
      Diag(SlashLoc, diag::warn_ucn_not_valid_in_c89);
    return std::nullopt;
  }

  // CurPtr walks the digits; KindLoc is the last byte of the kind character
  // and is used when a diagnostic needs to name or point at it.
  const char *CurPtr = StartPtr + CharSize;
  const char *KindLoc = &CurPtr[-1];

  uint32_t CodePoint = 0;
  // Undelimited form: read at most NumHexDigits digits.  Delimited form: keep
  // reading until the closing '}' (or a failure) ends the loop.
  while (Count != NumHexDigits || Delimited) {
    char C = getCharAndSize(CurPtr, CharSize);
    // A '{' directly after the kind character starts the delimited form.
    if (!Delimited && Count == 0 && C == '{') {
      Delimited = true;
      CurPtr += CharSize;
      continue;
    }

    if (Delimited && C == '}') {
      CurPtr += CharSize;
      FoundEndDelimiter = true;
      break;
    }

    // -1U is treated as "not a hex digit".  In the undelimited form this just
    // ends the digit run (the count is checked after the loop); in the
    // delimited form it means the escape is incomplete.
    unsigned Value = llvm::hexDigitValue(C);
    if (Value == -1U) {
      if (!Delimited)
        break;
      if (Diagnose)
        Diag(SlashLoc, diag::warn_delimited_ucn_incomplete)
            << StringRef(KindLoc, 1);
      return std::nullopt;
    }

    // If the top four bits are already in use, shifting in another digit
    // would lose them, so the value does not fit in 32 bits.
    if (CodePoint & 0xF000'0000) {
      if (Diagnose)
        Diag(KindLoc, diag::err_escape_too_large) << 0;
      return std::nullopt;
    }

    CodePoint <<= 4;
    CodePoint |= Value;
    CurPtr += CharSize;
    Count++;
  }

  // No digits at all: either an empty "{}" or no digits after the kind
  // character.
  if (Count == 0) {
    if (Diagnose)
      Diag(SlashLoc, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
                                       : diag::warn_ucn_escape_no_digits)
          << StringRef(KindLoc, 1);
    return std::nullopt;
  }

  // The delimited form is only accepted with a lowercase 'u'.
  if (Delimited && Kind == 'U') {
    if (Diagnose)
      Diag(SlashLoc, diag::err_hex_escape_no_digits) << StringRef(KindLoc, 1);
    return std::nullopt;
  }

  // The undelimited form requires exactly NumHexDigits digits.
  if (!Delimited && Count != NumHexDigits) {
    if (Diagnose) {
      Diag(SlashLoc, diag::warn_ucn_escape_incomplete);
      // If the user wrote \U1234, suggest a fixit to \u.
      if (Count == 4 && NumHexDigits == 8) {
        CharSourceRange URange = makeCharRange(*this, KindLoc, KindLoc + 1);
        Diag(KindLoc, diag::note_ucn_four_not_eight)
            << FixItHint::CreateReplacement(URange, "u");
      }
    }
    return std::nullopt;
  }

  // Note that this diagnostic is gated on having a preprocessor, not on
  // Diagnose.
  if (Delimited && PP) {
    Diag(SlashLoc, PP->getLangOpts().CPlusPlus23
                       ? diag::warn_cxx23_delimited_escape_sequence
                       : diag::ext_delimited_escape_sequence)
        << /*delimited*/ 0 << (PP->getLangOpts().CPlusPlus ? 1 : 0);
  }

  if (Result) {
    Result->setFlag(Token::HasUCN);
    // If the UCN contains either a trigraph or a line splicing,
    // we need to call getAndAdvanceChar again to set the appropriate flags
    // on Result.
    // The fast path applies when the bytes consumed equal the kind character
    // plus the digits plus the two braces (if delimited), i.e. every
    // character occupied exactly one byte.
    if (CurPtr - StartPtr == (ptrdiff_t)(Count + 1 + (Delimited ? 2 : 0)))
      StartPtr = CurPtr;
    else
      while (StartPtr != CurPtr)
        (void)getAndAdvanceChar(StartPtr, *Result);
  } else {
    StartPtr = CurPtr;
  }
  return CodePoint;
}
3442 
/// Try to read a named universal character escape of the form \N{name}.
///
/// \param StartPtr Points at the 'N'.  It is left untouched when the escape
///        is malformed (no '{', no closing '}', or an empty name); otherwise
///        it is advanced past the closing '}', even when the name turns out
///        not to match any character.
/// \param SlashLoc Location used for several diagnostics (presumably the
///        backslash that introduced the escape - confirm against callers).
/// \param Result If non-null, receives the Token::HasUCN flag once a
///        well-formed \N{...} has been read.  Diagnostics are emitted only
///        when it is non-null and the lexer is not in raw mode.
/// \returns The code point for the name, or std::nullopt if the escape is
///          malformed or the name could not be resolved.
std::optional<uint32_t> Lexer::tryReadNamedUCN(const char *&StartPtr,
                                               const char *SlashLoc,
                                               Token *Result) {
  unsigned CharSize;
  bool Diagnose = Result && !isLexingRawMode();

  char C = getCharAndSize(StartPtr, CharSize);
  assert(C == 'N' && "expected \\N{...}");

  // KindLoc is the last byte of the 'N' and is used when a diagnostic needs
  // to name it.
  const char *CurPtr = StartPtr + CharSize;
  const char *KindLoc = &CurPtr[-1];

  // The name must be introduced by '{'.
  C = getCharAndSize(CurPtr, CharSize);
  if (C != '{') {
    if (Diagnose)
      Diag(SlashLoc, diag::warn_ucn_escape_incomplete);
    return std::nullopt;
  }
  CurPtr += CharSize;
  const char *StartName = CurPtr;
  bool FoundEndDelimiter = false;
  llvm::SmallVector<char, 30> Buffer;
  // Collect the name into Buffer until the closing '}'.  A vertical
  // whitespace character or a NUL ends the scan without a closing delimiter.
  while (C) {
    C = getCharAndSize(CurPtr, CharSize);
    CurPtr += CharSize;
    if (C == '}') {
      FoundEndDelimiter = true;
      break;
    }

    if (isVerticalWhitespace(C))
      break;
    Buffer.push_back(C);
  }

  // Either the '}' is missing (incomplete) or the braces were empty.
  if (!FoundEndDelimiter || Buffer.empty()) {
    if (Diagnose)
      Diag(SlashLoc, FoundEndDelimiter ? diag::warn_delimited_ucn_empty
                                       : diag::warn_delimited_ucn_incomplete)
          << StringRef(KindLoc, 1);
    return std::nullopt;
  }

  // Try an exact name match first; fall back to loose matching only so that
  // the error can carry a fix-it suggestion.
  StringRef Name(Buffer.data(), Buffer.size());
  std::optional<char32_t> Match =
      llvm::sys::unicode::nameToCodepointStrict(Name);
  std::optional<llvm::sys::unicode::LooseMatchingResult> LooseMatch;
  if (!Match) {
    LooseMatch = llvm::sys::unicode::nameToCodepointLooseMatching(Name);
    if (Diagnose) {
      // CurPtr is already past the '}', so CurPtr - CharSize marks the end of
      // the name in the highlighted range.
      Diag(StartName, diag::err_invalid_ucn_name)
          << StringRef(Buffer.data(), Buffer.size())
          << makeCharRange(*this, StartName, CurPtr - CharSize);
      if (LooseMatch) {
        Diag(StartName, diag::note_invalid_ucn_name_loose_matching)
            << FixItHint::CreateReplacement(
                   makeCharRange(*this, StartName, CurPtr - CharSize),
                   LooseMatch->Name);
      }
    }
    // We do not offer misspelled character names suggestions here
    // as the set of what would be a valid suggestion depends on context,
    // and we should not make invalid suggestions.
  }

  // The language-mode diagnostic is only issued for an exact match.
  if (Diagnose && Match)
    Diag(SlashLoc, PP->getLangOpts().CPlusPlus23
                       ? diag::warn_cxx23_delimited_escape_sequence
                       : diag::ext_delimited_escape_sequence)
        << /*named*/ 1 << (PP->getLangOpts().CPlusPlus ? 1 : 0);

  // If no diagnostic has been emitted yet, likely because we are doing a
  // tentative lexing, we do not want to recover here to make sure the token
  // will not be incorrectly considered valid. This function will be called
  // again and a diagnostic emitted then.
  if (LooseMatch && Diagnose)
    Match = LooseMatch->CodePoint;

  if (Result) {
    Result->setFlag(Token::HasUCN);
    // If the UCN contains either a trigraph or a line splicing,
    // we need to call getAndAdvanceChar again to set the appropriate flags
    // on Result.
    // The fast path applies when the bytes consumed equal the name length
    // plus three ('N', '{' and '}'), i.e. every character occupied exactly
    // one byte.
    if (CurPtr - StartPtr == (ptrdiff_t)(Buffer.size() + 3))
      StartPtr = CurPtr;
    else
      while (StartPtr != CurPtr)
        (void)getAndAdvanceChar(StartPtr, *Result);
  } else {
    StartPtr = CurPtr;
  }
  return Match ? std::optional<uint32_t>(*Match) : std::nullopt;
}
3536 
tryReadUCN(const char * & StartPtr,const char * SlashLoc,Token * Result)3537 uint32_t Lexer::tryReadUCN(const char *&StartPtr, const char *SlashLoc,
3538                            Token *Result) {
3539 
3540   unsigned CharSize;
3541   std::optional<uint32_t> CodePointOpt;
3542   char Kind = getCharAndSize(StartPtr, CharSize);
3543   if (Kind == 'u' || Kind == 'U')
3544     CodePointOpt = tryReadNumericUCN(StartPtr, SlashLoc, Result);
3545   else if (Kind == 'N')
3546     CodePointOpt = tryReadNamedUCN(StartPtr, SlashLoc, Result);
3547 
3548   if (!CodePointOpt)
3549     return 0;
3550 
3551   uint32_t CodePoint = *CodePointOpt;
3552 
3553   // Don't apply C family restrictions to UCNs in assembly mode
3554   if (LangOpts.AsmPreprocessor)
3555     return CodePoint;
3556 
3557   // C23 6.4.3p2: A universal character name shall not designate a code point
3558   // where the hexadecimal value is:
3559   // - in the range D800 through DFFF inclusive; or
3560   // - greater than 10FFFF.
3561   // A universal-character-name outside the c-char-sequence of a character
3562   // constant, or the s-char-sequence of a string-literal shall not designate
3563   // a control character or a character in the basic character set.
3564 
3565   // C++11 [lex.charset]p2: If the hexadecimal value for a
3566   //   universal-character-name corresponds to a surrogate code point (in the
3567   //   range 0xD800-0xDFFF, inclusive), the program is ill-formed. Additionally,
3568   //   if the hexadecimal value for a universal-character-name outside the
3569   //   c-char-sequence, s-char-sequence, or r-char-sequence of a character or
3570   //   string literal corresponds to a control character (in either of the
3571   //   ranges 0x00-0x1F or 0x7F-0x9F, both inclusive) or to a character in the
3572   //   basic source character set, the program is ill-formed.
3573   if (CodePoint < 0xA0) {
3574     // We don't use isLexingRawMode() here because we need to warn about bad
3575     // UCNs even when skipping preprocessing tokens in a #if block.
3576     if (Result && PP) {
3577       if (CodePoint < 0x20 || CodePoint >= 0x7F)
3578         Diag(BufferPtr, diag::err_ucn_control_character);
3579       else {
3580         char C = static_cast<char>(CodePoint);
3581         Diag(BufferPtr, diag::err_ucn_escape_basic_scs) << StringRef(&C, 1);
3582       }
3583     }
3584 
3585     return 0;
3586   } else if (CodePoint >= 0xD800 && CodePoint <= 0xDFFF) {
3587     // C++03 allows UCNs representing surrogate characters. C99 and C++11 don't.
3588     // We don't use isLexingRawMode() here because we need to diagnose bad
3589     // UCNs even when skipping preprocessing tokens in a #if block.
3590     if (Result && PP) {
3591       if (LangOpts.CPlusPlus && !LangOpts.CPlusPlus11)
3592         Diag(BufferPtr, diag::warn_ucn_escape_surrogate);
3593       else
3594         Diag(BufferPtr, diag::err_ucn_escape_invalid);
3595     }
3596     return 0;
3597   }
3598 
3599   return CodePoint;
3600 }
3601 
CheckUnicodeWhitespace(Token & Result,uint32_t C,const char * CurPtr)3602 bool Lexer::CheckUnicodeWhitespace(Token &Result, uint32_t C,
3603                                    const char *CurPtr) {
3604   if (!isLexingRawMode() && !PP->isPreprocessedOutput() &&
3605       isUnicodeWhitespace(C)) {
3606     Diag(BufferPtr, diag::ext_unicode_whitespace)
3607       << makeCharRange(*this, BufferPtr, CurPtr);
3608 
3609     Result.setFlag(Token::LeadingSpace);
3610     return true;
3611   }
3612   return false;
3613 }
3614 
PropagateLineStartLeadingSpaceInfo(Token & Result)3615 void Lexer::PropagateLineStartLeadingSpaceInfo(Token &Result) {
3616   IsAtStartOfLine = Result.isAtStartOfLine();
3617   HasLeadingSpace = Result.hasLeadingSpace();
3618   HasLeadingEmptyMacro = Result.hasLeadingEmptyMacro();
3619   // Note that this doesn't affect IsAtPhysicalStartOfLine.
3620 }
3621 
Lex(Token & Result)3622 bool Lexer::Lex(Token &Result) {
3623   assert(!isDependencyDirectivesLexer());
3624 
3625   // Start a new token.
3626   Result.startToken();
3627 
3628   // Set up misc whitespace flags for LexTokenInternal.
3629   if (IsAtStartOfLine) {
3630     Result.setFlag(Token::StartOfLine);
3631     IsAtStartOfLine = false;
3632   }
3633 
3634   if (HasLeadingSpace) {
3635     Result.setFlag(Token::LeadingSpace);
3636     HasLeadingSpace = false;
3637   }
3638 
3639   if (HasLeadingEmptyMacro) {
3640     Result.setFlag(Token::LeadingEmptyMacro);
3641     HasLeadingEmptyMacro = false;
3642   }
3643 
3644   bool atPhysicalStartOfLine = IsAtPhysicalStartOfLine;
3645   IsAtPhysicalStartOfLine = false;
3646   bool isRawLex = isLexingRawMode();
3647   (void) isRawLex;
3648   bool returnedToken = LexTokenInternal(Result, atPhysicalStartOfLine);
3649   // (After the LexTokenInternal call, the lexer might be destroyed.)
3650   assert((returnedToken || !isRawLex) && "Raw lex must succeed");
3651   return returnedToken;
3652 }
3653 
3654 /// LexTokenInternal - This implements a simple C family lexer.  It is an
3655 /// extremely performance critical piece of code.  This assumes that the buffer
3656 /// has a null character at the end of the file.  This returns a preprocessing
3657 /// token, not a normal token, as such, it is an internal interface.  It assumes
3658 /// that the Flags of result have been cleared before calling this.
LexTokenInternal(Token & Result,bool TokAtPhysicalStartOfLine)3659 bool Lexer::LexTokenInternal(Token &Result, bool TokAtPhysicalStartOfLine) {
3660 LexStart:
3661   assert(!Result.needsCleaning() && "Result needs cleaning");
3662   assert(!Result.hasPtrData() && "Result has not been reset");
3663 
3664   // CurPtr - Cache BufferPtr in an automatic variable.
3665   const char *CurPtr = BufferPtr;
3666 
3667   // Small amounts of horizontal whitespace is very common between tokens.
3668   if (isHorizontalWhitespace(*CurPtr)) {
3669     do {
3670       ++CurPtr;
3671     } while (isHorizontalWhitespace(*CurPtr));
3672 
3673     // If we are keeping whitespace and other tokens, just return what we just
3674     // skipped.  The next lexer invocation will return the token after the
3675     // whitespace.
3676     if (isKeepWhitespaceMode()) {
3677       FormTokenWithChars(Result, CurPtr, tok::unknown);
3678       // FIXME: The next token will not have LeadingSpace set.
3679       return true;
3680     }
3681 
3682     BufferPtr = CurPtr;
3683     Result.setFlag(Token::LeadingSpace);
3684   }
3685 
3686   unsigned SizeTmp, SizeTmp2;   // Temporaries for use in cases below.
3687 
3688   // Read a character, advancing over it.
3689   char Char = getAndAdvanceChar(CurPtr, Result);
3690   tok::TokenKind Kind;
3691 
3692   if (!isVerticalWhitespace(Char))
3693     NewLinePtr = nullptr;
3694 
3695   switch (Char) {
3696   case 0:  // Null.
3697     // Found end of file?
3698     if (CurPtr-1 == BufferEnd)
3699       return LexEndOfFile(Result, CurPtr-1);
3700 
3701     // Check if we are performing code completion.
3702     if (isCodeCompletionPoint(CurPtr-1)) {
3703       // Return the code-completion token.
3704       Result.startToken();
3705       FormTokenWithChars(Result, CurPtr, tok::code_completion);
3706       return true;
3707     }
3708 
3709     if (!isLexingRawMode())
3710       Diag(CurPtr-1, diag::null_in_file);
3711     Result.setFlag(Token::LeadingSpace);
3712     if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3713       return true; // KeepWhitespaceMode
3714 
3715     // We know the lexer hasn't changed, so just try again with this lexer.
3716     // (We manually eliminate the tail call to avoid recursion.)
3717     goto LexNextToken;
3718 
3719   case 26:  // DOS & CP/M EOF: "^Z".
3720     // If we're in Microsoft extensions mode, treat this as end of file.
3721     if (LangOpts.MicrosoftExt) {
3722       if (!isLexingRawMode())
3723         Diag(CurPtr-1, diag::ext_ctrl_z_eof_microsoft);
3724       return LexEndOfFile(Result, CurPtr-1);
3725     }
3726 
3727     // If Microsoft extensions are disabled, this is just random garbage.
3728     Kind = tok::unknown;
3729     break;
3730 
3731   case '\r':
3732     if (CurPtr[0] == '\n')
3733       (void)getAndAdvanceChar(CurPtr, Result);
3734     [[fallthrough]];
3735   case '\n':
3736     // If we are inside a preprocessor directive and we see the end of line,
3737     // we know we are done with the directive, so return an EOD token.
3738     if (ParsingPreprocessorDirective) {
3739       // Done parsing the "line".
3740       ParsingPreprocessorDirective = false;
3741 
3742       // Restore comment saving mode, in case it was disabled for directive.
3743       if (PP)
3744         resetExtendedTokenMode();
3745 
3746       // Since we consumed a newline, we are back at the start of a line.
3747       IsAtStartOfLine = true;
3748       IsAtPhysicalStartOfLine = true;
3749       NewLinePtr = CurPtr - 1;
3750 
3751       Kind = tok::eod;
3752       break;
3753     }
3754 
3755     // No leading whitespace seen so far.
3756     Result.clearFlag(Token::LeadingSpace);
3757 
3758     if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3759       return true; // KeepWhitespaceMode
3760 
3761     // We only saw whitespace, so just try again with this lexer.
3762     // (We manually eliminate the tail call to avoid recursion.)
3763     goto LexNextToken;
3764   case ' ':
3765   case '\t':
3766   case '\f':
3767   case '\v':
3768   SkipHorizontalWhitespace:
3769     Result.setFlag(Token::LeadingSpace);
3770     if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
3771       return true; // KeepWhitespaceMode
3772 
3773   SkipIgnoredUnits:
3774     CurPtr = BufferPtr;
3775 
3776     // If the next token is obviously a // or /* */ comment, skip it efficiently
3777     // too (without going through the big switch stmt).
3778     if (CurPtr[0] == '/' && CurPtr[1] == '/' && !inKeepCommentMode() &&
3779         LineComment && (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP)) {
3780       if (SkipLineComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
3781         return true; // There is a token to return.
3782       goto SkipIgnoredUnits;
3783     } else if (CurPtr[0] == '/' && CurPtr[1] == '*' && !inKeepCommentMode()) {
3784       if (SkipBlockComment(Result, CurPtr+2, TokAtPhysicalStartOfLine))
3785         return true; // There is a token to return.
3786       goto SkipIgnoredUnits;
3787     } else if (isHorizontalWhitespace(*CurPtr)) {
3788       goto SkipHorizontalWhitespace;
3789     }
3790     // We only saw whitespace, so just try again with this lexer.
3791     // (We manually eliminate the tail call to avoid recursion.)
3792     goto LexNextToken;
3793 
3794   // C99 6.4.4.1: Integer Constants.
3795   // C99 6.4.4.2: Floating Constants.
3796   case '0': case '1': case '2': case '3': case '4':
3797   case '5': case '6': case '7': case '8': case '9':
3798     // Notify MIOpt that we read a non-whitespace/non-comment token.
3799     MIOpt.ReadToken();
3800     return LexNumericConstant(Result, CurPtr);
3801 
3802   // Identifier (e.g., uber), or
3803   // UTF-8 (C23/C++17) or UTF-16 (C11/C++11) character literal, or
3804   // UTF-8 or UTF-16 string literal (C11/C++11).
3805   case 'u':
3806     // Notify MIOpt that we read a non-whitespace/non-comment token.
3807     MIOpt.ReadToken();
3808 
3809     if (LangOpts.CPlusPlus11 || LangOpts.C11) {
3810       Char = getCharAndSize(CurPtr, SizeTmp);
3811 
3812       // UTF-16 string literal
3813       if (Char == '"')
3814         return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3815                                 tok::utf16_string_literal);
3816 
3817       // UTF-16 character constant
3818       if (Char == '\'')
3819         return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3820                                tok::utf16_char_constant);
3821 
3822       // UTF-16 raw string literal
3823       if (Char == 'R' && LangOpts.CPlusPlus11 &&
3824           getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
3825         return LexRawStringLiteral(Result,
3826                                ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3827                                            SizeTmp2, Result),
3828                                tok::utf16_string_literal);
3829 
3830       if (Char == '8') {
3831         char Char2 = getCharAndSize(CurPtr + SizeTmp, SizeTmp2);
3832 
3833         // UTF-8 string literal
3834         if (Char2 == '"')
3835           return LexStringLiteral(Result,
3836                                ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3837                                            SizeTmp2, Result),
3838                                tok::utf8_string_literal);
3839         if (Char2 == '\'' && (LangOpts.CPlusPlus17 || LangOpts.C23))
3840           return LexCharConstant(
3841               Result, ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3842                                   SizeTmp2, Result),
3843               tok::utf8_char_constant);
3844 
3845         if (Char2 == 'R' && LangOpts.CPlusPlus11) {
3846           unsigned SizeTmp3;
3847           char Char3 = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
3848           // UTF-8 raw string literal
3849           if (Char3 == '"') {
3850             return LexRawStringLiteral(Result,
3851                    ConsumeChar(ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3852                                            SizeTmp2, Result),
3853                                SizeTmp3, Result),
3854                    tok::utf8_string_literal);
3855           }
3856         }
3857       }
3858     }
3859 
3860     // treat u like the start of an identifier.
3861     return LexIdentifierContinue(Result, CurPtr);
3862 
3863   case 'U': // Identifier (e.g. Uber) or C11/C++11 UTF-32 string literal
3864     // Notify MIOpt that we read a non-whitespace/non-comment token.
3865     MIOpt.ReadToken();
3866 
3867     if (LangOpts.CPlusPlus11 || LangOpts.C11) {
3868       Char = getCharAndSize(CurPtr, SizeTmp);
3869 
3870       // UTF-32 string literal
3871       if (Char == '"')
3872         return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3873                                 tok::utf32_string_literal);
3874 
3875       // UTF-32 character constant
3876       if (Char == '\'')
3877         return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3878                                tok::utf32_char_constant);
3879 
3880       // UTF-32 raw string literal
3881       if (Char == 'R' && LangOpts.CPlusPlus11 &&
3882           getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
3883         return LexRawStringLiteral(Result,
3884                                ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3885                                            SizeTmp2, Result),
3886                                tok::utf32_string_literal);
3887     }
3888 
3889     // treat U like the start of an identifier.
3890     return LexIdentifierContinue(Result, CurPtr);
3891 
3892   case 'R': // Identifier or C++0x raw string literal
3893     // Notify MIOpt that we read a non-whitespace/non-comment token.
3894     MIOpt.ReadToken();
3895 
3896     if (LangOpts.CPlusPlus11) {
3897       Char = getCharAndSize(CurPtr, SizeTmp);
3898 
3899       if (Char == '"')
3900         return LexRawStringLiteral(Result,
3901                                    ConsumeChar(CurPtr, SizeTmp, Result),
3902                                    tok::string_literal);
3903     }
3904 
3905     // treat R like the start of an identifier.
3906     return LexIdentifierContinue(Result, CurPtr);
3907 
3908   case 'L':   // Identifier (Loony) or wide literal (L'x' or L"xyz").
3909     // Notify MIOpt that we read a non-whitespace/non-comment token.
3910     MIOpt.ReadToken();
3911     Char = getCharAndSize(CurPtr, SizeTmp);
3912 
3913     // Wide string literal.
3914     if (Char == '"')
3915       return LexStringLiteral(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3916                               tok::wide_string_literal);
3917 
3918     // Wide raw string literal.
3919     if (LangOpts.CPlusPlus11 && Char == 'R' &&
3920         getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == '"')
3921       return LexRawStringLiteral(Result,
3922                                ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
3923                                            SizeTmp2, Result),
3924                                tok::wide_string_literal);
3925 
3926     // Wide character constant.
3927     if (Char == '\'')
3928       return LexCharConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result),
3929                              tok::wide_char_constant);
3930     // FALL THROUGH, treating L like the start of an identifier.
3931     [[fallthrough]];
3932 
3933   // C99 6.4.2: Identifiers.
3934   case 'A': case 'B': case 'C': case 'D': case 'E': case 'F': case 'G':
3935   case 'H': case 'I': case 'J': case 'K':    /*'L'*/case 'M': case 'N':
3936   case 'O': case 'P': case 'Q':    /*'R'*/case 'S': case 'T':    /*'U'*/
3937   case 'V': case 'W': case 'X': case 'Y': case 'Z':
3938   case 'a': case 'b': case 'c': case 'd': case 'e': case 'f': case 'g':
3939   case 'h': case 'i': case 'j': case 'k': case 'l': case 'm': case 'n':
3940   case 'o': case 'p': case 'q': case 'r': case 's': case 't':    /*'u'*/
3941   case 'v': case 'w': case 'x': case 'y': case 'z':
3942   case '_':
3943     // Notify MIOpt that we read a non-whitespace/non-comment token.
3944     MIOpt.ReadToken();
3945     return LexIdentifierContinue(Result, CurPtr);
3946 
3947   case '$':   // $ in identifiers.
3948     if (LangOpts.DollarIdents) {
3949       if (!isLexingRawMode())
3950         Diag(CurPtr-1, diag::ext_dollar_in_identifier);
3951       // Notify MIOpt that we read a non-whitespace/non-comment token.
3952       MIOpt.ReadToken();
3953       return LexIdentifierContinue(Result, CurPtr);
3954     }
3955 
3956     Kind = tok::unknown;
3957     break;
3958 
3959   // C99 6.4.4: Character Constants.
3960   case '\'':
3961     // Notify MIOpt that we read a non-whitespace/non-comment token.
3962     MIOpt.ReadToken();
3963     return LexCharConstant(Result, CurPtr, tok::char_constant);
3964 
3965   // C99 6.4.5: String Literals.
3966   case '"':
3967     // Notify MIOpt that we read a non-whitespace/non-comment token.
3968     MIOpt.ReadToken();
3969     return LexStringLiteral(Result, CurPtr,
3970                             ParsingFilename ? tok::header_name
3971                                             : tok::string_literal);
3972 
3973   // C99 6.4.6: Punctuators.
3974   case '?':
3975     Kind = tok::question;
3976     break;
3977   case '[':
3978     Kind = tok::l_square;
3979     break;
3980   case ']':
3981     Kind = tok::r_square;
3982     break;
3983   case '(':
3984     Kind = tok::l_paren;
3985     break;
3986   case ')':
3987     Kind = tok::r_paren;
3988     break;
3989   case '{':
3990     Kind = tok::l_brace;
3991     break;
3992   case '}':
3993     Kind = tok::r_brace;
3994     break;
3995   case '.':
3996     Char = getCharAndSize(CurPtr, SizeTmp);
3997     if (Char >= '0' && Char <= '9') {
3998       // Notify MIOpt that we read a non-whitespace/non-comment token.
3999       MIOpt.ReadToken();
4000 
4001       return LexNumericConstant(Result, ConsumeChar(CurPtr, SizeTmp, Result));
4002     } else if (LangOpts.CPlusPlus && Char == '*') {
4003       Kind = tok::periodstar;
4004       CurPtr += SizeTmp;
4005     } else if (Char == '.' &&
4006                getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '.') {
4007       Kind = tok::ellipsis;
4008       CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4009                            SizeTmp2, Result);
4010     } else {
4011       Kind = tok::period;
4012     }
4013     break;
4014   case '&':
4015     Char = getCharAndSize(CurPtr, SizeTmp);
4016     if (Char == '&') {
4017       Kind = tok::ampamp;
4018       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4019     } else if (Char == '=') {
4020       Kind = tok::ampequal;
4021       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4022     } else {
4023       Kind = tok::amp;
4024     }
4025     break;
4026   case '*':
4027     if (getCharAndSize(CurPtr, SizeTmp) == '=') {
4028       Kind = tok::starequal;
4029       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4030     } else {
4031       Kind = tok::star;
4032     }
4033     break;
4034   case '+':
4035     Char = getCharAndSize(CurPtr, SizeTmp);
4036     if (Char == '+') {
4037       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4038       Kind = tok::plusplus;
4039     } else if (Char == '=') {
4040       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4041       Kind = tok::plusequal;
4042     } else {
4043       Kind = tok::plus;
4044     }
4045     break;
4046   case '-':
4047     Char = getCharAndSize(CurPtr, SizeTmp);
4048     if (Char == '-') {      // --
4049       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4050       Kind = tok::minusminus;
4051     } else if (Char == '>' && LangOpts.CPlusPlus &&
4052                getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == '*') {  // C++ ->*
4053       CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4054                            SizeTmp2, Result);
4055       Kind = tok::arrowstar;
4056     } else if (Char == '>') {   // ->
4057       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4058       Kind = tok::arrow;
4059     } else if (Char == '=') {   // -=
4060       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4061       Kind = tok::minusequal;
4062     } else {
4063       Kind = tok::minus;
4064     }
4065     break;
4066   case '~':
4067     Kind = tok::tilde;
4068     break;
4069   case '!':
4070     if (getCharAndSize(CurPtr, SizeTmp) == '=') {
4071       Kind = tok::exclaimequal;
4072       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4073     } else {
4074       Kind = tok::exclaim;
4075     }
4076     break;
4077   case '/':
4078     // 6.4.9: Comments
4079     Char = getCharAndSize(CurPtr, SizeTmp);
4080     if (Char == '/') {         // Line comment.
4081       // Even if Line comments are disabled (e.g. in C89 mode), we generally
4082       // want to lex this as a comment.  There is one problem with this though,
4083       // that in one particular corner case, this can change the behavior of the
4084       // resultant program.  For example, In  "foo //**/ bar", C89 would lex
4085       // this as "foo / bar" and languages with Line comments would lex it as
4086       // "foo".  Check to see if the character after the second slash is a '*'.
4087       // If so, we will lex that as a "/" instead of the start of a comment.
4088       // However, we never do this if we are just preprocessing.
4089       bool TreatAsComment =
4090           LineComment && (LangOpts.CPlusPlus || !LangOpts.TraditionalCPP);
4091       if (!TreatAsComment)
4092         if (!(PP && PP->isPreprocessedOutput()))
4093           TreatAsComment = getCharAndSize(CurPtr+SizeTmp, SizeTmp2) != '*';
4094 
4095       if (TreatAsComment) {
4096         if (SkipLineComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
4097                             TokAtPhysicalStartOfLine))
4098           return true; // There is a token to return.
4099 
4100         // It is common for the tokens immediately after a // comment to be
4101         // whitespace (indentation for the next line).  Instead of going through
4102         // the big switch, handle it efficiently now.
4103         goto SkipIgnoredUnits;
4104       }
4105     }
4106 
4107     if (Char == '*') {  // /**/ comment.
4108       if (SkipBlockComment(Result, ConsumeChar(CurPtr, SizeTmp, Result),
4109                            TokAtPhysicalStartOfLine))
4110         return true; // There is a token to return.
4111 
4112       // We only saw whitespace, so just try again with this lexer.
4113       // (We manually eliminate the tail call to avoid recursion.)
4114       goto LexNextToken;
4115     }
4116 
4117     if (Char == '=') {
4118       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4119       Kind = tok::slashequal;
4120     } else {
4121       Kind = tok::slash;
4122     }
4123     break;
4124   case '%':
4125     Char = getCharAndSize(CurPtr, SizeTmp);
4126     if (Char == '=') {
4127       Kind = tok::percentequal;
4128       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4129     } else if (LangOpts.Digraphs && Char == '>') {
4130       Kind = tok::r_brace;                             // '%>' -> '}'
4131       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4132     } else if (LangOpts.Digraphs && Char == ':') {
4133       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4134       Char = getCharAndSize(CurPtr, SizeTmp);
4135       if (Char == '%' && getCharAndSize(CurPtr+SizeTmp, SizeTmp2) == ':') {
4136         Kind = tok::hashhash;                          // '%:%:' -> '##'
4137         CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4138                              SizeTmp2, Result);
4139       } else if (Char == '@' && LangOpts.MicrosoftExt) {// %:@ -> #@ -> Charize
4140         CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4141         if (!isLexingRawMode())
4142           Diag(BufferPtr, diag::ext_charize_microsoft);
4143         Kind = tok::hashat;
4144       } else {                                         // '%:' -> '#'
4145         // We parsed a # character.  If this occurs at the start of the line,
4146         // it's actually the start of a preprocessing directive.  Callback to
4147         // the preprocessor to handle it.
4148         // TODO: -fpreprocessed mode??
4149         if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
4150           goto HandleDirective;
4151 
4152         Kind = tok::hash;
4153       }
4154     } else {
4155       Kind = tok::percent;
4156     }
4157     break;
4158   case '<':
4159     Char = getCharAndSize(CurPtr, SizeTmp);
4160     if (ParsingFilename) {
4161       return LexAngledStringLiteral(Result, CurPtr);
4162     } else if (Char == '<') {
4163       char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
4164       if (After == '=') {
4165         Kind = tok::lesslessequal;
4166         CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4167                              SizeTmp2, Result);
4168       } else if (After == '<' && IsStartOfConflictMarker(CurPtr-1)) {
4169         // If this is actually a '<<<<<<<' version control conflict marker,
4170         // recognize it as such and recover nicely.
4171         goto LexNextToken;
4172       } else if (After == '<' && HandleEndOfConflictMarker(CurPtr-1)) {
4173         // If this is '<<<<' and we're in a Perforce-style conflict marker,
4174         // ignore it.
4175         goto LexNextToken;
4176       } else if (LangOpts.CUDA && After == '<') {
4177         Kind = tok::lesslessless;
4178         CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4179                              SizeTmp2, Result);
4180       } else {
4181         CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4182         Kind = tok::lessless;
4183       }
4184     } else if (Char == '=') {
4185       char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
4186       if (After == '>') {
4187         if (LangOpts.CPlusPlus20) {
4188           if (!isLexingRawMode())
4189             Diag(BufferPtr, diag::warn_cxx17_compat_spaceship);
4190           CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4191                                SizeTmp2, Result);
4192           Kind = tok::spaceship;
4193           break;
4194         }
4195         // Suggest adding a space between the '<=' and the '>' to avoid a
4196         // change in semantics if this turns up in C++ <=17 mode.
4197         if (LangOpts.CPlusPlus && !isLexingRawMode()) {
4198           Diag(BufferPtr, diag::warn_cxx20_compat_spaceship)
4199             << FixItHint::CreateInsertion(
4200                    getSourceLocation(CurPtr + SizeTmp, SizeTmp2), " ");
4201         }
4202       }
4203       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4204       Kind = tok::lessequal;
4205     } else if (LangOpts.Digraphs && Char == ':') {     // '<:' -> '['
4206       if (LangOpts.CPlusPlus11 &&
4207           getCharAndSize(CurPtr + SizeTmp, SizeTmp2) == ':') {
4208         // C++0x [lex.pptoken]p3:
4209         //  Otherwise, if the next three characters are <:: and the subsequent
4210         //  character is neither : nor >, the < is treated as a preprocessor
4211         //  token by itself and not as the first character of the alternative
4212         //  token <:.
4213         unsigned SizeTmp3;
4214         char After = getCharAndSize(CurPtr + SizeTmp + SizeTmp2, SizeTmp3);
4215         if (After != ':' && After != '>') {
4216           Kind = tok::less;
4217           if (!isLexingRawMode())
4218             Diag(BufferPtr, diag::warn_cxx98_compat_less_colon_colon);
4219           break;
4220         }
4221       }
4222 
4223       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4224       Kind = tok::l_square;
4225     } else if (LangOpts.Digraphs && Char == '%') {     // '<%' -> '{'
4226       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4227       Kind = tok::l_brace;
4228     } else if (Char == '#' && /*Not a trigraph*/ SizeTmp == 1 &&
4229                lexEditorPlaceholder(Result, CurPtr)) {
4230       return true;
4231     } else {
4232       Kind = tok::less;
4233     }
4234     break;
4235   case '>':
4236     Char = getCharAndSize(CurPtr, SizeTmp);
4237     if (Char == '=') {
4238       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4239       Kind = tok::greaterequal;
4240     } else if (Char == '>') {
4241       char After = getCharAndSize(CurPtr+SizeTmp, SizeTmp2);
4242       if (After == '=') {
4243         CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4244                              SizeTmp2, Result);
4245         Kind = tok::greatergreaterequal;
4246       } else if (After == '>' && IsStartOfConflictMarker(CurPtr-1)) {
4247         // If this is actually a '>>>>' conflict marker, recognize it as such
4248         // and recover nicely.
4249         goto LexNextToken;
4250       } else if (After == '>' && HandleEndOfConflictMarker(CurPtr-1)) {
4251         // If this is '>>>>>>>' and we're in a conflict marker, ignore it.
4252         goto LexNextToken;
4253       } else if (LangOpts.CUDA && After == '>') {
4254         Kind = tok::greatergreatergreater;
4255         CurPtr = ConsumeChar(ConsumeChar(CurPtr, SizeTmp, Result),
4256                              SizeTmp2, Result);
4257       } else {
4258         CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4259         Kind = tok::greatergreater;
4260       }
4261     } else {
4262       Kind = tok::greater;
4263     }
4264     break;
4265   case '^':
4266     Char = getCharAndSize(CurPtr, SizeTmp);
4267     if (Char == '=') {
4268       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4269       Kind = tok::caretequal;
4270     } else if (LangOpts.OpenCL && Char == '^') {
4271       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4272       Kind = tok::caretcaret;
4273     } else {
4274       Kind = tok::caret;
4275     }
4276     break;
4277   case '|':
4278     Char = getCharAndSize(CurPtr, SizeTmp);
4279     if (Char == '=') {
4280       Kind = tok::pipeequal;
4281       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4282     } else if (Char == '|') {
4283       // If this is '|||||||' and we're in a conflict marker, ignore it.
4284       if (CurPtr[1] == '|' && HandleEndOfConflictMarker(CurPtr-1))
4285         goto LexNextToken;
4286       Kind = tok::pipepipe;
4287       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4288     } else {
4289       Kind = tok::pipe;
4290     }
4291     break;
4292   case ':':
4293     Char = getCharAndSize(CurPtr, SizeTmp);
4294     if (LangOpts.Digraphs && Char == '>') {
4295       Kind = tok::r_square; // ':>' -> ']'
4296       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4297     } else if (Char == ':') {
4298       Kind = tok::coloncolon;
4299       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4300     } else {
4301       Kind = tok::colon;
4302     }
4303     break;
4304   case ';':
4305     Kind = tok::semi;
4306     break;
4307   case '=':
4308     Char = getCharAndSize(CurPtr, SizeTmp);
4309     if (Char == '=') {
4310       // If this is '====' and we're in a conflict marker, ignore it.
4311       if (CurPtr[1] == '=' && HandleEndOfConflictMarker(CurPtr-1))
4312         goto LexNextToken;
4313 
4314       Kind = tok::equalequal;
4315       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4316     } else {
4317       Kind = tok::equal;
4318     }
4319     break;
4320   case ',':
4321     Kind = tok::comma;
4322     break;
4323   case '#':
4324     Char = getCharAndSize(CurPtr, SizeTmp);
4325     if (Char == '#') {
4326       Kind = tok::hashhash;
4327       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4328     } else if (Char == '@' && LangOpts.MicrosoftExt) {  // #@ -> Charize
4329       Kind = tok::hashat;
4330       if (!isLexingRawMode())
4331         Diag(BufferPtr, diag::ext_charize_microsoft);
4332       CurPtr = ConsumeChar(CurPtr, SizeTmp, Result);
4333     } else {
4334       // We parsed a # character.  If this occurs at the start of the line,
4335       // it's actually the start of a preprocessing directive.  Callback to
4336       // the preprocessor to handle it.
4337       // TODO: -fpreprocessed mode??
4338       if (TokAtPhysicalStartOfLine && !LexingRawMode && !Is_PragmaLexer)
4339         goto HandleDirective;
4340 
4341       Kind = tok::hash;
4342     }
4343     break;
4344 
4345   case '@':
4346     // Objective C support.
4347     if (CurPtr[-1] == '@' && LangOpts.ObjC)
4348       Kind = tok::at;
4349     else
4350       Kind = tok::unknown;
4351     break;
4352 
4353   // UCNs (C99 6.4.3, C++11 [lex.charset]p2)
4354   case '\\':
4355     if (!LangOpts.AsmPreprocessor) {
4356       if (uint32_t CodePoint = tryReadUCN(CurPtr, BufferPtr, &Result)) {
4357         if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
4358           if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
4359             return true; // KeepWhitespaceMode
4360 
4361           // We only saw whitespace, so just try again with this lexer.
4362           // (We manually eliminate the tail call to avoid recursion.)
4363           goto LexNextToken;
4364         }
4365 
4366         return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr);
4367       }
4368     }
4369 
4370     Kind = tok::unknown;
4371     break;
4372 
4373   default: {
4374     if (isASCII(Char)) {
4375       Kind = tok::unknown;
4376       break;
4377     }
4378 
4379     llvm::UTF32 CodePoint;
4380 
4381     // We can't just reset CurPtr to BufferPtr because BufferPtr may point to
4382     // an escaped newline.
4383     --CurPtr;
4384     llvm::ConversionResult Status =
4385         llvm::convertUTF8Sequence((const llvm::UTF8 **)&CurPtr,
4386                                   (const llvm::UTF8 *)BufferEnd,
4387                                   &CodePoint,
4388                                   llvm::strictConversion);
4389     if (Status == llvm::conversionOK) {
4390       if (CheckUnicodeWhitespace(Result, CodePoint, CurPtr)) {
4391         if (SkipWhitespace(Result, CurPtr, TokAtPhysicalStartOfLine))
4392           return true; // KeepWhitespaceMode
4393 
4394         // We only saw whitespace, so just try again with this lexer.
4395         // (We manually eliminate the tail call to avoid recursion.)
4396         goto LexNextToken;
4397       }
4398       return LexUnicodeIdentifierStart(Result, CodePoint, CurPtr);
4399     }
4400 
4401     if (isLexingRawMode() || ParsingPreprocessorDirective ||
4402         PP->isPreprocessedOutput()) {
4403       ++CurPtr;
4404       Kind = tok::unknown;
4405       break;
4406     }
4407 
4408     // Non-ASCII characters tend to creep into source code unintentionally.
4409     // Instead of letting the parser complain about the unknown token,
4410     // just diagnose the invalid UTF-8, then drop the character.
4411     Diag(CurPtr, diag::err_invalid_utf8);
4412 
4413     BufferPtr = CurPtr+1;
4414     // We're pretending the character didn't exist, so just try again with
4415     // this lexer.
4416     // (We manually eliminate the tail call to avoid recursion.)
4417     goto LexNextToken;
4418   }
4419   }
4420 
4421   // Notify MIOpt that we read a non-whitespace/non-comment token.
4422   MIOpt.ReadToken();
4423 
4424   // Update the location of token as well as BufferPtr.
4425   FormTokenWithChars(Result, CurPtr, Kind);
4426   return true;
4427 
4428 HandleDirective:
4429   // We parsed a # character and it's the start of a preprocessing directive.
4430 
4431   FormTokenWithChars(Result, CurPtr, tok::hash);
4432   PP->HandleDirective(Result);
4433 
4434   if (PP->hadModuleLoaderFatalFailure())
4435     // With a fatal failure in the module loader, we abort parsing.
4436     return true;
4437 
4438   // We parsed the directive; lex a token with the new state.
4439   return false;
4440 
4441 LexNextToken:
4442   Result.clearFlag(Token::NeedsCleaning);
4443   goto LexStart;
4444 }
4445 
convertDependencyDirectiveToken(const dependency_directives_scan::Token & DDTok,Token & Result)4446 const char *Lexer::convertDependencyDirectiveToken(
4447     const dependency_directives_scan::Token &DDTok, Token &Result) {
4448   const char *TokPtr = BufferStart + DDTok.Offset;
4449   Result.startToken();
4450   Result.setLocation(getSourceLocation(TokPtr));
4451   Result.setKind(DDTok.Kind);
4452   Result.setFlag((Token::TokenFlags)DDTok.Flags);
4453   Result.setLength(DDTok.Length);
4454   BufferPtr = TokPtr + DDTok.Length;
4455   return TokPtr;
4456 }
4457 
LexDependencyDirectiveToken(Token & Result)4458 bool Lexer::LexDependencyDirectiveToken(Token &Result) {
4459   assert(isDependencyDirectivesLexer());
4460 
4461   using namespace dependency_directives_scan;
4462 
4463   while (NextDepDirectiveTokenIndex == DepDirectives.front().Tokens.size()) {
4464     if (DepDirectives.front().Kind == pp_eof)
4465       return LexEndOfFile(Result, BufferEnd);
4466     if (DepDirectives.front().Kind == tokens_present_before_eof)
4467       MIOpt.ReadToken();
4468     NextDepDirectiveTokenIndex = 0;
4469     DepDirectives = DepDirectives.drop_front();
4470   }
4471 
4472   const dependency_directives_scan::Token &DDTok =
4473       DepDirectives.front().Tokens[NextDepDirectiveTokenIndex++];
4474   if (NextDepDirectiveTokenIndex > 1 || DDTok.Kind != tok::hash) {
4475     // Read something other than a preprocessor directive hash.
4476     MIOpt.ReadToken();
4477   }
4478 
4479   if (ParsingFilename && DDTok.is(tok::less)) {
4480     BufferPtr = BufferStart + DDTok.Offset;
4481     LexAngledStringLiteral(Result, BufferPtr + 1);
4482     if (Result.isNot(tok::header_name))
4483       return true;
4484     // Advance the index of lexed tokens.
4485     while (true) {
4486       const dependency_directives_scan::Token &NextTok =
4487           DepDirectives.front().Tokens[NextDepDirectiveTokenIndex];
4488       if (BufferStart + NextTok.Offset >= BufferPtr)
4489         break;
4490       ++NextDepDirectiveTokenIndex;
4491     }
4492     return true;
4493   }
4494 
4495   const char *TokPtr = convertDependencyDirectiveToken(DDTok, Result);
4496 
4497   if (Result.is(tok::hash) && Result.isAtStartOfLine()) {
4498     PP->HandleDirective(Result);
4499     return false;
4500   }
4501   if (Result.is(tok::raw_identifier)) {
4502     Result.setRawIdentifierData(TokPtr);
4503     if (!isLexingRawMode()) {
4504       const IdentifierInfo *II = PP->LookUpIdentifierInfo(Result);
4505       if (II->isHandleIdentifierCase())
4506         return PP->HandleIdentifier(Result);
4507     }
4508     return true;
4509   }
4510   if (Result.isLiteral()) {
4511     Result.setLiteralData(TokPtr);
4512     return true;
4513   }
4514   if (Result.is(tok::colon)) {
4515     // Convert consecutive colons to 'tok::coloncolon'.
4516     if (*BufferPtr == ':') {
4517       assert(DepDirectives.front().Tokens[NextDepDirectiveTokenIndex].is(
4518           tok::colon));
4519       ++NextDepDirectiveTokenIndex;
4520       Result.setKind(tok::coloncolon);
4521     }
4522     return true;
4523   }
4524   if (Result.is(tok::eod))
4525     ParsingPreprocessorDirective = false;
4526 
4527   return true;
4528 }
4529 
LexDependencyDirectiveTokenWhileSkipping(Token & Result)4530 bool Lexer::LexDependencyDirectiveTokenWhileSkipping(Token &Result) {
4531   assert(isDependencyDirectivesLexer());
4532 
4533   using namespace dependency_directives_scan;
4534 
4535   bool Stop = false;
4536   unsigned NestedIfs = 0;
4537   do {
4538     DepDirectives = DepDirectives.drop_front();
4539     switch (DepDirectives.front().Kind) {
4540     case pp_none:
4541       llvm_unreachable("unexpected 'pp_none'");
4542     case pp_include:
4543     case pp___include_macros:
4544     case pp_define:
4545     case pp_undef:
4546     case pp_import:
4547     case pp_pragma_import:
4548     case pp_pragma_once:
4549     case pp_pragma_push_macro:
4550     case pp_pragma_pop_macro:
4551     case pp_pragma_include_alias:
4552     case pp_pragma_system_header:
4553     case pp_include_next:
4554     case decl_at_import:
4555     case cxx_module_decl:
4556     case cxx_import_decl:
4557     case cxx_export_module_decl:
4558     case cxx_export_import_decl:
4559     case tokens_present_before_eof:
4560       break;
4561     case pp_if:
4562     case pp_ifdef:
4563     case pp_ifndef:
4564       ++NestedIfs;
4565       break;
4566     case pp_elif:
4567     case pp_elifdef:
4568     case pp_elifndef:
4569     case pp_else:
4570       if (!NestedIfs) {
4571         Stop = true;
4572       }
4573       break;
4574     case pp_endif:
4575       if (!NestedIfs) {
4576         Stop = true;
4577       } else {
4578         --NestedIfs;
4579       }
4580       break;
4581     case pp_eof:
4582       NextDepDirectiveTokenIndex = 0;
4583       return LexEndOfFile(Result, BufferEnd);
4584     }
4585   } while (!Stop);
4586 
4587   const dependency_directives_scan::Token &DDTok =
4588       DepDirectives.front().Tokens.front();
4589   assert(DDTok.is(tok::hash));
4590   NextDepDirectiveTokenIndex = 1;
4591 
4592   convertDependencyDirectiveToken(DDTok, Result);
4593   return false;
4594 }
4595