//===- DependencyDirectivesScanner.cpp ------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
///
/// \file
/// This is the interface for scanning header and source files to get the
/// minimum necessary preprocessor directives for evaluating includes. It
/// reduces the source down to #define, #include, #import, @import, and any
/// conditional preprocessor logic that contains one of those.
///
//===----------------------------------------------------------------------===//
1681ad6265SDimitry Andric 
1781ad6265SDimitry Andric #include "clang/Lex/DependencyDirectivesScanner.h"
1881ad6265SDimitry Andric #include "clang/Basic/CharInfo.h"
1981ad6265SDimitry Andric #include "clang/Basic/Diagnostic.h"
2081ad6265SDimitry Andric #include "clang/Lex/LexDiagnostic.h"
2181ad6265SDimitry Andric #include "clang/Lex/Lexer.h"
2206c3fb27SDimitry Andric #include "clang/Lex/Pragma.h"
2381ad6265SDimitry Andric #include "llvm/ADT/ScopeExit.h"
2481ad6265SDimitry Andric #include "llvm/ADT/SmallString.h"
2581ad6265SDimitry Andric #include "llvm/ADT/StringMap.h"
2681ad6265SDimitry Andric #include "llvm/ADT/StringSwitch.h"
27bdd1243dSDimitry Andric #include <optional>
2881ad6265SDimitry Andric 
2981ad6265SDimitry Andric using namespace clang;
3081ad6265SDimitry Andric using namespace clang::dependency_directives_scan;
3181ad6265SDimitry Andric using namespace llvm;
3281ad6265SDimitry Andric 
3381ad6265SDimitry Andric namespace {
3481ad6265SDimitry Andric 
3581ad6265SDimitry Andric struct DirectiveWithTokens {
3681ad6265SDimitry Andric   DirectiveKind Kind;
3781ad6265SDimitry Andric   unsigned NumTokens;
3881ad6265SDimitry Andric 
DirectiveWithTokens__anon447827f50111::DirectiveWithTokens3981ad6265SDimitry Andric   DirectiveWithTokens(DirectiveKind Kind, unsigned NumTokens)
4081ad6265SDimitry Andric       : Kind(Kind), NumTokens(NumTokens) {}
4181ad6265SDimitry Andric };
4281ad6265SDimitry Andric 
4381ad6265SDimitry Andric /// Does an efficient "scan" of the sources to detect the presence of
4481ad6265SDimitry Andric /// preprocessor (or module import) directives and collects the raw lexed tokens
4581ad6265SDimitry Andric /// for those directives so that the \p Lexer can "replay" them when the file is
4681ad6265SDimitry Andric /// included.
4781ad6265SDimitry Andric ///
4881ad6265SDimitry Andric /// Note that the behavior of the raw lexer is affected by the language mode,
4981ad6265SDimitry Andric /// while at this point we want to do a scan and collect tokens once,
5081ad6265SDimitry Andric /// irrespective of the language mode that the file will get included in. To
5181ad6265SDimitry Andric /// compensate for that the \p Lexer, while "replaying", will adjust a token
5281ad6265SDimitry Andric /// where appropriate, when it could affect the preprocessor's state.
5381ad6265SDimitry Andric /// For example in a directive like
5481ad6265SDimitry Andric ///
5581ad6265SDimitry Andric /// \code
5681ad6265SDimitry Andric ///   #if __has_cpp_attribute(clang::fallthrough)
5781ad6265SDimitry Andric /// \endcode
5881ad6265SDimitry Andric ///
5981ad6265SDimitry Andric /// The preprocessor needs to see '::' as 'tok::coloncolon' instead of 2
6081ad6265SDimitry Andric /// 'tok::colon'. The \p Lexer will adjust if it sees consecutive 'tok::colon'
6181ad6265SDimitry Andric /// while in C++ mode.
6281ad6265SDimitry Andric struct Scanner {
Scanner__anon447827f50111::Scanner6381ad6265SDimitry Andric   Scanner(StringRef Input,
6481ad6265SDimitry Andric           SmallVectorImpl<dependency_directives_scan::Token> &Tokens,
6581ad6265SDimitry Andric           DiagnosticsEngine *Diags, SourceLocation InputSourceLoc)
6681ad6265SDimitry Andric       : Input(Input), Tokens(Tokens), Diags(Diags),
6781ad6265SDimitry Andric         InputSourceLoc(InputSourceLoc), LangOpts(getLangOptsForDepScanning()),
6881ad6265SDimitry Andric         TheLexer(InputSourceLoc, LangOpts, Input.begin(), Input.begin(),
6981ad6265SDimitry Andric                  Input.end()) {}
7081ad6265SDimitry Andric 
getLangOptsForDepScanning__anon447827f50111::Scanner7181ad6265SDimitry Andric   static LangOptions getLangOptsForDepScanning() {
7281ad6265SDimitry Andric     LangOptions LangOpts;
7381ad6265SDimitry Andric     // Set the lexer to use 'tok::at' for '@', instead of 'tok::unknown'.
7481ad6265SDimitry Andric     LangOpts.ObjC = true;
7581ad6265SDimitry Andric     LangOpts.LineComment = true;
7606c3fb27SDimitry Andric     // FIXME: we do not enable C11 or C++11, so we are missing u/u8/U"" and
7706c3fb27SDimitry Andric     // R"()" literals.
7881ad6265SDimitry Andric     return LangOpts;
7981ad6265SDimitry Andric   }
8081ad6265SDimitry Andric 
8181ad6265SDimitry Andric   /// Lex the provided source and emit the directive tokens.
8281ad6265SDimitry Andric   ///
8381ad6265SDimitry Andric   /// \returns True on error.
8481ad6265SDimitry Andric   bool scan(SmallVectorImpl<Directive> &Directives);
8581ad6265SDimitry Andric 
8681ad6265SDimitry Andric private:
8781ad6265SDimitry Andric   /// Lexes next token and advances \p First and the \p Lexer.
88bdd1243dSDimitry Andric   [[nodiscard]] dependency_directives_scan::Token &
8981ad6265SDimitry Andric   lexToken(const char *&First, const char *const End);
9081ad6265SDimitry Andric 
9181ad6265SDimitry Andric   dependency_directives_scan::Token &lexIncludeFilename(const char *&First,
9281ad6265SDimitry Andric                                                         const char *const End);
9381ad6265SDimitry Andric 
94bdd1243dSDimitry Andric   void skipLine(const char *&First, const char *const End);
95bdd1243dSDimitry Andric   void skipDirective(StringRef Name, const char *&First, const char *const End);
96bdd1243dSDimitry Andric 
9706c3fb27SDimitry Andric   /// Returns the spelling of a string literal or identifier after performing
9806c3fb27SDimitry Andric   /// any processing needed to handle \c clang::Token::NeedsCleaning.
9906c3fb27SDimitry Andric   StringRef cleanStringIfNeeded(const dependency_directives_scan::Token &Tok);
10006c3fb27SDimitry Andric 
10181ad6265SDimitry Andric   /// Lexes next token and if it is identifier returns its string, otherwise
102bdd1243dSDimitry Andric   /// it skips the current line and returns \p std::nullopt.
10381ad6265SDimitry Andric   ///
10481ad6265SDimitry Andric   /// In any case (whatever the token kind) \p First and the \p Lexer will
10581ad6265SDimitry Andric   /// advance beyond the token.
106bdd1243dSDimitry Andric   [[nodiscard]] std::optional<StringRef>
10781ad6265SDimitry Andric   tryLexIdentifierOrSkipLine(const char *&First, const char *const End);
10881ad6265SDimitry Andric 
10981ad6265SDimitry Andric   /// Used when it is certain that next token is an identifier.
110bdd1243dSDimitry Andric   [[nodiscard]] StringRef lexIdentifier(const char *&First,
11181ad6265SDimitry Andric                                         const char *const End);
11281ad6265SDimitry Andric 
11381ad6265SDimitry Andric   /// Lexes next token and returns true iff it is an identifier that matches \p
11481ad6265SDimitry Andric   /// Id, otherwise it skips the current line and returns false.
11581ad6265SDimitry Andric   ///
11681ad6265SDimitry Andric   /// In any case (whatever the token kind) \p First and the \p Lexer will
11781ad6265SDimitry Andric   /// advance beyond the token.
118bdd1243dSDimitry Andric   [[nodiscard]] bool isNextIdentifierOrSkipLine(StringRef Id,
11981ad6265SDimitry Andric                                                 const char *&First,
12081ad6265SDimitry Andric                                                 const char *const End);
12181ad6265SDimitry Andric 
12206c3fb27SDimitry Andric   /// Lexes next token and returns true iff it matches the kind \p K.
12306c3fb27SDimitry Andric   /// Otherwise it skips the current line and returns false.
12406c3fb27SDimitry Andric   ///
12506c3fb27SDimitry Andric   /// In any case (whatever the token kind) \p First and the \p Lexer will
12606c3fb27SDimitry Andric   /// advance beyond the token.
12706c3fb27SDimitry Andric   [[nodiscard]] bool isNextTokenOrSkipLine(tok::TokenKind K, const char *&First,
12806c3fb27SDimitry Andric                                            const char *const End);
12906c3fb27SDimitry Andric 
13006c3fb27SDimitry Andric   /// Lexes next token and if it is string literal, returns its string.
13106c3fb27SDimitry Andric   /// Otherwise, it skips the current line and returns \p std::nullopt.
13206c3fb27SDimitry Andric   ///
13306c3fb27SDimitry Andric   /// In any case (whatever the token kind) \p First and the \p Lexer will
13406c3fb27SDimitry Andric   /// advance beyond the token.
13506c3fb27SDimitry Andric   [[nodiscard]] std::optional<StringRef>
13606c3fb27SDimitry Andric   tryLexStringLiteralOrSkipLine(const char *&First, const char *const End);
13706c3fb27SDimitry Andric 
138bdd1243dSDimitry Andric   [[nodiscard]] bool scanImpl(const char *First, const char *const End);
139bdd1243dSDimitry Andric   [[nodiscard]] bool lexPPLine(const char *&First, const char *const End);
140bdd1243dSDimitry Andric   [[nodiscard]] bool lexAt(const char *&First, const char *const End);
141bdd1243dSDimitry Andric   [[nodiscard]] bool lexModule(const char *&First, const char *const End);
142bdd1243dSDimitry Andric   [[nodiscard]] bool lexDefine(const char *HashLoc, const char *&First,
14381ad6265SDimitry Andric                                const char *const End);
144bdd1243dSDimitry Andric   [[nodiscard]] bool lexPragma(const char *&First, const char *const End);
14506c3fb27SDimitry Andric   [[nodiscard]] bool lex_Pragma(const char *&First, const char *const End);
146bdd1243dSDimitry Andric   [[nodiscard]] bool lexEndif(const char *&First, const char *const End);
147bdd1243dSDimitry Andric   [[nodiscard]] bool lexDefault(DirectiveKind Kind, const char *&First,
14881ad6265SDimitry Andric                                 const char *const End);
149bdd1243dSDimitry Andric   [[nodiscard]] bool lexModuleDirectiveBody(DirectiveKind Kind,
15081ad6265SDimitry Andric                                             const char *&First,
15181ad6265SDimitry Andric                                             const char *const End);
15281ad6265SDimitry Andric   void lexPPDirectiveBody(const char *&First, const char *const End);
15381ad6265SDimitry Andric 
pushDirective__anon447827f50111::Scanner15481ad6265SDimitry Andric   DirectiveWithTokens &pushDirective(DirectiveKind Kind) {
15581ad6265SDimitry Andric     Tokens.append(CurDirToks);
15681ad6265SDimitry Andric     DirsWithToks.emplace_back(Kind, CurDirToks.size());
15781ad6265SDimitry Andric     CurDirToks.clear();
15881ad6265SDimitry Andric     return DirsWithToks.back();
15981ad6265SDimitry Andric   }
popDirective__anon447827f50111::Scanner16081ad6265SDimitry Andric   void popDirective() {
16181ad6265SDimitry Andric     Tokens.pop_back_n(DirsWithToks.pop_back_val().NumTokens);
16281ad6265SDimitry Andric   }
topDirective__anon447827f50111::Scanner16381ad6265SDimitry Andric   DirectiveKind topDirective() const {
16481ad6265SDimitry Andric     return DirsWithToks.empty() ? pp_none : DirsWithToks.back().Kind;
16581ad6265SDimitry Andric   }
16681ad6265SDimitry Andric 
getOffsetAt__anon447827f50111::Scanner16781ad6265SDimitry Andric   unsigned getOffsetAt(const char *CurPtr) const {
16881ad6265SDimitry Andric     return CurPtr - Input.data();
16981ad6265SDimitry Andric   }
17081ad6265SDimitry Andric 
17181ad6265SDimitry Andric   /// Reports a diagnostic if the diagnostic engine is provided. Always returns
17281ad6265SDimitry Andric   /// true at the end.
17381ad6265SDimitry Andric   bool reportError(const char *CurPtr, unsigned Err);
17481ad6265SDimitry Andric 
17581ad6265SDimitry Andric   StringMap<char> SplitIds;
17681ad6265SDimitry Andric   StringRef Input;
17781ad6265SDimitry Andric   SmallVectorImpl<dependency_directives_scan::Token> &Tokens;
17881ad6265SDimitry Andric   DiagnosticsEngine *Diags;
17981ad6265SDimitry Andric   SourceLocation InputSourceLoc;
18081ad6265SDimitry Andric 
181bdd1243dSDimitry Andric   const char *LastTokenPtr = nullptr;
18281ad6265SDimitry Andric   /// Keeps track of the tokens for the currently lexed directive. Once a
18381ad6265SDimitry Andric   /// directive is fully lexed and "committed" then the tokens get appended to
18481ad6265SDimitry Andric   /// \p Tokens and \p CurDirToks is cleared for the next directive.
18581ad6265SDimitry Andric   SmallVector<dependency_directives_scan::Token, 32> CurDirToks;
18681ad6265SDimitry Andric   /// The directives that were lexed along with the number of tokens that each
18781ad6265SDimitry Andric   /// directive contains. The tokens of all the directives are kept in \p Tokens
18881ad6265SDimitry Andric   /// vector, in the same order as the directives order in \p DirsWithToks.
18981ad6265SDimitry Andric   SmallVector<DirectiveWithTokens, 64> DirsWithToks;
19081ad6265SDimitry Andric   LangOptions LangOpts;
19181ad6265SDimitry Andric   Lexer TheLexer;
19281ad6265SDimitry Andric };
19381ad6265SDimitry Andric 
19481ad6265SDimitry Andric } // end anonymous namespace
19581ad6265SDimitry Andric 
reportError(const char * CurPtr,unsigned Err)19681ad6265SDimitry Andric bool Scanner::reportError(const char *CurPtr, unsigned Err) {
19781ad6265SDimitry Andric   if (!Diags)
19881ad6265SDimitry Andric     return true;
19981ad6265SDimitry Andric   assert(CurPtr >= Input.data() && "invalid buffer ptr");
20081ad6265SDimitry Andric   Diags->Report(InputSourceLoc.getLocWithOffset(getOffsetAt(CurPtr)), Err);
20181ad6265SDimitry Andric   return true;
20281ad6265SDimitry Andric }
20381ad6265SDimitry Andric 
skipOverSpaces(const char * & First,const char * const End)20481ad6265SDimitry Andric static void skipOverSpaces(const char *&First, const char *const End) {
20581ad6265SDimitry Andric   while (First != End && isHorizontalWhitespace(*First))
20681ad6265SDimitry Andric     ++First;
20781ad6265SDimitry Andric }
20881ad6265SDimitry Andric 
isRawStringLiteral(const char * First,const char * Current)209bdd1243dSDimitry Andric [[nodiscard]] static bool isRawStringLiteral(const char *First,
21081ad6265SDimitry Andric                                              const char *Current) {
21181ad6265SDimitry Andric   assert(First <= Current);
21281ad6265SDimitry Andric 
21381ad6265SDimitry Andric   // Check if we can even back up.
21481ad6265SDimitry Andric   if (*Current != '"' || First == Current)
21581ad6265SDimitry Andric     return false;
21681ad6265SDimitry Andric 
21781ad6265SDimitry Andric   // Check for an "R".
21881ad6265SDimitry Andric   --Current;
21981ad6265SDimitry Andric   if (*Current != 'R')
22081ad6265SDimitry Andric     return false;
22181ad6265SDimitry Andric   if (First == Current || !isAsciiIdentifierContinue(*--Current))
22281ad6265SDimitry Andric     return true;
22381ad6265SDimitry Andric 
22481ad6265SDimitry Andric   // Check for a prefix of "u", "U", or "L".
22581ad6265SDimitry Andric   if (*Current == 'u' || *Current == 'U' || *Current == 'L')
22681ad6265SDimitry Andric     return First == Current || !isAsciiIdentifierContinue(*--Current);
22781ad6265SDimitry Andric 
22881ad6265SDimitry Andric   // Check for a prefix of "u8".
22981ad6265SDimitry Andric   if (*Current != '8' || First == Current || *Current-- != 'u')
23081ad6265SDimitry Andric     return false;
23181ad6265SDimitry Andric   return First == Current || !isAsciiIdentifierContinue(*--Current);
23281ad6265SDimitry Andric }
23381ad6265SDimitry Andric 
/// Skips a raw string literal. On entry \p First points at the opening quote
/// (which must directly follow an 'R'); on exit it points just past the
/// closing quote, or at \p End if the literal is not terminated.
static void skipRawString(const char *&First, const char *const End) {
  assert(First[0] == '"');
  assert(First[-1] == 'R');

  // The delimiter is whatever sits between the opening quote and the first
  // '('. It may be empty.
  const char *const DelimBegin = ++First;
  const char *Cursor = DelimBegin;
  while (Cursor != End && *Cursor != '(')
    ++Cursor;
  if (Cursor == End) {
    First = End; // Hit the end... just give up.
    return;
  }
  const size_t DelimLen = static_cast<size_t>(Cursor - DelimBegin);

  while (true) {
    // Resume from the cursor and move First to just past the next ")".
    First = Cursor;
    while (First != End && *First != ')')
      ++First;
    if (First == End)
      return;
    ++First;

    // See how much of the delimiter follows the ")".
    size_t Matched = 0;
    Cursor = First;
    while (Cursor != End && Matched < DelimLen &&
           DelimBegin[Matched] == *Cursor) {
      ++Cursor;
      ++Matched;
    }

    // Ran out of input while matching: the literal is unterminated.
    if (Cursor == End) {
      First = End;
      return;
    }

    // A full delimiter followed by a quote closes the literal; anything else
    // means this ")" was part of the contents, so keep searching.
    if (Matched == DelimLen && *Cursor == '"') {
      First = Cursor + 1;
      return;
    }
  }
}
27581ad6265SDimitry Andric 
27681ad6265SDimitry Andric // Returns the length of EOL, either 0 (no end-of-line), 1 (\n) or 2 (\r\n)
isEOL(const char * First,const char * const End)27781ad6265SDimitry Andric static unsigned isEOL(const char *First, const char *const End) {
27881ad6265SDimitry Andric   if (First == End)
27981ad6265SDimitry Andric     return 0;
28081ad6265SDimitry Andric   if (End - First > 1 && isVerticalWhitespace(First[0]) &&
28181ad6265SDimitry Andric       isVerticalWhitespace(First[1]) && First[0] != First[1])
28281ad6265SDimitry Andric     return 2;
28381ad6265SDimitry Andric   return !!isVerticalWhitespace(First[0]);
28481ad6265SDimitry Andric }
28581ad6265SDimitry Andric 
skipString(const char * & First,const char * const End)28681ad6265SDimitry Andric static void skipString(const char *&First, const char *const End) {
28781ad6265SDimitry Andric   assert(*First == '\'' || *First == '"' || *First == '<');
28881ad6265SDimitry Andric   const char Terminator = *First == '<' ? '>' : *First;
28981ad6265SDimitry Andric   for (++First; First != End && *First != Terminator; ++First) {
29081ad6265SDimitry Andric     // String and character literals don't extend past the end of the line.
29181ad6265SDimitry Andric     if (isVerticalWhitespace(*First))
29281ad6265SDimitry Andric       return;
29381ad6265SDimitry Andric     if (*First != '\\')
29481ad6265SDimitry Andric       continue;
29581ad6265SDimitry Andric     // Skip past backslash to the next character. This ensures that the
29681ad6265SDimitry Andric     // character right after it is skipped as well, which matters if it's
29781ad6265SDimitry Andric     // the terminator.
29881ad6265SDimitry Andric     if (++First == End)
29981ad6265SDimitry Andric       return;
30081ad6265SDimitry Andric     if (!isWhitespace(*First))
30181ad6265SDimitry Andric       continue;
30281ad6265SDimitry Andric     // Whitespace after the backslash might indicate a line continuation.
30381ad6265SDimitry Andric     const char *FirstAfterBackslashPastSpace = First;
30481ad6265SDimitry Andric     skipOverSpaces(FirstAfterBackslashPastSpace, End);
30581ad6265SDimitry Andric     if (unsigned NLSize = isEOL(FirstAfterBackslashPastSpace, End)) {
30681ad6265SDimitry Andric       // Advance the character pointer to the next line for the next
30781ad6265SDimitry Andric       // iteration.
30881ad6265SDimitry Andric       First = FirstAfterBackslashPastSpace + NLSize - 1;
30981ad6265SDimitry Andric     }
31081ad6265SDimitry Andric   }
31181ad6265SDimitry Andric   if (First != End)
31281ad6265SDimitry Andric     ++First; // Finish off the string.
31381ad6265SDimitry Andric }
31481ad6265SDimitry Andric 
31581ad6265SDimitry Andric // Returns the length of the skipped newline
skipNewline(const char * & First,const char * End)31681ad6265SDimitry Andric static unsigned skipNewline(const char *&First, const char *End) {
31781ad6265SDimitry Andric   if (First == End)
31881ad6265SDimitry Andric     return 0;
31981ad6265SDimitry Andric   assert(isVerticalWhitespace(*First));
32081ad6265SDimitry Andric   unsigned Len = isEOL(First, End);
32181ad6265SDimitry Andric   assert(Len && "expected newline");
32281ad6265SDimitry Andric   First += Len;
32381ad6265SDimitry Andric   return Len;
32481ad6265SDimitry Andric }
32581ad6265SDimitry Andric 
/// Given \p First positioned just past a newline of length \p EOLLen, reports
/// whether the character immediately before that newline is a backslash.
static bool wasLineContinuation(const char *First, unsigned EOLLen) {
  const char *const BeforeNewline = First - static_cast<int>(EOLLen) - 1;
  return *BeforeNewline == '\\';
}
32981ad6265SDimitry Andric 
skipToNewlineRaw(const char * & First,const char * const End)33081ad6265SDimitry Andric static void skipToNewlineRaw(const char *&First, const char *const End) {
33181ad6265SDimitry Andric   for (;;) {
33281ad6265SDimitry Andric     if (First == End)
33381ad6265SDimitry Andric       return;
33481ad6265SDimitry Andric 
33581ad6265SDimitry Andric     unsigned Len = isEOL(First, End);
33681ad6265SDimitry Andric     if (Len)
33781ad6265SDimitry Andric       return;
33881ad6265SDimitry Andric 
33981ad6265SDimitry Andric     do {
34081ad6265SDimitry Andric       if (++First == End)
34181ad6265SDimitry Andric         return;
34281ad6265SDimitry Andric       Len = isEOL(First, End);
34381ad6265SDimitry Andric     } while (!Len);
34481ad6265SDimitry Andric 
34581ad6265SDimitry Andric     if (First[-1] != '\\')
34681ad6265SDimitry Andric       return;
34781ad6265SDimitry Andric 
34881ad6265SDimitry Andric     First += Len;
34981ad6265SDimitry Andric     // Keep skipping lines...
35081ad6265SDimitry Andric   }
35181ad6265SDimitry Andric }
35281ad6265SDimitry Andric 
skipLineComment(const char * & First,const char * const End)35381ad6265SDimitry Andric static void skipLineComment(const char *&First, const char *const End) {
35481ad6265SDimitry Andric   assert(First[0] == '/' && First[1] == '/');
35581ad6265SDimitry Andric   First += 2;
35681ad6265SDimitry Andric   skipToNewlineRaw(First, End);
35781ad6265SDimitry Andric }
35881ad6265SDimitry Andric 
/// Skips a "/* ... */" comment that begins at \p First. Afterwards \p First
/// is just past the closing "*/", or at \p End if the comment never closes.
static void skipBlockComment(const char *&First, const char *const End) {
  assert(First[0] == '/' && First[1] == '*');

  // A complete comment needs at least the four characters "/**/".
  if (End - First >= 4) {
    // Begin at the earliest position a closing '/' may occupy, so that the
    // '*' of the opener can also serve as the '*' of the closer.
    for (const char *Slash = First + 3; Slash != End; ++Slash) {
      if (Slash[-1] == '*' && Slash[0] == '/') {
        First = Slash + 1;
        return;
      }
    }
  }

  // Too short or unterminated: consume everything.
  First = End;
}
37181ad6265SDimitry Andric 
37281ad6265SDimitry Andric /// \returns True if the current single quotation mark character is a C++ 14
37381ad6265SDimitry Andric /// digit separator.
isQuoteCppDigitSeparator(const char * const Start,const char * const Cur,const char * const End)37481ad6265SDimitry Andric static bool isQuoteCppDigitSeparator(const char *const Start,
37581ad6265SDimitry Andric                                      const char *const Cur,
37681ad6265SDimitry Andric                                      const char *const End) {
37781ad6265SDimitry Andric   assert(*Cur == '\'' && "expected quotation character");
37881ad6265SDimitry Andric   // skipLine called in places where we don't expect a valid number
37981ad6265SDimitry Andric   // body before `start` on the same line, so always return false at the start.
38081ad6265SDimitry Andric   if (Start == Cur)
38181ad6265SDimitry Andric     return false;
38281ad6265SDimitry Andric   // The previous character must be a valid PP number character.
38381ad6265SDimitry Andric   // Make sure that the L, u, U, u8 prefixes don't get marked as a
38481ad6265SDimitry Andric   // separator though.
38581ad6265SDimitry Andric   char Prev = *(Cur - 1);
38681ad6265SDimitry Andric   if (Prev == 'L' || Prev == 'U' || Prev == 'u')
38781ad6265SDimitry Andric     return false;
38881ad6265SDimitry Andric   if (Prev == '8' && (Cur - 1 != Start) && *(Cur - 2) == 'u')
38981ad6265SDimitry Andric     return false;
39081ad6265SDimitry Andric   if (!isPreprocessingNumberBody(Prev))
39181ad6265SDimitry Andric     return false;
39281ad6265SDimitry Andric   // The next character should be a valid identifier body character.
39381ad6265SDimitry Andric   return (Cur + 1) < End && isAsciiIdentifierContinue(*(Cur + 1));
39481ad6265SDimitry Andric }
39581ad6265SDimitry Andric 
skipLine(const char * & First,const char * const End)396bdd1243dSDimitry Andric void Scanner::skipLine(const char *&First, const char *const End) {
39781ad6265SDimitry Andric   for (;;) {
39881ad6265SDimitry Andric     assert(First <= End);
39981ad6265SDimitry Andric     if (First == End)
40081ad6265SDimitry Andric       return;
40181ad6265SDimitry Andric 
40281ad6265SDimitry Andric     if (isVerticalWhitespace(*First)) {
40381ad6265SDimitry Andric       skipNewline(First, End);
40481ad6265SDimitry Andric       return;
40581ad6265SDimitry Andric     }
40681ad6265SDimitry Andric     const char *Start = First;
40781ad6265SDimitry Andric     while (First != End && !isVerticalWhitespace(*First)) {
40881ad6265SDimitry Andric       // Iterate over strings correctly to avoid comments and newlines.
40981ad6265SDimitry Andric       if (*First == '"' ||
41081ad6265SDimitry Andric           (*First == '\'' && !isQuoteCppDigitSeparator(Start, First, End))) {
411bdd1243dSDimitry Andric         LastTokenPtr = First;
41281ad6265SDimitry Andric         if (isRawStringLiteral(Start, First))
41381ad6265SDimitry Andric           skipRawString(First, End);
41481ad6265SDimitry Andric         else
41581ad6265SDimitry Andric           skipString(First, End);
41681ad6265SDimitry Andric         continue;
41781ad6265SDimitry Andric       }
41881ad6265SDimitry Andric 
41981ad6265SDimitry Andric       // Iterate over comments correctly.
42081ad6265SDimitry Andric       if (*First != '/' || End - First < 2) {
421bdd1243dSDimitry Andric         LastTokenPtr = First;
42281ad6265SDimitry Andric         ++First;
42381ad6265SDimitry Andric         continue;
42481ad6265SDimitry Andric       }
42581ad6265SDimitry Andric 
42681ad6265SDimitry Andric       if (First[1] == '/') {
42781ad6265SDimitry Andric         // "//...".
42881ad6265SDimitry Andric         skipLineComment(First, End);
42981ad6265SDimitry Andric         continue;
43081ad6265SDimitry Andric       }
43181ad6265SDimitry Andric 
43281ad6265SDimitry Andric       if (First[1] != '*') {
433bdd1243dSDimitry Andric         LastTokenPtr = First;
43481ad6265SDimitry Andric         ++First;
43581ad6265SDimitry Andric         continue;
43681ad6265SDimitry Andric       }
43781ad6265SDimitry Andric 
43881ad6265SDimitry Andric       // "/*...*/".
43981ad6265SDimitry Andric       skipBlockComment(First, End);
44081ad6265SDimitry Andric     }
44181ad6265SDimitry Andric     if (First == End)
44281ad6265SDimitry Andric       return;
44381ad6265SDimitry Andric 
44481ad6265SDimitry Andric     // Skip over the newline.
44581ad6265SDimitry Andric     unsigned Len = skipNewline(First, End);
44681ad6265SDimitry Andric     if (!wasLineContinuation(First, Len)) // Continue past line-continuations.
44781ad6265SDimitry Andric       break;
44881ad6265SDimitry Andric   }
44981ad6265SDimitry Andric }
45081ad6265SDimitry Andric 
skipDirective(StringRef Name,const char * & First,const char * const End)451bdd1243dSDimitry Andric void Scanner::skipDirective(StringRef Name, const char *&First,
45281ad6265SDimitry Andric                             const char *const End) {
45381ad6265SDimitry Andric   if (llvm::StringSwitch<bool>(Name)
45481ad6265SDimitry Andric           .Case("warning", true)
45581ad6265SDimitry Andric           .Case("error", true)
45681ad6265SDimitry Andric           .Default(false))
45781ad6265SDimitry Andric     // Do not process quotes or comments.
45881ad6265SDimitry Andric     skipToNewlineRaw(First, End);
45981ad6265SDimitry Andric   else
46081ad6265SDimitry Andric     skipLine(First, End);
46181ad6265SDimitry Andric }
46281ad6265SDimitry Andric 
skipWhitespace(const char * & First,const char * const End)46381ad6265SDimitry Andric static void skipWhitespace(const char *&First, const char *const End) {
46481ad6265SDimitry Andric   for (;;) {
46581ad6265SDimitry Andric     assert(First <= End);
46681ad6265SDimitry Andric     skipOverSpaces(First, End);
46781ad6265SDimitry Andric 
46881ad6265SDimitry Andric     if (End - First < 2)
46981ad6265SDimitry Andric       return;
47081ad6265SDimitry Andric 
47181ad6265SDimitry Andric     if (First[0] == '\\' && isVerticalWhitespace(First[1])) {
47281ad6265SDimitry Andric       skipNewline(++First, End);
47381ad6265SDimitry Andric       continue;
47481ad6265SDimitry Andric     }
47581ad6265SDimitry Andric 
47681ad6265SDimitry Andric     // Check for a non-comment character.
47781ad6265SDimitry Andric     if (First[0] != '/')
47881ad6265SDimitry Andric       return;
47981ad6265SDimitry Andric 
48081ad6265SDimitry Andric     // "// ...".
48181ad6265SDimitry Andric     if (First[1] == '/') {
48281ad6265SDimitry Andric       skipLineComment(First, End);
48381ad6265SDimitry Andric       return;
48481ad6265SDimitry Andric     }
48581ad6265SDimitry Andric 
48681ad6265SDimitry Andric     // Cannot be a comment.
48781ad6265SDimitry Andric     if (First[1] != '*')
48881ad6265SDimitry Andric       return;
48981ad6265SDimitry Andric 
49081ad6265SDimitry Andric     // "/*...*/".
49181ad6265SDimitry Andric     skipBlockComment(First, End);
49281ad6265SDimitry Andric   }
49381ad6265SDimitry Andric }
49481ad6265SDimitry Andric 
lexModuleDirectiveBody(DirectiveKind Kind,const char * & First,const char * const End)49581ad6265SDimitry Andric bool Scanner::lexModuleDirectiveBody(DirectiveKind Kind, const char *&First,
49681ad6265SDimitry Andric                                      const char *const End) {
49781ad6265SDimitry Andric   const char *DirectiveLoc = Input.data() + CurDirToks.front().Offset;
49881ad6265SDimitry Andric   for (;;) {
49981ad6265SDimitry Andric     const dependency_directives_scan::Token &Tok = lexToken(First, End);
50081ad6265SDimitry Andric     if (Tok.is(tok::eof))
50181ad6265SDimitry Andric       return reportError(
50281ad6265SDimitry Andric           DirectiveLoc,
50381ad6265SDimitry Andric           diag::err_dep_source_scanner_missing_semi_after_at_import);
50481ad6265SDimitry Andric     if (Tok.is(tok::semi))
50581ad6265SDimitry Andric       break;
50681ad6265SDimitry Andric   }
50781ad6265SDimitry Andric   pushDirective(Kind);
50881ad6265SDimitry Andric   skipWhitespace(First, End);
50981ad6265SDimitry Andric   if (First == End)
51081ad6265SDimitry Andric     return false;
51181ad6265SDimitry Andric   if (!isVerticalWhitespace(*First))
51281ad6265SDimitry Andric     return reportError(
51381ad6265SDimitry Andric         DirectiveLoc, diag::err_dep_source_scanner_unexpected_tokens_at_import);
51481ad6265SDimitry Andric   skipNewline(First, End);
51581ad6265SDimitry Andric   return false;
51681ad6265SDimitry Andric }
51781ad6265SDimitry Andric 
lexToken(const char * & First,const char * const End)51881ad6265SDimitry Andric dependency_directives_scan::Token &Scanner::lexToken(const char *&First,
51981ad6265SDimitry Andric                                                      const char *const End) {
52081ad6265SDimitry Andric   clang::Token Tok;
52181ad6265SDimitry Andric   TheLexer.LexFromRawLexer(Tok);
52281ad6265SDimitry Andric   First = Input.data() + TheLexer.getCurrentBufferOffset();
52381ad6265SDimitry Andric   assert(First <= End);
52481ad6265SDimitry Andric 
52581ad6265SDimitry Andric   unsigned Offset = TheLexer.getCurrentBufferOffset() - Tok.getLength();
52681ad6265SDimitry Andric   CurDirToks.emplace_back(Offset, Tok.getLength(), Tok.getKind(),
52781ad6265SDimitry Andric                           Tok.getFlags());
52881ad6265SDimitry Andric   return CurDirToks.back();
52981ad6265SDimitry Andric }
53081ad6265SDimitry Andric 
53181ad6265SDimitry Andric dependency_directives_scan::Token &
lexIncludeFilename(const char * & First,const char * const End)53281ad6265SDimitry Andric Scanner::lexIncludeFilename(const char *&First, const char *const End) {
53381ad6265SDimitry Andric   clang::Token Tok;
53481ad6265SDimitry Andric   TheLexer.LexIncludeFilename(Tok);
53581ad6265SDimitry Andric   First = Input.data() + TheLexer.getCurrentBufferOffset();
53681ad6265SDimitry Andric   assert(First <= End);
53781ad6265SDimitry Andric 
53881ad6265SDimitry Andric   unsigned Offset = TheLexer.getCurrentBufferOffset() - Tok.getLength();
53981ad6265SDimitry Andric   CurDirToks.emplace_back(Offset, Tok.getLength(), Tok.getKind(),
54081ad6265SDimitry Andric                           Tok.getFlags());
54181ad6265SDimitry Andric   return CurDirToks.back();
54281ad6265SDimitry Andric }
54381ad6265SDimitry Andric 
lexPPDirectiveBody(const char * & First,const char * const End)54481ad6265SDimitry Andric void Scanner::lexPPDirectiveBody(const char *&First, const char *const End) {
54581ad6265SDimitry Andric   while (true) {
54681ad6265SDimitry Andric     const dependency_directives_scan::Token &Tok = lexToken(First, End);
54781ad6265SDimitry Andric     if (Tok.is(tok::eod))
54881ad6265SDimitry Andric       break;
54981ad6265SDimitry Andric   }
55081ad6265SDimitry Andric }
55181ad6265SDimitry Andric 
55206c3fb27SDimitry Andric StringRef
cleanStringIfNeeded(const dependency_directives_scan::Token & Tok)55306c3fb27SDimitry Andric Scanner::cleanStringIfNeeded(const dependency_directives_scan::Token &Tok) {
55481ad6265SDimitry Andric   bool NeedsCleaning = Tok.Flags & clang::Token::NeedsCleaning;
55581ad6265SDimitry Andric   if (LLVM_LIKELY(!NeedsCleaning))
55681ad6265SDimitry Andric     return Input.slice(Tok.Offset, Tok.getEnd());
55781ad6265SDimitry Andric 
55881ad6265SDimitry Andric   SmallString<64> Spelling;
55981ad6265SDimitry Andric   Spelling.resize(Tok.Length);
56081ad6265SDimitry Andric 
56106c3fb27SDimitry Andric   // FIXME: C++11 raw string literals need special handling (see getSpellingSlow
56206c3fb27SDimitry Andric   // in the Lexer). Currently we cannot see them due to our LangOpts.
56306c3fb27SDimitry Andric 
56481ad6265SDimitry Andric   unsigned SpellingLength = 0;
56581ad6265SDimitry Andric   const char *BufPtr = Input.begin() + Tok.Offset;
56681ad6265SDimitry Andric   const char *AfterIdent = Input.begin() + Tok.getEnd();
56781ad6265SDimitry Andric   while (BufPtr < AfterIdent) {
568*5f757f3fSDimitry Andric     auto [Char, Size] = Lexer::getCharAndSizeNoWarn(BufPtr, LangOpts);
569*5f757f3fSDimitry Andric     Spelling[SpellingLength++] = Char;
57081ad6265SDimitry Andric     BufPtr += Size;
57181ad6265SDimitry Andric   }
57281ad6265SDimitry Andric 
57381ad6265SDimitry Andric   return SplitIds.try_emplace(StringRef(Spelling.begin(), SpellingLength), 0)
57481ad6265SDimitry Andric       .first->first();
57581ad6265SDimitry Andric }
57681ad6265SDimitry Andric 
57706c3fb27SDimitry Andric std::optional<StringRef>
tryLexIdentifierOrSkipLine(const char * & First,const char * const End)57806c3fb27SDimitry Andric Scanner::tryLexIdentifierOrSkipLine(const char *&First, const char *const End) {
57906c3fb27SDimitry Andric   const dependency_directives_scan::Token &Tok = lexToken(First, End);
58006c3fb27SDimitry Andric   if (Tok.isNot(tok::raw_identifier)) {
58106c3fb27SDimitry Andric     if (!Tok.is(tok::eod))
58206c3fb27SDimitry Andric       skipLine(First, End);
58306c3fb27SDimitry Andric     return std::nullopt;
58406c3fb27SDimitry Andric   }
58506c3fb27SDimitry Andric 
58606c3fb27SDimitry Andric   return cleanStringIfNeeded(Tok);
58706c3fb27SDimitry Andric }
58806c3fb27SDimitry Andric 
lexIdentifier(const char * & First,const char * const End)58981ad6265SDimitry Andric StringRef Scanner::lexIdentifier(const char *&First, const char *const End) {
590bdd1243dSDimitry Andric   std::optional<StringRef> Id = tryLexIdentifierOrSkipLine(First, End);
59181ad6265SDimitry Andric   assert(Id && "expected identifier token");
592bdd1243dSDimitry Andric   return *Id;
59381ad6265SDimitry Andric }
59481ad6265SDimitry Andric 
isNextIdentifierOrSkipLine(StringRef Id,const char * & First,const char * const End)59581ad6265SDimitry Andric bool Scanner::isNextIdentifierOrSkipLine(StringRef Id, const char *&First,
59681ad6265SDimitry Andric                                          const char *const End) {
597bdd1243dSDimitry Andric   if (std::optional<StringRef> FoundId =
598bdd1243dSDimitry Andric           tryLexIdentifierOrSkipLine(First, End)) {
59981ad6265SDimitry Andric     if (*FoundId == Id)
60081ad6265SDimitry Andric       return true;
60181ad6265SDimitry Andric     skipLine(First, End);
60281ad6265SDimitry Andric   }
60381ad6265SDimitry Andric   return false;
60481ad6265SDimitry Andric }
60581ad6265SDimitry Andric 
isNextTokenOrSkipLine(tok::TokenKind K,const char * & First,const char * const End)60606c3fb27SDimitry Andric bool Scanner::isNextTokenOrSkipLine(tok::TokenKind K, const char *&First,
60706c3fb27SDimitry Andric                                     const char *const End) {
60806c3fb27SDimitry Andric   const dependency_directives_scan::Token &Tok = lexToken(First, End);
60906c3fb27SDimitry Andric   if (Tok.is(K))
61006c3fb27SDimitry Andric     return true;
61106c3fb27SDimitry Andric   skipLine(First, End);
61206c3fb27SDimitry Andric   return false;
61306c3fb27SDimitry Andric }
61406c3fb27SDimitry Andric 
61506c3fb27SDimitry Andric std::optional<StringRef>
tryLexStringLiteralOrSkipLine(const char * & First,const char * const End)61606c3fb27SDimitry Andric Scanner::tryLexStringLiteralOrSkipLine(const char *&First,
61706c3fb27SDimitry Andric                                        const char *const End) {
61806c3fb27SDimitry Andric   const dependency_directives_scan::Token &Tok = lexToken(First, End);
61906c3fb27SDimitry Andric   if (!tok::isStringLiteral(Tok.Kind)) {
62006c3fb27SDimitry Andric     if (!Tok.is(tok::eod))
62106c3fb27SDimitry Andric       skipLine(First, End);
62206c3fb27SDimitry Andric     return std::nullopt;
62306c3fb27SDimitry Andric   }
62406c3fb27SDimitry Andric 
62506c3fb27SDimitry Andric   return cleanStringIfNeeded(Tok);
62606c3fb27SDimitry Andric }
62706c3fb27SDimitry Andric 
lexAt(const char * & First,const char * const End)62881ad6265SDimitry Andric bool Scanner::lexAt(const char *&First, const char *const End) {
62981ad6265SDimitry Andric   // Handle "@import".
63081ad6265SDimitry Andric 
63181ad6265SDimitry Andric   // Lex '@'.
63281ad6265SDimitry Andric   const dependency_directives_scan::Token &AtTok = lexToken(First, End);
63381ad6265SDimitry Andric   assert(AtTok.is(tok::at));
63481ad6265SDimitry Andric   (void)AtTok;
63581ad6265SDimitry Andric 
63681ad6265SDimitry Andric   if (!isNextIdentifierOrSkipLine("import", First, End))
63781ad6265SDimitry Andric     return false;
63881ad6265SDimitry Andric   return lexModuleDirectiveBody(decl_at_import, First, End);
63981ad6265SDimitry Andric }
64081ad6265SDimitry Andric 
lexModule(const char * & First,const char * const End)64181ad6265SDimitry Andric bool Scanner::lexModule(const char *&First, const char *const End) {
64281ad6265SDimitry Andric   StringRef Id = lexIdentifier(First, End);
64381ad6265SDimitry Andric   bool Export = false;
64481ad6265SDimitry Andric   if (Id == "export") {
64581ad6265SDimitry Andric     Export = true;
646bdd1243dSDimitry Andric     std::optional<StringRef> NextId = tryLexIdentifierOrSkipLine(First, End);
64781ad6265SDimitry Andric     if (!NextId)
64881ad6265SDimitry Andric       return false;
64981ad6265SDimitry Andric     Id = *NextId;
65081ad6265SDimitry Andric   }
65181ad6265SDimitry Andric 
65281ad6265SDimitry Andric   if (Id != "module" && Id != "import") {
65381ad6265SDimitry Andric     skipLine(First, End);
65481ad6265SDimitry Andric     return false;
65581ad6265SDimitry Andric   }
65681ad6265SDimitry Andric 
65781ad6265SDimitry Andric   skipWhitespace(First, End);
65881ad6265SDimitry Andric 
65981ad6265SDimitry Andric   // Ignore this as a module directive if the next character can't be part of
66081ad6265SDimitry Andric   // an import.
66181ad6265SDimitry Andric 
66281ad6265SDimitry Andric   switch (*First) {
66381ad6265SDimitry Andric   case ':':
66481ad6265SDimitry Andric   case '<':
66581ad6265SDimitry Andric   case '"':
66681ad6265SDimitry Andric     break;
66781ad6265SDimitry Andric   default:
66881ad6265SDimitry Andric     if (!isAsciiIdentifierContinue(*First)) {
66981ad6265SDimitry Andric       skipLine(First, End);
67081ad6265SDimitry Andric       return false;
67181ad6265SDimitry Andric     }
67281ad6265SDimitry Andric   }
67381ad6265SDimitry Andric 
67481ad6265SDimitry Andric   TheLexer.seek(getOffsetAt(First), /*IsAtStartOfLine*/ false);
67581ad6265SDimitry Andric 
67681ad6265SDimitry Andric   DirectiveKind Kind;
67781ad6265SDimitry Andric   if (Id == "module")
67881ad6265SDimitry Andric     Kind = Export ? cxx_export_module_decl : cxx_module_decl;
67981ad6265SDimitry Andric   else
68081ad6265SDimitry Andric     Kind = Export ? cxx_export_import_decl : cxx_import_decl;
68181ad6265SDimitry Andric 
68281ad6265SDimitry Andric   return lexModuleDirectiveBody(Kind, First, End);
68381ad6265SDimitry Andric }
68481ad6265SDimitry Andric 
lex_Pragma(const char * & First,const char * const End)68506c3fb27SDimitry Andric bool Scanner::lex_Pragma(const char *&First, const char *const End) {
68606c3fb27SDimitry Andric   if (!isNextTokenOrSkipLine(tok::l_paren, First, End))
68706c3fb27SDimitry Andric     return false;
68806c3fb27SDimitry Andric 
68906c3fb27SDimitry Andric   std::optional<StringRef> Str = tryLexStringLiteralOrSkipLine(First, End);
69006c3fb27SDimitry Andric 
69106c3fb27SDimitry Andric   if (!Str || !isNextTokenOrSkipLine(tok::r_paren, First, End))
69206c3fb27SDimitry Andric     return false;
69306c3fb27SDimitry Andric 
69406c3fb27SDimitry Andric   SmallString<64> Buffer(*Str);
69506c3fb27SDimitry Andric   prepare_PragmaString(Buffer);
69606c3fb27SDimitry Andric 
69706c3fb27SDimitry Andric   // Use a new scanner instance since the tokens will be inside the allocated
69806c3fb27SDimitry Andric   // string. We should already have captured all the relevant tokens in the
69906c3fb27SDimitry Andric   // current scanner.
70006c3fb27SDimitry Andric   SmallVector<dependency_directives_scan::Token> DiscardTokens;
70106c3fb27SDimitry Andric   const char *Begin = Buffer.c_str();
70206c3fb27SDimitry Andric   Scanner PragmaScanner{StringRef(Begin, Buffer.size()), DiscardTokens, Diags,
70306c3fb27SDimitry Andric                         InputSourceLoc};
70406c3fb27SDimitry Andric 
70506c3fb27SDimitry Andric   PragmaScanner.TheLexer.setParsingPreprocessorDirective(true);
70606c3fb27SDimitry Andric   if (PragmaScanner.lexPragma(Begin, Buffer.end()))
70706c3fb27SDimitry Andric     return true;
70806c3fb27SDimitry Andric 
70906c3fb27SDimitry Andric   DirectiveKind K = PragmaScanner.topDirective();
71006c3fb27SDimitry Andric   if (K == pp_none) {
71106c3fb27SDimitry Andric     skipLine(First, End);
71206c3fb27SDimitry Andric     return false;
71306c3fb27SDimitry Andric   }
71406c3fb27SDimitry Andric 
71506c3fb27SDimitry Andric   assert(Begin == Buffer.end());
71606c3fb27SDimitry Andric   pushDirective(K);
71706c3fb27SDimitry Andric   return false;
71806c3fb27SDimitry Andric }
71906c3fb27SDimitry Andric 
lexPragma(const char * & First,const char * const End)72081ad6265SDimitry Andric bool Scanner::lexPragma(const char *&First, const char *const End) {
721bdd1243dSDimitry Andric   std::optional<StringRef> FoundId = tryLexIdentifierOrSkipLine(First, End);
72281ad6265SDimitry Andric   if (!FoundId)
72381ad6265SDimitry Andric     return false;
72481ad6265SDimitry Andric 
72581ad6265SDimitry Andric   StringRef Id = *FoundId;
72681ad6265SDimitry Andric   auto Kind = llvm::StringSwitch<DirectiveKind>(Id)
72781ad6265SDimitry Andric                   .Case("once", pp_pragma_once)
72881ad6265SDimitry Andric                   .Case("push_macro", pp_pragma_push_macro)
72981ad6265SDimitry Andric                   .Case("pop_macro", pp_pragma_pop_macro)
73081ad6265SDimitry Andric                   .Case("include_alias", pp_pragma_include_alias)
73181ad6265SDimitry Andric                   .Default(pp_none);
73281ad6265SDimitry Andric   if (Kind != pp_none) {
73381ad6265SDimitry Andric     lexPPDirectiveBody(First, End);
73481ad6265SDimitry Andric     pushDirective(Kind);
73581ad6265SDimitry Andric     return false;
73681ad6265SDimitry Andric   }
73781ad6265SDimitry Andric 
73881ad6265SDimitry Andric   if (Id != "clang") {
73981ad6265SDimitry Andric     skipLine(First, End);
74081ad6265SDimitry Andric     return false;
74181ad6265SDimitry Andric   }
74281ad6265SDimitry Andric 
74306c3fb27SDimitry Andric   FoundId = tryLexIdentifierOrSkipLine(First, End);
74406c3fb27SDimitry Andric   if (!FoundId)
74581ad6265SDimitry Andric     return false;
74606c3fb27SDimitry Andric   Id = *FoundId;
74706c3fb27SDimitry Andric 
74806c3fb27SDimitry Andric   // #pragma clang system_header
74906c3fb27SDimitry Andric   if (Id == "system_header") {
75006c3fb27SDimitry Andric     lexPPDirectiveBody(First, End);
75106c3fb27SDimitry Andric     pushDirective(pp_pragma_system_header);
75206c3fb27SDimitry Andric     return false;
75306c3fb27SDimitry Andric   }
75406c3fb27SDimitry Andric 
75506c3fb27SDimitry Andric   if (Id != "module") {
75606c3fb27SDimitry Andric     skipLine(First, End);
75706c3fb27SDimitry Andric     return false;
75806c3fb27SDimitry Andric   }
75981ad6265SDimitry Andric 
76081ad6265SDimitry Andric   // #pragma clang module.
76181ad6265SDimitry Andric   if (!isNextIdentifierOrSkipLine("import", First, End))
76281ad6265SDimitry Andric     return false;
76381ad6265SDimitry Andric 
76481ad6265SDimitry Andric   // #pragma clang module import.
76581ad6265SDimitry Andric   lexPPDirectiveBody(First, End);
76681ad6265SDimitry Andric   pushDirective(pp_pragma_import);
76781ad6265SDimitry Andric   return false;
76881ad6265SDimitry Andric }
76981ad6265SDimitry Andric 
lexEndif(const char * & First,const char * const End)77081ad6265SDimitry Andric bool Scanner::lexEndif(const char *&First, const char *const End) {
77181ad6265SDimitry Andric   // Strip out "#else" if it's empty.
77281ad6265SDimitry Andric   if (topDirective() == pp_else)
77381ad6265SDimitry Andric     popDirective();
77481ad6265SDimitry Andric 
77581ad6265SDimitry Andric   // If "#ifdef" is empty, strip it and skip the "#endif".
77681ad6265SDimitry Andric   //
77781ad6265SDimitry Andric   // FIXME: Once/if Clang starts disallowing __has_include in macro expansions,
77881ad6265SDimitry Andric   // we can skip empty `#if` and `#elif` blocks as well after scanning for a
77981ad6265SDimitry Andric   // literal __has_include in the condition.  Even without that rule we could
78081ad6265SDimitry Andric   // drop the tokens if we scan for identifiers in the condition and find none.
78181ad6265SDimitry Andric   if (topDirective() == pp_ifdef || topDirective() == pp_ifndef) {
78281ad6265SDimitry Andric     popDirective();
78381ad6265SDimitry Andric     skipLine(First, End);
78481ad6265SDimitry Andric     return false;
78581ad6265SDimitry Andric   }
78681ad6265SDimitry Andric 
78781ad6265SDimitry Andric   return lexDefault(pp_endif, First, End);
78881ad6265SDimitry Andric }
78981ad6265SDimitry Andric 
lexDefault(DirectiveKind Kind,const char * & First,const char * const End)79081ad6265SDimitry Andric bool Scanner::lexDefault(DirectiveKind Kind, const char *&First,
79181ad6265SDimitry Andric                          const char *const End) {
79281ad6265SDimitry Andric   lexPPDirectiveBody(First, End);
79381ad6265SDimitry Andric   pushDirective(Kind);
79481ad6265SDimitry Andric   return false;
79581ad6265SDimitry Andric }
79681ad6265SDimitry Andric 
/// Quick filter on the first non-whitespace character of a line: returns true
/// only for the characters that lexPPLine dispatches on ('#', '@', 'i', 'e',
/// 'm' and '_').
static bool isStartOfRelevantLine(char First) {
  return First == '#' || First == '@' || First == 'i' || First == 'e' ||
         First == 'm' || First == '_';
}
80981ad6265SDimitry Andric 
lexPPLine(const char * & First,const char * const End)81081ad6265SDimitry Andric bool Scanner::lexPPLine(const char *&First, const char *const End) {
81181ad6265SDimitry Andric   assert(First != End);
81281ad6265SDimitry Andric 
81381ad6265SDimitry Andric   skipWhitespace(First, End);
81481ad6265SDimitry Andric   assert(First <= End);
81581ad6265SDimitry Andric   if (First == End)
81681ad6265SDimitry Andric     return false;
81781ad6265SDimitry Andric 
81881ad6265SDimitry Andric   if (!isStartOfRelevantLine(*First)) {
81981ad6265SDimitry Andric     skipLine(First, End);
82081ad6265SDimitry Andric     assert(First <= End);
82181ad6265SDimitry Andric     return false;
82281ad6265SDimitry Andric   }
82381ad6265SDimitry Andric 
824bdd1243dSDimitry Andric   LastTokenPtr = First;
825bdd1243dSDimitry Andric 
82681ad6265SDimitry Andric   TheLexer.seek(getOffsetAt(First), /*IsAtStartOfLine*/ true);
82781ad6265SDimitry Andric 
82881ad6265SDimitry Andric   auto ScEx1 = make_scope_exit([&]() {
82981ad6265SDimitry Andric     /// Clear Scanner's CurDirToks before returning, in case we didn't push a
83081ad6265SDimitry Andric     /// new directive.
83181ad6265SDimitry Andric     CurDirToks.clear();
83281ad6265SDimitry Andric   });
83381ad6265SDimitry Andric 
83481ad6265SDimitry Andric   // Handle "@import".
83581ad6265SDimitry Andric   if (*First == '@')
83681ad6265SDimitry Andric     return lexAt(First, End);
83781ad6265SDimitry Andric 
83881ad6265SDimitry Andric   if (*First == 'i' || *First == 'e' || *First == 'm')
83981ad6265SDimitry Andric     return lexModule(First, End);
84081ad6265SDimitry Andric 
84106c3fb27SDimitry Andric   if (*First == '_') {
84206c3fb27SDimitry Andric     if (isNextIdentifierOrSkipLine("_Pragma", First, End))
84306c3fb27SDimitry Andric       return lex_Pragma(First, End);
84406c3fb27SDimitry Andric     return false;
84506c3fb27SDimitry Andric   }
84606c3fb27SDimitry Andric 
84781ad6265SDimitry Andric   // Handle preprocessing directives.
84881ad6265SDimitry Andric 
84981ad6265SDimitry Andric   TheLexer.setParsingPreprocessorDirective(true);
85081ad6265SDimitry Andric   auto ScEx2 = make_scope_exit(
85181ad6265SDimitry Andric       [&]() { TheLexer.setParsingPreprocessorDirective(false); });
85281ad6265SDimitry Andric 
85381ad6265SDimitry Andric   // Lex '#'.
85481ad6265SDimitry Andric   const dependency_directives_scan::Token &HashTok = lexToken(First, End);
855bdd1243dSDimitry Andric   if (HashTok.is(tok::hashhash)) {
856bdd1243dSDimitry Andric     // A \p tok::hashhash at this location is passed by the preprocessor to the
857bdd1243dSDimitry Andric     // parser to interpret, like any other token. So for dependency scanning
858bdd1243dSDimitry Andric     // skip it like a normal token not affecting the preprocessor.
859bdd1243dSDimitry Andric     skipLine(First, End);
860bdd1243dSDimitry Andric     assert(First <= End);
861bdd1243dSDimitry Andric     return false;
862bdd1243dSDimitry Andric   }
86381ad6265SDimitry Andric   assert(HashTok.is(tok::hash));
86481ad6265SDimitry Andric   (void)HashTok;
86581ad6265SDimitry Andric 
866bdd1243dSDimitry Andric   std::optional<StringRef> FoundId = tryLexIdentifierOrSkipLine(First, End);
86781ad6265SDimitry Andric   if (!FoundId)
86881ad6265SDimitry Andric     return false;
86981ad6265SDimitry Andric 
87081ad6265SDimitry Andric   StringRef Id = *FoundId;
87181ad6265SDimitry Andric 
87281ad6265SDimitry Andric   if (Id == "pragma")
87381ad6265SDimitry Andric     return lexPragma(First, End);
87481ad6265SDimitry Andric 
87581ad6265SDimitry Andric   auto Kind = llvm::StringSwitch<DirectiveKind>(Id)
87681ad6265SDimitry Andric                   .Case("include", pp_include)
87781ad6265SDimitry Andric                   .Case("__include_macros", pp___include_macros)
87881ad6265SDimitry Andric                   .Case("define", pp_define)
87981ad6265SDimitry Andric                   .Case("undef", pp_undef)
88081ad6265SDimitry Andric                   .Case("import", pp_import)
88181ad6265SDimitry Andric                   .Case("include_next", pp_include_next)
88281ad6265SDimitry Andric                   .Case("if", pp_if)
88381ad6265SDimitry Andric                   .Case("ifdef", pp_ifdef)
88481ad6265SDimitry Andric                   .Case("ifndef", pp_ifndef)
88581ad6265SDimitry Andric                   .Case("elif", pp_elif)
88681ad6265SDimitry Andric                   .Case("elifdef", pp_elifdef)
88781ad6265SDimitry Andric                   .Case("elifndef", pp_elifndef)
88881ad6265SDimitry Andric                   .Case("else", pp_else)
88981ad6265SDimitry Andric                   .Case("endif", pp_endif)
89081ad6265SDimitry Andric                   .Default(pp_none);
89181ad6265SDimitry Andric   if (Kind == pp_none) {
89281ad6265SDimitry Andric     skipDirective(Id, First, End);
89381ad6265SDimitry Andric     return false;
89481ad6265SDimitry Andric   }
89581ad6265SDimitry Andric 
89681ad6265SDimitry Andric   if (Kind == pp_endif)
89781ad6265SDimitry Andric     return lexEndif(First, End);
89881ad6265SDimitry Andric 
89981ad6265SDimitry Andric   switch (Kind) {
90081ad6265SDimitry Andric   case pp_include:
90181ad6265SDimitry Andric   case pp___include_macros:
90281ad6265SDimitry Andric   case pp_include_next:
90381ad6265SDimitry Andric   case pp_import:
90481ad6265SDimitry Andric     lexIncludeFilename(First, End);
90581ad6265SDimitry Andric     break;
90681ad6265SDimitry Andric   default:
90781ad6265SDimitry Andric     break;
90881ad6265SDimitry Andric   }
90981ad6265SDimitry Andric 
91081ad6265SDimitry Andric   // Everything else.
91181ad6265SDimitry Andric   return lexDefault(Kind, First, End);
91281ad6265SDimitry Andric }
91381ad6265SDimitry Andric 
/// If the range [First, End) begins with the three-byte UTF-8 byte order mark
/// (EF BB BF), advance \p First past it; otherwise leave \p First untouched.
static void skipUTF8ByteOrderMark(const char *&First, const char *const End) {
  // Too short to hold a BOM.
  if ((End - First) < 3)
    return;

  if (First[0] != '\xef' || First[1] != '\xbb' || First[2] != '\xbf')
    return;

  First += 3;
}
91981ad6265SDimitry Andric 
scanImpl(const char * First,const char * const End)92081ad6265SDimitry Andric bool Scanner::scanImpl(const char *First, const char *const End) {
92181ad6265SDimitry Andric   skipUTF8ByteOrderMark(First, End);
92281ad6265SDimitry Andric   while (First != End)
92381ad6265SDimitry Andric     if (lexPPLine(First, End))
92481ad6265SDimitry Andric       return true;
92581ad6265SDimitry Andric   return false;
92681ad6265SDimitry Andric }
92781ad6265SDimitry Andric 
scan(SmallVectorImpl<Directive> & Directives)92881ad6265SDimitry Andric bool Scanner::scan(SmallVectorImpl<Directive> &Directives) {
92981ad6265SDimitry Andric   bool Error = scanImpl(Input.begin(), Input.end());
93081ad6265SDimitry Andric 
93181ad6265SDimitry Andric   if (!Error) {
93281ad6265SDimitry Andric     // Add an EOF on success.
933bdd1243dSDimitry Andric     if (LastTokenPtr &&
934bdd1243dSDimitry Andric         (Tokens.empty() || LastTokenPtr > Input.begin() + Tokens.back().Offset))
935bdd1243dSDimitry Andric       pushDirective(tokens_present_before_eof);
93681ad6265SDimitry Andric     pushDirective(pp_eof);
93781ad6265SDimitry Andric   }
93881ad6265SDimitry Andric 
93981ad6265SDimitry Andric   ArrayRef<dependency_directives_scan::Token> RemainingTokens = Tokens;
94081ad6265SDimitry Andric   for (const DirectiveWithTokens &DirWithToks : DirsWithToks) {
94181ad6265SDimitry Andric     assert(RemainingTokens.size() >= DirWithToks.NumTokens);
94281ad6265SDimitry Andric     Directives.emplace_back(DirWithToks.Kind,
94381ad6265SDimitry Andric                             RemainingTokens.take_front(DirWithToks.NumTokens));
94481ad6265SDimitry Andric     RemainingTokens = RemainingTokens.drop_front(DirWithToks.NumTokens);
94581ad6265SDimitry Andric   }
94681ad6265SDimitry Andric   assert(RemainingTokens.empty());
94781ad6265SDimitry Andric 
94881ad6265SDimitry Andric   return Error;
94981ad6265SDimitry Andric }
95081ad6265SDimitry Andric 
scanSourceForDependencyDirectives(StringRef Input,SmallVectorImpl<dependency_directives_scan::Token> & Tokens,SmallVectorImpl<Directive> & Directives,DiagnosticsEngine * Diags,SourceLocation InputSourceLoc)95181ad6265SDimitry Andric bool clang::scanSourceForDependencyDirectives(
95281ad6265SDimitry Andric     StringRef Input, SmallVectorImpl<dependency_directives_scan::Token> &Tokens,
95381ad6265SDimitry Andric     SmallVectorImpl<Directive> &Directives, DiagnosticsEngine *Diags,
95481ad6265SDimitry Andric     SourceLocation InputSourceLoc) {
95581ad6265SDimitry Andric   return Scanner(Input, Tokens, Diags, InputSourceLoc).scan(Directives);
95681ad6265SDimitry Andric }
95781ad6265SDimitry Andric 
printDependencyDirectivesAsSource(StringRef Source,ArrayRef<dependency_directives_scan::Directive> Directives,llvm::raw_ostream & OS)95881ad6265SDimitry Andric void clang::printDependencyDirectivesAsSource(
95981ad6265SDimitry Andric     StringRef Source,
96081ad6265SDimitry Andric     ArrayRef<dependency_directives_scan::Directive> Directives,
96181ad6265SDimitry Andric     llvm::raw_ostream &OS) {
96281ad6265SDimitry Andric   // Add a space separator where it is convenient for testing purposes.
96381ad6265SDimitry Andric   auto needsSpaceSeparator =
96481ad6265SDimitry Andric       [](tok::TokenKind Prev,
96581ad6265SDimitry Andric          const dependency_directives_scan::Token &Tok) -> bool {
96681ad6265SDimitry Andric     if (Prev == Tok.Kind)
96781ad6265SDimitry Andric       return !Tok.isOneOf(tok::l_paren, tok::r_paren, tok::l_square,
96881ad6265SDimitry Andric                           tok::r_square);
96981ad6265SDimitry Andric     if (Prev == tok::raw_identifier &&
97081ad6265SDimitry Andric         Tok.isOneOf(tok::hash, tok::numeric_constant, tok::string_literal,
97181ad6265SDimitry Andric                     tok::char_constant, tok::header_name))
97281ad6265SDimitry Andric       return true;
97381ad6265SDimitry Andric     if (Prev == tok::r_paren &&
97481ad6265SDimitry Andric         Tok.isOneOf(tok::raw_identifier, tok::hash, tok::string_literal,
97581ad6265SDimitry Andric                     tok::char_constant, tok::unknown))
97681ad6265SDimitry Andric       return true;
97781ad6265SDimitry Andric     if (Prev == tok::comma &&
97881ad6265SDimitry Andric         Tok.isOneOf(tok::l_paren, tok::string_literal, tok::less))
97981ad6265SDimitry Andric       return true;
98081ad6265SDimitry Andric     return false;
98181ad6265SDimitry Andric   };
98281ad6265SDimitry Andric 
98381ad6265SDimitry Andric   for (const dependency_directives_scan::Directive &Directive : Directives) {
984bdd1243dSDimitry Andric     if (Directive.Kind == tokens_present_before_eof)
985bdd1243dSDimitry Andric       OS << "<TokBeforeEOF>";
986bdd1243dSDimitry Andric     std::optional<tok::TokenKind> PrevTokenKind;
98781ad6265SDimitry Andric     for (const dependency_directives_scan::Token &Tok : Directive.Tokens) {
98881ad6265SDimitry Andric       if (PrevTokenKind && needsSpaceSeparator(*PrevTokenKind, Tok))
98981ad6265SDimitry Andric         OS << ' ';
99081ad6265SDimitry Andric       PrevTokenKind = Tok.Kind;
99181ad6265SDimitry Andric       OS << Source.slice(Tok.Offset, Tok.getEnd());
99281ad6265SDimitry Andric     }
99381ad6265SDimitry Andric   }
99481ad6265SDimitry Andric }
995