10b57cec5SDimitry Andric //===--- LiteralSupport.cpp - Code to parse and process literals ----------===//
20b57cec5SDimitry Andric //
30b57cec5SDimitry Andric // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
40b57cec5SDimitry Andric // See https://llvm.org/LICENSE.txt for license information.
50b57cec5SDimitry Andric // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
60b57cec5SDimitry Andric //
70b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
80b57cec5SDimitry Andric //
90b57cec5SDimitry Andric // This file implements the NumericLiteralParser, CharLiteralParser, and
100b57cec5SDimitry Andric // StringLiteralParser interfaces.
110b57cec5SDimitry Andric //
120b57cec5SDimitry Andric //===----------------------------------------------------------------------===//
130b57cec5SDimitry Andric 
140b57cec5SDimitry Andric #include "clang/Lex/LiteralSupport.h"
150b57cec5SDimitry Andric #include "clang/Basic/CharInfo.h"
160b57cec5SDimitry Andric #include "clang/Basic/LangOptions.h"
170b57cec5SDimitry Andric #include "clang/Basic/SourceLocation.h"
180b57cec5SDimitry Andric #include "clang/Basic/TargetInfo.h"
190b57cec5SDimitry Andric #include "clang/Lex/LexDiagnostic.h"
200b57cec5SDimitry Andric #include "clang/Lex/Lexer.h"
210b57cec5SDimitry Andric #include "clang/Lex/Preprocessor.h"
220b57cec5SDimitry Andric #include "clang/Lex/Token.h"
230b57cec5SDimitry Andric #include "llvm/ADT/APInt.h"
240b57cec5SDimitry Andric #include "llvm/ADT/SmallVector.h"
250b57cec5SDimitry Andric #include "llvm/ADT/StringExtras.h"
260b57cec5SDimitry Andric #include "llvm/ADT/StringSwitch.h"
270b57cec5SDimitry Andric #include "llvm/Support/ConvertUTF.h"
285ffd83dbSDimitry Andric #include "llvm/Support/Error.h"
290b57cec5SDimitry Andric #include "llvm/Support/ErrorHandling.h"
3081ad6265SDimitry Andric #include "llvm/Support/Unicode.h"
310b57cec5SDimitry Andric #include <algorithm>
320b57cec5SDimitry Andric #include <cassert>
330b57cec5SDimitry Andric #include <cstddef>
340b57cec5SDimitry Andric #include <cstdint>
350b57cec5SDimitry Andric #include <cstring>
360b57cec5SDimitry Andric #include <string>
370b57cec5SDimitry Andric 
380b57cec5SDimitry Andric using namespace clang;
390b57cec5SDimitry Andric 
getCharWidth(tok::TokenKind kind,const TargetInfo & Target)400b57cec5SDimitry Andric static unsigned getCharWidth(tok::TokenKind kind, const TargetInfo &Target) {
410b57cec5SDimitry Andric   switch (kind) {
420b57cec5SDimitry Andric   default: llvm_unreachable("Unknown token type!");
430b57cec5SDimitry Andric   case tok::char_constant:
440b57cec5SDimitry Andric   case tok::string_literal:
450b57cec5SDimitry Andric   case tok::utf8_char_constant:
460b57cec5SDimitry Andric   case tok::utf8_string_literal:
470b57cec5SDimitry Andric     return Target.getCharWidth();
480b57cec5SDimitry Andric   case tok::wide_char_constant:
490b57cec5SDimitry Andric   case tok::wide_string_literal:
500b57cec5SDimitry Andric     return Target.getWCharWidth();
510b57cec5SDimitry Andric   case tok::utf16_char_constant:
520b57cec5SDimitry Andric   case tok::utf16_string_literal:
530b57cec5SDimitry Andric     return Target.getChar16Width();
540b57cec5SDimitry Andric   case tok::utf32_char_constant:
550b57cec5SDimitry Andric   case tok::utf32_string_literal:
560b57cec5SDimitry Andric     return Target.getChar32Width();
570b57cec5SDimitry Andric   }
580b57cec5SDimitry Andric }
590b57cec5SDimitry Andric 
getEncodingPrefixLen(tok::TokenKind kind)608a4dda33SDimitry Andric static unsigned getEncodingPrefixLen(tok::TokenKind kind) {
618a4dda33SDimitry Andric   switch (kind) {
628a4dda33SDimitry Andric   default:
638a4dda33SDimitry Andric     llvm_unreachable("Unknown token type!");
648a4dda33SDimitry Andric   case tok::char_constant:
658a4dda33SDimitry Andric   case tok::string_literal:
668a4dda33SDimitry Andric     return 0;
678a4dda33SDimitry Andric   case tok::utf8_char_constant:
688a4dda33SDimitry Andric   case tok::utf8_string_literal:
698a4dda33SDimitry Andric     return 2;
708a4dda33SDimitry Andric   case tok::wide_char_constant:
718a4dda33SDimitry Andric   case tok::wide_string_literal:
728a4dda33SDimitry Andric   case tok::utf16_char_constant:
738a4dda33SDimitry Andric   case tok::utf16_string_literal:
748a4dda33SDimitry Andric   case tok::utf32_char_constant:
758a4dda33SDimitry Andric   case tok::utf32_string_literal:
768a4dda33SDimitry Andric     return 1;
778a4dda33SDimitry Andric   }
788a4dda33SDimitry Andric }
798a4dda33SDimitry Andric 
MakeCharSourceRange(const LangOptions & Features,FullSourceLoc TokLoc,const char * TokBegin,const char * TokRangeBegin,const char * TokRangeEnd)800b57cec5SDimitry Andric static CharSourceRange MakeCharSourceRange(const LangOptions &Features,
810b57cec5SDimitry Andric                                            FullSourceLoc TokLoc,
820b57cec5SDimitry Andric                                            const char *TokBegin,
830b57cec5SDimitry Andric                                            const char *TokRangeBegin,
840b57cec5SDimitry Andric                                            const char *TokRangeEnd) {
850b57cec5SDimitry Andric   SourceLocation Begin =
860b57cec5SDimitry Andric     Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin,
870b57cec5SDimitry Andric                                    TokLoc.getManager(), Features);
880b57cec5SDimitry Andric   SourceLocation End =
890b57cec5SDimitry Andric     Lexer::AdvanceToTokenCharacter(Begin, TokRangeEnd - TokRangeBegin,
900b57cec5SDimitry Andric                                    TokLoc.getManager(), Features);
910b57cec5SDimitry Andric   return CharSourceRange::getCharRange(Begin, End);
920b57cec5SDimitry Andric }
930b57cec5SDimitry Andric 
940b57cec5SDimitry Andric /// Produce a diagnostic highlighting some portion of a literal.
950b57cec5SDimitry Andric ///
960b57cec5SDimitry Andric /// Emits the diagnostic \p DiagID, highlighting the range of characters from
970b57cec5SDimitry Andric /// \p TokRangeBegin (inclusive) to \p TokRangeEnd (exclusive), which must be
980b57cec5SDimitry Andric /// a substring of a spelling buffer for the token beginning at \p TokBegin.
Diag(DiagnosticsEngine * Diags,const LangOptions & Features,FullSourceLoc TokLoc,const char * TokBegin,const char * TokRangeBegin,const char * TokRangeEnd,unsigned DiagID)990b57cec5SDimitry Andric static DiagnosticBuilder Diag(DiagnosticsEngine *Diags,
1000b57cec5SDimitry Andric                               const LangOptions &Features, FullSourceLoc TokLoc,
1010b57cec5SDimitry Andric                               const char *TokBegin, const char *TokRangeBegin,
1020b57cec5SDimitry Andric                               const char *TokRangeEnd, unsigned DiagID) {
1030b57cec5SDimitry Andric   SourceLocation Begin =
1040b57cec5SDimitry Andric     Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin,
1050b57cec5SDimitry Andric                                    TokLoc.getManager(), Features);
1060b57cec5SDimitry Andric   return Diags->Report(Begin, DiagID) <<
1070b57cec5SDimitry Andric     MakeCharSourceRange(Features, TokLoc, TokBegin, TokRangeBegin, TokRangeEnd);
1080b57cec5SDimitry Andric }
1090b57cec5SDimitry Andric 
/// Return true if \p Escape, the character that follows the backslash, is one
/// of the simple escapes  ' " ? \ a b f n r t v.
///
/// Every other escape character (numeric escapes, 'e', 'x', 'o', ...) yields
/// false.
static bool IsEscapeValidInUnevaluatedStringLiteral(char Escape) {
  static constexpr char SimpleEscapes[] = {'\'', '"', '?', '\\', 'a', 'b',
                                           'f',  'n', 'r', 't',  'v'};
  for (char Allowed : SimpleEscapes)
    if (Escape == Allowed)
      return true;
  return false;
}
12706c3fb27SDimitry Andric 
1280b57cec5SDimitry Andric /// ProcessCharEscape - Parse a standard C escape sequence, which can occur in
1290b57cec5SDimitry Andric /// either a character or a string literal.
ProcessCharEscape(const char * ThisTokBegin,const char * & ThisTokBuf,const char * ThisTokEnd,bool & HadError,FullSourceLoc Loc,unsigned CharWidth,DiagnosticsEngine * Diags,const LangOptions & Features,StringLiteralEvalMethod EvalMethod)1300b57cec5SDimitry Andric static unsigned ProcessCharEscape(const char *ThisTokBegin,
1310b57cec5SDimitry Andric                                   const char *&ThisTokBuf,
1320b57cec5SDimitry Andric                                   const char *ThisTokEnd, bool &HadError,
1330b57cec5SDimitry Andric                                   FullSourceLoc Loc, unsigned CharWidth,
1340b57cec5SDimitry Andric                                   DiagnosticsEngine *Diags,
13506c3fb27SDimitry Andric                                   const LangOptions &Features,
13606c3fb27SDimitry Andric                                   StringLiteralEvalMethod EvalMethod) {
1370b57cec5SDimitry Andric   const char *EscapeBegin = ThisTokBuf;
138349cc55cSDimitry Andric   bool Delimited = false;
139349cc55cSDimitry Andric   bool EndDelimiterFound = false;
1400b57cec5SDimitry Andric 
1410b57cec5SDimitry Andric   // Skip the '\' char.
1420b57cec5SDimitry Andric   ++ThisTokBuf;
1430b57cec5SDimitry Andric 
1440b57cec5SDimitry Andric   // We know that this character can't be off the end of the buffer, because
1450b57cec5SDimitry Andric   // that would have been \", which would not have been the end of string.
1460b57cec5SDimitry Andric   unsigned ResultChar = *ThisTokBuf++;
14706c3fb27SDimitry Andric   char Escape = ResultChar;
1480b57cec5SDimitry Andric   switch (ResultChar) {
1490b57cec5SDimitry Andric   // These map to themselves.
1500b57cec5SDimitry Andric   case '\\': case '\'': case '"': case '?': break;
1510b57cec5SDimitry Andric 
1520b57cec5SDimitry Andric     // These have fixed mappings.
1530b57cec5SDimitry Andric   case 'a':
1540b57cec5SDimitry Andric     // TODO: K&R: the meaning of '\\a' is different in traditional C
1550b57cec5SDimitry Andric     ResultChar = 7;
1560b57cec5SDimitry Andric     break;
1570b57cec5SDimitry Andric   case 'b':
1580b57cec5SDimitry Andric     ResultChar = 8;
1590b57cec5SDimitry Andric     break;
1600b57cec5SDimitry Andric   case 'e':
1610b57cec5SDimitry Andric     if (Diags)
1620b57cec5SDimitry Andric       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
1630b57cec5SDimitry Andric            diag::ext_nonstandard_escape) << "e";
1640b57cec5SDimitry Andric     ResultChar = 27;
1650b57cec5SDimitry Andric     break;
1660b57cec5SDimitry Andric   case 'E':
1670b57cec5SDimitry Andric     if (Diags)
1680b57cec5SDimitry Andric       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
1690b57cec5SDimitry Andric            diag::ext_nonstandard_escape) << "E";
1700b57cec5SDimitry Andric     ResultChar = 27;
1710b57cec5SDimitry Andric     break;
1720b57cec5SDimitry Andric   case 'f':
1730b57cec5SDimitry Andric     ResultChar = 12;
1740b57cec5SDimitry Andric     break;
1750b57cec5SDimitry Andric   case 'n':
1760b57cec5SDimitry Andric     ResultChar = 10;
1770b57cec5SDimitry Andric     break;
1780b57cec5SDimitry Andric   case 'r':
1790b57cec5SDimitry Andric     ResultChar = 13;
1800b57cec5SDimitry Andric     break;
1810b57cec5SDimitry Andric   case 't':
1820b57cec5SDimitry Andric     ResultChar = 9;
1830b57cec5SDimitry Andric     break;
1840b57cec5SDimitry Andric   case 'v':
1850b57cec5SDimitry Andric     ResultChar = 11;
1860b57cec5SDimitry Andric     break;
1870b57cec5SDimitry Andric   case 'x': { // Hex escape.
1880b57cec5SDimitry Andric     ResultChar = 0;
189349cc55cSDimitry Andric     if (ThisTokBuf != ThisTokEnd && *ThisTokBuf == '{') {
190349cc55cSDimitry Andric       Delimited = true;
191349cc55cSDimitry Andric       ThisTokBuf++;
192349cc55cSDimitry Andric       if (*ThisTokBuf == '}') {
193349cc55cSDimitry Andric         Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
194349cc55cSDimitry Andric              diag::err_delimited_escape_empty);
195349cc55cSDimitry Andric         return ResultChar;
196349cc55cSDimitry Andric       }
197349cc55cSDimitry Andric     } else if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
1980b57cec5SDimitry Andric       if (Diags)
1990b57cec5SDimitry Andric         Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
2000b57cec5SDimitry Andric              diag::err_hex_escape_no_digits) << "x";
201349cc55cSDimitry Andric       return ResultChar;
2020b57cec5SDimitry Andric     }
2030b57cec5SDimitry Andric 
2040b57cec5SDimitry Andric     // Hex escapes are a maximal series of hex digits.
2050b57cec5SDimitry Andric     bool Overflow = false;
2060b57cec5SDimitry Andric     for (; ThisTokBuf != ThisTokEnd; ++ThisTokBuf) {
207349cc55cSDimitry Andric       if (Delimited && *ThisTokBuf == '}') {
208349cc55cSDimitry Andric         ThisTokBuf++;
209349cc55cSDimitry Andric         EndDelimiterFound = true;
210349cc55cSDimitry Andric         break;
211349cc55cSDimitry Andric       }
212349cc55cSDimitry Andric       int CharVal = llvm::hexDigitValue(*ThisTokBuf);
213349cc55cSDimitry Andric       if (CharVal == -1) {
214349cc55cSDimitry Andric         // Non delimited hex escape sequences stop at the first non-hex digit.
215349cc55cSDimitry Andric         if (!Delimited)
216349cc55cSDimitry Andric           break;
217349cc55cSDimitry Andric         HadError = true;
218349cc55cSDimitry Andric         if (Diags)
219349cc55cSDimitry Andric           Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
220349cc55cSDimitry Andric                diag::err_delimited_escape_invalid)
221349cc55cSDimitry Andric               << StringRef(ThisTokBuf, 1);
222349cc55cSDimitry Andric         continue;
223349cc55cSDimitry Andric       }
2240b57cec5SDimitry Andric       // About to shift out a digit?
2250b57cec5SDimitry Andric       if (ResultChar & 0xF0000000)
2260b57cec5SDimitry Andric         Overflow = true;
2270b57cec5SDimitry Andric       ResultChar <<= 4;
2280b57cec5SDimitry Andric       ResultChar |= CharVal;
2290b57cec5SDimitry Andric     }
2300b57cec5SDimitry Andric     // See if any bits will be truncated when evaluated as a character.
2310b57cec5SDimitry Andric     if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
2320b57cec5SDimitry Andric       Overflow = true;
2330b57cec5SDimitry Andric       ResultChar &= ~0U >> (32-CharWidth);
2340b57cec5SDimitry Andric     }
2350b57cec5SDimitry Andric 
2360b57cec5SDimitry Andric     // Check for overflow.
237349cc55cSDimitry Andric     if (!HadError && Overflow) { // Too many digits to fit in
238349cc55cSDimitry Andric       HadError = true;
239349cc55cSDimitry Andric       if (Diags)
2400b57cec5SDimitry Andric         Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
241349cc55cSDimitry Andric              diag::err_escape_too_large)
242349cc55cSDimitry Andric             << 0;
243349cc55cSDimitry Andric     }
2440b57cec5SDimitry Andric     break;
2450b57cec5SDimitry Andric   }
2460b57cec5SDimitry Andric   case '0': case '1': case '2': case '3':
2470b57cec5SDimitry Andric   case '4': case '5': case '6': case '7': {
2480b57cec5SDimitry Andric     // Octal escapes.
2490b57cec5SDimitry Andric     --ThisTokBuf;
2500b57cec5SDimitry Andric     ResultChar = 0;
2510b57cec5SDimitry Andric 
2520b57cec5SDimitry Andric     // Octal escapes are a series of octal digits with maximum length 3.
2530b57cec5SDimitry Andric     // "\0123" is a two digit sequence equal to "\012" "3".
2540b57cec5SDimitry Andric     unsigned NumDigits = 0;
2550b57cec5SDimitry Andric     do {
2560b57cec5SDimitry Andric       ResultChar <<= 3;
2570b57cec5SDimitry Andric       ResultChar |= *ThisTokBuf++ - '0';
2580b57cec5SDimitry Andric       ++NumDigits;
2590b57cec5SDimitry Andric     } while (ThisTokBuf != ThisTokEnd && NumDigits < 3 &&
2600b57cec5SDimitry Andric              ThisTokBuf[0] >= '0' && ThisTokBuf[0] <= '7');
2610b57cec5SDimitry Andric 
2620b57cec5SDimitry Andric     // Check for overflow.  Reject '\777', but not L'\777'.
2630b57cec5SDimitry Andric     if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
2640b57cec5SDimitry Andric       if (Diags)
2650b57cec5SDimitry Andric         Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
2660b57cec5SDimitry Andric              diag::err_escape_too_large) << 1;
2670b57cec5SDimitry Andric       ResultChar &= ~0U >> (32-CharWidth);
2680b57cec5SDimitry Andric     }
2690b57cec5SDimitry Andric     break;
2700b57cec5SDimitry Andric   }
271349cc55cSDimitry Andric   case 'o': {
272349cc55cSDimitry Andric     bool Overflow = false;
273349cc55cSDimitry Andric     if (ThisTokBuf == ThisTokEnd || *ThisTokBuf != '{') {
274349cc55cSDimitry Andric       HadError = true;
275349cc55cSDimitry Andric       if (Diags)
276349cc55cSDimitry Andric         Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
27781ad6265SDimitry Andric              diag::err_delimited_escape_missing_brace)
27881ad6265SDimitry Andric             << "o";
2790b57cec5SDimitry Andric 
280349cc55cSDimitry Andric       break;
281349cc55cSDimitry Andric     }
282349cc55cSDimitry Andric     ResultChar = 0;
283349cc55cSDimitry Andric     Delimited = true;
284349cc55cSDimitry Andric     ++ThisTokBuf;
285349cc55cSDimitry Andric     if (*ThisTokBuf == '}') {
286349cc55cSDimitry Andric       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
287349cc55cSDimitry Andric            diag::err_delimited_escape_empty);
288349cc55cSDimitry Andric       return ResultChar;
289349cc55cSDimitry Andric     }
290349cc55cSDimitry Andric 
291349cc55cSDimitry Andric     while (ThisTokBuf != ThisTokEnd) {
292349cc55cSDimitry Andric       if (*ThisTokBuf == '}') {
293349cc55cSDimitry Andric         EndDelimiterFound = true;
294349cc55cSDimitry Andric         ThisTokBuf++;
295349cc55cSDimitry Andric         break;
296349cc55cSDimitry Andric       }
297349cc55cSDimitry Andric       if (*ThisTokBuf < '0' || *ThisTokBuf > '7') {
298349cc55cSDimitry Andric         HadError = true;
299349cc55cSDimitry Andric         if (Diags)
300349cc55cSDimitry Andric           Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
301349cc55cSDimitry Andric                diag::err_delimited_escape_invalid)
302349cc55cSDimitry Andric               << StringRef(ThisTokBuf, 1);
303349cc55cSDimitry Andric         ThisTokBuf++;
304349cc55cSDimitry Andric         continue;
305349cc55cSDimitry Andric       }
30606c3fb27SDimitry Andric       // Check if one of the top three bits is set before shifting them out.
30706c3fb27SDimitry Andric       if (ResultChar & 0xE0000000)
308349cc55cSDimitry Andric         Overflow = true;
309349cc55cSDimitry Andric 
310349cc55cSDimitry Andric       ResultChar <<= 3;
311349cc55cSDimitry Andric       ResultChar |= *ThisTokBuf++ - '0';
312349cc55cSDimitry Andric     }
313349cc55cSDimitry Andric     // Check for overflow.  Reject '\777', but not L'\777'.
314349cc55cSDimitry Andric     if (!HadError &&
315349cc55cSDimitry Andric         (Overflow || (CharWidth != 32 && (ResultChar >> CharWidth) != 0))) {
316349cc55cSDimitry Andric       HadError = true;
317349cc55cSDimitry Andric       if (Diags)
318349cc55cSDimitry Andric         Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
319349cc55cSDimitry Andric              diag::err_escape_too_large)
320349cc55cSDimitry Andric             << 1;
321349cc55cSDimitry Andric       ResultChar &= ~0U >> (32 - CharWidth);
322349cc55cSDimitry Andric     }
323349cc55cSDimitry Andric     break;
324349cc55cSDimitry Andric   }
3250b57cec5SDimitry Andric     // Otherwise, these are not valid escapes.
3260b57cec5SDimitry Andric   case '(': case '{': case '[': case '%':
3270b57cec5SDimitry Andric     // GCC accepts these as extensions.  We warn about them as such though.
3280b57cec5SDimitry Andric     if (Diags)
3290b57cec5SDimitry Andric       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
3300b57cec5SDimitry Andric            diag::ext_nonstandard_escape)
3310b57cec5SDimitry Andric         << std::string(1, ResultChar);
3320b57cec5SDimitry Andric     break;
3330b57cec5SDimitry Andric   default:
3340b57cec5SDimitry Andric     if (!Diags)
3350b57cec5SDimitry Andric       break;
3360b57cec5SDimitry Andric 
3370b57cec5SDimitry Andric     if (isPrintable(ResultChar))
3380b57cec5SDimitry Andric       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
3390b57cec5SDimitry Andric            diag::ext_unknown_escape)
3400b57cec5SDimitry Andric         << std::string(1, ResultChar);
3410b57cec5SDimitry Andric     else
3420b57cec5SDimitry Andric       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
3430b57cec5SDimitry Andric            diag::ext_unknown_escape)
3440b57cec5SDimitry Andric         << "x" + llvm::utohexstr(ResultChar);
3450b57cec5SDimitry Andric     break;
3460b57cec5SDimitry Andric   }
3470b57cec5SDimitry Andric 
348349cc55cSDimitry Andric   if (Delimited && Diags) {
349349cc55cSDimitry Andric     if (!EndDelimiterFound)
350349cc55cSDimitry Andric       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
351349cc55cSDimitry Andric            diag::err_expected)
352349cc55cSDimitry Andric           << tok::r_brace;
353349cc55cSDimitry Andric     else if (!HadError) {
354349cc55cSDimitry Andric       Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
35506c3fb27SDimitry Andric            Features.CPlusPlus23 ? diag::warn_cxx23_delimited_escape_sequence
356753f127fSDimitry Andric                                 : diag::ext_delimited_escape_sequence)
357753f127fSDimitry Andric           << /*delimited*/ 0 << (Features.CPlusPlus ? 1 : 0);
358349cc55cSDimitry Andric     }
359349cc55cSDimitry Andric   }
360349cc55cSDimitry Andric 
36106c3fb27SDimitry Andric   if (EvalMethod == StringLiteralEvalMethod::Unevaluated &&
36206c3fb27SDimitry Andric       !IsEscapeValidInUnevaluatedStringLiteral(Escape)) {
36306c3fb27SDimitry Andric     Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
36406c3fb27SDimitry Andric          diag::err_unevaluated_string_invalid_escape_sequence)
36506c3fb27SDimitry Andric         << StringRef(EscapeBegin, ThisTokBuf - EscapeBegin);
3668a4dda33SDimitry Andric     HadError = true;
36706c3fb27SDimitry Andric   }
3688a4dda33SDimitry Andric 
3690b57cec5SDimitry Andric   return ResultChar;
3700b57cec5SDimitry Andric }
3710b57cec5SDimitry Andric 
appendCodePoint(unsigned Codepoint,llvm::SmallVectorImpl<char> & Str)3720b57cec5SDimitry Andric static void appendCodePoint(unsigned Codepoint,
3730b57cec5SDimitry Andric                             llvm::SmallVectorImpl<char> &Str) {
3740b57cec5SDimitry Andric   char ResultBuf[4];
3750b57cec5SDimitry Andric   char *ResultPtr = ResultBuf;
37681ad6265SDimitry Andric   if (llvm::ConvertCodePointToUTF8(Codepoint, ResultPtr))
3770b57cec5SDimitry Andric     Str.append(ResultBuf, ResultPtr);
3780b57cec5SDimitry Andric }
3790b57cec5SDimitry Andric 
expandUCNs(SmallVectorImpl<char> & Buf,StringRef Input)3800b57cec5SDimitry Andric void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) {
3810b57cec5SDimitry Andric   for (StringRef::iterator I = Input.begin(), E = Input.end(); I != E; ++I) {
3820b57cec5SDimitry Andric     if (*I != '\\') {
3830b57cec5SDimitry Andric       Buf.push_back(*I);
3840b57cec5SDimitry Andric       continue;
3850b57cec5SDimitry Andric     }
3860b57cec5SDimitry Andric 
3870b57cec5SDimitry Andric     ++I;
388349cc55cSDimitry Andric     char Kind = *I;
389349cc55cSDimitry Andric     ++I;
390349cc55cSDimitry Andric 
39181ad6265SDimitry Andric     assert(Kind == 'u' || Kind == 'U' || Kind == 'N');
392349cc55cSDimitry Andric     uint32_t CodePoint = 0;
393349cc55cSDimitry Andric 
394349cc55cSDimitry Andric     if (Kind == 'u' && *I == '{') {
395349cc55cSDimitry Andric       for (++I; *I != '}'; ++I) {
396349cc55cSDimitry Andric         unsigned Value = llvm::hexDigitValue(*I);
397349cc55cSDimitry Andric         assert(Value != -1U);
398349cc55cSDimitry Andric         CodePoint <<= 4;
399349cc55cSDimitry Andric         CodePoint += Value;
400349cc55cSDimitry Andric       }
401349cc55cSDimitry Andric       appendCodePoint(CodePoint, Buf);
402349cc55cSDimitry Andric       continue;
403349cc55cSDimitry Andric     }
4040b57cec5SDimitry Andric 
40581ad6265SDimitry Andric     if (Kind == 'N') {
40681ad6265SDimitry Andric       assert(*I == '{');
40781ad6265SDimitry Andric       ++I;
40881ad6265SDimitry Andric       auto Delim = std::find(I, Input.end(), '}');
40981ad6265SDimitry Andric       assert(Delim != Input.end());
4105f757f3fSDimitry Andric       StringRef Name(I, std::distance(I, Delim));
411bdd1243dSDimitry Andric       std::optional<llvm::sys::unicode::LooseMatchingResult> Res =
4125f757f3fSDimitry Andric           llvm::sys::unicode::nameToCodepointLooseMatching(Name);
4135f757f3fSDimitry Andric       assert(Res && "could not find a codepoint that was previously found");
41481ad6265SDimitry Andric       CodePoint = Res->CodePoint;
41581ad6265SDimitry Andric       assert(CodePoint != 0xFFFFFFFF);
41681ad6265SDimitry Andric       appendCodePoint(CodePoint, Buf);
41781ad6265SDimitry Andric       I = Delim;
41881ad6265SDimitry Andric       continue;
41981ad6265SDimitry Andric     }
42081ad6265SDimitry Andric 
4210b57cec5SDimitry Andric     unsigned NumHexDigits;
422349cc55cSDimitry Andric     if (Kind == 'u')
4230b57cec5SDimitry Andric       NumHexDigits = 4;
4240b57cec5SDimitry Andric     else
4250b57cec5SDimitry Andric       NumHexDigits = 8;
4260b57cec5SDimitry Andric 
4270b57cec5SDimitry Andric     assert(I + NumHexDigits <= E);
4280b57cec5SDimitry Andric 
429349cc55cSDimitry Andric     for (; NumHexDigits != 0; ++I, --NumHexDigits) {
4300b57cec5SDimitry Andric       unsigned Value = llvm::hexDigitValue(*I);
4310b57cec5SDimitry Andric       assert(Value != -1U);
4320b57cec5SDimitry Andric 
4330b57cec5SDimitry Andric       CodePoint <<= 4;
4340b57cec5SDimitry Andric       CodePoint += Value;
4350b57cec5SDimitry Andric     }
4360b57cec5SDimitry Andric 
4370b57cec5SDimitry Andric     appendCodePoint(CodePoint, Buf);
4380b57cec5SDimitry Andric     --I;
4390b57cec5SDimitry Andric   }
4400b57cec5SDimitry Andric }
4410b57cec5SDimitry Andric 
isFunctionLocalStringLiteralMacro(tok::TokenKind K,const LangOptions & LO)4425f757f3fSDimitry Andric bool clang::isFunctionLocalStringLiteralMacro(tok::TokenKind K,
4435f757f3fSDimitry Andric                                               const LangOptions &LO) {
4445f757f3fSDimitry Andric   return LO.MicrosoftExt &&
4455f757f3fSDimitry Andric          (K == tok::kw___FUNCTION__ || K == tok::kw_L__FUNCTION__ ||
4465f757f3fSDimitry Andric           K == tok::kw___FUNCSIG__ || K == tok::kw_L__FUNCSIG__ ||
4475f757f3fSDimitry Andric           K == tok::kw___FUNCDNAME__);
4485f757f3fSDimitry Andric }
4495f757f3fSDimitry Andric 
tokenIsLikeStringLiteral(const Token & Tok,const LangOptions & LO)4505f757f3fSDimitry Andric bool clang::tokenIsLikeStringLiteral(const Token &Tok, const LangOptions &LO) {
4515f757f3fSDimitry Andric   return tok::isStringLiteral(Tok.getKind()) ||
4525f757f3fSDimitry Andric          isFunctionLocalStringLiteralMacro(Tok.getKind(), LO);
4535f757f3fSDimitry Andric }
4545f757f3fSDimitry Andric 
ProcessNumericUCNEscape(const char * ThisTokBegin,const char * & ThisTokBuf,const char * ThisTokEnd,uint32_t & UcnVal,unsigned short & UcnLen,bool & Delimited,FullSourceLoc Loc,DiagnosticsEngine * Diags,const LangOptions & Features,bool in_char_string_literal=false)45581ad6265SDimitry Andric static bool ProcessNumericUCNEscape(const char *ThisTokBegin,
45681ad6265SDimitry Andric                                     const char *&ThisTokBuf,
45781ad6265SDimitry Andric                                     const char *ThisTokEnd, uint32_t &UcnVal,
45881ad6265SDimitry Andric                                     unsigned short &UcnLen, bool &Delimited,
4590b57cec5SDimitry Andric                                     FullSourceLoc Loc, DiagnosticsEngine *Diags,
4600b57cec5SDimitry Andric                                     const LangOptions &Features,
4610b57cec5SDimitry Andric                                     bool in_char_string_literal = false) {
4620b57cec5SDimitry Andric   const char *UcnBegin = ThisTokBuf;
46381ad6265SDimitry Andric   bool HasError = false;
46481ad6265SDimitry Andric   bool EndDelimiterFound = false;
4650b57cec5SDimitry Andric 
4660b57cec5SDimitry Andric   // Skip the '\u' char's.
4670b57cec5SDimitry Andric   ThisTokBuf += 2;
46881ad6265SDimitry Andric   Delimited = false;
469349cc55cSDimitry Andric   if (UcnBegin[1] == 'u' && in_char_string_literal &&
470349cc55cSDimitry Andric       ThisTokBuf != ThisTokEnd && *ThisTokBuf == '{') {
471349cc55cSDimitry Andric     Delimited = true;
472349cc55cSDimitry Andric     ThisTokBuf++;
473349cc55cSDimitry Andric   } else if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
4740b57cec5SDimitry Andric     if (Diags)
4750b57cec5SDimitry Andric       Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
47681ad6265SDimitry Andric            diag::err_hex_escape_no_digits)
47781ad6265SDimitry Andric           << StringRef(&ThisTokBuf[-1], 1);
4780b57cec5SDimitry Andric     return false;
4790b57cec5SDimitry Andric   }
4800b57cec5SDimitry Andric   UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8);
481349cc55cSDimitry Andric 
482349cc55cSDimitry Andric   bool Overflow = false;
483349cc55cSDimitry Andric   unsigned short Count = 0;
484349cc55cSDimitry Andric   for (; ThisTokBuf != ThisTokEnd && (Delimited || Count != UcnLen);
485349cc55cSDimitry Andric        ++ThisTokBuf) {
486349cc55cSDimitry Andric     if (Delimited && *ThisTokBuf == '}') {
487349cc55cSDimitry Andric       ++ThisTokBuf;
488349cc55cSDimitry Andric       EndDelimiterFound = true;
489349cc55cSDimitry Andric       break;
490349cc55cSDimitry Andric     }
491349cc55cSDimitry Andric     int CharVal = llvm::hexDigitValue(*ThisTokBuf);
492349cc55cSDimitry Andric     if (CharVal == -1) {
493349cc55cSDimitry Andric       HasError = true;
494349cc55cSDimitry Andric       if (!Delimited)
495349cc55cSDimitry Andric         break;
496349cc55cSDimitry Andric       if (Diags) {
497349cc55cSDimitry Andric         Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
498349cc55cSDimitry Andric              diag::err_delimited_escape_invalid)
499349cc55cSDimitry Andric             << StringRef(ThisTokBuf, 1);
500349cc55cSDimitry Andric       }
501349cc55cSDimitry Andric       Count++;
502349cc55cSDimitry Andric       continue;
503349cc55cSDimitry Andric     }
504349cc55cSDimitry Andric     if (UcnVal & 0xF0000000) {
505349cc55cSDimitry Andric       Overflow = true;
506349cc55cSDimitry Andric       continue;
507349cc55cSDimitry Andric     }
5080b57cec5SDimitry Andric     UcnVal <<= 4;
5090b57cec5SDimitry Andric     UcnVal |= CharVal;
510349cc55cSDimitry Andric     Count++;
5110b57cec5SDimitry Andric   }
512349cc55cSDimitry Andric 
513349cc55cSDimitry Andric   if (Overflow) {
5140b57cec5SDimitry Andric     if (Diags)
5150b57cec5SDimitry Andric       Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
516349cc55cSDimitry Andric            diag::err_escape_too_large)
517349cc55cSDimitry Andric           << 0;
5180b57cec5SDimitry Andric     return false;
5190b57cec5SDimitry Andric   }
5200b57cec5SDimitry Andric 
521349cc55cSDimitry Andric   if (Delimited && !EndDelimiterFound) {
522349cc55cSDimitry Andric     if (Diags) {
523349cc55cSDimitry Andric       Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
524349cc55cSDimitry Andric            diag::err_expected)
525349cc55cSDimitry Andric           << tok::r_brace;
526349cc55cSDimitry Andric     }
527349cc55cSDimitry Andric     return false;
528349cc55cSDimitry Andric   }
529349cc55cSDimitry Andric 
530349cc55cSDimitry Andric   // If we didn't consume the proper number of digits, there is a problem.
531349cc55cSDimitry Andric   if (Count == 0 || (!Delimited && Count != UcnLen)) {
532349cc55cSDimitry Andric     if (Diags)
533349cc55cSDimitry Andric       Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
534349cc55cSDimitry Andric            Delimited ? diag::err_delimited_escape_empty
535349cc55cSDimitry Andric                      : diag::err_ucn_escape_incomplete);
536349cc55cSDimitry Andric     return false;
537349cc55cSDimitry Andric   }
53881ad6265SDimitry Andric   return !HasError;
53981ad6265SDimitry Andric }
540349cc55cSDimitry Andric 
DiagnoseInvalidUnicodeCharacterName(DiagnosticsEngine * Diags,const LangOptions & Features,FullSourceLoc Loc,const char * TokBegin,const char * TokRangeBegin,const char * TokRangeEnd,llvm::StringRef Name)54181ad6265SDimitry Andric static void DiagnoseInvalidUnicodeCharacterName(
54281ad6265SDimitry Andric     DiagnosticsEngine *Diags, const LangOptions &Features, FullSourceLoc Loc,
54381ad6265SDimitry Andric     const char *TokBegin, const char *TokRangeBegin, const char *TokRangeEnd,
54481ad6265SDimitry Andric     llvm::StringRef Name) {
54581ad6265SDimitry Andric 
54681ad6265SDimitry Andric   Diag(Diags, Features, Loc, TokBegin, TokRangeBegin, TokRangeEnd,
54781ad6265SDimitry Andric        diag::err_invalid_ucn_name)
54881ad6265SDimitry Andric       << Name;
54981ad6265SDimitry Andric 
55081ad6265SDimitry Andric   namespace u = llvm::sys::unicode;
55181ad6265SDimitry Andric 
552bdd1243dSDimitry Andric   std::optional<u::LooseMatchingResult> Res =
55381ad6265SDimitry Andric       u::nameToCodepointLooseMatching(Name);
55481ad6265SDimitry Andric   if (Res) {
55581ad6265SDimitry Andric     Diag(Diags, Features, Loc, TokBegin, TokRangeBegin, TokRangeEnd,
55681ad6265SDimitry Andric          diag::note_invalid_ucn_name_loose_matching)
55781ad6265SDimitry Andric         << FixItHint::CreateReplacement(
55881ad6265SDimitry Andric                MakeCharSourceRange(Features, Loc, TokBegin, TokRangeBegin,
55981ad6265SDimitry Andric                                    TokRangeEnd),
56081ad6265SDimitry Andric                Res->Name);
56181ad6265SDimitry Andric     return;
56281ad6265SDimitry Andric   }
56381ad6265SDimitry Andric 
56481ad6265SDimitry Andric   unsigned Distance = 0;
56581ad6265SDimitry Andric   SmallVector<u::MatchForCodepointName> Matches =
56681ad6265SDimitry Andric       u::nearestMatchesForCodepointName(Name, 5);
56781ad6265SDimitry Andric   assert(!Matches.empty() && "No unicode characters found");
56881ad6265SDimitry Andric 
56981ad6265SDimitry Andric   for (const auto &Match : Matches) {
57081ad6265SDimitry Andric     if (Distance == 0)
57181ad6265SDimitry Andric       Distance = Match.Distance;
57281ad6265SDimitry Andric     if (std::max(Distance, Match.Distance) -
57381ad6265SDimitry Andric             std::min(Distance, Match.Distance) >
57481ad6265SDimitry Andric         3)
57581ad6265SDimitry Andric       break;
57681ad6265SDimitry Andric     Distance = Match.Distance;
57781ad6265SDimitry Andric 
57881ad6265SDimitry Andric     std::string Str;
57981ad6265SDimitry Andric     llvm::UTF32 V = Match.Value;
580bdd1243dSDimitry Andric     bool Converted =
58181ad6265SDimitry Andric         llvm::convertUTF32ToUTF8String(llvm::ArrayRef<llvm::UTF32>(&V, 1), Str);
582bdd1243dSDimitry Andric     (void)Converted;
58381ad6265SDimitry Andric     assert(Converted && "Found a match wich is not a unicode character");
58481ad6265SDimitry Andric 
58581ad6265SDimitry Andric     Diag(Diags, Features, Loc, TokBegin, TokRangeBegin, TokRangeEnd,
58681ad6265SDimitry Andric          diag::note_invalid_ucn_name_candidate)
58781ad6265SDimitry Andric         << Match.Name << llvm::utohexstr(Match.Value)
58881ad6265SDimitry Andric         << Str // FIXME: Fix the rendering of non printable characters
58981ad6265SDimitry Andric         << FixItHint::CreateReplacement(
59081ad6265SDimitry Andric                MakeCharSourceRange(Features, Loc, TokBegin, TokRangeBegin,
59181ad6265SDimitry Andric                                    TokRangeEnd),
59281ad6265SDimitry Andric                Match.Name);
59381ad6265SDimitry Andric   }
59481ad6265SDimitry Andric }
59581ad6265SDimitry Andric 
ProcessNamedUCNEscape(const char * ThisTokBegin,const char * & ThisTokBuf,const char * ThisTokEnd,uint32_t & UcnVal,unsigned short & UcnLen,FullSourceLoc Loc,DiagnosticsEngine * Diags,const LangOptions & Features)59681ad6265SDimitry Andric static bool ProcessNamedUCNEscape(const char *ThisTokBegin,
59781ad6265SDimitry Andric                                   const char *&ThisTokBuf,
59881ad6265SDimitry Andric                                   const char *ThisTokEnd, uint32_t &UcnVal,
59981ad6265SDimitry Andric                                   unsigned short &UcnLen, FullSourceLoc Loc,
60081ad6265SDimitry Andric                                   DiagnosticsEngine *Diags,
60181ad6265SDimitry Andric                                   const LangOptions &Features) {
60281ad6265SDimitry Andric   const char *UcnBegin = ThisTokBuf;
60381ad6265SDimitry Andric   assert(UcnBegin[0] == '\\' && UcnBegin[1] == 'N');
60481ad6265SDimitry Andric   ThisTokBuf += 2;
60581ad6265SDimitry Andric   if (ThisTokBuf == ThisTokEnd || *ThisTokBuf != '{') {
60681ad6265SDimitry Andric     if (Diags) {
60781ad6265SDimitry Andric       Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
60881ad6265SDimitry Andric            diag::err_delimited_escape_missing_brace)
60981ad6265SDimitry Andric           << StringRef(&ThisTokBuf[-1], 1);
61081ad6265SDimitry Andric     }
61181ad6265SDimitry Andric     return false;
61281ad6265SDimitry Andric   }
61381ad6265SDimitry Andric   ThisTokBuf++;
614bdd1243dSDimitry Andric   const char *ClosingBrace = std::find_if(ThisTokBuf, ThisTokEnd, [](char C) {
615bdd1243dSDimitry Andric     return C == '}' || isVerticalWhitespace(C);
61681ad6265SDimitry Andric   });
617bdd1243dSDimitry Andric   bool Incomplete = ClosingBrace == ThisTokEnd;
61881ad6265SDimitry Andric   bool Empty = ClosingBrace == ThisTokBuf;
61981ad6265SDimitry Andric   if (Incomplete || Empty) {
62081ad6265SDimitry Andric     if (Diags) {
62181ad6265SDimitry Andric       Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
62281ad6265SDimitry Andric            Incomplete ? diag::err_ucn_escape_incomplete
62381ad6265SDimitry Andric                       : diag::err_delimited_escape_empty)
62481ad6265SDimitry Andric           << StringRef(&UcnBegin[1], 1);
62581ad6265SDimitry Andric     }
62681ad6265SDimitry Andric     ThisTokBuf = ClosingBrace == ThisTokEnd ? ClosingBrace : ClosingBrace + 1;
62781ad6265SDimitry Andric     return false;
62881ad6265SDimitry Andric   }
62981ad6265SDimitry Andric   StringRef Name(ThisTokBuf, ClosingBrace - ThisTokBuf);
63081ad6265SDimitry Andric   ThisTokBuf = ClosingBrace + 1;
631bdd1243dSDimitry Andric   std::optional<char32_t> Res = llvm::sys::unicode::nameToCodepointStrict(Name);
63281ad6265SDimitry Andric   if (!Res) {
63381ad6265SDimitry Andric     if (Diags)
63481ad6265SDimitry Andric       DiagnoseInvalidUnicodeCharacterName(Diags, Features, Loc, ThisTokBegin,
63581ad6265SDimitry Andric                                           &UcnBegin[3], ClosingBrace, Name);
63681ad6265SDimitry Andric     return false;
63781ad6265SDimitry Andric   }
63881ad6265SDimitry Andric   UcnVal = *Res;
63981ad6265SDimitry Andric   UcnLen = UcnVal > 0xFFFF ? 8 : 4;
64081ad6265SDimitry Andric   return true;
64181ad6265SDimitry Andric }
64281ad6265SDimitry Andric 
64381ad6265SDimitry Andric /// ProcessUCNEscape - Read the Universal Character Name, check constraints and
64481ad6265SDimitry Andric /// return the UTF32.
ProcessUCNEscape(const char * ThisTokBegin,const char * & ThisTokBuf,const char * ThisTokEnd,uint32_t & UcnVal,unsigned short & UcnLen,FullSourceLoc Loc,DiagnosticsEngine * Diags,const LangOptions & Features,bool in_char_string_literal=false)64581ad6265SDimitry Andric static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
64681ad6265SDimitry Andric                              const char *ThisTokEnd, uint32_t &UcnVal,
64781ad6265SDimitry Andric                              unsigned short &UcnLen, FullSourceLoc Loc,
64881ad6265SDimitry Andric                              DiagnosticsEngine *Diags,
64981ad6265SDimitry Andric                              const LangOptions &Features,
65081ad6265SDimitry Andric                              bool in_char_string_literal = false) {
65181ad6265SDimitry Andric 
65281ad6265SDimitry Andric   bool HasError;
65381ad6265SDimitry Andric   const char *UcnBegin = ThisTokBuf;
65481ad6265SDimitry Andric   bool IsDelimitedEscapeSequence = false;
65581ad6265SDimitry Andric   bool IsNamedEscapeSequence = false;
65681ad6265SDimitry Andric   if (ThisTokBuf[1] == 'N') {
65781ad6265SDimitry Andric     IsNamedEscapeSequence = true;
65881ad6265SDimitry Andric     HasError = !ProcessNamedUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
65981ad6265SDimitry Andric                                       UcnVal, UcnLen, Loc, Diags, Features);
66081ad6265SDimitry Andric   } else {
66181ad6265SDimitry Andric     HasError =
66281ad6265SDimitry Andric         !ProcessNumericUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal,
66381ad6265SDimitry Andric                                  UcnLen, IsDelimitedEscapeSequence, Loc, Diags,
66481ad6265SDimitry Andric                                  Features, in_char_string_literal);
66581ad6265SDimitry Andric   }
666349cc55cSDimitry Andric   if (HasError)
667349cc55cSDimitry Andric     return false;
668349cc55cSDimitry Andric 
6690b57cec5SDimitry Andric   // Check UCN constraints (C99 6.4.3p2) [C++11 lex.charset p2]
6700b57cec5SDimitry Andric   if ((0xD800 <= UcnVal && UcnVal <= 0xDFFF) || // surrogate codepoints
6710b57cec5SDimitry Andric       UcnVal > 0x10FFFF) {                      // maximum legal UTF32 value
6720b57cec5SDimitry Andric     if (Diags)
6730b57cec5SDimitry Andric       Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
6740b57cec5SDimitry Andric            diag::err_ucn_escape_invalid);
6750b57cec5SDimitry Andric     return false;
6760b57cec5SDimitry Andric   }
6770b57cec5SDimitry Andric 
6785f757f3fSDimitry Andric   // C23 and C++11 allow UCNs that refer to control characters
67906c3fb27SDimitry Andric   // and basic source characters inside character and string literals
6800b57cec5SDimitry Andric   if (UcnVal < 0xa0 &&
68106c3fb27SDimitry Andric       // $, @, ` are allowed in all language modes
68206c3fb27SDimitry Andric       (UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60)) {
68306c3fb27SDimitry Andric     bool IsError =
6845f757f3fSDimitry Andric         (!(Features.CPlusPlus11 || Features.C23) || !in_char_string_literal);
6850b57cec5SDimitry Andric     if (Diags) {
6860b57cec5SDimitry Andric       char BasicSCSChar = UcnVal;
6870b57cec5SDimitry Andric       if (UcnVal >= 0x20 && UcnVal < 0x7f)
6880b57cec5SDimitry Andric         Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
68906c3fb27SDimitry Andric              IsError ? diag::err_ucn_escape_basic_scs
69006c3fb27SDimitry Andric              : Features.CPlusPlus
69106c3fb27SDimitry Andric                  ? diag::warn_cxx98_compat_literal_ucn_escape_basic_scs
6925f757f3fSDimitry Andric                  : diag::warn_c23_compat_literal_ucn_escape_basic_scs)
6930b57cec5SDimitry Andric             << StringRef(&BasicSCSChar, 1);
6940b57cec5SDimitry Andric       else
6950b57cec5SDimitry Andric         Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
69606c3fb27SDimitry Andric              IsError ? diag::err_ucn_control_character
69706c3fb27SDimitry Andric              : Features.CPlusPlus
69806c3fb27SDimitry Andric                  ? diag::warn_cxx98_compat_literal_ucn_control_character
6995f757f3fSDimitry Andric                  : diag::warn_c23_compat_literal_ucn_control_character);
7000b57cec5SDimitry Andric     }
7010b57cec5SDimitry Andric     if (IsError)
7020b57cec5SDimitry Andric       return false;
7030b57cec5SDimitry Andric   }
7040b57cec5SDimitry Andric 
7050b57cec5SDimitry Andric   if (!Features.CPlusPlus && !Features.C99 && Diags)
7060b57cec5SDimitry Andric     Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
7070b57cec5SDimitry Andric          diag::warn_ucn_not_valid_in_c89_literal);
7080b57cec5SDimitry Andric 
70981ad6265SDimitry Andric   if ((IsDelimitedEscapeSequence || IsNamedEscapeSequence) && Diags)
710349cc55cSDimitry Andric     Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
71106c3fb27SDimitry Andric          Features.CPlusPlus23 ? diag::warn_cxx23_delimited_escape_sequence
712753f127fSDimitry Andric                               : diag::ext_delimited_escape_sequence)
713753f127fSDimitry Andric         << (IsNamedEscapeSequence ? 1 : 0) << (Features.CPlusPlus ? 1 : 0);
714349cc55cSDimitry Andric 
7150b57cec5SDimitry Andric   return true;
7160b57cec5SDimitry Andric }
7170b57cec5SDimitry Andric 
7180b57cec5SDimitry Andric /// MeasureUCNEscape - Determine the number of bytes within the resulting string
7190b57cec5SDimitry Andric /// which this UCN will occupy.
MeasureUCNEscape(const char * ThisTokBegin,const char * & ThisTokBuf,const char * ThisTokEnd,unsigned CharByteWidth,const LangOptions & Features,bool & HadError)7200b57cec5SDimitry Andric static int MeasureUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
7210b57cec5SDimitry Andric                             const char *ThisTokEnd, unsigned CharByteWidth,
7220b57cec5SDimitry Andric                             const LangOptions &Features, bool &HadError) {
7230b57cec5SDimitry Andric   // UTF-32: 4 bytes per escape.
7240b57cec5SDimitry Andric   if (CharByteWidth == 4)
7250b57cec5SDimitry Andric     return 4;
7260b57cec5SDimitry Andric 
7270b57cec5SDimitry Andric   uint32_t UcnVal = 0;
7280b57cec5SDimitry Andric   unsigned short UcnLen = 0;
7290b57cec5SDimitry Andric   FullSourceLoc Loc;
7300b57cec5SDimitry Andric 
7310b57cec5SDimitry Andric   if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal,
7320b57cec5SDimitry Andric                         UcnLen, Loc, nullptr, Features, true)) {
7330b57cec5SDimitry Andric     HadError = true;
7340b57cec5SDimitry Andric     return 0;
7350b57cec5SDimitry Andric   }
7360b57cec5SDimitry Andric 
7370b57cec5SDimitry Andric   // UTF-16: 2 bytes for BMP, 4 bytes otherwise.
7380b57cec5SDimitry Andric   if (CharByteWidth == 2)
7390b57cec5SDimitry Andric     return UcnVal <= 0xFFFF ? 2 : 4;
7400b57cec5SDimitry Andric 
7410b57cec5SDimitry Andric   // UTF-8.
7420b57cec5SDimitry Andric   if (UcnVal < 0x80)
7430b57cec5SDimitry Andric     return 1;
7440b57cec5SDimitry Andric   if (UcnVal < 0x800)
7450b57cec5SDimitry Andric     return 2;
7460b57cec5SDimitry Andric   if (UcnVal < 0x10000)
7470b57cec5SDimitry Andric     return 3;
7480b57cec5SDimitry Andric   return 4;
7490b57cec5SDimitry Andric }
7500b57cec5SDimitry Andric 
7510b57cec5SDimitry Andric /// EncodeUCNEscape - Read the Universal Character Name, check constraints and
7520b57cec5SDimitry Andric /// convert the UTF32 to UTF8 or UTF16. This is a subroutine of
7530b57cec5SDimitry Andric /// StringLiteralParser. When we decide to implement UCN's for identifiers,
7540b57cec5SDimitry Andric /// we will likely rework our support for UCN's.
EncodeUCNEscape(const char * ThisTokBegin,const char * & ThisTokBuf,const char * ThisTokEnd,char * & ResultBuf,bool & HadError,FullSourceLoc Loc,unsigned CharByteWidth,DiagnosticsEngine * Diags,const LangOptions & Features)7550b57cec5SDimitry Andric static void EncodeUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
7560b57cec5SDimitry Andric                             const char *ThisTokEnd,
7570b57cec5SDimitry Andric                             char *&ResultBuf, bool &HadError,
7580b57cec5SDimitry Andric                             FullSourceLoc Loc, unsigned CharByteWidth,
7590b57cec5SDimitry Andric                             DiagnosticsEngine *Diags,
7600b57cec5SDimitry Andric                             const LangOptions &Features) {
7610b57cec5SDimitry Andric   typedef uint32_t UTF32;
7620b57cec5SDimitry Andric   UTF32 UcnVal = 0;
7630b57cec5SDimitry Andric   unsigned short UcnLen = 0;
7640b57cec5SDimitry Andric   if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal, UcnLen,
7650b57cec5SDimitry Andric                         Loc, Diags, Features, true)) {
7660b57cec5SDimitry Andric     HadError = true;
7670b57cec5SDimitry Andric     return;
7680b57cec5SDimitry Andric   }
7690b57cec5SDimitry Andric 
7700b57cec5SDimitry Andric   assert((CharByteWidth == 1 || CharByteWidth == 2 || CharByteWidth == 4) &&
7710b57cec5SDimitry Andric          "only character widths of 1, 2, or 4 bytes supported");
7720b57cec5SDimitry Andric 
7730b57cec5SDimitry Andric   (void)UcnLen;
7740b57cec5SDimitry Andric   assert((UcnLen== 4 || UcnLen== 8) && "only ucn length of 4 or 8 supported");
7750b57cec5SDimitry Andric 
7760b57cec5SDimitry Andric   if (CharByteWidth == 4) {
7770b57cec5SDimitry Andric     // FIXME: Make the type of the result buffer correct instead of
7780b57cec5SDimitry Andric     // using reinterpret_cast.
7790b57cec5SDimitry Andric     llvm::UTF32 *ResultPtr = reinterpret_cast<llvm::UTF32*>(ResultBuf);
7800b57cec5SDimitry Andric     *ResultPtr = UcnVal;
7810b57cec5SDimitry Andric     ResultBuf += 4;
7820b57cec5SDimitry Andric     return;
7830b57cec5SDimitry Andric   }
7840b57cec5SDimitry Andric 
7850b57cec5SDimitry Andric   if (CharByteWidth == 2) {
7860b57cec5SDimitry Andric     // FIXME: Make the type of the result buffer correct instead of
7870b57cec5SDimitry Andric     // using reinterpret_cast.
7880b57cec5SDimitry Andric     llvm::UTF16 *ResultPtr = reinterpret_cast<llvm::UTF16*>(ResultBuf);
7890b57cec5SDimitry Andric 
7900b57cec5SDimitry Andric     if (UcnVal <= (UTF32)0xFFFF) {
7910b57cec5SDimitry Andric       *ResultPtr = UcnVal;
7920b57cec5SDimitry Andric       ResultBuf += 2;
7930b57cec5SDimitry Andric       return;
7940b57cec5SDimitry Andric     }
7950b57cec5SDimitry Andric 
7960b57cec5SDimitry Andric     // Convert to UTF16.
7970b57cec5SDimitry Andric     UcnVal -= 0x10000;
7980b57cec5SDimitry Andric     *ResultPtr     = 0xD800 + (UcnVal >> 10);
7990b57cec5SDimitry Andric     *(ResultPtr+1) = 0xDC00 + (UcnVal & 0x3FF);
8000b57cec5SDimitry Andric     ResultBuf += 4;
8010b57cec5SDimitry Andric     return;
8020b57cec5SDimitry Andric   }
8030b57cec5SDimitry Andric 
8040b57cec5SDimitry Andric   assert(CharByteWidth == 1 && "UTF-8 encoding is only for 1 byte characters");
8050b57cec5SDimitry Andric 
8060b57cec5SDimitry Andric   // Now that we've parsed/checked the UCN, we convert from UTF32->UTF8.
8070b57cec5SDimitry Andric   // The conversion below was inspired by:
8080b57cec5SDimitry Andric   //   http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
8090b57cec5SDimitry Andric   // First, we determine how many bytes the result will require.
8100b57cec5SDimitry Andric   typedef uint8_t UTF8;
8110b57cec5SDimitry Andric 
8120b57cec5SDimitry Andric   unsigned short bytesToWrite = 0;
8130b57cec5SDimitry Andric   if (UcnVal < (UTF32)0x80)
8140b57cec5SDimitry Andric     bytesToWrite = 1;
8150b57cec5SDimitry Andric   else if (UcnVal < (UTF32)0x800)
8160b57cec5SDimitry Andric     bytesToWrite = 2;
8170b57cec5SDimitry Andric   else if (UcnVal < (UTF32)0x10000)
8180b57cec5SDimitry Andric     bytesToWrite = 3;
8190b57cec5SDimitry Andric   else
8200b57cec5SDimitry Andric     bytesToWrite = 4;
8210b57cec5SDimitry Andric 
8220b57cec5SDimitry Andric   const unsigned byteMask = 0xBF;
8230b57cec5SDimitry Andric   const unsigned byteMark = 0x80;
8240b57cec5SDimitry Andric 
8250b57cec5SDimitry Andric   // Once the bits are split out into bytes of UTF8, this is a mask OR-ed
8260b57cec5SDimitry Andric   // into the first byte, depending on how many bytes follow.
8270b57cec5SDimitry Andric   static const UTF8 firstByteMark[5] = {
8280b57cec5SDimitry Andric     0x00, 0x00, 0xC0, 0xE0, 0xF0
8290b57cec5SDimitry Andric   };
8300b57cec5SDimitry Andric   // Finally, we write the bytes into ResultBuf.
8310b57cec5SDimitry Andric   ResultBuf += bytesToWrite;
8320b57cec5SDimitry Andric   switch (bytesToWrite) { // note: everything falls through.
8330b57cec5SDimitry Andric   case 4:
8340b57cec5SDimitry Andric     *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
835bdd1243dSDimitry Andric     [[fallthrough]];
8360b57cec5SDimitry Andric   case 3:
8370b57cec5SDimitry Andric     *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
838bdd1243dSDimitry Andric     [[fallthrough]];
8390b57cec5SDimitry Andric   case 2:
8400b57cec5SDimitry Andric     *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
841bdd1243dSDimitry Andric     [[fallthrough]];
8420b57cec5SDimitry Andric   case 1:
8430b57cec5SDimitry Andric     *--ResultBuf = (UTF8) (UcnVal | firstByteMark[bytesToWrite]);
8440b57cec5SDimitry Andric   }
8450b57cec5SDimitry Andric   // Update the buffer.
8460b57cec5SDimitry Andric   ResultBuf += bytesToWrite;
8470b57cec5SDimitry Andric }
8480b57cec5SDimitry Andric 
8490b57cec5SDimitry Andric ///       integer-constant: [C99 6.4.4.1]
8500b57cec5SDimitry Andric ///         decimal-constant integer-suffix
8510b57cec5SDimitry Andric ///         octal-constant integer-suffix
8520b57cec5SDimitry Andric ///         hexadecimal-constant integer-suffix
8530b57cec5SDimitry Andric ///         binary-literal integer-suffix [GNU, C++1y]
8540b57cec5SDimitry Andric ///       user-defined-integer-literal: [C++11 lex.ext]
8550b57cec5SDimitry Andric ///         decimal-literal ud-suffix
8560b57cec5SDimitry Andric ///         octal-literal ud-suffix
8570b57cec5SDimitry Andric ///         hexadecimal-literal ud-suffix
8580b57cec5SDimitry Andric ///         binary-literal ud-suffix [GNU, C++1y]
8590b57cec5SDimitry Andric ///       decimal-constant:
8600b57cec5SDimitry Andric ///         nonzero-digit
8610b57cec5SDimitry Andric ///         decimal-constant digit
8620b57cec5SDimitry Andric ///       octal-constant:
8630b57cec5SDimitry Andric ///         0
8640b57cec5SDimitry Andric ///         octal-constant octal-digit
8650b57cec5SDimitry Andric ///       hexadecimal-constant:
8660b57cec5SDimitry Andric ///         hexadecimal-prefix hexadecimal-digit
8670b57cec5SDimitry Andric ///         hexadecimal-constant hexadecimal-digit
8680b57cec5SDimitry Andric ///       hexadecimal-prefix: one of
8690b57cec5SDimitry Andric ///         0x 0X
8700b57cec5SDimitry Andric ///       binary-literal:
8710b57cec5SDimitry Andric ///         0b binary-digit
8720b57cec5SDimitry Andric ///         0B binary-digit
8730b57cec5SDimitry Andric ///         binary-literal binary-digit
8740b57cec5SDimitry Andric ///       integer-suffix:
8750b57cec5SDimitry Andric ///         unsigned-suffix [long-suffix]
8760b57cec5SDimitry Andric ///         unsigned-suffix [long-long-suffix]
8770b57cec5SDimitry Andric ///         long-suffix [unsigned-suffix]
8780b57cec5SDimitry Andric ///         long-long-suffix [unsigned-suffix]
8790b57cec5SDimitry Andric ///       nonzero-digit:
8800b57cec5SDimitry Andric ///         1 2 3 4 5 6 7 8 9
8810b57cec5SDimitry Andric ///       octal-digit:
8820b57cec5SDimitry Andric ///         0 1 2 3 4 5 6 7
8830b57cec5SDimitry Andric ///       hexadecimal-digit:
8840b57cec5SDimitry Andric ///         0 1 2 3 4 5 6 7 8 9
8850b57cec5SDimitry Andric ///         a b c d e f
8860b57cec5SDimitry Andric ///         A B C D E F
8870b57cec5SDimitry Andric ///       binary-digit:
8880b57cec5SDimitry Andric ///         0
8890b57cec5SDimitry Andric ///         1
8900b57cec5SDimitry Andric ///       unsigned-suffix: one of
8910b57cec5SDimitry Andric ///         u U
8920b57cec5SDimitry Andric ///       long-suffix: one of
8930b57cec5SDimitry Andric ///         l L
8940b57cec5SDimitry Andric ///       long-long-suffix: one of
8950b57cec5SDimitry Andric ///         ll LL
8960b57cec5SDimitry Andric ///
8970b57cec5SDimitry Andric ///       floating-constant: [C99 6.4.4.2]
8980b57cec5SDimitry Andric ///         TODO: add rules...
8990b57cec5SDimitry Andric ///
NumericLiteralParser(StringRef TokSpelling,SourceLocation TokLoc,const SourceManager & SM,const LangOptions & LangOpts,const TargetInfo & Target,DiagnosticsEngine & Diags)9000b57cec5SDimitry Andric NumericLiteralParser::NumericLiteralParser(StringRef TokSpelling,
9010b57cec5SDimitry Andric                                            SourceLocation TokLoc,
9025ffd83dbSDimitry Andric                                            const SourceManager &SM,
9035ffd83dbSDimitry Andric                                            const LangOptions &LangOpts,
9045ffd83dbSDimitry Andric                                            const TargetInfo &Target,
9055ffd83dbSDimitry Andric                                            DiagnosticsEngine &Diags)
9065ffd83dbSDimitry Andric     : SM(SM), LangOpts(LangOpts), Diags(Diags),
9075ffd83dbSDimitry Andric       ThisTokBegin(TokSpelling.begin()), ThisTokEnd(TokSpelling.end()) {
9080b57cec5SDimitry Andric 
9090b57cec5SDimitry Andric   s = DigitsBegin = ThisTokBegin;
9100b57cec5SDimitry Andric   saw_exponent = false;
9110b57cec5SDimitry Andric   saw_period = false;
9120b57cec5SDimitry Andric   saw_ud_suffix = false;
9130b57cec5SDimitry Andric   saw_fixed_point_suffix = false;
9140b57cec5SDimitry Andric   isLong = false;
9150b57cec5SDimitry Andric   isUnsigned = false;
9160b57cec5SDimitry Andric   isLongLong = false;
917fe6060f1SDimitry Andric   isSizeT = false;
9180b57cec5SDimitry Andric   isHalf = false;
9190b57cec5SDimitry Andric   isFloat = false;
9200b57cec5SDimitry Andric   isImaginary = false;
9210b57cec5SDimitry Andric   isFloat16 = false;
9220b57cec5SDimitry Andric   isFloat128 = false;
9230b57cec5SDimitry Andric   MicrosoftInteger = 0;
9240b57cec5SDimitry Andric   isFract = false;
9250b57cec5SDimitry Andric   isAccum = false;
9260b57cec5SDimitry Andric   hadError = false;
92781ad6265SDimitry Andric   isBitInt = false;
9280b57cec5SDimitry Andric 
929349cc55cSDimitry Andric   // This routine assumes that the range begin/end matches the regex for integer
930349cc55cSDimitry Andric   // and FP constants (specifically, the 'pp-number' regex), and assumes that
931349cc55cSDimitry Andric   // the byte at "*end" is both valid and not part of the regex.  Because of
932349cc55cSDimitry Andric   // this, it doesn't have to check for 'overscan' in various places.
9335f757f3fSDimitry Andric   // Note: For HLSL, the end token is allowed to be '.' which would be in the
9345f757f3fSDimitry Andric   // 'pp-number' regex. This is required to support vector swizzles on numeric
9355f757f3fSDimitry Andric   // constants (i.e. 1.xx or 1.5f.rrr).
9365f757f3fSDimitry Andric   if (isPreprocessingNumberBody(*ThisTokEnd) &&
9375f757f3fSDimitry Andric       !(LangOpts.HLSL && *ThisTokEnd == '.')) {
938349cc55cSDimitry Andric     Diags.Report(TokLoc, diag::err_lexing_numeric);
939349cc55cSDimitry Andric     hadError = true;
940349cc55cSDimitry Andric     return;
941349cc55cSDimitry Andric   }
942349cc55cSDimitry Andric 
9430b57cec5SDimitry Andric   if (*s == '0') { // parse radix
9440b57cec5SDimitry Andric     ParseNumberStartingWithZero(TokLoc);
9450b57cec5SDimitry Andric     if (hadError)
9460b57cec5SDimitry Andric       return;
9470b57cec5SDimitry Andric   } else { // the first digit is non-zero
9480b57cec5SDimitry Andric     radix = 10;
9490b57cec5SDimitry Andric     s = SkipDigits(s);
9500b57cec5SDimitry Andric     if (s == ThisTokEnd) {
9510b57cec5SDimitry Andric       // Done.
9520b57cec5SDimitry Andric     } else {
9530b57cec5SDimitry Andric       ParseDecimalOrOctalCommon(TokLoc);
9540b57cec5SDimitry Andric       if (hadError)
9550b57cec5SDimitry Andric         return;
9560b57cec5SDimitry Andric     }
9570b57cec5SDimitry Andric   }
9580b57cec5SDimitry Andric 
9590b57cec5SDimitry Andric   SuffixBegin = s;
9600b57cec5SDimitry Andric   checkSeparator(TokLoc, s, CSK_AfterDigits);
9610b57cec5SDimitry Andric 
9620b57cec5SDimitry Andric   // Initial scan to lookahead for fixed point suffix.
9635ffd83dbSDimitry Andric   if (LangOpts.FixedPoint) {
9640b57cec5SDimitry Andric     for (const char *c = s; c != ThisTokEnd; ++c) {
9650b57cec5SDimitry Andric       if (*c == 'r' || *c == 'k' || *c == 'R' || *c == 'K') {
9660b57cec5SDimitry Andric         saw_fixed_point_suffix = true;
9670b57cec5SDimitry Andric         break;
9680b57cec5SDimitry Andric       }
9690b57cec5SDimitry Andric     }
9700b57cec5SDimitry Andric   }
9710b57cec5SDimitry Andric 
9720b57cec5SDimitry Andric   // Parse the suffix.  At this point we can classify whether we have an FP or
9730b57cec5SDimitry Andric   // integer constant.
9745ffd83dbSDimitry Andric   bool isFixedPointConstant = isFixedPointLiteral();
9750b57cec5SDimitry Andric   bool isFPConstant = isFloatingLiteral();
976fe6060f1SDimitry Andric   bool HasSize = false;
9770b57cec5SDimitry Andric 
9780b57cec5SDimitry Andric   // Loop over all of the characters of the suffix.  If we see something bad,
9790b57cec5SDimitry Andric   // we break out of the loop.
9800b57cec5SDimitry Andric   for (; s != ThisTokEnd; ++s) {
9810b57cec5SDimitry Andric     switch (*s) {
9820b57cec5SDimitry Andric     case 'R':
9830b57cec5SDimitry Andric     case 'r':
9845ffd83dbSDimitry Andric       if (!LangOpts.FixedPoint)
9855ffd83dbSDimitry Andric         break;
9860b57cec5SDimitry Andric       if (isFract || isAccum) break;
9870b57cec5SDimitry Andric       if (!(saw_period || saw_exponent)) break;
9880b57cec5SDimitry Andric       isFract = true;
9890b57cec5SDimitry Andric       continue;
9900b57cec5SDimitry Andric     case 'K':
9910b57cec5SDimitry Andric     case 'k':
9925ffd83dbSDimitry Andric       if (!LangOpts.FixedPoint)
9935ffd83dbSDimitry Andric         break;
9940b57cec5SDimitry Andric       if (isFract || isAccum) break;
9950b57cec5SDimitry Andric       if (!(saw_period || saw_exponent)) break;
9960b57cec5SDimitry Andric       isAccum = true;
9970b57cec5SDimitry Andric       continue;
9980b57cec5SDimitry Andric     case 'h':      // FP Suffix for "half".
9990b57cec5SDimitry Andric     case 'H':
10000b57cec5SDimitry Andric       // OpenCL Extension v1.2 s9.5 - h or H suffix for half type.
10015ffd83dbSDimitry Andric       if (!(LangOpts.Half || LangOpts.FixedPoint))
10025ffd83dbSDimitry Andric         break;
10030b57cec5SDimitry Andric       if (isIntegerLiteral()) break;  // Error for integer constant.
1004fe6060f1SDimitry Andric       if (HasSize)
1005fe6060f1SDimitry Andric         break;
1006fe6060f1SDimitry Andric       HasSize = true;
10070b57cec5SDimitry Andric       isHalf = true;
10080b57cec5SDimitry Andric       continue;  // Success.
10090b57cec5SDimitry Andric     case 'f':      // FP Suffix for "float"
10100b57cec5SDimitry Andric     case 'F':
10110b57cec5SDimitry Andric       if (!isFPConstant) break;  // Error for integer constant.
1012fe6060f1SDimitry Andric       if (HasSize)
1013fe6060f1SDimitry Andric         break;
1014fe6060f1SDimitry Andric       HasSize = true;
10150b57cec5SDimitry Andric 
10160b57cec5SDimitry Andric       // CUDA host and device may have different _Float16 support, therefore
10170b57cec5SDimitry Andric       // allows f16 literals to avoid false alarm.
1018bdd1243dSDimitry Andric       // When we compile for OpenMP target offloading on NVPTX, f16 suffix
1019bdd1243dSDimitry Andric       // should also be supported.
10200b57cec5SDimitry Andric       // ToDo: more precise check for CUDA.
1021bdd1243dSDimitry Andric       // TODO: AMDGPU might also support it in the future.
1022bdd1243dSDimitry Andric       if ((Target.hasFloat16Type() || LangOpts.CUDA ||
102306c3fb27SDimitry Andric            (LangOpts.OpenMPIsTargetDevice && Target.getTriple().isNVPTX())) &&
1024bdd1243dSDimitry Andric           s + 2 < ThisTokEnd && s[1] == '1' && s[2] == '6') {
10250b57cec5SDimitry Andric         s += 2; // success, eat up 2 characters.
10260b57cec5SDimitry Andric         isFloat16 = true;
10270b57cec5SDimitry Andric         continue;
10280b57cec5SDimitry Andric       }
10290b57cec5SDimitry Andric 
10300b57cec5SDimitry Andric       isFloat = true;
10310b57cec5SDimitry Andric       continue;  // Success.
10320b57cec5SDimitry Andric     case 'q':    // FP Suffix for "__float128"
10330b57cec5SDimitry Andric     case 'Q':
10340b57cec5SDimitry Andric       if (!isFPConstant) break;  // Error for integer constant.
1035fe6060f1SDimitry Andric       if (HasSize)
1036fe6060f1SDimitry Andric         break;
1037fe6060f1SDimitry Andric       HasSize = true;
10380b57cec5SDimitry Andric       isFloat128 = true;
10390b57cec5SDimitry Andric       continue;  // Success.
10400b57cec5SDimitry Andric     case 'u':
10410b57cec5SDimitry Andric     case 'U':
10420b57cec5SDimitry Andric       if (isFPConstant) break;  // Error for floating constant.
10430b57cec5SDimitry Andric       if (isUnsigned) break;    // Cannot be repeated.
10440b57cec5SDimitry Andric       isUnsigned = true;
10450b57cec5SDimitry Andric       continue;  // Success.
10460b57cec5SDimitry Andric     case 'l':
10470b57cec5SDimitry Andric     case 'L':
1048fe6060f1SDimitry Andric       if (HasSize)
1049fe6060f1SDimitry Andric         break;
1050fe6060f1SDimitry Andric       HasSize = true;
10510b57cec5SDimitry Andric 
10520b57cec5SDimitry Andric       // Check for long long.  The L's need to be adjacent and the same case.
10530b57cec5SDimitry Andric       if (s[1] == s[0]) {
10540b57cec5SDimitry Andric         assert(s + 1 < ThisTokEnd && "didn't maximally munch?");
10550b57cec5SDimitry Andric         if (isFPConstant) break;        // long long invalid for floats.
10560b57cec5SDimitry Andric         isLongLong = true;
10570b57cec5SDimitry Andric         ++s;  // Eat both of them.
10580b57cec5SDimitry Andric       } else {
10590b57cec5SDimitry Andric         isLong = true;
10600b57cec5SDimitry Andric       }
10610b57cec5SDimitry Andric       continue; // Success.
1062fe6060f1SDimitry Andric     case 'z':
1063fe6060f1SDimitry Andric     case 'Z':
1064fe6060f1SDimitry Andric       if (isFPConstant)
1065fe6060f1SDimitry Andric         break; // Invalid for floats.
1066fe6060f1SDimitry Andric       if (HasSize)
1067fe6060f1SDimitry Andric         break;
1068fe6060f1SDimitry Andric       HasSize = true;
1069fe6060f1SDimitry Andric       isSizeT = true;
1070fe6060f1SDimitry Andric       continue;
10710b57cec5SDimitry Andric     case 'i':
10720b57cec5SDimitry Andric     case 'I':
1073fe6060f1SDimitry Andric       if (LangOpts.MicrosoftExt && !isFPConstant) {
1074fe6060f1SDimitry Andric         // Allow i8, i16, i32, and i64. First, look ahead and check if
1075fe6060f1SDimitry Andric         // suffixes are Microsoft integers and not the imaginary unit.
1076fe6060f1SDimitry Andric         uint8_t Bits = 0;
1077fe6060f1SDimitry Andric         size_t ToSkip = 0;
10780b57cec5SDimitry Andric         switch (s[1]) {
1079fe6060f1SDimitry Andric         case '8': // i8 suffix
1080fe6060f1SDimitry Andric           Bits = 8;
1081fe6060f1SDimitry Andric           ToSkip = 2;
10820b57cec5SDimitry Andric           break;
10830b57cec5SDimitry Andric         case '1':
1084fe6060f1SDimitry Andric           if (s[2] == '6') { // i16 suffix
1085fe6060f1SDimitry Andric             Bits = 16;
1086fe6060f1SDimitry Andric             ToSkip = 3;
10870b57cec5SDimitry Andric           }
10880b57cec5SDimitry Andric           break;
10890b57cec5SDimitry Andric         case '3':
1090fe6060f1SDimitry Andric           if (s[2] == '2') { // i32 suffix
1091fe6060f1SDimitry Andric             Bits = 32;
1092fe6060f1SDimitry Andric             ToSkip = 3;
10930b57cec5SDimitry Andric           }
10940b57cec5SDimitry Andric           break;
10950b57cec5SDimitry Andric         case '6':
1096fe6060f1SDimitry Andric           if (s[2] == '4') { // i64 suffix
1097fe6060f1SDimitry Andric             Bits = 64;
1098fe6060f1SDimitry Andric             ToSkip = 3;
10990b57cec5SDimitry Andric           }
11000b57cec5SDimitry Andric           break;
11010b57cec5SDimitry Andric         default:
11020b57cec5SDimitry Andric           break;
11030b57cec5SDimitry Andric         }
1104fe6060f1SDimitry Andric         if (Bits) {
1105fe6060f1SDimitry Andric           if (HasSize)
1106fe6060f1SDimitry Andric             break;
1107fe6060f1SDimitry Andric           HasSize = true;
1108fe6060f1SDimitry Andric           MicrosoftInteger = Bits;
1109fe6060f1SDimitry Andric           s += ToSkip;
11100b57cec5SDimitry Andric           assert(s <= ThisTokEnd && "didn't maximally munch?");
11110b57cec5SDimitry Andric           break;
11120b57cec5SDimitry Andric         }
11130b57cec5SDimitry Andric       }
1114bdd1243dSDimitry Andric       [[fallthrough]];
11150b57cec5SDimitry Andric     case 'j':
11160b57cec5SDimitry Andric     case 'J':
11170b57cec5SDimitry Andric       if (isImaginary) break;   // Cannot be repeated.
11180b57cec5SDimitry Andric       isImaginary = true;
11190b57cec5SDimitry Andric       continue;  // Success.
112081ad6265SDimitry Andric     case 'w':
112181ad6265SDimitry Andric     case 'W':
112281ad6265SDimitry Andric       if (isFPConstant)
112381ad6265SDimitry Andric         break; // Invalid for floats.
112481ad6265SDimitry Andric       if (HasSize)
112581ad6265SDimitry Andric         break; // Invalid if we already have a size for the literal.
112681ad6265SDimitry Andric 
112781ad6265SDimitry Andric       // wb and WB are allowed, but a mixture of cases like Wb or wB is not. We
112881ad6265SDimitry Andric       // explicitly do not support the suffix in C++ as an extension because a
112981ad6265SDimitry Andric       // library-based UDL that resolves to a library type may be more
113081ad6265SDimitry Andric       // appropriate there.
113181ad6265SDimitry Andric       if (!LangOpts.CPlusPlus && ((s[0] == 'w' && s[1] == 'b') ||
113281ad6265SDimitry Andric           (s[0] == 'W' && s[1] == 'B'))) {
113381ad6265SDimitry Andric         isBitInt = true;
113481ad6265SDimitry Andric         HasSize = true;
113581ad6265SDimitry Andric         ++s; // Skip both characters (2nd char skipped on continue).
113681ad6265SDimitry Andric         continue; // Success.
113781ad6265SDimitry Andric       }
11380b57cec5SDimitry Andric     }
11390b57cec5SDimitry Andric     // If we reached here, there was an error or a ud-suffix.
11400b57cec5SDimitry Andric     break;
11410b57cec5SDimitry Andric   }
11420b57cec5SDimitry Andric 
11430b57cec5SDimitry Andric   // "i", "if", and "il" are user-defined suffixes in C++1y.
11440b57cec5SDimitry Andric   if (s != ThisTokEnd || isImaginary) {
11450b57cec5SDimitry Andric     // FIXME: Don't bother expanding UCNs if !tok.hasUCN().
11460b57cec5SDimitry Andric     expandUCNs(UDSuffixBuf, StringRef(SuffixBegin, ThisTokEnd - SuffixBegin));
11475ffd83dbSDimitry Andric     if (isValidUDSuffix(LangOpts, UDSuffixBuf)) {
11480b57cec5SDimitry Andric       if (!isImaginary) {
11490b57cec5SDimitry Andric         // Any suffix pieces we might have parsed are actually part of the
11500b57cec5SDimitry Andric         // ud-suffix.
11510b57cec5SDimitry Andric         isLong = false;
11520b57cec5SDimitry Andric         isUnsigned = false;
11530b57cec5SDimitry Andric         isLongLong = false;
1154fe6060f1SDimitry Andric         isSizeT = false;
11550b57cec5SDimitry Andric         isFloat = false;
11560b57cec5SDimitry Andric         isFloat16 = false;
11570b57cec5SDimitry Andric         isHalf = false;
11580b57cec5SDimitry Andric         isImaginary = false;
115981ad6265SDimitry Andric         isBitInt = false;
11600b57cec5SDimitry Andric         MicrosoftInteger = 0;
11610b57cec5SDimitry Andric         saw_fixed_point_suffix = false;
11620b57cec5SDimitry Andric         isFract = false;
11630b57cec5SDimitry Andric         isAccum = false;
11640b57cec5SDimitry Andric       }
11650b57cec5SDimitry Andric 
11660b57cec5SDimitry Andric       saw_ud_suffix = true;
11670b57cec5SDimitry Andric       return;
11680b57cec5SDimitry Andric     }
11690b57cec5SDimitry Andric 
11700b57cec5SDimitry Andric     if (s != ThisTokEnd) {
11710b57cec5SDimitry Andric       // Report an error if there are any.
11725ffd83dbSDimitry Andric       Diags.Report(Lexer::AdvanceToTokenCharacter(
11735ffd83dbSDimitry Andric                        TokLoc, SuffixBegin - ThisTokBegin, SM, LangOpts),
11740b57cec5SDimitry Andric                    diag::err_invalid_suffix_constant)
11755ffd83dbSDimitry Andric           << StringRef(SuffixBegin, ThisTokEnd - SuffixBegin)
11765ffd83dbSDimitry Andric           << (isFixedPointConstant ? 2 : isFPConstant);
11770b57cec5SDimitry Andric       hadError = true;
11780b57cec5SDimitry Andric     }
11790b57cec5SDimitry Andric   }
11800b57cec5SDimitry Andric 
11810b57cec5SDimitry Andric   if (!hadError && saw_fixed_point_suffix) {
11820b57cec5SDimitry Andric     assert(isFract || isAccum);
11830b57cec5SDimitry Andric   }
11840b57cec5SDimitry Andric }
11850b57cec5SDimitry Andric 
11860b57cec5SDimitry Andric /// ParseDecimalOrOctalCommon - This method is called for decimal or octal
11870b57cec5SDimitry Andric /// numbers. It issues an error for illegal digits, and handles floating point
11880b57cec5SDimitry Andric /// parsing. If it detects a floating point number, the radix is set to 10.
ParseDecimalOrOctalCommon(SourceLocation TokLoc)11890b57cec5SDimitry Andric void NumericLiteralParser::ParseDecimalOrOctalCommon(SourceLocation TokLoc){
11900b57cec5SDimitry Andric   assert((radix == 8 || radix == 10) && "Unexpected radix");
11910b57cec5SDimitry Andric 
11920b57cec5SDimitry Andric   // If we have a hex digit other than 'e' (which denotes a FP exponent) then
11930b57cec5SDimitry Andric   // the code is using an incorrect base.
11940b57cec5SDimitry Andric   if (isHexDigit(*s) && *s != 'e' && *s != 'E' &&
11955ffd83dbSDimitry Andric       !isValidUDSuffix(LangOpts, StringRef(s, ThisTokEnd - s))) {
11965ffd83dbSDimitry Andric     Diags.Report(
11975ffd83dbSDimitry Andric         Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM, LangOpts),
11985ffd83dbSDimitry Andric         diag::err_invalid_digit)
11995ffd83dbSDimitry Andric         << StringRef(s, 1) << (radix == 8 ? 1 : 0);
12000b57cec5SDimitry Andric     hadError = true;
12010b57cec5SDimitry Andric     return;
12020b57cec5SDimitry Andric   }
12030b57cec5SDimitry Andric 
12040b57cec5SDimitry Andric   if (*s == '.') {
12050b57cec5SDimitry Andric     checkSeparator(TokLoc, s, CSK_AfterDigits);
12060b57cec5SDimitry Andric     s++;
12070b57cec5SDimitry Andric     radix = 10;
12080b57cec5SDimitry Andric     saw_period = true;
12090b57cec5SDimitry Andric     checkSeparator(TokLoc, s, CSK_BeforeDigits);
12100b57cec5SDimitry Andric     s = SkipDigits(s); // Skip suffix.
12110b57cec5SDimitry Andric   }
12120b57cec5SDimitry Andric   if (*s == 'e' || *s == 'E') { // exponent
12130b57cec5SDimitry Andric     checkSeparator(TokLoc, s, CSK_AfterDigits);
12140b57cec5SDimitry Andric     const char *Exponent = s;
12150b57cec5SDimitry Andric     s++;
12160b57cec5SDimitry Andric     radix = 10;
12170b57cec5SDimitry Andric     saw_exponent = true;
12180b57cec5SDimitry Andric     if (s != ThisTokEnd && (*s == '+' || *s == '-'))  s++; // sign
12190b57cec5SDimitry Andric     const char *first_non_digit = SkipDigits(s);
12200b57cec5SDimitry Andric     if (containsDigits(s, first_non_digit)) {
12210b57cec5SDimitry Andric       checkSeparator(TokLoc, s, CSK_BeforeDigits);
12220b57cec5SDimitry Andric       s = first_non_digit;
12230b57cec5SDimitry Andric     } else {
12240b57cec5SDimitry Andric       if (!hadError) {
12255ffd83dbSDimitry Andric         Diags.Report(Lexer::AdvanceToTokenCharacter(
12265ffd83dbSDimitry Andric                          TokLoc, Exponent - ThisTokBegin, SM, LangOpts),
12270b57cec5SDimitry Andric                      diag::err_exponent_has_no_digits);
12280b57cec5SDimitry Andric         hadError = true;
12290b57cec5SDimitry Andric       }
12300b57cec5SDimitry Andric       return;
12310b57cec5SDimitry Andric     }
12320b57cec5SDimitry Andric   }
12330b57cec5SDimitry Andric }
12340b57cec5SDimitry Andric 
12350b57cec5SDimitry Andric /// Determine whether a suffix is a valid ud-suffix. We avoid treating reserved
12360b57cec5SDimitry Andric /// suffixes as ud-suffixes, because the diagnostic experience is better if we
12370b57cec5SDimitry Andric /// treat it as an invalid suffix.
isValidUDSuffix(const LangOptions & LangOpts,StringRef Suffix)12380b57cec5SDimitry Andric bool NumericLiteralParser::isValidUDSuffix(const LangOptions &LangOpts,
12390b57cec5SDimitry Andric                                            StringRef Suffix) {
12400b57cec5SDimitry Andric   if (!LangOpts.CPlusPlus11 || Suffix.empty())
12410b57cec5SDimitry Andric     return false;
12420b57cec5SDimitry Andric 
12430b57cec5SDimitry Andric   // By C++11 [lex.ext]p10, ud-suffixes starting with an '_' are always valid.
12440b57cec5SDimitry Andric   if (Suffix[0] == '_')
12450b57cec5SDimitry Andric     return true;
12460b57cec5SDimitry Andric 
12470b57cec5SDimitry Andric   // In C++11, there are no library suffixes.
12480b57cec5SDimitry Andric   if (!LangOpts.CPlusPlus14)
12490b57cec5SDimitry Andric     return false;
12500b57cec5SDimitry Andric 
12510b57cec5SDimitry Andric   // In C++14, "s", "h", "min", "ms", "us", and "ns" are used in the library.
12520b57cec5SDimitry Andric   // Per tweaked N3660, "il", "i", and "if" are also used in the library.
12530b57cec5SDimitry Andric   // In C++2a "d" and "y" are used in the library.
12540b57cec5SDimitry Andric   return llvm::StringSwitch<bool>(Suffix)
12550b57cec5SDimitry Andric       .Cases("h", "min", "s", true)
12560b57cec5SDimitry Andric       .Cases("ms", "us", "ns", true)
12570b57cec5SDimitry Andric       .Cases("il", "i", "if", true)
12585ffd83dbSDimitry Andric       .Cases("d", "y", LangOpts.CPlusPlus20)
12590b57cec5SDimitry Andric       .Default(false);
12600b57cec5SDimitry Andric }
12610b57cec5SDimitry Andric 
checkSeparator(SourceLocation TokLoc,const char * Pos,CheckSeparatorKind IsAfterDigits)12620b57cec5SDimitry Andric void NumericLiteralParser::checkSeparator(SourceLocation TokLoc,
12630b57cec5SDimitry Andric                                           const char *Pos,
12640b57cec5SDimitry Andric                                           CheckSeparatorKind IsAfterDigits) {
12650b57cec5SDimitry Andric   if (IsAfterDigits == CSK_AfterDigits) {
12660b57cec5SDimitry Andric     if (Pos == ThisTokBegin)
12670b57cec5SDimitry Andric       return;
12680b57cec5SDimitry Andric     --Pos;
12690b57cec5SDimitry Andric   } else if (Pos == ThisTokEnd)
12700b57cec5SDimitry Andric     return;
12710b57cec5SDimitry Andric 
12720b57cec5SDimitry Andric   if (isDigitSeparator(*Pos)) {
12735ffd83dbSDimitry Andric     Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, Pos - ThisTokBegin, SM,
12745ffd83dbSDimitry Andric                                                 LangOpts),
12750b57cec5SDimitry Andric                  diag::err_digit_separator_not_between_digits)
12760b57cec5SDimitry Andric         << IsAfterDigits;
12770b57cec5SDimitry Andric     hadError = true;
12780b57cec5SDimitry Andric   }
12790b57cec5SDimitry Andric }
12800b57cec5SDimitry Andric 
12810b57cec5SDimitry Andric /// ParseNumberStartingWithZero - This method is called when the first character
12820b57cec5SDimitry Andric /// of the number is found to be a zero.  This means it is either an octal
12830b57cec5SDimitry Andric /// number (like '04') or a hex number ('0x123a') a binary number ('0b1010') or
12840b57cec5SDimitry Andric /// a floating point number (01239.123e4).  Eat the prefix, determining the
12850b57cec5SDimitry Andric /// radix etc.
ParseNumberStartingWithZero(SourceLocation TokLoc)12860b57cec5SDimitry Andric void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) {
12870b57cec5SDimitry Andric   assert(s[0] == '0' && "Invalid method call");
12880b57cec5SDimitry Andric   s++;
12890b57cec5SDimitry Andric 
12900b57cec5SDimitry Andric   int c1 = s[0];
12910b57cec5SDimitry Andric 
12920b57cec5SDimitry Andric   // Handle a hex number like 0x1234.
12930b57cec5SDimitry Andric   if ((c1 == 'x' || c1 == 'X') && (isHexDigit(s[1]) || s[1] == '.')) {
12940b57cec5SDimitry Andric     s++;
12950b57cec5SDimitry Andric     assert(s < ThisTokEnd && "didn't maximally munch?");
12960b57cec5SDimitry Andric     radix = 16;
12970b57cec5SDimitry Andric     DigitsBegin = s;
12980b57cec5SDimitry Andric     s = SkipHexDigits(s);
12990b57cec5SDimitry Andric     bool HasSignificandDigits = containsDigits(DigitsBegin, s);
13000b57cec5SDimitry Andric     if (s == ThisTokEnd) {
13010b57cec5SDimitry Andric       // Done.
13020b57cec5SDimitry Andric     } else if (*s == '.') {
13030b57cec5SDimitry Andric       s++;
13040b57cec5SDimitry Andric       saw_period = true;
13050b57cec5SDimitry Andric       const char *floatDigitsBegin = s;
13060b57cec5SDimitry Andric       s = SkipHexDigits(s);
13070b57cec5SDimitry Andric       if (containsDigits(floatDigitsBegin, s))
13080b57cec5SDimitry Andric         HasSignificandDigits = true;
13090b57cec5SDimitry Andric       if (HasSignificandDigits)
13100b57cec5SDimitry Andric         checkSeparator(TokLoc, floatDigitsBegin, CSK_BeforeDigits);
13110b57cec5SDimitry Andric     }
13120b57cec5SDimitry Andric 
13130b57cec5SDimitry Andric     if (!HasSignificandDigits) {
13145ffd83dbSDimitry Andric       Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM,
13155ffd83dbSDimitry Andric                                                   LangOpts),
13160b57cec5SDimitry Andric                    diag::err_hex_constant_requires)
13175ffd83dbSDimitry Andric           << LangOpts.CPlusPlus << 1;
13180b57cec5SDimitry Andric       hadError = true;
13190b57cec5SDimitry Andric       return;
13200b57cec5SDimitry Andric     }
13210b57cec5SDimitry Andric 
13220b57cec5SDimitry Andric     // A binary exponent can appear with or with a '.'. If dotted, the
13230b57cec5SDimitry Andric     // binary exponent is required.
13240b57cec5SDimitry Andric     if (*s == 'p' || *s == 'P') {
13250b57cec5SDimitry Andric       checkSeparator(TokLoc, s, CSK_AfterDigits);
13260b57cec5SDimitry Andric       const char *Exponent = s;
13270b57cec5SDimitry Andric       s++;
13280b57cec5SDimitry Andric       saw_exponent = true;
13290b57cec5SDimitry Andric       if (s != ThisTokEnd && (*s == '+' || *s == '-'))  s++; // sign
13300b57cec5SDimitry Andric       const char *first_non_digit = SkipDigits(s);
13310b57cec5SDimitry Andric       if (!containsDigits(s, first_non_digit)) {
13320b57cec5SDimitry Andric         if (!hadError) {
13335ffd83dbSDimitry Andric           Diags.Report(Lexer::AdvanceToTokenCharacter(
13345ffd83dbSDimitry Andric                            TokLoc, Exponent - ThisTokBegin, SM, LangOpts),
13350b57cec5SDimitry Andric                        diag::err_exponent_has_no_digits);
13360b57cec5SDimitry Andric           hadError = true;
13370b57cec5SDimitry Andric         }
13380b57cec5SDimitry Andric         return;
13390b57cec5SDimitry Andric       }
13400b57cec5SDimitry Andric       checkSeparator(TokLoc, s, CSK_BeforeDigits);
13410b57cec5SDimitry Andric       s = first_non_digit;
13420b57cec5SDimitry Andric 
13435ffd83dbSDimitry Andric       if (!LangOpts.HexFloats)
13445ffd83dbSDimitry Andric         Diags.Report(TokLoc, LangOpts.CPlusPlus
13450b57cec5SDimitry Andric                                  ? diag::ext_hex_literal_invalid
13460b57cec5SDimitry Andric                                  : diag::ext_hex_constant_invalid);
13475ffd83dbSDimitry Andric       else if (LangOpts.CPlusPlus17)
13485ffd83dbSDimitry Andric         Diags.Report(TokLoc, diag::warn_cxx17_hex_literal);
13490b57cec5SDimitry Andric     } else if (saw_period) {
13505ffd83dbSDimitry Andric       Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM,
13515ffd83dbSDimitry Andric                                                   LangOpts),
13520b57cec5SDimitry Andric                    diag::err_hex_constant_requires)
13535ffd83dbSDimitry Andric           << LangOpts.CPlusPlus << 0;
13540b57cec5SDimitry Andric       hadError = true;
13550b57cec5SDimitry Andric     }
13560b57cec5SDimitry Andric     return;
13570b57cec5SDimitry Andric   }
13580b57cec5SDimitry Andric 
13590b57cec5SDimitry Andric   // Handle simple binary numbers 0b01010
13600b57cec5SDimitry Andric   if ((c1 == 'b' || c1 == 'B') && (s[1] == '0' || s[1] == '1')) {
13610b57cec5SDimitry Andric     // 0b101010 is a C++1y / GCC extension.
13625ffd83dbSDimitry Andric     Diags.Report(TokLoc, LangOpts.CPlusPlus14
13630b57cec5SDimitry Andric                              ? diag::warn_cxx11_compat_binary_literal
13645ffd83dbSDimitry Andric                          : LangOpts.CPlusPlus ? diag::ext_binary_literal_cxx14
13650b57cec5SDimitry Andric                                               : diag::ext_binary_literal);
13660b57cec5SDimitry Andric     ++s;
13670b57cec5SDimitry Andric     assert(s < ThisTokEnd && "didn't maximally munch?");
13680b57cec5SDimitry Andric     radix = 2;
13690b57cec5SDimitry Andric     DigitsBegin = s;
13700b57cec5SDimitry Andric     s = SkipBinaryDigits(s);
13710b57cec5SDimitry Andric     if (s == ThisTokEnd) {
13720b57cec5SDimitry Andric       // Done.
13730b57cec5SDimitry Andric     } else if (isHexDigit(*s) &&
13745ffd83dbSDimitry Andric                !isValidUDSuffix(LangOpts, StringRef(s, ThisTokEnd - s))) {
13755ffd83dbSDimitry Andric       Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM,
13765ffd83dbSDimitry Andric                                                   LangOpts),
13775ffd83dbSDimitry Andric                    diag::err_invalid_digit)
13785ffd83dbSDimitry Andric           << StringRef(s, 1) << 2;
13790b57cec5SDimitry Andric       hadError = true;
13800b57cec5SDimitry Andric     }
13810b57cec5SDimitry Andric     // Other suffixes will be diagnosed by the caller.
13820b57cec5SDimitry Andric     return;
13830b57cec5SDimitry Andric   }
13840b57cec5SDimitry Andric 
13850b57cec5SDimitry Andric   // For now, the radix is set to 8. If we discover that we have a
13860b57cec5SDimitry Andric   // floating point constant, the radix will change to 10. Octal floating
13870b57cec5SDimitry Andric   // point constants are not permitted (only decimal and hexadecimal).
13880b57cec5SDimitry Andric   radix = 8;
138981ad6265SDimitry Andric   const char *PossibleNewDigitStart = s;
13900b57cec5SDimitry Andric   s = SkipOctalDigits(s);
139181ad6265SDimitry Andric   // When the value is 0 followed by a suffix (like 0wb), we want to leave 0
139281ad6265SDimitry Andric   // as the start of the digits. So if skipping octal digits does not skip
139381ad6265SDimitry Andric   // anything, we leave the digit start where it was.
139481ad6265SDimitry Andric   if (s != PossibleNewDigitStart)
139581ad6265SDimitry Andric     DigitsBegin = PossibleNewDigitStart;
139681ad6265SDimitry Andric 
13970b57cec5SDimitry Andric   if (s == ThisTokEnd)
13980b57cec5SDimitry Andric     return; // Done, simple octal number like 01234
13990b57cec5SDimitry Andric 
14000b57cec5SDimitry Andric   // If we have some other non-octal digit that *is* a decimal digit, see if
14010b57cec5SDimitry Andric   // this is part of a floating point number like 094.123 or 09e1.
14020b57cec5SDimitry Andric   if (isDigit(*s)) {
14030b57cec5SDimitry Andric     const char *EndDecimal = SkipDigits(s);
14040b57cec5SDimitry Andric     if (EndDecimal[0] == '.' || EndDecimal[0] == 'e' || EndDecimal[0] == 'E') {
14050b57cec5SDimitry Andric       s = EndDecimal;
14060b57cec5SDimitry Andric       radix = 10;
14070b57cec5SDimitry Andric     }
14080b57cec5SDimitry Andric   }
14090b57cec5SDimitry Andric 
14100b57cec5SDimitry Andric   ParseDecimalOrOctalCommon(TokLoc);
14110b57cec5SDimitry Andric }
14120b57cec5SDimitry Andric 
alwaysFitsInto64Bits(unsigned Radix,unsigned NumDigits)14130b57cec5SDimitry Andric static bool alwaysFitsInto64Bits(unsigned Radix, unsigned NumDigits) {
14140b57cec5SDimitry Andric   switch (Radix) {
14150b57cec5SDimitry Andric   case 2:
14160b57cec5SDimitry Andric     return NumDigits <= 64;
14170b57cec5SDimitry Andric   case 8:
14180b57cec5SDimitry Andric     return NumDigits <= 64 / 3; // Digits are groups of 3 bits.
14190b57cec5SDimitry Andric   case 10:
14200b57cec5SDimitry Andric     return NumDigits <= 19; // floor(log10(2^64))
14210b57cec5SDimitry Andric   case 16:
14220b57cec5SDimitry Andric     return NumDigits <= 64 / 4; // Digits are groups of 4 bits.
14230b57cec5SDimitry Andric   default:
14240b57cec5SDimitry Andric     llvm_unreachable("impossible Radix");
14250b57cec5SDimitry Andric   }
14260b57cec5SDimitry Andric }
14270b57cec5SDimitry Andric 
14280b57cec5SDimitry Andric /// GetIntegerValue - Convert this numeric literal value to an APInt that
14290b57cec5SDimitry Andric /// matches Val's input width.  If there is an overflow, set Val to the low bits
14300b57cec5SDimitry Andric /// of the result and return true.  Otherwise, return false.
GetIntegerValue(llvm::APInt & Val)14310b57cec5SDimitry Andric bool NumericLiteralParser::GetIntegerValue(llvm::APInt &Val) {
14320b57cec5SDimitry Andric   // Fast path: Compute a conservative bound on the maximum number of
14330b57cec5SDimitry Andric   // bits per digit in this radix. If we can't possibly overflow a
14340b57cec5SDimitry Andric   // uint64 based on that bound then do the simple conversion to
14350b57cec5SDimitry Andric   // integer. This avoids the expensive overflow checking below, and
14360b57cec5SDimitry Andric   // handles the common cases that matter (small decimal integers and
14370b57cec5SDimitry Andric   // hex/octal values which don't overflow).
14380b57cec5SDimitry Andric   const unsigned NumDigits = SuffixBegin - DigitsBegin;
14390b57cec5SDimitry Andric   if (alwaysFitsInto64Bits(radix, NumDigits)) {
14400b57cec5SDimitry Andric     uint64_t N = 0;
14410b57cec5SDimitry Andric     for (const char *Ptr = DigitsBegin; Ptr != SuffixBegin; ++Ptr)
14420b57cec5SDimitry Andric       if (!isDigitSeparator(*Ptr))
14430b57cec5SDimitry Andric         N = N * radix + llvm::hexDigitValue(*Ptr);
14440b57cec5SDimitry Andric 
14450b57cec5SDimitry Andric     // This will truncate the value to Val's input width. Simply check
14460b57cec5SDimitry Andric     // for overflow by comparing.
14470b57cec5SDimitry Andric     Val = N;
14480b57cec5SDimitry Andric     return Val.getZExtValue() != N;
14490b57cec5SDimitry Andric   }
14500b57cec5SDimitry Andric 
14510b57cec5SDimitry Andric   Val = 0;
14520b57cec5SDimitry Andric   const char *Ptr = DigitsBegin;
14530b57cec5SDimitry Andric 
14540b57cec5SDimitry Andric   llvm::APInt RadixVal(Val.getBitWidth(), radix);
14550b57cec5SDimitry Andric   llvm::APInt CharVal(Val.getBitWidth(), 0);
14560b57cec5SDimitry Andric   llvm::APInt OldVal = Val;
14570b57cec5SDimitry Andric 
14580b57cec5SDimitry Andric   bool OverflowOccurred = false;
14590b57cec5SDimitry Andric   while (Ptr < SuffixBegin) {
14600b57cec5SDimitry Andric     if (isDigitSeparator(*Ptr)) {
14610b57cec5SDimitry Andric       ++Ptr;
14620b57cec5SDimitry Andric       continue;
14630b57cec5SDimitry Andric     }
14640b57cec5SDimitry Andric 
14650b57cec5SDimitry Andric     unsigned C = llvm::hexDigitValue(*Ptr++);
14660b57cec5SDimitry Andric 
14670b57cec5SDimitry Andric     // If this letter is out of bound for this radix, reject it.
14680b57cec5SDimitry Andric     assert(C < radix && "NumericLiteralParser ctor should have rejected this");
14690b57cec5SDimitry Andric 
14700b57cec5SDimitry Andric     CharVal = C;
14710b57cec5SDimitry Andric 
14720b57cec5SDimitry Andric     // Add the digit to the value in the appropriate radix.  If adding in digits
14730b57cec5SDimitry Andric     // made the value smaller, then this overflowed.
14740b57cec5SDimitry Andric     OldVal = Val;
14750b57cec5SDimitry Andric 
14760b57cec5SDimitry Andric     // Multiply by radix, did overflow occur on the multiply?
14770b57cec5SDimitry Andric     Val *= RadixVal;
14780b57cec5SDimitry Andric     OverflowOccurred |= Val.udiv(RadixVal) != OldVal;
14790b57cec5SDimitry Andric 
14800b57cec5SDimitry Andric     // Add value, did overflow occur on the value?
14810b57cec5SDimitry Andric     //   (a + b) ult b  <=> overflow
14820b57cec5SDimitry Andric     Val += CharVal;
14830b57cec5SDimitry Andric     OverflowOccurred |= Val.ult(CharVal);
14840b57cec5SDimitry Andric   }
14850b57cec5SDimitry Andric   return OverflowOccurred;
14860b57cec5SDimitry Andric }
14870b57cec5SDimitry Andric 
14880b57cec5SDimitry Andric llvm::APFloat::opStatus
GetFloatValue(llvm::APFloat & Result)14890b57cec5SDimitry Andric NumericLiteralParser::GetFloatValue(llvm::APFloat &Result) {
14900b57cec5SDimitry Andric   using llvm::APFloat;
14910b57cec5SDimitry Andric 
14920b57cec5SDimitry Andric   unsigned n = std::min(SuffixBegin - ThisTokBegin, ThisTokEnd - ThisTokBegin);
14930b57cec5SDimitry Andric 
14940b57cec5SDimitry Andric   llvm::SmallString<16> Buffer;
14950b57cec5SDimitry Andric   StringRef Str(ThisTokBegin, n);
1496349cc55cSDimitry Andric   if (Str.contains('\'')) {
14970b57cec5SDimitry Andric     Buffer.reserve(n);
14980b57cec5SDimitry Andric     std::remove_copy_if(Str.begin(), Str.end(), std::back_inserter(Buffer),
14990b57cec5SDimitry Andric                         &isDigitSeparator);
15000b57cec5SDimitry Andric     Str = Buffer;
15010b57cec5SDimitry Andric   }
15020b57cec5SDimitry Andric 
1503480093f4SDimitry Andric   auto StatusOrErr =
1504480093f4SDimitry Andric       Result.convertFromString(Str, APFloat::rmNearestTiesToEven);
1505480093f4SDimitry Andric   assert(StatusOrErr && "Invalid floating point representation");
1506480093f4SDimitry Andric   return !errorToBool(StatusOrErr.takeError()) ? *StatusOrErr
1507480093f4SDimitry Andric                                                : APFloat::opInvalidOp;
15080b57cec5SDimitry Andric }
15090b57cec5SDimitry Andric 
/// Returns true if \p c is an exponent marker: 'p'/'P' or 'e'/'E'.
static inline bool IsExponentPart(char c) {
  switch (c) {
  case 'p':
  case 'P':
  case 'e':
  case 'E':
    return true;
  default:
    return false;
  }
}
15130b57cec5SDimitry Andric 
GetFixedPointValue(llvm::APInt & StoreVal,unsigned Scale)15140b57cec5SDimitry Andric bool NumericLiteralParser::GetFixedPointValue(llvm::APInt &StoreVal, unsigned Scale) {
15150b57cec5SDimitry Andric   assert(radix == 16 || radix == 10);
15160b57cec5SDimitry Andric 
15170b57cec5SDimitry Andric   // Find how many digits are needed to store the whole literal.
15180b57cec5SDimitry Andric   unsigned NumDigits = SuffixBegin - DigitsBegin;
15190b57cec5SDimitry Andric   if (saw_period) --NumDigits;
15200b57cec5SDimitry Andric 
15210b57cec5SDimitry Andric   // Initial scan of the exponent if it exists
15220b57cec5SDimitry Andric   bool ExpOverflowOccurred = false;
15230b57cec5SDimitry Andric   bool NegativeExponent = false;
15240b57cec5SDimitry Andric   const char *ExponentBegin;
15250b57cec5SDimitry Andric   uint64_t Exponent = 0;
15260b57cec5SDimitry Andric   int64_t BaseShift = 0;
15270b57cec5SDimitry Andric   if (saw_exponent) {
15280b57cec5SDimitry Andric     const char *Ptr = DigitsBegin;
15290b57cec5SDimitry Andric 
15300b57cec5SDimitry Andric     while (!IsExponentPart(*Ptr)) ++Ptr;
15310b57cec5SDimitry Andric     ExponentBegin = Ptr;
15320b57cec5SDimitry Andric     ++Ptr;
15330b57cec5SDimitry Andric     NegativeExponent = *Ptr == '-';
15340b57cec5SDimitry Andric     if (NegativeExponent) ++Ptr;
15350b57cec5SDimitry Andric 
15360b57cec5SDimitry Andric     unsigned NumExpDigits = SuffixBegin - Ptr;
15370b57cec5SDimitry Andric     if (alwaysFitsInto64Bits(radix, NumExpDigits)) {
15380b57cec5SDimitry Andric       llvm::StringRef ExpStr(Ptr, NumExpDigits);
15390b57cec5SDimitry Andric       llvm::APInt ExpInt(/*numBits=*/64, ExpStr, /*radix=*/10);
15400b57cec5SDimitry Andric       Exponent = ExpInt.getZExtValue();
15410b57cec5SDimitry Andric     } else {
15420b57cec5SDimitry Andric       ExpOverflowOccurred = true;
15430b57cec5SDimitry Andric     }
15440b57cec5SDimitry Andric 
15450b57cec5SDimitry Andric     if (NegativeExponent) BaseShift -= Exponent;
15460b57cec5SDimitry Andric     else BaseShift += Exponent;
15470b57cec5SDimitry Andric   }
15480b57cec5SDimitry Andric 
15490b57cec5SDimitry Andric   // Number of bits needed for decimal literal is
15500b57cec5SDimitry Andric   //   ceil(NumDigits * log2(10))       Integral part
15510b57cec5SDimitry Andric   // + Scale                            Fractional part
15520b57cec5SDimitry Andric   // + ceil(Exponent * log2(10))        Exponent
15530b57cec5SDimitry Andric   // --------------------------------------------------
15540b57cec5SDimitry Andric   //   ceil((NumDigits + Exponent) * log2(10)) + Scale
15550b57cec5SDimitry Andric   //
15560b57cec5SDimitry Andric   // But for simplicity in handling integers, we can round up log2(10) to 4,
15570b57cec5SDimitry Andric   // making:
15580b57cec5SDimitry Andric   // 4 * (NumDigits + Exponent) + Scale
15590b57cec5SDimitry Andric   //
15600b57cec5SDimitry Andric   // Number of digits needed for hexadecimal literal is
15610b57cec5SDimitry Andric   //   4 * NumDigits                    Integral part
15620b57cec5SDimitry Andric   // + Scale                            Fractional part
15630b57cec5SDimitry Andric   // + Exponent                         Exponent
15640b57cec5SDimitry Andric   // --------------------------------------------------
15650b57cec5SDimitry Andric   //   (4 * NumDigits) + Scale + Exponent
15660b57cec5SDimitry Andric   uint64_t NumBitsNeeded;
15670b57cec5SDimitry Andric   if (radix == 10)
15680b57cec5SDimitry Andric     NumBitsNeeded = 4 * (NumDigits + Exponent) + Scale;
15690b57cec5SDimitry Andric   else
15700b57cec5SDimitry Andric     NumBitsNeeded = 4 * NumDigits + Exponent + Scale;
15710b57cec5SDimitry Andric 
15720b57cec5SDimitry Andric   if (NumBitsNeeded > std::numeric_limits<unsigned>::max())
15730b57cec5SDimitry Andric     ExpOverflowOccurred = true;
15740b57cec5SDimitry Andric   llvm::APInt Val(static_cast<unsigned>(NumBitsNeeded), 0, /*isSigned=*/false);
15750b57cec5SDimitry Andric 
15760b57cec5SDimitry Andric   bool FoundDecimal = false;
15770b57cec5SDimitry Andric 
15780b57cec5SDimitry Andric   int64_t FractBaseShift = 0;
15790b57cec5SDimitry Andric   const char *End = saw_exponent ? ExponentBegin : SuffixBegin;
15800b57cec5SDimitry Andric   for (const char *Ptr = DigitsBegin; Ptr < End; ++Ptr) {
15810b57cec5SDimitry Andric     if (*Ptr == '.') {
15820b57cec5SDimitry Andric       FoundDecimal = true;
15830b57cec5SDimitry Andric       continue;
15840b57cec5SDimitry Andric     }
15850b57cec5SDimitry Andric 
15860b57cec5SDimitry Andric     // Normal reading of an integer
15870b57cec5SDimitry Andric     unsigned C = llvm::hexDigitValue(*Ptr);
15880b57cec5SDimitry Andric     assert(C < radix && "NumericLiteralParser ctor should have rejected this");
15890b57cec5SDimitry Andric 
15900b57cec5SDimitry Andric     Val *= radix;
15910b57cec5SDimitry Andric     Val += C;
15920b57cec5SDimitry Andric 
15930b57cec5SDimitry Andric     if (FoundDecimal)
15940b57cec5SDimitry Andric       // Keep track of how much we will need to adjust this value by from the
15950b57cec5SDimitry Andric       // number of digits past the radix point.
15960b57cec5SDimitry Andric       --FractBaseShift;
15970b57cec5SDimitry Andric   }
15980b57cec5SDimitry Andric 
15990b57cec5SDimitry Andric   // For a radix of 16, we will be multiplying by 2 instead of 16.
16000b57cec5SDimitry Andric   if (radix == 16) FractBaseShift *= 4;
16010b57cec5SDimitry Andric   BaseShift += FractBaseShift;
16020b57cec5SDimitry Andric 
16030b57cec5SDimitry Andric   Val <<= Scale;
16040b57cec5SDimitry Andric 
16050b57cec5SDimitry Andric   uint64_t Base = (radix == 16) ? 2 : 10;
16060b57cec5SDimitry Andric   if (BaseShift > 0) {
16070b57cec5SDimitry Andric     for (int64_t i = 0; i < BaseShift; ++i) {
16080b57cec5SDimitry Andric       Val *= Base;
16090b57cec5SDimitry Andric     }
16100b57cec5SDimitry Andric   } else if (BaseShift < 0) {
1611349cc55cSDimitry Andric     for (int64_t i = BaseShift; i < 0 && !Val.isZero(); ++i)
16120b57cec5SDimitry Andric       Val = Val.udiv(Base);
16130b57cec5SDimitry Andric   }
16140b57cec5SDimitry Andric 
16150b57cec5SDimitry Andric   bool IntOverflowOccurred = false;
16160b57cec5SDimitry Andric   auto MaxVal = llvm::APInt::getMaxValue(StoreVal.getBitWidth());
16170b57cec5SDimitry Andric   if (Val.getBitWidth() > StoreVal.getBitWidth()) {
16180b57cec5SDimitry Andric     IntOverflowOccurred |= Val.ugt(MaxVal.zext(Val.getBitWidth()));
16190b57cec5SDimitry Andric     StoreVal = Val.trunc(StoreVal.getBitWidth());
16200b57cec5SDimitry Andric   } else if (Val.getBitWidth() < StoreVal.getBitWidth()) {
16210b57cec5SDimitry Andric     IntOverflowOccurred |= Val.zext(MaxVal.getBitWidth()).ugt(MaxVal);
16220b57cec5SDimitry Andric     StoreVal = Val.zext(StoreVal.getBitWidth());
16230b57cec5SDimitry Andric   } else {
16240b57cec5SDimitry Andric     StoreVal = Val;
16250b57cec5SDimitry Andric   }
16260b57cec5SDimitry Andric 
16270b57cec5SDimitry Andric   return IntOverflowOccurred || ExpOverflowOccurred;
16280b57cec5SDimitry Andric }
16290b57cec5SDimitry Andric 
16300b57cec5SDimitry Andric /// \verbatim
16310b57cec5SDimitry Andric ///       user-defined-character-literal: [C++11 lex.ext]
16320b57cec5SDimitry Andric ///         character-literal ud-suffix
16330b57cec5SDimitry Andric ///       ud-suffix:
16340b57cec5SDimitry Andric ///         identifier
16350b57cec5SDimitry Andric ///       character-literal: [C++11 lex.ccon]
16360b57cec5SDimitry Andric ///         ' c-char-sequence '
16370b57cec5SDimitry Andric ///         u' c-char-sequence '
16380b57cec5SDimitry Andric ///         U' c-char-sequence '
16390b57cec5SDimitry Andric ///         L' c-char-sequence '
16400b57cec5SDimitry Andric ///         u8' c-char-sequence ' [C++1z lex.ccon]
16410b57cec5SDimitry Andric ///       c-char-sequence:
16420b57cec5SDimitry Andric ///         c-char
16430b57cec5SDimitry Andric ///         c-char-sequence c-char
16440b57cec5SDimitry Andric ///       c-char:
16450b57cec5SDimitry Andric ///         any member of the source character set except the single-quote ',
16460b57cec5SDimitry Andric ///           backslash \, or new-line character
16470b57cec5SDimitry Andric ///         escape-sequence
16480b57cec5SDimitry Andric ///         universal-character-name
16490b57cec5SDimitry Andric ///       escape-sequence:
16500b57cec5SDimitry Andric ///         simple-escape-sequence
16510b57cec5SDimitry Andric ///         octal-escape-sequence
16520b57cec5SDimitry Andric ///         hexadecimal-escape-sequence
16530b57cec5SDimitry Andric ///       simple-escape-sequence:
16540b57cec5SDimitry Andric ///         one of \' \" \? \\ \a \b \f \n \r \t \v
16550b57cec5SDimitry Andric ///       octal-escape-sequence:
16560b57cec5SDimitry Andric ///         \ octal-digit
16570b57cec5SDimitry Andric ///         \ octal-digit octal-digit
16580b57cec5SDimitry Andric ///         \ octal-digit octal-digit octal-digit
16590b57cec5SDimitry Andric ///       hexadecimal-escape-sequence:
16600b57cec5SDimitry Andric ///         \x hexadecimal-digit
16610b57cec5SDimitry Andric ///         hexadecimal-escape-sequence hexadecimal-digit
16620b57cec5SDimitry Andric ///       universal-character-name: [C++11 lex.charset]
16630b57cec5SDimitry Andric ///         \u hex-quad
16640b57cec5SDimitry Andric ///         \U hex-quad hex-quad
16650b57cec5SDimitry Andric ///       hex-quad:
16660b57cec5SDimitry Andric ///         hex-digit hex-digit hex-digit hex-digit
16670b57cec5SDimitry Andric /// \endverbatim
16680b57cec5SDimitry Andric ///
CharLiteralParser(const char * begin,const char * end,SourceLocation Loc,Preprocessor & PP,tok::TokenKind kind)16690b57cec5SDimitry Andric CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
16700b57cec5SDimitry Andric                                      SourceLocation Loc, Preprocessor &PP,
16710b57cec5SDimitry Andric                                      tok::TokenKind kind) {
16720b57cec5SDimitry Andric   // At this point we know that the character matches the regex "(L|u|U)?'.*'".
16730b57cec5SDimitry Andric   HadError = false;
16740b57cec5SDimitry Andric 
16750b57cec5SDimitry Andric   Kind = kind;
16760b57cec5SDimitry Andric 
16770b57cec5SDimitry Andric   const char *TokBegin = begin;
16780b57cec5SDimitry Andric 
16790b57cec5SDimitry Andric   // Skip over wide character determinant.
16800b57cec5SDimitry Andric   if (Kind != tok::char_constant)
16810b57cec5SDimitry Andric     ++begin;
16820b57cec5SDimitry Andric   if (Kind == tok::utf8_char_constant)
16830b57cec5SDimitry Andric     ++begin;
16840b57cec5SDimitry Andric 
16850b57cec5SDimitry Andric   // Skip over the entry quote.
1686349cc55cSDimitry Andric   if (begin[0] != '\'') {
1687349cc55cSDimitry Andric     PP.Diag(Loc, diag::err_lexing_char);
1688349cc55cSDimitry Andric     HadError = true;
1689349cc55cSDimitry Andric     return;
1690349cc55cSDimitry Andric   }
1691349cc55cSDimitry Andric 
16920b57cec5SDimitry Andric   ++begin;
16930b57cec5SDimitry Andric 
16940b57cec5SDimitry Andric   // Remove an optional ud-suffix.
16950b57cec5SDimitry Andric   if (end[-1] != '\'') {
16960b57cec5SDimitry Andric     const char *UDSuffixEnd = end;
16970b57cec5SDimitry Andric     do {
16980b57cec5SDimitry Andric       --end;
16990b57cec5SDimitry Andric     } while (end[-1] != '\'');
17000b57cec5SDimitry Andric     // FIXME: Don't bother with this if !tok.hasUCN().
17010b57cec5SDimitry Andric     expandUCNs(UDSuffixBuf, StringRef(end, UDSuffixEnd - end));
17020b57cec5SDimitry Andric     UDSuffixOffset = end - TokBegin;
17030b57cec5SDimitry Andric   }
17040b57cec5SDimitry Andric 
17050b57cec5SDimitry Andric   // Trim the ending quote.
17060b57cec5SDimitry Andric   assert(end != begin && "Invalid token lexed");
17070b57cec5SDimitry Andric   --end;
17080b57cec5SDimitry Andric 
17090b57cec5SDimitry Andric   // FIXME: The "Value" is an uint64_t so we can handle char literals of
17100b57cec5SDimitry Andric   // up to 64-bits.
17110b57cec5SDimitry Andric   // FIXME: This extensively assumes that 'char' is 8-bits.
17120b57cec5SDimitry Andric   assert(PP.getTargetInfo().getCharWidth() == 8 &&
17130b57cec5SDimitry Andric          "Assumes char is 8 bits");
17140b57cec5SDimitry Andric   assert(PP.getTargetInfo().getIntWidth() <= 64 &&
17150b57cec5SDimitry Andric          (PP.getTargetInfo().getIntWidth() & 7) == 0 &&
17160b57cec5SDimitry Andric          "Assumes sizeof(int) on target is <= 64 and a multiple of char");
17170b57cec5SDimitry Andric   assert(PP.getTargetInfo().getWCharWidth() <= 64 &&
17180b57cec5SDimitry Andric          "Assumes sizeof(wchar) on target is <= 64");
17190b57cec5SDimitry Andric 
17200b57cec5SDimitry Andric   SmallVector<uint32_t, 4> codepoint_buffer;
17210b57cec5SDimitry Andric   codepoint_buffer.resize(end - begin);
17220b57cec5SDimitry Andric   uint32_t *buffer_begin = &codepoint_buffer.front();
17230b57cec5SDimitry Andric   uint32_t *buffer_end = buffer_begin + codepoint_buffer.size();
17240b57cec5SDimitry Andric 
17250b57cec5SDimitry Andric   // Unicode escapes representing characters that cannot be correctly
17260b57cec5SDimitry Andric   // represented in a single code unit are disallowed in character literals
17270b57cec5SDimitry Andric   // by this implementation.
17280b57cec5SDimitry Andric   uint32_t largest_character_for_kind;
17290b57cec5SDimitry Andric   if (tok::wide_char_constant == Kind) {
17300b57cec5SDimitry Andric     largest_character_for_kind =
17310b57cec5SDimitry Andric         0xFFFFFFFFu >> (32-PP.getTargetInfo().getWCharWidth());
17320b57cec5SDimitry Andric   } else if (tok::utf8_char_constant == Kind) {
17330b57cec5SDimitry Andric     largest_character_for_kind = 0x7F;
17340b57cec5SDimitry Andric   } else if (tok::utf16_char_constant == Kind) {
17350b57cec5SDimitry Andric     largest_character_for_kind = 0xFFFF;
17360b57cec5SDimitry Andric   } else if (tok::utf32_char_constant == Kind) {
17370b57cec5SDimitry Andric     largest_character_for_kind = 0x10FFFF;
17380b57cec5SDimitry Andric   } else {
17390b57cec5SDimitry Andric     largest_character_for_kind = 0x7Fu;
17400b57cec5SDimitry Andric   }
17410b57cec5SDimitry Andric 
17420b57cec5SDimitry Andric   while (begin != end) {
17430b57cec5SDimitry Andric     // Is this a span of non-escape characters?
17440b57cec5SDimitry Andric     if (begin[0] != '\\') {
17450b57cec5SDimitry Andric       char const *start = begin;
17460b57cec5SDimitry Andric       do {
17470b57cec5SDimitry Andric         ++begin;
17480b57cec5SDimitry Andric       } while (begin != end && *begin != '\\');
17490b57cec5SDimitry Andric 
17500b57cec5SDimitry Andric       char const *tmp_in_start = start;
17510b57cec5SDimitry Andric       uint32_t *tmp_out_start = buffer_begin;
17520b57cec5SDimitry Andric       llvm::ConversionResult res =
17530b57cec5SDimitry Andric           llvm::ConvertUTF8toUTF32(reinterpret_cast<llvm::UTF8 const **>(&start),
17540b57cec5SDimitry Andric                              reinterpret_cast<llvm::UTF8 const *>(begin),
17550b57cec5SDimitry Andric                              &buffer_begin, buffer_end, llvm::strictConversion);
17560b57cec5SDimitry Andric       if (res != llvm::conversionOK) {
17570b57cec5SDimitry Andric         // If we see bad encoding for unprefixed character literals, warn and
17580b57cec5SDimitry Andric         // simply copy the byte values, for compatibility with gcc and
17590b57cec5SDimitry Andric         // older versions of clang.
176081ad6265SDimitry Andric         bool NoErrorOnBadEncoding = isOrdinary();
17610b57cec5SDimitry Andric         unsigned Msg = diag::err_bad_character_encoding;
17620b57cec5SDimitry Andric         if (NoErrorOnBadEncoding)
17630b57cec5SDimitry Andric           Msg = diag::warn_bad_character_encoding;
17640b57cec5SDimitry Andric         PP.Diag(Loc, Msg);
17650b57cec5SDimitry Andric         if (NoErrorOnBadEncoding) {
17660b57cec5SDimitry Andric           start = tmp_in_start;
17670b57cec5SDimitry Andric           buffer_begin = tmp_out_start;
17680b57cec5SDimitry Andric           for (; start != begin; ++start, ++buffer_begin)
17690b57cec5SDimitry Andric             *buffer_begin = static_cast<uint8_t>(*start);
17700b57cec5SDimitry Andric         } else {
17710b57cec5SDimitry Andric           HadError = true;
17720b57cec5SDimitry Andric         }
17730b57cec5SDimitry Andric       } else {
17740b57cec5SDimitry Andric         for (; tmp_out_start < buffer_begin; ++tmp_out_start) {
17750b57cec5SDimitry Andric           if (*tmp_out_start > largest_character_for_kind) {
17760b57cec5SDimitry Andric             HadError = true;
17770b57cec5SDimitry Andric             PP.Diag(Loc, diag::err_character_too_large);
17780b57cec5SDimitry Andric           }
17790b57cec5SDimitry Andric         }
17800b57cec5SDimitry Andric       }
17810b57cec5SDimitry Andric 
17820b57cec5SDimitry Andric       continue;
17830b57cec5SDimitry Andric     }
17840b57cec5SDimitry Andric     // Is this a Universal Character Name escape?
178581ad6265SDimitry Andric     if (begin[1] == 'u' || begin[1] == 'U' || begin[1] == 'N') {
17860b57cec5SDimitry Andric       unsigned short UcnLen = 0;
17870b57cec5SDimitry Andric       if (!ProcessUCNEscape(TokBegin, begin, end, *buffer_begin, UcnLen,
17880b57cec5SDimitry Andric                             FullSourceLoc(Loc, PP.getSourceManager()),
17890b57cec5SDimitry Andric                             &PP.getDiagnostics(), PP.getLangOpts(), true)) {
17900b57cec5SDimitry Andric         HadError = true;
17910b57cec5SDimitry Andric       } else if (*buffer_begin > largest_character_for_kind) {
17920b57cec5SDimitry Andric         HadError = true;
17930b57cec5SDimitry Andric         PP.Diag(Loc, diag::err_character_too_large);
17940b57cec5SDimitry Andric       }
17950b57cec5SDimitry Andric 
17960b57cec5SDimitry Andric       ++buffer_begin;
17970b57cec5SDimitry Andric       continue;
17980b57cec5SDimitry Andric     }
17990b57cec5SDimitry Andric     unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo());
18000b57cec5SDimitry Andric     uint64_t result =
18010b57cec5SDimitry Andric         ProcessCharEscape(TokBegin, begin, end, HadError,
180206c3fb27SDimitry Andric                           FullSourceLoc(Loc, PP.getSourceManager()), CharWidth,
180306c3fb27SDimitry Andric                           &PP.getDiagnostics(), PP.getLangOpts(),
180406c3fb27SDimitry Andric                           StringLiteralEvalMethod::Evaluated);
18050b57cec5SDimitry Andric     *buffer_begin++ = result;
18060b57cec5SDimitry Andric   }
18070b57cec5SDimitry Andric 
18080b57cec5SDimitry Andric   unsigned NumCharsSoFar = buffer_begin - &codepoint_buffer.front();
18090b57cec5SDimitry Andric 
18100b57cec5SDimitry Andric   if (NumCharsSoFar > 1) {
181181ad6265SDimitry Andric     if (isOrdinary() && NumCharsSoFar == 4)
1812e8d8bef9SDimitry Andric       PP.Diag(Loc, diag::warn_four_char_character_literal);
181381ad6265SDimitry Andric     else if (isOrdinary())
1814e8d8bef9SDimitry Andric       PP.Diag(Loc, diag::warn_multichar_character_literal);
1815349cc55cSDimitry Andric     else {
1816349cc55cSDimitry Andric       PP.Diag(Loc, diag::err_multichar_character_literal) << (isWide() ? 0 : 1);
1817349cc55cSDimitry Andric       HadError = true;
1818349cc55cSDimitry Andric     }
18190b57cec5SDimitry Andric     IsMultiChar = true;
18200b57cec5SDimitry Andric   } else {
18210b57cec5SDimitry Andric     IsMultiChar = false;
18220b57cec5SDimitry Andric   }
18230b57cec5SDimitry Andric 
18240b57cec5SDimitry Andric   llvm::APInt LitVal(PP.getTargetInfo().getIntWidth(), 0);
18250b57cec5SDimitry Andric 
18260b57cec5SDimitry Andric   // Narrow character literals act as though their value is concatenated
18270b57cec5SDimitry Andric   // in this implementation, but warn on overflow.
18280b57cec5SDimitry Andric   bool multi_char_too_long = false;
182981ad6265SDimitry Andric   if (isOrdinary() && isMultiChar()) {
18300b57cec5SDimitry Andric     LitVal = 0;
18310b57cec5SDimitry Andric     for (size_t i = 0; i < NumCharsSoFar; ++i) {
18320b57cec5SDimitry Andric       // check for enough leading zeros to shift into
183306c3fb27SDimitry Andric       multi_char_too_long |= (LitVal.countl_zero() < 8);
18340b57cec5SDimitry Andric       LitVal <<= 8;
18350b57cec5SDimitry Andric       LitVal = LitVal + (codepoint_buffer[i] & 0xFF);
18360b57cec5SDimitry Andric     }
18370b57cec5SDimitry Andric   } else if (NumCharsSoFar > 0) {
18380b57cec5SDimitry Andric     // otherwise just take the last character
18390b57cec5SDimitry Andric     LitVal = buffer_begin[-1];
18400b57cec5SDimitry Andric   }
18410b57cec5SDimitry Andric 
18420b57cec5SDimitry Andric   if (!HadError && multi_char_too_long) {
18430b57cec5SDimitry Andric     PP.Diag(Loc, diag::warn_char_constant_too_large);
18440b57cec5SDimitry Andric   }
18450b57cec5SDimitry Andric 
18460b57cec5SDimitry Andric   // Transfer the value from APInt to uint64_t
18470b57cec5SDimitry Andric   Value = LitVal.getZExtValue();
18480b57cec5SDimitry Andric 
18490b57cec5SDimitry Andric   // If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1")
18500b57cec5SDimitry Andric   // if 'char' is signed for this target (C99 6.4.4.4p10).  Note that multiple
18510b57cec5SDimitry Andric   // character constants are not sign extended in the this implementation:
18520b57cec5SDimitry Andric   // '\xFF\xFF' = 65536 and '\x0\xFF' = 255, which matches GCC.
185381ad6265SDimitry Andric   if (isOrdinary() && NumCharsSoFar == 1 && (Value & 128) &&
18540b57cec5SDimitry Andric       PP.getLangOpts().CharIsSigned)
18550b57cec5SDimitry Andric     Value = (signed char)Value;
18560b57cec5SDimitry Andric }
18570b57cec5SDimitry Andric 
18580b57cec5SDimitry Andric /// \verbatim
18590b57cec5SDimitry Andric ///       string-literal: [C++0x lex.string]
18600b57cec5SDimitry Andric ///         encoding-prefix " [s-char-sequence] "
18610b57cec5SDimitry Andric ///         encoding-prefix R raw-string
18620b57cec5SDimitry Andric ///       encoding-prefix:
18630b57cec5SDimitry Andric ///         u8
18640b57cec5SDimitry Andric ///         u
18650b57cec5SDimitry Andric ///         U
18660b57cec5SDimitry Andric ///         L
18670b57cec5SDimitry Andric ///       s-char-sequence:
18680b57cec5SDimitry Andric ///         s-char
18690b57cec5SDimitry Andric ///         s-char-sequence s-char
18700b57cec5SDimitry Andric ///       s-char:
18710b57cec5SDimitry Andric ///         any member of the source character set except the double-quote ",
18720b57cec5SDimitry Andric ///           backslash \, or new-line character
18730b57cec5SDimitry Andric ///         escape-sequence
18740b57cec5SDimitry Andric ///         universal-character-name
18750b57cec5SDimitry Andric ///       raw-string:
18760b57cec5SDimitry Andric ///         " d-char-sequence ( r-char-sequence ) d-char-sequence "
18770b57cec5SDimitry Andric ///       r-char-sequence:
18780b57cec5SDimitry Andric ///         r-char
18790b57cec5SDimitry Andric ///         r-char-sequence r-char
18800b57cec5SDimitry Andric ///       r-char:
18810b57cec5SDimitry Andric ///         any member of the source character set, except a right parenthesis )
18820b57cec5SDimitry Andric ///           followed by the initial d-char-sequence (which may be empty)
18830b57cec5SDimitry Andric ///           followed by a double quote ".
18840b57cec5SDimitry Andric ///       d-char-sequence:
18850b57cec5SDimitry Andric ///         d-char
18860b57cec5SDimitry Andric ///         d-char-sequence d-char
18870b57cec5SDimitry Andric ///       d-char:
18880b57cec5SDimitry Andric ///         any member of the basic source character set except:
18890b57cec5SDimitry Andric ///           space, the left parenthesis (, the right parenthesis ),
18900b57cec5SDimitry Andric ///           the backslash \, and the control characters representing horizontal
18910b57cec5SDimitry Andric ///           tab, vertical tab, form feed, and newline.
18920b57cec5SDimitry Andric ///       escape-sequence: [C++0x lex.ccon]
18930b57cec5SDimitry Andric ///         simple-escape-sequence
18940b57cec5SDimitry Andric ///         octal-escape-sequence
18950b57cec5SDimitry Andric ///         hexadecimal-escape-sequence
18960b57cec5SDimitry Andric ///       simple-escape-sequence:
18970b57cec5SDimitry Andric ///         one of \' \" \? \\ \a \b \f \n \r \t \v
18980b57cec5SDimitry Andric ///       octal-escape-sequence:
18990b57cec5SDimitry Andric ///         \ octal-digit
19000b57cec5SDimitry Andric ///         \ octal-digit octal-digit
19010b57cec5SDimitry Andric ///         \ octal-digit octal-digit octal-digit
19020b57cec5SDimitry Andric ///       hexadecimal-escape-sequence:
19030b57cec5SDimitry Andric ///         \x hexadecimal-digit
19040b57cec5SDimitry Andric ///         hexadecimal-escape-sequence hexadecimal-digit
19050b57cec5SDimitry Andric ///       universal-character-name:
19060b57cec5SDimitry Andric ///         \u hex-quad
19070b57cec5SDimitry Andric ///         \U hex-quad hex-quad
19080b57cec5SDimitry Andric ///       hex-quad:
19090b57cec5SDimitry Andric ///         hex-digit hex-digit hex-digit hex-digit
19100b57cec5SDimitry Andric /// \endverbatim
19110b57cec5SDimitry Andric ///
StringLiteralParser(ArrayRef<Token> StringToks,Preprocessor & PP,StringLiteralEvalMethod EvalMethod)191206c3fb27SDimitry Andric StringLiteralParser::StringLiteralParser(ArrayRef<Token> StringToks,
191306c3fb27SDimitry Andric                                          Preprocessor &PP,
191406c3fb27SDimitry Andric                                          StringLiteralEvalMethod EvalMethod)
19150b57cec5SDimitry Andric     : SM(PP.getSourceManager()), Features(PP.getLangOpts()),
1916349cc55cSDimitry Andric       Target(PP.getTargetInfo()), Diags(&PP.getDiagnostics()),
19170b57cec5SDimitry Andric       MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
191806c3fb27SDimitry Andric       ResultPtr(ResultBuf.data()), EvalMethod(EvalMethod), hadError(false),
191906c3fb27SDimitry Andric       Pascal(false) {
19200b57cec5SDimitry Andric   init(StringToks);
19210b57cec5SDimitry Andric }
19220b57cec5SDimitry Andric 
init(ArrayRef<Token> StringToks)19230b57cec5SDimitry Andric void StringLiteralParser::init(ArrayRef<Token> StringToks){
19240b57cec5SDimitry Andric   // The literal token may have come from an invalid source location (e.g. due
19250b57cec5SDimitry Andric   // to a PCH error), in which case the token length will be 0.
19260b57cec5SDimitry Andric   if (StringToks.empty() || StringToks[0].getLength() < 2)
19270b57cec5SDimitry Andric     return DiagnoseLexingError(SourceLocation());
19280b57cec5SDimitry Andric 
19290b57cec5SDimitry Andric   // Scan all of the string portions, remember the max individual token length,
19300b57cec5SDimitry Andric   // computing a bound on the concatenated string length, and see whether any
19310b57cec5SDimitry Andric   // piece is a wide-string.  If any of the string portions is a wide-string
19320b57cec5SDimitry Andric   // literal, the result is a wide-string literal [C99 6.4.5p4].
19330b57cec5SDimitry Andric   assert(!StringToks.empty() && "expected at least one token");
19340b57cec5SDimitry Andric   MaxTokenLength = StringToks[0].getLength();
19350b57cec5SDimitry Andric   assert(StringToks[0].getLength() >= 2 && "literal token is invalid!");
19360b57cec5SDimitry Andric   SizeBound = StringToks[0].getLength() - 2; // -2 for "".
19370b57cec5SDimitry Andric   hadError = false;
19380b57cec5SDimitry Andric 
193906c3fb27SDimitry Andric   // Determines the kind of string from the prefix
194006c3fb27SDimitry Andric   Kind = tok::string_literal;
194106c3fb27SDimitry Andric 
19420b57cec5SDimitry Andric   /// (C99 5.1.1.2p1).  The common case is only one string fragment.
194306c3fb27SDimitry Andric   for (const Token &Tok : StringToks) {
194406c3fb27SDimitry Andric     if (Tok.getLength() < 2)
194506c3fb27SDimitry Andric       return DiagnoseLexingError(Tok.getLocation());
19460b57cec5SDimitry Andric 
19470b57cec5SDimitry Andric     // The string could be shorter than this if it needs cleaning, but this is a
19480b57cec5SDimitry Andric     // reasonable bound, which is all we need.
194906c3fb27SDimitry Andric     assert(Tok.getLength() >= 2 && "literal token is invalid!");
195006c3fb27SDimitry Andric     SizeBound += Tok.getLength() - 2; // -2 for "".
19510b57cec5SDimitry Andric 
19520b57cec5SDimitry Andric     // Remember maximum string piece length.
195306c3fb27SDimitry Andric     if (Tok.getLength() > MaxTokenLength)
195406c3fb27SDimitry Andric       MaxTokenLength = Tok.getLength();
19550b57cec5SDimitry Andric 
19560b57cec5SDimitry Andric     // Remember if we see any wide or utf-8/16/32 strings.
19570b57cec5SDimitry Andric     // Also check for illegal concatenations.
195806c3fb27SDimitry Andric     if (isUnevaluated() && Tok.getKind() != tok::string_literal) {
19598a4dda33SDimitry Andric       if (Diags) {
19608a4dda33SDimitry Andric         SourceLocation PrefixEndLoc = Lexer::AdvanceToTokenCharacter(
19618a4dda33SDimitry Andric             Tok.getLocation(), getEncodingPrefixLen(Tok.getKind()), SM,
19628a4dda33SDimitry Andric             Features);
19638a4dda33SDimitry Andric         CharSourceRange Range =
19648a4dda33SDimitry Andric             CharSourceRange::getCharRange({Tok.getLocation(), PrefixEndLoc});
19658a4dda33SDimitry Andric         StringRef Prefix(SM.getCharacterData(Tok.getLocation()),
19668a4dda33SDimitry Andric                          getEncodingPrefixLen(Tok.getKind()));
19678a4dda33SDimitry Andric         Diags->Report(Tok.getLocation(),
19688a4dda33SDimitry Andric                       Features.CPlusPlus26
19698a4dda33SDimitry Andric                           ? diag::err_unevaluated_string_prefix
19708a4dda33SDimitry Andric                           : diag::warn_unevaluated_string_prefix)
19718a4dda33SDimitry Andric             << Prefix << Features.CPlusPlus << FixItHint::CreateRemoval(Range);
19728a4dda33SDimitry Andric       }
19738a4dda33SDimitry Andric       if (Features.CPlusPlus26)
197406c3fb27SDimitry Andric         hadError = true;
197506c3fb27SDimitry Andric     } else if (Tok.isNot(Kind) && Tok.isNot(tok::string_literal)) {
197681ad6265SDimitry Andric       if (isOrdinary()) {
197706c3fb27SDimitry Andric         Kind = Tok.getKind();
19780b57cec5SDimitry Andric       } else {
19790b57cec5SDimitry Andric         if (Diags)
198006c3fb27SDimitry Andric           Diags->Report(Tok.getLocation(), diag::err_unsupported_string_concat);
19810b57cec5SDimitry Andric         hadError = true;
19820b57cec5SDimitry Andric       }
19830b57cec5SDimitry Andric     }
19840b57cec5SDimitry Andric   }
19850b57cec5SDimitry Andric 
19860b57cec5SDimitry Andric   // Include space for the null terminator.
19870b57cec5SDimitry Andric   ++SizeBound;
19880b57cec5SDimitry Andric 
19890b57cec5SDimitry Andric   // TODO: K&R warning: "traditional C rejects string constant concatenation"
19900b57cec5SDimitry Andric 
19910b57cec5SDimitry Andric   // Get the width in bytes of char/wchar_t/char16_t/char32_t
19920b57cec5SDimitry Andric   CharByteWidth = getCharWidth(Kind, Target);
19930b57cec5SDimitry Andric   assert((CharByteWidth & 7) == 0 && "Assumes character size is byte multiple");
19940b57cec5SDimitry Andric   CharByteWidth /= 8;
19950b57cec5SDimitry Andric 
19960b57cec5SDimitry Andric   // The output buffer size needs to be large enough to hold wide characters.
19970b57cec5SDimitry Andric   // This is a worst-case assumption which basically corresponds to L"" "long".
19980b57cec5SDimitry Andric   SizeBound *= CharByteWidth;
19990b57cec5SDimitry Andric 
20000b57cec5SDimitry Andric   // Size the temporary buffer to hold the result string data.
20010b57cec5SDimitry Andric   ResultBuf.resize(SizeBound);
20020b57cec5SDimitry Andric 
20030b57cec5SDimitry Andric   // Likewise, but for each string piece.
20040b57cec5SDimitry Andric   SmallString<512> TokenBuf;
20050b57cec5SDimitry Andric   TokenBuf.resize(MaxTokenLength);
20060b57cec5SDimitry Andric 
20070b57cec5SDimitry Andric   // Loop over all the strings, getting their spelling, and expanding them to
20080b57cec5SDimitry Andric   // wide strings as appropriate.
20090b57cec5SDimitry Andric   ResultPtr = &ResultBuf[0];   // Next byte to fill in.
20100b57cec5SDimitry Andric 
20110b57cec5SDimitry Andric   Pascal = false;
20120b57cec5SDimitry Andric 
20130b57cec5SDimitry Andric   SourceLocation UDSuffixTokLoc;
20140b57cec5SDimitry Andric 
20150b57cec5SDimitry Andric   for (unsigned i = 0, e = StringToks.size(); i != e; ++i) {
20160b57cec5SDimitry Andric     const char *ThisTokBuf = &TokenBuf[0];
20170b57cec5SDimitry Andric     // Get the spelling of the token, which eliminates trigraphs, etc.  We know
20180b57cec5SDimitry Andric     // that ThisTokBuf points to a buffer that is big enough for the whole token
20190b57cec5SDimitry Andric     // and 'spelled' tokens can only shrink.
20200b57cec5SDimitry Andric     bool StringInvalid = false;
20210b57cec5SDimitry Andric     unsigned ThisTokLen =
20220b57cec5SDimitry Andric       Lexer::getSpelling(StringToks[i], ThisTokBuf, SM, Features,
20230b57cec5SDimitry Andric                          &StringInvalid);
20240b57cec5SDimitry Andric     if (StringInvalid)
20250b57cec5SDimitry Andric       return DiagnoseLexingError(StringToks[i].getLocation());
20260b57cec5SDimitry Andric 
20270b57cec5SDimitry Andric     const char *ThisTokBegin = ThisTokBuf;
20280b57cec5SDimitry Andric     const char *ThisTokEnd = ThisTokBuf+ThisTokLen;
20290b57cec5SDimitry Andric 
20300b57cec5SDimitry Andric     // Remove an optional ud-suffix.
20310b57cec5SDimitry Andric     if (ThisTokEnd[-1] != '"') {
20320b57cec5SDimitry Andric       const char *UDSuffixEnd = ThisTokEnd;
20330b57cec5SDimitry Andric       do {
20340b57cec5SDimitry Andric         --ThisTokEnd;
20350b57cec5SDimitry Andric       } while (ThisTokEnd[-1] != '"');
20360b57cec5SDimitry Andric 
20370b57cec5SDimitry Andric       StringRef UDSuffix(ThisTokEnd, UDSuffixEnd - ThisTokEnd);
20380b57cec5SDimitry Andric 
20390b57cec5SDimitry Andric       if (UDSuffixBuf.empty()) {
20400b57cec5SDimitry Andric         if (StringToks[i].hasUCN())
20410b57cec5SDimitry Andric           expandUCNs(UDSuffixBuf, UDSuffix);
20420b57cec5SDimitry Andric         else
20430b57cec5SDimitry Andric           UDSuffixBuf.assign(UDSuffix);
20440b57cec5SDimitry Andric         UDSuffixToken = i;
20450b57cec5SDimitry Andric         UDSuffixOffset = ThisTokEnd - ThisTokBuf;
20460b57cec5SDimitry Andric         UDSuffixTokLoc = StringToks[i].getLocation();
20470b57cec5SDimitry Andric       } else {
20480b57cec5SDimitry Andric         SmallString<32> ExpandedUDSuffix;
20490b57cec5SDimitry Andric         if (StringToks[i].hasUCN()) {
20500b57cec5SDimitry Andric           expandUCNs(ExpandedUDSuffix, UDSuffix);
20510b57cec5SDimitry Andric           UDSuffix = ExpandedUDSuffix;
20520b57cec5SDimitry Andric         }
20530b57cec5SDimitry Andric 
20540b57cec5SDimitry Andric         // C++11 [lex.ext]p8: At the end of phase 6, if a string literal is the
20550b57cec5SDimitry Andric         // result of a concatenation involving at least one user-defined-string-
20560b57cec5SDimitry Andric         // literal, all the participating user-defined-string-literals shall
20570b57cec5SDimitry Andric         // have the same ud-suffix.
205806c3fb27SDimitry Andric         bool UnevaluatedStringHasUDL = isUnevaluated() && !UDSuffix.empty();
205906c3fb27SDimitry Andric         if (UDSuffixBuf != UDSuffix || UnevaluatedStringHasUDL) {
20600b57cec5SDimitry Andric           if (Diags) {
20610b57cec5SDimitry Andric             SourceLocation TokLoc = StringToks[i].getLocation();
206206c3fb27SDimitry Andric             if (UnevaluatedStringHasUDL) {
206306c3fb27SDimitry Andric               Diags->Report(TokLoc, diag::err_unevaluated_string_udl)
206406c3fb27SDimitry Andric                   << SourceRange(TokLoc, TokLoc);
206506c3fb27SDimitry Andric             } else {
20660b57cec5SDimitry Andric               Diags->Report(TokLoc, diag::err_string_concat_mixed_suffix)
20670b57cec5SDimitry Andric                   << UDSuffixBuf << UDSuffix
206806c3fb27SDimitry Andric                   << SourceRange(UDSuffixTokLoc, UDSuffixTokLoc);
206906c3fb27SDimitry Andric             }
20700b57cec5SDimitry Andric           }
20710b57cec5SDimitry Andric           hadError = true;
20720b57cec5SDimitry Andric         }
20730b57cec5SDimitry Andric       }
20740b57cec5SDimitry Andric     }
20750b57cec5SDimitry Andric 
20760b57cec5SDimitry Andric     // Strip the end quote.
20770b57cec5SDimitry Andric     --ThisTokEnd;
20780b57cec5SDimitry Andric 
20790b57cec5SDimitry Andric     // TODO: Input character set mapping support.
20800b57cec5SDimitry Andric 
20810b57cec5SDimitry Andric     // Skip marker for wide or unicode strings.
20820b57cec5SDimitry Andric     if (ThisTokBuf[0] == 'L' || ThisTokBuf[0] == 'u' || ThisTokBuf[0] == 'U') {
20830b57cec5SDimitry Andric       ++ThisTokBuf;
20840b57cec5SDimitry Andric       // Skip 8 of u8 marker for utf8 strings.
20850b57cec5SDimitry Andric       if (ThisTokBuf[0] == '8')
20860b57cec5SDimitry Andric         ++ThisTokBuf;
20870b57cec5SDimitry Andric     }
20880b57cec5SDimitry Andric 
20890b57cec5SDimitry Andric     // Check for raw string
20900b57cec5SDimitry Andric     if (ThisTokBuf[0] == 'R') {
2091fe6060f1SDimitry Andric       if (ThisTokBuf[1] != '"') {
2092fe6060f1SDimitry Andric         // The file may have come from PCH and then changed after loading the
2093fe6060f1SDimitry Andric         // PCH; Fail gracefully.
2094fe6060f1SDimitry Andric         return DiagnoseLexingError(StringToks[i].getLocation());
2095fe6060f1SDimitry Andric       }
20960b57cec5SDimitry Andric       ThisTokBuf += 2; // skip R"
20970b57cec5SDimitry Andric 
2098fe6060f1SDimitry Andric       // C++11 [lex.string]p2: A `d-char-sequence` shall consist of at most 16
2099fe6060f1SDimitry Andric       // characters.
2100fe6060f1SDimitry Andric       constexpr unsigned MaxRawStrDelimLen = 16;
2101fe6060f1SDimitry Andric 
21020b57cec5SDimitry Andric       const char *Prefix = ThisTokBuf;
2103fe6060f1SDimitry Andric       while (static_cast<unsigned>(ThisTokBuf - Prefix) < MaxRawStrDelimLen &&
2104fe6060f1SDimitry Andric              ThisTokBuf[0] != '(')
21050b57cec5SDimitry Andric         ++ThisTokBuf;
2106fe6060f1SDimitry Andric       if (ThisTokBuf[0] != '(')
2107fe6060f1SDimitry Andric         return DiagnoseLexingError(StringToks[i].getLocation());
21080b57cec5SDimitry Andric       ++ThisTokBuf; // skip '('
21090b57cec5SDimitry Andric 
21100b57cec5SDimitry Andric       // Remove same number of characters from the end
21110b57cec5SDimitry Andric       ThisTokEnd -= ThisTokBuf - Prefix;
2112fe6060f1SDimitry Andric       if (ThisTokEnd < ThisTokBuf)
2113fe6060f1SDimitry Andric         return DiagnoseLexingError(StringToks[i].getLocation());
21140b57cec5SDimitry Andric 
21150b57cec5SDimitry Andric       // C++14 [lex.string]p4: A source-file new-line in a raw string literal
21160b57cec5SDimitry Andric       // results in a new-line in the resulting execution string-literal.
21170b57cec5SDimitry Andric       StringRef RemainingTokenSpan(ThisTokBuf, ThisTokEnd - ThisTokBuf);
21180b57cec5SDimitry Andric       while (!RemainingTokenSpan.empty()) {
21190b57cec5SDimitry Andric         // Split the string literal on \r\n boundaries.
21200b57cec5SDimitry Andric         size_t CRLFPos = RemainingTokenSpan.find("\r\n");
21210b57cec5SDimitry Andric         StringRef BeforeCRLF = RemainingTokenSpan.substr(0, CRLFPos);
21220b57cec5SDimitry Andric         StringRef AfterCRLF = RemainingTokenSpan.substr(CRLFPos);
21230b57cec5SDimitry Andric 
21240b57cec5SDimitry Andric         // Copy everything before the \r\n sequence into the string literal.
21250b57cec5SDimitry Andric         if (CopyStringFragment(StringToks[i], ThisTokBegin, BeforeCRLF))
21260b57cec5SDimitry Andric           hadError = true;
21270b57cec5SDimitry Andric 
21280b57cec5SDimitry Andric         // Point into the \n inside the \r\n sequence and operate on the
21290b57cec5SDimitry Andric         // remaining portion of the literal.
21300b57cec5SDimitry Andric         RemainingTokenSpan = AfterCRLF.substr(1);
21310b57cec5SDimitry Andric       }
21320b57cec5SDimitry Andric     } else {
21330b57cec5SDimitry Andric       if (ThisTokBuf[0] != '"') {
21340b57cec5SDimitry Andric         // The file may have come from PCH and then changed after loading the
21350b57cec5SDimitry Andric         // PCH; Fail gracefully.
21360b57cec5SDimitry Andric         return DiagnoseLexingError(StringToks[i].getLocation());
21370b57cec5SDimitry Andric       }
21380b57cec5SDimitry Andric       ++ThisTokBuf; // skip "
21390b57cec5SDimitry Andric 
21400b57cec5SDimitry Andric       // Check if this is a pascal string
214106c3fb27SDimitry Andric       if (!isUnevaluated() && Features.PascalStrings &&
214206c3fb27SDimitry Andric           ThisTokBuf + 1 != ThisTokEnd && ThisTokBuf[0] == '\\' &&
214306c3fb27SDimitry Andric           ThisTokBuf[1] == 'p') {
21440b57cec5SDimitry Andric 
21450b57cec5SDimitry Andric         // If the \p sequence is found in the first token, we have a pascal string
21460b57cec5SDimitry Andric         // Otherwise, if we already have a pascal string, ignore the first \p
21470b57cec5SDimitry Andric         if (i == 0) {
21480b57cec5SDimitry Andric           ++ThisTokBuf;
21490b57cec5SDimitry Andric           Pascal = true;
21500b57cec5SDimitry Andric         } else if (Pascal)
21510b57cec5SDimitry Andric           ThisTokBuf += 2;
21520b57cec5SDimitry Andric       }
21530b57cec5SDimitry Andric 
21540b57cec5SDimitry Andric       while (ThisTokBuf != ThisTokEnd) {
21550b57cec5SDimitry Andric         // Is this a span of non-escape characters?
21560b57cec5SDimitry Andric         if (ThisTokBuf[0] != '\\') {
21570b57cec5SDimitry Andric           const char *InStart = ThisTokBuf;
21580b57cec5SDimitry Andric           do {
21590b57cec5SDimitry Andric             ++ThisTokBuf;
21600b57cec5SDimitry Andric           } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');
21610b57cec5SDimitry Andric 
21620b57cec5SDimitry Andric           // Copy the character span over.
21630b57cec5SDimitry Andric           if (CopyStringFragment(StringToks[i], ThisTokBegin,
21640b57cec5SDimitry Andric                                  StringRef(InStart, ThisTokBuf - InStart)))
21650b57cec5SDimitry Andric             hadError = true;
21660b57cec5SDimitry Andric           continue;
21670b57cec5SDimitry Andric         }
21680b57cec5SDimitry Andric         // Is this a Universal Character Name escape?
216981ad6265SDimitry Andric         if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U' ||
217081ad6265SDimitry Andric             ThisTokBuf[1] == 'N') {
21710b57cec5SDimitry Andric           EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
21720b57cec5SDimitry Andric                           ResultPtr, hadError,
21730b57cec5SDimitry Andric                           FullSourceLoc(StringToks[i].getLocation(), SM),
21740b57cec5SDimitry Andric                           CharByteWidth, Diags, Features);
21750b57cec5SDimitry Andric           continue;
21760b57cec5SDimitry Andric         }
21770b57cec5SDimitry Andric         // Otherwise, this is a non-UCN escape character.  Process it.
21780b57cec5SDimitry Andric         unsigned ResultChar =
21790b57cec5SDimitry Andric             ProcessCharEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, hadError,
21800b57cec5SDimitry Andric                               FullSourceLoc(StringToks[i].getLocation(), SM),
218106c3fb27SDimitry Andric                               CharByteWidth * 8, Diags, Features, EvalMethod);
21820b57cec5SDimitry Andric 
21830b57cec5SDimitry Andric         if (CharByteWidth == 4) {
21840b57cec5SDimitry Andric           // FIXME: Make the type of the result buffer correct instead of
21850b57cec5SDimitry Andric           // using reinterpret_cast.
21860b57cec5SDimitry Andric           llvm::UTF32 *ResultWidePtr = reinterpret_cast<llvm::UTF32*>(ResultPtr);
21870b57cec5SDimitry Andric           *ResultWidePtr = ResultChar;
21880b57cec5SDimitry Andric           ResultPtr += 4;
21890b57cec5SDimitry Andric         } else if (CharByteWidth == 2) {
21900b57cec5SDimitry Andric           // FIXME: Make the type of the result buffer correct instead of
21910b57cec5SDimitry Andric           // using reinterpret_cast.
21920b57cec5SDimitry Andric           llvm::UTF16 *ResultWidePtr = reinterpret_cast<llvm::UTF16*>(ResultPtr);
21930b57cec5SDimitry Andric           *ResultWidePtr = ResultChar & 0xFFFF;
21940b57cec5SDimitry Andric           ResultPtr += 2;
21950b57cec5SDimitry Andric         } else {
21960b57cec5SDimitry Andric           assert(CharByteWidth == 1 && "Unexpected char width");
21970b57cec5SDimitry Andric           *ResultPtr++ = ResultChar & 0xFF;
21980b57cec5SDimitry Andric         }
21990b57cec5SDimitry Andric       }
22000b57cec5SDimitry Andric     }
22010b57cec5SDimitry Andric   }
22020b57cec5SDimitry Andric 
220306c3fb27SDimitry Andric   assert((!Pascal || !isUnevaluated()) &&
220406c3fb27SDimitry Andric          "Pascal string in unevaluated context");
22050b57cec5SDimitry Andric   if (Pascal) {
22060b57cec5SDimitry Andric     if (CharByteWidth == 4) {
22070b57cec5SDimitry Andric       // FIXME: Make the type of the result buffer correct instead of
22080b57cec5SDimitry Andric       // using reinterpret_cast.
22090b57cec5SDimitry Andric       llvm::UTF32 *ResultWidePtr = reinterpret_cast<llvm::UTF32*>(ResultBuf.data());
22100b57cec5SDimitry Andric       ResultWidePtr[0] = GetNumStringChars() - 1;
22110b57cec5SDimitry Andric     } else if (CharByteWidth == 2) {
22120b57cec5SDimitry Andric       // FIXME: Make the type of the result buffer correct instead of
22130b57cec5SDimitry Andric       // using reinterpret_cast.
22140b57cec5SDimitry Andric       llvm::UTF16 *ResultWidePtr = reinterpret_cast<llvm::UTF16*>(ResultBuf.data());
22150b57cec5SDimitry Andric       ResultWidePtr[0] = GetNumStringChars() - 1;
22160b57cec5SDimitry Andric     } else {
22170b57cec5SDimitry Andric       assert(CharByteWidth == 1 && "Unexpected char width");
22180b57cec5SDimitry Andric       ResultBuf[0] = GetNumStringChars() - 1;
22190b57cec5SDimitry Andric     }
22200b57cec5SDimitry Andric 
22210b57cec5SDimitry Andric     // Verify that pascal strings aren't too large.
22220b57cec5SDimitry Andric     if (GetStringLength() > 256) {
22230b57cec5SDimitry Andric       if (Diags)
22240b57cec5SDimitry Andric         Diags->Report(StringToks.front().getLocation(),
22250b57cec5SDimitry Andric                       diag::err_pascal_string_too_long)
22260b57cec5SDimitry Andric           << SourceRange(StringToks.front().getLocation(),
22270b57cec5SDimitry Andric                          StringToks.back().getLocation());
22280b57cec5SDimitry Andric       hadError = true;
22290b57cec5SDimitry Andric       return;
22300b57cec5SDimitry Andric     }
22310b57cec5SDimitry Andric   } else if (Diags) {
22320b57cec5SDimitry Andric     // Complain if this string literal has too many characters.
22330b57cec5SDimitry Andric     unsigned MaxChars = Features.CPlusPlus? 65536 : Features.C99 ? 4095 : 509;
22340b57cec5SDimitry Andric 
22350b57cec5SDimitry Andric     if (GetNumStringChars() > MaxChars)
22360b57cec5SDimitry Andric       Diags->Report(StringToks.front().getLocation(),
22370b57cec5SDimitry Andric                     diag::ext_string_too_long)
22380b57cec5SDimitry Andric         << GetNumStringChars() << MaxChars
22390b57cec5SDimitry Andric         << (Features.CPlusPlus ? 2 : Features.C99 ? 1 : 0)
22400b57cec5SDimitry Andric         << SourceRange(StringToks.front().getLocation(),
22410b57cec5SDimitry Andric                        StringToks.back().getLocation());
22420b57cec5SDimitry Andric   }
22430b57cec5SDimitry Andric }
22440b57cec5SDimitry Andric 
resyncUTF8(const char * Err,const char * End)22450b57cec5SDimitry Andric static const char *resyncUTF8(const char *Err, const char *End) {
22460b57cec5SDimitry Andric   if (Err == End)
22470b57cec5SDimitry Andric     return End;
22480b57cec5SDimitry Andric   End = Err + std::min<unsigned>(llvm::getNumBytesForUTF8(*Err), End-Err);
22490b57cec5SDimitry Andric   while (++Err != End && (*Err & 0xC0) == 0x80)
22500b57cec5SDimitry Andric     ;
22510b57cec5SDimitry Andric   return Err;
22520b57cec5SDimitry Andric }
22530b57cec5SDimitry Andric 
22540b57cec5SDimitry Andric /// This function copies from Fragment, which is a sequence of bytes
22550b57cec5SDimitry Andric /// within Tok's contents (which begin at TokBegin) into ResultPtr.
22560b57cec5SDimitry Andric /// Performs widening for multi-byte characters.
CopyStringFragment(const Token & Tok,const char * TokBegin,StringRef Fragment)22570b57cec5SDimitry Andric bool StringLiteralParser::CopyStringFragment(const Token &Tok,
22580b57cec5SDimitry Andric                                              const char *TokBegin,
22590b57cec5SDimitry Andric                                              StringRef Fragment) {
22600b57cec5SDimitry Andric   const llvm::UTF8 *ErrorPtrTmp;
22610b57cec5SDimitry Andric   if (ConvertUTF8toWide(CharByteWidth, Fragment, ResultPtr, ErrorPtrTmp))
22620b57cec5SDimitry Andric     return false;
22630b57cec5SDimitry Andric 
22640b57cec5SDimitry Andric   // If we see bad encoding for unprefixed string literals, warn and
22650b57cec5SDimitry Andric   // simply copy the byte values, for compatibility with gcc and older
22660b57cec5SDimitry Andric   // versions of clang.
226781ad6265SDimitry Andric   bool NoErrorOnBadEncoding = isOrdinary();
22680b57cec5SDimitry Andric   if (NoErrorOnBadEncoding) {
22690b57cec5SDimitry Andric     memcpy(ResultPtr, Fragment.data(), Fragment.size());
22700b57cec5SDimitry Andric     ResultPtr += Fragment.size();
22710b57cec5SDimitry Andric   }
22720b57cec5SDimitry Andric 
22730b57cec5SDimitry Andric   if (Diags) {
22740b57cec5SDimitry Andric     const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
22750b57cec5SDimitry Andric 
22760b57cec5SDimitry Andric     FullSourceLoc SourceLoc(Tok.getLocation(), SM);
22770b57cec5SDimitry Andric     const DiagnosticBuilder &Builder =
22780b57cec5SDimitry Andric       Diag(Diags, Features, SourceLoc, TokBegin,
22790b57cec5SDimitry Andric            ErrorPtr, resyncUTF8(ErrorPtr, Fragment.end()),
22800b57cec5SDimitry Andric            NoErrorOnBadEncoding ? diag::warn_bad_string_encoding
22810b57cec5SDimitry Andric                                 : diag::err_bad_string_encoding);
22820b57cec5SDimitry Andric 
22830b57cec5SDimitry Andric     const char *NextStart = resyncUTF8(ErrorPtr, Fragment.end());
22840b57cec5SDimitry Andric     StringRef NextFragment(NextStart, Fragment.end()-NextStart);
22850b57cec5SDimitry Andric 
22860b57cec5SDimitry Andric     // Decode into a dummy buffer.
22870b57cec5SDimitry Andric     SmallString<512> Dummy;
22880b57cec5SDimitry Andric     Dummy.reserve(Fragment.size() * CharByteWidth);
22890b57cec5SDimitry Andric     char *Ptr = Dummy.data();
22900b57cec5SDimitry Andric 
22910b57cec5SDimitry Andric     while (!ConvertUTF8toWide(CharByteWidth, NextFragment, Ptr, ErrorPtrTmp)) {
22920b57cec5SDimitry Andric       const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
22930b57cec5SDimitry Andric       NextStart = resyncUTF8(ErrorPtr, Fragment.end());
22940b57cec5SDimitry Andric       Builder << MakeCharSourceRange(Features, SourceLoc, TokBegin,
22950b57cec5SDimitry Andric                                      ErrorPtr, NextStart);
22960b57cec5SDimitry Andric       NextFragment = StringRef(NextStart, Fragment.end()-NextStart);
22970b57cec5SDimitry Andric     }
22980b57cec5SDimitry Andric   }
22990b57cec5SDimitry Andric   return !NoErrorOnBadEncoding;
23000b57cec5SDimitry Andric }
23010b57cec5SDimitry Andric 
DiagnoseLexingError(SourceLocation Loc)23020b57cec5SDimitry Andric void StringLiteralParser::DiagnoseLexingError(SourceLocation Loc) {
23030b57cec5SDimitry Andric   hadError = true;
23040b57cec5SDimitry Andric   if (Diags)
23050b57cec5SDimitry Andric     Diags->Report(Loc, diag::err_lexing_string);
23060b57cec5SDimitry Andric }
23070b57cec5SDimitry Andric 
23080b57cec5SDimitry Andric /// getOffsetOfStringByte - This function returns the offset of the
23090b57cec5SDimitry Andric /// specified byte of the string data represented by Token.  This handles
23100b57cec5SDimitry Andric /// advancing over escape sequences in the string.
getOffsetOfStringByte(const Token & Tok,unsigned ByteNo) const23110b57cec5SDimitry Andric unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
23120b57cec5SDimitry Andric                                                     unsigned ByteNo) const {
23130b57cec5SDimitry Andric   // Get the spelling of the token.
23140b57cec5SDimitry Andric   SmallString<32> SpellingBuffer;
23150b57cec5SDimitry Andric   SpellingBuffer.resize(Tok.getLength());
23160b57cec5SDimitry Andric 
23170b57cec5SDimitry Andric   bool StringInvalid = false;
23180b57cec5SDimitry Andric   const char *SpellingPtr = &SpellingBuffer[0];
23190b57cec5SDimitry Andric   unsigned TokLen = Lexer::getSpelling(Tok, SpellingPtr, SM, Features,
23200b57cec5SDimitry Andric                                        &StringInvalid);
23210b57cec5SDimitry Andric   if (StringInvalid)
23220b57cec5SDimitry Andric     return 0;
23230b57cec5SDimitry Andric 
23240b57cec5SDimitry Andric   const char *SpellingStart = SpellingPtr;
23250b57cec5SDimitry Andric   const char *SpellingEnd = SpellingPtr+TokLen;
23260b57cec5SDimitry Andric 
23270b57cec5SDimitry Andric   // Handle UTF-8 strings just like narrow strings.
23280b57cec5SDimitry Andric   if (SpellingPtr[0] == 'u' && SpellingPtr[1] == '8')
23290b57cec5SDimitry Andric     SpellingPtr += 2;
23300b57cec5SDimitry Andric 
23310b57cec5SDimitry Andric   assert(SpellingPtr[0] != 'L' && SpellingPtr[0] != 'u' &&
23320b57cec5SDimitry Andric          SpellingPtr[0] != 'U' && "Doesn't handle wide or utf strings yet");
23330b57cec5SDimitry Andric 
23340b57cec5SDimitry Andric   // For raw string literals, this is easy.
23350b57cec5SDimitry Andric   if (SpellingPtr[0] == 'R') {
23360b57cec5SDimitry Andric     assert(SpellingPtr[1] == '"' && "Should be a raw string literal!");
23370b57cec5SDimitry Andric     // Skip 'R"'.
23380b57cec5SDimitry Andric     SpellingPtr += 2;
23390b57cec5SDimitry Andric     while (*SpellingPtr != '(') {
23400b57cec5SDimitry Andric       ++SpellingPtr;
23410b57cec5SDimitry Andric       assert(SpellingPtr < SpellingEnd && "Missing ( for raw string literal");
23420b57cec5SDimitry Andric     }
23430b57cec5SDimitry Andric     // Skip '('.
23440b57cec5SDimitry Andric     ++SpellingPtr;
23450b57cec5SDimitry Andric     return SpellingPtr - SpellingStart + ByteNo;
23460b57cec5SDimitry Andric   }
23470b57cec5SDimitry Andric 
23480b57cec5SDimitry Andric   // Skip over the leading quote
23490b57cec5SDimitry Andric   assert(SpellingPtr[0] == '"' && "Should be a string literal!");
23500b57cec5SDimitry Andric   ++SpellingPtr;
23510b57cec5SDimitry Andric 
23520b57cec5SDimitry Andric   // Skip over bytes until we find the offset we're looking for.
23530b57cec5SDimitry Andric   while (ByteNo) {
23540b57cec5SDimitry Andric     assert(SpellingPtr < SpellingEnd && "Didn't find byte offset!");
23550b57cec5SDimitry Andric 
23560b57cec5SDimitry Andric     // Step over non-escapes simply.
23570b57cec5SDimitry Andric     if (*SpellingPtr != '\\') {
23580b57cec5SDimitry Andric       ++SpellingPtr;
23590b57cec5SDimitry Andric       --ByteNo;
23600b57cec5SDimitry Andric       continue;
23610b57cec5SDimitry Andric     }
23620b57cec5SDimitry Andric 
23630b57cec5SDimitry Andric     // Otherwise, this is an escape character.  Advance over it.
23640b57cec5SDimitry Andric     bool HadError = false;
236581ad6265SDimitry Andric     if (SpellingPtr[1] == 'u' || SpellingPtr[1] == 'U' ||
236681ad6265SDimitry Andric         SpellingPtr[1] == 'N') {
23670b57cec5SDimitry Andric       const char *EscapePtr = SpellingPtr;
23680b57cec5SDimitry Andric       unsigned Len = MeasureUCNEscape(SpellingStart, SpellingPtr, SpellingEnd,
23690b57cec5SDimitry Andric                                       1, Features, HadError);
23700b57cec5SDimitry Andric       if (Len > ByteNo) {
23710b57cec5SDimitry Andric         // ByteNo is somewhere within the escape sequence.
23720b57cec5SDimitry Andric         SpellingPtr = EscapePtr;
23730b57cec5SDimitry Andric         break;
23740b57cec5SDimitry Andric       }
23750b57cec5SDimitry Andric       ByteNo -= Len;
23760b57cec5SDimitry Andric     } else {
23770b57cec5SDimitry Andric       ProcessCharEscape(SpellingStart, SpellingPtr, SpellingEnd, HadError,
237806c3fb27SDimitry Andric                         FullSourceLoc(Tok.getLocation(), SM), CharByteWidth * 8,
237906c3fb27SDimitry Andric                         Diags, Features, StringLiteralEvalMethod::Evaluated);
23800b57cec5SDimitry Andric       --ByteNo;
23810b57cec5SDimitry Andric     }
23820b57cec5SDimitry Andric     assert(!HadError && "This method isn't valid on erroneous strings");
23830b57cec5SDimitry Andric   }
23840b57cec5SDimitry Andric 
23850b57cec5SDimitry Andric   return SpellingPtr-SpellingStart;
23860b57cec5SDimitry Andric }
23870b57cec5SDimitry Andric 
23880b57cec5SDimitry Andric /// Determine whether a suffix is a valid ud-suffix. We avoid treating reserved
23890b57cec5SDimitry Andric /// suffixes as ud-suffixes, because the diagnostic experience is better if we
23900b57cec5SDimitry Andric /// treat it as an invalid suffix.
isValidUDSuffix(const LangOptions & LangOpts,StringRef Suffix)23910b57cec5SDimitry Andric bool StringLiteralParser::isValidUDSuffix(const LangOptions &LangOpts,
23920b57cec5SDimitry Andric                                           StringRef Suffix) {
23930b57cec5SDimitry Andric   return NumericLiteralParser::isValidUDSuffix(LangOpts, Suffix) ||
23940b57cec5SDimitry Andric          Suffix == "sv";
23950b57cec5SDimitry Andric }
2396