//===--- LiteralSupport.cpp - Code to parse and process literals ----------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the NumericLiteralParser, CharLiteralParser, and
// StringLiteralParser interfaces.
//
//===----------------------------------------------------------------------===//
130b57cec5SDimitry Andric
140b57cec5SDimitry Andric #include "clang/Lex/LiteralSupport.h"
150b57cec5SDimitry Andric #include "clang/Basic/CharInfo.h"
160b57cec5SDimitry Andric #include "clang/Basic/LangOptions.h"
170b57cec5SDimitry Andric #include "clang/Basic/SourceLocation.h"
180b57cec5SDimitry Andric #include "clang/Basic/TargetInfo.h"
190b57cec5SDimitry Andric #include "clang/Lex/LexDiagnostic.h"
200b57cec5SDimitry Andric #include "clang/Lex/Lexer.h"
210b57cec5SDimitry Andric #include "clang/Lex/Preprocessor.h"
220b57cec5SDimitry Andric #include "clang/Lex/Token.h"
230b57cec5SDimitry Andric #include "llvm/ADT/APInt.h"
240b57cec5SDimitry Andric #include "llvm/ADT/SmallVector.h"
250b57cec5SDimitry Andric #include "llvm/ADT/StringExtras.h"
260b57cec5SDimitry Andric #include "llvm/ADT/StringSwitch.h"
270b57cec5SDimitry Andric #include "llvm/Support/ConvertUTF.h"
285ffd83dbSDimitry Andric #include "llvm/Support/Error.h"
290b57cec5SDimitry Andric #include "llvm/Support/ErrorHandling.h"
3081ad6265SDimitry Andric #include "llvm/Support/Unicode.h"
310b57cec5SDimitry Andric #include <algorithm>
320b57cec5SDimitry Andric #include <cassert>
330b57cec5SDimitry Andric #include <cstddef>
340b57cec5SDimitry Andric #include <cstdint>
350b57cec5SDimitry Andric #include <cstring>
360b57cec5SDimitry Andric #include <string>
370b57cec5SDimitry Andric
380b57cec5SDimitry Andric using namespace clang;
390b57cec5SDimitry Andric
getCharWidth(tok::TokenKind kind,const TargetInfo & Target)400b57cec5SDimitry Andric static unsigned getCharWidth(tok::TokenKind kind, const TargetInfo &Target) {
410b57cec5SDimitry Andric switch (kind) {
420b57cec5SDimitry Andric default: llvm_unreachable("Unknown token type!");
430b57cec5SDimitry Andric case tok::char_constant:
440b57cec5SDimitry Andric case tok::string_literal:
450b57cec5SDimitry Andric case tok::utf8_char_constant:
460b57cec5SDimitry Andric case tok::utf8_string_literal:
470b57cec5SDimitry Andric return Target.getCharWidth();
480b57cec5SDimitry Andric case tok::wide_char_constant:
490b57cec5SDimitry Andric case tok::wide_string_literal:
500b57cec5SDimitry Andric return Target.getWCharWidth();
510b57cec5SDimitry Andric case tok::utf16_char_constant:
520b57cec5SDimitry Andric case tok::utf16_string_literal:
530b57cec5SDimitry Andric return Target.getChar16Width();
540b57cec5SDimitry Andric case tok::utf32_char_constant:
550b57cec5SDimitry Andric case tok::utf32_string_literal:
560b57cec5SDimitry Andric return Target.getChar32Width();
570b57cec5SDimitry Andric }
580b57cec5SDimitry Andric }
590b57cec5SDimitry Andric
getEncodingPrefixLen(tok::TokenKind kind)608a4dda33SDimitry Andric static unsigned getEncodingPrefixLen(tok::TokenKind kind) {
618a4dda33SDimitry Andric switch (kind) {
628a4dda33SDimitry Andric default:
638a4dda33SDimitry Andric llvm_unreachable("Unknown token type!");
648a4dda33SDimitry Andric case tok::char_constant:
658a4dda33SDimitry Andric case tok::string_literal:
668a4dda33SDimitry Andric return 0;
678a4dda33SDimitry Andric case tok::utf8_char_constant:
688a4dda33SDimitry Andric case tok::utf8_string_literal:
698a4dda33SDimitry Andric return 2;
708a4dda33SDimitry Andric case tok::wide_char_constant:
718a4dda33SDimitry Andric case tok::wide_string_literal:
728a4dda33SDimitry Andric case tok::utf16_char_constant:
738a4dda33SDimitry Andric case tok::utf16_string_literal:
748a4dda33SDimitry Andric case tok::utf32_char_constant:
758a4dda33SDimitry Andric case tok::utf32_string_literal:
768a4dda33SDimitry Andric return 1;
778a4dda33SDimitry Andric }
788a4dda33SDimitry Andric }
798a4dda33SDimitry Andric
MakeCharSourceRange(const LangOptions & Features,FullSourceLoc TokLoc,const char * TokBegin,const char * TokRangeBegin,const char * TokRangeEnd)800b57cec5SDimitry Andric static CharSourceRange MakeCharSourceRange(const LangOptions &Features,
810b57cec5SDimitry Andric FullSourceLoc TokLoc,
820b57cec5SDimitry Andric const char *TokBegin,
830b57cec5SDimitry Andric const char *TokRangeBegin,
840b57cec5SDimitry Andric const char *TokRangeEnd) {
850b57cec5SDimitry Andric SourceLocation Begin =
860b57cec5SDimitry Andric Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin,
870b57cec5SDimitry Andric TokLoc.getManager(), Features);
880b57cec5SDimitry Andric SourceLocation End =
890b57cec5SDimitry Andric Lexer::AdvanceToTokenCharacter(Begin, TokRangeEnd - TokRangeBegin,
900b57cec5SDimitry Andric TokLoc.getManager(), Features);
910b57cec5SDimitry Andric return CharSourceRange::getCharRange(Begin, End);
920b57cec5SDimitry Andric }
930b57cec5SDimitry Andric
940b57cec5SDimitry Andric /// Produce a diagnostic highlighting some portion of a literal.
950b57cec5SDimitry Andric ///
960b57cec5SDimitry Andric /// Emits the diagnostic \p DiagID, highlighting the range of characters from
970b57cec5SDimitry Andric /// \p TokRangeBegin (inclusive) to \p TokRangeEnd (exclusive), which must be
980b57cec5SDimitry Andric /// a substring of a spelling buffer for the token beginning at \p TokBegin.
Diag(DiagnosticsEngine * Diags,const LangOptions & Features,FullSourceLoc TokLoc,const char * TokBegin,const char * TokRangeBegin,const char * TokRangeEnd,unsigned DiagID)990b57cec5SDimitry Andric static DiagnosticBuilder Diag(DiagnosticsEngine *Diags,
1000b57cec5SDimitry Andric const LangOptions &Features, FullSourceLoc TokLoc,
1010b57cec5SDimitry Andric const char *TokBegin, const char *TokRangeBegin,
1020b57cec5SDimitry Andric const char *TokRangeEnd, unsigned DiagID) {
1030b57cec5SDimitry Andric SourceLocation Begin =
1040b57cec5SDimitry Andric Lexer::AdvanceToTokenCharacter(TokLoc, TokRangeBegin - TokBegin,
1050b57cec5SDimitry Andric TokLoc.getManager(), Features);
1060b57cec5SDimitry Andric return Diags->Report(Begin, DiagID) <<
1070b57cec5SDimitry Andric MakeCharSourceRange(Features, TokLoc, TokBegin, TokRangeBegin, TokRangeEnd);
1080b57cec5SDimitry Andric }
1090b57cec5SDimitry Andric
/// Return true if \p Escape (the character following the backslash) names one
/// of the simple escape sequences accepted in an unevaluated string literal:
/// \', \", \?, \\, \a, \b, \f, \n, \r, \t or \v.
static bool IsEscapeValidInUnevaluatedStringLiteral(char Escape) {
  static const char Allowed[] = {'\'', '"', '?', '\\', 'a', 'b',
                                 'f',  'n', 'r', 't',  'v'};
  return std::find(std::begin(Allowed), std::end(Allowed), Escape) !=
         std::end(Allowed);
}
12706c3fb27SDimitry Andric
1280b57cec5SDimitry Andric /// ProcessCharEscape - Parse a standard C escape sequence, which can occur in
1290b57cec5SDimitry Andric /// either a character or a string literal.
ProcessCharEscape(const char * ThisTokBegin,const char * & ThisTokBuf,const char * ThisTokEnd,bool & HadError,FullSourceLoc Loc,unsigned CharWidth,DiagnosticsEngine * Diags,const LangOptions & Features,StringLiteralEvalMethod EvalMethod)1300b57cec5SDimitry Andric static unsigned ProcessCharEscape(const char *ThisTokBegin,
1310b57cec5SDimitry Andric const char *&ThisTokBuf,
1320b57cec5SDimitry Andric const char *ThisTokEnd, bool &HadError,
1330b57cec5SDimitry Andric FullSourceLoc Loc, unsigned CharWidth,
1340b57cec5SDimitry Andric DiagnosticsEngine *Diags,
13506c3fb27SDimitry Andric const LangOptions &Features,
13606c3fb27SDimitry Andric StringLiteralEvalMethod EvalMethod) {
1370b57cec5SDimitry Andric const char *EscapeBegin = ThisTokBuf;
138349cc55cSDimitry Andric bool Delimited = false;
139349cc55cSDimitry Andric bool EndDelimiterFound = false;
1400b57cec5SDimitry Andric
1410b57cec5SDimitry Andric // Skip the '\' char.
1420b57cec5SDimitry Andric ++ThisTokBuf;
1430b57cec5SDimitry Andric
1440b57cec5SDimitry Andric // We know that this character can't be off the end of the buffer, because
1450b57cec5SDimitry Andric // that would have been \", which would not have been the end of string.
1460b57cec5SDimitry Andric unsigned ResultChar = *ThisTokBuf++;
14706c3fb27SDimitry Andric char Escape = ResultChar;
1480b57cec5SDimitry Andric switch (ResultChar) {
1490b57cec5SDimitry Andric // These map to themselves.
1500b57cec5SDimitry Andric case '\\': case '\'': case '"': case '?': break;
1510b57cec5SDimitry Andric
1520b57cec5SDimitry Andric // These have fixed mappings.
1530b57cec5SDimitry Andric case 'a':
1540b57cec5SDimitry Andric // TODO: K&R: the meaning of '\\a' is different in traditional C
1550b57cec5SDimitry Andric ResultChar = 7;
1560b57cec5SDimitry Andric break;
1570b57cec5SDimitry Andric case 'b':
1580b57cec5SDimitry Andric ResultChar = 8;
1590b57cec5SDimitry Andric break;
1600b57cec5SDimitry Andric case 'e':
1610b57cec5SDimitry Andric if (Diags)
1620b57cec5SDimitry Andric Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
1630b57cec5SDimitry Andric diag::ext_nonstandard_escape) << "e";
1640b57cec5SDimitry Andric ResultChar = 27;
1650b57cec5SDimitry Andric break;
1660b57cec5SDimitry Andric case 'E':
1670b57cec5SDimitry Andric if (Diags)
1680b57cec5SDimitry Andric Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
1690b57cec5SDimitry Andric diag::ext_nonstandard_escape) << "E";
1700b57cec5SDimitry Andric ResultChar = 27;
1710b57cec5SDimitry Andric break;
1720b57cec5SDimitry Andric case 'f':
1730b57cec5SDimitry Andric ResultChar = 12;
1740b57cec5SDimitry Andric break;
1750b57cec5SDimitry Andric case 'n':
1760b57cec5SDimitry Andric ResultChar = 10;
1770b57cec5SDimitry Andric break;
1780b57cec5SDimitry Andric case 'r':
1790b57cec5SDimitry Andric ResultChar = 13;
1800b57cec5SDimitry Andric break;
1810b57cec5SDimitry Andric case 't':
1820b57cec5SDimitry Andric ResultChar = 9;
1830b57cec5SDimitry Andric break;
1840b57cec5SDimitry Andric case 'v':
1850b57cec5SDimitry Andric ResultChar = 11;
1860b57cec5SDimitry Andric break;
1870b57cec5SDimitry Andric case 'x': { // Hex escape.
1880b57cec5SDimitry Andric ResultChar = 0;
189349cc55cSDimitry Andric if (ThisTokBuf != ThisTokEnd && *ThisTokBuf == '{') {
190349cc55cSDimitry Andric Delimited = true;
191349cc55cSDimitry Andric ThisTokBuf++;
192349cc55cSDimitry Andric if (*ThisTokBuf == '}') {
193349cc55cSDimitry Andric Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
194349cc55cSDimitry Andric diag::err_delimited_escape_empty);
195349cc55cSDimitry Andric return ResultChar;
196349cc55cSDimitry Andric }
197349cc55cSDimitry Andric } else if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
1980b57cec5SDimitry Andric if (Diags)
1990b57cec5SDimitry Andric Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
2000b57cec5SDimitry Andric diag::err_hex_escape_no_digits) << "x";
201349cc55cSDimitry Andric return ResultChar;
2020b57cec5SDimitry Andric }
2030b57cec5SDimitry Andric
2040b57cec5SDimitry Andric // Hex escapes are a maximal series of hex digits.
2050b57cec5SDimitry Andric bool Overflow = false;
2060b57cec5SDimitry Andric for (; ThisTokBuf != ThisTokEnd; ++ThisTokBuf) {
207349cc55cSDimitry Andric if (Delimited && *ThisTokBuf == '}') {
208349cc55cSDimitry Andric ThisTokBuf++;
209349cc55cSDimitry Andric EndDelimiterFound = true;
210349cc55cSDimitry Andric break;
211349cc55cSDimitry Andric }
212349cc55cSDimitry Andric int CharVal = llvm::hexDigitValue(*ThisTokBuf);
213349cc55cSDimitry Andric if (CharVal == -1) {
214349cc55cSDimitry Andric // Non delimited hex escape sequences stop at the first non-hex digit.
215349cc55cSDimitry Andric if (!Delimited)
216349cc55cSDimitry Andric break;
217349cc55cSDimitry Andric HadError = true;
218349cc55cSDimitry Andric if (Diags)
219349cc55cSDimitry Andric Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
220349cc55cSDimitry Andric diag::err_delimited_escape_invalid)
221349cc55cSDimitry Andric << StringRef(ThisTokBuf, 1);
222349cc55cSDimitry Andric continue;
223349cc55cSDimitry Andric }
2240b57cec5SDimitry Andric // About to shift out a digit?
2250b57cec5SDimitry Andric if (ResultChar & 0xF0000000)
2260b57cec5SDimitry Andric Overflow = true;
2270b57cec5SDimitry Andric ResultChar <<= 4;
2280b57cec5SDimitry Andric ResultChar |= CharVal;
2290b57cec5SDimitry Andric }
2300b57cec5SDimitry Andric // See if any bits will be truncated when evaluated as a character.
2310b57cec5SDimitry Andric if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
2320b57cec5SDimitry Andric Overflow = true;
2330b57cec5SDimitry Andric ResultChar &= ~0U >> (32-CharWidth);
2340b57cec5SDimitry Andric }
2350b57cec5SDimitry Andric
2360b57cec5SDimitry Andric // Check for overflow.
237349cc55cSDimitry Andric if (!HadError && Overflow) { // Too many digits to fit in
238349cc55cSDimitry Andric HadError = true;
239349cc55cSDimitry Andric if (Diags)
2400b57cec5SDimitry Andric Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
241349cc55cSDimitry Andric diag::err_escape_too_large)
242349cc55cSDimitry Andric << 0;
243349cc55cSDimitry Andric }
2440b57cec5SDimitry Andric break;
2450b57cec5SDimitry Andric }
2460b57cec5SDimitry Andric case '0': case '1': case '2': case '3':
2470b57cec5SDimitry Andric case '4': case '5': case '6': case '7': {
2480b57cec5SDimitry Andric // Octal escapes.
2490b57cec5SDimitry Andric --ThisTokBuf;
2500b57cec5SDimitry Andric ResultChar = 0;
2510b57cec5SDimitry Andric
2520b57cec5SDimitry Andric // Octal escapes are a series of octal digits with maximum length 3.
2530b57cec5SDimitry Andric // "\0123" is a two digit sequence equal to "\012" "3".
2540b57cec5SDimitry Andric unsigned NumDigits = 0;
2550b57cec5SDimitry Andric do {
2560b57cec5SDimitry Andric ResultChar <<= 3;
2570b57cec5SDimitry Andric ResultChar |= *ThisTokBuf++ - '0';
2580b57cec5SDimitry Andric ++NumDigits;
2590b57cec5SDimitry Andric } while (ThisTokBuf != ThisTokEnd && NumDigits < 3 &&
2600b57cec5SDimitry Andric ThisTokBuf[0] >= '0' && ThisTokBuf[0] <= '7');
2610b57cec5SDimitry Andric
2620b57cec5SDimitry Andric // Check for overflow. Reject '\777', but not L'\777'.
2630b57cec5SDimitry Andric if (CharWidth != 32 && (ResultChar >> CharWidth) != 0) {
2640b57cec5SDimitry Andric if (Diags)
2650b57cec5SDimitry Andric Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
2660b57cec5SDimitry Andric diag::err_escape_too_large) << 1;
2670b57cec5SDimitry Andric ResultChar &= ~0U >> (32-CharWidth);
2680b57cec5SDimitry Andric }
2690b57cec5SDimitry Andric break;
2700b57cec5SDimitry Andric }
271349cc55cSDimitry Andric case 'o': {
272349cc55cSDimitry Andric bool Overflow = false;
273349cc55cSDimitry Andric if (ThisTokBuf == ThisTokEnd || *ThisTokBuf != '{') {
274349cc55cSDimitry Andric HadError = true;
275349cc55cSDimitry Andric if (Diags)
276349cc55cSDimitry Andric Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
27781ad6265SDimitry Andric diag::err_delimited_escape_missing_brace)
27881ad6265SDimitry Andric << "o";
2790b57cec5SDimitry Andric
280349cc55cSDimitry Andric break;
281349cc55cSDimitry Andric }
282349cc55cSDimitry Andric ResultChar = 0;
283349cc55cSDimitry Andric Delimited = true;
284349cc55cSDimitry Andric ++ThisTokBuf;
285349cc55cSDimitry Andric if (*ThisTokBuf == '}') {
286349cc55cSDimitry Andric Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
287349cc55cSDimitry Andric diag::err_delimited_escape_empty);
288349cc55cSDimitry Andric return ResultChar;
289349cc55cSDimitry Andric }
290349cc55cSDimitry Andric
291349cc55cSDimitry Andric while (ThisTokBuf != ThisTokEnd) {
292349cc55cSDimitry Andric if (*ThisTokBuf == '}') {
293349cc55cSDimitry Andric EndDelimiterFound = true;
294349cc55cSDimitry Andric ThisTokBuf++;
295349cc55cSDimitry Andric break;
296349cc55cSDimitry Andric }
297349cc55cSDimitry Andric if (*ThisTokBuf < '0' || *ThisTokBuf > '7') {
298349cc55cSDimitry Andric HadError = true;
299349cc55cSDimitry Andric if (Diags)
300349cc55cSDimitry Andric Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
301349cc55cSDimitry Andric diag::err_delimited_escape_invalid)
302349cc55cSDimitry Andric << StringRef(ThisTokBuf, 1);
303349cc55cSDimitry Andric ThisTokBuf++;
304349cc55cSDimitry Andric continue;
305349cc55cSDimitry Andric }
30606c3fb27SDimitry Andric // Check if one of the top three bits is set before shifting them out.
30706c3fb27SDimitry Andric if (ResultChar & 0xE0000000)
308349cc55cSDimitry Andric Overflow = true;
309349cc55cSDimitry Andric
310349cc55cSDimitry Andric ResultChar <<= 3;
311349cc55cSDimitry Andric ResultChar |= *ThisTokBuf++ - '0';
312349cc55cSDimitry Andric }
313349cc55cSDimitry Andric // Check for overflow. Reject '\777', but not L'\777'.
314349cc55cSDimitry Andric if (!HadError &&
315349cc55cSDimitry Andric (Overflow || (CharWidth != 32 && (ResultChar >> CharWidth) != 0))) {
316349cc55cSDimitry Andric HadError = true;
317349cc55cSDimitry Andric if (Diags)
318349cc55cSDimitry Andric Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
319349cc55cSDimitry Andric diag::err_escape_too_large)
320349cc55cSDimitry Andric << 1;
321349cc55cSDimitry Andric ResultChar &= ~0U >> (32 - CharWidth);
322349cc55cSDimitry Andric }
323349cc55cSDimitry Andric break;
324349cc55cSDimitry Andric }
3250b57cec5SDimitry Andric // Otherwise, these are not valid escapes.
3260b57cec5SDimitry Andric case '(': case '{': case '[': case '%':
3270b57cec5SDimitry Andric // GCC accepts these as extensions. We warn about them as such though.
3280b57cec5SDimitry Andric if (Diags)
3290b57cec5SDimitry Andric Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
3300b57cec5SDimitry Andric diag::ext_nonstandard_escape)
3310b57cec5SDimitry Andric << std::string(1, ResultChar);
3320b57cec5SDimitry Andric break;
3330b57cec5SDimitry Andric default:
3340b57cec5SDimitry Andric if (!Diags)
3350b57cec5SDimitry Andric break;
3360b57cec5SDimitry Andric
3370b57cec5SDimitry Andric if (isPrintable(ResultChar))
3380b57cec5SDimitry Andric Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
3390b57cec5SDimitry Andric diag::ext_unknown_escape)
3400b57cec5SDimitry Andric << std::string(1, ResultChar);
3410b57cec5SDimitry Andric else
3420b57cec5SDimitry Andric Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
3430b57cec5SDimitry Andric diag::ext_unknown_escape)
3440b57cec5SDimitry Andric << "x" + llvm::utohexstr(ResultChar);
3450b57cec5SDimitry Andric break;
3460b57cec5SDimitry Andric }
3470b57cec5SDimitry Andric
348349cc55cSDimitry Andric if (Delimited && Diags) {
349349cc55cSDimitry Andric if (!EndDelimiterFound)
350349cc55cSDimitry Andric Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
351349cc55cSDimitry Andric diag::err_expected)
352349cc55cSDimitry Andric << tok::r_brace;
353349cc55cSDimitry Andric else if (!HadError) {
354349cc55cSDimitry Andric Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
35506c3fb27SDimitry Andric Features.CPlusPlus23 ? diag::warn_cxx23_delimited_escape_sequence
356753f127fSDimitry Andric : diag::ext_delimited_escape_sequence)
357753f127fSDimitry Andric << /*delimited*/ 0 << (Features.CPlusPlus ? 1 : 0);
358349cc55cSDimitry Andric }
359349cc55cSDimitry Andric }
360349cc55cSDimitry Andric
36106c3fb27SDimitry Andric if (EvalMethod == StringLiteralEvalMethod::Unevaluated &&
36206c3fb27SDimitry Andric !IsEscapeValidInUnevaluatedStringLiteral(Escape)) {
36306c3fb27SDimitry Andric Diag(Diags, Features, Loc, ThisTokBegin, EscapeBegin, ThisTokBuf,
36406c3fb27SDimitry Andric diag::err_unevaluated_string_invalid_escape_sequence)
36506c3fb27SDimitry Andric << StringRef(EscapeBegin, ThisTokBuf - EscapeBegin);
3668a4dda33SDimitry Andric HadError = true;
36706c3fb27SDimitry Andric }
3688a4dda33SDimitry Andric
3690b57cec5SDimitry Andric return ResultChar;
3700b57cec5SDimitry Andric }
3710b57cec5SDimitry Andric
appendCodePoint(unsigned Codepoint,llvm::SmallVectorImpl<char> & Str)3720b57cec5SDimitry Andric static void appendCodePoint(unsigned Codepoint,
3730b57cec5SDimitry Andric llvm::SmallVectorImpl<char> &Str) {
3740b57cec5SDimitry Andric char ResultBuf[4];
3750b57cec5SDimitry Andric char *ResultPtr = ResultBuf;
37681ad6265SDimitry Andric if (llvm::ConvertCodePointToUTF8(Codepoint, ResultPtr))
3770b57cec5SDimitry Andric Str.append(ResultBuf, ResultPtr);
3780b57cec5SDimitry Andric }
3790b57cec5SDimitry Andric
expandUCNs(SmallVectorImpl<char> & Buf,StringRef Input)3800b57cec5SDimitry Andric void clang::expandUCNs(SmallVectorImpl<char> &Buf, StringRef Input) {
3810b57cec5SDimitry Andric for (StringRef::iterator I = Input.begin(), E = Input.end(); I != E; ++I) {
3820b57cec5SDimitry Andric if (*I != '\\') {
3830b57cec5SDimitry Andric Buf.push_back(*I);
3840b57cec5SDimitry Andric continue;
3850b57cec5SDimitry Andric }
3860b57cec5SDimitry Andric
3870b57cec5SDimitry Andric ++I;
388349cc55cSDimitry Andric char Kind = *I;
389349cc55cSDimitry Andric ++I;
390349cc55cSDimitry Andric
39181ad6265SDimitry Andric assert(Kind == 'u' || Kind == 'U' || Kind == 'N');
392349cc55cSDimitry Andric uint32_t CodePoint = 0;
393349cc55cSDimitry Andric
394349cc55cSDimitry Andric if (Kind == 'u' && *I == '{') {
395349cc55cSDimitry Andric for (++I; *I != '}'; ++I) {
396349cc55cSDimitry Andric unsigned Value = llvm::hexDigitValue(*I);
397349cc55cSDimitry Andric assert(Value != -1U);
398349cc55cSDimitry Andric CodePoint <<= 4;
399349cc55cSDimitry Andric CodePoint += Value;
400349cc55cSDimitry Andric }
401349cc55cSDimitry Andric appendCodePoint(CodePoint, Buf);
402349cc55cSDimitry Andric continue;
403349cc55cSDimitry Andric }
4040b57cec5SDimitry Andric
40581ad6265SDimitry Andric if (Kind == 'N') {
40681ad6265SDimitry Andric assert(*I == '{');
40781ad6265SDimitry Andric ++I;
40881ad6265SDimitry Andric auto Delim = std::find(I, Input.end(), '}');
40981ad6265SDimitry Andric assert(Delim != Input.end());
4105f757f3fSDimitry Andric StringRef Name(I, std::distance(I, Delim));
411bdd1243dSDimitry Andric std::optional<llvm::sys::unicode::LooseMatchingResult> Res =
4125f757f3fSDimitry Andric llvm::sys::unicode::nameToCodepointLooseMatching(Name);
4135f757f3fSDimitry Andric assert(Res && "could not find a codepoint that was previously found");
41481ad6265SDimitry Andric CodePoint = Res->CodePoint;
41581ad6265SDimitry Andric assert(CodePoint != 0xFFFFFFFF);
41681ad6265SDimitry Andric appendCodePoint(CodePoint, Buf);
41781ad6265SDimitry Andric I = Delim;
41881ad6265SDimitry Andric continue;
41981ad6265SDimitry Andric }
42081ad6265SDimitry Andric
4210b57cec5SDimitry Andric unsigned NumHexDigits;
422349cc55cSDimitry Andric if (Kind == 'u')
4230b57cec5SDimitry Andric NumHexDigits = 4;
4240b57cec5SDimitry Andric else
4250b57cec5SDimitry Andric NumHexDigits = 8;
4260b57cec5SDimitry Andric
4270b57cec5SDimitry Andric assert(I + NumHexDigits <= E);
4280b57cec5SDimitry Andric
429349cc55cSDimitry Andric for (; NumHexDigits != 0; ++I, --NumHexDigits) {
4300b57cec5SDimitry Andric unsigned Value = llvm::hexDigitValue(*I);
4310b57cec5SDimitry Andric assert(Value != -1U);
4320b57cec5SDimitry Andric
4330b57cec5SDimitry Andric CodePoint <<= 4;
4340b57cec5SDimitry Andric CodePoint += Value;
4350b57cec5SDimitry Andric }
4360b57cec5SDimitry Andric
4370b57cec5SDimitry Andric appendCodePoint(CodePoint, Buf);
4380b57cec5SDimitry Andric --I;
4390b57cec5SDimitry Andric }
4400b57cec5SDimitry Andric }
4410b57cec5SDimitry Andric
isFunctionLocalStringLiteralMacro(tok::TokenKind K,const LangOptions & LO)4425f757f3fSDimitry Andric bool clang::isFunctionLocalStringLiteralMacro(tok::TokenKind K,
4435f757f3fSDimitry Andric const LangOptions &LO) {
4445f757f3fSDimitry Andric return LO.MicrosoftExt &&
4455f757f3fSDimitry Andric (K == tok::kw___FUNCTION__ || K == tok::kw_L__FUNCTION__ ||
4465f757f3fSDimitry Andric K == tok::kw___FUNCSIG__ || K == tok::kw_L__FUNCSIG__ ||
4475f757f3fSDimitry Andric K == tok::kw___FUNCDNAME__);
4485f757f3fSDimitry Andric }
4495f757f3fSDimitry Andric
tokenIsLikeStringLiteral(const Token & Tok,const LangOptions & LO)4505f757f3fSDimitry Andric bool clang::tokenIsLikeStringLiteral(const Token &Tok, const LangOptions &LO) {
4515f757f3fSDimitry Andric return tok::isStringLiteral(Tok.getKind()) ||
4525f757f3fSDimitry Andric isFunctionLocalStringLiteralMacro(Tok.getKind(), LO);
4535f757f3fSDimitry Andric }
4545f757f3fSDimitry Andric
ProcessNumericUCNEscape(const char * ThisTokBegin,const char * & ThisTokBuf,const char * ThisTokEnd,uint32_t & UcnVal,unsigned short & UcnLen,bool & Delimited,FullSourceLoc Loc,DiagnosticsEngine * Diags,const LangOptions & Features,bool in_char_string_literal=false)45581ad6265SDimitry Andric static bool ProcessNumericUCNEscape(const char *ThisTokBegin,
45681ad6265SDimitry Andric const char *&ThisTokBuf,
45781ad6265SDimitry Andric const char *ThisTokEnd, uint32_t &UcnVal,
45881ad6265SDimitry Andric unsigned short &UcnLen, bool &Delimited,
4590b57cec5SDimitry Andric FullSourceLoc Loc, DiagnosticsEngine *Diags,
4600b57cec5SDimitry Andric const LangOptions &Features,
4610b57cec5SDimitry Andric bool in_char_string_literal = false) {
4620b57cec5SDimitry Andric const char *UcnBegin = ThisTokBuf;
46381ad6265SDimitry Andric bool HasError = false;
46481ad6265SDimitry Andric bool EndDelimiterFound = false;
4650b57cec5SDimitry Andric
4660b57cec5SDimitry Andric // Skip the '\u' char's.
4670b57cec5SDimitry Andric ThisTokBuf += 2;
46881ad6265SDimitry Andric Delimited = false;
469349cc55cSDimitry Andric if (UcnBegin[1] == 'u' && in_char_string_literal &&
470349cc55cSDimitry Andric ThisTokBuf != ThisTokEnd && *ThisTokBuf == '{') {
471349cc55cSDimitry Andric Delimited = true;
472349cc55cSDimitry Andric ThisTokBuf++;
473349cc55cSDimitry Andric } else if (ThisTokBuf == ThisTokEnd || !isHexDigit(*ThisTokBuf)) {
4740b57cec5SDimitry Andric if (Diags)
4750b57cec5SDimitry Andric Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
47681ad6265SDimitry Andric diag::err_hex_escape_no_digits)
47781ad6265SDimitry Andric << StringRef(&ThisTokBuf[-1], 1);
4780b57cec5SDimitry Andric return false;
4790b57cec5SDimitry Andric }
4800b57cec5SDimitry Andric UcnLen = (ThisTokBuf[-1] == 'u' ? 4 : 8);
481349cc55cSDimitry Andric
482349cc55cSDimitry Andric bool Overflow = false;
483349cc55cSDimitry Andric unsigned short Count = 0;
484349cc55cSDimitry Andric for (; ThisTokBuf != ThisTokEnd && (Delimited || Count != UcnLen);
485349cc55cSDimitry Andric ++ThisTokBuf) {
486349cc55cSDimitry Andric if (Delimited && *ThisTokBuf == '}') {
487349cc55cSDimitry Andric ++ThisTokBuf;
488349cc55cSDimitry Andric EndDelimiterFound = true;
489349cc55cSDimitry Andric break;
490349cc55cSDimitry Andric }
491349cc55cSDimitry Andric int CharVal = llvm::hexDigitValue(*ThisTokBuf);
492349cc55cSDimitry Andric if (CharVal == -1) {
493349cc55cSDimitry Andric HasError = true;
494349cc55cSDimitry Andric if (!Delimited)
495349cc55cSDimitry Andric break;
496349cc55cSDimitry Andric if (Diags) {
497349cc55cSDimitry Andric Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
498349cc55cSDimitry Andric diag::err_delimited_escape_invalid)
499349cc55cSDimitry Andric << StringRef(ThisTokBuf, 1);
500349cc55cSDimitry Andric }
501349cc55cSDimitry Andric Count++;
502349cc55cSDimitry Andric continue;
503349cc55cSDimitry Andric }
504349cc55cSDimitry Andric if (UcnVal & 0xF0000000) {
505349cc55cSDimitry Andric Overflow = true;
506349cc55cSDimitry Andric continue;
507349cc55cSDimitry Andric }
5080b57cec5SDimitry Andric UcnVal <<= 4;
5090b57cec5SDimitry Andric UcnVal |= CharVal;
510349cc55cSDimitry Andric Count++;
5110b57cec5SDimitry Andric }
512349cc55cSDimitry Andric
513349cc55cSDimitry Andric if (Overflow) {
5140b57cec5SDimitry Andric if (Diags)
5150b57cec5SDimitry Andric Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
516349cc55cSDimitry Andric diag::err_escape_too_large)
517349cc55cSDimitry Andric << 0;
5180b57cec5SDimitry Andric return false;
5190b57cec5SDimitry Andric }
5200b57cec5SDimitry Andric
521349cc55cSDimitry Andric if (Delimited && !EndDelimiterFound) {
522349cc55cSDimitry Andric if (Diags) {
523349cc55cSDimitry Andric Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
524349cc55cSDimitry Andric diag::err_expected)
525349cc55cSDimitry Andric << tok::r_brace;
526349cc55cSDimitry Andric }
527349cc55cSDimitry Andric return false;
528349cc55cSDimitry Andric }
529349cc55cSDimitry Andric
530349cc55cSDimitry Andric // If we didn't consume the proper number of digits, there is a problem.
531349cc55cSDimitry Andric if (Count == 0 || (!Delimited && Count != UcnLen)) {
532349cc55cSDimitry Andric if (Diags)
533349cc55cSDimitry Andric Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
534349cc55cSDimitry Andric Delimited ? diag::err_delimited_escape_empty
535349cc55cSDimitry Andric : diag::err_ucn_escape_incomplete);
536349cc55cSDimitry Andric return false;
537349cc55cSDimitry Andric }
53881ad6265SDimitry Andric return !HasError;
53981ad6265SDimitry Andric }
540349cc55cSDimitry Andric
DiagnoseInvalidUnicodeCharacterName(DiagnosticsEngine * Diags,const LangOptions & Features,FullSourceLoc Loc,const char * TokBegin,const char * TokRangeBegin,const char * TokRangeEnd,llvm::StringRef Name)54181ad6265SDimitry Andric static void DiagnoseInvalidUnicodeCharacterName(
54281ad6265SDimitry Andric DiagnosticsEngine *Diags, const LangOptions &Features, FullSourceLoc Loc,
54381ad6265SDimitry Andric const char *TokBegin, const char *TokRangeBegin, const char *TokRangeEnd,
54481ad6265SDimitry Andric llvm::StringRef Name) {
54581ad6265SDimitry Andric
54681ad6265SDimitry Andric Diag(Diags, Features, Loc, TokBegin, TokRangeBegin, TokRangeEnd,
54781ad6265SDimitry Andric diag::err_invalid_ucn_name)
54881ad6265SDimitry Andric << Name;
54981ad6265SDimitry Andric
55081ad6265SDimitry Andric namespace u = llvm::sys::unicode;
55181ad6265SDimitry Andric
552bdd1243dSDimitry Andric std::optional<u::LooseMatchingResult> Res =
55381ad6265SDimitry Andric u::nameToCodepointLooseMatching(Name);
55481ad6265SDimitry Andric if (Res) {
55581ad6265SDimitry Andric Diag(Diags, Features, Loc, TokBegin, TokRangeBegin, TokRangeEnd,
55681ad6265SDimitry Andric diag::note_invalid_ucn_name_loose_matching)
55781ad6265SDimitry Andric << FixItHint::CreateReplacement(
55881ad6265SDimitry Andric MakeCharSourceRange(Features, Loc, TokBegin, TokRangeBegin,
55981ad6265SDimitry Andric TokRangeEnd),
56081ad6265SDimitry Andric Res->Name);
56181ad6265SDimitry Andric return;
56281ad6265SDimitry Andric }
56381ad6265SDimitry Andric
56481ad6265SDimitry Andric unsigned Distance = 0;
56581ad6265SDimitry Andric SmallVector<u::MatchForCodepointName> Matches =
56681ad6265SDimitry Andric u::nearestMatchesForCodepointName(Name, 5);
56781ad6265SDimitry Andric assert(!Matches.empty() && "No unicode characters found");
56881ad6265SDimitry Andric
56981ad6265SDimitry Andric for (const auto &Match : Matches) {
57081ad6265SDimitry Andric if (Distance == 0)
57181ad6265SDimitry Andric Distance = Match.Distance;
57281ad6265SDimitry Andric if (std::max(Distance, Match.Distance) -
57381ad6265SDimitry Andric std::min(Distance, Match.Distance) >
57481ad6265SDimitry Andric 3)
57581ad6265SDimitry Andric break;
57681ad6265SDimitry Andric Distance = Match.Distance;
57781ad6265SDimitry Andric
57881ad6265SDimitry Andric std::string Str;
57981ad6265SDimitry Andric llvm::UTF32 V = Match.Value;
580bdd1243dSDimitry Andric bool Converted =
58181ad6265SDimitry Andric llvm::convertUTF32ToUTF8String(llvm::ArrayRef<llvm::UTF32>(&V, 1), Str);
582bdd1243dSDimitry Andric (void)Converted;
58381ad6265SDimitry Andric assert(Converted && "Found a match wich is not a unicode character");
58481ad6265SDimitry Andric
58581ad6265SDimitry Andric Diag(Diags, Features, Loc, TokBegin, TokRangeBegin, TokRangeEnd,
58681ad6265SDimitry Andric diag::note_invalid_ucn_name_candidate)
58781ad6265SDimitry Andric << Match.Name << llvm::utohexstr(Match.Value)
58881ad6265SDimitry Andric << Str // FIXME: Fix the rendering of non printable characters
58981ad6265SDimitry Andric << FixItHint::CreateReplacement(
59081ad6265SDimitry Andric MakeCharSourceRange(Features, Loc, TokBegin, TokRangeBegin,
59181ad6265SDimitry Andric TokRangeEnd),
59281ad6265SDimitry Andric Match.Name);
59381ad6265SDimitry Andric }
59481ad6265SDimitry Andric }
59581ad6265SDimitry Andric
ProcessNamedUCNEscape(const char * ThisTokBegin,const char * & ThisTokBuf,const char * ThisTokEnd,uint32_t & UcnVal,unsigned short & UcnLen,FullSourceLoc Loc,DiagnosticsEngine * Diags,const LangOptions & Features)59681ad6265SDimitry Andric static bool ProcessNamedUCNEscape(const char *ThisTokBegin,
59781ad6265SDimitry Andric const char *&ThisTokBuf,
59881ad6265SDimitry Andric const char *ThisTokEnd, uint32_t &UcnVal,
59981ad6265SDimitry Andric unsigned short &UcnLen, FullSourceLoc Loc,
60081ad6265SDimitry Andric DiagnosticsEngine *Diags,
60181ad6265SDimitry Andric const LangOptions &Features) {
60281ad6265SDimitry Andric const char *UcnBegin = ThisTokBuf;
60381ad6265SDimitry Andric assert(UcnBegin[0] == '\\' && UcnBegin[1] == 'N');
60481ad6265SDimitry Andric ThisTokBuf += 2;
60581ad6265SDimitry Andric if (ThisTokBuf == ThisTokEnd || *ThisTokBuf != '{') {
60681ad6265SDimitry Andric if (Diags) {
60781ad6265SDimitry Andric Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
60881ad6265SDimitry Andric diag::err_delimited_escape_missing_brace)
60981ad6265SDimitry Andric << StringRef(&ThisTokBuf[-1], 1);
61081ad6265SDimitry Andric }
61181ad6265SDimitry Andric return false;
61281ad6265SDimitry Andric }
61381ad6265SDimitry Andric ThisTokBuf++;
614bdd1243dSDimitry Andric const char *ClosingBrace = std::find_if(ThisTokBuf, ThisTokEnd, [](char C) {
615bdd1243dSDimitry Andric return C == '}' || isVerticalWhitespace(C);
61681ad6265SDimitry Andric });
617bdd1243dSDimitry Andric bool Incomplete = ClosingBrace == ThisTokEnd;
61881ad6265SDimitry Andric bool Empty = ClosingBrace == ThisTokBuf;
61981ad6265SDimitry Andric if (Incomplete || Empty) {
62081ad6265SDimitry Andric if (Diags) {
62181ad6265SDimitry Andric Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
62281ad6265SDimitry Andric Incomplete ? diag::err_ucn_escape_incomplete
62381ad6265SDimitry Andric : diag::err_delimited_escape_empty)
62481ad6265SDimitry Andric << StringRef(&UcnBegin[1], 1);
62581ad6265SDimitry Andric }
62681ad6265SDimitry Andric ThisTokBuf = ClosingBrace == ThisTokEnd ? ClosingBrace : ClosingBrace + 1;
62781ad6265SDimitry Andric return false;
62881ad6265SDimitry Andric }
62981ad6265SDimitry Andric StringRef Name(ThisTokBuf, ClosingBrace - ThisTokBuf);
63081ad6265SDimitry Andric ThisTokBuf = ClosingBrace + 1;
631bdd1243dSDimitry Andric std::optional<char32_t> Res = llvm::sys::unicode::nameToCodepointStrict(Name);
63281ad6265SDimitry Andric if (!Res) {
63381ad6265SDimitry Andric if (Diags)
63481ad6265SDimitry Andric DiagnoseInvalidUnicodeCharacterName(Diags, Features, Loc, ThisTokBegin,
63581ad6265SDimitry Andric &UcnBegin[3], ClosingBrace, Name);
63681ad6265SDimitry Andric return false;
63781ad6265SDimitry Andric }
63881ad6265SDimitry Andric UcnVal = *Res;
63981ad6265SDimitry Andric UcnLen = UcnVal > 0xFFFF ? 8 : 4;
64081ad6265SDimitry Andric return true;
64181ad6265SDimitry Andric }
64281ad6265SDimitry Andric
64381ad6265SDimitry Andric /// ProcessUCNEscape - Read the Universal Character Name, check constraints and
64481ad6265SDimitry Andric /// return the UTF32.
ProcessUCNEscape(const char * ThisTokBegin,const char * & ThisTokBuf,const char * ThisTokEnd,uint32_t & UcnVal,unsigned short & UcnLen,FullSourceLoc Loc,DiagnosticsEngine * Diags,const LangOptions & Features,bool in_char_string_literal=false)64581ad6265SDimitry Andric static bool ProcessUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
64681ad6265SDimitry Andric const char *ThisTokEnd, uint32_t &UcnVal,
64781ad6265SDimitry Andric unsigned short &UcnLen, FullSourceLoc Loc,
64881ad6265SDimitry Andric DiagnosticsEngine *Diags,
64981ad6265SDimitry Andric const LangOptions &Features,
65081ad6265SDimitry Andric bool in_char_string_literal = false) {
65181ad6265SDimitry Andric
65281ad6265SDimitry Andric bool HasError;
65381ad6265SDimitry Andric const char *UcnBegin = ThisTokBuf;
65481ad6265SDimitry Andric bool IsDelimitedEscapeSequence = false;
65581ad6265SDimitry Andric bool IsNamedEscapeSequence = false;
65681ad6265SDimitry Andric if (ThisTokBuf[1] == 'N') {
65781ad6265SDimitry Andric IsNamedEscapeSequence = true;
65881ad6265SDimitry Andric HasError = !ProcessNamedUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
65981ad6265SDimitry Andric UcnVal, UcnLen, Loc, Diags, Features);
66081ad6265SDimitry Andric } else {
66181ad6265SDimitry Andric HasError =
66281ad6265SDimitry Andric !ProcessNumericUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal,
66381ad6265SDimitry Andric UcnLen, IsDelimitedEscapeSequence, Loc, Diags,
66481ad6265SDimitry Andric Features, in_char_string_literal);
66581ad6265SDimitry Andric }
666349cc55cSDimitry Andric if (HasError)
667349cc55cSDimitry Andric return false;
668349cc55cSDimitry Andric
6690b57cec5SDimitry Andric // Check UCN constraints (C99 6.4.3p2) [C++11 lex.charset p2]
6700b57cec5SDimitry Andric if ((0xD800 <= UcnVal && UcnVal <= 0xDFFF) || // surrogate codepoints
6710b57cec5SDimitry Andric UcnVal > 0x10FFFF) { // maximum legal UTF32 value
6720b57cec5SDimitry Andric if (Diags)
6730b57cec5SDimitry Andric Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
6740b57cec5SDimitry Andric diag::err_ucn_escape_invalid);
6750b57cec5SDimitry Andric return false;
6760b57cec5SDimitry Andric }
6770b57cec5SDimitry Andric
6785f757f3fSDimitry Andric // C23 and C++11 allow UCNs that refer to control characters
67906c3fb27SDimitry Andric // and basic source characters inside character and string literals
6800b57cec5SDimitry Andric if (UcnVal < 0xa0 &&
68106c3fb27SDimitry Andric // $, @, ` are allowed in all language modes
68206c3fb27SDimitry Andric (UcnVal != 0x24 && UcnVal != 0x40 && UcnVal != 0x60)) {
68306c3fb27SDimitry Andric bool IsError =
6845f757f3fSDimitry Andric (!(Features.CPlusPlus11 || Features.C23) || !in_char_string_literal);
6850b57cec5SDimitry Andric if (Diags) {
6860b57cec5SDimitry Andric char BasicSCSChar = UcnVal;
6870b57cec5SDimitry Andric if (UcnVal >= 0x20 && UcnVal < 0x7f)
6880b57cec5SDimitry Andric Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
68906c3fb27SDimitry Andric IsError ? diag::err_ucn_escape_basic_scs
69006c3fb27SDimitry Andric : Features.CPlusPlus
69106c3fb27SDimitry Andric ? diag::warn_cxx98_compat_literal_ucn_escape_basic_scs
6925f757f3fSDimitry Andric : diag::warn_c23_compat_literal_ucn_escape_basic_scs)
6930b57cec5SDimitry Andric << StringRef(&BasicSCSChar, 1);
6940b57cec5SDimitry Andric else
6950b57cec5SDimitry Andric Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
69606c3fb27SDimitry Andric IsError ? diag::err_ucn_control_character
69706c3fb27SDimitry Andric : Features.CPlusPlus
69806c3fb27SDimitry Andric ? diag::warn_cxx98_compat_literal_ucn_control_character
6995f757f3fSDimitry Andric : diag::warn_c23_compat_literal_ucn_control_character);
7000b57cec5SDimitry Andric }
7010b57cec5SDimitry Andric if (IsError)
7020b57cec5SDimitry Andric return false;
7030b57cec5SDimitry Andric }
7040b57cec5SDimitry Andric
7050b57cec5SDimitry Andric if (!Features.CPlusPlus && !Features.C99 && Diags)
7060b57cec5SDimitry Andric Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
7070b57cec5SDimitry Andric diag::warn_ucn_not_valid_in_c89_literal);
7080b57cec5SDimitry Andric
70981ad6265SDimitry Andric if ((IsDelimitedEscapeSequence || IsNamedEscapeSequence) && Diags)
710349cc55cSDimitry Andric Diag(Diags, Features, Loc, ThisTokBegin, UcnBegin, ThisTokBuf,
71106c3fb27SDimitry Andric Features.CPlusPlus23 ? diag::warn_cxx23_delimited_escape_sequence
712753f127fSDimitry Andric : diag::ext_delimited_escape_sequence)
713753f127fSDimitry Andric << (IsNamedEscapeSequence ? 1 : 0) << (Features.CPlusPlus ? 1 : 0);
714349cc55cSDimitry Andric
7150b57cec5SDimitry Andric return true;
7160b57cec5SDimitry Andric }
7170b57cec5SDimitry Andric
7180b57cec5SDimitry Andric /// MeasureUCNEscape - Determine the number of bytes within the resulting string
7190b57cec5SDimitry Andric /// which this UCN will occupy.
MeasureUCNEscape(const char * ThisTokBegin,const char * & ThisTokBuf,const char * ThisTokEnd,unsigned CharByteWidth,const LangOptions & Features,bool & HadError)7200b57cec5SDimitry Andric static int MeasureUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
7210b57cec5SDimitry Andric const char *ThisTokEnd, unsigned CharByteWidth,
7220b57cec5SDimitry Andric const LangOptions &Features, bool &HadError) {
7230b57cec5SDimitry Andric // UTF-32: 4 bytes per escape.
7240b57cec5SDimitry Andric if (CharByteWidth == 4)
7250b57cec5SDimitry Andric return 4;
7260b57cec5SDimitry Andric
7270b57cec5SDimitry Andric uint32_t UcnVal = 0;
7280b57cec5SDimitry Andric unsigned short UcnLen = 0;
7290b57cec5SDimitry Andric FullSourceLoc Loc;
7300b57cec5SDimitry Andric
7310b57cec5SDimitry Andric if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal,
7320b57cec5SDimitry Andric UcnLen, Loc, nullptr, Features, true)) {
7330b57cec5SDimitry Andric HadError = true;
7340b57cec5SDimitry Andric return 0;
7350b57cec5SDimitry Andric }
7360b57cec5SDimitry Andric
7370b57cec5SDimitry Andric // UTF-16: 2 bytes for BMP, 4 bytes otherwise.
7380b57cec5SDimitry Andric if (CharByteWidth == 2)
7390b57cec5SDimitry Andric return UcnVal <= 0xFFFF ? 2 : 4;
7400b57cec5SDimitry Andric
7410b57cec5SDimitry Andric // UTF-8.
7420b57cec5SDimitry Andric if (UcnVal < 0x80)
7430b57cec5SDimitry Andric return 1;
7440b57cec5SDimitry Andric if (UcnVal < 0x800)
7450b57cec5SDimitry Andric return 2;
7460b57cec5SDimitry Andric if (UcnVal < 0x10000)
7470b57cec5SDimitry Andric return 3;
7480b57cec5SDimitry Andric return 4;
7490b57cec5SDimitry Andric }
7500b57cec5SDimitry Andric
7510b57cec5SDimitry Andric /// EncodeUCNEscape - Read the Universal Character Name, check constraints and
7520b57cec5SDimitry Andric /// convert the UTF32 to UTF8 or UTF16. This is a subroutine of
7530b57cec5SDimitry Andric /// StringLiteralParser. When we decide to implement UCN's for identifiers,
7540b57cec5SDimitry Andric /// we will likely rework our support for UCN's.
EncodeUCNEscape(const char * ThisTokBegin,const char * & ThisTokBuf,const char * ThisTokEnd,char * & ResultBuf,bool & HadError,FullSourceLoc Loc,unsigned CharByteWidth,DiagnosticsEngine * Diags,const LangOptions & Features)7550b57cec5SDimitry Andric static void EncodeUCNEscape(const char *ThisTokBegin, const char *&ThisTokBuf,
7560b57cec5SDimitry Andric const char *ThisTokEnd,
7570b57cec5SDimitry Andric char *&ResultBuf, bool &HadError,
7580b57cec5SDimitry Andric FullSourceLoc Loc, unsigned CharByteWidth,
7590b57cec5SDimitry Andric DiagnosticsEngine *Diags,
7600b57cec5SDimitry Andric const LangOptions &Features) {
7610b57cec5SDimitry Andric typedef uint32_t UTF32;
7620b57cec5SDimitry Andric UTF32 UcnVal = 0;
7630b57cec5SDimitry Andric unsigned short UcnLen = 0;
7640b57cec5SDimitry Andric if (!ProcessUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, UcnVal, UcnLen,
7650b57cec5SDimitry Andric Loc, Diags, Features, true)) {
7660b57cec5SDimitry Andric HadError = true;
7670b57cec5SDimitry Andric return;
7680b57cec5SDimitry Andric }
7690b57cec5SDimitry Andric
7700b57cec5SDimitry Andric assert((CharByteWidth == 1 || CharByteWidth == 2 || CharByteWidth == 4) &&
7710b57cec5SDimitry Andric "only character widths of 1, 2, or 4 bytes supported");
7720b57cec5SDimitry Andric
7730b57cec5SDimitry Andric (void)UcnLen;
7740b57cec5SDimitry Andric assert((UcnLen== 4 || UcnLen== 8) && "only ucn length of 4 or 8 supported");
7750b57cec5SDimitry Andric
7760b57cec5SDimitry Andric if (CharByteWidth == 4) {
7770b57cec5SDimitry Andric // FIXME: Make the type of the result buffer correct instead of
7780b57cec5SDimitry Andric // using reinterpret_cast.
7790b57cec5SDimitry Andric llvm::UTF32 *ResultPtr = reinterpret_cast<llvm::UTF32*>(ResultBuf);
7800b57cec5SDimitry Andric *ResultPtr = UcnVal;
7810b57cec5SDimitry Andric ResultBuf += 4;
7820b57cec5SDimitry Andric return;
7830b57cec5SDimitry Andric }
7840b57cec5SDimitry Andric
7850b57cec5SDimitry Andric if (CharByteWidth == 2) {
7860b57cec5SDimitry Andric // FIXME: Make the type of the result buffer correct instead of
7870b57cec5SDimitry Andric // using reinterpret_cast.
7880b57cec5SDimitry Andric llvm::UTF16 *ResultPtr = reinterpret_cast<llvm::UTF16*>(ResultBuf);
7890b57cec5SDimitry Andric
7900b57cec5SDimitry Andric if (UcnVal <= (UTF32)0xFFFF) {
7910b57cec5SDimitry Andric *ResultPtr = UcnVal;
7920b57cec5SDimitry Andric ResultBuf += 2;
7930b57cec5SDimitry Andric return;
7940b57cec5SDimitry Andric }
7950b57cec5SDimitry Andric
7960b57cec5SDimitry Andric // Convert to UTF16.
7970b57cec5SDimitry Andric UcnVal -= 0x10000;
7980b57cec5SDimitry Andric *ResultPtr = 0xD800 + (UcnVal >> 10);
7990b57cec5SDimitry Andric *(ResultPtr+1) = 0xDC00 + (UcnVal & 0x3FF);
8000b57cec5SDimitry Andric ResultBuf += 4;
8010b57cec5SDimitry Andric return;
8020b57cec5SDimitry Andric }
8030b57cec5SDimitry Andric
8040b57cec5SDimitry Andric assert(CharByteWidth == 1 && "UTF-8 encoding is only for 1 byte characters");
8050b57cec5SDimitry Andric
8060b57cec5SDimitry Andric // Now that we've parsed/checked the UCN, we convert from UTF32->UTF8.
8070b57cec5SDimitry Andric // The conversion below was inspired by:
8080b57cec5SDimitry Andric // http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
8090b57cec5SDimitry Andric // First, we determine how many bytes the result will require.
8100b57cec5SDimitry Andric typedef uint8_t UTF8;
8110b57cec5SDimitry Andric
8120b57cec5SDimitry Andric unsigned short bytesToWrite = 0;
8130b57cec5SDimitry Andric if (UcnVal < (UTF32)0x80)
8140b57cec5SDimitry Andric bytesToWrite = 1;
8150b57cec5SDimitry Andric else if (UcnVal < (UTF32)0x800)
8160b57cec5SDimitry Andric bytesToWrite = 2;
8170b57cec5SDimitry Andric else if (UcnVal < (UTF32)0x10000)
8180b57cec5SDimitry Andric bytesToWrite = 3;
8190b57cec5SDimitry Andric else
8200b57cec5SDimitry Andric bytesToWrite = 4;
8210b57cec5SDimitry Andric
8220b57cec5SDimitry Andric const unsigned byteMask = 0xBF;
8230b57cec5SDimitry Andric const unsigned byteMark = 0x80;
8240b57cec5SDimitry Andric
8250b57cec5SDimitry Andric // Once the bits are split out into bytes of UTF8, this is a mask OR-ed
8260b57cec5SDimitry Andric // into the first byte, depending on how many bytes follow.
8270b57cec5SDimitry Andric static const UTF8 firstByteMark[5] = {
8280b57cec5SDimitry Andric 0x00, 0x00, 0xC0, 0xE0, 0xF0
8290b57cec5SDimitry Andric };
8300b57cec5SDimitry Andric // Finally, we write the bytes into ResultBuf.
8310b57cec5SDimitry Andric ResultBuf += bytesToWrite;
8320b57cec5SDimitry Andric switch (bytesToWrite) { // note: everything falls through.
8330b57cec5SDimitry Andric case 4:
8340b57cec5SDimitry Andric *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
835bdd1243dSDimitry Andric [[fallthrough]];
8360b57cec5SDimitry Andric case 3:
8370b57cec5SDimitry Andric *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
838bdd1243dSDimitry Andric [[fallthrough]];
8390b57cec5SDimitry Andric case 2:
8400b57cec5SDimitry Andric *--ResultBuf = (UTF8)((UcnVal | byteMark) & byteMask); UcnVal >>= 6;
841bdd1243dSDimitry Andric [[fallthrough]];
8420b57cec5SDimitry Andric case 1:
8430b57cec5SDimitry Andric *--ResultBuf = (UTF8) (UcnVal | firstByteMark[bytesToWrite]);
8440b57cec5SDimitry Andric }
8450b57cec5SDimitry Andric // Update the buffer.
8460b57cec5SDimitry Andric ResultBuf += bytesToWrite;
8470b57cec5SDimitry Andric }
8480b57cec5SDimitry Andric
8490b57cec5SDimitry Andric /// integer-constant: [C99 6.4.4.1]
8500b57cec5SDimitry Andric /// decimal-constant integer-suffix
8510b57cec5SDimitry Andric /// octal-constant integer-suffix
8520b57cec5SDimitry Andric /// hexadecimal-constant integer-suffix
8530b57cec5SDimitry Andric /// binary-literal integer-suffix [GNU, C++1y]
8540b57cec5SDimitry Andric /// user-defined-integer-literal: [C++11 lex.ext]
8550b57cec5SDimitry Andric /// decimal-literal ud-suffix
8560b57cec5SDimitry Andric /// octal-literal ud-suffix
8570b57cec5SDimitry Andric /// hexadecimal-literal ud-suffix
8580b57cec5SDimitry Andric /// binary-literal ud-suffix [GNU, C++1y]
8590b57cec5SDimitry Andric /// decimal-constant:
8600b57cec5SDimitry Andric /// nonzero-digit
8610b57cec5SDimitry Andric /// decimal-constant digit
8620b57cec5SDimitry Andric /// octal-constant:
8630b57cec5SDimitry Andric /// 0
8640b57cec5SDimitry Andric /// octal-constant octal-digit
8650b57cec5SDimitry Andric /// hexadecimal-constant:
8660b57cec5SDimitry Andric /// hexadecimal-prefix hexadecimal-digit
8670b57cec5SDimitry Andric /// hexadecimal-constant hexadecimal-digit
8680b57cec5SDimitry Andric /// hexadecimal-prefix: one of
8690b57cec5SDimitry Andric /// 0x 0X
8700b57cec5SDimitry Andric /// binary-literal:
8710b57cec5SDimitry Andric /// 0b binary-digit
8720b57cec5SDimitry Andric /// 0B binary-digit
8730b57cec5SDimitry Andric /// binary-literal binary-digit
8740b57cec5SDimitry Andric /// integer-suffix:
8750b57cec5SDimitry Andric /// unsigned-suffix [long-suffix]
8760b57cec5SDimitry Andric /// unsigned-suffix [long-long-suffix]
8770b57cec5SDimitry Andric /// long-suffix [unsigned-suffix]
///         long-long-suffix [unsigned-suffix]
8790b57cec5SDimitry Andric /// nonzero-digit:
8800b57cec5SDimitry Andric /// 1 2 3 4 5 6 7 8 9
8810b57cec5SDimitry Andric /// octal-digit:
8820b57cec5SDimitry Andric /// 0 1 2 3 4 5 6 7
8830b57cec5SDimitry Andric /// hexadecimal-digit:
8840b57cec5SDimitry Andric /// 0 1 2 3 4 5 6 7 8 9
8850b57cec5SDimitry Andric /// a b c d e f
8860b57cec5SDimitry Andric /// A B C D E F
8870b57cec5SDimitry Andric /// binary-digit:
8880b57cec5SDimitry Andric /// 0
8890b57cec5SDimitry Andric /// 1
8900b57cec5SDimitry Andric /// unsigned-suffix: one of
8910b57cec5SDimitry Andric /// u U
8920b57cec5SDimitry Andric /// long-suffix: one of
8930b57cec5SDimitry Andric /// l L
8940b57cec5SDimitry Andric /// long-long-suffix: one of
8950b57cec5SDimitry Andric /// ll LL
8960b57cec5SDimitry Andric ///
8970b57cec5SDimitry Andric /// floating-constant: [C99 6.4.4.2]
8980b57cec5SDimitry Andric /// TODO: add rules...
8990b57cec5SDimitry Andric ///
NumericLiteralParser(StringRef TokSpelling,SourceLocation TokLoc,const SourceManager & SM,const LangOptions & LangOpts,const TargetInfo & Target,DiagnosticsEngine & Diags)9000b57cec5SDimitry Andric NumericLiteralParser::NumericLiteralParser(StringRef TokSpelling,
9010b57cec5SDimitry Andric SourceLocation TokLoc,
9025ffd83dbSDimitry Andric const SourceManager &SM,
9035ffd83dbSDimitry Andric const LangOptions &LangOpts,
9045ffd83dbSDimitry Andric const TargetInfo &Target,
9055ffd83dbSDimitry Andric DiagnosticsEngine &Diags)
9065ffd83dbSDimitry Andric : SM(SM), LangOpts(LangOpts), Diags(Diags),
9075ffd83dbSDimitry Andric ThisTokBegin(TokSpelling.begin()), ThisTokEnd(TokSpelling.end()) {
9080b57cec5SDimitry Andric
9090b57cec5SDimitry Andric s = DigitsBegin = ThisTokBegin;
9100b57cec5SDimitry Andric saw_exponent = false;
9110b57cec5SDimitry Andric saw_period = false;
9120b57cec5SDimitry Andric saw_ud_suffix = false;
9130b57cec5SDimitry Andric saw_fixed_point_suffix = false;
9140b57cec5SDimitry Andric isLong = false;
9150b57cec5SDimitry Andric isUnsigned = false;
9160b57cec5SDimitry Andric isLongLong = false;
917fe6060f1SDimitry Andric isSizeT = false;
9180b57cec5SDimitry Andric isHalf = false;
9190b57cec5SDimitry Andric isFloat = false;
9200b57cec5SDimitry Andric isImaginary = false;
9210b57cec5SDimitry Andric isFloat16 = false;
9220b57cec5SDimitry Andric isFloat128 = false;
9230b57cec5SDimitry Andric MicrosoftInteger = 0;
9240b57cec5SDimitry Andric isFract = false;
9250b57cec5SDimitry Andric isAccum = false;
9260b57cec5SDimitry Andric hadError = false;
92781ad6265SDimitry Andric isBitInt = false;
9280b57cec5SDimitry Andric
929349cc55cSDimitry Andric // This routine assumes that the range begin/end matches the regex for integer
930349cc55cSDimitry Andric // and FP constants (specifically, the 'pp-number' regex), and assumes that
931349cc55cSDimitry Andric // the byte at "*end" is both valid and not part of the regex. Because of
932349cc55cSDimitry Andric // this, it doesn't have to check for 'overscan' in various places.
9335f757f3fSDimitry Andric // Note: For HLSL, the end token is allowed to be '.' which would be in the
9345f757f3fSDimitry Andric // 'pp-number' regex. This is required to support vector swizzles on numeric
9355f757f3fSDimitry Andric // constants (i.e. 1.xx or 1.5f.rrr).
9365f757f3fSDimitry Andric if (isPreprocessingNumberBody(*ThisTokEnd) &&
9375f757f3fSDimitry Andric !(LangOpts.HLSL && *ThisTokEnd == '.')) {
938349cc55cSDimitry Andric Diags.Report(TokLoc, diag::err_lexing_numeric);
939349cc55cSDimitry Andric hadError = true;
940349cc55cSDimitry Andric return;
941349cc55cSDimitry Andric }
942349cc55cSDimitry Andric
9430b57cec5SDimitry Andric if (*s == '0') { // parse radix
9440b57cec5SDimitry Andric ParseNumberStartingWithZero(TokLoc);
9450b57cec5SDimitry Andric if (hadError)
9460b57cec5SDimitry Andric return;
9470b57cec5SDimitry Andric } else { // the first digit is non-zero
9480b57cec5SDimitry Andric radix = 10;
9490b57cec5SDimitry Andric s = SkipDigits(s);
9500b57cec5SDimitry Andric if (s == ThisTokEnd) {
9510b57cec5SDimitry Andric // Done.
9520b57cec5SDimitry Andric } else {
9530b57cec5SDimitry Andric ParseDecimalOrOctalCommon(TokLoc);
9540b57cec5SDimitry Andric if (hadError)
9550b57cec5SDimitry Andric return;
9560b57cec5SDimitry Andric }
9570b57cec5SDimitry Andric }
9580b57cec5SDimitry Andric
9590b57cec5SDimitry Andric SuffixBegin = s;
9600b57cec5SDimitry Andric checkSeparator(TokLoc, s, CSK_AfterDigits);
9610b57cec5SDimitry Andric
9620b57cec5SDimitry Andric // Initial scan to lookahead for fixed point suffix.
9635ffd83dbSDimitry Andric if (LangOpts.FixedPoint) {
9640b57cec5SDimitry Andric for (const char *c = s; c != ThisTokEnd; ++c) {
9650b57cec5SDimitry Andric if (*c == 'r' || *c == 'k' || *c == 'R' || *c == 'K') {
9660b57cec5SDimitry Andric saw_fixed_point_suffix = true;
9670b57cec5SDimitry Andric break;
9680b57cec5SDimitry Andric }
9690b57cec5SDimitry Andric }
9700b57cec5SDimitry Andric }
9710b57cec5SDimitry Andric
9720b57cec5SDimitry Andric // Parse the suffix. At this point we can classify whether we have an FP or
9730b57cec5SDimitry Andric // integer constant.
9745ffd83dbSDimitry Andric bool isFixedPointConstant = isFixedPointLiteral();
9750b57cec5SDimitry Andric bool isFPConstant = isFloatingLiteral();
976fe6060f1SDimitry Andric bool HasSize = false;
9770b57cec5SDimitry Andric
9780b57cec5SDimitry Andric // Loop over all of the characters of the suffix. If we see something bad,
9790b57cec5SDimitry Andric // we break out of the loop.
9800b57cec5SDimitry Andric for (; s != ThisTokEnd; ++s) {
9810b57cec5SDimitry Andric switch (*s) {
9820b57cec5SDimitry Andric case 'R':
9830b57cec5SDimitry Andric case 'r':
9845ffd83dbSDimitry Andric if (!LangOpts.FixedPoint)
9855ffd83dbSDimitry Andric break;
9860b57cec5SDimitry Andric if (isFract || isAccum) break;
9870b57cec5SDimitry Andric if (!(saw_period || saw_exponent)) break;
9880b57cec5SDimitry Andric isFract = true;
9890b57cec5SDimitry Andric continue;
9900b57cec5SDimitry Andric case 'K':
9910b57cec5SDimitry Andric case 'k':
9925ffd83dbSDimitry Andric if (!LangOpts.FixedPoint)
9935ffd83dbSDimitry Andric break;
9940b57cec5SDimitry Andric if (isFract || isAccum) break;
9950b57cec5SDimitry Andric if (!(saw_period || saw_exponent)) break;
9960b57cec5SDimitry Andric isAccum = true;
9970b57cec5SDimitry Andric continue;
9980b57cec5SDimitry Andric case 'h': // FP Suffix for "half".
9990b57cec5SDimitry Andric case 'H':
10000b57cec5SDimitry Andric // OpenCL Extension v1.2 s9.5 - h or H suffix for half type.
10015ffd83dbSDimitry Andric if (!(LangOpts.Half || LangOpts.FixedPoint))
10025ffd83dbSDimitry Andric break;
10030b57cec5SDimitry Andric if (isIntegerLiteral()) break; // Error for integer constant.
1004fe6060f1SDimitry Andric if (HasSize)
1005fe6060f1SDimitry Andric break;
1006fe6060f1SDimitry Andric HasSize = true;
10070b57cec5SDimitry Andric isHalf = true;
10080b57cec5SDimitry Andric continue; // Success.
10090b57cec5SDimitry Andric case 'f': // FP Suffix for "float"
10100b57cec5SDimitry Andric case 'F':
10110b57cec5SDimitry Andric if (!isFPConstant) break; // Error for integer constant.
1012fe6060f1SDimitry Andric if (HasSize)
1013fe6060f1SDimitry Andric break;
1014fe6060f1SDimitry Andric HasSize = true;
10150b57cec5SDimitry Andric
10160b57cec5SDimitry Andric // CUDA host and device may have different _Float16 support, therefore
10170b57cec5SDimitry Andric // allows f16 literals to avoid false alarm.
1018bdd1243dSDimitry Andric // When we compile for OpenMP target offloading on NVPTX, f16 suffix
1019bdd1243dSDimitry Andric // should also be supported.
10200b57cec5SDimitry Andric // ToDo: more precise check for CUDA.
1021bdd1243dSDimitry Andric // TODO: AMDGPU might also support it in the future.
1022bdd1243dSDimitry Andric if ((Target.hasFloat16Type() || LangOpts.CUDA ||
102306c3fb27SDimitry Andric (LangOpts.OpenMPIsTargetDevice && Target.getTriple().isNVPTX())) &&
1024bdd1243dSDimitry Andric s + 2 < ThisTokEnd && s[1] == '1' && s[2] == '6') {
10250b57cec5SDimitry Andric s += 2; // success, eat up 2 characters.
10260b57cec5SDimitry Andric isFloat16 = true;
10270b57cec5SDimitry Andric continue;
10280b57cec5SDimitry Andric }
10290b57cec5SDimitry Andric
10300b57cec5SDimitry Andric isFloat = true;
10310b57cec5SDimitry Andric continue; // Success.
10320b57cec5SDimitry Andric case 'q': // FP Suffix for "__float128"
10330b57cec5SDimitry Andric case 'Q':
10340b57cec5SDimitry Andric if (!isFPConstant) break; // Error for integer constant.
1035fe6060f1SDimitry Andric if (HasSize)
1036fe6060f1SDimitry Andric break;
1037fe6060f1SDimitry Andric HasSize = true;
10380b57cec5SDimitry Andric isFloat128 = true;
10390b57cec5SDimitry Andric continue; // Success.
10400b57cec5SDimitry Andric case 'u':
10410b57cec5SDimitry Andric case 'U':
10420b57cec5SDimitry Andric if (isFPConstant) break; // Error for floating constant.
10430b57cec5SDimitry Andric if (isUnsigned) break; // Cannot be repeated.
10440b57cec5SDimitry Andric isUnsigned = true;
10450b57cec5SDimitry Andric continue; // Success.
10460b57cec5SDimitry Andric case 'l':
10470b57cec5SDimitry Andric case 'L':
1048fe6060f1SDimitry Andric if (HasSize)
1049fe6060f1SDimitry Andric break;
1050fe6060f1SDimitry Andric HasSize = true;
10510b57cec5SDimitry Andric
10520b57cec5SDimitry Andric // Check for long long. The L's need to be adjacent and the same case.
10530b57cec5SDimitry Andric if (s[1] == s[0]) {
10540b57cec5SDimitry Andric assert(s + 1 < ThisTokEnd && "didn't maximally munch?");
10550b57cec5SDimitry Andric if (isFPConstant) break; // long long invalid for floats.
10560b57cec5SDimitry Andric isLongLong = true;
10570b57cec5SDimitry Andric ++s; // Eat both of them.
10580b57cec5SDimitry Andric } else {
10590b57cec5SDimitry Andric isLong = true;
10600b57cec5SDimitry Andric }
10610b57cec5SDimitry Andric continue; // Success.
1062fe6060f1SDimitry Andric case 'z':
1063fe6060f1SDimitry Andric case 'Z':
1064fe6060f1SDimitry Andric if (isFPConstant)
1065fe6060f1SDimitry Andric break; // Invalid for floats.
1066fe6060f1SDimitry Andric if (HasSize)
1067fe6060f1SDimitry Andric break;
1068fe6060f1SDimitry Andric HasSize = true;
1069fe6060f1SDimitry Andric isSizeT = true;
1070fe6060f1SDimitry Andric continue;
10710b57cec5SDimitry Andric case 'i':
10720b57cec5SDimitry Andric case 'I':
1073fe6060f1SDimitry Andric if (LangOpts.MicrosoftExt && !isFPConstant) {
1074fe6060f1SDimitry Andric // Allow i8, i16, i32, and i64. First, look ahead and check if
1075fe6060f1SDimitry Andric // suffixes are Microsoft integers and not the imaginary unit.
1076fe6060f1SDimitry Andric uint8_t Bits = 0;
1077fe6060f1SDimitry Andric size_t ToSkip = 0;
10780b57cec5SDimitry Andric switch (s[1]) {
1079fe6060f1SDimitry Andric case '8': // i8 suffix
1080fe6060f1SDimitry Andric Bits = 8;
1081fe6060f1SDimitry Andric ToSkip = 2;
10820b57cec5SDimitry Andric break;
10830b57cec5SDimitry Andric case '1':
1084fe6060f1SDimitry Andric if (s[2] == '6') { // i16 suffix
1085fe6060f1SDimitry Andric Bits = 16;
1086fe6060f1SDimitry Andric ToSkip = 3;
10870b57cec5SDimitry Andric }
10880b57cec5SDimitry Andric break;
10890b57cec5SDimitry Andric case '3':
1090fe6060f1SDimitry Andric if (s[2] == '2') { // i32 suffix
1091fe6060f1SDimitry Andric Bits = 32;
1092fe6060f1SDimitry Andric ToSkip = 3;
10930b57cec5SDimitry Andric }
10940b57cec5SDimitry Andric break;
10950b57cec5SDimitry Andric case '6':
1096fe6060f1SDimitry Andric if (s[2] == '4') { // i64 suffix
1097fe6060f1SDimitry Andric Bits = 64;
1098fe6060f1SDimitry Andric ToSkip = 3;
10990b57cec5SDimitry Andric }
11000b57cec5SDimitry Andric break;
11010b57cec5SDimitry Andric default:
11020b57cec5SDimitry Andric break;
11030b57cec5SDimitry Andric }
1104fe6060f1SDimitry Andric if (Bits) {
1105fe6060f1SDimitry Andric if (HasSize)
1106fe6060f1SDimitry Andric break;
1107fe6060f1SDimitry Andric HasSize = true;
1108fe6060f1SDimitry Andric MicrosoftInteger = Bits;
1109fe6060f1SDimitry Andric s += ToSkip;
11100b57cec5SDimitry Andric assert(s <= ThisTokEnd && "didn't maximally munch?");
11110b57cec5SDimitry Andric break;
11120b57cec5SDimitry Andric }
11130b57cec5SDimitry Andric }
1114bdd1243dSDimitry Andric [[fallthrough]];
11150b57cec5SDimitry Andric case 'j':
11160b57cec5SDimitry Andric case 'J':
11170b57cec5SDimitry Andric if (isImaginary) break; // Cannot be repeated.
11180b57cec5SDimitry Andric isImaginary = true;
11190b57cec5SDimitry Andric continue; // Success.
112081ad6265SDimitry Andric case 'w':
112181ad6265SDimitry Andric case 'W':
112281ad6265SDimitry Andric if (isFPConstant)
112381ad6265SDimitry Andric break; // Invalid for floats.
112481ad6265SDimitry Andric if (HasSize)
112581ad6265SDimitry Andric break; // Invalid if we already have a size for the literal.
112681ad6265SDimitry Andric
112781ad6265SDimitry Andric // wb and WB are allowed, but a mixture of cases like Wb or wB is not. We
112881ad6265SDimitry Andric // explicitly do not support the suffix in C++ as an extension because a
112981ad6265SDimitry Andric // library-based UDL that resolves to a library type may be more
113081ad6265SDimitry Andric // appropriate there.
113181ad6265SDimitry Andric if (!LangOpts.CPlusPlus && ((s[0] == 'w' && s[1] == 'b') ||
113281ad6265SDimitry Andric (s[0] == 'W' && s[1] == 'B'))) {
113381ad6265SDimitry Andric isBitInt = true;
113481ad6265SDimitry Andric HasSize = true;
113581ad6265SDimitry Andric ++s; // Skip both characters (2nd char skipped on continue).
113681ad6265SDimitry Andric continue; // Success.
113781ad6265SDimitry Andric }
11380b57cec5SDimitry Andric }
11390b57cec5SDimitry Andric // If we reached here, there was an error or a ud-suffix.
11400b57cec5SDimitry Andric break;
11410b57cec5SDimitry Andric }
11420b57cec5SDimitry Andric
11430b57cec5SDimitry Andric // "i", "if", and "il" are user-defined suffixes in C++1y.
11440b57cec5SDimitry Andric if (s != ThisTokEnd || isImaginary) {
11450b57cec5SDimitry Andric // FIXME: Don't bother expanding UCNs if !tok.hasUCN().
11460b57cec5SDimitry Andric expandUCNs(UDSuffixBuf, StringRef(SuffixBegin, ThisTokEnd - SuffixBegin));
11475ffd83dbSDimitry Andric if (isValidUDSuffix(LangOpts, UDSuffixBuf)) {
11480b57cec5SDimitry Andric if (!isImaginary) {
11490b57cec5SDimitry Andric // Any suffix pieces we might have parsed are actually part of the
11500b57cec5SDimitry Andric // ud-suffix.
11510b57cec5SDimitry Andric isLong = false;
11520b57cec5SDimitry Andric isUnsigned = false;
11530b57cec5SDimitry Andric isLongLong = false;
1154fe6060f1SDimitry Andric isSizeT = false;
11550b57cec5SDimitry Andric isFloat = false;
11560b57cec5SDimitry Andric isFloat16 = false;
11570b57cec5SDimitry Andric isHalf = false;
11580b57cec5SDimitry Andric isImaginary = false;
115981ad6265SDimitry Andric isBitInt = false;
11600b57cec5SDimitry Andric MicrosoftInteger = 0;
11610b57cec5SDimitry Andric saw_fixed_point_suffix = false;
11620b57cec5SDimitry Andric isFract = false;
11630b57cec5SDimitry Andric isAccum = false;
11640b57cec5SDimitry Andric }
11650b57cec5SDimitry Andric
11660b57cec5SDimitry Andric saw_ud_suffix = true;
11670b57cec5SDimitry Andric return;
11680b57cec5SDimitry Andric }
11690b57cec5SDimitry Andric
11700b57cec5SDimitry Andric if (s != ThisTokEnd) {
11710b57cec5SDimitry Andric // Report an error if there are any.
11725ffd83dbSDimitry Andric Diags.Report(Lexer::AdvanceToTokenCharacter(
11735ffd83dbSDimitry Andric TokLoc, SuffixBegin - ThisTokBegin, SM, LangOpts),
11740b57cec5SDimitry Andric diag::err_invalid_suffix_constant)
11755ffd83dbSDimitry Andric << StringRef(SuffixBegin, ThisTokEnd - SuffixBegin)
11765ffd83dbSDimitry Andric << (isFixedPointConstant ? 2 : isFPConstant);
11770b57cec5SDimitry Andric hadError = true;
11780b57cec5SDimitry Andric }
11790b57cec5SDimitry Andric }
11800b57cec5SDimitry Andric
11810b57cec5SDimitry Andric if (!hadError && saw_fixed_point_suffix) {
11820b57cec5SDimitry Andric assert(isFract || isAccum);
11830b57cec5SDimitry Andric }
11840b57cec5SDimitry Andric }
11850b57cec5SDimitry Andric
11860b57cec5SDimitry Andric /// ParseDecimalOrOctalCommon - This method is called for decimal or octal
11870b57cec5SDimitry Andric /// numbers. It issues an error for illegal digits, and handles floating point
11880b57cec5SDimitry Andric /// parsing. If it detects a floating point number, the radix is set to 10.
ParseDecimalOrOctalCommon(SourceLocation TokLoc)11890b57cec5SDimitry Andric void NumericLiteralParser::ParseDecimalOrOctalCommon(SourceLocation TokLoc){
11900b57cec5SDimitry Andric assert((radix == 8 || radix == 10) && "Unexpected radix");
11910b57cec5SDimitry Andric
11920b57cec5SDimitry Andric // If we have a hex digit other than 'e' (which denotes a FP exponent) then
11930b57cec5SDimitry Andric // the code is using an incorrect base.
11940b57cec5SDimitry Andric if (isHexDigit(*s) && *s != 'e' && *s != 'E' &&
11955ffd83dbSDimitry Andric !isValidUDSuffix(LangOpts, StringRef(s, ThisTokEnd - s))) {
11965ffd83dbSDimitry Andric Diags.Report(
11975ffd83dbSDimitry Andric Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM, LangOpts),
11985ffd83dbSDimitry Andric diag::err_invalid_digit)
11995ffd83dbSDimitry Andric << StringRef(s, 1) << (radix == 8 ? 1 : 0);
12000b57cec5SDimitry Andric hadError = true;
12010b57cec5SDimitry Andric return;
12020b57cec5SDimitry Andric }
12030b57cec5SDimitry Andric
12040b57cec5SDimitry Andric if (*s == '.') {
12050b57cec5SDimitry Andric checkSeparator(TokLoc, s, CSK_AfterDigits);
12060b57cec5SDimitry Andric s++;
12070b57cec5SDimitry Andric radix = 10;
12080b57cec5SDimitry Andric saw_period = true;
12090b57cec5SDimitry Andric checkSeparator(TokLoc, s, CSK_BeforeDigits);
12100b57cec5SDimitry Andric s = SkipDigits(s); // Skip suffix.
12110b57cec5SDimitry Andric }
12120b57cec5SDimitry Andric if (*s == 'e' || *s == 'E') { // exponent
12130b57cec5SDimitry Andric checkSeparator(TokLoc, s, CSK_AfterDigits);
12140b57cec5SDimitry Andric const char *Exponent = s;
12150b57cec5SDimitry Andric s++;
12160b57cec5SDimitry Andric radix = 10;
12170b57cec5SDimitry Andric saw_exponent = true;
12180b57cec5SDimitry Andric if (s != ThisTokEnd && (*s == '+' || *s == '-')) s++; // sign
12190b57cec5SDimitry Andric const char *first_non_digit = SkipDigits(s);
12200b57cec5SDimitry Andric if (containsDigits(s, first_non_digit)) {
12210b57cec5SDimitry Andric checkSeparator(TokLoc, s, CSK_BeforeDigits);
12220b57cec5SDimitry Andric s = first_non_digit;
12230b57cec5SDimitry Andric } else {
12240b57cec5SDimitry Andric if (!hadError) {
12255ffd83dbSDimitry Andric Diags.Report(Lexer::AdvanceToTokenCharacter(
12265ffd83dbSDimitry Andric TokLoc, Exponent - ThisTokBegin, SM, LangOpts),
12270b57cec5SDimitry Andric diag::err_exponent_has_no_digits);
12280b57cec5SDimitry Andric hadError = true;
12290b57cec5SDimitry Andric }
12300b57cec5SDimitry Andric return;
12310b57cec5SDimitry Andric }
12320b57cec5SDimitry Andric }
12330b57cec5SDimitry Andric }
12340b57cec5SDimitry Andric
12350b57cec5SDimitry Andric /// Determine whether a suffix is a valid ud-suffix. We avoid treating reserved
12360b57cec5SDimitry Andric /// suffixes as ud-suffixes, because the diagnostic experience is better if we
12370b57cec5SDimitry Andric /// treat it as an invalid suffix.
isValidUDSuffix(const LangOptions & LangOpts,StringRef Suffix)12380b57cec5SDimitry Andric bool NumericLiteralParser::isValidUDSuffix(const LangOptions &LangOpts,
12390b57cec5SDimitry Andric StringRef Suffix) {
12400b57cec5SDimitry Andric if (!LangOpts.CPlusPlus11 || Suffix.empty())
12410b57cec5SDimitry Andric return false;
12420b57cec5SDimitry Andric
12430b57cec5SDimitry Andric // By C++11 [lex.ext]p10, ud-suffixes starting with an '_' are always valid.
12440b57cec5SDimitry Andric if (Suffix[0] == '_')
12450b57cec5SDimitry Andric return true;
12460b57cec5SDimitry Andric
12470b57cec5SDimitry Andric // In C++11, there are no library suffixes.
12480b57cec5SDimitry Andric if (!LangOpts.CPlusPlus14)
12490b57cec5SDimitry Andric return false;
12500b57cec5SDimitry Andric
12510b57cec5SDimitry Andric // In C++14, "s", "h", "min", "ms", "us", and "ns" are used in the library.
12520b57cec5SDimitry Andric // Per tweaked N3660, "il", "i", and "if" are also used in the library.
12530b57cec5SDimitry Andric // In C++2a "d" and "y" are used in the library.
12540b57cec5SDimitry Andric return llvm::StringSwitch<bool>(Suffix)
12550b57cec5SDimitry Andric .Cases("h", "min", "s", true)
12560b57cec5SDimitry Andric .Cases("ms", "us", "ns", true)
12570b57cec5SDimitry Andric .Cases("il", "i", "if", true)
12585ffd83dbSDimitry Andric .Cases("d", "y", LangOpts.CPlusPlus20)
12590b57cec5SDimitry Andric .Default(false);
12600b57cec5SDimitry Andric }
12610b57cec5SDimitry Andric
checkSeparator(SourceLocation TokLoc,const char * Pos,CheckSeparatorKind IsAfterDigits)12620b57cec5SDimitry Andric void NumericLiteralParser::checkSeparator(SourceLocation TokLoc,
12630b57cec5SDimitry Andric const char *Pos,
12640b57cec5SDimitry Andric CheckSeparatorKind IsAfterDigits) {
12650b57cec5SDimitry Andric if (IsAfterDigits == CSK_AfterDigits) {
12660b57cec5SDimitry Andric if (Pos == ThisTokBegin)
12670b57cec5SDimitry Andric return;
12680b57cec5SDimitry Andric --Pos;
12690b57cec5SDimitry Andric } else if (Pos == ThisTokEnd)
12700b57cec5SDimitry Andric return;
12710b57cec5SDimitry Andric
12720b57cec5SDimitry Andric if (isDigitSeparator(*Pos)) {
12735ffd83dbSDimitry Andric Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, Pos - ThisTokBegin, SM,
12745ffd83dbSDimitry Andric LangOpts),
12750b57cec5SDimitry Andric diag::err_digit_separator_not_between_digits)
12760b57cec5SDimitry Andric << IsAfterDigits;
12770b57cec5SDimitry Andric hadError = true;
12780b57cec5SDimitry Andric }
12790b57cec5SDimitry Andric }
12800b57cec5SDimitry Andric
12810b57cec5SDimitry Andric /// ParseNumberStartingWithZero - This method is called when the first character
12820b57cec5SDimitry Andric /// of the number is found to be a zero. This means it is either an octal
12830b57cec5SDimitry Andric /// number (like '04') or a hex number ('0x123a') a binary number ('0b1010') or
12840b57cec5SDimitry Andric /// a floating point number (01239.123e4). Eat the prefix, determining the
12850b57cec5SDimitry Andric /// radix etc.
ParseNumberStartingWithZero(SourceLocation TokLoc)12860b57cec5SDimitry Andric void NumericLiteralParser::ParseNumberStartingWithZero(SourceLocation TokLoc) {
12870b57cec5SDimitry Andric assert(s[0] == '0' && "Invalid method call");
12880b57cec5SDimitry Andric s++;
12890b57cec5SDimitry Andric
12900b57cec5SDimitry Andric int c1 = s[0];
12910b57cec5SDimitry Andric
12920b57cec5SDimitry Andric // Handle a hex number like 0x1234.
12930b57cec5SDimitry Andric if ((c1 == 'x' || c1 == 'X') && (isHexDigit(s[1]) || s[1] == '.')) {
12940b57cec5SDimitry Andric s++;
12950b57cec5SDimitry Andric assert(s < ThisTokEnd && "didn't maximally munch?");
12960b57cec5SDimitry Andric radix = 16;
12970b57cec5SDimitry Andric DigitsBegin = s;
12980b57cec5SDimitry Andric s = SkipHexDigits(s);
12990b57cec5SDimitry Andric bool HasSignificandDigits = containsDigits(DigitsBegin, s);
13000b57cec5SDimitry Andric if (s == ThisTokEnd) {
13010b57cec5SDimitry Andric // Done.
13020b57cec5SDimitry Andric } else if (*s == '.') {
13030b57cec5SDimitry Andric s++;
13040b57cec5SDimitry Andric saw_period = true;
13050b57cec5SDimitry Andric const char *floatDigitsBegin = s;
13060b57cec5SDimitry Andric s = SkipHexDigits(s);
13070b57cec5SDimitry Andric if (containsDigits(floatDigitsBegin, s))
13080b57cec5SDimitry Andric HasSignificandDigits = true;
13090b57cec5SDimitry Andric if (HasSignificandDigits)
13100b57cec5SDimitry Andric checkSeparator(TokLoc, floatDigitsBegin, CSK_BeforeDigits);
13110b57cec5SDimitry Andric }
13120b57cec5SDimitry Andric
13130b57cec5SDimitry Andric if (!HasSignificandDigits) {
13145ffd83dbSDimitry Andric Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM,
13155ffd83dbSDimitry Andric LangOpts),
13160b57cec5SDimitry Andric diag::err_hex_constant_requires)
13175ffd83dbSDimitry Andric << LangOpts.CPlusPlus << 1;
13180b57cec5SDimitry Andric hadError = true;
13190b57cec5SDimitry Andric return;
13200b57cec5SDimitry Andric }
13210b57cec5SDimitry Andric
13220b57cec5SDimitry Andric // A binary exponent can appear with or with a '.'. If dotted, the
13230b57cec5SDimitry Andric // binary exponent is required.
13240b57cec5SDimitry Andric if (*s == 'p' || *s == 'P') {
13250b57cec5SDimitry Andric checkSeparator(TokLoc, s, CSK_AfterDigits);
13260b57cec5SDimitry Andric const char *Exponent = s;
13270b57cec5SDimitry Andric s++;
13280b57cec5SDimitry Andric saw_exponent = true;
13290b57cec5SDimitry Andric if (s != ThisTokEnd && (*s == '+' || *s == '-')) s++; // sign
13300b57cec5SDimitry Andric const char *first_non_digit = SkipDigits(s);
13310b57cec5SDimitry Andric if (!containsDigits(s, first_non_digit)) {
13320b57cec5SDimitry Andric if (!hadError) {
13335ffd83dbSDimitry Andric Diags.Report(Lexer::AdvanceToTokenCharacter(
13345ffd83dbSDimitry Andric TokLoc, Exponent - ThisTokBegin, SM, LangOpts),
13350b57cec5SDimitry Andric diag::err_exponent_has_no_digits);
13360b57cec5SDimitry Andric hadError = true;
13370b57cec5SDimitry Andric }
13380b57cec5SDimitry Andric return;
13390b57cec5SDimitry Andric }
13400b57cec5SDimitry Andric checkSeparator(TokLoc, s, CSK_BeforeDigits);
13410b57cec5SDimitry Andric s = first_non_digit;
13420b57cec5SDimitry Andric
13435ffd83dbSDimitry Andric if (!LangOpts.HexFloats)
13445ffd83dbSDimitry Andric Diags.Report(TokLoc, LangOpts.CPlusPlus
13450b57cec5SDimitry Andric ? diag::ext_hex_literal_invalid
13460b57cec5SDimitry Andric : diag::ext_hex_constant_invalid);
13475ffd83dbSDimitry Andric else if (LangOpts.CPlusPlus17)
13485ffd83dbSDimitry Andric Diags.Report(TokLoc, diag::warn_cxx17_hex_literal);
13490b57cec5SDimitry Andric } else if (saw_period) {
13505ffd83dbSDimitry Andric Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM,
13515ffd83dbSDimitry Andric LangOpts),
13520b57cec5SDimitry Andric diag::err_hex_constant_requires)
13535ffd83dbSDimitry Andric << LangOpts.CPlusPlus << 0;
13540b57cec5SDimitry Andric hadError = true;
13550b57cec5SDimitry Andric }
13560b57cec5SDimitry Andric return;
13570b57cec5SDimitry Andric }
13580b57cec5SDimitry Andric
13590b57cec5SDimitry Andric // Handle simple binary numbers 0b01010
13600b57cec5SDimitry Andric if ((c1 == 'b' || c1 == 'B') && (s[1] == '0' || s[1] == '1')) {
13610b57cec5SDimitry Andric // 0b101010 is a C++1y / GCC extension.
13625ffd83dbSDimitry Andric Diags.Report(TokLoc, LangOpts.CPlusPlus14
13630b57cec5SDimitry Andric ? diag::warn_cxx11_compat_binary_literal
13645ffd83dbSDimitry Andric : LangOpts.CPlusPlus ? diag::ext_binary_literal_cxx14
13650b57cec5SDimitry Andric : diag::ext_binary_literal);
13660b57cec5SDimitry Andric ++s;
13670b57cec5SDimitry Andric assert(s < ThisTokEnd && "didn't maximally munch?");
13680b57cec5SDimitry Andric radix = 2;
13690b57cec5SDimitry Andric DigitsBegin = s;
13700b57cec5SDimitry Andric s = SkipBinaryDigits(s);
13710b57cec5SDimitry Andric if (s == ThisTokEnd) {
13720b57cec5SDimitry Andric // Done.
13730b57cec5SDimitry Andric } else if (isHexDigit(*s) &&
13745ffd83dbSDimitry Andric !isValidUDSuffix(LangOpts, StringRef(s, ThisTokEnd - s))) {
13755ffd83dbSDimitry Andric Diags.Report(Lexer::AdvanceToTokenCharacter(TokLoc, s - ThisTokBegin, SM,
13765ffd83dbSDimitry Andric LangOpts),
13775ffd83dbSDimitry Andric diag::err_invalid_digit)
13785ffd83dbSDimitry Andric << StringRef(s, 1) << 2;
13790b57cec5SDimitry Andric hadError = true;
13800b57cec5SDimitry Andric }
13810b57cec5SDimitry Andric // Other suffixes will be diagnosed by the caller.
13820b57cec5SDimitry Andric return;
13830b57cec5SDimitry Andric }
13840b57cec5SDimitry Andric
13850b57cec5SDimitry Andric // For now, the radix is set to 8. If we discover that we have a
13860b57cec5SDimitry Andric // floating point constant, the radix will change to 10. Octal floating
13870b57cec5SDimitry Andric // point constants are not permitted (only decimal and hexadecimal).
13880b57cec5SDimitry Andric radix = 8;
138981ad6265SDimitry Andric const char *PossibleNewDigitStart = s;
13900b57cec5SDimitry Andric s = SkipOctalDigits(s);
139181ad6265SDimitry Andric // When the value is 0 followed by a suffix (like 0wb), we want to leave 0
139281ad6265SDimitry Andric // as the start of the digits. So if skipping octal digits does not skip
139381ad6265SDimitry Andric // anything, we leave the digit start where it was.
139481ad6265SDimitry Andric if (s != PossibleNewDigitStart)
139581ad6265SDimitry Andric DigitsBegin = PossibleNewDigitStart;
139681ad6265SDimitry Andric
13970b57cec5SDimitry Andric if (s == ThisTokEnd)
13980b57cec5SDimitry Andric return; // Done, simple octal number like 01234
13990b57cec5SDimitry Andric
14000b57cec5SDimitry Andric // If we have some other non-octal digit that *is* a decimal digit, see if
14010b57cec5SDimitry Andric // this is part of a floating point number like 094.123 or 09e1.
14020b57cec5SDimitry Andric if (isDigit(*s)) {
14030b57cec5SDimitry Andric const char *EndDecimal = SkipDigits(s);
14040b57cec5SDimitry Andric if (EndDecimal[0] == '.' || EndDecimal[0] == 'e' || EndDecimal[0] == 'E') {
14050b57cec5SDimitry Andric s = EndDecimal;
14060b57cec5SDimitry Andric radix = 10;
14070b57cec5SDimitry Andric }
14080b57cec5SDimitry Andric }
14090b57cec5SDimitry Andric
14100b57cec5SDimitry Andric ParseDecimalOrOctalCommon(TokLoc);
14110b57cec5SDimitry Andric }
14120b57cec5SDimitry Andric
alwaysFitsInto64Bits(unsigned Radix,unsigned NumDigits)14130b57cec5SDimitry Andric static bool alwaysFitsInto64Bits(unsigned Radix, unsigned NumDigits) {
14140b57cec5SDimitry Andric switch (Radix) {
14150b57cec5SDimitry Andric case 2:
14160b57cec5SDimitry Andric return NumDigits <= 64;
14170b57cec5SDimitry Andric case 8:
14180b57cec5SDimitry Andric return NumDigits <= 64 / 3; // Digits are groups of 3 bits.
14190b57cec5SDimitry Andric case 10:
14200b57cec5SDimitry Andric return NumDigits <= 19; // floor(log10(2^64))
14210b57cec5SDimitry Andric case 16:
14220b57cec5SDimitry Andric return NumDigits <= 64 / 4; // Digits are groups of 4 bits.
14230b57cec5SDimitry Andric default:
14240b57cec5SDimitry Andric llvm_unreachable("impossible Radix");
14250b57cec5SDimitry Andric }
14260b57cec5SDimitry Andric }
14270b57cec5SDimitry Andric
14280b57cec5SDimitry Andric /// GetIntegerValue - Convert this numeric literal value to an APInt that
14290b57cec5SDimitry Andric /// matches Val's input width. If there is an overflow, set Val to the low bits
14300b57cec5SDimitry Andric /// of the result and return true. Otherwise, return false.
GetIntegerValue(llvm::APInt & Val)14310b57cec5SDimitry Andric bool NumericLiteralParser::GetIntegerValue(llvm::APInt &Val) {
14320b57cec5SDimitry Andric // Fast path: Compute a conservative bound on the maximum number of
14330b57cec5SDimitry Andric // bits per digit in this radix. If we can't possibly overflow a
14340b57cec5SDimitry Andric // uint64 based on that bound then do the simple conversion to
14350b57cec5SDimitry Andric // integer. This avoids the expensive overflow checking below, and
14360b57cec5SDimitry Andric // handles the common cases that matter (small decimal integers and
14370b57cec5SDimitry Andric // hex/octal values which don't overflow).
14380b57cec5SDimitry Andric const unsigned NumDigits = SuffixBegin - DigitsBegin;
14390b57cec5SDimitry Andric if (alwaysFitsInto64Bits(radix, NumDigits)) {
14400b57cec5SDimitry Andric uint64_t N = 0;
14410b57cec5SDimitry Andric for (const char *Ptr = DigitsBegin; Ptr != SuffixBegin; ++Ptr)
14420b57cec5SDimitry Andric if (!isDigitSeparator(*Ptr))
14430b57cec5SDimitry Andric N = N * radix + llvm::hexDigitValue(*Ptr);
14440b57cec5SDimitry Andric
14450b57cec5SDimitry Andric // This will truncate the value to Val's input width. Simply check
14460b57cec5SDimitry Andric // for overflow by comparing.
14470b57cec5SDimitry Andric Val = N;
14480b57cec5SDimitry Andric return Val.getZExtValue() != N;
14490b57cec5SDimitry Andric }
14500b57cec5SDimitry Andric
14510b57cec5SDimitry Andric Val = 0;
14520b57cec5SDimitry Andric const char *Ptr = DigitsBegin;
14530b57cec5SDimitry Andric
14540b57cec5SDimitry Andric llvm::APInt RadixVal(Val.getBitWidth(), radix);
14550b57cec5SDimitry Andric llvm::APInt CharVal(Val.getBitWidth(), 0);
14560b57cec5SDimitry Andric llvm::APInt OldVal = Val;
14570b57cec5SDimitry Andric
14580b57cec5SDimitry Andric bool OverflowOccurred = false;
14590b57cec5SDimitry Andric while (Ptr < SuffixBegin) {
14600b57cec5SDimitry Andric if (isDigitSeparator(*Ptr)) {
14610b57cec5SDimitry Andric ++Ptr;
14620b57cec5SDimitry Andric continue;
14630b57cec5SDimitry Andric }
14640b57cec5SDimitry Andric
14650b57cec5SDimitry Andric unsigned C = llvm::hexDigitValue(*Ptr++);
14660b57cec5SDimitry Andric
14670b57cec5SDimitry Andric // If this letter is out of bound for this radix, reject it.
14680b57cec5SDimitry Andric assert(C < radix && "NumericLiteralParser ctor should have rejected this");
14690b57cec5SDimitry Andric
14700b57cec5SDimitry Andric CharVal = C;
14710b57cec5SDimitry Andric
14720b57cec5SDimitry Andric // Add the digit to the value in the appropriate radix. If adding in digits
14730b57cec5SDimitry Andric // made the value smaller, then this overflowed.
14740b57cec5SDimitry Andric OldVal = Val;
14750b57cec5SDimitry Andric
14760b57cec5SDimitry Andric // Multiply by radix, did overflow occur on the multiply?
14770b57cec5SDimitry Andric Val *= RadixVal;
14780b57cec5SDimitry Andric OverflowOccurred |= Val.udiv(RadixVal) != OldVal;
14790b57cec5SDimitry Andric
14800b57cec5SDimitry Andric // Add value, did overflow occur on the value?
14810b57cec5SDimitry Andric // (a + b) ult b <=> overflow
14820b57cec5SDimitry Andric Val += CharVal;
14830b57cec5SDimitry Andric OverflowOccurred |= Val.ult(CharVal);
14840b57cec5SDimitry Andric }
14850b57cec5SDimitry Andric return OverflowOccurred;
14860b57cec5SDimitry Andric }
14870b57cec5SDimitry Andric
14880b57cec5SDimitry Andric llvm::APFloat::opStatus
GetFloatValue(llvm::APFloat & Result)14890b57cec5SDimitry Andric NumericLiteralParser::GetFloatValue(llvm::APFloat &Result) {
14900b57cec5SDimitry Andric using llvm::APFloat;
14910b57cec5SDimitry Andric
14920b57cec5SDimitry Andric unsigned n = std::min(SuffixBegin - ThisTokBegin, ThisTokEnd - ThisTokBegin);
14930b57cec5SDimitry Andric
14940b57cec5SDimitry Andric llvm::SmallString<16> Buffer;
14950b57cec5SDimitry Andric StringRef Str(ThisTokBegin, n);
1496349cc55cSDimitry Andric if (Str.contains('\'')) {
14970b57cec5SDimitry Andric Buffer.reserve(n);
14980b57cec5SDimitry Andric std::remove_copy_if(Str.begin(), Str.end(), std::back_inserter(Buffer),
14990b57cec5SDimitry Andric &isDigitSeparator);
15000b57cec5SDimitry Andric Str = Buffer;
15010b57cec5SDimitry Andric }
15020b57cec5SDimitry Andric
1503480093f4SDimitry Andric auto StatusOrErr =
1504480093f4SDimitry Andric Result.convertFromString(Str, APFloat::rmNearestTiesToEven);
1505480093f4SDimitry Andric assert(StatusOrErr && "Invalid floating point representation");
1506480093f4SDimitry Andric return !errorToBool(StatusOrErr.takeError()) ? *StatusOrErr
1507480093f4SDimitry Andric : APFloat::opInvalidOp;
15080b57cec5SDimitry Andric }
15090b57cec5SDimitry Andric
/// Return true if \p c is one of the exponent markers 'p', 'P', 'e' or 'E'.
static inline bool IsExponentPart(char c) {
  switch (c) {
  case 'p':
  case 'P':
  case 'e':
  case 'E':
    return true;
  default:
    return false;
  }
}
15130b57cec5SDimitry Andric
GetFixedPointValue(llvm::APInt & StoreVal,unsigned Scale)15140b57cec5SDimitry Andric bool NumericLiteralParser::GetFixedPointValue(llvm::APInt &StoreVal, unsigned Scale) {
15150b57cec5SDimitry Andric assert(radix == 16 || radix == 10);
15160b57cec5SDimitry Andric
15170b57cec5SDimitry Andric // Find how many digits are needed to store the whole literal.
15180b57cec5SDimitry Andric unsigned NumDigits = SuffixBegin - DigitsBegin;
15190b57cec5SDimitry Andric if (saw_period) --NumDigits;
15200b57cec5SDimitry Andric
15210b57cec5SDimitry Andric // Initial scan of the exponent if it exists
15220b57cec5SDimitry Andric bool ExpOverflowOccurred = false;
15230b57cec5SDimitry Andric bool NegativeExponent = false;
15240b57cec5SDimitry Andric const char *ExponentBegin;
15250b57cec5SDimitry Andric uint64_t Exponent = 0;
15260b57cec5SDimitry Andric int64_t BaseShift = 0;
15270b57cec5SDimitry Andric if (saw_exponent) {
15280b57cec5SDimitry Andric const char *Ptr = DigitsBegin;
15290b57cec5SDimitry Andric
15300b57cec5SDimitry Andric while (!IsExponentPart(*Ptr)) ++Ptr;
15310b57cec5SDimitry Andric ExponentBegin = Ptr;
15320b57cec5SDimitry Andric ++Ptr;
15330b57cec5SDimitry Andric NegativeExponent = *Ptr == '-';
15340b57cec5SDimitry Andric if (NegativeExponent) ++Ptr;
15350b57cec5SDimitry Andric
15360b57cec5SDimitry Andric unsigned NumExpDigits = SuffixBegin - Ptr;
15370b57cec5SDimitry Andric if (alwaysFitsInto64Bits(radix, NumExpDigits)) {
15380b57cec5SDimitry Andric llvm::StringRef ExpStr(Ptr, NumExpDigits);
15390b57cec5SDimitry Andric llvm::APInt ExpInt(/*numBits=*/64, ExpStr, /*radix=*/10);
15400b57cec5SDimitry Andric Exponent = ExpInt.getZExtValue();
15410b57cec5SDimitry Andric } else {
15420b57cec5SDimitry Andric ExpOverflowOccurred = true;
15430b57cec5SDimitry Andric }
15440b57cec5SDimitry Andric
15450b57cec5SDimitry Andric if (NegativeExponent) BaseShift -= Exponent;
15460b57cec5SDimitry Andric else BaseShift += Exponent;
15470b57cec5SDimitry Andric }
15480b57cec5SDimitry Andric
15490b57cec5SDimitry Andric // Number of bits needed for decimal literal is
15500b57cec5SDimitry Andric // ceil(NumDigits * log2(10)) Integral part
15510b57cec5SDimitry Andric // + Scale Fractional part
15520b57cec5SDimitry Andric // + ceil(Exponent * log2(10)) Exponent
15530b57cec5SDimitry Andric // --------------------------------------------------
15540b57cec5SDimitry Andric // ceil((NumDigits + Exponent) * log2(10)) + Scale
15550b57cec5SDimitry Andric //
15560b57cec5SDimitry Andric // But for simplicity in handling integers, we can round up log2(10) to 4,
15570b57cec5SDimitry Andric // making:
15580b57cec5SDimitry Andric // 4 * (NumDigits + Exponent) + Scale
15590b57cec5SDimitry Andric //
15600b57cec5SDimitry Andric // Number of digits needed for hexadecimal literal is
15610b57cec5SDimitry Andric // 4 * NumDigits Integral part
15620b57cec5SDimitry Andric // + Scale Fractional part
15630b57cec5SDimitry Andric // + Exponent Exponent
15640b57cec5SDimitry Andric // --------------------------------------------------
15650b57cec5SDimitry Andric // (4 * NumDigits) + Scale + Exponent
15660b57cec5SDimitry Andric uint64_t NumBitsNeeded;
15670b57cec5SDimitry Andric if (radix == 10)
15680b57cec5SDimitry Andric NumBitsNeeded = 4 * (NumDigits + Exponent) + Scale;
15690b57cec5SDimitry Andric else
15700b57cec5SDimitry Andric NumBitsNeeded = 4 * NumDigits + Exponent + Scale;
15710b57cec5SDimitry Andric
15720b57cec5SDimitry Andric if (NumBitsNeeded > std::numeric_limits<unsigned>::max())
15730b57cec5SDimitry Andric ExpOverflowOccurred = true;
15740b57cec5SDimitry Andric llvm::APInt Val(static_cast<unsigned>(NumBitsNeeded), 0, /*isSigned=*/false);
15750b57cec5SDimitry Andric
15760b57cec5SDimitry Andric bool FoundDecimal = false;
15770b57cec5SDimitry Andric
15780b57cec5SDimitry Andric int64_t FractBaseShift = 0;
15790b57cec5SDimitry Andric const char *End = saw_exponent ? ExponentBegin : SuffixBegin;
15800b57cec5SDimitry Andric for (const char *Ptr = DigitsBegin; Ptr < End; ++Ptr) {
15810b57cec5SDimitry Andric if (*Ptr == '.') {
15820b57cec5SDimitry Andric FoundDecimal = true;
15830b57cec5SDimitry Andric continue;
15840b57cec5SDimitry Andric }
15850b57cec5SDimitry Andric
15860b57cec5SDimitry Andric // Normal reading of an integer
15870b57cec5SDimitry Andric unsigned C = llvm::hexDigitValue(*Ptr);
15880b57cec5SDimitry Andric assert(C < radix && "NumericLiteralParser ctor should have rejected this");
15890b57cec5SDimitry Andric
15900b57cec5SDimitry Andric Val *= radix;
15910b57cec5SDimitry Andric Val += C;
15920b57cec5SDimitry Andric
15930b57cec5SDimitry Andric if (FoundDecimal)
15940b57cec5SDimitry Andric // Keep track of how much we will need to adjust this value by from the
15950b57cec5SDimitry Andric // number of digits past the radix point.
15960b57cec5SDimitry Andric --FractBaseShift;
15970b57cec5SDimitry Andric }
15980b57cec5SDimitry Andric
15990b57cec5SDimitry Andric // For a radix of 16, we will be multiplying by 2 instead of 16.
16000b57cec5SDimitry Andric if (radix == 16) FractBaseShift *= 4;
16010b57cec5SDimitry Andric BaseShift += FractBaseShift;
16020b57cec5SDimitry Andric
16030b57cec5SDimitry Andric Val <<= Scale;
16040b57cec5SDimitry Andric
16050b57cec5SDimitry Andric uint64_t Base = (radix == 16) ? 2 : 10;
16060b57cec5SDimitry Andric if (BaseShift > 0) {
16070b57cec5SDimitry Andric for (int64_t i = 0; i < BaseShift; ++i) {
16080b57cec5SDimitry Andric Val *= Base;
16090b57cec5SDimitry Andric }
16100b57cec5SDimitry Andric } else if (BaseShift < 0) {
1611349cc55cSDimitry Andric for (int64_t i = BaseShift; i < 0 && !Val.isZero(); ++i)
16120b57cec5SDimitry Andric Val = Val.udiv(Base);
16130b57cec5SDimitry Andric }
16140b57cec5SDimitry Andric
16150b57cec5SDimitry Andric bool IntOverflowOccurred = false;
16160b57cec5SDimitry Andric auto MaxVal = llvm::APInt::getMaxValue(StoreVal.getBitWidth());
16170b57cec5SDimitry Andric if (Val.getBitWidth() > StoreVal.getBitWidth()) {
16180b57cec5SDimitry Andric IntOverflowOccurred |= Val.ugt(MaxVal.zext(Val.getBitWidth()));
16190b57cec5SDimitry Andric StoreVal = Val.trunc(StoreVal.getBitWidth());
16200b57cec5SDimitry Andric } else if (Val.getBitWidth() < StoreVal.getBitWidth()) {
16210b57cec5SDimitry Andric IntOverflowOccurred |= Val.zext(MaxVal.getBitWidth()).ugt(MaxVal);
16220b57cec5SDimitry Andric StoreVal = Val.zext(StoreVal.getBitWidth());
16230b57cec5SDimitry Andric } else {
16240b57cec5SDimitry Andric StoreVal = Val;
16250b57cec5SDimitry Andric }
16260b57cec5SDimitry Andric
16270b57cec5SDimitry Andric return IntOverflowOccurred || ExpOverflowOccurred;
16280b57cec5SDimitry Andric }
16290b57cec5SDimitry Andric
16300b57cec5SDimitry Andric /// \verbatim
16310b57cec5SDimitry Andric /// user-defined-character-literal: [C++11 lex.ext]
16320b57cec5SDimitry Andric /// character-literal ud-suffix
16330b57cec5SDimitry Andric /// ud-suffix:
16340b57cec5SDimitry Andric /// identifier
16350b57cec5SDimitry Andric /// character-literal: [C++11 lex.ccon]
16360b57cec5SDimitry Andric /// ' c-char-sequence '
16370b57cec5SDimitry Andric /// u' c-char-sequence '
16380b57cec5SDimitry Andric /// U' c-char-sequence '
16390b57cec5SDimitry Andric /// L' c-char-sequence '
16400b57cec5SDimitry Andric /// u8' c-char-sequence ' [C++1z lex.ccon]
16410b57cec5SDimitry Andric /// c-char-sequence:
16420b57cec5SDimitry Andric /// c-char
16430b57cec5SDimitry Andric /// c-char-sequence c-char
16440b57cec5SDimitry Andric /// c-char:
16450b57cec5SDimitry Andric /// any member of the source character set except the single-quote ',
16460b57cec5SDimitry Andric /// backslash \, or new-line character
16470b57cec5SDimitry Andric /// escape-sequence
16480b57cec5SDimitry Andric /// universal-character-name
16490b57cec5SDimitry Andric /// escape-sequence:
16500b57cec5SDimitry Andric /// simple-escape-sequence
16510b57cec5SDimitry Andric /// octal-escape-sequence
16520b57cec5SDimitry Andric /// hexadecimal-escape-sequence
16530b57cec5SDimitry Andric /// simple-escape-sequence:
16540b57cec5SDimitry Andric /// one of \' \" \? \\ \a \b \f \n \r \t \v
16550b57cec5SDimitry Andric /// octal-escape-sequence:
16560b57cec5SDimitry Andric /// \ octal-digit
16570b57cec5SDimitry Andric /// \ octal-digit octal-digit
16580b57cec5SDimitry Andric /// \ octal-digit octal-digit octal-digit
16590b57cec5SDimitry Andric /// hexadecimal-escape-sequence:
16600b57cec5SDimitry Andric /// \x hexadecimal-digit
16610b57cec5SDimitry Andric /// hexadecimal-escape-sequence hexadecimal-digit
16620b57cec5SDimitry Andric /// universal-character-name: [C++11 lex.charset]
16630b57cec5SDimitry Andric /// \u hex-quad
16640b57cec5SDimitry Andric /// \U hex-quad hex-quad
16650b57cec5SDimitry Andric /// hex-quad:
16660b57cec5SDimitry Andric /// hex-digit hex-digit hex-digit hex-digit
16670b57cec5SDimitry Andric /// \endverbatim
16680b57cec5SDimitry Andric ///
CharLiteralParser(const char * begin,const char * end,SourceLocation Loc,Preprocessor & PP,tok::TokenKind kind)16690b57cec5SDimitry Andric CharLiteralParser::CharLiteralParser(const char *begin, const char *end,
16700b57cec5SDimitry Andric SourceLocation Loc, Preprocessor &PP,
16710b57cec5SDimitry Andric tok::TokenKind kind) {
16720b57cec5SDimitry Andric // At this point we know that the character matches the regex "(L|u|U)?'.*'".
16730b57cec5SDimitry Andric HadError = false;
16740b57cec5SDimitry Andric
16750b57cec5SDimitry Andric Kind = kind;
16760b57cec5SDimitry Andric
16770b57cec5SDimitry Andric const char *TokBegin = begin;
16780b57cec5SDimitry Andric
16790b57cec5SDimitry Andric // Skip over wide character determinant.
16800b57cec5SDimitry Andric if (Kind != tok::char_constant)
16810b57cec5SDimitry Andric ++begin;
16820b57cec5SDimitry Andric if (Kind == tok::utf8_char_constant)
16830b57cec5SDimitry Andric ++begin;
16840b57cec5SDimitry Andric
16850b57cec5SDimitry Andric // Skip over the entry quote.
1686349cc55cSDimitry Andric if (begin[0] != '\'') {
1687349cc55cSDimitry Andric PP.Diag(Loc, diag::err_lexing_char);
1688349cc55cSDimitry Andric HadError = true;
1689349cc55cSDimitry Andric return;
1690349cc55cSDimitry Andric }
1691349cc55cSDimitry Andric
16920b57cec5SDimitry Andric ++begin;
16930b57cec5SDimitry Andric
16940b57cec5SDimitry Andric // Remove an optional ud-suffix.
16950b57cec5SDimitry Andric if (end[-1] != '\'') {
16960b57cec5SDimitry Andric const char *UDSuffixEnd = end;
16970b57cec5SDimitry Andric do {
16980b57cec5SDimitry Andric --end;
16990b57cec5SDimitry Andric } while (end[-1] != '\'');
17000b57cec5SDimitry Andric // FIXME: Don't bother with this if !tok.hasUCN().
17010b57cec5SDimitry Andric expandUCNs(UDSuffixBuf, StringRef(end, UDSuffixEnd - end));
17020b57cec5SDimitry Andric UDSuffixOffset = end - TokBegin;
17030b57cec5SDimitry Andric }
17040b57cec5SDimitry Andric
17050b57cec5SDimitry Andric // Trim the ending quote.
17060b57cec5SDimitry Andric assert(end != begin && "Invalid token lexed");
17070b57cec5SDimitry Andric --end;
17080b57cec5SDimitry Andric
17090b57cec5SDimitry Andric // FIXME: The "Value" is an uint64_t so we can handle char literals of
17100b57cec5SDimitry Andric // up to 64-bits.
17110b57cec5SDimitry Andric // FIXME: This extensively assumes that 'char' is 8-bits.
17120b57cec5SDimitry Andric assert(PP.getTargetInfo().getCharWidth() == 8 &&
17130b57cec5SDimitry Andric "Assumes char is 8 bits");
17140b57cec5SDimitry Andric assert(PP.getTargetInfo().getIntWidth() <= 64 &&
17150b57cec5SDimitry Andric (PP.getTargetInfo().getIntWidth() & 7) == 0 &&
17160b57cec5SDimitry Andric "Assumes sizeof(int) on target is <= 64 and a multiple of char");
17170b57cec5SDimitry Andric assert(PP.getTargetInfo().getWCharWidth() <= 64 &&
17180b57cec5SDimitry Andric "Assumes sizeof(wchar) on target is <= 64");
17190b57cec5SDimitry Andric
17200b57cec5SDimitry Andric SmallVector<uint32_t, 4> codepoint_buffer;
17210b57cec5SDimitry Andric codepoint_buffer.resize(end - begin);
17220b57cec5SDimitry Andric uint32_t *buffer_begin = &codepoint_buffer.front();
17230b57cec5SDimitry Andric uint32_t *buffer_end = buffer_begin + codepoint_buffer.size();
17240b57cec5SDimitry Andric
17250b57cec5SDimitry Andric // Unicode escapes representing characters that cannot be correctly
17260b57cec5SDimitry Andric // represented in a single code unit are disallowed in character literals
17270b57cec5SDimitry Andric // by this implementation.
17280b57cec5SDimitry Andric uint32_t largest_character_for_kind;
17290b57cec5SDimitry Andric if (tok::wide_char_constant == Kind) {
17300b57cec5SDimitry Andric largest_character_for_kind =
17310b57cec5SDimitry Andric 0xFFFFFFFFu >> (32-PP.getTargetInfo().getWCharWidth());
17320b57cec5SDimitry Andric } else if (tok::utf8_char_constant == Kind) {
17330b57cec5SDimitry Andric largest_character_for_kind = 0x7F;
17340b57cec5SDimitry Andric } else if (tok::utf16_char_constant == Kind) {
17350b57cec5SDimitry Andric largest_character_for_kind = 0xFFFF;
17360b57cec5SDimitry Andric } else if (tok::utf32_char_constant == Kind) {
17370b57cec5SDimitry Andric largest_character_for_kind = 0x10FFFF;
17380b57cec5SDimitry Andric } else {
17390b57cec5SDimitry Andric largest_character_for_kind = 0x7Fu;
17400b57cec5SDimitry Andric }
17410b57cec5SDimitry Andric
17420b57cec5SDimitry Andric while (begin != end) {
17430b57cec5SDimitry Andric // Is this a span of non-escape characters?
17440b57cec5SDimitry Andric if (begin[0] != '\\') {
17450b57cec5SDimitry Andric char const *start = begin;
17460b57cec5SDimitry Andric do {
17470b57cec5SDimitry Andric ++begin;
17480b57cec5SDimitry Andric } while (begin != end && *begin != '\\');
17490b57cec5SDimitry Andric
17500b57cec5SDimitry Andric char const *tmp_in_start = start;
17510b57cec5SDimitry Andric uint32_t *tmp_out_start = buffer_begin;
17520b57cec5SDimitry Andric llvm::ConversionResult res =
17530b57cec5SDimitry Andric llvm::ConvertUTF8toUTF32(reinterpret_cast<llvm::UTF8 const **>(&start),
17540b57cec5SDimitry Andric reinterpret_cast<llvm::UTF8 const *>(begin),
17550b57cec5SDimitry Andric &buffer_begin, buffer_end, llvm::strictConversion);
17560b57cec5SDimitry Andric if (res != llvm::conversionOK) {
17570b57cec5SDimitry Andric // If we see bad encoding for unprefixed character literals, warn and
17580b57cec5SDimitry Andric // simply copy the byte values, for compatibility with gcc and
17590b57cec5SDimitry Andric // older versions of clang.
176081ad6265SDimitry Andric bool NoErrorOnBadEncoding = isOrdinary();
17610b57cec5SDimitry Andric unsigned Msg = diag::err_bad_character_encoding;
17620b57cec5SDimitry Andric if (NoErrorOnBadEncoding)
17630b57cec5SDimitry Andric Msg = diag::warn_bad_character_encoding;
17640b57cec5SDimitry Andric PP.Diag(Loc, Msg);
17650b57cec5SDimitry Andric if (NoErrorOnBadEncoding) {
17660b57cec5SDimitry Andric start = tmp_in_start;
17670b57cec5SDimitry Andric buffer_begin = tmp_out_start;
17680b57cec5SDimitry Andric for (; start != begin; ++start, ++buffer_begin)
17690b57cec5SDimitry Andric *buffer_begin = static_cast<uint8_t>(*start);
17700b57cec5SDimitry Andric } else {
17710b57cec5SDimitry Andric HadError = true;
17720b57cec5SDimitry Andric }
17730b57cec5SDimitry Andric } else {
17740b57cec5SDimitry Andric for (; tmp_out_start < buffer_begin; ++tmp_out_start) {
17750b57cec5SDimitry Andric if (*tmp_out_start > largest_character_for_kind) {
17760b57cec5SDimitry Andric HadError = true;
17770b57cec5SDimitry Andric PP.Diag(Loc, diag::err_character_too_large);
17780b57cec5SDimitry Andric }
17790b57cec5SDimitry Andric }
17800b57cec5SDimitry Andric }
17810b57cec5SDimitry Andric
17820b57cec5SDimitry Andric continue;
17830b57cec5SDimitry Andric }
17840b57cec5SDimitry Andric // Is this a Universal Character Name escape?
178581ad6265SDimitry Andric if (begin[1] == 'u' || begin[1] == 'U' || begin[1] == 'N') {
17860b57cec5SDimitry Andric unsigned short UcnLen = 0;
17870b57cec5SDimitry Andric if (!ProcessUCNEscape(TokBegin, begin, end, *buffer_begin, UcnLen,
17880b57cec5SDimitry Andric FullSourceLoc(Loc, PP.getSourceManager()),
17890b57cec5SDimitry Andric &PP.getDiagnostics(), PP.getLangOpts(), true)) {
17900b57cec5SDimitry Andric HadError = true;
17910b57cec5SDimitry Andric } else if (*buffer_begin > largest_character_for_kind) {
17920b57cec5SDimitry Andric HadError = true;
17930b57cec5SDimitry Andric PP.Diag(Loc, diag::err_character_too_large);
17940b57cec5SDimitry Andric }
17950b57cec5SDimitry Andric
17960b57cec5SDimitry Andric ++buffer_begin;
17970b57cec5SDimitry Andric continue;
17980b57cec5SDimitry Andric }
17990b57cec5SDimitry Andric unsigned CharWidth = getCharWidth(Kind, PP.getTargetInfo());
18000b57cec5SDimitry Andric uint64_t result =
18010b57cec5SDimitry Andric ProcessCharEscape(TokBegin, begin, end, HadError,
180206c3fb27SDimitry Andric FullSourceLoc(Loc, PP.getSourceManager()), CharWidth,
180306c3fb27SDimitry Andric &PP.getDiagnostics(), PP.getLangOpts(),
180406c3fb27SDimitry Andric StringLiteralEvalMethod::Evaluated);
18050b57cec5SDimitry Andric *buffer_begin++ = result;
18060b57cec5SDimitry Andric }
18070b57cec5SDimitry Andric
18080b57cec5SDimitry Andric unsigned NumCharsSoFar = buffer_begin - &codepoint_buffer.front();
18090b57cec5SDimitry Andric
18100b57cec5SDimitry Andric if (NumCharsSoFar > 1) {
181181ad6265SDimitry Andric if (isOrdinary() && NumCharsSoFar == 4)
1812e8d8bef9SDimitry Andric PP.Diag(Loc, diag::warn_four_char_character_literal);
181381ad6265SDimitry Andric else if (isOrdinary())
1814e8d8bef9SDimitry Andric PP.Diag(Loc, diag::warn_multichar_character_literal);
1815349cc55cSDimitry Andric else {
1816349cc55cSDimitry Andric PP.Diag(Loc, diag::err_multichar_character_literal) << (isWide() ? 0 : 1);
1817349cc55cSDimitry Andric HadError = true;
1818349cc55cSDimitry Andric }
18190b57cec5SDimitry Andric IsMultiChar = true;
18200b57cec5SDimitry Andric } else {
18210b57cec5SDimitry Andric IsMultiChar = false;
18220b57cec5SDimitry Andric }
18230b57cec5SDimitry Andric
18240b57cec5SDimitry Andric llvm::APInt LitVal(PP.getTargetInfo().getIntWidth(), 0);
18250b57cec5SDimitry Andric
18260b57cec5SDimitry Andric // Narrow character literals act as though their value is concatenated
18270b57cec5SDimitry Andric // in this implementation, but warn on overflow.
18280b57cec5SDimitry Andric bool multi_char_too_long = false;
182981ad6265SDimitry Andric if (isOrdinary() && isMultiChar()) {
18300b57cec5SDimitry Andric LitVal = 0;
18310b57cec5SDimitry Andric for (size_t i = 0; i < NumCharsSoFar; ++i) {
18320b57cec5SDimitry Andric // check for enough leading zeros to shift into
183306c3fb27SDimitry Andric multi_char_too_long |= (LitVal.countl_zero() < 8);
18340b57cec5SDimitry Andric LitVal <<= 8;
18350b57cec5SDimitry Andric LitVal = LitVal + (codepoint_buffer[i] & 0xFF);
18360b57cec5SDimitry Andric }
18370b57cec5SDimitry Andric } else if (NumCharsSoFar > 0) {
18380b57cec5SDimitry Andric // otherwise just take the last character
18390b57cec5SDimitry Andric LitVal = buffer_begin[-1];
18400b57cec5SDimitry Andric }
18410b57cec5SDimitry Andric
18420b57cec5SDimitry Andric if (!HadError && multi_char_too_long) {
18430b57cec5SDimitry Andric PP.Diag(Loc, diag::warn_char_constant_too_large);
18440b57cec5SDimitry Andric }
18450b57cec5SDimitry Andric
18460b57cec5SDimitry Andric // Transfer the value from APInt to uint64_t
18470b57cec5SDimitry Andric Value = LitVal.getZExtValue();
18480b57cec5SDimitry Andric
18490b57cec5SDimitry Andric // If this is a single narrow character, sign extend it (e.g. '\xFF' is "-1")
18500b57cec5SDimitry Andric // if 'char' is signed for this target (C99 6.4.4.4p10). Note that multiple
18510b57cec5SDimitry Andric // character constants are not sign extended in the this implementation:
18520b57cec5SDimitry Andric // '\xFF\xFF' = 65536 and '\x0\xFF' = 255, which matches GCC.
185381ad6265SDimitry Andric if (isOrdinary() && NumCharsSoFar == 1 && (Value & 128) &&
18540b57cec5SDimitry Andric PP.getLangOpts().CharIsSigned)
18550b57cec5SDimitry Andric Value = (signed char)Value;
18560b57cec5SDimitry Andric }
18570b57cec5SDimitry Andric
/// \verbatim
///       string-literal: [C++0x lex.string]
///         encoding-prefix " [s-char-sequence] "
///         encoding-prefix R raw-string
///       encoding-prefix:
///         u8
///         u
///         U
///         L
///       s-char-sequence:
///         s-char
///         s-char-sequence s-char
///       s-char:
///         any member of the source character set except the double-quote ",
///           backslash \, or new-line character
///         escape-sequence
///         universal-character-name
///       raw-string:
///         " d-char-sequence ( r-char-sequence ) d-char-sequence "
///       r-char-sequence:
///         r-char
///         r-char-sequence r-char
///       r-char:
///         any member of the source character set, except a right parenthesis )
///           followed by the initial d-char-sequence (which may be empty)
///           followed by a double quote ".
///       d-char-sequence:
///         d-char
///         d-char-sequence d-char
///       d-char:
///         any member of the basic source character set except:
///           space, the left parenthesis (, the right parenthesis ),
///           the backslash \, and the control characters representing horizontal
///           tab, vertical tab, form feed, and newline.
///       escape-sequence: [C++0x lex.ccon]
///         simple-escape-sequence
///         octal-escape-sequence
///         hexadecimal-escape-sequence
///       simple-escape-sequence:
///         one of \' \" \? \\ \a \b \f \n \r \t \v
///       octal-escape-sequence:
///         \ octal-digit
///         \ octal-digit octal-digit
///         \ octal-digit octal-digit octal-digit
///       hexadecimal-escape-sequence:
///         \x hexadecimal-digit
///         hexadecimal-escape-sequence hexadecimal-digit
///       universal-character-name:
///         \u hex-quad
///         \U hex-quad hex-quad
///       hex-quad:
///         hex-digit hex-digit hex-digit hex-digit
/// \endverbatim
///
StringLiteralParser(ArrayRef<Token> StringToks,Preprocessor & PP,StringLiteralEvalMethod EvalMethod)191206c3fb27SDimitry Andric StringLiteralParser::StringLiteralParser(ArrayRef<Token> StringToks,
191306c3fb27SDimitry Andric Preprocessor &PP,
191406c3fb27SDimitry Andric StringLiteralEvalMethod EvalMethod)
19150b57cec5SDimitry Andric : SM(PP.getSourceManager()), Features(PP.getLangOpts()),
1916349cc55cSDimitry Andric Target(PP.getTargetInfo()), Diags(&PP.getDiagnostics()),
19170b57cec5SDimitry Andric MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
191806c3fb27SDimitry Andric ResultPtr(ResultBuf.data()), EvalMethod(EvalMethod), hadError(false),
191906c3fb27SDimitry Andric Pascal(false) {
19200b57cec5SDimitry Andric init(StringToks);
19210b57cec5SDimitry Andric }
19220b57cec5SDimitry Andric
init(ArrayRef<Token> StringToks)19230b57cec5SDimitry Andric void StringLiteralParser::init(ArrayRef<Token> StringToks){
19240b57cec5SDimitry Andric // The literal token may have come from an invalid source location (e.g. due
19250b57cec5SDimitry Andric // to a PCH error), in which case the token length will be 0.
19260b57cec5SDimitry Andric if (StringToks.empty() || StringToks[0].getLength() < 2)
19270b57cec5SDimitry Andric return DiagnoseLexingError(SourceLocation());
19280b57cec5SDimitry Andric
19290b57cec5SDimitry Andric // Scan all of the string portions, remember the max individual token length,
19300b57cec5SDimitry Andric // computing a bound on the concatenated string length, and see whether any
19310b57cec5SDimitry Andric // piece is a wide-string. If any of the string portions is a wide-string
19320b57cec5SDimitry Andric // literal, the result is a wide-string literal [C99 6.4.5p4].
19330b57cec5SDimitry Andric assert(!StringToks.empty() && "expected at least one token");
19340b57cec5SDimitry Andric MaxTokenLength = StringToks[0].getLength();
19350b57cec5SDimitry Andric assert(StringToks[0].getLength() >= 2 && "literal token is invalid!");
19360b57cec5SDimitry Andric SizeBound = StringToks[0].getLength() - 2; // -2 for "".
19370b57cec5SDimitry Andric hadError = false;
19380b57cec5SDimitry Andric
193906c3fb27SDimitry Andric // Determines the kind of string from the prefix
194006c3fb27SDimitry Andric Kind = tok::string_literal;
194106c3fb27SDimitry Andric
19420b57cec5SDimitry Andric /// (C99 5.1.1.2p1). The common case is only one string fragment.
194306c3fb27SDimitry Andric for (const Token &Tok : StringToks) {
194406c3fb27SDimitry Andric if (Tok.getLength() < 2)
194506c3fb27SDimitry Andric return DiagnoseLexingError(Tok.getLocation());
19460b57cec5SDimitry Andric
19470b57cec5SDimitry Andric // The string could be shorter than this if it needs cleaning, but this is a
19480b57cec5SDimitry Andric // reasonable bound, which is all we need.
194906c3fb27SDimitry Andric assert(Tok.getLength() >= 2 && "literal token is invalid!");
195006c3fb27SDimitry Andric SizeBound += Tok.getLength() - 2; // -2 for "".
19510b57cec5SDimitry Andric
19520b57cec5SDimitry Andric // Remember maximum string piece length.
195306c3fb27SDimitry Andric if (Tok.getLength() > MaxTokenLength)
195406c3fb27SDimitry Andric MaxTokenLength = Tok.getLength();
19550b57cec5SDimitry Andric
19560b57cec5SDimitry Andric // Remember if we see any wide or utf-8/16/32 strings.
19570b57cec5SDimitry Andric // Also check for illegal concatenations.
195806c3fb27SDimitry Andric if (isUnevaluated() && Tok.getKind() != tok::string_literal) {
19598a4dda33SDimitry Andric if (Diags) {
19608a4dda33SDimitry Andric SourceLocation PrefixEndLoc = Lexer::AdvanceToTokenCharacter(
19618a4dda33SDimitry Andric Tok.getLocation(), getEncodingPrefixLen(Tok.getKind()), SM,
19628a4dda33SDimitry Andric Features);
19638a4dda33SDimitry Andric CharSourceRange Range =
19648a4dda33SDimitry Andric CharSourceRange::getCharRange({Tok.getLocation(), PrefixEndLoc});
19658a4dda33SDimitry Andric StringRef Prefix(SM.getCharacterData(Tok.getLocation()),
19668a4dda33SDimitry Andric getEncodingPrefixLen(Tok.getKind()));
19678a4dda33SDimitry Andric Diags->Report(Tok.getLocation(),
19688a4dda33SDimitry Andric Features.CPlusPlus26
19698a4dda33SDimitry Andric ? diag::err_unevaluated_string_prefix
19708a4dda33SDimitry Andric : diag::warn_unevaluated_string_prefix)
19718a4dda33SDimitry Andric << Prefix << Features.CPlusPlus << FixItHint::CreateRemoval(Range);
19728a4dda33SDimitry Andric }
19738a4dda33SDimitry Andric if (Features.CPlusPlus26)
197406c3fb27SDimitry Andric hadError = true;
197506c3fb27SDimitry Andric } else if (Tok.isNot(Kind) && Tok.isNot(tok::string_literal)) {
197681ad6265SDimitry Andric if (isOrdinary()) {
197706c3fb27SDimitry Andric Kind = Tok.getKind();
19780b57cec5SDimitry Andric } else {
19790b57cec5SDimitry Andric if (Diags)
198006c3fb27SDimitry Andric Diags->Report(Tok.getLocation(), diag::err_unsupported_string_concat);
19810b57cec5SDimitry Andric hadError = true;
19820b57cec5SDimitry Andric }
19830b57cec5SDimitry Andric }
19840b57cec5SDimitry Andric }
19850b57cec5SDimitry Andric
19860b57cec5SDimitry Andric // Include space for the null terminator.
19870b57cec5SDimitry Andric ++SizeBound;
19880b57cec5SDimitry Andric
19890b57cec5SDimitry Andric // TODO: K&R warning: "traditional C rejects string constant concatenation"
19900b57cec5SDimitry Andric
19910b57cec5SDimitry Andric // Get the width in bytes of char/wchar_t/char16_t/char32_t
19920b57cec5SDimitry Andric CharByteWidth = getCharWidth(Kind, Target);
19930b57cec5SDimitry Andric assert((CharByteWidth & 7) == 0 && "Assumes character size is byte multiple");
19940b57cec5SDimitry Andric CharByteWidth /= 8;
19950b57cec5SDimitry Andric
19960b57cec5SDimitry Andric // The output buffer size needs to be large enough to hold wide characters.
19970b57cec5SDimitry Andric // This is a worst-case assumption which basically corresponds to L"" "long".
19980b57cec5SDimitry Andric SizeBound *= CharByteWidth;
19990b57cec5SDimitry Andric
20000b57cec5SDimitry Andric // Size the temporary buffer to hold the result string data.
20010b57cec5SDimitry Andric ResultBuf.resize(SizeBound);
20020b57cec5SDimitry Andric
20030b57cec5SDimitry Andric // Likewise, but for each string piece.
20040b57cec5SDimitry Andric SmallString<512> TokenBuf;
20050b57cec5SDimitry Andric TokenBuf.resize(MaxTokenLength);
20060b57cec5SDimitry Andric
20070b57cec5SDimitry Andric // Loop over all the strings, getting their spelling, and expanding them to
20080b57cec5SDimitry Andric // wide strings as appropriate.
20090b57cec5SDimitry Andric ResultPtr = &ResultBuf[0]; // Next byte to fill in.
20100b57cec5SDimitry Andric
20110b57cec5SDimitry Andric Pascal = false;
20120b57cec5SDimitry Andric
20130b57cec5SDimitry Andric SourceLocation UDSuffixTokLoc;
20140b57cec5SDimitry Andric
20150b57cec5SDimitry Andric for (unsigned i = 0, e = StringToks.size(); i != e; ++i) {
20160b57cec5SDimitry Andric const char *ThisTokBuf = &TokenBuf[0];
20170b57cec5SDimitry Andric // Get the spelling of the token, which eliminates trigraphs, etc. We know
20180b57cec5SDimitry Andric // that ThisTokBuf points to a buffer that is big enough for the whole token
20190b57cec5SDimitry Andric // and 'spelled' tokens can only shrink.
20200b57cec5SDimitry Andric bool StringInvalid = false;
20210b57cec5SDimitry Andric unsigned ThisTokLen =
20220b57cec5SDimitry Andric Lexer::getSpelling(StringToks[i], ThisTokBuf, SM, Features,
20230b57cec5SDimitry Andric &StringInvalid);
20240b57cec5SDimitry Andric if (StringInvalid)
20250b57cec5SDimitry Andric return DiagnoseLexingError(StringToks[i].getLocation());
20260b57cec5SDimitry Andric
20270b57cec5SDimitry Andric const char *ThisTokBegin = ThisTokBuf;
20280b57cec5SDimitry Andric const char *ThisTokEnd = ThisTokBuf+ThisTokLen;
20290b57cec5SDimitry Andric
20300b57cec5SDimitry Andric // Remove an optional ud-suffix.
20310b57cec5SDimitry Andric if (ThisTokEnd[-1] != '"') {
20320b57cec5SDimitry Andric const char *UDSuffixEnd = ThisTokEnd;
20330b57cec5SDimitry Andric do {
20340b57cec5SDimitry Andric --ThisTokEnd;
20350b57cec5SDimitry Andric } while (ThisTokEnd[-1] != '"');
20360b57cec5SDimitry Andric
20370b57cec5SDimitry Andric StringRef UDSuffix(ThisTokEnd, UDSuffixEnd - ThisTokEnd);
20380b57cec5SDimitry Andric
20390b57cec5SDimitry Andric if (UDSuffixBuf.empty()) {
20400b57cec5SDimitry Andric if (StringToks[i].hasUCN())
20410b57cec5SDimitry Andric expandUCNs(UDSuffixBuf, UDSuffix);
20420b57cec5SDimitry Andric else
20430b57cec5SDimitry Andric UDSuffixBuf.assign(UDSuffix);
20440b57cec5SDimitry Andric UDSuffixToken = i;
20450b57cec5SDimitry Andric UDSuffixOffset = ThisTokEnd - ThisTokBuf;
20460b57cec5SDimitry Andric UDSuffixTokLoc = StringToks[i].getLocation();
20470b57cec5SDimitry Andric } else {
20480b57cec5SDimitry Andric SmallString<32> ExpandedUDSuffix;
20490b57cec5SDimitry Andric if (StringToks[i].hasUCN()) {
20500b57cec5SDimitry Andric expandUCNs(ExpandedUDSuffix, UDSuffix);
20510b57cec5SDimitry Andric UDSuffix = ExpandedUDSuffix;
20520b57cec5SDimitry Andric }
20530b57cec5SDimitry Andric
20540b57cec5SDimitry Andric // C++11 [lex.ext]p8: At the end of phase 6, if a string literal is the
20550b57cec5SDimitry Andric // result of a concatenation involving at least one user-defined-string-
20560b57cec5SDimitry Andric // literal, all the participating user-defined-string-literals shall
20570b57cec5SDimitry Andric // have the same ud-suffix.
205806c3fb27SDimitry Andric bool UnevaluatedStringHasUDL = isUnevaluated() && !UDSuffix.empty();
205906c3fb27SDimitry Andric if (UDSuffixBuf != UDSuffix || UnevaluatedStringHasUDL) {
20600b57cec5SDimitry Andric if (Diags) {
20610b57cec5SDimitry Andric SourceLocation TokLoc = StringToks[i].getLocation();
206206c3fb27SDimitry Andric if (UnevaluatedStringHasUDL) {
206306c3fb27SDimitry Andric Diags->Report(TokLoc, diag::err_unevaluated_string_udl)
206406c3fb27SDimitry Andric << SourceRange(TokLoc, TokLoc);
206506c3fb27SDimitry Andric } else {
20660b57cec5SDimitry Andric Diags->Report(TokLoc, diag::err_string_concat_mixed_suffix)
20670b57cec5SDimitry Andric << UDSuffixBuf << UDSuffix
206806c3fb27SDimitry Andric << SourceRange(UDSuffixTokLoc, UDSuffixTokLoc);
206906c3fb27SDimitry Andric }
20700b57cec5SDimitry Andric }
20710b57cec5SDimitry Andric hadError = true;
20720b57cec5SDimitry Andric }
20730b57cec5SDimitry Andric }
20740b57cec5SDimitry Andric }
20750b57cec5SDimitry Andric
20760b57cec5SDimitry Andric // Strip the end quote.
20770b57cec5SDimitry Andric --ThisTokEnd;
20780b57cec5SDimitry Andric
20790b57cec5SDimitry Andric // TODO: Input character set mapping support.
20800b57cec5SDimitry Andric
20810b57cec5SDimitry Andric // Skip marker for wide or unicode strings.
20820b57cec5SDimitry Andric if (ThisTokBuf[0] == 'L' || ThisTokBuf[0] == 'u' || ThisTokBuf[0] == 'U') {
20830b57cec5SDimitry Andric ++ThisTokBuf;
20840b57cec5SDimitry Andric // Skip 8 of u8 marker for utf8 strings.
20850b57cec5SDimitry Andric if (ThisTokBuf[0] == '8')
20860b57cec5SDimitry Andric ++ThisTokBuf;
20870b57cec5SDimitry Andric }
20880b57cec5SDimitry Andric
20890b57cec5SDimitry Andric // Check for raw string
20900b57cec5SDimitry Andric if (ThisTokBuf[0] == 'R') {
2091fe6060f1SDimitry Andric if (ThisTokBuf[1] != '"') {
2092fe6060f1SDimitry Andric // The file may have come from PCH and then changed after loading the
2093fe6060f1SDimitry Andric // PCH; Fail gracefully.
2094fe6060f1SDimitry Andric return DiagnoseLexingError(StringToks[i].getLocation());
2095fe6060f1SDimitry Andric }
20960b57cec5SDimitry Andric ThisTokBuf += 2; // skip R"
20970b57cec5SDimitry Andric
2098fe6060f1SDimitry Andric // C++11 [lex.string]p2: A `d-char-sequence` shall consist of at most 16
2099fe6060f1SDimitry Andric // characters.
2100fe6060f1SDimitry Andric constexpr unsigned MaxRawStrDelimLen = 16;
2101fe6060f1SDimitry Andric
21020b57cec5SDimitry Andric const char *Prefix = ThisTokBuf;
2103fe6060f1SDimitry Andric while (static_cast<unsigned>(ThisTokBuf - Prefix) < MaxRawStrDelimLen &&
2104fe6060f1SDimitry Andric ThisTokBuf[0] != '(')
21050b57cec5SDimitry Andric ++ThisTokBuf;
2106fe6060f1SDimitry Andric if (ThisTokBuf[0] != '(')
2107fe6060f1SDimitry Andric return DiagnoseLexingError(StringToks[i].getLocation());
21080b57cec5SDimitry Andric ++ThisTokBuf; // skip '('
21090b57cec5SDimitry Andric
21100b57cec5SDimitry Andric // Remove same number of characters from the end
21110b57cec5SDimitry Andric ThisTokEnd -= ThisTokBuf - Prefix;
2112fe6060f1SDimitry Andric if (ThisTokEnd < ThisTokBuf)
2113fe6060f1SDimitry Andric return DiagnoseLexingError(StringToks[i].getLocation());
21140b57cec5SDimitry Andric
21150b57cec5SDimitry Andric // C++14 [lex.string]p4: A source-file new-line in a raw string literal
21160b57cec5SDimitry Andric // results in a new-line in the resulting execution string-literal.
21170b57cec5SDimitry Andric StringRef RemainingTokenSpan(ThisTokBuf, ThisTokEnd - ThisTokBuf);
21180b57cec5SDimitry Andric while (!RemainingTokenSpan.empty()) {
21190b57cec5SDimitry Andric // Split the string literal on \r\n boundaries.
21200b57cec5SDimitry Andric size_t CRLFPos = RemainingTokenSpan.find("\r\n");
21210b57cec5SDimitry Andric StringRef BeforeCRLF = RemainingTokenSpan.substr(0, CRLFPos);
21220b57cec5SDimitry Andric StringRef AfterCRLF = RemainingTokenSpan.substr(CRLFPos);
21230b57cec5SDimitry Andric
21240b57cec5SDimitry Andric // Copy everything before the \r\n sequence into the string literal.
21250b57cec5SDimitry Andric if (CopyStringFragment(StringToks[i], ThisTokBegin, BeforeCRLF))
21260b57cec5SDimitry Andric hadError = true;
21270b57cec5SDimitry Andric
21280b57cec5SDimitry Andric // Point into the \n inside the \r\n sequence and operate on the
21290b57cec5SDimitry Andric // remaining portion of the literal.
21300b57cec5SDimitry Andric RemainingTokenSpan = AfterCRLF.substr(1);
21310b57cec5SDimitry Andric }
21320b57cec5SDimitry Andric } else {
21330b57cec5SDimitry Andric if (ThisTokBuf[0] != '"') {
21340b57cec5SDimitry Andric // The file may have come from PCH and then changed after loading the
21350b57cec5SDimitry Andric // PCH; Fail gracefully.
21360b57cec5SDimitry Andric return DiagnoseLexingError(StringToks[i].getLocation());
21370b57cec5SDimitry Andric }
21380b57cec5SDimitry Andric ++ThisTokBuf; // skip "
21390b57cec5SDimitry Andric
21400b57cec5SDimitry Andric // Check if this is a pascal string
214106c3fb27SDimitry Andric if (!isUnevaluated() && Features.PascalStrings &&
214206c3fb27SDimitry Andric ThisTokBuf + 1 != ThisTokEnd && ThisTokBuf[0] == '\\' &&
214306c3fb27SDimitry Andric ThisTokBuf[1] == 'p') {
21440b57cec5SDimitry Andric
21450b57cec5SDimitry Andric // If the \p sequence is found in the first token, we have a pascal string
21460b57cec5SDimitry Andric // Otherwise, if we already have a pascal string, ignore the first \p
21470b57cec5SDimitry Andric if (i == 0) {
21480b57cec5SDimitry Andric ++ThisTokBuf;
21490b57cec5SDimitry Andric Pascal = true;
21500b57cec5SDimitry Andric } else if (Pascal)
21510b57cec5SDimitry Andric ThisTokBuf += 2;
21520b57cec5SDimitry Andric }
21530b57cec5SDimitry Andric
21540b57cec5SDimitry Andric while (ThisTokBuf != ThisTokEnd) {
21550b57cec5SDimitry Andric // Is this a span of non-escape characters?
21560b57cec5SDimitry Andric if (ThisTokBuf[0] != '\\') {
21570b57cec5SDimitry Andric const char *InStart = ThisTokBuf;
21580b57cec5SDimitry Andric do {
21590b57cec5SDimitry Andric ++ThisTokBuf;
21600b57cec5SDimitry Andric } while (ThisTokBuf != ThisTokEnd && ThisTokBuf[0] != '\\');
21610b57cec5SDimitry Andric
21620b57cec5SDimitry Andric // Copy the character span over.
21630b57cec5SDimitry Andric if (CopyStringFragment(StringToks[i], ThisTokBegin,
21640b57cec5SDimitry Andric StringRef(InStart, ThisTokBuf - InStart)))
21650b57cec5SDimitry Andric hadError = true;
21660b57cec5SDimitry Andric continue;
21670b57cec5SDimitry Andric }
21680b57cec5SDimitry Andric // Is this a Universal Character Name escape?
216981ad6265SDimitry Andric if (ThisTokBuf[1] == 'u' || ThisTokBuf[1] == 'U' ||
217081ad6265SDimitry Andric ThisTokBuf[1] == 'N') {
21710b57cec5SDimitry Andric EncodeUCNEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd,
21720b57cec5SDimitry Andric ResultPtr, hadError,
21730b57cec5SDimitry Andric FullSourceLoc(StringToks[i].getLocation(), SM),
21740b57cec5SDimitry Andric CharByteWidth, Diags, Features);
21750b57cec5SDimitry Andric continue;
21760b57cec5SDimitry Andric }
21770b57cec5SDimitry Andric // Otherwise, this is a non-UCN escape character. Process it.
21780b57cec5SDimitry Andric unsigned ResultChar =
21790b57cec5SDimitry Andric ProcessCharEscape(ThisTokBegin, ThisTokBuf, ThisTokEnd, hadError,
21800b57cec5SDimitry Andric FullSourceLoc(StringToks[i].getLocation(), SM),
218106c3fb27SDimitry Andric CharByteWidth * 8, Diags, Features, EvalMethod);
21820b57cec5SDimitry Andric
21830b57cec5SDimitry Andric if (CharByteWidth == 4) {
21840b57cec5SDimitry Andric // FIXME: Make the type of the result buffer correct instead of
21850b57cec5SDimitry Andric // using reinterpret_cast.
21860b57cec5SDimitry Andric llvm::UTF32 *ResultWidePtr = reinterpret_cast<llvm::UTF32*>(ResultPtr);
21870b57cec5SDimitry Andric *ResultWidePtr = ResultChar;
21880b57cec5SDimitry Andric ResultPtr += 4;
21890b57cec5SDimitry Andric } else if (CharByteWidth == 2) {
21900b57cec5SDimitry Andric // FIXME: Make the type of the result buffer correct instead of
21910b57cec5SDimitry Andric // using reinterpret_cast.
21920b57cec5SDimitry Andric llvm::UTF16 *ResultWidePtr = reinterpret_cast<llvm::UTF16*>(ResultPtr);
21930b57cec5SDimitry Andric *ResultWidePtr = ResultChar & 0xFFFF;
21940b57cec5SDimitry Andric ResultPtr += 2;
21950b57cec5SDimitry Andric } else {
21960b57cec5SDimitry Andric assert(CharByteWidth == 1 && "Unexpected char width");
21970b57cec5SDimitry Andric *ResultPtr++ = ResultChar & 0xFF;
21980b57cec5SDimitry Andric }
21990b57cec5SDimitry Andric }
22000b57cec5SDimitry Andric }
22010b57cec5SDimitry Andric }
22020b57cec5SDimitry Andric
220306c3fb27SDimitry Andric assert((!Pascal || !isUnevaluated()) &&
220406c3fb27SDimitry Andric "Pascal string in unevaluated context");
22050b57cec5SDimitry Andric if (Pascal) {
22060b57cec5SDimitry Andric if (CharByteWidth == 4) {
22070b57cec5SDimitry Andric // FIXME: Make the type of the result buffer correct instead of
22080b57cec5SDimitry Andric // using reinterpret_cast.
22090b57cec5SDimitry Andric llvm::UTF32 *ResultWidePtr = reinterpret_cast<llvm::UTF32*>(ResultBuf.data());
22100b57cec5SDimitry Andric ResultWidePtr[0] = GetNumStringChars() - 1;
22110b57cec5SDimitry Andric } else if (CharByteWidth == 2) {
22120b57cec5SDimitry Andric // FIXME: Make the type of the result buffer correct instead of
22130b57cec5SDimitry Andric // using reinterpret_cast.
22140b57cec5SDimitry Andric llvm::UTF16 *ResultWidePtr = reinterpret_cast<llvm::UTF16*>(ResultBuf.data());
22150b57cec5SDimitry Andric ResultWidePtr[0] = GetNumStringChars() - 1;
22160b57cec5SDimitry Andric } else {
22170b57cec5SDimitry Andric assert(CharByteWidth == 1 && "Unexpected char width");
22180b57cec5SDimitry Andric ResultBuf[0] = GetNumStringChars() - 1;
22190b57cec5SDimitry Andric }
22200b57cec5SDimitry Andric
22210b57cec5SDimitry Andric // Verify that pascal strings aren't too large.
22220b57cec5SDimitry Andric if (GetStringLength() > 256) {
22230b57cec5SDimitry Andric if (Diags)
22240b57cec5SDimitry Andric Diags->Report(StringToks.front().getLocation(),
22250b57cec5SDimitry Andric diag::err_pascal_string_too_long)
22260b57cec5SDimitry Andric << SourceRange(StringToks.front().getLocation(),
22270b57cec5SDimitry Andric StringToks.back().getLocation());
22280b57cec5SDimitry Andric hadError = true;
22290b57cec5SDimitry Andric return;
22300b57cec5SDimitry Andric }
22310b57cec5SDimitry Andric } else if (Diags) {
22320b57cec5SDimitry Andric // Complain if this string literal has too many characters.
22330b57cec5SDimitry Andric unsigned MaxChars = Features.CPlusPlus? 65536 : Features.C99 ? 4095 : 509;
22340b57cec5SDimitry Andric
22350b57cec5SDimitry Andric if (GetNumStringChars() > MaxChars)
22360b57cec5SDimitry Andric Diags->Report(StringToks.front().getLocation(),
22370b57cec5SDimitry Andric diag::ext_string_too_long)
22380b57cec5SDimitry Andric << GetNumStringChars() << MaxChars
22390b57cec5SDimitry Andric << (Features.CPlusPlus ? 2 : Features.C99 ? 1 : 0)
22400b57cec5SDimitry Andric << SourceRange(StringToks.front().getLocation(),
22410b57cec5SDimitry Andric StringToks.back().getLocation());
22420b57cec5SDimitry Andric }
22430b57cec5SDimitry Andric }
22440b57cec5SDimitry Andric
resyncUTF8(const char * Err,const char * End)22450b57cec5SDimitry Andric static const char *resyncUTF8(const char *Err, const char *End) {
22460b57cec5SDimitry Andric if (Err == End)
22470b57cec5SDimitry Andric return End;
22480b57cec5SDimitry Andric End = Err + std::min<unsigned>(llvm::getNumBytesForUTF8(*Err), End-Err);
22490b57cec5SDimitry Andric while (++Err != End && (*Err & 0xC0) == 0x80)
22500b57cec5SDimitry Andric ;
22510b57cec5SDimitry Andric return Err;
22520b57cec5SDimitry Andric }
22530b57cec5SDimitry Andric
22540b57cec5SDimitry Andric /// This function copies from Fragment, which is a sequence of bytes
22550b57cec5SDimitry Andric /// within Tok's contents (which begin at TokBegin) into ResultPtr.
22560b57cec5SDimitry Andric /// Performs widening for multi-byte characters.
CopyStringFragment(const Token & Tok,const char * TokBegin,StringRef Fragment)22570b57cec5SDimitry Andric bool StringLiteralParser::CopyStringFragment(const Token &Tok,
22580b57cec5SDimitry Andric const char *TokBegin,
22590b57cec5SDimitry Andric StringRef Fragment) {
22600b57cec5SDimitry Andric const llvm::UTF8 *ErrorPtrTmp;
22610b57cec5SDimitry Andric if (ConvertUTF8toWide(CharByteWidth, Fragment, ResultPtr, ErrorPtrTmp))
22620b57cec5SDimitry Andric return false;
22630b57cec5SDimitry Andric
22640b57cec5SDimitry Andric // If we see bad encoding for unprefixed string literals, warn and
22650b57cec5SDimitry Andric // simply copy the byte values, for compatibility with gcc and older
22660b57cec5SDimitry Andric // versions of clang.
226781ad6265SDimitry Andric bool NoErrorOnBadEncoding = isOrdinary();
22680b57cec5SDimitry Andric if (NoErrorOnBadEncoding) {
22690b57cec5SDimitry Andric memcpy(ResultPtr, Fragment.data(), Fragment.size());
22700b57cec5SDimitry Andric ResultPtr += Fragment.size();
22710b57cec5SDimitry Andric }
22720b57cec5SDimitry Andric
22730b57cec5SDimitry Andric if (Diags) {
22740b57cec5SDimitry Andric const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
22750b57cec5SDimitry Andric
22760b57cec5SDimitry Andric FullSourceLoc SourceLoc(Tok.getLocation(), SM);
22770b57cec5SDimitry Andric const DiagnosticBuilder &Builder =
22780b57cec5SDimitry Andric Diag(Diags, Features, SourceLoc, TokBegin,
22790b57cec5SDimitry Andric ErrorPtr, resyncUTF8(ErrorPtr, Fragment.end()),
22800b57cec5SDimitry Andric NoErrorOnBadEncoding ? diag::warn_bad_string_encoding
22810b57cec5SDimitry Andric : diag::err_bad_string_encoding);
22820b57cec5SDimitry Andric
22830b57cec5SDimitry Andric const char *NextStart = resyncUTF8(ErrorPtr, Fragment.end());
22840b57cec5SDimitry Andric StringRef NextFragment(NextStart, Fragment.end()-NextStart);
22850b57cec5SDimitry Andric
22860b57cec5SDimitry Andric // Decode into a dummy buffer.
22870b57cec5SDimitry Andric SmallString<512> Dummy;
22880b57cec5SDimitry Andric Dummy.reserve(Fragment.size() * CharByteWidth);
22890b57cec5SDimitry Andric char *Ptr = Dummy.data();
22900b57cec5SDimitry Andric
22910b57cec5SDimitry Andric while (!ConvertUTF8toWide(CharByteWidth, NextFragment, Ptr, ErrorPtrTmp)) {
22920b57cec5SDimitry Andric const char *ErrorPtr = reinterpret_cast<const char *>(ErrorPtrTmp);
22930b57cec5SDimitry Andric NextStart = resyncUTF8(ErrorPtr, Fragment.end());
22940b57cec5SDimitry Andric Builder << MakeCharSourceRange(Features, SourceLoc, TokBegin,
22950b57cec5SDimitry Andric ErrorPtr, NextStart);
22960b57cec5SDimitry Andric NextFragment = StringRef(NextStart, Fragment.end()-NextStart);
22970b57cec5SDimitry Andric }
22980b57cec5SDimitry Andric }
22990b57cec5SDimitry Andric return !NoErrorOnBadEncoding;
23000b57cec5SDimitry Andric }
23010b57cec5SDimitry Andric
DiagnoseLexingError(SourceLocation Loc)23020b57cec5SDimitry Andric void StringLiteralParser::DiagnoseLexingError(SourceLocation Loc) {
23030b57cec5SDimitry Andric hadError = true;
23040b57cec5SDimitry Andric if (Diags)
23050b57cec5SDimitry Andric Diags->Report(Loc, diag::err_lexing_string);
23060b57cec5SDimitry Andric }
23070b57cec5SDimitry Andric
23080b57cec5SDimitry Andric /// getOffsetOfStringByte - This function returns the offset of the
23090b57cec5SDimitry Andric /// specified byte of the string data represented by Token. This handles
23100b57cec5SDimitry Andric /// advancing over escape sequences in the string.
getOffsetOfStringByte(const Token & Tok,unsigned ByteNo) const23110b57cec5SDimitry Andric unsigned StringLiteralParser::getOffsetOfStringByte(const Token &Tok,
23120b57cec5SDimitry Andric unsigned ByteNo) const {
23130b57cec5SDimitry Andric // Get the spelling of the token.
23140b57cec5SDimitry Andric SmallString<32> SpellingBuffer;
23150b57cec5SDimitry Andric SpellingBuffer.resize(Tok.getLength());
23160b57cec5SDimitry Andric
23170b57cec5SDimitry Andric bool StringInvalid = false;
23180b57cec5SDimitry Andric const char *SpellingPtr = &SpellingBuffer[0];
23190b57cec5SDimitry Andric unsigned TokLen = Lexer::getSpelling(Tok, SpellingPtr, SM, Features,
23200b57cec5SDimitry Andric &StringInvalid);
23210b57cec5SDimitry Andric if (StringInvalid)
23220b57cec5SDimitry Andric return 0;
23230b57cec5SDimitry Andric
23240b57cec5SDimitry Andric const char *SpellingStart = SpellingPtr;
23250b57cec5SDimitry Andric const char *SpellingEnd = SpellingPtr+TokLen;
23260b57cec5SDimitry Andric
23270b57cec5SDimitry Andric // Handle UTF-8 strings just like narrow strings.
23280b57cec5SDimitry Andric if (SpellingPtr[0] == 'u' && SpellingPtr[1] == '8')
23290b57cec5SDimitry Andric SpellingPtr += 2;
23300b57cec5SDimitry Andric
23310b57cec5SDimitry Andric assert(SpellingPtr[0] != 'L' && SpellingPtr[0] != 'u' &&
23320b57cec5SDimitry Andric SpellingPtr[0] != 'U' && "Doesn't handle wide or utf strings yet");
23330b57cec5SDimitry Andric
23340b57cec5SDimitry Andric // For raw string literals, this is easy.
23350b57cec5SDimitry Andric if (SpellingPtr[0] == 'R') {
23360b57cec5SDimitry Andric assert(SpellingPtr[1] == '"' && "Should be a raw string literal!");
23370b57cec5SDimitry Andric // Skip 'R"'.
23380b57cec5SDimitry Andric SpellingPtr += 2;
23390b57cec5SDimitry Andric while (*SpellingPtr != '(') {
23400b57cec5SDimitry Andric ++SpellingPtr;
23410b57cec5SDimitry Andric assert(SpellingPtr < SpellingEnd && "Missing ( for raw string literal");
23420b57cec5SDimitry Andric }
23430b57cec5SDimitry Andric // Skip '('.
23440b57cec5SDimitry Andric ++SpellingPtr;
23450b57cec5SDimitry Andric return SpellingPtr - SpellingStart + ByteNo;
23460b57cec5SDimitry Andric }
23470b57cec5SDimitry Andric
23480b57cec5SDimitry Andric // Skip over the leading quote
23490b57cec5SDimitry Andric assert(SpellingPtr[0] == '"' && "Should be a string literal!");
23500b57cec5SDimitry Andric ++SpellingPtr;
23510b57cec5SDimitry Andric
23520b57cec5SDimitry Andric // Skip over bytes until we find the offset we're looking for.
23530b57cec5SDimitry Andric while (ByteNo) {
23540b57cec5SDimitry Andric assert(SpellingPtr < SpellingEnd && "Didn't find byte offset!");
23550b57cec5SDimitry Andric
23560b57cec5SDimitry Andric // Step over non-escapes simply.
23570b57cec5SDimitry Andric if (*SpellingPtr != '\\') {
23580b57cec5SDimitry Andric ++SpellingPtr;
23590b57cec5SDimitry Andric --ByteNo;
23600b57cec5SDimitry Andric continue;
23610b57cec5SDimitry Andric }
23620b57cec5SDimitry Andric
23630b57cec5SDimitry Andric // Otherwise, this is an escape character. Advance over it.
23640b57cec5SDimitry Andric bool HadError = false;
236581ad6265SDimitry Andric if (SpellingPtr[1] == 'u' || SpellingPtr[1] == 'U' ||
236681ad6265SDimitry Andric SpellingPtr[1] == 'N') {
23670b57cec5SDimitry Andric const char *EscapePtr = SpellingPtr;
23680b57cec5SDimitry Andric unsigned Len = MeasureUCNEscape(SpellingStart, SpellingPtr, SpellingEnd,
23690b57cec5SDimitry Andric 1, Features, HadError);
23700b57cec5SDimitry Andric if (Len > ByteNo) {
23710b57cec5SDimitry Andric // ByteNo is somewhere within the escape sequence.
23720b57cec5SDimitry Andric SpellingPtr = EscapePtr;
23730b57cec5SDimitry Andric break;
23740b57cec5SDimitry Andric }
23750b57cec5SDimitry Andric ByteNo -= Len;
23760b57cec5SDimitry Andric } else {
23770b57cec5SDimitry Andric ProcessCharEscape(SpellingStart, SpellingPtr, SpellingEnd, HadError,
237806c3fb27SDimitry Andric FullSourceLoc(Tok.getLocation(), SM), CharByteWidth * 8,
237906c3fb27SDimitry Andric Diags, Features, StringLiteralEvalMethod::Evaluated);
23800b57cec5SDimitry Andric --ByteNo;
23810b57cec5SDimitry Andric }
23820b57cec5SDimitry Andric assert(!HadError && "This method isn't valid on erroneous strings");
23830b57cec5SDimitry Andric }
23840b57cec5SDimitry Andric
23850b57cec5SDimitry Andric return SpellingPtr-SpellingStart;
23860b57cec5SDimitry Andric }
23870b57cec5SDimitry Andric
23880b57cec5SDimitry Andric /// Determine whether a suffix is a valid ud-suffix. We avoid treating reserved
23890b57cec5SDimitry Andric /// suffixes as ud-suffixes, because the diagnostic experience is better if we
23900b57cec5SDimitry Andric /// treat it as an invalid suffix.
isValidUDSuffix(const LangOptions & LangOpts,StringRef Suffix)23910b57cec5SDimitry Andric bool StringLiteralParser::isValidUDSuffix(const LangOptions &LangOpts,
23920b57cec5SDimitry Andric StringRef Suffix) {
23930b57cec5SDimitry Andric return NumericLiteralParser::isValidUDSuffix(LangOpts, Suffix) ||
23940b57cec5SDimitry Andric Suffix == "sv";
23950b57cec5SDimitry Andric }
2396