1 //===- AsmLexer.cpp - Lexer for Assembly Files ----------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This class implements the lexer for assembly files.
10 //
11 //===----------------------------------------------------------------------===//
12
13 #include "llvm/MC/MCParser/AsmLexer.h"
14 #include "llvm/ADT/APInt.h"
15 #include "llvm/ADT/ArrayRef.h"
16 #include "llvm/ADT/StringExtras.h"
17 #include "llvm/ADT/StringRef.h"
18 #include "llvm/ADT/StringSwitch.h"
19 #include "llvm/MC/MCAsmInfo.h"
20 #include "llvm/MC/MCParser/MCAsmLexer.h"
21 #include "llvm/Support/Compiler.h"
22 #include "llvm/Support/SMLoc.h"
23 #include "llvm/Support/SaveAndRestore.h"
24 #include <cassert>
25 #include <cctype>
26 #include <cstdio>
27 #include <cstring>
28 #include <string>
29 #include <tuple>
30 #include <utility>
31
32 using namespace llvm;
33
AsmLexer(const MCAsmInfo & MAI)34 AsmLexer::AsmLexer(const MCAsmInfo &MAI) : MAI(MAI) {
35 AllowAtInIdentifier = !StringRef(MAI.getCommentString()).startswith("@");
36 LexMotorolaIntegers = MAI.shouldUseMotorolaIntegers();
37 }
38
39 AsmLexer::~AsmLexer() = default;
40
setBuffer(StringRef Buf,const char * ptr,bool EndStatementAtEOF)41 void AsmLexer::setBuffer(StringRef Buf, const char *ptr,
42 bool EndStatementAtEOF) {
43 CurBuf = Buf;
44
45 if (ptr)
46 CurPtr = ptr;
47 else
48 CurPtr = CurBuf.begin();
49
50 TokStart = nullptr;
51 this->EndStatementAtEOF = EndStatementAtEOF;
52 }
53
54 /// ReturnError - Set the error to the specified string at the specified
55 /// location. This is defined to always return AsmToken::Error.
ReturnError(const char * Loc,const std::string & Msg)56 AsmToken AsmLexer::ReturnError(const char *Loc, const std::string &Msg) {
57 SetError(SMLoc::getFromPointer(Loc), Msg);
58
59 return AsmToken(AsmToken::Error, StringRef(Loc, CurPtr - Loc));
60 }
61
getNextChar()62 int AsmLexer::getNextChar() {
63 if (CurPtr == CurBuf.end())
64 return EOF;
65 return (unsigned char)*CurPtr++;
66 }
67
peekNextChar()68 int AsmLexer::peekNextChar() {
69 if (CurPtr == CurBuf.end())
70 return EOF;
71 return (unsigned char)*CurPtr;
72 }
73
74 /// The leading integral digit sequence and dot should have already been
75 /// consumed, some or all of the fractional digit sequence *can* have been
76 /// consumed.
LexFloatLiteral()77 AsmToken AsmLexer::LexFloatLiteral() {
78 // Skip the fractional digit sequence.
79 while (isDigit(*CurPtr))
80 ++CurPtr;
81
82 if (*CurPtr == '-' || *CurPtr == '+')
83 return ReturnError(CurPtr, "invalid sign in float literal");
84
85 // Check for exponent
86 if ((*CurPtr == 'e' || *CurPtr == 'E')) {
87 ++CurPtr;
88
89 if (*CurPtr == '-' || *CurPtr == '+')
90 ++CurPtr;
91
92 while (isDigit(*CurPtr))
93 ++CurPtr;
94 }
95
96 return AsmToken(AsmToken::Real,
97 StringRef(TokStart, CurPtr - TokStart));
98 }
99
100 /// LexHexFloatLiteral matches essentially (.[0-9a-fA-F]*)?[pP][+-]?[0-9a-fA-F]+
101 /// while making sure there are enough actual digits around for the constant to
102 /// be valid.
103 ///
104 /// The leading "0x[0-9a-fA-F]*" (i.e. integer part) has already been consumed
105 /// before we get here.
LexHexFloatLiteral(bool NoIntDigits)106 AsmToken AsmLexer::LexHexFloatLiteral(bool NoIntDigits) {
107 assert((*CurPtr == 'p' || *CurPtr == 'P' || *CurPtr == '.') &&
108 "unexpected parse state in floating hex");
109 bool NoFracDigits = true;
110
111 // Skip the fractional part if there is one
112 if (*CurPtr == '.') {
113 ++CurPtr;
114
115 const char *FracStart = CurPtr;
116 while (isHexDigit(*CurPtr))
117 ++CurPtr;
118
119 NoFracDigits = CurPtr == FracStart;
120 }
121
122 if (NoIntDigits && NoFracDigits)
123 return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
124 "expected at least one significand digit");
125
126 // Make sure we do have some kind of proper exponent part
127 if (*CurPtr != 'p' && *CurPtr != 'P')
128 return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
129 "expected exponent part 'p'");
130 ++CurPtr;
131
132 if (*CurPtr == '+' || *CurPtr == '-')
133 ++CurPtr;
134
135 // N.b. exponent digits are *not* hex
136 const char *ExpStart = CurPtr;
137 while (isDigit(*CurPtr))
138 ++CurPtr;
139
140 if (CurPtr == ExpStart)
141 return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
142 "expected at least one exponent digit");
143
144 return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart));
145 }
146
147 /// LexIdentifier: [a-zA-Z_$.@?][a-zA-Z0-9_$.@#?]*
isIdentifierChar(char C,bool AllowAt,bool AllowHash)148 static bool isIdentifierChar(char C, bool AllowAt, bool AllowHash) {
149 return isAlnum(C) || C == '_' || C == '$' || C == '.' || C == '?' ||
150 (AllowAt && C == '@') || (AllowHash && C == '#');
151 }
152
LexIdentifier()153 AsmToken AsmLexer::LexIdentifier() {
154 // Check for floating point literals.
155 if (CurPtr[-1] == '.' && isDigit(*CurPtr)) {
156 // Disambiguate a .1243foo identifier from a floating literal.
157 while (isDigit(*CurPtr))
158 ++CurPtr;
159
160 if (!isIdentifierChar(*CurPtr, AllowAtInIdentifier,
161 AllowHashInIdentifier) ||
162 *CurPtr == 'e' || *CurPtr == 'E')
163 return LexFloatLiteral();
164 }
165
166 while (isIdentifierChar(*CurPtr, AllowAtInIdentifier, AllowHashInIdentifier))
167 ++CurPtr;
168
169 // Handle . as a special case.
170 if (CurPtr == TokStart+1 && TokStart[0] == '.')
171 return AsmToken(AsmToken::Dot, StringRef(TokStart, 1));
172
173 return AsmToken(AsmToken::Identifier, StringRef(TokStart, CurPtr - TokStart));
174 }
175
176 /// LexSlash: Slash: /
177 /// C-Style Comment: /* ... */
178 /// C-style Comment: // ...
LexSlash()179 AsmToken AsmLexer::LexSlash() {
180 if (!MAI.shouldAllowAdditionalComments()) {
181 IsAtStartOfStatement = false;
182 return AsmToken(AsmToken::Slash, StringRef(TokStart, 1));
183 }
184
185 switch (*CurPtr) {
186 case '*':
187 IsAtStartOfStatement = false;
188 break; // C style comment.
189 case '/':
190 ++CurPtr;
191 return LexLineComment();
192 default:
193 IsAtStartOfStatement = false;
194 return AsmToken(AsmToken::Slash, StringRef(TokStart, 1));
195 }
196
197 // C Style comment.
198 ++CurPtr; // skip the star.
199 const char *CommentTextStart = CurPtr;
200 while (CurPtr != CurBuf.end()) {
201 switch (*CurPtr++) {
202 case '*':
203 // End of the comment?
204 if (*CurPtr != '/')
205 break;
206 // If we have a CommentConsumer, notify it about the comment.
207 if (CommentConsumer) {
208 CommentConsumer->HandleComment(
209 SMLoc::getFromPointer(CommentTextStart),
210 StringRef(CommentTextStart, CurPtr - 1 - CommentTextStart));
211 }
212 ++CurPtr; // End the */.
213 return AsmToken(AsmToken::Comment,
214 StringRef(TokStart, CurPtr - TokStart));
215 }
216 }
217 return ReturnError(TokStart, "unterminated comment");
218 }
219
220 /// LexLineComment: Comment: #[^\n]*
221 /// : //[^\n]*
LexLineComment()222 AsmToken AsmLexer::LexLineComment() {
223 // Mark This as an end of statement with a body of the
224 // comment. While it would be nicer to leave this two tokens,
225 // backwards compatability with TargetParsers makes keeping this in this form
226 // better.
227 const char *CommentTextStart = CurPtr;
228 int CurChar = getNextChar();
229 while (CurChar != '\n' && CurChar != '\r' && CurChar != EOF)
230 CurChar = getNextChar();
231 if (CurChar == '\r' && CurPtr != CurBuf.end() && *CurPtr == '\n')
232 ++CurPtr;
233
234 // If we have a CommentConsumer, notify it about the comment.
235 if (CommentConsumer) {
236 CommentConsumer->HandleComment(
237 SMLoc::getFromPointer(CommentTextStart),
238 StringRef(CommentTextStart, CurPtr - 1 - CommentTextStart));
239 }
240
241 IsAtStartOfLine = true;
242 // This is a whole line comment. leave newline
243 if (IsAtStartOfStatement)
244 return AsmToken(AsmToken::EndOfStatement,
245 StringRef(TokStart, CurPtr - TokStart));
246 IsAtStartOfStatement = true;
247
248 return AsmToken(AsmToken::EndOfStatement,
249 StringRef(TokStart, CurPtr - 1 - TokStart));
250 }
251
/// Advance \p CurPtr past an integer type suffix that is accepted but
/// ignored: an optional 'U' followed by at most two 'L's (ULL, UL, U, L, LL).
static void SkipIgnoredIntegerSuffix(const char *&CurPtr) {
  if (*CurPtr == 'U')
    ++CurPtr;

  // At most two consecutive 'L's are skipped.
  for (int Remaining = 2; Remaining > 0 && *CurPtr == 'L'; --Remaining)
    ++CurPtr;
}
261
262 // Look ahead to search for first non-hex digit, if it's [hH], then we treat the
263 // integer as a hexadecimal, possibly with leading zeroes.
doHexLookAhead(const char * & CurPtr,unsigned DefaultRadix,bool LexHex)264 static unsigned doHexLookAhead(const char *&CurPtr, unsigned DefaultRadix,
265 bool LexHex) {
266 const char *FirstNonDec = nullptr;
267 const char *LookAhead = CurPtr;
268 while (true) {
269 if (isDigit(*LookAhead)) {
270 ++LookAhead;
271 } else {
272 if (!FirstNonDec)
273 FirstNonDec = LookAhead;
274
275 // Keep going if we are looking for a 'h' suffix.
276 if (LexHex && isHexDigit(*LookAhead))
277 ++LookAhead;
278 else
279 break;
280 }
281 }
282 bool isHex = LexHex && (*LookAhead == 'h' || *LookAhead == 'H');
283 CurPtr = isHex || !FirstNonDec ? LookAhead : FirstNonDec;
284 if (isHex)
285 return 16;
286 return DefaultRadix;
287 }
288
findLastDigit(const char * CurPtr,unsigned DefaultRadix)289 static const char *findLastDigit(const char *CurPtr, unsigned DefaultRadix) {
290 while (hexDigitValue(*CurPtr) < DefaultRadix) {
291 ++CurPtr;
292 }
293 return CurPtr;
294 }
295
intToken(StringRef Ref,APInt & Value)296 static AsmToken intToken(StringRef Ref, APInt &Value) {
297 if (Value.isIntN(64))
298 return AsmToken(AsmToken::Integer, Ref, Value);
299 return AsmToken(AsmToken::BigNum, Ref, Value);
300 }
301
/// Human-readable name of a numeric base, for use in diagnostics. Bases
/// without a conventional name are rendered as "base-N".
static std::string radixName(unsigned Radix) {
  if (Radix == 2)
    return "binary";
  if (Radix == 8)
    return "octal";
  if (Radix == 10)
    return "decimal";
  if (Radix == 16)
    return "hexadecimal";
  return "base-" + std::to_string(Radix);
}
316
317 /// LexDigit: First character is [0-9].
318 /// Local Label: [0-9][:]
319 /// Forward/Backward Label: [0-9][fb]
320 /// Binary integer: 0b[01]+
321 /// Octal integer: 0[0-7]+
322 /// Hex integer: 0x[0-9a-fA-F]+ or [0x]?[0-9][0-9a-fA-F]*[hH]
323 /// Decimal integer: [1-9][0-9]*
LexDigit()324 AsmToken AsmLexer::LexDigit() {
325 // MASM-flavor binary integer: [01]+[yY] (if DefaultRadix < 16, [bByY])
326 // MASM-flavor octal integer: [0-7]+[oOqQ]
327 // MASM-flavor decimal integer: [0-9]+[tT] (if DefaultRadix < 16, [dDtT])
328 // MASM-flavor hexadecimal integer: [0-9][0-9a-fA-F]*[hH]
329 if (LexMasmIntegers && isdigit(CurPtr[-1])) {
330 const char *FirstNonBinary =
331 (CurPtr[-1] != '0' && CurPtr[-1] != '1') ? CurPtr - 1 : nullptr;
332 const char *FirstNonDecimal =
333 (CurPtr[-1] < '0' || CurPtr[-1] > '9') ? CurPtr - 1 : nullptr;
334 const char *OldCurPtr = CurPtr;
335 while (isHexDigit(*CurPtr)) {
336 switch (*CurPtr) {
337 default:
338 if (!FirstNonDecimal) {
339 FirstNonDecimal = CurPtr;
340 }
341 LLVM_FALLTHROUGH;
342 case '9':
343 case '8':
344 case '7':
345 case '6':
346 case '5':
347 case '4':
348 case '3':
349 case '2':
350 if (!FirstNonBinary) {
351 FirstNonBinary = CurPtr;
352 }
353 break;
354 case '1':
355 case '0':
356 break;
357 }
358 ++CurPtr;
359 }
360 if (*CurPtr == '.') {
361 // MASM float literals (other than hex floats) always contain a ".", and
362 // are always written in decimal.
363 ++CurPtr;
364 return LexFloatLiteral();
365 }
366
367 if (LexMasmHexFloats && (*CurPtr == 'r' || *CurPtr == 'R')) {
368 ++CurPtr;
369 return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart));
370 }
371
372 unsigned Radix = 0;
373 if (*CurPtr == 'h' || *CurPtr == 'H') {
374 // hexadecimal number
375 ++CurPtr;
376 Radix = 16;
377 } else if (*CurPtr == 't' || *CurPtr == 'T') {
378 // decimal number
379 ++CurPtr;
380 Radix = 10;
381 } else if (*CurPtr == 'o' || *CurPtr == 'O' || *CurPtr == 'q' ||
382 *CurPtr == 'Q') {
383 // octal number
384 ++CurPtr;
385 Radix = 8;
386 } else if (*CurPtr == 'y' || *CurPtr == 'Y') {
387 // binary number
388 ++CurPtr;
389 Radix = 2;
390 } else if (FirstNonDecimal && FirstNonDecimal + 1 == CurPtr &&
391 DefaultRadix < 14 &&
392 (*FirstNonDecimal == 'd' || *FirstNonDecimal == 'D')) {
393 Radix = 10;
394 } else if (FirstNonBinary && FirstNonBinary + 1 == CurPtr &&
395 DefaultRadix < 12 &&
396 (*FirstNonBinary == 'b' || *FirstNonBinary == 'B')) {
397 Radix = 2;
398 }
399
400 if (Radix) {
401 StringRef Result(TokStart, CurPtr - TokStart);
402 APInt Value(128, 0, true);
403
404 if (Result.drop_back().getAsInteger(Radix, Value))
405 return ReturnError(TokStart, "invalid " + radixName(Radix) + " number");
406
407 // MSVC accepts and ignores type suffices on integer literals.
408 SkipIgnoredIntegerSuffix(CurPtr);
409
410 return intToken(Result, Value);
411 }
412
413 // default-radix integers, or floating point numbers, fall through
414 CurPtr = OldCurPtr;
415 }
416
417 // MASM default-radix integers: [0-9a-fA-F]+
418 // (All other integer literals have a radix specifier.)
419 if (LexMasmIntegers && UseMasmDefaultRadix) {
420 CurPtr = findLastDigit(CurPtr, 16);
421 StringRef Result(TokStart, CurPtr - TokStart);
422
423 APInt Value(128, 0, true);
424 if (Result.getAsInteger(DefaultRadix, Value)) {
425 return ReturnError(TokStart,
426 "invalid " + radixName(DefaultRadix) + " number");
427 }
428
429 return intToken(Result, Value);
430 }
431
432 // Motorola hex integers: $[0-9a-fA-F]+
433 if (LexMotorolaIntegers && CurPtr[-1] == '$') {
434 const char *NumStart = CurPtr;
435 while (isHexDigit(CurPtr[0]))
436 ++CurPtr;
437
438 APInt Result(128, 0);
439 if (StringRef(NumStart, CurPtr - NumStart).getAsInteger(16, Result))
440 return ReturnError(TokStart, "invalid hexadecimal number");
441
442 return intToken(StringRef(TokStart, CurPtr - TokStart), Result);
443 }
444
445 // Motorola binary integers: %[01]+
446 if (LexMotorolaIntegers && CurPtr[-1] == '%') {
447 const char *NumStart = CurPtr;
448 while (*CurPtr == '0' || *CurPtr == '1')
449 ++CurPtr;
450
451 APInt Result(128, 0);
452 if (StringRef(NumStart, CurPtr - NumStart).getAsInteger(2, Result))
453 return ReturnError(TokStart, "invalid binary number");
454
455 return intToken(StringRef(TokStart, CurPtr - TokStart), Result);
456 }
457
458 // Decimal integer: [1-9][0-9]*
459 // HLASM-flavour decimal integer: [0-9][0-9]*
460 // FIXME: Later on, support for fb for HLASM has to be added in
461 // as they probably would be needed for asm goto
462 if (LexHLASMIntegers || CurPtr[-1] != '0' || CurPtr[0] == '.') {
463 unsigned Radix = doHexLookAhead(CurPtr, 10, LexMasmIntegers);
464
465 if (!LexHLASMIntegers) {
466 bool IsHex = Radix == 16;
467 // Check for floating point literals.
468 if (!IsHex && (*CurPtr == '.' || *CurPtr == 'e' || *CurPtr == 'E')) {
469 if (*CurPtr == '.')
470 ++CurPtr;
471 return LexFloatLiteral();
472 }
473 }
474
475 StringRef Result(TokStart, CurPtr - TokStart);
476
477 APInt Value(128, 0, true);
478 if (Result.getAsInteger(Radix, Value))
479 return ReturnError(TokStart, "invalid " + radixName(Radix) + " number");
480
481 if (!LexHLASMIntegers)
482 // The darwin/x86 (and x86-64) assembler accepts and ignores type
483 // suffices on integer literals.
484 SkipIgnoredIntegerSuffix(CurPtr);
485
486 return intToken(Result, Value);
487 }
488
489 if (!LexMasmIntegers && ((*CurPtr == 'b') || (*CurPtr == 'B'))) {
490 ++CurPtr;
491 // See if we actually have "0b" as part of something like "jmp 0b\n"
492 if (!isDigit(CurPtr[0])) {
493 --CurPtr;
494 StringRef Result(TokStart, CurPtr - TokStart);
495 return AsmToken(AsmToken::Integer, Result, 0);
496 }
497 const char *NumStart = CurPtr;
498 while (CurPtr[0] == '0' || CurPtr[0] == '1')
499 ++CurPtr;
500
501 // Requires at least one binary digit.
502 if (CurPtr == NumStart)
503 return ReturnError(TokStart, "invalid binary number");
504
505 StringRef Result(TokStart, CurPtr - TokStart);
506
507 APInt Value(128, 0, true);
508 if (Result.substr(2).getAsInteger(2, Value))
509 return ReturnError(TokStart, "invalid binary number");
510
511 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
512 // suffixes on integer literals.
513 SkipIgnoredIntegerSuffix(CurPtr);
514
515 return intToken(Result, Value);
516 }
517
518 if ((*CurPtr == 'x') || (*CurPtr == 'X')) {
519 ++CurPtr;
520 const char *NumStart = CurPtr;
521 while (isHexDigit(CurPtr[0]))
522 ++CurPtr;
523
524 // "0x.0p0" is valid, and "0x0p0" (but not "0xp0" for example, which will be
525 // diagnosed by LexHexFloatLiteral).
526 if (CurPtr[0] == '.' || CurPtr[0] == 'p' || CurPtr[0] == 'P')
527 return LexHexFloatLiteral(NumStart == CurPtr);
528
529 // Otherwise requires at least one hex digit.
530 if (CurPtr == NumStart)
531 return ReturnError(CurPtr-2, "invalid hexadecimal number");
532
533 APInt Result(128, 0);
534 if (StringRef(TokStart, CurPtr - TokStart).getAsInteger(0, Result))
535 return ReturnError(TokStart, "invalid hexadecimal number");
536
537 // Consume the optional [hH].
538 if (LexMasmIntegers && (*CurPtr == 'h' || *CurPtr == 'H'))
539 ++CurPtr;
540
541 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
542 // suffixes on integer literals.
543 SkipIgnoredIntegerSuffix(CurPtr);
544
545 return intToken(StringRef(TokStart, CurPtr - TokStart), Result);
546 }
547
548 // Either octal or hexadecimal.
549 APInt Value(128, 0, true);
550 unsigned Radix = doHexLookAhead(CurPtr, 8, LexMasmIntegers);
551 StringRef Result(TokStart, CurPtr - TokStart);
552 if (Result.getAsInteger(Radix, Value))
553 return ReturnError(TokStart, "invalid " + radixName(Radix) + " number");
554
555 // Consume the [hH].
556 if (Radix == 16)
557 ++CurPtr;
558
559 // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
560 // suffixes on integer literals.
561 SkipIgnoredIntegerSuffix(CurPtr);
562
563 return intToken(Result, Value);
564 }
565
566 /// LexSingleQuote: Integer: 'b'
LexSingleQuote()567 AsmToken AsmLexer::LexSingleQuote() {
568 int CurChar = getNextChar();
569
570 if (LexHLASMStrings)
571 return ReturnError(TokStart, "invalid usage of character literals");
572
573 if (LexMasmStrings) {
574 while (CurChar != EOF) {
575 if (CurChar != '\'') {
576 CurChar = getNextChar();
577 } else if (peekNextChar() == '\'') {
578 // In MASM single-quote strings, doubled single-quotes mean an escaped
579 // single quote, so should be lexed in.
580 getNextChar();
581 CurChar = getNextChar();
582 } else {
583 break;
584 }
585 }
586 if (CurChar == EOF)
587 return ReturnError(TokStart, "unterminated string constant");
588 return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
589 }
590
591 if (CurChar == '\\')
592 CurChar = getNextChar();
593
594 if (CurChar == EOF)
595 return ReturnError(TokStart, "unterminated single quote");
596
597 CurChar = getNextChar();
598
599 if (CurChar != '\'')
600 return ReturnError(TokStart, "single quote way too long");
601
602 // The idea here being that 'c' is basically just an integral
603 // constant.
604 StringRef Res = StringRef(TokStart,CurPtr - TokStart);
605 long long Value;
606
607 if (Res.startswith("\'\\")) {
608 char theChar = Res[2];
609 switch (theChar) {
610 default: Value = theChar; break;
611 case '\'': Value = '\''; break;
612 case 't': Value = '\t'; break;
613 case 'n': Value = '\n'; break;
614 case 'b': Value = '\b'; break;
615 case 'f': Value = '\f'; break;
616 case 'r': Value = '\r'; break;
617 }
618 } else
619 Value = TokStart[1];
620
621 return AsmToken(AsmToken::Integer, Res, Value);
622 }
623
624 /// LexQuote: String: "..."
LexQuote()625 AsmToken AsmLexer::LexQuote() {
626 int CurChar = getNextChar();
627 if (LexHLASMStrings)
628 return ReturnError(TokStart, "invalid usage of string literals");
629
630 if (LexMasmStrings) {
631 while (CurChar != EOF) {
632 if (CurChar != '"') {
633 CurChar = getNextChar();
634 } else if (peekNextChar() == '"') {
635 // In MASM double-quoted strings, doubled double-quotes mean an escaped
636 // double quote, so should be lexed in.
637 getNextChar();
638 CurChar = getNextChar();
639 } else {
640 break;
641 }
642 }
643 if (CurChar == EOF)
644 return ReturnError(TokStart, "unterminated string constant");
645 return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
646 }
647
648 // TODO: does gas allow multiline string constants?
649 while (CurChar != '"') {
650 if (CurChar == '\\') {
651 // Allow \", etc.
652 CurChar = getNextChar();
653 }
654
655 if (CurChar == EOF)
656 return ReturnError(TokStart, "unterminated string constant");
657
658 CurChar = getNextChar();
659 }
660
661 return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
662 }
663
LexUntilEndOfStatement()664 StringRef AsmLexer::LexUntilEndOfStatement() {
665 TokStart = CurPtr;
666
667 while (!isAtStartOfComment(CurPtr) && // Start of line comment.
668 !isAtStatementSeparator(CurPtr) && // End of statement marker.
669 *CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) {
670 ++CurPtr;
671 }
672 return StringRef(TokStart, CurPtr-TokStart);
673 }
674
LexUntilEndOfLine()675 StringRef AsmLexer::LexUntilEndOfLine() {
676 TokStart = CurPtr;
677
678 while (*CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) {
679 ++CurPtr;
680 }
681 return StringRef(TokStart, CurPtr-TokStart);
682 }
683
peekTokens(MutableArrayRef<AsmToken> Buf,bool ShouldSkipSpace)684 size_t AsmLexer::peekTokens(MutableArrayRef<AsmToken> Buf,
685 bool ShouldSkipSpace) {
686 SaveAndRestore<const char *> SavedTokenStart(TokStart);
687 SaveAndRestore<const char *> SavedCurPtr(CurPtr);
688 SaveAndRestore<bool> SavedAtStartOfLine(IsAtStartOfLine);
689 SaveAndRestore<bool> SavedAtStartOfStatement(IsAtStartOfStatement);
690 SaveAndRestore<bool> SavedSkipSpace(SkipSpace, ShouldSkipSpace);
691 SaveAndRestore<bool> SavedIsPeeking(IsPeeking, true);
692 std::string SavedErr = getErr();
693 SMLoc SavedErrLoc = getErrLoc();
694
695 size_t ReadCount;
696 for (ReadCount = 0; ReadCount < Buf.size(); ++ReadCount) {
697 AsmToken Token = LexToken();
698
699 Buf[ReadCount] = Token;
700
701 if (Token.is(AsmToken::Eof))
702 break;
703 }
704
705 SetError(SavedErrLoc, SavedErr);
706 return ReadCount;
707 }
708
isAtStartOfComment(const char * Ptr)709 bool AsmLexer::isAtStartOfComment(const char *Ptr) {
710 if (MAI.getRestrictCommentStringToStartOfStatement() && !IsAtStartOfStatement)
711 return false;
712
713 StringRef CommentString = MAI.getCommentString();
714
715 if (CommentString.size() == 1)
716 return CommentString[0] == Ptr[0];
717
718 // Allow # preprocessor commments also be counted as comments for "##" cases
719 if (CommentString[1] == '#')
720 return CommentString[0] == Ptr[0];
721
722 return strncmp(Ptr, CommentString.data(), CommentString.size()) == 0;
723 }
724
isAtStatementSeparator(const char * Ptr)725 bool AsmLexer::isAtStatementSeparator(const char *Ptr) {
726 return strncmp(Ptr, MAI.getSeparatorString(),
727 strlen(MAI.getSeparatorString())) == 0;
728 }
729
LexToken()730 AsmToken AsmLexer::LexToken() {
731 TokStart = CurPtr;
732 // This always consumes at least one character.
733 int CurChar = getNextChar();
734
735 if (!IsPeeking && CurChar == '#' && IsAtStartOfStatement) {
736 // If this starts with a '#', this may be a cpp
737 // hash directive and otherwise a line comment.
738 AsmToken TokenBuf[2];
739 MutableArrayRef<AsmToken> Buf(TokenBuf, 2);
740 size_t num = peekTokens(Buf, true);
741 // There cannot be a space preceding this
742 if (IsAtStartOfLine && num == 2 && TokenBuf[0].is(AsmToken::Integer) &&
743 TokenBuf[1].is(AsmToken::String)) {
744 CurPtr = TokStart; // reset curPtr;
745 StringRef s = LexUntilEndOfLine();
746 UnLex(TokenBuf[1]);
747 UnLex(TokenBuf[0]);
748 return AsmToken(AsmToken::HashDirective, s);
749 }
750
751 if (MAI.shouldAllowAdditionalComments())
752 return LexLineComment();
753 }
754
755 if (isAtStartOfComment(TokStart))
756 return LexLineComment();
757
758 if (isAtStatementSeparator(TokStart)) {
759 CurPtr += strlen(MAI.getSeparatorString()) - 1;
760 IsAtStartOfLine = true;
761 IsAtStartOfStatement = true;
762 return AsmToken(AsmToken::EndOfStatement,
763 StringRef(TokStart, strlen(MAI.getSeparatorString())));
764 }
765
766 // If we're missing a newline at EOF, make sure we still get an
767 // EndOfStatement token before the Eof token.
768 if (CurChar == EOF && !IsAtStartOfStatement && EndStatementAtEOF) {
769 IsAtStartOfLine = true;
770 IsAtStartOfStatement = true;
771 return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 0));
772 }
773 IsAtStartOfLine = false;
774 bool OldIsAtStartOfStatement = IsAtStartOfStatement;
775 IsAtStartOfStatement = false;
776 switch (CurChar) {
777 default:
778 // Handle identifier: [a-zA-Z_.?][a-zA-Z0-9_$.@#?]*
779 if (isalpha(CurChar) || CurChar == '_' || CurChar == '.' ||
780 (MAI.doesAllowQuestionAtStartOfIdentifier() && CurChar == '?'))
781 return LexIdentifier();
782
783 // Unknown character, emit an error.
784 return ReturnError(TokStart, "invalid character in input");
785 case EOF:
786 if (EndStatementAtEOF) {
787 IsAtStartOfLine = true;
788 IsAtStartOfStatement = true;
789 }
790 return AsmToken(AsmToken::Eof, StringRef(TokStart, 0));
791 case 0:
792 case ' ':
793 case '\t':
794 IsAtStartOfStatement = OldIsAtStartOfStatement;
795 while (*CurPtr == ' ' || *CurPtr == '\t')
796 CurPtr++;
797 if (SkipSpace)
798 return LexToken(); // Ignore whitespace.
799 else
800 return AsmToken(AsmToken::Space, StringRef(TokStart, CurPtr - TokStart));
801 case '\r': {
802 IsAtStartOfLine = true;
803 IsAtStartOfStatement = true;
804 // If this is a CR followed by LF, treat that as one token.
805 if (CurPtr != CurBuf.end() && *CurPtr == '\n')
806 ++CurPtr;
807 return AsmToken(AsmToken::EndOfStatement,
808 StringRef(TokStart, CurPtr - TokStart));
809 }
810 case '\n':
811 IsAtStartOfLine = true;
812 IsAtStartOfStatement = true;
813 return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1));
814 case ':': return AsmToken(AsmToken::Colon, StringRef(TokStart, 1));
815 case '+': return AsmToken(AsmToken::Plus, StringRef(TokStart, 1));
816 case '~': return AsmToken(AsmToken::Tilde, StringRef(TokStart, 1));
817 case '(': return AsmToken(AsmToken::LParen, StringRef(TokStart, 1));
818 case ')': return AsmToken(AsmToken::RParen, StringRef(TokStart, 1));
819 case '[': return AsmToken(AsmToken::LBrac, StringRef(TokStart, 1));
820 case ']': return AsmToken(AsmToken::RBrac, StringRef(TokStart, 1));
821 case '{': return AsmToken(AsmToken::LCurly, StringRef(TokStart, 1));
822 case '}': return AsmToken(AsmToken::RCurly, StringRef(TokStart, 1));
823 case '*': return AsmToken(AsmToken::Star, StringRef(TokStart, 1));
824 case ',': return AsmToken(AsmToken::Comma, StringRef(TokStart, 1));
825 case '$': {
826 if (LexMotorolaIntegers && isHexDigit(*CurPtr))
827 return LexDigit();
828 if (MAI.doesAllowDollarAtStartOfIdentifier())
829 return LexIdentifier();
830 return AsmToken(AsmToken::Dollar, StringRef(TokStart, 1));
831 }
832 case '@': {
833 if (MAI.doesAllowAtAtStartOfIdentifier())
834 return LexIdentifier();
835 return AsmToken(AsmToken::At, StringRef(TokStart, 1));
836 }
837 case '\\': return AsmToken(AsmToken::BackSlash, StringRef(TokStart, 1));
838 case '=':
839 if (*CurPtr == '=') {
840 ++CurPtr;
841 return AsmToken(AsmToken::EqualEqual, StringRef(TokStart, 2));
842 }
843 return AsmToken(AsmToken::Equal, StringRef(TokStart, 1));
844 case '-':
845 if (*CurPtr == '>') {
846 ++CurPtr;
847 return AsmToken(AsmToken::MinusGreater, StringRef(TokStart, 2));
848 }
849 return AsmToken(AsmToken::Minus, StringRef(TokStart, 1));
850 case '|':
851 if (*CurPtr == '|') {
852 ++CurPtr;
853 return AsmToken(AsmToken::PipePipe, StringRef(TokStart, 2));
854 }
855 return AsmToken(AsmToken::Pipe, StringRef(TokStart, 1));
856 case '^': return AsmToken(AsmToken::Caret, StringRef(TokStart, 1));
857 case '&':
858 if (*CurPtr == '&') {
859 ++CurPtr;
860 return AsmToken(AsmToken::AmpAmp, StringRef(TokStart, 2));
861 }
862 return AsmToken(AsmToken::Amp, StringRef(TokStart, 1));
863 case '!':
864 if (*CurPtr == '=') {
865 ++CurPtr;
866 return AsmToken(AsmToken::ExclaimEqual, StringRef(TokStart, 2));
867 }
868 return AsmToken(AsmToken::Exclaim, StringRef(TokStart, 1));
869 case '%':
870 if (LexMotorolaIntegers && (*CurPtr == '0' || *CurPtr == '1')) {
871 return LexDigit();
872 }
873
874 if (MAI.hasMipsExpressions()) {
875 AsmToken::TokenKind Operator;
876 unsigned OperatorLength;
877
878 std::tie(Operator, OperatorLength) =
879 StringSwitch<std::pair<AsmToken::TokenKind, unsigned>>(
880 StringRef(CurPtr))
881 .StartsWith("call16", {AsmToken::PercentCall16, 7})
882 .StartsWith("call_hi", {AsmToken::PercentCall_Hi, 8})
883 .StartsWith("call_lo", {AsmToken::PercentCall_Lo, 8})
884 .StartsWith("dtprel_hi", {AsmToken::PercentDtprel_Hi, 10})
885 .StartsWith("dtprel_lo", {AsmToken::PercentDtprel_Lo, 10})
886 .StartsWith("got_disp", {AsmToken::PercentGot_Disp, 9})
887 .StartsWith("got_hi", {AsmToken::PercentGot_Hi, 7})
888 .StartsWith("got_lo", {AsmToken::PercentGot_Lo, 7})
889 .StartsWith("got_ofst", {AsmToken::PercentGot_Ofst, 9})
890 .StartsWith("got_page", {AsmToken::PercentGot_Page, 9})
891 .StartsWith("gottprel", {AsmToken::PercentGottprel, 9})
892 .StartsWith("got", {AsmToken::PercentGot, 4})
893 .StartsWith("gp_rel", {AsmToken::PercentGp_Rel, 7})
894 .StartsWith("higher", {AsmToken::PercentHigher, 7})
895 .StartsWith("highest", {AsmToken::PercentHighest, 8})
896 .StartsWith("hi", {AsmToken::PercentHi, 3})
897 .StartsWith("lo", {AsmToken::PercentLo, 3})
898 .StartsWith("neg", {AsmToken::PercentNeg, 4})
899 .StartsWith("pcrel_hi", {AsmToken::PercentPcrel_Hi, 9})
900 .StartsWith("pcrel_lo", {AsmToken::PercentPcrel_Lo, 9})
901 .StartsWith("tlsgd", {AsmToken::PercentTlsgd, 6})
902 .StartsWith("tlsldm", {AsmToken::PercentTlsldm, 7})
903 .StartsWith("tprel_hi", {AsmToken::PercentTprel_Hi, 9})
904 .StartsWith("tprel_lo", {AsmToken::PercentTprel_Lo, 9})
905 .Default({AsmToken::Percent, 1});
906
907 if (Operator != AsmToken::Percent) {
908 CurPtr += OperatorLength - 1;
909 return AsmToken(Operator, StringRef(TokStart, OperatorLength));
910 }
911 }
912 return AsmToken(AsmToken::Percent, StringRef(TokStart, 1));
913 case '/':
914 IsAtStartOfStatement = OldIsAtStartOfStatement;
915 return LexSlash();
916 case '#': {
917 if (MAI.doesAllowHashAtStartOfIdentifier())
918 return LexIdentifier();
919 return AsmToken(AsmToken::Hash, StringRef(TokStart, 1));
920 }
921 case '\'': return LexSingleQuote();
922 case '"': return LexQuote();
923 case '0': case '1': case '2': case '3': case '4':
924 case '5': case '6': case '7': case '8': case '9':
925 return LexDigit();
926 case '<':
927 switch (*CurPtr) {
928 case '<':
929 ++CurPtr;
930 return AsmToken(AsmToken::LessLess, StringRef(TokStart, 2));
931 case '=':
932 ++CurPtr;
933 return AsmToken(AsmToken::LessEqual, StringRef(TokStart, 2));
934 case '>':
935 ++CurPtr;
936 return AsmToken(AsmToken::LessGreater, StringRef(TokStart, 2));
937 default:
938 return AsmToken(AsmToken::Less, StringRef(TokStart, 1));
939 }
940 case '>':
941 switch (*CurPtr) {
942 case '>':
943 ++CurPtr;
944 return AsmToken(AsmToken::GreaterGreater, StringRef(TokStart, 2));
945 case '=':
946 ++CurPtr;
947 return AsmToken(AsmToken::GreaterEqual, StringRef(TokStart, 2));
948 default:
949 return AsmToken(AsmToken::Greater, StringRef(TokStart, 1));
950 }
951
952 // TODO: Quoted identifiers (objc methods etc)
953 // local labels: [0-9][:]
954 // Forward/backward labels: [0-9][fb]
955 // Integers, fp constants, character constants.
956 }
957 }
958