1 //===- AsmLexer.cpp - Lexer for Assembly Files ----------------------------===//
2 //
3 // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4 // See https://llvm.org/LICENSE.txt for license information.
5 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6 //
7 //===----------------------------------------------------------------------===//
8 //
9 // This class implements the lexer for assembly files.
10 //
11 //===----------------------------------------------------------------------===//
12 
13 #include "llvm/MC/MCParser/AsmLexer.h"
14 #include "llvm/ADT/APInt.h"
15 #include "llvm/ADT/ArrayRef.h"
16 #include "llvm/ADT/StringExtras.h"
17 #include "llvm/ADT/StringRef.h"
18 #include "llvm/ADT/StringSwitch.h"
19 #include "llvm/MC/MCAsmInfo.h"
20 #include "llvm/MC/MCParser/MCAsmLexer.h"
21 #include "llvm/Support/Compiler.h"
22 #include "llvm/Support/SMLoc.h"
23 #include "llvm/Support/SaveAndRestore.h"
24 #include <cassert>
25 #include <cctype>
26 #include <cstdio>
27 #include <cstring>
28 #include <string>
29 #include <tuple>
30 #include <utility>
31 
32 using namespace llvm;
33 
34 AsmLexer::AsmLexer(const MCAsmInfo &MAI) : MAI(MAI) {
35   AllowAtInIdentifier = !StringRef(MAI.getCommentString()).startswith("@");
36   LexMotorolaIntegers = MAI.shouldUseMotorolaIntegers();
37 }
38 
// Defaulted out of line: nothing is done here beyond the implicit destruction
// of members and bases.
AsmLexer::~AsmLexer() = default;
40 
41 void AsmLexer::setBuffer(StringRef Buf, const char *ptr,
42                          bool EndStatementAtEOF) {
43   CurBuf = Buf;
44 
45   if (ptr)
46     CurPtr = ptr;
47   else
48     CurPtr = CurBuf.begin();
49 
50   TokStart = nullptr;
51   this->EndStatementAtEOF = EndStatementAtEOF;
52 }
53 
54 /// ReturnError - Set the error to the specified string at the specified
55 /// location.  This is defined to always return AsmToken::Error.
56 AsmToken AsmLexer::ReturnError(const char *Loc, const std::string &Msg) {
57   SetError(SMLoc::getFromPointer(Loc), Msg);
58 
59   return AsmToken(AsmToken::Error, StringRef(Loc, CurPtr - Loc));
60 }
61 
62 int AsmLexer::getNextChar() {
63   if (CurPtr == CurBuf.end())
64     return EOF;
65   return (unsigned char)*CurPtr++;
66 }
67 
68 int AsmLexer::peekNextChar() {
69   if (CurPtr == CurBuf.end())
70     return EOF;
71   return (unsigned char)*CurPtr;
72 }
73 
74 /// The leading integral digit sequence and dot should have already been
75 /// consumed, some or all of the fractional digit sequence *can* have been
76 /// consumed.
77 AsmToken AsmLexer::LexFloatLiteral() {
78   // Skip the fractional digit sequence.
79   while (isDigit(*CurPtr))
80     ++CurPtr;
81 
82   if (*CurPtr == '-' || *CurPtr == '+')
83     return ReturnError(CurPtr, "invalid sign in float literal");
84 
85   // Check for exponent
86   if ((*CurPtr == 'e' || *CurPtr == 'E')) {
87     ++CurPtr;
88 
89     if (*CurPtr == '-' || *CurPtr == '+')
90       ++CurPtr;
91 
92     while (isDigit(*CurPtr))
93       ++CurPtr;
94   }
95 
96   return AsmToken(AsmToken::Real,
97                   StringRef(TokStart, CurPtr - TokStart));
98 }
99 
100 /// LexHexFloatLiteral matches essentially (.[0-9a-fA-F]*)?[pP][+-]?[0-9a-fA-F]+
101 /// while making sure there are enough actual digits around for the constant to
102 /// be valid.
103 ///
104 /// The leading "0x[0-9a-fA-F]*" (i.e. integer part) has already been consumed
105 /// before we get here.
106 AsmToken AsmLexer::LexHexFloatLiteral(bool NoIntDigits) {
107   assert((*CurPtr == 'p' || *CurPtr == 'P' || *CurPtr == '.') &&
108          "unexpected parse state in floating hex");
109   bool NoFracDigits = true;
110 
111   // Skip the fractional part if there is one
112   if (*CurPtr == '.') {
113     ++CurPtr;
114 
115     const char *FracStart = CurPtr;
116     while (isHexDigit(*CurPtr))
117       ++CurPtr;
118 
119     NoFracDigits = CurPtr == FracStart;
120   }
121 
122   if (NoIntDigits && NoFracDigits)
123     return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
124                                  "expected at least one significand digit");
125 
126   // Make sure we do have some kind of proper exponent part
127   if (*CurPtr != 'p' && *CurPtr != 'P')
128     return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
129                                  "expected exponent part 'p'");
130   ++CurPtr;
131 
132   if (*CurPtr == '+' || *CurPtr == '-')
133     ++CurPtr;
134 
135   // N.b. exponent digits are *not* hex
136   const char *ExpStart = CurPtr;
137   while (isDigit(*CurPtr))
138     ++CurPtr;
139 
140   if (CurPtr == ExpStart)
141     return ReturnError(TokStart, "invalid hexadecimal floating-point constant: "
142                                  "expected at least one exponent digit");
143 
144   return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart));
145 }
146 
147 /// LexIdentifier: [a-zA-Z_$.@?][a-zA-Z0-9_$.@#?]*
148 static bool isIdentifierChar(char C, bool AllowAt, bool AllowHash) {
149   return isAlnum(C) || C == '_' || C == '$' || C == '.' || C == '?' ||
150          (AllowAt && C == '@') || (AllowHash && C == '#');
151 }
152 
153 AsmToken AsmLexer::LexIdentifier() {
154   // Check for floating point literals.
155   if (CurPtr[-1] == '.' && isDigit(*CurPtr)) {
156     // Disambiguate a .1243foo identifier from a floating literal.
157     while (isDigit(*CurPtr))
158       ++CurPtr;
159 
160     if (!isIdentifierChar(*CurPtr, AllowAtInIdentifier,
161                           AllowHashInIdentifier) ||
162         *CurPtr == 'e' || *CurPtr == 'E')
163       return LexFloatLiteral();
164   }
165 
166   while (isIdentifierChar(*CurPtr, AllowAtInIdentifier, AllowHashInIdentifier))
167     ++CurPtr;
168 
169   // Handle . as a special case.
170   if (CurPtr == TokStart+1 && TokStart[0] == '.')
171     return AsmToken(AsmToken::Dot, StringRef(TokStart, 1));
172 
173   return AsmToken(AsmToken::Identifier, StringRef(TokStart, CurPtr - TokStart));
174 }
175 
176 /// LexSlash: Slash: /
177 ///           C-Style Comment: /* ... */
178 ///           C-style Comment: // ...
179 AsmToken AsmLexer::LexSlash() {
180   if (!MAI.shouldAllowAdditionalComments()) {
181     IsAtStartOfStatement = false;
182     return AsmToken(AsmToken::Slash, StringRef(TokStart, 1));
183   }
184 
185   switch (*CurPtr) {
186   case '*':
187     IsAtStartOfStatement = false;
188     break; // C style comment.
189   case '/':
190     ++CurPtr;
191     return LexLineComment();
192   default:
193     IsAtStartOfStatement = false;
194     return AsmToken(AsmToken::Slash, StringRef(TokStart, 1));
195   }
196 
197   // C Style comment.
198   ++CurPtr;  // skip the star.
199   const char *CommentTextStart = CurPtr;
200   while (CurPtr != CurBuf.end()) {
201     switch (*CurPtr++) {
202     case '*':
203       // End of the comment?
204       if (*CurPtr != '/')
205         break;
206       // If we have a CommentConsumer, notify it about the comment.
207       if (CommentConsumer) {
208         CommentConsumer->HandleComment(
209             SMLoc::getFromPointer(CommentTextStart),
210             StringRef(CommentTextStart, CurPtr - 1 - CommentTextStart));
211       }
212       ++CurPtr;   // End the */.
213       return AsmToken(AsmToken::Comment,
214                       StringRef(TokStart, CurPtr - TokStart));
215     }
216   }
217   return ReturnError(TokStart, "unterminated comment");
218 }
219 
220 /// LexLineComment: Comment: #[^\n]*
221 ///                        : //[^\n]*
222 AsmToken AsmLexer::LexLineComment() {
223   // Mark This as an end of statement with a body of the
224   // comment. While it would be nicer to leave this two tokens,
225   // backwards compatability with TargetParsers makes keeping this in this form
226   // better.
227   const char *CommentTextStart = CurPtr;
228   int CurChar = getNextChar();
229   while (CurChar != '\n' && CurChar != '\r' && CurChar != EOF)
230     CurChar = getNextChar();
231   if (CurChar == '\r' && CurPtr != CurBuf.end() && *CurPtr == '\n')
232     ++CurPtr;
233 
234   // If we have a CommentConsumer, notify it about the comment.
235   if (CommentConsumer) {
236     CommentConsumer->HandleComment(
237         SMLoc::getFromPointer(CommentTextStart),
238         StringRef(CommentTextStart, CurPtr - 1 - CommentTextStart));
239   }
240 
241   IsAtStartOfLine = true;
242   // This is a whole line comment. leave newline
243   if (IsAtStartOfStatement)
244     return AsmToken(AsmToken::EndOfStatement,
245                     StringRef(TokStart, CurPtr - TokStart));
246   IsAtStartOfStatement = true;
247 
248   return AsmToken(AsmToken::EndOfStatement,
249                   StringRef(TokStart, CurPtr - 1 - TokStart));
250 }
251 
/// Advance \p CurPtr past an integer type suffix: an optional 'U' followed by
/// at most two 'L' characters (so ULL, UL, U, L and LL are all skipped).
static void SkipIgnoredIntegerSuffix(const char *&CurPtr) {
  if (*CurPtr == 'U')
    ++CurPtr;

  // Up to two consecutive 'L's.
  for (int Seen = 0; Seen < 2 && *CurPtr == 'L'; ++Seen)
    ++CurPtr;
}
261 
262 // Look ahead to search for first non-hex digit, if it's [hH], then we treat the
263 // integer as a hexadecimal, possibly with leading zeroes.
264 static unsigned doHexLookAhead(const char *&CurPtr, unsigned DefaultRadix,
265                                bool LexHex) {
266   const char *FirstNonDec = nullptr;
267   const char *LookAhead = CurPtr;
268   while (true) {
269     if (isDigit(*LookAhead)) {
270       ++LookAhead;
271     } else {
272       if (!FirstNonDec)
273         FirstNonDec = LookAhead;
274 
275       // Keep going if we are looking for a 'h' suffix.
276       if (LexHex && isHexDigit(*LookAhead))
277         ++LookAhead;
278       else
279         break;
280     }
281   }
282   bool isHex = LexHex && (*LookAhead == 'h' || *LookAhead == 'H');
283   CurPtr = isHex || !FirstNonDec ? LookAhead : FirstNonDec;
284   if (isHex)
285     return 16;
286   return DefaultRadix;
287 }
288 
289 static const char *findLastDigit(const char *CurPtr, unsigned DefaultRadix) {
290   while (hexDigitValue(*CurPtr) < DefaultRadix) {
291     ++CurPtr;
292   }
293   return CurPtr;
294 }
295 
296 static AsmToken intToken(StringRef Ref, APInt &Value) {
297   if (Value.isIntN(64))
298     return AsmToken(AsmToken::Integer, Ref, Value);
299   return AsmToken(AsmToken::BigNum, Ref, Value);
300 }
301 
/// Human-readable name of \p Radix for diagnostics: "binary", "octal",
/// "decimal" or "hexadecimal" for 2, 8, 10 and 16, and "base-N" for anything
/// else.
static std::string radixName(unsigned Radix) {
  if (Radix == 2)
    return "binary";
  if (Radix == 8)
    return "octal";
  if (Radix == 10)
    return "decimal";
  if (Radix == 16)
    return "hexadecimal";

  std::string Name = "base-";
  Name += std::to_string(Radix);
  return Name;
}
316 
/// LexDigit: First character is [0-9].
///   Local Label: [0-9][:]
///   Forward/Backward Label: [0-9][fb]
///   Binary integer: 0b[01]+
///   Octal integer: 0[0-7]+
///   Hex integer: 0x[0-9a-fA-F]+ or [0x]?[0-9][0-9a-fA-F]*[hH]
///   Decimal integer: [1-9][0-9]*
///
/// Lexes a numeric literal whose first character has already been consumed by
/// the caller; that character is inspected through CurPtr[-1].  Besides a
/// digit it can be '$' or '%', which LexToken() forwards here when Motorola
/// integers are enabled.
///
/// The dialect-specific forms are tried in this order: MASM suffixed integers,
/// MASM default-radix integers, Motorola '$'/'%' integers, decimal (or HLASM)
/// integers, "0b" binary, "0x" hex (including hex floats), and finally octal.
///
/// \returns an Integer or BigNum token (see intToken), a Real token for
///          floating-point input, or an Error token for a malformed number.
AsmToken AsmLexer::LexDigit() {
  // MASM-flavor binary integer: [01]+[yY] (if DefaultRadix < 16, [bByY])
  // MASM-flavor octal integer: [0-7]+[oOqQ]
  // MASM-flavor decimal integer: [0-9]+[tT] (if DefaultRadix < 16, [dDtT])
  // MASM-flavor hexadecimal integer: [0-9][0-9a-fA-F]*[hH]
  if (LexMasmIntegers && isdigit(CurPtr[-1])) {
    // Track where the literal first stops being valid binary / decimal; a
    // null pointer means "no such character seen yet".
    const char *FirstNonBinary =
        (CurPtr[-1] != '0' && CurPtr[-1] != '1') ? CurPtr - 1 : nullptr;
    const char *FirstNonDecimal =
        (CurPtr[-1] < '0' || CurPtr[-1] > '9') ? CurPtr - 1 : nullptr;
    // Remembered so the scan can be undone if no radix suffix is found.
    const char *OldCurPtr = CurPtr;
    while (isHexDigit(*CurPtr)) {
      switch (*CurPtr) {
      default:
        // Only the hex letters reach this label, since the loop admits hex
        // digits and 0-9 have their own cases below.
        if (!FirstNonDecimal) {
          FirstNonDecimal = CurPtr;
        }
        LLVM_FALLTHROUGH;
      case '9':
      case '8':
      case '7':
      case '6':
      case '5':
      case '4':
      case '3':
      case '2':
        if (!FirstNonBinary) {
          FirstNonBinary = CurPtr;
        }
        break;
      case '1':
      case '0':
        break;
      }
      ++CurPtr;
    }
    if (*CurPtr == '.') {
      // MASM float literals (other than hex floats) always contain a ".", and
      // are always written in decimal.
      ++CurPtr;
      return LexFloatLiteral();
    }

    if (LexMasmHexFloats && (*CurPtr == 'r' || *CurPtr == 'R')) {
      ++CurPtr;
      return AsmToken(AsmToken::Real, StringRef(TokStart, CurPtr - TokStart));
    }

    // Radix stays 0 when no radix suffix is recognised.
    unsigned Radix = 0;
    if (*CurPtr == 'h' || *CurPtr == 'H') {
      // hexadecimal number
      ++CurPtr;
      Radix = 16;
    } else if (*CurPtr == 't' || *CurPtr == 'T') {
      // decimal number
      ++CurPtr;
      Radix = 10;
    } else if (*CurPtr == 'o' || *CurPtr == 'O' || *CurPtr == 'q' ||
               *CurPtr == 'Q') {
      // octal number
      ++CurPtr;
      Radix = 8;
    } else if (*CurPtr == 'y' || *CurPtr == 'Y') {
      // binary number
      ++CurPtr;
      Radix = 2;
    } else if (FirstNonDecimal && FirstNonDecimal + 1 == CurPtr &&
               DefaultRadix < 14 &&
               (*FirstNonDecimal == 'd' || *FirstNonDecimal == 'D')) {
      // A trailing 'd' was already swallowed by the hex-digit scan above.  It
      // is taken as a decimal suffix only when it is the very last character
      // scanned and the default radix is below 14, i.e. 'd' (value 13) cannot
      // be a digit in that radix.
      Radix = 10;
    } else if (FirstNonBinary && FirstNonBinary + 1 == CurPtr &&
               DefaultRadix < 12 &&
               (*FirstNonBinary == 'b' || *FirstNonBinary == 'B')) {
      // Same idea for a trailing 'b' (value 11) as a binary suffix.
      Radix = 2;
    }

    if (Radix) {
      StringRef Result(TokStart, CurPtr - TokStart);
      APInt Value(128, 0, true);

      // In every branch above the last character of Result is the radix
      // suffix, so it is dropped before converting the digits.
      if (Result.drop_back().getAsInteger(Radix, Value))
        return ReturnError(TokStart, "invalid " + radixName(Radix) + " number");

      // MSVC accepts and ignores type suffices on integer literals.
      // (Result was formed before this call, so the token text excludes them.)
      SkipIgnoredIntegerSuffix(CurPtr);

      return intToken(Result, Value);
    }

    // default-radix integers, or floating point numbers, fall through
    CurPtr = OldCurPtr;
  }

  // MASM default-radix integers: [0-9a-fA-F]+
  // (All other integer literals have a radix specifier.)
  if (LexMasmIntegers && UseMasmDefaultRadix) {
    // Scan over anything that could be a hex digit, then let the conversion
    // in DefaultRadix reject digits that are out of range for that radix.
    CurPtr = findLastDigit(CurPtr, 16);
    StringRef Result(TokStart, CurPtr - TokStart);

    APInt Value(128, 0, true);
    if (Result.getAsInteger(DefaultRadix, Value)) {
      return ReturnError(TokStart,
                         "invalid " + radixName(DefaultRadix) + " number");
    }

    return intToken(Result, Value);
  }

  // Motorola hex integers: $[0-9a-fA-F]+
  if (LexMotorolaIntegers && CurPtr[-1] == '$') {
    // The '$' prefix is part of the token text but not of the digits parsed.
    const char *NumStart = CurPtr;
    while (isHexDigit(CurPtr[0]))
      ++CurPtr;

    APInt Result(128, 0);
    if (StringRef(NumStart, CurPtr - NumStart).getAsInteger(16, Result))
      return ReturnError(TokStart, "invalid hexadecimal number");

    return intToken(StringRef(TokStart, CurPtr - TokStart), Result);
  }

  // Motorola binary integers: %[01]+
  if (LexMotorolaIntegers && CurPtr[-1] == '%') {
    // As above, '%' belongs to the token text only.
    const char *NumStart = CurPtr;
    while (*CurPtr == '0' || *CurPtr == '1')
      ++CurPtr;

    APInt Result(128, 0);
    if (StringRef(NumStart, CurPtr - NumStart).getAsInteger(2, Result))
      return ReturnError(TokStart, "invalid binary number");

    return intToken(StringRef(TokStart, CurPtr - TokStart), Result);
  }

  // Decimal integer: [1-9][0-9]*
  // HLASM-flavour decimal integer: [0-9][0-9]*
  // FIXME: Later on, support for fb for HLASM has to be added in
  // as they probably would be needed for asm goto
  if (LexHLASMIntegers || CurPtr[-1] != '0' || CurPtr[0] == '.') {
    unsigned Radix = doHexLookAhead(CurPtr, 10, LexMasmIntegers);

    if (!LexHLASMIntegers) {
      bool IsHex = Radix == 16;
      // Check for floating point literals.
      if (!IsHex && (*CurPtr == '.' || *CurPtr == 'e' || *CurPtr == 'E')) {
        if (*CurPtr == '.')
          ++CurPtr;
        return LexFloatLiteral();
      }
    }

    StringRef Result(TokStart, CurPtr - TokStart);

    APInt Value(128, 0, true);
    if (Result.getAsInteger(Radix, Value))
      return ReturnError(TokStart, "invalid " + radixName(Radix) + " number");

    if (!LexHLASMIntegers)
      // The darwin/x86 (and x86-64) assembler accepts and ignores type
      // suffices on integer literals.
      SkipIgnoredIntegerSuffix(CurPtr);

    return intToken(Result, Value);
  }

  // From here on the first character is known to be '0' (and is not followed
  // by '.'), so what follows selects binary, hex or octal.
  if (!LexMasmIntegers && ((*CurPtr == 'b') || (*CurPtr == 'B'))) {
    ++CurPtr;
    // See if we actually have "0b" as part of something like "jmp 0b\n"
    if (!isDigit(CurPtr[0])) {
      // Un-consume the 'b' and return just the "0" as an integer.
      --CurPtr;
      StringRef Result(TokStart, CurPtr - TokStart);
      return AsmToken(AsmToken::Integer, Result, 0);
    }
    const char *NumStart = CurPtr;
    while (CurPtr[0] == '0' || CurPtr[0] == '1')
      ++CurPtr;

    // Requires at least one binary digit.
    if (CurPtr == NumStart)
      return ReturnError(TokStart, "invalid binary number");

    StringRef Result(TokStart, CurPtr - TokStart);

    APInt Value(128, 0, true);
    // substr(2) skips the "0b" prefix.
    if (Result.substr(2).getAsInteger(2, Value))
      return ReturnError(TokStart, "invalid binary number");

    // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
    // suffixes on integer literals.
    SkipIgnoredIntegerSuffix(CurPtr);

    return intToken(Result, Value);
  }

  if ((*CurPtr == 'x') || (*CurPtr == 'X')) {
    ++CurPtr;
    const char *NumStart = CurPtr;
    while (isHexDigit(CurPtr[0]))
      ++CurPtr;

    // "0x.0p0" is valid, and "0x0p0" (but not "0xp0" for example, which will be
    // diagnosed by LexHexFloatLiteral).
    if (CurPtr[0] == '.' || CurPtr[0] == 'p' || CurPtr[0] == 'P')
      return LexHexFloatLiteral(NumStart == CurPtr);

    // Otherwise requires at least one hex digit.
    if (CurPtr == NumStart)
      return ReturnError(CurPtr-2, "invalid hexadecimal number");

    APInt Result(128, 0);
    // The text handed over still carries its "0x" prefix and the radix
    // argument is 0.  NOTE(review): this presumably relies on getAsInteger
    // auto-detecting the radix from the prefix -- confirm against StringRef.
    if (StringRef(TokStart, CurPtr - TokStart).getAsInteger(0, Result))
      return ReturnError(TokStart, "invalid hexadecimal number");

    // Consume the optional [hH].
    if (LexMasmIntegers && (*CurPtr == 'h' || *CurPtr == 'H'))
      ++CurPtr;

    // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
    // suffixes on integer literals.
    SkipIgnoredIntegerSuffix(CurPtr);

    // Unlike the other branches, the token text here is formed after the
    // suffixes were skipped and therefore includes them.
    return intToken(StringRef(TokStart, CurPtr - TokStart), Result);
  }

  // Either octal or hexadecimal.
  APInt Value(128, 0, true);
  unsigned Radix = doHexLookAhead(CurPtr, 8, LexMasmIntegers);
  StringRef Result(TokStart, CurPtr - TokStart);
  if (Result.getAsInteger(Radix, Value))
    return ReturnError(TokStart, "invalid " + radixName(Radix) + " number");

  // Consume the [hH].
  if (Radix == 16)
    ++CurPtr;

  // The darwin/x86 (and x86-64) assembler accepts and ignores ULL and LL
  // suffixes on integer literals.
  SkipIgnoredIntegerSuffix(CurPtr);

  return intToken(Result, Value);
}
565 
566 /// LexSingleQuote: Integer: 'b'
567 AsmToken AsmLexer::LexSingleQuote() {
568   int CurChar = getNextChar();
569 
570   if (LexHLASMStrings)
571     return ReturnError(TokStart, "invalid usage of character literals");
572 
573   if (LexMasmStrings) {
574     while (CurChar != EOF) {
575       if (CurChar != '\'') {
576         CurChar = getNextChar();
577       } else if (peekNextChar() == '\'') {
578         // In MASM single-quote strings, doubled single-quotes mean an escaped
579         // single quote, so should be lexed in.
580         getNextChar();
581         CurChar = getNextChar();
582       } else {
583         break;
584       }
585     }
586     if (CurChar == EOF)
587       return ReturnError(TokStart, "unterminated string constant");
588     return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
589   }
590 
591   if (CurChar == '\\')
592     CurChar = getNextChar();
593 
594   if (CurChar == EOF)
595     return ReturnError(TokStart, "unterminated single quote");
596 
597   CurChar = getNextChar();
598 
599   if (CurChar != '\'')
600     return ReturnError(TokStart, "single quote way too long");
601 
602   // The idea here being that 'c' is basically just an integral
603   // constant.
604   StringRef Res = StringRef(TokStart,CurPtr - TokStart);
605   long long Value;
606 
607   if (Res.startswith("\'\\")) {
608     char theChar = Res[2];
609     switch (theChar) {
610       default: Value = theChar; break;
611       case '\'': Value = '\''; break;
612       case 't': Value = '\t'; break;
613       case 'n': Value = '\n'; break;
614       case 'b': Value = '\b'; break;
615       case 'f': Value = '\f'; break;
616       case 'r': Value = '\r'; break;
617     }
618   } else
619     Value = TokStart[1];
620 
621   return AsmToken(AsmToken::Integer, Res, Value);
622 }
623 
624 /// LexQuote: String: "..."
625 AsmToken AsmLexer::LexQuote() {
626   int CurChar = getNextChar();
627   if (LexHLASMStrings)
628     return ReturnError(TokStart, "invalid usage of string literals");
629 
630   if (LexMasmStrings) {
631     while (CurChar != EOF) {
632       if (CurChar != '"') {
633         CurChar = getNextChar();
634       } else if (peekNextChar() == '"') {
635         // In MASM double-quoted strings, doubled double-quotes mean an escaped
636         // double quote, so should be lexed in.
637         getNextChar();
638         CurChar = getNextChar();
639       } else {
640         break;
641       }
642     }
643     if (CurChar == EOF)
644       return ReturnError(TokStart, "unterminated string constant");
645     return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
646   }
647 
648   // TODO: does gas allow multiline string constants?
649   while (CurChar != '"') {
650     if (CurChar == '\\') {
651       // Allow \", etc.
652       CurChar = getNextChar();
653     }
654 
655     if (CurChar == EOF)
656       return ReturnError(TokStart, "unterminated string constant");
657 
658     CurChar = getNextChar();
659   }
660 
661   return AsmToken(AsmToken::String, StringRef(TokStart, CurPtr - TokStart));
662 }
663 
664 StringRef AsmLexer::LexUntilEndOfStatement() {
665   TokStart = CurPtr;
666 
667   while (!isAtStartOfComment(CurPtr) &&     // Start of line comment.
668          !isAtStatementSeparator(CurPtr) && // End of statement marker.
669          *CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) {
670     ++CurPtr;
671   }
672   return StringRef(TokStart, CurPtr-TokStart);
673 }
674 
675 StringRef AsmLexer::LexUntilEndOfLine() {
676   TokStart = CurPtr;
677 
678   while (*CurPtr != '\n' && *CurPtr != '\r' && CurPtr != CurBuf.end()) {
679     ++CurPtr;
680   }
681   return StringRef(TokStart, CurPtr-TokStart);
682 }
683 
684 size_t AsmLexer::peekTokens(MutableArrayRef<AsmToken> Buf,
685                             bool ShouldSkipSpace) {
686   SaveAndRestore<const char *> SavedTokenStart(TokStart);
687   SaveAndRestore<const char *> SavedCurPtr(CurPtr);
688   SaveAndRestore<bool> SavedAtStartOfLine(IsAtStartOfLine);
689   SaveAndRestore<bool> SavedAtStartOfStatement(IsAtStartOfStatement);
690   SaveAndRestore<bool> SavedSkipSpace(SkipSpace, ShouldSkipSpace);
691   SaveAndRestore<bool> SavedIsPeeking(IsPeeking, true);
692   std::string SavedErr = getErr();
693   SMLoc SavedErrLoc = getErrLoc();
694 
695   size_t ReadCount;
696   for (ReadCount = 0; ReadCount < Buf.size(); ++ReadCount) {
697     AsmToken Token = LexToken();
698 
699     Buf[ReadCount] = Token;
700 
701     if (Token.is(AsmToken::Eof))
702       break;
703   }
704 
705   SetError(SavedErrLoc, SavedErr);
706   return ReadCount;
707 }
708 
709 bool AsmLexer::isAtStartOfComment(const char *Ptr) {
710   if (MAI.getRestrictCommentStringToStartOfStatement() && !IsAtStartOfStatement)
711     return false;
712 
713   StringRef CommentString = MAI.getCommentString();
714 
715   if (CommentString.size() == 1)
716     return CommentString[0] == Ptr[0];
717 
718   // Allow # preprocessor commments also be counted as comments for "##" cases
719   if (CommentString[1] == '#')
720     return CommentString[0] == Ptr[0];
721 
722   return strncmp(Ptr, CommentString.data(), CommentString.size()) == 0;
723 }
724 
725 bool AsmLexer::isAtStatementSeparator(const char *Ptr) {
726   return strncmp(Ptr, MAI.getSeparatorString(),
727                  strlen(MAI.getSeparatorString())) == 0;
728 }
729 
/// Lex and return the next token from the current buffer.
///
/// Sets TokStart to the token's first character and keeps the
/// IsAtStartOfLine / IsAtStartOfStatement flags up to date.  Comments, cpp
/// hash directives and statement separators are recognised before the
/// per-character dispatch; identifiers, numbers, strings and the '/' forms are
/// delegated to the dedicated Lex* helpers.
AsmToken AsmLexer::LexToken() {
  TokStart = CurPtr;
  // This always consumes at least one character.
  // (At the end of the buffer getNextChar() returns EOF without advancing.)
  int CurChar = getNextChar();

  if (!IsPeeking && CurChar == '#' && IsAtStartOfStatement) {
    // If this starts with a '#', this may be a cpp
    // hash directive and otherwise a line comment.
    AsmToken TokenBuf[2];
    MutableArrayRef<AsmToken> Buf(TokenBuf, 2);
    size_t num = peekTokens(Buf, true);
    // There cannot be a space preceding this
    if (IsAtStartOfLine && num == 2 && TokenBuf[0].is(AsmToken::Integer) &&
        TokenBuf[1].is(AsmToken::String)) {
      // '#' followed by an integer and a string: return the whole line as a
      // HashDirective and push the two peeked tokens back for the consumer.
      CurPtr = TokStart; // reset curPtr;
      StringRef s = LexUntilEndOfLine();
      UnLex(TokenBuf[1]);
      UnLex(TokenBuf[0]);
      return AsmToken(AsmToken::HashDirective, s);
    }

    if (MAI.shouldAllowAdditionalComments())
      return LexLineComment();
  }

  if (isAtStartOfComment(TokStart))
    return LexLineComment();

  if (isAtStatementSeparator(TokStart)) {
    // One character of the separator was already consumed by getNextChar().
    CurPtr += strlen(MAI.getSeparatorString()) - 1;
    IsAtStartOfLine = true;
    IsAtStartOfStatement = true;
    return AsmToken(AsmToken::EndOfStatement,
                    StringRef(TokStart, strlen(MAI.getSeparatorString())));
  }

  // If we're missing a newline at EOF, make sure we still get an
  // EndOfStatement token before the Eof token.
  if (CurChar == EOF && !IsAtStartOfStatement && EndStatementAtEOF) {
    IsAtStartOfLine = true;
    IsAtStartOfStatement = true;
    return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 0));
  }
  IsAtStartOfLine = false;
  // Cleared by default; the whitespace and '/' cases below restore the saved
  // value.
  bool OldIsAtStartOfStatement = IsAtStartOfStatement;
  IsAtStartOfStatement = false;
  switch (CurChar) {
  default:
    // Handle identifier: [a-zA-Z_.?][a-zA-Z0-9_$.@#?]*
    if (isalpha(CurChar) || CurChar == '_' || CurChar == '.' ||
        (MAI.doesAllowQuestionAtStartOfIdentifier() && CurChar == '?'))
      return LexIdentifier();

    // Unknown character, emit an error.
    return ReturnError(TokStart, "invalid character in input");
  case EOF:
    if (EndStatementAtEOF) {
      IsAtStartOfLine = true;
      IsAtStartOfStatement = true;
    }
    return AsmToken(AsmToken::Eof, StringRef(TokStart, 0));
  // A NUL byte takes the same path as a space or tab.
  case 0:
  case ' ':
  case '\t':
    IsAtStartOfStatement = OldIsAtStartOfStatement;
    // Fold a run of spaces and tabs into a single token.
    while (*CurPtr == ' ' || *CurPtr == '\t')
      CurPtr++;
    if (SkipSpace)
      return LexToken(); // Ignore whitespace.
    else
      return AsmToken(AsmToken::Space, StringRef(TokStart, CurPtr - TokStart));
  case '\r': {
    IsAtStartOfLine = true;
    IsAtStartOfStatement = true;
    // If this is a CR followed by LF, treat that as one token.
    if (CurPtr != CurBuf.end() && *CurPtr == '\n')
      ++CurPtr;
    return AsmToken(AsmToken::EndOfStatement,
                    StringRef(TokStart, CurPtr - TokStart));
  }
  case '\n':
    IsAtStartOfLine = true;
    IsAtStartOfStatement = true;
    return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1));
  case ':': return AsmToken(AsmToken::Colon, StringRef(TokStart, 1));
  case '+': return AsmToken(AsmToken::Plus, StringRef(TokStart, 1));
  case '~': return AsmToken(AsmToken::Tilde, StringRef(TokStart, 1));
  case '(': return AsmToken(AsmToken::LParen, StringRef(TokStart, 1));
  case ')': return AsmToken(AsmToken::RParen, StringRef(TokStart, 1));
  case '[': return AsmToken(AsmToken::LBrac, StringRef(TokStart, 1));
  case ']': return AsmToken(AsmToken::RBrac, StringRef(TokStart, 1));
  case '{': return AsmToken(AsmToken::LCurly, StringRef(TokStart, 1));
  case '}': return AsmToken(AsmToken::RCurly, StringRef(TokStart, 1));
  case '*': return AsmToken(AsmToken::Star, StringRef(TokStart, 1));
  case ',': return AsmToken(AsmToken::Comma, StringRef(TokStart, 1));
  case '$': {
    // '$' may start a Motorola hex integer, an identifier, or stand alone.
    if (LexMotorolaIntegers && isHexDigit(*CurPtr))
      return LexDigit();
    if (MAI.doesAllowDollarAtStartOfIdentifier())
      return LexIdentifier();
    return AsmToken(AsmToken::Dollar, StringRef(TokStart, 1));
  }
  case '@': {
    if (MAI.doesAllowAtAtStartOfIdentifier())
      return LexIdentifier();
    return AsmToken(AsmToken::At, StringRef(TokStart, 1));
  }
  case '\\': return AsmToken(AsmToken::BackSlash, StringRef(TokStart, 1));
  case '=':
    if (*CurPtr == '=') {
      ++CurPtr;
      return AsmToken(AsmToken::EqualEqual, StringRef(TokStart, 2));
    }
    return AsmToken(AsmToken::Equal, StringRef(TokStart, 1));
  case '-':
    if (*CurPtr == '>') {
      ++CurPtr;
      return AsmToken(AsmToken::MinusGreater, StringRef(TokStart, 2));
    }
    return AsmToken(AsmToken::Minus, StringRef(TokStart, 1));
  case '|':
    if (*CurPtr == '|') {
      ++CurPtr;
      return AsmToken(AsmToken::PipePipe, StringRef(TokStart, 2));
    }
    return AsmToken(AsmToken::Pipe, StringRef(TokStart, 1));
  case '^': return AsmToken(AsmToken::Caret, StringRef(TokStart, 1));
  case '&':
    if (*CurPtr == '&') {
      ++CurPtr;
      return AsmToken(AsmToken::AmpAmp, StringRef(TokStart, 2));
    }
    return AsmToken(AsmToken::Amp, StringRef(TokStart, 1));
  case '!':
    if (*CurPtr == '=') {
      ++CurPtr;
      return AsmToken(AsmToken::ExclaimEqual, StringRef(TokStart, 2));
    }
    return AsmToken(AsmToken::Exclaim, StringRef(TokStart, 1));
  case '%':
    // '%' may start a Motorola binary integer or a MIPS %operator, or stand
    // alone.
    if (LexMotorolaIntegers && (*CurPtr == '0' || *CurPtr == '1')) {
      return LexDigit();
    }

    if (MAI.hasMipsExpressions()) {
      AsmToken::TokenKind Operator;
      unsigned OperatorLength;

      // Each length below counts the leading '%' plus the operator name.
      // Longer names are listed before names that are their prefixes (e.g.
      // "got_disp" before "got", "higher" before "hi").
      std::tie(Operator, OperatorLength) =
          StringSwitch<std::pair<AsmToken::TokenKind, unsigned>>(
              StringRef(CurPtr))
              .StartsWith("call16", {AsmToken::PercentCall16, 7})
              .StartsWith("call_hi", {AsmToken::PercentCall_Hi, 8})
              .StartsWith("call_lo", {AsmToken::PercentCall_Lo, 8})
              .StartsWith("dtprel_hi", {AsmToken::PercentDtprel_Hi, 10})
              .StartsWith("dtprel_lo", {AsmToken::PercentDtprel_Lo, 10})
              .StartsWith("got_disp", {AsmToken::PercentGot_Disp, 9})
              .StartsWith("got_hi", {AsmToken::PercentGot_Hi, 7})
              .StartsWith("got_lo", {AsmToken::PercentGot_Lo, 7})
              .StartsWith("got_ofst", {AsmToken::PercentGot_Ofst, 9})
              .StartsWith("got_page", {AsmToken::PercentGot_Page, 9})
              .StartsWith("gottprel", {AsmToken::PercentGottprel, 9})
              .StartsWith("got", {AsmToken::PercentGot, 4})
              .StartsWith("gp_rel", {AsmToken::PercentGp_Rel, 7})
              .StartsWith("higher", {AsmToken::PercentHigher, 7})
              .StartsWith("highest", {AsmToken::PercentHighest, 8})
              .StartsWith("hi", {AsmToken::PercentHi, 3})
              .StartsWith("lo", {AsmToken::PercentLo, 3})
              .StartsWith("neg", {AsmToken::PercentNeg, 4})
              .StartsWith("pcrel_hi", {AsmToken::PercentPcrel_Hi, 9})
              .StartsWith("pcrel_lo", {AsmToken::PercentPcrel_Lo, 9})
              .StartsWith("tlsgd", {AsmToken::PercentTlsgd, 6})
              .StartsWith("tlsldm", {AsmToken::PercentTlsldm, 7})
              .StartsWith("tprel_hi", {AsmToken::PercentTprel_Hi, 9})
              .StartsWith("tprel_lo", {AsmToken::PercentTprel_Lo, 9})
              .Default({AsmToken::Percent, 1});

      if (Operator != AsmToken::Percent) {
        // The '%' itself was already consumed, hence the "- 1".
        CurPtr += OperatorLength - 1;
        return AsmToken(Operator, StringRef(TokStart, OperatorLength));
      }
    }
    return AsmToken(AsmToken::Percent, StringRef(TokStart, 1));
  case '/':
    // LexSlash() decides what happens to the flag, so hand it the value seen
    // on entry.
    IsAtStartOfStatement = OldIsAtStartOfStatement;
    return LexSlash();
  case '#': {
    if (MAI.doesAllowHashAtStartOfIdentifier())
      return LexIdentifier();
    return AsmToken(AsmToken::Hash, StringRef(TokStart, 1));
  }
  case '\'': return LexSingleQuote();
  case '"': return LexQuote();
  case '0': case '1': case '2': case '3': case '4':
  case '5': case '6': case '7': case '8': case '9':
    return LexDigit();
  case '<':
    switch (*CurPtr) {
    case '<':
      ++CurPtr;
      return AsmToken(AsmToken::LessLess, StringRef(TokStart, 2));
    case '=':
      ++CurPtr;
      return AsmToken(AsmToken::LessEqual, StringRef(TokStart, 2));
    case '>':
      ++CurPtr;
      return AsmToken(AsmToken::LessGreater, StringRef(TokStart, 2));
    default:
      return AsmToken(AsmToken::Less, StringRef(TokStart, 1));
    }
  case '>':
    switch (*CurPtr) {
    case '>':
      ++CurPtr;
      return AsmToken(AsmToken::GreaterGreater, StringRef(TokStart, 2));
    case '=':
      ++CurPtr;
      return AsmToken(AsmToken::GreaterEqual, StringRef(TokStart, 2));
    default:
      return AsmToken(AsmToken::Greater, StringRef(TokStart, 1));
    }

  // TODO: Quoted identifiers (objc methods etc)
  // local labels: [0-9][:]
  // Forward/backward labels: [0-9][fb]
  // Integers, fp constants, character constants.
  }
}
958