//===- Lexer.cpp - MLIR Lexer Implementation ------------------------------===// // // Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. // See https://llvm.org/LICENSE.txt for license information. // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception // //===----------------------------------------------------------------------===// // // This file implements the lexer for the MLIR textual form. // //===----------------------------------------------------------------------===// #include "Lexer.h" #include "mlir/IR/Diagnostics.h" #include "mlir/IR/Identifier.h" #include "mlir/IR/Location.h" #include "mlir/IR/MLIRContext.h" #include "llvm/ADT/StringExtras.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Support/SourceMgr.h" using namespace mlir; using llvm::SMLoc; using llvm::SourceMgr; // Returns true if 'c' is an allowable punctuation character: [$._-] // Returns false otherwise. static bool isPunct(char c) { return c == '$' || c == '.' || c == '_' || c == '-'; } Lexer::Lexer(const llvm::SourceMgr &sourceMgr, MLIRContext *context) : sourceMgr(sourceMgr), context(context) { auto bufferID = sourceMgr.getMainFileID(); curBuffer = sourceMgr.getMemoryBuffer(bufferID)->getBuffer(); curPtr = curBuffer.begin(); } /// Encode the specified source location information into an attribute for /// attachment to the IR. Location Lexer::getEncodedSourceLocation(llvm::SMLoc loc) { auto &sourceMgr = getSourceMgr(); unsigned mainFileID = sourceMgr.getMainFileID(); auto lineAndColumn = sourceMgr.getLineAndColumn(loc, mainFileID); auto *buffer = sourceMgr.getMemoryBuffer(mainFileID); return FileLineColLoc::get(buffer->getBufferIdentifier(), lineAndColumn.first, lineAndColumn.second, context); } /// emitError - Emit an error message and return an Token::error token. 
Token Lexer::emitError(const char *loc, const Twine &message) { mlir::emitError(getEncodedSourceLocation(SMLoc::getFromPointer(loc)), message); return formToken(Token::error, loc); } Token Lexer::lexToken() { while (true) { const char *tokStart = curPtr; switch (*curPtr++) { default: // Handle bare identifiers. if (isalpha(curPtr[-1])) return lexBareIdentifierOrKeyword(tokStart); // Unknown character, emit an error. return emitError(tokStart, "unexpected character"); case ' ': case '\t': case '\n': case '\r': // Handle whitespace. continue; case '_': // Handle bare identifiers. return lexBareIdentifierOrKeyword(tokStart); case 0: // This may either be a nul character in the source file or may be the EOF // marker that llvm::MemoryBuffer guarantees will be there. if (curPtr - 1 == curBuffer.end()) return formToken(Token::eof, tokStart); continue; case ':': return formToken(Token::colon, tokStart); case ',': return formToken(Token::comma, tokStart); case '.': return lexEllipsis(tokStart); case '(': return formToken(Token::l_paren, tokStart); case ')': return formToken(Token::r_paren, tokStart); case '{': return formToken(Token::l_brace, tokStart); case '}': return formToken(Token::r_brace, tokStart); case '[': return formToken(Token::l_square, tokStart); case ']': return formToken(Token::r_square, tokStart); case '<': return formToken(Token::less, tokStart); case '>': return formToken(Token::greater, tokStart); case '=': return formToken(Token::equal, tokStart); case '+': return formToken(Token::plus, tokStart); case '*': return formToken(Token::star, tokStart); case '-': if (*curPtr == '>') { ++curPtr; return formToken(Token::arrow, tokStart); } return formToken(Token::minus, tokStart); case '?': return formToken(Token::question, tokStart); case '/': if (*curPtr == '/') { skipComment(); continue; } return emitError(tokStart, "unexpected character"); case '@': return lexAtIdentifier(tokStart); case '!': LLVM_FALLTHROUGH; case '^': LLVM_FALLTHROUGH; case '#': 
LLVM_FALLTHROUGH; case '%': return lexPrefixedIdentifier(tokStart); case '"': return lexString(tokStart); case '0': case '1': case '2': case '3': case '4': case '5': case '6': case '7': case '8': case '9': return lexNumber(tokStart); } } } /// Lex an '@foo' identifier. /// /// symbol-ref-id ::= `@` (bare-id | string-literal) /// Token Lexer::lexAtIdentifier(const char *tokStart) { char cur = *curPtr++; // Try to parse a string literal, if present. if (cur == '"') { Token stringIdentifier = lexString(curPtr); if (stringIdentifier.is(Token::error)) return stringIdentifier; return formToken(Token::at_identifier, tokStart); } // Otherwise, these always start with a letter or underscore. if (!isalpha(cur) && cur != '_') return emitError(curPtr - 1, "@ identifier expected to start with letter or '_'"); while (isalpha(*curPtr) || isdigit(*curPtr) || *curPtr == '_' || *curPtr == '$' || *curPtr == '.') ++curPtr; return formToken(Token::at_identifier, tokStart); } /// Lex a bare identifier or keyword that starts with a letter. /// /// bare-id ::= (letter|[_]) (letter|digit|[_$.])* /// integer-type ::= `[su]?i[1-9][0-9]*` /// Token Lexer::lexBareIdentifierOrKeyword(const char *tokStart) { // Match the rest of the identifier regex: [0-9a-zA-Z_.$]* while (isalpha(*curPtr) || isdigit(*curPtr) || *curPtr == '_' || *curPtr == '$' || *curPtr == '.') ++curPtr; // Check to see if this identifier is a keyword. StringRef spelling(tokStart, curPtr - tokStart); auto isAllDigit = [](StringRef str) { return llvm::all_of(str, [](char c) { return llvm::isDigit(c); }); }; // Check for i123, si456, ui789. 
if ((spelling.size() > 1 && tokStart[0] == 'i' && isAllDigit(spelling.drop_front())) || ((spelling.size() > 2 && tokStart[1] == 'i' && (tokStart[0] == 's' || tokStart[0] == 'u')) && isAllDigit(spelling.drop_front(2)))) return Token(Token::inttype, spelling); Token::Kind kind = StringSwitch(spelling) #define TOK_KEYWORD(SPELLING) .Case(#SPELLING, Token::kw_##SPELLING) #include "TokenKinds.def" .Default(Token::bare_identifier); return Token(kind, spelling); } /// Skip a comment line, starting with a '//'. /// /// TODO: add a regex for comments here and to the spec. /// void Lexer::skipComment() { // Advance over the second '/' in a '//' comment. assert(*curPtr == '/'); ++curPtr; while (true) { switch (*curPtr++) { case '\n': case '\r': // Newline is end of comment. return; case 0: // If this is the end of the buffer, end the comment. if (curPtr - 1 == curBuffer.end()) { --curPtr; return; } LLVM_FALLTHROUGH; default: // Skip over other characters. break; } } } /// Lex an ellipsis. /// /// ellipsis ::= '...' /// Token Lexer::lexEllipsis(const char *tokStart) { assert(curPtr[-1] == '.'); if (curPtr == curBuffer.end() || *curPtr != '.' || *(curPtr + 1) != '.') return emitError(curPtr, "expected three consecutive dots for an ellipsis"); curPtr += 2; return formToken(Token::ellipsis, tokStart); } /// Lex a number literal. /// /// integer-literal ::= digit+ | `0x` hex_digit+ /// float-literal ::= [-+]?[0-9]+[.][0-9]*([eE][-+]?[0-9]+)? /// Token Lexer::lexNumber(const char *tokStart) { assert(isdigit(curPtr[-1])); // Handle the hexadecimal case. if (curPtr[-1] == '0' && *curPtr == 'x') { // If we see stuff like 0xi32, this is a literal `0` followed by an // identifier `xi32`, stop after `0`. if (!isxdigit(curPtr[1])) return formToken(Token::integer, tokStart); curPtr += 2; while (isxdigit(*curPtr)) ++curPtr; return formToken(Token::integer, tokStart); } // Handle the normal decimal case. 
while (isdigit(*curPtr)) ++curPtr; if (*curPtr != '.') return formToken(Token::integer, tokStart); ++curPtr; // Skip over [0-9]*([eE][-+]?[0-9]+)? while (isdigit(*curPtr)) ++curPtr; if (*curPtr == 'e' || *curPtr == 'E') { if (isdigit(static_cast(curPtr[1])) || ((curPtr[1] == '-' || curPtr[1] == '+') && isdigit(static_cast(curPtr[2])))) { curPtr += 2; while (isdigit(*curPtr)) ++curPtr; } } return formToken(Token::floatliteral, tokStart); } /// Lex an identifier that starts with a prefix followed by suffix-id. /// /// attribute-id ::= `#` suffix-id /// ssa-id ::= '%' suffix-id /// block-id ::= '^' suffix-id /// type-id ::= '!' suffix-id /// suffix-id ::= digit+ | (letter|id-punct) (letter|id-punct|digit)* /// id-punct ::= `$` | `.` | `_` | `-` /// Token Lexer::lexPrefixedIdentifier(const char *tokStart) { Token::Kind kind; StringRef errorKind; switch (*tokStart) { case '#': kind = Token::hash_identifier; errorKind = "invalid attribute name"; break; case '%': kind = Token::percent_identifier; errorKind = "invalid SSA name"; break; case '^': kind = Token::caret_identifier; errorKind = "invalid block name"; break; case '!': kind = Token::exclamation_identifier; errorKind = "invalid type identifier"; break; default: llvm_unreachable("invalid caller"); } // Parse suffix-id. if (isdigit(*curPtr)) { // If suffix-id starts with a digit, the rest must be digits. while (isdigit(*curPtr)) { ++curPtr; } } else if (isalpha(*curPtr) || isPunct(*curPtr)) { do { ++curPtr; } while (isalpha(*curPtr) || isdigit(*curPtr) || isPunct(*curPtr)); } else { return emitError(curPtr - 1, errorKind); } return formToken(kind, tokStart); } /// Lex a string literal. /// /// string-literal ::= '"' [^"\n\f\v\r]* '"' /// /// TODO: define escaping rules. 
Token Lexer::lexString(const char *tokStart) {
  // The opening quote has already been consumed by the caller.
  assert(curPtr[-1] == '"');

  for (;;) {
    const char *charPos = curPtr++;
    const char ch = *charPos;

    // The closing quote finishes the literal.
    if (ch == '"')
      return formToken(Token::string, tokStart);

    // A nul character is only an error when it is the end-of-buffer marker;
    // a nul in the middle of the string is simply included. A newline, vertical
    // tab or form feed always ends the literal with an error.
    const bool atEndOfBuffer = ch == 0 && charPos == curBuffer.end();
    if (atEndOfBuffer || ch == '\n' || ch == '\v' || ch == '\f')
      return emitError(charPos, "expected '\"' in string literal");

    // Any other character except a backslash is part of the literal.
    if (ch != '\\')
      continue;

    // Handle explicitly a few escapes: \" \\ \n \t and \xx (two hex digits).
    const char escaped = *curPtr;
    if (escaped == '"' || escaped == '\\' || escaped == 'n' ||
        escaped == 't') {
      ++curPtr;
    } else if (llvm::isHexDigit(escaped) && llvm::isHexDigit(curPtr[1])) {
      curPtr += 2;
    } else {
      return emitError(curPtr - 1, "unknown escape in string literal");
    }
  }
}