lib/Parser/Lexer.cpp

//===- Lexer.cpp - MLIR Lexer Implementation ------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
//
// This file implements the lexer for the MLIR textual form.
//
//===----------------------------------------------------------------------===//

#include "Lexer.h"
#include "mlir/IR/Diagnostics.h"
#include "mlir/IR/Identifier.h"
#include "mlir/IR/Location.h"
#include "mlir/IR/MLIRContext.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/SourceMgr.h"
using namespace mlir;

using llvm::SMLoc;
using llvm::SourceMgr;

// Returns true if 'c' is an allowable punctuation character: [$._-]
// Returns false otherwise.
static bool isPunct(char c) {
  return c == '$' || c == '.' || c == '_' || c == '-';
}

Lexer::Lexer(const llvm::SourceMgr &sourceMgr, MLIRContext *context)
    : sourceMgr(sourceMgr), context(context) {
  auto bufferID = sourceMgr.getMainFileID();
  curBuffer = sourceMgr.getMemoryBuffer(bufferID)->getBuffer();
  curPtr = curBuffer.begin();
}

/// Encode the specified source location information into an attribute for
/// attachment to the IR.
Location Lexer::getEncodedSourceLocation(llvm::SMLoc loc) {
  auto &sourceMgr = getSourceMgr();
  unsigned mainFileID = sourceMgr.getMainFileID();
  auto lineAndColumn = sourceMgr.getLineAndColumn(loc, mainFileID);
  auto *buffer = sourceMgr.getMemoryBuffer(mainFileID);

  return FileLineColLoc::get(buffer->getBufferIdentifier(), lineAndColumn.first,
                             lineAndColumn.second, context);
}

/// emitError - Emit an error message and return an Token::error token.
Token Lexer::emitError(const char *loc, const Twine &message) {
  mlir::emitError(getEncodedSourceLocation(SMLoc::getFromPointer(loc)),
                  message);
  return formToken(Token::error, loc);
}

Token Lexer::lexToken() {
  while (true) {
    const char *tokStart = curPtr;
    switch (*curPtr++) {
    default:
      // Handle bare identifiers.
      if (isalpha(curPtr[-1]))
        return lexBareIdentifierOrKeyword(tokStart);

      // Unknown character, emit an error.
      return emitError(tokStart, "unexpected character");

    case ' ':
    case '\t':
    case '\n':
    case '\r':
      // Handle whitespace.
      continue;

    case '_':
      // Handle bare identifiers.
      return lexBareIdentifierOrKeyword(tokStart);

    case 0:
      // This may either be a nul character in the source file or may be the EOF
      // marker that llvm::MemoryBuffer guarantees will be there.
      if (curPtr - 1 == curBuffer.end())
        return formToken(Token::eof, tokStart);
      continue;

    case ':':
      return formToken(Token::colon, tokStart);
    case ',':
      return formToken(Token::comma, tokStart);
    case '.':
      return lexEllipsis(tokStart);
    case '(':
      return formToken(Token::l_paren, tokStart);
    case ')':
      return formToken(Token::r_paren, tokStart);
    case '{':
      return formToken(Token::l_brace, tokStart);
    case '}':
      return formToken(Token::r_brace, tokStart);
    case '[':
      return formToken(Token::l_square, tokStart);
    case ']':
      return formToken(Token::r_square, tokStart);
    case '<':
      return formToken(Token::less, tokStart);
    case '>':
      return formToken(Token::greater, tokStart);
    case '=':
      return formToken(Token::equal, tokStart);

    case '+':
      return formToken(Token::plus, tokStart);
    case '*':
      return formToken(Token::star, tokStart);
    case '-':
      if (*curPtr == '>') {
        ++curPtr;
        return formToken(Token::arrow, tokStart);
      }
      return formToken(Token::minus, tokStart);

    case '?':
      return formToken(Token::question, tokStart);

    case '/':
      if (*curPtr == '/') {
        skipComment();
        continue;
      }
      return emitError(tokStart, "unexpected character");

    case '@':
      return lexAtIdentifier(tokStart);

    case '!':
      LLVM_FALLTHROUGH;
    case '^':
      LLVM_FALLTHROUGH;
    case '#':
      LLVM_FALLTHROUGH;
    case '%':
      return lexPrefixedIdentifier(tokStart);
    case '"':
      return lexString(tokStart);

    case '0':
    case '1':
    case '2':
    case '3':
    case '4':
    case '5':
    case '6':
    case '7':
    case '8':
    case '9':
      return lexNumber(tokStart);
    }
  }
}

/// Lex an '@foo' identifier.
///
///   symbol-ref-id ::= `@` (bare-id | string-literal)
///
Token Lexer::lexAtIdentifier(const char *tokStart) {
  char cur = *curPtr++;

  // Try to parse a string literal, if present.
  if (cur == '"') {
    Token stringIdentifier = lexString(curPtr);
    if (stringIdentifier.is(Token::error))
      return stringIdentifier;
    return formToken(Token::at_identifier, tokStart);
  }

  // Otherwise, these always start with a letter or underscore.
  if (!isalpha(cur) && cur != '_')
    return emitError(curPtr - 1,
                     "@ identifier expected to start with letter or '_'");

  while (isalpha(*curPtr) || isdigit(*curPtr) || *curPtr == '_' ||
         *curPtr == '$' || *curPtr == '.')
    ++curPtr;
  return formToken(Token::at_identifier, tokStart);
}

/// Lex a bare identifier or keyword that starts with a letter.
///
///   bare-id ::= (letter|[_]) (letter|digit|[_$.])*
///   integer-type ::= `[su]?i[1-9][0-9]*`
///
Token Lexer::lexBareIdentifierOrKeyword(const char *tokStart) {
  // Match the rest of the identifier regex: [0-9a-zA-Z_.$]*
  while (isalpha(*curPtr) || isdigit(*curPtr) || *curPtr == '_' ||
         *curPtr == '$' || *curPtr == '.')
    ++curPtr;

  // Check to see if this identifier is a keyword.
  StringRef spelling(tokStart, curPtr - tokStart);

  auto isAllDigit = [](StringRef str) {
    return llvm::all_of(str, [](char c) { return llvm::isDigit(c); });
  };

  // Check for i123, si456, ui789.
  if ((spelling.size() > 1 && tokStart[0] == 'i' &&
       isAllDigit(spelling.drop_front())) ||
      ((spelling.size() > 2 && tokStart[1] == 'i' &&
        (tokStart[0] == 's' || tokStart[0] == 'u')) &&
       isAllDigit(spelling.drop_front(2))))
    return Token(Token::inttype, spelling);

  Token::Kind kind = StringSwitch<Token::Kind>(spelling)
#define TOK_KEYWORD(SPELLING) .Case(#SPELLING, Token::kw_##SPELLING)
#include "TokenKinds.def"
                         .Default(Token::bare_identifier);

  return Token(kind, spelling);
}

/// Skip a comment line, starting with a '//'.
///
///   TODO: add a regex for comments here and to the spec.
///
void Lexer::skipComment() {
  // Advance over the second '/' in a '//' comment.
  assert(*curPtr == '/');
  ++curPtr;

  while (true) {
    switch (*curPtr++) {
    case '\n':
    case '\r':
      // Newline is end of comment.
      return;
    case 0:
      // If this is the end of the buffer, end the comment.
      if (curPtr - 1 == curBuffer.end()) {
        --curPtr;
        return;
      }
      LLVM_FALLTHROUGH;
    default:
      // Skip over other characters.
      break;
    }
  }
}

/// Lex an ellipsis.
///
///   ellipsis ::= '...'
///
Token Lexer::lexEllipsis(const char *tokStart) {
  assert(curPtr[-1] == '.');

  if (curPtr == curBuffer.end() || *curPtr != '.' || *(curPtr + 1) != '.')
    return emitError(curPtr, "expected three consecutive dots for an ellipsis");

  curPtr += 2;
  return formToken(Token::ellipsis, tokStart);
}

/// Lex a number literal.
///
///   integer-literal ::= digit+ | `0x` hex_digit+
///   float-literal ::= [-+]?[0-9]+[.][0-9]*([eE][-+]?[0-9]+)?
///
Token Lexer::lexNumber(const char *tokStart) {
  assert(isdigit(curPtr[-1]));

  // Handle the hexadecimal case.
  if (curPtr[-1] == '0' && *curPtr == 'x') {
    // If we see stuff like 0xi32, this is a literal `0` followed by an
    // identifier `xi32`, stop after `0`.
    if (!isxdigit(curPtr[1]))
      return formToken(Token::integer, tokStart);

    curPtr += 2;
    while (isxdigit(*curPtr))
      ++curPtr;

    return formToken(Token::integer, tokStart);
  }

  // Handle the normal decimal case.
  while (isdigit(*curPtr))
    ++curPtr;

  if (*curPtr != '.')
    return formToken(Token::integer, tokStart);
  ++curPtr;

  // Skip over [0-9]*([eE][-+]?[0-9]+)?
  while (isdigit(*curPtr))
    ++curPtr;

  if (*curPtr == 'e' || *curPtr == 'E') {
    if (isdigit(static_cast<unsigned char>(curPtr[1])) ||
        ((curPtr[1] == '-' || curPtr[1] == '+') &&
         isdigit(static_cast<unsigned char>(curPtr[2])))) {
      curPtr += 2;
      while (isdigit(*curPtr))
        ++curPtr;
    }
  }
  return formToken(Token::floatliteral, tokStart);
}

/// Lex an identifier that starts with a prefix followed by suffix-id.
///
///   attribute-id  ::= `#` suffix-id
///   ssa-id        ::= '%' suffix-id
///   block-id      ::= '^' suffix-id
///   type-id       ::= '!' suffix-id
///   suffix-id     ::= digit+ | (letter|id-punct) (letter|id-punct|digit)*
///   id-punct      ::= `$` | `.` | `_` | `-`
///
Token Lexer::lexPrefixedIdentifier(const char *tokStart) {
  Token::Kind kind;
  StringRef errorKind;
  switch (*tokStart) {
  case '#':
    kind = Token::hash_identifier;
    errorKind = "invalid attribute name";
    break;
  case '%':
    kind = Token::percent_identifier;
    errorKind = "invalid SSA name";
    break;
  case '^':
    kind = Token::caret_identifier;
    errorKind = "invalid block name";
    break;
  case '!':
    kind = Token::exclamation_identifier;
    errorKind = "invalid type identifier";
    break;
  default:
    llvm_unreachable("invalid caller");
  }

  // Parse suffix-id.
  if (isdigit(*curPtr)) {
    // If suffix-id starts with a digit, the rest must be digits.
    while (isdigit(*curPtr)) {
      ++curPtr;
    }
  } else if (isalpha(*curPtr) || isPunct(*curPtr)) {
    do {
      ++curPtr;
    } while (isalpha(*curPtr) || isdigit(*curPtr) || isPunct(*curPtr));
  } else {
    return emitError(curPtr - 1, errorKind);
  }

  return formToken(kind, tokStart);
}

/// Lex a string literal.
///
///   string-literal ::= '"' [^"\n\f\v\r]* '"'
///
/// TODO: define escaping rules.
Token Lexer::lexString(const char *tokStart) {
  assert(curPtr[-1] == '"');

  while (true) {
    switch (*curPtr++) {
    case '"':
      return formToken(Token::string, tokStart);
    case 0:
      // If this is a random nul character in the middle of a string, just
      // include it.  If it is the end of file, then it is an error.
      if (curPtr - 1 != curBuffer.end())
        continue;
      LLVM_FALLTHROUGH;
    case '\n':
    case '\v':
    case '\f':
      return emitError(curPtr - 1, "expected '\"' in string literal");
    case '\\':
      // Handle explicitly a few escapes.
      if (*curPtr == '"' || *curPtr == '\\' || *curPtr == 'n' || *curPtr == 't')
        ++curPtr;
      else if (llvm::isHexDigit(*curPtr) && llvm::isHexDigit(curPtr[1]))
        // Support \xx for two hex digits.
        curPtr += 2;
      else
        return emitError(curPtr - 1, "unknown escape in string literal");
      continue;

    default:
      continue;
    }
  }
}