-- src/luacheck/lexer.lua

local utils = require "luacheck.utils"

-- The lexer has to understand the syntax of Lua 5.1, 5.2, 5.3, 5.4 and LuaJIT
-- (including its 64-bit integer and complex cdata literals).
local lexer = {}

-- Standard library functions used below, cached in locals.
local sbyte, schar, sreverse = string.byte, string.char, string.reverse
local tconcat = table.concat
local mfloor = math.floor

-- Byte values of every character the lexer inspects.
-- They are kept in locals rather than inlined as numeric literals:
-- fetching a local costs about the same as fetching a constant.

-- Digits and hexadecimal digit bounds.
local BYTE_0 = sbyte("0")
local BYTE_9 = sbyte("9")
local BYTE_f = sbyte("f")
local BYTE_F = sbyte("F")

-- Hexadecimal prefix and complex literal suffix.
local BYTE_x = sbyte("x")
local BYTE_X = sbyte("X")
local BYTE_i = sbyte("i")
local BYTE_I = sbyte("I")

-- 64-bit integer literal suffixes.
local BYTE_l = sbyte("l")
local BYTE_L = sbyte("L")
local BYTE_u = sbyte("u")
local BYTE_U = sbyte("U")

-- Exponent markers.
local BYTE_e = sbyte("e")
local BYTE_E = sbyte("E")
local BYTE_p = sbyte("p")
local BYTE_P = sbyte("P")

-- Letter ranges.
local BYTE_a = sbyte("a")
local BYTE_z = sbyte("z")
local BYTE_A = sbyte("A")
local BYTE_Z = sbyte("Z")

-- Punctuation.
local BYTE_DOT = sbyte(".")
local BYTE_COLON = sbyte(":")
local BYTE_OBRACK = sbyte("[")
local BYTE_CBRACK = sbyte("]")
local BYTE_OBRACE = sbyte("{")
local BYTE_CBRACE = sbyte("}")
local BYTE_QUOTE = sbyte("'")
local BYTE_DQUOTE = sbyte('"')
local BYTE_PLUS = sbyte("+")
local BYTE_DASH = sbyte("-")
local BYTE_LDASH = sbyte("_")
local BYTE_SLASH = sbyte("/")
local BYTE_BSLASH = sbyte("\\")
local BYTE_EQ = sbyte("=")
local BYTE_NE = sbyte("~")
local BYTE_LT = sbyte("<")
local BYTE_GT = sbyte(">")

-- Newlines and other whitespace.
local BYTE_LF = sbyte("\n")
local BYTE_CR = sbyte("\r")
local BYTE_SPACE = sbyte(" ")
local BYTE_FF = sbyte("\f")
local BYTE_TAB = sbyte("\t")
local BYTE_VTAB = sbyte("\v")

-- Converts the byte of a hexadecimal digit ("0"-"9", "a"-"f", "A"-"F")
-- to its numeric value. Returns nil for any other byte.
local function to_hex(b)
   if b >= BYTE_0 and b <= BYTE_9 then
      return b - BYTE_0
   end

   if b >= BYTE_a and b <= BYTE_f then
      return b - BYTE_a + 10
   end

   if b >= BYTE_A and b <= BYTE_F then
      return b - BYTE_A + 10
   end

   return nil
end

-- Converts the byte of a decimal digit to its numeric value.
-- Returns nil for any other byte.
local function to_dec(b)
   if b < BYTE_0 or b > BYTE_9 then
      return nil
   end

   return b - BYTE_0
end

-- Encodes a code point as a UTF-8 byte sequence and returns it as a string.
-- Code points below 0x80 are returned as a single byte.
local function to_utf(codepoint)
   if codepoint < 0x80 then
      -- Plain ASCII needs no encoding.
      return schar(codepoint)
   end

   -- Bytes are produced from the last one to the first and reversed at the end.
   local reversed = {}
   local count = 0
   -- Largest value that still fits next to the length marker of the first byte.
   local first_byte_max = 0x3F

   repeat
      -- Emit a continuation byte carrying the lowest six bits.
      count = count + 1
      reversed[count] = schar(codepoint % 0x40 + 0x80)
      codepoint = mfloor(codepoint / 0x40)
      first_byte_max = mfloor(first_byte_max / 2)
   until codepoint <= first_byte_max

   -- First byte: length marker bits plus the remaining high bits.
   reversed[count + 1] = schar(0xFE - first_byte_max*2 + codepoint)
   return sreverse(tconcat(reversed))
end

-- Checks whether a byte is an ASCII letter or an underscore.
local function is_alpha(b)
   if b == BYTE_LDASH then
      return true
   end

   if b >= BYTE_a and b <= BYTE_z then
      return true
   end

   return b >= BYTE_A and b <= BYTE_Z
end

-- Checks whether a byte is a line feed or a carriage return.
-- Accepts nil (end of input) and returns false for it.
local function is_newline(b)
   if b == BYTE_LF then
      return true
   end

   return b == BYTE_CR
end

-- Bytes counted as non-newline whitespace: space, form feed, tab, vertical tab.
local space_bytes = {
   [BYTE_SPACE] = true,
   [BYTE_FF] = true,
   [BYTE_TAB] = true,
   [BYTE_VTAB] = true
}

-- Checks whether a byte is whitespace other than a newline.
-- Accepts nil (end of input) and returns false for it.
local function is_space(b)
   return space_bytes[b] == true
end

-- Words that lex_ident reports as their own token type instead of "name".
local keyword_list = {
   "and", "break", "do", "else", "elseif", "end",
   "false", "for", "function", "goto", "if", "in",
   "local", "nil", "not", "or", "repeat", "return",
   "then", "true", "until", "while"
}

local keywords = utils.array_to_set(keyword_list)

-- Maps the byte following a backslash to the byte the escape stands for.
-- Backslash and both quote characters stand for themselves.
local simple_escapes = {
   [BYTE_BSLASH] = BYTE_BSLASH,
   [BYTE_QUOTE] = BYTE_QUOTE,
   [BYTE_DQUOTE] = BYTE_DQUOTE
}

-- Single-letter escapes for control characters.
for letter, control_char in pairs({
   a = "\a", b = "\b", f = "\f", n = "\n", r = "\r", t = "\t", v = "\v"
}) do
   simple_escapes[sbyte(letter)] = sbyte(control_char)
end

-- Advances state.offset by one and returns whatever state.src:get_codepoint
-- yields for the new offset.
local function next_byte(state)
   state.offset = state.offset + 1
   return state.src:get_codepoint(state.offset)
end

-- Skipping helpers.
-- Take the current character, skip something, return next character.

-- Skips a line break starting at state.offset; `newline` is its first byte.
-- Records the length of the finished line and the start offset of the next one,
-- increments state.line and returns the first byte of the new line.
local function skip_newline(state, newline)
   local newline_offset = state.offset
   local b = next_byte(state)

   -- A different newline byte right after the first one ("\r\n" or "\n\r")
   -- is part of the same line break.
   if is_newline(b) and b ~= newline then
      b = next_byte(state)
   end

   local finished_line = state.line
   local next_line = finished_line + 1
   local line_offsets = state.line_offsets

   state.line_lengths[finished_line] = newline_offset - line_offsets[finished_line]
   state.line = next_line
   line_offsets[next_line] = state.offset
   return b
end

-- Starting from byte `b`, advances until a newline byte or end of input.
-- Returns the byte it stopped at (nil at end of input).
local function skip_to_newline(state, b)
   while b and not is_newline(b) do
      b = next_byte(state)
   end

   return b
end

-- Starting from byte `b`, skips whitespace and line breaks, keeping line
-- bookkeeping up to date. Returns the first byte that is neither.
local function skip_space(state, b)
   while true do
      if is_newline(b) then
         b = skip_newline(state, b)
      elseif is_space(b) then
         b = next_byte(state)
      else
         return b
      end
   end
end

-- Skips the "[" or "]" at state.offset together with the "=" signs after it.
-- Returns the byte following the "=" run and the number of "=" signs skipped.
local function skip_long_bracket(state)
   local level = 0
   local b = next_byte(state)

   while b == BYTE_EQ do
      level = level + 1
      b = next_byte(state)
   end

   return b, level
end

-- Token handlers.

-- Lexes the body of a long string or long comment.
-- Must be called once the opening "[=*" has been skipped, with state.offset
-- on the second "[".
-- `opening_long_bracket` is the number of "=" in the opening bracket,
-- `token` is the token type to return on success.
-- Returns the token and its contents with every line break normalized to "\n",
-- or nil and an error message when the input ends before the closing bracket.
local function lex_long_string(state, opening_long_bracket, token)
   local src = state.src
   local b = next_byte(state)

   -- A line break directly after the opening bracket is not part of the contents.
   if is_newline(b) then
      b = skip_newline(state, b)
   end

   local lines = {}
   local line_start = state.offset
   local closed = false

   repeat
      if b == nil then
         if token == "string" then
            return nil, "unfinished long string"
         end

         return nil, "unfinished long comment"
      elseif is_newline(b) then
         -- Store the line that just ended, without its line break.
         lines[#lines+1] = src:get_substring(line_start, state.offset-1)
         b = skip_newline(state, b)
         line_start = state.offset
      elseif b == BYTE_CBRACK then
         -- Possible closing bracket: it has to have the same number of "=".
         local closing_long_bracket
         b, closing_long_bracket = skip_long_bracket(state)
         closed = b == BYTE_CBRACK and closing_long_bracket == opening_long_bracket
      else
         b = next_byte(state)
      end
   until closed

   -- state.offset is on the final "]"; the last line ends right before "]=*]".
   lines[#lines+1] = src:get_substring(line_start, state.offset-opening_long_bracket-2)
   state.offset = state.offset + 1
   return token, tconcat(lines, "\n")
end

-- Lexes the rest of a "\xXX" escape; state.offset must point at the "x".
-- On success returns the byte following the escape and the one-byte string it
-- evaluates to. On failure returns nil, nil, an error message and the start of
-- the escape as a negative offset relative to state.offset.
local function lex_hex_escape(state)
   local b = next_byte(state)  -- Skip "x".
   -- Exactly two hexadecimal digits.
   local c1 = b and to_hex(b)

   if not c1 then
      return nil, nil, "invalid hexadecimal escape sequence", -2
   end

   b = next_byte(state)
   local c2 = b and to_hex(b)

   if not c2 then
      return nil, nil, "invalid hexadecimal escape sequence", -3
   end

   return next_byte(state), schar(c1*16 + c2)
end

-- Lexes the rest of a "\u{XXX}" escape; state.offset must point at the "u".
-- On success returns the byte following the escape and the UTF-8 encoding of
-- the code point. On failure returns nil, nil, an error message and the start
-- of the escape as a negative offset relative to state.offset.
local function lex_utf8_escape(state)
   local b = next_byte(state)  -- Skip "u".

   if b ~= BYTE_OBRACE then
      return nil, nil, "invalid UTF-8 escape sequence", -2
   end

   b = next_byte(state)  -- Skip "{".

   -- There should be at least one digit.
   local codepoint = b and to_hex(b)

   if not codepoint then
      return nil, nil, "invalid UTF-8 escape sequence", -3
   end

   -- Number of digits consumed after the first one; needed to locate the backslash.
   local hexdigits = 0

   while true do
      b = next_byte(state)
      local hex = b and to_hex(b)

      if not hex then
         break
      end

      hexdigits = hexdigits + 1
      codepoint = codepoint*16 + hex

      if codepoint > 0x7FFFFFFF then
         -- UTF-8 value too large.
         return nil, nil, "invalid UTF-8 escape sequence", -hexdigits-3
      end
   end

   if b ~= BYTE_CBRACE then
      return nil, nil, "invalid UTF-8 escape sequence", -hexdigits-4
   end

   -- Skip "}".
   return next_byte(state), to_utf(codepoint)
end

-- Lexes a decimal escape ("\d", "\dd" or "\ddd"); `b` is the byte right after
-- the backslash and state.offset points at it.
-- On success returns the byte following the escape and the one-byte string it
-- evaluates to. On failure returns nil, nil, an error message and the start of
-- the escape as a negative offset relative to state.offset.
local function lex_decimal_escape(state, b)
   local value = b and to_dec(b)

   if not value then
      -- Not a digit either, so this is not any known escape sequence.
      return nil, nil, "invalid escape sequence", -1
   end

   -- Up to three decimal digits.
   b = next_byte(state)
   local digit = b and to_dec(b)

   if digit then
      value = 10*value + digit
      b = next_byte(state)
      digit = b and to_dec(b)

      if digit then
         value = 10*value + digit

         -- Only a three-digit escape can exceed the byte range.
         if value > 255 then
            return nil, nil, "invalid decimal escape sequence", -3
         end

         b = next_byte(state)
      end
   end

   return b, schar(value)
end

-- Lexes a short (quoted) string; state.offset must point at the opening quote
-- and `quote` is the byte of that quote.
-- On success returns "string" and the string value with escape sequences
-- evaluated, leaving state.offset right after the closing quote.
-- On error returns nil, an error message and, for invalid escape sequences,
-- the start of the escape as a negative offset relative to state.offset.
local function lex_short_string(state, quote)
   local b = next_byte(state)
   local chunks  -- Buffer is only required when there are escape sequences.
   local chunk_start = state.offset

   while b ~= quote do
      if b == BYTE_BSLASH then
         -- Escape sequence. Init the buffer on the first one.
         chunks = chunks or {}

         -- Put previous chunk into buffer.
         if chunk_start ~= state.offset then
            chunks[#chunks+1] = state.src:get_substring(chunk_start, state.offset-1)
         end

         b = next_byte(state)

         -- The final string the escape sequence evaluates to, and error details
         -- reported by the multi-character escape helpers.
         local s, err, err_offset

         local escape_byte = simple_escapes[b]

         if escape_byte then  -- Is it a simple escape sequence?
            b = next_byte(state)
            s = schar(escape_byte)
         elseif is_newline(b) then
            -- An escaped line break always evaluates to "\n".
            b = skip_newline(state, b)
            s = "\n"
         elseif b == BYTE_x then
            b, s, err, err_offset = lex_hex_escape(state)
         elseif b == BYTE_u then
            b, s, err, err_offset = lex_utf8_escape(state)
         elseif b == BYTE_z then
            -- Zap following span of spaces; contributes nothing to the value.
            b = skip_space(state, next_byte(state))
         else
            -- Must be a decimal escape.
            b, s, err, err_offset = lex_decimal_escape(state, b)
         end

         if err then
            return nil, err, err_offset
         end

         if s then
            chunks[#chunks+1] = s
         end

         -- Next chunk starts after escape sequence.
         chunk_start = state.offset
      elseif b == nil or is_newline(b) then
         -- End of input or an unescaped line break before the closing quote.
         return nil, "unfinished string"
      else
         b = next_byte(state)
      end
   end

   -- Offset now points at the closing quote.
   local string_value

   if chunks then
      -- Put last chunk into buffer.
      if chunk_start ~= state.offset then
         chunks[#chunks+1] = state.src:get_substring(chunk_start, state.offset-1)
      end

      string_value = tconcat(chunks)
   else
      -- There were no escape sequences.
      string_value = state.src:get_substring(chunk_start, state.offset-1)
   end

   -- Skip the closing quote.
   state.offset = state.offset + 1
   return "string", string_value
end

-- Lexes a numeric literal; `b` is its first byte and state.offset points at it.
-- Accepts decimal and hexadecimal ("0x"/"0X") literals with optional fractional
-- and exponent parts, and the LuaJIT suffixes "i" (complex), "ll" (int64_t) and
-- "ull"/"llu" (uint64_t) in any letter case.
-- Payload for a number is simply a substring.
-- Luacheck is supposed to be forward-compatible with Lua 5.3 and LuaJIT syntax, so
--    parsing it into actual number may be problematic.
-- It is not needed currently anyway as Luacheck does not do static evaluation yet.
-- Returns "number" and the literal's source text, or nil and "malformed number".
local function lex_number(state, b)
   local start = state.offset

   -- Decimal settings by default; switched below when a hexadecimal prefix is found.
   local exp_lower, exp_upper = BYTE_e, BYTE_E
   local is_digit = to_dec
   local has_digits = false
   local is_float = false

   if b == BYTE_0 then
      b = next_byte(state)

      if b == BYTE_x or b == BYTE_X then
         -- Hexadecimal literal: hex digits and a "p"/"P" exponent marker.
         exp_lower, exp_upper = BYTE_p, BYTE_P
         is_digit = to_hex
         b = next_byte(state)
      else
         -- The leading "0" itself counts as a digit.
         has_digits = true
      end
   end

   -- Integer part.
   while b ~= nil and is_digit(b) do
      b = next_byte(state)
      has_digits = true
   end

   if b == BYTE_DOT then
      -- Fractional part.
      is_float = true
      b = next_byte(state)  -- Skip dot.

      while b ~= nil and is_digit(b) do
         b = next_byte(state)
         has_digits = true
      end
   end

   if b == exp_lower or b == exp_upper then
      -- Exponent part.
      is_float = true
      b = next_byte(state)

      -- Skip optional sign.
      if b == BYTE_PLUS or b == BYTE_DASH then
         b = next_byte(state)
      end

      -- Exponent consists of one or more decimal digits.
      if b == nil or not to_dec(b) then
         return nil, "malformed number"
      end

      repeat
         b = next_byte(state)
      until b == nil or not to_dec(b)
   end

   -- Neither the integer nor the fractional part contained a digit.
   if not has_digits then
      return nil, "malformed number"
   end

   -- Is it cdata literal?
   if b == BYTE_i or b == BYTE_I then
      -- It is complex literal. Skip "i" or "I".
      state.offset = state.offset + 1
   else
      -- uint64_t and int64_t literals can not be fractional.
      if not is_float then
         -- The suffix is only consumed when it is complete; otherwise the offset
         -- stays where it is and the number token ends before the letters.
         if b == BYTE_u or b == BYTE_U then
            -- It may be uint64_t literal.
            local b1 = state.src:get_codepoint(state.offset+1)

            if b1 == BYTE_l or b1 == BYTE_L then
               local b2 = state.src:get_codepoint(state.offset+2)

               if b2 == BYTE_l or b2 == BYTE_L then
                  -- It is uint64_t literal.
                  state.offset = state.offset + 3
               end
            end
         elseif b == BYTE_l or b == BYTE_L then
            -- It may be uint64_t or int64_t literal.
            local b1 = state.src:get_codepoint(state.offset+1)

            if b1 == BYTE_l or b1 == BYTE_L then
               local b2 = state.src:get_codepoint(state.offset+2)

               if b2 == BYTE_u or b2 == BYTE_U then
                  -- It is uint64_t literal.
                  state.offset = state.offset + 3
               else
                  -- It is int64_t literal.
                  state.offset = state.offset + 2
               end
            end
         end
      end
   end

   -- state.offset now points at the first byte after the literal.
   return "number", state.src:get_substring(start, state.offset-1)
end

-- Lexes an identifier or keyword starting at state.offset.
-- Returns the keyword itself as the token for keywords,
-- or "name" and the identifier otherwise.
local function lex_ident(state)
   local first = state.offset
   local b

   -- Consume letters, underscores and digits following the first byte.
   repeat
      b = next_byte(state)
   until b == nil or not (is_alpha(b) or to_dec(b))

   local word = state.src:get_substring(first, state.offset-1)

   if keywords[word] then
      return word
   end

   return "name", word
end

-- Lexes a token starting with "-": the minus operator or a comment.
-- Returns "-", or "short_comment" with the text after "--", or whatever
-- lex_long_string returns for a long comment.
local function lex_dash(state)
   if next_byte(state) ~= BYTE_DASH then
      -- A single dash is the operator.
      return "-"
   end

   -- Two dashes start a comment; its text begins right after them.
   local b = next_byte(state)
   local comment_start = state.offset

   if b == BYTE_OBRACK then
      -- Could be a long comment, but only if "[=*" is followed by another "[".
      local long_bracket
      b, long_bracket = skip_long_bracket(state)

      if b == BYTE_OBRACK then
         return lex_long_string(state, long_bracket, "long_comment")
      end
   end

   -- Short comment: runs to the end of the line.
   skip_to_newline(state, b)
   return "short_comment", state.src:get_substring(comment_start, state.offset - 1)
end

-- Lexes a token starting with "[": the bracket itself or a long string.
-- Returns nil and an error message for "[=" sequences not followed by "[".
local function lex_bracket(state)
   local b, long_bracket = skip_long_bracket(state)

   if b == BYTE_OBRACK then
      return lex_long_string(state, long_bracket, "string")
   end

   if long_bracket ~= 0 then
      return nil, "invalid long string delimiter"
   end

   return "["
end

-- Lexes "=" or "==".
local function lex_eq(state)
   if next_byte(state) ~= BYTE_EQ then
      return "="
   end

   -- Step over the second "=".
   state.offset = state.offset + 1
   return "=="
end

-- Lexes "<", "<=" or "<<".
local function lex_lt(state)
   local second = next_byte(state)

   if second ~= BYTE_EQ and second ~= BYTE_LT then
      return "<"
   end

   -- Step over the second character of the operator.
   state.offset = state.offset + 1
   return second == BYTE_EQ and "<=" or "<<"
end

-- Lexes ">", ">=" or ">>".
local function lex_gt(state)
   local second = next_byte(state)

   if second ~= BYTE_EQ and second ~= BYTE_GT then
      return ">"
   end

   -- Step over the second character of the operator.
   state.offset = state.offset + 1
   return second == BYTE_EQ and ">=" or ">>"
end

-- Lexes "/" or "//".
local function lex_div(state)
   if next_byte(state) ~= BYTE_SLASH then
      return "/"
   end

   -- Step over the second "/".
   state.offset = state.offset + 1
   return "//"
end

-- Lexes "~" or "~=".
local function lex_ne(state)
   if next_byte(state) ~= BYTE_EQ then
      return "~"
   end

   -- Step over the "=".
   state.offset = state.offset + 1
   return "~="
end

-- Lexes ":" or "::".
local function lex_colon(state)
   if next_byte(state) ~= BYTE_COLON then
      return ":"
   end

   -- Step over the second ":".
   state.offset = state.offset + 1
   return "::"
end

-- Lexes a token starting with ".": ".", "..", "..." (returned with the value
-- "...") or a number literal that begins with its fractional part.
local function lex_dot(state)
   local b = next_byte(state)

   if b ~= BYTE_DOT then
      if b and to_dec(b) then
         -- A digit follows: rewind to just before the dot and lex a number,
         -- passing it the dot as the first byte.
         state.offset = state.offset - 2
         return lex_number(state, next_byte(state))
      end

      return "."
   end

   if next_byte(state) ~= BYTE_DOT then
      return ".."
   end

   -- Step over the third dot.
   state.offset = state.offset + 1
   return "...", "..."
end

-- Fallback handler: consumes one character and returns it as the token.
-- Values above 255 are reported as the character with code 255.
local function lex_any(state, b)
   state.offset = state.offset + 1
   return schar(b > 255 and 255 or b)
end

-- Dispatch table from the first byte of a token to the function lexing it.
-- Each handler is called with the lexer state and the first byte of the token.
-- Each handler stops at the character after the token and returns the token and,
--    optionally, a value associated with the token.
-- On error a handler returns nil, an error message and, optionally, the start
--    of the reported location as a negative offset.
local byte_handlers = {}

-- Punctuation, operators and string delimiters.
byte_handlers[BYTE_DOT] = lex_dot
byte_handlers[BYTE_COLON] = lex_colon
byte_handlers[BYTE_OBRACK] = lex_bracket
byte_handlers[BYTE_QUOTE] = lex_short_string
byte_handlers[BYTE_DQUOTE] = lex_short_string
byte_handlers[BYTE_DASH] = lex_dash
byte_handlers[BYTE_SLASH] = lex_div
byte_handlers[BYTE_EQ] = lex_eq
byte_handlers[BYTE_NE] = lex_ne
byte_handlers[BYTE_LT] = lex_lt
byte_handlers[BYTE_GT] = lex_gt

-- Identifiers start with an underscore or a letter of either case.
byte_handlers[BYTE_LDASH] = lex_ident

for lower = BYTE_a, BYTE_z do
   byte_handlers[lower] = lex_ident
end

for upper = BYTE_A, BYTE_Z do
   byte_handlers[upper] = lex_ident
end

-- Numbers start with a decimal digit.
for digit = BYTE_0, BYTE_9 do
   byte_handlers[digit] = lex_number
end

-- Creates and returns a lexer state positioned at the start of `src`.
-- `line_offsets` and `line_lengths` are optional tables to fill in while
-- lexing; fresh tables are used when they are omitted.
-- If the source starts with "#!", the state is advanced to the end of that line.
function lexer.new_state(src, line_offsets, line_lengths)
   line_offsets = line_offsets or {}
   line_offsets[1] = 1

   local state = {
      src = src,
      line = 1,
      line_offsets = line_offsets,
      line_lengths = line_lengths or {},
      offset = 1
   }

   local has_shebang = src:get_length() >= 2 and src:get_substring(1, 2) == "#!"

   if has_shebang then
      -- Skip shebang line, starting from the byte after "#!".
      state.offset = 2
      skip_to_newline(state, next_byte(state))
   end

   return state
end

-- Returns the printable form of the source between `offset` and `end_offset`,
-- wrapped in single quotes. When the length of `line` is already known, the
-- range is cut off at the end of that line.
function lexer.get_quoted_substring_or_line(state, line, offset, end_offset)
   local last_offset = end_offset
   local line_length = state.line_lengths[line]

   if line_length then
      local line_end_offset = state.line_offsets[line] + line_length - 1

      if last_offset > line_end_offset then
         last_offset = line_end_offset
      end
   end

   local printable = state.src:get_printable_substring(offset, last_offset)
   return "'" .. printable .. "'"
end

-- Lexes the next token starting from state.line, state.offset.
-- Returns the token, its value and its location (line, offset); for tokens
--    produced by a handler a fifth value, false, follows.
-- Sets state.line, state.offset to token end location + 1.
-- Fills state.line_offsets and state.line_lengths.
-- On error returns nil, error message, error location (line, offset), error end offset.
function lexer.next_token(state)
   local b = skip_space(state, state.src:get_codepoint(state.offset))

   -- Remember where the token starts.
   local token_line = state.line
   local token_offset = state.offset

   if not b then
      -- End of input. The EOF token has length 1, and the last line ends here.
      state.offset = token_offset + 1
      state.line_lengths[token_line] = token_offset - state.line_offsets[token_line]
      return "eof", nil, token_line, token_offset
   end

   local handler = byte_handlers[b] or lex_any
   local token, token_value, relative_error_offset = handler(state, b)

   if not relative_error_offset then
      -- Regular tokens, and errors without an explicit location: those are
      -- reported at the token start, with an end offset equal to the start.
      return token, token_value, token_line, token_offset, not token and token_offset
   end

   -- The handler located the error relative to the current offset.
   local error_line = state.line
   local error_offset = state.offset + relative_error_offset
   local error_end_offset = math.min(state.offset, state.src:get_length())
   local quoted = lexer.get_quoted_substring_or_line(state, error_line, error_offset, error_end_offset)
   return nil, token_value .. " " .. quoted, error_line, error_offset, error_end_offset
end

return lexer