local decoder = require "luacheck.decoder"
local lexer = require "luacheck.lexer"

-- Decodes raw source bytes with luacheck.decoder and wraps the result in a fresh lexer state.
local function new_state_from_source_bytes(bytes)
   return lexer.new_state(decoder.decode(bytes))
end

-- Lexes `source` and returns an array of every token up to and including "eof".
-- Each entry has the fields `token`, `token_value`, `line` and `offset`,
-- filled from the first four results of lexer.next_token.
local function get_tokens(source)
   local lexer_state = new_state_from_source_bytes(source)
   local tokens = {}

   repeat
      local token = {}
      token.token, token.token_value, token.line, token.offset = lexer.next_token(lexer_state)
      tokens[#tokens+1] = token
   until token.token == "eof"

   return tokens
end

-- Returns only the first token of `source`, as a table with `token` and `token_value`.
-- Location info is deliberately dropped so tests can compare against small literals.
local function get_token(source)
   local lexer_state = new_state_from_source_bytes(source)
   local token = {}
   token.token, token.token_value = lexer.next_token(lexer_state)
   return token
end

-- Advances the lexer by one token. When the first result of lexer.next_token is falsy,
-- returns a table describing the error (`msg`, `line`, `offset`, `end_offset`);
-- otherwise returns false.
local function maybe_error(lexer_state)
   local ok, msg, line, offset, end_offset = lexer.next_token(lexer_state)
   return not ok and {msg = msg, line = line, offset = offset, end_offset = end_offset}
end

-- Returns the error produced by the very first token of `source`, or false if it lexes fine.
local function get_error(source)
   return maybe_error(new_state_from_source_bytes(source))
end

-- Keeps lexing `source` until an error shows up and returns that error.
-- NOTE(review): assumes `source` contains a lexing error; termination for valid
-- sources depends on what lexer.next_token does after "eof" -- confirm before reusing.
local function get_last_error(source)
   local lexer_state = new_state_from_source_bytes(source)
   local err

   repeat
      err = maybe_error(lexer_state)
   until err

   return err
end

describe("lexer", function()
   it("parses EOS correctly", function()
      assert.same({token = "eof"}, get_token(" "))
   end)

   it("parses names correctly", function()
      assert.same({token = "name", token_value = "foo"}, get_token("foo"))
      assert.same({token = "name", token_value = "_"}, get_token("_"))
      assert.same({token = "name", token_value = "foo1_2"}, get_token("foo1_2"))
      assert.same({token = "name", token_value = "foo"}, get_token("foo!"))
   end)

   it("parses keywords correctly", function()
      assert.same({token = "do"}, get_token("do"))
      assert.same({token = "goto"}, get_token("goto fail;"))
   end)

   it("parses operators and special tokens correctly", function()
      assert.same({token = "="}, get_token("= ="))
      assert.same({token = "=="}, get_token("=="))
      assert.same({token = "<"}, get_token("< ="))
      assert.same({token = "<="}, get_token("<="))
      assert.same({token = "<<"}, get_token("<<"))
      assert.same({token = ">"}, get_token("> ="))
      assert.same({token = ">="}, get_token(">="))
      assert.same({token = ">>"}, get_token(">>"))
      assert.same({token = "/"}, get_token("/ /"))
      assert.same({token = "//"}, get_token("//"))
      assert.same({token = "."}, get_token(".?."))
      assert.same({token = "."}, get_token("."))
      assert.same({token = ".."}, get_token("..%"))
      assert.same({token = "...", token_value = "..."}, get_token("..."))
      assert.same({token = ":"}, get_token(":.:"))
      assert.same({token = "::"}, get_token("::."))
   end)

   it("parses single character tokens correctly", function()
      assert.same({token = "("}, get_token("(("))
      assert.same({token = "["}, get_token("[x]"))
      assert.same({token = "$"}, get_token("$$$"))
   end)

   describe("when parsing short strings", function()
      it("parses empty short strings correctly", function()
         assert.same({token = "string", token_value = ""}, get_token([[""]]))
         assert.same({token = "string", token_value = ""}, get_token([['']]))
      end)

      it("parses short strings containing quotation marks correctly", function()
         assert.same({token = "string", token_value = "'"}, get_token([["'"]]))
         assert.same({token = "string", token_value = '"'}, get_token([['"']]))
      end)

      it("parses simple short strings correctly", function()
         assert.same({token = "string", token_value = "foo"}, get_token([["foo"]]))
      end)

      it("parses simple escape sequences correctly", function()
         assert.same({token = "string", token_value = "\r\n"}, get_token([["\r\n"]]))
         assert.same({token = "string", token_value = "foo\\bar"}, get_token([["foo\\bar"]]))
         assert.same({token = "string", token_value = "a\'\'b\"\""}, get_token([["a\'\'b\"\""]]))
      end)

      it("parses escaped newline correctly", function()
         -- Continuation lines start at column 0 so no indentation leaks into the string value.
         assert.same({token = "string", token_value = "foo \nbar"}, get_token([["foo \
bar"]]))
         assert.same({token = "string", token_value = "foo \n\n\nbar"}, get_token([["foo \
\
\
bar"]]))
      end)

      it("parses \\z correctly", function()
         assert.same({token = "string", token_value = "foo "}, get_token([["foo \z"]]))
         assert.same({token = "string", token_value = "foo bar"}, get_token([["foo \zbar"]]))
         assert.same({token = "string", token_value = "foo bar"}, get_token([["foo \z bar"]]))
         -- luacheck: ignore 613
         assert.same({token = "string", token_value = "foo bar"}, get_token([["foo \z

         bar\z   "]]))
      end)

      it("parses decimal escape sequences correctly", function()
         assert.same({token = "string", token_value = "\0buffer exploit"}, get_token([["\0buffer exploit"]]))
         assert.same({token = "string", token_value = "foo bar"}, get_token([["foo b\97r"]]))
         assert.same({token = "string", token_value = "\1234"}, get_token([["\1234"]]))
         assert.same(
            {line = 1, offset = 2, end_offset = 5, msg = "invalid decimal escape sequence '\\300'"},
            get_error([["\300"]])
         )
         assert.same({line = 1, offset = 2, end_offset = 2, msg = "invalid escape sequence '\\'"}, get_error([["\]]))
      end)

      it("parses hexadecimal escape sequences correctly", function()
         assert.same({token = "string", token_value = "\0buffer exploit"}, get_token([["\x00buffer exploit"]]))
         assert.same({token = "string", token_value = "foo bar"}, get_token([["foo\x20bar"]]))
         assert.same({token = "string", token_value = "jj"}, get_token([["\x6a\x6A"]]))
         assert.same(
            {line = 1, offset = 2, end_offset = 3, msg = "invalid escape sequence '\\X'"},
            get_error([["\XFF"]])
         )
         assert.same(
            {line = 1, offset = 2, end_offset = 4, msg = "invalid hexadecimal escape sequence '\\x\"'"},
            get_error([["\x"]])
         )
         assert.same(
            {line = 1, offset = 2, end_offset = 5, msg = "invalid hexadecimal escape sequence '\\x1\"'"},
            get_error([["\x1"]])
         )
         assert.same(
            {line = 1, offset = 2, end_offset = 4, msg = "invalid hexadecimal escape sequence '\\x1'"},
            get_error([["\x1]])
         )
         assert.same(
            {line = 1, offset = 2, end_offset = 4, msg = "invalid hexadecimal escape sequence '\\xx'"},
            get_error([["\xxx"]])
         )
      end)

      it("parses utf-8 escape sequences correctly", function()
         assert.same({token = "string", token_value = "\0\0"},
            get_token([["\u{0}\u{00000000}"]]))
         assert.same({token = "string", token_value = "\0\127"},
            get_token([["\u{0}\u{7F}"]]))
         assert.same({token = "string", token_value = "\194\128\223\191"},
            get_token([["\u{80}\u{7fF}"]]))
         assert.same({token = "string", token_value = "\224\160\128\239\191\191"},
            get_token([["\u{800}\u{FFFF}"]]))
         assert.same({token = "string", token_value = "\240\144\128\128\244\143\191\191"},
            get_token([["\u{10000}\u{10FFFF}"]]))
         assert.same(
            {line = 1, offset = 2, end_offset = 13, msg = "invalid UTF-8 escape sequence '\\u{110000000'"},
            get_error([["\u{110000000}"]])
         )
         assert.same(
            {line = 1, offset = 2, end_offset = 4, msg = "invalid UTF-8 escape sequence '\\u\"'"},
            get_error([["\u"]])
         )
         assert.same(
            {line = 1, offset = 2, end_offset = 4, msg = "invalid UTF-8 escape sequence '\\un'"},
            get_error([["\unrelated"]])
         )
         assert.same(
            {line = 1, offset = 2, end_offset = 7, msg = "invalid UTF-8 escape sequence '\\u{11u'"},
            get_error([["\u{11unrelated"]])
         )
         assert.same(
            {line = 1, offset = 2, end_offset = 6, msg = "invalid UTF-8 escape sequence '\\u{11'"},
            get_error([["\u{11]])
         )
         assert.same(
            {line = 1, offset = 2, end_offset = 5, msg = "invalid UTF-8 escape sequence '\\u{u'"},
            get_error([["\u{unrelated}"]])
         )
         assert.same(
            {line = 1, offset = 2, end_offset = 4, msg = "invalid UTF-8 escape sequence '\\u{'"},
            get_error([["\u{]])
         )
      end)

      it("detects unknown escape sequences", function()
         assert.same({line = 1, offset = 2, end_offset = 3, msg = "invalid escape sequence '\\c'"}, get_error([["\c"]]))
      end)

      it("detects unfinished strings", function()
         assert.same({line = 1, offset = 1, end_offset = 1, msg = "unfinished string"}, get_error([["]]))
         assert.same({line = 1, offset = 1, end_offset = 1, msg = "unfinished string"}, get_error([["']]))
         assert.same({line = 1, offset = 1, end_offset = 1, msg = "unfinished string"}, get_error([["
"]]))
      end)
   end)

   describe("when parsing long strings", function()
      it("parses empty long strings correctly", function()
         assert.same({token = "string", token_value = ""}, get_token("[[]]"))
         assert.same({token = "string", token_value = ""}, get_token("[===[]===]"))
      end)

      it("parses simple long strings correctly", function()
         assert.same({token = "string", token_value = "foo"}, get_token("[[foo]]"))
         assert.same({token = "string", token_value = "'foo'\n'bar'\n"}, get_token("[===['foo'\n'bar'\n]===]"))
      end)

      it("skips first newline", function()
         assert.same({token = "string", token_value = ""}, get_token("[[\n]]"))
         assert.same({token = "string", token_value = "\n"}, get_token("[===[\n\n]===]"))
      end)

      it("ignores closing brackets of unrelated length", function()
         assert.same({token = "string", token_value = "]=] "}, get_token("[[]=] ]]"))
         assert.same({token = "string", token_value = "foo]]\n]=== ]]"}, get_token("[===[foo]]\n]=== ]]]===]"))
      end)

      it("detects invalid opening brackets", function()
         assert.same({line = 1, offset = 1, end_offset = 1, msg = "invalid long string delimiter"}, get_error("[="))
         assert.same({line = 1, offset = 1, end_offset = 1, msg = "invalid long string delimiter"}, get_error("[=|"))
      end)

      it("detects unfinished long strings", function()
         assert.same({line = 1, offset = 1, end_offset = 1, msg = "unfinished long string"}, get_error("[=[\n"))
         assert.same({line = 1, offset = 1, end_offset = 1, msg = "unfinished long string"}, get_error("[[]"))
      end)
   end)

   describe("when parsing numbers", function()
      it("parses decimal integers correctly", function()
         assert.same({token = "number", token_value = "0"}, get_token("0"))
         assert.same({token = "number", token_value = "123456789"}, get_token("123456789"))
      end)

      it("parses hexadecimal integers correctly", function()
         assert.same({token = "number", token_value = "0x0"}, get_token("0x0"))
         assert.same({token = "number", token_value = "0X0"}, get_token("0X0"))
         assert.same({token = "number", token_value = "0xFfab"}, get_token("0xFfab"))
         assert.same({line = 1, offset = 1, end_offset = 1, msg = "malformed number"}, get_error("0x"))
      end)

      it("parses decimal floats correctly", function()
         assert.same({token = "number", token_value = "0.0"}, get_token("0.0"))
         assert.same({token = "number", token_value = "0."}, get_token("0."))
         assert.same({token = "number", token_value = ".1234"}, get_token(".1234"))
      end)

      it("parses hexadecimal floats correctly", function()
         assert.same({token = "number", token_value = "0xf.A"}, get_token("0xf.A"))
         assert.same({token = "number", token_value = "0x9."}, get_token("0x9."))
         assert.same({token = "number", token_value = "0x.b"}, get_token("0x.b"))
         assert.same({line = 1, offset = 1, end_offset = 1, msg = "malformed number"}, get_error("0x."))
      end)

      it("parses decimal floats with exponent correctly", function()
         assert.same({token = "number", token_value = "1.8e1"}, get_token("1.8e1"))
         assert.same({token = "number", token_value = ".8e-1"}, get_token(".8e-1"))
         assert.same({token = "number", token_value = "1.E+20"}, get_token("1.E+20"))
         assert.same({line = 1, offset = 1, end_offset = 1, msg = "malformed number"}, get_error("1.8e"))
         assert.same({line = 1, offset = 1, end_offset = 1, msg = "malformed number"}, get_error("1.8e-"))
         assert.same({line = 1, offset = 1, end_offset = 1, msg = "malformed number"}, get_error("1.8E+"))
         assert.same({line = 1, offset = 1, end_offset = 1, msg = "malformed number"}, get_error("1.8ee"))
         assert.same({line = 1, offset = 1, end_offset = 1, msg = "malformed number"}, get_error("1.8e-e"))
         assert.same({line = 1, offset = 1, end_offset = 1, msg = "malformed number"}, get_error("1.8E+i"))
      end)

      it("parses hexadecimal floats with exponent correctly", function()
         assert.same({token = "number", token_value = "0x1.8p1"}, get_token("0x1.8p1"))
         assert.same({token = "number", token_value = "0x.8P-1"}, get_token("0x.8P-1"))
         assert.same({token = "number", token_value = "0x1.p+20"}, get_token("0x1.p+20"))
         assert.same({line = 1, offset = 1, end_offset = 1, msg = "malformed number"}, get_error("0x1.8p"))
         assert.same({line = 1, offset = 1, end_offset = 1, msg = "malformed number"}, get_error("0x1.8p-"))
         assert.same({line = 1, offset = 1, end_offset = 1, msg = "malformed number"}, get_error("0x1.8P+"))
         assert.same({line = 1, offset = 1, end_offset = 1, msg = "malformed number"}, get_error("0x1.8pF"))
         assert.same({line = 1, offset = 1, end_offset = 1, msg = "malformed number"}, get_error("0x1.8p-F"))
         assert.same({line = 1, offset = 1, end_offset = 1, msg = "malformed number"}, get_error("0x1.8p+LL"))
         assert.same({line = 1, offset = 1, end_offset = 1, msg = "malformed number"}, get_error("0x.p1"))
      end)

      it("parses 64 bits cdata literals correctly", function()
         assert.same({token = "number", token_value = "1LL"}, get_token("1LL"))
         assert.same({token = "number", token_value = "1ll"}, get_token("1ll"))
         assert.same({token = "number", token_value = "1Ll"}, get_token("1Ll"))
         assert.same({token = "number", token_value = "1lL"}, get_token("1lL"))
         assert.same({token = "number", token_value = "1ULL"}, get_token("1ULL"))
         assert.same({token = "number", token_value = "1uLl"}, get_token("1uLl"))
         assert.same({token = "number", token_value = "1LLu"}, get_token("1LLu"))
         assert.same({token = "number", token_value = "1"}, get_token("1L"))
         assert.same({token = "number", token_value = "1LL"}, get_token("1LLG"))
         assert.same({token = "number", token_value = "1"}, get_token("1LUL"))
         assert.same({token = "number", token_value = "0x1LL"}, get_token("0x1LL"))
         assert.same({token = "number", token_value = "1.0"}, get_token("1.0LL"))
      end)

      it("parses complex cdata literals correctly", function()
         assert.same({token = "number", token_value = "1i"}, get_token("1i"))
         assert.same({token = "number", token_value = "1I"}, get_token("1I"))
         assert.same({token = "number", token_value = "1"}, get_token("1j"))
         assert.same({token = "number", token_value = "1LL"}, get_token("1LLi"))
         assert.same({token = "number", token_value = "0x1i"}, get_token("0x1i"))
         assert.same({token = "number", token_value = "0x1.0i"}, get_token("0x1.0i"))
      end)
   end)

   it("parses short comments correctly", function()
      assert.same({token = "short_comment", token_value = ""}, get_token("--"))
      assert.same({token = "short_comment", token_value = "foo"}, get_token("--foo\nbar"))
      assert.same({token = "short_comment", token_value = "["}, get_token("--["))
      assert.same({token = "short_comment", token_value = "[=foo"}, get_token("--[=foo\nbar"))
   end)

   it("parses long comments correctly", function()
      assert.same({token = "long_comment", token_value = ""}, get_token("--[[]]"))
      assert.same({token = "long_comment", token_value = ""}, get_token("--[[\n]]"))
      assert.same({token = "long_comment", token_value = "foo\nbar"}, get_token("--[[foo\nbar]]"))
      assert.same({line = 1, offset = 1, end_offset = 1, msg = "unfinished long comment"}, get_error("--[=[]]"))
   end)

   -- The fixtures below are whitespace-sensitive: the expected offsets count every
   -- character (including indentation and newlines) of the long-string sources.
   it("provides correct location info", function()
      assert.same({
         {token = "local", line = 1, offset = 1},
         {token = "function", line = 1, offset = 7},
         {token = "name", token_value = "foo", line = 1, offset = 16},
         {token = "(", line = 1, offset = 19},
         {token = "name", token_value = "bar", line = 1, offset = 20},
         {token = ")", line = 1, offset = 23},
         {token = "return", line = 2, offset = 28},
         {token = "name", token_value = "bar", line = 2, offset = 35},
         {token = ":", line = 2, offset = 38},
         {token = "name", token_value = "get_foo", line = 2, offset = 39},
         {token = "string", token_value = "long string\n", line = 2, offset = 46},
         {token = "end", line = 5, offset = 66},
         {token = "short_comment", token_value = " hello", line = 6, offset = 70},
         {token = "name", token_value = "print", line = 7, offset = 79},
         {token = "string", token_value = "123\n", line = 7, offset = 85},
         {token = "short_comment", token_value = " this comment ends just before EOF", line = 10, offset = 113},
         {token = "eof", line = 10, offset = 149}
      }, get_tokens([[
local function foo(bar)
   return bar:get_foo[=[
long string
]=]
end
-- hello
print "1\z
       2\z
       3\n"
-- this comment ends just before EOF]]))
   end)

   it("provides correct location info for errors", function()
      assert.same({line = 7, offset = 79, end_offset = 80, msg = "invalid escape sequence '\\g'"}, get_last_error([[
local function foo(bar)
   return bar:get_foo[=[
long string
]=]
end

print "1\g
       2\z
       3\n"
]]))

      assert.same({line = 8, offset = 89, end_offset = 92, msg = "invalid decimal escape sequence '\\300'"},
         get_last_error([[
local function foo(bar)
   return bar:get_foo[=[
long string
]=]
end

print "1\
       2\300
       3\n"
]]))

      assert.same({line = 8, offset = 79, end_offset = 79, msg = "malformed number"}, get_last_error([[
local function foo(bar)
   return bar:get_foo[=[
long string
]=]
end

print (
0xx)
]]))

      assert.same({line = 7, offset = 77, end_offset = 77, msg = "unfinished string"}, get_last_error([[
local function foo(bar)
   return bar:get_foo[=[
long string
]=]
end

print "1\z
       2\z
       3\n
]]))
   end)

   it("parses minified source correctly", function()
      assert.same({
         {token = "name", token_value = "a", line = 1, offset = 1},
         {token = ",", line = 1, offset = 2},
         {token = "name", token_value = "b", line = 1, offset = 3},
         {token = "=", line = 1, offset = 4},
         {token = "number", token_value = "4ll", line = 1, offset = 5},
         {token = "name", token_value = "f", line = 1, offset = 8},
         {token = "=", line = 1, offset = 9},
         {token = "string", token_value = "", line = 1, offset = 10},
         {token = "function", line = 1, offset = 12},
         {token = "name", token_value = "_", line = 1, offset = 21},
         {token = "(", line = 1, offset = 22},
         {token = ")", line = 1, offset = 23},
         {token = "return", line = 1, offset = 24},
         {token = "number", token_value = "1", line = 1, offset = 31},
         {token = "or", line = 1, offset = 32},
         {token = "string", token_value = "", line = 1, offset = 34},
         {token = "end", line = 1, offset = 36},
         {token = "eof", line = 1, offset = 39}
      }, get_tokens("a,b=4llf=''function _()return 1or''end"))
   end)

   it("handles argparse sample", function()
      -- assert() surfaces the real io.open error (e.g. missing sample file) instead of
      -- an opaque "attempt to index a nil value", and the handle is closed before lexing
      -- so it is not leaked even if get_tokens raises.
      local sample = assert(io.open("spec/samples/argparse-0.2.0.lua", "rb"))
      local source = sample:read("*a")
      sample:close()
      get_tokens(source)
   end)
end)