1local decoder = require "luacheck.decoder"
2local lexer = require "luacheck.lexer"
3
--- Creates a lexer state for raw source bytes.
-- The bytes go through `decoder.decode` first and whatever it returns is
-- handed to `lexer.new_state`.
-- @param bytes source text to be lexed
-- @return the value(s) produced by `lexer.new_state`
local function new_state_from_source_bytes(bytes)
   local decode, new_state = decoder.decode, lexer.new_state
   return new_state(decode(bytes))
end
7
--- Lexes `source` to the end and returns every token, the final "eof" included.
-- Each entry is a table whose `token`, `token_value`, `line` and `offset`
-- fields hold the first four return values of `lexer.next_token`.
-- @param source source text to be lexed
-- @return array of token tables, the last one having `token == "eof"`
-- Raises an error when the lexer reports a failure (falsy first return value,
-- the same convention `maybe_error` relies on): "eof" is the only exit
-- condition, so a failure would otherwise be stored as a malformed entry while
-- the loop keeps polling the lexer.
local function get_tokens(source)
   local lexer_state = new_state_from_source_bytes(source)
   local tokens = {}

   while true do
      local token, token_value, line, offset = lexer.next_token(lexer_state)

      if not token then
         -- On failure the second return value carries the message.
         error(("unexpected lexer error at line %s, offset %s: %s"):format(
            tostring(line), tostring(offset), tostring(token_value)))
      end

      tokens[#tokens + 1] = {token = token, token_value = token_value, line = line, offset = offset}

      if token == "eof" then
         return tokens
      end
   end
end
20
--- Lexes only the first token of `source`.
-- @param source source text to be lexed
-- @return table with `token` and `token_value` set from the first two return
-- values of `lexer.next_token`; location info is intentionally left out
local function get_token(source)
   local state = new_state_from_source_bytes(source)
   local kind, value = lexer.next_token(state)
   return {token = kind, token_value = value}
end
27
--- Advances the lexer by one token and reports whether that step failed.
-- @param lexer_state state previously created by `lexer.new_state`
-- @return `false` when `lexer.next_token` returned a truthy first value,
-- otherwise a table with `msg`, `line`, `offset` and `end_offset` taken from
-- the remaining return values
local function maybe_error(lexer_state)
   local ok, msg, line, offset, end_offset = lexer.next_token(lexer_state)

   if ok then
      return false
   end

   return {msg = msg, line = line, offset = offset, end_offset = end_offset}
end
32
--- Reports the outcome of lexing the very first token of `source`.
-- @param source source text to be lexed
-- @return whatever `maybe_error` returns for a fresh lexer state: an error
-- table if the first token fails to lex, `false` otherwise
local function get_error(source)
   local lexer_state = new_state_from_source_bytes(source)
   return maybe_error(lexer_state)
end
36
--- Lexes `source` token by token and returns the first error the lexer reports.
-- @param source source text to be lexed
-- @return table with `msg`, `line`, `offset` and `end_offset` describing the
-- error, or `nil` when "eof" is reached without any error
local function get_last_error(source)
   local lexer_state = new_state_from_source_bytes(source)

   while true do
      local token, msg, line, offset, end_offset = lexer.next_token(lexer_state)

      if not token then
         return {msg = msg, line = line, offset = offset, end_offset = end_offset}
      end

      -- Stop at "eof": a source without errors would otherwise keep this loop
      -- polling the lexer past the end of input, hanging the test run instead
      -- of failing the assertion.
      if token == "eof" then
         return nil
      end
   end
end
47
48describe("lexer", function()
49   it("parses EOS correctly", function()
50      assert.same({token = "eof"}, get_token(" "))
51   end)
52
53   it("parses names correctly", function()
54      assert.same({token = "name", token_value = "foo"}, get_token("foo"))
55      assert.same({token = "name", token_value = "_"}, get_token("_"))
56      assert.same({token = "name", token_value = "foo1_2"}, get_token("foo1_2"))
57      assert.same({token = "name", token_value = "foo"}, get_token("foo!"))
58   end)
59
60   it("parses keywords correctly", function()
61      assert.same({token = "do"}, get_token("do"))
62      assert.same({token = "goto"}, get_token("goto fail;"))
63   end)
64
65   it("parses operators and special tokens correctly", function()
66      assert.same({token = "="}, get_token("= ="))
67      assert.same({token = "=="}, get_token("=="))
68      assert.same({token = "<"}, get_token("< ="))
69      assert.same({token = "<="}, get_token("<="))
70      assert.same({token = "<<"}, get_token("<<"))
71      assert.same({token = ">"}, get_token("> ="))
72      assert.same({token = ">="}, get_token(">="))
73      assert.same({token = ">>"}, get_token(">>"))
74      assert.same({token = "/"}, get_token("/ /"))
75      assert.same({token = "//"}, get_token("//"))
76      assert.same({token = "."}, get_token(".?."))
77      assert.same({token = "."}, get_token("."))
78      assert.same({token = ".."}, get_token("..%"))
79      assert.same({token = "...", token_value = "..."}, get_token("..."))
80      assert.same({token = ":"}, get_token(":.:"))
81      assert.same({token = "::"}, get_token("::."))
82   end)
83
84   it("parses single character tokens correctly", function()
85      assert.same({token = "("}, get_token("(("))
86      assert.same({token = "["}, get_token("[x]"))
87      assert.same({token = "$"}, get_token("$$$"))
88   end)
89
90   describe("when parsing short strings", function()
91      it("parses empty short strings correctly", function()
92         assert.same({token = "string", token_value = ""}, get_token([[""]]))
93         assert.same({token = "string", token_value = ""}, get_token([['']]))
94      end)
95
96      it("parses short strings containing quotation marks correctly", function()
97         assert.same({token = "string", token_value = "'"}, get_token([["'"]]))
98         assert.same({token = "string", token_value = '"'}, get_token([['"']]))
99      end)
100
101      it("parses simple short strings correctly", function()
102         assert.same({token = "string", token_value = "foo"}, get_token([["foo"]]))
103      end)
104
105      it("parses simple escape sequences correctly", function()
106         assert.same({token = "string", token_value = "\r\n"}, get_token([["\r\n"]]))
107         assert.same({token = "string", token_value = "foo\\bar"}, get_token([["foo\\bar"]]))
108         assert.same({token = "string", token_value = "a\'\'b\"\""}, get_token([["a\'\'b\"\""]]))
109      end)
110
111      it("parses escaped newline correctly", function()
112         assert.same({token = "string", token_value = "foo \nbar"}, get_token([["foo \
113bar"]]))
114         assert.same({token = "string", token_value = "foo \n\n\nbar"}, get_token([["foo \
115\
116\
117bar"]]))
118      end)
119
120      it("parses \\z correctly", function()
121         assert.same({token = "string", token_value = "foo "}, get_token([["foo \z"]]))
122         assert.same({token = "string", token_value = "foo bar"}, get_token([["foo \zbar"]]))
123         assert.same({token = "string", token_value = "foo bar"}, get_token([["foo \z bar"]]))
124         -- luacheck: ignore 613
125         assert.same({token = "string", token_value = "foo bar"}, get_token([["foo \z
126
127            bar\z "]]))
128      end)
129
130      it("parses decimal escape sequences correctly", function()
131         assert.same({token = "string", token_value = "\0buffer exploit"}, get_token([["\0buffer exploit"]]))
132         assert.same({token = "string", token_value = "foo bar"}, get_token([["foo b\97r"]]))
133         assert.same({token = "string", token_value = "\1234"}, get_token([["\1234"]]))
134         assert.same(
135            {line = 1, offset = 2, end_offset = 5, msg = "invalid decimal escape sequence '\\300'"},
136            get_error([["\300"]])
137         )
138         assert.same({line = 1, offset = 2, end_offset = 2, msg = "invalid escape sequence '\\'"}, get_error([["\]]))
139      end)
140
141      it("parses hexadecimal escape sequences correctly", function()
142         assert.same({token = "string", token_value = "\0buffer exploit"}, get_token([["\x00buffer exploit"]]))
143         assert.same({token = "string", token_value = "foo bar"}, get_token([["foo\x20bar"]]))
144         assert.same({token = "string", token_value = "jj"}, get_token([["\x6a\x6A"]]))
145         assert.same(
146            {line = 1, offset = 2, end_offset = 3, msg = "invalid escape sequence '\\X'"},
147            get_error([["\XFF"]])
148         )
149         assert.same(
150            {line = 1, offset = 2, end_offset = 4, msg = "invalid hexadecimal escape sequence '\\x\"'"},
151            get_error([["\x"]])
152         )
153         assert.same(
154            {line = 1, offset = 2, end_offset = 5, msg = "invalid hexadecimal escape sequence '\\x1\"'"},
155            get_error([["\x1"]])
156         )
157         assert.same(
158            {line = 1, offset = 2, end_offset = 4, msg = "invalid hexadecimal escape sequence '\\x1'"},
159            get_error([["\x1]])
160         )
161         assert.same(
162            {line = 1, offset = 2, end_offset = 4, msg = "invalid hexadecimal escape sequence '\\xx'"},
163            get_error([["\xxx"]])
164         )
165      end)
166
167      it("parses utf-8 escape sequences correctly", function()
168         assert.same({token = "string", token_value = "\0\0"},
169            get_token([["\u{0}\u{00000000}"]]))
170         assert.same({token = "string", token_value = "\0\127"},
171            get_token([["\u{0}\u{7F}"]]))
172         assert.same({token = "string", token_value = "\194\128\223\191"},
173            get_token([["\u{80}\u{7fF}"]]))
174         assert.same({token = "string", token_value = "\224\160\128\239\191\191"},
175            get_token([["\u{800}\u{FFFF}"]]))
176         assert.same({token = "string", token_value = "\240\144\128\128\244\143\191\191"},
177            get_token([["\u{10000}\u{10FFFF}"]]))
178         assert.same(
179            {line = 1, offset = 2, end_offset = 13, msg = "invalid UTF-8 escape sequence '\\u{110000000'"},
180            get_error([["\u{110000000}"]])
181         )
182         assert.same(
183            {line = 1, offset = 2, end_offset = 4, msg = "invalid UTF-8 escape sequence '\\u\"'"},
184            get_error([["\u"]])
185         )
186         assert.same(
187            {line = 1, offset = 2, end_offset = 4, msg = "invalid UTF-8 escape sequence '\\un'"},
188            get_error([["\unrelated"]])
189         )
190         assert.same(
191            {line = 1, offset = 2, end_offset = 7, msg = "invalid UTF-8 escape sequence '\\u{11u'"},
192            get_error([["\u{11unrelated"]])
193         )
194         assert.same(
195            {line = 1, offset = 2, end_offset = 6, msg = "invalid UTF-8 escape sequence '\\u{11'"},
196            get_error([["\u{11]])
197         )
198         assert.same(
199            {line = 1, offset = 2, end_offset = 5, msg = "invalid UTF-8 escape sequence '\\u{u'"},
200            get_error([["\u{unrelated}"]])
201         )
202         assert.same(
203            {line = 1, offset = 2, end_offset = 4, msg = "invalid UTF-8 escape sequence '\\u{'"},
204            get_error([["\u{]])
205         )
206      end)
207
208      it("detects unknown escape sequences", function()
209         assert.same({line = 1, offset = 2, end_offset = 3, msg = "invalid escape sequence '\\c'"}, get_error([["\c"]]))
210      end)
211
212      it("detects unfinished strings", function()
213         assert.same({line = 1, offset = 1, end_offset = 1, msg = "unfinished string"}, get_error([["]]))
214         assert.same({line = 1, offset = 1, end_offset = 1, msg = "unfinished string"}, get_error([["']]))
215         assert.same({line = 1, offset = 1, end_offset = 1, msg = "unfinished string"}, get_error([["
216"]]))
217      end)
218   end)
219
220   describe("when parsing long strings", function()
221      it("parses empty long strings correctly", function()
222         assert.same({token = "string", token_value = ""}, get_token("[[]]"))
223         assert.same({token = "string", token_value = ""}, get_token("[===[]===]"))
224      end)
225
226      it("parses simple long strings correctly", function()
227         assert.same({token = "string", token_value = "foo"}, get_token("[[foo]]"))
228         assert.same({token = "string", token_value = "'foo'\n'bar'\n"}, get_token("[===['foo'\n'bar'\n]===]"))
229      end)
230
231      it("skips first newline", function()
232         assert.same({token = "string", token_value = ""}, get_token("[[\n]]"))
233         assert.same({token = "string", token_value = "\n"}, get_token("[===[\n\n]===]"))
234      end)
235
236      it("ignores closing brackets of unrelated length", function()
237         assert.same({token = "string", token_value = "]=] "}, get_token("[[]=] ]]"))
238         assert.same({token = "string", token_value = "foo]]\n]=== ]]"}, get_token("[===[foo]]\n]=== ]]]===]"))
239      end)
240
241      it("detects invalid opening brackets", function()
242         assert.same({line = 1, offset = 1, end_offset = 1, msg = "invalid long string delimiter"}, get_error("[="))
243         assert.same({line = 1, offset = 1, end_offset = 1, msg = "invalid long string delimiter"}, get_error("[=|"))
244      end)
245
246      it("detects unfinished long strings", function()
247         assert.same({line = 1, offset = 1, end_offset = 1, msg = "unfinished long string"}, get_error("[=[\n"))
248         assert.same({line = 1, offset = 1, end_offset = 1, msg = "unfinished long string"}, get_error("[[]"))
249      end)
250   end)
251
252   describe("when parsing numbers", function()
253      it("parses decimal integers correctly", function()
254         assert.same({token = "number", token_value = "0"}, get_token("0"))
255         assert.same({token = "number", token_value = "123456789"}, get_token("123456789"))
256      end)
257
258      it("parses hexadecimal integers correctly", function()
259         assert.same({token = "number", token_value = "0x0"}, get_token("0x0"))
260         assert.same({token = "number", token_value = "0X0"}, get_token("0X0"))
261         assert.same({token = "number", token_value = "0xFfab"}, get_token("0xFfab"))
262         assert.same({line = 1, offset = 1, end_offset = 1, msg = "malformed number"}, get_error("0x"))
263      end)
264
265      it("parses decimal floats correctly", function()
266         assert.same({token = "number", token_value = "0.0"}, get_token("0.0"))
267         assert.same({token = "number", token_value = "0."}, get_token("0."))
268         assert.same({token = "number", token_value = ".1234"}, get_token(".1234"))
269      end)
270
271      it("parses hexadecimal floats correctly", function()
272         assert.same({token = "number", token_value = "0xf.A"}, get_token("0xf.A"))
273         assert.same({token = "number", token_value = "0x9."}, get_token("0x9."))
274         assert.same({token = "number", token_value = "0x.b"}, get_token("0x.b"))
275         assert.same({line = 1, offset = 1, end_offset = 1, msg = "malformed number"}, get_error("0x."))
276      end)
277
278      it("parses decimal floats with exponent correctly", function()
279         assert.same({token = "number", token_value = "1.8e1"}, get_token("1.8e1"))
280         assert.same({token = "number", token_value = ".8e-1"}, get_token(".8e-1"))
281         assert.same({token = "number", token_value = "1.E+20"}, get_token("1.E+20"))
282         assert.same({line = 1, offset = 1, end_offset = 1, msg = "malformed number"}, get_error("1.8e"))
283         assert.same({line = 1, offset = 1, end_offset = 1, msg = "malformed number"}, get_error("1.8e-"))
284         assert.same({line = 1, offset = 1, end_offset = 1, msg = "malformed number"}, get_error("1.8E+"))
285         assert.same({line = 1, offset = 1, end_offset = 1, msg = "malformed number"}, get_error("1.8ee"))
286         assert.same({line = 1, offset = 1, end_offset = 1, msg = "malformed number"}, get_error("1.8e-e"))
287         assert.same({line = 1, offset = 1, end_offset = 1, msg = "malformed number"}, get_error("1.8E+i"))
288      end)
289
290      it("parses hexadecimal floats with exponent correctly", function()
291         assert.same({token = "number", token_value = "0x1.8p1"}, get_token("0x1.8p1"))
292         assert.same({token = "number", token_value = "0x.8P-1"}, get_token("0x.8P-1"))
293         assert.same({token = "number", token_value = "0x1.p+20"}, get_token("0x1.p+20"))
294         assert.same({line = 1, offset = 1, end_offset = 1, msg = "malformed number"}, get_error("0x1.8p"))
295         assert.same({line = 1, offset = 1, end_offset = 1, msg = "malformed number"}, get_error("0x1.8p-"))
296         assert.same({line = 1, offset = 1, end_offset = 1, msg = "malformed number"}, get_error("0x1.8P+"))
297         assert.same({line = 1, offset = 1, end_offset = 1, msg = "malformed number"}, get_error("0x1.8pF"))
298         assert.same({line = 1, offset = 1, end_offset = 1, msg = "malformed number"}, get_error("0x1.8p-F"))
299         assert.same({line = 1, offset = 1, end_offset = 1, msg = "malformed number"}, get_error("0x1.8p+LL"))
300         assert.same({line = 1, offset = 1, end_offset = 1, msg = "malformed number"}, get_error("0x.p1"))
301      end)
302
303      it("parses 64 bits cdata literals correctly", function()
304         assert.same({token = "number", token_value = "1LL"}, get_token("1LL"))
305         assert.same({token = "number", token_value = "1ll"}, get_token("1ll"))
306         assert.same({token = "number", token_value = "1Ll"}, get_token("1Ll"))
307         assert.same({token = "number", token_value = "1lL"}, get_token("1lL"))
308         assert.same({token = "number", token_value = "1ULL"}, get_token("1ULL"))
309         assert.same({token = "number", token_value = "1uLl"}, get_token("1uLl"))
310         assert.same({token = "number", token_value = "1LLu"}, get_token("1LLu"))
311         assert.same({token = "number", token_value = "1"}, get_token("1L"))
312         assert.same({token = "number", token_value = "1LL"}, get_token("1LLG"))
313         assert.same({token = "number", token_value = "1"}, get_token("1LUL"))
314         assert.same({token = "number", token_value = "0x1LL"}, get_token("0x1LL"))
315         assert.same({token = "number", token_value = "1.0"}, get_token("1.0LL"))
316      end)
317
318      it("parses complex cdata literals correctly", function()
319         assert.same({token = "number", token_value = "1i"}, get_token("1i"))
320         assert.same({token = "number", token_value = "1I"}, get_token("1I"))
321         assert.same({token = "number", token_value = "1"}, get_token("1j"))
322         assert.same({token = "number", token_value = "1LL"}, get_token("1LLi"))
323         assert.same({token = "number", token_value = "0x1i"}, get_token("0x1i"))
324         assert.same({token = "number", token_value = "0x1.0i"}, get_token("0x1.0i"))
325      end)
326   end)
327
328   it("parses short comments correctly", function()
329      assert.same({token = "short_comment", token_value = ""}, get_token("--"))
330      assert.same({token = "short_comment", token_value = "foo"}, get_token("--foo\nbar"))
331      assert.same({token = "short_comment", token_value = "["}, get_token("--["))
332      assert.same({token = "short_comment", token_value = "[=foo"}, get_token("--[=foo\nbar"))
333   end)
334
335   it("parses long comments correctly", function()
336      assert.same({token = "long_comment", token_value = ""}, get_token("--[[]]"))
337      assert.same({token = "long_comment", token_value = ""}, get_token("--[[\n]]"))
338      assert.same({token = "long_comment", token_value = "foo\nbar"}, get_token("--[[foo\nbar]]"))
339      assert.same({line = 1, offset = 1, end_offset = 1, msg = "unfinished long comment"}, get_error("--[=[]]"))
340   end)
341
342   it("provides correct location info", function()
343      assert.same({
344         {token = "local", line = 1, offset = 1},
345         {token = "function", line = 1, offset = 7},
346         {token = "name", token_value = "foo", line = 1, offset = 16},
347         {token = "(", line = 1, offset = 19},
348         {token = "name", token_value = "bar", line = 1, offset = 20},
349         {token = ")", line = 1, offset = 23},
350         {token = "return", line = 2, offset = 28},
351         {token = "name", token_value = "bar", line = 2, offset = 35},
352         {token = ":", line = 2, offset = 38},
353         {token = "name", token_value = "get_foo", line = 2, offset = 39},
354         {token = "string", token_value = "long string\n", line = 2, offset = 46},
355         {token = "end", line = 5, offset = 66},
356         {token = "short_comment", token_value = " hello", line = 6, offset = 70},
357         {token = "name", token_value = "print", line = 7, offset = 79},
358         {token = "string", token_value = "123\n", line = 7, offset = 85},
359         {token = "short_comment", token_value = " this comment ends just before EOF", line = 10, offset = 113},
360         {token = "eof", line = 10, offset = 149}
361      }, get_tokens([[
362local function foo(bar)
363   return bar:get_foo[=[
364long string
365]=]
366end
367-- hello
368print "1\z
369       2\z
370       3\n"
371-- this comment ends just before EOF]]))
372   end)
373
374   it("provides correct location info for errors", function()
375      assert.same({line = 7, offset = 79, end_offset = 80, msg = "invalid escape sequence '\\g'"}, get_last_error([[
376local function foo(bar)
377   return bar:get_foo[=[
378long string
379]=]
380end
381
382print "1\g
383       2\z
384       3\n"
385]]))
386
387      assert.same({line = 8, offset = 89, end_offset = 92, msg = "invalid decimal escape sequence '\\300'"},
388         get_last_error([[
389local function foo(bar)
390   return bar:get_foo[=[
391long string
392]=]
393end
394
395print "1\
396       2\300
397       3\n"
398]]))
399
400      assert.same({line = 8, offset = 79, end_offset = 79, msg = "malformed number"}, get_last_error([[
401local function foo(bar)
402   return bar:get_foo[=[
403long string
404]=]
405end
406
407print (
4080xx)
409]]))
410
411      assert.same({line = 7, offset = 77, end_offset = 77, msg = "unfinished string"}, get_last_error([[
412local function foo(bar)
413   return bar:get_foo[=[
414long string
415]=]
416end
417
418print "1\z
419       2\z
420       3\n
421]]))
422   end)
423
424   it("parses minified source correctly", function()
425      assert.same({
426         {token = "name", token_value = "a", line = 1, offset = 1},
427         {token = ",", line = 1, offset = 2},
428         {token = "name", token_value = "b", line = 1, offset = 3},
429         {token = "=", line = 1, offset = 4},
430         {token = "number", token_value = "4ll", line = 1, offset = 5},
431         {token = "name", token_value = "f", line = 1, offset = 8},
432         {token = "=", line = 1, offset = 9},
433         {token = "string", token_value = "", line = 1, offset = 10},
434         {token = "function", line = 1, offset = 12},
435         {token = "name", token_value = "_", line = 1, offset = 21},
436         {token = "(", line = 1, offset = 22},
437         {token = ")", line = 1, offset = 23},
438         {token = "return", line = 1, offset = 24},
439         {token = "number", token_value = "1", line = 1, offset = 31},
440         {token = "or", line = 1, offset = 32},
441         {token = "string", token_value = "", line = 1, offset = 34},
442         {token = "end", line = 1, offset = 36},
443         {token = "eof", line = 1, offset = 39}
444      }, get_tokens("a,b=4llf=''function _()return 1or''end"))
445   end)
446
447   it("handles argparse sample", function()
448      get_tokens(io.open("spec/samples/argparse-0.2.0.lua", "rb"):read("*a"))
449   end)
450end)
451