1// Test GoAWK Lexer
2
3package lexer_test
4
5import (
6	"fmt"
7	"strconv"
8	"strings"
9	"testing"
10
11	. "github.com/benhoyt/goawk/lexer"
12)
13
14func TestLexer(t *testing.T) {
15	tests := []struct {
16		input  string
17		output string
18	}{
19		// Comments, whitespace, line continuations
20		{"+# foo \n- #foo", `1:1 + "", 1:8 <newline> "", 2:1 - ""`},
21		{"+\\\n-", `1:1 + "", 2:1 - ""`},
22		{"+\\\r\n-", `1:1 + "", 2:1 - ""`},
23		{"+\\-", `1:1 + "", 1:3 <illegal> "expected \\n after \\ line continuation", 1:3 - ""`},
24
25		// Names and keywords
26		{"x", `1:1 name "x"`},
27		{"x y0", `1:1 name "x", 1:3 name "y0"`},
28		{"x 0y", `1:1 name "x", 1:3 number "0", 1:4 name "y"`},
29		{"sub SUB", `1:1 sub "", 1:5 name "SUB"`},
30
31		// String tokens
32		{`"foo"`, `1:1 string "foo"`},
33		{`"a\t\r\n\z\'\"\a\b\f\vb"`, `1:1 string "a\t\r\nz'\"\a\b\f\vb"`},
34		{`"x`, `1:3 <illegal> "didn't find end quote in string"`},
35		{"\"x\n\"", `1:3 <illegal> "can't have newline in string", 1:3 <newline> "", 2:2 <illegal> "didn't find end quote in string"`},
36		{`'foo'`, `1:1 string "foo"`},
37		{`'a\t\r\n\z\'\"b'`, `1:1 string "a\t\r\nz'\"b"`},
38		{`'x`, `1:3 <illegal> "didn't find end quote in string"`},
39		{"'x\n'", `1:3 <illegal> "can't have newline in string", 1:3 <newline> "", 2:2 <illegal> "didn't find end quote in string"`},
40		{`"\x0.\x00.\x0A\x10\xff\xFF\x41"`, `1:1 string "\x00.\x00.\n\x10\xff\xffA"`},
41		{`"\xg"`, `1:4 <illegal> "1 or 2 hex digits expected", 1:4 name "g", 1:6 <illegal> "didn't find end quote in string"`},
42		{`"\0\78\7\77\777\0 \141 "`, `1:1 string "\x00\a8\a?\xff\x00 a "`},
43
44		// Number tokens
45		{"0", `1:1 number "0"`},
46		{"9", `1:1 number "9"`},
47		{" 0 ", `1:2 number "0"`},
48		{"\n  1", `1:1 <newline> "", 2:3 number "1"`},
49		{"1234", `1:1 number "1234"`},
50		{".5", `1:1 number ".5"`},
51		{".5e1", `1:1 number ".5e1"`},
52		{"5e+1", `1:1 number "5e+1"`},
53		{"5e-1", `1:1 number "5e-1"`},
54		{"0.", `1:1 number "0."`},
55		{"42e", `1:1 number "42", 1:3 name "e"`},
56		{"4.2e", `1:1 number "4.2", 1:4 name "e"`},
57		{"1.e3", `1:1 number "1.e3"`},
58		{"1.e3", `1:1 number "1.e3"`},
59		{"1e3foo", `1:1 number "1e3", 1:4 name "foo"`},
60		{"1e3+", `1:1 number "1e3", 1:4 + ""`},
61		{"1e3.4", `1:1 number "1e3", 1:4 number ".4"`},
62		{"1e-", `1:1 number "1", 1:2 name "e", 1:3 - ""`},
63		{"1e+", `1:1 number "1", 1:2 name "e", 1:3 + ""`},
64		{"42@", `1:1 number "42", 1:3 <illegal> "unexpected char"`},
65		{"0..", `1:1 number "0.", 1:4 <illegal> "expected digits"`},
66		{".", `1:2 <illegal> "expected digits"`},
67
68		// Misc errors
69		{"&=", `1:2 <illegal> "unexpected char after '&'", 1:2 = ""`},
70	}
71	for _, test := range tests {
72		t.Run(test.input, func(t *testing.T) {
73			l := NewLexer([]byte(test.input))
74			strs := []string{}
75			for {
76				pos, tok, val := l.Scan()
77				if tok == EOF {
78					break
79				}
80				if tok == NUMBER {
81					// Ensure ParseFloat() works, as that's what our
82					// parser uses to convert
83					trimmed := strings.TrimRight(val, "eE")
84					_, err := strconv.ParseFloat(trimmed, 64)
85					if err != nil {
86						t.Fatalf("couldn't parse float: %q", val)
87					}
88				}
89				strs = append(strs, fmt.Sprintf("%d:%d %s %q", pos.Line, pos.Column, tok, val))
90			}
91			output := strings.Join(strs, ", ")
92			if output != test.output {
93				t.Errorf("expected %q, got %q", test.output, output)
94			}
95		})
96	}
97}
98
99func TestRegex(t *testing.T) {
100	tests := []struct {
101		input  string
102		output string
103	}{
104		{`/foo/`, `1:1 regex "foo"`},
105		{`/=foo/`, `1:1 regex "=foo"`},
106		{`/a\/b/`, `1:1 regex "a/b"`},
107		{`/a\/\zb/`, `1:1 regex "a/\\zb"`},
108		{`/a`, `1:3 <illegal> "didn't find end slash in regex"`},
109		{"/a\n", `1:3 <illegal> "can't have newline in regex"`},
110		{`foo/`, `1:4 <illegal> "unexpected name preceding regex"`},
111	}
112	for _, test := range tests {
113		t.Run(test.input, func(t *testing.T) {
114			l := NewLexer([]byte(test.input))
115			l.Scan() // Scan first token (probably DIV)
116			pos, tok, val := l.ScanRegex()
117			output := fmt.Sprintf("%d:%d %s %q", pos.Line, pos.Column, tok, val)
118			if output != test.output {
119				t.Errorf("expected %q, got %q", test.output, output)
120			}
121		})
122	}
123}
124
125func TestHadSpace(t *testing.T) {
126	tests := []struct {
127		input  string
128		tokens []Token
129		spaces []bool
130	}{
131		{`foo(x)`, []Token{NAME, LPAREN, NAME, RPAREN}, []bool{false, false, false, false}},
132		{`foo (x) `, []Token{NAME, LPAREN, NAME, RPAREN}, []bool{false, true, false, false}},
133		{` foo ( x ) `, []Token{NAME, LPAREN, NAME, RPAREN}, []bool{true, true, true, true}},
134	}
135	for _, test := range tests {
136		t.Run(test.input, func(t *testing.T) {
137			l := NewLexer([]byte(test.input))
138			for i := 0; ; i++ {
139				_, tok, _ := l.Scan()
140				if tok == EOF {
141					break
142				}
143				if tok != test.tokens[i] {
144					t.Errorf("expected %s for token %d, got %s", test.tokens[i], i, tok)
145				}
146				if l.HadSpace() != test.spaces[i] {
147					t.Errorf("expected %v for space %d, got %v", test.spaces[i], i, l.HadSpace())
148				}
149			}
150		})
151	}
152}
153
154func TestKeywordToken(t *testing.T) {
155	tests := []struct {
156		name string
157		tok  Token
158	}{
159		{"print", PRINT},
160		{"split", F_SPLIT},
161		{"BEGIN", BEGIN},
162		{"foo", ILLEGAL},
163		{"GoAWK", ILLEGAL},
164	}
165	for _, test := range tests {
166		t.Run(test.name, func(t *testing.T) {
167			tok := KeywordToken(test.name)
168			if tok != test.tok {
169				t.Errorf("expected %v, got %v", test.tok, tok)
170			}
171		})
172	}
173}
174
175func TestAllTokens(t *testing.T) {
176	input := "# comment line\n" +
177		"+ += && = : , -- /\n/= $ == >= > >> ++ { [ < ( #\n" +
178		"<= ~ % %= * *= !~ ! != | || ^ ^= ** **= ? } ] ) ; - -= " +
179		"BEGIN break continue delete do else END exit " +
180		"for function getline if in next print printf return while " +
181		"atan2 close cos exp fflush gsub index int length log match rand " +
182		"sin split sprintf sqrt srand sub substr system tolower toupper " +
183		"x \"str\\n\" 1234\n" +
184		"@ ."
185
186	strs := make([]string, 0, LAST+1)
187	seen := make([]bool, LAST+1)
188	l := NewLexer([]byte(input))
189	for {
190		_, tok, _ := l.Scan()
191		strs = append(strs, tok.String())
192		seen[int(tok)] = true
193		if tok == EOF {
194			break
195		}
196	}
197	output := strings.Join(strs, " ")
198
199	expected := "<newline> " +
200		"+ += && = : , -- / <newline> /= $ == >= > >> ++ { [ < ( <newline> " +
201		"<= ~ % %= * *= !~ ! != | || ^ ^= ^ ^= ? } ] ) ; - -= " +
202		"BEGIN break continue delete do else END exit " +
203		"for function getline if in next print printf return while " +
204		"atan2 close cos exp fflush gsub index int length log match rand " +
205		"sin split sprintf sqrt srand sub substr system tolower toupper " +
206		"name string number <newline> " +
207		"<illegal> <illegal> EOF"
208	if output != expected {
209		t.Errorf("expected %q, got %q", expected, output)
210	}
211
212	for i, s := range seen {
213		if !s && Token(i) != CONCAT && Token(i) != REGEX {
214			t.Errorf("token %s (%d) not seen", Token(i), i)
215		}
216	}
217
218	l = NewLexer([]byte(`/foo/`))
219	_, tok1, _ := l.Scan()
220	_, tok2, val := l.ScanRegex()
221	if tok1 != Token(DIV) || tok2 != Token(REGEX) || val != "foo" {
222		t.Errorf(`expected / regex "foo", got %s %s %q`, tok1, tok2, val)
223	}
224
225	l = NewLexer([]byte(`/=foo/`))
226	_, tok1, _ = l.Scan()
227	_, tok2, val = l.ScanRegex()
228	if tok1 != Token(DIV_ASSIGN) || tok2 != Token(REGEX) || val != "=foo" {
229		t.Errorf(`expected /= regex "=foo", got %s %s %q`, tok1, tok2, val)
230	}
231}
232
233func benchmarkLexer(b *testing.B, repeat int, source string) {
234	fullSource := []byte(strings.Repeat(source+"\n", repeat))
235	b.ResetTimer()
236	for i := 0; i < b.N; i++ {
237		l := NewLexer(fullSource)
238		for {
239			_, tok, _ := l.Scan()
240			if tok == EOF || tok == ILLEGAL {
241				break
242			}
243		}
244	}
245}
246
247func BenchmarkProgram(b *testing.B) {
248	benchmarkLexer(b, 5, `{ print $1, ($3+$4)*$5 }`)
249}
250
251func BenchmarkNames(b *testing.B) {
252	benchmarkLexer(b, 5, `x y i foobar abcdefghij0123456789 _`)
253}
254
255func BenchmarkKeywords(b *testing.B) {
256	benchmarkLexer(b, 5, `BEGIN END print sub if length`)
257}
258
259func BenchmarkSimpleTokens(b *testing.B) {
260	benchmarkLexer(b, 5, "\n : , { [ ( } ] ) ~ ? ; $")
261}
262
263func BenchmarkChoiceTokens(b *testing.B) {
264	benchmarkLexer(b, 5, `/ /=  % %= + ++ += * ** **= *= = == ^ ^= ! != !~ < <= > >= >> && | ||`)
265}
266
267func BenchmarkNumbers(b *testing.B) {
268	benchmarkLexer(b, 5, `0 1 .5 1234 1234567890 1234.56789e-50`)
269}
270
271func BenchmarkStrings(b *testing.B) {
272	benchmarkLexer(b, 5, `"x" "y" "xyz" "foo" "foo bar baz" "foo\tbar\rbaz\n"`)
273}
274
275func BenchmarkRegex(b *testing.B) {
276	source := `/x/ /./ /foo/ /bar/ /=equals=/ /\/\/\/\//`
277	fullSource := []byte(strings.Repeat(source+" ", 5))
278	b.ResetTimer()
279	for i := 0; i < b.N; i++ {
280		l := NewLexer(fullSource)
281		for {
282			_, tok, _ := l.Scan()
283			if tok == EOF {
284				break
285			}
286			if tok != DIV && tok != DIV_ASSIGN {
287				b.Fatalf("expected / or /=, got %s", tok)
288			}
289			_, tok, _ = l.ScanRegex()
290			if tok != REGEX {
291				b.Fatalf("expected regex, got %s", tok)
292			}
293		}
294	}
295}
296
297func Example() {
298	lexer := NewLexer([]byte(`$0 { print $1 }`))
299	for {
300		pos, tok, val := lexer.Scan()
301		if tok == EOF {
302			break
303		}
304		fmt.Printf("%d:%d %s %q\n", pos.Line, pos.Column, tok, val)
305	}
306	// Output:
307	// 1:1 $ ""
308	// 1:2 number "0"
309	// 1:4 { ""
310	// 1:6 print ""
311	// 1:12 $ ""
312	// 1:13 number "1"
313	// 1:15 } ""
314}
315