1// Test GoAWK Lexer 2 3package lexer_test 4 5import ( 6 "fmt" 7 "strconv" 8 "strings" 9 "testing" 10 11 . "github.com/benhoyt/goawk/lexer" 12) 13 14func TestLexer(t *testing.T) { 15 tests := []struct { 16 input string 17 output string 18 }{ 19 // Comments, whitespace, line continuations 20 {"+# foo \n- #foo", `1:1 + "", 1:8 <newline> "", 2:1 - ""`}, 21 {"+\\\n-", `1:1 + "", 2:1 - ""`}, 22 {"+\\\r\n-", `1:1 + "", 2:1 - ""`}, 23 {"+\\-", `1:1 + "", 1:3 <illegal> "expected \\n after \\ line continuation", 1:3 - ""`}, 24 25 // Names and keywords 26 {"x", `1:1 name "x"`}, 27 {"x y0", `1:1 name "x", 1:3 name "y0"`}, 28 {"x 0y", `1:1 name "x", 1:3 number "0", 1:4 name "y"`}, 29 {"sub SUB", `1:1 sub "", 1:5 name "SUB"`}, 30 31 // String tokens 32 {`"foo"`, `1:1 string "foo"`}, 33 {`"a\t\r\n\z\'\"\a\b\f\vb"`, `1:1 string "a\t\r\nz'\"\a\b\f\vb"`}, 34 {`"x`, `1:3 <illegal> "didn't find end quote in string"`}, 35 {"\"x\n\"", `1:3 <illegal> "can't have newline in string", 1:3 <newline> "", 2:2 <illegal> "didn't find end quote in string"`}, 36 {`'foo'`, `1:1 string "foo"`}, 37 {`'a\t\r\n\z\'\"b'`, `1:1 string "a\t\r\nz'\"b"`}, 38 {`'x`, `1:3 <illegal> "didn't find end quote in string"`}, 39 {"'x\n'", `1:3 <illegal> "can't have newline in string", 1:3 <newline> "", 2:2 <illegal> "didn't find end quote in string"`}, 40 {`"\x0.\x00.\x0A\x10\xff\xFF\x41"`, `1:1 string "\x00.\x00.\n\x10\xff\xffA"`}, 41 {`"\xg"`, `1:4 <illegal> "1 or 2 hex digits expected", 1:4 name "g", 1:6 <illegal> "didn't find end quote in string"`}, 42 {`"\0\78\7\77\777\0 \141 "`, `1:1 string "\x00\a8\a?\xff\x00 a "`}, 43 44 // Number tokens 45 {"0", `1:1 number "0"`}, 46 {"9", `1:1 number "9"`}, 47 {" 0 ", `1:2 number "0"`}, 48 {"\n 1", `1:1 <newline> "", 2:3 number "1"`}, 49 {"1234", `1:1 number "1234"`}, 50 {".5", `1:1 number ".5"`}, 51 {".5e1", `1:1 number ".5e1"`}, 52 {"5e+1", `1:1 number "5e+1"`}, 53 {"5e-1", `1:1 number "5e-1"`}, 54 {"0.", `1:1 number "0."`}, 55 {"42e", `1:1 number "42", 1:3 name "e"`}, 56 {"4.2e", `1:1 number "4.2", 1:4 name "e"`}, 57 {"1.e3", `1:1 number "1.e3"`}, 58 {"1.e3", `1:1 number "1.e3"`}, 59 {"1e3foo", `1:1 number "1e3", 1:4 name "foo"`}, 60 {"1e3+", `1:1 number "1e3", 1:4 + ""`}, 61 {"1e3.4", `1:1 number "1e3", 1:4 number ".4"`}, 62 {"1e-", `1:1 number "1", 1:2 name "e", 1:3 - ""`}, 63 {"1e+", `1:1 number "1", 1:2 name "e", 1:3 + ""`}, 64 {"42@", `1:1 number "42", 1:3 <illegal> "unexpected char"`}, 65 {"0..", `1:1 number "0.", 1:4 <illegal> "expected digits"`}, 66 {".", `1:2 <illegal> "expected digits"`}, 67 68 // Misc errors 69 {"&=", `1:2 <illegal> "unexpected char after '&'", 1:2 = ""`}, 70 } 71 for _, test := range tests { 72 t.Run(test.input, func(t *testing.T) { 73 l := NewLexer([]byte(test.input)) 74 strs := []string{} 75 for { 76 pos, tok, val := l.Scan() 77 if tok == EOF { 78 break 79 } 80 if tok == NUMBER { 81 // Ensure ParseFloat() works, as that's what our 82 // parser uses to convert 83 trimmed := strings.TrimRight(val, "eE") 84 _, err := strconv.ParseFloat(trimmed, 64) 85 if err != nil { 86 t.Fatalf("couldn't parse float: %q", val) 87 } 88 } 89 strs = append(strs, fmt.Sprintf("%d:%d %s %q", pos.Line, pos.Column, tok, val)) 90 } 91 output := strings.Join(strs, ", ") 92 if output != test.output { 93 t.Errorf("expected %q, got %q", test.output, output) 94 } 95 }) 96 } 97} 98 99func TestRegex(t *testing.T) { 100 tests := []struct { 101 input string 102 output string 103 }{ 104 {`/foo/`, `1:1 regex "foo"`}, 105 {`/=foo/`, `1:1 regex "=foo"`}, 106 {`/a\/b/`, `1:1 regex "a/b"`}, 107 {`/a\/\zb/`, `1:1 regex "a/\\zb"`}, 108 {`/a`, `1:3 <illegal> "didn't find end slash in regex"`}, 109 {"/a\n", `1:3 <illegal> "can't have newline in regex"`}, 110 {`foo/`, `1:4 <illegal> "unexpected name preceding regex"`}, 111 } 112 for _, test := range tests { 113 t.Run(test.input, func(t *testing.T) { 114 l := NewLexer([]byte(test.input)) 115 l.Scan() // Scan first token (probably DIV) 116 pos, tok, val := l.ScanRegex() 117 output := fmt.Sprintf("%d:%d %s %q", pos.Line, pos.Column, tok, val) 118 if output != test.output { 119 t.Errorf("expected %q, got %q", test.output, output) 120 } 121 }) 122 } 123} 124 125func TestHadSpace(t *testing.T) { 126 tests := []struct { 127 input string 128 tokens []Token 129 spaces []bool 130 }{ 131 {`foo(x)`, []Token{NAME, LPAREN, NAME, RPAREN}, []bool{false, false, false, false}}, 132 {`foo (x) `, []Token{NAME, LPAREN, NAME, RPAREN}, []bool{false, true, false, false}}, 133 {` foo ( x ) `, []Token{NAME, LPAREN, NAME, RPAREN}, []bool{true, true, true, true}}, 134 } 135 for _, test := range tests { 136 t.Run(test.input, func(t *testing.T) { 137 l := NewLexer([]byte(test.input)) 138 for i := 0; ; i++ { 139 _, tok, _ := l.Scan() 140 if tok == EOF { 141 break 142 } 143 if tok != test.tokens[i] { 144 t.Errorf("expected %s for token %d, got %s", test.tokens[i], i, tok) 145 } 146 if l.HadSpace() != test.spaces[i] { 147 t.Errorf("expected %v for space %d, got %v", test.spaces[i], i, l.HadSpace()) 148 } 149 } 150 }) 151 } 152} 153 154func TestKeywordToken(t *testing.T) { 155 tests := []struct { 156 name string 157 tok Token 158 }{ 159 {"print", PRINT}, 160 {"split", F_SPLIT}, 161 {"BEGIN", BEGIN}, 162 {"foo", ILLEGAL}, 163 {"GoAWK", ILLEGAL}, 164 } 165 for _, test := range tests { 166 t.Run(test.name, func(t *testing.T) { 167 tok := KeywordToken(test.name) 168 if tok != test.tok { 169 t.Errorf("expected %v, got %v", test.tok, tok) 170 } 171 }) 172 } 173} 174 175func TestAllTokens(t *testing.T) { 176 input := "# comment line\n" + 177 "+ += && = : , -- /\n/= $ == >= > >> ++ { [ < ( #\n" + 178 "<= ~ % %= * *= !~ ! != | || ^ ^= ** **= ? } ] ) ; - -= " + 179 "BEGIN break continue delete do else END exit " + 180 "for function getline if in next print printf return while " + 181 "atan2 close cos exp fflush gsub index int length log match rand " + 182 "sin split sprintf sqrt srand sub substr system tolower toupper " + 183 "x \"str\\n\" 1234\n" + 184 "@ ." 185 186 strs := make([]string, 0, LAST+1) 187 seen := make([]bool, LAST+1) 188 l := NewLexer([]byte(input)) 189 for { 190 _, tok, _ := l.Scan() 191 strs = append(strs, tok.String()) 192 seen[int(tok)] = true 193 if tok == EOF { 194 break 195 } 196 } 197 output := strings.Join(strs, " ") 198 199 expected := "<newline> " + 200 "+ += && = : , -- / <newline> /= $ == >= > >> ++ { [ < ( <newline> " + 201 "<= ~ % %= * *= !~ ! != | || ^ ^= ^ ^= ? } ] ) ; - -= " + 202 "BEGIN break continue delete do else END exit " + 203 "for function getline if in next print printf return while " + 204 "atan2 close cos exp fflush gsub index int length log match rand " + 205 "sin split sprintf sqrt srand sub substr system tolower toupper " + 206 "name string number <newline> " + 207 "<illegal> <illegal> EOF" 208 if output != expected { 209 t.Errorf("expected %q, got %q", expected, output) 210 } 211 212 for i, s := range seen { 213 if !s && Token(i) != CONCAT && Token(i) != REGEX { 214 t.Errorf("token %s (%d) not seen", Token(i), i) 215 } 216 } 217 218 l = NewLexer([]byte(`/foo/`)) 219 _, tok1, _ := l.Scan() 220 _, tok2, val := l.ScanRegex() 221 if tok1 != Token(DIV) || tok2 != Token(REGEX) || val != "foo" { 222 t.Errorf(`expected / regex "foo", got %s %s %q`, tok1, tok2, val) 223 } 224 225 l = NewLexer([]byte(`/=foo/`)) 226 _, tok1, _ = l.Scan() 227 _, tok2, val = l.ScanRegex() 228 if tok1 != Token(DIV_ASSIGN) || tok2 != Token(REGEX) || val != "=foo" { 229 t.Errorf(`expected /= regex "=foo", got %s %s %q`, tok1, tok2, val) 230 } 231} 232 233func benchmarkLexer(b *testing.B, repeat int, source string) { 234 fullSource := []byte(strings.Repeat(source+"\n", repeat)) 235 b.ResetTimer() 236 for i := 0; i < b.N; i++ { 237 l := NewLexer(fullSource) 238 for { 239 _, tok, _ := l.Scan() 240 if tok == EOF || tok == ILLEGAL { 241 break 242 } 243 } 244 } 245} 246 247func BenchmarkProgram(b *testing.B) { 248 benchmarkLexer(b, 5, `{ print $1, ($3+$4)*$5 }`) 249} 250 251func BenchmarkNames(b *testing.B) { 252 benchmarkLexer(b, 5, `x y i foobar abcdefghij0123456789 _`) 253} 254 255func BenchmarkKeywords(b *testing.B) { 256 benchmarkLexer(b, 5, `BEGIN END print sub if length`) 257} 258 259func BenchmarkSimpleTokens(b *testing.B) { 260 benchmarkLexer(b, 5, "\n : , { [ ( } ] ) ~ ? ; $") 261} 262 263func BenchmarkChoiceTokens(b *testing.B) { 264 benchmarkLexer(b, 5, `/ /= % %= + ++ += * ** **= *= = == ^ ^= ! != !~ < <= > >= >> && | ||`) 265} 266 267func BenchmarkNumbers(b *testing.B) { 268 benchmarkLexer(b, 5, `0 1 .5 1234 1234567890 1234.56789e-50`) 269} 270 271func BenchmarkStrings(b *testing.B) { 272 benchmarkLexer(b, 5, `"x" "y" "xyz" "foo" "foo bar baz" "foo\tbar\rbaz\n"`) 273} 274 275func BenchmarkRegex(b *testing.B) { 276 source := `/x/ /./ /foo/ /bar/ /=equals=/ /\/\/\/\//` 277 fullSource := []byte(strings.Repeat(source+" ", 5)) 278 b.ResetTimer() 279 for i := 0; i < b.N; i++ { 280 l := NewLexer(fullSource) 281 for { 282 _, tok, _ := l.Scan() 283 if tok == EOF { 284 break 285 } 286 if tok != DIV && tok != DIV_ASSIGN { 287 b.Fatalf("expected / or /=, got %s", tok) 288 } 289 _, tok, _ = l.ScanRegex() 290 if tok != REGEX { 291 b.Fatalf("expected regex, got %s", tok) 292 } 293 } 294 } 295} 296 297func Example() { 298 lexer := NewLexer([]byte(`$0 { print $1 }`)) 299 for { 300 pos, tok, val := lexer.Scan() 301 if tok == EOF { 302 break 303 } 304 fmt.Printf("%d:%d %s %q\n", pos.Line, pos.Column, tok, val) 305 } 306 // Output: 307 // 1:1 $ "" 308 // 1:2 number "0" 309 // 1:4 { "" 310 // 1:6 print "" 311 // 1:12 $ "" 312 // 1:13 number "1" 313 // 1:15 } "" 314} 315