const std = @import("../std.zig");
const mem = std.mem;

pub const Token = struct {
    tag: Tag,
    loc: Loc,

    /// Byte offsets into the source buffer; `end` is exclusive.
    pub const Loc = struct {
        start: usize,
        end: usize,
    };

    pub const keywords = std.ComptimeStringMap(Tag, .{
        .{ "addrspace", .keyword_addrspace },
        .{ "align", .keyword_align },
        .{ "allowzero", .keyword_allowzero },
        .{ "and", .keyword_and },
        .{ "anyframe", .keyword_anyframe },
        .{ "anytype", .keyword_anytype },
        .{ "asm", .keyword_asm },
        .{ "async", .keyword_async },
        .{ "await", .keyword_await },
        .{ "break", .keyword_break },
        .{ "callconv", .keyword_callconv },
        .{ "catch", .keyword_catch },
        .{ "comptime", .keyword_comptime },
        .{ "const", .keyword_const },
        .{ "continue", .keyword_continue },
        .{ "defer", .keyword_defer },
        .{ "else", .keyword_else },
        .{ "enum", .keyword_enum },
        .{ "errdefer", .keyword_errdefer },
        .{ "error", .keyword_error },
        .{ "export", .keyword_export },
        .{ "extern", .keyword_extern },
        .{ "fn", .keyword_fn },
        .{ "for", .keyword_for },
        .{ "if", .keyword_if },
        .{ "inline", .keyword_inline },
        .{ "noalias", .keyword_noalias },
        .{ "noinline", .keyword_noinline },
        .{ "nosuspend", .keyword_nosuspend },
        .{ "opaque", .keyword_opaque },
        .{ "or", .keyword_or },
        .{ "orelse", .keyword_orelse },
        .{ "packed", .keyword_packed },
        .{ "pub", .keyword_pub },
        .{ "resume", .keyword_resume },
        .{ "return", .keyword_return },
        .{ "linksection", .keyword_linksection },
        .{ "struct", .keyword_struct },
        .{ "suspend", .keyword_suspend },
        .{ "switch", .keyword_switch },
        .{ "test", .keyword_test },
        .{ "threadlocal", .keyword_threadlocal },
        .{ "try", .keyword_try },
        .{ "union", .keyword_union },
        .{ "unreachable", .keyword_unreachable },
        .{ "usingnamespace", .keyword_usingnamespace },
        .{ "var", .keyword_var },
        .{ "volatile", .keyword_volatile },
        .{ "while", .keyword_while },
    });

    pub fn getKeyword(bytes: []const u8) ?Tag {
        return keywords.get(bytes);
    }
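
    // Example (added sketch, not part of the original file): keyword
    // classification is a single lookup in the comptime string map above.
    test "getKeyword" {
        try std.testing.expectEqual(@as(?Tag, .keyword_while), getKeyword("while"));
        try std.testing.expectEqual(@as(?Tag, null), getKeyword("not_a_keyword"));
    }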

    pub const Tag = enum {
        invalid,
        invalid_periodasterisks,
        identifier,
        string_literal,
        multiline_string_literal_line,
        char_literal,
        eof,
        builtin,
        bang,
        pipe,
        pipe_pipe,
        pipe_equal,
        equal,
        equal_equal,
        equal_angle_bracket_right,
        bang_equal,
        l_paren,
        r_paren,
        semicolon,
        percent,
        percent_equal,
        l_brace,
        r_brace,
        l_bracket,
        r_bracket,
        period,
        period_asterisk,
        ellipsis2,
        ellipsis3,
        caret,
        caret_equal,
        plus,
        plus_plus,
        plus_equal,
        plus_percent,
        plus_percent_equal,
        plus_pipe,
        plus_pipe_equal,
        minus,
        minus_equal,
        minus_percent,
        minus_percent_equal,
        minus_pipe,
        minus_pipe_equal,
        asterisk,
        asterisk_equal,
        asterisk_asterisk,
        asterisk_percent,
        asterisk_percent_equal,
        asterisk_pipe,
        asterisk_pipe_equal,
        arrow,
        colon,
        slash,
        slash_equal,
        comma,
        ampersand,
        ampersand_equal,
        question_mark,
        angle_bracket_left,
        angle_bracket_left_equal,
        angle_bracket_angle_bracket_left,
        angle_bracket_angle_bracket_left_equal,
        angle_bracket_angle_bracket_left_pipe,
        angle_bracket_angle_bracket_left_pipe_equal,
        angle_bracket_right,
        angle_bracket_right_equal,
        angle_bracket_angle_bracket_right,
        angle_bracket_angle_bracket_right_equal,
        tilde,
        integer_literal,
        float_literal,
        doc_comment,
        container_doc_comment,
        keyword_addrspace,
        keyword_align,
        keyword_allowzero,
        keyword_and,
        keyword_anyframe,
        keyword_anytype,
        keyword_asm,
        keyword_async,
        keyword_await,
        keyword_break,
        keyword_callconv,
        keyword_catch,
        keyword_comptime,
        keyword_const,
        keyword_continue,
        keyword_defer,
        keyword_else,
        keyword_enum,
        keyword_errdefer,
        keyword_error,
        keyword_export,
        keyword_extern,
        keyword_fn,
        keyword_for,
        keyword_if,
        keyword_inline,
        keyword_noalias,
        keyword_noinline,
        keyword_nosuspend,
        keyword_opaque,
        keyword_or,
        keyword_orelse,
        keyword_packed,
        keyword_pub,
        keyword_resume,
        keyword_return,
        keyword_linksection,
        keyword_struct,
        keyword_suspend,
        keyword_switch,
        keyword_test,
        keyword_threadlocal,
        keyword_try,
        keyword_union,
        keyword_unreachable,
        keyword_usingnamespace,
        keyword_var,
        keyword_volatile,
        keyword_while,

        /// Returns the token's fixed source spelling, or null for tokens
        /// whose text varies (identifiers, literals, comments, eof).
        pub fn lexeme(tag: Tag) ?[]const u8 {
            return switch (tag) {
                .invalid,
                .identifier,
                .string_literal,
                .multiline_string_literal_line,
                .char_literal,
                .eof,
                .builtin,
                .integer_literal,
                .float_literal,
                .doc_comment,
                .container_doc_comment,
                => null,

                .invalid_periodasterisks => ".**",
                .bang => "!",
                .pipe => "|",
                .pipe_pipe => "||",
                .pipe_equal => "|=",
                .equal => "=",
                .equal_equal => "==",
                .equal_angle_bracket_right => "=>",
                .bang_equal => "!=",
                .l_paren => "(",
                .r_paren => ")",
                .semicolon => ";",
                .percent => "%",
                .percent_equal => "%=",
                .l_brace => "{",
                .r_brace => "}",
                .l_bracket => "[",
                .r_bracket => "]",
                .period => ".",
                .period_asterisk => ".*",
                .ellipsis2 => "..",
                .ellipsis3 => "...",
                .caret => "^",
                .caret_equal => "^=",
                .plus => "+",
                .plus_plus => "++",
                .plus_equal => "+=",
                .plus_percent => "+%",
                .plus_percent_equal => "+%=",
                .plus_pipe => "+|",
                .plus_pipe_equal => "+|=",
                .minus => "-",
                .minus_equal => "-=",
                .minus_percent => "-%",
                .minus_percent_equal => "-%=",
                .minus_pipe => "-|",
                .minus_pipe_equal => "-|=",
                .asterisk => "*",
                .asterisk_equal => "*=",
                .asterisk_asterisk => "**",
                .asterisk_percent => "*%",
                .asterisk_percent_equal => "*%=",
                .asterisk_pipe => "*|",
                .asterisk_pipe_equal => "*|=",
                .arrow => "->",
                .colon => ":",
                .slash => "/",
                .slash_equal => "/=",
                .comma => ",",
                .ampersand => "&",
                .ampersand_equal => "&=",
                .question_mark => "?",
                .angle_bracket_left => "<",
                .angle_bracket_left_equal => "<=",
                .angle_bracket_angle_bracket_left => "<<",
                .angle_bracket_angle_bracket_left_equal => "<<=",
                .angle_bracket_angle_bracket_left_pipe => "<<|",
                .angle_bracket_angle_bracket_left_pipe_equal => "<<|=",
                .angle_bracket_right => ">",
                .angle_bracket_right_equal => ">=",
                .angle_bracket_angle_bracket_right => ">>",
                .angle_bracket_angle_bracket_right_equal => ">>=",
                .tilde => "~",
                .keyword_addrspace => "addrspace",
                .keyword_align => "align",
                .keyword_allowzero => "allowzero",
                .keyword_and => "and",
                .keyword_anyframe => "anyframe",
                .keyword_anytype => "anytype",
                .keyword_asm => "asm",
                .keyword_async => "async",
                .keyword_await => "await",
                .keyword_break => "break",
                .keyword_callconv => "callconv",
                .keyword_catch => "catch",
                .keyword_comptime => "comptime",
                .keyword_const => "const",
                .keyword_continue => "continue",
                .keyword_defer => "defer",
                .keyword_else => "else",
                .keyword_enum => "enum",
                .keyword_errdefer => "errdefer",
                .keyword_error => "error",
                .keyword_export => "export",
                .keyword_extern => "extern",
                .keyword_fn => "fn",
                .keyword_for => "for",
                .keyword_if => "if",
                .keyword_inline => "inline",
                .keyword_noalias => "noalias",
                .keyword_noinline => "noinline",
                .keyword_nosuspend => "nosuspend",
                .keyword_opaque => "opaque",
                .keyword_or => "or",
                .keyword_orelse => "orelse",
                .keyword_packed => "packed",
                .keyword_pub => "pub",
                .keyword_resume => "resume",
                .keyword_return => "return",
                .keyword_linksection => "linksection",
                .keyword_struct => "struct",
                .keyword_suspend => "suspend",
                .keyword_switch => "switch",
                .keyword_test => "test",
                .keyword_threadlocal => "threadlocal",
                .keyword_try => "try",
                .keyword_union => "union",
                .keyword_unreachable => "unreachable",
                .keyword_usingnamespace => "usingnamespace",
                .keyword_var => "var",
                .keyword_volatile => "volatile",
                .keyword_while => "while",
            };
        }

        /// Like `lexeme`, but falls back to the tag name for tokens whose
        /// text varies; useful when printing diagnostics.
        pub fn symbol(tag: Tag) []const u8 {
            return tag.lexeme() orelse @tagName(tag);
        }
    };
};
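
// Example (added sketch, not part of the original file): `lexeme` recovers
// the fixed spelling of operators and keywords, while `symbol` falls back
// to the tag name for variable-text tokens.
test "Token.lexeme and Token.symbol" {
    try std.testing.expectEqualStrings("=>", Token.Tag.equal_angle_bracket_right.lexeme().?);
    try std.testing.expectEqual(@as(?[]const u8, null), Token.Tag.identifier.lexeme());
    try std.testing.expectEqualStrings("identifier", Token.Tag.identifier.symbol());
}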

pub const Tokenizer = struct {
    buffer: [:0]const u8,
    index: usize,
    /// An extra token discovered while scanning the current one (e.g. an
    /// invalid byte inside a string literal or comment); it is returned by
    /// the following call to `next`.
    pending_invalid_token: ?Token,

    /// For debugging purposes
    pub fn dump(self: *Tokenizer, token: *const Token) void {
        std.debug.print("{s} \"{s}\"\n", .{ @tagName(token.tag), self.buffer[token.loc.start..token.loc.end] });
    }

    pub fn init(buffer: [:0]const u8) Tokenizer {
        // Skip the UTF-8 BOM if present
        const src_start = if (mem.startsWith(u8, buffer, "\xEF\xBB\xBF")) 3 else @as(usize, 0);
        return Tokenizer{
            .buffer = buffer,
            .index = src_start,
            .pending_invalid_token = null,
        };
    }

    const State = enum {
        start,
        identifier,
        builtin,
        string_literal,
        string_literal_backslash,
        multiline_string_literal_line,
        char_literal,
        char_literal_backslash,
        char_literal_hex_escape,
        char_literal_unicode_escape_saw_u,
        char_literal_unicode_escape,
        char_literal_unicode_invalid,
        char_literal_unicode,
        char_literal_end,
        backslash,
        equal,
        bang,
        pipe,
        minus,
        minus_percent,
        minus_pipe,
        asterisk,
        asterisk_percent,
        asterisk_pipe,
        slash,
        line_comment_start,
        line_comment,
        doc_comment_start,
        doc_comment,
        zero,
        int_literal_dec,
        int_literal_dec_no_underscore,
        int_literal_bin,
        int_literal_bin_no_underscore,
        int_literal_oct,
        int_literal_oct_no_underscore,
        int_literal_hex,
        int_literal_hex_no_underscore,
        num_dot_dec,
        num_dot_hex,
        float_fraction_dec,
        float_fraction_dec_no_underscore,
        float_fraction_hex,
        float_fraction_hex_no_underscore,
        float_exponent_unsigned,
        float_exponent_num,
        float_exponent_num_no_underscore,
        ampersand,
        caret,
        percent,
        plus,
        plus_percent,
        plus_pipe,
        angle_bracket_left,
        angle_bracket_angle_bracket_left,
        angle_bracket_angle_bracket_left_pipe,
        angle_bracket_right,
        angle_bracket_angle_bracket_right,
        period,
        period_2,
        period_asterisk,
        saw_at_sign,
    };

    pub fn next(self: *Tokenizer) Token {
        if (self.pending_invalid_token) |token| {
            self.pending_invalid_token = null;
            return token;
        }
        var state: State = .start;
        var result = Token{
            .tag = .eof,
            .loc = .{
                .start = self.index,
                .end = undefined,
            },
        };
        var seen_escape_digits: usize = undefined;
        var remaining_code_units: usize = undefined;
        while (true) : (self.index += 1) {
            const c = self.buffer[self.index];
            switch (state) {
                .start => switch (c) {
                    0 => break,
                    ' ', '\n', '\t', '\r' => {
                        result.loc.start = self.index + 1;
                    },
                    '"' => {
                        state = .string_literal;
                        result.tag = .string_literal;
                    },
                    '\'' => {
                        state = .char_literal;
                    },
                    'a'...'z', 'A'...'Z', '_' => {
                        state = .identifier;
                        result.tag = .identifier;
                    },
                    '@' => {
                        state = .saw_at_sign;
                    },
                    '=' => {
                        state = .equal;
                    },
                    '!' => {
                        state = .bang;
                    },
                    '|' => {
                        state = .pipe;
                    },
                    '(' => {
                        result.tag = .l_paren;
                        self.index += 1;
                        break;
                    },
                    ')' => {
                        result.tag = .r_paren;
                        self.index += 1;
                        break;
                    },
                    '[' => {
                        result.tag = .l_bracket;
                        self.index += 1;
                        break;
                    },
                    ']' => {
                        result.tag = .r_bracket;
                        self.index += 1;
                        break;
                    },
                    ';' => {
                        result.tag = .semicolon;
                        self.index += 1;
                        break;
                    },
                    ',' => {
                        result.tag = .comma;
                        self.index += 1;
                        break;
                    },
                    '?' => {
                        result.tag = .question_mark;
                        self.index += 1;
                        break;
                    },
                    ':' => {
                        result.tag = .colon;
                        self.index += 1;
                        break;
                    },
                    '%' => {
                        state = .percent;
                    },
                    '*' => {
                        state = .asterisk;
                    },
                    '+' => {
                        state = .plus;
                    },
                    '<' => {
                        state = .angle_bracket_left;
                    },
                    '>' => {
                        state = .angle_bracket_right;
                    },
                    '^' => {
                        state = .caret;
                    },
                    '\\' => {
                        state = .backslash;
                        result.tag = .multiline_string_literal_line;
                    },
                    '{' => {
                        result.tag = .l_brace;
                        self.index += 1;
                        break;
                    },
                    '}' => {
                        result.tag = .r_brace;
                        self.index += 1;
                        break;
                    },
                    '~' => {
                        result.tag = .tilde;
                        self.index += 1;
                        break;
                    },
                    '.' => {
                        state = .period;
                    },
                    '-' => {
                        state = .minus;
                    },
                    '/' => {
                        state = .slash;
                    },
                    '&' => {
                        state = .ampersand;
                    },
                    '0' => {
                        state = .zero;
                        result.tag = .integer_literal;
                    },
                    '1'...'9' => {
                        state = .int_literal_dec;
                        result.tag = .integer_literal;
                    },
                    else => {
                        result.tag = .invalid;
                        result.loc.end = self.index;
                        self.index += 1;
                        return result;
                    },
                },

                .saw_at_sign => switch (c) {
                    '"' => {
                        result.tag = .identifier;
                        state = .string_literal;
                    },
                    'a'...'z', 'A'...'Z', '_' => {
                        state = .builtin;
                        result.tag = .builtin;
                    },
                    else => {
                        result.tag = .invalid;
                        break;
                    },
                },

                .ampersand => switch (c) {
                    '=' => {
                        result.tag = .ampersand_equal;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.tag = .ampersand;
                        break;
                    },
                },

                .asterisk => switch (c) {
                    '=' => {
                        result.tag = .asterisk_equal;
                        self.index += 1;
                        break;
                    },
                    '*' => {
                        result.tag = .asterisk_asterisk;
                        self.index += 1;
                        break;
                    },
                    '%' => {
                        state = .asterisk_percent;
                    },
                    '|' => {
                        state = .asterisk_pipe;
                    },
                    else => {
                        result.tag = .asterisk;
                        break;
                    },
                },

                .asterisk_percent => switch (c) {
                    '=' => {
                        result.tag = .asterisk_percent_equal;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.tag = .asterisk_percent;
                        break;
                    },
                },

                .asterisk_pipe => switch (c) {
                    '=' => {
                        result.tag = .asterisk_pipe_equal;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.tag = .asterisk_pipe;
                        break;
                    },
                },

                .percent => switch (c) {
                    '=' => {
                        result.tag = .percent_equal;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.tag = .percent;
                        break;
                    },
                },

                .plus => switch (c) {
                    '=' => {
                        result.tag = .plus_equal;
                        self.index += 1;
                        break;
                    },
                    '+' => {
                        result.tag = .plus_plus;
                        self.index += 1;
                        break;
                    },
                    '%' => {
                        state = .plus_percent;
                    },
                    '|' => {
                        state = .plus_pipe;
                    },
                    else => {
                        result.tag = .plus;
                        break;
                    },
                },

                .plus_percent => switch (c) {
                    '=' => {
                        result.tag = .plus_percent_equal;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.tag = .plus_percent;
                        break;
                    },
                },

                .plus_pipe => switch (c) {
                    '=' => {
                        result.tag = .plus_pipe_equal;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.tag = .plus_pipe;
                        break;
                    },
                },

                .caret => switch (c) {
                    '=' => {
                        result.tag = .caret_equal;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.tag = .caret;
                        break;
                    },
                },

                .identifier => switch (c) {
                    'a'...'z', 'A'...'Z', '_', '0'...'9' => {},
                    else => {
                        if (Token.getKeyword(self.buffer[result.loc.start..self.index])) |tag| {
                            result.tag = tag;
                        }
                        break;
                    },
                },
                .builtin => switch (c) {
                    'a'...'z', 'A'...'Z', '_', '0'...'9' => {},
                    else => break,
                },
                .backslash => switch (c) {
                    '\\' => {
                        state = .multiline_string_literal_line;
                    },
                    else => {
                        result.tag = .invalid;
                        break;
                    },
                },
                .string_literal => switch (c) {
                    '\\' => {
                        state = .string_literal_backslash;
                    },
                    '"' => {
                        self.index += 1;
                        break;
                    },
                    0 => {
                        if (self.index == self.buffer.len) {
                            break;
                        } else {
                            self.checkLiteralCharacter();
                        }
                    },
                    '\n' => {
                        result.tag = .invalid;
                        break;
                    },
                    else => self.checkLiteralCharacter(),
                },

                .string_literal_backslash => switch (c) {
                    0, '\n' => {
                        result.tag = .invalid;
                        break;
                    },
                    else => {
                        state = .string_literal;
                    },
                },

                .char_literal => switch (c) {
                    0 => {
                        result.tag = .invalid;
                        break;
                    },
                    '\\' => {
                        state = .char_literal_backslash;
                    },
                    '\'', 0x80...0xbf, 0xf8...0xff => {
                        result.tag = .invalid;
                        break;
                    },
                    0xc0...0xdf => { // 110xxxxx
                        remaining_code_units = 1;
                        state = .char_literal_unicode;
                    },
                    0xe0...0xef => { // 1110xxxx
                        remaining_code_units = 2;
                        state = .char_literal_unicode;
                    },
                    0xf0...0xf7 => { // 11110xxx
                        remaining_code_units = 3;
                        state = .char_literal_unicode;
                    },
                    else => {
                        state = .char_literal_end;
                    },
                },

                .char_literal_backslash => switch (c) {
                    0, '\n' => {
                        result.tag = .invalid;
                        break;
                    },
                    'x' => {
                        state = .char_literal_hex_escape;
                        seen_escape_digits = 0;
                    },
                    'u' => {
                        state = .char_literal_unicode_escape_saw_u;
                    },
                    else => {
                        state = .char_literal_end;
                    },
                },

                .char_literal_hex_escape => switch (c) {
                    '0'...'9', 'a'...'f', 'A'...'F' => {
                        seen_escape_digits += 1;
                        if (seen_escape_digits == 2) {
                            state = .char_literal_end;
                        }
                    },
                    else => {
                        result.tag = .invalid;
                        break;
                    },
                },

                .char_literal_unicode_escape_saw_u => switch (c) {
                    0 => {
                        result.tag = .invalid;
                        break;
                    },
                    '{' => {
                        state = .char_literal_unicode_escape;
                    },
                    else => {
                        result.tag = .invalid;
                        state = .char_literal_unicode_invalid;
                    },
                },

                .char_literal_unicode_escape => switch (c) {
                    0 => {
                        result.tag = .invalid;
                        break;
                    },
                    '0'...'9', 'a'...'f', 'A'...'F' => {},
                    '}' => {
                        state = .char_literal_end; // too many/few digits handled later
                    },
                    else => {
                        result.tag = .invalid;
                        state = .char_literal_unicode_invalid;
                    },
                },
                .char_literal_unicode_invalid => switch (c) {
                    // Keep consuming characters until an obvious stopping point.
                    // This consolidates e.g. `u{0ab1Q}` into a single invalid token
                    // instead of creating the tokens `u{0ab1`, `Q`, `}`
                    '0'...'9', 'a'...'z', 'A'...'Z', '}' => {},
                    else => break,
                },

                .char_literal_end => switch (c) {
                    '\'' => {
                        result.tag = .char_literal;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.tag = .invalid;
                        break;
                    },
                },

                .char_literal_unicode => switch (c) {
                    0x80...0xbf => {
                        remaining_code_units -= 1;
                        if (remaining_code_units == 0) {
                            state = .char_literal_end;
                        }
                    },
                    else => {
                        result.tag = .invalid;
                        break;
                    },
                },

                .multiline_string_literal_line => switch (c) {
                    0 => break,
                    '\n' => {
                        self.index += 1;
                        break;
                    },
                    '\t' => {},
                    else => self.checkLiteralCharacter(),
                },

                .bang => switch (c) {
                    '=' => {
                        result.tag = .bang_equal;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.tag = .bang;
                        break;
                    },
                },

                .pipe => switch (c) {
                    '=' => {
                        result.tag = .pipe_equal;
                        self.index += 1;
                        break;
                    },
                    '|' => {
                        result.tag = .pipe_pipe;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.tag = .pipe;
                        break;
                    },
                },

                .equal => switch (c) {
                    '=' => {
                        result.tag = .equal_equal;
                        self.index += 1;
                        break;
                    },
                    '>' => {
                        result.tag = .equal_angle_bracket_right;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.tag = .equal;
                        break;
                    },
                },

                .minus => switch (c) {
                    '>' => {
                        result.tag = .arrow;
                        self.index += 1;
                        break;
                    },
                    '=' => {
                        result.tag = .minus_equal;
                        self.index += 1;
                        break;
                    },
                    '%' => {
                        state = .minus_percent;
                    },
                    '|' => {
                        state = .minus_pipe;
                    },
                    else => {
                        result.tag = .minus;
                        break;
                    },
                },

                .minus_percent => switch (c) {
                    '=' => {
                        result.tag = .minus_percent_equal;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.tag = .minus_percent;
                        break;
                    },
                },
                .minus_pipe => switch (c) {
                    '=' => {
                        result.tag = .minus_pipe_equal;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.tag = .minus_pipe;
                        break;
                    },
                },

                .angle_bracket_left => switch (c) {
                    '<' => {
                        state = .angle_bracket_angle_bracket_left;
                    },
                    '=' => {
                        result.tag = .angle_bracket_left_equal;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.tag = .angle_bracket_left;
                        break;
                    },
                },

                .angle_bracket_angle_bracket_left => switch (c) {
                    '=' => {
                        result.tag = .angle_bracket_angle_bracket_left_equal;
                        self.index += 1;
                        break;
                    },
                    '|' => {
                        state = .angle_bracket_angle_bracket_left_pipe;
                    },
                    else => {
                        result.tag = .angle_bracket_angle_bracket_left;
                        break;
                    },
                },

                .angle_bracket_angle_bracket_left_pipe => switch (c) {
                    '=' => {
                        result.tag = .angle_bracket_angle_bracket_left_pipe_equal;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.tag = .angle_bracket_angle_bracket_left_pipe;
                        break;
                    },
                },

                .angle_bracket_right => switch (c) {
                    '>' => {
                        state = .angle_bracket_angle_bracket_right;
                    },
                    '=' => {
                        result.tag = .angle_bracket_right_equal;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.tag = .angle_bracket_right;
                        break;
                    },
                },
                .angle_bracket_angle_bracket_right => switch (c) {
                    '=' => {
                        result.tag = .angle_bracket_angle_bracket_right_equal;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.tag = .angle_bracket_angle_bracket_right;
                        break;
                    },
                },

                .period => switch (c) {
                    '.' => {
                        state = .period_2;
                    },
                    '*' => {
                        state = .period_asterisk;
                    },
                    else => {
                        result.tag = .period;
                        break;
                    },
                },

                .period_2 => switch (c) {
                    '.' => {
                        result.tag = .ellipsis3;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.tag = .ellipsis2;
                        break;
                    },
                },

                .period_asterisk => switch (c) {
                    '*' => {
                        result.tag = .invalid_periodasterisks;
                        break;
                    },
                    else => {
                        result.tag = .period_asterisk;
                        break;
                    },
                },

                .slash => switch (c) {
                    '/' => {
                        state = .line_comment_start;
                    },
                    '=' => {
                        result.tag = .slash_equal;
                        self.index += 1;
                        break;
                    },
                    else => {
                        result.tag = .slash;
                        break;
                    },
                },
                .line_comment_start => switch (c) {
                    0 => {
                        if (self.index != self.buffer.len) {
                            result.tag = .invalid;
                            self.index += 1;
                        }
                        break;
                    },
                    '/' => {
                        state = .doc_comment_start;
                    },
                    '!' => {
                        result.tag = .container_doc_comment;
                        state = .doc_comment;
                    },
                    '\n' => {
                        state = .start;
                        result.loc.start = self.index + 1;
                    },
                    '\t', '\r' => state = .line_comment,
                    else => {
                        state = .line_comment;
                        self.checkLiteralCharacter();
                    },
                },
                .doc_comment_start => switch (c) {
                    '/' => {
                        state = .line_comment;
                    },
                    0, '\n' => {
                        result.tag = .doc_comment;
                        break;
                    },
                    '\t', '\r' => {
                        state = .doc_comment;
                        result.tag = .doc_comment;
                    },
                    else => {
                        state = .doc_comment;
                        result.tag = .doc_comment;
                        self.checkLiteralCharacter();
                    },
                },
                .line_comment => switch (c) {
                    0 => break,
                    '\n' => {
                        state = .start;
                        result.loc.start = self.index + 1;
                    },
                    '\t', '\r' => {},
                    else => self.checkLiteralCharacter(),
                },
                .doc_comment => switch (c) {
                    0, '\n' => break,
                    '\t', '\r' => {},
                    else => self.checkLiteralCharacter(),
                },
                .zero => switch (c) {
                    'b' => {
                        state = .int_literal_bin_no_underscore;
                    },
                    'o' => {
                        state = .int_literal_oct_no_underscore;
                    },
                    'x' => {
                        state = .int_literal_hex_no_underscore;
                    },
                    '0'...'9', '_', '.', 'e', 'E' => {
                        // reinterpret as a decimal number
                        self.index -= 1;
                        state = .int_literal_dec;
                    },
                    'a', 'c', 'd', 'f'...'n', 'p'...'w', 'y', 'z', 'A'...'D', 'F'...'Z' => {
                        result.tag = .invalid;
                        break;
                    },
                    else => break,
                },
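                // Note (added comment, not in the original): each numeric
                // state below has a `_no_underscore` twin. The twin is
                // entered after a base prefix, a digit separator `_`, or a
                // decimal point, and it requires the next byte to be a
                // digit; this is what rejects forms such as `0x_1`, `0__0`,
                // and a trailing `1_`.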
                .int_literal_bin_no_underscore => switch (c) {
                    '0'...'1' => {
                        state = .int_literal_bin;
                    },
                    else => {
                        result.tag = .invalid;
                        break;
                    },
                },
                .int_literal_bin => switch (c) {
                    '_' => {
                        state = .int_literal_bin_no_underscore;
                    },
                    '0'...'1' => {},
                    '2'...'9', 'a'...'z', 'A'...'Z' => {
                        result.tag = .invalid;
                        break;
                    },
                    else => break,
                },
                .int_literal_oct_no_underscore => switch (c) {
                    '0'...'7' => {
                        state = .int_literal_oct;
                    },
                    else => {
                        result.tag = .invalid;
                        break;
                    },
                },
                .int_literal_oct => switch (c) {
                    '_' => {
                        state = .int_literal_oct_no_underscore;
                    },
                    '0'...'7' => {},
                    '8', '9', 'a'...'z', 'A'...'Z' => {
                        result.tag = .invalid;
                        break;
                    },
                    else => break,
                },
                .int_literal_dec_no_underscore => switch (c) {
                    '0'...'9' => {
                        state = .int_literal_dec;
                    },
                    else => {
                        result.tag = .invalid;
                        break;
                    },
                },
                .int_literal_dec => switch (c) {
                    '_' => {
                        state = .int_literal_dec_no_underscore;
                    },
                    '.' => {
                        state = .num_dot_dec;
                        result.tag = .invalid;
                    },
                    'e', 'E' => {
                        state = .float_exponent_unsigned;
                        result.tag = .float_literal;
                    },
                    '0'...'9' => {},
                    'a'...'d', 'f'...'z', 'A'...'D', 'F'...'Z' => {
                        result.tag = .invalid;
                        break;
                    },
                    else => break,
                },
                .int_literal_hex_no_underscore => switch (c) {
                    '0'...'9', 'a'...'f', 'A'...'F' => {
                        state = .int_literal_hex;
                    },
                    else => {
                        result.tag = .invalid;
                        break;
                    },
                },
                .int_literal_hex => switch (c) {
                    '_' => {
                        state = .int_literal_hex_no_underscore;
                    },
                    '.' => {
                        state = .num_dot_hex;
                        result.tag = .invalid;
                    },
                    'p', 'P' => {
                        state = .float_exponent_unsigned;
                        result.tag = .float_literal;
                    },
                    '0'...'9', 'a'...'f', 'A'...'F' => {},
                    'g'...'o', 'q'...'z', 'G'...'O', 'Q'...'Z' => {
                        result.tag = .invalid;
                        break;
                    },
                    else => break,
                },
                .num_dot_dec => switch (c) {
                    '.' => {
                        result.tag = .integer_literal;
                        self.index -= 1;
                        state = .start;
                        break;
                    },
                    '0'...'9' => {
                        result.tag = .float_literal;
                        state = .float_fraction_dec;
                    },
                    '_', 'a'...'z', 'A'...'Z' => {
                        result.tag = .invalid;
                        break;
                    },
                    else => break,
                },
                .num_dot_hex => switch (c) {
                    '.' => {
                        result.tag = .integer_literal;
                        self.index -= 1;
                        state = .start;
                        break;
                    },
                    '0'...'9', 'a'...'f', 'A'...'F' => {
                        result.tag = .float_literal;
                        state = .float_fraction_hex;
                    },
                    '_', 'g'...'z', 'G'...'Z' => {
                        result.tag = .invalid;
                        break;
                    },
                    else => break,
                },
                .float_fraction_dec_no_underscore => switch (c) {
                    '0'...'9' => {
                        state = .float_fraction_dec;
                    },
                    else => {
                        result.tag = .invalid;
                        break;
                    },
                },
                .float_fraction_dec => switch (c) {
                    '_' => {
                        state = .float_fraction_dec_no_underscore;
                    },
                    'e', 'E' => {
                        state = .float_exponent_unsigned;
                    },
                    '0'...'9' => {},
                    'a'...'d', 'f'...'z', 'A'...'D', 'F'...'Z' => {
                        result.tag = .invalid;
                        break;
                    },
                    else => break,
                },
                .float_fraction_hex_no_underscore => switch (c) {
                    '0'...'9', 'a'...'f', 'A'...'F' => {
                        state = .float_fraction_hex;
                    },
                    else => {
                        result.tag = .invalid;
                        break;
                    },
                },
                .float_fraction_hex => switch (c) {
                    '_' => {
                        state = .float_fraction_hex_no_underscore;
                    },
                    'p', 'P' => {
                        state = .float_exponent_unsigned;
                    },
                    '0'...'9', 'a'...'f', 'A'...'F' => {},
                    'g'...'o', 'q'...'z', 'G'...'O', 'Q'...'Z' => {
                        result.tag = .invalid;
                        break;
                    },
                    else => break,
                },
                .float_exponent_unsigned => switch (c) {
                    '+', '-' => {
                        state = .float_exponent_num_no_underscore;
                    },
                    else => {
                        // reinterpret as a normal exponent number
                        self.index -= 1;
                        state = .float_exponent_num_no_underscore;
                    },
                },
                .float_exponent_num_no_underscore => switch (c) {
                    '0'...'9' => {
                        state = .float_exponent_num;
                    },
                    else => {
                        result.tag = .invalid;
                        break;
                    },
                },
                .float_exponent_num => switch (c) {
                    '_' => {
                        state = .float_exponent_num_no_underscore;
                    },
                    '0'...'9' => {},
                    'a'...'z', 'A'...'Z' => {
                        result.tag = .invalid;
                        break;
                    },
                    else => break,
                },
            }
        }

        if (result.tag == .eof) {
            if (self.pending_invalid_token) |token| {
                self.pending_invalid_token = null;
                return token;
            }
            result.loc.start = self.index;
        }

        result.loc.end = self.index;
        return result;
    }

    /// If the byte at `self.index` is an invalid literal/comment character,
    /// queue an `.invalid` token covering it to be returned by the call to
    /// `next` that follows the current token.
    fn checkLiteralCharacter(self: *Tokenizer) void {
        if (self.pending_invalid_token != null) return;
        const invalid_length = self.getInvalidCharacterLength();
        if (invalid_length == 0) return;
        self.pending_invalid_token = .{
            .tag = .invalid,
            .loc = .{
                .start = self.index,
                .end = self.index + invalid_length,
            },
        };
    }

    /// Returns the byte length of the invalid character at `self.index`, or
    /// 0 if the character is acceptable; accepted multi-byte characters are
    /// skipped by advancing `self.index` past their continuation bytes.
    fn getInvalidCharacterLength(self: *Tokenizer) u3 {
        const c0 = self.buffer[self.index];
        if (c0 < 0x80) {
            if (c0 < 0x20 or c0 == 0x7f) {
                // ascii control codes are never allowed
                // (note that \n was checked before we got here)
                return 1;
            }
            // looks fine to me.
            return 0;
        } else {
            // check utf8-encoded character.
            const length = std.unicode.utf8ByteSequenceLength(c0) catch return 1;
            if (self.index + length > self.buffer.len) {
                return @intCast(u3, self.buffer.len - self.index);
            }
            const bytes = self.buffer[self.index .. self.index + length];
            switch (length) {
                2 => {
                    const value = std.unicode.utf8Decode2(bytes) catch return length;
                    if (value == 0x85) return length; // U+0085 (NEL)
                },
                3 => {
                    const value = std.unicode.utf8Decode3(bytes) catch return length;
                    if (value == 0x2028) return length; // U+2028 (LS)
                    if (value == 0x2029) return length; // U+2029 (PS)
                },
                4 => {
                    _ = std.unicode.utf8Decode4(bytes) catch return length;
                },
                else => unreachable,
            }
            self.index += length - 1;
            return 0;
        }
    }
};
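
// Example usage (added sketch, not part of the original file): tokens are
// pulled one at a time until `.eof`; the `[:0]const u8` sentinel type
// guarantees the null terminator the state machine relies on.
test "Tokenizer usage example" {
    var tokenizer = Tokenizer.init("const x = 42;");
    var count: usize = 0;
    while (true) {
        const token = tokenizer.next();
        if (token.tag == .eof) break;
        count += 1;
    }
    // `const`, `x`, `=`, `42`, `;`
    try std.testing.expectEqual(@as(usize, 5), count);
}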

test "tokenizer" {
    try testTokenize("test", &.{.keyword_test});
}

test "line comment followed by top-level comptime" {
    try testTokenize(
        \\// line comment
        \\comptime {}
        \\
    , &.{
        .keyword_comptime,
        .l_brace,
        .r_brace,
    });
}

test "tokenizer - unknown length pointer and then c pointer" {
    try testTokenize(
        \\[*]u8
        \\[*c]u8
    , &.{
        .l_bracket,
        .asterisk,
        .r_bracket,
        .identifier,
        .l_bracket,
        .asterisk,
        .identifier,
        .r_bracket,
        .identifier,
    });
}

test "tokenizer - code point literal with hex escape" {
    try testTokenize(
        \\'\x1b'
    , &.{.char_literal});
    try testTokenize(
        \\'\x1'
    , &.{ .invalid, .invalid });
}

test "tokenizer - code point literal with unicode escapes" {
    // Valid unicode escapes
    try testTokenize(
        \\'\u{3}'
    , &.{.char_literal});
    try testTokenize(
        \\'\u{01}'
    , &.{.char_literal});
    try testTokenize(
        \\'\u{2a}'
    , &.{.char_literal});
    try testTokenize(
        \\'\u{3f9}'
    , &.{.char_literal});
    try testTokenize(
        \\'\u{6E09aBc1523}'
    , &.{.char_literal});
    try testTokenize(
        \\"\u{440}"
    , &.{.string_literal});

    // Invalid unicode escapes
    try testTokenize(
        \\'\u'
    , &.{.invalid});
    try testTokenize(
        \\'\u{{'
    , &.{ .invalid, .invalid });
    try testTokenize(
        \\'\u{}'
    , &.{.char_literal});
    try testTokenize(
        \\'\u{s}'
    , &.{ .invalid, .invalid });
    try testTokenize(
        \\'\u{2z}'
    , &.{ .invalid, .invalid });
    try testTokenize(
        \\'\u{4a'
    , &.{.invalid});

    // Test old-style unicode literals
    try testTokenize(
        \\'\u0333'
    , &.{ .invalid, .invalid });
    try testTokenize(
        \\'\U0333'
    , &.{ .invalid, .integer_literal, .invalid });
}

test "tokenizer - code point literal with unicode code point" {
    try testTokenize(
        \\'💩'
    , &.{.char_literal});
}

test "tokenizer - float literal e exponent" {
    try testTokenize("a = 4.94065645841246544177e-324;\n", &.{
        .identifier,
        .equal,
        .float_literal,
        .semicolon,
    });
}

test "tokenizer - float literal p exponent" {
    try testTokenize("a = 0x1.a827999fcef32p+1022;\n", &.{
        .identifier,
        .equal,
        .float_literal,
        .semicolon,
    });
}

test "tokenizer - chars" {
    try testTokenize("'c'", &.{.char_literal});
}

test "tokenizer - invalid token characters" {
    try testTokenize("#", &.{.invalid});
    try testTokenize("`", &.{.invalid});
    try testTokenize("'c", &.{.invalid});
    try testTokenize("'", &.{.invalid});
    try testTokenize("''", &.{ .invalid, .invalid });
}

test "tokenizer - invalid literal/comment characters" {
    try testTokenize("\"\x00\"", &.{
        .string_literal,
        .invalid,
    });
    try testTokenize("//\x00", &.{
        .invalid,
    });
    try testTokenize("//\x1f", &.{
        .invalid,
    });
    try testTokenize("//\x7f", &.{
        .invalid,
    });
}

test "tokenizer - utf8" {
    try testTokenize("//\xc2\x80", &.{});
    try testTokenize("//\xf4\x8f\xbf\xbf", &.{});
}

test "tokenizer - invalid utf8" {
    try testTokenize("//\x80", &.{
        .invalid,
    });
    try testTokenize("//\xbf", &.{
        .invalid,
    });
    try testTokenize("//\xf8", &.{
        .invalid,
    });
    try testTokenize("//\xff", &.{
        .invalid,
    });
    try testTokenize("//\xc2\xc0", &.{
        .invalid,
    });
    try testTokenize("//\xe0", &.{
        .invalid,
    });
    try testTokenize("//\xf0", &.{
        .invalid,
    });
    try testTokenize("//\xf0\x90\x80\xc0", &.{
        .invalid,
    });
}

test "tokenizer - illegal unicode codepoints" {
    // Unicode newline characters: U+0085, U+2028, U+2029
    try testTokenize("//\xc2\x84", &.{});
    try testTokenize("//\xc2\x85", &.{
        .invalid,
    });
    try testTokenize("//\xc2\x86", &.{});
    try testTokenize("//\xe2\x80\xa7", &.{});
    try testTokenize("//\xe2\x80\xa8", &.{
        .invalid,
    });
    try testTokenize("//\xe2\x80\xa9", &.{
        .invalid,
    });
    try testTokenize("//\xe2\x80\xaa", &.{});
}

test "tokenizer - string identifier and builtin fns" {
    try testTokenize(
        \\const @"if" = @import("std");
    , &.{
        .keyword_const,
        .identifier,
        .equal,
        .builtin,
        .l_paren,
        .string_literal,
        .r_paren,
        .semicolon,
    });
}

test "tokenizer - multiline string literal with literal tab" {
    try testTokenize(
        \\\\foo	bar
    , &.{
        .multiline_string_literal_line,
    });
}

test "tokenizer - comments with literal tab" {
    try testTokenize(
        \\//foo	bar
        \\//!foo	bar
        \\///foo	bar
        \\//	foo
        \\///	foo
        \\///	/foo
    , &.{
        .container_doc_comment,
        .doc_comment,
        .doc_comment,
        .doc_comment,
    });
}

test "tokenizer - pipe and then invalid" {
    try testTokenize("||=", &.{
        .pipe_pipe,
        .equal,
    });
}

test "tokenizer - line comment and doc comment" {
    try testTokenize("//", &.{});
    try testTokenize("// a / b", &.{});
    try testTokenize("// /", &.{});
    try testTokenize("/// a", &.{.doc_comment});
    try testTokenize("///", &.{.doc_comment});
    try testTokenize("////", &.{});
    try testTokenize("//!", &.{.container_doc_comment});
    try testTokenize("//!!", &.{.container_doc_comment});
}

test "tokenizer - line comment followed by identifier" {
    try testTokenize(
        \\ Unexpected,
        \\ // another
        \\ Another,
    , &.{
        .identifier,
        .comma,
        .identifier,
        .comma,
    });
}

test "tokenizer - UTF-8 BOM is recognized and skipped" {
    try testTokenize("\xEF\xBB\xBFa;\n", &.{
        .identifier,
        .semicolon,
    });
}

test "correctly parse pointer assignment" {
    try testTokenize("b.*=3;\n", &.{
        .identifier,
        .period_asterisk,
        .equal,
        .integer_literal,
        .semicolon,
    });
}

test "correctly parse pointer dereference followed by asterisk" {
    try testTokenize("\"b\".* ** 10", &.{
        .string_literal,
        .period_asterisk,
        .asterisk_asterisk,
        .integer_literal,
    });

    try testTokenize("(\"b\".*)** 10", &.{
        .l_paren,
        .string_literal,
        .period_asterisk,
        .r_paren,
        .asterisk_asterisk,
        .integer_literal,
    });

    try testTokenize("\"b\".*** 10", &.{
        .string_literal,
        .invalid_periodasterisks,
        .asterisk_asterisk,
        .integer_literal,
    });
}

test "tokenizer - range literals" {
    try testTokenize("0...9", &.{ .integer_literal, .ellipsis3, .integer_literal });
    try testTokenize("'0'...'9'", &.{ .char_literal, .ellipsis3, .char_literal });
    try testTokenize("0x00...0x09", &.{ .integer_literal, .ellipsis3, .integer_literal });
    try testTokenize("0b00...0b11", &.{ .integer_literal, .ellipsis3, .integer_literal });
    try testTokenize("0o00...0o11", &.{ .integer_literal, .ellipsis3, .integer_literal });
}

test "tokenizer - number literals decimal" {
    try testTokenize("0", &.{.integer_literal});
    try testTokenize("1", &.{.integer_literal});
    try testTokenize("2", &.{.integer_literal});
    try testTokenize("3", &.{.integer_literal});
    try testTokenize("4", &.{.integer_literal});
    try testTokenize("5", &.{.integer_literal});
    try testTokenize("6", &.{.integer_literal});
    try testTokenize("7", &.{.integer_literal});
    try testTokenize("8", &.{.integer_literal});
    try testTokenize("9", &.{.integer_literal});
    try testTokenize("1..", &.{ .integer_literal, .ellipsis2 });
    try testTokenize("0a", &.{ .invalid, .identifier });
    try testTokenize("9b", &.{ .invalid, .identifier });
    try testTokenize("1z", &.{ .invalid, .identifier });
    try testTokenize("1z_1", &.{ .invalid, .identifier });
    try testTokenize("9z3", &.{ .invalid, .identifier });

    try testTokenize("0_0", &.{.integer_literal});
    try testTokenize("0001", &.{.integer_literal});
    try testTokenize("01234567890", &.{.integer_literal});
testTokenize("012_345_6789_0", &.{.integer_literal}); 1792 try testTokenize("0_1_2_3_4_5_6_7_8_9_0", &.{.integer_literal}); 1793 1794 try testTokenize("00_", &.{.invalid}); 1795 try testTokenize("0_0_", &.{.invalid}); 1796 try testTokenize("0__0", &.{ .invalid, .identifier }); 1797 try testTokenize("0_0f", &.{ .invalid, .identifier }); 1798 try testTokenize("0_0_f", &.{ .invalid, .identifier }); 1799 try testTokenize("0_0_f_00", &.{ .invalid, .identifier }); 1800 try testTokenize("1_,", &.{ .invalid, .comma }); 1801 1802 try testTokenize("0.0", &.{.float_literal}); 1803 try testTokenize("1.0", &.{.float_literal}); 1804 try testTokenize("10.0", &.{.float_literal}); 1805 try testTokenize("0e0", &.{.float_literal}); 1806 try testTokenize("1e0", &.{.float_literal}); 1807 try testTokenize("1e100", &.{.float_literal}); 1808 try testTokenize("1.0e100", &.{.float_literal}); 1809 try testTokenize("1.0e+100", &.{.float_literal}); 1810 try testTokenize("1.0e-100", &.{.float_literal}); 1811 try testTokenize("1_0_0_0.0_0_0_0_0_1e1_0_0_0", &.{.float_literal}); 1812 1813 try testTokenize("1.", &.{.invalid}); 1814 try testTokenize("1e", &.{.invalid}); 1815 try testTokenize("1.e100", &.{ .invalid, .identifier }); 1816 try testTokenize("1.0e1f0", &.{ .invalid, .identifier }); 1817 try testTokenize("1.0p100", &.{ .invalid, .identifier }); 1818 try testTokenize("1.0p-100", &.{ .invalid, .identifier, .minus, .integer_literal }); 1819 try testTokenize("1.0p1f0", &.{ .invalid, .identifier }); 1820 try testTokenize("1.0_,", &.{ .invalid, .comma }); 1821 try testTokenize("1_.0", &.{ .invalid, .period, .integer_literal }); 1822 try testTokenize("1._", &.{ .invalid, .identifier }); 1823 try testTokenize("1.a", &.{ .invalid, .identifier }); 1824 try testTokenize("1.z", &.{ .invalid, .identifier }); 1825 try testTokenize("1._0", &.{ .invalid, .identifier }); 1826 try testTokenize("1.+", &.{ .invalid, .plus }); 1827 try testTokenize("1._+", &.{ .invalid, .identifier, .plus }); 1828 try testTokenize("1._e", &.{ .invalid, .identifier }); 1829 try testTokenize("1.0e", &.{.invalid}); 1830 try testTokenize("1.0e,", &.{ .invalid, .comma }); 1831 try testTokenize("1.0e_", &.{ .invalid, .identifier }); 1832 try testTokenize("1.0e+_", &.{ .invalid, .identifier }); 1833 try testTokenize("1.0e-_", &.{ .invalid, .identifier }); 1834 try testTokenize("1.0e0_+", &.{ .invalid, .plus }); 1835} 1836 1837test "tokenizer - number literals binary" { 1838 try testTokenize("0b0", &.{.integer_literal}); 1839 try testTokenize("0b1", &.{.integer_literal}); 1840 try testTokenize("0b2", &.{ .invalid, .integer_literal }); 1841 try testTokenize("0b3", &.{ .invalid, .integer_literal }); 1842 try testTokenize("0b4", &.{ .invalid, .integer_literal }); 1843 try testTokenize("0b5", &.{ .invalid, .integer_literal }); 1844 try testTokenize("0b6", &.{ .invalid, .integer_literal }); 1845 try testTokenize("0b7", &.{ .invalid, .integer_literal }); 1846 try testTokenize("0b8", &.{ .invalid, .integer_literal }); 1847 try testTokenize("0b9", &.{ .invalid, .integer_literal }); 1848 try testTokenize("0ba", &.{ .invalid, .identifier }); 1849 try testTokenize("0bb", &.{ .invalid, .identifier }); 1850 try testTokenize("0bc", &.{ .invalid, .identifier }); 1851 try testTokenize("0bd", &.{ .invalid, .identifier }); 1852 try testTokenize("0be", &.{ .invalid, .identifier }); 1853 try testTokenize("0bf", &.{ .invalid, .identifier }); 1854 try testTokenize("0bz", &.{ .invalid, .identifier }); 1855 1856 try testTokenize("0b0000_0000", &.{.integer_literal}); 1857 try 
testTokenize("0b1111_1111", &.{.integer_literal}); 1858 try testTokenize("0b10_10_10_10", &.{.integer_literal}); 1859 try testTokenize("0b0_1_0_1_0_1_0_1", &.{.integer_literal}); 1860 try testTokenize("0b1.", &.{ .integer_literal, .period }); 1861 try testTokenize("0b1.0", &.{ .integer_literal, .period, .integer_literal }); 1862 1863 try testTokenize("0B0", &.{ .invalid, .identifier }); 1864 try testTokenize("0b_", &.{ .invalid, .identifier }); 1865 try testTokenize("0b_0", &.{ .invalid, .identifier }); 1866 try testTokenize("0b1_", &.{.invalid}); 1867 try testTokenize("0b0__1", &.{ .invalid, .identifier }); 1868 try testTokenize("0b0_1_", &.{.invalid}); 1869 try testTokenize("0b1e", &.{ .invalid, .identifier }); 1870 try testTokenize("0b1p", &.{ .invalid, .identifier }); 1871 try testTokenize("0b1e0", &.{ .invalid, .identifier }); 1872 try testTokenize("0b1p0", &.{ .invalid, .identifier }); 1873 try testTokenize("0b1_,", &.{ .invalid, .comma }); 1874} 1875 1876test "tokenizer - number literals octal" { 1877 try testTokenize("0o0", &.{.integer_literal}); 1878 try testTokenize("0o1", &.{.integer_literal}); 1879 try testTokenize("0o2", &.{.integer_literal}); 1880 try testTokenize("0o3", &.{.integer_literal}); 1881 try testTokenize("0o4", &.{.integer_literal}); 1882 try testTokenize("0o5", &.{.integer_literal}); 1883 try testTokenize("0o6", &.{.integer_literal}); 1884 try testTokenize("0o7", &.{.integer_literal}); 1885 try testTokenize("0o8", &.{ .invalid, .integer_literal }); 1886 try testTokenize("0o9", &.{ .invalid, .integer_literal }); 1887 try testTokenize("0oa", &.{ .invalid, .identifier }); 1888 try testTokenize("0ob", &.{ .invalid, .identifier }); 1889 try testTokenize("0oc", &.{ .invalid, .identifier }); 1890 try testTokenize("0od", &.{ .invalid, .identifier }); 1891 try testTokenize("0oe", &.{ .invalid, .identifier }); 1892 try testTokenize("0of", &.{ .invalid, .identifier }); 1893 try testTokenize("0oz", &.{ .invalid, .identifier }); 1894 1895 try testTokenize("0o01234567", &.{.integer_literal}); 1896 try testTokenize("0o0123_4567", &.{.integer_literal}); 1897 try testTokenize("0o01_23_45_67", &.{.integer_literal}); 1898 try testTokenize("0o0_1_2_3_4_5_6_7", &.{.integer_literal}); 1899 try testTokenize("0o7.", &.{ .integer_literal, .period }); 1900 try testTokenize("0o7.0", &.{ .integer_literal, .period, .integer_literal }); 1901 1902 try testTokenize("0O0", &.{ .invalid, .identifier }); 1903 try testTokenize("0o_", &.{ .invalid, .identifier }); 1904 try testTokenize("0o_0", &.{ .invalid, .identifier }); 1905 try testTokenize("0o1_", &.{.invalid}); 1906 try testTokenize("0o0__1", &.{ .invalid, .identifier }); 1907 try testTokenize("0o0_1_", &.{.invalid}); 1908 try testTokenize("0o1e", &.{ .invalid, .identifier }); 1909 try testTokenize("0o1p", &.{ .invalid, .identifier }); 1910 try testTokenize("0o1e0", &.{ .invalid, .identifier }); 1911 try testTokenize("0o1p0", &.{ .invalid, .identifier }); 1912 try testTokenize("0o_,", &.{ .invalid, .identifier, .comma }); 1913} 1914 1915test "tokenizer - number literals hexadecimal" { 1916 try testTokenize("0x0", &.{.integer_literal}); 1917 try testTokenize("0x1", &.{.integer_literal}); 1918 try testTokenize("0x2", &.{.integer_literal}); 1919 try testTokenize("0x3", &.{.integer_literal}); 1920 try testTokenize("0x4", &.{.integer_literal}); 1921 try testTokenize("0x5", &.{.integer_literal}); 1922 try testTokenize("0x6", &.{.integer_literal}); 1923 try testTokenize("0x7", &.{.integer_literal}); 1924 try testTokenize("0x8", &.{.integer_literal}); 
    try testTokenize("0x9", &.{.integer_literal});
    try testTokenize("0xa", &.{.integer_literal});
    try testTokenize("0xb", &.{.integer_literal});
    try testTokenize("0xc", &.{.integer_literal});
    try testTokenize("0xd", &.{.integer_literal});
    try testTokenize("0xe", &.{.integer_literal});
    try testTokenize("0xf", &.{.integer_literal});
    try testTokenize("0xA", &.{.integer_literal});
    try testTokenize("0xB", &.{.integer_literal});
    try testTokenize("0xC", &.{.integer_literal});
    try testTokenize("0xD", &.{.integer_literal});
    try testTokenize("0xE", &.{.integer_literal});
    try testTokenize("0xF", &.{.integer_literal});
    try testTokenize("0x0z", &.{ .invalid, .identifier });
    try testTokenize("0xz", &.{ .invalid, .identifier });

    try testTokenize("0x0123456789ABCDEF", &.{.integer_literal});
    try testTokenize("0x0123_4567_89AB_CDEF", &.{.integer_literal});
    try testTokenize("0x01_23_45_67_89AB_CDE_F", &.{.integer_literal});
    try testTokenize("0x0_1_2_3_4_5_6_7_8_9_A_B_C_D_E_F", &.{.integer_literal});

    try testTokenize("0X0", &.{ .invalid, .identifier });
    try testTokenize("0x_", &.{ .invalid, .identifier });
    try testTokenize("0x_1", &.{ .invalid, .identifier });
    try testTokenize("0x1_", &.{.invalid});
    try testTokenize("0x0__1", &.{ .invalid, .identifier });
    try testTokenize("0x0_1_", &.{.invalid});
    try testTokenize("0x_,", &.{ .invalid, .identifier, .comma });

    try testTokenize("0x1.0", &.{.float_literal});
    try testTokenize("0xF.0", &.{.float_literal});
    try testTokenize("0xF.F", &.{.float_literal});
    try testTokenize("0xF.Fp0", &.{.float_literal});
    try testTokenize("0xF.FP0", &.{.float_literal});
    try testTokenize("0x1p0", &.{.float_literal});
    try testTokenize("0xfp0", &.{.float_literal});
    try testTokenize("0x1.0+0xF.0", &.{ .float_literal, .plus, .float_literal });

    try testTokenize("0x1.", &.{.invalid});
    try testTokenize("0xF.", &.{.invalid});
    try testTokenize("0x1.+0xF.", &.{ .invalid, .plus, .invalid });
    try testTokenize("0xff.p10", &.{ .invalid, .identifier });

    try testTokenize("0x0123456.789ABCDEF", &.{.float_literal});
    try testTokenize("0x0_123_456.789_ABC_DEF", &.{.float_literal});
    try testTokenize("0x0_1_2_3_4_5_6.7_8_9_A_B_C_D_E_F", &.{.float_literal});
    try testTokenize("0x0p0", &.{.float_literal});
    try testTokenize("0x0.0p0", &.{.float_literal});
    try testTokenize("0xff.ffp10", &.{.float_literal});
    try testTokenize("0xff.ffP10", &.{.float_literal});
    try testTokenize("0xffp10", &.{.float_literal});
    try testTokenize("0xff_ff.ff_ffp1_0_0_0", &.{.float_literal});
    try testTokenize("0xf_f_f_f.f_f_f_fp+1_000", &.{.float_literal});
    try testTokenize("0xf_f_f_f.f_f_f_fp-1_00_0", &.{.float_literal});

    try testTokenize("0x1e", &.{.integer_literal});
    try testTokenize("0x1e0", &.{.integer_literal});
    try testTokenize("0x1p", &.{.invalid});
    try testTokenize("0xfp0z1", &.{ .invalid, .identifier });
    try testTokenize("0xff.ffpff", &.{ .invalid, .identifier });
    try testTokenize("0x0.p", &.{ .invalid, .identifier });
    try testTokenize("0x0.z", &.{ .invalid, .identifier });
    try testTokenize("0x0._", &.{ .invalid, .identifier });
    try testTokenize("0x0_.0", &.{ .invalid, .period, .integer_literal });
    try testTokenize("0x0_.0.0", &.{ .invalid, .period, .float_literal });
    try testTokenize("0x0._0", &.{ .invalid, .identifier });
    try testTokenize("0x0.0_", &.{.invalid});
    try testTokenize("0x0_p0", &.{ .invalid, .identifier });
    try testTokenize("0x0_.p0", &.{ .invalid, .period, .identifier });
    try testTokenize("0x0._p0", &.{ .invalid, .identifier });
    try testTokenize("0x0.0_p0", &.{ .invalid, .identifier });
    try testTokenize("0x0._0p0", &.{ .invalid, .identifier });
    try testTokenize("0x0.0p_0", &.{ .invalid, .identifier });
    try testTokenize("0x0.0p+_0", &.{ .invalid, .identifier });
    try testTokenize("0x0.0p-_0", &.{ .invalid, .identifier });
    try testTokenize("0x0.0p0_", &.{ .invalid, .eof });
}

test "tokenizer - multi line string literal with only 1 backslash" {
    try testTokenize("x \\\n;", &.{ .identifier, .invalid, .semicolon });
}

test "tokenizer - invalid builtin identifiers" {
    try testTokenize("@()", &.{ .invalid, .l_paren, .r_paren });
    try testTokenize("@0()", &.{ .invalid, .integer_literal, .l_paren, .r_paren });
}

test "tokenizer - invalid token with unfinished escape right before eof" {
    try testTokenize("\"\\", &.{.invalid});
    try testTokenize("'\\", &.{.invalid});
    try testTokenize("'\\u", &.{.invalid});
}

test "tokenizer - saturating" {
    try testTokenize("<<", &.{.angle_bracket_angle_bracket_left});
    try testTokenize("<<|", &.{.angle_bracket_angle_bracket_left_pipe});
    try testTokenize("<<|=", &.{.angle_bracket_angle_bracket_left_pipe_equal});

    try testTokenize("*", &.{.asterisk});
    try testTokenize("*|", &.{.asterisk_pipe});
    try testTokenize("*|=", &.{.asterisk_pipe_equal});

    try testTokenize("+", &.{.plus});
    try testTokenize("+|", &.{.plus_pipe});
    try testTokenize("+|=", &.{.plus_pipe_equal});

    try testTokenize("-", &.{.minus});
    try testTokenize("-|", &.{.minus_pipe});
    try testTokenize("-|=", &.{.minus_pipe_equal});
}

// Checks that `source` tokenizes to exactly `expected_tokens`, followed by a
// single `.eof` token whose location is the end of the source.
fn testTokenize(source: [:0]const u8, expected_tokens: []const Token.Tag) !void {
    var tokenizer = Tokenizer.init(source);
    for (expected_tokens) |expected_token_id| {
        const token = tokenizer.next();
        if (token.tag != expected_token_id) {
            std.debug.panic("expected {s}, found {s}\n", .{
                @tagName(expected_token_id), @tagName(token.tag),
            });
        }
    }
    const last_token = tokenizer.next();
    try std.testing.expectEqual(Token.Tag.eof, last_token.tag);
    try std.testing.expectEqual(source.len, last_token.loc.start);
}
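
// Example (added sketch, not part of the original file): `Token.loc` holds
// byte offsets into the source, so a token's text can be recovered by
// slicing the original buffer.
test "Token.loc slices the source" {
    const src: [:0]const u8 = "const answer = 42;";
    var tokenizer = Tokenizer.init(src);
    const token = tokenizer.next();
    try std.testing.expectEqual(Token.Tag.keyword_const, token.tag);
    try std.testing.expectEqualStrings("const", src[token.loc.start..token.loc.end]);
}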