1const std = @import("../std.zig");
2const mem = std.mem;
3
/// A lexical token: what kind of token it is (`tag`) and where it sits in the
/// tokenized buffer (`loc`).
pub const Token = struct {
    tag: Tag,
    loc: Loc,

    /// Pair of offsets into the source buffer delimiting a token.
    pub const Loc = struct {
        start: usize,
        end: usize,
    };

    /// Compile-time table mapping each keyword's spelling to its `Tag`.
    /// Entries are listed alphabetically.
    pub const keywords = std.ComptimeStringMap(Tag, .{
        .{ "addrspace", .keyword_addrspace },
        .{ "align", .keyword_align },
        .{ "allowzero", .keyword_allowzero },
        .{ "and", .keyword_and },
        .{ "anyframe", .keyword_anyframe },
        .{ "anytype", .keyword_anytype },
        .{ "asm", .keyword_asm },
        .{ "async", .keyword_async },
        .{ "await", .keyword_await },
        .{ "break", .keyword_break },
        .{ "callconv", .keyword_callconv },
        .{ "catch", .keyword_catch },
        .{ "comptime", .keyword_comptime },
        .{ "const", .keyword_const },
        .{ "continue", .keyword_continue },
        .{ "defer", .keyword_defer },
        .{ "else", .keyword_else },
        .{ "enum", .keyword_enum },
        .{ "errdefer", .keyword_errdefer },
        .{ "error", .keyword_error },
        .{ "export", .keyword_export },
        .{ "extern", .keyword_extern },
        .{ "fn", .keyword_fn },
        .{ "for", .keyword_for },
        .{ "if", .keyword_if },
        .{ "inline", .keyword_inline },
        .{ "linksection", .keyword_linksection },
        .{ "noalias", .keyword_noalias },
        .{ "noinline", .keyword_noinline },
        .{ "nosuspend", .keyword_nosuspend },
        .{ "opaque", .keyword_opaque },
        .{ "or", .keyword_or },
        .{ "orelse", .keyword_orelse },
        .{ "packed", .keyword_packed },
        .{ "pub", .keyword_pub },
        .{ "resume", .keyword_resume },
        .{ "return", .keyword_return },
        .{ "struct", .keyword_struct },
        .{ "suspend", .keyword_suspend },
        .{ "switch", .keyword_switch },
        .{ "test", .keyword_test },
        .{ "threadlocal", .keyword_threadlocal },
        .{ "try", .keyword_try },
        .{ "union", .keyword_union },
        .{ "unreachable", .keyword_unreachable },
        .{ "usingnamespace", .keyword_usingnamespace },
        .{ "var", .keyword_var },
        .{ "volatile", .keyword_volatile },
        .{ "while", .keyword_while },
    });

    /// Looks `bytes` up in `keywords`; yields `null` when the table has no
    /// entry for it.
    pub fn getKeyword(bytes: []const u8) ?Tag {
        return keywords.get(bytes);
    }

    /// Every kind of token the tokenizer can produce.
    pub const Tag = enum {
        invalid,
        invalid_periodasterisks,
        identifier,
        string_literal,
        multiline_string_literal_line,
        char_literal,
        eof,
        builtin,

        // Punctuation and operators.
        bang,
        pipe,
        pipe_pipe,
        pipe_equal,
        equal,
        equal_equal,
        equal_angle_bracket_right,
        bang_equal,
        l_paren,
        r_paren,
        semicolon,
        percent,
        percent_equal,
        l_brace,
        r_brace,
        l_bracket,
        r_bracket,
        period,
        period_asterisk,
        ellipsis2,
        ellipsis3,
        caret,
        caret_equal,
        plus,
        plus_plus,
        plus_equal,
        plus_percent,
        plus_percent_equal,
        plus_pipe,
        plus_pipe_equal,
        minus,
        minus_equal,
        minus_percent,
        minus_percent_equal,
        minus_pipe,
        minus_pipe_equal,
        asterisk,
        asterisk_equal,
        asterisk_asterisk,
        asterisk_percent,
        asterisk_percent_equal,
        asterisk_pipe,
        asterisk_pipe_equal,
        arrow,
        colon,
        slash,
        slash_equal,
        comma,
        ampersand,
        ampersand_equal,
        question_mark,
        angle_bracket_left,
        angle_bracket_left_equal,
        angle_bracket_angle_bracket_left,
        angle_bracket_angle_bracket_left_equal,
        angle_bracket_angle_bracket_left_pipe,
        angle_bracket_angle_bracket_left_pipe_equal,
        angle_bracket_right,
        angle_bracket_right_equal,
        angle_bracket_angle_bracket_right,
        angle_bracket_angle_bracket_right_equal,
        tilde,

        // Numbers and comments.
        integer_literal,
        float_literal,
        doc_comment,
        container_doc_comment,

        // Keywords, one tag per entry of `Token.keywords`.
        keyword_addrspace,
        keyword_align,
        keyword_allowzero,
        keyword_and,
        keyword_anyframe,
        keyword_anytype,
        keyword_asm,
        keyword_async,
        keyword_await,
        keyword_break,
        keyword_callconv,
        keyword_catch,
        keyword_comptime,
        keyword_const,
        keyword_continue,
        keyword_defer,
        keyword_else,
        keyword_enum,
        keyword_errdefer,
        keyword_error,
        keyword_export,
        keyword_extern,
        keyword_fn,
        keyword_for,
        keyword_if,
        keyword_inline,
        keyword_noalias,
        keyword_noinline,
        keyword_nosuspend,
        keyword_opaque,
        keyword_or,
        keyword_orelse,
        keyword_packed,
        keyword_pub,
        keyword_resume,
        keyword_return,
        keyword_linksection,
        keyword_struct,
        keyword_suspend,
        keyword_switch,
        keyword_test,
        keyword_threadlocal,
        keyword_try,
        keyword_union,
        keyword_unreachable,
        keyword_usingnamespace,
        keyword_var,
        keyword_volatile,
        keyword_while,

        /// Returns the fixed source spelling of `tag`, or `null` for tags
        /// whose text varies from token to token (identifiers, literals,
        /// comments, `eof`, `invalid`, ...).
        pub fn lexeme(tag: Tag) ?[]const u8 {
            return switch (tag) {
                // Keywords: the lexeme is the tag name minus its `keyword_` prefix.
                .keyword_addrspace => "addrspace",
                .keyword_align => "align",
                .keyword_allowzero => "allowzero",
                .keyword_and => "and",
                .keyword_anyframe => "anyframe",
                .keyword_anytype => "anytype",
                .keyword_asm => "asm",
                .keyword_async => "async",
                .keyword_await => "await",
                .keyword_break => "break",
                .keyword_callconv => "callconv",
                .keyword_catch => "catch",
                .keyword_comptime => "comptime",
                .keyword_const => "const",
                .keyword_continue => "continue",
                .keyword_defer => "defer",
                .keyword_else => "else",
                .keyword_enum => "enum",
                .keyword_errdefer => "errdefer",
                .keyword_error => "error",
                .keyword_export => "export",
                .keyword_extern => "extern",
                .keyword_fn => "fn",
                .keyword_for => "for",
                .keyword_if => "if",
                .keyword_inline => "inline",
                .keyword_linksection => "linksection",
                .keyword_noalias => "noalias",
                .keyword_noinline => "noinline",
                .keyword_nosuspend => "nosuspend",
                .keyword_opaque => "opaque",
                .keyword_or => "or",
                .keyword_orelse => "orelse",
                .keyword_packed => "packed",
                .keyword_pub => "pub",
                .keyword_resume => "resume",
                .keyword_return => "return",
                .keyword_struct => "struct",
                .keyword_suspend => "suspend",
                .keyword_switch => "switch",
                .keyword_test => "test",
                .keyword_threadlocal => "threadlocal",
                .keyword_try => "try",
                .keyword_union => "union",
                .keyword_unreachable => "unreachable",
                .keyword_usingnamespace => "usingnamespace",
                .keyword_var => "var",
                .keyword_volatile => "volatile",
                .keyword_while => "while",

                // Brackets, separators and arrows.
                .l_paren => "(",
                .r_paren => ")",
                .l_brace => "{",
                .r_brace => "}",
                .l_bracket => "[",
                .r_bracket => "]",
                .semicolon => ";",
                .colon => ":",
                .comma => ",",
                .question_mark => "?",
                .arrow => "->",
                .equal_angle_bracket_right => "=>",

                // Period family.
                .period => ".",
                .period_asterisk => ".*",
                .invalid_periodasterisks => ".**",
                .ellipsis2 => "..",
                .ellipsis3 => "...",

                // Equals, bang and single angle brackets.
                .equal => "=",
                .equal_equal => "==",
                .bang => "!",
                .bang_equal => "!=",
                .angle_bracket_left => "<",
                .angle_bracket_left_equal => "<=",
                .angle_bracket_right => ">",
                .angle_bracket_right_equal => ">=",

                // Plus family.
                .plus => "+",
                .plus_plus => "++",
                .plus_equal => "+=",
                .plus_percent => "+%",
                .plus_percent_equal => "+%=",
                .plus_pipe => "+|",
                .plus_pipe_equal => "+|=",

                // Minus family.
                .minus => "-",
                .minus_equal => "-=",
                .minus_percent => "-%",
                .minus_percent_equal => "-%=",
                .minus_pipe => "-|",
                .minus_pipe_equal => "-|=",

                // Asterisk family.
                .asterisk => "*",
                .asterisk_asterisk => "**",
                .asterisk_equal => "*=",
                .asterisk_percent => "*%",
                .asterisk_percent_equal => "*%=",
                .asterisk_pipe => "*|",
                .asterisk_pipe_equal => "*|=",

                // Slash and percent.
                .slash => "/",
                .slash_equal => "/=",
                .percent => "%",
                .percent_equal => "%=",

                // Pipe, ampersand, caret, tilde and double angle brackets.
                .pipe => "|",
                .pipe_pipe => "||",
                .pipe_equal => "|=",
                .ampersand => "&",
                .ampersand_equal => "&=",
                .caret => "^",
                .caret_equal => "^=",
                .tilde => "~",
                .angle_bracket_angle_bracket_left => "<<",
                .angle_bracket_angle_bracket_left_equal => "<<=",
                .angle_bracket_angle_bracket_left_pipe => "<<|",
                .angle_bracket_angle_bracket_left_pipe_equal => "<<|=",
                .angle_bracket_angle_bracket_right => ">>",
                .angle_bracket_angle_bracket_right_equal => ">>=",

                // Tags without a single fixed spelling.
                .invalid,
                .identifier,
                .string_literal,
                .multiline_string_literal_line,
                .char_literal,
                .eof,
                .builtin,
                .integer_literal,
                .float_literal,
                .doc_comment,
                .container_doc_comment,
                => null,
            };
        }

        /// Returns a printable name for `tag`: its lexeme when it has one,
        /// otherwise the enum field name from `@tagName`.
        pub fn symbol(tag: Tag) []const u8 {
            if (tag.lexeme()) |text| return text;
            return @tagName(tag);
        }
    };
};
329
330pub const Tokenizer = struct {
331    buffer: [:0]const u8,
332    index: usize,
333    pending_invalid_token: ?Token,
334
335    /// For debugging purposes
336    pub fn dump(self: *Tokenizer, token: *const Token) void {
337        std.debug.print("{s} \"{s}\"\n", .{ @tagName(token.tag), self.buffer[token.start..token.end] });
338    }
339
340    pub fn init(buffer: [:0]const u8) Tokenizer {
341        // Skip the UTF-8 BOM if present
342        const src_start = if (mem.startsWith(u8, buffer, "\xEF\xBB\xBF")) 3 else @as(usize, 0);
343        return Tokenizer{
344            .buffer = buffer,
345            .index = src_start,
346            .pending_invalid_token = null,
347        };
348    }
349
350    const State = enum {
351        start,
352        identifier,
353        builtin,
354        string_literal,
355        string_literal_backslash,
356        multiline_string_literal_line,
357        char_literal,
358        char_literal_backslash,
359        char_literal_hex_escape,
360        char_literal_unicode_escape_saw_u,
361        char_literal_unicode_escape,
362        char_literal_unicode_invalid,
363        char_literal_unicode,
364        char_literal_end,
365        backslash,
366        equal,
367        bang,
368        pipe,
369        minus,
370        minus_percent,
371        minus_pipe,
372        asterisk,
373        asterisk_percent,
374        asterisk_pipe,
375        slash,
376        line_comment_start,
377        line_comment,
378        doc_comment_start,
379        doc_comment,
380        zero,
381        int_literal_dec,
382        int_literal_dec_no_underscore,
383        int_literal_bin,
384        int_literal_bin_no_underscore,
385        int_literal_oct,
386        int_literal_oct_no_underscore,
387        int_literal_hex,
388        int_literal_hex_no_underscore,
389        num_dot_dec,
390        num_dot_hex,
391        float_fraction_dec,
392        float_fraction_dec_no_underscore,
393        float_fraction_hex,
394        float_fraction_hex_no_underscore,
395        float_exponent_unsigned,
396        float_exponent_num,
397        float_exponent_num_no_underscore,
398        ampersand,
399        caret,
400        percent,
401        plus,
402        plus_percent,
403        plus_pipe,
404        angle_bracket_left,
405        angle_bracket_angle_bracket_left,
406        angle_bracket_angle_bracket_left_pipe,
407        angle_bracket_right,
408        angle_bracket_angle_bracket_right,
409        period,
410        period_2,
411        period_asterisk,
412        saw_at_sign,
413    };
414
415    pub fn next(self: *Tokenizer) Token {
416        if (self.pending_invalid_token) |token| {
417            self.pending_invalid_token = null;
418            return token;
419        }
420        var state: State = .start;
421        var result = Token{
422            .tag = .eof,
423            .loc = .{
424                .start = self.index,
425                .end = undefined,
426            },
427        };
428        var seen_escape_digits: usize = undefined;
429        var remaining_code_units: usize = undefined;
430        while (true) : (self.index += 1) {
431            const c = self.buffer[self.index];
432            switch (state) {
433                .start => switch (c) {
434                    0 => break,
435                    ' ', '\n', '\t', '\r' => {
436                        result.loc.start = self.index + 1;
437                    },
438                    '"' => {
439                        state = .string_literal;
440                        result.tag = .string_literal;
441                    },
442                    '\'' => {
443                        state = .char_literal;
444                    },
445                    'a'...'z', 'A'...'Z', '_' => {
446                        state = .identifier;
447                        result.tag = .identifier;
448                    },
449                    '@' => {
450                        state = .saw_at_sign;
451                    },
452                    '=' => {
453                        state = .equal;
454                    },
455                    '!' => {
456                        state = .bang;
457                    },
458                    '|' => {
459                        state = .pipe;
460                    },
461                    '(' => {
462                        result.tag = .l_paren;
463                        self.index += 1;
464                        break;
465                    },
466                    ')' => {
467                        result.tag = .r_paren;
468                        self.index += 1;
469                        break;
470                    },
471                    '[' => {
472                        result.tag = .l_bracket;
473                        self.index += 1;
474                        break;
475                    },
476                    ']' => {
477                        result.tag = .r_bracket;
478                        self.index += 1;
479                        break;
480                    },
481                    ';' => {
482                        result.tag = .semicolon;
483                        self.index += 1;
484                        break;
485                    },
486                    ',' => {
487                        result.tag = .comma;
488                        self.index += 1;
489                        break;
490                    },
491                    '?' => {
492                        result.tag = .question_mark;
493                        self.index += 1;
494                        break;
495                    },
496                    ':' => {
497                        result.tag = .colon;
498                        self.index += 1;
499                        break;
500                    },
501                    '%' => {
502                        state = .percent;
503                    },
504                    '*' => {
505                        state = .asterisk;
506                    },
507                    '+' => {
508                        state = .plus;
509                    },
510                    '<' => {
511                        state = .angle_bracket_left;
512                    },
513                    '>' => {
514                        state = .angle_bracket_right;
515                    },
516                    '^' => {
517                        state = .caret;
518                    },
519                    '\\' => {
520                        state = .backslash;
521                        result.tag = .multiline_string_literal_line;
522                    },
523                    '{' => {
524                        result.tag = .l_brace;
525                        self.index += 1;
526                        break;
527                    },
528                    '}' => {
529                        result.tag = .r_brace;
530                        self.index += 1;
531                        break;
532                    },
533                    '~' => {
534                        result.tag = .tilde;
535                        self.index += 1;
536                        break;
537                    },
538                    '.' => {
539                        state = .period;
540                    },
541                    '-' => {
542                        state = .minus;
543                    },
544                    '/' => {
545                        state = .slash;
546                    },
547                    '&' => {
548                        state = .ampersand;
549                    },
550                    '0' => {
551                        state = .zero;
552                        result.tag = .integer_literal;
553                    },
554                    '1'...'9' => {
555                        state = .int_literal_dec;
556                        result.tag = .integer_literal;
557                    },
558                    else => {
559                        result.tag = .invalid;
560                        result.loc.end = self.index;
561                        self.index += 1;
562                        return result;
563                    },
564                },
565
566                .saw_at_sign => switch (c) {
567                    '"' => {
568                        result.tag = .identifier;
569                        state = .string_literal;
570                    },
571                    'a'...'z', 'A'...'Z', '_' => {
572                        state = .builtin;
573                        result.tag = .builtin;
574                    },
575                    else => {
576                        result.tag = .invalid;
577                        break;
578                    },
579                },
580
581                .ampersand => switch (c) {
582                    '=' => {
583                        result.tag = .ampersand_equal;
584                        self.index += 1;
585                        break;
586                    },
587                    else => {
588                        result.tag = .ampersand;
589                        break;
590                    },
591                },
592
593                .asterisk => switch (c) {
594                    '=' => {
595                        result.tag = .asterisk_equal;
596                        self.index += 1;
597                        break;
598                    },
599                    '*' => {
600                        result.tag = .asterisk_asterisk;
601                        self.index += 1;
602                        break;
603                    },
604                    '%' => {
605                        state = .asterisk_percent;
606                    },
607                    '|' => {
608                        state = .asterisk_pipe;
609                    },
610                    else => {
611                        result.tag = .asterisk;
612                        break;
613                    },
614                },
615
616                .asterisk_percent => switch (c) {
617                    '=' => {
618                        result.tag = .asterisk_percent_equal;
619                        self.index += 1;
620                        break;
621                    },
622                    else => {
623                        result.tag = .asterisk_percent;
624                        break;
625                    },
626                },
627
628                .asterisk_pipe => switch (c) {
629                    '=' => {
630                        result.tag = .asterisk_pipe_equal;
631                        self.index += 1;
632                        break;
633                    },
634                    else => {
635                        result.tag = .asterisk_pipe;
636                        break;
637                    },
638                },
639
640                .percent => switch (c) {
641                    '=' => {
642                        result.tag = .percent_equal;
643                        self.index += 1;
644                        break;
645                    },
646                    else => {
647                        result.tag = .percent;
648                        break;
649                    },
650                },
651
652                .plus => switch (c) {
653                    '=' => {
654                        result.tag = .plus_equal;
655                        self.index += 1;
656                        break;
657                    },
658                    '+' => {
659                        result.tag = .plus_plus;
660                        self.index += 1;
661                        break;
662                    },
663                    '%' => {
664                        state = .plus_percent;
665                    },
666                    '|' => {
667                        state = .plus_pipe;
668                    },
669                    else => {
670                        result.tag = .plus;
671                        break;
672                    },
673                },
674
675                .plus_percent => switch (c) {
676                    '=' => {
677                        result.tag = .plus_percent_equal;
678                        self.index += 1;
679                        break;
680                    },
681                    else => {
682                        result.tag = .plus_percent;
683                        break;
684                    },
685                },
686
687                .plus_pipe => switch (c) {
688                    '=' => {
689                        result.tag = .plus_pipe_equal;
690                        self.index += 1;
691                        break;
692                    },
693                    else => {
694                        result.tag = .plus_pipe;
695                        break;
696                    },
697                },
698
699                .caret => switch (c) {
700                    '=' => {
701                        result.tag = .caret_equal;
702                        self.index += 1;
703                        break;
704                    },
705                    else => {
706                        result.tag = .caret;
707                        break;
708                    },
709                },
710
711                .identifier => switch (c) {
712                    'a'...'z', 'A'...'Z', '_', '0'...'9' => {},
713                    else => {
714                        if (Token.getKeyword(self.buffer[result.loc.start..self.index])) |tag| {
715                            result.tag = tag;
716                        }
717                        break;
718                    },
719                },
720                .builtin => switch (c) {
721                    'a'...'z', 'A'...'Z', '_', '0'...'9' => {},
722                    else => break,
723                },
724                .backslash => switch (c) {
725                    '\\' => {
726                        state = .multiline_string_literal_line;
727                    },
728                    else => {
729                        result.tag = .invalid;
730                        break;
731                    },
732                },
733                .string_literal => switch (c) {
734                    '\\' => {
735                        state = .string_literal_backslash;
736                    },
737                    '"' => {
738                        self.index += 1;
739                        break;
740                    },
741                    0 => {
742                        if (self.index == self.buffer.len) {
743                            break;
744                        } else {
745                            self.checkLiteralCharacter();
746                        }
747                    },
748                    '\n' => {
749                        result.tag = .invalid;
750                        break;
751                    },
752                    else => self.checkLiteralCharacter(),
753                },
754
755                .string_literal_backslash => switch (c) {
756                    0, '\n' => {
757                        result.tag = .invalid;
758                        break;
759                    },
760                    else => {
761                        state = .string_literal;
762                    },
763                },
764
765                .char_literal => switch (c) {
766                    0 => {
767                        result.tag = .invalid;
768                        break;
769                    },
770                    '\\' => {
771                        state = .char_literal_backslash;
772                    },
773                    '\'', 0x80...0xbf, 0xf8...0xff => {
774                        result.tag = .invalid;
775                        break;
776                    },
777                    0xc0...0xdf => { // 110xxxxx
778                        remaining_code_units = 1;
779                        state = .char_literal_unicode;
780                    },
781                    0xe0...0xef => { // 1110xxxx
782                        remaining_code_units = 2;
783                        state = .char_literal_unicode;
784                    },
785                    0xf0...0xf7 => { // 11110xxx
786                        remaining_code_units = 3;
787                        state = .char_literal_unicode;
788                    },
789                    else => {
790                        state = .char_literal_end;
791                    },
792                },
793
794                .char_literal_backslash => switch (c) {
795                    0, '\n' => {
796                        result.tag = .invalid;
797                        break;
798                    },
799                    'x' => {
800                        state = .char_literal_hex_escape;
801                        seen_escape_digits = 0;
802                    },
803                    'u' => {
804                        state = .char_literal_unicode_escape_saw_u;
805                    },
806                    else => {
807                        state = .char_literal_end;
808                    },
809                },
810
811                .char_literal_hex_escape => switch (c) {
812                    '0'...'9', 'a'...'f', 'A'...'F' => {
813                        seen_escape_digits += 1;
814                        if (seen_escape_digits == 2) {
815                            state = .char_literal_end;
816                        }
817                    },
818                    else => {
819                        result.tag = .invalid;
820                        break;
821                    },
822                },
823
824                .char_literal_unicode_escape_saw_u => switch (c) {
825                    0 => {
826                        result.tag = .invalid;
827                        break;
828                    },
829                    '{' => {
830                        state = .char_literal_unicode_escape;
831                    },
832                    else => {
833                        result.tag = .invalid;
834                        state = .char_literal_unicode_invalid;
835                    },
836                },
837
838                .char_literal_unicode_escape => switch (c) {
839                    0 => {
840                        result.tag = .invalid;
841                        break;
842                    },
843                    '0'...'9', 'a'...'f', 'A'...'F' => {},
844                    '}' => {
845                        state = .char_literal_end; // too many/few digits handled later
846                    },
847                    else => {
848                        result.tag = .invalid;
849                        state = .char_literal_unicode_invalid;
850                    },
851                },
852
853                .char_literal_unicode_invalid => switch (c) {
854                    // Keep consuming characters until an obvious stopping point.
855                    // This consolidates e.g. `u{0ab1Q}` into a single invalid token
856                    // instead of creating the tokens `u{0ab1`, `Q`, `}`
857                    '0'...'9', 'a'...'z', 'A'...'Z', '}' => {},
858                    else => break,
859                },
860
861                .char_literal_end => switch (c) {
862                    '\'' => {
863                        result.tag = .char_literal;
864                        self.index += 1;
865                        break;
866                    },
867                    else => {
868                        result.tag = .invalid;
869                        break;
870                    },
871                },
872
873                .char_literal_unicode => switch (c) {
874                    0x80...0xbf => {
875                        remaining_code_units -= 1;
876                        if (remaining_code_units == 0) {
877                            state = .char_literal_end;
878                        }
879                    },
880                    else => {
881                        result.tag = .invalid;
882                        break;
883                    },
884                },
885
886                .multiline_string_literal_line => switch (c) {
887                    0 => break,
888                    '\n' => {
889                        self.index += 1;
890                        break;
891                    },
892                    '\t' => {},
893                    else => self.checkLiteralCharacter(),
894                },
895
896                .bang => switch (c) {
897                    '=' => {
898                        result.tag = .bang_equal;
899                        self.index += 1;
900                        break;
901                    },
902                    else => {
903                        result.tag = .bang;
904                        break;
905                    },
906                },
907
908                .pipe => switch (c) {
909                    '=' => {
910                        result.tag = .pipe_equal;
911                        self.index += 1;
912                        break;
913                    },
914                    '|' => {
915                        result.tag = .pipe_pipe;
916                        self.index += 1;
917                        break;
918                    },
919                    else => {
920                        result.tag = .pipe;
921                        break;
922                    },
923                },
924
925                .equal => switch (c) {
926                    '=' => {
927                        result.tag = .equal_equal;
928                        self.index += 1;
929                        break;
930                    },
931                    '>' => {
932                        result.tag = .equal_angle_bracket_right;
933                        self.index += 1;
934                        break;
935                    },
936                    else => {
937                        result.tag = .equal;
938                        break;
939                    },
940                },
941
942                .minus => switch (c) {
943                    '>' => {
944                        result.tag = .arrow;
945                        self.index += 1;
946                        break;
947                    },
948                    '=' => {
949                        result.tag = .minus_equal;
950                        self.index += 1;
951                        break;
952                    },
953                    '%' => {
954                        state = .minus_percent;
955                    },
956                    '|' => {
957                        state = .minus_pipe;
958                    },
959                    else => {
960                        result.tag = .minus;
961                        break;
962                    },
963                },
964
965                .minus_percent => switch (c) {
966                    '=' => {
967                        result.tag = .minus_percent_equal;
968                        self.index += 1;
969                        break;
970                    },
971                    else => {
972                        result.tag = .minus_percent;
973                        break;
974                    },
975                },
976                .minus_pipe => switch (c) {
977                    '=' => {
978                        result.tag = .minus_pipe_equal;
979                        self.index += 1;
980                        break;
981                    },
982                    else => {
983                        result.tag = .minus_pipe;
984                        break;
985                    },
986                },
987
988                .angle_bracket_left => switch (c) {
989                    '<' => {
990                        state = .angle_bracket_angle_bracket_left;
991                    },
992                    '=' => {
993                        result.tag = .angle_bracket_left_equal;
994                        self.index += 1;
995                        break;
996                    },
997                    else => {
998                        result.tag = .angle_bracket_left;
999                        break;
1000                    },
1001                },
1002
1003                .angle_bracket_angle_bracket_left => switch (c) {
1004                    '=' => {
1005                        result.tag = .angle_bracket_angle_bracket_left_equal;
1006                        self.index += 1;
1007                        break;
1008                    },
1009                    '|' => {
1010                        state = .angle_bracket_angle_bracket_left_pipe;
1011                    },
1012                    else => {
1013                        result.tag = .angle_bracket_angle_bracket_left;
1014                        break;
1015                    },
1016                },
1017
1018                .angle_bracket_angle_bracket_left_pipe => switch (c) {
1019                    '=' => {
1020                        result.tag = .angle_bracket_angle_bracket_left_pipe_equal;
1021                        self.index += 1;
1022                        break;
1023                    },
1024                    else => {
1025                        result.tag = .angle_bracket_angle_bracket_left_pipe;
1026                        break;
1027                    },
1028                },
1029
1030                .angle_bracket_right => switch (c) {
1031                    '>' => {
1032                        state = .angle_bracket_angle_bracket_right;
1033                    },
1034                    '=' => {
1035                        result.tag = .angle_bracket_right_equal;
1036                        self.index += 1;
1037                        break;
1038                    },
1039                    else => {
1040                        result.tag = .angle_bracket_right;
1041                        break;
1042                    },
1043                },
1044
1045                .angle_bracket_angle_bracket_right => switch (c) {
1046                    '=' => {
1047                        result.tag = .angle_bracket_angle_bracket_right_equal;
1048                        self.index += 1;
1049                        break;
1050                    },
1051                    else => {
1052                        result.tag = .angle_bracket_angle_bracket_right;
1053                        break;
1054                    },
1055                },
1056
1057                .period => switch (c) {
1058                    '.' => {
1059                        state = .period_2;
1060                    },
1061                    '*' => {
1062                        state = .period_asterisk;
1063                    },
1064                    else => {
1065                        result.tag = .period;
1066                        break;
1067                    },
1068                },
1069
1070                .period_2 => switch (c) {
1071                    '.' => {
1072                        result.tag = .ellipsis3;
1073                        self.index += 1;
1074                        break;
1075                    },
1076                    else => {
1077                        result.tag = .ellipsis2;
1078                        break;
1079                    },
1080                },
1081
1082                .period_asterisk => switch (c) {
1083                    '*' => {
1084                        result.tag = .invalid_periodasterisks;
1085                        break;
1086                    },
1087                    else => {
1088                        result.tag = .period_asterisk;
1089                        break;
1090                    },
1091                },
1092
1093                .slash => switch (c) {
1094                    '/' => {
1095                        state = .line_comment_start;
1096                    },
1097                    '=' => {
1098                        result.tag = .slash_equal;
1099                        self.index += 1;
1100                        break;
1101                    },
1102                    else => {
1103                        result.tag = .slash;
1104                        break;
1105                    },
1106                },
1107                .line_comment_start => switch (c) {
1108                    0 => {
1109                        if (self.index != self.buffer.len) {
1110                            result.tag = .invalid;
1111                            self.index += 1;
1112                        }
1113                        break;
1114                    },
1115                    '/' => {
1116                        state = .doc_comment_start;
1117                    },
1118                    '!' => {
1119                        result.tag = .container_doc_comment;
1120                        state = .doc_comment;
1121                    },
1122                    '\n' => {
1123                        state = .start;
1124                        result.loc.start = self.index + 1;
1125                    },
1126                    '\t', '\r' => state = .line_comment,
1127                    else => {
1128                        state = .line_comment;
1129                        self.checkLiteralCharacter();
1130                    },
1131                },
1132                .doc_comment_start => switch (c) {
1133                    '/' => {
1134                        state = .line_comment;
1135                    },
1136                    0, '\n' => {
1137                        result.tag = .doc_comment;
1138                        break;
1139                    },
1140                    '\t', '\r' => {
1141                        state = .doc_comment;
1142                        result.tag = .doc_comment;
1143                    },
1144                    else => {
1145                        state = .doc_comment;
1146                        result.tag = .doc_comment;
1147                        self.checkLiteralCharacter();
1148                    },
1149                },
1150                .line_comment => switch (c) {
1151                    0 => break,
1152                    '\n' => {
1153                        state = .start;
1154                        result.loc.start = self.index + 1;
1155                    },
1156                    '\t', '\r' => {},
1157                    else => self.checkLiteralCharacter(),
1158                },
1159                .doc_comment => switch (c) {
1160                    0, '\n' => break,
1161                    '\t', '\r' => {},
1162                    else => self.checkLiteralCharacter(),
1163                },
1164                .zero => switch (c) {
1165                    'b' => {
1166                        state = .int_literal_bin_no_underscore;
1167                    },
1168                    'o' => {
1169                        state = .int_literal_oct_no_underscore;
1170                    },
1171                    'x' => {
1172                        state = .int_literal_hex_no_underscore;
1173                    },
1174                    '0'...'9', '_', '.', 'e', 'E' => {
1175                        // reinterpret as a decimal number
1176                        self.index -= 1;
1177                        state = .int_literal_dec;
1178                    },
1179                    'a', 'c', 'd', 'f'...'n', 'p'...'w', 'y', 'z', 'A'...'D', 'F'...'Z' => {
1180                        result.tag = .invalid;
1181                        break;
1182                    },
1183                    else => break,
1184                },
1185                .int_literal_bin_no_underscore => switch (c) {
1186                    '0'...'1' => {
1187                        state = .int_literal_bin;
1188                    },
1189                    else => {
1190                        result.tag = .invalid;
1191                        break;
1192                    },
1193                },
1194                .int_literal_bin => switch (c) {
1195                    '_' => {
1196                        state = .int_literal_bin_no_underscore;
1197                    },
1198                    '0'...'1' => {},
1199                    '2'...'9', 'a'...'z', 'A'...'Z' => {
1200                        result.tag = .invalid;
1201                        break;
1202                    },
1203                    else => break,
1204                },
1205                .int_literal_oct_no_underscore => switch (c) {
1206                    '0'...'7' => {
1207                        state = .int_literal_oct;
1208                    },
1209                    else => {
1210                        result.tag = .invalid;
1211                        break;
1212                    },
1213                },
1214                .int_literal_oct => switch (c) {
1215                    '_' => {
1216                        state = .int_literal_oct_no_underscore;
1217                    },
1218                    '0'...'7' => {},
1219                    '8', '9', 'a'...'z', 'A'...'Z' => {
1220                        result.tag = .invalid;
1221                        break;
1222                    },
1223                    else => break,
1224                },
1225                .int_literal_dec_no_underscore => switch (c) {
1226                    '0'...'9' => {
1227                        state = .int_literal_dec;
1228                    },
1229                    else => {
1230                        result.tag = .invalid;
1231                        break;
1232                    },
1233                },
1234                .int_literal_dec => switch (c) {
1235                    '_' => {
1236                        state = .int_literal_dec_no_underscore;
1237                    },
1238                    '.' => {
1239                        state = .num_dot_dec;
1240                        result.tag = .invalid;
1241                    },
1242                    'e', 'E' => {
1243                        state = .float_exponent_unsigned;
1244                        result.tag = .float_literal;
1245                    },
1246                    '0'...'9' => {},
1247                    'a'...'d', 'f'...'z', 'A'...'D', 'F'...'Z' => {
1248                        result.tag = .invalid;
1249                        break;
1250                    },
1251                    else => break,
1252                },
1253                .int_literal_hex_no_underscore => switch (c) {
1254                    '0'...'9', 'a'...'f', 'A'...'F' => {
1255                        state = .int_literal_hex;
1256                    },
1257                    else => {
1258                        result.tag = .invalid;
1259                        break;
1260                    },
1261                },
1262                .int_literal_hex => switch (c) {
1263                    '_' => {
1264                        state = .int_literal_hex_no_underscore;
1265                    },
1266                    '.' => {
1267                        state = .num_dot_hex;
1268                        result.tag = .invalid;
1269                    },
1270                    'p', 'P' => {
1271                        state = .float_exponent_unsigned;
1272                        result.tag = .float_literal;
1273                    },
1274                    '0'...'9', 'a'...'f', 'A'...'F' => {},
1275                    'g'...'o', 'q'...'z', 'G'...'O', 'Q'...'Z' => {
1276                        result.tag = .invalid;
1277                        break;
1278                    },
1279                    else => break,
1280                },
1281                .num_dot_dec => switch (c) {
1282                    '.' => {
1283                        result.tag = .integer_literal;
1284                        self.index -= 1;
1285                        state = .start;
1286                        break;
1287                    },
1288                    '0'...'9' => {
1289                        result.tag = .float_literal;
1290                        state = .float_fraction_dec;
1291                    },
1292                    '_', 'a'...'z', 'A'...'Z' => {
1293                        result.tag = .invalid;
1294                        break;
1295                    },
1296                    else => break,
1297                },
1298                .num_dot_hex => switch (c) {
1299                    '.' => {
1300                        result.tag = .integer_literal;
1301                        self.index -= 1;
1302                        state = .start;
1303                        break;
1304                    },
1305                    '0'...'9', 'a'...'f', 'A'...'F' => {
1306                        result.tag = .float_literal;
1307                        state = .float_fraction_hex;
1308                    },
1309                    '_', 'g'...'z', 'G'...'Z' => {
1310                        result.tag = .invalid;
1311                        break;
1312                    },
1313                    else => break,
1314                },
1315                .float_fraction_dec_no_underscore => switch (c) {
1316                    '0'...'9' => {
1317                        state = .float_fraction_dec;
1318                    },
1319                    else => {
1320                        result.tag = .invalid;
1321                        break;
1322                    },
1323                },
1324                .float_fraction_dec => switch (c) {
1325                    '_' => {
1326                        state = .float_fraction_dec_no_underscore;
1327                    },
1328                    'e', 'E' => {
1329                        state = .float_exponent_unsigned;
1330                    },
1331                    '0'...'9' => {},
1332                    'a'...'d', 'f'...'z', 'A'...'D', 'F'...'Z' => {
1333                        result.tag = .invalid;
1334                        break;
1335                    },
1336                    else => break,
1337                },
1338                .float_fraction_hex_no_underscore => switch (c) {
1339                    '0'...'9', 'a'...'f', 'A'...'F' => {
1340                        state = .float_fraction_hex;
1341                    },
1342                    else => {
1343                        result.tag = .invalid;
1344                        break;
1345                    },
1346                },
1347                .float_fraction_hex => switch (c) {
1348                    '_' => {
1349                        state = .float_fraction_hex_no_underscore;
1350                    },
1351                    'p', 'P' => {
1352                        state = .float_exponent_unsigned;
1353                    },
1354                    '0'...'9', 'a'...'f', 'A'...'F' => {},
1355                    'g'...'o', 'q'...'z', 'G'...'O', 'Q'...'Z' => {
1356                        result.tag = .invalid;
1357                        break;
1358                    },
1359                    else => break,
1360                },
1361                .float_exponent_unsigned => switch (c) {
1362                    '+', '-' => {
1363                        state = .float_exponent_num_no_underscore;
1364                    },
1365                    else => {
1366                        // reinterpret as a normal exponent number
1367                        self.index -= 1;
1368                        state = .float_exponent_num_no_underscore;
1369                    },
1370                },
1371                .float_exponent_num_no_underscore => switch (c) {
1372                    '0'...'9' => {
1373                        state = .float_exponent_num;
1374                    },
1375                    else => {
1376                        result.tag = .invalid;
1377                        break;
1378                    },
1379                },
1380                .float_exponent_num => switch (c) {
1381                    '_' => {
1382                        state = .float_exponent_num_no_underscore;
1383                    },
1384                    '0'...'9' => {},
1385                    'a'...'z', 'A'...'Z' => {
1386                        result.tag = .invalid;
1387                        break;
1388                    },
1389                    else => break,
1390                },
1391            }
1392        }
1393
1394        if (result.tag == .eof) {
1395            if (self.pending_invalid_token) |token| {
1396                self.pending_invalid_token = null;
1397                return token;
1398            }
1399            result.loc.start = self.index;
1400        }
1401
1402        result.loc.end = self.index;
1403        return result;
1404    }
1405
1406    fn checkLiteralCharacter(self: *Tokenizer) void {
1407        if (self.pending_invalid_token != null) return;
1408        const invalid_length = self.getInvalidCharacterLength();
1409        if (invalid_length == 0) return;
1410        self.pending_invalid_token = .{
1411            .tag = .invalid,
1412            .loc = .{
1413                .start = self.index,
1414                .end = self.index + invalid_length,
1415            },
1416        };
1417    }
1418
1419    fn getInvalidCharacterLength(self: *Tokenizer) u3 {
1420        const c0 = self.buffer[self.index];
1421        if (c0 < 0x80) {
1422            if (c0 < 0x20 or c0 == 0x7f) {
1423                // ascii control codes are never allowed
1424                // (note that \n was checked before we got here)
1425                return 1;
1426            }
1427            // looks fine to me.
1428            return 0;
1429        } else {
1430            // check utf8-encoded character.
1431            const length = std.unicode.utf8ByteSequenceLength(c0) catch return 1;
1432            if (self.index + length > self.buffer.len) {
1433                return @intCast(u3, self.buffer.len - self.index);
1434            }
1435            const bytes = self.buffer[self.index .. self.index + length];
1436            switch (length) {
1437                2 => {
1438                    const value = std.unicode.utf8Decode2(bytes) catch return length;
1439                    if (value == 0x85) return length; // U+0085 (NEL)
1440                },
1441                3 => {
1442                    const value = std.unicode.utf8Decode3(bytes) catch return length;
1443                    if (value == 0x2028) return length; // U+2028 (LS)
1444                    if (value == 0x2029) return length; // U+2029 (PS)
1445                },
1446                4 => {
1447                    _ = std.unicode.utf8Decode4(bytes) catch return length;
1448                },
1449                else => unreachable,
1450            }
1451            self.index += length - 1;
1452            return 0;
1453        }
1454    }
1455};
1456
test "tokenizer" {
    // Smoke test: the lone word `test` must come back as its keyword tag.
    const source = "test";
    try testTokenize(source, &.{.keyword_test});
}
1460
test "line comment followed by top-level comptime" {
    // The `//` line yields no token of its own, so only the three tokens of
    // `comptime {}` are expected.
    const source = "// line comment\ncomptime {}\n";
    try testTokenize(source, &.{ .keyword_comptime, .l_brace, .r_brace });
}
1472
test "tokenizer - unknown length pointer and then c pointer" {
    const source = "[*]u8\n[*c]u8";
    try testTokenize(source, &.{
        // `[*]u8`
        .l_bracket,
        .asterisk,
        .r_bracket,
        .identifier,
        // `[*c]u8`: the `c` is tokenized as an ordinary identifier.
        .l_bracket,
        .asterisk,
        .identifier,
        .r_bracket,
        .identifier,
    });
}
1489
test "tokenizer - code point literal with hex escape" {
    // `'\x1b'`: two hex digits form a complete escape.
    try testTokenize("'\\x1b'", &.{.char_literal});
    // `'\x1'`: one hex digit only; expected to yield two invalid tokens.
    try testTokenize("'\\x1'", &.{ .invalid, .invalid });
}
1498
test "tokenizer - code point literal with unicode escapes" {
    // Well-formed `\u{...}` escapes. The number of digits is not validated by
    // the tokenizer, so an over-long value still forms a single literal.
    try testTokenize("'\\u{3}'", &.{.char_literal});
    try testTokenize("'\\u{01}'", &.{.char_literal});
    try testTokenize("'\\u{2a}'", &.{.char_literal});
    try testTokenize("'\\u{3f9}'", &.{.char_literal});
    try testTokenize("'\\u{6E09aBc1523}'", &.{.char_literal});
    try testTokenize("\"\\u{440}\"", &.{.string_literal});

    // Malformed escapes.
    try testTokenize("'\\u'", &.{.invalid});
    try testTokenize("'\\u{{'", &.{ .invalid, .invalid });
    // Empty braces are accepted at this level (digit count is checked later).
    try testTokenize("'\\u{}'", &.{.char_literal});
    try testTokenize("'\\u{s}'", &.{ .invalid, .invalid });
    try testTokenize("'\\u{2z}'", &.{ .invalid, .invalid });
    try testTokenize("'\\u{4a'", &.{.invalid});

    // Old-style escapes without braces.
    try testTokenize("'\\u0333'", &.{ .invalid, .invalid });
    try testTokenize("'\\U0333'", &.{ .invalid, .integer_literal, .invalid });
}
1548
test "tokenizer - code point literal with unicode code point" {
    // A char literal holds exactly one code point: after the lead byte the
    // tokenizer consumes the expected continuation bytes and then requires the
    // closing quote. The bytes below are the four-byte UTF-8 encoding of
    // U+1F4A9, written as escapes so the test survives encoding-unaware tools.
    try testTokenize("'\xf0\x9f\x92\xa9'", &.{.char_literal});
}
1554
test "tokenizer - float literal e exponent" {
    // Decimal float with a fraction and a negative `e` exponent.
    const source = "a = 4.94065645841246544177e-324;\n";
    try testTokenize(source, &.{ .identifier, .equal, .float_literal, .semicolon });
}
1563
test "tokenizer - float literal p exponent" {
    // Hexadecimal float with a fraction and a positive `p` exponent.
    const source = "a = 0x1.a827999fcef32p+1022;\n";
    try testTokenize(source, &.{ .identifier, .equal, .float_literal, .semicolon });
}
1572
test "tokenizer - chars" {
    // A single ASCII character between quotes.
    try testTokenize(
        \\'c'
    , &.{.char_literal});
}
1576
test "tokenizer - invalid token characters" {
    // Stray punctuation: each is expected as one invalid token.
    try testTokenize(
        \\#
    , &.{.invalid});
    try testTokenize(
        \\`
    , &.{.invalid});

    // Char literals that never close properly.
    try testTokenize(
        \\'c
    , &.{.invalid});
    try testTokenize(
        \\'
    , &.{.invalid});

    // An empty char literal is reported as two invalid tokens.
    try testTokenize(
        \\''
    , &.{ .invalid, .invalid });
}
1584
test "tokenizer - invalid literal/comment characters" {
    // A NUL byte between two double quotes (0x22): the string token is
    // expected first, followed by the invalid token.
    try testTokenize("\x22\x00\x22", &.{ .string_literal, .invalid });

    // ASCII control bytes inside a line comment.
    try testTokenize("//\x00", &.{.invalid});
    try testTokenize("//\x1f", &.{.invalid});
    try testTokenize("//\x7f", &.{.invalid});
}
1600
test "tokenizer - utf8" {
    // Well-formed multi-byte characters in a comment produce no tokens.
    const two_byte = "//\xc2\x80"; // U+0080
    const four_byte = "//\xf4\x8f\xbf\xbf"; // U+10FFFF
    try testTokenize(two_byte, &.{});
    try testTokenize(four_byte, &.{});
}
1605
test "tokenizer - invalid utf8" {
    // Each input is a `//` comment followed by a malformed UTF-8 sequence and
    // is expected to surface exactly one invalid token.

    // Continuation bytes with no lead byte before them.
    try testTokenize("//\x80", &.{.invalid});
    try testTokenize("//\xbf", &.{.invalid});

    // Bytes that can never start a sequence.
    try testTokenize("//\xf8", &.{.invalid});
    try testTokenize("//\xff", &.{.invalid});

    // Two-byte lead followed by a byte that is not a continuation byte.
    try testTokenize("//\xc2\xc0", &.{.invalid});

    // Three-byte and four-byte leads cut off by the end of the input.
    try testTokenize("//\xe0", &.{.invalid});
    try testTokenize("//\xf0", &.{.invalid});

    // Four-byte sequence whose final byte is not a continuation byte.
    try testTokenize("//\xf0\x90\x80\xc0", &.{.invalid});
}
1632
test "tokenizer - illegal unicode codepoints" {
    // The Unicode newline characters U+0085, U+2028 and U+2029 are rejected;
    // their immediate neighbours are accepted.

    // U+0084, U+0085 (NEL), U+0086
    try testTokenize("//\xc2\x84", &.{});
    try testTokenize("//\xc2\x85", &.{.invalid});
    try testTokenize("//\xc2\x86", &.{});

    // U+2027, U+2028 (LS), U+2029 (PS), U+202A
    try testTokenize("//\xe2\x80\xa7", &.{});
    try testTokenize("//\xe2\x80\xa8", &.{.invalid});
    try testTokenize("//\xe2\x80\xa9", &.{.invalid});
    try testTokenize("//\xe2\x80\xaa", &.{});
}
1649
test "tokenizer - string identifier and builtin fns" {
    // Input: const @"if" = @import("std");
    const source = "const @\"if\" = @import(\"std\");";
    try testTokenize(source, &.{
        .keyword_const,
        .identifier, // @"if"
        .equal,
        .builtin, // @import
        .l_paren,
        .string_literal,
        .r_paren,
        .semicolon,
    });
}
1664
test "tokenizer - multiline string literal with literal tab" {
    // Input is one multiline-string line, `\\foo<TAB>bar`; the tab is written
    // as an escape here so that it stays visible in this file.
    const source = "\\\\foo\tbar";
    try testTokenize(source, &.{.multiline_string_literal_line});
}
1672
test "tokenizer - comments with literal tab" {
    // Six comment lines, each containing a tab character (written as `\t`).
    // The two plain `//` lines (first and fourth) produce no token.
    const source = "//foo\tbar\n//!foo\tbar\n///foo\tbar\n//\tfoo\n///\tfoo\n///\t/foo";
    try testTokenize(source, &.{
        .container_doc_comment, // `//!foo<TAB>bar`
        .doc_comment, // `///foo<TAB>bar`
        .doc_comment, // `///<TAB>foo`
        .doc_comment, // `///<TAB>/foo`
    });
}
1688
test "tokenizer - pipe and then invalid" {
    // The second `|` completes a `pipe_pipe` token immediately, so the
    // trailing `=` is tokenized on its own.
    try testTokenize(
        \\||=
    , &.{ .pipe_pipe, .equal });
}
1695
test "tokenizer - line comment and doc comment" {
    // Ordinary line comments yield no tokens.
    try testTokenize(
        \\//
    , &.{});
    try testTokenize(
        \\// a / b
    , &.{});
    try testTokenize(
        \\// /
    , &.{});

    // Exactly three slashes start a doc comment, with or without text.
    try testTokenize(
        \\/// a
    , &.{.doc_comment});
    try testTokenize(
        \\///
    , &.{.doc_comment});

    // A fourth slash turns it back into an ordinary line comment.
    try testTokenize(
        \\////
    , &.{});

    // `//!` starts a container doc comment.
    try testTokenize(
        \\//!
    , &.{.container_doc_comment});
    try testTokenize(
        \\//!!
    , &.{.container_doc_comment});
}
1706
test "tokenizer - line comment followed by identifier" {
    // An indented line comment sits between two `Name,` lines and must not
    // contribute a token.
    const source = "    Unexpected,\n    // another\n    Another,";
    try testTokenize(source, &.{
        // `Unexpected,`
        .identifier,
        .comma,
        // `Another,`
        .identifier,
        .comma,
    });
}
1719
test "tokenizer - UTF-8 BOM is recognized and skipped" {
    // The input starts with the bytes EF BB BF; no token is expected for them.
    const source = "\xEF\xBB\xBFa;\n";
    try testTokenize(source, &.{ .identifier, .semicolon });
}
1726
test "correctly parse pointer assignment" {
    // In `b.*=3;` the `.*` must be its own token, with `=` following it.
    const source = "b.*=3;\n";
    try testTokenize(source, &.{
        .identifier, // b
        .period_asterisk, // .*
        .equal, // =
        .integer_literal, // 3
        .semicolon, // ;
    });
}
1736
test "correctly parse pointer dereference followed by asterisk" {
    // `.*` separated from `**` by a space.
    try testTokenize(
        \\"b".* ** 10
    , &.{
        .string_literal,
        .period_asterisk,
        .asterisk_asterisk,
        .integer_literal,
    });

    // `.*` separated from `**` by a closing parenthesis.
    try testTokenize(
        \\("b".*)** 10
    , &.{
        .l_paren,
        .string_literal,
        .period_asterisk,
        .r_paren,
        .asterisk_asterisk,
        .integer_literal,
    });

    // `.*` immediately followed by another `*` is flagged with a dedicated
    // tag; the remaining `**` is tokenized afterwards.
    try testTokenize(
        \\"b".*** 10
    , &.{
        .string_literal,
        .invalid_periodasterisks,
        .asterisk_asterisk,
        .integer_literal,
    });
}
1761
test "tokenizer - range literals" {
    // `...` directly after a literal must not be absorbed into the literal.

    // Decimal integers.
    try testTokenize(
        \\0...9
    , &.{ .integer_literal, .ellipsis3, .integer_literal });

    // Character literals.
    try testTokenize(
        \\'0'...'9'
    , &.{ .char_literal, .ellipsis3, .char_literal });

    // Hexadecimal, binary and octal integers.
    try testTokenize(
        \\0x00...0x09
    , &.{ .integer_literal, .ellipsis3, .integer_literal });
    try testTokenize(
        \\0b00...0b11
    , &.{ .integer_literal, .ellipsis3, .integer_literal });
    try testTokenize(
        \\0o00...0o11
    , &.{ .integer_literal, .ellipsis3, .integer_literal });
}
1769
test "tokenizer - number literals decimal" {
    // Expected tag sequences that are shared by several groups below.
    const int_tags: []const Token.Tag = &.{.integer_literal};
    const float_tags: []const Token.Tag = &.{.float_literal};
    const invalid_tags: []const Token.Tag = &.{.invalid};
    const invalid_identifier_tags: []const Token.Tag = &.{ .invalid, .identifier };
    const invalid_comma_tags: []const Token.Tag = &.{ .invalid, .comma };
    const invalid_plus_tags: []const Token.Tag = &.{ .invalid, .plus };

    // Every single decimal digit is an integer literal on its own.
    for ([_][:0]const u8{ "0", "1", "2", "3", "4", "5", "6", "7", "8", "9" }) |source| {
        try testTokenize(source, int_tags);
    }
    try testTokenize("1..", &.{ .integer_literal, .ellipsis2 });
    // Digits immediately followed by a letter: invalid token, then identifier.
    for ([_][:0]const u8{ "0a", "9b", "1z", "1z_1", "9z3" }) |source| {
        try testTokenize(source, invalid_identifier_tags);
    }

    // Leading zeros and single underscores between digits still form one
    // integer literal.
    for ([_][:0]const u8{
        "0_0",
        "0001",
        "01234567890",
        "012_345_6789_0",
        "0_1_2_3_4_5_6_7_8_9_0",
    }) |source| {
        try testTokenize(source, int_tags);
    }

    // Misplaced underscores and stray letters in integers.
    for ([_][:0]const u8{ "00_", "0_0_" }) |source| {
        try testTokenize(source, invalid_tags);
    }
    for ([_][:0]const u8{ "0__0", "0_0f", "0_0_f", "0_0_f_00" }) |source| {
        try testTokenize(source, invalid_identifier_tags);
    }
    try testTokenize("1_,", invalid_comma_tags);

    // Well-formed decimal floats: fraction, exponent, signed exponent, and
    // underscores between digits.
    for ([_][:0]const u8{
        "0.0",
        "1.0",
        "10.0",
        "0e0",
        "1e0",
        "1e100",
        "1.0e100",
        "1.0e+100",
        "1.0e-100",
        "1_0_0_0.0_0_0_0_0_1e1_0_0_0",
    }) |source| {
        try testTokenize(source, float_tags);
    }

    // Malformed decimal floats. The order of the checks matches the original
    // sequence of cases.
    for ([_][:0]const u8{ "1.", "1e" }) |source| {
        try testTokenize(source, invalid_tags);
    }
    for ([_][:0]const u8{ "1.e100", "1.0e1f0", "1.0p100" }) |source| {
        try testTokenize(source, invalid_identifier_tags);
    }
    try testTokenize("1.0p-100", &.{ .invalid, .identifier, .minus, .integer_literal });
    try testTokenize("1.0p1f0", invalid_identifier_tags);
    try testTokenize("1.0_,", invalid_comma_tags);
    try testTokenize("1_.0", &.{ .invalid, .period, .integer_literal });
    for ([_][:0]const u8{ "1._", "1.a", "1.z", "1._0" }) |source| {
        try testTokenize(source, invalid_identifier_tags);
    }
    try testTokenize("1.+", invalid_plus_tags);
    try testTokenize("1._+", &.{ .invalid, .identifier, .plus });
    try testTokenize("1._e", invalid_identifier_tags);
    try testTokenize("1.0e", invalid_tags);
    try testTokenize("1.0e,", invalid_comma_tags);
    for ([_][:0]const u8{ "1.0e_", "1.0e+_", "1.0e-_" }) |source| {
        try testTokenize(source, invalid_identifier_tags);
    }
    try testTokenize("1.0e0_+", invalid_plus_tags);
}
1836
test "tokenizer - number literals binary" {
    // Expected tag sequences that are shared by several groups below.
    const int_tags: []const Token.Tag = &.{.integer_literal};
    const invalid_tags: []const Token.Tag = &.{.invalid};
    const invalid_int_tags: []const Token.Tag = &.{ .invalid, .integer_literal };
    const invalid_identifier_tags: []const Token.Tag = &.{ .invalid, .identifier };

    // Only 0 and 1 are accepted after the `0b` prefix.
    for ([_][:0]const u8{ "0b0", "0b1" }) |source| {
        try testTokenize(source, int_tags);
    }
    // Any other decimal digit: invalid token followed by an integer literal.
    for ([_][:0]const u8{ "0b2", "0b3", "0b4", "0b5", "0b6", "0b7", "0b8", "0b9" }) |source| {
        try testTokenize(source, invalid_int_tags);
    }
    // A letter after the prefix: invalid token followed by an identifier.
    for ([_][:0]const u8{ "0ba", "0bb", "0bc", "0bd", "0be", "0bf", "0bz" }) |source| {
        try testTokenize(source, invalid_identifier_tags);
    }

    // Single underscores between digits are accepted.
    for ([_][:0]const u8{
        "0b0000_0000",
        "0b1111_1111",
        "0b10_10_10_10",
        "0b0_1_0_1_0_1_0_1",
    }) |source| {
        try testTokenize(source, int_tags);
    }
    // Binary literals have no fractional part; the period is its own token.
    try testTokenize("0b1.", &.{ .integer_literal, .period });
    try testTokenize("0b1.0", &.{ .integer_literal, .period, .integer_literal });

    // Upper-case prefix, misplaced underscores, and exponent-like suffixes.
    for ([_][:0]const u8{ "0B0", "0b_", "0b_0" }) |source| {
        try testTokenize(source, invalid_identifier_tags);
    }
    try testTokenize("0b1_", invalid_tags);
    try testTokenize("0b0__1", invalid_identifier_tags);
    try testTokenize("0b0_1_", invalid_tags);
    for ([_][:0]const u8{ "0b1e", "0b1p", "0b1e0", "0b1p0" }) |source| {
        try testTokenize(source, invalid_identifier_tags);
    }
    try testTokenize("0b1_,", &.{ .invalid, .comma });
}
1875
test "tokenizer - number literals octal" {
    // Expected tag sequences that are shared by several groups below.
    const int_tags: []const Token.Tag = &.{.integer_literal};
    const invalid_tags: []const Token.Tag = &.{.invalid};
    const invalid_int_tags: []const Token.Tag = &.{ .invalid, .integer_literal };
    const invalid_identifier_tags: []const Token.Tag = &.{ .invalid, .identifier };

    // Digits 0 through 7 are accepted after the `0o` prefix.
    for ([_][:0]const u8{ "0o0", "0o1", "0o2", "0o3", "0o4", "0o5", "0o6", "0o7" }) |source| {
        try testTokenize(source, int_tags);
    }
    // 8 and 9: invalid token followed by an integer literal.
    for ([_][:0]const u8{ "0o8", "0o9" }) |source| {
        try testTokenize(source, invalid_int_tags);
    }
    // A letter after the prefix: invalid token followed by an identifier.
    for ([_][:0]const u8{ "0oa", "0ob", "0oc", "0od", "0oe", "0of", "0oz" }) |source| {
        try testTokenize(source, invalid_identifier_tags);
    }

    // Single underscores between digits are accepted.
    for ([_][:0]const u8{
        "0o01234567",
        "0o0123_4567",
        "0o01_23_45_67",
        "0o0_1_2_3_4_5_6_7",
    }) |source| {
        try testTokenize(source, int_tags);
    }
    // Octal literals have no fractional part; the period is its own token.
    try testTokenize("0o7.", &.{ .integer_literal, .period });
    try testTokenize("0o7.0", &.{ .integer_literal, .period, .integer_literal });

    // Upper-case prefix, misplaced underscores, and exponent-like suffixes.
    for ([_][:0]const u8{ "0O0", "0o_", "0o_0" }) |source| {
        try testTokenize(source, invalid_identifier_tags);
    }
    try testTokenize("0o1_", invalid_tags);
    try testTokenize("0o0__1", invalid_identifier_tags);
    try testTokenize("0o0_1_", invalid_tags);
    for ([_][:0]const u8{ "0o1e", "0o1p", "0o1e0", "0o1p0" }) |source| {
        try testTokenize(source, invalid_identifier_tags);
    }
    try testTokenize("0o_,", &.{ .invalid, .identifier, .comma });
}
1914
test "tokenizer - number literals hexadecimal" {
    // Every single hex digit, in both letter cases, is accepted after `0x`.
    try testTokenize("0x0", &.{.integer_literal});
    try testTokenize("0x1", &.{.integer_literal});
    try testTokenize("0x2", &.{.integer_literal});
    try testTokenize("0x3", &.{.integer_literal});
    try testTokenize("0x4", &.{.integer_literal});
    try testTokenize("0x5", &.{.integer_literal});
    try testTokenize("0x6", &.{.integer_literal});
    try testTokenize("0x7", &.{.integer_literal});
    try testTokenize("0x8", &.{.integer_literal});
    try testTokenize("0x9", &.{.integer_literal});
    try testTokenize("0xa", &.{.integer_literal});
    try testTokenize("0xb", &.{.integer_literal});
    try testTokenize("0xc", &.{.integer_literal});
    try testTokenize("0xd", &.{.integer_literal});
    try testTokenize("0xe", &.{.integer_literal});
    try testTokenize("0xf", &.{.integer_literal});
    try testTokenize("0xA", &.{.integer_literal});
    try testTokenize("0xB", &.{.integer_literal});
    try testTokenize("0xC", &.{.integer_literal});
    try testTokenize("0xD", &.{.integer_literal});
    try testTokenize("0xE", &.{.integer_literal});
    try testTokenize("0xF", &.{.integer_literal});
    // A non-hex letter: invalid token followed by an identifier.
    try testTokenize("0x0z", &.{ .invalid, .identifier });
    try testTokenize("0xz", &.{ .invalid, .identifier });

    // Multi-digit integers, with and without single underscores.
    try testTokenize("0x0123456789ABCDEF", &.{.integer_literal});
    try testTokenize("0x0123_4567_89AB_CDEF", &.{.integer_literal});
    try testTokenize("0x01_23_45_67_89AB_CDE_F", &.{.integer_literal});
    try testTokenize("0x0_1_2_3_4_5_6_7_8_9_A_B_C_D_E_F", &.{.integer_literal});

    // Upper-case prefix and misplaced underscores.
    try testTokenize("0X0", &.{ .invalid, .identifier });
    try testTokenize("0x_", &.{ .invalid, .identifier });
    try testTokenize("0x_1", &.{ .invalid, .identifier });
    try testTokenize("0x1_", &.{.invalid});
    try testTokenize("0x0__1", &.{ .invalid, .identifier });
    try testTokenize("0x0_1_", &.{.invalid});
    try testTokenize("0x_,", &.{ .invalid, .identifier, .comma });

    // Hex floats: fractional part and/or `p`/`P` exponent.
    try testTokenize("0x1.0", &.{.float_literal});
    try testTokenize("0xF.0", &.{.float_literal});
    try testTokenize("0xF.F", &.{.float_literal});
    try testTokenize("0xF.Fp0", &.{.float_literal});
    try testTokenize("0xF.FP0", &.{.float_literal});
    try testTokenize("0x1p0", &.{.float_literal});
    try testTokenize("0xfp0", &.{.float_literal});
    try testTokenize("0x1.0+0xF.0", &.{ .float_literal, .plus, .float_literal });

    // A period with no fractional digits after it is invalid.
    try testTokenize("0x1.", &.{.invalid});
    try testTokenize("0xF.", &.{.invalid});
    try testTokenize("0x1.+0xF.", &.{ .invalid, .plus, .invalid });
    try testTokenize("0xff.p10", &.{ .invalid, .identifier });

    // Longer hex floats, with underscores and signed exponents.
    try testTokenize("0x0123456.789ABCDEF", &.{.float_literal});
    try testTokenize("0x0_123_456.789_ABC_DEF", &.{.float_literal});
    try testTokenize("0x0_1_2_3_4_5_6.7_8_9_A_B_C_D_E_F", &.{.float_literal});
    try testTokenize("0x0p0", &.{.float_literal});
    try testTokenize("0x0.0p0", &.{.float_literal});
    try testTokenize("0xff.ffp10", &.{.float_literal});
    try testTokenize("0xff.ffP10", &.{.float_literal});
    try testTokenize("0xffp10", &.{.float_literal});
    try testTokenize("0xff_ff.ff_ffp1_0_0_0", &.{.float_literal});
    try testTokenize("0xf_f_f_f.f_f_f_fp+1_000", &.{.float_literal});
    try testTokenize("0xf_f_f_f.f_f_f_fp-1_00_0", &.{.float_literal});

    // In hex, `e` is a digit, so these stay integer literals.
    try testTokenize("0x1e", &.{.integer_literal});
    try testTokenize("0x1e0", &.{.integer_literal});
    // Malformed exponents and misplaced underscores in hex floats.
    try testTokenize("0x1p", &.{.invalid});
    try testTokenize("0xfp0z1", &.{ .invalid, .identifier });
    try testTokenize("0xff.ffpff", &.{ .invalid, .identifier });
    try testTokenize("0x0.p", &.{ .invalid, .identifier });
    try testTokenize("0x0.z", &.{ .invalid, .identifier });
    try testTokenize("0x0._", &.{ .invalid, .identifier });
    try testTokenize("0x0_.0", &.{ .invalid, .period, .integer_literal });
    try testTokenize("0x0_.0.0", &.{ .invalid, .period, .float_literal });
    try testTokenize("0x0._0", &.{ .invalid, .identifier });
    try testTokenize("0x0.0_", &.{.invalid});
    try testTokenize("0x0_p0", &.{ .invalid, .identifier });
    try testTokenize("0x0_.p0", &.{ .invalid, .period, .identifier });
    try testTokenize("0x0._p0", &.{ .invalid, .identifier });
    try testTokenize("0x0.0_p0", &.{ .invalid, .identifier });
    try testTokenize("0x0._0p0", &.{ .invalid, .identifier });
    try testTokenize("0x0.0p_0", &.{ .invalid, .identifier });
    try testTokenize("0x0.0p+_0", &.{ .invalid, .identifier });
    try testTokenize("0x0.0p-_0", &.{ .invalid, .identifier });
    // The trailing eof is asserted by testTokenize itself, so it is not
    // listed among the expected tags (same as the "0x0.0_" case above).
    try testTokenize("0x0.0p0_", &.{.invalid});
}
2002
test "tokenizer - multi line string literal with only 1 backslash" {
    // A lone backslash (multi-line strings start with two) followed by a
    // newline is expected to yield an invalid token between `x` and `;`.
    const source = "x \\\n;";
    const expected_tags = [_]Token.Tag{ .identifier, .invalid, .semicolon };
    try testTokenize(source, &expected_tags);
}
2006
test "tokenizer - invalid builtin identifiers" {
    // `@` not followed by a valid builtin name is expected to yield an
    // invalid token; the remaining characters lex as ordinary tokens.
    const BuiltinCase = struct {
        source: [:0]const u8,
        tags: []const Token.Tag,
    };
    const builtin_cases = [_]BuiltinCase{
        .{ .source = "@()", .tags = &.{ .invalid, .l_paren, .r_paren } },
        .{ .source = "@0()", .tags = &.{ .invalid, .integer_literal, .l_paren, .r_paren } },
    };
    for (builtin_cases) |builtin_case| {
        try testTokenize(builtin_case.source, builtin_case.tags);
    }
}
2011
test "tokenizer - invalid token with unfinished escape right before eof" {
    // Each source ends in the middle of an escape sequence inside a string
    // or character literal; the whole input must come out as one invalid
    // token followed by eof.
    const truncated_sources = [_][:0]const u8{ "\"\\", "'\\", "'\\u" };
    for (truncated_sources) |source| {
        try testTokenize(source, &.{.invalid});
    }
}
2017
test "tokenizer - saturating" {
    // For each operator family: the base operator, its saturating `|` form,
    // and the saturating-assignment `|=` form. Each must lex as one token.
    const OperatorCase = struct {
        source: [:0]const u8,
        tag: Token.Tag,
    };
    const operator_cases = [_]OperatorCase{
        .{ .source = "<<", .tag = .angle_bracket_angle_bracket_left },
        .{ .source = "<<|", .tag = .angle_bracket_angle_bracket_left_pipe },
        .{ .source = "<<|=", .tag = .angle_bracket_angle_bracket_left_pipe_equal },

        .{ .source = "*", .tag = .asterisk },
        .{ .source = "*|", .tag = .asterisk_pipe },
        .{ .source = "*|=", .tag = .asterisk_pipe_equal },

        .{ .source = "+", .tag = .plus },
        .{ .source = "+|", .tag = .plus_pipe },
        .{ .source = "+|=", .tag = .plus_pipe_equal },

        .{ .source = "-", .tag = .minus },
        .{ .source = "-|", .tag = .minus_pipe },
        .{ .source = "-|=", .tag = .minus_pipe_equal },
    };
    for (operator_cases) |operator_case| {
        const expected_tags = [_]Token.Tag{operator_case.tag};
        try testTokenize(operator_case.source, &expected_tags);
    }
}
2035
/// Tokenizes `source` and checks that the resulting tags equal
/// `expected_tokens`, in order. After the expected tokens are consumed, the
/// next token must be `eof` and must start at `source.len`, so callers do not
/// need to list the trailing `eof` themselves.
///
/// Returns a `std.testing` error on the first mismatch.
fn testTokenize(source: [:0]const u8, expected_tokens: []const Token.Tag) !void {
    var tokenizer = Tokenizer.init(source);
    for (expected_tokens) |expected_token_id| {
        const token = tokenizer.next();
        // Report a mismatch as an ordinary test failure rather than
        // panicking, which would abort the whole test run.
        try std.testing.expectEqual(expected_token_id, token.tag);
    }
    const last_token = tokenizer.next();
    try std.testing.expectEqual(Token.Tag.eof, last_token.tag);
    try std.testing.expectEqual(source.len, last_token.loc.start);
}
2050