1 //! Low-level Rust lexer.
2 //!
3 //! The idea with `rustc_lexer` is to make a reusable library,
4 //! by separating out pure lexing and rustc-specific concerns, like spans,
5 //! error reporting, and interning.  So, rustc_lexer operates directly on `&str`,
6 //! produces simple tokens which are a pair of type-tag and a bit of original text,
7 //! and does not report errors, instead storing them as flags on the token.
8 //!
9 //! Tokens produced by this lexer are not yet ready for parsing the Rust syntax.
//! For that see [`rustc_parse::lexer`], which converts this basic token stream
//! into wide tokens used by the actual parser.
12 //!
13 //! The purpose of this crate is to convert raw sources into a labeled sequence
14 //! of well-known token types, so building an actual Rust token stream will
15 //! be easier.
16 //!
17 //! The main entity of this crate is the [`TokenKind`] enum which represents common
18 //! lexeme types.
19 //!
20 //! [`rustc_parse::lexer`]: ../rustc_parse/lexer/index.html
21 // We want to be able to build this crate with a stable compiler, so no
22 // `#![feature]` attributes should be added.
23 
24 mod cursor;
25 pub mod unescape;
26 
27 #[cfg(test)]
28 mod tests;
29 
30 use self::LiteralKind::*;
31 use self::TokenKind::*;
32 use crate::cursor::{Cursor, EOF_CHAR};
33 use std::convert::TryFrom;
34 
/// Parsed token.
/// It doesn't contain information about data that has been parsed,
/// only the type of the token and its size.
#[derive(Debug)]
pub struct Token {
    /// Syntactic category of the token.
    pub kind: TokenKind,
    /// Length of the token in bytes of the original source
    /// (callers advance their `&str` window by this amount).
    pub len: usize,
}
43 
44 impl Token {
new(kind: TokenKind, len: usize) -> Token45     fn new(kind: TokenKind, len: usize) -> Token {
46         Token { kind, len }
47     }
48 }
49 
/// Enum representing common lexeme types.
// perf note: Changing all `usize` to `u32` doesn't change performance. See #77629
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum TokenKind {
    // Multi-char tokens:
    /// "// comment"
    LineComment { doc_style: Option<DocStyle> },
    /// `/* block comment */`
    ///
    /// Block comments can be recursive, so the sequence like `/* /* */`
    /// will not be considered terminated and will result in a parsing error.
    BlockComment { doc_style: Option<DocStyle>, terminated: bool },
    /// Any whitespace characters sequence.
    Whitespace,
    /// "ident" or "continue"
    /// At this step keywords are also considered identifiers.
    Ident,
    /// Like the above, but containing invalid unicode codepoints.
    InvalidIdent,
    /// "r#ident"
    RawIdent,
    /// An unknown prefix like `foo#`, `foo'`, `foo"`. Note that only the
    /// prefix (`foo`) is included in the token, not the separator (which is
    /// lexed as its own distinct token). In Rust 2021 and later, reserved
    /// prefixes are reported as errors; in earlier editions, they result in a
    /// (allowed by default) lint, and are treated as regular identifier
    /// tokens.
    UnknownPrefix,
    /// "12_u8", "1.0e-40", "b"123"". See `LiteralKind` for more details.
    /// `suffix_start` is the byte offset within the token at which the
    /// (possibly empty) literal suffix begins, e.g. the `u8` in `12_u8`.
    Literal { kind: LiteralKind, suffix_start: usize },
    /// "'a"
    Lifetime { starts_with_number: bool },

    // One-char tokens:
    /// ";"
    Semi,
    /// ","
    Comma,
    /// "."
    Dot,
    /// "("
    OpenParen,
    /// ")"
    CloseParen,
    /// "{"
    OpenBrace,
    /// "}"
    CloseBrace,
    /// "["
    OpenBracket,
    /// "]"
    CloseBracket,
    /// "@"
    At,
    /// "#"
    Pound,
    /// "~"
    Tilde,
    /// "?"
    Question,
    /// ":"
    Colon,
    /// "$"
    Dollar,
    /// "="
    Eq,
    /// "!"
    Bang,
    /// "<"
    Lt,
    /// ">"
    Gt,
    /// "-"
    Minus,
    /// "&"
    And,
    /// "|"
    Or,
    /// "+"
    Plus,
    /// "*"
    Star,
    /// "/"
    Slash,
    /// "^"
    Caret,
    /// "%"
    Percent,

    /// Unknown token, not expected by the lexer, e.g. "№"
    Unknown,
}
142 
/// Denotes whether a comment is a doc comment, and which item it documents.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum DocStyle {
    /// `///` or `/**`: documents the item that follows the comment.
    Outer,
    /// `//!` or `/*!`: documents the item enclosing the comment.
    Inner,
}
148 
/// Enum representing the literal types supported by the lexer.
///
/// Note that the lexer only records shape and termination; digit validity
/// and escape validity are checked by consumers of the token stream.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum LiteralKind {
    /// "12_u8", "0o100", "0b120i99"
    Int { base: Base, empty_int: bool },
    /// "12.34f32", "0b100.100"
    Float { base: Base, empty_exponent: bool },
    /// "'a'", "'\\'", "'''", "';"
    Char { terminated: bool },
    /// "b'a'", "b'\\'", "b'''", "b';"
    Byte { terminated: bool },
    /// ""abc"", ""abc"
    Str { terminated: bool },
    /// "b"abc"", "b"abc"
    ByteStr { terminated: bool },
    /// "r"abc"", "r#"abc"#", "r####"ab"###"c"####", "r#"a"
    RawStr { n_hashes: u16, err: Option<RawStrError> },
    /// "br"abc"", "br#"abc"#", "br####"ab"###"c"####", "br#"a"
    RawByteStr { n_hashes: u16, err: Option<RawStrError> },
}
168 
/// Error produced validating a raw string. Represents cases like:
/// - `r##~"abcde"##`: `InvalidStarter`
/// - `r###"abcde"##`: `NoTerminator { expected: 3, found: 2, possible_terminator_offset: Some(11) }`
/// - Too many `#`s (>65535): `TooManyDelimiters`
// perf note: It doesn't matter that this makes `Token` 36 bytes bigger. See #77629
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum RawStrError {
    /// Non `#` characters exist between `r` and `"` eg. `r#~"..`
    InvalidStarter { bad_char: char },
    /// The string was never terminated. `possible_terminator_offset` is the number of characters after `r` or `br` where they
    /// may have intended to terminate it.
    NoTerminator { expected: usize, found: usize, possible_terminator_offset: Option<usize> },
    /// More than 65535 `#`s exist.
    TooManyDelimiters { found: usize },
}
184 
/// Base of numeric literal encoding according to its prefix.
#[derive(Clone, Copy, Debug, PartialEq, Eq, PartialOrd, Ord)]
pub enum Base {
    /// Literal starts with "0b" (base 2).
    Binary,
    /// Literal starts with "0o" (base 8).
    Octal,
    /// Literal starts with "0x" (base 16).
    Hexadecimal,
    /// Literal doesn't contain a prefix (base 10).
    Decimal,
}
197 
198 /// `rustc` allows files to have a shebang, e.g. "#!/usr/bin/rustrun",
199 /// but shebang isn't a part of rust syntax.
strip_shebang(input: &str) -> Option<usize>200 pub fn strip_shebang(input: &str) -> Option<usize> {
201     // Shebang must start with `#!` literally, without any preceding whitespace.
202     // For simplicity we consider any line starting with `#!` a shebang,
203     // regardless of restrictions put on shebangs by specific platforms.
204     if let Some(input_tail) = input.strip_prefix("#!") {
205         // Ok, this is a shebang but if the next non-whitespace token is `[`,
206         // then it may be valid Rust code, so consider it Rust code.
207         let next_non_whitespace_token = tokenize(input_tail).map(|tok| tok.kind).find(|tok| {
208             !matches!(
209                 tok,
210                 TokenKind::Whitespace
211                     | TokenKind::LineComment { doc_style: None }
212                     | TokenKind::BlockComment { doc_style: None, .. }
213             )
214         });
215         if next_non_whitespace_token != Some(TokenKind::OpenBracket) {
216             // No other choice than to consider this a shebang.
217             return Some(2 + input_tail.lines().next().unwrap_or_default().len());
218         }
219     }
220     None
221 }
222 
223 /// Parses the first token from the provided input string.
first_token(input: &str) -> Token224 pub fn first_token(input: &str) -> Token {
225     debug_assert!(!input.is_empty());
226     Cursor::new(input).advance_token()
227 }
228 
229 /// Creates an iterator that produces tokens from the input string.
tokenize(mut input: &str) -> impl Iterator<Item = Token> + '_230 pub fn tokenize(mut input: &str) -> impl Iterator<Item = Token> + '_ {
231     std::iter::from_fn(move || {
232         if input.is_empty() {
233             return None;
234         }
235         let token = first_token(input);
236         input = &input[token.len..];
237         Some(token)
238     })
239 }
240 
/// True if `c` is considered a whitespace according to Rust language definition.
/// See [Rust language reference](https://doc.rust-lang.org/reference/whitespace.html)
/// for definitions of these classes.
pub fn is_whitespace(c: char) -> bool {
    // This is Pattern_White_Space.
    //
    // Note that this set is stable (ie, it doesn't change with different
    // Unicode versions), so it's ok to just hard-code the values.

    matches!(
        c,
        // Usual ASCII suspects, written with escapes where Rust has them
        '\t'         // horizontal tab
        | '\n'       // line feed
        | '\u{000B}' // vertical tab
        | '\u{000C}' // form feed
        | '\r'       // carriage return
        | ' '        // space

        // NEXT LINE from latin1
        | '\u{0085}'

        // Bidi markers
        | '\u{200E}' // LEFT-TO-RIGHT MARK
        | '\u{200F}' // RIGHT-TO-LEFT MARK

        // Dedicated whitespace characters from Unicode
        | '\u{2028}' // LINE SEPARATOR
        | '\u{2029}' // PARAGRAPH SEPARATOR
    )
}
272 
273 /// True if `c` is valid as a first character of an identifier.
274 /// See [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html) for
275 /// a formal definition of valid identifier name.
is_id_start(c: char) -> bool276 pub fn is_id_start(c: char) -> bool {
277     // This is XID_Start OR '_' (which formally is not a XID_Start).
278     c == '_' || unicode_xid::UnicodeXID::is_xid_start(c)
279 }
280 
/// True if `c` is valid as a non-first character of an identifier.
/// See [Rust language reference](https://doc.rust-lang.org/reference/identifiers.html) for
/// a formal definition of valid identifier name.
///
/// Unlike [`is_id_start`], this is exactly XID_Continue with no extra cases.
pub fn is_id_continue(c: char) -> bool {
    unicode_xid::UnicodeXID::is_xid_continue(c)
}
287 
288 /// The passed string is lexically an identifier.
is_ident(string: &str) -> bool289 pub fn is_ident(string: &str) -> bool {
290     let mut chars = string.chars();
291     if let Some(start) = chars.next() {
292         is_id_start(start) && chars.all(is_id_continue)
293     } else {
294         false
295     }
296 }
297 
298 impl Cursor<'_> {
    /// Parses a token from the input string.
    ///
    /// Precondition: there is input left — the initial `bump` below would
    /// panic on empty input (`first_token` asserts this before calling).
    ///
    /// Dispatches on the first character; note that arm order matters:
    /// `r`/`b` prefixes must be tried before the generic identifier arm.
    fn advance_token(&mut self) -> Token {
        let first_char = self.bump().unwrap();
        let token_kind = match first_char {
            // Slash, comment or block comment.
            '/' => match self.first() {
                '/' => self.line_comment(),
                '*' => self.block_comment(),
                _ => Slash,
            },

            // Whitespace sequence.
            c if is_whitespace(c) => self.whitespace(),

            // Raw identifier, raw string literal or identifier.
            'r' => match (self.first(), self.second()) {
                ('#', c1) if is_id_start(c1) => self.raw_ident(),
                ('#', _) | ('"', _) => {
                    let (n_hashes, err) = self.raw_double_quoted_string(1);
                    // The suffix starts right after the string body; it is
                    // only consumed for well-formed raw strings.
                    let suffix_start = self.len_consumed();
                    if err.is_none() {
                        self.eat_literal_suffix();
                    }
                    let kind = RawStr { n_hashes, err };
                    Literal { kind, suffix_start }
                }
                _ => self.ident_or_unknown_prefix(),
            },

            // Byte literal, byte string literal, raw byte string literal or identifier.
            'b' => match (self.first(), self.second()) {
                ('\'', _) => {
                    self.bump();
                    let terminated = self.single_quoted_string();
                    let suffix_start = self.len_consumed();
                    if terminated {
                        self.eat_literal_suffix();
                    }
                    let kind = Byte { terminated };
                    Literal { kind, suffix_start }
                }
                ('"', _) => {
                    self.bump();
                    let terminated = self.double_quoted_string();
                    let suffix_start = self.len_consumed();
                    if terminated {
                        self.eat_literal_suffix();
                    }
                    let kind = ByteStr { terminated };
                    Literal { kind, suffix_start }
                }
                ('r', '"') | ('r', '#') => {
                    self.bump();
                    let (n_hashes, err) = self.raw_double_quoted_string(2);
                    let suffix_start = self.len_consumed();
                    if err.is_none() {
                        self.eat_literal_suffix();
                    }
                    let kind = RawByteStr { n_hashes, err };
                    Literal { kind, suffix_start }
                }
                _ => self.ident_or_unknown_prefix(),
            },

            // Identifier (this should be checked after other variant that can
            // start as identifier).
            c if is_id_start(c) => self.ident_or_unknown_prefix(),

            // Numeric literal.
            c @ '0'..='9' => {
                let literal_kind = self.number(c);
                let suffix_start = self.len_consumed();
                self.eat_literal_suffix();
                TokenKind::Literal { kind: literal_kind, suffix_start }
            }

            // One-symbol tokens.
            ';' => Semi,
            ',' => Comma,
            '.' => Dot,
            '(' => OpenParen,
            ')' => CloseParen,
            '{' => OpenBrace,
            '}' => CloseBrace,
            '[' => OpenBracket,
            ']' => CloseBracket,
            '@' => At,
            '#' => Pound,
            '~' => Tilde,
            '?' => Question,
            ':' => Colon,
            '$' => Dollar,
            '=' => Eq,
            '!' => Bang,
            '<' => Lt,
            '>' => Gt,
            '-' => Minus,
            '&' => And,
            '|' => Or,
            '+' => Plus,
            '*' => Star,
            '^' => Caret,
            '%' => Percent,

            // Lifetime or character literal.
            '\'' => self.lifetime_or_char(),

            // String literal.
            '"' => {
                let terminated = self.double_quoted_string();
                let suffix_start = self.len_consumed();
                if terminated {
                    self.eat_literal_suffix();
                }
                let kind = Str { terminated };
                Literal { kind, suffix_start }
            }
            // Identifier starting with an emoji. Only lexed for graceful error recovery.
            c if !c.is_ascii() && unic_emoji_char::is_emoji(c) => {
                self.fake_ident_or_unknown_prefix()
            }
            _ => Unknown,
        };
        Token::new(token_kind, self.len_consumed())
    }
424 
line_comment(&mut self) -> TokenKind425     fn line_comment(&mut self) -> TokenKind {
426         debug_assert!(self.prev() == '/' && self.first() == '/');
427         self.bump();
428 
429         let doc_style = match self.first() {
430             // `//!` is an inner line doc comment.
431             '!' => Some(DocStyle::Inner),
432             // `////` (more than 3 slashes) is not considered a doc comment.
433             '/' if self.second() != '/' => Some(DocStyle::Outer),
434             _ => None,
435         };
436 
437         self.eat_while(|c| c != '\n');
438         LineComment { doc_style }
439     }
440 
block_comment(&mut self) -> TokenKind441     fn block_comment(&mut self) -> TokenKind {
442         debug_assert!(self.prev() == '/' && self.first() == '*');
443         self.bump();
444 
445         let doc_style = match self.first() {
446             // `/*!` is an inner block doc comment.
447             '!' => Some(DocStyle::Inner),
448             // `/***` (more than 2 stars) is not considered a doc comment.
449             // `/**/` is not considered a doc comment.
450             '*' if !matches!(self.second(), '*' | '/') => Some(DocStyle::Outer),
451             _ => None,
452         };
453 
454         let mut depth = 1usize;
455         while let Some(c) = self.bump() {
456             match c {
457                 '/' if self.first() == '*' => {
458                     self.bump();
459                     depth += 1;
460                 }
461                 '*' if self.first() == '/' => {
462                     self.bump();
463                     depth -= 1;
464                     if depth == 0 {
465                         // This block comment is closed, so for a construction like "/* */ */"
466                         // there will be a successfully parsed block comment "/* */"
467                         // and " */" will be processed separately.
468                         break;
469                     }
470                 }
471                 _ => (),
472             }
473         }
474 
475         BlockComment { doc_style, terminated: depth == 0 }
476     }
477 
whitespace(&mut self) -> TokenKind478     fn whitespace(&mut self) -> TokenKind {
479         debug_assert!(is_whitespace(self.prev()));
480         self.eat_while(is_whitespace);
481         Whitespace
482     }
483 
raw_ident(&mut self) -> TokenKind484     fn raw_ident(&mut self) -> TokenKind {
485         debug_assert!(self.prev() == 'r' && self.first() == '#' && is_id_start(self.second()));
486         // Eat "#" symbol.
487         self.bump();
488         // Eat the identifier part of RawIdent.
489         self.eat_identifier();
490         RawIdent
491     }
492 
ident_or_unknown_prefix(&mut self) -> TokenKind493     fn ident_or_unknown_prefix(&mut self) -> TokenKind {
494         debug_assert!(is_id_start(self.prev()));
495         // Start is already eaten, eat the rest of identifier.
496         self.eat_while(is_id_continue);
497         // Known prefixes must have been handled earlier. So if
498         // we see a prefix here, it is definitely an unknown prefix.
499         match self.first() {
500             '#' | '"' | '\'' => UnknownPrefix,
501             c if !c.is_ascii() && unic_emoji_char::is_emoji(c) => {
502                 self.fake_ident_or_unknown_prefix()
503             }
504             _ => Ident,
505         }
506     }
507 
fake_ident_or_unknown_prefix(&mut self) -> TokenKind508     fn fake_ident_or_unknown_prefix(&mut self) -> TokenKind {
509         // Start is already eaten, eat the rest of identifier.
510         self.eat_while(|c| {
511             unicode_xid::UnicodeXID::is_xid_continue(c)
512                 || (!c.is_ascii() && unic_emoji_char::is_emoji(c))
513                 || c == '\u{200d}'
514         });
515         // Known prefixes must have been handled earlier. So if
516         // we see a prefix here, it is definitely an unknown prefix.
517         match self.first() {
518             '#' | '"' | '\'' => UnknownPrefix,
519             _ => InvalidIdent,
520         }
521     }
522 
number(&mut self, first_digit: char) -> LiteralKind523     fn number(&mut self, first_digit: char) -> LiteralKind {
524         debug_assert!('0' <= self.prev() && self.prev() <= '9');
525         let mut base = Base::Decimal;
526         if first_digit == '0' {
527             // Attempt to parse encoding base.
528             let has_digits = match self.first() {
529                 'b' => {
530                     base = Base::Binary;
531                     self.bump();
532                     self.eat_decimal_digits()
533                 }
534                 'o' => {
535                     base = Base::Octal;
536                     self.bump();
537                     self.eat_decimal_digits()
538                 }
539                 'x' => {
540                     base = Base::Hexadecimal;
541                     self.bump();
542                     self.eat_hexadecimal_digits()
543                 }
544                 // Not a base prefix.
545                 '0'..='9' | '_' | '.' | 'e' | 'E' => {
546                     self.eat_decimal_digits();
547                     true
548                 }
549                 // Just a 0.
550                 _ => return Int { base, empty_int: false },
551             };
552             // Base prefix was provided, but there were no digits
553             // after it, e.g. "0x".
554             if !has_digits {
555                 return Int { base, empty_int: true };
556             }
557         } else {
558             // No base prefix, parse number in the usual way.
559             self.eat_decimal_digits();
560         };
561 
562         match self.first() {
563             // Don't be greedy if this is actually an
564             // integer literal followed by field/method access or a range pattern
565             // (`0..2` and `12.foo()`)
566             '.' if self.second() != '.' && !is_id_start(self.second()) => {
567                 // might have stuff after the ., and if it does, it needs to start
568                 // with a number
569                 self.bump();
570                 let mut empty_exponent = false;
571                 if self.first().is_digit(10) {
572                     self.eat_decimal_digits();
573                     match self.first() {
574                         'e' | 'E' => {
575                             self.bump();
576                             empty_exponent = !self.eat_float_exponent();
577                         }
578                         _ => (),
579                     }
580                 }
581                 Float { base, empty_exponent }
582             }
583             'e' | 'E' => {
584                 self.bump();
585                 let empty_exponent = !self.eat_float_exponent();
586                 Float { base, empty_exponent }
587             }
588             _ => Int { base, empty_int: false },
589         }
590     }
591 
lifetime_or_char(&mut self) -> TokenKind592     fn lifetime_or_char(&mut self) -> TokenKind {
593         debug_assert!(self.prev() == '\'');
594 
595         let can_be_a_lifetime = if self.second() == '\'' {
596             // It's surely not a lifetime.
597             false
598         } else {
599             // If the first symbol is valid for identifier, it can be a lifetime.
600             // Also check if it's a number for a better error reporting (so '0 will
601             // be reported as invalid lifetime and not as unterminated char literal).
602             is_id_start(self.first()) || self.first().is_digit(10)
603         };
604 
605         if !can_be_a_lifetime {
606             let terminated = self.single_quoted_string();
607             let suffix_start = self.len_consumed();
608             if terminated {
609                 self.eat_literal_suffix();
610             }
611             let kind = Char { terminated };
612             return Literal { kind, suffix_start };
613         }
614 
615         // Either a lifetime or a character literal with
616         // length greater than 1.
617 
618         let starts_with_number = self.first().is_digit(10);
619 
620         // Skip the literal contents.
621         // First symbol can be a number (which isn't a valid identifier start),
622         // so skip it without any checks.
623         self.bump();
624         self.eat_while(is_id_continue);
625 
626         // Check if after skipping literal contents we've met a closing
627         // single quote (which means that user attempted to create a
628         // string with single quotes).
629         if self.first() == '\'' {
630             self.bump();
631             let kind = Char { terminated: true };
632             Literal { kind, suffix_start: self.len_consumed() }
633         } else {
634             Lifetime { starts_with_number }
635         }
636     }
637 
single_quoted_string(&mut self) -> bool638     fn single_quoted_string(&mut self) -> bool {
639         debug_assert!(self.prev() == '\'');
640         // Check if it's a one-symbol literal.
641         if self.second() == '\'' && self.first() != '\\' {
642             self.bump();
643             self.bump();
644             return true;
645         }
646 
647         // Literal has more than one symbol.
648 
649         // Parse until either quotes are terminated or error is detected.
650         loop {
651             match self.first() {
652                 // Quotes are terminated, finish parsing.
653                 '\'' => {
654                     self.bump();
655                     return true;
656                 }
657                 // Probably beginning of the comment, which we don't want to include
658                 // to the error report.
659                 '/' => break,
660                 // Newline without following '\'' means unclosed quote, stop parsing.
661                 '\n' if self.second() != '\'' => break,
662                 // End of file, stop parsing.
663                 EOF_CHAR if self.is_eof() => break,
664                 // Escaped slash is considered one character, so bump twice.
665                 '\\' => {
666                     self.bump();
667                     self.bump();
668                 }
669                 // Skip the character.
670                 _ => {
671                     self.bump();
672                 }
673             }
674         }
675         // String was not terminated.
676         false
677     }
678 
679     /// Eats double-quoted string and returns true
680     /// if string is terminated.
double_quoted_string(&mut self) -> bool681     fn double_quoted_string(&mut self) -> bool {
682         debug_assert!(self.prev() == '"');
683         while let Some(c) = self.bump() {
684             match c {
685                 '"' => {
686                     return true;
687                 }
688                 '\\' if self.first() == '\\' || self.first() == '"' => {
689                     // Bump again to skip escaped character.
690                     self.bump();
691                 }
692                 _ => (),
693             }
694         }
695         // End of file reached.
696         false
697     }
698 
699     /// Eats the double-quoted string and returns `n_hashes` and an error if encountered.
raw_double_quoted_string(&mut self, prefix_len: usize) -> (u16, Option<RawStrError>)700     fn raw_double_quoted_string(&mut self, prefix_len: usize) -> (u16, Option<RawStrError>) {
701         // Wrap the actual function to handle the error with too many hashes.
702         // This way, it eats the whole raw string.
703         let (n_hashes, err) = self.raw_string_unvalidated(prefix_len);
704         // Only up to 65535 `#`s are allowed in raw strings
705         match u16::try_from(n_hashes) {
706             Ok(num) => (num, err),
707             // We lie about the number of hashes here :P
708             Err(_) => (0, Some(RawStrError::TooManyDelimiters { found: n_hashes })),
709         }
710     }
711 
raw_string_unvalidated(&mut self, prefix_len: usize) -> (usize, Option<RawStrError>)712     fn raw_string_unvalidated(&mut self, prefix_len: usize) -> (usize, Option<RawStrError>) {
713         debug_assert!(self.prev() == 'r');
714         let start_pos = self.len_consumed();
715         let mut possible_terminator_offset = None;
716         let mut max_hashes = 0;
717 
718         // Count opening '#' symbols.
719         let mut eaten = 0;
720         while self.first() == '#' {
721             eaten += 1;
722             self.bump();
723         }
724         let n_start_hashes = eaten;
725 
726         // Check that string is started.
727         match self.bump() {
728             Some('"') => (),
729             c => {
730                 let c = c.unwrap_or(EOF_CHAR);
731                 return (n_start_hashes, Some(RawStrError::InvalidStarter { bad_char: c }));
732             }
733         }
734 
735         // Skip the string contents and on each '#' character met, check if this is
736         // a raw string termination.
737         loop {
738             self.eat_while(|c| c != '"');
739 
740             if self.is_eof() {
741                 return (
742                     n_start_hashes,
743                     Some(RawStrError::NoTerminator {
744                         expected: n_start_hashes,
745                         found: max_hashes,
746                         possible_terminator_offset,
747                     }),
748                 );
749             }
750 
751             // Eat closing double quote.
752             self.bump();
753 
754             // Check that amount of closing '#' symbols
755             // is equal to the amount of opening ones.
756             // Note that this will not consume extra trailing `#` characters:
757             // `r###"abcde"####` is lexed as a `RawStr { n_hashes: 3 }`
758             // followed by a `#` token.
759             let mut n_end_hashes = 0;
760             while self.first() == '#' && n_end_hashes < n_start_hashes {
761                 n_end_hashes += 1;
762                 self.bump();
763             }
764 
765             if n_end_hashes == n_start_hashes {
766                 return (n_start_hashes, None);
767             } else if n_end_hashes > max_hashes {
768                 // Keep track of possible terminators to give a hint about
769                 // where there might be a missing terminator
770                 possible_terminator_offset =
771                     Some(self.len_consumed() - start_pos - n_end_hashes + prefix_len);
772                 max_hashes = n_end_hashes;
773             }
774         }
775     }
776 
eat_decimal_digits(&mut self) -> bool777     fn eat_decimal_digits(&mut self) -> bool {
778         let mut has_digits = false;
779         loop {
780             match self.first() {
781                 '_' => {
782                     self.bump();
783                 }
784                 '0'..='9' => {
785                     has_digits = true;
786                     self.bump();
787                 }
788                 _ => break,
789             }
790         }
791         has_digits
792     }
793 
eat_hexadecimal_digits(&mut self) -> bool794     fn eat_hexadecimal_digits(&mut self) -> bool {
795         let mut has_digits = false;
796         loop {
797             match self.first() {
798                 '_' => {
799                     self.bump();
800                 }
801                 '0'..='9' | 'a'..='f' | 'A'..='F' => {
802                     has_digits = true;
803                     self.bump();
804                 }
805                 _ => break,
806             }
807         }
808         has_digits
809     }
810 
811     /// Eats the float exponent. Returns true if at least one digit was met,
812     /// and returns false otherwise.
eat_float_exponent(&mut self) -> bool813     fn eat_float_exponent(&mut self) -> bool {
814         debug_assert!(self.prev() == 'e' || self.prev() == 'E');
815         if self.first() == '-' || self.first() == '+' {
816             self.bump();
817         }
818         self.eat_decimal_digits()
819     }
820 
    // Eats the suffix of the literal, e.g. "_u8".
    // A suffix has the same lexical shape as an identifier, so this simply
    // delegates to `eat_identifier` (which is a no-op if no suffix follows).
    fn eat_literal_suffix(&mut self) {
        self.eat_identifier();
    }
825 
826     // Eats the identifier.
eat_identifier(&mut self)827     fn eat_identifier(&mut self) {
828         if !is_id_start(self.first()) {
829             return;
830         }
831         self.bump();
832 
833         self.eat_while(is_id_continue);
834     }
835 
836     /// Eats symbols while predicate returns true or until the end of file is reached.
eat_while(&mut self, mut predicate: impl FnMut(char) -> bool)837     fn eat_while(&mut self, mut predicate: impl FnMut(char) -> bool) {
838         while predicate(self.first()) && !self.is_eof() {
839             self.bump();
840         }
841     }
842 }
843