/* This Source Code Form is subject to the terms of the Mozilla Public
 * License, v. 2.0. If a copy of the MPL was not distributed with this
 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */

// https://drafts.csswg.org/css-syntax/#tokenization

use self::Token::*;
use crate::cow_rc_str::CowRcStr;
use crate::parser::ParserState;
use matches::matches;
use std::char;
use std::i32;
use std::ops::Range;

/// One of the pieces the CSS input is broken into.
///
/// Some components use `Cow` in order to borrow from the original input string
/// and avoid allocating/copying when possible.
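///
/// An illustrative sketch of the tokens produced for `"margin: 12px"`
/// (marked `ignore` so it is not run as a doctest):
///
/// ```ignore
/// let mut tokenizer = Tokenizer::new("margin: 12px");
/// assert_eq!(tokenizer.next(), Ok(Token::Ident("margin".into())));
/// assert_eq!(tokenizer.next(), Ok(Token::Colon));
/// assert_eq!(tokenizer.next(), Ok(Token::WhiteSpace(" ")));
/// assert_eq!(tokenizer.next(), Ok(Token::Dimension {
///     has_sign: false,
///     value: 12.0,
///     int_value: Some(12),
///     unit: "px".into(),
/// }));
/// ```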
#[derive(PartialEq, Debug, Clone)]
pub enum Token<'a> {
    /// A [`<ident-token>`](https://drafts.csswg.org/css-syntax/#ident-token-diagram)
    Ident(CowRcStr<'a>),

    /// A [`<at-keyword-token>`](https://drafts.csswg.org/css-syntax/#at-keyword-token-diagram)
    ///
    /// The value does not include the `@` marker.
    AtKeyword(CowRcStr<'a>),

    /// A [`<hash-token>`](https://drafts.csswg.org/css-syntax/#hash-token-diagram) with the type flag set to "unrestricted"
    ///
    /// The value does not include the `#` marker.
    Hash(CowRcStr<'a>),

    /// A [`<hash-token>`](https://drafts.csswg.org/css-syntax/#hash-token-diagram) with the type flag set to "id"
    ///
    /// The value does not include the `#` marker.
    IDHash(CowRcStr<'a>), // Hash that is a valid ID selector.

    /// A [`<string-token>`](https://drafts.csswg.org/css-syntax/#string-token-diagram)
    ///
    /// The value does not include the quotes.
    QuotedString(CowRcStr<'a>),

    /// A [`<url-token>`](https://drafts.csswg.org/css-syntax/#url-token-diagram)
    ///
    /// The value does not include the `url(` `)` markers.  Note that `url( <string-token> )` is represented by a
    /// `Function` token.
    UnquotedUrl(CowRcStr<'a>),

    /// A `<delim-token>`
    Delim(char),

    /// A [`<number-token>`](https://drafts.csswg.org/css-syntax/#number-token-diagram)
    Number {
        /// Whether the number had a `+` or `-` sign.
        ///
        /// This is used in some cases, like the `<An+B>` micro syntax. (See the `parse_nth` function.)
        has_sign: bool,

        /// The value as a float
        value: f32,

        /// If the original source did not include a fractional part, the value as an integer.
        int_value: Option<i32>,
    },

    /// A [`<percentage-token>`](https://drafts.csswg.org/css-syntax/#percentage-token-diagram)
    Percentage {
        /// Whether the number had a `+` or `-` sign.
        has_sign: bool,

        /// The value as a float, divided by 100 so that the nominal range is 0.0 to 1.0.
        unit_value: f32,

        /// If the original source did not include a fractional part, the value as an integer.
        /// It is **not** divided by 100.
        int_value: Option<i32>,
    },

    /// A [`<dimension-token>`](https://drafts.csswg.org/css-syntax/#dimension-token-diagram)
    Dimension {
        /// Whether the number had a `+` or `-` sign.
        ///
        /// This is used in some cases, like the `<An+B>` micro syntax. (See the `parse_nth` function.)
        has_sign: bool,

        /// The value as a float
        value: f32,

        /// If the original source did not include a fractional part, the value as an integer.
        int_value: Option<i32>,

        /// The unit, e.g. "px" in `12px`
        unit: CowRcStr<'a>,
    },

    /// A [`<whitespace-token>`](https://drafts.csswg.org/css-syntax/#whitespace-token-diagram)
    WhiteSpace(&'a str),

    /// A comment.
    ///
    /// The CSS Syntax spec does not generate tokens for comments,
    /// but we do, because we can (borrowed `&str` makes it cheap).
    ///
    /// The value does not include the `/*` `*/` markers.
    Comment(&'a str),

    /// A `:` `<colon-token>`
    Colon, // :

    /// A `;` `<semicolon-token>`
    Semicolon, // ;

    /// A `,` `<comma-token>`
    Comma, // ,

    /// A `~=` [`<include-match-token>`](https://drafts.csswg.org/css-syntax/#include-match-token-diagram)
    IncludeMatch,

    /// A `|=` [`<dash-match-token>`](https://drafts.csswg.org/css-syntax/#dash-match-token-diagram)
    DashMatch,

    /// A `^=` [`<prefix-match-token>`](https://drafts.csswg.org/css-syntax/#prefix-match-token-diagram)
    PrefixMatch,

    /// A `$=` [`<suffix-match-token>`](https://drafts.csswg.org/css-syntax/#suffix-match-token-diagram)
    SuffixMatch,

    /// A `*=` [`<substring-match-token>`](https://drafts.csswg.org/css-syntax/#substring-match-token-diagram)
    SubstringMatch,

    /// A `<!--` [`<CDO-token>`](https://drafts.csswg.org/css-syntax/#CDO-token-diagram)
    CDO,

    /// A `-->` [`<CDC-token>`](https://drafts.csswg.org/css-syntax/#CDC-token-diagram)
    CDC,

    /// A [`<function-token>`](https://drafts.csswg.org/css-syntax/#function-token-diagram)
    ///
    /// The value (name) does not include the `(` marker.
    Function(CowRcStr<'a>),

    /// A `<(-token>`
    ParenthesisBlock,

    /// A `<[-token>`
    SquareBracketBlock,

    /// A `<{-token>`
    CurlyBracketBlock,

    /// A `<bad-url-token>`
    ///
    /// This token always indicates a parse error.
    BadUrl(CowRcStr<'a>),

    /// A `<bad-string-token>`
    ///
    /// This token always indicates a parse error.
    BadString(CowRcStr<'a>),

    /// A `<)-token>`
    ///
    /// When obtained from one of the `Parser::next*` methods,
    /// this token is always unmatched and indicates a parse error.
    CloseParenthesis,

    /// A `<]-token>`
    ///
    /// When obtained from one of the `Parser::next*` methods,
    /// this token is always unmatched and indicates a parse error.
    CloseSquareBracket,

    /// A `<}-token>`
    ///
    /// When obtained from one of the `Parser::next*` methods,
    /// this token is always unmatched and indicates a parse error.
    CloseCurlyBracket,
}

impl<'a> Token<'a> {
    /// Return whether this token represents a parse error.
    ///
    /// `BadUrl` and `BadString` are tokenizer-level parse errors.
    ///
    /// `CloseParenthesis`, `CloseSquareBracket`, and `CloseCurlyBracket` are *unmatched*
    /// and therefore parse errors when returned by one of the `Parser::next*` methods.
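    ///
    /// A minimal sketch of the intended use (hypothetical input; marked
    /// `ignore` so it is not run as a doctest):
    ///
    /// ```ignore
    /// let mut tokenizer = Tokenizer::new("url(bad url)");
    /// assert!(tokenizer.next().unwrap().is_parse_error()); // BadUrl
    /// ```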
    pub fn is_parse_error(&self) -> bool {
        matches!(
            *self,
            BadUrl(_) | BadString(_) | CloseParenthesis | CloseSquareBracket | CloseCurlyBracket
        )
    }
}

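/// A CSS tokenizer over a borrowed input string.
///
/// A minimal sketch of driving it to exhaustion (`next` returns `Err(())`
/// at end of input; marked `ignore` so it is not run as a doctest):
///
/// ```ignore
/// let mut tokenizer = Tokenizer::new("a { color: red }");
/// while let Ok(token) = tokenizer.next() {
///     println!("{:?}", token);
/// }
/// ```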
#[derive(Clone)]
pub struct Tokenizer<'a> {
    input: &'a str,
    /// Counted in bytes, not code points. From 0.
    position: usize,
    /// The position at the start of the current line, adjusted so that
    /// computing the column gives the result in UTF-16 code units.
    current_line_start_position: usize,
    current_line_number: u32,
    var_or_env_functions: SeenStatus,
    source_map_url: Option<&'a str>,
    source_url: Option<&'a str>,
}

#[derive(Copy, Clone, PartialEq, Eq)]
enum SeenStatus {
    DontCare,
    LookingForThem,
    SeenAtLeastOne,
}

impl<'a> Tokenizer<'a> {
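    /// Create a new tokenizer for `input`, with line numbers starting at 0.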
    #[inline]
    pub fn new(input: &str) -> Tokenizer {
        Tokenizer::with_first_line_number(input, 0)
    }

    #[inline]
    pub fn with_first_line_number(input: &str, first_line_number: u32) -> Tokenizer {
        Tokenizer {
            input: input,
            position: 0,
            current_line_start_position: 0,
            current_line_number: first_line_number,
            var_or_env_functions: SeenStatus::DontCare,
            source_map_url: None,
            source_url: None,
        }
    }

    #[inline]
    pub fn look_for_var_or_env_functions(&mut self) {
        self.var_or_env_functions = SeenStatus::LookingForThem;
    }

    #[inline]
    pub fn seen_var_or_env_functions(&mut self) -> bool {
        let seen = self.var_or_env_functions == SeenStatus::SeenAtLeastOne;
        self.var_or_env_functions = SeenStatus::DontCare;
        seen
    }

    #[inline]
    pub fn see_function(&mut self, name: &str) {
        if self.var_or_env_functions == SeenStatus::LookingForThem {
            if name.eq_ignore_ascii_case("var") || name.eq_ignore_ascii_case("env") {
                self.var_or_env_functions = SeenStatus::SeenAtLeastOne;
            }
        }
    }

    #[inline]
    pub fn next(&mut self) -> Result<Token<'a>, ()> {
        next_token(self)
    }

    #[inline]
    pub fn position(&self) -> SourcePosition {
        SourcePosition(self.position)
    }

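    /// Return the line and column of the tokenizer’s current position.
    ///
    /// Lines start at 0 (unless `with_first_line_number` was used); columns
    /// start at 1 and are counted in UTF-16 code units.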
    #[inline]
    pub fn current_source_location(&self) -> SourceLocation {
        SourceLocation {
            line: self.current_line_number,
            column: (self.position - self.current_line_start_position + 1) as u32,
        }
    }

    #[inline]
    pub fn current_source_map_url(&self) -> Option<&'a str> {
        self.source_map_url
    }

    #[inline]
    pub fn current_source_url(&self) -> Option<&'a str> {
        self.source_url
    }

    #[inline]
    pub fn state(&self) -> ParserState {
        ParserState {
            position: self.position,
            current_line_start_position: self.current_line_start_position,
            current_line_number: self.current_line_number,
            at_start_of: None,
        }
    }

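    /// Rewind the tokenizer to a state previously captured with `state`,
    /// e.g. to backtrack after a speculative parse attempt fails.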
    #[inline]
    pub fn reset(&mut self, state: &ParserState) {
        self.position = state.position;
        self.current_line_start_position = state.current_line_start_position;
        self.current_line_number = state.current_line_number;
    }

    #[inline]
    pub fn slice_from(&self, start_pos: SourcePosition) -> &'a str {
        &self.input[start_pos.0..self.position]
    }

    #[inline]
    pub fn slice(&self, range: Range<SourcePosition>) -> &'a str {
        &self.input[range.start.0..range.end.0]
    }

    pub fn current_source_line(&self) -> &'a str {
        let current = self.position;
        let start = self.input[0..current]
            .rfind(|c| matches!(c, '\r' | '\n' | '\x0C'))
            .map_or(0, |start| start + 1);
        let end = self.input[current..]
            .find(|c| matches!(c, '\r' | '\n' | '\x0C'))
            .map_or(self.input.len(), |end| current + end);
        &self.input[start..end]
    }

    #[inline]
    pub fn next_byte(&self) -> Option<u8> {
        if self.is_eof() {
            None
        } else {
            Some(self.input.as_bytes()[self.position])
        }
    }

    // If false, `tokenizer.next_char()` will not panic.
    #[inline]
    fn is_eof(&self) -> bool {
        !self.has_at_least(0)
    }

    // If true, the input has at least `n` bytes left *after* the current one.
    // That is, `tokenizer.byte_at(n)` will not panic.
    #[inline]
    fn has_at_least(&self, n: usize) -> bool {
        self.position + n < self.input.len()
    }

    // Advance over N bytes in the input.  This function can advance
    // over ASCII bytes (excluding newlines), or UTF-8 sequence
    // leaders (excluding leaders for 4-byte sequences).
    #[inline]
    pub fn advance(&mut self, n: usize) {
        if cfg!(debug_assertions) {
            // Each byte must either be an ASCII byte or a sequence
            // leader, but not a 4-byte leader; also newlines are
            // rejected.
            for i in 0..n {
                let b = self.byte_at(i);
                debug_assert!(b.is_ascii() || (b & 0xF0 != 0xF0 && b & 0xC0 != 0x80));
                debug_assert!(b != b'\r' && b != b'\n' && b != b'\x0C');
            }
        }
        self.position += n
    }

    // Assumes non-EOF
    #[inline]
    fn next_byte_unchecked(&self) -> u8 {
        self.byte_at(0)
    }

    #[inline]
    fn byte_at(&self, offset: usize) -> u8 {
        self.input.as_bytes()[self.position + offset]
    }

    // Advance over a single byte; the byte must be a UTF-8 sequence
    // leader for a 4-byte sequence.
    #[inline]
    fn consume_4byte_intro(&mut self) {
        debug_assert!(self.next_byte_unchecked() & 0xF0 == 0xF0);
        // This takes two UTF-16 characters to represent, so we
        // actually have an undercount.
        self.current_line_start_position = self.current_line_start_position.wrapping_sub(1);
        self.position += 1;
    }

    // Advance over a single byte; the byte must be a UTF-8
    // continuation byte.
    #[inline]
    fn consume_continuation_byte(&mut self) {
        debug_assert!(self.next_byte_unchecked() & 0xC0 == 0x80);
        // Continuation bytes contribute to column overcount.  Note
        // that due to the special case for the 4-byte sequence intro,
        // we must use wrapping add here.
        self.current_line_start_position = self.current_line_start_position.wrapping_add(1);
        self.position += 1;
    }

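    // Worked example of the column bookkeeping above: U+1D11E MUSICAL SYMBOL
    // G CLEF is four UTF-8 bytes but two UTF-16 code units. Consuming it byte
    // by byte runs `consume_4byte_intro` once (-1) and
    // `consume_continuation_byte` three times (+3), shifting
    // `current_line_start_position` by a net +2, so the column
    // (`position - current_line_start_position + 1`) advances by
    // 4 - 2 = 2 UTF-16 units, as intended.
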
    // Advance over any kind of byte, excluding newlines.
    #[inline(never)]
    fn consume_known_byte(&mut self, byte: u8) {
        debug_assert!(byte != b'\r' && byte != b'\n' && byte != b'\x0C');
        self.position += 1;
        // Continuation bytes contribute to column overcount.
        if byte & 0xF0 == 0xF0 {
            // This takes two UTF-16 characters to represent, so we
            // actually have an undercount.
            self.current_line_start_position = self.current_line_start_position.wrapping_sub(1);
        } else if byte & 0xC0 == 0x80 {
            // Note that due to the special case for the 4-byte
            // sequence intro, we must use wrapping add here.
            self.current_line_start_position = self.current_line_start_position.wrapping_add(1);
        }
    }

    #[inline]
    fn next_char(&self) -> char {
        self.input[self.position..].chars().next().unwrap()
    }

    // Given that a newline has been seen, advance over the newline
    // and update the state.
    #[inline]
    fn consume_newline(&mut self) {
        let byte = self.next_byte_unchecked();
        debug_assert!(byte == b'\r' || byte == b'\n' || byte == b'\x0C');
        self.position += 1;
        if byte == b'\r' && self.next_byte() == Some(b'\n') {
            self.position += 1;
        }
        self.current_line_start_position = self.position;
        self.current_line_number += 1;
    }

    #[inline]
    fn has_newline_at(&self, offset: usize) -> bool {
        self.position + offset < self.input.len()
            && matches!(self.byte_at(offset), b'\n' | b'\r' | b'\x0C')
    }

    #[inline]
    fn consume_char(&mut self) -> char {
        let c = self.next_char();
        let len_utf8 = c.len_utf8();
        self.position += len_utf8;
        // Note that due to the special case for the 4-byte sequence
        // intro, we must use wrapping add here.
        self.current_line_start_position = self
            .current_line_start_position
            .wrapping_add(len_utf8 - c.len_utf16());
        c
    }

    #[inline]
    fn starts_with(&self, needle: &[u8]) -> bool {
        self.input.as_bytes()[self.position..].starts_with(needle)
    }

    pub fn skip_whitespace(&mut self) {
        while !self.is_eof() {
            match_byte! { self.next_byte_unchecked(),
                b' ' | b'\t' => {
                    self.advance(1)
                },
                b'\n' | b'\x0C' | b'\r' => {
                    self.consume_newline();
                },
                b'/' => {
                    if self.starts_with(b"/*") {
                        consume_comment(self);
                    } else {
                        return
                    }
                }
                _ => {
                    return
                }
            }
        }
    }

    pub fn skip_cdc_and_cdo(&mut self) {
        while !self.is_eof() {
            match_byte! { self.next_byte_unchecked(),
                b' ' | b'\t' => {
                    self.advance(1)
                },
                b'\n' | b'\x0C' | b'\r' => {
                    self.consume_newline();
                },
                b'/' => {
                    if self.starts_with(b"/*") {
                        consume_comment(self);
                    } else {
                        return
                    }
                }
                b'<' => {
                    if self.starts_with(b"<!--") {
                        self.advance(4)
                    } else {
                        return
                    }
                }
                b'-' => {
                    if self.starts_with(b"-->") {
                        self.advance(3)
                    } else {
                        return
                    }
                }
                _ => {
                    return
                }
            }
        }
    }
}

/// A position from the start of the input, counted in UTF-8 bytes.
#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)]
pub struct SourcePosition(pub(crate) usize);

impl SourcePosition {
    /// Returns the current byte index in the original input.
    #[inline]
    pub fn byte_index(&self) -> usize {
        self.0
    }
}

/// The line and column number for a given position within the input.
#[derive(PartialEq, Eq, Debug, Clone, Copy)]
pub struct SourceLocation {
    /// The line number, starting at 0 for the first line, unless `with_first_line_number` was used.
    pub line: u32,

    /// The column number within a line, starting at 1 for the first character of the line.
    /// Column numbers are counted in UTF-16 code units.
    pub column: u32,
}

fn next_token<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
    if tokenizer.is_eof() {
        return Err(());
    }
    let b = tokenizer.next_byte_unchecked();
    let token = match_byte! { b,
        b' ' | b'\t' => {
            consume_whitespace(tokenizer, false)
        },
        b'\n' | b'\x0C' | b'\r' => {
            consume_whitespace(tokenizer, true)
        },
        b'"' => { consume_string(tokenizer, false) },
        b'#' => {
            tokenizer.advance(1);
            if is_ident_start(tokenizer) { IDHash(consume_name(tokenizer)) }
            else if !tokenizer.is_eof() && match tokenizer.next_byte_unchecked() {
                // Any other valid case here already resulted in IDHash.
                b'0'..=b'9' | b'-' => true,
                _ => false,
            } { Hash(consume_name(tokenizer)) }
            else { Delim('#') }
        },
        b'$' => {
            if tokenizer.starts_with(b"$=") { tokenizer.advance(2); SuffixMatch }
            else { tokenizer.advance(1); Delim('$') }
        },
        b'\'' => { consume_string(tokenizer, true) },
        b'(' => { tokenizer.advance(1); ParenthesisBlock },
        b')' => { tokenizer.advance(1); CloseParenthesis },
        b'*' => {
            if tokenizer.starts_with(b"*=") { tokenizer.advance(2); SubstringMatch }
            else { tokenizer.advance(1); Delim('*') }
        },
        b'+' => {
            if (
                tokenizer.has_at_least(1)
                && matches!(tokenizer.byte_at(1), b'0'..=b'9')
            ) || (
                tokenizer.has_at_least(2)
                && tokenizer.byte_at(1) == b'.'
                && matches!(tokenizer.byte_at(2), b'0'..=b'9')
            ) {
                consume_numeric(tokenizer)
            } else {
                tokenizer.advance(1);
                Delim('+')
            }
        },
        b',' => { tokenizer.advance(1); Comma },
        b'-' => {
            if (
                tokenizer.has_at_least(1)
                && matches!(tokenizer.byte_at(1), b'0'..=b'9')
            ) || (
                tokenizer.has_at_least(2)
                && tokenizer.byte_at(1) == b'.'
                && matches!(tokenizer.byte_at(2), b'0'..=b'9')
            ) {
                consume_numeric(tokenizer)
            } else if tokenizer.starts_with(b"-->") {
                tokenizer.advance(3);
                CDC
            } else if is_ident_start(tokenizer) {
                consume_ident_like(tokenizer)
            } else {
                tokenizer.advance(1);
                Delim('-')
            }
        },
        b'.' => {
            if tokenizer.has_at_least(1)
                && matches!(tokenizer.byte_at(1), b'0'..=b'9'
            ) {
                consume_numeric(tokenizer)
            } else {
                tokenizer.advance(1);
                Delim('.')
            }
        }
        b'/' => {
            if tokenizer.starts_with(b"/*") {
                Comment(consume_comment(tokenizer))
            } else {
                tokenizer.advance(1);
                Delim('/')
            }
        }
        b'0'..=b'9' => { consume_numeric(tokenizer) },
        b':' => { tokenizer.advance(1); Colon },
        b';' => { tokenizer.advance(1); Semicolon },
        b'<' => {
            if tokenizer.starts_with(b"<!--") {
                tokenizer.advance(4);
                CDO
            } else {
                tokenizer.advance(1);
                Delim('<')
            }
        },
        b'@' => {
            tokenizer.advance(1);
            if is_ident_start(tokenizer) { AtKeyword(consume_name(tokenizer)) }
            else { Delim('@') }
        },
        b'a'..=b'z' | b'A'..=b'Z' | b'_' | b'\0' => { consume_ident_like(tokenizer) },
        b'[' => { tokenizer.advance(1); SquareBracketBlock },
        b'\\' => {
            if !tokenizer.has_newline_at(1) { consume_ident_like(tokenizer) }
            else { tokenizer.advance(1); Delim('\\') }
        },
        b']' => { tokenizer.advance(1); CloseSquareBracket },
        b'^' => {
            if tokenizer.starts_with(b"^=") { tokenizer.advance(2); PrefixMatch }
            else { tokenizer.advance(1); Delim('^') }
        },
        b'{' => { tokenizer.advance(1); CurlyBracketBlock },
        b'|' => {
            if tokenizer.starts_with(b"|=") { tokenizer.advance(2); DashMatch }
            else { tokenizer.advance(1); Delim('|') }
        },
        b'}' => { tokenizer.advance(1); CloseCurlyBracket },
        b'~' => {
            if tokenizer.starts_with(b"~=") { tokenizer.advance(2); IncludeMatch }
            else { tokenizer.advance(1); Delim('~') }
        },
        _ => {
            if !b.is_ascii() {
                consume_ident_like(tokenizer)
            } else {
                tokenizer.advance(1);
                Delim(b as char)
            }
        },
    };
    Ok(token)
}

fn consume_whitespace<'a>(tokenizer: &mut Tokenizer<'a>, newline: bool) -> Token<'a> {
    let start_position = tokenizer.position();
    if newline {
        tokenizer.consume_newline();
    } else {
        tokenizer.advance(1);
    }
    while !tokenizer.is_eof() {
        let b = tokenizer.next_byte_unchecked();
        match_byte! { b,
            b' ' | b'\t' => {
                tokenizer.advance(1);
            }
            b'\n' | b'\x0C' | b'\r' => {
                tokenizer.consume_newline();
            }
            _ => {
                break
            }
        }
    }
    WhiteSpace(tokenizer.slice_from(start_position))
}

// Check for sourceMappingURL or sourceURL comments and update the
// tokenizer appropriately.
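// For example, the comment `/*# sourceMappingURL=style.css.map */` results
// in `source_map_url` being set to `Some("style.css.map")`.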
fn check_for_source_map<'a>(tokenizer: &mut Tokenizer<'a>, contents: &'a str) {
    let directive = "# sourceMappingURL=";
    let directive_old = "@ sourceMappingURL=";

    // If there is a source map directive, extract the URL.
    if contents.starts_with(directive) || contents.starts_with(directive_old) {
        let contents = &contents[directive.len()..];
        tokenizer.source_map_url = contents
            .split(|c| c == ' ' || c == '\t' || c == '\x0C' || c == '\r' || c == '\n')
            .next()
    }

    let directive = "# sourceURL=";
    let directive_old = "@ sourceURL=";

    // If there is a source URL directive, extract the URL.
    if contents.starts_with(directive) || contents.starts_with(directive_old) {
        let contents = &contents[directive.len()..];
        tokenizer.source_url = contents
            .split(|c| c == ' ' || c == '\t' || c == '\x0C' || c == '\r' || c == '\n')
            .next()
    }
}

fn consume_comment<'a>(tokenizer: &mut Tokenizer<'a>) -> &'a str {
    tokenizer.advance(2); // consume "/*"
    let start_position = tokenizer.position();
    while !tokenizer.is_eof() {
        match_byte! { tokenizer.next_byte_unchecked(),
            b'*' => {
                let end_position = tokenizer.position();
                tokenizer.advance(1);
                if tokenizer.next_byte() == Some(b'/') {
                    tokenizer.advance(1);
                    let contents = tokenizer.slice(start_position..end_position);
                    check_for_source_map(tokenizer, contents);
                    return contents
                }
            }
            b'\n' | b'\x0C' | b'\r' => {
                tokenizer.consume_newline();
            }
            b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
            b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
            _ => {
                // ASCII or other leading byte.
                tokenizer.advance(1);
            }
        }
    }
    let contents = tokenizer.slice_from(start_position);
    check_for_source_map(tokenizer, contents);
    contents
}

fn consume_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool) -> Token<'a> {
    match consume_quoted_string(tokenizer, single_quote) {
        Ok(value) => QuotedString(value),
        Err(value) => BadString(value),
    }
}

/// Return `Err(string)` on syntax error (i.e. an unescaped newline),
/// where the string is the contents consumed up to that point.
fn consume_quoted_string<'a>(
    tokenizer: &mut Tokenizer<'a>,
    single_quote: bool,
) -> Result<CowRcStr<'a>, CowRcStr<'a>> {
    tokenizer.advance(1); // Skip the initial quote
                          // start_pos is at code point boundary, after " or '
    let start_pos = tokenizer.position();
    let mut string_bytes;
    loop {
        if tokenizer.is_eof() {
            return Ok(tokenizer.slice_from(start_pos).into());
        }
        match_byte! { tokenizer.next_byte_unchecked(),
            b'"' => {
                if !single_quote {
                    let value = tokenizer.slice_from(start_pos);
                    tokenizer.advance(1);
                    return Ok(value.into())
                }
                tokenizer.advance(1);
            }
            b'\'' => {
                if single_quote {
                    let value = tokenizer.slice_from(start_pos);
                    tokenizer.advance(1);
                    return Ok(value.into())
                }
                tokenizer.advance(1);
            }
            b'\\' | b'\0' => {
                // * The tokenizer’s input is UTF-8 since it’s `&str`.
                // * start_pos is at a code point boundary
                // * so is the current position (which is before '\\' or '\0')
                //
                // So `string_bytes` is well-formed UTF-8.
                string_bytes = tokenizer.slice_from(start_pos).as_bytes().to_owned();
                break
            }
            b'\n' | b'\r' | b'\x0C' => {
                return Err(tokenizer.slice_from(start_pos).into())
            },
            b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
            b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
            _ => {
                // ASCII or other leading byte.
                tokenizer.advance(1);
            }
        }
    }

    while !tokenizer.is_eof() {
        let b = tokenizer.next_byte_unchecked();
        match_byte! { b,
            b'\n' | b'\r' | b'\x0C' => {
                return Err(
                    // string_bytes is well-formed UTF-8, see other comments.
                    unsafe {
                        from_utf8_release_unchecked(string_bytes)
                    }.into()
                );
            }
            b'"' => {
                tokenizer.advance(1);
                if !single_quote {
                    break;
                }
            }
            b'\'' => {
                tokenizer.advance(1);
                if single_quote {
                    break;
                }
            }
            b'\\' => {
                tokenizer.advance(1);
                if !tokenizer.is_eof() {
                    match tokenizer.next_byte_unchecked() {
                        // Escaped newline
                        b'\n' | b'\x0C' | b'\r' => {
                            tokenizer.consume_newline();
                        }
                        // This pushes one well-formed code point
                        _ => consume_escape_and_write(tokenizer, &mut string_bytes)
                    }
                }
                // else: escaped EOF, do nothing.
                continue;
            }
            b'\0' => {
                tokenizer.advance(1);
                string_bytes.extend("\u{FFFD}".as_bytes());
                continue;
            }
            b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
            b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
            _ => {
                // ASCII or other leading byte.
                tokenizer.advance(1);
            },
        }

        // If this byte is part of a multi-byte code point,
        // we’ll end up copying the whole code point before this loop does something else.
        string_bytes.push(b);
    }

    Ok(
        // string_bytes is well-formed UTF-8, see other comments.
        unsafe { from_utf8_release_unchecked(string_bytes) }.into(),
    )
}

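// Whether the tokenizer is positioned at the start of an ident: e.g. `foo`,
// `-foo`, `--foo`, an escape such as `\66 oo`, or any non-ASCII code point.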
#[inline]
fn is_ident_start(tokenizer: &mut Tokenizer) -> bool {
    !tokenizer.is_eof()
        && match_byte! { tokenizer.next_byte_unchecked(),
            b'a'..=b'z' | b'A'..=b'Z' | b'_' | b'\0' => { true },
            b'-' => {
                tokenizer.has_at_least(1) && match_byte! { tokenizer.byte_at(1),
                    b'a'..=b'z' | b'A'..=b'Z' | b'-' | b'_' | b'\0' => {
                        true
                    }
                    b'\\' => { !tokenizer.has_newline_at(1) }
                    b => { !b.is_ascii() },
                }
            },
            b'\\' => { !tokenizer.has_newline_at(1) },
            b => { !b.is_ascii() },
        }
}

fn consume_ident_like<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
    let value = consume_name(tokenizer);
    if !tokenizer.is_eof() && tokenizer.next_byte_unchecked() == b'(' {
        tokenizer.advance(1);
        if value.eq_ignore_ascii_case("url") {
            consume_unquoted_url(tokenizer).unwrap_or(Function(value))
        } else {
            tokenizer.see_function(&value);
            Function(value)
        }
    } else {
        Ident(value)
    }
}

fn consume_name<'a>(tokenizer: &mut Tokenizer<'a>) -> CowRcStr<'a> {
    // start_pos is the end of the previous token, therefore at a code point boundary
    let start_pos = tokenizer.position();
    let mut value_bytes;
    loop {
        if tokenizer.is_eof() {
            return tokenizer.slice_from(start_pos).into();
        }
        match_byte! { tokenizer.next_byte_unchecked(),
            b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | b'-' => { tokenizer.advance(1) },
            b'\\' | b'\0' => {
                // * The tokenizer’s input is UTF-8 since it’s `&str`.
                // * start_pos is at a code point boundary
                // * so is the current position (which is before '\\' or '\0')
                //
                // So `value_bytes` is well-formed UTF-8.
                value_bytes = tokenizer.slice_from(start_pos).as_bytes().to_owned();
                break
            }
            b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
            b'\xC0'..=b'\xEF' => { tokenizer.advance(1); }
            b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
            _b => {
                return tokenizer.slice_from(start_pos).into();
            }
        }
    }

    while !tokenizer.is_eof() {
        let b = tokenizer.next_byte_unchecked();
        match_byte! { b,
            b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | b'-'  => {
                tokenizer.advance(1);
                value_bytes.push(b)  // ASCII
            }
            b'\\' => {
                if tokenizer.has_newline_at(1) { break }
                tokenizer.advance(1);
                // This pushes one well-formed code point
                consume_escape_and_write(tokenizer, &mut value_bytes)
            }
            b'\0' => {
                tokenizer.advance(1);
                value_bytes.extend("\u{FFFD}".as_bytes());
            },
            b'\x80'..=b'\xBF' => {
                // This byte *is* part of a multi-byte code point,
                // we’ll end up copying the whole code point before this loop does something else.
                tokenizer.consume_continuation_byte();
                value_bytes.push(b)
            }
            b'\xC0'..=b'\xEF' => {
                // This byte *is* part of a multi-byte code point,
                // we’ll end up copying the whole code point before this loop does something else.
                tokenizer.advance(1);
                value_bytes.push(b)
            }
            b'\xF0'..=b'\xFF' => {
                tokenizer.consume_4byte_intro();
                value_bytes.push(b)
            }
            _ => {
                // ASCII
                break;
            }
        }
    }
    // value_bytes is well-formed UTF-8, see other comments.
    unsafe { from_utf8_release_unchecked(value_bytes) }.into()
}

fn byte_to_hex_digit(b: u8) -> Option<u32> {
    Some(match_byte! { b,
        b'0' ..= b'9' => { b - b'0' },
        b'a' ..= b'f' => { b - b'a' + 10 },
        b'A' ..= b'F' => { b - b'A' + 10 },
        _ => {
            return None
        }
    } as u32)
}

fn byte_to_decimal_digit(b: u8) -> Option<u32> {
    if b >= b'0' && b <= b'9' {
        Some((b - b'0') as u32)
    } else {
        None
    }
}

fn consume_numeric<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
    // Parse [+-]?\d*(\.\d+)?([eE][+-]?\d+)?
    // But this is always called so that there is at least one digit in \d*(\.\d+)?

    // Do all the math in f64 so that large numbers overflow to +/-inf
    // and i32::{MIN, MAX} are within range.
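    //
    // For example, "-12.5e2" yields has_sign = true, value = -1250.0, and
    // int_value = None (a fractional part or exponent clears `is_integer`).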

    let (has_sign, sign) = match tokenizer.next_byte_unchecked() {
        b'-' => (true, -1.),
        b'+' => (true, 1.),
        _ => (false, 1.),
    };
    if has_sign {
        tokenizer.advance(1);
    }

    let mut integral_part: f64 = 0.;
    while let Some(digit) = byte_to_decimal_digit(tokenizer.next_byte_unchecked()) {
        integral_part = integral_part * 10. + digit as f64;
        tokenizer.advance(1);
        if tokenizer.is_eof() {
            break;
        }
    }

    let mut is_integer = true;

    let mut fractional_part: f64 = 0.;
    if tokenizer.has_at_least(1)
        && tokenizer.next_byte_unchecked() == b'.'
        && matches!(tokenizer.byte_at(1), b'0'..=b'9')
    {
        is_integer = false;
        tokenizer.advance(1); // Consume '.'
        let mut factor = 0.1;
        while let Some(digit) = byte_to_decimal_digit(tokenizer.next_byte_unchecked()) {
            fractional_part += digit as f64 * factor;
            factor *= 0.1;
            tokenizer.advance(1);
            if tokenizer.is_eof() {
                break;
            }
        }
    }

    let mut value = sign * (integral_part + fractional_part);

    if tokenizer.has_at_least(1) && matches!(tokenizer.next_byte_unchecked(), b'e' | b'E') {
        if matches!(tokenizer.byte_at(1), b'0'..=b'9')
            || (tokenizer.has_at_least(2)
                && matches!(tokenizer.byte_at(1), b'+' | b'-')
                && matches!(tokenizer.byte_at(2), b'0'..=b'9'))
        {
            is_integer = false;
            tokenizer.advance(1);
            let (has_sign, sign) = match tokenizer.next_byte_unchecked() {
                b'-' => (true, -1.),
                b'+' => (true, 1.),
                _ => (false, 1.),
            };
            if has_sign {
                tokenizer.advance(1);
            }
            let mut exponent: f64 = 0.;
            while let Some(digit) = byte_to_decimal_digit(tokenizer.next_byte_unchecked()) {
                exponent = exponent * 10. + digit as f64;
                tokenizer.advance(1);
                if tokenizer.is_eof() {
                    break;
                }
            }
            value *= f64::powf(10., sign * exponent);
        }
    }

    let int_value = if is_integer {
        Some(if value >= i32::MAX as f64 {
            i32::MAX
        } else if value <= i32::MIN as f64 {
            i32::MIN
        } else {
            value as i32
        })
    } else {
        None
    };

    if !tokenizer.is_eof() && tokenizer.next_byte_unchecked() == b'%' {
        tokenizer.advance(1);
        return Percentage {
            unit_value: (value / 100.) as f32,
            int_value: int_value,
            has_sign: has_sign,
        };
    }
    let value = value as f32;
    if is_ident_start(tokenizer) {
        let unit = consume_name(tokenizer);
        Dimension {
            value: value,
            int_value: int_value,
            has_sign: has_sign,
            unit: unit,
        }
    } else {
        Number {
            value: value,
            int_value: int_value,
            has_sign: has_sign,
        }
    }
}

#[inline]
unsafe fn from_utf8_release_unchecked(string_bytes: Vec<u8>) -> String {
    if cfg!(debug_assertions) {
        String::from_utf8(string_bytes).unwrap()
    } else {
        String::from_utf8_unchecked(string_bytes)
    }
}

fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
    // This is only called after "url(", so the current position is a code point boundary.
    let start_position = tokenizer.position;
    let from_start = &tokenizer.input[tokenizer.position..];
    let mut newlines = 0;
    let mut last_newline = 0;
    let mut found_printable_char = false;
    let mut iter = from_start.bytes().enumerate();
    loop {
        let (offset, b) = match iter.next() {
            Some(item) => item,
            None => {
                tokenizer.position = tokenizer.input.len();
                break;
            }
        };
        match_byte! { b,
            b' ' | b'\t' => {},
            b'\n' | b'\x0C' => {
                newlines += 1;
                last_newline = offset;
            }
            b'\r' => {
                if from_start.as_bytes().get(offset + 1) != Some(&b'\n') {
                    newlines += 1;
                    last_newline = offset;
                }
            }
            b'"' | b'\'' => { return Err(()) },  // Do not advance
            b')' => {
                // Don't use advance, because we may be skipping
                // newlines here, and we want to avoid the assert.
                tokenizer.position += offset + 1;
                break
            }
            _ => {
                // Don't use advance, because we may be skipping
                // newlines here, and we want to avoid the assert.
                tokenizer.position += offset;
                found_printable_char = true;
                break
            }
        }
    }

    if newlines > 0 {
        tokenizer.current_line_number += newlines;
        // No need for wrapping_add here, because there's no possible
        // way to wrap.
        tokenizer.current_line_start_position = start_position + last_newline + 1;
    }

    if found_printable_char {
        // This function only consumed ASCII (whitespace) bytes,
        // so the current position is a code point boundary.
        return Ok(consume_unquoted_url_internal(tokenizer));
    } else {
        return Ok(UnquotedUrl("".into()));
    }

    fn consume_unquoted_url_internal<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
        // This function is only called with start_pos at a code point boundary.
        let start_pos = tokenizer.position();
        let mut string_bytes: Vec<u8>;
        loop {
            if tokenizer.is_eof() {
                return UnquotedUrl(tokenizer.slice_from(start_pos).into());
            }
            match_byte! { tokenizer.next_byte_unchecked(),
                b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {
                    let value = tokenizer.slice_from(start_pos);
                    return consume_url_end(tokenizer, start_pos, value.into())
                }
                b')' => {
                    let value = tokenizer.slice_from(start_pos);
                    tokenizer.advance(1);
                    return UnquotedUrl(value.into())
                }
                b'\x01'..=b'\x08' | b'\x0B' | b'\x0E'..=b'\x1F' | b'\x7F'  // non-printable
                    | b'"' | b'\'' | b'(' => {
                    tokenizer.advance(1);
                    return consume_bad_url(tokenizer, start_pos)
                },
                b'\\' | b'\0' => {
                    // * The tokenizer’s input is UTF-8 since it’s `&str`.
                    // * start_pos is at a code point boundary
                    // * so is the current position (which is before '\\' or '\0')
                    //
                    // So `string_bytes` is well-formed UTF-8.
                    string_bytes = tokenizer.slice_from(start_pos).as_bytes().to_owned();
                    break
                }
                b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
                b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
                _ => {
                    // ASCII or other leading byte.
                    tokenizer.advance(1);
                }
            }
        }
        while !tokenizer.is_eof() {
            let b = tokenizer.next_byte_unchecked();
            match_byte! { b,
                b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {
                    // string_bytes is well-formed UTF-8, see other comments.
                    let string = unsafe { from_utf8_release_unchecked(string_bytes) }.into();
                    return consume_url_end(tokenizer, start_pos, string)
                }
                b')' => {
                    tokenizer.advance(1);
                    break;
                }
                b'\x01'..=b'\x08' | b'\x0B' | b'\x0E'..=b'\x1F' | b'\x7F'  // non-printable
                    | b'"' | b'\'' | b'(' => {
                    tokenizer.advance(1);
                    return consume_bad_url(tokenizer, start_pos);
                }
                b'\\' => {
                    tokenizer.advance(1);
                    if tokenizer.has_newline_at(0) {
                        return consume_bad_url(tokenizer, start_pos)
                    }

                    // This pushes one well-formed code point to string_bytes
                    consume_escape_and_write(tokenizer, &mut string_bytes)
                },
                b'\0' => {
                    tokenizer.advance(1);
                    string_bytes.extend("\u{FFFD}".as_bytes());
                }
                b'\x80'..=b'\xBF' => {
                    // We’ll end up copying the whole code point
                    // before this loop does something else.
                    tokenizer.consume_continuation_byte();
                    string_bytes.push(b);
                }
                b'\xF0'..=b'\xFF' => {
                    // We’ll end up copying the whole code point
                    // before this loop does something else.
                    tokenizer.consume_4byte_intro();
                    string_bytes.push(b);
                }
                // If this byte is part of a multi-byte code point,
                // we’ll end up copying the whole code point before this loop does something else.
                b => {
                    // ASCII or other leading byte.
                    tokenizer.advance(1);
                    string_bytes.push(b)
                }
            }
        }
        UnquotedUrl(
            // string_bytes is well-formed UTF-8, see other comments.
            unsafe { from_utf8_release_unchecked(string_bytes) }.into(),
        )
    }

    fn consume_url_end<'a>(
        tokenizer: &mut Tokenizer<'a>,
        start_pos: SourcePosition,
        string: CowRcStr<'a>,
    ) -> Token<'a> {
        while !tokenizer.is_eof() {
            match_byte! { tokenizer.next_byte_unchecked(),
                b')' => {
                    tokenizer.advance(1);
                    break
                }
                b' ' | b'\t' => { tokenizer.advance(1); }
                b'\n' | b'\x0C' | b'\r' => {
                    tokenizer.consume_newline();
                }
                b => {
                    tokenizer.consume_known_byte(b);
                    return consume_bad_url(tokenizer, start_pos);
                }
            }
        }
        UnquotedUrl(string)
    }

    fn consume_bad_url<'a>(tokenizer: &mut Tokenizer<'a>, start_pos: SourcePosition) -> Token<'a> {
        // Consume up to the closing )
        while !tokenizer.is_eof() {
            match_byte! { tokenizer.next_byte_unchecked(),
                b')' => {
                    let contents = tokenizer.slice_from(start_pos).into();
                    tokenizer.advance(1);
                    return BadUrl(contents)
                }
                b'\\' => {
                    tokenizer.advance(1);
                    if matches!(tokenizer.next_byte(), Some(b')') | Some(b'\\')) {
                        tokenizer.advance(1); // Skip an escaped ')' or '\'
                    }
                }
                b'\n' | b'\x0C' | b'\r' => {
                    tokenizer.consume_newline();
                }
                b => {
                    tokenizer.consume_known_byte(b);
                }
            }
        }
        BadUrl(tokenizer.slice_from(start_pos).into())
    }
}

// (value, number of digits up to 6)
fn consume_hex_digits<'a>(tokenizer: &mut Tokenizer<'a>) -> (u32, u32) {
    let mut value = 0;
    let mut digits = 0;
    while digits < 6 && !tokenizer.is_eof() {
        match byte_to_hex_digit(tokenizer.next_byte_unchecked()) {
            Some(digit) => {
                value = value * 16 + digit;
                digits += 1;
                tokenizer.advance(1);
            }
            None => break,
        }
    }
    (value, digits)
}

// Same constraints as consume_escape, except it writes the result into
// `bytes` instead of returning it.
fn consume_escape_and_write(tokenizer: &mut Tokenizer, bytes: &mut Vec<u8>) {
    bytes.extend(
        consume_escape(tokenizer)
            .encode_utf8(&mut [0; 4])
            .as_bytes(),
    )
}

// Assumes that the U+005C REVERSE SOLIDUS (\) has already been consumed
// and that the next input character has already been verified
// to not be a newline.
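//
// For example, after the `\` of "\41 x" this returns 'A' and also consumes
// the terminating space, leaving the tokenizer at `x`; after the `\` of
// "\," it returns ','.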
fn consume_escape(tokenizer: &mut Tokenizer) -> char {
    if tokenizer.is_eof() {
        return '\u{FFFD}';
    } // Escaped EOF
    match_byte! { tokenizer.next_byte_unchecked(),
        b'0'..=b'9' | b'A'..=b'F' | b'a'..=b'f' => {
            let (c, _) = consume_hex_digits(tokenizer);
            if !tokenizer.is_eof() {
                match_byte! { tokenizer.next_byte_unchecked(),
                    b' ' | b'\t' => {
                        tokenizer.advance(1)
                    }
                    b'\n' | b'\x0C' | b'\r' => {
                        tokenizer.consume_newline();
                    }
                    _ => {}
                }
            }
            static REPLACEMENT_CHAR: char = '\u{FFFD}';
            if c != 0 {
                let c = char::from_u32(c);
                c.unwrap_or(REPLACEMENT_CHAR)
            } else {
                REPLACEMENT_CHAR
            }
        },
        b'\0' => {
            tokenizer.advance(1);
            '\u{FFFD}'
        }
        _ => { tokenizer.consume_char() }
    }
}