1 /* This Source Code Form is subject to the terms of the Mozilla Public
2 * License, v. 2.0. If a copy of the MPL was not distributed with this
3 * file, You can obtain one at http://mozilla.org/MPL/2.0/. */
4
5 // https://drafts.csswg.org/css-syntax/#tokenization
6
7 use self::Token::*;
8 use crate::cow_rc_str::CowRcStr;
9 use crate::parser::ParserState;
10 use matches::matches;
11 use std::char;
12 use std::i32;
13 use std::ops::Range;
14
15 /// One of the pieces the CSS input is broken into.
16 ///
17 /// Some components use `Cow` in order to borrow from the original input string
18 /// and avoid allocating/copying when possible.
/// One of the pieces the CSS input is broken into.
///
/// Some components use `Cow` in order to borrow from the original input string
/// and avoid allocating/copying when possible.
#[derive(PartialEq, Debug, Clone)]
pub enum Token<'a> {
    /// A [`<ident-token>`](https://drafts.csswg.org/css-syntax/#ident-token-diagram)
    Ident(CowRcStr<'a>),

    /// A [`<at-keyword-token>`](https://drafts.csswg.org/css-syntax/#at-keyword-token-diagram)
    ///
    /// The value does not include the `@` marker.
    AtKeyword(CowRcStr<'a>),

    /// A [`<hash-token>`](https://drafts.csswg.org/css-syntax/#hash-token-diagram) with the type flag set to "unrestricted"
    ///
    /// The value does not include the `#` marker.
    Hash(CowRcStr<'a>),

    /// A [`<hash-token>`](https://drafts.csswg.org/css-syntax/#hash-token-diagram) with the type flag set to "id"
    ///
    /// The value does not include the `#` marker.
    IDHash(CowRcStr<'a>), // Hash that is a valid ID selector.

    /// A [`<string-token>`](https://drafts.csswg.org/css-syntax/#string-token-diagram)
    ///
    /// The value does not include the quotes.
    QuotedString(CowRcStr<'a>),

    /// A [`<url-token>`](https://drafts.csswg.org/css-syntax/#url-token-diagram)
    ///
    /// The value does not include the `url(` `)` markers. Note that `url( <string-token> )` is represented by a
    /// `Function` token.
    UnquotedUrl(CowRcStr<'a>),

    /// A `<delim-token>`
    Delim(char),

    /// A [`<number-token>`](https://drafts.csswg.org/css-syntax/#number-token-diagram)
    Number {
        /// Whether the number had a `+` or `-` sign.
        ///
        /// This is used in some cases like the <An+B> micro syntax. (See the `parse_nth` function.)
        has_sign: bool,

        /// The value as a float
        value: f32,

        /// If the original source did not include a fractional part, the value as an integer.
        int_value: Option<i32>,
    },

    /// A [`<percentage-token>`](https://drafts.csswg.org/css-syntax/#percentage-token-diagram)
    Percentage {
        /// Whether the number had a `+` or `-` sign.
        has_sign: bool,

        /// The value as a float, divided by 100 so that the nominal range is 0.0 to 1.0.
        unit_value: f32,

        /// If the original source did not include a fractional part, the value as an integer.
        /// It is **not** divided by 100.
        int_value: Option<i32>,
    },

    /// A [`<dimension-token>`](https://drafts.csswg.org/css-syntax/#dimension-token-diagram)
    Dimension {
        /// Whether the number had a `+` or `-` sign.
        ///
        /// This is used in some cases like the <An+B> micro syntax. (See the `parse_nth` function.)
        has_sign: bool,

        /// The value as a float
        value: f32,

        /// If the original source did not include a fractional part, the value as an integer.
        int_value: Option<i32>,

        /// The unit, e.g. "px" in `12px`
        unit: CowRcStr<'a>,
    },

    /// A [`<whitespace-token>`](https://drafts.csswg.org/css-syntax/#whitespace-token-diagram)
    WhiteSpace(&'a str),

    /// A comment.
    ///
    /// The CSS Syntax spec does not generate tokens for comments,
    /// but we do, because we can (borrowed &str makes it cheap).
    ///
    /// The value does not include the `/*` `*/` markers.
    Comment(&'a str),

    /// A `:` `<colon-token>`
    Colon, // :

    /// A `;` `<semicolon-token>`
    Semicolon, // ;

    /// A `,` `<comma-token>`
    Comma, // ,

    /// A `~=` [`<include-match-token>`](https://drafts.csswg.org/css-syntax/#include-match-token-diagram)
    IncludeMatch,

    /// A `|=` [`<dash-match-token>`](https://drafts.csswg.org/css-syntax/#dash-match-token-diagram)
    DashMatch,

    /// A `^=` [`<prefix-match-token>`](https://drafts.csswg.org/css-syntax/#prefix-match-token-diagram)
    PrefixMatch,

    /// A `$=` [`<suffix-match-token>`](https://drafts.csswg.org/css-syntax/#suffix-match-token-diagram)
    SuffixMatch,

    /// A `*=` [`<substring-match-token>`](https://drafts.csswg.org/css-syntax/#substring-match-token-diagram)
    SubstringMatch,

    /// A `<!--` [`<CDO-token>`](https://drafts.csswg.org/css-syntax/#CDO-token-diagram)
    CDO,

    /// A `-->` [`<CDC-token>`](https://drafts.csswg.org/css-syntax/#CDC-token-diagram)
    CDC,

    /// A [`<function-token>`](https://drafts.csswg.org/css-syntax/#function-token-diagram)
    ///
    /// The value (name) does not include the `(` marker.
    Function(CowRcStr<'a>),

    /// A `<(-token>`
    ParenthesisBlock,

    /// A `<[-token>`
    SquareBracketBlock,

    /// A `<{-token>`
    CurlyBracketBlock,

    /// A `<bad-url-token>`
    ///
    /// This token always indicates a parse error.
    BadUrl(CowRcStr<'a>),

    /// A `<bad-string-token>`
    ///
    /// This token always indicates a parse error.
    BadString(CowRcStr<'a>),

    /// A `<)-token>`
    ///
    /// When obtained from one of the `Parser::next*` methods,
    /// this token is always unmatched and indicates a parse error.
    CloseParenthesis,

    /// A `<]-token>`
    ///
    /// When obtained from one of the `Parser::next*` methods,
    /// this token is always unmatched and indicates a parse error.
    CloseSquareBracket,

    /// A `<}-token>`
    ///
    /// When obtained from one of the `Parser::next*` methods,
    /// this token is always unmatched and indicates a parse error.
    CloseCurlyBracket,
}
180
181 impl<'a> Token<'a> {
182 /// Return whether this token represents a parse error.
183 ///
184 /// `BadUrl` and `BadString` are tokenizer-level parse errors.
185 ///
186 /// `CloseParenthesis`, `CloseSquareBracket`, and `CloseCurlyBracket` are *unmatched*
187 /// and therefore parse errors when returned by one of the `Parser::next*` methods.
is_parse_error(&self) -> bool188 pub fn is_parse_error(&self) -> bool {
189 matches!(
190 *self,
191 BadUrl(_) | BadString(_) | CloseParenthesis | CloseSquareBracket | CloseCurlyBracket
192 )
193 }
194 }
195
#[derive(Clone)]
pub struct Tokenizer<'a> {
    /// The complete input string being tokenized.
    input: &'a str,
    /// Counted in bytes, not code points. From 0.
    position: usize,
    /// The position at the start of the current line; but adjusted to
    /// ensure that computing the column will give the result in units
    /// of UTF-16 characters.
    current_line_start_position: usize,
    /// Line number of the current line, starting at the value passed to
    /// `with_first_line_number` (0 for `new`).
    current_line_number: u32,
    /// Tracking state for `var()` / `env()` functions; see
    /// `look_for_var_or_env_functions` and `see_function`.
    var_or_env_functions: SeenStatus,
    /// URL extracted from a `sourceMappingURL` comment, if one was seen.
    source_map_url: Option<&'a str>,
    /// URL extracted from a `sourceURL` comment, if one was seen.
    source_url: Option<&'a str>,
}
210
/// Tracking state for whether a `var()` or `env()` function has been seen,
/// queried via `Tokenizer::seen_var_or_env_functions`.
#[derive(Copy, Clone, PartialEq, Eq)]
enum SeenStatus {
    /// Not tracking: `see_function` is a no-op in this state.
    DontCare,
    /// Tracking is enabled, but no `var()`/`env()` has been seen yet.
    LookingForThem,
    /// Tracking is enabled and at least one `var()`/`env()` was seen.
    SeenAtLeastOne,
}
217
impl<'a> Tokenizer<'a> {
    /// Creates a tokenizer over `input`, with line numbers starting at 0.
    #[inline]
    pub fn new(input: &str) -> Tokenizer {
        Tokenizer::with_first_line_number(input, 0)
    }

    /// Creates a tokenizer over `input`, with line numbers starting at
    /// `first_line_number`.
    #[inline]
    pub fn with_first_line_number(input: &str, first_line_number: u32) -> Tokenizer {
        Tokenizer {
            input: input,
            position: 0,
            current_line_start_position: 0,
            current_line_number: first_line_number,
            var_or_env_functions: SeenStatus::DontCare,
            source_map_url: None,
            source_url: None,
        }
    }

    /// Starts tracking `var()` / `env()` functions; query the result with
    /// `seen_var_or_env_functions`.
    #[inline]
    pub fn look_for_var_or_env_functions(&mut self) {
        self.var_or_env_functions = SeenStatus::LookingForThem;
    }

    /// Returns whether a `var()` or `env()` function has been seen since
    /// tracking started, and turns tracking off again.
    #[inline]
    pub fn seen_var_or_env_functions(&mut self) -> bool {
        let seen = self.var_or_env_functions == SeenStatus::SeenAtLeastOne;
        self.var_or_env_functions = SeenStatus::DontCare;
        seen
    }

    /// Records that a function token named `name` was consumed, updating the
    /// `var()`/`env()` tracking state if tracking is enabled.
    #[inline]
    pub fn see_function(&mut self, name: &str) {
        if self.var_or_env_functions == SeenStatus::LookingForThem {
            if name.eq_ignore_ascii_case("var") || name.eq_ignore_ascii_case("env") {
                self.var_or_env_functions = SeenStatus::SeenAtLeastOne;
            }
        }
    }

    /// Returns the next token, or `Err(())` at end of input.
    #[inline]
    pub fn next(&mut self) -> Result<Token<'a>, ()> {
        next_token(self)
    }

    /// Returns the current position, counted in UTF-8 bytes from the start
    /// of the input.
    #[inline]
    pub fn position(&self) -> SourcePosition {
        SourcePosition(self.position)
    }

    /// Returns the current line/column location. Columns start at 1 and are
    /// counted in UTF-16 code units (see `current_line_start_position`).
    #[inline]
    pub fn current_source_location(&self) -> SourceLocation {
        SourceLocation {
            line: self.current_line_number,
            // `current_line_start_position` is adjusted while consuming
            // multi-byte code points, so this byte subtraction yields a
            // column in UTF-16 units.
            column: (self.position - self.current_line_start_position + 1) as u32,
        }
    }

    /// Returns the URL from a `sourceMappingURL` comment, if one was seen.
    #[inline]
    pub fn current_source_map_url(&self) -> Option<&'a str> {
        self.source_map_url
    }

    /// Returns the URL from a `sourceURL` comment, if one was seen.
    #[inline]
    pub fn current_source_url(&self) -> Option<&'a str> {
        self.source_url
    }

    /// Captures the current tokenizer state, for later use with `reset`.
    #[inline]
    pub fn state(&self) -> ParserState {
        ParserState {
            position: self.position,
            current_line_start_position: self.current_line_start_position,
            current_line_number: self.current_line_number,
            at_start_of: None,
        }
    }

    /// Rewinds (or forwards) the tokenizer to a previously captured state.
    #[inline]
    pub fn reset(&mut self, state: &ParserState) {
        self.position = state.position;
        self.current_line_start_position = state.current_line_start_position;
        self.current_line_number = state.current_line_number;
    }

    /// Returns the slice of input from `start_pos` to the current position.
    #[inline]
    pub fn slice_from(&self, start_pos: SourcePosition) -> &'a str {
        &self.input[start_pos.0..self.position]
    }

    /// Returns the slice of input between two previously captured positions.
    #[inline]
    pub fn slice(&self, range: Range<SourcePosition>) -> &'a str {
        &self.input[range.start.0..range.end.0]
    }

    /// Returns the full text of the line containing the current position,
    /// without any trailing newline character.
    pub fn current_source_line(&self) -> &'a str {
        let current = self.position;
        // Scan backwards to the previous newline (or start of input)...
        let start = self.input[0..current]
            .rfind(|c| matches!(c, '\r' | '\n' | '\x0C'))
            .map_or(0, |start| start + 1);
        // ...and forwards to the next newline (or end of input).
        let end = self.input[current..]
            .find(|c| matches!(c, '\r' | '\n' | '\x0C'))
            .map_or(self.input.len(), |end| current + end);
        &self.input[start..end]
    }

    /// Returns the byte at the current position, or `None` at end of input.
    #[inline]
    pub fn next_byte(&self) -> Option<u8> {
        if self.is_eof() {
            None
        } else {
            Some(self.input.as_bytes()[self.position])
        }
    }

    // If false, `tokenizer.next_char()` will not panic.
    #[inline]
    fn is_eof(&self) -> bool {
        !self.has_at_least(0)
    }

    // If true, the input has at least `n` bytes left *after* the current one.
    // That is, `tokenizer.char_at(n)` will not panic.
    #[inline]
    fn has_at_least(&self, n: usize) -> bool {
        self.position + n < self.input.len()
    }

    // Advance over N bytes in the input. This function can advance
    // over ASCII bytes (excluding newlines), or UTF-8 sequence
    // leaders (excluding leaders for 4-byte sequences).
    #[inline]
    pub fn advance(&mut self, n: usize) {
        if cfg!(debug_assertions) {
            // Each byte must either be an ASCII byte or a sequence
            // leader, but not a 4-byte leader; also newlines are
            // rejected. (Those cases need the line/column bookkeeping
            // done by `consume_newline` / `consume_4byte_intro`.)
            for i in 0..n {
                let b = self.byte_at(i);
                debug_assert!(b.is_ascii() || (b & 0xF0 != 0xF0 && b & 0xC0 != 0x80));
                debug_assert!(b != b'\r' && b != b'\n' && b != b'\x0C');
            }
        }
        self.position += n
    }

    // Assumes non-EOF
    #[inline]
    fn next_byte_unchecked(&self) -> u8 {
        self.byte_at(0)
    }

    // Returns the byte `offset` bytes after the current position.
    // Panics if that is out of bounds.
    #[inline]
    fn byte_at(&self, offset: usize) -> u8 {
        self.input.as_bytes()[self.position + offset]
    }

    // Advance over a single byte; the byte must be a UTF-8 sequence
    // leader for a 4-byte sequence.
    #[inline]
    fn consume_4byte_intro(&mut self) {
        debug_assert!(self.next_byte_unchecked() & 0xF0 == 0xF0);
        // This takes two UTF-16 characters to represent, so we
        // actually have an undercount.
        self.current_line_start_position = self.current_line_start_position.wrapping_sub(1);
        self.position += 1;
    }

    // Advance over a single byte; the byte must be a UTF-8
    // continuation byte.
    #[inline]
    fn consume_continuation_byte(&mut self) {
        debug_assert!(self.next_byte_unchecked() & 0xC0 == 0x80);
        // Continuation bytes contribute to column overcount. Note
        // that due to the special case for the 4-byte sequence intro,
        // we must use wrapping add here.
        self.current_line_start_position = self.current_line_start_position.wrapping_add(1);
        self.position += 1;
    }

    // Advance over any kind of byte, excluding newlines.
    #[inline(never)]
    fn consume_known_byte(&mut self, byte: u8) {
        debug_assert!(byte != b'\r' && byte != b'\n' && byte != b'\x0C');
        self.position += 1;
        // Continuation bytes contribute to column overcount.
        if byte & 0xF0 == 0xF0 {
            // This takes two UTF-16 characters to represent, so we
            // actually have an undercount.
            self.current_line_start_position = self.current_line_start_position.wrapping_sub(1);
        } else if byte & 0xC0 == 0x80 {
            // Note that due to the special case for the 4-byte
            // sequence intro, we must use wrapping add here.
            self.current_line_start_position = self.current_line_start_position.wrapping_add(1);
        }
    }

    // Returns the code point at the current position. Assumes non-EOF.
    #[inline]
    fn next_char(&self) -> char {
        self.input[self.position..].chars().next().unwrap()
    }

    // Given that a newline has been seen, advance over the newline
    // and update the state.
    #[inline]
    fn consume_newline(&mut self) {
        let byte = self.next_byte_unchecked();
        debug_assert!(byte == b'\r' || byte == b'\n' || byte == b'\x0C');
        self.position += 1;
        // Treat CRLF as a single newline.
        if byte == b'\r' && self.next_byte() == Some(b'\n') {
            self.position += 1;
        }
        self.current_line_start_position = self.position;
        self.current_line_number += 1;
    }

    // Returns whether the byte `offset` bytes ahead exists and is a newline.
    #[inline]
    fn has_newline_at(&self, offset: usize) -> bool {
        self.position + offset < self.input.len()
            && matches!(self.byte_at(offset), b'\n' | b'\r' | b'\x0C')
    }

    // Advances over one code point and returns it, keeping the UTF-16
    // column adjustment up to date. Assumes non-EOF.
    #[inline]
    fn consume_char(&mut self) -> char {
        let c = self.next_char();
        let len_utf8 = c.len_utf8();
        self.position += len_utf8;
        // Note that due to the special case for the 4-byte sequence
        // intro, we must use wrapping add here.
        self.current_line_start_position = self
            .current_line_start_position
            .wrapping_add(len_utf8 - c.len_utf16());
        c
    }

    // Returns whether the remaining input starts with `needle`.
    #[inline]
    fn starts_with(&self, needle: &[u8]) -> bool {
        self.input.as_bytes()[self.position..].starts_with(needle)
    }

    /// Advances past any run of whitespace and comments.
    pub fn skip_whitespace(&mut self) {
        while !self.is_eof() {
            match_byte! { self.next_byte_unchecked(),
                b' ' | b'\t' => {
                    self.advance(1)
                },
                b'\n' | b'\x0C' | b'\r' => {
                    self.consume_newline();
                },
                b'/' => {
                    if self.starts_with(b"/*") {
                        consume_comment(self);
                    } else {
                        return
                    }
                }
                _ => {
                    return
                }
            }
        }
    }

    /// Advances past whitespace, comments, and `<!--` / `-->` markers.
    pub fn skip_cdc_and_cdo(&mut self) {
        while !self.is_eof() {
            match_byte! { self.next_byte_unchecked(),
                b' ' | b'\t' => {
                    self.advance(1)
                },
                b'\n' | b'\x0C' | b'\r' => {
                    self.consume_newline();
                },
                b'/' => {
                    if self.starts_with(b"/*") {
                        consume_comment(self);
                    } else {
                        return
                    }
                }
                b'<' => {
                    if self.starts_with(b"<!--") {
                        self.advance(4)
                    } else {
                        return
                    }
                }
                b'-' => {
                    if self.starts_with(b"-->") {
                        self.advance(3)
                    } else {
                        return
                    }
                }
                _ => {
                    return
                }
            }
        }
    }
}
518
/// A position from the start of the input, counted in UTF-8 bytes.
#[derive(PartialEq, Eq, PartialOrd, Ord, Debug, Clone, Copy)]
pub struct SourcePosition(pub(crate) usize);

impl SourcePosition {
    /// Returns the current byte index in the original input.
    #[inline]
    pub fn byte_index(&self) -> usize {
        let SourcePosition(index) = *self;
        index
    }
}
530
/// The line and column number for a given position within the input.
#[derive(PartialEq, Eq, Debug, Clone, Copy)]
pub struct SourceLocation {
    /// The line number, starting at 0 for the first line, unless `with_first_line_number` was used.
    pub line: u32,

    /// The column number within a line, starting at 1 for the first character of the line.
    /// Column numbers are counted in UTF-16 code units.
    pub column: u32,
}
541
next_token<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()>542 fn next_token<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
543 if tokenizer.is_eof() {
544 return Err(());
545 }
546 let b = tokenizer.next_byte_unchecked();
547 let token = match_byte! { b,
548 b' ' | b'\t' => {
549 consume_whitespace(tokenizer, false)
550 },
551 b'\n' | b'\x0C' | b'\r' => {
552 consume_whitespace(tokenizer, true)
553 },
554 b'"' => { consume_string(tokenizer, false) },
555 b'#' => {
556 tokenizer.advance(1);
557 if is_ident_start(tokenizer) { IDHash(consume_name(tokenizer)) }
558 else if !tokenizer.is_eof() && match tokenizer.next_byte_unchecked() {
559 // Any other valid case here already resulted in IDHash.
560 b'0'..=b'9' | b'-' => true,
561 _ => false,
562 } { Hash(consume_name(tokenizer)) }
563 else { Delim('#') }
564 },
565 b'$' => {
566 if tokenizer.starts_with(b"$=") { tokenizer.advance(2); SuffixMatch }
567 else { tokenizer.advance(1); Delim('$') }
568 },
569 b'\'' => { consume_string(tokenizer, true) },
570 b'(' => { tokenizer.advance(1); ParenthesisBlock },
571 b')' => { tokenizer.advance(1); CloseParenthesis },
572 b'*' => {
573 if tokenizer.starts_with(b"*=") { tokenizer.advance(2); SubstringMatch }
574 else { tokenizer.advance(1); Delim('*') }
575 },
576 b'+' => {
577 if (
578 tokenizer.has_at_least(1)
579 && matches!(tokenizer.byte_at(1), b'0'..=b'9')
580 ) || (
581 tokenizer.has_at_least(2)
582 && tokenizer.byte_at(1) == b'.'
583 && matches!(tokenizer.byte_at(2), b'0'..=b'9')
584 ) {
585 consume_numeric(tokenizer)
586 } else {
587 tokenizer.advance(1);
588 Delim('+')
589 }
590 },
591 b',' => { tokenizer.advance(1); Comma },
592 b'-' => {
593 if (
594 tokenizer.has_at_least(1)
595 && matches!(tokenizer.byte_at(1), b'0'..=b'9')
596 ) || (
597 tokenizer.has_at_least(2)
598 && tokenizer.byte_at(1) == b'.'
599 && matches!(tokenizer.byte_at(2), b'0'..=b'9')
600 ) {
601 consume_numeric(tokenizer)
602 } else if tokenizer.starts_with(b"-->") {
603 tokenizer.advance(3);
604 CDC
605 } else if is_ident_start(tokenizer) {
606 consume_ident_like(tokenizer)
607 } else {
608 tokenizer.advance(1);
609 Delim('-')
610 }
611 },
612 b'.' => {
613 if tokenizer.has_at_least(1)
614 && matches!(tokenizer.byte_at(1), b'0'..=b'9'
615 ) {
616 consume_numeric(tokenizer)
617 } else {
618 tokenizer.advance(1);
619 Delim('.')
620 }
621 }
622 b'/' => {
623 if tokenizer.starts_with(b"/*") {
624 Comment(consume_comment(tokenizer))
625 } else {
626 tokenizer.advance(1);
627 Delim('/')
628 }
629 }
630 b'0'..=b'9' => { consume_numeric(tokenizer) },
631 b':' => { tokenizer.advance(1); Colon },
632 b';' => { tokenizer.advance(1); Semicolon },
633 b'<' => {
634 if tokenizer.starts_with(b"<!--") {
635 tokenizer.advance(4);
636 CDO
637 } else {
638 tokenizer.advance(1);
639 Delim('<')
640 }
641 },
642 b'@' => {
643 tokenizer.advance(1);
644 if is_ident_start(tokenizer) { AtKeyword(consume_name(tokenizer)) }
645 else { Delim('@') }
646 },
647 b'a'..=b'z' | b'A'..=b'Z' | b'_' | b'\0' => { consume_ident_like(tokenizer) },
648 b'[' => { tokenizer.advance(1); SquareBracketBlock },
649 b'\\' => {
650 if !tokenizer.has_newline_at(1) { consume_ident_like(tokenizer) }
651 else { tokenizer.advance(1); Delim('\\') }
652 },
653 b']' => { tokenizer.advance(1); CloseSquareBracket },
654 b'^' => {
655 if tokenizer.starts_with(b"^=") { tokenizer.advance(2); PrefixMatch }
656 else { tokenizer.advance(1); Delim('^') }
657 },
658 b'{' => { tokenizer.advance(1); CurlyBracketBlock },
659 b'|' => {
660 if tokenizer.starts_with(b"|=") { tokenizer.advance(2); DashMatch }
661 else { tokenizer.advance(1); Delim('|') }
662 },
663 b'}' => { tokenizer.advance(1); CloseCurlyBracket },
664 b'~' => {
665 if tokenizer.starts_with(b"~=") { tokenizer.advance(2); IncludeMatch }
666 else { tokenizer.advance(1); Delim('~') }
667 },
668 _ => {
669 if !b.is_ascii() {
670 consume_ident_like(tokenizer)
671 } else {
672 tokenizer.advance(1);
673 Delim(b as char)
674 }
675 },
676 };
677 Ok(token)
678 }
679
consume_whitespace<'a>(tokenizer: &mut Tokenizer<'a>, newline: bool) -> Token<'a>680 fn consume_whitespace<'a>(tokenizer: &mut Tokenizer<'a>, newline: bool) -> Token<'a> {
681 let start_position = tokenizer.position();
682 if newline {
683 tokenizer.consume_newline();
684 } else {
685 tokenizer.advance(1);
686 }
687 while !tokenizer.is_eof() {
688 let b = tokenizer.next_byte_unchecked();
689 match_byte! { b,
690 b' ' | b'\t' => {
691 tokenizer.advance(1);
692 }
693 b'\n' | b'\x0C' | b'\r' => {
694 tokenizer.consume_newline();
695 }
696 _ => {
697 break
698 }
699 }
700 }
701 WhiteSpace(tokenizer.slice_from(start_position))
702 }
703
704 // Check for sourceMappingURL or sourceURL comments and update the
705 // tokenizer appropriately.
check_for_source_map<'a>(tokenizer: &mut Tokenizer<'a>, contents: &'a str)706 fn check_for_source_map<'a>(tokenizer: &mut Tokenizer<'a>, contents: &'a str) {
707 let directive = "# sourceMappingURL=";
708 let directive_old = "@ sourceMappingURL=";
709
710 // If there is a source map directive, extract the URL.
711 if contents.starts_with(directive) || contents.starts_with(directive_old) {
712 let contents = &contents[directive.len()..];
713 tokenizer.source_map_url = contents
714 .split(|c| c == ' ' || c == '\t' || c == '\x0C' || c == '\r' || c == '\n')
715 .next()
716 }
717
718 let directive = "# sourceURL=";
719 let directive_old = "@ sourceURL=";
720
721 // If there is a source map directive, extract the URL.
722 if contents.starts_with(directive) || contents.starts_with(directive_old) {
723 let contents = &contents[directive.len()..];
724 tokenizer.source_url = contents
725 .split(|c| c == ' ' || c == '\t' || c == '\x0C' || c == '\r' || c == '\n')
726 .next()
727 }
728 }
729
consume_comment<'a>(tokenizer: &mut Tokenizer<'a>) -> &'a str730 fn consume_comment<'a>(tokenizer: &mut Tokenizer<'a>) -> &'a str {
731 tokenizer.advance(2); // consume "/*"
732 let start_position = tokenizer.position();
733 while !tokenizer.is_eof() {
734 match_byte! { tokenizer.next_byte_unchecked(),
735 b'*' => {
736 let end_position = tokenizer.position();
737 tokenizer.advance(1);
738 if tokenizer.next_byte() == Some(b'/') {
739 tokenizer.advance(1);
740 let contents = tokenizer.slice(start_position..end_position);
741 check_for_source_map(tokenizer, contents);
742 return contents
743 }
744 }
745 b'\n' | b'\x0C' | b'\r' => {
746 tokenizer.consume_newline();
747 }
748 b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
749 b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
750 _ => {
751 // ASCII or other leading byte.
752 tokenizer.advance(1);
753 }
754 }
755 }
756 let contents = tokenizer.slice_from(start_position);
757 check_for_source_map(tokenizer, contents);
758 contents
759 }
760
consume_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool) -> Token<'a>761 fn consume_string<'a>(tokenizer: &mut Tokenizer<'a>, single_quote: bool) -> Token<'a> {
762 match consume_quoted_string(tokenizer, single_quote) {
763 Ok(value) => QuotedString(value),
764 Err(value) => BadString(value),
765 }
766 }
767
/// Return `Err(contents)` on syntax error (i.e. an unescaped newline),
/// where the payload is the string consumed so far (used for `BadString`).
///
/// Works in two phases: first a fast path that merely advances and later
/// borrows a slice of the input, then — once an escape or NUL forces the
/// text to differ from the input — an owned byte buffer that is extended
/// code point by code point.
fn consume_quoted_string<'a>(
    tokenizer: &mut Tokenizer<'a>,
    single_quote: bool,
) -> Result<CowRcStr<'a>, CowRcStr<'a>> {
    tokenizer.advance(1); // Skip the initial quote
    // start_pos is at code point boundary, after " or '
    let start_pos = tokenizer.position();
    let mut string_bytes;
    // Phase 1: borrow from the input while no escape/NUL has been seen.
    loop {
        if tokenizer.is_eof() {
            return Ok(tokenizer.slice_from(start_pos).into());
        }
        match_byte! { tokenizer.next_byte_unchecked(),
            b'"' => {
                if !single_quote {
                    let value = tokenizer.slice_from(start_pos);
                    tokenizer.advance(1);
                    return Ok(value.into())
                }
                tokenizer.advance(1);
            }
            b'\'' => {
                if single_quote {
                    let value = tokenizer.slice_from(start_pos);
                    tokenizer.advance(1);
                    return Ok(value.into())
                }
                tokenizer.advance(1);
            }
            b'\\' | b'\0' => {
                // * The tokenizer’s input is UTF-8 since it’s `&str`.
                // * start_pos is at a code point boundary
                // * so is the current position (which is before '\\' or '\0')
                //
                // So `string_bytes` is well-formed UTF-8.
                string_bytes = tokenizer.slice_from(start_pos).as_bytes().to_owned();
                break
            }
            b'\n' | b'\r' | b'\x0C' => {
                // Unescaped newline: parse error.
                return Err(tokenizer.slice_from(start_pos).into())
            },
            b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
            b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
            _ => {
                // ASCII or other leading byte.
                tokenizer.advance(1);
            }
        }
    }

    // Phase 2: keep consuming, copying each byte into `string_bytes`.
    while !tokenizer.is_eof() {
        let b = tokenizer.next_byte_unchecked();
        match_byte! { b,
            b'\n' | b'\r' | b'\x0C' => {
                return Err(
                    // string_bytes is well-formed UTF-8, see other comments.
                    unsafe {
                        from_utf8_release_unchecked(string_bytes)
                    }.into()
                );
            }
            b'"' => {
                tokenizer.advance(1);
                if !single_quote {
                    break;
                }
            }
            b'\'' => {
                tokenizer.advance(1);
                if single_quote {
                    break;
                }
            }
            b'\\' => {
                tokenizer.advance(1);
                if !tokenizer.is_eof() {
                    match tokenizer.next_byte_unchecked() {
                        // Escaped newline
                        b'\n' | b'\x0C' | b'\r' => {
                            tokenizer.consume_newline();
                        }
                        // This pushes one well-formed code point
                        _ => consume_escape_and_write(tokenizer, &mut string_bytes)
                    }
                }
                // else: escaped EOF, do nothing.
                continue;
            }
            b'\0' => {
                // NUL is replaced by U+FFFD REPLACEMENT CHARACTER.
                tokenizer.advance(1);
                string_bytes.extend("\u{FFFD}".as_bytes());
                continue;
            }
            b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
            b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
            _ => {
                // ASCII or other leading byte.
                tokenizer.advance(1);
            },
        }

        // If this byte is part of a multi-byte code point,
        // we’ll end up copying the whole code point before this loop does something else.
        string_bytes.push(b);
    }

    Ok(
        // string_bytes is well-formed UTF-8, see other comments.
        unsafe { from_utf8_release_unchecked(string_bytes) }.into(),
    )
}
880
// Returns whether the remaining input would start an identifier:
// an ident code point (ASCII letter, `_`, non-ASCII, or NUL — which is
// replaced by U+FFFD), a `-` followed by such a code point or another `-`,
// or a backslash escape that is not an escaped newline.
#[inline]
fn is_ident_start(tokenizer: &mut Tokenizer) -> bool {
    !tokenizer.is_eof()
        && match_byte! { tokenizer.next_byte_unchecked(),
            b'a'..=b'z' | b'A'..=b'Z' | b'_' | b'\0' => { true },
            b'-' => {
                // `-` starts an ident only if what follows would too.
                tokenizer.has_at_least(1) && match_byte! { tokenizer.byte_at(1),
                    b'a'..=b'z' | b'A'..=b'Z' | b'-' | b'_' | b'\0' => {
                        true
                    }
                    b'\\' => { !tokenizer.has_newline_at(1) }
                    b => { !b.is_ascii() },
                }
            },
            b'\\' => { !tokenizer.has_newline_at(1) },
            b => { !b.is_ascii() },
        }
}
899
consume_ident_like<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a>900 fn consume_ident_like<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
901 let value = consume_name(tokenizer);
902 if !tokenizer.is_eof() && tokenizer.next_byte_unchecked() == b'(' {
903 tokenizer.advance(1);
904 if value.eq_ignore_ascii_case("url") {
905 consume_unquoted_url(tokenizer).unwrap_or(Function(value))
906 } else {
907 tokenizer.see_function(&value);
908 Function(value)
909 }
910 } else {
911 Ident(value)
912 }
913 }
914
consume_name<'a>(tokenizer: &mut Tokenizer<'a>) -> CowRcStr<'a>915 fn consume_name<'a>(tokenizer: &mut Tokenizer<'a>) -> CowRcStr<'a> {
916 // start_pos is the end of the previous token, therefore at a code point boundary
917 let start_pos = tokenizer.position();
918 let mut value_bytes;
919 loop {
920 if tokenizer.is_eof() {
921 return tokenizer.slice_from(start_pos).into();
922 }
923 match_byte! { tokenizer.next_byte_unchecked(),
924 b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | b'-' => { tokenizer.advance(1) },
925 b'\\' | b'\0' => {
926 // * The tokenizer’s input is UTF-8 since it’s `&str`.
927 // * start_pos is at a code point boundary
928 // * so is the current position (which is before '\\' or '\0'
929 //
930 // So `value_bytes` is well-formed UTF-8.
931 value_bytes = tokenizer.slice_from(start_pos).as_bytes().to_owned();
932 break
933 }
934 b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
935 b'\xC0'..=b'\xEF' => { tokenizer.advance(1); }
936 b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
937 _b => {
938 return tokenizer.slice_from(start_pos).into();
939 }
940 }
941 }
942
943 while !tokenizer.is_eof() {
944 let b = tokenizer.next_byte_unchecked();
945 match_byte! { b,
946 b'a'..=b'z' | b'A'..=b'Z' | b'0'..=b'9' | b'_' | b'-' => {
947 tokenizer.advance(1);
948 value_bytes.push(b) // ASCII
949 }
950 b'\\' => {
951 if tokenizer.has_newline_at(1) { break }
952 tokenizer.advance(1);
953 // This pushes one well-formed code point
954 consume_escape_and_write(tokenizer, &mut value_bytes)
955 }
956 b'\0' => {
957 tokenizer.advance(1);
958 value_bytes.extend("\u{FFFD}".as_bytes());
959 },
960 b'\x80'..=b'\xBF' => {
961 // This byte *is* part of a multi-byte code point,
962 // we’ll end up copying the whole code point before this loop does something else.
963 tokenizer.consume_continuation_byte();
964 value_bytes.push(b)
965 }
966 b'\xC0'..=b'\xEF' => {
967 // This byte *is* part of a multi-byte code point,
968 // we’ll end up copying the whole code point before this loop does something else.
969 tokenizer.advance(1);
970 value_bytes.push(b)
971 }
972 b'\xF0'..=b'\xFF' => {
973 tokenizer.consume_4byte_intro();
974 value_bytes.push(b)
975 }
976 _ => {
977 // ASCII
978 break;
979 }
980 }
981 }
982 // string_bytes is well-formed UTF-8, see other comments.
983 unsafe { from_utf8_release_unchecked(value_bytes) }.into()
984 }
985
byte_to_hex_digit(b: u8) -> Option<u32>986 fn byte_to_hex_digit(b: u8) -> Option<u32> {
987 Some(match_byte! { b,
988 b'0' ..= b'9' => { b - b'0' },
989 b'a' ..= b'f' => { b - b'a' + 10 },
990 b'A' ..= b'F' => { b - b'A' + 10 },
991 _ => {
992 return None
993 }
994 } as u32)
995 }
996
byte_to_decimal_digit(b: u8) -> Option<u32>997 fn byte_to_decimal_digit(b: u8) -> Option<u32> {
998 if b >= b'0' && b <= b'9' {
999 Some((b - b'0') as u32)
1000 } else {
1001 None
1002 }
1003 }
1004
consume_numeric<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a>1005 fn consume_numeric<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
1006 // Parse [+-]?\d*(\.\d+)?([eE][+-]?\d+)?
1007 // But this is always called so that there is at least one digit in \d*(\.\d+)?
1008
1009 // Do all the math in f64 so that large numbers overflow to +/-inf
1010 // and i32::{MIN, MAX} are within range.
1011
1012 let (has_sign, sign) = match tokenizer.next_byte_unchecked() {
1013 b'-' => (true, -1.),
1014 b'+' => (true, 1.),
1015 _ => (false, 1.),
1016 };
1017 if has_sign {
1018 tokenizer.advance(1);
1019 }
1020
1021 let mut integral_part: f64 = 0.;
1022 while let Some(digit) = byte_to_decimal_digit(tokenizer.next_byte_unchecked()) {
1023 integral_part = integral_part * 10. + digit as f64;
1024 tokenizer.advance(1);
1025 if tokenizer.is_eof() {
1026 break;
1027 }
1028 }
1029
1030 let mut is_integer = true;
1031
1032 let mut fractional_part: f64 = 0.;
1033 if tokenizer.has_at_least(1)
1034 && tokenizer.next_byte_unchecked() == b'.'
1035 && matches!(tokenizer.byte_at(1), b'0'..=b'9')
1036 {
1037 is_integer = false;
1038 tokenizer.advance(1); // Consume '.'
1039 let mut factor = 0.1;
1040 while let Some(digit) = byte_to_decimal_digit(tokenizer.next_byte_unchecked()) {
1041 fractional_part += digit as f64 * factor;
1042 factor *= 0.1;
1043 tokenizer.advance(1);
1044 if tokenizer.is_eof() {
1045 break;
1046 }
1047 }
1048 }
1049
1050 let mut value = sign * (integral_part + fractional_part);
1051
1052 if tokenizer.has_at_least(1) && matches!(tokenizer.next_byte_unchecked(), b'e' | b'E') {
1053 if matches!(tokenizer.byte_at(1), b'0'..=b'9')
1054 || (tokenizer.has_at_least(2)
1055 && matches!(tokenizer.byte_at(1), b'+' | b'-')
1056 && matches!(tokenizer.byte_at(2), b'0'..=b'9'))
1057 {
1058 is_integer = false;
1059 tokenizer.advance(1);
1060 let (has_sign, sign) = match tokenizer.next_byte_unchecked() {
1061 b'-' => (true, -1.),
1062 b'+' => (true, 1.),
1063 _ => (false, 1.),
1064 };
1065 if has_sign {
1066 tokenizer.advance(1);
1067 }
1068 let mut exponent: f64 = 0.;
1069 while let Some(digit) = byte_to_decimal_digit(tokenizer.next_byte_unchecked()) {
1070 exponent = exponent * 10. + digit as f64;
1071 tokenizer.advance(1);
1072 if tokenizer.is_eof() {
1073 break;
1074 }
1075 }
1076 value *= f64::powf(10., sign * exponent);
1077 }
1078 }
1079
1080 let int_value = if is_integer {
1081 Some(if value >= i32::MAX as f64 {
1082 i32::MAX
1083 } else if value <= i32::MIN as f64 {
1084 i32::MIN
1085 } else {
1086 value as i32
1087 })
1088 } else {
1089 None
1090 };
1091
1092 if !tokenizer.is_eof() && tokenizer.next_byte_unchecked() == b'%' {
1093 tokenizer.advance(1);
1094 return Percentage {
1095 unit_value: (value / 100.) as f32,
1096 int_value: int_value,
1097 has_sign: has_sign,
1098 };
1099 }
1100 let value = value as f32;
1101 if is_ident_start(tokenizer) {
1102 let unit = consume_name(tokenizer);
1103 Dimension {
1104 value: value,
1105 int_value: int_value,
1106 has_sign: has_sign,
1107 unit: unit,
1108 }
1109 } else {
1110 Number {
1111 value: value,
1112 int_value: int_value,
1113 has_sign: has_sign,
1114 }
1115 }
1116 }
1117
/// Convert bytes the tokenizer produced into a `String`, skipping UTF-8
/// validation in release builds.
///
/// # Safety
///
/// `string_bytes` must be well-formed UTF-8. Debug builds re-validate the
/// bytes and panic on a violation, so tokenizer bugs surface loudly there.
#[inline]
unsafe fn from_utf8_release_unchecked(string_bytes: Vec<u8>) -> String {
    if !cfg!(debug_assertions) {
        // Release: trust the caller's invariant and skip the validation pass.
        String::from_utf8_unchecked(string_bytes)
    } else {
        // Debug: pay for a full validation so broken invariants panic here.
        String::from_utf8(string_bytes).unwrap()
    }
}
1126
consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()>1127 fn consume_unquoted_url<'a>(tokenizer: &mut Tokenizer<'a>) -> Result<Token<'a>, ()> {
1128 // This is only called after "url(", so the current position is a code point boundary.
1129 let start_position = tokenizer.position;
1130 let from_start = &tokenizer.input[tokenizer.position..];
1131 let mut newlines = 0;
1132 let mut last_newline = 0;
1133 let mut found_printable_char = false;
1134 let mut iter = from_start.bytes().enumerate();
1135 loop {
1136 let (offset, b) = match iter.next() {
1137 Some(item) => item,
1138 None => {
1139 tokenizer.position = tokenizer.input.len();
1140 break;
1141 }
1142 };
1143 match_byte! { b,
1144 b' ' | b'\t' => {},
1145 b'\n' | b'\x0C' => {
1146 newlines += 1;
1147 last_newline = offset;
1148 }
1149 b'\r' => {
1150 if from_start.as_bytes().get(offset + 1) != Some(&b'\n') {
1151 newlines += 1;
1152 last_newline = offset;
1153 }
1154 }
1155 b'"' | b'\'' => { return Err(()) }, // Do not advance
1156 b')' => {
1157 // Don't use advance, because we may be skipping
1158 // newlines here, and we want to avoid the assert.
1159 tokenizer.position += offset + 1;
1160 break
1161 }
1162 _ => {
1163 // Don't use advance, because we may be skipping
1164 // newlines here, and we want to avoid the assert.
1165 tokenizer.position += offset;
1166 found_printable_char = true;
1167 break
1168 }
1169 }
1170 }
1171
1172 if newlines > 0 {
1173 tokenizer.current_line_number += newlines;
1174 // No need for wrapping_add here, because there's no possible
1175 // way to wrap.
1176 tokenizer.current_line_start_position = start_position + last_newline + 1;
1177 }
1178
1179 if found_printable_char {
1180 // This function only consumed ASCII (whitespace) bytes,
1181 // so the current position is a code point boundary.
1182 return Ok(consume_unquoted_url_internal(tokenizer));
1183 } else {
1184 return Ok(UnquotedUrl("".into()));
1185 }
1186
1187 fn consume_unquoted_url_internal<'a>(tokenizer: &mut Tokenizer<'a>) -> Token<'a> {
1188 // This function is only called with start_pos at a code point boundary.
1189 let start_pos = tokenizer.position();
1190 let mut string_bytes: Vec<u8>;
1191 loop {
1192 if tokenizer.is_eof() {
1193 return UnquotedUrl(tokenizer.slice_from(start_pos).into());
1194 }
1195 match_byte! { tokenizer.next_byte_unchecked(),
1196 b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {
1197 let value = tokenizer.slice_from(start_pos);
1198 return consume_url_end(tokenizer, start_pos, value.into())
1199 }
1200 b')' => {
1201 let value = tokenizer.slice_from(start_pos);
1202 tokenizer.advance(1);
1203 return UnquotedUrl(value.into())
1204 }
1205 b'\x01'..=b'\x08' | b'\x0B' | b'\x0E'..=b'\x1F' | b'\x7F' // non-printable
1206 | b'"' | b'\'' | b'(' => {
1207 tokenizer.advance(1);
1208 return consume_bad_url(tokenizer, start_pos)
1209 },
1210 b'\\' | b'\0' => {
1211 // * The tokenizer’s input is UTF-8 since it’s `&str`.
1212 // * start_pos is at a code point boundary
1213 // * so is the current position (which is before '\\' or '\0'
1214 //
1215 // So `string_bytes` is well-formed UTF-8.
1216 string_bytes = tokenizer.slice_from(start_pos).as_bytes().to_owned();
1217 break
1218 }
1219 b'\x80'..=b'\xBF' => { tokenizer.consume_continuation_byte(); }
1220 b'\xF0'..=b'\xFF' => { tokenizer.consume_4byte_intro(); }
1221 _ => {
1222 // ASCII or other leading byte.
1223 tokenizer.advance(1);
1224 }
1225 }
1226 }
1227 while !tokenizer.is_eof() {
1228 let b = tokenizer.next_byte_unchecked();
1229 match_byte! { b,
1230 b' ' | b'\t' | b'\n' | b'\r' | b'\x0C' => {
1231 // string_bytes is well-formed UTF-8, see other comments.
1232 let string = unsafe { from_utf8_release_unchecked(string_bytes) }.into();
1233 return consume_url_end(tokenizer, start_pos, string)
1234 }
1235 b')' => {
1236 tokenizer.advance(1);
1237 break;
1238 }
1239 b'\x01'..=b'\x08' | b'\x0B' | b'\x0E'..=b'\x1F' | b'\x7F' // non-printable
1240 | b'"' | b'\'' | b'(' => {
1241 tokenizer.advance(1);
1242 return consume_bad_url(tokenizer, start_pos);
1243 }
1244 b'\\' => {
1245 tokenizer.advance(1);
1246 if tokenizer.has_newline_at(0) {
1247 return consume_bad_url(tokenizer, start_pos)
1248 }
1249
1250 // This pushes one well-formed code point to string_bytes
1251 consume_escape_and_write(tokenizer, &mut string_bytes)
1252 },
1253 b'\0' => {
1254 tokenizer.advance(1);
1255 string_bytes.extend("\u{FFFD}".as_bytes());
1256 }
1257 b'\x80'..=b'\xBF' => {
1258 // We’ll end up copying the whole code point
1259 // before this loop does something else.
1260 tokenizer.consume_continuation_byte();
1261 string_bytes.push(b);
1262 }
1263 b'\xF0'..=b'\xFF' => {
1264 // We’ll end up copying the whole code point
1265 // before this loop does something else.
1266 tokenizer.consume_4byte_intro();
1267 string_bytes.push(b);
1268 }
1269 // If this byte is part of a multi-byte code point,
1270 // we’ll end up copying the whole code point before this loop does something else.
1271 b => {
1272 // ASCII or other leading byte.
1273 tokenizer.advance(1);
1274 string_bytes.push(b)
1275 }
1276 }
1277 }
1278 UnquotedUrl(
1279 // string_bytes is well-formed UTF-8, see other comments.
1280 unsafe { from_utf8_release_unchecked(string_bytes) }.into(),
1281 )
1282 }
1283
1284 fn consume_url_end<'a>(
1285 tokenizer: &mut Tokenizer<'a>,
1286 start_pos: SourcePosition,
1287 string: CowRcStr<'a>,
1288 ) -> Token<'a> {
1289 while !tokenizer.is_eof() {
1290 match_byte! { tokenizer.next_byte_unchecked(),
1291 b')' => {
1292 tokenizer.advance(1);
1293 break
1294 }
1295 b' ' | b'\t' => { tokenizer.advance(1); }
1296 b'\n' | b'\x0C' | b'\r' => {
1297 tokenizer.consume_newline();
1298 }
1299 b => {
1300 tokenizer.consume_known_byte(b);
1301 return consume_bad_url(tokenizer, start_pos);
1302 }
1303 }
1304 }
1305 UnquotedUrl(string)
1306 }
1307
1308 fn consume_bad_url<'a>(tokenizer: &mut Tokenizer<'a>, start_pos: SourcePosition) -> Token<'a> {
1309 // Consume up to the closing )
1310 while !tokenizer.is_eof() {
1311 match_byte! { tokenizer.next_byte_unchecked(),
1312 b')' => {
1313 let contents = tokenizer.slice_from(start_pos).into();
1314 tokenizer.advance(1);
1315 return BadUrl(contents)
1316 }
1317 b'\\' => {
1318 tokenizer.advance(1);
1319 if matches!(tokenizer.next_byte(), Some(b')') | Some(b'\\')) {
1320 tokenizer.advance(1); // Skip an escaped ')' or '\'
1321 }
1322 }
1323 b'\n' | b'\x0C' | b'\r' => {
1324 tokenizer.consume_newline();
1325 }
1326 b => {
1327 tokenizer.consume_known_byte(b);
1328 }
1329 }
1330 }
1331 BadUrl(tokenizer.slice_from(start_pos).into())
1332 }
1333 }
1334
1335 // (value, number of digits up to 6)
consume_hex_digits<'a>(tokenizer: &mut Tokenizer<'a>) -> (u32, u32)1336 fn consume_hex_digits<'a>(tokenizer: &mut Tokenizer<'a>) -> (u32, u32) {
1337 let mut value = 0;
1338 let mut digits = 0;
1339 while digits < 6 && !tokenizer.is_eof() {
1340 match byte_to_hex_digit(tokenizer.next_byte_unchecked()) {
1341 Some(digit) => {
1342 value = value * 16 + digit;
1343 digits += 1;
1344 tokenizer.advance(1);
1345 }
1346 None => break,
1347 }
1348 }
1349 (value, digits)
1350 }
1351
1352 // Same constraints as consume_escape except it writes into `bytes` the result
1353 // instead of returning it.
consume_escape_and_write(tokenizer: &mut Tokenizer, bytes: &mut Vec<u8>)1354 fn consume_escape_and_write(tokenizer: &mut Tokenizer, bytes: &mut Vec<u8>) {
1355 bytes.extend(
1356 consume_escape(tokenizer)
1357 .encode_utf8(&mut [0; 4])
1358 .as_bytes(),
1359 )
1360 }
1361
1362 // Assumes that the U+005C REVERSE SOLIDUS (\) has already been consumed
1363 // and that the next input character has already been verified
1364 // to not be a newline.
consume_escape(tokenizer: &mut Tokenizer) -> char1365 fn consume_escape(tokenizer: &mut Tokenizer) -> char {
1366 if tokenizer.is_eof() {
1367 return '\u{FFFD}';
1368 } // Escaped EOF
1369 match_byte! { tokenizer.next_byte_unchecked(),
1370 b'0'..=b'9' | b'A'..=b'F' | b'a'..=b'f' => {
1371 let (c, _) = consume_hex_digits(tokenizer);
1372 if !tokenizer.is_eof() {
1373 match_byte! { tokenizer.next_byte_unchecked(),
1374 b' ' | b'\t' => {
1375 tokenizer.advance(1)
1376 }
1377 b'\n' | b'\x0C' | b'\r' => {
1378 tokenizer.consume_newline();
1379 }
1380 _ => {}
1381 }
1382 }
1383 static REPLACEMENT_CHAR: char = '\u{FFFD}';
1384 if c != 0 {
1385 let c = char::from_u32(c);
1386 c.unwrap_or(REPLACEMENT_CHAR)
1387 } else {
1388 REPLACEMENT_CHAR
1389 }
1390 },
1391 b'\0' => {
1392 tokenizer.advance(1);
1393 '\u{FFFD}'
1394 }
1395 _ => { tokenizer.consume_char() }
1396 }
1397 }
1398