1 // Copyright 2013-2016 The rust-url developers.
2 //
3 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
4 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
5 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
6 // option. This file may not be copied, modified, or distributed
7 // except according to those terms.
8 
9 use std::error::Error;
10 use std::fmt::{self, Formatter, Write};
11 use std::str;
12 
13 use crate::host::{Host, HostInternal};
14 use crate::Url;
15 use form_urlencoded::EncodingOverride;
16 use percent_encoding::{percent_encode, utf8_percent_encode, AsciiSet, CONTROLS};
17 
/// Characters percent-encoded in a URL fragment: everything in `CONTROLS`
/// plus space, `"`, `<`, `>` and `` ` ``.
///
/// <https://url.spec.whatwg.org/#fragment-percent-encode-set>
const FRAGMENT: &AsciiSet = &CONTROLS.add(b' ').add(b'"').add(b'<').add(b'>').add(b'`');

/// Characters percent-encoded in a URL path: the `FRAGMENT` set plus `#`,
/// `?`, `{` and `}`.
///
/// <https://url.spec.whatwg.org/#path-percent-encode-set>
const PATH: &AsciiSet = &FRAGMENT.add(b'#').add(b'?').add(b'{').add(b'}');

/// Characters percent-encoded in the username/password part of a URL: the
/// `PATH` set plus `/`, `:`, `;`, `=`, `@`, `[`, `\`, `]`, `^` and `|`.
///
/// <https://url.spec.whatwg.org/#userinfo-percent-encode-set>
pub(crate) const USERINFO: &AsciiSet = &PATH
    .add(b'/')
    .add(b':')
    .add(b';')
    .add(b'=')
    .add(b'@')
    .add(b'[')
    .add(b'\\')
    .add(b']')
    .add(b'^')
    .add(b'|');

/// The `PATH` set extended with `/` and `%`, so that neither a segment
/// separator nor a percent sign passes through unencoded.
pub(crate) const PATH_SEGMENT: &AsciiSet = &PATH.add(b'/').add(b'%');

// The backslash (\) character is treated as a path separator in special URLs
// so it needs to be additionally escaped in that case.
pub(crate) const SPECIAL_PATH_SEGMENT: &AsciiSet = &PATH_SEGMENT.add(b'\\');

// https://url.spec.whatwg.org/#query-state
// `QUERY` is `CONTROLS` plus space, `"`, `#`, `<` and `>`; `SPECIAL_QUERY`
// additionally contains the single quote (`'`).
const QUERY: &AsciiSet = &CONTROLS.add(b' ').add(b'"').add(b'#').add(b'<').add(b'>');
const SPECIAL_QUERY: &AsciiSet = &QUERY.add(b'\'');

/// Result type used throughout the parser: `Ok(T)` or a [`ParseError`].
pub type ParseResult<T> = Result<T, ParseError>;
48 
// Generates the `ParseError` enum together with its `Display` impl from a
// list of `Variant => "message",` pairs.
macro_rules! simple_enum_error {
    ($($variant: ident => $message: expr,)+) => {
        /// Errors that can occur during parsing.
        ///
        /// This may be extended in the future so exhaustive matching is
        /// discouraged with an unused variant.
        #[allow(clippy::manual_non_exhaustive)] // introduced in 1.40, MSRV is 1.36
        #[derive(PartialEq, Eq, Clone, Copy, Debug)]
        pub enum ParseError {
            $(
                $variant,
            )+
            /// Unused variant that enables non-exhaustive matching
            #[doc(hidden)]
            __FutureProof,
        }

        impl fmt::Display for ParseError {
            fn fmt(&self, fmt: &mut Formatter<'_>) -> fmt::Result {
                // Pick the message first, then emit it in a single place.
                let message: &str = match self {
                    $(
                        Self::$variant => $message,
                    )+
                    Self::__FutureProof => unreachable!("Don't abuse the FutureProof!"),
                };
                fmt.write_str(message)
            }
        }
    }
}
80 
// `ParseError` is declared by the `simple_enum_error!` invocation below, which
// also supplies its `Display` impl; `Error` needs no methods beyond that.
impl Error for ParseError {}

// Each entry is `Variant => "text written by Display"`.
simple_enum_error! {
    EmptyHost => "empty host",
    IdnaError => "invalid international domain name",
    InvalidPort => "invalid port number",
    InvalidIpv4Address => "invalid IPv4 address",
    InvalidIpv6Address => "invalid IPv6 address",
    InvalidDomainCharacter => "invalid domain character",
    RelativeUrlWithoutBase => "relative URL without a base",
    RelativeUrlWithCannotBeABaseBase => "relative URL with a cannot-be-a-base base",
    SetHostOnCannotBeABaseUrl => "a cannot-be-a-base URL doesn’t have a host to set",
    Overflow => "URLs more than 4 GB are not supported",
}
95 
96 impl From<::idna::Errors> for ParseError {
from(_: ::idna::Errors) -> ParseError97     fn from(_: ::idna::Errors) -> ParseError {
98         ParseError::IdnaError
99     }
100 }
101 
// Generates the `SyntaxViolation` enum and its `description()` accessor from a
// list of `Variant => "description",` pairs.
macro_rules! syntax_violation_enum {
    ($($variant: ident => $text: expr,)+) => {
        /// Non-fatal syntax violations that can occur during parsing.
        ///
        /// This may be extended in the future so exhaustive matching is
        /// discouraged with an unused variant.
        #[allow(clippy::manual_non_exhaustive)] // introduced in 1.40, MSRV is 1.36
        #[derive(PartialEq, Eq, Clone, Copy, Debug)]
        pub enum SyntaxViolation {
            $(
                $variant,
            )+
            /// Unused variant that enables non-exhaustive matching
            #[doc(hidden)]
            __FutureProof,
        }

        impl SyntaxViolation {
            /// Returns the static, human-readable text for this violation.
            pub fn description(&self) -> &'static str {
                match self {
                    $(
                        Self::$variant => $text,
                    )+
                    Self::__FutureProof => unreachable!("Don't abuse the FutureProof!"),
                }
            }
        }
    }
}
133 
// Declares `SyntaxViolation`; each entry is
// `Variant => "text returned by description()"`.
syntax_violation_enum! {
    Backslash => "backslash",
    C0SpaceIgnored =>
        "leading or trailing control or space character are ignored in URLs",
    EmbeddedCredentials =>
        "embedding authentication information (username or password) \
         in an URL is not recommended",
    ExpectedDoubleSlash => "expected //",
    ExpectedFileDoubleSlash => "expected // after file:",
    FileWithHostAndWindowsDrive => "file: with host and Windows drive letter",
    NonUrlCodePoint => "non-URL code point",
    NullInFragment => "NULL characters are ignored in URL fragment identifiers",
    PercentDecode => "expected 2 hex digits after %",
    TabOrNewlineIgnored => "tabs or newlines are ignored in URLs",
    UnencodedAtSign => "unencoded @ sign in username or password",
}
150 
151 impl fmt::Display for SyntaxViolation {
fmt(&self, f: &mut Formatter<'_>) -> fmt::Result152     fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
153         fmt::Display::fmt(self.description(), f)
154     }
155 }
156 
/// Coarse classification of a URL scheme.
#[derive(Copy, Clone, PartialEq)]
pub enum SchemeType {
    /// The `file` scheme.
    File,
    /// `http`, `https`, `ws`, `wss` or `ftp`.
    SpecialNotFile,
    /// Any other scheme.
    NotSpecial,
}

impl SchemeType {
    /// Returns `true` for every variant except `NotSpecial`.
    pub fn is_special(&self) -> bool {
        match self {
            SchemeType::File | SchemeType::SpecialNotFile => true,
            SchemeType::NotSpecial => false,
        }
    }

    /// Returns `true` only for `File`.
    pub fn is_file(&self) -> bool {
        *self == SchemeType::File
    }

    /// Classifies a scheme string. The comparison is exact, so the caller is
    /// expected to pass an already-lowercased scheme.
    pub fn from(s: &str) -> Self {
        const SPECIAL_NOT_FILE: [&str; 5] = ["http", "https", "ws", "wss", "ftp"];

        if s == "file" {
            SchemeType::File
        } else if SPECIAL_NOT_FILE.contains(&s) {
            SchemeType::SpecialNotFile
        } else {
            SchemeType::NotSpecial
        }
    }
}
181 
/// Returns the default port of `scheme`, or `None` when the scheme has none.
///
/// Only `http`/`ws` (80), `https`/`wss` (443) and `ftp` (21) are known; the
/// comparison is exact (case-sensitive).
pub fn default_port(scheme: &str) -> Option<u16> {
    let port = match scheme {
        "ftp" => 21,
        "http" | "ws" => 80,
        "https" | "wss" => 443,
        _ => return None,
    };
    Some(port)
}
190 
191 #[derive(Clone)]
192 pub struct Input<'i> {
193     chars: str::Chars<'i>,
194 }
195 
196 impl<'i> Input<'i> {
new(input: &'i str) -> Self197     pub fn new(input: &'i str) -> Self {
198         Input::with_log(input, None)
199     }
200 
no_trim(input: &'i str) -> Self201     pub fn no_trim(input: &'i str) -> Self {
202         Input {
203             chars: input.chars(),
204         }
205     }
206 
trim_tab_and_newlines( original_input: &'i str, vfn: Option<&dyn Fn(SyntaxViolation)>, ) -> Self207     pub fn trim_tab_and_newlines(
208         original_input: &'i str,
209         vfn: Option<&dyn Fn(SyntaxViolation)>,
210     ) -> Self {
211         let input = original_input.trim_matches(ascii_tab_or_new_line);
212         if let Some(vfn) = vfn {
213             if input.len() < original_input.len() {
214                 vfn(SyntaxViolation::C0SpaceIgnored)
215             }
216             if input.chars().any(|c| matches!(c, '\t' | '\n' | '\r')) {
217                 vfn(SyntaxViolation::TabOrNewlineIgnored)
218             }
219         }
220         Input {
221             chars: input.chars(),
222         }
223     }
224 
with_log(original_input: &'i str, vfn: Option<&dyn Fn(SyntaxViolation)>) -> Self225     pub fn with_log(original_input: &'i str, vfn: Option<&dyn Fn(SyntaxViolation)>) -> Self {
226         let input = original_input.trim_matches(c0_control_or_space);
227         if let Some(vfn) = vfn {
228             if input.len() < original_input.len() {
229                 vfn(SyntaxViolation::C0SpaceIgnored)
230             }
231             if input.chars().any(|c| matches!(c, '\t' | '\n' | '\r')) {
232                 vfn(SyntaxViolation::TabOrNewlineIgnored)
233             }
234         }
235         Input {
236             chars: input.chars(),
237         }
238     }
239 
240     #[inline]
is_empty(&self) -> bool241     pub fn is_empty(&self) -> bool {
242         self.clone().next().is_none()
243     }
244 
245     #[inline]
starts_with<P: Pattern>(&self, p: P) -> bool246     fn starts_with<P: Pattern>(&self, p: P) -> bool {
247         p.split_prefix(&mut self.clone())
248     }
249 
250     #[inline]
split_prefix<P: Pattern>(&self, p: P) -> Option<Self>251     pub fn split_prefix<P: Pattern>(&self, p: P) -> Option<Self> {
252         let mut remaining = self.clone();
253         if p.split_prefix(&mut remaining) {
254             Some(remaining)
255         } else {
256             None
257         }
258     }
259 
260     #[inline]
split_first(&self) -> (Option<char>, Self)261     fn split_first(&self) -> (Option<char>, Self) {
262         let mut remaining = self.clone();
263         (remaining.next(), remaining)
264     }
265 
266     #[inline]
count_matching<F: Fn(char) -> bool>(&self, f: F) -> (u32, Self)267     fn count_matching<F: Fn(char) -> bool>(&self, f: F) -> (u32, Self) {
268         let mut count = 0;
269         let mut remaining = self.clone();
270         loop {
271             let mut input = remaining.clone();
272             if matches!(input.next(), Some(c) if f(c)) {
273                 remaining = input;
274                 count += 1;
275             } else {
276                 return (count, remaining);
277             }
278         }
279     }
280 
281     #[inline]
next_utf8(&mut self) -> Option<(char, &'i str)>282     fn next_utf8(&mut self) -> Option<(char, &'i str)> {
283         loop {
284             let utf8 = self.chars.as_str();
285             match self.chars.next() {
286                 Some(c) => {
287                     if !matches!(c, '\t' | '\n' | '\r') {
288                         return Some((c, &utf8[..c.len_utf8()]));
289                     }
290                 }
291                 None => return None,
292             }
293         }
294     }
295 }
296 
297 pub trait Pattern {
split_prefix(self, input: &mut Input) -> bool298     fn split_prefix(self, input: &mut Input) -> bool;
299 }
300 
301 impl Pattern for char {
split_prefix(self, input: &mut Input) -> bool302     fn split_prefix(self, input: &mut Input) -> bool {
303         input.next() == Some(self)
304     }
305 }
306 
307 impl<'a> Pattern for &'a str {
split_prefix(self, input: &mut Input) -> bool308     fn split_prefix(self, input: &mut Input) -> bool {
309         for c in self.chars() {
310             if input.next() != Some(c) {
311                 return false;
312             }
313         }
314         true
315     }
316 }
317 
318 impl<F: FnMut(char) -> bool> Pattern for F {
split_prefix(self, input: &mut Input) -> bool319     fn split_prefix(self, input: &mut Input) -> bool {
320         input.next().map_or(false, self)
321     }
322 }
323 
324 impl<'i> Iterator for Input<'i> {
325     type Item = char;
next(&mut self) -> Option<char>326     fn next(&mut self) -> Option<char> {
327         self.chars
328             .by_ref()
329             .find(|&c| !matches!(c, '\t' | '\n' | '\r'))
330     }
331 }
332 
/// State carried through one URL parse.
pub struct Parser<'a> {
    /// Buffer the parsed URL text is written into (for example,
    /// `parse_scheme` pushes the lowercased scheme here).
    pub serialization: String,
    /// Base URL that `parse_url` falls back to when the input has no scheme.
    pub base_url: Option<&'a Url>,
    /// Encoding override; presumably used when encoding the query string
    /// (its use is not visible in this part of the file).
    pub query_encoding_override: EncodingOverride<'a>,
    /// Optional callback that receives each non-fatal `SyntaxViolation`.
    pub violation_fn: Option<&'a dyn Fn(SyntaxViolation)>,
    /// What kind of caller is driving the parser; see [`Context`].
    pub context: Context,
}
340 
/// Identifies what is driving a [`Parser`].
#[derive(PartialEq, Eq, Copy, Clone)]
pub enum Context {
    /// Presumably a full URL parse (NOTE(review): no constructor using this
    /// variant is visible here — confirm).
    UrlParser,
    /// Set by `Parser::for_setter`. In this context `parse_scheme` accepts
    /// input that ends before a `:` is seen.
    Setter,
    /// Presumably used by path-segment setters (NOTE(review): not referenced
    /// in the visible part of the file — confirm).
    PathSegmentSetter,
}
347 
348 impl<'a> Parser<'a> {
log_violation(&self, v: SyntaxViolation)349     fn log_violation(&self, v: SyntaxViolation) {
350         if let Some(f) = self.violation_fn {
351             f(v)
352         }
353     }
354 
log_violation_if(&self, v: SyntaxViolation, test: impl FnOnce() -> bool)355     fn log_violation_if(&self, v: SyntaxViolation, test: impl FnOnce() -> bool) {
356         if let Some(f) = self.violation_fn {
357             if test() {
358                 f(v)
359             }
360         }
361     }
362 
for_setter(serialization: String) -> Parser<'a>363     pub fn for_setter(serialization: String) -> Parser<'a> {
364         Parser {
365             serialization,
366             base_url: None,
367             query_encoding_override: None,
368             violation_fn: None,
369             context: Context::Setter,
370         }
371     }
372 
373     /// https://url.spec.whatwg.org/#concept-basic-url-parser
parse_url(mut self, input: &str) -> ParseResult<Url>374     pub fn parse_url(mut self, input: &str) -> ParseResult<Url> {
375         let input = Input::with_log(input, self.violation_fn);
376         if let Ok(remaining) = self.parse_scheme(input.clone()) {
377             return self.parse_with_scheme(remaining);
378         }
379 
380         // No-scheme state
381         if let Some(base_url) = self.base_url {
382             if input.starts_with('#') {
383                 self.fragment_only(base_url, input)
384             } else if base_url.cannot_be_a_base() {
385                 Err(ParseError::RelativeUrlWithCannotBeABaseBase)
386             } else {
387                 let scheme_type = SchemeType::from(base_url.scheme());
388                 if scheme_type.is_file() {
389                     self.parse_file(input, scheme_type, Some(base_url))
390                 } else {
391                     self.parse_relative(input, scheme_type, base_url)
392                 }
393             }
394         } else {
395             Err(ParseError::RelativeUrlWithoutBase)
396         }
397     }
398 
parse_scheme<'i>(&mut self, mut input: Input<'i>) -> Result<Input<'i>, ()>399     pub fn parse_scheme<'i>(&mut self, mut input: Input<'i>) -> Result<Input<'i>, ()> {
400         if input.is_empty() || !input.starts_with(ascii_alpha) {
401             return Err(());
402         }
403         debug_assert!(self.serialization.is_empty());
404         while let Some(c) = input.next() {
405             match c {
406                 'a'..='z' | 'A'..='Z' | '0'..='9' | '+' | '-' | '.' => {
407                     self.serialization.push(c.to_ascii_lowercase())
408                 }
409                 ':' => return Ok(input),
410                 _ => {
411                     self.serialization.clear();
412                     return Err(());
413                 }
414             }
415         }
416         // EOF before ':'
417         if self.context == Context::Setter {
418             Ok(input)
419         } else {
420             self.serialization.clear();
421             Err(())
422         }
423     }
424 
parse_with_scheme(mut self, input: Input<'_>) -> ParseResult<Url>425     fn parse_with_scheme(mut self, input: Input<'_>) -> ParseResult<Url> {
426         use crate::SyntaxViolation::{ExpectedDoubleSlash, ExpectedFileDoubleSlash};
427         let scheme_end = to_u32(self.serialization.len())?;
428         let scheme_type = SchemeType::from(&self.serialization);
429         self.serialization.push(':');
430         match scheme_type {
431             SchemeType::File => {
432                 self.log_violation_if(ExpectedFileDoubleSlash, || !input.starts_with("//"));
433                 let base_file_url = self.base_url.and_then(|base| {
434                     if base.scheme() == "file" {
435                         Some(base)
436                     } else {
437                         None
438                     }
439                 });
440                 self.serialization.clear();
441                 self.parse_file(input, scheme_type, base_file_url)
442             }
443             SchemeType::SpecialNotFile => {
444                 // special relative or authority state
445                 let (slashes_count, remaining) = input.count_matching(|c| matches!(c, '/' | '\\'));
446                 if let Some(base_url) = self.base_url {
447                     if slashes_count < 2
448                         && base_url.scheme() == &self.serialization[..scheme_end as usize]
449                     {
450                         // "Cannot-be-a-base" URLs only happen with "not special" schemes.
451                         debug_assert!(!base_url.cannot_be_a_base());
452                         self.serialization.clear();
453                         return self.parse_relative(input, scheme_type, base_url);
454                     }
455                 }
456                 // special authority slashes state
457                 self.log_violation_if(ExpectedDoubleSlash, || {
458                     input
459                         .clone()
460                         .take_while(|&c| matches!(c, '/' | '\\'))
461                         .collect::<String>()
462                         != "//"
463                 });
464                 self.after_double_slash(remaining, scheme_type, scheme_end)
465             }
466             SchemeType::NotSpecial => self.parse_non_special(input, scheme_type, scheme_end),
467         }
468     }
469 
470     /// Scheme other than file, http, https, ws, ws, ftp.
parse_non_special( mut self, input: Input<'_>, scheme_type: SchemeType, scheme_end: u32, ) -> ParseResult<Url>471     fn parse_non_special(
472         mut self,
473         input: Input<'_>,
474         scheme_type: SchemeType,
475         scheme_end: u32,
476     ) -> ParseResult<Url> {
477         // path or authority state (
478         if let Some(input) = input.split_prefix("//") {
479             return self.after_double_slash(input, scheme_type, scheme_end);
480         }
481         // Anarchist URL (no authority)
482         let path_start = to_u32(self.serialization.len())?;
483         let username_end = path_start;
484         let host_start = path_start;
485         let host_end = path_start;
486         let host = HostInternal::None;
487         let port = None;
488         let remaining = if let Some(input) = input.split_prefix('/') {
489             let path_start = self.serialization.len();
490             self.serialization.push('/');
491             self.parse_path(scheme_type, &mut false, path_start, input)
492         } else {
493             self.parse_cannot_be_a_base_path(input)
494         };
495         self.with_query_and_fragment(
496             scheme_type,
497             scheme_end,
498             username_end,
499             host_start,
500             host_end,
501             host,
502             port,
503             path_start,
504             remaining,
505         )
506     }
507 
    /// Parses what follows the scheme of a `file:` URL, or a scheme-less
    /// reference resolved against a `file:` base.
    ///
    /// `self.serialization` must be empty on entry; the whole URL, starting
    /// with `file://`, is rebuilt here. `base_file_url` is consulted only
    /// when the input does not begin with two slashes.
    ///
    /// # Errors
    ///
    /// Propagates errors from `parse_file_host`, `to_u32`,
    /// `parse_query_and_fragment`, `fragment_only` and
    /// `with_query_and_fragment`.
    fn parse_file(
        mut self,
        input: Input<'_>,
        scheme_type: SchemeType,
        base_file_url: Option<&Url>,
    ) -> ParseResult<Url> {
        use crate::SyntaxViolation::Backslash;
        // file state
        debug_assert!(self.serialization.is_empty());
        let (first_char, input_after_first_char) = input.split_first();
        if matches!(first_char, Some('/') | Some('\\')) {
            self.log_violation_if(SyntaxViolation::Backslash, || first_char == Some('\\'));
            // file slash state
            let (next_char, input_after_next_char) = input_after_first_char.split_first();
            if matches!(next_char, Some('/') | Some('\\')) {
                self.log_violation_if(Backslash, || next_char == Some('\\'));
                // file host state
                self.serialization.push_str("file://");
                let scheme_end = "file".len() as u32;
                let host_start = "file://".len() as u32;
                let (path_start, mut host, remaining) =
                    self.parse_file_host(input_after_next_char)?;
                let mut host_end = to_u32(self.serialization.len())?;
                let mut has_host = !matches!(host, HostInternal::None);
                // `path_start` is the flag returned by `parse_file_host`; it
                // selects between the two path entry points below.
                let remaining = if path_start {
                    self.parse_path_start(SchemeType::File, &mut has_host, remaining)
                } else {
                    let path_start = self.serialization.len();
                    self.serialization.push('/');
                    self.parse_path(SchemeType::File, &mut has_host, path_start, remaining)
                };

                // For file URLs that have a host and whose path starts
                // with the windows drive letter we just remove the host.
                if !has_host {
                    self.serialization
                        .drain(host_start as usize..host_end as usize);
                    host_end = host_start;
                    host = HostInternal::None;
                }
                let (query_start, fragment_start) =
                    self.parse_query_and_fragment(scheme_type, scheme_end, remaining)?;
                return Ok(Url {
                    serialization: self.serialization,
                    scheme_end,
                    username_end: host_start,
                    host_start,
                    host_end,
                    host,
                    port: None,
                    path_start: host_end,
                    query_start,
                    fragment_start,
                });
            } else {
                // Exactly one leading slash: no host is given in the input.
                self.serialization.push_str("file://");
                let scheme_end = "file".len() as u32;
                let host_start = "file://".len();
                let mut host_end = host_start;
                let mut host = HostInternal::None;
                if !starts_with_windows_drive_letter_segment(&input_after_first_char) {
                    if let Some(base_url) = base_file_url {
                        // Carry over the base's drive letter if it has one,
                        // otherwise the base's host.
                        let first_segment = base_url.path_segments().unwrap().next().unwrap();
                        if is_normalized_windows_drive_letter(first_segment) {
                            self.serialization.push('/');
                            self.serialization.push_str(first_segment);
                        } else if let Some(host_str) = base_url.host_str() {
                            self.serialization.push_str(host_str);
                            host_end = self.serialization.len();
                            host = base_url.host;
                        }
                    }
                }
                // If c is the EOF code point, U+002F (/), U+005C (\), U+003F (?), or U+0023 (#), then decrease pointer by one
                let parse_path_input = if let Some(c) = first_char {
                    if c == '/' || c == '\\' || c == '?' || c == '#' {
                        input
                    } else {
                        input_after_first_char
                    }
                } else {
                    input_after_first_char
                };

                let remaining =
                    self.parse_path(SchemeType::File, &mut false, host_end, parse_path_input);

                let host_start = host_start as u32;

                let (query_start, fragment_start) =
                    self.parse_query_and_fragment(scheme_type, scheme_end, remaining)?;

                let host_end = host_end as u32;
                return Ok(Url {
                    serialization: self.serialization,
                    scheme_end,
                    username_end: host_start,
                    host_start,
                    host_end,
                    host,
                    port: None,
                    path_start: host_end,
                    query_start,
                    fragment_start,
                });
            }
        }
        // From here on the input does not start with a slash or backslash.
        if let Some(base_url) = base_file_url {
            match first_char {
                None => {
                    // Copy everything except the fragment
                    let before_fragment = match base_url.fragment_start {
                        Some(i) => &base_url.serialization[..i as usize],
                        None => &*base_url.serialization,
                    };
                    self.serialization.push_str(before_fragment);
                    Ok(Url {
                        serialization: self.serialization,
                        fragment_start: None,
                        ..*base_url
                    })
                }
                Some('?') => {
                    // Copy everything up to the query string
                    let before_query = match (base_url.query_start, base_url.fragment_start) {
                        (None, None) => &*base_url.serialization,
                        (Some(i), _) | (None, Some(i)) => base_url.slice(..i),
                    };
                    self.serialization.push_str(before_query);
                    let (query_start, fragment_start) =
                        self.parse_query_and_fragment(scheme_type, base_url.scheme_end, input)?;
                    Ok(Url {
                        serialization: self.serialization,
                        query_start,
                        fragment_start,
                        ..*base_url
                    })
                }
                Some('#') => self.fragment_only(base_url, input),
                _ => {
                    if !starts_with_windows_drive_letter_segment(&input) {
                        // Relative path: start from the base up to its query,
                        // shorten the base path, then parse the input as a
                        // path from the base's path start.
                        let before_query = match (base_url.query_start, base_url.fragment_start) {
                            (None, None) => &*base_url.serialization,
                            (Some(i), _) | (None, Some(i)) => base_url.slice(..i),
                        };
                        self.serialization.push_str(before_query);
                        self.shorten_path(SchemeType::File, base_url.path_start as usize);
                        let remaining = self.parse_path(
                            SchemeType::File,
                            &mut true,
                            base_url.path_start as usize,
                            input,
                        );
                        self.with_query_and_fragment(
                            SchemeType::File,
                            base_url.scheme_end,
                            base_url.username_end,
                            base_url.host_start,
                            base_url.host_end,
                            base_url.host,
                            base_url.port,
                            base_url.path_start,
                            remaining,
                        )
                    } else {
                        // The input starts with a Windows drive letter: the
                        // base is ignored and the result has no host.
                        self.serialization.push_str("file:///");
                        let scheme_end = "file".len() as u32;
                        let path_start = "file://".len();
                        let remaining =
                            self.parse_path(SchemeType::File, &mut false, path_start, input);
                        let (query_start, fragment_start) =
                            self.parse_query_and_fragment(SchemeType::File, scheme_end, remaining)?;
                        let path_start = path_start as u32;
                        Ok(Url {
                            serialization: self.serialization,
                            scheme_end,
                            username_end: path_start,
                            host_start: path_start,
                            host_end: path_start,
                            host: HostInternal::None,
                            port: None,
                            path_start,
                            query_start,
                            fragment_start,
                        })
                    }
                }
            }
        } else {
            // No usable base: the input is parsed as a path under `file:///`.
            self.serialization.push_str("file:///");
            let scheme_end = "file".len() as u32;
            let path_start = "file://".len();
            let remaining = self.parse_path(SchemeType::File, &mut false, path_start, input);
            let (query_start, fragment_start) =
                self.parse_query_and_fragment(SchemeType::File, scheme_end, remaining)?;
            let path_start = path_start as u32;
            Ok(Url {
                serialization: self.serialization,
                scheme_end,
                username_end: path_start,
                host_start: path_start,
                host_end: path_start,
                host: HostInternal::None,
                port: None,
                path_start,
                query_start,
                fragment_start,
            })
        }
    }
718 
    /// Resolves `input` as a relative reference against `base_url` and returns
    /// the resulting `Url` (the "relative state" of the URL parsing algorithm).
    ///
    /// `self.serialization` must be empty on entry; it is filled with the parts
    /// copied from `base_url` followed by whatever is parsed from `input`.
    /// The first character of `input` selects the behavior:
    ///
    /// * end of input: the base URL without its fragment;
    /// * `?`: the base URL up to (excluding) its query, then the new query and fragment;
    /// * `#`: delegated to `fragment_only`;
    /// * `/` or `\`: with two or more leading slashes, only the base scheme is kept
    ///   and parsing continues in `after_double_slash`; with a single one, the
    ///   base is kept up to its path start and `input` supplies a new path;
    /// * anything else: the base is kept up to its query, its last path segment is
    ///   popped, and `input` is parsed as the continuation of the path.
    ///
    /// Errors from the called sub-parsers are propagated with `?`.
    fn parse_relative(
        mut self,
        input: Input<'_>,
        scheme_type: SchemeType,
        base_url: &Url,
    ) -> ParseResult<Url> {
        // relative state
        debug_assert!(self.serialization.is_empty());
        let (first_char, input_after_first_char) = input.split_first();
        match first_char {
            None => {
                // Copy everything except the fragment
                let before_fragment = match base_url.fragment_start {
                    Some(i) => &base_url.serialization[..i as usize],
                    None => &*base_url.serialization,
                };
                self.serialization.push_str(before_fragment);
                // All component offsets other than the fragment are taken from the base.
                Ok(Url {
                    serialization: self.serialization,
                    fragment_start: None,
                    ..*base_url
                })
            }
            Some('?') => {
                // Copy everything up to the query string
                // (or up to the fragment when the base has no query).
                let before_query = match (base_url.query_start, base_url.fragment_start) {
                    (None, None) => &*base_url.serialization,
                    (Some(i), _) | (None, Some(i)) => base_url.slice(..i),
                };
                self.serialization.push_str(before_query);
                // `input` still starts with '?', which is what
                // `parse_query_and_fragment` expects to consume first.
                let (query_start, fragment_start) =
                    self.parse_query_and_fragment(scheme_type, base_url.scheme_end, input)?;
                Ok(Url {
                    serialization: self.serialization,
                    query_start,
                    fragment_start,
                    ..*base_url
                })
            }
            Some('#') => self.fragment_only(base_url, input),
            Some('/') | Some('\\') => {
                let (slashes_count, remaining) = input.count_matching(|c| matches!(c, '/' | '\\'));
                if slashes_count >= 2 {
                    // Scheme-relative reference: anything other than exactly "//"
                    // (extra slashes or backslashes) is reported as a violation.
                    self.log_violation_if(SyntaxViolation::ExpectedDoubleSlash, || {
                        input
                            .clone()
                            .take_while(|&c| matches!(c, '/' | '\\'))
                            .collect::<String>()
                            != "//"
                    });
                    let scheme_end = base_url.scheme_end;
                    debug_assert!(base_url.byte_at(scheme_end) == b':');
                    // Keep only "scheme:" from the base.
                    self.serialization
                        .push_str(base_url.slice(..scheme_end + 1));
                    // A literal "//" prefix consumes exactly two characters; otherwise
                    // continue after the whole run of slashes/backslashes.
                    if let Some(after_prefix) = input.split_prefix("//") {
                        return self.after_double_slash(after_prefix, scheme_type, scheme_end);
                    }
                    return self.after_double_slash(remaining, scheme_type, scheme_end);
                }
                // Single leading slash: keep scheme and authority from the base
                // and replace the whole path.
                let path_start = base_url.path_start;
                self.serialization.push_str(base_url.slice(..path_start));
                self.serialization.push('/');
                // `&mut true` is a temporary flag: any change `parse_path` makes
                // to it is discarded here.
                let remaining = self.parse_path(
                    scheme_type,
                    &mut true,
                    path_start as usize,
                    input_after_first_char,
                );
                self.with_query_and_fragment(
                    scheme_type,
                    base_url.scheme_end,
                    base_url.username_end,
                    base_url.host_start,
                    base_url.host_end,
                    base_url.host,
                    base_url.port,
                    base_url.path_start,
                    remaining,
                )
            }
            _ => {
                // Path-relative reference: start from the base without its
                // query and fragment.
                let before_query = match (base_url.query_start, base_url.fragment_start) {
                    (None, None) => &*base_url.serialization,
                    (Some(i), _) | (None, Some(i)) => base_url.slice(..i),
                };
                self.serialization.push_str(before_query);
                // FIXME spec says just "remove last entry", not the "pop" algorithm
                self.pop_path(scheme_type, base_url.path_start as usize);
                // A special url always has a path.
                // A path always starts with '/'
                if self.serialization.len() == base_url.path_start as usize
                    && (SchemeType::from(base_url.scheme()).is_special() || !input.is_empty())
                {
                    self.serialization.push('/');
                }
                // Skip one leading '/' of the input, if present, before parsing
                // the rest as path segments.
                let remaining = match input.split_first() {
                    (Some('/'), remaining) => self.parse_path(
                        scheme_type,
                        &mut true,
                        base_url.path_start as usize,
                        remaining,
                    ),
                    _ => {
                        self.parse_path(scheme_type, &mut true, base_url.path_start as usize, input)
                    }
                };
                self.with_query_and_fragment(
                    scheme_type,
                    base_url.scheme_end,
                    base_url.username_end,
                    base_url.host_start,
                    base_url.host_end,
                    base_url.host,
                    base_url.port,
                    base_url.path_start,
                    remaining,
                )
            }
        }
    }
839 
after_double_slash( mut self, input: Input<'_>, scheme_type: SchemeType, scheme_end: u32, ) -> ParseResult<Url>840     fn after_double_slash(
841         mut self,
842         input: Input<'_>,
843         scheme_type: SchemeType,
844         scheme_end: u32,
845     ) -> ParseResult<Url> {
846         self.serialization.push('/');
847         self.serialization.push('/');
848         // authority state
849         let before_authority = self.serialization.len();
850         let (username_end, remaining) = self.parse_userinfo(input, scheme_type)?;
851         let has_authority = before_authority != self.serialization.len();
852         // host state
853         let host_start = to_u32(self.serialization.len())?;
854         let (host_end, host, port, remaining) =
855             self.parse_host_and_port(remaining, scheme_end, scheme_type)?;
856         if host == HostInternal::None && has_authority {
857             return Err(ParseError::EmptyHost);
858         }
859         // path state
860         let path_start = to_u32(self.serialization.len())?;
861         let remaining = self.parse_path_start(scheme_type, &mut true, remaining);
862         self.with_query_and_fragment(
863             scheme_type,
864             scheme_end,
865             username_end,
866             host_start,
867             host_end,
868             host,
869             port,
870             path_start,
871             remaining,
872         )
873     }
874 
    /// Parses the optional `username[:password]@` part of the authority and
    /// appends its percent-encoded form to `self.serialization`.
    ///
    /// Return (username_end, remaining)
    ///
    /// * `username_end` is the serialization offset at which the username ends
    ///   (the current serialization length when there is no userinfo).
    /// * `remaining` is the input after the last `@` found before the end of the
    ///   authority, or the untouched `input` when there is no `@`.
    ///
    /// Returns `ParseError::EmptyHost` when the only `@` is the very first
    /// character and it is directly followed by an authority terminator.
    fn parse_userinfo<'i>(
        &mut self,
        mut input: Input<'i>,
        scheme_type: SchemeType,
    ) -> ParseResult<(u32, Input<'i>)> {
        // First pass: scan a clone of the input up to the end of the authority,
        // remembering the position of (and the input after) the last '@'.
        let mut last_at = None;
        let mut remaining = input.clone();
        let mut char_count = 0;
        while let Some(c) = remaining.next() {
            match c {
                '@' => {
                    // A second '@' means an earlier one belongs to the userinfo
                    // and should have been percent-encoded.
                    if last_at.is_some() {
                        self.log_violation(SyntaxViolation::UnencodedAtSign)
                    } else {
                        self.log_violation(SyntaxViolation::EmbeddedCredentials)
                    }
                    last_at = Some((char_count, remaining.clone()))
                }
                '/' | '?' | '#' => break,
                '\\' if scheme_type.is_special() => break,
                _ => (),
            }
            char_count += 1;
        }
        let (mut userinfo_char_count, remaining) = match last_at {
            // No '@' at all: there is no userinfo and nothing is consumed.
            None => return Ok((to_u32(self.serialization.len())?, input)),
            // The last '@' is the first character: the userinfo is empty.
            Some((0, remaining)) => {
                // Otherwise, if one of the following is true
                // c is the EOF code point, U+002F (/), U+003F (?), or U+0023 (#)
                // url is special and c is U+005C (\)
                // If @ flag is set and buffer is the empty string, validation error, return failure.
                // NOTE: only a present next character is checked here; when the
                // input ends right after the '@' this falls through to the
                // `return Ok(..)` below.
                if let (Some(c), _) = remaining.split_first() {
                    if c == '/' || c == '?' || c == '#' || (scheme_type.is_special() && c == '\\') {
                        return Err(ParseError::EmptyHost);
                    }
                }
                return Ok((to_u32(self.serialization.len())?, remaining));
            }
            Some(x) => x,
        };

        // Second pass: consume exactly the characters counted before the last
        // '@' and serialize them.
        let mut username_end = None;
        let mut has_password = false;
        let mut has_username = false;
        while userinfo_char_count > 0 {
            // The first pass counted this many characters via `next()`, so a
            // character is expected to be available here
            // (assumes `next_utf8` skips the same characters as `next` — TODO confirm).
            let (c, utf8_c) = input.next_utf8().unwrap();
            userinfo_char_count -= 1;
            if c == ':' && username_end.is_none() {
                // Start parsing password
                username_end = Some(to_u32(self.serialization.len())?);
                // We don't add a colon if the password is empty
                if userinfo_char_count > 0 {
                    self.serialization.push(':');
                    has_password = true;
                }
            } else {
                // Any character seen before the password starts counts as username.
                if !has_password {
                    has_username = true;
                }
                self.check_url_code_point(c, &input);
                self.serialization
                    .extend(utf8_percent_encode(utf8_c, USERINFO));
            }
        }
        // Without a ':' the username runs to the end of what was written.
        let username_end = match username_end {
            Some(i) => i,
            None => to_u32(self.serialization.len())?,
        };
        // The '@' separator is only written when some credentials were written.
        if has_username || has_password {
            self.serialization.push('@');
        }
        Ok((username_end, remaining))
    }
949 
parse_host_and_port<'i>( &mut self, input: Input<'i>, scheme_end: u32, scheme_type: SchemeType, ) -> ParseResult<(u32, HostInternal, Option<u16>, Input<'i>)>950     fn parse_host_and_port<'i>(
951         &mut self,
952         input: Input<'i>,
953         scheme_end: u32,
954         scheme_type: SchemeType,
955     ) -> ParseResult<(u32, HostInternal, Option<u16>, Input<'i>)> {
956         let (host, remaining) = Parser::parse_host(input, scheme_type)?;
957         write!(&mut self.serialization, "{}", host).unwrap();
958         let host_end = to_u32(self.serialization.len())?;
959         if let Host::Domain(h) = &host {
960             if h.is_empty() {
961                 // Port with an empty host
962                 if remaining.starts_with(":") {
963                     return Err(ParseError::EmptyHost);
964                 }
965                 if scheme_type.is_special() {
966                     return Err(ParseError::EmptyHost);
967                 }
968             }
969         };
970 
971         let (port, remaining) = if let Some(remaining) = remaining.split_prefix(':') {
972             let scheme = || default_port(&self.serialization[..scheme_end as usize]);
973             Parser::parse_port(remaining, scheme, self.context)?
974         } else {
975             (None, remaining)
976         };
977         if let Some(port) = port {
978             write!(&mut self.serialization, ":{}", port).unwrap()
979         }
980         Ok((host_end, host.into(), port, remaining))
981     }
982 
parse_host( mut input: Input<'_>, scheme_type: SchemeType, ) -> ParseResult<(Host<String>, Input<'_>)>983     pub fn parse_host(
984         mut input: Input<'_>,
985         scheme_type: SchemeType,
986     ) -> ParseResult<(Host<String>, Input<'_>)> {
987         if scheme_type.is_file() {
988             return Parser::get_file_host(input);
989         }
990         // Undo the Input abstraction here to avoid allocating in the common case
991         // where the host part of the input does not contain any tab or newline
992         let input_str = input.chars.as_str();
993         let mut inside_square_brackets = false;
994         let mut has_ignored_chars = false;
995         let mut non_ignored_chars = 0;
996         let mut bytes = 0;
997         for c in input_str.chars() {
998             match c {
999                 ':' if !inside_square_brackets => break,
1000                 '\\' if scheme_type.is_special() => break,
1001                 '/' | '?' | '#' => break,
1002                 '\t' | '\n' | '\r' => {
1003                     has_ignored_chars = true;
1004                 }
1005                 '[' => {
1006                     inside_square_brackets = true;
1007                     non_ignored_chars += 1
1008                 }
1009                 ']' => {
1010                     inside_square_brackets = false;
1011                     non_ignored_chars += 1
1012                 }
1013                 _ => non_ignored_chars += 1,
1014             }
1015             bytes += c.len_utf8();
1016         }
1017         let replaced: String;
1018         let host_str;
1019         {
1020             let host_input = input.by_ref().take(non_ignored_chars);
1021             if has_ignored_chars {
1022                 replaced = host_input.collect();
1023                 host_str = &*replaced
1024             } else {
1025                 for _ in host_input {}
1026                 host_str = &input_str[..bytes]
1027             }
1028         }
1029         if scheme_type == SchemeType::SpecialNotFile && host_str.is_empty() {
1030             return Err(ParseError::EmptyHost);
1031         }
1032         if !scheme_type.is_special() {
1033             let host = Host::parse_opaque(host_str)?;
1034             return Ok((host, input));
1035         }
1036         let host = Host::parse(host_str)?;
1037         Ok((host, input))
1038     }
1039 
get_file_host(input: Input<'_>) -> ParseResult<(Host<String>, Input<'_>)>1040     fn get_file_host(input: Input<'_>) -> ParseResult<(Host<String>, Input<'_>)> {
1041         let (_, host_str, remaining) = Parser::file_host(input)?;
1042         let host = match Host::parse(&host_str)? {
1043             Host::Domain(ref d) if d == "localhost" => Host::Domain("".to_string()),
1044             host => host,
1045         };
1046         Ok((host, remaining))
1047     }
1048 
parse_file_host<'i>( &mut self, input: Input<'i>, ) -> ParseResult<(bool, HostInternal, Input<'i>)>1049     fn parse_file_host<'i>(
1050         &mut self,
1051         input: Input<'i>,
1052     ) -> ParseResult<(bool, HostInternal, Input<'i>)> {
1053         let has_host;
1054         let (_, host_str, remaining) = Parser::file_host(input)?;
1055         let host = if host_str.is_empty() {
1056             has_host = false;
1057             HostInternal::None
1058         } else {
1059             match Host::parse(&host_str)? {
1060                 Host::Domain(ref d) if d == "localhost" => {
1061                     has_host = false;
1062                     HostInternal::None
1063                 }
1064                 host => {
1065                     write!(&mut self.serialization, "{}", host).unwrap();
1066                     has_host = true;
1067                     host.into()
1068                 }
1069             }
1070         };
1071         Ok((has_host, host, remaining))
1072     }
1073 
file_host(input: Input) -> ParseResult<(bool, String, Input)>1074     pub fn file_host(input: Input) -> ParseResult<(bool, String, Input)> {
1075         // Undo the Input abstraction here to avoid allocating in the common case
1076         // where the host part of the input does not contain any tab or newline
1077         let input_str = input.chars.as_str();
1078         let mut has_ignored_chars = false;
1079         let mut non_ignored_chars = 0;
1080         let mut bytes = 0;
1081         for c in input_str.chars() {
1082             match c {
1083                 '/' | '\\' | '?' | '#' => break,
1084                 '\t' | '\n' | '\r' => has_ignored_chars = true,
1085                 _ => non_ignored_chars += 1,
1086             }
1087             bytes += c.len_utf8();
1088         }
1089         let replaced: String;
1090         let host_str;
1091         let mut remaining = input.clone();
1092         {
1093             let host_input = remaining.by_ref().take(non_ignored_chars);
1094             if has_ignored_chars {
1095                 replaced = host_input.collect();
1096                 host_str = &*replaced
1097             } else {
1098                 for _ in host_input {}
1099                 host_str = &input_str[..bytes]
1100             }
1101         }
1102         if is_windows_drive_letter(host_str) {
1103             return Ok((false, "".to_string(), input));
1104         }
1105         Ok((true, host_str.to_string(), remaining))
1106     }
1107 
parse_port<P>( mut input: Input<'_>, default_port: P, context: Context, ) -> ParseResult<(Option<u16>, Input<'_>)> where P: Fn() -> Option<u16>,1108     pub fn parse_port<P>(
1109         mut input: Input<'_>,
1110         default_port: P,
1111         context: Context,
1112     ) -> ParseResult<(Option<u16>, Input<'_>)>
1113     where
1114         P: Fn() -> Option<u16>,
1115     {
1116         let mut port: u32 = 0;
1117         let mut has_any_digit = false;
1118         while let (Some(c), remaining) = input.split_first() {
1119             if let Some(digit) = c.to_digit(10) {
1120                 port = port * 10 + digit;
1121                 if port > ::std::u16::MAX as u32 {
1122                     return Err(ParseError::InvalidPort);
1123                 }
1124                 has_any_digit = true;
1125             } else if context == Context::UrlParser && !matches!(c, '/' | '\\' | '?' | '#') {
1126                 return Err(ParseError::InvalidPort);
1127             } else {
1128                 break;
1129             }
1130             input = remaining;
1131         }
1132         let mut opt_port = Some(port as u16);
1133         if !has_any_digit || opt_port == default_port() {
1134             opt_port = None;
1135         }
1136         Ok((opt_port, input))
1137     }
1138 
parse_path_start<'i>( &mut self, scheme_type: SchemeType, has_host: &mut bool, input: Input<'i>, ) -> Input<'i>1139     pub fn parse_path_start<'i>(
1140         &mut self,
1141         scheme_type: SchemeType,
1142         has_host: &mut bool,
1143         input: Input<'i>,
1144     ) -> Input<'i> {
1145         let path_start = self.serialization.len();
1146         let (maybe_c, remaining) = input.split_first();
1147         // If url is special, then:
1148         if scheme_type.is_special() {
1149             if maybe_c == Some('\\') {
1150                 // If c is U+005C (\), validation error.
1151                 self.log_violation(SyntaxViolation::Backslash);
1152             }
1153             // A special URL always has a non-empty path.
1154             if !self.serialization.ends_with('/') {
1155                 self.serialization.push('/');
1156                 // We have already made sure the forward slash is present.
1157                 if maybe_c == Some('/') || maybe_c == Some('\\') {
1158                     return self.parse_path(scheme_type, has_host, path_start, remaining);
1159                 }
1160             }
1161             return self.parse_path(scheme_type, has_host, path_start, input);
1162         } else if maybe_c == Some('?') || maybe_c == Some('#') {
1163             // Otherwise, if state override is not given and c is U+003F (?),
1164             // set url’s query to the empty string and state to query state.
1165             // Otherwise, if state override is not given and c is U+0023 (#),
1166             // set url’s fragment to the empty string and state to fragment state.
1167             // The query and path states will be handled by the caller.
1168             return input;
1169         }
1170 
1171         if maybe_c != None && maybe_c != Some('/') {
1172             self.serialization.push('/');
1173         }
1174         // Otherwise, if c is not the EOF code point:
1175         self.parse_path(scheme_type, has_host, path_start, input)
1176     }
1177 
    /// Parses path segments from `input`, appending their percent-encoded form
    /// to `self.serialization`, and returns the input left after the path.
    ///
    /// * `path_start` is the offset in `self.serialization` where the path begins.
    /// * `has_host` is set to `false` when, for a `file` scheme, a segment
    ///   matching `is_windows_drive_letter` is encountered while it was `true`.
    ///
    /// Single-dot segments are dropped and double-dot segments shorten the path
    /// (both also in their percent-encoded spellings). In `Context::UrlParser`
    /// parsing stops before `?` or `#`, which stay in the returned input. In
    /// `Context::PathSegmentSetter`, `/` and `\` are not treated as separators
    /// and a stricter percent-encode set is used.
    pub fn parse_path<'i>(
        &mut self,
        scheme_type: SchemeType,
        has_host: &mut bool,
        path_start: usize,
        mut input: Input<'i>,
    ) -> Input<'i> {
        // Relative path state
        // Outer loop: one iteration per path segment.
        loop {
            let segment_start = self.serialization.len();
            let mut ends_with_slash = false;
            // Inner loop: copy the characters of one segment into the
            // serialization until a separator or terminator is found.
            loop {
                let input_before_c = input.clone();
                let (c, utf8_c) = if let Some(x) = input.next_utf8() {
                    x
                } else {
                    break;
                };
                match c {
                    '/' if self.context != Context::PathSegmentSetter => {
                        self.serialization.push(c);
                        ends_with_slash = true;
                        break;
                    }
                    // In special URLs a backslash acts as a path separator and
                    // is normalized to '/'.
                    '\\' if self.context != Context::PathSegmentSetter
                        && scheme_type.is_special() =>
                    {
                        self.log_violation(SyntaxViolation::Backslash);
                        self.serialization.push('/');
                        ends_with_slash = true;
                        break;
                    }
                    '?' | '#' if self.context == Context::UrlParser => {
                        // Un-consume the delimiter so the caller sees it.
                        input = input_before_c;
                        break;
                    }
                    _ => {
                        self.check_url_code_point(c, &input);
                        if self.context == Context::PathSegmentSetter {
                            if scheme_type.is_special() {
                                self.serialization
                                    .extend(utf8_percent_encode(utf8_c, SPECIAL_PATH_SEGMENT));
                            } else {
                                self.serialization
                                    .extend(utf8_percent_encode(utf8_c, PATH_SEGMENT));
                            }
                        } else {
                            self.serialization.extend(utf8_percent_encode(utf8_c, PATH));
                        }
                    }
                }
            }
            // Going from &str to String to &str to please the 1.33.0 borrow checker
            // The segment text just written, without its trailing slash if any.
            let before_slash_string = if ends_with_slash {
                self.serialization[segment_start..self.serialization.len() - 1].to_owned()
            } else {
                self.serialization[segment_start..self.serialization.len()].to_owned()
            };
            let segment_before_slash: &str = &before_slash_string;
            match segment_before_slash {
                // If buffer is a double-dot path segment, shorten url’s path,
                ".." | "%2e%2e" | "%2e%2E" | "%2E%2e" | "%2E%2E" | "%2e." | "%2E." | ".%2e"
                | ".%2E" => {
                    debug_assert!(self.serialization.as_bytes()[segment_start - 1] == b'/');
                    // Drop the ".." segment itself, then the slash before it
                    // (when allowed), then the segment before that.
                    self.serialization.truncate(segment_start);
                    if self.serialization.ends_with('/')
                        && Parser::last_slash_can_be_removed(&self.serialization, path_start)
                    {
                        self.serialization.pop();
                    }
                    self.shorten_path(scheme_type, path_start);

                    // and then if neither c is U+002F (/), nor url is special and c is U+005C (\), append the empty string to url’s path.
                    if ends_with_slash && !self.serialization.ends_with('/') {
                        self.serialization.push('/');
                    }
                }
                // Otherwise, if buffer is a single-dot path segment and if neither c is U+002F (/),
                // nor url is special and c is U+005C (\), append the empty string to url’s path.
                "." | "%2e" | "%2E" => {
                    self.serialization.truncate(segment_start);
                    if !self.serialization.ends_with('/') {
                        self.serialization.push('/');
                    }
                }
                _ => {
                    // If url’s scheme is "file", url’s path is empty, and buffer is a Windows drive letter, then
                    if scheme_type.is_file() && is_windows_drive_letter(segment_before_slash) {
                        // Replace the second code point in buffer with U+003A (:).
                        // The segment is rewritten as "<first char>:" plus the
                        // trailing slash it had, if any.
                        if let Some(c) = segment_before_slash.chars().next() {
                            self.serialization.truncate(segment_start);
                            self.serialization.push(c);
                            self.serialization.push(':');
                            if ends_with_slash {
                                self.serialization.push('/');
                            }
                        }
                        // If url’s host is neither the empty string nor null,
                        // validation error, set url’s host to the empty string.
                        if *has_host {
                            self.log_violation(SyntaxViolation::FileWithHostAndWindowsDrive);
                            *has_host = false; // FIXME account for this in callers
                        }
                    }
                }
            }
            // A segment that did not end with a separator is the last one.
            if !ends_with_slash {
                break;
            }
        }
        if scheme_type.is_file() {
            // while url’s path’s size is greater than 1
            // and url’s path[0] is the empty string,
            // validation error, remove the first item from url’s path.
            //FIXME: log violation
            // Implemented by collapsing all leading slashes of the path into one.
            let path = self.serialization.split_off(path_start);
            self.serialization.push('/');
            self.serialization.push_str(&path.trim_start_matches('/'));
        }

        input
    }
1300 
last_slash_can_be_removed(serialization: &str, path_start: usize) -> bool1301     fn last_slash_can_be_removed(serialization: &str, path_start: usize) -> bool {
1302         let url_before_segment = &serialization[..serialization.len() - 1];
1303         if let Some(segment_before_start) = url_before_segment.rfind('/') {
1304             // Do not remove the root slash
1305             segment_before_start >= path_start
1306                 // Or a windows drive letter slash
1307                 && !path_starts_with_windows_drive_letter(&serialization[segment_before_start..])
1308         } else {
1309             false
1310         }
1311     }
1312 
1313     /// https://url.spec.whatwg.org/#shorten-a-urls-path
shorten_path(&mut self, scheme_type: SchemeType, path_start: usize)1314     fn shorten_path(&mut self, scheme_type: SchemeType, path_start: usize) {
1315         // If path is empty, then return.
1316         if self.serialization.len() == path_start {
1317             return;
1318         }
1319         // If url’s scheme is "file", path’s size is 1, and path[0] is a normalized Windows drive letter, then return.
1320         if scheme_type.is_file()
1321             && is_normalized_windows_drive_letter(&self.serialization[path_start..])
1322         {
1323             return;
1324         }
1325         // Remove path’s last item.
1326         self.pop_path(scheme_type, path_start);
1327     }
1328 
1329     /// https://url.spec.whatwg.org/#pop-a-urls-path
pop_path(&mut self, scheme_type: SchemeType, path_start: usize)1330     fn pop_path(&mut self, scheme_type: SchemeType, path_start: usize) {
1331         if self.serialization.len() > path_start {
1332             let slash_position = self.serialization[path_start..].rfind('/').unwrap();
1333             // + 1 since rfind returns the position before the slash.
1334             let segment_start = path_start + slash_position + 1;
1335             // Don’t pop a Windows drive letter
1336             if !(scheme_type.is_file()
1337                 && is_normalized_windows_drive_letter(&self.serialization[segment_start..]))
1338             {
1339                 self.serialization.truncate(segment_start);
1340             }
1341         }
1342     }
1343 
parse_cannot_be_a_base_path<'i>(&mut self, mut input: Input<'i>) -> Input<'i>1344     pub fn parse_cannot_be_a_base_path<'i>(&mut self, mut input: Input<'i>) -> Input<'i> {
1345         loop {
1346             let input_before_c = input.clone();
1347             match input.next_utf8() {
1348                 Some(('?', _)) | Some(('#', _)) if self.context == Context::UrlParser => {
1349                     return input_before_c
1350                 }
1351                 Some((c, utf8_c)) => {
1352                     self.check_url_code_point(c, &input);
1353                     self.serialization
1354                         .extend(utf8_percent_encode(utf8_c, CONTROLS));
1355                 }
1356                 None => return input,
1357             }
1358         }
1359     }
1360 
1361     #[allow(clippy::too_many_arguments)]
with_query_and_fragment( mut self, scheme_type: SchemeType, scheme_end: u32, username_end: u32, host_start: u32, host_end: u32, host: HostInternal, port: Option<u16>, path_start: u32, remaining: Input<'_>, ) -> ParseResult<Url>1362     fn with_query_and_fragment(
1363         mut self,
1364         scheme_type: SchemeType,
1365         scheme_end: u32,
1366         username_end: u32,
1367         host_start: u32,
1368         host_end: u32,
1369         host: HostInternal,
1370         port: Option<u16>,
1371         path_start: u32,
1372         remaining: Input<'_>,
1373     ) -> ParseResult<Url> {
1374         let (query_start, fragment_start) =
1375             self.parse_query_and_fragment(scheme_type, scheme_end, remaining)?;
1376         Ok(Url {
1377             serialization: self.serialization,
1378             scheme_end,
1379             username_end,
1380             host_start,
1381             host_end,
1382             host,
1383             port,
1384             path_start,
1385             query_start,
1386             fragment_start,
1387         })
1388     }
1389 
1390     /// Return (query_start, fragment_start)
parse_query_and_fragment( &mut self, scheme_type: SchemeType, scheme_end: u32, mut input: Input<'_>, ) -> ParseResult<(Option<u32>, Option<u32>)>1391     fn parse_query_and_fragment(
1392         &mut self,
1393         scheme_type: SchemeType,
1394         scheme_end: u32,
1395         mut input: Input<'_>,
1396     ) -> ParseResult<(Option<u32>, Option<u32>)> {
1397         let mut query_start = None;
1398         match input.next() {
1399             Some('#') => {}
1400             Some('?') => {
1401                 query_start = Some(to_u32(self.serialization.len())?);
1402                 self.serialization.push('?');
1403                 let remaining = self.parse_query(scheme_type, scheme_end, input);
1404                 if let Some(remaining) = remaining {
1405                     input = remaining
1406                 } else {
1407                     return Ok((query_start, None));
1408                 }
1409             }
1410             None => return Ok((None, None)),
1411             _ => panic!("Programming error. parse_query_and_fragment() called without ? or #"),
1412         }
1413 
1414         let fragment_start = to_u32(self.serialization.len())?;
1415         self.serialization.push('#');
1416         self.parse_fragment(input);
1417         Ok((query_start, Some(fragment_start)))
1418     }
1419 
parse_query<'i>( &mut self, scheme_type: SchemeType, scheme_end: u32, mut input: Input<'i>, ) -> Option<Input<'i>>1420     pub fn parse_query<'i>(
1421         &mut self,
1422         scheme_type: SchemeType,
1423         scheme_end: u32,
1424         mut input: Input<'i>,
1425     ) -> Option<Input<'i>> {
1426         let mut query = String::new(); // FIXME: use a streaming decoder instead
1427         let mut remaining = None;
1428         while let Some(c) = input.next() {
1429             if c == '#' && self.context == Context::UrlParser {
1430                 remaining = Some(input);
1431                 break;
1432             } else {
1433                 self.check_url_code_point(c, &input);
1434                 query.push(c);
1435             }
1436         }
1437 
1438         let encoding = match &self.serialization[..scheme_end as usize] {
1439             "http" | "https" | "file" | "ftp" => self.query_encoding_override,
1440             _ => None,
1441         };
1442         let query_bytes = if let Some(o) = encoding {
1443             o(&query)
1444         } else {
1445             query.as_bytes().into()
1446         };
1447         let set = if scheme_type.is_special() {
1448             SPECIAL_QUERY
1449         } else {
1450             QUERY
1451         };
1452         self.serialization.extend(percent_encode(&query_bytes, set));
1453         remaining
1454     }
1455 
fragment_only(mut self, base_url: &Url, mut input: Input<'_>) -> ParseResult<Url>1456     fn fragment_only(mut self, base_url: &Url, mut input: Input<'_>) -> ParseResult<Url> {
1457         let before_fragment = match base_url.fragment_start {
1458             Some(i) => base_url.slice(..i),
1459             None => &*base_url.serialization,
1460         };
1461         debug_assert!(self.serialization.is_empty());
1462         self.serialization
1463             .reserve(before_fragment.len() + input.chars.as_str().len());
1464         self.serialization.push_str(before_fragment);
1465         self.serialization.push('#');
1466         let next = input.next();
1467         debug_assert!(next == Some('#'));
1468         self.parse_fragment(input);
1469         Ok(Url {
1470             serialization: self.serialization,
1471             fragment_start: Some(to_u32(before_fragment.len())?),
1472             ..*base_url
1473         })
1474     }
1475 
parse_fragment(&mut self, mut input: Input<'_>)1476     pub fn parse_fragment(&mut self, mut input: Input<'_>) {
1477         while let Some((c, utf8_c)) = input.next_utf8() {
1478             if c == '\0' {
1479                 self.log_violation(SyntaxViolation::NullInFragment)
1480             } else {
1481                 self.check_url_code_point(c, &input);
1482             }
1483             self.serialization
1484                 .extend(utf8_percent_encode(utf8_c, FRAGMENT));
1485         }
1486     }
1487 
check_url_code_point(&self, c: char, input: &Input<'_>)1488     fn check_url_code_point(&self, c: char, input: &Input<'_>) {
1489         if let Some(vfn) = self.violation_fn {
1490             if c == '%' {
1491                 let mut input = input.clone();
1492                 if !matches!((input.next(), input.next()), (Some(a), Some(b))
1493                              if is_ascii_hex_digit(a) && is_ascii_hex_digit(b))
1494                 {
1495                     vfn(SyntaxViolation::PercentDecode)
1496                 }
1497             } else if !is_url_code_point(c) {
1498                 vfn(SyntaxViolation::NonUrlCodePoint)
1499             }
1500         }
1501     }
1502 }
1503 
/// Returns `true` when `c` is one of `0-9`, `a-f` or `A-F`.
#[inline]
fn is_ascii_hex_digit(c: char) -> bool {
    c.is_ascii_hexdigit()
}
1508 
// Returns `true` when `c` is a URL code point
// (https://url.spec.whatwg.org/#url-code-points): an ASCII alphanumeric, one
// of the listed ASCII punctuation characters, or any code point in
// U+00A0..=U+10FFFD that is neither a surrogate nor a noncharacter.
//
// Non URL code points:
// U+0000 to U+0020 (space)
// " # % < > [ \ ] ^ ` { | }
// U+007F to U+009F
// surrogates
// U+FDD0 to U+FDEF
// Last two of each plane: U+__FFFE to U+__FFFF for __ in 00 to 10 hex
#[inline]
fn is_url_code_point(c: char) -> bool {
    matches!(c,
        'a'..='z' |
        'A'..='Z' |
        '0'..='9' |
        '!' | '$' | '&' | '\'' | '(' | ')' | '*' | '+' | ',' | '-' |
        '.' | '/' | ':' | ';' | '=' | '?' | '@' | '_' | '~' |
        '\u{A0}'..='\u{D7FF}' | '\u{E000}'..='\u{FDCF}' | '\u{FDF0}'..='\u{FFFD}' |
        '\u{10000}'..='\u{1FFFD}' | '\u{20000}'..='\u{2FFFD}' |
        '\u{30000}'..='\u{3FFFD}' | '\u{40000}'..='\u{4FFFD}' |
        '\u{50000}'..='\u{5FFFD}' | '\u{60000}'..='\u{6FFFD}' |
        '\u{70000}'..='\u{7FFFD}' | '\u{80000}'..='\u{8FFFD}' |
        '\u{90000}'..='\u{9FFFD}' | '\u{A0000}'..='\u{AFFFD}' |
        '\u{B0000}'..='\u{BFFFD}' | '\u{C0000}'..='\u{CFFFD}' |
        // Plane 14 is covered from its first code point (U+E0000), like every
        // other supplementary plane: only the trailing U+EFFFE/U+EFFFF
        // noncharacters are excluded.
        '\u{D0000}'..='\u{DFFFD}' | '\u{E0000}'..='\u{EFFFD}' |
        '\u{F0000}'..='\u{FFFFD}' | '\u{100000}'..='\u{10FFFD}')
}
1534 
/// Returns `true` for a C0 control or space, i.e. U+0000 through U+0020.
///
/// https://url.spec.whatwg.org/#c0-controls-and-space
#[inline]
fn c0_control_or_space(ch: char) -> bool {
    matches!(ch, '\u{0}'..='\u{20}')
}
1540 
/// Returns `true` for U+0009 TAB, U+000A LF or U+000D CR.
///
/// https://infra.spec.whatwg.org/#ascii-tab-or-newline
#[inline]
fn ascii_tab_or_new_line(ch: char) -> bool {
    ch == '\t' || ch == '\n' || ch == '\r'
}
1546 
/// Returns `true` when `ch` is an ASCII letter (`a-z` or `A-Z`).
///
/// https://url.spec.whatwg.org/#ascii-alpha
#[inline]
pub fn ascii_alpha(ch: char) -> bool {
    ch.is_ascii_alphabetic()
}
1552 
1553 #[inline]
to_u32(i: usize) -> ParseResult<u32>1554 pub fn to_u32(i: usize) -> ParseResult<u32> {
1555     if i <= ::std::u32::MAX as usize {
1556         Ok(i as u32)
1557     } else {
1558         Err(ParseError::Overflow)
1559     }
1560 }
1561 
is_normalized_windows_drive_letter(segment: &str) -> bool1562 fn is_normalized_windows_drive_letter(segment: &str) -> bool {
1563     is_windows_drive_letter(segment) && segment.as_bytes()[1] == b':'
1564 }
1565 
1566 /// Wether the scheme is file:, the path has a single segment, and that segment
1567 /// is a Windows drive letter
1568 #[inline]
is_windows_drive_letter(segment: &str) -> bool1569 pub fn is_windows_drive_letter(segment: &str) -> bool {
1570     segment.len() == 2 && starts_with_windows_drive_letter(segment)
1571 }
1572 
1573 /// Wether path starts with a root slash
1574 /// and a windows drive letter eg: "/c:" or "/a:/"
path_starts_with_windows_drive_letter(s: &str) -> bool1575 fn path_starts_with_windows_drive_letter(s: &str) -> bool {
1576     if let Some(c) = s.as_bytes().get(0) {
1577         matches!(c, b'/' | b'\\' | b'?' | b'#') && starts_with_windows_drive_letter(&s[1..])
1578     } else {
1579         false
1580     }
1581 }
1582 
starts_with_windows_drive_letter(s: &str) -> bool1583 fn starts_with_windows_drive_letter(s: &str) -> bool {
1584     s.len() >= 2
1585         && ascii_alpha(s.as_bytes()[0] as char)
1586         && matches!(s.as_bytes()[1], b':' | b'|')
1587         && (s.len() == 2 || matches!(s.as_bytes()[2], b'/' | b'\\' | b'?' | b'#'))
1588 }
1589 
1590 /// https://url.spec.whatwg.org/#start-with-a-windows-drive-letter
starts_with_windows_drive_letter_segment(input: &Input<'_>) -> bool1591 fn starts_with_windows_drive_letter_segment(input: &Input<'_>) -> bool {
1592     let mut input = input.clone();
1593     match (input.next(), input.next(), input.next()) {
1594         // its first two code points are a Windows drive letter
1595         // its third code point is U+002F (/), U+005C (\), U+003F (?), or U+0023 (#).
1596         (Some(a), Some(b), Some(c))
1597             if ascii_alpha(a) && matches!(b, ':' | '|') && matches!(c, '/' | '\\' | '?' | '#') =>
1598         {
1599             true
1600         }
1601         // its first two code points are a Windows drive letter
1602         // its length is 2
1603         (Some(a), Some(b), None) if ascii_alpha(a) && matches!(b, ':' | '|') => true,
1604         _ => false,
1605     }
1606 }
1607