1 use crate::validation::{Context, Reason};
2 use std::{
3     collections::HashMap,
4     ffi::{OsStr, OsString},
5     fmt::{self, Debug, Formatter},
6     io,
7     path::{Component, Path, PathBuf},
8     sync::Arc,
9 };
10 
11 /// Try to resolve a link relative to the current directory.
12 ///
13 /// # Note
14 ///
15 /// The behaviour of this function may vary greatly depending on the
16 /// [`Options`] passed in.
17 ///
18 /// ## Root Directory
19 ///
20 /// Setting a value for [`Options::root_directory()`] and
21 /// [`Options::links_may_traverse_the_root_directory()`] act as a sort of sanity
22 /// check to prevent links from going outside of a directory tree. They can also
23 /// be useful in preventing [directory traversal attacks][dta] and detecting
24 /// brittle code (links that go outside of a specific directory may not exist on
25 /// other machines).
26 ///
27 /// When the link is absolute, it will be resolved relative to
28 /// [`Options::root_directory()`]. If now root directory was provided, it will
29 /// *always* trigger a [`Reason::TraversesParentDirectories`] error to
30 /// prevent possible directory traversal attacks.
31 ///
32 /// ## Default File
33 ///
34 /// Because a link can only point to a file, when a link specifies a directory
35 /// we'll automatically append [`Options::default_file()`] to the end.
36 ///
37 /// This will typically be something like `"index.html"`, meaning a link to
38 /// `./whatever/` will be resolved to `./whatever/index.html`, which is the
39 /// default behaviour for web browsers.
40 ///
41 /// ## Alternate Extensions
42 ///
43 /// Sometimes you might have a `index.md` document but also accept `index.html`
44 /// as a valid link (like in `mdbook`). For this you can provide a mapping of
45 /// [`Options::alternate_extensions()`] to fall back to when the original
46 /// extension doesn't work.
47 ///
48 /// [dta]: https://en.wikipedia.org/wiki/Directory_traversal_attack
resolve_link( current_directory: &Path, link: &Path, options: &Options, ) -> Result<PathBuf, Reason>49 pub fn resolve_link(
50     current_directory: &Path,
51     link: &Path,
52     options: &Options,
53 ) -> Result<PathBuf, Reason> {
54     let joined = options.join(current_directory, link)?;
55 
56     let candidates = options.possible_names(joined);
57 
58     for candidate in candidates {
59         log::trace!(
60             "Checking if \"{}\" points to \"{}\"",
61             link.display(),
62             candidate.display(),
63         );
64 
65         if let Ok(canonical) = options.canonicalize(&candidate) {
66             options.sanity_check(&canonical)?;
67             return Ok(canonical);
68         }
69     }
70 
71     log::trace!("None of the candidates exist for \"{}\"", link.display());
72     Err(Reason::Io(io::ErrorKind::NotFound.into()))
73 }
74 
75 /// Check whether a [`Path`] points to a valid file on disk.
76 ///
77 /// If a fragment specifier is provided, this function will scan through the
78 /// linked document and check that the file contains the corresponding anchor
79 /// (e.g. markdown heading or HTML `id`).
check_filesystem<C>( current_directory: &Path, path: &Path, fragment: Option<&str>, ctx: &C, ) -> Result<(), Reason> where C: Context + ?Sized,80 pub fn check_filesystem<C>(
81     current_directory: &Path,
82     path: &Path,
83     fragment: Option<&str>,
84     ctx: &C,
85 ) -> Result<(), Reason>
86 where
87     C: Context + ?Sized,
88 {
89     log::debug!(
90         "Checking \"{}\" in the context of \"{}\"",
91         path.display(),
92         current_directory.display()
93     );
94 
95     let options = ctx.filesystem_options();
96     let resolved_location = resolve_link(current_directory, path, options)?;
97 
98     log::debug!(
99         "\"{}\" resolved to \"{}\"",
100         path.display(),
101         resolved_location.display()
102     );
103 
104     if let Some(fragment) = fragment {
105         // TODO: detect the file type and check the fragment exists
106         log::warn!(
107             "Not checking that the \"{}\" section exists in \"{}\" because fragment resolution isn't implemented",
108             fragment,
109             resolved_location.display(),
110         );
111     }
112 
113     if let Err(reason) =
114         options.run_custom_validation(&resolved_location, fragment)
115     {
116         log::debug!(
117             "Custom validation reported \"{}\" as invalid because {}",
118             resolved_location.display(),
119             reason
120         );
121         return Err(reason);
122     }
123 
124     Ok(())
125 }
126 
127 /// Options to be used with [`resolve_link()`].
128 #[derive(Clone)]
129 #[cfg_attr(
130     feature = "serde-1",
131     derive(serde::Serialize, serde::Deserialize),
132     serde(default)
133 )]
134 pub struct Options {
135     root_directory: Option<PathBuf>,
136     default_file: OsString,
137     links_may_traverse_the_root_directory: bool,
138     // Note: the key is normalised to lowercase to make sure extensions are
139     // case insensitive
140     alternate_extensions: HashMap<String, Vec<OsString>>,
141     #[serde(skip, default = "nop_custom_validation")]
142     custom_validation: Arc<dyn Fn(&Path, Option<&str>) -> Result<(), Reason>>,
143 }
144 
145 impl Options {
146     /// The name used by [`Options::default_file()`].
147     pub const DEFAULT_FILE: &'static str = "index.html";
148 
149     /// A mapping of possible alternate extensions to try when checking a
150     /// filesystem link.
default_alternate_extensions( ) -> impl IntoIterator<Item = (OsString, impl IntoIterator<Item = OsString>)>151     pub fn default_alternate_extensions(
152     ) -> impl IntoIterator<Item = (OsString, impl IntoIterator<Item = OsString>)>
153     {
154         const MAPPING: &'static [(&'static str, &'static [&'static str])] =
155             &[("md", &["html"])];
156 
157         MAPPING.iter().map(|(ext, alts)| {
158             (OsString::from(ext), alts.iter().map(OsString::from))
159         })
160     }
161 
162     /// Create a new [`Options`] populated with some sane defaults.
new() -> Self163     pub fn new() -> Self {
164         Options {
165             root_directory: None,
166             default_file: OsString::from(Options::DEFAULT_FILE),
167             links_may_traverse_the_root_directory: false,
168             alternate_extensions: Options::default_alternate_extensions()
169                 .into_iter()
170                 .map(|(key, values)| {
171                     (
172                         key.to_string_lossy().to_lowercase(),
173                         values.into_iter().map(Into::into).collect(),
174                     )
175                 })
176                 .collect(),
177             custom_validation: nop_custom_validation(),
178         }
179     }
180 
181     /// Get the root directory, if one was provided.
root_directory(&self) -> Option<&Path>182     pub fn root_directory(&self) -> Option<&Path> {
183         self.root_directory.as_ref().map(|p| &**p)
184     }
185 
186     /// Set the [`Options::root_directory()`], automatically converting to its
187     /// canonical form with [`dunce::canonicalize()`].
with_root_directory<P: AsRef<Path>>( self, root_directory: P, ) -> io::Result<Self>188     pub fn with_root_directory<P: AsRef<Path>>(
189         self,
190         root_directory: P,
191     ) -> io::Result<Self> {
192         Ok(Options {
193             root_directory: Some(dunce::canonicalize(root_directory)?),
194             ..self
195         })
196     }
197 
198     /// The default file name to use when a directory is linked to.
default_file(&self) -> &OsStr199     pub fn default_file(&self) -> &OsStr { &self.default_file }
200 
201     /// Set the [`Options::default_file()`].
set_default_file<O: Into<OsString>>(self, default_file: O) -> Self202     pub fn set_default_file<O: Into<OsString>>(self, default_file: O) -> Self {
203         Options {
204             default_file: default_file.into(),
205             ..self
206         }
207     }
208 
209     /// Get the map of alternate extensions to use when checking.
210     ///
211     /// By default we only map `*.md` to `*.html`
212     /// ([`Options::default_alternate_extensions()`]).
alternate_extensions( &self, ) -> impl Iterator<Item = (&OsStr, impl Iterator<Item = &OsStr>)>213     pub fn alternate_extensions(
214         &self,
215     ) -> impl Iterator<Item = (&OsStr, impl Iterator<Item = &OsStr>)> {
216         self.alternate_extensions.iter().map(|(key, value)| {
217             (OsStr::new(key), value.iter().map(|alt| alt.as_os_str()))
218         })
219     }
220 
221     /// Set the [`Options::alternate_extensions()`] mapping.
set_alternate_extensions<S, I, V>(mut self, alternates: I) -> Self where I: IntoIterator<Item = (S, V)>, S: Into<OsString>, V: IntoIterator<Item = S>,222     pub fn set_alternate_extensions<S, I, V>(mut self, alternates: I) -> Self
223     where
224         I: IntoIterator<Item = (S, V)>,
225         S: Into<OsString>,
226         V: IntoIterator<Item = S>,
227     {
228         self.alternate_extensions = alternates
229             .into_iter()
230             .map(|(key, values)| {
231                 (
232                     key.into().to_string_lossy().to_lowercase(),
233                     values.into_iter().map(Into::into).collect(),
234                 )
235             })
236             .collect();
237 
238         self
239     }
240 
241     /// Are links allowed to go outside of the [`Options::root_directory()`]?
links_may_traverse_the_root_directory(&self) -> bool242     pub fn links_may_traverse_the_root_directory(&self) -> bool {
243         self.links_may_traverse_the_root_directory
244     }
245 
246     /// Set [`Options::links_may_traverse_the_root_directory()`].
set_links_may_traverse_the_root_directory( self, value: bool, ) -> Self247     pub fn set_links_may_traverse_the_root_directory(
248         self,
249         value: bool,
250     ) -> Self {
251         Options {
252             links_may_traverse_the_root_directory: value,
253             ..self
254         }
255     }
256 
257     /// Set a function which will be executed after a link is resolved, allowing
258     /// you to apply custom business logic.
set_custom_validation<F>(self, custom_validation: F) -> Self where F: Fn(&Path, Option<&str>) -> Result<(), Reason> + 'static,259     pub fn set_custom_validation<F>(self, custom_validation: F) -> Self
260     where
261         F: Fn(&Path, Option<&str>) -> Result<(), Reason> + 'static,
262     {
263         let custom_validation = Arc::new(custom_validation);
264         Options {
265             custom_validation,
266             ..self
267         }
268     }
269 
join( &self, current_dir: &Path, second: &Path, ) -> Result<PathBuf, Reason>270     fn join(
271         &self,
272         current_dir: &Path,
273         second: &Path,
274     ) -> Result<PathBuf, Reason> {
275         log::trace!(
276             "Appending \"{}\" to \"{}\"",
277             second.display(),
278             current_dir.display()
279         );
280 
281         if second.has_root() {
282             // if the path is absolute (i.e. has a leading slash) then it's
283             // meant to be relative to the root directory, not the current one
284             match self.root_directory() {
285                 Some(root) => {
286                     let mut buffer = root.to_path_buf();
287                     // append everything except the bits that make it absolute
288                     // (e.g. "/" or "C:\")
289                     buffer.extend(remove_absolute_components(second));
290                     Ok(buffer)
291                 },
292                 // You really shouldn't provide links to absolute files on your
293                 // system (e.g. "/home/michael/Documents/whatever" or
294                 // "/etc/passwd").
295                 //
296                 // For one, it's extremely brittle and will probably only work
297                 // on that computer, but more importantly it's also a vector
298                 // for directory traversal attacks.
299                 //
300                 // Feel free to send a PR if you believe otherwise.
301                 None => {
302                     log::warn!("The bit to be appended is absolute, but we don't have a \"root\" directory to resolve relative to");
303                     Err(Reason::TraversesParentDirectories)
304                 },
305             }
306         } else {
307             Ok(current_dir.join(second))
308         }
309     }
310 
311     /// Gets the canonical version of a particular path, resolving symlinks and
312     /// other filesystem quirks.
313     ///
314     /// This will fail if the item doesn't exist.
canonicalize(&self, path: &Path) -> Result<PathBuf, Reason>315     fn canonicalize(&self, path: &Path) -> Result<PathBuf, Reason> {
316         let mut canonical = dunce::canonicalize(path)?;
317 
318         if canonical.is_dir() {
319             log::trace!(
320                 "Appending the default file name because \"{}\" is a directory",
321                 canonical.display()
322             );
323 
324             canonical.push(&self.default_file);
325             // we need to canonicalize again because the default file may be a
326             // symlink, or not exist at all
327             canonical = dunce::canonicalize(canonical)?;
328         }
329 
330         Ok(canonical)
331     }
332 
sanity_check(&self, path: &Path) -> Result<(), Reason>333     fn sanity_check(&self, path: &Path) -> Result<(), Reason> {
334         log::trace!("Applying sanity checks to \"{}\"", path.display());
335 
336         if let Some(root) = self.root_directory() {
337             log::trace!(
338                 "Checking if \"{}\" is allowed to leave \"{}\"",
339                 path.display(),
340                 root.display()
341             );
342 
343             if !(self.links_may_traverse_the_root_directory
344                 || path.starts_with(root))
345             {
346                 log::trace!(
347                     "\"{}\" traverses outside the \"root\" directory",
348                     path.display()
349                 );
350                 return Err(Reason::TraversesParentDirectories);
351             }
352         }
353 
354         Ok(())
355     }
356 
357     /// sometimes the file being linked to may be usable with another extension
358     /// (e.g. in mdbook, markdown files can be linked to with the HTML
359     /// extension).
possible_names( &self, original: PathBuf, ) -> impl IntoIterator<Item = PathBuf>360     fn possible_names(
361         &self,
362         original: PathBuf,
363     ) -> impl IntoIterator<Item = PathBuf> {
364         let mut names = vec![original.clone()];
365 
366         if let Some(alternatives) = original
367             .extension()
368             .map(|ext| ext.to_string_lossy().to_lowercase())
369             .and_then(|ext| self.alternate_extensions.get(&ext))
370         {
371             for alternative in alternatives {
372                 names.push(original.with_extension(alternative));
373             }
374         }
375 
376         log::trace!(
377             "Possible candidates for \"{}\" are {:?}",
378             original.display(),
379             names
380         );
381 
382         names
383     }
384 
run_custom_validation( &self, resolved_path: &Path, fragment: Option<&str>, ) -> Result<(), Reason>385     fn run_custom_validation(
386         &self,
387         resolved_path: &Path,
388         fragment: Option<&str>,
389     ) -> Result<(), Reason> {
390         (self.custom_validation)(resolved_path, fragment)
391     }
392 }
393 
nop_custom_validation( ) -> Arc<dyn Fn(&Path, Option<&str>) -> Result<(), Reason>>394 fn nop_custom_validation(
395 ) -> Arc<dyn Fn(&Path, Option<&str>) -> Result<(), Reason>> {
396     Arc::new(|_, _| Ok(()))
397 }
398 
399 impl Default for Options {
default() -> Self400     fn default() -> Self { Options::new() }
401 }
402 
403 impl Debug for Options {
fmt(&self, f: &mut Formatter<'_>) -> fmt::Result404     fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
405         let Options {
406             root_directory,
407             default_file,
408             links_may_traverse_the_root_directory,
409             alternate_extensions,
410             custom_validation: _,
411         } = self;
412 
413         f.debug_struct("Options")
414             .field("root_directory", root_directory)
415             .field("default_file", default_file)
416             .field(
417                 "links_may_traverse_the_root_directory",
418                 links_may_traverse_the_root_directory,
419             )
420             .field("alternate_extensions", alternate_extensions)
421             .finish()
422     }
423 }
424 
425 impl PartialEq for Options {
eq(&self, other: &Options) -> bool426     fn eq(&self, other: &Options) -> bool {
427         let Options {
428             root_directory,
429             default_file,
430             links_may_traverse_the_root_directory,
431             alternate_extensions,
432             custom_validation: _,
433         } = self;
434 
435         root_directory == &other.root_directory
436             && default_file == &other.default_file
437             && links_may_traverse_the_root_directory
438                 == &other.links_may_traverse_the_root_directory
439             && alternate_extensions == &other.alternate_extensions
440     }
441 }
442 
/// Iterate over the components of `path`, skipping the leading prefix and
/// root components (e.g. `/` or `C:\`) which make it absolute.
///
/// Only *leading* components are skipped. Everything after the first
/// non-prefix, non-root component is yielded untouched.
fn remove_absolute_components(
    path: &Path,
) -> impl Iterator<Item = Component> + '_ {
    path.components().skip_while(|component| match component {
        Component::Prefix(_) | Component::RootDir => true,
        _ => false,
    })
}
449 
#[cfg(test)]
mod tests {
    use super::*;
    use crate::BasicContext;
    use std::sync::atomic::{AtomicBool, Ordering};

    /// The `src/validation` directory inside this crate.
    fn validation_dir() -> PathBuf {
        let mut dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
        dir.push("src");
        dir.push("validation");
        dir
    }

    /// Create an empty file called `filename` in each directory, creating the
    /// directories themselves when necessary.
    fn touch<S: AsRef<Path>>(filename: S, directories: &[&Path]) {
        let filename = filename.as_ref();

        for directory in directories {
            std::fs::create_dir_all(directory).unwrap();
            std::fs::File::create(directory.join(filename)).unwrap();
        }
    }

    fn init_logging() {
        let mut builder = env_logger::builder();
        builder
            .filter(Some("linkcheck"), log::LevelFilter::Trace)
            .is_test(true);
        // another test may have already installed the logger
        let _ = builder.try_init();
    }

    /// Create a temporary directory and return it with its canonical path.
    ///
    /// The guard must be kept alive for as long as the directory is needed.
    fn scratch_dir() -> (tempfile::TempDir, PathBuf) {
        let guard = tempfile::tempdir().unwrap();
        let canonical = dunce::canonicalize(guard.path()).unwrap();
        (guard, canonical)
    }

    #[test]
    fn resolve_mod_relative_to_validation_dir() {
        init_logging();
        let base = validation_dir();
        let options = Options::default();

        let got = resolve_link(&base, Path::new("mod.rs"), &options).unwrap();

        assert_eq!(got, base.join("mod.rs"));
    }

    #[test]
    fn custom_validation_function_gets_called() {
        init_logging();
        let base = validation_dir();
        let called = Arc::new(AtomicBool::new(false));
        let flag = Arc::clone(&called);
        let mut ctx = BasicContext::default();
        ctx.options = Options::default().set_custom_validation(move |_, _| {
            flag.store(true, Ordering::SeqCst);
            Ok(())
        });

        check_filesystem(&base, Path::new("mod.rs"), None, &ctx).unwrap();

        assert!(called.load(Ordering::SeqCst))
    }

    #[test]
    fn detect_possible_directory_traversal_attacks() {
        init_logging();
        let (_guard, temp) = scratch_dir();
        let foo = temp.join("foo");
        let bar = foo.join("bar");
        let baz = bar.join("baz");
        let options = Options::default().with_root_directory(&temp).unwrap();
        touch(&options.default_file, &[&temp, &foo, &bar, &baz]);
        let current_dir = baz.as_path();
        let resolve = |link: &str| -> Result<PathBuf, Reason> {
            resolve_link(current_dir, Path::new(link), &options)
        };

        // checking up to the root directory is okay
        let allowed = vec![
            (".", &baz),
            ("..", &bar),
            ("../..", &foo),
            ("../../..", &temp),
        ];
        for (link, directory) in allowed {
            assert_eq!(
                resolve(link).unwrap(),
                directory.join(&options.default_file)
            );
        }

        // but a directory traversal attack isn't
        let bad_path = if cfg!(windows) {
            "../../../../../../../../../../../../../../../../../Windows/System32/cmd.exe"
        } else {
            "../../../../../../../../../../../../../../../../../etc/passwd"
        };
        let err = resolve(bad_path).unwrap_err();
        assert!(
            matches!(err, Reason::TraversesParentDirectories),
            "{:?} should have traversed the parent directory",
            err
        );
    }

    #[test]
    fn links_with_a_leading_slash_are_relative_to_the_root() {
        init_logging();
        let (_guard, temp) = scratch_dir();
        let foo = temp.join("foo");
        let bar = temp.join("bar");
        let options = Options::default().with_root_directory(&temp).unwrap();
        touch(&options.default_file, &[&temp, &foo, &bar]);

        let got = resolve_link(&foo, Path::new("/bar"), &options).unwrap();

        assert_eq!(got, bar.join(&options.default_file));
    }

    #[test]
    fn link_to_a_file_we_know_doesnt_exist() {
        init_logging();
        let (_guard, temp) = scratch_dir();
        let options = Options::default().with_root_directory(&temp).unwrap();

        let err =
            resolve_link(&temp, Path::new("./bar"), &options).unwrap_err();

        assert!(err.file_not_found());
    }

    #[test]
    fn absolute_link_with_no_root_set_is_an_error() {
        init_logging();
        let (_guard, temp) = scratch_dir();
        let options = Options::default();

        let err = resolve_link(&temp, Path::new("/bar"), &options).unwrap_err();

        assert!(matches!(err, Reason::TraversesParentDirectories));
    }

    #[test]
    fn a_link_that_is_allowed_to_traverse_the_root_dir() {
        init_logging();
        let (_guard, temp) = scratch_dir();
        let foo = temp.join("foo");
        let bar = temp.join("bar");
        touch(Options::DEFAULT_FILE, &[&temp, &foo, &bar]);
        let options = Options::default()
            .with_root_directory(&foo)
            .unwrap()
            .set_links_may_traverse_the_root_directory(true);

        let got =
            resolve_link(&foo, Path::new("../bar/index.html"), &options)
                .unwrap();

        assert_eq!(got, bar.join("index.html"));
    }

    #[test]
    fn markdown_files_can_be_used_as_html() {
        init_logging();
        let (_guard, temp) = scratch_dir();
        touch("index.html", &[&temp]);
        let options = Options::default()
            .set_alternate_extensions(Options::default_alternate_extensions());

        let got =
            resolve_link(&temp, Path::new("index.md"), &options).unwrap();

        assert_eq!(got, temp.join("index.html"));
    }

    #[test]
    fn join_paths() {
        init_logging();
        let (_guard, temp) = scratch_dir();
        let foo = temp.join("foo");
        let bar = foo.join("bar");
        let baz = bar.join("baz");
        let baz_index = baz.join("index.html");
        touch("index.html", &[&temp, &foo, &bar, &baz]);
        let options = Options::default().with_root_directory(&temp).unwrap();

        // (link, directory the link is relative to, expected result)
        let cases = vec![
            ("/foo", &temp, &foo),
            ("foo", &temp, &foo),
            ("foo/bar", &temp, &bar),
            ("foo/bar/baz", &temp, &baz),
            ("/foo/bar/baz/index.html", &temp, &baz_index),
            ("bar/baz", &foo, &baz),
            ("baz", &bar, &baz),
            ("index.html", &baz, &baz_index),
        ];

        for (link, base, expected) in cases {
            let got = options.join(base, Path::new(link)).unwrap();
            assert_eq!(got, *expected);
        }
    }
}
656