1 use crate::validation::{Context, Reason};
2 use std::{
3 collections::HashMap,
4 ffi::{OsStr, OsString},
5 fmt::{self, Debug, Formatter},
6 io,
7 path::{Component, Path, PathBuf},
8 sync::Arc,
9 };
10
11 /// Try to resolve a link relative to the current directory.
12 ///
13 /// # Note
14 ///
15 /// The behaviour of this function may vary greatly depending on the
16 /// [`Options`] passed in.
17 ///
18 /// ## Root Directory
19 ///
20 /// Setting a value for [`Options::root_directory()`] and
21 /// [`Options::links_may_traverse_the_root_directory()`] act as a sort of sanity
22 /// check to prevent links from going outside of a directory tree. They can also
23 /// be useful in preventing [directory traversal attacks][dta] and detecting
24 /// brittle code (links that go outside of a specific directory may not exist on
25 /// other machines).
26 ///
27 /// When the link is absolute, it will be resolved relative to
28 /// [`Options::root_directory()`]. If now root directory was provided, it will
29 /// *always* trigger a [`Reason::TraversesParentDirectories`] error to
30 /// prevent possible directory traversal attacks.
31 ///
32 /// ## Default File
33 ///
34 /// Because a link can only point to a file, when a link specifies a directory
35 /// we'll automatically append [`Options::default_file()`] to the end.
36 ///
37 /// This will typically be something like `"index.html"`, meaning a link to
38 /// `./whatever/` will be resolved to `./whatever/index.html`, which is the
39 /// default behaviour for web browsers.
40 ///
41 /// ## Alternate Extensions
42 ///
43 /// Sometimes you might have a `index.md` document but also accept `index.html`
44 /// as a valid link (like in `mdbook`). For this you can provide a mapping of
45 /// [`Options::alternate_extensions()`] to fall back to when the original
46 /// extension doesn't work.
47 ///
48 /// [dta]: https://en.wikipedia.org/wiki/Directory_traversal_attack
resolve_link( current_directory: &Path, link: &Path, options: &Options, ) -> Result<PathBuf, Reason>49 pub fn resolve_link(
50 current_directory: &Path,
51 link: &Path,
52 options: &Options,
53 ) -> Result<PathBuf, Reason> {
54 let joined = options.join(current_directory, link)?;
55
56 let candidates = options.possible_names(joined);
57
58 for candidate in candidates {
59 log::trace!(
60 "Checking if \"{}\" points to \"{}\"",
61 link.display(),
62 candidate.display(),
63 );
64
65 if let Ok(canonical) = options.canonicalize(&candidate) {
66 options.sanity_check(&canonical)?;
67 return Ok(canonical);
68 }
69 }
70
71 log::trace!("None of the candidates exist for \"{}\"", link.display());
72 Err(Reason::Io(io::ErrorKind::NotFound.into()))
73 }
74
75 /// Check whether a [`Path`] points to a valid file on disk.
76 ///
77 /// If a fragment specifier is provided, this function will scan through the
78 /// linked document and check that the file contains the corresponding anchor
79 /// (e.g. markdown heading or HTML `id`).
check_filesystem<C>( current_directory: &Path, path: &Path, fragment: Option<&str>, ctx: &C, ) -> Result<(), Reason> where C: Context + ?Sized,80 pub fn check_filesystem<C>(
81 current_directory: &Path,
82 path: &Path,
83 fragment: Option<&str>,
84 ctx: &C,
85 ) -> Result<(), Reason>
86 where
87 C: Context + ?Sized,
88 {
89 log::debug!(
90 "Checking \"{}\" in the context of \"{}\"",
91 path.display(),
92 current_directory.display()
93 );
94
95 let options = ctx.filesystem_options();
96 let resolved_location = resolve_link(current_directory, path, options)?;
97
98 log::debug!(
99 "\"{}\" resolved to \"{}\"",
100 path.display(),
101 resolved_location.display()
102 );
103
104 if let Some(fragment) = fragment {
105 // TODO: detect the file type and check the fragment exists
106 log::warn!(
107 "Not checking that the \"{}\" section exists in \"{}\" because fragment resolution isn't implemented",
108 fragment,
109 resolved_location.display(),
110 );
111 }
112
113 if let Err(reason) =
114 options.run_custom_validation(&resolved_location, fragment)
115 {
116 log::debug!(
117 "Custom validation reported \"{}\" as invalid because {}",
118 resolved_location.display(),
119 reason
120 );
121 return Err(reason);
122 }
123
124 Ok(())
125 }
126
127 /// Options to be used with [`resolve_link()`].
128 #[derive(Clone)]
129 #[cfg_attr(
130 feature = "serde-1",
131 derive(serde::Serialize, serde::Deserialize),
132 serde(default)
133 )]
134 pub struct Options {
135 root_directory: Option<PathBuf>,
136 default_file: OsString,
137 links_may_traverse_the_root_directory: bool,
138 // Note: the key is normalised to lowercase to make sure extensions are
139 // case insensitive
140 alternate_extensions: HashMap<String, Vec<OsString>>,
141 #[serde(skip, default = "nop_custom_validation")]
142 custom_validation: Arc<dyn Fn(&Path, Option<&str>) -> Result<(), Reason>>,
143 }
144
145 impl Options {
146 /// The name used by [`Options::default_file()`].
147 pub const DEFAULT_FILE: &'static str = "index.html";
148
149 /// A mapping of possible alternate extensions to try when checking a
150 /// filesystem link.
default_alternate_extensions( ) -> impl IntoIterator<Item = (OsString, impl IntoIterator<Item = OsString>)>151 pub fn default_alternate_extensions(
152 ) -> impl IntoIterator<Item = (OsString, impl IntoIterator<Item = OsString>)>
153 {
154 const MAPPING: &'static [(&'static str, &'static [&'static str])] =
155 &[("md", &["html"])];
156
157 MAPPING.iter().map(|(ext, alts)| {
158 (OsString::from(ext), alts.iter().map(OsString::from))
159 })
160 }
161
162 /// Create a new [`Options`] populated with some sane defaults.
new() -> Self163 pub fn new() -> Self {
164 Options {
165 root_directory: None,
166 default_file: OsString::from(Options::DEFAULT_FILE),
167 links_may_traverse_the_root_directory: false,
168 alternate_extensions: Options::default_alternate_extensions()
169 .into_iter()
170 .map(|(key, values)| {
171 (
172 key.to_string_lossy().to_lowercase(),
173 values.into_iter().map(Into::into).collect(),
174 )
175 })
176 .collect(),
177 custom_validation: nop_custom_validation(),
178 }
179 }
180
181 /// Get the root directory, if one was provided.
root_directory(&self) -> Option<&Path>182 pub fn root_directory(&self) -> Option<&Path> {
183 self.root_directory.as_ref().map(|p| &**p)
184 }
185
186 /// Set the [`Options::root_directory()`], automatically converting to its
187 /// canonical form with [`dunce::canonicalize()`].
with_root_directory<P: AsRef<Path>>( self, root_directory: P, ) -> io::Result<Self>188 pub fn with_root_directory<P: AsRef<Path>>(
189 self,
190 root_directory: P,
191 ) -> io::Result<Self> {
192 Ok(Options {
193 root_directory: Some(dunce::canonicalize(root_directory)?),
194 ..self
195 })
196 }
197
198 /// The default file name to use when a directory is linked to.
default_file(&self) -> &OsStr199 pub fn default_file(&self) -> &OsStr { &self.default_file }
200
201 /// Set the [`Options::default_file()`].
set_default_file<O: Into<OsString>>(self, default_file: O) -> Self202 pub fn set_default_file<O: Into<OsString>>(self, default_file: O) -> Self {
203 Options {
204 default_file: default_file.into(),
205 ..self
206 }
207 }
208
209 /// Get the map of alternate extensions to use when checking.
210 ///
211 /// By default we only map `*.md` to `*.html`
212 /// ([`Options::default_alternate_extensions()`]).
alternate_extensions( &self, ) -> impl Iterator<Item = (&OsStr, impl Iterator<Item = &OsStr>)>213 pub fn alternate_extensions(
214 &self,
215 ) -> impl Iterator<Item = (&OsStr, impl Iterator<Item = &OsStr>)> {
216 self.alternate_extensions.iter().map(|(key, value)| {
217 (OsStr::new(key), value.iter().map(|alt| alt.as_os_str()))
218 })
219 }
220
221 /// Set the [`Options::alternate_extensions()`] mapping.
set_alternate_extensions<S, I, V>(mut self, alternates: I) -> Self where I: IntoIterator<Item = (S, V)>, S: Into<OsString>, V: IntoIterator<Item = S>,222 pub fn set_alternate_extensions<S, I, V>(mut self, alternates: I) -> Self
223 where
224 I: IntoIterator<Item = (S, V)>,
225 S: Into<OsString>,
226 V: IntoIterator<Item = S>,
227 {
228 self.alternate_extensions = alternates
229 .into_iter()
230 .map(|(key, values)| {
231 (
232 key.into().to_string_lossy().to_lowercase(),
233 values.into_iter().map(Into::into).collect(),
234 )
235 })
236 .collect();
237
238 self
239 }
240
241 /// Are links allowed to go outside of the [`Options::root_directory()`]?
links_may_traverse_the_root_directory(&self) -> bool242 pub fn links_may_traverse_the_root_directory(&self) -> bool {
243 self.links_may_traverse_the_root_directory
244 }
245
246 /// Set [`Options::links_may_traverse_the_root_directory()`].
set_links_may_traverse_the_root_directory( self, value: bool, ) -> Self247 pub fn set_links_may_traverse_the_root_directory(
248 self,
249 value: bool,
250 ) -> Self {
251 Options {
252 links_may_traverse_the_root_directory: value,
253 ..self
254 }
255 }
256
257 /// Set a function which will be executed after a link is resolved, allowing
258 /// you to apply custom business logic.
set_custom_validation<F>(self, custom_validation: F) -> Self where F: Fn(&Path, Option<&str>) -> Result<(), Reason> + 'static,259 pub fn set_custom_validation<F>(self, custom_validation: F) -> Self
260 where
261 F: Fn(&Path, Option<&str>) -> Result<(), Reason> + 'static,
262 {
263 let custom_validation = Arc::new(custom_validation);
264 Options {
265 custom_validation,
266 ..self
267 }
268 }
269
join( &self, current_dir: &Path, second: &Path, ) -> Result<PathBuf, Reason>270 fn join(
271 &self,
272 current_dir: &Path,
273 second: &Path,
274 ) -> Result<PathBuf, Reason> {
275 log::trace!(
276 "Appending \"{}\" to \"{}\"",
277 second.display(),
278 current_dir.display()
279 );
280
281 if second.has_root() {
282 // if the path is absolute (i.e. has a leading slash) then it's
283 // meant to be relative to the root directory, not the current one
284 match self.root_directory() {
285 Some(root) => {
286 let mut buffer = root.to_path_buf();
287 // append everything except the bits that make it absolute
288 // (e.g. "/" or "C:\")
289 buffer.extend(remove_absolute_components(second));
290 Ok(buffer)
291 },
292 // You really shouldn't provide links to absolute files on your
293 // system (e.g. "/home/michael/Documents/whatever" or
294 // "/etc/passwd").
295 //
296 // For one, it's extremely brittle and will probably only work
297 // on that computer, but more importantly it's also a vector
298 // for directory traversal attacks.
299 //
300 // Feel free to send a PR if you believe otherwise.
301 None => {
302 log::warn!("The bit to be appended is absolute, but we don't have a \"root\" directory to resolve relative to");
303 Err(Reason::TraversesParentDirectories)
304 },
305 }
306 } else {
307 Ok(current_dir.join(second))
308 }
309 }
310
311 /// Gets the canonical version of a particular path, resolving symlinks and
312 /// other filesystem quirks.
313 ///
314 /// This will fail if the item doesn't exist.
canonicalize(&self, path: &Path) -> Result<PathBuf, Reason>315 fn canonicalize(&self, path: &Path) -> Result<PathBuf, Reason> {
316 let mut canonical = dunce::canonicalize(path)?;
317
318 if canonical.is_dir() {
319 log::trace!(
320 "Appending the default file name because \"{}\" is a directory",
321 canonical.display()
322 );
323
324 canonical.push(&self.default_file);
325 // we need to canonicalize again because the default file may be a
326 // symlink, or not exist at all
327 canonical = dunce::canonicalize(canonical)?;
328 }
329
330 Ok(canonical)
331 }
332
sanity_check(&self, path: &Path) -> Result<(), Reason>333 fn sanity_check(&self, path: &Path) -> Result<(), Reason> {
334 log::trace!("Applying sanity checks to \"{}\"", path.display());
335
336 if let Some(root) = self.root_directory() {
337 log::trace!(
338 "Checking if \"{}\" is allowed to leave \"{}\"",
339 path.display(),
340 root.display()
341 );
342
343 if !(self.links_may_traverse_the_root_directory
344 || path.starts_with(root))
345 {
346 log::trace!(
347 "\"{}\" traverses outside the \"root\" directory",
348 path.display()
349 );
350 return Err(Reason::TraversesParentDirectories);
351 }
352 }
353
354 Ok(())
355 }
356
357 /// sometimes the file being linked to may be usable with another extension
358 /// (e.g. in mdbook, markdown files can be linked to with the HTML
359 /// extension).
possible_names( &self, original: PathBuf, ) -> impl IntoIterator<Item = PathBuf>360 fn possible_names(
361 &self,
362 original: PathBuf,
363 ) -> impl IntoIterator<Item = PathBuf> {
364 let mut names = vec![original.clone()];
365
366 if let Some(alternatives) = original
367 .extension()
368 .map(|ext| ext.to_string_lossy().to_lowercase())
369 .and_then(|ext| self.alternate_extensions.get(&ext))
370 {
371 for alternative in alternatives {
372 names.push(original.with_extension(alternative));
373 }
374 }
375
376 log::trace!(
377 "Possible candidates for \"{}\" are {:?}",
378 original.display(),
379 names
380 );
381
382 names
383 }
384
run_custom_validation( &self, resolved_path: &Path, fragment: Option<&str>, ) -> Result<(), Reason>385 fn run_custom_validation(
386 &self,
387 resolved_path: &Path,
388 fragment: Option<&str>,
389 ) -> Result<(), Reason> {
390 (self.custom_validation)(resolved_path, fragment)
391 }
392 }
393
nop_custom_validation( ) -> Arc<dyn Fn(&Path, Option<&str>) -> Result<(), Reason>>394 fn nop_custom_validation(
395 ) -> Arc<dyn Fn(&Path, Option<&str>) -> Result<(), Reason>> {
396 Arc::new(|_, _| Ok(()))
397 }
398
399 impl Default for Options {
default() -> Self400 fn default() -> Self { Options::new() }
401 }
402
403 impl Debug for Options {
fmt(&self, f: &mut Formatter<'_>) -> fmt::Result404 fn fmt(&self, f: &mut Formatter<'_>) -> fmt::Result {
405 let Options {
406 root_directory,
407 default_file,
408 links_may_traverse_the_root_directory,
409 alternate_extensions,
410 custom_validation: _,
411 } = self;
412
413 f.debug_struct("Options")
414 .field("root_directory", root_directory)
415 .field("default_file", default_file)
416 .field(
417 "links_may_traverse_the_root_directory",
418 links_may_traverse_the_root_directory,
419 )
420 .field("alternate_extensions", alternate_extensions)
421 .finish()
422 }
423 }
424
425 impl PartialEq for Options {
eq(&self, other: &Options) -> bool426 fn eq(&self, other: &Options) -> bool {
427 let Options {
428 root_directory,
429 default_file,
430 links_may_traverse_the_root_directory,
431 alternate_extensions,
432 custom_validation: _,
433 } = self;
434
435 root_directory == &other.root_directory
436 && default_file == &other.default_file
437 && links_may_traverse_the_root_directory
438 == &other.links_may_traverse_the_root_directory
439 && alternate_extensions == &other.alternate_extensions
440 }
441 }
442
/// Iterate over the components of a path, skipping the leading prefix and
/// root components (e.g. `/` or `C:\`) which make it absolute.
fn remove_absolute_components(
    path: &Path,
) -> impl Iterator<Item = Component> + '_ {
    path.components().skip_while(|component| match component {
        Component::Prefix(_) | Component::RootDir => true,
        _ => false,
    })
}
449
#[cfg(test)]
mod tests {
    use super::*;
    use crate::BasicContext;
    use std::sync::atomic::{AtomicBool, Ordering};

    /// The `src/validation` directory inside this crate.
    fn validation_dir() -> PathBuf {
        Path::new(env!("CARGO_MANIFEST_DIR"))
            .join("src")
            .join("validation")
    }

    /// Create an empty file called `filename` in each of the directories,
    /// creating the directories themselves as necessary.
    fn touch<S: AsRef<Path>>(filename: S, directories: &[&Path]) {
        for dir in directories {
            std::fs::create_dir_all(dir).unwrap();

            let item = dir.join(filename.as_ref());
            let _f = std::fs::File::create(&item).unwrap();
        }
    }

    /// Turn on trace-level logging for the test run. Initialisation errors
    /// (e.g. a logger which is already installed) are ignored.
    fn init_logging() {
        let _ = env_logger::builder()
            .filter(Some("linkcheck"), log::LevelFilter::Trace)
            .is_test(true)
            .try_init();
    }

    #[test]
    fn resolve_mod_relative_to_validation_dir() {
        init_logging();
        let current_dir = validation_dir();
        let link = "mod.rs";
        let options = Options::default();

        let got =
            resolve_link(&current_dir, Path::new(link), &options).unwrap();

        assert_eq!(got, current_dir.join(link));
    }

    #[test]
    fn custom_validation_function_gets_called() {
        init_logging();
        let current_dir = validation_dir();
        let link = "mod.rs";
        let called = Arc::new(AtomicBool::new(false));
        let called_2 = Arc::clone(&called);
        let mut ctx = BasicContext::default();
        ctx.options = Options::default().set_custom_validation(move |_, _| {
            called_2.store(true, Ordering::SeqCst);
            Ok(())
        });

        check_filesystem(&current_dir, Path::new(link), None, &ctx).unwrap();

        assert!(called.load(Ordering::SeqCst))
    }

    #[test]
    fn detect_possible_directory_traversal_attacks() {
        init_logging();
        let temp = tempfile::tempdir().unwrap();
        let temp = dunce::canonicalize(temp.path()).unwrap();
        let foo = temp.join("foo");
        let bar = foo.join("bar");
        let baz = bar.join("baz");
        let options = Options::default().with_root_directory(&temp).unwrap();
        touch(&options.default_file, &[&temp, &foo, &bar, &baz]);
        let current_dir = baz.as_path();
        let resolve = |link: &str| -> Result<PathBuf, Reason> {
            resolve_link(current_dir, Path::new(link), &options)
        };

        // checking up to the root directory is okay
        assert_eq!(
            resolve(".").unwrap(),
            current_dir.join(&options.default_file)
        );
        assert_eq!(resolve("..").unwrap(), bar.join(&options.default_file));
        assert_eq!(resolve("../..").unwrap(), foo.join(&options.default_file));
        assert_eq!(
            resolve("../../..").unwrap(),
            temp.join(&options.default_file)
        );
        // but a directory traversal attack isn't
        let bad_path = if cfg!(windows) {
            "../../../../../../../../../../../../../../../../../Windows/System32/cmd.exe"
        } else {
            "../../../../../../../../../../../../../../../../../etc/passwd"
        };
        let traverses_parent_dir = resolve(bad_path).unwrap_err();
        assert!(
            matches!(traverses_parent_dir, Reason::TraversesParentDirectories),
            "{:?} should have traversed the parent directory",
            traverses_parent_dir
        );
    }

    #[test]
    fn links_with_a_leading_slash_are_relative_to_the_root() {
        init_logging();
        let temp = tempfile::tempdir().unwrap();
        let temp = dunce::canonicalize(temp.path()).unwrap();
        let foo = temp.join("foo");
        let bar = temp.join("bar");
        let options = Options::default().with_root_directory(&temp).unwrap();
        touch(&options.default_file, &[&temp, &foo, &bar]);
        let link = Path::new("/bar");

        let got = resolve_link(&foo, link, &options).unwrap();

        assert_eq!(got, bar.join(&options.default_file));
    }

    #[test]
    fn link_to_a_file_we_know_doesnt_exist() {
        init_logging();
        let temp = tempfile::tempdir().unwrap();
        let temp = dunce::canonicalize(temp.path()).unwrap();
        let options = Options::default().with_root_directory(&temp).unwrap();
        let link = Path::new("./bar");

        let err = resolve_link(&temp, link, &options).unwrap_err();

        assert!(err.file_not_found());
    }

    #[test]
    fn absolute_link_with_no_root_set_is_an_error() {
        init_logging();
        let temp = tempfile::tempdir().unwrap();
        let temp = dunce::canonicalize(temp.path()).unwrap();
        let options = Options::default();
        let link = Path::new("/bar");

        let err = resolve_link(&temp, link, &options).unwrap_err();

        assert!(matches!(err, Reason::TraversesParentDirectories));
    }

    #[test]
    fn a_link_that_is_allowed_to_traverse_the_root_dir() {
        init_logging();
        let temp = tempfile::tempdir().unwrap();
        let temp = dunce::canonicalize(temp.path()).unwrap();
        let foo = temp.join("foo");
        let bar = temp.join("bar");
        touch(Options::DEFAULT_FILE, &[&temp, &foo, &bar]);
        let options = Options::default()
            .with_root_directory(&foo)
            .unwrap()
            .set_links_may_traverse_the_root_directory(true);
        let link = Path::new("../bar/index.html");

        let got = resolve_link(&foo, link, &options).unwrap();

        assert_eq!(got, bar.join("index.html"));
    }

    #[test]
    fn markdown_files_can_be_used_as_html() {
        init_logging();
        let temp = tempfile::tempdir().unwrap();
        let temp = dunce::canonicalize(temp.path()).unwrap();
        touch("index.html", &[&temp]);
        let link = "index.md";
        let options = Options::default()
            .set_alternate_extensions(Options::default_alternate_extensions());

        let got = resolve_link(&temp, Path::new(link), &options).unwrap();

        assert_eq!(got, temp.join("index.html"));
    }

    #[test]
    fn join_paths() {
        init_logging();
        let temp = tempfile::tempdir().unwrap();
        let temp = dunce::canonicalize(temp.path()).unwrap();
        let foo = temp.join("foo");
        let bar = foo.join("bar");
        let baz = bar.join("baz");
        let baz_index = baz.join("index.html");
        touch("index.html", &[&temp, &foo, &bar, &baz]);
        let options = Options::default().with_root_directory(&temp).unwrap();

        // (link, directory it is joined onto, expected result)
        let inputs = vec![
            ("/foo", &temp, &foo),
            ("foo", &temp, &foo),
            ("foo/bar", &temp, &bar),
            ("foo/bar/baz", &temp, &baz),
            ("/foo/bar/baz/index.html", &temp, &baz_index),
            ("bar/baz", &foo, &baz),
            ("baz", &bar, &baz),
            ("index.html", &baz, &baz_index),
        ];

        for (link, base, should_be) in inputs {
            let got = options.join(base, Path::new(link)).unwrap();
            assert_eq!(got, *should_be);
        }
    }
}
656