1 use rayon::prelude::*;
2 
3 use crate::Site;
4 use core::time;
5 use errors::{bail, Result};
6 use errors::{Error, ErrorKind};
7 use std::{collections::HashMap, path::PathBuf, thread};
8 use url::Url;
9 
10 /// Check whether all internal links pointing to explicit anchor fragments are valid.
11 ///
12 /// This is very similar to `check_external_links`, although internal links checking
13 /// is always performed (while external ones only conditionally in `zola check`).
check_internal_links_with_anchors(site: &Site) -> Result<()>14 pub fn check_internal_links_with_anchors(site: &Site) -> Result<()> {
15     println!("Checking all internal links with anchors.");
16     let library = site.library.write().expect("Get lock for check_internal_links_with_anchors");
17 
18     // Chain all internal links, from both sections and pages.
19     let page_links = library
20         .pages()
21         .values()
22         .map(|p| {
23             let path = &p.file.path;
24             p.internal_links.iter().map(move |l| (path.clone(), l))
25         })
26         .flatten();
27     let section_links = library
28         .sections()
29         .values()
30         .map(|p| {
31             let path = &p.file.path;
32             p.internal_links.iter().map(move |l| (path.clone(), l))
33         })
34         .flatten();
35     let all_links = page_links.chain(section_links);
36 
37     // Only keep links with anchor fragments, and count them too.
38     // Bare files have already been checked elsewhere, thus they are not interesting here.
39     let mut anchors_total = 0usize;
40     let links_with_anchors = all_links
41         .filter_map(|(page_path, link)| match link {
42             (md_path, Some(anchor)) => Some((page_path, md_path, anchor)),
43             _ => None,
44         })
45         .inspect(|_| anchors_total = anchors_total.saturating_add(1));
46 
47     // Check for targets existence (including anchors), then keep only the faulty
48     // entries for error reporting purposes.
49     let missing_targets = links_with_anchors.filter(|(_, md_path, anchor)| {
50         // There are a few `expect` here since the presence of the .md file will
51         // already have been checked in the markdown rendering
52         let mut full_path = site.base_path.clone();
53         full_path.push("content");
54         for part in md_path.split('/') {
55             full_path.push(part);
56         }
57         if md_path.contains("_index.md") {
58             let section = library
59                 .get_section(&full_path)
60                 .expect("Couldn't find section in check_internal_links_with_anchors");
61             !section.has_anchor(anchor)
62         } else {
63             let page = library
64                 .get_page(&full_path)
65                 .expect("Couldn't find section in check_internal_links_with_anchors");
66             !page.has_anchor(anchor)
67         }
68     });
69 
70     // Format faulty entries into error messages, and collect them.
71     let errors = missing_targets
72         .map(|(page_path, md_path, anchor)| {
73             format!(
74                 "The anchor in the link `@/{}#{}` in {} does not exist.",
75                 md_path,
76                 anchor,
77                 page_path.to_string_lossy(),
78             )
79         })
80         .collect::<Vec<_>>();
81 
82     // Finally emit a summary, and return overall anchors-checking result.
83     match errors.len() {
84         0 => {
85             println!("> Successfully checked {} internal link(s) with anchors.", anchors_total);
86             Ok(())
87         }
88         errors_total => {
89             println!(
90                 "> Checked {} internal link(s) with anchors: {} target(s) missing.",
91                 anchors_total, errors_total,
92             );
93             Err(Error { kind: ErrorKind::Msg(errors.join("\n")), source: None })
94         }
95     }
96 }
97 
get_link_domain(link: &str) -> Result<String>98 fn get_link_domain(link: &str) -> Result<String> {
99     return match Url::parse(link) {
100         Ok(url) => match url.host_str().map(String::from) {
101             Some(domain_str) => Ok(domain_str),
102             None => bail!("could not parse domain `{}` from link", link),
103         },
104         Err(err) => bail!("could not parse domain `{}` from link: `{}`", link, err),
105     };
106 }
107 
check_external_links(site: &Site) -> Result<()>108 pub fn check_external_links(site: &Site) -> Result<()> {
109     let library = site.library.write().expect("Get lock for check_external_links");
110 
111     let mut all_links: Vec<(PathBuf, String, String)> = vec![];
112 
113     for p in library.pages_values().into_iter() {
114         for external_link in p.clone().external_links.into_iter() {
115             let domain = get_link_domain(&external_link)?;
116             all_links.push((p.file.path.clone(), external_link, domain));
117         }
118     }
119 
120     for s in library.sections_values().into_iter() {
121         for external_link in s.clone().external_links.into_iter() {
122             let domain = get_link_domain(&external_link)?;
123             all_links.push((s.file.path.clone(), external_link, domain));
124         }
125     }
126 
127     println!("Checking {} external link(s).", all_links.len());
128 
129     let mut links_by_domain: HashMap<String, Vec<(PathBuf, String)>> = HashMap::new();
130 
131     for link in all_links.iter() {
132         links_by_domain.entry(link.2.to_string()).or_default();
133         // Insert content path and link under the domain key
134         links_by_domain
135             .get_mut(&link.2.to_string())
136             .unwrap()
137             .push((link.0.clone(), link.1.clone()));
138     }
139 
140     if all_links.is_empty() {
141         return Ok(());
142     }
143 
144     // create thread pool with lots of threads so we can fetch
145     // (almost) all pages simultaneously, limiting all links for a single
146     // domain to one thread to avoid rate-limiting
147     let threads = std::cmp::min(links_by_domain.len(), 8);
148     let pool = rayon::ThreadPoolBuilder::new()
149         .num_threads(threads)
150         .build()
151         .map_err(|e| Error { kind: ErrorKind::Msg(e.to_string()), source: None })?;
152 
153     let errors = pool.install(|| {
154         links_by_domain
155             .par_iter()
156             .map(|(_domain, links)| {
157                 let mut links_to_process = links.len();
158                 links
159                     .iter()
160                     .filter_map(move |(page_path, link)| {
161                         links_to_process -= 1;
162 
163                         if site
164                             .config
165                             .link_checker
166                             .skip_prefixes
167                             .iter()
168                             .any(|prefix| link.starts_with(prefix))
169                         {
170                             return None;
171                         }
172 
173                         let res = link_checker::check_url(link, &site.config.link_checker);
174 
175                         if links_to_process > 0 {
176                             // Prevent rate-limiting, wait before next crawl unless we're done with this domain
177                             thread::sleep(time::Duration::from_millis(500));
178                         }
179 
180                         if link_checker::is_valid(&res) {
181                             None
182                         } else {
183                             Some((page_path, link, res))
184                         }
185                     })
186                     .collect::<Vec<_>>()
187             })
188             .flatten()
189             .collect::<Vec<_>>()
190     });
191 
192     println!("> Checked {} external link(s): {} error(s) found.", all_links.len(), errors.len());
193 
194     if errors.is_empty() {
195         return Ok(());
196     }
197 
198     let msg = errors
199         .into_iter()
200         .map(|(page_path, link, check_res)| {
201             format!(
202                 "Dead link in {} to {}: {}",
203                 page_path.to_string_lossy(),
204                 link,
205                 link_checker::message(&check_res)
206             )
207         })
208         .collect::<Vec<_>>()
209         .join("\n");
210 
211     Err(Error { kind: ErrorKind::Msg(msg), source: None })
212 }
213