1 use rayon::prelude::*;
2
3 use crate::Site;
4 use core::time;
5 use errors::{bail, Result};
6 use errors::{Error, ErrorKind};
7 use std::{collections::HashMap, path::PathBuf, thread};
8 use url::Url;
9
10 /// Check whether all internal links pointing to explicit anchor fragments are valid.
11 ///
12 /// This is very similar to `check_external_links`, although internal links checking
13 /// is always performed (while external ones only conditionally in `zola check`).
check_internal_links_with_anchors(site: &Site) -> Result<()>14 pub fn check_internal_links_with_anchors(site: &Site) -> Result<()> {
15 println!("Checking all internal links with anchors.");
16 let library = site.library.write().expect("Get lock for check_internal_links_with_anchors");
17
18 // Chain all internal links, from both sections and pages.
19 let page_links = library
20 .pages()
21 .values()
22 .map(|p| {
23 let path = &p.file.path;
24 p.internal_links.iter().map(move |l| (path.clone(), l))
25 })
26 .flatten();
27 let section_links = library
28 .sections()
29 .values()
30 .map(|p| {
31 let path = &p.file.path;
32 p.internal_links.iter().map(move |l| (path.clone(), l))
33 })
34 .flatten();
35 let all_links = page_links.chain(section_links);
36
37 // Only keep links with anchor fragments, and count them too.
38 // Bare files have already been checked elsewhere, thus they are not interesting here.
39 let mut anchors_total = 0usize;
40 let links_with_anchors = all_links
41 .filter_map(|(page_path, link)| match link {
42 (md_path, Some(anchor)) => Some((page_path, md_path, anchor)),
43 _ => None,
44 })
45 .inspect(|_| anchors_total = anchors_total.saturating_add(1));
46
47 // Check for targets existence (including anchors), then keep only the faulty
48 // entries for error reporting purposes.
49 let missing_targets = links_with_anchors.filter(|(_, md_path, anchor)| {
50 // There are a few `expect` here since the presence of the .md file will
51 // already have been checked in the markdown rendering
52 let mut full_path = site.base_path.clone();
53 full_path.push("content");
54 for part in md_path.split('/') {
55 full_path.push(part);
56 }
57 if md_path.contains("_index.md") {
58 let section = library
59 .get_section(&full_path)
60 .expect("Couldn't find section in check_internal_links_with_anchors");
61 !section.has_anchor(anchor)
62 } else {
63 let page = library
64 .get_page(&full_path)
65 .expect("Couldn't find section in check_internal_links_with_anchors");
66 !page.has_anchor(anchor)
67 }
68 });
69
70 // Format faulty entries into error messages, and collect them.
71 let errors = missing_targets
72 .map(|(page_path, md_path, anchor)| {
73 format!(
74 "The anchor in the link `@/{}#{}` in {} does not exist.",
75 md_path,
76 anchor,
77 page_path.to_string_lossy(),
78 )
79 })
80 .collect::<Vec<_>>();
81
82 // Finally emit a summary, and return overall anchors-checking result.
83 match errors.len() {
84 0 => {
85 println!("> Successfully checked {} internal link(s) with anchors.", anchors_total);
86 Ok(())
87 }
88 errors_total => {
89 println!(
90 "> Checked {} internal link(s) with anchors: {} target(s) missing.",
91 anchors_total, errors_total,
92 );
93 Err(Error { kind: ErrorKind::Msg(errors.join("\n")), source: None })
94 }
95 }
96 }
97
get_link_domain(link: &str) -> Result<String>98 fn get_link_domain(link: &str) -> Result<String> {
99 return match Url::parse(link) {
100 Ok(url) => match url.host_str().map(String::from) {
101 Some(domain_str) => Ok(domain_str),
102 None => bail!("could not parse domain `{}` from link", link),
103 },
104 Err(err) => bail!("could not parse domain `{}` from link: `{}`", link, err),
105 };
106 }
107
check_external_links(site: &Site) -> Result<()>108 pub fn check_external_links(site: &Site) -> Result<()> {
109 let library = site.library.write().expect("Get lock for check_external_links");
110
111 let mut all_links: Vec<(PathBuf, String, String)> = vec![];
112
113 for p in library.pages_values().into_iter() {
114 for external_link in p.clone().external_links.into_iter() {
115 let domain = get_link_domain(&external_link)?;
116 all_links.push((p.file.path.clone(), external_link, domain));
117 }
118 }
119
120 for s in library.sections_values().into_iter() {
121 for external_link in s.clone().external_links.into_iter() {
122 let domain = get_link_domain(&external_link)?;
123 all_links.push((s.file.path.clone(), external_link, domain));
124 }
125 }
126
127 println!("Checking {} external link(s).", all_links.len());
128
129 let mut links_by_domain: HashMap<String, Vec<(PathBuf, String)>> = HashMap::new();
130
131 for link in all_links.iter() {
132 links_by_domain.entry(link.2.to_string()).or_default();
133 // Insert content path and link under the domain key
134 links_by_domain
135 .get_mut(&link.2.to_string())
136 .unwrap()
137 .push((link.0.clone(), link.1.clone()));
138 }
139
140 if all_links.is_empty() {
141 return Ok(());
142 }
143
144 // create thread pool with lots of threads so we can fetch
145 // (almost) all pages simultaneously, limiting all links for a single
146 // domain to one thread to avoid rate-limiting
147 let threads = std::cmp::min(links_by_domain.len(), 8);
148 let pool = rayon::ThreadPoolBuilder::new()
149 .num_threads(threads)
150 .build()
151 .map_err(|e| Error { kind: ErrorKind::Msg(e.to_string()), source: None })?;
152
153 let errors = pool.install(|| {
154 links_by_domain
155 .par_iter()
156 .map(|(_domain, links)| {
157 let mut links_to_process = links.len();
158 links
159 .iter()
160 .filter_map(move |(page_path, link)| {
161 links_to_process -= 1;
162
163 if site
164 .config
165 .link_checker
166 .skip_prefixes
167 .iter()
168 .any(|prefix| link.starts_with(prefix))
169 {
170 return None;
171 }
172
173 let res = link_checker::check_url(link, &site.config.link_checker);
174
175 if links_to_process > 0 {
176 // Prevent rate-limiting, wait before next crawl unless we're done with this domain
177 thread::sleep(time::Duration::from_millis(500));
178 }
179
180 if link_checker::is_valid(&res) {
181 None
182 } else {
183 Some((page_path, link, res))
184 }
185 })
186 .collect::<Vec<_>>()
187 })
188 .flatten()
189 .collect::<Vec<_>>()
190 });
191
192 println!("> Checked {} external link(s): {} error(s) found.", all_links.len(), errors.len());
193
194 if errors.is_empty() {
195 return Ok(());
196 }
197
198 let msg = errors
199 .into_iter()
200 .map(|(page_path, link, check_res)| {
201 format!(
202 "Dead link in {} to {}: {}",
203 page_path.to_string_lossy(),
204 link,
205 link_checker::message(&check_res)
206 )
207 })
208 .collect::<Vec<_>>()
209 .join("\n");
210
211 Err(Error { kind: ErrorKind::Msg(msg), source: None })
212 }
213