1 #[macro_use]
2 extern crate html5ever;
3 #[macro_use]
4 extern crate lazy_static;
5
6 use html5ever::serialize::{serialize, SerializeOpts};
7 use html5ever::{driver as html, QualName};
8 use markup5ever_rcdom::{Handle, NodeData, RcDom, SerializableHandle};
9 use pulldown_cmark::{Options, Parser};
10
11 use regex::Regex;
12 use std::collections::HashSet;
13 use std::mem;
14 use std::rc::{Rc, Weak};
15 use tendril::stream::TendrilSink;
16
17 mod suite;
18
19 #[inline(never)]
test_markdown_html(input: &str, output: &str, smart_punct: bool)20 pub fn test_markdown_html(input: &str, output: &str, smart_punct: bool) {
21 let mut s = String::new();
22
23 let mut opts = Options::empty();
24 opts.insert(Options::ENABLE_TABLES);
25 opts.insert(Options::ENABLE_FOOTNOTES);
26 opts.insert(Options::ENABLE_STRIKETHROUGH);
27 opts.insert(Options::ENABLE_TASKLISTS);
28 if smart_punct {
29 opts.insert(Options::ENABLE_SMART_PUNCTUATION);
30 }
31
32 let p = Parser::new_ext(input, opts);
33 pulldown_cmark::html::push_html(&mut s, p);
34
35 assert_eq!(normalize_html(output), normalize_html(&s));
36 }
37
38 lazy_static! {
39 static ref WHITESPACE_RE: Regex = Regex::new(r"\s+").unwrap();
40 static ref LEADING_WHITESPACE_RE: Regex = Regex::new(r"\A\s+").unwrap();
41 static ref TRAILING_WHITESPACE_RE: Regex = Regex::new(r"\s+\z").unwrap();
42 static ref BLOCK_TAGS: HashSet<&'static str> = [
43 "article",
44 "header",
45 "aside",
46 "hgroup",
47 "blockquote",
48 "hr",
49 "iframe",
50 "body",
51 "li",
52 "map",
53 "button",
54 "object",
55 "canvas",
56 "ol",
57 "caption",
58 "output",
59 "col",
60 "p",
61 "colgroup",
62 "pre",
63 "dd",
64 "progress",
65 "div",
66 "section",
67 "dl",
68 "table",
69 "td",
70 "dt",
71 "tbody",
72 "embed",
73 "textarea",
74 "fieldset",
75 "tfoot",
76 "figcaption",
77 "th",
78 "figure",
79 "thead",
80 "footer",
81 "tr",
82 "form",
83 "ul",
84 "h1",
85 "h2",
86 "h3",
87 "h4",
88 "h5",
89 "h6",
90 "video",
91 "script",
92 "style"
93 ]
94 .iter()
95 .cloned()
96 .collect();
97 static ref WHITESPACE_SENSITIVE_TAGS: HashSet<&'static str> =
98 ["pre", "code", "h1", "h2", "h3", "h4", "h5", "h6"]
99 .iter()
100 .cloned()
101 .collect();
102 static ref TABLE_TAGS: HashSet<&'static str> = ["table", "thead", "tbody", "tr", "td"]
103 .iter()
104 .cloned()
105 .collect();
106 }
107
make_html_parser() -> html::Parser<RcDom>108 fn make_html_parser() -> html::Parser<RcDom> {
109 html::parse_fragment(
110 RcDom::default(),
111 html::ParseOpts::default(),
112 QualName::new(None, ns!(html), local_name!("div")),
113 vec![],
114 )
115 }
116
normalize_html(s: &str) -> String117 fn normalize_html(s: &str) -> String {
118 let parser = make_html_parser();
119 let dom = parser.one(s);
120 let body: SerializableHandle = normalize_dom(&dom).into();
121 let opts = SerializeOpts::default();
122 let mut ret_val = Vec::new();
123 serialize(&mut ret_val, &body, opts)
124 .expect("Writing to a string shouldn't fail (expect on OOM)");
125 String::from_utf8(ret_val).expect("html5ever should always produce UTF8")
126 }
127
normalize_dom(dom: &RcDom) -> Handle128 fn normalize_dom(dom: &RcDom) -> Handle {
129 let body = {
130 let children = dom.document.children.borrow();
131 children[0].clone()
132 };
133 let mut current_level = Vec::new();
134 let mut next_level = Vec::new();
135 current_level.extend(body.children.borrow().iter().cloned().rev());
136 loop {
137 while let Some(mut node) = current_level.pop() {
138 let parent = node.parent.replace(None);
139 node.parent.replace(parent.clone());
140 let parent = parent
141 .expect("a node in the DOM will have a parent, except the root, which is not processed")
142 .upgrade().expect("a node's parent will be pointed to by its parent (or the root pointer), and will not be dropped");
143 let retain = normalize_node(&parent, &mut node);
144 if !retain {
145 let mut siblings = parent.children.borrow_mut();
146 siblings.retain(|s| !Rc::ptr_eq(&node, s));
147 } else {
148 next_level.extend(node.children.borrow().iter().cloned().rev());
149 }
150 }
151 if next_level.is_empty() {
152 break;
153 };
154 mem::swap(&mut next_level, &mut current_level);
155 }
156 body
157 }
158
159 // Returns false if node is an empty text node or an empty tbody.
160 // Returns true otherwise.
normalize_node(parent: &Handle, node: &mut Handle) -> bool161 fn normalize_node(parent: &Handle, node: &mut Handle) -> bool {
162 match node.data {
163 NodeData::Comment { .. }
164 | NodeData::Doctype { .. }
165 | NodeData::Document
166 | NodeData::ProcessingInstruction { .. } => true,
167 NodeData::Text { ref contents, .. } => {
168 let mut contents = contents.borrow_mut();
169 let is_pre = {
170 let mut parent = parent.clone();
171 loop {
172 let is_pre = if let NodeData::Element { ref name, .. } = parent.data {
173 WHITESPACE_SENSITIVE_TAGS.contains(&&*name.local.to_ascii_lowercase())
174 } else {
175 false
176 };
177 if is_pre {
178 break true;
179 };
180 let parent_ = parent.parent.replace(None);
181 parent.parent.replace(parent_.clone());
182 let parent_ = parent_.as_ref().and_then(Weak::upgrade);
183 if let Some(parent_) = parent_ {
184 parent = parent_
185 } else {
186 break false;
187 };
188 }
189 };
190 if !is_pre {
191 let (is_first_in_block, is_last_in_block) = {
192 let mut is_first_in_block = true;
193 let mut is_last_in_block = true;
194 let mut parent = parent.clone();
195 let mut node = node.clone();
196 loop {
197 let reached_block = if let NodeData::Element { ref name, .. } = parent.data
198 {
199 BLOCK_TAGS.contains(&&*name.local.to_ascii_lowercase())
200 } else {
201 false
202 };
203 let (is_first, is_last) = {
204 let siblings = parent.children.borrow();
205 let n = &node;
206 (
207 siblings.get(0).map(|s| Rc::ptr_eq(s, n)).unwrap_or(false),
208 siblings.len() > 0
209 && siblings
210 .get(siblings.len() - 1)
211 .map(|s| Rc::ptr_eq(s, n))
212 .unwrap_or(false),
213 )
214 };
215 is_first_in_block = is_first_in_block && is_first;
216 is_last_in_block = is_last_in_block && is_last;
217 if (is_first_in_block || is_last_in_block) && !reached_block {
218 node = parent.clone();
219 let parent_ = parent.parent.replace(None);
220 parent.parent.replace(parent_.clone());
221 let parent_ = parent_.as_ref().and_then(Weak::upgrade);
222 if let Some(parent_) = parent_ {
223 parent = parent_;
224 } else {
225 break (is_first_in_block, is_last_in_block);
226 }
227 } else {
228 break (is_first_in_block, is_last_in_block);
229 }
230 }
231 };
232 let is_preceeded_by_ws = {
233 let mut parent = parent.clone();
234 let mut node = node.clone();
235 'ascent: loop {
236 let is_first = {
237 let siblings = parent.children.borrow();
238 let n = &node;
239 siblings.get(0).map(|s| Rc::ptr_eq(s, n)).unwrap_or(false)
240 };
241 if is_first {
242 node = parent.clone();
243 let parent_ = parent.parent.replace(None);
244 parent.parent.replace(parent_.clone());
245 let parent_ = parent_.as_ref().and_then(Weak::upgrade);
246 if let Some(parent_) = parent_ {
247 parent = parent_;
248 } else {
249 break 'ascent false;
250 }
251 } else {
252 let siblings = parent.children.borrow();
253 let n = &node;
254 let mut pos = !0;
255 'search: for (i, s) in siblings.iter().enumerate() {
256 if Rc::ptr_eq(s, n) {
257 pos = i;
258 break 'search;
259 }
260 }
261 assert!(
262 pos != !0,
263 "The list of node's parent's children shall contain node"
264 );
265 assert!(
266 pos != 0,
267 "If node is not first, then node's position shall not be zero"
268 );
269 let mut preceeding = siblings[pos - 1].clone();
270 'descent: loop {
271 if let NodeData::Text { .. } = preceeding.data {
272 break 'descent;
273 }
274 preceeding = {
275 let ch = preceeding.children.borrow();
276 if ch.len() == 0 {
277 break 'descent;
278 }
279 if let Some(preceeding_) = ch.get(ch.len() - 1) {
280 preceeding_.clone()
281 } else {
282 break 'descent;
283 }
284 };
285 }
286 if let NodeData::Text { ref contents, .. } = preceeding.data {
287 break 'ascent TRAILING_WHITESPACE_RE.is_match(&*contents.borrow());
288 } else {
289 break 'ascent false;
290 }
291 }
292 }
293 };
294
295 let is_in_table = if let NodeData::Element { ref name, .. } = parent.data {
296 TABLE_TAGS.contains(&&*name.local.to_ascii_lowercase())
297 } else {
298 false
299 };
300 let whitespace_replacement = if is_in_table { "" } else { " " };
301 *contents = WHITESPACE_RE
302 .replace_all(&*contents, whitespace_replacement)
303 .as_ref()
304 .into();
305
306 if is_first_in_block || is_preceeded_by_ws {
307 *contents = LEADING_WHITESPACE_RE
308 .replace_all(&*contents, "")
309 .as_ref()
310 .into();
311 }
312 if is_last_in_block {
313 *contents = TRAILING_WHITESPACE_RE
314 .replace_all(&*contents, "")
315 .as_ref()
316 .into();
317 }
318 // TODO: collapse whitespace when adjacent to whitespace.
319 // For example, the whitespace in the span should be collapsed in all of these cases:
320 //
321 // " <span> q </span> "
322 // "<b>q </b><span> q</span>"
323 // "<b>q <i></i></b><span> q</span>"
324 // "<b>q <i></i></b><span> q</span>"
325 // "q <b></b><span> q</span>"
326 }
327 &**contents != ""
328 }
329 NodeData::Element {
330 ref attrs,
331 ref name,
332 ..
333 } => {
334 let mut attrs = attrs.borrow_mut();
335 for a in attrs.iter_mut() {
336 a.name.local = a.name.local.to_ascii_lowercase().into();
337 }
338 attrs.sort_by(|a: &html5ever::Attribute, b: &html5ever::Attribute| {
339 (&*a.name.local).cmp(&*b.name.local)
340 });
341 let ascii_name = &*name.local.to_ascii_lowercase();
342 // drop empty tbody's
343 ascii_name != "tbody"
344 || node.children.borrow().len() > 1
345 || node
346 .children
347 .borrow()
348 .iter()
349 .next()
350 .map(|only_child| match only_child.data {
351 NodeData::Text { ref contents, .. } => {
352 !contents.borrow().chars().all(|c| c.is_whitespace())
353 }
354 _ => true,
355 })
356 .unwrap_or(false)
357 }
358 }
359 }
360
// A newline that is the only content of a `div` is removed.
#[test]
fn strip_div_newline() {
    let normalized = normalize_html("<div>\n</div>");
    assert_eq!("<div></div>", normalized);
}
365
// A trailing newline at the end of the fragment is removed.
#[test]
fn strip_end_newline() {
    let normalized = normalize_html("test\n");
    assert_eq!("test", normalized);
}
370
// Checks the normalized form of text that contains interior whitespace.
#[test]
fn strip_double_space() {
    let normalized = normalize_html("test mess");
    assert_eq!("test mess", normalized);
}
375
// Whitespace inside inline elements is stripped at the start of the fragment,
// after text that already ends in whitespace, and at the end of the fragment.
#[test]
fn strip_inline_internal_text() {
    let normalized = normalize_html("<u> a </u> b <u> c </u>");
    assert_eq!("<u>a </u>b <u>c</u>", normalized)
}
383
// Same as `strip_inline_internal_text`, with extra whitespace text around the
// inline elements at both ends of the fragment.
#[test]
fn strip_inline_block_internal_text() {
    let normalized = normalize_html(" <u> a </u> b <u> c </u> ");
    assert_eq!("<u>a </u>b <u>c</u>", normalized)
}
391
// Single spaces separating inline elements from text are kept as they are.
#[test]
fn leaves_necessary_whitespace_alone() {
    let input = "<u>a</u> b <u>c</u>";
    assert_eq!("<u>a</u> b <u>c</u>", normalize_html(input))
}
396
// Leading whitespace of the fragment is dropped while the spaces between
// words are kept.
#[test]
fn leaves_necessary_whitespace_alone_weird() {
    let normalized = normalize_html(" <u>a </u>b <u>c</u>");
    assert_eq!("<u>a </u>b <u>c</u>", normalized)
}
404
// Whitespace-only text inside consecutive inline elements is removed entirely.
#[test]
fn leaves_necessary_whitespace_all_nested() {
    let normalized = normalize_html("<u> </u><u> </u><u> </u><u> </u>");
    assert_eq!("<u></u><u></u><u></u><u></u>", normalized)
}
412
// A `tbody` containing only whitespace is removed from the table.
#[test]
fn drops_empty_tbody() {
    let normalized =
        normalize_html("<table><thead><tr><td>hi</td></tr></thead><tbody> </tbody></table>");
    assert_eq!(
        "<table><thead><tr><td>hi</td></tr></thead></table>",
        normalized
    )
}
420
// A `tbody` that contains a row is kept, and the markup round-trips unchanged.
#[test]
fn leaves_nonempty_tbody() {
    let input = "<table><thead><tr><td>hi</td></tr></thead><tbody><tr></tr></tbody></table>";
    let normalized = normalize_html(input);
    assert_eq!(input, normalized)
}
426