1 #[macro_use]
2 extern crate html5ever;
3 #[macro_use]
4 extern crate lazy_static;
5 
6 use html5ever::serialize::{serialize, SerializeOpts};
7 use html5ever::{driver as html, QualName};
8 use markup5ever_rcdom::{Handle, NodeData, RcDom, SerializableHandle};
9 use pulldown_cmark::{Options, Parser};
10 
11 use regex::Regex;
12 use std::collections::HashSet;
13 use std::mem;
14 use std::rc::{Rc, Weak};
15 use tendril::stream::TendrilSink;
16 
17 mod suite;
18 
19 #[inline(never)]
test_markdown_html(input: &str, output: &str, smart_punct: bool)20 pub fn test_markdown_html(input: &str, output: &str, smart_punct: bool) {
21     let mut s = String::new();
22 
23     let mut opts = Options::empty();
24     opts.insert(Options::ENABLE_TABLES);
25     opts.insert(Options::ENABLE_FOOTNOTES);
26     opts.insert(Options::ENABLE_STRIKETHROUGH);
27     opts.insert(Options::ENABLE_TASKLISTS);
28     if smart_punct {
29         opts.insert(Options::ENABLE_SMART_PUNCTUATION);
30     }
31 
32     let p = Parser::new_ext(input, opts);
33     pulldown_cmark::html::push_html(&mut s, p);
34 
35     assert_eq!(normalize_html(output), normalize_html(&s));
36 }
37 
38 lazy_static! {
39     static ref WHITESPACE_RE: Regex = Regex::new(r"\s+").unwrap();
40     static ref LEADING_WHITESPACE_RE: Regex = Regex::new(r"\A\s+").unwrap();
41     static ref TRAILING_WHITESPACE_RE: Regex = Regex::new(r"\s+\z").unwrap();
42     static ref BLOCK_TAGS: HashSet<&'static str> = [
43         "article",
44         "header",
45         "aside",
46         "hgroup",
47         "blockquote",
48         "hr",
49         "iframe",
50         "body",
51         "li",
52         "map",
53         "button",
54         "object",
55         "canvas",
56         "ol",
57         "caption",
58         "output",
59         "col",
60         "p",
61         "colgroup",
62         "pre",
63         "dd",
64         "progress",
65         "div",
66         "section",
67         "dl",
68         "table",
69         "td",
70         "dt",
71         "tbody",
72         "embed",
73         "textarea",
74         "fieldset",
75         "tfoot",
76         "figcaption",
77         "th",
78         "figure",
79         "thead",
80         "footer",
81         "tr",
82         "form",
83         "ul",
84         "h1",
85         "h2",
86         "h3",
87         "h4",
88         "h5",
89         "h6",
90         "video",
91         "script",
92         "style"
93     ]
94     .iter()
95     .cloned()
96     .collect();
97     static ref WHITESPACE_SENSITIVE_TAGS: HashSet<&'static str> =
98         ["pre", "code", "h1", "h2", "h3", "h4", "h5", "h6"]
99             .iter()
100             .cloned()
101             .collect();
102     static ref TABLE_TAGS: HashSet<&'static str> = ["table", "thead", "tbody", "tr", "td"]
103         .iter()
104         .cloned()
105         .collect();
106 }
107 
make_html_parser() -> html::Parser<RcDom>108 fn make_html_parser() -> html::Parser<RcDom> {
109     html::parse_fragment(
110         RcDom::default(),
111         html::ParseOpts::default(),
112         QualName::new(None, ns!(html), local_name!("div")),
113         vec![],
114     )
115 }
116 
normalize_html(s: &str) -> String117 fn normalize_html(s: &str) -> String {
118     let parser = make_html_parser();
119     let dom = parser.one(s);
120     let body: SerializableHandle = normalize_dom(&dom).into();
121     let opts = SerializeOpts::default();
122     let mut ret_val = Vec::new();
123     serialize(&mut ret_val, &body, opts)
124         .expect("Writing to a string shouldn't fail (expect on OOM)");
125     String::from_utf8(ret_val).expect("html5ever should always produce UTF8")
126 }
127 
normalize_dom(dom: &RcDom) -> Handle128 fn normalize_dom(dom: &RcDom) -> Handle {
129     let body = {
130         let children = dom.document.children.borrow();
131         children[0].clone()
132     };
133     let mut current_level = Vec::new();
134     let mut next_level = Vec::new();
135     current_level.extend(body.children.borrow().iter().cloned().rev());
136     loop {
137         while let Some(mut node) = current_level.pop() {
138             let parent = node.parent.replace(None);
139             node.parent.replace(parent.clone());
140             let parent = parent
141                 .expect("a node in the DOM will have a parent, except the root, which is not processed")
142                 .upgrade().expect("a node's parent will be pointed to by its parent (or the root pointer), and will not be dropped");
143             let retain = normalize_node(&parent, &mut node);
144             if !retain {
145                 let mut siblings = parent.children.borrow_mut();
146                 siblings.retain(|s| !Rc::ptr_eq(&node, s));
147             } else {
148                 next_level.extend(node.children.borrow().iter().cloned().rev());
149             }
150         }
151         if next_level.is_empty() {
152             break;
153         };
154         mem::swap(&mut next_level, &mut current_level);
155     }
156     body
157 }
158 
159 // Returns false if node is an empty text node or an empty tbody.
160 // Returns true otherwise.
normalize_node(parent: &Handle, node: &mut Handle) -> bool161 fn normalize_node(parent: &Handle, node: &mut Handle) -> bool {
162     match node.data {
163         NodeData::Comment { .. }
164         | NodeData::Doctype { .. }
165         | NodeData::Document
166         | NodeData::ProcessingInstruction { .. } => true,
167         NodeData::Text { ref contents, .. } => {
168             let mut contents = contents.borrow_mut();
169             let is_pre = {
170                 let mut parent = parent.clone();
171                 loop {
172                     let is_pre = if let NodeData::Element { ref name, .. } = parent.data {
173                         WHITESPACE_SENSITIVE_TAGS.contains(&&*name.local.to_ascii_lowercase())
174                     } else {
175                         false
176                     };
177                     if is_pre {
178                         break true;
179                     };
180                     let parent_ = parent.parent.replace(None);
181                     parent.parent.replace(parent_.clone());
182                     let parent_ = parent_.as_ref().and_then(Weak::upgrade);
183                     if let Some(parent_) = parent_ {
184                         parent = parent_
185                     } else {
186                         break false;
187                     };
188                 }
189             };
190             if !is_pre {
191                 let (is_first_in_block, is_last_in_block) = {
192                     let mut is_first_in_block = true;
193                     let mut is_last_in_block = true;
194                     let mut parent = parent.clone();
195                     let mut node = node.clone();
196                     loop {
197                         let reached_block = if let NodeData::Element { ref name, .. } = parent.data
198                         {
199                             BLOCK_TAGS.contains(&&*name.local.to_ascii_lowercase())
200                         } else {
201                             false
202                         };
203                         let (is_first, is_last) = {
204                             let siblings = parent.children.borrow();
205                             let n = &node;
206                             (
207                                 siblings.get(0).map(|s| Rc::ptr_eq(s, n)).unwrap_or(false),
208                                 siblings.len() > 0
209                                     && siblings
210                                         .get(siblings.len() - 1)
211                                         .map(|s| Rc::ptr_eq(s, n))
212                                         .unwrap_or(false),
213                             )
214                         };
215                         is_first_in_block = is_first_in_block && is_first;
216                         is_last_in_block = is_last_in_block && is_last;
217                         if (is_first_in_block || is_last_in_block) && !reached_block {
218                             node = parent.clone();
219                             let parent_ = parent.parent.replace(None);
220                             parent.parent.replace(parent_.clone());
221                             let parent_ = parent_.as_ref().and_then(Weak::upgrade);
222                             if let Some(parent_) = parent_ {
223                                 parent = parent_;
224                             } else {
225                                 break (is_first_in_block, is_last_in_block);
226                             }
227                         } else {
228                             break (is_first_in_block, is_last_in_block);
229                         }
230                     }
231                 };
232                 let is_preceeded_by_ws = {
233                     let mut parent = parent.clone();
234                     let mut node = node.clone();
235                     'ascent: loop {
236                         let is_first = {
237                             let siblings = parent.children.borrow();
238                             let n = &node;
239                             siblings.get(0).map(|s| Rc::ptr_eq(s, n)).unwrap_or(false)
240                         };
241                         if is_first {
242                             node = parent.clone();
243                             let parent_ = parent.parent.replace(None);
244                             parent.parent.replace(parent_.clone());
245                             let parent_ = parent_.as_ref().and_then(Weak::upgrade);
246                             if let Some(parent_) = parent_ {
247                                 parent = parent_;
248                             } else {
249                                 break 'ascent false;
250                             }
251                         } else {
252                             let siblings = parent.children.borrow();
253                             let n = &node;
254                             let mut pos = !0;
255                             'search: for (i, s) in siblings.iter().enumerate() {
256                                 if Rc::ptr_eq(s, n) {
257                                     pos = i;
258                                     break 'search;
259                                 }
260                             }
261                             assert!(
262                                 pos != !0,
263                                 "The list of node's parent's children shall contain node"
264                             );
265                             assert!(
266                                 pos != 0,
267                                 "If node is not first, then node's position shall not be zero"
268                             );
269                             let mut preceeding = siblings[pos - 1].clone();
270                             'descent: loop {
271                                 if let NodeData::Text { .. } = preceeding.data {
272                                     break 'descent;
273                                 }
274                                 preceeding = {
275                                     let ch = preceeding.children.borrow();
276                                     if ch.len() == 0 {
277                                         break 'descent;
278                                     }
279                                     if let Some(preceeding_) = ch.get(ch.len() - 1) {
280                                         preceeding_.clone()
281                                     } else {
282                                         break 'descent;
283                                     }
284                                 };
285                             }
286                             if let NodeData::Text { ref contents, .. } = preceeding.data {
287                                 break 'ascent TRAILING_WHITESPACE_RE.is_match(&*contents.borrow());
288                             } else {
289                                 break 'ascent false;
290                             }
291                         }
292                     }
293                 };
294 
295                 let is_in_table = if let NodeData::Element { ref name, .. } = parent.data {
296                     TABLE_TAGS.contains(&&*name.local.to_ascii_lowercase())
297                 } else {
298                     false
299                 };
300                 let whitespace_replacement = if is_in_table { "" } else { " " };
301                 *contents = WHITESPACE_RE
302                     .replace_all(&*contents, whitespace_replacement)
303                     .as_ref()
304                     .into();
305 
306                 if is_first_in_block || is_preceeded_by_ws {
307                     *contents = LEADING_WHITESPACE_RE
308                         .replace_all(&*contents, "")
309                         .as_ref()
310                         .into();
311                 }
312                 if is_last_in_block {
313                     *contents = TRAILING_WHITESPACE_RE
314                         .replace_all(&*contents, "")
315                         .as_ref()
316                         .into();
317                 }
318                 // TODO: collapse whitespace when adjacent to whitespace.
319                 // For example, the whitespace in the span should be collapsed in all of these cases:
320                 //
321                 //     " <span> q </span> "
322                 //     "<b>q </b><span> q</span>"
323                 //     "<b>q <i></i></b><span> q</span>"
324                 //     "<b>q <i></i></b><span> q</span>"
325                 //     "q <b></b><span> q</span>"
326             }
327             &**contents != ""
328         }
329         NodeData::Element {
330             ref attrs,
331             ref name,
332             ..
333         } => {
334             let mut attrs = attrs.borrow_mut();
335             for a in attrs.iter_mut() {
336                 a.name.local = a.name.local.to_ascii_lowercase().into();
337             }
338             attrs.sort_by(|a: &html5ever::Attribute, b: &html5ever::Attribute| {
339                 (&*a.name.local).cmp(&*b.name.local)
340             });
341             let ascii_name = &*name.local.to_ascii_lowercase();
342             // drop empty tbody's
343             ascii_name != "tbody"
344                 || node.children.borrow().len() > 1
345                 || node
346                     .children
347                     .borrow()
348                     .iter()
349                     .next()
350                     .map(|only_child| match only_child.data {
351                         NodeData::Text { ref contents, .. } => {
352                             !contents.borrow().chars().all(|c| c.is_whitespace())
353                         }
354                         _ => true,
355                     })
356                     .unwrap_or(false)
357         }
358     }
359 }
360 
/// A newline that is the only content of a `<div>` is removed.
#[test]
fn strip_div_newline() {
    let normalized = normalize_html("<div>\n</div>");
    assert_eq!("<div></div>", normalized);
}
365 
/// A trailing newline at the end of the fragment is removed.
#[test]
fn strip_end_newline() {
    let normalized = normalize_html("test\n");
    assert_eq!("test", normalized);
}
370 
/// Two consecutive spaces inside text collapse to one.
#[test]
fn strip_double_space() {
    let normalized = normalize_html("test  mess");
    assert_eq!("test mess", normalized);
}
375 
/// Whitespace inside inline elements is trimmed where it borders other
/// whitespace or the edge of the fragment.
#[test]
fn strip_inline_internal_text() {
    let normalized = normalize_html("<u> a </u> b <u> c </u>");
    assert_eq!("<u>a </u>b <u>c</u>", normalized);
}
383 
/// Same as `strip_inline_internal_text`, but the fragment itself is also
/// wrapped in leading and trailing whitespace.
#[test]
fn strip_inline_block_internal_text() {
    let normalized = normalize_html(" <u> a </u> b <u> c </u> ");
    assert_eq!("<u>a </u>b <u>c</u>", normalized);
}
391 
/// Single spaces that separate inline elements from text are preserved.
#[test]
fn leaves_necessary_whitespace_alone() {
    let input = "<u>a</u> b <u>c</u>";
    let normalized = normalize_html(input);
    assert_eq!("<u>a</u> b <u>c</u>", normalized);
}
396 
/// Leading whitespace of the fragment is dropped while the meaningful space
/// inside the first inline element is kept.
#[test]
fn leaves_necessary_whitespace_alone_weird() {
    let normalized = normalize_html(" <u>a </u>b <u>c</u>");
    assert_eq!("<u>a </u>b <u>c</u>", normalized);
}
404 
/// Inline elements that contain only whitespace end up empty.
#[test]
fn leaves_necessary_whitespace_all_nested() {
    let normalized = normalize_html("<u> </u><u> </u><u> </u><u> </u>");
    assert_eq!("<u></u><u></u><u></u><u></u>", normalized);
}
412 
/// A `tbody` that holds nothing but whitespace is removed from the table.
#[test]
fn drops_empty_tbody() {
    let normalized =
        normalize_html("<table><thead><tr><td>hi</td></tr></thead><tbody>  </tbody></table>");
    assert_eq!(
        "<table><thead><tr><td>hi</td></tr></thead></table>",
        normalized
    );
}
420 
/// A `tbody` that contains a row is kept, and the table round-trips unchanged.
#[test]
fn leaves_nonempty_tbody() {
    let input = "<table><thead><tr><td>hi</td></tr></thead><tbody><tr></tr></tbody></table>";
    let normalized = normalize_html(input);
    assert_eq!(input, normalized);
}
426