1 use crate::String;
2 use super::FormatCmd;
3 use crate::output::LocalizedQuotes;
4 use crate::IngestOptions;
5 
6 #[derive(Debug, Clone, PartialEq, Eq, Serialize)]
7 pub enum MicroNode {
8     Text(String),
9 
10     Formatted(Vec<MicroNode>, FormatCmd),
11 
12     Quoted {
13         /// Holds false until flip_flop_nodes gives it a meaningful value
14         is_inner: bool,
15         localized: LocalizedQuotes,
16         children: Vec<MicroNode>,
17     },
18 
19     NoCase(Vec<MicroNode>),
20 
21     NoDecor(Vec<MicroNode>),
22 }
23 
24 impl MicroNode {
take_text(&mut self) -> Option<String>25     pub(crate) fn take_text(&mut self) -> Option<String> {
26         match self {
27             MicroNode::Text(ref mut text) => Some(std::mem::replace(text, String::new())),
28             _ => None,
29         }
30     }
31 }
32 
33 use crate::output::markup::parse_quotes;
34 
35 impl MicroNode {
36     /// TODO: catch errors and get the input back as a String
parse(fragment: &str, options: &IngestOptions) -> Vec<MicroNode>37     pub fn parse(fragment: &str, options: &IngestOptions) -> Vec<MicroNode> {
38         let mut tag_parser = TagParser::new(&fragment);
39         let result: Vec<MicroNode> = tag_parser.walk(&MicroHtmlReader { options });
40         if !options.no_parse_quotes {
41             parse_quotes(result, options)
42         } else {
43             result
44         }
45     }
46 }
47 
48 pub trait HtmlReader<T> {
constructor(&self, tag: &Tag, children: Vec<T>) -> Vec<T>49     fn constructor(&self, tag: &Tag, children: Vec<T>) -> Vec<T>;
plain(&self, s: &str) -> Option<Vec<T>>50     fn plain(&self, s: &str) -> Option<Vec<T>>;
filter(&self, tag: &mut Tag)51     fn filter(&self, tag: &mut Tag) {
52         if tag.name == "html" || tag.name == "body" {
53             // ignore <html> and <body> tags, but still parse their children
54             tag.ignore_self();
55         } else if tag.name == "i" || tag.name == "b" || tag.name == "sup" || tag.name == "sub" {
56             // ok
57         } else if tag.name == "span" {
58             tag.allow_attribute("style");
59             tag.allow_attribute("class");
60         } else {
61             tag.ignore_self();
62         }
63     }
64 }
65 
micro_html_to_string(fragment: &str, options: &IngestOptions) -> String66 pub fn micro_html_to_string(fragment: &str, options: &IngestOptions) -> String {
67     let mut parser = TagParser::new(&fragment);
68     let reader = PlainHtmlReader {
69         options: options.clone(),
70     };
71     let result: Vec<String> = parser.walk(&reader);
72     let mut res: Option<String> = None;
73     for r in result {
74         res = match res {
75             Some(ref mut acc) => {
76                 acc.push_str(&r);
77                 continue;
78             }
79             None => Some(r),
80         }
81     }
82     res.unwrap_or_default()
83 }
84 
85 struct PlainHtmlReader {
86     options: IngestOptions,
87 }
88 
89 impl HtmlReader<String> for PlainHtmlReader {
constructor(&self, tag: &Tag, children: Vec<String>) -> Vec<String>90     fn constructor(&self, tag: &Tag, children: Vec<String>) -> Vec<String> {
91         match tag.name {
92             "i" => children,
93             "b" => children,
94             "sup" => children,
95             "sub" => children,
96             "span" => match tag.attrs {
97                 // very specific!
98                 [("style", "font-variant:small-caps;")]
99                 | [("style", "font-variant: small-caps;")]
100                 | [("class", "nocase")]
101                 | [("class", "nodecor")] => children,
102                 _ => return vec![],
103             },
104             _ => return vec![],
105         }
106     }
107 
plain(&self, s: &str) -> Option<Vec<String>>108     fn plain(&self, s: &str) -> Option<Vec<String>> {
109         let plain = self.options.plain(s);
110         Some(vec![plain.as_ref().into()])
111     }
112 }
113 
114 struct MicroHtmlReader<'a> {
115     options: &'a IngestOptions,
116 }
117 
118 impl HtmlReader<MicroNode> for MicroHtmlReader<'_> {
constructor(&self, tag: &Tag, children: Vec<MicroNode>) -> Vec<MicroNode>119     fn constructor(&self, tag: &Tag, children: Vec<MicroNode>) -> Vec<MicroNode> {
120         let single = match tag.name {
121             "i" => MicroNode::Formatted(children, FormatCmd::FontStyleItalic),
122             "b" => MicroNode::Formatted(children, FormatCmd::FontWeightBold),
123             "sup" => MicroNode::Formatted(children, FormatCmd::VerticalAlignmentSuperscript),
124             "sub" => MicroNode::Formatted(children, FormatCmd::VerticalAlignmentSubscript),
125             "span" => match tag.attrs {
126                 // very specific!
127                 [("style", "font-variant:small-caps;")]
128                 | [("style", "font-variant: small-caps;")] => {
129                     MicroNode::Formatted(children, FormatCmd::FontVariantSmallCaps)
130                 }
131                 [("class", "nocase")] => MicroNode::NoCase(children),
132                 [("class", "nodecor")] => MicroNode::NoDecor(children),
133                 // TODO: do we really want <span class="unrecognised">Children</span> to be removed
134                 // completely?
135                 _ => return vec![],
136             },
137             // TODO: Same here
138             _ => return vec![],
139         };
140         vec![single]
141     }
142 
plain<'input>(&self, s: &'input str) -> Option<Vec<MicroNode>>143     fn plain<'input>(&self, s: &'input str) -> Option<Vec<MicroNode>> {
144         let plain = self.options.plain(s);
145         Some(super::superscript::parse_sup_sub(&plain))
146     }
147 }
148 
/// Disallowed attributes (`class` on `<i>`) and disallowed tags (`<img>`) are
/// stripped, while the surrounding text and the allowed tags survive.
#[test]
fn test_sanitize() {
    use FormatCmd::*;
    use MicroNode::*;

    let fragment =
        r#"<span class="nocase"><i class="whatever">Italic</i></span> <img src="5" /> <b>Bold</b>"#;

    let expected = vec![
        NoCase(vec![Formatted(
            vec![Text("Italic".into())],
            FontStyleItalic,
        )]),
        // The spaces on either side of the removed <img> stay separate nodes.
        Text(" ".into()),
        Text(" ".into()),
        Formatted(vec![Text("Bold".into())], FontWeightBold),
    ];

    let parsed = MicroNode::parse(fragment, &Default::default());
    assert_eq!(parsed, expected);
}
169 
170 // The following is based on the MIT-licensed html_sanitizer crate,
171 // and adjusted to work on *inline* HTML, not entire documents.
172 //
173 // https://github.com/Trangar/html_sanitizer/blob/master/src/lib.rs
174 
175 use html5ever::driver::ParseOpts;
176 use html5ever::interface::QualName;
177 use markup5ever_rcdom::{Handle, NodeData, RcDom};
178 use html5ever::tendril::TendrilSink;
179 use html5ever::tree_builder::TreeBuilderOpts;
180 use html5ever::{local_name, parse_fragment, Namespace};
181 
182 struct TagParser {
183     dom: RcDom,
184 }
185 
186 use stringreader::StringReader;
187 
188 impl<'input> TagParser {
new(input: &'input str) -> Self189     fn new(input: &'input str) -> Self {
190         let opts = ParseOpts {
191             tree_builder: TreeBuilderOpts {
192                 drop_doctype: true,
193                 scripting_enabled: false,
194                 ..Default::default()
195             },
196             ..Default::default()
197         };
198         let html_p = QualName::new(
199             None,
200             Namespace::from("http://www.w3.org/1999/xhtml"),
201             local_name!("p"),
202         );
203         let mut reader = StringReader::new(input);
204         let dom = parse_fragment(RcDom::default(), opts, html_p, vec![])
205             .from_utf8()
206             .read_from(&mut reader)
207             .unwrap();
208         // println!("Errors: {:?}", dom.errors);
209 
210         TagParser { dom }
211     }
212 
internal_walk_micro<T, R>(handle: &Handle, callbacks: &R) -> Vec<T> where R: HtmlReader<T>,213     fn internal_walk_micro<T, R>(handle: &Handle, callbacks: &R) -> Vec<T>
214     where
215         R: HtmlReader<T>,
216     {
217         let mut output = Vec::new();
218 
219         if let NodeData::Element { name, attrs, .. } = &handle.data {
220             let name = &name.local;
221             let attrs = attrs.borrow();
222             let mut attributes = Vec::<(&str, &str)>::new();
223             for attr in attrs.iter() {
224                 attributes.push((&attr.name.local, &attr.value));
225             }
226             let mut tag = Tag::from_name_and_attrs(name, &attributes);
227             callbacks.filter(&mut tag);
228 
229             if tag.ignore_self && tag.ignore_contents {
230                 return output;
231             }
232             // if let Some(rewrite) = tag.rewrite {
233             //     return rewrite;
234             // }
235 
236             let attrs: Vec<(&str, &str)> = tag
237                 .attrs
238                 .iter()
239                 .filter(|a| tag.allowed_attributes.iter().any(|b| b == &a.0))
240                 .cloned()
241                 .collect();
242 
243             if !tag.ignore_self && !tag.ignore_contents {
244                 let proposed = Tag::from_name_and_attrs(tag.name, &attrs);
245                 let mut children = Vec::new();
246                 for child in handle.children.borrow().iter() {
247                     children.extend(TagParser::internal_walk_micro(child, callbacks));
248                 }
249                 output.extend(callbacks.constructor(&proposed, children));
250             } else if tag.ignore_self {
251                 for child in handle.children.borrow().iter() {
252                     output.extend(TagParser::internal_walk_micro(child, callbacks));
253                 }
254             } else if tag.ignore_contents {
255                 let proposed = Tag::from_name_and_attrs(tag.name, &attrs);
256                 output.extend(callbacks.constructor(&proposed, vec![]));
257             }
258         } else {
259             match &handle.data {
260                 NodeData::Document => {}
261                 NodeData::Doctype { .. } => {}
262                 NodeData::Text { contents } => {
263                     let cont = &contents.borrow();
264                     if let Some(s) = callbacks.plain(cont) {
265                         output.extend(s.into_iter())
266                     }
267                 }
268                 NodeData::Comment { .. } => {}
269                 NodeData::Element { .. } => unreachable!(),
270                 NodeData::ProcessingInstruction { .. } => debug!(
271                     // "Unknown enum tag: NodeData::ProcessingInstruction {{ {:?} {:?} }}",
272                     // target, contents
273                     "Unknown enum tag: NodeData::ProcessingInstruction",
274                 ),
275             }
276             for child in handle.children.borrow().iter() {
277                 output.extend(TagParser::internal_walk_micro(child, callbacks));
278             }
279         }
280         output
281     }
282 
283     /// Recursively walk through all the HTML nodes, calling `callback` for each tag.
walk<T, R>(&mut self, callbacks: &R) -> Vec<T> where R: HtmlReader<T>,284     fn walk<T, R>(&mut self, callbacks: &R) -> Vec<T>
285     where
286         R: HtmlReader<T>,
287     {
288         TagParser::internal_walk_micro(&self.dom.document, callbacks)
289     }
290 }
291 
/// Represents a single HTML node. You can read the `name` and `attrs` properties to figure out
/// what tag you're sanitizing.
///
/// A freshly created tag is kept, but with all of its attributes stripped unless they are added
/// with `allow_attribute` or `allow_attributes`.
pub struct Tag<'a> {
    /// The name of the HTML tag, e.g. 'div', 'img', etc.
    pub name: &'a str,

    /// The attributes of the HTML tag, e.g. ('style', 'width: 100%').
    pub attrs: &'a [(&'a str, &'a str)],

    /// Names of the attributes that survive filtering.
    allowed_attributes: Vec<&'static str>,
    /// When set, the tag itself is left out of the output.
    ignore_self: bool,
    /// When set, the tag's children and text are left out of the output.
    ignore_contents: bool,
}

impl<'a> Tag<'a> {
    /// Creates a tag with no allowed attributes and neither ignore flag set.
    fn from_name_and_attrs(name: &'a str, attrs: &'a [(&'a str, &'a str)]) -> Tag<'a> {
        Self {
            name,
            attrs,
            allowed_attributes: vec![],
            ignore_self: false,
            ignore_contents: false,
        }
    }

    /// Allow the given attribute. This attribute does not have to exist in the `attrs` tag.
    ///
    /// An allowed attribute that is present on the tag is kept; every other attribute is dropped.
    pub fn allow_attribute(&mut self, attr: &'static str) {
        self.allowed_attributes.push(attr);
    }

    /// Allow the given attributes. These attributes do not have to exist in the `attrs` tag.
    ///
    /// Equivalent to calling `allow_attribute` once per entry, in order.
    pub fn allow_attributes(&mut self, attrs: &[&'static str]) {
        self.allowed_attributes.extend_from_slice(attrs);
    }

    /// Ignore this tag and everything inside it: neither the node itself nor any of its child
    /// nodes or text content will appear in the output.
    pub fn ignore_self_and_contents(&mut self) {
        self.ignore_contents = true;
        self.ignore_self = true;
    }

    /// Ignore this tag only: the node itself is left out of the output, while its child nodes
    /// and text content are still included.
    pub fn ignore_self(&mut self) {
        self.ignore_self = true;
    }
}
347