1 use crate::String;
2 use super::FormatCmd;
3 use crate::output::LocalizedQuotes;
4 use crate::IngestOptions;
5
/// One node of the tree produced by parsing an inline "micro-HTML" fragment
/// (see `MicroNode::parse`).
#[derive(Debug, Clone, PartialEq, Eq, Serialize)]
pub enum MicroNode {
    /// A run of plain text.
    Text(String),

    /// Child nodes with one formatting command applied. The parser in this
    /// file builds these from `<i>`, `<b>`, `<sup>`, `<sub>` and small-caps
    /// `<span>` elements.
    Formatted(Vec<MicroNode>, FormatCmd),

    /// NOTE(review): presumably a quoted run of nodes; this variant is not
    /// constructed by the HTML reader in this file — confirm against
    /// `parse_quotes`.
    Quoted {
        /// Holds false until flip_flop_nodes gives it a meaningful value
        is_inner: bool,
        /// NOTE(review): presumably the locale-specific quotation marks to
        /// use for this quote — confirm against `LocalizedQuotes`.
        localized: LocalizedQuotes,
        /// The nodes contained in this variant.
        children: Vec<MicroNode>,
    },

    /// Children of a `<span class="nocase">` element.
    NoCase(Vec<MicroNode>),

    /// Children of a `<span class="nodecor">` element.
    NoDecor(Vec<MicroNode>),
}
23
24 impl MicroNode {
take_text(&mut self) -> Option<String>25 pub(crate) fn take_text(&mut self) -> Option<String> {
26 match self {
27 MicroNode::Text(ref mut text) => Some(std::mem::replace(text, String::new())),
28 _ => None,
29 }
30 }
31 }
32
33 use crate::output::markup::parse_quotes;
34
35 impl MicroNode {
36 /// TODO: catch errors and get the input back as a String
parse(fragment: &str, options: &IngestOptions) -> Vec<MicroNode>37 pub fn parse(fragment: &str, options: &IngestOptions) -> Vec<MicroNode> {
38 let mut tag_parser = TagParser::new(&fragment);
39 let result: Vec<MicroNode> = tag_parser.walk(&MicroHtmlReader { options });
40 if !options.no_parse_quotes {
41 parse_quotes(result, options)
42 } else {
43 result
44 }
45 }
46 }
47
48 pub trait HtmlReader<T> {
constructor(&self, tag: &Tag, children: Vec<T>) -> Vec<T>49 fn constructor(&self, tag: &Tag, children: Vec<T>) -> Vec<T>;
plain(&self, s: &str) -> Option<Vec<T>>50 fn plain(&self, s: &str) -> Option<Vec<T>>;
filter(&self, tag: &mut Tag)51 fn filter(&self, tag: &mut Tag) {
52 if tag.name == "html" || tag.name == "body" {
53 // ignore <html> and <body> tags, but still parse their children
54 tag.ignore_self();
55 } else if tag.name == "i" || tag.name == "b" || tag.name == "sup" || tag.name == "sub" {
56 // ok
57 } else if tag.name == "span" {
58 tag.allow_attribute("style");
59 tag.allow_attribute("class");
60 } else {
61 tag.ignore_self();
62 }
63 }
64 }
65
micro_html_to_string(fragment: &str, options: &IngestOptions) -> String66 pub fn micro_html_to_string(fragment: &str, options: &IngestOptions) -> String {
67 let mut parser = TagParser::new(&fragment);
68 let reader = PlainHtmlReader {
69 options: options.clone(),
70 };
71 let result: Vec<String> = parser.walk(&reader);
72 let mut res: Option<String> = None;
73 for r in result {
74 res = match res {
75 Some(ref mut acc) => {
76 acc.push_str(&r);
77 continue;
78 }
79 None => Some(r),
80 }
81 }
82 res.unwrap_or_default()
83 }
84
/// `HtmlReader` that produces plain strings: the text inside recognised tags
/// is kept and the markup itself is discarded.
struct PlainHtmlReader {
    /// Ingest options; `options.plain(..)` is applied to every text node.
    options: IngestOptions,
}
88
89 impl HtmlReader<String> for PlainHtmlReader {
constructor(&self, tag: &Tag, children: Vec<String>) -> Vec<String>90 fn constructor(&self, tag: &Tag, children: Vec<String>) -> Vec<String> {
91 match tag.name {
92 "i" => children,
93 "b" => children,
94 "sup" => children,
95 "sub" => children,
96 "span" => match tag.attrs {
97 // very specific!
98 [("style", "font-variant:small-caps;")]
99 | [("style", "font-variant: small-caps;")]
100 | [("class", "nocase")]
101 | [("class", "nodecor")] => children,
102 _ => return vec![],
103 },
104 _ => return vec![],
105 }
106 }
107
plain(&self, s: &str) -> Option<Vec<String>>108 fn plain(&self, s: &str) -> Option<Vec<String>> {
109 let plain = self.options.plain(s);
110 Some(vec![plain.as_ref().into()])
111 }
112 }
113
/// `HtmlReader` that builds `MicroNode`s from the recognised inline tags.
struct MicroHtmlReader<'a> {
    /// Borrowed ingest options; `options.plain(..)` is applied to every text
    /// node before it is split into nodes.
    options: &'a IngestOptions,
}
117
118 impl HtmlReader<MicroNode> for MicroHtmlReader<'_> {
constructor(&self, tag: &Tag, children: Vec<MicroNode>) -> Vec<MicroNode>119 fn constructor(&self, tag: &Tag, children: Vec<MicroNode>) -> Vec<MicroNode> {
120 let single = match tag.name {
121 "i" => MicroNode::Formatted(children, FormatCmd::FontStyleItalic),
122 "b" => MicroNode::Formatted(children, FormatCmd::FontWeightBold),
123 "sup" => MicroNode::Formatted(children, FormatCmd::VerticalAlignmentSuperscript),
124 "sub" => MicroNode::Formatted(children, FormatCmd::VerticalAlignmentSubscript),
125 "span" => match tag.attrs {
126 // very specific!
127 [("style", "font-variant:small-caps;")]
128 | [("style", "font-variant: small-caps;")] => {
129 MicroNode::Formatted(children, FormatCmd::FontVariantSmallCaps)
130 }
131 [("class", "nocase")] => MicroNode::NoCase(children),
132 [("class", "nodecor")] => MicroNode::NoDecor(children),
133 // TODO: do we really want <span class="unrecognised">Children</span> to be removed
134 // completely?
135 _ => return vec![],
136 },
137 // TODO: Same here
138 _ => return vec![],
139 };
140 vec![single]
141 }
142
plain<'input>(&self, s: &'input str) -> Option<Vec<MicroNode>>143 fn plain<'input>(&self, s: &'input str) -> Option<Vec<MicroNode>> {
144 let plain = self.options.plain(s);
145 Some(super::superscript::parse_sup_sub(&plain))
146 }
147 }
148
/// Disallowed attributes and unknown tags are stripped; the whitespace between
/// the surviving elements is kept as separate text nodes.
#[test]
fn test_sanitize() {
    use FormatCmd::{FontStyleItalic, FontWeightBold};
    use MicroNode::{Formatted, NoCase, Text};

    let fragment =
        r#"<span class="nocase"><i class="whatever">Italic</i></span> <img src="5" /> <b>Bold</b>"#;

    let expected = vec![
        NoCase(vec![Formatted(vec![Text("Italic".into())], FontStyleItalic)]),
        Text(" ".into()),
        Text(" ".into()),
        Formatted(vec![Text("Bold".into())], FontWeightBold),
    ];

    let parsed = MicroNode::parse(fragment, &Default::default());
    assert_eq!(parsed, expected);
}
169
170 // The following is based on the MIT-licensed html_sanitizer crate,
171 // and adjusted to work on *inline* HTML, not entire documents.
172 //
173 // https://github.com/Trangar/html_sanitizer/blob/master/src/lib.rs
174
175 use html5ever::driver::ParseOpts;
176 use html5ever::interface::QualName;
177 use markup5ever_rcdom::{Handle, NodeData, RcDom};
178 use html5ever::tendril::TendrilSink;
179 use html5ever::tree_builder::TreeBuilderOpts;
180 use html5ever::{local_name, parse_fragment, Namespace};
181
/// Owns the DOM produced by parsing an HTML fragment, ready to be walked with
/// an `HtmlReader`.
struct TagParser {
    /// The parsed document tree.
    dom: RcDom,
}
185
186 use stringreader::StringReader;
187
188 impl<'input> TagParser {
new(input: &'input str) -> Self189 fn new(input: &'input str) -> Self {
190 let opts = ParseOpts {
191 tree_builder: TreeBuilderOpts {
192 drop_doctype: true,
193 scripting_enabled: false,
194 ..Default::default()
195 },
196 ..Default::default()
197 };
198 let html_p = QualName::new(
199 None,
200 Namespace::from("http://www.w3.org/1999/xhtml"),
201 local_name!("p"),
202 );
203 let mut reader = StringReader::new(input);
204 let dom = parse_fragment(RcDom::default(), opts, html_p, vec![])
205 .from_utf8()
206 .read_from(&mut reader)
207 .unwrap();
208 // println!("Errors: {:?}", dom.errors);
209
210 TagParser { dom }
211 }
212
internal_walk_micro<T, R>(handle: &Handle, callbacks: &R) -> Vec<T> where R: HtmlReader<T>,213 fn internal_walk_micro<T, R>(handle: &Handle, callbacks: &R) -> Vec<T>
214 where
215 R: HtmlReader<T>,
216 {
217 let mut output = Vec::new();
218
219 if let NodeData::Element { name, attrs, .. } = &handle.data {
220 let name = &name.local;
221 let attrs = attrs.borrow();
222 let mut attributes = Vec::<(&str, &str)>::new();
223 for attr in attrs.iter() {
224 attributes.push((&attr.name.local, &attr.value));
225 }
226 let mut tag = Tag::from_name_and_attrs(name, &attributes);
227 callbacks.filter(&mut tag);
228
229 if tag.ignore_self && tag.ignore_contents {
230 return output;
231 }
232 // if let Some(rewrite) = tag.rewrite {
233 // return rewrite;
234 // }
235
236 let attrs: Vec<(&str, &str)> = tag
237 .attrs
238 .iter()
239 .filter(|a| tag.allowed_attributes.iter().any(|b| b == &a.0))
240 .cloned()
241 .collect();
242
243 if !tag.ignore_self && !tag.ignore_contents {
244 let proposed = Tag::from_name_and_attrs(tag.name, &attrs);
245 let mut children = Vec::new();
246 for child in handle.children.borrow().iter() {
247 children.extend(TagParser::internal_walk_micro(child, callbacks));
248 }
249 output.extend(callbacks.constructor(&proposed, children));
250 } else if tag.ignore_self {
251 for child in handle.children.borrow().iter() {
252 output.extend(TagParser::internal_walk_micro(child, callbacks));
253 }
254 } else if tag.ignore_contents {
255 let proposed = Tag::from_name_and_attrs(tag.name, &attrs);
256 output.extend(callbacks.constructor(&proposed, vec![]));
257 }
258 } else {
259 match &handle.data {
260 NodeData::Document => {}
261 NodeData::Doctype { .. } => {}
262 NodeData::Text { contents } => {
263 let cont = &contents.borrow();
264 if let Some(s) = callbacks.plain(cont) {
265 output.extend(s.into_iter())
266 }
267 }
268 NodeData::Comment { .. } => {}
269 NodeData::Element { .. } => unreachable!(),
270 NodeData::ProcessingInstruction { .. } => debug!(
271 // "Unknown enum tag: NodeData::ProcessingInstruction {{ {:?} {:?} }}",
272 // target, contents
273 "Unknown enum tag: NodeData::ProcessingInstruction",
274 ),
275 }
276 for child in handle.children.borrow().iter() {
277 output.extend(TagParser::internal_walk_micro(child, callbacks));
278 }
279 }
280 output
281 }
282
283 /// Recursively walk through all the HTML nodes, calling `callback` for each tag.
walk<T, R>(&mut self, callbacks: &R) -> Vec<T> where R: HtmlReader<T>,284 fn walk<T, R>(&mut self, callbacks: &R) -> Vec<T>
285 where
286 R: HtmlReader<T>,
287 {
288 TagParser::internal_walk_micro(&self.dom.document, callbacks)
289 }
290 }
291
/// Represents a single HTML element while it is being sanitized. Read `name`
/// and `attrs` to find out which tag you are looking at.
///
/// By default the element is kept and all of its attributes are stripped;
/// attributes survive only if they are added with `allow_attribute` or
/// `allow_attributes`.
pub struct Tag<'a> {
    /// The name of the HTML tag, e.g. 'div', 'img', etc.
    pub name: &'a str,

    /// The attributes of the HTML tag, e.g. ('style', 'width: 100%').
    pub attrs: &'a [(&'a str, &'a str)],

    /// Names of the attributes that have been allowed so far.
    allowed_attributes: Vec<&'static str>,
    /// Set when the tag itself should be left out of the output.
    ignore_self: bool,
    /// Set when the tag's children and text should be left out of the output.
    ignore_contents: bool,
}

impl<'a> Tag<'a> {
    /// Creates a tag with no allowed attributes and neither ignore flag set.
    fn from_name_and_attrs(name: &'a str, attrs: &'a [(&'a str, &'a str)]) -> Tag<'a> {
        let allowed_attributes = Vec::new();
        Self {
            name,
            attrs,
            allowed_attributes,
            ignore_self: false,
            ignore_contents: false,
        }
    }

    /// Allow the given attribute. The attribute does not have to be present in
    /// `attrs`.
    ///
    /// When this HTML node gets printed, this attribute will also get printed.
    pub fn allow_attribute(&mut self, attr: &'static str) {
        self.allowed_attributes.push(attr);
    }

    /// Allow the given attributes. They do not have to be present in `attrs`.
    ///
    /// When this HTML node gets printed, these attributes will also get printed.
    pub fn allow_attributes(&mut self, attrs: &[&'static str]) {
        self.allowed_attributes.extend_from_slice(attrs);
    }

    /// Ignore this tag and everything inside it: neither the HTML node nor any
    /// of its child nodes or text content will be printed.
    pub fn ignore_self_and_contents(&mut self) {
        self.ignore_self();
        self.ignore_contents = true;
    }

    /// Ignore this tag only: the HTML node itself will not be printed, but its
    /// child nodes and text content still will be.
    pub fn ignore_self(&mut self) {
        self.ignore_self = true;
    }
}
347