1 // Copyright 2014-2017 The html5ever Project Developers. See the
2 // COPYRIGHT file at the top-level directory of this distribution.
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
9 
10 use serde_json::{Map, Value};
11 use std::borrow::Cow::Borrowed;
12 use std::env;
13 use std::ffi::OsStr;
14 use std::io::Read;
15 use std::mem::replace;
16 use std::path::Path;
17 
18 use rustc_test::{DynTestFn, DynTestName, TestDesc, TestDescAndFn};
19 use util::find_tests::foreach_xml5lib_test;
20 
21 use markup5ever::buffer_queue::BufferQueue;
22 use xml5ever::tendril::{SliceExt, StrTendril};
23 use xml5ever::tokenizer::{CharacterTokens, Token, TokenSink};
24 use xml5ever::tokenizer::{CommentToken, EmptyTag, EndTag, ShortTag, StartTag, Tag};
25 use xml5ever::tokenizer::{Doctype, DoctypeToken, PIToken, Pi};
26 use xml5ever::tokenizer::{EOFToken, XmlTokenizer, XmlTokenizerOpts};
27 use xml5ever::tokenizer::{NullCharacterToken, ParseError, TagToken};
28 use xml5ever::{namespace_url, ns, Attribute, LocalName, QualName};
29 
// Shared test utilities (test-file discovery under the xml5lib-tests dir).
mod util {
    pub mod find_tests;
}
33 
34 // Return all ways of splitting the string into at most n
35 // possibly-empty pieces.
splits(s: &str, n: usize) -> Vec<Vec<StrTendril>>36 fn splits(s: &str, n: usize) -> Vec<Vec<StrTendril>> {
37     if n == 1 {
38         return vec![vec![s.to_tendril()]];
39     }
40 
41     let mut points: Vec<usize> = s.char_indices().map(|(n, _)| n).collect();
42     points.push(s.len());
43 
44     // do this with iterators?
45     let mut out = vec![];
46     for p in points.into_iter() {
47         let y = &s[p..];
48         for mut x in splits(&s[..p], n - 1).into_iter() {
49             x.push(y.to_tendril());
50             out.push(x);
51         }
52     }
53 
54     out.extend(splits(s, n - 1).into_iter());
55     out
56 }
57 
/// A `TokenSink` that records tokens for later comparison, coalescing
/// runs of character tokens into a single token.
struct TokenLogger {
    // Completed tokens, in emission order.
    tokens: Vec<Token>,
    // Character data buffered since the last non-character token.
    current_str: StrTendril,
    // When true, ParseError tokens are recorded (with empty messages);
    // otherwise they are dropped.
    exact_errors: bool,
}
63 
64 impl TokenLogger {
new(exact_errors: bool) -> TokenLogger65     fn new(exact_errors: bool) -> TokenLogger {
66         TokenLogger {
67             tokens: vec![],
68             current_str: StrTendril::new(),
69             exact_errors: exact_errors,
70         }
71     }
72 
73     // Push anything other than character tokens
push(&mut self, token: Token)74     fn push(&mut self, token: Token) {
75         self.finish_str();
76         self.tokens.push(token);
77     }
78 
finish_str(&mut self)79     fn finish_str(&mut self) {
80         if self.current_str.len() > 0 {
81             let s = replace(&mut self.current_str, StrTendril::new());
82             self.tokens.push(CharacterTokens(s));
83         }
84     }
85 
get_tokens(mut self) -> Vec<Token>86     fn get_tokens(mut self) -> Vec<Token> {
87         self.finish_str();
88         self.tokens
89     }
90 }
91 
impl TokenSink for TokenLogger {
    /// Record one token from the tokenizer, normalizing it so that the
    /// stream compares equal to the JSON expectations.
    fn process_token(&mut self, token: Token) {
        match token {
            // Buffer character data so adjacent character tokens compare
            // equal regardless of how the tokenizer chunked them.
            CharacterTokens(b) => {
                self.current_str.push_slice(&b);
            },

            NullCharacterToken => {
                self.current_str.push_char('\0');
            },

            // Error messages are discarded; only their presence/position
            // matters, and only in exact-errors mode.
            ParseError(_) => {
                if self.exact_errors {
                    self.push(ParseError(Borrowed("")));
                }
            },

            TagToken(mut t) => {
                // The spec seems to indicate that one can emit
                // erroneous end tags with attrs, but the test
                // cases don't contain them.
                match t.kind {
                    EndTag => {
                        t.attrs = vec![];
                    },
                    // Sort attributes so comparison is order-independent.
                    _ => t.attrs.sort_by(|a1, a2| a1.name.cmp(&a2.name)),
                }
                self.push(TagToken(t));
            },

            // EOF is implicit in the expected-token lists; drop it.
            EOFToken => (),

            _ => self.push(token),
        }
    }
}
128 
tokenize_xml(input: Vec<StrTendril>, opts: XmlTokenizerOpts) -> Vec<Token>129 fn tokenize_xml(input: Vec<StrTendril>, opts: XmlTokenizerOpts) -> Vec<Token> {
130     let sink = TokenLogger::new(opts.exact_errors);
131     let mut tok = XmlTokenizer::new(sink, opts);
132     let mut buf = BufferQueue::new();
133 
134     for chunk in input.into_iter() {
135         buf.push_back(chunk);
136         let _ = tok.feed(&mut buf);
137     }
138     let _ = tok.feed(&mut buf);
139     tok.end();
140     tok.sink.get_tokens()
141 }
142 
/// Convenience accessors for `serde_json::Value` that panic on type
/// mismatch — acceptable here since this is test code.
trait JsonExt: Sized {
    /// Extract an owned `String`; panics if not a JSON string.
    fn get_str(&self) -> String;
    /// Extract the string as a `StrTendril`; panics if not a JSON string.
    fn get_tendril(&self) -> StrTendril;
    /// Like `get_tendril`, but JSON `null` maps to `None`.
    fn get_nullable_tendril(&self) -> Option<StrTendril>;
    /// Extract a boolean; panics if not a JSON bool.
    fn get_bool(&self) -> bool;
    /// Borrow the underlying object map; panics if not a JSON object.
    fn get_obj<'t>(&'t self) -> &'t Map<String, Self>;
    /// Borrow the underlying array; panics if not a JSON array.
    fn get_list<'t>(&'t self) -> &'t Vec<Self>;
    /// Look up `key` in a JSON object; panics if absent or not an object.
    fn find<'t>(&'t self, key: &str) -> &'t Self;
}
152 
153 impl JsonExt for Value {
get_str(&self) -> String154     fn get_str(&self) -> String {
155         match *self {
156             Value::String(ref s) => s.to_string(),
157             _ => panic!("Value::get_str: not a String"),
158         }
159     }
160 
get_tendril(&self) -> StrTendril161     fn get_tendril(&self) -> StrTendril {
162         match *self {
163             Value::String(ref s) => s.to_tendril(),
164             _ => panic!("Value::get_tendril: not a String"),
165         }
166     }
167 
get_nullable_tendril(&self) -> Option<StrTendril>168     fn get_nullable_tendril(&self) -> Option<StrTendril> {
169         match *self {
170             Value::Null => None,
171             Value::String(ref s) => Some(s.to_tendril()),
172             _ => panic!("Value::get_nullable_tendril: not a String"),
173         }
174     }
175 
get_bool(&self) -> bool176     fn get_bool(&self) -> bool {
177         match *self {
178             Value::Bool(b) => b,
179             _ => panic!("Value::get_bool: not a Boolean"),
180         }
181     }
182 
get_obj<'t>(&'t self) -> &'t Map<String, Value>183     fn get_obj<'t>(&'t self) -> &'t Map<String, Value> {
184         match *self {
185             Value::Object(ref m) => &*m,
186             _ => panic!("Value::get_obj: not an Object"),
187         }
188     }
189 
get_list<'t>(&'t self) -> &'t Vec<Value>190     fn get_list<'t>(&'t self) -> &'t Vec<Value> {
191         match *self {
192             Value::Array(ref m) => m,
193             _ => panic!("Value::get_list: not an Array"),
194         }
195     }
196 
find<'t>(&'t self, key: &str) -> &'t Value197     fn find<'t>(&'t self, key: &str) -> &'t Value {
198         self.get_obj().get(&key.to_string()).unwrap()
199     }
200 }
201 
// Parse a JSON object (other than "ParseError") to a token.
// Each test token is an array: [type-name, arg0, arg1, ...].
fn json_to_token(js: &Value) -> Token {
    let parts = js.as_array().unwrap();
    // Collect refs here so we don't have to use "ref" in all the patterns below.
    let args: Vec<&Value> = parts[1..].iter().collect();
    match &*parts[0].get_str() {
        // ["StartTag", name, {attr: value, ...}]
        "StartTag" => TagToken(Tag {
            kind: StartTag,
            name: QualName::new(None, ns!(), LocalName::from(args[0].get_str())),
            attrs: args[1]
                .get_obj()
                .iter()
                .map(|(k, v)| Attribute {
                    name: QualName::new(None, ns!(), LocalName::from(&**k)),
                    value: v.get_tendril(),
                })
                .collect(),
        }),

        // ["EndTag", name] — end tags never carry attributes.
        "EndTag" => TagToken(Tag {
            kind: EndTag,
            name: QualName::new(None, ns!(), LocalName::from(args[0].get_str())),
            attrs: vec![],
        }),

        // ["ShortTag", name] — XML short tag form (</>-style).
        "ShortTag" => TagToken(Tag {
            kind: ShortTag,
            name: QualName::new(None, ns!(), LocalName::from(args[0].get_str())),
            attrs: vec![],
        }),

        // ["EmptyTag", name, {attr: value, ...}] — self-closing tag.
        "EmptyTag" => TagToken(Tag {
            kind: EmptyTag,
            name: QualName::new(None, ns!(), LocalName::from(args[0].get_str())),
            attrs: args[1]
                .get_obj()
                .iter()
                .map(|(k, v)| Attribute {
                    name: QualName::new(None, ns!(), LocalName::from(&**k)),
                    value: v.get_tendril(),
                })
                .collect(),
        }),

        "Comment" => CommentToken(args[0].get_tendril()),

        "Character" => CharacterTokens(args[0].get_tendril()),

        // ["PI", target, data] — processing instruction.
        "PI" => PIToken(Pi {
            target: args[0].get_tendril(),
            data: args[1].get_tendril(),
        }),

        // ["DOCTYPE", name, public_id, system_id] — any part may be null.
        "DOCTYPE" => DoctypeToken(Doctype {
            name: args[0].get_nullable_tendril(),
            public_id: args[1].get_nullable_tendril(),
            system_id: args[2].get_nullable_tendril(),
        }),

        // We don't need to produce NullCharacterToken because
        // the TokenLogger will convert them to CharacterTokens.
        _ => panic!("don't understand token {:?}", parts),
    }
}
266 
267 // Parse the "output" field of the test case into a vector of tokens.
json_to_tokens(js: &Value, exact_errors: bool) -> Vec<Token>268 fn json_to_tokens(js: &Value, exact_errors: bool) -> Vec<Token> {
269     // Use a TokenLogger so that we combine character tokens separated
270     // by an ignored error.
271     let mut sink = TokenLogger::new(exact_errors);
272     for tok in js.as_array().unwrap().iter() {
273         match *tok {
274             Value::String(ref s) if &s[..] == "ParseError" => {
275                 sink.process_token(ParseError(Borrowed("")))
276             },
277             _ => sink.process_token(json_to_token(tok)),
278         }
279     }
280     sink.get_tokens()
281 }
282 
mk_xml_test( desc: String, input: String, expect: Value, opts: XmlTokenizerOpts, ) -> TestDescAndFn283 fn mk_xml_test(
284     desc: String,
285     input: String,
286     expect: Value,
287     opts: XmlTokenizerOpts,
288 ) -> TestDescAndFn {
289     TestDescAndFn {
290         desc: TestDesc::new(DynTestName(desc)),
291         testfn: DynTestFn(Box::new(move || {
292             // Split up the input at different points to test incremental tokenization.
293             let insplits = splits(&input, 3);
294             for input in insplits.into_iter() {
295                 // Clone 'input' so we have it for the failure message.
296                 // Also clone opts.  If we don't, we get the wrong
297                 // result but the compiler doesn't catch it!
298                 // Possibly mozilla/rust#12223.
299                 let output = tokenize_xml(input.clone(), opts.clone());
300                 let expect = json_to_tokens(&expect, opts.exact_errors);
301                 if output != expect {
302                     panic!(
303                         "\ninput: {:?}\ngot: {:?}\nexpected: {:?}",
304                         input, output, expect
305                     );
306                 }
307             }
308         })),
309     }
310 }
311 
mk_xml_tests(tests: &mut Vec<TestDescAndFn>, filename: &str, js: &Value)312 fn mk_xml_tests(tests: &mut Vec<TestDescAndFn>, filename: &str, js: &Value) {
313     let input: &str = &js.find("input").get_str();
314     let expect = js.find("output");
315     let desc = format!("tok: {}: {}", filename, js.find("description").get_str());
316 
317     // Some tests want to start in a state other than Data.
318     let state_overrides = vec![None];
319 
320     // Build the tests.
321     for state in state_overrides.into_iter() {
322         for &exact_errors in [false, true].iter() {
323             let mut newdesc = desc.clone();
324             match state {
325                 Some(s) => newdesc = format!("{} (in state {:?})", newdesc, s),
326                 None => (),
327             };
328             if exact_errors {
329                 newdesc = format!("{} (exact errors)", newdesc);
330             }
331 
332             tests.push(mk_xml_test(
333                 newdesc,
334                 String::from(input),
335                 expect.clone(),
336                 XmlTokenizerOpts {
337                     exact_errors: exact_errors,
338                     initial_state: state,
339 
340                     // Not discarding a BOM is what the test suite expects; see
341                     // https://github.com/html5lib/html5lib-tests/issues/2
342                     discard_bom: false,
343 
344                     ..Default::default()
345                 },
346             ));
347         }
348     }
349 }
350 
tests(src_dir: &Path) -> Vec<TestDescAndFn>351 fn tests(src_dir: &Path) -> Vec<TestDescAndFn> {
352     let mut tests = vec![];
353     foreach_xml5lib_test(
354         src_dir,
355         "tokenizer",
356         OsStr::new("test"),
357         |path, mut file| {
358             let mut s = String::new();
359             file.read_to_string(&mut s)
360                 .ok()
361                 .expect("file reading error");
362             let js: Value = serde_json::from_str(&s).ok().expect("json parse error");
363 
364             match js["tests"] {
365                 Value::Array(ref lst) => {
366                     for test in lst.iter() {
367                         mk_xml_tests(
368                             &mut tests,
369                             path.file_name().unwrap().to_str().unwrap(),
370                             test,
371                         );
372                     }
373                 },
374 
375                 _ => (),
376             }
377         },
378     );
379 
380     tests
381 }
382 
main()383 fn main() {
384     let args: Vec<_> = env::args().collect();
385     rustc_test::test_main(&args, tests(Path::new(env!("CARGO_MANIFEST_DIR"))));
386 }
387