1 // Copyright 2014-2017 The html5ever Project Developers. See the
2 // COPYRIGHT file at the top-level directory of this distribution.
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
9 
10 mod foreach_html5lib_test;
11 
12 use foreach_html5lib_test::foreach_html5lib_test;
13 use html5ever::tendril::*;
14 use html5ever::tokenizer::states::{Plaintext, RawData, Rawtext, Rcdata};
15 use html5ever::tokenizer::BufferQueue;
16 use html5ever::tokenizer::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
17 use html5ever::tokenizer::{CommentToken, DoctypeToken, TagToken, Token};
18 use html5ever::tokenizer::{Doctype, EndTag, StartTag, Tag};
19 use html5ever::tokenizer::{TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts};
20 use html5ever::{namespace_url, ns, Attribute, LocalName, QualName};
21 use rustc_test::{DynTestFn, DynTestName, TestDesc, TestDescAndFn};
22 use serde_json::{Map, Value};
23 use std::borrow::Cow::Borrowed;
24 use std::default::Default;
25 use std::ffi::OsStr;
26 use std::io::Read;
27 use std::mem::replace;
28 use std::path::Path;
29 use std::{char, env};
30 
31 // Return all ways of splitting the string into at most n
32 // possibly-empty pieces.
splits(s: &str, n: usize) -> Vec<Vec<StrTendril>>33 fn splits(s: &str, n: usize) -> Vec<Vec<StrTendril>> {
34     if n == 1 {
35         return vec![vec![s.to_tendril()]];
36     }
37 
38     let mut points: Vec<usize> = s.char_indices().map(|(n, _)| n).collect();
39     points.push(s.len());
40 
41     // do this with iterators?
42     let mut out = vec![];
43     for p in points.into_iter() {
44         let y = &s[p..];
45         for mut x in splits(&s[..p], n - 1).into_iter() {
46             x.push(y.to_tendril());
47             out.push(x);
48         }
49     }
50 
51     out.extend(splits(s, n - 1).into_iter());
52     out
53 }
54 
/// A `TokenSink` that records the token stream for comparison against the
/// expected output of a test case, coalescing adjacent character tokens.
struct TokenLogger {
    // Completed (non-character) tokens, in emission order.
    tokens: Vec<Token>,
    // Run of character data not yet flushed into `tokens`.
    current_str: StrTendril,
    // When true, record a placeholder token for each parse error.
    exact_errors: bool,
}
60 
61 impl TokenLogger {
new(exact_errors: bool) -> TokenLogger62     fn new(exact_errors: bool) -> TokenLogger {
63         TokenLogger {
64             tokens: vec![],
65             current_str: StrTendril::new(),
66             exact_errors: exact_errors,
67         }
68     }
69 
70     // Push anything other than character tokens
push(&mut self, token: Token)71     fn push(&mut self, token: Token) {
72         self.finish_str();
73         self.tokens.push(token);
74     }
75 
finish_str(&mut self)76     fn finish_str(&mut self) {
77         if self.current_str.len() > 0 {
78             let s = replace(&mut self.current_str, StrTendril::new());
79             self.tokens.push(CharacterTokens(s));
80         }
81     }
82 
get_tokens(mut self) -> Vec<Token>83     fn get_tokens(mut self) -> Vec<Token> {
84         self.finish_str();
85         self.tokens
86     }
87 }
88 
89 impl TokenSink for TokenLogger {
90     type Handle = ();
91 
process_token(&mut self, token: Token, _line_number: u64) -> TokenSinkResult<()>92     fn process_token(&mut self, token: Token, _line_number: u64) -> TokenSinkResult<()> {
93         match token {
94             CharacterTokens(b) => {
95                 self.current_str.push_slice(&b);
96             },
97 
98             NullCharacterToken => {
99                 self.current_str.push_char('\0');
100             },
101 
102             ParseError(_) => {
103                 if self.exact_errors {
104                     self.push(ParseError(Borrowed("")));
105                 }
106             },
107 
108             TagToken(mut t) => {
109                 // The spec seems to indicate that one can emit
110                 // erroneous end tags with attrs, but the test
111                 // cases don't contain them.
112                 match t.kind {
113                     EndTag => {
114                         t.self_closing = false;
115                         t.attrs = vec![];
116                     },
117                     _ => t.attrs.sort_by(|a1, a2| a1.name.cmp(&a2.name)),
118                 }
119                 self.push(TagToken(t));
120             },
121 
122             EOFToken => (),
123 
124             _ => self.push(token),
125         }
126         TokenSinkResult::Continue
127     }
128 }
129 
tokenize(input: Vec<StrTendril>, opts: TokenizerOpts) -> Vec<Token>130 fn tokenize(input: Vec<StrTendril>, opts: TokenizerOpts) -> Vec<Token> {
131     let sink = TokenLogger::new(opts.exact_errors);
132     let mut tok = Tokenizer::new(sink, opts);
133     let mut buffer = BufferQueue::new();
134     for chunk in input.into_iter() {
135         buffer.push_back(chunk);
136         let _ = tok.feed(&mut buffer);
137     }
138     let _ = tok.feed(&mut buffer);
139     tok.end();
140     tok.sink.get_tokens()
141 }
142 
/// Convenience accessors over `serde_json::Value`. All of them panic on a
/// type mismatch, which is acceptable in test-harness code.
trait JsonExt: Sized {
    /// Extract an owned `String`; panics unless the value is a JSON string.
    fn get_str(&self) -> String;
    /// Extract the string as a tendril; panics unless it is a JSON string.
    fn get_tendril(&self) -> StrTendril;
    /// Like `get_tendril`, but maps JSON `null` to `None`.
    fn get_nullable_tendril(&self) -> Option<StrTendril>;
    /// Extract a boolean; panics unless the value is a JSON bool.
    fn get_bool(&self) -> bool;
    /// Borrow the underlying object map; panics unless it is a JSON object.
    fn get_obj<'t>(&'t self) -> &'t Map<String, Self>;
    /// Borrow the underlying array; panics unless it is a JSON array.
    fn get_list<'t>(&'t self) -> &'t Vec<Self>;
    /// Look up `key` in a JSON object; panics if absent or not an object.
    fn find<'t>(&'t self, key: &str) -> &'t Self;
}
152 
153 impl JsonExt for Value {
get_str(&self) -> String154     fn get_str(&self) -> String {
155         match *self {
156             Value::String(ref s) => s.to_string(),
157             _ => panic!("Value::get_str: not a String"),
158         }
159     }
160 
get_tendril(&self) -> StrTendril161     fn get_tendril(&self) -> StrTendril {
162         match *self {
163             Value::String(ref s) => s.to_tendril(),
164             _ => panic!("Value::get_tendril: not a String"),
165         }
166     }
167 
get_nullable_tendril(&self) -> Option<StrTendril>168     fn get_nullable_tendril(&self) -> Option<StrTendril> {
169         match *self {
170             Value::Null => None,
171             Value::String(ref s) => Some(s.to_tendril()),
172             _ => panic!("Value::get_nullable_tendril: not a String"),
173         }
174     }
175 
get_bool(&self) -> bool176     fn get_bool(&self) -> bool {
177         match *self {
178             Value::Bool(b) => b,
179             _ => panic!("Value::get_bool: not a Bool"),
180         }
181     }
182 
get_obj<'t>(&'t self) -> &'t Map<String, Value>183     fn get_obj<'t>(&'t self) -> &'t Map<String, Value> {
184         match *self {
185             Value::Object(ref m) => &*m,
186             _ => panic!("Value::get_obj: not an Object"),
187         }
188     }
189 
get_list<'t>(&'t self) -> &'t Vec<Value>190     fn get_list<'t>(&'t self) -> &'t Vec<Value> {
191         match *self {
192             Value::Array(ref m) => m,
193             _ => panic!("Value::get_list: not an Array"),
194         }
195     }
196 
find<'t>(&'t self, key: &str) -> &'t Value197     fn find<'t>(&'t self, key: &str) -> &'t Value {
198         self.get_obj().get(&key.to_string()).unwrap()
199     }
200 }
201 
202 // Parse a JSON object (other than "ParseError") to a token.
json_to_token(js: &Value) -> Token203 fn json_to_token(js: &Value) -> Token {
204     let parts = js.get_list();
205     // Collect refs here so we don't have to use "ref" in all the patterns below.
206     let args: Vec<&Value> = parts[1..].iter().collect();
207     match &*parts[0].get_str() {
208         "DOCTYPE" => DoctypeToken(Doctype {
209             name: args[0].get_nullable_tendril(),
210             public_id: args[1].get_nullable_tendril(),
211             system_id: args[2].get_nullable_tendril(),
212             force_quirks: !args[3].get_bool(),
213         }),
214 
215         "StartTag" => TagToken(Tag {
216             kind: StartTag,
217             name: LocalName::from(&*args[0].get_str()),
218             attrs: args[1]
219                 .get_obj()
220                 .iter()
221                 .map(|(k, v)| Attribute {
222                     name: QualName::new(None, ns!(), LocalName::from(&**k)),
223                     value: v.get_tendril(),
224                 })
225                 .collect(),
226             self_closing: match args.get(2) {
227                 Some(b) => b.get_bool(),
228                 None => false,
229             },
230         }),
231 
232         "EndTag" => TagToken(Tag {
233             kind: EndTag,
234             name: LocalName::from(&*args[0].get_str()),
235             attrs: vec![],
236             self_closing: false,
237         }),
238 
239         "Comment" => CommentToken(args[0].get_tendril()),
240 
241         "Character" => CharacterTokens(args[0].get_tendril()),
242 
243         // We don't need to produce NullCharacterToken because
244         // the TokenLogger will convert them to CharacterTokens.
245         _ => panic!("don't understand token {:?}", parts),
246     }
247 }
248 
249 // Parse the "output" field of the test case into a vector of tokens.
json_to_tokens(js: &Value, exact_errors: bool) -> Vec<Token>250 fn json_to_tokens(js: &Value, exact_errors: bool) -> Vec<Token> {
251     // Use a TokenLogger so that we combine character tokens separated
252     // by an ignored error.
253     let mut sink = TokenLogger::new(exact_errors);
254     for tok in js.get_list().iter() {
255         assert_eq!(
256             match *tok {
257                 Value::String(ref s) if &s[..] == "ParseError" => {
258                     sink.process_token(ParseError(Borrowed("")), 0)
259                 },
260                 _ => sink.process_token(json_to_token(tok), 0),
261             },
262             TokenSinkResult::Continue
263         );
264     }
265     sink.get_tokens()
266 }
267 
268 // Undo the escaping in "doubleEscaped" tests.
// Undo the escaping in "doubleEscaped" tests.
fn unescape(s: &str) -> Option<String> {
    let mut out = String::with_capacity(s.len());
    let mut chars = s.chars().peekable();
    while let Some(c) = chars.next() {
        if c != '\\' {
            out.push(c);
            continue;
        }
        // Only \uXXXX escapes occur in the test data.
        if chars.peek() != Some(&'u') {
            panic!("can't understand escape");
        }
        chars.next(); // consume the 'u'
        let hex: String = chars.by_ref().take(4).collect();
        match u32::from_str_radix(&hex, 16).ok().and_then(char::from_u32) {
            // Some of the tests use lone surrogates, but we have no
            // way to represent them in the UTF-8 input to our parser.
            // Since these can only come from script, we will catch
            // them there.
            None => return None,
            Some(unescaped) => out.push(unescaped),
        }
    }
    Some(out)
}
294 
unescape_json(js: &Value) -> Value295 fn unescape_json(js: &Value) -> Value {
296     match *js {
297         // unwrap is OK here because the spec'd *output* of the tokenizer never
298         // contains a lone surrogate.
299         Value::String(ref s) => Value::String(unescape(&s).unwrap()),
300         Value::Array(ref xs) => Value::Array(xs.iter().map(unescape_json).collect()),
301         Value::Object(ref obj) => {
302             let mut new_obj = Map::new();
303             for (k, v) in obj.iter() {
304                 new_obj.insert(k.clone(), unescape_json(v));
305             }
306             Value::Object(new_obj)
307         },
308         _ => js.clone(),
309     }
310 }
311 
mk_test(desc: String, input: String, expect: Value, opts: TokenizerOpts) -> TestDescAndFn312 fn mk_test(desc: String, input: String, expect: Value, opts: TokenizerOpts) -> TestDescAndFn {
313     TestDescAndFn {
314         desc: TestDesc::new(DynTestName(desc)),
315         testfn: DynTestFn(Box::new(move || {
316             // Split up the input at different points to test incremental tokenization.
317             let insplits = splits(&input, 3);
318             for input in insplits.into_iter() {
319                 // Clone 'input' so we have it for the failure message.
320                 // Also clone opts.  If we don't, we get the wrong
321                 // result but the compiler doesn't catch it!
322                 // Possibly mozilla/rust#12223.
323                 let output = tokenize(input.clone(), opts.clone());
324                 let expect_toks = json_to_tokens(&expect, opts.exact_errors);
325                 if output != expect_toks {
326                     panic!(
327                         "\ninput: {:?}\ngot: {:?}\nexpected: {:?}",
328                         input, output, expect
329                     );
330                 }
331             }
332         })),
333     }
334 }
335 
mk_tests(tests: &mut Vec<TestDescAndFn>, filename: &str, js: &Value)336 fn mk_tests(tests: &mut Vec<TestDescAndFn>, filename: &str, js: &Value) {
337     let obj = js.get_obj();
338     let mut input = js.find("input").get_str();
339     let mut expect = js.find("output").clone();
340     let desc = format!("tok: {}: {}", filename, js.find("description").get_str());
341 
342     // "Double-escaped" tests require additional processing of
343     // the input and output.
344     if obj
345         .get(&"doubleEscaped".to_string())
346         .map_or(false, |j| j.get_bool())
347     {
348         match unescape(&input) {
349             None => return,
350             Some(i) => input = i,
351         }
352         expect = unescape_json(&expect);
353     }
354 
355     // Some tests have a last start tag name.
356     let start_tag = obj.get(&"lastStartTag".to_string()).map(|s| s.get_str());
357 
358     // Some tests want to start in a state other than Data.
359     let state_overrides = match obj.get(&"initialStates".to_string()) {
360         Some(&Value::Array(ref xs)) => xs
361             .iter()
362             .map(|s| {
363                 Some(match &s.get_str()[..] {
364                     "PLAINTEXT state" => Plaintext,
365                     "RAWTEXT state" => RawData(Rawtext),
366                     "RCDATA state" => RawData(Rcdata),
367                     s => panic!("don't know state {}", s),
368                 })
369             })
370             .collect(),
371         None => vec![None],
372         _ => panic!("don't understand initialStates value"),
373     };
374 
375     // Build the tests.
376     for state in state_overrides.into_iter() {
377         for &exact_errors in [false, true].iter() {
378             let mut newdesc = desc.clone();
379             match state {
380                 Some(s) => newdesc = format!("{} (in state {:?})", newdesc, s),
381                 None => (),
382             };
383             if exact_errors {
384                 newdesc = format!("{} (exact errors)", newdesc);
385             }
386 
387             tests.push(mk_test(
388                 newdesc,
389                 input.clone(),
390                 expect.clone(),
391                 TokenizerOpts {
392                     exact_errors: exact_errors,
393                     initial_state: state,
394                     last_start_tag_name: start_tag.clone(),
395 
396                     // Not discarding a BOM is what the test suite expects; see
397                     // https://github.com/html5lib/html5lib-tests/issues/2
398                     discard_bom: false,
399 
400                     ..Default::default()
401                 },
402             ));
403         }
404     }
405 }
406 
tests(src_dir: &Path) -> Vec<TestDescAndFn>407 fn tests(src_dir: &Path) -> Vec<TestDescAndFn> {
408     let mut tests = vec![];
409 
410     foreach_html5lib_test(
411         src_dir,
412         "tokenizer",
413         OsStr::new("test"),
414         |path, mut file| {
415             let mut s = String::new();
416             file.read_to_string(&mut s)
417                 .ok()
418                 .expect("file reading error");
419             let js: Value = serde_json::from_str(&s).ok().expect("json parse error");
420 
421             match js.get_obj().get(&"tests".to_string()) {
422                 Some(&Value::Array(ref lst)) => {
423                     for test in lst.iter() {
424                         mk_tests(
425                             &mut tests,
426                             path.file_name().unwrap().to_str().unwrap(),
427                             test,
428                         );
429                     }
430                 },
431 
432                 // xmlViolation.test doesn't follow this format.
433                 _ => (),
434             }
435         },
436     );
437 
438     tests
439 }
440 
main()441 fn main() {
442     let args: Vec<_> = env::args().collect();
443     rustc_test::test_main(&args, tests(Path::new(env!("CARGO_MANIFEST_DIR"))));
444 }
445