1 // Copyright 2014-2017 The html5ever Project Developers. See the
2 // COPYRIGHT file at the top-level directory of this distribution.
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
9
10 mod foreach_html5lib_test;
11
12 use foreach_html5lib_test::foreach_html5lib_test;
13 use html5ever::tendril::*;
14 use html5ever::tokenizer::states::{Plaintext, RawData, Rawtext, Rcdata};
15 use html5ever::tokenizer::BufferQueue;
16 use html5ever::tokenizer::{CharacterTokens, EOFToken, NullCharacterToken, ParseError};
17 use html5ever::tokenizer::{CommentToken, DoctypeToken, TagToken, Token};
18 use html5ever::tokenizer::{Doctype, EndTag, StartTag, Tag};
19 use html5ever::tokenizer::{TokenSink, TokenSinkResult, Tokenizer, TokenizerOpts};
20 use html5ever::{namespace_url, ns, Attribute, LocalName, QualName};
21 use rustc_test::{DynTestFn, DynTestName, TestDesc, TestDescAndFn};
22 use serde_json::{Map, Value};
23 use std::borrow::Cow::Borrowed;
24 use std::default::Default;
25 use std::ffi::OsStr;
26 use std::io::Read;
27 use std::mem::replace;
28 use std::path::Path;
29 use std::{char, env};
30
31 // Return all ways of splitting the string into at most n
32 // possibly-empty pieces.
splits(s: &str, n: usize) -> Vec<Vec<StrTendril>>33 fn splits(s: &str, n: usize) -> Vec<Vec<StrTendril>> {
34 if n == 1 {
35 return vec![vec![s.to_tendril()]];
36 }
37
38 let mut points: Vec<usize> = s.char_indices().map(|(n, _)| n).collect();
39 points.push(s.len());
40
41 // do this with iterators?
42 let mut out = vec![];
43 for p in points.into_iter() {
44 let y = &s[p..];
45 for mut x in splits(&s[..p], n - 1).into_iter() {
46 x.push(y.to_tendril());
47 out.push(x);
48 }
49 }
50
51 out.extend(splits(s, n - 1).into_iter());
52 out
53 }
54
/// A token sink that records the tokens it receives, coalescing adjacent
/// character tokens so test expectations can be compared directly.
struct TokenLogger {
    // Completed tokens, in order of receipt.
    tokens: Vec<Token>,
    // Run of character data not yet flushed into `tokens`.
    current_str: StrTendril,
    // Whether ParseError tokens are recorded (as empty placeholders).
    exact_errors: bool,
}
60
61 impl TokenLogger {
new(exact_errors: bool) -> TokenLogger62 fn new(exact_errors: bool) -> TokenLogger {
63 TokenLogger {
64 tokens: vec![],
65 current_str: StrTendril::new(),
66 exact_errors: exact_errors,
67 }
68 }
69
70 // Push anything other than character tokens
push(&mut self, token: Token)71 fn push(&mut self, token: Token) {
72 self.finish_str();
73 self.tokens.push(token);
74 }
75
finish_str(&mut self)76 fn finish_str(&mut self) {
77 if self.current_str.len() > 0 {
78 let s = replace(&mut self.current_str, StrTendril::new());
79 self.tokens.push(CharacterTokens(s));
80 }
81 }
82
get_tokens(mut self) -> Vec<Token>83 fn get_tokens(mut self) -> Vec<Token> {
84 self.finish_str();
85 self.tokens
86 }
87 }
88
89 impl TokenSink for TokenLogger {
90 type Handle = ();
91
process_token(&mut self, token: Token, _line_number: u64) -> TokenSinkResult<()>92 fn process_token(&mut self, token: Token, _line_number: u64) -> TokenSinkResult<()> {
93 match token {
94 CharacterTokens(b) => {
95 self.current_str.push_slice(&b);
96 },
97
98 NullCharacterToken => {
99 self.current_str.push_char('\0');
100 },
101
102 ParseError(_) => {
103 if self.exact_errors {
104 self.push(ParseError(Borrowed("")));
105 }
106 },
107
108 TagToken(mut t) => {
109 // The spec seems to indicate that one can emit
110 // erroneous end tags with attrs, but the test
111 // cases don't contain them.
112 match t.kind {
113 EndTag => {
114 t.self_closing = false;
115 t.attrs = vec![];
116 },
117 _ => t.attrs.sort_by(|a1, a2| a1.name.cmp(&a2.name)),
118 }
119 self.push(TagToken(t));
120 },
121
122 EOFToken => (),
123
124 _ => self.push(token),
125 }
126 TokenSinkResult::Continue
127 }
128 }
129
tokenize(input: Vec<StrTendril>, opts: TokenizerOpts) -> Vec<Token>130 fn tokenize(input: Vec<StrTendril>, opts: TokenizerOpts) -> Vec<Token> {
131 let sink = TokenLogger::new(opts.exact_errors);
132 let mut tok = Tokenizer::new(sink, opts);
133 let mut buffer = BufferQueue::new();
134 for chunk in input.into_iter() {
135 buffer.push_back(chunk);
136 let _ = tok.feed(&mut buffer);
137 }
138 let _ = tok.feed(&mut buffer);
139 tok.end();
140 tok.sink.get_tokens()
141 }
142
/// Convenience accessors for JSON values, panicking with a descriptive
/// message when the value is not of the expected kind.
trait JsonExt: Sized {
    /// The value as an owned `String`; panics if not a JSON string.
    fn get_str(&self) -> String;
    /// The value as a tendril; panics if not a JSON string.
    fn get_tendril(&self) -> StrTendril;
    /// The value as a tendril, mapping JSON null to `None`.
    fn get_nullable_tendril(&self) -> Option<StrTendril>;
    /// The value as a bool; panics if not a JSON bool.
    fn get_bool(&self) -> bool;
    /// Borrow the value as an object map; panics if not a JSON object.
    fn get_obj<'t>(&'t self) -> &'t Map<String, Self>;
    /// Borrow the value as an array; panics if not a JSON array.
    fn get_list<'t>(&'t self) -> &'t Vec<Self>;
    /// Look up `key` in an object; panics if absent or not an object.
    fn find<'t>(&'t self, key: &str) -> &'t Self;
}
152
153 impl JsonExt for Value {
get_str(&self) -> String154 fn get_str(&self) -> String {
155 match *self {
156 Value::String(ref s) => s.to_string(),
157 _ => panic!("Value::get_str: not a String"),
158 }
159 }
160
get_tendril(&self) -> StrTendril161 fn get_tendril(&self) -> StrTendril {
162 match *self {
163 Value::String(ref s) => s.to_tendril(),
164 _ => panic!("Value::get_tendril: not a String"),
165 }
166 }
167
get_nullable_tendril(&self) -> Option<StrTendril>168 fn get_nullable_tendril(&self) -> Option<StrTendril> {
169 match *self {
170 Value::Null => None,
171 Value::String(ref s) => Some(s.to_tendril()),
172 _ => panic!("Value::get_nullable_tendril: not a String"),
173 }
174 }
175
get_bool(&self) -> bool176 fn get_bool(&self) -> bool {
177 match *self {
178 Value::Bool(b) => b,
179 _ => panic!("Value::get_bool: not a Bool"),
180 }
181 }
182
get_obj<'t>(&'t self) -> &'t Map<String, Value>183 fn get_obj<'t>(&'t self) -> &'t Map<String, Value> {
184 match *self {
185 Value::Object(ref m) => &*m,
186 _ => panic!("Value::get_obj: not an Object"),
187 }
188 }
189
get_list<'t>(&'t self) -> &'t Vec<Value>190 fn get_list<'t>(&'t self) -> &'t Vec<Value> {
191 match *self {
192 Value::Array(ref m) => m,
193 _ => panic!("Value::get_list: not an Array"),
194 }
195 }
196
find<'t>(&'t self, key: &str) -> &'t Value197 fn find<'t>(&'t self, key: &str) -> &'t Value {
198 self.get_obj().get(&key.to_string()).unwrap()
199 }
200 }
201
202 // Parse a JSON object (other than "ParseError") to a token.
json_to_token(js: &Value) -> Token203 fn json_to_token(js: &Value) -> Token {
204 let parts = js.get_list();
205 // Collect refs here so we don't have to use "ref" in all the patterns below.
206 let args: Vec<&Value> = parts[1..].iter().collect();
207 match &*parts[0].get_str() {
208 "DOCTYPE" => DoctypeToken(Doctype {
209 name: args[0].get_nullable_tendril(),
210 public_id: args[1].get_nullable_tendril(),
211 system_id: args[2].get_nullable_tendril(),
212 force_quirks: !args[3].get_bool(),
213 }),
214
215 "StartTag" => TagToken(Tag {
216 kind: StartTag,
217 name: LocalName::from(&*args[0].get_str()),
218 attrs: args[1]
219 .get_obj()
220 .iter()
221 .map(|(k, v)| Attribute {
222 name: QualName::new(None, ns!(), LocalName::from(&**k)),
223 value: v.get_tendril(),
224 })
225 .collect(),
226 self_closing: match args.get(2) {
227 Some(b) => b.get_bool(),
228 None => false,
229 },
230 }),
231
232 "EndTag" => TagToken(Tag {
233 kind: EndTag,
234 name: LocalName::from(&*args[0].get_str()),
235 attrs: vec![],
236 self_closing: false,
237 }),
238
239 "Comment" => CommentToken(args[0].get_tendril()),
240
241 "Character" => CharacterTokens(args[0].get_tendril()),
242
243 // We don't need to produce NullCharacterToken because
244 // the TokenLogger will convert them to CharacterTokens.
245 _ => panic!("don't understand token {:?}", parts),
246 }
247 }
248
249 // Parse the "output" field of the test case into a vector of tokens.
json_to_tokens(js: &Value, exact_errors: bool) -> Vec<Token>250 fn json_to_tokens(js: &Value, exact_errors: bool) -> Vec<Token> {
251 // Use a TokenLogger so that we combine character tokens separated
252 // by an ignored error.
253 let mut sink = TokenLogger::new(exact_errors);
254 for tok in js.get_list().iter() {
255 assert_eq!(
256 match *tok {
257 Value::String(ref s) if &s[..] == "ParseError" => {
258 sink.process_token(ParseError(Borrowed("")), 0)
259 },
260 _ => sink.process_token(json_to_token(tok), 0),
261 },
262 TokenSinkResult::Continue
263 );
264 }
265 sink.get_tokens()
266 }
267
268 // Undo the escaping in "doubleEscaped" tests.
fn unescape(s: &str) -> Option<String> {
    let mut result = String::with_capacity(s.len());
    let mut chars = s.chars().peekable();
    while let Some(c) = chars.next() {
        if c != '\\' {
            result.push(c);
            continue;
        }
        // Only \uXXXX escapes appear in the test suite.
        if chars.peek() != Some(&'u') {
            panic!("can't understand escape");
        }
        chars.next();
        let hex: String = chars.by_ref().take(4).collect();
        match u32::from_str_radix(&hex, 16).ok().and_then(char::from_u32) {
            // Some of the tests use lone surrogates, but we have no
            // way to represent them in the UTF-8 input to our parser.
            // Since these can only come from script, we will catch
            // them there.
            None => return None,
            Some(decoded) => result.push(decoded),
        }
    }
    Some(result)
}
294
unescape_json(js: &Value) -> Value295 fn unescape_json(js: &Value) -> Value {
296 match *js {
297 // unwrap is OK here because the spec'd *output* of the tokenizer never
298 // contains a lone surrogate.
299 Value::String(ref s) => Value::String(unescape(&s).unwrap()),
300 Value::Array(ref xs) => Value::Array(xs.iter().map(unescape_json).collect()),
301 Value::Object(ref obj) => {
302 let mut new_obj = Map::new();
303 for (k, v) in obj.iter() {
304 new_obj.insert(k.clone(), unescape_json(v));
305 }
306 Value::Object(new_obj)
307 },
308 _ => js.clone(),
309 }
310 }
311
mk_test(desc: String, input: String, expect: Value, opts: TokenizerOpts) -> TestDescAndFn312 fn mk_test(desc: String, input: String, expect: Value, opts: TokenizerOpts) -> TestDescAndFn {
313 TestDescAndFn {
314 desc: TestDesc::new(DynTestName(desc)),
315 testfn: DynTestFn(Box::new(move || {
316 // Split up the input at different points to test incremental tokenization.
317 let insplits = splits(&input, 3);
318 for input in insplits.into_iter() {
319 // Clone 'input' so we have it for the failure message.
320 // Also clone opts. If we don't, we get the wrong
321 // result but the compiler doesn't catch it!
322 // Possibly mozilla/rust#12223.
323 let output = tokenize(input.clone(), opts.clone());
324 let expect_toks = json_to_tokens(&expect, opts.exact_errors);
325 if output != expect_toks {
326 panic!(
327 "\ninput: {:?}\ngot: {:?}\nexpected: {:?}",
328 input, output, expect
329 );
330 }
331 }
332 })),
333 }
334 }
335
mk_tests(tests: &mut Vec<TestDescAndFn>, filename: &str, js: &Value)336 fn mk_tests(tests: &mut Vec<TestDescAndFn>, filename: &str, js: &Value) {
337 let obj = js.get_obj();
338 let mut input = js.find("input").get_str();
339 let mut expect = js.find("output").clone();
340 let desc = format!("tok: {}: {}", filename, js.find("description").get_str());
341
342 // "Double-escaped" tests require additional processing of
343 // the input and output.
344 if obj
345 .get(&"doubleEscaped".to_string())
346 .map_or(false, |j| j.get_bool())
347 {
348 match unescape(&input) {
349 None => return,
350 Some(i) => input = i,
351 }
352 expect = unescape_json(&expect);
353 }
354
355 // Some tests have a last start tag name.
356 let start_tag = obj.get(&"lastStartTag".to_string()).map(|s| s.get_str());
357
358 // Some tests want to start in a state other than Data.
359 let state_overrides = match obj.get(&"initialStates".to_string()) {
360 Some(&Value::Array(ref xs)) => xs
361 .iter()
362 .map(|s| {
363 Some(match &s.get_str()[..] {
364 "PLAINTEXT state" => Plaintext,
365 "RAWTEXT state" => RawData(Rawtext),
366 "RCDATA state" => RawData(Rcdata),
367 s => panic!("don't know state {}", s),
368 })
369 })
370 .collect(),
371 None => vec![None],
372 _ => panic!("don't understand initialStates value"),
373 };
374
375 // Build the tests.
376 for state in state_overrides.into_iter() {
377 for &exact_errors in [false, true].iter() {
378 let mut newdesc = desc.clone();
379 match state {
380 Some(s) => newdesc = format!("{} (in state {:?})", newdesc, s),
381 None => (),
382 };
383 if exact_errors {
384 newdesc = format!("{} (exact errors)", newdesc);
385 }
386
387 tests.push(mk_test(
388 newdesc,
389 input.clone(),
390 expect.clone(),
391 TokenizerOpts {
392 exact_errors: exact_errors,
393 initial_state: state,
394 last_start_tag_name: start_tag.clone(),
395
396 // Not discarding a BOM is what the test suite expects; see
397 // https://github.com/html5lib/html5lib-tests/issues/2
398 discard_bom: false,
399
400 ..Default::default()
401 },
402 ));
403 }
404 }
405 }
406
tests(src_dir: &Path) -> Vec<TestDescAndFn>407 fn tests(src_dir: &Path) -> Vec<TestDescAndFn> {
408 let mut tests = vec![];
409
410 foreach_html5lib_test(
411 src_dir,
412 "tokenizer",
413 OsStr::new("test"),
414 |path, mut file| {
415 let mut s = String::new();
416 file.read_to_string(&mut s)
417 .ok()
418 .expect("file reading error");
419 let js: Value = serde_json::from_str(&s).ok().expect("json parse error");
420
421 match js.get_obj().get(&"tests".to_string()) {
422 Some(&Value::Array(ref lst)) => {
423 for test in lst.iter() {
424 mk_tests(
425 &mut tests,
426 path.file_name().unwrap().to_str().unwrap(),
427 test,
428 );
429 }
430 },
431
432 // xmlViolation.test doesn't follow this format.
433 _ => (),
434 }
435 },
436 );
437
438 tests
439 }
440
main()441 fn main() {
442 let args: Vec<_> = env::args().collect();
443 rustc_test::test_main(&args, tests(Path::new(env!("CARGO_MANIFEST_DIR"))));
444 }
445