1 // Copyright 2014-2017 The html5ever Project Developers. See the
2 // COPYRIGHT file at the top-level directory of this distribution.
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
9
10 use serde_json::{Map, Value};
11 use std::borrow::Cow::Borrowed;
12 use std::env;
13 use std::ffi::OsStr;
14 use std::io::Read;
15 use std::mem::replace;
16 use std::path::Path;
17
18 use rustc_test::{DynTestFn, DynTestName, TestDesc, TestDescAndFn};
19 use util::find_tests::foreach_xml5lib_test;
20
21 use markup5ever::buffer_queue::BufferQueue;
22 use xml5ever::tendril::{SliceExt, StrTendril};
23 use xml5ever::tokenizer::{CharacterTokens, Token, TokenSink};
24 use xml5ever::tokenizer::{CommentToken, EmptyTag, EndTag, ShortTag, StartTag, Tag};
25 use xml5ever::tokenizer::{Doctype, DoctypeToken, PIToken, Pi};
26 use xml5ever::tokenizer::{EOFToken, XmlTokenizer, XmlTokenizerOpts};
27 use xml5ever::tokenizer::{NullCharacterToken, ParseError, TagToken};
28 use xml5ever::{namespace_url, ns, Attribute, LocalName, QualName};
29
// Test-support utilities; `find_tests` locates the xml5lib test data files.
mod util {
    pub mod find_tests;
}
33
34 // Return all ways of splitting the string into at most n
35 // possibly-empty pieces.
splits(s: &str, n: usize) -> Vec<Vec<StrTendril>>36 fn splits(s: &str, n: usize) -> Vec<Vec<StrTendril>> {
37 if n == 1 {
38 return vec![vec![s.to_tendril()]];
39 }
40
41 let mut points: Vec<usize> = s.char_indices().map(|(n, _)| n).collect();
42 points.push(s.len());
43
44 // do this with iterators?
45 let mut out = vec![];
46 for p in points.into_iter() {
47 let y = &s[p..];
48 for mut x in splits(&s[..p], n - 1).into_iter() {
49 x.push(y.to_tendril());
50 out.push(x);
51 }
52 }
53
54 out.extend(splits(s, n - 1).into_iter());
55 out
56 }
57
/// A token sink that accumulates tokens for later comparison, merging
/// runs of character tokens into a single token.
struct TokenLogger {
    // Completed tokens, including flushed character runs.
    tokens: Vec<Token>,
    // Character data buffered but not yet flushed into `tokens`.
    current_str: StrTendril,
    // When true, parse errors are recorded; otherwise they are dropped.
    exact_errors: bool,
}
63
64 impl TokenLogger {
new(exact_errors: bool) -> TokenLogger65 fn new(exact_errors: bool) -> TokenLogger {
66 TokenLogger {
67 tokens: vec![],
68 current_str: StrTendril::new(),
69 exact_errors: exact_errors,
70 }
71 }
72
73 // Push anything other than character tokens
push(&mut self, token: Token)74 fn push(&mut self, token: Token) {
75 self.finish_str();
76 self.tokens.push(token);
77 }
78
finish_str(&mut self)79 fn finish_str(&mut self) {
80 if self.current_str.len() > 0 {
81 let s = replace(&mut self.current_str, StrTendril::new());
82 self.tokens.push(CharacterTokens(s));
83 }
84 }
85
get_tokens(mut self) -> Vec<Token>86 fn get_tokens(mut self) -> Vec<Token> {
87 self.finish_str();
88 self.tokens
89 }
90 }
91
92 impl TokenSink for TokenLogger {
process_token(&mut self, token: Token)93 fn process_token(&mut self, token: Token) {
94 match token {
95 CharacterTokens(b) => {
96 self.current_str.push_slice(&b);
97 },
98
99 NullCharacterToken => {
100 self.current_str.push_char('\0');
101 },
102
103 ParseError(_) => {
104 if self.exact_errors {
105 self.push(ParseError(Borrowed("")));
106 }
107 },
108
109 TagToken(mut t) => {
110 // The spec seems to indicate that one can emit
111 // erroneous end tags with attrs, but the test
112 // cases don't contain them.
113 match t.kind {
114 EndTag => {
115 t.attrs = vec![];
116 },
117 _ => t.attrs.sort_by(|a1, a2| a1.name.cmp(&a2.name)),
118 }
119 self.push(TagToken(t));
120 },
121
122 EOFToken => (),
123
124 _ => self.push(token),
125 }
126 }
127 }
128
tokenize_xml(input: Vec<StrTendril>, opts: XmlTokenizerOpts) -> Vec<Token>129 fn tokenize_xml(input: Vec<StrTendril>, opts: XmlTokenizerOpts) -> Vec<Token> {
130 let sink = TokenLogger::new(opts.exact_errors);
131 let mut tok = XmlTokenizer::new(sink, opts);
132 let mut buf = BufferQueue::new();
133
134 for chunk in input.into_iter() {
135 buf.push_back(chunk);
136 let _ = tok.feed(&mut buf);
137 }
138 let _ = tok.feed(&mut buf);
139 tok.end();
140 tok.sink.get_tokens()
141 }
142
/// Convenience accessors for `serde_json::Value`; each panics with a
/// descriptive message when the value has an unexpected variant.
trait JsonExt: Sized {
    /// Return the contained string as an owned `String`.
    fn get_str(&self) -> String;
    /// Return the contained string as a `StrTendril`.
    fn get_tendril(&self) -> StrTendril;
    /// Like `get_tendril`, but maps JSON `null` to `None`.
    fn get_nullable_tendril(&self) -> Option<StrTendril>;
    /// Return the contained boolean.
    fn get_bool(&self) -> bool;
    /// Return a reference to the contained object map.
    fn get_obj<'t>(&'t self) -> &'t Map<String, Self>;
    /// Return a reference to the contained array.
    fn get_list<'t>(&'t self) -> &'t Vec<Self>;
    /// Look up `key` in the contained object; panics if absent.
    fn find<'t>(&'t self, key: &str) -> &'t Self;
}
152
153 impl JsonExt for Value {
get_str(&self) -> String154 fn get_str(&self) -> String {
155 match *self {
156 Value::String(ref s) => s.to_string(),
157 _ => panic!("Value::get_str: not a String"),
158 }
159 }
160
get_tendril(&self) -> StrTendril161 fn get_tendril(&self) -> StrTendril {
162 match *self {
163 Value::String(ref s) => s.to_tendril(),
164 _ => panic!("Value::get_tendril: not a String"),
165 }
166 }
167
get_nullable_tendril(&self) -> Option<StrTendril>168 fn get_nullable_tendril(&self) -> Option<StrTendril> {
169 match *self {
170 Value::Null => None,
171 Value::String(ref s) => Some(s.to_tendril()),
172 _ => panic!("Value::get_nullable_tendril: not a String"),
173 }
174 }
175
get_bool(&self) -> bool176 fn get_bool(&self) -> bool {
177 match *self {
178 Value::Bool(b) => b,
179 _ => panic!("Value::get_bool: not a Boolean"),
180 }
181 }
182
get_obj<'t>(&'t self) -> &'t Map<String, Value>183 fn get_obj<'t>(&'t self) -> &'t Map<String, Value> {
184 match *self {
185 Value::Object(ref m) => &*m,
186 _ => panic!("Value::get_obj: not an Object"),
187 }
188 }
189
get_list<'t>(&'t self) -> &'t Vec<Value>190 fn get_list<'t>(&'t self) -> &'t Vec<Value> {
191 match *self {
192 Value::Array(ref m) => m,
193 _ => panic!("Value::get_list: not an Array"),
194 }
195 }
196
find<'t>(&'t self, key: &str) -> &'t Value197 fn find<'t>(&'t self, key: &str) -> &'t Value {
198 self.get_obj().get(&key.to_string()).unwrap()
199 }
200 }
201
202 // Parse a JSON object (other than "ParseError") to a token.
json_to_token(js: &Value) -> Token203 fn json_to_token(js: &Value) -> Token {
204 let parts = js.as_array().unwrap();
205 // Collect refs here so we don't have to use "ref" in all the patterns below.
206 let args: Vec<&Value> = parts[1..].iter().collect();
207 match &*parts[0].get_str() {
208 "StartTag" => TagToken(Tag {
209 kind: StartTag,
210 name: QualName::new(None, ns!(), LocalName::from(args[0].get_str())),
211 attrs: args[1]
212 .get_obj()
213 .iter()
214 .map(|(k, v)| Attribute {
215 name: QualName::new(None, ns!(), LocalName::from(&**k)),
216 value: v.get_tendril(),
217 })
218 .collect(),
219 }),
220
221 "EndTag" => TagToken(Tag {
222 kind: EndTag,
223 name: QualName::new(None, ns!(), LocalName::from(args[0].get_str())),
224 attrs: vec![],
225 }),
226
227 "ShortTag" => TagToken(Tag {
228 kind: ShortTag,
229 name: QualName::new(None, ns!(), LocalName::from(args[0].get_str())),
230 attrs: vec![],
231 }),
232
233 "EmptyTag" => TagToken(Tag {
234 kind: EmptyTag,
235 name: QualName::new(None, ns!(), LocalName::from(args[0].get_str())),
236 attrs: args[1]
237 .get_obj()
238 .iter()
239 .map(|(k, v)| Attribute {
240 name: QualName::new(None, ns!(), LocalName::from(&**k)),
241 value: v.get_tendril(),
242 })
243 .collect(),
244 }),
245
246 "Comment" => CommentToken(args[0].get_tendril()),
247
248 "Character" => CharacterTokens(args[0].get_tendril()),
249
250 "PI" => PIToken(Pi {
251 target: args[0].get_tendril(),
252 data: args[1].get_tendril(),
253 }),
254
255 "DOCTYPE" => DoctypeToken(Doctype {
256 name: args[0].get_nullable_tendril(),
257 public_id: args[1].get_nullable_tendril(),
258 system_id: args[2].get_nullable_tendril(),
259 }),
260
261 // We don't need to produce NullCharacterToken because
262 // the TokenLogger will convert them to CharacterTokens.
263 _ => panic!("don't understand token {:?}", parts),
264 }
265 }
266
267 // Parse the "output" field of the test case into a vector of tokens.
json_to_tokens(js: &Value, exact_errors: bool) -> Vec<Token>268 fn json_to_tokens(js: &Value, exact_errors: bool) -> Vec<Token> {
269 // Use a TokenLogger so that we combine character tokens separated
270 // by an ignored error.
271 let mut sink = TokenLogger::new(exact_errors);
272 for tok in js.as_array().unwrap().iter() {
273 match *tok {
274 Value::String(ref s) if &s[..] == "ParseError" => {
275 sink.process_token(ParseError(Borrowed("")))
276 },
277 _ => sink.process_token(json_to_token(tok)),
278 }
279 }
280 sink.get_tokens()
281 }
282
mk_xml_test( desc: String, input: String, expect: Value, opts: XmlTokenizerOpts, ) -> TestDescAndFn283 fn mk_xml_test(
284 desc: String,
285 input: String,
286 expect: Value,
287 opts: XmlTokenizerOpts,
288 ) -> TestDescAndFn {
289 TestDescAndFn {
290 desc: TestDesc::new(DynTestName(desc)),
291 testfn: DynTestFn(Box::new(move || {
292 // Split up the input at different points to test incremental tokenization.
293 let insplits = splits(&input, 3);
294 for input in insplits.into_iter() {
295 // Clone 'input' so we have it for the failure message.
296 // Also clone opts. If we don't, we get the wrong
297 // result but the compiler doesn't catch it!
298 // Possibly mozilla/rust#12223.
299 let output = tokenize_xml(input.clone(), opts.clone());
300 let expect = json_to_tokens(&expect, opts.exact_errors);
301 if output != expect {
302 panic!(
303 "\ninput: {:?}\ngot: {:?}\nexpected: {:?}",
304 input, output, expect
305 );
306 }
307 }
308 })),
309 }
310 }
311
mk_xml_tests(tests: &mut Vec<TestDescAndFn>, filename: &str, js: &Value)312 fn mk_xml_tests(tests: &mut Vec<TestDescAndFn>, filename: &str, js: &Value) {
313 let input: &str = &js.find("input").get_str();
314 let expect = js.find("output");
315 let desc = format!("tok: {}: {}", filename, js.find("description").get_str());
316
317 // Some tests want to start in a state other than Data.
318 let state_overrides = vec![None];
319
320 // Build the tests.
321 for state in state_overrides.into_iter() {
322 for &exact_errors in [false, true].iter() {
323 let mut newdesc = desc.clone();
324 match state {
325 Some(s) => newdesc = format!("{} (in state {:?})", newdesc, s),
326 None => (),
327 };
328 if exact_errors {
329 newdesc = format!("{} (exact errors)", newdesc);
330 }
331
332 tests.push(mk_xml_test(
333 newdesc,
334 String::from(input),
335 expect.clone(),
336 XmlTokenizerOpts {
337 exact_errors: exact_errors,
338 initial_state: state,
339
340 // Not discarding a BOM is what the test suite expects; see
341 // https://github.com/html5lib/html5lib-tests/issues/2
342 discard_bom: false,
343
344 ..Default::default()
345 },
346 ));
347 }
348 }
349 }
350
tests(src_dir: &Path) -> Vec<TestDescAndFn>351 fn tests(src_dir: &Path) -> Vec<TestDescAndFn> {
352 let mut tests = vec![];
353 foreach_xml5lib_test(
354 src_dir,
355 "tokenizer",
356 OsStr::new("test"),
357 |path, mut file| {
358 let mut s = String::new();
359 file.read_to_string(&mut s)
360 .ok()
361 .expect("file reading error");
362 let js: Value = serde_json::from_str(&s).ok().expect("json parse error");
363
364 match js["tests"] {
365 Value::Array(ref lst) => {
366 for test in lst.iter() {
367 mk_xml_tests(
368 &mut tests,
369 path.file_name().unwrap().to_str().unwrap(),
370 test,
371 );
372 }
373 },
374
375 _ => (),
376 }
377 },
378 );
379
380 tests
381 }
382
main()383 fn main() {
384 let args: Vec<_> = env::args().collect();
385 rustc_test::test_main(&args, tests(Path::new(env!("CARGO_MANIFEST_DIR"))));
386 }
387