1 // Copyright 2014-2017 The html5ever Project Developers. See the 2 // COPYRIGHT file at the top-level directory of this distribution. 3 // 4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or 5 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license 6 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your 7 // option. This file may not be copied, modified, or distributed 8 // except according to those terms. 9 10 use super::{TokenSink, XmlTokenizer}; 11 use crate::data; 12 use log::debug; 13 use mac::{format_if, unwrap_or_return}; 14 use markup5ever::buffer_queue::BufferQueue; 15 use std::borrow::Cow::Borrowed; 16 use std::char::from_u32; 17 use crate::tendril::StrTendril; 18 use crate::util::is_ascii_alnum; 19 20 use self::State::*; 21 pub use self::Status::*; 22 23 //§ tokenizing-character-references 24 pub struct CharRef { 25 /// The resulting character(s) 26 pub chars: [char; 2], 27 28 /// How many slots in `chars` are valid? 29 pub num_chars: u8, 30 } 31 32 pub enum Status { 33 Stuck, 34 Progress, 35 Done, 36 } 37 38 #[derive(Debug)] 39 enum State { 40 Begin, 41 Octothorpe, 42 Numeric(u32), // base 43 NumericSemicolon, 44 Named, 45 BogusName, 46 } 47 48 pub struct CharRefTokenizer { 49 state: State, 50 addnl_allowed: Option<char>, 51 result: Option<CharRef>, 52 53 num: u32, 54 num_too_big: bool, 55 seen_digit: bool, 56 hex_marker: Option<char>, 57 58 name_buf_opt: Option<StrTendril>, 59 name_match: Option<(u32, u32)>, 60 name_len: usize, 61 } 62 63 impl CharRefTokenizer { 64 // NB: We assume that we have an additional allowed character iff we're 65 // tokenizing in an attribute value. new(addnl_allowed: Option<char>) -> CharRefTokenizer66 pub fn new(addnl_allowed: Option<char>) -> CharRefTokenizer { 67 CharRefTokenizer { 68 state: Begin, 69 addnl_allowed: addnl_allowed, 70 result: None, 71 num: 0, 72 num_too_big: false, 73 seen_digit: false, 74 hex_marker: None, 75 name_buf_opt: None, 76 name_match: None, 77 name_len: 0, 78 } 79 } 80 81 // A CharRefTokenizer can only tokenize one character reference, 82 // so this method consumes the tokenizer. get_result(self) -> CharRef83 pub fn get_result(self) -> CharRef { 84 self.result.expect("get_result called before done") 85 } 86 name_buf<'t>(&'t self) -> &'t StrTendril87 fn name_buf<'t>(&'t self) -> &'t StrTendril { 88 self.name_buf_opt 89 .as_ref() 90 .expect("name_buf missing in named character reference") 91 } 92 name_buf_mut<'t>(&'t mut self) -> &'t mut StrTendril93 fn name_buf_mut<'t>(&'t mut self) -> &'t mut StrTendril { 94 self.name_buf_opt 95 .as_mut() 96 .expect("name_buf missing in named character reference") 97 } 98 finish_none(&mut self) -> Status99 fn finish_none(&mut self) -> Status { 100 self.result = Some(CharRef { 101 chars: ['\0', '\0'], 102 num_chars: 0, 103 }); 104 Done 105 } 106 finish_one(&mut self, c: char) -> Status107 fn finish_one(&mut self, c: char) -> Status { 108 self.result = Some(CharRef { 109 chars: [c, '\0'], 110 num_chars: 1, 111 }); 112 Done 113 } 114 } 115 116 impl CharRefTokenizer { step<Sink: TokenSink>( &mut self, tokenizer: &mut XmlTokenizer<Sink>, input: &mut BufferQueue, ) -> Status117 pub fn step<Sink: TokenSink>( 118 &mut self, 119 tokenizer: &mut XmlTokenizer<Sink>, 120 input: &mut BufferQueue, 121 ) -> Status { 122 if self.result.is_some() { 123 return Done; 124 } 125 126 debug!("char ref tokenizer stepping in state {:?}", self.state); 127 match self.state { 128 Begin => self.do_begin(tokenizer, input), 129 Octothorpe => self.do_octothorpe(tokenizer, input), 130 Numeric(base) => self.do_numeric(tokenizer, base, input), 131 NumericSemicolon => self.do_numeric_semicolon(tokenizer, input), 132 Named => self.do_named(tokenizer, input), 133 BogusName => self.do_bogus_name(tokenizer, input), 134 } 135 } 136 do_begin<Sink: TokenSink>( &mut self, tokenizer: &mut XmlTokenizer<Sink>, input: &mut BufferQueue, ) -> Status137 fn do_begin<Sink: TokenSink>( 138 &mut self, 139 tokenizer: &mut XmlTokenizer<Sink>, 140 input: &mut BufferQueue, 141 ) -> Status { 142 match unwrap_or_return!(tokenizer.peek(input), Stuck) { 143 '\t' | '\n' | '\x0C' | ' ' | '<' | '&' => self.finish_none(), 144 c if Some(c) == self.addnl_allowed => self.finish_none(), 145 146 '#' => { 147 tokenizer.discard_char(input); 148 self.state = Octothorpe; 149 Progress 150 }, 151 152 _ => { 153 self.state = Named; 154 self.name_buf_opt = Some(StrTendril::new()); 155 Progress 156 }, 157 } 158 } 159 do_octothorpe<Sink: TokenSink>( &mut self, tokenizer: &mut XmlTokenizer<Sink>, input: &mut BufferQueue, ) -> Status160 fn do_octothorpe<Sink: TokenSink>( 161 &mut self, 162 tokenizer: &mut XmlTokenizer<Sink>, 163 input: &mut BufferQueue, 164 ) -> Status { 165 let c = unwrap_or_return!(tokenizer.peek(input), Stuck); 166 match c { 167 'x' | 'X' => { 168 tokenizer.discard_char(input); 169 self.hex_marker = Some(c); 170 self.state = Numeric(16); 171 }, 172 173 _ => { 174 self.hex_marker = None; 175 self.state = Numeric(10); 176 }, 177 } 178 Progress 179 } 180 do_numeric<Sink: TokenSink>( &mut self, tokenizer: &mut XmlTokenizer<Sink>, base: u32, input: &mut BufferQueue, ) -> Status181 fn do_numeric<Sink: TokenSink>( 182 &mut self, 183 tokenizer: &mut XmlTokenizer<Sink>, 184 base: u32, 185 input: &mut BufferQueue, 186 ) -> Status { 187 let c = unwrap_or_return!(tokenizer.peek(input), Stuck); 188 match c.to_digit(base) { 189 Some(n) => { 190 tokenizer.discard_char(input); 191 self.num = self.num.wrapping_mul(base); 192 if self.num > 0x10FFFF { 193 // We might overflow, and the character is definitely invalid. 194 // We still parse digits and semicolon, but don't use the result. 195 self.num_too_big = true; 196 } 197 self.num = self.num.wrapping_add(n); 198 self.seen_digit = true; 199 Progress 200 }, 201 202 None if !self.seen_digit => self.unconsume_numeric(tokenizer, input), 203 204 None => { 205 self.state = NumericSemicolon; 206 Progress 207 }, 208 } 209 } 210 do_numeric_semicolon<Sink: TokenSink>( &mut self, tokenizer: &mut XmlTokenizer<Sink>, input: &mut BufferQueue, ) -> Status211 fn do_numeric_semicolon<Sink: TokenSink>( 212 &mut self, 213 tokenizer: &mut XmlTokenizer<Sink>, 214 input: &mut BufferQueue, 215 ) -> Status { 216 match unwrap_or_return!(tokenizer.peek(input), Stuck) { 217 ';' => tokenizer.discard_char(input), 218 _ => tokenizer.emit_error(Borrowed( 219 "Semicolon missing after numeric character reference", 220 )), 221 }; 222 self.finish_numeric(tokenizer) 223 } 224 unconsume_numeric<Sink: TokenSink>( &mut self, tokenizer: &mut XmlTokenizer<Sink>, input: &mut BufferQueue, ) -> Status225 fn unconsume_numeric<Sink: TokenSink>( 226 &mut self, 227 tokenizer: &mut XmlTokenizer<Sink>, 228 input: &mut BufferQueue, 229 ) -> Status { 230 let mut unconsume = StrTendril::from_char('#'); 231 match self.hex_marker { 232 Some(c) => unconsume.push_char(c), 233 None => (), 234 } 235 236 tokenizer.unconsume(input, unconsume); 237 tokenizer.emit_error(Borrowed("Numeric character reference without digits")); 238 self.finish_none() 239 } 240 finish_numeric<Sink: TokenSink>(&mut self, tokenizer: &mut XmlTokenizer<Sink>) -> Status241 fn finish_numeric<Sink: TokenSink>(&mut self, tokenizer: &mut XmlTokenizer<Sink>) -> Status { 242 fn conv(n: u32) -> char { 243 from_u32(n).expect("invalid char missed by error handling cases") 244 } 245 246 let (c, error) = match self.num { 247 n if (n > 0x10FFFF) || self.num_too_big => ('\u{fffd}', true), 248 0x00 | 0xD800..=0xDFFF => ('\u{fffd}', true), 249 250 0x80..=0x9F => match data::C1_REPLACEMENTS[(self.num - 0x80) as usize] { 251 Some(c) => (c, true), 252 None => (conv(self.num), true), 253 }, 254 255 0x01..=0x08 | 0x0B | 0x0D..=0x1F | 0x7F | 0xFDD0..=0xFDEF => (conv(self.num), true), 256 257 n if (n & 0xFFFE) == 0xFFFE => (conv(n), true), 258 259 n => (conv(n), false), 260 }; 261 262 if error { 263 let msg = format_if!( 264 tokenizer.opts.exact_errors, 265 "Invalid numeric character reference", 266 "Invalid numeric character reference value 0x{:06X}", 267 self.num 268 ); 269 tokenizer.emit_error(msg); 270 } 271 272 self.finish_one(c) 273 } 274 do_named<Sink: TokenSink>( &mut self, tokenizer: &mut XmlTokenizer<Sink>, input: &mut BufferQueue, ) -> Status275 fn do_named<Sink: TokenSink>( 276 &mut self, 277 tokenizer: &mut XmlTokenizer<Sink>, 278 input: &mut BufferQueue, 279 ) -> Status { 280 let c = unwrap_or_return!(tokenizer.get_char(input), Stuck); 281 self.name_buf_mut().push_char(c); 282 match data::NAMED_ENTITIES.get(&self.name_buf()[..]) { 283 // We have either a full match or a prefix of one. 284 Some(&m) => { 285 if m.0 != 0 { 286 // We have a full match, but there might be a longer one to come. 287 self.name_match = Some(m); 288 self.name_len = self.name_buf().len(); 289 } 290 // Otherwise we just have a prefix match. 291 Progress 292 }, 293 294 // Can't continue the match. 295 None => self.finish_named(tokenizer, Some(c), input), 296 } 297 } 298 emit_name_error<Sink: TokenSink>(&mut self, tokenizer: &mut XmlTokenizer<Sink>)299 fn emit_name_error<Sink: TokenSink>(&mut self, tokenizer: &mut XmlTokenizer<Sink>) { 300 let msg = format_if!( 301 tokenizer.opts.exact_errors, 302 "Invalid character reference", 303 "Invalid character reference &{}", 304 self.name_buf() 305 ); 306 tokenizer.emit_error(msg); 307 } 308 unconsume_name<Sink: TokenSink>( &mut self, tokenizer: &mut XmlTokenizer<Sink>, input: &mut BufferQueue, )309 fn unconsume_name<Sink: TokenSink>( 310 &mut self, 311 tokenizer: &mut XmlTokenizer<Sink>, 312 input: &mut BufferQueue, 313 ) { 314 tokenizer.unconsume(input, self.name_buf_opt.take().unwrap()); 315 } 316 finish_named<Sink: TokenSink>( &mut self, tokenizer: &mut XmlTokenizer<Sink>, end_char: Option<char>, input: &mut BufferQueue, ) -> Status317 fn finish_named<Sink: TokenSink>( 318 &mut self, 319 tokenizer: &mut XmlTokenizer<Sink>, 320 end_char: Option<char>, 321 input: &mut BufferQueue, 322 ) -> Status { 323 match self.name_match { 324 None => { 325 match end_char { 326 Some(c) if is_ascii_alnum(c) => { 327 // Keep looking for a semicolon, to determine whether 328 // we emit a parse error. 329 self.state = BogusName; 330 return Progress; 331 }, 332 333 // Check length because &; is not a parse error. 334 Some(';') if self.name_buf().len() > 1 => self.emit_name_error(tokenizer), 335 336 _ => (), 337 } 338 self.unconsume_name(tokenizer, input); 339 self.finish_none() 340 }, 341 342 Some((c1, c2)) => { 343 // We have a complete match, but we may have consumed 344 // additional characters into self.name_buf. Usually 345 // at least one, but several in cases like 346 // 347 // ¬ => match for U+00AC 348 // ¬i => valid prefix for ¬in 349 // ¬it => can't continue match 350 351 let name_len = self.name_len; 352 assert!(name_len > 0); 353 let last_matched = self.name_buf()[name_len - 1..].chars().next().unwrap(); 354 355 // There might not be a next character after the match, if 356 // we had a full match and then hit EOF. 357 let next_after = if name_len == self.name_buf().len() { 358 None 359 } else { 360 Some(self.name_buf()[name_len..].chars().next().unwrap()) 361 }; 362 363 // "If the character reference is being consumed as part of an 364 // attribute, and the last character matched is not a U+003B 365 // SEMICOLON character (;), and the next character is either a 366 // U+003D EQUALS SIGN character (=) or an alphanumeric ASCII 367 // character, then, for historical reasons, all the characters 368 // that were matched after the U+0026 AMPERSAND character (&) 369 // must be unconsumed, and nothing is returned. However, if 370 // this next character is in fact a U+003D EQUALS SIGN 371 // character (=), then this is a parse error" 372 373 let unconsume_all = match (self.addnl_allowed, last_matched, next_after) { 374 (_, ';', _) => false, 375 (Some(_), _, Some('=')) => { 376 tokenizer.emit_error(Borrowed( 377 "Equals sign after character reference in attribute", 378 )); 379 true 380 }, 381 (Some(_), _, Some(c)) if is_ascii_alnum(c) => true, 382 _ => { 383 tokenizer.emit_error(Borrowed( 384 "Character reference does not end with semicolon", 385 )); 386 false 387 }, 388 }; 389 390 if unconsume_all { 391 self.unconsume_name(tokenizer, input); 392 self.finish_none() 393 } else { 394 tokenizer 395 .unconsume(input, StrTendril::from_slice(&self.name_buf()[name_len..])); 396 self.result = Some(CharRef { 397 chars: [from_u32(c1).unwrap(), from_u32(c2).unwrap()], 398 num_chars: if c2 == 0 { 1 } else { 2 }, 399 }); 400 Done 401 } 402 }, 403 } 404 } 405 do_bogus_name<Sink: TokenSink>( &mut self, tokenizer: &mut XmlTokenizer<Sink>, input: &mut BufferQueue, ) -> Status406 fn do_bogus_name<Sink: TokenSink>( 407 &mut self, 408 tokenizer: &mut XmlTokenizer<Sink>, 409 input: &mut BufferQueue, 410 ) -> Status { 411 let c = unwrap_or_return!(tokenizer.get_char(input), Stuck); 412 self.name_buf_mut().push_char(c); 413 match c { 414 _ if is_ascii_alnum(c) => return Progress, 415 ';' => self.emit_name_error(tokenizer), 416 _ => (), 417 } 418 self.unconsume_name(tokenizer, input); 419 self.finish_none() 420 } 421 end_of_file<Sink: TokenSink>( &mut self, tokenizer: &mut XmlTokenizer<Sink>, input: &mut BufferQueue, )422 pub fn end_of_file<Sink: TokenSink>( 423 &mut self, 424 tokenizer: &mut XmlTokenizer<Sink>, 425 input: &mut BufferQueue, 426 ) { 427 while self.result.is_none() { 428 match self.state { 429 Begin => drop(self.finish_none()), 430 431 Numeric(_) if !self.seen_digit => drop(self.unconsume_numeric(tokenizer, input)), 432 433 Numeric(_) | NumericSemicolon => { 434 tokenizer.emit_error(Borrowed("EOF in numeric character reference")); 435 self.finish_numeric(tokenizer); 436 }, 437 438 Named => drop(self.finish_named(tokenizer, None, input)), 439 440 BogusName => { 441 self.unconsume_name(tokenizer, input); 442 self.finish_none(); 443 }, 444 445 Octothorpe => { 446 tokenizer.unconsume(input, StrTendril::from_slice("#")); 447 tokenizer.emit_error(Borrowed("EOF after '#' in character reference")); 448 self.finish_none(); 449 }, 450 } 451 } 452 } 453 } 454