// Copyright 2014-2017 The html5ever Project Developers. See the
// COPYRIGHT file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.

//! Sub-tokenizer for character references (`&name;`, `&#123;`, `&#x1F;`, ...),
//! driven one step at a time by the main XML tokenizer.

use super::{TokenSink, XmlTokenizer};
use crate::data;
use crate::tendril::StrTendril;
use log::debug;
use mac::{format_if, unwrap_or_return};
use markup5ever::buffer_queue::BufferQueue;
use std::borrow::Cow::Borrowed;
use std::char::from_u32;

use self::State::*;
pub use self::Status::*;

//§ tokenizing-character-references
/// The decoded result of one character reference.
pub struct CharRef {
    /// The resulting character(s)
    pub chars: [char; 2],

    /// How many slots in `chars` are valid?
    pub num_chars: u8,
}

/// Outcome of a single `step` of the sub-tokenizer.
pub enum Status {
    /// No progress possible until more input arrives.
    Stuck,
    /// Consumed input; call `step` again.
    Progress,
    /// Finished; retrieve the result with `get_result`.
    Done,
}

/// Internal states of the character-reference state machine.
#[derive(Debug)]
enum State {
    Begin,
    Octothorpe,
    Numeric(u32), // base
    NumericSemicolon,
    Named,
    BogusName,
}

/// Incremental tokenizer for a single character reference.
pub struct CharRefTokenizer {
    state: State,
    // Extra character allowed to terminate the reference; present iff we are
    // inside an attribute value (see NB on the impl below).
    addnl_allowed: Option<char>,
    // Set exactly once when tokenization completes.
    result: Option<CharRef>,

    // Accumulator for numeric (`&#...`) references.
    num: u32,
    // Latched when `num` exceeds the Unicode range; the digits are still
    // consumed but the result becomes U+FFFD.
    num_too_big: bool,
    seen_digit: bool,
    // The 'x'/'X' hex marker, kept so it can be unconsumed on error.
    hex_marker: Option<char>,

    // Characters consumed so far while matching a named reference.
    name_buf_opt: Option<StrTendril>,
    // Code points of the longest complete entity match found so far.
    name_match: Option<(u32, u32)>,
    // Length of `name_buf` at the time of that longest match.
    name_len: usize,
}

impl CharRefTokenizer {
    // NB: We assume that we have an additional allowed character iff we're
    // tokenizing in an attribute value.
    /// Create a tokenizer for one character reference. `addnl_allowed` is the
    /// extra terminating character permitted when tokenizing inside an
    /// attribute value (per the NB above).
    pub fn new(addnl_allowed: Option<char>) -> CharRefTokenizer {
        CharRefTokenizer {
            state: Begin,
            addnl_allowed,
            result: None,
            num: 0,
            num_too_big: false,
            seen_digit: false,
            hex_marker: None,
            name_buf_opt: None,
            name_match: None,
            name_len: 0,
        }
    }

    // A CharRefTokenizer can only tokenize one character reference,
    // so this method consumes the tokenizer.
    pub fn get_result(self) -> CharRef {
        self.result.expect("get_result called before done")
    }

    // Accessor for the name buffer; only valid in the Named/BogusName states.
    fn name_buf(&self) -> &StrTendril {
        self.name_buf_opt
            .as_ref()
            .expect("name_buf missing in named character reference")
    }

    fn name_buf_mut(&mut self) -> &mut StrTendril {
        self.name_buf_opt
            .as_mut()
            .expect("name_buf missing in named character reference")
    }

    // Finish with no characters produced (e.g. a bare '&').
    fn finish_none(&mut self) -> Status {
        self.result = Some(CharRef {
            chars: ['\0', '\0'],
            num_chars: 0,
        });
        Done
    }

    // Finish with the single character `c`.
    fn finish_one(&mut self, c: char) -> Status {
        self.result = Some(CharRef {
            chars: [c, '\0'],
            num_chars: 1,
        });
        Done
    }
}

impl CharRefTokenizer {
    /// Advance the state machine by (at most) one input character.
    /// Returns `Done` immediately if a result has already been produced.
    pub fn step<Sink: TokenSink>(
        &mut self,
        tokenizer: &mut XmlTokenizer<Sink>,
        input: &mut BufferQueue,
    ) -> Status {
        if self.result.is_some() {
            return Done;
        }

        debug!("char ref tokenizer stepping in state {:?}", self.state);
        match self.state {
            Begin => self.do_begin(tokenizer, input),
            Octothorpe => self.do_octothorpe(tokenizer, input),
            Numeric(base) => self.do_numeric(tokenizer, base, input),
            NumericSemicolon => self.do_numeric_semicolon(tokenizer, input),
            Named => self.do_named(tokenizer, input),
            BogusName => self.do_bogus_name(tokenizer, input),
        }
    }

    // Initial state: decide between a numeric and a named reference, or bail
    // out for characters that cannot start a reference.
    fn do_begin<Sink: TokenSink>(
        &mut self,
        tokenizer: &mut XmlTokenizer<Sink>,
        input: &mut BufferQueue,
    ) -> Status {
        match unwrap_or_return!(tokenizer.peek(input), Stuck) {
            // Whitespace, '<', or '&' terminates immediately: not a reference.
            '\t' | '\n' | '\x0C' | ' ' | '<' | '&' => self.finish_none(),
            c if Some(c) == self.addnl_allowed => self.finish_none(),

            '#' => {
                tokenizer.discard_char(input);
                self.state = Octothorpe;
                Progress
            },

            _ => {
                self.state = Named;
                self.name_buf_opt = Some(StrTendril::new());
                Progress
            },
        }
    }

    // After "&#": pick hexadecimal or decimal, remembering the marker so it
    // can be unconsumed if no digits follow.
    fn do_octothorpe<Sink: TokenSink>(
        &mut self,
        tokenizer: &mut XmlTokenizer<Sink>,
        input: &mut BufferQueue,
    ) -> Status {
        let c = unwrap_or_return!(tokenizer.peek(input), Stuck);
        match c {
            'x' | 'X' => {
                tokenizer.discard_char(input);
                self.hex_marker = Some(c);
                self.state = Numeric(16);
            },

            _ => {
                self.hex_marker = None;
                self.state = Numeric(10);
            },
        }
        Progress
    }

    // Accumulate digits of a numeric reference in the given base.
    fn do_numeric<Sink: TokenSink>(
        &mut self,
        tokenizer: &mut XmlTokenizer<Sink>,
        base: u32,
        input: &mut BufferQueue,
    ) -> Status {
        let c = unwrap_or_return!(tokenizer.peek(input), Stuck);
        match c.to_digit(base) {
            Some(n) => {
                tokenizer.discard_char(input);
                self.num = self.num.wrapping_mul(base);
                if self.num > 0x10FFFF {
                    // We might overflow, and the character is definitely invalid.
                    // We still parse digits and semicolon, but don't use the result.
                    self.num_too_big = true;
                }
                self.num = self.num.wrapping_add(n);
                self.seen_digit = true;
                Progress
            },

            // "&#" (or "&#x") with no digits at all: back out entirely.
            None if !self.seen_digit => self.unconsume_numeric(tokenizer, input),

            None => {
                self.state = NumericSemicolon;
                Progress
            },
        }
    }

    // Digits are done; consume the terminating ';' (reporting a parse error
    // if it is missing) and produce the character.
    fn do_numeric_semicolon<Sink: TokenSink>(
        &mut self,
        tokenizer: &mut XmlTokenizer<Sink>,
        input: &mut BufferQueue,
    ) -> Status {
        match unwrap_or_return!(tokenizer.peek(input), Stuck) {
            ';' => tokenizer.discard_char(input),
            _ => tokenizer.emit_error(Borrowed(
                "Semicolon missing after numeric character reference",
            )),
        };
        self.finish_numeric(tokenizer)
    }

    // Push back the "#" (and hex marker, if any) for a digit-less numeric
    // reference and emit nothing.
    fn unconsume_numeric<Sink: TokenSink>(
        &mut self,
        tokenizer: &mut XmlTokenizer<Sink>,
        input: &mut BufferQueue,
    ) -> Status {
        let mut unconsume = StrTendril::from_char('#');
        match self.hex_marker {
            Some(c) => unconsume.push_char(c),
            None => (),
        }

        tokenizer.unconsume(input, unconsume);
        tokenizer.emit_error(Borrowed("Numeric character reference without digits"));
        self.finish_none()
    }

    // Map the accumulated code point to a character, applying the
    // replacement rules for out-of-range, surrogate, C1-control, and
    // noncharacter values; emit a parse error for every invalid case.
    fn finish_numeric<Sink: TokenSink>(&mut self, tokenizer: &mut XmlTokenizer<Sink>) -> Status {
        fn conv(n: u32) -> char {
            from_u32(n).expect("invalid char missed by error handling cases")
        }

        let (c, error) = match self.num {
            // Out of Unicode range (including overflow detected earlier).
            n if (n > 0x10FFFF) || self.num_too_big => ('\u{fffd}', true),
            // NUL and surrogates are replaced outright.
            0x00 | 0xD800..=0xDFFF => ('\u{fffd}', true),

            // C1 controls: remap via the Windows-1252 replacement table.
            0x80..=0x9F => match data::C1_REPLACEMENTS[(self.num - 0x80) as usize] {
                Some(c) => (c, true),
                None => (conv(self.num), true),
            },

            // Other control characters and noncharacters: keep the value but
            // still report a parse error.
            0x01..=0x08 | 0x0B | 0x0D..=0x1F | 0x7F | 0xFDD0..=0xFDEF => (conv(self.num), true),

            // U+xxFFFE / U+xxFFFF noncharacters on every plane.
            n if (n & 0xFFFE) == 0xFFFE => (conv(n), true),

            n => (conv(n), false),
        };

        if error {
            let msg = format_if!(
                tokenizer.opts.exact_errors,
                "Invalid numeric character reference",
                "Invalid numeric character reference value 0x{:06X}",
                self.num
            );
            tokenizer.emit_error(msg);
        }

        self.finish_one(c)
    }

    // Greedily extend a named reference one character at a time, tracking the
    // longest complete entity matched so far.
    fn do_named<Sink: TokenSink>(
        &mut self,
        tokenizer: &mut XmlTokenizer<Sink>,
        input: &mut BufferQueue,
    ) -> Status {
        let c = unwrap_or_return!(tokenizer.get_char(input), Stuck);
        self.name_buf_mut().push_char(c);
        match data::NAMED_ENTITIES.get(&self.name_buf()[..]) {
            // We have either a full match or a prefix of one.
            Some(&m) => {
                if m.0 != 0 {
                    // We have a full match, but there might be a longer one to come.
                    self.name_match = Some(m);
                    self.name_len = self.name_buf().len();
                }
                // Otherwise we just have a prefix match.
                Progress
            },

            // Can't continue the match.
            None => self.finish_named(tokenizer, Some(c), input),
        }
    }

    fn emit_name_error<Sink: TokenSink>(&mut self, tokenizer: &mut XmlTokenizer<Sink>) {
        let msg = format_if!(
            tokenizer.opts.exact_errors,
            "Invalid character reference",
            "Invalid character reference &{}",
            self.name_buf()
        );
        tokenizer.emit_error(msg);
    }

    // Push the entire name buffer back onto the input.
    fn unconsume_name<Sink: TokenSink>(
        &mut self,
        tokenizer: &mut XmlTokenizer<Sink>,
        input: &mut BufferQueue,
    ) {
        tokenizer.unconsume(input, self.name_buf_opt.take().unwrap());
    }

    // Resolve a named reference once the match can no longer be extended.
    // `end_char` is the character that ended the match (None at EOF).
    fn finish_named<Sink: TokenSink>(
        &mut self,
        tokenizer: &mut XmlTokenizer<Sink>,
        end_char: Option<char>,
        input: &mut BufferQueue,
    ) -> Status {
        match self.name_match {
            None => {
                match end_char {
                    Some(c) if c.is_ascii_alphanumeric() => {
                        // Keep looking for a semicolon, to determine whether
                        // we emit a parse error.
                        self.state = BogusName;
                        return Progress;
                    },

                    // Check length because &; is not a parse error.
                    Some(';') if self.name_buf().len() > 1 => self.emit_name_error(tokenizer),

                    _ => (),
                }
                self.unconsume_name(tokenizer, input);
                self.finish_none()
            },

            Some((c1, c2)) => {
                // We have a complete match, but we may have consumed
                // additional characters into self.name_buf. Usually
                // at least one, but several in cases like
                //
                //     &not    => match for U+00AC
                //     &noti   => valid prefix for &notin
                //     &notit  => can't continue match

                let name_len = self.name_len;
                assert!(name_len > 0);
                let last_matched = self.name_buf()[name_len - 1..].chars().next().unwrap();

                // There might not be a next character after the match, if
                // we had a full match and then hit EOF.
                let next_after = if name_len == self.name_buf().len() {
                    None
                } else {
                    Some(self.name_buf()[name_len..].chars().next().unwrap())
                };

                // "If the character reference is being consumed as part of an
                // attribute, and the last character matched is not a U+003B
                // SEMICOLON character (;), and the next character is either a
                // U+003D EQUALS SIGN character (=) or an alphanumeric ASCII
                // character, then, for historical reasons, all the characters
                // that were matched after the U+0026 AMPERSAND character (&)
                // must be unconsumed, and nothing is returned. However, if
                // this next character is in fact a U+003D EQUALS SIGN
                // character (=), then this is a parse error"

                let unconsume_all = match (self.addnl_allowed, last_matched, next_after) {
                    (_, ';', _) => false,
                    (Some(_), _, Some('=')) => {
                        tokenizer.emit_error(Borrowed(
                            "Equals sign after character reference in attribute",
                        ));
                        true
                    },
                    (Some(_), _, Some(c)) if c.is_ascii_alphanumeric() => true,
                    _ => {
                        tokenizer.emit_error(Borrowed(
                            "Character reference does not end with semicolon",
                        ));
                        false
                    },
                };

                if unconsume_all {
                    self.unconsume_name(tokenizer, input);
                    self.finish_none()
                } else {
                    // Push back only the characters consumed past the match.
                    tokenizer
                        .unconsume(input, StrTendril::from_slice(&self.name_buf()[name_len..]));
                    self.result = Some(CharRef {
                        chars: [from_u32(c1).unwrap(), from_u32(c2).unwrap()],
                        num_chars: if c2 == 0 { 1 } else { 2 },
                    });
                    Done
                }
            },
        }
    }

    // A run of alphanumerics with no entity match: scan to its end purely to
    // decide whether a ';' terminator warrants a parse error, then unconsume.
    fn do_bogus_name<Sink: TokenSink>(
        &mut self,
        tokenizer: &mut XmlTokenizer<Sink>,
        input: &mut BufferQueue,
    ) -> Status {
        let c = unwrap_or_return!(tokenizer.get_char(input), Stuck);
        self.name_buf_mut().push_char(c);
        match c {
            _ if c.is_ascii_alphanumeric() => return Progress,
            ';' => self.emit_name_error(tokenizer),
            _ => (),
        }
        self.unconsume_name(tokenizer, input);
        self.finish_none()
    }

    /// Drive the state machine to completion at end of input, emitting the
    /// appropriate parse errors and unconsuming partial input as needed.
    pub fn end_of_file<Sink: TokenSink>(
        &mut self,
        tokenizer: &mut XmlTokenizer<Sink>,
        input: &mut BufferQueue,
    ) {
        while self.result.is_none() {
            match self.state {
                Begin => drop(self.finish_none()),

                Numeric(_) if !self.seen_digit => drop(self.unconsume_numeric(tokenizer, input)),

                Numeric(_) | NumericSemicolon => {
                    tokenizer.emit_error(Borrowed("EOF in numeric character reference"));
                    self.finish_numeric(tokenizer);
                },

                Named => drop(self.finish_named(tokenizer, None, input)),

                BogusName => {
                    self.unconsume_name(tokenizer, input);
                    self.finish_none();
                },

                Octothorpe => {
                    tokenizer.unconsume(input, StrTendril::from_slice("#"));
                    tokenizer.emit_error(Borrowed("EOF after '#' in character reference"));
                    self.finish_none();
                },
            }
        }
    }
}