1 // Copyright 2014-2017 The html5ever Project Developers. See the 2 // COPYRIGHT file at the top-level directory of this distribution. 3 // 4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or 5 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license 6 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your 7 // option. This file may not be copied, modified, or distributed 8 // except according to those terms. 9 10 use super::{TokenSink, Tokenizer}; 11 use crate::buffer_queue::BufferQueue; 12 use crate::data; 13 use crate::tendril::StrTendril; 14 use crate::util::str::is_ascii_alnum; 15 16 use log::debug; 17 use mac::format_if; 18 use std::borrow::Cow::Borrowed; 19 use std::char::from_u32; 20 21 use self::State::*; 22 pub use self::Status::*; 23 24 //§ tokenizing-character-references 25 pub struct CharRef { 26 /// The resulting character(s) 27 pub chars: [char; 2], 28 29 /// How many slots in `chars` are valid? 30 pub num_chars: u8, 31 } 32 33 pub enum Status { 34 Stuck, 35 Progress, 36 Done, 37 } 38 39 #[derive(Debug)] 40 enum State { 41 Begin, 42 Octothorpe, 43 Numeric(u32), // base 44 NumericSemicolon, 45 Named, 46 BogusName, 47 } 48 49 pub struct CharRefTokenizer { 50 state: State, 51 addnl_allowed: Option<char>, 52 result: Option<CharRef>, 53 54 num: u32, 55 num_too_big: bool, 56 seen_digit: bool, 57 hex_marker: Option<char>, 58 59 name_buf_opt: Option<StrTendril>, 60 name_match: Option<(u32, u32)>, 61 name_len: usize, 62 } 63 64 impl CharRefTokenizer { 65 // NB: We assume that we have an additional allowed character iff we're 66 // tokenizing in an attribute value. new(addnl_allowed: Option<char>) -> CharRefTokenizer67 pub fn new(addnl_allowed: Option<char>) -> CharRefTokenizer { 68 CharRefTokenizer { 69 state: Begin, 70 addnl_allowed: addnl_allowed, 71 result: None, 72 num: 0, 73 num_too_big: false, 74 seen_digit: false, 75 hex_marker: None, 76 name_buf_opt: None, 77 name_match: None, 78 name_len: 0, 79 } 80 } 81 82 // A CharRefTokenizer can only tokenize one character reference, 83 // so this method consumes the tokenizer. get_result(self) -> CharRef84 pub fn get_result(self) -> CharRef { 85 self.result.expect("get_result called before done") 86 } 87 name_buf<'t>(&'t self) -> &'t StrTendril88 fn name_buf<'t>(&'t self) -> &'t StrTendril { 89 self.name_buf_opt 90 .as_ref() 91 .expect("name_buf missing in named character reference") 92 } 93 name_buf_mut<'t>(&'t mut self) -> &'t mut StrTendril94 fn name_buf_mut<'t>(&'t mut self) -> &'t mut StrTendril { 95 self.name_buf_opt 96 .as_mut() 97 .expect("name_buf missing in named character reference") 98 } 99 finish_none(&mut self) -> Status100 fn finish_none(&mut self) -> Status { 101 self.result = Some(CharRef { 102 chars: ['\0', '\0'], 103 num_chars: 0, 104 }); 105 Done 106 } 107 finish_one(&mut self, c: char) -> Status108 fn finish_one(&mut self, c: char) -> Status { 109 self.result = Some(CharRef { 110 chars: [c, '\0'], 111 num_chars: 1, 112 }); 113 Done 114 } 115 } 116 117 impl CharRefTokenizer { step<Sink: TokenSink>( &mut self, tokenizer: &mut Tokenizer<Sink>, input: &mut BufferQueue, ) -> Status118 pub fn step<Sink: TokenSink>( 119 &mut self, 120 tokenizer: &mut Tokenizer<Sink>, 121 input: &mut BufferQueue, 122 ) -> Status { 123 if self.result.is_some() { 124 return Done; 125 } 126 127 debug!("char ref tokenizer stepping in state {:?}", self.state); 128 match self.state { 129 Begin => self.do_begin(tokenizer, input), 130 Octothorpe => self.do_octothorpe(tokenizer, input), 131 Numeric(base) => self.do_numeric(tokenizer, input, base), 132 NumericSemicolon => self.do_numeric_semicolon(tokenizer, input), 133 Named => self.do_named(tokenizer, input), 134 BogusName => self.do_bogus_name(tokenizer, input), 135 } 136 } 137 do_begin<Sink: TokenSink>( &mut self, tokenizer: &mut Tokenizer<Sink>, input: &mut BufferQueue, ) -> Status138 fn do_begin<Sink: TokenSink>( 139 &mut self, 140 tokenizer: &mut Tokenizer<Sink>, 141 input: &mut BufferQueue, 142 ) -> Status { 143 match unwrap_or_return!(tokenizer.peek(input), Stuck) { 144 '\t' | '\n' | '\x0C' | ' ' | '<' | '&' => self.finish_none(), 145 c if Some(c) == self.addnl_allowed => self.finish_none(), 146 147 '#' => { 148 tokenizer.discard_char(input); 149 self.state = Octothorpe; 150 Progress 151 }, 152 153 _ => { 154 self.state = Named; 155 self.name_buf_opt = Some(StrTendril::new()); 156 Progress 157 }, 158 } 159 } 160 do_octothorpe<Sink: TokenSink>( &mut self, tokenizer: &mut Tokenizer<Sink>, input: &mut BufferQueue, ) -> Status161 fn do_octothorpe<Sink: TokenSink>( 162 &mut self, 163 tokenizer: &mut Tokenizer<Sink>, 164 input: &mut BufferQueue, 165 ) -> Status { 166 let c = unwrap_or_return!(tokenizer.peek(input), Stuck); 167 match c { 168 'x' | 'X' => { 169 tokenizer.discard_char(input); 170 self.hex_marker = Some(c); 171 self.state = Numeric(16); 172 }, 173 174 _ => { 175 self.hex_marker = None; 176 self.state = Numeric(10); 177 }, 178 } 179 Progress 180 } 181 do_numeric<Sink: TokenSink>( &mut self, tokenizer: &mut Tokenizer<Sink>, input: &mut BufferQueue, base: u32, ) -> Status182 fn do_numeric<Sink: TokenSink>( 183 &mut self, 184 tokenizer: &mut Tokenizer<Sink>, 185 input: &mut BufferQueue, 186 base: u32, 187 ) -> Status { 188 let c = unwrap_or_return!(tokenizer.peek(input), Stuck); 189 match c.to_digit(base) { 190 Some(n) => { 191 tokenizer.discard_char(input); 192 self.num = self.num.wrapping_mul(base); 193 if self.num > 0x10FFFF { 194 // We might overflow, and the character is definitely invalid. 195 // We still parse digits and semicolon, but don't use the result. 196 self.num_too_big = true; 197 } 198 self.num = self.num.wrapping_add(n); 199 self.seen_digit = true; 200 Progress 201 }, 202 203 None if !self.seen_digit => self.unconsume_numeric(tokenizer, input), 204 205 None => { 206 self.state = NumericSemicolon; 207 Progress 208 }, 209 } 210 } 211 do_numeric_semicolon<Sink: TokenSink>( &mut self, tokenizer: &mut Tokenizer<Sink>, input: &mut BufferQueue, ) -> Status212 fn do_numeric_semicolon<Sink: TokenSink>( 213 &mut self, 214 tokenizer: &mut Tokenizer<Sink>, 215 input: &mut BufferQueue, 216 ) -> Status { 217 match unwrap_or_return!(tokenizer.peek(input), Stuck) { 218 ';' => tokenizer.discard_char(input), 219 _ => tokenizer.emit_error(Borrowed( 220 "Semicolon missing after numeric character reference", 221 )), 222 }; 223 self.finish_numeric(tokenizer) 224 } 225 unconsume_numeric<Sink: TokenSink>( &mut self, tokenizer: &mut Tokenizer<Sink>, input: &mut BufferQueue, ) -> Status226 fn unconsume_numeric<Sink: TokenSink>( 227 &mut self, 228 tokenizer: &mut Tokenizer<Sink>, 229 input: &mut BufferQueue, 230 ) -> Status { 231 let mut unconsume = StrTendril::from_char('#'); 232 match self.hex_marker { 233 Some(c) => unconsume.push_char(c), 234 None => (), 235 } 236 237 input.push_front(unconsume); 238 tokenizer.emit_error(Borrowed("Numeric character reference without digits")); 239 self.finish_none() 240 } 241 finish_numeric<Sink: TokenSink>(&mut self, tokenizer: &mut Tokenizer<Sink>) -> Status242 fn finish_numeric<Sink: TokenSink>(&mut self, tokenizer: &mut Tokenizer<Sink>) -> Status { 243 fn conv(n: u32) -> char { 244 from_u32(n).expect("invalid char missed by error handling cases") 245 } 246 247 let (c, error) = match self.num { 248 n if (n > 0x10FFFF) || self.num_too_big => ('\u{fffd}', true), 249 0x00 | 0xD800..=0xDFFF => ('\u{fffd}', true), 250 251 0x80..=0x9F => match data::C1_REPLACEMENTS[(self.num - 0x80) as usize] { 252 Some(c) => (c, true), 253 None => (conv(self.num), true), 254 }, 255 256 0x01..=0x08 | 0x0B | 0x0D..=0x1F | 0x7F | 0xFDD0..=0xFDEF => (conv(self.num), true), 257 258 n if (n & 0xFFFE) == 0xFFFE => (conv(n), true), 259 260 n => (conv(n), false), 261 }; 262 263 if error { 264 let msg = format_if!( 265 tokenizer.opts.exact_errors, 266 "Invalid numeric character reference", 267 "Invalid numeric character reference value 0x{:06X}", 268 self.num 269 ); 270 tokenizer.emit_error(msg); 271 } 272 273 self.finish_one(c) 274 } 275 do_named<Sink: TokenSink>( &mut self, tokenizer: &mut Tokenizer<Sink>, input: &mut BufferQueue, ) -> Status276 fn do_named<Sink: TokenSink>( 277 &mut self, 278 tokenizer: &mut Tokenizer<Sink>, 279 input: &mut BufferQueue, 280 ) -> Status { 281 let c = unwrap_or_return!(tokenizer.get_char(input), Stuck); 282 self.name_buf_mut().push_char(c); 283 match data::NAMED_ENTITIES.get(&self.name_buf()[..]) { 284 // We have either a full match or a prefix of one. 285 Some(&m) => { 286 if m.0 != 0 { 287 // We have a full match, but there might be a longer one to come. 288 self.name_match = Some(m); 289 self.name_len = self.name_buf().len(); 290 } 291 // Otherwise we just have a prefix match. 292 Progress 293 }, 294 295 // Can't continue the match. 296 None => self.finish_named(tokenizer, input, Some(c)), 297 } 298 } 299 emit_name_error<Sink: TokenSink>(&mut self, tokenizer: &mut Tokenizer<Sink>)300 fn emit_name_error<Sink: TokenSink>(&mut self, tokenizer: &mut Tokenizer<Sink>) { 301 let msg = format_if!( 302 tokenizer.opts.exact_errors, 303 "Invalid character reference", 304 "Invalid character reference &{}", 305 self.name_buf() 306 ); 307 tokenizer.emit_error(msg); 308 } 309 unconsume_name(&mut self, input: &mut BufferQueue)310 fn unconsume_name(&mut self, input: &mut BufferQueue) { 311 input.push_front(self.name_buf_opt.take().unwrap()); 312 } 313 finish_named<Sink: TokenSink>( &mut self, tokenizer: &mut Tokenizer<Sink>, input: &mut BufferQueue, end_char: Option<char>, ) -> Status314 fn finish_named<Sink: TokenSink>( 315 &mut self, 316 tokenizer: &mut Tokenizer<Sink>, 317 input: &mut BufferQueue, 318 end_char: Option<char>, 319 ) -> Status { 320 match self.name_match { 321 None => { 322 match end_char { 323 Some(c) if is_ascii_alnum(c) => { 324 // Keep looking for a semicolon, to determine whether 325 // we emit a parse error. 326 self.state = BogusName; 327 return Progress; 328 }, 329 330 // Check length because &; is not a parse error. 331 Some(';') if self.name_buf().len() > 1 => self.emit_name_error(tokenizer), 332 333 _ => (), 334 } 335 self.unconsume_name(input); 336 self.finish_none() 337 }, 338 339 Some((c1, c2)) => { 340 // We have a complete match, but we may have consumed 341 // additional characters into self.name_buf. Usually 342 // at least one, but several in cases like 343 // 344 // ¬ => match for U+00AC 345 // ¬i => valid prefix for ¬in 346 // ¬it => can't continue match 347 348 let name_len = self.name_len; 349 assert!(name_len > 0); 350 let last_matched = self.name_buf()[name_len - 1..].chars().next().unwrap(); 351 352 // There might not be a next character after the match, if 353 // we had a full match and then hit EOF. 354 let next_after = if name_len == self.name_buf().len() { 355 None 356 } else { 357 Some(self.name_buf()[name_len..].chars().next().unwrap()) 358 }; 359 360 // "If the character reference is being consumed as part of an 361 // attribute, and the last character matched is not a U+003B 362 // SEMICOLON character (;), and the next character is either a 363 // U+003D EQUALS SIGN character (=) or an alphanumeric ASCII 364 // character, then, for historical reasons, all the characters 365 // that were matched after the U+0026 AMPERSAND character (&) 366 // must be unconsumed, and nothing is returned. However, if 367 // this next character is in fact a U+003D EQUALS SIGN 368 // character (=), then this is a parse error" 369 370 let unconsume_all = match (self.addnl_allowed, last_matched, next_after) { 371 (_, ';', _) => false, 372 (Some(_), _, Some('=')) => { 373 tokenizer.emit_error(Borrowed( 374 "Equals sign after character reference in attribute", 375 )); 376 true 377 }, 378 (Some(_), _, Some(c)) if is_ascii_alnum(c) => true, 379 _ => { 380 tokenizer.emit_error(Borrowed( 381 "Character reference does not end with semicolon", 382 )); 383 false 384 }, 385 }; 386 387 if unconsume_all { 388 self.unconsume_name(input); 389 self.finish_none() 390 } else { 391 input.push_front(StrTendril::from_slice(&self.name_buf()[name_len..])); 392 self.result = Some(CharRef { 393 chars: [from_u32(c1).unwrap(), from_u32(c2).unwrap()], 394 num_chars: if c2 == 0 { 1 } else { 2 }, 395 }); 396 Done 397 } 398 }, 399 } 400 } 401 do_bogus_name<Sink: TokenSink>( &mut self, tokenizer: &mut Tokenizer<Sink>, input: &mut BufferQueue, ) -> Status402 fn do_bogus_name<Sink: TokenSink>( 403 &mut self, 404 tokenizer: &mut Tokenizer<Sink>, 405 input: &mut BufferQueue, 406 ) -> Status { 407 let c = unwrap_or_return!(tokenizer.get_char(input), Stuck); 408 self.name_buf_mut().push_char(c); 409 match c { 410 _ if is_ascii_alnum(c) => return Progress, 411 ';' => self.emit_name_error(tokenizer), 412 _ => (), 413 } 414 self.unconsume_name(input); 415 self.finish_none() 416 } 417 end_of_file<Sink: TokenSink>( &mut self, tokenizer: &mut Tokenizer<Sink>, input: &mut BufferQueue, )418 pub fn end_of_file<Sink: TokenSink>( 419 &mut self, 420 tokenizer: &mut Tokenizer<Sink>, 421 input: &mut BufferQueue, 422 ) { 423 while self.result.is_none() { 424 match self.state { 425 Begin => drop(self.finish_none()), 426 427 Numeric(_) if !self.seen_digit => drop(self.unconsume_numeric(tokenizer, input)), 428 429 Numeric(_) | NumericSemicolon => { 430 tokenizer.emit_error(Borrowed("EOF in numeric character reference")); 431 self.finish_numeric(tokenizer); 432 }, 433 434 Named => drop(self.finish_named(tokenizer, input, None)), 435 436 BogusName => { 437 self.unconsume_name(input); 438 self.finish_none(); 439 }, 440 441 Octothorpe => { 442 input.push_front(StrTendril::from_slice("#")); 443 tokenizer.emit_error(Borrowed("EOF after '#' in character reference")); 444 self.finish_none(); 445 }, 446 } 447 } 448 } 449 } 450