1 // Copyright 2014 The html5ever Project Developers. See the 2 // COPYRIGHT file at the top-level directory of this distribution. 3 // 4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or 5 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license 6 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your 7 // option. This file may not be copied, modified, or distributed 8 // except according to those terms. 9 10 use super::{Tokenizer, TokenSink}; 11 use super::buffer_queue::BufferQueue; 12 13 use util::str::{is_ascii_alnum}; 14 15 use tendril::StrTendril; 16 17 use std::char::from_u32; 18 use std::borrow::Cow::Borrowed; 19 20 pub use self::Status::*; 21 use self::State::*; 22 23 mod data; 24 25 //§ tokenizing-character-references 26 pub struct CharRef { 27 /// The resulting character(s) 28 pub chars: [char; 2], 29 30 /// How many slots in `chars` are valid? 31 pub num_chars: u8, 32 } 33 34 pub enum Status { 35 Stuck, 36 Progress, 37 Done, 38 } 39 40 #[derive(Debug)] 41 enum State { 42 Begin, 43 Octothorpe, 44 Numeric(u32), // base 45 NumericSemicolon, 46 Named, 47 BogusName, 48 } 49 50 pub struct CharRefTokenizer { 51 state: State, 52 addnl_allowed: Option<char>, 53 result: Option<CharRef>, 54 55 num: u32, 56 num_too_big: bool, 57 seen_digit: bool, 58 hex_marker: Option<char>, 59 60 name_buf_opt: Option<StrTendril>, 61 name_match: Option<(u32, u32)>, 62 name_len: usize, 63 } 64 65 impl CharRefTokenizer { 66 // NB: We assume that we have an additional allowed character iff we're 67 // tokenizing in an attribute value. new(addnl_allowed: Option<char>) -> CharRefTokenizer68 pub fn new(addnl_allowed: Option<char>) -> CharRefTokenizer { 69 CharRefTokenizer { 70 state: Begin, 71 addnl_allowed: addnl_allowed, 72 result: None, 73 num: 0, 74 num_too_big: false, 75 seen_digit: false, 76 hex_marker: None, 77 name_buf_opt: None, 78 name_match: None, 79 name_len: 0, 80 } 81 } 82 83 // A CharRefTokenizer can only tokenize one character reference, 84 // so this method consumes the tokenizer. get_result(self) -> CharRef85 pub fn get_result(self) -> CharRef { 86 self.result.expect("get_result called before done") 87 } 88 name_buf<'t>(&'t self) -> &'t StrTendril89 fn name_buf<'t>(&'t self) -> &'t StrTendril { 90 self.name_buf_opt.as_ref() 91 .expect("name_buf missing in named character reference") 92 } 93 name_buf_mut<'t>(&'t mut self) -> &'t mut StrTendril94 fn name_buf_mut<'t>(&'t mut self) -> &'t mut StrTendril { 95 self.name_buf_opt.as_mut() 96 .expect("name_buf missing in named character reference") 97 } 98 finish_none(&mut self) -> Status99 fn finish_none(&mut self) -> Status { 100 self.result = Some(CharRef { 101 chars: ['\0', '\0'], 102 num_chars: 0, 103 }); 104 Done 105 } 106 finish_one(&mut self, c: char) -> Status107 fn finish_one(&mut self, c: char) -> Status { 108 self.result = Some(CharRef { 109 chars: [c, '\0'], 110 num_chars: 1, 111 }); 112 Done 113 } 114 } 115 116 impl CharRefTokenizer { step<Sink: TokenSink>( &mut self, tokenizer: &mut Tokenizer<Sink>, input: &mut BufferQueue) -> Status117 pub fn step<Sink: TokenSink>( 118 &mut self, 119 tokenizer: &mut Tokenizer<Sink>, 120 input: &mut BufferQueue) 121 -> Status { 122 if self.result.is_some() { 123 return Done; 124 } 125 126 debug!("char ref tokenizer stepping in state {:?}", self.state); 127 match self.state { 128 Begin => self.do_begin(tokenizer, input), 129 Octothorpe => self.do_octothorpe(tokenizer, input), 130 Numeric(base) => self.do_numeric(tokenizer, input, base), 131 NumericSemicolon => self.do_numeric_semicolon(tokenizer, input), 132 Named => self.do_named(tokenizer, input), 133 BogusName => self.do_bogus_name(tokenizer, input), 134 } 135 } 136 do_begin<Sink: TokenSink>( &mut self, tokenizer: &mut Tokenizer<Sink>, input: &mut BufferQueue) -> Status137 fn do_begin<Sink: TokenSink>( 138 &mut self, 139 tokenizer: &mut Tokenizer<Sink>, 140 input: &mut BufferQueue) 141 -> Status { 142 match unwrap_or_return!(tokenizer.peek(input), Stuck) { 143 '\t' | '\n' | '\x0C' | ' ' | '<' | '&' 144 => self.finish_none(), 145 c if Some(c) == self.addnl_allowed 146 => self.finish_none(), 147 148 '#' => { 149 tokenizer.discard_char(input); 150 self.state = Octothorpe; 151 Progress 152 } 153 154 _ => { 155 self.state = Named; 156 self.name_buf_opt = Some(StrTendril::new()); 157 Progress 158 } 159 } 160 } 161 do_octothorpe<Sink: TokenSink>( &mut self, tokenizer: &mut Tokenizer<Sink>, input: &mut BufferQueue) -> Status162 fn do_octothorpe<Sink: TokenSink>( 163 &mut self, 164 tokenizer: &mut Tokenizer<Sink>, 165 input: &mut BufferQueue) 166 -> Status { 167 let c = unwrap_or_return!(tokenizer.peek(input), Stuck); 168 match c { 169 'x' | 'X' => { 170 tokenizer.discard_char(input); 171 self.hex_marker = Some(c); 172 self.state = Numeric(16); 173 } 174 175 _ => { 176 self.hex_marker = None; 177 self.state = Numeric(10); 178 } 179 } 180 Progress 181 } 182 do_numeric<Sink: TokenSink>( &mut self, tokenizer: &mut Tokenizer<Sink>, input: &mut BufferQueue, base: u32) -> Status183 fn do_numeric<Sink: TokenSink>( 184 &mut self, 185 tokenizer: &mut Tokenizer<Sink>, 186 input: &mut BufferQueue, 187 base: u32) 188 -> Status { 189 let c = unwrap_or_return!(tokenizer.peek(input), Stuck); 190 match c.to_digit(base) { 191 Some(n) => { 192 tokenizer.discard_char(input); 193 self.num = self.num.wrapping_mul(base); 194 if self.num > 0x10FFFF { 195 // We might overflow, and the character is definitely invalid. 196 // We still parse digits and semicolon, but don't use the result. 197 self.num_too_big = true; 198 } 199 self.num = self.num.wrapping_add(n); 200 self.seen_digit = true; 201 Progress 202 } 203 204 None if !self.seen_digit => self.unconsume_numeric(tokenizer, input), 205 206 None => { 207 self.state = NumericSemicolon; 208 Progress 209 } 210 } 211 } 212 do_numeric_semicolon<Sink: TokenSink>( &mut self, tokenizer: &mut Tokenizer<Sink>, input: &mut BufferQueue) -> Status213 fn do_numeric_semicolon<Sink: TokenSink>( 214 &mut self, 215 tokenizer: &mut Tokenizer<Sink>, 216 input: &mut BufferQueue) 217 -> Status { 218 match unwrap_or_return!(tokenizer.peek(input), Stuck) { 219 ';' => tokenizer.discard_char(input), 220 _ => tokenizer.emit_error(Borrowed("Semicolon missing after numeric character reference")), 221 }; 222 self.finish_numeric(tokenizer) 223 } 224 unconsume_numeric<Sink: TokenSink>( &mut self, tokenizer: &mut Tokenizer<Sink>, input: &mut BufferQueue) -> Status225 fn unconsume_numeric<Sink: TokenSink>( 226 &mut self, 227 tokenizer: &mut Tokenizer<Sink>, 228 input: &mut BufferQueue) 229 -> Status { 230 let mut unconsume = StrTendril::from_char('#'); 231 match self.hex_marker { 232 Some(c) => unconsume.push_char(c), 233 None => (), 234 } 235 236 input.push_front(unconsume); 237 tokenizer.emit_error(Borrowed("Numeric character reference without digits")); 238 self.finish_none() 239 } 240 finish_numeric<Sink: TokenSink>(&mut self, tokenizer: &mut Tokenizer<Sink>) -> Status241 fn finish_numeric<Sink: TokenSink>(&mut self, tokenizer: &mut Tokenizer<Sink>) -> Status { 242 fn conv(n: u32) -> char { 243 from_u32(n).expect("invalid char missed by error handling cases") 244 } 245 246 let (c, error) = match self.num { 247 n if (n > 0x10FFFF) || self.num_too_big => ('\u{fffd}', true), 248 0x00 | 0xD800...0xDFFF => ('\u{fffd}', true), 249 250 0x80...0x9F => match data::C1_REPLACEMENTS[(self.num - 0x80) as usize] { 251 Some(c) => (c, true), 252 None => (conv(self.num), true), 253 }, 254 255 0x01...0x08 | 0x0B | 0x0D...0x1F | 0x7F | 0xFDD0...0xFDEF 256 => (conv(self.num), true), 257 258 n if (n & 0xFFFE) == 0xFFFE 259 => (conv(n), true), 260 261 n => (conv(n), false), 262 }; 263 264 if error { 265 let msg = format_if!(tokenizer.opts.exact_errors, 266 "Invalid numeric character reference", 267 "Invalid numeric character reference value 0x{:06X}", self.num); 268 tokenizer.emit_error(msg); 269 } 270 271 self.finish_one(c) 272 } 273 do_named<Sink: TokenSink>( &mut self, tokenizer: &mut Tokenizer<Sink>, input: &mut BufferQueue) -> Status274 fn do_named<Sink: TokenSink>( 275 &mut self, 276 tokenizer: &mut Tokenizer<Sink>, 277 input: &mut BufferQueue) 278 -> Status { 279 let c = unwrap_or_return!(tokenizer.get_char(input), Stuck); 280 self.name_buf_mut().push_char(c); 281 match data::NAMED_ENTITIES.get(&self.name_buf()[..]) { 282 // We have either a full match or a prefix of one. 283 Some(&m) => { 284 if m.0 != 0 { 285 // We have a full match, but there might be a longer one to come. 286 self.name_match = Some(m); 287 self.name_len = self.name_buf().len(); 288 } 289 // Otherwise we just have a prefix match. 290 Progress 291 } 292 293 // Can't continue the match. 294 None => self.finish_named(tokenizer, input, Some(c)), 295 } 296 } 297 emit_name_error<Sink: TokenSink>(&mut self, tokenizer: &mut Tokenizer<Sink>)298 fn emit_name_error<Sink: TokenSink>(&mut self, tokenizer: &mut Tokenizer<Sink>) { 299 let msg = format_if!(tokenizer.opts.exact_errors, 300 "Invalid character reference", 301 "Invalid character reference &{}", self.name_buf()); 302 tokenizer.emit_error(msg); 303 } 304 unconsume_name(&mut self, input: &mut BufferQueue)305 fn unconsume_name(&mut self, input: &mut BufferQueue) { 306 input.push_front(self.name_buf_opt.take().unwrap()); 307 } 308 finish_named<Sink: TokenSink>(&mut self, tokenizer: &mut Tokenizer<Sink>, input: &mut BufferQueue, end_char: Option<char>) -> Status309 fn finish_named<Sink: TokenSink>(&mut self, 310 tokenizer: &mut Tokenizer<Sink>, 311 input: &mut BufferQueue, 312 end_char: Option<char>) -> Status { 313 match self.name_match { 314 None => { 315 match end_char { 316 Some(c) if is_ascii_alnum(c) => { 317 // Keep looking for a semicolon, to determine whether 318 // we emit a parse error. 319 self.state = BogusName; 320 return Progress; 321 } 322 323 // Check length because &; is not a parse error. 324 Some(';') if self.name_buf().len() > 1 325 => self.emit_name_error(tokenizer), 326 327 _ => (), 328 } 329 self.unconsume_name(input); 330 self.finish_none() 331 } 332 333 Some((c1, c2)) => { 334 // We have a complete match, but we may have consumed 335 // additional characters into self.name_buf. Usually 336 // at least one, but several in cases like 337 // 338 // ¬ => match for U+00AC 339 // ¬i => valid prefix for ¬in 340 // ¬it => can't continue match 341 342 let name_len = self.name_len; 343 assert!(name_len > 0); 344 let last_matched = self.name_buf()[name_len-1..].chars().next().unwrap(); 345 346 // There might not be a next character after the match, if 347 // we had a full match and then hit EOF. 348 let next_after = if name_len == self.name_buf().len() { 349 None 350 } else { 351 Some(self.name_buf()[name_len..].chars().next().unwrap()) 352 }; 353 354 // "If the character reference is being consumed as part of an 355 // attribute, and the last character matched is not a U+003B 356 // SEMICOLON character (;), and the next character is either a 357 // U+003D EQUALS SIGN character (=) or an alphanumeric ASCII 358 // character, then, for historical reasons, all the characters 359 // that were matched after the U+0026 AMPERSAND character (&) 360 // must be unconsumed, and nothing is returned. However, if 361 // this next character is in fact a U+003D EQUALS SIGN 362 // character (=), then this is a parse error" 363 364 let unconsume_all = match (self.addnl_allowed, last_matched, next_after) { 365 (_, ';', _) => false, 366 (Some(_), _, Some('=')) => { 367 tokenizer.emit_error(Borrowed("Equals sign after character reference in attribute")); 368 true 369 } 370 (Some(_), _, Some(c)) if is_ascii_alnum(c) => true, 371 _ => { 372 tokenizer.emit_error(Borrowed("Character reference does not end with semicolon")); 373 false 374 } 375 }; 376 377 if unconsume_all { 378 self.unconsume_name(input); 379 self.finish_none() 380 } else { 381 input.push_front(StrTendril::from_slice(&self.name_buf()[name_len..])); 382 self.result = Some(CharRef { 383 chars: [from_u32(c1).unwrap(), from_u32(c2).unwrap()], 384 num_chars: if c2 == 0 { 1 } else { 2 }, 385 }); 386 Done 387 } 388 } 389 } 390 } 391 do_bogus_name<Sink: TokenSink>( &mut self, tokenizer: &mut Tokenizer<Sink>, input: &mut BufferQueue) -> Status392 fn do_bogus_name<Sink: TokenSink>( 393 &mut self, 394 tokenizer: &mut Tokenizer<Sink>, 395 input: &mut BufferQueue) 396 -> Status { 397 let c = unwrap_or_return!(tokenizer.get_char(input), Stuck); 398 self.name_buf_mut().push_char(c); 399 match c { 400 _ if is_ascii_alnum(c) => return Progress, 401 ';' => self.emit_name_error(tokenizer), 402 _ => () 403 } 404 self.unconsume_name(input); 405 self.finish_none() 406 } 407 end_of_file<Sink: TokenSink>( &mut self, tokenizer: &mut Tokenizer<Sink>, input: &mut BufferQueue)408 pub fn end_of_file<Sink: TokenSink>( 409 &mut self, 410 tokenizer: &mut Tokenizer<Sink>, 411 input: &mut BufferQueue) { 412 while self.result.is_none() { 413 match self.state { 414 Begin => drop(self.finish_none()), 415 416 Numeric(_) if !self.seen_digit 417 => drop(self.unconsume_numeric(tokenizer, input)), 418 419 Numeric(_) | NumericSemicolon => { 420 tokenizer.emit_error(Borrowed("EOF in numeric character reference")); 421 self.finish_numeric(tokenizer); 422 } 423 424 Named => drop(self.finish_named(tokenizer, input, None)), 425 426 BogusName => { 427 self.unconsume_name(input); 428 self.finish_none(); 429 } 430 431 Octothorpe => { 432 input.push_front(StrTendril::from_slice("#")); 433 tokenizer.emit_error(Borrowed("EOF after '#' in character reference")); 434 self.finish_none(); 435 } 436 } 437 } 438 } 439 } 440