1 // Copyright Mozilla Foundation. See the COPYRIGHT 2 // file at the top-level directory of this distribution. 3 // 4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or 5 // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license 6 // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your 7 // option. This file may not be copied, modified, or distributed 8 // except according to those terms. 9 10 use super::*; 11 use crate::handles::*; 12 use crate::variant::*; 13 14 pub struct Utf16Decoder { 15 lead_surrogate: u16, // If non-zero and pending_bmp == false, a pending lead surrogate 16 lead_byte: Option<u8>, 17 be: bool, 18 pending_bmp: bool, // if true, lead_surrogate is actually pending BMP 19 } 20 21 impl Utf16Decoder { new(big_endian: bool) -> VariantDecoder22 pub fn new(big_endian: bool) -> VariantDecoder { 23 VariantDecoder::Utf16(Utf16Decoder { 24 lead_surrogate: 0, 25 lead_byte: None, 26 be: big_endian, 27 pending_bmp: false, 28 }) 29 } 30 additional_from_state(&self) -> usize31 pub fn additional_from_state(&self) -> usize { 32 1 + if self.lead_byte.is_some() { 1 } else { 0 } 33 + if self.lead_surrogate == 0 { 0 } else { 2 } 34 } 35 max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize>36 pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> { 37 checked_add( 38 1, 39 checked_div(byte_length.checked_add(self.additional_from_state()), 2), 40 ) 41 } 42 max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize>43 pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> { 44 checked_add( 45 1, 46 checked_mul( 47 3, 48 checked_div(byte_length.checked_add(self.additional_from_state()), 2), 49 ), 50 ) 51 } 52 max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize>53 pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> { 54 checked_add( 55 1, 56 checked_mul( 57 3, 58 checked_div(byte_length.checked_add(self.additional_from_state()), 2), 59 ), 60 ) 61 } 62 63 decoder_functions!( 64 { 65 if self.pending_bmp { 66 match dest.check_space_bmp() { 67 Space::Full(_) => { 68 return (DecoderResult::OutputFull, 0, 0); 69 } 70 Space::Available(destination_handle) => { 71 destination_handle.write_bmp(self.lead_surrogate); 72 self.pending_bmp = false; 73 self.lead_surrogate = 0; 74 } 75 } 76 } 77 }, 78 { 79 // This is the fast path. The rest runs only at the 80 // start and end for partial sequences. 81 if self.lead_byte.is_none() && self.lead_surrogate == 0 { 82 if let Some((read, written)) = if self.be { 83 dest.copy_utf16_from::<BigEndian>(&mut source) 84 } else { 85 dest.copy_utf16_from::<LittleEndian>(&mut source) 86 } { 87 return (DecoderResult::Malformed(2, 0), read, written); 88 } 89 } 90 }, 91 { 92 debug_assert!(!self.pending_bmp); 93 if self.lead_surrogate != 0 || self.lead_byte.is_some() { 94 // We need to check space without intent to write in order to 95 // make sure that there is space for the replacement character. 96 match dest.check_space_bmp() { 97 Space::Full(_) => { 98 return (DecoderResult::OutputFull, 0, 0); 99 } 100 Space::Available(_) => { 101 if self.lead_surrogate != 0 { 102 self.lead_surrogate = 0; 103 match self.lead_byte { 104 None => { 105 return ( 106 DecoderResult::Malformed(2, 0), 107 src_consumed, 108 dest.written(), 109 ); 110 } 111 Some(_) => { 112 self.lead_byte = None; 113 return ( 114 DecoderResult::Malformed(3, 0), 115 src_consumed, 116 dest.written(), 117 ); 118 } 119 } 120 } 121 debug_assert!(self.lead_byte.is_some()); 122 self.lead_byte = None; 123 return (DecoderResult::Malformed(1, 0), src_consumed, dest.written()); 124 } 125 } 126 } 127 }, 128 { 129 match self.lead_byte { 130 None => { 131 self.lead_byte = Some(b); 132 continue; 133 } 134 Some(lead) => { 135 self.lead_byte = None; 136 let code_unit = if self.be { 137 u16::from(lead) << 8 | u16::from(b) 138 } else { 139 u16::from(b) << 8 | u16::from(lead) 140 }; 141 let high_bits = code_unit & 0xFC00u16; 142 if high_bits == 0xD800u16 { 143 // high surrogate 144 if self.lead_surrogate != 0 { 145 // The previous high surrogate was in 146 // error and this one becomes the new 147 // pending one. 148 self.lead_surrogate = code_unit as u16; 149 return ( 150 DecoderResult::Malformed(2, 2), 151 unread_handle.consumed(), 152 destination_handle.written(), 153 ); 154 } 155 self.lead_surrogate = code_unit; 156 continue; 157 } 158 if high_bits == 0xDC00u16 { 159 // low surrogate 160 if self.lead_surrogate == 0 { 161 return ( 162 DecoderResult::Malformed(2, 0), 163 unread_handle.consumed(), 164 destination_handle.written(), 165 ); 166 } 167 destination_handle.write_surrogate_pair(self.lead_surrogate, code_unit); 168 self.lead_surrogate = 0; 169 continue; 170 } 171 // bmp 172 if self.lead_surrogate != 0 { 173 // The previous high surrogate was in 174 // error and this code unit becomes a 175 // pending BMP character. 176 self.lead_surrogate = code_unit; 177 self.pending_bmp = true; 178 return ( 179 DecoderResult::Malformed(2, 2), 180 unread_handle.consumed(), 181 destination_handle.written(), 182 ); 183 } 184 destination_handle.write_bmp(code_unit); 185 continue; 186 } 187 } 188 }, 189 self, 190 src_consumed, 191 dest, 192 source, 193 b, 194 destination_handle, 195 unread_handle, 196 check_space_astral 197 ); 198 } 199 200 // Any copyright to the test code below this comment is dedicated to the 201 // Public Domain. http://creativecommons.org/publicdomain/zero/1.0/ 202 203 #[cfg(test)] 204 mod tests { 205 use super::super::testing::*; 206 use super::super::*; 207 decode_utf_16le(bytes: &[u8], expect: &str)208 fn decode_utf_16le(bytes: &[u8], expect: &str) { 209 decode_without_padding(UTF_16LE, bytes, expect); 210 } 211 decode_utf_16be(bytes: &[u8], expect: &str)212 fn decode_utf_16be(bytes: &[u8], expect: &str) { 213 decode_without_padding(UTF_16BE, bytes, expect); 214 } 215 encode_utf_16le(string: &str, expect: &[u8])216 fn encode_utf_16le(string: &str, expect: &[u8]) { 217 encode(UTF_16LE, string, expect); 218 } 219 encode_utf_16be(string: &str, expect: &[u8])220 fn encode_utf_16be(string: &str, expect: &[u8]) { 221 encode(UTF_16BE, string, expect); 222 } 223 224 #[test] test_utf_16_decode()225 fn test_utf_16_decode() { 226 decode_utf_16le(b"", ""); 227 decode_utf_16be(b"", ""); 228 229 decode_utf_16le(b"\x61\x00\x62\x00", "\u{0061}\u{0062}"); 230 decode_utf_16be(b"\x00\x61\x00\x62", "\u{0061}\u{0062}"); 231 232 decode_utf_16le(b"\xFE\xFF\x00\x61\x00\x62", "\u{0061}\u{0062}"); 233 decode_utf_16be(b"\xFF\xFE\x61\x00\x62\x00", "\u{0061}\u{0062}"); 234 235 decode_utf_16le(b"\x61\x00\x62", "\u{0061}\u{FFFD}"); 236 decode_utf_16be(b"\x00\x61\x00", "\u{0061}\u{FFFD}"); 237 238 decode_utf_16le(b"\x3D\xD8\xA9", "\u{FFFD}"); 239 decode_utf_16be(b"\xD8\x3D\xDC", "\u{FFFD}"); 240 241 decode_utf_16le(b"\x3D\xD8\xA9\xDC\x03\x26", "\u{1F4A9}\u{2603}"); 242 decode_utf_16be(b"\xD8\x3D\xDC\xA9\x26\x03", "\u{1F4A9}\u{2603}"); 243 244 decode_utf_16le(b"\xA9\xDC\x03\x26", "\u{FFFD}\u{2603}"); 245 decode_utf_16be(b"\xDC\xA9\x26\x03", "\u{FFFD}\u{2603}"); 246 247 decode_utf_16le(b"\x3D\xD8\x03\x26", "\u{FFFD}\u{2603}"); 248 decode_utf_16be(b"\xD8\x3D\x26\x03", "\u{FFFD}\u{2603}"); 249 250 // The \xFF makes sure that the parts before and after have different alignment 251 let long_le = b"\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8\xA9\xDC\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8\x00\x00\x00\x00\x00\x00\x00\x00\xA9\xDC\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8\xFF\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8\xA9\xDC\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8\x00\x00\x00\x00\x00\x00\x00\x00\xA9\xDC\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x3D\xD8"; 252 let long_be = b"\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D\xDC\xA9\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D\x00\x00\x00\x00\x00\x00\x00\x00\xDC\xA9\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D\xFF\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D\xDC\xA9\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D\x00\x00\x00\x00\x00\x00\x00\x00\xDC\xA9\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xD8\x3D"; 253 let long_expect = "\x00\x00\x00\x00\u{1F4A9}\x00\x00\x00\x00\u{FFFD}\x00\x00\x00\x00\u{FFFD}\x00\x00\x00\x00\x00\x00\x00\x00\u{FFFD}"; 254 decode_utf_16le(&long_le[..long_le.len() / 2], long_expect); 255 decode_utf_16be(&long_be[..long_be.len() / 2], long_expect); 256 decode_utf_16le(&long_le[long_le.len() / 2 + 1..], long_expect); 257 decode_utf_16be(&long_be[long_be.len() / 2 + 1..], long_expect); 258 } 259 260 #[test] test_utf_16_encode()261 fn test_utf_16_encode() { 262 // Empty 263 encode_utf_16be("", b""); 264 encode_utf_16le("", b""); 265 266 // Encodes as UTF-8 267 assert_eq!(UTF_16LE.new_encoder().encoding(), UTF_8); 268 assert_eq!(UTF_16BE.new_encoder().encoding(), UTF_8); 269 encode_utf_16le("\u{1F4A9}\u{2603}", "\u{1F4A9}\u{2603}".as_bytes()); 270 encode_utf_16be("\u{1F4A9}\u{2603}", "\u{1F4A9}\u{2603}".as_bytes()); 271 } 272 273 #[test] test_utf_16be_decode_one_by_one()274 fn test_utf_16be_decode_one_by_one() { 275 let input = b"\x00\x61\x00\xE4\x26\x03\xD8\x3D\xDC\xA9"; 276 let mut output = [0u16; 20]; 277 let mut decoder = UTF_16BE.new_decoder(); 278 for b in input.chunks(1) { 279 assert_eq!(b.len(), 1); 280 let needed = decoder.max_utf16_buffer_length(b.len()).unwrap(); 281 let (result, read, _, had_errors) = 282 decoder.decode_to_utf16(b, &mut output[..needed], false); 283 assert_eq!(result, CoderResult::InputEmpty); 284 assert_eq!(read, 1); 285 assert!(!had_errors); 286 } 287 } 288 289 #[test] test_utf_16le_decode_one_by_one()290 fn test_utf_16le_decode_one_by_one() { 291 let input = b"\x61\x00\xE4\x00\x03\x26\x3D\xD8\xA9\xDC"; 292 let mut output = [0u16; 20]; 293 let mut decoder = UTF_16LE.new_decoder(); 294 for b in input.chunks(1) { 295 assert_eq!(b.len(), 1); 296 let needed = decoder.max_utf16_buffer_length(b.len()).unwrap(); 297 let (result, read, _, had_errors) = 298 decoder.decode_to_utf16(b, &mut output[..needed], false); 299 assert_eq!(result, CoderResult::InputEmpty); 300 assert_eq!(read, 1); 301 assert!(!had_errors); 302 } 303 } 304 305 #[test] test_utf_16be_decode_three_at_a_time()306 fn test_utf_16be_decode_three_at_a_time() { 307 let input = b"\x00\xE4\x26\x03\xD8\x3D\xDC\xA9\x00\x61\x00\xE4"; 308 let mut output = [0u16; 20]; 309 let mut decoder = UTF_16BE.new_decoder(); 310 for b in input.chunks(3) { 311 assert_eq!(b.len(), 3); 312 let needed = decoder.max_utf16_buffer_length(b.len()).unwrap(); 313 let (result, read, _, had_errors) = 314 decoder.decode_to_utf16(b, &mut output[..needed], false); 315 assert_eq!(result, CoderResult::InputEmpty); 316 assert_eq!(read, b.len()); 317 assert!(!had_errors); 318 } 319 } 320 321 #[test] test_utf_16le_decode_three_at_a_time()322 fn test_utf_16le_decode_three_at_a_time() { 323 let input = b"\xE4\x00\x03\x26\x3D\xD8\xA9\xDC\x61\x00\xE4\x00"; 324 let mut output = [0u16; 20]; 325 let mut decoder = UTF_16LE.new_decoder(); 326 for b in input.chunks(3) { 327 assert_eq!(b.len(), 3); 328 let needed = decoder.max_utf16_buffer_length(b.len()).unwrap(); 329 let (result, read, _, had_errors) = 330 decoder.decode_to_utf16(b, &mut output[..needed], false); 331 assert_eq!(result, CoderResult::InputEmpty); 332 assert_eq!(read, b.len()); 333 assert!(!had_errors); 334 } 335 } 336 337 #[test] test_utf_16le_decode_bom_prefixed_split_byte_pair()338 fn test_utf_16le_decode_bom_prefixed_split_byte_pair() { 339 let mut output = [0u16; 20]; 340 let mut decoder = UTF_16LE.new_decoder(); 341 { 342 let needed = decoder.max_utf16_buffer_length(1).unwrap(); 343 let (result, read, written, had_errors) = 344 decoder.decode_to_utf16(b"\xFF", &mut output[..needed], false); 345 assert_eq!(result, CoderResult::InputEmpty); 346 assert_eq!(read, 1); 347 assert_eq!(written, 0); 348 assert!(!had_errors); 349 } 350 { 351 let needed = decoder.max_utf16_buffer_length(1).unwrap(); 352 let (result, read, written, had_errors) = 353 decoder.decode_to_utf16(b"\xFD", &mut output[..needed], true); 354 assert_eq!(result, CoderResult::InputEmpty); 355 assert_eq!(read, 1); 356 assert_eq!(written, 1); 357 assert!(!had_errors); 358 assert_eq!(output[0], 0xFDFF); 359 } 360 } 361 362 #[test] test_utf_16be_decode_bom_prefixed_split_byte_pair()363 fn test_utf_16be_decode_bom_prefixed_split_byte_pair() { 364 let mut output = [0u16; 20]; 365 let mut decoder = UTF_16BE.new_decoder(); 366 { 367 let needed = decoder.max_utf16_buffer_length(1).unwrap(); 368 let (result, read, written, had_errors) = 369 decoder.decode_to_utf16(b"\xFE", &mut output[..needed], false); 370 assert_eq!(result, CoderResult::InputEmpty); 371 assert_eq!(read, 1); 372 assert_eq!(written, 0); 373 assert!(!had_errors); 374 } 375 { 376 let needed = decoder.max_utf16_buffer_length(1).unwrap(); 377 let (result, read, written, had_errors) = 378 decoder.decode_to_utf16(b"\xFD", &mut output[..needed], true); 379 assert_eq!(result, CoderResult::InputEmpty); 380 assert_eq!(read, 1); 381 assert_eq!(written, 1); 382 assert!(!had_errors); 383 assert_eq!(output[0], 0xFEFD); 384 } 385 } 386 387 #[test] test_utf_16le_decode_bom_prefix()388 fn test_utf_16le_decode_bom_prefix() { 389 let mut output = [0u16; 20]; 390 let mut decoder = UTF_16LE.new_decoder(); 391 { 392 let needed = decoder.max_utf16_buffer_length(1).unwrap(); 393 let (result, read, written, had_errors) = 394 decoder.decode_to_utf16(b"\xFF", &mut output[..needed], true); 395 assert_eq!(result, CoderResult::InputEmpty); 396 assert_eq!(read, 1); 397 assert_eq!(written, 1); 398 assert!(had_errors); 399 assert_eq!(output[0], 0xFFFD); 400 } 401 } 402 403 #[test] test_utf_16be_decode_bom_prefix()404 fn test_utf_16be_decode_bom_prefix() { 405 let mut output = [0u16; 20]; 406 let mut decoder = UTF_16BE.new_decoder(); 407 { 408 let needed = decoder.max_utf16_buffer_length(1).unwrap(); 409 let (result, read, written, had_errors) = 410 decoder.decode_to_utf16(b"\xFE", &mut output[..needed], true); 411 assert_eq!(result, CoderResult::InputEmpty); 412 assert_eq!(read, 1); 413 assert_eq!(written, 1); 414 assert!(had_errors); 415 assert_eq!(output[0], 0xFFFD); 416 } 417 } 418 419 #[test] test_utf_16le_decode_near_end()420 fn test_utf_16le_decode_near_end() { 421 let mut output = [0u8; 4]; 422 let mut decoder = UTF_16LE.new_decoder(); 423 { 424 let (result, read, written, had_errors) = 425 decoder.decode_to_utf8(&[0x03], &mut output[..], false); 426 assert_eq!(result, CoderResult::InputEmpty); 427 assert_eq!(read, 1); 428 assert_eq!(written, 0); 429 assert!(!had_errors); 430 assert_eq!(output[0], 0x0); 431 } 432 { 433 let (result, read, written, had_errors) = 434 decoder.decode_to_utf8(&[0x26, 0x03, 0x26], &mut output[..], false); 435 assert_eq!(result, CoderResult::OutputFull); 436 assert_eq!(read, 1); 437 assert_eq!(written, 3); 438 assert!(!had_errors); 439 assert_eq!(output[0], 0xE2); 440 assert_eq!(output[1], 0x98); 441 assert_eq!(output[2], 0x83); 442 assert_eq!(output[3], 0x00); 443 } 444 } 445 446 #[test] test_utf_16be_decode_near_end()447 fn test_utf_16be_decode_near_end() { 448 let mut output = [0u8; 4]; 449 let mut decoder = UTF_16BE.new_decoder(); 450 { 451 let (result, read, written, had_errors) = 452 decoder.decode_to_utf8(&[0x26], &mut output[..], false); 453 assert_eq!(result, CoderResult::InputEmpty); 454 assert_eq!(read, 1); 455 assert_eq!(written, 0); 456 assert!(!had_errors); 457 assert_eq!(output[0], 0x0); 458 } 459 { 460 let (result, read, written, had_errors) = 461 decoder.decode_to_utf8(&[0x03, 0x26, 0x03], &mut output[..], false); 462 assert_eq!(result, CoderResult::OutputFull); 463 assert_eq!(read, 1); 464 assert_eq!(written, 3); 465 assert!(!had_errors); 466 assert_eq!(output[0], 0xE2); 467 assert_eq!(output[1], 0x98); 468 assert_eq!(output[2], 0x83); 469 assert_eq!(output[3], 0x00); 470 } 471 } 472 } 473