1 // This is a part of rust-encoding. 2 // Copyright (c) 2013-2015, Kang Seonghoon. 3 // See README.md and LICENSE.txt for details. 4 5 //! UTF-16. 6 7 use std::convert::Into; 8 use std::marker::PhantomData; 9 use util::as_char; 10 use types::*; 11 12 /// An implementation type for little endian. 13 /// 14 /// Can be used as a type parameter to `UTF16Encoding`, `UTF16Encoder` and `UTF16Decoder`. 15 #[derive(Clone, Copy)] 16 pub struct Little; 17 18 /// An implementation type for big endian. 19 /// 20 /// Can be used as a type parameter to `UTF16Encoding`, `UTF16Encoder` and `UTF16Decoder`. 21 #[derive(Clone, Copy)] 22 pub struct Big; 23 24 /// An internal trait used to customize UTF-16 implementations. 25 #[doc(hidden)] // XXX never intended to be used publicly, should be gone later 26 pub trait Endian: Clone + 'static { name() -> &'static str27 fn name() -> &'static str; whatwg_name() -> Option<&'static str>28 fn whatwg_name() -> Option<&'static str>; write_two_bytes(output: &mut ByteWriter, msb: u8, lsb: u8)29 fn write_two_bytes(output: &mut ByteWriter, msb: u8, lsb: u8); concat_two_bytes(lead: u16, trail: u8) -> u1630 fn concat_two_bytes(lead: u16, trail: u8) -> u16; 31 } 32 33 impl Endian for Little { name() -> &'static str34 fn name() -> &'static str { "utf-16le" } whatwg_name() -> Option<&'static str>35 fn whatwg_name() -> Option<&'static str> { Some("utf-16le") } write_two_bytes(output: &mut ByteWriter, msb: u8, lsb: u8)36 fn write_two_bytes(output: &mut ByteWriter, msb: u8, lsb: u8) { 37 output.write_byte(lsb); 38 output.write_byte(msb); 39 } concat_two_bytes(lead: u16, trail: u8) -> u1640 fn concat_two_bytes(lead: u16, trail: u8) -> u16 { 41 lead | ((trail as u16) << 8) 42 } 43 } 44 45 impl Endian for Big { name() -> &'static str46 fn name() -> &'static str { "utf-16be" } whatwg_name() -> Option<&'static str>47 fn whatwg_name() -> Option<&'static str> { Some("utf-16be") } write_two_bytes(output: &mut ByteWriter, msb: u8, lsb: u8)48 fn 
write_two_bytes(output: &mut ByteWriter, msb: u8, lsb: u8) { 49 output.write_byte(msb); 50 output.write_byte(lsb); 51 } concat_two_bytes(lead: u16, trail: u8) -> u1652 fn concat_two_bytes(lead: u16, trail: u8) -> u16 { 53 (lead << 8) | trail as u16 54 } 55 } 56 57 /** 58 * UTF-16 (UCS Transformation Format, 16-bit). 59 * 60 * This is a Unicode encoding where one codepoint may use 61 * 2 (up to U+FFFF) or 4 bytes (up to U+10FFFF) depending on its value. 62 * It uses a "surrogate" mechanism to encode non-BMP codepoints, 63 * which are represented as a pair of lower surrogate and upper surrogate characters. 64 * In this effect, surrogate characters (U+D800..DFFF) cannot appear alone 65 * and cannot be included in a valid Unicode string. 66 * 67 * ## Specialization 68 * 69 * This type is specialized with endianness type `E`, 70 * which should be either `Little` (little endian) or `Big` (big endian). 71 */ 72 #[derive(Clone, Copy)] 73 pub struct UTF16Encoding<E> { 74 _marker: PhantomData<E> 75 } 76 77 /// A type for UTF-16 in little endian. 78 pub type UTF16LEEncoding = UTF16Encoding<Little>; 79 /// A type for UTF-16 in big endian. 80 pub type UTF16BEEncoding = UTF16Encoding<Big>; 81 82 /// An instance for UTF-16 in little endian. 83 pub const UTF_16LE_ENCODING: UTF16LEEncoding = UTF16Encoding { _marker: PhantomData }; 84 /// An instance for UTF-16 in big endian. 85 pub const UTF_16BE_ENCODING: UTF16BEEncoding = UTF16Encoding { _marker: PhantomData }; 86 87 impl<E: Endian> Encoding for UTF16Encoding<E> { name(&self) -> &'static str88 fn name(&self) -> &'static str { <E as Endian>::name() } whatwg_name(&self) -> Option<&'static str>89 fn whatwg_name(&self) -> Option<&'static str> { <E as Endian>::whatwg_name() } raw_encoder(&self) -> Box<RawEncoder>90 fn raw_encoder(&self) -> Box<RawEncoder> { UTF16Encoder::<E>::new() } raw_decoder(&self) -> Box<RawDecoder>91 fn raw_decoder(&self) -> Box<RawDecoder> { UTF16Decoder::<E>::new() } 92 } 93 94 /** 95 * An encoder for UTF-16. 
96 * 97 * ## Specialization 98 * 99 * This type is specialized with endianness type `E`, 100 * which should be either `Little` (little endian) or `Big` (big endian). 101 */ 102 #[derive(Clone, Copy)] 103 pub struct UTF16Encoder<E> { 104 _marker: PhantomData<E> 105 } 106 107 impl<E: Endian> UTF16Encoder<E> { new() -> Box<RawEncoder>108 fn new() -> Box<RawEncoder> { 109 Box::new(UTF16Encoder::<E> { _marker: PhantomData }) 110 } 111 } 112 113 impl<E: Endian> RawEncoder for UTF16Encoder<E> { from_self(&self) -> Box<RawEncoder>114 fn from_self(&self) -> Box<RawEncoder> { UTF16Encoder::<E>::new() } 115 raw_feed(&mut self, input: &str, output: &mut ByteWriter) -> (usize, Option<CodecError>)116 fn raw_feed(&mut self, input: &str, output: &mut ByteWriter) -> (usize, Option<CodecError>) { 117 output.writer_hint(input.len() * 2); 118 119 let write_two_bytes = |output: &mut ByteWriter, msb: u8, lsb: u8| 120 <E as Endian>::write_two_bytes(output, msb, lsb); 121 122 for ch in input.chars() { 123 match ch { 124 '\u{0}'...'\u{d7ff}' | '\u{e000}'...'\u{ffff}' => { 125 let ch = ch as u32; 126 write_two_bytes(output, (ch >> 8) as u8, (ch & 0xff) as u8); 127 } 128 '\u{10000}'...'\u{10ffff}' => { 129 let ch = ch as u32 - 0x10000; 130 write_two_bytes(output, (0xd8 | (ch >> 18)) as u8, 131 ((ch >> 10) & 0xff) as u8); 132 write_two_bytes(output, (0xdc | ((ch >> 8) & 0x3)) as u8, 133 (ch & 0xff) as u8); 134 } 135 _ => unreachable!() // XXX Rust issue #12483, this is redundant 136 } 137 } 138 (input.len(), None) 139 } 140 raw_finish(&mut self, _output: &mut ByteWriter) -> Option<CodecError>141 fn raw_finish(&mut self, _output: &mut ByteWriter) -> Option<CodecError> { 142 None 143 } 144 } 145 146 /** 147 * A decoder for UTF-16. 148 * 149 * ## Specialization 150 * 151 * This type is specialized with endianness type `E`, 152 * which should be either `Little` (little endian) or `Big` (big endian). 
 */
pub struct UTF16Decoder<E> {
    // First byte of a 16-bit unit whose second byte has not arrived yet,
    // or the sentinel `0xffff` when no byte is pending.
    leadbyte: u16,
    // An upper surrogate (0xd800..0xdbff) still waiting for its lower surrogate,
    // or the sentinel `0xffff` when none is pending.
    leadsurrogate: u16,
    _marker: PhantomData<E>
}

impl<E: Endian> UTF16Decoder<E> {
    /// Creates a boxed decoder for the byte order `E` with nothing pending.
    pub fn new() -> Box<RawDecoder> {
        Box::new(UTF16Decoder::<E> { leadbyte: 0xffff, leadsurrogate: 0xffff,
                                     _marker: PhantomData })
    }
}

impl<E: Endian> RawDecoder for UTF16Decoder<E> {
    /// Returns a fresh decoder of the same byte order; pending state is not copied.
    fn from_self(&self) -> Box<RawDecoder> { UTF16Decoder::<E>::new() }

    /// Decodes `input` into `output`, keeping an incomplete trailing unit or a pending
    /// upper surrogate in `leadbyte`/`leadsurrogate` for the next call.
    ///
    /// Returns `(processed, error)`. `processed` is the offset in `input` just past the
    /// last unit that was fully turned into output; later bytes may be held in the
    /// decoder state. `error` is `Some` at the first lone or mismatched surrogate.
    ///
    /// NOTE(review): `upto` is computed relative to this `input` and becomes `-1` when the
    /// rejected upper surrogate is followed by a unit whose first byte came from an
    /// earlier call; confirm the exact contract against `CodecError` in `types`.
    fn raw_feed(&mut self, input: &[u8], output: &mut StringWriter) -> (usize, Option<CodecError>) {
        output.writer_hint(input.len() / 2); // when every codepoint is U+0000..007F

        let concat_two_bytes = |lead: u16, trail: u8|
            <E as Endian>::concat_two_bytes(lead, trail);

        // `i` is the read cursor; `processed` trails it and only advances once a
        // unit (or surrogate pair) has been written out.
        let mut i = 0;
        let mut processed = 0;
        let len = input.len();

        if i >= len { return (processed, None); }

        // Stage 1: a previous call left half of a 16-bit unit; complete it with the
        // first byte of this input.
        if self.leadbyte != 0xffff {
            let ch = concat_two_bytes(self.leadbyte, input[i]);
            i += 1;
            self.leadbyte = 0xffff;
            if self.leadsurrogate != 0xffff { // `ch` is lower surrogate
                let upper = self.leadsurrogate;
                self.leadsurrogate = 0xffff;
                match ch {
                    0xdc00...0xdfff => {
                        let ch = ((upper as u32 - 0xd800) << 10) + (ch as u32 - 0xdc00);
                        output.write_char(as_char(ch + 0x10000));
                        processed = i;
                    }
                    _ => {
                        // The pending upper surrogate was not followed by a lower one.
                        // `i` is 1 here, so `upto` is -1.
                        return (processed, Some(CodecError {
                            upto: i as isize - 2, cause: "invalid sequence".into()
                        }));
                    }
                }
            } else {
                match ch {
                    0xd800...0xdbff => {
                        self.leadsurrogate = ch;
                        // pass through: stage 2 below looks for the lower surrogate
                    }
                    0xdc00...0xdfff => {
                        // A lower surrogate with no upper surrogate before it.
                        return (processed, Some(CodecError {
                            upto: i as isize, cause: "invalid sequence".into()
                        }));
                    }
                    _ => {
                        output.write_char(as_char(ch as u32));
                        processed = i;
                    }
                }
            }
            if i >= len { return (processed, None); }
        }

        // Stage 2: an upper surrogate is pending (from an earlier call or from stage 1);
        // the next whole unit must be its lower surrogate.
        if self.leadsurrogate != 0xffff {
            i += 1;
            if i >= len {
                // Only one more byte is available: remember it and wait for more input.
                self.leadbyte = input[i-1] as u16;
                return (processed, None);
            }
            let upper = self.leadsurrogate;
            let ch = concat_two_bytes(input[i-1] as u16, input[i]);
            i += 1;
            match ch {
                0xdc00...0xdfff => {
                    let ch = ((upper as u32 - 0xd800) << 10) + (ch as u32 - 0xdc00);
                    output.write_char(as_char(ch + 0x10000));
                }
                _ => {
                    self.leadbyte = 0xffff;
                    self.leadsurrogate = 0xffff;
                    // `upto` points at the start of the unit just read, so that unit
                    // is not counted as part of the rejected sequence.
                    return (processed, Some(CodecError {
                        upto: i as isize - 2, cause: "invalid sequence".into()
                    }));
                }
            }
        }

        // Stage 3: no carried-over state remains; decode whole units from `input`.
        self.leadbyte = 0xffff;
        self.leadsurrogate = 0xffff;
        processed = i;
        while i < len {
            // From here until the trailing `i += 1`, `i` indexes the second byte of the unit.
            i += 1;
            if i >= len {
                // A single trailing byte: keep it for the next call.
                self.leadbyte = input[i-1] as u16;
                break;
            }
            let ch = concat_two_bytes(input[i-1] as u16, input[i]);
            match ch {
                0xd800...0xdbff => {
                    // Move on to the second byte of the following unit.
                    i += 2;
                    if i >= len {
                        // The lower surrogate is missing or half-present: store the upper
                        // surrogate, plus the first byte of the next unit if there is one.
                        self.leadsurrogate = ch;
                        if i-1 < len { self.leadbyte = input[i-1] as u16; }
                        break;
                    }
                    let ch2 = concat_two_bytes(input[i-1] as u16, input[i]);
                    match ch2 {
                        0xdc00...0xdfff => {
                            let ch = ((ch as u32 - 0xd800) << 10) + (ch2 as u32 - 0xdc00);
                            output.write_char(as_char(ch + 0x10000));
                        }
                        _ => {
                            // `upto` is the start of `ch2`: only the lone upper surrogate
                            // is reported as invalid.
                            return (processed, Some(CodecError {
                                upto: i as isize - 1, cause: "invalid sequence".into()
                            }));
                        }
                    }
                }
                0xdc00...0xdfff => {
                    // A lone lower surrogate; `upto` is the offset just past it.
                    return (processed, Some(CodecError {
                        upto: i as isize + 1, cause: "invalid sequence".into()
                    }));
                }
                _ => {
                    output.write_char(as_char(ch as u32));
                }
            }
            i += 1;
            processed = i;
        }
        (processed, None)
    }

    /// Ends the stream: clears any pending state and reports "incomplete sequence"
    /// if a half unit or an unpaired upper surrogate was still waiting.
    fn raw_finish(&mut self, _output: &mut StringWriter) -> Option<CodecError> {
        let leadbyte = self.leadbyte;
        let leadsurrogate = self.leadsurrogate;
        // Reset before returning so the decoder can be fed again after a finish.
        self.leadbyte = 0xffff;
        self.leadsurrogate = 0xffff;
        if leadbyte != 0xffff || leadsurrogate != 0xffff {
            Some(CodecError { upto: 0, cause: "incomplete sequence".into() })
        } else {
            None
        }
    }
}

#[cfg(test)]
mod tests {
    // Little endian and big endian are symmetric to each other, so there is no need to
    // test both. Big endian is easier to inspect, so only UTF_16BE is tested.
    //
    // NOTE(review): the `assert_feed_*!`/`assert_finish_*!` macros are defined outside this
    // file. Their arguments appear to be (codec, processed input, buffered/unprocessed
    // input, [remaining input for `_err`], expected output), with an optional explicit
    // `upto` after the codec -- confirm against the macro definitions.

    use super::UTF_16BE_ENCODING as UTF_16BE;
    use types::*;

    // BMP characters first, then supplementary-plane characters as surrogate pairs.
    #[test]
    fn test_encoder_valid() {
        let mut e = UTF_16BE.raw_encoder();
        assert_feed_ok!(e, "\u{0}\
                            \u{1}\u{02}\u{004}\u{0008}\
                            \u{10}\u{020}\u{0040}\u{80}\
                            \u{100}\u{0200}\u{400}\u{800}\
                            \u{1000}\u{2000}\u{4000}\u{8000}\
                            \u{ffff}", "",
                        [0x00, 0x00,
                         0x00, 0x01, 0x00, 0x02, 0x00, 0x04, 0x00, 0x08,
                         0x00, 0x10, 0x00, 0x20, 0x00, 0x40, 0x00, 0x80,
                         0x01, 0x00, 0x02, 0x00, 0x04, 0x00, 0x08, 0x00,
                         0x10, 0x00, 0x20, 0x00, 0x40, 0x00, 0x80, 0x00,
                         0xff, 0xff]);
        assert_feed_ok!(e, "\u{10000}\
                            \u{10001}\u{010002}\
                            \u{10004}\u{010008}\
                            \u{10010}\u{010020}\
                            \u{10040}\u{010080}\
                            \u{10100}\u{010200}\
                            \u{10400}\u{010800}\
                            \u{11000}\u{012000}\
                            \u{14000}\u{018000}\
                            \u{20000}\u{030000}\
                            \u{50000}\u{090000}\
                            \u{10FFFF}", "",
                        [0xd8, 0x00, 0xdc, 0x00,
                         0xd8, 0x00, 0xdc, 0x01, 0xd8, 0x00, 0xdc, 0x02,
                         0xd8, 0x00, 0xdc, 0x04, 0xd8, 0x00, 0xdc, 0x08,
                         0xd8, 0x00, 0xdc, 0x10, 0xd8, 0x00, 0xdc, 0x20,
                         0xd8, 0x00, 0xdc, 0x40, 0xd8, 0x00, 0xdc, 0x80,
                         0xd8, 0x00, 0xdd, 0x00, 0xd8, 0x00, 0xde, 0x00,
                         0xd8, 0x01, 0xdc, 0x00, 0xd8, 0x02, 0xdc, 0x00,
                         0xd8, 0x04, 0xdc, 0x00, 0xd8, 0x08, 0xdc, 0x00,
                         0xd8, 0x10, 0xdc, 0x00, 0xd8, 0x20, 0xdc, 0x00,
                         0xd8, 0x40, 0xdc, 0x00, 0xd8, 0x80, 0xdc, 0x00,
                         0xd9, 0x00, 0xdc, 0x00, 0xda, 0x00, 0xdc, 0x00,
                         0xdb, 0xff, 0xdf, 0xff]);
        assert_finish_ok!(e, []);
    }

    // The mirror image of `test_encoder_valid`: the same byte sequences decode back
    // to the same strings.
    #[test]
    fn test_decoder_valid() {
        let mut d = UTF_16BE.raw_decoder();
        assert_feed_ok!(d, [0x00, 0x00,
                            0x00, 0x01, 0x00, 0x02, 0x00, 0x04, 0x00, 0x08,
                            0x00, 0x10, 0x00, 0x20, 0x00, 0x40, 0x00, 0x80,
                            0x01, 0x00, 0x02, 0x00, 0x04, 0x00, 0x08, 0x00,
                            0x10, 0x00, 0x20, 0x00, 0x40, 0x00, 0x80, 0x00,
                            0xff, 0xff], [],
                        "\u{0}\
                         \u{1}\u{02}\u{004}\u{0008}\
                         \u{10}\u{020}\u{0040}\u{80}\
                         \u{100}\u{0200}\u{400}\u{800}\
                         \u{1000}\u{2000}\u{4000}\u{8000}\
                         \u{ffff}");
        assert_feed_ok!(d, [0xd8, 0x00, 0xdc, 0x00,
                            0xd8, 0x00, 0xdc, 0x01, 0xd8, 0x00, 0xdc, 0x02,
                            0xd8, 0x00, 0xdc, 0x04, 0xd8, 0x00, 0xdc, 0x08,
                            0xd8, 0x00, 0xdc, 0x10, 0xd8, 0x00, 0xdc, 0x20,
                            0xd8, 0x00, 0xdc, 0x40, 0xd8, 0x00, 0xdc, 0x80,
                            0xd8, 0x00, 0xdd, 0x00, 0xd8, 0x00, 0xde, 0x00,
                            0xd8, 0x01, 0xdc, 0x00, 0xd8, 0x02, 0xdc, 0x00,
                            0xd8, 0x04, 0xdc, 0x00, 0xd8, 0x08, 0xdc, 0x00,
                            0xd8, 0x10, 0xdc, 0x00, 0xd8, 0x20, 0xdc, 0x00,
                            0xd8, 0x40, 0xdc, 0x00, 0xd8, 0x80, 0xdc, 0x00,
                            0xd9, 0x00, 0xdc, 0x00, 0xda, 0x00, 0xdc, 0x00,
                            0xdb, 0xff, 0xdf, 0xff], [],
                        "\u{10000}\
                         \u{10001}\u{010002}\
                         \u{10004}\u{010008}\
                         \u{10010}\u{010020}\
                         \u{10040}\u{010080}\
                         \u{10100}\u{010200}\
                         \u{10400}\u{010800}\
                         \u{11000}\u{012000}\
                         \u{14000}\u{018000}\
                         \u{20000}\u{030000}\
                         \u{50000}\u{090000}\
                         \u{10FFFF}");
        assert_finish_ok!(d, "");
    }

    // BMP units split across feeds at byte granularity.
    #[test]
    fn test_decoder_valid_partial_bmp() {
        let mut d = UTF_16BE.raw_decoder();
        assert_feed_ok!(d, [], [0x12], "");
        assert_feed_ok!(d, [0x34], [], "\u{1234}");
        assert_feed_ok!(d, [], [0x56], "");
        assert_feed_ok!(d, [0x78], [], "\u{5678}");
        assert_finish_ok!(d, "");

        let mut d = UTF_16BE.raw_decoder();
        assert_feed_ok!(d, [], [0x12], "");
        assert_feed_ok!(d, [0x34], [0x56], "\u{1234}");
        assert_feed_ok!(d, [0x78, 0xab, 0xcd], [], "\u{5678}\u{abcd}");
        assert_finish_ok!(d, "");
    }

    // Surrogate pairs split across feeds after one, two and three bytes.
    #[test]
    fn test_decoder_valid_partial_non_bmp() {
        let mut d = UTF_16BE.raw_decoder();
        assert_feed_ok!(d, [], [0xd8], "");
        assert_feed_ok!(d, [], [0x08], "");
        assert_feed_ok!(d, [], [0xdf], "");
        assert_feed_ok!(d, [0x45], [0xd9], "\u{12345}");
        assert_feed_ok!(d, [], [0x5e], "");
        assert_feed_ok!(d, [], [0xdc], "");
        assert_feed_ok!(d, [0x90], [], "\u{67890}");
        assert_finish_ok!(d, "");

        let mut d = UTF_16BE.raw_decoder();
        assert_feed_ok!(d, [], [0xd8], "");
        assert_feed_ok!(d, [], [0x08, 0xdf], "");
        assert_feed_ok!(d, [0x45], [0xd9, 0x5e], "\u{12345}");
        assert_feed_ok!(d, [0xdc, 0x90], [], "\u{67890}");
        assert_finish_ok!(d, "");

        let mut d = UTF_16BE.raw_decoder();
        assert_feed_ok!(d, [], [0xd8, 0x08, 0xdf], "");
        assert_feed_ok!(d, [0x45], [0xd9, 0x5e, 0xdc], "\u{12345}");
        assert_feed_ok!(d, [0x90], [], "\u{67890}");
        assert_finish_ok!(d, "");
    }

    // Finishing with one to three buffered bytes must report an error.
    #[test]
    fn test_decoder_invalid_partial() {
        let mut d = UTF_16BE.raw_decoder();
        assert_feed_ok!(d, [], [0x12], "");
        assert_finish_err!(d, "");

        let mut d = UTF_16BE.raw_decoder();
        assert_feed_ok!(d, [], [0xd8], "");
        assert_finish_err!(d, "");

        let mut d = UTF_16BE.raw_decoder();
        assert_feed_ok!(d, [], [0xd8, 0x08], "");
        assert_finish_err!(d, "");

        let mut d = UTF_16BE.raw_decoder();
        assert_feed_ok!(d, [], [0xd8, 0x08, 0xdf], "");
        assert_finish_err!(d, "");
    }

    // An upper surrogate followed by a BMP unit, by another upper surrogate, or by the
    // end of the stream; both ends of the upper surrogate range are covered.
    #[test]
    fn test_decoder_invalid_lone_upper_surrogate() {
        let mut d = UTF_16BE.raw_decoder();
        assert_feed_ok!(d, [], [0xd8, 0x00], "");
        assert_feed_err!(d, [], [], [0x12, 0x34], "");
        assert_feed_err!(d, [], [0xd8, 0x00], [0x56, 0x78], "");
        assert_feed_ok!(d, [], [0xd8, 0x00], "");
        assert_feed_err!(d, [], [], [0xd8, 0x00], "");
        assert_feed_ok!(d, [], [0xd8, 0x00], "");
        assert_finish_err!(d, "");

        let mut d = UTF_16BE.raw_decoder();
        assert_feed_ok!(d, [], [0xdb, 0xff], "");
        assert_feed_err!(d, [], [], [0x12, 0x34], "");
        assert_feed_err!(d, [], [0xdb, 0xff], [0x56, 0x78], "");
        assert_feed_ok!(d, [], [0xdb, 0xff], "");
        assert_feed_err!(d, [], [], [0xdb, 0xff], "");
        assert_feed_ok!(d, [], [0xdb, 0xff], "");
        assert_finish_err!(d, "");
    }

    // As above, with the input split in the middle of a unit. The explicit `-1` cases
    // are the ones where the first byte of the unit after the surrogate came from the
    // previous feed.
    #[test]
    fn test_decoder_invalid_lone_upper_surrogate_partial() {
        let mut d = UTF_16BE.raw_decoder();
        assert_feed_ok!(d, [], [0xd8], "");
        assert_feed_err!(d, [], [0x00], [0x12, 0x34], "");
        assert_feed_ok!(d, [], [0xd8, 0x00, 0x56], "");
        assert_feed_err!(d, -1, [], [], [0x56, 0x78], "");
        assert_feed_ok!(d, [], [0xd8], "");
        assert_feed_err!(d, [], [0x00], [0xd8, 0x00], "");
        assert_feed_ok!(d, [], [0xd8, 0x00, 0xdb], "");
        assert_feed_err!(d, -1, [], [], [0xdb, 0xff], "");
        assert_feed_ok!(d, [], [0xd8], "");
        assert_finish_err!(d, "");

        let mut d = UTF_16BE.raw_decoder();
        assert_feed_ok!(d, [], [0xdb], "");
        assert_feed_err!(d, [], [0xff], [0x12, 0x34], "");
        assert_feed_ok!(d, [], [0xdb, 0xff, 0x56], "");
        assert_feed_err!(d, -1, [], [], [0x56, 0x78], "");
        assert_feed_ok!(d, [], [0xdb], "");
        assert_feed_err!(d, [], [0xff], [0xdb, 0xff], "");
        assert_feed_ok!(d, [], [0xdb, 0xff, 0xd8], "");
        assert_feed_err!(d, -1, [], [], [0xd8, 0x00], "");
        assert_feed_ok!(d, [], [0xdb], "");
        assert_finish_err!(d, "");
    }

    // A lower surrogate with no preceding upper surrogate; both ends of the range.
    #[test]
    fn test_decoder_invalid_lone_lower_surrogate() {
        let mut d = UTF_16BE.raw_decoder();
        assert_feed_err!(d, [], [0xdc, 0x00], [], "");
        assert_feed_err!(d, [0x12, 0x34], [0xdc, 0x00], [0x56, 0x78], "\u{1234}");
        assert_finish_ok!(d, "");

        let mut d = UTF_16BE.raw_decoder();
        assert_feed_err!(d, [], [0xdf, 0xff], [], "");
        assert_feed_err!(d, [0x12, 0x34], [0xdf, 0xff], [0x56, 0x78], "\u{1234}");
        assert_finish_ok!(d, "");
    }

    // As above, with the lone lower surrogate split across two feeds.
    #[test]
    fn test_decoder_invalid_lone_lower_surrogate_partial() {
        let mut d = UTF_16BE.raw_decoder();
        assert_feed_ok!(d, [], [0xdc], "");
        assert_feed_err!(d, [], [0x00], [], "");
        assert_feed_ok!(d, [0x12, 0x34], [0xdc], "\u{1234}");
        assert_feed_err!(d, [], [0x00], [0x56, 0x78], "");
        assert_finish_ok!(d, "");

        // The same decoder is reused here after a successful finish.
        assert_feed_ok!(d, [], [0xdf], "");
        assert_feed_err!(d, [], [0xff], [], "");
        assert_feed_ok!(d, [0x12, 0x34], [0xdf], "\u{1234}");
        assert_feed_err!(d, [], [0xff], [0x56, 0x78], "");
        assert_finish_ok!(d, "");
    }

    // A single dangling byte at the end of the stream.
    #[test]
    fn test_decoder_invalid_one_byte_before_finish() {
        let mut d = UTF_16BE.raw_decoder();
        assert_feed_ok!(d, [], [0x12], "");
        assert_finish_err!(d, "");

        let mut d = UTF_16BE.raw_decoder();
        assert_feed_ok!(d, [0x12, 0x34], [0x56], "\u{1234}");
        assert_finish_err!(d, "");
    }

    // An upper surrogate plus half of the next unit at the end of the stream.
    #[test]
    fn test_decoder_invalid_three_bytes_before_finish() {
        let mut d = UTF_16BE.raw_decoder();
        assert_feed_ok!(d, [], [0xd8, 0x00, 0xdc], "");
        assert_finish_err!(d, "");

        let mut d = UTF_16BE.raw_decoder();
        assert_feed_ok!(d, [0x12, 0x34], [0xd8, 0x00, 0xdc], "\u{1234}");
        assert_finish_err!(d, "");
    }

    // The same three dangling bytes, delivered over several feeds.
    #[test]
    fn test_decoder_invalid_three_bytes_before_finish_partial() {
        let mut d = UTF_16BE.raw_decoder();
        assert_feed_ok!(d, [], [0xd8], "");
        assert_feed_ok!(d, [], [0x00], "");
        assert_feed_ok!(d, [], [0xdc], "");
        assert_finish_err!(d, "");

        let mut d = UTF_16BE.raw_decoder();
        assert_feed_ok!(d, [0x12, 0x34], [0xd8], "\u{1234}");
        assert_feed_ok!(d, [], [0x00, 0xdc], "");
        assert_finish_err!(d, "");

        let mut d = UTF_16BE.raw_decoder();
        assert_feed_ok!(d, [0x12, 0x34], [0xd8, 0x00], "\u{1234}");
        assert_feed_ok!(d, [], [0xdc], "");
        assert_finish_err!(d, "");
    }

    // A failed finish must clear the pending state so that the next feed starts clean.
    #[test]
    fn test_decoder_feed_after_finish() {
        let mut d = UTF_16BE.raw_decoder();
        assert_feed_ok!(d, [0x12, 0x34], [0x12], "\u{1234}");
        assert_finish_err!(d, "");
        assert_feed_ok!(d, [0x12, 0x34], [], "\u{1234}");
        assert_finish_ok!(d, "");

        let mut d = UTF_16BE.raw_decoder();
        assert_feed_ok!(d, [0xd8, 0x08, 0xdf, 0x45], [0xd8, 0x08, 0xdf], "\u{12345}");
        assert_finish_err!(d, "");
        assert_feed_ok!(d, [0xd8, 0x08, 0xdf, 0x45], [0xd8, 0x08], "\u{12345}");
        assert_finish_err!(d, "");
        assert_feed_ok!(d, [0xd8, 0x08, 0xdf, 0x45], [0xd8], "\u{12345}");
        assert_finish_err!(d, "");
        assert_feed_ok!(d, [0xd8, 0x08, 0xdf, 0x45], [], "\u{12345}");
        assert_finish_ok!(d, "");
    }
}