1 // Copyright 2015-2016 Mozilla Foundation. See the COPYRIGHT 2 // file at the top-level directory of this distribution. 3 // 4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or 5 // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license 6 // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your 7 // option. This file may not be copied, modified, or distributed 8 // except according to those terms. 9 10 use super::*; 11 use data::*; 12 use handles::*; 13 use variant::*; 14 // Rust 1.14.0 requires the following despite the asterisk above. 15 use super::in_inclusive_range32; 16 17 pub struct Big5Decoder { 18 lead: Option<u8>, 19 } 20 21 impl Big5Decoder { new() -> VariantDecoder22 pub fn new() -> VariantDecoder { 23 VariantDecoder::Big5(Big5Decoder { lead: None }) 24 } 25 in_neutral_state(&self) -> bool26 pub fn in_neutral_state(&self) -> bool { 27 self.lead.is_none() 28 } 29 plus_one_if_lead(&self, byte_length: usize) -> Option<usize>30 fn plus_one_if_lead(&self, byte_length: usize) -> Option<usize> { 31 byte_length.checked_add(match self.lead { 32 None => 0, 33 Some(_) => 1, 34 }) 35 } 36 max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize>37 pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> { 38 // If there is a lead but the next byte isn't a valid trail, an 39 // error is generated for the lead (+1). Then another iteration checks 40 // space, which needs +1 to account for the possibility of astral 41 // output or combining pair. 42 checked_add(1, self.plus_one_if_lead(byte_length)) 43 } 44 max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize>45 pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> { 46 // No need to account for REPLACEMENT CHARACTERS. 47 // Cases: 48 // ASCII: 1 to 1 49 // Valid pair: 2 to 2, 2 to 3 or 2 to 4, i.e. worst case 2 to 4 50 // lead set and first byte is trail: 1 to 4 worst case 51 // 52 // When checking for space for the last byte: 53 // no lead: the last byte must be ASCII (or fatal error): 1 to 1 54 // lead set: space for 4 bytes was already checked when reading the 55 // lead, hence the last lead and the last trail together are worst 56 // case 2 to 4. 57 // 58 // If lead set and the input is a single trail byte, the worst-case 59 // output is 4, so we need to add one before multiplying if lead is 60 // set. 61 // 62 // Finally, add two so that if input is non-zero, the output is at 63 // least 4. 64 checked_add(2, checked_mul(2, self.plus_one_if_lead(byte_length))) 65 } 66 max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize>67 pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> { 68 // If there is a lead but the next byte isn't a valid trail, an 69 // error is generated for the lead (+(1*3)). Then another iteration 70 // checks space, which needs +3 to account for the possibility of astral 71 // output or combining pair. In between start and end, the worst case 72 // is that every byte is bad: *3. 73 checked_add(3, checked_mul(3, self.plus_one_if_lead(byte_length))) 74 } 75 76 ascii_compatible_two_byte_decoder_functions!( 77 { 78 // If lead is between 0x81 and 0xFE, inclusive, 79 // subtract offset 0x81. 80 let non_ascii_minus_offset = 81 non_ascii.wrapping_sub(0x81); 82 if non_ascii_minus_offset > (0xFE - 0x81) { 83 return (DecoderResult::Malformed(1, 0), 84 source.consumed(), 85 handle.written()); 86 } 87 non_ascii_minus_offset 88 }, 89 { 90 // If trail is between 0x40 and 0x7E, inclusive, 91 // subtract offset 0x40. Else if trail is 92 // between 0xA1 and 0xFE, inclusive, subtract 93 // offset 0x62. 94 // TODO: Find out which range is more probable. 95 let mut trail_minus_offset = 96 byte.wrapping_sub(0x40); 97 if trail_minus_offset > (0x7E - 0x40) { 98 let trail_minus_range_start = 99 byte.wrapping_sub(0xA1); 100 if trail_minus_range_start > 101 (0xFE - 0xA1) { 102 if byte < 0x80 { 103 return (DecoderResult::Malformed(1, 0), 104 unread_handle_trail.unread(), 105 handle.written()); 106 } 107 return (DecoderResult::Malformed(2, 0), 108 unread_handle_trail.consumed(), 109 handle.written()); 110 } 111 trail_minus_offset = byte - 0x62; 112 } 113 let pointer = lead_minus_offset as usize * 114 157usize + 115 trail_minus_offset as usize; 116 let rebased_pointer = pointer.wrapping_sub(942); 117 let low_bits = big5_low_bits(rebased_pointer); 118 if low_bits == 0 { 119 match pointer { 120 1133 => { 121 handle.write_big5_combination(0x00CAu16, 122 0x0304u16) 123 } 124 1135 => { 125 handle.write_big5_combination(0x00CAu16, 126 0x030Cu16) 127 } 128 1164 => { 129 handle.write_big5_combination(0x00EAu16, 130 0x0304u16) 131 } 132 1166 => { 133 handle.write_big5_combination(0x00EAu16, 134 0x030Cu16) 135 } 136 _ => { 137 if byte < 0x80 { 138 return (DecoderResult::Malformed(1, 0), 139 unread_handle_trail.unread(), 140 handle.written()); 141 } 142 return (DecoderResult::Malformed(2, 0), 143 unread_handle_trail.consumed(), 144 handle.written()); 145 } 146 } 147 } else if big5_is_astral(rebased_pointer) { 148 handle.write_astral(u32::from(low_bits) | 149 0x20000u32) 150 } else { 151 handle.write_bmp_excl_ascii(low_bits) 152 } 153 }, 154 self, 155 non_ascii, 156 byte, 157 lead_minus_offset, 158 unread_handle_trail, 159 source, 160 handle, 161 'outermost, 162 copy_ascii_from_check_space_astral, 163 check_space_astral, 164 false); 165 } 166 167 pub struct Big5Encoder; 168 169 impl Big5Encoder { new(encoding: &'static Encoding) -> Encoder170 pub fn new(encoding: &'static Encoding) -> Encoder { 171 Encoder::new(encoding, VariantEncoder::Big5(Big5Encoder)) 172 } 173 max_buffer_length_from_utf16_without_replacement( &self, u16_length: usize, ) -> Option<usize>174 pub fn max_buffer_length_from_utf16_without_replacement( 175 &self, 176 u16_length: usize, 177 ) -> Option<usize> { 178 // Astral: 2 to 2 179 // ASCII: 1 to 1 180 // Other: 1 to 2 181 u16_length.checked_mul(2) 182 } 183 max_buffer_length_from_utf8_without_replacement( &self, byte_length: usize, ) -> Option<usize>184 pub fn max_buffer_length_from_utf8_without_replacement( 185 &self, 186 byte_length: usize, 187 ) -> Option<usize> { 188 // Astral: 4 to 2 189 // Upper BMP: 3 to 2 190 // Lower BMP: 2 to 2 191 // ASCII: 1 to 1 192 byte_length.checked_add(1) 193 } 194 195 ascii_compatible_encoder_functions!( 196 { 197 // For simplicity, unified ideographs 198 // in the pointer range 11206...11212 are handled 199 // as Level 1 Hanzi. 200 if let Some((lead, trail)) = big5_level1_hanzi_encode(bmp) { 201 handle.write_two(lead, trail) 202 } else { 203 let pointer = if let Some(pointer) = big5_box_encode(bmp) { 204 pointer 205 } else if let Some(pointer) = big5_other_encode(bmp) { 206 pointer 207 } else { 208 return ( 209 EncoderResult::unmappable_from_bmp(bmp), 210 source.consumed(), 211 handle.written(), 212 ); 213 }; 214 let lead = pointer / 157 + 0x81; 215 let remainder = pointer % 157; 216 let trail = if remainder < 0x3F { 217 remainder + 0x40 218 } else { 219 remainder + 0x62 220 }; 221 handle.write_two(lead as u8, trail as u8) 222 } 223 }, 224 { 225 if in_inclusive_range32(astral as u32, 0x2008A, 0x2F8A6) { 226 if let Some(rebased_pointer) = big5_astral_encode(astral as u16) { 227 // big5_astral_encode returns rebased pointer, 228 // so adding 0x87 instead of 0x81. 229 let lead = rebased_pointer / 157 + 0x87; 230 let remainder = rebased_pointer % 157; 231 let trail = if remainder < 0x3F { 232 remainder + 0x40 233 } else { 234 remainder + 0x62 235 }; 236 handle.write_two(lead as u8, trail as u8) 237 } else { 238 return ( 239 EncoderResult::Unmappable(astral), 240 source.consumed(), 241 handle.written(), 242 ); 243 } 244 } else { 245 return ( 246 EncoderResult::Unmappable(astral), 247 source.consumed(), 248 handle.written(), 249 ); 250 } 251 }, 252 bmp, 253 astral, 254 self, 255 source, 256 handle, 257 copy_ascii_to_check_space_two, 258 check_space_two, 259 false 260 ); 261 } 262 263 // Any copyright to the test code below this comment is dedicated to the 264 // Public Domain. http://creativecommons.org/publicdomain/zero/1.0/ 265 266 #[cfg(test)] 267 mod tests { 268 use super::super::testing::*; 269 use super::super::*; 270 decode_big5(bytes: &[u8], expect: &str)271 fn decode_big5(bytes: &[u8], expect: &str) { 272 decode(BIG5, bytes, expect); 273 } 274 encode_big5(string: &str, expect: &[u8])275 fn encode_big5(string: &str, expect: &[u8]) { 276 encode(BIG5, string, expect); 277 } 278 279 #[test] test_big5_decode()280 fn test_big5_decode() { 281 // Empty 282 decode_big5(b"", &""); 283 284 // ASCII 285 decode_big5(&[0x61u8, 0x62u8], &"\u{0061}\u{0062}"); 286 287 // Edge cases 288 decode_big5(&[0x87u8, 0x40u8], &"\u{43F0}"); 289 decode_big5(&[0xFEu8, 0xFEu8], &"\u{79D4}"); 290 decode_big5(&[0xFEu8, 0xFDu8], &"\u{2910D}"); 291 decode_big5(&[0x88u8, 0x62u8], &"\u{00CA}\u{0304}"); 292 decode_big5(&[0x88u8, 0x64u8], &"\u{00CA}\u{030C}"); 293 decode_big5(&[0x88u8, 0x66u8], &"\u{00CA}"); 294 decode_big5(&[0x88u8, 0xA3u8], &"\u{00EA}\u{0304}"); 295 decode_big5(&[0x88u8, 0xA5u8], &"\u{00EA}\u{030C}"); 296 decode_big5(&[0x88u8, 0xA7u8], &"\u{00EA}"); 297 decode_big5(&[0x99u8, 0xD4u8], &"\u{8991}"); 298 decode_big5(&[0x99u8, 0xD5u8], &"\u{27967}"); 299 decode_big5(&[0x99u8, 0xD6u8], &"\u{8A29}"); 300 301 // Edge cases surrounded with ASCII 302 decode_big5( 303 &[0x61u8, 0x87u8, 0x40u8, 0x62u8], 304 &"\u{0061}\u{43F0}\u{0062}", 305 ); 306 decode_big5( 307 &[0x61u8, 0xFEu8, 0xFEu8, 0x62u8], 308 &"\u{0061}\u{79D4}\u{0062}", 309 ); 310 decode_big5( 311 &[0x61u8, 0xFEu8, 0xFDu8, 0x62u8], 312 &"\u{0061}\u{2910D}\u{0062}", 313 ); 314 decode_big5( 315 &[0x61u8, 0x88u8, 0x62u8, 0x62u8], 316 &"\u{0061}\u{00CA}\u{0304}\u{0062}", 317 ); 318 decode_big5( 319 &[0x61u8, 0x88u8, 0x64u8, 0x62u8], 320 &"\u{0061}\u{00CA}\u{030C}\u{0062}", 321 ); 322 decode_big5( 323 &[0x61u8, 0x88u8, 0x66u8, 0x62u8], 324 &"\u{0061}\u{00CA}\u{0062}", 325 ); 326 decode_big5( 327 &[0x61u8, 0x88u8, 0xA3u8, 0x62u8], 328 &"\u{0061}\u{00EA}\u{0304}\u{0062}", 329 ); 330 decode_big5( 331 &[0x61u8, 0x88u8, 0xA5u8, 0x62u8], 332 &"\u{0061}\u{00EA}\u{030C}\u{0062}", 333 ); 334 decode_big5( 335 &[0x61u8, 0x88u8, 0xA7u8, 0x62u8], 336 &"\u{0061}\u{00EA}\u{0062}", 337 ); 338 decode_big5( 339 &[0x61u8, 0x99u8, 0xD4u8, 0x62u8], 340 &"\u{0061}\u{8991}\u{0062}", 341 ); 342 decode_big5( 343 &[0x61u8, 0x99u8, 0xD5u8, 0x62u8], 344 &"\u{0061}\u{27967}\u{0062}", 345 ); 346 decode_big5( 347 &[0x61u8, 0x99u8, 0xD6u8, 0x62u8], 348 &"\u{0061}\u{8A29}\u{0062}", 349 ); 350 351 // Bad sequences 352 decode_big5(&[0x80u8, 0x61u8], &"\u{FFFD}\u{0061}"); 353 decode_big5(&[0xFFu8, 0x61u8], &"\u{FFFD}\u{0061}"); 354 decode_big5(&[0xFEu8, 0x39u8], &"\u{FFFD}\u{0039}"); 355 decode_big5(&[0x87u8, 0x66u8], &"\u{FFFD}\u{0066}"); 356 decode_big5(&[0x81u8, 0x40u8], &"\u{FFFD}\u{0040}"); 357 decode_big5(&[0x61u8, 0x81u8], &"\u{0061}\u{FFFD}"); 358 } 359 360 #[test] test_big5_encode()361 fn test_big5_encode() { 362 // Empty 363 encode_big5("", b""); 364 365 // ASCII 366 encode_big5("\u{0061}\u{0062}", b"\x61\x62"); 367 368 // Edge cases 369 encode_big5("\u{9EA6}\u{0061}", b"麦\x61"); 370 encode_big5("\u{2626B}\u{0061}", b"𦉫\x61"); 371 encode_big5("\u{3000}", b"\xA1\x40"); 372 encode_big5("\u{20AC}", b"\xA3\xE1"); 373 encode_big5("\u{4E00}", b"\xA4\x40"); 374 encode_big5("\u{27607}", b"\xC8\xA4"); 375 encode_big5("\u{FFE2}", b"\xC8\xCD"); 376 encode_big5("\u{79D4}", b"\xFE\xFE"); 377 378 // Not in index 379 encode_big5("\u{2603}\u{0061}", b"☃\x61"); 380 381 // duplicate low bits 382 encode_big5("\u{203B5}", b"\xFD\x6A"); 383 encode_big5("\u{25605}", b"\xFE\x46"); 384 385 // prefer last 386 encode_big5("\u{2550}", b"\xF9\xF9"); 387 } 388 389 #[test] test_big5_decode_all()390 fn test_big5_decode_all() { 391 let input = include_bytes!("test_data/big5_in.txt"); 392 let expectation = include_str!("test_data/big5_in_ref.txt"); 393 let (cow, had_errors) = BIG5.decode_without_bom_handling(input); 394 assert!(had_errors, "Should have had errors."); 395 assert_eq!(&cow[..], expectation); 396 } 397 398 #[test] test_big5_encode_all()399 fn test_big5_encode_all() { 400 let input = include_str!("test_data/big5_out.txt"); 401 let expectation = include_bytes!("test_data/big5_out_ref.txt"); 402 let (cow, encoding, had_errors) = BIG5.encode(input); 403 assert!(!had_errors, "Should not have had errors."); 404 assert_eq!(encoding, BIG5); 405 assert_eq!(&cow[..], &expectation[..]); 406 } 407 408 #[test] test_big5_encode_from_two_low_surrogates()409 fn test_big5_encode_from_two_low_surrogates() { 410 let expectation = b"��"; 411 let mut output = [0u8; 40]; 412 let mut encoder = BIG5.new_encoder(); 413 let (result, read, written, had_errors) = 414 encoder.encode_from_utf16(&[0xDC00u16, 0xDEDEu16], &mut output[..], true); 415 assert_eq!(result, CoderResult::InputEmpty); 416 assert_eq!(read, 2); 417 assert_eq!(written, expectation.len()); 418 assert!(had_errors); 419 assert_eq!(&output[..written], expectation); 420 } 421 } 422