1 // Copyright 2015-2016 Mozilla Foundation. See the COPYRIGHT 2 // file at the top-level directory of this distribution. 3 // 4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or 5 // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license 6 // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your 7 // option. This file may not be copied, modified, or distributed 8 // except according to those terms. 9 10 use handles::*; 11 use data::*; 12 use variant::*; 13 use super::*; 14 // Rust 1.14.0 requires the following despite the asterisk above. 15 use super::in_inclusive_range16; 16 17 enum EucJpPending { 18 None, 19 Jis0208Lead(u8), 20 Jis0212Shift, 21 Jis0212Lead(u8), 22 HalfWidthKatakana, 23 } 24 25 impl EucJpPending { is_none(&self) -> bool26 fn is_none(&self) -> bool { 27 match *self { 28 EucJpPending::None => true, 29 _ => false, 30 } 31 } 32 count(&self) -> usize33 fn count(&self) -> usize { 34 match *self { 35 EucJpPending::None => 0, 36 EucJpPending::Jis0208Lead(_) | 37 EucJpPending::Jis0212Shift | 38 EucJpPending::HalfWidthKatakana => 1, 39 EucJpPending::Jis0212Lead(_) => 2, 40 } 41 } 42 } 43 44 pub struct EucJpDecoder { 45 pending: EucJpPending, 46 } 47 48 impl EucJpDecoder { new() -> VariantDecoder49 pub fn new() -> VariantDecoder { 50 VariantDecoder::EucJp(EucJpDecoder { pending: EucJpPending::None }) 51 } 52 plus_one_if_lead(&self, byte_length: usize) -> Option<usize>53 fn plus_one_if_lead(&self, byte_length: usize) -> Option<usize> { 54 byte_length.checked_add(if self.pending.is_none() { 0 } else { 1 }) 55 } 56 max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize>57 pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> { 58 self.plus_one_if_lead(byte_length) 59 } 60 max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize>61 pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> { 62 // worst case: 2 to 3 63 let len = self.plus_one_if_lead(byte_length); 64 checked_add(2, checked_add_opt(len, checked_div(checked_add(1, len), 2))) 65 } 66 max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize>67 pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> { 68 checked_mul(3, self.plus_one_if_lead(byte_length)) 69 } 70 71 euc_jp_decoder_functions!( 72 { 73 let trail_minus_offset = byte.wrapping_sub(0xA1); 74 // Fast-track Hiragana (60% according to Lunde) 75 // and Katakana (10% acconding to Lunde). 76 if jis0208_lead_minus_offset == 0x03 && trail_minus_offset < 0x53 { 77 // Hiragana 78 handle.write_upper_bmp(0x3041 + trail_minus_offset as u16) 79 } else if jis0208_lead_minus_offset == 0x04 && trail_minus_offset < 0x56 { 80 // Katakana 81 handle.write_upper_bmp(0x30A1 + trail_minus_offset as u16) 82 } else if trail_minus_offset > (0xFE - 0xA1) { 83 if byte < 0x80 { 84 return (DecoderResult::Malformed(1, 0), 85 unread_handle_trail.unread(), 86 handle.written()); 87 } 88 return (DecoderResult::Malformed(2, 0), 89 unread_handle_trail.consumed(), 90 handle.written()); 91 } else { 92 let pointer = mul_94(jis0208_lead_minus_offset) + trail_minus_offset as usize; 93 let level1_pointer = pointer.wrapping_sub(1410); 94 if level1_pointer < JIS0208_LEVEL1_KANJI.len() { 95 handle.write_upper_bmp(JIS0208_LEVEL1_KANJI[level1_pointer]) 96 } else { 97 let level2_pointer = pointer.wrapping_sub(4418); 98 if level2_pointer < JIS0208_LEVEL2_AND_ADDITIONAL_KANJI.len() { 99 handle.write_upper_bmp(JIS0208_LEVEL2_AND_ADDITIONAL_KANJI[level2_pointer]) 100 } else { 101 let ibm_pointer = pointer.wrapping_sub(8272); 102 if ibm_pointer < IBM_KANJI.len() { 103 handle.write_upper_bmp(IBM_KANJI[ibm_pointer]) 104 } else if let Some(bmp) = jis0208_symbol_decode(pointer) { 105 handle.write_bmp_excl_ascii(bmp) 106 } else if let Some(bmp) = jis0208_range_decode(pointer) { 107 handle.write_bmp_excl_ascii(bmp) 108 } else { 109 return (DecoderResult::Malformed(2, 0), 110 unread_handle_trail.consumed(), 111 handle.written()); 112 } 113 } 114 } 115 } 116 }, 117 { 118 // If lead is between 0xA1 and 0xFE, inclusive, 119 // subtract 0xA1. 120 let jis0212_lead_minus_offset = lead.wrapping_sub(0xA1); 121 if jis0212_lead_minus_offset > (0xFE - 0xA1) { 122 if lead < 0x80 { 123 return (DecoderResult::Malformed(1, 0), 124 unread_handle_jis0212.unread(), 125 handle.written()); 126 } 127 return (DecoderResult::Malformed(2, 0), 128 unread_handle_jis0212.consumed(), 129 handle.written()); 130 } 131 jis0212_lead_minus_offset 132 }, 133 { 134 // If trail is between 0xA1 and 0xFE, inclusive, 135 // subtract 0xA1. 136 let trail_minus_offset = byte.wrapping_sub(0xA1); 137 if trail_minus_offset > (0xFE - 0xA1) { 138 if byte < 0x80 { 139 return (DecoderResult::Malformed(2, 0), 140 unread_handle_trail.unread(), 141 handle.written()); 142 } 143 return (DecoderResult::Malformed(3, 0), 144 unread_handle_trail.consumed(), 145 handle.written()); 146 } 147 let pointer = mul_94(jis0212_lead_minus_offset) + trail_minus_offset as usize; 148 let pointer_minus_kanji = pointer.wrapping_sub(1410); 149 if pointer_minus_kanji < JIS0212_KANJI.len() { 150 handle.write_upper_bmp(JIS0212_KANJI[pointer_minus_kanji]) 151 } else if let Some(bmp) = jis0212_accented_decode(pointer) { 152 handle.write_bmp_excl_ascii(bmp) 153 } else { 154 let pointer_minus_upper_cyrillic = pointer.wrapping_sub(597); 155 if pointer_minus_upper_cyrillic <= (607 - 597) { 156 handle.write_mid_bmp(0x0402 + pointer_minus_upper_cyrillic as u16) 157 } else { 158 let pointer_minus_lower_cyrillic = pointer.wrapping_sub(645); 159 if pointer_minus_lower_cyrillic <= (655 - 645) { 160 handle.write_mid_bmp(0x0452 + pointer_minus_lower_cyrillic as u16) 161 } else { 162 return (DecoderResult::Malformed(3, 0), 163 unread_handle_trail.consumed(), 164 handle.written()); 165 } 166 } 167 } 168 }, 169 { 170 // If trail is between 0xA1 and 0xDF, inclusive, 171 // subtract 0xA1 and map to half-width Katakana. 172 let trail_minus_offset = byte.wrapping_sub(0xA1); 173 if trail_minus_offset > (0xDF - 0xA1) { 174 if byte < 0x80 { 175 return (DecoderResult::Malformed(1, 0), 176 unread_handle_trail.unread(), 177 handle.written()); 178 } 179 return (DecoderResult::Malformed(2, 0), 180 unread_handle_trail.consumed(), 181 handle.written()); 182 } 183 handle.write_upper_bmp(0xFF61 + trail_minus_offset as u16) 184 }, 185 self, 186 non_ascii, 187 jis0208_lead_minus_offset, 188 byte, 189 unread_handle_trail, 190 jis0212_lead_minus_offset, 191 lead, 192 unread_handle_jis0212, 193 source, 194 handle 195 ); 196 } 197 198 pub struct EucJpEncoder; 199 200 impl EucJpEncoder { new(encoding: &'static Encoding) -> Encoder201 pub fn new(encoding: &'static Encoding) -> Encoder { 202 Encoder::new(encoding, VariantEncoder::EucJp(EucJpEncoder)) 203 } 204 max_buffer_length_from_utf16_without_replacement(&self, u16_length: usize) -> Option<usize>205 pub fn max_buffer_length_from_utf16_without_replacement(&self, 206 u16_length: usize) 207 -> Option<usize> { 208 u16_length.checked_mul(2) 209 } 210 max_buffer_length_from_utf8_without_replacement(&self, byte_length: usize) -> Option<usize>211 pub fn max_buffer_length_from_utf8_without_replacement(&self, 212 byte_length: usize) 213 -> Option<usize> { 214 byte_length.checked_add(1) 215 } 216 217 ascii_compatible_bmp_encoder_functions!( 218 { 219 // Lunde says 60% Hiragana, 30% Kanji, 10% Katakana 220 let bmp_minus_hiragana = bmp.wrapping_sub(0x3041); 221 if bmp_minus_hiragana < 0x53 { 222 handle.write_two(0xA4, 0xA1 + bmp_minus_hiragana as u8) 223 } else if in_inclusive_range16(bmp, 0x4E00, 0x9FA0) { 224 if 0x4EDD == bmp { 225 // Ideograph on the symbol row! 226 handle.write_two(0xA1, 0xB8) 227 } else if let Some((lead, trail)) = jis0208_level1_kanji_euc_jp_encode(bmp) { 228 handle.write_two(lead, trail) 229 } else if let Some(pos) = jis0208_level2_and_additional_kanji_encode(bmp) { 230 let lead = (pos / 94) + 0xD0; 231 let trail = (pos % 94) + 0xA1; 232 handle.write_two(lead as u8, trail as u8) 233 } else if let Some(pos) = position(&IBM_KANJI[..], bmp) { 234 let lead = (pos / 94) + 0xF9; 235 let trail = (pos % 94) + 0xA1; 236 handle.write_two(lead as u8, trail as u8) 237 } else { 238 return (EncoderResult::unmappable_from_bmp(bmp), 239 source.consumed(), 240 handle.written()); 241 } 242 } else { 243 let bmp_minus_katakana = bmp.wrapping_sub(0x30A1); 244 if bmp_minus_katakana < 0x56 { 245 handle.write_two(0xA5, 0xA1 + bmp_minus_katakana as u8) 246 } else { 247 let bmp_minus_space = bmp.wrapping_sub(0x3000); 248 if bmp_minus_space < 3 { 249 // fast-track common punctuation 250 handle.write_two(0xA1, 0xA1 + bmp_minus_space as u8) 251 } else if bmp == 0xA5 { 252 handle.write_one(0x5Cu8) 253 } else if bmp == 0x203E { 254 handle.write_one(0x7Eu8) 255 } else if in_inclusive_range16(bmp, 0xFF61, 0xFF9F) { 256 handle.write_two(0x8Eu8, (bmp - (0xFF61 - 0xA1)) as u8) 257 } else if bmp == 0x2212 { 258 handle.write_two(0xA1u8, 0xDDu8) 259 } else if let Some(pointer) = jis0208_range_encode(bmp) { 260 let lead = (pointer / 94) + 0xA1; 261 let trail = (pointer % 94) + 0xA1; 262 handle.write_two(lead as u8, trail as u8) 263 } else if in_inclusive_range16(bmp, 0xFA0E, 0xFA2D) || bmp == 0xF929 || 264 bmp == 0xF9DC { 265 // Guaranteed to be found in IBM_KANJI 266 let pos = position(&IBM_KANJI[..], bmp).unwrap(); 267 let lead = (pos / 94) + 0xF9; 268 let trail = (pos % 94) + 0xA1; 269 handle.write_two(lead as u8, trail as u8) 270 } else if let Some(pointer) = ibm_symbol_encode(bmp) { 271 let lead = (pointer / 94) + 0xA1; 272 let trail = (pointer % 94) + 0xA1; 273 handle.write_two(lead as u8, trail as u8) 274 } else if let Some(pointer) = jis0208_symbol_encode(bmp) { 275 let lead = (pointer / 94) + 0xA1; 276 let trail = (pointer % 94) + 0xA1; 277 handle.write_two(lead as u8, trail as u8) 278 } else { 279 return (EncoderResult::unmappable_from_bmp(bmp), 280 source.consumed(), 281 handle.written()); 282 } 283 } 284 } 285 }, 286 bmp, 287 self, 288 source, 289 handle, 290 copy_ascii_to_check_space_two, 291 check_space_two, 292 false 293 ); 294 } 295 296 // Any copyright to the test code below this comment is dedicated to the 297 // Public Domain. http://creativecommons.org/publicdomain/zero/1.0/ 298 299 #[cfg(test)] 300 mod tests { 301 use super::super::testing::*; 302 use super::super::*; 303 decode_euc_jp(bytes: &[u8], expect: &str)304 fn decode_euc_jp(bytes: &[u8], expect: &str) { 305 decode(EUC_JP, bytes, expect); 306 } 307 encode_euc_jp(string: &str, expect: &[u8])308 fn encode_euc_jp(string: &str, expect: &[u8]) { 309 encode(EUC_JP, string, expect); 310 } 311 312 #[test] test_euc_jp_decode()313 fn test_euc_jp_decode() { 314 // Empty 315 decode_euc_jp(b"", &""); 316 317 // ASCII 318 decode_euc_jp(b"\x61\x62", "\u{0061}\u{0062}"); 319 320 // Half-width 321 decode_euc_jp(b"\x8E\xA1", "\u{FF61}"); 322 decode_euc_jp(b"\x8E\xDF", "\u{FF9F}"); 323 decode_euc_jp(b"\x8E\xA0", "\u{FFFD}"); 324 decode_euc_jp(b"\x8E\xE0", "\u{FFFD}"); 325 decode_euc_jp(b"\x8E\xFF", "\u{FFFD}"); 326 decode_euc_jp(b"\x8E", "\u{FFFD}"); 327 328 // JIS 0212 329 decode_euc_jp(b"\x8F\xA1\xA1", "\u{FFFD}"); 330 decode_euc_jp(b"\x8F\xA2\xAF", "\u{02D8}"); 331 decode_euc_jp(b"\x8F\xA2\xFF", "\u{FFFD}"); 332 decode_euc_jp(b"\x8F\xA1", "\u{FFFD}"); 333 decode_euc_jp(b"\x8F", "\u{FFFD}"); 334 335 // JIS 0208 336 decode_euc_jp(b"\xA1\xA1", "\u{3000}"); 337 decode_euc_jp(b"\xA1\xA0", "\u{FFFD}"); 338 decode_euc_jp(b"\xFC\xFE", "\u{FF02}"); 339 decode_euc_jp(b"\xFE\xFE", "\u{FFFD}"); 340 decode_euc_jp(b"\xA1", "\u{FFFD}"); 341 342 // Bad leads 343 decode_euc_jp(b"\xFF\xA1\xA1", "\u{FFFD}\u{3000}"); 344 decode_euc_jp(b"\xA0\xA1\xA1", "\u{FFFD}\u{3000}"); 345 decode_euc_jp(b"\x80\xA1\xA1", "\u{FFFD}\u{3000}"); 346 decode_euc_jp(b"\x81\xA1\xA1", "\u{FFFD}\u{3000}"); 347 decode_euc_jp(b"\x82\xA1\xA1", "\u{FFFD}\u{3000}"); 348 decode_euc_jp(b"\x83\xA1\xA1", "\u{FFFD}\u{3000}"); 349 decode_euc_jp(b"\x84\xA1\xA1", "\u{FFFD}\u{3000}"); 350 decode_euc_jp(b"\x85\xA1\xA1", "\u{FFFD}\u{3000}"); 351 decode_euc_jp(b"\x86\xA1\xA1", "\u{FFFD}\u{3000}"); 352 decode_euc_jp(b"\x87\xA1\xA1", "\u{FFFD}\u{3000}"); 353 decode_euc_jp(b"\x88\xA1\xA1", "\u{FFFD}\u{3000}"); 354 decode_euc_jp(b"\x89\xA1\xA1", "\u{FFFD}\u{3000}"); 355 decode_euc_jp(b"\x8A\xA1\xA1", "\u{FFFD}\u{3000}"); 356 decode_euc_jp(b"\x8B\xA1\xA1", "\u{FFFD}\u{3000}"); 357 decode_euc_jp(b"\x8C\xA1\xA1", "\u{FFFD}\u{3000}"); 358 decode_euc_jp(b"\x8D\xA1\xA1", "\u{FFFD}\u{3000}"); 359 360 // Bad ASCII trail 361 decode_euc_jp(b"\xA1\x40", "\u{FFFD}\u{0040}"); 362 } 363 364 #[test] test_euc_jp_encode()365 fn test_euc_jp_encode() { 366 // Empty 367 encode_euc_jp("", b""); 368 369 // ASCII 370 encode_euc_jp("\u{0061}\u{0062}", b"\x61\x62"); 371 372 // Exceptional code points 373 encode_euc_jp("\u{00A5}", b"\x5C"); 374 encode_euc_jp("\u{203E}", b"\x7E"); 375 encode_euc_jp("\u{2212}", b"\xA1\xDD"); 376 377 // Half-width 378 encode_euc_jp("\u{FF61}", b"\x8E\xA1"); 379 encode_euc_jp("\u{FF9F}", b"\x8E\xDF"); 380 381 // JIS 0212 382 encode_euc_jp("\u{02D8}", b"˘"); 383 384 // JIS 0208 385 encode_euc_jp("\u{3000}", b"\xA1\xA1"); 386 encode_euc_jp("\u{FF02}", b"\xFC\xFE"); 387 } 388 389 #[test] test_jis0208_decode_all()390 fn test_jis0208_decode_all() { 391 let input = include_bytes!("test_data/jis0208_in.txt"); 392 let expectation = include_str!("test_data/jis0208_in_ref.txt"); 393 let (cow, had_errors) = EUC_JP.decode_without_bom_handling(input); 394 assert!(had_errors, "Should have had errors."); 395 assert_eq!(&cow[..], expectation); 396 } 397 398 #[test] test_jis0208_encode_all()399 fn test_jis0208_encode_all() { 400 let input = include_str!("test_data/jis0208_out.txt"); 401 let expectation = include_bytes!("test_data/jis0208_out_ref.txt"); 402 let (cow, encoding, had_errors) = EUC_JP.encode(input); 403 assert!(!had_errors, "Should not have had errors."); 404 assert_eq!(encoding, EUC_JP); 405 assert_eq!(&cow[..], &expectation[..]); 406 } 407 408 #[test] test_jis0212_decode_all()409 fn test_jis0212_decode_all() { 410 let input = include_bytes!("test_data/jis0212_in.txt"); 411 let expectation = include_str!("test_data/jis0212_in_ref.txt"); 412 let (cow, had_errors) = EUC_JP.decode_without_bom_handling(input); 413 assert!(had_errors, "Should have had errors."); 414 assert_eq!(&cow[..], expectation); 415 } 416 } 417