1 // Copyright 2015-2016 Mozilla Foundation. See the COPYRIGHT
2 // file at the top-level directory of this distribution.
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
9 
10 use super::*;
11 use data::*;
12 use handles::*;
13 use variant::*;
14 // Rust 1.14.0 requires the following despite the asterisk above.
15 use super::in_inclusive_range16;
16 
/// State the EUC-JP decoder keeps for a multi-byte sequence that has been
/// started but not yet completed.
enum EucJpPending {
    /// Nothing is pending.
    None,
    /// A JIS X 0208 lead has been seen; carries a value derived from it.
    /// NOTE(review): presumably the lead byte minus 0xA1 — confirm against
    /// the decoder macro.
    Jis0208Lead(u8),
    /// The JIS X 0212 shift has been seen, but no lead yet.
    /// NOTE(review): presumably byte 0x8F — confirm.
    Jis0212Shift,
    /// The JIS X 0212 shift and a lead have been seen; carries a value
    /// derived from the lead.
    Jis0212Lead(u8),
    /// The half-width Katakana shift has been seen.
    /// NOTE(review): presumably byte 0x8E — confirm.
    HalfWidthKatakana,
}

impl EucJpPending {
    /// Returns `true` only for `EucJpPending::None`.
    fn is_none(&self) -> bool {
        if let EucJpPending::None = *self {
            true
        } else {
            false
        }
    }

    /// Returns 0 for `None`, 2 for `Jis0212Lead` and 1 for every other
    /// state.
    /// NOTE(review): presumably the number of input bytes already consumed
    /// for the unfinished sequence — confirm against the callers.
    fn count(&self) -> usize {
        match *self {
            EucJpPending::None => 0,
            EucJpPending::Jis0212Lead(_) => 2,
            EucJpPending::Jis0208Lead(_)
            | EucJpPending::Jis0212Shift
            | EucJpPending::HalfWidthKatakana => 1,
        }
    }
}
43 
/// Decoder for EUC-JP. Its only state is the not-yet-completed multi-byte
/// sequence, if any.
pub struct EucJpDecoder {
    /// What has been seen so far of an unfinished multi-byte sequence;
    /// `EucJpPending::None` when the decoder is between characters.
    pending: EucJpPending,
}
47 
48 impl EucJpDecoder {
new() -> VariantDecoder49     pub fn new() -> VariantDecoder {
50         VariantDecoder::EucJp(EucJpDecoder {
51             pending: EucJpPending::None,
52         })
53     }
54 
in_neutral_state(&self) -> bool55     pub fn in_neutral_state(&self) -> bool {
56         self.pending.is_none()
57     }
58 
plus_one_if_lead(&self, byte_length: usize) -> Option<usize>59     fn plus_one_if_lead(&self, byte_length: usize) -> Option<usize> {
60         byte_length.checked_add(if self.pending.is_none() { 0 } else { 1 })
61     }
62 
    /// Upper bound on the UTF-16 output size for `byte_length` input bytes.
    ///
    /// This is `byte_length`, plus one when a partial sequence is pending
    /// (see `plus_one_if_lead`); `None` on `usize` overflow.
    pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
        self.plus_one_if_lead(byte_length)
    }
66 
max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize>67     pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
68         // worst case: 2 to 3
69         let len = self.plus_one_if_lead(byte_length);
70         checked_add(2, checked_add_opt(len, checked_div(checked_add(1, len), 2)))
71     }
72 
max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize>73     pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
74         checked_mul(3, self.plus_one_if_lead(byte_length))
75     }
76 
    // Invokes `euc_jp_decoder_functions!` (defined elsewhere) with four code
    // blocks followed by the identifiers those blocks refer to.
    // NOTE(review): the macro body is not visible from this file; the role
    // attributed to each block below is inferred from the identifiers it
    // uses — confirm against the macro definition.
    euc_jp_decoder_functions!(
        {
            // Block 1: `byte` is the trail that follows a JIS X 0208 lead,
            // the lead being available as `jis0208_lead_minus_offset`.
            // Wrapping subtraction sends bytes below 0xA1 to large values,
            // so a single upper-bound comparison rejects both ends of the
            // valid 0xA1..=0xFE range.
            let trail_minus_offset = byte.wrapping_sub(0xA1);
            // Fast-track Hiragana (60% according to Lunde)
            // and Katakana (10% according to Lunde).
            if jis0208_lead_minus_offset == 0x03 && trail_minus_offset < 0x53 {
                // Hiragana
                // Offsets 0..0x53 map linearly onto U+3041..=U+3093.
                handle.write_upper_bmp(0x3041 + u16::from(trail_minus_offset))
            } else if jis0208_lead_minus_offset == 0x04 && trail_minus_offset < 0x56 {
                // Katakana
                // Offsets 0..0x56 map linearly onto U+30A1..=U+30F6.
                handle.write_upper_bmp(0x30A1 + u16::from(trail_minus_offset))
            } else if trail_minus_offset > (0xFE - 0xA1) {
                // Invalid trail. An ASCII-range byte is left unread and only
                // one byte is reported as malformed; any other byte is
                // consumed and two bytes are reported.
                if byte < 0x80 {
                    return (
                        DecoderResult::Malformed(1, 0),
                        unread_handle_trail.unread(),
                        handle.written(),
                    );
                }
                return (
                    DecoderResult::Malformed(2, 0),
                    unread_handle_trail.consumed(),
                    handle.written(),
                );
            } else {
                // Linear table pointer built from lead and trail.
                // NOTE(review): `mul_94` presumably multiplies by 94 (the
                // encoder side divides by 94) — confirm.
                let pointer = mul_94(jis0208_lead_minus_offset) + usize::from(trail_minus_offset);
                // 1410 == 15 * 94. Pointers below it wrap to huge values and
                // fail the length check, so one comparison tests both bounds.
                let level1_pointer = pointer.wrapping_sub(1410);
                if level1_pointer < JIS0208_LEVEL1_KANJI.len() {
                    handle.write_upper_bmp(JIS0208_LEVEL1_KANJI[level1_pointer])
                } else {
                    // 4418 == 47 * 94.
                    let level2_pointer = pointer.wrapping_sub(4418);
                    if level2_pointer < JIS0208_LEVEL2_AND_ADDITIONAL_KANJI.len() {
                        handle.write_upper_bmp(JIS0208_LEVEL2_AND_ADDITIONAL_KANJI[level2_pointer])
                    } else {
                        // 8272 == 88 * 94.
                        let ibm_pointer = pointer.wrapping_sub(8272);
                        if ibm_pointer < IBM_KANJI.len() {
                            handle.write_upper_bmp(IBM_KANJI[ibm_pointer])
                        } else if let Some(bmp) = jis0208_symbol_decode(pointer) {
                            handle.write_bmp_excl_ascii(bmp)
                        } else if let Some(bmp) = jis0208_range_decode(pointer) {
                            handle.write_bmp_excl_ascii(bmp)
                        } else {
                            // In-range trail, but no table maps this pointer.
                            return (
                                DecoderResult::Malformed(2, 0),
                                unread_handle_trail.consumed(),
                                handle.written(),
                            );
                        }
                    }
                }
            }
        },
        {
            // Block 2: validates the JIS X 0212 lead in `lead` and evaluates
            // to the lead minus 0xA1.
            // If lead is between 0xA1 and 0xFE, inclusive,
            // subtract 0xA1.
            let jis0212_lead_minus_offset = lead.wrapping_sub(0xA1);
            if jis0212_lead_minus_offset > (0xFE - 0xA1) {
                // Invalid lead: an ASCII-range byte is left unread (one byte
                // malformed); anything else is consumed (two bytes malformed).
                if lead < 0x80 {
                    return (
                        DecoderResult::Malformed(1, 0),
                        unread_handle_jis0212.unread(),
                        handle.written(),
                    );
                }
                return (
                    DecoderResult::Malformed(2, 0),
                    unread_handle_jis0212.consumed(),
                    handle.written(),
                );
            }
            jis0212_lead_minus_offset
        },
        {
            // Block 3: `byte` is the trail of a JIS X 0212 sequence whose
            // lead is available as `jis0212_lead_minus_offset`.
            // If trail is between 0xA1 and 0xFE, inclusive,
            // subtract 0xA1.
            let trail_minus_offset = byte.wrapping_sub(0xA1);
            if trail_minus_offset > (0xFE - 0xA1) {
                // Invalid trail: the malformed lengths are one larger than
                // in the two-byte cases above (2 when the byte is left
                // unread, 3 when it is consumed).
                if byte < 0x80 {
                    return (
                        DecoderResult::Malformed(2, 0),
                        unread_handle_trail.unread(),
                        handle.written(),
                    );
                }
                return (
                    DecoderResult::Malformed(3, 0),
                    unread_handle_trail.consumed(),
                    handle.written(),
                );
            }
            let pointer = mul_94(jis0212_lead_minus_offset) + usize::from(trail_minus_offset);
            // 1410 == 15 * 94; same wrap-around bounds trick as for JIS X 0208.
            let pointer_minus_kanji = pointer.wrapping_sub(1410);
            if pointer_minus_kanji < JIS0212_KANJI.len() {
                handle.write_upper_bmp(JIS0212_KANJI[pointer_minus_kanji])
            } else if let Some(bmp) = jis0212_accented_decode(pointer) {
                handle.write_bmp_excl_ascii(bmp)
            } else {
                // Pointers 597..=607 map linearly onto U+0402..=U+040C.
                let pointer_minus_upper_cyrillic = pointer.wrapping_sub(597);
                if pointer_minus_upper_cyrillic <= (607 - 597) {
                    handle.write_mid_bmp(0x0402 + pointer_minus_upper_cyrillic as u16)
                } else {
                    // Pointers 645..=655 map linearly onto U+0452..=U+045C.
                    let pointer_minus_lower_cyrillic = pointer.wrapping_sub(645);
                    if pointer_minus_lower_cyrillic <= (655 - 645) {
                        handle.write_mid_bmp(0x0452 + pointer_minus_lower_cyrillic as u16)
                    } else {
                        // In-range trail, but nothing maps this pointer.
                        return (
                            DecoderResult::Malformed(3, 0),
                            unread_handle_trail.consumed(),
                            handle.written(),
                        );
                    }
                }
            }
        },
        {
            // Block 4: `byte` is the trail of a half-width Katakana sequence.
            // If trail is between 0xA1 and 0xDF, inclusive,
            // subtract 0xA1 and map to half-width Katakana.
            let trail_minus_offset = byte.wrapping_sub(0xA1);
            if trail_minus_offset > (0xDF - 0xA1) {
                // Invalid trail: an ASCII-range byte is left unread (one byte
                // malformed); anything else is consumed (two bytes malformed).
                if byte < 0x80 {
                    return (
                        DecoderResult::Malformed(1, 0),
                        unread_handle_trail.unread(),
                        handle.written(),
                    );
                }
                return (
                    DecoderResult::Malformed(2, 0),
                    unread_handle_trail.consumed(),
                    handle.written(),
                );
            }
            // Offsets 0..=0x3E map linearly onto U+FF61..=U+FF9F.
            handle.write_upper_bmp(0xFF61 + u16::from(trail_minus_offset))
        },
        // Identifiers made available to the blocks above.
        self,
        non_ascii,
        jis0208_lead_minus_offset,
        byte,
        unread_handle_trail,
        jis0212_lead_minus_offset,
        lead,
        unread_handle_jis0212,
        source,
        handle
    );
222 }
223 
/// Maps a BMP code point to an EUC-JP `(lead, trail)` byte pair by
/// delegating to `jis0208_kanji_euc_jp_encode`.
///
/// Compiled only when the `fast-kanji-encode` feature is enabled.
#[cfg(feature = "fast-kanji-encode")]
#[inline(always)]
fn encode_kanji(bmp: u16) -> Option<(u8, u8)> {
    jis0208_kanji_euc_jp_encode(bmp)
}
229 
230 #[cfg(not(feature = "fast-kanji-encode"))]
231 #[inline(always)]
encode_kanji(bmp: u16) -> Option<(u8, u8)>232 fn encode_kanji(bmp: u16) -> Option<(u8, u8)> {
233     if 0x4EDD == bmp {
234         // Ideograph on the symbol row!
235         Some((0xA1, 0xB8))
236     } else if let Some((lead, trail)) = jis0208_level1_kanji_euc_jp_encode(bmp) {
237         Some((lead, trail))
238     } else if let Some(pos) = jis0208_level2_and_additional_kanji_encode(bmp) {
239         let lead = (pos / 94) + 0xD0;
240         let trail = (pos % 94) + 0xA1;
241         Some((lead as u8, trail as u8))
242     } else if let Some(pos) = position(&IBM_KANJI[..], bmp) {
243         let lead = (pos / 94) + 0xF9;
244         let trail = (pos % 94) + 0xA1;
245         Some((lead as u8, trail as u8))
246     } else {
247         None
248     }
249 }
250 
/// Encoder for EUC-JP. A unit struct: it carries no state of its own.
pub struct EucJpEncoder;
252 
253 impl EucJpEncoder {
new(encoding: &'static Encoding) -> Encoder254     pub fn new(encoding: &'static Encoding) -> Encoder {
255         Encoder::new(encoding, VariantEncoder::EucJp(EucJpEncoder))
256     }
257 
    /// Upper bound on the encoded size for `u16_length` UTF-16 code units
    /// when unmappable characters are not replaced: twice the input length.
    ///
    /// Returns `None` if the multiplication overflows `usize`.
    pub fn max_buffer_length_from_utf16_without_replacement(
        &self,
        u16_length: usize,
    ) -> Option<usize> {
        u16_length.checked_mul(2)
    }
264 
    /// Upper bound on the encoded size for `byte_length` bytes of UTF-8
    /// input when unmappable characters are not replaced: the input length
    /// plus one.
    ///
    /// Returns `None` if the addition overflows `usize`.
    /// NOTE(review): the reason for the extra byte is not visible in this
    /// file — confirm against the encoder macro.
    pub fn max_buffer_length_from_utf8_without_replacement(
        &self,
        byte_length: usize,
    ) -> Option<usize> {
        byte_length.checked_add(1)
    }
271 
    // Invokes `ascii_compatible_bmp_encoder_functions!` (defined elsewhere)
    // with one code block that encodes the BMP code unit `bmp`, followed by
    // the identifiers the block refers to and further macro arguments.
    // NOTE(review): the macro body is not visible from this file; the
    // meaning of the trailing `copy_ascii_to_check_space_two`,
    // `check_space_two` and `false` arguments must be confirmed there.
    ascii_compatible_bmp_encoder_functions!(
        {
            // Lunde says 60% Hiragana, 30% Kanji, 10% Katakana
            // Wrapping subtraction lets one comparison test both bounds:
            // code units below U+3041 wrap to large values.
            let bmp_minus_hiragana = bmp.wrapping_sub(0x3041);
            if bmp_minus_hiragana < 0x53 {
                // Hiragana: lead 0xA4, trail linear from 0xA1.
                handle.write_two(0xA4, 0xA1 + bmp_minus_hiragana as u8)
            } else if in_inclusive_range16(bmp, 0x4E00, 0x9FA0) {
                // U+4E00..=U+9FA0 goes through `encode_kanji`; anything in
                // this range without a mapping is reported as unmappable.
                if let Some((lead, trail)) = encode_kanji(bmp) {
                    handle.write_two(lead, trail)
                } else {
                    return (
                        EncoderResult::unmappable_from_bmp(bmp),
                        source.consumed(),
                        handle.written(),
                    );
                }
            } else {
                let bmp_minus_katakana = bmp.wrapping_sub(0x30A1);
                if bmp_minus_katakana < 0x56 {
                    // Katakana: lead 0xA5, trail linear from 0xA1.
                    handle.write_two(0xA5, 0xA1 + bmp_minus_katakana as u8)
                } else {
                    let bmp_minus_space = bmp.wrapping_sub(0x3000);
                    if bmp_minus_space < 3 {
                        // fast-track common punctuation
                        // (U+3000..=U+3002 map to 0xA1 0xA1..=0xA1 0xA3)
                        handle.write_two(0xA1, 0xA1 + bmp_minus_space as u8)
                    } else if bmp == 0xA5 {
                        // U+00A5 is written as the single byte 0x5C.
                        handle.write_one(0x5Cu8)
                    } else if bmp == 0x203E {
                        // U+203E is written as the single byte 0x7E.
                        handle.write_one(0x7Eu8)
                    } else if in_inclusive_range16(bmp, 0xFF61, 0xFF9F) {
                        // Half-width Katakana: 0x8E followed by a trail that
                        // maps U+FF61..=U+FF9F linearly onto 0xA1..=0xDF.
                        handle.write_two(0x8Eu8, (bmp - (0xFF61 - 0xA1)) as u8)
                    } else if bmp == 0x2212 {
                        // U+2212 is special-cased to 0xA1 0xDD.
                        handle.write_two(0xA1u8, 0xDDu8)
                    } else if let Some(pointer) = jis0208_range_encode(bmp) {
                        // Split the linear pointer into row and cell; both
                        // are offset by 0xA1.
                        let lead = (pointer / 94) + 0xA1;
                        let trail = (pointer % 94) + 0xA1;
                        handle.write_two(lead as u8, trail as u8)
                    } else if in_inclusive_range16(bmp, 0xFA0E, 0xFA2D)
                        || bmp == 0xF929
                        || bmp == 0xF9DC
                    {
                        // Guaranteed to be found in IBM_KANJI
                        let pos = position(&IBM_KANJI[..], bmp).unwrap();
                        // Positions in IBM_KANJI start at lead 0xF9.
                        let lead = (pos / 94) + 0xF9;
                        let trail = (pos % 94) + 0xA1;
                        handle.write_two(lead as u8, trail as u8)
                    } else if let Some(pointer) = ibm_symbol_encode(bmp) {
                        let lead = (pointer / 94) + 0xA1;
                        let trail = (pointer % 94) + 0xA1;
                        handle.write_two(lead as u8, trail as u8)
                    } else if let Some(pointer) = jis0208_symbol_encode(bmp) {
                        let lead = (pointer / 94) + 0xA1;
                        let trail = (pointer % 94) + 0xA1;
                        handle.write_two(lead as u8, trail as u8)
                    } else {
                        // No table maps this code unit.
                        return (
                            EncoderResult::unmappable_from_bmp(bmp),
                            source.consumed(),
                            handle.written(),
                        );
                    }
                }
            }
        },
        bmp,
        self,
        source,
        handle,
        copy_ascii_to_check_space_two,
        check_space_two,
        false
    );
344 }
345 
346 // Any copyright to the test code below this comment is dedicated to the
347 // Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
348 
349 #[cfg(test)]
350 mod tests {
351     use super::super::testing::*;
352     use super::super::*;
353 
    /// Runs the shared `decode` test helper with the `EUC_JP` encoding:
    /// `bytes` is the input and `expect` the text it is expected to yield.
    fn decode_euc_jp(bytes: &[u8], expect: &str) {
        decode(EUC_JP, bytes, expect);
    }
357 
    /// Runs the shared `encode` test helper with the `EUC_JP` encoding:
    /// `string` is the input and `expect` the bytes it is expected to yield.
    fn encode_euc_jp(string: &str, expect: &[u8]) {
        encode(EUC_JP, string, expect);
    }
361 
362     #[test]
test_euc_jp_decode()363     fn test_euc_jp_decode() {
364         // Empty
365         decode_euc_jp(b"", &"");
366 
367         // ASCII
368         decode_euc_jp(b"\x61\x62", "\u{0061}\u{0062}");
369 
370         // Half-width
371         decode_euc_jp(b"\x8E\xA1", "\u{FF61}");
372         decode_euc_jp(b"\x8E\xDF", "\u{FF9F}");
373         decode_euc_jp(b"\x8E\xA0", "\u{FFFD}");
374         decode_euc_jp(b"\x8E\xE0", "\u{FFFD}");
375         decode_euc_jp(b"\x8E\xFF", "\u{FFFD}");
376         decode_euc_jp(b"\x8E", "\u{FFFD}");
377 
378         // JIS 0212
379         decode_euc_jp(b"\x8F\xA1\xA1", "\u{FFFD}");
380         decode_euc_jp(b"\x8F\xA2\xAF", "\u{02D8}");
381         decode_euc_jp(b"\x8F\xA2\xFF", "\u{FFFD}");
382         decode_euc_jp(b"\x8F\xA1", "\u{FFFD}");
383         decode_euc_jp(b"\x8F", "\u{FFFD}");
384 
385         // JIS 0208
386         decode_euc_jp(b"\xA1\xA1", "\u{3000}");
387         decode_euc_jp(b"\xA1\xA0", "\u{FFFD}");
388         decode_euc_jp(b"\xFC\xFE", "\u{FF02}");
389         decode_euc_jp(b"\xFE\xFE", "\u{FFFD}");
390         decode_euc_jp(b"\xA1", "\u{FFFD}");
391 
392         // Bad leads
393         decode_euc_jp(b"\xFF\xA1\xA1", "\u{FFFD}\u{3000}");
394         decode_euc_jp(b"\xA0\xA1\xA1", "\u{FFFD}\u{3000}");
395         decode_euc_jp(b"\x80\xA1\xA1", "\u{FFFD}\u{3000}");
396         decode_euc_jp(b"\x81\xA1\xA1", "\u{FFFD}\u{3000}");
397         decode_euc_jp(b"\x82\xA1\xA1", "\u{FFFD}\u{3000}");
398         decode_euc_jp(b"\x83\xA1\xA1", "\u{FFFD}\u{3000}");
399         decode_euc_jp(b"\x84\xA1\xA1", "\u{FFFD}\u{3000}");
400         decode_euc_jp(b"\x85\xA1\xA1", "\u{FFFD}\u{3000}");
401         decode_euc_jp(b"\x86\xA1\xA1", "\u{FFFD}\u{3000}");
402         decode_euc_jp(b"\x87\xA1\xA1", "\u{FFFD}\u{3000}");
403         decode_euc_jp(b"\x88\xA1\xA1", "\u{FFFD}\u{3000}");
404         decode_euc_jp(b"\x89\xA1\xA1", "\u{FFFD}\u{3000}");
405         decode_euc_jp(b"\x8A\xA1\xA1", "\u{FFFD}\u{3000}");
406         decode_euc_jp(b"\x8B\xA1\xA1", "\u{FFFD}\u{3000}");
407         decode_euc_jp(b"\x8C\xA1\xA1", "\u{FFFD}\u{3000}");
408         decode_euc_jp(b"\x8D\xA1\xA1", "\u{FFFD}\u{3000}");
409 
410         // Bad ASCII trail
411         decode_euc_jp(b"\xA1\x40", "\u{FFFD}\u{0040}");
412     }
413 
414     #[test]
test_euc_jp_encode()415     fn test_euc_jp_encode() {
416         // Empty
417         encode_euc_jp("", b"");
418 
419         // ASCII
420         encode_euc_jp("\u{0061}\u{0062}", b"\x61\x62");
421 
422         // Exceptional code points
423         encode_euc_jp("\u{00A5}", b"\x5C");
424         encode_euc_jp("\u{203E}", b"\x7E");
425         encode_euc_jp("\u{2212}", b"\xA1\xDD");
426 
427         // Half-width
428         encode_euc_jp("\u{FF61}", b"\x8E\xA1");
429         encode_euc_jp("\u{FF9F}", b"\x8E\xDF");
430 
431         // JIS 0212
432         encode_euc_jp("\u{02D8}", b"&#728;");
433 
434         // JIS 0208
435         encode_euc_jp("\u{3000}", b"\xA1\xA1");
436         encode_euc_jp("\u{FF02}", b"\xFC\xFE");
437     }
438 
439     #[test]
test_jis0208_decode_all()440     fn test_jis0208_decode_all() {
441         let input = include_bytes!("test_data/jis0208_in.txt");
442         let expectation = include_str!("test_data/jis0208_in_ref.txt");
443         let (cow, had_errors) = EUC_JP.decode_without_bom_handling(input);
444         assert!(had_errors, "Should have had errors.");
445         assert_eq!(&cow[..], expectation);
446     }
447 
448     #[test]
test_jis0208_encode_all()449     fn test_jis0208_encode_all() {
450         let input = include_str!("test_data/jis0208_out.txt");
451         let expectation = include_bytes!("test_data/jis0208_out_ref.txt");
452         let (cow, encoding, had_errors) = EUC_JP.encode(input);
453         assert!(!had_errors, "Should not have had errors.");
454         assert_eq!(encoding, EUC_JP);
455         assert_eq!(&cow[..], &expectation[..]);
456     }
457 
458     #[test]
test_jis0212_decode_all()459     fn test_jis0212_decode_all() {
460         let input = include_bytes!("test_data/jis0212_in.txt");
461         let expectation = include_str!("test_data/jis0212_in_ref.txt");
462         let (cow, had_errors) = EUC_JP.decode_without_bom_handling(input);
463         assert!(had_errors, "Should have had errors.");
464         assert_eq!(&cow[..], expectation);
465     }
466 }
467