1 // Copyright 2015-2016 Mozilla Foundation. See the COPYRIGHT
2 // file at the top-level directory of this distribution.
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
9 
10 use handles::*;
11 use data::*;
12 use variant::*;
13 use super::*;
14 // Rust 1.14.0 requires the following despite the asterisk above.
15 use super::in_inclusive_range16;
16 
/// Partial EUC-JP sequence that the decoder is currently holding on to.
///
/// The variant says which kind of multi-byte sequence has been started.
/// The `u8` payloads are whatever the decoder stored for the lead byte
/// (presumably the lead with `0xA1` already subtracted, judging by the
/// `*_lead_minus_offset` identifiers used in this file -- TODO confirm
/// against `euc_jp_decoder_functions!`).
enum EucJpPending {
    /// Nothing is pending.
    None,
    /// Lead byte of a two-byte JIS X 0208 sequence has been seen.
    Jis0208Lead(u8),
    /// The byte introducing a JIS X 0212 sequence has been seen
    /// (`0x8F` in the tests in this file), but no lead yet.
    Jis0212Shift,
    /// Both the JIS X 0212 introducer and its lead byte have been seen.
    Jis0212Lead(u8),
    /// The byte introducing half-width katakana has been seen
    /// (`0x8E` in the tests in this file).
    HalfWidthKatakana,
}

impl EucJpPending {
    /// Returns `true` only when the state is `EucJpPending::None`.
    fn is_none(&self) -> bool {
        if let EucJpPending::None = *self {
            true
        } else {
            false
        }
    }

    /// Returns how many bytes the state stands for: `0` for `None`,
    /// `2` for `Jis0212Lead`, and `1` for every other variant.
    fn count(&self) -> usize {
        match *self {
            EucJpPending::Jis0212Lead(_) => 2,
            EucJpPending::HalfWidthKatakana |
            EucJpPending::Jis0212Shift |
            EucJpPending::Jis0208Lead(_) => 1,
            EucJpPending::None => 0,
        }
    }
}
43 
/// Decoder state for EUC-JP.
///
/// Instances are created by `EucJpDecoder::new()`, which hands the decoder
/// back wrapped in `VariantDecoder::EucJp`.
pub struct EucJpDecoder {
    /// The partial multi-byte sequence held by the decoder, or
    /// `EucJpPending::None` when there is none.
    pending: EucJpPending,
}
47 
48 impl EucJpDecoder {
new() -> VariantDecoder49     pub fn new() -> VariantDecoder {
50         VariantDecoder::EucJp(EucJpDecoder { pending: EucJpPending::None })
51     }
52 
plus_one_if_lead(&self, byte_length: usize) -> Option<usize>53     fn plus_one_if_lead(&self, byte_length: usize) -> Option<usize> {
54         byte_length.checked_add(if self.pending.is_none() { 0 } else { 1 })
55     }
56 
    /// Upper bound on the UTF-16 output length for `byte_length` input bytes.
    ///
    /// Computed as `byte_length`, plus one when a partial sequence is
    /// pending (see `plus_one_if_lead`). Returns `None` on `usize` overflow.
    pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
        self.plus_one_if_lead(byte_length)
    }
60 
max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize>61     pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
62         // worst case: 2 to 3
63         let len = self.plus_one_if_lead(byte_length);
64         checked_add(2, checked_add_opt(len, checked_div(checked_add(1, len), 2)))
65     }
66 
max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize>67     pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
68         checked_mul(3, self.plus_one_if_lead(byte_length))
69     }
70 
    // Expands `euc_jp_decoder_functions!` (defined elsewhere) with four code
    // blocks followed by the identifiers those blocks refer to:
    //   1. handling of the trail byte of a two-byte JIS X 0208 sequence,
    //   2. validation of the lead byte of a JIS X 0212 sequence,
    //   3. handling of the trail byte of a JIS X 0212 sequence,
    //   4. handling of the trail byte of a half-width katakana sequence.
    // On error each block returns a tuple of
    // `(DecoderResult::Malformed(..), <unread handle result>, handle.written())`.
    // NOTE(review): the first `Malformed` argument looks like the length of
    // the malformed sequence and the second like the number of bytes consumed
    // after it -- confirm against the `DecoderResult` definition.
    euc_jp_decoder_functions!(
        {
            // Block 1: JIS X 0208 trail byte.
            // Trail bytes below 0xA1 wrap around to large values, so a single
            // unsigned comparison below rejects everything outside 0xA1..=0xFE.
            let trail_minus_offset = byte.wrapping_sub(0xA1);
            // Fast-track Hiragana (60% according to Lunde)
            // and Katakana (10% according to Lunde).
            if jis0208_lead_minus_offset == 0x03 && trail_minus_offset < 0x53 {
                // Hiragana
                handle.write_upper_bmp(0x3041 + trail_minus_offset as u16)
            } else if jis0208_lead_minus_offset == 0x04 && trail_minus_offset < 0x56 {
                // Katakana
                handle.write_upper_bmp(0x30A1 + trail_minus_offset as u16)
            } else if trail_minus_offset > (0xFE - 0xA1) {
                // Trail byte out of range.
                if byte < 0x80 {
                    // An ASCII byte is unread rather than consumed, so only
                    // the lead counts as malformed (the "Bad ASCII trail"
                    // test expects U+FFFD followed by the ASCII character).
                    return (DecoderResult::Malformed(1, 0),
                            unread_handle_trail.unread(),
                            handle.written());
                }
                // A non-ASCII bad trail is consumed together with the lead.
                return (DecoderResult::Malformed(2, 0),
                        unread_handle_trail.consumed(),
                        handle.written());
            } else {
                // Linear index into the 94-column table.
                let pointer = mul_94(jis0208_lead_minus_offset) + trail_minus_offset as usize;
                // Each lookup subtracts the table's first pointer with
                // wrapping, so pointers before the table fail the length check.
                // 1410 = 15 * 94, i.e. the row whose lead byte is 0xB0.
                let level1_pointer = pointer.wrapping_sub(1410);
                if level1_pointer < JIS0208_LEVEL1_KANJI.len() {
                    handle.write_upper_bmp(JIS0208_LEVEL1_KANJI[level1_pointer])
                } else {
                    // 4418 = 47 * 94, i.e. the row whose lead byte is 0xD0.
                    let level2_pointer = pointer.wrapping_sub(4418);
                    if level2_pointer < JIS0208_LEVEL2_AND_ADDITIONAL_KANJI.len() {
                        handle.write_upper_bmp(JIS0208_LEVEL2_AND_ADDITIONAL_KANJI[level2_pointer])
                    } else {
                        // 8272 = 88 * 94, i.e. the row whose lead byte is 0xF9.
                        let ibm_pointer = pointer.wrapping_sub(8272);
                        if ibm_pointer < IBM_KANJI.len() {
                            handle.write_upper_bmp(IBM_KANJI[ibm_pointer])
                        } else if let Some(bmp) = jis0208_symbol_decode(pointer) {
                            handle.write_bmp_excl_ascii(bmp)
                        } else if let Some(bmp) = jis0208_range_decode(pointer) {
                            handle.write_bmp_excl_ascii(bmp)
                        } else {
                            // In-range bytes, but nothing is mapped at this pointer.
                            return (DecoderResult::Malformed(2, 0),
                                    unread_handle_trail.consumed(),
                                    handle.written());
                        }
                    }
                }
            }
        },
        {
            // Block 2: JIS X 0212 lead byte; evaluates to the lead minus 0xA1.
            // If lead is between 0xA1 and 0xFE, inclusive,
            // subtract 0xA1.
            let jis0212_lead_minus_offset = lead.wrapping_sub(0xA1);
            if jis0212_lead_minus_offset > (0xFE - 0xA1) {
                if lead < 0x80 {
                    // ASCII after the introducer: unread it; only the
                    // introducer counts as malformed.
                    return (DecoderResult::Malformed(1, 0),
                            unread_handle_jis0212.unread(),
                            handle.written());
                }
                // Non-ASCII bad lead: consumed together with the introducer.
                return (DecoderResult::Malformed(2, 0),
                        unread_handle_jis0212.consumed(),
                        handle.written());
            }
            jis0212_lead_minus_offset
        },
        {
            // Block 3: JIS X 0212 trail byte.
            // If trail is between 0xA1 and 0xFE, inclusive,
            // subtract 0xA1.
            let trail_minus_offset = byte.wrapping_sub(0xA1);
            if trail_minus_offset > (0xFE - 0xA1) {
                if byte < 0x80 {
                    // ASCII trail is unread; introducer and lead are malformed.
                    return (DecoderResult::Malformed(2, 0),
                            unread_handle_trail.unread(),
                            handle.written());
                }
                // Non-ASCII bad trail: all three bytes are malformed.
                return (DecoderResult::Malformed(3, 0),
                        unread_handle_trail.consumed(),
                        handle.written());
            }
            let pointer = mul_94(jis0212_lead_minus_offset) + trail_minus_offset as usize;
            // Same wrapping-subtract range check as for JIS X 0208;
            // 1410 = 15 * 94.
            let pointer_minus_kanji = pointer.wrapping_sub(1410);
            if pointer_minus_kanji < JIS0212_KANJI.len() {
                handle.write_upper_bmp(JIS0212_KANJI[pointer_minus_kanji])
            } else if let Some(bmp) = jis0212_accented_decode(pointer) {
                handle.write_bmp_excl_ascii(bmp)
            } else {
                // Pointers 597..=607 map linearly to U+0402..=U+040C.
                let pointer_minus_upper_cyrillic = pointer.wrapping_sub(597);
                if pointer_minus_upper_cyrillic <= (607 - 597) {
                    handle.write_mid_bmp(0x0402 + pointer_minus_upper_cyrillic as u16)
                } else {
                    // Pointers 645..=655 map linearly to U+0452..=U+045C.
                    let pointer_minus_lower_cyrillic = pointer.wrapping_sub(645);
                    if pointer_minus_lower_cyrillic <= (655 - 645) {
                        handle.write_mid_bmp(0x0452 + pointer_minus_lower_cyrillic as u16)
                    } else {
                        // In-range bytes, but nothing is mapped at this pointer.
                        return (DecoderResult::Malformed(3, 0),
                                unread_handle_trail.consumed(),
                                handle.written());
                    }
                }
            }
        },
        {
            // Block 4: half-width katakana trail byte.
            // If trail is between 0xA1 and 0xDF, inclusive,
            // subtract 0xA1 and map to half-width Katakana.
            let trail_minus_offset = byte.wrapping_sub(0xA1);
            if trail_minus_offset > (0xDF - 0xA1) {
                if byte < 0x80 {
                    // ASCII trail is unread; only the introducer is malformed.
                    return (DecoderResult::Malformed(1, 0),
                            unread_handle_trail.unread(),
                            handle.written());
                }
                return (DecoderResult::Malformed(2, 0),
                        unread_handle_trail.consumed(),
                        handle.written());
            }
            // 0xA1..=0xDF maps linearly to U+FF61..=U+FF9F.
            handle.write_upper_bmp(0xFF61 + trail_minus_offset as u16)
        },
        // Identifiers used by the blocks above, handed to the macro by name
        // (presumably so macro hygiene lets the blocks see the bindings the
        // macro creates -- confirm against the macro definition).
        self,
        non_ascii,
        jis0208_lead_minus_offset,
        byte,
        unread_handle_trail,
        jis0212_lead_minus_offset,
        lead,
        unread_handle_jis0212,
        source,
        handle
    );
196 }
197 
/// Encoder for EUC-JP. A unit struct: it has no fields.
///
/// `EucJpEncoder::new()` returns it wrapped in `VariantEncoder::EucJp`
/// inside an `Encoder`.
pub struct EucJpEncoder;
199 
200 impl EucJpEncoder {
new(encoding: &'static Encoding) -> Encoder201     pub fn new(encoding: &'static Encoding) -> Encoder {
202         Encoder::new(encoding, VariantEncoder::EucJp(EucJpEncoder))
203     }
204 
max_buffer_length_from_utf16_without_replacement(&self, u16_length: usize) -> Option<usize>205     pub fn max_buffer_length_from_utf16_without_replacement(&self,
206                                                             u16_length: usize)
207                                                             -> Option<usize> {
208         u16_length.checked_mul(2)
209     }
210 
max_buffer_length_from_utf8_without_replacement(&self, byte_length: usize) -> Option<usize>211     pub fn max_buffer_length_from_utf8_without_replacement(&self,
212                                                            byte_length: usize)
213                                                            -> Option<usize> {
214         byte_length.checked_add(1)
215     }
216 
    // Expands `ascii_compatible_bmp_encoder_functions!` (defined elsewhere)
    // with one code block that encodes a single non-ASCII BMP code unit
    // `bmp`, followed by the identifiers and options the macro needs.
    // The block either writes one or two bytes through `handle` or returns
    // an unmappable result. Branch order matters: the cheap, most frequent
    // ranges are tested before the table lookups.
    ascii_compatible_bmp_encoder_functions!(
        {
            // Lunde says 60% Hiragana, 30% Kanji, 10% Katakana
            // Wrapping subtraction turns the range test into a single
            // unsigned comparison: U+3041..=U+3093 goes to lead byte 0xA4.
            let bmp_minus_hiragana = bmp.wrapping_sub(0x3041);
            if bmp_minus_hiragana < 0x53 {
                handle.write_two(0xA4, 0xA1 + bmp_minus_hiragana as u8)
            } else if in_inclusive_range16(bmp, 0x4E00, 0x9FA0) {
                // Kanji range: try each kanji table in turn.
                if 0x4EDD == bmp {
                    // Ideograph on the symbol row!
                    handle.write_two(0xA1, 0xB8)
                } else if let Some((lead, trail)) = jis0208_level1_kanji_euc_jp_encode(bmp) {
                    handle.write_two(lead, trail)
                } else if let Some(pos) = jis0208_level2_and_additional_kanji_encode(bmp) {
                    // Table position to bytes: 94 cells per row, rows
                    // starting at lead byte 0xD0.
                    let lead = (pos / 94) + 0xD0;
                    let trail = (pos % 94) + 0xA1;
                    handle.write_two(lead as u8, trail as u8)
                } else if let Some(pos) = position(&IBM_KANJI[..], bmp) {
                    // IBM kanji rows start at lead byte 0xF9.
                    let lead = (pos / 94) + 0xF9;
                    let trail = (pos % 94) + 0xA1;
                    handle.write_two(lead as u8, trail as u8)
                } else {
                    return (EncoderResult::unmappable_from_bmp(bmp),
                            source.consumed(),
                            handle.written());
                }
            } else {
                // Katakana U+30A1..=U+30F6 goes to lead byte 0xA5.
                let bmp_minus_katakana = bmp.wrapping_sub(0x30A1);
                if bmp_minus_katakana < 0x56 {
                    handle.write_two(0xA5, 0xA1 + bmp_minus_katakana as u8)
                } else {
                    let bmp_minus_space = bmp.wrapping_sub(0x3000);
                    if bmp_minus_space < 3 {
                        // fast-track common punctuation
                        handle.write_two(0xA1, 0xA1 + bmp_minus_space as u8)
                    } else if bmp == 0xA5 {
                        // U+00A5 is written as the single byte 0x5C.
                        handle.write_one(0x5Cu8)
                    } else if bmp == 0x203E {
                        // U+203E is written as the single byte 0x7E.
                        handle.write_one(0x7Eu8)
                    } else if in_inclusive_range16(bmp, 0xFF61, 0xFF9F) {
                        // Half-width katakana: 0x8E followed by 0xA1..=0xDF.
                        handle.write_two(0x8Eu8, (bmp - (0xFF61 - 0xA1)) as u8)
                    } else if bmp == 0x2212 {
                        // U+2212 has a fixed two-byte encoding.
                        handle.write_two(0xA1u8, 0xDDu8)
                    } else if let Some(pointer) = jis0208_range_encode(bmp) {
                        // Pointer to bytes: 94 cells per row, both bytes
                        // offset by 0xA1.
                        let lead = (pointer / 94) + 0xA1;
                        let trail = (pointer % 94) + 0xA1;
                        handle.write_two(lead as u8, trail as u8)
                    } else if in_inclusive_range16(bmp, 0xFA0E, 0xFA2D) || bmp == 0xF929 ||
                              bmp == 0xF9DC {
                        // Guaranteed to be found in IBM_KANJI
                        let pos = position(&IBM_KANJI[..], bmp).unwrap();
                        let lead = (pos / 94) + 0xF9;
                        let trail = (pos % 94) + 0xA1;
                        handle.write_two(lead as u8, trail as u8)
                    } else if let Some(pointer) = ibm_symbol_encode(bmp) {
                        let lead = (pointer / 94) + 0xA1;
                        let trail = (pointer % 94) + 0xA1;
                        handle.write_two(lead as u8, trail as u8)
                    } else if let Some(pointer) = jis0208_symbol_encode(bmp) {
                        let lead = (pointer / 94) + 0xA1;
                        let trail = (pointer % 94) + 0xA1;
                        handle.write_two(lead as u8, trail as u8)
                    } else {
                        // No table contains this code unit.
                        return (EncoderResult::unmappable_from_bmp(bmp),
                                source.consumed(),
                                handle.written());
                    }
                }
            }
        },
        // Identifiers used by the block above, then macro options.
        // NOTE(review): the meaning of the two `*check_space_two` names and
        // the trailing `false` is defined by the macro -- confirm there.
        bmp,
        self,
        source,
        handle,
        copy_ascii_to_check_space_two,
        check_space_two,
        false
    );
294 }
295 
296 // Any copyright to the test code below this comment is dedicated to the
297 // Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
298 
299 #[cfg(test)]
300 mod tests {
301     use super::super::testing::*;
302     use super::super::*;
303 
    /// Runs the shared `decode` test helper with `EUC_JP`, passing it
    /// `bytes` as input and `expect` as the expected result.
    fn decode_euc_jp(bytes: &[u8], expect: &str) {
        decode(EUC_JP, bytes, expect);
    }
307 
    /// Runs the shared `encode` test helper with `EUC_JP`, passing it
    /// `string` as input and `expect` as the expected result.
    fn encode_euc_jp(string: &str, expect: &[u8]) {
        encode(EUC_JP, string, expect);
    }
311 
312     #[test]
test_euc_jp_decode()313     fn test_euc_jp_decode() {
314         // Empty
315         decode_euc_jp(b"", &"");
316 
317         // ASCII
318         decode_euc_jp(b"\x61\x62", "\u{0061}\u{0062}");
319 
320         // Half-width
321         decode_euc_jp(b"\x8E\xA1", "\u{FF61}");
322         decode_euc_jp(b"\x8E\xDF", "\u{FF9F}");
323         decode_euc_jp(b"\x8E\xA0", "\u{FFFD}");
324         decode_euc_jp(b"\x8E\xE0", "\u{FFFD}");
325         decode_euc_jp(b"\x8E\xFF", "\u{FFFD}");
326         decode_euc_jp(b"\x8E", "\u{FFFD}");
327 
328         // JIS 0212
329         decode_euc_jp(b"\x8F\xA1\xA1", "\u{FFFD}");
330         decode_euc_jp(b"\x8F\xA2\xAF", "\u{02D8}");
331         decode_euc_jp(b"\x8F\xA2\xFF", "\u{FFFD}");
332         decode_euc_jp(b"\x8F\xA1", "\u{FFFD}");
333         decode_euc_jp(b"\x8F", "\u{FFFD}");
334 
335         // JIS 0208
336         decode_euc_jp(b"\xA1\xA1", "\u{3000}");
337         decode_euc_jp(b"\xA1\xA0", "\u{FFFD}");
338         decode_euc_jp(b"\xFC\xFE", "\u{FF02}");
339         decode_euc_jp(b"\xFE\xFE", "\u{FFFD}");
340         decode_euc_jp(b"\xA1", "\u{FFFD}");
341 
342         // Bad leads
343         decode_euc_jp(b"\xFF\xA1\xA1", "\u{FFFD}\u{3000}");
344         decode_euc_jp(b"\xA0\xA1\xA1", "\u{FFFD}\u{3000}");
345         decode_euc_jp(b"\x80\xA1\xA1", "\u{FFFD}\u{3000}");
346         decode_euc_jp(b"\x81\xA1\xA1", "\u{FFFD}\u{3000}");
347         decode_euc_jp(b"\x82\xA1\xA1", "\u{FFFD}\u{3000}");
348         decode_euc_jp(b"\x83\xA1\xA1", "\u{FFFD}\u{3000}");
349         decode_euc_jp(b"\x84\xA1\xA1", "\u{FFFD}\u{3000}");
350         decode_euc_jp(b"\x85\xA1\xA1", "\u{FFFD}\u{3000}");
351         decode_euc_jp(b"\x86\xA1\xA1", "\u{FFFD}\u{3000}");
352         decode_euc_jp(b"\x87\xA1\xA1", "\u{FFFD}\u{3000}");
353         decode_euc_jp(b"\x88\xA1\xA1", "\u{FFFD}\u{3000}");
354         decode_euc_jp(b"\x89\xA1\xA1", "\u{FFFD}\u{3000}");
355         decode_euc_jp(b"\x8A\xA1\xA1", "\u{FFFD}\u{3000}");
356         decode_euc_jp(b"\x8B\xA1\xA1", "\u{FFFD}\u{3000}");
357         decode_euc_jp(b"\x8C\xA1\xA1", "\u{FFFD}\u{3000}");
358         decode_euc_jp(b"\x8D\xA1\xA1", "\u{FFFD}\u{3000}");
359 
360         // Bad ASCII trail
361         decode_euc_jp(b"\xA1\x40", "\u{FFFD}\u{0040}");
362     }
363 
364     #[test]
test_euc_jp_encode()365     fn test_euc_jp_encode() {
366         // Empty
367         encode_euc_jp("", b"");
368 
369         // ASCII
370         encode_euc_jp("\u{0061}\u{0062}", b"\x61\x62");
371 
372         // Exceptional code points
373         encode_euc_jp("\u{00A5}", b"\x5C");
374         encode_euc_jp("\u{203E}", b"\x7E");
375         encode_euc_jp("\u{2212}", b"\xA1\xDD");
376 
377         // Half-width
378         encode_euc_jp("\u{FF61}", b"\x8E\xA1");
379         encode_euc_jp("\u{FF9F}", b"\x8E\xDF");
380 
381         // JIS 0212
382         encode_euc_jp("\u{02D8}", b"&#728;");
383 
384         // JIS 0208
385         encode_euc_jp("\u{3000}", b"\xA1\xA1");
386         encode_euc_jp("\u{FF02}", b"\xFC\xFE");
387     }
388 
389     #[test]
test_jis0208_decode_all()390     fn test_jis0208_decode_all() {
391         let input = include_bytes!("test_data/jis0208_in.txt");
392         let expectation = include_str!("test_data/jis0208_in_ref.txt");
393         let (cow, had_errors) = EUC_JP.decode_without_bom_handling(input);
394         assert!(had_errors, "Should have had errors.");
395         assert_eq!(&cow[..], expectation);
396     }
397 
398     #[test]
test_jis0208_encode_all()399     fn test_jis0208_encode_all() {
400         let input = include_str!("test_data/jis0208_out.txt");
401         let expectation = include_bytes!("test_data/jis0208_out_ref.txt");
402         let (cow, encoding, had_errors) = EUC_JP.encode(input);
403         assert!(!had_errors, "Should not have had errors.");
404         assert_eq!(encoding, EUC_JP);
405         assert_eq!(&cow[..], &expectation[..]);
406     }
407 
408     #[test]
test_jis0212_decode_all()409     fn test_jis0212_decode_all() {
410         let input = include_bytes!("test_data/jis0212_in.txt");
411         let expectation = include_str!("test_data/jis0212_in_ref.txt");
412         let (cow, had_errors) = EUC_JP.decode_without_bom_handling(input);
413         assert!(had_errors, "Should have had errors.");
414         assert_eq!(&cow[..], expectation);
415     }
416 }
417