// Copyright Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
9 
10 use super::*;
11 use crate::data::*;
12 use crate::handles::*;
13 use crate::variant::*;
14 // Rust 1.14.0 requires the following despite the asterisk above.
15 use super::in_inclusive_range;
16 use super::in_inclusive_range16;
17 
/// Streaming decoder state for Shift_JIS.
///
/// The only state carried between calls is an optional pending lead byte;
/// the decoder is in its neutral state exactly when `lead` is `None`.
pub struct ShiftJisDecoder {
    /// Lead of a two-byte sequence whose trail byte has not been seen yet.
    ///
    /// NOTE(review): this field is written by the macro-generated decode
    /// functions, which are not visible in this file. Presumably it holds
    /// the offset-adjusted lead rather than the raw byte -- confirm there.
    lead: Option<u8>,
}
21 
22 impl ShiftJisDecoder {
new() -> VariantDecoder23     pub fn new() -> VariantDecoder {
24         VariantDecoder::ShiftJis(ShiftJisDecoder { lead: None })
25     }
26 
in_neutral_state(&self) -> bool27     pub fn in_neutral_state(&self) -> bool {
28         self.lead.is_none()
29     }
30 
plus_one_if_lead(&self, byte_length: usize) -> Option<usize>31     fn plus_one_if_lead(&self, byte_length: usize) -> Option<usize> {
32         byte_length.checked_add(match self.lead {
33             None => 0,
34             Some(_) => 1,
35         })
36     }
37 
max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize>38     pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
39         self.plus_one_if_lead(byte_length)
40     }
41 
max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize>42     pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
43         // worst case: 1 to 3 (half-width katakana)
44         self.max_utf8_buffer_length(byte_length)
45     }
46 
max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize>47     pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
48         checked_mul(3, self.plus_one_if_lead(byte_length))
49     }
50 
51     ascii_compatible_two_byte_decoder_functions!(
52         {
53            // If lead is between 0x81 and 0x9F, inclusive,
54            // subtract offset 0x81. Else if lead is
55            // between 0xE0 and 0xFC, inclusive, subtract
56            // offset 0xC1. Else if lead is between
57            // 0xA1 and 0xDF, inclusive, map to half-width
58            // Katakana. Else if lead is 0x80, pass through.
59             let mut non_ascii_minus_offset =
60                 non_ascii.wrapping_sub(0x81);
61             if non_ascii_minus_offset > (0x9F - 0x81) {
62                 let non_ascii_minus_range_start = non_ascii.wrapping_sub(0xE0);
63                 if non_ascii_minus_range_start > (0xFC - 0xE0) {
64                     let non_ascii_minus_half_with_katakana_start = non_ascii.wrapping_sub(0xA1);
65                     if non_ascii_minus_half_with_katakana_start > (0xDF - 0xA1) {
66                         if non_ascii == 0x80 {
67                             handle.write_mid_bmp(0x80);
68                             // Not caring about optimizing subsequent non-ASCII
69                             continue 'outermost;
70                         }
71                         return (DecoderResult::Malformed(1, 0),
72                                 source.consumed(),
73                                 handle.written());
74                     }
75                     handle.write_upper_bmp(0xFF61 + u16::from(non_ascii_minus_half_with_katakana_start));
76                     // Not caring about optimizing subsequent non-ASCII
77                     continue 'outermost;
78                 }
79                 non_ascii_minus_offset = non_ascii - 0xC1;
80             }
81             non_ascii_minus_offset
82         },
83         {
84             // If trail is between 0x40 and 0x7E, inclusive,
85             // subtract offset 0x40. Else if trail is
86             // between 0x80 and 0xFC, inclusive, subtract
87             // offset 0x41.
88             // Fast-track Hiragana (60% according to Lunde)
89             // and Katakana (10% acconding to Lunde).
90             // Hiragana doesn't cross 0x7F, but Katakana does.
91             // We can check for Hiragana before normalizing
92             // trail.
93             let trail_minus_hiragana = byte.wrapping_sub(0x9F);
94             if lead_minus_offset == 0x01 && trail_minus_hiragana < 0x53 {
95             // Hiragana
96                 handle.write_upper_bmp(0x3041 + u16::from(trail_minus_hiragana))
97             } else {
98                 let mut trail_minus_offset =
99                     byte.wrapping_sub(0x40);
100                 if trail_minus_offset > (0x7E - 0x40) {
101                     let trail_minus_range_start =
102                         byte.wrapping_sub(0x80);
103                     if trail_minus_range_start > (0xFC - 0x80) {
104                         if byte < 0x80 {
105                             return (DecoderResult::Malformed(1, 0),
106                                     unread_handle_trail.unread(),
107                                     handle.written());
108                         }
109                         return (DecoderResult::Malformed(2, 0),
110                                 unread_handle_trail.consumed(),
111                                 handle.written());
112                     }
113                     trail_minus_offset = byte - 0x41;
114                 }
115                 if lead_minus_offset == 0x02 &&
116                    trail_minus_offset < 0x56 {
117                     // Katakana
118                     handle.write_upper_bmp(0x30A1 + u16::from(trail_minus_offset))
119                 } else {
120                     let pointer = lead_minus_offset as usize *
121                                   188usize +
122                                   trail_minus_offset as usize;
123                     let level1_pointer = pointer.wrapping_sub(1410);
124                     if level1_pointer < JIS0208_LEVEL1_KANJI.len() {
125                         handle.write_upper_bmp(JIS0208_LEVEL1_KANJI[level1_pointer])
126                     } else {
127                         let level2_pointer = pointer.wrapping_sub(4418);
128                         if level2_pointer <
129                            JIS0208_LEVEL2_AND_ADDITIONAL_KANJI.len() {
130                             handle.write_upper_bmp(JIS0208_LEVEL2_AND_ADDITIONAL_KANJI[level2_pointer])
131                         } else {
132                             let upper_ibm_pointer = pointer.wrapping_sub(10744);
133                             if upper_ibm_pointer < IBM_KANJI.len() {
134                                 handle.write_upper_bmp(IBM_KANJI[upper_ibm_pointer])
135                             } else {
136                                 let lower_ibm_pointer = pointer.wrapping_sub(8272);
137                                 if lower_ibm_pointer < IBM_KANJI.len() {
138                                     handle.write_upper_bmp(IBM_KANJI[lower_ibm_pointer])
139                                 } else if in_inclusive_range(pointer, 8836, 10715) {
140                                     handle.write_upper_bmp((0xE000 - 8836 + pointer) as u16)
141                                 } else if let Some(bmp) = jis0208_symbol_decode(pointer) {
142                                     handle.write_bmp_excl_ascii(bmp)
143                                 } else if let Some(bmp) = jis0208_range_decode(pointer) {
144                                     handle.write_bmp_excl_ascii(bmp)
145                                 } else {
146                                     if byte < 0x80 {
147                                         return (DecoderResult::Malformed(1, 0),
148                                                 unread_handle_trail.unread(),
149                                                 handle.written());
150                                     }
151                                     return (DecoderResult::Malformed(2, 0),
152                                             unread_handle_trail.consumed(),
153                                             handle.written());
154                                 }
155                             }
156                         }
157                     }
158                 }
159             }
160         },
161         self,
162         non_ascii,
163         byte,
164         lead_minus_offset,
165         unread_handle_trail,
166         source,
167         handle,
168         'outermost,
169         copy_ascii_from_check_space_bmp,
170         check_space_bmp,
171         false);
172 }
173 
/// Maps a BMP code point to its Shift_JIS `(lead, trail)` byte pair, or
/// returns `None` when the lookup does not know the code point.
///
/// This variant is compiled when the `fast-kanji-encode` feature is enabled
/// and forwards to `jis0208_kanji_shift_jis_encode`.
#[cfg(feature = "fast-kanji-encode")]
#[inline(always)]
fn encode_kanji(bmp: u16) -> Option<(u8, u8)> {
    jis0208_kanji_shift_jis_encode(bmp)
}
179 
180 #[cfg(not(feature = "fast-kanji-encode"))]
181 #[inline(always)]
encode_kanji(bmp: u16) -> Option<(u8, u8)>182 fn encode_kanji(bmp: u16) -> Option<(u8, u8)> {
183     if let Some((lead, trail)) = jis0208_level1_kanji_shift_jis_encode(bmp) {
184         return Some((lead, trail));
185     }
186     let pointer = if 0x4EDD == bmp {
187         // Ideograph on the symbol row!
188         23
189     } else if let Some(pos) = jis0208_level2_and_additional_kanji_encode(bmp) {
190         4418 + pos
191     } else if let Some(pos) = position(&IBM_KANJI[..], bmp) {
192         10744 + pos
193     } else {
194         return None;
195     };
196     let lead = pointer / 188;
197     let lead_offset = if lead < 0x1F { 0x81usize } else { 0xC1usize };
198     let trail = pointer % 188;
199     let trail_offset = if trail < 0x3F { 0x40usize } else { 0x41usize };
200     Some(((lead + lead_offset) as u8, (trail + trail_offset) as u8))
201 }
202 
/// Encoder for Shift_JIS. A unit struct: it carries no state between calls.
pub struct ShiftJisEncoder;
204 
205 impl ShiftJisEncoder {
new(encoding: &'static Encoding) -> Encoder206     pub fn new(encoding: &'static Encoding) -> Encoder {
207         Encoder::new(encoding, VariantEncoder::ShiftJis(ShiftJisEncoder))
208     }
209 
max_buffer_length_from_utf16_without_replacement( &self, u16_length: usize, ) -> Option<usize>210     pub fn max_buffer_length_from_utf16_without_replacement(
211         &self,
212         u16_length: usize,
213     ) -> Option<usize> {
214         u16_length.checked_mul(2)
215     }
216 
max_buffer_length_from_utf8_without_replacement( &self, byte_length: usize, ) -> Option<usize>217     pub fn max_buffer_length_from_utf8_without_replacement(
218         &self,
219         byte_length: usize,
220     ) -> Option<usize> {
221         byte_length.checked_add(1)
222     }
223 
224     ascii_compatible_bmp_encoder_functions!(
225         {
226             // Lunde says 60% Hiragana, 30% Kanji, 10% Katakana
227             let bmp_minus_hiragana = bmp.wrapping_sub(0x3041);
228             if bmp_minus_hiragana < 0x53 {
229                 handle.write_two(0x82, 0x9F + bmp_minus_hiragana as u8)
230             } else if in_inclusive_range16(bmp, 0x4E00, 0x9FA0) {
231                 if let Some((lead, trail)) = encode_kanji(bmp) {
232                     handle.write_two(lead, trail)
233                 } else {
234                     return (
235                         EncoderResult::unmappable_from_bmp(bmp),
236                         source.consumed(),
237                         handle.written(),
238                     );
239                 }
240             } else {
241                 let bmp_minus_katakana = bmp.wrapping_sub(0x30A1);
242                 if bmp_minus_katakana < 0x56 {
243                     let trail_offset = if bmp_minus_katakana < 0x3F {
244                         0x40
245                     } else {
246                         0x41
247                     };
248                     handle.write_two(0x83, (trail_offset + bmp_minus_katakana) as u8)
249                 } else {
250                     let bmp_minus_space = bmp.wrapping_sub(0x3000);
251                     if bmp_minus_space < 3 {
252                         // fast-track common punctuation
253                         handle.write_two(0x81, 0x40 + bmp_minus_space as u8)
254                     } else if bmp == 0xA5 {
255                         handle.write_one(0x5Cu8)
256                     } else if bmp == 0x80 {
257                         handle.write_one(0x80u8)
258                     } else if bmp == 0x203E {
259                         handle.write_one(0x7Eu8)
260                     } else if in_inclusive_range16(bmp, 0xFF61, 0xFF9F) {
261                         handle.write_one((bmp - (0xFF61 - 0xA1)) as u8)
262                     } else if bmp == 0x2212 {
263                         handle.write_two(0x81u8, 0x7Cu8)
264                     } else {
265                         let bmp_minus_roman = bmp.wrapping_sub(0x2170);
266                         let pointer = if bmp_minus_roman <= (0x2179 - 0x2170) {
267                             10716 + bmp_minus_roman as usize
268                         } else if let Some(pointer) = jis0208_range_encode(bmp) {
269                             pointer
270                         } else if in_inclusive_range16(bmp, 0xFA0E, 0xFA2D)
271                             || bmp == 0xF929
272                             || bmp == 0xF9DC
273                         {
274                             // Guaranteed to be found in IBM_KANJI
275                             let pos = position(&IBM_KANJI[..], bmp).unwrap();
276                             10744 + pos
277                         } else if let Some(pointer) = jis0208_symbol_encode(bmp) {
278                             pointer
279                         } else {
280                             return (
281                                 EncoderResult::unmappable_from_bmp(bmp),
282                                 source.consumed(),
283                                 handle.written(),
284                             );
285                         };
286                         let lead = pointer / 188;
287                         let lead_offset = if lead < 0x1F { 0x81usize } else { 0xC1usize };
288                         let trail = pointer % 188;
289                         let trail_offset = if trail < 0x3F { 0x40usize } else { 0x41usize };
290                         handle.write_two((lead + lead_offset) as u8, (trail + trail_offset) as u8)
291                     }
292                 }
293             }
294         },
295         bmp,
296         self,
297         source,
298         handle,
299         copy_ascii_to_check_space_two,
300         check_space_two,
301         false
302     );
303 }
304 
// Any copyright to the test code below this comment is dedicated to the
// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
307 
#[cfg(test)]
mod tests {
    use super::super::testing::*;
    use super::super::*;

    /// Decodes `bytes` as Shift_JIS via the shared `decode` test helper and
    /// checks the result against `expect`.
    fn decode_shift_jis(bytes: &[u8], expect: &str) {
        decode(SHIFT_JIS, bytes, expect);
    }

    /// Encodes `string` as Shift_JIS via the shared `encode` test helper and
    /// checks the result against `expect`.
    fn encode_shift_jis(string: &str, expect: &[u8]) {
        encode(SHIFT_JIS, string, expect);
    }

    /// Spot checks of the decoder around range boundaries; malformed input
    /// is expected to come out as U+FFFD.
    #[test]
    fn test_shift_jis_decode() {
        // Empty
        decode_shift_jis(b"", "");

        // ASCII
        decode_shift_jis(b"\x61\x62", "\u{0061}\u{0062}");

        // Half-width
        decode_shift_jis(b"\xA1", "\u{FF61}");
        decode_shift_jis(b"\xDF", "\u{FF9F}");
        decode_shift_jis(b"\xA0", "\u{FFFD}");
        decode_shift_jis(b"\xE0", "\u{FFFD}");
        decode_shift_jis(b"\xA0+", "\u{FFFD}+");
        decode_shift_jis(b"\xE0+", "\u{FFFD}+");

        // EUDC
        decode_shift_jis(b"\xF0\x40", "\u{E000}");
        decode_shift_jis(b"\xF9\xFC", "\u{E757}");
        decode_shift_jis(b"\xEF\xFC", "\u{FFFD}");
        decode_shift_jis(b"\xFA\x40", "\u{2170}");

        // JIS 0208
        decode_shift_jis(b"\x81\x40", "\u{3000}");
        // An ASCII-range byte after a lead is decoded on its own.
        decode_shift_jis(b"\x81\x3F", "\u{FFFD}?");
        decode_shift_jis(b"\xEE\xFC", "\u{FF02}");
        decode_shift_jis(b"\xEE\xFD", "\u{FFFD}");
        decode_shift_jis(b"\xFA\x40", "\u{2170}");
        decode_shift_jis(b"\xFA\x3F", "\u{FFFD}?");
        decode_shift_jis(b"\xFC\x4B", "\u{9ED1}");
        decode_shift_jis(b"\xFC\x4C", "\u{FFFD}L");
    }

    /// Spot checks of the encoder; unmappable code points are expected to
    /// come out as decimal numeric character references.
    #[test]
    fn test_shift_jis_encode() {
        // Empty
        encode_shift_jis("", b"");

        // ASCII
        encode_shift_jis("\u{0061}\u{0062}", b"\x61\x62");

        // Exceptional code points
        encode_shift_jis("\u{0080}", b"\x80");
        encode_shift_jis("\u{00A5}", b"\x5C");
        encode_shift_jis("\u{203E}", b"\x7E");
        encode_shift_jis("\u{2212}", b"\x81\x7C");

        // Half-width
        encode_shift_jis("\u{FF61}", b"\xA1");
        encode_shift_jis("\u{FF9F}", b"\xDF");

        // EUDC
        encode_shift_jis("\u{E000}", b"&#57344;");
        encode_shift_jis("\u{E757}", b"&#59223;");

        // JIS 0212
        encode_shift_jis("\u{02D8}", b"&#728;");

        // JIS 0208
        encode_shift_jis("\u{3000}", b"\x81\x40");
        encode_shift_jis("\u{FF02}", b"\xFA\x57");
        encode_shift_jis("\u{2170}", b"\xFA\x40");
        encode_shift_jis("\u{9ED1}", b"\xFC\x4B");
    }

    /// Decodes the bundled input file and compares the result with the
    /// bundled reference output. The input is expected to contain errors.
    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_shift_jis_decode_all() {
        let input = include_bytes!("test_data/shift_jis_in.txt");
        let expectation = include_str!("test_data/shift_jis_in_ref.txt");
        let (cow, had_errors) = SHIFT_JIS.decode_without_bom_handling(input);
        assert!(had_errors, "Should have had errors.");
        assert_eq!(&cow[..], expectation);
    }

    /// Encodes the bundled input file and compares the result with the
    /// bundled reference output. No unmappable characters are expected.
    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_shift_jis_encode_all() {
        let input = include_str!("test_data/shift_jis_out.txt");
        let expectation = include_bytes!("test_data/shift_jis_out_ref.txt");
        let (cow, encoding, had_errors) = SHIFT_JIS.encode(input);
        assert!(!had_errors, "Should not have had errors.");
        assert_eq!(encoding, SHIFT_JIS);
        assert_eq!(&cow[..], &expectation[..]);
    }

    /// A single half-width Katakana byte expands to three UTF-8 bytes, so a
    /// buffer sized by `max_utf8_buffer_length_without_replacement(1)` must
    /// be large enough to hold it.
    #[test]
    fn test_shift_jis_half_width_katakana_length() {
        let mut output = [0u8; 20];
        let mut decoder = SHIFT_JIS.new_decoder();
        {
            let needed = decoder
                .max_utf8_buffer_length_without_replacement(1)
                .unwrap();
            let (result, read, written) =
                decoder.decode_to_utf8_without_replacement(b"\xA1", &mut output[..needed], true);
            assert_eq!(result, DecoderResult::InputEmpty);
            assert_eq!(read, 1);
            assert_eq!(written, 3);
            // U+FF61 in UTF-8.
            assert_eq!(output[0], 0xEF);
            assert_eq!(output[1], 0xBD);
            assert_eq!(output[2], 0xA1);
        }
    }
}
427