1 // Copyright Mozilla Foundation. See the COPYRIGHT
2 // file at the top-level directory of this distribution.
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
9 
10 use super::*;
11 use crate::data::*;
12 use crate::handles::*;
13 use crate::variant::*;
14 // Rust 1.14.0 requires the following despite the asterisk above.
15 use super::in_inclusive_range16;
16 use super::in_range16;
17 
/// Decoder state for EUC-KR.
///
/// Instances are created through [`EucKrDecoder::new`], which wraps the
/// decoder in `VariantDecoder::EucKr`.
pub struct EucKrDecoder {
    /// Lead byte carried over between calls, if any. `None` is the neutral
    /// state (see `in_neutral_state`); `Some(_)` adds one to the worst-case
    /// output length estimates (see `plus_one_if_lead`).
    // NOTE(review): presumably written by the functions generated by
    // `ascii_compatible_two_byte_decoder_functions!` when input ends between
    // the two bytes of a sequence -- confirm against the macro definition.
    lead: Option<u8>,
}
21 
22 impl EucKrDecoder {
new() -> VariantDecoder23     pub fn new() -> VariantDecoder {
24         VariantDecoder::EucKr(EucKrDecoder { lead: None })
25     }
26 
in_neutral_state(&self) -> bool27     pub fn in_neutral_state(&self) -> bool {
28         self.lead.is_none()
29     }
30 
plus_one_if_lead(&self, byte_length: usize) -> Option<usize>31     fn plus_one_if_lead(&self, byte_length: usize) -> Option<usize> {
32         byte_length.checked_add(match self.lead {
33             None => 0,
34             Some(_) => 1,
35         })
36     }
37 
max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize>38     pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
39         self.plus_one_if_lead(byte_length)
40     }
41 
max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize>42     pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
43         // worst case: 2 to 3
44         let len = self.plus_one_if_lead(byte_length);
45         checked_add(2, checked_add_opt(len, checked_div(checked_add(1, len), 2)))
46     }
47 
max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize>48     pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
49         checked_mul(3, self.plus_one_if_lead(byte_length))
50     }
51 
    // Generates this decoder's decode functions through
    // `ascii_compatible_two_byte_decoder_functions!`, which is defined
    // elsewhere in the crate.
    // NOTE(review): presumably the macro supplies the ASCII fast path and the
    // handling of `self.lead` across calls -- confirm against its definition.
    //
    // The first block validates a non-ASCII lead byte and evaluates to the
    // lead minus 0x81. The second block decodes the trail byte (`byte`) given
    // that value (`lead_minus_offset`). Lead/trail space is split in three:
    //   * lead 0xA1..=0xFE with trail 0xA1..=0xFE: KS X 1001 proper;
    //   * lead 0xA1..=0xFE with any other trail: extension "to the left";
    //   * lead 0x81..=0xA0: extension "above".
    ascii_compatible_two_byte_decoder_functions!(
        {
            // If lead is between 0x81 and 0xFE, inclusive,
            // subtract offset 0x81.
            let non_ascii_minus_offset =
                non_ascii.wrapping_sub(0x81);
            // The wrapping subtraction turns 0x80 into 0xFF, so a single
            // comparison rejects both 0x80 and 0xFF as lead bytes.
            if non_ascii_minus_offset > (0xFE - 0x81) {
                return (DecoderResult::Malformed(1, 0),
                        source.consumed(),
                        handle.written());
            }
            non_ascii_minus_offset
        },
        {
            // lead_minus_offset >= 0x20 means the lead byte is >= 0xA1.
            if lead_minus_offset >= 0x20 {
                // Not the extension range above KS X 1001
                let trail_minus_offset =
                    byte.wrapping_sub(0xA1);
                // True exactly for trail bytes 0xA1..=0xFE.
                if trail_minus_offset <= (0xFE - 0xA1) {
                    // KS X 1001
                    // Row-major pointer: 94 trail bytes per lead byte,
                    // counted from lead 0xA1 / trail 0xA1.
                    let ksx_pointer = mul_94(lead_minus_offset - 0x20) + trail_minus_offset as usize;
                    // The Hangul table starts (0x2F - 0x20) rows in. For
                    // earlier rows the wrapping subtraction yields a huge
                    // value that fails the length check below.
                    let hangul_pointer = ksx_pointer.wrapping_sub((0x2F - 0x20) * 94);
                    if hangul_pointer < KSX1001_HANGUL.len() {
                        let upper_bmp = KSX1001_HANGUL[hangul_pointer];
                        handle.write_upper_bmp(upper_bmp)
                    } else if ksx_pointer < KSX1001_SYMBOLS.len() {
                        // The symbol table is indexed from pointer zero.
                        let bmp = KSX1001_SYMBOLS[ksx_pointer];
                        handle.write_bmp_excl_ascii(bmp)
                    } else {
                        // The Hanja table starts (0x49 - 0x20) rows in; same
                        // wrapping trick as for Hangul above.
                        let hanja_pointer = ksx_pointer.wrapping_sub((0x49 - 0x20) * 94);
                        if hanja_pointer < KSX1001_HANJA.len() {
                            let upper_bmp = KSX1001_HANJA[hanja_pointer];
                            handle.write_upper_bmp(upper_bmp)
                        } else if (lead_minus_offset == 0x27) && ((trail_minus_offset as usize) < KSX1001_UPPERCASE.len()) {
                            // Lead 0xA8: table indexed by trail alone.
                            let mid_bmp = KSX1001_UPPERCASE[trail_minus_offset as usize];
                            // A zero entry marks an unassigned slot.
                            if mid_bmp == 0 {
                                return (DecoderResult::Malformed(2, 0),
                                        unread_handle_trail.consumed(),
                                        handle.written());
                            }
                            handle.write_mid_bmp(mid_bmp)
                        } else if (lead_minus_offset == 0x28) && ((trail_minus_offset as usize) < KSX1001_LOWERCASE.len()) {
                            // Lead 0xA9: table indexed by trail alone.
                            let mid_bmp = KSX1001_LOWERCASE[trail_minus_offset as usize];
                            handle.write_mid_bmp(mid_bmp)
                        } else if (lead_minus_offset == 0x25) && ((trail_minus_offset as usize) < KSX1001_BOX.len()) {
                            // Lead 0xA6: table indexed by trail alone.
                            let upper_bmp = KSX1001_BOX[trail_minus_offset as usize];
                            handle.write_upper_bmp(upper_bmp)
                        } else {
                            // Remaining assignments: pointers counted from
                            // two rows in, limited to 0x039F entries.
                            let other_pointer = ksx_pointer.wrapping_sub(2 * 94);
                            if other_pointer < 0x039F {
                                let bmp = ksx1001_other_decode(other_pointer as u16);
                                // ASCII range means unassigned
                                if bmp < 0x80 {
                                    return (DecoderResult::Malformed(2, 0),
                                            unread_handle_trail.consumed(),
                                            handle.written());
                                }
                                handle.write_bmp_excl_ascii(bmp)
                            } else {
                                return (DecoderResult::Malformed(2, 0),
                                        unread_handle_trail.consumed(),
                                        handle.written());
                            }
                        }
                    }
                } else {
                    // Extension range to the left of
                    // KS X 1001
                    let left_lead = lead_minus_offset - 0x20;
                    // Accepted trail bytes and the index each maps to:
                    //   0x81..=0xA0 -> 52..=83
                    //   0x61..=0x7A -> 26..=51
                    //   0x41..=0x5A ->  0..=25
                    // which gives 84 (190 - 94 - 12) trail slots per lead.
                    let left_trail = if byte.wrapping_sub(0x40 + 0x41) < (0x60 - 0x40) {
                        byte - (12 + 0x41)
                    } else if byte.wrapping_sub(0x20 + 0x41) < (0x3A - 0x20) {
                        byte - (6 + 0x41)
                    } else if byte.wrapping_sub(0x41) < 0x1A {
                        byte - 0x41
                    } else {
                        // Invalid trail. An ASCII-range trail is unread and
                        // the error is reported with length 1; otherwise the
                        // trail stays consumed and the length is 2.
                        if byte < 0x80 {
                            return (DecoderResult::Malformed(1, 0),
                                    unread_handle_trail.unread(),
                                    handle.written());
                        }
                        return (DecoderResult::Malformed(2, 0),
                                unread_handle_trail.consumed(),
                                handle.written());
                    };
                    let left_pointer = ((left_lead as usize) * (190 - 94 - 12)) + left_trail as usize;
                    // Pointers at or past this bound have no mapping.
                    if left_pointer < (0x45 - 0x20) * (190 - 94 - 12) + 0x12 {
                        let upper_bmp = cp949_left_hangul_decode(left_pointer as u16);
                        handle.write_upper_bmp(upper_bmp)
                    } else {
                        // Same error split as for an invalid trail above.
                        if byte < 0x80 {
                            return (DecoderResult::Malformed(1, 0),
                                    unread_handle_trail.unread(),
                                    handle.written());
                        }
                        return (DecoderResult::Malformed(2, 0),
                                unread_handle_trail.consumed(),
                                handle.written());
                    }
                }
            } else {
                // Extension range above KS X 1001
                // Accepted trail bytes and the index each maps to:
                //   0x81..=0xFE -> 52..=177
                //   0x61..=0x7A -> 26..=51
                //   0x41..=0x5A ->  0..=25
                // which gives 178 (190 - 12) trail slots per lead.
                let top_trail = if byte.wrapping_sub(0x40 + 0x41) < (0xBE - 0x40) {
                    byte - (12 + 0x41)
                } else if byte.wrapping_sub(0x20 + 0x41) < (0x3A - 0x20) {
                    byte - (6 + 0x41)
                } else if byte.wrapping_sub(0x41) < 0x1A {
                    byte - 0x41
                } else {
                    // Invalid trail: same error split as in the left
                    // extension range.
                    if byte < 0x80 {
                        return (DecoderResult::Malformed(1, 0),
                                unread_handle_trail.unread(),
                                handle.written());
                    }
                    return (DecoderResult::Malformed(2, 0),
                            unread_handle_trail.consumed(),
                            handle.written());
                };
                let top_pointer = ((lead_minus_offset as usize) * (190 - 12)) + top_trail as usize;
                let upper_bmp = cp949_top_hangul_decode(top_pointer as u16);
                handle.write_upper_bmp(upper_bmp)
            }
        },
        // The remaining arguments are identifiers, a loop label, helper
        // names and a flag consumed by the macro.
        // NOTE(review): their exact roles are defined by the macro -- see its
        // definition before changing any of them.
        self,
        non_ascii,
        byte,
        lead_minus_offset,
        unread_handle_trail,
        source,
        handle,
        'outermost,
        copy_ascii_from_check_space_bmp,
        check_space_bmp,
        true);
186 }
187 
ksx1001_encode_misc(bmp: u16) -> Option<(usize, usize)>188 fn ksx1001_encode_misc(bmp: u16) -> Option<(usize, usize)> {
189     if in_inclusive_range16(bmp, 0x3000, 0x3015) {
190         if let Some(pos) = position(&KSX1001_SYMBOLS[..(0xAB - 0x60)], bmp) {
191             return Some((0xA1, pos + 0xA1));
192         }
193     }
194     if let Some(other_pointer) = ksx1001_other_encode(bmp) {
195         let other_lead = ((other_pointer as usize) / 94) + (0x81 + 0x22);
196         let other_trail = ((other_pointer as usize) % 94) + 0xA1;
197         return Some((other_lead, other_trail));
198     }
199     if in_range16(bmp, 0x00AA, 0x0168) {
200         // Latin
201         if let Some(pos) = position(&KSX1001_LOWERCASE[..], bmp) {
202             return Some((0x81 + 0x28, 0xA1 + pos));
203         }
204         if let Some(pos) = position(&KSX1001_UPPERCASE[..], bmp) {
205             return Some((0x81 + 0x27, 0xA1 + pos));
206         }
207     } else if in_range16(bmp, 0x2500, 0x254C) {
208         if let Some(pos) = position(&KSX1001_BOX[..], bmp) {
209             return Some((0x81 + 0x25, 0xA1 + pos));
210         }
211     }
212     if in_inclusive_range16(bmp, 0x2015, 0x266D)
213         || in_inclusive_range16(bmp, 0x321C, 0x33D8)
214         || in_inclusive_range16(bmp, 0xFF3C, 0xFFE5)
215         || in_inclusive_range16(bmp, 0x00A1, 0x00F7)
216         || in_inclusive_range16(bmp, 0x02C7, 0x02DD)
217     {
218         if let Some(pos) = position(&KSX1001_SYMBOLS[3..], bmp) {
219             if pos < (94 - 3) {
220                 return Some((0xA1, pos + 0xA1 + 3));
221             }
222             return Some((0xA2, pos - (94 - 3) + 0xA1));
223         }
224     }
225     None
226 }
227 
228 #[cfg(not(feature = "fast-hangul-encode"))]
229 #[inline(always)]
ksx1001_encode_hangul(bmp: u16, _: u16) -> (u8, u8)230 fn ksx1001_encode_hangul(bmp: u16, _: u16) -> (u8, u8) {
231     match KSX1001_HANGUL.binary_search(&bmp) {
232         Ok(ksx_hangul_pointer) => {
233             let ksx_hangul_lead = (ksx_hangul_pointer / 94) + (0x81 + 0x2F);
234             let ksx_hangul_trail = (ksx_hangul_pointer % 94) + 0xA1;
235             (ksx_hangul_lead as u8, ksx_hangul_trail as u8)
236         }
237         Err(_) => {
238             let (lead, cp949_trail) = if bmp < 0xC8A5 {
239                 // Above KS X 1001
240                 let top_pointer = cp949_top_hangul_encode(bmp) as usize;
241                 let top_lead = (top_pointer / (190 - 12)) + 0x81;
242                 let top_trail = top_pointer % (190 - 12);
243                 (top_lead as u8, top_trail as u8)
244             } else {
245                 // To the left of KS X 1001
246                 let left_pointer = cp949_left_hangul_encode(bmp) as usize;
247                 let left_lead = (left_pointer / (190 - 94 - 12)) + (0x81 + 0x20);
248                 let left_trail = left_pointer % (190 - 94 - 12);
249                 (left_lead as u8, left_trail as u8)
250             };
251             let offset = if cp949_trail >= (0x40 - 12) {
252                 0x41 + 12
253             } else if cp949_trail >= (0x20 - 6) {
254                 0x41 + 6
255             } else {
256                 0x41
257             };
258             (lead as u8, (cp949_trail + offset) as u8)
259         }
260     }
261 }
262 
/// Encodes a Hangul syllable as an EUC-KR `(lead, trail)` pair (variant used
/// when the `fast-hangul-encode` feature is on).
///
/// Delegates to `cp949_hangul_encode`, passing the code point's offset from
/// the start of the Hangul block. The first parameter is unused here.
#[cfg(feature = "fast-hangul-encode")]
#[inline(always)]
fn ksx1001_encode_hangul(_: u16, bmp_minus_hangul_start: u16) -> (u8, u8) {
    let (lead, trail) = cp949_hangul_encode(bmp_minus_hangul_start);
    (lead, trail)
}
268 
269 #[cfg(not(feature = "fast-hanja-encode"))]
270 #[inline(always)]
ksx1001_encode_hanja(bmp: u16) -> Option<(u8, u8)>271 fn ksx1001_encode_hanja(bmp: u16) -> Option<(u8, u8)> {
272     if let Some(hanja_pointer) = position(&KSX1001_HANJA[..], bmp) {
273         let hanja_lead = (hanja_pointer / 94) + (0x81 + 0x49);
274         let hanja_trail = (hanja_pointer % 94) + 0xA1;
275         Some((hanja_lead as u8, hanja_trail as u8))
276     } else {
277         None
278     }
279 }
280 
/// Encodes a Hanja code point as an EUC-KR `(lead, trail)` pair (variant used
/// when the `fast-hanja-encode` feature is on).
///
/// Code points at or above U+F900 go to the compatibility encoder, whose
/// result is always wrapped in `Some`; everything below goes to the unified
/// encoder, which may return `None`.
#[cfg(feature = "fast-hanja-encode")]
#[inline(always)]
fn ksx1001_encode_hanja(bmp: u16) -> Option<(u8, u8)> {
    if bmp >= 0xF900 {
        return Some(ksx1001_compatibility_hangul_encode(bmp));
    }
    ksx1001_unified_hangul_encode(bmp)
}
290 
/// EUC-KR encoder. A unit struct: it holds no state of its own. Instances
/// are created through [`EucKrEncoder::new`], which wraps the encoder in
/// `VariantEncoder::EucKr`.
pub struct EucKrEncoder;
292 
293 impl EucKrEncoder {
new(encoding: &'static Encoding) -> Encoder294     pub fn new(encoding: &'static Encoding) -> Encoder {
295         Encoder::new(encoding, VariantEncoder::EucKr(EucKrEncoder))
296     }
297 
max_buffer_length_from_utf16_without_replacement( &self, u16_length: usize, ) -> Option<usize>298     pub fn max_buffer_length_from_utf16_without_replacement(
299         &self,
300         u16_length: usize,
301     ) -> Option<usize> {
302         u16_length.checked_mul(2)
303     }
304 
max_buffer_length_from_utf8_without_replacement( &self, byte_length: usize, ) -> Option<usize>305     pub fn max_buffer_length_from_utf8_without_replacement(
306         &self,
307         byte_length: usize,
308     ) -> Option<usize> {
309         byte_length.checked_add(1)
310     }
311 
    // Generates this encoder's encode functions through
    // `ascii_compatible_bmp_encoder_functions!`, which is defined elsewhere
    // in the crate.
    // NOTE(review): presumably the macro supplies the ASCII fast path and
    // binds `bmp` to each non-ASCII BMP code point -- confirm against its
    // definition.
    //
    // The block maps `bmp` to a (lead, trail) byte pair and writes it, or
    // returns an unmappable result. Order of attempts: Hangul, then Hanja,
    // then the miscellaneous tables in `ksx1001_encode_misc`.
    ascii_compatible_bmp_encoder_functions!(
        {
            // The wrapping subtraction lets one comparison test for the
            // range 0xAC00..0xD7A4: smaller code points wrap to large values.
            let bmp_minus_hangul_start = bmp.wrapping_sub(0xAC00);
            let (lead, trail) = if bmp_minus_hangul_start < (0xD7A4 - 0xAC00) {
                // Hangul
                ksx1001_encode_hangul(bmp, bmp_minus_hangul_start)
            } else if in_range16(bmp, 0x33DE, 0xFF01) {
                // Vast range that includes no other
                // mappables except Hangul (already
                // processed) and Hanja.
                // Narrow the range further to Unified and
                // Compatibility ranges of Hanja.
                if in_range16(bmp, 0x4E00, 0x9F9D) || in_range16(bmp, 0xF900, 0xFA0C) {
                    if let Some((hanja_lead, hanja_trail)) = ksx1001_encode_hanja(bmp) {
                        (hanja_lead, hanja_trail)
                    } else {
                        // Inside the Hanja ranges but without a mapping.
                        return (
                            EncoderResult::unmappable_from_bmp(bmp),
                            source.consumed(),
                            handle.written(),
                        );
                    }
                } else {
                    // Inside the vast range but outside both Hanja ranges.
                    return (
                        EncoderResult::unmappable_from_bmp(bmp),
                        source.consumed(),
                        handle.written(),
                    );
                }
            } else if let Some((lead, trail)) = ksx1001_encode_misc(bmp) {
                // The helper returns `usize` values; narrow them to bytes.
                (lead as u8, trail as u8)
            } else {
                return (
                    EncoderResult::unmappable_from_bmp(bmp),
                    source.consumed(),
                    handle.written(),
                );
            };
            handle.write_two(lead, trail)
        },
        // The remaining arguments are identifiers, helper names and a flag
        // consumed by the macro.
        // NOTE(review): their exact roles are defined by the macro -- see its
        // definition before changing any of them.
        bmp,
        self,
        source,
        handle,
        copy_ascii_to_check_space_two,
        check_space_two,
        true
    );
360 }
361 
362 // Any copyright to the test code below this comment is dedicated to the
363 // Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
364 
#[cfg(all(test, feature = "alloc"))]
mod tests {
    use super::super::testing::*;
    use super::super::*;

    /// Runs the shared `decode` test helper with the EUC-KR encoding.
    fn decode_euc_kr(bytes: &[u8], expect: &str) {
        decode(EUC_KR, bytes, expect);
    }

    /// Runs the shared `encode` test helper with the EUC-KR encoding.
    fn encode_euc_kr(string: &str, expect: &[u8]) {
        encode(EUC_KR, string, expect);
    }

    /// Spot checks for decoding: empty input, ASCII, valid two-byte
    /// sequences, and invalid lead/trail bytes.
    #[test]
    fn test_euc_kr_decode() {
        let cases: [(&[u8], &str); 10] = [
            // Empty
            (b"", ""),
            // ASCII
            (b"\x61\x62", "\u{0061}\u{0062}"),
            // Two-byte sequences, valid and malformed
            (b"\x81\x41", "\u{AC02}"),
            (b"\x81\x5B", "\u{FFFD}\x5B"),
            (b"\xFD\xFE", "\u{8A70}"),
            (b"\xFE\x41", "\u{FFFD}\x41"),
            (b"\xFF\x41", "\u{FFFD}\x41"),
            (b"\x80\x41", "\u{FFFD}\x41"),
            (b"\xA1\xFF", "\u{FFFD}"),
            (b"\x81\xFF", "\u{FFFD}"),
        ];
        for &(bytes, expect) in cases.iter() {
            decode_euc_kr(bytes, expect);
        }
    }

    /// Spot checks for encoding: empty input, ASCII, one Hangul syllable
    /// and one Hanja character.
    #[test]
    fn test_euc_kr_encode() {
        let cases: [(&str, &[u8]); 4] = [
            // Empty
            ("", b""),
            // ASCII
            ("\u{0061}\u{0062}", b"\x61\x62"),
            // Hangul and Hanja
            ("\u{AC02}", b"\x81\x41"),
            ("\u{8A70}", b"\xFD\xFE"),
        ];
        for &(string, expect) in cases.iter() {
            encode_euc_kr(string, expect);
        }
    }

    /// Decodes the bundled input file and compares it with the reference
    /// text; the input is expected to contain errors.
    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_euc_kr_decode_all() {
        let encoded = include_bytes!("test_data/euc_kr_in.txt");
        let reference = include_str!("test_data/euc_kr_in_ref.txt");
        let (decoded, had_errors) = EUC_KR.decode_without_bom_handling(encoded);
        assert!(had_errors, "Should have had errors.");
        assert_eq!(&decoded[..], reference);
    }

    /// Encodes the bundled text file and compares it with the reference
    /// bytes; no errors and no change of encoding are expected.
    #[test]
    #[cfg_attr(miri, ignore)] // Miri is too slow
    fn test_euc_kr_encode_all() {
        let text = include_str!("test_data/euc_kr_out.txt");
        let reference = include_bytes!("test_data/euc_kr_out_ref.txt");
        let (encoded, used_encoding, had_errors) = EUC_KR.encode(text);
        assert!(!had_errors, "Should not have had errors.");
        assert_eq!(used_encoding, EUC_KR);
        assert_eq!(&encoded[..], &reference[..]);
    }

    /// Two consecutive lone low surrogates must each be written as the
    /// numeric character reference for U+FFFD.
    #[test]
    fn test_euc_kr_encode_from_two_low_surrogates() {
        let expectation = b"&#65533;&#65533;";
        let lone_low_surrogates = [0xDC00u16, 0xDEDEu16];
        let mut buffer = [0u8; 40];
        let mut encoder = EUC_KR.new_encoder();
        let (result, read, written, had_errors) =
            encoder.encode_from_utf16(&lone_low_surrogates[..], &mut buffer[..], true);
        assert_eq!(result, CoderResult::InputEmpty);
        assert_eq!(read, 2);
        assert_eq!(written, expectation.len());
        assert!(had_errors);
        assert_eq!(&buffer[..written], expectation);
    }
}
443