1 // Copyright 2015-2016 Mozilla Foundation. See the COPYRIGHT
2 // file at the top-level directory of this distribution.
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
9 
10 use super::*;
11 use data::*;
12 use handles::*;
13 use variant::*;
14 // Rust 1.14.0 requires the following despite the asterisk above.
15 use super::in_inclusive_range16;
16 use super::in_range16;
17 
/// Up to three bytes that the decoder has set aside for later processing.
///
/// The variant name states how many bytes are being held.
enum Gb18030Pending {
    /// Nothing is being held.
    None,
    /// One byte is being held.
    One(u8),
    /// Two bytes are being held.
    Two(u8, u8),
    /// Three bytes are being held.
    Three(u8, u8, u8),
}

impl Gb18030Pending {
    /// Tells whether no byte is being held.
    fn is_none(&self) -> bool {
        // Spell out every variant so that adding one forces a decision here.
        match *self {
            Gb18030Pending::None => true,
            Gb18030Pending::One(..) | Gb18030Pending::Two(..) | Gb18030Pending::Three(..) => false,
        }
    }

    /// Number of bytes being held (0 through 3).
    fn count(&self) -> usize {
        match *self {
            Gb18030Pending::Three(..) => 3,
            Gb18030Pending::Two(..) => 2,
            Gb18030Pending::One(..) => 1,
            Gb18030Pending::None => 0,
        }
    }
}
42 
/// Carried-over state of the gb18030 decoder.
///
/// All fields are empty in the neutral state (see `in_neutral_state`). The
/// decoding loop that fills and drains them is generated by
/// `gb18030_decoder_functions!`, so the precise hand-off rules live in that
/// macro rather than in this file.
pub struct Gb18030Decoder {
    /// NOTE(review): presumably the first byte of a sequence that is still
    /// incomplete -- confirm against the macro.
    first: Option<u8>,
    /// NOTE(review): presumably the second byte of an incomplete four-byte
    /// sequence -- confirm against the macro.
    second: Option<u8>,
    /// NOTE(review): presumably the third byte of an incomplete four-byte
    /// sequence -- confirm against the macro.
    third: Option<u8>,
    /// Bytes set aside for reprocessing. The fourth-byte error path stores
    /// the third byte (minus 0x81) here as `Gb18030Pending::One`.
    pending: Gb18030Pending,
    /// ASCII byte recovered from a failed four-byte sequence. The error paths
    /// rebuild it as `second_minus_offset + 0x30`.
    pending_ascii: Option<u8>,
}
50 
impl Gb18030Decoder {
    /// Creates a decoder that holds no carried-over bytes and returns it
    /// wrapped in `VariantDecoder::Gb18030`.
    pub fn new() -> VariantDecoder {
        VariantDecoder::Gb18030(Gb18030Decoder {
            first: None,
            second: None,
            third: None,
            pending: Gb18030Pending::None,
            pending_ascii: None,
        })
    }

    /// Returns `true` when every state field is empty, i.e. the decoder is
    /// not holding any byte from earlier input.
    pub fn in_neutral_state(&self) -> bool {
        self.first.is_none()
            && self.second.is_none()
            && self.third.is_none()
            && self.pending.is_none()
            && self.pending_ascii.is_none()
    }

    /// Returns `byte_length` plus the number of bytes currently held in the
    /// decoder state, or `None` if that sum overflows `usize`.
    fn extra_from_state(&self, byte_length: usize) -> Option<usize> {
        byte_length.checked_add(
            // `pending` contributes as many bytes as its variant carries;
            // every other occupied slot contributes exactly one.
            self.pending.count()
                + match self.first {
                    None => 0,
                    Some(_) => 1,
                }
                + match self.second {
                    None => 0,
                    Some(_) => 1,
                }
                + match self.third {
                    None => 0,
                    Some(_) => 1,
                }
                + match self.pending_ascii {
                    None => 0,
                    Some(_) => 1,
                },
        )
    }

    /// Upper bound, in UTF-16 code units, on the output needed to decode
    /// `byte_length` more input bytes: one unit per byte (input plus held
    /// state) plus one.
    ///
    /// NOTE(review): `checked_add` is a helper from the parent module; it
    /// presumably yields `None` on overflow -- confirm there.
    pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
        // ASCII: 1 to 1 (worst case)
        // gbk: 2 to 1
        // ranges: 4 to 1 or 4 to 2
        checked_add(1, self.extra_from_state(byte_length))
    }

    /// Upper bound, in UTF-8 bytes, on the output needed to decode
    /// `byte_length` more input bytes when malformed sequences are not
    /// replaced. Uses the same bound as `max_utf8_buffer_length`.
    pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
        // ASCII: 1 to 1
        // gbk: 2 to 2 or 2 to 3
        // ranges: 4 to 2, 4 to 3 or 4 to 4
        // 0x80: 1 to 3 (worst case)
        self.max_utf8_buffer_length(byte_length)
    }

    /// Upper bound, in UTF-8 bytes, on the output needed to decode
    /// `byte_length` more input bytes: three bytes per byte (input plus held
    /// state) plus one.
    ///
    /// NOTE(review): `checked_add` and `checked_mul` are helpers from the
    /// parent module; they presumably yield `None` on overflow -- confirm
    /// there.
    pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
        checked_add(1, checked_mul(3, self.extra_from_state(byte_length)))
    }

    // The decoding loop comes from `gb18030_decoder_functions!`, which is
    // defined elsewhere. This invocation passes four code blocks -- one for
    // each byte position of a sequence -- followed by the identifiers and
    // the loop label that those blocks refer to.
    gb18030_decoder_functions!(
        {
            // Block for the first (non-ASCII) byte. Evaluates to the byte
            // minus 0x81.
            //
            // If first is between 0x81 and 0xFE, inclusive,
            // subtract offset 0x81.
            let non_ascii_minus_offset = non_ascii.wrapping_sub(0x81);
            if non_ascii_minus_offset > (0xFE - 0x81) {
                // Out of the lead range. 0x80 on its own decodes to
                // U+20AC; anything else is reported as malformed.
                if non_ascii == 0x80 {
                    handle.write_upper_bmp(0x20ACu16);
                    continue 'outermost;
                }
                return (DecoderResult::Malformed(1, 0),
                        source.consumed(),
                        handle.written());
            }
            non_ascii_minus_offset
        },
        {
            // Block for a second byte that completes a two-byte sequence.
            //
            // Two-byte (or error)
            if first_minus_offset >= 0x20 {
                // Not the gbk ideograph range above GB2312
                let trail_minus_offset = second.wrapping_sub(0xA1);
                if trail_minus_offset <= (0xFE - 0xA1) {
                    // GB2312
                    let hanzi_lead = first_minus_offset.wrapping_sub(0x2F);
                    if hanzi_lead < (0x77 - 0x2F) {
                        // Level 1 Hanzi, Level 2 Hanzi
                        // or one of the 5 PUA code
                        // points in between.
                        let hanzi_pointer = mul_94(hanzi_lead) + trail_minus_offset as usize;
                        let upper_bmp = GB2312_HANZI[hanzi_pointer];
                        handle.write_upper_bmp(upper_bmp)
                    } else if first_minus_offset == 0x20 {
                        // Symbols (starting with ideographic space)
                        let bmp = GB2312_SYMBOLS[trail_minus_offset as usize];
                        handle.write_bmp_excl_ascii(bmp)
                    } else if first_minus_offset == 0x25 && ((trail_minus_offset.wrapping_sub(63) as usize) < GB2312_SYMBOLS_AFTER_GREEK.len()) {
                        handle.write_bmp_excl_ascii(GB2312_SYMBOLS_AFTER_GREEK[trail_minus_offset.wrapping_sub(63) as usize])
                    } else if first_minus_offset == 0x27 && (trail_minus_offset as usize) < GB2312_PINYIN.len() {
                        handle.write_bmp_excl_ascii(GB2312_PINYIN[trail_minus_offset as usize])
                    } else if first_minus_offset > 0x76 {
                        // Bottom PUA
                        let pua = (0xE234 + mul_94(first_minus_offset - 0x77) + trail_minus_offset as usize) as u16;
                        handle.write_upper_bmp(pua)
                    } else {
                        let bmp = gb2312_other_decode((mul_94(first_minus_offset - 0x21) + (trail_minus_offset as usize)) as u16);
                        handle.write_bmp_excl_ascii(bmp)
                    }
                } else {
                    // gbk range on the left
                    let mut trail_minus_offset = second.wrapping_sub(0x40);
                    if trail_minus_offset > (0x7E - 0x40) {
                        let trail_minus_range_start = second.wrapping_sub(0x80);
                        if trail_minus_range_start > (0xA0 - 0x80) {
                            // Not a valid trail. A byte below 0x80 is unread
                            // and only the lead is reported; otherwise both
                            // bytes are reported.
                            if second < 0x80 {
                                return (DecoderResult::Malformed(1, 0),
                                        unread_handle_second.unread(),
                                        handle.written());
                            }
                            return (DecoderResult::Malformed(2, 0),
                                    unread_handle_second.consumed(),
                                    handle.written());
                        }
                        // Trails from 0x80 up continue right after 0x7E in
                        // pointer space, skipping the unused 0x7F.
                        trail_minus_offset = second - 0x41;
                    }
                    // Zero-base lead
                    let left_lead = first_minus_offset - 0x20;
                    let left_pointer = left_lead as usize * (190 - 94) +
                                       trail_minus_offset as usize;
                    let gbk_left_ideograph_pointer = left_pointer.wrapping_sub((0x29 - 0x20) * (190 - 94));
                    if gbk_left_ideograph_pointer < (((0x7D - 0x29) * (190 - 94)) - 5) {
                        let upper_bmp = gbk_left_ideograph_decode(gbk_left_ideograph_pointer as u16);
                        handle.write_upper_bmp(upper_bmp)
                    } else if left_pointer < ((0x29 - 0x20) * (190 - 94)) {
                        let bmp = gbk_other_decode(left_pointer as u16);
                        handle.write_bmp_excl_ascii(bmp)
                    } else {
                        let bottom_pointer = left_pointer - (((0x7D - 0x20) * (190 - 94)) - 5);
                        let upper_bmp = GBK_BOTTOM[bottom_pointer];
                        handle.write_upper_bmp(upper_bmp)
                    }
                }
            } else {
                // gbk ideograph range above GB2312
                let mut trail_minus_offset = second.wrapping_sub(0x40);
                if trail_minus_offset > (0x7E - 0x40) {
                    let trail_minus_range_start = second.wrapping_sub(0x80);
                    if trail_minus_range_start > (0xFE - 0x80) {
                        // Same error handling as in the left-hand range
                        // above.
                        if second < 0x80 {
                            return (DecoderResult::Malformed(1, 0),
                                    unread_handle_second.unread(),
                                    handle.written());
                        }
                        return (DecoderResult::Malformed(2, 0),
                                unread_handle_second.consumed(),
                                handle.written());
                    }
                    // Skip the unused 0x7F, as above.
                    trail_minus_offset = second - 0x41;
                }
                let pointer = first_minus_offset as usize * 190usize +
                              trail_minus_offset as usize;
                let upper_bmp = gbk_top_ideograph_decode(pointer as u16);
                handle.write_upper_bmp(upper_bmp)
            }
        },
        {
            // Block for the third byte of a four-byte sequence. Evaluates to
            // the byte minus 0x81.
            //
            // If third is between 0x81 and 0xFE, inclusive,
            // subtract offset 0x81.
            let third_minus_offset = third.wrapping_sub(0x81);
            if third_minus_offset > (0xFE - 0x81) {
                // We have an error. Let's inline what's going
                // to happen when `second` is
                // reprocessed. (`third` gets unread.)
                // `second` is guaranteed ASCII, so let's
                // put it in `pending_ascii`. Recompute
                // `second` from `second_minus_offset`.
                self.pending_ascii = Some(second_minus_offset + 0x30);
                // Now unread `third` and designate the previous
                // `first` as being in error.
                return (DecoderResult::Malformed(1, 1),
                        unread_handle_third.unread(),
                        handle.written());
            }
            third_minus_offset
        },
        {
            // Block for the fourth byte of a four-byte sequence.
            //
            // If fourth is between 0x30 and 0x39, inclusive,
            // subtract offset 0x30.
            //
            // If we have an error, we'll inline what's going
            // to happen when `second` and `third` are
            // reprocessed. (`fourth` gets unread.)
            // `second` is guaranteed ASCII, so let's
            // put it in `pending_ascii`. Recompute
            // `second` from `second_minus_offset` to
            // make this block reusable when `second`
            // is not in scope.
            //
            // `third` is guaranteed to be in the range
            // that makes it become the new `self.first`.
            //
            // `fourth` gets unread and the previous
            // `first` gets designated as being in error.
            let fourth_minus_offset = fourth.wrapping_sub(0x30);
            if fourth_minus_offset > (0x39 - 0x30) {
                self.pending_ascii = Some(second_minus_offset + 0x30);
                self.pending = Gb18030Pending::One(third_minus_offset);
                return (DecoderResult::Malformed(1, 2),
                        unread_handle_fourth.unread(),
                        handle.written());
            }
            // Combine the four zero-based bytes into one pointer; the
            // positions have radices 126, 10, 126 and 10.
            let pointer = (first_minus_offset as usize * (10 * 126 * 10)) +
                          (second_minus_offset as usize * (10 * 126)) +
                          (third_minus_offset as usize * 10) +
                          fourth_minus_offset as usize;
            if pointer <= 39419 {
                // BMP
                if pointer == 7457 {
                    // This one pointer is special-cased to U+E7C7 instead of
                    // going through the range table.
                    handle.write_upper_bmp(0xE7C7)
                } else {
                    handle.write_bmp_excl_ascii(gb18030_range_decode(pointer as u16))
                }
            } else if pointer >= 189_000 && pointer <= 1_237_575 {
                // Astral
                // Pointer 189_000 corresponds to U+10000.
                handle.write_astral((pointer - (189_000usize - 0x1_0000usize)) as u32)
            } else {
                // The pointer falls outside both mapped ranges.
                return (DecoderResult::Malformed(4, 0),
                        unread_handle_fourth.consumed(),
                        handle.written());
            }
        },
        self,
        non_ascii,
        first_minus_offset,
        second,
        second_minus_offset,
        unread_handle_second,
        third,
        third_minus_offset,
        unread_handle_third,
        fourth,
        fourth_minus_offset,
        unread_handle_fourth,
        source,
        handle,
        'outermost);
}
297 
298 // XXX Experiment with inline directives
gbk_encode_non_unified(bmp: u16) -> Option<(usize, usize)>299 fn gbk_encode_non_unified(bmp: u16) -> Option<(usize, usize)> {
300     // Try ideographic punctuation first as it's the most likely case.
301     // Throwing in the check for full-width currencies and tilde is probably
302     // more size-efficient here than elsewhere.
303     if in_inclusive_range16(bmp, 0x2014, 0x3017) || in_inclusive_range16(bmp, 0xFF04, 0xFFE1) {
304         if let Some(pos) = position(&GB2312_SYMBOLS[..], bmp) {
305             return Some((0xA1, pos + 0xA1));
306         }
307     }
308     // Ext A
309     if in_range16(bmp, 0x3400, 0x4E00) {
310         return position(&GBK_BOTTOM[21..100], bmp).map(|pos| {
311             (
312                 0xFE,
313                 pos + if pos < (0x3F - 16) {
314                     0x40 + 16
315                 } else {
316                     0x41 + 16
317                 },
318             )
319         });
320     }
321     // Compatibility ideographs
322     if in_range16(bmp, 0xF900, 0xFB00) {
323         return position(&GBK_BOTTOM[0..21], bmp).map(|pos| {
324             if pos < 5 {
325                 // end of second to last row
326                 (0xFD, pos + (190 - 94 - 5 + 0x41))
327             } else {
328                 // last row
329                 (0xFE, pos + (0x40 - 5))
330             }
331         });
332     }
333     // Handle everything below U+02CA, which is in GBK_OTHER.
334     if bmp < 0x02CA {
335         if in_range16(bmp, 0x00E0, 0x0262) && bmp != 0x00F7 {
336             // Pinyin except U+1E3F
337             if let Some(pos) = position(&GB2312_PINYIN[..], bmp) {
338                 return Some((0xA8, pos + 0xA1));
339             }
340         } else if in_inclusive_range16(bmp, 0x00A4, 0x00F7)
341             || in_inclusive_range16(bmp, 0x02C7, 0x02C9)
342         {
343             // Diacritics and Latin 1 symbols
344             if let Some(pos) = position(&GB2312_SYMBOLS[3..(0xAC - 0x60)], bmp) {
345                 return Some((0xA1, pos + 0xA1 + 3));
346             }
347         }
348         return None;
349     }
350     if bmp >= 0xE794 {
351         // Various brackets, all in PUA or full-width regions
352         if let Some(pos) = position(&GB2312_SYMBOLS_AFTER_GREEK[..], bmp) {
353             return Some((0xA6, pos + (0x9F - 0x60 + 0xA1)));
354         }
355     } else if bmp == 0x1E3F {
356         // The one Pinyin placed elsewhere on the BMP
357         return Some((0xA8, 0x7B - 0x60 + 0xA1));
358     } else if in_range16(bmp, 0xA000, 0xD800) {
359         // Since Korean has usage in China, let's spend a branch to fast-track
360         // Hangul.
361         return None;
362     }
363     // GB2312 other (except bottom PUA and PUA between Hanzi levels).
364     if let Some(other_pointer) = gb2312_other_encode(bmp) {
365         let other_lead = other_pointer as usize / 94;
366         let other_trail = other_pointer as usize % 94;
367         return Some((0xA2 + other_lead, 0xA1 + other_trail));
368     }
369     // At this point, we've handled all mappable characters above U+02D9 but
370     // below U+2010. Let's check for that range in order to let lower BMP
371     // characters used for minority languages in China avoid the subsequent
372     // search that deals mainly with various symbols.
373     if in_range16(bmp, 0x02DA, 0x2010) {
374         return None;
375     }
376     // GBK other (except radicals and PUA in GBK_BOTTOM).
377     if let Some(other_pointer) = gbk_other_encode(bmp) {
378         let other_lead = other_pointer as usize / (190 - 94);
379         let other_trail = other_pointer as usize % (190 - 94);
380         let offset = if other_trail < 0x3F { 0x40 } else { 0x41 };
381         return Some((other_lead + (0x81 + 0x20), other_trail + offset));
382     }
383     // CJK Radicals Supplement or PUA in GBK_BOTTOM
384     if in_inclusive_range16(bmp, 0x2E81, 0x2ECA) || in_inclusive_range16(bmp, 0xE816, 0xE864) {
385         if let Some(pos) = position(&GBK_BOTTOM[21..], bmp) {
386             let trail = pos + 16;
387             let offset = if trail < 0x3F { 0x40 } else { 0x41 };
388             return Some((0xFE, trail + offset));
389         }
390     }
391     // GB2312 bottom PUA
392     let bmp_minus_gb2312_bottom_pua = bmp.wrapping_sub(0xE234);
393     if bmp_minus_gb2312_bottom_pua <= (0xE4C5 - 0xE234) {
394         let pua_lead = bmp_minus_gb2312_bottom_pua as usize / 94;
395         let pua_trail = bmp_minus_gb2312_bottom_pua as usize % 94;
396         return Some((0x81 + 0x77 + pua_lead, 0xA1 + pua_trail));
397     }
398     // PUA between Hanzi Levels
399     let bmp_minus_pua_between_hanzi = bmp.wrapping_sub(0xE810);
400     if bmp_minus_pua_between_hanzi < 5 {
401         return Some((0x81 + 0x56, 0xFF - 5 + bmp_minus_pua_between_hanzi as usize));
402     }
403     None
404 }
405 
406 #[cfg(not(feature = "fast-gb-hanzi-encode"))]
407 #[inline(always)]
encode_hanzi(bmp: u16, _: u16) -> (u8, u8)408 fn encode_hanzi(bmp: u16, _: u16) -> (u8, u8) {
409     if let Some((lead, trail)) = gb2312_level1_hanzi_encode(bmp) {
410         (lead, trail)
411     } else if let Some(hanzi_pointer) = gb2312_level2_hanzi_encode(bmp) {
412         let hanzi_lead = (hanzi_pointer / 94) + (0xD8);
413         let hanzi_trail = (hanzi_pointer % 94) + 0xA1;
414         (hanzi_lead as u8, hanzi_trail as u8)
415     } else {
416         let (lead, gbk_trail) = if bmp < 0x72DC {
417             // Above GB2312
418             let pointer = gbk_top_ideograph_encode(bmp) as usize;
419             let lead = (pointer / 190) + 0x81;
420             let gbk_trail = pointer % 190;
421             (lead, gbk_trail)
422         } else {
423             // To the left of GB2312
424             let gbk_left_ideograph_pointer = gbk_left_ideograph_encode(bmp) as usize;
425             let lead = (gbk_left_ideograph_pointer / (190 - 94)) + (0x81 + 0x29);
426             let gbk_trail = gbk_left_ideograph_pointer % (190 - 94);
427             (lead, gbk_trail)
428         };
429         let offset = if gbk_trail < 0x3F { 0x40 } else { 0x41 };
430         (lead as u8, (gbk_trail + offset) as u8)
431     }
432 }
433 
/// Encodes a CJK Unified Ideograph as its two GBK bytes `(lead, trail)`.
///
/// This variant is compiled when the `fast-gb-hanzi-encode` feature is on.
/// It delegates to `gbk_hanzi_encode`, passing the code point's offset from
/// U+4E00; the raw code point (first parameter) is unused.
#[cfg(feature = "fast-gb-hanzi-encode")]
#[inline(always)]
fn encode_hanzi(_: u16, bmp_minus_unified_start: u16) -> (u8, u8) {
    let (lead, trail) = gbk_hanzi_encode(bmp_minus_unified_start);
    (lead, trail)
}
439 
/// Encoder state shared by the two-byte-only and the four-byte-capable
/// flavors of the encoding.
pub struct Gb18030Encoder {
    /// When `true`, code points without a two-byte form are written as
    /// four-byte sequences. When `false`, they are reported as unmappable and
    /// U+20AC is written as the single byte 0x80.
    extended: bool,
}
443 
444 impl Gb18030Encoder {
new(encoding: &'static Encoding, extended_range: bool) -> Encoder445     pub fn new(encoding: &'static Encoding, extended_range: bool) -> Encoder {
446         Encoder::new(
447             encoding,
448             VariantEncoder::Gb18030(Gb18030Encoder {
449                 extended: extended_range,
450             }),
451         )
452     }
453 
max_buffer_length_from_utf16_without_replacement( &self, u16_length: usize, ) -> Option<usize>454     pub fn max_buffer_length_from_utf16_without_replacement(
455         &self,
456         u16_length: usize,
457     ) -> Option<usize> {
458         if self.extended {
459             u16_length.checked_mul(4)
460         } else {
461             // Need to add, because space check is done with the four-byte
462             // assumption.
463             checked_add(2, u16_length.checked_mul(2))
464         }
465     }
466 
max_buffer_length_from_utf8_without_replacement( &self, byte_length: usize, ) -> Option<usize>467     pub fn max_buffer_length_from_utf8_without_replacement(
468         &self,
469         byte_length: usize,
470     ) -> Option<usize> {
471         if self.extended {
472             // 1 to 1
473             // 2 to 2
474             // 3 to 2
475             // 2 to 4 (worst)
476             // 3 to 4
477             // 4 to 4
478             checked_add(2, byte_length.checked_mul(2))
479         } else {
480             // 1 to 1
481             // 2 to 2
482             // 3 to 2
483             // Need to add, because space check is done with the four-byte
484             // assumption.
485             byte_length.checked_add(3)
486         }
487     }
488 
489     ascii_compatible_encoder_functions!(
490         {
491             let bmp_minus_unified_start = bmp.wrapping_sub(0x4E00);
492             if bmp_minus_unified_start < (0x9FA6 - 0x4E00) {
493                 // CJK Unified Ideographs
494                 // Can't fail now, since all are
495                 // mapped.
496                 let (lead, trail) = encode_hanzi(bmp, bmp_minus_unified_start);
497                 handle.write_two(lead, trail)
498             } else if bmp == 0xE5E5 {
499                 // It's not optimal to check for the unmappable
500                 // and for euro at this stage, but getting
501                 // the out of the way makes the rest of the
502                 // code less messy.
503                 return (
504                     EncoderResult::unmappable_from_bmp(bmp),
505                     source.consumed(),
506                     handle.written(),
507                 );
508             } else if bmp == 0x20AC && !self.extended {
509                 handle.write_one(0x80u8)
510             } else {
511                 match gbk_encode_non_unified(bmp) {
512                     Some((lead, trail)) => handle.write_two(lead as u8, trail as u8),
513                     None => {
514                         if !self.extended {
515                             return (
516                                 EncoderResult::unmappable_from_bmp(bmp),
517                                 source.consumed(),
518                                 handle.written(),
519                             );
520                         }
521                         let range_pointer = gb18030_range_encode(bmp);
522                         let first = range_pointer / (10 * 126 * 10);
523                         let rem_first = range_pointer % (10 * 126 * 10);
524                         let second = rem_first / (10 * 126);
525                         let rem_second = rem_first % (10 * 126);
526                         let third = rem_second / 10;
527                         let fourth = rem_second % 10;
528                         handle.write_four(
529                             (first + 0x81) as u8,
530                             (second + 0x30) as u8,
531                             (third + 0x81) as u8,
532                             (fourth + 0x30) as u8,
533                         )
534                     }
535                 }
536             }
537         },
538         {
539             if !self.extended {
540                 return (
541                     EncoderResult::Unmappable(astral),
542                     source.consumed(),
543                     handle.written(),
544                 );
545             }
546             let range_pointer = astral as usize + (189_000usize - 0x1_0000usize);
547             let first = range_pointer / (10 * 126 * 10);
548             let rem_first = range_pointer % (10 * 126 * 10);
549             let second = rem_first / (10 * 126);
550             let rem_second = rem_first % (10 * 126);
551             let third = rem_second / 10;
552             let fourth = rem_second % 10;
553             handle.write_four(
554                 (first + 0x81) as u8,
555                 (second + 0x30) as u8,
556                 (third + 0x81) as u8,
557                 (fourth + 0x30) as u8,
558             )
559         },
560         bmp,
561         astral,
562         self,
563         source,
564         handle,
565         copy_ascii_to_check_space_four,
566         check_space_four,
567         false
568     );
569 }
570 
571 // Any copyright to the test code below this comment is dedicated to the
572 // Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
573 
#[cfg(test)]
mod tests {
    use super::super::testing::*;
    use super::super::*;

    /// Runs the shared `decode` test helper with the `GB18030` encoding.
    fn decode_gb18030(bytes: &[u8], expect: &str) {
        decode(GB18030, bytes, expect);
    }

    /// Runs the shared `encode` test helper with the `GB18030` encoding.
    fn encode_gb18030(string: &str, expect: &[u8]) {
        encode(GB18030, string, expect);
    }

    /// Runs the shared `encode` test helper with the `GBK` encoding.
    fn encode_gbk(string: &str, expect: &[u8]) {
        encode(GBK, string, expect);
    }

    /// Hand-picked decoder vectors: boundaries of the two-byte trail ranges,
    /// invalid bytes and four-byte sequences.
    #[test]
    fn test_gb18030_decode() {
        // Empty
        decode_gb18030(b"", &"");

        // ASCII
        decode_gb18030(b"\x61\x62", "\u{0061}\u{0062}");

        // euro
        // Both the lone byte 0x80 and the two-byte form decode to U+20AC.
        decode_gb18030(b"\x80", "\u{20AC}");
        decode_gb18030(b"\xA2\xE3", "\u{20AC}");

        // two bytes
        // 0x7F is not a trail byte: the lead becomes U+FFFD and 0x7F is
        // decoded on its own.
        decode_gb18030(b"\x81\x40", "\u{4E02}");
        decode_gb18030(b"\x81\x7E", "\u{4E8A}");
        decode_gb18030(b"\x81\x7F", "\u{FFFD}\u{007F}");
        decode_gb18030(b"\x81\x80", "\u{4E90}");
        decode_gb18030(b"\x81\xFE", "\u{4FA2}");
        decode_gb18030(b"\xFE\x40", "\u{FA0C}");
        decode_gb18030(b"\xFE\x7E", "\u{E843}");
        decode_gb18030(b"\xFE\x7F", "\u{FFFD}\u{007F}");
        decode_gb18030(b"\xFE\x80", "\u{4723}");
        decode_gb18030(b"\xFE\xFE", "\u{E4C5}");

        // The difference from the original GB18030
        decode_gb18030(b"\xA3\xA0", "\u{3000}");
        decode_gb18030(b"\xA1\xA1", "\u{3000}");

        // 0xFF
        // 0xFF is valid neither as a lead nor as a trail byte.
        decode_gb18030(b"\xFF\x40", "\u{FFFD}\u{0040}");
        decode_gb18030(b"\xE3\xFF\x9A\x33", "\u{FFFD}\u{FFFD}"); // not \u{FFFD}\u{FFFD}\u{0033} !
        decode_gb18030(b"\xFF\x32\x9A\x33", "\u{FFFD}\u{0032}\u{FFFD}"); // not \u{FFFD}\u{0032}\u{FFFD}\u{0033} !
        decode_gb18030(b"\xFF\x40\x00", "\u{FFFD}\u{0040}\u{0000}");
        decode_gb18030(b"\xE3\xFF\x9A\x33\x00", "\u{FFFD}\u{FFFD}\u{0033}\u{0000}");
        decode_gb18030(
            b"\xFF\x32\x9A\x33\x00",
            "\u{FFFD}\u{0032}\u{FFFD}\u{0033}\u{0000}",
        );

        // Four bytes
        // E3 32 9A 35 is the last valid sequence (U+10FFFF); E3 32 9A 36 is
        // one past it and must be rejected.
        decode_gb18030(b"\x81\x30\x81\x30", "\u{0080}");
        decode_gb18030(b"\x81\x35\xF4\x37", "\u{E7C7}");
        decode_gb18030(b"\x81\x37\xA3\x30", "\u{2603}");
        decode_gb18030(b"\x94\x39\xDA\x33", "\u{1F4A9}");
        decode_gb18030(b"\xE3\x32\x9A\x35", "\u{10FFFF}");
        decode_gb18030(b"\xE3\x32\x9A\x36\x81\x30", "\u{FFFD}\u{FFFD}");
        decode_gb18030(b"\xE3\x32\x9A\x36\x81\x40", "\u{FFFD}\u{4E02}");
        decode_gb18030(b"\xE3\x32\x9A", "\u{FFFD}"); // not \u{FFFD}\u{0032}\u{FFFD} !
        decode_gb18030(b"\xE3\x32\x9A\x00", "\u{FFFD}\u{0032}\u{FFFD}\u{0000}");
    }

    /// Hand-picked encoder vectors for gb18030; mirrors the decode test
    /// where the mapping is round-trippable.
    #[test]
    fn test_gb18030_encode() {
        // Empty
        encode_gb18030("", b"");

        // ASCII
        encode_gb18030("\u{0061}\u{0062}", b"\x61\x62");

        // euro
        // gb18030 uses the two-byte form, not the lone byte 0x80.
        encode_gb18030("\u{20AC}", b"\xA2\xE3");

        // two bytes
        encode_gb18030("\u{4E02}", b"\x81\x40");
        encode_gb18030("\u{4E8A}", b"\x81\x7E");
        encode_gb18030("\u{4E90}", b"\x81\x80");
        encode_gb18030("\u{4FA2}", b"\x81\xFE");
        encode_gb18030("\u{FA0C}", b"\xFE\x40");
        encode_gb18030("\u{E843}", b"\xFE\x7E");
        encode_gb18030("\u{4723}", b"\xFE\x80");
        encode_gb18030("\u{E4C5}", b"\xFE\xFE");

        // The difference from the original GB18030
        // U+E5E5 is unmappable, so a numeric character reference is
        // expected instead.
        encode_gb18030("\u{E5E5}", b"&#58853;");
        encode_gb18030("\u{3000}", b"\xA1\xA1");

        // Four bytes
        encode_gb18030("\u{0080}", b"\x81\x30\x81\x30");
        encode_gb18030("\u{E7C7}", b"\x81\x35\xF4\x37");
        encode_gb18030("\u{2603}", b"\x81\x37\xA3\x30");
        encode_gb18030("\u{1F4A9}", b"\x94\x39\xDA\x33");
        encode_gb18030("\u{10FFFF}", b"\xE3\x32\x9A\x35");

        // Edge cases
        // U+00F7 lies inside the Pinyin range check but is a symbol.
        encode_gb18030("\u{00F7}", b"\xA1\xC2");
    }

    /// Same inputs as the gb18030 encode test, with the expectations that
    /// apply to GBK.
    #[test]
    fn test_gbk_encode() {
        // Empty
        encode_gbk("", b"");

        // ASCII
        encode_gbk("\u{0061}\u{0062}", b"\x61\x62");

        // euro
        // GBK writes the euro sign as the lone byte 0x80.
        encode_gbk("\u{20AC}", b"\x80");

        // two bytes
        encode_gbk("\u{4E02}", b"\x81\x40");
        encode_gbk("\u{4E8A}", b"\x81\x7E");
        encode_gbk("\u{4E90}", b"\x81\x80");
        encode_gbk("\u{4FA2}", b"\x81\xFE");
        encode_gbk("\u{FA0C}", b"\xFE\x40");
        encode_gbk("\u{E843}", b"\xFE\x7E");
        encode_gbk("\u{4723}", b"\xFE\x80");
        encode_gbk("\u{E4C5}", b"\xFE\xFE");

        // The difference from the original gb18030
        encode_gbk("\u{E5E5}", b"&#58853;");
        encode_gbk("\u{3000}", b"\xA1\xA1");

        // Four bytes
        // These take four bytes in gb18030. GBK has no four-byte form, so
        // numeric character references are expected.
        encode_gbk("\u{0080}", b"&#128;");
        encode_gbk("\u{E7C7}", b"&#59335;");
        encode_gbk("\u{2603}", b"&#9731;");
        encode_gbk("\u{1F4A9}", b"&#128169;");
        encode_gbk("\u{10FFFF}", b"&#1114111;");

        // Edge cases
        encode_gbk("\u{00F7}", b"\xA1\xC2");
    }

    /// Decodes the bundled input file and compares the result with the
    /// bundled reference text; no decoding error may occur.
    #[test]
    fn test_gb18030_decode_all() {
        let input = include_bytes!("test_data/gb18030_in.txt");
        let expectation = include_str!("test_data/gb18030_in_ref.txt");
        let (cow, had_errors) = GB18030.decode_without_bom_handling(input);
        assert!(!had_errors, "Should not have had errors.");
        assert_eq!(&cow[..], expectation);
    }

    /// Encodes the bundled input text and compares the result with the
    /// bundled reference bytes; no error may occur and the reported
    /// encoding must be `GB18030`.
    #[test]
    fn test_gb18030_encode_all() {
        let input = include_str!("test_data/gb18030_out.txt");
        let expectation = include_bytes!("test_data/gb18030_out_ref.txt");
        let (cow, encoding, had_errors) = GB18030.encode(input);
        assert!(!had_errors, "Should not have had errors.");
        assert_eq!(encoding, GB18030);
        assert_eq!(&cow[..], &expectation[..]);
    }

    /// Encodes one UTF-16 code unit into a buffer sized exactly by
    /// `max_buffer_length_from_utf16_without_replacement(1)` and checks that
    /// the input is fully consumed and two bytes are written.
    #[test]
    fn test_gb18030_encode_from_utf16_max_length() {
        let mut output = [0u8; 20];
        let mut encoder = GB18030.new_encoder();
        {
            let needed = encoder
                .max_buffer_length_from_utf16_without_replacement(1)
                .unwrap();
            let (result, read, written) = encoder.encode_from_utf16_without_replacement(
                &[0x3000],
                &mut output[..needed],
                true,
            );
            assert_eq!(result, EncoderResult::InputEmpty);
            assert_eq!(read, 1);
            assert_eq!(written, 2);
            assert_eq!(output[0], 0xA1);
            assert_eq!(output[1], 0xA1);
        }
    }
}
754