1 // Copyright Mozilla Foundation. See the COPYRIGHT
2 // file at the top-level directory of this distribution.
3 //
4 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5 // https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6 // <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7 // option. This file may not be copied, modified, or distributed
8 // except according to those terms.
9
10 use super::*;
11 use crate::data::*;
12 use crate::handles::*;
13 use crate::variant::*;
14 // Rust 1.14.0 requires the following despite the asterisk above.
15 use super::in_inclusive_range16;
16 use super::in_range16;
17
/// Up to three bytes that the decoder has set aside for later processing.
///
/// The fourth-byte error path of the decoder stores `One(third_minus_offset)`
/// here; `count` feeds the buffer-size estimates in `extra_from_state`.
enum Gb18030Pending {
    /// Nothing has been set aside.
    None,
    /// One byte has been set aside.
    One(u8),
    /// Two bytes have been set aside.
    Two(u8, u8),
    /// Three bytes have been set aside.
    Three(u8, u8, u8),
}

impl Gb18030Pending {
    /// Returns `true` only for the `None` variant.
    fn is_none(&self) -> bool {
        match *self {
            Gb18030Pending::None => true,
            Gb18030Pending::One(..) | Gb18030Pending::Two(..) | Gb18030Pending::Three(..) => false,
        }
    }

    /// Returns how many bytes the variant carries (0 through 3).
    fn count(&self) -> usize {
        match *self {
            Gb18030Pending::Three(..) => 3,
            Gb18030Pending::Two(..) => 2,
            Gb18030Pending::One(..) => 1,
            Gb18030Pending::None => 0,
        }
    }
}
42
43 pub struct Gb18030Decoder {
44 first: Option<u8>,
45 second: Option<u8>,
46 third: Option<u8>,
47 pending: Gb18030Pending,
48 pending_ascii: Option<u8>,
49 }
50
51 impl Gb18030Decoder {
new() -> VariantDecoder52 pub fn new() -> VariantDecoder {
53 VariantDecoder::Gb18030(Gb18030Decoder {
54 first: None,
55 second: None,
56 third: None,
57 pending: Gb18030Pending::None,
58 pending_ascii: None,
59 })
60 }
61
in_neutral_state(&self) -> bool62 pub fn in_neutral_state(&self) -> bool {
63 self.first.is_none()
64 && self.second.is_none()
65 && self.third.is_none()
66 && self.pending.is_none()
67 && self.pending_ascii.is_none()
68 }
69
extra_from_state(&self, byte_length: usize) -> Option<usize>70 fn extra_from_state(&self, byte_length: usize) -> Option<usize> {
71 byte_length.checked_add(
72 self.pending.count()
73 + match self.first {
74 None => 0,
75 Some(_) => 1,
76 }
77 + match self.second {
78 None => 0,
79 Some(_) => 1,
80 }
81 + match self.third {
82 None => 0,
83 Some(_) => 1,
84 }
85 + match self.pending_ascii {
86 None => 0,
87 Some(_) => 1,
88 },
89 )
90 }
91
max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize>92 pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
93 // ASCII: 1 to 1 (worst case)
94 // gbk: 2 to 1
95 // ranges: 4 to 1 or 4 to 2
96 checked_add(1, self.extra_from_state(byte_length))
97 }
98
max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize>99 pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
100 // ASCII: 1 to 1
101 // gbk: 2 to 2 or 2 to 3
102 // ranges: 4 to 2, 4 to 3 or 4 to 4
103 // 0x80: 1 to 3 (worst case)
104 self.max_utf8_buffer_length(byte_length)
105 }
106
max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize>107 pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
108 checked_add(1, checked_mul(3, self.extra_from_state(byte_length)))
109 }
110
// Invokes `gb18030_decoder_functions!` (not defined in this part of the
// file) with four code blocks, one per byte position of a gb18030 sequence,
// followed by the identifiers those blocks refer to.
//
//  1. First byte: evaluates to the byte minus 0x81, or handles 0x80 and
//     out-of-range bytes.
//  2. Second byte of a two-byte sequence: writes the decoded code unit or
//     returns a malformed-sequence result.
//  3. Third byte of a four-byte sequence: evaluates to the byte minus 0x81
//     or returns a malformed-sequence result.
//  4. Fourth byte: computes the range pointer and writes the decoded scalar
//     value or returns a malformed-sequence result.
111 gb18030_decoder_functions!(
// Block 1: first byte.
112 {
113 // If first is between 0x81 and 0xFE, inclusive,
114 // subtract offset 0x81.
115 let non_ascii_minus_offset = non_ascii.wrapping_sub(0x81);
116 if non_ascii_minus_offset > (0xFE - 0x81) {
// The single byte 0x80 decodes to U+20AC (euro sign); any other
// out-of-range byte is a one-byte malformed sequence.
117 if non_ascii == 0x80 {
118 handle.write_upper_bmp(0x20ACu16);
119 continue 'outermost;
120 }
121 return (DecoderResult::Malformed(1, 0),
122 source.consumed(),
123 handle.written());
124 }
125 non_ascii_minus_offset
126 },
// Block 2: second byte of a two-byte sequence.
127 {
128 // Two-byte (or error)
129 if first_minus_offset >= 0x20 {
130 // Not the gbk ideograph range above GB2312
131 let trail_minus_offset = second.wrapping_sub(0xA1);
132 if trail_minus_offset <= (0xFE - 0xA1) {
133 // GB2312
134 let hanzi_lead = first_minus_offset.wrapping_sub(0x2F);
135 if hanzi_lead < (0x77 - 0x2F) {
136 // Level 1 Hanzi, Level 2 Hanzi
137 // or one of the 5 PUA code
138 // points in between.
139 let hanzi_pointer = mul_94(hanzi_lead) + trail_minus_offset as usize;
140 let upper_bmp = GB2312_HANZI[hanzi_pointer];
141 handle.write_upper_bmp(upper_bmp)
142 } else if first_minus_offset == 0x20 {
143 // Symbols (starting with ideographic space)
144 let bmp = GB2312_SYMBOLS[trail_minus_offset as usize];
145 handle.write_bmp_excl_ascii(bmp)
146 } else if first_minus_offset == 0x25 && ((trail_minus_offset.wrapping_sub(63) as usize) < GB2312_SYMBOLS_AFTER_GREEK.len()) {
147 handle.write_bmp_excl_ascii(GB2312_SYMBOLS_AFTER_GREEK[trail_minus_offset.wrapping_sub(63) as usize])
148 } else if first_minus_offset == 0x27 && (trail_minus_offset as usize) < GB2312_PINYIN.len() {
149 handle.write_bmp_excl_ascii(GB2312_PINYIN[trail_minus_offset as usize])
150 } else if first_minus_offset > 0x76 {
151 // Bottom PUA
152 let pua = (0xE234 + mul_94(first_minus_offset - 0x77) + trail_minus_offset as usize) as u16;
153 handle.write_upper_bmp(pua)
154 } else {
155 let bmp = gb2312_other_decode((mul_94(first_minus_offset - 0x21) + (trail_minus_offset as usize)) as u16);
156 handle.write_bmp_excl_ascii(bmp)
157 }
158 } else {
159 // gbk range on the left
160 let mut trail_minus_offset = second.wrapping_sub(0x40);
161 if trail_minus_offset > (0x7E - 0x40) {
162 let trail_minus_range_start = second.wrapping_sub(0x80);
163 if trail_minus_range_start > (0xA0 - 0x80) {
// A second byte below 0x80 is unread so that it gets decoded on
// its own; only the lead byte is reported as malformed. Otherwise
// both bytes are consumed and reported.
164 if second < 0x80 {
165 return (DecoderResult::Malformed(1, 0),
166 unread_handle_second.unread(),
167 handle.written());
168 }
169 return (DecoderResult::Malformed(2, 0),
170 unread_handle_second.consumed(),
171 handle.written());
172 }
// Trail bytes from 0x80 up are offset by 0x41 rather than 0x40,
// which closes the gap left by 0x7F not being a valid trail byte.
173 trail_minus_offset = second - 0x41;
174 }
175 // Zero-base lead
176 let left_lead = first_minus_offset - 0x20;
177 let left_pointer = left_lead as usize * (190 - 94) +
178 trail_minus_offset as usize;
179 let gbk_left_ideograph_pointer = left_pointer.wrapping_sub((0x29 - 0x20) * (190 - 94));
180 if gbk_left_ideograph_pointer < (((0x7D - 0x29) * (190 - 94)) - 5) {
181 let upper_bmp = gbk_left_ideograph_decode(gbk_left_ideograph_pointer as u16);
182 handle.write_upper_bmp(upper_bmp)
183 } else if left_pointer < ((0x29 - 0x20) * (190 - 94)) {
184 let bmp = gbk_other_decode(left_pointer as u16);
185 handle.write_bmp_excl_ascii(bmp)
186 } else {
187 let bottom_pointer = left_pointer - (((0x7D - 0x20) * (190 - 94)) - 5);
188 let upper_bmp = GBK_BOTTOM[bottom_pointer];
189 handle.write_upper_bmp(upper_bmp)
190 }
191 }
192 } else {
193 // gbk ideograph range above GB2312
194 let mut trail_minus_offset = second.wrapping_sub(0x40);
195 if trail_minus_offset > (0x7E - 0x40) {
196 let trail_minus_range_start = second.wrapping_sub(0x80);
197 if trail_minus_range_start > (0xFE - 0x80) {
// Same error handling as in the left-hand gbk range above.
198 if second < 0x80 {
199 return (DecoderResult::Malformed(1, 0),
200 unread_handle_second.unread(),
201 handle.written());
202 }
203 return (DecoderResult::Malformed(2, 0),
204 unread_handle_second.consumed(),
205 handle.written());
206 }
207 trail_minus_offset = second - 0x41;
208 }
209 let pointer = first_minus_offset as usize * 190usize +
210 trail_minus_offset as usize;
211 let upper_bmp = gbk_top_ideograph_decode(pointer as u16);
212 handle.write_upper_bmp(upper_bmp)
213 }
214 },
// Block 3: third byte of a four-byte sequence.
215 {
216 // If third is between 0x81 and 0xFE, inclusive,
217 // subtract offset 0x81.
218 let third_minus_offset = third.wrapping_sub(0x81);
219 if third_minus_offset > (0xFE - 0x81) {
220 // We have an error. Let's inline what's going
221 // to happen when `second` is
222 // reprocessed. (`third` gets unread.)
223 // `second` is guaranteed ASCII, so let's
224 // put it in `pending_ascii`. Recompute
225 // `second` from `second_minus_offset`.
226 self.pending_ascii = Some(second_minus_offset + 0x30);
227 // Now unread `third` and designate the previous
228 // `first` as being in error.
229 return (DecoderResult::Malformed(1, 1),
230 unread_handle_third.unread(),
231 handle.written());
232 }
233 third_minus_offset
234 },
// Block 4: fourth byte of a four-byte sequence.
235 {
236 // If fourth is between 0x30 and 0x39, inclusive,
237 // subtract offset 0x30.
238 //
239 // If we have an error, we'll inline what's going
240 // to happen when `second` and `third` are
241 // reprocessed. (`fourth` gets unread.)
242 // `second` is guaranteed ASCII, so let's
243 // put it in `pending_ascii`. Recompute
244 // `second` from `second_minus_offset` to
245 // make this block reusable when `second`
246 // is not in scope.
247 //
248 // `third` is guaranteed to be in the range
249 // that makes it become the new `self.first`.
250 //
251 // `fourth` gets unread and the previous
252 // `first` gets designated as being in error.
253 let fourth_minus_offset = fourth.wrapping_sub(0x30);
254 if fourth_minus_offset > (0x39 - 0x30) {
255 self.pending_ascii = Some(second_minus_offset + 0x30);
256 self.pending = Gb18030Pending::One(third_minus_offset);
257 return (DecoderResult::Malformed(1, 2),
258 unread_handle_fourth.unread(),
259 handle.written());
260 }
// Linear pointer over the four-byte space: 10 * 126 * 10 values per
// first byte, 10 * 126 per second byte and 10 per third byte.
261 let pointer = (first_minus_offset as usize * (10 * 126 * 10)) +
262 (second_minus_offset as usize * (10 * 126)) +
263 (third_minus_offset as usize * 10) +
264 fourth_minus_offset as usize;
265 if pointer <= 39419 {
266 // BMP
// Pointer 7457 is special-cased to U+E7C7 instead of going through
// the range table.
267 if pointer == 7457 {
268 handle.write_upper_bmp(0xE7C7)
269 } else {
270 handle.write_bmp_excl_ascii(gb18030_range_decode(pointer as u16))
271 }
272 } else if pointer >= 189_000 && pointer <= 1_237_575 {
273 // Astral
// Pointer 189_000 corresponds to U+10000, so 1_237_575 is U+10FFFF.
274 handle.write_astral((pointer - (189_000usize - 0x1_0000usize)) as u32)
275 } else {
// Pointers between the BMP and astral ranges, or past U+10FFFF,
// make the whole four-byte sequence malformed.
276 return (DecoderResult::Malformed(4, 0),
277 unread_handle_fourth.consumed(),
278 handle.written());
279 }
280 },
// Identifier names handed to the macro so that the blocks above can refer
// to the decoder, the current bytes, the unread handles, the source, the
// output handle and the outer loop label.
281 self,
282 non_ascii,
283 first_minus_offset,
284 second,
285 second_minus_offset,
286 unread_handle_second,
287 third,
288 third_minus_offset,
289 unread_handle_third,
290 fourth,
291 fourth_minus_offset,
292 unread_handle_fourth,
293 source,
294 handle,
295 'outermost);
296 }
297
298 // XXX Experiment with inline directives
gbk_encode_non_unified(bmp: u16) -> Option<(usize, usize)>299 fn gbk_encode_non_unified(bmp: u16) -> Option<(usize, usize)> {
300 // Try ideographic punctuation first as it's the most likely case.
301 // Throwing in the check for full-width currencies and tilde is probably
302 // more size-efficient here than elsewhere.
303 if in_inclusive_range16(bmp, 0x2014, 0x3017) || in_inclusive_range16(bmp, 0xFF04, 0xFFE1) {
304 if let Some(pos) = position(&GB2312_SYMBOLS[..], bmp) {
305 return Some((0xA1, pos + 0xA1));
306 }
307 }
308 // Ext A
309 if in_range16(bmp, 0x3400, 0x4E00) {
310 return position(&GBK_BOTTOM[21..100], bmp).map(|pos| {
311 (
312 0xFE,
313 pos + if pos < (0x3F - 16) {
314 0x40 + 16
315 } else {
316 0x41 + 16
317 },
318 )
319 });
320 }
321 // Compatibility ideographs
322 if in_range16(bmp, 0xF900, 0xFB00) {
323 return position(&GBK_BOTTOM[0..21], bmp).map(|pos| {
324 if pos < 5 {
325 // end of second to last row
326 (0xFD, pos + (190 - 94 - 5 + 0x41))
327 } else {
328 // last row
329 (0xFE, pos + (0x40 - 5))
330 }
331 });
332 }
333 // Handle everything below U+02CA, which is in GBK_OTHER.
334 if bmp < 0x02CA {
335 if in_range16(bmp, 0x00E0, 0x0262) && bmp != 0x00F7 {
336 // Pinyin except U+1E3F
337 if let Some(pos) = position(&GB2312_PINYIN[..], bmp) {
338 return Some((0xA8, pos + 0xA1));
339 }
340 } else if in_inclusive_range16(bmp, 0x00A4, 0x00F7)
341 || in_inclusive_range16(bmp, 0x02C7, 0x02C9)
342 {
343 // Diacritics and Latin 1 symbols
344 if let Some(pos) = position(&GB2312_SYMBOLS[3..(0xAC - 0x60)], bmp) {
345 return Some((0xA1, pos + 0xA1 + 3));
346 }
347 }
348 return None;
349 }
350 if bmp >= 0xE794 {
351 // Various brackets, all in PUA or full-width regions
352 if let Some(pos) = position(&GB2312_SYMBOLS_AFTER_GREEK[..], bmp) {
353 return Some((0xA6, pos + (0x9F - 0x60 + 0xA1)));
354 }
355 } else if bmp == 0x1E3F {
356 // The one Pinyin placed elsewhere on the BMP
357 return Some((0xA8, 0x7B - 0x60 + 0xA1));
358 } else if in_range16(bmp, 0xA000, 0xD800) {
359 // Since Korean has usage in China, let's spend a branch to fast-track
360 // Hangul.
361 return None;
362 }
363 // GB2312 other (except bottom PUA and PUA between Hanzi levels).
364 if let Some(other_pointer) = gb2312_other_encode(bmp) {
365 let other_lead = other_pointer as usize / 94;
366 let other_trail = other_pointer as usize % 94;
367 return Some((0xA2 + other_lead, 0xA1 + other_trail));
368 }
369 // At this point, we've handled all mappable characters above U+02D9 but
370 // below U+2010. Let's check for that range in order to let lower BMP
371 // characters used for minority languages in China avoid the subsequent
372 // search that deals mainly with various symbols.
373 if in_range16(bmp, 0x02DA, 0x2010) {
374 return None;
375 }
376 // GBK other (except radicals and PUA in GBK_BOTTOM).
377 if let Some(other_pointer) = gbk_other_encode(bmp) {
378 let other_lead = other_pointer as usize / (190 - 94);
379 let other_trail = other_pointer as usize % (190 - 94);
380 let offset = if other_trail < 0x3F { 0x40 } else { 0x41 };
381 return Some((other_lead + (0x81 + 0x20), other_trail + offset));
382 }
383 // CJK Radicals Supplement or PUA in GBK_BOTTOM
384 if in_inclusive_range16(bmp, 0x2E81, 0x2ECA) || in_inclusive_range16(bmp, 0xE816, 0xE864) {
385 if let Some(pos) = position(&GBK_BOTTOM[21..], bmp) {
386 let trail = pos + 16;
387 let offset = if trail < 0x3F { 0x40 } else { 0x41 };
388 return Some((0xFE, trail + offset));
389 }
390 }
391 // GB2312 bottom PUA
392 let bmp_minus_gb2312_bottom_pua = bmp.wrapping_sub(0xE234);
393 if bmp_minus_gb2312_bottom_pua <= (0xE4C5 - 0xE234) {
394 let pua_lead = bmp_minus_gb2312_bottom_pua as usize / 94;
395 let pua_trail = bmp_minus_gb2312_bottom_pua as usize % 94;
396 return Some((0x81 + 0x77 + pua_lead, 0xA1 + pua_trail));
397 }
398 // PUA between Hanzi Levels
399 let bmp_minus_pua_between_hanzi = bmp.wrapping_sub(0xE810);
400 if bmp_minus_pua_between_hanzi < 5 {
401 return Some((0x81 + 0x56, 0xFF - 5 + bmp_minus_pua_between_hanzi as usize));
402 }
403 None
404 }
405
406 #[cfg(not(feature = "fast-gb-hanzi-encode"))]
407 #[inline(always)]
encode_hanzi(bmp: u16, _: u16) -> (u8, u8)408 fn encode_hanzi(bmp: u16, _: u16) -> (u8, u8) {
409 if let Some((lead, trail)) = gb2312_level1_hanzi_encode(bmp) {
410 (lead, trail)
411 } else if let Some(hanzi_pointer) = gb2312_level2_hanzi_encode(bmp) {
412 let hanzi_lead = (hanzi_pointer / 94) + (0xD8);
413 let hanzi_trail = (hanzi_pointer % 94) + 0xA1;
414 (hanzi_lead as u8, hanzi_trail as u8)
415 } else {
416 let (lead, gbk_trail) = if bmp < 0x72DC {
417 // Above GB2312
418 let pointer = gbk_top_ideograph_encode(bmp) as usize;
419 let lead = (pointer / 190) + 0x81;
420 let gbk_trail = pointer % 190;
421 (lead, gbk_trail)
422 } else {
423 // To the left of GB2312
424 let gbk_left_ideograph_pointer = gbk_left_ideograph_encode(bmp) as usize;
425 let lead = (gbk_left_ideograph_pointer / (190 - 94)) + (0x81 + 0x29);
426 let gbk_trail = gbk_left_ideograph_pointer % (190 - 94);
427 (lead, gbk_trail)
428 };
429 let offset = if gbk_trail < 0x3F { 0x40 } else { 0x41 };
430 (lead as u8, (gbk_trail + offset) as u8)
431 }
432 }
433
/// Encodes a CJK Unified Ideograph as a GBK byte pair by delegating to
/// `gbk_hanzi_encode` with the code point's offset from 0x4E00.
///
/// Built when the `fast-gb-hanzi-encode` feature is on. The first parameter
/// (the code point itself) is unused here; it exists so that both
/// `encode_hanzi` variants share one signature.
#[cfg(feature = "fast-gb-hanzi-encode")]
#[inline(always)]
fn encode_hanzi(_: u16, bmp_minus_unified_start: u16) -> (u8, u8) {
    let byte_pair = gbk_hanzi_encode(bmp_minus_unified_start);
    byte_pair
}
439
/// Encoder state shared by the two-byte-only and the four-byte-capable
/// variants of the encoder.
pub struct Gb18030Encoder {
    /// When `true`, characters without a one- or two-byte mapping are
    /// written as four-byte sequences and U+20AC takes the regular two-byte
    /// path. When `false`, such characters are reported as unmappable and
    /// U+20AC is written as the single byte 0x80.
    extended: bool,
}
443
444 impl Gb18030Encoder {
new(encoding: &'static Encoding, extended_range: bool) -> Encoder445 pub fn new(encoding: &'static Encoding, extended_range: bool) -> Encoder {
446 Encoder::new(
447 encoding,
448 VariantEncoder::Gb18030(Gb18030Encoder {
449 extended: extended_range,
450 }),
451 )
452 }
453
max_buffer_length_from_utf16_without_replacement( &self, u16_length: usize, ) -> Option<usize>454 pub fn max_buffer_length_from_utf16_without_replacement(
455 &self,
456 u16_length: usize,
457 ) -> Option<usize> {
458 if self.extended {
459 u16_length.checked_mul(4)
460 } else {
461 // Need to add, because space check is done with the four-byte
462 // assumption.
463 checked_add(2, u16_length.checked_mul(2))
464 }
465 }
466
max_buffer_length_from_utf8_without_replacement( &self, byte_length: usize, ) -> Option<usize>467 pub fn max_buffer_length_from_utf8_without_replacement(
468 &self,
469 byte_length: usize,
470 ) -> Option<usize> {
471 if self.extended {
472 // 1 to 1
473 // 2 to 2
474 // 3 to 2
475 // 2 to 4 (worst)
476 // 3 to 4
477 // 4 to 4
478 checked_add(2, byte_length.checked_mul(2))
479 } else {
480 // 1 to 1
481 // 2 to 2
482 // 3 to 2
483 // Need to add, because space check is done with the four-byte
484 // assumption.
485 byte_length.checked_add(3)
486 }
487 }
488
// Invokes `ascii_compatible_encoder_functions!` (not defined in this part of
// the file) with two code blocks followed by the identifiers and options
// they rely on.
//
//  1. Handles a non-ASCII BMP code point `bmp`.
//  2. Handles a supplementary-plane code point `astral`.
489 ascii_compatible_encoder_functions!(
// Block 1: non-ASCII BMP code point.
490 {
491 let bmp_minus_unified_start = bmp.wrapping_sub(0x4E00);
492 if bmp_minus_unified_start < (0x9FA6 - 0x4E00) {
493 // CJK Unified Ideographs
494 // Can't fail now, since all are
495 // mapped.
496 let (lead, trail) = encode_hanzi(bmp, bmp_minus_unified_start);
497 handle.write_two(lead, trail)
498 } else if bmp == 0xE5E5 {
499 // It's not optimal to check for the unmappable
500 // and for euro at this stage, but getting
501 // them out of the way makes the rest of the
502 // code less messy.
503 return (
504 EncoderResult::unmappable_from_bmp(bmp),
505 source.consumed(),
506 handle.written(),
507 );
508 } else if bmp == 0x20AC && !self.extended {
// Without four-byte output, the euro sign is the single byte 0x80.
509 handle.write_one(0x80u8)
510 } else {
511 match gbk_encode_non_unified(bmp) {
512 Some((lead, trail)) => handle.write_two(lead as u8, trail as u8),
513 None => {
// No two-byte mapping: unmappable unless four-byte output is
// enabled.
514 if !self.extended {
515 return (
516 EncoderResult::unmappable_from_bmp(bmp),
517 source.consumed(),
518 handle.written(),
519 );
520 }
// Split the range pointer into four bytes: the first and third
// are offset by 0x81, the second and fourth by 0x30.
521 let range_pointer = gb18030_range_encode(bmp);
522 let first = range_pointer / (10 * 126 * 10);
523 let rem_first = range_pointer % (10 * 126 * 10);
524 let second = rem_first / (10 * 126);
525 let rem_second = rem_first % (10 * 126);
526 let third = rem_second / 10;
527 let fourth = rem_second % 10;
528 handle.write_four(
529 (first + 0x81) as u8,
530 (second + 0x30) as u8,
531 (third + 0x81) as u8,
532 (fourth + 0x30) as u8,
533 )
534 }
535 }
536 }
537 },
// Block 2: supplementary-plane code point.
538 {
539 if !self.extended {
540 return (
541 EncoderResult::Unmappable(astral),
542 source.consumed(),
543 handle.written(),
544 );
545 }
// U+10000 corresponds to range pointer 189_000; the pointer is split
// into four bytes the same way as in the BMP block.
546 let range_pointer = astral as usize + (189_000usize - 0x1_0000usize);
547 let first = range_pointer / (10 * 126 * 10);
548 let rem_first = range_pointer % (10 * 126 * 10);
549 let second = rem_first / (10 * 126);
550 let rem_second = rem_first % (10 * 126);
551 let third = rem_second / 10;
552 let fourth = rem_second % 10;
553 handle.write_four(
554 (first + 0x81) as u8,
555 (second + 0x30) as u8,
556 (third + 0x81) as u8,
557 (fourth + 0x30) as u8,
558 )
559 },
// Identifiers and options handed to the macro.
// NOTE(review): `copy_ascii_to_check_space_four` and `check_space_four`
// presumably make the generated code reserve four output bytes per
// character, matching the "four-byte assumption" remarks on the
// max_buffer_length methods -- confirm against the macro definition. The
// meaning of the trailing `false` is not visible from here.
560 bmp,
561 astral,
562 self,
563 source,
564 handle,
565 copy_ascii_to_check_space_four,
566 check_space_four,
567 false
568 );
569 }
570
571 // Any copyright to the test code below this comment is dedicated to the
572 // Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
573
574 #[cfg(test)]
575 mod tests {
576 use super::super::testing::*;
577 use super::super::*;
578
decode_gb18030(bytes: &[u8], expect: &str)579 fn decode_gb18030(bytes: &[u8], expect: &str) {
580 decode(GB18030, bytes, expect);
581 }
582
encode_gb18030(string: &str, expect: &[u8])583 fn encode_gb18030(string: &str, expect: &[u8]) {
584 encode(GB18030, string, expect);
585 }
586
encode_gbk(string: &str, expect: &[u8])587 fn encode_gbk(string: &str, expect: &[u8]) {
588 encode(GBK, string, expect);
589 }
590
/// Decodes a table of hand-picked byte sequences and checks the text each
/// one produces, including the U+FFFD output for malformed input.
#[test]
fn test_gb18030_decode() {
    let cases: &[(&[u8], &str)] = &[
        // Empty
        (b"", ""),
        // ASCII
        (b"\x61\x62", "\u{0061}\u{0062}"),
        // euro
        (b"\x80", "\u{20AC}"),
        (b"\xA2\xE3", "\u{20AC}"),
        // two bytes
        (b"\x81\x40", "\u{4E02}"),
        (b"\x81\x7E", "\u{4E8A}"),
        (b"\x81\x7F", "\u{FFFD}\u{007F}"),
        (b"\x81\x80", "\u{4E90}"),
        (b"\x81\xFE", "\u{4FA2}"),
        (b"\xFE\x40", "\u{FA0C}"),
        (b"\xFE\x7E", "\u{E843}"),
        (b"\xFE\x7F", "\u{FFFD}\u{007F}"),
        (b"\xFE\x80", "\u{4723}"),
        (b"\xFE\xFE", "\u{E4C5}"),
        // The difference from the original GB18030
        (b"\xA3\xA0", "\u{3000}"),
        (b"\xA1\xA1", "\u{3000}"),
        // 0xFF
        (b"\xFF\x40", "\u{FFFD}\u{0040}"),
        // not \u{FFFD}\u{FFFD}\u{0033} !
        (b"\xE3\xFF\x9A\x33", "\u{FFFD}\u{FFFD}"),
        // not \u{FFFD}\u{0032}\u{FFFD}\u{0033} !
        (b"\xFF\x32\x9A\x33", "\u{FFFD}\u{0032}\u{FFFD}"),
        (b"\xFF\x40\x00", "\u{FFFD}\u{0040}\u{0000}"),
        (b"\xE3\xFF\x9A\x33\x00", "\u{FFFD}\u{FFFD}\u{0033}\u{0000}"),
        (
            b"\xFF\x32\x9A\x33\x00",
            "\u{FFFD}\u{0032}\u{FFFD}\u{0033}\u{0000}",
        ),
        // Four bytes
        (b"\x81\x30\x81\x30", "\u{0080}"),
        (b"\x81\x35\xF4\x37", "\u{E7C7}"),
        (b"\x81\x37\xA3\x30", "\u{2603}"),
        (b"\x94\x39\xDA\x33", "\u{1F4A9}"),
        (b"\xE3\x32\x9A\x35", "\u{10FFFF}"),
        (b"\xE3\x32\x9A\x36\x81\x30", "\u{FFFD}\u{FFFD}"),
        (b"\xE3\x32\x9A\x36\x81\x40", "\u{FFFD}\u{4E02}"),
        // not \u{FFFD}\u{0032}\u{FFFD} !
        (b"\xE3\x32\x9A", "\u{FFFD}"),
        (b"\xE3\x32\x9A\x00", "\u{FFFD}\u{0032}\u{FFFD}\u{0000}"),
    ];
    for &(bytes, expect) in cases.iter() {
        decode_gb18030(bytes, expect);
    }
}
641
/// Encodes hand-picked strings with `GB18030` and checks the exact bytes,
/// covering one-, two- and four-byte output plus the single code point
/// (U+E5E5) that the encoder reports as unmappable.
#[test]
fn test_gb18030_encode() {
    // Empty
    encode_gb18030("", b"");

    // ASCII
    encode_gb18030("\u{0061}\u{0062}", b"\x61\x62");

    // euro
    encode_gb18030("\u{20AC}", b"\xA2\xE3");

    // two bytes
    encode_gb18030("\u{4E02}", b"\x81\x40");
    encode_gb18030("\u{4E8A}", b"\x81\x7E");
    if !cfg!(miri) {
        // Miri is too slow
        encode_gb18030("\u{4E90}", b"\x81\x80");
        encode_gb18030("\u{4FA2}", b"\x81\xFE");
        encode_gb18030("\u{FA0C}", b"\xFE\x40");
        encode_gb18030("\u{E843}", b"\xFE\x7E");
        encode_gb18030("\u{4723}", b"\xFE\x80");
        encode_gb18030("\u{E4C5}", b"\xFE\xFE");
    }

    // The difference from the original GB18030: the encoder returns an
    // unmappable result for U+E5E5, so the expectation is its decimal
    // numeric character reference (0xE5E5 == 58853) rather than empty
    // output.
    // NOTE(review): assumes the shared `encode` helper replaces unmappable
    // characters with numeric character references -- the helper is defined
    // outside this file.
    encode_gb18030("\u{E5E5}", b"&#58853;");
    encode_gb18030("\u{3000}", b"\xA1\xA1");

    // Four bytes
    encode_gb18030("\u{0080}", b"\x81\x30\x81\x30");
    encode_gb18030("\u{E7C7}", b"\x81\x35\xF4\x37");
    if !cfg!(miri) {
        // Miri is too slow
        encode_gb18030("\u{2603}", b"\x81\x37\xA3\x30");
        encode_gb18030("\u{1F4A9}", b"\x94\x39\xDA\x33");
        encode_gb18030("\u{10FFFF}", b"\xE3\x32\x9A\x35");
    }

    // Edge cases
    encode_gb18030("\u{00F7}", b"\xA1\xC2");
}
683
/// Encodes hand-picked strings with `GBK` and checks the exact bytes.
///
/// `GBK` shares the one- and two-byte mappings exercised in
/// `test_gb18030_encode` but writes U+20AC as the single byte 0x80 and
/// cannot produce four-byte sequences, so the characters in the
/// "Four bytes" section are unmappable here.
#[test]
fn test_gbk_encode() {
    // Empty
    encode_gbk("", b"");

    // ASCII
    encode_gbk("\u{0061}\u{0062}", b"\x61\x62");

    // euro
    encode_gbk("\u{20AC}", b"\x80");

    // two bytes
    encode_gbk("\u{4E02}", b"\x81\x40");
    encode_gbk("\u{4E8A}", b"\x81\x7E");
    if !cfg!(miri) {
        // Miri is too slow
        encode_gbk("\u{4E90}", b"\x81\x80");
        encode_gbk("\u{4FA2}", b"\x81\xFE");
        encode_gbk("\u{FA0C}", b"\xFE\x40");
        encode_gbk("\u{E843}", b"\xFE\x7E");
        encode_gbk("\u{4723}", b"\xFE\x80");
        encode_gbk("\u{E4C5}", b"\xFE\xFE");
    }

    // The difference from the original gb18030: U+E5E5 is unmappable, so
    // the expectation is its decimal numeric character reference.
    // NOTE(review): assumes the shared `encode` helper replaces unmappable
    // characters with numeric character references -- the helper is defined
    // outside this file.
    encode_gbk("\u{E5E5}", b"&#58853;");
    encode_gbk("\u{3000}", b"\xA1\xA1");

    // Four bytes: unmappable in GBK, so each expectation is the decimal
    // numeric character reference of the input, spelled in ASCII (byte
    // string literals cannot contain non-ASCII characters).
    encode_gbk("\u{0080}", b"&#128;");
    encode_gbk("\u{E7C7}", b"&#59335;");
    if !cfg!(miri) {
        // Miri is too slow
        encode_gbk("\u{2603}", b"&#9731;");
        encode_gbk("\u{1F4A9}", b"&#128169;");
        encode_gbk("\u{10FFFF}", b"&#1114111;");
    }

    // Edge cases
    encode_gbk("\u{00F7}", b"\xA1\xC2");
}
725
/// Decodes the `gb18030_in.txt` fixture in one call and compares the result
/// with the reference text; the fixture must decode without errors.
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_gb18030_decode_all() {
    let reference_text = include_str!("test_data/gb18030_in_ref.txt");
    let encoded_input = include_bytes!("test_data/gb18030_in.txt");
    let (decoded, had_errors) = GB18030.decode_without_bom_handling(encoded_input);
    assert!(!had_errors, "Should not have had errors.");
    assert_eq!(&decoded[..], reference_text);
}
735
/// Encodes the `gb18030_out.txt` fixture in one call and compares the result
/// with the reference bytes; the fixture must encode without errors and the
/// reported output encoding must be `GB18030`.
#[test]
#[cfg_attr(miri, ignore)] // Miri is too slow
fn test_gb18030_encode_all() {
    let reference_bytes = include_bytes!("test_data/gb18030_out_ref.txt");
    let text_input = include_str!("test_data/gb18030_out.txt");
    let (encoded, output_encoding, had_errors) = GB18030.encode(text_input);
    assert!(!had_errors, "Should not have had errors.");
    assert_eq!(output_encoding, GB18030);
    assert_eq!(&encoded[..], &reference_bytes[..]);
}
746
/// Checks that an output buffer sized exactly by
/// `max_buffer_length_from_utf16_without_replacement(1)` is enough to encode
/// the single code unit U+3000, which produces the two bytes 0xA1 0xA1.
#[test]
fn test_gb18030_encode_from_utf16_max_length() {
    let mut output = [0u8; 20];
    let mut encoder = GB18030.new_encoder();
    let needed = encoder
        .max_buffer_length_from_utf16_without_replacement(1)
        .unwrap();
    let input = [0x3000u16];
    let (result, read, written) =
        encoder.encode_from_utf16_without_replacement(&input, &mut output[..needed], true);
    assert_eq!(result, EncoderResult::InputEmpty);
    assert_eq!(read, 1);
    assert_eq!(written, 2);
    assert_eq!(&output[..2], &[0xA1u8, 0xA1u8][..]);
}
767 }
768