1 // This is a part of rust-encoding.
2 // Copyright (c) 2013-2015, Kang Seonghoon.
3 // See README.md and LICENSE.txt for details.
4 
5 //! Legacy simplified Chinese encodings based on GB 2312 and GB 18030.
6 
7 use std::convert::Into;
8 use std::marker::PhantomData;
9 use std::default::Default;
10 use util::StrCharIndex;
11 use index_simpchinese as index;
12 use types::*;
13 
/// An implementation type for GBK.
///
/// This is a field-less marker type; the GBK-specific behavior is supplied by
/// its `GBType` implementation.
///
/// Can be used as a type parameter to `GBEncoding` and `GBEncoder`.
/// (GB18030Decoder is shared by both.)
#[derive(Clone, Copy)]
pub struct GBK;
20 
/// An implementation type for GB18030.
///
/// This is a field-less marker type; the GB18030-specific behavior is supplied by
/// its `GBType` implementation.
///
/// Can be used as a type parameter to `GBEncoding` and `GBEncoder`.
/// (GB18030Decoder is shared by both.)
#[derive(Clone, Copy)]
pub struct GB18030;
27 
/// An internal trait used to customize GBK and GB18030 implementations.
///
/// Every item is an associated function without a receiver,
/// so the flavor is selected purely through the type parameter.
#[doc(hidden)] // XXX never intended to be used publicly, should be gone later
pub trait GBType: Clone + 'static {
    /// The name that `GBEncoding<Self>` reports from `Encoding::name`.
    fn name() -> &'static str;
    /// The name that `GBEncoding<Self>` reports from `Encoding::whatwg_name`.
    fn whatwg_name() -> Option<&'static str>;
    /// Whether `GBEncoder<Self>` runs with the "gbk flag" set, i.e. writes U+20AC as
    /// the single byte 0x80 and rejects characters missing from the two-byte index.
    fn initial_gbk_flag() -> bool;
}
35 
36 impl GBType for GBK {
name() -> &'static str37     fn name() -> &'static str { "gbk" }
whatwg_name() -> Option<&'static str>38     fn whatwg_name() -> Option<&'static str> { Some("gbk") }
initial_gbk_flag() -> bool39     fn initial_gbk_flag() -> bool { true }
40 }
41 
42 impl GBType for GB18030 {
name() -> &'static str43     fn name() -> &'static str { "gb18030" }
whatwg_name() -> Option<&'static str>44     fn whatwg_name() -> Option<&'static str> { Some("gb18030") }
initial_gbk_flag() -> bool45     fn initial_gbk_flag() -> bool { false }
46 }
47 
/**
 * GBK and GB 18030-2005.
 *
 * The original GBK 1.0 region spans `[81-FE] [40-7E 80-FE]`, and is derived from
 * several different revisions of a family of encodings named "GBK":
 *
 * - GBK as specified in the normative annex of GB 13000.1-93,
 *   the domestic standard equivalent to Unicode 1.1,
 *   consisted of characters included in Unicode 1.1 and not in GB 2312-80.
 * - Windows code page 936 is the widespread extension to GBK.
 * - Due to the popularity of Windows code page 936,
 *   a formal encoding based on Windows code page 936 (while adding new characters)
 *   was standardized into GBK 1.0.
 * - Finally, GB 18030 added four-byte sequences to GBK to become a pan-Unicode encoding,
 *   while adding new characters to the (former) GBK region again.
 *
 * GB 18030-2005 is a simplified Chinese encoding which extends GBK 1.0 to a pan-Unicode encoding.
 * It assigns four-byte sequences to every Unicode codepoint missing from the GBK area,
 * lexicographically ordered with occasional "gaps" for codepoints in the GBK area.
 * Due to this compatibility decision,
 * there is no simple relationship between these four-byte sequences and Unicode codepoints,
 * though there *exists* a relatively simple mapping algorithm with a small lookup table.
 *
 * ## Specialization
 *
 * This type is specialized with GBType `T`,
 * which should be either `GBK` or `GB18030`.
 * Both specializations hand out the same decoder (`GB18030Decoder`);
 * only the name and the encoder differ.
 */
#[derive(Clone, Copy)]
pub struct GBEncoding<T> {
    // Carries the flavor `T` at the type level only; no value of `T` is stored.
    _marker: PhantomData<T>
}
80 
/// A type for GBK, i.e. `GBEncoding` specialized with the `GBK` marker.
pub type GBKEncoding = GBEncoding<GBK>;
/// A type for GB18030, i.e. `GBEncoding` specialized with the `GB18030` marker.
pub type GB18030Encoding = GBEncoding<GB18030>;

/// An instance for GBK.
pub const GBK_ENCODING: GBKEncoding = GBEncoding { _marker: PhantomData };
/// An instance for GB18030.
pub const GB18030_ENCODING: GB18030Encoding = GBEncoding { _marker: PhantomData };
90 
91 impl<T: GBType> Encoding for GBEncoding<T> {
name(&self) -> &'static str92     fn name(&self) -> &'static str { <T as GBType>::name() }
whatwg_name(&self) -> Option<&'static str>93     fn whatwg_name(&self) -> Option<&'static str> { <T as GBType>::whatwg_name() }
raw_encoder(&self) -> Box<RawEncoder>94     fn raw_encoder(&self) -> Box<RawEncoder> { GBEncoder::<T>::new() }
raw_decoder(&self) -> Box<RawDecoder>95     fn raw_decoder(&self) -> Box<RawDecoder> { GB18030Decoder::new() }
96 }
97 
/**
 * An encoder for GBK and GB18030.
 *
 * The encoder keeps no state between calls; the only per-type information is
 * the flavor `T`, which is consulted through `GBType::initial_gbk_flag`.
 *
 * ## Specialization
 *
 * This type is specialized with GBType `T`,
 * which should be either `GBK` or `GB18030`.
 */
#[derive(Clone, Copy)]
pub struct GBEncoder<T> {
    // Carries the flavor `T` at the type level only; no value of `T` is stored.
    _marker: PhantomData<T>
}
110 
111 impl<T: GBType> GBEncoder<T> {
new() -> Box<RawEncoder>112     pub fn new() -> Box<RawEncoder> {
113         Box::new(GBEncoder::<T> { _marker: PhantomData })
114     }
115 }
116 
117 impl<T: GBType> RawEncoder for GBEncoder<T> {
from_self(&self) -> Box<RawEncoder>118     fn from_self(&self) -> Box<RawEncoder> { GBEncoder::<T>::new() }
is_ascii_compatible(&self) -> bool119     fn is_ascii_compatible(&self) -> bool { true }
120 
raw_feed(&mut self, input: &str, output: &mut ByteWriter) -> (usize, Option<CodecError>)121     fn raw_feed(&mut self, input: &str, output: &mut ByteWriter) -> (usize, Option<CodecError>) {
122         output.writer_hint(input.len());
123 
124         let gbk_flag = <T as GBType>::initial_gbk_flag();
125         for ((i, j), ch) in input.index_iter() {
126             if ch < '\u{80}' {
127                 output.write_byte(ch as u8);
128             } else if gbk_flag && ch == '\u{20AC}' {
129                 output.write_byte('\u{80}' as u8)
130             } else {
131                 let ptr = index::gb18030::backward(ch as u32);
132                 if ptr == 0xffff {
133                     if gbk_flag {
134                         return (i, Some(CodecError {
135                             upto: j as isize,
136                             cause: "gbk doesn't support gb18030 extensions".into()
137                         }));
138                     }
139                     let ptr = index::gb18030_ranges::backward(ch as u32);
140                     assert!(ptr != 0xffffffff);
141                     let (ptr, byte4) = (ptr / 10, ptr % 10);
142                     let (ptr, byte3) = (ptr / 126, ptr % 126);
143                     let (byte1, byte2) = (ptr / 10, ptr % 10);
144                     output.write_byte((byte1 + 0x81) as u8);
145                     output.write_byte((byte2 + 0x30) as u8);
146                     output.write_byte((byte3 + 0x81) as u8);
147                     output.write_byte((byte4 + 0x30) as u8);
148                 } else {
149                     let lead = ptr / 190 + 0x81;
150                     let trail = ptr % 190;
151                     let trailoffset = if trail < 0x3f {0x40} else {0x41};
152                     output.write_byte(lead as u8);
153                     output.write_byte((trail + trailoffset) as u8);
154                 }
155             }
156         }
157         (input.len(), None)
158     }
159 
raw_finish(&mut self, _output: &mut ByteWriter) -> Option<CodecError>160     fn raw_finish(&mut self, _output: &mut ByteWriter) -> Option<CodecError> {
161         None
162     }
163 }
164 
/// A decoder for GB 18030 (also used by GBK).
///
/// This is a thin wrapper around the state machine generated in the `gb18030` module.
#[derive(Clone, Copy)]
struct GB18030Decoder {
    // Current state of the `gb18030` state machine, carried over between calls.
    st: gb18030::State,
}
170 
171 impl GB18030Decoder {
new() -> Box<RawDecoder>172     pub fn new() -> Box<RawDecoder> {
173         Box::new(GB18030Decoder { st: Default::default() })
174     }
175 }
176 
177 impl RawDecoder for GB18030Decoder {
from_self(&self) -> Box<RawDecoder>178     fn from_self(&self) -> Box<RawDecoder> { GB18030Decoder::new() }
is_ascii_compatible(&self) -> bool179     fn is_ascii_compatible(&self) -> bool { true }
180 
raw_feed(&mut self, input: &[u8], output: &mut StringWriter) -> (usize, Option<CodecError>)181     fn raw_feed(&mut self, input: &[u8], output: &mut StringWriter) -> (usize, Option<CodecError>) {
182         let (st, processed, err) = gb18030::raw_feed(self.st, input, output, &());
183         self.st = st;
184         (processed, err)
185     }
186 
raw_finish(&mut self, output: &mut StringWriter) -> Option<CodecError>187     fn raw_finish(&mut self, output: &mut StringWriter) -> Option<CodecError> {
188         let (st, err) = gb18030::raw_finish(self.st, output, &());
189         self.st = st;
190         err
191     }
192 }
193 
// The GB 18030 decoder state machine (shared by GBK and GB 18030).
// A sequence is one byte (S0), two bytes (S0 -> S1) or four bytes (S0 -> S1 -> S2 -> S3).
stateful_decoder! {
    module gb18030;

    // Maps a two-byte sequence to a code point through the two-byte index.
    // Pairs outside `[81-FE] [40-7E 80-FE]` are looked up with the pointer 0xffff;
    // the caller in S1 treats a result of 0xffff as "no mapping".
    internal pub fn map_two_bytes(lead: u8, trail: u8) -> u32 {
        use index_simpchinese as index;

        let lead = lead as u16;
        let trail = trail as u16;
        let index = match (lead, trail) {
            (0x81...0xfe, 0x40...0x7e) | (0x81...0xfe, 0x80...0xfe) => {
                // the trail range skips 0x7f, so trails above it are shifted by one more
                let trailoffset = if trail < 0x7f {0x40} else {0x41};
                (lead - 0x81) * 190 + trail - trailoffset
            }
            _ => 0xffff,
        };
        index::gb18030::forward(index)
    }

    // Maps a four-byte sequence to a code point through the ranges index.
    // The bytes are combined as mixed-radix digits: 10 values for the 2nd and 4th bytes
    // and 126 values for the 3rd byte. The caller in S3 treats 0xffffffff as "no mapping".
    internal pub fn map_four_bytes(b1: u8, b2: u8, b3: u8, b4: u8) -> u32 {
        use index_simpchinese as index;

        // no range check here, caller should have done all checks
        let index = (b1 as u32 - 0x81) * 12600 + (b2 as u32 - 0x30) * 1260 +
                    (b3 as u32 - 0x81) * 10 + (b4 as u32 - 0x30);
        index::gb18030_ranges::forward(index)
    }

initial:
    // gb18030 first = 0x00, gb18030 second = 0x00, gb18030 third = 0x00
    state S0(ctx: Context) {
        case b @ 0x00...0x7f => ctx.emit(b as u32);
        // a lone 0x80 decodes to U+20AC
        case 0x80 => ctx.emit(0x20ac);
        case b @ 0x81...0xfe => S1(ctx, b);
        case _ => ctx.err("invalid sequence");
    }

transient:
    // gb18030 first != 0x00, gb18030 second = 0x00, gb18030 third = 0x00
    state S1(ctx: Context, first: u8) {
        // a digit in the second position starts a four-byte sequence
        case b @ 0x30...0x39 => S2(ctx, first, b);
        case b => match map_two_bytes(first, b) {
            0xffff => ctx.backup_and_err(1, "invalid sequence"), // unconditional
            ch => ctx.emit(ch)
        };
    }

    // gb18030 first != 0x00, gb18030 second != 0x00, gb18030 third = 0x00
    state S2(ctx: Context, first: u8, second: u8) {
        case b @ 0x81...0xfe => S3(ctx, first, second, b);
        case _ => ctx.backup_and_err(2, "invalid sequence");
    }

    // gb18030 first != 0x00, gb18030 second != 0x00, gb18030 third != 0x00
    state S3(ctx: Context, first: u8, second: u8, third: u8) {
        case b @ 0x30...0x39 => match map_four_bytes(first, second, third, b) {
            0xffffffff => ctx.backup_and_err(3, "invalid sequence"), // unconditional
            ch => ctx.emit(ch)
        };
        case _ => ctx.backup_and_err(3, "invalid sequence");
    }
}
255 
// Tests and benchmarks for the GB 18030 flavor (encoder and the shared decoder).
#[cfg(test)]
mod gb18030_tests {
    extern crate test;
    use super::GB18030_ENCODING;
    use testutils;
    use types::*;

    // ASCII, two-byte and four-byte outputs of the encoder.
    #[test]
    fn test_encoder() {
        let mut e = GB18030_ENCODING.raw_encoder();
        assert_feed_ok!(e, "A", "", [0x41]);
        assert_feed_ok!(e, "BC", "", [0x42, 0x43]);
        assert_feed_ok!(e, "", "", []);
        assert_feed_ok!(e, "\u{4e2d}\u{534e}\u{4eba}\u{6c11}\u{5171}\u{548c}\u{56fd}", "",
                        [0xd6, 0xd0, 0xbb, 0xaa, 0xc8, 0xcb, 0xc3, 0xf1,
                         0xb9, 0xb2, 0xba, 0xcd, 0xb9, 0xfa]);
        // unlike GBK, U+20AC is written as the two-byte A2 E3 and not as the single byte 0x80
        assert_feed_ok!(e, "1\u{20ac}/m", "", [0x31, 0xa2, 0xe3, 0x2f, 0x6d]);
        assert_feed_ok!(e, "\u{ff21}\u{ff22}\u{ff23}", "", [0xa3, 0xc1, 0xa3, 0xc2, 0xa3, 0xc3]);
        // characters without a two-byte mapping get four-byte sequences
        assert_feed_ok!(e, "\u{80}", "", [0x81, 0x30, 0x81, 0x30]);
        assert_feed_ok!(e, "\u{81}", "", [0x81, 0x30, 0x81, 0x31]);
        assert_feed_ok!(e, "\u{a3}", "", [0x81, 0x30, 0x84, 0x35]);
        assert_feed_ok!(e, "\u{a4}", "", [0xa1, 0xe8]);
        assert_feed_ok!(e, "\u{a5}", "", [0x81, 0x30, 0x84, 0x36]);
        assert_feed_ok!(e, "\u{10ffff}", "", [0xe3, 0x32, 0x9a, 0x35]);
        assert_feed_ok!(e, "\u{2a6a5}\u{3007}", "", [0x98, 0x35, 0xee, 0x37, 0xa9, 0x96]);
        assert_finish_ok!(e, []);
    }

    // The decoder counterpart of `test_encoder`, with whole sequences per feed.
    #[test]
    fn test_decoder_valid() {
        let mut d = GB18030_ENCODING.raw_decoder();
        assert_feed_ok!(d, [0x41], [], "A");
        assert_feed_ok!(d, [0x42, 0x43], [], "BC");
        assert_feed_ok!(d, [], [], "");
        assert_feed_ok!(d, [0xd6, 0xd0, 0xbb, 0xaa, 0xc8, 0xcb, 0xc3, 0xf1,
                            0xb9, 0xb2, 0xba, 0xcd, 0xb9, 0xfa], [],
                        "\u{4e2d}\u{534e}\u{4eba}\u{6c11}\u{5171}\u{548c}\u{56fd}");
        // the decoder accepts the single byte 0x80 as U+20AC
        assert_feed_ok!(d, [0x31, 0x80, 0x2f, 0x6d], [], "1\u{20ac}/m");
        assert_feed_ok!(d, [0xa3, 0xc1, 0xa3, 0xc2, 0xa3, 0xc3], [], "\u{ff21}\u{ff22}\u{ff23}");
        assert_feed_ok!(d, [0x81, 0x30, 0x81, 0x30], [], "\u{80}");
        assert_feed_ok!(d, [0x81, 0x30, 0x81, 0x31], [], "\u{81}");
        assert_feed_ok!(d, [0x81, 0x30, 0x84, 0x35], [], "\u{a3}");
        assert_feed_ok!(d, [0xa1, 0xe8], [], "\u{a4}" );
        assert_feed_ok!(d, [0x81, 0x30, 0x84, 0x36], [], "\u{a5}");
        assert_feed_ok!(d, [0xe3, 0x32, 0x9a, 0x35], [], "\u{10ffff}");
        assert_feed_ok!(d, [0x98, 0x35, 0xee, 0x37, 0xa9, 0x96], [], "\u{2a6a5}\u{3007}");
        assert_finish_ok!(d, "");
    }

    // Sequences split across feeds at every possible position.
    #[test]
    fn test_decoder_valid_partial() {
        let mut d = GB18030_ENCODING.raw_decoder();
        assert_feed_ok!(d, [], [0xa1], "");
        assert_feed_ok!(d, [0xa1], [], "\u{3000}");
        assert_feed_ok!(d, [], [0x81], "");
        assert_feed_ok!(d, [], [0x30], "");
        assert_feed_ok!(d, [], [0x81], "");
        assert_feed_ok!(d, [0x30], [], "\u{80}");
        assert_feed_ok!(d, [], [0x81], "");
        assert_feed_ok!(d, [], [0x30], "");
        assert_feed_ok!(d, [0x81, 0x31], [], "\u{81}");
        assert_feed_ok!(d, [], [0x81], "");
        assert_feed_ok!(d, [0x30, 0x81, 0x32], [], "\u{82}");
        assert_feed_ok!(d, [], [0x81], "");
        assert_feed_ok!(d, [], [0x30, 0x81], "");
        assert_feed_ok!(d, [0x33], [], "\u{83}");
        assert_feed_ok!(d, [], [0x81, 0x30], "");
        assert_feed_ok!(d, [], [0x81], "");
        assert_feed_ok!(d, [0x34], [], "\u{84}");
        assert_feed_ok!(d, [], [0x81, 0x30], "");
        assert_feed_ok!(d, [0x81, 0x35], [], "\u{85}");
        assert_feed_ok!(d, [], [0x81, 0x30, 0x81], "");
        assert_feed_ok!(d, [0x36], [], "\u{86}");
        assert_finish_ok!(d, "");
    }

    // Finishing in the middle of a two-byte or four-byte sequence is an error.
    #[test]
    fn test_decoder_invalid_partial() {
        let mut d = GB18030_ENCODING.raw_decoder();
        assert_feed_ok!(d, [], [0xa1], "");
        assert_finish_err!(d, "");

        let mut d = GB18030_ENCODING.raw_decoder();
        assert_feed_ok!(d, [], [0x81], "");
        assert_finish_err!(d, "");

        let mut d = GB18030_ENCODING.raw_decoder();
        assert_feed_ok!(d, [], [0x81, 0x30], "");
        assert_finish_err!(d, "");

        let mut d = GB18030_ENCODING.raw_decoder();
        assert_feed_ok!(d, [], [0x81, 0x30, 0x81], "");
        assert_finish_err!(d, "");
    }

    // Bytes outside the permitted range at each position of a sequence.
    #[test]
    fn test_decoder_invalid_out_of_range() {
        let mut d = GB18030_ENCODING.raw_decoder();
        assert_feed_err!(d, [], [0xff], [], "");
        assert_feed_err!(d, [], [0x81], [0x00], "");
        assert_feed_err!(d, [], [0x81], [0x7f], "");
        assert_feed_err!(d, [], [0x81], [0xff], "");
        assert_feed_err!(d, [], [0x81], [0x31, 0x00], "");
        assert_feed_err!(d, [], [0x81], [0x31, 0x80], "");
        assert_feed_err!(d, [], [0x81], [0x31, 0xff], "");
        assert_feed_err!(d, [], [0x81], [0x31, 0x81, 0x00], "");
        assert_feed_err!(d, [], [0x81], [0x31, 0x81, 0x2f], "");
        assert_feed_err!(d, [], [0x81], [0x31, 0x81, 0x3a], "");
        assert_feed_err!(d, [], [0x81], [0x31, 0x81, 0xff], "");
        assert_finish_ok!(d, "");
    }

    #[test]
    fn test_decoder_invalid_boundary() {
        // U+10FFFF (E3 32 9A 35) is the last Unicode codepoint, E3 32 9A 36 is invalid.
        // Note that since the 2nd to 4th bytes may coincide with ASCII, the bytes 32 9A 36 are
        // not considered part of the problem. This is compatible with the WHATWG Encoding standard.
        let mut d = GB18030_ENCODING.raw_decoder();
        assert_feed_ok!(d, [], [0xe3], "");
        assert_feed_err!(d, [], [], [0x32, 0x9a, 0x36], "");
        assert_finish_ok!(d, "");

        // same sequence, but the 2nd and 3rd bytes were consumed by an earlier feed
        let mut d = GB18030_ENCODING.raw_decoder();
        assert_feed_ok!(d, [], [0xe3], "");
        assert_feed_ok!(d, [], [0x32, 0x9a], "");
        assert_feed_err!(d, -2, [], [], [0x32, 0x9a, 0x36], "");
        assert_finish_ok!(d, "");
    }

    // A failed finish leaves the decoder usable from a clean state.
    #[test]
    fn test_decoder_feed_after_finish() {
        let mut d = GB18030_ENCODING.raw_decoder();
        assert_feed_ok!(d, [0xd2, 0xbb], [0xd2], "\u{4e00}");
        assert_finish_err!(d, "");
        assert_feed_ok!(d, [0xd2, 0xbb], [], "\u{4e00}");
        assert_finish_ok!(d, "");

        let mut d = GB18030_ENCODING.raw_decoder();
        assert_feed_ok!(d, [0x98, 0x35, 0xee, 0x37], [0x98, 0x35, 0xee], "\u{2a6a5}");
        assert_finish_err!(d, "");
        assert_feed_ok!(d, [0x98, 0x35, 0xee, 0x37], [0x98, 0x35], "\u{2a6a5}");
        assert_finish_err!(d, "");
        assert_feed_ok!(d, [0x98, 0x35, 0xee, 0x37], [0x98], "\u{2a6a5}");
        assert_finish_err!(d, "");
        assert_feed_ok!(d, [0x98, 0x35, 0xee, 0x37], [], "\u{2a6a5}");
        assert_finish_ok!(d, "");
    }

    // Throughput of encoding the shared sample text.
    #[bench]
    fn bench_encode_short_text(bencher: &mut test::Bencher) {
        let s = testutils::SIMPLIFIED_CHINESE_TEXT;
        bencher.bytes = s.len() as u64;
        bencher.iter(|| test::black_box({
            GB18030_ENCODING.encode(&s, EncoderTrap::Strict)
        }))
    }

    // Throughput of decoding the encoded form of the shared sample text.
    #[bench]
    fn bench_decode_short_text(bencher: &mut test::Bencher) {
        let s = GB18030_ENCODING.encode(testutils::SIMPLIFIED_CHINESE_TEXT,
                                       EncoderTrap::Strict).ok().unwrap();
        bencher.bytes = s.len() as u64;
        bencher.iter(|| test::black_box({
            GB18030_ENCODING.decode(&s, DecoderTrap::Strict)
        }))
    }
}
423 
// Tests and benchmarks for the GBK flavor (encoder only).
#[cfg(test)]
mod gbk_tests {
    extern crate test;
    use super::GBK_ENCODING;
    use testutils;
    use types::*;

    // GBK and GB 18030 share the same decoder logic.

    // Same inputs as the GB 18030 encoder test; the differences are the euro sign and
    // the characters that GB 18030 would write as four-byte sequences.
    #[test]
    fn test_encoder() {
        let mut e = GBK_ENCODING.raw_encoder();
        assert_feed_ok!(e, "A", "", [0x41]);
        assert_feed_ok!(e, "BC", "", [0x42, 0x43]);
        assert_feed_ok!(e, "", "", []);
        assert_feed_ok!(e, "\u{4e2d}\u{534e}\u{4eba}\u{6c11}\u{5171}\u{548c}\u{56fd}", "",
                        [0xd6, 0xd0, 0xbb, 0xaa, 0xc8, 0xcb, 0xc3, 0xf1,
                         0xb9, 0xb2, 0xba, 0xcd, 0xb9, 0xfa]);
        // GBK writes U+20AC as the single byte 0x80
        assert_feed_ok!(e, "1\u{20ac}/m", "", [0x31, 0x80, 0x2f, 0x6d]);
        assert_feed_ok!(e, "\u{ff21}\u{ff22}\u{ff23}", "", [0xa3, 0xc1, 0xa3, 0xc2, 0xa3, 0xc3]);
        // characters without a two-byte mapping are errors in GBK
        assert_feed_err!(e, "", "\u{80}", "", []);
        assert_feed_err!(e, "", "\u{81}", "", []);
        assert_feed_err!(e, "", "\u{a3}", "", []);
        assert_feed_ok!(e, "\u{a4}", "", [0xa1, 0xe8]);
        assert_feed_err!(e, "", "\u{a5}", "", []);
        assert_feed_err!(e, "", "\u{10ffff}", "", []);
        assert_feed_err!(e, "", "\u{2a6a5}", "\u{3007}", []);
        assert_feed_err!(e, "\u{3007}", "\u{2a6a5}", "", [0xa9, 0x96]);
        assert_finish_ok!(e, []);
    }

    // Throughput of encoding the shared sample text.
    #[bench]
    fn bench_encode_short_text(bencher: &mut test::Bencher) {
        let s = testutils::SIMPLIFIED_CHINESE_TEXT;
        bencher.bytes = s.len() as u64;
        bencher.iter(|| test::black_box({
            GBK_ENCODING.encode(&s, EncoderTrap::Strict)
        }))
    }
}
464 
/**
 * HZ. (RFC 1843)
 *
 * This is a simplified Chinese encoding based on GB 2312.
 * It resembles ISO 2022 encodings in that the printable escape sequences `~{`
 * and `~}` are used to delimit a sequence of 7-bit-safe GB 2312 sequences. For comparison,
 * they are equivalent to the ISO-2022-CN escape sequences `ESC $ ) A` and `ESC ( B`.
 * Additional escape sequences `~~` (for a literal `~`) and `~\n` (ignored) are also supported.
 */
#[derive(Clone, Copy)]
pub struct HZEncoding;
476 
477 impl Encoding for HZEncoding {
name(&self) -> &'static str478     fn name(&self) -> &'static str { "hz" }
whatwg_name(&self) -> Option<&'static str>479     fn whatwg_name(&self) -> Option<&'static str> { None }
raw_encoder(&self) -> Box<RawEncoder>480     fn raw_encoder(&self) -> Box<RawEncoder> { HZEncoder::new() }
raw_decoder(&self) -> Box<RawDecoder>481     fn raw_decoder(&self) -> Box<RawDecoder> { HZDecoder::new() }
482 }
483 
/// An encoder for HZ.
#[derive(Clone, Copy)]
pub struct HZEncoder {
    // `true` while the output is inside a `~{` ... `~}` section,
    // i.e. `~{` has been written and the matching `~}` has not.
    escaped: bool,
}
489 
490 impl HZEncoder {
new() -> Box<RawEncoder>491     pub fn new() -> Box<RawEncoder> { Box::new(HZEncoder { escaped: false }) }
492 }
493 
494 impl RawEncoder for HZEncoder {
from_self(&self) -> Box<RawEncoder>495     fn from_self(&self) -> Box<RawEncoder> { HZEncoder::new() }
is_ascii_compatible(&self) -> bool496     fn is_ascii_compatible(&self) -> bool { false }
497 
raw_feed(&mut self, input: &str, output: &mut ByteWriter) -> (usize, Option<CodecError>)498     fn raw_feed(&mut self, input: &str, output: &mut ByteWriter) -> (usize, Option<CodecError>) {
499         output.writer_hint(input.len());
500 
501         let mut escaped = self.escaped;
502         macro_rules! ensure_escaped(
503             () => (if !escaped { output.write_bytes(b"~{"); escaped = true; })
504         );
505         macro_rules! ensure_unescaped(
506             () => (if escaped { output.write_bytes(b"~}"); escaped = false; })
507         );
508 
509         for ((i,j), ch) in input.index_iter() {
510             if ch < '\u{80}' {
511                 ensure_unescaped!();
512                 output.write_byte(ch as u8);
513                 if ch == '~' { output.write_byte('~' as u8); }
514             } else {
515                 let ptr = index::gb18030::backward(ch as u32);
516                 if ptr == 0xffff {
517                     self.escaped = escaped; // do NOT reset the state!
518                     return (i, Some(CodecError {
519                         upto: j as isize, cause: "unrepresentable character".into()
520                     }));
521                 } else {
522                     let lead = ptr / 190;
523                     let trail = ptr % 190;
524                     if lead < 0x21 - 1 || trail < 0x21 + 0x3f { // GBK extension, ignored
525                         self.escaped = escaped; // do NOT reset the state!
526                         return (i, Some(CodecError {
527                             upto: j as isize, cause: "unrepresentable character".into()
528                         }));
529                     } else {
530                         ensure_escaped!();
531                         output.write_byte((lead + 1) as u8);
532                         output.write_byte((trail - 0x3f) as u8);
533                     }
534                 }
535             }
536         }
537 
538         self.escaped = escaped;
539         (input.len(), None)
540     }
541 
raw_finish(&mut self, _output: &mut ByteWriter) -> Option<CodecError>542     fn raw_finish(&mut self, _output: &mut ByteWriter) -> Option<CodecError> {
543         None
544     }
545 }
546 
/// A decoder for HZ.
///
/// This is a thin wrapper around the state machine generated in the `hz` module.
#[derive(Clone, Copy)]
struct HZDecoder {
    // Current state of the `hz` state machine, carried over between calls.
    st: hz::State,
}
552 
553 impl HZDecoder {
new() -> Box<RawDecoder>554     pub fn new() -> Box<RawDecoder> {
555         Box::new(HZDecoder { st: Default::default() })
556     }
557 }
558 
559 impl RawDecoder for HZDecoder {
from_self(&self) -> Box<RawDecoder>560     fn from_self(&self) -> Box<RawDecoder> { HZDecoder::new() }
is_ascii_compatible(&self) -> bool561     fn is_ascii_compatible(&self) -> bool { true }
562 
raw_feed(&mut self, input: &[u8], output: &mut StringWriter) -> (usize, Option<CodecError>)563     fn raw_feed(&mut self, input: &[u8], output: &mut StringWriter) -> (usize, Option<CodecError>) {
564         let (st, processed, err) = hz::raw_feed(self.st, input, output, &());
565         self.st = st;
566         (processed, err)
567     }
568 
raw_finish(&mut self, output: &mut StringWriter) -> Option<CodecError>569     fn raw_finish(&mut self, output: &mut StringWriter) -> Option<CodecError> {
570         let (st, err) = hz::raw_finish(self.st, output, &());
571         self.st = st;
572         err
573     }
574 }
575 
// The HZ decoder state machine.
// States named A* are outside of a `~{` section (flag unset) and states named B* are inside.
stateful_decoder! {
    module hz;

    // Maps a double-byte sequence from a `~{` section to a code point through the two-byte index.
    // Pairs outside of the accepted ranges are looked up with the pointer 0xffff;
    // the caller in B2 treats a result of 0xffff as "no mapping".
    // NOTE(review): the accepted lead range starts at 0x20, while the HZ encoder in this file
    // never writes a lead byte below 0x21 -- confirm that the wider range is intended.
    internal pub fn map_two_bytes(lead: u8, trail: u8) -> u32 {
        use index_simpchinese as index;

        let lead = lead as u16;
        let trail = trail as u16;
        let index = match (lead, trail) {
            (0x20...0x7f, 0x21...0x7e) => (lead - 1) * 190 + (trail + 0x3f),
            _ => 0xffff,
        };
        index::gb18030::forward(index)
    }

initial:
    // hz-gb-2312 flag = unset, hz-gb-2312 lead = 0x00
    state A0(ctx: Context) {
        case 0x7e => A1(ctx);
        case b @ 0x00...0x7f => ctx.emit(b as u32);
        case _ => ctx.err("invalid sequence");
        final => ctx.reset();
    }

checkpoint:
    // hz-gb-2312 flag = set, hz-gb-2312 lead = 0x00
    state B0(ctx: Context) {
        case 0x7e => B1(ctx);
        case b @ 0x20...0x7f => B2(ctx, b);
        case 0x0a => ctx.err("invalid sequence"); // error *and* reset
        case _ => ctx.err("invalid sequence"), B0(ctx);
        final => ctx.reset();
    }

transient:
    // hz-gb-2312 flag = unset, hz-gb-2312 lead = 0x7e
    state A1(ctx: Context) {
        case 0x7b => B0(ctx);
        case 0x7d => A0(ctx);
        case 0x7e => ctx.emit(0x7e), A0(ctx);
        // `~` followed by a line feed produces no output
        case 0x0a => A0(ctx);
        case _ => ctx.backup_and_err(1, "invalid sequence");
        final => ctx.err("incomplete sequence");
    }

    // hz-gb-2312 flag = set, hz-gb-2312 lead = 0x7e
    state B1(ctx: Context) {
        case 0x7b => B0(ctx);
        case 0x7d => A0(ctx);
        case 0x7e => ctx.emit(0x7e), B0(ctx);
        // `~` followed by a line feed produces no output and also leaves the `~{` section
        case 0x0a => A0(ctx);
        case _ => ctx.backup_and_err(1, "invalid sequence"), B0(ctx);
        final => ctx.err("incomplete sequence");
    }

    // hz-gb-2312 flag = set, hz-gb-2312 lead != 0 & != 0x7e
    state B2(ctx: Context, lead: u8) {
        case 0x0a => ctx.err("invalid sequence"); // should reset the state!
        case b =>
            match map_two_bytes(lead, b) {
                0xffff => ctx.err("invalid sequence"),
                ch => ctx.emit(ch)
            },
            B0(ctx);
        final => ctx.err("incomplete sequence");
    }
}
643 
// Tests and benchmarks for HZ.
#[cfg(test)]
mod hz_tests {
    extern crate test;
    use super::HZEncoding;
    use testutils;
    use types::*;

    // The shift state persists between feeds: `~{` is written once and
    // `~}` only when an ASCII character follows.
    #[test]
    fn test_encoder_valid() {
        let mut e = HZEncoding.raw_encoder();
        assert_feed_ok!(e, "A", "", *b"A");
        assert_feed_ok!(e, "BC", "", *b"BC");
        assert_feed_ok!(e, "", "", *b"");
        assert_feed_ok!(e, "\u{4e2d}\u{534e}\u{4eba}\u{6c11}\u{5171}\u{548c}\u{56fd}", "",
                        *b"~{VP;*HKCq92:M9z");
        assert_feed_ok!(e, "\u{ff21}\u{ff22}\u{ff23}", "", *b"#A#B#C");
        assert_feed_ok!(e, "1\u{20ac}/m", "", *b"~}1~{\"c~}/m");
        // a literal `~` is doubled
        assert_feed_ok!(e, "~<\u{a4}~\u{0a4}>~", "", *b"~~<~{!h~}~~~{!h~}>~~");
        assert_finish_ok!(e, []);
    }

    #[test]
    fn test_encoder_invalid() {
        let mut e = HZEncoding.raw_encoder();
        assert_feed_err!(e, "", "\u{ffff}", "", []);
        assert_feed_err!(e, "?", "\u{ffff}", "!", [0x3f]);
        // no support for GBK extension
        assert_feed_err!(e, "", "\u{3007}", "", []);
        assert_finish_ok!(e, []);
    }

    // Escape sequences split across feeds, in both the ASCII and the `~{` sections.
    #[test]
    fn test_decoder_valid() {
        let mut d = HZEncoding.raw_decoder();
        assert_feed_ok!(d, *b"A", *b"", "A");
        assert_feed_ok!(d, *b"BC", *b"", "BC");
        assert_feed_ok!(d, *b"D~~E", *b"~", "D~E");
        assert_feed_ok!(d, *b"~F~\nG", *b"~", "~FG");
        assert_feed_ok!(d, *b"", *b"", "");
        assert_feed_ok!(d, *b"\nH", *b"~", "H");
        assert_feed_ok!(d, *b"{VP~}~{;*~{HKCq92:M9z", *b"",
                        "\u{4e2d}\u{534e}\u{4eba}\u{6c11}\u{5171}\u{548c}\u{56fd}");
        assert_feed_ok!(d, *b"", *b"#", "");
        assert_feed_ok!(d, *b"A", *b"~", "\u{ff21}");
        assert_feed_ok!(d, *b"~#B~~#C", *b"~", "~\u{ff22}~\u{ff23}");
        assert_feed_ok!(d, *b"", *b"", "");
        assert_feed_ok!(d, *b"\n#D~{#E~\n#F~{#G", *b"~", "#D\u{ff25}#F\u{ff27}");
        assert_feed_ok!(d, *b"}X~}YZ", *b"", "XYZ");
        assert_finish_ok!(d, "");
    }

    // Double-byte sequences whose lead or trail byte is outside of the mapped range.
    #[test]
    fn test_decoder_invalid_out_or_range() {
        let mut d = HZEncoding.raw_decoder();
        assert_feed_ok!(d, *b"~{", *b"", "");
        assert_feed_err!(d, *b"", *b"\x20\x20", *b"", "");
        assert_feed_err!(d, *b"", *b"\x20\x7f", *b"", ""); // do not reset the state (except for LF)
        assert_feed_err!(d, *b"", *b"\x21\x7f", *b"", "");
        assert_feed_err!(d, *b"", *b"\x7f\x20", *b"", "");
        assert_feed_err!(d, *b"", *b"\x7f\x21", *b"", "");
        assert_feed_err!(d, *b"", *b"\x7f\x7f", *b"", "");
        assert_finish_ok!(d, "");
    }

    #[test]
    fn test_decoder_invalid_carriage_return() {
        // LF (`\n`) in the multibyte mode is invalid but *also* resets the state
        // (the test name says "carriage return", but the byte under test is a line feed)
        let mut d = HZEncoding.raw_decoder();
        assert_feed_ok!(d, *b"~{#A", *b"", "\u{ff21}");
        assert_feed_err!(d, *b"", *b"\n", *b"", "");
        assert_feed_ok!(d, *b"#B~{#C", *b"", "#B\u{ff23}");
        assert_feed_err!(d, *b"", *b"#\n", *b"", "");
        assert_feed_ok!(d, *b"#D", *b"", "#D");
        assert_finish_ok!(d, "");
    }

    // Finishing after a dangling `~` or a dangling lead byte is an error.
    #[test]
    fn test_decoder_invalid_partial() {
        let mut d = HZEncoding.raw_decoder();
        assert_feed_ok!(d, *b"", *b"~", "");
        assert_finish_err!(d, "");

        let mut d = HZEncoding.raw_decoder();
        assert_feed_ok!(d, *b"~{", *b"#", "");
        assert_finish_err!(d, "");

        let mut d = HZEncoding.raw_decoder();
        assert_feed_ok!(d, *b"~{#A", *b"~", "\u{ff21}");
        assert_finish_err!(d, "");
    }

    // `~` followed by a byte that does not form an escape sequence.
    #[test]
    fn test_decoder_invalid_escape() {
        let mut d = HZEncoding.raw_decoder();
        assert_feed_ok!(d, *b"#A", *b"", "#A");
        assert_feed_err!(d, *b"", *b"~", *b"xy", "");
        assert_feed_ok!(d, *b"#B", *b"", "#B");
        assert_feed_ok!(d, *b"", *b"~", "");
        assert_feed_err!(d, *b"", *b"", *b"xy", "");
        assert_feed_ok!(d, *b"#C~{#D", *b"", "#C\u{ff24}");
        assert_feed_err!(d, *b"", *b"~", *b"xy", "");
        assert_feed_ok!(d, *b"#E", *b"", "\u{ff25}"); // does not reset to ASCII
        assert_feed_ok!(d, *b"", *b"~", "");
        assert_feed_err!(d, *b"", *b"", *b"xy", "");
        assert_feed_ok!(d, *b"#F~}#G", *b"", "\u{ff26}#G");
        assert_finish_ok!(d, "");
    }

    // A failed finish leaves the decoder usable from the initial (ASCII) state.
    #[test]
    fn test_decoder_feed_after_finish() {
        let mut d = HZEncoding.raw_decoder();
        assert_feed_ok!(d, *b"R;~{R;", *b"R", "R;\u{4e00}");
        assert_finish_err!(d, "");
        assert_feed_ok!(d, *b"R;~{R;", *b"", "R;\u{4e00}");
        assert_finish_ok!(d, "");
    }

    // Throughput of encoding the shared sample text.
    #[bench]
    fn bench_encode_short_text(bencher: &mut test::Bencher) {
        let s = testutils::SIMPLIFIED_CHINESE_TEXT;
        bencher.bytes = s.len() as u64;
        bencher.iter(|| test::black_box({
            HZEncoding.encode(&s, EncoderTrap::Strict)
        }))
    }

    // Throughput of decoding the encoded form of the shared sample text.
    #[bench]
    fn bench_decode_short_text(bencher: &mut test::Bencher) {
        let s = HZEncoding.encode(testutils::SIMPLIFIED_CHINESE_TEXT,
                                  EncoderTrap::Strict).ok().unwrap();
        bencher.bytes = s.len() as u64;
        bencher.iter(|| test::black_box({
            HZEncoding.decode(&s, DecoderTrap::Strict)
        }))
    }
}
780 
781