1 // This is a part of rust-encoding.
2 // Copyright (c) 2013-2015, Kang Seonghoon.
3 // See README.md and LICENSE.txt for details.
4 
5 //! Legacy traditional Chinese encodings.
6 
7 use std::convert::Into;
8 use std::default::Default;
9 use util::StrCharIndex;
10 use index_tradchinese as index;
11 use types::*;
12 
13 /**
14  * Big5-2003 with common extensions. (XXX with asymmetric HKSCS-2008 support)
15  *
16  * This is a traditional Chinese encoding spanning the region `[81-FE] [40-7E A1-FE]`.
17  * Originally a proprietary encoding by the consortium of five companies (hence the name),
18  * the Republic of China government standardized Big5-2003 in an appendix of CNS 11643
19  * so that CNS 11643 plane 1 and plane 2 have
20  * an almost identical set of characters as Big5 (but with a different mapping).
21  * The Hong Kong government has an official extension to Big5
22  * named Hong Kong Supplementary Character Set (HKSCS).
23  *
24  * This particular implementation of Big5 includes the widespread ETEN and HKSCS extensions,
25  * but excludes less common extensions such as Big5+, Big-5E and Unicode-at-on.
26  */
27 #[derive(Clone, Copy)]
28 pub struct BigFive2003Encoding;
29 
30 impl Encoding for BigFive2003Encoding {
name(&self) -> &'static str31     fn name(&self) -> &'static str { "big5-2003" }
whatwg_name(&self) -> Option<&'static str>32     fn whatwg_name(&self) -> Option<&'static str> { Some("big5") } // WHATWG compatibility
raw_encoder(&self) -> Box<RawEncoder>33     fn raw_encoder(&self) -> Box<RawEncoder> { BigFive2003Encoder::new() }
raw_decoder(&self) -> Box<RawDecoder>34     fn raw_decoder(&self) -> Box<RawDecoder> { BigFive2003HKSCS2008Decoder::new() }
35 }
36 
37 /// An encoder for Big5-2003.
38 #[derive(Clone, Copy)]
39 pub struct BigFive2003Encoder;
40 
41 impl BigFive2003Encoder {
new() -> Box<RawEncoder>42     pub fn new() -> Box<RawEncoder> { Box::new(BigFive2003Encoder) }
43 }
44 
45 impl RawEncoder for BigFive2003Encoder {
from_self(&self) -> Box<RawEncoder>46     fn from_self(&self) -> Box<RawEncoder> { BigFive2003Encoder::new() }
is_ascii_compatible(&self) -> bool47     fn is_ascii_compatible(&self) -> bool { true }
48 
raw_feed(&mut self, input: &str, output: &mut ByteWriter) -> (usize, Option<CodecError>)49     fn raw_feed(&mut self, input: &str, output: &mut ByteWriter) -> (usize, Option<CodecError>) {
50         output.writer_hint(input.len());
51 
52         for ((i,j), ch) in input.index_iter() {
53             if ch < '\u{80}' {
54                 output.write_byte(ch as u8);
55             } else {
56                 let ptr = index::big5::backward(ch as u32);
57                 if ptr == 0xffff || ptr < (0xa1 - 0x81) * 157 {
58                     // no HKSCS extension (XXX doesn't HKSCS include 0xFA40..0xFEFE?)
59                     return (i, Some(CodecError {
60                         upto: j as isize, cause: "unrepresentable character".into()
61                     }));
62                 }
63                 let lead = ptr / 157 + 0x81;
64                 let trail = ptr % 157;
65                 let trailoffset = if trail < 0x3f {0x40} else {0x62};
66                 output.write_byte(lead as u8);
67                 output.write_byte((trail + trailoffset) as u8);
68             }
69         }
70         (input.len(), None)
71     }
72 
raw_finish(&mut self, _output: &mut ByteWriter) -> Option<CodecError>73     fn raw_finish(&mut self, _output: &mut ByteWriter) -> Option<CodecError> {
74         None
75     }
76 }
77 
78 /// A decoder for Big5-2003 with HKSCS-2008 extension.
79 #[derive(Clone, Copy)]
80 struct BigFive2003HKSCS2008Decoder {
81     st: bigfive2003::State,
82 }
83 
84 impl BigFive2003HKSCS2008Decoder {
new() -> Box<RawDecoder>85     pub fn new() -> Box<RawDecoder> {
86         Box::new(BigFive2003HKSCS2008Decoder { st: Default::default() })
87     }
88 }
89 
90 impl RawDecoder for BigFive2003HKSCS2008Decoder {
from_self(&self) -> Box<RawDecoder>91     fn from_self(&self) -> Box<RawDecoder> { BigFive2003HKSCS2008Decoder::new() }
is_ascii_compatible(&self) -> bool92     fn is_ascii_compatible(&self) -> bool { true }
93 
raw_feed(&mut self, input: &[u8], output: &mut StringWriter) -> (usize, Option<CodecError>)94     fn raw_feed(&mut self, input: &[u8], output: &mut StringWriter) -> (usize, Option<CodecError>) {
95         let (st, processed, err) = bigfive2003::raw_feed(self.st, input, output, &());
96         self.st = st;
97         (processed, err)
98     }
99 
raw_finish(&mut self, output: &mut StringWriter) -> Option<CodecError>100     fn raw_finish(&mut self, output: &mut StringWriter) -> Option<CodecError> {
101         let (st, err) = bigfive2003::raw_finish(self.st, output, &());
102         self.st = st;
103         err
104     }
105 }
106 
107 stateful_decoder! {
108     module bigfive2003;
109 
110     internal pub fn map_two_bytes(lead: u8, trail: u8) -> u32 {
111         use index_tradchinese as index;
112 
113         let lead = lead as u16;
114         let trail = trail as u16;
115         let index = match (lead, trail) {
116             (0x81...0xfe, 0x40...0x7e) | (0x81...0xfe, 0xa1...0xfe) => {
117                 let trailoffset = if trail < 0x7f {0x40} else {0x62};
118                 (lead - 0x81) * 157 + trail - trailoffset
119             }
120             _ => 0xffff,
121         };
122         index::big5::forward(index) // may return two-letter replacements 0..3
123     }
124 
125 initial:
126     // big5 lead = 0x00
127     state S0(ctx: Context) {
128         case b @ 0x00...0x7f => ctx.emit(b as u32);
129         case b @ 0x81...0xfe => S1(ctx, b);
130         case _ => ctx.err("invalid sequence");
131     }
132 
133 transient:
134     // big5 lead != 0x00
135     state S1(ctx: Context, lead: u8) {
136         case b => match map_two_bytes(lead, b) {
137             0xffff => {
138                 let backup = if b < 0x80 {1} else {0};
139                 ctx.backup_and_err(backup, "invalid sequence")
140             },
141             0 /*index=1133*/ => ctx.emit_str("\u{ca}\u{304}"),
142             1 /*index=1135*/ => ctx.emit_str("\u{ca}\u{30c}"),
143             2 /*index=1164*/ => ctx.emit_str("\u{ea}\u{304}"),
144             3 /*index=1166*/ => ctx.emit_str("\u{ea}\u{30c}"),
145             ch => ctx.emit(ch),
146         };
147     }
148 }
149 
#[cfg(test)]
mod bigfive2003_tests {
    extern crate test;
    use super::BigFive2003Encoding;
    use testutils;
    use types::*;

    // ASCII goes through unchanged, Big5-2003 characters become two bytes each.
    #[test]
    fn test_encoder_valid() {
        let mut encoder = BigFive2003Encoding.raw_encoder();
        assert_feed_ok!(encoder, "A", "", [0x41]);
        assert_feed_ok!(encoder, "BC", "", [0x42, 0x43]);
        assert_feed_ok!(encoder, "", "", []);
        assert_feed_ok!(encoder, "\u{4e2d}\u{83ef}\u{6c11}\u{570b}", "",
                        [0xa4, 0xa4, 0xb5, 0xd8, 0xa5, 0xc1, 0xb0, 0xea]);
        assert_feed_ok!(encoder, "1\u{20ac}/m", "", [0x31, 0xa3, 0xe1, 0x2f, 0x6d]);
        assert_feed_ok!(encoder, "\u{ffed}", "", [0xf9, 0xfe]);
        assert_finish_ok!(encoder, []);
    }

    // characters without a mapping, and HKSCS-only characters, are rejected.
    #[test]
    fn test_encoder_invalid() {
        let mut encoder = BigFive2003Encoding.raw_encoder();
        assert_feed_err!(encoder, "", "\u{ffff}", "", []);
        assert_feed_err!(encoder, "?", "\u{ffff}", "!", [0x3f]);
        assert_feed_err!(encoder, "", "\u{3eec}", "\u{4e00}", []); // HKSCS-2008 addition
        assert_finish_ok!(encoder, []);
    }

    // includes sequences split across feeds and the HKSCS-2008 region.
    #[test]
    fn test_decoder_valid() {
        let mut decoder = BigFive2003Encoding.raw_decoder();
        assert_feed_ok!(decoder, [0x41], [], "A");
        assert_feed_ok!(decoder, [0x42, 0x43], [], "BC");
        assert_feed_ok!(decoder, [], [], "");
        assert_feed_ok!(decoder, [0xa4, 0xa4, 0xb5, 0xd8, 0xa5, 0xc1, 0xb0, 0xea], [],
                        "\u{4e2d}\u{83ef}\u{6c11}\u{570b}");
        assert_feed_ok!(decoder, [], [0xa4], "");
        assert_feed_ok!(decoder, [0xa4, 0xb5, 0xd8], [0xa5], "\u{4e2d}\u{83ef}");
        assert_feed_ok!(decoder, [0xc1, 0xb0, 0xea], [], "\u{6c11}\u{570b}");
        assert_feed_ok!(decoder, [0x31, 0xa3, 0xe1, 0x2f, 0x6d], [], "1\u{20ac}/m");
        assert_feed_ok!(decoder, [0xf9, 0xfe], [], "\u{ffed}");
        assert_feed_ok!(decoder, [0x87, 0x7e], [], "\u{3eec}"); // HKSCS-2008 addition
        assert_feed_ok!(decoder, [0x88, 0x62, 0x88, 0x64, 0x88, 0xa3, 0x88, 0xa5], [],
                        "\u{ca}\u{304}\u{00ca}\u{30c}\u{ea}\u{304}\u{ea}\u{30c}"); // 2-byte output
        assert_finish_ok!(decoder, "");
    }

    #[test]
    fn test_decoder_invalid_lone_lead_immediate_test_finish() {
        for lead in 0x81..0xff {
            let mut decoder = BigFive2003Encoding.raw_decoder();
            assert_feed_ok!(decoder, [], [lead], ""); // wait for a trail
            assert_finish_err!(decoder, "");
        }

        // 80/FF: immediate failure
        let mut decoder = BigFive2003Encoding.raw_decoder();
        assert_feed_err!(decoder, [], [0x80], [], "");
        assert_feed_err!(decoder, [], [0xff], [], "");
        assert_finish_ok!(decoder, "");
    }

    #[test]
    fn test_decoder_invalid_lone_lead_followed_by_space() {
        // the upper bound does not fit in a byte, so the cast happens inside the loop.
        for code in 0x80..0x100 {
            let byte = code as u8;
            let mut decoder = BigFive2003Encoding.raw_decoder();
            assert_feed_err!(decoder, [], [byte], [0x20], "");
            assert_finish_ok!(decoder, "");
        }
    }

    #[test]
    fn test_decoder_invalid_lead_followed_by_invalid_trail() {
        // unlike most other cases, valid lead + invalid MSB-set trail are entirely consumed.
        // https://www.w3.org/Bugs/Public/show_bug.cgi?id=16771
        for lead in 0x81..0xff {
            // lead and trail arriving in the same feed
            let mut decoder = BigFive2003Encoding.raw_decoder();
            assert_feed_err!(decoder, [], [lead, 0x80], [0x20], "");
            assert_feed_err!(decoder, [], [lead, 0xff], [0x20], "");
            assert_finish_ok!(decoder, "");

            // lead and trail arriving in separate feeds
            let mut decoder = BigFive2003Encoding.raw_decoder();
            assert_feed_ok!(decoder, [], [lead], "");
            assert_feed_err!(decoder, [], [0x80], [0x20], "");
            assert_feed_ok!(decoder, [], [lead], "");
            assert_feed_err!(decoder, [], [0xff], [0x20], "");
            assert_finish_ok!(decoder, "");
        }

        // 80/FF is not a valid lead and the trail is not consumed
        let mut decoder = BigFive2003Encoding.raw_decoder();
        assert_feed_err!(decoder, [], [0x80], [0x80], "");
        assert_feed_err!(decoder, [], [0x80], [0xff], "");
        assert_feed_err!(decoder, [], [0xff], [0x80], "");
        assert_feed_err!(decoder, [], [0xff], [0xff], "");
        assert_finish_ok!(decoder, "");
    }

    // a failed finish must leave the decoder usable for the next input.
    #[test]
    fn test_decoder_feed_after_finish() {
        let mut decoder = BigFive2003Encoding.raw_decoder();
        assert_feed_ok!(decoder, [0xa4, 0x40], [0xa4], "\u{4e00}");
        assert_finish_err!(decoder, "");
        assert_feed_ok!(decoder, [0xa4, 0x40], [], "\u{4e00}");
        assert_finish_ok!(decoder, "");
    }

    #[bench]
    fn bench_encode_short_text(bencher: &mut test::Bencher) {
        let text = testutils::TRADITIONAL_CHINESE_TEXT;
        bencher.bytes = text.len() as u64;
        bencher.iter(|| {
            let encoded = BigFive2003Encoding.encode(&text, EncoderTrap::Strict);
            test::black_box(encoded)
        })
    }

    #[bench]
    fn bench_decode_short_text(bencher: &mut test::Bencher) {
        // the input is encoded once, outside of the measured closure.
        let encoded = BigFive2003Encoding.encode(testutils::TRADITIONAL_CHINESE_TEXT,
                                                 EncoderTrap::Strict).ok().unwrap();
        bencher.bytes = encoded.len() as u64;
        bencher.iter(|| {
            let decoded = BigFive2003Encoding.decode(&encoded, DecoderTrap::Strict);
            test::black_box(decoded)
        })
    }
}
278