1 // This is a part of rust-encoding. 2 // Copyright (c) 2013-2015, Kang Seonghoon. 3 // See README.md and LICENSE.txt for details. 4 5 //! Legacy simplified Chinese encodings based on GB 2312 and GB 18030. 6 7 use std::convert::Into; 8 use std::marker::PhantomData; 9 use std::default::Default; 10 use util::StrCharIndex; 11 use index_simpchinese as index; 12 use types::*; 13 14 /// An implementation type for GBK. 15 /// 16 /// Can be used as a type parameter to `GBEncoding` and `GBEncoder`. 17 /// (GB18030Decoder is shared by both.) 18 #[derive(Clone, Copy)] 19 pub struct GBK; 20 21 /// An implementation type for GB18030. 22 /// 23 /// Can be used as a type parameter to `GBEncoding` and `GBEncoder.' 24 /// (GB18030Decoder is shared by both.) 25 #[derive(Clone, Copy)] 26 pub struct GB18030; 27 28 /// An internal trait used to customize GBK and GB18030 implementations. 29 #[doc(hidden)] // XXX never intended to be used publicly, should be gone later 30 pub trait GBType: Clone + 'static { name() -> &'static str31 fn name() -> &'static str; whatwg_name() -> Option<&'static str>32 fn whatwg_name() -> Option<&'static str>; initial_gbk_flag() -> bool33 fn initial_gbk_flag() -> bool; 34 } 35 36 impl GBType for GBK { name() -> &'static str37 fn name() -> &'static str { "gbk" } whatwg_name() -> Option<&'static str>38 fn whatwg_name() -> Option<&'static str> { Some("gbk") } initial_gbk_flag() -> bool39 fn initial_gbk_flag() -> bool { true } 40 } 41 42 impl GBType for GB18030 { name() -> &'static str43 fn name() -> &'static str { "gb18030" } whatwg_name() -> Option<&'static str>44 fn whatwg_name() -> Option<&'static str> { Some("gb18030") } initial_gbk_flag() -> bool45 fn initial_gbk_flag() -> bool { false } 46 } 47 48 /** 49 * GBK and GB 18030-2005. 50 * 51 * The original GBK 1.0 region spans `[81-FE] [40-7E 80-FE]`, and is derived from 52 * several different revisions of a family of encodings named "GBK": 53 * 54 * - GBK as specified in the normative annex of GB 13000.1-93, 55 * the domestic standard equivalent to Unicode 1.1, 56 * consisted of characters included in Unicode 1.1 and not in GB 2312-80. 57 * - Windows code page 936 is the widespread extension to GBK. 58 * - Due to the popularity of Windows code page 936, 59 * a formal encoding based on Windows code page 936 (while adding new characters) 60 * was standardized into GBK 1.0. 61 * - Finally, GB 18030 added four-byte sequences to GBK for becoming a pan-Unicode encoding, 62 * while adding new characters to the (former) GBK region again. 63 * 64 * GB 18030-2005 is a simplified Chinese encoding which extends GBK 1.0 to a pan-Unicode encoding. 65 * It assigns four-byte sequences to every Unicode codepoint missing from the GBK area, 66 * lexicographically ordered with occasional "gaps" for codepoints in the GBK area. 67 * Due to this compatibility decision, 68 * there is no simple relationship between these four-byte sequences and Unicode codepoints, 69 * though there *exists* a relatively simple mapping algorithm with a small lookup table. 70 * 71 * ## Specialization 72 * 73 * This type is specialized with GBType `T`, 74 * which should be either `GBK` or `GB18030`. 75 */ 76 #[derive(Clone, Copy)] 77 pub struct GBEncoding<T> { 78 _marker: PhantomData<T> 79 } 80 81 /// A type for GBK. 82 pub type GBKEncoding = GBEncoding<GBK>; 83 /// A type for GB18030. 84 pub type GB18030Encoding = GBEncoding<GB18030>; 85 86 /// An instance for GBK. 87 pub const GBK_ENCODING: GBKEncoding = GBEncoding { _marker: PhantomData }; 88 /// An instance for GB18030. 89 pub const GB18030_ENCODING: GB18030Encoding = GBEncoding { _marker: PhantomData }; 90 91 impl<T: GBType> Encoding for GBEncoding<T> { name(&self) -> &'static str92 fn name(&self) -> &'static str { <T as GBType>::name() } whatwg_name(&self) -> Option<&'static str>93 fn whatwg_name(&self) -> Option<&'static str> { <T as GBType>::whatwg_name() } raw_encoder(&self) -> Box<RawEncoder>94 fn raw_encoder(&self) -> Box<RawEncoder> { GBEncoder::<T>::new() } raw_decoder(&self) -> Box<RawDecoder>95 fn raw_decoder(&self) -> Box<RawDecoder> { GB18030Decoder::new() } 96 } 97 98 /** 99 * An encoder for GBK and GB18030. 100 * 101 * ## Specialization 102 * 103 * This type is specialized with GBType `T`, 104 * which should be either `GBK` or `GB18030`. 105 */ 106 #[derive(Clone, Copy)] 107 pub struct GBEncoder<T> { 108 _marker: PhantomData<T> 109 } 110 111 impl<T: GBType> GBEncoder<T> { new() -> Box<RawEncoder>112 pub fn new() -> Box<RawEncoder> { 113 Box::new(GBEncoder::<T> { _marker: PhantomData }) 114 } 115 } 116 117 impl<T: GBType> RawEncoder for GBEncoder<T> { from_self(&self) -> Box<RawEncoder>118 fn from_self(&self) -> Box<RawEncoder> { GBEncoder::<T>::new() } is_ascii_compatible(&self) -> bool119 fn is_ascii_compatible(&self) -> bool { true } 120 raw_feed(&mut self, input: &str, output: &mut ByteWriter) -> (usize, Option<CodecError>)121 fn raw_feed(&mut self, input: &str, output: &mut ByteWriter) -> (usize, Option<CodecError>) { 122 output.writer_hint(input.len()); 123 124 let gbk_flag = <T as GBType>::initial_gbk_flag(); 125 for ((i, j), ch) in input.index_iter() { 126 if ch < '\u{80}' { 127 output.write_byte(ch as u8); 128 } else if gbk_flag && ch == '\u{20AC}' { 129 output.write_byte('\u{80}' as u8) 130 } else { 131 let ptr = index::gb18030::backward(ch as u32); 132 if ptr == 0xffff { 133 if gbk_flag { 134 return (i, Some(CodecError { 135 upto: j as isize, 136 cause: "gbk doesn't support gb18030 extensions".into() 137 })); 138 } 139 let ptr = index::gb18030_ranges::backward(ch as u32); 140 assert!(ptr != 0xffffffff); 141 let (ptr, byte4) = (ptr / 10, ptr % 10); 142 let (ptr, byte3) = (ptr / 126, ptr % 126); 143 let (byte1, byte2) = (ptr / 10, ptr % 10); 144 output.write_byte((byte1 + 0x81) as u8); 145 output.write_byte((byte2 + 0x30) as u8); 146 output.write_byte((byte3 + 0x81) as u8); 147 output.write_byte((byte4 + 0x30) as u8); 148 } else { 149 let lead = ptr / 190 + 0x81; 150 let trail = ptr % 190; 151 let trailoffset = if trail < 0x3f {0x40} else {0x41}; 152 output.write_byte(lead as u8); 153 output.write_byte((trail + trailoffset) as u8); 154 } 155 } 156 } 157 (input.len(), None) 158 } 159 raw_finish(&mut self, _output: &mut ByteWriter) -> Option<CodecError>160 fn raw_finish(&mut self, _output: &mut ByteWriter) -> Option<CodecError> { 161 None 162 } 163 } 164 165 /// A decoder for GB 18030 (also used by GBK). 166 #[derive(Clone, Copy)] 167 struct GB18030Decoder { 168 st: gb18030::State, 169 } 170 171 impl GB18030Decoder { new() -> Box<RawDecoder>172 pub fn new() -> Box<RawDecoder> { 173 Box::new(GB18030Decoder { st: Default::default() }) 174 } 175 } 176 177 impl RawDecoder for GB18030Decoder { from_self(&self) -> Box<RawDecoder>178 fn from_self(&self) -> Box<RawDecoder> { GB18030Decoder::new() } is_ascii_compatible(&self) -> bool179 fn is_ascii_compatible(&self) -> bool { true } 180 raw_feed(&mut self, input: &[u8], output: &mut StringWriter) -> (usize, Option<CodecError>)181 fn raw_feed(&mut self, input: &[u8], output: &mut StringWriter) -> (usize, Option<CodecError>) { 182 let (st, processed, err) = gb18030::raw_feed(self.st, input, output, &()); 183 self.st = st; 184 (processed, err) 185 } 186 raw_finish(&mut self, output: &mut StringWriter) -> Option<CodecError>187 fn raw_finish(&mut self, output: &mut StringWriter) -> Option<CodecError> { 188 let (st, err) = gb18030::raw_finish(self.st, output, &()); 189 self.st = st; 190 err 191 } 192 } 193 194 stateful_decoder! { 195 module gb18030; 196 197 internal pub fn map_two_bytes(lead: u8, trail: u8) -> u32 { 198 use index_simpchinese as index; 199 200 let lead = lead as u16; 201 let trail = trail as u16; 202 let index = match (lead, trail) { 203 (0x81...0xfe, 0x40...0x7e) | (0x81...0xfe, 0x80...0xfe) => { 204 let trailoffset = if trail < 0x7f {0x40} else {0x41}; 205 (lead - 0x81) * 190 + trail - trailoffset 206 } 207 _ => 0xffff, 208 }; 209 index::gb18030::forward(index) 210 } 211 212 internal pub fn map_four_bytes(b1: u8, b2: u8, b3: u8, b4: u8) -> u32 { 213 use index_simpchinese as index; 214 215 // no range check here, caller should have done all checks 216 let index = (b1 as u32 - 0x81) * 12600 + (b2 as u32 - 0x30) * 1260 + 217 (b3 as u32 - 0x81) * 10 + (b4 as u32 - 0x30); 218 index::gb18030_ranges::forward(index) 219 } 220 221 initial: 222 // gb18030 first = 0x00, gb18030 second = 0x00, gb18030 third = 0x00 223 state S0(ctx: Context) { 224 case b @ 0x00...0x7f => ctx.emit(b as u32); 225 case 0x80 => ctx.emit(0x20ac); 226 case b @ 0x81...0xfe => S1(ctx, b); 227 case _ => ctx.err("invalid sequence"); 228 } 229 230 transient: 231 // gb18030 first != 0x00, gb18030 second = 0x00, gb18030 third = 0x00 232 state S1(ctx: Context, first: u8) { 233 case b @ 0x30...0x39 => S2(ctx, first, b); 234 case b => match map_two_bytes(first, b) { 235 0xffff => ctx.backup_and_err(1, "invalid sequence"), // unconditional 236 ch => ctx.emit(ch) 237 }; 238 } 239 240 // gb18030 first != 0x00, gb18030 second != 0x00, gb18030 third = 0x00 241 state S2(ctx: Context, first: u8, second: u8) { 242 case b @ 0x81...0xfe => S3(ctx, first, second, b); 243 case _ => ctx.backup_and_err(2, "invalid sequence"); 244 } 245 246 // gb18030 first != 0x00, gb18030 second != 0x00, gb18030 third != 0x00 247 state S3(ctx: Context, first: u8, second: u8, third: u8) { 248 case b @ 0x30...0x39 => match map_four_bytes(first, second, third, b) { 249 0xffffffff => ctx.backup_and_err(3, "invalid sequence"), // unconditional 250 ch => ctx.emit(ch) 251 }; 252 case _ => ctx.backup_and_err(3, "invalid sequence"); 253 } 254 } 255 256 #[cfg(test)] 257 mod gb18030_tests { 258 extern crate test; 259 use super::GB18030_ENCODING; 260 use testutils; 261 use types::*; 262 263 #[test] test_encoder()264 fn test_encoder() { 265 let mut e = GB18030_ENCODING.raw_encoder(); 266 assert_feed_ok!(e, "A", "", [0x41]); 267 assert_feed_ok!(e, "BC", "", [0x42, 0x43]); 268 assert_feed_ok!(e, "", "", []); 269 assert_feed_ok!(e, "\u{4e2d}\u{534e}\u{4eba}\u{6c11}\u{5171}\u{548c}\u{56fd}", "", 270 [0xd6, 0xd0, 0xbb, 0xaa, 0xc8, 0xcb, 0xc3, 0xf1, 271 0xb9, 0xb2, 0xba, 0xcd, 0xb9, 0xfa]); 272 assert_feed_ok!(e, "1\u{20ac}/m", "", [0x31, 0xa2, 0xe3, 0x2f, 0x6d]); 273 assert_feed_ok!(e, "\u{ff21}\u{ff22}\u{ff23}", "", [0xa3, 0xc1, 0xa3, 0xc2, 0xa3, 0xc3]); 274 assert_feed_ok!(e, "\u{80}", "", [0x81, 0x30, 0x81, 0x30]); 275 assert_feed_ok!(e, "\u{81}", "", [0x81, 0x30, 0x81, 0x31]); 276 assert_feed_ok!(e, "\u{a3}", "", [0x81, 0x30, 0x84, 0x35]); 277 assert_feed_ok!(e, "\u{a4}", "", [0xa1, 0xe8]); 278 assert_feed_ok!(e, "\u{a5}", "", [0x81, 0x30, 0x84, 0x36]); 279 assert_feed_ok!(e, "\u{10ffff}", "", [0xe3, 0x32, 0x9a, 0x35]); 280 assert_feed_ok!(e, "\u{2a6a5}\u{3007}", "", [0x98, 0x35, 0xee, 0x37, 0xa9, 0x96]); 281 assert_finish_ok!(e, []); 282 } 283 284 #[test] test_decoder_valid()285 fn test_decoder_valid() { 286 let mut d = GB18030_ENCODING.raw_decoder(); 287 assert_feed_ok!(d, [0x41], [], "A"); 288 assert_feed_ok!(d, [0x42, 0x43], [], "BC"); 289 assert_feed_ok!(d, [], [], ""); 290 assert_feed_ok!(d, [0xd6, 0xd0, 0xbb, 0xaa, 0xc8, 0xcb, 0xc3, 0xf1, 291 0xb9, 0xb2, 0xba, 0xcd, 0xb9, 0xfa], [], 292 "\u{4e2d}\u{534e}\u{4eba}\u{6c11}\u{5171}\u{548c}\u{56fd}"); 293 assert_feed_ok!(d, [0x31, 0x80, 0x2f, 0x6d], [], "1\u{20ac}/m"); 294 assert_feed_ok!(d, [0xa3, 0xc1, 0xa3, 0xc2, 0xa3, 0xc3], [], "\u{ff21}\u{ff22}\u{ff23}"); 295 assert_feed_ok!(d, [0x81, 0x30, 0x81, 0x30], [], "\u{80}"); 296 assert_feed_ok!(d, [0x81, 0x30, 0x81, 0x31], [], "\u{81}"); 297 assert_feed_ok!(d, [0x81, 0x30, 0x84, 0x35], [], "\u{a3}"); 298 assert_feed_ok!(d, [0xa1, 0xe8], [], "\u{a4}" ); 299 assert_feed_ok!(d, [0x81, 0x30, 0x84, 0x36], [], "\u{a5}"); 300 assert_feed_ok!(d, [0xe3, 0x32, 0x9a, 0x35], [], "\u{10ffff}"); 301 assert_feed_ok!(d, [0x98, 0x35, 0xee, 0x37, 0xa9, 0x96], [], "\u{2a6a5}\u{3007}"); 302 assert_finish_ok!(d, ""); 303 } 304 305 #[test] test_decoder_valid_partial()306 fn test_decoder_valid_partial() { 307 let mut d = GB18030_ENCODING.raw_decoder(); 308 assert_feed_ok!(d, [], [0xa1], ""); 309 assert_feed_ok!(d, [0xa1], [], "\u{3000}"); 310 assert_feed_ok!(d, [], [0x81], ""); 311 assert_feed_ok!(d, [], [0x30], ""); 312 assert_feed_ok!(d, [], [0x81], ""); 313 assert_feed_ok!(d, [0x30], [], "\u{80}"); 314 assert_feed_ok!(d, [], [0x81], ""); 315 assert_feed_ok!(d, [], [0x30], ""); 316 assert_feed_ok!(d, [0x81, 0x31], [], "\u{81}"); 317 assert_feed_ok!(d, [], [0x81], ""); 318 assert_feed_ok!(d, [0x30, 0x81, 0x32], [], "\u{82}"); 319 assert_feed_ok!(d, [], [0x81], ""); 320 assert_feed_ok!(d, [], [0x30, 0x81], ""); 321 assert_feed_ok!(d, [0x33], [], "\u{83}"); 322 assert_feed_ok!(d, [], [0x81, 0x30], ""); 323 assert_feed_ok!(d, [], [0x81], ""); 324 assert_feed_ok!(d, [0x34], [], "\u{84}"); 325 assert_feed_ok!(d, [], [0x81, 0x30], ""); 326 assert_feed_ok!(d, [0x81, 0x35], [], "\u{85}"); 327 assert_feed_ok!(d, [], [0x81, 0x30, 0x81], ""); 328 assert_feed_ok!(d, [0x36], [], "\u{86}"); 329 assert_finish_ok!(d, ""); 330 } 331 332 #[test] test_decoder_invalid_partial()333 fn test_decoder_invalid_partial() { 334 let mut d = GB18030_ENCODING.raw_decoder(); 335 assert_feed_ok!(d, [], [0xa1], ""); 336 assert_finish_err!(d, ""); 337 338 let mut d = GB18030_ENCODING.raw_decoder(); 339 assert_feed_ok!(d, [], [0x81], ""); 340 assert_finish_err!(d, ""); 341 342 let mut d = GB18030_ENCODING.raw_decoder(); 343 assert_feed_ok!(d, [], [0x81, 0x30], ""); 344 assert_finish_err!(d, ""); 345 346 let mut d = GB18030_ENCODING.raw_decoder(); 347 assert_feed_ok!(d, [], [0x81, 0x30, 0x81], ""); 348 assert_finish_err!(d, ""); 349 } 350 351 #[test] test_decoder_invalid_out_of_range()352 fn test_decoder_invalid_out_of_range() { 353 let mut d = GB18030_ENCODING.raw_decoder(); 354 assert_feed_err!(d, [], [0xff], [], ""); 355 assert_feed_err!(d, [], [0x81], [0x00], ""); 356 assert_feed_err!(d, [], [0x81], [0x7f], ""); 357 assert_feed_err!(d, [], [0x81], [0xff], ""); 358 assert_feed_err!(d, [], [0x81], [0x31, 0x00], ""); 359 assert_feed_err!(d, [], [0x81], [0x31, 0x80], ""); 360 assert_feed_err!(d, [], [0x81], [0x31, 0xff], ""); 361 assert_feed_err!(d, [], [0x81], [0x31, 0x81, 0x00], ""); 362 assert_feed_err!(d, [], [0x81], [0x31, 0x81, 0x2f], ""); 363 assert_feed_err!(d, [], [0x81], [0x31, 0x81, 0x3a], ""); 364 assert_feed_err!(d, [], [0x81], [0x31, 0x81, 0xff], ""); 365 assert_finish_ok!(d, ""); 366 } 367 368 #[test] test_decoder_invalid_boundary()369 fn test_decoder_invalid_boundary() { 370 // U+10FFFF (E3 32 9A 35) is the last Unicode codepoint, E3 32 9A 36 is invalid. 371 // note that since the 2nd to 4th bytes may coincide with ASCII, bytes 32 9A 36 is 372 // not considered to be in the problem. this is compatible to WHATWG Encoding standard. 373 let mut d = GB18030_ENCODING.raw_decoder(); 374 assert_feed_ok!(d, [], [0xe3], ""); 375 assert_feed_err!(d, [], [], [0x32, 0x9a, 0x36], ""); 376 assert_finish_ok!(d, ""); 377 378 let mut d = GB18030_ENCODING.raw_decoder(); 379 assert_feed_ok!(d, [], [0xe3], ""); 380 assert_feed_ok!(d, [], [0x32, 0x9a], ""); 381 assert_feed_err!(d, -2, [], [], [0x32, 0x9a, 0x36], ""); 382 assert_finish_ok!(d, ""); 383 } 384 385 #[test] test_decoder_feed_after_finish()386 fn test_decoder_feed_after_finish() { 387 let mut d = GB18030_ENCODING.raw_decoder(); 388 assert_feed_ok!(d, [0xd2, 0xbb], [0xd2], "\u{4e00}"); 389 assert_finish_err!(d, ""); 390 assert_feed_ok!(d, [0xd2, 0xbb], [], "\u{4e00}"); 391 assert_finish_ok!(d, ""); 392 393 let mut d = GB18030_ENCODING.raw_decoder(); 394 assert_feed_ok!(d, [0x98, 0x35, 0xee, 0x37], [0x98, 0x35, 0xee], "\u{2a6a5}"); 395 assert_finish_err!(d, ""); 396 assert_feed_ok!(d, [0x98, 0x35, 0xee, 0x37], [0x98, 0x35], "\u{2a6a5}"); 397 assert_finish_err!(d, ""); 398 assert_feed_ok!(d, [0x98, 0x35, 0xee, 0x37], [0x98], "\u{2a6a5}"); 399 assert_finish_err!(d, ""); 400 assert_feed_ok!(d, [0x98, 0x35, 0xee, 0x37], [], "\u{2a6a5}"); 401 assert_finish_ok!(d, ""); 402 } 403 404 #[bench] bench_encode_short_text(bencher: &mut test::Bencher)405 fn bench_encode_short_text(bencher: &mut test::Bencher) { 406 let s = testutils::SIMPLIFIED_CHINESE_TEXT; 407 bencher.bytes = s.len() as u64; 408 bencher.iter(|| test::black_box({ 409 GB18030_ENCODING.encode(&s, EncoderTrap::Strict) 410 })) 411 } 412 413 #[bench] bench_decode_short_text(bencher: &mut test::Bencher)414 fn bench_decode_short_text(bencher: &mut test::Bencher) { 415 let s = GB18030_ENCODING.encode(testutils::SIMPLIFIED_CHINESE_TEXT, 416 EncoderTrap::Strict).ok().unwrap(); 417 bencher.bytes = s.len() as u64; 418 bencher.iter(|| test::black_box({ 419 GB18030_ENCODING.decode(&s, DecoderTrap::Strict) 420 })) 421 } 422 } 423 424 #[cfg(test)] 425 mod gbk_tests { 426 extern crate test; 427 use super::GBK_ENCODING; 428 use testutils; 429 use types::*; 430 431 // GBK and GB 18030 share the same decoder logic. 432 433 #[test] test_encoder()434 fn test_encoder() { 435 let mut e = GBK_ENCODING.raw_encoder(); 436 assert_feed_ok!(e, "A", "", [0x41]); 437 assert_feed_ok!(e, "BC", "", [0x42, 0x43]); 438 assert_feed_ok!(e, "", "", []); 439 assert_feed_ok!(e, "\u{4e2d}\u{534e}\u{4eba}\u{6c11}\u{5171}\u{548c}\u{56fd}", "", 440 [0xd6, 0xd0, 0xbb, 0xaa, 0xc8, 0xcb, 0xc3, 0xf1, 441 0xb9, 0xb2, 0xba, 0xcd, 0xb9, 0xfa]); 442 assert_feed_ok!(e, "1\u{20ac}/m", "", [0x31, 0x80, 0x2f, 0x6d]); 443 assert_feed_ok!(e, "\u{ff21}\u{ff22}\u{ff23}", "", [0xa3, 0xc1, 0xa3, 0xc2, 0xa3, 0xc3]); 444 assert_feed_err!(e, "", "\u{80}", "", []); 445 assert_feed_err!(e, "", "\u{81}", "", []); 446 assert_feed_err!(e, "", "\u{a3}", "", []); 447 assert_feed_ok!(e, "\u{a4}", "", [0xa1, 0xe8]); 448 assert_feed_err!(e, "", "\u{a5}", "", []); 449 assert_feed_err!(e, "", "\u{10ffff}", "", []); 450 assert_feed_err!(e, "", "\u{2a6a5}", "\u{3007}", []); 451 assert_feed_err!(e, "\u{3007}", "\u{2a6a5}", "", [0xa9, 0x96]); 452 assert_finish_ok!(e, []); 453 } 454 455 #[bench] bench_encode_short_text(bencher: &mut test::Bencher)456 fn bench_encode_short_text(bencher: &mut test::Bencher) { 457 let s = testutils::SIMPLIFIED_CHINESE_TEXT; 458 bencher.bytes = s.len() as u64; 459 bencher.iter(|| test::black_box({ 460 GBK_ENCODING.encode(&s, EncoderTrap::Strict) 461 })) 462 } 463 } 464 465 /** 466 * HZ. (RFC 1843) 467 * 468 * This is a simplified Chinese encoding based on GB 2312. 469 * It bears a resemblance to ISO 2022 encodings in such that the printable escape sequences `~{` 470 * and `~}` are used to delimit a sequence of 7-bit-safe GB 2312 sequences. For the comparison, 471 * they are equivalent to ISO-2022-CN escape sequences `ESC $ ) A` and `ESC ( B`. 472 * Additional escape sequences `~~` (for a literal `~`) and `~\n` (ignored) are also supported. 473 */ 474 #[derive(Clone, Copy)] 475 pub struct HZEncoding; 476 477 impl Encoding for HZEncoding { name(&self) -> &'static str478 fn name(&self) -> &'static str { "hz" } whatwg_name(&self) -> Option<&'static str>479 fn whatwg_name(&self) -> Option<&'static str> { None } raw_encoder(&self) -> Box<RawEncoder>480 fn raw_encoder(&self) -> Box<RawEncoder> { HZEncoder::new() } raw_decoder(&self) -> Box<RawDecoder>481 fn raw_decoder(&self) -> Box<RawDecoder> { HZDecoder::new() } 482 } 483 484 /// An encoder for HZ. 485 #[derive(Clone, Copy)] 486 pub struct HZEncoder { 487 escaped: bool, 488 } 489 490 impl HZEncoder { new() -> Box<RawEncoder>491 pub fn new() -> Box<RawEncoder> { Box::new(HZEncoder { escaped: false }) } 492 } 493 494 impl RawEncoder for HZEncoder { from_self(&self) -> Box<RawEncoder>495 fn from_self(&self) -> Box<RawEncoder> { HZEncoder::new() } is_ascii_compatible(&self) -> bool496 fn is_ascii_compatible(&self) -> bool { false } 497 raw_feed(&mut self, input: &str, output: &mut ByteWriter) -> (usize, Option<CodecError>)498 fn raw_feed(&mut self, input: &str, output: &mut ByteWriter) -> (usize, Option<CodecError>) { 499 output.writer_hint(input.len()); 500 501 let mut escaped = self.escaped; 502 macro_rules! ensure_escaped( 503 () => (if !escaped { output.write_bytes(b"~{"); escaped = true; }) 504 ); 505 macro_rules! ensure_unescaped( 506 () => (if escaped { output.write_bytes(b"~}"); escaped = false; }) 507 ); 508 509 for ((i,j), ch) in input.index_iter() { 510 if ch < '\u{80}' { 511 ensure_unescaped!(); 512 output.write_byte(ch as u8); 513 if ch == '~' { output.write_byte('~' as u8); } 514 } else { 515 let ptr = index::gb18030::backward(ch as u32); 516 if ptr == 0xffff { 517 self.escaped = escaped; // do NOT reset the state! 518 return (i, Some(CodecError { 519 upto: j as isize, cause: "unrepresentable character".into() 520 })); 521 } else { 522 let lead = ptr / 190; 523 let trail = ptr % 190; 524 if lead < 0x21 - 1 || trail < 0x21 + 0x3f { // GBK extension, ignored 525 self.escaped = escaped; // do NOT reset the state! 526 return (i, Some(CodecError { 527 upto: j as isize, cause: "unrepresentable character".into() 528 })); 529 } else { 530 ensure_escaped!(); 531 output.write_byte((lead + 1) as u8); 532 output.write_byte((trail - 0x3f) as u8); 533 } 534 } 535 } 536 } 537 538 self.escaped = escaped; 539 (input.len(), None) 540 } 541 raw_finish(&mut self, _output: &mut ByteWriter) -> Option<CodecError>542 fn raw_finish(&mut self, _output: &mut ByteWriter) -> Option<CodecError> { 543 None 544 } 545 } 546 547 /// A decoder for HZ. 548 #[derive(Clone, Copy)] 549 struct HZDecoder { 550 st: hz::State, 551 } 552 553 impl HZDecoder { new() -> Box<RawDecoder>554 pub fn new() -> Box<RawDecoder> { 555 Box::new(HZDecoder { st: Default::default() }) 556 } 557 } 558 559 impl RawDecoder for HZDecoder { from_self(&self) -> Box<RawDecoder>560 fn from_self(&self) -> Box<RawDecoder> { HZDecoder::new() } is_ascii_compatible(&self) -> bool561 fn is_ascii_compatible(&self) -> bool { true } 562 raw_feed(&mut self, input: &[u8], output: &mut StringWriter) -> (usize, Option<CodecError>)563 fn raw_feed(&mut self, input: &[u8], output: &mut StringWriter) -> (usize, Option<CodecError>) { 564 let (st, processed, err) = hz::raw_feed(self.st, input, output, &()); 565 self.st = st; 566 (processed, err) 567 } 568 raw_finish(&mut self, output: &mut StringWriter) -> Option<CodecError>569 fn raw_finish(&mut self, output: &mut StringWriter) -> Option<CodecError> { 570 let (st, err) = hz::raw_finish(self.st, output, &()); 571 self.st = st; 572 err 573 } 574 } 575 576 stateful_decoder! { 577 module hz; 578 579 internal pub fn map_two_bytes(lead: u8, trail: u8) -> u32 { 580 use index_simpchinese as index; 581 582 let lead = lead as u16; 583 let trail = trail as u16; 584 let index = match (lead, trail) { 585 (0x20...0x7f, 0x21...0x7e) => (lead - 1) * 190 + (trail + 0x3f), 586 _ => 0xffff, 587 }; 588 index::gb18030::forward(index) 589 } 590 591 initial: 592 // hz-gb-2312 flag = unset, hz-gb-2312 lead = 0x00 593 state A0(ctx: Context) { 594 case 0x7e => A1(ctx); 595 case b @ 0x00...0x7f => ctx.emit(b as u32); 596 case _ => ctx.err("invalid sequence"); 597 final => ctx.reset(); 598 } 599 600 checkpoint: 601 // hz-gb-2312 flag = set, hz-gb-2312 lead = 0x00 602 state B0(ctx: Context) { 603 case 0x7e => B1(ctx); 604 case b @ 0x20...0x7f => B2(ctx, b); 605 case 0x0a => ctx.err("invalid sequence"); // error *and* reset 606 case _ => ctx.err("invalid sequence"), B0(ctx); 607 final => ctx.reset(); 608 } 609 610 transient: 611 // hz-gb-2312 flag = unset, hz-gb-2312 lead = 0x7e 612 state A1(ctx: Context) { 613 case 0x7b => B0(ctx); 614 case 0x7d => A0(ctx); 615 case 0x7e => ctx.emit(0x7e), A0(ctx); 616 case 0x0a => A0(ctx); 617 case _ => ctx.backup_and_err(1, "invalid sequence"); 618 final => ctx.err("incomplete sequence"); 619 } 620 621 // hz-gb-2312 flag = set, hz-gb-2312 lead = 0x7e 622 state B1(ctx: Context) { 623 case 0x7b => B0(ctx); 624 case 0x7d => A0(ctx); 625 case 0x7e => ctx.emit(0x7e), B0(ctx); 626 case 0x0a => A0(ctx); 627 case _ => ctx.backup_and_err(1, "invalid sequence"), B0(ctx); 628 final => ctx.err("incomplete sequence"); 629 } 630 631 // hz-gb-2312 flag = set, hz-gb-2312 lead != 0 & != 0x7e 632 state B2(ctx: Context, lead: u8) { 633 case 0x0a => ctx.err("invalid sequence"); // should reset the state! 634 case b => 635 match map_two_bytes(lead, b) { 636 0xffff => ctx.err("invalid sequence"), 637 ch => ctx.emit(ch) 638 }, 639 B0(ctx); 640 final => ctx.err("incomplete sequence"); 641 } 642 } 643 644 #[cfg(test)] 645 mod hz_tests { 646 extern crate test; 647 use super::HZEncoding; 648 use testutils; 649 use types::*; 650 651 #[test] test_encoder_valid()652 fn test_encoder_valid() { 653 let mut e = HZEncoding.raw_encoder(); 654 assert_feed_ok!(e, "A", "", *b"A"); 655 assert_feed_ok!(e, "BC", "", *b"BC"); 656 assert_feed_ok!(e, "", "", *b""); 657 assert_feed_ok!(e, "\u{4e2d}\u{534e}\u{4eba}\u{6c11}\u{5171}\u{548c}\u{56fd}", "", 658 *b"~{VP;*HKCq92:M9z"); 659 assert_feed_ok!(e, "\u{ff21}\u{ff22}\u{ff23}", "", *b"#A#B#C"); 660 assert_feed_ok!(e, "1\u{20ac}/m", "", *b"~}1~{\"c~}/m"); 661 assert_feed_ok!(e, "~<\u{a4}~\u{0a4}>~", "", *b"~~<~{!h~}~~~{!h~}>~~"); 662 assert_finish_ok!(e, []); 663 } 664 665 #[test] test_encoder_invalid()666 fn test_encoder_invalid() { 667 let mut e = HZEncoding.raw_encoder(); 668 assert_feed_err!(e, "", "\u{ffff}", "", []); 669 assert_feed_err!(e, "?", "\u{ffff}", "!", [0x3f]); 670 // no support for GBK extension 671 assert_feed_err!(e, "", "\u{3007}", "", []); 672 assert_finish_ok!(e, []); 673 } 674 675 #[test] test_decoder_valid()676 fn test_decoder_valid() { 677 let mut d = HZEncoding.raw_decoder(); 678 assert_feed_ok!(d, *b"A", *b"", "A"); 679 assert_feed_ok!(d, *b"BC", *b"", "BC"); 680 assert_feed_ok!(d, *b"D~~E", *b"~", "D~E"); 681 assert_feed_ok!(d, *b"~F~\nG", *b"~", "~FG"); 682 assert_feed_ok!(d, *b"", *b"", ""); 683 assert_feed_ok!(d, *b"\nH", *b"~", "H"); 684 assert_feed_ok!(d, *b"{VP~}~{;*~{HKCq92:M9z", *b"", 685 "\u{4e2d}\u{534e}\u{4eba}\u{6c11}\u{5171}\u{548c}\u{56fd}"); 686 assert_feed_ok!(d, *b"", *b"#", ""); 687 assert_feed_ok!(d, *b"A", *b"~", "\u{ff21}"); 688 assert_feed_ok!(d, *b"~#B~~#C", *b"~", "~\u{ff22}~\u{ff23}"); 689 assert_feed_ok!(d, *b"", *b"", ""); 690 assert_feed_ok!(d, *b"\n#D~{#E~\n#F~{#G", *b"~", "#D\u{ff25}#F\u{ff27}"); 691 assert_feed_ok!(d, *b"}X~}YZ", *b"", "XYZ"); 692 assert_finish_ok!(d, ""); 693 } 694 695 #[test] test_decoder_invalid_out_or_range()696 fn test_decoder_invalid_out_or_range() { 697 let mut d = HZEncoding.raw_decoder(); 698 assert_feed_ok!(d, *b"~{", *b"", ""); 699 assert_feed_err!(d, *b"", *b"\x20\x20", *b"", ""); 700 assert_feed_err!(d, *b"", *b"\x20\x7f", *b"", ""); // do not reset the state (except for CR) 701 assert_feed_err!(d, *b"", *b"\x21\x7f", *b"", ""); 702 assert_feed_err!(d, *b"", *b"\x7f\x20", *b"", ""); 703 assert_feed_err!(d, *b"", *b"\x7f\x21", *b"", ""); 704 assert_feed_err!(d, *b"", *b"\x7f\x7f", *b"", ""); 705 assert_finish_ok!(d, ""); 706 } 707 708 #[test] test_decoder_invalid_carriage_return()709 fn test_decoder_invalid_carriage_return() { 710 // CR in the multibyte mode is invalid but *also* resets the state 711 let mut d = HZEncoding.raw_decoder(); 712 assert_feed_ok!(d, *b"~{#A", *b"", "\u{ff21}"); 713 assert_feed_err!(d, *b"", *b"\n", *b"", ""); 714 assert_feed_ok!(d, *b"#B~{#C", *b"", "#B\u{ff23}"); 715 assert_feed_err!(d, *b"", *b"#\n", *b"", ""); 716 assert_feed_ok!(d, *b"#D", *b"", "#D"); 717 assert_finish_ok!(d, ""); 718 } 719 720 #[test] test_decoder_invalid_partial()721 fn test_decoder_invalid_partial() { 722 let mut d = HZEncoding.raw_decoder(); 723 assert_feed_ok!(d, *b"", *b"~", ""); 724 assert_finish_err!(d, ""); 725 726 let mut d = HZEncoding.raw_decoder(); 727 assert_feed_ok!(d, *b"~{", *b"#", ""); 728 assert_finish_err!(d, ""); 729 730 let mut d = HZEncoding.raw_decoder(); 731 assert_feed_ok!(d, *b"~{#A", *b"~", "\u{ff21}"); 732 assert_finish_err!(d, ""); 733 } 734 735 #[test] test_decoder_invalid_escape()736 fn test_decoder_invalid_escape() { 737 let mut d = HZEncoding.raw_decoder(); 738 assert_feed_ok!(d, *b"#A", *b"", "#A"); 739 assert_feed_err!(d, *b"", *b"~", *b"xy", ""); 740 assert_feed_ok!(d, *b"#B", *b"", "#B"); 741 assert_feed_ok!(d, *b"", *b"~", ""); 742 assert_feed_err!(d, *b"", *b"", *b"xy", ""); 743 assert_feed_ok!(d, *b"#C~{#D", *b"", "#C\u{ff24}"); 744 assert_feed_err!(d, *b"", *b"~", *b"xy", ""); 745 assert_feed_ok!(d, *b"#E", *b"", "\u{ff25}"); // does not reset to ASCII 746 assert_feed_ok!(d, *b"", *b"~", ""); 747 assert_feed_err!(d, *b"", *b"", *b"xy", ""); 748 assert_feed_ok!(d, *b"#F~}#G", *b"", "\u{ff26}#G"); 749 assert_finish_ok!(d, ""); 750 } 751 752 #[test] test_decoder_feed_after_finish()753 fn test_decoder_feed_after_finish() { 754 let mut d = HZEncoding.raw_decoder(); 755 assert_feed_ok!(d, *b"R;~{R;", *b"R", "R;\u{4e00}"); 756 assert_finish_err!(d, ""); 757 assert_feed_ok!(d, *b"R;~{R;", *b"", "R;\u{4e00}"); 758 assert_finish_ok!(d, ""); 759 } 760 761 #[bench] bench_encode_short_text(bencher: &mut test::Bencher)762 fn bench_encode_short_text(bencher: &mut test::Bencher) { 763 let s = testutils::SIMPLIFIED_CHINESE_TEXT; 764 bencher.bytes = s.len() as u64; 765 bencher.iter(|| test::black_box({ 766 HZEncoding.encode(&s, EncoderTrap::Strict) 767 })) 768 } 769 770 #[bench] bench_decode_short_text(bencher: &mut test::Bencher)771 fn bench_decode_short_text(bencher: &mut test::Bencher) { 772 let s = HZEncoding.encode(testutils::SIMPLIFIED_CHINESE_TEXT, 773 EncoderTrap::Strict).ok().unwrap(); 774 bencher.bytes = s.len() as u64; 775 bencher.iter(|| test::black_box({ 776 HZEncoding.decode(&s, DecoderTrap::Strict) 777 })) 778 } 779 } 780 781