1 // This is a part of rust-encoding. 2 // Copyright (c) 2013-2015, Kang Seonghoon. 3 // See README.md and LICENSE.txt for details. 4 5 //! Legacy traditional Chinese encodings. 6 7 use std::convert::Into; 8 use std::default::Default; 9 use util::StrCharIndex; 10 use index_tradchinese as index; 11 use types::*; 12 13 /** 14 * Big5-2003 with common extensions. (XXX with asymmetric HKSCS-2008 support) 15 * 16 * This is a traditional Chinese encoding spanning the region `[81-FE] [40-7E A1-FE]`. 17 * Originally a proprietary encoding by the consortium of five companies (hence the name), 18 * the Republic of China government standardized Big5-2003 in an appendix of CNS 11643 19 * so that CNS 11643 plane 1 and plane 2 have 20 * an almost identical set of characters as Big5 (but with a different mapping). 21 * The Hong Kong government has an official extension to Big5 22 * named Hong Kong Supplementary Character Set (HKSCS). 23 * 24 * This particular implementation of Big5 includes the widespread ETEN and HKSCS extensions, 25 * but excludes less common extensions such as Big5+, Big-5E and Unicode-at-on. 26 */ 27 #[derive(Clone, Copy)] 28 pub struct BigFive2003Encoding; 29 30 impl Encoding for BigFive2003Encoding { name(&self) -> &'static str31 fn name(&self) -> &'static str { "big5-2003" } whatwg_name(&self) -> Option<&'static str>32 fn whatwg_name(&self) -> Option<&'static str> { Some("big5") } // WHATWG compatibility raw_encoder(&self) -> Box<RawEncoder>33 fn raw_encoder(&self) -> Box<RawEncoder> { BigFive2003Encoder::new() } raw_decoder(&self) -> Box<RawDecoder>34 fn raw_decoder(&self) -> Box<RawDecoder> { BigFive2003HKSCS2008Decoder::new() } 35 } 36 37 /// An encoder for Big5-2003. 38 #[derive(Clone, Copy)] 39 pub struct BigFive2003Encoder; 40 41 impl BigFive2003Encoder { new() -> Box<RawEncoder>42 pub fn new() -> Box<RawEncoder> { Box::new(BigFive2003Encoder) } 43 } 44 45 impl RawEncoder for BigFive2003Encoder { from_self(&self) -> Box<RawEncoder>46 fn from_self(&self) -> Box<RawEncoder> { BigFive2003Encoder::new() } is_ascii_compatible(&self) -> bool47 fn is_ascii_compatible(&self) -> bool { true } 48 raw_feed(&mut self, input: &str, output: &mut ByteWriter) -> (usize, Option<CodecError>)49 fn raw_feed(&mut self, input: &str, output: &mut ByteWriter) -> (usize, Option<CodecError>) { 50 output.writer_hint(input.len()); 51 52 for ((i,j), ch) in input.index_iter() { 53 if ch < '\u{80}' { 54 output.write_byte(ch as u8); 55 } else { 56 let ptr = index::big5::backward(ch as u32); 57 if ptr == 0xffff || ptr < (0xa1 - 0x81) * 157 { 58 // no HKSCS extension (XXX doesn't HKSCS include 0xFA40..0xFEFE?) 59 return (i, Some(CodecError { 60 upto: j as isize, cause: "unrepresentable character".into() 61 })); 62 } 63 let lead = ptr / 157 + 0x81; 64 let trail = ptr % 157; 65 let trailoffset = if trail < 0x3f {0x40} else {0x62}; 66 output.write_byte(lead as u8); 67 output.write_byte((trail + trailoffset) as u8); 68 } 69 } 70 (input.len(), None) 71 } 72 raw_finish(&mut self, _output: &mut ByteWriter) -> Option<CodecError>73 fn raw_finish(&mut self, _output: &mut ByteWriter) -> Option<CodecError> { 74 None 75 } 76 } 77 78 /// A decoder for Big5-2003 with HKSCS-2008 extension. 79 #[derive(Clone, Copy)] 80 struct BigFive2003HKSCS2008Decoder { 81 st: bigfive2003::State, 82 } 83 84 impl BigFive2003HKSCS2008Decoder { new() -> Box<RawDecoder>85 pub fn new() -> Box<RawDecoder> { 86 Box::new(BigFive2003HKSCS2008Decoder { st: Default::default() }) 87 } 88 } 89 90 impl RawDecoder for BigFive2003HKSCS2008Decoder { from_self(&self) -> Box<RawDecoder>91 fn from_self(&self) -> Box<RawDecoder> { BigFive2003HKSCS2008Decoder::new() } is_ascii_compatible(&self) -> bool92 fn is_ascii_compatible(&self) -> bool { true } 93 raw_feed(&mut self, input: &[u8], output: &mut StringWriter) -> (usize, Option<CodecError>)94 fn raw_feed(&mut self, input: &[u8], output: &mut StringWriter) -> (usize, Option<CodecError>) { 95 let (st, processed, err) = bigfive2003::raw_feed(self.st, input, output, &()); 96 self.st = st; 97 (processed, err) 98 } 99 raw_finish(&mut self, output: &mut StringWriter) -> Option<CodecError>100 fn raw_finish(&mut self, output: &mut StringWriter) -> Option<CodecError> { 101 let (st, err) = bigfive2003::raw_finish(self.st, output, &()); 102 self.st = st; 103 err 104 } 105 } 106 107 stateful_decoder! { 108 module bigfive2003; 109 110 internal pub fn map_two_bytes(lead: u8, trail: u8) -> u32 { 111 use index_tradchinese as index; 112 113 let lead = lead as u16; 114 let trail = trail as u16; 115 let index = match (lead, trail) { 116 (0x81...0xfe, 0x40...0x7e) | (0x81...0xfe, 0xa1...0xfe) => { 117 let trailoffset = if trail < 0x7f {0x40} else {0x62}; 118 (lead - 0x81) * 157 + trail - trailoffset 119 } 120 _ => 0xffff, 121 }; 122 index::big5::forward(index) // may return two-letter replacements 0..3 123 } 124 125 initial: 126 // big5 lead = 0x00 127 state S0(ctx: Context) { 128 case b @ 0x00...0x7f => ctx.emit(b as u32); 129 case b @ 0x81...0xfe => S1(ctx, b); 130 case _ => ctx.err("invalid sequence"); 131 } 132 133 transient: 134 // big5 lead != 0x00 135 state S1(ctx: Context, lead: u8) { 136 case b => match map_two_bytes(lead, b) { 137 0xffff => { 138 let backup = if b < 0x80 {1} else {0}; 139 ctx.backup_and_err(backup, "invalid sequence") 140 }, 141 0 /*index=1133*/ => ctx.emit_str("\u{ca}\u{304}"), 142 1 /*index=1135*/ => ctx.emit_str("\u{ca}\u{30c}"), 143 2 /*index=1164*/ => ctx.emit_str("\u{ea}\u{304}"), 144 3 /*index=1166*/ => ctx.emit_str("\u{ea}\u{30c}"), 145 ch => ctx.emit(ch), 146 }; 147 } 148 } 149 150 #[cfg(test)] 151 mod bigfive2003_tests { 152 extern crate test; 153 use super::BigFive2003Encoding; 154 use testutils; 155 use types::*; 156 157 #[test] test_encoder_valid()158 fn test_encoder_valid() { 159 let mut e = BigFive2003Encoding.raw_encoder(); 160 assert_feed_ok!(e, "A", "", [0x41]); 161 assert_feed_ok!(e, "BC", "", [0x42, 0x43]); 162 assert_feed_ok!(e, "", "", []); 163 assert_feed_ok!(e, "\u{4e2d}\u{83ef}\u{6c11}\u{570b}", "", 164 [0xa4, 0xa4, 0xb5, 0xd8, 0xa5, 0xc1, 0xb0, 0xea]); 165 assert_feed_ok!(e, "1\u{20ac}/m", "", [0x31, 0xa3, 0xe1, 0x2f, 0x6d]); 166 assert_feed_ok!(e, "\u{ffed}", "", [0xf9, 0xfe]); 167 assert_finish_ok!(e, []); 168 } 169 170 #[test] test_encoder_invalid()171 fn test_encoder_invalid() { 172 let mut e = BigFive2003Encoding.raw_encoder(); 173 assert_feed_err!(e, "", "\u{ffff}", "", []); 174 assert_feed_err!(e, "?", "\u{ffff}", "!", [0x3f]); 175 assert_feed_err!(e, "", "\u{3eec}", "\u{4e00}", []); // HKSCS-2008 addition 176 assert_finish_ok!(e, []); 177 } 178 179 #[test] test_decoder_valid()180 fn test_decoder_valid() { 181 let mut d = BigFive2003Encoding.raw_decoder(); 182 assert_feed_ok!(d, [0x41], [], "A"); 183 assert_feed_ok!(d, [0x42, 0x43], [], "BC"); 184 assert_feed_ok!(d, [], [], ""); 185 assert_feed_ok!(d, [0xa4, 0xa4, 0xb5, 0xd8, 0xa5, 0xc1, 0xb0, 0xea], [], 186 "\u{4e2d}\u{83ef}\u{6c11}\u{570b}"); 187 assert_feed_ok!(d, [], [0xa4], ""); 188 assert_feed_ok!(d, [0xa4, 0xb5, 0xd8], [0xa5], "\u{4e2d}\u{83ef}"); 189 assert_feed_ok!(d, [0xc1, 0xb0, 0xea], [], "\u{6c11}\u{570b}"); 190 assert_feed_ok!(d, [0x31, 0xa3, 0xe1, 0x2f, 0x6d], [], "1\u{20ac}/m"); 191 assert_feed_ok!(d, [0xf9, 0xfe], [], "\u{ffed}"); 192 assert_feed_ok!(d, [0x87, 0x7e], [], "\u{3eec}"); // HKSCS-2008 addition 193 assert_feed_ok!(d, [0x88, 0x62, 0x88, 0x64, 0x88, 0xa3, 0x88, 0xa5], [], 194 "\u{ca}\u{304}\u{00ca}\u{30c}\u{ea}\u{304}\u{ea}\u{30c}"); // 2-byte output 195 assert_finish_ok!(d, ""); 196 } 197 198 #[test] test_decoder_invalid_lone_lead_immediate_test_finish()199 fn test_decoder_invalid_lone_lead_immediate_test_finish() { 200 for i in 0x81..0xff { 201 let mut d = BigFive2003Encoding.raw_decoder(); 202 assert_feed_ok!(d, [], [i], ""); // wait for a trail 203 assert_finish_err!(d, ""); 204 } 205 206 // 80/FF: immediate failure 207 let mut d = BigFive2003Encoding.raw_decoder(); 208 assert_feed_err!(d, [], [0x80], [], ""); 209 assert_feed_err!(d, [], [0xff], [], ""); 210 assert_finish_ok!(d, ""); 211 } 212 213 #[test] test_decoder_invalid_lone_lead_followed_by_space()214 fn test_decoder_invalid_lone_lead_followed_by_space() { 215 for i in 0x80..0x100 { 216 let i = i as u8; 217 let mut d = BigFive2003Encoding.raw_decoder(); 218 assert_feed_err!(d, [], [i], [0x20], ""); 219 assert_finish_ok!(d, ""); 220 } 221 } 222 223 #[test] test_decoder_invalid_lead_followed_by_invalid_trail()224 fn test_decoder_invalid_lead_followed_by_invalid_trail() { 225 // unlike most other cases, valid lead + invalid MSB-set trail are entirely consumed. 226 // https://www.w3.org/Bugs/Public/show_bug.cgi?id=16771 227 for i in 0x81..0xff { 228 let mut d = BigFive2003Encoding.raw_decoder(); 229 assert_feed_err!(d, [], [i, 0x80], [0x20], ""); 230 assert_feed_err!(d, [], [i, 0xff], [0x20], ""); 231 assert_finish_ok!(d, ""); 232 233 let mut d = BigFive2003Encoding.raw_decoder(); 234 assert_feed_ok!(d, [], [i], ""); 235 assert_feed_err!(d, [], [0x80], [0x20], ""); 236 assert_feed_ok!(d, [], [i], ""); 237 assert_feed_err!(d, [], [0xff], [0x20], ""); 238 assert_finish_ok!(d, ""); 239 } 240 241 // 80/FF is not a valid lead and the trail is not consumed 242 let mut d = BigFive2003Encoding.raw_decoder(); 243 assert_feed_err!(d, [], [0x80], [0x80], ""); 244 assert_feed_err!(d, [], [0x80], [0xff], ""); 245 assert_feed_err!(d, [], [0xff], [0x80], ""); 246 assert_feed_err!(d, [], [0xff], [0xff], ""); 247 assert_finish_ok!(d, ""); 248 } 249 250 #[test] test_decoder_feed_after_finish()251 fn test_decoder_feed_after_finish() { 252 let mut d = BigFive2003Encoding.raw_decoder(); 253 assert_feed_ok!(d, [0xa4, 0x40], [0xa4], "\u{4e00}"); 254 assert_finish_err!(d, ""); 255 assert_feed_ok!(d, [0xa4, 0x40], [], "\u{4e00}"); 256 assert_finish_ok!(d, ""); 257 } 258 259 #[bench] bench_encode_short_text(bencher: &mut test::Bencher)260 fn bench_encode_short_text(bencher: &mut test::Bencher) { 261 let s = testutils::TRADITIONAL_CHINESE_TEXT; 262 bencher.bytes = s.len() as u64; 263 bencher.iter(|| test::black_box({ 264 BigFive2003Encoding.encode(&s, EncoderTrap::Strict) 265 })) 266 } 267 268 #[bench] bench_decode_short_text(bencher: &mut test::Bencher)269 fn bench_decode_short_text(bencher: &mut test::Bencher) { 270 let s = BigFive2003Encoding.encode(testutils::TRADITIONAL_CHINESE_TEXT, 271 EncoderTrap::Strict).ok().unwrap(); 272 bencher.bytes = s.len() as u64; 273 bencher.iter(|| test::black_box({ 274 BigFive2003Encoding.decode(&s, DecoderTrap::Strict) 275 })) 276 } 277 } 278