//
// GB18030Encoding.cs
//
// Author:
//	Atsushi Enomoto  <atsushi@ximian.com>
//
using System;
using System.Reflection;
using System.Text;
using I18N.Common;

#if DISABLE_UNSAFE
using MonoEncoder = I18N.Common.MonoSafeEncoder;
using MonoEncoding = I18N.Common.MonoSafeEncoding;
#endif

namespace I18N.CJK
{
	// Alias type deriving from GB18030Encoding; adds no behavior of its own.
	[Serializable]
	internal class ENCgb18030 : GB18030Encoding
	{
		public ENCgb18030 (): base () {}
	}

	// Alias type deriving from GB18030Encoding; adds no behavior of its own.
	[Serializable]
	public class CP54936 : GB18030Encoding { }

	// GB18030 text encoding. All conversion work is delegated to
	// GB18030Encoder / GB18030Decoder; the one-shot methods on this class
	// create a fresh encoder/decoder per call, so no state is carried
	// between calls made through this class.
	[Serializable]
	public class GB18030Encoding : MonoEncoding
	{
		// Constructor.
		// Passes 54936 and 936 to the MonoEncoding constructor
		// (presumably code page and Windows code page - confirm against MonoEncoding).
		public GB18030Encoding ()
			: base (54936, 936)
		{
		}

		public override string EncodingName {
			get { return "Chinese Simplified (GB18030)"; }
		}

		public override string HeaderName {
			get { return "GB18030"; }
		}

		public override string BodyName {
			get { return "GB18030"; }
		}

		public override string WebName {
			get { return "GB18030"; }
		}

		public override bool IsMailNewsDisplay {
			get { return true; }
		}

		public override bool IsMailNewsSave {
			get { return true; }
		}

		public override bool IsBrowserDisplay {
			get { return true; }
		}

		public override bool IsBrowserSave {
			get { return true; }
		}

		// Upper bound on the bytes produced for "len" chars: the encoder
		// emits at most 4 bytes per char.
		public override int GetMaxByteCount (int len)
		{
			// non-GB2312 characters in \u0080 - \uFFFF
			return len * 4;
		}

		// Upper bound on the chars produced for "len" bytes: the decoder
		// never emits more chars than it consumes bytes.
		public override int GetMaxCharCount (int len)
		{
			return len;
		}

#if !DISABLE_UNSAFE
		// One-shot byte count; uses a throw-away encoder with refresh = true.
		public unsafe override int GetByteCountImpl (char* chars, int count)
		{
			return new GB18030Encoder (this).GetByteCountImpl (chars, count, true);
		}

		// One-shot encode; uses a throw-away encoder with refresh = true.
		public unsafe override int GetBytesImpl (char* chars, int charCount, byte* bytes, int byteCount)
		{
			return new GB18030Encoder (this).GetBytesImpl (chars, charCount, bytes, byteCount, true);
		}
#else
		// One-shot byte count; uses a throw-away encoder with refresh = true.
		public override int GetByteCount (char [] chars, int index, int length)
		{
			return new GB18030Encoder (this).GetByteCount (chars, index, length, true);
		}

		// One-shot encode; uses a throw-away encoder with refresh = true.
		public override int GetBytes (char [] chars, int charIndex, int charCount, byte [] bytes, int byteIndex)
		{
			return new GB18030Encoder (this).GetBytes (chars, charIndex, charCount, bytes, byteIndex, true);
		}
#endif

		// One-shot char count; uses a throw-away decoder.
		public override int GetCharCount (byte [] bytes, int start, int len)
		{
			return new GB18030Decoder ().GetCharCount (bytes, start, len);
		}

		// One-shot decode; uses a throw-away decoder.
		public override int GetChars (byte [] bytes, int byteIdx, int srclen, char [] chars, int charIdx)
		{
			return new GB18030Decoder ().GetChars (bytes, byteIdx, srclen, chars, charIdx);
		}

		public override Encoder GetEncoder ()
		{
			return new GB18030Encoder (this);
		}

		public override Decoder GetDecoder ()
		{
			return new GB18030Decoder ();
		}
	}

	// Decoder for GB18030 byte sequences.
	//
	// Byte layout handled here:
	//   - first byte < 0x80            : one byte, copied as-is
	//   - first byte == 0x80           : one byte, decoded as U+20AC
	//   - first byte == 0xFF           : one byte, invalid, decoded as '?'
	//   - second byte 0x7F or 0xFF     : two bytes, invalid, decoded as '?'
	//   - second byte in 0x30..0x39    : four-byte sequence (GB18030Source.FromGBX)
	//   - anything else                : two-byte sequence looked up in the GB2312 table
	class GB18030Decoder : DbcsEncoding.DbcsDecoder
	{
		static readonly DbcsConvert gb2312 = DbcsConvert.Gb2312;

		// for now incomplete block is not supported - should we?
		// An incomplete trailing sequence is NOT remembered across calls:
		// GetChars simply stops in front of it.

		public GB18030Decoder ()
			: base (null)
		{
		}

		// Counts the chars needed to decode bytes [start, start + len).
		//
		// NOTE(review): for an incomplete trailing sequence this method
		// counts 1 (two-byte case) or 2-3 (four-byte case) chars, while
		// GetChars emits nothing for the same input. The result is therefore
		// an upper bound in that situation, not an exact count.
		public override int GetCharCount (byte [] bytes, int start, int len)
		{
			CheckRange (bytes, start, len);

			int end = start + len;
			int ret = 0;
			while (start < end) {
				if (bytes [start] < 0x80) {
					ret++;
					start++;
					continue;
				}
				else if (bytes [start] == 0x80) {
					// Euro sign - actually it is obsolete,
					// now it's just reserved but not used
					ret++;
					start++;
					continue;
				}
				else if (bytes [start] == 0xFF) {
					// invalid data - fill '?'
					ret++;
					start++;
					continue;
				}
				else if (start + 1 >= end) {
					// Lead byte without a trail byte.
					ret++;
					break; // incomplete tail.
				}

				byte second = bytes [start + 1];
				if (second == 0x7F || second == 0xFF) {
					// invalid data
					ret++;
					start += 2;
					continue;
				}
				else if (0x30 <= second && second <= 0x39) {
					// UCS mapping
					if (start + 3 >= end) {
						// incomplete tail: only 2 or 3 of the 4 bytes are present.
						ret += start + 3 == end ? 3 : 2;
						break;
					}
					long value = GB18030Source.FromGBX (bytes, start);
					if (value < 0) {
						// invalid data: value is negative, so this
						// advances start by -value bytes.
						ret++;
						start -= (int) value;
					} else if (value >= 0x10000) {
						// UTF16 surrogate
						ret += 2;
						start += 4;
					} else {
						// UTF16 BMP
						ret++;
						start += 4;
					}
				} else {
					// GB2312 mapping
					start += 2;
					ret++;
				}
			}
			return ret;
		}

		// Decodes bytes [byteIndex, byteIndex + byteCount) into chars,
		// starting at charIndex, and returns the number of chars written.
		// Invalid or unmapped sequences are written as '?'. Decoding stops
		// silently in front of an incomplete trailing sequence.
		public override int GetChars (byte [] bytes, int byteIndex, int byteCount, char [] chars, int charIndex)
		{
			CheckRange (bytes, byteIndex, byteCount, chars, charIndex);

			int byteEnd = byteIndex + byteCount;
			int charStart = charIndex;

			while (byteIndex < byteEnd) {
				if (bytes [byteIndex] < 0x80) {
					chars [charIndex++] = (char) bytes [byteIndex++];
					continue;
				}
				else if (bytes [byteIndex] == 0x80) {
					// Euro sign - actually it is obsolete,
					// now it's just reserved but not used
					chars [charIndex++] = '\u20AC';
					byteIndex++;
					continue;
				}
				else if (bytes [byteIndex] == 0xFF) {
					// invalid data - fill '?'
					chars [charIndex++] = '?';
					byteIndex++;
					continue;
				}
				else if (byteIndex + 1 >= byteEnd) {
					// Lead byte without a trail byte; it is dropped.
					break; // incomplete tail.
				}

				byte second = bytes [byteIndex + 1];
				if (second == 0x7F || second == 0xFF) {
					// invalid data
					chars [charIndex++] = '?';
					byteIndex += 2;
				}
				else if (0x30 <= second && second <= 0x39) {
					// UCS mapping
					if (byteIndex + 3 >= byteEnd) {
						// incomplete tail: fewer than 4 bytes left; they are dropped.
						break;
					}
					long value = GB18030Source.FromGBX (bytes, byteIndex);
					if (value < 0) {
						// invalid data: value is negative, so this
						// advances byteIndex by -value bytes.
						chars [charIndex++] = '?';
						byteIndex -= (int) value;
					} else if (value >= 0x10000) {
						// UTF16 surrogate
						value -= 0x10000;
						chars [charIndex++] = (char) (value / 0x400 + 0xD800);
						chars [charIndex++] = (char) (value % 0x400 + 0xDC00);
						byteIndex += 4;
					} else {
						// UTF16 BMP
						chars [charIndex++] = (char) value;
						byteIndex += 4;
					}
				} else {
					// Two-byte sequence: n2u stores each char as two bytes,
					// low byte first, 191 entries per lead byte starting at
					// lead 0x81 / trail 0x40.
					byte first = bytes [byteIndex];
					int ord = ((first - 0x81) * 191 + second - 0x40) * 2;
					char c1 = ord < 0 || ord >= gb2312.n2u.Length ?
						'\0' : (char) (gb2312.n2u [ord] + gb2312.n2u [ord + 1] * 256);
					if (c1 == 0)
						chars [charIndex++] = '?';
					else
						chars [charIndex++] = c1;
					byteIndex += 2;
				}
			}

			return charIndex - charStart;
		}
	}

	// Encoder for GB18030.
	//
	// Output per input char:
	//   - ch < 0x80                    : 1 byte (ASCII)
	//   - surrogate pair               : 4 bytes
	//   - U+0080 or U+00FF             : 1 byte (the char value itself)
	//   - present in the GB2312 table  : 2 bytes
	//   - otherwise                    : 4 bytes, or 1 byte ('?') when
	//                                    GB18030Source.FromUCS returns a negative value
	//
	// NOTE(review): none of the GetBytes variants check the remaining
	// space in the output buffer before writing.
	class GB18030Encoder : MonoEncoder
	{
		static readonly DbcsConvert gb2312 = DbcsConvert.Gb2312;

		public GB18030Encoder (MonoEncoding owner)
			: base (owner)
		{
		}

		// Pending lone trailing surrogate seen by the byte-count methods
		// (char.MinValue when none).
		char incomplete_byte_count;
		// Pending lone trailing surrogate seen by the GetBytes methods
		// (char.MinValue when none).
		char incomplete_bytes;

		// Returns the number of bytes the GetBytes methods write for a
		// char that is neither ASCII (< 0x80) nor a surrogate. Shared by
		// both build variants of the byte-count method so that counting
		// always agrees with encoding.
		static int GetNonAsciiByteCount (char ch)
		{
			// U+0080 and U+00FF are written as a single byte by GetBytes.
			if (ch == 0x80 || ch == 0xFF)
				return 1;

			byte b1 = gb2312.u2n [((int) ch) * 2 + 1];
			byte b2 = gb2312.u2n [((int) ch) * 2];
			if (b1 != 0 && b2 != 0)
				return 2; // GB2312

			// non-GB2312: four bytes, or a single '?' when unmappable.
			return GB18030Source.FromUCS (ch) < 0 ? 1 : 4;
		}

#if !DISABLE_UNSAFE
		// Counts the bytes needed to encode chars [0, count).
		// A lone surrogate at the very end is remembered; when "refresh"
		// is true it is counted as one byte and the pending state is cleared.
		public unsafe override int GetByteCountImpl (char* chars, int count, bool refresh)
		{
			int start = 0;
			int end = count;
			int ret = 0;
			while (start < end) {
				char ch = chars [start];
				if (ch < 0x80) {
					// ASCII
					ret++;
					start++;
				} else if (Char.IsSurrogate (ch)) {
					// Surrogate
					if (start + 1 == end) {
						incomplete_byte_count = ch;
						start++;
					} else {
						ret += 4;
						start += 2;
					}
				} else {
					ret += GetNonAsciiByteCount (ch);
					start++;
				}
			}

			if (refresh) {
				if (incomplete_byte_count != char.MinValue)
					ret++;
				incomplete_byte_count = char.MinValue;
			}
			return ret;
		}

		// Encodes chars [0, charCount) into bytes and returns the number
		// of bytes written. A lone surrogate at the very end is carried
		// over to the next call; when "refresh" is true it is written as
		// '?' (0x3F) instead and the pending state is cleared.
		public unsafe override int GetBytesImpl (char* chars, int charCount, byte* bytes, int byteCount, bool refresh)
		{
			int charIndex = 0;
			int byteIndex = 0;

			int charEnd = charIndex + charCount;
			int byteStart = byteIndex;
			char ch = incomplete_bytes;

			while (charIndex < charEnd) {
				// On the first iteration a surrogate left over from the
				// previous call (already in ch) is consumed instead of
				// reading a new char.
				if (incomplete_bytes == char.MinValue)
					ch = chars [charIndex++];
				else
					incomplete_bytes = char.MinValue;

				if (ch < 0x80) {
					// ASCII
					bytes [byteIndex++] = (byte) ch;
					continue;
				} else if (Char.IsSurrogate (ch)) {
					// Surrogate
					if (charIndex == charEnd) {
						incomplete_bytes = ch;
						break; // incomplete
					}
					char ch2 = chars [charIndex++];
					if (!Char.IsSurrogate (ch2)) {
						// invalid surrogate
						HandleFallback (
							chars, ref charIndex, ref charCount,
							bytes, ref byteIndex, ref byteCount, null);
						continue;
					}
					int cp = (ch - 0xD800) * 0x400 + ch2 - 0xDC00;
					GB18030Source.Unlinear (bytes + byteIndex, GB18030Source.FromUCSSurrogate (cp));
					byteIndex += 4;
					continue;
				}

				if (ch == 0x80 || ch == 0xFF) {
					// Character maps to itself
					bytes [byteIndex++] = (byte) ch;
					continue;
				}

				byte b1 = gb2312.u2n [((int) ch) * 2 + 1];
				byte b2 = gb2312.u2n [((int) ch) * 2];
				if (b1 != 0 && b2 != 0) {
					bytes [byteIndex++] = b1;
					bytes [byteIndex++] = b2;
					continue;
				}

				long value = GB18030Source.FromUCS (ch);
				if (value < 0)
					bytes [byteIndex++] = 0x3F; // invalid(?)
				else {
					// non-GB2312
					GB18030Source.Unlinear (bytes + byteIndex, value);
					byteIndex += 4;
				}
			}

			if (refresh) {
				if (incomplete_bytes != char.MinValue)
					bytes [byteIndex++] = 0x3F; // incomplete
				incomplete_bytes = char.MinValue;
			}

			return byteIndex - byteStart;
		}
#else

		// Counts the bytes needed to encode chars [index, index + count).
		// A lone surrogate at the very end is remembered; when "refresh"
		// is true it is counted as one byte and the pending state is cleared.
		public override int GetByteCount (char [] chars, int index, int count, bool refresh)
		{
			// Scan the requested window, not chars [0, count).
			int start = index;
			int end = index + count;
			int ret = 0;
			while (start < end) {
				char ch = chars [start];
				if (ch < 0x80) {
					// ASCII
					ret++;
					start++;
				} else if (Char.IsSurrogate (ch)) {
					// Surrogate
					if (start + 1 == end) {
						incomplete_byte_count = ch;
						start++;
					} else {
						ret += 4;
						start += 2;
					}
				} else {
					ret += GetNonAsciiByteCount (ch);
					start++;
				}
			}

			if (refresh) {
				if (incomplete_byte_count != char.MinValue)
					ret++;
				incomplete_byte_count = char.MinValue;
			}
			return ret;
		}

		// Encodes chars [charIndex, charIndex + charCount) into bytes,
		// starting at byteIndex, and returns the number of bytes written.
		// A lone surrogate at the very end is carried over to the next
		// call; when "refresh" is true it is written as '?' (0x3F) instead
		// and the pending state is cleared.
		public override int GetBytes (char [] chars, int charIndex, int charCount, byte [] bytes, int byteIndex, bool refresh)
		{
			// NOTE(review): this is the full array length, not the space
			// remaining after byteIndex; it is only passed on to
			// HandleFallback - confirm which of the two HandleFallback expects.
			int byteCount = bytes.Length;
			int charEnd = charIndex + charCount;
			int byteStart = byteIndex;
			char ch = incomplete_bytes;

			while (charIndex < charEnd) {
				// On the first iteration a surrogate left over from the
				// previous call (already in ch) is consumed instead of
				// reading a new char.
				if (incomplete_bytes == char.MinValue)
					ch = chars [charIndex++];
				else
					incomplete_bytes = char.MinValue;

				if (ch < 0x80) {
					// ASCII
					bytes [byteIndex++] = (byte) ch;
					continue;
				} else if (Char.IsSurrogate (ch)) {
					// Surrogate
					if (charIndex == charEnd) {
						incomplete_bytes = ch;
						break; // incomplete
					}
					char ch2 = chars [charIndex++];
					if (!Char.IsSurrogate (ch2)) {
						// invalid surrogate
						HandleFallback (chars, ref charIndex, ref charCount,
							bytes, ref byteIndex, ref byteCount, null);
						continue;
					}
					int cp = (ch - 0xD800) * 0x400 + ch2 - 0xDC00;
					GB18030Source.Unlinear (bytes, byteIndex, GB18030Source.FromUCSSurrogate (cp));
					byteIndex += 4;
					continue;
				}

				if (ch == 0x80 || ch == 0xFF) {
					// Character maps to itself
					bytes [byteIndex++] = (byte) ch;
					continue;
				}

				byte b1 = gb2312.u2n [((int) ch) * 2 + 1];
				byte b2 = gb2312.u2n [((int) ch) * 2];
				if (b1 != 0 && b2 != 0) {
					bytes [byteIndex++] = b1;
					bytes [byteIndex++] = b2;
					continue;
				}

				long value = GB18030Source.FromUCS (ch);
				if (value < 0)
					bytes [byteIndex++] = 0x3F; // invalid(?)
				else {
					// non-GB2312
					GB18030Source.Unlinear (bytes, byteIndex, value);
					byteIndex += 4;
				}
			}

			if (refresh) {
				if (incomplete_bytes != char.MinValue)
					bytes [byteIndex++] = 0x3F; // incomplete
				incomplete_bytes = char.MinValue;
			}

			return byteIndex - byteStart;
		}
#endif
	}
}