// ==++==
//
//   Copyright (c) Microsoft Corporation.  All rights reserved.
//
// ==--==

//  ISO2022Encoding.cs
//
//  Ported to managed code from c_is2022.c and related iso 2022 dll files from mlang
//
//  Abstract:
//
//      Managed implementation of ISO 2022 code pages, ported from the implementation in c_is2022.dll
//      This code should be kept in sync with the other implementations
//      This encoding wraps the basic encodings in code that adds the shift in/out wrapper methods
//
//  Notes:
//
// IsAlwaysNormalized ???
//      Regarding Normalization for ISO-2022-JP (50220, 50221, 50222), it's the same rules as EUCJP
//          Forms KC & KD are precluded because of things like halfwidth Katakana that has compatibility mappings
//          Form D is precluded because of 0x00a8, which changes to space + diaeresis.
//
//      Note: I think that IsAlwaysNormalized should probably return true for form C for Japanese 20932 based CPs.
//
//      For ISO-2022-KR
//          Never normalized, C & D (& therefore KC & KD) are precluded because of Hangul syllables and combined characters.
//
// IsAlwaysNormalized ???
//      Regarding Normalization for ISO-2022-CN (50227, 50229) & HZ-GB2312 (52936) I think is similar to the Japanese case.
//          Forms KC & KD are precluded because of things like halfwidth Katakana that has compatibility mappings
//          Form D is precluded because of 0x00a8, which changes to space + diaeresis.
//
//      Note: I think that IsAlwaysNormalized should probably return true for form C for Chinese 20936 based CPs.
//
#if FEATURE_CODEPAGES_FILE // requires BaseCodePageEncoding
namespace System.Text
{
    using System.Globalization;
    using System.Diagnostics.Contracts;
    using System.Text;
    using System.Runtime.InteropServices;
    using System;
    using System.Security;
    using System.Runtime.CompilerServices;
    using System.Runtime.Serialization;


    /*=================================ISO2022Encoding============================
    **
    **  This is used to support ISO 2022 encodings that use shift/escape sequences.
    **
    ==============================================================================*/

    [Serializable]
    internal class ISO2022Encoding : DBCSCodePageEncoding
    {
        // Control bytes written/recognized by the encoders and decoders in this class.
        const byte SHIFT_OUT = (byte)0x0E;
        const byte SHIFT_IN = (byte)0x0F;
        const byte ESCAPE = 0x1B;

        // Lead byte value that CleanUpBytes stamps on single byte half width Katakana
        // so the Japanese encoder can tell those table entries apart from real double byte entries.
        const byte LEADBYTE_HALFWIDTH = 0x10;

        // We have to load the 936 code page tables, so impersonate 936 as our base
        // This pretends to be other code pages as far as memory sections are concerned.
        // The data table code page handed to the base class is looked up in
        // tableBaseCodePages, indexed by the last decimal digit of codePage.
        [System.Security.SecurityCritical]  // auto-generated
        internal ISO2022Encoding(int codePage) : base(codePage, tableBaseCodePages[codePage % 10])
        {
            this.m_bUseMlangTypeForSerialization = true;
        }

        // Constructor called by serialization.
// Note: We use the base GetObjectData however
        [System.Security.SecurityCritical]  // auto-generated
        internal ISO2022Encoding(SerializationInfo info, StreamingContext context) : base(info, context)
        {
            // Actually this can't ever get called, CodePageEncoding is our proxy
            Contract.Assert(false, "Didn't expect to make it to DBCSCodePageEncoding serialization constructor");
            throw new ArgumentException(Environment.GetResourceString("Arg_ExecutionEngineException"));
        }

        // Data table code page to load for each supported code page.
        // Indexed by (codePage % 10) in the constructor; the ISO2022Modes values
        // line up with the same slots.  0 means "no table / unsupported".
        static int[] tableBaseCodePages =
        {
            932,    // 50220  ISO-2022-JP, No halfwidth Katakana, convert to full width
            932,    // 50221  ISO-2022-JP, Use escape sequence for half width Katakana
            932,    // 50222  ISO-2022-JP, Use shift-in/shift-out for half width Katakana
            0,
            0,
            949,    // 50225  ISO-2022-KR, Korean
            936,    // 52936  HZ-GB2312, 936 might be better source
            0, //20936,    // 50227  ISO-2022-CN, Note: This is just the same as CP 936 in Everett.
            0,
            // 50229 is currently unsupported, CP 20000 is currently not built in .nlp file
            0, //20000,    // 50229  ISO-2022-CN, ModeCNS11643_1
            0, //20000,    // 50229  ISO-2022-CN, ModeCNS11643_2
            0               // ModeASCII
        };

        // Shift/escape state tracked by the encoders and decoders.
        // The negative values are results of escape sequence parsing, not real modes.
        internal enum ISO2022Modes
        {
            ModeHalfwidthKatakana = 0,
            ModeJIS0208 = 1,
            ModeKR = 5,
            ModeHZ = 6,
            ModeGB2312 = 7,
            ModeCNS11643_1 = 9,
            ModeCNS11643_2 = 10,
            ModeASCII = 11,

            ModeIncompleteEscape = -1,
            ModeInvalidEscape = -2,
            ModeNOOP = -3
        }

        // Builds the name of the memory section for this code page:
        //   CodePage_{code page}_{major}_{minor}_{revision}_{build}[_ISO2022JP | _ISO2022KR | _HZ]
        // The code page number is the data table code page when bFlagDataTable is set,
        // otherwise our own CodePage.  The suffix keeps our cleaned up tables apart from
        // the plain tables of the code page we borrow the data from.
        [System.Security.SecurityCritical]  // auto-generated
        protected unsafe override String GetMemorySectionName()
        {
            int iUseCodePage = this.bFlagDataTable ? dataTableCodePage : CodePage;

            String strFormat;

            switch (this.CodePage)
            {
                case 50220:
                case 50221:
                case 50222:
                    strFormat = "CodePage_{0}_{1}_{2}_{3}_{4}_ISO2022JP";
                    break;
                case 50225:
                    strFormat = "CodePage_{0}_{1}_{2}_{3}_{4}_ISO2022KR";
                    break;
                case 52936:
                    strFormat = "CodePage_{0}_{1}_{2}_{3}_{4}_HZ";
                    break;
                default:
                    Contract.Assert(false, "[ISO2022Encoding.GetMemorySectionName] Don't expect to get here for code page " + this.CodePage);
                    strFormat = "CodePage_{0}_{1}_{2}_{3}_{4}";
                    break;
            }

            String strName = String.Format(CultureInfo.InvariantCulture, strFormat,
                iUseCodePage, this.pCodePage->VersionMajor, this.pCodePage->VersionMinor,
                this.pCodePage->VersionRevision, this.pCodePage->VersionBuild);

            return strName;
        }

        // Clean up characters for ISO2022 code pages, etc.
        // ISO2022 (50220, 50221, 50222)
        // GB-HZ (52936)
        //
        // Rewrites a byte value from the borrowed code page table into the form our
        // encoders/decoders use (bytes is updated in place).  Returns false for values
        // we don't want an entry for (lead byte marks, out of range double bytes),
        // true otherwise.
        // NOTE(review): how the base class reacts to 'false' isn't visible here -
        // presumably the mapping is skipped; confirm against DBCSCodePageEncoding.
        protected override bool CleanUpBytes(ref int bytes)
        {
            switch (this.CodePage)
            {
                // 932 based code pages
                case 50220:
                case 50221:
                case 50222:
                    {
                        if (bytes >= 0x100)
                        {
                            // map extended char (0xfa40-0xfc4b) to a special range
                            // (ported from mlang)
                            if (bytes >= 0xfa40 && bytes <= 0xfc4b)
                            {
                                if ( bytes >= 0xfa40 && bytes <= 0xfa5b )
                                {
                                    if ( bytes <= 0xfa49 )
                                        bytes = bytes - 0x0b51 ;
                                    else if ( bytes >= 0xfa4a && bytes <= 0xfa53 )
                                        bytes = bytes - 0x072f6 ;
                                    else if ( bytes >= 0xfa54 && bytes <= 0xfa57 )
                                        bytes = bytes - 0x0b5b ;
                                    else if ( bytes == 0xfa58 )
                                        bytes = 0x878a ;
                                    else if ( bytes == 0xfa59 )
                                        bytes = 0x8782 ;
                                    else if ( bytes == 0xfa5a )
                                        bytes = 0x8784 ;
                                    else if ( bytes == 0xfa5b )
                                        bytes = 0x879a ;
                                }
                                else if ( bytes >= 0xfa5c && bytes <= 0xfc4b )
                                {
                                    // The offset depends on which part of the row the trail byte is in
                                    byte tc = unchecked((byte)bytes);
                                    if ( tc < 0x5c )
                                        bytes = bytes - 0x0d5f;
                                    else if ( tc >= 0x80 && tc <= 0x9B )
                                        bytes = bytes - 0x0d1d;
                                    else
                                        bytes = bytes - 0x0d1c;
                                }
                            }

                            // Convert 932 code page to 20932 like code page range
                            // (also ported from mlang)
                            byte bLead = unchecked((byte)(bytes >> 8));
                            byte bTrail = unchecked((byte)bytes);

                            bLead -= ((bLead > (byte)0x9f) ? (byte)0xb1 : (byte)0x71);
                            bLead = (byte)((bLead << 1) + 1);
                            if (bTrail > (byte)0x9e)
                            {
                                bTrail -= (byte)0x7e;
                                bLead++;
                            }
                            else
                            {
                                if (bTrail > (byte)0x7e)
                                    bTrail--;
                                bTrail -= (byte)0x1f;
                            }

                            bytes = ((int)bLead) << 8 | (int)bTrail;

                            // Don't step out of our allocated lead byte area.
                            // All DBCS lead and trail bytes should be >= 0x21 and <= 0x7e
                            // This is commented out because Everett/Mlang had illegal PUA
                            // mappings to ISO2022 code pages that we're maintaining.
                            //                        if ((bytes & 0xFF00) < 0x2100 || (bytes & 0xFF00) > 0x7e00 ||
                            //                          (bytes & 0xFF) < 0x21 || (bytes & 0xFF) > 0x7e)
                            //                        return false;
                        }
                        else
                        {
                            // Adjust 1/2 Katakana: tag it with the LEADBYTE_HALFWIDTH lead byte
                            // and drop the high bit of the trail byte.
                            if (bytes >= 0xa1 && bytes <= 0xdf)
                                bytes += (LEADBYTE_HALFWIDTH << 8) - 0x80;

                            // 0x81-0x9f and 0xe0-0xfc CP 932
                            // 0x8e and 0xa1-0xfe CP 20932 (we don't use 8e though)
                            // a1-df is 1/2 Katakana (already moved out of this range above)
                            if (bytes >= 0x81 &&
                                (bytes <= 0x9f ||
                                 (bytes >= 0xe0 && bytes <= 0xfc)))
                            {
                                // Don't do lead bytes, we use escape sequences instead.
                                return false;
                            }
                        }
                        break;
                    }
                case 50225:
                    {
                        // For 50225 since we don't rely on lead byte marks, return false and don't add them,
                        // esp. since we're only a 7 bit code page.
                        if (bytes >= 0x80 && bytes <= 0xff)
                            return false;

                        // Ignore double byte characters whose lead or trail byte is out of range (a1-fe)
                        if (bytes >= 0x100 &&
                            ((bytes & 0xff) < 0xa1 || (bytes & 0xff) == 0xff ||
                             (bytes & 0xff00) < 0xa100 || (bytes & 0xff00) == 0xff00))
                            return false;

                        // May as well get them into our 7 bit range
                        bytes &= 0x7f7f;

                        break;
                    }
                case 52936:
                    {
                        // Since we don't rely on lead byte marks for 52936, get rid of them so we
                        // don't end up with extra weird fffe mappings.
                        if (bytes >= 0x81 && bytes <= 0xfe)
                            return false;

                        break;
                    }
            }

            return true;
        }

        // GetByteCount
        // Counts by running the real encoder with a null output buffer.
        [System.Security.SecurityCritical]  // auto-generated
        internal override unsafe int GetByteCount(char* chars, int count, EncoderNLS baseEncoder)
        {
            // Just need to ASSERT, this is called by something else internal that checked parameters already
            Contract.Assert(count >= 0, "[ISO2022Encoding.GetByteCount]count is negative");
            Contract.Assert(chars != null, "[ISO2022Encoding.GetByteCount]chars is null");

            // Just call GetBytes with null byte* to get count
            return GetBytes(chars, count, null, 0, baseEncoder);
        }

        // GetBytes
        // Dispatches to the encoder for our code page and returns what it returns.
        // bytes may be null when we are only counting (see GetByteCount).
        [System.Security.SecurityCritical]  // auto-generated
        internal override unsafe int GetBytes(char* chars, int charCount,
                                              byte* bytes, int byteCount, EncoderNLS baseEncoder)
        {
            // Just need to ASSERT, this is called by something else internal that checked parameters already
            Contract.Assert(chars != null, "[ISO2022Encoding.GetBytes]chars is null");
            Contract.Assert(byteCount >= 0, "[ISO2022Encoding.GetBytes]byteCount is negative");
            Contract.Assert(charCount >= 0, "[ISO2022Encoding.GetBytes]charCount is negative");

            // Assert because we shouldn't be able to have a null encoder fallback.
            Contract.Assert(encoderFallback != null, "[ISO2022Encoding.GetBytes]Attempting to use null encoder fallback");

            // Fix our encoder
            ISO2022Encoder encoder = (ISO2022Encoder)baseEncoder;

            // Our return value
            int iCount = 0;

            switch(CodePage)
            {
                case 50220:
                case 50221:
                case 50222:
                    iCount = GetBytesCP5022xJP( chars, charCount, bytes, byteCount, encoder );
                    break;
                case 50225:
                    iCount = GetBytesCP50225KR( chars, charCount, bytes, byteCount, encoder );
                    break;
// Everett had 50227 the same as 936
/*              case 50227:
                    iCount = GetBytesCP50227CN( chars, charCount, bytes, byteCount, encoder );
                    break;
*/
                case 52936:
                    iCount = GetBytesCP52936( chars, charCount, bytes, byteCount, encoder );
                    break;
                default:
                    // Same as GetChars: don't quietly return 0 for a code page we don't handle
                    Contract.Assert(false, "[ISO2022Encoding.GetBytes] had unexpected code page");
                    break;
            }

            return iCount;
        }

        // This is internal and called by something else,
        // Counts by running the real decoder with a null output buffer.
        [System.Security.SecurityCritical]  // auto-generated
        internal override unsafe int GetCharCount(byte* bytes, int count, DecoderNLS baseDecoder)
        {
            // Just assert, we're called internally so these should be safe, checked already
            Contract.Assert(bytes != null, "[ISO2022Encoding.GetCharCount]bytes is null");
            Contract.Assert(count >= 0, "[ISO2022Encoding.GetCharCount]byteCount is negative");

            // Just call getChars with null char* to get count
            return GetChars(bytes, count, null, 0, baseDecoder);
        }

        // GetChars
        // Dispatches to the decoder for our code page and returns what it returns.
        // chars may be null when we are only counting (see GetCharCount).
        [System.Security.SecurityCritical]  // auto-generated
        internal override unsafe int GetChars(byte* bytes, int byteCount,
                                                char* chars, int charCount, DecoderNLS baseDecoder)
        {
            // Just need to ASSERT, this is called by something else internal that checked parameters already
            Contract.Assert(bytes != null, "[ISO2022Encoding.GetChars]bytes is null");
            Contract.Assert(byteCount >= 0, "[ISO2022Encoding.GetChars]byteCount is negative");
            Contract.Assert(charCount >= 0, "[ISO2022Encoding.GetChars]charCount is negative");

            // Fix our decoder
            ISO2022Decoder decoder = (ISO2022Decoder)baseDecoder;
            int iCount = 0;

            switch (CodePage)
            {
                case 50220:
                case 50221:
                case 50222:
                    iCount = GetCharsCP5022xJP( bytes, byteCount, chars, charCount, decoder);
                    break;
                case 50225:
                    iCount = GetCharsCP50225KR( bytes, byteCount, chars, charCount, decoder);
                    break;
                // Currently 50227 is the same as 936
//                case 50227:
//                  iCount = GetCharsCP50227CN( bytes, byteCount, chars, charCount, decoder);
//                  break;
                case 52936:
                    iCount = GetCharsCP52936( bytes, byteCount, chars, charCount, decoder);
                    break;
                default:
                    Contract.Assert(false, "[ISO2022Encoding.GetChars] had unexpected code page");
                    break;
            }

            return iCount;
        }

        // ISO 2022 Code pages for JP.
        //   50220 - No halfwidth Katakana, convert to full width
        //   50221 - Use escape sequence for half width Katakana
        //   50222 - Use shift-in/shift-out for half width Katakana
        //
        // These are the JIS code pages, superset of ISO-2022 / ISO-2022-JP-1
        //  0E          Shift Out (following bytes are Katakana)
        //  0F          Shift In  (back to "normal" behavior)
        //  21-7E       Byte ranges (1 or 2 bytes)
        //  <ESC> $ @   To Double Byte 0208 Mode (actually older code page, but subset of 0208)
        //  <ESC> $ B   To Double Byte 0208 Mode (duplicate)
        //  <ESC> $ ( D To Double Byte 0212 Mode (previously we misinterpreted this)
        //  <ESC> ( I   To half width Katakana
        //  <ESC> ( J   To JIS-Roman
        //  <ESC> ( H   To JIS-Roman (swedish character set)
        //  <ESC> ( B   To ASCII
        //  <ESC> & @   Alternate lead in to <ESC> $ B so just ignore it.
        //
        // So in Katakana mode we add 0x8e as a lead byte and use CP 20932 to convert it
        // In ASCII mode we just spit out the single byte.
// In Roman mode we should change 0x5c (\) -> Yen sign and 0x7e (~) to Overline, however
        //      we didn't in mLang, otherwise roman is like ASCII.
        // In 0208 double byte mode we have to |= with 0x8080 and use CP 20932 to convert it.
        // In 0212 double byte mode we have to |= with 0x8000 and use CP 20932 to convert it.
        //
        // Note that JIS Shift In/Shift Out is different than the other ISO2022 encodings.  For JIS
        // Shift out always shifts to half-width Katakana.  Chinese encodings use designator sequences
        // instead of escape sequences and shift out to the designated sequence or back in to ASCII.
        //
        // When decoding JIS 0208, MLang used a '*' (0x2a) character in JIS 0208 mode to map the trailing byte
        // to halfwidth katakana.  I found no description of that behavior, however that block of 0208 is
        // undefined, so we maintain that behavior when decoding.  We will never generate characters using
        // that technique, but the decoder will process them.
        //
        // GetBytesCP5022xJP encodes chars for code pages 50220, 50221 and 50222.
        //   Each char is looked up in mapUnicodeToBytes and written as half width Katakana
        //   (lead byte LEADBYTE_HALFWIDTH), as a JIS 0208 double byte, or as a single byte,
        //   emitting whatever shift/escape sequence is needed to get into that mode first.
        //   Chars with no mapping go to the fallback.
        //   bytes may be null (counting only, see GetByteCount); encoder may be null, in which
        //   case we start in ASCII mode and always switch back to ASCII at the end.
        //   Returns buffer.Count.
        [System.Security.SecurityCritical]  // auto-generated
        private unsafe int GetBytesCP5022xJP(char* chars, int charCount,
                                                  byte* bytes, int byteCount, ISO2022Encoder encoder)
        {
            // prepare our helpers
            Encoding.EncodingByteBuffer buffer = new Encoding.EncodingByteBuffer(
                this, encoder, bytes, byteCount, chars, charCount);

            // Get our mode
            ISO2022Modes currentMode = ISO2022Modes.ModeASCII;     // Mode
            ISO2022Modes shiftInMode = ISO2022Modes.ModeASCII;     // Mode that shift in will go back to (only used by CP 50222)

            // Check our encoder
            if (encoder != null)
            {
                char charLeftOver = encoder.charLeftOver;

                // Pick up where the previous call left off
                currentMode = encoder.currentMode;
                shiftInMode = encoder.shiftInOutMode;

                // We may have a left over character from last time, try and process it.
                if (charLeftOver > 0)
                {
                    Contract.Assert(Char.IsHighSurrogate(charLeftOver), "[ISO2022Encoding.GetBytesCP5022xJP]leftover character should be high surrogate");

                    // It has to be a high surrogate, which we don't support, so it has to be a fallback
                    buffer.Fallback(charLeftOver);
                }
            }

            while (buffer.MoreData)
            {
                // Get our char
                char ch = buffer.GetNextChar();

                // Get our bytes
                ushort iBytes = mapUnicodeToBytes[ch];

            StartConvert:
                // Check for halfwidth bytes
                // (CP 50220 jumps back here after swapping iBytes for the full width equivalent)
                byte bLeadByte = (byte)(iBytes >> 8);
                byte bTrailByte = (byte)(iBytes & 0xff);

                if (bLeadByte == LEADBYTE_HALFWIDTH)
                {
                    // It's Halfwidth Katakana
                    if (CodePage == 50220)
                    {
                        // CodePage 50220 doesn't use halfwidth Katakana, convert to fullwidth
                        // See if it's out of range, fallback if so, throws if recursive fallback
                        if (bTrailByte < 0x21 || bTrailByte >= 0x21 + HalfToFullWidthKanaTable.Length)
                        {
                            buffer.Fallback(ch);
                            continue;
                        }

                        // Get the full width katakana char to use.
                        // (table is indexed from trail byte 0x21; mask the value down to 7 bit bytes)
                        iBytes = unchecked((ushort)(HalfToFullWidthKanaTable[bTrailByte - 0x21] & 0x7F7F));

                        // May have to do all sorts of fun stuff for mode, go back to start convert
                        goto StartConvert;
                    }

                    // Can use halfwidth Katakana, make sure we're in right mode

                    // Make sure we're in right mode
                    if (currentMode != ISO2022Modes.ModeHalfwidthKatakana)
                    {
                        // 50222 or 50221, either shift in/out or escape to get to Katakana mode
                        if (CodePage == 50222)
                        {
                            // Shift Out
                            if (!buffer.AddByte(SHIFT_OUT))
                                break;  // convert out of space, stop

                            // Don't change modes until after AddByte in case it fails for convert
                            // We get to shift out to Katakana, make sure we'll go back to the right mode
                            // (This ends up always being ASCII)
                            shiftInMode = currentMode;
                            currentMode = ISO2022Modes.ModeHalfwidthKatakana;
                        }
                        else
                        {
                            // 50221 does halfwidth katakana by escape sequence
                            Contract.Assert(CodePage == 50221, "[ISO2022Encoding.GetBytesCP5022xJP]Expected Code Page 50221");

                            // Add our escape sequence (<ESC> ( I)
                            if (!buffer.AddByte(ESCAPE, unchecked((byte)'('), unchecked((byte)'I')))
                                break;  // convert out of space, stop

                            currentMode = ISO2022Modes.ModeHalfwidthKatakana;
                        }
                    }

                    // We know we're in Katakana mode now, so add it.
                    // Go ahead and add the Katakana byte.  Our table tail bytes are 0x80 too big.
                    if (!buffer.AddByte(unchecked((byte)(bTrailByte & 0x7F))))
                        break;  // convert out of space, stop

                    // Done with this one
                    continue;
                }
                else if (bLeadByte != 0)
                {
                    //
                    //  It's a double byte character.
                    //

                    // If we're CP 50222 we may have to shift in from Katakana mode first
                    if (CodePage == 50222 && currentMode == ISO2022Modes.ModeHalfwidthKatakana)
                    {
                        // Shift In
                        if (!buffer.AddByte(SHIFT_IN))
                            break;    // convert out of space, stop

                        // Need to shift in from katakana.  (Still might not be right, but won't be shifted out anyway)
                        currentMode = shiftInMode;
                    }

                    // Make sure we're in the right mode (JIS 0208 or JIS 0212)
                    // Note: Right now we don't use JIS 0212.  Also this table'd be wrong

                    // It's JIS extension 0208
                    if (currentMode != ISO2022Modes.ModeJIS0208)
                    {
                        // Escape sequence (<ESC> $ B), we can fail after this, mode will be correct for convert
                        if (!buffer.AddByte(ESCAPE, unchecked((byte)'$'), unchecked((byte)'B')))
                            break;  // Convert out of space, stop

                        currentMode = ISO2022Modes.ModeJIS0208;
                    }

                    // Add our double bytes
                    if (!buffer.AddByte(unchecked((byte)(bLeadByte)), unchecked((byte)(bTrailByte))))
                        break; // Convert out of space, stop
                    continue;
                }
                else if (iBytes != 0 || ch == 0)
                {
                    // Single byte Char
                    // (a table value of 0 only counts as a real mapping for the null char itself)
                    // If we're CP 50222 we may have to shift in from Katakana mode first
                    if (CodePage == 50222 && currentMode == ISO2022Modes.ModeHalfwidthKatakana)
                    {
                        // Shift IN
                        if (!buffer.AddByte(SHIFT_IN))
                            break;  // convert ran out of room

                        // Need to shift in from katakana.  (Still might not be right, but won't be shifted out anyway)
                        currentMode = shiftInMode;
                    }

                    // It's a single byte character, switch to ASCII if we have to
                    if (currentMode != ISO2022Modes.ModeASCII)
                    {
                        if (!buffer.AddByte(ESCAPE,unchecked((byte)'('), unchecked((byte)'B')))
                            break; // convert ran out of room

                        currentMode = ISO2022Modes.ModeASCII;
                    }

                    // Add the ASCII char
                    if (!buffer.AddByte(bTrailByte))
                        break; // convert had no room left
                    continue;
                }

                // It's unknown, do fallback, throws if recursive (knows because we called InternalGetNextChar)
                buffer.Fallback(ch);
            }

            // Switch back to ASCII if MustFlush or no encoder
            if (currentMode != ISO2022Modes.ModeASCII &&
                (encoder == null || encoder.MustFlush))
            {
                // If we're CP 50222 we may have to shift in from Katakana mode first
                if (CodePage == 50222 && currentMode == ISO2022Modes.ModeHalfwidthKatakana)
                {
                    // Shift IN, only shift mode if necessary.
                    if (buffer.AddByte(SHIFT_IN))
                        // Need to shift in from katakana.  (Still might not be right, but won't be shifted out anyway)
                        currentMode = shiftInMode;
                    else
                        // If not successful, convert will maintain state for next time, also
                        // AddByte will have decremented our char count, however we need it to remain the same
                        buffer.GetNextChar();
                }

                // switch back to ASCII to finish neatly
                // (skipped if the shift in above failed and left CP 50222 in Katakana mode)
                if (currentMode != ISO2022Modes.ModeASCII &&
                    (CodePage != 50222 || currentMode != ISO2022Modes.ModeHalfwidthKatakana))
                {
                    // only shift if it was successful
                    if (buffer.AddByte(ESCAPE, unchecked((byte)'('), unchecked((byte)'B')))
                        currentMode = ISO2022Modes.ModeASCII;
                    else
                        // If not successful, convert will maintain state for next time, also
                        // AddByte will have decremented our char count, however we need it to remain the same
                        buffer.GetNextChar();
                }
            }

            // Remember our encoder state
            // (not when bytes is null, so that counting doesn't disturb the encoder)
            if (bytes != null && encoder != null)
            {
                // This is ASCII if we had to flush
                encoder.currentMode = currentMode;
                encoder.shiftInOutMode = shiftInMode;

                if (!buffer.fallbackBuffer.bUsedEncoder)
                {
                    encoder.charLeftOver = (char)0;
                }

                encoder.m_charsUsed = buffer.CharsUsed;
            }

            // Return our length
            return buffer.Count;
        }

        // ISO 2022 Code pages for Korean - CP 50225
        //
        // CP 50225 has Shift In/Shift Out codes, and a single designator sequence that is supposed
        // to appear once in the file, at the beginning of a line, before any multibyte code points.
        // So we stick the designator at the beginning of the output.
        //
        // These are the KR code page codes for ISO-2022-KR
        //  0E          Shift Out (following bytes are double byte)
        //  0F          Shift In  (back to ASCII behavior)
        //  21-7E       Byte ranges (1 or 2 bytes)
        //  <ESC> $)C   Double byte ISO-2022-KR designator
        //
        // Note that this encoding is a little different than other encodings.  The <esc>$)C sequence
        // should only appear once per file.
// (Actually I saw another spec/rfc that said at the beginning
        // of each line, but it shouldn't really matter.)
        //
        // During decoding Mlang accepted ' ', '\t', and '\n' as their respective characters, even if
        // it was in double byte mode.  We maintain that behavior, although I couldn't find a reference or
        // reason for that behavior.  We never generate data using that shortcut.
        //
        // Also Mlang always assumed KR mode, even if the designator wasn't found yet, so we do that as
        // well.  So basically we just ignore <ESC>$)C when decoding.
        //
        // GetBytesCP50225KR encodes chars for code page 50225.
        //   Each char is looked up in mapUnicodeToBytes.  Double byte values are written after
        //   the <ESC>$)C designator (once) and a Shift Out; single byte values are written after
        //   a Shift In if we were in double byte mode.  Chars with no mapping go to the fallback.
        //   bytes may be null (counting only, see GetByteCount); encoder may be null, in which
        //   case we start in ASCII mode and always shift back in at the end.
        //   Returns buffer.Count.
        [System.Security.SecurityCritical]  // auto-generated
        private unsafe int GetBytesCP50225KR(char* chars, int charCount,
                                                    byte* bytes, int byteCount, ISO2022Encoder encoder)
        {
            // prepare our helpers
            Encoding.EncodingByteBuffer buffer = new Encoding.EncodingByteBuffer(
                this, encoder, bytes, byteCount, chars, charCount);

            // Get our mode
            ISO2022Modes currentMode = ISO2022Modes.ModeASCII;     // Mode
            ISO2022Modes shiftOutMode = ISO2022Modes.ModeASCII;    // ModeKR if already stamped lead bytes

            // Check our encoder
            if (encoder != null)
            {
                // May have leftover stuff
                char charLeftOver = encoder.charLeftOver;
                currentMode = encoder.currentMode;
                shiftOutMode = encoder.shiftInOutMode;

                // We may have a left over character from last time, try and process it.
                if (charLeftOver > 0)
                {
                    Contract.Assert(Char.IsHighSurrogate(charLeftOver), "[ISO2022Encoding.GetBytesCP50225KR]leftover character should be high surrogate");

                    // It has to be a high surrogate, which we don't support, so it has to be a fallback
                    buffer.Fallback(charLeftOver);
                }
            }

            while (buffer.MoreData)
            {
                // Get our data
                char ch = buffer.GetNextChar();

                // Get our bytes
                ushort iBytes = mapUnicodeToBytes[ch];

                // Check for double byte bytes
                byte bLeadByte = (byte)(iBytes >> 8);
                byte bTrailByte = (byte)(iBytes & 0xff);

                if (bLeadByte != 0)
                {
                    //
                    //  It's a double byte character.
                    //

                    // If we haven't done our Korean designator, then do so, if we have any input
                    if (shiftOutMode != ISO2022Modes.ModeKR)
                    {
                        // Add our code page designator sequence
                        if (!buffer.AddByte(ESCAPE, unchecked((byte)'$'), unchecked((byte)')'), unchecked((byte)'C')))
                            break; // No room during convert.

                        shiftOutMode = ISO2022Modes.ModeKR;
                    }

                    // May have to switch to ModeKR first
                    if (currentMode != ISO2022Modes.ModeKR)
                    {
                        if (!buffer.AddByte(SHIFT_OUT))
                            break; // No convert room

                        currentMode = ISO2022Modes.ModeKR;
                    }

                    // Add the bytes
                    if (!buffer.AddByte(bLeadByte, bTrailByte))
                        break; // no convert room
                    continue;
                }
                else if (iBytes != 0 || ch == 0)
                {
                    // It's a single byte character, switch to ASCII if we have to
                    // (a table value of 0 only counts as a real mapping for the null char itself)
                    if (currentMode != ISO2022Modes.ModeASCII)
                    {
                        if (!buffer.AddByte(SHIFT_IN))
                            break;

                        currentMode = ISO2022Modes.ModeASCII;
                    }

                    // Add the ASCII char
                    if (!buffer.AddByte(bTrailByte))
                        break;
                    continue;
                }

                // It's unknown, do fallback, throws if recursive (knows because we called InternalGetNextChar)
                buffer.Fallback(ch);
            }

            // Switch back to ASCII if MustFlush or no encoder
            if (currentMode != ISO2022Modes.ModeASCII &&
                (encoder == null || encoder.MustFlush))
            {
                // Get back to ASCII to be safe.  Only do it if it succeeds.
                if (buffer.AddByte(SHIFT_IN))
                    currentMode = ISO2022Modes.ModeASCII;
                else
                    // If not successful, convert will maintain state for next time, also
                    // AddByte will have decremented our char count, however we need it to remain the same
                    buffer.GetNextChar();
            }

            // Remember our encoder state
            // (not when bytes is null, so that counting doesn't disturb the encoder)
            if (bytes != null && encoder != null)
            {
                // If we didn't use the encoder, then there's no chars left over
                if (!buffer.fallbackBuffer.bUsedEncoder)
                {
                    encoder.charLeftOver = (char)0;
                }

                // This is ASCII if we had to flush
                encoder.currentMode = currentMode;

                // We don't use shift out mode, but if we've flushed we need to reset it so it doesn't
                // get output again.
                if (!encoder.MustFlush || encoder.charLeftOver != (char)0)
                {
                    // We should be not flushing or converting
                    Contract.Assert(!encoder.MustFlush || !encoder.m_throwOnOverflow,
                        "[ISO2022Encoding.GetBytesCP50225KR]Expected no left over data or not flushing or not converting");
                    encoder.shiftInOutMode = shiftOutMode;
                }
                else
                    // Flushed with nothing left over: forget that the designator was written
                    encoder.shiftInOutMode = ISO2022Modes.ModeASCII;

                encoder.m_charsUsed = buffer.CharsUsed;
            }

            // Return our length
            return buffer.Count;
        }

        // CP52936 is HZ Encoding
        // HZ Encoding has 4 shift sequences:
        // ~~       '~' (\u7e)
        // ~}       shift into 1 byte mode,
        // ~{       shift into 2 byte GB 2312-80
        // ~<NL>    Maintain 2 byte mode across new lines (ignore both ~ and <NL> characters)
        //          (This is for mailers that restrict to 70 or 80 or whatever character lines)
        //
        // According to comment in mlang, lead & trail byte ranges are described in RFC 1843
        // RFC 1843 => valid HZ code range: leading byte 0x21 - 0x77, 2nd byte 0x21 - 0x7e
        // Our 936 code points are or'd with 0x8080, so lead byte 0xa1 - 0xf7, trail byte 0xa1 - 0xfe
        //
        // This encoding is designed for transmission by e-mail and news.  No bytes should have high bit set.
// (all bytes <= 0x7f)
        //
        // GetBytesCP52936 encodes chars for code page 52936.
        //   Each char is looked up in mapUnicodeToBytes.  Double byte values inside the HZ range
        //   are written with the high bits stripped after a ~{ marker; single byte values are
        //   written after a ~} marker if we were in double byte mode, with '~' doubled.
        //   Chars with no mapping, or a mapping outside the HZ subset, go to the fallback.
        //   bytes may be null (counting only, see GetByteCount); encoder may be null, in which
        //   case we start in ASCII mode and always switch back to ASCII at the end.
        //   Returns buffer.Count.
        [System.Security.SecurityCritical]  // auto-generated
        private unsafe int GetBytesCP52936(char* chars, int charCount,
                                           byte* bytes, int byteCount, ISO2022Encoder encoder)
        {
            // prepare our helpers
            Encoding.EncodingByteBuffer buffer = new Encoding.EncodingByteBuffer(
                this, encoder, bytes, byteCount, chars, charCount);

            // Mode
            ISO2022Modes currentMode = ISO2022Modes.ModeASCII;

            // Check our encoder
            if (encoder != null)
            {
                char charLeftOver = encoder.charLeftOver;
                currentMode = encoder.currentMode;

                // We may have a left over character from last time, try and process it.
                if (charLeftOver > 0)
                {
                    Contract.Assert(Char.IsHighSurrogate(charLeftOver), "[ISO2022Encoding.GetBytesCP52936]leftover character should be high surrogate");

                    // It has to be a high surrogate, which we don't support, so it has to be a fallback
                    buffer.Fallback(charLeftOver);
                }
            }

            while (buffer.MoreData)
            {
                // Get our char
                char ch = buffer.GetNextChar();

                // Get our bytes
                ushort sChar = mapUnicodeToBytes[ch];
                if (sChar == 0 && ch != 0)
                {
                    // Wasn't a legal byte sequence, it's a surrogate or fallback
                    // Throws if recursive (knows because we called InternalGetNextChar)
                    buffer.Fallback(ch);

                    // Done with our char, now process fallback
                    continue;
                }

                // Check for halfwidth bytes
                byte bLeadByte = (byte)(sChar >> 8);
                byte bTrailByte = (byte)(sChar & 0xff);

                // If it's a double byte, it has to fit in the lead byte 0xa1 - 0xf7, trail byte 0xa1 - 0xfe range
                // (including the 0x8080 that our codepage or's to the value)
                // Single bytes above 0x80 (other than 0xff) aren't allowed either.
                if ((bLeadByte != 0 &&
                     (bLeadByte < 0xa1 || bLeadByte > 0xf7 || bTrailByte < 0xa1 || bTrailByte > 0xfe)) ||
                    (bLeadByte == 0 && bTrailByte > 0x80 && bTrailByte != 0xff))
                {
                    // Illegal character, in 936 code page, but not in HZ subset, get fallback for it
                    buffer.Fallback(ch);
                    continue;
                }

                // sChar is now either ASCII or has an 0x8080 mask
                if (bLeadByte != 0)
                {
                    // It's a double byte mode
                    if (currentMode != ISO2022Modes.ModeHZ)
                    {
                        // Need to add the double byte mode marker
                        // (the 2 is for the 2 character bytes that have to follow the marker)
                        if (!buffer.AddByte((byte)'~', (byte)'{', 2))
                            break; // Stop if no buffer space in convert

                        currentMode = ISO2022Modes.ModeHZ;
                    }

                    // Go ahead and add the 2 bytes
                    if (!buffer.AddByte(unchecked((byte)(bLeadByte & 0x7f)), unchecked((byte)(bTrailByte & 0x7f))))
                        break; // Stop if no buffer space in convert
                }
                else
                {
                    // It's supposed to be ASCII
                    if (currentMode != ISO2022Modes.ModeASCII)
                    {
                        // Need to add the ASCII mode marker
                        // Will have 1 more byte (or 2 if ~)
                        if (!buffer.AddByte((byte)'~', (byte)'}', bTrailByte == '~' ? 2:1))
                            break;

                        currentMode = ISO2022Modes.ModeASCII;
                    }

                    // If it's a '~' we'll need an extra one
                    if (bTrailByte == '~')
                    {
                        // Need to add the extra ~
                        if (!buffer.AddByte((byte)'~', 1))
                            break;
                    }

                    // Need to add the character
                    if (!buffer.AddByte(bTrailByte))
                        break;
                }
            }

            // Add ASCII shift out if we're at end of encoder
            if (currentMode != ISO2022Modes.ModeASCII &&
                (encoder == null || encoder.MustFlush))
            {
                // Need to add the ASCII mode marker
                // Only turn off other mode if this works
                if (buffer.AddByte((byte)'~',(byte)'}'))
                    currentMode = ISO2022Modes.ModeASCII;
                else
                    // If not successful, convert will maintain state for next time, also
                    // AddByte will have decremented our char count, however we need it to remain the same
                    buffer.GetNextChar();
            }

            // Need to remember our mode
            // (not when bytes is null, so that counting doesn't disturb the encoder)
            if (encoder != null && bytes != null)
            {
                // This is ASCII if we had to flush
                encoder.currentMode = currentMode;

                if (!buffer.fallbackBuffer.bUsedEncoder)
                {
                    encoder.charLeftOver = (char)0;
                }

                encoder.m_charsUsed = buffer.CharsUsed;
            }

            // Return our length
            return buffer.Count;
        }

        [System.Security.SecurityCritical]  // auto-generated
        private unsafe int GetCharsCP5022xJP(byte* bytes, int byteCount,
                                                  char* chars, int charCount, ISO2022Decoder decoder)
        {
            // Get our info.
            Encoding.EncodingCharBuffer buffer = new Encoding.EncodingCharBuffer(
                this, decoder, chars, charCount, bytes, byteCount);

            // No mode information yet
            ISO2022Modes currentMode = ISO2022Modes.ModeASCII;      // Our current Mode
            ISO2022Modes shiftInMode = ISO2022Modes.ModeASCII;      // Mode that we'll shift in to
            byte[] escapeBytes = new byte[4];
            int escapeCount = 0;

            if (decoder != null)
            {
                currentMode = decoder.currentMode;
                shiftInMode = decoder.shiftInOutMode;

                // See if we have leftover decoder buffer to use
                // Load our bytesLeftOver
                escapeCount = decoder.bytesLeftOverCount;

                // Don't want to mess up decoder if we're counting or throw an exception
                for (int i = 0; i < escapeCount; i++)
                    escapeBytes[i] = decoder.bytesLeftOver[i];
            }

            // Do this until the end
            while (buffer.MoreData || escapeCount > 0)
            {
                byte ch;

                if (escapeCount > 0)
                {
                    // Get more escape sequences if necessary
                    if (escapeBytes[0] == ESCAPE)
                    {
                        // Stop if no more input
                        if (!buffer.MoreData)
                        {
                            if (decoder != null && !decoder.MustFlush)
                                break;
                        }
                        else
                        {
                            // Add it to the sequence we can check
                            escapeBytes[escapeCount++] = buffer.GetNextByte();

                            // We have an escape sequence
                            ISO2022Modes modeReturn =
                                CheckEscapeSequenceJP(escapeBytes, escapeCount);

                            if (modeReturn != ISO2022Modes.ModeInvalidEscape)
                            {
                                if (modeReturn !=
ISO2022Modes.ModeIncompleteEscape) 996 { 997 // Processed escape correctly 998 escapeCount = 0; 999 1000 // We're now this mode 1001 currentMode = shiftInMode = modeReturn; 1002 } 1003 1004 // Either way, continue to get next escape or real byte 1005 continue; 1006 } 1007 } 1008 1009 // If ModeInvalidEscape, or no input & must flush, then fall through to add escape. 1010 } 1011 1012 // Read next escape byte and move them down one. 1013 ch = DecrementEscapeBytes(ref escapeBytes, ref escapeCount); 1014 } 1015 else 1016 { 1017 // Get our next byte 1018 ch = buffer.GetNextByte(); 1019 1020 if (ch == ESCAPE) 1021 { 1022 // We'll have an escape sequence, use it if we don't have one buffered already 1023 if (escapeCount == 0) 1024 { 1025 // Start this new escape sequence 1026 escapeBytes[0] = ch; 1027 escapeCount = 1; 1028 continue; 1029 } 1030 1031 // Flush the previous escape sequence, then reuse this escape byte 1032 buffer.AdjustBytes(-1); 1033 } 1034 } 1035 1036 if (ch == SHIFT_OUT) 1037 { 1038 shiftInMode = currentMode; 1039 currentMode = ISO2022Modes.ModeHalfwidthKatakana; 1040 continue; 1041 } 1042 else if (ch == SHIFT_IN) 1043 { 1044 currentMode = shiftInMode; 1045 continue; 1046 } 1047 1048 // Get our full character 1049 ushort iBytes = ch; 1050 bool b2Bytes = false; 1051 1052 if (currentMode == ISO2022Modes.ModeJIS0208) 1053 { 1054 // 1055 // To handle errors, we need to check: 1056 // 1. if trailbyte is there 1057 // 2. 
if code is valid 1058 // 1059 if (escapeCount > 0) 1060 { 1061 // Let another escape fall through 1062 if (escapeBytes[0] != ESCAPE) 1063 { 1064 // Move them down one & get the next data 1065 iBytes <<= 8; 1066 iBytes |= DecrementEscapeBytes(ref escapeBytes, ref escapeCount); 1067 b2Bytes = true; 1068 } 1069 } 1070 else if (buffer.MoreData) 1071 { 1072 iBytes <<= 8; 1073 iBytes |= buffer.GetNextByte(); 1074 b2Bytes = true; 1075 } 1076 else 1077 { 1078 // Not enough input, use decoder if possible 1079 if (decoder == null || decoder.MustFlush) 1080 { 1081 // No decoder, do fallback for this byte 1082 buffer.Fallback(ch); 1083 break; 1084 } 1085 1086 // Stick it in the decoder if we're not counting 1087 if (chars != null) 1088 { 1089 escapeBytes[0] = ch; 1090 escapeCount = 1; 1091 } 1092 break; 1093 } 1094 1095 // MLang treated JIS 0208 '*' lead byte like a single halfwidth katakana 1096 // escape, so use 0x8e00 as katakana lead byte and keep same trail byte. 1097 // 0x2a lead byte range is normally unused in JIS 0208, so shouldn't have 1098 // any wierd compatibility issues. 1099 if ((b2Bytes == true) && ((iBytes & 0xff00) == 0x2a00)) 1100 { 1101 iBytes = (ushort)(iBytes & 0xff); 1102 iBytes |= (LEADBYTE_HALFWIDTH << 8); // Put us in the halfwidth katakana range 1103 } 1104 } 1105 else if (iBytes >= 0xA1 && iBytes <= 0xDF) 1106 { 1107 // Everett accidentally mapped Katakana like shift-jis (932), 1108 // even though this is a 7 bit code page. We keep that mapping 1109 iBytes |= (LEADBYTE_HALFWIDTH << 8); // Map to halfwidth katakana range 1110 iBytes &= 0xff7f; // remove extra 0x80 1111 } 1112 else if (currentMode == ISO2022Modes.ModeHalfwidthKatakana ) 1113 { 1114 // Add 0x10 lead byte that our encoding expects for Katakana: 1115 iBytes |= (LEADBYTE_HALFWIDTH << 8); 1116 } 1117 1118 // We have an iBytes to try to convert. 
1119 char c = mapBytesToUnicode[iBytes]; 1120 1121 // See if it was unknown 1122 if (c == UNKNOWN_CHAR_FLAG && iBytes != 0) 1123 { 1124 // Have to do fallback 1125 if (b2Bytes) 1126 { 1127 if (!buffer.Fallback((byte)(iBytes >> 8), (byte)iBytes)) 1128 break; 1129 } 1130 else 1131 { 1132 if (!buffer.Fallback(ch)) 1133 break; 1134 } 1135 } 1136 else 1137 { 1138 // If we were JIS 0208, then we consumed an extra byte 1139 if (!buffer.AddChar(c, b2Bytes ? 2:1)) 1140 break; 1141 } 1142 } 1143 1144 // Make sure our decoder state matches our mode, if not counting 1145 if (chars != null && decoder != null) 1146 { 1147 // Remember it if we don't flush 1148 if (!decoder.MustFlush || escapeCount != 0) 1149 { 1150 // Either not flushing or had state (from convert) 1151 Contract.Assert(!decoder.MustFlush || !decoder.m_throwOnOverflow, 1152 "[ISO2022Encoding.GetCharsCP5022xJP]Expected no state or not converting or not flushing"); 1153 1154 decoder.currentMode = currentMode; 1155 decoder.shiftInOutMode = shiftInMode; 1156 1157 // Remember escape buffer 1158 decoder.bytesLeftOverCount = escapeCount; 1159 decoder.bytesLeftOver = escapeBytes; 1160 } 1161 else 1162 { 1163 // We flush, clear buffer 1164 decoder.currentMode = ISO2022Modes.ModeASCII; 1165 decoder.shiftInOutMode = ISO2022Modes.ModeASCII; 1166 decoder.bytesLeftOverCount = 0; 1167 // Slightly different if counting/not counting 1168 } 1169 1170 decoder.m_bytesUsed = buffer.BytesUsed; 1171 } 1172 1173 // Return # of characters we found 1174 return buffer.Count; 1175 } 1176 1177 // We know we have an escape sequence, so check it starting with the byte after the escape CheckEscapeSequenceJP( byte[] bytes, int escapeCount )1178 private ISO2022Modes CheckEscapeSequenceJP( byte[] bytes, int escapeCount ) 1179 { 1180 // Have an escape sequence 1181 if (bytes[0] != ESCAPE) 1182 return ISO2022Modes.ModeInvalidEscape; 1183 1184 if (escapeCount < 3) 1185 return ISO2022Modes.ModeIncompleteEscape; 1186 1187 if (bytes[1] == '(') 1188 { 
1189 if (bytes[2] == 'B') // <esc>(B 1190 { 1191 return ISO2022Modes.ModeASCII; 1192 } 1193 else if (bytes[2] == 'H') // <esc>(H 1194 { 1195 // Actually this is supposed to be Swedish 1196 // We treat it like ASCII though. 1197 return ISO2022Modes.ModeASCII; 1198 } 1199 else if (bytes[2] == 'J') // <esc>(J 1200 { 1201 // Actually this is supposed to be Roman 1202 // 2 characters are different, but historically we treat it as ascii 1203 return ISO2022Modes.ModeASCII; 1204 } 1205 else if (bytes[2] == 'I') // <esc>(I 1206 { 1207 return ISO2022Modes.ModeHalfwidthKatakana; 1208 } 1209 } 1210 else if (bytes[1] == '$') 1211 { 1212 if (bytes[2] == '@' || // <esc>$@ 1213 bytes[2] == 'B') // <esc>$B 1214 { 1215 return ISO2022Modes.ModeJIS0208; 1216 } 1217 else 1218 { 1219 // Looking for <esc>$(D 1220 if (escapeCount < 4) 1221 return ISO2022Modes.ModeIncompleteEscape; 1222 1223 if (bytes[2] == '(' && bytes[3] == 'D') // <esc>$(D 1224 { 1225 // Mlang treated 0208 like 0212 even though that's wrong 1226 return ISO2022Modes.ModeJIS0208; 1227 } 1228 } 1229 } 1230 else if (bytes[1] == '&') 1231 { 1232 if (bytes[2] == '@') // <esc>&@ 1233 { 1234 // Ignore ESC & @ (prefix to <esc>$B) 1235 return ISO2022Modes.ModeNOOP; 1236 } 1237 } 1238 1239 // If we get here we fell through and have an invalid/unknown escape sequence 1240 return ISO2022Modes.ModeInvalidEscape; 1241 } 1242 DecrementEscapeBytes(ref byte[] bytes, ref int count)1243 private byte DecrementEscapeBytes(ref byte[] bytes, ref int count) 1244 { 1245 Contract.Assert(count > 0, "[ISO2022Encoding.DecrementEscapeBytes]count > 0"); 1246 1247 // Decrement our count 1248 count--; 1249 1250 // Remember the first one 1251 byte returnValue = bytes[0]; 1252 1253 // Move them down one. 
1254 for (int i = 0; i < count; i++) 1255 { 1256 bytes[i] = bytes[i+1]; 1257 } 1258 1259 // Clear out the last byte 1260 bytes[count] = 0; 1261 1262 // Return the old 1st byte 1263 return returnValue; 1264 } 1265 1266 // Note that in DBCS mode mlang passed through ' ', '\t' and '\n' as SBCS characters 1267 // probably to allow mailer formatting without too much extra work. 1268 [System.Security.SecurityCritical] // auto-generated GetCharsCP50225KR(byte* bytes, int byteCount, char* chars, int charCount, ISO2022Decoder decoder)1269 private unsafe int GetCharsCP50225KR(byte* bytes, int byteCount, 1270 char* chars, int charCount, ISO2022Decoder decoder) 1271 { 1272 // Get our info. 1273 Encoding.EncodingCharBuffer buffer = new Encoding.EncodingCharBuffer( 1274 this, decoder, chars, charCount, bytes, byteCount); 1275 1276 // No mode information yet 1277 ISO2022Modes currentMode = ISO2022Modes.ModeASCII; // Our current Mode 1278 1279 byte[] escapeBytes = new byte[4]; 1280 int escapeCount = 0; 1281 1282 if (decoder != null) 1283 { 1284 currentMode = decoder.currentMode; 1285 1286 // See if we have leftover decoder buffer to use 1287 // Load our bytesLeftOver 1288 escapeCount = decoder.bytesLeftOverCount; 1289 1290 // Don't want to mess up decoder if we're counting or throw an exception 1291 for (int i = 0; i < escapeCount; i++) 1292 escapeBytes[i] = decoder.bytesLeftOver[i]; 1293 } 1294 1295 // Do this until the end, just do '?' replacement because we don't have fallbacks for decodings. 
1296 while (buffer.MoreData || escapeCount > 0) 1297 { 1298 byte ch; 1299 1300 if (escapeCount > 0) 1301 { 1302 // Get more escape sequences if necessary 1303 if (escapeBytes[0] == ESCAPE) 1304 { 1305 // Stop if no more input 1306 if (!buffer.MoreData) 1307 { 1308 if (decoder != null && !decoder.MustFlush) 1309 break; 1310 } 1311 else 1312 { 1313 // Add it to the sequence we can check 1314 escapeBytes[escapeCount++] = buffer.GetNextByte(); 1315 1316 // We have an escape sequence 1317 ISO2022Modes modeReturn = 1318 CheckEscapeSequenceKR(escapeBytes, escapeCount); 1319 1320 if (modeReturn != ISO2022Modes.ModeInvalidEscape) 1321 { 1322 if (modeReturn != ISO2022Modes.ModeIncompleteEscape) 1323 { 1324 // Processed escape correctly, no effect (we know about KR mode) 1325 escapeCount = 0; 1326 } 1327 1328 // Either way, continue to get next escape or real byte 1329 continue; 1330 } 1331 } 1332 1333 // If ModeInvalidEscape, or no input & must flush, then fall through to add escape. 1334 } 1335 1336 // Still have something left over in escape buffer 1337 // Get it and move them down one 1338 ch = DecrementEscapeBytes(ref escapeBytes, ref escapeCount); 1339 } 1340 else 1341 { 1342 // Get our next byte 1343 ch = buffer.GetNextByte(); 1344 1345 if (ch == ESCAPE) 1346 { 1347 // We'll have an escape sequence, use it if we don't have one buffered already 1348 if (escapeCount == 0) 1349 { 1350 // Start this new escape sequence 1351 escapeBytes[0] = ch; 1352 escapeCount = 1; 1353 continue; 1354 } 1355 1356 // Flush previous escape sequence, then reuse this escape byte 1357 buffer.AdjustBytes(-1); 1358 } 1359 } 1360 1361 if (ch == SHIFT_OUT) 1362 { 1363 currentMode = ISO2022Modes.ModeKR; 1364 continue; 1365 } 1366 else if (ch == SHIFT_IN) 1367 { 1368 currentMode = ISO2022Modes.ModeASCII; 1369 continue; 1370 } 1371 1372 // Get our full character 1373 ushort iBytes = ch; 1374 bool b2Bytes = false; 1375 1376 // MLANG was passing through ' ', '\t' and '\n', so we do so as well, but I 
don't see that in the RFC. 1377 if (currentMode == ISO2022Modes.ModeKR && ch != ' ' && ch != '\t' && ch != '\n') 1378 { 1379 // 1380 // To handle errors, we need to check: 1381 // 1. if trailbyte is there 1382 // 2. if code is valid 1383 // 1384 if (escapeCount > 0) 1385 { 1386 // Let another escape fall through 1387 if (escapeBytes[0] != ESCAPE) 1388 { 1389 // Move them down one & get the next data 1390 iBytes <<= 8; 1391 iBytes |= DecrementEscapeBytes(ref escapeBytes, ref escapeCount); 1392 b2Bytes = true; 1393 } 1394 } 1395 else if (buffer.MoreData) 1396 { 1397 iBytes <<= 8; 1398 iBytes |= buffer.GetNextByte(); 1399 b2Bytes = true; 1400 } 1401 else 1402 { 1403 // Not enough input, use decoder if possible 1404 if (decoder == null || decoder.MustFlush) 1405 { 1406 // No decoder, do fallback for lonely 1st byte 1407 buffer.Fallback(ch); 1408 break; 1409 } 1410 1411 // Stick it in the decoder if we're not counting 1412 if (chars != null) 1413 { 1414 escapeBytes[0] = ch; 1415 escapeCount = 1; 1416 } 1417 break; 1418 } 1419 } 1420 1421 // We have a iBytes to try to convert. 1422 char c = mapBytesToUnicode[iBytes]; 1423 1424 // See if it was unknown 1425 if (c == UNKNOWN_CHAR_FLAG && iBytes != 0) 1426 { 1427 // Have to do fallback 1428 if (b2Bytes) 1429 { 1430 if (!buffer.Fallback((byte)(iBytes >> 8), (byte)iBytes)) 1431 break; 1432 } 1433 else 1434 { 1435 if (!buffer.Fallback(ch)) 1436 break; 1437 } 1438 } 1439 else 1440 { 1441 if (!buffer.AddChar(c, b2Bytes ? 
2:1)) 1442 break; 1443 } 1444 } 1445 1446 // Make sure our decoder state matches our mode, if not counting 1447 if (chars != null && decoder != null) 1448 { 1449 // Remember it if we don't flush 1450 if (!decoder.MustFlush || escapeCount != 0) 1451 { 1452 // Either not flushing or had state (from convert) 1453 Contract.Assert(!decoder.MustFlush || !decoder.m_throwOnOverflow, 1454 "[ISO2022Encoding.GetCharsCP50225KR]Expected no state or not converting or not flushing"); 1455 1456 decoder.currentMode = currentMode; 1457 1458 // Remember escape buffer 1459 decoder.bytesLeftOverCount = escapeCount; 1460 decoder.bytesLeftOver = escapeBytes; 1461 } 1462 else 1463 { 1464 // We flush, clear buffer 1465 decoder.currentMode = ISO2022Modes.ModeASCII; 1466 decoder.shiftInOutMode = ISO2022Modes.ModeASCII; 1467 decoder.bytesLeftOverCount = 0; 1468 } 1469 1470 decoder.m_bytesUsed = buffer.BytesUsed; 1471 } 1472 1473 // Return # of characters we found 1474 return buffer.Count; 1475 } 1476 1477 // We know we have an escape sequence, so check it starting with the byte after the escape CheckEscapeSequenceKR( byte[] bytes, int escapeCount )1478 private ISO2022Modes CheckEscapeSequenceKR( byte[] bytes, int escapeCount ) 1479 { 1480 // Have an escape sequence 1481 if (bytes[0] != ESCAPE) 1482 return ISO2022Modes.ModeInvalidEscape; 1483 1484 if (escapeCount < 4) 1485 return ISO2022Modes.ModeIncompleteEscape; 1486 1487 if (bytes[1] == '$' && bytes[2] == ')' && bytes[3] == 'C') // <esc>$)C 1488 return ISO2022Modes.ModeKR; 1489 1490 // If we get here we fell through and have an invalid/unknown escape sequence 1491 return ISO2022Modes.ModeInvalidEscape; 1492 } 1493 1494 // CP52936 is HZ Encoding 1495 // HZ Encoding has 4 shift sequences: 1496 // ~~ '~' (\u7e) 1497 // ~} shift into 1 byte mode, 1498 // ~{ shift into 2 byte GB 2312-80 1499 // ~<NL> Maintain 2 byte mode across new lines (ignore both ~ and <NL> characters) 1500 // (This is for mailers that restrict to 70 or 80 or whatever 
character lines) 1501 // 1502 // According to comment in mlang, lead & trail byte ranges are described in RFC 1843 1503 // RFC 1843 => valid HZ code range: leading byte 0x21 - 0x77, 2nd byte 0x21 - 0x7e 1504 // Our 936 code points are or'd with 0x8080, so lead byte 0xa1 - 0xf7, trail byte 0xa1 - 0xfe 1505 // 1506 // This encoding is designed for transmission by e-mail and news. No bytes should have high bit set. 1507 // (all bytes <= 0x7f) 1508 [System.Security.SecurityCritical] // auto-generated GetCharsCP52936(byte* bytes, int byteCount, char* chars, int charCount, ISO2022Decoder decoder)1509 private unsafe int GetCharsCP52936(byte* bytes, int byteCount, 1510 char* chars, int charCount, ISO2022Decoder decoder) 1511 { 1512 Contract.Assert(byteCount >=0, "[ISO2022Encoding.GetCharsCP52936]count >=0"); 1513 Contract.Assert(bytes!=null, "[ISO2022Encoding.GetCharsCP52936]bytes!=null"); 1514 1515 // Get our info. 1516 Encoding.EncodingCharBuffer buffer = new Encoding.EncodingCharBuffer( 1517 this, decoder, chars, charCount, bytes, byteCount); 1518 1519 // No mode information yet 1520 ISO2022Modes currentMode = ISO2022Modes.ModeASCII; 1521 int byteLeftOver = -1; 1522 bool bUsedDecoder = false; 1523 1524 if (decoder != null) 1525 { 1526 currentMode = decoder.currentMode; 1527 // See if we have leftover decoder buffer to use 1528 // Don't want to mess up decoder if we're counting or throw an exception 1529 if (decoder.bytesLeftOverCount != 0 ) 1530 { 1531 // Load our bytesLeftOver 1532 byteLeftOver = decoder.bytesLeftOver[0]; 1533 } 1534 } 1535 1536 // Do this until the end, just do '?' replacement because we don't have fallbacks for decodings. 
1537 while (buffer.MoreData || byteLeftOver >= 0) 1538 { 1539 byte ch; 1540 1541 // May have a left over byte 1542 if (byteLeftOver >= 0) 1543 { 1544 ch = (byte)byteLeftOver; 1545 byteLeftOver = -1; 1546 } 1547 else 1548 { 1549 ch = buffer.GetNextByte(); 1550 } 1551 1552 // We're in escape mode 1553 if (ch == '~') 1554 { 1555 // Next char is type of switch 1556 if (!buffer.MoreData) 1557 { 1558 // We don't have anything left, it'll be in decoder or a ? 1559 // don't fail if we are allowing overflows 1560 if (decoder == null || decoder.MustFlush) 1561 { 1562 // We'll be a '?' 1563 buffer.Fallback(ch); 1564 // break if we fail & break if we don't (because !MoreData) 1565 // Add succeeded, continue 1566 break; 1567 } 1568 1569 // Stick it in decoder 1570 if (decoder != null) 1571 decoder.ClearMustFlush(); 1572 1573 if (chars != null) 1574 { 1575 decoder.bytesLeftOverCount = 1; 1576 decoder.bytesLeftOver[0] = (byte)'~'; 1577 bUsedDecoder = true; 1578 } 1579 break; 1580 } 1581 1582 // What type is it?, get 2nd byte 1583 ch = buffer.GetNextByte(); 1584 1585 if (ch == '~' && currentMode == ISO2022Modes.ModeASCII) 1586 { 1587 // Its just a ~~ replacement for ~, add it 1588 if (!buffer.AddChar((char)ch, 2)) 1589 // Add failed, break for converting 1590 break; 1591 1592 // Add succeeded, continue 1593 continue; 1594 } 1595 else if (ch == '{') 1596 { 1597 // Switching to Double Byte mode 1598 currentMode = ISO2022Modes.ModeHZ; 1599 continue; 1600 } 1601 else if (ch == '}') 1602 { 1603 // Switching to ASCII mode 1604 currentMode = ISO2022Modes.ModeASCII; 1605 continue; 1606 } 1607 else if (ch == '\n') 1608 { 1609 // Ignore ~\n sequence 1610 continue; 1611 } 1612 else 1613 { 1614 // Unknown escape, back up and try the '~' as a "normal" byte or lead byte 1615 buffer.AdjustBytes(-1); 1616 ch = (byte)'~'; 1617 } 1618 } 1619 1620 // go ahead and add our data 1621 if (currentMode != ISO2022Modes.ModeASCII) 1622 { 1623 // Should be ModeHZ 1624 Contract.Assert(currentMode == 
ISO2022Modes.ModeHZ, "[ISO2022Encoding.GetCharsCP52936]Expected ModeHZ"); 1625 char cm; 1626 1627 // Everett allowed characters < 0x20 to be passed as if they were ASCII 1628 if (ch < 0x20) 1629 { 1630 // Emit it as ASCII 1631 goto STOREASCII; 1632 } 1633 1634 // Its multibyte, should have another byte 1635 if (!buffer.MoreData) 1636 { 1637 // No bytes left 1638 // don't fail if we are allowing overflows 1639 if (decoder == null || decoder.MustFlush) 1640 { 1641 // Not enough bytes, fallback lead byte 1642 buffer.Fallback(ch); 1643 1644 // Break if we fail & break because !MoreData 1645 break; 1646 } 1647 1648 if (decoder != null) 1649 decoder.ClearMustFlush(); 1650 1651 // Stick it in decoder 1652 if (chars != null) 1653 { 1654 decoder.bytesLeftOverCount = 1; 1655 decoder.bytesLeftOver[0] = ch; 1656 bUsedDecoder = true; 1657 } 1658 break; 1659 } 1660 1661 // Everett uses space as an escape character for single SBCS bytes 1662 byte ch2 = buffer.GetNextByte(); 1663 ushort iBytes = (ushort)(ch << 8 | ch2); 1664 1665 if (ch == ' ' && ch2 != 0) 1666 { 1667 // Get next char and treat it like ASCII (Everett treated space like an escape 1668 // allowing the next char to be just ascii) 1669 cm = (char)ch2; 1670 goto STOREMULTIBYTE; 1671 } 1672 1673 // Bytes should be in range: lead byte 0x21-0x77, trail byte: 0x21 - 0x7e 1674 if ((ch < 0x21 || ch > 0x77 || ch2 < 0x21 || ch2 > 0x7e) && 1675 // Everett allowed high bit mappings for same characters (but only if both bits set) 1676 (ch < 0xa1 || ch > 0xf7 || ch2 < 0xa1 || ch2 > 0xfe)) 1677 { 1678 // For some reason Everett allowed XX20 to become unicode 3000... (ideo sp) 1679 if (ch2 == 0x20 && 0x21 <= ch && ch <= 0x7d) 1680 { 1681 iBytes = 0x2121; 1682 goto MULTIBYTE; 1683 } 1684 1685 // Illegal char, use fallback. 
If lead byte is 0 have to do it special and do it first 1686 if (!buffer.Fallback((byte)(iBytes>>8), (byte)(iBytes))) 1687 break; 1688 continue; 1689 } 1690 1691 MULTIBYTE: 1692 iBytes |= 0x8080; 1693 // Look up the multibyte char to stick it in our data 1694 1695 // We have a iBytes to try to convert. 1696 cm = mapBytesToUnicode[iBytes]; 1697 1698 STOREMULTIBYTE: 1699 1700 // See if it was unknown 1701 if (cm == UNKNOWN_CHAR_FLAG && iBytes != 0) 1702 { 1703 // Fall back the unknown stuff 1704 if (!buffer.Fallback((byte)(iBytes>>8), (byte)(iBytes))) 1705 break; 1706 continue; 1707 } 1708 1709 if (!buffer.AddChar(cm, 2)) 1710 break; // convert ran out of buffer, stop 1711 continue; 1712 } 1713 1714 // Just ASCII 1715 // We allow some chars > 7f because everett did, so we have to look them up. 1716 STOREASCII: 1717 char c = mapBytesToUnicode[ch]; 1718 1719 // Check if it was unknown 1720 if ((c == UNKNOWN_CHAR_FLAG || c == 0) && (ch != 0)) 1721 { 1722 // fallback the unkown bytes 1723 if (!buffer.Fallback((byte)ch)) 1724 break; 1725 continue; 1726 } 1727 1728 // Go ahead and add our ASCII character 1729 if (!buffer.AddChar(c)) 1730 break; // convert ran out of buffer, stop 1731 } 1732 1733 // Need to remember our state, IF we're not counting 1734 if (chars != null && decoder != null) 1735 { 1736 if (!bUsedDecoder) 1737 { 1738 // If we didn't use it, clear the byte left over 1739 decoder.bytesLeftOverCount = 0; 1740 } 1741 1742 if (decoder.MustFlush && decoder.bytesLeftOverCount == 0) 1743 { 1744 decoder.currentMode = ISO2022Modes.ModeASCII; 1745 } 1746 else 1747 { 1748 // Either not flushing or had state (from convert) 1749 Contract.Assert(!decoder.MustFlush || !decoder.m_throwOnOverflow, 1750 "[ISO2022Encoding.GetCharsCP52936]Expected no state or not converting or not flushing"); 1751 1752 decoder.currentMode = currentMode; 1753 } 1754 decoder.m_bytesUsed = buffer.BytesUsed; 1755 } 1756 1757 // Return # of characters we found 1758 return buffer.Count; 1759 } 1760 
// Note: These all end up with 1/2 bytes of average byte count, so unless we're 1 we're always
// charCount/2 bytes too big.
//
// Upper bound on the number of bytes needed to encode charCount chars with this code page.
// Throws ArgumentOutOfRangeException if charCount is negative or the result does not fit
// in an int.
public override int GetMaxByteCount(int charCount)
{
    if (charCount < 0)
        throw new ArgumentOutOfRangeException("charCount",
             Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum"));
    Contract.EndContractBlock();

    // One extra char in case a left over high surrogate turns into a '?', then scale by the
    // longest fallback.
    long byteCount = (long)charCount + 1;

    if (EncoderFallback.MaxCharCount > 1)
        byteCount *= EncoderFallback.MaxCharCount;

    // Generic DBCS values (sort of) unless the code page says otherwise
    int perChar = 2;
    int extraStart = 0;
    int extraEnd = 0;

    switch (CodePage)
    {
        case 50220:
        case 50221:
            // 2 bytes + 3 byte switch to JIS 0208, or 1 byte + 3 byte switch to 1 byte CP
            perChar = 5;        // 5 max (4.5 average)
            extraEnd = 3;       // 3 bytes to shift back to ASCII
            break;

        case 50222:
            // 2 bytes + 3 byte switch to JIS 0208, or 1 byte + 3 byte switch to 1 byte CP
            perChar = 5;        // 5 max (4.5 average)
            extraEnd = 4;       // 1 byte Katakana -> DBCS, then 3 bytes DBCS -> ASCII
            break;

        case 50225:
            // 2 bytes + 1 byte SO, or 1 byte + 1 byte SI
            perChar = 3;        // 3 max (2.5 average)
            extraStart = 4;     // EUC-KR marker appears at beginning of file
            extraEnd = 1;       // 1 byte to shift back to ASCII if necessary
            break;

        case 52936:
            // 2 bytes + 2 byte shift, or 1 byte + 1 byte shift.
            // Worst case: a left over surrogate without its low surrogate is an extra '?',
            // could have to switch to ASCII, then could have HZ and flush to ASCII mode.
            perChar = 4;        // 4 max (3.5 average if every other char is HZ/ASCII)
            extraEnd = 2;       // 2 if we have to shift back to ASCII
            break;
    }

    // (chars + surrogate) * perChar, plus the fixed start/end overhead
    byteCount = (byteCount * perChar) + (extraStart + extraEnd);

    if (byteCount > 0x7fffffff)
        throw new ArgumentOutOfRangeException("charCount", Environment.GetResourceString("ArgumentOutOfRange_GetByteCountOverflow"));

    return (int)byteCount;
}

// Upper bound on the number of chars produced by decoding byteCount bytes with this code page.
// Throws ArgumentOutOfRangeException if byteCount is negative or the result does not fit
// in an int.
public override int GetMaxCharCount(int byteCount)
{
    if (byteCount < 0)
        throw new ArgumentOutOfRangeException("byteCount",
             Environment.GetResourceString("ArgumentOutOfRange_NeedNonNegNum"));
    Contract.EndContractBlock();

    // Worst case is all ASCII, i.e. one char per byte, for every code page handled here
    int perChar = 1;

    // Chars the decoder state could add on top of that
    int extraDecoder;
    switch (CodePage)
    {
        case 50220:
        case 50221:
        case 50222:
        case 50225:
            // Could have left over 3 chars of a 4 char escape sequence that all become '?'
            extraDecoder = 3;
            break;

        case 52936:
            // Sequences are 2 chars, so if the next one is illegal the previous 1 could be '?'
            extraDecoder = 1;
            break;

        default:
            extraDecoder = 1;
            break;
    }

    long charCount = ((long)byteCount * perChar) + extraDecoder;

    // Allow for every unknown sequence expanding to the longest fallback
    if (DecoderFallback.MaxCharCount > 1)
        charCount *= DecoderFallback.MaxCharCount;

    if (charCount > 0x7fffffff)
        throw new ArgumentOutOfRangeException("byteCount", Environment.GetResourceString("ArgumentOutOfRange_GetCharCountOverflow"));

    return (int)charCount;
}

// Creates a new ISO2022Encoder bound to this encoding.
public override Encoder GetEncoder()
{
    return new ISO2022Encoder(this);
}

// Creates a new ISO2022Decoder bound to this encoding.
public override Decoder GetDecoder()
{
    return new ISO2022Decoder(this);
}

// Encoder state for the ISO 2022 code pages: the current mode and the shift in/out mode,
// in addition to what EncoderNLS already keeps (charLeftOver, fallback buffer).
[Serializable]
internal class ISO2022Encoder : System.Text.EncoderNLS
{
    internal ISO2022Modes currentMode;
    internal ISO2022Modes shiftInOutMode;

    internal ISO2022Encoder(EncodingNLS encoding) : base(encoding)
    {
        // base calls reset
    }

    // Returns to ASCII mode with no left over char and a reset fallback buffer.
    public override void Reset()
    {
        currentMode = ISO2022Modes.ModeASCII;
        shiftInOutMode = ISO2022Modes.ModeASCII;
        charLeftOver = (char)0;

        if (m_fallbackBuffer != null)
        {
            m_fallbackBuffer.Reset();
        }
    }

    // True when a char is left over or we are not in ASCII mode.
    internal override bool HasState
    {
        get
        {
            // Shift-out mode is deliberately not checked, it may be ascii (JP) or not (KR)
            if (this.charLeftOver != (char)0)
                return true;

            return currentMode != ISO2022Modes.ModeASCII;
        }
    }
}

// Decoder state for the ISO 2022 code pages: up to 4 left over bytes plus the current mode
// and the shift in/out mode.
[Serializable]
internal class ISO2022Decoder : System.Text.DecoderNLS
{
    internal byte[] bytesLeftOver;
    internal int bytesLeftOverCount;
    internal ISO2022Modes currentMode;
    internal ISO2022Modes shiftInOutMode;

    internal ISO2022Decoder(EncodingNLS encoding) : base(encoding)
    {
        // base calls reset
    }

    // Returns to ASCII mode with a fresh, empty left over buffer and a reset fallback buffer.
    public override void Reset()
    {
        bytesLeftOverCount = 0;
        bytesLeftOver = new byte[4];
        currentMode = ISO2022Modes.ModeASCII;
        shiftInOutMode = ISO2022Modes.ModeASCII;

        if (m_fallbackBuffer != null)
        {
            m_fallbackBuffer.Reset();
        }
    }

    // True when bytes are left over or we have not shifted back to ASCII.
    internal override bool HasState
    {
        get
        {
            if (this.bytesLeftOverCount != 0)
                return true;

            return currentMode != ISO2022Modes.ModeASCII;
        }
    }
}

// Full width replacements for the halfwidth katakana block.  Entry i is the replacement for
// the halfwidth value 0x8ea1 + i (0x8ea1 through 0x8edf, 63 entries).
static ushort[] HalfToFullWidthKanaTable =
{
    0xa1a3, // 0x8ea1 : Halfwidth Ideographic Period
    0xa1d6, // 0x8ea2 : Halfwidth Opening Corner Bracket
    0xa1d7, // 0x8ea3 : Halfwidth Closing Corner Bracket
    0xa1a2, // 0x8ea4 : Halfwidth Ideographic Comma
    0xa1a6, // 0x8ea5 : Halfwidth Katakana Middle Dot
    0xa5f2, // 0x8ea6 : Halfwidth Katakana Wo
    0xa5a1, // 0x8ea7 : Halfwidth Katakana Small A
    0xa5a3, // 0x8ea8 : Halfwidth Katakana Small I
    0xa5a5, // 0x8ea9 : Halfwidth Katakana Small U
    0xa5a7, // 0x8eaa : Halfwidth Katakana Small E
    0xa5a9, // 0x8eab : Halfwidth Katakana Small O
    0xa5e3, // 0x8eac : Halfwidth Katakana Small Ya
    0xa5e5, // 0x8ead : Halfwidth Katakana Small Yu
    0xa5e7, // 0x8eae : Halfwidth Katakana Small Yo
    0xa5c3, // 0x8eaf : Halfwidth Katakana Small Tu
    0xa1bc, // 0x8eb0 : Halfwidth Katakana-Hiragana Prolonged Sound Mark
    0xa5a2, // 0x8eb1 : Halfwidth Katakana A
    0xa5a4, // 0x8eb2 : Halfwidth Katakana I
    0xa5a6, // 0x8eb3 : Halfwidth Katakana U
    0xa5a8, // 0x8eb4 : Halfwidth Katakana E
    0xa5aa, // 0x8eb5 : Halfwidth Katakana O
    0xa5ab, // 0x8eb6 : Halfwidth Katakana Ka
    0xa5ad, // 0x8eb7 : Halfwidth Katakana Ki
    0xa5af, // 0x8eb8 : Halfwidth Katakana Ku
    0xa5b1, // 0x8eb9 : Halfwidth Katakana Ke
    0xa5b3, // 0x8eba : Halfwidth Katakana Ko
    0xa5b5, // 0x8ebb : Halfwidth Katakana Sa
    0xa5b7, // 0x8ebc : Halfwidth Katakana Si
    0xa5b9, // 0x8ebd : Halfwidth Katakana Su
    0xa5bb, // 0x8ebe : Halfwidth Katakana Se
    0xa5bd, // 0x8ebf : Halfwidth Katakana So
    0xa5bf, // 0x8ec0 : Halfwidth Katakana Ta
    0xa5c1, // 0x8ec1 : Halfwidth Katakana Ti
    0xa5c4, // 0x8ec2 : Halfwidth Katakana Tu
    0xa5c6, // 0x8ec3 : Halfwidth Katakana Te
    0xa5c8, // 0x8ec4 : Halfwidth Katakana To
    0xa5ca, // 0x8ec5 : Halfwidth Katakana Na
    0xa5cb, // 0x8ec6 : Halfwidth Katakana Ni
    0xa5cc, // 0x8ec7 : Halfwidth Katakana Nu
    0xa5cd, // 0x8ec8 : Halfwidth Katakana Ne
    0xa5ce, // 0x8ec9 : Halfwidth Katakana No
    0xa5cf, // 0x8eca : Halfwidth Katakana Ha
    0xa5d2, // 0x8ecb : Halfwidth Katakana Hi
    0xa5d5, // 0x8ecc : Halfwidth Katakana Hu
    0xa5d8, // 0x8ecd : Halfwidth Katakana He
    0xa5db, // 0x8ece : Halfwidth Katakana Ho
    0xa5de, // 0x8ecf : Halfwidth Katakana Ma
    0xa5df, // 0x8ed0 : Halfwidth Katakana Mi
    0xa5e0, // 0x8ed1 : Halfwidth Katakana Mu
    0xa5e1, // 0x8ed2 : Halfwidth Katakana Me
    0xa5e2, // 0x8ed3 : Halfwidth Katakana Mo
    0xa5e4, // 0x8ed4 : Halfwidth Katakana Ya
    0xa5e6, // 0x8ed5 : Halfwidth Katakana Yu
    0xa5e8, // 0x8ed6 : Halfwidth Katakana Yo
    0xa5e9, // 0x8ed7 : Halfwidth Katakana Ra
    0xa5ea, // 0x8ed8 : Halfwidth Katakana Ri
    0xa5eb, // 0x8ed9 : Halfwidth Katakana Ru
    0xa5ec, // 0x8eda : Halfwidth Katakana Re
    0xa5ed, // 0x8edb : Halfwidth Katakana Ro
    0xa5ef, // 0x8edc : Halfwidth Katakana Wa
    0xa5f3, // 0x8edd : Halfwidth Katakana N
    0xa1ab, // 0x8ede : Halfwidth Katakana Voiced Sound Mark
    0xa1ac  // 0x8edf : Halfwidth Katakana Semi-Voiced Sound Mark
};
}   // class ISO2022Encoding
}   // namespace System.Text
#endif // FEATURE_CODEPAGES_FILE
