// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

//
// Don't override IsAlwaysNormalized because it is just a Unicode Transformation and could be confused.
//

using System;
using System.Globalization;
using System.Diagnostics;
using System.Runtime.InteropServices;

namespace System.Text
{
    public class UnicodeEncoding : Encoding
    {
        // Shared instances used by Encoding.BigEndianUnicode / Encoding.Unicode.
        // Being static fields, they are not created until a static member of this class is referenced.
        internal static readonly UnicodeEncoding s_bigEndianDefault = new UnicodeEncoding(bigEndian: true, byteOrderMark: true);
        internal static readonly UnicodeEncoding s_littleEndianDefault = new UnicodeEncoding(bigEndian: false, byteOrderMark: true);

        // Byte order mark U+FEFF serialized in each byte order.
        private static readonly byte[] s_bigEndianPreamble = new byte[2] { 0xfe, 0xff };
        private static readonly byte[] s_littleEndianPreamble = new byte[2] { 0xff, 0xfe };

        // True when the exception fallbacks are installed instead of the replacement fallbacks.
        internal bool isThrowException = false;

        // Byte order and byte-order-mark settings supplied to the constructor.
        internal bool bigEndian = false;
        internal bool byteOrderMark = true;

        // Unicode version 2.0 character size in bytes
        public const int CharSize = 2;

        // Little-endian, byteOrderMark set, replacement fallbacks (throwOnInvalidBytes = false).
        public UnicodeEncoding()
            : this(bigEndian: false, byteOrderMark: true)
        {
        }

        // Caller-selected byte order and byteOrderMark setting, replacement fallbacks.
        public UnicodeEncoding(bool bigEndian, bool byteOrderMark)
            : this(bigEndian, byteOrderMark, throwOnInvalidBytes: false)
        {
        }

        // Full constructor. Passes 1201 (big-endian) or 1200 (little-endian) to the base
        // Encoding constructor and records the three settings.
        public UnicodeEncoding(bool bigEndian, bool byteOrderMark, bool throwOnInvalidBytes)
            : base(bigEndian ? 1201 : 1200)
        {
            isThrowException = throwOnInvalidBytes;
            this.bigEndian = bigEndian;
            this.byteOrderMark = byteOrderMark;

            // The base constructor already picked fallbacks, but at that point isThrowException
            // was still false, so redo the selection when exceptions were requested.
            if (throwOnInvalidBytes)
            {
                SetDefaultFallbacks();
            }
        }

        // Installs the encoder/decoder fallbacks that match isThrowException:
        // the exception fallbacks when it is set, otherwise replacement fallbacks
        // that substitute U+FFFD.
        internal override void SetDefaultFallbacks()
        {
            if (!isThrowException)
            {
                encoderFallback = new EncoderReplacementFallback("\xFFFD");
                decoderFallback = new DecoderReplacementFallback("\xFFFD");
                return;
            }

            encoderFallback = EncoderFallback.ExceptionFallback;
            decoderFallback = DecoderFallback.ExceptionFallback;
        }

        // The following methods are copied from EncodingNLS.cs.
        // Unfortunately EncodingNLS.cs is internal and we're public, so we have to re-implement them here.
        // These should be kept in sync for the following classes:
        // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
        //

        // Returns the number of bytes required to encode count characters of chars,
        // starting at index.
        //
        // Throws ArgumentNullException when chars is null and ArgumentOutOfRangeException
        // when index or count is negative or the range does not fit inside chars.
        //
        // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
        // So if you fix this, fix the others.  Currently those include:
        // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
        // parent method is safe

        public override unsafe int GetByteCount(char[] chars, int index, int count)
        {
            if (chars == null)
            {
                throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
            }

            if (index < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_NeedNonNegNum);
            }

            if (count < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
            }

            // Expressed as a subtraction so that index + count can never overflow.
            if (count > chars.Length - index)
            {
                throw new ArgumentOutOfRangeException(nameof(chars), SR.ArgumentOutOfRange_IndexCountBuffer);
            }

            // Nothing to count; also avoids pinning an empty array.
            if (count == 0)
            {
                return 0;
            }

            // Delegate to the pointer-based worker without an encoder.
            fixed (char* pChars = chars)
            {
                return GetByteCount(pChars + index, count, null);
            }
        }

        // Returns the number of bytes required to encode the whole string s.
        // Throws ArgumentNullException when s is null.
        //
        // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
        // So if you fix this, fix the others.  Currently those include:
        // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
        // parent method is safe

        public override unsafe int GetByteCount(String s)
        {
            if (s == null)
            {
                throw new ArgumentNullException(nameof(s));
            }

            fixed (char* pChars = s)
            {
                return GetByteCount(pChars, s.Length, null);
            }
        }

        // Pointer overload: returns the number of bytes required to encode count
        // characters starting at chars. Throws when chars is null or count is negative.
        //
        // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
        // So if you fix this, fix the others.  Currently those include:
        // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding

        [CLSCompliant(false)]
        public override unsafe int GetByteCount(char* chars, int count)
        {
            if (chars == null)
            {
                throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
            }

            if (count < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
            }

            // Count with no encoder state.
            return GetByteCount(chars, count, null);
        }

        // Parent method is safe.
        // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
        // So if you fix this, fix the others.
// Currently those include: EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding

        // Encodes charCount characters of s, starting at charIndex, into bytes starting at
        // byteIndex, and returns the value produced by the pointer-based worker.
        //
        // Throws ArgumentNullException when s or bytes is null and ArgumentOutOfRangeException
        // when an index/count is negative, the character range does not fit inside s, or
        // byteIndex lies beyond the end of bytes.
        public override unsafe int GetBytes(String s, int charIndex, int charCount,
                                              byte[] bytes, int byteIndex)
        {
            if (s == null)
            {
                throw new ArgumentNullException(nameof(s), SR.ArgumentNull_Array);
            }

            if (bytes == null)
            {
                throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
            }

            if (charIndex < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(charIndex), SR.ArgumentOutOfRange_NeedNonNegNum);
            }

            if (charCount < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
            }

            // Expressed as a subtraction so that charIndex + charCount can never overflow.
            if (charCount > s.Length - charIndex)
            {
                throw new ArgumentOutOfRangeException(nameof(s), SR.ArgumentOutOfRange_IndexCount);
            }

            if (byteIndex < 0 || byteIndex > bytes.Length)
            {
                throw new ArgumentOutOfRangeException(nameof(byteIndex), SR.ArgumentOutOfRange_Index);
            }

            // Space available in the destination from byteIndex to the end of the array.
            int bytesAvailable = bytes.Length - byteIndex;

            // NOTE(review): bytes is pinned through MemoryMarshal.GetReference rather than directly,
            // presumably so that a zero-length array still yields a usable pointer - confirm.
            fixed (char* pChars = s)
            fixed (byte* pBytes = &MemoryMarshal.GetReference((Span<byte>)bytes))
            {
                return GetBytes(pChars + charIndex, charCount, pBytes + byteIndex, bytesAvailable, null);
            }
        }

        // Encodes a range of characters in a character array into a range of bytes
        // in a byte array. An exception occurs if the byte array is not large
        // enough to hold the complete encoding of the characters. The
        // GetByteCount method can be used to determine the exact number of
        // bytes that will be produced for a given range of characters.
        // Alternatively, the GetMaxByteCount method can be used to
        // determine the maximum number of bytes that will be produced for a given
        // number of characters, regardless of the actual character values.
        //
        // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
        // So if you fix this, fix the others.
// Currently those include: EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
        // parent method is safe

        // Array overload: validates the arguments, then hands the selected character range and
        // the remaining space in bytes (from byteIndex to the end) to the pointer-based worker.
        // Returns 0 immediately when charCount is 0.
        public override unsafe int GetBytes(char[] chars, int charIndex, int charCount,
                                               byte[] bytes, int byteIndex)
        {
            if (chars == null)
            {
                throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
            }

            if (bytes == null)
            {
                throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
            }

            if (charIndex < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(charIndex), SR.ArgumentOutOfRange_NeedNonNegNum);
            }

            if (charCount < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
            }

            // Expressed as a subtraction so that charIndex + charCount can never overflow.
            if (charCount > chars.Length - charIndex)
            {
                throw new ArgumentOutOfRangeException(nameof(chars), SR.ArgumentOutOfRange_IndexCountBuffer);
            }

            if (byteIndex < 0 || byteIndex > bytes.Length)
            {
                throw new ArgumentOutOfRangeException(nameof(byteIndex), SR.ArgumentOutOfRange_Index);
            }

            // Nothing to encode; also avoids pinning an empty chars array.
            if (charCount == 0)
            {
                return 0;
            }

            // Not the size of the array: only the room left after byteIndex.
            int bytesAvailable = bytes.Length - byteIndex;

            fixed (char* pChars = chars)
            fixed (byte* pBytes = &MemoryMarshal.GetReference((Span<byte>)bytes))
            {
                return GetBytes(pChars + charIndex, charCount, pBytes + byteIndex, bytesAvailable, null);
            }
        }

        // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
        // So if you fix this, fix the others.
// Currently those include: EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding

        // Pointer overload of GetBytes: validates the pointers and counts, then calls the
        // worker overload with no encoder.
        [CLSCompliant(false)]
        public override unsafe int GetBytes(char* chars, int charCount, byte* bytes, int byteCount)
        {
            // bytes is reported first when both pointers are null.
            if (bytes == null)
            {
                throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
            }

            if (chars == null)
            {
                throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
            }

            if (charCount < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
            }

            if (byteCount < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_NeedNonNegNum);
            }

            return GetBytes(chars, charCount, bytes, byteCount, null);
        }

        // Returns the number of characters produced by decoding count bytes of the
        // byte array, starting at index.
        //
        // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
        // So if you fix this, fix the others.  Currently those include:
        // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
        // parent method is safe

        public override unsafe int GetCharCount(byte[] bytes, int index, int count)
        {
            if (bytes == null)
            {
                throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
            }

            if (index < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_NeedNonNegNum);
            }

            if (count < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
            }

            // Expressed as a subtraction so that index + count can never overflow.
            if (count > bytes.Length - index)
            {
                throw new ArgumentOutOfRangeException(nameof(bytes), SR.ArgumentOutOfRange_IndexCountBuffer);
            }

            // Nothing to count; also avoids pinning an empty array.
            if (count == 0)
            {
                return 0;
            }

            fixed (byte* pBytes = bytes)
            {
                return GetCharCount(pBytes + index, count, null);
            }
        }

        // Pointer overload of GetCharCount: validates, then calls the worker with no decoder.
        //
        // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
        // So if you fix this, fix the others.  Currently those include:
        // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding

        [CLSCompliant(false)]
        public override unsafe int GetCharCount(byte* bytes, int count)
        {
            if (bytes == null)
            {
                throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
            }

            if (count < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_NeedNonNegNum);
            }

            return GetCharCount(bytes, count, null);
        }

        // Decodes byteCount bytes of bytes, starting at byteIndex, into chars starting at
        // charIndex. Returns 0 immediately when byteCount is 0.
        //
        // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
        // So if you fix this, fix the others.  Currently those include:
        // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
        // parent method is safe

        public override unsafe int GetChars(byte[] bytes, int byteIndex, int byteCount,
                                              char[] chars, int charIndex)
        {
            // bytes is reported first when both arrays are null.
            if (bytes == null)
            {
                throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
            }

            if (chars == null)
            {
                throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
            }

            if (byteIndex < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(byteIndex), SR.ArgumentOutOfRange_NeedNonNegNum);
            }

            if (byteCount < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_NeedNonNegNum);
            }

            // Expressed as a subtraction so that byteIndex + byteCount can never overflow.
            if (byteCount > bytes.Length - byteIndex)
            {
                throw new ArgumentOutOfRangeException(nameof(bytes), SR.ArgumentOutOfRange_IndexCountBuffer);
            }

            if (charIndex < 0 || charIndex > chars.Length)
            {
                throw new ArgumentOutOfRangeException(nameof(charIndex), SR.ArgumentOutOfRange_Index);
            }

            // Nothing to decode; also avoids pinning an empty bytes array.
            if (byteCount == 0)
            {
                return 0;
            }

            // Not the size of the array: only the room left after charIndex.
            int charsAvailable = chars.Length - charIndex;

            fixed (byte* pBytes = bytes)
            fixed (char* pChars = &MemoryMarshal.GetReference((Span<char>)chars))
            {
                return GetChars(pBytes + byteIndex, byteCount, pChars + charIndex, charsAvailable, null);
            }
        }

        // Pointer overload of GetChars: validates, then calls the worker with no decoder.
        //
        // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
        // So if you fix this, fix the others.  Currently those include:
        // EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding

        [CLSCompliant(false)]
        public override unsafe int GetChars(byte* bytes, int byteCount, char* chars, int charCount)
        {
            // bytes is reported first when both pointers are null.
            if (bytes == null)
            {
                throw new ArgumentNullException(nameof(bytes), SR.ArgumentNull_Array);
            }

            if (chars == null)
            {
                throw new ArgumentNullException(nameof(chars), SR.ArgumentNull_Array);
            }

            // charCount is reported first when both counts are negative.
            if (charCount < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_NeedNonNegNum);
            }

            if (byteCount < 0)
            {
                throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_NeedNonNegNum);
            }

            return GetChars(bytes, byteCount, chars, charCount, null);
        }

        // Returns a string containing the decoded representation of a range of
        // bytes in a byte array.
        //
        // All of our public Encodings that don't use EncodingNLS must have this (including EncodingNLS)
        // So if you fix this, fix the others.
// Currently those include: EncodingNLS, UTF7Encoding, UTF8Encoding, UTF32Encoding, ASCIIEncoding, UnicodeEncoding
        // parent method is safe

        // Decodes count bytes of the array, starting at index, into a new string.
        // Throws ArgumentNullException when bytes is null and ArgumentOutOfRangeException when
        // index or count is negative or the range does not fit inside bytes.
        // Returns String.Empty without decoding when count is 0.
        public override unsafe string GetString(byte[] bytes, int index, int count)
        {
            // Validate Parameters
            if (bytes == null)
                throw new ArgumentNullException("bytes", SR.ArgumentNull_Array);

            if (index < 0 || count < 0)
                throw new ArgumentOutOfRangeException((index < 0 ? "index" : "count"), SR.ArgumentOutOfRange_NeedNonNegNum);

            // Written as a subtraction so that index + count cannot overflow
            if (bytes.Length - index < count)
                throw new ArgumentOutOfRangeException("bytes", SR.ArgumentOutOfRange_IndexCountBuffer);

            // Avoid problems with empty input buffer
            if (count == 0) return String.Empty;

            fixed (byte* pBytes = bytes)
                return String.CreateStringFromEncoding(
                    pBytes + index, count, this);
        }

        //
        // End of standard methods copied from EncodingNLS.cs
        //

        // Worker behind the public GetByteCount overloads: returns the number of bytes needed to
        // encode count UTF-16 code units starting at chars.
        //
        //   chars   - first code unit to count (asserted non-null).
        //   count   - number of code units (asserted non-negative).
        //   encoder - optional encoder state. When non-null, its pending high surrogate
        //             (_charLeftOver), its fallback buffer and its MustFlush flag are used;
        //             when null, the encoding's own encoderFallback is used and a trailing
        //             unpaired high surrogate is always fallen back.
        //
        // The count starts at 2 bytes per code unit and is then corrected: code units that are
        // handed to the fallback are un-counted (-2) and every char the fallback buffer
        // produces is counted (+2).
        //
        // Throws ArgumentOutOfRangeException when count * 2 overflows an int, and
        // ArgumentException when the encoder's fallback buffer still holds data on entry or
        // when falling back a trailing high surrogate ends with a high surrogate pending again.
        internal override unsafe int GetByteCount(char* chars, int count, EncoderNLS encoder)
        {
            Debug.Assert(chars != null, "[UnicodeEncoding.GetByteCount]chars!=null");
            Debug.Assert(count >= 0, "[UnicodeEncoding.GetByteCount]count >=0");

            // Start by assuming each char gets 2 bytes
            int byteCount = count << 1;

            // Check for overflow in byteCount
            // (If they were all invalid chars, this would actually be wrong,
            // but that's a ridiculously large # so we're not concerned about that case)
            if (byteCount < 0)
                throw new ArgumentOutOfRangeException(nameof(count), SR.ArgumentOutOfRange_GetByteCountOverflow);

            char* charStart = chars;
            char* charEnd = chars + count;
            char charLeftOver = (char)0;

            // Set once the trailing high surrogate has been handed to the fallback, so that a
            // second attempt throws instead of looping forever
            bool wasHereBefore = false;

            // Upper bound for the 4-chars-at-a-time scan below: the scan continues only while
            // longChars < charEnd - 3, i.e. while at least 4 chars remain to be read.
            // (Might not get to use this)
            ulong* longEnd = (ulong*)(charEnd - 3);

            // For fallback we may need a fallback buffer
            EncoderFallbackBuffer fallbackBuffer = null;
            char* charsForFallback;

            if (encoder != null)
            {
                charLeftOver = encoder._charLeftOver;

                // Assume extra bytes to encode charLeftOver if it existed
                if (charLeftOver > 0)
                    byteCount += 2;

                // We mustn't have left over fallback data when counting
                if (encoder.InternalHasFallbackBuffer)
                {
                    fallbackBuffer = encoder.FallbackBuffer;
                    if (fallbackBuffer.Remaining > 0)
                        throw new ArgumentException(SR.Format(SR.Argument_EncoderFallbackNotEmpty, this.EncodingName, encoder.Fallback.GetType()));

                    // Set our internal fallback interesting things.
                    fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, false);
                }
            }

            char ch;
        TryAgain:

            // Each iteration first asks the fallback buffer (if any) for a char; a result of 0
            // means it has nothing for us, in which case the next input char is read instead.
            while (((ch = (fallbackBuffer == null) ? (char)0 : fallbackBuffer.InternalGetNextChar()) != 0) || chars < charEnd)
            {
                // First unwind any fallback
                if (ch == 0)
                {
                    // No fallback, maybe we can do it fast
                    // The fast scan is compiled to run only when this encoding's byte order matches
                    // the BIGENDIAN build setting, no high surrogate is pending, and chars is
                    // aligned (8 bytes under BIT64, otherwise 4).
#if !NO_FAST_UNICODE_LOOP
#if BIGENDIAN       // If endianess is backwards then each pair of bytes would be backwards.
                    if ( bigEndian &&
#else
                    if (!bigEndian &&
#endif // BIGENDIAN

#if BIT64           // 64 bit CPU needs to be long aligned for this to work.
                          charLeftOver == 0 && (unchecked((long)chars) & 7) == 0)
#else
                          charLeftOver == 0 && (unchecked((int)chars) & 3) == 0)
#endif
                    {
                        // Need new char* so we can check 4 at a time
                        ulong* longChars = (ulong*)chars;

                        while (longChars < longEnd)
                        {
                            // See if we potentially have surrogates (0x8000 bit set)
                            // (We're either big endian on a big endian machine or little endian on
                            // a little endian machine so that'll work)
                            if ((0x8000800080008000 & *longChars) != 0)
                            {
                                // See if any of these are high or low surrogates (0xd800 - 0xdfff).  If the high
                                // 5 bits looks like 11011, then its a high or low surrogate.
                                // We do the & f800 to filter the 5 bits, then ^ d800 to ensure the 0 isn't set.
                                // Note that we expect BMP characters to be more common than surrogates
                                // & each char with 11111... then ^ with 11011.  Zeroes then indicate surrogates
                                ulong uTemp = (0xf800f800f800f800 & *longChars) ^ 0xd800d800d800d800;

                                // Check each of the 4 chars.  0 for those 16 bits means it was a surrogate
                                // but no clue if they're high or low.
                                // If each of the 4 characters are non-zero, then none are surrogates.
                                if ((uTemp & 0xFFFF000000000000) == 0 ||
                                    (uTemp & 0x0000FFFF00000000) == 0 ||
                                    (uTemp & 0x00000000FFFF0000) == 0 ||
                                    (uTemp & 0x000000000000FFFF) == 0)
                                {
                                    // It has at least 1 surrogate, but we don't know if they're high or low surrogates,
                                    // or if there's 1 or 4 surrogates

                                    // If they happen to be high/low/high/low, we may as well continue.  Check the next
                                    // bit to see if its set (low) or not (high) in the right pattern
#if BIGENDIAN
                                    if (((0xfc00fc00fc00fc00 & *longChars) ^ 0xd800dc00d800dc00) != 0)
#else
                                    if (((0xfc00fc00fc00fc00 & *longChars) ^ 0xdc00d800dc00d800) != 0)
#endif
                                    {
                                        // Either there weren't 4 surrogates, or the 0x0400 bit was set when a high
                                        // was hoped for or the 0x0400 bit wasn't set where a low was hoped for.

                                        // Drop out to the slow loop to resolve the surrogates
                                        break;
                                    }
                                    // else they are all surrogates in High/Low/High/Low order, so we can use them.
                                }
                                // else none are surrogates, so we can use them.
                            }
                            // else all < 0x8000 so we can use them

                            // We already counted these four chars, go to next long.
                            longChars++;
                        }

                        chars = (char*)longChars;

                        // The fast scan consumed all of the input; leave the outer loop
                        if (chars >= charEnd)
                            break;
                    }
#endif // !NO_FAST_UNICODE_LOOP

                    // No fallback, just get next char
                    ch = *chars;
                    chars++;
                }
                else
                {
                    // We weren't preallocating fallback space.
                    // (ch came from the fallback buffer, so it was not part of the initial count)
                    byteCount += 2;
                }

                // Check for high or low surrogates
                if (ch >= 0xd800 && ch <= 0xdfff)
                {
                    // Was it a high surrogate?
                    if (ch <= 0xdbff)
                    {
                        // It's a high surrogate, if we already had a high surrogate do its fallback
                        if (charLeftOver > 0)
                        {
                            // Unwind the current character, this should be safe because we
                            // don't have leftover data in the fallback, so chars must have
                            // advanced already.
                            Debug.Assert(chars > charStart,
                                "[UnicodeEncoding.GetByteCount]Expected chars to have advanced in unexpected high surrogate");
                            chars--;

                            // If previous high surrogate deallocate 2 bytes
                            byteCount -= 2;

                            // Fallback the previous surrogate
                            // Need to initialize fallback buffer?
                            if (fallbackBuffer == null)
                            {
                                if (encoder == null)
                                    fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
                                else
                                    fallbackBuffer = encoder.FallbackBuffer;

                                // Set our internal fallback interesting things.
                                fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, false);
                            }

                            charsForFallback = chars; // Avoid passing chars by reference to allow it to be enregistered
                            fallbackBuffer.InternalFallback(charLeftOver, ref charsForFallback);
                            chars = charsForFallback;

                            // Now no high surrogate left over
                            charLeftOver = (char)0;
                            continue;
                        }

                        // Remember this high surrogate
                        charLeftOver = ch;
                        continue;
                    }


                    // It's a low surrogate
                    if (charLeftOver == 0)
                    {
                        // Expected a previous high surrogate.
                        // Don't count this one (we'll count its fallback if necessary)
                        byteCount -= 2;

                        // fallback this one
                        // Need to initialize fallback buffer?
                        if (fallbackBuffer == null)
                        {
                            if (encoder == null)
                                fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
                            else
                                fallbackBuffer = encoder.FallbackBuffer;

                            // Set our internal fallback interesting things.
                            fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, false);
                        }
                        charsForFallback = chars; // Avoid passing chars by reference to allow it to be en-registered
                        fallbackBuffer.InternalFallback(ch, ref charsForFallback);
                        chars = charsForFallback;
                        continue;
                    }

                    // Valid surrogate pair: both halves are already counted, so just clear the
                    // pending high surrogate
                    charLeftOver = (char)0;
                    continue;
                }
                else if (charLeftOver > 0)
                {
                    // Expected a low surrogate, but this char is normal

                    // Rewind the current character, fallback previous character.
                    // this should be safe because we don't have leftover data in the
                    // fallback, so chars must have advanced already.
                    Debug.Assert(chars > charStart,
                        "[UnicodeEncoding.GetByteCount]Expected chars to have advanced when expected low surrogate");
                    chars--;

                    // fallback previous chars
                    // Need to initialize fallback buffer?
                    if (fallbackBuffer == null)
                    {
                        if (encoder == null)
                            fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
                        else
                            fallbackBuffer = encoder.FallbackBuffer;

                        // Set our internal fallback interesting things.
                        fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, false);
                    }
                    charsForFallback = chars; // Avoid passing chars by reference to allow it to be en-registered
                    fallbackBuffer.InternalFallback(charLeftOver, ref charsForFallback);
                    chars = charsForFallback;

                    // Ignore charLeftOver or throw
                    byteCount -= 2;
                    charLeftOver = (char)0;

                    continue;
                }

                // Ok we had something to add (already counted)
            }

            // Don't allocate space for left over char
            if (charLeftOver > 0)
            {
                byteCount -= 2;

                // If we have to flush, stick it in fallback and try again
                if (encoder == null || encoder.MustFlush)
                {
                    if (wasHereBefore)
                    {
                        // Throw it, using our complete character
                        throw new ArgumentException(
                            SR.Format(SR.Argument_RecursiveFallback, charLeftOver), nameof(chars));
                    }
                    else
                    {
                        // Need to initialize fallback buffer?
                        if (fallbackBuffer == null)
                        {
                            if (encoder == null)
                                fallbackBuffer = this.encoderFallback.CreateFallbackBuffer();
                            else
                                fallbackBuffer = encoder.FallbackBuffer;

                            // Set our internal fallback interesting things.
                            fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, false);
                        }
                        charsForFallback = chars; // Avoid passing chars by reference to allow it to be en-registered
                        fallbackBuffer.InternalFallback(charLeftOver, ref charsForFallback);
                        chars = charsForFallback;
                        charLeftOver = (char)0;
                        wasHereBefore = true;

                        // Re-enter the loop so the chars produced by the fallback get counted
                        goto TryAgain;
                    }
                }
            }

            // Shouldn't have anything in fallback buffer for GetByteCount
            // (don't have to check _throwOnOverflow for count)
            Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0,
                "[UnicodeEncoding.GetByteCount]Expected empty fallback buffer at end");

            // Don't remember fallbackBuffer.encoder for counting
            return byteCount;
        }

        internal override unsafe int GetBytes(char* chars, int charCount,
                                                byte* bytes, int byteCount, EncoderNLS encoder)
        {
            Debug.Assert(chars != null, "[UnicodeEncoding.GetBytes]chars!=null");
            Debug.Assert(byteCount >= 0, "[UnicodeEncoding.GetBytes]byteCount >=0");
            Debug.Assert(charCount >= 0, "[UnicodeEncoding.GetBytes]charCount >=0");
            Debug.Assert(bytes != null, "[UnicodeEncoding.GetBytes]bytes!=null");

            char charLeftOver = (char)0;
            char ch;
            bool wasHereBefore = false;


            byte* byteEnd = bytes + byteCount;
            char* charEnd = chars + charCount;
            byte* byteStart = bytes;
            char* charStart = chars;

            // For fallback we may need a fallback buffer
            EncoderFallbackBuffer fallbackBuffer = null;
            char* charsForFallback;

            // Get our encoder, but don't clear it yet.
677 if (encoder != null) 678 { 679 charLeftOver = encoder._charLeftOver; 680 681 // We mustn't have left over fallback data when counting 682 if (encoder.InternalHasFallbackBuffer) 683 { 684 // We always need the fallback buffer in get bytes so we can flush any remaining ones if necessary 685 fallbackBuffer = encoder.FallbackBuffer; 686 if (fallbackBuffer.Remaining > 0 && encoder._throwOnOverflow) 687 throw new ArgumentException(SR.Format(SR.Argument_EncoderFallbackNotEmpty, this.EncodingName, encoder.Fallback.GetType())); 688 689 // Set our internal fallback interesting things. 690 fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, false); 691 } 692 } 693 694 TryAgain: 695 while (((ch = (fallbackBuffer == null) ? 696 (char)0 : fallbackBuffer.InternalGetNextChar()) != 0) || 697 chars < charEnd) 698 { 699 // First unwind any fallback 700 if (ch == 0) 701 { 702 // No fallback, maybe we can do it fast 703 #if !NO_FAST_UNICODE_LOOP 704 #if BIGENDIAN // If endianess is backwards then each pair of bytes would be backwards. 705 if ( bigEndian && 706 #else 707 if (!bigEndian && 708 #endif // BIGENDIAN 709 #if BIT64 // 64 bit CPU needs to be long aligned for this to work, 32 bit CPU needs to be 32 bit aligned 710 (unchecked((long)chars) & 7) == 0 && (unchecked((long)bytes) & 7) == 0 && 711 #else 712 (unchecked((int)chars) & 3) == 0 && (unchecked((int)bytes) & 3) == 0 && 713 #endif // BIT64 714 charLeftOver == 0) 715 { 716 // Need -1 to check 2 at a time. If we have an even #, longChars will go 717 // from longEnd - 1/2 long to longEnd + 1/2 long. If we're odd, longChars 718 // will go from longEnd - 1 long to longEnd. (Might not get to use this) 719 // We can only go iCount units (limited by shorter of char or byte buffers. 720 ulong* longEnd = (ulong*)(chars - 3 + 721 (((byteEnd - bytes) >> 1 < charEnd - chars) ? 
722 (byteEnd - bytes) >> 1 : charEnd - chars)); 723 724 // Need new char* so we can check 4 at a time 725 ulong* longChars = (ulong*)chars; 726 ulong* longBytes = (ulong*)bytes; 727 728 while (longChars < longEnd) 729 { 730 // See if we potentially have surrogates (0x8000 bit set) 731 // (We're either big endian on a big endian machine or little endian on 732 // a little endian machine so that'll work) 733 if ((0x8000800080008000 & *longChars) != 0) 734 { 735 // See if any of these are high or low surrogates (0xd800 - 0xdfff). If the high 736 // 5 bits looks like 11011, then its a high or low surrogate. 737 // We do the & f800 to filter the 5 bits, then ^ d800 to ensure the 0 isn't set. 738 // Note that we expect BMP characters to be more common than surrogates 739 // & each char with 11111... then ^ with 11011. Zeroes then indicate surrogates 740 ulong uTemp = (0xf800f800f800f800 & *longChars) ^ 0xd800d800d800d800; 741 742 // Check each of the 4 chars. 0 for those 16 bits means it was a surrogate 743 // but no clue if they're high or low. 744 // If each of the 4 characters are non-zero, then none are surrogates. 745 if ((uTemp & 0xFFFF000000000000) == 0 || 746 (uTemp & 0x0000FFFF00000000) == 0 || 747 (uTemp & 0x00000000FFFF0000) == 0 || 748 (uTemp & 0x000000000000FFFF) == 0) 749 { 750 // It has at least 1 surrogate, but we don't know if they're high or low surrogates, 751 // or if there's 1 or 4 surrogates 752 753 // If they happen to be high/low/high/low, we may as well continue. Check the next 754 // bit to see if its set (low) or not (high) in the right pattern 755 #if BIGENDIAN 756 if (((0xfc00fc00fc00fc00 & *longChars) ^ 0xd800dc00d800dc00) != 0) 757 #else 758 if (((0xfc00fc00fc00fc00 & *longChars) ^ 0xdc00d800dc00d800) != 0) 759 #endif 760 { 761 // Either there weren't 4 surrogates, or the 0x0400 bit was set when a high 762 // was hoped for or the 0x0400 bit wasn't set where a low was hoped for. 
763 764 // Drop out to the slow loop to resolve the surrogates 765 break; 766 } 767 // else they are all surrogates in High/Low/High/Low order, so we can use them. 768 } 769 // else none are surrogates, so we can use them. 770 } 771 // else all < 0x8000 so we can use them 772 773 // We can use these 4 chars. 774 *longBytes = *longChars; 775 longChars++; 776 longBytes++; 777 } 778 779 chars = (char*)longChars; 780 bytes = (byte*)longBytes; 781 782 if (chars >= charEnd) 783 break; 784 } 785 // Not aligned, but maybe we can still be somewhat faster 786 // Also somehow this optimizes the above loop? It seems to cause something above 787 // to get enregistered, but I haven't figured out how to make that happen without this loop. 788 else if ((charLeftOver == 0) && 789 #if BIGENDIAN 790 bigEndian && 791 #else 792 !bigEndian && 793 #endif // BIGENDIAN 794 795 #if BIT64 796 (unchecked((long)chars) & 7) != (unchecked((long)bytes) & 7) && // Only do this if chars & bytes are out of line, otherwise faster loop will be faster next time 797 #else 798 (unchecked((int)chars) & 3) != (unchecked((int)bytes) & 3) && // Only do this if chars & bytes are out of line, otherwise faster loop will be faster next time 799 #endif // BIT64 800 (unchecked((int)(bytes)) & 1) == 0) 801 { 802 // # to use 803 long iCount = ((byteEnd - bytes) >> 1 < charEnd - chars) ? 
804 (byteEnd - bytes) >> 1 : charEnd - chars; 805 806 // Need new char* 807 char* charOut = ((char*)bytes); // a char* for our output 808 char* tempEnd = chars + iCount - 1; // Our end pointer 809 810 while (chars < tempEnd) 811 { 812 if (*chars >= (char)0xd800 && *chars <= (char)0xdfff) 813 { 814 // break for fallback for low surrogate 815 if (*chars >= 0xdc00) 816 break; 817 818 // break if next one's not a low surrogate (will do fallback) 819 if (*(chars + 1) < 0xdc00 || *(chars + 1) > 0xdfff) 820 break; 821 822 // They both exist, use them 823 } 824 // If 2nd char is surrogate & this one isn't then only add one 825 else if (*(chars + 1) >= (char)0xd800 && *(chars + 1) <= 0xdfff) 826 { 827 *charOut = *chars; 828 charOut++; 829 chars++; 830 continue; 831 } 832 833 *charOut = *chars; 834 *(charOut + 1) = *(chars + 1); 835 charOut += 2; 836 chars += 2; 837 } 838 839 bytes = (byte*)charOut; 840 841 if (chars >= charEnd) 842 break; 843 } 844 #endif // !NO_FAST_UNICODE_LOOP 845 846 // No fallback, just get next char 847 ch = *chars; 848 chars++; 849 } 850 851 // Check for high or low surrogates 852 if (ch >= 0xd800 && ch <= 0xdfff) 853 { 854 // Was it a high surrogate? 855 if (ch <= 0xdbff) 856 { 857 // Its a high surrogate, see if we already had a high surrogate 858 if (charLeftOver > 0) 859 { 860 // Unwind the current character, this should be safe because we 861 // don't have leftover data in the fallback, so chars must have 862 // advanced already. 863 Debug.Assert(chars > charStart, 864 "[UnicodeEncoding.GetBytes]Expected chars to have advanced in unexpected high surrogate"); 865 chars--; 866 867 // Fallback the previous surrogate 868 // Might need to create our fallback buffer 869 if (fallbackBuffer == null) 870 { 871 if (encoder == null) 872 fallbackBuffer = this.encoderFallback.CreateFallbackBuffer(); 873 else 874 fallbackBuffer = encoder.FallbackBuffer; 875 876 // Set our internal fallback interesting things. 
877 fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, true); 878 } 879 880 charsForFallback = chars; // Avoid passing chars by reference to allow it to be en-registered 881 fallbackBuffer.InternalFallback(charLeftOver, ref charsForFallback); 882 chars = charsForFallback; 883 884 charLeftOver = (char)0; 885 continue; 886 } 887 888 // Remember this high surrogate 889 charLeftOver = ch; 890 continue; 891 } 892 893 // Its a low surrogate 894 if (charLeftOver == 0) 895 { 896 // We'll fall back this one 897 // Might need to create our fallback buffer 898 if (fallbackBuffer == null) 899 { 900 if (encoder == null) 901 fallbackBuffer = this.encoderFallback.CreateFallbackBuffer(); 902 else 903 fallbackBuffer = encoder.FallbackBuffer; 904 905 // Set our internal fallback interesting things. 906 fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, true); 907 } 908 909 charsForFallback = chars; // Avoid passing chars by reference to allow it to be en-registered 910 fallbackBuffer.InternalFallback(ch, ref charsForFallback); 911 chars = charsForFallback; 912 continue; 913 } 914 915 // Valid surrogate pair, add our charLeftOver 916 if (bytes + 3 >= byteEnd) 917 { 918 // Not enough room to add this surrogate pair 919 if (fallbackBuffer != null && fallbackBuffer.bFallingBack) 920 { 921 // These must have both been from the fallbacks. 922 // Both of these MUST have been from a fallback because if the 1st wasn't 923 // from a fallback, then a high surrogate followed by an illegal char 924 // would've caused the high surrogate to fall back. If a high surrogate 925 // fell back, then it was consumed and both chars came from the fallback. 
926 fallbackBuffer.MovePrevious(); // Didn't use either fallback surrogate 927 fallbackBuffer.MovePrevious(); 928 } 929 else 930 { 931 // If we don't have enough room, then either we should've advanced a while 932 // or we should have bytes==byteStart and throw below 933 Debug.Assert(chars > charStart + 1 || bytes == byteStart, 934 "[UnicodeEncoding.GetBytes]Expected chars to have when no room to add surrogate pair"); 935 chars -= 2; // Didn't use either surrogate 936 } 937 ThrowBytesOverflow(encoder, bytes == byteStart); // Throw maybe (if no bytes written) 938 charLeftOver = (char)0; // we'll retry it later 939 break; // Didn't throw, but stop 'til next time. 940 } 941 942 if (bigEndian) 943 { 944 *(bytes++) = (byte)(charLeftOver >> 8); 945 *(bytes++) = (byte)charLeftOver; 946 } 947 else 948 { 949 *(bytes++) = (byte)charLeftOver; 950 *(bytes++) = (byte)(charLeftOver >> 8); 951 } 952 953 charLeftOver = (char)0; 954 } 955 else if (charLeftOver > 0) 956 { 957 // Expected a low surrogate, but this char is normal 958 959 // Rewind the current character, fallback previous character. 960 // this should be safe because we don't have leftover data in the 961 // fallback, so chars must have advanced already. 962 Debug.Assert(chars > charStart, 963 "[UnicodeEncoding.GetBytes]Expected chars to have advanced after expecting low surrogate"); 964 chars--; 965 966 // fallback previous chars 967 // Might need to create our fallback buffer 968 if (fallbackBuffer == null) 969 { 970 if (encoder == null) 971 fallbackBuffer = this.encoderFallback.CreateFallbackBuffer(); 972 else 973 fallbackBuffer = encoder.FallbackBuffer; 974 975 // Set our internal fallback interesting things. 
976 fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, true); 977 } 978 979 charsForFallback = chars; // Avoid passing chars by reference to allow it to be en-registered 980 fallbackBuffer.InternalFallback(charLeftOver, ref charsForFallback); 981 chars = charsForFallback; 982 983 // Ignore charLeftOver or throw 984 charLeftOver = (char)0; 985 continue; 986 } 987 988 // Ok, we have a char to add 989 if (bytes + 1 >= byteEnd) 990 { 991 // Couldn't add this char 992 if (fallbackBuffer != null && fallbackBuffer.bFallingBack) 993 fallbackBuffer.MovePrevious(); // Not using this fallback char 994 else 995 { 996 // Lonely charLeftOver (from previous call) would've been caught up above, 997 // so this must be a case where we've already read an input char. 998 Debug.Assert(chars > charStart, 999 "[UnicodeEncoding.GetBytes]Expected chars to have advanced for failed fallback"); 1000 chars--; // Not using this char 1001 } 1002 ThrowBytesOverflow(encoder, bytes == byteStart); // Throw maybe (if no bytes written) 1003 break; // didn't throw, just stop 1004 } 1005 1006 if (bigEndian) 1007 { 1008 *(bytes++) = (byte)(ch >> 8); 1009 *(bytes++) = (byte)ch; 1010 } 1011 else 1012 { 1013 *(bytes++) = (byte)ch; 1014 *(bytes++) = (byte)(ch >> 8); 1015 } 1016 } 1017 1018 // Don't allocate space for left over char 1019 if (charLeftOver > 0) 1020 { 1021 // If we aren't flushing we need to fall this back 1022 if (encoder == null || encoder.MustFlush) 1023 { 1024 if (wasHereBefore) 1025 { 1026 // Throw it, using our complete character 1027 throw new ArgumentException( 1028 SR.Format(SR.Argument_RecursiveFallback, charLeftOver), nameof(chars)); 1029 } 1030 else 1031 { 1032 // If we have to flush, stick it in fallback and try again 1033 // Might need to create our fallback buffer 1034 if (fallbackBuffer == null) 1035 { 1036 if (encoder == null) 1037 fallbackBuffer = this.encoderFallback.CreateFallbackBuffer(); 1038 else 1039 fallbackBuffer = encoder.FallbackBuffer; 1040 1041 // Set 
// our internal fallback interesting things.
                    fallbackBuffer.InternalInitialize(charStart, charEnd, encoder, true);
                }

                // If we're not flushing, that'll remember the left over character.
                charsForFallback = chars; // Avoid passing chars by reference to allow it to be en-registered
                fallbackBuffer.InternalFallback(charLeftOver, ref charsForFallback);
                chars = charsForFallback;

                charLeftOver = (char)0;
                wasHereBefore = true;
                goto TryAgain;
            }
        }
    }

    // Not flushing, remember it in the encoder
    if (encoder != null)
    {
        encoder._charLeftOver = charLeftOver;
        encoder._charsUsed = (int)(chars - charStart);
    }

    // Remember charLeftOver if we must, or clear it if we're flushing
    // (charLeftOver should be 0 if we're flushing)
    Debug.Assert((encoder != null && !encoder.MustFlush) || charLeftOver == (char)0,
        "[UnicodeEncoding.GetBytes] Expected no left over characters if flushing");

    Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0 ||
        encoder == null || !encoder._throwOnOverflow,
        "[UnicodeEncoding.GetBytes]Expected empty fallback buffer if not converting");

    // We used to copy it fast, but this doesn't check for surrogates
    // System.IO.__UnmanagedMemoryStream.memcpyimpl(bytes, (byte*)chars, usedByteCount);

    return (int)(bytes - byteStart);
}

// Counts the chars that decoding `count` bytes starting at `bytes` would produce.
//
//   bytes       - first input byte; asserted non-null.
//   count       - number of input bytes; asserted non-negative.
//   baseDecoder - optional stateful decoder (may be null). When present, its pending
//                 lastByte / lastChar are read as the starting state. This method only
//                 reads those two fields; it never writes them back.
//
// Returns the char count: one char per 2 input bytes, adjusted for state carried in from
// the decoder, for whatever the decoder fallback reports for lone surrogates, and (when
// decoder is null or decoder.MustFlush) for the fallback of a dangling high surrogate or
// odd trailing byte. A high surrogate that is still pending when not flushing is not counted.
internal override unsafe int GetCharCount(byte* bytes, int count, DecoderNLS baseDecoder)
{
    Debug.Assert(bytes != null, "[UnicodeEncoding.GetCharCount]bytes!=null");
    Debug.Assert(count >= 0, "[UnicodeEncoding.GetCharCount]count >=0");

    UnicodeEncoding.Decoder decoder = (UnicodeEncoding.Decoder)baseDecoder;

    byte* byteEnd = bytes + count;
    byte* byteStart = bytes;

    // Need last vars
    // lastByte == -1 means "no pending byte"; lastChar == 0 means "no pending high surrogate".
    int lastByte = -1;
    char lastChar = (char)0;

    // Start by assuming same # of chars as bytes
    // (i.e. one char for every 2 input bytes; the loop below only corrects this estimate)
    int charCount = count >> 1;

    // Need -1 to check 2 at a time. If we have an even #, longBytes will go
    // from longEnd - 1/2 long to longEnd + 1/2 long. If we're odd, longBytes
    // will go from longEnd - 1 long to longEnd. (Might not get to use this)
    ulong* longEnd = (ulong*)(byteEnd - 7);

    // For fallback we may need a fallback buffer
    DecoderFallbackBuffer fallbackBuffer = null;

    if (decoder != null)
    {
        lastByte = decoder.lastByte;
        lastChar = decoder.lastChar;

        // Assume extra char if last char was around
        if (lastChar > 0)
            charCount++;

        // Assume extra char if extra last byte makes up odd # of input bytes
        if (lastByte >= 0 && (count & 1) == 1)
        {
            charCount++;
        }

        // Shouldn't have anything in fallback buffer for GetCharCount
        // (don't have to check _throwOnOverflow for count)
        Debug.Assert(!decoder.InternalHasFallbackBuffer || decoder.FallbackBuffer.Remaining == 0,
            "[UnicodeEncoding.GetCharCount]Expected empty fallback buffer at start");
    }

    while (bytes < byteEnd)
    {
        // If we're aligned then maybe we can do it fast
        // That'll hurt if we're unaligned because we'll always test but never be aligned
        // The fast path is only entered when the encoding's byte order matches the BIGENDIAN
        // build symbol, the pointer is suitably aligned, and no state is pending.
#if !NO_FAST_UNICODE_LOOP
#if BIGENDIAN
        if (bigEndian &&
#else // BIGENDIAN
        if (!bigEndian &&
#endif // BIGENDIAN
#if BIT64 // win64 has to be long aligned
            (unchecked((long)bytes) & 7) == 0 &&
#else
            (unchecked((int)bytes) & 3) == 0 &&
#endif // BIT64
            lastByte == -1 && lastChar == 0)
        {
            // Need new char* so we can check 4 at a time
            ulong* longBytes = (ulong*)bytes;

            while (longBytes < longEnd)
            {
                // See if we potentially have surrogates (0x8000 bit set)
                // (We're either big endian on a big endian machine or little endian on
                // a little endian machine so that'll work)
                if ((0x8000800080008000 & *longBytes) != 0)
                {
                    // See if any of these are high or low surrogates (0xd800 - 0xdfff). If the high
                    // 5 bits looks like 11011, then its a high or low surrogate.
                    // We do the & f800 to filter the 5 bits, then ^ d800 to ensure the 0 isn't set.
                    // Note that we expect BMP characters to be more common than surrogates
                    // & each char with 11111... then ^ with 11011. Zeroes then indicate surrogates
                    ulong uTemp = (0xf800f800f800f800 & *longBytes) ^ 0xd800d800d800d800;

                    // Check each of the 4 chars. 0 for those 16 bits means it was a surrogate
                    // but no clue if they're high or low.
                    // If each of the 4 characters are non-zero, then none are surrogates.
                    if ((uTemp & 0xFFFF000000000000) == 0 ||
                        (uTemp & 0x0000FFFF00000000) == 0 ||
                        (uTemp & 0x00000000FFFF0000) == 0 ||
                        (uTemp & 0x000000000000FFFF) == 0)
                    {
                        // It has at least 1 surrogate, but we don't know if they're high or low surrogates,
                        // or if there's 1 or 4 surrogates

                        // If they happen to be high/low/high/low, we may as well continue. Check the next
                        // bit to see if its set (low) or not (high) in the right pattern
#if BIGENDIAN
                        if (((0xfc00fc00fc00fc00 & *longBytes) ^ 0xd800dc00d800dc00) != 0)
#else
                        if (((0xfc00fc00fc00fc00 & *longBytes) ^ 0xdc00d800dc00d800) != 0)
#endif
                        {
                            // Either there weren't 4 surrogates, or the 0x0400 bit was set when a high
                            // was hoped for or the 0x0400 bit wasn't set where a low was hoped for.

                            // Drop out to the slow loop to resolve the surrogates
                            break;
                        }
                        // else they are all surrogates in High/Low/High/Low order, so we can use them.
                    }
                    // else none are surrogates, so we can use them.
                }
                // else all < 0x8000 so we can use them

                // We can use these 4 chars.
                // (nothing to adjust: these 8 bytes are already in the count >> 1 estimate)
                longBytes++;
            }

            bytes = (byte*)longBytes;

            if (bytes >= byteEnd)
                break;
        }
#endif // !NO_FAST_UNICODE_LOOP

        // Get 1st byte
        if (lastByte < 0)
        {
            lastByte = *bytes++;
            // Input ended on an odd byte: leave it in lastByte for the flush handling below.
            if (bytes >= byteEnd) break;
        }

        // Get full char
        char ch;
        if (bigEndian)
        {
            ch = (char)(lastByte << 8 | *(bytes++));
        }
        else
        {
            ch = (char)(*(bytes++) << 8 | lastByte);
        }
        lastByte = -1;

        // See if the char's valid
        if (ch >= 0xd800 && ch <= 0xdfff)
        {
            // Was it a high surrogate?
            if (ch <= 0xdbff)
            {
                // Its a high surrogate, if we had one then do fallback for previous one
                if (lastChar > 0)
                {
                    // Ignore previous bad high surrogate
                    charCount--;

                    // Get fallback for previous high surrogate
                    // Note we have to reconstruct bytes because some may have been in decoder
                    byte[] byteBuffer = null;
                    if (bigEndian)
                    {
                        byteBuffer = new byte[]
                            { unchecked((byte)(lastChar >> 8)), unchecked((byte)lastChar) };
                    }
                    else
                    {
                        byteBuffer = new byte[]
                            { unchecked((byte)lastChar), unchecked((byte)(lastChar >> 8)) };
                    }

                    if (fallbackBuffer == null)
                    {
                        if (decoder == null)
                            fallbackBuffer = this.decoderFallback.CreateFallbackBuffer();
                        else
                            fallbackBuffer = decoder.FallbackBuffer;

                        // Set our internal fallback interesting things.
                        fallbackBuffer.InternalInitialize(byteStart, null);
                    }

                    // Get fallback.
                    charCount += fallbackBuffer.InternalFallback(byteBuffer, bytes);
                }

                // Ignore the last one which fell back already,
                // and remember the new high surrogate
                lastChar = ch;
                continue;
            }

            // Its a low surrogate
            if (lastChar == 0)
            {
                // Expected a previous high surrogate
                charCount--;

                // Get fallback for this low surrogate
                // Note we have to reconstruct bytes because some may have been in decoder
                byte[] byteBuffer = null;
                if (bigEndian)
                {
                    byteBuffer = new byte[]
                        { unchecked((byte)(ch >> 8)), unchecked((byte)ch) };
                }
                else
                {
                    byteBuffer = new byte[]
                        { unchecked((byte)ch), unchecked((byte)(ch >> 8)) };
                }

                if (fallbackBuffer == null)
                {
                    if (decoder == null)
                        fallbackBuffer = this.decoderFallback.CreateFallbackBuffer();
                    else
                        fallbackBuffer = decoder.FallbackBuffer;

                    // Set our internal fallback interesting things.
                    fallbackBuffer.InternalInitialize(byteStart, null);
                }

                charCount += fallbackBuffer.InternalFallback(byteBuffer, bytes);

                // Ignore this one (we already did its fallback)
                continue;
            }

            // Valid surrogate pair, already counted.
            lastChar = (char)0;
        }
        else if (lastChar > 0)
        {
            // Had a high surrogate, expected a low surrogate
            // Un-count the last high surrogate
            charCount--;

            // fall back the high surrogate.
            byte[] byteBuffer = null;
            if (bigEndian)
            {
                byteBuffer = new byte[]
                    { unchecked((byte)(lastChar >> 8)), unchecked((byte)lastChar) };
            }
            else
            {
                byteBuffer = new byte[]
                    { unchecked((byte)lastChar), unchecked((byte)(lastChar >> 8)) };
            }

            if (fallbackBuffer == null)
            {
                if (decoder == null)
                    fallbackBuffer = this.decoderFallback.CreateFallbackBuffer();
                else
                    fallbackBuffer = decoder.FallbackBuffer;

                // Set our internal fallback interesting things.
                fallbackBuffer.InternalInitialize(byteStart, null);
            }

            // Already subtracted high surrogate
            charCount += fallbackBuffer.InternalFallback(byteBuffer, bytes);

            // Not left over now, clear previous high surrogate and continue to add current char
            lastChar = (char)0;
        }

        // Valid char, already counted
    }

    // Extra space if we can't use decoder
    // (no decoder, or a flushing decoder: pending state cannot be carried to a later call)
    if (decoder == null || decoder.MustFlush)
    {
        if (lastChar > 0)
        {
            // No hanging high surrogates allowed, do fallback and remove count for it
            charCount--;
            byte[] byteBuffer = null;
            if (bigEndian)
            {
                byteBuffer = new byte[]
                    { unchecked((byte)(lastChar >> 8)), unchecked((byte)lastChar) };
            }
            else
            {
                byteBuffer = new byte[]
                    { unchecked((byte)lastChar), unchecked((byte)(lastChar >> 8)) };
            }

            if (fallbackBuffer == null)
            {
                if (decoder == null)
                    fallbackBuffer = this.decoderFallback.CreateFallbackBuffer();
                else
                    fallbackBuffer = decoder.FallbackBuffer;

                // Set our internal fallback interesting things.
                fallbackBuffer.InternalInitialize(byteStart, null);
            }

            charCount += fallbackBuffer.InternalFallback(byteBuffer, bytes);

            lastChar = (char)0;
        }

        if (lastByte >= 0)
        {
            if (fallbackBuffer == null)
            {
                if (decoder == null)
                    fallbackBuffer = this.decoderFallback.CreateFallbackBuffer();
                else
                    fallbackBuffer = decoder.FallbackBuffer;

                // Set our internal fallback interesting things.
                fallbackBuffer.InternalInitialize(byteStart, null);
            }

            // No hanging odd bytes allowed if must flush
            charCount += fallbackBuffer.InternalFallback(new byte[] { unchecked((byte)lastByte) }, bytes);
            lastByte = -1;
        }
    }

    // If we had a high surrogate left over, we can't count it
    // (only reachable when not flushing; the flush branch above resets lastChar to 0)
    if (lastChar > 0)
        charCount--;

    // Shouldn't have anything in fallback buffer for GetCharCount
    // (don't have to check _throwOnOverflow for count)
    Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0,
        "[UnicodeEncoding.GetCharCount]Expected empty fallback buffer at end");

    return charCount;
}

// Decodes up to `byteCount` bytes at `bytes` into UTF-16 chars written at `chars`.
//
//   bytes / byteCount - input buffer; asserted non-null / non-negative.
//   chars / charCount - output buffer and its capacity in chars; asserted non-null / non-negative.
//   baseDecoder       - optional stateful decoder (may be null). Its lastByte / lastChar seed
//                       the loop, and on exit _bytesUsed, lastChar and lastByte are stored back.
//
// Returns the number of chars written (chars - charStart).
//
// When the output buffer cannot take the next char, surrogate pair or fallback result, `bytes`
// is rewound over the unconsumed input, ThrowCharsOverflow is called, and the loop stops if it
// returns. NOTE(review): ThrowCharsOverflow is defined outside this chunk; per the inline
// comments it throws only when no chars were output - confirm against its definition.
internal override unsafe int GetChars(byte* bytes, int byteCount,
    char* chars, int charCount, DecoderNLS baseDecoder)
{
    Debug.Assert(chars != null, "[UnicodeEncoding.GetChars]chars!=null");
    Debug.Assert(byteCount >= 0, "[UnicodeEncoding.GetChars]byteCount >=0");
    Debug.Assert(charCount >= 0, "[UnicodeEncoding.GetChars]charCount >=0");
    Debug.Assert(bytes != null, "[UnicodeEncoding.GetChars]bytes!=null");

    UnicodeEncoding.Decoder decoder = (UnicodeEncoding.Decoder)baseDecoder;

    // Need last vars
    // lastByte == -1 means "no pending byte"; lastChar == 0 means "no pending high surrogate".
    int lastByte = -1;
    char lastChar = (char)0;

    // Get our decoder (but don't clear it yet)
    if (decoder != null)
    {
        lastByte = decoder.lastByte;
        lastChar = decoder.lastChar;

        // Shouldn't have anything in fallback buffer for GetChars
        // (don't have to check _throwOnOverflow for chars)
        Debug.Assert(!decoder.InternalHasFallbackBuffer || decoder.FallbackBuffer.Remaining == 0,
            "[UnicodeEncoding.GetChars]Expected empty fallback buffer at start");
    }

    // For fallback we may need a fallback buffer
    DecoderFallbackBuffer fallbackBuffer = null;
    char* charsForFallback;

    byte* byteEnd = bytes + byteCount;
    char* charEnd = chars + charCount;
    byte* byteStart = bytes;
    char* charStart = chars;

    while (bytes < byteEnd)
    {
        // If we're aligned then maybe we can do it fast
        // That'll hurt if we're unaligned because we'll always test but never be aligned
        // The fast path needs both pointers aligned, the encoding's byte order matching the
        // BIGENDIAN build symbol, and no pending byte or high surrogate.
#if !NO_FAST_UNICODE_LOOP
#if BIGENDIAN
        if (bigEndian &&
#else // BIGENDIAN
        if (!bigEndian &&
#endif // BIGENDIAN
#if BIT64 // win64 has to be long aligned
            (unchecked((long)chars) & 7) == 0 && (unchecked((long)bytes) & 7) == 0 &&
#else
            (unchecked((int)chars) & 3) == 0 && (unchecked((int)bytes) & 3) == 0 &&
#endif // BIT64
            lastByte == -1 && lastChar == 0)
        {
            // Need -1 to check 2 at a time. If we have an even #, longChars will go
            // from longEnd - 1/2 long to longEnd + 1/2 long. If we're odd, longChars
            // will go from longEnd - 1 long to longEnd. (Might not get to use this)
            // We can only go iCount units (limited by shorter of char or byte buffers.
            ulong* longEnd = (ulong*)(bytes - 7 +
                (((byteEnd - bytes) >> 1 < charEnd - chars) ?
                    (byteEnd - bytes) : (charEnd - chars) << 1));

            // Need new char* so we can check 4 at a time
            ulong* longBytes = (ulong*)bytes;
            ulong* longChars = (ulong*)chars;

            while (longBytes < longEnd)
            {
                // See if we potentially have surrogates (0x8000 bit set)
                // (We're either big endian on a big endian machine or little endian on
                // a little endian machine so that'll work)
                if ((0x8000800080008000 & *longBytes) != 0)
                {
                    // See if any of these are high or low surrogates (0xd800 - 0xdfff). If the high
                    // 5 bits looks like 11011, then its a high or low surrogate.
                    // We do the & f800 to filter the 5 bits, then ^ d800 to ensure the 0 isn't set.
                    // Note that we expect BMP characters to be more common than surrogates
                    // & each char with 11111... then ^ with 11011. Zeroes then indicate surrogates
                    ulong uTemp = (0xf800f800f800f800 & *longBytes) ^ 0xd800d800d800d800;

                    // Check each of the 4 chars. 0 for those 16 bits means it was a surrogate
                    // but no clue if they're high or low.
                    // If each of the 4 characters are non-zero, then none are surrogates.
                    if ((uTemp & 0xFFFF000000000000) == 0 ||
                        (uTemp & 0x0000FFFF00000000) == 0 ||
                        (uTemp & 0x00000000FFFF0000) == 0 ||
                        (uTemp & 0x000000000000FFFF) == 0)
                    {
                        // It has at least 1 surrogate, but we don't know if they're high or low surrogates,
                        // or if there's 1 or 4 surrogates

                        // If they happen to be high/low/high/low, we may as well continue. Check the next
                        // bit to see if its set (low) or not (high) in the right pattern
#if BIGENDIAN
                        if (((0xfc00fc00fc00fc00 & *longBytes) ^ 0xd800dc00d800dc00) != 0)
#else
                        if (((0xfc00fc00fc00fc00 & *longBytes) ^ 0xdc00d800dc00d800) != 0)
#endif
                        {
                            // Either there weren't 4 surrogates, or the 0x0400 bit was set when a high
                            // was hoped for or the 0x0400 bit wasn't set where a low was hoped for.

                            // Drop out to the slow loop to resolve the surrogates
                            break;
                        }
                        // else they are all surrogates in High/Low/High/Low order, so we can use them.
                    }
                    // else none are surrogates, so we can use them.
                }
                // else all < 0x8000 so we can use them

                // We can use these 4 chars.
                *longChars = *longBytes;
                longBytes++;
                longChars++;
            }

            chars = (char*)longChars;
            bytes = (byte*)longBytes;

            if (bytes >= byteEnd)
                break;
        }
#endif // !NO_FAST_UNICODE_LOOP

        // Get 1st byte
        if (lastByte < 0)
        {
            lastByte = *bytes++;
            // Re-test the loop condition: if that was the final byte it stays pending in lastByte.
            continue;
        }

        // Get full char
        char ch;
        if (bigEndian)
        {
            ch = (char)(lastByte << 8 | *(bytes++));
        }
        else
        {
            ch = (char)(*(bytes++) << 8 | lastByte);
        }
        lastByte = -1;

        // See if the char's valid
        if (ch >= 0xd800 && ch <= 0xdfff)
        {
            // Was it a high surrogate?
            if (ch <= 0xdbff)
            {
                // Its a high surrogate, if we had one then do fallback for previous one
                if (lastChar > 0)
                {
                    // Get fallback for previous high surrogate
                    // Note we have to reconstruct bytes because some may have been in decoder
                    byte[] byteBuffer = null;
                    if (bigEndian)
                    {
                        byteBuffer = new byte[]
                            { unchecked((byte)(lastChar >> 8)), unchecked((byte)lastChar) };
                    }
                    else
                    {
                        byteBuffer = new byte[]
                            { unchecked((byte)lastChar), unchecked((byte)(lastChar >> 8)) };
                    }

                    if (fallbackBuffer == null)
                    {
                        if (decoder == null)
                            fallbackBuffer = this.decoderFallback.CreateFallbackBuffer();
                        else
                            fallbackBuffer = decoder.FallbackBuffer;

                        // Set our internal fallback interesting things.
                        fallbackBuffer.InternalInitialize(byteStart, charEnd);
                    }

                    charsForFallback = chars; // Avoid passing chars by reference to allow it to be en-registered
                    bool fallbackResult = fallbackBuffer.InternalFallback(byteBuffer, bytes, ref charsForFallback);
                    chars = charsForFallback;

                    if (!fallbackResult)
                    {
                        // couldn't fall back lonely surrogate
                        // We either advanced bytes or chars should == charStart and throw below
                        Debug.Assert(bytes >= byteStart + 2 || chars == charStart,
                            "[UnicodeEncoding.GetChars]Expected bytes to have advanced or no output (bad surrogate)");
                        bytes -= 2; // didn't use these 2 bytes
                        fallbackBuffer.InternalReset();
                        ThrowCharsOverflow(decoder, chars == charStart);// Might throw, if no chars output
                        break; // couldn't fallback but didn't throw
                    }
                }

                // Ignore the previous high surrogate which fell back already,
                // yet remember the current high surrogate for next time.
                lastChar = ch;
                continue;
            }

            // Its a low surrogate
            if (lastChar == 0)
            {
                // Expected a previous high surrogate
                // Get fallback for this low surrogate
                // Note we have to reconstruct bytes because some may have been in decoder
                byte[] byteBuffer = null;
                if (bigEndian)
                {
                    byteBuffer = new byte[]
                        { unchecked((byte)(ch >> 8)), unchecked((byte)ch) };
                }
                else
                {
                    byteBuffer = new byte[]
                        { unchecked((byte)ch), unchecked((byte)(ch >> 8)) };
                }

                if (fallbackBuffer == null)
                {
                    if (decoder == null)
                        fallbackBuffer = this.decoderFallback.CreateFallbackBuffer();
                    else
                        fallbackBuffer = decoder.FallbackBuffer;

                    // Set our internal fallback interesting things.
                    fallbackBuffer.InternalInitialize(byteStart, charEnd);
                }

                charsForFallback = chars; // Avoid passing chars by reference to allow it to be en-registered
                bool fallbackResult = fallbackBuffer.InternalFallback(byteBuffer, bytes, ref charsForFallback);
                chars = charsForFallback;

                if (!fallbackResult)
                {
                    // couldn't fall back lonely surrogate
                    // We either advanced bytes or chars should == charStart and throw below
                    Debug.Assert(bytes >= byteStart + 2 || chars == charStart,
                        "[UnicodeEncoding.GetChars]Expected bytes to have advanced or no output (lonely surrogate)");
                    bytes -= 2; // didn't use these 2 bytes
                    fallbackBuffer.InternalReset();
                    ThrowCharsOverflow(decoder, chars == charStart);// Might throw, if no chars output
                    break; // couldn't fallback but didn't throw
                }

                // Didn't throw, ignore this one (we already did its fallback)
                continue;
            }

            // Valid surrogate pair, add our lastChar (will need 2 chars)
            if (chars >= charEnd - 1)
            {
                // couldn't find room for this surrogate pair
                // We either advanced bytes or chars should == charStart and throw below
                Debug.Assert(bytes >= byteStart + 2 || chars == charStart,
                    "[UnicodeEncoding.GetChars]Expected bytes to have advanced or no output (surrogate pair)");
                bytes -= 2; // didn't use these 2 bytes
                ThrowCharsOverflow(decoder, chars == charStart);// Might throw, if no chars output
                // Leave lastChar for next call to Convert()
                break; // couldn't fallback but didn't throw
            }

            // Write the high surrogate now; the low surrogate (ch) is written by the
            // common "add it" code at the bottom of the loop.
            *chars++ = lastChar;
            lastChar = (char)0;
        }
        else if (lastChar > 0)
        {
            // Had a high surrogate, expected a low surrogate, fall back the high surrogate.
            byte[] byteBuffer = null;
            if (bigEndian)
            {
                byteBuffer = new byte[]
                    { unchecked((byte)(lastChar >> 8)), unchecked((byte)lastChar) };
            }
            else
            {
                byteBuffer = new byte[]
                    { unchecked((byte)lastChar), unchecked((byte)(lastChar >> 8)) };
            }

            if (fallbackBuffer == null)
            {
                if (decoder == null)
                    fallbackBuffer = this.decoderFallback.CreateFallbackBuffer();
                else
                    fallbackBuffer = decoder.FallbackBuffer;

                // Set our internal fallback interesting things.
                fallbackBuffer.InternalInitialize(byteStart, charEnd);
            }

            charsForFallback = chars; // Avoid passing chars by reference to allow it to be en-registered
            bool fallbackResult = fallbackBuffer.InternalFallback(byteBuffer, bytes, ref charsForFallback);
            chars = charsForFallback;

            if (!fallbackResult)
            {
                // couldn't fall back high surrogate, or char that would be next
                // We either advanced bytes or chars should == charStart and throw below
                Debug.Assert(bytes >= byteStart + 2 || chars == charStart,
                    "[UnicodeEncoding.GetChars]Expected bytes to have advanced or no output (no low surrogate)");
                bytes -= 2; // didn't use these 2 bytes
                fallbackBuffer.InternalReset();
                ThrowCharsOverflow(decoder, chars == charStart);// Might throw, if no chars output
                break; // couldn't fallback but didn't throw
            }

            // Not left over now, clear previous high surrogate and continue to add current char
            lastChar = (char)0;
        }

        // Valid char, room for it?
        if (chars >= charEnd)
        {
            // 2 bytes couldn't fall back
            // We either advanced bytes or chars should == charStart and throw below
            Debug.Assert(bytes >= byteStart + 2 || chars == charStart,
                "[UnicodeEncoding.GetChars]Expected bytes to have advanced or no output (normal)");
            bytes -= 2; // didn't use these bytes
            ThrowCharsOverflow(decoder, chars == charStart);// Might throw, if no chars output
            break; // couldn't fallback but didn't throw
        }

        // add it
        *chars++ = ch;
    }

    // Remember our decoder if we must
    // (no decoder, or a flushing decoder: pending state must be resolved through the fallback now)
    if (decoder == null || decoder.MustFlush)
    {
        if (lastChar > 0)
        {
            // No hanging high surrogates allowed, do fallback and remove count for it
            byte[] byteBuffer = null;
            if (bigEndian)
            {
                byteBuffer = new byte[]
                    { unchecked((byte)(lastChar >> 8)), unchecked((byte)lastChar) };
            }
            else
            {
                byteBuffer = new byte[]
                    { unchecked((byte)lastChar), unchecked((byte)(lastChar >> 8)) };
            }

            if (fallbackBuffer == null)
            {
                if (decoder == null)
                    fallbackBuffer = this.decoderFallback.CreateFallbackBuffer();
                else
                    fallbackBuffer = decoder.FallbackBuffer;

                // Set our internal fallback interesting things.
                fallbackBuffer.InternalInitialize(byteStart, charEnd);
            }

            charsForFallback = chars; // Avoid passing chars by reference to allow it to be en-registered
            bool fallbackResult = fallbackBuffer.InternalFallback(byteBuffer, bytes, ref charsForFallback);
            chars = charsForFallback;

            if (!fallbackResult)
            {
                // 2 bytes couldn't fall back
                // We either advanced bytes or chars should == charStart and throw below
                Debug.Assert(bytes >= byteStart + 2 || chars == charStart,
                    "[UnicodeEncoding.GetChars]Expected bytes to have advanced or no output (decoder)");
                // bytes is rewound only for the duration of the ThrowCharsOverflow call and
                // restored right after it, so _bytesUsed at End still covers the pending input.
                bytes -= 2; // didn't use these bytes
                if (lastByte >= 0)
                    bytes--; // had an extra last byte hanging around
                fallbackBuffer.InternalReset();
                ThrowCharsOverflow(decoder, chars == charStart);// Might throw, if no chars output
                // We'll remember these in our decoder though
                bytes += 2;
                if (lastByte >= 0)
                    bytes++;
                goto End;
            }

            // done with this one
            lastChar = (char)0;
        }

        if (lastByte >= 0)
        {
            if (fallbackBuffer == null)
            {
                if (decoder == null)
                    fallbackBuffer = this.decoderFallback.CreateFallbackBuffer();
                else
                    fallbackBuffer = decoder.FallbackBuffer;

                // Set our internal fallback interesting things.
                fallbackBuffer.InternalInitialize(byteStart, charEnd);
            }

            // No hanging odd bytes allowed if must flush
            charsForFallback = chars; // Avoid passing chars by reference to allow it to be en-registered
            bool fallbackResult = fallbackBuffer.InternalFallback(new byte[] { unchecked((byte)lastByte) }, bytes, ref charsForFallback);
            chars = charsForFallback;

            if (!fallbackResult)
            {
                // odd byte couldn't fall back
                bytes--; // didn't use this byte
                fallbackBuffer.InternalReset();
                ThrowCharsOverflow(decoder, chars == charStart);// Might throw, if no chars output
                // didn't throw, but we'll remember it in the decoder
                bytes++;
                goto End;
            }

            // Didn't fail, clear buffer
            lastByte = -1;
        }
    }

End:

    // Remember our decoder if we must
    if (decoder != null)
    {
        Debug.Assert((decoder.MustFlush == false) || ((lastChar == (char)0) && (lastByte == -1)),
            "[UnicodeEncoding.GetChars] Expected no left over chars or bytes if flushing"
            // + " " + ((int)lastChar).ToString("X4") + " " + lastByte.ToString("X2")
            );

        decoder._bytesUsed = (int)(bytes - byteStart);
        decoder.lastChar = lastChar;
        decoder.lastByte = lastByte;
    }

    // Used to do this the old way
    // System.IO.__UnmanagedMemoryStream.memcpyimpl((byte*)chars, bytes, byteCount);

    // Shouldn't have anything in fallback buffer for GetChars
    // (don't have to check _throwOnOverflow for count or chars)
    Debug.Assert(fallbackBuffer == null || fallbackBuffer.Remaining == 0,
        "[UnicodeEncoding.GetChars]Expected empty fallback buffer at end");

    return (int)(chars - charStart);
}


// Returns a new EncoderNLS constructed over this encoding instance.
public override System.Text.Encoder GetEncoder()
{
    return new EncoderNLS(this);
}


// Returns a new UnicodeEncoding.Decoder (the nested decoder type that carries
// lastByte / lastChar between calls) constructed over this encoding instance.
public override System.Text.Decoder GetDecoder()
{
    return new UnicodeEncoding.Decoder(this);
}
// Returns the byte order mark written in front of encoded data:
// FE FF for big endian, FF FE for little endian, or an empty array when this
// instance was configured without a byte order mark.
public override byte[] GetPreamble()
{
    if (!byteOrderMark)
    {
        return Array.Empty<byte>();
    }

    // A fresh array is allocated on every call so that a caller who mutates the
    // result cannot corrupt a cached preamble.
    return bigEndian
        ? new byte[] { 0xfe, 0xff }
        : new byte[] { 0xff, 0xfe };
}

// Span view of the preamble. The cached static arrays are only handed out when
// this instance is exactly UnicodeEncoding; for any other runtime type the call
// is routed through GetPreamble(), in case a derived class overrode it.
public override ReadOnlySpan<byte> Preamble
{
    get
    {
        if (GetType() != typeof(UnicodeEncoding))
        {
            return GetPreamble();
        }

        if (!byteOrderMark)
        {
            return Array.Empty<byte>();
        }

        return bigEndian ? s_bigEndianPreamble : s_littleEndianPreamble;
    }
}

// Upper bound on the number of bytes produced when encoding charCount chars.
// Throws ArgumentOutOfRangeException when charCount is negative or when the
// bound does not fit in an int.
public override int GetMaxByteCount(int charCount)
{
    if (charCount < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(charCount),
            SR.ArgumentOutOfRange_NeedNonNegNum);
    }

    // One extra char covers a high surrogate left over from a previous call.
    // The arithmetic is done in long so the intermediate values cannot wrap.
    long maxBytes = (long)charCount + 1;

    // Each char may be replaced by up to MaxCharCount fallback chars.
    if (EncoderFallback.MaxCharCount > 1)
    {
        maxBytes *= EncoderFallback.MaxCharCount;
    }

    // Every char takes two bytes.
    maxBytes <<= 1;

    if (maxBytes > int.MaxValue)
    {
        throw new ArgumentOutOfRangeException(nameof(charCount), SR.ArgumentOutOfRange_GetByteCountOverflow);
    }

    return (int)maxBytes;
}

// Upper bound on the number of chars produced when decoding byteCount bytes.
// Throws ArgumentOutOfRangeException when byteCount is negative or when the
// bound does not fit in an int.
public override int GetMaxCharCount(int byteCount)
{
    if (byteCount < 0)
    {
        throw new ArgumentOutOfRangeException(nameof(byteCount),
            SR.ArgumentOutOfRange_NeedNonNegNum);
    }

    // Computed in long because byteCount may already be the largest int.
    long wholePairs = byteCount >> 1;   // one char per two bytes
    long oddByte = byteCount & 1;       // rounds up for a trailing single byte
    // The final +1 allows for a high surrogate left over in the decoder.
    long maxChars = wholePairs + oddByte + 1;

    // The decoder fallback may emit several chars per invalid sequence
    // (for example a run of lonely surrogates).
    if (DecoderFallback.MaxCharCount > 1)
    {
        maxChars *= DecoderFallback.MaxCharCount;
    }

    if (maxChars > int.MaxValue)
    {
        throw new ArgumentOutOfRangeException(nameof(byteCount), SR.ArgumentOutOfRange_GetCharCountOverflow);
    }

    return (int)maxChars;
}

// Two UnicodeEncoding instances are equal when they agree on code page, byte
// order mark, endianness and both fallbacks. Any other object is not equal.
public override bool Equals(Object value)
{
    if (!(value is UnicodeEncoding other))
    {
        return false;
    }

    // Big endian Unicode uses a different code page (1201) than little endian
    // (1200), so the code page is still compared here.
    // isThrowException is intentionally not compared on its own: per the original
    // note it is equivalent to the encoder/decoder fallbacks being exception
    // fallbacks, and those are compared below.
    if (CodePage != other.CodePage
        || byteOrderMark != other.byteOrderMark
        || bigEndian != other.bigEndian)
    {
        return false;
    }

    return EncoderFallback.Equals(other.EncoderFallback)
        && DecoderFallback.Equals(other.DecoderFallback);
}

// Hash built from the same inputs that Equals compares.
public override int GetHashCode()
{
    int hash = CodePage;
    hash += this.EncoderFallback.GetHashCode();
    hash += this.DecoderFallback.GetHashCode();

    if (byteOrderMark)
    {
        hash += 4;
    }

    if (bigEndian)
    {
        hash += 8;
    }

    return hash;
}

// Decoder state carried between calls: a pending odd byte and a pending char.
private sealed class Decoder : System.Text.DecoderNLS
{
    // -1 means no byte is pending.
    internal int lastByte = -1;

    // '\0' means no char is pending.
    internal char lastChar = '\0';

    public Decoder(UnicodeEncoding encoding)
        : base(encoding)
    {
        // Nothing to do here; per the original note, the base constructor calls Reset.
    }

    // Drops any pending byte/char and resets the fallback buffer if one exists.
    public override void Reset()
    {
        lastByte = -1;
        lastChar = '\0';
        _fallbackBuffer?.Reset();
    }

    // True while a pending byte or char is still held by this decoder.
    internal override bool HasState => lastByte != -1 || lastChar != '\0';
}
} // class UnicodeEncoding
} // namespace System.Text