// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

using System;
using System.Diagnostics;
using System.Reflection;
using System.Runtime.CompilerServices;
using System.Threading;

namespace System.Text.Unicode
{
    /// <summary>
    /// Contains helpers for dealing with Unicode code points.
    /// </summary>
    internal static unsafe class UnicodeHelpers
    {
        /// <summary>
        /// Used for invalid Unicode sequences or other unrepresentable values.
        /// </summary>
        private const char UNICODE_REPLACEMENT_CHAR = '\uFFFD';

        /// <summary>
        /// The last code point defined by the Unicode specification.
        /// </summary>
        internal const int UNICODE_LAST_CODEPOINT = 0x10FFFF;

        /// <summary>
        /// Lazily-populated cache of the defined-character bitmap. It is published with
        /// <see cref="Volatile.Write{T}(ref T, T)"/> and read with <see cref="Volatile.Read{T}(ref T)"/>.
        /// </summary>
        private static uint[] _definedCharacterBitmap;

        /// <summary>
        /// Helper method which creates a bitmap of all characters which are
        /// defined per the Unicode specification, caches it, and returns it.
        /// </summary>
        /// <returns>
        /// A 2048-element array; bit (c &amp; 0x1F) of element (c &gt;&gt; 5) is consulted by
        /// <see cref="IsCharacterDefined(char)"/> for the UTF-16 code unit c.
        /// </returns>
        /// <exception cref="BadImageFormatException">The embedded resource could not be found.</exception>
        /// <remarks>
        /// No lock is taken, so concurrent first callers may each build the array; every
        /// caller reads the same embedded resource, and whichever write lands last is kept.
        /// </remarks>
        [MethodImpl(MethodImplOptions.NoInlining)]
        private static uint[] CreateDefinedCharacterBitmap()
        {
            const int ExpectedByteCount = 8 * 1024;
            byte[] rawData = new byte[ExpectedByteCount];

            // The stream is disposed as soon as its contents have been copied out.
            using (var stream = typeof(UnicodeRange).GetTypeInfo().Assembly.GetManifestResourceStream("System.Text.Encodings.Web.Resources.unicode8definedcharacters.bin"))
            {
                if (stream == null)
                {
                    throw new BadImageFormatException();
                }

                // The stream should be exactly 8KB in size.
                if (stream.Length != ExpectedByteCount)
                {
                    Environment.FailFast("Corrupt data detected.");
                }

                // Read everything in as raw bytes. Stream.Read may return fewer bytes than
                // requested, so keep reading until the buffer is full.
                for (int numBytesReadTotal = 0; numBytesReadTotal < rawData.Length;)
                {
                    int numBytesReadThisIteration = stream.Read(rawData, numBytesReadTotal, rawData.Length - numBytesReadTotal);
                    if (numBytesReadThisIteration == 0)
                    {
                        // Premature end of stream.
                        Environment.FailFast("Corrupt data detected.");
                    }
                    numBytesReadTotal += numBytesReadThisIteration;
                }
            }

            // Finally, convert the byte[] to a uint[].
            // The incoming bytes are little-endian.
            uint[] retVal = new uint[ExpectedByteCount / sizeof(uint)];
            for (int i = 0; i < retVal.Length; i++)
            {
                int offset = 4 * i;
                retVal[i] = (((uint)rawData[offset + 3]) << 24)
                    | (((uint)rawData[offset + 2]) << 16)
                    | (((uint)rawData[offset + 1]) << 8)
                    | (uint)rawData[offset];
            }

            // And we're done!
            Volatile.Write(ref _definedCharacterBitmap, retVal);
            return retVal;
        }

        /// <summary>
        /// Returns a bitmap of all characters which are defined per the Unicode
        /// specification, creating and caching it on first use.
        /// </summary>
        /// <remarks>
        /// NOTE(review): earlier documentation said "version 7.0.0", but the embedded resource
        /// is named "unicode8definedcharacters.bin" - confirm which Unicode version applies.
        /// </remarks>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        internal static uint[] GetDefinedCharacterBitmap()
        {
            return Volatile.Read(ref _definedCharacterBitmap) ?? CreateDefinedCharacterBitmap();
        }

        /// <summary>
        /// Given one or two UTF-16 code units, reads the scalar value they encode.
        /// </summary>
        /// <param name="first">The code unit at the current position.</param>
        /// <param name="second">The code unit following <paramref name="first"/>, or null if there is none.</param>
        /// <param name="wasSurrogatePair">
        /// Set to true if <paramref name="first"/> and <paramref name="second"/> formed a valid
        /// surrogate pair (both code units were consumed); otherwise false.
        /// </param>
        /// <returns>
        /// The scalar value, or U+FFFD if <paramref name="first"/> is an unmatched surrogate.
        /// </returns>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        internal static int GetScalarValueFromUtf16(char first, char? second, out bool wasSurrogatePair)
        {
            // Fast path (inlined into callers): a non-surrogate code unit is its own scalar value.
            if (!Char.IsSurrogate(first))
            {
                wasSurrogatePair = false;
                return first;
            }
            return GetScalarValueFromUtf16Slow(first, second, out wasSurrogatePair);
        }

        /// <summary>
        /// Slow path of <see cref="GetScalarValueFromUtf16(char, char?, out bool)"/>, taken
        /// when <paramref name="first"/> is a surrogate code unit.
        /// </summary>
        private static int GetScalarValueFromUtf16Slow(char first, char? second, out bool wasSurrogatePair)
        {
#if DEBUG
            if (!Char.IsSurrogate(first))
            {
                Debug.Assert(false, "This case should've been handled by the fast path.");
                wasSurrogatePair = false;
                return first;
            }
#endif
            if (Char.IsHighSurrogate(first) && second != null && Char.IsLowSurrogate(second.Value))
            {
                // valid surrogate pair - extract codepoint
                wasSurrogatePair = true;
                return GetScalarValueFromUtf16SurrogatePair(first, second.Value);
            }

            // unmatched surrogate (a high surrogate without a low surrogate after it,
            // or a leading low surrogate) - substitute
            wasSurrogatePair = false;
            return UNICODE_REPLACEMENT_CHAR;
        }

        /// <summary>
        /// Given a UTF-16 character stream, reads the next scalar value from the stream.
        /// Set 'endOfString' to true if 'pChar' points to the last character in the stream.
        /// </summary>
        /// <param name="pChar">Pointer to the current code unit.</param>
        /// <param name="endOfString">
        /// When true, the code unit after <paramref name="pChar"/> is never read.
        /// </param>
        /// <returns>
        /// The scalar value, or U+FFFD if <paramref name="pChar"/> points to an unmatched surrogate.
        /// </returns>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        internal static int GetScalarValueFromUtf16(char* pChar, bool endOfString)
        {
            // This method is marked as AggressiveInlining to handle the common case of a non-surrogate
            // character. The surrogate case is handled in the slower fallback code path.
            char thisChar = *pChar;
            return (Char.IsSurrogate(thisChar)) ? GetScalarValueFromUtf16Slow(pChar, endOfString) : thisChar;
        }

        /// <summary>
        /// Slow path of <see cref="GetScalarValueFromUtf16(char*, bool)"/>, taken when the
        /// current code unit is a surrogate.
        /// </summary>
        private static int GetScalarValueFromUtf16Slow(char* pChar, bool endOfString)
        {
            char firstChar = pChar[0];

            if (!Char.IsSurrogate(firstChar))
            {
                Debug.Assert(false, "This case should've been handled by the fast path.");
                return firstChar;
            }

            // Only look at pChar[1] when the caller has said there is a following character.
            if (Char.IsHighSurrogate(firstChar) && !endOfString)
            {
                char secondChar = pChar[1];
                if (Char.IsLowSurrogate(secondChar))
                {
                    // valid surrogate pair - extract codepoint
                    return GetScalarValueFromUtf16SurrogatePair(firstChar, secondChar);
                }
            }

            // unmatched surrogate - substitute
            return UNICODE_REPLACEMENT_CHAR;
        }

        /// <summary>
        /// Combines a high and a low surrogate into the scalar value they encode.
        /// Inputs are only validated by debug assertions.
        /// </summary>
        private static int GetScalarValueFromUtf16SurrogatePair(char highSurrogate, char lowSurrogate)
        {
            Debug.Assert(Char.IsHighSurrogate(highSurrogate));
            Debug.Assert(Char.IsLowSurrogate(lowSurrogate));

            // See http://www.unicode.org/versions/Unicode6.2.0/ch03.pdf, Table 3.5 for the
            // details of this conversion. We don't use Char.ConvertToUtf32 because its exception
            // handling shows up on the hot path, and our caller has already sanitized the inputs.
            // Adding (1 << 6) before the shift is the same as adding 0x10000 afterwards.
            return (lowSurrogate & 0x3ff) | (((highSurrogate & 0x3ff) + (1 << 6)) << 10);
        }

        /// <summary>
        /// Splits a scalar value in the range U+10000..U+10FFFF into its UTF-16 surrogate pair.
        /// The range is only checked by a debug assertion.
        /// </summary>
        /// <param name="scalar">The scalar value to split.</param>
        /// <param name="highSurrogate">Receives the high (leading) surrogate.</param>
        /// <param name="lowSurrogate">Receives the low (trailing) surrogate.</param>
        internal static void GetUtf16SurrogatePairFromAstralScalarValue(int scalar, out char highSurrogate, out char lowSurrogate)
        {
            Debug.Assert(0x10000 <= scalar && scalar <= UNICODE_LAST_CODEPOINT);

            // See http://www.unicode.org/versions/Unicode6.2.0/ch03.pdf, Table 3.5 for the
            // details of this conversion. We don't use Char.ConvertFromUtf32 because its exception
            // handling shows up on the hot path, it allocates temporary strings (which we don't want),
            // and our caller has already sanitized the inputs.

            int x = scalar & 0xFFFF;
            int u = scalar >> 16;
            int w = u - 1;
            highSurrogate = (char)(0xD800 | (w << 6) | (x >> 10));
            lowSurrogate = (char)(0xDC00 | (x & 0x3FF));
        }

        /// <summary>
        /// Given a Unicode scalar value, returns the UTF-8 representation of the value.
        /// The return value's bytes should be popped from the LSB.
        /// </summary>
        /// <param name="scalar">The scalar value; only range-checked by a debug assertion.</param>
        /// <returns>
        /// The 1 to 4 UTF-8 bytes packed into an int with the first byte in the lowest 8 bits.
        /// For four-byte sequences the top bit is set, so the returned int is negative.
        /// </returns>
        internal static int GetUtf8RepresentationForScalarValue(uint scalar)
        {
            Debug.Assert(scalar <= UNICODE_LAST_CODEPOINT);

            // See http://www.unicode.org/versions/Unicode6.2.0/ch03.pdf, Table 3.6 for the
            // details of this conversion. We don't use UTF8Encoding since we're encoding
            // a scalar code point, not a UTF16 character sequence.
            if (scalar <= 0x7f)
            {
                // one byte used: scalar 00000000 0xxxxxxx -> byte sequence 0xxxxxxx
                byte firstByte = (byte)scalar;
                return firstByte;
            }
            else if (scalar <= 0x7ff)
            {
                // two bytes used: scalar 00000yyy yyxxxxxx -> byte sequence 110yyyyy 10xxxxxx
                byte firstByte = (byte)(0xc0 | (scalar >> 6));
                byte secondByte = (byte)(0x80 | (scalar & 0x3f));
                return ((secondByte << 8) | firstByte);
            }
            else if (scalar <= 0xffff)
            {
                // three bytes used: scalar zzzzyyyy yyxxxxxx -> byte sequence 1110zzzz 10yyyyyy 10xxxxxx
                byte firstByte = (byte)(0xe0 | (scalar >> 12));
                byte secondByte = (byte)(0x80 | ((scalar >> 6) & 0x3f));
                byte thirdByte = (byte)(0x80 | (scalar & 0x3f));
                return ((((thirdByte << 8) | secondByte) << 8) | firstByte);
            }
            else
            {
                // four bytes used: scalar 000uuuuu zzzzyyyy yyxxxxxx -> byte sequence 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
                byte firstByte = (byte)(0xf0 | (scalar >> 18));
                byte secondByte = (byte)(0x80 | ((scalar >> 12) & 0x3f));
                byte thirdByte = (byte)(0x80 | ((scalar >> 6) & 0x3f));
                byte fourthByte = (byte)(0x80 | (scalar & 0x3f));
                return ((((((fourthByte << 8) | thirdByte) << 8) | secondByte) << 8) | firstByte);
            }
        }

        /// <summary>
        /// Returns a value stating whether a character is marked as defined in the bitmap
        /// returned by <see cref="GetDefinedCharacterBitmap"/>. Certain classes of characters
        /// (control chars, private use, surrogates, some whitespace) are considered
        /// "undefined" for our purposes.
        /// </summary>
        /// <param name="c">The UTF-16 code unit to look up.</param>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        internal static bool IsCharacterDefined(char c)
        {
            // Each uint in the bitmap covers 32 consecutive code units.
            uint codePoint = (uint)c;
            int index = (int)(codePoint >> 5);
            int offset = (int)(codePoint & 0x1FU);
            return ((GetDefinedCharacterBitmap()[index] >> offset) & 0x1U) != 0;
        }

        /// <summary>
        /// Determines whether the given scalar value is in the supplementary plane and thus
        /// requires 2 characters to be represented in UTF-16 (as a surrogate pair).
        /// </summary>
        /// <param name="scalar">The scalar value to test.</param>
        /// <returns>True if any bit above the low 16 bits of <paramref name="scalar"/> is set.</returns>
        [MethodImpl(MethodImplOptions.AggressiveInlining)]
        internal static bool IsSupplementaryCodePoint(int scalar)
        {
            return ((scalar & ~((int)Char.MaxValue)) != 0);
        }
    }
}