1 // Licensed to the .NET Foundation under one or more agreements.
2 // The .NET Foundation licenses this file to you under the MIT license.
3 // See the LICENSE file in the project root for more information.
4 
5 using System;
6 using System.Diagnostics;
7 using System.Reflection;
8 using System.Runtime.CompilerServices;
9 using System.Threading;
10 
11 namespace System.Text.Unicode
12 {
13     /// <summary>
14     /// Contains helpers for dealing with Unicode code points.
15     /// </summary>
16     internal static unsafe class UnicodeHelpers
17     {
        /// <summary>
        /// Used for invalid Unicode sequences or other unrepresentable values.
        /// </summary>
        private const char UNICODE_REPLACEMENT_CHAR = '\uFFFD';

        /// <summary>
        /// The last code point defined by the Unicode specification.
        /// </summary>
        internal const int UNICODE_LAST_CODEPOINT = 0x10FFFF;

        /// <summary>
        /// Cached bitmap of defined characters. It starts out null, is populated by
        /// CreateDefinedCharacterBitmap (via Volatile.Write) and is read by
        /// GetDefinedCharacterBitmap (via Volatile.Read).
        /// </summary>
        private static uint[] _definedCharacterBitmap;
29 
30         /// <summary>
31         /// Helper method which creates a bitmap of all characters which are
32         /// defined per the Unicode specification.
33         /// </summary>
34         [MethodImpl(MethodImplOptions.NoInlining)]
CreateDefinedCharacterBitmap()35         private static uint[] CreateDefinedCharacterBitmap()
36         {
37             // The stream should be exactly 8KB in size.
38             var stream = typeof(UnicodeRange).GetTypeInfo().Assembly.GetManifestResourceStream("System.Text.Encodings.Web.Resources.unicode8definedcharacters.bin");
39 
40             if (stream == null)
41             {
42                 throw new BadImageFormatException();
43             }
44 
45             if (stream.Length != 8 * 1024)
46             {
47                 Environment.FailFast("Corrupt data detected.");
48             }
49 
50             // Read everything in as raw bytes.
51             byte[] rawData = new byte[8 * 1024];
52             for (int numBytesReadTotal = 0; numBytesReadTotal < rawData.Length;)
53             {
54                 int numBytesReadThisIteration = stream.Read(rawData, numBytesReadTotal, rawData.Length - numBytesReadTotal);
55                 if (numBytesReadThisIteration == 0)
56                 {
57                     Environment.FailFast("Corrupt data detected.");
58                 }
59                 numBytesReadTotal += numBytesReadThisIteration;
60             }
61 
62             // Finally, convert the byte[] to a uint[].
63             // The incoming bytes are little-endian.
64             uint[] retVal = new uint[2 * 1024];
65             for (int i = 0; i < retVal.Length; i++)
66             {
67                 retVal[i] = (((uint)rawData[4 * i + 3]) << 24)
68                     | (((uint)rawData[4 * i + 2]) << 16)
69                     | (((uint)rawData[4 * i + 1]) << 8)
70                     | (uint)rawData[4 * i];
71             }
72 
73             // And we're done!
74             Volatile.Write(ref _definedCharacterBitmap, retVal);
75             return retVal;
76         }
77 
78         /// <summary>
79         /// Returns a bitmap of all characters which are defined per version 7.0.0
80         /// of the Unicode specification.
81         /// </summary>
82         [MethodImpl(MethodImplOptions.AggressiveInlining)]
GetDefinedCharacterBitmap()83         internal static uint[] GetDefinedCharacterBitmap()
84         {
85             return Volatile.Read(ref _definedCharacterBitmap) ?? CreateDefinedCharacterBitmap();
86         }
87 
88         /// <summary>
89         /// Given a UTF-16 character stream, reads the next scalar value from the stream.
90         /// Set 'endOfString' to true if 'pChar' points to the last character in the stream.
91         /// </summary>
92         [MethodImpl(MethodImplOptions.AggressiveInlining)]
GetScalarValueFromUtf16(char first, char? second, out bool wasSurrogatePair)93         internal static int GetScalarValueFromUtf16(char first, char? second, out bool wasSurrogatePair)
94         {
95             if (!Char.IsSurrogate(first))
96             {
97                 wasSurrogatePair = false;
98                 return first;
99             }
100             return GetScalarValueFromUtf16Slow(first, second, out wasSurrogatePair);
101         }
102 
GetScalarValueFromUtf16Slow(char first, char? second, out bool wasSurrogatePair)103         private static int GetScalarValueFromUtf16Slow(char first, char? second, out bool wasSurrogatePair)
104         {
105 #if DEBUG
106             if (!Char.IsSurrogate(first))
107             {
108                 Debug.Assert(false, "This case should've been handled by the fast path.");
109                 wasSurrogatePair = false;
110                 return first;
111             }
112 #endif
113             if (Char.IsHighSurrogate(first))
114             {
115                 if (second != null)
116                 {
117                     if (Char.IsLowSurrogate(second.Value))
118                     {
119                         // valid surrogate pair - extract codepoint
120                         wasSurrogatePair = true;
121                         return GetScalarValueFromUtf16SurrogatePair(first, second.Value);
122                     }
123                     else
124                     {
125                         // unmatched surrogate - substitute
126                         wasSurrogatePair = false;
127                         return UNICODE_REPLACEMENT_CHAR;
128                     }
129                 }
130                 else
131                 {
132                     // unmatched surrogate - substitute
133                     wasSurrogatePair = false;
134                     return UNICODE_REPLACEMENT_CHAR;
135                 }
136             }
137             else
138             {
139                 // unmatched surrogate - substitute
140                 Debug.Assert(Char.IsLowSurrogate(first));
141                 wasSurrogatePair = false;
142                 return UNICODE_REPLACEMENT_CHAR;
143             }
144         }
145 
146         /// <summary>
147         /// Given a UTF-16 character stream, reads the next scalar value from the stream.
148         /// Set 'endOfString' to true if 'pChar' points to the last character in the stream.
149         /// </summary>
150         [MethodImpl(MethodImplOptions.AggressiveInlining)]
GetScalarValueFromUtf16(char* pChar, bool endOfString)151         internal static int GetScalarValueFromUtf16(char* pChar, bool endOfString)
152         {
153             // This method is marked as AggressiveInlining to handle the common case of a non-surrogate
154             // character. The surrogate case is handled in the slower fallback code path.
155             char thisChar = *pChar;
156             return (Char.IsSurrogate(thisChar)) ? GetScalarValueFromUtf16Slow(pChar, endOfString) : thisChar;
157         }
158 
GetScalarValueFromUtf16Slow(char* pChar, bool endOfString)159         private static int GetScalarValueFromUtf16Slow(char* pChar, bool endOfString)
160         {
161             char firstChar = pChar[0];
162 
163             if (!Char.IsSurrogate(firstChar))
164             {
165                 Debug.Assert(false, "This case should've been handled by the fast path.");
166                 return firstChar;
167             }
168             else if (Char.IsHighSurrogate(firstChar))
169             {
170                 if (endOfString)
171                 {
172                     // unmatched surrogate - substitute
173                     return UNICODE_REPLACEMENT_CHAR;
174                 }
175                 else
176                 {
177                     char secondChar = pChar[1];
178                     if (Char.IsLowSurrogate(secondChar))
179                     {
180                         // valid surrogate pair - extract codepoint
181                         return GetScalarValueFromUtf16SurrogatePair(firstChar, secondChar);
182                     }
183                     else
184                     {
185                         // unmatched surrogate - substitute
186                         return UNICODE_REPLACEMENT_CHAR;
187                     }
188                 }
189             }
190             else
191             {
192                 // unmatched surrogate - substitute
193                 Debug.Assert(Char.IsLowSurrogate(firstChar));
194                 return UNICODE_REPLACEMENT_CHAR;
195             }
196         }
197 
GetScalarValueFromUtf16SurrogatePair(char highSurrogate, char lowSurrogate)198         private static int GetScalarValueFromUtf16SurrogatePair(char highSurrogate, char lowSurrogate)
199         {
200             Debug.Assert(Char.IsHighSurrogate(highSurrogate));
201             Debug.Assert(Char.IsLowSurrogate(lowSurrogate));
202 
203             // See http://www.unicode.org/versions/Unicode6.2.0/ch03.pdf, Table 3.5 for the
204             // details of this conversion. We don't use Char.ConvertToUtf32 because its exception
205             // handling shows up on the hot path, and our caller has already sanitized the inputs.
206             return (lowSurrogate & 0x3ff) | (((highSurrogate & 0x3ff) + (1 << 6)) << 10);
207         }
208 
GetUtf16SurrogatePairFromAstralScalarValue(int scalar, out char highSurrogate, out char lowSurrogate)209         internal static void GetUtf16SurrogatePairFromAstralScalarValue(int scalar, out char highSurrogate, out char lowSurrogate)
210         {
211             Debug.Assert(0x10000 <= scalar && scalar <= UNICODE_LAST_CODEPOINT);
212 
213             // See http://www.unicode.org/versions/Unicode6.2.0/ch03.pdf, Table 3.5 for the
214             // details of this conversion. We don't use Char.ConvertFromUtf32 because its exception
215             // handling shows up on the hot path, it allocates temporary strings (which we don't want),
216             // and our caller has already sanitized the inputs.
217 
218             int x = scalar & 0xFFFF;
219             int u = scalar >> 16;
220             int w = u - 1;
221             highSurrogate = (char)(0xD800 | (w << 6) | (x >> 10));
222             lowSurrogate = (char)(0xDC00 | (x & 0x3FF));
223         }
224 
225         /// <summary>
226         /// Given a Unicode scalar value, returns the UTF-8 representation of the value.
227         /// The return value's bytes should be popped from the LSB.
228         /// </summary>
GetUtf8RepresentationForScalarValue(uint scalar)229         internal static int GetUtf8RepresentationForScalarValue(uint scalar)
230         {
231             Debug.Assert(scalar <= UNICODE_LAST_CODEPOINT);
232 
233             // See http://www.unicode.org/versions/Unicode6.2.0/ch03.pdf, Table 3.6 for the
234             // details of this conversion. We don't use UTF8Encoding since we're encoding
235             // a scalar code point, not a UTF16 character sequence.
236             if (scalar <= 0x7f)
237             {
238                 // one byte used: scalar 00000000 0xxxxxxx -> byte sequence 0xxxxxxx
239                 byte firstByte = (byte)scalar;
240                 return firstByte;
241             }
242             else if (scalar <= 0x7ff)
243             {
244                 // two bytes used: scalar 00000yyy yyxxxxxx -> byte sequence 110yyyyy 10xxxxxx
245                 byte firstByte = (byte)(0xc0 | (scalar >> 6));
246                 byte secondByteByte = (byte)(0x80 | (scalar & 0x3f));
247                 return ((secondByteByte << 8) | firstByte);
248             }
249             else if (scalar <= 0xffff)
250             {
251                 // three bytes used: scalar zzzzyyyy yyxxxxxx -> byte sequence 1110zzzz 10yyyyyy 10xxxxxx
252                 byte firstByte = (byte)(0xe0 | (scalar >> 12));
253                 byte secondByte = (byte)(0x80 | ((scalar >> 6) & 0x3f));
254                 byte thirdByte = (byte)(0x80 | (scalar & 0x3f));
255                 return ((((thirdByte << 8) | secondByte) << 8) | firstByte);
256             }
257             else
258             {
259                 // four bytes used: scalar 000uuuuu zzzzyyyy yyxxxxxx -> byte sequence 11110uuu 10uuzzzz 10yyyyyy 10xxxxxx
260                 byte firstByte = (byte)(0xf0 | (scalar >> 18));
261                 byte secondByte = (byte)(0x80 | ((scalar >> 12) & 0x3f));
262                 byte thirdByte = (byte)(0x80 | ((scalar >> 6) & 0x3f));
263                 byte fourthByte = (byte)(0x80 | (scalar & 0x3f));
264                 return ((((((fourthByte << 8) | thirdByte) << 8) | secondByte) << 8) | firstByte);
265             }
266         }
267 
268         /// <summary>
269         /// Returns a value stating whether a character is defined per version 7.0.0
270         /// of the Unicode specification. Certain classes of characters (control chars,
271         /// private use, surrogates, some whitespace) are considered "undefined" for
272         /// our purposes.
273         /// </summary>
274         [MethodImpl(MethodImplOptions.AggressiveInlining)]
IsCharacterDefined(char c)275         internal static bool IsCharacterDefined(char c)
276         {
277             uint codePoint = (uint)c;
278             int index = (int)(codePoint >> 5);
279             int offset = (int)(codePoint & 0x1FU);
280             return ((GetDefinedCharacterBitmap()[index] >> offset) & 0x1U) != 0;
281         }
282 
283         /// <summary>
284         /// Determines whether the given scalar value is in the supplementary plane and thus
285         /// requires 2 characters to be represented in UTF-16 (as a surrogate pair).
286         /// </summary>
287         [MethodImpl(MethodImplOptions.AggressiveInlining)]
IsSupplementaryCodePoint(int scalar)288         internal static bool IsSupplementaryCodePoint(int scalar)
289         {
290             return ((scalar & ~((int)Char.MaxValue)) != 0);
291         }
292     }
293 }
294