1 // Licensed to the .NET Foundation under one or more agreements.
2 // The .NET Foundation licenses this file to you under the MIT license.
3 // See the LICENSE file in the project root for more information.
4 
5 ////////////////////////////////////////////////////////////////////////////
6 //
7 //
8 //  Purpose:  This class implements a set of methods for retrieving
9 //            character type information.  Character type information is
10 //            independent of culture and region.
11 //
12 //
13 ////////////////////////////////////////////////////////////////////////////
14 
15 using System.Diagnostics;
16 
17 namespace System.Globalization
18 {
19     public static partial class CharUnicodeInfo
20     {
21         //--------------------------------------------------------------------//
22         //                        Internal Information                        //
23         //--------------------------------------------------------------------//
24 
        //
        // Constants shared by the lookup helpers in this class.
        // NOTE(review): this comment used to read "Native methods to access the Unicode
        // category data tables in charinfo.nlp", but only constants are declared here.
        // Confirm whether any charinfo.nlp / native dependency still exists elsewhere.
        //

        // Inclusive UTF-16 code unit bounds of the high surrogate range.
        internal const char HIGH_SURROGATE_START = '\ud800';
        internal const char HIGH_SURROGATE_END = '\udbff';
        // Inclusive UTF-16 code unit bounds of the low surrogate range.
        internal const char LOW_SURROGATE_START = '\udc00';
        internal const char LOW_SURROGATE_END = '\udfff';

        // Offset selectors passed to InternalGetCategoryValue.  Each category entry is
        // two bytes wide there (valueIndex * 2 + offset): offset 0 is used for the
        // Unicode category and offset 1 for the bidi category.
        internal const int UNICODE_CATEGORY_OFFSET = 0;
        internal const int BIDI_CATEGORY_OFFSET = 1;

        // The starting codepoint for Unicode plane 1.  Plane 1 contains 0x010000 ~ 0x01ffff.
        // This is the base added when a surrogate pair is combined into a UTF32 value.
        internal const int UNICODE_PLANE01_START = 0x10000;
38 
39 
40         ////////////////////////////////////////////////////////////////////////
41         //
42         // Actions:
43         // Convert the BMP character or surrogate pointed by index to a UTF32 value.
44         // This is similar to Char.ConvertToUTF32, but the difference is that
45         // it does not throw exceptions when invalid surrogate characters are passed in.
46         //
47         // WARNING: since it doesn't throw an exception it CAN return a value
48         //          in the surrogate range D800-DFFF, which are not legal unicode values.
49         //
50         ////////////////////////////////////////////////////////////////////////
51 
InternalConvertToUtf32(String s, int index)52         internal static int InternalConvertToUtf32(String s, int index)
53         {
54             Debug.Assert(s != null, "s != null");
55             Debug.Assert(index >= 0 && index < s.Length, "index < s.Length");
56             if (index < s.Length - 1)
57             {
58                 int temp1 = (int)s[index] - HIGH_SURROGATE_START;
59                 if (temp1 >= 0 && temp1 <= 0x3ff)
60                 {
61                     int temp2 = (int)s[index + 1] - LOW_SURROGATE_START;
62                     if (temp2 >= 0 && temp2 <= 0x3ff)
63                     {
64                         // Convert the surrogate to UTF32 and get the result.
65                         return ((temp1 * 0x400) + temp2 + UNICODE_PLANE01_START);
66                     }
67                 }
68             }
69             return ((int)s[index]);
70         }
71         ////////////////////////////////////////////////////////////////////////
72         //
73         // Convert a character or a surrogate pair starting at index of string s
74         // to UTF32 value.
75         //
76         //  Parameters:
77         //      s       The string
78         //      index   The starting index.  It can point to a BMP character or
79         //              a surrogate pair.
        //      (There is no separate length parameter; the length is taken from s.Length.)
81         //      charLength  [out]   If the index points to a BMP char, charLength
82         //              will be 1.  If the index points to a surrogate pair,
83         //              charLength will be 2.
84         //
85         // WARNING: since it doesn't throw an exception it CAN return a value
86         //          in the surrogate range D800-DFFF, which are not legal unicode values.
87         //
88         //  Returns:
89         //      The UTF32 value
90         //
91         ////////////////////////////////////////////////////////////////////////
92 
InternalConvertToUtf32(String s, int index, out int charLength)93         internal static int InternalConvertToUtf32(String s, int index, out int charLength)
94         {
95             Debug.Assert(s != null, "s != null");
96             Debug.Assert(s.Length > 0, "s.Length > 0");
97             Debug.Assert(index >= 0 && index < s.Length, "index >= 0 && index < s.Length");
98             charLength = 1;
99             if (index < s.Length - 1)
100             {
101                 int temp1 = (int)s[index] - HIGH_SURROGATE_START;
102                 if (temp1 >= 0 && temp1 <= 0x3ff)
103                 {
104                     int temp2 = (int)s[index + 1] - LOW_SURROGATE_START;
105                     if (temp2 >= 0 && temp2 <= 0x3ff)
106                     {
107                         // Convert the surrogate to UTF32 and get the result.
108                         charLength++;
109                         return ((temp1 * 0x400) + temp2 + UNICODE_PLANE01_START);
110                     }
111                 }
112             }
113             return ((int)s[index]);
114         }
115 
116         ////////////////////////////////////////////////////////////////////////
117         //
118         //  IsWhiteSpace
119         //
120         //  Determines if the given character is a white space character.
121         //
122         ////////////////////////////////////////////////////////////////////////
123 
IsWhiteSpace(String s, int index)124         internal static bool IsWhiteSpace(String s, int index)
125         {
126             Debug.Assert(s != null, "s!=null");
127             Debug.Assert(index >= 0 && index < s.Length, "index >= 0 && index < s.Length");
128 
129             UnicodeCategory uc = GetUnicodeCategory(s, index);
130             // In Unicode 3.0, U+2028 is the only character which is under the category "LineSeparator".
131             // And U+2029 is th eonly character which is under the category "ParagraphSeparator".
132             switch (uc)
133             {
134                 case (UnicodeCategory.SpaceSeparator):
135                 case (UnicodeCategory.LineSeparator):
136                 case (UnicodeCategory.ParagraphSeparator):
137                     return (true);
138             }
139             return (false);
140         }
141 
142 
IsWhiteSpace(char c)143         internal static bool IsWhiteSpace(char c)
144         {
145             UnicodeCategory uc = GetUnicodeCategory(c);
146             // In Unicode 3.0, U+2028 is the only character which is under the category "LineSeparator".
147             // And U+2029 is th eonly character which is under the category "ParagraphSeparator".
148             switch (uc)
149             {
150                 case (UnicodeCategory.SpaceSeparator):
151                 case (UnicodeCategory.LineSeparator):
152                 case (UnicodeCategory.ParagraphSeparator):
153                     return (true);
154             }
155 
156             return (false);
157         }
158 
159 
160         //
161         // This is called by the public char and string, index versions
162         //
163         // Note that for ch in the range D800-DFFF we just treat it as any other non-numeric character
164         //
InternalGetNumericValue(int ch)165         internal static unsafe double InternalGetNumericValue(int ch)
166         {
167             Debug.Assert(ch >= 0 && ch <= 0x10ffff, "ch is not in valid Unicode range.");
168             // Get the level 2 item from the highest 12 bit (8 - 19) of ch.
169             ushort index = s_pNumericLevel1Index[ch >> 8];
170             // Get the level 2 WORD offset from the 4 - 7 bit of ch.  This provides the base offset of the level 3 table.
171             // The offset is referred to an float item in m_pNumericFloatData.
172             // Note that & has the lower precedence than addition, so don't forget the parathesis.
173             index = s_pNumericLevel1Index[index + ((ch >> 4) & 0x000f)];
174 
175             fixed (ushort* pUshortPtr = &(s_pNumericLevel1Index[index]))
176             {
177                 byte* pBytePtr = (byte*)pUshortPtr;
178                 fixed (byte* pByteNum = s_pNumericValues)
179                 {
180                     double* pDouble = (double*)pByteNum;
181                     return pDouble[pBytePtr[(ch & 0x000f)]];
182                 }
183             }
184         }
185 
InternalGetDigitValues(int ch)186         internal static unsafe ushort InternalGetDigitValues(int ch)
187         {
188             Debug.Assert(ch >= 0 && ch <= 0x10ffff, "ch is not in valid Unicode range.");
189             // Get the level 2 item from the highest 12 bit (8 - 19) of ch.
190             ushort index = s_pNumericLevel1Index[ch >> 8];
191             // Get the level 2 WORD offset from the 4 - 7 bit of ch.  This provides the base offset of the level 3 table.
192             // Note that & has the lower precedence than addition, so don't forget the parathesis.
193             index = s_pNumericLevel1Index[index + ((ch >> 4) & 0x000f)];
194 
195             fixed (ushort* pUshortPtr = &(s_pNumericLevel1Index[index]))
196             {
197                 byte* pBytePtr = (byte*)pUshortPtr;
198                 return s_pDigitValues[pBytePtr[(ch & 0x000f)]];
199             }
200         }
201 
202         ////////////////////////////////////////////////////////////////////////
203         //
204         //Returns the numeric value associated with the character c. If the character is a fraction,
205         // the return value will not be an integer. If the character does not have a numeric value, the return value is -1.
206         //
207         //Returns:
208         //  the numeric value for the specified Unicode character.  If the character does not have a numeric value, the return value is -1.
209         //Arguments:
210         //      ch  a Unicode character
        //Exceptions (String, int overload only; the char overload performs no argument checks):
        //      ArgumentNullException
        //      ArgumentOutOfRangeException
214         //
215         ////////////////////////////////////////////////////////////////////////
216 
217 
GetNumericValue(char ch)218         public static double GetNumericValue(char ch)
219         {
220             return (InternalGetNumericValue(ch));
221         }
222 
223 
GetNumericValue(String s, int index)224         public static double GetNumericValue(String s, int index)
225         {
226             if (s == null)
227             {
228                 throw new ArgumentNullException(nameof(s));
229             }
230             if (index < 0 || index >= s.Length)
231             {
232                 throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_Index);
233             }
234             return (InternalGetNumericValue(InternalConvertToUtf32(s, index)));
235         }
236 
GetDecimalDigitValue(char ch)237         public static int GetDecimalDigitValue(char ch)
238         {
239             return (sbyte)(InternalGetDigitValues(ch) >> 8);
240         }
241 
GetDecimalDigitValue(String s, int index)242         public static int GetDecimalDigitValue(String s, int index)
243         {
244             if (s == null)
245             {
246                 throw new ArgumentNullException(nameof(s));
247             }
248 
249             if (index < 0 || index >= s.Length)
250             {
251                 throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_Index);
252             }
253 
254             return (sbyte)(InternalGetDigitValues(InternalConvertToUtf32(s, index)) >> 8);
255         }
256 
GetDigitValue(char ch)257         public static int GetDigitValue(char ch)
258         {
259             return (sbyte)(InternalGetDigitValues(ch) & 0x00FF);
260         }
261 
GetDigitValue(String s, int index)262         public static int GetDigitValue(String s, int index)
263         {
264             if (s == null)
265             {
266                 throw new ArgumentNullException(nameof(s));
267             }
268 
269             if (index < 0 || index >= s.Length)
270             {
271                 throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_Index);
272             }
273 
274             return (sbyte)(InternalGetDigitValues(InternalConvertToUtf32(s, index)) & 0x00FF);
275         }
276 
GetUnicodeCategory(char ch)277         public static UnicodeCategory GetUnicodeCategory(char ch)
278         {
279             return (GetUnicodeCategory((int)ch));
280         }
281 
GetUnicodeCategory(String s, int index)282         public static UnicodeCategory GetUnicodeCategory(String s, int index)
283         {
284             if (s == null)
285                 throw new ArgumentNullException(nameof(s));
286             if (((uint)index) >= ((uint)s.Length))
287             {
288                 throw new ArgumentOutOfRangeException(nameof(index));
289             }
290             return InternalGetUnicodeCategory(s, index);
291         }
292 
GetUnicodeCategory(int codePoint)293         public static UnicodeCategory GetUnicodeCategory(int codePoint)
294         {
295             return ((UnicodeCategory)InternalGetCategoryValue(codePoint, UNICODE_CATEGORY_OFFSET));
296         }
297 
298 
299         ////////////////////////////////////////////////////////////////////////
300         //
301         //Action: Returns the Unicode Category property for the character c.
302         //Returns:
        //  a value in the UnicodeCategory enum
304         //Arguments:
305         //  ch  a Unicode character
306         //Exceptions:
307         //  None
308         //
        //Note that this API will return values for D800-DFFF surrogate halves.
310         //
311         ////////////////////////////////////////////////////////////////////////
312 
InternalGetCategoryValue(int ch, int offset)313         internal static unsafe byte InternalGetCategoryValue(int ch, int offset)
314         {
315             Debug.Assert(ch >= 0 && ch <= 0x10ffff, "ch is not in valid Unicode range.");
316             // Get the level 2 item from the highest 12 bit (8 - 19) of ch.
317             ushort index = s_pCategoryLevel1Index[ch >> 8];
318             // Get the level 2 WORD offset from the 4 - 7 bit of ch.  This provides the base offset of the level 3 table.
319             // Note that & has the lower precedence than addition, so don't forget the parathesis.
320             index = s_pCategoryLevel1Index[index + ((ch >> 4) & 0x000f)];
321 
322             fixed (ushort* pUshortPtr = &(s_pCategoryLevel1Index[index]))
323             {
324                 byte* pBytePtr = (byte*)pUshortPtr;
325                 // Get the result from the 0 -3 bit of ch.
326                 byte valueIndex = pBytePtr[(ch & 0x000f)];
327                 byte uc = s_pCategoriesValue[valueIndex * 2 + offset];
328                 //
329                 // Make sure that OtherNotAssigned is the last category in UnicodeCategory.
330                 // If that changes, change the following assertion as well.
331                 //
332                 //Debug.Assert(uc >= 0 && uc <= UnicodeCategory.OtherNotAssigned, "Table returns incorrect Unicode category");
333                 return (uc);
334             }
335         }
336 
337         ////////////////////////////////////////////////////////////////////////
338         //
339         //Action: Returns the Unicode Category property for the character c.
340         //Returns:
        //  a value in the UnicodeCategory enum
342         //Arguments:
343         //  value  a Unicode String
344         //  index  Index for the specified string.
345         //Exceptions:
346         //  None
347         //
348         ////////////////////////////////////////////////////////////////////////
349 
InternalGetUnicodeCategory(String value, int index)350         internal static UnicodeCategory InternalGetUnicodeCategory(String value, int index)
351         {
352             Debug.Assert(value != null, "value can not be null");
353             Debug.Assert(index < value.Length, "index < value.Length");
354 
355             return (GetUnicodeCategory(InternalConvertToUtf32(value, index)));
356         }
357 
GetBidiCategory(String s, int index)358         internal static BidiCategory GetBidiCategory(String s, int index)
359         {
360             if (s == null)
361                 throw new ArgumentNullException(nameof(s));
362 
363             if (((uint)index) >= ((uint)s.Length))
364             {
365                 throw new ArgumentOutOfRangeException(nameof(index));
366             }
367 
368             return ((BidiCategory) InternalGetCategoryValue(InternalConvertToUtf32(s, index), BIDI_CATEGORY_OFFSET));
369         }
370 
371         ////////////////////////////////////////////////////////////////////////
372         //
373         // Get the Unicode category of the character starting at index.  If the character is in BMP, charLength will return 1.
374         // If the character is a valid surrogate pair, charLength will return 2.
375         //
376         ////////////////////////////////////////////////////////////////////////
377 
InternalGetUnicodeCategory(String str, int index, out int charLength)378         internal static UnicodeCategory InternalGetUnicodeCategory(String str, int index, out int charLength)
379         {
380             Debug.Assert(str != null, "str can not be null");
381             Debug.Assert(str.Length > 0, "str.Length > 0"); ;
382             Debug.Assert(index >= 0 && index < str.Length, "index >= 0 && index < str.Length");
383 
384             return (GetUnicodeCategory(InternalConvertToUtf32(str, index, out charLength)));
385         }
386 
IsCombiningCategory(UnicodeCategory uc)387         internal static bool IsCombiningCategory(UnicodeCategory uc)
388         {
389             Debug.Assert(uc >= 0, "uc >= 0");
390             return (
391                 uc == UnicodeCategory.NonSpacingMark ||
392                 uc == UnicodeCategory.SpacingCombiningMark ||
393                 uc == UnicodeCategory.EnclosingMark
394             );
395         }
396     }
397 }
398