// Licensed to the .NET Foundation under one or more agreements.
// The .NET Foundation licenses this file to you under the MIT license.
// See the LICENSE file in the project root for more information.

////////////////////////////////////////////////////////////////////////////
//
//
//  Purpose:  This class implements a set of methods for retrieving
//            character type information.  Character type information is
//            independent of culture and region.
//
//
////////////////////////////////////////////////////////////////////////////

using System.Diagnostics;

namespace System.Globalization
{
    public static partial class CharUnicodeInfo
    {
        //--------------------------------------------------------------------//
        //                        Internal Information                        //
        //--------------------------------------------------------------------//

        //
        // Boundaries of the UTF-16 high (lead) and low (trail) surrogate ranges.
        //
        internal const char HIGH_SURROGATE_START = '\ud800';
        internal const char HIGH_SURROGATE_END = '\udbff';
        internal const char LOW_SURROGATE_START = '\udc00';
        internal const char LOW_SURROGATE_END = '\udfff';

        //
        // Offsets passed to InternalGetCategoryValue. Each entry of the category
        // value table is a pair of bytes; the offset selects which byte of the
        // pair is returned.
        //
        internal const int UNICODE_CATEGORY_OFFSET = 0;
        internal const int BIDI_CATEGORY_OFFSET = 1;

        // The starting codepoint for Unicode plane 1.  Plane 1 contains 0x010000 ~ 0x01ffff.
        internal const int UNICODE_PLANE01_START = 0x10000;

        // The largest valid Unicode code point.
        private const int UNICODE_LAST_CODEPOINT = 0x10ffff;


        ////////////////////////////////////////////////////////////////////////
        //
        // Actions:
        // Convert the BMP character or surrogate pair pointed to by index to a UTF32 value.
        // This is similar to Char.ConvertToUTF32, but the difference is that
        // it does not throw exceptions when invalid surrogate characters are passed in.
        //
        // Parameters:
        //      s       The string (must not be null)
        //      index   The starting index; must satisfy 0 <= index < s.Length.
        //
        // Returns:
        //      The UTF32 value
        //
        // WARNING: since it doesn't throw an exception it CAN return a value
        //          in the surrogate range D800-DFFF, which are not legal unicode values.
        //
        ////////////////////////////////////////////////////////////////////////

        internal static int InternalConvertToUtf32(String s, int index)
        {
            // Same decoding as the three-argument overload; the character length
            // is simply not needed by callers of this overload.
            int charLength;
            return InternalConvertToUtf32(s, index, out charLength);
        }

        ////////////////////////////////////////////////////////////////////////
        //
        // Convert a character or a surrogate pair starting at index of string s
        // to UTF32 value.
        //
        // Parameters:
        //      s           The string (must not be null or empty)
        //      index       The starting index.  It can point to a BMP character or
        //                  a surrogate pair.  Must satisfy 0 <= index < s.Length.
        //      charLength  [out]   If the index points to a BMP char (or to an
        //                  unpaired surrogate), charLength will be 1.  If the index
        //                  points to a surrogate pair, charLength will be 2.
        //
        // WARNING: since it doesn't throw an exception it CAN return a value
        //          in the surrogate range D800-DFFF, which are not legal unicode values.
        //
        // Returns:
        //      The UTF32 value
        //
        ////////////////////////////////////////////////////////////////////////

        internal static int InternalConvertToUtf32(String s, int index, out int charLength)
        {
            Debug.Assert(s != null, "s != null");
            Debug.Assert(s.Length > 0, "s.Length > 0");
            Debug.Assert(index >= 0 && index < s.Length, "index >= 0 && index < s.Length");

            charLength = 1;

            // A surrogate pair needs a second code unit, so only look for one
            // when index is not the last position in the string.
            if (index < s.Length - 1)
            {
                int highOffset = (int)s[index] - HIGH_SURROGATE_START;
                if (highOffset >= 0 && highOffset <= 0x3ff)
                {
                    int lowOffset = (int)s[index + 1] - LOW_SURROGATE_START;
                    if (lowOffset >= 0 && lowOffset <= 0x3ff)
                    {
                        // Convert the surrogate pair to UTF32 and get the result.
                        charLength++;
                        return (highOffset * 0x400) + lowOffset + UNICODE_PLANE01_START;
                    }
                }
            }

            // BMP character, or a surrogate that is not part of a valid pair.
            return (int)s[index];
        }

        ////////////////////////////////////////////////////////////////////////
        //
        //  IsWhiteSpace
        //
        //  Determines if the character (or surrogate pair) at the given index
        //  of the string is a white space character.
        //
        ////////////////////////////////////////////////////////////////////////

        internal static bool IsWhiteSpace(String s, int index)
        {
            Debug.Assert(s != null, "s!=null");
            Debug.Assert(index >= 0 && index < s.Length, "index >= 0 && index < s.Length");

            return IsWhiteSpaceCategory(GetUnicodeCategory(s, index));
        }

        ////////////////////////////////////////////////////////////////////////
        //
        //  IsWhiteSpace
        //
        //  Determines if the given character is a white space character.
        //
        ////////////////////////////////////////////////////////////////////////

        internal static bool IsWhiteSpace(char c)
        {
            return IsWhiteSpaceCategory(GetUnicodeCategory(c));
        }

        //
        // Returns true when the category is one of the three separator categories
        // that the IsWhiteSpace overloads treat as white space.
        //
        private static bool IsWhiteSpaceCategory(UnicodeCategory uc)
        {
            // In Unicode 3.0, U+2028 is the only character which is under the category "LineSeparator".
            // And U+2029 is the only character which is under the category "ParagraphSeparator".
            switch (uc)
            {
                case UnicodeCategory.SpaceSeparator:
                case UnicodeCategory.LineSeparator:
                case UnicodeCategory.ParagraphSeparator:
                    return true;
            }

            return false;
        }

        //
        // Walks the multi-level numeric index table for ch and returns the byte
        // stored for it at the last level.  Callers use that byte as an index into
        // s_pNumericValues (as doubles) or s_pDigitValues.
        //
        // NOTE(review): the ushort table is reinterpreted as bytes, so the result
        // depends on the in-memory byte order of the table - confirm before
        // targeting a platform with a different endianness.
        //
        private static unsafe byte InternalGetNumericValueIndex(int ch)
        {
            Debug.Assert(ch >= 0 && ch <= UNICODE_LAST_CODEPOINT, "ch is not in valid Unicode range.");

            // Get the level 2 item from the bits of ch above the lowest 8 (ch >> 8).
            ushort index = s_pNumericLevel1Index[ch >> 8];

            // Get the level 2 WORD offset from the 4 - 7 bit of ch.  This provides the base offset of the level 3 table.
            // Note that & has lower precedence than addition, so don't forget the parentheses.
            index = s_pNumericLevel1Index[index + ((ch >> 4) & 0x000f)];

            fixed (ushort* pUshortPtr = &(s_pNumericLevel1Index[index]))
            {
                // Level 3 is addressed byte-wise by the 0 - 3 bit of ch.
                byte* pBytePtr = (byte*)pUshortPtr;
                return pBytePtr[ch & 0x000f];
            }
        }

        //
        // This is called by the public char and string, index versions
        //
        // Note that for ch in the range D800-DFFF we just treat it as any other non-numeric character
        //
        internal static unsafe double InternalGetNumericValue(int ch)
        {
            byte valueIndex = InternalGetNumericValueIndex(ch);

            // s_pNumericValues is a byte table that is read here as an array of doubles.
            fixed (byte* pByteNum = s_pNumericValues)
            {
                double* pDouble = (double*)pByteNum;
                return pDouble[valueIndex];
            }
        }

        //
        // Returns the packed digit information for ch.  As consumed by the public
        // getters below: the high byte is the decimal digit value and the low byte
        // is the digit value, each interpreted as a signed byte.
        //
        internal static unsafe ushort InternalGetDigitValues(int ch)
        {
            return s_pDigitValues[InternalGetNumericValueIndex(ch)];
        }

        ////////////////////////////////////////////////////////////////////////
        //
        //Returns the numeric value associated with the character c. If the character is a fraction,
        // the return value will not be an integer. If the character does not have a numeric value, the return value is -1.
        //
        //Returns:
        //  the numeric value for the specified Unicode character.  If the character does not have a numeric value, the return value is -1.
        //Arguments:
        //      ch  a Unicode character
        //Exceptions:
        //      None
        //
        ////////////////////////////////////////////////////////////////////////

        public static double GetNumericValue(char ch)
        {
            return InternalGetNumericValue(ch);
        }

        ////////////////////////////////////////////////////////////////////////
        //
        //Returns the numeric value associated with the character (or surrogate pair)
        // at the given index of the string.
        //
        //Arguments:
        //      s       a string
        //      index   the position of the character in s
        //Exceptions:
        //      ArgumentNullException           s is null
        //      ArgumentOutOfRangeException     index is outside of s
        //
        ////////////////////////////////////////////////////////////////////////

        public static double GetNumericValue(String s, int index)
        {
            if (s == null)
            {
                throw new ArgumentNullException(nameof(s));
            }

            if (index < 0 || index >= s.Length)
            {
                throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_Index);
            }

            return InternalGetNumericValue(InternalConvertToUtf32(s, index));
        }

        //
        // Returns the high byte of the packed digit information for ch as a signed value.
        //
        public static int GetDecimalDigitValue(char ch)
        {
            // The narrowing is intentionally unchecked: a stored byte of 0xFF wraps to -1.
            // NOTE(review): presumably 0xFF marks "no decimal digit value" - confirm against the data table.
            return unchecked((sbyte)(InternalGetDigitValues(ch) >> 8));
        }

        //
        // Returns the high byte of the packed digit information for the character
        // (or surrogate pair) at the given index of the string as a signed value.
        //
        // Exceptions:
        //      ArgumentNullException           s is null
        //      ArgumentOutOfRangeException     index is outside of s
        //
        public static int GetDecimalDigitValue(String s, int index)
        {
            if (s == null)
            {
                throw new ArgumentNullException(nameof(s));
            }

            if (index < 0 || index >= s.Length)
            {
                throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_Index);
            }

            // Intentionally unchecked: a stored byte of 0xFF wraps to -1.
            return unchecked((sbyte)(InternalGetDigitValues(InternalConvertToUtf32(s, index)) >> 8));
        }

        //
        // Returns the low byte of the packed digit information for ch as a signed value.
        //
        public static int GetDigitValue(char ch)
        {
            // Intentionally unchecked: a stored byte of 0xFF wraps to -1.
            return unchecked((sbyte)(InternalGetDigitValues(ch) & 0x00FF));
        }

        //
        // Returns the low byte of the packed digit information for the character
        // (or surrogate pair) at the given index of the string as a signed value.
        //
        // Exceptions:
        //      ArgumentNullException           s is null
        //      ArgumentOutOfRangeException     index is outside of s
        //
        public static int GetDigitValue(String s, int index)
        {
            if (s == null)
            {
                throw new ArgumentNullException(nameof(s));
            }

            if (index < 0 || index >= s.Length)
            {
                throw new ArgumentOutOfRangeException(nameof(index), SR.ArgumentOutOfRange_Index);
            }

            // Intentionally unchecked: a stored byte of 0xFF wraps to -1.
            return unchecked((sbyte)(InternalGetDigitValues(InternalConvertToUtf32(s, index)) & 0x00FF));
        }

        //
        // Returns the Unicode category of the given UTF-16 code unit.
        // Surrogate code units (D800-DFFF) are looked up like any other value.
        //
        public static UnicodeCategory GetUnicodeCategory(char ch)
        {
            return GetUnicodeCategory((int)ch);
        }

        //
        // Returns the Unicode category of the character (or surrogate pair) at the
        // given index of the string.
        //
        // Exceptions:
        //      ArgumentNullException           s is null
        //      ArgumentOutOfRangeException     index is outside of s
        //
        public static UnicodeCategory GetUnicodeCategory(String s, int index)
        {
            if (s == null)
            {
                throw new ArgumentNullException(nameof(s));
            }

            // A single unsigned comparison rejects both negative and too-large indexes.
            if (((uint)index) >= ((uint)s.Length))
            {
                throw new ArgumentOutOfRangeException(nameof(index));
            }

            return InternalGetUnicodeCategory(s, index);
        }

        //
        // Returns the Unicode category of the given code point.
        //
        // Exceptions:
        //      ArgumentOutOfRangeException     codePoint is not in 0 ~ 0x10ffff
        //
        public static UnicodeCategory GetUnicodeCategory(int codePoint)
        {
            // Validate before indexing the lookup table: this is a public entry
            // point, and the table lookup itself only guards its input with a
            // Debug.Assert.  The unsigned comparison also rejects negative values.
            if (((uint)codePoint) > ((uint)UNICODE_LAST_CODEPOINT))
            {
                throw new ArgumentOutOfRangeException(nameof(codePoint));
            }

            return (UnicodeCategory)InternalGetCategoryValue(codePoint, UNICODE_CATEGORY_OFFSET);
        }


        ////////////////////////////////////////////////////////////////////////
        //
        //Action: Returns the category value stored for the character ch.
        //Returns:
        //  the byte selected by offset from the category value pair of ch
        //  (UNICODE_CATEGORY_OFFSET or BIDI_CATEGORY_OFFSET)
        //Arguments:
        //      ch      a Unicode code point (0 ~ 0x10ffff)
        //      offset  which byte of the value pair to return
        //Exceptions:
        //      None
        //
        //Note that this API will return values for D800-DFFF surrogate halves.
        //
        //NOTE(review): the ushort table is reinterpreted as bytes, so the result
        //depends on the in-memory byte order of the table - confirm before
        //targeting a platform with a different endianness.
        //
        ////////////////////////////////////////////////////////////////////////

        internal static unsafe byte InternalGetCategoryValue(int ch, int offset)
        {
            Debug.Assert(ch >= 0 && ch <= UNICODE_LAST_CODEPOINT, "ch is not in valid Unicode range.");

            // Get the level 2 item from the bits of ch above the lowest 8 (ch >> 8).
            ushort index = s_pCategoryLevel1Index[ch >> 8];

            // Get the level 2 WORD offset from the 4 - 7 bit of ch.  This provides the base offset of the level 3 table.
            // Note that & has lower precedence than addition, so don't forget the parentheses.
            index = s_pCategoryLevel1Index[index + ((ch >> 4) & 0x000f)];

            fixed (ushort* pUshortPtr = &(s_pCategoryLevel1Index[index]))
            {
                byte* pBytePtr = (byte*)pUshortPtr;

                // Get the result from the 0 - 3 bit of ch.
                byte valueIndex = pBytePtr[ch & 0x000f];

                // Each value entry is a pair of bytes; offset picks one of them.
                // No range assertion on the result: the same lookup serves both
                // offsets, so the byte is not always a UnicodeCategory value.
                return s_pCategoriesValue[valueIndex * 2 + offset];
            }
        }

        ////////////////////////////////////////////////////////////////////////
        //
        //Action: Returns the Unicode Category property for the character
        //        (or surrogate pair) at the given index.
        //Returns:
        //  a value in UnicodeCategory enum
        //Arguments:
        //      value  a Unicode String (must not be null)
        //      index  Index for the specified string; must satisfy 0 <= index < value.Length.
        //Exceptions:
        //      None
        //
        ////////////////////////////////////////////////////////////////////////

        internal static UnicodeCategory InternalGetUnicodeCategory(String value, int index)
        {
            Debug.Assert(value != null, "value can not be null");
            Debug.Assert(index >= 0 && index < value.Length, "index >= 0 && index < value.Length");

            return GetUnicodeCategory(InternalConvertToUtf32(value, index));
        }

        //
        // Returns the bidi category of the character (or surrogate pair) at the
        // given index of the string.
        //
        // Exceptions:
        //      ArgumentNullException           s is null
        //      ArgumentOutOfRangeException     index is outside of s
        //
        internal static BidiCategory GetBidiCategory(String s, int index)
        {
            if (s == null)
            {
                throw new ArgumentNullException(nameof(s));
            }

            // A single unsigned comparison rejects both negative and too-large indexes.
            if (((uint)index) >= ((uint)s.Length))
            {
                throw new ArgumentOutOfRangeException(nameof(index));
            }

            return (BidiCategory)InternalGetCategoryValue(InternalConvertToUtf32(s, index), BIDI_CATEGORY_OFFSET);
        }

        ////////////////////////////////////////////////////////////////////////
        //
        // Get the Unicode category of the character starting at index.  If the character is in BMP, charLength will return 1.
        // If the character is a valid surrogate pair, charLength will return 2.
        //
        ////////////////////////////////////////////////////////////////////////

        internal static UnicodeCategory InternalGetUnicodeCategory(String str, int index, out int charLength)
        {
            Debug.Assert(str != null, "str can not be null");
            Debug.Assert(str.Length > 0, "str.Length > 0");
            Debug.Assert(index >= 0 && index < str.Length, "index >= 0 && index < str.Length");

            return GetUnicodeCategory(InternalConvertToUtf32(str, index, out charLength));
        }

        //
        // Returns true when the category is one of the three mark categories
        // (NonSpacingMark, SpacingCombiningMark, EnclosingMark).
        //
        internal static bool IsCombiningCategory(UnicodeCategory uc)
        {
            Debug.Assert(uc >= 0, "uc >= 0");

            return uc == UnicodeCategory.NonSpacingMark ||
                   uc == UnicodeCategory.SpacingCombiningMark ||
                   uc == UnicodeCategory.EnclosingMark;
        }
    }
}