1 // Licensed to the .NET Foundation under one or more agreements. 2 // The .NET Foundation licenses this file to you under the MIT license. 3 // See the LICENSE file in the project root for more information. 4 5 // This RegexCharClass class provides the "set of Unicode chars" functionality 6 // used by the regexp engine. 7 8 // The main function of RegexCharClass is as a builder to turn ranges, characters and 9 // Unicode categories into a single string. This string is used as a black box 10 // representation of a character class by the rest of Regex. The format is as follows. 11 // 12 // Char index Use 13 // 0 Flags - currently this only holds the "negate" flag 14 // 1 length of the string representing the "set" portion, e.g. [a-z0-9] only has a "set" 15 // 2 length of the string representing the "category" portion, e.g. [\p{Lu}] only has a "category" 16 // 3...m The set. These are a series of ranges which define the characters included in the set. 17 // To determine if a given character is in the set, we binary search over this set of ranges 18 // and see where the character should go. Based on whether the ending index is odd or even, 19 // we know if the character is in the set. 20 // m+1...n The categories. This is a list of UnicodeCategory enum values which describe categories 21 // included in this class. 
22 23 using System.Collections.Generic; 24 using System.Diagnostics; 25 using System.Globalization; 26 using System.IO; 27 28 namespace System.Text.RegularExpressions 29 { 30 internal sealed class RegexCharClass 31 { 32 // instance data 33 private List<SingleRange> _rangelist; 34 private StringBuilder _categories; 35 private bool _canonical; 36 private bool _negate; 37 private RegexCharClass _subtractor; 38 39 // Constants 40 private const int FLAGS = 0; 41 private const int SETLENGTH = 1; 42 private const int CATEGORYLENGTH = 2; 43 private const int SETSTART = 3; 44 45 private const string NullCharString = "\0"; 46 47 private const char NullChar = '\0'; 48 private const char LastChar = '\uFFFF'; 49 50 private const char GroupChar = (char)0; 51 52 53 private const short SpaceConst = 100; 54 private const short NotSpaceConst = -100; 55 56 private const char ZeroWidthJoiner = '\u200D'; 57 private const char ZeroWidthNonJoiner = '\u200C'; 58 59 60 private static readonly string s_internalRegexIgnoreCase = "__InternalRegexIgnoreCase__"; 61 private static readonly string s_space = "\x64"; 62 private static readonly string s_notSpace = "\uFF9C"; 63 private static readonly string s_word = "\u0000\u0002\u0004\u0005\u0003\u0001\u0006\u0009\u0013\u0000"; 64 private static readonly string s_notWord = "\u0000\uFFFE\uFFFC\uFFFB\uFFFD\uFFFF\uFFFA\uFFF7\uFFED\u0000"; 65 66 internal static readonly string SpaceClass = "\u0000\u0000\u0001\u0064"; 67 internal static readonly string NotSpaceClass = "\u0001\u0000\u0001\u0064"; 68 internal static readonly string WordClass = "\u0000\u0000\u000A\u0000\u0002\u0004\u0005\u0003\u0001\u0006\u0009\u0013\u0000"; 69 internal static readonly string NotWordClass = "\u0001\u0000\u000A\u0000\u0002\u0004\u0005\u0003\u0001\u0006\u0009\u0013\u0000"; 70 internal static readonly string DigitClass = "\u0000\u0000\u0001\u0009"; 71 internal static readonly string NotDigitClass = "\u0000\u0000\u0001\uFFF7"; 72 73 private const string ECMASpaceSet = 
"\u0009\u000E\u0020\u0021"; 74 private const string NotECMASpaceSet = "\0\u0009\u000E\u0020\u0021"; 75 private const string ECMAWordSet = "\u0030\u003A\u0041\u005B\u005F\u0060\u0061\u007B\u0130\u0131"; 76 private const string NotECMAWordSet = "\0\u0030\u003A\u0041\u005B\u005F\u0060\u0061\u007B\u0130\u0131"; 77 private const string ECMADigitSet = "\u0030\u003A"; 78 private const string NotECMADigitSet = "\0\u0030\u003A"; 79 80 internal const string ECMASpaceClass = "\x00\x04\x00" + ECMASpaceSet; 81 internal const string NotECMASpaceClass = "\x01\x04\x00" + ECMASpaceSet; 82 internal const string ECMAWordClass = "\x00\x0A\x00" + ECMAWordSet; 83 internal const string NotECMAWordClass = "\x01\x0A\x00" + ECMAWordSet; 84 internal const string ECMADigitClass = "\x00\x02\x00" + ECMADigitSet; 85 internal const string NotECMADigitClass = "\x01\x02\x00" + ECMADigitSet; 86 87 internal const string AnyClass = "\x00\x01\x00\x00"; 88 internal const string EmptyClass = "\x00\x00\x00"; 89 90 // UnicodeCategory is zero based, so we add one to each value and subtract it off later 91 private const int DefinedCategoriesCapacity = 38; 92 private static readonly Dictionary<string, string> s_definedCategories = new Dictionary<string, string>(DefinedCategoriesCapacity) 93 { 94 // Others 95 { "Cc", "\u000F" }, // UnicodeCategory.Control + 1 96 { "Cf", "\u0010" }, // UnicodeCategory.Format + 1 97 { "Cn", "\u001E" }, // UnicodeCategory.OtherNotAssigned + 1 98 { "Co", "\u0012" }, // UnicodeCategory.PrivateUse + 1 99 { "Cs", "\u0011" }, // UnicodeCategory.Surrogate + 1 100 { "C", "\u0000\u000F\u0010\u001E\u0012\u0011\u0000" }, 101 102 // Letters 103 { "Ll", "\u0002" }, // UnicodeCategory.LowercaseLetter + 1 104 { "Lm", "\u0004" }, // UnicodeCategory.ModifierLetter + 1 105 { "Lo", "\u0005" }, // UnicodeCategory.OtherLetter + 1 106 { "Lt", "\u0003" }, // UnicodeCategory.TitlecaseLetter + 1 107 { "Lu", "\u0001" }, // UnicodeCategory.UppercaseLetter + 1 108 { "L", 
"\u0000\u0002\u0004\u0005\u0003\u0001\u0000" }, 109 110 // InternalRegexIgnoreCase = {LowercaseLetter} OR {TitlecaseLetter} OR {UppercaseLetter} 111 // !!!This category should only ever be used in conjunction with RegexOptions.IgnoreCase code paths!!! 112 { "__InternalRegexIgnoreCase__", "\u0000\u0002\u0003\u0001\u0000" }, 113 114 // Marks 115 { "Mc", "\u0007" }, // UnicodeCategory.SpacingCombiningMark + 1 116 { "Me", "\u0008" }, // UnicodeCategory.EnclosingMark + 1 117 { "Mn", "\u0006" }, // UnicodeCategory.NonSpacingMark + 1 118 { "M", "\u0000\u0007\u0008\u0006\u0000" }, 119 120 // Numbers 121 { "Nd", "\u0009" }, // UnicodeCategory.DecimalDigitNumber + 1 122 { "Nl", "\u000A" }, // UnicodeCategory.LetterNumber + 1 123 { "No", "\u000B" }, // UnicodeCategory.OtherNumber + 1 124 { "N", "\u0000\u0009\u000A\u000B\u0000" }, 125 126 // Punctuation 127 { "Pc", "\u0013" }, // UnicodeCategory.ConnectorPunctuation + 1 128 { "Pd", "\u0014" }, // UnicodeCategory.DashPunctuation + 1 129 { "Pe", "\u0016" }, // UnicodeCategory.ClosePunctuation + 1 130 { "Po", "\u0019" }, // UnicodeCategory.OtherPunctuation + 1 131 { "Ps", "\u0015" }, // UnicodeCategory.OpenPunctuation + 1 132 { "Pf", "\u0018" }, // UnicodeCategory.FinalQuotePunctuation + 1 133 { "Pi", "\u0017" }, // UnicodeCategory.InitialQuotePunctuation + 1 134 { "P", "\u0000\u0013\u0014\u0016\u0019\u0015\u0018\u0017\u0000" }, 135 136 // Symbols 137 { "Sc", "\u001B" }, // UnicodeCategory.CurrencySymbol + 1 138 { "Sk", "\u001C" }, // UnicodeCategory.ModifierSymbol + 1 139 { "Sm", "\u001A" }, // UnicodeCategory.MathSymbol + 1 140 { "So", "\u001D" }, // UnicodeCategory.OtherSymbol + 1 141 { "S", "\u0000\u001B\u001C\u001A\u001D\u0000" }, 142 143 // Separators 144 { "Zl", "\u000D" }, // UnicodeCategory.LineSeparator + 1 145 { "Zp", "\u000E" }, // UnicodeCategory.ParagraphSeparator + 1 146 { "Zs", "\u000C" }, // UnicodeCategory.SpaceSeparator + 1 147 { "Z", "\u0000\u000D\u000E\u000C\u0000" }, 148 }; 149 150 /* 151 * The property 
table contains all the block definitions defined in the 152 * XML schema spec (http://www.w3.org/TR/2001/PR-xmlschema-2-20010316/#charcter-classes), Unicode 4.0 spec (www.unicode.org), 153 * and Perl 5.6 (see Programming Perl, 3rd edition page 167). Three blocks defined by Perl (and here) may 154 * not be in the Unicode: IsHighPrivateUseSurrogates, IsHighSurrogates, and IsLowSurrogates. 155 * 156 **/ 157 // Has to be sorted by the first column 158 private static readonly string[][] s_propTable = { 159 new [] {"IsAlphabeticPresentationForms", "\uFB00\uFB50"}, 160 new [] {"IsArabic", "\u0600\u0700"}, 161 new [] {"IsArabicPresentationForms-A", "\uFB50\uFE00"}, 162 new [] {"IsArabicPresentationForms-B", "\uFE70\uFF00"}, 163 new [] {"IsArmenian", "\u0530\u0590"}, 164 new [] {"IsArrows", "\u2190\u2200"}, 165 new [] {"IsBasicLatin", "\u0000\u0080"}, 166 new [] {"IsBengali", "\u0980\u0A00"}, 167 new [] {"IsBlockElements", "\u2580\u25A0"}, 168 new [] {"IsBopomofo", "\u3100\u3130"}, 169 new [] {"IsBopomofoExtended", "\u31A0\u31C0"}, 170 new [] {"IsBoxDrawing", "\u2500\u2580"}, 171 new [] {"IsBraillePatterns", "\u2800\u2900"}, 172 new [] {"IsBuhid", "\u1740\u1760"}, 173 new [] {"IsCJKCompatibility", "\u3300\u3400"}, 174 new [] {"IsCJKCompatibilityForms", "\uFE30\uFE50"}, 175 new [] {"IsCJKCompatibilityIdeographs", "\uF900\uFB00"}, 176 new [] {"IsCJKRadicalsSupplement", "\u2E80\u2F00"}, 177 new [] {"IsCJKSymbolsandPunctuation", "\u3000\u3040"}, 178 new [] {"IsCJKUnifiedIdeographs", "\u4E00\uA000"}, 179 new [] {"IsCJKUnifiedIdeographsExtensionA", "\u3400\u4DC0"}, 180 new [] {"IsCherokee", "\u13A0\u1400"}, 181 new [] {"IsCombiningDiacriticalMarks", "\u0300\u0370"}, 182 new [] {"IsCombiningDiacriticalMarksforSymbols","\u20D0\u2100"}, 183 new [] {"IsCombiningHalfMarks", "\uFE20\uFE30"}, 184 new [] {"IsCombiningMarksforSymbols", "\u20D0\u2100"}, 185 new [] {"IsControlPictures", "\u2400\u2440"}, 186 new [] {"IsCurrencySymbols", "\u20A0\u20D0"}, 187 new [] {"IsCyrillic", 
"\u0400\u0500"}, 188 new [] {"IsCyrillicSupplement", "\u0500\u0530"}, 189 new [] {"IsDevanagari", "\u0900\u0980"}, 190 new [] {"IsDingbats", "\u2700\u27C0"}, 191 new [] {"IsEnclosedAlphanumerics", "\u2460\u2500"}, 192 new [] {"IsEnclosedCJKLettersandMonths", "\u3200\u3300"}, 193 new [] {"IsEthiopic", "\u1200\u1380"}, 194 new [] {"IsGeneralPunctuation", "\u2000\u2070"}, 195 new [] {"IsGeometricShapes", "\u25A0\u2600"}, 196 new [] {"IsGeorgian", "\u10A0\u1100"}, 197 new [] {"IsGreek", "\u0370\u0400"}, 198 new [] {"IsGreekExtended", "\u1F00\u2000"}, 199 new [] {"IsGreekandCoptic", "\u0370\u0400"}, 200 new [] {"IsGujarati", "\u0A80\u0B00"}, 201 new [] {"IsGurmukhi", "\u0A00\u0A80"}, 202 new [] {"IsHalfwidthandFullwidthForms", "\uFF00\uFFF0"}, 203 new [] {"IsHangulCompatibilityJamo", "\u3130\u3190"}, 204 new [] {"IsHangulJamo", "\u1100\u1200"}, 205 new [] {"IsHangulSyllables", "\uAC00\uD7B0"}, 206 new [] {"IsHanunoo", "\u1720\u1740"}, 207 new [] {"IsHebrew", "\u0590\u0600"}, 208 new [] {"IsHighPrivateUseSurrogates", "\uDB80\uDC00"}, 209 new [] {"IsHighSurrogates", "\uD800\uDB80"}, 210 new [] {"IsHiragana", "\u3040\u30A0"}, 211 new [] {"IsIPAExtensions", "\u0250\u02B0"}, 212 new [] {"IsIdeographicDescriptionCharacters", "\u2FF0\u3000"}, 213 new [] {"IsKanbun", "\u3190\u31A0"}, 214 new [] {"IsKangxiRadicals", "\u2F00\u2FE0"}, 215 new [] {"IsKannada", "\u0C80\u0D00"}, 216 new [] {"IsKatakana", "\u30A0\u3100"}, 217 new [] {"IsKatakanaPhoneticExtensions", "\u31F0\u3200"}, 218 new [] {"IsKhmer", "\u1780\u1800"}, 219 new [] {"IsKhmerSymbols", "\u19E0\u1A00"}, 220 new [] {"IsLao", "\u0E80\u0F00"}, 221 new [] {"IsLatin-1Supplement", "\u0080\u0100"}, 222 new [] {"IsLatinExtended-A", "\u0100\u0180"}, 223 new [] {"IsLatinExtended-B", "\u0180\u0250"}, 224 new [] {"IsLatinExtendedAdditional", "\u1E00\u1F00"}, 225 new [] {"IsLetterlikeSymbols", "\u2100\u2150"}, 226 new [] {"IsLimbu", "\u1900\u1950"}, 227 new [] {"IsLowSurrogates", "\uDC00\uE000"}, 228 new [] {"IsMalayalam", 
"\u0D00\u0D80"}, 229 new [] {"IsMathematicalOperators", "\u2200\u2300"}, 230 new [] {"IsMiscellaneousMathematicalSymbols-A","\u27C0\u27F0"}, 231 new [] {"IsMiscellaneousMathematicalSymbols-B","\u2980\u2A00"}, 232 new [] {"IsMiscellaneousSymbols", "\u2600\u2700"}, 233 new [] {"IsMiscellaneousSymbolsandArrows", "\u2B00\u2C00"}, 234 new [] {"IsMiscellaneousTechnical", "\u2300\u2400"}, 235 new [] {"IsMongolian", "\u1800\u18B0"}, 236 new [] {"IsMyanmar", "\u1000\u10A0"}, 237 new [] {"IsNumberForms", "\u2150\u2190"}, 238 new [] {"IsOgham", "\u1680\u16A0"}, 239 new [] {"IsOpticalCharacterRecognition", "\u2440\u2460"}, 240 new [] {"IsOriya", "\u0B00\u0B80"}, 241 new [] {"IsPhoneticExtensions", "\u1D00\u1D80"}, 242 new [] {"IsPrivateUse", "\uE000\uF900"}, 243 new [] {"IsPrivateUseArea", "\uE000\uF900"}, 244 new [] {"IsRunic", "\u16A0\u1700"}, 245 new [] {"IsSinhala", "\u0D80\u0E00"}, 246 new [] {"IsSmallFormVariants", "\uFE50\uFE70"}, 247 new [] {"IsSpacingModifierLetters", "\u02B0\u0300"}, 248 new [] {"IsSpecials", "\uFFF0"}, 249 new [] {"IsSuperscriptsandSubscripts", "\u2070\u20A0"}, 250 new [] {"IsSupplementalArrows-A", "\u27F0\u2800"}, 251 new [] {"IsSupplementalArrows-B", "\u2900\u2980"}, 252 new [] {"IsSupplementalMathematicalOperators", "\u2A00\u2B00"}, 253 new [] {"IsSyriac", "\u0700\u0750"}, 254 new [] {"IsTagalog", "\u1700\u1720"}, 255 new [] {"IsTagbanwa", "\u1760\u1780"}, 256 new [] {"IsTaiLe", "\u1950\u1980"}, 257 new [] {"IsTamil", "\u0B80\u0C00"}, 258 new [] {"IsTelugu", "\u0C00\u0C80"}, 259 new [] {"IsThaana", "\u0780\u07C0"}, 260 new [] {"IsThai", "\u0E00\u0E80"}, 261 new [] {"IsTibetan", "\u0F00\u1000"}, 262 new [] {"IsUnifiedCanadianAboriginalSyllabics","\u1400\u1680"}, 263 new [] {"IsVariationSelectors", "\uFE00\uFE10"}, 264 new [] {"IsYiRadicals", "\uA490\uA4D0"}, 265 new [] {"IsYiSyllables", "\uA000\uA490"}, 266 new [] {"IsYijingHexagramSymbols", "\u4DC0\u4E00"}, 267 new [] {"_xmlC", /* Name Char */ 
"\u002D\u002F\u0030\u003B\u0041\u005B\u005F\u0060\u0061\u007B\u00B7\u00B8\u00C0\u00D7\u00D8\u00F7\u00F8\u0132\u0134\u013F\u0141\u0149\u014A\u017F\u0180\u01C4\u01CD\u01F1\u01F4\u01F6\u01FA\u0218\u0250\u02A9\u02BB\u02C2\u02D0\u02D2\u0300\u0346\u0360\u0362\u0386\u038B\u038C\u038D\u038E\u03A2\u03A3\u03CF\u03D0\u03D7\u03DA\u03DB\u03DC\u03DD\u03DE\u03DF\u03E0\u03E1\u03E2\u03F4\u0401\u040D\u040E\u0450\u0451\u045D\u045E\u0482\u0483\u0487\u0490\u04C5\u04C7\u04C9\u04CB\u04CD\u04D0\u04EC\u04EE\u04F6\u04F8\u04FA\u0531\u0557\u0559\u055A\u0561\u0587\u0591\u05A2\u05A3\u05BA\u05BB\u05BE\u05BF\u05C0\u05C1\u05C3\u05C4\u05C5\u05D0\u05EB\u05F0\u05F3\u0621\u063B\u0640\u0653\u0660\u066A\u0670\u06B8\u06BA\u06BF\u06C0\u06CF\u06D0\u06D4\u06D5\u06E9\u06EA\u06EE\u06F0\u06FA\u0901\u0904\u0905\u093A\u093C\u094E\u0951\u0955\u0958\u0964\u0966\u0970\u0981\u0984\u0985\u098D\u098F\u0991\u0993\u09A9\u09AA\u09B1\u09B2\u09B3\u09B6\u09BA\u09BC\u09BD\u09BE\u09C5\u09C7\u09C9\u09CB\u09CE\u09D7\u09D8\u09DC" 268 +"\u09DE\u09DF\u09E4\u09E6\u09F2\u0A02\u0A03\u0A05\u0A0B\u0A0F\u0A11\u0A13\u0A29\u0A2A\u0A31\u0A32\u0A34\u0A35\u0A37\u0A38\u0A3A\u0A3C\u0A3D\u0A3E\u0A43\u0A47\u0A49\u0A4B\u0A4E\u0A59\u0A5D\u0A5E\u0A5F\u0A66\u0A75\u0A81\u0A84\u0A85\u0A8C\u0A8D\u0A8E\u0A8F\u0A92\u0A93\u0AA9\u0AAA\u0AB1\u0AB2\u0AB4\u0AB5\u0ABA\u0ABC\u0AC6\u0AC7\u0ACA\u0ACB\u0ACE\u0AE0\u0AE1\u0AE6\u0AF0\u0B01\u0B04\u0B05\u0B0D\u0B0F\u0B11\u0B13\u0B29\u0B2A\u0B31\u0B32\u0B34\u0B36\u0B3A\u0B3C\u0B44\u0B47\u0B49\u0B4B\u0B4E\u0B56\u0B58\u0B5C\u0B5E\u0B5F\u0B62\u0B66\u0B70\u0B82\u0B84\u0B85\u0B8B\u0B8E\u0B91\u0B92\u0B96\u0B99\u0B9B\u0B9C\u0B9D\u0B9E\u0BA0\u0BA3\u0BA5\u0BA8\u0BAB\u0BAE\u0BB6\u0BB7\u0BBA\u0BBE\u0BC3\u0BC6\u0BC9\u0BCA\u0BCE\u0BD7\u0BD8\u0BE7\u0BF0\u0C01\u0C04\u0C05\u0C0D\u0C0E\u0C11\u0C12\u0C29\u0C2A\u0C34\u0C35\u0C3A\u0C3E\u0C45\u0C46\u0C49\u0C4A\u0C4E\u0C55\u0C57\u0C60\u0C62\u0C66\u0C70\u0C82\u0C84\u0C85\u0C8D\u0C8E\u0C91\u0C92\u0CA9\u0CAA\u0CB4\u0CB5\u0CBA\u0CBE\u0CC5\u0CC6\u0CC9\u0CCA\u0CCE\u0CD5\u0CD7\u0CDE\u0CDF\u0CE0\u0CE
2" 269 +"\u0CE6\u0CF0\u0D02\u0D04\u0D05\u0D0D\u0D0E\u0D11\u0D12\u0D29\u0D2A\u0D3A\u0D3E\u0D44\u0D46\u0D49\u0D4A\u0D4E\u0D57\u0D58\u0D60\u0D62\u0D66\u0D70\u0E01\u0E2F\u0E30\u0E3B\u0E40\u0E4F\u0E50\u0E5A\u0E81\u0E83\u0E84\u0E85\u0E87\u0E89\u0E8A\u0E8B\u0E8D\u0E8E\u0E94\u0E98\u0E99\u0EA0\u0EA1\u0EA4\u0EA5\u0EA6\u0EA7\u0EA8\u0EAA\u0EAC\u0EAD\u0EAF\u0EB0\u0EBA\u0EBB\u0EBE\u0EC0\u0EC5\u0EC6\u0EC7\u0EC8\u0ECE\u0ED0\u0EDA\u0F18\u0F1A\u0F20\u0F2A\u0F35\u0F36\u0F37\u0F38\u0F39\u0F3A\u0F3E\u0F48\u0F49\u0F6A\u0F71\u0F85\u0F86\u0F8C\u0F90\u0F96\u0F97\u0F98\u0F99\u0FAE\u0FB1\u0FB8\u0FB9\u0FBA\u10A0\u10C6\u10D0\u10F7\u1100\u1101\u1102\u1104\u1105\u1108\u1109\u110A\u110B\u110D\u110E\u1113\u113C\u113D\u113E\u113F\u1140\u1141\u114C\u114D\u114E\u114F\u1150\u1151\u1154\u1156\u1159\u115A\u115F\u1162\u1163\u1164\u1165\u1166\u1167\u1168\u1169\u116A\u116D\u116F\u1172\u1174\u1175\u1176\u119E\u119F\u11A8\u11A9\u11AB\u11AC\u11AE\u11B0\u11B7\u11B9\u11BA\u11BB\u11BC\u11C3\u11EB\u11EC\u11F0\u11F1\u11F9\u11FA\u1E00\u1E9C\u1EA0\u1EFA\u1F00" 270 +"\u1F16\u1F18\u1F1E\u1F20\u1F46\u1F48\u1F4E\u1F50\u1F58\u1F59\u1F5A\u1F5B\u1F5C\u1F5D\u1F5E\u1F5F\u1F7E\u1F80\u1FB5\u1FB6\u1FBD\u1FBE\u1FBF\u1FC2\u1FC5\u1FC6\u1FCD\u1FD0\u1FD4\u1FD6\u1FDC\u1FE0\u1FED\u1FF2\u1FF5\u1FF6\u1FFD\u20D0\u20DD\u20E1\u20E2\u2126\u2127\u212A\u212C\u212E\u212F\u2180\u2183\u3005\u3006\u3007\u3008\u3021\u3030\u3031\u3036\u3041\u3095\u3099\u309B\u309D\u309F\u30A1\u30FB\u30FC\u30FF\u3105\u312D\u4E00\u9FA6\uAC00\uD7A4"}, 271 new [] {"_xmlD", "\u0030\u003A\u0660\u066A\u06F0\u06FA\u0966\u0970\u09E6\u09F0\u0A66\u0A70\u0AE6\u0AF0\u0B66\u0B70\u0BE7\u0BF0\u0C66\u0C70\u0CE6\u0CF0\u0D66\u0D70\u0E50\u0E5A\u0ED0\u0EDA\u0F20\u0F2A\u1040\u104A\u1369\u1372\u17E0\u17EA\u1810\u181A\uFF10\uFF1A"}, 272 new [] {"_xmlI", /* Start Name Char */ 
"\u003A\u003B\u0041\u005B\u005F\u0060\u0061\u007B\u00C0\u00D7\u00D8\u00F7\u00F8\u0132\u0134\u013F\u0141\u0149\u014A\u017F\u0180\u01C4\u01CD\u01F1\u01F4\u01F6\u01FA\u0218\u0250\u02A9\u02BB\u02C2\u0386\u0387\u0388\u038B\u038C\u038D\u038E\u03A2\u03A3\u03CF\u03D0\u03D7\u03DA\u03DB\u03DC\u03DD\u03DE\u03DF\u03E0\u03E1\u03E2\u03F4\u0401\u040D\u040E\u0450\u0451\u045D\u045E\u0482\u0490\u04C5\u04C7\u04C9\u04CB\u04CD\u04D0\u04EC\u04EE\u04F6\u04F8\u04FA\u0531\u0557\u0559\u055A\u0561\u0587\u05D0\u05EB\u05F0\u05F3\u0621\u063B\u0641\u064B\u0671\u06B8\u06BA\u06BF\u06C0\u06CF\u06D0\u06D4\u06D5\u06D6\u06E5\u06E7\u0905\u093A\u093D\u093E\u0958\u0962\u0985\u098D\u098F\u0991\u0993\u09A9\u09AA\u09B1\u09B2\u09B3\u09B6\u09BA\u09DC\u09DE\u09DF\u09E2\u09F0\u09F2\u0A05\u0A0B\u0A0F\u0A11\u0A13\u0A29\u0A2A\u0A31\u0A32\u0A34\u0A35\u0A37\u0A38\u0A3A\u0A59\u0A5D\u0A5E\u0A5F\u0A72\u0A75\u0A85\u0A8C\u0A8D\u0A8E\u0A8F\u0A92\u0A93\u0AA9\u0AAA\u0AB1\u0AB2\u0AB4\u0AB5\u0ABA\u0ABD\u0ABE\u0AE0\u0AE1\u0B05\u0B0D\u0B0F" 273 +"\u0B11\u0B13\u0B29\u0B2A\u0B31\u0B32\u0B34\u0B36\u0B3A\u0B3D\u0B3E\u0B5C\u0B5E\u0B5F\u0B62\u0B85\u0B8B\u0B8E\u0B91\u0B92\u0B96\u0B99\u0B9B\u0B9C\u0B9D\u0B9E\u0BA0\u0BA3\u0BA5\u0BA8\u0BAB\u0BAE\u0BB6\u0BB7\u0BBA\u0C05\u0C0D\u0C0E\u0C11\u0C12\u0C29\u0C2A\u0C34\u0C35\u0C3A\u0C60\u0C62\u0C85\u0C8D\u0C8E\u0C91\u0C92\u0CA9\u0CAA\u0CB4\u0CB5\u0CBA\u0CDE\u0CDF\u0CE0\u0CE2\u0D05\u0D0D\u0D0E\u0D11\u0D12\u0D29\u0D2A\u0D3A\u0D60\u0D62\u0E01\u0E2F\u0E30\u0E31\u0E32\u0E34\u0E40\u0E46\u0E81\u0E83\u0E84\u0E85\u0E87\u0E89\u0E8A\u0E8B\u0E8D\u0E8E\u0E94\u0E98\u0E99\u0EA0\u0EA1\u0EA4\u0EA5\u0EA6\u0EA7\u0EA8\u0EAA\u0EAC\u0EAD\u0EAF\u0EB0\u0EB1\u0EB2\u0EB4\u0EBD\u0EBE\u0EC0\u0EC5\u0F40\u0F48\u0F49\u0F6A\u10A0\u10C6\u10D0\u10F7\u1100\u1101\u1102\u1104\u1105\u1108\u1109\u110A\u110B\u110D\u110E\u1113\u113C\u113D\u113E\u113F\u1140\u1141\u114C\u114D\u114E\u114F\u1150\u1151\u1154\u1156\u1159\u115A\u115F\u1162\u1163\u1164\u1165\u1166\u1167\u1168\u1169\u116A\u116D\u116F\u1172\u1174\u1175\u1176\u119E\u119F\u11A8\u11A
9\u11AB\u11AC" 274 +"\u11AE\u11B0\u11B7\u11B9\u11BA\u11BB\u11BC\u11C3\u11EB\u11EC\u11F0\u11F1\u11F9\u11FA\u1E00\u1E9C\u1EA0\u1EFA\u1F00\u1F16\u1F18\u1F1E\u1F20\u1F46\u1F48\u1F4E\u1F50\u1F58\u1F59\u1F5A\u1F5B\u1F5C\u1F5D\u1F5E\u1F5F\u1F7E\u1F80\u1FB5\u1FB6\u1FBD\u1FBE\u1FBF\u1FC2\u1FC5\u1FC6\u1FCD\u1FD0\u1FD4\u1FD6\u1FDC\u1FE0\u1FED\u1FF2\u1FF5\u1FF6\u1FFD\u2126\u2127\u212A\u212C\u212E\u212F\u2180\u2183\u3007\u3008\u3021\u302A\u3041\u3095\u30A1\u30FB\u3105\u312D\u4E00\u9FA6\uAC00\uD7A4"}, 275 new [] {"_xmlW", "\u0024\u0025\u002B\u002C\u0030\u003A\u003C\u003F\u0041\u005B\u005E\u005F\u0060\u007B\u007C\u007D\u007E\u007F\u00A2\u00AB\u00AC\u00AD\u00AE\u00B7\u00B8\u00BB\u00BC\u00BF\u00C0\u0221\u0222\u0234\u0250\u02AE\u02B0\u02EF\u0300\u0350\u0360\u0370\u0374\u0376\u037A\u037B\u0384\u0387\u0388\u038B\u038C\u038D\u038E\u03A2\u03A3\u03CF\u03D0\u03F7\u0400\u0487\u0488\u04CF\u04D0\u04F6\u04F8\u04FA\u0500\u0510\u0531\u0557\u0559\u055A\u0561\u0588\u0591\u05A2\u05A3\u05BA\u05BB\u05BE\u05BF\u05C0\u05C1\u05C3\u05C4\u05C5\u05D0\u05EB\u05F0\u05F3\u0621\u063B\u0640\u0656\u0660\u066A\u066E\u06D4\u06D5\u06DD\u06DE\u06EE\u06F0\u06FF\u0710\u072D\u0730\u074B\u0780\u07B2\u0901\u0904\u0905\u093A\u093C\u094E\u0950\u0955\u0958\u0964\u0966\u0970\u0981\u0984\u0985\u098D\u098F\u0991\u0993\u09A9\u09AA\u09B1\u09B2\u09B3\u09B6\u09BA\u09BC\u09BD\u09BE\u09C5\u09C7\u09C9\u09CB\u09CE\u09D7\u09D8\u09DC\u09DE\u09DF\u09E4\u09E6\u09FB\u0A02\u0A03\u0A05\u0A0B\u0A0F\u0A11\u0A13\u0A29\u0A2A\u0A31\u0A32\u0A34\u0A35" 276 
+"\u0A37\u0A38\u0A3A\u0A3C\u0A3D\u0A3E\u0A43\u0A47\u0A49\u0A4B\u0A4E\u0A59\u0A5D\u0A5E\u0A5F\u0A66\u0A75\u0A81\u0A84\u0A85\u0A8C\u0A8D\u0A8E\u0A8F\u0A92\u0A93\u0AA9\u0AAA\u0AB1\u0AB2\u0AB4\u0AB5\u0ABA\u0ABC\u0AC6\u0AC7\u0ACA\u0ACB\u0ACE\u0AD0\u0AD1\u0AE0\u0AE1\u0AE6\u0AF0\u0B01\u0B04\u0B05\u0B0D\u0B0F\u0B11\u0B13\u0B29\u0B2A\u0B31\u0B32\u0B34\u0B36\u0B3A\u0B3C\u0B44\u0B47\u0B49\u0B4B\u0B4E\u0B56\u0B58\u0B5C\u0B5E\u0B5F\u0B62\u0B66\u0B71\u0B82\u0B84\u0B85\u0B8B\u0B8E\u0B91\u0B92\u0B96\u0B99\u0B9B\u0B9C\u0B9D\u0B9E\u0BA0\u0BA3\u0BA5\u0BA8\u0BAB\u0BAE\u0BB6\u0BB7\u0BBA\u0BBE\u0BC3\u0BC6\u0BC9\u0BCA\u0BCE\u0BD7\u0BD8\u0BE7\u0BF3\u0C01\u0C04\u0C05\u0C0D\u0C0E\u0C11\u0C12\u0C29\u0C2A\u0C34\u0C35\u0C3A\u0C3E\u0C45\u0C46\u0C49\u0C4A\u0C4E\u0C55\u0C57\u0C60\u0C62\u0C66\u0C70\u0C82\u0C84\u0C85\u0C8D\u0C8E\u0C91\u0C92\u0CA9\u0CAA\u0CB4\u0CB5\u0CBA\u0CBE\u0CC5\u0CC6\u0CC9\u0CCA\u0CCE\u0CD5\u0CD7\u0CDE\u0CDF\u0CE0\u0CE2\u0CE6\u0CF0\u0D02\u0D04\u0D05\u0D0D\u0D0E\u0D11\u0D12\u0D29\u0D2A\u0D3A\u0D3E\u0D44\u0D46\u0D49" 277 
+"\u0D4A\u0D4E\u0D57\u0D58\u0D60\u0D62\u0D66\u0D70\u0D82\u0D84\u0D85\u0D97\u0D9A\u0DB2\u0DB3\u0DBC\u0DBD\u0DBE\u0DC0\u0DC7\u0DCA\u0DCB\u0DCF\u0DD5\u0DD6\u0DD7\u0DD8\u0DE0\u0DF2\u0DF4\u0E01\u0E3B\u0E3F\u0E4F\u0E50\u0E5A\u0E81\u0E83\u0E84\u0E85\u0E87\u0E89\u0E8A\u0E8B\u0E8D\u0E8E\u0E94\u0E98\u0E99\u0EA0\u0EA1\u0EA4\u0EA5\u0EA6\u0EA7\u0EA8\u0EAA\u0EAC\u0EAD\u0EBA\u0EBB\u0EBE\u0EC0\u0EC5\u0EC6\u0EC7\u0EC8\u0ECE\u0ED0\u0EDA\u0EDC\u0EDE\u0F00\u0F04\u0F13\u0F3A\u0F3E\u0F48\u0F49\u0F6B\u0F71\u0F85\u0F86\u0F8C\u0F90\u0F98\u0F99\u0FBD\u0FBE\u0FCD\u0FCF\u0FD0\u1000\u1022\u1023\u1028\u1029\u102B\u102C\u1033\u1036\u103A\u1040\u104A\u1050\u105A\u10A0\u10C6\u10D0\u10F9\u1100\u115A\u115F\u11A3\u11A8\u11FA\u1200\u1207\u1208\u1247\u1248\u1249\u124A\u124E\u1250\u1257\u1258\u1259\u125A\u125E\u1260\u1287\u1288\u1289\u128A\u128E\u1290\u12AF\u12B0\u12B1\u12B2\u12B6\u12B8\u12BF\u12C0\u12C1\u12C2\u12C6\u12C8\u12CF\u12D0\u12D7\u12D8\u12EF\u12F0\u130F\u1310\u1311\u1312\u1316\u1318\u131F\u1320\u1347\u1348\u135B\u1369\u137D\u13A0" 278 
+"\u13F5\u1401\u166D\u166F\u1677\u1681\u169B\u16A0\u16EB\u16EE\u16F1\u1700\u170D\u170E\u1715\u1720\u1735\u1740\u1754\u1760\u176D\u176E\u1771\u1772\u1774\u1780\u17D4\u17D7\u17D8\u17DB\u17DD\u17E0\u17EA\u180B\u180E\u1810\u181A\u1820\u1878\u1880\u18AA\u1E00\u1E9C\u1EA0\u1EFA\u1F00\u1F16\u1F18\u1F1E\u1F20\u1F46\u1F48\u1F4E\u1F50\u1F58\u1F59\u1F5A\u1F5B\u1F5C\u1F5D\u1F5E\u1F5F\u1F7E\u1F80\u1FB5\u1FB6\u1FC5\u1FC6\u1FD4\u1FD6\u1FDC\u1FDD\u1FF0\u1FF2\u1FF5\u1FF6\u1FFF\u2044\u2045\u2052\u2053\u2070\u2072\u2074\u207D\u207F\u208D\u20A0\u20B2\u20D0\u20EB\u2100\u213B\u213D\u214C\u2153\u2184\u2190\u2329\u232B\u23B4\u23B7\u23CF\u2400\u2427\u2440\u244B\u2460\u24FF\u2500\u2614\u2616\u2618\u2619\u267E\u2680\u268A\u2701\u2705\u2706\u270A\u270C\u2728\u2729\u274C\u274D\u274E\u274F\u2753\u2756\u2757\u2758\u275F\u2761\u2768\u2776\u2795\u2798\u27B0\u27B1\u27BF\u27D0\u27E6\u27F0\u2983\u2999\u29D8\u29DC\u29FC\u29FE\u2B00\u2E80\u2E9A\u2E9B\u2EF4\u2F00\u2FD6\u2FF0\u2FFC\u3004\u3008\u3012\u3014\u3020\u3030\u3031\u303D\u303E\u3040" 279 +"\u3041\u3097\u3099\u30A0\u30A1\u30FB\u30FC\u3100\u3105\u312D\u3131\u318F\u3190\u31B8\u31F0\u321D\u3220\u3244\u3251\u327C\u327F\u32CC\u32D0\u32FF\u3300\u3377\u337B\u33DE\u33E0\u33FF\u3400\u4DB6\u4E00\u9FA6\uA000\uA48D\uA490\uA4C7\uAC00\uD7A4\uF900\uFA2E\uFA30\uFA6B\uFB00\uFB07\uFB13\uFB18\uFB1D\uFB37\uFB38\uFB3D\uFB3E\uFB3F\uFB40\uFB42\uFB43\uFB45\uFB46\uFBB2\uFBD3\uFD3E\uFD50\uFD90\uFD92\uFDC8\uFDF0\uFDFD\uFE00\uFE10\uFE20\uFE24\uFE62\uFE63\uFE64\uFE67\uFE69\uFE6A\uFE70\uFE75\uFE76\uFEFD\uFF04\uFF05\uFF0B\uFF0C\uFF10\uFF1A\uFF1C\uFF1F\uFF21\uFF3B\uFF3E\uFF3F\uFF40\uFF5B\uFF5C\uFF5D\uFF5E\uFF5F\uFF66\uFFBF\uFFC2\uFFC8\uFFCA\uFFD0\uFFD2\uFFD8\uFFDA\uFFDD\uFFE0\uFFE7\uFFE8\uFFEF\uFFFC\uFFFE"}, 280 }; 281 282 283 /************************************************************************** 284 Let U be the set of Unicode character values and let L be the lowercase 285 function, mapping from U to U. 
To perform case insensitive matching of 286 character sets, we need to be able to map an interval I in U, say 287 288 I = [chMin, chMax] = { ch : chMin <= ch <= chMax } 289 290 to a set A such that A contains L(I) and A is contained in the union of 291 I and L(I). 292 293 The table below partitions U into intervals on which L is non-decreasing. 294 Thus, for any interval J = [a, b] contained in one of these intervals, 295 L(J) is contained in [L(a), L(b)]. 296 297 It is also true that for any such J, [L(a), L(b)] is contained in the 298 union of J and L(J). This does not follow from L being non-decreasing on 299 these intervals. It follows from the nature of the L on each interval. 300 On each interval, L has one of the following forms: 301 302 (1) L(ch) = constant (LowercaseSet) 303 (2) L(ch) = ch + offset (LowercaseAdd) 304 (3) L(ch) = ch | 1 (LowercaseBor) 305 (4) L(ch) = ch + (ch & 1) (LowercaseBad) 306 307 It is easy to verify that for any of these forms [L(a), L(b)] is 308 contained in the union of [a, b] and L([a, b]). 309 ***************************************************************************/ 310 311 private const int LowercaseSet = 0; // Set to arg. 312 private const int LowercaseAdd = 1; // Add arg. 313 private const int LowercaseBor = 2; // Bitwise or with 1. 314 private const int LowercaseBad = 3; // Bitwise and with 1 and add original. 
// Lowercase mapping table used for case-insensitive sets. Each entry gives a character
// interval, the mapping form that applies on it (LowercaseSet, LowercaseAdd, LowercaseBor
// or LowercaseBad) and that form's argument; see the explanation in the comment above.
private static readonly LowerCaseMapping[] s_lcTable =
{
    new LowerCaseMapping('\u0041', '\u005A', LowercaseAdd, 32),
    new LowerCaseMapping('\u00C0', '\u00DE', LowercaseAdd, 32),
    new LowerCaseMapping('\u0100', '\u012E', LowercaseBor, 0),
    new LowerCaseMapping('\u0130', '\u0130', LowercaseSet, 0x0069),
    new LowerCaseMapping('\u0132', '\u0136', LowercaseBor, 0),
    new LowerCaseMapping('\u0139', '\u0147', LowercaseBad, 0),
    new LowerCaseMapping('\u014A', '\u0176', LowercaseBor, 0),
    new LowerCaseMapping('\u0178', '\u0178', LowercaseSet, 0x00FF),
    new LowerCaseMapping('\u0179', '\u017D', LowercaseBad, 0),
    new LowerCaseMapping('\u0181', '\u0181', LowercaseSet, 0x0253),
    new LowerCaseMapping('\u0182', '\u0184', LowercaseBor, 0),
    new LowerCaseMapping('\u0186', '\u0186', LowercaseSet, 0x0254),
    new LowerCaseMapping('\u0187', '\u0187', LowercaseSet, 0x0188),
    new LowerCaseMapping('\u0189', '\u018A', LowercaseAdd, 205),
    new LowerCaseMapping('\u018B', '\u018B', LowercaseSet, 0x018C),
    new LowerCaseMapping('\u018E', '\u018E', LowercaseSet, 0x01DD),
    new LowerCaseMapping('\u018F', '\u018F', LowercaseSet, 0x0259),
    new LowerCaseMapping('\u0190', '\u0190', LowercaseSet, 0x025B),
    new LowerCaseMapping('\u0191', '\u0191', LowercaseSet, 0x0192),
    new LowerCaseMapping('\u0193', '\u0193', LowercaseSet, 0x0260),
    new LowerCaseMapping('\u0194', '\u0194', LowercaseSet, 0x0263),
    new LowerCaseMapping('\u0196', '\u0196', LowercaseSet, 0x0269),
    new LowerCaseMapping('\u0197', '\u0197', LowercaseSet, 0x0268),
    new LowerCaseMapping('\u0198', '\u0198', LowercaseSet, 0x0199),
    new LowerCaseMapping('\u019C', '\u019C', LowercaseSet, 0x026F),
    new LowerCaseMapping('\u019D', '\u019D', LowercaseSet, 0x0272),
    new LowerCaseMapping('\u019F', '\u019F', LowercaseSet, 0x0275),
    new LowerCaseMapping('\u01A0', '\u01A4', LowercaseBor, 0),
    new LowerCaseMapping('\u01A7', '\u01A7', LowercaseSet, 0x01A8),
    new LowerCaseMapping('\u01A9', '\u01A9', LowercaseSet, 0x0283),
    new LowerCaseMapping('\u01AC', '\u01AC', LowercaseSet, 0x01AD),
    new LowerCaseMapping('\u01AE', '\u01AE', LowercaseSet, 0x0288),
    new LowerCaseMapping('\u01AF', '\u01AF', LowercaseSet, 0x01B0),
    new LowerCaseMapping('\u01B1', '\u01B2', LowercaseAdd, 217),
    new LowerCaseMapping('\u01B3', '\u01B5', LowercaseBad, 0),
    new LowerCaseMapping('\u01B7', '\u01B7', LowercaseSet, 0x0292),
    new LowerCaseMapping('\u01B8', '\u01B8', LowercaseSet, 0x01B9),
    new LowerCaseMapping('\u01BC', '\u01BC', LowercaseSet, 0x01BD),
    new LowerCaseMapping('\u01C4', '\u01C5', LowercaseSet, 0x01C6),
    new LowerCaseMapping('\u01C7', '\u01C8', LowercaseSet, 0x01C9),
    new LowerCaseMapping('\u01CA', '\u01CB', LowercaseSet, 0x01CC),
    new LowerCaseMapping('\u01CD', '\u01DB', LowercaseBad, 0),
    new LowerCaseMapping('\u01DE', '\u01EE', LowercaseBor, 0),
    new LowerCaseMapping('\u01F1', '\u01F2', LowercaseSet, 0x01F3),
    new LowerCaseMapping('\u01F4', '\u01F4', LowercaseSet, 0x01F5),
    new LowerCaseMapping('\u01FA', '\u0216', LowercaseBor, 0),
    new LowerCaseMapping('\u0386', '\u0386', LowercaseSet, 0x03AC),
    new LowerCaseMapping('\u0388', '\u038A', LowercaseAdd, 37),
    new LowerCaseMapping('\u038C', '\u038C', LowercaseSet, 0x03CC),
    new LowerCaseMapping('\u038E', '\u038F', LowercaseAdd, 63),
    new LowerCaseMapping('\u0391', '\u03AB', LowercaseAdd, 32),
    new LowerCaseMapping('\u03E2', '\u03EE', LowercaseBor, 0),
    new LowerCaseMapping('\u0401', '\u040F', LowercaseAdd, 80),
    new LowerCaseMapping('\u0410', '\u042F', LowercaseAdd, 32),
    new LowerCaseMapping('\u0460', '\u0480', LowercaseBor, 0),
    new LowerCaseMapping('\u0490', '\u04BE', LowercaseBor, 0),
    new LowerCaseMapping('\u04C1', '\u04C3', LowercaseBad, 0),
    new LowerCaseMapping('\u04C7', '\u04C7', LowercaseSet, 0x04C8),
    new LowerCaseMapping('\u04CB', '\u04CB', LowercaseSet, 0x04CC),
    new LowerCaseMapping('\u04D0', '\u04EA', LowercaseBor, 0),
    new LowerCaseMapping('\u04EE', '\u04F4', LowercaseBor, 0),
    new LowerCaseMapping('\u04F8', '\u04F8', LowercaseSet, 0x04F9),
    new LowerCaseMapping('\u0531', '\u0556', LowercaseAdd, 48),
    new LowerCaseMapping('\u10A0', '\u10C5', LowercaseAdd, 48),
    new LowerCaseMapping('\u1E00', '\u1EF8', LowercaseBor, 0),
    new LowerCaseMapping('\u1F08', '\u1F0F', LowercaseAdd, -8),
    new LowerCaseMapping('\u1F18', '\u1F1F', LowercaseAdd, -8),
    new LowerCaseMapping('\u1F28', '\u1F2F', LowercaseAdd, -8),
    new LowerCaseMapping('\u1F38', '\u1F3F', LowercaseAdd, -8),
    new LowerCaseMapping('\u1F48', '\u1F4D', LowercaseAdd, -8),
    new LowerCaseMapping('\u1F59', '\u1F59', LowercaseSet, 0x1F51),
    new LowerCaseMapping('\u1F5B', '\u1F5B', LowercaseSet, 0x1F53),
    new LowerCaseMapping('\u1F5D', '\u1F5D', LowercaseSet, 0x1F55),
    new LowerCaseMapping('\u1F5F', '\u1F5F', LowercaseSet, 0x1F57),
    new LowerCaseMapping('\u1F68', '\u1F6F', LowercaseAdd, -8),
    new LowerCaseMapping('\u1F88', '\u1F8F', LowercaseAdd, -8),
    new LowerCaseMapping('\u1F98', '\u1F9F', LowercaseAdd, -8),
    new LowerCaseMapping('\u1FA8', '\u1FAF', LowercaseAdd, -8),
    new LowerCaseMapping('\u1FB8', '\u1FB9', LowercaseAdd, -8),
    new LowerCaseMapping('\u1FBA', '\u1FBB', LowercaseAdd, -74),
    new LowerCaseMapping('\u1FBC', '\u1FBC', LowercaseSet, 0x1FB3),
    new LowerCaseMapping('\u1FC8', '\u1FCB', LowercaseAdd, -86),
    new LowerCaseMapping('\u1FCC', '\u1FCC', LowercaseSet, 0x1FC3),
    new LowerCaseMapping('\u1FD8', '\u1FD9', LowercaseAdd, -8),
    new LowerCaseMapping('\u1FDA', '\u1FDB', LowercaseAdd, -100),
    new LowerCaseMapping('\u1FE8', '\u1FE9', LowercaseAdd, -8),
    new LowerCaseMapping('\u1FEA', '\u1FEB', LowercaseAdd, -112),
    new LowerCaseMapping('\u1FEC', '\u1FEC', LowercaseSet, 0x1FE5),
    new LowerCaseMapping('\u1FF8', '\u1FF9', LowercaseAdd, -128),
    new LowerCaseMapping('\u1FFA', '\u1FFB', LowercaseAdd, -126),
    new LowerCaseMapping('\u1FFC', '\u1FFC', LowercaseSet, 0x1FF3),
    new LowerCaseMapping('\u2160', '\u216F', LowercaseAdd, 16),
    new LowerCaseMapping('\u24B6', '\u24D0', LowercaseAdd, 26),
    new LowerCaseMapping('\uFF21', '\uFF3A', LowercaseAdd, 32),
};

#if DEBUG
/// <summary>
/// Debug-only type initializer that sanity-checks the static tables: the capacity constant
/// used to pre-size s_definedCategories must equal its entry count, and the names in
/// s_propTable must be in strictly ascending ordinal order.
/// </summary>
static RegexCharClass()
{
    // The dictionary is constructed with DefinedCategoriesCapacity; verify the constant
    // still matches the number of entries actually declared.
    Debug.Assert(
        s_definedCategories.Count == DefinedCategoriesCapacity,
        "RegexCharClass s_definedCategories's initial capacity (DefinedCategoriesCapacity) is incorrect.",
        "Expected (s_definedCategories.Count): {0}, Actual (DefinedCategoriesCapacity): {1}",
        s_definedCategories.Count,
        DefinedCategoriesCapacity);

    // Walk adjacent pairs of property names; each must sort strictly before its successor.
    for (int next = 1; next < s_propTable.Length; next++)
    {
        string earlier = s_propTable[next - 1][0];
        string later = s_propTable[next][0];

        Debug.Assert(
            string.Compare(earlier, later, StringComparison.Ordinal) < 0,
            "RegexCharClass s_propTable is out of order at (" + earlier + ", " + later + ")");
    }
}
#endif

/// <summary>
/// Creates an empty character class.
/// </summary>
internal RegexCharClass()
{
    _canonical = true;
    _categories = new StringBuilder();
    _rangelist = new List<SingleRange>(6);
}

/// <summary>
/// Creates a character class from already-built parts; the supplied range list
/// is marked as canonical without being checked.
/// </summary>
/// <param name="negate">Whether the class is negated.</param>
/// <param name="ranges">The range list to adopt (stored, not copied).</param>
/// <param name="categories">The category string builder to adopt (stored, not copied).</param>
/// <param name="subtraction">The class to subtract, or <c>null</c> for none.</param>
private RegexCharClass(bool negate, List<SingleRange> ranges, StringBuilder categories, RegexCharClass subtraction)
{
    _negate = negate;
    _rangelist = ranges;
    _categories = categories;
    _subtractor = subtraction;
    _canonical = true;
}

/// <summary>
/// <c>true</c> when this class is neither negated nor carries a subtraction.
/// </summary>
internal bool CanMerge => _subtractor == null && !_negate;

/// <summary>
/// Sets the negation flag of this class.
/// </summary>
internal bool Negate
{
    set => _negate = value;
}

/// <summary>
/// Adds a single character to the class (as a one-character range).
/// </summary>
internal void AddChar(char c) => AddRange(c, c);

/// <summary>
/// Appends the ranges and categories of another character class to this one.
/// </summary>
/// <param name="cc">The class to merge in; both classes must satisfy <see cref="CanMerge"/>.</param>
internal void AddCharClass(RegexCharClass cc)
{
    Debug.Assert(cc.CanMerge && CanMerge, "Both character classes added together must be able to merge");

    if (_canonical)
    {
        // We remain canonical only if the incoming class is canonical and its first
        // range starts strictly after the end of our last range.
        bool staysCanonical = cc._canonical;

        if (staysCanonical && _rangelist.Count > 0 && cc._rangelist.Count > 0)
        {
            SingleRange ourLast = _rangelist[_rangelist.Count - 1];
            staysCanonical = cc._rangelist[0]._first > ourLast._last;
        }

        _canonical = staysCanonical;
    }

    for (int index = 0; index < cc._rangelist.Count; index++)
    {
        _rangelist.Add(cc._rangelist[index]);
    }

    _categories.Append(cc._categories.ToString());
}

/// <summary>
/// Adds a set (specified by its string representation) to the class.
/// </summary>
/// <param name="set">
/// Alternating boundaries: each even index is an inclusive range start and the following
/// char is the exclusive range end. A trailing unpaired start extends to <c>LastChar</c>.
/// </param>
private void AddSet(string set)
{
    int length = set.Length;

    if (_canonical && _rangelist.Count > 0 && length > 0 &&
        set[0] <= _rangelist[_rangelist.Count - 1]._last)
    {
        _canonical = false;
    }

    int pos = 0;
    while (pos + 1 < length)
    {
        char lower = set[pos];
        char upperExclusive = set[pos + 1];
        _rangelist.Add(new SingleRange(lower, (char)(upperExclusive - 1)));
        pos += 2;
    }

    if (pos < length)
    {
        // Odd number of boundaries: the final range is open-ended.
        _rangelist.Add(new SingleRange(set[pos], LastChar));
    }
}

/// <summary>
/// Attaches the class to be subtracted from this one. Only one subtraction may be set.
/// </summary>
internal void AddSubtraction(RegexCharClass sub)
{
    Debug.Assert(_subtractor == null, "Can't add two subtractions to a char class. ");
    _subtractor = sub;
}

/// <summary>
/// Adds a single range of characters to the class.
/// </summary>
/// <param name="first">First character of the range (inclusive).</param>
/// <param name="last">Last character of the range (inclusive).</param>
internal void AddRange(char first, char last)
{
    _rangelist.Add(new SingleRange(first, last));

    if (!_canonical)
    {
        return;
    }

    // NOTE(review): the range was appended above, so the "last" element examined here is the
    // range just added. The test therefore holds for every range with first <= last, and the
    // class is re-canonicalized on the next ToStringClass call. Kept as-is to preserve behavior.
    SingleRange tail = _rangelist[_rangelist.Count - 1];
    if (first <= tail._last)
    {
        _canonical = false;
    }
}

/// <summary>
/// Adds a Unicode category or named property to the class.
/// </summary>
/// <param name="categoryName">Name to look up, first in s_definedCategories and otherwise via SetFromProperty.</param>
/// <param name="invert">Whether the negated form of the category/property is added.</param>
/// <param name="caseInsensitive">When set, Ll, Lu and Lt are all replaced by the internal ignore-case category.</param>
/// <param name="pattern">Passed to SetFromProperty, which uses it in the exception message for unknown names.</param>
internal void AddCategoryFromName(string categoryName, bool invert, bool caseInsensitive, string pattern)
{
    bool isDefined = s_definedCategories.TryGetValue(categoryName, out string category);

    // The internal ignore-case pseudo category is never addressable by name.
    if (!isDefined || categoryName.Equals(s_internalRegexIgnoreCase))
    {
        AddSet(SetFromProperty(categoryName, invert, pattern));
        return;
    }

    // when RegexOptions.IgnoreCase is specified then {Ll}, {Lu}, and {Lt} cases should all match
    if (caseInsensitive &&
        (categoryName.Equals("Ll") || categoryName.Equals("Lu") || categoryName.Equals("Lt")))
    {
        category = s_definedCategories[s_internalRegexIgnoreCase];
    }

    if (invert)
    {
        category = NegateCategory(category);
    }

    _categories.Append(category);
}

/// <summary>
/// Appends an already-encoded category string to the class.
/// </summary>
private void AddCategory(string category) => _categories.Append(category);

/// <summary>
/// Adds to the class any lowercase versions of characters already
/// in the class. Used for case-insensitivity.
/// </summary>
/// <param name="culture">Culture whose TextInfo lowercases single-character ranges.</param>
internal void AddLowercase(CultureInfo culture)
{
    _canonical = false;

    // Snapshot the count: AddLowercaseRange appends new ranges that must not be revisited.
    int originalCount = _rangelist.Count;

    for (int index = 0; index < originalCount; index++)
    {
        SingleRange current = _rangelist[index];

        if (current._first != current._last)
        {
            AddLowercaseRange(current._first, current._last, culture);
            continue;
        }

        // A single character is replaced in place by its lowercase form.
        char lowered = culture.TextInfo.ToLower(current._first);
        _rangelist[index] = new SingleRange(lowered, lowered);
    }
}

/// <summary>
/// For a single range that's in the set, adds any additional ranges
/// necessary to ensure that lowercase equivalents are also included.
/// </summary>
/// <param name="chMin">First character of the range (inclusive).</param>
/// <param name="chMax">Last character of the range (inclusive).</param>
/// <param name="culture">Not used by this method; mappings come from s_lcTable.</param>
private void AddLowercaseRange(char chMin, char chMax, CultureInfo culture)
{
    // Binary search for the first table entry whose upper bound is not below chMin.
    int low = 0;
    int high = s_lcTable.Length;
    while (low < high)
    {
        int middle = (low + high) / 2;
        if (s_lcTable[middle]._chMax < chMin)
        {
            low = middle + 1;
        }
        else
        {
            high = middle;
        }
    }

    // Walk every entry that overlaps [chMin, chMax].
    for (int index = low; index < s_lcTable.Length; index++)
    {
        LowerCaseMapping mapping = s_lcTable[index];
        if (mapping._chMin > chMax)
        {
            break;
        }

        // Clip the entry to the requested range before mapping it.
        char mappedMin = mapping._chMin < chMin ? chMin : mapping._chMin;
        char mappedMax = mapping._chMax > chMax ? chMax : mapping._chMax;

        switch (mapping._lcOp)
        {
            case LowercaseSet:
                // The whole entry maps to one fixed character.
                mappedMin = (char)mapping._data;
                mappedMax = (char)mapping._data;
                break;

            case LowercaseAdd:
                // Constant offset; negative offsets rely on 16-bit wrap-around.
                unchecked
                {
                    mappedMin += (char)mapping._data;
                    mappedMax += (char)mapping._data;
                }
                break;

            case LowercaseBor:
                // Force the low bit on.
                mappedMin |= (char)1;
                mappedMax |= (char)1;
                break;

            case LowercaseBad:
                // Add the low bit (odd code points move up by one).
                mappedMin += (char)(mappedMin & 1);
                mappedMax += (char)(mappedMax & 1);
                break;
        }

        // Only add the mapped range if it reaches outside the original one.
        if (mappedMin < chMin || mappedMax > chMax)
        {
            AddRange(mappedMin, mappedMax);
        }
    }
}

/// <summary>
/// Adds the word class (\w) or its negation (\W), using the ECMA set or the Unicode categories.
/// </summary>
internal void AddWord(bool ecma, bool negate)
{
    if (ecma)
    {
        AddSet(negate ? NotECMAWordSet : ECMAWordSet);
    }
    else
    {
        AddCategory(negate ? s_notWord : s_word);
    }
}

/// <summary>
/// Adds the whitespace class (\s) or its negation (\S), using the ECMA set or the space category.
/// </summary>
internal void AddSpace(bool ecma, bool negate)
{
    if (ecma)
    {
        AddSet(negate ? NotECMASpaceSet : ECMASpaceSet);
    }
    else
    {
        AddCategory(negate ? s_notSpace : s_space);
    }
}

/// <summary>
/// Adds the digit class (\d) or its negation (\D), using the ECMA set or the "Nd" category.
/// </summary>
internal void AddDigit(bool ecma, bool negate, string pattern)
{
    if (!ecma)
    {
        AddCategoryFromName("Nd", negate, false, pattern);
        return;
    }

    AddSet(negate ? NotECMADigitSet : ECMADigitSet);
}

/// <summary>
/// Builds a class string (flags, set length, category length, set, categories) from a
/// separate set string and category string. A set starting with two NUL chars is
/// emitted as negated with those two chars removed.
/// </summary>
internal static string ConvertOldStringsToClass(string set, string category)
{
    StringBuilder sb = StringBuilderCache.Acquire(set.Length + category.Length + 3);

    bool negated = set.Length >= 2 && set[0] == '\0' && set[1] == '\0';
    string ranges = negated ? set.Substring(2) : set;

    sb.Append(negated ? (char)0x1 : (char)0x0);
    sb.Append((char)ranges.Length);
    sb.Append((char)category.Length);
    sb.Append(ranges);
    sb.Append(category);

    return StringBuilderCache.GetStringAndRelease(sb);
}

/// <summary>
/// Returns the single character of a singleton (or inverse singleton) class string.
/// </summary>
internal static char SingletonChar(string set)
{
    Debug.Assert(IsSingleton(set) || IsSingletonInverse(set), "Tried to get the singleton char out of a non singleton character class");
    return set[SETSTART];
}

/// <summary>
/// <c>true</c> if the class string is neither negated nor has a subtraction.
/// </summary>
internal static bool IsMergeable(string charClass) => !(IsNegated(charClass) || IsSubtraction(charClass));

/// <summary>
/// <c>true</c> if the class string has no categories, no flags, no ranges and no subtraction.
/// </summary>
internal static bool IsEmpty(string charClass)
{
    return charClass[CATEGORYLENGTH] == 0
        && charClass[FLAGS] == 0
        && charClass[SETLENGTH] == 0
        && !IsSubtraction(charClass);
}

/// <summary>
/// <c>true</c> if the set contains a single character only
/// </summary>
internal static bool IsSingleton(string set)
{
    return set[FLAGS] == 0
        && set[CATEGORYLENGTH] == 0
        && set[SETLENGTH] == 2
        && !IsSubtraction(set)
        && (set[SETSTART] == LastChar || set[SETSTART] + 1 == set[SETSTART + 1]);
}

/// <summary>
/// <c>true</c> if the set is the negation of a single character
/// </summary>
internal static bool IsSingletonInverse(string set)
{
    return set[FLAGS] == 1
        && set[CATEGORYLENGTH] == 0
        && set[SETLENGTH] == 2
        && !IsSubtraction(set)
        && (set[SETSTART] == LastChar || set[SETSTART] + 1 == set[SETSTART + 1]);
}

/// <summary>
/// <c>true</c> if the string continues past its own header, set and categories,
/// i.e. a subtracted class follows.
/// </summary>
private static bool IsSubtraction(string charClass) =>
    charClass.Length > SETSTART + charClass[SETLENGTH] + charClass[CATEGORYLENGTH];

/// <summary>
/// <c>true</c> if the class string is non-null and its flags char is 1.
/// </summary>
internal static bool IsNegated(string set) => set != null && set[FLAGS] == 1;

/// <summary>
/// Tests membership in the ECMA word class.
/// </summary>
internal static bool IsECMAWordChar(char ch)
{
    // According to ECMA-262, \s, \S, ., ^, and $ use Unicode-based interpretations of
    // whitespace and newline, while \d, \D, \w, \W, \b, and \B use ASCII-only
    // interpretations of digit, word character, and word boundary. In other words,
    // no special treatment of Unicode ZERO WIDTH NON-JOINER (ZWNJ U+200C) and
    // ZERO WIDTH JOINER (ZWJ U+200D) is required for ECMA word boundaries.
    return CharInClass(ch, ECMAWordClass);
}

/// <summary>
/// Tests membership in the Unicode word class, extended with ZWJ and ZWNJ.
/// </summary>
internal static bool IsWordChar(char ch)
{
    // According to UTS#18 Unicode Regular Expressions (http://www.unicode.org/reports/tr18/)
    // RL 1.4 Simple Word Boundaries The class of <word_character> includes all Alphabetic
    // values from the Unicode character database, from UnicodeData.txt [UData], plus the U+200C
    // ZERO WIDTH NON-JOINER and U+200D ZERO WIDTH JOINER.
    return CharInClass(ch, WordClass) || ch == ZeroWidthJoiner || ch == ZeroWidthNonJoiner;
}

/// <summary>
/// Tests whether a character belongs to the class described by a class string.
/// </summary>
internal static bool CharInClass(char ch, string set) => CharInClassRecursive(ch, set, 0);

/// <summary>
/// Tests membership against the class that starts at <paramref name="start"/> within
/// <paramref name="set"/>, taking any trailing subtracted class into account.
/// </summary>
internal static bool CharInClassRecursive(char ch, string set, int start)
{
    int mySetLength = set[start + SETLENGTH];
    int myCategoryLength = set[start + CATEGORYLENGTH];
    int myEndPosition = start + SETSTART + mySetLength + myCategoryLength;

    // Anything after this class's own data is the class to subtract.
    bool inSubtraction = set.Length > myEndPosition && CharInClassRecursive(ch, set, myEndPosition);

    bool inBase = CharInClassInternal(ch, set, start, mySetLength, myCategoryLength);

    // Note that we apply the negation *before* performing the subtraction. This is because
    // the negation only applies to the first char class, not the entire subtraction.
    bool negated = set[start + FLAGS] == 1;

    return (inBase != negated) && !inSubtraction;
}

/// <summary>
/// Determines a character's membership in a character class (via the
/// string representation of the class).
/// </summary>
/// <param name="ch">Character to test.</param>
/// <param name="set">Class string.</param>
/// <param name="start">Index in <paramref name="set"/> where this class's header begins.</param>
/// <param name="mySetLength">Length of the range portion.</param>
/// <param name="myCategoryLength">Length of the category portion.</param>
private static bool CharInClassInternal(char ch, string set, int start, int mySetLength, int myCategoryLength)
{
    // Binary search for the position just past the last boundary that is <= ch.
    int low = start + SETSTART;
    int high = low + mySetLength;

    while (low != high)
    {
        int middle = (low + high) / 2;
        if (ch < set[middle])
        {
            high = middle;
        }
        else
        {
            low = middle + 1;
        }
    }

    // Boundaries alternate between "range starts" and "range ends", so the parity of the
    // final position tells whether ch sits inside a range. The set begins at start + SETSTART;
    // because SETSTART is odd, the comparison can be made against start alone. If SETSTART
    // ever becomes even, this check must be reversed.
    Debug.Assert((SETSTART & 0x1) == 1, "If SETSTART is not odd, the calculation below this will be reversed");
    bool inRanges = (low & 0x1) == (start & 0x1);

    return inRanges
        || (myCategoryLength != 0 && CharInCategory(ch, set, start, mySetLength, myCategoryLength));
}

/// <summary>
/// Tests a character against the category portion of a class string.
/// </summary>
private static bool CharInCategory(char ch, string set, int start, int mySetLength, int myCategoryLength)
{
    UnicodeCategory chcategory = CharUnicodeInfo.GetUnicodeCategory(ch);

    int firstCategory = start + SETSTART + mySetLength;
    int end = firstCategory + myCategoryLength;

    for (int i = firstCategory; i < end; i++)
    {
        int code = unchecked((short)set[i]);

        if (code == 0)
        {
            // zero is our marker for a group of categories - treated as a unit.
            // The call leaves i on the group's closing marker.
            if (CharInCategoryGroup(ch, chcategory, set, ref i))
            {
                return true;
            }
        }
        else if (code > 0)
        {
            // greater than zero is a positive case
            if (code == SpaceConst)
            {
                if (char.IsWhiteSpace(ch))
                {
                    return true;
                }
            }
            else if (chcategory == (UnicodeCategory)(code - 1))
            {
                // Positive codes are the UnicodeCategory value plus one.
                return true;
            }
        }
        else
        {
            // less than zero is a negative case
            if (code == NotSpaceConst)
            {
                if (!char.IsWhiteSpace(ch))
                {
                    return true;
                }
            }
            else if (chcategory != (UnicodeCategory)(-1 - code))
            {
                // Negative codes are the negated positive code: category = -code - 1.
                return true;
            }
        }
    }

    return false;
}

/// <summary>
/// This is used for categories which are composed of other categories - L, N, Z, W...
/// These groups need special treatment when they are negated
/// </summary>
/// <param name="ch">Character being tested (not used by the comparison itself).</param>
/// <param name="chcategory">Unicode category of the character.</param>
/// <param name="category">String containing the group.</param>
/// <param name="i">On entry, index of the group's opening zero; on exit, index of its closing zero.</param>
private static bool CharInCategoryGroup(char ch, UnicodeCategory chcategory, string category, ref int i)
{
    i++;

    int code = unchecked((short)category[i]);

    // The sign of the first member decides how the whole group is read:
    //   positive - the character must be in ANY of the categories in the group
    //   negative - the character must be in NONE of the categories in the group
    bool positiveGroup = code > 0;
    bool matched = false;

    while (code != 0)
    {
        // Once a member matched the answer is fixed; keep scanning only to reach the terminator.
        if (!matched)
        {
            int target = positiveGroup ? code - 1 : -1 - code;
            matched = chcategory == (UnicodeCategory)target;
        }

        i++;
        code = unchecked((short)category[i]);
    }

    return positiveGroup ? matched : !matched;
}

/// <summary>
/// Returns the category string with every char arithmetically negated (as a 16-bit value),
/// or <c>null</c> for a <c>null</c> input.
/// </summary>
private static string NegateCategory(string category)
{
    if (category == null)
    {
        return null;
    }

    StringBuilder sb = StringBuilderCache.Acquire(category.Length);

    foreach (char code in category)
    {
        short signedCode = unchecked((short)code);
        sb.Append(unchecked((char)-signedCode));
    }

    return StringBuilderCache.GetStringAndRelease(sb);
}

/// <summary>
/// Rebuilds a <see cref="RegexCharClass"/> from its string representation.
/// </summary>
internal static RegexCharClass Parse(string charClass) => ParseRecursive(charClass, 0);

/// <summary>
/// Parses the class whose header begins at <paramref name="start"/>, then recursively
/// parses any data that follows it as the subtracted class.
/// </summary>
private static RegexCharClass ParseRecursive(string charClass, int start)
{
    int mySetLength = charClass[start + SETLENGTH];
    int myCategoryLength = charClass[start + CATEGORYLENGTH];
    int setStart = start + SETSTART;
    int setEnd = setStart + mySetLength;
    int myEndPosition = setEnd + myCategoryLength;

    List<SingleRange> ranges = new List<SingleRange>(mySetLength);

    // Boundaries come in (inclusive start, exclusive end) pairs; a missing end means LastChar.
    for (int pos = setStart; pos < setEnd; pos += 2)
    {
        char first = charClass[pos];
        char last = pos + 1 < setEnd ? (char)(charClass[pos + 1] - 1) : LastChar;
        ranges.Add(new SingleRange(first, last));
    }

    RegexCharClass sub = charClass.Length > myEndPosition
        ? ParseRecursive(charClass, myEndPosition)
        : null;

    bool negated = charClass[start + FLAGS] == 1;
    StringBuilder categories = new StringBuilder(charClass.Substring(setEnd, myCategoryLength));

    return new RegexCharClass(negated, ranges, categories, sub);
}

/// <summary>
/// The number of single ranges that have been accumulated so far.
/// </summary>
private int RangeCount() => _rangelist.Count;

/// <summary>
/// Constructs the string representation of the class.
/// </summary>
internal string ToStringClass()
{
    if (!_canonical)
    {
        Canonicalize();
    }

    // Two chars per range is only an upper bound: a range ending in LastChar
    // contributes no end boundary. The real length is patched in below.
    int estimatedSetLength = _rangelist.Count * 2;
    StringBuilder sb = StringBuilderCache.Acquire(estimatedSetLength + _categories.Length + 3);

    int flags = _negate ? 1 : 0;

    sb.Append((char)flags);
    sb.Append((char)estimatedSetLength);
    sb.Append((char)_categories.Length);

    foreach (SingleRange range in _rangelist)
    {
        sb.Append(range._first);

        if (range._last != LastChar)
        {
            // The end boundary is stored exclusive.
            sb.Append((char)(range._last + 1));
        }
    }

    sb[SETLENGTH] = (char)(sb.Length - SETSTART);

    sb.Append(_categories);

    if (_subtractor != null)
    {
        sb.Append(_subtractor.ToStringClass());
    }

    return StringBuilderCache.GetStringAndRelease(sb);
}

/// <summary>
/// The ith range.
/// </summary>
private SingleRange GetRangeAt(int i) => _rangelist[i];

/// <summary>
/// Logic to reduce a character class to a unique, sorted form.
/// </summary>
private void Canonicalize()
{
    _canonical = true;
    _rangelist.Sort(SingleRangeComparer.Instance);

    if (_rangelist.Count <= 1)
    {
        return;
    }

    //
    // Find and eliminate overlapping or abutting ranges
    //

    // 'write' is the slot of the merged range being built and 'last' its running end;
    // 'read' scans the remaining sorted ranges.
    int write = 0;
    char last = _rangelist[0]._last;

    // Once the merged range reaches LastChar every remaining range is covered by it.
    for (int read = 1; read < _rangelist.Count && last != LastChar; read++)
    {
        SingleRange next = _rangelist[read];

        if (next._first > last + 1)
        {
            // Gap found: close the current merged range and start a new one.
            _rangelist[write] = new SingleRange(_rangelist[write]._first, last);
            write++;
            _rangelist[write] = next;
            last = next._last;
        }
        else if (last < next._last)
        {
            // Overlapping or abutting: extend the current merged range.
            last = next._last;
        }
    }

    _rangelist[write] = new SingleRange(_rangelist[write]._first, last);

    int keep = write + 1;
    _rangelist.RemoveRange(keep, _rangelist.Count - keep);
}

/// <summary>
/// Looks up a named property in s_propTable (binary search, ordinal comparison)
/// and returns its set string, optionally inverted.
/// </summary>
/// <param name="capname">Property name to find.</param>
/// <param name="invert">
/// When set, the set is inverted by toggling a leading NUL boundary: it is removed
/// if present and prepended otherwise.
/// </param>
/// <param name="pattern">Pattern text used in the exception message.</param>
/// <exception cref="ArgumentException">The name is not in s_propTable.</exception>
private static string SetFromProperty(string capname, bool invert, string pattern)
{
    int low = 0;
    int high = s_propTable.Length;

    while (low != high)
    {
        int middle = (low + high) / 2;
        int comparison = string.Compare(capname, s_propTable[middle][0], StringComparison.Ordinal);

        if (comparison < 0)
        {
            high = middle;
        }
        else if (comparison > 0)
        {
            low = middle + 1;
        }
        else
        {
            string set = s_propTable[middle][1];
            Debug.Assert(!string.IsNullOrEmpty(set), "Found a null/empty element in RegexCharClass prop table");

            if (!invert)
            {
                return set;
            }

            return set[0] == NullChar ? set.Substring(1) : NullCharString + set;
        }
    }

    throw new ArgumentException(SR.Format(SR.MakeException, pattern, SR.Format(SR.UnknownProperty, capname)));
}

#if DEBUG

/// <summary>
/// Produces a human-readable description for a set string.
/// </summary>
/// <param name="set">Class string (flags, set length, category length, set, categories, optional subtraction).</param>
/// <returns>A bracketed description such as <c>[a-z\p{Lu}]</c>.</returns>
internal static string SetDescription(string set)
{
    int mySetLength = set[SETLENGTH];
    int myCategoryLength = set[CATEGORYLENGTH];
    int setEnd = SETSTART + mySetLength;
    int myEndPosition = setEnd + myCategoryLength;

    StringBuilder desc = new StringBuilder();

    desc.Append('[');

    if (IsNegated(set))
        desc.Append('^');

    int index = SETSTART;

    while (index < setEnd)
    {
        char ch1 = set[index];

        // The end boundary must come from the set portion only. When the set has an odd
        // length the last range is open-ended, and the next char belongs to the
        // categories (or to a subtraction), not to this range.
        char ch2 = index + 1 < setEnd ? (char)(set[index + 1] - 1) : LastChar;

        desc.Append(CharDescription(ch1));

        if (ch2 != ch1)
        {
            if (ch1 + 1 != ch2)
                desc.Append('-');
            desc.Append(CharDescription(ch2));
        }

        index += 2;
    }

    // Stepping by two overshoots by one after an odd-length set; realign so the
    // first category is not skipped.
    index = setEnd;

    while (index < myEndPosition)
    {
        char categoryCode = set[index];

        if (categoryCode == 0)
        {
            // A zero opens a category group that runs up to the next zero.
            bool found = false;

            int lastindex = set.IndexOf(GroupChar, index + 1);
            string group = set.Substring(index, lastindex - index + 1);

            foreach (var kvp in s_definedCategories)
            {
                if (group.Equals(kvp.Value))
                {
                    if ((short)set[index + 1] > 0)
                        desc.Append("\\p{");
                    else
                        desc.Append("\\P{");

                    desc.Append(kvp.Key);
                    desc.Append('}');

                    found = true;
                    break;
                }
            }

            if (!found)
            {
                if (group.Equals(s_word))
                    desc.Append("\\w");
                else if (group.Equals(s_notWord))
                    desc.Append("\\W");
                else
                    Debug.Fail("Couldn't find a group to match '" + group + "'");
            }

            index = lastindex;
        }
        else
        {
            desc.Append(CategoryDescription(categoryCode));
        }

        index++;
    }

    if (set.Length > myEndPosition)
    {
        desc.Append('-');
        desc.Append(SetDescription(set.Substring(myEndPosition)));
    }

    desc.Append(']');

    return desc.ToString();
}

internal static readonly char[] Hex = new char[] { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' };

// Indexed by the numeric value of UnicodeCategory (category code - 1), so the entries must
// follow the enum order exactly. The internal ignore-case pseudo category is a group, not a
// single category code, and must not occupy a slot here: it would shift every name after "Lo".
internal static readonly string[] Categories = new string[] {"Lu", "Ll", "Lt", "Lm", "Lo",
                                                             "Mn", "Mc", "Me",
                                                             "Nd", "Nl", "No",
                                                             "Zs", "Zl", "Zp",
                                                             "Cc", "Cf", "Cs", "Co",
                                                             "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po",
                                                             "Sm", "Sc", "Sk", "So",
                                                             "Cn" };

/// <summary>
/// Produces a human-readable description for a single character.
/// </summary>
/// <returns>
/// The character itself for printable ASCII (backslash is doubled), otherwise a
/// <c>\xHH</c> escape below 256 or a <c>\uHHHH</c> escape.
/// </returns>
internal static string CharDescription(char ch)
{
    if (ch == '\\')
        return "\\\\";

    if (ch >= ' ' && ch <= '~')
    {
        return ch.ToString();
    }

    var sb = new StringBuilder();
    int shift;

    if (ch < 256)
    {
        sb.Append("\\x");
        shift = 8;
    }
    else
    {
        sb.Append("\\u");
        shift = 16;
    }

    // Emit one hex digit per nibble, most significant first.
    while (shift > 0)
    {
        shift -= 4;
        sb.Append(Hex[(ch >> shift) & 0xF]);
    }

    return sb.ToString();
}

/// <summary>
/// Produces a human-readable description for a single (non-group) category code.
/// </summary>
private static string CategoryDescription(char ch)
{
    short code = (short)ch;

    if (ch == SpaceConst)
        return "\\s";

    if (code == NotSpaceConst)
        return "\\S";

    // Positive codes are category + 1; negative codes are the negation of that.
    if (code < 0)
        return "\\P{" + Categories[(-code - 1)] + "}";

    return "\\p{" + Categories[(ch - 1)] + "}";
}

#endif

/// <summary>
/// Lower case mapping descriptor.
/// </summary>
private readonly struct LowerCaseMapping
{
    // Inclusive bounds of the source characters this entry applies to.
    internal readonly char _chMin;
    internal readonly char _chMax;

    // Operation selector (compared against the Lowercase* constants) and its operand.
    internal readonly int _lcOp;
    internal readonly int _data;

    internal LowerCaseMapping(char chMin, char chMax, int lcOp, int data)
    {
        _data = data;
        _lcOp = lcOp;
        _chMax = chMax;
        _chMin = chMin;
    }
}

/// <summary>
/// For sorting ranges; compare based on the first char in the range.
/// </summary>
private sealed class SingleRangeComparer : IComparer<SingleRange>
{
    // Single shared instance; the constructor is private.
    public static readonly SingleRangeComparer Instance = new SingleRangeComparer();

    private SingleRangeComparer()
    {
    }

    public int Compare(SingleRange x, SingleRange y) => x._first.CompareTo(y._first);
}

/// <summary>
/// A first/last pair representing a single range of characters.
/// </summary>
private readonly struct SingleRange
{
    // Both ends are inclusive.
    internal readonly char _first;
    internal readonly char _last;

    internal SingleRange(char first, char last)
    {
        _last = last;
        _first = first;
    }
}
}
}