1 //------------------------------------------------------------------------------ 2 // <copyright file="RegexCharClass.cs" company="Microsoft"> 3 // Copyright (c) Microsoft Corporation. All rights reserved. 4 // </copyright> 5 //------------------------------------------------------------------------------ 6 7 // This RegexCharClass class provides the "set of Unicode chars" functionality 8 // used by the regexp engine. 9 10 // The main function of RegexCharClass is as a builder to turn ranges, characters and 11 // Unicode categories into a single string. This string is used as a black box 12 // representation of a character class by the rest of Regex. The format is as follows. 13 // 14 // Char index Use 15 // 0 Flags - currently this only holds the "negate" flag 16 // 1 length of the string representing the "set" portion, eg [a-z0-9] only has a "set" 17 // 2 length of the string representing the "category" portion, eg [\p{Lu}] only has a "category" 18 // 3...m The set. These are a series of ranges which define the characters included in the set. 19 // To determine if a given character is in the set, we binary search over this set of ranges 20 // and see where the character should go. Based on whether the ending index is odd or even, 21 // we know if the character is in the set. 22 // m+1...n The categories. This is a list of UnicodeCategory enum values which describe categories 23 // included in this class. 
24 25 namespace System.Text.RegularExpressions { 26 27 using System.Collections; 28 using System.Collections.Generic; 29 using System.Globalization; 30 using System.Diagnostics; 31 32 internal sealed class RegexCharClass { 33 // instance data 34 private List<SingleRange> _rangelist; 35 private StringBuilder _categories; 36 private bool _canonical; 37 private bool _negate; 38 private RegexCharClass _subtractor; 39 40 // Constants 41 private const int FLAGS = 0; 42 private const int SETLENGTH = 1; 43 private const int CATEGORYLENGTH = 2; 44 private const int SETSTART = 3; 45 46 private const char Nullchar = '\0'; 47 private const char Lastchar = '\uFFFF'; 48 49 private const char GroupChar = (char) 0; 50 51 52 private const short SpaceConst = 100; 53 private const short NotSpaceConst = -100; 54 55 private const char ZeroWidthJoiner = '\u200D'; 56 private const char ZeroWidthNonJoiner = '\u200C'; 57 58 59 private static readonly String InternalRegexIgnoreCase = "__InternalRegexIgnoreCase__"; 60 private static readonly String Space = "\x64"; 61 private static readonly String NotSpace = NegateCategory(Space); 62 private static readonly String Word; 63 private static readonly String NotWord; 64 65 internal static readonly String SpaceClass; 66 internal static readonly String NotSpaceClass; 67 internal static readonly String WordClass; 68 internal static readonly String NotWordClass; 69 internal static readonly String DigitClass; 70 internal static readonly String NotDigitClass; 71 72 private const String ECMASpaceSet = "\u0009\u000E\u0020\u0021"; 73 private const String NotECMASpaceSet = "\0\u0009\u000E\u0020\u0021"; 74 private const String ECMAWordSet = "\u0030\u003A\u0041\u005B\u005F\u0060\u0061\u007B\u0130\u0131"; 75 private const String NotECMAWordSet = "\0\u0030\u003A\u0041\u005B\u005F\u0060\u0061\u007B\u0130\u0131"; 76 private const String ECMADigitSet = "\u0030\u003A"; 77 private const String NotECMADigitSet = "\0\u0030\u003A"; 78 79 internal const String 
ECMASpaceClass = "\x00\x04\x00" + ECMASpaceSet; 80 internal const String NotECMASpaceClass = "\x01\x04\x00" + ECMASpaceSet; 81 internal const String ECMAWordClass = "\x00\x0A\x00" + ECMAWordSet; 82 internal const String NotECMAWordClass = "\x01\x0A\x00" + ECMAWordSet; 83 internal const String ECMADigitClass = "\x00\x02\x00" + ECMADigitSet; 84 internal const String NotECMADigitClass = "\x01\x02\x00" + ECMADigitSet; 85 86 internal const String AnyClass = "\x00\x01\x00\x00"; 87 internal const String EmptyClass = "\x00\x00\x00"; 88 89 static Dictionary<String, String> _definedCategories; 90 91 /* 92 * The property table contains all the block definitions defined in the 93 * XML schema spec (http://www.w3.org/TR/2001/PR-xmlschema-2-20010316/#charcter-classes), Unicode 4.0 spec (www.unicode.org), 94 * and Perl 5.6 (see Programming Perl, 3rd edition page 167). Three blocks defined by Perl (and here) may 95 * not be in the Unicode: IsHighPrivateUseSurrogates, IsHighSurrogates, and IsLowSurrogates. 
96 * 97 **/ 98 // Has to be sorted by the first column 99 private static readonly String[,] _propTable = { 100 {"IsAlphabeticPresentationForms", "\uFB00\uFB50"}, 101 {"IsArabic", "\u0600\u0700"}, 102 {"IsArabicPresentationForms-A", "\uFB50\uFE00"}, 103 {"IsArabicPresentationForms-B", "\uFE70\uFF00"}, 104 {"IsArmenian", "\u0530\u0590"}, 105 {"IsArrows", "\u2190\u2200"}, 106 {"IsBasicLatin", "\u0000\u0080"}, 107 {"IsBengali", "\u0980\u0A00"}, 108 {"IsBlockElements", "\u2580\u25A0"}, 109 {"IsBopomofo", "\u3100\u3130"}, 110 {"IsBopomofoExtended", "\u31A0\u31C0"}, 111 {"IsBoxDrawing", "\u2500\u2580"}, 112 {"IsBraillePatterns", "\u2800\u2900"}, 113 {"IsBuhid", "\u1740\u1760"}, 114 {"IsCJKCompatibility", "\u3300\u3400"}, 115 {"IsCJKCompatibilityForms", "\uFE30\uFE50"}, 116 {"IsCJKCompatibilityIdeographs", "\uF900\uFB00"}, 117 {"IsCJKRadicalsSupplement", "\u2E80\u2F00"}, 118 {"IsCJKSymbolsandPunctuation", "\u3000\u3040"}, 119 {"IsCJKUnifiedIdeographs", "\u4E00\uA000"}, 120 {"IsCJKUnifiedIdeographsExtensionA", "\u3400\u4DC0"}, 121 {"IsCherokee", "\u13A0\u1400"}, 122 {"IsCombiningDiacriticalMarks", "\u0300\u0370"}, 123 {"IsCombiningDiacriticalMarksforSymbols","\u20D0\u2100"}, 124 {"IsCombiningHalfMarks", "\uFE20\uFE30"}, 125 {"IsCombiningMarksforSymbols", "\u20D0\u2100"}, 126 {"IsControlPictures", "\u2400\u2440"}, 127 {"IsCurrencySymbols", "\u20A0\u20D0"}, 128 {"IsCyrillic", "\u0400\u0500"}, 129 {"IsCyrillicSupplement", "\u0500\u0530"}, 130 {"IsDevanagari", "\u0900\u0980"}, 131 {"IsDingbats", "\u2700\u27C0"}, 132 {"IsEnclosedAlphanumerics", "\u2460\u2500"}, 133 {"IsEnclosedCJKLettersandMonths", "\u3200\u3300"}, 134 {"IsEthiopic", "\u1200\u1380"}, 135 {"IsGeneralPunctuation", "\u2000\u2070"}, 136 {"IsGeometricShapes", "\u25A0\u2600"}, 137 {"IsGeorgian", "\u10A0\u1100"}, 138 {"IsGreek", "\u0370\u0400"}, 139 {"IsGreekExtended", "\u1F00\u2000"}, 140 {"IsGreekandCoptic", "\u0370\u0400"}, 141 {"IsGujarati", "\u0A80\u0B00"}, 142 {"IsGurmukhi", "\u0A00\u0A80"}, 143 
{"IsHalfwidthandFullwidthForms", "\uFF00\uFFF0"}, 144 {"IsHangulCompatibilityJamo", "\u3130\u3190"}, 145 {"IsHangulJamo", "\u1100\u1200"}, 146 {"IsHangulSyllables", "\uAC00\uD7B0"}, 147 {"IsHanunoo", "\u1720\u1740"}, 148 {"IsHebrew", "\u0590\u0600"}, 149 {"IsHighPrivateUseSurrogates", "\uDB80\uDC00"}, 150 {"IsHighSurrogates", "\uD800\uDB80"}, 151 {"IsHiragana", "\u3040\u30A0"}, 152 {"IsIPAExtensions", "\u0250\u02B0"}, 153 {"IsIdeographicDescriptionCharacters", "\u2FF0\u3000"}, 154 {"IsKanbun", "\u3190\u31A0"}, 155 {"IsKangxiRadicals", "\u2F00\u2FE0"}, 156 {"IsKannada", "\u0C80\u0D00"}, 157 {"IsKatakana", "\u30A0\u3100"}, 158 {"IsKatakanaPhoneticExtensions", "\u31F0\u3200"}, 159 {"IsKhmer", "\u1780\u1800"}, 160 {"IsKhmerSymbols", "\u19E0\u1A00"}, 161 {"IsLao", "\u0E80\u0F00"}, 162 {"IsLatin-1Supplement", "\u0080\u0100"}, 163 {"IsLatinExtended-A", "\u0100\u0180"}, 164 {"IsLatinExtended-B", "\u0180\u0250"}, 165 {"IsLatinExtendedAdditional", "\u1E00\u1F00"}, 166 {"IsLetterlikeSymbols", "\u2100\u2150"}, 167 {"IsLimbu", "\u1900\u1950"}, 168 {"IsLowSurrogates", "\uDC00\uE000"}, 169 {"IsMalayalam", "\u0D00\u0D80"}, 170 {"IsMathematicalOperators", "\u2200\u2300"}, 171 {"IsMiscellaneousMathematicalSymbols-A","\u27C0\u27F0"}, 172 {"IsMiscellaneousMathematicalSymbols-B","\u2980\u2A00"}, 173 {"IsMiscellaneousSymbols", "\u2600\u2700"}, 174 {"IsMiscellaneousSymbolsandArrows", "\u2B00\u2C00"}, 175 {"IsMiscellaneousTechnical", "\u2300\u2400"}, 176 {"IsMongolian", "\u1800\u18B0"}, 177 {"IsMyanmar", "\u1000\u10A0"}, 178 {"IsNumberForms", "\u2150\u2190"}, 179 {"IsOgham", "\u1680\u16A0"}, 180 {"IsOpticalCharacterRecognition", "\u2440\u2460"}, 181 {"IsOriya", "\u0B00\u0B80"}, 182 {"IsPhoneticExtensions", "\u1D00\u1D80"}, 183 {"IsPrivateUse", "\uE000\uF900"}, 184 {"IsPrivateUseArea", "\uE000\uF900"}, 185 {"IsRunic", "\u16A0\u1700"}, 186 {"IsSinhala", "\u0D80\u0E00"}, 187 {"IsSmallFormVariants", "\uFE50\uFE70"}, 188 {"IsSpacingModifierLetters", "\u02B0\u0300"}, 189 {"IsSpecials", 
"\uFFF0"}, 190 {"IsSuperscriptsandSubscripts", "\u2070\u20A0"}, 191 {"IsSupplementalArrows-A", "\u27F0\u2800"}, 192 {"IsSupplementalArrows-B", "\u2900\u2980"}, 193 {"IsSupplementalMathematicalOperators", "\u2A00\u2B00"}, 194 {"IsSyriac", "\u0700\u0750"}, 195 {"IsTagalog", "\u1700\u1720"}, 196 {"IsTagbanwa", "\u1760\u1780"}, 197 {"IsTaiLe", "\u1950\u1980"}, 198 {"IsTamil", "\u0B80\u0C00"}, 199 {"IsTelugu", "\u0C00\u0C80"}, 200 {"IsThaana", "\u0780\u07C0"}, 201 {"IsThai", "\u0E00\u0E80"}, 202 {"IsTibetan", "\u0F00\u1000"}, 203 {"IsUnifiedCanadianAboriginalSyllabics","\u1400\u1680"}, 204 {"IsVariationSelectors", "\uFE00\uFE10"}, 205 {"IsYiRadicals", "\uA490\uA4D0"}, 206 {"IsYiSyllables", "\uA000\uA490"}, 207 {"IsYijingHexagramSymbols", "\u4DC0\u4E00"}, 208 {"_xmlC", /* Name Char */ "\u002D\u002F\u0030\u003B\u0041\u005B\u005F\u0060\u0061\u007B\u00B7\u00B8\u00C0\u00D7\u00D8\u00F7\u00F8\u0132\u0134\u013F\u0141\u0149\u014A\u017F\u0180\u01C4\u01CD\u01F1\u01F4\u01F6\u01FA\u0218\u0250\u02A9\u02BB\u02C2\u02D0\u02D2\u0300\u0346\u0360\u0362\u0386\u038B\u038C\u038D\u038E\u03A2\u03A3\u03CF\u03D0\u03D7\u03DA\u03DB\u03DC\u03DD\u03DE\u03DF\u03E0\u03E1\u03E2\u03F4\u0401\u040D\u040E\u0450\u0451\u045D\u045E\u0482\u0483\u0487\u0490\u04C5\u04C7\u04C9\u04CB\u04CD\u04D0\u04EC\u04EE\u04F6\u04F8\u04FA\u0531\u0557\u0559\u055A\u0561\u0587\u0591\u05A2\u05A3\u05BA\u05BB\u05BE\u05BF\u05C0\u05C1\u05C3\u05C4\u05C5\u05D0\u05EB\u05F0\u05F3\u0621\u063B\u0640\u0653\u0660\u066A\u0670\u06B8\u06BA\u06BF\u06C0\u06CF\u06D0\u06D4\u06D5\u06E9\u06EA\u06EE\u06F0\u06FA\u0901\u0904\u0905\u093A\u093C\u094E\u0951\u0955\u0958\u0964\u0966\u0970\u0981\u0984\u0985\u098D\u098F\u0991\u0993\u09A9\u09AA\u09B1\u09B2\u09B3\u09B6\u09BA\u09BC\u09BD\u09BE\u09C5\u09C7\u09C9\u09CB\u09CE\u09D7\u09D8\u09DC" 209 
+"\u09DE\u09DF\u09E4\u09E6\u09F2\u0A02\u0A03\u0A05\u0A0B\u0A0F\u0A11\u0A13\u0A29\u0A2A\u0A31\u0A32\u0A34\u0A35\u0A37\u0A38\u0A3A\u0A3C\u0A3D\u0A3E\u0A43\u0A47\u0A49\u0A4B\u0A4E\u0A59\u0A5D\u0A5E\u0A5F\u0A66\u0A75\u0A81\u0A84\u0A85\u0A8C\u0A8D\u0A8E\u0A8F\u0A92\u0A93\u0AA9\u0AAA\u0AB1\u0AB2\u0AB4\u0AB5\u0ABA\u0ABC\u0AC6\u0AC7\u0ACA\u0ACB\u0ACE\u0AE0\u0AE1\u0AE6\u0AF0\u0B01\u0B04\u0B05\u0B0D\u0B0F\u0B11\u0B13\u0B29\u0B2A\u0B31\u0B32\u0B34\u0B36\u0B3A\u0B3C\u0B44\u0B47\u0B49\u0B4B\u0B4E\u0B56\u0B58\u0B5C\u0B5E\u0B5F\u0B62\u0B66\u0B70\u0B82\u0B84\u0B85\u0B8B\u0B8E\u0B91\u0B92\u0B96\u0B99\u0B9B\u0B9C\u0B9D\u0B9E\u0BA0\u0BA3\u0BA5\u0BA8\u0BAB\u0BAE\u0BB6\u0BB7\u0BBA\u0BBE\u0BC3\u0BC6\u0BC9\u0BCA\u0BCE\u0BD7\u0BD8\u0BE7\u0BF0\u0C01\u0C04\u0C05\u0C0D\u0C0E\u0C11\u0C12\u0C29\u0C2A\u0C34\u0C35\u0C3A\u0C3E\u0C45\u0C46\u0C49\u0C4A\u0C4E\u0C55\u0C57\u0C60\u0C62\u0C66\u0C70\u0C82\u0C84\u0C85\u0C8D\u0C8E\u0C91\u0C92\u0CA9\u0CAA\u0CB4\u0CB5\u0CBA\u0CBE\u0CC5\u0CC6\u0CC9\u0CCA\u0CCE\u0CD5\u0CD7\u0CDE\u0CDF\u0CE0\u0CE2" 210 
+"\u0CE6\u0CF0\u0D02\u0D04\u0D05\u0D0D\u0D0E\u0D11\u0D12\u0D29\u0D2A\u0D3A\u0D3E\u0D44\u0D46\u0D49\u0D4A\u0D4E\u0D57\u0D58\u0D60\u0D62\u0D66\u0D70\u0E01\u0E2F\u0E30\u0E3B\u0E40\u0E4F\u0E50\u0E5A\u0E81\u0E83\u0E84\u0E85\u0E87\u0E89\u0E8A\u0E8B\u0E8D\u0E8E\u0E94\u0E98\u0E99\u0EA0\u0EA1\u0EA4\u0EA5\u0EA6\u0EA7\u0EA8\u0EAA\u0EAC\u0EAD\u0EAF\u0EB0\u0EBA\u0EBB\u0EBE\u0EC0\u0EC5\u0EC6\u0EC7\u0EC8\u0ECE\u0ED0\u0EDA\u0F18\u0F1A\u0F20\u0F2A\u0F35\u0F36\u0F37\u0F38\u0F39\u0F3A\u0F3E\u0F48\u0F49\u0F6A\u0F71\u0F85\u0F86\u0F8C\u0F90\u0F96\u0F97\u0F98\u0F99\u0FAE\u0FB1\u0FB8\u0FB9\u0FBA\u10A0\u10C6\u10D0\u10F7\u1100\u1101\u1102\u1104\u1105\u1108\u1109\u110A\u110B\u110D\u110E\u1113\u113C\u113D\u113E\u113F\u1140\u1141\u114C\u114D\u114E\u114F\u1150\u1151\u1154\u1156\u1159\u115A\u115F\u1162\u1163\u1164\u1165\u1166\u1167\u1168\u1169\u116A\u116D\u116F\u1172\u1174\u1175\u1176\u119E\u119F\u11A8\u11A9\u11AB\u11AC\u11AE\u11B0\u11B7\u11B9\u11BA\u11BB\u11BC\u11C3\u11EB\u11EC\u11F0\u11F1\u11F9\u11FA\u1E00\u1E9C\u1EA0\u1EFA\u1F00" 211 +"\u1F16\u1F18\u1F1E\u1F20\u1F46\u1F48\u1F4E\u1F50\u1F58\u1F59\u1F5A\u1F5B\u1F5C\u1F5D\u1F5E\u1F5F\u1F7E\u1F80\u1FB5\u1FB6\u1FBD\u1FBE\u1FBF\u1FC2\u1FC5\u1FC6\u1FCD\u1FD0\u1FD4\u1FD6\u1FDC\u1FE0\u1FED\u1FF2\u1FF5\u1FF6\u1FFD\u20D0\u20DD\u20E1\u20E2\u2126\u2127\u212A\u212C\u212E\u212F\u2180\u2183\u3005\u3006\u3007\u3008\u3021\u3030\u3031\u3036\u3041\u3095\u3099\u309B\u309D\u309F\u30A1\u30FB\u30FC\u30FF\u3105\u312D\u4E00\u9FA6\uAC00\uD7A4"}, 212 {"_xmlD", "\u0030\u003A\u0660\u066A\u06F0\u06FA\u0966\u0970\u09E6\u09F0\u0A66\u0A70\u0AE6\u0AF0\u0B66\u0B70\u0BE7\u0BF0\u0C66\u0C70\u0CE6\u0CF0\u0D66\u0D70\u0E50\u0E5A\u0ED0\u0EDA\u0F20\u0F2A\u1040\u104A\u1369\u1372\u17E0\u17EA\u1810\u181A\uFF10\uFF1A"}, 213 {"_xmlI", /* Start Name Char */ 
"\u003A\u003B\u0041\u005B\u005F\u0060\u0061\u007B\u00C0\u00D7\u00D8\u00F7\u00F8\u0132\u0134\u013F\u0141\u0149\u014A\u017F\u0180\u01C4\u01CD\u01F1\u01F4\u01F6\u01FA\u0218\u0250\u02A9\u02BB\u02C2\u0386\u0387\u0388\u038B\u038C\u038D\u038E\u03A2\u03A3\u03CF\u03D0\u03D7\u03DA\u03DB\u03DC\u03DD\u03DE\u03DF\u03E0\u03E1\u03E2\u03F4\u0401\u040D\u040E\u0450\u0451\u045D\u045E\u0482\u0490\u04C5\u04C7\u04C9\u04CB\u04CD\u04D0\u04EC\u04EE\u04F6\u04F8\u04FA\u0531\u0557\u0559\u055A\u0561\u0587\u05D0\u05EB\u05F0\u05F3\u0621\u063B\u0641\u064B\u0671\u06B8\u06BA\u06BF\u06C0\u06CF\u06D0\u06D4\u06D5\u06D6\u06E5\u06E7\u0905\u093A\u093D\u093E\u0958\u0962\u0985\u098D\u098F\u0991\u0993\u09A9\u09AA\u09B1\u09B2\u09B3\u09B6\u09BA\u09DC\u09DE\u09DF\u09E2\u09F0\u09F2\u0A05\u0A0B\u0A0F\u0A11\u0A13\u0A29\u0A2A\u0A31\u0A32\u0A34\u0A35\u0A37\u0A38\u0A3A\u0A59\u0A5D\u0A5E\u0A5F\u0A72\u0A75\u0A85\u0A8C\u0A8D\u0A8E\u0A8F\u0A92\u0A93\u0AA9\u0AAA\u0AB1\u0AB2\u0AB4\u0AB5\u0ABA\u0ABD\u0ABE\u0AE0\u0AE1\u0B05\u0B0D\u0B0F" 214 +"\u0B11\u0B13\u0B29\u0B2A\u0B31\u0B32\u0B34\u0B36\u0B3A\u0B3D\u0B3E\u0B5C\u0B5E\u0B5F\u0B62\u0B85\u0B8B\u0B8E\u0B91\u0B92\u0B96\u0B99\u0B9B\u0B9C\u0B9D\u0B9E\u0BA0\u0BA3\u0BA5\u0BA8\u0BAB\u0BAE\u0BB6\u0BB7\u0BBA\u0C05\u0C0D\u0C0E\u0C11\u0C12\u0C29\u0C2A\u0C34\u0C35\u0C3A\u0C60\u0C62\u0C85\u0C8D\u0C8E\u0C91\u0C92\u0CA9\u0CAA\u0CB4\u0CB5\u0CBA\u0CDE\u0CDF\u0CE0\u0CE2\u0D05\u0D0D\u0D0E\u0D11\u0D12\u0D29\u0D2A\u0D3A\u0D60\u0D62\u0E01\u0E2F\u0E30\u0E31\u0E32\u0E34\u0E40\u0E46\u0E81\u0E83\u0E84\u0E85\u0E87\u0E89\u0E8A\u0E8B\u0E8D\u0E8E\u0E94\u0E98\u0E99\u0EA0\u0EA1\u0EA4\u0EA5\u0EA6\u0EA7\u0EA8\u0EAA\u0EAC\u0EAD\u0EAF\u0EB0\u0EB1\u0EB2\u0EB4\u0EBD\u0EBE\u0EC0\u0EC5\u0F40\u0F48\u0F49\u0F6A\u10A0\u10C6\u10D0\u10F7\u1100\u1101\u1102\u1104\u1105\u1108\u1109\u110A\u110B\u110D\u110E\u1113\u113C\u113D\u113E\u113F\u1140\u1141\u114C\u114D\u114E\u114F\u1150\u1151\u1154\u1156\u1159\u115A\u115F\u1162\u1163\u1164\u1165\u1166\u1167\u1168\u1169\u116A\u116D\u116F\u1172\u1174\u1175\u1176\u119E\u119F\u11A8\u11A
9\u11AB\u11AC" 215 +"\u11AE\u11B0\u11B7\u11B9\u11BA\u11BB\u11BC\u11C3\u11EB\u11EC\u11F0\u11F1\u11F9\u11FA\u1E00\u1E9C\u1EA0\u1EFA\u1F00\u1F16\u1F18\u1F1E\u1F20\u1F46\u1F48\u1F4E\u1F50\u1F58\u1F59\u1F5A\u1F5B\u1F5C\u1F5D\u1F5E\u1F5F\u1F7E\u1F80\u1FB5\u1FB6\u1FBD\u1FBE\u1FBF\u1FC2\u1FC5\u1FC6\u1FCD\u1FD0\u1FD4\u1FD6\u1FDC\u1FE0\u1FED\u1FF2\u1FF5\u1FF6\u1FFD\u2126\u2127\u212A\u212C\u212E\u212F\u2180\u2183\u3007\u3008\u3021\u302A\u3041\u3095\u30A1\u30FB\u3105\u312D\u4E00\u9FA6\uAC00\uD7A4"}, 216 {"_xmlW", "\u0024\u0025\u002B\u002C\u0030\u003A\u003C\u003F\u0041\u005B\u005E\u005F\u0060\u007B\u007C\u007D\u007E\u007F\u00A2\u00AB\u00AC\u00AD\u00AE\u00B7\u00B8\u00BB\u00BC\u00BF\u00C0\u0221\u0222\u0234\u0250\u02AE\u02B0\u02EF\u0300\u0350\u0360\u0370\u0374\u0376\u037A\u037B\u0384\u0387\u0388\u038B\u038C\u038D\u038E\u03A2\u03A3\u03CF\u03D0\u03F7\u0400\u0487\u0488\u04CF\u04D0\u04F6\u04F8\u04FA\u0500\u0510\u0531\u0557\u0559\u055A\u0561\u0588\u0591\u05A2\u05A3\u05BA\u05BB\u05BE\u05BF\u05C0\u05C1\u05C3\u05C4\u05C5\u05D0\u05EB\u05F0\u05F3\u0621\u063B\u0640\u0656\u0660\u066A\u066E\u06D4\u06D5\u06DD\u06DE\u06EE\u06F0\u06FF\u0710\u072D\u0730\u074B\u0780\u07B2\u0901\u0904\u0905\u093A\u093C\u094E\u0950\u0955\u0958\u0964\u0966\u0970\u0981\u0984\u0985\u098D\u098F\u0991\u0993\u09A9\u09AA\u09B1\u09B2\u09B3\u09B6\u09BA\u09BC\u09BD\u09BE\u09C5\u09C7\u09C9\u09CB\u09CE\u09D7\u09D8\u09DC\u09DE\u09DF\u09E4\u09E6\u09FB\u0A02\u0A03\u0A05\u0A0B\u0A0F\u0A11\u0A13\u0A29\u0A2A\u0A31\u0A32\u0A34\u0A35" 217 
+"\u0A37\u0A38\u0A3A\u0A3C\u0A3D\u0A3E\u0A43\u0A47\u0A49\u0A4B\u0A4E\u0A59\u0A5D\u0A5E\u0A5F\u0A66\u0A75\u0A81\u0A84\u0A85\u0A8C\u0A8D\u0A8E\u0A8F\u0A92\u0A93\u0AA9\u0AAA\u0AB1\u0AB2\u0AB4\u0AB5\u0ABA\u0ABC\u0AC6\u0AC7\u0ACA\u0ACB\u0ACE\u0AD0\u0AD1\u0AE0\u0AE1\u0AE6\u0AF0\u0B01\u0B04\u0B05\u0B0D\u0B0F\u0B11\u0B13\u0B29\u0B2A\u0B31\u0B32\u0B34\u0B36\u0B3A\u0B3C\u0B44\u0B47\u0B49\u0B4B\u0B4E\u0B56\u0B58\u0B5C\u0B5E\u0B5F\u0B62\u0B66\u0B71\u0B82\u0B84\u0B85\u0B8B\u0B8E\u0B91\u0B92\u0B96\u0B99\u0B9B\u0B9C\u0B9D\u0B9E\u0BA0\u0BA3\u0BA5\u0BA8\u0BAB\u0BAE\u0BB6\u0BB7\u0BBA\u0BBE\u0BC3\u0BC6\u0BC9\u0BCA\u0BCE\u0BD7\u0BD8\u0BE7\u0BF3\u0C01\u0C04\u0C05\u0C0D\u0C0E\u0C11\u0C12\u0C29\u0C2A\u0C34\u0C35\u0C3A\u0C3E\u0C45\u0C46\u0C49\u0C4A\u0C4E\u0C55\u0C57\u0C60\u0C62\u0C66\u0C70\u0C82\u0C84\u0C85\u0C8D\u0C8E\u0C91\u0C92\u0CA9\u0CAA\u0CB4\u0CB5\u0CBA\u0CBE\u0CC5\u0CC6\u0CC9\u0CCA\u0CCE\u0CD5\u0CD7\u0CDE\u0CDF\u0CE0\u0CE2\u0CE6\u0CF0\u0D02\u0D04\u0D05\u0D0D\u0D0E\u0D11\u0D12\u0D29\u0D2A\u0D3A\u0D3E\u0D44\u0D46\u0D49" 218 
+"\u0D4A\u0D4E\u0D57\u0D58\u0D60\u0D62\u0D66\u0D70\u0D82\u0D84\u0D85\u0D97\u0D9A\u0DB2\u0DB3\u0DBC\u0DBD\u0DBE\u0DC0\u0DC7\u0DCA\u0DCB\u0DCF\u0DD5\u0DD6\u0DD7\u0DD8\u0DE0\u0DF2\u0DF4\u0E01\u0E3B\u0E3F\u0E4F\u0E50\u0E5A\u0E81\u0E83\u0E84\u0E85\u0E87\u0E89\u0E8A\u0E8B\u0E8D\u0E8E\u0E94\u0E98\u0E99\u0EA0\u0EA1\u0EA4\u0EA5\u0EA6\u0EA7\u0EA8\u0EAA\u0EAC\u0EAD\u0EBA\u0EBB\u0EBE\u0EC0\u0EC5\u0EC6\u0EC7\u0EC8\u0ECE\u0ED0\u0EDA\u0EDC\u0EDE\u0F00\u0F04\u0F13\u0F3A\u0F3E\u0F48\u0F49\u0F6B\u0F71\u0F85\u0F86\u0F8C\u0F90\u0F98\u0F99\u0FBD\u0FBE\u0FCD\u0FCF\u0FD0\u1000\u1022\u1023\u1028\u1029\u102B\u102C\u1033\u1036\u103A\u1040\u104A\u1050\u105A\u10A0\u10C6\u10D0\u10F9\u1100\u115A\u115F\u11A3\u11A8\u11FA\u1200\u1207\u1208\u1247\u1248\u1249\u124A\u124E\u1250\u1257\u1258\u1259\u125A\u125E\u1260\u1287\u1288\u1289\u128A\u128E\u1290\u12AF\u12B0\u12B1\u12B2\u12B6\u12B8\u12BF\u12C0\u12C1\u12C2\u12C6\u12C8\u12CF\u12D0\u12D7\u12D8\u12EF\u12F0\u130F\u1310\u1311\u1312\u1316\u1318\u131F\u1320\u1347\u1348\u135B\u1369\u137D\u13A0" 219 
+"\u13F5\u1401\u166D\u166F\u1677\u1681\u169B\u16A0\u16EB\u16EE\u16F1\u1700\u170D\u170E\u1715\u1720\u1735\u1740\u1754\u1760\u176D\u176E\u1771\u1772\u1774\u1780\u17D4\u17D7\u17D8\u17DB\u17DD\u17E0\u17EA\u180B\u180E\u1810\u181A\u1820\u1878\u1880\u18AA\u1E00\u1E9C\u1EA0\u1EFA\u1F00\u1F16\u1F18\u1F1E\u1F20\u1F46\u1F48\u1F4E\u1F50\u1F58\u1F59\u1F5A\u1F5B\u1F5C\u1F5D\u1F5E\u1F5F\u1F7E\u1F80\u1FB5\u1FB6\u1FC5\u1FC6\u1FD4\u1FD6\u1FDC\u1FDD\u1FF0\u1FF2\u1FF5\u1FF6\u1FFF\u2044\u2045\u2052\u2053\u2070\u2072\u2074\u207D\u207F\u208D\u20A0\u20B2\u20D0\u20EB\u2100\u213B\u213D\u214C\u2153\u2184\u2190\u2329\u232B\u23B4\u23B7\u23CF\u2400\u2427\u2440\u244B\u2460\u24FF\u2500\u2614\u2616\u2618\u2619\u267E\u2680\u268A\u2701\u2705\u2706\u270A\u270C\u2728\u2729\u274C\u274D\u274E\u274F\u2753\u2756\u2757\u2758\u275F\u2761\u2768\u2776\u2795\u2798\u27B0\u27B1\u27BF\u27D0\u27E6\u27F0\u2983\u2999\u29D8\u29DC\u29FC\u29FE\u2B00\u2E80\u2E9A\u2E9B\u2EF4\u2F00\u2FD6\u2FF0\u2FFC\u3004\u3008\u3012\u3014\u3020\u3030\u3031\u303D\u303E\u3040" 220 +"\u3041\u3097\u3099\u30A0\u30A1\u30FB\u30FC\u3100\u3105\u312D\u3131\u318F\u3190\u31B8\u31F0\u321D\u3220\u3244\u3251\u327C\u327F\u32CC\u32D0\u32FF\u3300\u3377\u337B\u33DE\u33E0\u33FF\u3400\u4DB6\u4E00\u9FA6\uA000\uA48D\uA490\uA4C7\uAC00\uD7A4\uF900\uFA2E\uFA30\uFA6B\uFB00\uFB07\uFB13\uFB18\uFB1D\uFB37\uFB38\uFB3D\uFB3E\uFB3F\uFB40\uFB42\uFB43\uFB45\uFB46\uFBB2\uFBD3\uFD3E\uFD50\uFD90\uFD92\uFDC8\uFDF0\uFDFD\uFE00\uFE10\uFE20\uFE24\uFE62\uFE63\uFE64\uFE67\uFE69\uFE6A\uFE70\uFE75\uFE76\uFEFD\uFF04\uFF05\uFF0B\uFF0C\uFF10\uFF1A\uFF1C\uFF1F\uFF21\uFF3B\uFF3E\uFF3F\uFF40\uFF5B\uFF5C\uFF5D\uFF5E\uFF5F\uFF66\uFFBF\uFFC2\uFFC8\uFFCA\uFFD0\uFFD2\uFFD8\uFFDA\uFFDD\uFFE0\uFFE7\uFFE8\uFFEF\uFFFC\uFFFE"}, 221 }; 222 223 224 /************************************************************************** 225 Let U be the set of Unicode character values and let L be the lowercase 226 function, mapping from U to U. 
To perform case insensitive matching of 227 character sets, we need to be able to map an interval I in U, say 228 229 I = [chMin, chMax] = { ch : chMin <= ch <= chMax } 230 231 to a set A such that A contains L(I) and A is contained in the union of 232 I and L(I). 233 234 The table below partitions U into intervals on which L is non-decreasing. 235 Thus, for any interval J = [a, b] contained in one of these intervals, 236 L(J) is contained in [L(a), L(b)]. 237 238 It is also true that for any such J, [L(a), L(b)] is contained in the 239 union of J and L(J). This does not follow from L being non-decreasing on 240 these intervals. It follows from the nature of the L on each interval. 241 On each interval, L has one of the following forms: 242 243 (1) L(ch) = constant (LowercaseSet) 244 (2) L(ch) = ch + offset (LowercaseAdd) 245 (3) L(ch) = ch | 1 (LowercaseBor) 246 (4) L(ch) = ch + (ch & 1) (LowercaseBad) 247 248 It is easy to verify that for any of these forms [L(a), L(b)] is 249 contained in the union of [a, b] and L([a, b]). 250 ***************************************************************************/ 251 252 private const int LowercaseSet = 0; // Set to arg. 253 private const int LowercaseAdd = 1; // Add arg. 254 private const int LowercaseBor = 2; // Bitwise or with 1. 255 private const int LowercaseBad = 3; // Bitwise and with 1 and add original. 
256 257 private static readonly LowerCaseMapping[] _lcTable = new LowerCaseMapping[] 258 { 259 new LowerCaseMapping('\u0041', '\u005A', LowercaseAdd, 32), 260 new LowerCaseMapping('\u00C0', '\u00DE', LowercaseAdd, 32), 261 new LowerCaseMapping('\u0100', '\u012E', LowercaseBor, 0), 262 new LowerCaseMapping('\u0130', '\u0130', LowercaseSet, 0x0069), 263 new LowerCaseMapping('\u0132', '\u0136', LowercaseBor, 0), 264 new LowerCaseMapping('\u0139', '\u0147', LowercaseBad, 0), 265 new LowerCaseMapping('\u014A', '\u0176', LowercaseBor, 0), 266 new LowerCaseMapping('\u0178', '\u0178', LowercaseSet, 0x00FF), 267 new LowerCaseMapping('\u0179', '\u017D', LowercaseBad, 0), 268 new LowerCaseMapping('\u0181', '\u0181', LowercaseSet, 0x0253), 269 new LowerCaseMapping('\u0182', '\u0184', LowercaseBor, 0), 270 new LowerCaseMapping('\u0186', '\u0186', LowercaseSet, 0x0254), 271 new LowerCaseMapping('\u0187', '\u0187', LowercaseSet, 0x0188), 272 new LowerCaseMapping('\u0189', '\u018A', LowercaseAdd, 205), 273 new LowerCaseMapping('\u018B', '\u018B', LowercaseSet, 0x018C), 274 new LowerCaseMapping('\u018E', '\u018E', LowercaseSet, 0x01DD), 275 new LowerCaseMapping('\u018F', '\u018F', LowercaseSet, 0x0259), 276 new LowerCaseMapping('\u0190', '\u0190', LowercaseSet, 0x025B), 277 new LowerCaseMapping('\u0191', '\u0191', LowercaseSet, 0x0192), 278 new LowerCaseMapping('\u0193', '\u0193', LowercaseSet, 0x0260), 279 new LowerCaseMapping('\u0194', '\u0194', LowercaseSet, 0x0263), 280 new LowerCaseMapping('\u0196', '\u0196', LowercaseSet, 0x0269), 281 new LowerCaseMapping('\u0197', '\u0197', LowercaseSet, 0x0268), 282 new LowerCaseMapping('\u0198', '\u0198', LowercaseSet, 0x0199), 283 new LowerCaseMapping('\u019C', '\u019C', LowercaseSet, 0x026F), 284 new LowerCaseMapping('\u019D', '\u019D', LowercaseSet, 0x0272), 285 new LowerCaseMapping('\u019F', '\u019F', LowercaseSet, 0x0275), 286 new LowerCaseMapping('\u01A0', '\u01A4', LowercaseBor, 0), 287 new LowerCaseMapping('\u01A7', '\u01A7', 
LowercaseSet, 0x01A8), 288 new LowerCaseMapping('\u01A9', '\u01A9', LowercaseSet, 0x0283), 289 new LowerCaseMapping('\u01AC', '\u01AC', LowercaseSet, 0x01AD), 290 new LowerCaseMapping('\u01AE', '\u01AE', LowercaseSet, 0x0288), 291 new LowerCaseMapping('\u01AF', '\u01AF', LowercaseSet, 0x01B0), 292 new LowerCaseMapping('\u01B1', '\u01B2', LowercaseAdd, 217), 293 new LowerCaseMapping('\u01B3', '\u01B5', LowercaseBad, 0), 294 new LowerCaseMapping('\u01B7', '\u01B7', LowercaseSet, 0x0292), 295 new LowerCaseMapping('\u01B8', '\u01B8', LowercaseSet, 0x01B9), 296 new LowerCaseMapping('\u01BC', '\u01BC', LowercaseSet, 0x01BD), 297 new LowerCaseMapping('\u01C4', '\u01C5', LowercaseSet, 0x01C6), 298 new LowerCaseMapping('\u01C7', '\u01C8', LowercaseSet, 0x01C9), 299 new LowerCaseMapping('\u01CA', '\u01CB', LowercaseSet, 0x01CC), 300 new LowerCaseMapping('\u01CD', '\u01DB', LowercaseBad, 0), 301 new LowerCaseMapping('\u01DE', '\u01EE', LowercaseBor, 0), 302 new LowerCaseMapping('\u01F1', '\u01F2', LowercaseSet, 0x01F3), 303 new LowerCaseMapping('\u01F4', '\u01F4', LowercaseSet, 0x01F5), 304 new LowerCaseMapping('\u01FA', '\u0216', LowercaseBor, 0), 305 new LowerCaseMapping('\u0386', '\u0386', LowercaseSet, 0x03AC), 306 new LowerCaseMapping('\u0388', '\u038A', LowercaseAdd, 37), 307 new LowerCaseMapping('\u038C', '\u038C', LowercaseSet, 0x03CC), 308 new LowerCaseMapping('\u038E', '\u038F', LowercaseAdd, 63), 309 new LowerCaseMapping('\u0391', '\u03AB', LowercaseAdd, 32), 310 new LowerCaseMapping('\u03E2', '\u03EE', LowercaseBor, 0), 311 new LowerCaseMapping('\u0401', '\u040F', LowercaseAdd, 80), 312 new LowerCaseMapping('\u0410', '\u042F', LowercaseAdd, 32), 313 new LowerCaseMapping('\u0460', '\u0480', LowercaseBor, 0), 314 new LowerCaseMapping('\u0490', '\u04BE', LowercaseBor, 0), 315 new LowerCaseMapping('\u04C1', '\u04C3', LowercaseBad, 0), 316 new LowerCaseMapping('\u04C7', '\u04C7', LowercaseSet, 0x04C8), 317 new LowerCaseMapping('\u04CB', '\u04CB', LowercaseSet, 0x04CC), 
318 new LowerCaseMapping('\u04D0', '\u04EA', LowercaseBor, 0), 319 new LowerCaseMapping('\u04EE', '\u04F4', LowercaseBor, 0), 320 new LowerCaseMapping('\u04F8', '\u04F8', LowercaseSet, 0x04F9), 321 new LowerCaseMapping('\u0531', '\u0556', LowercaseAdd, 48), 322 new LowerCaseMapping('\u10A0', '\u10C5', LowercaseAdd, 48), 323 new LowerCaseMapping('\u1E00', '\u1EF8', LowercaseBor, 0), 324 new LowerCaseMapping('\u1F08', '\u1F0F', LowercaseAdd, -8), 325 new LowerCaseMapping('\u1F18', '\u1F1F', LowercaseAdd, -8), 326 new LowerCaseMapping('\u1F28', '\u1F2F', LowercaseAdd, -8), 327 new LowerCaseMapping('\u1F38', '\u1F3F', LowercaseAdd, -8), 328 new LowerCaseMapping('\u1F48', '\u1F4D', LowercaseAdd, -8), 329 new LowerCaseMapping('\u1F59', '\u1F59', LowercaseSet, 0x1F51), 330 new LowerCaseMapping('\u1F5B', '\u1F5B', LowercaseSet, 0x1F53), 331 new LowerCaseMapping('\u1F5D', '\u1F5D', LowercaseSet, 0x1F55), 332 new LowerCaseMapping('\u1F5F', '\u1F5F', LowercaseSet, 0x1F57), 333 new LowerCaseMapping('\u1F68', '\u1F6F', LowercaseAdd, -8), 334 new LowerCaseMapping('\u1F88', '\u1F8F', LowercaseAdd, -8), 335 new LowerCaseMapping('\u1F98', '\u1F9F', LowercaseAdd, -8), 336 new LowerCaseMapping('\u1FA8', '\u1FAF', LowercaseAdd, -8), 337 new LowerCaseMapping('\u1FB8', '\u1FB9', LowercaseAdd, -8), 338 new LowerCaseMapping('\u1FBA', '\u1FBB', LowercaseAdd, -74), 339 new LowerCaseMapping('\u1FBC', '\u1FBC', LowercaseSet, 0x1FB3), 340 new LowerCaseMapping('\u1FC8', '\u1FCB', LowercaseAdd, -86), 341 new LowerCaseMapping('\u1FCC', '\u1FCC', LowercaseSet, 0x1FC3), 342 new LowerCaseMapping('\u1FD8', '\u1FD9', LowercaseAdd, -8), 343 new LowerCaseMapping('\u1FDA', '\u1FDB', LowercaseAdd, -100), 344 new LowerCaseMapping('\u1FE8', '\u1FE9', LowercaseAdd, -8), 345 new LowerCaseMapping('\u1FEA', '\u1FEB', LowercaseAdd, -112), 346 new LowerCaseMapping('\u1FEC', '\u1FEC', LowercaseSet, 0x1FE5), 347 new LowerCaseMapping('\u1FF8', '\u1FF9', LowercaseAdd, -128), 348 new LowerCaseMapping('\u1FFA', 
'\u1FFB', LowercaseAdd, -126), 349 new LowerCaseMapping('\u1FFC', '\u1FFC', LowercaseSet, 0x1FF3), 350 new LowerCaseMapping('\u2160', '\u216F', LowercaseAdd, 16), 351 new LowerCaseMapping('\u24B6', '\u24D0', LowercaseAdd, 26), 352 new LowerCaseMapping('\uFF21', '\uFF3A', LowercaseAdd, 32), 353 }; 354 RegexCharClass()355 static RegexCharClass() { 356 // addressing Dictionary versus Hashtable thread safety difference by using 357 // a temp Dictionary. Note that this is just a theoretical concern since this 358 // is a static ctor and getter methods aren't called until after this is 359 // done; this is just to avoid the long-term possibility of thread safety 360 // problems. 361 Dictionary<String, String> tempCategories = new Dictionary<String, String>(32); 362 363 char[] groups = new char[9]; 364 StringBuilder word = new StringBuilder(11); 365 366 word.Append(GroupChar); 367 groups[0] = GroupChar; 368 369 // We need the UnicodeCategory enum values as a char so we can put them in a string 370 // in the hashtable. In order to get there, we first must cast to an int, 371 // then cast to a char 372 // Also need to distinguish between positive and negative values. 
UnicodeCategory is zero 373 // based, so we add one to each value and subtract it off later 374 375 // Others 376 groups[1] = (char) ((int) UnicodeCategory.Control + 1); 377 tempCategories["Cc"] = groups[1].ToString(); // Control 378 groups[2] = (char) ((int) UnicodeCategory.Format + 1); 379 tempCategories["Cf"] = groups[2].ToString(); // Format 380 groups[3] = (char) ((int) UnicodeCategory.OtherNotAssigned + 1); 381 tempCategories["Cn"] = groups[3].ToString(); // Not assigned 382 groups[4] = (char) ((int) UnicodeCategory.PrivateUse + 1); 383 tempCategories["Co"] = groups[4].ToString(); // Private use 384 groups[5] = (char) ((int) UnicodeCategory.Surrogate + 1); 385 tempCategories["Cs"] = groups[5].ToString(); // Surrogate 386 387 groups[6] = GroupChar; 388 tempCategories["C"] = new String(groups, 0, 7); 389 390 // Letters 391 groups[1] = (char) ((int) UnicodeCategory.LowercaseLetter + 1); 392 tempCategories["Ll"] = groups[1].ToString(); // Lowercase 393 groups[2] = (char) ((int) UnicodeCategory.ModifierLetter + 1); 394 tempCategories["Lm"] = groups[2].ToString(); // Modifier 395 groups[3] = (char) ((int) UnicodeCategory.OtherLetter + 1); 396 tempCategories["Lo"] = groups[3].ToString(); // Other 397 groups[4] = (char) ((int) UnicodeCategory.TitlecaseLetter + 1); 398 tempCategories["Lt"] = groups[4].ToString(); // Titlecase 399 groups[5] = (char) ((int) UnicodeCategory.UppercaseLetter + 1); 400 tempCategories["Lu"] = groups[5].ToString(); // Uppercase 401 402 //groups[6] = GroupChar; 403 tempCategories["L"] = new String(groups, 0, 7); 404 word.Append(new String(groups, 1, 5)); 405 406 // InternalRegexIgnoreCase = {LowercaseLetter} OR {TitlecaseLetter} OR {UppercaseLetter} 407 // !!!This category should only ever be used in conjunction with RegexOptions.IgnoreCase code paths!!! 
408 tempCategories[InternalRegexIgnoreCase] = String.Format(CultureInfo.InvariantCulture, "{0}{1}{2}{3}{4}", GroupChar, groups[1], groups[4], groups[5], groups[6]); 409 410 // Marks 411 groups[1] = (char) ((int) UnicodeCategory.SpacingCombiningMark + 1); 412 tempCategories["Mc"] = groups[1].ToString(); // Spacing combining 413 groups[2] = (char) ((int) UnicodeCategory.EnclosingMark + 1); 414 tempCategories["Me"] = groups[2].ToString(); // Enclosing 415 groups[3] = (char) ((int) UnicodeCategory.NonSpacingMark + 1); 416 tempCategories["Mn"] = groups[3].ToString(); // Non-spacing 417 418 groups[4] = GroupChar; 419 tempCategories["M"] = new String(groups, 0, 5); 420 //word.Append(groups[1]); 421 word.Append(groups[3]); 422 423 // Numbers 424 groups[1] = (char) ((int) UnicodeCategory.DecimalDigitNumber + 1); 425 tempCategories["Nd"] = groups[1].ToString(); // Decimal digit 426 groups[2] = (char) ((int) UnicodeCategory.LetterNumber + 1); 427 tempCategories["Nl"] = groups[2].ToString(); // Letter 428 groups[3] = (char) ((int) UnicodeCategory.OtherNumber + 1); 429 tempCategories["No"] = groups[3].ToString(); // Other 430 431 //groups[4] = GroupChar; 432 tempCategories["N"] = new String(groups, 0, 5); 433 word.Append(groups[1]); 434 //word.Append(new String(groups, 1, 3)); 435 436 // Punctuation 437 groups[1] = (char) ((int) UnicodeCategory.ConnectorPunctuation + 1); 438 tempCategories["Pc"] = groups[1].ToString(); // Connector 439 groups[2] = (char) ((int) UnicodeCategory.DashPunctuation + 1); 440 tempCategories["Pd"] = groups[2].ToString(); // Dash 441 groups[3] = (char) ((int) UnicodeCategory.ClosePunctuation + 1); 442 tempCategories["Pe"] = groups[3].ToString(); // Close 443 groups[4] = (char) ((int) UnicodeCategory.OtherPunctuation + 1); 444 tempCategories["Po"] = groups[4].ToString(); // Other 445 groups[5] = (char) ((int) UnicodeCategory.OpenPunctuation + 1); 446 tempCategories["Ps"] = groups[5].ToString(); // Open 447 groups[6] = (char) ((int) 
UnicodeCategory.FinalQuotePunctuation + 1); 448 tempCategories["Pf"] = groups[6].ToString(); // Inital quote 449 groups[7] = (char) ((int) UnicodeCategory.InitialQuotePunctuation + 1); 450 tempCategories["Pi"] = groups[7].ToString(); // Final quote 451 452 groups[8] = GroupChar; 453 tempCategories["P"] = new String(groups, 0, 9); 454 word.Append(groups[1]); 455 456 // Symbols 457 groups[1] = (char) ((int) UnicodeCategory.CurrencySymbol + 1); 458 tempCategories["Sc"] = groups[1].ToString(); // Currency 459 groups[2] = (char) ((int) UnicodeCategory.ModifierSymbol + 1); 460 tempCategories["Sk"] = groups[2].ToString(); // Modifier 461 groups[3] = (char) ((int) UnicodeCategory.MathSymbol + 1); 462 tempCategories["Sm"] = groups[3].ToString(); // Math 463 groups[4] = (char) ((int) UnicodeCategory.OtherSymbol + 1); 464 tempCategories["So"] = groups[4].ToString(); // Other 465 466 groups[5] = GroupChar; 467 tempCategories["S"] = new String(groups, 0, 6); 468 469 // Separators 470 groups[1] = (char) ((int) UnicodeCategory.LineSeparator + 1); 471 tempCategories["Zl"] = groups[1].ToString(); // Line 472 groups[2] = (char) ((int) UnicodeCategory.ParagraphSeparator + 1); 473 tempCategories["Zp"] = groups[2].ToString(); // Paragraph 474 groups[3] = (char) ((int) UnicodeCategory.SpaceSeparator + 1); 475 tempCategories["Zs"] = groups[3].ToString(); // Space 476 477 groups[4] = GroupChar; 478 tempCategories["Z"] = new String(groups, 0, 5); 479 480 481 word.Append(GroupChar); 482 Word = word.ToString(); 483 NotWord = NegateCategory(Word); 484 485 486 SpaceClass = "\x00\x00\x01" + Space; 487 NotSpaceClass = "\x01\x00\x01" + Space; 488 WordClass = "\x00\x00" + (char) Word.Length + Word; 489 NotWordClass = "\x01\x00" + (char) Word.Length + Word;; 490 DigitClass = "\x00\x00\x01" + (char) ((int) UnicodeCategory.DecimalDigitNumber + 1); 491 NotDigitClass = "\x00\x00\x01" + unchecked ((char) (- ((int) UnicodeCategory.DecimalDigitNumber + 1)) ); 492 493 #if DBG 494 // make sure the 
// _propTable is correctly ordered
    int len = _propTable.GetLength(0);
    for (int i=0; i<len-1; i++)
        Debug.Assert(String.Compare(_propTable[i,0], _propTable[i+1,0], StringComparison.Ordinal) < 0, "RegexCharClass _propTable is out of order at (" + _propTable[i,0] +", " + _propTable[i+1,0] + ")");
#endif

    _definedCategories = tempCategories;
}

/*
 * RegexCharClass()
 *
 * Builds a class that holds no ranges and no categories yet.
 * An empty range list is trivially canonical.
 */
internal RegexCharClass() {
    _canonical = true;
    _categories = new StringBuilder();
    _rangelist = new List<SingleRange>(6);
}

/*
 * RegexCharClass(negate, ranges, categories, subtraction)
 *
 * Assembles a class from parts that were produced elsewhere. The range
 * list and category builder are stored as given (not copied) and the
 * result is flagged canonical, so the ranges will not be re-sorted.
 */
private RegexCharClass(bool negate, List<SingleRange> ranges, StringBuilder categories, RegexCharClass subtraction) {
    _negate = negate;
    _rangelist = ranges;
    _categories = categories;
    _subtractor = subtraction;
    _canonical = true;
}

/*
 * CanMerge
 *
 * True only for a class that is neither negated nor carries a subtraction.
 */
internal bool CanMerge {
    get {
        if (_negate)
            return false;

        return _subtractor == null;
    }
}

/*
 * Negate
 *
 * Write-only switch for the "negate" flag.
 */
internal bool Negate {
    set {
        _negate = value;
    }
}

/*
 * AddChar()
 *
 * Adds one character, expressed as the range [c, c].
 */
internal void AddChar(char c) {
    AddRange(c, c);
}

/*
 * AddCharClass()
 *
 * Appends every range and every category of cc to this class.
 * Both classes are expected to satisfy CanMerge (checked by assertion only).
 */
internal void AddCharClass(RegexCharClass cc) {
    Debug.Assert(cc.CanMerge && this.CanMerge, "Both character classes added together must be able to merge" );

    if (!cc._canonical) {
        // a non-canonical source makes the combined list non-canonical too
        _canonical = false;
    }
    else if (_canonical && RangeCount() > 0 && cc.RangeCount() > 0) {
        // the combined list stays ordered only when cc starts after our current tail
        char ownTail = GetRangeAt(RangeCount() - 1)._last;
        if (cc.GetRangeAt(0)._first <= ownTail)
            _canonical = false;
    }

    int k = 0;
    while (k < cc.RangeCount()) {
        _rangelist.Add(cc.GetRangeAt(k));
        k++;
    }

    _categories.Append(cc._categories.ToString());
}

/*
 * AddSet()
 *
 * Adds the ranges encoded in a set string. The string is read as pairs
 * (first, one-past-last); a trailing unpaired char opens a range that
 * runs up to Lastchar.
 */
private void AddSet(String set) {
    if (_canonical && RangeCount() > 0 && set.Length > 0) {
        char ownTail = GetRangeAt(RangeCount() - 1)._last;
        if (set[0] <= ownTail)
            _canonical = false;
    }

    int pos = 0;
    while (pos < set.Length - 1) {
        char lower = set[pos];
        char upper = (char)(set[pos + 1] - 1);
        _rangelist.Add(new SingleRange(lower, upper));
        pos += 2;
    }

    // odd length: the last char starts an open-ended range
    if (pos < set.Length)
        _rangelist.Add(new SingleRange(set[pos], Lastchar));
}

/*
 * AddSubtraction()
 *
 * Records the class to subtract from this one. Only one subtraction is
 * expected per class (checked by assertion only).
 */
internal void AddSubtraction(RegexCharClass sub) {
    Debug.Assert(_subtractor == null, "Can't add two subtractions to a char class. ");
    _subtractor = sub;
}

/*
 * AddRange()
 *
 * Adds a single range of characters to the class.
*/
internal void AddRange(char first, char last) {
    // The canonical test has to look at the range that ends the list BEFORE the
    // new one is appended. Testing after the append compares the new range with
    // itself, which clears _canonical on every well-formed call and forces a
    // needless sort/merge in ToStringClass().
    //
    // Canonicalize() merges ranges that overlap OR abut, so a new range that starts
    // at or before (previous last + 1) means the list is no longer canonical.
    // The "+ 1" is evaluated as int, so it cannot wrap at Lastchar.
    if (_canonical && _rangelist.Count > 0 &&
        first <= _rangelist[_rangelist.Count - 1]._last + 1) {
        _canonical = false;
    }

    _rangelist.Add(new SingleRange(first, last));
}

/*
 * AddCategoryFromName()
 *
 * Adds a category or named property to the class.
 *
 *   categoryName     name looked up in _definedCategories
 *   invert           when true the negated form is added
 *   caseInsensitive  when true, "Ll", "Lu" and "Lt" are replaced by the
 *                    InternalRegexIgnoreCase group so all three match
 *   pattern          forwarded to SetFromProperty
 *
 * A name that is not in _definedCategories (or that is the internal
 * ignore-case name itself) is handed to SetFromProperty and the
 * resulting set string is added as ranges.
 */
internal void AddCategoryFromName(string categoryName, bool invert, bool caseInsensitive, string pattern) {

    String cat;
    _definedCategories.TryGetValue(categoryName, out cat);

    if (cat == null || categoryName.Equals(InternalRegexIgnoreCase)) {
        AddSet(SetFromProperty(categoryName, invert, pattern));
        return;
    }

    string catstr = cat;

    if (caseInsensitive) {
        if (categoryName.Equals("Ll") || categoryName.Equals("Lu") || categoryName.Equals("Lt")) {
            // when RegexOptions.IgnoreCase is specified then {Ll}, {Lu}, and {Lt} cases should all match
            catstr = (string) _definedCategories[InternalRegexIgnoreCase];
        }
    }

    if (invert) {
        catstr = NegateCategory(catstr); // negate the category
    }

    _categories.Append(catstr);
}

/*
 * AddCategory()
 *
 * Appends an already-encoded category string to the class.
 */
private void AddCategory(string category) {
    _categories.Append(category);
}

/*
 * AddLowerCase()
 *
 * Adds to the class any lowercase versions of characters already
 * in the class. Used for case-insensitivity.
*/
internal void AddLowercase(CultureInfo culture) {
    _canonical = false;

    // Only the ranges present on entry are visited; AddLowercaseRange appends
    // to the same list and those additions must not be walked again.
    int initialCount = _rangelist.Count;
    for (int idx = 0; idx < initialCount; idx++) {
        SingleRange current = _rangelist[idx];

        if (current._first != current._last) {
            AddLowercaseRange(current._first, current._last, culture);
        }
        else {
            // single character: lower-case it in place
            char lowered = Char.ToLower(current._first, culture);
            current._last = lowered;
            current._first = lowered;
        }
    }
}

/*
 * AddLowercaseRange()
 *
 * Walks the _lcTable entries that intersect [chMin, chMax], applies each
 * entry's operation to the intersection and adds the mapped range when it
 * reaches outside the original range.
 */
private void AddLowercaseRange(char chMin, char chMax, CultureInfo culture) {
    // Binary search for the first entry whose _chMax is not below chMin
    // (presumably _lcTable is ordered by character range -- the search relies on it).
    int lo = 0;
    int hi = _lcTable.Length;
    while (lo < hi) {
        int probe = (lo + hi) / 2;
        if (_lcTable[probe]._chMax < chMin)
            lo = probe + 1;
        else
            hi = probe;
    }

    if (lo >= _lcTable.Length)
        return;

    for (int k = lo; k < _lcTable.Length; k++) {
        LowerCaseMapping entry = _lcTable[k];
        if (entry._chMin > chMax)
            break;

        // clip the table entry to the requested range
        char mappedMin = entry._chMin;
        if (mappedMin < chMin)
            mappedMin = chMin;

        char mappedMax = entry._chMax;
        if (mappedMax > chMax)
            mappedMax = chMax;

        switch (entry._lcOp) {
            case LowercaseSet:
                // whole entry maps to one fixed character
                mappedMin = (char)entry._data;
                mappedMax = (char)entry._data;
                break;
            case LowercaseAdd:
                // shift by a fixed offset
                mappedMin += (char)entry._data;
                mappedMax += (char)entry._data;
                break;
            case LowercaseBor:
                // set the low bit
                mappedMin |= (char)1;
                mappedMax |= (char)1;
                break;
            case LowercaseBad:
                // add one when the low bit is set
                mappedMin += (char)(mappedMin & 1);
                mappedMax += (char)(mappedMax & 1);
                break;
        }

        if (mappedMin < chMin || mappedMax > chMax)
            AddRange(mappedMin, mappedMax);
    }
}

/*
 * AddWord()
 *
 * Adds the word class (or its negation); the ECMA flavour is added as a
 * range set, the Unicode flavour as a category string.
 */
internal void AddWord(bool ecma, bool negate) {
    if (ecma) {
        AddSet(negate ? RegexCharClass.NotECMAWordSet : RegexCharClass.ECMAWordSet);
    }
    else {
        AddCategory(negate ? RegexCharClass.NotWord : RegexCharClass.Word);
    }
}

/*
 * AddSpace()
 *
 * Adds the whitespace class (or its negation); the ECMA flavour is added
 * as a range set, the Unicode flavour as a category string.
 */
internal void AddSpace(bool ecma, bool negate) {
    if (ecma) {
        AddSet(negate ? RegexCharClass.NotECMASpaceSet : RegexCharClass.ECMASpaceSet);
    }
    else {
        AddCategory(negate ? RegexCharClass.NotSpace : RegexCharClass.Space);
    }
}

/*
 * AddDigit()
 *
 * Adds the digit class (or its negation). The ECMA flavour is a range
 * set; otherwise the "Nd" category is added by name.
 */
internal void AddDigit(bool ecma, bool negate, string pattern) {
    if (!ecma) {
        AddCategoryFromName("Nd", negate, false, pattern);
        return;
    }

    AddSet(negate ? RegexCharClass.NotECMADigitSet : RegexCharClass.ECMADigitSet);
}

/*
 * ConvertOldStringsToClass()
 *
 * Packs a separate set string and category string into the single-string
 * layout (flags, set length, category length, set, categories). A set
 * that begins with two '\0' chars yields flag value 1 and those two chars
 * are dropped from the set.
 */
internal static string ConvertOldStringsToClass(string set, string category) {
    bool startsWithTwoNulls = set.Length >= 2 && set[0] =='\0' && set[1] == '\0';
    string ranges = startsWithTwoNulls ? set.Substring(2) : set;

    StringBuilder sb = new StringBuilder(set.Length + category.Length + 3);
    sb.Append(startsWithTwoNulls ? (char) 0x1 : (char) 0x0);
    sb.Append((char) ranges.Length);
    sb.Append((char) category.Length);
    sb.Append(ranges);
    sb.Append(category);

    return sb.ToString();
}

/*
 * SingletonChar()
 *
 * Returns the first char of the set portion; meant for classes that are
 * a singleton or an inverse singleton (checked by assertion only).
 */
internal static char SingletonChar(String set) {
    Debug.Assert(IsSingleton(set) || IsSingletonInverse(set), "Tried to get the singleton char out of a non singleton character class");
    return set[SETSTART];
}

/*
 * IsMergeable()
 *
 * True when the class string is neither negated nor has a subtraction.
 */
internal static bool IsMergeable(string charClass) {
    if (IsNegated(charClass))
        return false;

    return !IsSubtraction(charClass);
}

/*
 * IsEmpty()
 *
 * True when flags, set length and category length are all zero and no
 * subtraction follows.
 */
internal static bool IsEmpty(String charClass) {
    return charClass[CATEGORYLENGTH] == 0
        && charClass[FLAGS] == 0
        && charClass[SETLENGTH] == 0
        && !IsSubtraction(charClass);
}

/*
 * IsSingleton()
 *
 * True for a non-negated class without categories or subtraction whose
 * two-char set encodes exactly one character.
 */
internal static bool IsSingleton(String set) {
    if (set[FLAGS] != 0 || set[CATEGORYLENGTH] != 0 || set[SETLENGTH] != 2 || IsSubtraction(set))
        return false;

    char only = set[SETSTART];
    return only == Lastchar || only + 1 == set[SETSTART+1];
}

/*
 * IsSingletonInverse()
 *
 * Same shape test as IsSingleton, but for a class whose flags char is 1.
 */
internal static bool IsSingletonInverse(String set) {
    if (set[FLAGS] != 1 || set[CATEGORYLENGTH] != 0 || set[SETLENGTH] != 2 || IsSubtraction(set))
        return false;

    char only = set[SETSTART];
    return only == Lastchar || only + 1 == set[SETSTART+1];
}

/*
 * IsSubtraction()
 *
 * True when the string is longer than header + set + categories, i.e.
 * another class string is appended after this one.
 */
private static bool IsSubtraction(string charClass) {
    int ownLength = SETSTART + charClass[SETLENGTH] + charClass[CATEGORYLENGTH];
    return charClass.Length > ownLength;
}

/*
 * IsNegated()
 *
 * True when the string is non-null and its flags char equals 1.
 */
internal static bool IsNegated(string set) {
    if (set == null)
        return false;

    return set[FLAGS] == 1;
}

internal static bool IsECMAWordChar(char ch) {
    // According to ECMA-262, \s, \S, ., ^, and $ use Unicode-based interpretations of
    // whitespace and newline, while \d, \D, \w, \W, \b, and \B use ASCII-only
    // interpretations of digit, word character, and word boundary. So, unlike
    // IsWordChar, ZWNJ (U+200C) and ZWJ (U+200D) get no special treatment here.
    return CharInClass(ch, ECMAWordClass);
}

internal static bool IsWordChar(char ch) {
    // UTS#18 Unicode Regular Expressions (http://www.unicode.org/reports/tr18/),
    // RL 1.4 Simple Word Boundaries: the class of <word_character> includes all
    // Alphabetic values from the Unicode character database plus U+200C
    // ZERO WIDTH NON-JOINER and U+200D ZERO WIDTH JOINER.
    if (CharInClass(ch, WordClass))
        return true;

    return ch == ZeroWidthJoiner || ch == ZeroWidthNonJoiner;
}

/*
 * CharInClass(ch, set)
 *
 * Membership test against a class string, starting at its first char.
 */
internal static bool CharInClass(char ch, String set) {
    return CharInClassRecursive(ch, set, 0);
}

/*
 * CharInClassRecursive()
 *
 * Membership test for the class that begins at offset start. Anything
 * after this class's categories is treated as a subtracted class and is
 * evaluated first, by recursion.
 */
internal static bool CharInClassRecursive(char ch, String set, int start) {
    int mySetLength = set[start+SETLENGTH];
    int myCategoryLength = set[start+CATEGORYLENGTH];
    int myEndPosition = start + SETSTART + mySetLength + myCategoryLength;

    bool subtracted = false;
    if (set.Length > myEndPosition)
        subtracted = CharInClassRecursive(ch, set, myEndPosition);

    bool member = CharInClassInternal(ch, set, start, mySetLength, myCategoryLength);

    // Negation is applied *before* the subtraction: it belongs to this
    // class only, not to the result of the whole subtraction.
    if (set[start+FLAGS] == 1)
        member = !member;

    return member && !subtracted;
}

/*
 * CharInClass()
 *
 * Determines a character's membership in a character class (via the
 * string representation of the class).
*/
private static bool CharInClassInternal(char ch, string set, int start, int mySetLength, int myCategoryLength) {
    int lo = start + SETSTART;
    int hi = lo + mySetLength;

    // Binary search for the position just past the last boundary char that is <= ch.
    while (lo != hi) {
        int probe = (lo + hi) / 2;
        if (ch < set[probe])
            hi = probe;
        else
            lo = probe + 1;
    }

    // Boundaries alternate "range starts" / "range ends", so the parity of the
    // final position tells whether ch fell inside a range. The set begins at
    // start+SETSTART; because SETSTART is odd, comparing against the parity of
    // start alone is enough. If SETSTART ever becomes even this test flips.
    Debug.Assert((SETSTART & 0x1) == 1, "If SETSTART is not odd, the calculation below this will be reversed");
    bool insideRange = (lo & 0x1) == (start & 0x1);
    if (insideRange)
        return true;

    if (myCategoryLength == 0)
        return false;

    return CharInCategory(ch, set, start, mySetLength, myCategoryLength);
}

/*
 * CharInCategory()
 *
 * Scans the category portion of the class that begins at start and
 * returns true as soon as one entry accepts ch. Entries are read as
 * signed 16-bit values:
 *    0               opens a group, handled by CharInCategoryGroup
 *    SpaceConst      accepts when Char.IsWhiteSpace(ch)
 *    NotSpaceConst   accepts when !Char.IsWhiteSpace(ch)
 *    other  > 0      UnicodeCategory value + 1; accepts on equality
 *    other  < 0      negated form of the above; accepts on inequality
 */
private static bool CharInCategory(char ch, string set, int start, int mySetLength, int myCategoryLength) {
    UnicodeCategory chcategory = char.GetUnicodeCategory(ch);

    int begin = start + SETSTART + mySetLength;
    int end = begin + myCategoryLength;

    for (int i = begin; i < end; i++) {
        int curcat = (short) set[i];

        if (curcat == 0) {
            // group marker: the helper advances i to the group's closing marker
            if (CharInCategoryGroup(ch, chcategory, set, ref i))
                return true;
        }
        else if (curcat == SpaceConst) {
            if (Char.IsWhiteSpace(ch))
                return true;
        }
        else if (curcat == NotSpaceConst) {
            if (!Char.IsWhiteSpace(ch))
                return true;
        }
        else if (curcat > 0) {
            // stored value is category + 1
            if (chcategory == (UnicodeCategory) (curcat - 1))
                return true;
        }
        else {
            // stored value is -(category + 1)
            if (chcategory != (UnicodeCategory) (-1 - curcat))
                return true;
        }
    }

    return false;
}

/*
 * CharInCategoryGroup
 *
 * Evaluates a group of categories (L, N, Z, W...) as one unit. On entry i
 * is at the opening zero marker; on exit i is at the closing zero marker.
 * The sign of the first member decides how the group is read: a positive
 * group accepts when ch is in ANY member category, a negative group
 * accepts when ch is in NONE of them.
 */
private static bool CharInCategoryGroup(char ch, UnicodeCategory chcategory, string category, ref int i) {
    i++;

    int code = (short) category[i];
    bool positiveGroup = code > 0;
    bool matched = false;

    while (code != 0) {
        if (!matched) {
            // decode the member back to a UnicodeCategory value
            int member = positiveGroup ? code - 1 : -1 - code;
            if (chcategory == (UnicodeCategory) member)
                matched = true;
        }

        i++;
        code = (short) category[i];
    }

    return positiveGroup ? matched : !matched;
}

/*
 * NegateCategory()
 *
 * Returns a string in which every char of the input, read as a signed
 * 16-bit value, is replaced by its arithmetic negation. Null yields null.
 */
private static string NegateCategory(string category) {
    if (category == null)
        return null;

    StringBuilder negated = new StringBuilder(category.Length);

    foreach (char member in category) {
        short signedValue = (short) member;
        negated.Append( (char) -signedValue);
    }

    return negated.ToString();
}
/*
 * Parse()
 *
 * Rebuilds a RegexCharClass object from its string representation.
 */
internal static RegexCharClass Parse(string charClass) {
    return ParseRecursive(charClass, 0);
}

/*
 * ParseRecursive()
 *
 * Decodes the class that begins at offset start: its ranges, its
 * categories, its negate flag and -- when more text follows -- the
 * subtracted class, which is decoded by recursion.
 */
private static RegexCharClass ParseRecursive(string charClass, int start) {
    int mySetLength = charClass[start+SETLENGTH];
    int myCategoryLength = charClass[start+CATEGORYLENGTH];
    int setBegin = start + SETSTART;
    int setEnd = setBegin + mySetLength;
    int myEndPosition = setEnd + myCategoryLength;

    List<SingleRange> ranges = new List<SingleRange>(mySetLength);
    for (int pos = setBegin; pos < setEnd; pos += 2) {
        char first = charClass[pos];

        // The stored upper bound is exclusive; a missing one means the range
        // is open-ended and runs up to Lastchar.
        char last = (pos + 1 < setEnd) ? (char) (charClass[pos + 1] - 1) : Lastchar;

        ranges.Add(new SingleRange(first, last));
    }

    RegexCharClass sub = null;
    if (charClass.Length > myEndPosition)
        sub = ParseRecursive(charClass, myEndPosition);

    bool negated = charClass[start+FLAGS] == 1;
    StringBuilder categories = new StringBuilder(charClass.Substring(setEnd, myCategoryLength));

    return new RegexCharClass(negated, ranges, categories, sub);
}

/*
 * RangeCount()
 *
 * How many single ranges the class currently holds.
 */
private int RangeCount() {
    return _rangelist.Count;
}

/*
 * ToStringClass()
 *
 * Produces the string representation of the class: flags, set length,
 * category length, the ranges, the categories and finally the string of
 * the subtracted class, if any. Canonicalizes first when needed.
 */
internal String ToStringClass() {
    if (!_canonical)
        Canonicalize();

    // First guess for the set length; it is patched below because a range
    // that ends in Lastchar contributes one char instead of two.
    int rangeLen = _rangelist.Count * 2 ;
    StringBuilder sb = new StringBuilder(rangeLen + _categories.Length + 3);

    sb.Append((char) (_negate ? 1 : 0));
    sb.Append((char) rangeLen);
    sb.Append((char) _categories.Length);

    foreach (SingleRange range in _rangelist) {
        sb.Append(range._first);

        if (range._last != Lastchar)
            sb.Append((char)(range._last + 1));
    }

    // now the real set length is known
    sb[SETLENGTH] = (char) (sb.Length - SETSTART);

    sb.Append(_categories);

    if (_subtractor != null)
        sb.Append(_subtractor.ToStringClass());

    return sb.ToString();
}

/*
 * GetRangeAt(int i)
 *
 * Returns the range stored at position i.
 */
private SingleRange GetRangeAt(int i) {
    return _rangelist[i];
}

/*
 * Canonicalize()
 *
 * Logic to reduce a character class to a unique, sorted form.
*/
private void Canonicalize() {
    _canonical = true;
    _rangelist.Sort(0, _rangelist.Count, new SingleRangeComparer());

    if (_rangelist.Count <= 1)
        return;

    //
    // Merge overlapping or abutting ranges in place.
    //   write - slot of the range currently being extended
    //   read  - next range that has not been looked at yet
    //

    int read = 1;
    int write = 0;

    while (true) {
        char mergedLast = _rangelist[write]._last;
        bool finished = false;

        while (true) {
            // nothing left to read, or the merged range already covers everything above it
            if (read == _rangelist.Count || mergedLast == Lastchar) {
                finished = true;
                break;
            }

            SingleRange candidate = _rangelist[read];

            // a gap of at least one char: the candidate starts a new merged range
            if (candidate._first > mergedLast + 1)
                break;

            if (candidate._last > mergedLast)
                mergedLast = candidate._last;

            read++;
        }

        _rangelist[write]._last = mergedLast;
        write++;

        if (finished)
            break;

        // move the range that ended the merge into the next free slot
        if (write < read)
            _rangelist[write] = _rangelist[read];

        read++;
    }

    // drop the slots that were merged away
    _rangelist.RemoveRange(write, _rangelist.Count - write);
}

/*
 * SetFromProperty()
 *
 * Looks capname up in _propTable (binary search, ordinal comparison) and
 * returns the associated set string. With invert set, the set is flipped
 * by removing a leading Nullchar or by prepending one. An unknown name
 * raises ArgumentException built from pattern and capname.
 */
private static String SetFromProperty(String capname, bool invert, string pattern) {
    int lo = 0;
    int hi = _propTable.GetLength(0);

    while (lo != hi) {
        int probe = (lo + hi) / 2;
        int order = String.Compare(capname, _propTable[probe,0], StringComparison.Ordinal);

        if (order < 0) {
            hi = probe;
            continue;
        }

        if (order > 0) {
            lo = probe + 1;
            continue;
        }

        String set = _propTable[probe,1];
        Debug.Assert(!String.IsNullOrEmpty(set), "Found a null/empty element in RegexCharClass prop table");

        if (!invert)
            return set;

        return (set[0] == Nullchar) ? set.Substring(1) : Nullchar + set;
    }

    throw new ArgumentException(SR.GetString(SR.MakeException, pattern, SR.GetString(SR.UnknownProperty, capname)));
}

#if DBG

/*
 * SetDescription()
 *
 * Produces a human-readable description for a set string.
*/
internal static String SetDescription(String set) {
    int mySetLength = set[SETLENGTH];
    int myCategoryLength = set[CATEGORYLENGTH];
    int setEnd = SETSTART + mySetLength;
    int myEndPosition = setEnd + myCategoryLength;

    StringBuilder desc = new StringBuilder("[");

    if (IsNegated(set))
        desc.Append('^');

    // ---- ranges ----
    int index = SETSTART;
    while (index < setEnd) {
        char ch1 = set[index];
        char ch2;

        // The upper bound exists only if it still lies inside the set portion.
        // Testing against set.Length would read a category char (or the header
        // of a subtracted class) as the bound of an open-ended last range.
        if (index + 1 < setEnd)
            ch2 = (char)(set[index + 1] - 1);
        else
            ch2 = Lastchar;

        desc.Append(CharDescription(ch1));

        if (ch2 != ch1) {
            if (ch1 + 1 != ch2)
                desc.Append('-');
            desc.Append(CharDescription(ch2));
        }
        index += 2;
    }

    // ---- categories ----
    // An odd set length makes the loop above step one past setEnd; restart at
    // setEnd so the first category char is not skipped.
    index = setEnd;
    while (index < myEndPosition) {
        char cat = set[index];
        if (cat == 0) {
            // a zero opens a group that runs up to the next zero
            bool found = false;

            int lastindex = set.IndexOf(GroupChar, index+1);
            string group = set.Substring(index,lastindex-index + 1);

            IDictionaryEnumerator en = _definedCategories.GetEnumerator();
            while(en.MoveNext()) {
                if (group.Equals(en.Value)) {
                    if ((short) set[index+1] > 0)
                        desc.Append("\\p{" + en.Key + "}");
                    else
                        desc.Append("\\P{" + en.Key + "}");

                    found = true;
                    break;
                }
            }

            if (!found) {
                if (group.Equals(Word))
                    desc.Append("\\w");
                else if (group.Equals(NotWord))
                    desc.Append("\\W");
                else
                    Debug.Assert(false, "Couldn't find a goup to match '" + group + "'");
            }

            index = lastindex;
        }
        else {
            desc.Append(CategoryDescription(cat));
        }

        index++;
    }

    // ---- subtraction ----
    if (set.Length > myEndPosition) {
        desc.Append('-');
        desc.Append(SetDescription(set.Substring(myEndPosition)));
    }

    desc.Append(']');

    return desc.ToString();
}

// Hex digits used by CharDescription.
internal static readonly char [] Hex = new char [] {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'};

// Category names indexed by UnicodeCategory value (CategoryDescription indexes
// this array with "stored code - 1", and the stored code is UnicodeCategory + 1).
// InternalRegexIgnoreCase is not a UnicodeCategory, so it has to sit AFTER the
// last real category; placed in the middle it shifts every later name by one.
internal static readonly string[] Categories = new string[] {"Lu", "Ll", "Lt", "Lm", "Lo",
                                                             "Mn", "Mc", "Me",
                                                             "Nd", "Nl", "No",
                                                             "Zs", "Zl", "Zp",
                                                             "Cc", "Cf", "Cs", "Co",
                                                             "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po",
                                                             "Sm", "Sc", "Sk", "So",
                                                             "Cn",
                                                             InternalRegexIgnoreCase };

/*
 * CharDescription()
 *
 * Produces a human-readable description for a single character:
 * a backslash is doubled, printable ASCII is returned as is, anything
 * below 256 becomes \xHH and everything else becomes \uHHHH.
 */
internal static String CharDescription(char ch) {
    if (ch == '\\')
        return "\\\\";

    if (ch >= ' ' && ch <= '~')
        return ch.ToString();

    StringBuilder sb = new StringBuilder();
    int shift;

    if (ch < 256) {
        sb.Append("\\x");
        shift = 8;
    }
    else {
        sb.Append("\\u");
        shift = 16;
    }

    // emit one hex digit per nibble, most significant first
    while (shift > 0) {
        shift -= 4;
        sb.Append(Hex[(ch >> shift) & 0xF]);
    }

    return sb.ToString();
}

/*
 * CategoryDescription()
 *
 * Describes one (non-group) category code: \s and \S for the two space
 * constants, \p{..} for a positive code and \P{..} for a negative one.
 */
private static String CategoryDescription(char ch) {
    if (ch == SpaceConst)
        return "\\s";

    short code = (short) ch;

    if (code == NotSpaceConst)
        return "\\S";

    if (code < 0)
        return "\\P{" + Categories[(- code - 1)] + "}";

    return "\\p{" + Categories[(ch - 1)] + "}";
}

#endif

// Lower case mapping descriptor.
private struct LowerCaseMapping {
    // first and last character the mapping applies to
    internal char _chMin;
    internal char _chMax;

    // operation code and its operand
    internal int _lcOp;
    internal int _data;

    internal LowerCaseMapping(char chMin, char chMax, int lcOp, int data) {
        _chMin = chMin;
        _chMax = chMax;
        _lcOp = lcOp;
        _data = data;
    }
}

/*
 * SingleRangeComparer
 *
 * Orders ranges by their first character only; used when sorting the
 * range list.
 */
private sealed class SingleRangeComparer : IComparer<SingleRange> {
    public int Compare(SingleRange x, SingleRange y) {
        if (x._first < y._first)
            return -1;

        if (x._first > y._first)
            return 1;

        return 0;
    }
}

/*
 * SingleRange
 *
 * One inclusive [first, last] range of characters. It is a class with
 * writable fields because ranges are updated in place once stored in the list.
 */
private sealed class SingleRange {
    internal char _first;
    internal char _last;

    internal SingleRange(char first, char last) {
        _first = first;
        _last = last;
    }
}
}

}