1 //------------------------------------------------------------------------------
2 // <copyright file="RegexCharClass.cs" company="Microsoft">
3 //     Copyright (c) Microsoft Corporation.  All rights reserved.
4 // </copyright>
5 //------------------------------------------------------------------------------
6 
7 // This RegexCharClass class provides the "set of Unicode chars" functionality
8 // used by the regexp engine.
9 
10 // The main function of RegexCharClass is as a builder to turn ranges, characters and
11 // Unicode categories into a single string.  This string is used as a black box
12 // representation of a character class by the rest of Regex.  The format is as follows.
13 //
14 // Char index   Use
15 //      0       Flags - currently this only holds the "negate" flag
16 //      1       length of the string representing the "set" portion, e.g. [a-z0-9] only has a "set"
17 //      2       length of the string representing the "category" portion, e.g. [\p{Lu}] only has a "category"
18 //      3...m   The set.  These are a series of ranges which define the characters included in the set.
19 //              To determine if a given character is in the set, we binary search over this set of ranges
20 //              and see where the character should go.  Based on whether the ending index is odd or even,
21 //              we know if the character is in the set.
22 //      m+1...n The categories.  This is a list of UnicodeCategory enum values which describe categories
23 //              included in this class.
24 
25 namespace System.Text.RegularExpressions {
26 
27     using System.Collections;
28     using System.Collections.Generic;
29     using System.Globalization;
30     using System.Diagnostics;
31 
32     internal sealed class RegexCharClass {
33         // instance data
           // Ranges gathered for this class. NOTE(review): presumably these become the "set"
           // portion of the class string described in the file header - confirm against the builder code.
34         private List<SingleRange>          _rangelist;
           // NOTE(review): presumably accumulates the "category" portion of the class string - confirm.
35         private StringBuilder      _categories;
           // NOTE(review): presumably true while _rangelist is already sorted and merged - confirm.
36         private bool               _canonical;
           // NOTE(review): presumably mirrors the "negate" flag kept at char index 0 of the class string - confirm.
37         private bool               _negate;
           // NOTE(review): presumably a nested class whose members are excluded from this one
           // (character class subtraction) - confirm.
38         private RegexCharClass     _subtractor;
39 
40         // Constants
           // Char indexes into the class string; they match the layout table in the file header:
           // index 0 = flags, index 1 = length of the set, index 2 = length of the categories,
           // index 3 = first char of the set.
41         private const int FLAGS = 0;
42         private const int SETLENGTH = 1;
43         private const int CATEGORYLENGTH = 2;
44         private const int SETSTART = 3;
45 
           // Lowest and highest 16-bit char values.
46         private const char   Nullchar   = '\0';
47         private const char   Lastchar   = '\uFFFF';
48 
           // Same value as Nullchar. NOTE(review): its role is not visible in this part of the file;
           // presumably a marker char inside the category portion - confirm.
49         private const char GroupChar = (char) 0;
50 
51 
           // 100 is also the value of the single char in the Space string ("\x64").
           // NOTE(review): presumably pseudo-category codes for whitespace / non-whitespace - confirm.
52         private const short SpaceConst = 100;
53         private const short NotSpaceConst = -100;
54 
           // U+200D ZERO WIDTH JOINER and U+200C ZERO WIDTH NON-JOINER.
55         private const char ZeroWidthJoiner = '\u200D';
56         private const char ZeroWidthNonJoiner = '\u200C';
57 
58 
           // NOTE(review): usage is not visible in this part of the file; presumably a reserved
           // category name used internally for case-insensitive handling - confirm.
59         private static readonly String InternalRegexIgnoreCase = "__InternalRegexIgnoreCase__";
           // One-char string whose char value is 0x64 (100), the same number as SpaceConst.
60         private static readonly String Space = "\x64";
           // Whatever NegateCategory returns for Space (NegateCategory is defined outside this view).
61         private static readonly String NotSpace = NegateCategory(Space);
           // readonly with no initializer, so these can only be assigned in the static constructor.
62         private static readonly String Word;
63         private static readonly String NotWord;
64 
           // Predefined class strings; also readonly without initializers, so they are
           // assigned in the static constructor.
65         internal static readonly String SpaceClass;
66         internal static readonly String NotSpaceClass;
67         internal static readonly String WordClass;
68         internal static readonly String NotWordClass;
69         internal static readonly String DigitClass;
70         internal static readonly String NotDigitClass;
71 
           // "Set" portions for the ECMA classes: each char is a range boundary (see the file header).
           // NOTE(review): boundaries look like [first, last+1) pairs, e.g. \u0030\u003A for '0'-'9' - confirm.
           // The NotECMA*Set variants are the same boundaries with a leading \0.
           // NOTE(review): presumably the extra \0 boundary inverts the set; none of the class
           // constants visible below reference the NotECMA*Set strings.
72         private const String ECMASpaceSet    = "\u0009\u000E\u0020\u0021";
73         private const String NotECMASpaceSet = "\0\u0009\u000E\u0020\u0021";
74         private const String ECMAWordSet     = "\u0030\u003A\u0041\u005B\u005F\u0060\u0061\u007B\u0130\u0131";
75         private const String NotECMAWordSet  = "\0\u0030\u003A\u0041\u005B\u005F\u0060\u0061\u007B\u0130\u0131";
76         private const String ECMADigitSet    = "\u0030\u003A";
77         private const String NotECMADigitSet = "\0\u0030\u003A";
78 
           // Complete class strings: three header chars followed by the set.
           //   char 0: flags (\x00, or \x01 for the negated variants)
           //   char 1: set length (\x04, \x0A, \x02 - the char counts of the sets above)
           //   char 2: category length (\x00, no categories)
           // The Not* variants reuse the non-negated set and differ only in the flags char.
79         internal const String ECMASpaceClass    = "\x00\x04\x00" + ECMASpaceSet;
80         internal const String NotECMASpaceClass = "\x01\x04\x00" + ECMASpaceSet;
81         internal const String ECMAWordClass     = "\x00\x0A\x00" + ECMAWordSet;
82         internal const String NotECMAWordClass  = "\x01\x0A\x00" + ECMAWordSet;
83         internal const String ECMADigitClass    = "\x00\x02\x00" + ECMADigitSet;
84         internal const String NotECMADigitClass = "\x01\x02\x00" + ECMADigitSet;
85 
           // AnyClass: flags 0, set length 1, category length 0, set = the single boundary \x00.
           // EmptyClass: flags 0, set length 0, category length 0 (header only).
86         internal const String AnyClass          = "\x00\x01\x00\x00";
87         internal const String EmptyClass        = "\x00\x00\x00";
88 
           // Not initialized here. NOTE(review): presumably maps category names to their
           // category strings and is filled in by the static constructor - confirm.
89         static Dictionary<String, String> _definedCategories;
90 
91         /*
92          *   The property table contains all the block definitions defined in the
93          *   XML schema spec (http://www.w3.org/TR/2001/PR-xmlschema-2-20010316/#charcter-classes), Unicode 4.0 spec (www.unicode.org),
94          *   and Perl 5.6 (see Programming Perl, 3rd edition page 167).   Three blocks defined by Perl (and here) may
95          *   not be in the Unicode standard: IsHighPrivateUseSurrogates, IsHighSurrogates, and IsLowSurrogates.
96          *
97         **/
98         // Has to be sorted by the first column
99         private static readonly String[,] _propTable = {
100             {"IsAlphabeticPresentationForms",       "\uFB00\uFB50"},
101             {"IsArabic",                            "\u0600\u0700"},
102             {"IsArabicPresentationForms-A",         "\uFB50\uFE00"},
103             {"IsArabicPresentationForms-B",         "\uFE70\uFF00"},
104             {"IsArmenian",                          "\u0530\u0590"},
105             {"IsArrows",                            "\u2190\u2200"},
106             {"IsBasicLatin",                        "\u0000\u0080"},
107             {"IsBengali",                           "\u0980\u0A00"},
108             {"IsBlockElements",                     "\u2580\u25A0"},
109             {"IsBopomofo",                          "\u3100\u3130"},
110             {"IsBopomofoExtended",                  "\u31A0\u31C0"},
111             {"IsBoxDrawing",                        "\u2500\u2580"},
112             {"IsBraillePatterns",                   "\u2800\u2900"},
113             {"IsBuhid",                             "\u1740\u1760"},
114             {"IsCJKCompatibility",                  "\u3300\u3400"},
115             {"IsCJKCompatibilityForms",             "\uFE30\uFE50"},
116             {"IsCJKCompatibilityIdeographs",        "\uF900\uFB00"},
117             {"IsCJKRadicalsSupplement",             "\u2E80\u2F00"},
118             {"IsCJKSymbolsandPunctuation",          "\u3000\u3040"},
119             {"IsCJKUnifiedIdeographs",              "\u4E00\uA000"},
120             {"IsCJKUnifiedIdeographsExtensionA",    "\u3400\u4DC0"},
121             {"IsCherokee",                          "\u13A0\u1400"},
122             {"IsCombiningDiacriticalMarks",         "\u0300\u0370"},
123             {"IsCombiningDiacriticalMarksforSymbols","\u20D0\u2100"},
124             {"IsCombiningHalfMarks",                "\uFE20\uFE30"},
125             {"IsCombiningMarksforSymbols",          "\u20D0\u2100"},
126             {"IsControlPictures",                   "\u2400\u2440"},
127             {"IsCurrencySymbols",                   "\u20A0\u20D0"},
128             {"IsCyrillic",                          "\u0400\u0500"},
129             {"IsCyrillicSupplement",                "\u0500\u0530"},
130             {"IsDevanagari",                        "\u0900\u0980"},
131             {"IsDingbats",                          "\u2700\u27C0"},
132             {"IsEnclosedAlphanumerics",             "\u2460\u2500"},
133             {"IsEnclosedCJKLettersandMonths",       "\u3200\u3300"},
134             {"IsEthiopic",                          "\u1200\u1380"},
135             {"IsGeneralPunctuation",                "\u2000\u2070"},
136             {"IsGeometricShapes",                   "\u25A0\u2600"},
137             {"IsGeorgian",                          "\u10A0\u1100"},
138             {"IsGreek",                             "\u0370\u0400"},
139             {"IsGreekExtended",                     "\u1F00\u2000"},
140             {"IsGreekandCoptic",                    "\u0370\u0400"},
141             {"IsGujarati",                          "\u0A80\u0B00"},
142             {"IsGurmukhi",                          "\u0A00\u0A80"},
143             {"IsHalfwidthandFullwidthForms",        "\uFF00\uFFF0"},
144             {"IsHangulCompatibilityJamo",           "\u3130\u3190"},
145             {"IsHangulJamo",                        "\u1100\u1200"},
146             {"IsHangulSyllables",                   "\uAC00\uD7B0"},
147             {"IsHanunoo",                           "\u1720\u1740"},
148             {"IsHebrew",                            "\u0590\u0600"},
149             {"IsHighPrivateUseSurrogates",          "\uDB80\uDC00"},
150             {"IsHighSurrogates",                    "\uD800\uDB80"},
151             {"IsHiragana",                          "\u3040\u30A0"},
152             {"IsIPAExtensions",                     "\u0250\u02B0"},
153             {"IsIdeographicDescriptionCharacters",  "\u2FF0\u3000"},
154             {"IsKanbun",                            "\u3190\u31A0"},
155             {"IsKangxiRadicals",                    "\u2F00\u2FE0"},
156             {"IsKannada",                           "\u0C80\u0D00"},
157             {"IsKatakana",                          "\u30A0\u3100"},
158             {"IsKatakanaPhoneticExtensions",        "\u31F0\u3200"},
159             {"IsKhmer",                             "\u1780\u1800"},
160             {"IsKhmerSymbols",                      "\u19E0\u1A00"},
161             {"IsLao",                               "\u0E80\u0F00"},
162             {"IsLatin-1Supplement",                 "\u0080\u0100"},
163             {"IsLatinExtended-A",                   "\u0100\u0180"},
164             {"IsLatinExtended-B",                   "\u0180\u0250"},
165             {"IsLatinExtendedAdditional",           "\u1E00\u1F00"},
166             {"IsLetterlikeSymbols",                 "\u2100\u2150"},
167             {"IsLimbu",                             "\u1900\u1950"},
168             {"IsLowSurrogates",                     "\uDC00\uE000"},
169             {"IsMalayalam",                         "\u0D00\u0D80"},
170             {"IsMathematicalOperators",             "\u2200\u2300"},
171             {"IsMiscellaneousMathematicalSymbols-A","\u27C0\u27F0"},
172             {"IsMiscellaneousMathematicalSymbols-B","\u2980\u2A00"},
173             {"IsMiscellaneousSymbols",              "\u2600\u2700"},
174             {"IsMiscellaneousSymbolsandArrows",     "\u2B00\u2C00"},
175             {"IsMiscellaneousTechnical",            "\u2300\u2400"},
176             {"IsMongolian",                         "\u1800\u18B0"},
177             {"IsMyanmar",                           "\u1000\u10A0"},
178             {"IsNumberForms",                       "\u2150\u2190"},
179             {"IsOgham",                             "\u1680\u16A0"},
180             {"IsOpticalCharacterRecognition",       "\u2440\u2460"},
181             {"IsOriya",                             "\u0B00\u0B80"},
182             {"IsPhoneticExtensions",                "\u1D00\u1D80"},
183             {"IsPrivateUse",                        "\uE000\uF900"},
184             {"IsPrivateUseArea",                    "\uE000\uF900"},
185             {"IsRunic",                             "\u16A0\u1700"},
186             {"IsSinhala",                           "\u0D80\u0E00"},
187             {"IsSmallFormVariants",                 "\uFE50\uFE70"},
188             {"IsSpacingModifierLetters",            "\u02B0\u0300"},
189             {"IsSpecials",                          "\uFFF0"},
190             {"IsSuperscriptsandSubscripts",         "\u2070\u20A0"},
191             {"IsSupplementalArrows-A",              "\u27F0\u2800"},
192             {"IsSupplementalArrows-B",              "\u2900\u2980"},
193             {"IsSupplementalMathematicalOperators", "\u2A00\u2B00"},
194             {"IsSyriac",                            "\u0700\u0750"},
195             {"IsTagalog",                           "\u1700\u1720"},
196             {"IsTagbanwa",                          "\u1760\u1780"},
197             {"IsTaiLe",                             "\u1950\u1980"},
198             {"IsTamil",                             "\u0B80\u0C00"},
199             {"IsTelugu",                            "\u0C00\u0C80"},
200             {"IsThaana",                            "\u0780\u07C0"},
201             {"IsThai",                              "\u0E00\u0E80"},
202             {"IsTibetan",                           "\u0F00\u1000"},
203             {"IsUnifiedCanadianAboriginalSyllabics","\u1400\u1680"},
204             {"IsVariationSelectors",                "\uFE00\uFE10"},
205             {"IsYiRadicals",                        "\uA490\uA4D0"},
206             {"IsYiSyllables",                       "\uA000\uA490"},
207             {"IsYijingHexagramSymbols",             "\u4DC0\u4E00"},
208             {"_xmlC", /* Name Char              */   "\u002D\u002F\u0030\u003B\u0041\u005B\u005F\u0060\u0061\u007B\u00B7\u00B8\u00C0\u00D7\u00D8\u00F7\u00F8\u0132\u0134\u013F\u0141\u0149\u014A\u017F\u0180\u01C4\u01CD\u01F1\u01F4\u01F6\u01FA\u0218\u0250\u02A9\u02BB\u02C2\u02D0\u02D2\u0300\u0346\u0360\u0362\u0386\u038B\u038C\u038D\u038E\u03A2\u03A3\u03CF\u03D0\u03D7\u03DA\u03DB\u03DC\u03DD\u03DE\u03DF\u03E0\u03E1\u03E2\u03F4\u0401\u040D\u040E\u0450\u0451\u045D\u045E\u0482\u0483\u0487\u0490\u04C5\u04C7\u04C9\u04CB\u04CD\u04D0\u04EC\u04EE\u04F6\u04F8\u04FA\u0531\u0557\u0559\u055A\u0561\u0587\u0591\u05A2\u05A3\u05BA\u05BB\u05BE\u05BF\u05C0\u05C1\u05C3\u05C4\u05C5\u05D0\u05EB\u05F0\u05F3\u0621\u063B\u0640\u0653\u0660\u066A\u0670\u06B8\u06BA\u06BF\u06C0\u06CF\u06D0\u06D4\u06D5\u06E9\u06EA\u06EE\u06F0\u06FA\u0901\u0904\u0905\u093A\u093C\u094E\u0951\u0955\u0958\u0964\u0966\u0970\u0981\u0984\u0985\u098D\u098F\u0991\u0993\u09A9\u09AA\u09B1\u09B2\u09B3\u09B6\u09BA\u09BC\u09BD\u09BE\u09C5\u09C7\u09C9\u09CB\u09CE\u09D7\u09D8\u09DC"
209                 +"\u09DE\u09DF\u09E4\u09E6\u09F2\u0A02\u0A03\u0A05\u0A0B\u0A0F\u0A11\u0A13\u0A29\u0A2A\u0A31\u0A32\u0A34\u0A35\u0A37\u0A38\u0A3A\u0A3C\u0A3D\u0A3E\u0A43\u0A47\u0A49\u0A4B\u0A4E\u0A59\u0A5D\u0A5E\u0A5F\u0A66\u0A75\u0A81\u0A84\u0A85\u0A8C\u0A8D\u0A8E\u0A8F\u0A92\u0A93\u0AA9\u0AAA\u0AB1\u0AB2\u0AB4\u0AB5\u0ABA\u0ABC\u0AC6\u0AC7\u0ACA\u0ACB\u0ACE\u0AE0\u0AE1\u0AE6\u0AF0\u0B01\u0B04\u0B05\u0B0D\u0B0F\u0B11\u0B13\u0B29\u0B2A\u0B31\u0B32\u0B34\u0B36\u0B3A\u0B3C\u0B44\u0B47\u0B49\u0B4B\u0B4E\u0B56\u0B58\u0B5C\u0B5E\u0B5F\u0B62\u0B66\u0B70\u0B82\u0B84\u0B85\u0B8B\u0B8E\u0B91\u0B92\u0B96\u0B99\u0B9B\u0B9C\u0B9D\u0B9E\u0BA0\u0BA3\u0BA5\u0BA8\u0BAB\u0BAE\u0BB6\u0BB7\u0BBA\u0BBE\u0BC3\u0BC6\u0BC9\u0BCA\u0BCE\u0BD7\u0BD8\u0BE7\u0BF0\u0C01\u0C04\u0C05\u0C0D\u0C0E\u0C11\u0C12\u0C29\u0C2A\u0C34\u0C35\u0C3A\u0C3E\u0C45\u0C46\u0C49\u0C4A\u0C4E\u0C55\u0C57\u0C60\u0C62\u0C66\u0C70\u0C82\u0C84\u0C85\u0C8D\u0C8E\u0C91\u0C92\u0CA9\u0CAA\u0CB4\u0CB5\u0CBA\u0CBE\u0CC5\u0CC6\u0CC9\u0CCA\u0CCE\u0CD5\u0CD7\u0CDE\u0CDF\u0CE0\u0CE2"
210                 +"\u0CE6\u0CF0\u0D02\u0D04\u0D05\u0D0D\u0D0E\u0D11\u0D12\u0D29\u0D2A\u0D3A\u0D3E\u0D44\u0D46\u0D49\u0D4A\u0D4E\u0D57\u0D58\u0D60\u0D62\u0D66\u0D70\u0E01\u0E2F\u0E30\u0E3B\u0E40\u0E4F\u0E50\u0E5A\u0E81\u0E83\u0E84\u0E85\u0E87\u0E89\u0E8A\u0E8B\u0E8D\u0E8E\u0E94\u0E98\u0E99\u0EA0\u0EA1\u0EA4\u0EA5\u0EA6\u0EA7\u0EA8\u0EAA\u0EAC\u0EAD\u0EAF\u0EB0\u0EBA\u0EBB\u0EBE\u0EC0\u0EC5\u0EC6\u0EC7\u0EC8\u0ECE\u0ED0\u0EDA\u0F18\u0F1A\u0F20\u0F2A\u0F35\u0F36\u0F37\u0F38\u0F39\u0F3A\u0F3E\u0F48\u0F49\u0F6A\u0F71\u0F85\u0F86\u0F8C\u0F90\u0F96\u0F97\u0F98\u0F99\u0FAE\u0FB1\u0FB8\u0FB9\u0FBA\u10A0\u10C6\u10D0\u10F7\u1100\u1101\u1102\u1104\u1105\u1108\u1109\u110A\u110B\u110D\u110E\u1113\u113C\u113D\u113E\u113F\u1140\u1141\u114C\u114D\u114E\u114F\u1150\u1151\u1154\u1156\u1159\u115A\u115F\u1162\u1163\u1164\u1165\u1166\u1167\u1168\u1169\u116A\u116D\u116F\u1172\u1174\u1175\u1176\u119E\u119F\u11A8\u11A9\u11AB\u11AC\u11AE\u11B0\u11B7\u11B9\u11BA\u11BB\u11BC\u11C3\u11EB\u11EC\u11F0\u11F1\u11F9\u11FA\u1E00\u1E9C\u1EA0\u1EFA\u1F00"
211                 +"\u1F16\u1F18\u1F1E\u1F20\u1F46\u1F48\u1F4E\u1F50\u1F58\u1F59\u1F5A\u1F5B\u1F5C\u1F5D\u1F5E\u1F5F\u1F7E\u1F80\u1FB5\u1FB6\u1FBD\u1FBE\u1FBF\u1FC2\u1FC5\u1FC6\u1FCD\u1FD0\u1FD4\u1FD6\u1FDC\u1FE0\u1FED\u1FF2\u1FF5\u1FF6\u1FFD\u20D0\u20DD\u20E1\u20E2\u2126\u2127\u212A\u212C\u212E\u212F\u2180\u2183\u3005\u3006\u3007\u3008\u3021\u3030\u3031\u3036\u3041\u3095\u3099\u309B\u309D\u309F\u30A1\u30FB\u30FC\u30FF\u3105\u312D\u4E00\u9FA6\uAC00\uD7A4"},
212             {"_xmlD",                                "\u0030\u003A\u0660\u066A\u06F0\u06FA\u0966\u0970\u09E6\u09F0\u0A66\u0A70\u0AE6\u0AF0\u0B66\u0B70\u0BE7\u0BF0\u0C66\u0C70\u0CE6\u0CF0\u0D66\u0D70\u0E50\u0E5A\u0ED0\u0EDA\u0F20\u0F2A\u1040\u104A\u1369\u1372\u17E0\u17EA\u1810\u181A\uFF10\uFF1A"},
213             {"_xmlI", /* Start Name Char       */    "\u003A\u003B\u0041\u005B\u005F\u0060\u0061\u007B\u00C0\u00D7\u00D8\u00F7\u00F8\u0132\u0134\u013F\u0141\u0149\u014A\u017F\u0180\u01C4\u01CD\u01F1\u01F4\u01F6\u01FA\u0218\u0250\u02A9\u02BB\u02C2\u0386\u0387\u0388\u038B\u038C\u038D\u038E\u03A2\u03A3\u03CF\u03D0\u03D7\u03DA\u03DB\u03DC\u03DD\u03DE\u03DF\u03E0\u03E1\u03E2\u03F4\u0401\u040D\u040E\u0450\u0451\u045D\u045E\u0482\u0490\u04C5\u04C7\u04C9\u04CB\u04CD\u04D0\u04EC\u04EE\u04F6\u04F8\u04FA\u0531\u0557\u0559\u055A\u0561\u0587\u05D0\u05EB\u05F0\u05F3\u0621\u063B\u0641\u064B\u0671\u06B8\u06BA\u06BF\u06C0\u06CF\u06D0\u06D4\u06D5\u06D6\u06E5\u06E7\u0905\u093A\u093D\u093E\u0958\u0962\u0985\u098D\u098F\u0991\u0993\u09A9\u09AA\u09B1\u09B2\u09B3\u09B6\u09BA\u09DC\u09DE\u09DF\u09E2\u09F0\u09F2\u0A05\u0A0B\u0A0F\u0A11\u0A13\u0A29\u0A2A\u0A31\u0A32\u0A34\u0A35\u0A37\u0A38\u0A3A\u0A59\u0A5D\u0A5E\u0A5F\u0A72\u0A75\u0A85\u0A8C\u0A8D\u0A8E\u0A8F\u0A92\u0A93\u0AA9\u0AAA\u0AB1\u0AB2\u0AB4\u0AB5\u0ABA\u0ABD\u0ABE\u0AE0\u0AE1\u0B05\u0B0D\u0B0F"
214                 +"\u0B11\u0B13\u0B29\u0B2A\u0B31\u0B32\u0B34\u0B36\u0B3A\u0B3D\u0B3E\u0B5C\u0B5E\u0B5F\u0B62\u0B85\u0B8B\u0B8E\u0B91\u0B92\u0B96\u0B99\u0B9B\u0B9C\u0B9D\u0B9E\u0BA0\u0BA3\u0BA5\u0BA8\u0BAB\u0BAE\u0BB6\u0BB7\u0BBA\u0C05\u0C0D\u0C0E\u0C11\u0C12\u0C29\u0C2A\u0C34\u0C35\u0C3A\u0C60\u0C62\u0C85\u0C8D\u0C8E\u0C91\u0C92\u0CA9\u0CAA\u0CB4\u0CB5\u0CBA\u0CDE\u0CDF\u0CE0\u0CE2\u0D05\u0D0D\u0D0E\u0D11\u0D12\u0D29\u0D2A\u0D3A\u0D60\u0D62\u0E01\u0E2F\u0E30\u0E31\u0E32\u0E34\u0E40\u0E46\u0E81\u0E83\u0E84\u0E85\u0E87\u0E89\u0E8A\u0E8B\u0E8D\u0E8E\u0E94\u0E98\u0E99\u0EA0\u0EA1\u0EA4\u0EA5\u0EA6\u0EA7\u0EA8\u0EAA\u0EAC\u0EAD\u0EAF\u0EB0\u0EB1\u0EB2\u0EB4\u0EBD\u0EBE\u0EC0\u0EC5\u0F40\u0F48\u0F49\u0F6A\u10A0\u10C6\u10D0\u10F7\u1100\u1101\u1102\u1104\u1105\u1108\u1109\u110A\u110B\u110D\u110E\u1113\u113C\u113D\u113E\u113F\u1140\u1141\u114C\u114D\u114E\u114F\u1150\u1151\u1154\u1156\u1159\u115A\u115F\u1162\u1163\u1164\u1165\u1166\u1167\u1168\u1169\u116A\u116D\u116F\u1172\u1174\u1175\u1176\u119E\u119F\u11A8\u11A9\u11AB\u11AC"
215                 +"\u11AE\u11B0\u11B7\u11B9\u11BA\u11BB\u11BC\u11C3\u11EB\u11EC\u11F0\u11F1\u11F9\u11FA\u1E00\u1E9C\u1EA0\u1EFA\u1F00\u1F16\u1F18\u1F1E\u1F20\u1F46\u1F48\u1F4E\u1F50\u1F58\u1F59\u1F5A\u1F5B\u1F5C\u1F5D\u1F5E\u1F5F\u1F7E\u1F80\u1FB5\u1FB6\u1FBD\u1FBE\u1FBF\u1FC2\u1FC5\u1FC6\u1FCD\u1FD0\u1FD4\u1FD6\u1FDC\u1FE0\u1FED\u1FF2\u1FF5\u1FF6\u1FFD\u2126\u2127\u212A\u212C\u212E\u212F\u2180\u2183\u3007\u3008\u3021\u302A\u3041\u3095\u30A1\u30FB\u3105\u312D\u4E00\u9FA6\uAC00\uD7A4"},
216             {"_xmlW",                                "\u0024\u0025\u002B\u002C\u0030\u003A\u003C\u003F\u0041\u005B\u005E\u005F\u0060\u007B\u007C\u007D\u007E\u007F\u00A2\u00AB\u00AC\u00AD\u00AE\u00B7\u00B8\u00BB\u00BC\u00BF\u00C0\u0221\u0222\u0234\u0250\u02AE\u02B0\u02EF\u0300\u0350\u0360\u0370\u0374\u0376\u037A\u037B\u0384\u0387\u0388\u038B\u038C\u038D\u038E\u03A2\u03A3\u03CF\u03D0\u03F7\u0400\u0487\u0488\u04CF\u04D0\u04F6\u04F8\u04FA\u0500\u0510\u0531\u0557\u0559\u055A\u0561\u0588\u0591\u05A2\u05A3\u05BA\u05BB\u05BE\u05BF\u05C0\u05C1\u05C3\u05C4\u05C5\u05D0\u05EB\u05F0\u05F3\u0621\u063B\u0640\u0656\u0660\u066A\u066E\u06D4\u06D5\u06DD\u06DE\u06EE\u06F0\u06FF\u0710\u072D\u0730\u074B\u0780\u07B2\u0901\u0904\u0905\u093A\u093C\u094E\u0950\u0955\u0958\u0964\u0966\u0970\u0981\u0984\u0985\u098D\u098F\u0991\u0993\u09A9\u09AA\u09B1\u09B2\u09B3\u09B6\u09BA\u09BC\u09BD\u09BE\u09C5\u09C7\u09C9\u09CB\u09CE\u09D7\u09D8\u09DC\u09DE\u09DF\u09E4\u09E6\u09FB\u0A02\u0A03\u0A05\u0A0B\u0A0F\u0A11\u0A13\u0A29\u0A2A\u0A31\u0A32\u0A34\u0A35"
217                 +"\u0A37\u0A38\u0A3A\u0A3C\u0A3D\u0A3E\u0A43\u0A47\u0A49\u0A4B\u0A4E\u0A59\u0A5D\u0A5E\u0A5F\u0A66\u0A75\u0A81\u0A84\u0A85\u0A8C\u0A8D\u0A8E\u0A8F\u0A92\u0A93\u0AA9\u0AAA\u0AB1\u0AB2\u0AB4\u0AB5\u0ABA\u0ABC\u0AC6\u0AC7\u0ACA\u0ACB\u0ACE\u0AD0\u0AD1\u0AE0\u0AE1\u0AE6\u0AF0\u0B01\u0B04\u0B05\u0B0D\u0B0F\u0B11\u0B13\u0B29\u0B2A\u0B31\u0B32\u0B34\u0B36\u0B3A\u0B3C\u0B44\u0B47\u0B49\u0B4B\u0B4E\u0B56\u0B58\u0B5C\u0B5E\u0B5F\u0B62\u0B66\u0B71\u0B82\u0B84\u0B85\u0B8B\u0B8E\u0B91\u0B92\u0B96\u0B99\u0B9B\u0B9C\u0B9D\u0B9E\u0BA0\u0BA3\u0BA5\u0BA8\u0BAB\u0BAE\u0BB6\u0BB7\u0BBA\u0BBE\u0BC3\u0BC6\u0BC9\u0BCA\u0BCE\u0BD7\u0BD8\u0BE7\u0BF3\u0C01\u0C04\u0C05\u0C0D\u0C0E\u0C11\u0C12\u0C29\u0C2A\u0C34\u0C35\u0C3A\u0C3E\u0C45\u0C46\u0C49\u0C4A\u0C4E\u0C55\u0C57\u0C60\u0C62\u0C66\u0C70\u0C82\u0C84\u0C85\u0C8D\u0C8E\u0C91\u0C92\u0CA9\u0CAA\u0CB4\u0CB5\u0CBA\u0CBE\u0CC5\u0CC6\u0CC9\u0CCA\u0CCE\u0CD5\u0CD7\u0CDE\u0CDF\u0CE0\u0CE2\u0CE6\u0CF0\u0D02\u0D04\u0D05\u0D0D\u0D0E\u0D11\u0D12\u0D29\u0D2A\u0D3A\u0D3E\u0D44\u0D46\u0D49"
218                 +"\u0D4A\u0D4E\u0D57\u0D58\u0D60\u0D62\u0D66\u0D70\u0D82\u0D84\u0D85\u0D97\u0D9A\u0DB2\u0DB3\u0DBC\u0DBD\u0DBE\u0DC0\u0DC7\u0DCA\u0DCB\u0DCF\u0DD5\u0DD6\u0DD7\u0DD8\u0DE0\u0DF2\u0DF4\u0E01\u0E3B\u0E3F\u0E4F\u0E50\u0E5A\u0E81\u0E83\u0E84\u0E85\u0E87\u0E89\u0E8A\u0E8B\u0E8D\u0E8E\u0E94\u0E98\u0E99\u0EA0\u0EA1\u0EA4\u0EA5\u0EA6\u0EA7\u0EA8\u0EAA\u0EAC\u0EAD\u0EBA\u0EBB\u0EBE\u0EC0\u0EC5\u0EC6\u0EC7\u0EC8\u0ECE\u0ED0\u0EDA\u0EDC\u0EDE\u0F00\u0F04\u0F13\u0F3A\u0F3E\u0F48\u0F49\u0F6B\u0F71\u0F85\u0F86\u0F8C\u0F90\u0F98\u0F99\u0FBD\u0FBE\u0FCD\u0FCF\u0FD0\u1000\u1022\u1023\u1028\u1029\u102B\u102C\u1033\u1036\u103A\u1040\u104A\u1050\u105A\u10A0\u10C6\u10D0\u10F9\u1100\u115A\u115F\u11A3\u11A8\u11FA\u1200\u1207\u1208\u1247\u1248\u1249\u124A\u124E\u1250\u1257\u1258\u1259\u125A\u125E\u1260\u1287\u1288\u1289\u128A\u128E\u1290\u12AF\u12B0\u12B1\u12B2\u12B6\u12B8\u12BF\u12C0\u12C1\u12C2\u12C6\u12C8\u12CF\u12D0\u12D7\u12D8\u12EF\u12F0\u130F\u1310\u1311\u1312\u1316\u1318\u131F\u1320\u1347\u1348\u135B\u1369\u137D\u13A0"
219                 +"\u13F5\u1401\u166D\u166F\u1677\u1681\u169B\u16A0\u16EB\u16EE\u16F1\u1700\u170D\u170E\u1715\u1720\u1735\u1740\u1754\u1760\u176D\u176E\u1771\u1772\u1774\u1780\u17D4\u17D7\u17D8\u17DB\u17DD\u17E0\u17EA\u180B\u180E\u1810\u181A\u1820\u1878\u1880\u18AA\u1E00\u1E9C\u1EA0\u1EFA\u1F00\u1F16\u1F18\u1F1E\u1F20\u1F46\u1F48\u1F4E\u1F50\u1F58\u1F59\u1F5A\u1F5B\u1F5C\u1F5D\u1F5E\u1F5F\u1F7E\u1F80\u1FB5\u1FB6\u1FC5\u1FC6\u1FD4\u1FD6\u1FDC\u1FDD\u1FF0\u1FF2\u1FF5\u1FF6\u1FFF\u2044\u2045\u2052\u2053\u2070\u2072\u2074\u207D\u207F\u208D\u20A0\u20B2\u20D0\u20EB\u2100\u213B\u213D\u214C\u2153\u2184\u2190\u2329\u232B\u23B4\u23B7\u23CF\u2400\u2427\u2440\u244B\u2460\u24FF\u2500\u2614\u2616\u2618\u2619\u267E\u2680\u268A\u2701\u2705\u2706\u270A\u270C\u2728\u2729\u274C\u274D\u274E\u274F\u2753\u2756\u2757\u2758\u275F\u2761\u2768\u2776\u2795\u2798\u27B0\u27B1\u27BF\u27D0\u27E6\u27F0\u2983\u2999\u29D8\u29DC\u29FC\u29FE\u2B00\u2E80\u2E9A\u2E9B\u2EF4\u2F00\u2FD6\u2FF0\u2FFC\u3004\u3008\u3012\u3014\u3020\u3030\u3031\u303D\u303E\u3040"
220                 +"\u3041\u3097\u3099\u30A0\u30A1\u30FB\u30FC\u3100\u3105\u312D\u3131\u318F\u3190\u31B8\u31F0\u321D\u3220\u3244\u3251\u327C\u327F\u32CC\u32D0\u32FF\u3300\u3377\u337B\u33DE\u33E0\u33FF\u3400\u4DB6\u4E00\u9FA6\uA000\uA48D\uA490\uA4C7\uAC00\uD7A4\uF900\uFA2E\uFA30\uFA6B\uFB00\uFB07\uFB13\uFB18\uFB1D\uFB37\uFB38\uFB3D\uFB3E\uFB3F\uFB40\uFB42\uFB43\uFB45\uFB46\uFBB2\uFBD3\uFD3E\uFD50\uFD90\uFD92\uFDC8\uFDF0\uFDFD\uFE00\uFE10\uFE20\uFE24\uFE62\uFE63\uFE64\uFE67\uFE69\uFE6A\uFE70\uFE75\uFE76\uFEFD\uFF04\uFF05\uFF0B\uFF0C\uFF10\uFF1A\uFF1C\uFF1F\uFF21\uFF3B\uFF3E\uFF3F\uFF40\uFF5B\uFF5C\uFF5D\uFF5E\uFF5F\uFF66\uFFBF\uFFC2\uFFC8\uFFCA\uFFD0\uFFD2\uFFD8\uFFDA\uFFDD\uFFE0\uFFE7\uFFE8\uFFEF\uFFFC\uFFFE"},
221         };
222 
223 
224         /**************************************************************************
225             Let U be the set of Unicode character values and let L be the lowercase
226             function, mapping from U to U. To perform case insensitive matching of
227             character sets, we need to be able to map an interval I in U, say
228 
229                 I = [chMin, chMax] = { ch : chMin <= ch <= chMax }
230 
231             to a set A such that A contains L(I) and A is contained in the union of
232             I and L(I).
233 
234             The table below partitions U into intervals on which L is non-decreasing.
235             Thus, for any interval J = [a, b] contained in one of these intervals,
236             L(J) is contained in [L(a), L(b)].
237 
238             It is also true that for any such J, [L(a), L(b)] is contained in the
239             union of J and L(J). This does not follow from L being non-decreasing on
240             these intervals. It follows from the nature of L on each interval.
241             On each interval, L has one of the following forms:
242 
243                 (1) L(ch) = constant            (LowercaseSet)
244                 (2) L(ch) = ch + offset         (LowercaseAdd)
245                 (3) L(ch) = ch | 1              (LowercaseBor)
246                 (4) L(ch) = ch + (ch & 1)       (LowercaseBad)
247 
248             It is easy to verify that for any of these forms [L(a), L(b)] is
249             contained in the union of [a, b] and L([a, b]).
250         ***************************************************************************/
251 
            // Operation codes used as the third argument of the _lcTable entries. The numbers
            // (1)-(4) refer to the forms of L listed in the block comment above.
252         private const int LowercaseSet = 0;    // Form (1): L(ch) = arg, one constant for the whole interval.
253         private const int LowercaseAdd = 1;    // Form (2): L(ch) = ch + arg (arg may be negative).
254         private const int LowercaseBor = 2;    // Form (3): L(ch) = ch | 1.
255         private const int LowercaseBad = 3;    // Form (4): L(ch) = ch + (ch & 1).
256 
            // Lowercase mapping table described by the block comment above.
            // Each entry is constructed from four arguments; as written here they are
            //   (first char of interval, last char of interval, Lowercase* operation, operand).
            // The operand is 0 for every LowercaseBor / LowercaseBad entry, a target char value
            // for LowercaseSet entries, and a signed offset for LowercaseAdd entries.
            // Entries are listed in ascending order of their first char.
            // NOTE(review): LowerCaseMapping is declared outside this view, and the lookup code
            // presumably relies on this ordering - confirm before inserting or reordering entries.
257         private static readonly LowerCaseMapping[] _lcTable = new LowerCaseMapping[]
258         {
259             new LowerCaseMapping('\u0041', '\u005A', LowercaseAdd, 32),
260             new LowerCaseMapping('\u00C0', '\u00DE', LowercaseAdd, 32),
261             new LowerCaseMapping('\u0100', '\u012E', LowercaseBor, 0),
262             new LowerCaseMapping('\u0130', '\u0130', LowercaseSet, 0x0069),
263             new LowerCaseMapping('\u0132', '\u0136', LowercaseBor, 0),
264             new LowerCaseMapping('\u0139', '\u0147', LowercaseBad, 0),
265             new LowerCaseMapping('\u014A', '\u0176', LowercaseBor, 0),
266             new LowerCaseMapping('\u0178', '\u0178', LowercaseSet, 0x00FF),
267             new LowerCaseMapping('\u0179', '\u017D', LowercaseBad, 0),
268             new LowerCaseMapping('\u0181', '\u0181', LowercaseSet, 0x0253),
269             new LowerCaseMapping('\u0182', '\u0184', LowercaseBor, 0),
270             new LowerCaseMapping('\u0186', '\u0186', LowercaseSet, 0x0254),
271             new LowerCaseMapping('\u0187', '\u0187', LowercaseSet, 0x0188),
272             new LowerCaseMapping('\u0189', '\u018A', LowercaseAdd, 205),
273             new LowerCaseMapping('\u018B', '\u018B', LowercaseSet, 0x018C),
274             new LowerCaseMapping('\u018E', '\u018E', LowercaseSet, 0x01DD),
275             new LowerCaseMapping('\u018F', '\u018F', LowercaseSet, 0x0259),
276             new LowerCaseMapping('\u0190', '\u0190', LowercaseSet, 0x025B),
277             new LowerCaseMapping('\u0191', '\u0191', LowercaseSet, 0x0192),
278             new LowerCaseMapping('\u0193', '\u0193', LowercaseSet, 0x0260),
279             new LowerCaseMapping('\u0194', '\u0194', LowercaseSet, 0x0263),
280             new LowerCaseMapping('\u0196', '\u0196', LowercaseSet, 0x0269),
281             new LowerCaseMapping('\u0197', '\u0197', LowercaseSet, 0x0268),
282             new LowerCaseMapping('\u0198', '\u0198', LowercaseSet, 0x0199),
283             new LowerCaseMapping('\u019C', '\u019C', LowercaseSet, 0x026F),
284             new LowerCaseMapping('\u019D', '\u019D', LowercaseSet, 0x0272),
285             new LowerCaseMapping('\u019F', '\u019F', LowercaseSet, 0x0275),
286             new LowerCaseMapping('\u01A0', '\u01A4', LowercaseBor, 0),
287             new LowerCaseMapping('\u01A7', '\u01A7', LowercaseSet, 0x01A8),
288             new LowerCaseMapping('\u01A9', '\u01A9', LowercaseSet, 0x0283),
289             new LowerCaseMapping('\u01AC', '\u01AC', LowercaseSet, 0x01AD),
290             new LowerCaseMapping('\u01AE', '\u01AE', LowercaseSet, 0x0288),
291             new LowerCaseMapping('\u01AF', '\u01AF', LowercaseSet, 0x01B0),
292             new LowerCaseMapping('\u01B1', '\u01B2', LowercaseAdd, 217),
293             new LowerCaseMapping('\u01B3', '\u01B5', LowercaseBad, 0),
294             new LowerCaseMapping('\u01B7', '\u01B7', LowercaseSet, 0x0292),
295             new LowerCaseMapping('\u01B8', '\u01B8', LowercaseSet, 0x01B9),
296             new LowerCaseMapping('\u01BC', '\u01BC', LowercaseSet, 0x01BD),
297             new LowerCaseMapping('\u01C4', '\u01C5', LowercaseSet, 0x01C6),
298             new LowerCaseMapping('\u01C7', '\u01C8', LowercaseSet, 0x01C9),
299             new LowerCaseMapping('\u01CA', '\u01CB', LowercaseSet, 0x01CC),
300             new LowerCaseMapping('\u01CD', '\u01DB', LowercaseBad, 0),
301             new LowerCaseMapping('\u01DE', '\u01EE', LowercaseBor, 0),
302             new LowerCaseMapping('\u01F1', '\u01F2', LowercaseSet, 0x01F3),
303             new LowerCaseMapping('\u01F4', '\u01F4', LowercaseSet, 0x01F5),
304             new LowerCaseMapping('\u01FA', '\u0216', LowercaseBor, 0),
305             new LowerCaseMapping('\u0386', '\u0386', LowercaseSet, 0x03AC),
306             new LowerCaseMapping('\u0388', '\u038A', LowercaseAdd, 37),
307             new LowerCaseMapping('\u038C', '\u038C', LowercaseSet, 0x03CC),
308             new LowerCaseMapping('\u038E', '\u038F', LowercaseAdd, 63),
309             new LowerCaseMapping('\u0391', '\u03AB', LowercaseAdd, 32),
310             new LowerCaseMapping('\u03E2', '\u03EE', LowercaseBor, 0),
311             new LowerCaseMapping('\u0401', '\u040F', LowercaseAdd, 80),
312             new LowerCaseMapping('\u0410', '\u042F', LowercaseAdd, 32),
313             new LowerCaseMapping('\u0460', '\u0480', LowercaseBor, 0),
314             new LowerCaseMapping('\u0490', '\u04BE', LowercaseBor, 0),
315             new LowerCaseMapping('\u04C1', '\u04C3', LowercaseBad, 0),
316             new LowerCaseMapping('\u04C7', '\u04C7', LowercaseSet, 0x04C8),
317             new LowerCaseMapping('\u04CB', '\u04CB', LowercaseSet, 0x04CC),
318             new LowerCaseMapping('\u04D0', '\u04EA', LowercaseBor, 0),
319             new LowerCaseMapping('\u04EE', '\u04F4', LowercaseBor, 0),
320             new LowerCaseMapping('\u04F8', '\u04F8', LowercaseSet, 0x04F9),
321             new LowerCaseMapping('\u0531', '\u0556', LowercaseAdd, 48),
322             new LowerCaseMapping('\u10A0', '\u10C5', LowercaseAdd, 48),
323             new LowerCaseMapping('\u1E00', '\u1EF8', LowercaseBor, 0),
324             new LowerCaseMapping('\u1F08', '\u1F0F', LowercaseAdd, -8),
325             new LowerCaseMapping('\u1F18', '\u1F1F', LowercaseAdd, -8),
326             new LowerCaseMapping('\u1F28', '\u1F2F', LowercaseAdd, -8),
327             new LowerCaseMapping('\u1F38', '\u1F3F', LowercaseAdd, -8),
328             new LowerCaseMapping('\u1F48', '\u1F4D', LowercaseAdd, -8),
329             new LowerCaseMapping('\u1F59', '\u1F59', LowercaseSet, 0x1F51),
330             new LowerCaseMapping('\u1F5B', '\u1F5B', LowercaseSet, 0x1F53),
331             new LowerCaseMapping('\u1F5D', '\u1F5D', LowercaseSet, 0x1F55),
332             new LowerCaseMapping('\u1F5F', '\u1F5F', LowercaseSet, 0x1F57),
333             new LowerCaseMapping('\u1F68', '\u1F6F', LowercaseAdd, -8),
334             new LowerCaseMapping('\u1F88', '\u1F8F', LowercaseAdd, -8),
335             new LowerCaseMapping('\u1F98', '\u1F9F', LowercaseAdd, -8),
336             new LowerCaseMapping('\u1FA8', '\u1FAF', LowercaseAdd, -8),
337             new LowerCaseMapping('\u1FB8', '\u1FB9', LowercaseAdd, -8),
338             new LowerCaseMapping('\u1FBA', '\u1FBB', LowercaseAdd, -74),
339             new LowerCaseMapping('\u1FBC', '\u1FBC', LowercaseSet, 0x1FB3),
340             new LowerCaseMapping('\u1FC8', '\u1FCB', LowercaseAdd, -86),
341             new LowerCaseMapping('\u1FCC', '\u1FCC', LowercaseSet, 0x1FC3),
342             new LowerCaseMapping('\u1FD8', '\u1FD9', LowercaseAdd, -8),
343             new LowerCaseMapping('\u1FDA', '\u1FDB', LowercaseAdd, -100),
344             new LowerCaseMapping('\u1FE8', '\u1FE9', LowercaseAdd, -8),
345             new LowerCaseMapping('\u1FEA', '\u1FEB', LowercaseAdd, -112),
346             new LowerCaseMapping('\u1FEC', '\u1FEC', LowercaseSet, 0x1FE5),
347             new LowerCaseMapping('\u1FF8', '\u1FF9', LowercaseAdd, -128),
348             new LowerCaseMapping('\u1FFA', '\u1FFB', LowercaseAdd, -126),
349             new LowerCaseMapping('\u1FFC', '\u1FFC', LowercaseSet, 0x1FF3),
350             new LowerCaseMapping('\u2160', '\u216F', LowercaseAdd, 16),
351             new LowerCaseMapping('\u24B6', '\u24D0', LowercaseAdd, 26),
352             new LowerCaseMapping('\uFF21', '\uFF3A', LowercaseAdd, 32),
353         };
354 
        static RegexCharClass() {
            // Type initializer: builds the category-name -> category-string table, the
            // Word/NotWord category strings, and the prebuilt class strings for \s \S \w \W \d \D.
            //
            // addressing Dictionary versus Hashtable thread safety difference by using
            // a temp Dictionary. Note that this is just a theoretical concern since this
            // is a static ctor and getter methods aren't called until after this is
            // done; this is just to avoid the long-term possibility of thread safety
            // problems.
            Dictionary<String, String> tempCategories = new Dictionary<String, String>(32);

            // 'groups' is a scratch buffer that is reused for every general category below.
            // Slot 0 always holds GroupChar; slots 1..n hold the members of the group being
            // built and the slot after the last member must hold GroupChar again.  Because the
            // buffer is reused, slots that are not rewritten keep the value from an earlier group.
            char[] groups = new char[9];
            // 'word' accumulates the category group that backs \w.
            StringBuilder word = new StringBuilder(11);

            word.Append(GroupChar);
            groups[0] = GroupChar;

            // We need the UnicodeCategory enum values as a char so we can put them in a string
            // in the hashtable.  In order to get there, we first must cast to an int,
            // then cast to a char
            // Also need to distinguish between positive and negative values.  UnicodeCategory is zero
            // based, so we add one to each value and subtract it off later

            // Others
            groups[1] = (char) ((int) UnicodeCategory.Control + 1);
            tempCategories["Cc"] = groups[1].ToString();     // Control
            groups[2] = (char) ((int) UnicodeCategory.Format + 1);
            tempCategories["Cf"] = groups[2].ToString();     // Format
            groups[3] = (char) ((int) UnicodeCategory.OtherNotAssigned + 1);
            tempCategories["Cn"] = groups[3].ToString();     // Not assigned
            groups[4] = (char) ((int) UnicodeCategory.PrivateUse + 1);
            tempCategories["Co"] = groups[4].ToString();     // Private use
            groups[5] = (char) ((int) UnicodeCategory.Surrogate + 1);
            tempCategories["Cs"] = groups[5].ToString();     // Surrogate

            groups[6] = GroupChar;
            tempCategories["C"] = new String(groups, 0, 7);

            // Letters
            groups[1] = (char) ((int) UnicodeCategory.LowercaseLetter + 1);
            tempCategories["Ll"] = groups[1].ToString();     // Lowercase
            groups[2] = (char) ((int) UnicodeCategory.ModifierLetter + 1);
            tempCategories["Lm"] = groups[2].ToString();     // Modifier
            groups[3] = (char) ((int) UnicodeCategory.OtherLetter + 1);
            tempCategories["Lo"] = groups[3].ToString();     // Other
            groups[4] = (char) ((int) UnicodeCategory.TitlecaseLetter + 1);
            tempCategories["Lt"] = groups[4].ToString();     // Titlecase
            groups[5] = (char) ((int) UnicodeCategory.UppercaseLetter + 1);
            tempCategories["Lu"] = groups[5].ToString();     // Uppercase

            // groups[6] still holds GroupChar from the "C" group above, so it is not set again.
            tempCategories["L"] = new String(groups, 0, 7);
            // All five letter categories are part of Word.
            word.Append(new String(groups, 1, 5));

            // InternalRegexIgnoreCase = {LowercaseLetter} OR {TitlecaseLetter} OR {UppercaseLetter}
            // !!!This category should only ever be used in conjunction with RegexOptions.IgnoreCase code paths!!!
            tempCategories[InternalRegexIgnoreCase] = String.Format(CultureInfo.InvariantCulture, "{0}{1}{2}{3}{4}", GroupChar, groups[1], groups[4], groups[5], groups[6]);

            // Marks
            groups[1] = (char) ((int) UnicodeCategory.SpacingCombiningMark + 1);
            tempCategories["Mc"] = groups[1].ToString();     // Spacing combining
            groups[2] = (char) ((int) UnicodeCategory.EnclosingMark + 1);
            tempCategories["Me"] = groups[2].ToString();     // Enclosing
            groups[3] = (char) ((int) UnicodeCategory.NonSpacingMark + 1);
            tempCategories["Mn"] = groups[3].ToString();     // Non-spacing

            groups[4] = GroupChar;
            tempCategories["M"] = new String(groups, 0, 5);
            // Of the marks, only Mn (groups[3]) is added to Word; Mc (groups[1]) is not.
            word.Append(groups[3]);

            // Numbers
            groups[1] = (char) ((int) UnicodeCategory.DecimalDigitNumber + 1);
            tempCategories["Nd"] = groups[1].ToString();     // Decimal digit
            groups[2] = (char) ((int) UnicodeCategory.LetterNumber + 1);
            tempCategories["Nl"] = groups[2].ToString();     // Letter
            groups[3] = (char) ((int) UnicodeCategory.OtherNumber + 1);
            tempCategories["No"] = groups[3].ToString();     // Other

            // groups[4] still holds GroupChar from the "M" group above, so it is not set again.
            tempCategories["N"] = new String(groups, 0, 5);
            // Of the numbers, only Nd (groups[1]) is added to Word; Nl and No are not.
            word.Append(groups[1]);

            // Punctuation
            groups[1] = (char) ((int) UnicodeCategory.ConnectorPunctuation + 1);
            tempCategories["Pc"] = groups[1].ToString();     // Connector
            groups[2] = (char) ((int) UnicodeCategory.DashPunctuation + 1);
            tempCategories["Pd"] = groups[2].ToString();     // Dash
            groups[3] = (char) ((int) UnicodeCategory.ClosePunctuation + 1);
            tempCategories["Pe"] = groups[3].ToString();     // Close
            groups[4] = (char) ((int) UnicodeCategory.OtherPunctuation + 1);
            tempCategories["Po"] = groups[4].ToString();     // Other
            groups[5] = (char) ((int) UnicodeCategory.OpenPunctuation + 1);
            tempCategories["Ps"] = groups[5].ToString();     // Open
            groups[6] = (char) ((int) UnicodeCategory.FinalQuotePunctuation + 1);
            tempCategories["Pf"] = groups[6].ToString();     // Final quote
            groups[7] = (char) ((int) UnicodeCategory.InitialQuotePunctuation + 1);
            tempCategories["Pi"] = groups[7].ToString();     // Initial quote

            groups[8] = GroupChar;
            tempCategories["P"] = new String(groups, 0, 9);
            // Of the punctuation categories, only Pc (groups[1]) is added to Word.
            word.Append(groups[1]);

            // Symbols
            groups[1] = (char) ((int) UnicodeCategory.CurrencySymbol + 1);
            tempCategories["Sc"] = groups[1].ToString();     // Currency
            groups[2] = (char) ((int) UnicodeCategory.ModifierSymbol + 1);
            tempCategories["Sk"] = groups[2].ToString();     // Modifier
            groups[3] = (char) ((int) UnicodeCategory.MathSymbol + 1);
            tempCategories["Sm"] = groups[3].ToString();     // Math
            groups[4] = (char) ((int) UnicodeCategory.OtherSymbol + 1);
            tempCategories["So"] = groups[4].ToString();     // Other

            groups[5] = GroupChar;
            tempCategories["S"] = new String(groups, 0, 6);

            // Separators
            groups[1] = (char) ((int) UnicodeCategory.LineSeparator + 1);
            tempCategories["Zl"] = groups[1].ToString();     // Line
            groups[2] = (char) ((int) UnicodeCategory.ParagraphSeparator + 1);
            tempCategories["Zp"] = groups[2].ToString();     // Paragraph
            groups[3] = (char) ((int) UnicodeCategory.SpaceSeparator + 1);
            tempCategories["Zs"] = groups[3].ToString();     // Space

            groups[4] = GroupChar;
            tempCategories["Z"] = new String(groups, 0, 5);


            // Close the Word group: GroupChar + L* + Mn + Nd + Pc + GroupChar.
            word.Append(GroupChar);
            Word = word.ToString();
            NotWord = NegateCategory(Word);


            // Prebuilt class strings.  Layout: flags char, set length, category length,
            // then the category portion (none of these classes has a range set).
            SpaceClass      = "\x00\x00\x01" + Space;
            NotSpaceClass   = "\x01\x00\x01" + Space;
            WordClass       = "\x00\x00" + (char) Word.Length + Word;
            NotWordClass    = "\x01\x00" + (char) Word.Length + Word;;
            DigitClass      = "\x00\x00\x01" + (char) ((int) UnicodeCategory.DecimalDigitNumber + 1);
            // Unlike NotSpaceClass/NotWordClass, the flags char stays 0 here; the negation is
            // carried by the negative category value instead.  'unchecked' is needed because
            // the operand is a constant expression that is out of range for char.
            NotDigitClass   = "\x00\x00\x01" + unchecked ((char) (- ((int) UnicodeCategory.DecimalDigitNumber + 1)) );

#if DBG
            // make sure the _propTable is correctly ordered
            int len = _propTable.GetLength(0);
            for (int i=0; i<len-1; i++)
                Debug.Assert(String.Compare(_propTable[i,0], _propTable[i+1,0], StringComparison.Ordinal) < 0, "RegexCharClass _propTable is out of order at (" + _propTable[i,0] +", " + _propTable[i+1,0] + ")");
#endif

            // Publish the fully populated table only once it is complete.
            _definedCategories = tempCategories;
        }
502 
503         /*
504          * RegexCharClass()
505          *
506          * Creates an empty character class.
507          */
RegexCharClass()508         internal RegexCharClass() {
509             _rangelist = new List<SingleRange>(6);
510             _canonical = true;
511             _categories = new StringBuilder();
512 
513         }
514 
RegexCharClass(bool negate, List<SingleRange> ranges, StringBuilder categories, RegexCharClass subtraction)515         private  RegexCharClass(bool negate, List<SingleRange> ranges, StringBuilder categories, RegexCharClass subtraction) {
516             _rangelist = ranges;
517             _categories = categories;
518             _canonical = true;
519             _negate=negate;
520             _subtractor = subtraction;
521         }
522 
523         internal bool CanMerge {
524             get {
525                 return !_negate && _subtractor == null;
526             }
527         }
528 
529         internal bool Negate {
530             set { _negate = value; }
531         }
532 
AddChar(char c)533         internal void AddChar(char c) {
534             AddRange(c,c);
535         }
536 
537         /*
538          * AddCharClass()
539          *
540          * Adds a regex char class
541          */
AddCharClass(RegexCharClass cc)542         internal void AddCharClass(RegexCharClass cc) {
543             int i;
544 
545             Debug.Assert(cc.CanMerge && this.CanMerge, "Both character classes added together must be able to merge" );
546 
547             if (!cc._canonical) {
548                 // if the new char class to add isn't canonical, we're not either.
549                 _canonical = false;
550             }
551             else if (_canonical && RangeCount() > 0 && cc.RangeCount() > 0 && cc.GetRangeAt(0)._first <= GetRangeAt(RangeCount() - 1)._last)
552                 _canonical = false;
553 
554             for (i = 0; i < cc.RangeCount(); i += 1) {
555                 _rangelist.Add(cc.GetRangeAt(i));
556             }
557 
558             _categories.Append(cc._categories.ToString());
559         }
560 
561         /*
562          * AddSet()
563          *
         * Adds a set (specified by its string representation) to the class.
565          */
AddSet(String set)566         private void AddSet(String set) {
567             int i;
568 
569             if (_canonical && RangeCount() > 0 && set.Length > 0 &&
570                 set[0] <= GetRangeAt(RangeCount() - 1)._last)
571                 _canonical = false;
572 
573             for (i = 0; i < set.Length - 1; i += 2) {
574                 _rangelist.Add(new SingleRange(set[i], (char)(set[i + 1] - 1)));
575             }
576 
577             if (i < set.Length) {
578                 _rangelist.Add(new SingleRange(set[i], Lastchar));
579             }
580         }
581 
AddSubtraction(RegexCharClass sub)582         internal void AddSubtraction(RegexCharClass sub) {
583             Debug.Assert(_subtractor == null, "Can't add two subtractions to a char class. ");
584             _subtractor = sub;
585         }
586 
587         /*
588          * AddRange()
589          *
590          * Adds a single range of characters to the class.
591          */
AddRange(char first, char last)592         internal void AddRange(char first, char last) {
593             _rangelist.Add(new SingleRange(first, last));
594             if (_canonical && _rangelist.Count > 0 &&
595                 first <= _rangelist[_rangelist.Count - 1]._last) {
596                 _canonical = false;
597             }
598         }
599 
AddCategoryFromName(string categoryName, bool invert, bool caseInsensitive, string pattern)600         internal void AddCategoryFromName(string categoryName, bool invert, bool caseInsensitive, string pattern) {
601 
602             String cat;
603             _definedCategories.TryGetValue(categoryName, out cat);
604             if (cat != null && !categoryName.Equals(InternalRegexIgnoreCase)) {
605                 string catstr = cat;
606 
607                 if (caseInsensitive) {
608                     if (categoryName.Equals("Ll") || categoryName.Equals("Lu") || categoryName.Equals("Lt"))
609                         // when RegexOptions.IgnoreCase is specified then {Ll}, {Lu}, and {Lt} cases should all match
610                         catstr = (string) _definedCategories[InternalRegexIgnoreCase];
611                 }
612 
613                 if (invert)
614                     catstr = NegateCategory(catstr); // negate the category
615 
616                 _categories.Append((string) catstr);
617             }
618             else
619                 AddSet(SetFromProperty(categoryName, invert, pattern));
620         }
621 
AddCategory(string category)622         private void AddCategory(string category) {
623             _categories.Append(category);
624         }
625 
626         /*
627          * AddLowerCase()
628          *
629          * Adds to the class any lowercase versions of characters already
630          * in the class. Used for case-insensitivity.
631          */
AddLowercase(CultureInfo culture)632         internal void AddLowercase(CultureInfo culture) {
633             int i;
634             int origSize;
635             SingleRange range;
636 
637             _canonical = false;
638 
639             for (i = 0, origSize = _rangelist.Count; i < origSize; i++) {
640                 range = _rangelist[i];
641                 if (range._first == range._last)
642                     range._first = range._last = Char.ToLower(range._first, culture);
643                 else
644                     AddLowercaseRange(range._first, range._last, culture);
645             }
646         }
647 
648         /*
649          * AddLowercaseRange()
650          *
651          * For a single range that's in the set, adds any additional ranges
652          * necessary to ensure that lowercase equivalents are also included.
653          */
AddLowercaseRange(char chMin, char chMax, CultureInfo culture)654         private void AddLowercaseRange(char chMin, char chMax, CultureInfo culture) {
655             int i, iMax, iMid;
656             char chMinT, chMaxT;
657             LowerCaseMapping lc;
658 
659             for (i = 0, iMax = _lcTable.Length; i < iMax; ) {
660                 iMid = (i + iMax) / 2;
661                 if (_lcTable[iMid]._chMax < chMin)
662                     i = iMid + 1;
663                 else
664                     iMax = iMid;
665             }
666 
667             if (i >= _lcTable.Length)
668                 return;
669 
670             for ( ; i < _lcTable.Length && (lc = _lcTable[i])._chMin <= chMax; i++) {
671                 if ((chMinT = lc._chMin) < chMin)
672                     chMinT = chMin;
673 
674                 if ((chMaxT = lc._chMax) > chMax)
675                     chMaxT = chMax;
676 
677                 switch (lc._lcOp) {
678                     case LowercaseSet:
679                         chMinT = (char)lc._data;
680                         chMaxT = (char)lc._data;
681                         break;
682                     case LowercaseAdd:
683                         chMinT += (char)lc._data;
684                         chMaxT += (char)lc._data;
685                         break;
686                     case LowercaseBor:
687                         chMinT |= (char)1;
688                         chMaxT |= (char)1;
689                         break;
690                     case LowercaseBad:
691                         chMinT += (char)(chMinT & 1);
692                         chMaxT += (char)(chMaxT & 1);
693                         break;
694                 }
695 
696                 if (chMinT < chMin || chMaxT > chMax)
697                     AddRange(chMinT, chMaxT);
698             }
699         }
700 
AddWord(bool ecma, bool negate)701         internal void AddWord(bool ecma, bool negate) {
702             if (negate) {
703                 if (ecma)
704                     AddSet(RegexCharClass.NotECMAWordSet);
705                 else
706                     AddCategory(RegexCharClass.NotWord);
707             }
708             else {
709                 if (ecma)
710                     AddSet(RegexCharClass.ECMAWordSet);
711                 else
712                     AddCategory(RegexCharClass.Word);
713             }
714         }
715 
AddSpace(bool ecma, bool negate)716         internal void AddSpace(bool ecma, bool negate) {
717             if (negate) {
718                 if (ecma)
719                     AddSet(RegexCharClass.NotECMASpaceSet);
720                 else
721                     AddCategory(RegexCharClass.NotSpace);
722             }
723             else {
724                 if (ecma)
725                     AddSet(RegexCharClass.ECMASpaceSet);
726                 else
727                     AddCategory(RegexCharClass.Space);
728             }
729         }
730 
AddDigit(bool ecma, bool negate, string pattern)731         internal void AddDigit(bool ecma, bool negate, string pattern) {
732             if (ecma) {
733                 if (negate)
734                     AddSet(RegexCharClass.NotECMADigitSet);
735                 else
736                     AddSet(RegexCharClass.ECMADigitSet);
737             }
738             else
739                 AddCategoryFromName("Nd", negate, false, pattern);
740         }
741 
ConvertOldStringsToClass(string set, string category)742         internal static string ConvertOldStringsToClass(string set, string category) {
743             StringBuilder sb = new StringBuilder(set.Length + category.Length + 3);
744 
745             if (set.Length >= 2 && set[0] =='\0' && set[1] == '\0') {
746                 sb.Append((char) 0x1);
747                 sb.Append((char) (set.Length - 2));
748                 sb.Append((char) category.Length);
749                 sb.Append(set.Substring(2));
750             }
751             else {
752                 sb.Append((char) 0x0);
753                 sb.Append((char) set.Length);
754                 sb.Append((char) category.Length);
755                 sb.Append(set);
756             }
757             sb.Append(category);
758 
759             return sb.ToString();
760         }
761 
762         /*
763          * SingletonChar()
764          *
765          * Returns the char
766          */
SingletonChar(String set)767         internal static char SingletonChar(String set) {
768             Debug.Assert(IsSingleton(set) || IsSingletonInverse(set), "Tried to get the singleton char out of a non singleton character class");
769             return set[SETSTART];
770         }
771 
IsMergeable(string charClass)772         internal static bool IsMergeable(string charClass) {
773             return (!IsNegated(charClass) && !IsSubtraction(charClass));
774         }
775 
IsEmpty(String charClass)776         internal static bool IsEmpty(String charClass) {
777             if (charClass[CATEGORYLENGTH] == 0 && charClass[FLAGS] == 0 && charClass[SETLENGTH] == 0 && !IsSubtraction(charClass))
778                 return true;
779             else
780                 return false;
781         }
782 
783         /*
784          * IsSingleton()
785          *
786          * True if the set contains a single character only
787          */
IsSingleton(String set)788         internal static bool IsSingleton(String set) {
789             if (set[FLAGS] == 0 && set[CATEGORYLENGTH] == 0 && set[SETLENGTH] == 2 && !IsSubtraction(set) &&
790                 (set[SETSTART] == Lastchar || set[SETSTART]+1 == set[SETSTART+1]))
791                 return true;
792             else
793                 return false;
794         }
795 
IsSingletonInverse(String set)796         internal static bool IsSingletonInverse(String set) {
797             if (set[FLAGS] == 1 && set[CATEGORYLENGTH] == 0 && set[SETLENGTH] == 2 && !IsSubtraction(set) &&
798                 (set[SETSTART] == Lastchar || set[SETSTART]+1 == set[SETSTART+1]))
799                 return true;
800             else
801                 return false;
802         }
803 
IsSubtraction(string charClass)804         private static bool IsSubtraction(string charClass) {
805             return (charClass.Length > SETSTART + charClass[SETLENGTH] + charClass[CATEGORYLENGTH]);
806         }
807 
IsNegated(string set)808         internal static bool IsNegated(string set) {
809             return (set != null && set[FLAGS] == 1);
810         }
811 
IsECMAWordChar(char ch)812         internal static bool IsECMAWordChar(char ch) {
813             // According to ECMA-262, \s, \S, ., ^, and $ use Unicode-based interpretations of
814             // whitespace and newline, while \d, \D\, \w, \W, \b, and \B use ASCII-only
815             // interpretations of digit, word character, and word boundary.  In other words,
816             // no special treatment of Unicode ZERO WIDTH NON-JOINER (ZWNJ U+200C) and
817             // ZERO WIDTH JOINER (ZWJ U+200D) is required for ECMA word boundaries.
818             return CharInClass(ch, ECMAWordClass);
819         }
820 
IsWordChar(char ch)821         internal static bool IsWordChar(char ch) {
822             // According to UTS#18 Unicode Regular Expressions (http://www.unicode.org/reports/tr18/)
823             // RL 1.4 Simple Word Boundaries  The class of <word_character> includes all Alphabetic
824             // values from the Unicode character database, from UnicodeData.txt [UData], plus the U+200C
825             // ZERO WIDTH NON-JOINER and U+200D ZERO WIDTH JOINER.
826             return CharInClass(ch, WordClass) || ch == ZeroWidthJoiner || ch == ZeroWidthNonJoiner;
827         }
828 
CharInClass(char ch, String set)829         internal static bool CharInClass(char ch, String set) {
830             return CharInClassRecursive(ch, set, 0);
831         }
832 
833 
CharInClassRecursive(char ch, String set, int start)834         internal static bool CharInClassRecursive(char ch, String set, int start) {
835             int mySetLength = set[start+SETLENGTH];
836             int myCategoryLength = set[start+CATEGORYLENGTH];
837             int myEndPosition = start + SETSTART + mySetLength + myCategoryLength;
838 
839             bool subtracted = false;
840 
841             if (set.Length > myEndPosition) {
842                 subtracted = CharInClassRecursive(ch, set, myEndPosition);
843             }
844 
845             bool b = CharInClassInternal(ch, set, start, mySetLength, myCategoryLength);
846 
847             // Note that we apply the negation *before* performing the subtraction.  This is because
848             // the negation only applies to the first char class, not the entire subtraction.
849             if (set[start+FLAGS] == 1)
850                 b = !b;
851 
852             return b && !subtracted;
853         }
854 
855         /*
856          * CharInClass()
857          *
858          * Determines a character's membership in a character class (via the
859          * string representation of the class).
860          */
CharInClassInternal(char ch, string set, int start, int mySetLength, int myCategoryLength)861         private static bool CharInClassInternal(char ch, string set, int start, int mySetLength, int myCategoryLength) {
862             int min;
863             int max;
864             int mid;
865             min = start + SETSTART;
866             max = min + mySetLength;
867 
868             while (min != max) {
869                 mid = (min + max) / 2;
870                 if (ch < set[mid])
871                     max = mid;
872                 else
873                     min = mid + 1;
874             }
875 
876             // The starting position of the set within the character class determines
877             // whether what an odd or even ending position means.  If the start is odd,
878             // an *even* ending position means the character was in the set.  With recursive
879             // subtractions in the mix, the starting position = start+SETSTART.  Since we know that
880             // SETSTART is odd, we can simplify it out of the equation.  But if it changes we need to
881             // reverse this check.
882             Debug.Assert((SETSTART & 0x1) == 1, "If SETSTART is not odd, the calculation below this will be reversed");
883             if ((min & 0x1) == (start & 0x1))
884                 return true;
885             else {
886                 if (myCategoryLength == 0)
887                     return false;
888 
889                 return CharInCategory(ch, set, start, mySetLength, myCategoryLength);
890             }
891         }
892 
CharInCategory(char ch, string set, int start, int mySetLength, int myCategoryLength)893         private static bool CharInCategory(char ch, string set, int start, int mySetLength, int myCategoryLength) {
894             UnicodeCategory chcategory = char.GetUnicodeCategory(ch);
895 
896             int i=start + SETSTART + mySetLength;
897             int end = i + myCategoryLength;
898             while (i<end) {
899                 int curcat = (short) set[i];
900 
901                 if (curcat == 0) {
902                     // zero is our marker for a group of categories - treated as a unit
903                     if (CharInCategoryGroup(ch, chcategory, set, ref i))
904                         return true;
905                 }
906                 else if (curcat > 0) {
907                     // greater than zero is a positive case
908 
909                     if (curcat  == SpaceConst) {
910                         if (Char.IsWhiteSpace(ch))
911                             return true;
912                         else  {
913                             i++;
914                             continue;
915                         }
916                     }
917                     --curcat;
918 
919                     if (chcategory == (UnicodeCategory) curcat)
920                         return true;
921                 }
922                 else {
923                     // less than zero is a negative case
924                     if (curcat == NotSpaceConst) {
925                         if (!Char.IsWhiteSpace(ch))
926                             return true;
927                         else  {
928                             i++;
929                             continue;
930                         }
931                     }
932 
933                     //curcat = -curcat;
934                     //--curcat;
935                     curcat = -1 - curcat;
936 
937                     if (chcategory != (UnicodeCategory) curcat)
938                         return true;
939                 }
940                 i++;
941             }
942             return false;
943         }
944 
945         /*
946         *  CharInCategoryGroup
947         *  This is used for categories which are composed of other categories - L, N, Z, W...
948         *  These groups need special treatment when they are negated
949         */
CharInCategoryGroup(char ch, UnicodeCategory chcategory, string category, ref int i)950         private static bool CharInCategoryGroup(char ch, UnicodeCategory chcategory, string category, ref int i) {
951             i++;
952 
953             int curcat = (short) category[i];
954             if (curcat > 0) {
955                 // positive case - the character must be in ANY of the categories in the group
956                 bool answer = false;
957 
958                 while (curcat != 0) {
959                     if (!answer) {
960                         --curcat;
961                         if (chcategory == (UnicodeCategory) curcat)
962                             answer = true;
963                     }
964                     i++;
965                     curcat = (short) category[i];
966                 }
967                 return answer;
968             }
969             else {
970 
971                 // negative case - the character must be in NONE of the categories in the group
972                 bool answer = true;
973 
974                 while (curcat != 0) {
975                     if (answer) {
976                         //curcat = -curcat;
977                         //--curcat;
978                         curcat = -1 - curcat;
979                         if (chcategory == (UnicodeCategory) curcat)
980                             answer = false;
981                     }
982                     i++;
983                     curcat = (short) category[i];
984                 }
985                 return answer;
986             }
987         }
988 
NegateCategory(string category)989         private static string NegateCategory(string category) {
990             if (category == null)
991                 return null;
992 
993             StringBuilder sb = new StringBuilder(category.Length);
994 
995             for (int i=0; i<category.Length; i++) {
996                 short ch = (short) category[i];
997                 sb.Append( (char) -ch);
998             }
999             return sb.ToString();
1000         }
1001 
Parse(string charClass)1002         internal static RegexCharClass Parse(string charClass) {
1003             return ParseRecursive(charClass, 0);
1004         }
1005 
ParseRecursive(string charClass, int start)1006         private static RegexCharClass ParseRecursive(string charClass, int start) {
1007             int mySetLength = charClass[start+SETLENGTH];
1008             int myCategoryLength = charClass[start+CATEGORYLENGTH];
1009             int myEndPosition = start + SETSTART + mySetLength + myCategoryLength;
1010 
1011             List<SingleRange> ranges = new List<SingleRange>(mySetLength);
1012             int i=start+SETSTART;
1013             int end = i + mySetLength;
1014             while (i<end) {
1015                 char first = charClass[i];
1016                 i++;
1017 
1018                 char last;
1019                 if (i < end)
1020                     last = (char) (charClass[i] - 1);
1021                 else
1022                     last = Lastchar;
1023                 i++;
1024                 ranges.Add(new SingleRange(first, last));
1025             }
1026 
1027             RegexCharClass sub = null;
1028             if (charClass.Length > myEndPosition)
1029                 sub = ParseRecursive(charClass, myEndPosition);
1030 
1031             return new RegexCharClass(charClass[start+FLAGS] == 1, ranges, new StringBuilder(charClass.Substring(end, myCategoryLength)), sub);
1032         }
1033 
        /*
         * RangeCount()
         *
         * Returns the number of SingleRange entries currently held in _rangelist,
         * i.e. the number of single ranges that have been accumulated so far.
         */
        private int RangeCount() {
            return _rangelist.Count;
        }
1042 
1043         /*
1044          * ToString()
1045          *
1046          * Constructs the string representation of the class.
1047          */
ToStringClass()1048         internal String ToStringClass() {
1049             if (!_canonical)
1050                 Canonicalize();
1051 
1052             // make a guess about the length of the ranges.  We'll update this at the end.
1053             // This is important because if the last range ends in LastChar, we won't append
1054             // LastChar to the list.
1055             int rangeLen = _rangelist.Count * 2 ;
1056             StringBuilder sb = new StringBuilder(rangeLen + _categories.Length + 3);
1057 
1058             int flags;
1059             if (_negate)
1060                 flags = 1;
1061             else
1062                 flags = 0;
1063 
1064             sb.Append((char) flags);
1065             sb.Append((char) rangeLen);
1066             sb.Append((char) _categories.Length);
1067 
1068             for (int i = 0; i < _rangelist.Count; i++) {
1069                 SingleRange currentRange = _rangelist[i];
1070                 sb.Append(currentRange._first);
1071 
1072                 if (currentRange._last != Lastchar)
1073                     sb.Append((char)(currentRange._last + 1));
1074             }
1075 
1076             sb[SETLENGTH] = (char) (sb.Length - SETSTART);
1077 
1078             sb.Append(_categories);
1079 
1080             if (_subtractor != null)
1081                 sb.Append(_subtractor.ToStringClass());
1082 
1083             return sb.ToString();
1084         }
1085 
        /*
         * GetRangeAt(int i)
         *
         * Returns the range stored at position i of _rangelist.  The index is
         * passed straight to the list indexer; no extra validation is done here.
         */
        private SingleRange GetRangeAt(int i) {
            return _rangelist[i];
        }
1094 
1095         /*
1096          * Canonicalize()
1097          *
1098          * Logic to reduce a character class to a unique, sorted form.
1099          */
Canonicalize()1100         private void Canonicalize() {
1101             SingleRange CurrentRange;
1102             int i;
1103             int j;
1104             char last;
1105             bool Done;
1106 
1107             _canonical = true;
1108             _rangelist.Sort(0, _rangelist.Count, new SingleRangeComparer());
1109 
1110             //
1111             // Find and eliminate overlapping or abutting ranges
1112             //
1113 
1114             if (_rangelist.Count > 1) {
1115                 Done = false;
1116 
1117                 for (i = 1, j = 0; ; i++) {
1118                     for (last = _rangelist[j]._last; ; i++) {
1119                         if (i == _rangelist.Count || last == Lastchar) {
1120                             Done = true;
1121                             break;
1122                         }
1123 
1124                         if ((CurrentRange = _rangelist[i])._first > last + 1)
1125                             break;
1126 
1127                         if (last < CurrentRange._last)
1128                             last = CurrentRange._last;
1129                     }
1130 
1131                     _rangelist[j]._last = last;
1132 
1133                     j++;
1134 
1135                     if (Done)
1136                         break;
1137 
1138                     if (j < i)
1139                         _rangelist[j] = _rangelist[i];
1140                 }
1141                 _rangelist.RemoveRange(j, _rangelist.Count - j);
1142             }
1143         }
1144 
SetFromProperty(String capname, bool invert, string pattern)1145         private static String SetFromProperty(String capname, bool invert, string pattern) {
1146             int min = 0;
1147             int max = _propTable.GetLength(0);
1148             while (min != max) {
1149                 int mid = (min + max) / 2;
1150                 int res = String.Compare(capname, _propTable[mid,0], StringComparison.Ordinal);
1151                 if (res < 0)
1152                     max = mid;
1153                 else if (res > 0)
1154                     min = mid + 1;
1155                 else {
1156                     String set = _propTable[mid,1];
1157                     Debug.Assert(!String.IsNullOrEmpty(set), "Found a null/empty element in RegexCharClass prop table");
1158                     if (invert)
1159                     {
1160                         if (set[0] == Nullchar)
1161                         {
1162                             return set.Substring(1);
1163                         }
1164                         return Nullchar + set;
1165                     }
1166                     else
1167                     {
1168                         return set;
1169                     }
1170                 }
1171             }
1172             throw new ArgumentException(SR.GetString(SR.MakeException, pattern, SR.GetString(SR.UnknownProperty, capname)));
1173         }
1174 
1175 #if DBG
1176 
1177         /*
1178          * SetDescription()
1179          *
1180          * Produces a human-readable description for a set string.
1181          */
SetDescription(String set)1182         internal static String SetDescription(String set) {
1183             int mySetLength = set[SETLENGTH];
1184             int myCategoryLength = set[CATEGORYLENGTH];
1185             int myEndPosition = SETSTART + mySetLength + myCategoryLength;
1186 
1187             StringBuilder desc = new StringBuilder("[");
1188 
1189             int index = SETSTART;
1190             char ch1;
1191             char ch2;
1192 
1193             if (IsNegated(set))
1194                 desc.Append('^');
1195 
1196             while (index < SETSTART + set[SETLENGTH]) {
1197                 ch1 = set[index];
1198                 if (index + 1 < set.Length)
1199                     ch2 = (char)(set[index + 1] - 1);
1200                 else
1201                     ch2 = Lastchar;
1202 
1203                 desc.Append(CharDescription(ch1));
1204 
1205                 if (ch2 != ch1) {
1206                     if (ch1 + 1 != ch2)
1207                         desc.Append('-');
1208                     desc.Append(CharDescription(ch2));
1209                 }
1210                 index += 2;
1211             }
1212 
1213             while (index < SETSTART + set[SETLENGTH] + set[CATEGORYLENGTH]) {
1214                 ch1 = set[index];
1215                 if (ch1 == 0) {
1216                     bool found = false;
1217 
1218                     int lastindex = set.IndexOf(GroupChar, index+1);
1219                     string group = set.Substring(index,lastindex-index + 1);
1220 
1221                     IDictionaryEnumerator en = _definedCategories.GetEnumerator();
1222                     while(en.MoveNext()) {
1223                         if (group.Equals(en.Value)) {
1224                             if ((short) set[index+1] > 0)
1225                                 desc.Append("\\p{" + en.Key + "}");
1226                             else
1227                                 desc.Append("\\P{" + en.Key + "}");
1228 
1229                             found = true;
1230                             break;
1231                         }
1232                     }
1233 
1234                     if (!found) {
1235                         if (group.Equals(Word))
1236                             desc.Append("\\w");
1237                         else if (group.Equals(NotWord))
1238                             desc.Append("\\W");
1239                         else
1240                             Debug.Assert(false, "Couldn't find a goup to match '" + group + "'");
1241                     }
1242 
1243                     index = lastindex;
1244                 }
1245                 else {
1246                     desc.Append(CategoryDescription(ch1));
1247                 }
1248 
1249                 index++;
1250             }
1251 
1252             if (set.Length > myEndPosition) {
1253                 desc.Append('-');
1254                 desc.Append(SetDescription(set.Substring(myEndPosition)));
1255             }
1256 
1257             desc.Append(']');
1258 
1259             return desc.ToString();
1260         }
1261 
        // Lower-case hex digits, indexed by nibble value; used by CharDescription.
        internal static readonly char [] Hex = new char [] {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f'};

        // Category names used by CategoryDescription, which looks them up with
        // (category code - 1), or (-code - 1) for negated codes.
        // NOTE(review): the InternalRegexIgnoreCase entry at index 5 shifts "Mn" and
        // every later name one slot relative to the UnicodeCategory enum order, so
        // descriptions for those categories may be off by one - confirm intended.
        internal static readonly string[] Categories = new string[] {"Lu", "Ll", "Lt", "Lm", "Lo", InternalRegexIgnoreCase,
                                                                     "Mn", "Mc", "Me",
                                                                     "Nd", "Nl", "No",
                                                                     "Zs", "Zl", "Zp",
                                                                     "Cc", "Cf", "Cs", "Co",
                                                                     "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po",
                                                                     "Sm", "Sc", "Sk", "So",
                                                                     "Cn" };
1271 
1272          /*
1273          * CharDescription()
1274          *
1275          * Produces a human-readable description for a single character.
1276          */
CharDescription(char ch)1277         internal static String CharDescription(char ch) {
1278             StringBuilder sb = new StringBuilder();
1279             int shift;
1280 
1281             if (ch == '\\')
1282                 return "\\\\";
1283 
1284             if (ch >= ' ' && ch <= '~') {
1285                 sb.Append(ch);
1286                 return sb.ToString();
1287             }
1288 
1289             if (ch < 256) {
1290                 sb.Append("\\x");
1291                 shift = 8;
1292             }
1293             else {
1294                 sb.Append("\\u");
1295                 shift = 16;
1296             }
1297 
1298             while (shift > 0) {
1299                 shift -= 4;
1300                 sb.Append(Hex[(ch >> shift) & 0xF]);
1301             }
1302 
1303             return sb.ToString();
1304         }
1305 
CategoryDescription(char ch)1306         private static String CategoryDescription(char ch) {
1307             if (ch == SpaceConst)
1308                 return "\\s";
1309             else if ((short) ch == NotSpaceConst)
1310                 return "\\S";
1311             else if ((short) ch < 0) {
1312                 return "\\P{" + Categories[(- ((short)ch) - 1)] + "}";
1313             }
1314             else {
1315                 return "\\p{" + Categories[(ch - 1)] + "}";
1316             }
1317         }
1318 
1319 #endif
1320 
1321         // Lower case mapping descriptor.
1322         private struct LowerCaseMapping {
LowerCaseMappingSystem.Text.RegularExpressions.RegexCharClass.LowerCaseMapping1323             internal LowerCaseMapping(char chMin, char chMax, int lcOp, int data) {
1324                 _chMin = chMin;
1325                 _chMax = chMax;
1326                 _lcOp  = lcOp;
1327                 _data  = data;
1328             }
1329 
1330             internal char _chMin;
1331             internal char _chMax;
1332             internal int _lcOp;
1333             internal int _data;
1334         }
1335 
1336         /*
1337          * SingleRangeComparer
1338          *
1339          * For sorting ranges; compare based on the first char in the range.
1340          */
1341         private sealed class SingleRangeComparer : IComparer<SingleRange> {
Compare(SingleRange x, SingleRange y)1342             public int Compare(SingleRange x, SingleRange y) {
1343                 return((x)._first < (y)._first ? -1
1344                        : ((x)._first > (y)._first ? 1 : 0));
1345             }
1346         }
1347 
1348         /*
1349          * SingleRange
1350          *
1351          * A first/last pair representing a single range of characters.
1352          */
1353         private sealed class SingleRange {
SingleRange(char first, char last)1354             internal SingleRange(char first, char last) {
1355                 _first = first;
1356                 _last = last;
1357             }
1358 
1359             internal char _first;
1360             internal char _last;
1361         }
1362     }
1363 
1364 }
1365