1 // Licensed to the .NET Foundation under one or more agreements.
2 // The .NET Foundation licenses this file to you under the MIT license.
3 // See the LICENSE file in the project root for more information.
4 
5 // This RegexCharClass class provides the "set of Unicode chars" functionality
6 // used by the regexp engine.
7 
8 // The main function of RegexCharClass is as a builder to turn ranges, characters and
9 // Unicode categories into a single string.  This string is used as a black box
10 // representation of a character class by the rest of Regex.  The format is as follows.
11 //
12 // Char index   Use
13 //      0       Flags - currently this only holds the "negate" flag
14 //      1       length of the string representing the "set" portion, e.g. [a-z0-9] only has a "set"
15 //      2       length of the string representing the "category" portion, e.g. [\p{Lu}] only has a "category"
16 //      3...m   The set.  These are a series of ranges which define the characters included in the set.
17 //              To determine if a given character is in the set, we binary search over this set of ranges
18 //              and see where the character should go.  Based on whether the ending index is odd or even,
19 //              we know if the character is in the set.
20 //      m+1...n The categories.  This is a list of UnicodeCategory enum values which describe categories
21 //              included in this class.
22 
23 using System.Collections.Generic;
24 using System.Diagnostics;
25 using System.Globalization;
26 using System.IO;
27 
28 namespace System.Text.RegularExpressions
29 {
30     internal sealed class RegexCharClass
31     {
32         // Instance data: the builder state for one character class (the serialized string format is described in the file header).
33         private List<SingleRange> _rangelist; // list of SingleRange entries; presumably the source of the "set" portion - confirm
34         private StringBuilder _categories; // presumably accumulates the "category" portion - confirm
35         private bool _canonical; // NOTE(review): meaning not visible in this chunk; presumably "ranges already sorted/merged" - confirm
36         private bool _negate; // presumably backs the "negate" flag stored at index FLAGS - confirm
37         private RegexCharClass _subtractor; // NOTE(review): presumably a class to be subtracted from this one - confirm
38 
39         // Constants: char indices into the serialized class string (layout described in the file header).
40         private const int FLAGS = 0; // index of the flags char (currently only the "negate" flag)
41         private const int SETLENGTH = 1; // index of the char holding the length of the "set" portion
42         private const int CATEGORYLENGTH = 2; // index of the char holding the length of the "category" portion
43         private const int SETSTART = 3; // index of the first char of the "set" portion
44 
45         private const string NullCharString = "\0"; // one-char string containing U+0000
46 
47         private const char NullChar = '\0'; // U+0000, the lowest char value
48         private const char LastChar = '\uFFFF'; // U+FFFF, the highest char value
49 
50         private const char GroupChar = (char)0; // same value as NullChar; NOTE(review): presumably the delimiter of the \u0000-bracketed category groups (e.g. s_word) - confirm
51 
52 
53         private const short SpaceConst = 100; // 100 == 0x64, the char value stored in s_space and at the end of SpaceClass
54         private const short NotSpaceConst = -100; // -100 as a 16-bit value is 0xFF9C, the char value stored in s_notSpace
55 
56         private const char ZeroWidthJoiner = '\u200D'; // U+200D ZERO WIDTH JOINER
57         private const char ZeroWidthNonJoiner = '\u200C'; // U+200C ZERO WIDTH NON-JOINER
58 
59 
60         private static readonly string s_internalRegexIgnoreCase = "__InternalRegexIgnoreCase__"; // same text as the pseudo-category key registered in s_definedCategories
61         private static readonly string s_space = "\x64"; // one char with value 0x64 == SpaceConst (100)
62         private static readonly string s_notSpace = "\uFF9C"; // one char with value 0xFF9C == NotSpaceConst (-100) as a 16-bit value
63         private static readonly string s_word = "\u0000\u0002\u0004\u0005\u0003\u0001\u0006\u0009\u0013\u0000"; // \u0000-bracketed group of category codes (UnicodeCategory + 1): Ll, Lm, Lo, Lt, Lu, Mn, Nd, Pc
64         private static readonly string s_notWord = "\u0000\uFFFE\uFFFC\uFFFB\uFFFD\uFFFF\uFFFA\uFFF7\uFFED\u0000"; // s_word with every code negated as a 16-bit value (0xFFFE == -2, ..., 0xFFED == -19); presumably "not in category" - confirm
65 
66         internal static readonly string SpaceClass = "\u0000\u0000\u0001\u0064"; // flags 0, set length 0, category length 1, category code 0x64 (SpaceConst)
67         internal static readonly string NotSpaceClass = "\u0001\u0000\u0001\u0064"; // SpaceClass with the flags char set to 1 (presumably the negate flag - confirm)
68         internal static readonly string WordClass = "\u0000\u0000\u000A\u0000\u0002\u0004\u0005\u0003\u0001\u0006\u0009\u0013\u0000"; // flags 0, set length 0, category length 0x0A, then the ten chars of s_word
69         internal static readonly string NotWordClass = "\u0001\u0000\u000A\u0000\u0002\u0004\u0005\u0003\u0001\u0006\u0009\u0013\u0000"; // WordClass with the flags char set to 1
70         internal static readonly string DigitClass = "\u0000\u0000\u0001\u0009"; // flags 0, set length 0, category length 1, category code 9 (Nd: DecimalDigitNumber + 1)
71         internal static readonly string NotDigitClass = "\u0000\u0000\u0001\uFFF7"; // flags stay 0; the category code is 0xFFF7, i.e. -9 as a 16-bit value (negated Nd), rather than using the flags char
72 
73         private const string ECMASpaceSet = "\u0009\u000E\u0020\u0021"; // range boundaries; presumably half-open pairs [U+0009, U+000E) and [U+0020, U+0021) - confirm
74         private const string NotECMASpaceSet = "\0\u0009\u000E\u0020\u0021"; // ECMASpaceSet with a leading U+0000 boundary; presumably the complement set - confirm
75         private const string ECMAWordSet = "\u0030\u003A\u0041\u005B\u005F\u0060\u0061\u007B\u0130\u0131"; // boundaries around 0-9, A-Z, _, a-z and U+0130 (same pair reading as ECMASpaceSet)
76         private const string NotECMAWordSet = "\0\u0030\u003A\u0041\u005B\u005F\u0060\u0061\u007B\u0130\u0131"; // ECMAWordSet with a leading U+0000 boundary
77         private const string ECMADigitSet = "\u0030\u003A"; // boundaries around 0-9
78         private const string NotECMADigitSet = "\0\u0030\u003A"; // ECMADigitSet with a leading U+0000 boundary
79 
80         internal const string ECMASpaceClass = "\x00\x04\x00" + ECMASpaceSet; // flags 0, set length 4, category length 0, then the 4-char set
81         internal const string NotECMASpaceClass = "\x01\x04\x00" + ECMASpaceSet; // same set, flags char 1 (presumably negate - confirm)
82         internal const string ECMAWordClass = "\x00\x0A\x00" + ECMAWordSet; // flags 0, set length 0x0A, category length 0, then the 10-char set
83         internal const string NotECMAWordClass = "\x01\x0A\x00" + ECMAWordSet; // same set, flags char 1
84         internal const string ECMADigitClass = "\x00\x02\x00" + ECMADigitSet; // flags 0, set length 2, category length 0, then the 2-char set
85         internal const string NotECMADigitClass = "\x01\x02\x00" + ECMADigitSet; // same set, flags char 1
86 
87         internal const string AnyClass = "\x00\x01\x00\x00"; // flags 0, set length 1, category length 0, set = the single boundary U+0000
88         internal const string EmptyClass = "\x00\x00\x00"; // flags 0, set length 0, category length 0: no set chars and no categories
89 
90         // Maps a Unicode general-category name ("Lu", or a one-letter group such as "L") to its encoded form. UnicodeCategory is zero based, so we add one to each value and subtract it off later.
91         private const int DefinedCategoriesCapacity = 38; // equals the number of entries in the initializer below
92         private static readonly Dictionary<string, string> s_definedCategories = new Dictionary<string, string>(DefinedCategoriesCapacity)
93         {
94             // Others
95             { "Cc", "\u000F" }, // UnicodeCategory.Control + 1
96             { "Cf", "\u0010" }, // UnicodeCategory.Format + 1
97             { "Cn", "\u001E" }, // UnicodeCategory.OtherNotAssigned + 1
98             { "Co", "\u0012" }, // UnicodeCategory.PrivateUse + 1
99             { "Cs", "\u0011" }, // UnicodeCategory.Surrogate + 1
100             { "C", "\u0000\u000F\u0010\u001E\u0012\u0011\u0000" }, // group: the five C* codes above, bracketed by \u0000
101 
102             // Letters
103             { "Ll", "\u0002" }, // UnicodeCategory.LowercaseLetter + 1
104             { "Lm", "\u0004" }, // UnicodeCategory.ModifierLetter + 1
105             { "Lo", "\u0005" }, // UnicodeCategory.OtherLetter + 1
106             { "Lt", "\u0003" }, // UnicodeCategory.TitlecaseLetter + 1
107             { "Lu", "\u0001" }, // UnicodeCategory.UppercaseLetter + 1
108             { "L", "\u0000\u0002\u0004\u0005\u0003\u0001\u0000" }, // group: the five L* codes above, bracketed by \u0000
109 
110             // InternalRegexIgnoreCase = {LowercaseLetter} OR {TitlecaseLetter} OR {UppercaseLetter}
111             // !!!This category should only ever be used in conjunction with RegexOptions.IgnoreCase code paths!!!
112             { "__InternalRegexIgnoreCase__", "\u0000\u0002\u0003\u0001\u0000" }, // group: Ll, Lt, Lu, bracketed by \u0000
113 
114             // Marks
115             { "Mc", "\u0007" }, // UnicodeCategory.SpacingCombiningMark + 1
116             { "Me", "\u0008" }, // UnicodeCategory.EnclosingMark + 1
117             { "Mn", "\u0006" }, // UnicodeCategory.NonSpacingMark + 1
118             { "M", "\u0000\u0007\u0008\u0006\u0000" }, // group: the three M* codes above, bracketed by \u0000
119 
120             // Numbers
121             { "Nd", "\u0009" }, // UnicodeCategory.DecimalDigitNumber + 1
122             { "Nl", "\u000A" }, // UnicodeCategory.LetterNumber + 1
123             { "No", "\u000B" }, // UnicodeCategory.OtherNumber + 1
124             { "N", "\u0000\u0009\u000A\u000B\u0000" }, // group: the three N* codes above, bracketed by \u0000
125 
126             // Punctuation
127             { "Pc", "\u0013" }, // UnicodeCategory.ConnectorPunctuation + 1
128             { "Pd", "\u0014" }, // UnicodeCategory.DashPunctuation + 1
129             { "Pe", "\u0016" }, // UnicodeCategory.ClosePunctuation + 1
130             { "Po", "\u0019" }, // UnicodeCategory.OtherPunctuation + 1
131             { "Ps", "\u0015" }, // UnicodeCategory.OpenPunctuation + 1
132             { "Pf", "\u0018" }, // UnicodeCategory.FinalQuotePunctuation + 1
133             { "Pi", "\u0017" }, // UnicodeCategory.InitialQuotePunctuation + 1
134             { "P", "\u0000\u0013\u0014\u0016\u0019\u0015\u0018\u0017\u0000" }, // group: the seven P* codes above, bracketed by \u0000
135 
136             // Symbols
137             { "Sc", "\u001B" }, // UnicodeCategory.CurrencySymbol + 1
138             { "Sk", "\u001C" }, // UnicodeCategory.ModifierSymbol + 1
139             { "Sm", "\u001A" }, // UnicodeCategory.MathSymbol + 1
140             { "So", "\u001D" }, // UnicodeCategory.OtherSymbol + 1
141             { "S", "\u0000\u001B\u001C\u001A\u001D\u0000" }, // group: the four S* codes above, bracketed by \u0000
142 
143             // Separators
144             { "Zl", "\u000D" }, // UnicodeCategory.LineSeparator + 1
145             { "Zp", "\u000E" }, // UnicodeCategory.ParagraphSeparator + 1
146             { "Zs", "\u000C" }, // UnicodeCategory.SpaceSeparator + 1
147             { "Z", "\u0000\u000D\u000E\u000C\u0000" }, // group: the three Z* codes above, bracketed by \u0000
148         };
149 
150         /*
151          *   The property table contains all the block definitions defined in the
152          *   XML schema spec (http://www.w3.org/TR/2001/PR-xmlschema-2-20010316/#charcter-classes), Unicode 4.0 spec (www.unicode.org),
153          *   and Perl 5.6 (see Programming Perl, 3rd edition, page 167).   Three blocks defined by Perl (and here) may
154          *   not be part of the Unicode standard: IsHighPrivateUseSurrogates, IsHighSurrogates, and IsLowSurrogates.
155          *
156         **/
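157         // Has to be sorted by the first column (the name). The existing entries are in ordinal order, so uppercase sorts before lowercase: "IsCJKUnifiedIdeographsExtensionA" precedes "IsCherokee".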
158         private static readonly string[][] s_propTable = {
159             new [] {"IsAlphabeticPresentationForms",       "\uFB00\uFB50"},
160             new [] {"IsArabic",                            "\u0600\u0700"},
161             new [] {"IsArabicPresentationForms-A",         "\uFB50\uFE00"},
162             new [] {"IsArabicPresentationForms-B",         "\uFE70\uFF00"},
163             new [] {"IsArmenian",                          "\u0530\u0590"},
164             new [] {"IsArrows",                            "\u2190\u2200"},
165             new [] {"IsBasicLatin",                        "\u0000\u0080"},
166             new [] {"IsBengali",                           "\u0980\u0A00"},
167             new [] {"IsBlockElements",                     "\u2580\u25A0"},
168             new [] {"IsBopomofo",                          "\u3100\u3130"},
169             new [] {"IsBopomofoExtended",                  "\u31A0\u31C0"},
170             new [] {"IsBoxDrawing",                        "\u2500\u2580"},
171             new [] {"IsBraillePatterns",                   "\u2800\u2900"},
172             new [] {"IsBuhid",                             "\u1740\u1760"},
173             new [] {"IsCJKCompatibility",                  "\u3300\u3400"},
174             new [] {"IsCJKCompatibilityForms",             "\uFE30\uFE50"},
175             new [] {"IsCJKCompatibilityIdeographs",        "\uF900\uFB00"},
176             new [] {"IsCJKRadicalsSupplement",             "\u2E80\u2F00"},
177             new [] {"IsCJKSymbolsandPunctuation",          "\u3000\u3040"},
178             new [] {"IsCJKUnifiedIdeographs",              "\u4E00\uA000"},
179             new [] {"IsCJKUnifiedIdeographsExtensionA",    "\u3400\u4DC0"},
180             new [] {"IsCherokee",                          "\u13A0\u1400"},
181             new [] {"IsCombiningDiacriticalMarks",         "\u0300\u0370"},
182             new [] {"IsCombiningDiacriticalMarksforSymbols","\u20D0\u2100"},
183             new [] {"IsCombiningHalfMarks",                "\uFE20\uFE30"},
184             new [] {"IsCombiningMarksforSymbols",          "\u20D0\u2100"},
185             new [] {"IsControlPictures",                   "\u2400\u2440"},
186             new [] {"IsCurrencySymbols",                   "\u20A0\u20D0"},
187             new [] {"IsCyrillic",                          "\u0400\u0500"},
188             new [] {"IsCyrillicSupplement",                "\u0500\u0530"},
189             new [] {"IsDevanagari",                        "\u0900\u0980"},
190             new [] {"IsDingbats",                          "\u2700\u27C0"},
191             new [] {"IsEnclosedAlphanumerics",             "\u2460\u2500"},
192             new [] {"IsEnclosedCJKLettersandMonths",       "\u3200\u3300"},
193             new [] {"IsEthiopic",                          "\u1200\u1380"},
194             new [] {"IsGeneralPunctuation",                "\u2000\u2070"},
195             new [] {"IsGeometricShapes",                   "\u25A0\u2600"},
196             new [] {"IsGeorgian",                          "\u10A0\u1100"},
197             new [] {"IsGreek",                             "\u0370\u0400"},
198             new [] {"IsGreekExtended",                     "\u1F00\u2000"},
199             new [] {"IsGreekandCoptic",                    "\u0370\u0400"},
200             new [] {"IsGujarati",                          "\u0A80\u0B00"},
201             new [] {"IsGurmukhi",                          "\u0A00\u0A80"},
202             new [] {"IsHalfwidthandFullwidthForms",        "\uFF00\uFFF0"},
203             new [] {"IsHangulCompatibilityJamo",           "\u3130\u3190"},
204             new [] {"IsHangulJamo",                        "\u1100\u1200"},
205             new [] {"IsHangulSyllables",                   "\uAC00\uD7B0"},
206             new [] {"IsHanunoo",                           "\u1720\u1740"},
207             new [] {"IsHebrew",                            "\u0590\u0600"},
208             new [] {"IsHighPrivateUseSurrogates",          "\uDB80\uDC00"},
209             new [] {"IsHighSurrogates",                    "\uD800\uDB80"},
210             new [] {"IsHiragana",                          "\u3040\u30A0"},
211             new [] {"IsIPAExtensions",                     "\u0250\u02B0"},
212             new [] {"IsIdeographicDescriptionCharacters",  "\u2FF0\u3000"},
213             new [] {"IsKanbun",                            "\u3190\u31A0"},
214             new [] {"IsKangxiRadicals",                    "\u2F00\u2FE0"},
215             new [] {"IsKannada",                           "\u0C80\u0D00"},
216             new [] {"IsKatakana",                          "\u30A0\u3100"},
217             new [] {"IsKatakanaPhoneticExtensions",        "\u31F0\u3200"},
218             new [] {"IsKhmer",                             "\u1780\u1800"},
219             new [] {"IsKhmerSymbols",                      "\u19E0\u1A00"},
220             new [] {"IsLao",                               "\u0E80\u0F00"},
221             new [] {"IsLatin-1Supplement",                 "\u0080\u0100"},
222             new [] {"IsLatinExtended-A",                   "\u0100\u0180"},
223             new [] {"IsLatinExtended-B",                   "\u0180\u0250"},
224             new [] {"IsLatinExtendedAdditional",           "\u1E00\u1F00"},
225             new [] {"IsLetterlikeSymbols",                 "\u2100\u2150"},
226             new [] {"IsLimbu",                             "\u1900\u1950"},
227             new [] {"IsLowSurrogates",                     "\uDC00\uE000"},
228             new [] {"IsMalayalam",                         "\u0D00\u0D80"},
229             new [] {"IsMathematicalOperators",             "\u2200\u2300"},
230             new [] {"IsMiscellaneousMathematicalSymbols-A","\u27C0\u27F0"},
231             new [] {"IsMiscellaneousMathematicalSymbols-B","\u2980\u2A00"},
232             new [] {"IsMiscellaneousSymbols",              "\u2600\u2700"},
233             new [] {"IsMiscellaneousSymbolsandArrows",     "\u2B00\u2C00"},
234             new [] {"IsMiscellaneousTechnical",            "\u2300\u2400"},
235             new [] {"IsMongolian",                         "\u1800\u18B0"},
236             new [] {"IsMyanmar",                           "\u1000\u10A0"},
237             new [] {"IsNumberForms",                       "\u2150\u2190"},
238             new [] {"IsOgham",                             "\u1680\u16A0"},
239             new [] {"IsOpticalCharacterRecognition",       "\u2440\u2460"},
240             new [] {"IsOriya",                             "\u0B00\u0B80"},
241             new [] {"IsPhoneticExtensions",                "\u1D00\u1D80"},
242             new [] {"IsPrivateUse",                        "\uE000\uF900"},
243             new [] {"IsPrivateUseArea",                    "\uE000\uF900"},
244             new [] {"IsRunic",                             "\u16A0\u1700"},
245             new [] {"IsSinhala",                           "\u0D80\u0E00"},
246             new [] {"IsSmallFormVariants",                 "\uFE50\uFE70"},
247             new [] {"IsSpacingModifierLetters",            "\u02B0\u0300"},
248             new [] {"IsSpecials",                          "\uFFF0"},
249             new [] {"IsSuperscriptsandSubscripts",         "\u2070\u20A0"},
250             new [] {"IsSupplementalArrows-A",              "\u27F0\u2800"},
251             new [] {"IsSupplementalArrows-B",              "\u2900\u2980"},
252             new [] {"IsSupplementalMathematicalOperators", "\u2A00\u2B00"},
253             new [] {"IsSyriac",                            "\u0700\u0750"},
254             new [] {"IsTagalog",                           "\u1700\u1720"},
255             new [] {"IsTagbanwa",                          "\u1760\u1780"},
256             new [] {"IsTaiLe",                             "\u1950\u1980"},
257             new [] {"IsTamil",                             "\u0B80\u0C00"},
258             new [] {"IsTelugu",                            "\u0C00\u0C80"},
259             new [] {"IsThaana",                            "\u0780\u07C0"},
260             new [] {"IsThai",                              "\u0E00\u0E80"},
261             new [] {"IsTibetan",                           "\u0F00\u1000"},
262             new [] {"IsUnifiedCanadianAboriginalSyllabics","\u1400\u1680"},
263             new [] {"IsVariationSelectors",                "\uFE00\uFE10"},
264             new [] {"IsYiRadicals",                        "\uA490\uA4D0"},
265             new [] {"IsYiSyllables",                       "\uA000\uA490"},
266             new [] {"IsYijingHexagramSymbols",             "\u4DC0\u4E00"},
267             new [] {"_xmlC", /* Name Char              */   "\u002D\u002F\u0030\u003B\u0041\u005B\u005F\u0060\u0061\u007B\u00B7\u00B8\u00C0\u00D7\u00D8\u00F7\u00F8\u0132\u0134\u013F\u0141\u0149\u014A\u017F\u0180\u01C4\u01CD\u01F1\u01F4\u01F6\u01FA\u0218\u0250\u02A9\u02BB\u02C2\u02D0\u02D2\u0300\u0346\u0360\u0362\u0386\u038B\u038C\u038D\u038E\u03A2\u03A3\u03CF\u03D0\u03D7\u03DA\u03DB\u03DC\u03DD\u03DE\u03DF\u03E0\u03E1\u03E2\u03F4\u0401\u040D\u040E\u0450\u0451\u045D\u045E\u0482\u0483\u0487\u0490\u04C5\u04C7\u04C9\u04CB\u04CD\u04D0\u04EC\u04EE\u04F6\u04F8\u04FA\u0531\u0557\u0559\u055A\u0561\u0587\u0591\u05A2\u05A3\u05BA\u05BB\u05BE\u05BF\u05C0\u05C1\u05C3\u05C4\u05C5\u05D0\u05EB\u05F0\u05F3\u0621\u063B\u0640\u0653\u0660\u066A\u0670\u06B8\u06BA\u06BF\u06C0\u06CF\u06D0\u06D4\u06D5\u06E9\u06EA\u06EE\u06F0\u06FA\u0901\u0904\u0905\u093A\u093C\u094E\u0951\u0955\u0958\u0964\u0966\u0970\u0981\u0984\u0985\u098D\u098F\u0991\u0993\u09A9\u09AA\u09B1\u09B2\u09B3\u09B6\u09BA\u09BC\u09BD\u09BE\u09C5\u09C7\u09C9\u09CB\u09CE\u09D7\u09D8\u09DC"
268                 +"\u09DE\u09DF\u09E4\u09E6\u09F2\u0A02\u0A03\u0A05\u0A0B\u0A0F\u0A11\u0A13\u0A29\u0A2A\u0A31\u0A32\u0A34\u0A35\u0A37\u0A38\u0A3A\u0A3C\u0A3D\u0A3E\u0A43\u0A47\u0A49\u0A4B\u0A4E\u0A59\u0A5D\u0A5E\u0A5F\u0A66\u0A75\u0A81\u0A84\u0A85\u0A8C\u0A8D\u0A8E\u0A8F\u0A92\u0A93\u0AA9\u0AAA\u0AB1\u0AB2\u0AB4\u0AB5\u0ABA\u0ABC\u0AC6\u0AC7\u0ACA\u0ACB\u0ACE\u0AE0\u0AE1\u0AE6\u0AF0\u0B01\u0B04\u0B05\u0B0D\u0B0F\u0B11\u0B13\u0B29\u0B2A\u0B31\u0B32\u0B34\u0B36\u0B3A\u0B3C\u0B44\u0B47\u0B49\u0B4B\u0B4E\u0B56\u0B58\u0B5C\u0B5E\u0B5F\u0B62\u0B66\u0B70\u0B82\u0B84\u0B85\u0B8B\u0B8E\u0B91\u0B92\u0B96\u0B99\u0B9B\u0B9C\u0B9D\u0B9E\u0BA0\u0BA3\u0BA5\u0BA8\u0BAB\u0BAE\u0BB6\u0BB7\u0BBA\u0BBE\u0BC3\u0BC6\u0BC9\u0BCA\u0BCE\u0BD7\u0BD8\u0BE7\u0BF0\u0C01\u0C04\u0C05\u0C0D\u0C0E\u0C11\u0C12\u0C29\u0C2A\u0C34\u0C35\u0C3A\u0C3E\u0C45\u0C46\u0C49\u0C4A\u0C4E\u0C55\u0C57\u0C60\u0C62\u0C66\u0C70\u0C82\u0C84\u0C85\u0C8D\u0C8E\u0C91\u0C92\u0CA9\u0CAA\u0CB4\u0CB5\u0CBA\u0CBE\u0CC5\u0CC6\u0CC9\u0CCA\u0CCE\u0CD5\u0CD7\u0CDE\u0CDF\u0CE0\u0CE2"
269                 +"\u0CE6\u0CF0\u0D02\u0D04\u0D05\u0D0D\u0D0E\u0D11\u0D12\u0D29\u0D2A\u0D3A\u0D3E\u0D44\u0D46\u0D49\u0D4A\u0D4E\u0D57\u0D58\u0D60\u0D62\u0D66\u0D70\u0E01\u0E2F\u0E30\u0E3B\u0E40\u0E4F\u0E50\u0E5A\u0E81\u0E83\u0E84\u0E85\u0E87\u0E89\u0E8A\u0E8B\u0E8D\u0E8E\u0E94\u0E98\u0E99\u0EA0\u0EA1\u0EA4\u0EA5\u0EA6\u0EA7\u0EA8\u0EAA\u0EAC\u0EAD\u0EAF\u0EB0\u0EBA\u0EBB\u0EBE\u0EC0\u0EC5\u0EC6\u0EC7\u0EC8\u0ECE\u0ED0\u0EDA\u0F18\u0F1A\u0F20\u0F2A\u0F35\u0F36\u0F37\u0F38\u0F39\u0F3A\u0F3E\u0F48\u0F49\u0F6A\u0F71\u0F85\u0F86\u0F8C\u0F90\u0F96\u0F97\u0F98\u0F99\u0FAE\u0FB1\u0FB8\u0FB9\u0FBA\u10A0\u10C6\u10D0\u10F7\u1100\u1101\u1102\u1104\u1105\u1108\u1109\u110A\u110B\u110D\u110E\u1113\u113C\u113D\u113E\u113F\u1140\u1141\u114C\u114D\u114E\u114F\u1150\u1151\u1154\u1156\u1159\u115A\u115F\u1162\u1163\u1164\u1165\u1166\u1167\u1168\u1169\u116A\u116D\u116F\u1172\u1174\u1175\u1176\u119E\u119F\u11A8\u11A9\u11AB\u11AC\u11AE\u11B0\u11B7\u11B9\u11BA\u11BB\u11BC\u11C3\u11EB\u11EC\u11F0\u11F1\u11F9\u11FA\u1E00\u1E9C\u1EA0\u1EFA\u1F00"
270                 +"\u1F16\u1F18\u1F1E\u1F20\u1F46\u1F48\u1F4E\u1F50\u1F58\u1F59\u1F5A\u1F5B\u1F5C\u1F5D\u1F5E\u1F5F\u1F7E\u1F80\u1FB5\u1FB6\u1FBD\u1FBE\u1FBF\u1FC2\u1FC5\u1FC6\u1FCD\u1FD0\u1FD4\u1FD6\u1FDC\u1FE0\u1FED\u1FF2\u1FF5\u1FF6\u1FFD\u20D0\u20DD\u20E1\u20E2\u2126\u2127\u212A\u212C\u212E\u212F\u2180\u2183\u3005\u3006\u3007\u3008\u3021\u3030\u3031\u3036\u3041\u3095\u3099\u309B\u309D\u309F\u30A1\u30FB\u30FC\u30FF\u3105\u312D\u4E00\u9FA6\uAC00\uD7A4"},
271             new [] {"_xmlD",                                "\u0030\u003A\u0660\u066A\u06F0\u06FA\u0966\u0970\u09E6\u09F0\u0A66\u0A70\u0AE6\u0AF0\u0B66\u0B70\u0BE7\u0BF0\u0C66\u0C70\u0CE6\u0CF0\u0D66\u0D70\u0E50\u0E5A\u0ED0\u0EDA\u0F20\u0F2A\u1040\u104A\u1369\u1372\u17E0\u17EA\u1810\u181A\uFF10\uFF1A"},
272             new [] {"_xmlI", /* Start Name Char       */    "\u003A\u003B\u0041\u005B\u005F\u0060\u0061\u007B\u00C0\u00D7\u00D8\u00F7\u00F8\u0132\u0134\u013F\u0141\u0149\u014A\u017F\u0180\u01C4\u01CD\u01F1\u01F4\u01F6\u01FA\u0218\u0250\u02A9\u02BB\u02C2\u0386\u0387\u0388\u038B\u038C\u038D\u038E\u03A2\u03A3\u03CF\u03D0\u03D7\u03DA\u03DB\u03DC\u03DD\u03DE\u03DF\u03E0\u03E1\u03E2\u03F4\u0401\u040D\u040E\u0450\u0451\u045D\u045E\u0482\u0490\u04C5\u04C7\u04C9\u04CB\u04CD\u04D0\u04EC\u04EE\u04F6\u04F8\u04FA\u0531\u0557\u0559\u055A\u0561\u0587\u05D0\u05EB\u05F0\u05F3\u0621\u063B\u0641\u064B\u0671\u06B8\u06BA\u06BF\u06C0\u06CF\u06D0\u06D4\u06D5\u06D6\u06E5\u06E7\u0905\u093A\u093D\u093E\u0958\u0962\u0985\u098D\u098F\u0991\u0993\u09A9\u09AA\u09B1\u09B2\u09B3\u09B6\u09BA\u09DC\u09DE\u09DF\u09E2\u09F0\u09F2\u0A05\u0A0B\u0A0F\u0A11\u0A13\u0A29\u0A2A\u0A31\u0A32\u0A34\u0A35\u0A37\u0A38\u0A3A\u0A59\u0A5D\u0A5E\u0A5F\u0A72\u0A75\u0A85\u0A8C\u0A8D\u0A8E\u0A8F\u0A92\u0A93\u0AA9\u0AAA\u0AB1\u0AB2\u0AB4\u0AB5\u0ABA\u0ABD\u0ABE\u0AE0\u0AE1\u0B05\u0B0D\u0B0F"
273                 +"\u0B11\u0B13\u0B29\u0B2A\u0B31\u0B32\u0B34\u0B36\u0B3A\u0B3D\u0B3E\u0B5C\u0B5E\u0B5F\u0B62\u0B85\u0B8B\u0B8E\u0B91\u0B92\u0B96\u0B99\u0B9B\u0B9C\u0B9D\u0B9E\u0BA0\u0BA3\u0BA5\u0BA8\u0BAB\u0BAE\u0BB6\u0BB7\u0BBA\u0C05\u0C0D\u0C0E\u0C11\u0C12\u0C29\u0C2A\u0C34\u0C35\u0C3A\u0C60\u0C62\u0C85\u0C8D\u0C8E\u0C91\u0C92\u0CA9\u0CAA\u0CB4\u0CB5\u0CBA\u0CDE\u0CDF\u0CE0\u0CE2\u0D05\u0D0D\u0D0E\u0D11\u0D12\u0D29\u0D2A\u0D3A\u0D60\u0D62\u0E01\u0E2F\u0E30\u0E31\u0E32\u0E34\u0E40\u0E46\u0E81\u0E83\u0E84\u0E85\u0E87\u0E89\u0E8A\u0E8B\u0E8D\u0E8E\u0E94\u0E98\u0E99\u0EA0\u0EA1\u0EA4\u0EA5\u0EA6\u0EA7\u0EA8\u0EAA\u0EAC\u0EAD\u0EAF\u0EB0\u0EB1\u0EB2\u0EB4\u0EBD\u0EBE\u0EC0\u0EC5\u0F40\u0F48\u0F49\u0F6A\u10A0\u10C6\u10D0\u10F7\u1100\u1101\u1102\u1104\u1105\u1108\u1109\u110A\u110B\u110D\u110E\u1113\u113C\u113D\u113E\u113F\u1140\u1141\u114C\u114D\u114E\u114F\u1150\u1151\u1154\u1156\u1159\u115A\u115F\u1162\u1163\u1164\u1165\u1166\u1167\u1168\u1169\u116A\u116D\u116F\u1172\u1174\u1175\u1176\u119E\u119F\u11A8\u11A9\u11AB\u11AC"
274                 +"\u11AE\u11B0\u11B7\u11B9\u11BA\u11BB\u11BC\u11C3\u11EB\u11EC\u11F0\u11F1\u11F9\u11FA\u1E00\u1E9C\u1EA0\u1EFA\u1F00\u1F16\u1F18\u1F1E\u1F20\u1F46\u1F48\u1F4E\u1F50\u1F58\u1F59\u1F5A\u1F5B\u1F5C\u1F5D\u1F5E\u1F5F\u1F7E\u1F80\u1FB5\u1FB6\u1FBD\u1FBE\u1FBF\u1FC2\u1FC5\u1FC6\u1FCD\u1FD0\u1FD4\u1FD6\u1FDC\u1FE0\u1FED\u1FF2\u1FF5\u1FF6\u1FFD\u2126\u2127\u212A\u212C\u212E\u212F\u2180\u2183\u3007\u3008\u3021\u302A\u3041\u3095\u30A1\u30FB\u3105\u312D\u4E00\u9FA6\uAC00\uD7A4"},
275             new [] {"_xmlW",                                "\u0024\u0025\u002B\u002C\u0030\u003A\u003C\u003F\u0041\u005B\u005E\u005F\u0060\u007B\u007C\u007D\u007E\u007F\u00A2\u00AB\u00AC\u00AD\u00AE\u00B7\u00B8\u00BB\u00BC\u00BF\u00C0\u0221\u0222\u0234\u0250\u02AE\u02B0\u02EF\u0300\u0350\u0360\u0370\u0374\u0376\u037A\u037B\u0384\u0387\u0388\u038B\u038C\u038D\u038E\u03A2\u03A3\u03CF\u03D0\u03F7\u0400\u0487\u0488\u04CF\u04D0\u04F6\u04F8\u04FA\u0500\u0510\u0531\u0557\u0559\u055A\u0561\u0588\u0591\u05A2\u05A3\u05BA\u05BB\u05BE\u05BF\u05C0\u05C1\u05C3\u05C4\u05C5\u05D0\u05EB\u05F0\u05F3\u0621\u063B\u0640\u0656\u0660\u066A\u066E\u06D4\u06D5\u06DD\u06DE\u06EE\u06F0\u06FF\u0710\u072D\u0730\u074B\u0780\u07B2\u0901\u0904\u0905\u093A\u093C\u094E\u0950\u0955\u0958\u0964\u0966\u0970\u0981\u0984\u0985\u098D\u098F\u0991\u0993\u09A9\u09AA\u09B1\u09B2\u09B3\u09B6\u09BA\u09BC\u09BD\u09BE\u09C5\u09C7\u09C9\u09CB\u09CE\u09D7\u09D8\u09DC\u09DE\u09DF\u09E4\u09E6\u09FB\u0A02\u0A03\u0A05\u0A0B\u0A0F\u0A11\u0A13\u0A29\u0A2A\u0A31\u0A32\u0A34\u0A35"
276                 +"\u0A37\u0A38\u0A3A\u0A3C\u0A3D\u0A3E\u0A43\u0A47\u0A49\u0A4B\u0A4E\u0A59\u0A5D\u0A5E\u0A5F\u0A66\u0A75\u0A81\u0A84\u0A85\u0A8C\u0A8D\u0A8E\u0A8F\u0A92\u0A93\u0AA9\u0AAA\u0AB1\u0AB2\u0AB4\u0AB5\u0ABA\u0ABC\u0AC6\u0AC7\u0ACA\u0ACB\u0ACE\u0AD0\u0AD1\u0AE0\u0AE1\u0AE6\u0AF0\u0B01\u0B04\u0B05\u0B0D\u0B0F\u0B11\u0B13\u0B29\u0B2A\u0B31\u0B32\u0B34\u0B36\u0B3A\u0B3C\u0B44\u0B47\u0B49\u0B4B\u0B4E\u0B56\u0B58\u0B5C\u0B5E\u0B5F\u0B62\u0B66\u0B71\u0B82\u0B84\u0B85\u0B8B\u0B8E\u0B91\u0B92\u0B96\u0B99\u0B9B\u0B9C\u0B9D\u0B9E\u0BA0\u0BA3\u0BA5\u0BA8\u0BAB\u0BAE\u0BB6\u0BB7\u0BBA\u0BBE\u0BC3\u0BC6\u0BC9\u0BCA\u0BCE\u0BD7\u0BD8\u0BE7\u0BF3\u0C01\u0C04\u0C05\u0C0D\u0C0E\u0C11\u0C12\u0C29\u0C2A\u0C34\u0C35\u0C3A\u0C3E\u0C45\u0C46\u0C49\u0C4A\u0C4E\u0C55\u0C57\u0C60\u0C62\u0C66\u0C70\u0C82\u0C84\u0C85\u0C8D\u0C8E\u0C91\u0C92\u0CA9\u0CAA\u0CB4\u0CB5\u0CBA\u0CBE\u0CC5\u0CC6\u0CC9\u0CCA\u0CCE\u0CD5\u0CD7\u0CDE\u0CDF\u0CE0\u0CE2\u0CE6\u0CF0\u0D02\u0D04\u0D05\u0D0D\u0D0E\u0D11\u0D12\u0D29\u0D2A\u0D3A\u0D3E\u0D44\u0D46\u0D49"
277                 +"\u0D4A\u0D4E\u0D57\u0D58\u0D60\u0D62\u0D66\u0D70\u0D82\u0D84\u0D85\u0D97\u0D9A\u0DB2\u0DB3\u0DBC\u0DBD\u0DBE\u0DC0\u0DC7\u0DCA\u0DCB\u0DCF\u0DD5\u0DD6\u0DD7\u0DD8\u0DE0\u0DF2\u0DF4\u0E01\u0E3B\u0E3F\u0E4F\u0E50\u0E5A\u0E81\u0E83\u0E84\u0E85\u0E87\u0E89\u0E8A\u0E8B\u0E8D\u0E8E\u0E94\u0E98\u0E99\u0EA0\u0EA1\u0EA4\u0EA5\u0EA6\u0EA7\u0EA8\u0EAA\u0EAC\u0EAD\u0EBA\u0EBB\u0EBE\u0EC0\u0EC5\u0EC6\u0EC7\u0EC8\u0ECE\u0ED0\u0EDA\u0EDC\u0EDE\u0F00\u0F04\u0F13\u0F3A\u0F3E\u0F48\u0F49\u0F6B\u0F71\u0F85\u0F86\u0F8C\u0F90\u0F98\u0F99\u0FBD\u0FBE\u0FCD\u0FCF\u0FD0\u1000\u1022\u1023\u1028\u1029\u102B\u102C\u1033\u1036\u103A\u1040\u104A\u1050\u105A\u10A0\u10C6\u10D0\u10F9\u1100\u115A\u115F\u11A3\u11A8\u11FA\u1200\u1207\u1208\u1247\u1248\u1249\u124A\u124E\u1250\u1257\u1258\u1259\u125A\u125E\u1260\u1287\u1288\u1289\u128A\u128E\u1290\u12AF\u12B0\u12B1\u12B2\u12B6\u12B8\u12BF\u12C0\u12C1\u12C2\u12C6\u12C8\u12CF\u12D0\u12D7\u12D8\u12EF\u12F0\u130F\u1310\u1311\u1312\u1316\u1318\u131F\u1320\u1347\u1348\u135B\u1369\u137D\u13A0"
278                 +"\u13F5\u1401\u166D\u166F\u1677\u1681\u169B\u16A0\u16EB\u16EE\u16F1\u1700\u170D\u170E\u1715\u1720\u1735\u1740\u1754\u1760\u176D\u176E\u1771\u1772\u1774\u1780\u17D4\u17D7\u17D8\u17DB\u17DD\u17E0\u17EA\u180B\u180E\u1810\u181A\u1820\u1878\u1880\u18AA\u1E00\u1E9C\u1EA0\u1EFA\u1F00\u1F16\u1F18\u1F1E\u1F20\u1F46\u1F48\u1F4E\u1F50\u1F58\u1F59\u1F5A\u1F5B\u1F5C\u1F5D\u1F5E\u1F5F\u1F7E\u1F80\u1FB5\u1FB6\u1FC5\u1FC6\u1FD4\u1FD6\u1FDC\u1FDD\u1FF0\u1FF2\u1FF5\u1FF6\u1FFF\u2044\u2045\u2052\u2053\u2070\u2072\u2074\u207D\u207F\u208D\u20A0\u20B2\u20D0\u20EB\u2100\u213B\u213D\u214C\u2153\u2184\u2190\u2329\u232B\u23B4\u23B7\u23CF\u2400\u2427\u2440\u244B\u2460\u24FF\u2500\u2614\u2616\u2618\u2619\u267E\u2680\u268A\u2701\u2705\u2706\u270A\u270C\u2728\u2729\u274C\u274D\u274E\u274F\u2753\u2756\u2757\u2758\u275F\u2761\u2768\u2776\u2795\u2798\u27B0\u27B1\u27BF\u27D0\u27E6\u27F0\u2983\u2999\u29D8\u29DC\u29FC\u29FE\u2B00\u2E80\u2E9A\u2E9B\u2EF4\u2F00\u2FD6\u2FF0\u2FFC\u3004\u3008\u3012\u3014\u3020\u3030\u3031\u303D\u303E\u3040"
279                 +"\u3041\u3097\u3099\u30A0\u30A1\u30FB\u30FC\u3100\u3105\u312D\u3131\u318F\u3190\u31B8\u31F0\u321D\u3220\u3244\u3251\u327C\u327F\u32CC\u32D0\u32FF\u3300\u3377\u337B\u33DE\u33E0\u33FF\u3400\u4DB6\u4E00\u9FA6\uA000\uA48D\uA490\uA4C7\uAC00\uD7A4\uF900\uFA2E\uFA30\uFA6B\uFB00\uFB07\uFB13\uFB18\uFB1D\uFB37\uFB38\uFB3D\uFB3E\uFB3F\uFB40\uFB42\uFB43\uFB45\uFB46\uFBB2\uFBD3\uFD3E\uFD50\uFD90\uFD92\uFDC8\uFDF0\uFDFD\uFE00\uFE10\uFE20\uFE24\uFE62\uFE63\uFE64\uFE67\uFE69\uFE6A\uFE70\uFE75\uFE76\uFEFD\uFF04\uFF05\uFF0B\uFF0C\uFF10\uFF1A\uFF1C\uFF1F\uFF21\uFF3B\uFF3E\uFF3F\uFF40\uFF5B\uFF5C\uFF5D\uFF5E\uFF5F\uFF66\uFFBF\uFFC2\uFFC8\uFFCA\uFFD0\uFFD2\uFFD8\uFFDA\uFFDD\uFFE0\uFFE7\uFFE8\uFFEF\uFFFC\uFFFE"},
280         };
281 
282 
283         /**************************************************************************
284             Let U be the set of Unicode character values and let L be the lowercase
285             function, mapping from U to U. To perform case insensitive matching of
286             character sets, we need to be able to map an interval I in U, say
287 
288                 I = [chMin, chMax] = { ch : chMin <= ch <= chMax }
289 
290             to a set A such that A contains L(I) and A is contained in the union of
291             I and L(I).
292 
293             The table below partitions U into intervals on which L is non-decreasing.
294             Thus, for any interval J = [a, b] contained in one of these intervals,
295             L(J) is contained in [L(a), L(b)].
296 
297             It is also true that for any such J, [L(a), L(b)] is contained in the
298             union of J and L(J). This does not follow from L being non-decreasing on
299             these intervals. It follows from the form that L takes on each interval.
300             On each interval, L has one of the following forms:
301 
302                 (1) L(ch) = constant            (LowercaseSet)
303                 (2) L(ch) = ch + offset         (LowercaseAdd)
304                 (3) L(ch) = ch | 1              (LowercaseBor)
305                 (4) L(ch) = ch + (ch & 1)       (LowercaseBad)
306 
307             It is easy to verify that for any of these forms [L(a), L(b)] is
308             contained in the union of [a, b] and L([a, b]).
309         ***************************************************************************/
310 
311         private const int LowercaseSet = 0;    // Form (1): L(ch) = arg, a constant.
312         private const int LowercaseAdd = 1;    // Form (2): L(ch) = ch + arg.
313         private const int LowercaseBor = 2;    // Form (3): L(ch) = ch | 1 (bitwise or with 1).
314         private const int LowercaseBad = 3;    // Form (4): L(ch) = ch + (ch & 1) (bitwise and with 1, added to the original).
315 
        // Lowercase mapping table consulted by AddLowercaseRange. Each entry gives a character
        // interval (first two arguments), the operation to apply on that interval (one of the
        // Lowercase* constants) and the operation's argument. Entries are listed in ascending
        // character order; AddLowercaseRange binary-searches the table on the interval's upper
        // bound, so that ordering must be preserved when editing the table.
        private static readonly LowerCaseMapping[] s_lcTable = new LowerCaseMapping[]
        {
            new LowerCaseMapping('\u0041', '\u005A', LowercaseAdd, 32),
            new LowerCaseMapping('\u00C0', '\u00DE', LowercaseAdd, 32),
            new LowerCaseMapping('\u0100', '\u012E', LowercaseBor, 0),
            new LowerCaseMapping('\u0130', '\u0130', LowercaseSet, 0x0069),
            new LowerCaseMapping('\u0132', '\u0136', LowercaseBor, 0),
            new LowerCaseMapping('\u0139', '\u0147', LowercaseBad, 0),
            new LowerCaseMapping('\u014A', '\u0176', LowercaseBor, 0),
            new LowerCaseMapping('\u0178', '\u0178', LowercaseSet, 0x00FF),
            new LowerCaseMapping('\u0179', '\u017D', LowercaseBad, 0),
            new LowerCaseMapping('\u0181', '\u0181', LowercaseSet, 0x0253),
            new LowerCaseMapping('\u0182', '\u0184', LowercaseBor, 0),
            new LowerCaseMapping('\u0186', '\u0186', LowercaseSet, 0x0254),
            new LowerCaseMapping('\u0187', '\u0187', LowercaseSet, 0x0188),
            new LowerCaseMapping('\u0189', '\u018A', LowercaseAdd, 205),
            new LowerCaseMapping('\u018B', '\u018B', LowercaseSet, 0x018C),
            new LowerCaseMapping('\u018E', '\u018E', LowercaseSet, 0x01DD),
            new LowerCaseMapping('\u018F', '\u018F', LowercaseSet, 0x0259),
            new LowerCaseMapping('\u0190', '\u0190', LowercaseSet, 0x025B),
            new LowerCaseMapping('\u0191', '\u0191', LowercaseSet, 0x0192),
            new LowerCaseMapping('\u0193', '\u0193', LowercaseSet, 0x0260),
            new LowerCaseMapping('\u0194', '\u0194', LowercaseSet, 0x0263),
            new LowerCaseMapping('\u0196', '\u0196', LowercaseSet, 0x0269),
            new LowerCaseMapping('\u0197', '\u0197', LowercaseSet, 0x0268),
            new LowerCaseMapping('\u0198', '\u0198', LowercaseSet, 0x0199),
            new LowerCaseMapping('\u019C', '\u019C', LowercaseSet, 0x026F),
            new LowerCaseMapping('\u019D', '\u019D', LowercaseSet, 0x0272),
            new LowerCaseMapping('\u019F', '\u019F', LowercaseSet, 0x0275),
            new LowerCaseMapping('\u01A0', '\u01A4', LowercaseBor, 0),
            new LowerCaseMapping('\u01A7', '\u01A7', LowercaseSet, 0x01A8),
            new LowerCaseMapping('\u01A9', '\u01A9', LowercaseSet, 0x0283),
            new LowerCaseMapping('\u01AC', '\u01AC', LowercaseSet, 0x01AD),
            new LowerCaseMapping('\u01AE', '\u01AE', LowercaseSet, 0x0288),
            new LowerCaseMapping('\u01AF', '\u01AF', LowercaseSet, 0x01B0),
            new LowerCaseMapping('\u01B1', '\u01B2', LowercaseAdd, 217),
            new LowerCaseMapping('\u01B3', '\u01B5', LowercaseBad, 0),
            new LowerCaseMapping('\u01B7', '\u01B7', LowercaseSet, 0x0292),
            new LowerCaseMapping('\u01B8', '\u01B8', LowercaseSet, 0x01B9),
            new LowerCaseMapping('\u01BC', '\u01BC', LowercaseSet, 0x01BD),
            new LowerCaseMapping('\u01C4', '\u01C5', LowercaseSet, 0x01C6),
            new LowerCaseMapping('\u01C7', '\u01C8', LowercaseSet, 0x01C9),
            new LowerCaseMapping('\u01CA', '\u01CB', LowercaseSet, 0x01CC),
            new LowerCaseMapping('\u01CD', '\u01DB', LowercaseBad, 0),
            new LowerCaseMapping('\u01DE', '\u01EE', LowercaseBor, 0),
            new LowerCaseMapping('\u01F1', '\u01F2', LowercaseSet, 0x01F3),
            new LowerCaseMapping('\u01F4', '\u01F4', LowercaseSet, 0x01F5),
            new LowerCaseMapping('\u01FA', '\u0216', LowercaseBor, 0),
            new LowerCaseMapping('\u0386', '\u0386', LowercaseSet, 0x03AC),
            new LowerCaseMapping('\u0388', '\u038A', LowercaseAdd, 37),
            new LowerCaseMapping('\u038C', '\u038C', LowercaseSet, 0x03CC),
            new LowerCaseMapping('\u038E', '\u038F', LowercaseAdd, 63),
            new LowerCaseMapping('\u0391', '\u03AB', LowercaseAdd, 32),
            new LowerCaseMapping('\u03E2', '\u03EE', LowercaseBor, 0),
            new LowerCaseMapping('\u0401', '\u040F', LowercaseAdd, 80),
            new LowerCaseMapping('\u0410', '\u042F', LowercaseAdd, 32),
            new LowerCaseMapping('\u0460', '\u0480', LowercaseBor, 0),
            new LowerCaseMapping('\u0490', '\u04BE', LowercaseBor, 0),
            new LowerCaseMapping('\u04C1', '\u04C3', LowercaseBad, 0),
            new LowerCaseMapping('\u04C7', '\u04C7', LowercaseSet, 0x04C8),
            new LowerCaseMapping('\u04CB', '\u04CB', LowercaseSet, 0x04CC),
            new LowerCaseMapping('\u04D0', '\u04EA', LowercaseBor, 0),
            new LowerCaseMapping('\u04EE', '\u04F4', LowercaseBor, 0),
            new LowerCaseMapping('\u04F8', '\u04F8', LowercaseSet, 0x04F9),
            new LowerCaseMapping('\u0531', '\u0556', LowercaseAdd, 48),
            new LowerCaseMapping('\u10A0', '\u10C5', LowercaseAdd, 48),
            new LowerCaseMapping('\u1E00', '\u1EF8', LowercaseBor, 0),
            new LowerCaseMapping('\u1F08', '\u1F0F', LowercaseAdd, -8),
            new LowerCaseMapping('\u1F18', '\u1F1F', LowercaseAdd, -8),
            new LowerCaseMapping('\u1F28', '\u1F2F', LowercaseAdd, -8),
            new LowerCaseMapping('\u1F38', '\u1F3F', LowercaseAdd, -8),
            new LowerCaseMapping('\u1F48', '\u1F4D', LowercaseAdd, -8),
            new LowerCaseMapping('\u1F59', '\u1F59', LowercaseSet, 0x1F51),
            new LowerCaseMapping('\u1F5B', '\u1F5B', LowercaseSet, 0x1F53),
            new LowerCaseMapping('\u1F5D', '\u1F5D', LowercaseSet, 0x1F55),
            new LowerCaseMapping('\u1F5F', '\u1F5F', LowercaseSet, 0x1F57),
            new LowerCaseMapping('\u1F68', '\u1F6F', LowercaseAdd, -8),
            new LowerCaseMapping('\u1F88', '\u1F8F', LowercaseAdd, -8),
            new LowerCaseMapping('\u1F98', '\u1F9F', LowercaseAdd, -8),
            new LowerCaseMapping('\u1FA8', '\u1FAF', LowercaseAdd, -8),
            new LowerCaseMapping('\u1FB8', '\u1FB9', LowercaseAdd, -8),
            new LowerCaseMapping('\u1FBA', '\u1FBB', LowercaseAdd, -74),
            new LowerCaseMapping('\u1FBC', '\u1FBC', LowercaseSet, 0x1FB3),
            new LowerCaseMapping('\u1FC8', '\u1FCB', LowercaseAdd, -86),
            new LowerCaseMapping('\u1FCC', '\u1FCC', LowercaseSet, 0x1FC3),
            new LowerCaseMapping('\u1FD8', '\u1FD9', LowercaseAdd, -8),
            new LowerCaseMapping('\u1FDA', '\u1FDB', LowercaseAdd, -100),
            new LowerCaseMapping('\u1FE8', '\u1FE9', LowercaseAdd, -8),
            new LowerCaseMapping('\u1FEA', '\u1FEB', LowercaseAdd, -112),
            new LowerCaseMapping('\u1FEC', '\u1FEC', LowercaseSet, 0x1FE5),
            new LowerCaseMapping('\u1FF8', '\u1FF9', LowercaseAdd, -128),
            new LowerCaseMapping('\u1FFA', '\u1FFB', LowercaseAdd, -126),
            new LowerCaseMapping('\u1FFC', '\u1FFC', LowercaseSet, 0x1FF3),
            new LowerCaseMapping('\u2160', '\u216F', LowercaseAdd, 16),
            new LowerCaseMapping('\u24B6', '\u24D0', LowercaseAdd, 26),
            new LowerCaseMapping('\uFF21', '\uFF3A', LowercaseAdd, 32),
        };
413 
414 #if DEBUG
RegexCharClass()415         static RegexCharClass()
416         {
417             // Make sure the initial capacity for s_definedCategories is correct
418             Debug.Assert(
419                 s_definedCategories.Count == DefinedCategoriesCapacity,
420                 "RegexCharClass s_definedCategories's initial capacity (DefinedCategoriesCapacity) is incorrect.",
421                 "Expected (s_definedCategories.Count): {0}, Actual (DefinedCategoriesCapacity): {1}",
422                 s_definedCategories.Count,
423                 DefinedCategoriesCapacity);
424 
425             // Make sure the s_propTable is correctly ordered
426             int len = s_propTable.Length;
427             for (int i = 0; i < len - 1; i++)
428                 Debug.Assert(string.Compare(s_propTable[i][0], s_propTable[i + 1][0], StringComparison.Ordinal) < 0, "RegexCharClass s_propTable is out of order at (" + s_propTable[i][0] + ", " + s_propTable[i + 1][0] + ")");
429         }
430 #endif
431 
432         /// <summary>
433         /// Creates an empty character class.
434         /// </summary>
RegexCharClass()435         internal RegexCharClass()
436         {
437             _rangelist = new List<SingleRange>(6);
438             _canonical = true;
439             _categories = new StringBuilder();
440         }
441 
RegexCharClass(bool negate, List<SingleRange> ranges, StringBuilder categories, RegexCharClass subtraction)442         private RegexCharClass(bool negate, List<SingleRange> ranges, StringBuilder categories, RegexCharClass subtraction)
443         {
444             _rangelist = ranges;
445             _categories = categories;
446             _canonical = true;
447             _negate = negate;
448             _subtractor = subtraction;
449         }
450 
451         internal bool CanMerge
452         {
453             get
454             {
455                 return !_negate && _subtractor == null;
456             }
457         }
458 
459         internal bool Negate
460         {
461             set { _negate = value; }
462         }
463 
AddChar(char c)464         internal void AddChar(char c)
465         {
466             AddRange(c, c);
467         }
468 
469         /// <summary>
470         /// Adds a regex char class
471         /// </summary>
AddCharClass(RegexCharClass cc)472         internal void AddCharClass(RegexCharClass cc)
473         {
474             int i;
475 
476             Debug.Assert(cc.CanMerge && CanMerge, "Both character classes added together must be able to merge");
477 
478             if (!cc._canonical)
479             {
480                 // if the new char class to add isn't canonical, we're not either.
481                 _canonical = false;
482             }
483             else if (_canonical && RangeCount() > 0 && cc.RangeCount() > 0 && cc.GetRangeAt(0)._first <= GetRangeAt(RangeCount() - 1)._last)
484                 _canonical = false;
485 
486             for (i = 0; i < cc.RangeCount(); i += 1)
487             {
488                 _rangelist.Add(cc.GetRangeAt(i));
489             }
490 
491             _categories.Append(cc._categories.ToString());
492         }
493 
494         /// <summary>
495         /// Adds a set (specified by its string representation) to the class.
496         /// </summary>
AddSet(string set)497         private void AddSet(string set)
498         {
499             int i;
500 
501             if (_canonical && RangeCount() > 0 && set.Length > 0 &&
502                 set[0] <= GetRangeAt(RangeCount() - 1)._last)
503                 _canonical = false;
504 
505             for (i = 0; i < set.Length - 1; i += 2)
506             {
507                 _rangelist.Add(new SingleRange(set[i], (char)(set[i + 1] - 1)));
508             }
509 
510             if (i < set.Length)
511             {
512                 _rangelist.Add(new SingleRange(set[i], LastChar));
513             }
514         }
515 
AddSubtraction(RegexCharClass sub)516         internal void AddSubtraction(RegexCharClass sub)
517         {
518             Debug.Assert(_subtractor == null, "Can't add two subtractions to a char class. ");
519             _subtractor = sub;
520         }
521 
522         /// <summary>
523         /// Adds a single range of characters to the class.
524         /// </summary>
AddRange(char first, char last)525         internal void AddRange(char first, char last)
526         {
527             _rangelist.Add(new SingleRange(first, last));
528             if (_canonical && _rangelist.Count > 0 &&
529                 first <= _rangelist[_rangelist.Count - 1]._last)
530             {
531                 _canonical = false;
532             }
533         }
534 
AddCategoryFromName(string categoryName, bool invert, bool caseInsensitive, string pattern)535         internal void AddCategoryFromName(string categoryName, bool invert, bool caseInsensitive, string pattern)
536         {
537             string category;
538             if (s_definedCategories.TryGetValue(categoryName, out category) && !categoryName.Equals(s_internalRegexIgnoreCase))
539             {
540                 if (caseInsensitive)
541                 {
542                     if (categoryName.Equals("Ll") || categoryName.Equals("Lu") || categoryName.Equals("Lt"))
543                         // when RegexOptions.IgnoreCase is specified then {Ll}, {Lu}, and {Lt} cases should all match
544                         category = s_definedCategories[s_internalRegexIgnoreCase];
545                 }
546 
547                 if (invert)
548                     category = NegateCategory(category); // negate the category
549 
550                 _categories.Append(category);
551             }
552             else
553                 AddSet(SetFromProperty(categoryName, invert, pattern));
554         }
555 
AddCategory(string category)556         private void AddCategory(string category)
557         {
558             _categories.Append(category);
559         }
560 
561         /// <summary>
562         /// Adds to the class any lowercase versions of characters already
563         /// in the class. Used for case-insensitivity.
564         /// </summary>
AddLowercase(CultureInfo culture)565         internal void AddLowercase(CultureInfo culture)
566         {
567             _canonical = false;
568 
569             int count = _rangelist.Count;
570             for (int i = 0; i < count; i++)
571             {
572                 SingleRange range = _rangelist[i];
573                 if (range._first == range._last)
574                 {
575                     char lower = culture.TextInfo.ToLower(range._first);
576                     _rangelist[i] = new SingleRange(lower, lower);
577                 }
578                 else
579                 {
580                     AddLowercaseRange(range._first, range._last, culture);
581                 }
582             }
583         }
584 
        /// <summary>
        /// For a single range that's in the set, adds any additional ranges
        /// necessary to ensure that lowercase equivalents are also included.
        /// The mappings are taken from s_lcTable.
        /// </summary>
        /// <param name="chMin">First character of the range (inclusive).</param>
        /// <param name="chMax">Last character of the range (inclusive).</param>
        /// <param name="culture">Not read by this method.</param>
        private void AddLowercaseRange(char chMin, char chMax, CultureInfo culture)
        {
            int i, iMax, iMid;
            char chMinT, chMaxT;
            LowerCaseMapping lc;

            // Binary search for the first table entry whose upper bound is not below chMin,
            // i.e. the first entry that could overlap [chMin, chMax].
            for (i = 0, iMax = s_lcTable.Length; i < iMax;)
            {
                iMid = (i + iMax) / 2;
                if (s_lcTable[iMid]._chMax < chMin)
                    i = iMid + 1;
                else
                    iMax = iMid;
            }

            // Every table entry lies entirely below chMin: nothing to add.
            if (i >= s_lcTable.Length)
                return;

            // Walk the consecutive entries that start at or before chMax.
            for (; i < s_lcTable.Length && (lc = s_lcTable[i])._chMin <= chMax; i++)
            {
                // Clip the table interval to the requested range.
                if ((chMinT = lc._chMin) < chMin)
                    chMinT = chMin;

                if ((chMaxT = lc._chMax) > chMax)
                    chMaxT = chMax;

                // Map both endpoints of the clipped interval through the entry's operation.
                switch (lc._lcOp)
                {
                    case LowercaseSet:
                        chMinT = (char)lc._data;
                        chMaxT = (char)lc._data;
                        break;
                    case LowercaseAdd:
                        // The offset may be negative; unchecked lets the char arithmetic wrap.
                        unchecked
                        {
                            chMinT += (char)lc._data;
                            chMaxT += (char)lc._data;
                        }
                        break;
                    case LowercaseBor:
                        chMinT |= (char)1;
                        chMaxT |= (char)1;
                        break;
                    case LowercaseBad:
                        chMinT += (char)(chMinT & 1);
                        chMaxT += (char)(chMaxT & 1);
                        break;
                }

                // Only add the mapped interval if it reaches outside the original range.
                if (chMinT < chMin || chMaxT > chMax)
                    AddRange(chMinT, chMaxT);
            }
        }
642 
AddWord(bool ecma, bool negate)643         internal void AddWord(bool ecma, bool negate)
644         {
645             if (negate)
646             {
647                 if (ecma)
648                     AddSet(NotECMAWordSet);
649                 else
650                     AddCategory(s_notWord);
651             }
652             else
653             {
654                 if (ecma)
655                     AddSet(ECMAWordSet);
656                 else
657                     AddCategory(s_word);
658             }
659         }
660 
AddSpace(bool ecma, bool negate)661         internal void AddSpace(bool ecma, bool negate)
662         {
663             if (negate)
664             {
665                 if (ecma)
666                     AddSet(NotECMASpaceSet);
667                 else
668                     AddCategory(s_notSpace);
669             }
670             else
671             {
672                 if (ecma)
673                     AddSet(ECMASpaceSet);
674                 else
675                     AddCategory(s_space);
676             }
677         }
678 
AddDigit(bool ecma, bool negate, string pattern)679         internal void AddDigit(bool ecma, bool negate, string pattern)
680         {
681             if (ecma)
682             {
683                 if (negate)
684                     AddSet(NotECMADigitSet);
685                 else
686                     AddSet(ECMADigitSet);
687             }
688             else
689                 AddCategoryFromName("Nd", negate, false, pattern);
690         }
691 
ConvertOldStringsToClass(string set, string category)692         internal static string ConvertOldStringsToClass(string set, string category)
693         {
694             StringBuilder sb = StringBuilderCache.Acquire(set.Length + category.Length + 3);
695 
696             if (set.Length >= 2 && set[0] == '\0' && set[1] == '\0')
697             {
698                 sb.Append((char)0x1);
699                 sb.Append((char)(set.Length - 2));
700                 sb.Append((char)category.Length);
701                 sb.Append(set.Substring(2));
702             }
703             else
704             {
705                 sb.Append((char)0x0);
706                 sb.Append((char)set.Length);
707                 sb.Append((char)category.Length);
708                 sb.Append(set);
709             }
710             sb.Append(category);
711 
712             return StringBuilderCache.GetStringAndRelease(sb);
713         }
714 
715         /// <summary>
716         /// Returns the char
717         /// </summary>
SingletonChar(string set)718         internal static char SingletonChar(string set)
719         {
720             Debug.Assert(IsSingleton(set) || IsSingletonInverse(set), "Tried to get the singleton char out of a non singleton character class");
721             return set[SETSTART];
722         }
723 
IsMergeable(string charClass)724         internal static bool IsMergeable(string charClass)
725         {
726             return (!IsNegated(charClass) && !IsSubtraction(charClass));
727         }
728 
IsEmpty(string charClass)729         internal static bool IsEmpty(string charClass)
730         {
731             if (charClass[CATEGORYLENGTH] == 0 && charClass[FLAGS] == 0 && charClass[SETLENGTH] == 0 && !IsSubtraction(charClass))
732                 return true;
733             else
734                 return false;
735         }
736 
737         /// <summary>
738         /// <c>true</c> if the set contains a single character only
739         /// </summary>
IsSingleton(string set)740         internal static bool IsSingleton(string set)
741         {
742             if (set[FLAGS] == 0 && set[CATEGORYLENGTH] == 0 && set[SETLENGTH] == 2 && !IsSubtraction(set) &&
743                 (set[SETSTART] == LastChar || set[SETSTART] + 1 == set[SETSTART + 1]))
744                 return true;
745             else
746                 return false;
747         }
748 
IsSingletonInverse(string set)749         internal static bool IsSingletonInverse(string set)
750         {
751             if (set[FLAGS] == 1 && set[CATEGORYLENGTH] == 0 && set[SETLENGTH] == 2 && !IsSubtraction(set) &&
752                 (set[SETSTART] == LastChar || set[SETSTART] + 1 == set[SETSTART + 1]))
753                 return true;
754             else
755                 return false;
756         }
757 
IsSubtraction(string charClass)758         private static bool IsSubtraction(string charClass)
759         {
760             return (charClass.Length > SETSTART + charClass[SETLENGTH] + charClass[CATEGORYLENGTH]);
761         }
762 
IsNegated(string set)763         internal static bool IsNegated(string set)
764         {
765             return (set != null && set[FLAGS] == 1);
766         }
767 
IsECMAWordChar(char ch)768         internal static bool IsECMAWordChar(char ch)
769         {
770             // According to ECMA-262, \s, \S, ., ^, and $ use Unicode-based interpretations of
771             // whitespace and newline, while \d, \D\, \w, \W, \b, and \B use ASCII-only
772             // interpretations of digit, word character, and word boundary.  In other words,
773             // no special treatment of Unicode ZERO WIDTH NON-JOINER (ZWNJ U+200C) and
774             // ZERO WIDTH JOINER (ZWJ U+200D) is required for ECMA word boundaries.
775             return CharInClass(ch, ECMAWordClass);
776         }
777 
IsWordChar(char ch)778         internal static bool IsWordChar(char ch)
779         {
780             // According to UTS#18 Unicode Regular Expressions (http://www.unicode.org/reports/tr18/)
781             // RL 1.4 Simple Word Boundaries  The class of <word_character> includes all Alphabetic
782             // values from the Unicode character database, from UnicodeData.txt [UData], plus the U+200C
783             // ZERO WIDTH NON-JOINER and U+200D ZERO WIDTH JOINER.
784             return CharInClass(ch, WordClass) || ch == ZeroWidthJoiner || ch == ZeroWidthNonJoiner;
785         }
786 
CharInClass(char ch, string set)787         internal static bool CharInClass(char ch, string set)
788         {
789             return CharInClassRecursive(ch, set, 0);
790         }
791 
792 
CharInClassRecursive(char ch, string set, int start)793         internal static bool CharInClassRecursive(char ch, string set, int start)
794         {
795             int mySetLength = set[start + SETLENGTH];
796             int myCategoryLength = set[start + CATEGORYLENGTH];
797             int myEndPosition = start + SETSTART + mySetLength + myCategoryLength;
798 
799             bool subtracted = false;
800 
801             if (set.Length > myEndPosition)
802             {
803                 subtracted = CharInClassRecursive(ch, set, myEndPosition);
804             }
805 
806             bool b = CharInClassInternal(ch, set, start, mySetLength, myCategoryLength);
807 
808             // Note that we apply the negation *before* performing the subtraction.  This is because
809             // the negation only applies to the first char class, not the entire subtraction.
810             if (set[start + FLAGS] == 1)
811                 b = !b;
812 
813             return b && !subtracted;
814         }
815 
816         /// <summary>
817         /// Determines a character's membership in a character class (via the
818         /// string representation of the class).
819         /// </summary>
CharInClassInternal(char ch, string set, int start, int mySetLength, int myCategoryLength)820         private static bool CharInClassInternal(char ch, string set, int start, int mySetLength, int myCategoryLength)
821         {
822             int min;
823             int max;
824             int mid;
825             min = start + SETSTART;
826             max = min + mySetLength;
827 
828             while (min != max)
829             {
830                 mid = (min + max) / 2;
831                 if (ch < set[mid])
832                     max = mid;
833                 else
834                     min = mid + 1;
835             }
836 
837             // The starting position of the set within the character class determines
838             // whether what an odd or even ending position means.  If the start is odd,
839             // an *even* ending position means the character was in the set.  With recursive
840             // subtractions in the mix, the starting position = start+SETSTART.  Since we know that
841             // SETSTART is odd, we can simplify it out of the equation.  But if it changes we need to
842             // reverse this check.
843             Debug.Assert((SETSTART & 0x1) == 1, "If SETSTART is not odd, the calculation below this will be reversed");
844             if ((min & 0x1) == (start & 0x1))
845                 return true;
846             else
847             {
848                 if (myCategoryLength == 0)
849                     return false;
850 
851                 return CharInCategory(ch, set, start, mySetLength, myCategoryLength);
852             }
853         }
854 
CharInCategory(char ch, string set, int start, int mySetLength, int myCategoryLength)855         private static bool CharInCategory(char ch, string set, int start, int mySetLength, int myCategoryLength)
856         {
857             UnicodeCategory chcategory = CharUnicodeInfo.GetUnicodeCategory(ch);
858 
859             int i = start + SETSTART + mySetLength;
860             int end = i + myCategoryLength;
861             while (i < end)
862             {
863                 int curcat = unchecked((short)set[i]);
864 
865                 if (curcat == 0)
866                 {
867                     // zero is our marker for a group of categories - treated as a unit
868                     if (CharInCategoryGroup(ch, chcategory, set, ref i))
869                         return true;
870                 }
871                 else if (curcat > 0)
872                 {
873                     // greater than zero is a positive case
874 
875                     if (curcat == SpaceConst)
876                     {
877                         if (char.IsWhiteSpace(ch))
878                             return true;
879                         else
880                         {
881                             i++;
882                             continue;
883                         }
884                     }
885                     --curcat;
886 
887                     if (chcategory == (UnicodeCategory)curcat)
888                         return true;
889                 }
890                 else
891                 {
892                     // less than zero is a negative case
893                     if (curcat == NotSpaceConst)
894                     {
895                         if (!char.IsWhiteSpace(ch))
896                             return true;
897                         else
898                         {
899                             i++;
900                             continue;
901                         }
902                     }
903 
904                     //curcat = -curcat;
905                     //--curcat;
906                     curcat = -1 - curcat;
907 
908                     if (chcategory != (UnicodeCategory)curcat)
909                         return true;
910                 }
911                 i++;
912             }
913             return false;
914         }
915 
916         /// <summary>
917         /// This is used for categories which are composed of other categories - L, N, Z, W...
918         /// These groups need special treatment when they are negated
919         /// </summary>
CharInCategoryGroup(char ch, UnicodeCategory chcategory, string category, ref int i)920         private static bool CharInCategoryGroup(char ch, UnicodeCategory chcategory, string category, ref int i)
921         {
922             i++;
923 
924             int curcat = unchecked((short)category[i]);
925             if (curcat > 0)
926             {
927                 // positive case - the character must be in ANY of the categories in the group
928                 bool answer = false;
929 
930                 while (curcat != 0)
931                 {
932                     if (!answer)
933                     {
934                         --curcat;
935                         if (chcategory == (UnicodeCategory)curcat)
936                             answer = true;
937                     }
938                     i++;
939                     curcat = (short)category[i];
940                 }
941                 return answer;
942             }
943             else
944             {
945                 // negative case - the character must be in NONE of the categories in the group
946                 bool answer = true;
947 
948                 while (curcat != 0)
949                 {
950                     if (answer)
951                     {
952                         //curcat = -curcat;
953                         //--curcat;
954                         curcat = -1 - curcat;
955                         if (chcategory == (UnicodeCategory)curcat)
956                             answer = false;
957                     }
958                     i++;
959                     curcat = unchecked((short)category[i]);
960                 }
961                 return answer;
962             }
963         }
964 
NegateCategory(string category)965         private static string NegateCategory(string category)
966         {
967             if (category == null)
968                 return null;
969 
970             StringBuilder sb = StringBuilderCache.Acquire(category.Length);
971 
972             for (int i = 0; i < category.Length; i++)
973             {
974                 short ch = (short)category[i];
975                 sb.Append(unchecked((char)-ch));
976             }
977             return StringBuilderCache.GetStringAndRelease(sb);
978         }
979 
Parse(string charClass)980         internal static RegexCharClass Parse(string charClass)
981         {
982             return ParseRecursive(charClass, 0);
983         }
984 
ParseRecursive(string charClass, int start)985         private static RegexCharClass ParseRecursive(string charClass, int start)
986         {
987             int mySetLength = charClass[start + SETLENGTH];
988             int myCategoryLength = charClass[start + CATEGORYLENGTH];
989             int myEndPosition = start + SETSTART + mySetLength + myCategoryLength;
990 
991             List<SingleRange> ranges = new List<SingleRange>(mySetLength);
992             int i = start + SETSTART;
993             int end = i + mySetLength;
994             while (i < end)
995             {
996                 char first = charClass[i];
997                 i++;
998 
999                 char last;
1000                 if (i < end)
1001                     last = (char)(charClass[i] - 1);
1002                 else
1003                     last = LastChar;
1004                 i++;
1005                 ranges.Add(new SingleRange(first, last));
1006             }
1007 
1008             RegexCharClass sub = null;
1009             if (charClass.Length > myEndPosition)
1010                 sub = ParseRecursive(charClass, myEndPosition);
1011 
1012             return new RegexCharClass(charClass[start + FLAGS] == 1, ranges, new StringBuilder(charClass.Substring(end, myCategoryLength)), sub);
1013         }
1014 
1015         /// <summary>
1016         /// The number of single ranges that have been accumulated so far.
1017         /// </summary>
RangeCount()1018         private int RangeCount()
1019         {
1020             return _rangelist.Count;
1021         }
1022 
/// <summary>
/// Serializes this class into its string form: one flags char, the set length,
/// the category length, the range boundaries, the categories and, when a
/// subtractor is present, the subtractor's own serialized form.
/// </summary>
/// <returns>The string representation of this character class.</returns>
internal string ToStringClass()
{
    // Ranges are written out only after they have been sorted and merged.
    if (!_canonical)
    {
        Canonicalize();
    }

    // Two chars per range is only an upper bound: a range that ends at LastChar
    // contributes just its first char, so the real length is patched in below.
    int estimatedSetLength = _rangelist.Count * 2;
    StringBuilder builder = StringBuilderCache.Acquire(estimatedSetLength + _categories.Length + 3);

    // Three-char header: flags, set length (provisional), category length.
    builder.Append((char)(_negate ? 1 : 0));
    builder.Append((char)estimatedSetLength);
    builder.Append((char)_categories.Length);

    foreach (SingleRange range in _rangelist)
    {
        builder.Append(range._first);

        // The end boundary is stored as "last + 1"; it is omitted when the range runs to LastChar.
        if (range._last != LastChar)
        {
            builder.Append((char)(range._last + 1));
        }
    }

    // Overwrite the provisional set length with the number of chars actually written.
    builder[SETLENGTH] = (char)(builder.Length - SETSTART);

    builder.Append(_categories);

    if (_subtractor != null)
    {
        builder.Append(_subtractor.ToStringClass());
    }

    return StringBuilderCache.GetStringAndRelease(builder);
}
1065 
/// <summary>
/// Fetches the range stored at position <paramref name="i"/> of the range list.
/// </summary>
/// <param name="i">Zero-based index into <c>_rangelist</c>.</param>
/// <returns>The range at that index.</returns>
private SingleRange GetRangeAt(int i) => _rangelist[i];
1073 
/// <summary>
/// Brings the range list into canonical form: sorted by first char, with
/// overlapping or abutting ranges merged into one. Marks the class as canonical.
/// </summary>
private void Canonicalize()
{
    _canonical = true;
    _rangelist.Sort(SingleRangeComparer.Instance);

    // With zero or one range there is nothing to merge.
    if (_rangelist.Count <= 1)
    {
        return;
    }

    int write = 0; // slot that receives the merged range currently being built
    int read = 1;  // next range that has not been examined yet

    while (true)
    {
        char mergedLast = _rangelist[write]._last;
        bool finished = false;

        // Absorb every following range that overlaps or abuts the one being built.
        while (true)
        {
            // Either the input is exhausted, or the merged range already reaches
            // LastChar and therefore swallows everything that follows.
            if (read == _rangelist.Count || mergedLast == LastChar)
            {
                finished = true;
                break;
            }

            SingleRange candidate = _rangelist[read];

            // A gap of at least one char: the candidate starts a new range.
            if (candidate._first > mergedLast + 1)
            {
                break;
            }

            if (candidate._last > mergedLast)
            {
                mergedLast = candidate._last;
            }

            read++;
        }

        _rangelist[write] = new SingleRange(_rangelist[write]._first, mergedLast);
        write++;

        if (finished)
        {
            break;
        }

        // Seed the next slot with the range that could not be merged.
        if (write < read)
        {
            _rangelist[write] = _rangelist[read];
        }

        read++;
    }

    // Drop the leftover tail that has been merged into earlier slots.
    _rangelist.RemoveRange(write, _rangelist.Count - write);
}
1126 
/// <summary>
/// Looks up the set string registered for a property name in <c>s_propTable</c>.
/// The lookup is a binary search using ordinal comparison, so it relies on the
/// table being ordered by name accordingly.
/// </summary>
/// <param name="capname">The property name to look up.</param>
/// <param name="invert">When true, the inverted form of the set is returned.</param>
/// <param name="pattern">The pattern text, used only to build the error message.</param>
/// <returns>The set string for the property, inverted if requested.</returns>
/// <exception cref="ArgumentException">No table entry matches <paramref name="capname"/>.</exception>
private static string SetFromProperty(string capname, bool invert, string pattern)
{
    int low = 0;
    int high = s_propTable.Length;

    while (low != high)
    {
        int middle = (low + high) / 2;
        int comparison = string.Compare(capname, s_propTable[middle][0], StringComparison.Ordinal);

        if (comparison < 0)
        {
            high = middle;
            continue;
        }

        if (comparison > 0)
        {
            low = middle + 1;
            continue;
        }

        string set = s_propTable[middle][1];
        Debug.Assert(!string.IsNullOrEmpty(set), "Found a null/empty element in RegexCharClass prop table");

        if (!invert)
        {
            return set;
        }

        // Inversion toggles a leading NullChar: remove it when present, prepend it otherwise.
        return set[0] == NullChar ? set.Substring(1) : NullCharString + set;
    }

    throw new ArgumentException(SR.Format(SR.MakeException, pattern, SR.Format(SR.UnknownProperty, capname)));
}
1159 
1160 #if DEBUG
1161 
/// <summary>
/// Produces a human-readable description for a set string.
/// </summary>
/// <param name="set">
/// A serialized character class: the header chars, then the range set, then the
/// categories, optionally followed by a serialized subtractor class.
/// </param>
/// <returns>
/// A bracketed description of the ranges and categories; a subtractor, if present,
/// is described recursively and appended after a '-'.
/// </returns>
internal static string SetDescription(string set)
{
    int mySetLength = set[SETLENGTH];
    int myCategoryLength = set[CATEGORYLENGTH];
    int mySetEnd = SETSTART + mySetLength;
    int myEndPosition = mySetEnd + myCategoryLength;

    StringBuilder desc = new StringBuilder();

    desc.Append('[');

    int index = SETSTART;
    char ch1;
    char ch2;

    if (IsNegated(set))
        desc.Append('^');

    while (index < mySetEnd)
    {
        ch1 = set[index];

        // The end boundary must come from inside the set portion. Testing against
        // set.Length would misread a following category (or subtractor) char as
        // the boundary when the last range runs to LastChar and so has no stored end.
        if (index + 1 < mySetEnd)
            ch2 = (char)(set[index + 1] - 1);
        else
            ch2 = LastChar;

        desc.Append(CharDescription(ch1));

        if (ch2 != ch1)
        {
            // Two adjacent chars are listed side by side; wider ranges get a '-'.
            if (ch1 + 1 != ch2)
                desc.Append('-');
            desc.Append(CharDescription(ch2));
        }
        index += 2;
    }

    // A set of odd length leaves index one past the set; realign it so the first
    // category char is not skipped.
    index = mySetEnd;

    while (index < myEndPosition)
    {
        ch1 = set[index];
        if (ch1 == 0)
        {
            // A zero char opens a category group that extends to the next GroupChar.
            bool found = false;

            int lastindex = set.IndexOf(GroupChar, index + 1);
            string group = set.Substring(index, lastindex - index + 1);

            foreach (var kvp in s_definedCategories)
            {
                if (group.Equals(kvp.Value))
                {
                    if ((short)set[index + 1] > 0)
                        desc.Append("\\p{");
                    else
                        desc.Append("\\P{");

                    desc.Append(kvp.Key);
                    desc.Append('}');

                    found = true;
                    break;
                }
            }

            if (!found)
            {
                if (group.Equals(s_word))
                    desc.Append("\\w");
                else if (group.Equals(s_notWord))
                    desc.Append("\\W");
                else
                    Debug.Fail("Couldn't find a group to match '" + group + "'");
            }

            index = lastindex;
        }
        else
        {
            desc.Append(CategoryDescription(ch1));
        }

        index++;
    }

    // Anything beyond this class's own data is a serialized subtractor class.
    if (set.Length > myEndPosition)
    {
        desc.Append('-');
        desc.Append(SetDescription(set.Substring(myEndPosition)));
    }

    desc.Append(']');

    return desc.ToString();
}
1258 
// Lowercase hexadecimal digits, indexed by nibble value (used by CharDescription).
internal static readonly char[] Hex = "0123456789abcdef".ToCharArray();

// Short category names used by CategoryDescription, which indexes this table
// with the category value minus one.
// NOTE(review): the entry order presumably has to line up with the encoded category values; confirm before reordering.
internal static readonly string[] Categories =
{
    "Lu", "Ll", "Lt", "Lm", "Lo", s_internalRegexIgnoreCase,
    "Mn", "Mc", "Me",
    "Nd", "Nl", "No",
    "Zs", "Zl", "Zp",
    "Cc", "Cf", "Cs", "Co",
    "Pc", "Pd", "Ps", "Pe", "Pi", "Pf", "Po",
    "Sm", "Sc", "Sk", "So",
    "Cn",
};
1268 
/// <summary>
/// Renders one character in readable form: a backslash is doubled, printable
/// ASCII (' ' through '~') is returned as-is, and anything else becomes a
/// <c>\xHH</c> escape (below 256) or a <c>\uHHHH</c> escape, in lowercase hex.
/// </summary>
/// <param name="ch">The character to describe.</param>
/// <returns>The readable form of <paramref name="ch"/>.</returns>
internal static string CharDescription(char ch)
{
    if (ch == '\\')
    {
        return "\\\\";
    }

    bool printableAscii = ch >= ' ' && ch <= '~';
    if (printableAscii)
    {
        return ch.ToString();
    }

    bool fitsInTwoDigits = ch < 256;
    int bitWidth = fitsInTwoDigits ? 8 : 16;
    StringBuilder escaped = new StringBuilder(fitsInTwoDigits ? "\\x" : "\\u");

    // Emit one hex digit per nibble, most significant first.
    for (int shift = bitWidth - 4; shift >= 0; shift -= 4)
    {
        escaped.Append(Hex[(ch >> shift) & 0xF]);
    }

    return escaped.ToString();
}
1304 
/// <summary>
/// Renders one category char in readable form: <c>\s</c> or <c>\S</c> for the
/// space constants, otherwise <c>\p{..}</c> for a positive value and <c>\P{..}</c>
/// for a value that is negative when read as a signed 16-bit number.
/// </summary>
/// <param name="ch">The category char taken from a set string.</param>
/// <returns>The readable form of the category.</returns>
private static string CategoryDescription(char ch)
{
    if (ch == SpaceConst)
    {
        return "\\s";
    }

    // Negated categories are stored as negative 16-bit values.
    short signedValue = (short)ch;
    if (signedValue == NotSpaceConst)
    {
        return "\\S";
    }

    bool negated = signedValue < 0;

    // The table is indexed by the magnitude of the category value, minus one.
    int tableIndex = (negated ? -signedValue : ch) - 1;

    return (negated ? "\\P{" : "\\p{") + Categories[tableIndex] + "}";
}
1320 
1321 #endif
1322 
/// <summary>
/// Immutable descriptor for one lower case mapping entry.
/// </summary>
private readonly struct LowerCaseMapping
{
    // NOTE(review): presumably the bounds of the char range this entry applies to; confirm against the code that reads the mapping table.
    internal readonly char _chMin;
    internal readonly char _chMax;

    // Operation code and its accompanying data; how they are interpreted is
    // decided by the consumer of the table, which is not part of this struct.
    internal readonly int _lcOp;
    internal readonly int _data;

    /// <summary>
    /// Creates a descriptor, storing each argument in the field of the same name.
    /// </summary>
    internal LowerCaseMapping(char chMin, char chMax, int lcOp, int data)
    {
        _chMin = chMin;
        _chMax = chMax;
        _lcOp = lcOp;
        _data = data;
    }
}
1341 
/// <summary>
/// Orders ranges by their first char only; the last char plays no part in the comparison.
/// </summary>
private sealed class SingleRangeComparer : IComparer<SingleRange>
{
    /// <summary>
    /// The single shared instance; the constructor is private, so no other instance can be created.
    /// </summary>
    public static readonly SingleRangeComparer Instance = new SingleRangeComparer();

    private SingleRangeComparer() { }

    /// <summary>
    /// Compares two ranges by the value of their first char.
    /// </summary>
    public int Compare(SingleRange x, SingleRange y) => x._first.CompareTo(y._first);
}
1358 
/// <summary>
/// An immutable first/last pair describing one range of characters.
/// </summary>
private readonly struct SingleRange
{
    // First and last char of the range, exactly as passed to the constructor.
    internal readonly char _first;
    internal readonly char _last;

    /// <summary>
    /// Creates a range from its first and last char.
    /// </summary>
    internal SingleRange(char first, char last)
    {
        _first = first;
        _last = last;
    }
}
1373     }
1374 }
1375