1 // Licensed to the .NET Foundation under one or more agreements.
2 // The .NET Foundation licenses this file to you under the MIT license.
3 // See the LICENSE file in the project root for more information.
4 
5 // The RegexBoyerMoore object precomputes the Boyer-Moore
6 // tables for fast string scanning. These tables allow
7 // you to scan for the first occurrence of a string within
8 // a large body of text without examining every character.
9 // The performance of the heuristic depends on the actual
10 // string and the text being searched, but usually, the longer
11 // the string that is being searched for, the fewer characters
12 // need to be examined.
13 
14 using System.Diagnostics;
15 using System.Globalization;
16 using System.IO;
17 
18 namespace System.Text.RegularExpressions
19 {
20     internal sealed class RegexBoyerMoore
21     {
22         internal readonly int[] _positive;
23         internal readonly int[] _negativeASCII;
24         internal readonly int[][] _negativeUnicode;
25         internal readonly string _pattern;
26         internal readonly int _lowASCII;
27         internal readonly int _highASCII;
28         private readonly bool _rightToLeft;
29         internal readonly bool _caseInsensitive;
30         private readonly CultureInfo _culture;
31 
32         /// <summary>
33         /// Constructs a Boyer-Moore state machine for searching for the string
34         /// pattern. The string must not be zero-length.
35         /// </summary>
RegexBoyerMoore(string pattern, bool caseInsensitive, bool rightToLeft, CultureInfo culture)36         internal RegexBoyerMoore(string pattern, bool caseInsensitive, bool rightToLeft, CultureInfo culture)
37         {
38             // Sorry, you just can't use Boyer-Moore to find an empty pattern.
39             // We're doing this for your own protection. (Really, for speed.)
40             Debug.Assert(pattern.Length != 0, "RegexBoyerMoore called with an empty string. This is bad for perf");
41 
42             int beforefirst;
43             int last;
44             int bump;
45             int examine;
46             int scan;
47             int match;
48             char ch;
49 
50             // We do the ToLower character by character for consistency.  With surrogate chars, doing
51             // a ToLower on the entire string could actually change the surrogate pair.  This is more correct
52             // linguistically, but since Regex doesn't support surrogates, it's more important to be
53             // consistent.
54             if (caseInsensitive)
55             {
56                 StringBuilder sb = StringBuilderCache.Acquire(pattern.Length);
57                 for (int i = 0; i < pattern.Length; i++)
58                     sb.Append(culture.TextInfo.ToLower(pattern[i]));
59                 pattern = StringBuilderCache.GetStringAndRelease(sb);
60             }
61 
62             _pattern = pattern;
63             _rightToLeft = rightToLeft;
64             _caseInsensitive = caseInsensitive;
65             _culture = culture;
66 
67             if (!rightToLeft)
68             {
69                 beforefirst = -1;
70                 last = pattern.Length - 1;
71                 bump = 1;
72             }
73             else
74             {
75                 beforefirst = pattern.Length;
76                 last = 0;
77                 bump = -1;
78             }
79 
80             // PART I - the good-suffix shift table
81             //
82             // compute the positive requirement:
83             // if char "i" is the first one from the right that doesn't match,
84             // then we know the matcher can advance by _positive[i].
85             //
86             // This algorithm is a simplified variant of the standard
87             // Boyer-Moore good suffix calculation.
88 
89             _positive = new int[pattern.Length];
90 
91             examine = last;
92             ch = pattern[examine];
93             _positive[examine] = bump;
94             examine -= bump;
95 
96             for (; ;)
97             {
98                 // find an internal char (examine) that matches the tail
99 
100                 for (; ;)
101                 {
102                     if (examine == beforefirst)
103                         goto OuterloopBreak;
104                     if (pattern[examine] == ch)
105                         break;
106                     examine -= bump;
107                 }
108 
109                 match = last;
110                 scan = examine;
111 
112                 // find the length of the match
113 
114                 for (; ;)
115                 {
116                     if (scan == beforefirst || pattern[match] != pattern[scan])
117                     {
118                         // at the end of the match, note the difference in _positive
119                         // this is not the length of the match, but the distance from the internal match
120                         // to the tail suffix.
121                         if (_positive[match] == 0)
122                             _positive[match] = match - scan;
123 
124                         // System.Diagnostics.Debug.WriteLine("Set positive[" + match + "] to " + (match - scan));
125 
126                         break;
127                     }
128 
129                     scan -= bump;
130                     match -= bump;
131                 }
132 
133                 examine -= bump;
134             }
135 
136         OuterloopBreak:
137 
138             match = last - bump;
139 
140             // scan for the chars for which there are no shifts that yield a different candidate
141 
142 
143             // The inside of the if statement used to say
144             // "_positive[match] = last - beforefirst;"
145             // This is slightly less aggressive in how much we skip, but at worst it
146             // should mean a little more work rather than skipping a potential match.
147             while (match != beforefirst)
148             {
149                 if (_positive[match] == 0)
150                     _positive[match] = bump;
151 
152                 match -= bump;
153             }
154 
155             // PART II - the bad-character shift table
156             //
157             // compute the negative requirement:
158             // if char "ch" is the reject character when testing position "i",
159             // we can slide up by _negative[ch];
160             // (_negative[ch] = str.Length - 1 - str.LastIndexOf(ch))
161             //
162             // the lookup table is divided into ASCII and Unicode portions;
163             // only those parts of the Unicode 16-bit code set that actually
164             // appear in the string are in the table. (Maximum size with
165             // Unicode is 65K; ASCII only case is 512 bytes.)
166 
167             _negativeASCII = new int[128];
168 
169             for (int i = 0; i < 128; i++)
170                 _negativeASCII[i] = last - beforefirst;
171 
172             _lowASCII = 127;
173             _highASCII = 0;
174 
175             for (examine = last; examine != beforefirst; examine -= bump)
176             {
177                 ch = pattern[examine];
178 
179                 if (ch < 128)
180                 {
181                     if (_lowASCII > ch)
182                         _lowASCII = ch;
183 
184                     if (_highASCII < ch)
185                         _highASCII = ch;
186 
187                     if (_negativeASCII[ch] == last - beforefirst)
188                         _negativeASCII[ch] = last - examine;
189                 }
190                 else
191                 {
192                     int i = ch >> 8;
193                     int j = ch & 0xFF;
194 
195                     if (_negativeUnicode == null)
196                     {
197                         _negativeUnicode = new int[256][];
198                     }
199 
200                     if (_negativeUnicode[i] == null)
201                     {
202                         int[] newarray = new int[256];
203 
204                         for (int k = 0; k < newarray.Length; k++)
205                             newarray[k] = last - beforefirst;
206 
207                         if (i == 0)
208                         {
209                             Array.Copy(_negativeASCII, 0, newarray, 0, 128);
210                             _negativeASCII = newarray;
211                         }
212 
213                         _negativeUnicode[i] = newarray;
214                     }
215 
216                     if (_negativeUnicode[i][j] == last - beforefirst)
217                         _negativeUnicode[i][j] = last - examine;
218                 }
219             }
220         }
221 
MatchPattern(string text, int index)222         private bool MatchPattern(string text, int index)
223         {
224             if (_caseInsensitive)
225             {
226                 if (text.Length - index < _pattern.Length)
227                 {
228                     return false;
229                 }
230 
231                 TextInfo textinfo = _culture.TextInfo;
232                 for (int i = 0; i < _pattern.Length; i++)
233                 {
234                     Debug.Assert(textinfo.ToLower(_pattern[i]) == _pattern[i], "pattern should be converted to lower case in constructor!");
235                     if (textinfo.ToLower(text[index + i]) != _pattern[i])
236                     {
237                         return false;
238                     }
239                 }
240                 return true;
241             }
242             else
243             {
244                 return (0 == string.CompareOrdinal(_pattern, 0, text, index, _pattern.Length));
245             }
246         }
247 
248         /// <summary>
249         /// When a regex is anchored, we can do a quick IsMatch test instead of a Scan
250         /// </summary>
IsMatch(string text, int index, int beglimit, int endlimit)251         internal bool IsMatch(string text, int index, int beglimit, int endlimit)
252         {
253             if (!_rightToLeft)
254             {
255                 if (index < beglimit || endlimit - index < _pattern.Length)
256                     return false;
257 
258                 return MatchPattern(text, index);
259             }
260             else
261             {
262                 if (index > endlimit || index - beglimit < _pattern.Length)
263                     return false;
264 
265                 return MatchPattern(text, index - _pattern.Length);
266             }
267         }
268 
269         /// <summary>
270         /// Scan uses the Boyer-Moore algorithm to find the first occurrence
271         /// of the specified string within text, beginning at index, and
272         /// constrained within beglimit and endlimit.
273         ///
274         /// The direction and case-sensitivity of the match is determined
275         /// by the arguments to the RegexBoyerMoore constructor.
276         /// </summary>
Scan(string text, int index, int beglimit, int endlimit)277         internal int Scan(string text, int index, int beglimit, int endlimit)
278         {
279             int test;
280             int test2;
281             int match;
282             int startmatch;
283             int endmatch;
284             int advance;
285             int defadv;
286             int bump;
287             char chMatch;
288             char chTest;
289             int[] unicodeLookup;
290 
291             if (!_rightToLeft)
292             {
293                 defadv = _pattern.Length;
294                 startmatch = _pattern.Length - 1;
295                 endmatch = 0;
296                 test = index + defadv - 1;
297                 bump = 1;
298             }
299             else
300             {
301                 defadv = -_pattern.Length;
302                 startmatch = 0;
303                 endmatch = -defadv - 1;
304                 test = index + defadv;
305                 bump = -1;
306             }
307 
308             chMatch = _pattern[startmatch];
309 
310             for (; ;)
311             {
312                 if (test >= endlimit || test < beglimit)
313                     return -1;
314 
315                 chTest = text[test];
316 
317                 if (_caseInsensitive)
318                     chTest = _culture.TextInfo.ToLower(chTest);
319 
320                 if (chTest != chMatch)
321                 {
322                     if (chTest < 128)
323                         advance = _negativeASCII[chTest];
324                     else if (null != _negativeUnicode && (null != (unicodeLookup = _negativeUnicode[chTest >> 8])))
325                         advance = unicodeLookup[chTest & 0xFF];
326                     else
327                         advance = defadv;
328 
329                     test += advance;
330                 }
331                 else
332                 { // if (chTest == chMatch)
333                     test2 = test;
334                     match = startmatch;
335 
336                     for (; ;)
337                     {
338                         if (match == endmatch)
339                             return (_rightToLeft ? test2 + 1 : test2);
340 
341                         match -= bump;
342                         test2 -= bump;
343 
344                         chTest = text[test2];
345 
346                         if (_caseInsensitive)
347                             chTest = _culture.TextInfo.ToLower(chTest);
348 
349                         if (chTest != _pattern[match])
350                         {
351                             advance = _positive[match];
352                             if ((chTest & 0xFF80) == 0)
353                                 test2 = (match - startmatch) + _negativeASCII[chTest];
354                             else if (null != _negativeUnicode && (null != (unicodeLookup = _negativeUnicode[chTest >> 8])))
355                                 test2 = (match - startmatch) + unicodeLookup[chTest & 0xFF];
356                             else
357                             {
358                                 test += advance;
359                                 break;
360                             }
361 
362                             if (_rightToLeft ? test2 < advance : test2 > advance)
363                                 advance = test2;
364 
365                             test += advance;
366                             break;
367                         }
368                     }
369                 }
370             }
371         }
372 
373         /// <summary>
374         /// Used when dumping for debugging.
375         /// </summary>
ToString()376         public override string ToString()
377         {
378             return _pattern;
379         }
380 
381 #if DEBUG
Dump(string indent)382         public string Dump(string indent)
383         {
384             StringBuilder sb = new StringBuilder();
385 
386             sb.Append(indent + "BM Pattern: " + _pattern + "\n");
387             sb.Append(indent + "Positive: ");
388             for (int i = 0; i < _positive.Length; i++)
389             {
390                 sb.Append(_positive[i].ToString(CultureInfo.InvariantCulture) + " ");
391             }
392             sb.Append("\n");
393 
394             if (_negativeASCII != null)
395             {
396                 sb.Append(indent + "Negative table\n");
397                 for (int i = 0; i < _negativeASCII.Length; i++)
398                 {
399                     if (_negativeASCII[i] != _pattern.Length)
400                     {
401                         sb.Append(indent + "  " + Regex.Escape(Convert.ToString((char)i, CultureInfo.InvariantCulture)) + " " + _negativeASCII[i].ToString(CultureInfo.InvariantCulture) + "\n");
402                     }
403                 }
404             }
405 
406             return sb.ToString();
407         }
408 #endif
409     }
410 }
411