1 // Licensed to the .NET Foundation under one or more agreements. 2 // The .NET Foundation licenses this file to you under the MIT license. 3 // See the LICENSE file in the project root for more information. 4 5 // The RegexBoyerMoore object precomputes the Boyer-Moore 6 // tables for fast string scanning. These tables allow 7 // you to scan for the first occurrence of a string within 8 // a large body of text without examining every character. 9 // The performance of the heuristic depends on the actual 10 // string and the text being searched, but usually, the longer 11 // the string that is being searched for, the fewer characters 12 // need to be examined. 13 14 using System.Diagnostics; 15 using System.Globalization; 16 using System.IO; 17 18 namespace System.Text.RegularExpressions 19 { 20 internal sealed class RegexBoyerMoore 21 { 22 internal readonly int[] _positive; 23 internal readonly int[] _negativeASCII; 24 internal readonly int[][] _negativeUnicode; 25 internal readonly string _pattern; 26 internal readonly int _lowASCII; 27 internal readonly int _highASCII; 28 private readonly bool _rightToLeft; 29 internal readonly bool _caseInsensitive; 30 private readonly CultureInfo _culture; 31 32 /// <summary> 33 /// Constructs a Boyer-Moore state machine for searching for the string 34 /// pattern. The string must not be zero-length. 35 /// </summary> RegexBoyerMoore(string pattern, bool caseInsensitive, bool rightToLeft, CultureInfo culture)36 internal RegexBoyerMoore(string pattern, bool caseInsensitive, bool rightToLeft, CultureInfo culture) 37 { 38 // Sorry, you just can't use Boyer-Moore to find an empty pattern. 39 // We're doing this for your own protection. (Really, for speed.) 40 Debug.Assert(pattern.Length != 0, "RegexBoyerMoore called with an empty string. This is bad for perf"); 41 42 int beforefirst; 43 int last; 44 int bump; 45 int examine; 46 int scan; 47 int match; 48 char ch; 49 50 // We do the ToLower character by character for consistency. With surrogate chars, doing 51 // a ToLower on the entire string could actually change the surrogate pair. This is more correct 52 // linguistically, but since Regex doesn't support surrogates, it's more important to be 53 // consistent. 54 if (caseInsensitive) 55 { 56 StringBuilder sb = StringBuilderCache.Acquire(pattern.Length); 57 for (int i = 0; i < pattern.Length; i++) 58 sb.Append(culture.TextInfo.ToLower(pattern[i])); 59 pattern = StringBuilderCache.GetStringAndRelease(sb); 60 } 61 62 _pattern = pattern; 63 _rightToLeft = rightToLeft; 64 _caseInsensitive = caseInsensitive; 65 _culture = culture; 66 67 if (!rightToLeft) 68 { 69 beforefirst = -1; 70 last = pattern.Length - 1; 71 bump = 1; 72 } 73 else 74 { 75 beforefirst = pattern.Length; 76 last = 0; 77 bump = -1; 78 } 79 80 // PART I - the good-suffix shift table 81 // 82 // compute the positive requirement: 83 // if char "i" is the first one from the right that doesn't match, 84 // then we know the matcher can advance by _positive[i]. 85 // 86 // This algorithm is a simplified variant of the standard 87 // Boyer-Moore good suffix calculation. 88 89 _positive = new int[pattern.Length]; 90 91 examine = last; 92 ch = pattern[examine]; 93 _positive[examine] = bump; 94 examine -= bump; 95 96 for (; ;) 97 { 98 // find an internal char (examine) that matches the tail 99 100 for (; ;) 101 { 102 if (examine == beforefirst) 103 goto OuterloopBreak; 104 if (pattern[examine] == ch) 105 break; 106 examine -= bump; 107 } 108 109 match = last; 110 scan = examine; 111 112 // find the length of the match 113 114 for (; ;) 115 { 116 if (scan == beforefirst || pattern[match] != pattern[scan]) 117 { 118 // at the end of the match, note the difference in _positive 119 // this is not the length of the match, but the distance from the internal match 120 // to the tail suffix. 121 if (_positive[match] == 0) 122 _positive[match] = match - scan; 123 124 // System.Diagnostics.Debug.WriteLine("Set positive[" + match + "] to " + (match - scan)); 125 126 break; 127 } 128 129 scan -= bump; 130 match -= bump; 131 } 132 133 examine -= bump; 134 } 135 136 OuterloopBreak: 137 138 match = last - bump; 139 140 // scan for the chars for which there are no shifts that yield a different candidate 141 142 143 // The inside of the if statement used to say 144 // "_positive[match] = last - beforefirst;" 145 // This is slightly less aggressive in how much we skip, but at worst it 146 // should mean a little more work rather than skipping a potential match. 147 while (match != beforefirst) 148 { 149 if (_positive[match] == 0) 150 _positive[match] = bump; 151 152 match -= bump; 153 } 154 155 // PART II - the bad-character shift table 156 // 157 // compute the negative requirement: 158 // if char "ch" is the reject character when testing position "i", 159 // we can slide up by _negative[ch]; 160 // (_negative[ch] = str.Length - 1 - str.LastIndexOf(ch)) 161 // 162 // the lookup table is divided into ASCII and Unicode portions; 163 // only those parts of the Unicode 16-bit code set that actually 164 // appear in the string are in the table. (Maximum size with 165 // Unicode is 65K; ASCII only case is 512 bytes.) 166 167 _negativeASCII = new int[128]; 168 169 for (int i = 0; i < 128; i++) 170 _negativeASCII[i] = last - beforefirst; 171 172 _lowASCII = 127; 173 _highASCII = 0; 174 175 for (examine = last; examine != beforefirst; examine -= bump) 176 { 177 ch = pattern[examine]; 178 179 if (ch < 128) 180 { 181 if (_lowASCII > ch) 182 _lowASCII = ch; 183 184 if (_highASCII < ch) 185 _highASCII = ch; 186 187 if (_negativeASCII[ch] == last - beforefirst) 188 _negativeASCII[ch] = last - examine; 189 } 190 else 191 { 192 int i = ch >> 8; 193 int j = ch & 0xFF; 194 195 if (_negativeUnicode == null) 196 { 197 _negativeUnicode = new int[256][]; 198 } 199 200 if (_negativeUnicode[i] == null) 201 { 202 int[] newarray = new int[256]; 203 204 for (int k = 0; k < newarray.Length; k++) 205 newarray[k] = last - beforefirst; 206 207 if (i == 0) 208 { 209 Array.Copy(_negativeASCII, 0, newarray, 0, 128); 210 _negativeASCII = newarray; 211 } 212 213 _negativeUnicode[i] = newarray; 214 } 215 216 if (_negativeUnicode[i][j] == last - beforefirst) 217 _negativeUnicode[i][j] = last - examine; 218 } 219 } 220 } 221 MatchPattern(string text, int index)222 private bool MatchPattern(string text, int index) 223 { 224 if (_caseInsensitive) 225 { 226 if (text.Length - index < _pattern.Length) 227 { 228 return false; 229 } 230 231 TextInfo textinfo = _culture.TextInfo; 232 for (int i = 0; i < _pattern.Length; i++) 233 { 234 Debug.Assert(textinfo.ToLower(_pattern[i]) == _pattern[i], "pattern should be converted to lower case in constructor!"); 235 if (textinfo.ToLower(text[index + i]) != _pattern[i]) 236 { 237 return false; 238 } 239 } 240 return true; 241 } 242 else 243 { 244 return (0 == string.CompareOrdinal(_pattern, 0, text, index, _pattern.Length)); 245 } 246 } 247 248 /// <summary> 249 /// When a regex is anchored, we can do a quick IsMatch test instead of a Scan 250 /// </summary> IsMatch(string text, int index, int beglimit, int endlimit)251 internal bool IsMatch(string text, int index, int beglimit, int endlimit) 252 { 253 if (!_rightToLeft) 254 { 255 if (index < beglimit || endlimit - index < _pattern.Length) 256 return false; 257 258 return MatchPattern(text, index); 259 } 260 else 261 { 262 if (index > endlimit || index - beglimit < _pattern.Length) 263 return false; 264 265 return MatchPattern(text, index - _pattern.Length); 266 } 267 } 268 269 /// <summary> 270 /// Scan uses the Boyer-Moore algorithm to find the first occurrence 271 /// of the specified string within text, beginning at index, and 272 /// constrained within beglimit and endlimit. 273 /// 274 /// The direction and case-sensitivity of the match is determined 275 /// by the arguments to the RegexBoyerMoore constructor. 276 /// </summary> Scan(string text, int index, int beglimit, int endlimit)277 internal int Scan(string text, int index, int beglimit, int endlimit) 278 { 279 int test; 280 int test2; 281 int match; 282 int startmatch; 283 int endmatch; 284 int advance; 285 int defadv; 286 int bump; 287 char chMatch; 288 char chTest; 289 int[] unicodeLookup; 290 291 if (!_rightToLeft) 292 { 293 defadv = _pattern.Length; 294 startmatch = _pattern.Length - 1; 295 endmatch = 0; 296 test = index + defadv - 1; 297 bump = 1; 298 } 299 else 300 { 301 defadv = -_pattern.Length; 302 startmatch = 0; 303 endmatch = -defadv - 1; 304 test = index + defadv; 305 bump = -1; 306 } 307 308 chMatch = _pattern[startmatch]; 309 310 for (; ;) 311 { 312 if (test >= endlimit || test < beglimit) 313 return -1; 314 315 chTest = text[test]; 316 317 if (_caseInsensitive) 318 chTest = _culture.TextInfo.ToLower(chTest); 319 320 if (chTest != chMatch) 321 { 322 if (chTest < 128) 323 advance = _negativeASCII[chTest]; 324 else if (null != _negativeUnicode && (null != (unicodeLookup = _negativeUnicode[chTest >> 8]))) 325 advance = unicodeLookup[chTest & 0xFF]; 326 else 327 advance = defadv; 328 329 test += advance; 330 } 331 else 332 { // if (chTest == chMatch) 333 test2 = test; 334 match = startmatch; 335 336 for (; ;) 337 { 338 if (match == endmatch) 339 return (_rightToLeft ? test2 + 1 : test2); 340 341 match -= bump; 342 test2 -= bump; 343 344 chTest = text[test2]; 345 346 if (_caseInsensitive) 347 chTest = _culture.TextInfo.ToLower(chTest); 348 349 if (chTest != _pattern[match]) 350 { 351 advance = _positive[match]; 352 if ((chTest & 0xFF80) == 0) 353 test2 = (match - startmatch) + _negativeASCII[chTest]; 354 else if (null != _negativeUnicode && (null != (unicodeLookup = _negativeUnicode[chTest >> 8]))) 355 test2 = (match - startmatch) + unicodeLookup[chTest & 0xFF]; 356 else 357 { 358 test += advance; 359 break; 360 } 361 362 if (_rightToLeft ? test2 < advance : test2 > advance) 363 advance = test2; 364 365 test += advance; 366 break; 367 } 368 } 369 } 370 } 371 } 372 373 /// <summary> 374 /// Used when dumping for debugging. 375 /// </summary> ToString()376 public override string ToString() 377 { 378 return _pattern; 379 } 380 381 #if DEBUG Dump(string indent)382 public string Dump(string indent) 383 { 384 StringBuilder sb = new StringBuilder(); 385 386 sb.Append(indent + "BM Pattern: " + _pattern + "\n"); 387 sb.Append(indent + "Positive: "); 388 for (int i = 0; i < _positive.Length; i++) 389 { 390 sb.Append(_positive[i].ToString(CultureInfo.InvariantCulture) + " "); 391 } 392 sb.Append("\n"); 393 394 if (_negativeASCII != null) 395 { 396 sb.Append(indent + "Negative table\n"); 397 for (int i = 0; i < _negativeASCII.Length; i++) 398 { 399 if (_negativeASCII[i] != _pattern.Length) 400 { 401 sb.Append(indent + " " + Regex.Escape(Convert.ToString((char)i, CultureInfo.InvariantCulture)) + " " + _negativeASCII[i].ToString(CultureInfo.InvariantCulture) + "\n"); 402 } 403 } 404 } 405 406 return sb.ToString(); 407 } 408 #endif 409 } 410 } 411