1 use std::cmp;
2 use std::mem;
3
4 use aho_corasick::{self, packed, AhoCorasick, AhoCorasickBuilder};
5 use memchr::{memchr, memchr2, memchr3};
6 use syntax::hir::literal::{Literal, Literals};
7
8 use freqs::BYTE_FREQUENCIES;
9
/// A prefix extracted from a compiled regular expression.
///
/// A regex prefix is a set of literal strings that *must* be matched at the
/// beginning of a regex in order for the entire regex to match. Similarly
/// for a regex suffix.
#[derive(Clone, Debug)]
pub struct LiteralSearcher {
    /// True when the literals in this searcher, on their own, comprise a
    /// match of the entire regex (see `LiteralSearcher::complete`).
    complete: bool,
    /// A single-substring matcher for the longest common prefix of all
    /// literals in this searcher.
    lcp: FreqyPacked,
    /// A single-substring matcher for the longest common suffix of all
    /// literals in this searcher.
    lcs: FreqyPacked,
    /// The underlying search strategy, chosen by `Matcher::new`.
    matcher: Matcher,
}
22
/// The concrete search strategy used by a `LiteralSearcher`. The variant is
/// selected at construction time by `Matcher::new` based on the shape of the
/// literal set.
#[derive(Clone, Debug)]
enum Matcher {
    /// No literals. (Never advances through the input.)
    Empty,
    /// A set of four or more single byte literals.
    Bytes(SingleByteSet),
    /// A single substring, find using memchr and frequency analysis.
    FreqyPacked(FreqyPacked),
    /// A single substring, find using Boyer-Moore.
    BoyerMoore(BoyerMooreSearch),
    /// An Aho-Corasick automaton.
    AC { ac: AhoCorasick<u32>, lits: Vec<Literal> },
    /// A packed multiple substring searcher, using SIMD.
    ///
    /// Note that Aho-Corasick will actually use this packed searcher
    /// internally automatically, however, there is some overhead associated
    /// with going through the Aho-Corasick machinery. So using the packed
    /// searcher directly results in some gains.
    Packed { s: packed::Searcher, lits: Vec<Literal> },
}
43
44 impl LiteralSearcher {
45 /// Returns a matcher that never matches and never advances the input.
empty() -> Self46 pub fn empty() -> Self {
47 Self::new(Literals::empty(), Matcher::Empty)
48 }
49
50 /// Returns a matcher for literal prefixes from the given set.
prefixes(lits: Literals) -> Self51 pub fn prefixes(lits: Literals) -> Self {
52 let matcher = Matcher::prefixes(&lits);
53 Self::new(lits, matcher)
54 }
55
56 /// Returns a matcher for literal suffixes from the given set.
suffixes(lits: Literals) -> Self57 pub fn suffixes(lits: Literals) -> Self {
58 let matcher = Matcher::suffixes(&lits);
59 Self::new(lits, matcher)
60 }
61
new(lits: Literals, matcher: Matcher) -> Self62 fn new(lits: Literals, matcher: Matcher) -> Self {
63 let complete = lits.all_complete();
64 LiteralSearcher {
65 complete: complete,
66 lcp: FreqyPacked::new(lits.longest_common_prefix().to_vec()),
67 lcs: FreqyPacked::new(lits.longest_common_suffix().to_vec()),
68 matcher: matcher,
69 }
70 }
71
72 /// Returns true if all matches comprise the entire regular expression.
73 ///
74 /// This does not necessarily mean that a literal match implies a match
75 /// of the regular expression. For example, the regular expresison `^a`
76 /// is comprised of a single complete literal `a`, but the regular
77 /// expression demands that it only match at the beginning of a string.
complete(&self) -> bool78 pub fn complete(&self) -> bool {
79 self.complete && !self.is_empty()
80 }
81
82 /// Find the position of a literal in `haystack` if it exists.
83 #[cfg_attr(feature = "perf-inline", inline(always))]
find(&self, haystack: &[u8]) -> Option<(usize, usize)>84 pub fn find(&self, haystack: &[u8]) -> Option<(usize, usize)> {
85 use self::Matcher::*;
86 match self.matcher {
87 Empty => Some((0, 0)),
88 Bytes(ref sset) => sset.find(haystack).map(|i| (i, i + 1)),
89 FreqyPacked(ref s) => s.find(haystack).map(|i| (i, i + s.len())),
90 BoyerMoore(ref s) => s.find(haystack).map(|i| (i, i + s.len())),
91 AC { ref ac, .. } => {
92 ac.find(haystack).map(|m| (m.start(), m.end()))
93 }
94 Packed { ref s, .. } => {
95 s.find(haystack).map(|m| (m.start(), m.end()))
96 }
97 }
98 }
99
100 /// Like find, except matches must start at index `0`.
find_start(&self, haystack: &[u8]) -> Option<(usize, usize)>101 pub fn find_start(&self, haystack: &[u8]) -> Option<(usize, usize)> {
102 for lit in self.iter() {
103 if lit.len() > haystack.len() {
104 continue;
105 }
106 if lit == &haystack[0..lit.len()] {
107 return Some((0, lit.len()));
108 }
109 }
110 None
111 }
112
113 /// Like find, except matches must end at index `haystack.len()`.
find_end(&self, haystack: &[u8]) -> Option<(usize, usize)>114 pub fn find_end(&self, haystack: &[u8]) -> Option<(usize, usize)> {
115 for lit in self.iter() {
116 if lit.len() > haystack.len() {
117 continue;
118 }
119 if lit == &haystack[haystack.len() - lit.len()..] {
120 return Some((haystack.len() - lit.len(), haystack.len()));
121 }
122 }
123 None
124 }
125
126 /// Returns an iterator over all literals to be matched.
iter(&self) -> LiteralIter127 pub fn iter(&self) -> LiteralIter {
128 match self.matcher {
129 Matcher::Empty => LiteralIter::Empty,
130 Matcher::Bytes(ref sset) => LiteralIter::Bytes(&sset.dense),
131 Matcher::FreqyPacked(ref s) => LiteralIter::Single(&s.pat),
132 Matcher::BoyerMoore(ref s) => LiteralIter::Single(&s.pattern),
133 Matcher::AC { ref lits, .. } => LiteralIter::AC(lits),
134 Matcher::Packed { ref lits, .. } => LiteralIter::Packed(lits),
135 }
136 }
137
138 /// Returns a matcher for the longest common prefix of this matcher.
lcp(&self) -> &FreqyPacked139 pub fn lcp(&self) -> &FreqyPacked {
140 &self.lcp
141 }
142
143 /// Returns a matcher for the longest common suffix of this matcher.
lcs(&self) -> &FreqyPacked144 pub fn lcs(&self) -> &FreqyPacked {
145 &self.lcs
146 }
147
148 /// Returns true iff this prefix is empty.
is_empty(&self) -> bool149 pub fn is_empty(&self) -> bool {
150 self.len() == 0
151 }
152
153 /// Returns the number of prefixes in this machine.
len(&self) -> usize154 pub fn len(&self) -> usize {
155 use self::Matcher::*;
156 match self.matcher {
157 Empty => 0,
158 Bytes(ref sset) => sset.dense.len(),
159 FreqyPacked(_) => 1,
160 BoyerMoore(_) => 1,
161 AC { ref ac, .. } => ac.pattern_count(),
162 Packed { ref lits, .. } => lits.len(),
163 }
164 }
165
166 /// Return the approximate heap usage of literals in bytes.
approximate_size(&self) -> usize167 pub fn approximate_size(&self) -> usize {
168 use self::Matcher::*;
169 match self.matcher {
170 Empty => 0,
171 Bytes(ref sset) => sset.approximate_size(),
172 FreqyPacked(ref single) => single.approximate_size(),
173 BoyerMoore(ref single) => single.approximate_size(),
174 AC { ref ac, .. } => ac.heap_bytes(),
175 Packed { ref s, .. } => s.heap_bytes(),
176 }
177 }
178 }
179
impl Matcher {
    /// Builds a matcher from the prefix literals in `lits`.
    fn prefixes(lits: &Literals) -> Self {
        let sset = SingleByteSet::prefixes(lits);
        Matcher::new(lits, sset)
    }

    /// Builds a matcher from the suffix literals in `lits`.
    fn suffixes(lits: &Literals) -> Self {
        let sset = SingleByteSet::suffixes(lits);
        Matcher::new(lits, sset)
    }

    /// Selects the search strategy for `lits`, where `sset` is the set of
    /// leading (or trailing) bytes of the literals.
    ///
    /// NOTE: the order of the checks below matters. In particular, the
    /// empty-set check must come before the `sset.complete` check, since a
    /// freshly built `SingleByteSet` is vacuously complete.
    fn new(lits: &Literals, sset: SingleByteSet) -> Self {
        if lits.literals().is_empty() {
            return Matcher::Empty;
        }
        if sset.dense.len() >= 26 {
            // Avoid trying to match a large number of single bytes.
            // This is *very* sensitive to a frequency analysis comparison
            // between the bytes in sset and the composition of the haystack.
            // No matter the size of sset, if its members all are rare in the
            // haystack, then it'd be worth using it. How to tune this... IDK.
            // ---AG
            return Matcher::Empty;
        }
        if sset.complete {
            // Every literal is a single byte, so matching any byte in the
            // set is the same as matching a whole literal.
            return Matcher::Bytes(sset);
        }
        if lits.literals().len() == 1 {
            // A single multi-byte literal: pick between Boyer-Moore and the
            // memchr-based searcher using BM's own heuristic.
            let lit = lits.literals()[0].to_vec();
            if BoyerMooreSearch::should_use(lit.as_slice()) {
                return Matcher::BoyerMoore(BoyerMooreSearch::new(lit));
            } else {
                return Matcher::FreqyPacked(FreqyPacked::new(lit));
            }
        }

        let pats = lits.literals().to_owned();
        // When there is at most one distinct leading byte and it's ASCII,
        // skip the packed searcher (see the `Matcher::Packed` docs for why
        // the packed searcher is otherwise preferred over Aho-Corasick).
        let is_aho_corasick_fast = sset.dense.len() <= 1 && sset.all_ascii;
        if lits.literals().len() <= 100 && !is_aho_corasick_fast {
            let mut builder = packed::Config::new()
                .match_kind(packed::MatchKind::LeftmostFirst)
                .builder();
            // `build` may return `None`; in that case, fall through to
            // Aho-Corasick below.
            if let Some(s) = builder.extend(&pats).build() {
                return Matcher::Packed { s, lits: pats };
            }
        }
        let ac = AhoCorasickBuilder::new()
            .match_kind(aho_corasick::MatchKind::LeftmostFirst)
            .dfa(true)
            .build_with_size::<u32, _, _>(&pats)
            .unwrap();
        Matcher::AC { ac, lits: pats }
    }
}
234
/// An iterator over the literals in a `LiteralSearcher`, yielding each
/// literal as a `&[u8]`.
pub enum LiteralIter<'a> {
    /// No literals to yield.
    Empty,
    /// A set of single-byte literals; each is yielded as a one-byte slice.
    Bytes(&'a [u8]),
    /// Exactly one literal, yielded once.
    Single(&'a [u8]),
    /// The literals backing an Aho-Corasick matcher.
    AC(&'a [Literal]),
    /// The literals backing a packed SIMD matcher.
    Packed(&'a [Literal]),
}
242
243 impl<'a> Iterator for LiteralIter<'a> {
244 type Item = &'a [u8];
245
next(&mut self) -> Option<Self::Item>246 fn next(&mut self) -> Option<Self::Item> {
247 match *self {
248 LiteralIter::Empty => None,
249 LiteralIter::Bytes(ref mut many) => {
250 if many.is_empty() {
251 None
252 } else {
253 let next = &many[0..1];
254 *many = &many[1..];
255 Some(next)
256 }
257 }
258 LiteralIter::Single(ref mut one) => {
259 if one.is_empty() {
260 None
261 } else {
262 let next = &one[..];
263 *one = &[];
264 Some(next)
265 }
266 }
267 LiteralIter::AC(ref mut lits) => {
268 if lits.is_empty() {
269 None
270 } else {
271 let next = &lits[0];
272 *lits = &lits[1..];
273 Some(&**next)
274 }
275 }
276 LiteralIter::Packed(ref mut lits) => {
277 if lits.is_empty() {
278 None
279 } else {
280 let next = &lits[0];
281 *lits = &lits[1..];
282 Some(&**next)
283 }
284 }
285 }
286 }
287 }
288
/// A set of bytes, stored both sparsely (for O(1) membership tests) and
/// densely (for iteration and dispatching to memchr variants).
#[derive(Clone, Debug)]
struct SingleByteSet {
    /// 256-entry membership table indexed by byte value.
    sparse: Vec<bool>,
    /// The distinct bytes in the set, in insertion order.
    dense: Vec<u8>,
    /// True when every literal this set was built from is exactly one byte
    /// long, i.e., matching a byte in this set matches a whole literal.
    complete: bool,
    /// True when no byte in the set exceeds 0x7F.
    all_ascii: bool,
}
296
297 impl SingleByteSet {
new() -> SingleByteSet298 fn new() -> SingleByteSet {
299 SingleByteSet {
300 sparse: vec![false; 256],
301 dense: vec![],
302 complete: true,
303 all_ascii: true,
304 }
305 }
306
prefixes(lits: &Literals) -> SingleByteSet307 fn prefixes(lits: &Literals) -> SingleByteSet {
308 let mut sset = SingleByteSet::new();
309 for lit in lits.literals() {
310 sset.complete = sset.complete && lit.len() == 1;
311 if let Some(&b) = lit.get(0) {
312 if !sset.sparse[b as usize] {
313 if b > 0x7F {
314 sset.all_ascii = false;
315 }
316 sset.dense.push(b);
317 sset.sparse[b as usize] = true;
318 }
319 }
320 }
321 sset
322 }
323
suffixes(lits: &Literals) -> SingleByteSet324 fn suffixes(lits: &Literals) -> SingleByteSet {
325 let mut sset = SingleByteSet::new();
326 for lit in lits.literals() {
327 sset.complete = sset.complete && lit.len() == 1;
328 if let Some(&b) = lit.get(lit.len().checked_sub(1).unwrap()) {
329 if !sset.sparse[b as usize] {
330 if b > 0x7F {
331 sset.all_ascii = false;
332 }
333 sset.dense.push(b);
334 sset.sparse[b as usize] = true;
335 }
336 }
337 }
338 sset
339 }
340
341 /// Faster find that special cases certain sizes to use memchr.
342 #[cfg_attr(feature = "perf-inline", inline(always))]
find(&self, text: &[u8]) -> Option<usize>343 fn find(&self, text: &[u8]) -> Option<usize> {
344 match self.dense.len() {
345 0 => None,
346 1 => memchr(self.dense[0], text),
347 2 => memchr2(self.dense[0], self.dense[1], text),
348 3 => memchr3(self.dense[0], self.dense[1], self.dense[2], text),
349 _ => self._find(text),
350 }
351 }
352
353 /// Generic find that works on any sized set.
_find(&self, haystack: &[u8]) -> Option<usize>354 fn _find(&self, haystack: &[u8]) -> Option<usize> {
355 for (i, &b) in haystack.iter().enumerate() {
356 if self.sparse[b as usize] {
357 return Some(i);
358 }
359 }
360 None
361 }
362
approximate_size(&self) -> usize363 fn approximate_size(&self) -> usize {
364 (self.dense.len() * mem::size_of::<u8>())
365 + (self.sparse.len() * mem::size_of::<bool>())
366 }
367 }
368
/// Provides an implementation of fast subtring search using frequency
/// analysis.
///
/// memchr is so fast that we do everything we can to keep the loop in memchr
/// for as long as possible. The easiest way to do this is to intelligently
/// pick the byte to send to memchr. The best byte is the byte that occurs
/// least frequently in the haystack. Since doing frequency analysis on the
/// haystack is far too expensive, we compute a set of fixed frequencies up
/// front and hard code them in src/freqs.rs. Frequency analysis is done via
/// scripts/frequencies.py.
#[derive(Clone, Debug)]
pub struct FreqyPacked {
    /// The pattern.
    pat: Vec<u8>,
    /// The number of Unicode characters in the pattern. This is useful for
    /// determining the effective length of a pattern when deciding which
    /// optimizations to perform. A trailing incomplete UTF-8 sequence counts
    /// as one character.
    char_len: usize,
    /// The rarest byte in the pattern, according to pre-computed frequency
    /// analysis.
    rare1: u8,
    /// The offset of the rarest byte in `pat`. (Specifically, the offset of
    /// its last occurrence.)
    rare1i: usize,
    /// The second rarest byte in the pattern, according to pre-computed
    /// frequency analysis. (This may be equivalent to the rarest byte.)
    ///
    /// The second rarest byte is used as a type of guard for quickly detecting
    /// a mismatch after memchr locates an instance of the rarest byte. This
    /// is a hedge against pathological cases where the pre-computed frequency
    /// analysis may be off. (But of course, does not prevent *all*
    /// pathological cases.)
    rare2: u8,
    /// The offset of the second rarest byte in `pat`. (Specifically, the
    /// offset of its last occurrence.)
    rare2i: usize,
}
405
impl FreqyPacked {
    /// Builds a searcher for `pat`, precomputing its two rarest bytes and
    /// their offsets. An empty pattern yields `FreqyPacked::empty()`.
    fn new(pat: Vec<u8>) -> FreqyPacked {
        if pat.is_empty() {
            return FreqyPacked::empty();
        }

        // Find the rarest two bytes. Try to make them distinct (but it's not
        // required).
        let mut rare1 = pat[0];
        let mut rare2 = pat[0];
        for b in pat[1..].iter().cloned() {
            if freq_rank(b) < freq_rank(rare1) {
                rare1 = b;
            }
        }
        // Second pass: pick the rarest byte that differs from `rare1`, if
        // one exists; otherwise `rare2` remains equal to `rare1`.
        for &b in &pat {
            if rare1 == rare2 {
                rare2 = b
            } else if b != rare1 && freq_rank(b) < freq_rank(rare2) {
                rare2 = b;
            }
        }

        // And find the offsets of their last occurrences.
        let rare1i = pat.iter().rposition(|&b| b == rare1).unwrap();
        let rare2i = pat.iter().rposition(|&b| b == rare2).unwrap();

        let char_len = char_len_lossy(&pat);
        FreqyPacked {
            pat: pat,
            char_len: char_len,
            rare1: rare1,
            rare1i: rare1i,
            rare2: rare2,
            rare2i: rare2i,
        }
    }

    /// Returns a searcher for the empty pattern. Its `find` always returns
    /// `None` (guarded by the `pat.is_empty()` check in `find`).
    fn empty() -> FreqyPacked {
        FreqyPacked {
            pat: vec![],
            char_len: 0,
            rare1: 0,
            rare1i: 0,
            rare2: 0,
            rare2i: 0,
        }
    }

    /// Returns the starting offset of the first occurrence of the pattern
    /// in `haystack`, if any.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub fn find(&self, haystack: &[u8]) -> Option<usize> {
        let pat = &*self.pat;
        if haystack.len() < pat.len() || pat.is_empty() {
            return None;
        }
        // Invariant: `i` is a candidate position for `rare1` in the
        // haystack, i.e., a window starting at `i - rare1i` would align the
        // pattern's rare1 offset with haystack position `i`. Starting at
        // `rare1i` guarantees `i - rare1i` below never underflows.
        let mut i = self.rare1i;
        while i < haystack.len() {
            // Let memchr skip ahead to the next occurrence of rare1.
            i += match memchr(self.rare1, &haystack[i..]) {
                None => return None,
                Some(i) => i,
            };
            // Align a candidate window over the haystack.
            let start = i - self.rare1i;
            let end = start + pat.len();
            if end > haystack.len() {
                return None;
            }
            let aligned = &haystack[start..end];
            // Check the rare2 guard byte first to fail fast before the
            // full comparison.
            if aligned[self.rare2i] == self.rare2 && aligned == &*self.pat {
                return Some(start);
            }
            i += 1;
        }
        None
    }

    /// Returns true iff `text` ends with the pattern.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub fn is_suffix(&self, text: &[u8]) -> bool {
        if text.len() < self.len() {
            return false;
        }
        text[text.len() - self.len()..] == *self.pat
    }

    /// Returns the length of the pattern in bytes.
    pub fn len(&self) -> usize {
        self.pat.len()
    }

    /// Returns the number of Unicode characters in the pattern (lossily
    /// decoded; see the `char_len` field).
    pub fn char_len(&self) -> usize {
        self.char_len
    }

    /// Returns the approximate heap usage of this searcher in bytes.
    fn approximate_size(&self) -> usize {
        self.pat.len() * mem::size_of::<u8>()
    }
}
501
/// Counts the Unicode characters in `bytes`, decoding lossily.
///
/// Each invalid UTF-8 sequence is replaced with U+FFFD and so counts as a
/// single character.
fn char_len_lossy(bytes: &[u8]) -> usize {
    let decoded = String::from_utf8_lossy(bytes);
    decoded.chars().count()
}
505
/// An implementation of Tuned Boyer-Moore as laid out by
/// Andrew Hume and Daniel Sunday in "Fast String Searching".
/// O(n) in the size of the input.
///
/// Fast string searching algorithms come in many variations,
/// but they can generally be described in terms of three main
/// components.
///
/// The skip loop is where the string searcher wants to spend
/// as much time as possible. Exactly which character in the
/// pattern the skip loop examines varies from algorithm to
/// algorithm, but in the simplest case this loop repeatedly
/// looks at the last character in the pattern and jumps
/// forward in the input if it is not in the pattern.
/// Robert Boyer and J Moore called this the "fast" loop in
/// their original paper.
///
/// The match loop is responsible for actually examining the
/// whole potentially matching substring. In order to fail
/// faster, the match loop sometimes has a guard test attached.
/// The guard test uses frequency analysis of the different
/// characters in the pattern to choose the least frequency
/// occurring character and use it to find match failures
/// as quickly as possible.
///
/// The shift rule governs how the algorithm will shuffle its
/// test window in the event of a failure during the match loop.
/// Certain shift rules allow the worst-case run time of the
/// algorithm to be shown to be O(n) in the size of the input
/// rather than O(nm) in the size of the input and the size
/// of the pattern (as naive Boyer-Moore is).
///
/// "Fast String Searching", in addition to presenting a tuned
/// algorithm, provides a comprehensive taxonomy of the many
/// different flavors of string searchers. Under that taxonomy
/// TBM, the algorithm implemented here, uses an unrolled fast
/// skip loop with memchr fallback, a forward match loop with guard,
/// and the mini Sunday's delta shift rule. To unpack that you'll have to
/// read the paper.
#[derive(Clone, Debug)]
pub struct BoyerMooreSearch {
    /// The pattern we are going to look for in the haystack.
    pattern: Vec<u8>,

    /// The skip table for the skip loop.
    ///
    /// Maps the character at the end of the input
    /// to a shift. (256 entries, indexed by byte value; see
    /// `compile_skip_table`.)
    skip_table: Vec<usize>,

    /// The guard character (least frequently occurring char).
    guard: u8,
    /// The reverse-index of the guard character in the pattern.
    guard_reverse_idx: usize,

    /// Daniel Sunday's mini generalized delta2 shift table.
    ///
    /// We use a skip loop, so we only have to provide a shift
    /// for the skip char (last char). This is why it is a mini
    /// shift rule.
    md2_shift: usize,
}
568
impl BoyerMooreSearch {
    /// Create a new string searcher, performing whatever
    /// compilation steps are required.
    ///
    /// `pattern` must be non-empty (debug-asserted below).
    fn new(pattern: Vec<u8>) -> Self {
        debug_assert!(!pattern.is_empty());

        let (g, gi) = Self::select_guard(pattern.as_slice());
        let skip_table = Self::compile_skip_table(pattern.as_slice());
        let md2_shift = Self::compile_md2_shift(pattern.as_slice());
        BoyerMooreSearch {
            pattern: pattern,
            skip_table: skip_table,
            guard: g,
            guard_reverse_idx: gi,
            md2_shift: md2_shift,
        }
    }

    /// Find the pattern in `haystack`, returning the offset
    /// of the start of the first occurrence of the pattern
    /// in `haystack`.
    ///
    /// `window_end` throughout is the haystack index aligned with the
    /// *last* byte of the pattern's current candidate window.
    #[inline]
    fn find(&self, haystack: &[u8]) -> Option<usize> {
        if haystack.len() < self.pattern.len() {
            return None;
        }

        let mut window_end = self.pattern.len() - 1;

        // Inspired by the grep source. It is a way
        // to do correct loop unrolling without having to place
        // a crashpad of terminating characters at the end in
        // the way described in the Fast String Searching paper.
        const NUM_UNROLL: usize = 10;
        // 1 for the initial position, and 1 for the md2 shift
        let short_circut = (NUM_UNROLL + 2) * self.pattern.len();

        if haystack.len() > short_circut {
            // just 1 for the md2 shift
            let backstop =
                haystack.len() - ((NUM_UNROLL + 1) * self.pattern.len());
            loop {
                window_end =
                    match self.skip_loop(haystack, window_end, backstop) {
                        Some(i) => i,
                        None => return None,
                    };
                if window_end >= backstop {
                    break;
                }

                if self.check_match(haystack, window_end) {
                    return Some(window_end - (self.pattern.len() - 1));
                } else {
                    // A zero skip means the last byte of the window matched
                    // the last byte of the pattern, so apply the md2 shift
                    // rule instead.
                    let skip = self.skip_table[haystack[window_end] as usize];
                    window_end +=
                        if skip == 0 { self.md2_shift } else { skip };
                    continue;
                }
            }
        }

        // now process the input after the backstop
        while window_end < haystack.len() {
            let mut skip = self.skip_table[haystack[window_end] as usize];
            if skip == 0 {
                if self.check_match(haystack, window_end) {
                    return Some(window_end - (self.pattern.len() - 1));
                } else {
                    skip = self.md2_shift;
                }
            }
            window_end += skip;
        }

        None
    }

    /// Returns the length of the pattern in bytes.
    fn len(&self) -> usize {
        return self.pattern.len();
    }

    /// The key heuristic behind which the BoyerMooreSearch lives.
    ///
    /// See `rust-lang/regex/issues/408`.
    ///
    /// Tuned Boyer-Moore is actually pretty slow! It turns out a handrolled
    /// platform-specific memchr routine with a bit of frequency
    /// analysis sprinkled on top actually wins most of the time.
    /// However, there are a few cases where Tuned Boyer-Moore still
    /// wins.
    ///
    /// If the haystack is random, frequency analysis doesn't help us,
    /// so Boyer-Moore will win for sufficiently large needles.
    /// Unfortunately, there is no obvious way to determine this
    /// ahead of time.
    ///
    /// If the pattern itself consists of very common characters,
    /// frequency analysis won't get us anywhere. The most extreme
    /// example of this is a pattern like `eeeeeeeeeeeeeeee`. Fortunately,
    /// this case is wholly determined by the pattern, so we can actually
    /// implement the heuristic.
    ///
    /// A third case is if the pattern is sufficiently long. The idea
    /// here is that once the pattern gets long enough the Tuned
    /// Boyer-Moore skip loop will start making strides long enough
    /// to beat the asm deep magic that is memchr.
    fn should_use(pattern: &[u8]) -> bool {
        // The minimum pattern length required to use TBM.
        const MIN_LEN: usize = 9;
        // The minimum frequency rank (lower is rarer) that every byte in the
        // pattern must have in order to use TBM. That is, if the pattern
        // contains _any_ byte with a lower rank, then TBM won't be used.
        const MIN_CUTOFF: usize = 150;
        // The maximum frequency rank for any byte.
        const MAX_CUTOFF: usize = 255;
        // The scaling factor used to determine the actual cutoff frequency
        // to use (keeping in mind that the minimum frequency rank is bounded
        // by MIN_CUTOFF). This scaling factor is an attempt to make TBM more
        // likely to be used as the pattern grows longer. That is, longer
        // patterns permit somewhat less frequent bytes than shorter patterns,
        // under the assumption that TBM gets better as the pattern gets
        // longer.
        const LEN_CUTOFF_PROPORTION: usize = 4;

        let scaled_rank = pattern.len().wrapping_mul(LEN_CUTOFF_PROPORTION);
        let cutoff = cmp::max(
            MIN_CUTOFF,
            MAX_CUTOFF - cmp::min(MAX_CUTOFF, scaled_rank),
        );
        // The pattern must be long enough to be worthwhile. e.g., memchr will
        // be faster on `e` because it is short even though e is quite common.
        pattern.len() > MIN_LEN
            // all the bytes must be more common than the cutoff.
            && pattern.iter().all(|c| freq_rank(*c) >= cutoff)
    }

    /// Check to see if there is a match at the given position
    #[inline]
    fn check_match(&self, haystack: &[u8], window_end: usize) -> bool {
        // guard test: check the (precomputed) rarest byte first so that
        // most mismatches are detected with a single comparison.
        if haystack[window_end - self.guard_reverse_idx] != self.guard {
            return false;
        }

        // match loop
        let window_start = window_end - (self.pattern.len() - 1);
        for i in 0..self.pattern.len() {
            if self.pattern[i] != haystack[window_start + i] {
                return false;
            }
        }

        true
    }

    /// Skip forward according to the shift table.
    ///
    /// Returns the offset of the next occurrence
    /// of the last char in the pattern, or the none
    /// if it never reappears. If `skip_loop` hits the backstop
    /// it will leave early.
    #[inline]
    fn skip_loop(
        &self,
        haystack: &[u8],
        mut window_end: usize,
        backstop: usize,
    ) -> Option<usize> {
        let window_end_snapshot = window_end;
        let skip_of = |we: usize| -> usize {
            // Unsafe might make this faster, but the benchmarks
            // were hard to interpret.
            self.skip_table[haystack[we] as usize]
        };

        // The loop below is unrolled in groups of 2, 3, 3, and 2 skips
        // (10 total, matching NUM_UNROLL in `find`); a zero skip at a
        // group boundary means the skip char was found, so fall out.
        loop {
            let mut skip = skip_of(window_end);
            window_end += skip;
            skip = skip_of(window_end);
            window_end += skip;
            if skip != 0 {
                skip = skip_of(window_end);
                window_end += skip;
                skip = skip_of(window_end);
                window_end += skip;
                skip = skip_of(window_end);
                window_end += skip;
                if skip != 0 {
                    skip = skip_of(window_end);
                    window_end += skip;
                    skip = skip_of(window_end);
                    window_end += skip;
                    skip = skip_of(window_end);
                    window_end += skip;
                    if skip != 0 {
                        skip = skip_of(window_end);
                        window_end += skip;
                        skip = skip_of(window_end);
                        window_end += skip;

                        // If ten iterations did not make at least 16 words
                        // worth of progress, we just fall back on memchr.
                        if window_end - window_end_snapshot
                            > 16 * mem::size_of::<usize>()
                        {
                            // Returning a window_end >= backstop will
                            // immediately break us out of the inner loop in
                            // `find`.
                            if window_end >= backstop {
                                return Some(window_end);
                            }

                            continue; // we made enough progress
                        } else {
                            // In case we are already there, and so that
                            // we will catch the guard char.
                            window_end = window_end
                                .checked_sub(1 + self.guard_reverse_idx)
                                .unwrap_or(0);

                            match memchr(self.guard, &haystack[window_end..]) {
                                None => return None,
                                Some(g_idx) => {
                                    return Some(
                                        window_end
                                            + g_idx
                                            + self.guard_reverse_idx,
                                    );
                                }
                            }
                        }
                    }
                }
            }

            return Some(window_end);
        }
    }

    /// Compute the ufast skip table.
    fn compile_skip_table(pattern: &[u8]) -> Vec<usize> {
        // Bytes not in the pattern get a full pattern-length shift.
        let mut tab = vec![pattern.len(); 256];

        // For every char in the pattern, we write a skip
        // that will line us up with the rightmost occurrence.
        //
        // N.B. the sentinel (0) is written by the last
        // loop iteration.
        for (i, c) in pattern.iter().enumerate() {
            tab[*c as usize] = (pattern.len() - 1) - i;
        }

        tab
    }

    /// Select the guard character based off of the precomputed
    /// frequency table.
    ///
    /// Returns the rarest byte in the pattern together with its
    /// reverse index (distance from the end of the pattern).
    fn select_guard(pattern: &[u8]) -> (u8, usize) {
        let mut rarest = pattern[0];
        let mut rarest_rev_idx = pattern.len() - 1;
        for (i, c) in pattern.iter().enumerate() {
            if freq_rank(*c) < freq_rank(rarest) {
                rarest = *c;
                rarest_rev_idx = (pattern.len() - 1) - i;
            }
        }

        (rarest, rarest_rev_idx)
    }

    /// If there is another occurrence of the skip
    /// char, shift to it, otherwise just shift to
    /// the next window.
    fn compile_md2_shift(pattern: &[u8]) -> usize {
        let shiftc = *pattern.last().unwrap();

        // For a pattern of length 1 we will never apply the
        // shift rule, so we use a poison value on the principle
        // that failing fast is a good thing.
        if pattern.len() == 1 {
            return 0xDEADBEAF;
        }

        // Scan right-to-left (excluding the last byte itself) for another
        // occurrence of the skip char.
        let mut i = pattern.len() - 2;
        while i > 0 {
            if pattern[i] == shiftc {
                return (pattern.len() - 1) - i;
            }
            i -= 1;
        }

        // The skip char never re-occurs in the pattern, so
        // we can just shift the whole window length.
        pattern.len() - 1
    }

    /// Returns the approximate heap usage of this searcher in bytes.
    fn approximate_size(&self) -> usize {
        (self.pattern.len() * mem::size_of::<u8>())
            + (256 * mem::size_of::<usize>()) // skip table
    }
}
871
freq_rank(b: u8) -> usize872 fn freq_rank(b: u8) -> usize {
873 BYTE_FREQUENCIES[b as usize] as usize
874 }
875
#[cfg(test)]
mod tests {
    use super::{BoyerMooreSearch, FreqyPacked};

    //
    // Unit Tests
    //

    // The "hello, world" of string searching
    #[test]
    fn bm_find_subs() {
        let searcher = BoyerMooreSearch::new(Vec::from(&b"pattern"[..]));
        let haystack = b"I keep seeing patterns in this text";
        assert_eq!(14, searcher.find(haystack).unwrap());
    }

    #[test]
    fn bm_find_no_subs() {
        let searcher = BoyerMooreSearch::new(Vec::from(&b"pattern"[..]));
        let haystack = b"I keep seeing needles in this text";
        assert_eq!(None, searcher.find(haystack));
    }

    //
    // Regression Tests
    //

    #[test]
    fn bm_skip_reset_bug() {
        let haystack = vec![0, 0, 0, 0, 0, 1, 1, 0];
        let needle = vec![0, 1, 1, 0];

        let searcher = BoyerMooreSearch::new(needle);
        let offset = searcher.find(haystack.as_slice()).unwrap();
        assert_eq!(4, offset);
    }

    #[test]
    fn bm_backstop_underflow_bug() {
        let haystack = vec![0, 0];
        let needle = vec![0, 0];

        let searcher = BoyerMooreSearch::new(needle);
        let offset = searcher.find(haystack.as_slice()).unwrap();
        assert_eq!(0, offset);
    }

    #[test]
    fn bm_naive_off_by_one_bug() {
        let haystack = vec![91];
        let needle = vec![91];

        let naive_offset = naive_find(&needle, &haystack).unwrap();
        assert_eq!(0, naive_offset);
    }

    #[test]
    fn bm_memchr_fallback_indexing_bug() {
        let mut haystack = vec![
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 87, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        ];
        let needle = vec![1, 1, 1, 1, 32, 32, 87];
        let needle_start = haystack.len();
        haystack.extend(needle.clone());

        let searcher = BoyerMooreSearch::new(needle);
        assert_eq!(needle_start, searcher.find(haystack.as_slice()).unwrap());
    }

    #[test]
    fn bm_backstop_boundary() {
        // The needle must sit exactly at the backstop boundary of the
        // haystack for this regression to be exercised.
        let haystack = b"\
// aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
e_data.clone_created(entity_id, entity_to_add.entity_id);
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
"
        .to_vec();
        let needle = b"clone_created".to_vec();

        let searcher = BoyerMooreSearch::new(needle);
        let result = searcher.find(&haystack);
        assert_eq!(Some(43), result);
    }

    #[test]
    fn bm_win_gnu_indexing_bug() {
        let haystack_raw = vec![
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        ];
        let needle = vec![1, 1, 1, 1, 1, 1, 1];
        let haystack = haystack_raw.as_slice();

        BoyerMooreSearch::new(needle.clone()).find(haystack);
    }

    //
    // QuickCheck Properties
    //

    use quickcheck::TestResult;

    // A reference implementation: the obvious O(n*m) scan.
    fn naive_find(needle: &[u8], haystack: &[u8]) -> Option<usize> {
        assert!(needle.len() <= haystack.len());

        for i in 0..(haystack.len() - (needle.len() - 1)) {
            if haystack[i] == needle[0]
                && &haystack[i..(i + needle.len())] == needle
            {
                return Some(i);
            }
        }

        None
    }

    quickcheck! {
        fn qc_bm_equals_nieve_find(pile1: Vec<u8>, pile2: Vec<u8>) -> TestResult {
            if pile1.len() == 0 || pile2.len() == 0 {
                return TestResult::discard();
            }

            let (needle, haystack) = if pile1.len() < pile2.len() {
                (pile1, pile2.as_slice())
            } else {
                (pile2, pile1.as_slice())
            };

            let searcher = BoyerMooreSearch::new(needle.clone());
            TestResult::from_bool(
                searcher.find(haystack) == naive_find(&needle, haystack))
        }

        fn qc_bm_equals_single(pile1: Vec<u8>, pile2: Vec<u8>) -> TestResult {
            if pile1.len() == 0 || pile2.len() == 0 {
                return TestResult::discard();
            }

            let (needle, haystack) = if pile1.len() < pile2.len() {
                (pile1, pile2.as_slice())
            } else {
                (pile2, pile1.as_slice())
            };

            let bm_searcher = BoyerMooreSearch::new(needle.clone());
            let freqy_memchr = FreqyPacked::new(needle);
            TestResult::from_bool(
                bm_searcher.find(haystack) == freqy_memchr.find(haystack))
        }

        fn qc_bm_finds_trailing_needle(
            haystack_pre: Vec<u8>,
            needle: Vec<u8>
        ) -> TestResult {
            if needle.len() == 0 {
                return TestResult::discard();
            }

            let mut haystack = haystack_pre.clone();
            let searcher = BoyerMooreSearch::new(needle.clone());

            if haystack.len() >= needle.len() &&
                searcher.find(haystack.as_slice()).is_some() {
                return TestResult::discard();
            }

            haystack.extend(needle.clone());

            // What if the tail of the haystack can start the
            // needle?
            let start = haystack_pre.len()
                .checked_sub(needle.len())
                .unwrap_or(0);
            for i in 0..(needle.len() - 1) {
                if searcher.find(&haystack[(i + start)..]).is_some() {
                    return TestResult::discard();
                }
            }

            TestResult::from_bool(
                searcher.find(haystack.as_slice())
                    .map(|x| x == haystack_pre.len())
                    .unwrap_or(false))
        }

        // qc_equals_* is only testing the negative case as @burntsushi
        // pointed out in https://github.com/rust-lang/regex/issues/446.
        // This quickcheck prop represents an effort to force testing of
        // the positive case. qc_bm_finds_first and qc_bm_finds_trailing_needle
        // already check some of the positive cases, but they don't cover
        // cases where the needle is in the middle of haystack. This prop
        // fills that hole.
        fn qc_bm_finds_subslice(
            haystack: Vec<u8>,
            needle_start: usize,
            needle_length: usize
        ) -> TestResult {
            if haystack.len() == 0 {
                return TestResult::discard();
            }

            let needle_start = needle_start % haystack.len();
            let needle_length = needle_length % (haystack.len() - needle_start);

            if needle_length == 0 {
                return TestResult::discard();
            }

            let needle = &haystack[needle_start..(needle_start + needle_length)];

            let bm_searcher = BoyerMooreSearch::new(needle.to_vec());

            let start = naive_find(&needle, &haystack);
            match start {
                None => TestResult::from_bool(false),
                Some(nf_start) =>
                    TestResult::from_bool(
                        nf_start <= needle_start
                            && bm_searcher.find(&haystack) == start
                    )
            }
        }

        fn qc_bm_finds_first(needle: Vec<u8>) -> TestResult {
            if needle.len() == 0 {
                return TestResult::discard();
            }

            let mut haystack = needle.clone();
            let searcher = BoyerMooreSearch::new(needle.clone());
            haystack.extend(needle);

            TestResult::from_bool(
                searcher.find(haystack.as_slice())
                    .map(|x| x == 0)
                    .unwrap_or(false))
        }
    }
}
1122