use std::cmp;
use std::mem;

use aho_corasick::{self, packed, AhoCorasick, AhoCorasickBuilder};
use memchr::{memchr, memchr2, memchr3};
use syntax::hir::literal::{Literal, Literals};

use freqs::BYTE_FREQUENCIES;

/// A prefix extracted from a compiled regular expression.
///
/// A regex prefix is a set of literal strings that *must* be matched at the
/// beginning of a regex in order for the entire regex to match. Similarly
/// for a regex suffix.
#[derive(Clone, Debug)]
pub struct LiteralSearcher {
    complete: bool,
    lcp: FreqyPacked,
    lcs: FreqyPacked,
    matcher: Matcher,
}

#[derive(Clone, Debug)]
enum Matcher {
    /// No literals. (Never advances through the input.)
    Empty,
    /// A set of single byte literals.
    Bytes(SingleByteSet),
    /// A single substring, found using memchr and frequency analysis.
    FreqyPacked(FreqyPacked),
    /// A single substring, found using Boyer-Moore.
    BoyerMoore(BoyerMooreSearch),
    /// An Aho-Corasick automaton.
    AC { ac: AhoCorasick<u32>, lits: Vec<Literal> },
    /// A packed multiple substring searcher, using SIMD.
    ///
    /// Note that Aho-Corasick will actually use this packed searcher
    /// internally automatically. However, there is some overhead associated
    /// with going through the Aho-Corasick machinery, so using the packed
    /// searcher directly results in some gains.
    Packed { s: packed::Searcher, lits: Vec<Literal> },
}

impl LiteralSearcher {
    /// Returns a matcher that never matches and never advances the input.
    pub fn empty() -> Self {
        Self::new(Literals::empty(), Matcher::Empty)
    }

    /// Returns a matcher for literal prefixes from the given set.
    pub fn prefixes(lits: Literals) -> Self {
        let matcher = Matcher::prefixes(&lits);
        Self::new(lits, matcher)
    }

    /// Returns a matcher for literal suffixes from the given set.
    pub fn suffixes(lits: Literals) -> Self {
        let matcher = Matcher::suffixes(&lits);
        Self::new(lits, matcher)
    }

    fn new(lits: Literals, matcher: Matcher) -> Self {
        let complete = lits.all_complete();
        LiteralSearcher {
            complete: complete,
            lcp: FreqyPacked::new(lits.longest_common_prefix().to_vec()),
            lcs: FreqyPacked::new(lits.longest_common_suffix().to_vec()),
            matcher: matcher,
        }
    }

    /// Returns true if all matches comprise the entire regular expression.
    ///
    /// This does not necessarily mean that a literal match implies a match
    /// of the regular expression. For example, the regular expression `^a`
    /// consists of a single complete literal `a`, but the regular
    /// expression demands that it only match at the beginning of a string.
    pub fn complete(&self) -> bool {
        self.complete && !self.is_empty()
    }

    /// Find the position of a literal in `haystack` if it exists.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub fn find(&self, haystack: &[u8]) -> Option<(usize, usize)> {
        use self::Matcher::*;
        match self.matcher {
            Empty => Some((0, 0)),
            Bytes(ref sset) => sset.find(haystack).map(|i| (i, i + 1)),
            FreqyPacked(ref s) => s.find(haystack).map(|i| (i, i + s.len())),
            BoyerMoore(ref s) => s.find(haystack).map(|i| (i, i + s.len())),
            AC { ref ac, .. } => {
                ac.find(haystack).map(|m| (m.start(), m.end()))
            }
            Packed { ref s, .. } => {
                s.find(haystack).map(|m| (m.start(), m.end()))
            }
        }
    }

    /// Like find, except matches must start at index `0`.
    pub fn find_start(&self, haystack: &[u8]) -> Option<(usize, usize)> {
        for lit in self.iter() {
            if lit.len() > haystack.len() {
                continue;
            }
            if lit == &haystack[0..lit.len()] {
                return Some((0, lit.len()));
            }
        }
        None
    }

    /// Like find, except matches must end at index `haystack.len()`.
    pub fn find_end(&self, haystack: &[u8]) -> Option<(usize, usize)> {
        for lit in self.iter() {
            if lit.len() > haystack.len() {
                continue;
            }
            if lit == &haystack[haystack.len() - lit.len()..] {
                return Some((haystack.len() - lit.len(), haystack.len()));
            }
        }
        None
    }

    /// Returns an iterator over all literals to be matched.
    pub fn iter(&self) -> LiteralIter {
        match self.matcher {
            Matcher::Empty => LiteralIter::Empty,
            Matcher::Bytes(ref sset) => LiteralIter::Bytes(&sset.dense),
            Matcher::FreqyPacked(ref s) => LiteralIter::Single(&s.pat),
            Matcher::BoyerMoore(ref s) => LiteralIter::Single(&s.pattern),
            Matcher::AC { ref lits, .. } => LiteralIter::AC(lits),
            Matcher::Packed { ref lits, .. } => LiteralIter::Packed(lits),
        }
    }

    /// Returns a matcher for the longest common prefix of this matcher.
    pub fn lcp(&self) -> &FreqyPacked {
        &self.lcp
    }

    /// Returns a matcher for the longest common suffix of this matcher.
    pub fn lcs(&self) -> &FreqyPacked {
        &self.lcs
    }

    /// Returns true iff this prefix is empty.
    pub fn is_empty(&self) -> bool {
        self.len() == 0
    }

    /// Returns the number of prefixes in this machine.
    pub fn len(&self) -> usize {
        use self::Matcher::*;
        match self.matcher {
            Empty => 0,
            Bytes(ref sset) => sset.dense.len(),
            FreqyPacked(_) => 1,
            BoyerMoore(_) => 1,
            AC { ref ac, .. } => ac.pattern_count(),
            Packed { ref lits, .. } => lits.len(),
        }
    }

    /// Return the approximate heap usage of literals in bytes.
    pub fn approximate_size(&self) -> usize {
        use self::Matcher::*;
        match self.matcher {
            Empty => 0,
            Bytes(ref sset) => sset.approximate_size(),
            FreqyPacked(ref single) => single.approximate_size(),
            BoyerMoore(ref single) => single.approximate_size(),
            AC { ref ac, .. } => ac.heap_bytes(),
            Packed { ref s, .. } => s.heap_bytes(),
        }
    }
}
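
// Illustrative usage sketch, kept as a comment since it is not part of this
// module. `prefix_literals` stands in for a `Literals` set produced by the
// regex compiler's literal extraction:
//
//     let searcher = LiteralSearcher::prefixes(prefix_literals);
//     if let Some((start, end)) = searcher.find(haystack) {
//         // haystack[start..end] is one of the extracted prefix literals.
//         // If searcher.complete() is also true, the literal covers the
//         // whole regex (modulo anchoring such as `^`).
//     }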

impl Matcher {
    fn prefixes(lits: &Literals) -> Self {
        let sset = SingleByteSet::prefixes(lits);
        Matcher::new(lits, sset)
    }

    fn suffixes(lits: &Literals) -> Self {
        let sset = SingleByteSet::suffixes(lits);
        Matcher::new(lits, sset)
    }

    fn new(lits: &Literals, sset: SingleByteSet) -> Self {
        if lits.literals().is_empty() {
            return Matcher::Empty;
        }
        if sset.dense.len() >= 26 {
            // Avoid trying to match a large number of single bytes.
            // This is *very* sensitive to a frequency analysis comparison
            // between the bytes in sset and the composition of the haystack.
            // No matter the size of sset, if its members all are rare in the
            // haystack, then it'd be worth using it. How to tune this... IDK.
            // ---AG
            return Matcher::Empty;
        }
        if sset.complete {
            return Matcher::Bytes(sset);
        }
        if lits.literals().len() == 1 {
            let lit = lits.literals()[0].to_vec();
            if BoyerMooreSearch::should_use(lit.as_slice()) {
                return Matcher::BoyerMoore(BoyerMooreSearch::new(lit));
            } else {
                return Matcher::FreqyPacked(FreqyPacked::new(lit));
            }
        }

        let pats = lits.literals().to_owned();
        let is_aho_corasick_fast = sset.dense.len() <= 1 && sset.all_ascii;
        if lits.literals().len() <= 100 && !is_aho_corasick_fast {
            let mut builder = packed::Config::new()
                .match_kind(packed::MatchKind::LeftmostFirst)
                .builder();
            if let Some(s) = builder.extend(&pats).build() {
                return Matcher::Packed { s, lits: pats };
            }
        }
        let ac = AhoCorasickBuilder::new()
            .match_kind(aho_corasick::MatchKind::LeftmostFirst)
            .dfa(true)
            .build_with_size::<u32, _, _>(&pats)
            .unwrap();
        Matcher::AC { ac, lits: pats }
    }
}
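
// Rough examples of how the cascade in `Matcher::new` picks a matcher
// (illustrative only; the exact outcome also depends on the packed builder
// and on the frequency table in src/freqs.rs):
//
// * {"a", "b", "c"}: every literal is a single byte, so `sset.complete` is
//   true and `Matcher::Bytes` is used.
// * a single short literal such as "foobar": `should_use` rejects it (too
//   short), so `Matcher::FreqyPacked` is used.
// * a single long literal of very common bytes such as "eeeeeeeeeeee": this
//   is the case `should_use` is designed to accept, so `Matcher::BoyerMoore`
//   is used.
// * {"foo", "bar"}: multiple literals with more than one distinct leading
//   byte, so the packed SIMD searcher is tried first and Aho-Corasick is the
//   fallback.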

pub enum LiteralIter<'a> {
    Empty,
    Bytes(&'a [u8]),
    Single(&'a [u8]),
    AC(&'a [Literal]),
    Packed(&'a [Literal]),
}

impl<'a> Iterator for LiteralIter<'a> {
    type Item = &'a [u8];

    fn next(&mut self) -> Option<Self::Item> {
        match *self {
            LiteralIter::Empty => None,
            LiteralIter::Bytes(ref mut many) => {
                if many.is_empty() {
                    None
                } else {
                    let next = &many[0..1];
                    *many = &many[1..];
                    Some(next)
                }
            }
            LiteralIter::Single(ref mut one) => {
                if one.is_empty() {
                    None
                } else {
                    let next = &one[..];
                    *one = &[];
                    Some(next)
                }
            }
            LiteralIter::AC(ref mut lits) => {
                if lits.is_empty() {
                    None
                } else {
                    let next = &lits[0];
                    *lits = &lits[1..];
                    Some(&**next)
                }
            }
            LiteralIter::Packed(ref mut lits) => {
                if lits.is_empty() {
                    None
                } else {
                    let next = &lits[0];
                    *lits = &lits[1..];
                    Some(&**next)
                }
            }
        }
    }
}

#[derive(Clone, Debug)]
struct SingleByteSet {
    sparse: Vec<bool>,
    dense: Vec<u8>,
    complete: bool,
    all_ascii: bool,
}

impl SingleByteSet {
    fn new() -> SingleByteSet {
        SingleByteSet {
            sparse: vec![false; 256],
            dense: vec![],
            complete: true,
            all_ascii: true,
        }
    }

    fn prefixes(lits: &Literals) -> SingleByteSet {
        let mut sset = SingleByteSet::new();
        for lit in lits.literals() {
            sset.complete = sset.complete && lit.len() == 1;
            if let Some(&b) = lit.get(0) {
                if !sset.sparse[b as usize] {
                    if b > 0x7F {
                        sset.all_ascii = false;
                    }
                    sset.dense.push(b);
                    sset.sparse[b as usize] = true;
                }
            }
        }
        sset
    }

    fn suffixes(lits: &Literals) -> SingleByteSet {
        let mut sset = SingleByteSet::new();
        for lit in lits.literals() {
            sset.complete = sset.complete && lit.len() == 1;
            if let Some(&b) = lit.get(lit.len().checked_sub(1).unwrap()) {
                if !sset.sparse[b as usize] {
                    if b > 0x7F {
                        sset.all_ascii = false;
                    }
                    sset.dense.push(b);
                    sset.sparse[b as usize] = true;
                }
            }
        }
        sset
    }

    /// Faster find that special cases certain sizes to use memchr.
    #[cfg_attr(feature = "perf-inline", inline(always))]
    fn find(&self, text: &[u8]) -> Option<usize> {
        match self.dense.len() {
            0 => None,
            1 => memchr(self.dense[0], text),
            2 => memchr2(self.dense[0], self.dense[1], text),
            3 => memchr3(self.dense[0], self.dense[1], self.dense[2], text),
            _ => self._find(text),
        }
    }

    /// Generic find that works on any sized set.
    fn _find(&self, haystack: &[u8]) -> Option<usize> {
        for (i, &b) in haystack.iter().enumerate() {
            if self.sparse[b as usize] {
                return Some(i);
            }
        }
        None
    }

    fn approximate_size(&self) -> usize {
        (self.dense.len() * mem::size_of::<u8>())
            + (self.sparse.len() * mem::size_of::<bool>())
    }
}
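
// For example (illustrative): `SingleByteSet::prefixes` over the literals
// {"foo", "bar"} records the distinct leading bytes, giving
// `dense == vec![b'f', b'b']` and `complete == false` (the literals are
// longer than one byte). Over {"a", "b"} it gives `dense == vec![b'a', b'b']`
// and `complete == true`, which is what allows `Matcher::Bytes` to be used.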

/// Provides an implementation of fast substring search using frequency
/// analysis.
///
/// memchr is so fast that we do everything we can to keep the loop in memchr
/// for as long as possible. The easiest way to do this is to intelligently
/// pick the byte to send to memchr. The best byte is the byte that occurs
/// least frequently in the haystack. Since doing frequency analysis on the
/// haystack is far too expensive, we compute a set of fixed frequencies up
/// front and hard code them in src/freqs.rs. Frequency analysis is done via
/// scripts/frequencies.py.
#[derive(Clone, Debug)]
pub struct FreqyPacked {
    /// The pattern.
    pat: Vec<u8>,
    /// The number of Unicode characters in the pattern. This is useful for
    /// determining the effective length of a pattern when deciding which
    /// optimizations to perform. A trailing incomplete UTF-8 sequence counts
    /// as one character.
    char_len: usize,
    /// The rarest byte in the pattern, according to pre-computed frequency
    /// analysis.
    rare1: u8,
    /// The offset of the rarest byte in `pat`.
    rare1i: usize,
    /// The second rarest byte in the pattern, according to pre-computed
    /// frequency analysis. (This may be equivalent to the rarest byte.)
    ///
    /// The second rarest byte is used as a type of guard for quickly detecting
    /// a mismatch after memchr locates an instance of the rarest byte. This
    /// is a hedge against pathological cases where the pre-computed frequency
    /// analysis may be off. (But of course, does not prevent *all*
    /// pathological cases.)
    rare2: u8,
    /// The offset of the second rarest byte in `pat`.
    rare2i: usize,
}
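
// Worked example (illustrative; the real ranks live in src/freqs.rs, the
// ones assumed here are made up): for the pattern b"user_id", suppose `_` is
// the rarest byte and `d` the second rarest. Then `rare1 == b'_'` with
// `rare1i == 4`, and `rare2 == b'd'` with `rare2i == 6`. `find` asks memchr
// for the next `_`, subtracts `rare1i` to get a candidate start, checks the
// `d` guard at offset `rare2i` within that window, and only then compares
// the whole pattern.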

impl FreqyPacked {
    fn new(pat: Vec<u8>) -> FreqyPacked {
        if pat.is_empty() {
            return FreqyPacked::empty();
        }

        // Find the rarest two bytes. Try to make them distinct (but it's not
        // required).
        let mut rare1 = pat[0];
        let mut rare2 = pat[0];
        for b in pat[1..].iter().cloned() {
            if freq_rank(b) < freq_rank(rare1) {
                rare1 = b;
            }
        }
        for &b in &pat {
            if rare1 == rare2 {
                rare2 = b
            } else if b != rare1 && freq_rank(b) < freq_rank(rare2) {
                rare2 = b;
            }
        }

        // And find the offsets of their last occurrences.
        let rare1i = pat.iter().rposition(|&b| b == rare1).unwrap();
        let rare2i = pat.iter().rposition(|&b| b == rare2).unwrap();

        let char_len = char_len_lossy(&pat);
        FreqyPacked {
            pat: pat,
            char_len: char_len,
            rare1: rare1,
            rare1i: rare1i,
            rare2: rare2,
            rare2i: rare2i,
        }
    }

    fn empty() -> FreqyPacked {
        FreqyPacked {
            pat: vec![],
            char_len: 0,
            rare1: 0,
            rare1i: 0,
            rare2: 0,
            rare2i: 0,
        }
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub fn find(&self, haystack: &[u8]) -> Option<usize> {
        let pat = &*self.pat;
        if haystack.len() < pat.len() || pat.is_empty() {
            return None;
        }
        let mut i = self.rare1i;
        while i < haystack.len() {
            i += match memchr(self.rare1, &haystack[i..]) {
                None => return None,
                Some(i) => i,
            };
            let start = i - self.rare1i;
            let end = start + pat.len();
            if end > haystack.len() {
                return None;
            }
            let aligned = &haystack[start..end];
            if aligned[self.rare2i] == self.rare2 && aligned == &*self.pat {
                return Some(start);
            }
            i += 1;
        }
        None
    }

    #[cfg_attr(feature = "perf-inline", inline(always))]
    pub fn is_suffix(&self, text: &[u8]) -> bool {
        if text.len() < self.len() {
            return false;
        }
        text[text.len() - self.len()..] == *self.pat
    }

    pub fn len(&self) -> usize {
        self.pat.len()
    }

    pub fn char_len(&self) -> usize {
        self.char_len
    }

    fn approximate_size(&self) -> usize {
        self.pat.len() * mem::size_of::<u8>()
    }
}

fn char_len_lossy(bytes: &[u8]) -> usize {
    String::from_utf8_lossy(bytes).chars().count()
}

/// An implementation of Tuned Boyer-Moore as laid out by
/// Andrew Hume and Daniel Sunday in "Fast String Searching".
/// O(n) in the size of the input.
///
/// Fast string searching algorithms come in many variations,
/// but they can generally be described in terms of three main
/// components.
///
/// The skip loop is where the string searcher wants to spend
/// as much time as possible. Exactly which character in the
/// pattern the skip loop examines varies from algorithm to
/// algorithm, but in the simplest case this loop repeatedly
/// looks at the last character in the pattern and jumps
/// forward in the input if it is not in the pattern.
/// Robert Boyer and J Moore called this the "fast" loop in
/// their original paper.
///
/// The match loop is responsible for actually examining the
/// whole potentially matching substring. In order to fail
/// faster, the match loop sometimes has a guard test attached.
/// The guard test uses frequency analysis of the different
/// characters in the pattern to choose the least frequently
/// occurring character and uses it to find match failures
/// as quickly as possible.
///
/// The shift rule governs how the algorithm will shuffle its
/// test window in the event of a failure during the match loop.
/// Certain shift rules allow the worst-case run time of the
/// algorithm to be shown to be O(n) in the size of the input
/// rather than O(nm) in the size of the input and the size
/// of the pattern (as naive Boyer-Moore is).
///
/// "Fast String Searching", in addition to presenting a tuned
/// algorithm, provides a comprehensive taxonomy of the many
/// different flavors of string searchers. Under that taxonomy
/// TBM, the algorithm implemented here, uses an unrolled fast
/// skip loop with memchr fallback, a forward match loop with guard,
/// and Sunday's mini delta shift rule. To unpack that you'll have to
/// read the paper.
#[derive(Clone, Debug)]
pub struct BoyerMooreSearch {
    /// The pattern we are going to look for in the haystack.
    pattern: Vec<u8>,

    /// The skip table for the skip loop.
    ///
    /// Maps the character at the end of the input
    /// to a shift.
    skip_table: Vec<usize>,

    /// The guard character (least frequently occurring char).
    guard: u8,
    /// The reverse-index of the guard character in the pattern.
    guard_reverse_idx: usize,

    /// Daniel Sunday's mini generalized delta2 shift table.
    ///
    /// We use a skip loop, so we only have to provide a shift
    /// for the skip char (last char). This is why it is a mini
    /// shift rule.
    md2_shift: usize,
}
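
// Tying the three components above to the fields of `BoyerMooreSearch`
// (descriptive only):
//
// * the skip loop is driven by `skip_table`, which maps the input byte at
//   the end of the current window to a shift;
// * the match loop is `check_match`, which tests the rare `guard` byte at
//   `guard_reverse_idx` from the end of the window before comparing the
//   full pattern;
// * the shift rule is `md2_shift`, applied when the skip table yields 0
//   (the window ends on the pattern's last byte) but the full match fails.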

impl BoyerMooreSearch {
    /// Create a new string searcher, performing whatever
    /// compilation steps are required.
    fn new(pattern: Vec<u8>) -> Self {
        debug_assert!(!pattern.is_empty());

        let (g, gi) = Self::select_guard(pattern.as_slice());
        let skip_table = Self::compile_skip_table(pattern.as_slice());
        let md2_shift = Self::compile_md2_shift(pattern.as_slice());
        BoyerMooreSearch {
            pattern: pattern,
            skip_table: skip_table,
            guard: g,
            guard_reverse_idx: gi,
            md2_shift: md2_shift,
        }
    }

    /// Find the pattern in `haystack`, returning the offset
    /// of the start of the first occurrence of the pattern
    /// in `haystack`.
    #[inline]
    fn find(&self, haystack: &[u8]) -> Option<usize> {
        if haystack.len() < self.pattern.len() {
            return None;
        }

        let mut window_end = self.pattern.len() - 1;

        // Inspired by the grep source. It is a way
        // to do correct loop unrolling without having to place
        // a crashpad of terminating characters at the end in
        // the way described in the Fast String Searching paper.
        const NUM_UNROLL: usize = 10;
        // 1 for the initial position, and 1 for the md2 shift
        let short_circuit = (NUM_UNROLL + 2) * self.pattern.len();

        if haystack.len() > short_circuit {
            // just 1 for the md2 shift
            let backstop =
                haystack.len() - ((NUM_UNROLL + 1) * self.pattern.len());
            loop {
                window_end =
                    match self.skip_loop(haystack, window_end, backstop) {
                        Some(i) => i,
                        None => return None,
                    };
                if window_end >= backstop {
                    break;
                }

                if self.check_match(haystack, window_end) {
                    return Some(window_end - (self.pattern.len() - 1));
                } else {
                    let skip = self.skip_table[haystack[window_end] as usize];
                    window_end +=
                        if skip == 0 { self.md2_shift } else { skip };
                    continue;
                }
            }
        }

        // now process the input after the backstop
        while window_end < haystack.len() {
            let mut skip = self.skip_table[haystack[window_end] as usize];
            if skip == 0 {
                if self.check_match(haystack, window_end) {
                    return Some(window_end - (self.pattern.len() - 1));
                } else {
                    skip = self.md2_shift;
                }
            }
            window_end += skip;
        }

        None
    }

    fn len(&self) -> usize {
        self.pattern.len()
    }

    /// The key heuristic that decides whether BoyerMooreSearch gets used
    /// at all.
    ///
    /// See `rust-lang/regex/issues/408`.
    ///
    /// Tuned Boyer-Moore is actually pretty slow! It turns out that a
    /// hand-rolled platform-specific memchr routine with a bit of frequency
    /// analysis sprinkled on top wins most of the time.
    /// However, there are a few cases where Tuned Boyer-Moore still
    /// wins.
    ///
    /// If the haystack is random, frequency analysis doesn't help us,
    /// so Boyer-Moore will win for sufficiently large needles.
    /// Unfortunately, there is no obvious way to determine this
    /// ahead of time.
    ///
    /// If the pattern itself consists of very common characters,
    /// frequency analysis won't get us anywhere. The most extreme
    /// example of this is a pattern like `eeeeeeeeeeeeeeee`. Fortunately,
    /// this case is wholly determined by the pattern, so we can actually
    /// implement the heuristic.
    ///
    /// A third case is if the pattern is sufficiently long. The idea
    /// here is that once the pattern gets long enough the Tuned
    /// Boyer-Moore skip loop will start making strides long enough
    /// to beat the asm deep magic that is memchr.
    fn should_use(pattern: &[u8]) -> bool {
        // The minimum pattern length required to use TBM.
        const MIN_LEN: usize = 9;
        // The minimum frequency rank (lower is rarer) that every byte in the
        // pattern must have in order to use TBM. That is, if the pattern
        // contains _any_ byte with a lower rank, then TBM won't be used.
        const MIN_CUTOFF: usize = 150;
        // The maximum frequency rank for any byte.
        const MAX_CUTOFF: usize = 255;
        // The scaling factor used to determine the actual cutoff frequency
        // to use (keeping in mind that the minimum frequency rank is bounded
        // by MIN_CUTOFF). This scaling factor is an attempt to make TBM more
        // likely to be used as the pattern grows longer. That is, longer
        // patterns permit somewhat less frequent bytes than shorter patterns,
        // under the assumption that TBM gets better as the pattern gets
        // longer.
        const LEN_CUTOFF_PROPORTION: usize = 4;

        let scaled_rank = pattern.len().wrapping_mul(LEN_CUTOFF_PROPORTION);
        let cutoff = cmp::max(
            MIN_CUTOFF,
            MAX_CUTOFF - cmp::min(MAX_CUTOFF, scaled_rank),
        );
        // The pattern must be long enough to be worthwhile. e.g., memchr will
        // be faster on `e` alone because the pattern is short, even though
        // `e` is quite common.
        pattern.len() > MIN_LEN
            // All the bytes must be more common than the cutoff.
            && pattern.iter().all(|c| freq_rank(*c) >= cutoff)
    }
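
    // Worked numbers for the cutoff above (plain arithmetic, no new
    // behavior): a pattern of length 16 gives scaled_rank = 64 and
    // cutoff = max(150, 255 - 64) = 191, so every byte must rank at least
    // 191. A pattern of length 30 gives scaled_rank = 120 and
    // cutoff = max(150, 255 - 120) = 150; the cutoff bottoms out at
    // MIN_CUTOFF once the pattern reaches length 27.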

    /// Check to see if there is a match at the given position.
    #[inline]
    fn check_match(&self, haystack: &[u8], window_end: usize) -> bool {
        // guard test
        if haystack[window_end - self.guard_reverse_idx] != self.guard {
            return false;
        }

        // match loop
        let window_start = window_end - (self.pattern.len() - 1);
        for i in 0..self.pattern.len() {
            if self.pattern[i] != haystack[window_start + i] {
                return false;
            }
        }

        true
    }

    /// Skip forward according to the shift table.
    ///
    /// Returns the offset of the next occurrence
    /// of the last char in the pattern, or `None`
    /// if it never reappears. If `skip_loop` hits the backstop
    /// it will leave early.
    #[inline]
    fn skip_loop(
        &self,
        haystack: &[u8],
        mut window_end: usize,
        backstop: usize,
    ) -> Option<usize> {
        let window_end_snapshot = window_end;
        let skip_of = |we: usize| -> usize {
            // Unsafe might make this faster, but the benchmarks
            // were hard to interpret.
            self.skip_table[haystack[we] as usize]
        };

        loop {
            let mut skip = skip_of(window_end);
            window_end += skip;
            skip = skip_of(window_end);
            window_end += skip;
            if skip != 0 {
                skip = skip_of(window_end);
                window_end += skip;
                skip = skip_of(window_end);
                window_end += skip;
                skip = skip_of(window_end);
                window_end += skip;
                if skip != 0 {
                    skip = skip_of(window_end);
                    window_end += skip;
                    skip = skip_of(window_end);
                    window_end += skip;
                    skip = skip_of(window_end);
                    window_end += skip;
                    if skip != 0 {
                        skip = skip_of(window_end);
                        window_end += skip;
                        skip = skip_of(window_end);
                        window_end += skip;

                        // If ten iterations did not make at least 16 words
                        // worth of progress, we just fall back on memchr.
                        if window_end - window_end_snapshot
                            > 16 * mem::size_of::<usize>()
                        {
                            // Returning a window_end >= backstop will
                            // immediately break us out of the inner loop in
                            // `find`.
                            if window_end >= backstop {
                                return Some(window_end);
                            }

                            continue; // we made enough progress
                        } else {
                            // In case we are already there, and so that
                            // we will catch the guard char.
                            window_end = window_end
                                .checked_sub(1 + self.guard_reverse_idx)
                                .unwrap_or(0);

                            match memchr(self.guard, &haystack[window_end..]) {
                                None => return None,
                                Some(g_idx) => {
                                    return Some(
                                        window_end
                                            + g_idx
                                            + self.guard_reverse_idx,
                                    );
                                }
                            }
                        }
                    }
                }
            }

            return Some(window_end);
        }
    }

    /// Compute the ufast skip table.
    fn compile_skip_table(pattern: &[u8]) -> Vec<usize> {
        let mut tab = vec![pattern.len(); 256];

        // For every char in the pattern, we write a skip
        // that will line us up with the rightmost occurrence.
        //
        // N.B. the sentinel (0) is written by the last
        // loop iteration.
        for (i, c) in pattern.iter().enumerate() {
            tab[*c as usize] = (pattern.len() - 1) - i;
        }

        tab
    }
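
    // Worked example (illustrative): for the pattern b"abcab" (length 5),
    // every entry of `tab` starts at 5 and the loop then writes
    // tab[b'a'] = 1, tab[b'b'] = 0 and tab[b'c'] = 2, with rightmost
    // occurrences overwriting earlier ones. Bytes not in the pattern keep
    // the full shift of 5, and the 0 for the final pattern byte is the
    // sentinel that sends `find` into `check_match`.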

    /// Select the guard character based on the precomputed
    /// frequency table.
    fn select_guard(pattern: &[u8]) -> (u8, usize) {
        let mut rarest = pattern[0];
        let mut rarest_rev_idx = pattern.len() - 1;
        for (i, c) in pattern.iter().enumerate() {
            if freq_rank(*c) < freq_rank(rarest) {
                rarest = *c;
                rarest_rev_idx = (pattern.len() - 1) - i;
            }
        }

        (rarest, rarest_rev_idx)
    }

    /// If there is another occurrence of the skip
    /// char, shift to it, otherwise just shift to
    /// the next window.
    fn compile_md2_shift(pattern: &[u8]) -> usize {
        let shiftc = *pattern.last().unwrap();

        // For a pattern of length 1 we will never apply the
        // shift rule, so we use a poison value on the principle
        // that failing fast is a good thing.
        if pattern.len() == 1 {
            return 0xDEADBEAF;
        }

        let mut i = pattern.len() - 2;
        while i > 0 {
            if pattern[i] == shiftc {
                return (pattern.len() - 1) - i;
            }
            i -= 1;
        }

        // The skip char never re-occurs in the pattern, so
        // we can just shift the whole window length.
        pattern.len() - 1
    }
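
    // Worked example (illustrative): for b"abcab" the skip char is the
    // final b'b', and scanning right to left from index 3 finds another
    // b'b' at index 1, so `md2_shift == (5 - 1) - 1 == 3`. For b"abc" the
    // skip char b'c' never reappears, so the whole-window shift of
    // `3 - 1 == 2` is returned instead.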

    fn approximate_size(&self) -> usize {
        (self.pattern.len() * mem::size_of::<u8>())
            + (256 * mem::size_of::<usize>()) // skip table
    }
}

fn freq_rank(b: u8) -> usize {
    BYTE_FREQUENCIES[b as usize] as usize
}

#[cfg(test)]
mod tests {
    use super::{BoyerMooreSearch, FreqyPacked};

    //
    // Unit Tests
    //

    // The "hello, world" of string searching
    #[test]
    fn bm_find_subs() {
        let searcher = BoyerMooreSearch::new(Vec::from(&b"pattern"[..]));
        let haystack = b"I keep seeing patterns in this text";
        assert_eq!(14, searcher.find(haystack).unwrap());
    }

    #[test]
    fn bm_find_no_subs() {
        let searcher = BoyerMooreSearch::new(Vec::from(&b"pattern"[..]));
        let haystack = b"I keep seeing needles in this text";
        assert_eq!(None, searcher.find(haystack));
    }

    //
    // Regression Tests
    //

    #[test]
    fn bm_skip_reset_bug() {
        let haystack = vec![0, 0, 0, 0, 0, 1, 1, 0];
        let needle = vec![0, 1, 1, 0];

        let searcher = BoyerMooreSearch::new(needle);
        let offset = searcher.find(haystack.as_slice()).unwrap();
        assert_eq!(4, offset);
    }

    #[test]
    fn bm_backstop_underflow_bug() {
        let haystack = vec![0, 0];
        let needle = vec![0, 0];

        let searcher = BoyerMooreSearch::new(needle);
        let offset = searcher.find(haystack.as_slice()).unwrap();
        assert_eq!(0, offset);
    }

    #[test]
    fn bm_naive_off_by_one_bug() {
        let haystack = vec![91];
        let needle = vec![91];

        let naive_offset = naive_find(&needle, &haystack).unwrap();
        assert_eq!(0, naive_offset);
    }

    #[test]
    fn bm_memchr_fallback_indexing_bug() {
        let mut haystack = vec![
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 87, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        ];
        let needle = vec![1, 1, 1, 1, 32, 32, 87];
        let needle_start = haystack.len();
        haystack.extend(needle.clone());

        let searcher = BoyerMooreSearch::new(needle);
        assert_eq!(needle_start, searcher.find(haystack.as_slice()).unwrap());
    }

    #[test]
    fn bm_backstop_boundary() {
        let haystack = b"\
// aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
e_data.clone_created(entity_id, entity_to_add.entity_id);
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
"
        .to_vec();
        let needle = b"clone_created".to_vec();

        let searcher = BoyerMooreSearch::new(needle);
        let result = searcher.find(&haystack);
        assert_eq!(Some(43), result);
    }

    #[test]
    fn bm_win_gnu_indexing_bug() {
        let haystack_raw = vec![
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
            0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        ];
        let needle = vec![1, 1, 1, 1, 1, 1, 1];
        let haystack = haystack_raw.as_slice();

        BoyerMooreSearch::new(needle.clone()).find(haystack);
    }

    //
    // QuickCheck Properties
    //

    use quickcheck::TestResult;

    fn naive_find(needle: &[u8], haystack: &[u8]) -> Option<usize> {
        assert!(needle.len() <= haystack.len());

        for i in 0..(haystack.len() - (needle.len() - 1)) {
            if haystack[i] == needle[0]
                && &haystack[i..(i + needle.len())] == needle
            {
                return Some(i);
            }
        }

        None
    }

    quickcheck! {
        fn qc_bm_equals_naive_find(pile1: Vec<u8>, pile2: Vec<u8>) -> TestResult {
            if pile1.len() == 0 || pile2.len() == 0 {
                return TestResult::discard();
            }

            let (needle, haystack) = if pile1.len() < pile2.len() {
                (pile1, pile2.as_slice())
            } else {
                (pile2, pile1.as_slice())
            };

            let searcher = BoyerMooreSearch::new(needle.clone());
            TestResult::from_bool(
                searcher.find(haystack) == naive_find(&needle, haystack))
        }

        fn qc_bm_equals_single(pile1: Vec<u8>, pile2: Vec<u8>) -> TestResult {
            if pile1.len() == 0 || pile2.len() == 0 {
                return TestResult::discard();
            }

            let (needle, haystack) = if pile1.len() < pile2.len() {
                (pile1, pile2.as_slice())
            } else {
                (pile2, pile1.as_slice())
            };

            let bm_searcher = BoyerMooreSearch::new(needle.clone());
            let freqy_memchr = FreqyPacked::new(needle);
            TestResult::from_bool(
                bm_searcher.find(haystack) == freqy_memchr.find(haystack))
        }

        fn qc_bm_finds_trailing_needle(
            haystack_pre: Vec<u8>,
            needle: Vec<u8>
        ) -> TestResult {
            if needle.len() == 0 {
                return TestResult::discard();
            }

            let mut haystack = haystack_pre.clone();
            let searcher = BoyerMooreSearch::new(needle.clone());

            if haystack.len() >= needle.len() &&
                searcher.find(haystack.as_slice()).is_some() {
                return TestResult::discard();
            }

            haystack.extend(needle.clone());

            // What if the tail of the haystack can start the
            // needle?
            let start = haystack_pre.len()
                .checked_sub(needle.len())
                .unwrap_or(0);
            for i in 0..(needle.len() - 1) {
                if searcher.find(&haystack[(i + start)..]).is_some() {
                    return TestResult::discard();
                }
            }

            TestResult::from_bool(
                searcher.find(haystack.as_slice())
                        .map(|x| x == haystack_pre.len())
                        .unwrap_or(false))
        }

        // qc_equals_* is only testing the negative case as @burntsushi
        // pointed out in https://github.com/rust-lang/regex/issues/446.
        // This quickcheck prop represents an effort to force testing of
        // the positive case. qc_bm_finds_first and qc_bm_finds_trailing_needle
        // already check some of the positive cases, but they don't cover
        // cases where the needle is in the middle of haystack. This prop
        // fills that hole.
        fn qc_bm_finds_subslice(
            haystack: Vec<u8>,
            needle_start: usize,
            needle_length: usize
        ) -> TestResult {
            if haystack.len() == 0 {
                return TestResult::discard();
            }

            let needle_start = needle_start % haystack.len();
            let needle_length = needle_length % (haystack.len() - needle_start);

            if needle_length == 0 {
                return TestResult::discard();
            }

            let needle = &haystack[needle_start..(needle_start + needle_length)];

            let bm_searcher = BoyerMooreSearch::new(needle.to_vec());

            let start = naive_find(&needle, &haystack);
            match start {
                None => TestResult::from_bool(false),
                Some(nf_start) =>
                    TestResult::from_bool(
                        nf_start <= needle_start
                            && bm_searcher.find(&haystack) == start
                    )
            }
        }

        fn qc_bm_finds_first(needle: Vec<u8>) -> TestResult {
            if needle.len() == 0 {
                return TestResult::discard();
            }

            let mut haystack = needle.clone();
            let searcher = BoyerMooreSearch::new(needle.clone());
            haystack.extend(needle);

            TestResult::from_bool(
                searcher.find(haystack.as_slice())
                        .map(|x| x == 0)
                        .unwrap_or(false))
        }
    }
}