1 use std::cmp;
2 use std::fmt;
3 use std::panic::{RefUnwindSafe, UnwindSafe};
4 use std::u8;
5 
6 use memchr::{memchr, memchr2, memchr3};
7 
8 use crate::ahocorasick::MatchKind;
9 use crate::packed;
10 use crate::Match;
11 
12 /// A candidate is the result of running a prefilter on a haystack at a
13 /// particular position. The result is either no match, a confirmed match or
14 /// a possible match.
15 ///
16 /// When no match is returned, the prefilter is guaranteeing that no possible
17 /// match can be found in the haystack, and the caller may trust this. That is,
18 /// all correct prefilters must never report false negatives.
19 ///
20 /// In some cases, a prefilter can confirm a match very quickly, in which case,
21 /// the caller may use this to stop what it's doing and report the match. In
22 /// this case, prefilter implementations must never report a false positive.
23 /// In other cases, the prefilter can only report a potential match, in which
24 /// case the callers must attempt to confirm the match. In this case, prefilter
25 /// implementations are permitted to return false positives.
26 #[derive(Clone, Debug)]
27 pub enum Candidate {
28     None,
29     Match(Match),
30     PossibleStartOfMatch(usize),
31 }
32 
33 impl Candidate {
34     /// Convert this candidate into an option. This is useful when callers
35     /// do not distinguish between true positives and false positives (i.e.,
36     /// the caller must always confirm the match in order to update some other
37     /// state).
into_option(self) -> Option<usize>38     pub fn into_option(self) -> Option<usize> {
39         match self {
40             Candidate::None => None,
41             Candidate::Match(ref m) => Some(m.start()),
42             Candidate::PossibleStartOfMatch(start) => Some(start),
43         }
44     }
45 }
46 
47 /// A prefilter describes the behavior of fast literal scanners for quickly
48 /// skipping past bytes in the haystack that we know cannot possibly
49 /// participate in a match.
50 pub trait Prefilter:
51     Send + Sync + RefUnwindSafe + UnwindSafe + fmt::Debug
52 {
53     /// Returns the next possible match candidate. This may yield false
54     /// positives, so callers must confirm a match starting at the position
55     /// returned. This, however, must never produce false negatives. That is,
56     /// this must, at minimum, return the starting position of the next match
57     /// in the given haystack after or at the given position.
next_candidate( &self, state: &mut PrefilterState, haystack: &[u8], at: usize, ) -> Candidate58     fn next_candidate(
59         &self,
60         state: &mut PrefilterState,
61         haystack: &[u8],
62         at: usize,
63     ) -> Candidate;
64 
65     /// A method for cloning a prefilter, to work-around the fact that Clone
66     /// is not object-safe.
clone_prefilter(&self) -> Box<dyn Prefilter>67     fn clone_prefilter(&self) -> Box<dyn Prefilter>;
68 
69     /// Returns the approximate total amount of heap used by this prefilter, in
70     /// units of bytes.
heap_bytes(&self) -> usize71     fn heap_bytes(&self) -> usize;
72 
73     /// Returns true if and only if this prefilter never returns false
74     /// positives. This is useful for completely avoiding the automaton
75     /// when the prefilter can quickly confirm its own matches.
76     ///
77     /// By default, this returns true, which is conservative; it is always
78     /// correct to return `true`. Returning `false` here and reporting a false
79     /// positive will result in incorrect searches.
reports_false_positives(&self) -> bool80     fn reports_false_positives(&self) -> bool {
81         true
82     }
83 
84     /// Returns true if and only if this prefilter may look for a non-starting
85     /// position of a match.
86     ///
87     /// This is useful in a streaming context where prefilters that don't look
88     /// for a starting position of a match can be quite difficult to deal with.
89     ///
90     /// This returns false by default.
looks_for_non_start_of_match(&self) -> bool91     fn looks_for_non_start_of_match(&self) -> bool {
92         false
93     }
94 }
95 
96 impl<'a, P: Prefilter + ?Sized> Prefilter for &'a P {
97     #[inline]
next_candidate( &self, state: &mut PrefilterState, haystack: &[u8], at: usize, ) -> Candidate98     fn next_candidate(
99         &self,
100         state: &mut PrefilterState,
101         haystack: &[u8],
102         at: usize,
103     ) -> Candidate {
104         (**self).next_candidate(state, haystack, at)
105     }
106 
clone_prefilter(&self) -> Box<dyn Prefilter>107     fn clone_prefilter(&self) -> Box<dyn Prefilter> {
108         (**self).clone_prefilter()
109     }
110 
heap_bytes(&self) -> usize111     fn heap_bytes(&self) -> usize {
112         (**self).heap_bytes()
113     }
114 
reports_false_positives(&self) -> bool115     fn reports_false_positives(&self) -> bool {
116         (**self).reports_false_positives()
117     }
118 }
119 
120 /// A convenience object for representing any type that implements Prefilter
121 /// and is cloneable.
122 #[derive(Debug)]
123 pub struct PrefilterObj(Box<dyn Prefilter>);
124 
125 impl Clone for PrefilterObj {
clone(&self) -> Self126     fn clone(&self) -> Self {
127         PrefilterObj(self.0.clone_prefilter())
128     }
129 }
130 
131 impl PrefilterObj {
132     /// Create a new prefilter object.
new<T: Prefilter + 'static>(t: T) -> PrefilterObj133     pub fn new<T: Prefilter + 'static>(t: T) -> PrefilterObj {
134         PrefilterObj(Box::new(t))
135     }
136 
137     /// Return the underlying prefilter trait object.
as_ref(&self) -> &dyn Prefilter138     pub fn as_ref(&self) -> &dyn Prefilter {
139         &*self.0
140     }
141 }
142 
143 /// PrefilterState tracks state associated with the effectiveness of a
144 /// prefilter. It is used to track how many bytes, on average, are skipped by
145 /// the prefilter. If this average dips below a certain threshold over time,
146 /// then the state renders the prefilter inert and stops using it.
147 ///
148 /// A prefilter state should be created for each search. (Where creating an
149 /// iterator via, e.g., `find_iter`, is treated as a single search.)
150 #[derive(Clone, Debug)]
151 pub struct PrefilterState {
152     /// The number of skips that has been executed.
153     skips: usize,
154     /// The total number of bytes that have been skipped.
155     skipped: usize,
156     /// The maximum length of a match. This is used to help determine how many
157     /// bytes on average should be skipped in order for a prefilter to be
158     /// effective.
159     max_match_len: usize,
160     /// Once this heuristic has been deemed permanently ineffective, it will be
161     /// inert throughout the rest of its lifetime. This serves as a cheap way
162     /// to check inertness.
163     inert: bool,
164     /// The last (absolute) position at which a prefilter scanned to.
165     /// Prefilters can use this position to determine whether to re-scan or
166     /// not.
167     ///
168     /// Unlike other things that impact effectiveness, this is a fleeting
169     /// condition. That is, a prefilter can be considered ineffective if it is
170     /// at a position before `last_scan_at`, but can become effective again
171     /// once the search moves past `last_scan_at`.
172     ///
173     /// The utility of this is to both avoid additional overhead from calling
174     /// the prefilter and to avoid quadratic behavior. This ensures that a
175     /// prefilter will scan any particular byte at most once. (Note that some
176     /// prefilters, like the start-byte prefilter, do not need to use this
177     /// field at all, since it only looks for starting bytes.)
178     last_scan_at: usize,
179 }
180 
181 impl PrefilterState {
182     /// The minimum number of skip attempts to try before considering whether
183     /// a prefilter is effective or not.
184     const MIN_SKIPS: usize = 40;
185 
186     /// The minimum amount of bytes that skipping must average, expressed as a
187     /// factor of the multiple of the length of a possible match.
188     ///
189     /// That is, after MIN_SKIPS have occurred, if the average number of bytes
190     /// skipped ever falls below MIN_AVG_FACTOR * max-match-length, then the
191     /// prefilter outed to be rendered inert.
192     const MIN_AVG_FACTOR: usize = 2;
193 
194     /// Create a fresh prefilter state.
new(max_match_len: usize) -> PrefilterState195     pub fn new(max_match_len: usize) -> PrefilterState {
196         PrefilterState {
197             skips: 0,
198             skipped: 0,
199             max_match_len,
200             inert: false,
201             last_scan_at: 0,
202         }
203     }
204 
205     /// Create a prefilter state that always disables the prefilter.
disabled() -> PrefilterState206     pub fn disabled() -> PrefilterState {
207         PrefilterState {
208             skips: 0,
209             skipped: 0,
210             max_match_len: 0,
211             inert: true,
212             last_scan_at: 0,
213         }
214     }
215 
216     /// Update this state with the number of bytes skipped on the last
217     /// invocation of the prefilter.
218     #[inline]
update_skipped_bytes(&mut self, skipped: usize)219     fn update_skipped_bytes(&mut self, skipped: usize) {
220         self.skips += 1;
221         self.skipped += skipped;
222     }
223 
224     /// Updates the position at which the last scan stopped. This may be
225     /// greater than the position of the last candidate reported. For example,
226     /// searching for the "rare" byte `z` in `abczdef` for the pattern `abcz`
227     /// will report a candidate at position `0`, but the end of its last scan
228     /// will be at position `3`.
229     ///
230     /// This position factors into the effectiveness of this prefilter. If the
231     /// current position is less than the last position at which a scan ended,
232     /// then the prefilter should not be re-run until the search moves past
233     /// that position.
234     #[inline]
update_at(&mut self, at: usize)235     fn update_at(&mut self, at: usize) {
236         if at > self.last_scan_at {
237             self.last_scan_at = at;
238         }
239     }
240 
241     /// Return true if and only if this state indicates that a prefilter is
242     /// still effective.
243     ///
244     /// The given pos should correspond to the current starting position of the
245     /// search.
246     #[inline]
is_effective(&mut self, at: usize) -> bool247     pub fn is_effective(&mut self, at: usize) -> bool {
248         if self.inert {
249             return false;
250         }
251         if at < self.last_scan_at {
252             return false;
253         }
254         if self.skips < PrefilterState::MIN_SKIPS {
255             return true;
256         }
257 
258         let min_avg = PrefilterState::MIN_AVG_FACTOR * self.max_match_len;
259         if self.skipped >= min_avg * self.skips {
260             return true;
261         }
262 
263         // We're inert.
264         self.inert = true;
265         false
266     }
267 }
268 
269 /// A builder for constructing the best possible prefilter. When constructed,
270 /// this builder will heuristically select the best prefilter it can build,
271 /// if any, and discard the rest.
272 #[derive(Debug)]
273 pub struct Builder {
274     count: usize,
275     ascii_case_insensitive: bool,
276     start_bytes: StartBytesBuilder,
277     rare_bytes: RareBytesBuilder,
278     packed: Option<packed::Builder>,
279 }
280 
281 impl Builder {
282     /// Create a new builder for constructing the best possible prefilter.
new(kind: MatchKind) -> Builder283     pub fn new(kind: MatchKind) -> Builder {
284         let pbuilder = kind
285             .as_packed()
286             .map(|kind| packed::Config::new().match_kind(kind).builder());
287         Builder {
288             count: 0,
289             ascii_case_insensitive: false,
290             start_bytes: StartBytesBuilder::new(),
291             rare_bytes: RareBytesBuilder::new(),
292             packed: pbuilder,
293         }
294     }
295 
296     /// Enable ASCII case insensitivity. When set, byte strings added to this
297     /// builder will be interpreted without respect to ASCII case.
ascii_case_insensitive(mut self, yes: bool) -> Builder298     pub fn ascii_case_insensitive(mut self, yes: bool) -> Builder {
299         self.ascii_case_insensitive = yes;
300         self.start_bytes = self.start_bytes.ascii_case_insensitive(yes);
301         self.rare_bytes = self.rare_bytes.ascii_case_insensitive(yes);
302         self
303     }
304 
305     /// Return a prefilter suitable for quickly finding potential matches.
306     ///
307     /// All patterns added to an Aho-Corasick automaton should be added to this
308     /// builder before attempting to construct the prefilter.
build(&self) -> Option<PrefilterObj>309     pub fn build(&self) -> Option<PrefilterObj> {
310         // match (self.start_bytes.build(), self.rare_bytes.build()) {
311         match (self.start_bytes.build(), self.rare_bytes.build()) {
312             // If we could build both start and rare prefilters, then there are
313             // a few cases in which we'd want to use the start-byte prefilter
314             // over the rare-byte prefilter, since the former has lower
315             // overhead.
316             (prestart @ Some(_), prerare @ Some(_)) => {
317                 // If the start-byte prefilter can scan for a smaller number
318                 // of bytes than the rare-byte prefilter, then it's probably
319                 // faster.
320                 let has_fewer_bytes =
321                     self.start_bytes.count < self.rare_bytes.count;
322                 // Otherwise, if the combined frequency rank of the detected
323                 // bytes in the start-byte prefilter is "close" to the combined
324                 // frequency rank of the rare-byte prefilter, then we pick
325                 // the start-byte prefilter even if the rare-byte prefilter
326                 // heuristically searches for rare bytes. This is because the
327                 // rare-byte prefilter has higher constant costs, so we tend to
328                 // prefer the start-byte prefilter when we can.
329                 let has_rarer_bytes =
330                     self.start_bytes.rank_sum <= self.rare_bytes.rank_sum + 50;
331                 if has_fewer_bytes || has_rarer_bytes {
332                     prestart
333                 } else {
334                     prerare
335                 }
336             }
337             (prestart @ Some(_), None) => prestart,
338             (None, prerare @ Some(_)) => prerare,
339             (None, None) if self.ascii_case_insensitive => None,
340             (None, None) => self
341                 .packed
342                 .as_ref()
343                 .and_then(|b| b.build())
344                 .map(|s| PrefilterObj::new(Packed(s))),
345         }
346     }
347 
348     /// Add a literal string to this prefilter builder.
add(&mut self, bytes: &[u8])349     pub fn add(&mut self, bytes: &[u8]) {
350         self.count += 1;
351         self.start_bytes.add(bytes);
352         self.rare_bytes.add(bytes);
353         if let Some(ref mut pbuilder) = self.packed {
354             pbuilder.add(bytes);
355         }
356     }
357 }
358 
359 /// A type that wraps a packed searcher and implements the `Prefilter`
360 /// interface.
361 #[derive(Clone, Debug)]
362 struct Packed(packed::Searcher);
363 
364 impl Prefilter for Packed {
next_candidate( &self, _state: &mut PrefilterState, haystack: &[u8], at: usize, ) -> Candidate365     fn next_candidate(
366         &self,
367         _state: &mut PrefilterState,
368         haystack: &[u8],
369         at: usize,
370     ) -> Candidate {
371         self.0.find_at(haystack, at).map_or(Candidate::None, Candidate::Match)
372     }
373 
clone_prefilter(&self) -> Box<dyn Prefilter>374     fn clone_prefilter(&self) -> Box<dyn Prefilter> {
375         Box::new(self.clone())
376     }
377 
heap_bytes(&self) -> usize378     fn heap_bytes(&self) -> usize {
379         self.0.heap_bytes()
380     }
381 
reports_false_positives(&self) -> bool382     fn reports_false_positives(&self) -> bool {
383         false
384     }
385 }
386 
387 /// A builder for constructing a rare byte prefilter.
388 ///
389 /// A rare byte prefilter attempts to pick out a small set of rare bytes that
390 /// occurr in the patterns, and then quickly scan to matches of those rare
391 /// bytes.
392 #[derive(Clone, Debug)]
393 struct RareBytesBuilder {
394     /// Whether this prefilter should account for ASCII case insensitivity or
395     /// not.
396     ascii_case_insensitive: bool,
397     /// A set of rare bytes, indexed by byte value.
398     rare_set: ByteSet,
399     /// A set of byte offsets associated with bytes in a pattern. An entry
400     /// corresponds to a particular bytes (its index) and is only non-zero if
401     /// the byte occurred at an offset greater than 0 in at least one pattern.
402     ///
403     /// If a byte's offset is not representable in 8 bits, then the rare bytes
404     /// prefilter becomes inert.
405     byte_offsets: RareByteOffsets,
406     /// Whether this is available as a prefilter or not. This can be set to
407     /// false during construction if a condition is seen that invalidates the
408     /// use of the rare-byte prefilter.
409     available: bool,
410     /// The number of bytes set to an active value in `byte_offsets`.
411     count: usize,
412     /// The sum of frequency ranks for the rare bytes detected. This is
413     /// intended to give a heuristic notion of how rare the bytes are.
414     rank_sum: u16,
415 }
416 
417 /// A set of bytes.
418 #[derive(Clone, Copy)]
419 struct ByteSet([bool; 256]);
420 
421 impl ByteSet {
empty() -> ByteSet422     fn empty() -> ByteSet {
423         ByteSet([false; 256])
424     }
425 
insert(&mut self, b: u8) -> bool426     fn insert(&mut self, b: u8) -> bool {
427         let new = !self.contains(b);
428         self.0[b as usize] = true;
429         new
430     }
431 
contains(&self, b: u8) -> bool432     fn contains(&self, b: u8) -> bool {
433         self.0[b as usize]
434     }
435 }
436 
437 impl fmt::Debug for ByteSet {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result438     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
439         let mut bytes = vec![];
440         for b in 0..=255 {
441             if self.contains(b) {
442                 bytes.push(b);
443             }
444         }
445         f.debug_struct("ByteSet").field("set", &bytes).finish()
446     }
447 }
448 
449 /// A set of byte offsets, keyed by byte.
450 #[derive(Clone, Copy)]
451 struct RareByteOffsets {
452     /// Each entry corresponds to the maximum offset of the corresponding
453     /// byte across all patterns seen.
454     set: [RareByteOffset; 256],
455 }
456 
457 impl RareByteOffsets {
458     /// Create a new empty set of rare byte offsets.
empty() -> RareByteOffsets459     pub fn empty() -> RareByteOffsets {
460         RareByteOffsets { set: [RareByteOffset::default(); 256] }
461     }
462 
463     /// Add the given offset for the given byte to this set. If the offset is
464     /// greater than the existing offset, then it overwrites the previous
465     /// value and returns false. If there is no previous value set, then this
466     /// sets it and returns true.
set(&mut self, byte: u8, off: RareByteOffset)467     pub fn set(&mut self, byte: u8, off: RareByteOffset) {
468         self.set[byte as usize].max =
469             cmp::max(self.set[byte as usize].max, off.max);
470     }
471 }
472 
473 impl fmt::Debug for RareByteOffsets {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result474     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
475         let mut offsets = vec![];
476         for off in self.set.iter() {
477             if off.max > 0 {
478                 offsets.push(off);
479             }
480         }
481         f.debug_struct("RareByteOffsets").field("set", &offsets).finish()
482     }
483 }
484 
485 /// Offsets associated with an occurrence of a "rare" byte in any of the
486 /// patterns used to construct a single Aho-Corasick automaton.
487 #[derive(Clone, Copy, Debug)]
488 struct RareByteOffset {
489     /// The maximum offset at which a particular byte occurs from the start
490     /// of any pattern. This is used as a shift amount. That is, when an
491     /// occurrence of this byte is found, the candidate position reported by
492     /// the prefilter is `position_of_byte - max`, such that the automaton
493     /// will begin its search at a position that is guaranteed to observe a
494     /// match.
495     ///
496     /// To avoid accidentally quadratic behavior, a prefilter is considered
497     /// ineffective when it is asked to start scanning from a position that it
498     /// has already scanned past.
499     ///
500     /// Using a `u8` here means that if we ever see a pattern that's longer
501     /// than 255 bytes, then the entire rare byte prefilter is disabled.
502     max: u8,
503 }
504 
505 impl Default for RareByteOffset {
default() -> RareByteOffset506     fn default() -> RareByteOffset {
507         RareByteOffset { max: 0 }
508     }
509 }
510 
511 impl RareByteOffset {
512     /// Create a new rare byte offset. If the given offset is too big, then
513     /// None is returned. In that case, callers should render the rare bytes
514     /// prefilter inert.
new(max: usize) -> Option<RareByteOffset>515     fn new(max: usize) -> Option<RareByteOffset> {
516         if max > u8::MAX as usize {
517             None
518         } else {
519             Some(RareByteOffset { max: max as u8 })
520         }
521     }
522 }
523 
524 impl RareBytesBuilder {
525     /// Create a new builder for constructing a rare byte prefilter.
new() -> RareBytesBuilder526     fn new() -> RareBytesBuilder {
527         RareBytesBuilder {
528             ascii_case_insensitive: false,
529             rare_set: ByteSet::empty(),
530             byte_offsets: RareByteOffsets::empty(),
531             available: true,
532             count: 0,
533             rank_sum: 0,
534         }
535     }
536 
537     /// Enable ASCII case insensitivity. When set, byte strings added to this
538     /// builder will be interpreted without respect to ASCII case.
ascii_case_insensitive(mut self, yes: bool) -> RareBytesBuilder539     fn ascii_case_insensitive(mut self, yes: bool) -> RareBytesBuilder {
540         self.ascii_case_insensitive = yes;
541         self
542     }
543 
544     /// Build the rare bytes prefilter.
545     ///
546     /// If there are more than 3 distinct starting bytes, or if heuristics
547     /// otherwise determine that this prefilter should not be used, then `None`
548     /// is returned.
build(&self) -> Option<PrefilterObj>549     fn build(&self) -> Option<PrefilterObj> {
550         if !self.available || self.count > 3 {
551             return None;
552         }
553         let (mut bytes, mut len) = ([0; 3], 0);
554         for b in 0..=255 {
555             if self.rare_set.contains(b) {
556                 bytes[len] = b as u8;
557                 len += 1;
558             }
559         }
560         match len {
561             0 => None,
562             1 => Some(PrefilterObj::new(RareBytesOne {
563                 byte1: bytes[0],
564                 offset: self.byte_offsets.set[bytes[0] as usize],
565             })),
566             2 => Some(PrefilterObj::new(RareBytesTwo {
567                 offsets: self.byte_offsets,
568                 byte1: bytes[0],
569                 byte2: bytes[1],
570             })),
571             3 => Some(PrefilterObj::new(RareBytesThree {
572                 offsets: self.byte_offsets,
573                 byte1: bytes[0],
574                 byte2: bytes[1],
575                 byte3: bytes[2],
576             })),
577             _ => unreachable!(),
578         }
579     }
580 
581     /// Add a byte string to this builder.
582     ///
583     /// All patterns added to an Aho-Corasick automaton should be added to this
584     /// builder before attempting to construct the prefilter.
add(&mut self, bytes: &[u8])585     fn add(&mut self, bytes: &[u8]) {
586         // If we've already given up, then do nothing.
587         if !self.available {
588             return;
589         }
590         // If we've already blown our budget, then don't waste time looking
591         // for more rare bytes.
592         if self.count > 3 {
593             self.available = false;
594             return;
595         }
596         // If the pattern is too long, then our offset table is bunk, so
597         // give up.
598         if bytes.len() >= 256 {
599             self.available = false;
600             return;
601         }
602         let mut rarest = match bytes.get(0) {
603             None => return,
604             Some(&b) => (b, freq_rank(b)),
605         };
606         // The idea here is to look for the rarest byte in each pattern, and
607         // add that to our set. As a special exception, if we see a byte that
608         // we've already added, then we immediately stop and choose that byte,
609         // even if there's another rare byte in the pattern. This helps us
610         // apply the rare byte optimization in more cases by attempting to pick
611         // bytes that are in common between patterns. So for example, if we
612         // were searching for `Sherlock` and `lockjaw`, then this would pick
613         // `k` for both patterns, resulting in the use of `memchr` instead of
614         // `memchr2` for `k` and `j`.
615         let mut found = false;
616         for (pos, &b) in bytes.iter().enumerate() {
617             self.set_offset(pos, b);
618             if found {
619                 continue;
620             }
621             if self.rare_set.contains(b) {
622                 found = true;
623                 continue;
624             }
625             let rank = freq_rank(b);
626             if rank < rarest.1 {
627                 rarest = (b, rank);
628             }
629         }
630         if !found {
631             self.add_rare_byte(rarest.0);
632         }
633     }
634 
set_offset(&mut self, pos: usize, byte: u8)635     fn set_offset(&mut self, pos: usize, byte: u8) {
636         // This unwrap is OK because pos is never bigger than our max.
637         let offset = RareByteOffset::new(pos).unwrap();
638         self.byte_offsets.set(byte, offset);
639         if self.ascii_case_insensitive {
640             self.byte_offsets.set(opposite_ascii_case(byte), offset);
641         }
642     }
643 
add_rare_byte(&mut self, byte: u8)644     fn add_rare_byte(&mut self, byte: u8) {
645         self.add_one_rare_byte(byte);
646         if self.ascii_case_insensitive {
647             self.add_one_rare_byte(opposite_ascii_case(byte));
648         }
649     }
650 
add_one_rare_byte(&mut self, byte: u8)651     fn add_one_rare_byte(&mut self, byte: u8) {
652         if self.rare_set.insert(byte) {
653             self.count += 1;
654             self.rank_sum += freq_rank(byte) as u16;
655         }
656     }
657 }
658 
659 /// A prefilter for scanning for a single "rare" byte.
660 #[derive(Clone, Debug)]
661 struct RareBytesOne {
662     byte1: u8,
663     offset: RareByteOffset,
664 }
665 
666 impl Prefilter for RareBytesOne {
next_candidate( &self, state: &mut PrefilterState, haystack: &[u8], at: usize, ) -> Candidate667     fn next_candidate(
668         &self,
669         state: &mut PrefilterState,
670         haystack: &[u8],
671         at: usize,
672     ) -> Candidate {
673         memchr(self.byte1, &haystack[at..])
674             .map(|i| {
675                 let pos = at + i;
676                 state.last_scan_at = pos;
677                 cmp::max(at, pos.saturating_sub(self.offset.max as usize))
678             })
679             .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
680     }
681 
clone_prefilter(&self) -> Box<dyn Prefilter>682     fn clone_prefilter(&self) -> Box<dyn Prefilter> {
683         Box::new(self.clone())
684     }
685 
heap_bytes(&self) -> usize686     fn heap_bytes(&self) -> usize {
687         0
688     }
689 
looks_for_non_start_of_match(&self) -> bool690     fn looks_for_non_start_of_match(&self) -> bool {
691         // TODO: It should be possible to use a rare byte prefilter in a
692         // streaming context. The main problem is that we usually assume that
693         // if a prefilter has scanned some text and not found anything, then no
694         // match *starts* in that text. This doesn't matter in non-streaming
695         // contexts, but in a streaming context, if we're looking for a byte
696         // that doesn't start at the beginning of a match and don't find it,
697         // then it's still possible for a match to start at the end of the
698         // current buffer content. In order to fix this, the streaming searcher
699         // would need to become aware of prefilters that do this and use the
700         // appropriate offset in various places. It is quite a delicate change
701         // and probably shouldn't be attempted until streaming search has a
702         // better testing strategy. In particular, we'd really like to be able
703         // to vary the buffer size to force strange cases that occur at the
704         // edge of the buffer. If we make the buffer size minimal, then these
705         // cases occur more frequently and easier.
706         //
707         // This is also a bummer because this means that if the prefilter
708         // builder chose a rare byte prefilter, then a streaming search won't
709         // use any prefilter at all because the builder doesn't know how it's
710         // going to be used. Assuming we don't make streaming search aware of
711         // these special types of prefilters as described above, we could fix
712         // this by building a "backup" prefilter that could be used when the
713         // rare byte prefilter could not. But that's a bandaide. Sigh.
714         true
715     }
716 }
717 
718 /// A prefilter for scanning for two "rare" bytes.
719 #[derive(Clone, Debug)]
720 struct RareBytesTwo {
721     offsets: RareByteOffsets,
722     byte1: u8,
723     byte2: u8,
724 }
725 
726 impl Prefilter for RareBytesTwo {
next_candidate( &self, state: &mut PrefilterState, haystack: &[u8], at: usize, ) -> Candidate727     fn next_candidate(
728         &self,
729         state: &mut PrefilterState,
730         haystack: &[u8],
731         at: usize,
732     ) -> Candidate {
733         memchr2(self.byte1, self.byte2, &haystack[at..])
734             .map(|i| {
735                 let pos = at + i;
736                 state.update_at(pos);
737                 let offset = self.offsets.set[haystack[pos] as usize].max;
738                 cmp::max(at, pos.saturating_sub(offset as usize))
739             })
740             .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
741     }
742 
clone_prefilter(&self) -> Box<dyn Prefilter>743     fn clone_prefilter(&self) -> Box<dyn Prefilter> {
744         Box::new(self.clone())
745     }
746 
heap_bytes(&self) -> usize747     fn heap_bytes(&self) -> usize {
748         0
749     }
750 
looks_for_non_start_of_match(&self) -> bool751     fn looks_for_non_start_of_match(&self) -> bool {
752         // TODO: See Prefilter impl for RareBytesOne.
753         true
754     }
755 }
756 
757 /// A prefilter for scanning for three "rare" bytes.
758 #[derive(Clone, Debug)]
759 struct RareBytesThree {
760     offsets: RareByteOffsets,
761     byte1: u8,
762     byte2: u8,
763     byte3: u8,
764 }
765 
766 impl Prefilter for RareBytesThree {
next_candidate( &self, state: &mut PrefilterState, haystack: &[u8], at: usize, ) -> Candidate767     fn next_candidate(
768         &self,
769         state: &mut PrefilterState,
770         haystack: &[u8],
771         at: usize,
772     ) -> Candidate {
773         memchr3(self.byte1, self.byte2, self.byte3, &haystack[at..])
774             .map(|i| {
775                 let pos = at + i;
776                 state.update_at(pos);
777                 let offset = self.offsets.set[haystack[pos] as usize].max;
778                 cmp::max(at, pos.saturating_sub(offset as usize))
779             })
780             .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
781     }
782 
clone_prefilter(&self) -> Box<dyn Prefilter>783     fn clone_prefilter(&self) -> Box<dyn Prefilter> {
784         Box::new(self.clone())
785     }
786 
heap_bytes(&self) -> usize787     fn heap_bytes(&self) -> usize {
788         0
789     }
790 
looks_for_non_start_of_match(&self) -> bool791     fn looks_for_non_start_of_match(&self) -> bool {
792         // TODO: See Prefilter impl for RareBytesOne.
793         true
794     }
795 }
796 
797 /// A builder for constructing a starting byte prefilter.
798 ///
799 /// A starting byte prefilter is a simplistic prefilter that looks for possible
800 /// matches by reporting all positions corresponding to a particular byte. This
801 /// generally only takes affect when there are at most 3 distinct possible
802 /// starting bytes. e.g., the patterns `foo`, `bar`, and `baz` have two
803 /// distinct starting bytes (`f` and `b`), and this prefilter returns all
804 /// occurrences of either `f` or `b`.
805 ///
806 /// In some cases, a heuristic frequency analysis may determine that it would
807 /// be better not to use this prefilter even when there are 3 or fewer distinct
808 /// starting bytes.
809 #[derive(Clone, Debug)]
810 struct StartBytesBuilder {
811     /// Whether this prefilter should account for ASCII case insensitivity or
812     /// not.
813     ascii_case_insensitive: bool,
814     /// The set of starting bytes observed.
815     byteset: Vec<bool>,
816     /// The number of bytes set to true in `byteset`.
817     count: usize,
818     /// The sum of frequency ranks for the rare bytes detected. This is
819     /// intended to give a heuristic notion of how rare the bytes are.
820     rank_sum: u16,
821 }
822 
823 impl StartBytesBuilder {
824     /// Create a new builder for constructing a start byte prefilter.
new() -> StartBytesBuilder825     fn new() -> StartBytesBuilder {
826         StartBytesBuilder {
827             ascii_case_insensitive: false,
828             byteset: vec![false; 256],
829             count: 0,
830             rank_sum: 0,
831         }
832     }
833 
834     /// Enable ASCII case insensitivity. When set, byte strings added to this
835     /// builder will be interpreted without respect to ASCII case.
ascii_case_insensitive(mut self, yes: bool) -> StartBytesBuilder836     fn ascii_case_insensitive(mut self, yes: bool) -> StartBytesBuilder {
837         self.ascii_case_insensitive = yes;
838         self
839     }
840 
841     /// Build the starting bytes prefilter.
842     ///
843     /// If there are more than 3 distinct starting bytes, or if heuristics
844     /// otherwise determine that this prefilter should not be used, then `None`
845     /// is returned.
build(&self) -> Option<PrefilterObj>846     fn build(&self) -> Option<PrefilterObj> {
847         if self.count > 3 {
848             return None;
849         }
850         let (mut bytes, mut len) = ([0; 3], 0);
851         for b in 0..256 {
852             if !self.byteset[b] {
853                 continue;
854             }
855             // We don't handle non-ASCII bytes for now. Getting non-ASCII
856             // bytes right is trickier, since we generally don't want to put
857             // a leading UTF-8 code unit into a prefilter that isn't ASCII,
858             // since they can frequently. Instead, it would be better to use a
859             // continuation byte, but this requires more sophisticated analysis
860             // of the automaton and a richer prefilter API.
861             if b > 0x7F {
862                 return None;
863             }
864             bytes[len] = b as u8;
865             len += 1;
866         }
867         match len {
868             0 => None,
869             1 => Some(PrefilterObj::new(StartBytesOne { byte1: bytes[0] })),
870             2 => Some(PrefilterObj::new(StartBytesTwo {
871                 byte1: bytes[0],
872                 byte2: bytes[1],
873             })),
874             3 => Some(PrefilterObj::new(StartBytesThree {
875                 byte1: bytes[0],
876                 byte2: bytes[1],
877                 byte3: bytes[2],
878             })),
879             _ => unreachable!(),
880         }
881     }
882 
883     /// Add a byte string to this builder.
884     ///
885     /// All patterns added to an Aho-Corasick automaton should be added to this
886     /// builder before attempting to construct the prefilter.
add(&mut self, bytes: &[u8])887     fn add(&mut self, bytes: &[u8]) {
888         if self.count > 3 {
889             return;
890         }
891         if let Some(&byte) = bytes.get(0) {
892             self.add_one_byte(byte);
893             if self.ascii_case_insensitive {
894                 self.add_one_byte(opposite_ascii_case(byte));
895             }
896         }
897     }
898 
add_one_byte(&mut self, byte: u8)899     fn add_one_byte(&mut self, byte: u8) {
900         if !self.byteset[byte as usize] {
901             self.byteset[byte as usize] = true;
902             self.count += 1;
903             self.rank_sum += freq_rank(byte) as u16;
904         }
905     }
906 }
907 
908 /// A prefilter for scanning for a single starting byte.
909 #[derive(Clone, Debug)]
910 struct StartBytesOne {
911     byte1: u8,
912 }
913 
914 impl Prefilter for StartBytesOne {
next_candidate( &self, _state: &mut PrefilterState, haystack: &[u8], at: usize, ) -> Candidate915     fn next_candidate(
916         &self,
917         _state: &mut PrefilterState,
918         haystack: &[u8],
919         at: usize,
920     ) -> Candidate {
921         memchr(self.byte1, &haystack[at..])
922             .map(|i| at + i)
923             .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
924     }
925 
clone_prefilter(&self) -> Box<dyn Prefilter>926     fn clone_prefilter(&self) -> Box<dyn Prefilter> {
927         Box::new(self.clone())
928     }
929 
heap_bytes(&self) -> usize930     fn heap_bytes(&self) -> usize {
931         0
932     }
933 }
934 
935 /// A prefilter for scanning for two starting bytes.
936 #[derive(Clone, Debug)]
937 struct StartBytesTwo {
938     byte1: u8,
939     byte2: u8,
940 }
941 
942 impl Prefilter for StartBytesTwo {
next_candidate( &self, _state: &mut PrefilterState, haystack: &[u8], at: usize, ) -> Candidate943     fn next_candidate(
944         &self,
945         _state: &mut PrefilterState,
946         haystack: &[u8],
947         at: usize,
948     ) -> Candidate {
949         memchr2(self.byte1, self.byte2, &haystack[at..])
950             .map(|i| at + i)
951             .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
952     }
953 
clone_prefilter(&self) -> Box<dyn Prefilter>954     fn clone_prefilter(&self) -> Box<dyn Prefilter> {
955         Box::new(self.clone())
956     }
957 
heap_bytes(&self) -> usize958     fn heap_bytes(&self) -> usize {
959         0
960     }
961 }
962 
963 /// A prefilter for scanning for three starting bytes.
964 #[derive(Clone, Debug)]
965 struct StartBytesThree {
966     byte1: u8,
967     byte2: u8,
968     byte3: u8,
969 }
970 
971 impl Prefilter for StartBytesThree {
next_candidate( &self, _state: &mut PrefilterState, haystack: &[u8], at: usize, ) -> Candidate972     fn next_candidate(
973         &self,
974         _state: &mut PrefilterState,
975         haystack: &[u8],
976         at: usize,
977     ) -> Candidate {
978         memchr3(self.byte1, self.byte2, self.byte3, &haystack[at..])
979             .map(|i| at + i)
980             .map_or(Candidate::None, Candidate::PossibleStartOfMatch)
981     }
982 
clone_prefilter(&self) -> Box<dyn Prefilter>983     fn clone_prefilter(&self) -> Box<dyn Prefilter> {
984         Box::new(self.clone())
985     }
986 
heap_bytes(&self) -> usize987     fn heap_bytes(&self) -> usize {
988         0
989     }
990 }
991 
992 /// Return the next candidate reported by the given prefilter while
993 /// simultaneously updating the given prestate.
994 ///
995 /// The caller is responsible for checking the prestate before deciding whether
996 /// to initiate a search.
997 #[inline]
next<P: Prefilter>( prestate: &mut PrefilterState, prefilter: P, haystack: &[u8], at: usize, ) -> Candidate998 pub fn next<P: Prefilter>(
999     prestate: &mut PrefilterState,
1000     prefilter: P,
1001     haystack: &[u8],
1002     at: usize,
1003 ) -> Candidate {
1004     let cand = prefilter.next_candidate(prestate, haystack, at);
1005     match cand {
1006         Candidate::None => {
1007             prestate.update_skipped_bytes(haystack.len() - at);
1008         }
1009         Candidate::Match(ref m) => {
1010             prestate.update_skipped_bytes(m.start() - at);
1011         }
1012         Candidate::PossibleStartOfMatch(i) => {
1013             prestate.update_skipped_bytes(i - at);
1014         }
1015     }
1016     cand
1017 }
1018 
1019 /// If the given byte is an ASCII letter, then return it in the opposite case.
1020 /// e.g., Given `b'A'`, this returns `b'a'`, and given `b'a'`, this returns
1021 /// `b'A'`. If a non-ASCII letter is given, then the given byte is returned.
opposite_ascii_case(b: u8) -> u81022 pub fn opposite_ascii_case(b: u8) -> u8 {
1023     if b'A' <= b && b <= b'Z' {
1024         b.to_ascii_lowercase()
1025     } else if b'a' <= b && b <= b'z' {
1026         b.to_ascii_uppercase()
1027     } else {
1028         b
1029     }
1030 }
1031 
1032 /// Return the frequency rank of the given byte. The higher the rank, the more
1033 /// common the byte (heuristically speaking).
freq_rank(b: u8) -> u81034 fn freq_rank(b: u8) -> u8 {
1035     use crate::byte_frequencies::BYTE_FREQUENCIES;
1036     BYTE_FREQUENCIES[b as usize]
1037 }
1038 
1039 #[cfg(test)]
1040 mod tests {
1041     use super::*;
1042 
1043     #[test]
scratch()1044     fn scratch() {
1045         let mut b = Builder::new(MatchKind::LeftmostFirst);
1046         b.add(b"Sherlock");
1047         b.add(b"locjaw");
1048         // b.add(b"Sherlock");
1049         // b.add(b"Holmes");
1050         // b.add(b"Watson");
1051         // b.add("Шерлок Холмс".as_bytes());
1052         // b.add("Джон Уотсон".as_bytes());
1053 
1054         let s = b.build().unwrap();
1055         println!("{:?}", s);
1056     }
1057 }
1058