1 use std::io;
2 
3 use crate::automaton::Automaton;
4 use crate::buffer::Buffer;
5 use crate::dfa::{self, DFA};
6 use crate::error::Result;
7 use crate::nfa::{self, NFA};
8 use crate::packed;
9 use crate::prefilter::{Prefilter, PrefilterState};
10 use crate::state_id::StateID;
11 use crate::Match;
12 
13 /// An automaton for searching multiple strings in linear time.
14 ///
15 /// The `AhoCorasick` type supports a few basic ways of constructing an
16 /// automaton, including
17 /// [`AhoCorasick::new`](struct.AhoCorasick.html#method.new)
18 /// and
19 /// [`AhoCorasick::new_auto_configured`](struct.AhoCorasick.html#method.new_auto_configured).
20 /// However, there are a fair number of configurable options that can be set
21 /// by using
22 /// [`AhoCorasickBuilder`](struct.AhoCorasickBuilder.html)
23 /// instead. Such options include, but are not limited to, how matches are
24 /// determined, simple case insensitivity, whether to use a DFA or not and
25 /// various knobs for controlling the space-vs-time trade offs taken when
26 /// building the automaton.
27 ///
28 /// If you aren't sure where to start, try beginning with
29 /// [`AhoCorasick::new_auto_configured`](struct.AhoCorasick.html#method.new_auto_configured).
30 ///
31 /// # Resource usage
32 ///
33 /// Aho-Corasick automatons are always constructed in `O(p)` time, where `p`
34 /// is the combined length of all patterns being searched. With that said,
35 /// building an automaton can be fairly costly because of high constant
36 /// factors, particularly when enabling the
37 /// [DFA](struct.AhoCorasickBuilder.html#method.dfa)
38 /// option (which is disabled by default). For this reason, it's generally a
39 /// good idea to build an automaton once and reuse it as much as possible.
40 ///
41 /// Aho-Corasick automatons can also use a fair bit of memory. To get a
42 /// concrete idea of how much memory is being used, try using the
getReady()43 /// [`AhoCorasick::heap_bytes`](struct.AhoCorasick.html#method.heap_bytes)
44 /// method.
45 ///
46 /// # Examples
47 ///
48 /// This example shows how to search for occurrences of multiple patterns
49 /// simultaneously in a case insensitive fashion. Each match includes the
50 /// pattern that matched along with the byte offsets of the match.
51 ///
52 /// ```
53 /// use aho_corasick::AhoCorasickBuilder;
54 ///
55 /// let patterns = &["apple", "maple", "snapple"];
56 /// let haystack = "Nobody likes maple in their apple flavored Snapple.";
57 ///
58 /// let ac = AhoCorasickBuilder::new()
59 ///     .ascii_case_insensitive(true)
60 ///     .build(patterns);
61 /// let mut matches = vec![];
62 /// for mat in ac.find_iter(haystack) {
63 ///     matches.push((mat.pattern(), mat.start(), mat.end()));
64 /// }
65 /// assert_eq!(matches, vec![
66 ///     (1, 13, 18),
67 ///     (0, 28, 33),
68 ///     (2, 43, 50),
69 /// ]);
70 /// ```
71 ///
72 /// This example shows how to replace matches with some other string:
73 ///
74 /// ```
75 /// use aho_corasick::AhoCorasick;
76 ///
77 /// let patterns = &["fox", "brown", "quick"];
78 /// let haystack = "The quick brown fox.";
79 /// let replace_with = &["sloth", "grey", "slow"];
80 ///
81 /// let ac = AhoCorasick::new(patterns);
82 /// let result = ac.replace_all(haystack, replace_with);
83 /// assert_eq!(result, "The slow grey sloth.");
84 /// ```
85 #[derive(Clone, Debug)]
86 pub struct AhoCorasick<S: StateID = usize> {
87     imp: Imp<S>,
88     match_kind: MatchKind,
89 }
90 
91 impl AhoCorasick {
92     /// Create a new Aho-Corasick automaton using the default configuration.
meth()93     ///
94     /// The default configuration optimizes for less space usage, but at the
95     /// expense of longer search times. To change the configuration, use
96     /// [`AhoCorasickBuilder`](struct.AhoCorasickBuilder.html)
97     /// for fine-grained control, or
98     /// [`AhoCorasick::new_auto_configured`](struct.AhoCorasick.html#method.new_auto_configured)
99     /// for automatic configuration if you aren't sure which settings to pick.
100     ///
101     /// This uses the default
102     /// [`MatchKind::Standard`](enum.MatchKind.html#variant.Standard)
103     /// match semantics, which reports a match as soon as it is found. This
104     /// corresponds to the standard match semantics supported by textbook
105     /// descriptions of the Aho-Corasick algorithm.
106     ///
107     /// # Examples
108     ///
109     /// Basic usage:
110     ///
111     /// ```
112     /// use aho_corasick::AhoCorasick;
113     ///
114     /// let ac = AhoCorasick::new(&[
115     ///     "foo", "bar", "baz",
116     /// ]);
117     /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern()));
118     /// ```
119     pub fn new<I, P>(patterns: I) -> AhoCorasick
120     where
121         I: IntoIterator<Item = P>,
122         P: AsRef<[u8]>,
123     {
124         AhoCorasickBuilder::new().build(patterns)
125     }
126 
127     /// Build an Aho-Corasick automaton with an automatically determined
128     /// configuration.
129     ///
130     /// Specifically, this requires a slice of patterns instead of an iterator
131     /// since the configuration is determined by looking at the patterns before
132     /// constructing the automaton. The idea here is to balance space and time
133     /// automatically. That is, when searching a small number of patterns, this
134     /// will attempt to use the fastest possible configuration since the total
135     /// space required will be small anyway. As the number of patterns grows,
136     /// this will fall back to slower configurations that use less space.
137     ///
138     /// If you want auto configuration but with match semantics different from
139     /// the default `MatchKind::Standard`, then use
140     /// [`AhoCorasickBuilder::auto_configure`](struct.AhoCorasickBuilder.html#method.auto_configure).
141     ///
142     /// # Examples
143     ///
144     /// Basic usage is just like `new`, except you must provide a slice:
145     ///
146     /// ```
147     /// use aho_corasick::AhoCorasick;
148     ///
149     /// let ac = AhoCorasick::new_auto_configured(&[
150     ///     "foo", "bar", "baz",
151     /// ]);
152     /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern()));
153     /// ```
154     pub fn new_auto_configured<B>(patterns: &[B]) -> AhoCorasick
155     where
156         B: AsRef<[u8]>,
157     {
158         AhoCorasickBuilder::new().auto_configure(patterns).build(patterns)
159     }
160 }
161 
162 impl<S: StateID> AhoCorasick<S> {
163     /// Returns true if and only if this automaton matches the haystack at any
164     /// position.
165     ///
166     /// `haystack` may be any type that is cheaply convertible to a `&[u8]`.
167     /// This includes, but is not limited to, `String`, `&str`, `Vec<u8>`, and
168     /// `&[u8]` itself.
169     ///
170     /// # Examples
171     ///
172     /// Basic usage:
173     ///
174     /// ```
175     /// use aho_corasick::AhoCorasick;
176     ///
177     /// let ac = AhoCorasick::new(&[
178     ///     "foo", "bar", "quux", "baz",
179     /// ]);
180     /// assert!(ac.is_match("xxx bar xxx"));
181     /// assert!(!ac.is_match("xxx qux xxx"));
182     /// ```
183     pub fn is_match<B: AsRef<[u8]>>(&self, haystack: B) -> bool {
184         self.earliest_find(haystack).is_some()
185     }
186 
187     /// Returns the location of the first detected match in `haystack`.
188     ///
189     /// This method has the same behavior regardless of the
190     /// [`MatchKind`](enum.MatchKind.html)
191     /// of this automaton.
192     ///
193     /// `haystack` may be any type that is cheaply convertible to a `&[u8]`.
194     /// This includes, but is not limited to, `String`, `&str`, `Vec<u8>`, and
195     /// `&[u8]` itself.
196     ///
197     /// # Examples
198     ///
199     /// Basic usage:
200     ///
201     /// ```
202     /// use aho_corasick::AhoCorasick;
203     ///
204     /// let ac = AhoCorasick::new(&[
205     ///     "abc", "b",
206     /// ]);
207     /// let mat = ac.earliest_find("abcd").expect("should have match");
208     /// assert_eq!(1, mat.pattern());
209     /// assert_eq!((1, 2), (mat.start(), mat.end()));
210     /// ```
211     pub fn earliest_find<B: AsRef<[u8]>>(&self, haystack: B) -> Option<Match> {
212         let mut prestate = PrefilterState::new(self.max_pattern_len());
213         let mut start = self.imp.start_state();
214         self.imp.earliest_find_at(
215             &mut prestate,
216             haystack.as_ref(),
217             0,
218             &mut start,
219         )
220     }
221 
222     /// Returns the location of the first match according to the match
223     /// semantics that this automaton was constructed with.
224     ///
225     /// When using `MatchKind::Standard`, this corresponds precisely to the
226     /// same behavior as
227     /// [`earliest_find`](struct.AhoCorasick.html#method.earliest_find).
228     /// Otherwise, match semantics correspond to either
229     /// [leftmost-first](enum.MatchKind.html#variant.LeftmostFirst)
230     /// or
231     /// [leftmost-longest](enum.MatchKind.html#variant.LeftmostLongest).
232     ///
233     /// `haystack` may be any type that is cheaply convertible to a `&[u8]`.
234     /// This includes, but is not limited to, `String`, `&str`, `Vec<u8>`, and
235     /// `&[u8]` itself.
236     ///
237     /// # Examples
238     ///
239     /// Basic usage, with standard semantics:
240     ///
241     /// ```
242     /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
243     ///
244     /// let patterns = &["b", "abc", "abcd"];
245     /// let haystack = "abcd";
246     ///
247     /// let ac = AhoCorasickBuilder::new()
248     ///     .match_kind(MatchKind::Standard) // default, not necessary
249     ///     .build(patterns);
250     /// let mat = ac.find(haystack).expect("should have a match");
251     /// assert_eq!("b", &haystack[mat.start()..mat.end()]);
252     /// ```
253     ///
254     /// Now with leftmost-first semantics:
255     ///
256     /// ```
257     /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
258     ///
259     /// let patterns = &["b", "abc", "abcd"];
260     /// let haystack = "abcd";
261     ///
262     /// let ac = AhoCorasickBuilder::new()
263     ///     .match_kind(MatchKind::LeftmostFirst)
264     ///     .build(patterns);
265     /// let mat = ac.find(haystack).expect("should have a match");
266     /// assert_eq!("abc", &haystack[mat.start()..mat.end()]);
267     /// ```
268     ///
269     /// And finally, leftmost-longest semantics:
270     ///
271     /// ```
272     /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
273     ///
274     /// let patterns = &["b", "abc", "abcd"];
275     /// let haystack = "abcd";
276     ///
277     /// let ac = AhoCorasickBuilder::new()
278     ///     .match_kind(MatchKind::LeftmostLongest)
279     ///     .build(patterns);
280     /// let mat = ac.find(haystack).expect("should have a match");
281     /// assert_eq!("abcd", &haystack[mat.start()..mat.end()]);
282     /// ```
283     pub fn find<B: AsRef<[u8]>>(&self, haystack: B) -> Option<Match> {
284         let mut prestate = PrefilterState::new(self.max_pattern_len());
285         self.imp.find_at_no_state(&mut prestate, haystack.as_ref(), 0)
286     }
287 
288     /// Returns an iterator of non-overlapping matches, using the match
289     /// semantics that this automaton was constructed with.
290     ///
291     /// `haystack` may be any type that is cheaply convertible to a `&[u8]`.
292     /// This includes, but is not limited to, `String`, `&str`, `Vec<u8>`, and
293     /// `&[u8]` itself.
294     ///
295     /// # Examples
296     ///
297     /// Basic usage, with standard semantics:
298     ///
299     /// ```
300     /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
301     ///
302     /// let patterns = &["append", "appendage", "app"];
303     /// let haystack = "append the app to the appendage";
304     ///
305     /// let ac = AhoCorasickBuilder::new()
306     ///     .match_kind(MatchKind::Standard) // default, not necessary
307     ///     .build(patterns);
308     /// let matches: Vec<usize> = ac
309     ///     .find_iter(haystack)
310     ///     .map(|mat| mat.pattern())
311     ///     .collect();
312     /// assert_eq!(vec![2, 2, 2], matches);
313     /// ```
314     ///
315     /// Now with leftmost-first semantics:
316     ///
317     /// ```
318     /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
319     ///
320     /// let patterns = &["append", "appendage", "app"];
321     /// let haystack = "append the app to the appendage";
322     ///
323     /// let ac = AhoCorasickBuilder::new()
324     ///     .match_kind(MatchKind::LeftmostFirst)
325     ///     .build(patterns);
326     /// let matches: Vec<usize> = ac
327     ///     .find_iter(haystack)
328     ///     .map(|mat| mat.pattern())
329     ///     .collect();
330     /// assert_eq!(vec![0, 2, 0], matches);
331     /// ```
332     ///
333     /// And finally, leftmost-longest semantics:
334     ///
335     /// ```
336     /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
337     ///
338     /// let patterns = &["append", "appendage", "app"];
339     /// let haystack = "append the app to the appendage";
340     ///
341     /// let ac = AhoCorasickBuilder::new()
342     ///     .match_kind(MatchKind::LeftmostLongest)
343     ///     .build(patterns);
344     /// let matches: Vec<usize> = ac
345     ///     .find_iter(haystack)
346     ///     .map(|mat| mat.pattern())
347     ///     .collect();
348     /// assert_eq!(vec![0, 2, 1], matches);
349     /// ```
350     pub fn find_iter<'a, 'b, B: ?Sized + AsRef<[u8]>>(
351         &'a self,
352         haystack: &'b B,
353     ) -> FindIter<'a, 'b, S> {
354         FindIter::new(self, haystack.as_ref())
355     }
356 
357     /// Returns an iterator of overlapping matches in the given `haystack`.
358     ///
359     /// Overlapping matches can _only_ be detected using
360     /// `MatchKind::Standard` semantics. If this automaton was constructed with
361     /// leftmost semantics, then this method will panic. To determine whether
362     /// this will panic at runtime, use the
363     /// [`AhoCorasick::supports_overlapping`](struct.AhoCorasick.html#method.supports_overlapping)
364     /// method.
365     ///
366     /// `haystack` may be any type that is cheaply convertible to a `&[u8]`.
367     /// This includes, but is not limited to, `String`, `&str`, `Vec<u8>`, and
368     /// `&[u8]` itself.
369     ///
370     /// # Panics
371     ///
372     /// This panics when `AhoCorasick::supports_overlapping` returns `false`.
373     /// That is, this panics when this automaton's match semantics are not
374     /// `MatchKind::Standard`.
375     ///
376     /// # Examples
377     ///
378     /// Basic usage, with standard semantics:
379     ///
380     /// ```
381     /// use aho_corasick::AhoCorasick;
382     ///
383     /// let patterns = &["append", "appendage", "app"];
384     /// let haystack = "append the app to the appendage";
385     ///
386     /// let ac = AhoCorasick::new(patterns);
387     /// let matches: Vec<usize> = ac
388     ///     .find_overlapping_iter(haystack)
389     ///     .map(|mat| mat.pattern())
390     ///     .collect();
391     /// assert_eq!(vec![2, 0, 2, 2, 0, 1], matches);
392     /// ```
393     pub fn find_overlapping_iter<'a, 'b, B: ?Sized + AsRef<[u8]>>(
394         &'a self,
395         haystack: &'b B,
396     ) -> FindOverlappingIter<'a, 'b, S> {
397         FindOverlappingIter::new(self, haystack.as_ref())
398     }
399 
400     /// Replace all matches with a corresponding value in the `replace_with`
401     /// slice given. Matches correspond to the same matches as reported by
402     /// [`find_iter`](struct.AhoCorasick.html#method.find_iter).
403     ///
404     /// Replacements are determined by the index of the matching pattern.
405     /// For example, if the pattern with index `2` is found, then it is
406     /// replaced by `replace_with[2]`.
407     ///
408     /// # Panics
409     ///
410     /// This panics when `replace_with.len()` does not equal the total number
411     /// of patterns that are matched by this automaton.
412     ///
413     /// # Examples
414     ///
415     /// Basic usage:
416     ///
417     /// ```
418     /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
419     ///
420     /// let patterns = &["append", "appendage", "app"];
421     /// let haystack = "append the app to the appendage";
422     ///
423     /// let ac = AhoCorasickBuilder::new()
424     ///     .match_kind(MatchKind::LeftmostFirst)
425     ///     .build(patterns);
426     /// let result = ac.replace_all(haystack, &["x", "y", "z"]);
427     /// assert_eq!("x the z to the xage", result);
428     /// ```
429     pub fn replace_all<B>(&self, haystack: &str, replace_with: &[B]) -> String
430     where
431         B: AsRef<str>,
432     {
433         assert_eq!(
434             replace_with.len(),
435             self.pattern_count(),
436             "replace_all requires a replacement for every pattern \
437              in the automaton"
438         );
439         let mut dst = String::with_capacity(haystack.len());
440         self.replace_all_with(haystack, &mut dst, |mat, _, dst| {
441             dst.push_str(replace_with[mat.pattern()].as_ref());
442             true
443         });
444         dst
445     }
446 
447     /// Replace all matches using raw bytes with a corresponding value in the
448     /// `replace_with` slice given. Matches correspond to the same matches as
449     /// reported by [`find_iter`](struct.AhoCorasick.html#method.find_iter).
450     ///
451     /// Replacements are determined by the index of the matching pattern.
452     /// For example, if the pattern with index `2` is found, then it is
453     /// replaced by `replace_with[2]`.
454     ///
455     /// # Panics
456     ///
457     /// This panics when `replace_with.len()` does not equal the total number
458     /// of patterns that are matched by this automaton.
459     ///
460     /// # Examples
461     ///
462     /// Basic usage:
463     ///
464     /// ```
465     /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
466     ///
467     /// let patterns = &["append", "appendage", "app"];
468     /// let haystack = b"append the app to the appendage";
469     ///
470     /// let ac = AhoCorasickBuilder::new()
471     ///     .match_kind(MatchKind::LeftmostFirst)
472     ///     .build(patterns);
473     /// let result = ac.replace_all_bytes(haystack, &["x", "y", "z"]);
474     /// assert_eq!(b"x the z to the xage".to_vec(), result);
475     /// ```
476     pub fn replace_all_bytes<B>(
477         &self,
478         haystack: &[u8],
479         replace_with: &[B],
480     ) -> Vec<u8>
481     where
482         B: AsRef<[u8]>,
483     {
484         assert_eq!(
485             replace_with.len(),
486             self.pattern_count(),
487             "replace_all_bytes requires a replacement for every pattern \
488              in the automaton"
489         );
490         let mut dst = Vec::with_capacity(haystack.len());
491         self.replace_all_with_bytes(haystack, &mut dst, |mat, _, dst| {
492             dst.extend(replace_with[mat.pattern()].as_ref());
493             true
494         });
495         dst
496     }
497 
498     /// Replace all matches using a closure called on each match.
499     /// Matches correspond to the same matches as reported by
500     /// [`find_iter`](struct.AhoCorasick.html#method.find_iter).
501     ///
502     /// The closure accepts three parameters: the match found, the text of
503     /// the match and a string buffer with which to write the replaced text
504     /// (if any). If the closure returns `true`, then it continues to the next
505     /// match. If the closure returns `false`, then searching is stopped.
506     ///
507     /// # Examples
508     ///
509     /// Basic usage:
510     ///
511     /// ```
512     /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
513     ///
514     /// let patterns = &["append", "appendage", "app"];
515     /// let haystack = "append the app to the appendage";
516     ///
517     /// let ac = AhoCorasickBuilder::new()
518     ///     .match_kind(MatchKind::LeftmostFirst)
519     ///     .build(patterns);
520     /// let mut result = String::new();
521     /// ac.replace_all_with(haystack, &mut result, |mat, _, dst| {
522     ///     dst.push_str(&mat.pattern().to_string());
523     ///     true
524     /// });
525     /// assert_eq!("0 the 2 to the 0age", result);
526     /// ```
527     ///
528     /// Stopping the replacement by returning `false` (continued from the
529     /// example above):
530     ///
531     /// ```
532     /// # use aho_corasick::{AhoCorasickBuilder, MatchKind};
533     /// # let patterns = &["append", "appendage", "app"];
534     /// # let haystack = "append the app to the appendage";
535     /// # let ac = AhoCorasickBuilder::new()
536     /// #    .match_kind(MatchKind::LeftmostFirst)
537     /// #    .build(patterns);
538     /// let mut result = String::new();
539     /// ac.replace_all_with(haystack, &mut result, |mat, _, dst| {
540     ///     dst.push_str(&mat.pattern().to_string());
541     ///     mat.pattern() != 2
542     /// });
543     /// assert_eq!("0 the 2 to the appendage", result);
544     /// ```
545     pub fn replace_all_with<F>(
546         &self,
547         haystack: &str,
548         dst: &mut String,
549         mut replace_with: F,
550     ) where
551         F: FnMut(&Match, &str, &mut String) -> bool,
552     {
553         let mut last_match = 0;
554         for mat in self.find_iter(haystack) {
555             dst.push_str(&haystack[last_match..mat.start()]);
556             last_match = mat.end();
557             if !replace_with(&mat, &haystack[mat.start()..mat.end()], dst) {
558                 break;
559             };
560         }
561         dst.push_str(&haystack[last_match..]);
562     }
563 
564     /// Replace all matches using raw bytes with a closure called on each
565     /// match. Matches correspond to the same matches as reported by
566     /// [`find_iter`](struct.AhoCorasick.html#method.find_iter).
567     ///
568     /// The closure accepts three parameters: the match found, the text of
569     /// the match and a byte buffer with which to write the replaced text
570     /// (if any). If the closure returns `true`, then it continues to the next
571     /// match. If the closure returns `false`, then searching is stopped.
572     ///
573     /// # Examples
574     ///
575     /// Basic usage:
576     ///
577     /// ```
578     /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
579     ///
580     /// let patterns = &["append", "appendage", "app"];
581     /// let haystack = b"append the app to the appendage";
582     ///
583     /// let ac = AhoCorasickBuilder::new()
584     ///     .match_kind(MatchKind::LeftmostFirst)
585     ///     .build(patterns);
586     /// let mut result = vec![];
587     /// ac.replace_all_with_bytes(haystack, &mut result, |mat, _, dst| {
588     ///     dst.extend(mat.pattern().to_string().bytes());
589     ///     true
590     /// });
591     /// assert_eq!(b"0 the 2 to the 0age".to_vec(), result);
592     /// ```
593     ///
594     /// Stopping the replacement by returning `false` (continued from the
595     /// example above):
596     ///
597     /// ```
598     /// # use aho_corasick::{AhoCorasickBuilder, MatchKind};
599     /// # let patterns = &["append", "appendage", "app"];
600     /// # let haystack = b"append the app to the appendage";
601     /// # let ac = AhoCorasickBuilder::new()
602     /// #    .match_kind(MatchKind::LeftmostFirst)
603     /// #    .build(patterns);
604     /// let mut result = vec![];
605     /// ac.replace_all_with_bytes(haystack, &mut result, |mat, _, dst| {
606     ///     dst.extend(mat.pattern().to_string().bytes());
607     ///     mat.pattern() != 2
608     /// });
609     /// assert_eq!(b"0 the 2 to the appendage".to_vec(), result);
610     /// ```
611     pub fn replace_all_with_bytes<F>(
612         &self,
613         haystack: &[u8],
614         dst: &mut Vec<u8>,
615         mut replace_with: F,
616     ) where
617         F: FnMut(&Match, &[u8], &mut Vec<u8>) -> bool,
618     {
619         let mut last_match = 0;
620         for mat in self.find_iter(haystack) {
621             dst.extend(&haystack[last_match..mat.start()]);
622             last_match = mat.end();
623             if !replace_with(&mat, &haystack[mat.start()..mat.end()], dst) {
624                 break;
625             };
626         }
627         dst.extend(&haystack[last_match..]);
628     }
629 
630     /// Returns an iterator of non-overlapping matches in the given
631     /// stream. Matches correspond to the same matches as reported by
632     /// [`find_iter`](struct.AhoCorasick.html#method.find_iter).
633     ///
634     /// The matches yielded by this iterator use absolute position offsets in
635     /// the stream given, where the first byte has index `0`. Matches are
636     /// yieled until the stream is exhausted.
637     ///
638     /// Each item yielded by the iterator is an `io::Result<Match>`, where an
639     /// error is yielded if there was a problem reading from the reader given.
640     ///
641     /// When searching a stream, an internal buffer is used. Therefore, callers
642     /// should avoiding providing a buffered reader, if possible.
643     ///
644     /// Searching a stream requires that the automaton was built with
645     /// `MatchKind::Standard` semantics. If this automaton was constructed
646     /// with leftmost semantics, then this method will panic. To determine
647     /// whether this will panic at runtime, use the
648     /// [`AhoCorasick::supports_stream`](struct.AhoCorasick.html#method.supports_stream)
649     /// method.
650     ///
651     /// # Memory usage
652     ///
653     /// In general, searching streams will use a constant amount of memory for
654     /// its internal buffer. The one requirement is that the internal buffer
655     /// must be at least the size of the longest possible match. In most use
656     /// cases, the default buffer size will be much larger than any individual
657     /// match.
658     ///
659     /// # Panics
660     ///
661     /// This panics when `AhoCorasick::supports_stream` returns `false`.
662     /// That is, this panics when this automaton's match semantics are not
663     /// `MatchKind::Standard`. This restriction may be lifted in the future.
664     ///
665     /// # Examples
666     ///
667     /// Basic usage:
668     ///
669     /// ```
670     /// use aho_corasick::AhoCorasick;
671     ///
672     /// # fn example() -> Result<(), ::std::io::Error> {
673     /// let patterns = &["append", "appendage", "app"];
674     /// let haystack = "append the app to the appendage";
675     ///
676     /// let ac = AhoCorasick::new(patterns);
677     /// let mut matches = vec![];
678     /// for result in ac.stream_find_iter(haystack.as_bytes()) {
679     ///     let mat = result?;
680     ///     matches.push(mat.pattern());
681     /// }
682     /// assert_eq!(vec![2, 2, 2], matches);
683     /// # Ok(()) }; example().unwrap()
684     /// ```
685     pub fn stream_find_iter<'a, R: io::Read>(
686         &'a self,
687         rdr: R,
688     ) -> StreamFindIter<'a, R, S> {
689         StreamFindIter::new(self, rdr)
690     }
691 
692     /// Search for and replace all matches of this automaton in
693     /// the given reader, and write the replacements to the given
694     /// writer. Matches correspond to the same matches as reported by
695     /// [`find_iter`](struct.AhoCorasick.html#method.find_iter).
696     ///
697     /// Replacements are determined by the index of the matching pattern.
698     /// For example, if the pattern with index `2` is found, then it is
699     /// replaced by `replace_with[2]`.
700     ///
701     /// After all matches are replaced, the writer is _not_ flushed.
702     ///
703     /// If there was a problem reading from the given reader or writing to the
704     /// given writer, then the corresponding `io::Error` is returned and all
705     /// replacement is stopped.
706     ///
707     /// When searching a stream, an internal buffer is used. Therefore, callers
708     /// should avoiding providing a buffered reader, if possible. However,
709     /// callers may want to provide a buffered writer.
710     ///
711     /// Searching a stream requires that the automaton was built with
712     /// `MatchKind::Standard` semantics. If this automaton was constructed
713     /// with leftmost semantics, then this method will panic. To determine
714     /// whether this will panic at runtime, use the
715     /// [`AhoCorasick::supports_stream`](struct.AhoCorasick.html#method.supports_stream)
716     /// method.
717     ///
718     /// # Memory usage
719     ///
720     /// In general, searching streams will use a constant amount of memory for
721     /// its internal buffer. The one requirement is that the internal buffer
722     /// must be at least the size of the longest possible match. In most use
723     /// cases, the default buffer size will be much larger than any individual
724     /// match.
725     ///
726     /// # Panics
727     ///
728     /// This panics when `AhoCorasick::supports_stream` returns `false`.
729     /// That is, this panics when this automaton's match semantics are not
730     /// `MatchKind::Standard`. This restriction may be lifted in the future.
731     ///
732     /// # Examples
733     ///
734     /// Basic usage:
735     ///
736     /// ```
737     /// use aho_corasick::AhoCorasick;
738     ///
739     /// # fn example() -> Result<(), ::std::io::Error> {
740     /// let patterns = &["fox", "brown", "quick"];
741     /// let haystack = "The quick brown fox.";
742     /// let replace_with = &["sloth", "grey", "slow"];
743     ///
744     /// let ac = AhoCorasick::new(patterns);
745     /// let mut result = vec![];
746     /// ac.stream_replace_all(haystack.as_bytes(), &mut result, replace_with)?;
747     /// assert_eq!(b"The slow grey sloth.".to_vec(), result);
748     /// # Ok(()) }; example().unwrap()
749     /// ```
750     pub fn stream_replace_all<R, W, B>(
751         &self,
752         rdr: R,
753         wtr: W,
754         replace_with: &[B],
755     ) -> io::Result<()>
756     where
757         R: io::Read,
758         W: io::Write,
759         B: AsRef<[u8]>,
760     {
761         assert_eq!(
762             replace_with.len(),
763             self.pattern_count(),
764             "stream_replace_all requires a replacement for every pattern \
765              in the automaton"
766         );
767         self.stream_replace_all_with(rdr, wtr, |mat, _, wtr| {
768             wtr.write_all(replace_with[mat.pattern()].as_ref())
769         })
770     }
771 
772     /// Search the given reader and replace all matches of this automaton
773     /// using the given closure. The result is written to the given
774     /// writer. Matches correspond to the same matches as reported by
775     /// [`find_iter`](struct.AhoCorasick.html#method.find_iter).
776     ///
777     /// The closure accepts three parameters: the match found, the text of
778     /// the match and the writer with which to write the replaced text (if any).
779     ///
780     /// After all matches are replaced, the writer is _not_ flushed.
781     ///
782     /// If there was a problem reading from the given reader or writing to the
783     /// given writer, then the corresponding `io::Error` is returned and all
784     /// replacement is stopped.
785     ///
786     /// When searching a stream, an internal buffer is used. Therefore, callers
787     /// should avoiding providing a buffered reader, if possible. However,
788     /// callers may want to provide a buffered writer.
789     ///
790     /// Searching a stream requires that the automaton was built with
791     /// `MatchKind::Standard` semantics. If this automaton was constructed
792     /// with leftmost semantics, then this method will panic. To determine
793     /// whether this will panic at runtime, use the
794     /// [`AhoCorasick::supports_stream`](struct.AhoCorasick.html#method.supports_stream)
795     /// method.
796     ///
797     /// # Memory usage
798     ///
799     /// In general, searching streams will use a constant amount of memory for
800     /// its internal buffer. The one requirement is that the internal buffer
801     /// must be at least the size of the longest possible match. In most use
802     /// cases, the default buffer size will be much larger than any individual
803     /// match.
804     ///
805     /// # Panics
806     ///
807     /// This panics when `AhoCorasick::supports_stream` returns `false`.
808     /// That is, this panics when this automaton's match semantics are not
809     /// `MatchKind::Standard`. This restriction may be lifted in the future.
810     ///
811     /// # Examples
812     ///
813     /// Basic usage:
814     ///
815     /// ```
816     /// use std::io::Write;
817     /// use aho_corasick::AhoCorasick;
818     ///
819     /// # fn example() -> Result<(), ::std::io::Error> {
820     /// let patterns = &["fox", "brown", "quick"];
821     /// let haystack = "The quick brown fox.";
822     ///
823     /// let ac = AhoCorasick::new(patterns);
824     /// let mut result = vec![];
825     /// ac.stream_replace_all_with(
826     ///     haystack.as_bytes(),
827     ///     &mut result,
828     ///     |mat, _, wtr| {
829     ///         wtr.write_all(mat.pattern().to_string().as_bytes())
830     ///     },
831     /// )?;
832     /// assert_eq!(b"The 2 1 0.".to_vec(), result);
833     /// # Ok(()) }; example().unwrap()
834     /// ```
835     pub fn stream_replace_all_with<R, W, F>(
836         &self,
837         rdr: R,
838         mut wtr: W,
839         mut replace_with: F,
840     ) -> io::Result<()>
841     where
842         R: io::Read,
843         W: io::Write,
844         F: FnMut(&Match, &[u8], &mut W) -> io::Result<()>,
845     {
846         let mut it = StreamChunkIter::new(self, rdr);
847         while let Some(result) = it.next() {
848             let chunk = result?;
849             match chunk {
850                 StreamChunk::NonMatch { bytes, .. } => {
851                     wtr.write_all(bytes)?;
852                 }
853                 StreamChunk::Match { bytes, mat } => {
854                     replace_with(&mat, bytes, &mut wtr)?;
855                 }
856             }
857         }
858         Ok(())
859     }
860 
861     /// Returns the match kind used by this automaton.
862     ///
863     /// # Examples
864     ///
865     /// Basic usage:
866     ///
867     /// ```
868     /// use aho_corasick::{AhoCorasick, MatchKind};
869     ///
870     /// let ac = AhoCorasick::new(&[
871     ///     "foo", "bar", "quux", "baz",
872     /// ]);
873     /// assert_eq!(&MatchKind::Standard, ac.match_kind());
874     /// ```
875     pub fn match_kind(&self) -> &MatchKind {
876         self.imp.match_kind()
877     }
878 
879     /// Returns the length of the longest pattern matched by this automaton.
880     ///
881     /// # Examples
882     ///
883     /// Basic usage:
884     ///
885     /// ```
886     /// use aho_corasick::AhoCorasick;
887     ///
888     /// let ac = AhoCorasick::new(&[
889     ///     "foo", "bar", "quux", "baz",
890     /// ]);
891     /// assert_eq!(4, ac.max_pattern_len());
892     /// ```
893     pub fn max_pattern_len(&self) -> usize {
894         self.imp.max_pattern_len()
895     }
896 
897     /// Return the total number of patterns matched by this automaton.
898     ///
899     /// This includes patterns that may never participate in a match. For
900     /// example, if
901     /// [`MatchKind::LeftmostFirst`](enum.MatchKind.html#variant.LeftmostFirst)
902     /// match semantics are used, and the patterns `Sam` and `Samwise` were
903     /// used to build the automaton, then `Samwise` can never participate in a
904     /// match because `Sam` will always take priority.
905     ///
906     /// # Examples
907     ///
908     /// Basic usage:
909     ///
910     /// ```
911     /// use aho_corasick::AhoCorasick;
912     ///
913     /// let ac = AhoCorasick::new(&[
914     ///     "foo", "bar", "baz",
915     /// ]);
916     /// assert_eq!(3, ac.pattern_count());
917     /// ```
918     pub fn pattern_count(&self) -> usize {
919         self.imp.pattern_count()
920     }
921 
922     /// Returns true if and only if this automaton supports reporting
923     /// overlapping matches.
924     ///
925     /// If this returns false and overlapping matches are requested, then it
926     /// will result in a panic.
927     ///
928     /// Since leftmost matching is inherently incompatible with overlapping
929     /// matches, only
930     /// [`MatchKind::Standard`](enum.MatchKind.html#variant.Standard)
931     /// supports overlapping matches. This is unlikely to change in the future.
932     ///
933     /// # Examples
934     ///
935     /// Basic usage:
936     ///
937     /// ```
938     /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
939     ///
940     /// let ac = AhoCorasickBuilder::new()
941     ///     .match_kind(MatchKind::Standard)
942     ///     .build(&["foo", "bar", "baz"]);
943     /// assert!(ac.supports_overlapping());
944     ///
945     /// let ac = AhoCorasickBuilder::new()
946     ///     .match_kind(MatchKind::LeftmostFirst)
947     ///     .build(&["foo", "bar", "baz"]);
948     /// assert!(!ac.supports_overlapping());
949     /// ```
950     pub fn supports_overlapping(&self) -> bool {
951         self.match_kind.supports_overlapping()
952     }
953 
954     /// Returns true if and only if this automaton supports stream searching.
955     ///
956     /// If this returns false and stream searching (or replacing) is attempted,
957     /// then it will result in a panic.
958     ///
959     /// Currently, only
960     /// [`MatchKind::Standard`](enum.MatchKind.html#variant.Standard)
961     /// supports streaming. This may be expanded in the future.
962     ///
963     /// # Examples
964     ///
965     /// Basic usage:
966     ///
967     /// ```
968     /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
969     ///
970     /// let ac = AhoCorasickBuilder::new()
971     ///     .match_kind(MatchKind::Standard)
972     ///     .build(&["foo", "bar", "baz"]);
973     /// assert!(ac.supports_stream());
974     ///
975     /// let ac = AhoCorasickBuilder::new()
976     ///     .match_kind(MatchKind::LeftmostFirst)
977     ///     .build(&["foo", "bar", "baz"]);
978     /// assert!(!ac.supports_stream());
979     /// ```
980     pub fn supports_stream(&self) -> bool {
981         self.match_kind.supports_stream()
982     }
983 
984     /// Returns the approximate total amount of heap used by this automaton, in
985     /// units of bytes.
986     ///
987     /// # Examples
988     ///
989     /// This example shows the difference in heap usage between a few
990     /// configurations:
991     ///
992     /// ```ignore
993     /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
994     ///
995     /// let ac = AhoCorasickBuilder::new()
996     ///     .dfa(false) // default
997     ///     .build(&["foo", "bar", "baz"]);
998     /// assert_eq!(10_336, ac.heap_bytes());
999     ///
1000     /// let ac = AhoCorasickBuilder::new()
1001     ///     .dfa(false) // default
1002     ///     .ascii_case_insensitive(true)
1003     ///     .build(&["foo", "bar", "baz"]);
1004     /// assert_eq!(10_384, ac.heap_bytes());
1005     ///
1006     /// let ac = AhoCorasickBuilder::new()
1007     ///     .dfa(true)
1008     ///     .ascii_case_insensitive(true)
1009     ///     .build(&["foo", "bar", "baz"]);
1010     /// assert_eq!(1_248, ac.heap_bytes());
1011     /// ```
1012     pub fn heap_bytes(&self) -> usize {
1013         match self.imp {
1014             Imp::NFA(ref nfa) => nfa.heap_bytes(),
1015             Imp::DFA(ref dfa) => dfa.heap_bytes(),
1016         }
1017     }
1018 }
1019 
1020 /// The internal implementation of Aho-Corasick, which is either an NFA or
1021 /// a DFA. The NFA is slower but uses less memory. The DFA is faster but uses
1022 /// more memory.
1023 #[derive(Clone, Debug)]
1024 enum Imp<S: StateID> {
1025     NFA(NFA<S>),
1026     DFA(DFA<S>),
1027 }
1028 
1029 impl<S: StateID> Imp<S> {
1030     /// Returns the type of match semantics implemented by this automaton.
1031     fn match_kind(&self) -> &MatchKind {
1032         match *self {
1033             Imp::NFA(ref nfa) => nfa.match_kind(),
1034             Imp::DFA(ref dfa) => dfa.match_kind(),
1035         }
1036     }
1037 
1038     /// Returns the identifier of the start state.
1039     fn start_state(&self) -> S {
1040         match *self {
1041             Imp::NFA(ref nfa) => nfa.start_state(),
1042             Imp::DFA(ref dfa) => dfa.start_state(),
1043         }
1044     }
1045 
1046     /// The length, in bytes, of the longest pattern in this automaton. This
1047     /// information is useful for maintaining correct buffer sizes when
1048     /// searching on streams.
1049     fn max_pattern_len(&self) -> usize {
1050         match *self {
1051             Imp::NFA(ref nfa) => nfa.max_pattern_len(),
1052             Imp::DFA(ref dfa) => dfa.max_pattern_len(),
1053         }
1054     }
1055 
1056     /// The total number of patterns added to this automaton. This includes
1057     /// patterns that may never match. The maximum matching pattern that can be
1058     /// reported is exactly one less than this number.
1059     fn pattern_count(&self) -> usize {
1060         match *self {
1061             Imp::NFA(ref nfa) => nfa.pattern_count(),
1062             Imp::DFA(ref dfa) => dfa.pattern_count(),
1063         }
1064     }
1065 
1066     /// Returns the prefilter object, if one exists, for the underlying
1067     /// automaton.
1068     fn prefilter(&self) -> Option<&dyn Prefilter> {
1069         match *self {
1070             Imp::NFA(ref nfa) => nfa.prefilter(),
1071             Imp::DFA(ref dfa) => dfa.prefilter(),
1072         }
1073     }
1074 
1075     /// Returns true if and only if we should attempt to use a prefilter.
1076     fn use_prefilter(&self) -> bool {
1077         let p = match self.prefilter() {
1078             None => return false,
1079             Some(p) => p,
1080         };
1081         !p.looks_for_non_start_of_match()
1082     }
1083 
1084     #[inline(always)]
1085     fn overlapping_find_at(
1086         &self,
1087         prestate: &mut PrefilterState,
1088         haystack: &[u8],
1089         at: usize,
1090         state_id: &mut S,
1091         match_index: &mut usize,
1092     ) -> Option<Match> {
1093         match *self {
1094             Imp::NFA(ref nfa) => nfa.overlapping_find_at(
1095                 prestate,
1096                 haystack,
1097                 at,
1098                 state_id,
1099                 match_index,
1100             ),
1101             Imp::DFA(ref dfa) => dfa.overlapping_find_at(
1102                 prestate,
1103                 haystack,
1104                 at,
1105                 state_id,
1106                 match_index,
1107             ),
1108         }
1109     }
1110 
1111     #[inline(always)]
1112     fn earliest_find_at(
1113         &self,
1114         prestate: &mut PrefilterState,
1115         haystack: &[u8],
1116         at: usize,
1117         state_id: &mut S,
1118     ) -> Option<Match> {
1119         match *self {
1120             Imp::NFA(ref nfa) => {
1121                 nfa.earliest_find_at(prestate, haystack, at, state_id)
1122             }
1123             Imp::DFA(ref dfa) => {
1124                 dfa.earliest_find_at(prestate, haystack, at, state_id)
1125             }
1126         }
1127     }
1128 
1129     #[inline(always)]
1130     fn find_at_no_state(
1131         &self,
1132         prestate: &mut PrefilterState,
1133         haystack: &[u8],
1134         at: usize,
1135     ) -> Option<Match> {
1136         match *self {
1137             Imp::NFA(ref nfa) => nfa.find_at_no_state(prestate, haystack, at),
1138             Imp::DFA(ref dfa) => dfa.find_at_no_state(prestate, haystack, at),
1139         }
1140     }
1141 }
1142 
1143 /// An iterator of non-overlapping matches in a particular haystack.
1144 ///
1145 /// This iterator yields matches according to the
1146 /// [`MatchKind`](enum.MatchKind.html)
1147 /// used by this automaton.
1148 ///
1149 /// This iterator is constructed via the
1150 /// [`AhoCorasick::find_iter`](struct.AhoCorasick.html#method.find_iter)
1151 /// method.
1152 ///
1153 /// The type variable `S` refers to the representation used for state
1154 /// identifiers. (By default, this is `usize`.)
1155 ///
1156 /// The lifetime `'a` refers to the lifetime of the `AhoCorasick` automaton.
1157 ///
1158 /// The lifetime `'b` refers to the lifetime of the haystack being searched.
1159 #[derive(Debug)]
1160 pub struct FindIter<'a, 'b, S: StateID> {
1161     fsm: &'a Imp<S>,
1162     prestate: PrefilterState,
1163     haystack: &'b [u8],
1164     pos: usize,
1165 }
1166 
1167 impl<'a, 'b, S: StateID> FindIter<'a, 'b, S> {
1168     fn new(ac: &'a AhoCorasick<S>, haystack: &'b [u8]) -> FindIter<'a, 'b, S> {
1169         let prestate = PrefilterState::new(ac.max_pattern_len());
1170         FindIter { fsm: &ac.imp, prestate, haystack, pos: 0 }
1171     }
1172 }
1173 
1174 impl<'a, 'b, S: StateID> Iterator for FindIter<'a, 'b, S> {
1175     type Item = Match;
1176 
1177     fn next(&mut self) -> Option<Match> {
1178         if self.pos > self.haystack.len() {
1179             return None;
1180         }
1181         let result = self.fsm.find_at_no_state(
1182             &mut self.prestate,
1183             self.haystack,
1184             self.pos,
1185         );
1186         let mat = match result {
1187             None => return None,
1188             Some(mat) => mat,
1189         };
1190         if mat.end() == self.pos {
1191             // If the automaton can match the empty string and if we found an
1192             // empty match, then we need to forcefully move the position.
1193             self.pos += 1;
1194         } else {
1195             self.pos = mat.end();
1196         }
1197         Some(mat)
1198     }
1199 }
1200 
1201 /// An iterator of overlapping matches in a particular haystack.
1202 ///
1203 /// This iterator will report all possible matches in a particular haystack,
1204 /// even when the matches overlap.
1205 ///
1206 /// This iterator is constructed via the
1207 /// [`AhoCorasick::find_overlapping_iter`](struct.AhoCorasick.html#method.find_overlapping_iter)
1208 /// method.
1209 ///
1210 /// The type variable `S` refers to the representation used for state
1211 /// identifiers. (By default, this is `usize`.)
1212 ///
1213 /// The lifetime `'a` refers to the lifetime of the `AhoCorasick` automaton.
1214 ///
1215 /// The lifetime `'b` refers to the lifetime of the haystack being searched.
1216 #[derive(Debug)]
1217 pub struct FindOverlappingIter<'a, 'b, S: StateID> {
1218     fsm: &'a Imp<S>,
1219     prestate: PrefilterState,
1220     haystack: &'b [u8],
1221     pos: usize,
1222     last_match_end: usize,
1223     state_id: S,
1224     match_index: usize,
1225 }
1226 
1227 impl<'a, 'b, S: StateID> FindOverlappingIter<'a, 'b, S> {
1228     fn new(
1229         ac: &'a AhoCorasick<S>,
1230         haystack: &'b [u8],
1231     ) -> FindOverlappingIter<'a, 'b, S> {
1232         assert!(
1233             ac.supports_overlapping(),
1234             "automaton does not support overlapping searches"
1235         );
1236         let prestate = PrefilterState::new(ac.max_pattern_len());
1237         FindOverlappingIter {
1238             fsm: &ac.imp,
1239             prestate,
1240             haystack,
1241             pos: 0,
1242             last_match_end: 0,
1243             state_id: ac.imp.start_state(),
1244             match_index: 0,
1245         }
1246     }
1247 }
1248 
1249 impl<'a, 'b, S: StateID> Iterator for FindOverlappingIter<'a, 'b, S> {
1250     type Item = Match;
1251 
1252     fn next(&mut self) -> Option<Match> {
1253         let result = self.fsm.overlapping_find_at(
1254             &mut self.prestate,
1255             self.haystack,
1256             self.pos,
1257             &mut self.state_id,
1258             &mut self.match_index,
1259         );
1260         match result {
1261             None => return None,
1262             Some(m) => {
1263                 self.pos = m.end();
1264                 Some(m)
1265             }
1266         }
1267     }
1268 }
1269 
1270 /// An iterator that reports Aho-Corasick matches in a stream.
1271 ///
1272 /// This iterator yields elements of type `io::Result<Match>`, where an error
1273 /// is reported if there was a problem reading from the underlying stream.
1274 /// The iterator terminates only when the underlying stream reaches `EOF`.
1275 ///
1276 /// This iterator is constructed via the
1277 /// [`AhoCorasick::stream_find_iter`](struct.AhoCorasick.html#method.stream_find_iter)
1278 /// method.
1279 ///
1280 /// The type variable `R` refers to the `io::Read` stream that is being read
1281 /// from.
1282 ///
1283 /// The type variable `S` refers to the representation used for state
1284 /// identifiers. (By default, this is `usize`.)
1285 ///
1286 /// The lifetime `'a` refers to the lifetime of the `AhoCorasick` automaton.
1287 #[derive(Debug)]
1288 pub struct StreamFindIter<'a, R, S: StateID> {
1289     it: StreamChunkIter<'a, R, S>,
1290 }
1291 
1292 impl<'a, R: io::Read, S: StateID> StreamFindIter<'a, R, S> {
1293     fn new(ac: &'a AhoCorasick<S>, rdr: R) -> StreamFindIter<'a, R, S> {
1294         StreamFindIter { it: StreamChunkIter::new(ac, rdr) }
1295     }
1296 }
1297 
1298 impl<'a, R: io::Read, S: StateID> Iterator for StreamFindIter<'a, R, S> {
1299     type Item = io::Result<Match>;
1300 
1301     fn next(&mut self) -> Option<io::Result<Match>> {
1302         loop {
1303             match self.it.next() {
1304                 None => return None,
1305                 Some(Err(err)) => return Some(Err(err)),
1306                 Some(Ok(StreamChunk::NonMatch { .. })) => {}
1307                 Some(Ok(StreamChunk::Match { mat, .. })) => {
1308                     return Some(Ok(mat));
1309                 }
1310             }
1311         }
1312     }
1313 }
1314 
1315 /// An iterator over chunks in an underlying reader. Each chunk either
1316 /// corresponds to non-matching bytes or matching bytes, but all bytes from
1317 /// the underlying reader are reported in sequence. There may be an arbitrary
1318 /// number of non-matching chunks before seeing a matching chunk.
1319 ///
1320 /// N.B. This does not actually implement Iterator because we need to borrow
1321 /// from the underlying reader. But conceptually, it's still an iterator.
1322 #[derive(Debug)]
1323 struct StreamChunkIter<'a, R, S: StateID> {
1324     /// The AC automaton.
1325     fsm: &'a Imp<S>,
1326     /// State associated with this automaton's prefilter. It is a heuristic
1327     /// for stopping the prefilter if it's deemed ineffective.
1328     prestate: PrefilterState,
1329     /// The source of bytes we read from.
1330     rdr: R,
1331     /// A fixed size buffer. This is what we actually search. There are some
1332     /// invariants around the buffer's size, namely, it must be big enough to
1333     /// contain the longest possible match.
1334     buf: Buffer,
1335     /// The ID of the FSM state we're currently in.
1336     state_id: S,
1337     /// The current position at which to start the next search in `buf`.
1338     search_pos: usize,
1339     /// The absolute position of `search_pos`, where `0` corresponds to the
1340     /// position of the first byte read from `rdr`.
1341     absolute_pos: usize,
1342     /// The ending position of the last StreamChunk that was returned to the
1343     /// caller. This position is used to determine whether we need to emit
1344     /// non-matching bytes before emitting a match.
1345     report_pos: usize,
1346     /// A match that should be reported on the next call.
1347     pending_match: Option<Match>,
1348     /// Enabled only when the automaton can match the empty string. When
1349     /// enabled, we need to execute one final search after consuming the
1350     /// reader to find the trailing empty match.
1351     has_empty_match_at_end: bool,
1352 }
1353 
1354 /// A single chunk yielded by the stream chunk iterator.
1355 ///
1356 /// The `'r` lifetime refers to the lifetime of the stream chunk iterator.
1357 #[derive(Debug)]
1358 enum StreamChunk<'r> {
1359     /// A chunk that does not contain any matches.
1360     NonMatch { bytes: &'r [u8], start: usize },
1361     /// A chunk that precisely contains a match.
1362     Match { bytes: &'r [u8], mat: Match },
1363 }
1364 
1365 impl<'a, R: io::Read, S: StateID> StreamChunkIter<'a, R, S> {
1366     fn new(ac: &'a AhoCorasick<S>, rdr: R) -> StreamChunkIter<'a, R, S> {
1367         assert!(
1368             ac.supports_stream(),
1369             "stream searching is only supported for Standard match semantics"
1370         );
1371 
1372         let prestate = if ac.imp.use_prefilter() {
1373             PrefilterState::new(ac.max_pattern_len())
1374         } else {
1375             PrefilterState::disabled()
1376         };
1377         let buf = Buffer::new(ac.imp.max_pattern_len());
1378         let state_id = ac.imp.start_state();
1379         StreamChunkIter {
1380             fsm: &ac.imp,
1381             prestate,
1382             rdr,
1383             buf,
1384             state_id,
1385             absolute_pos: 0,
1386             report_pos: 0,
1387             search_pos: 0,
1388             pending_match: None,
1389             has_empty_match_at_end: ac.is_match(""),
1390         }
1391     }
1392 
1393     fn next<'r>(&'r mut self) -> Option<io::Result<StreamChunk<'r>>> {
1394         loop {
1395             if let Some(mut mat) = self.pending_match.take() {
1396                 let bytes = &self.buf.buffer()[mat.start()..mat.end()];
1397                 self.report_pos = mat.end();
1398                 mat = mat.increment(self.absolute_pos);
1399                 return Some(Ok(StreamChunk::Match { bytes, mat }));
1400             }
1401             if self.search_pos >= self.buf.len() {
1402                 if let Some(end) = self.unreported() {
1403                     let bytes = &self.buf.buffer()[self.report_pos..end];
1404                     let start = self.absolute_pos + self.report_pos;
1405                     self.report_pos = end;
1406                     return Some(Ok(StreamChunk::NonMatch { bytes, start }));
1407                 }
1408                 if self.buf.len() >= self.buf.min_buffer_len() {
1409                     // This is the point at which we roll our buffer, which we
1410                     // only do if our buffer has at least the minimum amount of
1411                     // bytes in it. Before rolling, we update our various
1412                     // positions to be consistent with the buffer after it has
1413                     // been rolled.
1414 
1415                     self.report_pos -=
1416                         self.buf.len() - self.buf.min_buffer_len();
1417                     self.absolute_pos +=
1418                         self.search_pos - self.buf.min_buffer_len();
1419                     self.search_pos = self.buf.min_buffer_len();
1420                     self.buf.roll();
1421                 }
1422                 match self.buf.fill(&mut self.rdr) {
1423                     Err(err) => return Some(Err(err)),
1424                     Ok(false) => {
1425                         // We've hit EOF, but if there are still some
1426                         // unreported bytes remaining, return them now.
1427                         if self.report_pos < self.buf.len() {
1428                             let bytes = &self.buf.buffer()[self.report_pos..];
1429                             let start = self.absolute_pos + self.report_pos;
1430                             self.report_pos = self.buf.len();
1431 
1432                             let chunk = StreamChunk::NonMatch { bytes, start };
1433                             return Some(Ok(chunk));
1434                         } else {
1435                             // We've reported everything, but there might still
1436                             // be a match at the very last position.
1437                             if !self.has_empty_match_at_end {
1438                                 return None;
1439                             }
1440                             // fallthrough for another search to get trailing
1441                             // empty matches
1442                             self.has_empty_match_at_end = false;
1443                         }
1444                     }
1445                     Ok(true) => {}
1446                 }
1447             }
1448             let result = self.fsm.earliest_find_at(
1449                 &mut self.prestate,
1450                 self.buf.buffer(),
1451                 self.search_pos,
1452                 &mut self.state_id,
1453             );
1454             match result {
1455                 None => {
1456                     self.search_pos = self.buf.len();
1457                 }
1458                 Some(mat) => {
1459                     self.state_id = self.fsm.start_state();
1460                     if mat.end() == self.search_pos {
1461                         // If the automaton can match the empty string and if
1462                         // we found an empty match, then we need to forcefully
1463                         // move the position.
1464                         self.search_pos += 1;
1465                     } else {
1466                         self.search_pos = mat.end();
1467                     }
1468                     self.pending_match = Some(mat.clone());
1469                     if self.report_pos < mat.start() {
1470                         let bytes =
1471                             &self.buf.buffer()[self.report_pos..mat.start()];
1472                         let start = self.absolute_pos + self.report_pos;
1473                         self.report_pos = mat.start();
1474 
1475                         let chunk = StreamChunk::NonMatch { bytes, start };
1476                         return Some(Ok(chunk));
1477                     }
1478                 }
1479             }
1480         }
1481     }
1482 
1483     fn unreported(&self) -> Option<usize> {
1484         let end = self.search_pos.saturating_sub(self.buf.min_buffer_len());
1485         if self.report_pos < end {
1486             Some(end)
1487         } else {
1488             None
1489         }
1490     }
1491 }
1492 
1493 /// A builder for configuring an Aho-Corasick automaton.
1494 #[derive(Clone, Debug)]
1495 pub struct AhoCorasickBuilder {
1496     nfa_builder: nfa::Builder,
1497     dfa_builder: dfa::Builder,
1498     dfa: bool,
1499 }
1500 
1501 impl Default for AhoCorasickBuilder {
1502     fn default() -> AhoCorasickBuilder {
1503         AhoCorasickBuilder::new()
1504     }
1505 }
1506 
1507 impl AhoCorasickBuilder {
1508     /// Create a new builder for configuring an Aho-Corasick automaton.
1509     ///
1510     /// If you don't need fine grained configuration or aren't sure which knobs
1511     /// to set, try using
1512     /// [`AhoCorasick::new_auto_configured`](struct.AhoCorasick.html#method.new_auto_configured)
1513     /// instead.
1514     pub fn new() -> AhoCorasickBuilder {
1515         AhoCorasickBuilder {
1516             nfa_builder: nfa::Builder::new(),
1517             dfa_builder: dfa::Builder::new(),
1518             dfa: false,
1519         }
1520     }
1521 
1522     /// Build an Aho-Corasick automaton using the configuration set on this
1523     /// builder.
1524     ///
1525     /// A builder may be reused to create more automatons.
1526     ///
1527     /// This method will use the default for representing internal state
1528     /// identifiers, which is `usize`. This guarantees that building the
1529     /// automaton will succeed and is generally a good default, but can make
1530     /// the size of the automaton 2-8 times bigger than it needs to be,
1531     /// depending on your target platform.
1532     ///
1533     /// # Examples
1534     ///
1535     /// Basic usage:
1536     ///
1537     /// ```
1538     /// use aho_corasick::AhoCorasickBuilder;
1539     ///
1540     /// let patterns = &["foo", "bar", "baz"];
1541     /// let ac = AhoCorasickBuilder::new()
1542     ///     .build(patterns);
1543     /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern()));
1544     /// ```
1545     pub fn build<I, P>(&self, patterns: I) -> AhoCorasick
1546     where
1547         I: IntoIterator<Item = P>,
1548         P: AsRef<[u8]>,
1549     {
1550         // The builder only returns an error if the chosen state ID
1551         // representation is too small to fit all of the given patterns. In
1552         // this case, since we fix the representation to usize, it will always
1553         // work because it's impossible to overflow usize since the underlying
1554         // storage would OOM long before that happens.
1555         self.build_with_size::<usize, I, P>(patterns)
1556             .expect("usize state ID type should always work")
1557     }
1558 
1559     /// Build an Aho-Corasick automaton using the configuration set on this
1560     /// builder with a specific state identifier representation. This only has
1561     /// an effect when the `dfa` option is enabled.
1562     ///
1563     /// Generally, the choices for a state identifier representation are
1564     /// `u8`, `u16`, `u32`, `u64` or `usize`, with `usize` being the default.
1565     /// The advantage of choosing a smaller state identifier representation
1566     /// is that the automaton produced will be smaller. This might be
1567     /// beneficial for just generally using less space, or might even allow it
1568     /// to fit more of the automaton in your CPU's cache, leading to overall
1569     /// better search performance.
1570     ///
1571     /// Unlike the standard `build` method, this can report an error if the
1572     /// state identifier representation cannot support the size of the
1573     /// automaton.
1574     ///
1575     /// Note that the state identifier representation is determined by the
1576     /// `S` type variable. This requires a type hint of some sort, either
1577     /// by specifying the return type or using the turbofish, e.g.,
1578     /// `build_with_size::<u16, _, _>(...)`.
1579     ///
1580     /// # Examples
1581     ///
1582     /// Basic usage:
1583     ///
1584     /// ```
1585     /// use aho_corasick::{AhoCorasick, AhoCorasickBuilder};
1586     ///
1587     /// # fn example() -> Result<(), ::aho_corasick::Error> {
1588     /// let patterns = &["foo", "bar", "baz"];
1589     /// let ac: AhoCorasick<u8> = AhoCorasickBuilder::new()
1590     ///     .build_with_size(patterns)?;
1591     /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern()));
1592     /// # Ok(()) }; example().unwrap()
1593     /// ```
1594     ///
1595     /// Or alternatively, with turbofish:
1596     ///
1597     /// ```
1598     /// use aho_corasick::AhoCorasickBuilder;
1599     ///
1600     /// # fn example() -> Result<(), ::aho_corasick::Error> {
1601     /// let patterns = &["foo", "bar", "baz"];
1602     /// let ac = AhoCorasickBuilder::new()
1603     ///     .build_with_size::<u8, _, _>(patterns)?;
1604     /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern()));
1605     /// # Ok(()) }; example().unwrap()
1606     /// ```
1607     pub fn build_with_size<S, I, P>(
1608         &self,
1609         patterns: I,
1610     ) -> Result<AhoCorasick<S>>
1611     where
1612         S: StateID,
1613         I: IntoIterator<Item = P>,
1614         P: AsRef<[u8]>,
1615     {
1616         let nfa = self.nfa_builder.build(patterns)?;
1617         let match_kind = nfa.match_kind().clone();
1618         let imp = if self.dfa {
1619             let dfa = self.dfa_builder.build(&nfa)?;
1620             Imp::DFA(dfa)
1621         } else {
1622             Imp::NFA(nfa)
1623         };
1624         Ok(AhoCorasick { imp, match_kind })
1625     }
1626 
1627     /// Automatically configure the settings on this builder according to the
1628     /// patterns that will be used to construct the automaton.
1629     ///
1630     /// The idea here is to balance space and time automatically. That is, when
1631     /// searching a small number of patterns, this will attempt to use the
1632     /// fastest possible configuration since the total space required will be
1633     /// small anyway. As the number of patterns grows, this will fall back to
1634     /// slower configurations that use less space.
1635     ///
1636     /// This is guaranteed to never set `match_kind`, but any other option may
1637     /// be overridden.
1638     ///
1639     /// # Examples
1640     ///
1641     /// Basic usage:
1642     ///
1643     /// ```
1644     /// use aho_corasick::AhoCorasickBuilder;
1645     ///
1646     /// let patterns = &["foo", "bar", "baz"];
1647     /// let ac = AhoCorasickBuilder::new()
1648     ///     .auto_configure(patterns)
1649     ///     .build(patterns);
1650     /// assert_eq!(Some(1), ac.find("xxx bar xxx").map(|m| m.pattern()));
1651     /// ```
1652     pub fn auto_configure<B: AsRef<[u8]>>(
1653         &mut self,
1654         patterns: &[B],
1655     ) -> &mut AhoCorasickBuilder {
1656         // N.B. Currently we only use the length of `patterns` to make a
1657         // decision here, and could therefore ask for an `ExactSizeIterator`
1658         // instead. But it's conceivable that we might adapt this to look at
1659         // the total number of bytes, which would requires a second pass.
1660         //
1661         // The logic here is fairly rudimentary at the moment, but probably
1662         // OK. The idea here is to use the fastest thing possible for a small
1663         // number of patterns. That is, a DFA with no byte classes, since byte
1664         // classes require an extra indirection for every byte searched. With a
1665         // moderate number of patterns, we still want a DFA, but save on both
1666         // space and compilation time by enabling byte classes. Finally, fall
1667         // back to the slower but smaller NFA.
1668         if patterns.len() <= 100 {
1669             // N.B. Using byte classes can actually be faster by improving
1670             // locality, but this only really applies for multi-megabyte
1671             // automata (i.e., automata that don't fit in your CPU's cache).
1672             self.dfa(true);
1673         } else if patterns.len() <= 5000 {
1674             self.dfa(true);
1675         }
1676         self
1677     }
1678 
1679     /// Set the desired match semantics.
1680     ///
1681     /// The default is `MatchKind::Standard`, which corresponds to the match
1682     /// semantics supported by the standard textbook description of the
1683     /// Aho-Corasick algorithm. Namely, matches are reported as soon as they
1684     /// are found. Moreover, this is the only way to get overlapping matches
1685     /// or do stream searching.
1686     ///
1687     /// The other kinds of match semantics that are supported are
1688     /// `MatchKind::LeftmostFirst` and `MatchKind::LeftmostLongest`. The former
1689     /// corresponds to the match you would get if you were to try to match
1690     /// each pattern at each position in the haystack in the same order that
1691     /// you give to the automaton. That is, it returns the leftmost match
1692     /// corresponding the earliest pattern given to the automaton. The latter
1693     /// corresponds to finding the longest possible match among all leftmost
1694     /// matches.
1695     ///
1696     /// For more details on match semantics, see the
1697     /// [documentation for `MatchKind`](enum.MatchKind.html).
1698     ///
1699     /// # Examples
1700     ///
1701     /// In these examples, we demonstrate the differences between match
1702     /// semantics for a particular set of patterns in a specific order:
1703     /// `b`, `abc`, `abcd`.
1704     ///
1705     /// Standard semantics:
1706     ///
1707     /// ```
1708     /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
1709     ///
1710     /// let patterns = &["b", "abc", "abcd"];
1711     /// let haystack = "abcd";
1712     ///
1713     /// let ac = AhoCorasickBuilder::new()
1714     ///     .match_kind(MatchKind::Standard) // default, not necessary
1715     ///     .build(patterns);
1716     /// let mat = ac.find(haystack).expect("should have a match");
1717     /// assert_eq!("b", &haystack[mat.start()..mat.end()]);
1718     /// ```
1719     ///
1720     /// Leftmost-first semantics:
1721     ///
1722     /// ```
1723     /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
1724     ///
1725     /// let patterns = &["b", "abc", "abcd"];
1726     /// let haystack = "abcd";
1727     ///
1728     /// let ac = AhoCorasickBuilder::new()
1729     ///     .match_kind(MatchKind::LeftmostFirst)
1730     ///     .build(patterns);
1731     /// let mat = ac.find(haystack).expect("should have a match");
1732     /// assert_eq!("abc", &haystack[mat.start()..mat.end()]);
1733     /// ```
1734     ///
1735     /// Leftmost-longest semantics:
1736     ///
1737     /// ```
1738     /// use aho_corasick::{AhoCorasickBuilder, MatchKind};
1739     ///
1740     /// let patterns = &["b", "abc", "abcd"];
1741     /// let haystack = "abcd";
1742     ///
1743     /// let ac = AhoCorasickBuilder::new()
1744     ///     .match_kind(MatchKind::LeftmostLongest)
1745     ///     .build(patterns);
1746     /// let mat = ac.find(haystack).expect("should have a match");
1747     /// assert_eq!("abcd", &haystack[mat.start()..mat.end()]);
1748     /// ```
1749     pub fn match_kind(&mut self, kind: MatchKind) -> &mut AhoCorasickBuilder {
1750         self.nfa_builder.match_kind(kind);
1751         self
1752     }
1753 
1754     /// Enable anchored mode, which requires all matches to start at the
1755     /// first position in a haystack.
1756     ///
1757     /// This option is disabled by default.
1758     ///
1759     /// # Examples
1760     ///
1761     /// Basic usage:
1762     ///
1763     /// ```
1764     /// use aho_corasick::AhoCorasickBuilder;
1765     ///
1766     /// let patterns = &["foo", "bar"];
1767     /// let haystack = "foobar";
1768     ///
1769     /// let ac = AhoCorasickBuilder::new()
1770     ///     .anchored(true)
1771     ///     .build(patterns);
1772     /// assert_eq!(1, ac.find_iter(haystack).count());
1773     /// ```
1774     ///
1775     /// When searching for overlapping matches, all matches that start at
1776     /// the beginning of a haystack will be reported:
1777     ///
1778     /// ```
1779     /// use aho_corasick::AhoCorasickBuilder;
1780     ///
1781     /// let patterns = &["foo", "foofoo"];
1782     /// let haystack = "foofoo";
1783     ///
1784     /// let ac = AhoCorasickBuilder::new()
1785     ///     .anchored(true)
1786     ///     .build(patterns);
1787     /// assert_eq!(2, ac.find_overlapping_iter(haystack).count());
1788     /// // A non-anchored search would return 3 matches.
1789     /// ```
1790     pub fn anchored(&mut self, yes: bool) -> &mut AhoCorasickBuilder {
1791         self.nfa_builder.anchored(yes);
1792         self
1793     }
1794 
1795     /// Enable ASCII-aware case insensitive matching.
1796     ///
1797     /// When this option is enabled, searching will be performed without
1798     /// respect to case for ASCII letters (`a-z` and `A-Z`) only.
1799     ///
1800     /// Enabling this option does not change the search algorithm, but it may
1801     /// increase the size of the automaton.
1802     ///
1803     /// **NOTE:** In the future, support for full Unicode case insensitivity
1804     /// may be added, but ASCII case insensitivity is comparatively much
1805     /// simpler to add.
1806     ///
1807     /// # Examples
1808     ///
1809     /// Basic usage:
1810     ///
1811     /// ```
1812     /// use aho_corasick::AhoCorasickBuilder;
1813     ///
1814     /// let patterns = &["FOO", "bAr", "BaZ"];
1815     /// let haystack = "foo bar baz";
1816     ///
1817     /// let ac = AhoCorasickBuilder::new()
1818     ///     .ascii_case_insensitive(true)
1819     ///     .build(patterns);
1820     /// assert_eq!(3, ac.find_iter(haystack).count());
1821     /// ```
1822     pub fn ascii_case_insensitive(
1823         &mut self,
1824         yes: bool,
1825     ) -> &mut AhoCorasickBuilder {
1826         self.nfa_builder.ascii_case_insensitive(yes);
1827         self
1828     }
1829 
1830     /// Set the limit on how many NFA states use a dense representation for
1831     /// their transitions.
1832     ///
1833     /// A dense representation uses more space, but supports faster access to
1834     /// transitions at search time. Thus, this setting permits the control of a
1835     /// space vs time trade off when using the NFA variant of Aho-Corasick.
1836     ///
1837     /// This limit is expressed in terms of the depth of a state, i.e., the
1838     /// number of transitions from the starting state of the NFA. The idea is
1839     /// that most of the time searching will be spent near the starting state
1840     /// of the automaton, so states near the start state should use a dense
1841     /// representation. States further away from the start state would then use
1842     /// a sparse representation, which uses less space but is slower to access
1843     /// transitions at search time.
1844     ///
1845     /// By default, this is set to a low but non-zero number.
1846     ///
1847     /// This setting has no effect if the `dfa` option is enabled.
1848     pub fn dense_depth(&mut self, depth: usize) -> &mut AhoCorasickBuilder {
1849         self.nfa_builder.dense_depth(depth);
1850         self
1851     }
1852 
1853     /// Compile the standard Aho-Corasick automaton into a deterministic finite
1854     /// automaton (DFA).
1855     ///
1856     /// When this is disabled (which is the default), then a non-deterministic
1857     /// finite automaton (NFA) is used instead.
1858     ///
1859     /// The main benefit to a DFA is that it can execute searches more quickly
1860     /// than a NFA (perhaps 2-4 times as fast). The main drawback is that the
1861     /// DFA uses more space and can take much longer to build.
1862     ///
1863     /// Enabling this option does not change the time complexity for
1864     /// constructing the Aho-Corasick automaton (which is `O(p)` where
1865     /// `p` is the total number of patterns being compiled). Enabling this
1866     /// option does however reduce the time complexity of non-overlapping
1867     /// searches from `O(n + p)` to `O(n)`, where `n` is the length of the
1868     /// haystack.
1869     ///
1870     /// In general, it's a good idea to enable this if you're searching a
1871     /// small number of fairly short patterns (~1000), or if you want the
1872     /// fastest possible search without regard to compilation time or space
1873     /// usage.
1874     pub fn dfa(&mut self, yes: bool) -> &mut AhoCorasickBuilder {
1875         self.dfa = yes;
1876         self
1877     }
1878 
1879     /// Enable heuristic prefilter optimizations.
1880     ///
1881     /// When enabled, searching will attempt to quickly skip to match
1882     /// candidates using specialized literal search routines. A prefilter
1883     /// cannot always be used, and is generally treated as a heuristic. It
1884     /// can be useful to disable this if the prefilter is observed to be
1885     /// sub-optimal for a particular workload.
1886     ///
1887     /// This is enabled by default.
1888     pub fn prefilter(&mut self, yes: bool) -> &mut AhoCorasickBuilder {
1889         self.nfa_builder.prefilter(yes);
1890         self
1891     }
1892 
1893     /// Shrink the size of the transition alphabet by mapping bytes to their
1894     /// equivalence classes. This only has an effect when the `dfa` option is
1895     /// enabled.
1896     ///
1897     /// When enabled, each a DFA will use a map from all possible bytes
1898     /// to their corresponding equivalence class. Each equivalence class
1899     /// represents a set of bytes that does not discriminate between a match
1900     /// and a non-match in the DFA. For example, the patterns `bar` and `baz`
1901     /// have at least five equivalence classes: singleton sets of `b`, `a`, `r`
1902     /// and `z`, and a final set that contains every other byte.
1903     ///
1904     /// The advantage of this map is that the size of the transition table can
1905     /// be reduced drastically from `#states * 256 * sizeof(id)` to
1906     /// `#states * k * sizeof(id)` where `k` is the number of equivalence
1907     /// classes. As a result, total space usage can decrease substantially.
1908     /// Moreover, since a smaller alphabet is used, compilation becomes faster
1909     /// as well.
1910     ///
1911     /// The disadvantage of this map is that every byte searched must be
1912     /// passed through this map before it can be used to determine the next
1913     /// transition. This has a small match time performance cost. However, if
1914     /// the DFA is otherwise very large without byte classes, then using byte
1915     /// classes can greatly improve memory locality and thus lead to better
1916     /// overall performance.
1917     ///
1918     /// This option is enabled by default.
1919     #[deprecated(
1920         since = "0.7.16",
1921         note = "not carrying its weight, will be always enabled, see: https://github.com/BurntSushi/aho-corasick/issues/57"
1922     )]
1923     pub fn byte_classes(&mut self, yes: bool) -> &mut AhoCorasickBuilder {
1924         self.dfa_builder.byte_classes(yes);
1925         self
1926     }
1927 
1928     /// Premultiply state identifiers in the transition table. This only has
1929     /// an effect when the `dfa` option is enabled.
1930     ///
1931     /// When enabled, state identifiers are premultiplied to point to their
1932     /// corresponding row in the transition table. That is, given the `i`th
1933     /// state, its corresponding premultiplied identifier is `i * k` where `k`
1934     /// is the alphabet size of the automaton. (The alphabet size is at most
1935     /// 256, but is in practice smaller if byte classes is enabled.)
1936     ///
1937     /// When state identifiers are not premultiplied, then the identifier of
1938     /// the `i`th state is `i`.
1939     ///
1940     /// The advantage of premultiplying state identifiers is that is saves a
1941     /// multiplication instruction per byte when searching with a DFA. This has
1942     /// been observed to lead to a 20% performance benefit in micro-benchmarks.
1943     ///
1944     /// The primary disadvantage of premultiplying state identifiers is
1945     /// that they require a larger integer size to represent. For example,
1946     /// if the DFA has 200 states, then its premultiplied form requires 16
1947     /// bits to represent every possible state identifier, where as its
1948     /// non-premultiplied form only requires 8 bits.
1949     ///
1950     /// This option is enabled by default.
1951     #[deprecated(
1952         since = "0.7.16",
1953         note = "not carrying its weight, will be always enabled, see: https://github.com/BurntSushi/aho-corasick/issues/57"
1954     )]
1955     pub fn premultiply(&mut self, yes: bool) -> &mut AhoCorasickBuilder {
1956         self.dfa_builder.premultiply(yes);
1957         self
1958     }
1959 }
1960 
1961 /// A knob for controlling the match semantics of an Aho-Corasick automaton.
1962 ///
1963 /// There are two generally different ways that Aho-Corasick automatons can
1964 /// report matches. The first way is the "standard" approach that results from
1965 /// implementing most textbook explanations of Aho-Corasick. The second way is
1966 /// to report only the leftmost non-overlapping matches. The leftmost approach
1967 /// is in turn split into two different ways of resolving ambiguous matches:
1968 /// leftmost-first and leftmost-longest.
1969 ///
1970 /// The `Standard` match kind is the default and is the only one that supports
1971 /// overlapping matches and stream searching. (Trying to find overlapping
1972 /// or streaming matches using leftmost match semantics will result in a
1973 /// panic.) The `Standard` match kind will report matches as they are seen.
1974 /// When searching for overlapping matches, then all possible matches are
1975 /// reported. When searching for non-overlapping matches, the first match seen
1976 /// is reported. For example, for non-overlapping matches, given the patterns
1977 /// `abcd` and `b` and the subject string `abcdef`, only a match for `b` is
1978 /// reported since it is detected first. The `abcd` match is never reported
1979 /// since it overlaps with the `b` match.
1980 ///
1981 /// In contrast, the leftmost match kind always prefers the leftmost match
1982 /// among all possible matches. Given the same example as above with `abcd` and
1983 /// `b` as patterns and `abcdef` as the subject string, the leftmost match is
1984 /// `abcd` since it begins before the `b` match, even though the `b` match is
1985 /// detected before the `abcd` match. In this case, the `b` match is not
1986 /// reported at all since it overlaps with the `abcd` match.
1987 ///
1988 /// The difference between leftmost-first and leftmost-longest is in how they
1989 /// resolve ambiguous matches when there are multiple leftmost matches to
1990 /// choose from. Leftmost-first always chooses the pattern that was provided
1991 /// earliest, where as leftmost-longest always chooses the longest matching
1992 /// pattern. For example, given the patterns `a` and `ab` and the subject
1993 /// string `ab`, the leftmost-first match is `a` but the leftmost-longest match
1994 /// is `ab`. Conversely, if the patterns were given in reverse order, i.e.,
1995 /// `ab` and `a`, then both the leftmost-first and leftmost-longest matches
1996 /// would be `ab`. Stated differently, the leftmost-first match depends on the
1997 /// order in which the patterns were given to the Aho-Corasick automaton.
1998 /// Because of that, when leftmost-first matching is used, if a pattern `A`
1999 /// that appears before a pattern `B` is a prefix of `B`, then it is impossible
2000 /// to ever observe a match of `B`.
2001 ///
2002 /// If you're not sure which match kind to pick, then stick with the standard
2003 /// kind, which is the default. In particular, if you need overlapping or
2004 /// streaming matches, then you _must_ use the standard kind. The leftmost
2005 /// kinds are useful in specific circumstances. For example, leftmost-first can
2006 /// be very useful as a way to implement match priority based on the order of
2007 /// patterns given and leftmost-longest can be useful for dictionary searching
2008 /// such that only the longest matching words are reported.
2009 ///
2010 /// # Relationship with regular expression alternations
2011 ///
2012 /// Understanding match semantics can be a little tricky, and one easy way
2013 /// to conceptualize non-overlapping matches from an Aho-Corasick automaton
2014 /// is to think about them as a simple alternation of literals in a regular
2015 /// expression. For example, let's say we wanted to match the strings
2016 /// `Sam` and `Samwise`, which would turn into the regex `Sam|Samwise`. It
2017 /// turns out that regular expression engines have two different ways of
2018 /// matching this alternation. The first way, leftmost-longest, is commonly
2019 /// found in POSIX compatible implementations of regular expressions (such as
2020 /// `grep`). The second way, leftmost-first, is commonly found in backtracking
2021 /// implementations such as Perl. (Some regex engines, such as RE2 and Rust's
2022 /// regex engine do not use backtracking, but still implement leftmost-first
2023 /// semantics in an effort to match the behavior of dominant backtracking
2024 /// regex engines such as those found in Perl, Ruby, Python, Javascript and
2025 /// PHP.)
2026 ///
2027 /// That is, when matching `Sam|Samwise` against `Samwise`, a POSIX regex
2028 /// will match `Samwise` because it is the longest possible match, but a
2029 /// Perl-like regex will match `Sam` since it appears earlier in the
2030 /// alternation. Indeed, the regex `Sam|Samwise` in a Perl-like regex engine
2031 /// will never match `Samwise` since `Sam` will always have higher priority.
2032 /// Conversely, matching the regex `Samwise|Sam` against `Samwise` will lead to
2033 /// a match of `Samwise` in both POSIX and Perl-like regexes since `Samwise` is
2034 /// still longest match, but it also appears earlier than `Sam`.
2035 ///
2036 /// The "standard" match semantics of Aho-Corasick generally don't correspond
2037 /// to the match semantics of any large group of regex implementations, so
2038 /// there's no direct analogy that can be made here. Standard match semantics
2039 /// are generally useful for overlapping matches, or if you just want to see
2040 /// matches as they are detected.
2041 ///
2042 /// The main conclusion to draw from this section is that the match semantics
2043 /// can be tweaked to precisely match either Perl-like regex alternations or
2044 /// POSIX regex alternations.
2045 #[derive(Clone, Copy, Debug, Eq, PartialEq)]
2046 pub enum MatchKind {
2047     /// Use standard match semantics, which support overlapping matches. When
2048     /// used with non-overlapping matches, matches are reported as they are
2049     /// seen.
2050     Standard,
2051     /// Use leftmost-first match semantics, which reports leftmost matches.
2052     /// When there are multiple possible leftmost matches, the match
2053     /// corresponding to the pattern that appeared earlier when constructing
2054     /// the automaton is reported.
2055     ///
2056     /// This does **not** support overlapping matches or stream searching. If
2057     /// this match kind is used, attempting to find overlapping matches or
2058     /// stream matches will panic.
2059     LeftmostFirst,
2060     /// Use leftmost-longest match semantics, which reports leftmost matches.
2061     /// When there are multiple possible leftmost matches, the longest match
2062     /// is chosen.
2063     ///
2064     /// This does **not** support overlapping matches or stream searching. If
2065     /// this match kind is used, attempting to find overlapping matches or
2066     /// stream matches will panic.
2067     LeftmostLongest,
2068     /// Hints that destructuring should not be exhaustive.
2069     ///
2070     /// This enum may grow additional variants, so this makes sure clients
2071     /// don't count on exhaustive matching. (Otherwise, adding a new variant
2072     /// could break existing code.)
2073     #[doc(hidden)]
2074     __Nonexhaustive,
2075 }
2076 
2077 /// The default match kind is `MatchKind::Standard`.
2078 impl Default for MatchKind {
2079     fn default() -> MatchKind {
2080         MatchKind::Standard
2081     }
2082 }
2083 
2084 impl MatchKind {
2085     fn supports_overlapping(&self) -> bool {
2086         self.is_standard()
2087     }
2088 
2089     fn supports_stream(&self) -> bool {
2090         // TODO: It may be possible to support this. It's hard.
2091         //
2092         // See: https://github.com/rust-lang/regex/issues/425#issuecomment-471367838
2093         self.is_standard()
2094     }
2095 
2096     pub(crate) fn is_standard(&self) -> bool {
2097         *self == MatchKind::Standard
2098     }
2099 
2100     pub(crate) fn is_leftmost(&self) -> bool {
2101         *self == MatchKind::LeftmostFirst
2102             || *self == MatchKind::LeftmostLongest
2103     }
2104 
2105     pub(crate) fn is_leftmost_first(&self) -> bool {
2106         *self == MatchKind::LeftmostFirst
2107     }
2108 
2109     /// Convert this match kind into a packed match kind. If this match kind
2110     /// corresponds to standard semantics, then this returns None, since
2111     /// packed searching does not support standard semantics.
2112     pub(crate) fn as_packed(&self) -> Option<packed::MatchKind> {
2113         match *self {
2114             MatchKind::Standard => None,
2115             MatchKind::LeftmostFirst => Some(packed::MatchKind::LeftmostFirst),
2116             MatchKind::LeftmostLongest => {
2117                 Some(packed::MatchKind::LeftmostLongest)
2118             }
2119             MatchKind::__Nonexhaustive => unreachable!(),
2120         }
2121     }
2122 }
2123 
2124 #[cfg(test)]
2125 mod tests {
2126     use super::*;
2127 
2128     #[test]
2129     fn oibits() {
2130         use std::panic::{RefUnwindSafe, UnwindSafe};
2131 
2132         fn assert_send<T: Send>() {}
2133         fn assert_sync<T: Sync>() {}
2134         fn assert_unwind_safe<T: RefUnwindSafe + UnwindSafe>() {}
2135 
2136         assert_send::<AhoCorasick>();
2137         assert_sync::<AhoCorasick>();
2138         assert_unwind_safe::<AhoCorasick>();
2139         assert_send::<AhoCorasickBuilder>();
2140         assert_sync::<AhoCorasickBuilder>();
2141         assert_unwind_safe::<AhoCorasickBuilder>();
2142     }
2143 }
2144