1 macro_rules! define_set {
2     ($name:ident, $builder_mod:ident, $text_ty:ty, $as_bytes:expr,
3      $(#[$doc_regexset_example:meta])* ) => {
4         pub mod $name {
5             use std::fmt;
6             use std::iter;
7             use std::slice;
8             use std::vec;
9 
10             use crate::error::Error;
11             use crate::exec::Exec;
12             use crate::re_builder::$builder_mod::RegexSetBuilder;
13             use crate::re_trait::RegularExpression;
14 
/// Match multiple (possibly overlapping) regular expressions in a single scan.
///
/// A regex set corresponds to the union of two or more regular expressions.
/// That is, a regex set will match text where at least one of its
/// constituent regular expressions matches. A regex set as it's formulated here
/// provides a touch more power: it will also report *which* regular
/// expressions in the set match. Indeed, this is the key difference between
/// regex sets and a single `Regex` with many alternates, since only one
/// alternate can match at a time.
///
/// For example, consider regular expressions to match email addresses and
/// domains: `[a-z]+@[a-z]+\.(com|org|net)` and `[a-z]+\.(com|org|net)`. If a
/// regex set is constructed from those regexes, then searching the text
/// `foo@example.com` will report both regexes as matching. Of course, one
/// could accomplish this by compiling each regex on its own and doing two
/// searches over the text. The key advantage of using a regex set is that it
/// will report the matching regexes using a *single pass through the text*.
/// If one has hundreds or thousands of regexes to match repeatedly (like a URL
/// router for a complex web application or a user agent matcher), then a regex
/// set can realize huge performance gains.
///
/// # Example
///
/// This shows how the above two regexes (for matching email addresses and
/// domains) might work:
///
$(#[$doc_regexset_example])*
///
/// Note that it would be possible to adapt the above example to using `Regex`
/// with an expression like:
///
/// ```text
/// (?P<email>[a-z]+@(?P<email_domain>[a-z]+[.](com|org|net)))|(?P<domain>[a-z]+[.](com|org|net))
/// ```
///
/// After a match, one could then inspect the capture groups to figure out
/// which alternates matched. The problem is that it is hard to make this
/// approach scale when there are many regexes since the overlap between each
/// alternate isn't always obvious to reason about.
///
/// # Limitations
///
/// Regex sets are limited to answering the following two questions:
///
/// 1. Does any regex in the set match?
/// 2. If so, which regexes in the set match?
///
/// As with the main `Regex` type, it is cheaper to ask (1) instead of (2)
/// since the matching engines can stop after the first match is found.
///
/// Other features like finding the location of successive matches or their
/// sub-captures aren't supported. If you need this functionality, the
/// recommended approach is to compile each regex in the set independently and
/// selectively match them based on which regexes in the set matched.
///
/// # Performance
///
/// A `RegexSet` has the same performance characteristics as `Regex`. Namely,
/// search takes `O(mn)` time, where `m` is proportional to the size of the
/// regex set and `n` is proportional to the length of the search text.
// Newtype around `Exec`: the methods of this type delegate to the wrapped
// value (`self.0`) for both searching and pattern introspection.
#[derive(Clone)]
pub struct RegexSet(Exec);
77 
78 impl RegexSet {
79     /// Create a new regex set with the given regular expressions.
80     ///
81     /// This takes an iterator of `S`, where `S` is something that can produce
82     /// a `&str`. If any of the strings in the iterator are not valid regular
83     /// expressions, then an error is returned.
84     ///
85     /// # Example
86     ///
87     /// Create a new regex set from an iterator of strings:
88     ///
89     /// ```rust
90     /// # use regex::RegexSet;
91     /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap();
92     /// assert!(set.is_match("foo"));
93     /// ```
94     pub fn new<I, S>(exprs: I) -> Result<RegexSet, Error>
95             where S: AsRef<str>, I: IntoIterator<Item=S> {
96         RegexSetBuilder::new(exprs).build()
97     }
98 
99     /// Create a new empty regex set.
100     ///
101     /// # Example
102     ///
103     /// ```rust
104     /// # use regex::RegexSet;
105     /// let set = RegexSet::empty();
106     /// assert!(set.is_empty());
107     /// ```
108     pub fn empty() -> RegexSet {
109         RegexSetBuilder::new(&[""; 0]).build().unwrap()
110     }
111 
112     /// Returns true if and only if one of the regexes in this set matches
113     /// the text given.
114     ///
115     /// This method should be preferred if you only need to test whether any
116     /// of the regexes in the set should match, but don't care about *which*
117     /// regexes matched. This is because the underlying matching engine will
118     /// quit immediately after seeing the first match instead of continuing to
119     /// find all matches.
120     ///
121     /// Note that as with searches using `Regex`, the expression is unanchored
122     /// by default. That is, if the regex does not start with `^` or `\A`, or
123     /// end with `$` or `\z`, then it is permitted to match anywhere in the
124     /// text.
125     ///
126     /// # Example
127     ///
128     /// Tests whether a set matches some text:
129     ///
130     /// ```rust
131     /// # use regex::RegexSet;
132     /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap();
133     /// assert!(set.is_match("foo"));
134     /// assert!(!set.is_match("☃"));
135     /// ```
136     pub fn is_match(&self, text: $text_ty) -> bool {
137         self.is_match_at(text, 0)
138     }
139 
140     /// Returns the same as is_match, but starts the search at the given
141     /// offset.
142     ///
143     /// The significance of the starting point is that it takes the surrounding
144     /// context into consideration. For example, the `\A` anchor can only
145     /// match when `start == 0`.
146     #[doc(hidden)]
147     pub fn is_match_at(&self, text: $text_ty, start: usize) -> bool {
148         self.0.searcher().is_match_at($as_bytes(text), start)
149     }
150 
151     /// Returns the set of regular expressions that match in the given text.
152     ///
153     /// The set returned contains the index of each regular expression that
154     /// matches in the given text. The index is in correspondence with the
155     /// order of regular expressions given to `RegexSet`'s constructor.
156     ///
157     /// The set can also be used to iterate over the matched indices.
158     ///
159     /// Note that as with searches using `Regex`, the expression is unanchored
160     /// by default. That is, if the regex does not start with `^` or `\A`, or
161     /// end with `$` or `\z`, then it is permitted to match anywhere in the
162     /// text.
163     ///
164     /// # Example
165     ///
166     /// Tests which regular expressions match the given text:
167     ///
168     /// ```rust
169     /// # use regex::RegexSet;
170     /// let set = RegexSet::new(&[
171     ///     r"\w+",
172     ///     r"\d+",
173     ///     r"\pL+",
174     ///     r"foo",
175     ///     r"bar",
176     ///     r"barfoo",
177     ///     r"foobar",
178     /// ]).unwrap();
179     /// let matches: Vec<_> = set.matches("foobar").into_iter().collect();
180     /// assert_eq!(matches, vec![0, 2, 3, 4, 6]);
181     ///
182     /// // You can also test whether a particular regex matched:
183     /// let matches = set.matches("foobar");
184     /// assert!(!matches.matched(5));
185     /// assert!(matches.matched(6));
186     /// ```
187     pub fn matches(&self, text: $text_ty) -> SetMatches {
188         let mut matches = vec![false; self.0.regex_strings().len()];
189         let any = self.read_matches_at(&mut matches, text, 0);
190         SetMatches {
191             matched_any: any,
192             matches: matches,
193         }
194     }
195 
196     /// Returns the same as matches, but starts the search at the given
197     /// offset and stores the matches into the slice given.
198     ///
199     /// The significance of the starting point is that it takes the surrounding
200     /// context into consideration. For example, the `\A` anchor can only
201     /// match when `start == 0`.
202     ///
203     /// `matches` must have a length that is at least the number of regexes
204     /// in this set.
205     ///
206     /// This method returns true if and only if at least one member of
207     /// `matches` is true after executing the set against `text`.
208     #[doc(hidden)]
209     pub fn read_matches_at(
210         &self,
211         matches: &mut [bool],
212         text: $text_ty,
213         start: usize,
214     ) -> bool {
215         self.0.searcher().many_matches_at(matches, $as_bytes(text), start)
216     }
217 
218     /// Returns the total number of regular expressions in this set.
219     pub fn len(&self) -> usize {
220         self.0.regex_strings().len()
221     }
222 
223     /// Returns `true` if this set contains no regular expressions.
224     pub fn is_empty(&self) -> bool {
225         self.0.regex_strings().is_empty()
226     }
227 
228     /// Returns the patterns that this set will match on.
229     ///
230     /// This function can be used to determine the pattern for a match. The
231     /// slice returned has exactly as many patterns givens to this regex set,
232     /// and the order of the slice is the same as the order of the patterns
233     /// provided to the set.
234     ///
235     /// # Example
236     ///
237     /// ```rust
238     /// # use regex::RegexSet;
239     /// let set = RegexSet::new(&[
240     ///     r"\w+",
241     ///     r"\d+",
242     ///     r"\pL+",
243     ///     r"foo",
244     ///     r"bar",
245     ///     r"barfoo",
246     ///     r"foobar",
247     /// ]).unwrap();
248     /// let matches: Vec<_> = set
249     ///     .matches("foobar")
250     ///     .into_iter()
251     ///     .map(|match_idx| &set.patterns()[match_idx])
252     ///     .collect();
253     /// assert_eq!(matches, vec![r"\w+", r"\pL+", r"foo", r"bar", r"foobar"]);
254     /// ```
255     pub fn patterns(&self) -> &[String] {
256         self.0.regex_strings()
257     }
258 }
259 
260 /// A set of matches returned by a regex set.
261 #[derive(Clone, Debug)]
262 pub struct SetMatches {
263     matched_any: bool,
264     matches: Vec<bool>,
265 }
266 
267 impl SetMatches {
268     /// Whether this set contains any matches.
269     pub fn matched_any(&self) -> bool {
270         self.matched_any
271     }
272 
273     /// Whether the regex at the given index matched.
274     ///
275     /// The index for a regex is determined by its insertion order upon the
276     /// initial construction of a `RegexSet`, starting at `0`.
277     ///
278     /// # Panics
279     ///
280     /// If `regex_index` is greater than or equal to `self.len()`.
281     pub fn matched(&self, regex_index: usize) -> bool {
282         self.matches[regex_index]
283     }
284 
285     /// The total number of regexes in the set that created these matches.
286     pub fn len(&self) -> usize {
287         self.matches.len()
288     }
289 
290     /// Returns an iterator over indexes in the regex that matched.
291     ///
292     /// This will always produces matches in ascending order of index, where
293     /// the index corresponds to the index of the regex that matched with
294     /// respect to its position when initially building the set.
295     pub fn iter(&self) -> SetMatchesIter<'_> {
296         SetMatchesIter((&*self.matches).into_iter().enumerate())
297     }
298 }
299 
300 impl IntoIterator for SetMatches {
301     type IntoIter = SetMatchesIntoIter;
302     type Item = usize;
303 
304     fn into_iter(self) -> Self::IntoIter {
305         SetMatchesIntoIter(self.matches.into_iter().enumerate())
306     }
307 }
308 
309 impl<'a> IntoIterator for &'a SetMatches {
310     type IntoIter = SetMatchesIter<'a>;
311     type Item = usize;
312 
313     fn into_iter(self) -> Self::IntoIter {
314         self.iter()
315     }
316 }
317 
/// An owned iterator over the set of matches from a regex set.
///
/// This will always produce matches in ascending order of index, where the
/// index corresponds to the index of the regex that matched with respect to
/// its position when initially building the set.
#[derive(Debug)]
pub struct SetMatchesIntoIter(iter::Enumerate<vec::IntoIter<bool>>);

impl Iterator for SetMatchesIntoIter {
    type Item = usize;

    /// Advances to the next regex that matched and returns its index, or
    /// `None` once no matching regex remains.
    fn next(&mut self) -> Option<usize> {
        self.0.find_map(|(index, matched)| if matched { Some(index) } else { None })
    }

    /// Returns `(0, upper)`, where `upper` is the number of flags not yet
    /// inspected.
    fn size_hint(&self) -> (usize, Option<usize>) {
        // Entries whose flag is `false` are skipped, so it is possible that
        // none of the remaining entries produce an item. Only the upper bound
        // of the underlying iterator carries over.
        let (_, upper) = self.0.size_hint();
        (0, upper)
    }
}

impl DoubleEndedIterator for SetMatchesIntoIter {
    /// Same as `next`, but scans from the highest remaining index downwards.
    fn next_back(&mut self) -> Option<usize> {
        self.0.rfind(|&(_, matched)| matched).map(|(index, _)| index)
    }
}

impl iter::FusedIterator for SetMatchesIntoIter {}
357 
/// A borrowed iterator over the set of matches from a regex set.
///
/// The lifetime `'a` refers to the lifetime of a `SetMatches` value.
///
/// This will always produce matches in ascending order of index, where the
/// index corresponds to the index of the regex that matched with respect to
/// its position when initially building the set.
#[derive(Clone, Debug)]
pub struct SetMatchesIter<'a>(iter::Enumerate<slice::Iter<'a, bool>>);

impl<'a> Iterator for SetMatchesIter<'a> {
    type Item = usize;

    /// Advances to the next regex that matched and returns its index, or
    /// `None` once no matching regex remains.
    fn next(&mut self) -> Option<usize> {
        self.0.find_map(|(index, &matched)| if matched { Some(index) } else { None })
    }

    /// Returns `(0, upper)`, where `upper` is the number of flags not yet
    /// inspected.
    fn size_hint(&self) -> (usize, Option<usize>) {
        // Entries whose flag is `false` are skipped, so it is possible that
        // none of the remaining entries produce an item. Only the upper bound
        // of the underlying iterator carries over.
        let (_, upper) = self.0.size_hint();
        (0, upper)
    }
}

impl<'a> DoubleEndedIterator for SetMatchesIter<'a> {
    /// Same as `next`, but scans from the highest remaining index downwards.
    fn next_back(&mut self) -> Option<usize> {
        self.0.rfind(|&(_, &matched)| matched).map(|(index, _)| index)
    }
}

impl<'a> iter::FusedIterator for SetMatchesIter<'a> {}
399 
400 #[doc(hidden)]
401 impl From<Exec> for RegexSet {
402     fn from(exec: Exec) -> Self {
403         RegexSet(exec)
404     }
405 }
406 
407 impl fmt::Debug for RegexSet {
408     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
409         write!(f, "RegexSet({:?})", self.0.regex_strings())
410     }
411 }
412 
// Views a string slice as its underlying UTF-8 bytes. Passed to `define_set!`
// as the `$as_bytes` adapter when the text type is `&str`.
//
// `dead_code` is allowed because every instantiation of the macro defines
// both adapters but references only one of them.
#[allow(dead_code)]
fn as_bytes_str(text: &str) -> &[u8] {
    str::as_bytes(text)
}
// Identity adapter: the text is already a byte slice. Passed to `define_set!`
// as the `$as_bytes` adapter when the text type is `&[u8]`.
//
// `dead_code` is allowed because every instantiation of the macro defines
// both adapters but references only one of them.
#[allow(dead_code)]
fn as_bytes_bytes(text: &[u8]) -> &[u8] {
    text
}
415         }
416     }
417 }
418 
// Instantiates the `unicode` module. Arguments, in order:
//   - `unicode`      : name of the generated `pub mod`
//   - `set_unicode`  : submodule of `crate::re_builder` that supplies `RegexSetBuilder`
//   - `&str`         : text type accepted by the search methods
//   - `as_bytes_str` : adapter that turns that text into `&[u8]` for the searcher
// The trailing doc-comment lines are spliced into the `RegexSet` type
// documentation, directly under its `# Example` heading.
define_set! {
    unicode,
    set_unicode,
    &str,
    as_bytes_str,
/// ```rust
/// # use regex::RegexSet;
/// let set = RegexSet::new(&[
///     r"[a-z]+@[a-z]+\.(com|org|net)",
///     r"[a-z]+\.(com|org|net)",
/// ]).unwrap();
///
/// // Ask whether any regexes in the set match.
/// assert!(set.is_match("foo@example.com"));
///
/// // Identify which regexes in the set match.
/// let matches: Vec<_> = set.matches("foo@example.com").into_iter().collect();
/// assert_eq!(vec![0, 1], matches);
///
/// // Try again, but with text that only matches one of the regexes.
/// let matches: Vec<_> = set.matches("example.com").into_iter().collect();
/// assert_eq!(vec![1], matches);
///
/// // Try again, but with text that doesn't match any regex in the set.
/// let matches: Vec<_> = set.matches("example").into_iter().collect();
/// assert!(matches.is_empty());
/// ```
}
447 
// Instantiates the `bytes` module. Arguments, in order:
//   - `bytes`          : name of the generated `pub mod`
//   - `set_bytes`      : submodule of `crate::re_builder` that supplies `RegexSetBuilder`
//   - `&[u8]`          : text type accepted by the search methods
//   - `as_bytes_bytes` : adapter that passes the byte slice through unchanged
// The trailing doc-comment lines are spliced into the `RegexSet` type
// documentation, directly under its `# Example` heading.
define_set! {
    bytes,
    set_bytes,
    &[u8],
    as_bytes_bytes,
/// ```rust
/// # use regex::bytes::RegexSet;
/// let set = RegexSet::new(&[
///     r"[a-z]+@[a-z]+\.(com|org|net)",
///     r"[a-z]+\.(com|org|net)",
/// ]).unwrap();
///
/// // Ask whether any regexes in the set match.
/// assert!(set.is_match(b"foo@example.com"));
///
/// // Identify which regexes in the set match.
/// let matches: Vec<_> = set.matches(b"foo@example.com").into_iter().collect();
/// assert_eq!(vec![0, 1], matches);
///
/// // Try again, but with text that only matches one of the regexes.
/// let matches: Vec<_> = set.matches(b"example.com").into_iter().collect();
/// assert_eq!(vec![1], matches);
///
/// // Try again, but with text that doesn't match any regex in the set.
/// let matches: Vec<_> = set.matches(b"example").into_iter().collect();
/// assert!(matches.is_empty());
/// ```
}
476