1 // Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
4 //
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
10 
11 macro_rules! define_set {
12     ($name:ident, $builder_mod:ident, $text_ty:ty, $as_bytes:expr,
13      $(#[$doc_regexset_example:meta])* ) => {
14         pub mod $name {
15             use std::fmt;
16             use std::iter;
17             use std::slice;
18             use std::vec;
19 
20             use error::Error;
21             use exec::Exec;
22             use re_builder::$builder_mod::RegexSetBuilder;
23             use re_trait::RegularExpression;
24 
/// Match multiple (possibly overlapping) regular expressions in a single scan.
///
/// A regex set corresponds to the union of two or more regular expressions.
/// That is, a regex set will match text where at least one of its
/// constituent regular expressions matches. A regex set as it is formulated
/// here provides a touch more power: it will also report *which* regular
/// expressions in the set match. Indeed, this is the key difference between
/// regex sets and a single `Regex` with many alternates, since only one
/// alternate can match at a time.
///
/// For example, consider regular expressions to match email addresses and
/// domains: `[a-z]+@[a-z]+\.(com|org|net)` and `[a-z]+\.(com|org|net)`. If a
/// regex set is constructed from those regexes, then searching the text
/// `foo@example.com` will report both regexes as matching. Of course, one
/// could accomplish this by compiling each regex on its own and doing two
/// searches over the text. The key advantage of using a regex set is that it
/// will report the matching regexes using a *single pass through the text*.
/// If one has hundreds or thousands of regexes to match repeatedly (like a URL
/// router for a complex web application or a user agent matcher), then a regex
/// set can realize huge performance gains.
///
/// # Example
///
/// This shows how the above two regexes (for matching email addresses and
/// domains) might work:
///
$(#[$doc_regexset_example])*
///
/// Note that it would be possible to adapt the above example to using `Regex`
/// with an expression like:
///
/// ```ignore
/// (?P<email>[a-z]+@(?P<email_domain>[a-z]+[.](com|org|net)))|(?P<domain>[a-z]+[.](com|org|net))
/// ```
///
/// After a match, one could then inspect the capture groups to figure out
/// which alternates matched. The problem is that it is hard to make this
/// approach scale when there are many regexes since the overlap between each
/// alternate isn't always obvious to reason about.
///
/// # Limitations
///
/// Regex sets are limited to answering the following two questions:
///
/// 1. Does any regex in the set match?
/// 2. If so, which regexes in the set match?
///
/// As with the main `Regex` type, it is cheaper to ask (1) instead of (2)
/// since the matching engines can stop after the first match is found.
///
/// Other features like finding the location of successive matches or their
/// sub-captures aren't supported. If you need this functionality, the
/// recommended approach is to compile each regex in the set independently and
/// selectively match them based on which regexes in the set matched.
///
/// # Performance
///
/// A `RegexSet` has the same performance characteristics as `Regex`. Namely,
/// search takes `O(mn)` time, where `m` is proportional to the size of the
/// regex set and `n` is proportional to the length of the search text.
// Newtype over `Exec`: the search and pattern accessors of this type all
// delegate to the wrapped value.
#[derive(Clone)]
pub struct RegexSet(Exec);
87 
88 impl RegexSet {
89     /// Create a new regex set with the given regular expressions.
90     ///
91     /// This takes an iterator of `S`, where `S` is something that can produce
92     /// a `&str`. If any of the strings in the iterator are not valid regular
93     /// expressions, then an error is returned.
94     ///
95     /// # Example
96     ///
97     /// Create a new regex set from an iterator of strings:
98     ///
99     /// ```rust
100     /// # use regex::RegexSet;
101     /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap();
102     /// assert!(set.is_match("foo"));
103     /// ```
104     pub fn new<I, S>(exprs: I) -> Result<RegexSet, Error>
105             where S: AsRef<str>, I: IntoIterator<Item=S> {
106         RegexSetBuilder::new(exprs).build()
107     }
108 
109     /// Returns true if and only if one of the regexes in this set matches
110     /// the text given.
111     ///
112     /// This method should be preferred if you only need to test whether any
113     /// of the regexes in the set should match, but don't care about *which*
114     /// regexes matched. This is because the underlying matching engine will
115     /// quit immediately after seeing the first match instead of continuing to
116     /// find all matches.
117     ///
118     /// Note that as with searches using `Regex`, the expression is unanchored
119     /// by default. That is, if the regex does not start with `^` or `\A`, or
120     /// end with `$` or `\z`, then it is permitted to match anywhere in the
121     /// text.
122     ///
123     /// # Example
124     ///
125     /// Tests whether a set matches some text:
126     ///
127     /// ```rust
128     /// # use regex::RegexSet;
129     /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap();
130     /// assert!(set.is_match("foo"));
131     /// assert!(!set.is_match("☃"));
132     /// ```
133     pub fn is_match(&self, text: $text_ty) -> bool {
134         self.is_match_at(text, 0)
135     }
136 
137     /// Returns the same as is_match, but starts the search at the given
138     /// offset.
139     ///
140     /// The significance of the starting point is that it takes the surrounding
141     /// context into consideration. For example, the `\A` anchor can only
142     /// match when `start == 0`.
143     #[doc(hidden)]
144     pub fn is_match_at(&self, text: $text_ty, start: usize) -> bool {
145         self.0.searcher().is_match_at($as_bytes(text), start)
146     }
147 
148     /// Returns the set of regular expressions that match in the given text.
149     ///
150     /// The set returned contains the index of each regular expression that
151     /// matches in the given text. The index is in correspondence with the
152     /// order of regular expressions given to `RegexSet`'s constructor.
153     ///
154     /// The set can also be used to iterate over the matched indices.
155     ///
156     /// Note that as with searches using `Regex`, the expression is unanchored
157     /// by default. That is, if the regex does not start with `^` or `\A`, or
158     /// end with `$` or `\z`, then it is permitted to match anywhere in the
159     /// text.
160     ///
161     /// # Example
162     ///
163     /// Tests which regular expressions match the given text:
164     ///
165     /// ```rust
166     /// # use regex::RegexSet;
167     /// let set = RegexSet::new(&[
168     ///     r"\w+",
169     ///     r"\d+",
170     ///     r"\pL+",
171     ///     r"foo",
172     ///     r"bar",
173     ///     r"barfoo",
174     ///     r"foobar",
175     /// ]).unwrap();
176     /// let matches: Vec<_> = set.matches("foobar").into_iter().collect();
177     /// assert_eq!(matches, vec![0, 2, 3, 4, 6]);
178     ///
179     /// // You can also test whether a particular regex matched:
180     /// let matches = set.matches("foobar");
181     /// assert!(!matches.matched(5));
182     /// assert!(matches.matched(6));
183     /// ```
184     pub fn matches(&self, text: $text_ty) -> SetMatches {
185         let mut matches = vec![false; self.0.regex_strings().len()];
186         let any = self.read_matches_at(&mut matches, text, 0);
187         SetMatches {
188             matched_any: any,
189             matches: matches,
190         }
191     }
192 
193     /// Returns the same as matches, but starts the search at the given
194     /// offset and stores the matches into the slice given.
195     ///
196     /// The significance of the starting point is that it takes the surrounding
197     /// context into consideration. For example, the `\A` anchor can only
198     /// match when `start == 0`.
199     ///
200     /// `matches` must have a length that is at least the number of regexes
201     /// in this set.
202     ///
203     /// This method returns true if and only if at least one member of
204     /// `matches` is true after executing the set against `text`.
205     #[doc(hidden)]
206     pub fn read_matches_at(
207         &self,
208         matches: &mut [bool],
209         text: $text_ty,
210         start: usize,
211     ) -> bool {
212         self.0.searcher().many_matches_at(matches, $as_bytes(text), start)
213     }
214 
215     /// Returns the total number of regular expressions in this set.
216     pub fn len(&self) -> usize {
217         self.0.regex_strings().len()
218     }
219 
220     /// Returns the patterns that this set will match on.
221     ///
222     /// This function can be used to determine the pattern for a match. The
223     /// slice returned has exactly as many patterns givens to this regex set,
224     /// and the order of the slice is the same as the order of the patterns
225     /// provided to the set.
226     ///
227     /// # Example
228     ///
229     /// ```rust
230     /// # use regex::RegexSet;
231     /// let set = RegexSet::new(&[
232     ///     r"\w+",
233     ///     r"\d+",
234     ///     r"\pL+",
235     ///     r"foo",
236     ///     r"bar",
237     ///     r"barfoo",
238     ///     r"foobar",
239     /// ]).unwrap();
240     /// let matches: Vec<_> = set
241     ///     .matches("foobar")
242     ///     .into_iter()
243     ///     .map(|match_idx| &set.patterns()[match_idx])
244     ///     .collect();
245     /// assert_eq!(matches, vec![r"\w+", r"\pL+", r"foo", r"bar", r"foobar"]);
246     /// ```
247     pub fn patterns(&self) -> &[String] {
248         self.0.regex_strings()
249     }
250 }
251 
/// A set of matches returned by a regex set.
#[derive(Clone, Debug)]
pub struct SetMatches {
    // Result of the search that filled `matches`: `RegexSet::matches` stores
    // the boolean returned by `read_matches_at` here, which is documented as
    // true exactly when at least one entry of `matches` is true.
    matched_any: bool,
    // One flag per regex in the originating set, in the order the regexes
    // were given; `matches[i]` is true when regex `i` matched.
    matches: Vec<bool>,
}
258 
259 impl SetMatches {
260     /// Whether this set contains any matches.
261     pub fn matched_any(&self) -> bool {
262         self.matched_any
263     }
264 
265     /// Whether the regex at the given index matched.
266     ///
267     /// The index for a regex is determined by its insertion order upon the
268     /// initial construction of a `RegexSet`, starting at `0`.
269     ///
270     /// # Panics
271     ///
272     /// If `regex_index` is greater than or equal to `self.len()`.
273     pub fn matched(&self, regex_index: usize) -> bool {
274         self.matches[regex_index]
275     }
276 
277     /// The total number of regexes in the set that created these matches.
278     pub fn len(&self) -> usize {
279         self.matches.len()
280     }
281 
282     /// Returns an iterator over indexes in the regex that matched.
283     ///
284     /// This will always produces matches in ascending order of index, where
285     /// the index corresponds to the index of the regex that matched with
286     /// respect to its position when initially building the set.
287     pub fn iter(&self) -> SetMatchesIter {
288         SetMatchesIter((&*self.matches).into_iter().enumerate())
289     }
290 }
291 
292 impl IntoIterator for SetMatches {
293     type IntoIter = SetMatchesIntoIter;
294     type Item = usize;
295 
296     fn into_iter(self) -> Self::IntoIter {
297         SetMatchesIntoIter(self.matches.into_iter().enumerate())
298     }
299 }
300 
301 impl<'a> IntoIterator for &'a SetMatches {
302     type IntoIter = SetMatchesIter<'a>;
303     type Item = usize;
304 
305     fn into_iter(self) -> Self::IntoIter {
306         self.iter()
307     }
308 }
309 
/// An owned iterator over the set of matches from a regex set.
///
/// This will always produce matches in ascending order of index, where the
/// index corresponds to the index of the regex that matched with respect to
/// its position when initially building the set.
pub struct SetMatchesIntoIter(iter::Enumerate<vec::IntoIter<bool>>);

impl Iterator for SetMatchesIntoIter {
    type Item = usize;

    /// Skips over regexes that did not match and returns the index of the
    /// next one that did, or `None` once every remaining flag is consumed.
    fn next(&mut self) -> Option<usize> {
        self.0.find(|&(_, matched)| matched).map(|(index, _)| index)
    }

    /// Returns `(0, upper)`, where `upper` is the upper bound reported by
    /// the underlying flag iterator.
    fn size_hint(&self) -> (usize, Option<usize>) {
        // The underlying iterator counts every remaining flag, but only the
        // `true` ones are yielded. Any number of the remaining flags may be
        // `false`, so the only lower bound that can be promised is zero; the
        // upper bound is still valid.
        let (_, upper) = self.0.size_hint();
        (0, upper)
    }
}

impl DoubleEndedIterator for SetMatchesIntoIter {
    /// Same as `next`, but scans from the highest remaining index downwards.
    fn next_back(&mut self) -> Option<usize> {
        self.0
            .by_ref()
            .rev()
            .find(|&(_, matched)| matched)
            .map(|(index, _)| index)
    }
}
346 
/// A borrowed iterator over the set of matches from a regex set.
///
/// The lifetime `'a` refers to the lifetime of a `SetMatches` value.
///
/// This will always produce matches in ascending order of index, where the
/// index corresponds to the index of the regex that matched with respect to
/// its position when initially building the set.
#[derive(Clone)]
pub struct SetMatchesIter<'a>(iter::Enumerate<slice::Iter<'a, bool>>);

impl<'a> Iterator for SetMatchesIter<'a> {
    type Item = usize;

    /// Skips over regexes that did not match and returns the index of the
    /// next one that did, or `None` once every remaining flag is consumed.
    fn next(&mut self) -> Option<usize> {
        self.0.find(|&(_, &matched)| matched).map(|(index, _)| index)
    }

    /// Returns `(0, upper)`, where `upper` is the upper bound reported by
    /// the underlying flag iterator.
    fn size_hint(&self) -> (usize, Option<usize>) {
        // The underlying iterator counts every remaining flag, but only the
        // `true` ones are yielded. Any number of the remaining flags may be
        // `false`, so the only lower bound that can be promised is zero; the
        // upper bound is still valid.
        let (_, upper) = self.0.size_hint();
        (0, upper)
    }
}

impl<'a> DoubleEndedIterator for SetMatchesIter<'a> {
    /// Same as `next`, but scans from the highest remaining index downwards.
    fn next_back(&mut self) -> Option<usize> {
        self.0
            .by_ref()
            .rev()
            .find(|&(_, &matched)| matched)
            .map(|(index, _)| index)
    }
}
386 
387 #[doc(hidden)]
388 impl From<Exec> for RegexSet {
389     fn from(exec: Exec) -> Self {
390         RegexSet(exec)
391     }
392 }
393 
394 impl fmt::Debug for RegexSet {
395     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
396         write!(f, "RegexSet({:?})", self.0.regex_strings())
397     }
398 }
399 
/// Views a `&str` as its underlying bytes; passed as the `$as_bytes`
/// argument by the `&str` instantiation of `define_set!`.
// Both `as_bytes_*` helpers are emitted by every expansion of the macro, but
// each expansion names only one of them, so the other is legitimately unused.
#[allow(dead_code)]
fn as_bytes_str(text: &str) -> &[u8] {
    text.as_bytes()
}
/// Returns the given byte slice unchanged; passed as the `$as_bytes`
/// argument by the `&[u8]` instantiation of `define_set!`.
// Both `as_bytes_*` helpers are emitted by every expansion of the macro, but
// each expansion names only one of them, so the other is legitimately unused.
#[allow(dead_code)]
fn as_bytes_bytes(text: &[u8]) -> &[u8] {
    text
}
402         }
403     }
404 }
405 
// Generates `pub mod unicode`, whose `RegexSet` searches `&str` text.
define_set! {
    // Name of the generated module.
    unicode,
    // Submodule of `re_builder` that `RegexSetBuilder` is imported from.
    set_unicode,
    // Type of the `text` argument accepted by the search methods.
    &str,
    // Function used to convert the text to `&[u8]` before searching.
    as_bytes_str,
    // Doc example spliced into the documentation of the generated `RegexSet`.
/// ```rust
/// # use regex::RegexSet;
/// let set = RegexSet::new(&[
///     r"[a-z]+@[a-z]+\.(com|org|net)",
///     r"[a-z]+\.(com|org|net)",
/// ]).unwrap();
///
/// // Ask whether any regexes in the set match.
/// assert!(set.is_match("foo@example.com"));
///
/// // Identify which regexes in the set match.
/// let matches: Vec<_> = set.matches("foo@example.com").into_iter().collect();
/// assert_eq!(vec![0, 1], matches);
///
/// // Try again, but with text that only matches one of the regexes.
/// let matches: Vec<_> = set.matches("example.com").into_iter().collect();
/// assert_eq!(vec![1], matches);
///
/// // Try again, but with text that doesn't match any regex in the set.
/// let matches: Vec<_> = set.matches("example").into_iter().collect();
/// assert!(matches.is_empty());
/// ```
}
434 
// Generates `pub mod bytes`, whose `RegexSet` searches `&[u8]` text.
define_set! {
    // Name of the generated module.
    bytes,
    // Submodule of `re_builder` that `RegexSetBuilder` is imported from.
    set_bytes,
    // Type of the `text` argument accepted by the search methods.
    &[u8],
    // Function used to convert the text to `&[u8]` before searching.
    as_bytes_bytes,
    // Doc example spliced into the documentation of the generated `RegexSet`.
/// ```rust
/// # use regex::bytes::RegexSet;
/// let set = RegexSet::new(&[
///     r"[a-z]+@[a-z]+\.(com|org|net)",
///     r"[a-z]+\.(com|org|net)",
/// ]).unwrap();
///
/// // Ask whether any regexes in the set match.
/// assert!(set.is_match(b"foo@example.com"));
///
/// // Identify which regexes in the set match.
/// let matches: Vec<_> = set.matches(b"foo@example.com").into_iter().collect();
/// assert_eq!(vec![0, 1], matches);
///
/// // Try again, but with text that only matches one of the regexes.
/// let matches: Vec<_> = set.matches(b"example.com").into_iter().collect();
/// assert_eq!(vec![1], matches);
///
/// // Try again, but with text that doesn't match any regex in the set.
/// let matches: Vec<_> = set.matches(b"example").into_iter().collect();
/// assert!(matches.is_empty());
/// ```
}
463