1 macro_rules! define_set { 2 ($name:ident, $builder_mod:ident, $text_ty:ty, $as_bytes:expr, 3 $(#[$doc_regexset_example:meta])* ) => { 4 pub mod $name { 5 use std::fmt; 6 use std::iter; 7 use std::slice; 8 use std::vec; 9 10 use error::Error; 11 use exec::Exec; 12 use re_builder::$builder_mod::RegexSetBuilder; 13 use re_trait::RegularExpression; 14 15 /// Match multiple (possibly overlapping) regular expressions in a single scan. 16 /// 17 /// A regex set corresponds to the union of two or more regular expressions. 18 /// That is, a regex set will match text where at least one of its 19 /// constituent regular expressions matches. A regex set as its formulated here 20 /// provides a touch more power: it will also report *which* regular 21 /// expressions in the set match. Indeed, this is the key difference between 22 /// regex sets and a single `Regex` with many alternates, since only one 23 /// alternate can match at a time. 24 /// 25 /// For example, consider regular expressions to match email addresses and 26 /// domains: `[a-z]+@[a-z]+\.(com|org|net)` and `[a-z]+\.(com|org|net)`. If a 27 /// regex set is constructed from those regexes, then searching the text 28 /// `foo@example.com` will report both regexes as matching. Of course, one 29 /// could accomplish this by compiling each regex on its own and doing two 30 /// searches over the text. The key advantage of using a regex set is that it 31 /// will report the matching regexes using a *single pass through the text*. 32 /// If one has hundreds or thousands of regexes to match repeatedly (like a URL 33 /// router for a complex web application or a user agent matcher), then a regex 34 /// set can realize huge performance gains. 35 /// 36 /// # Example 37 /// 38 /// This shows how the above two regexes (for matching email addresses and 39 /// domains) might work: 40 /// 41 $(#[$doc_regexset_example])* 42 /// 43 /// Note that it would be possible to adapt the above example to using `Regex` 44 /// with an expression like: 45 /// 46 /// ```ignore 47 /// (?P<email>[a-z]+@(?P<email_domain>[a-z]+[.](com|org|net)))|(?P<domain>[a-z]+[.](com|org|net)) 48 /// ``` 49 /// 50 /// After a match, one could then inspect the capture groups to figure out 51 /// which alternates matched. The problem is that it is hard to make this 52 /// approach scale when there are many regexes since the overlap between each 53 /// alternate isn't always obvious to reason about. 54 /// 55 /// # Limitations 56 /// 57 /// Regex sets are limited to answering the following two questions: 58 /// 59 /// 1. Does any regex in the set match? 60 /// 2. If so, which regexes in the set match? 61 /// 62 /// As with the main `Regex` type, it is cheaper to ask (1) instead of (2) 63 /// since the matching engines can stop after the first match is found. 64 /// 65 /// Other features like finding the location of successive matches or their 66 /// sub-captures aren't supported. If you need this functionality, the 67 /// recommended approach is to compile each regex in the set independently and 68 /// selectively match them based on which regexes in the set matched. 69 /// 70 /// # Performance 71 /// 72 /// A `RegexSet` has the same performance characteristics as `Regex`. Namely, 73 /// search takes `O(mn)` time, where `m` is proportional to the size of the 74 /// regex set and `n` is proportional to the length of the search text. 75 #[derive(Clone)] 76 pub struct RegexSet(Exec); 77 78 impl RegexSet { 79 /// Create a new regex set with the given regular expressions. 80 /// 81 /// This takes an iterator of `S`, where `S` is something that can produce 82 /// a `&str`. If any of the strings in the iterator are not valid regular 83 /// expressions, then an error is returned. 84 /// 85 /// # Example 86 /// 87 /// Create a new regex set from an iterator of strings: 88 /// 89 /// ```rust 90 /// # use regex::RegexSet; 91 /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap(); 92 /// assert!(set.is_match("foo")); 93 /// ``` 94 pub fn new<I, S>(exprs: I) -> Result<RegexSet, Error> 95 where S: AsRef<str>, I: IntoIterator<Item=S> { 96 RegexSetBuilder::new(exprs).build() 97 } 98 99 /// Returns true if and only if one of the regexes in this set matches 100 /// the text given. 101 /// 102 /// This method should be preferred if you only need to test whether any 103 /// of the regexes in the set should match, but don't care about *which* 104 /// regexes matched. This is because the underlying matching engine will 105 /// quit immediately after seeing the first match instead of continuing to 106 /// find all matches. 107 /// 108 /// Note that as with searches using `Regex`, the expression is unanchored 109 /// by default. That is, if the regex does not start with `^` or `\A`, or 110 /// end with `$` or `\z`, then it is permitted to match anywhere in the 111 /// text. 112 /// 113 /// # Example 114 /// 115 /// Tests whether a set matches some text: 116 /// 117 /// ```rust 118 /// # use regex::RegexSet; 119 /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap(); 120 /// assert!(set.is_match("foo")); 121 /// assert!(!set.is_match("☃")); 122 /// ``` 123 pub fn is_match(&self, text: $text_ty) -> bool { 124 self.is_match_at(text, 0) 125 } 126 127 /// Returns the same as is_match, but starts the search at the given 128 /// offset. 129 /// 130 /// The significance of the starting point is that it takes the surrounding 131 /// context into consideration. For example, the `\A` anchor can only 132 /// match when `start == 0`. 133 #[doc(hidden)] 134 pub fn is_match_at(&self, text: $text_ty, start: usize) -> bool { 135 self.0.searcher().is_match_at($as_bytes(text), start) 136 } 137 138 /// Returns the set of regular expressions that match in the given text. 139 /// 140 /// The set returned contains the index of each regular expression that 141 /// matches in the given text. The index is in correspondence with the 142 /// order of regular expressions given to `RegexSet`'s constructor. 143 /// 144 /// The set can also be used to iterate over the matched indices. 145 /// 146 /// Note that as with searches using `Regex`, the expression is unanchored 147 /// by default. That is, if the regex does not start with `^` or `\A`, or 148 /// end with `$` or `\z`, then it is permitted to match anywhere in the 149 /// text. 150 /// 151 /// # Example 152 /// 153 /// Tests which regular expressions match the given text: 154 /// 155 /// ```rust 156 /// # use regex::RegexSet; 157 /// let set = RegexSet::new(&[ 158 /// r"\w+", 159 /// r"\d+", 160 /// r"\pL+", 161 /// r"foo", 162 /// r"bar", 163 /// r"barfoo", 164 /// r"foobar", 165 /// ]).unwrap(); 166 /// let matches: Vec<_> = set.matches("foobar").into_iter().collect(); 167 /// assert_eq!(matches, vec![0, 2, 3, 4, 6]); 168 /// 169 /// // You can also test whether a particular regex matched: 170 /// let matches = set.matches("foobar"); 171 /// assert!(!matches.matched(5)); 172 /// assert!(matches.matched(6)); 173 /// ``` 174 pub fn matches(&self, text: $text_ty) -> SetMatches { 175 let mut matches = vec![false; self.0.regex_strings().len()]; 176 let any = self.read_matches_at(&mut matches, text, 0); 177 SetMatches { 178 matched_any: any, 179 matches: matches, 180 } 181 } 182 183 /// Returns the same as matches, but starts the search at the given 184 /// offset and stores the matches into the slice given. 185 /// 186 /// The significance of the starting point is that it takes the surrounding 187 /// context into consideration. For example, the `\A` anchor can only 188 /// match when `start == 0`. 189 /// 190 /// `matches` must have a length that is at least the number of regexes 191 /// in this set. 192 /// 193 /// This method returns true if and only if at least one member of 194 /// `matches` is true after executing the set against `text`. 195 #[doc(hidden)] 196 pub fn read_matches_at( 197 &self, 198 matches: &mut [bool], 199 text: $text_ty, 200 start: usize, 201 ) -> bool { 202 self.0.searcher().many_matches_at(matches, $as_bytes(text), start) 203 } 204 205 /// Returns the total number of regular expressions in this set. 206 pub fn len(&self) -> usize { 207 self.0.regex_strings().len() 208 } 209 210 /// Returns the patterns that this set will match on. 211 /// 212 /// This function can be used to determine the pattern for a match. The 213 /// slice returned has exactly as many patterns givens to this regex set, 214 /// and the order of the slice is the same as the order of the patterns 215 /// provided to the set. 216 /// 217 /// # Example 218 /// 219 /// ```rust 220 /// # use regex::RegexSet; 221 /// let set = RegexSet::new(&[ 222 /// r"\w+", 223 /// r"\d+", 224 /// r"\pL+", 225 /// r"foo", 226 /// r"bar", 227 /// r"barfoo", 228 /// r"foobar", 229 /// ]).unwrap(); 230 /// let matches: Vec<_> = set 231 /// .matches("foobar") 232 /// .into_iter() 233 /// .map(|match_idx| &set.patterns()[match_idx]) 234 /// .collect(); 235 /// assert_eq!(matches, vec![r"\w+", r"\pL+", r"foo", r"bar", r"foobar"]); 236 /// ``` 237 pub fn patterns(&self) -> &[String] { 238 self.0.regex_strings() 239 } 240 } 241 242 /// A set of matches returned by a regex set. 243 #[derive(Clone, Debug)] 244 pub struct SetMatches { 245 matched_any: bool, 246 matches: Vec<bool>, 247 } 248 249 impl SetMatches { 250 /// Whether this set contains any matches. 251 pub fn matched_any(&self) -> bool { 252 self.matched_any 253 } 254 255 /// Whether the regex at the given index matched. 256 /// 257 /// The index for a regex is determined by its insertion order upon the 258 /// initial construction of a `RegexSet`, starting at `0`. 259 /// 260 /// # Panics 261 /// 262 /// If `regex_index` is greater than or equal to `self.len()`. 263 pub fn matched(&self, regex_index: usize) -> bool { 264 self.matches[regex_index] 265 } 266 267 /// The total number of regexes in the set that created these matches. 268 pub fn len(&self) -> usize { 269 self.matches.len() 270 } 271 272 /// Returns an iterator over indexes in the regex that matched. 273 /// 274 /// This will always produces matches in ascending order of index, where 275 /// the index corresponds to the index of the regex that matched with 276 /// respect to its position when initially building the set. 277 pub fn iter(&self) -> SetMatchesIter { 278 SetMatchesIter((&*self.matches).into_iter().enumerate()) 279 } 280 } 281 282 impl IntoIterator for SetMatches { 283 type IntoIter = SetMatchesIntoIter; 284 type Item = usize; 285 286 fn into_iter(self) -> Self::IntoIter { 287 SetMatchesIntoIter(self.matches.into_iter().enumerate()) 288 } 289 } 290 291 impl<'a> IntoIterator for &'a SetMatches { 292 type IntoIter = SetMatchesIter<'a>; 293 type Item = usize; 294 295 fn into_iter(self) -> Self::IntoIter { 296 self.iter() 297 } 298 } 299 300 /// An owned iterator over the set of matches from a regex set. 301 /// 302 /// This will always produces matches in ascending order of index, where the 303 /// index corresponds to the index of the regex that matched with respect to 304 /// its position when initially building the set. 305 pub struct SetMatchesIntoIter(iter::Enumerate<vec::IntoIter<bool>>); 306 307 impl Iterator for SetMatchesIntoIter { 308 type Item = usize; 309 310 fn next(&mut self) -> Option<usize> { 311 loop { 312 match self.0.next() { 313 None => return None, 314 Some((_, false)) => {} 315 Some((i, true)) => return Some(i), 316 } 317 } 318 } 319 320 fn size_hint(&self) -> (usize, Option<usize>) { 321 self.0.size_hint() 322 } 323 } 324 325 impl DoubleEndedIterator for SetMatchesIntoIter { 326 fn next_back(&mut self) -> Option<usize> { 327 loop { 328 match self.0.next_back() { 329 None => return None, 330 Some((_, false)) => {} 331 Some((i, true)) => return Some(i), 332 } 333 } 334 } 335 } 336 337 /// A borrowed iterator over the set of matches from a regex set. 338 /// 339 /// The lifetime `'a` refers to the lifetime of a `SetMatches` value. 340 /// 341 /// This will always produces matches in ascending order of index, where the 342 /// index corresponds to the index of the regex that matched with respect to 343 /// its position when initially building the set. 344 #[derive(Clone)] 345 pub struct SetMatchesIter<'a>(iter::Enumerate<slice::Iter<'a, bool>>); 346 347 impl<'a> Iterator for SetMatchesIter<'a> { 348 type Item = usize; 349 350 fn next(&mut self) -> Option<usize> { 351 loop { 352 match self.0.next() { 353 None => return None, 354 Some((_, &false)) => {} 355 Some((i, &true)) => return Some(i), 356 } 357 } 358 } 359 360 fn size_hint(&self) -> (usize, Option<usize>) { 361 self.0.size_hint() 362 } 363 } 364 365 impl<'a> DoubleEndedIterator for SetMatchesIter<'a> { 366 fn next_back(&mut self) -> Option<usize> { 367 loop { 368 match self.0.next_back() { 369 None => return None, 370 Some((_, &false)) => {} 371 Some((i, &true)) => return Some(i), 372 } 373 } 374 } 375 } 376 377 #[doc(hidden)] 378 impl From<Exec> for RegexSet { 379 fn from(exec: Exec) -> Self { 380 RegexSet(exec) 381 } 382 } 383 384 impl fmt::Debug for RegexSet { 385 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 386 write!(f, "RegexSet({:?})", self.0.regex_strings()) 387 } 388 } 389 390 #[allow(dead_code)] fn as_bytes_str(text: &str) -> &[u8] { text.as_bytes() } 391 #[allow(dead_code)] fn as_bytes_bytes(text: &[u8]) -> &[u8] { text } 392 } 393 } 394 } 395 396 define_set! { 397 unicode, 398 set_unicode, 399 &str, 400 as_bytes_str, 401 /// ```rust 402 /// # use regex::RegexSet; 403 /// let set = RegexSet::new(&[ 404 /// r"[a-z]+@[a-z]+\.(com|org|net)", 405 /// r"[a-z]+\.(com|org|net)", 406 /// ]).unwrap(); 407 /// 408 /// // Ask whether any regexes in the set match. 409 /// assert!(set.is_match("foo@example.com")); 410 /// 411 /// // Identify which regexes in the set match. 412 /// let matches: Vec<_> = set.matches("foo@example.com").into_iter().collect(); 413 /// assert_eq!(vec![0, 1], matches); 414 /// 415 /// // Try again, but with text that only matches one of the regexes. 416 /// let matches: Vec<_> = set.matches("example.com").into_iter().collect(); 417 /// assert_eq!(vec![1], matches); 418 /// 419 /// // Try again, but with text that doesn't match any regex in the set. 420 /// let matches: Vec<_> = set.matches("example").into_iter().collect(); 421 /// assert!(matches.is_empty()); 422 /// ``` 423 } 424 425 define_set! { 426 bytes, 427 set_bytes, 428 &[u8], 429 as_bytes_bytes, 430 /// ```rust 431 /// # use regex::bytes::RegexSet; 432 /// let set = RegexSet::new(&[ 433 /// r"[a-z]+@[a-z]+\.(com|org|net)", 434 /// r"[a-z]+\.(com|org|net)", 435 /// ]).unwrap(); 436 /// 437 /// // Ask whether any regexes in the set match. 438 /// assert!(set.is_match(b"foo@example.com")); 439 /// 440 /// // Identify which regexes in the set match. 441 /// let matches: Vec<_> = set.matches(b"foo@example.com").into_iter().collect(); 442 /// assert_eq!(vec![0, 1], matches); 443 /// 444 /// // Try again, but with text that only matches one of the regexes. 445 /// let matches: Vec<_> = set.matches(b"example.com").into_iter().collect(); 446 /// assert_eq!(vec![1], matches); 447 /// 448 /// // Try again, but with text that doesn't match any regex in the set. 449 /// let matches: Vec<_> = set.matches(b"example").into_iter().collect(); 450 /// assert!(matches.is_empty()); 451 /// ``` 452 } 453