1 macro_rules! define_set { 2 ($name:ident, $builder_mod:ident, $text_ty:ty, $as_bytes:expr, 3 $(#[$doc_regexset_example:meta])* ) => { 4 pub mod $name { 5 use std::fmt; 6 use std::iter; 7 use std::slice; 8 use std::vec; 9 10 use crate::error::Error; 11 use crate::exec::Exec; 12 use crate::re_builder::$builder_mod::RegexSetBuilder; 13 use crate::re_trait::RegularExpression; 14 15 /// Match multiple (possibly overlapping) regular expressions in a single scan. 16 /// 17 /// A regex set corresponds to the union of two or more regular expressions. 18 /// That is, a regex set will match text where at least one of its 19 /// constituent regular expressions matches. A regex set as its formulated here 20 /// provides a touch more power: it will also report *which* regular 21 /// expressions in the set match. Indeed, this is the key difference between 22 /// regex sets and a single `Regex` with many alternates, since only one 23 /// alternate can match at a time. 24 /// 25 /// For example, consider regular expressions to match email addresses and 26 /// domains: `[a-z]+@[a-z]+\.(com|org|net)` and `[a-z]+\.(com|org|net)`. If a 27 /// regex set is constructed from those regexes, then searching the text 28 /// `foo@example.com` will report both regexes as matching. Of course, one 29 /// could accomplish this by compiling each regex on its own and doing two 30 /// searches over the text. The key advantage of using a regex set is that it 31 /// will report the matching regexes using a *single pass through the text*. 32 /// If one has hundreds or thousands of regexes to match repeatedly (like a URL 33 /// router for a complex web application or a user agent matcher), then a regex 34 /// set can realize huge performance gains. 35 /// 36 /// # Example 37 /// 38 /// This shows how the above two regexes (for matching email addresses and 39 /// domains) might work: 40 /// 41 $(#[$doc_regexset_example])* 42 /// 43 /// Note that it would be possible to adapt the above example to using `Regex` 44 /// with an expression like: 45 /// 46 /// ```text 47 /// (?P<email>[a-z]+@(?P<email_domain>[a-z]+[.](com|org|net)))|(?P<domain>[a-z]+[.](com|org|net)) 48 /// ``` 49 /// 50 /// After a match, one could then inspect the capture groups to figure out 51 /// which alternates matched. The problem is that it is hard to make this 52 /// approach scale when there are many regexes since the overlap between each 53 /// alternate isn't always obvious to reason about. 54 /// 55 /// # Limitations 56 /// 57 /// Regex sets are limited to answering the following two questions: 58 /// 59 /// 1. Does any regex in the set match? 60 /// 2. If so, which regexes in the set match? 61 /// 62 /// As with the main `Regex` type, it is cheaper to ask (1) instead of (2) 63 /// since the matching engines can stop after the first match is found. 64 /// 65 /// Other features like finding the location of successive matches or their 66 /// sub-captures aren't supported. If you need this functionality, the 67 /// recommended approach is to compile each regex in the set independently and 68 /// selectively match them based on which regexes in the set matched. 69 /// 70 /// # Performance 71 /// 72 /// A `RegexSet` has the same performance characteristics as `Regex`. Namely, 73 /// search takes `O(mn)` time, where `m` is proportional to the size of the 74 /// regex set and `n` is proportional to the length of the search text. 75 #[derive(Clone)] 76 pub struct RegexSet(Exec); 77 78 impl RegexSet { 79 /// Create a new regex set with the given regular expressions. 80 /// 81 /// This takes an iterator of `S`, where `S` is something that can produce 82 /// a `&str`. If any of the strings in the iterator are not valid regular 83 /// expressions, then an error is returned. 84 /// 85 /// # Example 86 /// 87 /// Create a new regex set from an iterator of strings: 88 /// 89 /// ```rust 90 /// # use regex::RegexSet; 91 /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap(); 92 /// assert!(set.is_match("foo")); 93 /// ``` 94 pub fn new<I, S>(exprs: I) -> Result<RegexSet, Error> 95 where S: AsRef<str>, I: IntoIterator<Item=S> { 96 RegexSetBuilder::new(exprs).build() 97 } 98 99 /// Create a new empty regex set. 100 /// 101 /// # Example 102 /// 103 /// ```rust 104 /// # use regex::RegexSet; 105 /// let set = RegexSet::empty(); 106 /// assert!(set.is_empty()); 107 /// ``` 108 pub fn empty() -> RegexSet { 109 RegexSetBuilder::new(&[""; 0]).build().unwrap() 110 } 111 112 /// Returns true if and only if one of the regexes in this set matches 113 /// the text given. 114 /// 115 /// This method should be preferred if you only need to test whether any 116 /// of the regexes in the set should match, but don't care about *which* 117 /// regexes matched. This is because the underlying matching engine will 118 /// quit immediately after seeing the first match instead of continuing to 119 /// find all matches. 120 /// 121 /// Note that as with searches using `Regex`, the expression is unanchored 122 /// by default. That is, if the regex does not start with `^` or `\A`, or 123 /// end with `$` or `\z`, then it is permitted to match anywhere in the 124 /// text. 125 /// 126 /// # Example 127 /// 128 /// Tests whether a set matches some text: 129 /// 130 /// ```rust 131 /// # use regex::RegexSet; 132 /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap(); 133 /// assert!(set.is_match("foo")); 134 /// assert!(!set.is_match("☃")); 135 /// ``` 136 pub fn is_match(&self, text: $text_ty) -> bool { 137 self.is_match_at(text, 0) 138 } 139 140 /// Returns the same as is_match, but starts the search at the given 141 /// offset. 142 /// 143 /// The significance of the starting point is that it takes the surrounding 144 /// context into consideration. For example, the `\A` anchor can only 145 /// match when `start == 0`. 146 #[doc(hidden)] 147 pub fn is_match_at(&self, text: $text_ty, start: usize) -> bool { 148 self.0.searcher().is_match_at($as_bytes(text), start) 149 } 150 151 /// Returns the set of regular expressions that match in the given text. 152 /// 153 /// The set returned contains the index of each regular expression that 154 /// matches in the given text. The index is in correspondence with the 155 /// order of regular expressions given to `RegexSet`'s constructor. 156 /// 157 /// The set can also be used to iterate over the matched indices. 158 /// 159 /// Note that as with searches using `Regex`, the expression is unanchored 160 /// by default. That is, if the regex does not start with `^` or `\A`, or 161 /// end with `$` or `\z`, then it is permitted to match anywhere in the 162 /// text. 163 /// 164 /// # Example 165 /// 166 /// Tests which regular expressions match the given text: 167 /// 168 /// ```rust 169 /// # use regex::RegexSet; 170 /// let set = RegexSet::new(&[ 171 /// r"\w+", 172 /// r"\d+", 173 /// r"\pL+", 174 /// r"foo", 175 /// r"bar", 176 /// r"barfoo", 177 /// r"foobar", 178 /// ]).unwrap(); 179 /// let matches: Vec<_> = set.matches("foobar").into_iter().collect(); 180 /// assert_eq!(matches, vec![0, 2, 3, 4, 6]); 181 /// 182 /// // You can also test whether a particular regex matched: 183 /// let matches = set.matches("foobar"); 184 /// assert!(!matches.matched(5)); 185 /// assert!(matches.matched(6)); 186 /// ``` 187 pub fn matches(&self, text: $text_ty) -> SetMatches { 188 let mut matches = vec![false; self.0.regex_strings().len()]; 189 let any = self.read_matches_at(&mut matches, text, 0); 190 SetMatches { 191 matched_any: any, 192 matches: matches, 193 } 194 } 195 196 /// Returns the same as matches, but starts the search at the given 197 /// offset and stores the matches into the slice given. 198 /// 199 /// The significance of the starting point is that it takes the surrounding 200 /// context into consideration. For example, the `\A` anchor can only 201 /// match when `start == 0`. 202 /// 203 /// `matches` must have a length that is at least the number of regexes 204 /// in this set. 205 /// 206 /// This method returns true if and only if at least one member of 207 /// `matches` is true after executing the set against `text`. 208 #[doc(hidden)] 209 pub fn read_matches_at( 210 &self, 211 matches: &mut [bool], 212 text: $text_ty, 213 start: usize, 214 ) -> bool { 215 self.0.searcher().many_matches_at(matches, $as_bytes(text), start) 216 } 217 218 /// Returns the total number of regular expressions in this set. 219 pub fn len(&self) -> usize { 220 self.0.regex_strings().len() 221 } 222 223 /// Returns `true` if this set contains no regular expressions. 224 pub fn is_empty(&self) -> bool { 225 self.0.regex_strings().is_empty() 226 } 227 228 /// Returns the patterns that this set will match on. 229 /// 230 /// This function can be used to determine the pattern for a match. The 231 /// slice returned has exactly as many patterns givens to this regex set, 232 /// and the order of the slice is the same as the order of the patterns 233 /// provided to the set. 234 /// 235 /// # Example 236 /// 237 /// ```rust 238 /// # use regex::RegexSet; 239 /// let set = RegexSet::new(&[ 240 /// r"\w+", 241 /// r"\d+", 242 /// r"\pL+", 243 /// r"foo", 244 /// r"bar", 245 /// r"barfoo", 246 /// r"foobar", 247 /// ]).unwrap(); 248 /// let matches: Vec<_> = set 249 /// .matches("foobar") 250 /// .into_iter() 251 /// .map(|match_idx| &set.patterns()[match_idx]) 252 /// .collect(); 253 /// assert_eq!(matches, vec![r"\w+", r"\pL+", r"foo", r"bar", r"foobar"]); 254 /// ``` 255 pub fn patterns(&self) -> &[String] { 256 self.0.regex_strings() 257 } 258 } 259 260 /// A set of matches returned by a regex set. 261 #[derive(Clone, Debug)] 262 pub struct SetMatches { 263 matched_any: bool, 264 matches: Vec<bool>, 265 } 266 267 impl SetMatches { 268 /// Whether this set contains any matches. 269 pub fn matched_any(&self) -> bool { 270 self.matched_any 271 } 272 273 /// Whether the regex at the given index matched. 274 /// 275 /// The index for a regex is determined by its insertion order upon the 276 /// initial construction of a `RegexSet`, starting at `0`. 277 /// 278 /// # Panics 279 /// 280 /// If `regex_index` is greater than or equal to `self.len()`. 281 pub fn matched(&self, regex_index: usize) -> bool { 282 self.matches[regex_index] 283 } 284 285 /// The total number of regexes in the set that created these matches. 286 pub fn len(&self) -> usize { 287 self.matches.len() 288 } 289 290 /// Returns an iterator over indexes in the regex that matched. 291 /// 292 /// This will always produces matches in ascending order of index, where 293 /// the index corresponds to the index of the regex that matched with 294 /// respect to its position when initially building the set. 295 pub fn iter(&self) -> SetMatchesIter<'_> { 296 SetMatchesIter((&*self.matches).into_iter().enumerate()) 297 } 298 } 299 300 impl IntoIterator for SetMatches { 301 type IntoIter = SetMatchesIntoIter; 302 type Item = usize; 303 304 fn into_iter(self) -> Self::IntoIter { 305 SetMatchesIntoIter(self.matches.into_iter().enumerate()) 306 } 307 } 308 309 impl<'a> IntoIterator for &'a SetMatches { 310 type IntoIter = SetMatchesIter<'a>; 311 type Item = usize; 312 313 fn into_iter(self) -> Self::IntoIter { 314 self.iter() 315 } 316 } 317 318 /// An owned iterator over the set of matches from a regex set. 319 /// 320 /// This will always produces matches in ascending order of index, where the 321 /// index corresponds to the index of the regex that matched with respect to 322 /// its position when initially building the set. 323 #[derive(Debug)] 324 pub struct SetMatchesIntoIter(iter::Enumerate<vec::IntoIter<bool>>); 325 326 impl Iterator for SetMatchesIntoIter { 327 type Item = usize; 328 329 fn next(&mut self) -> Option<usize> { 330 loop { 331 match self.0.next() { 332 None => return None, 333 Some((_, false)) => {} 334 Some((i, true)) => return Some(i), 335 } 336 } 337 } 338 339 fn size_hint(&self) -> (usize, Option<usize>) { 340 self.0.size_hint() 341 } 342 } 343 344 impl DoubleEndedIterator for SetMatchesIntoIter { 345 fn next_back(&mut self) -> Option<usize> { 346 loop { 347 match self.0.next_back() { 348 None => return None, 349 Some((_, false)) => {} 350 Some((i, true)) => return Some(i), 351 } 352 } 353 } 354 } 355 356 impl iter::FusedIterator for SetMatchesIntoIter {} 357 358 /// A borrowed iterator over the set of matches from a regex set. 359 /// 360 /// The lifetime `'a` refers to the lifetime of a `SetMatches` value. 361 /// 362 /// This will always produces matches in ascending order of index, where the 363 /// index corresponds to the index of the regex that matched with respect to 364 /// its position when initially building the set. 365 #[derive(Clone, Debug)] 366 pub struct SetMatchesIter<'a>(iter::Enumerate<slice::Iter<'a, bool>>); 367 368 impl<'a> Iterator for SetMatchesIter<'a> { 369 type Item = usize; 370 371 fn next(&mut self) -> Option<usize> { 372 loop { 373 match self.0.next() { 374 None => return None, 375 Some((_, &false)) => {} 376 Some((i, &true)) => return Some(i), 377 } 378 } 379 } 380 381 fn size_hint(&self) -> (usize, Option<usize>) { 382 self.0.size_hint() 383 } 384 } 385 386 impl<'a> DoubleEndedIterator for SetMatchesIter<'a> { 387 fn next_back(&mut self) -> Option<usize> { 388 loop { 389 match self.0.next_back() { 390 None => return None, 391 Some((_, &false)) => {} 392 Some((i, &true)) => return Some(i), 393 } 394 } 395 } 396 } 397 398 impl<'a> iter::FusedIterator for SetMatchesIter<'a> {} 399 400 #[doc(hidden)] 401 impl From<Exec> for RegexSet { 402 fn from(exec: Exec) -> Self { 403 RegexSet(exec) 404 } 405 } 406 407 impl fmt::Debug for RegexSet { 408 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 409 write!(f, "RegexSet({:?})", self.0.regex_strings()) 410 } 411 } 412 413 #[allow(dead_code)] fn as_bytes_str(text: &str) -> &[u8] { text.as_bytes() } 414 #[allow(dead_code)] fn as_bytes_bytes(text: &[u8]) -> &[u8] { text } 415 } 416 } 417 } 418 419 define_set! { 420 unicode, 421 set_unicode, 422 &str, 423 as_bytes_str, 424 /// ```rust 425 /// # use regex::RegexSet; 426 /// let set = RegexSet::new(&[ 427 /// r"[a-z]+@[a-z]+\.(com|org|net)", 428 /// r"[a-z]+\.(com|org|net)", 429 /// ]).unwrap(); 430 /// 431 /// // Ask whether any regexes in the set match. 432 /// assert!(set.is_match("foo@example.com")); 433 /// 434 /// // Identify which regexes in the set match. 435 /// let matches: Vec<_> = set.matches("foo@example.com").into_iter().collect(); 436 /// assert_eq!(vec![0, 1], matches); 437 /// 438 /// // Try again, but with text that only matches one of the regexes. 439 /// let matches: Vec<_> = set.matches("example.com").into_iter().collect(); 440 /// assert_eq!(vec![1], matches); 441 /// 442 /// // Try again, but with text that doesn't match any regex in the set. 443 /// let matches: Vec<_> = set.matches("example").into_iter().collect(); 444 /// assert!(matches.is_empty()); 445 /// ``` 446 } 447 448 define_set! { 449 bytes, 450 set_bytes, 451 &[u8], 452 as_bytes_bytes, 453 /// ```rust 454 /// # use regex::bytes::RegexSet; 455 /// let set = RegexSet::new(&[ 456 /// r"[a-z]+@[a-z]+\.(com|org|net)", 457 /// r"[a-z]+\.(com|org|net)", 458 /// ]).unwrap(); 459 /// 460 /// // Ask whether any regexes in the set match. 461 /// assert!(set.is_match(b"foo@example.com")); 462 /// 463 /// // Identify which regexes in the set match. 464 /// let matches: Vec<_> = set.matches(b"foo@example.com").into_iter().collect(); 465 /// assert_eq!(vec![0, 1], matches); 466 /// 467 /// // Try again, but with text that only matches one of the regexes. 468 /// let matches: Vec<_> = set.matches(b"example.com").into_iter().collect(); 469 /// assert_eq!(vec![1], matches); 470 /// 471 /// // Try again, but with text that doesn't match any regex in the set. 472 /// let matches: Vec<_> = set.matches(b"example").into_iter().collect(); 473 /// assert!(matches.is_empty()); 474 /// ``` 475 } 476