1 // Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT 2 // file at the top-level directory of this distribution and at 3 // http://rust-lang.org/COPYRIGHT. 4 // 5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or 6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license 7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your 8 // option. This file may not be copied, modified, or distributed 9 // except according to those terms. 10 11 macro_rules! define_set { 12 ($name:ident, $builder_mod:ident, $text_ty:ty, $as_bytes:expr, 13 $(#[$doc_regexset_example:meta])* ) => { 14 pub mod $name { 15 use std::fmt; 16 use std::iter; 17 use std::slice; 18 use std::vec; 19 20 use error::Error; 21 use exec::Exec; 22 use re_builder::$builder_mod::RegexSetBuilder; 23 use re_trait::RegularExpression; 24 25 /// Match multiple (possibly overlapping) regular expressions in a single scan. 26 /// 27 /// A regex set corresponds to the union of two or more regular expressions. 28 /// That is, a regex set will match text where at least one of its 29 /// constituent regular expressions matches. A regex set as its formulated here 30 /// provides a touch more power: it will also report *which* regular 31 /// expressions in the set match. Indeed, this is the key difference between 32 /// regex sets and a single `Regex` with many alternates, since only one 33 /// alternate can match at a time. 34 /// 35 /// For example, consider regular expressions to match email addresses and 36 /// domains: `[a-z]+@[a-z]+\.(com|org|net)` and `[a-z]+\.(com|org|net)`. If a 37 /// regex set is constructed from those regexes, then searching the text 38 /// `foo@example.com` will report both regexes as matching. Of course, one 39 /// could accomplish this by compiling each regex on its own and doing two 40 /// searches over the text. The key advantage of using a regex set is that it 41 /// will report the matching regexes using a *single pass through the text*. 42 /// If one has hundreds or thousands of regexes to match repeatedly (like a URL 43 /// router for a complex web application or a user agent matcher), then a regex 44 /// set can realize huge performance gains. 45 /// 46 /// # Example 47 /// 48 /// This shows how the above two regexes (for matching email addresses and 49 /// domains) might work: 50 /// 51 $(#[$doc_regexset_example])* 52 /// 53 /// Note that it would be possible to adapt the above example to using `Regex` 54 /// with an expression like: 55 /// 56 /// ```ignore 57 /// (?P<email>[a-z]+@(?P<email_domain>[a-z]+[.](com|org|net)))|(?P<domain>[a-z]+[.](com|org|net)) 58 /// ``` 59 /// 60 /// After a match, one could then inspect the capture groups to figure out 61 /// which alternates matched. The problem is that it is hard to make this 62 /// approach scale when there are many regexes since the overlap between each 63 /// alternate isn't always obvious to reason about. 64 /// 65 /// # Limitations 66 /// 67 /// Regex sets are limited to answering the following two questions: 68 /// 69 /// 1. Does any regex in the set match? 70 /// 2. If so, which regexes in the set match? 71 /// 72 /// As with the main `Regex` type, it is cheaper to ask (1) instead of (2) 73 /// since the matching engines can stop after the first match is found. 74 /// 75 /// Other features like finding the location of successive matches or their 76 /// sub-captures aren't supported. If you need this functionality, the 77 /// recommended approach is to compile each regex in the set independently and 78 /// selectively match them based on which regexes in the set matched. 79 /// 80 /// # Performance 81 /// 82 /// A `RegexSet` has the same performance characteristics as `Regex`. Namely, 83 /// search takes `O(mn)` time, where `m` is proportional to the size of the 84 /// regex set and `n` is proportional to the length of the search text. 85 #[derive(Clone)] 86 pub struct RegexSet(Exec); 87 88 impl RegexSet { 89 /// Create a new regex set with the given regular expressions. 90 /// 91 /// This takes an iterator of `S`, where `S` is something that can produce 92 /// a `&str`. If any of the strings in the iterator are not valid regular 93 /// expressions, then an error is returned. 94 /// 95 /// # Example 96 /// 97 /// Create a new regex set from an iterator of strings: 98 /// 99 /// ```rust 100 /// # use regex::RegexSet; 101 /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap(); 102 /// assert!(set.is_match("foo")); 103 /// ``` 104 pub fn new<I, S>(exprs: I) -> Result<RegexSet, Error> 105 where S: AsRef<str>, I: IntoIterator<Item=S> { 106 RegexSetBuilder::new(exprs).build() 107 } 108 109 /// Returns true if and only if one of the regexes in this set matches 110 /// the text given. 111 /// 112 /// This method should be preferred if you only need to test whether any 113 /// of the regexes in the set should match, but don't care about *which* 114 /// regexes matched. This is because the underlying matching engine will 115 /// quit immediately after seeing the first match instead of continuing to 116 /// find all matches. 117 /// 118 /// Note that as with searches using `Regex`, the expression is unanchored 119 /// by default. That is, if the regex does not start with `^` or `\A`, or 120 /// end with `$` or `\z`, then it is permitted to match anywhere in the 121 /// text. 122 /// 123 /// # Example 124 /// 125 /// Tests whether a set matches some text: 126 /// 127 /// ```rust 128 /// # use regex::RegexSet; 129 /// let set = RegexSet::new(&[r"\w+", r"\d+"]).unwrap(); 130 /// assert!(set.is_match("foo")); 131 /// assert!(!set.is_match("☃")); 132 /// ``` 133 pub fn is_match(&self, text: $text_ty) -> bool { 134 self.is_match_at(text, 0) 135 } 136 137 /// Returns the same as is_match, but starts the search at the given 138 /// offset. 139 /// 140 /// The significance of the starting point is that it takes the surrounding 141 /// context into consideration. For example, the `\A` anchor can only 142 /// match when `start == 0`. 143 #[doc(hidden)] 144 pub fn is_match_at(&self, text: $text_ty, start: usize) -> bool { 145 self.0.searcher().is_match_at($as_bytes(text), start) 146 } 147 148 /// Returns the set of regular expressions that match in the given text. 149 /// 150 /// The set returned contains the index of each regular expression that 151 /// matches in the given text. The index is in correspondence with the 152 /// order of regular expressions given to `RegexSet`'s constructor. 153 /// 154 /// The set can also be used to iterate over the matched indices. 155 /// 156 /// Note that as with searches using `Regex`, the expression is unanchored 157 /// by default. That is, if the regex does not start with `^` or `\A`, or 158 /// end with `$` or `\z`, then it is permitted to match anywhere in the 159 /// text. 160 /// 161 /// # Example 162 /// 163 /// Tests which regular expressions match the given text: 164 /// 165 /// ```rust 166 /// # use regex::RegexSet; 167 /// let set = RegexSet::new(&[ 168 /// r"\w+", 169 /// r"\d+", 170 /// r"\pL+", 171 /// r"foo", 172 /// r"bar", 173 /// r"barfoo", 174 /// r"foobar", 175 /// ]).unwrap(); 176 /// let matches: Vec<_> = set.matches("foobar").into_iter().collect(); 177 /// assert_eq!(matches, vec![0, 2, 3, 4, 6]); 178 /// 179 /// // You can also test whether a particular regex matched: 180 /// let matches = set.matches("foobar"); 181 /// assert!(!matches.matched(5)); 182 /// assert!(matches.matched(6)); 183 /// ``` 184 pub fn matches(&self, text: $text_ty) -> SetMatches { 185 let mut matches = vec![false; self.0.regex_strings().len()]; 186 let any = self.read_matches_at(&mut matches, text, 0); 187 SetMatches { 188 matched_any: any, 189 matches: matches, 190 } 191 } 192 193 /// Returns the same as matches, but starts the search at the given 194 /// offset and stores the matches into the slice given. 195 /// 196 /// The significance of the starting point is that it takes the surrounding 197 /// context into consideration. For example, the `\A` anchor can only 198 /// match when `start == 0`. 199 /// 200 /// `matches` must have a length that is at least the number of regexes 201 /// in this set. 202 /// 203 /// This method returns true if and only if at least one member of 204 /// `matches` is true after executing the set against `text`. 205 #[doc(hidden)] 206 pub fn read_matches_at( 207 &self, 208 matches: &mut [bool], 209 text: $text_ty, 210 start: usize, 211 ) -> bool { 212 self.0.searcher().many_matches_at(matches, $as_bytes(text), start) 213 } 214 215 /// Returns the total number of regular expressions in this set. 216 pub fn len(&self) -> usize { 217 self.0.regex_strings().len() 218 } 219 220 /// Returns the patterns that this set will match on. 221 /// 222 /// This function can be used to determine the pattern for a match. The 223 /// slice returned has exactly as many patterns givens to this regex set, 224 /// and the order of the slice is the same as the order of the patterns 225 /// provided to the set. 226 /// 227 /// # Example 228 /// 229 /// ```rust 230 /// # use regex::RegexSet; 231 /// let set = RegexSet::new(&[ 232 /// r"\w+", 233 /// r"\d+", 234 /// r"\pL+", 235 /// r"foo", 236 /// r"bar", 237 /// r"barfoo", 238 /// r"foobar", 239 /// ]).unwrap(); 240 /// let matches: Vec<_> = set 241 /// .matches("foobar") 242 /// .into_iter() 243 /// .map(|match_idx| &set.patterns()[match_idx]) 244 /// .collect(); 245 /// assert_eq!(matches, vec![r"\w+", r"\pL+", r"foo", r"bar", r"foobar"]); 246 /// ``` 247 pub fn patterns(&self) -> &[String] { 248 self.0.regex_strings() 249 } 250 } 251 252 /// A set of matches returned by a regex set. 253 #[derive(Clone, Debug)] 254 pub struct SetMatches { 255 matched_any: bool, 256 matches: Vec<bool>, 257 } 258 259 impl SetMatches { 260 /// Whether this set contains any matches. 261 pub fn matched_any(&self) -> bool { 262 self.matched_any 263 } 264 265 /// Whether the regex at the given index matched. 266 /// 267 /// The index for a regex is determined by its insertion order upon the 268 /// initial construction of a `RegexSet`, starting at `0`. 269 /// 270 /// # Panics 271 /// 272 /// If `regex_index` is greater than or equal to `self.len()`. 273 pub fn matched(&self, regex_index: usize) -> bool { 274 self.matches[regex_index] 275 } 276 277 /// The total number of regexes in the set that created these matches. 278 pub fn len(&self) -> usize { 279 self.matches.len() 280 } 281 282 /// Returns an iterator over indexes in the regex that matched. 283 /// 284 /// This will always produces matches in ascending order of index, where 285 /// the index corresponds to the index of the regex that matched with 286 /// respect to its position when initially building the set. 287 pub fn iter(&self) -> SetMatchesIter { 288 SetMatchesIter((&*self.matches).into_iter().enumerate()) 289 } 290 } 291 292 impl IntoIterator for SetMatches { 293 type IntoIter = SetMatchesIntoIter; 294 type Item = usize; 295 296 fn into_iter(self) -> Self::IntoIter { 297 SetMatchesIntoIter(self.matches.into_iter().enumerate()) 298 } 299 } 300 301 impl<'a> IntoIterator for &'a SetMatches { 302 type IntoIter = SetMatchesIter<'a>; 303 type Item = usize; 304 305 fn into_iter(self) -> Self::IntoIter { 306 self.iter() 307 } 308 } 309 310 /// An owned iterator over the set of matches from a regex set. 311 /// 312 /// This will always produces matches in ascending order of index, where the 313 /// index corresponds to the index of the regex that matched with respect to 314 /// its position when initially building the set. 315 pub struct SetMatchesIntoIter(iter::Enumerate<vec::IntoIter<bool>>); 316 317 impl Iterator for SetMatchesIntoIter { 318 type Item = usize; 319 320 fn next(&mut self) -> Option<usize> { 321 loop { 322 match self.0.next() { 323 None => return None, 324 Some((_, false)) => {} 325 Some((i, true)) => return Some(i), 326 } 327 } 328 } 329 330 fn size_hint(&self) -> (usize, Option<usize>) { 331 self.0.size_hint() 332 } 333 } 334 335 impl DoubleEndedIterator for SetMatchesIntoIter { 336 fn next_back(&mut self) -> Option<usize> { 337 loop { 338 match self.0.next_back() { 339 None => return None, 340 Some((_, false)) => {} 341 Some((i, true)) => return Some(i), 342 } 343 } 344 } 345 } 346 347 /// A borrowed iterator over the set of matches from a regex set. 348 /// 349 /// The lifetime `'a` refers to the lifetime of a `SetMatches` value. 350 /// 351 /// This will always produces matches in ascending order of index, where the 352 /// index corresponds to the index of the regex that matched with respect to 353 /// its position when initially building the set. 354 #[derive(Clone)] 355 pub struct SetMatchesIter<'a>(iter::Enumerate<slice::Iter<'a, bool>>); 356 357 impl<'a> Iterator for SetMatchesIter<'a> { 358 type Item = usize; 359 360 fn next(&mut self) -> Option<usize> { 361 loop { 362 match self.0.next() { 363 None => return None, 364 Some((_, &false)) => {} 365 Some((i, &true)) => return Some(i), 366 } 367 } 368 } 369 370 fn size_hint(&self) -> (usize, Option<usize>) { 371 self.0.size_hint() 372 } 373 } 374 375 impl<'a> DoubleEndedIterator for SetMatchesIter<'a> { 376 fn next_back(&mut self) -> Option<usize> { 377 loop { 378 match self.0.next_back() { 379 None => return None, 380 Some((_, &false)) => {} 381 Some((i, &true)) => return Some(i), 382 } 383 } 384 } 385 } 386 387 #[doc(hidden)] 388 impl From<Exec> for RegexSet { 389 fn from(exec: Exec) -> Self { 390 RegexSet(exec) 391 } 392 } 393 394 impl fmt::Debug for RegexSet { 395 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 396 write!(f, "RegexSet({:?})", self.0.regex_strings()) 397 } 398 } 399 400 #[allow(dead_code)] fn as_bytes_str(text: &str) -> &[u8] { text.as_bytes() } 401 #[allow(dead_code)] fn as_bytes_bytes(text: &[u8]) -> &[u8] { text } 402 } 403 } 404 } 405 406 define_set! { 407 unicode, 408 set_unicode, 409 &str, 410 as_bytes_str, 411 /// ```rust 412 /// # use regex::RegexSet; 413 /// let set = RegexSet::new(&[ 414 /// r"[a-z]+@[a-z]+\.(com|org|net)", 415 /// r"[a-z]+\.(com|org|net)", 416 /// ]).unwrap(); 417 /// 418 /// // Ask whether any regexes in the set match. 419 /// assert!(set.is_match("foo@example.com")); 420 /// 421 /// // Identify which regexes in the set match. 422 /// let matches: Vec<_> = set.matches("foo@example.com").into_iter().collect(); 423 /// assert_eq!(vec![0, 1], matches); 424 /// 425 /// // Try again, but with text that only matches one of the regexes. 426 /// let matches: Vec<_> = set.matches("example.com").into_iter().collect(); 427 /// assert_eq!(vec![1], matches); 428 /// 429 /// // Try again, but with text that doesn't match any regex in the set. 430 /// let matches: Vec<_> = set.matches("example").into_iter().collect(); 431 /// assert!(matches.is_empty()); 432 /// ``` 433 } 434 435 define_set! { 436 bytes, 437 set_bytes, 438 &[u8], 439 as_bytes_bytes, 440 /// ```rust 441 /// # use regex::bytes::RegexSet; 442 /// let set = RegexSet::new(&[ 443 /// r"[a-z]+@[a-z]+\.(com|org|net)", 444 /// r"[a-z]+\.(com|org|net)", 445 /// ]).unwrap(); 446 /// 447 /// // Ask whether any regexes in the set match. 448 /// assert!(set.is_match(b"foo@example.com")); 449 /// 450 /// // Identify which regexes in the set match. 451 /// let matches: Vec<_> = set.matches(b"foo@example.com").into_iter().collect(); 452 /// assert_eq!(vec![0, 1], matches); 453 /// 454 /// // Try again, but with text that only matches one of the regexes. 455 /// let matches: Vec<_> = set.matches(b"example.com").into_iter().collect(); 456 /// assert_eq!(vec![1], matches); 457 /// 458 /// // Try again, but with text that doesn't match any regex in the set. 459 /// let matches: Vec<_> = set.matches(b"example").into_iter().collect(); 460 /// assert!(matches.is_empty()); 461 /// ``` 462 } 463