1 use super::{Regex, Region, SearchOptions}; 2 use std::iter::FusedIterator; 3 4 impl Regex { 5 /// Returns the capture groups corresponding to the leftmost-first match 6 /// in text. Capture group `0` always corresponds to the entire match. 7 /// If no match is found, then `None` is returned. captures<'t>(&self, text: &'t str) -> Option<Captures<'t>>8 pub fn captures<'t>(&self, text: &'t str) -> Option<Captures<'t>> { 9 let mut region = Region::new(); 10 self.search_with_options( 11 text, 12 0, 13 text.len(), 14 SearchOptions::SEARCH_OPTION_NONE, 15 Some(&mut region), 16 ) 17 .map(|pos| Captures { 18 text, 19 region, 20 offset: pos, 21 }) 22 } 23 24 /// Returns an iterator for each successive non-overlapping match in `text`, 25 /// returning the start and end byte indices with respect to `text`. 26 /// 27 /// # Example 28 /// 29 /// Find the start and end location of every word with exactly 13 30 /// characters: 31 /// 32 /// ```rust 33 /// # extern crate onig; use onig::Regex; 34 /// # fn main() { 35 /// let text = "Retroactively relinquishing remunerations is reprehensible."; 36 /// for pos in Regex::new(r"\b\w{13}\b").unwrap().find_iter(text) { 37 /// println!("{:?}", pos); 38 /// } 39 /// // Output: 40 /// // (0, 13) 41 /// // (14, 27) 42 /// // (28, 41) 43 /// // (45, 58) 44 /// # } 45 /// ``` find_iter<'r, 't>(&'r self, text: &'t str) -> FindMatches<'r, 't>46 pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> FindMatches<'r, 't> { 47 FindMatches { 48 regex: self, 49 region: Region::new(), 50 text, 51 last_end: 0, 52 last_match_end: None, 53 } 54 } 55 56 /// Returns an iterator over all the non-overlapping capture groups matched 57 /// in `text`. This is operationally the same as `find_iter` (except it 58 /// yields information about submatches). 59 /// 60 /// # Example 61 /// 62 /// We can use this to find all movie titles and their release years in 63 /// some text, where the movie is formatted like "'Title' (xxxx)": 64 /// 65 /// ```rust 66 /// # extern crate onig; use onig::Regex; 67 /// # fn main() { 68 /// let re = Regex::new(r"'([^']+)'\s+\((\d{4})\)") 69 /// .unwrap(); 70 /// let text = "'Citizen Kane' (1941), 'The Wizard of Oz' (1939), 'M' (1931)."; 71 /// for caps in re.captures_iter(text) { 72 /// println!("Movie: {:?}, Released: {:?}", caps.at(1), caps.at(2)); 73 /// } 74 /// // Output: 75 /// // Movie: Citizen Kane, Released: 1941 76 /// // Movie: The Wizard of Oz, Released: 1939 77 /// // Movie: M, Released: 1931 78 /// # } 79 /// ``` captures_iter<'r, 't>(&'r self, text: &'t str) -> FindCaptures<'r, 't>80 pub fn captures_iter<'r, 't>(&'r self, text: &'t str) -> FindCaptures<'r, 't> { 81 FindCaptures { 82 regex: self, 83 text, 84 last_end: 0, 85 last_match_end: None, 86 } 87 } 88 89 /// Returns an iterator of substrings of `text` delimited by a match 90 /// of the regular expression. 91 /// Namely, each element of the iterator corresponds to text that *isn't* 92 /// matched by the regular expression. 93 /// 94 /// This method will *not* copy the text given. 95 /// 96 /// # Example 97 /// 98 /// To split a string delimited by arbitrary amounts of spaces or tabs: 99 /// 100 /// ```rust 101 /// # extern crate onig; use onig::Regex; 102 /// # fn main() { 103 /// let re = Regex::new(r"[ \t]+").unwrap(); 104 /// let fields: Vec<&str> = re.split("a b \t c\td e").collect(); 105 /// assert_eq!(fields, vec!("a", "b", "c", "d", "e")); 106 /// # } 107 /// ``` split<'r, 't>(&'r self, text: &'t str) -> RegexSplits<'r, 't>108 pub fn split<'r, 't>(&'r self, text: &'t str) -> RegexSplits<'r, 't> { 109 RegexSplits { 110 finder: self.find_iter(text), 111 last: 0, 112 } 113 } 114 115 /// Returns an iterator of at most `limit` substrings of `text` delimited 116 /// by a match of the regular expression. (A `limit` of `0` will return no 117 /// substrings.) 118 /// Namely, each element of the iterator corresponds to text that *isn't* 119 /// matched by the regular expression. 120 /// The remainder of the string that is not split will be the last element 121 /// in the iterator. 122 /// 123 /// This method will *not* copy the text given. 124 /// 125 /// # Example 126 /// 127 /// Get the first two words in some text: 128 /// 129 /// ```rust 130 /// # extern crate onig; use onig::Regex; 131 /// # fn main() { 132 /// let re = Regex::new(r"\W+").unwrap(); 133 /// let fields: Vec<&str> = re.splitn("Hey! How are you?", 3).collect(); 134 /// assert_eq!(fields, vec!("Hey", "How", "are you?")); 135 /// # } 136 /// ``` splitn<'r, 't>(&'r self, text: &'t str, limit: usize) -> RegexSplitsN<'r, 't>137 pub fn splitn<'r, 't>(&'r self, text: &'t str, limit: usize) -> RegexSplitsN<'r, 't> { 138 RegexSplitsN { 139 splits: self.split(text), 140 n: limit, 141 } 142 } 143 144 /// Scan the given slice, capturing into the given region and 145 /// executing a callback for each match. scan_with_region<F>( &self, to_search: &str, region: &mut Region, options: SearchOptions, mut callback: F, ) -> i32 where F: Fn(i32, i32, &Region) -> bool,146 pub fn scan_with_region<F>( 147 &self, 148 to_search: &str, 149 region: &mut Region, 150 options: SearchOptions, 151 mut callback: F, 152 ) -> i32 153 where 154 F: Fn(i32, i32, &Region) -> bool, 155 { 156 use onig_sys::{onig_scan, OnigRegion}; 157 use std::os::raw::{c_int, c_void}; 158 159 // Find the bounds of the string we're searching 160 let start = to_search.as_ptr(); 161 let end = to_search[to_search.len()..].as_ptr(); 162 163 unsafe extern "C" fn scan_cb<F>( 164 i: c_int, 165 j: c_int, 166 r: *mut OnigRegion, 167 ud: *mut c_void, 168 ) -> c_int 169 where 170 F: Fn(i32, i32, &Region) -> bool, 171 { 172 let region = Region::clone_from_raw(r); 173 let callback = &*(ud as *mut F); 174 if callback(i, j, ®ion) { 175 0 176 } else { 177 -1 178 } 179 } 180 181 unsafe { 182 onig_scan( 183 self.raw, 184 start, 185 end, 186 (&mut region.raw) as *mut ::onig_sys::OnigRegion, 187 options.bits(), 188 Some(scan_cb::<F>), 189 &mut callback as *mut F as *mut c_void, 190 ) 191 } 192 } 193 194 /// Scan a Pattern and Observe Captures 195 /// 196 /// The scan function takes a haystack `to_search` and invokes the 197 /// given `callback` for each capture of this expression. scan<'t, CB>(&self, to_search: &'t str, callback: CB) where CB: Fn(i32, Captures<'t>) -> bool,198 pub fn scan<'t, CB>(&self, to_search: &'t str, callback: CB) 199 where 200 CB: Fn(i32, Captures<'t>) -> bool, 201 { 202 let mut region = Region::new(); 203 self.scan_with_region( 204 to_search, 205 &mut region, 206 SearchOptions::SEARCH_OPTION_NONE, 207 |n, s, region| { 208 let captures = Captures { 209 text: to_search, 210 region: region.clone(), 211 offset: s as usize, 212 }; 213 callback(n, captures) 214 }, 215 ); 216 } 217 } 218 219 /// Captures represents a group of captured strings for a single match. 220 /// 221 /// The 0th capture always corresponds to the entire match. Each subsequent 222 /// index corresponds to the next capture group in the regex. Positions 223 /// returned from a capture group are always byte indices. 224 /// 225 /// `'t` is the lifetime of the matched text. 226 #[derive(Debug)] 227 pub struct Captures<'t> { 228 text: &'t str, 229 region: Region, 230 offset: usize, 231 } 232 233 impl<'t> Captures<'t> { 234 /// Returns the start and end positions of the Nth capture group. Returns 235 /// `None` if i is not a valid capture group or if the capture group did 236 /// not match anything. The positions returned are always byte indices with 237 /// respect to the original string matched. pos(&self, pos: usize) -> Option<(usize, usize)>238 pub fn pos(&self, pos: usize) -> Option<(usize, usize)> { 239 self.region.pos(pos) 240 } 241 242 /// Returns the matched string for the capture group `i`. If `i` isn't 243 /// a valid capture group or didn't match anything, then `None` is returned. at(&self, pos: usize) -> Option<&'t str>244 pub fn at(&self, pos: usize) -> Option<&'t str> { 245 self.pos(pos).map(|(beg, end)| &self.text[beg..end]) 246 } 247 248 /// Returns the number of captured groups. len(&self) -> usize249 pub fn len(&self) -> usize { 250 self.region.len() 251 } 252 253 /// Returns true if and only if there are no captured groups. is_empty(&self) -> bool254 pub fn is_empty(&self) -> bool { 255 self.len() == 0 256 } 257 258 /// Creates an iterator of all the capture groups in order of appearance in 259 /// the regular expression. iter(&'t self) -> SubCaptures<'t>260 pub fn iter(&'t self) -> SubCaptures<'t> { 261 SubCaptures { idx: 0, caps: self } 262 } 263 264 /// Creates an iterator of all the capture group positions in order of 265 /// appearance in the regular expression. Positions are byte indices in 266 /// terms of the original string matched. iter_pos(&'t self) -> SubCapturesPos<'t>267 pub fn iter_pos(&'t self) -> SubCapturesPos<'t> { 268 SubCapturesPos { idx: 0, caps: self } 269 } 270 271 /// Offset of the captures within the given string slice. offset(&self) -> usize272 pub fn offset(&self) -> usize { 273 self.offset 274 } 275 } 276 277 /// An iterator over capture groups for a particular match of a regular 278 /// expression. 279 /// 280 /// `'t` is the lifetime of the matched text. 281 pub struct SubCaptures<'t> { 282 idx: usize, 283 caps: &'t Captures<'t>, 284 } 285 286 impl<'t> Iterator for SubCaptures<'t> { 287 type Item = Option<&'t str>; 288 next(&mut self) -> Option<Option<&'t str>>289 fn next(&mut self) -> Option<Option<&'t str>> { 290 if self.idx < self.caps.len() { 291 self.idx += 1; 292 Some(self.caps.at(self.idx - 1)) 293 } else { 294 None 295 } 296 } 297 size_hint(&self) -> (usize, Option<usize>)298 fn size_hint(&self) -> (usize, Option<usize>) { 299 let size = self.caps.len(); 300 (size, Some(size)) 301 } 302 count(self) -> usize303 fn count(self) -> usize { 304 self.caps.len() 305 } 306 } 307 308 impl<'t> FusedIterator for SubCaptures<'t> {} 309 310 impl<'t> ExactSizeIterator for SubCaptures<'t> {} 311 312 /// An iterator over capture group positions for a particular match of 313 /// a regular expression. 314 /// 315 /// Positions are byte indices in terms of the original 316 /// string matched. `'t` is the lifetime of the matched text. 317 pub struct SubCapturesPos<'t> { 318 idx: usize, 319 caps: &'t Captures<'t>, 320 } 321 322 impl<'t> Iterator for SubCapturesPos<'t> { 323 type Item = Option<(usize, usize)>; 324 next(&mut self) -> Option<Option<(usize, usize)>>325 fn next(&mut self) -> Option<Option<(usize, usize)>> { 326 if self.idx < self.caps.len() { 327 self.idx += 1; 328 Some(self.caps.pos(self.idx - 1)) 329 } else { 330 None 331 } 332 } 333 size_hint(&self) -> (usize, Option<usize>)334 fn size_hint(&self) -> (usize, Option<usize>) { 335 let size = self.caps.len(); 336 (size, Some(size)) 337 } 338 count(self) -> usize339 fn count(self) -> usize { 340 self.caps.len() 341 } 342 } 343 344 impl<'t> FusedIterator for SubCapturesPos<'t> {} 345 346 impl<'t> ExactSizeIterator for SubCapturesPos<'t> {} 347 348 /// An iterator over all non-overlapping matches for a particular string. 349 /// 350 /// The iterator yields a tuple of integers corresponding to the start and end 351 /// of the match. The indices are byte offsets. The iterator stops when no more 352 /// matches can be found. 353 /// 354 /// `'r` is the lifetime of the `Regex` struct and `'t` is the lifetime 355 /// of the matched string. 356 pub struct FindMatches<'r, 't> { 357 regex: &'r Regex, 358 region: Region, 359 text: &'t str, 360 last_end: usize, 361 last_match_end: Option<usize>, 362 } 363 364 impl<'r, 't> Iterator for FindMatches<'r, 't> { 365 type Item = (usize, usize); 366 next(&mut self) -> Option<(usize, usize)>367 fn next(&mut self) -> Option<(usize, usize)> { 368 if self.last_end > self.text.len() { 369 return None; 370 } 371 self.region.clear(); 372 self.regex.search_with_options( 373 self.text, 374 self.last_end, 375 self.text.len(), 376 SearchOptions::SEARCH_OPTION_NONE, 377 Some(&mut self.region), 378 )?; 379 let (s, e) = self.region.pos(0).unwrap(); 380 381 // Don't accept empty matches immediately following the last match. 382 // i.e., no infinite loops please. 383 if e == s && self.last_match_end.map_or(false, |l| l == e) { 384 self.last_end += self.text[self.last_end..] 385 .chars() 386 .next() 387 .map(|c| c.len_utf8()) 388 .unwrap_or(1); 389 return self.next(); 390 } else { 391 self.last_end = e; 392 self.last_match_end = Some(e); 393 } 394 395 Some((s, e)) 396 } 397 } 398 399 impl<'r, 't> FusedIterator for FindMatches<'r, 't> {} 400 401 /// An iterator that yields all non-overlapping capture groups matching a 402 /// particular regular expression. 403 /// 404 /// The iterator stops when no more matches can be found. 405 /// 406 /// `'r` is the lifetime of the `Regex` struct and `'t` is the lifetime 407 /// of the matched string. 408 pub struct FindCaptures<'r, 't> { 409 regex: &'r Regex, 410 text: &'t str, 411 last_end: usize, 412 last_match_end: Option<usize>, 413 } 414 415 impl<'r, 't> Iterator for FindCaptures<'r, 't> { 416 type Item = Captures<'t>; 417 next(&mut self) -> Option<Captures<'t>>418 fn next(&mut self) -> Option<Captures<'t>> { 419 if self.last_end > self.text.len() { 420 return None; 421 } 422 423 let mut region = Region::new(); 424 let r = self.regex.search_with_options( 425 self.text, 426 self.last_end, 427 self.text.len(), 428 SearchOptions::SEARCH_OPTION_NONE, 429 Some(&mut region), 430 )?; 431 let (s, e) = region.pos(0).unwrap(); 432 433 // Don't accept empty matches immediately following the last match. 434 // i.e., no infinite loops please. 435 if e == s && self.last_match_end.map_or(false, |l| l == e) { 436 self.last_end += self.text[self.last_end..] 437 .chars() 438 .next() 439 .map(|c| c.len_utf8()) 440 .unwrap_or(1); 441 return self.next(); 442 } else { 443 self.last_end = e; 444 self.last_match_end = Some(e); 445 } 446 Some(Captures { 447 text: self.text, 448 region, 449 offset: r, 450 }) 451 } 452 } 453 454 impl<'r, 't> FusedIterator for FindCaptures<'r, 't> {} 455 456 /// Yields all substrings delimited by a regular expression match. 457 /// 458 /// `'r` is the lifetime of the compiled expression and `'t` is the lifetime 459 /// of the string being split. 460 pub struct RegexSplits<'r, 't> { 461 finder: FindMatches<'r, 't>, 462 last: usize, 463 } 464 465 impl<'r, 't> Iterator for RegexSplits<'r, 't> { 466 type Item = &'t str; 467 next(&mut self) -> Option<&'t str>468 fn next(&mut self) -> Option<&'t str> { 469 let text = self.finder.text; 470 match self.finder.next() { 471 None => { 472 if self.last >= text.len() { 473 None 474 } else { 475 let s = &text[self.last..]; 476 self.last = text.len(); 477 Some(s) 478 } 479 } 480 Some((s, e)) => { 481 let matched = &text[self.last..s]; 482 self.last = e; 483 Some(matched) 484 } 485 } 486 } 487 } 488 489 impl<'r, 't> FusedIterator for RegexSplits<'r, 't> {} 490 491 /// Yields at most `N` substrings delimited by a regular expression match. 492 /// 493 /// The last substring will be whatever remains after splitting. 494 /// 495 /// `'r` is the lifetime of the compiled expression and `'t` is the lifetime 496 /// of the string being split. 497 pub struct RegexSplitsN<'r, 't> { 498 splits: RegexSplits<'r, 't>, 499 n: usize, 500 } 501 502 impl<'r, 't> Iterator for RegexSplitsN<'r, 't> { 503 type Item = &'t str; 504 next(&mut self) -> Option<&'t str>505 fn next(&mut self) -> Option<&'t str> { 506 if self.n == 0 { 507 return None; 508 } 509 self.n -= 1; 510 if self.n == 0 { 511 let text = self.splits.finder.text; 512 Some(&text[self.splits.last..]) 513 } else { 514 self.splits.next() 515 } 516 } 517 size_hint(&self) -> (usize, Option<usize>)518 fn size_hint(&self) -> (usize, Option<usize>) { 519 (0, Some(self.n)) 520 } 521 } 522 523 impl<'r, 't> FusedIterator for RegexSplitsN<'r, 't> {} 524 525 #[cfg(test)] 526 mod tests { 527 use super::super::*; 528 529 #[test] test_regex_captures()530 fn test_regex_captures() { 531 let regex = Regex::new("e(l+)|(r+)").unwrap(); 532 let captures = regex.captures("hello").unwrap(); 533 assert_eq!(captures.len(), 3); 534 assert_eq!(captures.is_empty(), false); 535 let pos1 = captures.pos(0).unwrap(); 536 let pos2 = captures.pos(1).unwrap(); 537 let pos3 = captures.pos(2); 538 assert_eq!(pos1, (1, 4)); 539 assert_eq!(pos2, (2, 4)); 540 assert_eq!(pos3, None); 541 let str1 = captures.at(0).unwrap(); 542 let str2 = captures.at(1).unwrap(); 543 let str3 = captures.at(2); 544 assert_eq!(str1, "ell"); 545 assert_eq!(str2, "ll"); 546 assert_eq!(str3, None); 547 } 548 549 #[test] test_regex_subcaptures()550 fn test_regex_subcaptures() { 551 let regex = Regex::new("e(l+)").unwrap(); 552 let captures = regex.captures("hello").unwrap(); 553 let caps = captures.iter().collect::<Vec<_>>(); 554 assert_eq!(caps[0], Some("ell")); 555 assert_eq!(caps[1], Some("ll")); 556 assert_eq!(caps.len(), 2); 557 } 558 559 #[test] test_regex_subcapturespos()560 fn test_regex_subcapturespos() { 561 let regex = Regex::new("e(l+)").unwrap(); 562 let captures = regex.captures("hello").unwrap(); 563 let caps = captures.iter_pos().collect::<Vec<_>>(); 564 assert_eq!(caps[0], Some((1, 4))); 565 assert_eq!(caps[1], Some((2, 4))); 566 assert_eq!(caps.len(), 2); 567 } 568 569 #[test] test_find_iter()570 fn test_find_iter() { 571 let re = Regex::new(r"\d+").unwrap(); 572 let ms = re.find_iter("a12b2").collect::<Vec<_>>(); 573 assert_eq!(ms, vec![(1, 3), (4, 5)]); 574 } 575 576 #[test] test_find_iter_one_zero_length()577 fn test_find_iter_one_zero_length() { 578 let re = Regex::new(r"\d*").unwrap(); 579 let ms = re.find_iter("a1b2").collect::<Vec<_>>(); 580 assert_eq!(ms, vec![(0, 0), (1, 2), (3, 4)]); 581 } 582 583 #[test] test_find_iter_many_zero_length()584 fn test_find_iter_many_zero_length() { 585 let re = Regex::new(r"\d*").unwrap(); 586 let ms = re.find_iter("a1bbb2").collect::<Vec<_>>(); 587 assert_eq!(ms, vec![(0, 0), (1, 2), (3, 3), (4, 4), (5, 6)]); 588 } 589 590 #[test] test_find_iter_empty_after_match()591 fn test_find_iter_empty_after_match() { 592 let re = Regex::new(r"b|(?=,)").unwrap(); 593 let ms = re.find_iter("ba,").collect::<Vec<_>>(); 594 assert_eq!(ms, vec![(0, 1), (2, 2)]); 595 } 596 597 #[test] test_zero_length_matches_jumps_past_match_location()598 fn test_zero_length_matches_jumps_past_match_location() { 599 let re = Regex::new(r"\b").unwrap(); 600 let matches = re.find_iter("test string").collect::<Vec<_>>(); 601 assert_eq!(matches, [(0, 0), (4, 4), (5, 5), (11, 11)]); 602 } 603 604 #[test] test_captures_iter()605 fn test_captures_iter() { 606 let re = Regex::new(r"\d+").unwrap(); 607 let ms = re.captures_iter("a12b2").collect::<Vec<_>>(); 608 assert_eq!(ms[0].pos(0).unwrap(), (1, 3)); 609 assert_eq!(ms[1].pos(0).unwrap(), (4, 5)); 610 } 611 612 #[test] test_captures_stores_match_offset()613 fn test_captures_stores_match_offset() { 614 let reg = Regex::new(r"\d+\.(\d+)").unwrap(); 615 let captures = reg.captures("100 - 3.1415 / 2.0").unwrap(); 616 assert_eq!(6, captures.offset()); 617 let all_caps = reg 618 .captures_iter("1 - 3234.3 * 123.2 - 100") 619 .map(|cap| cap.offset()) 620 .collect::<Vec<_>>(); 621 assert_eq!(vec![4, 13], all_caps); 622 } 623 } 624