1 use super::{Regex, Region, SearchOptions};
2 use std::iter::FusedIterator;
3 
4 impl Regex {
5     /// Returns the capture groups corresponding to the leftmost-first match
6     /// in text. Capture group `0` always corresponds to the entire match.
7     /// If no match is found, then `None` is returned.
captures<'t>(&self, text: &'t str) -> Option<Captures<'t>>8     pub fn captures<'t>(&self, text: &'t str) -> Option<Captures<'t>> {
9         let mut region = Region::new();
10         self.search_with_options(
11             text,
12             0,
13             text.len(),
14             SearchOptions::SEARCH_OPTION_NONE,
15             Some(&mut region),
16         )
17         .map(|pos| Captures {
18             text,
19             region,
20             offset: pos,
21         })
22     }
23 
24     /// Returns an iterator for each successive non-overlapping match in `text`,
25     /// returning the start and end byte indices with respect to `text`.
26     ///
27     /// # Example
28     ///
29     /// Find the start and end location of every word with exactly 13
30     /// characters:
31     ///
32     /// ```rust
33     /// # extern crate onig; use onig::Regex;
34     /// # fn main() {
35     /// let text = "Retroactively relinquishing remunerations is reprehensible.";
36     /// for pos in Regex::new(r"\b\w{13}\b").unwrap().find_iter(text) {
37     ///     println!("{:?}", pos);
38     /// }
39     /// // Output:
40     /// // (0, 13)
41     /// // (14, 27)
42     /// // (28, 41)
43     /// // (45, 58)
44     /// # }
45     /// ```
find_iter<'r, 't>(&'r self, text: &'t str) -> FindMatches<'r, 't>46     pub fn find_iter<'r, 't>(&'r self, text: &'t str) -> FindMatches<'r, 't> {
47         FindMatches {
48             regex: self,
49             region: Region::new(),
50             text,
51             last_end: 0,
52             last_match_end: None,
53         }
54     }
55 
56     /// Returns an iterator over all the non-overlapping capture groups matched
57     /// in `text`. This is operationally the same as `find_iter` (except it
58     /// yields information about submatches).
59     ///
60     /// # Example
61     ///
62     /// We can use this to find all movie titles and their release years in
63     /// some text, where the movie is formatted like "'Title' (xxxx)":
64     ///
65     /// ```rust
66     /// # extern crate onig; use onig::Regex;
67     /// # fn main() {
68     /// let re = Regex::new(r"'([^']+)'\s+\((\d{4})\)")
69     ///                .unwrap();
70     /// let text = "'Citizen Kane' (1941), 'The Wizard of Oz' (1939), 'M' (1931).";
71     /// for caps in re.captures_iter(text) {
72     ///     println!("Movie: {:?}, Released: {:?}", caps.at(1), caps.at(2));
73     /// }
74     /// // Output:
75     /// // Movie: Citizen Kane, Released: 1941
76     /// // Movie: The Wizard of Oz, Released: 1939
77     /// // Movie: M, Released: 1931
78     /// # }
79     /// ```
captures_iter<'r, 't>(&'r self, text: &'t str) -> FindCaptures<'r, 't>80     pub fn captures_iter<'r, 't>(&'r self, text: &'t str) -> FindCaptures<'r, 't> {
81         FindCaptures {
82             regex: self,
83             text,
84             last_end: 0,
85             last_match_end: None,
86         }
87     }
88 
89     /// Returns an iterator of substrings of `text` delimited by a match
90     /// of the regular expression.
91     /// Namely, each element of the iterator corresponds to text that *isn't*
92     /// matched by the regular expression.
93     ///
94     /// This method will *not* copy the text given.
95     ///
96     /// # Example
97     ///
98     /// To split a string delimited by arbitrary amounts of spaces or tabs:
99     ///
100     /// ```rust
101     /// # extern crate onig; use onig::Regex;
102     /// # fn main() {
103     /// let re = Regex::new(r"[ \t]+").unwrap();
104     /// let fields: Vec<&str> = re.split("a b \t  c\td    e").collect();
105     /// assert_eq!(fields, vec!("a", "b", "c", "d", "e"));
106     /// # }
107     /// ```
split<'r, 't>(&'r self, text: &'t str) -> RegexSplits<'r, 't>108     pub fn split<'r, 't>(&'r self, text: &'t str) -> RegexSplits<'r, 't> {
109         RegexSplits {
110             finder: self.find_iter(text),
111             last: 0,
112         }
113     }
114 
115     /// Returns an iterator of at most `limit` substrings of `text` delimited
116     /// by a match of the regular expression. (A `limit` of `0` will return no
117     /// substrings.)
118     /// Namely, each element of the iterator corresponds to text that *isn't*
119     /// matched by the regular expression.
120     /// The remainder of the string that is not split will be the last element
121     /// in the iterator.
122     ///
123     /// This method will *not* copy the text given.
124     ///
125     /// # Example
126     ///
127     /// Get the first two words in some text:
128     ///
129     /// ```rust
130     /// # extern crate onig; use onig::Regex;
131     /// # fn main() {
132     /// let re = Regex::new(r"\W+").unwrap();
133     /// let fields: Vec<&str> = re.splitn("Hey! How are you?", 3).collect();
134     /// assert_eq!(fields, vec!("Hey", "How", "are you?"));
135     /// # }
136     /// ```
splitn<'r, 't>(&'r self, text: &'t str, limit: usize) -> RegexSplitsN<'r, 't>137     pub fn splitn<'r, 't>(&'r self, text: &'t str, limit: usize) -> RegexSplitsN<'r, 't> {
138         RegexSplitsN {
139             splits: self.split(text),
140             n: limit,
141         }
142     }
143 
    /// Scan the given slice, capturing into the given region and
    /// executing a callback for each match.
    ///
    /// The callback receives the two integers and the region that
    /// `onig_scan` hands to its C callback. When the callback returns `true`
    /// the trampoline reports `0` back to the library; when it returns
    /// `false` the trampoline reports `-1`.
    ///
    /// Returns the raw `i32` result of `onig_scan`.
    ///
    /// NOTE(review): presumably a non-zero trampoline result aborts the scan
    /// and the return value is a match count or an error code — confirm
    /// against the Oniguruma API documentation.
    pub fn scan_with_region<F>(
        &self,
        to_search: &str,
        region: &mut Region,
        options: SearchOptions,
        mut callback: F,
    ) -> i32
    where
        F: Fn(i32, i32, &Region) -> bool,
    {
        use onig_sys::{onig_scan, OnigRegion};
        use std::os::raw::{c_int, c_void};

        // Find the bounds of the string we're searching
        let start = to_search.as_ptr();
        // Pointer to the (empty) tail slice, i.e. one past the last byte.
        let end = to_search[to_search.len()..].as_ptr();

        // C-ABI trampoline: recovers the Rust closure from the user-data
        // pointer and forwards the arguments to it.
        unsafe extern "C" fn scan_cb<F>(
            i: c_int,
            j: c_int,
            r: *mut OnigRegion,
            ud: *mut c_void,
        ) -> c_int
        where
            F: Fn(i32, i32, &Region) -> bool,
        {
            // The callback sees a `Region` built from the raw pointer rather
            // than the caller's `region` value itself.
            let region = Region::clone_from_raw(r);
            // `ud` is the `*mut F` passed to `onig_scan` below.
            let callback = &*(ud as *mut F);
            if callback(i, j, &region) {
                0
            } else {
                -1
            }
        }

        // `start`/`end` come from `to_search`, and `callback` is a local
        // that stays alive until `onig_scan` returns, so the pointers handed
        // over here are valid for the duration of the call.
        unsafe {
            onig_scan(
                self.raw,
                start,
                end,
                (&mut region.raw) as *mut ::onig_sys::OnigRegion,
                options.bits(),
                Some(scan_cb::<F>),
                // `callback` is bound `mut` only so that this `&mut` can be
                // taken for the user-data pointer.
                &mut callback as *mut F as *mut c_void,
            )
        }
    }
193 
194     /// Scan a Pattern and Observe Captures
195     ///
196     /// The scan function takes a haystack `to_search` and invokes the
197     /// given `callback` for each capture of this expression.
scan<'t, CB>(&self, to_search: &'t str, callback: CB) where CB: Fn(i32, Captures<'t>) -> bool,198     pub fn scan<'t, CB>(&self, to_search: &'t str, callback: CB)
199     where
200         CB: Fn(i32, Captures<'t>) -> bool,
201     {
202         let mut region = Region::new();
203         self.scan_with_region(
204             to_search,
205             &mut region,
206             SearchOptions::SEARCH_OPTION_NONE,
207             |n, s, region| {
208                 let captures = Captures {
209                     text: to_search,
210                     region: region.clone(),
211                     offset: s as usize,
212                 };
213                 callback(n, captures)
214             },
215         );
216     }
217 }
218 
/// Captures represents a group of captured strings for a single match.
///
/// The 0th capture always corresponds to the entire match. Each subsequent
/// index corresponds to the next capture group in the regex. Positions
/// returned from a capture group are always byte indices.
///
/// `'t` is the lifetime of the matched text.
#[derive(Debug)]
pub struct Captures<'t> {
    /// The text that was searched; capture strings are sliced out of it.
    text: &'t str,
    /// Group positions recorded by the search.
    region: Region,
    /// Value returned by the search that produced this match (exposed via
    /// `offset()`).
    offset: usize,
}
232 
233 impl<'t> Captures<'t> {
234     /// Returns the start and end positions of the Nth capture group. Returns
235     /// `None` if i is not a valid capture group or if the capture group did
236     /// not match anything. The positions returned are always byte indices with
237     /// respect to the original string matched.
pos(&self, pos: usize) -> Option<(usize, usize)>238     pub fn pos(&self, pos: usize) -> Option<(usize, usize)> {
239         self.region.pos(pos)
240     }
241 
242     /// Returns the matched string for the capture group `i`. If `i` isn't
243     /// a valid capture group or didn't match anything, then `None` is returned.
at(&self, pos: usize) -> Option<&'t str>244     pub fn at(&self, pos: usize) -> Option<&'t str> {
245         self.pos(pos).map(|(beg, end)| &self.text[beg..end])
246     }
247 
248     /// Returns the number of captured groups.
len(&self) -> usize249     pub fn len(&self) -> usize {
250         self.region.len()
251     }
252 
253     /// Returns true if and only if there are no captured groups.
is_empty(&self) -> bool254     pub fn is_empty(&self) -> bool {
255         self.len() == 0
256     }
257 
258     /// Creates an iterator of all the capture groups in order of appearance in
259     /// the regular expression.
iter(&'t self) -> SubCaptures<'t>260     pub fn iter(&'t self) -> SubCaptures<'t> {
261         SubCaptures { idx: 0, caps: self }
262     }
263 
264     /// Creates an iterator of all the capture group positions in order of
265     /// appearance in the regular expression. Positions are byte indices in
266     /// terms of the original string matched.
iter_pos(&'t self) -> SubCapturesPos<'t>267     pub fn iter_pos(&'t self) -> SubCapturesPos<'t> {
268         SubCapturesPos { idx: 0, caps: self }
269     }
270 
271     /// Offset of the captures within the given string slice.
offset(&self) -> usize272     pub fn offset(&self) -> usize {
273         self.offset
274     }
275 }
276 
277 /// An iterator over capture groups for a particular match of a regular
278 /// expression.
279 ///
280 /// `'t` is the lifetime of the matched text.
281 pub struct SubCaptures<'t> {
282     idx: usize,
283     caps: &'t Captures<'t>,
284 }
285 
286 impl<'t> Iterator for SubCaptures<'t> {
287     type Item = Option<&'t str>;
288 
next(&mut self) -> Option<Option<&'t str>>289     fn next(&mut self) -> Option<Option<&'t str>> {
290         if self.idx < self.caps.len() {
291             self.idx += 1;
292             Some(self.caps.at(self.idx - 1))
293         } else {
294             None
295         }
296     }
297 
size_hint(&self) -> (usize, Option<usize>)298     fn size_hint(&self) -> (usize, Option<usize>) {
299         let size = self.caps.len();
300         (size, Some(size))
301     }
302 
count(self) -> usize303     fn count(self) -> usize {
304         self.caps.len()
305     }
306 }
307 
308 impl<'t> FusedIterator for SubCaptures<'t> {}
309 
310 impl<'t> ExactSizeIterator for SubCaptures<'t> {}
311 
312 /// An iterator over capture group positions for a particular match of
313 /// a regular expression.
314 ///
315 /// Positions are byte indices in terms of the original
316 /// string matched. `'t` is the lifetime of the matched text.
317 pub struct SubCapturesPos<'t> {
318     idx: usize,
319     caps: &'t Captures<'t>,
320 }
321 
322 impl<'t> Iterator for SubCapturesPos<'t> {
323     type Item = Option<(usize, usize)>;
324 
next(&mut self) -> Option<Option<(usize, usize)>>325     fn next(&mut self) -> Option<Option<(usize, usize)>> {
326         if self.idx < self.caps.len() {
327             self.idx += 1;
328             Some(self.caps.pos(self.idx - 1))
329         } else {
330             None
331         }
332     }
333 
size_hint(&self) -> (usize, Option<usize>)334     fn size_hint(&self) -> (usize, Option<usize>) {
335         let size = self.caps.len();
336         (size, Some(size))
337     }
338 
count(self) -> usize339     fn count(self) -> usize {
340         self.caps.len()
341     }
342 }
343 
344 impl<'t> FusedIterator for SubCapturesPos<'t> {}
345 
346 impl<'t> ExactSizeIterator for SubCapturesPos<'t> {}
347 
348 /// An iterator over all non-overlapping matches for a particular string.
349 ///
350 /// The iterator yields a tuple of integers corresponding to the start and end
351 /// of the match. The indices are byte offsets. The iterator stops when no more
352 /// matches can be found.
353 ///
354 /// `'r` is the lifetime of the `Regex` struct and `'t` is the lifetime
355 /// of the matched string.
356 pub struct FindMatches<'r, 't> {
357     regex: &'r Regex,
358     region: Region,
359     text: &'t str,
360     last_end: usize,
361     last_match_end: Option<usize>,
362 }
363 
364 impl<'r, 't> Iterator for FindMatches<'r, 't> {
365     type Item = (usize, usize);
366 
next(&mut self) -> Option<(usize, usize)>367     fn next(&mut self) -> Option<(usize, usize)> {
368         if self.last_end > self.text.len() {
369             return None;
370         }
371         self.region.clear();
372         self.regex.search_with_options(
373             self.text,
374             self.last_end,
375             self.text.len(),
376             SearchOptions::SEARCH_OPTION_NONE,
377             Some(&mut self.region),
378         )?;
379         let (s, e) = self.region.pos(0).unwrap();
380 
381         // Don't accept empty matches immediately following the last match.
382         // i.e., no infinite loops please.
383         if e == s && self.last_match_end.map_or(false, |l| l == e) {
384             self.last_end += self.text[self.last_end..]
385                 .chars()
386                 .next()
387                 .map(|c| c.len_utf8())
388                 .unwrap_or(1);
389             return self.next();
390         } else {
391             self.last_end = e;
392             self.last_match_end = Some(e);
393         }
394 
395         Some((s, e))
396     }
397 }
398 
399 impl<'r, 't> FusedIterator for FindMatches<'r, 't> {}
400 
401 /// An iterator that yields all non-overlapping capture groups matching a
402 /// particular regular expression.
403 ///
404 /// The iterator stops when no more matches can be found.
405 ///
406 /// `'r` is the lifetime of the `Regex` struct and `'t` is the lifetime
407 /// of the matched string.
408 pub struct FindCaptures<'r, 't> {
409     regex: &'r Regex,
410     text: &'t str,
411     last_end: usize,
412     last_match_end: Option<usize>,
413 }
414 
415 impl<'r, 't> Iterator for FindCaptures<'r, 't> {
416     type Item = Captures<'t>;
417 
next(&mut self) -> Option<Captures<'t>>418     fn next(&mut self) -> Option<Captures<'t>> {
419         if self.last_end > self.text.len() {
420             return None;
421         }
422 
423         let mut region = Region::new();
424         let r = self.regex.search_with_options(
425             self.text,
426             self.last_end,
427             self.text.len(),
428             SearchOptions::SEARCH_OPTION_NONE,
429             Some(&mut region),
430         )?;
431         let (s, e) = region.pos(0).unwrap();
432 
433         // Don't accept empty matches immediately following the last match.
434         // i.e., no infinite loops please.
435         if e == s && self.last_match_end.map_or(false, |l| l == e) {
436             self.last_end += self.text[self.last_end..]
437                 .chars()
438                 .next()
439                 .map(|c| c.len_utf8())
440                 .unwrap_or(1);
441             return self.next();
442         } else {
443             self.last_end = e;
444             self.last_match_end = Some(e);
445         }
446         Some(Captures {
447             text: self.text,
448             region,
449             offset: r,
450         })
451     }
452 }
453 
454 impl<'r, 't> FusedIterator for FindCaptures<'r, 't> {}
455 
456 /// Yields all substrings delimited by a regular expression match.
457 ///
458 /// `'r` is the lifetime of the compiled expression and `'t` is the lifetime
459 /// of the string being split.
460 pub struct RegexSplits<'r, 't> {
461     finder: FindMatches<'r, 't>,
462     last: usize,
463 }
464 
465 impl<'r, 't> Iterator for RegexSplits<'r, 't> {
466     type Item = &'t str;
467 
next(&mut self) -> Option<&'t str>468     fn next(&mut self) -> Option<&'t str> {
469         let text = self.finder.text;
470         match self.finder.next() {
471             None => {
472                 if self.last >= text.len() {
473                     None
474                 } else {
475                     let s = &text[self.last..];
476                     self.last = text.len();
477                     Some(s)
478                 }
479             }
480             Some((s, e)) => {
481                 let matched = &text[self.last..s];
482                 self.last = e;
483                 Some(matched)
484             }
485         }
486     }
487 }
488 
489 impl<'r, 't> FusedIterator for RegexSplits<'r, 't> {}
490 
491 /// Yields at most `N` substrings delimited by a regular expression match.
492 ///
493 /// The last substring will be whatever remains after splitting.
494 ///
495 /// `'r` is the lifetime of the compiled expression and `'t` is the lifetime
496 /// of the string being split.
497 pub struct RegexSplitsN<'r, 't> {
498     splits: RegexSplits<'r, 't>,
499     n: usize,
500 }
501 
502 impl<'r, 't> Iterator for RegexSplitsN<'r, 't> {
503     type Item = &'t str;
504 
next(&mut self) -> Option<&'t str>505     fn next(&mut self) -> Option<&'t str> {
506         if self.n == 0 {
507             return None;
508         }
509         self.n -= 1;
510         if self.n == 0 {
511             let text = self.splits.finder.text;
512             Some(&text[self.splits.last..])
513         } else {
514             self.splits.next()
515         }
516     }
517 
size_hint(&self) -> (usize, Option<usize>)518     fn size_hint(&self) -> (usize, Option<usize>) {
519         (0, Some(self.n))
520     }
521 }
522 
523 impl<'r, 't> FusedIterator for RegexSplitsN<'r, 't> {}
524 
#[cfg(test)]
mod tests {
    use super::super::*;

    /// Positions and text of each group when only the first alternative of
    /// the pattern takes part in the match.
    #[test]
    fn test_regex_captures() {
        let regex = Regex::new("e(l+)|(r+)").unwrap();
        let captures = regex.captures("hello").unwrap();
        assert_eq!(captures.len(), 3);
        assert!(!captures.is_empty());

        // Byte ranges: whole match, first group, unmatched second group.
        assert_eq!(captures.pos(0), Some((1, 4)));
        assert_eq!(captures.pos(1), Some((2, 4)));
        assert_eq!(captures.pos(2), None);

        // The same groups as text.
        assert_eq!(captures.at(0), Some("ell"));
        assert_eq!(captures.at(1), Some("ll"));
        assert_eq!(captures.at(2), None);
    }

    /// `iter` yields the text of every group in order.
    #[test]
    fn test_regex_subcaptures() {
        let regex = Regex::new("e(l+)").unwrap();
        let captures = regex.captures("hello").unwrap();
        let caps: Vec<_> = captures.iter().collect();
        assert_eq!(caps.len(), 2);
        assert_eq!(caps[0], Some("ell"));
        assert_eq!(caps[1], Some("ll"));
    }

    /// `iter_pos` yields the byte range of every group in order.
    #[test]
    fn test_regex_subcapturespos() {
        let regex = Regex::new("e(l+)").unwrap();
        let captures = regex.captures("hello").unwrap();
        let caps: Vec<_> = captures.iter_pos().collect();
        assert_eq!(caps.len(), 2);
        assert_eq!(caps[0], Some((1, 4)));
        assert_eq!(caps[1], Some((2, 4)));
    }

    /// Plain non-empty matches.
    #[test]
    fn test_find_iter() {
        let re = Regex::new(r"\d+").unwrap();
        let ms: Vec<_> = re.find_iter("a12b2").collect();
        assert_eq!(ms, vec![(1, 3), (4, 5)]);
    }

    /// A single empty match before the first non-empty one.
    #[test]
    fn test_find_iter_one_zero_length() {
        let re = Regex::new(r"\d*").unwrap();
        let ms: Vec<_> = re.find_iter("a1b2").collect();
        assert_eq!(ms, vec![(0, 0), (1, 2), (3, 4)]);
    }

    /// Several empty matches interleaved with non-empty ones.
    #[test]
    fn test_find_iter_many_zero_length() {
        let re = Regex::new(r"\d*").unwrap();
        let ms: Vec<_> = re.find_iter("a1bbb2").collect();
        assert_eq!(ms, vec![(0, 0), (1, 2), (3, 3), (4, 4), (5, 6)]);
    }

    /// An empty match that does not directly follow the previous match is
    /// still reported.
    #[test]
    fn test_find_iter_empty_after_match() {
        let re = Regex::new(r"b|(?=,)").unwrap();
        let ms: Vec<_> = re.find_iter("ba,").collect();
        assert_eq!(ms, vec![(0, 1), (2, 2)]);
    }

    /// Pure zero-width patterns advance past each match location.
    #[test]
    fn test_zero_length_matches_jumps_past_match_location() {
        let re = Regex::new(r"\b").unwrap();
        let matches: Vec<_> = re.find_iter("test string").collect();
        assert_eq!(matches, [(0, 0), (4, 4), (5, 5), (11, 11)]);
    }

    /// `captures_iter` reports the same whole-match ranges as `find_iter`.
    #[test]
    fn test_captures_iter() {
        let re = Regex::new(r"\d+").unwrap();
        let spans: Vec<_> = re
            .captures_iter("a12b2")
            .map(|caps| caps.pos(0).unwrap())
            .collect();
        assert_eq!(spans[0], (1, 3));
        assert_eq!(spans[1], (4, 5));
    }

    /// `offset` is recorded for single searches and for iteration alike.
    #[test]
    fn test_captures_stores_match_offset() {
        let reg = Regex::new(r"\d+\.(\d+)").unwrap();

        let first = reg.captures("100 - 3.1415 / 2.0").unwrap();
        assert_eq!(6, first.offset());

        let mut all_caps = Vec::new();
        for cap in reg.captures_iter("1 - 3234.3 * 123.2 - 100") {
            all_caps.push(cap.offset());
        }
        assert_eq!(vec![4, 13], all_caps);
    }
}
624