1 /// Slot is a single saved capture location. Note that there are two slots for
2 /// every capture in a regular expression (one slot each for the start and end
3 /// of the capture).
4 pub type Slot = Option<usize>;
5 
6 /// Locations represents the offsets of each capturing group in a regex for
7 /// a single match.
8 ///
9 /// Unlike `Captures`, a `Locations` value only stores offsets.
10 #[doc(hidden)]
11 #[derive(Clone, Debug)]
12 pub struct Locations(Vec<Slot>);
13 
14 impl Locations {
15     /// Returns the start and end positions of the Nth capture group. Returns
16     /// `None` if `i` is not a valid capture group or if the capture group did
17     /// not match anything. The positions returned are *always* byte indices
18     /// with respect to the original string matched.
pos(&self, i: usize) -> Option<(usize, usize)>19     pub fn pos(&self, i: usize) -> Option<(usize, usize)> {
20         let (s, e) = (i * 2, i * 2 + 1);
21         match (self.0.get(s), self.0.get(e)) {
22             (Some(&Some(s)), Some(&Some(e))) => Some((s, e)),
23             _ => None,
24         }
25     }
26 
27     /// Creates an iterator of all the capture group positions in order of
28     /// appearance in the regular expression. Positions are byte indices
29     /// in terms of the original string matched.
iter(&self) -> SubCapturesPosIter30     pub fn iter(&self) -> SubCapturesPosIter {
31         SubCapturesPosIter { idx: 0, locs: self }
32     }
33 
34     /// Returns the total number of capturing groups.
35     ///
36     /// This is always at least `1` since every regex has at least `1`
37     /// capturing group that corresponds to the entire match.
len(&self) -> usize38     pub fn len(&self) -> usize {
39         self.0.len() / 2
40     }
41 
42     /// Return the individual slots as a slice.
as_slots(&mut self) -> &mut [Slot]43     pub(crate) fn as_slots(&mut self) -> &mut [Slot] {
44         &mut self.0
45     }
46 }
47 
48 /// An iterator over capture group positions for a particular match of a
49 /// regular expression.
50 ///
51 /// Positions are byte indices in terms of the original string matched.
52 ///
53 /// `'c` is the lifetime of the captures.
54 #[derive(Clone)]
55 pub struct SubCapturesPosIter<'c> {
56     idx: usize,
57     locs: &'c Locations,
58 }
59 
60 impl<'c> Iterator for SubCapturesPosIter<'c> {
61     type Item = Option<(usize, usize)>;
62 
next(&mut self) -> Option<Option<(usize, usize)>>63     fn next(&mut self) -> Option<Option<(usize, usize)>> {
64         if self.idx >= self.locs.len() {
65             return None;
66         }
67         let x = match self.locs.pos(self.idx) {
68             None => Some(None),
69             Some((s, e)) => Some(Some((s, e))),
70         };
71         self.idx += 1;
72         x
73     }
74 }
75 
76 /// `RegularExpression` describes types that can implement regex searching.
77 ///
78 /// This trait is my attempt at reducing code duplication and to standardize
79 /// the internal API. Specific duplication that is avoided are the `find`
80 /// and `capture` iterators, which are slightly tricky.
81 ///
82 /// It's not clear whether this trait is worth it, and it also isn't
83 /// clear whether it's useful as a public trait or not. Methods like
84 /// `next_after_empty` reak of bad design, but the rest of the methods seem
85 /// somewhat reasonable. One particular thing this trait would expose would be
86 /// the ability to start the search of a regex anywhere in a haystack, which
87 /// isn't possible in the current public API.
88 pub trait RegularExpression: Sized {
89     /// The type of the haystack.
90     type Text: ?Sized;
91 
92     /// The number of capture slots in the compiled regular expression. This is
93     /// always two times the number of capture groups (two slots per group).
slots_len(&self) -> usize94     fn slots_len(&self) -> usize;
95 
96     /// Allocates fresh space for all capturing groups in this regex.
locations(&self) -> Locations97     fn locations(&self) -> Locations {
98         Locations(vec![None; self.slots_len()])
99     }
100 
101     /// Returns the position of the next character after `i`.
102     ///
103     /// For example, a haystack with type `&[u8]` probably returns `i+1`,
104     /// whereas a haystack with type `&str` probably returns `i` plus the
105     /// length of the next UTF-8 sequence.
next_after_empty(&self, text: &Self::Text, i: usize) -> usize106     fn next_after_empty(&self, text: &Self::Text, i: usize) -> usize;
107 
108     /// Returns the location of the shortest match.
shortest_match_at( &self, text: &Self::Text, start: usize, ) -> Option<usize>109     fn shortest_match_at(
110         &self,
111         text: &Self::Text,
112         start: usize,
113     ) -> Option<usize>;
114 
115     /// Returns whether the regex matches the text given.
is_match_at(&self, text: &Self::Text, start: usize) -> bool116     fn is_match_at(&self, text: &Self::Text, start: usize) -> bool;
117 
118     /// Returns the leftmost-first match location if one exists.
find_at( &self, text: &Self::Text, start: usize, ) -> Option<(usize, usize)>119     fn find_at(
120         &self,
121         text: &Self::Text,
122         start: usize,
123     ) -> Option<(usize, usize)>;
124 
125     /// Returns the leftmost-first match location if one exists, and also
126     /// fills in any matching capture slot locations.
captures_read_at( &self, locs: &mut Locations, text: &Self::Text, start: usize, ) -> Option<(usize, usize)>127     fn captures_read_at(
128         &self,
129         locs: &mut Locations,
130         text: &Self::Text,
131         start: usize,
132     ) -> Option<(usize, usize)>;
133 
134     /// Returns an iterator over all non-overlapping successive leftmost-first
135     /// matches.
find_iter(self, text: &Self::Text) -> Matches<Self>136     fn find_iter(self, text: &Self::Text) -> Matches<Self> {
137         Matches { re: self, text: text, last_end: 0, last_match: None }
138     }
139 
140     /// Returns an iterator over all non-overlapping successive leftmost-first
141     /// matches with captures.
captures_iter(self, text: &Self::Text) -> CaptureMatches<Self>142     fn captures_iter(self, text: &Self::Text) -> CaptureMatches<Self> {
143         CaptureMatches(self.find_iter(text))
144     }
145 }
146 
147 /// An iterator over all non-overlapping successive leftmost-first matches.
148 pub struct Matches<'t, R>
149 where
150     R: RegularExpression,
151     R::Text: 't,
152 {
153     re: R,
154     text: &'t R::Text,
155     last_end: usize,
156     last_match: Option<usize>,
157 }
158 
159 impl<'t, R> Matches<'t, R>
160 where
161     R: RegularExpression,
162     R::Text: 't,
163 {
164     /// Return the text being searched.
text(&self) -> &'t R::Text165     pub fn text(&self) -> &'t R::Text {
166         self.text
167     }
168 
169     /// Return the underlying regex.
regex(&self) -> &R170     pub fn regex(&self) -> &R {
171         &self.re
172     }
173 }
174 
175 impl<'t, R> Iterator for Matches<'t, R>
176 where
177     R: RegularExpression,
178     R::Text: 't + AsRef<[u8]>,
179 {
180     type Item = (usize, usize);
181 
next(&mut self) -> Option<(usize, usize)>182     fn next(&mut self) -> Option<(usize, usize)> {
183         if self.last_end > self.text.as_ref().len() {
184             return None;
185         }
186         let (s, e) = match self.re.find_at(self.text, self.last_end) {
187             None => return None,
188             Some((s, e)) => (s, e),
189         };
190         if s == e {
191             // This is an empty match. To ensure we make progress, start
192             // the next search at the smallest possible starting position
193             // of the next match following this one.
194             self.last_end = self.re.next_after_empty(self.text, e);
195             // Don't accept empty matches immediately following a match.
196             // Just move on to the next match.
197             if Some(e) == self.last_match {
198                 return self.next();
199             }
200         } else {
201             self.last_end = e;
202         }
203         self.last_match = Some(e);
204         Some((s, e))
205     }
206 }
207 
208 /// An iterator over all non-overlapping successive leftmost-first matches with
209 /// captures.
210 pub struct CaptureMatches<'t, R>(Matches<'t, R>)
211 where
212     R: RegularExpression,
213     R::Text: 't;
214 
215 impl<'t, R> CaptureMatches<'t, R>
216 where
217     R: RegularExpression,
218     R::Text: 't,
219 {
220     /// Return the text being searched.
text(&self) -> &'t R::Text221     pub fn text(&self) -> &'t R::Text {
222         self.0.text()
223     }
224 
225     /// Return the underlying regex.
regex(&self) -> &R226     pub fn regex(&self) -> &R {
227         self.0.regex()
228     }
229 }
230 
231 impl<'t, R> Iterator for CaptureMatches<'t, R>
232 where
233     R: RegularExpression,
234     R::Text: 't + AsRef<[u8]>,
235 {
236     type Item = Locations;
237 
next(&mut self) -> Option<Locations>238     fn next(&mut self) -> Option<Locations> {
239         if self.0.last_end > self.0.text.as_ref().len() {
240             return None;
241         }
242         let mut locs = self.0.re.locations();
243         let (s, e) = match self.0.re.captures_read_at(
244             &mut locs,
245             self.0.text,
246             self.0.last_end,
247         ) {
248             None => return None,
249             Some((s, e)) => (s, e),
250         };
251         if s == e {
252             self.0.last_end = self.0.re.next_after_empty(self.0.text, e);
253             if Some(e) == self.0.last_match {
254                 return self.next();
255             }
256         } else {
257             self.0.last_end = e;
258         }
259         self.0.last_match = Some(e);
260         Some(locs)
261     }
262 }
263