1 /// Slot is a single saved capture location. Note that there are two slots for 2 /// every capture in a regular expression (one slot each for the start and end 3 /// of the capture). 4 pub type Slot = Option<usize>; 5 6 /// Locations represents the offsets of each capturing group in a regex for 7 /// a single match. 8 /// 9 /// Unlike `Captures`, a `Locations` value only stores offsets. 10 #[doc(hidden)] 11 #[derive(Clone, Debug)] 12 pub struct Locations(Vec<Slot>); 13 14 impl Locations { 15 /// Returns the start and end positions of the Nth capture group. Returns 16 /// `None` if `i` is not a valid capture group or if the capture group did 17 /// not match anything. The positions returned are *always* byte indices 18 /// with respect to the original string matched. pos(&self, i: usize) -> Option<(usize, usize)>19 pub fn pos(&self, i: usize) -> Option<(usize, usize)> { 20 let (s, e) = (i * 2, i * 2 + 1); 21 match (self.0.get(s), self.0.get(e)) { 22 (Some(&Some(s)), Some(&Some(e))) => Some((s, e)), 23 _ => None, 24 } 25 } 26 27 /// Creates an iterator of all the capture group positions in order of 28 /// appearance in the regular expression. Positions are byte indices 29 /// in terms of the original string matched. iter(&self) -> SubCapturesPosIter30 pub fn iter(&self) -> SubCapturesPosIter { 31 SubCapturesPosIter { idx: 0, locs: self } 32 } 33 34 /// Returns the total number of capturing groups. 35 /// 36 /// This is always at least `1` since every regex has at least `1` 37 /// capturing group that corresponds to the entire match. len(&self) -> usize38 pub fn len(&self) -> usize { 39 self.0.len() / 2 40 } 41 42 /// Return the individual slots as a slice. as_slots(&mut self) -> &mut [Slot]43 pub(crate) fn as_slots(&mut self) -> &mut [Slot] { 44 &mut self.0 45 } 46 } 47 48 /// An iterator over capture group positions for a particular match of a 49 /// regular expression. 50 /// 51 /// Positions are byte indices in terms of the original string matched. 52 /// 53 /// `'c` is the lifetime of the captures. 54 #[derive(Clone)] 55 pub struct SubCapturesPosIter<'c> { 56 idx: usize, 57 locs: &'c Locations, 58 } 59 60 impl<'c> Iterator for SubCapturesPosIter<'c> { 61 type Item = Option<(usize, usize)>; 62 next(&mut self) -> Option<Option<(usize, usize)>>63 fn next(&mut self) -> Option<Option<(usize, usize)>> { 64 if self.idx >= self.locs.len() { 65 return None; 66 } 67 let x = match self.locs.pos(self.idx) { 68 None => Some(None), 69 Some((s, e)) => Some(Some((s, e))), 70 }; 71 self.idx += 1; 72 x 73 } 74 } 75 76 /// `RegularExpression` describes types that can implement regex searching. 77 /// 78 /// This trait is my attempt at reducing code duplication and to standardize 79 /// the internal API. Specific duplication that is avoided are the `find` 80 /// and `capture` iterators, which are slightly tricky. 81 /// 82 /// It's not clear whether this trait is worth it, and it also isn't 83 /// clear whether it's useful as a public trait or not. Methods like 84 /// `next_after_empty` reak of bad design, but the rest of the methods seem 85 /// somewhat reasonable. One particular thing this trait would expose would be 86 /// the ability to start the search of a regex anywhere in a haystack, which 87 /// isn't possible in the current public API. 88 pub trait RegularExpression: Sized { 89 /// The type of the haystack. 90 type Text: ?Sized; 91 92 /// The number of capture slots in the compiled regular expression. This is 93 /// always two times the number of capture groups (two slots per group). slots_len(&self) -> usize94 fn slots_len(&self) -> usize; 95 96 /// Allocates fresh space for all capturing groups in this regex. locations(&self) -> Locations97 fn locations(&self) -> Locations { 98 Locations(vec![None; self.slots_len()]) 99 } 100 101 /// Returns the position of the next character after `i`. 102 /// 103 /// For example, a haystack with type `&[u8]` probably returns `i+1`, 104 /// whereas a haystack with type `&str` probably returns `i` plus the 105 /// length of the next UTF-8 sequence. next_after_empty(&self, text: &Self::Text, i: usize) -> usize106 fn next_after_empty(&self, text: &Self::Text, i: usize) -> usize; 107 108 /// Returns the location of the shortest match. shortest_match_at( &self, text: &Self::Text, start: usize, ) -> Option<usize>109 fn shortest_match_at( 110 &self, 111 text: &Self::Text, 112 start: usize, 113 ) -> Option<usize>; 114 115 /// Returns whether the regex matches the text given. is_match_at(&self, text: &Self::Text, start: usize) -> bool116 fn is_match_at(&self, text: &Self::Text, start: usize) -> bool; 117 118 /// Returns the leftmost-first match location if one exists. find_at( &self, text: &Self::Text, start: usize, ) -> Option<(usize, usize)>119 fn find_at( 120 &self, 121 text: &Self::Text, 122 start: usize, 123 ) -> Option<(usize, usize)>; 124 125 /// Returns the leftmost-first match location if one exists, and also 126 /// fills in any matching capture slot locations. captures_read_at( &self, locs: &mut Locations, text: &Self::Text, start: usize, ) -> Option<(usize, usize)>127 fn captures_read_at( 128 &self, 129 locs: &mut Locations, 130 text: &Self::Text, 131 start: usize, 132 ) -> Option<(usize, usize)>; 133 134 /// Returns an iterator over all non-overlapping successive leftmost-first 135 /// matches. find_iter(self, text: &Self::Text) -> Matches<Self>136 fn find_iter(self, text: &Self::Text) -> Matches<Self> { 137 Matches { re: self, text: text, last_end: 0, last_match: None } 138 } 139 140 /// Returns an iterator over all non-overlapping successive leftmost-first 141 /// matches with captures. captures_iter(self, text: &Self::Text) -> CaptureMatches<Self>142 fn captures_iter(self, text: &Self::Text) -> CaptureMatches<Self> { 143 CaptureMatches(self.find_iter(text)) 144 } 145 } 146 147 /// An iterator over all non-overlapping successive leftmost-first matches. 148 pub struct Matches<'t, R> 149 where 150 R: RegularExpression, 151 R::Text: 't, 152 { 153 re: R, 154 text: &'t R::Text, 155 last_end: usize, 156 last_match: Option<usize>, 157 } 158 159 impl<'t, R> Matches<'t, R> 160 where 161 R: RegularExpression, 162 R::Text: 't, 163 { 164 /// Return the text being searched. text(&self) -> &'t R::Text165 pub fn text(&self) -> &'t R::Text { 166 self.text 167 } 168 169 /// Return the underlying regex. regex(&self) -> &R170 pub fn regex(&self) -> &R { 171 &self.re 172 } 173 } 174 175 impl<'t, R> Iterator for Matches<'t, R> 176 where 177 R: RegularExpression, 178 R::Text: 't + AsRef<[u8]>, 179 { 180 type Item = (usize, usize); 181 next(&mut self) -> Option<(usize, usize)>182 fn next(&mut self) -> Option<(usize, usize)> { 183 if self.last_end > self.text.as_ref().len() { 184 return None; 185 } 186 let (s, e) = match self.re.find_at(self.text, self.last_end) { 187 None => return None, 188 Some((s, e)) => (s, e), 189 }; 190 if s == e { 191 // This is an empty match. To ensure we make progress, start 192 // the next search at the smallest possible starting position 193 // of the next match following this one. 194 self.last_end = self.re.next_after_empty(self.text, e); 195 // Don't accept empty matches immediately following a match. 196 // Just move on to the next match. 197 if Some(e) == self.last_match { 198 return self.next(); 199 } 200 } else { 201 self.last_end = e; 202 } 203 self.last_match = Some(e); 204 Some((s, e)) 205 } 206 } 207 208 /// An iterator over all non-overlapping successive leftmost-first matches with 209 /// captures. 210 pub struct CaptureMatches<'t, R>(Matches<'t, R>) 211 where 212 R: RegularExpression, 213 R::Text: 't; 214 215 impl<'t, R> CaptureMatches<'t, R> 216 where 217 R: RegularExpression, 218 R::Text: 't, 219 { 220 /// Return the text being searched. text(&self) -> &'t R::Text221 pub fn text(&self) -> &'t R::Text { 222 self.0.text() 223 } 224 225 /// Return the underlying regex. regex(&self) -> &R226 pub fn regex(&self) -> &R { 227 self.0.regex() 228 } 229 } 230 231 impl<'t, R> Iterator for CaptureMatches<'t, R> 232 where 233 R: RegularExpression, 234 R::Text: 't + AsRef<[u8]>, 235 { 236 type Item = Locations; 237 next(&mut self) -> Option<Locations>238 fn next(&mut self) -> Option<Locations> { 239 if self.0.last_end > self.0.text.as_ref().len() { 240 return None; 241 } 242 let mut locs = self.0.re.locations(); 243 let (s, e) = match self.0.re.captures_read_at( 244 &mut locs, 245 self.0.text, 246 self.0.last_end, 247 ) { 248 None => return None, 249 Some((s, e)) => (s, e), 250 }; 251 if s == e { 252 self.0.last_end = self.0.re.next_after_empty(self.0.text, e); 253 if Some(e) == self.0.last_match { 254 return self.next(); 255 } 256 } else { 257 self.0.last_end = e; 258 } 259 self.0.last_match = Some(e); 260 Some(locs) 261 } 262 } 263