1 use std::fmt; 2 use std::iter::FusedIterator; 3 4 /// Slot is a single saved capture location. Note that there are two slots for 5 /// every capture in a regular expression (one slot each for the start and end 6 /// of the capture). 7 pub type Slot = Option<usize>; 8 9 /// Locations represents the offsets of each capturing group in a regex for 10 /// a single match. 11 /// 12 /// Unlike `Captures`, a `Locations` value only stores offsets. 13 #[doc(hidden)] 14 #[derive(Clone, Debug)] 15 pub struct Locations(Vec<Slot>); 16 17 impl Locations { 18 /// Returns the start and end positions of the Nth capture group. Returns 19 /// `None` if `i` is not a valid capture group or if the capture group did 20 /// not match anything. The positions returned are *always* byte indices 21 /// with respect to the original string matched. pos(&self, i: usize) -> Option<(usize, usize)>22 pub fn pos(&self, i: usize) -> Option<(usize, usize)> { 23 let (s, e) = (i * 2, i * 2 + 1); 24 match (self.0.get(s), self.0.get(e)) { 25 (Some(&Some(s)), Some(&Some(e))) => Some((s, e)), 26 _ => None, 27 } 28 } 29 30 /// Creates an iterator of all the capture group positions in order of 31 /// appearance in the regular expression. Positions are byte indices 32 /// in terms of the original string matched. iter(&self) -> SubCapturesPosIter<'_>33 pub fn iter(&self) -> SubCapturesPosIter<'_> { 34 SubCapturesPosIter { idx: 0, locs: self } 35 } 36 37 /// Returns the total number of capturing groups. 38 /// 39 /// This is always at least `1` since every regex has at least `1` 40 /// capturing group that corresponds to the entire match. len(&self) -> usize41 pub fn len(&self) -> usize { 42 self.0.len() / 2 43 } 44 45 /// Return the individual slots as a slice. as_slots(&mut self) -> &mut [Slot]46 pub(crate) fn as_slots(&mut self) -> &mut [Slot] { 47 &mut self.0 48 } 49 } 50 51 /// An iterator over capture group positions for a particular match of a 52 /// regular expression. 53 /// 54 /// Positions are byte indices in terms of the original string matched. 55 /// 56 /// `'c` is the lifetime of the captures. 57 #[derive(Clone, Debug)] 58 pub struct SubCapturesPosIter<'c> { 59 idx: usize, 60 locs: &'c Locations, 61 } 62 63 impl<'c> Iterator for SubCapturesPosIter<'c> { 64 type Item = Option<(usize, usize)>; 65 next(&mut self) -> Option<Option<(usize, usize)>>66 fn next(&mut self) -> Option<Option<(usize, usize)>> { 67 if self.idx >= self.locs.len() { 68 return None; 69 } 70 let x = match self.locs.pos(self.idx) { 71 None => Some(None), 72 Some((s, e)) => Some(Some((s, e))), 73 }; 74 self.idx += 1; 75 x 76 } 77 } 78 79 impl<'c> FusedIterator for SubCapturesPosIter<'c> {} 80 81 /// `RegularExpression` describes types that can implement regex searching. 82 /// 83 /// This trait is my attempt at reducing code duplication and to standardize 84 /// the internal API. Specific duplication that is avoided are the `find` 85 /// and `capture` iterators, which are slightly tricky. 86 /// 87 /// It's not clear whether this trait is worth it, and it also isn't 88 /// clear whether it's useful as a public trait or not. Methods like 89 /// `next_after_empty` reak of bad design, but the rest of the methods seem 90 /// somewhat reasonable. One particular thing this trait would expose would be 91 /// the ability to start the search of a regex anywhere in a haystack, which 92 /// isn't possible in the current public API. 93 pub trait RegularExpression: Sized + fmt::Debug { 94 /// The type of the haystack. 95 type Text: ?Sized + fmt::Debug; 96 97 /// The number of capture slots in the compiled regular expression. This is 98 /// always two times the number of capture groups (two slots per group). slots_len(&self) -> usize99 fn slots_len(&self) -> usize; 100 101 /// Allocates fresh space for all capturing groups in this regex. locations(&self) -> Locations102 fn locations(&self) -> Locations { 103 Locations(vec![None; self.slots_len()]) 104 } 105 106 /// Returns the position of the next character after `i`. 107 /// 108 /// For example, a haystack with type `&[u8]` probably returns `i+1`, 109 /// whereas a haystack with type `&str` probably returns `i` plus the 110 /// length of the next UTF-8 sequence. next_after_empty(&self, text: &Self::Text, i: usize) -> usize111 fn next_after_empty(&self, text: &Self::Text, i: usize) -> usize; 112 113 /// Returns the location of the shortest match. shortest_match_at( &self, text: &Self::Text, start: usize, ) -> Option<usize>114 fn shortest_match_at( 115 &self, 116 text: &Self::Text, 117 start: usize, 118 ) -> Option<usize>; 119 120 /// Returns whether the regex matches the text given. is_match_at(&self, text: &Self::Text, start: usize) -> bool121 fn is_match_at(&self, text: &Self::Text, start: usize) -> bool; 122 123 /// Returns the leftmost-first match location if one exists. find_at( &self, text: &Self::Text, start: usize, ) -> Option<(usize, usize)>124 fn find_at( 125 &self, 126 text: &Self::Text, 127 start: usize, 128 ) -> Option<(usize, usize)>; 129 130 /// Returns the leftmost-first match location if one exists, and also 131 /// fills in any matching capture slot locations. captures_read_at( &self, locs: &mut Locations, text: &Self::Text, start: usize, ) -> Option<(usize, usize)>132 fn captures_read_at( 133 &self, 134 locs: &mut Locations, 135 text: &Self::Text, 136 start: usize, 137 ) -> Option<(usize, usize)>; 138 139 /// Returns an iterator over all non-overlapping successive leftmost-first 140 /// matches. find_iter(self, text: &Self::Text) -> Matches<'_, Self>141 fn find_iter(self, text: &Self::Text) -> Matches<'_, Self> { 142 Matches { re: self, text: text, last_end: 0, last_match: None } 143 } 144 145 /// Returns an iterator over all non-overlapping successive leftmost-first 146 /// matches with captures. captures_iter(self, text: &Self::Text) -> CaptureMatches<'_, Self>147 fn captures_iter(self, text: &Self::Text) -> CaptureMatches<'_, Self> { 148 CaptureMatches(self.find_iter(text)) 149 } 150 } 151 152 /// An iterator over all non-overlapping successive leftmost-first matches. 153 #[derive(Debug)] 154 pub struct Matches<'t, R> 155 where 156 R: RegularExpression, 157 R::Text: 't, 158 { 159 re: R, 160 text: &'t R::Text, 161 last_end: usize, 162 last_match: Option<usize>, 163 } 164 165 impl<'t, R> Matches<'t, R> 166 where 167 R: RegularExpression, 168 R::Text: 't, 169 { 170 /// Return the text being searched. text(&self) -> &'t R::Text171 pub fn text(&self) -> &'t R::Text { 172 self.text 173 } 174 175 /// Return the underlying regex. regex(&self) -> &R176 pub fn regex(&self) -> &R { 177 &self.re 178 } 179 } 180 181 impl<'t, R> Iterator for Matches<'t, R> 182 where 183 R: RegularExpression, 184 R::Text: 't + AsRef<[u8]>, 185 { 186 type Item = (usize, usize); 187 next(&mut self) -> Option<(usize, usize)>188 fn next(&mut self) -> Option<(usize, usize)> { 189 if self.last_end > self.text.as_ref().len() { 190 return None; 191 } 192 let (s, e) = match self.re.find_at(self.text, self.last_end) { 193 None => return None, 194 Some((s, e)) => (s, e), 195 }; 196 if s == e { 197 // This is an empty match. To ensure we make progress, start 198 // the next search at the smallest possible starting position 199 // of the next match following this one. 200 self.last_end = self.re.next_after_empty(self.text, e); 201 // Don't accept empty matches immediately following a match. 202 // Just move on to the next match. 203 if Some(e) == self.last_match { 204 return self.next(); 205 } 206 } else { 207 self.last_end = e; 208 } 209 self.last_match = Some(e); 210 Some((s, e)) 211 } 212 } 213 214 impl<'t, R> FusedIterator for Matches<'t, R> 215 where 216 R: RegularExpression, 217 R::Text: 't + AsRef<[u8]>, 218 { 219 } 220 221 /// An iterator over all non-overlapping successive leftmost-first matches with 222 /// captures. 223 #[derive(Debug)] 224 pub struct CaptureMatches<'t, R>(Matches<'t, R>) 225 where 226 R: RegularExpression, 227 R::Text: 't; 228 229 impl<'t, R> CaptureMatches<'t, R> 230 where 231 R: RegularExpression, 232 R::Text: 't, 233 { 234 /// Return the text being searched. text(&self) -> &'t R::Text235 pub fn text(&self) -> &'t R::Text { 236 self.0.text() 237 } 238 239 /// Return the underlying regex. regex(&self) -> &R240 pub fn regex(&self) -> &R { 241 self.0.regex() 242 } 243 } 244 245 impl<'t, R> Iterator for CaptureMatches<'t, R> 246 where 247 R: RegularExpression, 248 R::Text: 't + AsRef<[u8]>, 249 { 250 type Item = Locations; 251 next(&mut self) -> Option<Locations>252 fn next(&mut self) -> Option<Locations> { 253 if self.0.last_end > self.0.text.as_ref().len() { 254 return None; 255 } 256 let mut locs = self.0.re.locations(); 257 let (s, e) = match self.0.re.captures_read_at( 258 &mut locs, 259 self.0.text, 260 self.0.last_end, 261 ) { 262 None => return None, 263 Some((s, e)) => (s, e), 264 }; 265 if s == e { 266 self.0.last_end = self.0.re.next_after_empty(self.0.text, e); 267 if Some(e) == self.0.last_match { 268 return self.next(); 269 } 270 } else { 271 self.0.last_end = e; 272 } 273 self.0.last_match = Some(e); 274 Some(locs) 275 } 276 } 277 278 impl<'t, R> FusedIterator for CaptureMatches<'t, R> 279 where 280 R: RegularExpression, 281 R::Text: 't + AsRef<[u8]>, 282 { 283 } 284