1 //! Regex matchers on character and byte streams.
2 //!
3 //! ## Overview
4 //!
5 //! The [`regex`] crate implements regular expression matching on strings and byte
6 //! arrays. However, in order to match the output of implementations of `fmt::Debug`
7 //! and `fmt::Display`, or by any code which writes to an instance of `fmt::Write`
8 //! or `io::Write`, it is necessary to first allocate a buffer, write to that
9 //! buffer, and then match the buffer against a regex.
10 //!
11 //! In cases where it is not necessary to extract substrings, but only to test whether
12 //! or not output matches a regex, it is not strictly necessary to allocate and
13 //! write this output to a buffer. This crate provides a simple interface on top of
14 //! the lower-level [`regex-automata`] library that implements `fmt::Write` and
15 //! `io::Write` for regex patterns. This may be used to test whether streaming
16 //! output matches a pattern without buffering that output.
17 //!
18 //! Users who need to extract substrings based on a pattern or who already have
19 //! buffered data should probably use the [`regex`] crate instead.
20 //!
21 //! ## Syntax
22 //!
23 //! This crate uses the same [regex syntax][syntax] of the `regex-automata` crate.
24 //!
25 //! [`regex`]: https://crates.io/crates/regex
26 //! [`regex-automata`]: https://crates.io/crates/regex-automata
27 //! [syntax]: https://docs.rs/regex-automata/0.1.7/regex_automata/#syntax
28 
29 use regex_automata::{dense, DenseDFA, SparseDFA, StateID, DFA};
30 use std::{fmt, io, marker::PhantomData, str::FromStr};
31 
32 pub use regex_automata::Error;
33 
34 /// A compiled match pattern that can match multipe inputs, or return a
35 /// [`Matcher`] that matches a single input.
36 ///
37 /// [`Matcher`]: ../struct.Matcher.html
38 #[derive(Debug, Clone)]
39 pub struct Pattern<S = usize, A = DenseDFA<Vec<S>, S>>
40 where
41     S: StateID,
42     A: DFA<ID = S>,
43 {
44     automaton: A,
45 }
46 
47 /// A reference to a [`Pattern`] that matches a single input.
48 ///
49 /// [`Pattern`]: ../struct.Pattern.html
50 #[derive(Debug, Clone)]
51 pub struct Matcher<'a, S = usize, A = DenseDFA<&'a [S], S>>
52 where
53     S: StateID,
54     A: DFA<ID = S>,
55 {
56     automaton: A,
57     state: S,
58     _lt: PhantomData<&'a ()>,
59 }
60 
61 // === impl Pattern ===
62 
63 impl Pattern {
64     /// Returns a new `Pattern` for the given regex, or an error if the regex
65     /// was invalid.
66     ///
67     /// The returned `Pattern` will match occurances of the pattern which start
68     /// at *any* in a byte or character stream — the pattern may be preceded by
69     /// any number of non-matching characters. Essentially, it will behave as
70     /// though the regular expression started with a `.*?`, which enables a
71     /// match to appear anywhere. If this is not the desired behavior, use
72     /// [`Pattern::new_anchored`] instead.
73     ///
74     /// For example:
75     /// ```
76     /// use matchers::Pattern;
77     ///
78     /// // This pattern matches any number of `a`s followed by a `b`.
79     /// let pattern = Pattern::new("a+b").expect("regex is not invalid");
80     ///
81     /// // Of course, the pattern matches an input where the entire sequence of
82     /// // characters matches the pattern:
83     /// assert!(pattern.display_matches(&"aaaaab"));
84     ///
85     /// // And, since the pattern is unanchored, it will also match the
86     /// // sequence when it's followed by non-matching characters:
87     /// assert!(pattern.display_matches(&"hello world! aaaaab"));
88     /// ```
new(pattern: &str) -> Result<Self, Error>89     pub fn new(pattern: &str) -> Result<Self, Error> {
90         let automaton = DenseDFA::new(pattern)?;
91         Ok(Pattern { automaton })
92     }
93 
94     /// Returns a new `Pattern` anchored at the beginning of the input stream,
95     /// or an error if the regex was invalid.
96     ///
97     /// The returned `Pattern` will *only* match an occurence of the pattern in
98     /// an input sequence if the first character or byte in the input matches
99     /// the pattern. If this is not the desired behavior, use [`Pattern::new`]
100     /// instead.
101     ///
102     /// For example:
103     /// ```
104     /// use matchers::Pattern;
105     ///
106     /// // This pattern matches any number of `a`s followed by a `b`.
107     /// let pattern = Pattern::new_anchored("a+b")
108     ///     .expect("regex is not invalid");
109     ///
110     /// // The pattern matches an input where the entire sequence of
111     /// // characters matches the pattern:
112     /// assert!(pattern.display_matches(&"aaaaab"));
113     ///
114     /// // Since the pattern is anchored, it will *not* match an input that
115     /// // begins with non-matching characters:
116     /// assert!(!pattern.display_matches(&"hello world! aaaaab"));
117     ///
118     /// // ...however, if we create a pattern beginning with `.*?`, it will:
119     /// let pattern2 = Pattern::new_anchored(".*?a+b")
120     ///     .expect("regex is not invalid");
121     /// assert!(pattern2.display_matches(&"hello world! aaaaab"));
122     /// ```
new_anchored(pattern: &str) -> Result<Self, Error>123     pub fn new_anchored(pattern: &str) -> Result<Self, Error> {
124         let automaton = dense::Builder::new().anchored(true).build(pattern)?;
125         Ok(Pattern { automaton })
126     }
127 }
128 
129 impl FromStr for Pattern {
130     type Err = Error;
from_str(s: &str) -> Result<Self, Self::Err>131     fn from_str(s: &str) -> Result<Self, Self::Err> {
132         Self::new(s)
133     }
134 }
135 
136 impl<S, A> Pattern<S, A>
137 where
138     S: StateID,
139     A: DFA<ID = S>,
140     Self: for<'a> ToMatcher<'a, S>,
141 {
142     /// Returns `true` if this pattern matches the given string.
143     #[inline]
matches(&self, s: &impl AsRef<str>) -> bool144     pub fn matches(&self, s: &impl AsRef<str>) -> bool {
145         self.matcher().matches(s)
146     }
147 
148     /// Returns `true` if this pattern matches the formatted output of the given
149     /// type implementing `fmt::Debug`.
150     ///
151     /// For example:
152     /// ```rust
153     /// use matchers::Pattern;
154     ///
155     /// #[derive(Debug)]
156     /// pub struct Hello {
157     ///     to: &'static str,
158     /// }
159     ///
160     /// let pattern = Pattern::new(r#"Hello \{ to: "W[^"]*" \}"#).unwrap();
161     ///
162     /// let hello_world = Hello { to: "World" };
163     /// assert!(pattern.debug_matches(&hello_world));
164     ///
165     /// let hello_sf = Hello { to: "San Francisco" };
166     /// assert_eq!(pattern.debug_matches(&hello_sf), false);
167     ///
168     /// let hello_washington = Hello { to: "Washington" };
169     /// assert!(pattern.debug_matches(&hello_washington));
170     /// ```
171     #[inline]
debug_matches(&self, d: &impl fmt::Debug) -> bool172     pub fn debug_matches(&self, d: &impl fmt::Debug) -> bool {
173         self.matcher().debug_matches(d)
174     }
175 
176     /// Returns `true` if this pattern matches the formatted output of the given
177     /// type implementing `fmt::Display`.
178     ///
179     /// For example:
180     /// ```rust
181     /// # use std::fmt;
182     /// use matchers::Pattern;
183     ///
184     /// #[derive(Debug)]
185     /// pub struct Hello {
186     ///     to: &'static str,
187     /// }
188     ///
189     /// impl fmt::Display for Hello {
190     ///     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
191     ///         write!(f, "Hello {}", self.to)
192     ///     }
193     /// }
194     ///
195     /// let pattern = Pattern::new("Hello [Ww].+").unwrap();
196     ///
197     /// let hello_world = Hello { to: "world" };
198     /// assert!(pattern.display_matches(&hello_world));
199     /// assert_eq!(pattern.debug_matches(&hello_world), false);
200     ///
201     /// let hello_sf = Hello { to: "San Francisco" };
202     /// assert_eq!(pattern.display_matches(&hello_sf), false);
203     ///
204     /// let hello_washington = Hello { to: "Washington" };
205     /// assert!(pattern.display_matches(&hello_washington));
206     /// ```
207     #[inline]
display_matches(&self, d: &impl fmt::Display) -> bool208     pub fn display_matches(&self, d: &impl fmt::Display) -> bool {
209         self.matcher().display_matches(d)
210     }
211 
212     /// Returns either a `bool` indicating whether or not this pattern matches the
213     /// data read from the provided `io::Read` stream, or an `io::Error` if an
214     /// error occurred reading from the stream.
215     #[inline]
read_matches(&self, io: impl io::Read) -> io::Result<bool>216     pub fn read_matches(&self, io: impl io::Read) -> io::Result<bool> {
217         self.matcher().read_matches(io)
218     }
219 }
220 
221 // === impl Matcher ===
222 
223 impl<'a, S, A> Matcher<'a, S, A>
224 where
225     S: StateID,
226     A: DFA<ID = S>,
227 {
new(automaton: A) -> Self228     fn new(automaton: A) -> Self {
229         let state = automaton.start_state();
230         Self {
231             automaton,
232             state,
233             _lt: PhantomData,
234         }
235     }
236 
237     #[inline]
advance(&mut self, input: u8)238     fn advance(&mut self, input: u8) {
239         self.state = unsafe {
240             // It's safe to call `next_state_unchecked` since the matcher may
241             // only be constructed by a `Pattern`, which, in turn,can only be
242             // constructed with a valid DFA.
243             self.automaton.next_state_unchecked(self.state, input)
244         };
245     }
246 
247     /// Returns `true` if this `Matcher` has matched any input that has been
248     /// provided.
249     #[inline]
is_matched(&self) -> bool250     pub fn is_matched(&self) -> bool {
251         self.automaton.is_match_state(self.state)
252     }
253 
254     /// Returns `true` if this pattern matches the formatted output of the given
255     /// type implementing `fmt::Debug`.
matches(mut self, s: &impl AsRef<str>) -> bool256     pub fn matches(mut self, s: &impl AsRef<str>) -> bool {
257         for &byte in s.as_ref().as_bytes() {
258             self.advance(byte);
259             if self.automaton.is_dead_state(self.state) {
260                 return false;
261             }
262         }
263         self.is_matched()
264     }
265 
266     /// Returns `true` if this pattern matches the formatted output of the given
267     /// type implementing `fmt::Debug`.
debug_matches(mut self, d: &impl fmt::Debug) -> bool268     pub fn debug_matches(mut self, d: &impl fmt::Debug) -> bool {
269         use std::fmt::Write;
270         write!(&mut self, "{:?}", d).expect("matcher write impl should not fail");
271         self.is_matched()
272     }
273 
274     /// Returns `true` if this pattern matches the formatted output of the given
275     /// type implementing `fmt::Display`.
display_matches(mut self, d: &impl fmt::Display) -> bool276     pub fn display_matches(mut self, d: &impl fmt::Display) -> bool {
277         use std::fmt::Write;
278         write!(&mut self, "{}", d).expect("matcher write impl should not fail");
279         self.is_matched()
280     }
281 
282     /// Returns either a `bool` indicating whether or not this pattern matches the
283     /// data read from the provided `io::Read` stream, or an `io::Error` if an
284     /// error occurred reading from the stream.
read_matches(mut self, io: impl io::Read + Sized) -> io::Result<bool>285     pub fn read_matches(mut self, io: impl io::Read + Sized) -> io::Result<bool> {
286         for r in io.bytes() {
287             self.advance(r?);
288             if self.automaton.is_dead_state(self.state) {
289                 return Ok(false);
290             }
291         }
292         Ok(self.is_matched())
293     }
294 }
295 
296 impl<'a, S, A> fmt::Write for Matcher<'a, S, A>
297 where
298     S: StateID,
299     A: DFA<ID = S>,
300 {
write_str(&mut self, s: &str) -> fmt::Result301     fn write_str(&mut self, s: &str) -> fmt::Result {
302         for &byte in s.as_bytes() {
303             self.advance(byte);
304             if self.automaton.is_dead_state(self.state) {
305                 break;
306             }
307         }
308         Ok(())
309     }
310 }
311 
312 impl<'a, S, A> io::Write for Matcher<'a, S, A>
313 where
314     S: StateID,
315     A: DFA<ID = S>,
316 {
write(&mut self, bytes: &[u8]) -> Result<usize, io::Error>317     fn write(&mut self, bytes: &[u8]) -> Result<usize, io::Error> {
318         let mut i = 0;
319         for &byte in bytes {
320             self.advance(byte);
321             i += 1;
322             if self.automaton.is_dead_state(self.state) {
323                 break;
324             }
325         }
326         Ok(i)
327     }
328 
flush(&mut self) -> Result<(), io::Error>329     fn flush(&mut self) -> Result<(), io::Error> {
330         Ok(())
331     }
332 }
333 
334 pub trait ToMatcher<'a, S>
335 where
336     Self: crate::sealed::Sealed,
337     S: StateID + 'a,
338 {
339     type Automaton: DFA<ID = S>;
matcher(&'a self) -> Matcher<'a, S, Self::Automaton>340     fn matcher(&'a self) -> Matcher<'a, S, Self::Automaton>;
341 }
342 
343 impl<S> crate::sealed::Sealed for Pattern<S, DenseDFA<Vec<S>, S>> where S: StateID {}
344 
345 impl<'a, S> ToMatcher<'a, S> for Pattern<S, DenseDFA<Vec<S>, S>>
346 where
347     S: StateID + 'a,
348 {
349     type Automaton = DenseDFA<&'a [S], S>;
matcher(&'a self) -> Matcher<'a, S, Self::Automaton>350     fn matcher(&'a self) -> Matcher<'a, S, Self::Automaton> {
351         Matcher::new(self.automaton.as_ref())
352     }
353 }
354 
355 impl<'a, S> ToMatcher<'a, S> for Pattern<S, SparseDFA<Vec<u8>, S>>
356 where
357     S: StateID + 'a,
358 {
359     type Automaton = SparseDFA<&'a [u8], S>;
matcher(&'a self) -> Matcher<'a, S, Self::Automaton>360     fn matcher(&'a self) -> Matcher<'a, S, Self::Automaton> {
361         Matcher::new(self.automaton.as_ref())
362     }
363 }
364 
365 impl<S> crate::sealed::Sealed for Pattern<S, SparseDFA<Vec<u8>, S>> where S: StateID {}
366 
367 mod sealed {
368     pub trait Sealed {}
369 }
370 
371 #[cfg(test)]
372 mod test {
373     use super::*;
374 
375     struct Str<'a>(&'a str);
376     struct ReadStr<'a>(io::Cursor<&'a [u8]>);
377 
378     impl<'a> fmt::Debug for Str<'a> {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result379         fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
380             write!(f, "{}", self.0)
381         }
382     }
383 
384     impl<'a> fmt::Display for Str<'a> {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result385         fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
386             write!(f, "{}", self.0)
387         }
388     }
389 
390     impl<'a> io::Read for ReadStr<'a> {
read(&mut self, buf: &mut [u8]) -> io::Result<usize>391         fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> {
392             self.0.read(buf)
393         }
394     }
395 
396     impl Str<'static> {
hello_world() -> Self397         fn hello_world() -> Self {
398             Self::new("hello world")
399         }
400     }
401 
402     impl<'a> Str<'a> {
new(s: &'a str) -> Self403         fn new(s: &'a str) -> Self {
404             Str(s)
405         }
406 
to_reader(self) -> ReadStr<'a>407         fn to_reader(self) -> ReadStr<'a> {
408             ReadStr(io::Cursor::new(self.0.as_bytes()))
409         }
410     }
411 
test_debug_matches(new_pattern: impl Fn(&str) -> Result<Pattern, Error>)412     fn test_debug_matches(new_pattern: impl Fn(&str) -> Result<Pattern, Error>) {
413         let pat = new_pattern("hello world").unwrap();
414         assert!(pat.debug_matches(&Str::hello_world()));
415 
416         let pat = new_pattern("hel+o w[orl]{3}d").unwrap();
417         assert!(pat.debug_matches(&Str::hello_world()));
418 
419         let pat = new_pattern("goodbye world").unwrap();
420         assert_eq!(pat.debug_matches(&Str::hello_world()), false);
421     }
422 
test_display_matches(new_pattern: impl Fn(&str) -> Result<Pattern, Error>)423     fn test_display_matches(new_pattern: impl Fn(&str) -> Result<Pattern, Error>) {
424         let pat = new_pattern("hello world").unwrap();
425         assert!(pat.display_matches(&Str::hello_world()));
426 
427         let pat = new_pattern("hel+o w[orl]{3}d").unwrap();
428         assert!(pat.display_matches(&Str::hello_world()));
429 
430         let pat = new_pattern("goodbye world").unwrap();
431         assert_eq!(pat.display_matches(&Str::hello_world()), false);
432     }
433 
test_reader_matches(new_pattern: impl Fn(&str) -> Result<Pattern, Error>)434     fn test_reader_matches(new_pattern: impl Fn(&str) -> Result<Pattern, Error>) {
435         let pat = new_pattern("hello world").unwrap();
436         assert!(pat
437             .read_matches(Str::hello_world().to_reader())
438             .expect("no io error should occur"));
439 
440         let pat = new_pattern("hel+o w[orl]{3}d").unwrap();
441         assert!(pat
442             .read_matches(Str::hello_world().to_reader())
443             .expect("no io error should occur"));
444 
445         let pat = new_pattern("goodbye world").unwrap();
446         assert_eq!(
447             pat.read_matches(Str::hello_world().to_reader())
448                 .expect("no io error should occur"),
449             false
450         );
451     }
452 
test_debug_rep_patterns(new_pattern: impl Fn(&str) -> Result<Pattern, Error>)453     fn test_debug_rep_patterns(new_pattern: impl Fn(&str) -> Result<Pattern, Error>) {
454         let pat = new_pattern("a+b").unwrap();
455         assert!(pat.debug_matches(&Str::new("ab")));
456         assert!(pat.debug_matches(&Str::new("aaaab")));
457         assert!(pat.debug_matches(&Str::new("aaaaaaaaaab")));
458         assert_eq!(pat.debug_matches(&Str::new("b")), false);
459         assert_eq!(pat.debug_matches(&Str::new("abb")), false);
460         assert_eq!(pat.debug_matches(&Str::new("aaaaabb")), false);
461     }
462 
463     mod anchored {
464         use super::*;
465         #[test]
debug_matches()466         fn debug_matches() {
467             test_debug_matches(Pattern::new_anchored)
468         }
469 
470         #[test]
display_matches()471         fn display_matches() {
472             test_display_matches(Pattern::new_anchored)
473         }
474 
475         #[test]
reader_matches()476         fn reader_matches() {
477             test_reader_matches(Pattern::new_anchored)
478         }
479 
480         #[test]
debug_rep_patterns()481         fn debug_rep_patterns() {
482             test_debug_rep_patterns(Pattern::new_anchored)
483         }
484 
485         // === anchored behavior =============================================
486         // Tests that anchored patterns match each input type only beginning at
487         // the first character.
test_is_anchored(f: impl Fn(&Pattern, Str) -> bool)488         fn test_is_anchored(f: impl Fn(&Pattern, Str) -> bool) {
489             let pat = Pattern::new_anchored("a+b").unwrap();
490             assert!(f(&pat, Str::new("ab")));
491             assert!(f(&pat, Str::new("aaaab")));
492             assert!(f(&pat, Str::new("aaaaaaaaaab")));
493             assert!(!f(&pat, Str::new("bab")));
494             assert!(!f(&pat, Str::new("ffab")));
495             assert!(!f(&pat, Str::new("qqqqqqqaaaaab")));
496         }
497 
498         #[test]
debug_is_anchored()499         fn debug_is_anchored() {
500             test_is_anchored(|pat, input| pat.debug_matches(&input))
501         }
502 
503         #[test]
display_is_anchored()504         fn display_is_anchored() {
505             test_is_anchored(|pat, input| pat.display_matches(&input));
506         }
507 
508         #[test]
reader_is_anchored()509         fn reader_is_anchored() {
510             test_is_anchored(|pat, input| {
511                 pat.read_matches(input.to_reader())
512                     .expect("no io error occurs")
513             });
514         }
515 
516         // === explicitly unanchored =========================================
517         // Tests that if an "anchored" pattern begins with `.*?`, it matches as
518         // though it was unanchored.
test_explicitly_unanchored(f: impl Fn(&Pattern, Str) -> bool)519         fn test_explicitly_unanchored(f: impl Fn(&Pattern, Str) -> bool) {
520             let pat = Pattern::new_anchored(".*?a+b").unwrap();
521             assert!(f(&pat, Str::new("ab")));
522             assert!(f(&pat, Str::new("aaaab")));
523             assert!(f(&pat, Str::new("aaaaaaaaaab")));
524             assert!(f(&pat, Str::new("bab")));
525             assert!(f(&pat, Str::new("ffab")));
526             assert!(f(&pat, Str::new("qqqqqqqaaaaab")));
527         }
528 
529         #[test]
debug_explicitly_unanchored()530         fn debug_explicitly_unanchored() {
531             test_explicitly_unanchored(|pat, input| pat.debug_matches(&input))
532         }
533 
534         #[test]
display_explicitly_unanchored()535         fn display_explicitly_unanchored() {
536             test_explicitly_unanchored(|pat, input| pat.display_matches(&input));
537         }
538 
539         #[test]
reader_explicitly_unanchored()540         fn reader_explicitly_unanchored() {
541             test_explicitly_unanchored(|pat, input| {
542                 pat.read_matches(input.to_reader())
543                     .expect("no io error occurs")
544             });
545         }
546     }
547 
548     mod unanchored {
549         use super::*;
550         #[test]
debug_matches()551         fn debug_matches() {
552             test_debug_matches(Pattern::new)
553         }
554 
555         #[test]
display_matches()556         fn display_matches() {
557             test_display_matches(Pattern::new)
558         }
559 
560         #[test]
reader_matches()561         fn reader_matches() {
562             test_reader_matches(Pattern::new)
563         }
564 
565         #[test]
debug_rep_patterns()566         fn debug_rep_patterns() {
567             test_debug_rep_patterns(Pattern::new)
568         }
569 
570         // === anchored behavior =============================================
571         // Tests that unanchored patterns match anywhere in the input stream.
test_is_unanchored(f: impl Fn(&Pattern, Str) -> bool)572         fn test_is_unanchored(f: impl Fn(&Pattern, Str) -> bool) {
573             let pat = Pattern::new("a+b").unwrap();
574             assert!(f(&pat, Str::new("ab")));
575             assert!(f(&pat, Str::new("aaaab")));
576             assert!(f(&pat, Str::new("aaaaaaaaaab")));
577             assert!(f(&pat, Str::new("bab")));
578             assert!(f(&pat, Str::new("ffab")));
579             assert!(f(&pat, Str::new("qqqfqqqqaaaaab")));
580         }
581 
582         #[test]
debug_is_unanchored()583         fn debug_is_unanchored() {
584             test_is_unanchored(|pat, input| pat.debug_matches(&input))
585         }
586 
587         #[test]
display_is_unanchored()588         fn display_is_unanchored() {
589             test_is_unanchored(|pat, input| pat.display_matches(&input));
590         }
591 
592         #[test]
reader_is_unanchored()593         fn reader_is_unanchored() {
594             test_is_unanchored(|pat, input| {
595                 pat.read_matches(input.to_reader())
596                     .expect("no io error occurs")
597             });
598         }
599     }
600 }
601