1 //! Regex matchers on character and byte streams. 2 //! 3 //! ## Overview 4 //! 5 //! The [`regex`] crate implements regular expression matching on strings and byte 6 //! arrays. However, in order to match the output of implementations of `fmt::Debug` 7 //! and `fmt::Display`, or by any code which writes to an instance of `fmt::Write` 8 //! or `io::Write`, it is necessary to first allocate a buffer, write to that 9 //! buffer, and then match the buffer against a regex. 10 //! 11 //! In cases where it is not necessary to extract substrings, but only to test whether 12 //! or not output matches a regex, it is not strictly necessary to allocate and 13 //! write this output to a buffer. This crate provides a simple interface on top of 14 //! the lower-level [`regex-automata`] library that implements `fmt::Write` and 15 //! `io::Write` for regex patterns. This may be used to test whether streaming 16 //! output matches a pattern without buffering that output. 17 //! 18 //! Users who need to extract substrings based on a pattern or who already have 19 //! buffered data should probably use the [`regex`] crate instead. 20 //! 21 //! ## Syntax 22 //! 23 //! This crate uses the same [regex syntax][syntax] of the `regex-automata` crate. 24 //! 25 //! [`regex`]: https://crates.io/crates/regex 26 //! [`regex-automata`]: https://crates.io/crates/regex-automata 27 //! [syntax]: https://docs.rs/regex-automata/0.1.7/regex_automata/#syntax 28 29 use regex_automata::{dense, DenseDFA, SparseDFA, StateID, DFA}; 30 use std::{fmt, io, marker::PhantomData, str::FromStr}; 31 32 pub use regex_automata::Error; 33 34 /// A compiled match pattern that can match multipe inputs, or return a 35 /// [`Matcher`] that matches a single input. 36 /// 37 /// [`Matcher`]: ../struct.Matcher.html 38 #[derive(Debug, Clone)] 39 pub struct Pattern<S = usize, A = DenseDFA<Vec<S>, S>> 40 where 41 S: StateID, 42 A: DFA<ID = S>, 43 { 44 automaton: A, 45 } 46 47 /// A reference to a [`Pattern`] that matches a single input. 48 /// 49 /// [`Pattern`]: ../struct.Pattern.html 50 #[derive(Debug, Clone)] 51 pub struct Matcher<'a, S = usize, A = DenseDFA<&'a [S], S>> 52 where 53 S: StateID, 54 A: DFA<ID = S>, 55 { 56 automaton: A, 57 state: S, 58 _lt: PhantomData<&'a ()>, 59 } 60 61 // === impl Pattern === 62 63 impl Pattern { 64 /// Returns a new `Pattern` for the given regex, or an error if the regex 65 /// was invalid. 66 /// 67 /// The returned `Pattern` will match occurances of the pattern which start 68 /// at *any* in a byte or character stream — the pattern may be preceded by 69 /// any number of non-matching characters. Essentially, it will behave as 70 /// though the regular expression started with a `.*?`, which enables a 71 /// match to appear anywhere. If this is not the desired behavior, use 72 /// [`Pattern::new_anchored`] instead. 73 /// 74 /// For example: 75 /// ``` 76 /// use matchers::Pattern; 77 /// 78 /// // This pattern matches any number of `a`s followed by a `b`. 79 /// let pattern = Pattern::new("a+b").expect("regex is not invalid"); 80 /// 81 /// // Of course, the pattern matches an input where the entire sequence of 82 /// // characters matches the pattern: 83 /// assert!(pattern.display_matches(&"aaaaab")); 84 /// 85 /// // And, since the pattern is unanchored, it will also match the 86 /// // sequence when it's followed by non-matching characters: 87 /// assert!(pattern.display_matches(&"hello world! aaaaab")); 88 /// ``` new(pattern: &str) -> Result<Self, Error>89 pub fn new(pattern: &str) -> Result<Self, Error> { 90 let automaton = DenseDFA::new(pattern)?; 91 Ok(Pattern { automaton }) 92 } 93 94 /// Returns a new `Pattern` anchored at the beginning of the input stream, 95 /// or an error if the regex was invalid. 96 /// 97 /// The returned `Pattern` will *only* match an occurence of the pattern in 98 /// an input sequence if the first character or byte in the input matches 99 /// the pattern. If this is not the desired behavior, use [`Pattern::new`] 100 /// instead. 101 /// 102 /// For example: 103 /// ``` 104 /// use matchers::Pattern; 105 /// 106 /// // This pattern matches any number of `a`s followed by a `b`. 107 /// let pattern = Pattern::new_anchored("a+b") 108 /// .expect("regex is not invalid"); 109 /// 110 /// // The pattern matches an input where the entire sequence of 111 /// // characters matches the pattern: 112 /// assert!(pattern.display_matches(&"aaaaab")); 113 /// 114 /// // Since the pattern is anchored, it will *not* match an input that 115 /// // begins with non-matching characters: 116 /// assert!(!pattern.display_matches(&"hello world! aaaaab")); 117 /// 118 /// // ...however, if we create a pattern beginning with `.*?`, it will: 119 /// let pattern2 = Pattern::new_anchored(".*?a+b") 120 /// .expect("regex is not invalid"); 121 /// assert!(pattern2.display_matches(&"hello world! aaaaab")); 122 /// ``` new_anchored(pattern: &str) -> Result<Self, Error>123 pub fn new_anchored(pattern: &str) -> Result<Self, Error> { 124 let automaton = dense::Builder::new().anchored(true).build(pattern)?; 125 Ok(Pattern { automaton }) 126 } 127 } 128 129 impl FromStr for Pattern { 130 type Err = Error; from_str(s: &str) -> Result<Self, Self::Err>131 fn from_str(s: &str) -> Result<Self, Self::Err> { 132 Self::new(s) 133 } 134 } 135 136 impl<S, A> Pattern<S, A> 137 where 138 S: StateID, 139 A: DFA<ID = S>, 140 Self: for<'a> ToMatcher<'a, S>, 141 { 142 /// Returns `true` if this pattern matches the given string. 143 #[inline] matches(&self, s: &impl AsRef<str>) -> bool144 pub fn matches(&self, s: &impl AsRef<str>) -> bool { 145 self.matcher().matches(s) 146 } 147 148 /// Returns `true` if this pattern matches the formatted output of the given 149 /// type implementing `fmt::Debug`. 150 /// 151 /// For example: 152 /// ```rust 153 /// use matchers::Pattern; 154 /// 155 /// #[derive(Debug)] 156 /// pub struct Hello { 157 /// to: &'static str, 158 /// } 159 /// 160 /// let pattern = Pattern::new(r#"Hello \{ to: "W[^"]*" \}"#).unwrap(); 161 /// 162 /// let hello_world = Hello { to: "World" }; 163 /// assert!(pattern.debug_matches(&hello_world)); 164 /// 165 /// let hello_sf = Hello { to: "San Francisco" }; 166 /// assert_eq!(pattern.debug_matches(&hello_sf), false); 167 /// 168 /// let hello_washington = Hello { to: "Washington" }; 169 /// assert!(pattern.debug_matches(&hello_washington)); 170 /// ``` 171 #[inline] debug_matches(&self, d: &impl fmt::Debug) -> bool172 pub fn debug_matches(&self, d: &impl fmt::Debug) -> bool { 173 self.matcher().debug_matches(d) 174 } 175 176 /// Returns `true` if this pattern matches the formatted output of the given 177 /// type implementing `fmt::Display`. 178 /// 179 /// For example: 180 /// ```rust 181 /// # use std::fmt; 182 /// use matchers::Pattern; 183 /// 184 /// #[derive(Debug)] 185 /// pub struct Hello { 186 /// to: &'static str, 187 /// } 188 /// 189 /// impl fmt::Display for Hello { 190 /// fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 191 /// write!(f, "Hello {}", self.to) 192 /// } 193 /// } 194 /// 195 /// let pattern = Pattern::new("Hello [Ww].+").unwrap(); 196 /// 197 /// let hello_world = Hello { to: "world" }; 198 /// assert!(pattern.display_matches(&hello_world)); 199 /// assert_eq!(pattern.debug_matches(&hello_world), false); 200 /// 201 /// let hello_sf = Hello { to: "San Francisco" }; 202 /// assert_eq!(pattern.display_matches(&hello_sf), false); 203 /// 204 /// let hello_washington = Hello { to: "Washington" }; 205 /// assert!(pattern.display_matches(&hello_washington)); 206 /// ``` 207 #[inline] display_matches(&self, d: &impl fmt::Display) -> bool208 pub fn display_matches(&self, d: &impl fmt::Display) -> bool { 209 self.matcher().display_matches(d) 210 } 211 212 /// Returns either a `bool` indicating whether or not this pattern matches the 213 /// data read from the provided `io::Read` stream, or an `io::Error` if an 214 /// error occurred reading from the stream. 215 #[inline] read_matches(&self, io: impl io::Read) -> io::Result<bool>216 pub fn read_matches(&self, io: impl io::Read) -> io::Result<bool> { 217 self.matcher().read_matches(io) 218 } 219 } 220 221 // === impl Matcher === 222 223 impl<'a, S, A> Matcher<'a, S, A> 224 where 225 S: StateID, 226 A: DFA<ID = S>, 227 { new(automaton: A) -> Self228 fn new(automaton: A) -> Self { 229 let state = automaton.start_state(); 230 Self { 231 automaton, 232 state, 233 _lt: PhantomData, 234 } 235 } 236 237 #[inline] advance(&mut self, input: u8)238 fn advance(&mut self, input: u8) { 239 self.state = unsafe { 240 // It's safe to call `next_state_unchecked` since the matcher may 241 // only be constructed by a `Pattern`, which, in turn,can only be 242 // constructed with a valid DFA. 243 self.automaton.next_state_unchecked(self.state, input) 244 }; 245 } 246 247 /// Returns `true` if this `Matcher` has matched any input that has been 248 /// provided. 249 #[inline] is_matched(&self) -> bool250 pub fn is_matched(&self) -> bool { 251 self.automaton.is_match_state(self.state) 252 } 253 254 /// Returns `true` if this pattern matches the formatted output of the given 255 /// type implementing `fmt::Debug`. matches(mut self, s: &impl AsRef<str>) -> bool256 pub fn matches(mut self, s: &impl AsRef<str>) -> bool { 257 for &byte in s.as_ref().as_bytes() { 258 self.advance(byte); 259 if self.automaton.is_dead_state(self.state) { 260 return false; 261 } 262 } 263 self.is_matched() 264 } 265 266 /// Returns `true` if this pattern matches the formatted output of the given 267 /// type implementing `fmt::Debug`. debug_matches(mut self, d: &impl fmt::Debug) -> bool268 pub fn debug_matches(mut self, d: &impl fmt::Debug) -> bool { 269 use std::fmt::Write; 270 write!(&mut self, "{:?}", d).expect("matcher write impl should not fail"); 271 self.is_matched() 272 } 273 274 /// Returns `true` if this pattern matches the formatted output of the given 275 /// type implementing `fmt::Display`. display_matches(mut self, d: &impl fmt::Display) -> bool276 pub fn display_matches(mut self, d: &impl fmt::Display) -> bool { 277 use std::fmt::Write; 278 write!(&mut self, "{}", d).expect("matcher write impl should not fail"); 279 self.is_matched() 280 } 281 282 /// Returns either a `bool` indicating whether or not this pattern matches the 283 /// data read from the provided `io::Read` stream, or an `io::Error` if an 284 /// error occurred reading from the stream. read_matches(mut self, io: impl io::Read + Sized) -> io::Result<bool>285 pub fn read_matches(mut self, io: impl io::Read + Sized) -> io::Result<bool> { 286 for r in io.bytes() { 287 self.advance(r?); 288 if self.automaton.is_dead_state(self.state) { 289 return Ok(false); 290 } 291 } 292 Ok(self.is_matched()) 293 } 294 } 295 296 impl<'a, S, A> fmt::Write for Matcher<'a, S, A> 297 where 298 S: StateID, 299 A: DFA<ID = S>, 300 { write_str(&mut self, s: &str) -> fmt::Result301 fn write_str(&mut self, s: &str) -> fmt::Result { 302 for &byte in s.as_bytes() { 303 self.advance(byte); 304 if self.automaton.is_dead_state(self.state) { 305 break; 306 } 307 } 308 Ok(()) 309 } 310 } 311 312 impl<'a, S, A> io::Write for Matcher<'a, S, A> 313 where 314 S: StateID, 315 A: DFA<ID = S>, 316 { write(&mut self, bytes: &[u8]) -> Result<usize, io::Error>317 fn write(&mut self, bytes: &[u8]) -> Result<usize, io::Error> { 318 let mut i = 0; 319 for &byte in bytes { 320 self.advance(byte); 321 i += 1; 322 if self.automaton.is_dead_state(self.state) { 323 break; 324 } 325 } 326 Ok(i) 327 } 328 flush(&mut self) -> Result<(), io::Error>329 fn flush(&mut self) -> Result<(), io::Error> { 330 Ok(()) 331 } 332 } 333 334 pub trait ToMatcher<'a, S> 335 where 336 Self: crate::sealed::Sealed, 337 S: StateID + 'a, 338 { 339 type Automaton: DFA<ID = S>; matcher(&'a self) -> Matcher<'a, S, Self::Automaton>340 fn matcher(&'a self) -> Matcher<'a, S, Self::Automaton>; 341 } 342 343 impl<S> crate::sealed::Sealed for Pattern<S, DenseDFA<Vec<S>, S>> where S: StateID {} 344 345 impl<'a, S> ToMatcher<'a, S> for Pattern<S, DenseDFA<Vec<S>, S>> 346 where 347 S: StateID + 'a, 348 { 349 type Automaton = DenseDFA<&'a [S], S>; matcher(&'a self) -> Matcher<'a, S, Self::Automaton>350 fn matcher(&'a self) -> Matcher<'a, S, Self::Automaton> { 351 Matcher::new(self.automaton.as_ref()) 352 } 353 } 354 355 impl<'a, S> ToMatcher<'a, S> for Pattern<S, SparseDFA<Vec<u8>, S>> 356 where 357 S: StateID + 'a, 358 { 359 type Automaton = SparseDFA<&'a [u8], S>; matcher(&'a self) -> Matcher<'a, S, Self::Automaton>360 fn matcher(&'a self) -> Matcher<'a, S, Self::Automaton> { 361 Matcher::new(self.automaton.as_ref()) 362 } 363 } 364 365 impl<S> crate::sealed::Sealed for Pattern<S, SparseDFA<Vec<u8>, S>> where S: StateID {} 366 367 mod sealed { 368 pub trait Sealed {} 369 } 370 371 #[cfg(test)] 372 mod test { 373 use super::*; 374 375 struct Str<'a>(&'a str); 376 struct ReadStr<'a>(io::Cursor<&'a [u8]>); 377 378 impl<'a> fmt::Debug for Str<'a> { fmt(&self, f: &mut fmt::Formatter) -> fmt::Result379 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 380 write!(f, "{}", self.0) 381 } 382 } 383 384 impl<'a> fmt::Display for Str<'a> { fmt(&self, f: &mut fmt::Formatter) -> fmt::Result385 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { 386 write!(f, "{}", self.0) 387 } 388 } 389 390 impl<'a> io::Read for ReadStr<'a> { read(&mut self, buf: &mut [u8]) -> io::Result<usize>391 fn read(&mut self, buf: &mut [u8]) -> io::Result<usize> { 392 self.0.read(buf) 393 } 394 } 395 396 impl Str<'static> { hello_world() -> Self397 fn hello_world() -> Self { 398 Self::new("hello world") 399 } 400 } 401 402 impl<'a> Str<'a> { new(s: &'a str) -> Self403 fn new(s: &'a str) -> Self { 404 Str(s) 405 } 406 to_reader(self) -> ReadStr<'a>407 fn to_reader(self) -> ReadStr<'a> { 408 ReadStr(io::Cursor::new(self.0.as_bytes())) 409 } 410 } 411 test_debug_matches(new_pattern: impl Fn(&str) -> Result<Pattern, Error>)412 fn test_debug_matches(new_pattern: impl Fn(&str) -> Result<Pattern, Error>) { 413 let pat = new_pattern("hello world").unwrap(); 414 assert!(pat.debug_matches(&Str::hello_world())); 415 416 let pat = new_pattern("hel+o w[orl]{3}d").unwrap(); 417 assert!(pat.debug_matches(&Str::hello_world())); 418 419 let pat = new_pattern("goodbye world").unwrap(); 420 assert_eq!(pat.debug_matches(&Str::hello_world()), false); 421 } 422 test_display_matches(new_pattern: impl Fn(&str) -> Result<Pattern, Error>)423 fn test_display_matches(new_pattern: impl Fn(&str) -> Result<Pattern, Error>) { 424 let pat = new_pattern("hello world").unwrap(); 425 assert!(pat.display_matches(&Str::hello_world())); 426 427 let pat = new_pattern("hel+o w[orl]{3}d").unwrap(); 428 assert!(pat.display_matches(&Str::hello_world())); 429 430 let pat = new_pattern("goodbye world").unwrap(); 431 assert_eq!(pat.display_matches(&Str::hello_world()), false); 432 } 433 test_reader_matches(new_pattern: impl Fn(&str) -> Result<Pattern, Error>)434 fn test_reader_matches(new_pattern: impl Fn(&str) -> Result<Pattern, Error>) { 435 let pat = new_pattern("hello world").unwrap(); 436 assert!(pat 437 .read_matches(Str::hello_world().to_reader()) 438 .expect("no io error should occur")); 439 440 let pat = new_pattern("hel+o w[orl]{3}d").unwrap(); 441 assert!(pat 442 .read_matches(Str::hello_world().to_reader()) 443 .expect("no io error should occur")); 444 445 let pat = new_pattern("goodbye world").unwrap(); 446 assert_eq!( 447 pat.read_matches(Str::hello_world().to_reader()) 448 .expect("no io error should occur"), 449 false 450 ); 451 } 452 test_debug_rep_patterns(new_pattern: impl Fn(&str) -> Result<Pattern, Error>)453 fn test_debug_rep_patterns(new_pattern: impl Fn(&str) -> Result<Pattern, Error>) { 454 let pat = new_pattern("a+b").unwrap(); 455 assert!(pat.debug_matches(&Str::new("ab"))); 456 assert!(pat.debug_matches(&Str::new("aaaab"))); 457 assert!(pat.debug_matches(&Str::new("aaaaaaaaaab"))); 458 assert_eq!(pat.debug_matches(&Str::new("b")), false); 459 assert_eq!(pat.debug_matches(&Str::new("abb")), false); 460 assert_eq!(pat.debug_matches(&Str::new("aaaaabb")), false); 461 } 462 463 mod anchored { 464 use super::*; 465 #[test] debug_matches()466 fn debug_matches() { 467 test_debug_matches(Pattern::new_anchored) 468 } 469 470 #[test] display_matches()471 fn display_matches() { 472 test_display_matches(Pattern::new_anchored) 473 } 474 475 #[test] reader_matches()476 fn reader_matches() { 477 test_reader_matches(Pattern::new_anchored) 478 } 479 480 #[test] debug_rep_patterns()481 fn debug_rep_patterns() { 482 test_debug_rep_patterns(Pattern::new_anchored) 483 } 484 485 // === anchored behavior ============================================= 486 // Tests that anchored patterns match each input type only beginning at 487 // the first character. test_is_anchored(f: impl Fn(&Pattern, Str) -> bool)488 fn test_is_anchored(f: impl Fn(&Pattern, Str) -> bool) { 489 let pat = Pattern::new_anchored("a+b").unwrap(); 490 assert!(f(&pat, Str::new("ab"))); 491 assert!(f(&pat, Str::new("aaaab"))); 492 assert!(f(&pat, Str::new("aaaaaaaaaab"))); 493 assert!(!f(&pat, Str::new("bab"))); 494 assert!(!f(&pat, Str::new("ffab"))); 495 assert!(!f(&pat, Str::new("qqqqqqqaaaaab"))); 496 } 497 498 #[test] debug_is_anchored()499 fn debug_is_anchored() { 500 test_is_anchored(|pat, input| pat.debug_matches(&input)) 501 } 502 503 #[test] display_is_anchored()504 fn display_is_anchored() { 505 test_is_anchored(|pat, input| pat.display_matches(&input)); 506 } 507 508 #[test] reader_is_anchored()509 fn reader_is_anchored() { 510 test_is_anchored(|pat, input| { 511 pat.read_matches(input.to_reader()) 512 .expect("no io error occurs") 513 }); 514 } 515 516 // === explicitly unanchored ========================================= 517 // Tests that if an "anchored" pattern begins with `.*?`, it matches as 518 // though it was unanchored. test_explicitly_unanchored(f: impl Fn(&Pattern, Str) -> bool)519 fn test_explicitly_unanchored(f: impl Fn(&Pattern, Str) -> bool) { 520 let pat = Pattern::new_anchored(".*?a+b").unwrap(); 521 assert!(f(&pat, Str::new("ab"))); 522 assert!(f(&pat, Str::new("aaaab"))); 523 assert!(f(&pat, Str::new("aaaaaaaaaab"))); 524 assert!(f(&pat, Str::new("bab"))); 525 assert!(f(&pat, Str::new("ffab"))); 526 assert!(f(&pat, Str::new("qqqqqqqaaaaab"))); 527 } 528 529 #[test] debug_explicitly_unanchored()530 fn debug_explicitly_unanchored() { 531 test_explicitly_unanchored(|pat, input| pat.debug_matches(&input)) 532 } 533 534 #[test] display_explicitly_unanchored()535 fn display_explicitly_unanchored() { 536 test_explicitly_unanchored(|pat, input| pat.display_matches(&input)); 537 } 538 539 #[test] reader_explicitly_unanchored()540 fn reader_explicitly_unanchored() { 541 test_explicitly_unanchored(|pat, input| { 542 pat.read_matches(input.to_reader()) 543 .expect("no io error occurs") 544 }); 545 } 546 } 547 548 mod unanchored { 549 use super::*; 550 #[test] debug_matches()551 fn debug_matches() { 552 test_debug_matches(Pattern::new) 553 } 554 555 #[test] display_matches()556 fn display_matches() { 557 test_display_matches(Pattern::new) 558 } 559 560 #[test] reader_matches()561 fn reader_matches() { 562 test_reader_matches(Pattern::new) 563 } 564 565 #[test] debug_rep_patterns()566 fn debug_rep_patterns() { 567 test_debug_rep_patterns(Pattern::new) 568 } 569 570 // === anchored behavior ============================================= 571 // Tests that unanchored patterns match anywhere in the input stream. test_is_unanchored(f: impl Fn(&Pattern, Str) -> bool)572 fn test_is_unanchored(f: impl Fn(&Pattern, Str) -> bool) { 573 let pat = Pattern::new("a+b").unwrap(); 574 assert!(f(&pat, Str::new("ab"))); 575 assert!(f(&pat, Str::new("aaaab"))); 576 assert!(f(&pat, Str::new("aaaaaaaaaab"))); 577 assert!(f(&pat, Str::new("bab"))); 578 assert!(f(&pat, Str::new("ffab"))); 579 assert!(f(&pat, Str::new("qqqfqqqqaaaaab"))); 580 } 581 582 #[test] debug_is_unanchored()583 fn debug_is_unanchored() { 584 test_is_unanchored(|pat, input| pat.debug_matches(&input)) 585 } 586 587 #[test] display_is_unanchored()588 fn display_is_unanchored() { 589 test_is_unanchored(|pat, input| pat.display_matches(&input)); 590 } 591 592 #[test] reader_is_unanchored()593 fn reader_is_unanchored() { 594 test_is_unanchored(|pat, input| { 595 pat.read_matches(input.to_reader()) 596 .expect("no io error occurs") 597 }); 598 } 599 } 600 } 601