//! This crate provides a safe wrapper around the
//! [Oniguruma](https://github.com/kkos/oniguruma) regular expression library.
//!
//! # Examples
//!
//! ```rust
//! use onig::Regex;
//!
//! let regex = Regex::new("e(l+)").unwrap();
//! for (i, pos) in regex.captures("hello").unwrap().iter_pos().enumerate() {
//!     match pos {
//!          Some((beg, end)) =>
//!              println!("Group {} captured in position {}:{}", i, beg, end),
//!          None =>
//!              println!("Group {} is not captured", i)
//!     }
//! }
//! ```
//!
//! # Match vs Search
//!
//! There are two basic things you can do with a `Regex` pattern: test
//! if the pattern matches the whole of a given string, and search for
//! occurrences of the pattern within a string. Oniguruma exposes these
//! two concepts with the *match* and *search* APIs.
//!
//! In addition to these two base Oniguruma APIs this crate exposes a
//! third *find* API, built on top of the *search* API.
//!
//! ```
//! # use onig::Regex;
//! let pattern = Regex::new("hello").unwrap();
//! assert_eq!(true, pattern.find("hello world").is_some());
//! assert_eq!(false, pattern.is_match("hello world"));
//! ```
//!
//! ## The *Match* API
//!
//! Functions in the match API check if a pattern matches the entire
//! string. The simplest of these is `Regex::is_match`. This returns
//! `true` if the pattern matches the string. For more complex usage
//! `Regex::match_with_options` and `Regex::match_with_encoding`
//! can be used. These allow the capture groups to be inspected,
//! matching with different options, and matching sub-sections of a
//! given text.
//!
//! ## The *Search* API
//!
//! Functions in the search API search for a pattern anywhere within a
//! string. The simplest of these is `Regex::find`. This returns the
//! offset of the first occurrence of the pattern within the string.
//! For more complex usage `Regex::search_with_options` and
//! `Regex::search_with_encoding` can be used. These allow capture
//! groups to be inspected, searching with different options and
//! searching within subsections of a given text.
//!
//! ## The *Find* API
//!
//! The find API is built on top of the search API. Functions in this
//! API allow iteration across all matches of the pattern within a
//! string, not just the first one. The functions deal with some of
//! the complexities of this, such as zero-length matches.
//!
//! The simplest step-up from the basic search API `Regex::find` is
//! getting the captures relating to a match with the
//! `Regex::captures` method. To find capture information for all
//! matches within a string `Regex::find_iter` and
//! `Regex::captures_iter` can be used. The former exposes the start
//! and end of the match as `Regex::find` does, the latter exposes the
//! whole capture group information as `Regex::captures` does.
//!
//! # The `std::pattern` API
//!
//! In addition to the main Oniguruma API it is possible to use the
//! `Regex` object with the
//! [`std::pattern`](https://doc.rust-lang.org/std/str/pattern/)
//! API. To enable support, compile with the `std-pattern` feature. If
//! you're using Cargo you can do this by adding the following to your
//! Cargo.toml:
//!
//! ```toml
//! [dependencies.onig]
//! version = "1.2"
//! features = ["std-pattern"]
//! ```

#![cfg_attr(not(feature = "cargo-clippy"), allow(unknown_lints))]
#![cfg_attr(feature = "std-pattern", feature(pattern))]
#![deny(missing_docs)]

#[macro_use]
extern crate bitflags;
#[macro_use]
extern crate lazy_static;
#[cfg(windows)]
extern crate libc;
extern crate onig_sys;

mod buffers;
mod find;
mod flags;
mod match_param;
mod names;
mod region;
mod replace;
mod syntax;
mod tree;
mod utils;

#[cfg(feature = "std-pattern")]
mod pattern;

// Re-export the onig types publicly.
pub use buffers::{EncodedBytes, EncodedChars};
pub use find::{
    Captures, FindCaptures, FindMatches, RegexSplits, RegexSplitsN, SubCaptures, SubCapturesPos,
};
pub use flags::*;
pub use match_param::MatchParam;
pub use region::Region;
pub use replace::Replacer;
pub use syntax::{MetaChar, Syntax};
pub use tree::{CaptureTreeNode, CaptureTreeNodeIter};
pub use utils::{copyright, define_user_property, version};

use std::os::raw::c_int;
use std::ptr::{null, null_mut};
use std::sync::Mutex;
use std::{error, fmt, str};

/// Records where an `Error` came from.
#[derive(Debug)]
enum ErrorData {
    /// The error was reported by an Oniguruma call; the payload is the
    /// numeric code that call returned.
    OnigError(c_int),
    /// The error was raised by this wrapper itself and has no engine code
    /// (`Error::code` reports `-1` for it).
    Custom,
}

/// This structure represents an error from the underlying Oniguruma library,
/// or an error detected by this wrapper before calling into the library.
pub struct Error {
    // Origin of the error: an engine code or a wrapper-generated failure.
    data: ErrorData,
    // Human-readable message, exposed through `description()` and `Display`.
    description: String,
}

/// This struct is a wrapper around an Oniguruma regular expression
/// pointer. This represents a compiled regex which can be used in
/// search and match operations.
#[derive(Debug, Eq, PartialEq)]
pub struct Regex {
    // Raw handle filled in by `onig_new`; released with `onig_free` on drop.
    raw: onig_sys::OnigRegex,
}

// NOTE(review): these impls assert that a compiled Oniguruma regex may be
// moved to, and shared between, threads. Nothing in this file proves that;
// it relies on the engine's own guarantees for compiled regex objects --
// confirm against the Oniguruma documentation when upgrading `onig_sys`.
unsafe impl Send for Regex {}
unsafe impl Sync for Regex {}

impl Error {
    /// Build an error from an engine code plus the extra information that
    /// `onig_new` fills in when compilation fails.
    fn from_code_and_info(code: c_int, info: &onig_sys::OnigErrorInfo) -> Self {
        Error::new(code, info)
    }

    /// Build an error from an engine code alone (no pattern information).
    fn from_code(code: c_int) -> Self {
        Error::new(code, null())
    }

    /// Build an error that originates in this wrapper rather than in the
    /// engine. Such errors report `-1` from `code()`.
    fn custom<T: Into<String>>(message: T) -> Self {
        Error {
            data: ErrorData::Custom,
            description: message.into(),
        }
    }

    /// Ask Oniguruma to render `code` (and the optional `info`) as text.
    ///
    /// Falls back to a custom error when the engine reports a length that
    /// does not fit the buffer, or when the rendered bytes are not UTF-8.
    fn new(code: c_int, info: *const onig_sys::OnigErrorInfo) -> Self {
        let buff = &mut [0; onig_sys::ONIG_MAX_ERROR_MESSAGE_LEN as usize];
        // SAFETY: `buff` is a live, writable buffer sized with the engine's
        // own `ONIG_MAX_ERROR_MESSAGE_LEN` constant, and `info` is either
        // null or derived from a reference supplied by the caller.
        let len = unsafe { onig_sys::onig_error_code_to_str(buff.as_mut_ptr(), code, info) };

        // `len` is a C int: a blind `as usize` would turn a negative value
        // into a huge index and panic while slicing, so validate it first.
        let rendered = if len < 0 {
            None
        } else {
            buff.get(..len as usize)
        };
        let rendered = match rendered {
            Some(bytes) => bytes,
            None => return Self::custom("Onig error string had an invalid length"),
        };

        match str::from_utf8(rendered) {
            Ok(description) => Error {
                data: ErrorData::OnigError(code),
                description: description.to_owned(),
            },
            Err(_) => Self::custom("Onig error string was invalid UTF-8"),
        }
    }

    /// Return Oniguruma engine error code.
    ///
    /// Errors raised by this wrapper itself (not by the engine) report `-1`.
    pub fn code(&self) -> i32 {
        // Exhaustive on purpose: adding a variant must force a decision here.
        match self.data {
            ErrorData::OnigError(code) => code,
            ErrorData::Custom => -1,
        }
    }

    /// Return error description provided by Oniguruma engine.
    pub fn description(&self) -> &str {
        &self.description
    }
}

impl error::Error for Error {
    fn description(&self) -> &str {
        &self.description
    }
}

impl fmt::Display for Error {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "Oniguruma error: {}", self.description())
    }
}

impl fmt::Debug for Error {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "Error({:?}, {})", self.data, self.description())
    }
}

lazy_static! {
    // Held around every `onig_new` call so that regex compilation never runs
    // on two threads at once.
    static ref REGEX_NEW_MUTEX: Mutex<()> = Mutex::new(());
}

impl Regex {
    /// Create a Regex
    ///
    /// Simple regular expression constructor. Compiles a new regular
    /// expression with the default options using the ruby syntax.
    /// Once compiled, it can be used repeatedly to search in a string. If an
    /// invalid expression is given, then an error is returned.
    ///
    /// # Arguments
    ///
    /// * `pattern` - The regex pattern to compile
    ///
    /// # Examples
    ///
    /// ```
    /// use onig::Regex;
    /// let r = Regex::new(r#"hello (\w+)"#);
    /// assert!(r.is_ok());
    /// ```
    pub fn new(pattern: &str) -> Result<Self, Error> {
        Regex::with_encoding(pattern)
    }

    /// Create a Regex, Specifying an Encoding
    ///
    /// Attempts to compile `pattern` into a new `Regex`
    /// instance. Instead of assuming UTF-8 as the encoding scheme the
    /// encoding is inferred from the `pattern` buffer.
248 /// 249 /// # Arguments 250 /// 251 /// * `pattern` - The regex pattern to compile 252 /// 253 /// # Examples 254 /// 255 /// ``` 256 /// use onig::{Regex, EncodedBytes}; 257 /// let utf8 = Regex::with_encoding("hello"); 258 /// assert!(utf8.is_ok()); 259 /// let ascii = Regex::with_encoding(EncodedBytes::ascii(b"world")); 260 /// assert!(ascii.is_ok()); 261 /// ``` with_encoding<T>(pattern: T) -> Result<Regex, Error> where T: EncodedChars,262 pub fn with_encoding<T>(pattern: T) -> Result<Regex, Error> 263 where 264 T: EncodedChars, 265 { 266 Regex::with_options_and_encoding( 267 pattern, 268 RegexOptions::REGEX_OPTION_NONE, 269 Syntax::default(), 270 ) 271 } 272 273 /// Create a new Regex 274 /// 275 /// Attempts to compile a pattern into a new `Regex` instance. 276 /// Once compiled, it can be used repeatedly to search in a string. If an 277 /// invalid expression is given, then an error is returned. 278 /// See [`onig_sys::onig_new`][regex_new] for more information. 279 /// 280 /// # Arguments 281 /// 282 /// * `pattern` - The regex pattern to compile. 283 /// * `options` - The regex compilation options. 284 /// * `syntax` - The syntax which the regex is written in. 285 /// 286 /// # Examples 287 /// 288 /// ``` 289 /// use onig::{Regex, Syntax, RegexOptions}; 290 /// let r = Regex::with_options("hello.*world", 291 /// RegexOptions::REGEX_OPTION_NONE, 292 /// Syntax::default()); 293 /// assert!(r.is_ok()); 294 /// ``` 295 /// 296 /// [regex_new]: ./onig_sys/fn.onig_new.html with_options( pattern: &str, option: RegexOptions, syntax: &Syntax, ) -> Result<Regex, Error>297 pub fn with_options( 298 pattern: &str, 299 option: RegexOptions, 300 syntax: &Syntax, 301 ) -> Result<Regex, Error> { 302 Regex::with_options_and_encoding(pattern, option, syntax) 303 } 304 305 /// Create a new Regex, Specifying Options and Ecoding 306 /// 307 /// Attempts to comile the given `pattern` into a new `Regex` 308 /// instance. 
Instead of assuming UTF-8 as the encoding scheme the 309 /// encoding is inferred from the `pattern` buffer. If the regex 310 /// fails to compile the returned `Error` value from 311 /// [`onig_new`][regex_new] contains more information. 312 /// 313 /// [regex_new]: ./onig_sys/fn.onig_new.html 314 /// 315 /// # Arguments 316 /// 317 /// * `pattern` - The regex pattern to compile. 318 /// * `options` - The regex compilation options. 319 /// * `syntax` - The syntax which the regex is written in. 320 /// 321 /// # Examples 322 /// ``` 323 /// use onig::{Regex, Syntax, EncodedBytes, RegexOptions}; 324 /// let pattern = EncodedBytes::ascii(b"hello"); 325 /// let r = Regex::with_options_and_encoding(pattern, 326 /// RegexOptions::REGEX_OPTION_SINGLELINE, 327 /// Syntax::default()); 328 /// assert!(r.is_ok()); 329 /// ``` with_options_and_encoding<T>( pattern: T, option: RegexOptions, syntax: &Syntax, ) -> Result<Self, Error> where T: EncodedChars,330 pub fn with_options_and_encoding<T>( 331 pattern: T, 332 option: RegexOptions, 333 syntax: &Syntax, 334 ) -> Result<Self, Error> 335 where 336 T: EncodedChars, 337 { 338 // Convert the rust types to those required for the call to 339 // `onig_new`. 340 let mut reg: onig_sys::OnigRegex = null_mut(); 341 let reg_ptr = &mut reg as *mut onig_sys::OnigRegex; 342 343 // We can use this later to get an error message to pass back 344 // if regex creation fails. 345 let mut error = onig_sys::OnigErrorInfo { 346 enc: null_mut(), 347 par: null_mut(), 348 par_end: null_mut(), 349 }; 350 351 let err = unsafe { 352 // Grab a lock to make sure that `onig_new` isn't called by 353 // more than one thread at a time. 
354 let _guard = REGEX_NEW_MUTEX.lock().unwrap(); 355 onig_sys::onig_new( 356 reg_ptr, 357 pattern.start_ptr(), 358 pattern.limit_ptr(), 359 option.bits(), 360 pattern.encoding(), 361 syntax as *const Syntax as *mut Syntax as *mut onig_sys::OnigSyntaxType, 362 &mut error, 363 ) 364 }; 365 366 if err == onig_sys::ONIG_NORMAL as i32 { 367 Ok(Regex { raw: reg }) 368 } else { 369 Err(Error::from_code_and_info(err, &error)) 370 } 371 } 372 373 /// Match String 374 /// 375 /// Try to match the regex against the given string slice, 376 /// starting at a given offset. This method works the same way as 377 /// `match_with_encoding`, but the encoding is always utf-8. 378 /// 379 /// For more information see [Match vs 380 /// Search](index.html#match-vs-search) 381 /// 382 /// # Arguments 383 /// 384 /// * `str` - The string slice to match against. 385 /// * `at` - The byte index in the passed slice to start matching 386 /// * `options` - The regex match options. 387 /// * `region` - The region for return group match range info 388 /// 389 /// # Returns 390 /// 391 /// `Some(len)` if the regex matched, with `len` being the number 392 /// of bytes matched. `None` if the regex doesn't match. 393 /// 394 /// # Examples 395 /// 396 /// ``` 397 /// use onig::{Regex, SearchOptions}; 398 /// 399 /// let r = Regex::new(".*").unwrap(); 400 /// let res = r.match_with_options("hello", 0, SearchOptions::SEARCH_OPTION_NONE, None); 401 /// assert!(res.is_some()); // it matches 402 /// assert!(res.unwrap() == 5); // 5 characters matched 403 /// ``` match_with_options( &self, str: &str, at: usize, options: SearchOptions, region: Option<&mut Region>, ) -> Option<usize>404 pub fn match_with_options( 405 &self, 406 str: &str, 407 at: usize, 408 options: SearchOptions, 409 region: Option<&mut Region>, 410 ) -> Option<usize> { 411 self.match_with_encoding(str, at, options, region) 412 } 413 414 /// Match String with Encoding 415 /// 416 /// Match the regex against a string. 
This method will start at 417 /// the offset `at` into the string and try and match the 418 /// regex. If the regex matches then the return value is the 419 /// number of characters which matched. If the regex doesn't match 420 /// the return is `None`. 421 /// 422 /// For more information see [Match vs 423 /// Search](index.html#match-vs-search) 424 /// 425 /// The contents of `chars` must have the same encoding that was 426 /// used to construct the regex. 427 /// 428 /// # Arguments 429 /// 430 /// * `chars` - The buffer to match against. 431 /// * `at` - The byte index in the passed buffer to start matching 432 /// * `options` - The regex match options. 433 /// * `region` - The region for return group match range info 434 /// 435 /// # Returns 436 /// 437 /// `Some(len)` if the regex matched, with `len` being the number 438 /// of bytes matched. `None` if the regex doesn't match. 439 /// 440 /// # Examples 441 /// 442 /// ``` 443 /// use onig::{Regex, EncodedBytes, SearchOptions}; 444 /// 445 /// let r = Regex::with_encoding(EncodedBytes::ascii(b".*")).unwrap(); 446 /// let res = r.match_with_encoding(EncodedBytes::ascii(b"world"), 447 /// 0, SearchOptions::SEARCH_OPTION_NONE, None); 448 /// assert!(res.is_some()); // it matches 449 /// assert!(res.unwrap() == 5); // 5 characters matched 450 /// ``` match_with_encoding<T>( &self, chars: T, at: usize, options: SearchOptions, region: Option<&mut Region>, ) -> Option<usize> where T: EncodedChars,451 pub fn match_with_encoding<T>( 452 &self, 453 chars: T, 454 at: usize, 455 options: SearchOptions, 456 region: Option<&mut Region>, 457 ) -> Option<usize> 458 where 459 T: EncodedChars, 460 { 461 let match_param = MatchParam::default(); 462 let result = self.match_with_param(chars, at, options, region, match_param); 463 464 match result { 465 Ok(r) => r, 466 Err(e) => panic!("Onig: Regex match error: {}", e.description()), 467 } 468 } 469 470 /// Match string with encoding and match param 471 /// 472 /// Match the 
regex against a string. This method will start at 473 /// the offset `at` into the string and try and match the 474 /// regex. If the regex matches then the return value is the 475 /// number of characters which matched. If the regex doesn't match 476 /// the return is `None`. 477 /// 478 /// For more information see [Match vs 479 /// Search](index.html#match-vs-search) 480 /// 481 /// The contents of `chars` must have the same encoding that was 482 /// used to construct the regex. 483 /// 484 /// # Arguments 485 /// 486 /// * `chars` - The buffer to match against. 487 /// * `at` - The byte index in the passed buffer to start matching 488 /// * `options` - The regex match options. 489 /// * `region` - The region for return group match range info 490 /// * `match_param` - The match parameters 491 /// 492 /// # Returns 493 /// 494 /// `Ok(Some(len))` if the regex matched, with `len` being the number 495 /// of bytes matched. `Ok(None)` if the regex doesn't match. `Err` with an 496 /// `Error` if an error occurred (e.g. retry-limit-in-match exceeded). 
497 /// 498 /// # Examples 499 /// 500 /// ``` 501 /// use onig::{Regex, EncodedBytes, MatchParam, SearchOptions}; 502 /// 503 /// let r = Regex::with_encoding(EncodedBytes::ascii(b".*")).unwrap(); 504 /// let res = r.match_with_param(EncodedBytes::ascii(b"world"), 505 /// 0, SearchOptions::SEARCH_OPTION_NONE, 506 /// None, MatchParam::default()); 507 /// assert!(res.is_ok()); // matching did not error 508 /// assert!(res.unwrap() == Some(5)); // 5 characters matched 509 /// ``` match_with_param<T>( &self, chars: T, at: usize, options: SearchOptions, region: Option<&mut Region>, match_param: MatchParam, ) -> Result<Option<usize>, Error> where T: EncodedChars,510 pub fn match_with_param<T>( 511 &self, 512 chars: T, 513 at: usize, 514 options: SearchOptions, 515 region: Option<&mut Region>, 516 match_param: MatchParam, 517 ) -> Result<Option<usize>, Error> 518 where 519 T: EncodedChars, 520 { 521 if chars.encoding() != self.encoding() { 522 return Err(Error::custom(format!("Regex encoding does not match haystack encoding ({0:?}, {1:?})", chars.encoding(), self.encoding()))); 523 } 524 let r = unsafe { 525 let offset = chars.start_ptr().add(at); 526 if offset > chars.limit_ptr() { 527 return Err(Error::custom(format!("Offset {} is too large", at))); 528 } 529 onig_sys::onig_match_with_param( 530 self.raw, 531 chars.start_ptr(), 532 chars.limit_ptr(), 533 offset, 534 match region { 535 Some(region) => region as *mut Region as *mut onig_sys::OnigRegion, 536 None => std::ptr::null_mut(), 537 }, 538 options.bits(), 539 match_param.as_raw(), 540 ) 541 }; 542 543 if r >= 0 { 544 Ok(Some(r as usize)) 545 } else if r == onig_sys::ONIG_MISMATCH { 546 Ok(None) 547 } else { 548 Err(Error::from_code(r)) 549 } 550 } 551 552 /// Search pattern in string 553 /// 554 /// Search for matches the regex in a string. This method will return the 555 /// index of the first match of the regex within the string, if 556 /// there is one. 
If `from` is less than `to`, then search is performed 557 /// in forward order, otherwise – in backward order. 558 /// 559 /// For more information see [Match vs 560 /// Search](index.html#match-vs-search) 561 /// 562 /// # Arguments 563 /// 564 /// * `str` - The string to search in. 565 /// * `from` - The byte index in the passed slice to start search 566 /// * `to` - The byte index in the passed slice to finish search 567 /// * `options` - The options for the search. 568 /// * `region` - The region for return group match range info 569 /// 570 /// # Returns 571 /// 572 /// `Some(pos)` if the regex matches, where `pos` is the 573 /// byte-position of the start of the match. `None` if the regex 574 /// doesn't match anywhere in `str`. 575 /// 576 /// # Examples 577 /// 578 /// ``` 579 /// use onig::{Regex, SearchOptions}; 580 /// 581 /// let r = Regex::new("l{1,2}").unwrap(); 582 /// let res = r.search_with_options("hello", 0, 5, SearchOptions::SEARCH_OPTION_NONE, None); 583 /// assert!(res.is_some()); // it matches 584 /// assert!(res.unwrap() == 2); // match starts at character 3 585 /// ``` search_with_options( &self, str: &str, from: usize, to: usize, options: SearchOptions, region: Option<&mut Region>, ) -> Option<usize>586 pub fn search_with_options( 587 &self, 588 str: &str, 589 from: usize, 590 to: usize, 591 options: SearchOptions, 592 region: Option<&mut Region>, 593 ) -> Option<usize> { 594 self.search_with_encoding(str, from, to, options, region) 595 } 596 597 /// Search for a Pattern in a String with an Encoding 598 /// 599 /// Search for matches the regex in a string. This method will 600 /// return the index of the first match of the regex within the 601 /// string, if there is one. If `from` is less than `to`, then 602 /// search is performed in forward order, otherwise – in backward 603 /// order. 
604 /// 605 /// For more information see [Match vs 606 /// Search](index.html#match-vs-search) 607 /// 608 /// The encoding of the buffer passed to search in must match the 609 /// encoding of the regex. 610 /// 611 /// # Arguments 612 /// 613 /// * `chars` - The character buffer to search in. 614 /// * `from` - The byte index in the passed slice to start search 615 /// * `to` - The byte index in the passed slice to finish search 616 /// * `options` - The options for the search. 617 /// * `region` - The region for return group match range info 618 /// 619 /// # Returns 620 /// 621 /// `Some(pos)` if the regex matches, where `pos` is the 622 /// byte-position of the start of the match. `None` if the regex 623 /// doesn't match anywhere in `chars`. 624 /// 625 /// # Examples 626 /// 627 /// ``` 628 /// use onig::{Regex, EncodedBytes, SearchOptions}; 629 /// 630 /// let r = Regex::with_encoding(EncodedBytes::ascii(b"l{1,2}")).unwrap(); 631 /// let res = r.search_with_encoding(EncodedBytes::ascii(b"hello"), 632 /// 0, 5, SearchOptions::SEARCH_OPTION_NONE, None); 633 /// assert!(res.is_some()); // it matches 634 /// assert!(res.unwrap() == 2); // match starts at character 3 635 /// ``` search_with_encoding<T>( &self, chars: T, from: usize, to: usize, options: SearchOptions, region: Option<&mut Region>, ) -> Option<usize> where T: EncodedChars,636 pub fn search_with_encoding<T>( 637 &self, 638 chars: T, 639 from: usize, 640 to: usize, 641 options: SearchOptions, 642 region: Option<&mut Region>, 643 ) -> Option<usize> 644 where 645 T: EncodedChars, 646 { 647 let match_param = MatchParam::default(); 648 let result = self.search_with_param(chars, from, to, options, region, match_param); 649 650 match result { 651 Ok(r) => r, 652 Err(e) => panic!("Onig: Regex search error: {}", e.description()), 653 } 654 } 655 656 /// Search pattern in string with encoding and match param 657 /// 658 /// Search for matches the regex in a string. 
This method will 659 /// return the index of the first match of the regex within the 660 /// string, if there is one. If `from` is less than `to`, then 661 /// search is performed in forward order, otherwise – in backward 662 /// order. 663 /// 664 /// For more information see [Match vs 665 /// Search](index.html#match-vs-search) 666 /// 667 /// The encoding of the buffer passed to search in must match the 668 /// encoding of the regex. 669 /// 670 /// # Arguments 671 /// 672 /// * `chars` - The character buffer to search in. 673 /// * `from` - The byte index in the passed slice to start search 674 /// * `to` - The byte index in the passed slice to finish search 675 /// * `options` - The options for the search. 676 /// * `region` - The region for return group match range info 677 /// * `match_param` - The match parameters 678 /// 679 /// # Returns 680 /// 681 /// `Ok(Some(pos))` if the regex matches, where `pos` is the 682 /// byte-position of the start of the match. `Ok(None)` if the regex 683 /// doesn't match anywhere in `chars`. `Err` with an `Error` if an error 684 /// occurred (e.g. retry-limit-in-match exceeded). 
685 /// 686 /// # Examples 687 /// 688 /// ``` 689 /// use onig::{Regex, EncodedBytes, MatchParam, SearchOptions}; 690 /// 691 /// let r = Regex::with_encoding(EncodedBytes::ascii(b"l{1,2}")).unwrap(); 692 /// let res = r.search_with_param(EncodedBytes::ascii(b"hello"), 693 /// 0, 5, SearchOptions::SEARCH_OPTION_NONE, 694 /// None, MatchParam::default()); 695 /// assert!(res.is_ok()); // matching did not error 696 /// assert!(res.unwrap() == Some(2)); // match starts at character 3 697 /// ``` search_with_param<T>( &self, chars: T, from: usize, to: usize, options: SearchOptions, region: Option<&mut Region>, match_param: MatchParam, ) -> Result<Option<usize>, Error> where T: EncodedChars,698 pub fn search_with_param<T>( 699 &self, 700 chars: T, 701 from: usize, 702 to: usize, 703 options: SearchOptions, 704 region: Option<&mut Region>, 705 match_param: MatchParam, 706 ) -> Result<Option<usize>, Error> 707 where 708 T: EncodedChars, 709 { 710 let (beg, end) = (chars.start_ptr(), chars.limit_ptr()); 711 if chars.encoding() != self.encoding() { 712 return Err(Error::custom(format!("Regex encoding does not match haystack encoding ({0:?}, {1:?})", chars.encoding(), self.encoding()))); 713 } 714 let r = unsafe { 715 let start = beg.add(from); 716 let range = beg.add(to); 717 if start > end { 718 return Err(Error::custom("Start of match should be before end")); 719 } 720 if range > end { 721 return Err(Error::custom("Limit of match should be before end")); 722 } 723 onig_sys::onig_search_with_param( 724 self.raw, 725 beg, 726 end, 727 start, 728 range, 729 match region { 730 Some(region) => region as *mut Region as *mut onig_sys::OnigRegion, 731 None => std::ptr::null_mut(), 732 }, 733 options.bits(), 734 match_param.as_raw(), 735 ) 736 }; 737 738 if r >= 0 { 739 Ok(Some(r as usize)) 740 } else if r == onig_sys::ONIG_MISMATCH { 741 Ok(None) 742 } else { 743 Err(Error::from_code(r)) 744 } 745 } 746 747 /// Returns true if and only if the regex matches the string given. 
748 /// 749 /// For more information see [Match vs 750 /// Search](index.html#match-vs-search) 751 /// 752 /// # Arguments 753 /// * `text` - The string slice to test against the pattern. 754 /// 755 /// # Returns 756 /// 757 /// `true` if the pattern matches the whole of `text`, `false` otherwise. is_match(&self, text: &str) -> bool758 pub fn is_match(&self, text: &str) -> bool { 759 self.match_with_options(text, 0, SearchOptions::SEARCH_OPTION_NONE, None) 760 .map(|r| r == text.len()) 761 .unwrap_or(false) 762 } 763 764 /// Find a Match in a Buffer, With Encoding 765 /// 766 /// Finds the first match of the regular expression within the 767 /// buffer. 768 /// 769 /// Note that this should only be used if you want to discover the 770 /// position of the match within a string. Testing if a pattern 771 /// matches the whole string is faster if you use `is_match`. For 772 /// more information see [Match vs 773 /// Search](index.html#match-vs-search) 774 /// 775 /// # Arguments 776 /// * `text` - The text to search in. 777 /// 778 /// # Returns 779 /// 780 /// The offset of the start and end of the first match. If no 781 /// match exists `None` is returned. find(&self, text: &str) -> Option<(usize, usize)>782 pub fn find(&self, text: &str) -> Option<(usize, usize)> { 783 self.find_with_encoding(text) 784 } 785 786 /// Find a Match in a Buffer, With Encoding 787 /// 788 /// Finds the first match of the regular expression within the 789 /// buffer. 790 /// 791 /// For more information see [Match vs 792 /// Search](index.html#match-vs-search) 793 /// 794 /// # Arguments 795 /// * `text` - The text to search in. 796 /// 797 /// # Returns 798 /// 799 /// The offset of the start and end of the first match. If no 800 /// match exists `None` is returned. 
pub fn find_with_encoding<T>(&self, text: T) -> Option<(usize, usize)>
    where
        T: EncodedChars,
    {
        let mut region = Region::new();
        // Read the length first: `text` is moved into the search call below.
        let len = text.len();
        self.search_with_encoding(
            text,
            0,
            len,
            SearchOptions::SEARCH_OPTION_NONE,
            Some(&mut region),
        )
        // The search result is only the start offset; the region's group 0
        // supplies the (start, end) pair.
        .and_then(|_| region.pos(0))
    }

    /// Get the Encoding of the Regex
    ///
    /// # Returns
    ///
    /// Returns a reference to an oniguruma encoding which was used
    /// when this regex was created.
    pub fn encoding(&self) -> onig_sys::OnigEncoding {
        // NOTE(review): assumes `self.raw` is a valid handle from `onig_new`.
        unsafe { onig_sys::onig_get_encoding(self.raw) }
    }

    /// Get the Number of Capture Groups in this Pattern
    pub fn captures_len(&self) -> usize {
        // NOTE(review): assumes `self.raw` is a valid handle from `onig_new`.
        unsafe { onig_sys::onig_number_of_captures(self.raw) as usize }
    }

    /// Get the Size of the Capture Histories for this Pattern
    pub fn capture_histories_len(&self) -> usize {
        // NOTE(review): assumes `self.raw` is a valid handle from `onig_new`.
        unsafe { onig_sys::onig_number_of_capture_histories(self.raw) as usize }
    }
}

impl Drop for Regex {
    /// Release the engine-side regex object when the wrapper goes away.
    fn drop(&mut self) {
        unsafe {
            onig_sys::onig_free(self.raw);
        }
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::panic;

    // Both the option-taking and the simple constructor accept valid patterns.
    #[test]
    fn test_regex_create() {
        Regex::with_options(".*", RegexOptions::REGEX_OPTION_NONE, Syntax::default()).unwrap();

        Regex::new(r#"a \w+ word"#).unwrap();
    }

    // An unknown character property surfaces the engine's code and message.
    #[test]
    fn test_regex_invalid() {
        let e = Regex::new("\\p{foo}").unwrap_err();
        assert_eq!(e.code(), -223);
        assert_eq!(e.description(), "invalid character property name {foo}");
    }

    // A plain mismatch is `None`, not an error.
    #[test]
    fn test_failed_match() {
        let regex = Regex::new("foo").unwrap();
        let res = regex.match_with_options("bar", 0, SearchOptions::SEARCH_OPTION_NONE, None);
        assert!(res.is_none());
    }

    // Searching fills the region with the whole match and its capture group.
    #[test]
    fn test_regex_search_with_options() {
        let mut region = Region::new();
        let regex = Regex::new("e(l+)").unwrap();

        let r = regex.search_with_options(
            "hello",
            0,
            5,
            SearchOptions::SEARCH_OPTION_NONE,
            Some(&mut region),
        );

        assert!(region.tree().is_none());
        assert_eq!(r, Some(1));
        assert_eq!(region.len(), 2);
        let pos1 = region.pos(0).unwrap();
        let pos2 = region.pos(1).unwrap();
        assert_eq!(pos1, (1, 4));
        assert_eq!(pos2, (2, 4));

        // test cloning here since we already have a filled region
        let cloned_region = region.clone();
        let pos1_clone = cloned_region.pos(0).unwrap();
        assert_eq!(pos1_clone, pos1);
    }

    // Matching at offset 0 reports the matched length and fills the region.
    #[test]
    fn test_regex_match_with_options() {
        let mut region = Region::new();
        let regex = Regex::new("he(l+)").unwrap();

        let r = regex.match_with_options(
            "hello",
            0,
            SearchOptions::SEARCH_OPTION_NONE,
            Some(&mut region),
        );

        assert!(region.tree().is_none());
        assert_eq!(r, Some(4));
        assert_eq!(region.len(), 2);
        let pos1 = region.pos(0).unwrap();
        let pos2 = region.pos(1).unwrap();
        assert_eq!(pos1, (0, 4));
        assert_eq!(pos2, (2, 4));
    }

    // `is_match` requires the pattern to cover the entire input.
    #[test]
    fn test_regex_is_match() {
        let regex = Regex::new("he(l+)o").unwrap();
        assert!(regex.is_match("hello"));
        assert!(!regex.is_match("hello 2.0"));
    }

    // `find` reports the (start, end) byte offsets of the first match.
    #[test]
    fn test_regex_find() {
        let regex = Regex::new("he(l+)o").unwrap();
        assert_eq!(regex.find("hey, hello!"), Some((5, 10)));
        assert_eq!(regex.find("hey, honey!"), None);
    }

    #[test]
    fn test_regex_captures_len() {
        let regex = Regex::new("(he)(l+)(o)").unwrap();
        assert_eq!(regex.captures_len(), 3);
    }

    // A backtracking-heavy pattern hits the retry limit; the `_with_param`
    // API returns that as an `Err`.
    #[test]
    fn test_regex_error_is_match() {
        let regex = Regex::new("(a|b|ab)*bc").unwrap();
        let result = regex.match_with_param(
            "ababababababababababababababababababababababababababababacbc",
            0,
            SearchOptions::SEARCH_OPTION_NONE,
            None,
            MatchParam::default(),
        );

        let e = result.err().unwrap();
        assert_eq!("retry-limit-in-match over", e.description());
    }

    // The same failure through the panicking convenience API.
    #[test]
    fn test_regex_panic_is_match() {
        let regex = Regex::new("(a|b|ab)*bc").unwrap();
        let result = panic::catch_unwind(|| {
            regex.is_match("ababababababababababababababababababababababababababababacbc")
        });
        let e = result.err().unwrap();
        let message = e.downcast_ref::<String>().unwrap();
        assert_eq!(
            message.as_str(),
            "Onig: Regex match error: retry-limit-in-match over"
        );
    }

    // Retry-limit failure reported as an `Err` by the search API.
    #[test]
    fn test_regex_error_find() {
        let regex = Regex::new("(a|b|ab)*bc").unwrap();
        let s = "ababababababababababababababababababababababababababababacbc";
        let result = regex.search_with_param(
            s,
            0,
            s.len(),
            SearchOptions::SEARCH_OPTION_NONE,
            None,
            MatchParam::default(),
        );

        let e = result.err().unwrap();
        assert_eq!("retry-limit-in-match over", e.description());
    }

    // Retry-limit failure through the panicking `find` API.
    #[test]
    fn test_regex_panic_find() {
        let regex = Regex::new("(a|b|ab)*bc").unwrap();
        let result = panic::catch_unwind(|| {
            regex.find("ababababababababababababababababababababababababababababacbc")
        });
        let e = result.err().unwrap();
        let message = e.downcast_ref::<String>().unwrap();
        assert_eq!(
            message.as_str(),
            "Onig: Regex search error: retry-limit-in-match over"
        );
    }

    // Out-of-range `from` or `to` is rejected with an `Err`.
    #[test]
    fn test_search_with_invalid_range() {
        let regex = Regex::with_options("R...", RegexOptions::REGEX_OPTION_NONE, Syntax::default())
            .expect("regex");
        let string = "Ruby";
        let is_match = regex.search_with_param(
            string,
            5,
            string.len(),
            SearchOptions::SEARCH_OPTION_NONE,
            None,
            MatchParam::default(),
        );
        assert!(is_match.is_err());

        let is_match = regex.search_with_param(
            string,
            2,
            string.len() + 1,
            SearchOptions::SEARCH_OPTION_NONE,
            None,
            MatchParam::default(),
        );
        assert!(is_match.is_err());
    }

    // The panicking search API turns the out-of-range error into a panic.
    #[test]
    fn test_search_with_invalid_range_panic() {
        let regex = Regex::with_options("R...", RegexOptions::REGEX_OPTION_NONE, Syntax::default())
            .expect("regex");
        let string = "Ruby";
        let is_match = panic::catch_unwind(|| regex.search_with_encoding(
            string,
            5,
            string.len(),
            SearchOptions::SEARCH_OPTION_NONE,
            None,
        ));
        assert!(is_match.is_err());
    }

    // Out-of-range `at` is rejected with an `Err`.
    #[test]
    fn test_match_with_invalid_range() {
        let regex = Regex::with_options("R...", RegexOptions::REGEX_OPTION_NONE, Syntax::default())
            .expect("regex");
        let string = "Ruby";
        let is_match = regex.match_with_param(
            string,
            5,
            SearchOptions::SEARCH_OPTION_NONE,
            None,
            MatchParam::default(),
        );
        assert!(is_match.is_err());
    }

    // The panicking match API turns the out-of-range error into a panic.
    #[test]
    fn test_match_with_invalid_range_panic() {
        let regex = Regex::with_options("R...", RegexOptions::REGEX_OPTION_NONE, Syntax::default())
            .expect("regex");
        let string = "Ruby";
        let is_match = panic::catch_unwind(|| regex.match_with_encoding(
            string,
            5,
            SearchOptions::SEARCH_OPTION_NONE,
            None,
        ));
        assert!(is_match.is_err());
    }
}