//! This crate provides a safe wrapper around the
//! [Oniguruma](https://github.com/kkos/oniguruma) regular expression library.
//!
//! # Examples
//!
//! ```rust
//! use onig::Regex;
//!
//! let regex = Regex::new("e(l+)").unwrap();
//! for (i, pos) in regex.captures("hello").unwrap().iter_pos().enumerate() {
//!     match pos {
//!          Some((beg, end)) =>
//!              println!("Group {} captured in position {}:{}", i, beg, end),
//!          None =>
//!              println!("Group {} is not captured", i)
//!     }
//! }
//! ```
//!
//! # Match vs Search
//!
//! There are two basic things you can do with a `Regex` pattern: test
//! if the pattern matches the whole of a given string, and search for
//! occurrences of the pattern within a string. Oniguruma exposes these
//! two concepts with the *match* and *search* APIs.
//!
//! In addition to these two base Oniguruma APIs this crate exposes a
//! third *find* API, built on top of the *search* API.
//!
//! ```
//! # use onig::Regex;
//! let pattern = Regex::new("hello").unwrap();
//! assert_eq!(true, pattern.find("hello world").is_some());
//! assert_eq!(false, pattern.is_match("hello world"));
//! ```
//!
//! ## The *Match* API
//!
//! Functions in the match API check if a pattern matches the entire
//! string. The simplest of these is `Regex::is_match`. This returns
//! `true` if the pattern matches the string. For more complex usage,
//! `Regex::match_with_options` and `Regex::match_with_encoding`
//! can be used. These allow the capture groups to be inspected,
//! matching with different options, and matching sub-sections of a
//! given text.
//!
//! ## The *Search* API
//!
//! Functions in the search API search for a pattern anywhere within a
//! string. The simplest of these is `Regex::find`. This returns the
//! start and end offsets of the first occurrence of the pattern within
//! the string. For more complex usage `Regex::search_with_options` and
//! `Regex::search_with_encoding` can be used. These allow capture
//! groups to be inspected, searching with different options and
//! searching within subsections of a given text.
//!
//! ## The *Find* API
//!
//! The find API is built on top of the search API. Functions in this
//! API allow iteration across all matches of the pattern within a
//! string, not just the first one. The functions deal with some of
//! the complexities of this, such as zero-length matches.
//!
//! The simplest step-up from the basic search API `Regex::find` is
//! getting the captures relating to a match with the
//! `Regex::captures` method. To find capture information for all
//! matches within a string `Regex::find_iter` and
//! `Regex::captures_iter` can be used. The former exposes the start
//! and end of the match as `Regex::find` does, the latter exposes the
//! whole capture group information as `Regex::captures` does.
//!
//! # The `std::pattern` API
//!
//! In addition to the main Oniguruma API it is possible to use the
//! `Regex` object with the
//! [`std::pattern`](https://doc.rust-lang.org/std/str/pattern/)
//! API. To enable support compile with the `std-pattern` feature. If
//! you're using Cargo you can do this by adding the following to your
//! Cargo.toml:
//!
//! ```toml
//! [dependencies.onig]
//! version = "1.2"
//! features = ["std-pattern"]
//! ```

#![cfg_attr(not(feature = "cargo-clippy"), allow(unknown_lints))]
// The `std-pattern` cargo feature switches on the `pattern` language feature.
#![cfg_attr(feature = "std-pattern", feature(pattern))]
#![deny(missing_docs)]

#[macro_use]
extern crate bitflags;
#[macro_use]
extern crate lazy_static;
#[cfg(windows)]
extern crate libc;
extern crate onig_sys;

mod buffers;
mod find;
mod flags;
mod match_param;
mod names;
mod region;
mod replace;
mod syntax;
mod tree;
mod utils;

#[cfg(feature = "std-pattern")]
mod pattern;

// Re-export the onig types publicly.
pub use buffers::{EncodedBytes, EncodedChars};
pub use find::{
    Captures, FindCaptures, FindMatches, RegexSplits, RegexSplitsN, SubCaptures, SubCapturesPos,
};
pub use flags::*;
pub use match_param::MatchParam;
pub use region::Region;
pub use replace::Replacer;
pub use syntax::{MetaChar, Syntax};
pub use tree::{CaptureTreeNode, CaptureTreeNodeIter};
pub use utils::{copyright, define_user_property, version};

use std::os::raw::c_int;
use std::ptr::{null, null_mut};
use std::sync::Mutex;
use std::{error, fmt, str};

/// Where an `Error` came from.
#[derive(Debug)]
enum ErrorData {
    /// Reported by Oniguruma; carries the raw engine error code.
    OnigError(c_int),
    /// Raised by this wrapper itself; there is no engine error code.
    Custom,
}

/// This structure represents an error from the underlying Oniguruma
/// library, or one raised by this wrapper's own argument checks.
pub struct Error {
    // Origin of the error (engine code or wrapper-generated).
    data: ErrorData,
    // Human-readable message returned by `description()`.
    description: String,
}

/// This struct is a wrapper around an Oniguruma regular expression
/// pointer. This represents a compiled regex which can be used in
/// search and match operations.
#[derive(Debug, Eq, PartialEq)]
pub struct Regex {
    raw: onig_sys::OnigRegex,
}

// NOTE(review): these impls assert that the raw Oniguruma handle may be moved
// to and shared between threads; nothing in this file proves it — confirm
// against the Oniguruma threading guarantees.
unsafe impl Send for Regex {}
unsafe impl Sync for Regex {}

impl Error {
    /// Builds an error from an engine `code` plus the extra `info`
    /// that the compile call filled in.
    fn from_code_and_info(code: c_int, info: &onig_sys::OnigErrorInfo) -> Self {
        Self::new(code, info as *const onig_sys::OnigErrorInfo)
    }

    /// Builds an error from a bare engine `code` (no extra info available).
    fn from_code(code: c_int) -> Self {
        Self::new(code, null())
    }

    /// Builds a wrapper-level error carrying only `message`.
    fn custom<T: Into<String>>(message: T) -> Self {
        let description = message.into();
        Error {
            data: ErrorData::Custom,
            description,
        }
    }

    /// Asks Oniguruma to render `code` (and `info`, which may be null)
    /// as text and wraps the result.
    ///
    /// If the rendered bytes are not valid UTF-8 a custom error is
    /// returned instead.
    fn new(code: c_int, info: *const onig_sys::OnigErrorInfo) -> Self {
        let mut message = [0; onig_sys::ONIG_MAX_ERROR_MESSAGE_LEN as usize];
        let written =
            unsafe { onig_sys::onig_error_code_to_str(message.as_mut_ptr(), code, info) };

        match str::from_utf8(&message[..written as usize]) {
            Ok(text) => Error {
                data: ErrorData::OnigError(code),
                description: text.to_owned(),
            },
            Err(_) => Self::custom("Onig error string was invalid UTF-8"),
        }
    }

    /// Returns the Oniguruma error code, or `-1` for errors that were
    /// raised by this wrapper rather than by the engine.
    pub fn code(&self) -> i32 {
        if let ErrorData::OnigError(code) = self.data {
            code
        } else {
            -1
        }
    }

    /// Returns the human-readable message attached to this error.
    pub fn description(&self) -> &str {
        self.description.as_str()
    }
}

impl error::Error for Error {
    fn description(&self) -> &str {
        self.description.as_str()
    }
}

impl fmt::Display for Error {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "Oniguruma error: {}", self.description)
    }
}

impl fmt::Debug for Error {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        write!(f, "Error({:?}, {})", self.data, self.description)
    }
}

lazy_static! {
    // Held around `onig_new` so only one thread compiles a regex at a time.
    static ref REGEX_NEW_MUTEX: Mutex<()> = Mutex::new(());
}

impl Regex {
    /// Create a Regex
    ///
    /// The simplest constructor: compiles `pattern` with the default
    /// options using the ruby syntax. The compiled regex can be reused
    /// for any number of searches. An invalid pattern yields an error.
    ///
    /// # Arguments
    ///
    /// * `pattern` - The regex pattern to compile
    ///
    /// # Examples
    ///
    /// ```
    /// use onig::Regex;
    /// let r = Regex::new(r#"hello (\w+)"#);
    /// assert!(r.is_ok());
    /// ```
    pub fn new(pattern: &str) -> Result<Self, Error> {
        Self::with_encoding(pattern)
    }

    /// Create a Regex, Specifying an Encoding
    ///
    /// Attempts to compile `pattern` into a new `Regex`
    /// instance. Instead of assuming UTF-8 as the encoding scheme the
    /// encoding is inferred from the `pattern` buffer.
248 /// 249 /// # Arguments 250 /// 251 /// * `pattern` - The regex pattern to compile 252 /// 253 /// # Examples 254 /// 255 /// ``` 256 /// use onig::{Regex, EncodedBytes}; 257 /// let utf8 = Regex::with_encoding("hello"); 258 /// assert!(utf8.is_ok()); 259 /// let ascii = Regex::with_encoding(EncodedBytes::ascii(b"world")); 260 /// assert!(ascii.is_ok()); 261 /// ``` with_encoding<T>(pattern: T) -> Result<Regex, Error> where T: EncodedChars,262 pub fn with_encoding<T>(pattern: T) -> Result<Regex, Error> 263 where 264 T: EncodedChars, 265 { 266 Regex::with_options_and_encoding( 267 pattern, 268 RegexOptions::REGEX_OPTION_NONE, 269 Syntax::default(), 270 ) 271 } 272 273 /// Create a new Regex 274 /// 275 /// Attempts to compile a pattern into a new `Regex` instance. 276 /// Once compiled, it can be used repeatedly to search in a string. If an 277 /// invalid expression is given, then an error is returned. 278 /// See [`onig_sys::onig_new`][regex_new] for more information. 279 /// 280 /// # Arguments 281 /// 282 /// * `pattern` - The regex pattern to compile. 283 /// * `options` - The regex compilation options. 284 /// * `syntax` - The syntax which the regex is written in. 285 /// 286 /// # Examples 287 /// 288 /// ``` 289 /// use onig::{Regex, Syntax, RegexOptions}; 290 /// let r = Regex::with_options("hello.*world", 291 /// RegexOptions::REGEX_OPTION_NONE, 292 /// Syntax::default()); 293 /// assert!(r.is_ok()); 294 /// ``` 295 /// 296 /// [regex_new]: ./onig_sys/fn.onig_new.html with_options( pattern: &str, option: RegexOptions, syntax: &Syntax, ) -> Result<Regex, Error>297 pub fn with_options( 298 pattern: &str, 299 option: RegexOptions, 300 syntax: &Syntax, 301 ) -> Result<Regex, Error> { 302 Regex::with_options_and_encoding(pattern, option, syntax) 303 } 304 305 /// Create a new Regex, Specifying Options and Ecoding 306 /// 307 /// Attempts to comile the given `pattern` into a new `Regex` 308 /// instance. 
Instead of assuming UTF-8 as the encoding scheme the 309 /// encoding is inferred from the `pattern` buffer. If the regex 310 /// fails to compile the returned `Error` value from 311 /// [`onig_new`][regex_new] contains more information. 312 /// 313 /// [regex_new]: ./onig_sys/fn.onig_new.html 314 /// 315 /// # Arguments 316 /// 317 /// * `pattern` - The regex pattern to compile. 318 /// * `options` - The regex compilation options. 319 /// * `syntax` - The syntax which the regex is written in. 320 /// 321 /// # Examples 322 /// ``` 323 /// use onig::{Regex, Syntax, EncodedBytes, RegexOptions}; 324 /// let pattern = EncodedBytes::ascii(b"hello"); 325 /// let r = Regex::with_options_and_encoding(pattern, 326 /// RegexOptions::REGEX_OPTION_SINGLELINE, 327 /// Syntax::default()); 328 /// assert!(r.is_ok()); 329 /// ``` with_options_and_encoding<T>( pattern: T, option: RegexOptions, syntax: &Syntax, ) -> Result<Self, Error> where T: EncodedChars,330 pub fn with_options_and_encoding<T>( 331 pattern: T, 332 option: RegexOptions, 333 syntax: &Syntax, 334 ) -> Result<Self, Error> 335 where 336 T: EncodedChars, 337 { 338 // Convert the rust types to those required for the call to 339 // `onig_new`. 340 let mut reg: onig_sys::OnigRegex = null_mut(); 341 let reg_ptr = &mut reg as *mut onig_sys::OnigRegex; 342 343 // We can use this later to get an error message to pass back 344 // if regex creation fails. 345 let mut error = onig_sys::OnigErrorInfo { 346 enc: null_mut(), 347 par: null_mut(), 348 par_end: null_mut(), 349 }; 350 351 let err = unsafe { 352 // Grab a lock to make sure that `onig_new` isn't called by 353 // more than one thread at a time. 
354 let _guard = REGEX_NEW_MUTEX.lock().unwrap(); 355 onig_sys::onig_new( 356 reg_ptr, 357 pattern.start_ptr(), 358 pattern.limit_ptr(), 359 option.bits(), 360 pattern.encoding(), 361 syntax as *const Syntax as *mut Syntax as *mut onig_sys::OnigSyntaxType, 362 &mut error, 363 ) 364 }; 365 366 if err == onig_sys::ONIG_NORMAL as i32 { 367 Ok(Regex { raw: reg }) 368 } else { 369 Err(Error::from_code_and_info(err, &error)) 370 } 371 } 372 373 /// Match String 374 /// 375 /// Try to match the regex against the given string slice, 376 /// starting at a given offset. This method works the same way as 377 /// `match_with_encoding`, but the encoding is always utf-8. 378 /// 379 /// For more information see [Match vs 380 /// Search](index.html#match-vs-search) 381 /// 382 /// # Arguments 383 /// 384 /// * `str` - The string slice to match against. 385 /// * `at` - The byte index in the passed slice to start matching 386 /// * `options` - The regex match options. 387 /// * `region` - The region for return group match range info 388 /// 389 /// # Returns 390 /// 391 /// `Some(len)` if the regex matched, with `len` being the number 392 /// of bytes matched. `None` if the regex doesn't match. 393 /// 394 /// # Examples 395 /// 396 /// ``` 397 /// use onig::{Regex, SearchOptions}; 398 /// 399 /// let r = Regex::new(".*").unwrap(); 400 /// let res = r.match_with_options("hello", 0, SearchOptions::SEARCH_OPTION_NONE, None); 401 /// assert!(res.is_some()); // it matches 402 /// assert!(res.unwrap() == 5); // 5 characters matched 403 /// ``` match_with_options( &self, str: &str, at: usize, options: SearchOptions, region: Option<&mut Region>, ) -> Option<usize>404 pub fn match_with_options( 405 &self, 406 str: &str, 407 at: usize, 408 options: SearchOptions, 409 region: Option<&mut Region>, 410 ) -> Option<usize> { 411 self.match_with_encoding(str, at, options, region) 412 } 413 414 /// Match String with Encoding 415 /// 416 /// Match the regex against a string. 
This method will start at 417 /// the offset `at` into the string and try and match the 418 /// regex. If the regex matches then the return value is the 419 /// number of characters which matched. If the regex doesn't match 420 /// the return is `None`. 421 /// 422 /// For more information see [Match vs 423 /// Search](index.html#match-vs-search) 424 /// 425 /// The contents of `chars` must have the same encoding that was 426 /// used to construct the regex. 427 /// 428 /// # Arguments 429 /// 430 /// * `chars` - The buffer to match against. 431 /// * `at` - The byte index in the passed buffer to start matching 432 /// * `options` - The regex match options. 433 /// * `region` - The region for return group match range info 434 /// 435 /// # Returns 436 /// 437 /// `Some(len)` if the regex matched, with `len` being the number 438 /// of bytes matched. `None` if the regex doesn't match. 439 /// 440 /// # Examples 441 /// 442 /// ``` 443 /// use onig::{Regex, EncodedBytes, SearchOptions}; 444 /// 445 /// let r = Regex::with_encoding(EncodedBytes::ascii(b".*")).unwrap(); 446 /// let res = r.match_with_encoding(EncodedBytes::ascii(b"world"), 447 /// 0, SearchOptions::SEARCH_OPTION_NONE, None); 448 /// assert!(res.is_some()); // it matches 449 /// assert!(res.unwrap() == 5); // 5 characters matched 450 /// ``` match_with_encoding<T>( &self, chars: T, at: usize, options: SearchOptions, region: Option<&mut Region>, ) -> Option<usize> where T: EncodedChars,451 pub fn match_with_encoding<T>( 452 &self, 453 chars: T, 454 at: usize, 455 options: SearchOptions, 456 region: Option<&mut Region>, 457 ) -> Option<usize> 458 where 459 T: EncodedChars, 460 { 461 let match_param = MatchParam::default(); 462 let result = self.match_with_param(chars, at, options, region, match_param); 463 464 match result { 465 Ok(r) => r, 466 Err(e) => panic!("Onig: Regex match error: {}", e.description()), 467 } 468 } 469 470 /// Match string with encoding and match param 471 /// 472 /// Match the 
regex against a string. This method will start at 473 /// the offset `at` into the string and try and match the 474 /// regex. If the regex matches then the return value is the 475 /// number of characters which matched. If the regex doesn't match 476 /// the return is `None`. 477 /// 478 /// For more information see [Match vs 479 /// Search](index.html#match-vs-search) 480 /// 481 /// The contents of `chars` must have the same encoding that was 482 /// used to construct the regex. 483 /// 484 /// # Arguments 485 /// 486 /// * `chars` - The buffer to match against. 487 /// * `at` - The byte index in the passed buffer to start matching 488 /// * `options` - The regex match options. 489 /// * `region` - The region for return group match range info 490 /// * `match_param` - The match parameters 491 /// 492 /// # Returns 493 /// 494 /// `Ok(Some(len))` if the regex matched, with `len` being the number 495 /// of bytes matched. `Ok(None)` if the regex doesn't match. `Err` with an 496 /// `Error` if an error occurred (e.g. retry-limit-in-match exceeded). 
497 /// 498 /// # Examples 499 /// 500 /// ``` 501 /// use onig::{Regex, EncodedBytes, MatchParam, SearchOptions}; 502 /// 503 /// let r = Regex::with_encoding(EncodedBytes::ascii(b".*")).unwrap(); 504 /// let res = r.match_with_param(EncodedBytes::ascii(b"world"), 505 /// 0, SearchOptions::SEARCH_OPTION_NONE, 506 /// None, MatchParam::default()); 507 /// assert!(res.is_ok()); // matching did not error 508 /// assert!(res.unwrap() == Some(5)); // 5 characters matched 509 /// ``` match_with_param<T>( &self, chars: T, at: usize, options: SearchOptions, region: Option<&mut Region>, match_param: MatchParam, ) -> Result<Option<usize>, Error> where T: EncodedChars,510 pub fn match_with_param<T>( 511 &self, 512 chars: T, 513 at: usize, 514 options: SearchOptions, 515 region: Option<&mut Region>, 516 match_param: MatchParam, 517 ) -> Result<Option<usize>, Error> 518 where 519 T: EncodedChars, 520 { 521 if chars.encoding() != self.encoding() { 522 return Err(Error::custom(format!( 523 "Regex encoding does not match haystack encoding ({0:?}, {1:?})", 524 chars.encoding(), 525 self.encoding() 526 ))); 527 } 528 let r = unsafe { 529 let offset = chars.start_ptr().add(at); 530 if offset > chars.limit_ptr() { 531 return Err(Error::custom(format!("Offset {} is too large", at))); 532 } 533 onig_sys::onig_match_with_param( 534 self.raw, 535 chars.start_ptr(), 536 chars.limit_ptr(), 537 offset, 538 match region { 539 Some(region) => region as *mut Region as *mut onig_sys::OnigRegion, 540 None => std::ptr::null_mut(), 541 }, 542 options.bits(), 543 match_param.as_raw(), 544 ) 545 }; 546 547 if r >= 0 { 548 Ok(Some(r as usize)) 549 } else if r == onig_sys::ONIG_MISMATCH { 550 Ok(None) 551 } else { 552 Err(Error::from_code(r)) 553 } 554 } 555 556 /// Search pattern in string 557 /// 558 /// Search for matches the regex in a string. This method will return the 559 /// index of the first match of the regex within the string, if 560 /// there is one. 
If `from` is less than `to`, then search is performed 561 /// in forward order, otherwise – in backward order. 562 /// 563 /// For more information see [Match vs 564 /// Search](index.html#match-vs-search) 565 /// 566 /// # Arguments 567 /// 568 /// * `str` - The string to search in. 569 /// * `from` - The byte index in the passed slice to start search 570 /// * `to` - The byte index in the passed slice to finish search 571 /// * `options` - The options for the search. 572 /// * `region` - The region for return group match range info 573 /// 574 /// # Returns 575 /// 576 /// `Some(pos)` if the regex matches, where `pos` is the 577 /// byte-position of the start of the match. `None` if the regex 578 /// doesn't match anywhere in `str`. 579 /// 580 /// # Examples 581 /// 582 /// ``` 583 /// use onig::{Regex, SearchOptions}; 584 /// 585 /// let r = Regex::new("l{1,2}").unwrap(); 586 /// let res = r.search_with_options("hello", 0, 5, SearchOptions::SEARCH_OPTION_NONE, None); 587 /// assert!(res.is_some()); // it matches 588 /// assert!(res.unwrap() == 2); // match starts at character 3 589 /// ``` search_with_options( &self, str: &str, from: usize, to: usize, options: SearchOptions, region: Option<&mut Region>, ) -> Option<usize>590 pub fn search_with_options( 591 &self, 592 str: &str, 593 from: usize, 594 to: usize, 595 options: SearchOptions, 596 region: Option<&mut Region>, 597 ) -> Option<usize> { 598 self.search_with_encoding(str, from, to, options, region) 599 } 600 601 /// Search for a Pattern in a String with an Encoding 602 /// 603 /// Search for matches the regex in a string. This method will 604 /// return the index of the first match of the regex within the 605 /// string, if there is one. If `from` is less than `to`, then 606 /// search is performed in forward order, otherwise – in backward 607 /// order. 
608 /// 609 /// For more information see [Match vs 610 /// Search](index.html#match-vs-search) 611 /// 612 /// The encoding of the buffer passed to search in must match the 613 /// encoding of the regex. 614 /// 615 /// # Arguments 616 /// 617 /// * `chars` - The character buffer to search in. 618 /// * `from` - The byte index in the passed slice to start search 619 /// * `to` - The byte index in the passed slice to finish search 620 /// * `options` - The options for the search. 621 /// * `region` - The region for return group match range info 622 /// 623 /// # Returns 624 /// 625 /// `Some(pos)` if the regex matches, where `pos` is the 626 /// byte-position of the start of the match. `None` if the regex 627 /// doesn't match anywhere in `chars`. 628 /// 629 /// # Examples 630 /// 631 /// ``` 632 /// use onig::{Regex, EncodedBytes, SearchOptions}; 633 /// 634 /// let r = Regex::with_encoding(EncodedBytes::ascii(b"l{1,2}")).unwrap(); 635 /// let res = r.search_with_encoding(EncodedBytes::ascii(b"hello"), 636 /// 0, 5, SearchOptions::SEARCH_OPTION_NONE, None); 637 /// assert!(res.is_some()); // it matches 638 /// assert!(res.unwrap() == 2); // match starts at character 3 639 /// ``` search_with_encoding<T>( &self, chars: T, from: usize, to: usize, options: SearchOptions, region: Option<&mut Region>, ) -> Option<usize> where T: EncodedChars,640 pub fn search_with_encoding<T>( 641 &self, 642 chars: T, 643 from: usize, 644 to: usize, 645 options: SearchOptions, 646 region: Option<&mut Region>, 647 ) -> Option<usize> 648 where 649 T: EncodedChars, 650 { 651 let match_param = MatchParam::default(); 652 let result = self.search_with_param(chars, from, to, options, region, match_param); 653 654 match result { 655 Ok(r) => r, 656 Err(e) => panic!("Onig: Regex search error: {}", e.description()), 657 } 658 } 659 660 /// Search pattern in string with encoding and match param 661 /// 662 /// Search for matches the regex in a string. 
This method will 663 /// return the index of the first match of the regex within the 664 /// string, if there is one. If `from` is less than `to`, then 665 /// search is performed in forward order, otherwise – in backward 666 /// order. 667 /// 668 /// For more information see [Match vs 669 /// Search](index.html#match-vs-search) 670 /// 671 /// The encoding of the buffer passed to search in must match the 672 /// encoding of the regex. 673 /// 674 /// # Arguments 675 /// 676 /// * `chars` - The character buffer to search in. 677 /// * `from` - The byte index in the passed slice to start search 678 /// * `to` - The byte index in the passed slice to finish search 679 /// * `options` - The options for the search. 680 /// * `region` - The region for return group match range info 681 /// * `match_param` - The match parameters 682 /// 683 /// # Returns 684 /// 685 /// `Ok(Some(pos))` if the regex matches, where `pos` is the 686 /// byte-position of the start of the match. `Ok(None)` if the regex 687 /// doesn't match anywhere in `chars`. `Err` with an `Error` if an error 688 /// occurred (e.g. retry-limit-in-match exceeded). 
689 /// 690 /// # Examples 691 /// 692 /// ``` 693 /// use onig::{Regex, EncodedBytes, MatchParam, SearchOptions}; 694 /// 695 /// let r = Regex::with_encoding(EncodedBytes::ascii(b"l{1,2}")).unwrap(); 696 /// let res = r.search_with_param(EncodedBytes::ascii(b"hello"), 697 /// 0, 5, SearchOptions::SEARCH_OPTION_NONE, 698 /// None, MatchParam::default()); 699 /// assert!(res.is_ok()); // matching did not error 700 /// assert!(res.unwrap() == Some(2)); // match starts at character 3 701 /// ``` search_with_param<T>( &self, chars: T, from: usize, to: usize, options: SearchOptions, region: Option<&mut Region>, match_param: MatchParam, ) -> Result<Option<usize>, Error> where T: EncodedChars,702 pub fn search_with_param<T>( 703 &self, 704 chars: T, 705 from: usize, 706 to: usize, 707 options: SearchOptions, 708 region: Option<&mut Region>, 709 match_param: MatchParam, 710 ) -> Result<Option<usize>, Error> 711 where 712 T: EncodedChars, 713 { 714 let (beg, end) = (chars.start_ptr(), chars.limit_ptr()); 715 if chars.encoding() != self.encoding() { 716 return Err(Error::custom(format!( 717 "Regex encoding does not match haystack encoding ({0:?}, {1:?})", 718 chars.encoding(), 719 self.encoding() 720 ))); 721 } 722 let r = unsafe { 723 let start = beg.add(from); 724 let range = beg.add(to); 725 if start > end { 726 return Err(Error::custom("Start of match should be before end")); 727 } 728 if range > end { 729 return Err(Error::custom("Limit of match should be before end")); 730 } 731 onig_sys::onig_search_with_param( 732 self.raw, 733 beg, 734 end, 735 start, 736 range, 737 match region { 738 Some(region) => region as *mut Region as *mut onig_sys::OnigRegion, 739 None => std::ptr::null_mut(), 740 }, 741 options.bits(), 742 match_param.as_raw(), 743 ) 744 }; 745 746 if r >= 0 { 747 Ok(Some(r as usize)) 748 } else if r == onig_sys::ONIG_MISMATCH { 749 Ok(None) 750 } else { 751 Err(Error::from_code(r)) 752 } 753 } 754 755 /// Returns true if and only if the regex matches the 
string given. 756 /// 757 /// For more information see [Match vs 758 /// Search](index.html#match-vs-search) 759 /// 760 /// # Arguments 761 /// * `text` - The string slice to test against the pattern. 762 /// 763 /// # Returns 764 /// 765 /// `true` if the pattern matches the whole of `text`, `false` otherwise. is_match(&self, text: &str) -> bool766 pub fn is_match(&self, text: &str) -> bool { 767 self.match_with_options(text, 0, SearchOptions::SEARCH_OPTION_NONE, None) 768 .map(|r| r == text.len()) 769 .unwrap_or(false) 770 } 771 772 /// Find a Match in a Buffer, With Encoding 773 /// 774 /// Finds the first match of the regular expression within the 775 /// buffer. 776 /// 777 /// Note that this should only be used if you want to discover the 778 /// position of the match within a string. Testing if a pattern 779 /// matches the whole string is faster if you use `is_match`. For 780 /// more information see [Match vs 781 /// Search](index.html#match-vs-search) 782 /// 783 /// # Arguments 784 /// * `text` - The text to search in. 785 /// 786 /// # Returns 787 /// 788 /// The offset of the start and end of the first match. If no 789 /// match exists `None` is returned. find(&self, text: &str) -> Option<(usize, usize)>790 pub fn find(&self, text: &str) -> Option<(usize, usize)> { 791 self.find_with_encoding(text) 792 } 793 794 /// Find a Match in a Buffer, With Encoding 795 /// 796 /// Finds the first match of the regular expression within the 797 /// buffer. 798 /// 799 /// For more information see [Match vs 800 /// Search](index.html#match-vs-search) 801 /// 802 /// # Arguments 803 /// * `text` - The text to search in. 804 /// 805 /// # Returns 806 /// 807 /// The offset of the start and end of the first match. If no 808 /// match exists `None` is returned. 
pub fn find_with_encoding<T>(&self, text: T) -> Option<(usize, usize)>
    where
        T: EncodedChars,
    {
        let mut groups = Region::new();
        let end = text.len();

        // Forward search across the whole buffer; bail out on "no match".
        self.search_with_encoding(
            text,
            0,
            end,
            SearchOptions::SEARCH_OPTION_NONE,
            Some(&mut groups),
        )?;

        // Group 0 of the filled region holds the span of the match.
        groups.pos(0)
    }

    /// Get the Encoding of the Regex
    ///
    /// # Returns
    ///
    /// The Oniguruma encoding handle that was used when this regex
    /// was created.
    pub fn encoding(&self) -> onig_sys::OnigEncoding {
        let encoding = unsafe { onig_sys::onig_get_encoding(self.raw) };
        encoding
    }

    /// Get the Number of Capture Groups in this Pattern
    pub fn captures_len(&self) -> usize {
        let count = unsafe { onig_sys::onig_number_of_captures(self.raw) };
        count as usize
    }

    /// Get the Size of the Capture Histories for this Pattern
    pub fn capture_histories_len(&self) -> usize {
        let count = unsafe { onig_sys::onig_number_of_capture_histories(self.raw) };
        count as usize
    }
}

impl Drop for Regex {
    /// Hands the compiled regex back to Oniguruma via `onig_free`.
    fn drop(&mut self) {
        unsafe { onig_sys::onig_free(self.raw) };
    }
}

#[cfg(test)]
mod tests {
    use super::*;
    use std::panic;

    // Pattern/input pair for which the tests below expect the engine to
    // report "retry-limit-in-match over".
    const BACKTRACKING_PATTERN: &str = "(a|b|ab)*bc";
    const BACKTRACKING_INPUT: &str =
        "ababababababababababababababababababababababababababababacbc";

    // Four-byte input shared by the invalid-range tests.
    const RUBY: &str = "Ruby";

    // Regex shared by the invalid-range tests.
    fn ruby_regex() -> Regex {
        Regex::with_options("R...", RegexOptions::REGEX_OPTION_NONE, Syntax::default())
            .expect("regex")
    }

    #[test]
    fn test_regex_create() {
        Regex::with_options(".*", RegexOptions::REGEX_OPTION_NONE, Syntax::default()).unwrap();

        Regex::new(r#"a \w+ word"#).unwrap();
    }

    #[test]
    fn test_regex_invalid() {
        let e = Regex::new("\\p{foo}").unwrap_err();
        assert_eq!(e.code(), -223);
        assert_eq!(e.description(), "invalid character property name {foo}");
    }

    #[test]
    fn test_failed_match() {
        let regex = Regex::new("foo").unwrap();
        let res = regex.match_with_options("bar", 0, SearchOptions::SEARCH_OPTION_NONE, None);
        assert!(res.is_none());
    }

    #[test]
    fn test_regex_search_with_options() {
        let regex = Regex::new("e(l+)").unwrap();
        let mut region = Region::new();

        let r = regex.search_with_options(
            "hello",
            0,
            5,
            SearchOptions::SEARCH_OPTION_NONE,
            Some(&mut region),
        );

        assert!(region.tree().is_none());
        assert_eq!(r, Some(1));
        assert_eq!(region.len(), 2);
        let whole = region.pos(0).unwrap();
        let group = region.pos(1).unwrap();
        assert_eq!(whole, (1, 4));
        assert_eq!(group, (2, 4));

        // test cloning here since we already have a filled region
        let cloned_region = region.clone();
        assert_eq!(cloned_region.pos(0).unwrap(), whole);
    }

    #[test]
    fn test_regex_match_with_options() {
        let regex = Regex::new("he(l+)").unwrap();
        let mut region = Region::new();

        let r = regex.match_with_options(
            "hello",
            0,
            SearchOptions::SEARCH_OPTION_NONE,
            Some(&mut region),
        );

        assert!(region.tree().is_none());
        assert_eq!(r, Some(4));
        assert_eq!(region.len(), 2);
        let whole = region.pos(0).unwrap();
        let group = region.pos(1).unwrap();
        assert_eq!(whole, (0, 4));
        assert_eq!(group, (2, 4));
    }

    #[test]
    fn test_regex_is_match() {
        let regex = Regex::new("he(l+)o").unwrap();
        assert!(regex.is_match("hello"));
        assert!(!regex.is_match("hello 2.0"));
    }

    #[test]
    fn test_regex_find() {
        let regex = Regex::new("he(l+)o").unwrap();
        assert_eq!(regex.find("hey, hello!"), Some((5, 10)));
        assert_eq!(regex.find("hey, honey!"), None);
    }

    #[test]
    fn test_regex_captures_len() {
        let regex = Regex::new("(he)(l+)(o)").unwrap();
        assert_eq!(regex.captures_len(), 3);
    }

    #[test]
    fn test_regex_error_is_match() {
        let regex = Regex::new(BACKTRACKING_PATTERN).unwrap();
        let result = regex.match_with_param(
            BACKTRACKING_INPUT,
            0,
            SearchOptions::SEARCH_OPTION_NONE,
            None,
            MatchParam::default(),
        );

        let e = result.err().unwrap();
        assert_eq!("retry-limit-in-match over", e.description());
    }

    #[test]
    fn test_regex_panic_is_match() {
        let regex = Regex::new(BACKTRACKING_PATTERN).unwrap();
        let result = panic::catch_unwind(|| regex.is_match(BACKTRACKING_INPUT));
        let e = result.err().unwrap();
        let message = e.downcast_ref::<String>().unwrap();
        assert_eq!(
            message.as_str(),
            "Onig: Regex match error: retry-limit-in-match over"
        );
    }

    #[test]
    fn test_regex_error_find() {
        let regex = Regex::new(BACKTRACKING_PATTERN).unwrap();
        let result = regex.search_with_param(
            BACKTRACKING_INPUT,
            0,
            BACKTRACKING_INPUT.len(),
            SearchOptions::SEARCH_OPTION_NONE,
            None,
            MatchParam::default(),
        );

        let e = result.err().unwrap();
        assert_eq!("retry-limit-in-match over", e.description());
    }

    #[test]
    fn test_regex_panic_find() {
        let regex = Regex::new(BACKTRACKING_PATTERN).unwrap();
        let result = panic::catch_unwind(|| regex.find(BACKTRACKING_INPUT));
        let e = result.err().unwrap();
        let message = e.downcast_ref::<String>().unwrap();
        assert_eq!(
            message.as_str(),
            "Onig: Regex search error: retry-limit-in-match over"
        );
    }

    #[test]
    fn test_search_with_invalid_range() {
        let regex = ruby_regex();

        // Start offset beyond the end of the haystack.
        let is_match = regex.search_with_param(
            RUBY,
            5,
            RUBY.len(),
            SearchOptions::SEARCH_OPTION_NONE,
            None,
            MatchParam::default(),
        );
        assert!(is_match.is_err());

        // Range limit beyond the end of the haystack.
        let is_match = regex.search_with_param(
            RUBY,
            2,
            RUBY.len() + 1,
            SearchOptions::SEARCH_OPTION_NONE,
            None,
            MatchParam::default(),
        );
        assert!(is_match.is_err());
    }

    #[test]
    fn test_search_with_invalid_range_panic() {
        let regex = ruby_regex();
        let is_match = panic::catch_unwind(|| {
            regex.search_with_encoding(
                RUBY,
                5,
                RUBY.len(),
                SearchOptions::SEARCH_OPTION_NONE,
                None,
            )
        });
        assert!(is_match.is_err());
    }

    #[test]
    fn test_match_with_invalid_range() {
        let regex = ruby_regex();
        let is_match = regex.match_with_param(
            RUBY,
            5,
            SearchOptions::SEARCH_OPTION_NONE,
            None,
            MatchParam::default(),
        );
        assert!(is_match.is_err());
    }

    #[test]
    fn test_match_with_invalid_range_panic() {
        let regex = ruby_regex();
        let is_match = panic::catch_unwind(|| {
            regex.match_with_encoding(RUBY, 5, SearchOptions::SEARCH_OPTION_NONE, None)
        });
        assert!(is_match.is_err());
    }
}