1 //! This crate provides a safe wrapper around the
2 //! [Oniguruma](https://github.com/kkos/oniguruma) regular expression library.
3 //!
4 //! # Examples
5 //!
6 //! ```rust
7 //! use onig::Regex;
8 //!
9 //! let regex = Regex::new("e(l+)").unwrap();
10 //! for (i, pos) in regex.captures("hello").unwrap().iter_pos().enumerate() {
11 //!     match pos {
12 //!          Some((beg, end)) =>
13 //!              println!("Group {} captured in position {}:{}", i, beg, end),
14 //!          None =>
15 //!              println!("Group {} is not captured", i)
16 //!     }
17 //! }
18 //! ```
19 //!
20 //! # Match vs Search
21 //!
//! There are two basic things you can do with a `Regex` pattern: test
//! if the pattern matches the whole of a given string, and search for
//! occurrences of the pattern within a string. Oniguruma exposes these
//! two concepts with the *match* and *search* APIs.
26 //!
//! In addition to these two base Oniguruma APIs this crate exposes a
//! third *find* API, built on top of the *search* API.
29 //!
30 //! ```
31 //! # use onig::Regex;
32 //! let pattern = Regex::new("hello").unwrap();
33 //! assert_eq!(true, pattern.find("hello world").is_some());
34 //! assert_eq!(false, pattern.is_match("hello world"));
35 //! ```
36 //!
37 //! ## The *Match* API
38 //!
//! Functions in the match API check if a pattern matches the entire
//! string. The simplest of these is `Regex::is_match`. This returns
//! `true` if the pattern matches the string. For more complex usage,
//! `Regex::match_with_options` and `Regex::match_with_encoding`
//! can be used. These allow the capture groups to be inspected,
//! matching with different options, and matching sub-sections of a
//! given text.
46 //!
47 //! ## The *Search* API
48 //!
//! Functions in the search API search for a pattern anywhere within a
//! string. The simplest of these is `Regex::find`. This returns the
//! offset of the first occurrence of the pattern within the string.
//! For more complex usage `Regex::search_with_options` and
//! `Regex::search_with_encoding` can be used. These allow capture
//! groups to be inspected, searching with different options and
//! searching within subsections of a given text.
56 //!
57 //! ## The *Find* API
58 //!
59 //! The find API is built on top of the search API. Functions in this
60 //! API allow iteration across all matches of the pattern within a
61 //! string, not just the first one. The functions deal with some of
62 //! the complexities of this, such as zero-length matches.
63 //!
64 //! The simplest step-up from the basic search API `Regex::find` is
65 //! getting the captures relating to a match with the
66 //! `Regex::captures` method. To find capture information for all
67 //! matches within a string `Regex::find_iter` and
68 //! `Regex::captures_iter` can be used. The former exposes the start
69 //! and end of the match as `Regex::find` does, the latter exposes the
70 //! whole capture group information as `Regex::captures` does.
71 //!
72 //! # The `std::pattern` API
73 //!
74 //! In addition to the main Oniguruma API it is possible to use the
75 //! `Regex` object with the
76 //! [`std::pattern`](https://doc.rust-lang.org/std/str/pattern/)
77 //! API. To enable support compile with the `std-pattern` feature. If
78 //! you're using Cargo you can do this by adding the following to your
79 //! Cargo.toml:
80 //!
81 //! ```toml
82 //! [dependencies.onig]
83 //! version = "1.2"
84 //! features = ["std-pattern"]
85 //! ```
86 
87 #![cfg_attr(not(feature = "cargo-clippy"), allow(unknown_lints))]
88 #![cfg_attr(feature = "std-pattern", feature(pattern))]
89 #![deny(missing_docs)]
90 
91 #[macro_use]
92 extern crate bitflags;
93 #[macro_use]
94 extern crate lazy_static;
95 #[cfg(windows)]
96 extern crate libc;
97 extern crate onig_sys;
98 
99 mod buffers;
100 mod find;
101 mod flags;
102 mod match_param;
103 mod names;
104 mod region;
105 mod replace;
106 mod syntax;
107 mod tree;
108 mod utils;
109 
110 #[cfg(feature = "std-pattern")]
111 mod pattern;
112 
// re-export the onig types publicly
114 pub use buffers::{EncodedBytes, EncodedChars};
115 pub use find::{
116     Captures, FindCaptures, FindMatches, RegexSplits, RegexSplitsN, SubCaptures, SubCapturesPos,
117 };
118 pub use flags::*;
119 pub use match_param::MatchParam;
120 pub use region::Region;
121 pub use replace::Replacer;
122 pub use syntax::{MetaChar, Syntax};
123 pub use tree::{CaptureTreeNode, CaptureTreeNodeIter};
124 pub use utils::{copyright, define_user_property, version};
125 
126 use std::os::raw::c_int;
127 use std::ptr::{null, null_mut};
128 use std::sync::Mutex;
129 use std::{error, fmt, str};
130 
/// Records where an `Error` came from: the Oniguruma engine or this crate.
#[derive(Debug)]
enum ErrorData {
    /// The raw status code returned by an Oniguruma call.
    OnigError(c_int),
    /// An error raised by this wrapper itself; it carries no engine code.
    Custom,
}
136 
137 /// This struture represents an error from the underlying Oniguruma libray.
138 pub struct Error {
139     data: ErrorData,
140     description: String,
141 }
142 
143 /// This struct is a wrapper around an Oniguruma regular expression
144 /// pointer. This represents a compiled regex which can be used in
145 /// search and match operations.
146 #[derive(Debug, Eq, PartialEq)]
147 pub struct Regex {
148     raw: onig_sys::OnigRegex,
149 }
150 
151 unsafe impl Send for Regex {}
152 unsafe impl Sync for Regex {}
153 
154 impl Error {
from_code_and_info(code: c_int, info: &onig_sys::OnigErrorInfo) -> Self155     fn from_code_and_info(code: c_int, info: &onig_sys::OnigErrorInfo) -> Self {
156         Error::new(code, info)
157     }
158 
from_code(code: c_int) -> Self159     fn from_code(code: c_int) -> Self {
160         Error::new(code, null())
161     }
162 
custom<T: Into<String>>(message: T) -> Self163     fn custom<T: Into<String>>(message: T) -> Self {
164         Error {
165             data: ErrorData::Custom,
166             description: message.into(),
167         }
168     }
169 
new(code: c_int, info: *const onig_sys::OnigErrorInfo) -> Self170     fn new(code: c_int, info: *const onig_sys::OnigErrorInfo) -> Self {
171         let buff = &mut [0; onig_sys::ONIG_MAX_ERROR_MESSAGE_LEN as usize];
172         let len = unsafe { onig_sys::onig_error_code_to_str(buff.as_mut_ptr(), code, info) };
173         let description = if let Ok(description) = str::from_utf8(&buff[..len as usize]) {
174             description
175         } else {
176             return Self::custom("Onig error string was invalid UTF-8");
177         };
178         Error {
179             data: ErrorData::OnigError(code),
180             description: description.to_owned(),
181         }
182     }
183 
184     /// Return Oniguruma engine error code.
code(&self) -> i32185     pub fn code(&self) -> i32 {
186         match self.data {
187             ErrorData::OnigError(code) => code,
188             _ => -1,
189         }
190     }
191 
192     /// Return error description provided by Oniguruma engine.
description(&self) -> &str193     pub fn description(&self) -> &str {
194         &self.description
195     }
196 }
197 
198 impl error::Error for Error {
description(&self) -> &str199     fn description(&self) -> &str {
200         &self.description
201     }
202 }
203 
204 impl fmt::Display for Error {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result205     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
206         write!(f, "Oniguruma error: {}", self.description())
207     }
208 }
209 
210 impl fmt::Debug for Error {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result211     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
212         write!(f, "Error({:?}, {})", self.data, self.description())
213     }
214 }
215 
216 lazy_static! {
217     static ref REGEX_NEW_MUTEX: Mutex<()> = Mutex::new(());
218 }
219 
220 impl Regex {
221     /// Create a Regex
222     ///
223     /// Simple regular expression constructor. Compiles a new regular
224     /// expression with the default options using the ruby syntax.
225     /// Once compiled, it can be used repeatedly to search in a string. If an
226     /// invalid expression is given, then an error is returned.
227     ///
228     /// # Arguments
229     ///
230     /// * `pattern` - The regex pattern to compile
231     ///
232     /// # Examples
233     ///
234     /// ```
235     /// use onig::Regex;
236     /// let r = Regex::new(r#"hello (\w+)"#);
237     /// assert!(r.is_ok());
238     /// ```
new(pattern: &str) -> Result<Self, Error>239     pub fn new(pattern: &str) -> Result<Self, Error> {
240         Regex::with_encoding(pattern)
241     }
242 
243     /// Create a Regex, Specifying an Encoding
244     ///
245     /// Attempts to compile `pattern` into a new `Regex`
246     /// instance. Instead of assuming UTF-8 as the encoding scheme the
247     /// encoding is inferred from the `pattern` buffer.
248     ///
249     /// # Arguments
250     ///
251     /// * `pattern` - The regex pattern to compile
252     ///
253     /// # Examples
254     ///
255     /// ```
256     /// use onig::{Regex, EncodedBytes};
257     /// let utf8 = Regex::with_encoding("hello");
258     /// assert!(utf8.is_ok());
259     /// let ascii = Regex::with_encoding(EncodedBytes::ascii(b"world"));
260     /// assert!(ascii.is_ok());
261     /// ```
with_encoding<T>(pattern: T) -> Result<Regex, Error> where T: EncodedChars,262     pub fn with_encoding<T>(pattern: T) -> Result<Regex, Error>
263     where
264         T: EncodedChars,
265     {
266         Regex::with_options_and_encoding(
267             pattern,
268             RegexOptions::REGEX_OPTION_NONE,
269             Syntax::default(),
270         )
271     }
272 
273     /// Create a new Regex
274     ///
275     /// Attempts to compile a pattern into a new `Regex` instance.
276     /// Once compiled, it can be used repeatedly to search in a string. If an
277     /// invalid expression is given, then an error is returned.
278     /// See [`onig_sys::onig_new`][regex_new] for more information.
279     ///
280     /// # Arguments
281     ///
282     ///  * `pattern` - The regex pattern to compile.
283     ///  * `options` - The regex compilation options.
284     ///  * `syntax`  - The syntax which the regex is written in.
285     ///
286     /// # Examples
287     ///
288     /// ```
289     /// use onig::{Regex, Syntax, RegexOptions};
290     /// let r = Regex::with_options("hello.*world",
291     ///                             RegexOptions::REGEX_OPTION_NONE,
292     ///                             Syntax::default());
293     /// assert!(r.is_ok());
294     /// ```
295     ///
296     /// [regex_new]: ./onig_sys/fn.onig_new.html
with_options( pattern: &str, option: RegexOptions, syntax: &Syntax, ) -> Result<Regex, Error>297     pub fn with_options(
298         pattern: &str,
299         option: RegexOptions,
300         syntax: &Syntax,
301     ) -> Result<Regex, Error> {
302         Regex::with_options_and_encoding(pattern, option, syntax)
303     }
304 
    /// Create a new Regex, Specifying Options and Encoding
    ///
    /// Attempts to compile the given `pattern` into a new `Regex`
308     /// instance. Instead of assuming UTF-8 as the encoding scheme the
309     /// encoding is inferred from the `pattern` buffer. If the regex
310     /// fails to compile the returned `Error` value from
311     /// [`onig_new`][regex_new] contains more information.
312     ///
313     /// [regex_new]: ./onig_sys/fn.onig_new.html
314     ///
315     /// # Arguments
316     ///
317     ///  * `pattern` - The regex pattern to compile.
318     ///  * `options` - The regex compilation options.
319     ///  * `syntax`  - The syntax which the regex is written in.
320     ///
321     /// # Examples
322     /// ```
323     /// use onig::{Regex, Syntax, EncodedBytes, RegexOptions};
324     /// let pattern = EncodedBytes::ascii(b"hello");
325     /// let r = Regex::with_options_and_encoding(pattern,
326     ///                                          RegexOptions::REGEX_OPTION_SINGLELINE,
327     ///                                          Syntax::default());
328     /// assert!(r.is_ok());
329     /// ```
with_options_and_encoding<T>( pattern: T, option: RegexOptions, syntax: &Syntax, ) -> Result<Self, Error> where T: EncodedChars,330     pub fn with_options_and_encoding<T>(
331         pattern: T,
332         option: RegexOptions,
333         syntax: &Syntax,
334     ) -> Result<Self, Error>
335     where
336         T: EncodedChars,
337     {
338         // Convert the rust types to those required for the call to
339         // `onig_new`.
340         let mut reg: onig_sys::OnigRegex = null_mut();
341         let reg_ptr = &mut reg as *mut onig_sys::OnigRegex;
342 
343         // We can use this later to get an error message to pass back
344         // if regex creation fails.
345         let mut error = onig_sys::OnigErrorInfo {
346             enc: null_mut(),
347             par: null_mut(),
348             par_end: null_mut(),
349         };
350 
351         let err = unsafe {
352             // Grab a lock to make sure that `onig_new` isn't called by
353             // more than one thread at a time.
354             let _guard = REGEX_NEW_MUTEX.lock().unwrap();
355             onig_sys::onig_new(
356                 reg_ptr,
357                 pattern.start_ptr(),
358                 pattern.limit_ptr(),
359                 option.bits(),
360                 pattern.encoding(),
361                 syntax as *const Syntax as *mut Syntax as *mut onig_sys::OnigSyntaxType,
362                 &mut error,
363             )
364         };
365 
366         if err == onig_sys::ONIG_NORMAL as i32 {
367             Ok(Regex { raw: reg })
368         } else {
369             Err(Error::from_code_and_info(err, &error))
370         }
371     }
372 
373     /// Match String
374     ///
375     /// Try to match the regex against the given string slice,
376     /// starting at a given offset. This method works the same way as
377     /// `match_with_encoding`, but the encoding is always utf-8.
378     ///
379     /// For more information see [Match vs
380     /// Search](index.html#match-vs-search)
381     ///
382     /// # Arguments
383     ///
384     /// * `str` - The string slice to match against.
385     /// * `at` - The byte index in the passed slice to start matching
386     /// * `options` - The regex match options.
387     /// * `region` - The region for return group match range info
388     ///
389     /// # Returns
390     ///
391     /// `Some(len)` if the regex matched, with `len` being the number
392     /// of bytes matched. `None` if the regex doesn't match.
393     ///
394     /// # Examples
395     ///
396     /// ```
397     /// use onig::{Regex, SearchOptions};
398     ///
399     /// let r = Regex::new(".*").unwrap();
400     /// let res = r.match_with_options("hello", 0, SearchOptions::SEARCH_OPTION_NONE, None);
401     /// assert!(res.is_some()); // it matches
402     /// assert!(res.unwrap() == 5); // 5 characters matched
403     /// ```
match_with_options( &self, str: &str, at: usize, options: SearchOptions, region: Option<&mut Region>, ) -> Option<usize>404     pub fn match_with_options(
405         &self,
406         str: &str,
407         at: usize,
408         options: SearchOptions,
409         region: Option<&mut Region>,
410     ) -> Option<usize> {
411         self.match_with_encoding(str, at, options, region)
412     }
413 
414     /// Match String with Encoding
415     ///
416     /// Match the regex against a string. This method will start at
417     /// the offset `at` into the string and try and match the
418     /// regex. If the regex matches then the return value is the
    /// number of bytes which matched. If the regex doesn't match
420     /// the return is `None`.
421     ///
422     /// For more information see [Match vs
423     /// Search](index.html#match-vs-search)
424     ///
425     /// The contents of `chars` must have the same encoding that was
426     /// used to construct the regex.
427     ///
428     /// # Arguments
429     ///
430     /// * `chars` - The buffer to match against.
431     /// * `at` - The byte index in the passed buffer to start matching
432     /// * `options` - The regex match options.
433     /// * `region` - The region for return group match range info
434     ///
435     /// # Returns
436     ///
437     /// `Some(len)` if the regex matched, with `len` being the number
438     /// of bytes matched. `None` if the regex doesn't match.
439     ///
440     /// # Examples
441     ///
442     /// ```
443     /// use onig::{Regex, EncodedBytes, SearchOptions};
444     ///
445     /// let r = Regex::with_encoding(EncodedBytes::ascii(b".*")).unwrap();
446     /// let res = r.match_with_encoding(EncodedBytes::ascii(b"world"),
447     ///                                 0, SearchOptions::SEARCH_OPTION_NONE, None);
448     /// assert!(res.is_some()); // it matches
449     /// assert!(res.unwrap() == 5); // 5 characters matched
450     /// ```
match_with_encoding<T>( &self, chars: T, at: usize, options: SearchOptions, region: Option<&mut Region>, ) -> Option<usize> where T: EncodedChars,451     pub fn match_with_encoding<T>(
452         &self,
453         chars: T,
454         at: usize,
455         options: SearchOptions,
456         region: Option<&mut Region>,
457     ) -> Option<usize>
458     where
459         T: EncodedChars,
460     {
461         let match_param = MatchParam::default();
462         let result = self.match_with_param(chars, at, options, region, match_param);
463 
464         match result {
465             Ok(r) => r,
466             Err(e) => panic!("Onig: Regex match error: {}", e.description()),
467         }
468     }
469 
470     /// Match string with encoding and match param
471     ///
472     /// Match the regex against a string. This method will start at
473     /// the offset `at` into the string and try and match the
474     /// regex. If the regex matches then the return value is the
    /// number of bytes which matched. If the regex doesn't match
476     /// the return is `None`.
477     ///
478     /// For more information see [Match vs
479     /// Search](index.html#match-vs-search)
480     ///
481     /// The contents of `chars` must have the same encoding that was
482     /// used to construct the regex.
483     ///
484     /// # Arguments
485     ///
486     /// * `chars` - The buffer to match against.
487     /// * `at` - The byte index in the passed buffer to start matching
488     /// * `options` - The regex match options.
489     /// * `region` - The region for return group match range info
490     /// * `match_param` - The match parameters
491     ///
492     /// # Returns
493     ///
494     /// `Ok(Some(len))` if the regex matched, with `len` being the number
495     /// of bytes matched. `Ok(None)` if the regex doesn't match. `Err` with an
496     /// `Error` if an error occurred (e.g. retry-limit-in-match exceeded).
497     ///
498     /// # Examples
499     ///
500     /// ```
501     /// use onig::{Regex, EncodedBytes, MatchParam, SearchOptions};
502     ///
503     /// let r = Regex::with_encoding(EncodedBytes::ascii(b".*")).unwrap();
504     /// let res = r.match_with_param(EncodedBytes::ascii(b"world"),
505     ///                              0, SearchOptions::SEARCH_OPTION_NONE,
506     ///                              None, MatchParam::default());
507     /// assert!(res.is_ok()); // matching did not error
508     /// assert!(res.unwrap() == Some(5)); // 5 characters matched
509     /// ```
match_with_param<T>( &self, chars: T, at: usize, options: SearchOptions, region: Option<&mut Region>, match_param: MatchParam, ) -> Result<Option<usize>, Error> where T: EncodedChars,510     pub fn match_with_param<T>(
511         &self,
512         chars: T,
513         at: usize,
514         options: SearchOptions,
515         region: Option<&mut Region>,
516         match_param: MatchParam,
517     ) -> Result<Option<usize>, Error>
518     where
519         T: EncodedChars,
520     {
521         if chars.encoding() != self.encoding() {
522             return Err(Error::custom(format!("Regex encoding does not match haystack encoding ({0:?}, {1:?})", chars.encoding(), self.encoding())));
523         }
524         let r = unsafe {
525             let offset = chars.start_ptr().add(at);
526             if offset > chars.limit_ptr() {
527                 return Err(Error::custom(format!("Offset {} is too large", at)));
528             }
529             onig_sys::onig_match_with_param(
530                 self.raw,
531                 chars.start_ptr(),
532                 chars.limit_ptr(),
533                 offset,
534                 match region {
535                     Some(region) => region as *mut Region as *mut onig_sys::OnigRegion,
536                     None => std::ptr::null_mut(),
537                 },
538                 options.bits(),
539                 match_param.as_raw(),
540             )
541         };
542 
543         if r >= 0 {
544             Ok(Some(r as usize))
545         } else if r == onig_sys::ONIG_MISMATCH {
546             Ok(None)
547         } else {
548             Err(Error::from_code(r))
549         }
550     }
551 
552     /// Search pattern in string
553     ///
    /// Search for a match of the regex in a string. This method will return the
555     /// index of the first match of the regex within the string, if
556     /// there is one. If `from` is less than `to`, then search is performed
557     /// in forward order, otherwise – in backward order.
558     ///
559     /// For more information see [Match vs
560     /// Search](index.html#match-vs-search)
561     ///
562     /// # Arguments
563     ///
564     ///  * `str` - The string to search in.
565     ///  * `from` - The byte index in the passed slice to start search
566     ///  * `to` - The byte index in the passed slice to finish search
567     ///  * `options` - The options for the search.
568     ///  * `region` - The region for return group match range info
569     ///
570     /// # Returns
571     ///
572     /// `Some(pos)` if the regex matches, where `pos` is the
573     /// byte-position of the start of the match. `None` if the regex
574     /// doesn't match anywhere in `str`.
575     ///
576     /// # Examples
577     ///
578     /// ```
579     /// use onig::{Regex, SearchOptions};
580     ///
581     /// let r = Regex::new("l{1,2}").unwrap();
582     /// let res = r.search_with_options("hello", 0, 5, SearchOptions::SEARCH_OPTION_NONE, None);
583     /// assert!(res.is_some()); // it matches
584     /// assert!(res.unwrap() == 2); // match starts at character 3
585     /// ```
search_with_options( &self, str: &str, from: usize, to: usize, options: SearchOptions, region: Option<&mut Region>, ) -> Option<usize>586     pub fn search_with_options(
587         &self,
588         str: &str,
589         from: usize,
590         to: usize,
591         options: SearchOptions,
592         region: Option<&mut Region>,
593     ) -> Option<usize> {
594         self.search_with_encoding(str, from, to, options, region)
595     }
596 
597     /// Search for a Pattern in a String with an Encoding
598     ///
    /// Search for a match of the regex in a string. This method will
600     /// return the index of the first match of the regex within the
601     /// string, if there is one. If `from` is less than `to`, then
602     /// search is performed in forward order, otherwise – in backward
603     /// order.
604     ///
605     /// For more information see [Match vs
606     /// Search](index.html#match-vs-search)
607     ///
608     /// The encoding of the buffer passed to search in must match the
609     /// encoding of the regex.
610     ///
611     /// # Arguments
612     ///
613     ///  * `chars` - The character buffer to search in.
614     ///  * `from` - The byte index in the passed slice to start search
615     ///  * `to` - The byte index in the passed slice to finish search
616     ///  * `options` - The options for the search.
617     ///  * `region` - The region for return group match range info
618     ///
619     /// # Returns
620     ///
621     /// `Some(pos)` if the regex matches, where `pos` is the
622     /// byte-position of the start of the match. `None` if the regex
623     /// doesn't match anywhere in `chars`.
624     ///
625     /// # Examples
626     ///
627     /// ```
628     /// use onig::{Regex, EncodedBytes, SearchOptions};
629     ///
630     /// let r = Regex::with_encoding(EncodedBytes::ascii(b"l{1,2}")).unwrap();
631     /// let res = r.search_with_encoding(EncodedBytes::ascii(b"hello"),
632     ///                                  0, 5, SearchOptions::SEARCH_OPTION_NONE, None);
633     /// assert!(res.is_some()); // it matches
634     /// assert!(res.unwrap() == 2); // match starts at character 3
635     /// ```
search_with_encoding<T>( &self, chars: T, from: usize, to: usize, options: SearchOptions, region: Option<&mut Region>, ) -> Option<usize> where T: EncodedChars,636     pub fn search_with_encoding<T>(
637         &self,
638         chars: T,
639         from: usize,
640         to: usize,
641         options: SearchOptions,
642         region: Option<&mut Region>,
643     ) -> Option<usize>
644     where
645         T: EncodedChars,
646     {
647         let match_param = MatchParam::default();
648         let result = self.search_with_param(chars, from, to, options, region, match_param);
649 
650         match result {
651             Ok(r) => r,
652             Err(e) => panic!("Onig: Regex search error: {}", e.description()),
653         }
654     }
655 
656     /// Search pattern in string with encoding and match param
657     ///
    /// Search for a match of the regex in a string. This method will
659     /// return the index of the first match of the regex within the
660     /// string, if there is one. If `from` is less than `to`, then
661     /// search is performed in forward order, otherwise – in backward
662     /// order.
663     ///
664     /// For more information see [Match vs
665     /// Search](index.html#match-vs-search)
666     ///
667     /// The encoding of the buffer passed to search in must match the
668     /// encoding of the regex.
669     ///
670     /// # Arguments
671     ///
672     ///  * `chars` - The character buffer to search in.
673     ///  * `from` - The byte index in the passed slice to start search
674     ///  * `to` - The byte index in the passed slice to finish search
675     ///  * `options` - The options for the search.
676     ///  * `region` - The region for return group match range info
677     ///  * `match_param` - The match parameters
678     ///
679     /// # Returns
680     ///
681     /// `Ok(Some(pos))` if the regex matches, where `pos` is the
682     /// byte-position of the start of the match. `Ok(None)` if the regex
683     /// doesn't match anywhere in `chars`. `Err` with an `Error` if an error
684     /// occurred (e.g. retry-limit-in-match exceeded).
685     ///
686     /// # Examples
687     ///
688     /// ```
689     /// use onig::{Regex, EncodedBytes, MatchParam, SearchOptions};
690     ///
691     /// let r = Regex::with_encoding(EncodedBytes::ascii(b"l{1,2}")).unwrap();
692     /// let res = r.search_with_param(EncodedBytes::ascii(b"hello"),
693     ///                               0, 5, SearchOptions::SEARCH_OPTION_NONE,
694     ///                               None, MatchParam::default());
695     /// assert!(res.is_ok()); // matching did not error
696     /// assert!(res.unwrap() == Some(2)); // match starts at character 3
697     /// ```
search_with_param<T>( &self, chars: T, from: usize, to: usize, options: SearchOptions, region: Option<&mut Region>, match_param: MatchParam, ) -> Result<Option<usize>, Error> where T: EncodedChars,698     pub fn search_with_param<T>(
699         &self,
700         chars: T,
701         from: usize,
702         to: usize,
703         options: SearchOptions,
704         region: Option<&mut Region>,
705         match_param: MatchParam,
706     ) -> Result<Option<usize>, Error>
707     where
708         T: EncodedChars,
709     {
710         let (beg, end) = (chars.start_ptr(), chars.limit_ptr());
711         if chars.encoding() != self.encoding() {
712             return Err(Error::custom(format!("Regex encoding does not match haystack encoding ({0:?}, {1:?})", chars.encoding(), self.encoding())));
713         }
714         let r = unsafe {
715             let start = beg.add(from);
716             let range = beg.add(to);
717             if start > end {
718                 return Err(Error::custom("Start of match should be before end"));
719             }
720             if range > end {
721                 return Err(Error::custom("Limit of match should be before end"));
722             }
723             onig_sys::onig_search_with_param(
724                 self.raw,
725                 beg,
726                 end,
727                 start,
728                 range,
729                 match region {
730                     Some(region) => region as *mut Region as *mut onig_sys::OnigRegion,
731                     None => std::ptr::null_mut(),
732                 },
733                 options.bits(),
734                 match_param.as_raw(),
735             )
736         };
737 
738         if r >= 0 {
739             Ok(Some(r as usize))
740         } else if r == onig_sys::ONIG_MISMATCH {
741             Ok(None)
742         } else {
743             Err(Error::from_code(r))
744         }
745     }
746 
747     /// Returns true if and only if the regex matches the string given.
748     ///
749     /// For more information see [Match vs
750     /// Search](index.html#match-vs-search)
751     ///
752     /// # Arguments
753     ///  * `text` - The string slice to test against the pattern.
754     ///
755     /// # Returns
756     ///
757     /// `true` if the pattern matches the whole of `text`, `false` otherwise.
is_match(&self, text: &str) -> bool758     pub fn is_match(&self, text: &str) -> bool {
759         self.match_with_options(text, 0, SearchOptions::SEARCH_OPTION_NONE, None)
760             .map(|r| r == text.len())
761             .unwrap_or(false)
762     }
763 
    /// Find a Match in a String
765     ///
766     /// Finds the first match of the regular expression within the
767     /// buffer.
768     ///
769     /// Note that this should only be used if you want to discover the
770     /// position of the match within a string. Testing if a pattern
771     /// matches the whole string is faster if you use `is_match`.  For
772     /// more information see [Match vs
773     /// Search](index.html#match-vs-search)
774     ///
775     /// # Arguments
776     ///  * `text` - The text to search in.
777     ///
778     /// # Returns
779     ///
780     ///  The offset of the start and end of the first match. If no
781     ///  match exists `None` is returned.
find(&self, text: &str) -> Option<(usize, usize)>782     pub fn find(&self, text: &str) -> Option<(usize, usize)> {
783         self.find_with_encoding(text)
784     }
785 
786     /// Find a Match in a Buffer, With Encoding
787     ///
788     /// Finds the first match of the regular expression within the
789     /// buffer.
790     ///
791     /// For more information see [Match vs
792     /// Search](index.html#match-vs-search)
793     ///
794     /// # Arguments
795     ///  * `text` - The text to search in.
796     ///
797     /// # Returns
798     ///
799     ///  The offset of the start and end of the first match. If no
800     ///  match exists `None` is returned.
find_with_encoding<T>(&self, text: T) -> Option<(usize, usize)> where T: EncodedChars,801     pub fn find_with_encoding<T>(&self, text: T) -> Option<(usize, usize)>
802     where
803         T: EncodedChars,
804     {
805         let mut region = Region::new();
806         let len = text.len();
807         self.search_with_encoding(
808             text,
809             0,
810             len,
811             SearchOptions::SEARCH_OPTION_NONE,
812             Some(&mut region),
813         )
814         .and_then(|_| region.pos(0))
815     }
816 
817     /// Get the Encoding of the Regex
818     ///
819     /// # Returns
820     ///
821     /// Returns a reference to an oniguruma encoding which was used
822     /// when this regex was created.
encoding(&self) -> onig_sys::OnigEncoding823     pub fn encoding(&self) -> onig_sys::OnigEncoding {
824         unsafe { onig_sys::onig_get_encoding(self.raw) }
825     }
826 
827     /// Get the Number of Capture Groups in this Pattern
captures_len(&self) -> usize828     pub fn captures_len(&self) -> usize {
829         unsafe { onig_sys::onig_number_of_captures(self.raw) as usize }
830     }
831 
832     /// Get the Size of the Capture Histories for this Pattern
capture_histories_len(&self) -> usize833     pub fn capture_histories_len(&self) -> usize {
834         unsafe { onig_sys::onig_number_of_capture_histories(self.raw) as usize }
835     }
836 }
837 
838 impl Drop for Regex {
drop(&mut self)839     fn drop(&mut self) {
840         unsafe {
841             onig_sys::onig_free(self.raw);
842         }
843     }
844 }
845 
846 #[cfg(test)]
847 mod tests {
848     use super::*;
849     use std::panic;
850 
851     #[test]
test_regex_create()852     fn test_regex_create() {
853         Regex::with_options(".*", RegexOptions::REGEX_OPTION_NONE, Syntax::default()).unwrap();
854 
855         Regex::new(r#"a \w+ word"#).unwrap();
856     }
857 
858     #[test]
test_regex_invalid()859     fn test_regex_invalid() {
860         let e = Regex::new("\\p{foo}").unwrap_err();
861         assert_eq!(e.code(), -223);
862         assert_eq!(e.description(), "invalid character property name {foo}");
863     }
864 
865     #[test]
test_failed_match()866     fn test_failed_match() {
867         let regex = Regex::new("foo").unwrap();
868         let res = regex.match_with_options("bar", 0, SearchOptions::SEARCH_OPTION_NONE, None);
869         assert!(res.is_none());
870     }
871 
872     #[test]
test_regex_search_with_options()873     fn test_regex_search_with_options() {
874         let mut region = Region::new();
875         let regex = Regex::new("e(l+)").unwrap();
876 
877         let r = regex.search_with_options(
878             "hello",
879             0,
880             5,
881             SearchOptions::SEARCH_OPTION_NONE,
882             Some(&mut region),
883         );
884 
885         assert!(region.tree().is_none());
886         assert_eq!(r, Some(1));
887         assert_eq!(region.len(), 2);
888         let pos1 = region.pos(0).unwrap();
889         let pos2 = region.pos(1).unwrap();
890         assert_eq!(pos1, (1, 4));
891         assert_eq!(pos2, (2, 4));
892 
893         // test cloning here since we already have a filled region
894         let cloned_region = region.clone();
895         let pos1_clone = cloned_region.pos(0).unwrap();
896         assert_eq!(pos1_clone, pos1);
897     }
898 
899     #[test]
test_regex_match_with_options()900     fn test_regex_match_with_options() {
901         let mut region = Region::new();
902         let regex = Regex::new("he(l+)").unwrap();
903 
904         let r = regex.match_with_options(
905             "hello",
906             0,
907             SearchOptions::SEARCH_OPTION_NONE,
908             Some(&mut region),
909         );
910 
911         assert!(region.tree().is_none());
912         assert_eq!(r, Some(4));
913         assert_eq!(region.len(), 2);
914         let pos1 = region.pos(0).unwrap();
915         let pos2 = region.pos(1).unwrap();
916         assert_eq!(pos1, (0, 4));
917         assert_eq!(pos2, (2, 4));
918     }
919 
920     #[test]
test_regex_is_match()921     fn test_regex_is_match() {
922         let regex = Regex::new("he(l+)o").unwrap();
923         assert!(regex.is_match("hello"));
924         assert!(!regex.is_match("hello 2.0"));
925     }
926 
927     #[test]
test_regex_find()928     fn test_regex_find() {
929         let regex = Regex::new("he(l+)o").unwrap();
930         assert_eq!(regex.find("hey, hello!"), Some((5, 10)));
931         assert_eq!(regex.find("hey, honey!"), None);
932     }
933 
934     #[test]
test_regex_captures_len()935     fn test_regex_captures_len() {
936         let regex = Regex::new("(he)(l+)(o)").unwrap();
937         assert_eq!(regex.captures_len(), 3);
938     }
939 
940     #[test]
test_regex_error_is_match()941     fn test_regex_error_is_match() {
942         let regex = Regex::new("(a|b|ab)*bc").unwrap();
943         let result = regex.match_with_param(
944             "ababababababababababababababababababababababababababababacbc",
945             0,
946             SearchOptions::SEARCH_OPTION_NONE,
947             None,
948             MatchParam::default(),
949         );
950 
951         let e = result.err().unwrap();
952         assert_eq!("retry-limit-in-match over", e.description());
953     }
954 
955     #[test]
test_regex_panic_is_match()956     fn test_regex_panic_is_match() {
957         let regex = Regex::new("(a|b|ab)*bc").unwrap();
958         let result = panic::catch_unwind(|| {
959             regex.is_match("ababababababababababababababababababababababababababababacbc")
960         });
961         let e = result.err().unwrap();
962         let message = e.downcast_ref::<String>().unwrap();
963         assert_eq!(
964             message.as_str(),
965             "Onig: Regex match error: retry-limit-in-match over"
966         );
967     }
968 
969     #[test]
test_regex_error_find()970     fn test_regex_error_find() {
971         let regex = Regex::new("(a|b|ab)*bc").unwrap();
972         let s = "ababababababababababababababababababababababababababababacbc";
973         let result = regex.search_with_param(
974             s,
975             0,
976             s.len(),
977             SearchOptions::SEARCH_OPTION_NONE,
978             None,
979             MatchParam::default(),
980         );
981 
982         let e = result.err().unwrap();
983         assert_eq!("retry-limit-in-match over", e.description());
984     }
985 
986     #[test]
test_regex_panic_find()987     fn test_regex_panic_find() {
988         let regex = Regex::new("(a|b|ab)*bc").unwrap();
989         let result = panic::catch_unwind(|| {
990             regex.find("ababababababababababababababababababababababababababababacbc")
991         });
992         let e = result.err().unwrap();
993         let message = e.downcast_ref::<String>().unwrap();
994         assert_eq!(
995             message.as_str(),
996             "Onig: Regex search error: retry-limit-in-match over"
997         );
998     }
999 
1000     #[test]
test_search_with_invalid_range()1001     fn test_search_with_invalid_range() {
1002         let regex = Regex::with_options("R...", RegexOptions::REGEX_OPTION_NONE, Syntax::default())
1003             .expect("regex");
1004         let string = "Ruby";
1005         let is_match = regex.search_with_param(
1006             string,
1007             5,
1008             string.len(),
1009             SearchOptions::SEARCH_OPTION_NONE,
1010             None,
1011             MatchParam::default(),
1012         );
1013         assert!(is_match.is_err());
1014 
1015         let is_match = regex.search_with_param(
1016             string,
1017             2,
1018             string.len() + 1,
1019             SearchOptions::SEARCH_OPTION_NONE,
1020             None,
1021             MatchParam::default(),
1022         );
1023         assert!(is_match.is_err());
1024     }
1025 
1026     #[test]
test_search_with_invalid_range_panic()1027     fn test_search_with_invalid_range_panic() {
1028         let regex = Regex::with_options("R...", RegexOptions::REGEX_OPTION_NONE, Syntax::default())
1029             .expect("regex");
1030         let string = "Ruby";
1031         let is_match = panic::catch_unwind(|| regex.search_with_encoding(
1032             string,
1033             5,
1034             string.len(),
1035             SearchOptions::SEARCH_OPTION_NONE,
1036             None,
1037         ));
1038         assert!(is_match.is_err());
1039     }
1040 
1041     #[test]
test_match_with_invalid_range()1042     fn test_match_with_invalid_range() {
1043         let regex = Regex::with_options("R...", RegexOptions::REGEX_OPTION_NONE, Syntax::default())
1044             .expect("regex");
1045         let string = "Ruby";
1046         let is_match = regex.match_with_param(
1047             string,
1048             5,
1049             SearchOptions::SEARCH_OPTION_NONE,
1050             None,
1051             MatchParam::default(),
1052         );
1053         assert!(is_match.is_err());
1054     }
1055 
1056     #[test]
test_match_with_invalid_range_panic()1057     fn test_match_with_invalid_range_panic() {
1058         let regex = Regex::with_options("R...", RegexOptions::REGEX_OPTION_NONE, Syntax::default())
1059             .expect("regex");
1060         let string = "Ruby";
1061         let is_match = panic::catch_unwind(|| regex.match_with_encoding(
1062             string,
1063             5,
1064             SearchOptions::SEARCH_OPTION_NONE,
1065             None,
1066         ));
1067         assert!(is_match.is_err());
1068     }
1069 }
1070