1 //! This crate provides a safe wrapper around the
2 //! [Oniguruma](https://github.com/kkos/oniguruma) regular expression library.
3 //!
4 //! # Examples
5 //!
6 //! ```rust
7 //! use onig::Regex;
8 //!
9 //! let regex = Regex::new("e(l+)").unwrap();
10 //! for (i, pos) in regex.captures("hello").unwrap().iter_pos().enumerate() {
11 //!     match pos {
12 //!          Some((beg, end)) =>
13 //!              println!("Group {} captured in position {}:{}", i, beg, end),
14 //!          None =>
15 //!              println!("Group {} is not captured", i)
16 //!     }
17 //! }
18 //! ```
19 //!
20 //! # Match vs Search
21 //!
//! There are two basic things you can do with a `Regex` pattern: test
//! if the pattern matches the whole of a given string, and search for
//! occurrences of the pattern within a string. Oniguruma exposes these
//! two concepts with the *match* and *search* APIs.
26 //!
//! In addition to these two base Oniguruma APIs this crate exposes a
//! third *find* API, built on top of the *search* API.
29 //!
30 //! ```
31 //! # use onig::Regex;
32 //! let pattern = Regex::new("hello").unwrap();
33 //! assert_eq!(true, pattern.find("hello world").is_some());
34 //! assert_eq!(false, pattern.is_match("hello world"));
35 //! ```
36 //!
37 //! ## The *Match* API
38 //!
//! Functions in the match API check if a pattern matches the entire
//! string. The simplest of these is `Regex::is_match`. This returns
//! `true` if the pattern matches the string. For more complex usage
//! `Regex::match_with_options` and `Regex::match_with_encoding`
//! can be used. These allow the capture groups to be inspected,
//! matching with different options, and matching sub-sections of a
//! given text.
46 //!
47 //! ## The *Search* API
48 //!
//! Functions in the search API search for a pattern anywhere within a
//! string. The simplest of these is `Regex::find`. This returns the
//! offset of the first occurrence of the pattern within the string.
//! For more complex usage `Regex::search_with_options` and
//! `Regex::search_with_encoding` can be used. These allow capture
//! groups to be inspected, searching with different options and
//! searching within subsections of a given text.
56 //!
57 //! ## The *Find* API
58 //!
59 //! The find API is built on top of the search API. Functions in this
60 //! API allow iteration across all matches of the pattern within a
61 //! string, not just the first one. The functions deal with some of
62 //! the complexities of this, such as zero-length matches.
63 //!
64 //! The simplest step-up from the basic search API `Regex::find` is
65 //! getting the captures relating to a match with the
66 //! `Regex::captures` method. To find capture information for all
67 //! matches within a string `Regex::find_iter` and
68 //! `Regex::captures_iter` can be used. The former exposes the start
69 //! and end of the match as `Regex::find` does, the latter exposes the
70 //! whole capture group information as `Regex::captures` does.
71 //!
72 //! # The `std::pattern` API
73 //!
74 //! In addition to the main Oniguruma API it is possible to use the
75 //! `Regex` object with the
76 //! [`std::pattern`](https://doc.rust-lang.org/std/str/pattern/)
77 //! API. To enable support compile with the `std-pattern` feature. If
78 //! you're using Cargo you can do this by adding the following to your
79 //! Cargo.toml:
80 //!
81 //! ```toml
82 //! [dependencies.onig]
83 //! version = "1.2"
84 //! features = ["std-pattern"]
85 //! ```
86 
87 #![cfg_attr(not(feature = "cargo-clippy"), allow(unknown_lints))]
88 #![cfg_attr(feature = "std-pattern", feature(pattern))]
89 #![deny(missing_docs)]
90 
91 #[macro_use]
92 extern crate bitflags;
93 #[macro_use]
94 extern crate lazy_static;
95 #[cfg(windows)]
96 extern crate libc;
97 extern crate onig_sys;
98 
99 mod buffers;
100 mod find;
101 mod flags;
102 mod match_param;
103 mod names;
104 mod region;
105 mod replace;
106 mod syntax;
107 mod tree;
108 mod utils;
109 
110 #[cfg(feature = "std-pattern")]
111 mod pattern;
112 
// re-export the onig types publicly
114 pub use buffers::{EncodedBytes, EncodedChars};
115 pub use find::{
116     Captures, FindCaptures, FindMatches, RegexSplits, RegexSplitsN, SubCaptures, SubCapturesPos,
117 };
118 pub use flags::*;
119 pub use match_param::MatchParam;
120 pub use region::Region;
121 pub use replace::Replacer;
122 pub use syntax::{MetaChar, Syntax};
123 pub use tree::{CaptureTreeNode, CaptureTreeNodeIter};
124 pub use utils::{copyright, define_user_property, version};
125 
126 use std::os::raw::c_int;
127 use std::ptr::{null, null_mut};
128 use std::sync::Mutex;
129 use std::{error, fmt, str};
130 
/// Origin of an [`Error`]: either a status code reported by Oniguruma or a
/// failure detected by this wrapper itself.
#[derive(Debug)]
enum ErrorData {
    /// Error reported by Oniguruma; carries the raw status code.
    OnigError(c_int),
    /// Error raised by this crate; there is no Oniguruma code for it.
    Custom,
}
136 
/// This structure represents an error from the underlying Oniguruma library,
/// or an error raised by this wrapper itself (for example an encoding
/// mismatch between a regex and the text it is applied to).
pub struct Error {
    // Where the error came from, including the Oniguruma code when there is one.
    data: ErrorData,
    // Human-readable message; returned by `description()` and used by `Display`.
    description: String,
}
142 
/// This struct is a wrapper around an Oniguruma regular expression
/// pointer. This represents a compiled regex which can be used in
/// search and match operations.
///
/// Note that the derived `PartialEq`/`Eq` compare the underlying raw
/// handle, not the pattern text.
#[derive(Debug, Eq, PartialEq)]
pub struct Regex {
    // Raw handle filled in by `onig_new`; released with `onig_free` when the
    // `Regex` is dropped.
    raw: onig_sys::OnigRegex,
}
150 
// NOTE(review): these impls assert that a compiled regex handle may be moved
// to, and shared between, threads. Presumably this relies on Oniguruma not
// mutating a regex object during match/search -- TODO confirm against the
// Oniguruma documentation.
unsafe impl Send for Regex {}
unsafe impl Sync for Regex {}
153 
154 impl Error {
from_code_and_info(code: c_int, info: &onig_sys::OnigErrorInfo) -> Self155     fn from_code_and_info(code: c_int, info: &onig_sys::OnigErrorInfo) -> Self {
156         Error::new(code, info)
157     }
158 
from_code(code: c_int) -> Self159     fn from_code(code: c_int) -> Self {
160         Error::new(code, null())
161     }
162 
custom<T: Into<String>>(message: T) -> Self163     fn custom<T: Into<String>>(message: T) -> Self {
164         Error {
165             data: ErrorData::Custom,
166             description: message.into(),
167         }
168     }
169 
new(code: c_int, info: *const onig_sys::OnigErrorInfo) -> Self170     fn new(code: c_int, info: *const onig_sys::OnigErrorInfo) -> Self {
171         let buff = &mut [0; onig_sys::ONIG_MAX_ERROR_MESSAGE_LEN as usize];
172         let len = unsafe { onig_sys::onig_error_code_to_str(buff.as_mut_ptr(), code, info) };
173         let description = if let Ok(description) = str::from_utf8(&buff[..len as usize]) {
174             description
175         } else {
176             return Self::custom("Onig error string was invalid UTF-8");
177         };
178         Error {
179             data: ErrorData::OnigError(code),
180             description: description.to_owned(),
181         }
182     }
183 
184     /// Return Oniguruma engine error code.
code(&self) -> i32185     pub fn code(&self) -> i32 {
186         match self.data {
187             ErrorData::OnigError(code) => code,
188             _ => -1,
189         }
190     }
191 
192     /// Return error description provided by Oniguruma engine.
description(&self) -> &str193     pub fn description(&self) -> &str {
194         &self.description
195     }
196 }
197 
198 impl error::Error for Error {
description(&self) -> &str199     fn description(&self) -> &str {
200         &self.description
201     }
202 }
203 
204 impl fmt::Display for Error {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result205     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
206         write!(f, "Oniguruma error: {}", self.description())
207     }
208 }
209 
210 impl fmt::Debug for Error {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result211     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
212         write!(f, "Error({:?}, {})", self.data, self.description())
213     }
214 }
215 
lazy_static! {
    // Process-wide lock held around `onig_new` so that regex compilation is
    // never entered by more than one thread at a time.
    static ref REGEX_NEW_MUTEX: Mutex<()> = Mutex::new(());
}
219 
220 impl Regex {
221     /// Create a Regex
222     ///
223     /// Simple regular expression constructor. Compiles a new regular
224     /// expression with the default options using the ruby syntax.
225     /// Once compiled, it can be used repeatedly to search in a string. If an
226     /// invalid expression is given, then an error is returned.
227     ///
228     /// # Arguments
229     ///
230     /// * `pattern` - The regex pattern to compile
231     ///
232     /// # Examples
233     ///
234     /// ```
235     /// use onig::Regex;
236     /// let r = Regex::new(r#"hello (\w+)"#);
237     /// assert!(r.is_ok());
238     /// ```
new(pattern: &str) -> Result<Self, Error>239     pub fn new(pattern: &str) -> Result<Self, Error> {
240         Regex::with_encoding(pattern)
241     }
242 
243     /// Create a Regex, Specifying an Encoding
244     ///
245     /// Attempts to compile `pattern` into a new `Regex`
246     /// instance. Instead of assuming UTF-8 as the encoding scheme the
247     /// encoding is inferred from the `pattern` buffer.
248     ///
249     /// # Arguments
250     ///
251     /// * `pattern` - The regex pattern to compile
252     ///
253     /// # Examples
254     ///
255     /// ```
256     /// use onig::{Regex, EncodedBytes};
257     /// let utf8 = Regex::with_encoding("hello");
258     /// assert!(utf8.is_ok());
259     /// let ascii = Regex::with_encoding(EncodedBytes::ascii(b"world"));
260     /// assert!(ascii.is_ok());
261     /// ```
with_encoding<T>(pattern: T) -> Result<Regex, Error> where T: EncodedChars,262     pub fn with_encoding<T>(pattern: T) -> Result<Regex, Error>
263     where
264         T: EncodedChars,
265     {
266         Regex::with_options_and_encoding(
267             pattern,
268             RegexOptions::REGEX_OPTION_NONE,
269             Syntax::default(),
270         )
271     }
272 
273     /// Create a new Regex
274     ///
275     /// Attempts to compile a pattern into a new `Regex` instance.
276     /// Once compiled, it can be used repeatedly to search in a string. If an
277     /// invalid expression is given, then an error is returned.
278     /// See [`onig_sys::onig_new`][regex_new] for more information.
279     ///
280     /// # Arguments
281     ///
282     ///  * `pattern` - The regex pattern to compile.
283     ///  * `options` - The regex compilation options.
284     ///  * `syntax`  - The syntax which the regex is written in.
285     ///
286     /// # Examples
287     ///
288     /// ```
289     /// use onig::{Regex, Syntax, RegexOptions};
290     /// let r = Regex::with_options("hello.*world",
291     ///                             RegexOptions::REGEX_OPTION_NONE,
292     ///                             Syntax::default());
293     /// assert!(r.is_ok());
294     /// ```
295     ///
296     /// [regex_new]: ./onig_sys/fn.onig_new.html
with_options( pattern: &str, option: RegexOptions, syntax: &Syntax, ) -> Result<Regex, Error>297     pub fn with_options(
298         pattern: &str,
299         option: RegexOptions,
300         syntax: &Syntax,
301     ) -> Result<Regex, Error> {
302         Regex::with_options_and_encoding(pattern, option, syntax)
303     }
304 
305     /// Create a new Regex, Specifying Options and Ecoding
306     ///
307     /// Attempts to comile the given `pattern` into a new `Regex`
308     /// instance. Instead of assuming UTF-8 as the encoding scheme the
309     /// encoding is inferred from the `pattern` buffer. If the regex
310     /// fails to compile the returned `Error` value from
311     /// [`onig_new`][regex_new] contains more information.
312     ///
313     /// [regex_new]: ./onig_sys/fn.onig_new.html
314     ///
315     /// # Arguments
316     ///
317     ///  * `pattern` - The regex pattern to compile.
318     ///  * `options` - The regex compilation options.
319     ///  * `syntax`  - The syntax which the regex is written in.
320     ///
321     /// # Examples
322     /// ```
323     /// use onig::{Regex, Syntax, EncodedBytes, RegexOptions};
324     /// let pattern = EncodedBytes::ascii(b"hello");
325     /// let r = Regex::with_options_and_encoding(pattern,
326     ///                                          RegexOptions::REGEX_OPTION_SINGLELINE,
327     ///                                          Syntax::default());
328     /// assert!(r.is_ok());
329     /// ```
with_options_and_encoding<T>( pattern: T, option: RegexOptions, syntax: &Syntax, ) -> Result<Self, Error> where T: EncodedChars,330     pub fn with_options_and_encoding<T>(
331         pattern: T,
332         option: RegexOptions,
333         syntax: &Syntax,
334     ) -> Result<Self, Error>
335     where
336         T: EncodedChars,
337     {
338         // Convert the rust types to those required for the call to
339         // `onig_new`.
340         let mut reg: onig_sys::OnigRegex = null_mut();
341         let reg_ptr = &mut reg as *mut onig_sys::OnigRegex;
342 
343         // We can use this later to get an error message to pass back
344         // if regex creation fails.
345         let mut error = onig_sys::OnigErrorInfo {
346             enc: null_mut(),
347             par: null_mut(),
348             par_end: null_mut(),
349         };
350 
351         let err = unsafe {
352             // Grab a lock to make sure that `onig_new` isn't called by
353             // more than one thread at a time.
354             let _guard = REGEX_NEW_MUTEX.lock().unwrap();
355             onig_sys::onig_new(
356                 reg_ptr,
357                 pattern.start_ptr(),
358                 pattern.limit_ptr(),
359                 option.bits(),
360                 pattern.encoding(),
361                 syntax as *const Syntax as *mut Syntax as *mut onig_sys::OnigSyntaxType,
362                 &mut error,
363             )
364         };
365 
366         if err == onig_sys::ONIG_NORMAL as i32 {
367             Ok(Regex { raw: reg })
368         } else {
369             Err(Error::from_code_and_info(err, &error))
370         }
371     }
372 
    /// Match String
    ///
    /// Try to match the regex against the given string slice,
    /// starting at a given offset. This method simply forwards to
    /// `match_with_encoding` with a `&str` haystack.
    ///
    /// For more information see [Match vs
    /// Search](index.html#match-vs-search)
    ///
    /// # Arguments
    ///
    /// * `str` - The string slice to match against.
    /// * `at` - The byte index in the passed slice to start matching
    /// * `options` - The regex match options.
    /// * `region` - The region for return group match range info
    ///
    /// # Returns
    ///
    /// `Some(len)` if the regex matched, with `len` being the number
    /// of bytes matched. `None` if the regex doesn't match.
    ///
    /// # Panics
    ///
    /// Panics if the underlying match reports an error, exactly as
    /// `match_with_encoding` does.
    ///
    /// # Examples
    ///
    /// ```
    /// use onig::{Regex, SearchOptions};
    ///
    /// let r = Regex::new(".*").unwrap();
    /// let res = r.match_with_options("hello", 0, SearchOptions::SEARCH_OPTION_NONE, None);
    /// assert!(res.is_some()); // it matches
    /// assert!(res.unwrap() == 5); // 5 characters matched
    /// ```
    pub fn match_with_options(
        &self,
        str: &str,
        at: usize,
        options: SearchOptions,
        region: Option<&mut Region>,
    ) -> Option<usize> {
        self.match_with_encoding(str, at, options, region)
    }
413 
414     /// Match String with Encoding
415     ///
416     /// Match the regex against a string. This method will start at
417     /// the offset `at` into the string and try and match the
418     /// regex. If the regex matches then the return value is the
419     /// number of characters which matched. If the regex doesn't match
420     /// the return is `None`.
421     ///
422     /// For more information see [Match vs
423     /// Search](index.html#match-vs-search)
424     ///
425     /// The contents of `chars` must have the same encoding that was
426     /// used to construct the regex.
427     ///
428     /// # Arguments
429     ///
430     /// * `chars` - The buffer to match against.
431     /// * `at` - The byte index in the passed buffer to start matching
432     /// * `options` - The regex match options.
433     /// * `region` - The region for return group match range info
434     ///
435     /// # Returns
436     ///
437     /// `Some(len)` if the regex matched, with `len` being the number
438     /// of bytes matched. `None` if the regex doesn't match.
439     ///
440     /// # Examples
441     ///
442     /// ```
443     /// use onig::{Regex, EncodedBytes, SearchOptions};
444     ///
445     /// let r = Regex::with_encoding(EncodedBytes::ascii(b".*")).unwrap();
446     /// let res = r.match_with_encoding(EncodedBytes::ascii(b"world"),
447     ///                                 0, SearchOptions::SEARCH_OPTION_NONE, None);
448     /// assert!(res.is_some()); // it matches
449     /// assert!(res.unwrap() == 5); // 5 characters matched
450     /// ```
match_with_encoding<T>( &self, chars: T, at: usize, options: SearchOptions, region: Option<&mut Region>, ) -> Option<usize> where T: EncodedChars,451     pub fn match_with_encoding<T>(
452         &self,
453         chars: T,
454         at: usize,
455         options: SearchOptions,
456         region: Option<&mut Region>,
457     ) -> Option<usize>
458     where
459         T: EncodedChars,
460     {
461         let match_param = MatchParam::default();
462         let result = self.match_with_param(chars, at, options, region, match_param);
463 
464         match result {
465             Ok(r) => r,
466             Err(e) => panic!("Onig: Regex match error: {}", e.description()),
467         }
468     }
469 
470     /// Match string with encoding and match param
471     ///
472     /// Match the regex against a string. This method will start at
473     /// the offset `at` into the string and try and match the
474     /// regex. If the regex matches then the return value is the
475     /// number of characters which matched. If the regex doesn't match
476     /// the return is `None`.
477     ///
478     /// For more information see [Match vs
479     /// Search](index.html#match-vs-search)
480     ///
481     /// The contents of `chars` must have the same encoding that was
482     /// used to construct the regex.
483     ///
484     /// # Arguments
485     ///
486     /// * `chars` - The buffer to match against.
487     /// * `at` - The byte index in the passed buffer to start matching
488     /// * `options` - The regex match options.
489     /// * `region` - The region for return group match range info
490     /// * `match_param` - The match parameters
491     ///
492     /// # Returns
493     ///
494     /// `Ok(Some(len))` if the regex matched, with `len` being the number
495     /// of bytes matched. `Ok(None)` if the regex doesn't match. `Err` with an
496     /// `Error` if an error occurred (e.g. retry-limit-in-match exceeded).
497     ///
498     /// # Examples
499     ///
500     /// ```
501     /// use onig::{Regex, EncodedBytes, MatchParam, SearchOptions};
502     ///
503     /// let r = Regex::with_encoding(EncodedBytes::ascii(b".*")).unwrap();
504     /// let res = r.match_with_param(EncodedBytes::ascii(b"world"),
505     ///                              0, SearchOptions::SEARCH_OPTION_NONE,
506     ///                              None, MatchParam::default());
507     /// assert!(res.is_ok()); // matching did not error
508     /// assert!(res.unwrap() == Some(5)); // 5 characters matched
509     /// ```
match_with_param<T>( &self, chars: T, at: usize, options: SearchOptions, region: Option<&mut Region>, match_param: MatchParam, ) -> Result<Option<usize>, Error> where T: EncodedChars,510     pub fn match_with_param<T>(
511         &self,
512         chars: T,
513         at: usize,
514         options: SearchOptions,
515         region: Option<&mut Region>,
516         match_param: MatchParam,
517     ) -> Result<Option<usize>, Error>
518     where
519         T: EncodedChars,
520     {
521         if chars.encoding() != self.encoding() {
522             return Err(Error::custom(format!(
523                 "Regex encoding does not match haystack encoding ({0:?}, {1:?})",
524                 chars.encoding(),
525                 self.encoding()
526             )));
527         }
528         let r = unsafe {
529             let offset = chars.start_ptr().add(at);
530             if offset > chars.limit_ptr() {
531                 return Err(Error::custom(format!("Offset {} is too large", at)));
532             }
533             onig_sys::onig_match_with_param(
534                 self.raw,
535                 chars.start_ptr(),
536                 chars.limit_ptr(),
537                 offset,
538                 match region {
539                     Some(region) => region as *mut Region as *mut onig_sys::OnigRegion,
540                     None => std::ptr::null_mut(),
541                 },
542                 options.bits(),
543                 match_param.as_raw(),
544             )
545         };
546 
547         if r >= 0 {
548             Ok(Some(r as usize))
549         } else if r == onig_sys::ONIG_MISMATCH {
550             Ok(None)
551         } else {
552             Err(Error::from_code(r))
553         }
554     }
555 
    /// Search pattern in string
    ///
    /// Search for matches of the regex in a string. This method will return
    /// the index of the first match of the regex within the string, if
    /// there is one. If `from` is less than `to`, then search is performed
    /// in forward order, otherwise – in backward order.
    ///
    /// This method simply forwards to `search_with_encoding` with a `&str`
    /// haystack.
    ///
    /// For more information see [Match vs
    /// Search](index.html#match-vs-search)
    ///
    /// # Arguments
    ///
    ///  * `str` - The string to search in.
    ///  * `from` - The byte index in the passed slice to start search
    ///  * `to` - The byte index in the passed slice to finish search
    ///  * `options` - The options for the search.
    ///  * `region` - The region for return group match range info
    ///
    /// # Returns
    ///
    /// `Some(pos)` if the regex matches, where `pos` is the
    /// byte-position of the start of the match. `None` if the regex
    /// doesn't match anywhere in `str`.
    ///
    /// # Panics
    ///
    /// Panics if the underlying search reports an error, exactly as
    /// `search_with_encoding` does.
    ///
    /// # Examples
    ///
    /// ```
    /// use onig::{Regex, SearchOptions};
    ///
    /// let r = Regex::new("l{1,2}").unwrap();
    /// let res = r.search_with_options("hello", 0, 5, SearchOptions::SEARCH_OPTION_NONE, None);
    /// assert!(res.is_some()); // it matches
    /// assert!(res.unwrap() == 2); // match starts at character 3
    /// ```
    pub fn search_with_options(
        &self,
        str: &str,
        from: usize,
        to: usize,
        options: SearchOptions,
        region: Option<&mut Region>,
    ) -> Option<usize> {
        self.search_with_encoding(str, from, to, options, region)
    }
600 
601     /// Search for a Pattern in a String with an Encoding
602     ///
603     /// Search for matches the regex in a string. This method will
604     /// return the index of the first match of the regex within the
605     /// string, if there is one. If `from` is less than `to`, then
606     /// search is performed in forward order, otherwise – in backward
607     /// order.
608     ///
609     /// For more information see [Match vs
610     /// Search](index.html#match-vs-search)
611     ///
612     /// The encoding of the buffer passed to search in must match the
613     /// encoding of the regex.
614     ///
615     /// # Arguments
616     ///
617     ///  * `chars` - The character buffer to search in.
618     ///  * `from` - The byte index in the passed slice to start search
619     ///  * `to` - The byte index in the passed slice to finish search
620     ///  * `options` - The options for the search.
621     ///  * `region` - The region for return group match range info
622     ///
623     /// # Returns
624     ///
625     /// `Some(pos)` if the regex matches, where `pos` is the
626     /// byte-position of the start of the match. `None` if the regex
627     /// doesn't match anywhere in `chars`.
628     ///
629     /// # Examples
630     ///
631     /// ```
632     /// use onig::{Regex, EncodedBytes, SearchOptions};
633     ///
634     /// let r = Regex::with_encoding(EncodedBytes::ascii(b"l{1,2}")).unwrap();
635     /// let res = r.search_with_encoding(EncodedBytes::ascii(b"hello"),
636     ///                                  0, 5, SearchOptions::SEARCH_OPTION_NONE, None);
637     /// assert!(res.is_some()); // it matches
638     /// assert!(res.unwrap() == 2); // match starts at character 3
639     /// ```
search_with_encoding<T>( &self, chars: T, from: usize, to: usize, options: SearchOptions, region: Option<&mut Region>, ) -> Option<usize> where T: EncodedChars,640     pub fn search_with_encoding<T>(
641         &self,
642         chars: T,
643         from: usize,
644         to: usize,
645         options: SearchOptions,
646         region: Option<&mut Region>,
647     ) -> Option<usize>
648     where
649         T: EncodedChars,
650     {
651         let match_param = MatchParam::default();
652         let result = self.search_with_param(chars, from, to, options, region, match_param);
653 
654         match result {
655             Ok(r) => r,
656             Err(e) => panic!("Onig: Regex search error: {}", e.description()),
657         }
658     }
659 
660     /// Search pattern in string with encoding and match param
661     ///
662     /// Search for matches the regex in a string. This method will
663     /// return the index of the first match of the regex within the
664     /// string, if there is one. If `from` is less than `to`, then
665     /// search is performed in forward order, otherwise – in backward
666     /// order.
667     ///
668     /// For more information see [Match vs
669     /// Search](index.html#match-vs-search)
670     ///
671     /// The encoding of the buffer passed to search in must match the
672     /// encoding of the regex.
673     ///
674     /// # Arguments
675     ///
676     ///  * `chars` - The character buffer to search in.
677     ///  * `from` - The byte index in the passed slice to start search
678     ///  * `to` - The byte index in the passed slice to finish search
679     ///  * `options` - The options for the search.
680     ///  * `region` - The region for return group match range info
681     ///  * `match_param` - The match parameters
682     ///
683     /// # Returns
684     ///
685     /// `Ok(Some(pos))` if the regex matches, where `pos` is the
686     /// byte-position of the start of the match. `Ok(None)` if the regex
687     /// doesn't match anywhere in `chars`. `Err` with an `Error` if an error
688     /// occurred (e.g. retry-limit-in-match exceeded).
689     ///
690     /// # Examples
691     ///
692     /// ```
693     /// use onig::{Regex, EncodedBytes, MatchParam, SearchOptions};
694     ///
695     /// let r = Regex::with_encoding(EncodedBytes::ascii(b"l{1,2}")).unwrap();
696     /// let res = r.search_with_param(EncodedBytes::ascii(b"hello"),
697     ///                               0, 5, SearchOptions::SEARCH_OPTION_NONE,
698     ///                               None, MatchParam::default());
699     /// assert!(res.is_ok()); // matching did not error
700     /// assert!(res.unwrap() == Some(2)); // match starts at character 3
701     /// ```
search_with_param<T>( &self, chars: T, from: usize, to: usize, options: SearchOptions, region: Option<&mut Region>, match_param: MatchParam, ) -> Result<Option<usize>, Error> where T: EncodedChars,702     pub fn search_with_param<T>(
703         &self,
704         chars: T,
705         from: usize,
706         to: usize,
707         options: SearchOptions,
708         region: Option<&mut Region>,
709         match_param: MatchParam,
710     ) -> Result<Option<usize>, Error>
711     where
712         T: EncodedChars,
713     {
714         let (beg, end) = (chars.start_ptr(), chars.limit_ptr());
715         if chars.encoding() != self.encoding() {
716             return Err(Error::custom(format!(
717                 "Regex encoding does not match haystack encoding ({0:?}, {1:?})",
718                 chars.encoding(),
719                 self.encoding()
720             )));
721         }
722         let r = unsafe {
723             let start = beg.add(from);
724             let range = beg.add(to);
725             if start > end {
726                 return Err(Error::custom("Start of match should be before end"));
727             }
728             if range > end {
729                 return Err(Error::custom("Limit of match should be before end"));
730             }
731             onig_sys::onig_search_with_param(
732                 self.raw,
733                 beg,
734                 end,
735                 start,
736                 range,
737                 match region {
738                     Some(region) => region as *mut Region as *mut onig_sys::OnigRegion,
739                     None => std::ptr::null_mut(),
740                 },
741                 options.bits(),
742                 match_param.as_raw(),
743             )
744         };
745 
746         if r >= 0 {
747             Ok(Some(r as usize))
748         } else if r == onig_sys::ONIG_MISMATCH {
749             Ok(None)
750         } else {
751             Err(Error::from_code(r))
752         }
753     }
754 
755     /// Returns true if and only if the regex matches the string given.
756     ///
757     /// For more information see [Match vs
758     /// Search](index.html#match-vs-search)
759     ///
760     /// # Arguments
761     ///  * `text` - The string slice to test against the pattern.
762     ///
763     /// # Returns
764     ///
765     /// `true` if the pattern matches the whole of `text`, `false` otherwise.
is_match(&self, text: &str) -> bool766     pub fn is_match(&self, text: &str) -> bool {
767         self.match_with_options(text, 0, SearchOptions::SEARCH_OPTION_NONE, None)
768             .map(|r| r == text.len())
769             .unwrap_or(false)
770     }
771 
    /// Find a Match in a String
    ///
    /// Finds the first match of the regular expression within the
    /// string. This simply forwards to `find_with_encoding` with a
    /// `&str` haystack.
    ///
    /// Note that this should only be used if you want to discover the
    /// position of the match within a string. Testing if a pattern
    /// matches the whole string is faster if you use `is_match`.  For
    /// more information see [Match vs
    /// Search](index.html#match-vs-search)
    ///
    /// # Arguments
    ///  * `text` - The text to search in.
    ///
    /// # Returns
    ///
    ///  The offset of the start and end of the first match. If no
    ///  match exists `None` is returned.
    pub fn find(&self, text: &str) -> Option<(usize, usize)> {
        self.find_with_encoding(text)
    }
793 
794     /// Find a Match in a Buffer, With Encoding
795     ///
796     /// Finds the first match of the regular expression within the
797     /// buffer.
798     ///
799     /// For more information see [Match vs
800     /// Search](index.html#match-vs-search)
801     ///
802     /// # Arguments
803     ///  * `text` - The text to search in.
804     ///
805     /// # Returns
806     ///
807     ///  The offset of the start and end of the first match. If no
808     ///  match exists `None` is returned.
find_with_encoding<T>(&self, text: T) -> Option<(usize, usize)> where T: EncodedChars,809     pub fn find_with_encoding<T>(&self, text: T) -> Option<(usize, usize)>
810     where
811         T: EncodedChars,
812     {
813         let mut region = Region::new();
814         let len = text.len();
815         self.search_with_encoding(
816             text,
817             0,
818             len,
819             SearchOptions::SEARCH_OPTION_NONE,
820             Some(&mut region),
821         )
822         .and_then(|_| region.pos(0))
823     }
824 
825     /// Get the Encoding of the Regex
826     ///
827     /// # Returns
828     ///
829     /// Returns a reference to an oniguruma encoding which was used
830     /// when this regex was created.
encoding(&self) -> onig_sys::OnigEncoding831     pub fn encoding(&self) -> onig_sys::OnigEncoding {
832         unsafe { onig_sys::onig_get_encoding(self.raw) }
833     }
834 
835     /// Get the Number of Capture Groups in this Pattern
captures_len(&self) -> usize836     pub fn captures_len(&self) -> usize {
837         unsafe { onig_sys::onig_number_of_captures(self.raw) as usize }
838     }
839 
840     /// Get the Size of the Capture Histories for this Pattern
capture_histories_len(&self) -> usize841     pub fn capture_histories_len(&self) -> usize {
842         unsafe { onig_sys::onig_number_of_capture_histories(self.raw) as usize }
843     }
844 }
845 
846 impl Drop for Regex {
drop(&mut self)847     fn drop(&mut self) {
848         unsafe {
849             onig_sys::onig_free(self.raw);
850         }
851     }
852 }
853 
854 #[cfg(test)]
855 mod tests {
856     use super::*;
857     use std::panic;
858 
859     #[test]
test_regex_create()860     fn test_regex_create() {
861         Regex::with_options(".*", RegexOptions::REGEX_OPTION_NONE, Syntax::default()).unwrap();
862 
863         Regex::new(r#"a \w+ word"#).unwrap();
864     }
865 
866     #[test]
test_regex_invalid()867     fn test_regex_invalid() {
868         let e = Regex::new("\\p{foo}").unwrap_err();
869         assert_eq!(e.code(), -223);
870         assert_eq!(e.description(), "invalid character property name {foo}");
871     }
872 
873     #[test]
test_failed_match()874     fn test_failed_match() {
875         let regex = Regex::new("foo").unwrap();
876         let res = regex.match_with_options("bar", 0, SearchOptions::SEARCH_OPTION_NONE, None);
877         assert!(res.is_none());
878     }
879 
880     #[test]
test_regex_search_with_options()881     fn test_regex_search_with_options() {
882         let mut region = Region::new();
883         let regex = Regex::new("e(l+)").unwrap();
884 
885         let r = regex.search_with_options(
886             "hello",
887             0,
888             5,
889             SearchOptions::SEARCH_OPTION_NONE,
890             Some(&mut region),
891         );
892 
893         assert!(region.tree().is_none());
894         assert_eq!(r, Some(1));
895         assert_eq!(region.len(), 2);
896         let pos1 = region.pos(0).unwrap();
897         let pos2 = region.pos(1).unwrap();
898         assert_eq!(pos1, (1, 4));
899         assert_eq!(pos2, (2, 4));
900 
901         // test cloning here since we already have a filled region
902         let cloned_region = region.clone();
903         let pos1_clone = cloned_region.pos(0).unwrap();
904         assert_eq!(pos1_clone, pos1);
905     }
906 
907     #[test]
test_regex_match_with_options()908     fn test_regex_match_with_options() {
909         let mut region = Region::new();
910         let regex = Regex::new("he(l+)").unwrap();
911 
912         let r = regex.match_with_options(
913             "hello",
914             0,
915             SearchOptions::SEARCH_OPTION_NONE,
916             Some(&mut region),
917         );
918 
919         assert!(region.tree().is_none());
920         assert_eq!(r, Some(4));
921         assert_eq!(region.len(), 2);
922         let pos1 = region.pos(0).unwrap();
923         let pos2 = region.pos(1).unwrap();
924         assert_eq!(pos1, (0, 4));
925         assert_eq!(pos2, (2, 4));
926     }
927 
928     #[test]
test_regex_is_match()929     fn test_regex_is_match() {
930         let regex = Regex::new("he(l+)o").unwrap();
931         assert!(regex.is_match("hello"));
932         assert!(!regex.is_match("hello 2.0"));
933     }
934 
935     #[test]
test_regex_find()936     fn test_regex_find() {
937         let regex = Regex::new("he(l+)o").unwrap();
938         assert_eq!(regex.find("hey, hello!"), Some((5, 10)));
939         assert_eq!(regex.find("hey, honey!"), None);
940     }
941 
942     #[test]
test_regex_captures_len()943     fn test_regex_captures_len() {
944         let regex = Regex::new("(he)(l+)(o)").unwrap();
945         assert_eq!(regex.captures_len(), 3);
946     }
947 
948     #[test]
test_regex_error_is_match()949     fn test_regex_error_is_match() {
950         let regex = Regex::new("(a|b|ab)*bc").unwrap();
951         let result = regex.match_with_param(
952             "ababababababababababababababababababababababababababababacbc",
953             0,
954             SearchOptions::SEARCH_OPTION_NONE,
955             None,
956             MatchParam::default(),
957         );
958 
959         let e = result.err().unwrap();
960         assert_eq!("retry-limit-in-match over", e.description());
961     }
962 
963     #[test]
test_regex_panic_is_match()964     fn test_regex_panic_is_match() {
965         let regex = Regex::new("(a|b|ab)*bc").unwrap();
966         let result = panic::catch_unwind(|| {
967             regex.is_match("ababababababababababababababababababababababababababababacbc")
968         });
969         let e = result.err().unwrap();
970         let message = e.downcast_ref::<String>().unwrap();
971         assert_eq!(
972             message.as_str(),
973             "Onig: Regex match error: retry-limit-in-match over"
974         );
975     }
976 
977     #[test]
test_regex_error_find()978     fn test_regex_error_find() {
979         let regex = Regex::new("(a|b|ab)*bc").unwrap();
980         let s = "ababababababababababababababababababababababababababababacbc";
981         let result = regex.search_with_param(
982             s,
983             0,
984             s.len(),
985             SearchOptions::SEARCH_OPTION_NONE,
986             None,
987             MatchParam::default(),
988         );
989 
990         let e = result.err().unwrap();
991         assert_eq!("retry-limit-in-match over", e.description());
992     }
993 
994     #[test]
test_regex_panic_find()995     fn test_regex_panic_find() {
996         let regex = Regex::new("(a|b|ab)*bc").unwrap();
997         let result = panic::catch_unwind(|| {
998             regex.find("ababababababababababababababababababababababababababababacbc")
999         });
1000         let e = result.err().unwrap();
1001         let message = e.downcast_ref::<String>().unwrap();
1002         assert_eq!(
1003             message.as_str(),
1004             "Onig: Regex search error: retry-limit-in-match over"
1005         );
1006     }
1007 
1008     #[test]
test_search_with_invalid_range()1009     fn test_search_with_invalid_range() {
1010         let regex = Regex::with_options("R...", RegexOptions::REGEX_OPTION_NONE, Syntax::default())
1011             .expect("regex");
1012         let string = "Ruby";
1013         let is_match = regex.search_with_param(
1014             string,
1015             5,
1016             string.len(),
1017             SearchOptions::SEARCH_OPTION_NONE,
1018             None,
1019             MatchParam::default(),
1020         );
1021         assert!(is_match.is_err());
1022 
1023         let is_match = regex.search_with_param(
1024             string,
1025             2,
1026             string.len() + 1,
1027             SearchOptions::SEARCH_OPTION_NONE,
1028             None,
1029             MatchParam::default(),
1030         );
1031         assert!(is_match.is_err());
1032     }
1033 
1034     #[test]
test_search_with_invalid_range_panic()1035     fn test_search_with_invalid_range_panic() {
1036         let regex = Regex::with_options("R...", RegexOptions::REGEX_OPTION_NONE, Syntax::default())
1037             .expect("regex");
1038         let string = "Ruby";
1039         let is_match = panic::catch_unwind(|| {
1040             regex.search_with_encoding(
1041                 string,
1042                 5,
1043                 string.len(),
1044                 SearchOptions::SEARCH_OPTION_NONE,
1045                 None,
1046             )
1047         });
1048         assert!(is_match.is_err());
1049     }
1050 
1051     #[test]
test_match_with_invalid_range()1052     fn test_match_with_invalid_range() {
1053         let regex = Regex::with_options("R...", RegexOptions::REGEX_OPTION_NONE, Syntax::default())
1054             .expect("regex");
1055         let string = "Ruby";
1056         let is_match = regex.match_with_param(
1057             string,
1058             5,
1059             SearchOptions::SEARCH_OPTION_NONE,
1060             None,
1061             MatchParam::default(),
1062         );
1063         assert!(is_match.is_err());
1064     }
1065 
1066     #[test]
test_match_with_invalid_range_panic()1067     fn test_match_with_invalid_range_panic() {
1068         let regex = Regex::with_options("R...", RegexOptions::REGEX_OPTION_NONE, Syntax::default())
1069             .expect("regex");
1070         let string = "Ruby";
1071         let is_match = panic::catch_unwind(|| {
1072             regex.match_with_encoding(string, 5, SearchOptions::SEARCH_OPTION_NONE, None)
1073         });
1074         assert!(is_match.is_err());
1075     }
1076 }
1077