/// The set of user configurable options for compiling zero or more regexes.
///
/// The builders generated in this file wrap a value of this type: their
/// `new` constructors fill `pats`, each setter writes the field of the same
/// name, and `build` hands a clone of the whole struct to `ExecBuilder`.
#[derive(Clone, Debug)]
pub struct RegexOptions {
    /// The pattern strings to compile, in the order they were supplied.
    pub pats: Vec<String>,
    /// Approximate size limit, in bytes, of a single compiled program.
    pub size_limit: usize,
    /// Approximate size, in bytes, of the cache used by the DFA.
    pub dfa_size_limit: usize,
    /// How deep the abstract syntax tree of a pattern is allowed to be.
    pub nest_limit: u32,
    /// Value of the case insensitive (`i`) flag.
    pub case_insensitive: bool,
    /// Value of the multi-line matching (`m`) flag.
    pub multi_line: bool,
    /// Value of the any character (`s`) flag.
    pub dot_matches_new_line: bool,
    /// Value of the greedy swap (`U`) flag.
    pub swap_greed: bool,
    /// Value of the ignore whitespace (`x`) flag.
    pub ignore_whitespace: bool,
    /// Value of the Unicode (`u`) flag.
    pub unicode: bool,
    /// Whether octal syntax is supported.
    pub octal: bool,
}
17 
18 impl Default for RegexOptions {
default() -> Self19     fn default() -> Self {
20         RegexOptions {
21             pats: vec![],
22             size_limit: 10 * (1 << 20),
23             dfa_size_limit: 2 * (1 << 20),
24             nest_limit: 250,
25             case_insensitive: false,
26             multi_line: false,
27             dot_matches_new_line: false,
28             swap_greed: false,
29             ignore_whitespace: false,
30             unicode: true,
31             octal: false,
32         }
33     }
34 }
35 
// Generates a public module containing a `RegexBuilder` type.
//
// * `$name` is the name of the generated module.
// * `$regex_mod` is the module that provides the `Regex` type returned by
//   `RegexBuilder::build`.
// * `$only_utf8` is the value forwarded to `ExecBuilder::only_utf8`.
macro_rules! define_builder {
    ($name:ident, $regex_mod:ident, $only_utf8:expr) => {
        pub mod $name {
            use super::RegexOptions;
            use error::Error;
            use exec::ExecBuilder;

            use $regex_mod::Regex;

            /// A configurable builder for a regular expression.
            ///
            /// A builder can be used to configure how the regex is built, for example, by
            /// setting the default flags (which can be overridden in the expression
            /// itself) or setting various limits.
            #[derive(Clone, Debug)]
            pub struct RegexBuilder(RegexOptions);

            impl RegexBuilder {
                /// Create a new regular expression builder with the given pattern.
                ///
                /// The pattern is only stored here. If it is invalid, then an error
                /// will be returned when `build` is called.
                pub fn new(pattern: &str) -> RegexBuilder {
                    let mut builder = RegexBuilder(RegexOptions::default());
                    builder.0.pats.push(pattern.to_owned());
                    builder
                }

                /// Compile the regular expression using the options set so far.
                ///
                /// The builder is only borrowed: its options are cloned for the
                /// compilation, so the builder can still be used afterwards.
                ///
                /// Note that calling `as_str` on the resulting `Regex` will produce the
                /// pattern given to `new` verbatim. Notably, it will not incorporate any
                /// of the flags set on this builder.
                pub fn build(&self) -> Result<Regex, Error> {
                    ExecBuilder::new_options(self.0.clone())
                        .only_utf8($only_utf8)
                        .build()
                        .map(Regex::from)
                }

                /// Set the value for the case insensitive (`i`) flag.
                ///
                /// When enabled, letters in the pattern will match both upper case and
                /// lower case variants.
                pub fn case_insensitive(
                    &mut self,
                    yes: bool,
                ) -> &mut RegexBuilder {
                    self.0.case_insensitive = yes;
                    self
                }

                /// Set the value for the multi-line matching (`m`) flag.
                ///
                /// When enabled, `^` matches the beginning of lines and `$` matches the
                /// end of lines.
                ///
                /// By default, they match beginning/end of the input.
                pub fn multi_line(&mut self, yes: bool) -> &mut RegexBuilder {
                    self.0.multi_line = yes;
                    self
                }

                /// Set the value for the any character (`s`) flag, where in `.` matches
                /// anything when `s` is set and matches anything except for new line when
                /// it is not set (the default).
                ///
                /// N.B. "matches anything" means "any byte" when Unicode is disabled and
                /// means "any valid UTF-8 encoding of any Unicode scalar value" when
                /// Unicode is enabled.
                pub fn dot_matches_new_line(
                    &mut self,
                    yes: bool,
                ) -> &mut RegexBuilder {
                    self.0.dot_matches_new_line = yes;
                    self
                }

                /// Set the value for the greedy swap (`U`) flag.
                ///
                /// When enabled, a pattern like `a*` is lazy (tries to find shortest
                /// match) and `a*?` is greedy (tries to find longest match).
                ///
                /// By default, `a*` is greedy and `a*?` is lazy.
                pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder {
                    self.0.swap_greed = yes;
                    self
                }

                /// Set the value for the ignore whitespace (`x`) flag.
                ///
                /// When enabled, whitespace such as new lines and spaces will be ignored
                /// between expressions of the pattern, and `#` can be used to start a
                /// comment until the next new line.
                pub fn ignore_whitespace(
                    &mut self,
                    yes: bool,
                ) -> &mut RegexBuilder {
                    self.0.ignore_whitespace = yes;
                    self
                }

                /// Set the value for the Unicode (`u`) flag.
                ///
                /// Enabled by default. When disabled, character classes such as `\w` only
                /// match ASCII word characters instead of all Unicode word characters.
                pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder {
                    self.0.unicode = yes;
                    self
                }

                /// Whether to support octal syntax or not.
                ///
                /// Octal syntax is a little-known way of uttering Unicode codepoints in
                /// a regular expression. For example, `a`, `\x61`, `\u0061` and
                /// `\141` are all equivalent regular expressions, where the last example
                /// shows octal syntax.
                ///
                /// While supporting octal syntax isn't in and of itself a problem, it does
                /// make good error messages harder. That is, in PCRE based regex engines,
                /// syntax like `\0` invokes a backreference, which is explicitly
                /// unsupported in Rust's regex engine. However, many users expect it to
                /// be supported. Therefore, when octal support is disabled, the error
                /// message will explicitly mention that backreferences aren't supported.
                ///
                /// Octal syntax is disabled by default.
                pub fn octal(&mut self, yes: bool) -> &mut RegexBuilder {
                    self.0.octal = yes;
                    self
                }

                /// Set the approximate size limit of the compiled regular expression.
                ///
                /// This roughly corresponds to the number of bytes occupied by a single
                /// compiled program. If the program exceeds this number, then a
                /// compilation error is returned.
                pub fn size_limit(
                    &mut self,
                    limit: usize,
                ) -> &mut RegexBuilder {
                    self.0.size_limit = limit;
                    self
                }

                /// Set the approximate size of the cache used by the DFA.
                ///
                /// This roughly corresponds to the number of bytes that the DFA will
                /// use while searching.
                ///
                /// Note that this is a *per thread* limit. There is no way to set a global
                /// limit. In particular, if a regex is used from multiple threads
                /// simultaneously, then each thread may use up to the number of bytes
                /// specified here.
                pub fn dfa_size_limit(
                    &mut self,
                    limit: usize,
                ) -> &mut RegexBuilder {
                    self.0.dfa_size_limit = limit;
                    self
                }

                /// Set the nesting limit for this parser.
                ///
                /// The nesting limit controls how deep the abstract syntax tree is allowed
                /// to be. If the AST exceeds the given limit (e.g., with too many nested
                /// groups), then an error is returned by the parser.
                ///
                /// The purpose of this limit is to act as a heuristic to prevent stack
                /// overflow for consumers that do structural induction on an `Ast` using
                /// explicit recursion. While this crate never does this (instead using
                /// constant stack space and moving the call stack to the heap), other
                /// crates may.
                ///
                /// This limit is not checked until the entire Ast is parsed. Therefore,
                /// if callers want to put a limit on the amount of heap space used, then
                /// they should impose a limit on the length, in bytes, of the concrete
                /// pattern string. In particular, this is viable since this parser
                /// implementation will limit itself to heap space proportional to the
                /// length of the pattern string.
                ///
                /// Note that a nest limit of `0` will return a nest limit error for most
                /// patterns but not all. For example, a nest limit of `0` permits `a` but
                /// not `ab`, since `ab` requires a concatenation, which results in a nest
                /// depth of `1`. In general, a nest limit is not something that manifests
                /// in an obvious way in the concrete syntax, therefore, it should not be
                /// used in a granular way.
                pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder {
                    self.0.nest_limit = limit;
                    self
                }
            }
        }
    };
}
229 
// Expands to `pub mod bytes`, whose `RegexBuilder` builds a
// `re_bytes::Regex` with `only_utf8(false)`.
define_builder!(bytes, re_bytes, false);
// Expands to `pub mod unicode`, whose `RegexBuilder` builds a
// `re_unicode::Regex` with `only_utf8(true)`.
define_builder!(unicode, re_unicode, true);
232 
// Generates a public module containing a `RegexSetBuilder` type.
//
// * `$name` is the name of the generated module.
// * `$regex_mod` is the submodule of `re_set` that provides the `RegexSet`
//   type returned by `RegexSetBuilder::build`.
// * `$only_utf8` is the value forwarded to `ExecBuilder::only_utf8`.
macro_rules! define_set_builder {
    ($name:ident, $regex_mod:ident, $only_utf8:expr) => {
        pub mod $name {
            use super::RegexOptions;
            use error::Error;
            use exec::ExecBuilder;

            use re_set::$regex_mod::RegexSet;

            /// A configurable builder for a set of regular expressions.
            ///
            /// A builder can be used to configure how the regexes are built, for example,
            /// by setting the default flags (which can be overridden in the expression
            /// itself) or setting various limits.
            #[derive(Clone, Debug)]
            pub struct RegexSetBuilder(RegexOptions);

            impl RegexSetBuilder {
                /// Create a new regular expression set builder with the given patterns.
                ///
                /// The patterns are only stored here, in iteration order. If a pattern
                /// is invalid, then an error will be returned when `build` is called.
                pub fn new<I, S>(patterns: I) -> RegexSetBuilder
                where
                    S: AsRef<str>,
                    I: IntoIterator<Item = S>,
                {
                    let mut builder = RegexSetBuilder(RegexOptions::default());
                    builder
                        .0
                        .pats
                        .extend(patterns.into_iter().map(|pat| pat.as_ref().to_owned()));
                    builder
                }

                /// Compile the regular expressions into a set using the options set so
                /// far.
                ///
                /// The builder is only borrowed: its options are cloned for the
                /// compilation, so the builder can still be used afterwards.
                pub fn build(&self) -> Result<RegexSet, Error> {
                    ExecBuilder::new_options(self.0.clone())
                        .only_utf8($only_utf8)
                        .build()
                        .map(RegexSet::from)
                }

                /// Set the value for the case insensitive (`i`) flag.
                ///
                /// When enabled, letters in the patterns will match both upper case and
                /// lower case variants.
                pub fn case_insensitive(
                    &mut self,
                    yes: bool,
                ) -> &mut RegexSetBuilder {
                    self.0.case_insensitive = yes;
                    self
                }

                /// Set the value for the multi-line matching (`m`) flag.
                ///
                /// When enabled, `^` matches the beginning of lines and `$` matches the
                /// end of lines.
                ///
                /// By default, they match beginning/end of the input.
                pub fn multi_line(
                    &mut self,
                    yes: bool,
                ) -> &mut RegexSetBuilder {
                    self.0.multi_line = yes;
                    self
                }

                /// Set the value for the any character (`s`) flag, where in `.` matches
                /// anything when `s` is set and matches anything except for new line when
                /// it is not set (the default).
                ///
                /// N.B. "matches anything" means "any byte" for `regex::bytes::RegexSet`
                /// expressions and means "any Unicode scalar value" for `regex::RegexSet`
                /// expressions.
                pub fn dot_matches_new_line(
                    &mut self,
                    yes: bool,
                ) -> &mut RegexSetBuilder {
                    self.0.dot_matches_new_line = yes;
                    self
                }

                /// Set the value for the greedy swap (`U`) flag.
                ///
                /// When enabled, a pattern like `a*` is lazy (tries to find shortest
                /// match) and `a*?` is greedy (tries to find longest match).
                ///
                /// By default, `a*` is greedy and `a*?` is lazy.
                pub fn swap_greed(
                    &mut self,
                    yes: bool,
                ) -> &mut RegexSetBuilder {
                    self.0.swap_greed = yes;
                    self
                }

                /// Set the value for the ignore whitespace (`x`) flag.
                ///
                /// When enabled, whitespace such as new lines and spaces will be ignored
                /// between expressions of the pattern, and `#` can be used to start a
                /// comment until the next new line.
                pub fn ignore_whitespace(
                    &mut self,
                    yes: bool,
                ) -> &mut RegexSetBuilder {
                    self.0.ignore_whitespace = yes;
                    self
                }

                /// Set the value for the Unicode (`u`) flag.
                ///
                /// Enabled by default. When disabled, character classes such as `\w` only
                /// match ASCII word characters instead of all Unicode word characters.
                pub fn unicode(&mut self, yes: bool) -> &mut RegexSetBuilder {
                    self.0.unicode = yes;
                    self
                }

                /// Whether to support octal syntax or not.
                ///
                /// Octal syntax is a little-known way of uttering Unicode codepoints in
                /// a regular expression. For example, `a`, `\x61`, `\u0061` and
                /// `\141` are all equivalent regular expressions, where the last example
                /// shows octal syntax.
                ///
                /// While supporting octal syntax isn't in and of itself a problem, it does
                /// make good error messages harder. That is, in PCRE based regex engines,
                /// syntax like `\0` invokes a backreference, which is explicitly
                /// unsupported in Rust's regex engine. However, many users expect it to
                /// be supported. Therefore, when octal support is disabled, the error
                /// message will explicitly mention that backreferences aren't supported.
                ///
                /// Octal syntax is disabled by default.
                pub fn octal(&mut self, yes: bool) -> &mut RegexSetBuilder {
                    self.0.octal = yes;
                    self
                }

                /// Set the approximate size limit of the compiled regular expression.
                ///
                /// This roughly corresponds to the number of bytes occupied by a single
                /// compiled program. If the program exceeds this number, then a
                /// compilation error is returned.
                pub fn size_limit(
                    &mut self,
                    limit: usize,
                ) -> &mut RegexSetBuilder {
                    self.0.size_limit = limit;
                    self
                }

                /// Set the approximate size of the cache used by the DFA.
                ///
                /// This roughly corresponds to the number of bytes that the DFA will
                /// use while searching.
                ///
                /// Note that this is a *per thread* limit. There is no way to set a global
                /// limit. In particular, if a regex is used from multiple threads
                /// simultaneously, then each thread may use up to the number of bytes
                /// specified here.
                pub fn dfa_size_limit(
                    &mut self,
                    limit: usize,
                ) -> &mut RegexSetBuilder {
                    self.0.dfa_size_limit = limit;
                    self
                }

                /// Set the nesting limit for this parser.
                ///
                /// The nesting limit controls how deep the abstract syntax tree is allowed
                /// to be. If the AST exceeds the given limit (e.g., with too many nested
                /// groups), then an error is returned by the parser.
                ///
                /// The purpose of this limit is to act as a heuristic to prevent stack
                /// overflow for consumers that do structural induction on an `Ast` using
                /// explicit recursion. While this crate never does this (instead using
                /// constant stack space and moving the call stack to the heap), other
                /// crates may.
                ///
                /// This limit is not checked until the entire Ast is parsed. Therefore,
                /// if callers want to put a limit on the amount of heap space used, then
                /// they should impose a limit on the length, in bytes, of the concrete
                /// pattern string. In particular, this is viable since this parser
                /// implementation will limit itself to heap space proportional to the
                /// length of the pattern string.
                ///
                /// Note that a nest limit of `0` will return a nest limit error for most
                /// patterns but not all. For example, a nest limit of `0` permits `a` but
                /// not `ab`, since `ab` requires a concatenation, which results in a nest
                /// depth of `1`. In general, a nest limit is not something that manifests
                /// in an obvious way in the concrete syntax, therefore, it should not be
                /// used in a granular way.
                pub fn nest_limit(
                    &mut self,
                    limit: u32,
                ) -> &mut RegexSetBuilder {
                    self.0.nest_limit = limit;
                    self
                }
            }
        }
    };
}
417 
// Expands to `pub mod set_bytes`, whose `RegexSetBuilder` builds a
// `re_set::bytes::RegexSet` with `only_utf8(false)`.
define_set_builder!(set_bytes, bytes, false);
// Expands to `pub mod set_unicode`, whose `RegexSetBuilder` builds a
// `re_set::unicode::RegexSet` with `only_utf8(true)`.
define_set_builder!(set_unicode, unicode, true);
420