1 use ast;
2 use hir;
3 
4 use Result;
5 
6 /// A builder for a regular expression parser.
7 ///
8 /// This builder permits modifying configuration options for the parser.
9 ///
10 /// This type combines the builder options for both the
11 /// [AST `ParserBuilder`](ast/parse/struct.ParserBuilder.html)
12 /// and the
13 /// [HIR `TranslatorBuilder`](hir/translate/struct.TranslatorBuilder.html).
14 #[derive(Clone, Debug, Default)]
15 pub struct ParserBuilder {
16     ast: ast::parse::ParserBuilder,
17     hir: hir::translate::TranslatorBuilder,
18 }
19 
20 impl ParserBuilder {
21     /// Create a new parser builder with a default configuration.
new() -> ParserBuilder22     pub fn new() -> ParserBuilder {
23         ParserBuilder::default()
24     }
25 
26     /// Build a parser from this configuration with the given pattern.
build(&self) -> Parser27     pub fn build(&self) -> Parser {
28         Parser { ast: self.ast.build(), hir: self.hir.build() }
29     }
30 
31     /// Set the nesting limit for this parser.
32     ///
33     /// The nesting limit controls how deep the abstract syntax tree is allowed
34     /// to be. If the AST exceeds the given limit (e.g., with too many nested
35     /// groups), then an error is returned by the parser.
36     ///
37     /// The purpose of this limit is to act as a heuristic to prevent stack
38     /// overflow for consumers that do structural induction on an `Ast` using
39     /// explicit recursion. While this crate never does this (instead using
40     /// constant stack space and moving the call stack to the heap), other
41     /// crates may.
42     ///
43     /// This limit is not checked until the entire Ast is parsed. Therefore,
44     /// if callers want to put a limit on the amount of heap space used, then
45     /// they should impose a limit on the length, in bytes, of the concrete
46     /// pattern string. In particular, this is viable since this parser
47     /// implementation will limit itself to heap space proportional to the
48     /// lenth of the pattern string.
49     ///
50     /// Note that a nest limit of `0` will return a nest limit error for most
51     /// patterns but not all. For example, a nest limit of `0` permits `a` but
52     /// not `ab`, since `ab` requires a concatenation, which results in a nest
53     /// depth of `1`. In general, a nest limit is not something that manifests
54     /// in an obvious way in the concrete syntax, therefore, it should not be
55     /// used in a granular way.
nest_limit(&mut self, limit: u32) -> &mut ParserBuilder56     pub fn nest_limit(&mut self, limit: u32) -> &mut ParserBuilder {
57         self.ast.nest_limit(limit);
58         self
59     }
60 
61     /// Whether to support octal syntax or not.
62     ///
63     /// Octal syntax is a little-known way of uttering Unicode codepoints in
64     /// a regular expression. For example, `a`, `\x61`, `\u0061` and
65     /// `\141` are all equivalent regular expressions, where the last example
66     /// shows octal syntax.
67     ///
68     /// While supporting octal syntax isn't in and of itself a problem, it does
69     /// make good error messages harder. That is, in PCRE based regex engines,
70     /// syntax like `\0` invokes a backreference, which is explicitly
71     /// unsupported in Rust's regex engine. However, many users expect it to
72     /// be supported. Therefore, when octal support is disabled, the error
73     /// message will explicitly mention that backreferences aren't supported.
74     ///
75     /// Octal syntax is disabled by default.
octal(&mut self, yes: bool) -> &mut ParserBuilder76     pub fn octal(&mut self, yes: bool) -> &mut ParserBuilder {
77         self.ast.octal(yes);
78         self
79     }
80 
81     /// When enabled, the parser will permit the construction of a regular
82     /// expression that may match invalid UTF-8.
83     ///
84     /// When disabled (the default), the parser is guaranteed to produce
85     /// an expression that will only ever match valid UTF-8 (otherwise, the
86     /// parser will return an error).
87     ///
88     /// Perhaps surprisingly, when invalid UTF-8 isn't allowed, a negated ASCII
89     /// word boundary (uttered as `(?-u:\B)` in the concrete syntax) will cause
90     /// the parser to return an error. Namely, a negated ASCII word boundary
91     /// can result in matching positions that aren't valid UTF-8 boundaries.
allow_invalid_utf8(&mut self, yes: bool) -> &mut ParserBuilder92     pub fn allow_invalid_utf8(&mut self, yes: bool) -> &mut ParserBuilder {
93         self.hir.allow_invalid_utf8(yes);
94         self
95     }
96 
97     /// Enable verbose mode in the regular expression.
98     ///
99     /// When enabled, verbose mode permits insigificant whitespace in many
100     /// places in the regular expression, as well as comments. Comments are
101     /// started using `#` and continue until the end of the line.
102     ///
103     /// By default, this is disabled. It may be selectively enabled in the
104     /// regular expression by using the `x` flag regardless of this setting.
ignore_whitespace(&mut self, yes: bool) -> &mut ParserBuilder105     pub fn ignore_whitespace(&mut self, yes: bool) -> &mut ParserBuilder {
106         self.ast.ignore_whitespace(yes);
107         self
108     }
109 
110     /// Enable or disable the case insensitive flag by default.
111     ///
112     /// By default this is disabled. It may alternatively be selectively
113     /// enabled in the regular expression itself via the `i` flag.
case_insensitive(&mut self, yes: bool) -> &mut ParserBuilder114     pub fn case_insensitive(&mut self, yes: bool) -> &mut ParserBuilder {
115         self.hir.case_insensitive(yes);
116         self
117     }
118 
119     /// Enable or disable the multi-line matching flag by default.
120     ///
121     /// By default this is disabled. It may alternatively be selectively
122     /// enabled in the regular expression itself via the `m` flag.
multi_line(&mut self, yes: bool) -> &mut ParserBuilder123     pub fn multi_line(&mut self, yes: bool) -> &mut ParserBuilder {
124         self.hir.multi_line(yes);
125         self
126     }
127 
128     /// Enable or disable the "dot matches any character" flag by default.
129     ///
130     /// By default this is disabled. It may alternatively be selectively
131     /// enabled in the regular expression itself via the `s` flag.
dot_matches_new_line(&mut self, yes: bool) -> &mut ParserBuilder132     pub fn dot_matches_new_line(&mut self, yes: bool) -> &mut ParserBuilder {
133         self.hir.dot_matches_new_line(yes);
134         self
135     }
136 
137     /// Enable or disable the "swap greed" flag by default.
138     ///
139     /// By default this is disabled. It may alternatively be selectively
140     /// enabled in the regular expression itself via the `U` flag.
swap_greed(&mut self, yes: bool) -> &mut ParserBuilder141     pub fn swap_greed(&mut self, yes: bool) -> &mut ParserBuilder {
142         self.hir.swap_greed(yes);
143         self
144     }
145 
146     /// Enable or disable the Unicode flag (`u`) by default.
147     ///
148     /// By default this is **enabled**. It may alternatively be selectively
149     /// disabled in the regular expression itself via the `u` flag.
150     ///
151     /// Note that unless `allow_invalid_utf8` is enabled (it's disabled by
152     /// default), a regular expression will fail to parse if Unicode mode is
153     /// disabled and a sub-expression could possibly match invalid UTF-8.
unicode(&mut self, yes: bool) -> &mut ParserBuilder154     pub fn unicode(&mut self, yes: bool) -> &mut ParserBuilder {
155         self.hir.unicode(yes);
156         self
157     }
158 }
159 
160 /// A convenience parser for regular expressions.
161 ///
162 /// This parser takes as input a regular expression pattern string (the
163 /// "concrete syntax") and returns a high-level intermediate representation
164 /// (the HIR) suitable for most types of analysis. In particular, this parser
165 /// hides the intermediate state of producing an AST (the "abstract syntax").
166 /// The AST is itself far more complex than the HIR, so this parser serves as a
167 /// convenience for never having to deal with it at all.
168 ///
169 /// If callers have more fine grained use cases that need an AST, then please
170 /// see the [`ast::parse`](ast/parse/index.html) module.
171 ///
172 /// A `Parser` can be configured in more detail via a
173 /// [`ParserBuilder`](struct.ParserBuilder.html).
174 #[derive(Clone, Debug)]
175 pub struct Parser {
176     ast: ast::parse::Parser,
177     hir: hir::translate::Translator,
178 }
179 
180 impl Parser {
181     /// Create a new parser with a default configuration.
182     ///
183     /// The parser can be run with `parse` method. The parse method returns
184     /// a high level intermediate representation of the given regular
185     /// expression.
186     ///
187     /// To set configuration options on the parser, use
188     /// [`ParserBuilder`](struct.ParserBuilder.html).
new() -> Parser189     pub fn new() -> Parser {
190         ParserBuilder::new().build()
191     }
192 
193     /// Parse the regular expression into a high level intermediate
194     /// representation.
parse(&mut self, pattern: &str) -> Result<hir::Hir>195     pub fn parse(&mut self, pattern: &str) -> Result<hir::Hir> {
196         let ast = self.ast.parse(pattern)?;
197         let hir = self.hir.translate(pattern, &ast)?;
198         Ok(hir)
199     }
200 }
201