1 /// The set of user configurable options for compiling zero or more regexes. 2 #[derive(Clone, Debug)] 3 #[allow(missing_docs)] 4 pub struct RegexOptions { 5 pub pats: Vec<String>, 6 pub size_limit: usize, 7 pub dfa_size_limit: usize, 8 pub nest_limit: u32, 9 pub case_insensitive: bool, 10 pub multi_line: bool, 11 pub dot_matches_new_line: bool, 12 pub swap_greed: bool, 13 pub ignore_whitespace: bool, 14 pub unicode: bool, 15 pub octal: bool, 16 } 17 18 impl Default for RegexOptions { default() -> Self19 fn default() -> Self { 20 RegexOptions { 21 pats: vec![], 22 size_limit: 10 * (1 << 20), 23 dfa_size_limit: 2 * (1 << 20), 24 nest_limit: 250, 25 case_insensitive: false, 26 multi_line: false, 27 dot_matches_new_line: false, 28 swap_greed: false, 29 ignore_whitespace: false, 30 unicode: true, 31 octal: false, 32 } 33 } 34 } 35 36 macro_rules! define_builder { 37 ($name:ident, $regex_mod:ident, $only_utf8:expr) => { 38 pub mod $name { 39 use super::RegexOptions; 40 use error::Error; 41 use exec::ExecBuilder; 42 43 use $regex_mod::Regex; 44 45 /// A configurable builder for a regular expression. 46 /// 47 /// A builder can be used to configure how the regex is built, for example, by 48 /// setting the default flags (which can be overridden in the expression 49 /// itself) or setting various limits. 50 pub struct RegexBuilder(RegexOptions); 51 52 impl RegexBuilder { 53 /// Create a new regular expression builder with the given pattern. 54 /// 55 /// If the pattern is invalid, then an error will be returned when 56 /// `build` is called. 57 pub fn new(pattern: &str) -> RegexBuilder { 58 let mut builder = RegexBuilder(RegexOptions::default()); 59 builder.0.pats.push(pattern.to_owned()); 60 builder 61 } 62 63 /// Consume the builder and compile the regular expression. 64 /// 65 /// Note that calling `as_str` on the resulting `Regex` will produce the 66 /// pattern given to `new` verbatim. Notably, it will not incorporate any 67 /// of the flags set on this builder. 68 pub fn build(&self) -> Result<Regex, Error> { 69 ExecBuilder::new_options(self.0.clone()) 70 .only_utf8($only_utf8) 71 .build() 72 .map(Regex::from) 73 } 74 75 /// Set the value for the case insensitive (`i`) flag. 76 /// 77 /// When enabled, letters in the pattern will match both upper case and 78 /// lower case variants. 79 pub fn case_insensitive( 80 &mut self, 81 yes: bool, 82 ) -> &mut RegexBuilder { 83 self.0.case_insensitive = yes; 84 self 85 } 86 87 /// Set the value for the multi-line matching (`m`) flag. 88 /// 89 /// When enabled, `^` matches the beginning of lines and `$` matches the 90 /// end of lines. 91 /// 92 /// By default, they match beginning/end of the input. 93 pub fn multi_line(&mut self, yes: bool) -> &mut RegexBuilder { 94 self.0.multi_line = yes; 95 self 96 } 97 98 /// Set the value for the any character (`s`) flag, where in `.` matches 99 /// anything when `s` is set and matches anything except for new line when 100 /// it is not set (the default). 101 /// 102 /// N.B. "matches anything" means "any byte" when Unicode is disabled and 103 /// means "any valid UTF-8 encoding of any Unicode scalar value" when 104 /// Unicode is enabled. 105 pub fn dot_matches_new_line( 106 &mut self, 107 yes: bool, 108 ) -> &mut RegexBuilder { 109 self.0.dot_matches_new_line = yes; 110 self 111 } 112 113 /// Set the value for the greedy swap (`U`) flag. 114 /// 115 /// When enabled, a pattern like `a*` is lazy (tries to find shortest 116 /// match) and `a*?` is greedy (tries to find longest match). 117 /// 118 /// By default, `a*` is greedy and `a*?` is lazy. 119 pub fn swap_greed(&mut self, yes: bool) -> &mut RegexBuilder { 120 self.0.swap_greed = yes; 121 self 122 } 123 124 /// Set the value for the ignore whitespace (`x`) flag. 125 /// 126 /// When enabled, whitespace such as new lines and spaces will be ignored 127 /// between expressions of the pattern, and `#` can be used to start a 128 /// comment until the next new line. 129 pub fn ignore_whitespace( 130 &mut self, 131 yes: bool, 132 ) -> &mut RegexBuilder { 133 self.0.ignore_whitespace = yes; 134 self 135 } 136 137 /// Set the value for the Unicode (`u`) flag. 138 /// 139 /// Enabled by default. When disabled, character classes such as `\w` only 140 /// match ASCII word characters instead of all Unicode word characters. 141 pub fn unicode(&mut self, yes: bool) -> &mut RegexBuilder { 142 self.0.unicode = yes; 143 self 144 } 145 146 /// Whether to support octal syntax or not. 147 /// 148 /// Octal syntax is a little-known way of uttering Unicode codepoints in 149 /// a regular expression. For example, `a`, `\x61`, `\u0061` and 150 /// `\141` are all equivalent regular expressions, where the last example 151 /// shows octal syntax. 152 /// 153 /// While supporting octal syntax isn't in and of itself a problem, it does 154 /// make good error messages harder. That is, in PCRE based regex engines, 155 /// syntax like `\0` invokes a backreference, which is explicitly 156 /// unsupported in Rust's regex engine. However, many users expect it to 157 /// be supported. Therefore, when octal support is disabled, the error 158 /// message will explicitly mention that backreferences aren't supported. 159 /// 160 /// Octal syntax is disabled by default. 161 pub fn octal(&mut self, yes: bool) -> &mut RegexBuilder { 162 self.0.octal = yes; 163 self 164 } 165 166 /// Set the approximate size limit of the compiled regular expression. 167 /// 168 /// This roughly corresponds to the number of bytes occupied by a single 169 /// compiled program. If the program exceeds this number, then a 170 /// compilation error is returned. 171 pub fn size_limit( 172 &mut self, 173 limit: usize, 174 ) -> &mut RegexBuilder { 175 self.0.size_limit = limit; 176 self 177 } 178 179 /// Set the approximate size of the cache used by the DFA. 180 /// 181 /// This roughly corresponds to the number of bytes that the DFA will 182 /// use while searching. 183 /// 184 /// Note that this is a *per thread* limit. There is no way to set a global 185 /// limit. In particular, if a regex is used from multiple threads 186 /// simultaneously, then each thread may use up to the number of bytes 187 /// specified here. 188 pub fn dfa_size_limit( 189 &mut self, 190 limit: usize, 191 ) -> &mut RegexBuilder { 192 self.0.dfa_size_limit = limit; 193 self 194 } 195 196 /// Set the nesting limit for this parser. 197 /// 198 /// The nesting limit controls how deep the abstract syntax tree is allowed 199 /// to be. If the AST exceeds the given limit (e.g., with too many nested 200 /// groups), then an error is returned by the parser. 201 /// 202 /// The purpose of this limit is to act as a heuristic to prevent stack 203 /// overflow for consumers that do structural induction on an `Ast` using 204 /// explicit recursion. While this crate never does this (instead using 205 /// constant stack space and moving the call stack to the heap), other 206 /// crates may. 207 /// 208 /// This limit is not checked until the entire Ast is parsed. Therefore, 209 /// if callers want to put a limit on the amount of heap space used, then 210 /// they should impose a limit on the length, in bytes, of the concrete 211 /// pattern string. In particular, this is viable since this parser 212 /// implementation will limit itself to heap space proportional to the 213 /// length of the pattern string. 214 /// 215 /// Note that a nest limit of `0` will return a nest limit error for most 216 /// patterns but not all. For example, a nest limit of `0` permits `a` but 217 /// not `ab`, since `ab` requires a concatenation, which results in a nest 218 /// depth of `1`. In general, a nest limit is not something that manifests 219 /// in an obvious way in the concrete syntax, therefore, it should not be 220 /// used in a granular way. 221 pub fn nest_limit(&mut self, limit: u32) -> &mut RegexBuilder { 222 self.0.nest_limit = limit; 223 self 224 } 225 } 226 } 227 }; 228 } 229 230 define_builder!(bytes, re_bytes, false); 231 define_builder!(unicode, re_unicode, true); 232 233 macro_rules! define_set_builder { 234 ($name:ident, $regex_mod:ident, $only_utf8:expr) => { 235 pub mod $name { 236 use super::RegexOptions; 237 use error::Error; 238 use exec::ExecBuilder; 239 240 use re_set::$regex_mod::RegexSet; 241 242 /// A configurable builder for a set of regular expressions. 243 /// 244 /// A builder can be used to configure how the regexes are built, for example, 245 /// by setting the default flags (which can be overridden in the expression 246 /// itself) or setting various limits. 247 pub struct RegexSetBuilder(RegexOptions); 248 249 impl RegexSetBuilder { 250 /// Create a new regular expression builder with the given pattern. 251 /// 252 /// If the pattern is invalid, then an error will be returned when 253 /// `build` is called. 254 pub fn new<I, S>(patterns: I) -> RegexSetBuilder 255 where 256 S: AsRef<str>, 257 I: IntoIterator<Item = S>, 258 { 259 let mut builder = RegexSetBuilder(RegexOptions::default()); 260 for pat in patterns { 261 builder.0.pats.push(pat.as_ref().to_owned()); 262 } 263 builder 264 } 265 266 /// Consume the builder and compile the regular expressions into a set. 267 pub fn build(&self) -> Result<RegexSet, Error> { 268 ExecBuilder::new_options(self.0.clone()) 269 .only_utf8($only_utf8) 270 .build() 271 .map(RegexSet::from) 272 } 273 274 /// Set the value for the case insensitive (`i`) flag. 275 pub fn case_insensitive( 276 &mut self, 277 yes: bool, 278 ) -> &mut RegexSetBuilder { 279 self.0.case_insensitive = yes; 280 self 281 } 282 283 /// Set the value for the multi-line matching (`m`) flag. 284 pub fn multi_line( 285 &mut self, 286 yes: bool, 287 ) -> &mut RegexSetBuilder { 288 self.0.multi_line = yes; 289 self 290 } 291 292 /// Set the value for the any character (`s`) flag, where in `.` matches 293 /// anything when `s` is set and matches anything except for new line when 294 /// it is not set (the default). 295 /// 296 /// N.B. "matches anything" means "any byte" for `regex::bytes::RegexSet` 297 /// expressions and means "any Unicode scalar value" for `regex::RegexSet` 298 /// expressions. 299 pub fn dot_matches_new_line( 300 &mut self, 301 yes: bool, 302 ) -> &mut RegexSetBuilder { 303 self.0.dot_matches_new_line = yes; 304 self 305 } 306 307 /// Set the value for the greedy swap (`U`) flag. 308 pub fn swap_greed( 309 &mut self, 310 yes: bool, 311 ) -> &mut RegexSetBuilder { 312 self.0.swap_greed = yes; 313 self 314 } 315 316 /// Set the value for the ignore whitespace (`x`) flag. 317 pub fn ignore_whitespace( 318 &mut self, 319 yes: bool, 320 ) -> &mut RegexSetBuilder { 321 self.0.ignore_whitespace = yes; 322 self 323 } 324 325 /// Set the value for the Unicode (`u`) flag. 326 pub fn unicode(&mut self, yes: bool) -> &mut RegexSetBuilder { 327 self.0.unicode = yes; 328 self 329 } 330 331 /// Whether to support octal syntax or not. 332 /// 333 /// Octal syntax is a little-known way of uttering Unicode codepoints in 334 /// a regular expression. For example, `a`, `\x61`, `\u0061` and 335 /// `\141` are all equivalent regular expressions, where the last example 336 /// shows octal syntax. 337 /// 338 /// While supporting octal syntax isn't in and of itself a problem, it does 339 /// make good error messages harder. That is, in PCRE based regex engines, 340 /// syntax like `\0` invokes a backreference, which is explicitly 341 /// unsupported in Rust's regex engine. However, many users expect it to 342 /// be supported. Therefore, when octal support is disabled, the error 343 /// message will explicitly mention that backreferences aren't supported. 344 /// 345 /// Octal syntax is disabled by default. 346 pub fn octal(&mut self, yes: bool) -> &mut RegexSetBuilder { 347 self.0.octal = yes; 348 self 349 } 350 351 /// Set the approximate size limit of the compiled regular expression. 352 /// 353 /// This roughly corresponds to the number of bytes occupied by a single 354 /// compiled program. If the program exceeds this number, then a 355 /// compilation error is returned. 356 pub fn size_limit( 357 &mut self, 358 limit: usize, 359 ) -> &mut RegexSetBuilder { 360 self.0.size_limit = limit; 361 self 362 } 363 364 /// Set the approximate size of the cache used by the DFA. 365 /// 366 /// This roughly corresponds to the number of bytes that the DFA will 367 /// use while searching. 368 /// 369 /// Note that this is a *per thread* limit. There is no way to set a global 370 /// limit. In particular, if a regex is used from multiple threads 371 /// simultaneously, then each thread may use up to the number of bytes 372 /// specified here. 373 pub fn dfa_size_limit( 374 &mut self, 375 limit: usize, 376 ) -> &mut RegexSetBuilder { 377 self.0.dfa_size_limit = limit; 378 self 379 } 380 381 /// Set the nesting limit for this parser. 382 /// 383 /// The nesting limit controls how deep the abstract syntax tree is allowed 384 /// to be. If the AST exceeds the given limit (e.g., with too many nested 385 /// groups), then an error is returned by the parser. 386 /// 387 /// The purpose of this limit is to act as a heuristic to prevent stack 388 /// overflow for consumers that do structural induction on an `Ast` using 389 /// explicit recursion. While this crate never does this (instead using 390 /// constant stack space and moving the call stack to the heap), other 391 /// crates may. 392 /// 393 /// This limit is not checked until the entire Ast is parsed. Therefore, 394 /// if callers want to put a limit on the amount of heap space used, then 395 /// they should impose a limit on the length, in bytes, of the concrete 396 /// pattern string. In particular, this is viable since this parser 397 /// implementation will limit itself to heap space proportional to the 398 /// length of the pattern string. 399 /// 400 /// Note that a nest limit of `0` will return a nest limit error for most 401 /// patterns but not all. For example, a nest limit of `0` permits `a` but 402 /// not `ab`, since `ab` requires a concatenation, which results in a nest 403 /// depth of `1`. In general, a nest limit is not something that manifests 404 /// in an obvious way in the concrete syntax, therefore, it should not be 405 /// used in a granular way. 406 pub fn nest_limit( 407 &mut self, 408 limit: u32, 409 ) -> &mut RegexSetBuilder { 410 self.0.nest_limit = limit; 411 self 412 } 413 } 414 } 415 }; 416 } 417 418 define_set_builder!(set_bytes, bytes, false); 419 define_set_builder!(set_unicode, unicode, true); 420