1 // Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
4 //
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
10 
11 use std::cmp::{max, min};
12 use std::u8;
13 
14 use unicode::regex::UNICODE_CLASSES;
15 
16 use {
17     Expr, Repeater, CharClass, ClassRange,
18     CaptureIndex, CaptureName,
19     Error, ErrorKind, Result,
20 };
21 
22 /// Parser state.
23 ///
24 /// Keeps the entire input in memory and maintains a cursor (char offset).
25 ///
26 /// It also keeps an expression stack, which is responsible for managing
27 /// grouped expressions and flag state.
28 #[derive(Debug)]
29 pub struct Parser {
30     chars: Vec<char>,
31     chari: usize,
32     stack: Vec<Build>,
33     caps: usize,
34     names: Vec<String>, // to check for duplicates
35     flags: Flags,
36 }
37 
38 /// Flag state used in the parser.
39 #[derive(Clone, Copy, Debug)]
40 pub struct Flags {
41     /// i
42     pub casei: bool,
43     /// m
44     pub multi: bool,
45     /// s
46     pub dotnl: bool,
47     /// U
48     pub swap_greed: bool,
49     /// x
50     pub ignore_space: bool,
51     /// u
52     pub unicode: bool,
53     /// Not actually a flag, but when disabled, every regex that may not match
54     /// UTF-8 exclusively will cause the parser to return an error.
55     pub allow_bytes: bool,
56 }
57 
58 impl Default for Flags {
default() -> Self59     fn default() -> Self {
60         Flags {
61             casei: false,
62             multi: false,
63             dotnl: false,
64             swap_greed: false,
65             ignore_space: false,
66             unicode: true,
67             allow_bytes: false,
68         }
69     }
70 }
71 
72 /// An ephemeral type for representing the expression stack.
73 ///
74 /// Everything on the stack is either a regular expression or a marker
75 /// indicating the opening of a group (possibly non-capturing). The opening
76 /// of a group copies the current flag state, which is reset on the parser
77 /// state once the group closes.
78 #[derive(Debug)]
79 enum Build {
80     Expr(Expr),
81     LeftParen {
82         i: CaptureIndex,
83         name: CaptureName,
84         chari: usize,
85         old_flags: Flags,
86     },
87 }
88 
89 /// A type for representing the elements of a bracket stack used for parsing
90 /// character classes.
91 ///
92 /// This is for parsing nested character classes without recursion.
93 #[derive(Debug)]
94 enum Bracket {
95     /// The opening of a character class (possibly negated)
96     LeftBracket {
97         negated: bool,
98     },
99     /// A set of characters within a character class, e.g., `a-z`
100     Set(CharClass),
101     /// An intersection operator (`&&`)
102     Intersection,
103 }
104 
105 // Primary expression parsing routines.
106 impl Parser {
parse(s: &str, flags: Flags) -> Result<Expr>107     pub fn parse(s: &str, flags: Flags) -> Result<Expr> {
108         Parser {
109             chars: s.chars().collect(),
110             chari: 0,
111             stack: vec![],
112             caps: 0,
113             names: vec![],
114             flags: flags,
115         }.parse_expr()
116     }
117 
118     // Top-level expression parser.
119     //
120     // Starts at the beginning of the input and consumes until either the end
121     // of input or an error.
parse_expr(mut self) -> Result<Expr>122     fn parse_expr(mut self) -> Result<Expr> {
123         loop {
124             self.ignore_space();
125             if self.eof() {
126                 break;
127             }
128             let build_expr = match self.cur() {
129                 '\\' => try!(self.parse_escape()),
130                 '|' => { let e = try!(self.alternate()); self.bump(); e }
131                 '?' => try!(self.parse_simple_repeat(Repeater::ZeroOrOne)),
132                 '*' => try!(self.parse_simple_repeat(Repeater::ZeroOrMore)),
133                 '+' => try!(self.parse_simple_repeat(Repeater::OneOrMore)),
134                 '{' => try!(self.parse_counted_repeat()),
135                 '[' => try!(self.parse_class()),
136                 '^' => {
137                     if self.flags.multi {
138                         self.parse_one(Expr::StartLine)
139                     } else {
140                         self.parse_one(Expr::StartText)
141                     }
142                 }
143                 '$' => {
144                     if self.flags.multi {
145                         self.parse_one(Expr::EndLine)
146                     } else {
147                         self.parse_one(Expr::EndText)
148                     }
149                 }
150                 '.' => {
151                     if self.flags.dotnl {
152                         if self.flags.unicode {
153                             self.parse_one(Expr::AnyChar)
154                         } else {
155                             if !self.flags.allow_bytes {
156                                 return Err(self.err(ErrorKind::InvalidUtf8));
157                             }
158                             self.parse_one(Expr::AnyByte)
159                         }
160                     } else {
161                         if self.flags.unicode {
162                             self.parse_one(Expr::AnyCharNoNL)
163                         } else {
164                             if !self.flags.allow_bytes {
165                                 return Err(self.err(ErrorKind::InvalidUtf8));
166                             }
167                             self.parse_one(Expr::AnyByteNoNL)
168                         }
169                     }
170                 }
171                 '(' => try!(self.parse_group()),
172                 ')' => {
173                     let (old_flags, e) = try!(self.close_paren());
174                     self.bump();
175                     self.flags = old_flags;
176                     e
177                 }
178                 _ => {
179                     let c = self.bump();
180                     try!(self.lit(c))
181                 }
182             };
183             if !build_expr.is_empty() {
184                 self.stack.push(build_expr);
185             }
186         }
187         self.finish_concat()
188     }
189 
190     // Parses an escape sequence, e.g., \Ax
191     //
192     // Start: `\`
193     // End:   `x`
parse_escape(&mut self) -> Result<Build>194     fn parse_escape(&mut self) -> Result<Build> {
195         self.bump();
196         if self.eof() {
197             return Err(self.err(ErrorKind::UnexpectedEscapeEof));
198         }
199         let c = self.cur();
200         if is_punct(c) || (self.flags.ignore_space && c.is_whitespace()) {
201             let c = self.bump();
202             return Ok(try!(self.lit(c)));
203         }
204         match c {
205             'a' => { self.bump(); Ok(try!(self.lit('\x07'))) }
206             'f' => { self.bump(); Ok(try!(self.lit('\x0C'))) }
207             't' => { self.bump(); Ok(try!(self.lit('\t'))) }
208             'n' => { self.bump(); Ok(try!(self.lit('\n'))) }
209             'r' => { self.bump(); Ok(try!(self.lit('\r'))) }
210             'v' => { self.bump(); Ok(try!(self.lit('\x0B'))) }
211             'A' => { self.bump(); Ok(Build::Expr(Expr::StartText)) }
212             'z' => { self.bump(); Ok(Build::Expr(Expr::EndText)) }
213             'b' => {
214                 self.bump();
215                 Ok(Build::Expr(if self.flags.unicode {
216                     Expr::WordBoundary
217                 } else {
218                     Expr::WordBoundaryAscii
219                 }))
220             }
221             'B' => {
222                 self.bump();
223                 Ok(Build::Expr(if self.flags.unicode {
224                     Expr::NotWordBoundary
225                 } else {
226                     Expr::NotWordBoundaryAscii
227                 }))
228             }
229             '0'|'1'|'2'|'3'|'4'|'5'|'6'|'7' => self.parse_octal(),
230             'x' => { self.bump(); self.parse_hex() }
231             'p'|'P' => {
232                 self.bump();
233                 self.parse_unicode_class(c == 'P')
234                     .map(|cls| Build::Expr(Expr::Class(cls)))
235             }
236             'd'|'s'|'w'|'D'|'S'|'W' => {
237                 self.bump();
238                 Ok(Build::Expr(Expr::Class(self.parse_perl_class(c))))
239             }
240             c => Err(self.err(ErrorKind::UnrecognizedEscape(c))),
241         }
242     }
243 
244     // Parses a group, e.g., `(abc)`.
245     //
246     // Start: `(`
247     // End:   `a`
248     //
249     // A more interesting example, `(?P<foo>abc)`.
250     //
251     // Start: `(`
252     // End:   `a`
parse_group(&mut self) -> Result<Build>253     fn parse_group(&mut self) -> Result<Build> {
254         let chari = self.chari;
255         let mut name: CaptureName = None;
256         self.bump();
257         self.ignore_space();
258         if self.bump_if("?P<") {
259             let n = try!(self.parse_group_name());
260             if self.names.iter().any(|n2| n2 == &n) {
261                 return Err(self.err(ErrorKind::DuplicateCaptureName(n)));
262             }
263             self.names.push(n.clone());
264             name = Some(n);
265         } else if self.bump_if("?") {
266             // This can never be capturing. It's either setting flags for
267             // the current group, or it's opening a non-capturing group or
268             // it's opening a group with a specific set of flags (which is
269             // also non-capturing).
270             // Anything else is an error.
271             return self.parse_group_flags(chari);
272         }
273         self.caps = checkadd(self.caps, 1);
274         Ok(Build::LeftParen {
275             i: Some(self.caps),
276             name: name,
277             chari: chari,
278             old_flags: self.flags, // no flags changed if we're here
279         })
280     }
281 
282     // Parses flags (inline or grouped), e.g., `(?s-i:abc)`.
283     //
284     // Start: `s`
285     // End:   `a`
286     //
287     // Another example, `(?s-i)a`.
288     //
289     // Start: `s`
290     // End:   `a`
parse_group_flags(&mut self, opening_chari: usize) -> Result<Build>291     fn parse_group_flags(&mut self, opening_chari: usize) -> Result<Build> {
292         let old_flags = self.flags;
293         let mut sign = true;
294         let mut saw_flag = false;
295         loop {
296             if self.eof() {
297                 // e.g., (?i
298                 return Err(self.err(ErrorKind::UnexpectedFlagEof));
299             }
300             match self.cur() {
301                 'i' => { self.flags.casei = sign; saw_flag = true }
302                 'm' => { self.flags.multi = sign; saw_flag = true }
303                 's' => { self.flags.dotnl = sign; saw_flag = true }
304                 'U' => { self.flags.swap_greed = sign; saw_flag = true }
305                 'x' => { self.flags.ignore_space = sign; saw_flag = true }
306                 'u' => { self.flags.unicode = sign; saw_flag = true }
307                 '-' => {
308                     if !sign {
309                         // e.g., (?-i-s)
310                         return Err(self.err(ErrorKind::DoubleFlagNegation));
311                     }
312                     sign = false;
313                     saw_flag = false;
314                 }
315                 ')' => {
316                     if !saw_flag {
317                         // e.g., (?)
318                         return Err(self.err(ErrorKind::EmptyFlagNegation));
319                     }
320                     // At this point, we're just changing the flags inside
321                     // the current group, which means the old flags have
322                     // been saved elsewhere. Our modifications in place are
323                     // okey dokey!
324                     //
325                     // This particular flag expression only has a stateful
326                     // impact on a regex's AST, so nothing gets explicitly
327                     // added.
328                     self.bump();
329                     return Ok(Build::Expr(Expr::Empty));
330                 }
331                 ':' => {
332                     if !sign && !saw_flag {
333                         // e.g., (?i-:a)
334                         // Note that if there's no negation, it's OK not
335                         // to see flag, because you end up with a regular
336                         // non-capturing group: `(?:a)`.
337                         return Err(self.err(ErrorKind::EmptyFlagNegation));
338                     }
339                     self.bump();
340                     return Ok(Build::LeftParen {
341                         i: None,
342                         name: None,
343                         chari: opening_chari,
344                         old_flags: old_flags,
345                     });
346                 }
347                 // e.g., (?z:a)
348                 c => return Err(self.err(ErrorKind::UnrecognizedFlag(c))),
349             }
350             self.bump();
351         }
352     }
353 
354     // Parses a group name, e.g., `foo` in `(?P<foo>abc)`.
355     //
356     // Start: `f`
357     // End:   `a`
parse_group_name(&mut self) -> Result<String>358     fn parse_group_name(&mut self) -> Result<String> {
359         let mut name = String::new();
360         while !self.eof() && !self.peek_is('>') {
361             name.push(self.bump());
362         }
363         if self.eof() {
364             // e.g., (?P<a
365             return Err(self.err(ErrorKind::UnclosedCaptureName(name)));
366         }
367         let all_valid = name.chars().all(is_valid_capture_char);
368         match name.chars().next() {
369             // e.g., (?P<>a)
370             None => Err(self.err(ErrorKind::EmptyCaptureName)),
371             Some(c) if (c >= '0' && c <= '9') || !all_valid => {
372                 // e.g., (?P<a#>x)
373                 // e.g., (?P<1a>x)
374                 Err(self.err(ErrorKind::InvalidCaptureName(name)))
375             }
376             _ => {
377                 self.bump(); // for `>`
378                 Ok(name)
379             }
380         }
381     }
382 
383     // Parses a counted repeition operator, e.g., `a{2,4}?z`.
384     //
385     // Start: `{`
386     // End:   `z`
parse_counted_repeat(&mut self) -> Result<Build>387     fn parse_counted_repeat(&mut self) -> Result<Build> {
388         let e = try!(self.pop(ErrorKind::RepeaterExpectsExpr)); // e.g., ({5}
389         if !e.can_repeat() {
390             // e.g., a*{5}
391             return Err(self.err(ErrorKind::RepeaterUnexpectedExpr(e)));
392         }
393         self.bump();
394         self.ignore_space();
395         let min = try!(self.parse_decimal());
396         let mut max_opt = Some(min);
397         self.ignore_space();
398         if self.bump_if(',') {
399             self.ignore_space();
400             if self.peek_is('}') {
401                 max_opt = None;
402             } else {
403                 let max = try!(self.parse_decimal());
404                 if min > max {
405                     // e.g., a{2,1}
406                     return Err(self.err(ErrorKind::InvalidRepeatRange {
407                         min: min,
408                         max: max,
409                     }));
410                 }
411                 max_opt = Some(max);
412             }
413         }
414         self.ignore_space();
415         if !self.bump_if('}') {
416             Err(self.err(ErrorKind::UnclosedRepeat))
417         } else {
418             Ok(Build::Expr(Expr::Repeat {
419                 e: Box::new(e),
420                 r: Repeater::Range { min: min, max: max_opt },
421                 greedy: !self.bump_if('?') ^ self.flags.swap_greed,
422             }))
423         }
424     }
425 
426     // Parses a simple repetition operator, e.g., `a+?z`.
427     //
428     // Start: `+`
429     // End:   `z`
430     //
431     // N.B. "simple" in this context means "not min/max repetition",
432     // e.g., `a{1,2}`.
parse_simple_repeat(&mut self, rep: Repeater) -> Result<Build>433     fn parse_simple_repeat(&mut self, rep: Repeater) -> Result<Build> {
434         let e = try!(self.pop(ErrorKind::RepeaterExpectsExpr)); // e.g., (*
435         if !e.can_repeat() {
436             // e.g., a**
437             return Err(self.err(ErrorKind::RepeaterUnexpectedExpr(e)));
438         }
439         self.bump();
440         Ok(Build::Expr(Expr::Repeat {
441             e: Box::new(e),
442             r: rep,
443             greedy: !self.bump_if('?') ^ self.flags.swap_greed,
444         }))
445     }
446 
447     // Parses a decimal number until the given character, e.g., `a{123,456}`.
448     //
449     // Start: `1`
450     // End:   `,` (where `until == ','`)
parse_decimal(&mut self) -> Result<u32>451     fn parse_decimal(&mut self) -> Result<u32> {
452         match self.bump_get(|c| is_ascii_word(c) || c.is_whitespace()) {
453             // e.g., a{}
454             None => Err(self.err(ErrorKind::MissingBase10)),
455             Some(n) => {
456                 // e.g., a{xyz
457                 // e.g., a{9999999999}
458                 let n = n.trim();
459                 u32::from_str_radix(n, 10)
460                     .map_err(|_| self.err(ErrorKind::InvalidBase10(n.into())))
461             }
462         }
463     }
464 
465     // Parses an octal number, up to 3 digits, e.g., `a\123b`
466     //
467     // Start: `1`
468     // End:   `b`
parse_octal(&mut self) -> Result<Build>469     fn parse_octal(&mut self) -> Result<Build> {
470         use std::char;
471         let mut i = 0; // counter for limiting octal to 3 digits.
472         let n = self.bump_get(|c| { i += 1; i <= 3 && c >= '0' && c <= '7' })
473                     .expect("octal string"); // guaranteed at least 1 digit
474         // I think both of the following unwraps are impossible to fail.
475         // We limit it to a three digit octal number, which maxes out at
476         // `0777` or `511` in decimal. Since all digits are in `0...7`, we'll
477         // always have a valid `u32` number. Moreover, since all numbers in
478         // the range `0...511` are valid Unicode scalar values, it will always
479         // be a valid `char`.
480         //
481         // Hence, we `unwrap` with reckless abandon.
482         let n = u32::from_str_radix(&n, 8).ok().expect("valid octal number");
483         if !self.flags.unicode {
484             return Ok(try!(self.u32_to_one_byte(n)));
485         }
486         let c = char::from_u32(n).expect("Unicode scalar value");
487         Ok(try!(self.lit(c)))
488     }
489 
490     // Parses a hex number, e.g., `a\x5ab`.
491     //
492     // Start: `5`
493     // End:   `b`
494     //
495     // And also, `a\x{2603}b`.
496     //
497     // Start: `{`
498     // End:   `b`
parse_hex(&mut self) -> Result<Build>499     fn parse_hex(&mut self) -> Result<Build> {
500         self.ignore_space();
501         if self.bump_if('{') {
502             self.parse_hex_many_digits()
503         } else {
504             self.parse_hex_two_digits()
505         }
506     }
507 
508     // Parses a many-digit hex number, e.g., `a\x{2603}b`.
509     //
510     // Start: `2`
511     // End:   `b`
parse_hex_many_digits(&mut self) -> Result<Build>512     fn parse_hex_many_digits(&mut self) -> Result<Build> {
513         use std::char;
514 
515         self.ignore_space();
516         let s = self.bump_get(is_ascii_word).unwrap_or("".into());
517         let n = try!(u32::from_str_radix(&s, 16)
518                          .map_err(|_| self.err(ErrorKind::InvalidBase16(s))));
519         self.ignore_space();
520         if !self.bump_if('}') {
521             // e.g., a\x{d
522             return Err(self.err(ErrorKind::UnclosedHex));
523         }
524         if !self.flags.unicode {
525             return Ok(try!(self.u32_to_one_byte(n)));
526         }
527         let c = try!(char::from_u32(n)
528                           .ok_or(self.err(ErrorKind::InvalidScalarValue(n))));
529         Ok(try!(self.lit(c)))
530     }
531 
532     // Parses a two-digit hex number, e.g., `a\x5ab`.
533     //
534     // Start: `5`
535     // End:   `b`
parse_hex_two_digits(&mut self) -> Result<Build>536     fn parse_hex_two_digits(&mut self) -> Result<Build> {
537         use std::char;
538 
539         let mut i = 0;
540         let s = self.bump_get(|_| { i += 1; i <= 2 }).unwrap_or("".into());
541         if s.len() < 2 {
542             // e.g., a\x
543             // e.g., a\xf
544             return Err(self.err(ErrorKind::UnexpectedTwoDigitHexEof));
545         }
546         let n = try!(u32::from_str_radix(&s, 16)
547                          .map_err(|_| self.err(ErrorKind::InvalidBase16(s))));
548         if !self.flags.unicode {
549             return Ok(try!(self.u32_to_one_byte(n)));
550         }
551         let c = char::from_u32(n).expect("Unicode scalar value");
552         Ok(try!(self.lit(c)))
553     }
554 
555     // Parses a character class, e.g., `[^a-zA-Z0-9]+`.
556     //
557     // If the Unicode flag is enabled, the class is returned as a `CharClass`,
558     // otherwise it is converted to a `ByteClass`.
559     //
560     // Start: `[`
561     // End:   `+`
parse_class(&mut self) -> Result<Build>562     fn parse_class(&mut self) -> Result<Build> {
563         let class = try!(self.parse_class_as_chars());
564         Ok(Build::Expr(if self.flags.unicode {
565             Expr::Class(class)
566         } else {
567             let byte_class = class.to_byte_class();
568 
569             // If `class` was only non-empty due to multibyte characters, the
570             // corresponding byte class will now be empty.
571             //
572             // See https://github.com/rust-lang/regex/issues/303
573             if byte_class.is_empty() {
574                 // e.g., (?-u)[^\x00-\xFF]
575                 return Err(self.err(ErrorKind::EmptyClass));
576             }
577 
578             Expr::ClassBytes(byte_class)
579         }))
580     }
581 
582     // Parses a character class as a `CharClass`, e.g., `[^a-zA-Z0-9]+`.
583     //
584     // Start: `[`
585     // End:   `+`
parse_class_as_chars(&mut self) -> Result<CharClass>586     fn parse_class_as_chars(&mut self) -> Result<CharClass> {
587         let mut bracket_stack = vec![];
588         bracket_stack.extend(self.parse_open_bracket());
589         loop {
590             self.ignore_space();
591             if self.eof() {
592                 // e.g., [a
593                 return Err(self.err(ErrorKind::UnexpectedClassEof));
594             }
595             match self.cur() {
596                 '[' => {
597                     if let Some(class) = self.maybe_parse_ascii() {
598                         // e.g. `[:alnum:]`
599                         bracket_stack.push(Bracket::Set(class));
600                     } else {
601                         // nested set, e.g. `[c-d]` in `[a-b[c-d]]`
602                         bracket_stack.extend(self.parse_open_bracket());
603                     }
604                 }
605                 ']' => {
606                     self.bump();
607                     let class = try!(self.close_bracket(&mut bracket_stack));
608                     if bracket_stack.is_empty() {
609                         // That was the outermost class, so stop now
610                         return Ok(class);
611                     }
612                     bracket_stack.push(Bracket::Set(class));
613                 }
614                 '\\' => {
615                     let class = try!(self.parse_class_escape());
616                     bracket_stack.push(Bracket::Set(class));
617                 }
618                 '&' if self.peek_is("&&") => {
619                     self.bump();
620                     self.bump();
621                     bracket_stack.push(Bracket::Intersection);
622                 }
623                 start => {
624                     if !self.flags.unicode {
625                         let _ = try!(self.codepoint_to_one_byte(start));
626                     }
627                     self.bump();
628                     match start {
629                         '~'|'-' => {
630                             // Only report an error if we see ~~ or --.
631                             if self.peek_is(start) {
632                                 return Err(self.err(
633                                     ErrorKind::UnsupportedClassChar(start)));
634                             }
635                         }
636                         _ => {}
637                     }
638                     let class = try!(self.parse_class_range(start));
639                     bracket_stack.push(Bracket::Set(class));
640                 }
641             }
642         }
643     }
644 
645     // Parses the start of a character class or a nested character class.
646     // That includes negation using `^` and unescaped `-` and `]` allowed at
647     // the start of the class.
648     //
649     // e.g., `[^a]` or `[-a]` or `[]a]`
650     //
651     // Start: `[`
652     // End:   `a`
parse_open_bracket(&mut self) -> Vec<Bracket>653     fn parse_open_bracket(&mut self) -> Vec<Bracket> {
654         self.bump();
655         self.ignore_space();
656         let negated = self.bump_if('^');
657         self.ignore_space();
658 
659         let mut class = CharClass::empty();
660         while self.bump_if('-') {
661             class.ranges.push(ClassRange::one('-'));
662             self.ignore_space();
663         }
664         if class.is_empty() {
665             if self.bump_if(']') {
666                 class.ranges.push(ClassRange::one(']'));
667                 self.ignore_space();
668             }
669         }
670 
671         let bracket = Bracket::LeftBracket { negated: negated };
672         if class.is_empty() {
673             vec![bracket]
674         } else {
675             vec![bracket, Bracket::Set(class)]
676         }
677     }
678 
679     // Parses an escape in a character class.
680     //
681     // This is a helper for `parse_class`. Instead of returning an `Ok` value,
682     // it either mutates the char class or returns an error.
683     //
684     // e.g., `\wx`
685     //
686     // Start: `\`
687     // End:   `x`
parse_class_escape(&mut self) -> Result<CharClass>688     fn parse_class_escape(&mut self) -> Result<CharClass> {
689         match try!(self.parse_escape()) {
690             Build::Expr(Expr::Class(class)) => {
691                 Ok(class)
692             }
693             Build::Expr(Expr::ClassBytes(class2)) => {
694                 let mut class = CharClass::empty();
695                 for byte_range in class2 {
696                     let s = byte_range.start as char;
697                     let e = byte_range.end as char;
698                     class.ranges.push(ClassRange::new(s, e));
699                 }
700                 Ok(class)
701             }
702             Build::Expr(Expr::Literal { chars, .. }) => {
703                 self.parse_class_range(chars[0])
704             }
705             Build::Expr(Expr::LiteralBytes { bytes, .. }) => {
706                 let start = bytes[0] as char;
707                 self.parse_class_range(start)
708             }
709             Build::Expr(e) => {
710                 let err = ErrorKind::InvalidClassEscape(e);
711                 Err(self.err(err))
712             }
713             // Because `parse_escape` can never return `LeftParen`.
714             _ => unreachable!(),
715         }
716     }
717 
718     // Parses a single range in a character class.
719     //
720     // e.g., `[a-z]`
721     //
722     // Start: `-` (with start == `a`)
723     // End:   `]`
parse_class_range(&mut self, start: char) -> Result<CharClass>724     fn parse_class_range(&mut self, start: char) -> Result<CharClass> {
725         self.ignore_space();
726         if !self.bump_if('-') {
727             // Not a range, so just return a singleton range.
728             return Ok(CharClass::new(vec![ClassRange::one(start)]));
729         }
730         self.ignore_space();
731         if self.eof() {
732             // e.g., [a-
733             return Err(self.err(ErrorKind::UnexpectedClassEof));
734         }
735         if self.peek_is(']') {
736             // This is the end of the class, so we permit use of `-` as a
737             // regular char (just like we do in the beginning).
738             return Ok(CharClass::new(vec![ClassRange::one(start), ClassRange::one('-')]));
739         }
740 
741         // We have a real range. Just need to check to parse literal and
742         // make sure it's a valid range.
743         let end = match self.cur() {
744             '\\' => match try!(self.parse_escape()) {
745                 Build::Expr(Expr::Literal { chars, .. }) => {
746                     chars[0]
747                 }
748                 Build::Expr(Expr::LiteralBytes { bytes, .. }) => {
749                     bytes[0] as char
750                 }
751                 Build::Expr(e) => {
752                     return Err(self.err(ErrorKind::InvalidClassEscape(e)));
753                 }
754                 // Because `parse_escape` can never return `LeftParen`.
755                 _ => unreachable!(),
756             },
757             c => {
758                 self.bump();
759                 if c == '-' {
760                     return Err(self.err(ErrorKind::UnsupportedClassChar('-')));
761                 }
762                 if !self.flags.unicode {
763                     let _ = try!(self.codepoint_to_one_byte(c));
764                 }
765                 c
766             }
767         };
768         if end < start {
769             // e.g., [z-a]
770             return Err(self.err(ErrorKind::InvalidClassRange {
771                 start: start,
772                 end: end,
773             }));
774         }
775         Ok(CharClass::new(vec![ClassRange::new(start, end)]))
776     }
777 
778     // Parses an ASCII class, e.g., `[:alnum:]+`.
779     //
780     // Start: `[`
781     // End:   `+`
782     //
783     // Also supports negation, e.g., `[:^alnum:]`.
784     //
785     // This parsing routine is distinct from the others in that it doesn't
786     // actually report any errors. Namely, if it fails, then the parser should
787     // fall back to parsing a regular class.
788     //
789     // This method will only make progress in the parser if it succeeds.
790     // Otherwise, the input remains where it started.
maybe_parse_ascii(&mut self) -> Option<CharClass>791     fn maybe_parse_ascii(&mut self) -> Option<CharClass> {
792         fn parse(p: &mut Parser) -> Option<CharClass> {
793             p.bump(); // the `[`
794             if !p.bump_if(':') { return None; }
795             let negate = p.bump_if('^');
796             let name = match p.bump_get(|c| c != ':') {
797                 None => return None,
798                 Some(name) => name,
799             };
800             if !p.bump_if(":]") { return None; }
801             ascii_class(&name).map(|cls| p.class_transform(negate, cls))
802         }
803         let start = self.chari;
804         match parse(self) {
805             None => { self.chari = start; None }
806             result => result,
807         }
808     }
809 
810     // Parses a Uncode class name, e.g., `a\pLb`.
811     //
812     // Start: `L`
813     // End:   `b`
814     //
815     // And also, `a\p{Greek}b`.
816     //
817     // Start: `{`
818     // End:   `b`
819     //
820     // `negate` is true when the class name is used with `\P`.
parse_unicode_class(&mut self, neg: bool) -> Result<CharClass>821     fn parse_unicode_class(&mut self, neg: bool) -> Result<CharClass> {
822         self.ignore_space();
823         let name =
824             if self.bump_if('{') {
825                 self.ignore_space();
826                 let n = self.bump_get(is_ascii_word).unwrap_or("".into());
827                 self.ignore_space();
828                 if n.is_empty() || !self.bump_if('}') {
829                     // e.g., \p{Greek
830                     return Err(self.err(ErrorKind::UnclosedUnicodeName));
831                 }
832                 n
833             } else {
834                 if self.eof() {
835                     // e.g., \p
836                     return Err(self.err(ErrorKind::UnexpectedEscapeEof));
837                 }
838                 self.bump().to_string()
839             };
840         match unicode_class(&name) {
841             None => Err(self.err(ErrorKind::UnrecognizedUnicodeClass(name))),
842             Some(cls) => {
843                 if self.flags.unicode {
844                     Ok(self.class_transform(neg, cls))
845                 } else {
846                     Err(self.err(ErrorKind::UnicodeNotAllowed))
847                 }
848             }
849         }
850     }
851 
852     // Parses a perl character class with Unicode support.
853     //
854     // `name` must be one of d, s, w, D, S, W. If not, this function panics.
855     //
856     // No parser state is changed.
parse_perl_class(&mut self, name: char) -> CharClass857     fn parse_perl_class(&mut self, name: char) -> CharClass {
858         use unicode::regex::{PERLD, PERLS, PERLW};
859         let (cls, negate) = match (self.flags.unicode, name) {
860             (true, 'd') => (raw_class_to_expr(PERLD), false),
861             (true, 'D') => (raw_class_to_expr(PERLD), true),
862             (true, 's') => (raw_class_to_expr(PERLS), false),
863             (true, 'S') => (raw_class_to_expr(PERLS), true),
864             (true, 'w') => (raw_class_to_expr(PERLW), false),
865             (true, 'W') => (raw_class_to_expr(PERLW), true),
866             (false, 'd') => (ascii_class("digit").unwrap(), false),
867             (false, 'D') => (ascii_class("digit").unwrap(), true),
868             (false, 's') => (ascii_class("space").unwrap(), false),
869             (false, 'S') => (ascii_class("space").unwrap(), true),
870             (false, 'w') => (ascii_class("word").unwrap(), false),
871             (false, 'W') => (ascii_class("word").unwrap(), true),
872             _ => unreachable!(),
873         };
874         self.class_transform(negate, cls)
875     }
876 
877     // Always bump to the next input and return the given expression as a
878     // `Build`.
879     //
880     // This is mostly for convenience when the surrounding context implies
881     // that the next character corresponds to the given expression.
parse_one(&mut self, e: Expr) -> Build882     fn parse_one(&mut self, e: Expr) -> Build {
883         self.bump();
884         Build::Expr(e)
885     }
886 }
887 
888 // Auxiliary helper methods.
889 impl Parser {
chars(&self) -> Chars890     fn chars(&self) -> Chars {
891         Chars::new(&self.chars[self.chari..])
892     }
893 
ignore_space(&mut self)894     fn ignore_space(&mut self) {
895         if !self.flags.ignore_space {
896             return;
897         }
898         while !self.eof() {
899             match self.cur() {
900                 '#' => {
901                     self.bump();
902                     while !self.eof() {
903                         match self.bump() {
904                             '\n' => break,
905                             _ => continue,
906                         }
907                     }
908                 },
909                 c => if !c.is_whitespace() {
910                     return;
911                 } else {
912                     self.bump();
913                 }
914             }
915         }
916     }
917 
bump(&mut self) -> char918     fn bump(&mut self) -> char {
919         let c = self.cur();
920         self.chari = checkadd(self.chari, self.chars().next_count());
921         c
922     }
923 
cur(&self) -> char924     fn cur(&self) -> char { self.chars().next().unwrap() }
925 
eof(&self) -> bool926     fn eof(&self) -> bool { self.chars().next().is_none() }
927 
bump_get<B: Bumpable>(&mut self, s: B) -> Option<String>928     fn bump_get<B: Bumpable>(&mut self, s: B) -> Option<String> {
929         let n = s.match_end(self);
930         if n == 0 {
931             None
932         } else {
933             let end = checkadd(self.chari, n);
934             let s = self.chars[self.chari..end]
935                         .iter().cloned().collect::<String>();
936             self.chari = end;
937             Some(s)
938         }
939     }
940 
bump_if<B: Bumpable>(&mut self, s: B) -> bool941     fn bump_if<B: Bumpable>(&mut self, s: B) -> bool {
942         let n = s.match_end(self);
943         if n == 0 {
944             false
945         } else {
946             self.chari = checkadd(self.chari, n);
947             true
948         }
949     }
950 
peek_is<B: Bumpable>(&self, s: B) -> bool951     fn peek_is<B: Bumpable>(&self, s: B) -> bool {
952         s.match_end(self) > 0
953     }
954 
err(&self, kind: ErrorKind) -> Error955     fn err(&self, kind: ErrorKind) -> Error {
956         self.errat(self.chari, kind)
957     }
958 
errat(&self, pos: usize, kind: ErrorKind) -> Error959     fn errat(&self, pos: usize, kind: ErrorKind) -> Error {
960         Error { pos: pos, surround: self.windowat(pos), kind: kind }
961     }
962 
windowat(&self, pos: usize) -> String963     fn windowat(&self, pos: usize) -> String {
964         let s = max(5, pos) - 5;
965         let e = min(self.chars.len(), checkadd(pos, 5));
966         self.chars[s..e].iter().cloned().collect()
967     }
968 
pop(&mut self, expected: ErrorKind) -> Result<Expr>969     fn pop(&mut self, expected: ErrorKind) -> Result<Expr> {
970         match self.stack.pop() {
971             None | Some(Build::LeftParen{..}) => Err(self.err(expected)),
972             Some(Build::Expr(e)) => Ok(e),
973         }
974     }
975 
976     // If the current context calls for case insensitivity, then apply
977     // case folding. Similarly, if `negate` is `true`, then negate the
978     // class. (Negation always proceeds case folding.)
class_transform(&self, negate: bool, mut cls: CharClass) -> CharClass979     fn class_transform(&self, negate: bool, mut cls: CharClass) -> CharClass {
980         if self.flags.casei {
981             cls = cls.case_fold();
982         }
983         if negate {
984             cls = cls.negate();
985         }
986         cls
987     }
988 
989     // Translates a Unicode codepoint into a single UTF-8 byte, and returns an
990     // error if it's not possible.
991     //
992     // This will panic if self.flags.unicode == true.
codepoint_to_one_byte(&self, c: char) -> Result<u8>993     fn codepoint_to_one_byte(&self, c: char) -> Result<u8> {
994         assert!(!self.flags.unicode);
995         let bytes = c.to_string().as_bytes().to_owned();
996         if bytes.len() > 1 {
997             return Err(self.err(ErrorKind::UnicodeNotAllowed));
998         }
999         Ok(bytes[0])
1000     }
1001 
1002     // Creates a new byte literal from a single byte.
1003     //
1004     // If the given number can't fit into a single byte, then it is assumed
1005     // to be a Unicode codepoint and an error is returned.
1006     //
1007     // This should only be called when the bytes flag is enabled.
u32_to_one_byte(&self, b: u32) -> Result<Build>1008     fn u32_to_one_byte(&self, b: u32) -> Result<Build> {
1009         assert!(!self.flags.unicode);
1010         if b > u8::MAX as u32 {
1011             Err(self.err(ErrorKind::UnicodeNotAllowed))
1012         } else if !self.flags.allow_bytes && b > 0x7F {
1013             Err(self.err(ErrorKind::InvalidUtf8))
1014         } else {
1015             Ok(Build::Expr(Expr::LiteralBytes {
1016                 bytes: vec![b as u8],
1017                 casei: self.flags.casei,
1018             }))
1019         }
1020     }
1021 
1022     // Creates a new literal expr from a Unicode codepoint.
1023     //
1024     // Creates a byte literal if the `bytes` flag is set.
lit(&self, c: char) -> Result<Build>1025     fn lit(&self, c: char) -> Result<Build> {
1026         Ok(Build::Expr(if self.flags.unicode {
1027             Expr::Literal {
1028                 chars: vec![c],
1029                 casei: self.flags.casei,
1030             }
1031         } else {
1032             Expr::LiteralBytes {
1033                 bytes: vec![try!(self.codepoint_to_one_byte(c))],
1034                 casei: self.flags.casei,
1035             }
1036         }))
1037     }
1038 }
1039 
1040 struct Chars<'a> {
1041     chars: &'a [char],
1042     cur: usize,
1043 }
1044 
1045 impl<'a> Iterator for Chars<'a> {
1046     type Item = char;
next(&mut self) -> Option<char>1047     fn next(&mut self) -> Option<char> {
1048         let x = self.c();
1049         self.advance();
1050         return x;
1051     }
1052 }
1053 
1054 impl<'a> Chars<'a> {
new(chars: &[char]) -> Chars1055     fn new(chars: &[char]) -> Chars {
1056         Chars {
1057             chars: chars,
1058             cur: 0,
1059         }
1060     }
1061 
c(&self) -> Option<char>1062     fn c(&self) -> Option<char> {
1063         self.chars.get(self.cur).map(|&c| c)
1064     }
1065 
advance(&mut self)1066     fn advance(&mut self) {
1067         self.cur = checkadd(self.cur, 1);
1068     }
1069 
next_count(&mut self) -> usize1070     fn next_count(&mut self) -> usize {
1071         self.next();
1072         self.cur
1073     }
1074 }
1075 
1076 // Auxiliary methods for manipulating the expression stack.
1077 impl Parser {
1078     // Called whenever an alternate (`|`) is found.
1079     //
1080     // This pops the expression stack until:
1081     //
1082     //  1. The stack is empty. Pushes an alternation with one arm.
1083     //  2. An opening parenthesis is found. Leave the parenthesis
1084     //     on the stack and push an alternation with one arm.
1085     //  3. An alternate (`|`) is found. Pop the existing alternation,
1086     //     add an arm and push the modified alternation.
1087     //
1088     // Each "arm" in the above corresponds to the concatenation of all
1089     // popped expressions.
1090     //
1091     // In the first two cases, the stack is left in an invalid state
1092     // because an alternation with one arm is not allowed. This
1093     // particular state will be detected by `finish_concat` and an
1094     // error will be reported.
1095     //
1096     // In none of the cases is an empty arm allowed. If an empty arm
1097     // is found, an error is reported.
alternate(&mut self) -> Result<Build>1098     fn alternate(&mut self) -> Result<Build> {
1099         let mut concat = vec![];
1100         let alts = |es| Ok(Build::Expr(Expr::Alternate(es)));
1101         loop {
1102             match self.stack.pop() {
1103                 None => {
1104                     if concat.is_empty() {
1105                         // e.g., |a
1106                         return Err(self.err(ErrorKind::EmptyAlternate));
1107                     }
1108                     return alts(vec![rev_concat(concat)]);
1109                 }
1110                 Some(e @ Build::LeftParen{..}) => {
1111                     if concat.is_empty() {
1112                         // e.g., (|a)
1113                         return Err(self.err(ErrorKind::EmptyAlternate));
1114                     }
1115                     self.stack.push(e);
1116                     return alts(vec![rev_concat(concat)]);
1117                 }
1118                 Some(Build::Expr(Expr::Alternate(mut es))) => {
1119                     if concat.is_empty() {
1120                         // e.g., a||
1121                         return Err(self.err(ErrorKind::EmptyAlternate));
1122                     }
1123                     es.push(rev_concat(concat));
1124                     return alts(es);
1125                 }
1126                 Some(Build::Expr(e)) => { concat.push(e); }
1127             }
1128         }
1129     }
1130 
1131     // Called whenever a closing parenthesis (`)`) is found.
1132     //
1133     // This pops the expression stack until:
1134     //
1135     //  1. The stack is empty. An error is reported because this
1136     //     indicates an unopened parenthesis.
1137     //  2. An opening parenthesis is found. Pop the opening parenthesis
1138     //     and push a `Group` expression.
1139     //  3. An alternate (`|`) is found. Pop the existing alternation
1140     //     and an arm to it in place. Pop one more item from the stack.
1141     //     If the stack was empty, then report an unopened parenthesis
1142     //     error, otherwise assume it is an opening parenthesis and
1143     //     push a `Group` expression with the popped alternation.
1144     //     (We can assume this is an opening parenthesis because an
1145     //     alternation either corresponds to the entire Regex or it
1146     //     corresponds to an entire group. This is guaranteed by the
1147     //     `alternate` method.)
1148     //
1149     // Each "arm" in the above corresponds to the concatenation of all
1150     // popped expressions.
1151     //
1152     // Empty arms nor empty groups are allowed.
close_paren(&mut self) -> Result<(Flags, Build)>1153     fn close_paren(&mut self) -> Result<(Flags, Build)> {
1154         let mut concat = vec![];
1155         loop {
1156             match self.stack.pop() {
1157                 // e.g., )
1158                 None => return Err(self.err(ErrorKind::UnopenedParen)),
1159                 Some(Build::LeftParen { i, name, old_flags, .. }) => {
1160                     if concat.is_empty() {
1161                         // e.g., ()
1162                         return Err(self.err(ErrorKind::EmptyGroup));
1163                     }
1164                     return Ok((old_flags, Build::Expr(Expr::Group {
1165                         e: Box::new(rev_concat(concat)),
1166                         i: i,
1167                         name: name,
1168                     })));
1169                 }
1170                 Some(Build::Expr(Expr::Alternate(mut es))) => {
1171                     if concat.is_empty() {
1172                         // e.g., (a|)
1173                         return Err(self.err(ErrorKind::EmptyAlternate));
1174                     }
1175                     es.push(rev_concat(concat));
1176                     match self.stack.pop() {
1177                         // e.g., a|b)
1178                         None => return Err(self.err(ErrorKind::UnopenedParen)),
1179                         Some(Build::Expr(_)) => unreachable!(),
1180                         Some(Build::LeftParen { i, name, old_flags, .. }) => {
1181                             return Ok((old_flags, Build::Expr(Expr::Group {
1182                                 e: Box::new(Expr::Alternate(es)),
1183                                 i: i,
1184                                 name: name,
1185                             })));
1186                         }
1187                     }
1188                 }
1189                 Some(Build::Expr(e)) => { concat.push(e); }
1190             }
1191         }
1192     }
1193 
1194     // Called only when the parser reaches the end of input.
1195     //
1196     // This pops the expression stack until:
1197     //
1198     //  1. The stack is empty. Return concatenation of popped
1199     //     expressions. This concatenation may be empty!
1200     //  2. An alternation is found. Pop the alternation and push
1201     //     a new arm. Return the alternation as the entire Regex.
1202     //     After this, the stack must be empty, or else there is
1203     //     an unclosed paren.
1204     //
1205     // If an opening parenthesis is popped, then an error is
1206     // returned since it indicates an unclosed parenthesis.
finish_concat(&mut self) -> Result<Expr>1207     fn finish_concat(&mut self) -> Result<Expr> {
1208         let mut concat = vec![];
1209         loop {
1210             match self.stack.pop() {
1211                 None => { return Ok(rev_concat(concat)); }
1212                 Some(Build::LeftParen{ chari, ..}) => {
1213                     // e.g., a(b
1214                     return Err(self.errat(chari, ErrorKind::UnclosedParen));
1215                 }
1216                 Some(Build::Expr(Expr::Alternate(mut es))) => {
1217                     if concat.is_empty() {
1218                         // e.g., a|
1219                         return Err(self.err(ErrorKind::EmptyAlternate));
1220                     }
1221                     es.push(rev_concat(concat));
1222                     // Make sure there are no opening parens remaining.
1223                     match self.stack.pop() {
1224                         None => return Ok(Expr::Alternate(es)),
1225                         Some(Build::LeftParen{ chari, ..}) => {
1226                             // e.g., (a|b
1227                             return Err(self.errat(
1228                                 chari, ErrorKind::UnclosedParen));
1229                         }
1230                         e => unreachable!("{:?}", e),
1231                     }
1232                 }
1233                 Some(Build::Expr(e)) => { concat.push(e); }
1234             }
1235         }
1236     }
1237 }
1238 
1239 // Methods for working with the bracket stack used for character class parsing.
1240 impl Parser {
1241 
1242     // After parsing a closing bracket `]`, process elements of the bracket
1243     // stack until finding the corresponding opening bracket `[`, and return
1244     // the combined character class. E.g. with `[^b-f&&ab-c]`:
1245     //
1246     // 1. Adjacent sets are merged into a single union: `ab-c` -> `a-c`
1247     // 2. Unions separated by `&&` are intersected: `b-f` and `a-c` -> `b-c`
1248     // 3. Negation is applied if necessary: `b-c` -> negation of `b-c`
close_bracket(&self, stack: &mut Vec<Bracket>) -> Result<CharClass>1249     fn close_bracket(&self, stack: &mut Vec<Bracket>) -> Result<CharClass> {
1250         let mut union = CharClass::empty();
1251         let mut intersect = vec![];
1252         loop {
1253             match stack.pop() {
1254                 Some(Bracket::Set(class)) => {
1255                     union.ranges.extend(class);
1256                 }
1257                 Some(Bracket::Intersection) => {
1258                     let class = self.class_union_transform(union);
1259                     intersect.push(class);
1260                     union = CharClass::empty();
1261                 }
1262                 Some(Bracket::LeftBracket { negated }) => {
1263                     let mut class = self.class_union_transform(union);
1264                     for c in intersect {
1265                         class = class.intersection(&c);
1266                     }
1267                     // negate after combining all sets (`^` has lower precedence than `&&`)
1268                     if negated {
1269                         class = class.negate();
1270                     }
1271                     if class.is_empty() {
1272                         // e.g., [^\d\D]
1273                         return Err(self.err(ErrorKind::EmptyClass));
1274                     }
1275                     return Ok(class);
1276                 }
1277                 // The first element on the stack is a `LeftBracket`
1278                 None => unreachable!()
1279             }
1280         }
1281     }
1282 
1283     // Apply case folding if requested on the union character class, and
1284     // return a canonicalized class.
class_union_transform(&self, class: CharClass) -> CharClass1285     fn class_union_transform(&self, class: CharClass) -> CharClass {
1286         if self.flags.casei {
1287             // Case folding canonicalizes too
1288             class.case_fold()
1289         } else {
1290             class.canonicalize()
1291         }
1292     }
1293 }
1294 
1295 impl Build {
is_empty(&self) -> bool1296     fn is_empty(&self) -> bool {
1297         match *self {
1298             Build::Expr(Expr::Empty) => true,
1299             _ => false,
1300         }
1301     }
1302 }
1303 
1304 // Make it ergonomic to conditionally bump the parser.
1305 // i.e., `bump_if('a')` or `bump_if("abc")`.
1306 trait Bumpable {
match_end(self, p: &Parser) -> usize1307     fn match_end(self, p: &Parser) -> usize;
1308 }
1309 
1310 impl Bumpable for char {
match_end(self, p: &Parser) -> usize1311     fn match_end(self, p: &Parser) -> usize {
1312         let mut chars = p.chars();
1313         if chars.next().map(|c| c == self).unwrap_or(false) {
1314             chars.cur
1315         } else {
1316             0
1317         }
1318     }
1319 }
1320 
1321 impl<'a> Bumpable for &'a str {
match_end(self, p: &Parser) -> usize1322     fn match_end(self, p: &Parser) -> usize {
1323         let mut search = self.chars();
1324         let mut rest = p.chars();
1325         let mut count = 0;
1326         loop {
1327             match (rest.next(), search.next()) {
1328                 (Some(c1), Some(c2)) if c1 == c2 => count = rest.cur,
1329                 (_, None) => return count,
1330                 _ => return 0,
1331             }
1332         }
1333     }
1334 }
1335 
1336 impl<F: FnMut(char) -> bool> Bumpable for F {
match_end(mut self, p: &Parser) -> usize1337     fn match_end(mut self, p: &Parser) -> usize {
1338         let mut chars = p.chars();
1339         let mut count = 0;
1340         while let Some(c) = chars.next() {
1341             if !self(c) {
1342                 break
1343             }
1344             count = chars.cur;
1345         }
1346         count
1347     }
1348 }
1349 
1350 // Turn a sequence of expressions into a concatenation.
1351 // This only uses `Concat` if there are 2 or more expressions.
rev_concat(mut exprs: Vec<Expr>) -> Expr1352 fn rev_concat(mut exprs: Vec<Expr>) -> Expr {
1353     if exprs.len() == 0 {
1354         Expr::Empty
1355     } else if exprs.len() == 1 {
1356         exprs.pop().unwrap()
1357     } else {
1358         exprs.reverse();
1359         Expr::Concat(exprs)
1360     }
1361 }
1362 
1363 // Returns true if and only if the given character is allowed in a capture
1364 // name. Note that the first char of a capture name must not be numeric.
is_valid_capture_char(c: char) -> bool1365 fn is_valid_capture_char(c: char) -> bool {
1366     c == '_' || (c >= '0' && c <= '9')
1367     || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
1368 }
1369 
is_ascii_word(c: char) -> bool1370 fn is_ascii_word(c: char) -> bool {
1371     match c {
1372         'a' ... 'z' | 'A' ... 'Z' | '_' | '0' ... '9' => true,
1373         _ => false,
1374     }
1375 }
1376 
1377 /// Returns true if the give character has significance in a regex.
is_punct(c: char) -> bool1378 pub fn is_punct(c: char) -> bool {
1379     match c {
1380         '\\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' |
1381         '[' | ']' | '{' | '}' | '^' | '$' | '#' | '&' | '-' | '~' => true,
1382         _ => false,
1383     }
1384 }
1385 
checkadd(x: usize, y: usize) -> usize1386 fn checkadd(x: usize, y: usize) -> usize {
1387     x.checked_add(y).expect("regex length overflow")
1388 }
1389 
unicode_class(name: &str) -> Option<CharClass>1390 fn unicode_class(name: &str) -> Option<CharClass> {
1391     UNICODE_CLASSES.binary_search_by(|&(s, _)| s.cmp(name)).ok().map(|i| {
1392         raw_class_to_expr(UNICODE_CLASSES[i].1)
1393     })
1394 }
1395 
ascii_class(name: &str) -> Option<CharClass>1396 fn ascii_class(name: &str) -> Option<CharClass> {
1397     ASCII_CLASSES.binary_search_by(|&(s, _)| s.cmp(name)).ok().map(|i| {
1398         raw_class_to_expr(ASCII_CLASSES[i].1)
1399     })
1400 }
1401 
raw_class_to_expr(raw: &[(char, char)]) -> CharClass1402 fn raw_class_to_expr(raw: &[(char, char)]) -> CharClass {
1403     let range = |&(s, e)| ClassRange { start: s, end: e };
1404     CharClass::new(raw.iter().map(range).collect())
1405 }
1406 
1407 type Class = &'static [(char, char)];
1408 type NamedClasses = &'static [(&'static str, Class)];
1409 
1410 const ASCII_CLASSES: NamedClasses = &[
1411     // Classes must be in alphabetical order so that bsearch works.
1412     // [:alnum:]      alphanumeric (== [0-9A-Za-z])
1413     // [:alpha:]      alphabetic (== [A-Za-z])
1414     // [:ascii:]      ASCII (== [\x00-\x7F])
1415     // [:blank:]      blank (== [\t ])
1416     // [:cntrl:]      control (== [\x00-\x1F\x7F])
1417     // [:digit:]      digits (== [0-9])
1418     // [:graph:]      graphical (== [!-~])
1419     // [:lower:]      lower case (== [a-z])
1420     // [:print:]      printable (== [ -~] == [ [:graph:]])
1421     // [:punct:]      punctuation (== [!-/:-@[-`{-~])
1422     // [:space:]      whitespace (== [\t\n\v\f\r ])
1423     // [:upper:]      upper case (== [A-Z])
1424     // [:word:]       word characters (== [0-9A-Za-z_])
1425     // [:xdigit:]     hex digit (== [0-9A-Fa-f])
1426     // Taken from: http://golang.org/pkg/regex/syntax/
1427     ("alnum", &ALNUM),
1428     ("alpha", &ALPHA),
1429     ("ascii", &ASCII),
1430     ("blank", &BLANK),
1431     ("cntrl", &CNTRL),
1432     ("digit", &DIGIT),
1433     ("graph", &GRAPH),
1434     ("lower", &LOWER),
1435     ("print", &PRINT),
1436     ("punct", &PUNCT),
1437     ("space", &SPACE),
1438     ("upper", &UPPER),
1439     ("word", &WORD),
1440     ("xdigit", &XDIGIT),
1441 ];
1442 
1443 const ALNUM: Class = &[('0', '9'), ('A', 'Z'), ('a', 'z')];
1444 const ALPHA: Class = &[('A', 'Z'), ('a', 'z')];
1445 const ASCII: Class = &[('\x00', '\x7F')];
1446 const BLANK: Class = &[(' ', ' '), ('\t', '\t')];
1447 const CNTRL: Class = &[('\x00', '\x1F'), ('\x7F', '\x7F')];
1448 const DIGIT: Class = &[('0', '9')];
1449 const GRAPH: Class = &[('!', '~')];
1450 const LOWER: Class = &[('a', 'z')];
1451 const PRINT: Class = &[(' ', '~')];
1452 const PUNCT: Class = &[('!', '/'), (':', '@'), ('[', '`'), ('{', '~')];
1453 const SPACE: Class = &[('\t', '\t'), ('\n', '\n'), ('\x0B', '\x0B'),
1454                        ('\x0C', '\x0C'), ('\r', '\r'), (' ', ' ')];
1455 const UPPER: Class = &[('A', 'Z')];
1456 const WORD: Class = &[('0', '9'), ('A', 'Z'), ('_', '_'), ('a', 'z')];
1457 const XDIGIT: Class = &[('0', '9'), ('A', 'F'), ('a', 'f')];
1458 
1459 #[cfg(test)]
1460 mod tests {
1461     use {
1462         CharClass, ClassRange, ByteClass, ByteRange,
1463         Expr, Repeater,
1464         ErrorKind,
1465     };
1466     use unicode::regex::{PERLD, PERLS, PERLW};
1467     use super::{LOWER, UPPER, WORD, Flags, Parser, ascii_class};
1468 
1469     static YI: &'static [(char, char)] = &[
1470         ('\u{a000}', '\u{a48c}'), ('\u{a490}', '\u{a4c6}'),
1471     ];
1472 
p(s: &str) -> Expr1473     fn p(s: &str) -> Expr { Parser::parse(s, Flags::default()).unwrap() }
pf(s: &str, flags: Flags) -> Expr1474     fn pf(s: &str, flags: Flags) -> Expr { Parser::parse(s, flags).unwrap() }
lit(c: char) -> Expr1475     fn lit(c: char) -> Expr { Expr::Literal { chars: vec![c], casei: false } }
liti(c: char) -> Expr1476     fn liti(c: char) -> Expr { Expr::Literal { chars: vec![c], casei: true } }
b<T>(v: T) -> Box<T>1477     fn b<T>(v: T) -> Box<T> { Box::new(v) }
c(es: &[Expr]) -> Expr1478     fn c(es: &[Expr]) -> Expr { Expr::Concat(es.to_vec()) }
1479 
pb(s: &str) -> Expr1480     fn pb(s: &str) -> Expr {
1481         let flags = Flags { allow_bytes: true, .. Flags::default() };
1482         Parser::parse(s, flags).unwrap()
1483     }
1484 
blit(b: u8) -> Expr1485     fn blit(b: u8) -> Expr {
1486         Expr::LiteralBytes {
1487             bytes: vec![b],
1488             casei: false,
1489         }
1490     }
1491 
bliti(b: u8) -> Expr1492     fn bliti(b: u8) -> Expr {
1493         Expr::LiteralBytes {
1494             bytes: vec![b],
1495             casei: true,
1496         }
1497     }
1498 
class(ranges: &[(char, char)]) -> CharClass1499     fn class(ranges: &[(char, char)]) -> CharClass {
1500         let ranges = ranges.iter().cloned()
1501                            .map(|(c1, c2)| ClassRange::new(c1, c2)).collect();
1502         CharClass::new(ranges)
1503     }
1504 
classes(classes: &[&[(char, char)]]) -> CharClass1505     fn classes(classes: &[&[(char, char)]]) -> CharClass {
1506         let mut cls = CharClass::empty();
1507         for &ranges in classes {
1508             cls.ranges.extend(class(ranges));
1509         }
1510         cls.canonicalize()
1511     }
1512 
bclass(ranges: &[(u8, u8)]) -> ByteClass1513     fn bclass(ranges: &[(u8, u8)]) -> ByteClass {
1514         let ranges = ranges.iter().cloned()
1515                            .map(|(c1, c2)| ByteRange::new(c1, c2)).collect();
1516         ByteClass::new(ranges)
1517     }
1518 
asciid() -> CharClass1519     fn asciid() -> CharClass {
1520         ascii_class("digit").unwrap()
1521     }
1522 
asciis() -> CharClass1523     fn asciis() -> CharClass {
1524         ascii_class("space").unwrap()
1525     }
1526 
asciiw() -> CharClass1527     fn asciiw() -> CharClass {
1528         ascii_class("word").unwrap()
1529     }
1530 
asciid_bytes() -> ByteClass1531     fn asciid_bytes() -> ByteClass {
1532         asciid().to_byte_class()
1533     }
1534 
asciis_bytes() -> ByteClass1535     fn asciis_bytes() -> ByteClass {
1536         asciis().to_byte_class()
1537     }
1538 
asciiw_bytes() -> ByteClass1539     fn asciiw_bytes() -> ByteClass {
1540         asciiw().to_byte_class()
1541     }
1542 
1543     #[test]
empty()1544     fn empty() {
1545         assert_eq!(p(""), Expr::Empty);
1546     }
1547 
1548     #[test]
literal()1549     fn literal() {
1550         assert_eq!(p("a"), lit('a'));
1551         assert_eq!(pb("(?-u)a"), blit(b'a'));
1552     }
1553 
1554     #[test]
literal_string()1555     fn literal_string() {
1556         assert_eq!(p("ab"), Expr::Concat(vec![lit('a'), lit('b')]));
1557         assert_eq!(pb("(?-u)ab"), Expr::Concat(vec![blit(b'a'), blit(b'b')]));
1558     }
1559 
1560     #[test]
start_literal()1561     fn start_literal() {
1562         assert_eq!(p("^a"), Expr::Concat(vec![
1563             Expr::StartText,
1564             Expr::Literal { chars: vec!['a'], casei: false },
1565         ]));
1566     }
1567 
1568     #[test]
repeat_zero_or_one_greedy()1569     fn repeat_zero_or_one_greedy() {
1570         assert_eq!(p("a?"), Expr::Repeat {
1571             e: b(lit('a')),
1572             r: Repeater::ZeroOrOne,
1573             greedy: true,
1574         });
1575     }
1576 
1577     #[test]
repeat_zero_or_one_greedy_concat()1578     fn repeat_zero_or_one_greedy_concat() {
1579         assert_eq!(p("ab?"), Expr::Concat(vec![
1580             lit('a'),
1581             Expr::Repeat {
1582                 e: b(lit('b')),
1583                 r: Repeater::ZeroOrOne,
1584                 greedy: true,
1585             },
1586         ]));
1587     }
1588 
1589     #[test]
repeat_zero_or_one_nongreedy()1590     fn repeat_zero_or_one_nongreedy() {
1591         assert_eq!(p("a??"), Expr::Repeat {
1592             e: b(lit('a')),
1593             r: Repeater::ZeroOrOne,
1594             greedy: false,
1595         });
1596     }
1597 
1598     #[test]
repeat_one_or_more_greedy()1599     fn repeat_one_or_more_greedy() {
1600         assert_eq!(p("a+"), Expr::Repeat {
1601             e: b(lit('a')),
1602             r: Repeater::OneOrMore,
1603             greedy: true,
1604         });
1605     }
1606 
1607     #[test]
repeat_one_or_more_nongreedy()1608     fn repeat_one_or_more_nongreedy() {
1609         assert_eq!(p("a+?"), Expr::Repeat {
1610             e: b(lit('a')),
1611             r: Repeater::OneOrMore,
1612             greedy: false,
1613         });
1614     }
1615 
1616     #[test]
repeat_zero_or_more_greedy()1617     fn repeat_zero_or_more_greedy() {
1618         assert_eq!(p("a*"), Expr::Repeat {
1619             e: b(lit('a')),
1620             r: Repeater::ZeroOrMore,
1621             greedy: true,
1622         });
1623     }
1624 
1625     #[test]
repeat_zero_or_more_nongreedy()1626     fn repeat_zero_or_more_nongreedy() {
1627         assert_eq!(p("a*?"), Expr::Repeat {
1628             e: b(lit('a')),
1629             r: Repeater::ZeroOrMore,
1630             greedy: false,
1631         });
1632     }
1633 
1634     #[test]
repeat_counted_exact()1635     fn repeat_counted_exact() {
1636         assert_eq!(p("a{5}"), Expr::Repeat {
1637             e: b(lit('a')),
1638             r: Repeater::Range { min: 5, max: Some(5) },
1639             greedy: true,
1640         });
1641     }
1642 
1643     #[test]
repeat_counted_min()1644     fn repeat_counted_min() {
1645         assert_eq!(p("a{5,}"), Expr::Repeat {
1646             e: b(lit('a')),
1647             r: Repeater::Range { min: 5, max: None },
1648             greedy: true,
1649         });
1650     }
1651 
1652     #[test]
repeat_counted_min_max()1653     fn repeat_counted_min_max() {
1654         assert_eq!(p("a{5,10}"), Expr::Repeat {
1655             e: b(lit('a')),
1656             r: Repeater::Range { min: 5, max: Some(10) },
1657             greedy: true,
1658         });
1659     }
1660 
1661     #[test]
repeat_counted_exact_nongreedy()1662     fn repeat_counted_exact_nongreedy() {
1663         assert_eq!(p("a{5}?"), Expr::Repeat {
1664             e: b(lit('a')),
1665             r: Repeater::Range { min: 5, max: Some(5) },
1666             greedy: false,
1667         });
1668     }
1669 
1670     #[test]
repeat_counted_min_nongreedy()1671     fn repeat_counted_min_nongreedy() {
1672         assert_eq!(p("a{5,}?"), Expr::Repeat {
1673             e: b(lit('a')),
1674             r: Repeater::Range { min: 5, max: None },
1675             greedy: false,
1676         });
1677     }
1678 
1679     #[test]
repeat_counted_min_max_nongreedy()1680     fn repeat_counted_min_max_nongreedy() {
1681         assert_eq!(p("a{5,10}?"), Expr::Repeat {
1682             e: b(lit('a')),
1683             r: Repeater::Range { min: 5, max: Some(10) },
1684             greedy: false,
1685         });
1686     }
1687 
1688     #[test]
repeat_counted_whitespace()1689     fn repeat_counted_whitespace() {
1690         assert_eq!(p("a{ 5   }"), Expr::Repeat {
1691             e: b(lit('a')),
1692             r: Repeater::Range { min: 5, max: Some(5) },
1693             greedy: true,
1694         });
1695         assert_eq!(p("a{ 5 , 10 }"), Expr::Repeat {
1696             e: b(lit('a')),
1697             r: Repeater::Range { min: 5, max: Some(10) },
1698             greedy: true,
1699         });
1700     }
1701 
1702     #[test]
group_literal()1703     fn group_literal() {
1704         assert_eq!(p("(a)"), Expr::Group {
1705             e: b(lit('a')),
1706             i: Some(1),
1707             name: None,
1708         });
1709     }
1710 
1711     #[test]
group_literal_concat()1712     fn group_literal_concat() {
1713         assert_eq!(p("(ab)"), Expr::Group {
1714             e: b(c(&[lit('a'), lit('b')])),
1715             i: Some(1),
1716             name: None,
1717         });
1718     }
1719 
1720     #[test]
alt_two()1721     fn alt_two() {
1722         assert_eq!(p("a|b"), Expr::Alternate(vec![lit('a'), lit('b')]));
1723     }
1724 
1725     #[test]
alt_many()1726     fn alt_many() {
1727         assert_eq!(p("a|b|c"), Expr::Alternate(vec![
1728             lit('a'), lit('b'), lit('c'),
1729         ]));
1730     }
1731 
1732     #[test]
alt_many_concat()1733     fn alt_many_concat() {
1734         assert_eq!(p("ab|bc|cd"), Expr::Alternate(vec![
1735             c(&[lit('a'), lit('b')]),
1736             c(&[lit('b'), lit('c')]),
1737             c(&[lit('c'), lit('d')]),
1738         ]));
1739     }
1740 
1741     #[test]
alt_group_two()1742     fn alt_group_two() {
1743         assert_eq!(p("(a|b)"), Expr::Group {
1744             e: b(Expr::Alternate(vec![lit('a'), lit('b')])),
1745             i: Some(1),
1746             name: None,
1747         });
1748     }
1749 
1750     #[test]
alt_group_many()1751     fn alt_group_many() {
1752         assert_eq!(p("(a|b|c)"), Expr::Group {
1753             e: b(Expr::Alternate(vec![lit('a'), lit('b'), lit('c')])),
1754             i: Some(1),
1755             name: None,
1756         });
1757     }
1758 
1759     #[test]
alt_group_many_concat()1760     fn alt_group_many_concat() {
1761         assert_eq!(p("(ab|bc|cd)"), Expr::Group {
1762             e: b(Expr::Alternate(vec![
1763                 c(&[lit('a'), lit('b')]),
1764                 c(&[lit('b'), lit('c')]),
1765                 c(&[lit('c'), lit('d')]),
1766             ])),
1767             i: Some(1),
1768             name: None,
1769         });
1770     }
1771 
1772     #[test]
alt_group_nested()1773     fn alt_group_nested() {
1774         assert_eq!(p("(ab|(bc|(cd)))"), Expr::Group {
1775             e: b(Expr::Alternate(vec![
1776                 c(&[lit('a'), lit('b')]),
1777                 Expr::Group {
1778                     e: b(Expr::Alternate(vec![
1779                         c(&[lit('b'), lit('c')]),
1780                         Expr::Group {
1781                             e: b(c(&[lit('c'), lit('d')])),
1782                             i: Some(3),
1783                             name: None,
1784                         }
1785                     ])),
1786                     i: Some(2),
1787                     name: None,
1788                 },
1789             ])),
1790             i: Some(1),
1791             name: None,
1792         });
1793     }
1794 
1795     #[test]
group_name()1796     fn group_name() {
1797         assert_eq!(p("(?P<foo>a)"), Expr::Group {
1798             e: b(lit('a')),
1799             i: Some(1),
1800             name: Some("foo".into()),
1801         });
1802     }
1803 
1804     #[test]
group_no_capture()1805     fn group_no_capture() {
1806         assert_eq!(p("(?:a)"), Expr::Group {
1807             e: b(lit('a')),
1808             i: None,
1809             name: None,
1810         });
1811     }
1812 
1813     #[test]
group_flags()1814     fn group_flags() {
1815         assert_eq!(p("(?i:a)"), Expr::Group {
1816             e: b(liti('a')),
1817             i: None,
1818             name: None,
1819         });
1820         assert_eq!(pb("(?i-u:a)"), Expr::Group {
1821             e: b(bliti(b'a')),
1822             i: None,
1823             name: None,
1824         });
1825     }
1826 
1827     #[test]
group_flags_returned()1828     fn group_flags_returned() {
1829         assert_eq!(p("(?i:a)a"), c(&[
1830             Expr::Group {
1831                 e: b(liti('a')),
1832                 i: None,
1833                 name: None,
1834             },
1835             lit('a'),
1836         ]));
1837         assert_eq!(pb("(?i-u:a)a"), c(&[
1838             Expr::Group {
1839                 e: b(bliti(b'a')),
1840                 i: None,
1841                 name: None,
1842             },
1843             lit('a'),
1844         ]));
1845     }
1846 
1847     #[test]
group_flags_retained()1848     fn group_flags_retained() {
1849         assert_eq!(p("(?i)(?-i:a)a"), c(&[
1850             Expr::Group {
1851                 e: b(lit('a')),
1852                 i: None,
1853                 name: None,
1854             },
1855             liti('a'),
1856         ]));
1857         assert_eq!(pb("(?i-u)(?u-i:a)a"), c(&[
1858             Expr::Group {
1859                 e: b(lit('a')),
1860                 i: None,
1861                 name: None,
1862             },
1863             bliti(b'a'),
1864         ]));
1865     }
1866 
1867     #[test]
flags_inline()1868     fn flags_inline() {
1869         assert_eq!(p("(?i)a"), liti('a'));
1870     }
1871 
1872     #[test]
flags_inline_multiple()1873     fn flags_inline_multiple() {
1874         assert_eq!(p("(?is)a."), c(&[liti('a'), Expr::AnyChar]));
1875     }
1876 
1877     #[test]
flags_inline_multiline()1878     fn flags_inline_multiline() {
1879         assert_eq!(p("(?m)^(?-m)$"), c(&[Expr::StartLine, Expr::EndText]));
1880     }
1881 
1882     #[test]
flags_inline_swap_greed()1883     fn flags_inline_swap_greed() {
1884         assert_eq!(p("(?U)a*a*?(?i-U)a*a*?"), c(&[
1885             Expr::Repeat {
1886                 e: b(lit('a')),
1887                 r: Repeater::ZeroOrMore,
1888                 greedy: false,
1889             },
1890             Expr::Repeat {
1891                 e: b(lit('a')),
1892                 r: Repeater::ZeroOrMore,
1893                 greedy: true,
1894             },
1895             Expr::Repeat {
1896                 e: b(liti('a')),
1897                 r: Repeater::ZeroOrMore,
1898                 greedy: true,
1899             },
1900             Expr::Repeat {
1901                 e: b(liti('a')),
1902                 r: Repeater::ZeroOrMore,
1903                 greedy: false,
1904             },
1905         ]));
1906     }
1907 
1908     #[test]
flags_inline_multiple_negate_one()1909     fn flags_inline_multiple_negate_one() {
1910         assert_eq!(p("(?is)a.(?i-s)a."), c(&[
1911             liti('a'), Expr::AnyChar, liti('a'), Expr::AnyCharNoNL,
1912         ]));
1913     }
1914 
1915     #[test]
any_byte()1916     fn any_byte() {
1917         assert_eq!(
1918             pb("(?-u).(?u)."), c(&[Expr::AnyByteNoNL, Expr::AnyCharNoNL]));
1919         assert_eq!(
1920             pb("(?s)(?-u).(?u)."), c(&[Expr::AnyByte, Expr::AnyChar]));
1921     }
1922 
1923     #[test]
flags_inline_negate()1924     fn flags_inline_negate() {
1925         assert_eq!(p("(?i)a(?-i)a"), c(&[liti('a'), lit('a')]));
1926     }
1927 
1928     #[test]
flags_group_inline()1929     fn flags_group_inline() {
1930         assert_eq!(p("(a(?i)a)a"), c(&[
1931             Expr::Group {
1932                 e: b(c(&[lit('a'), liti('a')])),
1933                 i: Some(1),
1934                 name: None,
1935             },
1936             lit('a'),
1937         ]));
1938     }
1939 
1940     #[test]
flags_group_inline_retain()1941     fn flags_group_inline_retain() {
1942         assert_eq!(p("(?i)((?-i)a)a"), c(&[
1943             Expr::Group {
1944                 e: b(lit('a')),
1945                 i: Some(1),
1946                 name: None,
1947             },
1948             liti('a'),
1949         ]));
1950     }
1951 
1952     #[test]
flags_default_casei()1953     fn flags_default_casei() {
1954         let flags = Flags { casei: true, .. Flags::default() };
1955         assert_eq!(pf("a", flags), liti('a'));
1956     }
1957 
1958     #[test]
flags_default_multi()1959     fn flags_default_multi() {
1960         let flags = Flags { multi: true, .. Flags::default() };
1961         assert_eq!(pf("^", flags), Expr::StartLine);
1962     }
1963 
1964     #[test]
flags_default_dotnl()1965     fn flags_default_dotnl() {
1966         let flags = Flags { dotnl: true, .. Flags::default() };
1967         assert_eq!(pf(".", flags), Expr::AnyChar);
1968     }
1969 
1970     #[test]
flags_default_swap_greed()1971     fn flags_default_swap_greed() {
1972         let flags = Flags { swap_greed: true, .. Flags::default() };
1973         assert_eq!(pf("a+", flags), Expr::Repeat {
1974             e: b(lit('a')),
1975             r: Repeater::OneOrMore,
1976             greedy: false,
1977         });
1978     }
1979 
1980     #[test]
flags_default_ignore_space()1981     fn flags_default_ignore_space() {
1982         let flags = Flags { ignore_space: true, .. Flags::default() };
1983         assert_eq!(pf(" a ", flags), lit('a'));
1984     }
1985 
1986     #[test]
escape_simple()1987     fn escape_simple() {
1988         assert_eq!(p(r"\a\f\t\n\r\v"), c(&[
1989             lit('\x07'), lit('\x0C'), lit('\t'),
1990             lit('\n'), lit('\r'), lit('\x0B'),
1991         ]));
1992     }
1993 
1994     #[test]
escape_boundaries()1995     fn escape_boundaries() {
1996         assert_eq!(p(r"\A\z\b\B"), c(&[
1997             Expr::StartText, Expr::EndText,
1998             Expr::WordBoundary, Expr::NotWordBoundary,
1999         ]));
2000         assert_eq!(pb(r"(?-u)\b\B"), c(&[
2001             Expr::WordBoundaryAscii, Expr::NotWordBoundaryAscii,
2002         ]));
2003     }
2004 
2005     #[test]
escape_punctuation()2006     fn escape_punctuation() {
2007         assert_eq!(p(r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#"), c(&[
2008             lit('\\'), lit('.'), lit('+'), lit('*'), lit('?'),
2009             lit('('), lit(')'), lit('|'), lit('['), lit(']'),
2010             lit('{'), lit('}'), lit('^'), lit('$'), lit('#'),
2011         ]));
2012     }
2013 
2014     #[test]
escape_octal()2015     fn escape_octal() {
2016         assert_eq!(p(r"\123"), lit('S'));
2017         assert_eq!(p(r"\1234"), c(&[lit('S'), lit('4')]));
2018 
2019         assert_eq!(pb(r"(?-u)\377"), blit(0xFF));
2020     }
2021 
2022     #[test]
escape_hex2()2023     fn escape_hex2() {
2024         assert_eq!(p(r"\x53"), lit('S'));
2025         assert_eq!(p(r"\x534"), c(&[lit('S'), lit('4')]));
2026 
2027         assert_eq!(pb(r"(?-u)\xff"), blit(0xFF));
2028         assert_eq!(pb(r"(?-u)\x00"), blit(0x0));
2029         assert_eq!(pb(r"(?-u)[\x00]"),
2030                    Expr::ClassBytes(bclass(&[(b'\x00', b'\x00')])));
2031         assert_eq!(pb(r"(?-u)[^\x00]"),
2032                    Expr::ClassBytes(bclass(&[(b'\x01', b'\xFF')])));
2033     }
2034 
2035     #[test]
escape_hex()2036     fn escape_hex() {
2037         assert_eq!(p(r"\x{53}"), lit('S'));
2038         assert_eq!(p(r"\x{53}4"), c(&[lit('S'), lit('4')]));
2039         assert_eq!(p(r"\x{2603}"), lit('\u{2603}'));
2040 
2041         assert_eq!(pb(r"(?-u)\x{00FF}"), blit(0xFF));
2042     }
2043 
2044     #[test]
escape_unicode_name()2045     fn escape_unicode_name() {
2046         assert_eq!(p(r"\p{Yi}"), Expr::Class(class(YI)));
2047     }
2048 
2049     #[test]
escape_unicode_letter()2050     fn escape_unicode_letter() {
2051         assert_eq!(p(r"\pZ"), Expr::Class(class(&[
2052             ('\u{20}', '\u{20}'), ('\u{a0}', '\u{a0}'),
2053             ('\u{1680}', '\u{1680}'), ('\u{2000}', '\u{200a}'),
2054             ('\u{2028}', '\u{2029}'), ('\u{202f}', '\u{202f}'),
2055             ('\u{205f}', '\u{205f}'), ('\u{3000}', '\u{3000}'),
2056         ])));
2057     }
2058 
2059     #[test]
escape_unicode_name_case_fold()2060     fn escape_unicode_name_case_fold() {
2061         assert_eq!(p(r"(?i)\p{Yi}"), Expr::Class(class(YI).case_fold()));
2062     }
2063 
2064     #[test]
escape_unicode_letter_case_fold()2065     fn escape_unicode_letter_case_fold() {
2066         assert_eq!(p(r"(?i)\pZ"), Expr::Class(class(&[
2067             ('\u{20}', '\u{20}'), ('\u{a0}', '\u{a0}'),
2068             ('\u{1680}', '\u{1680}'), ('\u{2000}', '\u{200a}'),
2069             ('\u{2028}', '\u{2029}'), ('\u{202f}', '\u{202f}'),
2070             ('\u{205f}', '\u{205f}'), ('\u{3000}', '\u{3000}'),
2071         ]).case_fold()));
2072     }
2073 
2074     #[test]
escape_unicode_name_negate()2075     fn escape_unicode_name_negate() {
2076         assert_eq!(p(r"\P{Yi}"), Expr::Class(class(YI).negate()));
2077     }
2078 
2079     #[test]
escape_unicode_letter_negate()2080     fn escape_unicode_letter_negate() {
2081         assert_eq!(p(r"\PZ"), Expr::Class(class(&[
2082             ('\u{20}', '\u{20}'), ('\u{a0}', '\u{a0}'),
2083             ('\u{1680}', '\u{1680}'), ('\u{2000}', '\u{200a}'),
2084             ('\u{2028}', '\u{2029}'), ('\u{202f}', '\u{202f}'),
2085             ('\u{205f}', '\u{205f}'), ('\u{3000}', '\u{3000}'),
2086         ]).negate()));
2087     }
2088 
2089     #[test]
escape_unicode_name_negate_case_fold()2090     fn escape_unicode_name_negate_case_fold() {
2091         assert_eq!(p(r"(?i)\P{Yi}"),
2092                    Expr::Class(class(YI).negate().case_fold()));
2093     }
2094 
2095     #[test]
escape_unicode_letter_negate_case_fold()2096     fn escape_unicode_letter_negate_case_fold() {
2097         assert_eq!(p(r"(?i)\PZ"), Expr::Class(class(&[
2098             ('\u{20}', '\u{20}'), ('\u{a0}', '\u{a0}'),
2099             ('\u{1680}', '\u{1680}'), ('\u{2000}', '\u{200a}'),
2100             ('\u{2028}', '\u{2029}'), ('\u{202f}', '\u{202f}'),
2101             ('\u{205f}', '\u{205f}'), ('\u{3000}', '\u{3000}'),
2102         ]).negate().case_fold()));
2103     }
2104 
2105     #[test]
escape_perl_d()2106     fn escape_perl_d() {
2107         assert_eq!(p(r"\d"), Expr::Class(class(PERLD)));
2108         assert_eq!(pb(r"(?-u)\d"), Expr::Class(asciid()));
2109     }
2110 
2111     #[test]
escape_perl_s()2112     fn escape_perl_s() {
2113         assert_eq!(p(r"\s"), Expr::Class(class(PERLS)));
2114         assert_eq!(pb(r"(?-u)\s"), Expr::Class(asciis()));
2115     }
2116 
2117     #[test]
escape_perl_w()2118     fn escape_perl_w() {
2119         assert_eq!(p(r"\w"), Expr::Class(class(PERLW)));
2120         assert_eq!(pb(r"(?-u)\w"), Expr::Class(asciiw()));
2121     }
2122 
2123     #[test]
escape_perl_d_negate()2124     fn escape_perl_d_negate() {
2125         assert_eq!(p(r"\D"), Expr::Class(class(PERLD).negate()));
2126         assert_eq!(pb(r"(?-u)\D"), Expr::Class(asciid().negate()));
2127     }
2128 
2129     #[test]
escape_perl_s_negate()2130     fn escape_perl_s_negate() {
2131         assert_eq!(p(r"\S"), Expr::Class(class(PERLS).negate()));
2132         assert_eq!(pb(r"(?-u)\S"), Expr::Class(asciis().negate()));
2133     }
2134 
2135     #[test]
escape_perl_w_negate()2136     fn escape_perl_w_negate() {
2137         assert_eq!(p(r"\W"), Expr::Class(class(PERLW).negate()));
2138         assert_eq!(pb(r"(?-u)\W"), Expr::Class(asciiw().negate()));
2139     }
2140 
2141     #[test]
escape_perl_d_case_fold()2142     fn escape_perl_d_case_fold() {
2143         assert_eq!(p(r"(?i)\d"), Expr::Class(class(PERLD).case_fold()));
2144         assert_eq!(pb(r"(?i-u)\d"), Expr::Class(asciid().case_fold()));
2145     }
2146 
2147     #[test]
escape_perl_s_case_fold()2148     fn escape_perl_s_case_fold() {
2149         assert_eq!(p(r"(?i)\s"), Expr::Class(class(PERLS).case_fold()));
2150         assert_eq!(pb(r"(?i-u)\s"), Expr::Class(asciis().case_fold()));
2151     }
2152 
2153     #[test]
escape_perl_w_case_fold()2154     fn escape_perl_w_case_fold() {
2155         assert_eq!(p(r"(?i)\w"), Expr::Class(class(PERLW).case_fold()));
2156         assert_eq!(pb(r"(?i-u)\w"), Expr::Class(asciiw().case_fold()));
2157     }
2158 
2159     #[test]
escape_perl_d_case_fold_negate()2160     fn escape_perl_d_case_fold_negate() {
2161         assert_eq!(p(r"(?i)\D"),
2162                    Expr::Class(class(PERLD).case_fold().negate()));
2163         let bytes = asciid().case_fold().negate();
2164         assert_eq!(pb(r"(?i-u)\D"), Expr::Class(bytes));
2165     }
2166 
2167     #[test]
escape_perl_s_case_fold_negate()2168     fn escape_perl_s_case_fold_negate() {
2169         assert_eq!(p(r"(?i)\S"),
2170                    Expr::Class(class(PERLS).case_fold().negate()));
2171         let bytes = asciis().case_fold().negate();
2172         assert_eq!(pb(r"(?i-u)\S"), Expr::Class(bytes));
2173     }
2174 
2175     #[test]
escape_perl_w_case_fold_negate()2176     fn escape_perl_w_case_fold_negate() {
2177         assert_eq!(p(r"(?i)\W"),
2178                    Expr::Class(class(PERLW).case_fold().negate()));
2179         let bytes = asciiw().case_fold().negate();
2180         assert_eq!(pb(r"(?i-u)\W"), Expr::Class(bytes));
2181     }
2182 
2183     #[test]
class_singleton()2184     fn class_singleton() {
2185         assert_eq!(p(r"[a]"), Expr::Class(class(&[('a', 'a')])));
2186         assert_eq!(p(r"[\x00]"), Expr::Class(class(&[('\x00', '\x00')])));
2187         assert_eq!(p(r"[\n]"), Expr::Class(class(&[('\n', '\n')])));
2188         assert_eq!(p("[\n]"), Expr::Class(class(&[('\n', '\n')])));
2189 
2190         assert_eq!(pb(r"(?-u)[a]"), Expr::ClassBytes(bclass(&[(b'a', b'a')])));
2191         assert_eq!(pb(r"(?-u)[\x00]"), Expr::ClassBytes(bclass(&[(0, 0)])));
2192         assert_eq!(pb(r"(?-u)[\xFF]"),
2193                    Expr::ClassBytes(bclass(&[(0xFF, 0xFF)])));
2194         assert_eq!(pb("(?-u)[\n]"),
2195                    Expr::ClassBytes(bclass(&[(b'\n', b'\n')])));
2196         assert_eq!(pb(r"(?-u)[\n]"),
2197                    Expr::ClassBytes(bclass(&[(b'\n', b'\n')])));
2198     }
2199 
2200     #[test]
class_singleton_negate()2201     fn class_singleton_negate() {
2202         assert_eq!(p(r"[^a]"), Expr::Class(class(&[
2203             ('\x00', '\x60'), ('\x62', '\u{10FFFF}'),
2204         ])));
2205         assert_eq!(p(r"[^\x00]"), Expr::Class(class(&[
2206             ('\x01', '\u{10FFFF}'),
2207         ])));
2208         assert_eq!(p(r"[^\n]"), Expr::Class(class(&[
2209             ('\x00', '\x09'), ('\x0b', '\u{10FFFF}'),
2210         ])));
2211         assert_eq!(p("[^\n]"), Expr::Class(class(&[
2212             ('\x00', '\x09'), ('\x0b', '\u{10FFFF}'),
2213         ])));
2214 
2215         assert_eq!(pb(r"(?-u)[^a]"), Expr::ClassBytes(bclass(&[
2216             (0x00, 0x60), (0x62, 0xFF),
2217         ])));
2218         assert_eq!(pb(r"(?-u)[^\x00]"), Expr::ClassBytes(bclass(&[
2219             (0x01, 0xFF),
2220         ])));
2221         assert_eq!(pb(r"(?-u)[^\n]"), Expr::ClassBytes(bclass(&[
2222             (0x00, 0x09), (0x0B, 0xFF),
2223         ])));
2224         assert_eq!(pb("(?-u)[^\n]"), Expr::ClassBytes(bclass(&[
2225             (0x00, 0x09), (0x0B, 0xFF),
2226         ])));
2227     }
2228 
2229     #[test]
class_singleton_class()2230     fn class_singleton_class() {
2231         assert_eq!(p(r"[\d]"), Expr::Class(class(PERLD)));
2232         assert_eq!(p(r"[\p{Yi}]"), Expr::Class(class(YI)));
2233 
2234         let bytes = class(PERLD).to_byte_class();
2235         assert_eq!(pb(r"(?-u)[\d]"), Expr::ClassBytes(bytes));
2236     }
2237 
2238     #[test]
class_singleton_class_negate()2239     fn class_singleton_class_negate() {
2240         assert_eq!(p(r"[^\d]"), Expr::Class(class(PERLD).negate()));
2241         assert_eq!(p(r"[^\w]"), Expr::Class(class(PERLW).negate()));
2242         assert_eq!(p(r"[^\s]"), Expr::Class(class(PERLS).negate()));
2243 
2244         let bytes = asciid_bytes().negate();
2245         assert_eq!(pb(r"(?-u)[^\d]"), Expr::ClassBytes(bytes));
2246         let bytes = asciiw_bytes().negate();
2247         assert_eq!(pb(r"(?-u)[^\w]"), Expr::ClassBytes(bytes));
2248         let bytes = asciis_bytes().negate();
2249         assert_eq!(pb(r"(?-u)[^\s]"), Expr::ClassBytes(bytes));
2250     }
2251 
2252     #[test]
class_singleton_class_negate_negate()2253     fn class_singleton_class_negate_negate() {
2254         assert_eq!(p(r"[^\D]"), Expr::Class(class(PERLD)));
2255         assert_eq!(p(r"[^\W]"), Expr::Class(class(PERLW)));
2256         assert_eq!(p(r"[^\S]"), Expr::Class(class(PERLS)));
2257 
2258         assert_eq!(pb(r"(?-u)[^\D]"), Expr::ClassBytes(asciid_bytes()));
2259         assert_eq!(pb(r"(?-u)[^\W]"), Expr::ClassBytes(asciiw_bytes()));
2260         assert_eq!(pb(r"(?-u)[^\S]"), Expr::ClassBytes(asciis_bytes()));
2261     }
2262 
2263     #[test]
class_singleton_class_casei()2264     fn class_singleton_class_casei() {
2265         assert_eq!(p(r"(?i)[\d]"), Expr::Class(class(PERLD).case_fold()));
2266         assert_eq!(p(r"(?i)[\p{Yi}]"), Expr::Class(class(YI).case_fold()));
2267 
2268         assert_eq!(pb(r"(?i-u)[\d]"),
2269                    Expr::ClassBytes(asciid_bytes().case_fold()));
2270     }
2271 
2272     #[test]
class_singleton_class_negate_casei()2273     fn class_singleton_class_negate_casei() {
2274         assert_eq!(p(r"(?i)[^\d]"),
2275                    Expr::Class(class(PERLD).case_fold().negate()));
2276         assert_eq!(p(r"(?i)[^\w]"),
2277                    Expr::Class(class(PERLW).case_fold().negate()));
2278         assert_eq!(p(r"(?i)[^\s]"),
2279                    Expr::Class(class(PERLS).case_fold().negate()));
2280 
2281         let bytes = asciid_bytes().case_fold().negate();
2282         assert_eq!(pb(r"(?i-u)[^\d]"), Expr::ClassBytes(bytes));
2283         let bytes = asciiw_bytes().case_fold().negate();
2284         assert_eq!(pb(r"(?i-u)[^\w]"), Expr::ClassBytes(bytes));
2285         let bytes = asciis_bytes().case_fold().negate();
2286         assert_eq!(pb(r"(?i-u)[^\s]"), Expr::ClassBytes(bytes));
2287     }
2288 
2289     #[test]
class_singleton_class_negate_negate_casei()2290     fn class_singleton_class_negate_negate_casei() {
2291         assert_eq!(p(r"(?i)[^\D]"), Expr::Class(class(PERLD).case_fold()));
2292         assert_eq!(p(r"(?i)[^\W]"), Expr::Class(class(PERLW).case_fold()));
2293         assert_eq!(p(r"(?i)[^\S]"), Expr::Class(class(PERLS).case_fold()));
2294 
2295         assert_eq!(pb(r"(?i-u)[^\D]"),
2296                    Expr::ClassBytes(asciid_bytes().case_fold()));
2297         assert_eq!(pb(r"(?i-u)[^\W]"),
2298                    Expr::ClassBytes(asciiw_bytes().case_fold()));
2299         assert_eq!(pb(r"(?i-u)[^\S]"),
2300                    Expr::ClassBytes(asciis_bytes().case_fold()));
2301     }
2302 
2303     #[test]
class_multiple_class()2304     fn class_multiple_class() {
2305         assert_eq!(p(r"[\d\p{Yi}]"), Expr::Class(classes(&[
2306             PERLD, YI,
2307         ])));
2308     }
2309 
2310     #[test]
class_multiple_class_negate()2311     fn class_multiple_class_negate() {
2312         assert_eq!(p(r"[^\d\p{Yi}]"), Expr::Class(classes(&[
2313             PERLD, YI,
2314         ]).negate()));
2315     }
2316 
2317     #[test]
class_multiple_class_negate_negate()2318     fn class_multiple_class_negate_negate() {
2319         let nperlw = class(PERLW).negate();
2320         let nyi = class(YI).negate();
2321         let cls = CharClass::empty().merge(nperlw).merge(nyi);
2322         assert_eq!(p(r"[^\W\P{Yi}]"), Expr::Class(cls.negate()));
2323     }
2324 
2325     #[test]
class_multiple_class_casei()2326     fn class_multiple_class_casei() {
2327         assert_eq!(p(r"(?i)[\d\p{Yi}]"), Expr::Class(classes(&[
2328             PERLD, YI,
2329         ]).case_fold()));
2330     }
2331 
2332     #[test]
class_multiple_class_negate_casei()2333     fn class_multiple_class_negate_casei() {
2334         assert_eq!(p(r"(?i)[^\d\p{Yi}]"), Expr::Class(classes(&[
2335             PERLD, YI,
2336         ]).case_fold().negate()));
2337     }
2338 
2339     #[test]
class_multiple_class_negate_negate_casei()2340     fn class_multiple_class_negate_negate_casei() {
2341         let nperlw = class(PERLW).negate();
2342         let nyi = class(YI).negate();
2343         let class = CharClass::empty().merge(nperlw).merge(nyi);
2344         assert_eq!(p(r"(?i)[^\W\P{Yi}]"),
2345                    Expr::Class(class.case_fold().negate()));
2346     }
2347 
2348     #[test]
class_class_hypen()2349     fn class_class_hypen() {
2350         assert_eq!(p(r"[\p{Yi}-]"), Expr::Class(classes(&[
2351             &[('-', '-')], YI,
2352         ])));
2353         assert_eq!(p(r"[\p{Yi}-a]"), Expr::Class(classes(&[
2354             &[('-', '-')], &[('a', 'a')], YI,
2355         ])));
2356     }
2357 
2358     #[test]
class_brackets()2359     fn class_brackets() {
2360         assert_eq!(p(r"[]]"), Expr::Class(class(&[(']', ']')])));
2361         assert_eq!(p(r"[]\[]"), Expr::Class(class(&[('[', '['), (']', ']')])));
2362         assert_eq!(p(r"[\[]]"), Expr::Concat(vec![
2363             Expr::Class(class(&[('[', '[')])),
2364             lit(']'),
2365         ]));
2366     }
2367 
2368     #[test]
class_brackets_hypen()2369     fn class_brackets_hypen() {
2370         assert_eq!(p("[]-]"), Expr::Class(class(&[('-', '-'), (']', ']')])));
2371         assert_eq!(p("[-]]"), Expr::Concat(vec![
2372             Expr::Class(class(&[('-', '-')])),
2373             lit(']'),
2374         ]));
2375     }
2376 
2377     #[test]
class_nested_class_union()2378     fn class_nested_class_union() {
2379         assert_eq!(p(r"[c[a-b]]"), Expr::Class(class(&[('a', 'c')])));
2380         assert_eq!(p(r"[[a-b]]"), Expr::Class(class(&[('a', 'b')])));
2381         assert_eq!(p(r"[[c][a-b]]"), Expr::Class(class(&[('a', 'c')])));
2382 
2383         assert_eq!(pb(r"(?-u)[c[a-b]]"),
2384                    Expr::ClassBytes(bclass(&[(b'a', b'c')])));
2385         assert_eq!(pb(r"(?-u)[[a-b]]"),
2386                    Expr::ClassBytes(bclass(&[(b'a', b'b')])));
2387         assert_eq!(pb(r"(?-u)[[c][a-b]]"),
2388                    Expr::ClassBytes(bclass(&[(b'a', b'c')])));
2389     }
2390 
2391     #[test]
class_nested_class_union_casei()2392     fn class_nested_class_union_casei() {
2393         assert_eq!(p(r"(?i)[c[a-b]]"),
2394                    Expr::Class(class(&[('a', 'c')]).case_fold()));
2395         assert_eq!(p(r"(?i)[[a-b]]"),
2396                    Expr::Class(class(&[('a', 'b')]).case_fold()));
2397         assert_eq!(p(r"(?i)[[c][a-b]]"),
2398                    Expr::Class(class(&[('a', 'c')]).case_fold()));
2399 
2400         assert_eq!(pb(r"(?i-u)[[\d]]"),
2401                    Expr::ClassBytes(asciid_bytes().case_fold()));
2402     }
2403 
2404     #[test]
class_nested_class_negate()2405     fn class_nested_class_negate() {
2406         assert_eq!(p(r"[^[\d]]"), Expr::Class(class(PERLD).negate()));
2407         assert_eq!(p(r"[[^\d]]"), Expr::Class(class(PERLD).negate()));
2408         assert_eq!(p(r"[^[^\d]]"), Expr::Class(class(PERLD)));
2409         assert_eq!(p(r"[^[\w]]"), Expr::Class(class(PERLW).negate()));
2410         assert_eq!(p(r"[[^\w]]"), Expr::Class(class(PERLW).negate()));
2411         assert_eq!(p(r"[^[^\w]]"), Expr::Class(class(PERLW)));
2412         assert_eq!(p(r"[a-b[^c]]"),
2413                    Expr::Class(class(&[('\u{0}', 'b'), ('d', '\u{10FFFF}')])));
2414 
2415         assert_eq!(pb(r"(?-u)[^[\d]]"),
2416                    Expr::ClassBytes(asciid_bytes().negate()));
2417         assert_eq!(pb(r"(?-u)[[^\d]]"),
2418                    Expr::ClassBytes(asciid_bytes().negate()));
2419         assert_eq!(pb(r"(?-u)[^[^\d]]"),
2420                    Expr::ClassBytes(asciid_bytes()));
2421         assert_eq!(pb(r"(?-u)[^[\w]]"),
2422                    Expr::ClassBytes(asciiw_bytes().negate()));
2423         assert_eq!(pb(r"(?-u)[[^\w]]"),
2424                    Expr::ClassBytes(asciiw_bytes().negate()));
2425         assert_eq!(pb(r"(?-u)[^[^\w]]"),
2426                    Expr::ClassBytes(asciiw_bytes()));
2427         assert_eq!(pb(r"(?-u)[a-b[^c]]"),
2428                    Expr::ClassBytes(bclass(&[(b'\x00', b'b'), (b'd', b'\xFF')])))
2429     }
2430 
2431     #[test]
class_nested_class_negate_casei()2432     fn class_nested_class_negate_casei() {
2433         assert_eq!(p(r"(?i)[^[\d]]"),
2434                    Expr::Class(class(PERLD).case_fold().negate()));
2435         assert_eq!(p(r"(?i)[[^\d]]"),
2436                    Expr::Class(class(PERLD).case_fold().negate()));
2437         assert_eq!(p(r"(?i)[^[^\d]]"),
2438                    Expr::Class(class(PERLD).case_fold()));
2439         assert_eq!(p(r"(?i)[^[\w]]"),
2440                    Expr::Class(class(PERLW).case_fold().negate()));
2441         assert_eq!(p(r"(?i)[[^\w]]"),
2442                    Expr::Class(class(PERLW).case_fold().negate()));
2443         assert_eq!(p(r"(?i)[^[^\w]]"),
2444                    Expr::Class(class(PERLW).case_fold()));
2445         let mut cls = CharClass::empty().negate();
2446         cls.remove('c');
2447         cls.remove('C');
2448         assert_eq!(p(r"(?i)[a-b[^c]]"), Expr::Class(cls));
2449 
2450         assert_eq!(pb(r"(?i-u)[^[\d]]"),
2451                    Expr::ClassBytes(asciid_bytes().case_fold().negate()));
2452         assert_eq!(pb(r"(?i-u)[[^\d]]"),
2453                    Expr::ClassBytes(asciid_bytes().case_fold().negate()));
2454         assert_eq!(pb(r"(?i-u)[^[^\d]]"),
2455                    Expr::ClassBytes(asciid_bytes().case_fold()));
2456         assert_eq!(pb(r"(?i-u)[^[\w]]"),
2457                    Expr::ClassBytes(asciiw_bytes().case_fold().negate()));
2458         assert_eq!(pb(r"(?i-u)[[^\w]]"),
2459                    Expr::ClassBytes(asciiw_bytes().case_fold().negate()));
2460         assert_eq!(pb(r"(?i-u)[^[^\w]]"),
2461                    Expr::ClassBytes(asciiw_bytes().case_fold()));
2462         let mut bytes = ByteClass::new(vec![]).negate();
2463         bytes.remove(b'c');
2464         bytes.remove(b'C');
2465         assert_eq!(pb(r"(?i-u)[a-b[^c]]"), Expr::ClassBytes(bytes));
2466     }
2467 
2468     #[test]
class_nested_class_brackets_hyphen()2469     fn class_nested_class_brackets_hyphen() {
2470         // This is confusing, but `]` is allowed if first character within a class
2471         // It parses as a nested class with the `]` and `-` characters
2472         assert_eq!(p(r"[[]-]]"), Expr::Class(class(&[('-', '-'), (']', ']')])));
2473         assert_eq!(p(r"[[\[]]"), Expr::Class(class(&[('[', '[')])));
2474         assert_eq!(p(r"[[\]]]"), Expr::Class(class(&[(']', ']')])));
2475     }
2476 
2477     #[test]
class_nested_class_deep_nesting()2478     fn class_nested_class_deep_nesting() {
2479         // Makes sure that implementation can handle deep nesting.
2480         // With recursive parsing, this regex would blow the stack size.
2481         use std::iter::repeat;
2482         let nesting = 10_000;
2483         let open: String = repeat("[").take(nesting).collect();
2484         let close: String = repeat("]").take(nesting).collect();
2485         let s  = format!("{}a{}", open, close);
2486         assert_eq!(p(&s), Expr::Class(class(&[('a', 'a')])));
2487     }
2488 
2489     #[test]
class_intersection_ranges()2490     fn class_intersection_ranges() {
2491         assert_eq!(p(r"[abc&&b-c]"), Expr::Class(class(&[('b', 'c')])));
2492         assert_eq!(p(r"[abc&&[b-c]]"), Expr::Class(class(&[('b', 'c')])));
2493         assert_eq!(p(r"[[abc]&&[b-c]]"), Expr::Class(class(&[('b', 'c')])));
2494         assert_eq!(p(r"[a-z&&b-y&&c-x]"), Expr::Class(class(&[('c', 'x')])));
2495         assert_eq!(p(r"[c-da-b&&a-d]"), Expr::Class(class(&[('a', 'd')])));
2496         assert_eq!(p(r"[a-d&&c-da-b]"), Expr::Class(class(&[('a', 'd')])));
2497 
2498         assert_eq!(pb(r"(?-u)[abc&&b-c]"),
2499                    Expr::ClassBytes(bclass(&[(b'b', b'c')])));
2500         assert_eq!(pb(r"(?-u)[abc&&[b-c]]"),
2501                    Expr::ClassBytes(bclass(&[(b'b', b'c')])));
2502         assert_eq!(pb(r"(?-u)[[abc]&&[b-c]]"),
2503                    Expr::ClassBytes(bclass(&[(b'b', b'c')])));
2504         assert_eq!(pb(r"(?-u)[a-z&&b-y&&c-x]"),
2505                    Expr::ClassBytes(bclass(&[(b'c', b'x')])));
2506         assert_eq!(pb(r"(?-u)[c-da-b&&a-d]"),
2507                    Expr::ClassBytes(bclass(&[(b'a', b'd')])));
2508     }
2509 
2510     #[test]
class_intersection_ranges_casei()2511     fn class_intersection_ranges_casei() {
2512         assert_eq!(p(r"(?i)[abc&&b-c]"),
2513                    Expr::Class(class(&[('b', 'c')]).case_fold()));
2514         assert_eq!(p(r"(?i)[abc&&[b-c]]"),
2515                    Expr::Class(class(&[('b', 'c')]).case_fold()));
2516         assert_eq!(p(r"(?i)[[abc]&&[b-c]]"),
2517                    Expr::Class(class(&[('b', 'c')]).case_fold()));
2518         assert_eq!(p(r"(?i)[a-z&&b-y&&c-x]"),
2519                    Expr::Class(class(&[('c', 'x')]).case_fold()));
2520         assert_eq!(p(r"(?i)[c-da-b&&a-d]"),
2521                    Expr::Class(class(&[('a', 'd')]).case_fold()));
2522 
2523         assert_eq!(pb(r"(?i-u)[abc&&b-c]"),
2524                    Expr::ClassBytes(bclass(&[(b'b', b'c')]).case_fold()));
2525         assert_eq!(pb(r"(?i-u)[abc&&[b-c]]"),
2526                    Expr::ClassBytes(bclass(&[(b'b', b'c')]).case_fold()));
2527         assert_eq!(pb(r"(?i-u)[[abc]&&[b-c]]"),
2528                    Expr::ClassBytes(bclass(&[(b'b', b'c')]).case_fold()));
2529         assert_eq!(pb(r"(?i-u)[a-z&&b-y&&c-x]"),
2530                    Expr::ClassBytes(bclass(&[(b'c', b'x')]).case_fold()));
2531         assert_eq!(pb(r"(?i-u)[c-da-b&&a-d]"),
2532                    Expr::ClassBytes(bclass(&[(b'a', b'd')]).case_fold()));
2533     }
2534 
2535     #[test]
class_intersection_classes()2536     fn class_intersection_classes() {
2537         assert_eq!(p(r"[\w&&\d]"), Expr::Class(class(PERLD)));
2538         assert_eq!(p(r"[\w&&[[:ascii:]]]"), Expr::Class(asciiw()));
2539         assert_eq!(p(r"[\x00-\xFF&&\pZ]"),
2540                    Expr::Class(class(&[('\u{20}', '\u{20}'), ('\u{a0}', '\u{a0}')])));
2541 
2542         assert_eq!(pb(r"(?-u)[\w&&\d]"), Expr::ClassBytes(asciid_bytes()));
2543         assert_eq!(pb(r"(?-u)[\w&&[[:ascii:]]]"), Expr::ClassBytes(asciiw_bytes()));
2544     }
2545 
2546     #[test]
class_intersection_classes_casei()2547     fn class_intersection_classes_casei() {
2548         assert_eq!(p(r"(?i)[\w&&\d]"), Expr::Class(class(PERLD).case_fold()));
2549         assert_eq!(p(r"(?i)[\w&&[[:ascii:]]]"), Expr::Class(asciiw().case_fold()));
2550         assert_eq!(p(r"(?i)[\x00-\xFF&&\pZ]"),
2551                    Expr::Class(class(&[('\u{20}', '\u{20}'), ('\u{a0}', '\u{a0}')])));
2552 
2553         assert_eq!(pb(r"(?i-u)[\w&&\d]"), Expr::ClassBytes(asciid_bytes().case_fold()));
2554         assert_eq!(pb(r"(?i-u)[\w&&[[:ascii:]]]"), Expr::ClassBytes(asciiw_bytes().case_fold()));
2555     }
2556 
2557     #[test]
class_intersection_negate()2558     fn class_intersection_negate() {
2559         assert_eq!(p(r"[^\w&&\d]"), Expr::Class(class(PERLD).negate()));
2560         assert_eq!(p(r"[^[\w&&\d]]"), Expr::Class(class(PERLD).negate()));
2561         assert_eq!(p(r"[^[^\w&&\d]]"), Expr::Class(class(PERLD)));
2562         assert_eq!(p(r"[\w&&[^\d]]"),
2563                    Expr::Class(class(PERLW).intersection(&class(PERLD).negate())));
2564         assert_eq!(p(r"[[^\w]&&[^\d]]"),
2565                    Expr::Class(class(PERLW).negate()));
2566 
2567         assert_eq!(pb(r"(?-u)[^\w&&\d]"),
2568                    Expr::ClassBytes(asciid_bytes().negate()));
2569         assert_eq!(pb(r"(?-u)[^[\w&&\d]]"),
2570                    Expr::ClassBytes(asciid_bytes().negate()));
2571         assert_eq!(pb(r"(?-u)[^[^\w&&\d]]"),
2572                    Expr::ClassBytes(asciid_bytes()));
2573         assert_eq!(pb(r"(?-u)[\w&&[^\d]]"),
2574                    Expr::ClassBytes(asciiw().intersection(&asciid().negate()).to_byte_class()));
2575         assert_eq!(pb(r"(?-u)[[^\w]&&[^\d]]"),
2576                    Expr::ClassBytes(asciiw_bytes().negate()));
2577     }
2578 
2579     #[test]
class_intersection_negate_casei()2580     fn class_intersection_negate_casei() {
2581         assert_eq!(p(r"(?i)[^\w&&a-z]"),
2582                    Expr::Class(class(&[('a', 'z')]).case_fold().negate()));
2583         assert_eq!(p(r"(?i)[^[\w&&a-z]]"),
2584                    Expr::Class(class(&[('a', 'z')]).case_fold().negate()));
2585         assert_eq!(p(r"(?i)[^[^\w&&a-z]]"),
2586                    Expr::Class(class(&[('a', 'z')]).case_fold()));
2587         assert_eq!(p(r"(?i)[\w&&[^a-z]]"),
2588                    Expr::Class(
2589                        class(PERLW).intersection(&class(&[('a', 'z')])
2590                        .case_fold().negate())));
2591         assert_eq!(p(r"(?i)[[^\w]&&[^a-z]]"),
2592                    Expr::Class(class(PERLW).negate()));
2593 
2594         assert_eq!(pb(r"(?i-u)[^\w&&a-z]"),
2595                    Expr::ClassBytes(bclass(&[(b'a', b'z')]).case_fold().negate()));
2596         assert_eq!(pb(r"(?i-u)[^[\w&&a-z]]"),
2597                    Expr::ClassBytes(bclass(&[(b'a', b'z')]).case_fold().negate()));
2598         assert_eq!(pb(r"(?i-u)[^[^\w&&a-z]]"),
2599                    Expr::ClassBytes(bclass(&[(b'a', b'z')]).case_fold()));
2600         assert_eq!(pb(r"(?i-u)[\w&&[^a-z]]"),
2601                    Expr::ClassBytes(bclass(&[(b'0', b'9'), (b'_', b'_')])));
2602         assert_eq!(pb(r"(?i-u)[[^\w]&&[^a-z]]"),
2603                    Expr::ClassBytes(asciiw_bytes().negate()));
2604     }
2605 
2606     #[test]
class_intersection_caret()2607     fn class_intersection_caret() {
2608         // In `[a^]`, `^` does not need to be escaped, so it makes sense that
2609         // `^` is also allowed to be unescaped after `&&`.
2610         assert_eq!(p(r"[\^&&^]"), Expr::Class(class(&[('^', '^')])));
2611     }
2612 
2613     #[test]
class_intersection_brackets_hyphen()2614     fn class_intersection_brackets_hyphen() {
2615         // `]` needs to be escaped after `&&` because it is not at the start of the class.
2616         assert_eq!(p(r"[]&&\]]"), Expr::Class(class(&[(']', ']')])));
2617 
2618         assert_eq!(p(r"[-&&-]"), Expr::Class(class(&[('-', '-')])));
2619     }
2620 
2621     #[test]
class_intersection_ampersand()2622     fn class_intersection_ampersand() {
2623         // Unescaped `&` after `&&`
2624         assert_eq!(p(r"[\&&&&]"), Expr::Class(class(&[('&', '&')])));
2625         assert_eq!(p(r"[\&&&\&]"), Expr::Class(class(&[('&', '&')])));
2626     }
2627 
2628     #[test]
class_intersection_precedence()2629     fn class_intersection_precedence() {
2630         assert_eq!(p(r"[a-w&&[^c-g]z]"), Expr::Class(class(&[('a', 'b'), ('h', 'w')])));
2631     }
2632 
2633     #[test]
class_special_escaped_set_chars()2634     fn class_special_escaped_set_chars() {
2635         // These tests ensure that some special characters require escaping
2636         // for use in character classes. The intention is to use these
2637         // characters to implement sets as described in UTS#18 RL1.3. Once
2638         // that's done, these tests should be removed and replaced with others.
2639         assert_eq!(p(r"[\[]"), Expr::Class(class(&[('[', '[')])));
2640         assert_eq!(p(r"[&]"), Expr::Class(class(&[('&', '&')])));
2641         assert_eq!(p(r"[\&]"), Expr::Class(class(&[('&', '&')])));
2642         assert_eq!(p(r"[\&\&]"), Expr::Class(class(&[('&', '&')])));
2643         assert_eq!(p(r"[\x00-&]"), Expr::Class(class(&[('\u{0}', '&')])));
2644         assert_eq!(p(r"[&-\xFF]"), Expr::Class(class(&[('&', '\u{FF}')])));
2645 
2646         assert_eq!(p(r"[~]"), Expr::Class(class(&[('~', '~')])));
2647         assert_eq!(p(r"[\~]"), Expr::Class(class(&[('~', '~')])));
2648         assert_eq!(p(r"[\~\~]"), Expr::Class(class(&[('~', '~')])));
2649         assert_eq!(p(r"[\x00-~]"), Expr::Class(class(&[('\u{0}', '~')])));
2650         assert_eq!(p(r"[~-\xFF]"), Expr::Class(class(&[('~', '\u{FF}')])));
2651 
2652         assert_eq!(p(r"[+-\-]"), Expr::Class(class(&[('+', '-')])));
2653         assert_eq!(p(r"[a-a\--\xFF]"), Expr::Class(class(&[
2654             ('-', '\u{FF}'),
2655         ])));
2656     }
2657 
2658     #[test]
class_overlapping()2659     fn class_overlapping() {
2660         assert_eq!(p("[a-fd-h]"), Expr::Class(class(&[('a', 'h')])));
2661         assert_eq!(p("[a-fg-m]"), Expr::Class(class(&[('a', 'm')])));
2662 
2663         assert_eq!(pb("(?-u)[a-fd-h]"),
2664                    Expr::ClassBytes(bclass(&[(b'a', b'h')])));
2665         assert_eq!(pb("(?-u)[a-fg-m]"),
2666                    Expr::ClassBytes(bclass(&[(b'a', b'm')])));
2667     }
2668 
2669     #[test]
ascii_classes()2670     fn ascii_classes() {
2671         assert_eq!(p("[:blank:]"), Expr::Class(class(&[
2672             (':', ':'), ('a', 'b'), ('k', 'l'), ('n', 'n'),
2673         ])));
2674         assert_eq!(p("[[:upper:]]"), Expr::Class(class(UPPER)));
2675 
2676         assert_eq!(pb("(?-u)[[:upper:]]"),
2677                    Expr::ClassBytes(class(UPPER).to_byte_class()));
2678     }
2679 
2680     #[test]
ascii_classes_not()2681     fn ascii_classes_not() {
2682         assert_eq!(p("[:abc:]"),
2683                    Expr::Class(class(&[(':', ':'), ('a', 'c')])));
2684         assert_eq!(pb("(?-u)[:abc:]"),
2685                    Expr::ClassBytes(bclass(&[(b':', b':'), (b'a', b'c')])));
2686     }
2687 
2688     #[test]
ascii_classes_multiple()2689     fn ascii_classes_multiple() {
2690         assert_eq!(p("[[:lower:][:upper:]]"),
2691                    Expr::Class(classes(&[UPPER, LOWER])));
2692 
2693         assert_eq!(pb("(?-u)[[:lower:][:upper:]]"),
2694                    Expr::ClassBytes(classes(&[UPPER, LOWER]).to_byte_class()));
2695     }
2696 
2697     #[test]
ascii_classes_negate()2698     fn ascii_classes_negate() {
2699         assert_eq!(p("[[:^upper:]]"), Expr::Class(class(UPPER).negate()));
2700         assert_eq!(p("[^[:^upper:]]"), Expr::Class(class(UPPER)));
2701 
2702         assert_eq!(pb("(?-u)[[:^upper:]]"),
2703                    Expr::ClassBytes(class(UPPER).to_byte_class().negate()));
2704         assert_eq!(pb("(?-u)[^[:^upper:]]"),
2705                    Expr::ClassBytes(class(UPPER).to_byte_class()));
2706     }
2707 
2708     #[test]
ascii_classes_negate_multiple()2709     fn ascii_classes_negate_multiple() {
2710         let (nlower, nword) = (class(LOWER).negate(), class(WORD).negate());
2711         let cls = CharClass::empty().merge(nlower).merge(nword);
2712         assert_eq!(p("[[:^lower:][:^word:]]"), Expr::Class(cls.clone()));
2713         assert_eq!(p("[^[:^lower:][:^word:]]"), Expr::Class(cls.negate()));
2714     }
2715 
2716     #[test]
ascii_classes_case_fold()2717     fn ascii_classes_case_fold() {
2718         assert_eq!(p("(?i)[[:upper:]]"),
2719                    Expr::Class(class(UPPER).case_fold()));
2720 
2721         assert_eq!(pb("(?i-u)[[:upper:]]"),
2722                    Expr::ClassBytes(class(UPPER).to_byte_class().case_fold()));
2723     }
2724 
2725     #[test]
ascii_classes_negate_case_fold()2726     fn ascii_classes_negate_case_fold() {
2727         assert_eq!(p("(?i)[[:^upper:]]"),
2728                    Expr::Class(class(UPPER).case_fold().negate()));
2729         assert_eq!(p("(?i)[^[:^upper:]]"),
2730                    Expr::Class(class(UPPER).case_fold()));
2731 
2732         assert_eq!(pb("(?i-u)[[:^upper:]]"),
2733                    Expr::ClassBytes(
2734                        class(UPPER).to_byte_class().case_fold().negate()));
2735         assert_eq!(pb("(?i-u)[^[:^upper:]]"),
2736                    Expr::ClassBytes(class(UPPER).to_byte_class().case_fold()));
2737     }
2738 
2739     #[test]
single_class_negate_case_fold()2740     fn single_class_negate_case_fold() {
2741         assert_eq!(p("(?i)[^x]"),
2742                    Expr::Class(class(&[('x', 'x')]).case_fold().negate()));
2743 
2744         assert_eq!(pb("(?i-u)[^x]"),
2745                    Expr::ClassBytes(
2746                        class(&[('x', 'x')])
2747                        .to_byte_class().case_fold().negate()));
2748     }
2749 
2750     #[test]
ignore_space_empty()2751     fn ignore_space_empty() {
2752         assert_eq!(p("(?x) "), Expr::Empty);
2753     }
2754 
2755     #[test]
ignore_space_literal()2756     fn ignore_space_literal() {
2757         assert_eq!(p("(?x) a b c"), Expr::Concat(vec![
2758             lit('a'), lit('b'), lit('c'),
2759         ]));
2760     }
2761 
2762     #[test]
ignore_space_literal_off()2763     fn ignore_space_literal_off() {
2764         assert_eq!(p("(?x) a b c(?-x) a"), Expr::Concat(vec![
2765             lit('a'), lit('b'), lit('c'), lit(' '), lit('a'),
2766         ]));
2767     }
2768 
2769     #[test]
ignore_space_class()2770     fn ignore_space_class() {
2771         assert_eq!(p("(?x)[a
2772         - z
2773 ]"), Expr::Class(class(&[('a', 'z')])));
2774         assert_eq!(p("(?x)[  ^   a
2775         - z
2776 ]"), Expr::Class(class(&[('a', 'z')]).negate()));
2777     }
2778 
2779     #[test]
ignore_space_escape_octal()2780     fn ignore_space_escape_octal() {
2781         assert_eq!(p(r"(?x)\12 3"), Expr::Concat(vec![
2782             lit('\n'),
2783             lit('3'),
2784         ]));
2785     }
2786 
2787     #[test]
ignore_space_escape_hex()2788     fn ignore_space_escape_hex() {
2789         assert_eq!(p(r"(?x)\x { 53 }"), lit('S'));
2790         assert_eq!(p(r"(?x)\x # comment
2791 { # comment
2792     53 # comment
2793 } # comment"), lit('S'));
2794     }
2795 
2796     #[test]
ignore_space_escape_hex2()2797     fn ignore_space_escape_hex2() {
2798         assert_eq!(p(r"(?x)\x 53"), lit('S'));
2799         assert_eq!(p(r"(?x)\x # comment
2800         53 # comment"), lit('S'));
2801     }
2802 
2803     #[test]
ignore_space_escape_unicode_name()2804     fn ignore_space_escape_unicode_name() {
2805         assert_eq!(p(r"(?x)\p # comment
2806 { # comment
2807     Yi # comment
2808 } # comment"), Expr::Class(class(YI)));
2809     }
2810 
2811     #[test]
ignore_space_repeat_counted()2812     fn ignore_space_repeat_counted() {
2813         assert_eq!(p("(?x)a # comment
2814 { # comment
2815     5 # comment
2816     , # comment
2817     10 # comment
2818 }"), Expr::Repeat {
2819             e: b(lit('a')),
2820             r: Repeater::Range { min: 5, max: Some(10) },
2821             greedy: true,
2822         });
2823     }
2824 
2825     #[test]
ignore_space_comments()2826     fn ignore_space_comments() {
2827         assert_eq!(p(r"(?x)(?P<foo>
2828     a # comment 1
2829 )(?P<bar>
2830     z # comment 2
2831 )"), Expr::Concat(vec![
2832         Expr::Group {
2833             e: Box::new(lit('a')),
2834             i: Some(1),
2835             name: Some("foo".into()),
2836         },
2837         Expr::Group {
2838             e: Box::new(lit('z')),
2839             i: Some(2),
2840             name: Some("bar".into()),
2841         },
2842     ]));
2843     }
2844 
2845     #[test]
ignore_space_comments_re_enable()2846     fn ignore_space_comments_re_enable() {
2847         assert_eq!(p(r"(?x)a # hi
2848 (?-x:#) # sweet"), Expr::Concat(vec![
2849             lit('a'),
2850             Expr::Group {
2851                 e: Box::new(lit('#')),
2852                 i: None,
2853                 name: None,
2854             },
2855         ]));
2856     }
2857 
2858     #[test]
ignore_space_escape_punctuation()2859     fn ignore_space_escape_punctuation() {
2860         assert_eq!(p(r"(?x)\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#"), c(&[
2861             lit('\\'), lit('.'), lit('+'), lit('*'), lit('?'),
2862             lit('('), lit(')'), lit('|'), lit('['), lit(']'),
2863             lit('{'), lit('}'), lit('^'), lit('$'), lit('#'),
2864         ]));
2865     }
2866 
2867     #[test]
ignore_space_escape_hash()2868     fn ignore_space_escape_hash() {
2869         assert_eq!(p(r"(?x)a\# # hi there"), Expr::Concat(vec![
2870             lit('a'),
2871             lit('#'),
2872         ]));
2873     }
2874 
2875     #[test]
ignore_space_escape_space()2876     fn ignore_space_escape_space() {
2877         assert_eq!(p(r"(?x)a\  # hi there"), Expr::Concat(vec![
2878             lit('a'),
2879             lit(' '),
2880         ]));
2881     }
2882 
2883     // Test every single possible error case.
2884 
2885     macro_rules! test_err {
2886         ($re:expr, $pos:expr, $kind:expr) => {
2887             test_err!($re, $pos, $kind, Flags::default());
2888         };
2889         ($re:expr, $pos:expr, $kind:expr, $flags:expr) => {{
2890             let err = Parser::parse($re, $flags).unwrap_err();
2891             assert_eq!($pos, err.pos);
2892             assert_eq!($kind, err.kind);
2893             assert!($re.contains(&err.surround));
2894         }}
2895     }
2896 
2897     #[test]
invalid_utf8_not_allowed()2898     fn invalid_utf8_not_allowed() {
2899         // let flags = Flags { unicode: false, .. Flags::default() };
2900         test_err!(r"(?-u)\xFF", 9, ErrorKind::InvalidUtf8);
2901         test_err!(r"(?-u).", 5, ErrorKind::InvalidUtf8);
2902         test_err!(r"(?-u)(?s).", 9, ErrorKind::InvalidUtf8);
2903         test_err!(r"(?-u)[\x00-\x80]", 15, ErrorKind::InvalidUtf8);
2904         test_err!(r"(?-u)\222", 9, ErrorKind::InvalidUtf8);
2905         test_err!(r"(?-u)\x{0080}", 13, ErrorKind::InvalidUtf8);
2906     }
2907 
2908     #[test]
unicode_char_not_allowed()2909     fn unicode_char_not_allowed() {
2910         let flags = Flags { allow_bytes: true, .. Flags::default() };
2911         test_err!("☃(?-u:☃)", 7, ErrorKind::UnicodeNotAllowed, flags);
2912     }
2913 
2914     #[test]
unicode_class_not_allowed()2915     fn unicode_class_not_allowed() {
2916         let flags = Flags { allow_bytes: true, .. Flags::default() };
2917         test_err!(r"☃(?-u:\pL)", 9, ErrorKind::UnicodeNotAllowed, flags);
2918     }
2919 
2920     #[test]
unicode_class_literal_not_allowed()2921     fn unicode_class_literal_not_allowed() {
2922         let flags = Flags { allow_bytes: true, .. Flags::default() };
2923         test_err!(r"(?-u)[☃]", 6, ErrorKind::UnicodeNotAllowed, flags);
2924         test_err!(r"(?-u)[☃-☃]", 6, ErrorKind::UnicodeNotAllowed, flags);
2925     }
2926 
2927     #[test]
unicode_hex_not_allowed()2928     fn unicode_hex_not_allowed() {
2929         let flags = Flags { allow_bytes: true, .. Flags::default() };
2930         test_err!(r"(?-u)\x{FFFF}", 13, ErrorKind::UnicodeNotAllowed, flags);
2931         test_err!(r"(?-u)\x{100}", 12, ErrorKind::UnicodeNotAllowed, flags);
2932     }
2933 
2934     #[test]
unicode_octal_not_allowed()2935     fn unicode_octal_not_allowed() {
2936         let flags = Flags { allow_bytes: true, .. Flags::default() };
2937         test_err!(r"(?-u)\400", 9, ErrorKind::UnicodeNotAllowed, flags);
2938     }
2939 
2940     #[test]
error_repeat_no_expr_simple()2941     fn error_repeat_no_expr_simple() {
2942         test_err!("(*", 1, ErrorKind::RepeaterExpectsExpr);
2943     }
2944 
2945     #[test]
error_repeat_no_expr_counted()2946     fn error_repeat_no_expr_counted() {
2947         test_err!("({5}", 1, ErrorKind::RepeaterExpectsExpr);
2948     }
2949 
2950     #[test]
error_repeat_beginning_counted()2951     fn error_repeat_beginning_counted() {
2952         test_err!("{5}", 0, ErrorKind::RepeaterExpectsExpr);
2953     }
2954 
2955     #[test]
error_repeat_illegal_exprs_simple()2956     fn error_repeat_illegal_exprs_simple() {
2957         test_err!("a**", 2, ErrorKind::RepeaterUnexpectedExpr(Expr::Repeat {
2958             e: b(lit('a')),
2959             r: Repeater::ZeroOrMore,
2960             greedy: true,
2961         }));
2962         test_err!("a|*", 2,
2963             ErrorKind::RepeaterUnexpectedExpr(Expr::Alternate(vec![lit('a')]))
2964         );
2965     }
2966 
2967     #[test]
error_repeat_illegal_exprs_counted()2968     fn error_repeat_illegal_exprs_counted() {
2969         test_err!("a*{5}", 2, ErrorKind::RepeaterUnexpectedExpr(Expr::Repeat {
2970             e: b(lit('a')),
2971             r: Repeater::ZeroOrMore,
2972             greedy: true,
2973         }));
2974         test_err!("a|{5}", 2,
2975             ErrorKind::RepeaterUnexpectedExpr(Expr::Alternate(vec![lit('a')]))
2976         );
2977     }
2978 
2979     #[test]
error_repeat_empty_number()2980     fn error_repeat_empty_number() {
2981         test_err!("a{}", 2, ErrorKind::MissingBase10);
2982     }
2983 
2984     #[test]
error_repeat_eof()2985     fn error_repeat_eof() {
2986         test_err!("a{5", 3, ErrorKind::UnclosedRepeat);
2987     }
2988 
2989     #[test]
error_repeat_empty_number_eof()2990     fn error_repeat_empty_number_eof() {
2991         test_err!("a{xyz", 5, ErrorKind::InvalidBase10("xyz".into()));
2992         test_err!("a{12,xyz", 8, ErrorKind::InvalidBase10("xyz".into()));
2993     }
2994 
2995     #[test]
error_repeat_invalid_number()2996     fn error_repeat_invalid_number() {
2997         test_err!("a{9999999999}", 12,
2998                   ErrorKind::InvalidBase10("9999999999".into()));
2999         test_err!("a{1,9999999999}", 14,
3000                   ErrorKind::InvalidBase10("9999999999".into()));
3001     }
3002 
3003     #[test]
error_repeat_invalid_number_extra()3004     fn error_repeat_invalid_number_extra() {
3005         test_err!("a{12x}", 5, ErrorKind::InvalidBase10("12x".into()));
3006         test_err!("a{1,12x}", 7, ErrorKind::InvalidBase10("12x".into()));
3007     }
3008 
3009     #[test]
error_repeat_invalid_range()3010     fn error_repeat_invalid_range() {
3011         test_err!("a{2,1}", 5,
3012                   ErrorKind::InvalidRepeatRange { min: 2, max: 1 });
3013     }
3014 
3015     #[test]
error_alternate_empty()3016     fn error_alternate_empty() {
3017         test_err!("|a", 0, ErrorKind::EmptyAlternate);
3018     }
3019 
3020     #[test]
error_alternate_empty_with_group()3021     fn error_alternate_empty_with_group() {
3022         test_err!("(|a)", 1, ErrorKind::EmptyAlternate);
3023     }
3024 
3025     #[test]
error_alternate_empty_with_alternate()3026     fn error_alternate_empty_with_alternate() {
3027         test_err!("a||", 2, ErrorKind::EmptyAlternate);
3028     }
3029 
3030     #[test]
error_close_paren_unopened_empty()3031     fn error_close_paren_unopened_empty() {
3032         test_err!(")", 0, ErrorKind::UnopenedParen);
3033     }
3034 
3035     #[test]
error_close_paren_unopened()3036     fn error_close_paren_unopened() {
3037         test_err!("ab)", 2, ErrorKind::UnopenedParen);
3038     }
3039 
3040     #[test]
error_close_paren_unopened_with_alt()3041     fn error_close_paren_unopened_with_alt() {
3042         test_err!("a|b)", 3, ErrorKind::UnopenedParen);
3043     }
3044 
3045     #[test]
error_close_paren_unclosed_with_alt()3046     fn error_close_paren_unclosed_with_alt() {
3047         test_err!("(a|b", 0, ErrorKind::UnclosedParen);
3048     }
3049 
3050     #[test]
error_close_paren_empty_alt()3051     fn error_close_paren_empty_alt() {
3052         test_err!("(a|)", 3, ErrorKind::EmptyAlternate);
3053     }
3054 
3055     #[test]
error_close_paren_empty_group()3056     fn error_close_paren_empty_group() {
3057         test_err!("()", 1, ErrorKind::EmptyGroup);
3058     }
3059 
3060     #[test]
error_close_paren_empty_group_with_name()3061     fn error_close_paren_empty_group_with_name() {
3062         test_err!("(?P<foo>)", 8, ErrorKind::EmptyGroup);
3063     }
3064 
3065     #[test]
error_finish_concat_unclosed()3066     fn error_finish_concat_unclosed() {
3067         test_err!("ab(xy", 2, ErrorKind::UnclosedParen);
3068     }
3069 
3070     #[test]
error_finish_concat_empty_alt()3071     fn error_finish_concat_empty_alt() {
3072         test_err!("a|", 2, ErrorKind::EmptyAlternate);
3073     }
3074 
3075     #[test]
error_group_name_invalid()3076     fn error_group_name_invalid() {
3077         test_err!("(?P<a#>x)", 6, ErrorKind::InvalidCaptureName("a#".into()));
3078     }
3079 
3080     #[test]
error_group_name_invalid_leading()3081     fn error_group_name_invalid_leading() {
3082         test_err!("(?P<1a>a)", 6, ErrorKind::InvalidCaptureName("1a".into()));
3083     }
3084 
3085     #[test]
error_group_name_unexpected_eof()3086     fn error_group_name_unexpected_eof() {
3087         test_err!("(?P<a", 5, ErrorKind::UnclosedCaptureName("a".into()));
3088     }
3089 
3090     #[test]
error_group_name_empty()3091     fn error_group_name_empty() {
3092         test_err!("(?P<>a)", 4, ErrorKind::EmptyCaptureName);
3093     }
3094 
3095     #[test]
error_group_opts_unrecognized_flag()3096     fn error_group_opts_unrecognized_flag() {
3097         test_err!("(?z:a)", 2, ErrorKind::UnrecognizedFlag('z'));
3098     }
3099 
3100     #[test]
error_group_opts_unexpected_eof()3101     fn error_group_opts_unexpected_eof() {
3102         test_err!("(?i", 3, ErrorKind::UnexpectedFlagEof);
3103     }
3104 
3105     #[test]
error_group_opts_double_negation()3106     fn error_group_opts_double_negation() {
3107         test_err!("(?-i-s:a)", 4, ErrorKind::DoubleFlagNegation);
3108     }
3109 
3110     #[test]
error_group_opts_empty_negation()3111     fn error_group_opts_empty_negation() {
3112         test_err!("(?i-:a)", 4, ErrorKind::EmptyFlagNegation);
3113     }
3114 
3115     #[test]
error_group_opts_empty()3116     fn error_group_opts_empty() {
3117         test_err!("(?)", 2, ErrorKind::EmptyFlagNegation);
3118     }
3119 
3120     #[test]
error_escape_unexpected_eof()3121     fn error_escape_unexpected_eof() {
3122         test_err!(r"\", 1, ErrorKind::UnexpectedEscapeEof);
3123     }
3124 
3125     #[test]
error_escape_unrecognized()3126     fn error_escape_unrecognized() {
3127         test_err!(r"\m", 1, ErrorKind::UnrecognizedEscape('m'));
3128     }
3129 
3130     #[test]
error_escape_hex2_eof0()3131     fn error_escape_hex2_eof0() {
3132         test_err!(r"\x", 2, ErrorKind::UnexpectedTwoDigitHexEof);
3133     }
3134 
3135     #[test]
error_escape_hex2_eof1()3136     fn error_escape_hex2_eof1() {
3137         test_err!(r"\xA", 3, ErrorKind::UnexpectedTwoDigitHexEof);
3138     }
3139 
3140     #[test]
error_escape_hex2_invalid()3141     fn error_escape_hex2_invalid() {
3142         test_err!(r"\xAG", 4, ErrorKind::InvalidBase16("AG".into()));
3143     }
3144 
3145     #[test]
error_escape_hex_eof0()3146     fn error_escape_hex_eof0() {
3147         test_err!(r"\x{", 3, ErrorKind::InvalidBase16("".into()));
3148     }
3149 
3150     #[test]
error_escape_hex_eof1()3151     fn error_escape_hex_eof1() {
3152         test_err!(r"\x{A", 4, ErrorKind::UnclosedHex);
3153     }
3154 
3155     #[test]
error_escape_hex_invalid()3156     fn error_escape_hex_invalid() {
3157         test_err!(r"\x{AG}", 5, ErrorKind::InvalidBase16("AG".into()));
3158     }
3159 
3160     #[test]
error_escape_hex_invalid_scalar_value_surrogate()3161     fn error_escape_hex_invalid_scalar_value_surrogate() {
3162         test_err!(r"\x{D800}", 8, ErrorKind::InvalidScalarValue(0xD800));
3163     }
3164 
3165     #[test]
error_escape_hex_invalid_scalar_value_high()3166     fn error_escape_hex_invalid_scalar_value_high() {
3167         test_err!(r"\x{110000}", 10, ErrorKind::InvalidScalarValue(0x110000));
3168     }
3169 
3170     #[test]
error_escape_hex_invalid_u32()3171     fn error_escape_hex_invalid_u32() {
3172         test_err!(r"\x{9999999999}", 13,
3173                   ErrorKind::InvalidBase16("9999999999".into()));
3174     }
3175 
3176     #[test]
error_unicode_unclosed()3177     fn error_unicode_unclosed() {
3178         test_err!(r"\p{", 3, ErrorKind::UnclosedUnicodeName);
3179         test_err!(r"\p{Greek", 8, ErrorKind::UnclosedUnicodeName);
3180     }
3181 
3182     #[test]
error_unicode_no_letter()3183     fn error_unicode_no_letter() {
3184         test_err!(r"\p", 2, ErrorKind::UnexpectedEscapeEof);
3185     }
3186 
3187     #[test]
error_unicode_unknown_letter()3188     fn error_unicode_unknown_letter() {
3189         test_err!(r"\pA", 3, ErrorKind::UnrecognizedUnicodeClass("A".into()));
3190     }
3191 
3192     #[test]
error_unicode_unknown_name()3193     fn error_unicode_unknown_name() {
3194         test_err!(r"\p{Yii}", 7,
3195                   ErrorKind::UnrecognizedUnicodeClass("Yii".into()));
3196     }
3197 
3198     #[test]
error_class_eof_empty()3199     fn error_class_eof_empty() {
3200         test_err!("[", 1, ErrorKind::UnexpectedClassEof);
3201         test_err!("[^", 2, ErrorKind::UnexpectedClassEof);
3202     }
3203 
3204     #[test]
error_class_eof_non_empty()3205     fn error_class_eof_non_empty() {
3206         test_err!("[a", 2, ErrorKind::UnexpectedClassEof);
3207         test_err!("[^a", 3, ErrorKind::UnexpectedClassEof);
3208     }
3209 
3210     #[test]
error_class_eof_range()3211     fn error_class_eof_range() {
3212         test_err!("[a-", 3, ErrorKind::UnexpectedClassEof);
3213         test_err!("[^a-", 4, ErrorKind::UnexpectedClassEof);
3214         test_err!("[---", 4, ErrorKind::UnexpectedClassEof);
3215     }
3216 
3217     #[test]
error_class_invalid_escape()3218     fn error_class_invalid_escape() {
3219         test_err!(r"[\pA]", 4,
3220                   ErrorKind::UnrecognizedUnicodeClass("A".into()));
3221     }
3222 
3223     #[test]
error_class_valid_escape_not_allowed()3224     fn error_class_valid_escape_not_allowed() {
3225         test_err!(r"[\A]", 3, ErrorKind::InvalidClassEscape(Expr::StartText));
3226     }
3227 
3228     #[test]
error_class_range_valid_escape_not_allowed()3229     fn error_class_range_valid_escape_not_allowed() {
3230         test_err!(r"[a-\d]", 5,
3231                   ErrorKind::InvalidClassEscape(Expr::Class(class(PERLD))));
3232         test_err!(r"[a-\A]", 5,
3233                   ErrorKind::InvalidClassEscape(Expr::StartText));
3234         test_err!(r"[\A-a]", 3,
3235                   ErrorKind::InvalidClassEscape(Expr::StartText));
3236     }
3237 
3238     #[test]
error_class_invalid_range()3239     fn error_class_invalid_range() {
3240         test_err!("[z-a]", 4, ErrorKind::InvalidClassRange {
3241             start: 'z',
3242             end: 'a',
3243         });
3244     }
3245 
3246     #[test]
error_class_empty_range()3247     fn error_class_empty_range() {
3248         test_err!("[]", 2, ErrorKind::UnexpectedClassEof);
3249         test_err!("[^]", 3, ErrorKind::UnexpectedClassEof);
3250         test_err!(r"[^\d\D]", 7, ErrorKind::EmptyClass);
3251 
3252         let flags = Flags { allow_bytes: true, .. Flags::default() };
3253         test_err!(r"(?-u)[^\x00-\xFF]", 17, ErrorKind::EmptyClass, flags);
3254     }
3255 
3256     #[test]
error_class_unsupported_char()3257     fn error_class_unsupported_char() {
3258         // These tests ensure that some unescaped special characters are
3259         // rejected in character classes. The intention is to use these
3260         // characters to implement sets as described in UTS#18 RL1.3. Once
3261         // that's done, these tests should be removed and replaced with others.
3262         test_err!("[~~]", 2, ErrorKind::UnsupportedClassChar('~'));
3263         test_err!("[+--]", 4, ErrorKind::UnsupportedClassChar('-'));
3264         test_err!(r"[a-a--\xFF]", 5, ErrorKind::UnsupportedClassChar('-'));
3265         test_err!(r"[a&&~~]", 5, ErrorKind::UnsupportedClassChar('~'));
3266         test_err!(r"[a&&--]", 5, ErrorKind::UnsupportedClassChar('-'));
3267     }
3268 
3269     #[test]
error_class_nested_class()3270     fn error_class_nested_class() {
3271         test_err!(r"[[]]", 4, ErrorKind::UnexpectedClassEof);
3272         test_err!(r"[[][]]", 6, ErrorKind::UnexpectedClassEof);
3273         test_err!(r"[[^\d\D]]", 8, ErrorKind::EmptyClass);
3274         test_err!(r"[[]", 3, ErrorKind::UnexpectedClassEof);
3275         test_err!(r"[[^]", 4, ErrorKind::UnexpectedClassEof);
3276     }
3277 
3278     #[test]
error_class_intersection()3279     fn error_class_intersection() {
3280         test_err!(r"[&&]", 4, ErrorKind::EmptyClass);
3281         test_err!(r"[a&&]", 5, ErrorKind::EmptyClass);
3282         test_err!(r"[&&&&]", 6, ErrorKind::EmptyClass);
3283         // `]` after `&&` is not the same as in (`[]]`), because it's also not
3284         // allowed unescaped in `[a]]`.
3285         test_err!(r"[]&&]]", 5, ErrorKind::EmptyClass);
3286 
3287         let flags = Flags { allow_bytes: true, .. Flags::default() };
3288         test_err!(r"(?-u)[a&&\pZ]", 12, ErrorKind::UnicodeNotAllowed, flags);
3289     }
3290 
3291     #[test]
error_duplicate_capture_name()3292     fn error_duplicate_capture_name() {
3293         test_err!("(?P<a>.)(?P<a>.)", 14,
3294                   ErrorKind::DuplicateCaptureName("a".into()));
3295     }
3296 
3297     #[test]
error_ignore_space_escape_hex()3298     fn error_ignore_space_escape_hex() {
3299         test_err!(r"(?x)\x{ 5 3 }", 10, ErrorKind::UnclosedHex);
3300     }
3301 
3302     #[test]
error_ignore_space_escape_hex2()3303     fn error_ignore_space_escape_hex2() {
3304         test_err!(r"(?x)\x 5 3", 9, ErrorKind::InvalidBase16("5 ".into()));
3305     }
3306 
3307     #[test]
error_ignore_space_escape_unicode_name()3308     fn error_ignore_space_escape_unicode_name() {
3309         test_err!(r"(?x)\p{Y i}", 9, ErrorKind::UnclosedUnicodeName);
3310     }
3311 }
3312