1 // Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
4 //
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
10 
11 /*!
12 This crate provides a regular expression parser and an abstract syntax for
13 regular expressions. The abstract syntax is defined by the `Expr` type. The
14 concrete syntax is enumerated in the
15 [`regex`](../regex/index.html#syntax)
16 crate documentation.
17 
18 Note that since this crate is first and foremost an implementation detail for
19 the `regex` crate, it may experience more frequent breaking changes. It is
20 exposed as a separate crate so that others may use it to do analysis on regular
21 expressions or even build their own matching engine.
22 
23 # Example: parsing an expression
24 
25 Parsing a regular expression can be done with the `Expr::parse` function.
26 
27 ```rust
28 use regex_syntax::Expr;
29 
30 assert_eq!(Expr::parse(r"ab|yz").unwrap(), Expr::Alternate(vec![
31     Expr::Literal { chars: vec!['a', 'b'], casei: false },
32     Expr::Literal { chars: vec!['y', 'z'], casei: false },
33 ]));
34 ```
35 
36 # Example: inspecting an error
37 
38 The parser in this crate provides very detailed error values. For example,
39 if an invalid character class range is given:
40 
41 ```rust
42 use regex_syntax::{Expr, ErrorKind};
43 
44 let err = Expr::parse(r"[z-a]").unwrap_err();
45 assert_eq!(err.position(), 4);
46 assert_eq!(err.kind(), &ErrorKind::InvalidClassRange {
47     start: 'z',
48     end: 'a',
49 });
50 ```
51 
52 Or unbalanced parentheses:
53 
54 ```rust
55 use regex_syntax::{Expr, ErrorKind};
56 
57 let err = Expr::parse(r"ab(cd").unwrap_err();
58 assert_eq!(err.position(), 2);
59 assert_eq!(err.kind(), &ErrorKind::UnclosedParen);
60 ```
61 */
62 
63 #![deny(missing_docs)]
64 #![cfg_attr(test, deny(warnings))]
65 
66 #[cfg(test)] extern crate quickcheck;
67 #[cfg(test)] extern crate rand;
68 
69 mod literals;
70 mod parser;
71 mod unicode;
72 
73 use std::ascii;
74 use std::char;
75 use std::cmp::{Ordering, max, min};
76 use std::fmt;
77 use std::iter::IntoIterator;
78 use std::ops::Deref;
79 use std::result;
80 use std::slice;
81 use std::u8;
82 use std::vec;
83 
84 use unicode::case_folding;
85 
86 use self::Expr::*;
87 use self::Repeater::*;
88 
89 use parser::{Flags, Parser};
90 
91 pub use literals::{Literals, Lit};
92 
93 /// A regular expression abstract syntax tree.
94 ///
95 /// An `Expr` represents the abstract syntax of a regular expression.
96 #[derive(Clone, Debug, PartialEq, Eq)]
97 pub enum Expr {
98     /// An empty regex (which never matches any text).
99     Empty,
100     /// A sequence of one or more literal characters to be matched.
101     Literal {
102         /// The characters.
103         chars: Vec<char>,
104         /// Whether to match case insensitively.
105         casei: bool,
106     },
107     /// A sequence of one or more literal bytes to be matched.
108     LiteralBytes {
109         /// The bytes.
110         bytes: Vec<u8>,
111         /// Whether to match case insensitively.
112         ///
113         /// The interpretation of "case insensitive" in this context is
114         /// ambiguous since `bytes` can be arbitrary. However, a good heuristic
115         /// is to assume that the bytes are ASCII-compatible and do simple
116         /// ASCII case folding.
117         casei: bool,
118     },
119     /// Match any character.
120     AnyChar,
121     /// Match any character, excluding new line (`0xA`).
122     AnyCharNoNL,
123     /// Match any byte.
124     AnyByte,
125     /// Match any byte, excluding new line (`0xA`).
126     AnyByteNoNL,
127     /// A character class.
128     Class(CharClass),
129     /// A character class with byte ranges only.
130     ClassBytes(ByteClass),
131     /// Match the start of a line or beginning of input.
132     StartLine,
133     /// Match the end of a line or end of input.
134     EndLine,
135     /// Match the beginning of input.
136     StartText,
137     /// Match the end of input.
138     EndText,
139     /// Match a word boundary (word character on one side and a non-word
140     /// character on the other).
141     WordBoundary,
142     /// Match a position that is not a word boundary (word or non-word
143     /// characters on both sides).
144     NotWordBoundary,
145     /// Match an ASCII word boundary.
146     WordBoundaryAscii,
147     /// Match a position that is not an ASCII word boundary.
148     NotWordBoundaryAscii,
149     /// A group, possibly non-capturing.
150     Group {
151         /// The expression inside the group.
152         e: Box<Expr>,
153         /// The capture index (starting at `1`) only for capturing groups.
154         i: Option<usize>,
155         /// The capture name, only for capturing named groups.
156         name: Option<String>,
157     },
158     /// A repeat operator (`?`, `*`, `+` or `{m,n}`).
159     Repeat {
160         /// The expression to be repeated. Limited to literals, `.`, classes
161         /// or grouped expressions.
162         e: Box<Expr>,
163         /// The type of repeat operator used.
164         r: Repeater,
165         /// Whether the repeat is greedy (match the most) or not (match the
166         /// least).
167         greedy: bool,
168     },
169     /// A concatenation of expressions. Must be matched one after the other.
170     ///
171     /// N.B. A concat expression can only appear at the top-level or
172     /// immediately inside a group expression.
173     Concat(Vec<Expr>),
174     /// An alternation of expressions. Only one must match.
175     ///
176     /// N.B. An alternate expression can only appear at the top-level or
177     /// immediately inside a group expression.
178     Alternate(Vec<Expr>),
179 }
180 
181 type CaptureIndex = Option<usize>;
182 
183 type CaptureName = Option<String>;
184 
185 /// The type of a repeat operator expression.
186 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
187 pub enum Repeater {
188     /// Match zero or one (`?`).
189     ZeroOrOne,
190     /// Match zero or more (`*`).
191     ZeroOrMore,
192     /// Match one or more (`+`).
193     OneOrMore,
194     /// Match for at least `min` and at most `max` (`{m,n}`).
195     ///
196     /// When `max` is `None`, there is no upper bound on the number of matches.
197     Range {
198         /// Lower bound on the number of matches.
199         min: u32,
200         /// Optional upper bound on the number of matches.
201         max: Option<u32>,
202     },
203 }
204 
205 impl Repeater {
206     /// Returns true if and only if this repetition can match the empty string.
matches_empty(&self) -> bool207     fn matches_empty(&self) -> bool {
208         use self::Repeater::*;
209         match *self {
210             ZeroOrOne => true,
211             ZeroOrMore => true,
212             OneOrMore => false,
213             Range { min, .. } => min == 0,
214         }
215     }
216 }
217 
218 /// A character class.
219 ///
220 /// A character class has a canonical format that the parser guarantees. Its
221 /// canonical format is defined by the following invariants:
222 ///
223 /// 1. Given any Unicode scalar value, it is matched by *at most* one character
224 ///    range in a canonical character class.
225 /// 2. Every adjacent character range is separated by at least one Unicode
226 ///    scalar value.
227 /// 3. Given any pair of character ranges `r1` and `r2`, if
228 ///    `r1.end < r2.start`, then `r1` comes before `r2` in a canonical
229 ///    character class.
230 ///
231 /// In sum, any `CharClass` produced by this crate's parser is a sorted
232 /// sequence of non-overlapping ranges. This makes it possible to test whether
233 /// a character is matched by a class with a binary search.
234 ///
235 /// If the case insensitive flag was set when parsing a character class, then
236 /// simple case folding is done automatically. For example, `(?i)[a-c]` is
237 /// automatically translated to `[a-cA-C]`.
238 #[derive(Clone, Debug, PartialEq, Eq)]
239 pub struct CharClass {
240     ranges: Vec<ClassRange>,
241 }
242 
243 /// A single inclusive range in a character class.
244 ///
245 /// Since range boundaries are defined by Unicode scalar values, the boundaries
246 /// can never be in the open interval `(0xD7FF, 0xE000)`. However, a range may
247 /// *cover* codepoints that are not scalar values.
248 ///
249 /// Note that this has a few convenient impls on `PartialEq` and `PartialOrd`
250 /// for testing whether a character is contained inside a given range.
251 #[derive(Clone, Copy, Debug, PartialEq, PartialOrd, Eq, Ord)]
252 pub struct ClassRange {
253     /// The start character of the range.
254     ///
255     /// This must be less than or equal to `end`.
256     pub start: char,
257 
258     /// The end character of the range.
259     ///
260     /// This must be greater than or equal to `start`.
261     pub end: char,
262 }
263 
264 /// A byte class for byte ranges only.
265 ///
266 /// A byte class has a canonical format that the parser guarantees. Its
267 /// canonical format is defined by the following invariants:
268 ///
269 /// 1. Given any byte, it is matched by *at most* one byte range in a canonical
270 ///    character class.
271 /// 2. Every adjacent byte range is separated by at least one byte.
272 /// 3. Given any pair of byte ranges `r1` and `r2`, if
273 ///    `r1.end < r2.start`, then `r1` comes before `r2` in a canonical
274 ///    character class.
275 ///
276 /// In sum, any `ByteClass` produced by this crate's parser is a sorted
277 /// sequence of non-overlapping ranges. This makes it possible to test whether
278 /// a byte is matched by a class with a binary search.
279 ///
280 /// If the case insensitive flag was set when parsing a character class,
281 /// then simple ASCII-only case folding is done automatically. For example,
282 /// `(?i)[a-c]` is automatically translated to `[a-cA-C]`.
283 #[derive(Clone, Debug, PartialEq, Eq)]
284 pub struct ByteClass {
285     ranges: Vec<ByteRange>,
286 }
287 
288 /// A single inclusive range in a byte class.
289 ///
290 /// Note that this has a few convenient impls on `PartialEq` and `PartialOrd`
291 /// for testing whether a byte is contained inside a given range.
292 #[derive(Clone, Copy, Debug, PartialEq, PartialOrd, Eq, Ord)]
293 pub struct ByteRange {
294     /// The start byte of the range.
295     ///
296     /// This must be less than or equal to `end`.
297     pub start: u8,
298 
299     /// The end byte of the range.
300     ///
301     /// This must be greater than or equal to `end`.
302     pub end: u8,
303 }
304 
305 /// A builder for configuring regular expression parsing.
306 ///
307 /// This allows setting the default values of flags and other options, such
308 /// as the maximum nesting depth.
309 #[derive(Clone, Debug)]
310 pub struct ExprBuilder {
311     flags: Flags,
312     nest_limit: usize,
313 }
314 
315 impl ExprBuilder {
316     /// Create a new builder for configuring expression parsing.
317     ///
318     /// Note that all flags are disabled by default.
new() -> ExprBuilder319     pub fn new() -> ExprBuilder {
320         ExprBuilder {
321             flags: Flags::default(),
322             nest_limit: 200,
323         }
324     }
325 
326     /// Set the default value for the case insensitive (`i`) flag.
case_insensitive(mut self, yes: bool) -> ExprBuilder327     pub fn case_insensitive(mut self, yes: bool) -> ExprBuilder {
328         self.flags.casei = yes;
329         self
330     }
331 
332     /// Set the default value for the multi-line matching (`m`) flag.
multi_line(mut self, yes: bool) -> ExprBuilder333     pub fn multi_line(mut self, yes: bool) -> ExprBuilder {
334         self.flags.multi = yes;
335         self
336     }
337 
338     /// Set the default value for the any character (`s`) flag.
dot_matches_new_line(mut self, yes: bool) -> ExprBuilder339     pub fn dot_matches_new_line(mut self, yes: bool) -> ExprBuilder {
340         self.flags.dotnl = yes;
341         self
342     }
343 
344     /// Set the default value for the greedy swap (`U`) flag.
swap_greed(mut self, yes: bool) -> ExprBuilder345     pub fn swap_greed(mut self, yes: bool) -> ExprBuilder {
346         self.flags.swap_greed = yes;
347         self
348     }
349 
350     /// Set the default value for the ignore whitespace (`x`) flag.
ignore_whitespace(mut self, yes: bool) -> ExprBuilder351     pub fn ignore_whitespace(mut self, yes: bool) -> ExprBuilder {
352         self.flags.ignore_space = yes;
353         self
354     }
355 
356     /// Set the default value for the Unicode (`u`) flag.
357     ///
358     /// If `yes` is false, then `allow_bytes` is set to true.
unicode(mut self, yes: bool) -> ExprBuilder359     pub fn unicode(mut self, yes: bool) -> ExprBuilder {
360         self.flags.unicode = yes;
361         if !yes {
362             self.allow_bytes(true)
363         } else {
364             self
365         }
366     }
367 
368     /// Whether the parser allows matching arbitrary bytes or not.
369     ///
370     /// When the `u` flag is disabled (either with this builder or in the
371     /// expression itself), the parser switches to interpreting the expression
372     /// as matching arbitrary bytes instead of Unicode codepoints. For example,
373     /// the expression `(?u:\xFF)` matches the *codepoint* `\xFF`, which
374     /// corresponds to the UTF-8 byte sequence `\xCE\xBF`. Conversely,
375     /// `(?-u:\xFF)` matches the *byte* `\xFF`, which is not valid UTF-8.
376     ///
377     /// When `allow_bytes` is disabled (the default), an expression like
378     /// `(?-u:\xFF)` will cause the parser to return an error, since it would
379     /// otherwise match invalid UTF-8. When enabled, it will be allowed.
allow_bytes(mut self, yes: bool) -> ExprBuilder380     pub fn allow_bytes(mut self, yes: bool) -> ExprBuilder {
381         self.flags.allow_bytes = yes;
382         self
383     }
384 
385     /// Set the nesting limit for regular expression parsing.
386     ///
387     /// Regular expressions that nest more than this limit will result in a
388     /// `StackExhausted` error.
nest_limit(mut self, limit: usize) -> ExprBuilder389     pub fn nest_limit(mut self, limit: usize) -> ExprBuilder {
390         self.nest_limit = limit;
391         self
392     }
393 
394     /// Parse a string as a regular expression using the current configuraiton.
parse(self, s: &str) -> Result<Expr>395     pub fn parse(self, s: &str) -> Result<Expr> {
396         Parser::parse(s, self.flags).and_then(|e| e.simplify(self.nest_limit))
397     }
398 }
399 
400 impl Expr {
401     /// Parses a string in a regular expression syntax tree.
402     ///
403     /// This is a convenience method for parsing an expression using the
404     /// default configuration. To tweak parsing options (such as which flags
405     /// are enabled by default), use the `ExprBuilder` type.
parse(s: &str) -> Result<Expr>406     pub fn parse(s: &str) -> Result<Expr> {
407         ExprBuilder::new().parse(s)
408     }
409 
410     /// Returns true iff the expression can be repeated by a quantifier.
can_repeat(&self) -> bool411     fn can_repeat(&self) -> bool {
412         match *self {
413             Literal{..} | LiteralBytes{..}
414             | AnyChar | AnyCharNoNL | AnyByte | AnyByteNoNL
415             | Class(_) | ClassBytes(_)
416             | StartLine | EndLine | StartText | EndText
417             | WordBoundary | NotWordBoundary
418             | WordBoundaryAscii | NotWordBoundaryAscii
419             | Group{..}
420             => true,
421             _ => false,
422         }
423     }
424 
simplify(self, nest_limit: usize) -> Result<Expr>425     fn simplify(self, nest_limit: usize) -> Result<Expr> {
426         fn combine_literals(es: &mut Vec<Expr>, e: Expr) {
427             match (es.pop(), e) {
428                 (None, e) => es.push(e),
429                 (Some(Literal { chars: mut chars1, casei: casei1 }),
430                       Literal { chars: chars2, casei: casei2 }) => {
431                     if casei1 == casei2 {
432                         chars1.extend(chars2);
433                         es.push(Literal { chars: chars1, casei: casei1 });
434                     } else {
435                         es.push(Literal { chars: chars1, casei: casei1 });
436                         es.push(Literal { chars: chars2, casei: casei2 });
437                     }
438                 }
439                 (Some(LiteralBytes { bytes: mut bytes1, casei: casei1 }),
440                       LiteralBytes { bytes: bytes2, casei: casei2 }) => {
441                     if casei1 == casei2 {
442                         bytes1.extend(bytes2);
443                         es.push(LiteralBytes { bytes: bytes1, casei: casei1 });
444                     } else {
445                         es.push(LiteralBytes { bytes: bytes1, casei: casei1 });
446                         es.push(LiteralBytes { bytes: bytes2, casei: casei2 });
447                     }
448                 }
449                 (Some(e1), e2) => {
450                     es.push(e1);
451                     es.push(e2);
452                 }
453             }
454         }
455         fn simp(expr: Expr, recurse: usize, limit: usize) -> Result<Expr> {
456             if recurse > limit {
457                 return Err(Error {
458                     pos: 0,
459                     surround: "".to_owned(),
460                     kind: ErrorKind::StackExhausted,
461                 });
462             }
463             let simplify = |e| simp(e, recurse + 1, limit);
464             Ok(match expr {
465                 Repeat { e, r, greedy } => Repeat {
466                     e: Box::new(try!(simplify(*e))),
467                     r: r,
468                     greedy: greedy,
469                 },
470                 Group { e, i, name } => {
471                     let e = try!(simplify(*e));
472                     if i.is_none() && name.is_none() && e.can_repeat() {
473                         e
474                     } else {
475                         Group { e: Box::new(e), i: i, name: name }
476                     }
477                 }
478                 Concat(es) => {
479                     let mut new_es = Vec::with_capacity(es.len());
480                     for e in es {
481                         combine_literals(&mut new_es, try!(simplify(e)));
482                     }
483                     if new_es.len() == 1 {
484                         new_es.pop().unwrap()
485                     } else {
486                         Concat(new_es)
487                     }
488                 }
489                 Alternate(es) => {
490                     let mut new_es = Vec::with_capacity(es.len());
491                     for e in es {
492                         new_es.push(try!(simplify(e)));
493                     }
494                     Alternate(new_es)
495                 }
496                 e => e,
497             })
498         }
499         simp(self, 0, nest_limit)
500     }
501 
502     /// Returns a set of literal prefixes extracted from this expression.
prefixes(&self) -> Literals503     pub fn prefixes(&self) -> Literals {
504         let mut lits = Literals::empty();
505         lits.union_prefixes(self);
506         lits
507     }
508 
509     /// Returns a set of literal suffixes extracted from this expression.
suffixes(&self) -> Literals510     pub fn suffixes(&self) -> Literals {
511         let mut lits = Literals::empty();
512         lits.union_suffixes(self);
513         lits
514     }
515 
516     /// Returns true if and only if the expression is required to match from
517     /// the beginning of text.
is_anchored_start(&self) -> bool518     pub fn is_anchored_start(&self) -> bool {
519         match *self {
520             Repeat { ref e, r, .. } => {
521                 !r.matches_empty() && e.is_anchored_start()
522             }
523             Group { ref e, .. } => e.is_anchored_start(),
524             Concat(ref es) => es[0].is_anchored_start(),
525             Alternate(ref es) => es.iter().all(|e| e.is_anchored_start()),
526             StartText => true,
527             _ => false,
528         }
529     }
530 
531     /// Returns true if and only if the expression has at least one matchable
532     /// sub-expression that must match the beginning of text.
has_anchored_start(&self) -> bool533     pub fn has_anchored_start(&self) -> bool {
534         match *self {
535             Repeat { ref e, r, .. } => {
536                 !r.matches_empty() && e.has_anchored_start()
537             }
538             Group { ref e, .. } => e.has_anchored_start(),
539             Concat(ref es) => es[0].has_anchored_start(),
540             Alternate(ref es) => es.iter().any(|e| e.has_anchored_start()),
541             StartText => true,
542             _ => false,
543         }
544     }
545 
546     /// Returns true if and only if the expression is required to match at the
547     /// end of the text.
is_anchored_end(&self) -> bool548     pub fn is_anchored_end(&self) -> bool {
549         match *self {
550             Repeat { ref e, r, .. } => {
551                 !r.matches_empty() && e.is_anchored_end()
552             }
553             Group { ref e, .. } => e.is_anchored_end(),
554             Concat(ref es) => es[es.len() - 1].is_anchored_end(),
555             Alternate(ref es) => es.iter().all(|e| e.is_anchored_end()),
556             EndText => true,
557             _ => false,
558         }
559     }
560 
561     /// Returns true if and only if the expression has at least one matchable
562     /// sub-expression that must match the beginning of text.
has_anchored_end(&self) -> bool563     pub fn has_anchored_end(&self) -> bool {
564         match *self {
565             Repeat { ref e, r, .. } => {
566                 !r.matches_empty() && e.has_anchored_end()
567             }
568             Group { ref e, .. } => e.has_anchored_end(),
569             Concat(ref es) => es[es.len() - 1].has_anchored_end(),
570             Alternate(ref es) => es.iter().any(|e| e.has_anchored_end()),
571             EndText => true,
572             _ => false,
573         }
574     }
575 
576     /// Returns true if and only if the expression contains sub-expressions
577     /// that can match arbitrary bytes.
has_bytes(&self) -> bool578     pub fn has_bytes(&self) -> bool {
579         match *self {
580             Repeat { ref e, .. } => e.has_bytes(),
581             Group { ref e, .. } => e.has_bytes(),
582             Concat(ref es) => es.iter().any(|e| e.has_bytes()),
583             Alternate(ref es) => es.iter().any(|e| e.has_bytes()),
584             LiteralBytes{..} => true,
585             AnyByte | AnyByteNoNL => true,
586             ClassBytes(_) => true,
587             WordBoundaryAscii | NotWordBoundaryAscii => true,
588             _ => false,
589         }
590     }
591 }
592 
593 impl Deref for CharClass {
594     type Target = Vec<ClassRange>;
deref(&self) -> &Vec<ClassRange>595     fn deref(&self) -> &Vec<ClassRange> { &self.ranges }
596 }
597 
598 impl IntoIterator for CharClass {
599     type Item = ClassRange;
600     type IntoIter = vec::IntoIter<ClassRange>;
into_iter(self) -> vec::IntoIter<ClassRange>601     fn into_iter(self) -> vec::IntoIter<ClassRange> { self.ranges.into_iter() }
602 }
603 
604 impl<'a> IntoIterator for &'a CharClass {
605     type Item = &'a ClassRange;
606     type IntoIter = slice::Iter<'a, ClassRange>;
into_iter(self) -> slice::Iter<'a, ClassRange>607     fn into_iter(self) -> slice::Iter<'a, ClassRange> { self.iter() }
608 }
609 
610 impl CharClass {
611     /// Create a new class from an existing set of ranges.
new(ranges: Vec<ClassRange>) -> CharClass612     pub fn new(ranges: Vec<ClassRange>) -> CharClass {
613         CharClass { ranges: ranges }
614     }
615 
616     /// Create an empty class.
empty() -> CharClass617     fn empty() -> CharClass {
618         CharClass::new(Vec::new())
619     }
620 
621     /// Returns true if `c` is matched by this character class.
matches(&self, c: char) -> bool622     pub fn matches(&self, c: char) -> bool {
623         self.binary_search_by(|range| c.partial_cmp(range).unwrap()).is_ok()
624     }
625 
626     /// Removes the given character from the class if it exists.
627     ///
628     /// Note that this takes `O(n)` time in the number of ranges.
remove(&mut self, c: char)629     pub fn remove(&mut self, c: char) {
630         let mut i = match self.binary_search_by(|r| c.partial_cmp(r).unwrap()) {
631             Ok(i) => i,
632             Err(_) => return,
633         };
634         let mut r = self.ranges.remove(i);
635         if r.start == c {
636             r.start = inc_char(c);
637             if r.start > r.end || c == char::MAX {
638                 return;
639             }
640             self.ranges.insert(i, r);
641         } else if r.end == c {
642             r.end = dec_char(c);
643             if r.end < r.start || c == '\x00' {
644                 return;
645             }
646             self.ranges.insert(0, r);
647         } else {
648             let (mut r1, mut r2) = (r.clone(), r.clone());
649             r1.end = dec_char(c);
650             if r1.start <= r1.end {
651                 self.ranges.insert(i, r1);
652                 i += 1;
653             }
654             r2.start = inc_char(c);
655             if r2.start <= r2.end {
656                 self.ranges.insert(i, r2);
657             }
658         }
659     }
660 
661     /// Create a new empty class from this one.
to_empty(&self) -> CharClass662     fn to_empty(&self) -> CharClass {
663         CharClass { ranges: Vec::with_capacity(self.len()) }
664     }
665 
666     /// Create a byte class from this character class.
667     ///
668     /// Codepoints above 0xFF are removed.
to_byte_class(self) -> ByteClass669     fn to_byte_class(self) -> ByteClass {
670         ByteClass::new(
671             self.ranges.into_iter()
672                        .filter_map(|r| r.to_byte_range())
673                        .collect()).canonicalize()
674     }
675 
676     /// Merge two classes and canonicalize them.
677     #[cfg(test)]
merge(mut self, other: CharClass) -> CharClass678     fn merge(mut self, other: CharClass) -> CharClass {
679         self.ranges.extend(other);
680         self.canonicalize()
681     }
682 
683     /// Canonicalize any sequence of ranges.
684     ///
685     /// This is responsible for enforcing the canonical format invariants
686     /// as described on the docs for the `CharClass` type.
canonicalize(mut self) -> CharClass687     fn canonicalize(mut self) -> CharClass {
688         // TODO: Save some cycles here by checking if already canonicalized.
689         self.ranges.sort();
690         let mut ordered = self.to_empty(); // TODO: Do this in place?
691         for candidate in self {
692             // If the candidate overlaps with an existing range, then it must
693             // be the most recent range added because we process the candidates
694             // in order.
695             if let Some(or) = ordered.ranges.last_mut() {
696                 if or.overlapping(candidate) {
697                     *or = or.merge(candidate);
698                     continue;
699                 }
700             }
701             ordered.ranges.push(candidate);
702         }
703         ordered
704     }
705 
706     /// Calculate the intersection of two canonical character classes.
707     ///
708     /// The returned intersection is canonical.
intersection(&self, other: &CharClass) -> CharClass709     fn intersection(&self, other: &CharClass) -> CharClass {
710         if self.ranges.is_empty() || other.ranges.is_empty() {
711             return CharClass::empty();
712         }
713 
714         let mut intersection = CharClass::empty();
715 
716         let mut iter_a = self.ranges.iter();
717         let mut iter_b = other.ranges.iter();
718         let mut a = iter_a.next().unwrap();
719         let mut b = iter_b.next().unwrap();
720         loop {
721             if let Some(i) = a.intersection(&b) {
722                 intersection.ranges.push(i);
723             }
724 
725             // If the range with the smaller end didn't match this time,
726             // it won't ever match, so move on to the next one.
727             let (iter, item) = if a.end < b.end {
728                 (&mut iter_a, &mut a)
729             } else {
730                 (&mut iter_b, &mut b)
731             };
732             match iter.next() {
733                 Some(v) => *item = v,
734                 None => break, // no more ranges to check, done
735             }
736         }
737 
738         intersection.canonicalize()
739     }
740 
741     /// Negates the character class.
742     ///
743     /// For all `c` where `c` is a Unicode scalar value, `c` matches `self`
744     /// if and only if `c` does not match `self.negate()`.
negate(mut self) -> CharClass745     pub fn negate(mut self) -> CharClass {
746         fn range(s: char, e: char) -> ClassRange { ClassRange::new(s, e) }
747 
748         if self.is_empty() {
749             // Inverting an empty range yields all of Unicode.
750             return CharClass {
751                 ranges: vec![ClassRange { start: '\x00', end: '\u{10ffff}' }],
752             };
753         }
754         self = self.canonicalize();
755         let mut inv = self.to_empty();
756         if self[0].start > '\x00' {
757             inv.ranges.push(range('\x00', dec_char(self[0].start)));
758         }
759         for win in self.windows(2) {
760             inv.ranges.push(range(inc_char(win[0].end),
761                                   dec_char(win[1].start)));
762         }
763         if self[self.len() - 1].end < char::MAX {
764             inv.ranges.push(range(inc_char(self[self.len() - 1].end),
765                                   char::MAX));
766         }
767         inv
768     }
769 
770     /// Apply case folding to this character class.
771     ///
772     /// N.B. Applying case folding to a negated character class probably
773     /// won't produce the expected result. e.g., `(?i)[^x]` really should
774     /// match any character sans `x` and `X`, but if `[^x]` is negated
775     /// before being case folded, you'll end up matching any character.
case_fold(self) -> CharClass776     pub fn case_fold(self) -> CharClass {
777         let mut folded = self.to_empty();
778         for r in self {
779             // Applying case folding to a range is expensive because *every*
780             // character needs to be examined. Thus, we avoid that drudgery
781             // if no character in the current range is in our case folding
782             // table.
783             if r.needs_case_folding() {
784                 folded.ranges.extend(r.case_fold());
785             }
786             folded.ranges.push(r);
787         }
788         folded.canonicalize()
789     }
790 
791     /// Returns the number of characters that match this class.
num_chars(&self) -> usize792     fn num_chars(&self) -> usize {
793         self.ranges.iter()
794             .map(|&r| 1 + (r.end as u32) - (r.start as u32))
795             .fold(0, |acc, len| acc + len)
796             as usize
797     }
798 }
799 
800 impl ClassRange {
801     /// Create a new class range.
802     ///
803     /// If `end < start`, then the two values are swapped so that
804     /// the invariant `start <= end` is preserved.
new(start: char, end: char) -> ClassRange805     fn new(start: char, end: char) -> ClassRange {
806         if start <= end {
807             ClassRange { start: start, end: end }
808         } else {
809             ClassRange { start: end, end: start }
810         }
811     }
812 
813     /// Translate this to a byte class.
814     ///
815     /// If the start codepoint exceeds 0xFF, then this returns `None`.
816     ///
817     /// If the end codepoint exceeds 0xFF, then it is set to 0xFF.
to_byte_range(self) -> Option<ByteRange>818     fn to_byte_range(self) -> Option<ByteRange> {
819         if self.start > '\u{FF}' {
820             None
821         } else {
822             let s = self.start as u8;
823             let e = min('\u{FF}', self.end) as u8;
824             Some(ByteRange::new(s, e))
825         }
826     }
827 
828     /// Create a range of one character.
one(c: char) -> ClassRange829     fn one(c: char) -> ClassRange {
830         ClassRange { start: c, end: c }
831     }
832 
833     /// Returns true if and only if the two ranges are overlapping. Note that
834     /// since ranges are inclusive, `a-c` and `d-f` are overlapping!
overlapping(self, other: ClassRange) -> bool835     fn overlapping(self, other: ClassRange) -> bool {
836         max(self.start, other.start) <= inc_char(min(self.end, other.end))
837     }
838 
839     /// Returns the intersection of the two ranges if they have common
840     /// characters, `None` otherwise.
intersection(&self, other: &ClassRange) -> Option<ClassRange>841     fn intersection(&self, other: &ClassRange) -> Option<ClassRange> {
842         let start = max(self.start, other.start);
843         let end = min(self.end, other.end);
844         if start <= end {
845             Some(ClassRange::new(start, end))
846         } else {
847             None
848         }
849     }
850 
851     /// Creates a new range representing the union of `self` and `other.
merge(self, other: ClassRange) -> ClassRange852     fn merge(self, other: ClassRange) -> ClassRange {
853         ClassRange {
854             start: min(self.start, other.start),
855             end: max(self.end, other.end),
856         }
857     }
858 
859     /// Returns true if and only if this range contains a character that is
860     /// in the case folding table.
needs_case_folding(self) -> bool861     fn needs_case_folding(self) -> bool {
862         case_folding::C_plus_S_both_table
863         .binary_search_by(|&(c, _)| self.partial_cmp(&c).unwrap()).is_ok()
864     }
865 
866     /// Apply case folding to this range.
867     ///
868     /// Since case folding might add characters such that the range is no
869     /// longer contiguous, this returns multiple class ranges. They are in
870     /// canonical order.
case_fold(self) -> Vec<ClassRange>871     fn case_fold(self) -> Vec<ClassRange> {
872         let table = &case_folding::C_plus_S_both_table;
873         let (s, e) = (self.start as u32, self.end as u32 + 1);
874         let mut start = self.start;
875         let mut end = start;
876         let mut next_case_fold = '\x00';
877         let mut ranges = Vec::with_capacity(10);
878         for mut c in (s..e).filter_map(char::from_u32) {
879             if c >= next_case_fold {
880                 c = match simple_case_fold_both_result(c) {
881                     Ok(i) => {
882                         for &(c1, c2) in &table[i..] {
883                             if c1 != c {
884                                 break;
885                             }
886                             if c2 != inc_char(end) {
887                                 ranges.push(ClassRange::new(start, end));
888                                 start = c2;
889                             }
890                             end = c2;
891                         }
892                         continue;
893                     }
894                     Err(i) => {
895                         if i < table.len() {
896                             next_case_fold = table[i].0;
897                         } else {
898                             next_case_fold = '\u{10FFFF}';
899                         }
900                         c
901                     }
902                 };
903             }
904             // The fast path. We know this character doesn't have an entry
905             // in the case folding table.
906             if c != inc_char(end) {
907                 ranges.push(ClassRange::new(start, end));
908                 start = c;
909             }
910             end = c;
911         }
912         ranges.push(ClassRange::new(start, end));
913         ranges
914     }
915 }
916 
917 impl PartialEq<char> for ClassRange {
918     #[inline]
eq(&self, other: &char) -> bool919     fn eq(&self, other: &char) -> bool {
920         self.start <= *other && *other <= self.end
921     }
922 }
923 
924 impl PartialEq<ClassRange> for char {
925     #[inline]
eq(&self, other: &ClassRange) -> bool926     fn eq(&self, other: &ClassRange) -> bool {
927         other.eq(self)
928     }
929 }
930 
931 impl PartialOrd<char> for ClassRange {
932     #[inline]
partial_cmp(&self, other: &char) -> Option<Ordering>933     fn partial_cmp(&self, other: &char) -> Option<Ordering> {
934         Some(if self == other {
935             Ordering::Equal
936         } else if *other > self.end {
937             Ordering::Greater
938         } else {
939             Ordering::Less
940         })
941     }
942 }
943 
944 impl PartialOrd<ClassRange> for char {
945     #[inline]
partial_cmp(&self, other: &ClassRange) -> Option<Ordering>946     fn partial_cmp(&self, other: &ClassRange) -> Option<Ordering> {
947         other.partial_cmp(self).map(|o| o.reverse())
948     }
949 }
950 
951 impl ByteClass {
952     /// Create a new class from an existing set of ranges.
new(ranges: Vec<ByteRange>) -> ByteClass953     pub fn new(ranges: Vec<ByteRange>) -> ByteClass {
954         ByteClass { ranges: ranges }
955     }
956 
957     /// Returns true if `b` is matched by this byte class.
matches(&self, b: u8) -> bool958     pub fn matches(&self, b: u8) -> bool {
959         self.binary_search_by(|range| b.partial_cmp(range).unwrap()).is_ok()
960     }
961 
962     /// Removes the given byte from the class if it exists.
963     ///
964     /// Note that this takes `O(n)` time in the number of ranges.
remove(&mut self, b: u8)965     pub fn remove(&mut self, b: u8) {
966         let mut i = match self.binary_search_by(|r| b.partial_cmp(r).unwrap()) {
967             Ok(i) => i,
968             Err(_) => return,
969         };
970         let mut r = self.ranges.remove(i);
971         if r.start == b {
972             r.start = b.saturating_add(1);
973             if r.start > r.end || b == u8::MAX {
974                 return;
975             }
976             self.ranges.insert(i, r);
977         } else if r.end == b {
978             r.end = b.saturating_sub(1);
979             if r.end < r.start || b == b'\x00' {
980                 return;
981             }
982             self.ranges.insert(0, r);
983         } else {
984             let (mut r1, mut r2) = (r.clone(), r.clone());
985             r1.end = b.saturating_sub(1);
986             if r1.start <= r1.end {
987                 self.ranges.insert(i, r1);
988                 i += 1;
989             }
990             r2.start = b.saturating_add(1);
991             if r2.start <= r2.end {
992                 self.ranges.insert(i, r2);
993             }
994         }
995     }
996 
997     /// Create a new empty class from this one.
to_empty(&self) -> ByteClass998     fn to_empty(&self) -> ByteClass {
999         ByteClass { ranges: Vec::with_capacity(self.len()) }
1000     }
1001 
1002     /// Canonicalze any sequence of ranges.
1003     ///
1004     /// This is responsible for enforcing the canonical format invariants
1005     /// as described on the docs for the `ByteClass` type.
canonicalize(mut self) -> ByteClass1006     fn canonicalize(mut self) -> ByteClass {
1007         // TODO: Save some cycles here by checking if already canonicalized.
1008         self.ranges.sort();
1009         let mut ordered = self.to_empty(); // TODO: Do this in place?
1010         for candidate in self {
1011             // If the candidate overlaps with an existing range, then it must
1012             // be the most recent range added because we process the candidates
1013             // in order.
1014             if let Some(or) = ordered.ranges.last_mut() {
1015                 if or.overlapping(candidate) {
1016                     *or = or.merge(candidate);
1017                     continue;
1018                 }
1019             }
1020             ordered.ranges.push(candidate);
1021         }
1022         ordered
1023     }
1024 
1025     /// Negates the byte class.
1026     ///
1027     /// For all `b` where `b` is a byte, `b` matches `self` if and only if `b`
1028     /// does not match `self.negate()`.
negate(mut self) -> ByteClass1029     pub fn negate(mut self) -> ByteClass {
1030         fn range(s: u8, e: u8) -> ByteRange { ByteRange::new(s, e) }
1031 
1032         if self.is_empty() {
1033             // Inverting an empty range yields all bytes.
1034             return ByteClass {
1035                 ranges: vec![ByteRange { start: b'\x00', end: b'\xff' }],
1036             };
1037         }
1038         self = self.canonicalize();
1039         let mut inv = self.to_empty();
1040         if self[0].start > b'\x00' {
1041             inv.ranges.push(range(b'\x00', self[0].start.saturating_sub(1)));
1042         }
1043         for win in self.windows(2) {
1044             inv.ranges.push(range(win[0].end.saturating_add(1),
1045                                   win[1].start.saturating_sub(1)));
1046         }
1047         if self[self.len() - 1].end < u8::MAX {
1048             inv.ranges.push(range(self[self.len() - 1].end.saturating_add(1),
1049                                   u8::MAX));
1050         }
1051         inv
1052     }
1053 
1054     /// Apply case folding to this byte class.
1055     ///
1056     /// This assumes that the bytes in the ranges are ASCII compatible.
1057     ///
1058     /// N.B. Applying case folding to a negated character class probably
1059     /// won't produce the expected result. e.g., `(?i)[^x]` really should
1060     /// match any character sans `x` and `X`, but if `[^x]` is negated
1061     /// before being case folded, you'll end up matching any character.
case_fold(self) -> ByteClass1062     pub fn case_fold(self) -> ByteClass {
1063         let mut folded = self.to_empty();
1064         for r in self {
1065             folded.ranges.extend(r.case_fold());
1066         }
1067         folded.canonicalize()
1068     }
1069 
1070     /// Returns the number of bytes that match this class.
num_bytes(&self) -> usize1071     fn num_bytes(&self) -> usize {
1072         self.ranges.iter()
1073             .map(|&r| 1 + (r.end as u32) - (r.start as u32))
1074             .fold(0, |acc, len| acc + len)
1075             as usize
1076     }
1077 }
1078 
1079 impl ByteRange {
1080     /// Create a new class range.
1081     ///
1082     /// If `end < start`, then the two values are swapped so that
1083     /// the invariant `start <= end` is preserved.
new(start: u8, end: u8) -> ByteRange1084     fn new(start: u8, end: u8) -> ByteRange {
1085         if start <= end {
1086             ByteRange { start: start, end: end }
1087         } else {
1088             ByteRange { start: end, end: start }
1089         }
1090     }
1091 
1092     /// Returns true if and only if the two ranges are overlapping. Note that
1093     /// since ranges are inclusive, `a-c` and `d-f` are overlapping!
overlapping(self, other: ByteRange) -> bool1094     fn overlapping(self, other: ByteRange) -> bool {
1095         max(self.start, other.start)
1096         <= min(self.end, other.end).saturating_add(1)
1097     }
1098 
1099     /// Returns true if and only if the intersection of self and other is non
1100     /// empty.
is_intersect_empty(self, other: ByteRange) -> bool1101     fn is_intersect_empty(self, other: ByteRange) -> bool {
1102         max(self.start, other.start) > min(self.end, other.end)
1103     }
1104 
1105     /// Creates a new range representing the union of `self` and `other.
merge(self, other: ByteRange) -> ByteRange1106     fn merge(self, other: ByteRange) -> ByteRange {
1107         ByteRange {
1108             start: min(self.start, other.start),
1109             end: max(self.end, other.end),
1110         }
1111     }
1112 
1113     /// Apply case folding to this range.
1114     ///
1115     /// Since case folding might add bytes such that the range is no
1116     /// longer contiguous, this returns multiple byte ranges.
1117     ///
1118     /// This assumes that the bytes in this range are ASCII compatible.
case_fold(self) -> Vec<ByteRange>1119     fn case_fold(self) -> Vec<ByteRange> {
1120         // So much easier than Unicode case folding!
1121         let mut ranges = vec![self];
1122         if !ByteRange::new(b'a', b'z').is_intersect_empty(self) {
1123             let lower = max(self.start, b'a');
1124             let upper = min(self.end, b'z');
1125             ranges.push(ByteRange::new(lower - 32, upper - 32));
1126         }
1127         if !ByteRange::new(b'A', b'Z').is_intersect_empty(self) {
1128             let lower = max(self.start, b'A');
1129             let upper = min(self.end, b'Z');
1130             ranges.push(ByteRange::new(lower + 32, upper + 32));
1131         }
1132         ranges
1133     }
1134 }
1135 
1136 impl Deref for ByteClass {
1137     type Target = Vec<ByteRange>;
deref(&self) -> &Vec<ByteRange>1138     fn deref(&self) -> &Vec<ByteRange> { &self.ranges }
1139 }
1140 
1141 impl IntoIterator for ByteClass {
1142     type Item = ByteRange;
1143     type IntoIter = vec::IntoIter<ByteRange>;
into_iter(self) -> vec::IntoIter<ByteRange>1144     fn into_iter(self) -> vec::IntoIter<ByteRange> { self.ranges.into_iter() }
1145 }
1146 
1147 impl<'a> IntoIterator for &'a ByteClass {
1148     type Item = &'a ByteRange;
1149     type IntoIter = slice::Iter<'a, ByteRange>;
into_iter(self) -> slice::Iter<'a, ByteRange>1150     fn into_iter(self) -> slice::Iter<'a, ByteRange> { self.iter() }
1151 }
1152 
1153 impl PartialEq<u8> for ByteRange {
1154     #[inline]
eq(&self, other: &u8) -> bool1155     fn eq(&self, other: &u8) -> bool {
1156         self.start <= *other && *other <= self.end
1157     }
1158 }
1159 
1160 impl PartialEq<ByteRange> for u8 {
1161     #[inline]
eq(&self, other: &ByteRange) -> bool1162     fn eq(&self, other: &ByteRange) -> bool {
1163         other.eq(self)
1164     }
1165 }
1166 
1167 impl PartialOrd<u8> for ByteRange {
1168     #[inline]
partial_cmp(&self, other: &u8) -> Option<Ordering>1169     fn partial_cmp(&self, other: &u8) -> Option<Ordering> {
1170         Some(if self == other {
1171             Ordering::Equal
1172         } else if *other > self.end {
1173             Ordering::Greater
1174         } else {
1175             Ordering::Less
1176         })
1177     }
1178 }
1179 
1180 impl PartialOrd<ByteRange> for u8 {
1181     #[inline]
partial_cmp(&self, other: &ByteRange) -> Option<Ordering>1182     fn partial_cmp(&self, other: &ByteRange) -> Option<Ordering> {
1183         other.partial_cmp(self).map(|o| o.reverse())
1184     }
1185 }
1186 
1187 /// This implementation of `Display` will write a regular expression from the
1188 /// syntax tree. It does not write the original string parsed.
1189 impl fmt::Display for Expr {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result1190     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1191         match *self {
1192             Empty => write!(f, ""),
1193             Literal { ref chars, casei } => {
1194                 if casei {
1195                     try!(write!(f, "(?iu:"));
1196                 } else {
1197                     try!(write!(f, "(?u:"));
1198                 }
1199                 for &c in chars {
1200                     try!(write!(f, "{}", quote_char(c)));
1201                 }
1202                 try!(write!(f, ")"));
1203                 Ok(())
1204             }
1205             LiteralBytes { ref bytes, casei } => {
1206                 if casei {
1207                     try!(write!(f, "(?i-u:"));
1208                 } else {
1209                     try!(write!(f, "(?-u:"));
1210                 }
1211                 for &b in bytes {
1212                     try!(write!(f, "{}", quote_byte(b)));
1213                 }
1214                 try!(write!(f, ")"));
1215                 Ok(())
1216             }
1217             AnyChar => write!(f, "(?su:.)"),
1218             AnyCharNoNL => write!(f, "(?u:.)"),
1219             AnyByte => write!(f, "(?s-u:.)"),
1220             AnyByteNoNL => write!(f, "(?-u:.)"),
1221             Class(ref cls) => write!(f, "{}", cls),
1222             ClassBytes(ref cls) => write!(f, "{}", cls),
1223             StartLine => write!(f, "(?m:^)"),
1224             EndLine => write!(f, "(?m:$)"),
1225             StartText => write!(f, r"^"),
1226             EndText => write!(f, r"$"),
1227             WordBoundary => write!(f, r"(?u:\b)"),
1228             NotWordBoundary => write!(f, r"(?u:\B)"),
1229             WordBoundaryAscii => write!(f, r"(?-u:\b)"),
1230             NotWordBoundaryAscii => write!(f, r"(?-u:\B)"),
1231             Group { ref e, i: None, name: None } => write!(f, "(?:{})", e),
1232             Group { ref e, name: None, .. } => write!(f, "({})", e),
1233             Group { ref e, name: Some(ref n), .. } => {
1234                 write!(f, "(?P<{}>{})", n, e)
1235             }
1236             Repeat { ref e, r, greedy } => {
1237                 match &**e {
1238                     &Literal { ref chars, .. } if chars.len() > 1 => {
1239                         try!(write!(f, "(?:{}){}", e, r))
1240                     }
1241                     _ => try!(write!(f, "{}{}", e, r)),
1242                 }
1243                 if !greedy { try!(write!(f, "?")); }
1244                 Ok(())
1245             }
1246             Concat(ref es) => {
1247                 for e in es {
1248                     try!(write!(f, "{}", e));
1249                 }
1250                 Ok(())
1251             }
1252             Alternate(ref es) => {
1253                 for (i, e) in es.iter().enumerate() {
1254                     if i > 0 { try!(write!(f, "|")); }
1255                     try!(write!(f, "{}", e));
1256                 }
1257                 Ok(())
1258             }
1259         }
1260     }
1261 }
1262 
1263 impl fmt::Display for Repeater {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result1264     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1265         match *self {
1266             ZeroOrOne => write!(f, "?"),
1267             ZeroOrMore => write!(f, "*"),
1268             OneOrMore => write!(f, "+"),
1269             Range { min: s, max: None } => write!(f, "{{{},}}", s),
1270             Range { min: s, max: Some(e) } if s == e => write!(f, "{{{}}}", s),
1271             Range { min: s, max: Some(e) } => write!(f, "{{{}, {}}}", s, e),
1272         }
1273     }
1274 }
1275 
1276 impl fmt::Display for CharClass {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result1277     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1278         try!(write!(f, "(?u:["));
1279         for range in self.iter() {
1280             if range.start == '-' || range.end == '-' {
1281                 try!(write!(f, "-"));
1282                 break;
1283             }
1284         }
1285         for range in self.iter() {
1286             let mut range = *range;
1287             if range.start == '-' {
1288                 range.start = ((range.start as u8) + 1) as char;
1289             }
1290             if range.end == '-' {
1291                 range.end = ((range.end as u8) - 1) as char;
1292             }
1293             if range.start > range.end {
1294                 continue;
1295             }
1296             try!(write!(f, "{}", range));
1297         }
1298         try!(write!(f, "])"));
1299         Ok(())
1300     }
1301 }
1302 
1303 impl fmt::Display for ClassRange {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result1304     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1305         write!(f, "{}-{}", quote_char(self.start), quote_char(self.end))
1306     }
1307 }
1308 
1309 impl fmt::Display for ByteClass {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result1310     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1311         try!(write!(f, "(?-u:["));
1312         for range in self.iter() {
1313             if range.start == b'-' || range.end == b'-' {
1314                 try!(write!(f, "-"));
1315                 break;
1316             }
1317         }
1318         for range in self.iter() {
1319             let mut range = *range;
1320             if range.start == b'-' {
1321                 range.start += 1;
1322             }
1323             if range.end == b'-' {
1324                 range.start -= 1;
1325             }
1326             if range.start > range.end {
1327                 continue;
1328             }
1329             try!(write!(f, "{}", range));
1330         }
1331         try!(write!(f, "])"));
1332         Ok(())
1333     }
1334 }
1335 
1336 impl fmt::Display for ByteRange {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result1337     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1338         write!(f, "{}-{}", quote_byte(self.start), quote_byte(self.end))
1339     }
1340 }
1341 
1342 /// An alias for computations that can return a `Error`.
1343 pub type Result<T> = ::std::result::Result<T, Error>;
1344 
1345 /// A parse error.
1346 ///
1347 /// This includes details about the specific type of error and a rough
1348 /// approximation of where it occurred.
1349 #[derive(Clone, Debug, PartialEq)]
1350 pub struct Error {
1351     pos: usize,
1352     surround: String,
1353     kind: ErrorKind,
1354 }
1355 
1356 /// The specific type of parse error that can occur.
1357 #[derive(Clone, Debug, PartialEq)]
1358 pub enum ErrorKind {
1359     /// A negation symbol is used twice in flag settings.
1360     /// e.g., `(?-i-s)`.
1361     DoubleFlagNegation,
1362     /// The same capture name was used more than once.
1363     /// e.g., `(?P<a>.)(?P<a>.)`.
1364     DuplicateCaptureName(String),
1365     /// An alternate is empty. e.g., `(|a)`.
1366     EmptyAlternate,
1367     /// A capture group name is empty. e.g., `(?P<>a)`.
1368     EmptyCaptureName,
1369     /// A negation symbol was not proceded by any flags. e.g., `(?i-)`.
1370     EmptyFlagNegation,
1371     /// A group is empty. e.g., `()`.
1372     EmptyGroup,
1373     /// An invalid number was used in a counted repetition. e.g., `a{b}`.
1374     InvalidBase10(String),
1375     /// An invalid hexadecimal number was used in an escape sequence.
1376     /// e.g., `\xAG`.
1377     InvalidBase16(String),
1378     /// An invalid capture name was used. e.g., `(?P<0a>b)`.
1379     InvalidCaptureName(String),
1380     /// An invalid class range was givien. Specifically, when the start of the
1381     /// range is greater than the end. e.g., `[z-a]`.
1382     InvalidClassRange {
1383         /// The first character specified in the range.
1384         start: char,
1385         /// The second character specified in the range.
1386         end: char,
1387     },
1388     /// An escape sequence was used in a character class where it is not
1389     /// allowed. e.g., `[a-\pN]` or `[\A]`.
1390     InvalidClassEscape(Expr),
1391     /// An invalid counted repetition min/max was given. e.g., `a{2,1}`.
1392     InvalidRepeatRange {
1393         /// The first number specified in the repetition.
1394         min: u32,
1395         /// The second number specified in the repetition.
1396         max: u32,
1397     },
1398     /// An invalid Unicode scalar value was used in a long hexadecimal
1399     /// sequence. e.g., `\x{D800}`.
1400     InvalidScalarValue(u32),
1401     /// An empty counted repetition operator. e.g., `a{}`.
1402     MissingBase10,
1403     /// A repetition operator was not applied to an expression. e.g., `*`.
1404     RepeaterExpectsExpr,
1405     /// A repetition operator was applied to an expression that cannot be
1406     /// repeated. e.g., `a+*` or `a|*`.
1407     RepeaterUnexpectedExpr(Expr),
1408     /// A capture group name that is never closed. e.g., `(?P<a`.
1409     UnclosedCaptureName(String),
1410     /// An unclosed hexadecimal literal. e.g., `\x{a`.
1411     UnclosedHex,
1412     /// An unclosed parenthesis. e.g., `(a`.
1413     UnclosedParen,
1414     /// An unclosed counted repetition operator. e.g., `a{2`.
1415     UnclosedRepeat,
1416     /// An unclosed named Unicode class. e.g., `\p{Yi`.
1417     UnclosedUnicodeName,
1418     /// Saw end of regex before class was closed. e.g., `[a`.
1419     UnexpectedClassEof,
1420     /// Saw end of regex before escape sequence was closed. e.g., `\`.
1421     UnexpectedEscapeEof,
1422     /// Saw end of regex before flags were closed. e.g., `(?i`.
1423     UnexpectedFlagEof,
1424     /// Saw end of regex before two hexadecimal digits were seen. e.g., `\xA`.
1425     UnexpectedTwoDigitHexEof,
1426     /// Unopened parenthesis. e.g., `)`.
1427     UnopenedParen,
1428     /// Unrecognized escape sequence. e.g., `\q`.
1429     UnrecognizedEscape(char),
1430     /// Unrecognized flag. e.g., `(?a)`.
1431     UnrecognizedFlag(char),
1432     /// Unrecognized named Unicode class. e.g., `\p{Foo}`.
1433     UnrecognizedUnicodeClass(String),
1434     /// Indicates that the regex uses too much nesting.
1435     ///
1436     /// (N.B. This error exists because traversing the Expr is recursive and
1437     /// an explicit heap allocated stack is not (yet?) used. Regardless, some
1438     /// sort of limit must be applied to avoid unbounded memory growth.
1439     StackExhausted,
1440     /// A disallowed flag was found (e.g., `u`).
1441     FlagNotAllowed(char),
1442     /// A Unicode class was used when the Unicode (`u`) flag was disabled.
1443     UnicodeNotAllowed,
1444     /// InvalidUtf8 indicates that the expression may match non-UTF-8 bytes.
1445     /// This never returned if the parser is permitted to allow expressions
1446     /// that match arbitrary bytes.
1447     InvalidUtf8,
1448     /// A character class was constructed such that it is empty.
1449     /// e.g., `[^\d\D]`.
1450     EmptyClass,
1451     /// Indicates that unsupported notation was used in a character class.
1452     ///
1453     /// The char in this error corresponds to the illegal character.
1454     ///
1455     /// The intent of this error is to carve a path to support set notation
1456     /// as described in UTS#18 RL1.3. We do this by rejecting regexes that
1457     /// would use the notation.
1458     ///
1459     /// The work around for end users is to escape the character included in
1460     /// this error message.
1461     UnsupportedClassChar(char),
1462     /// Hints that destructuring should not be exhaustive.
1463     ///
1464     /// This enum may grow additional variants, so this makes sure clients
1465     /// don't count on exhaustive matching. (Otherwise, adding a new variant
1466     /// could break existing code.)
1467     #[doc(hidden)]
1468     __Nonexhaustive,
1469 }
1470 
1471 impl Error {
1472     /// Returns an approximate *character* offset at which the error occurred.
1473     ///
1474     /// The character offset may be equal to the number of characters in the
1475     /// string, in which case it should be interpreted as pointing to the end
1476     /// of the regex.
position(&self) -> usize1477     pub fn position(&self) -> usize {
1478         self.pos
1479     }
1480 
1481     /// Returns the type of the regex parse error.
kind(&self) -> &ErrorKind1482     pub fn kind(&self) -> &ErrorKind {
1483         &self.kind
1484     }
1485 }
1486 
1487 impl ErrorKind {
description(&self) -> &str1488     fn description(&self) -> &str {
1489         use ErrorKind::*;
1490         match *self {
1491             DoubleFlagNegation => "double flag negation",
1492             DuplicateCaptureName(_) => "duplicate capture name",
1493             EmptyAlternate => "empty alternate",
1494             EmptyCaptureName => "empty capture name",
1495             EmptyFlagNegation => "flag negation without any flags",
1496             EmptyGroup => "empty group (e.g., '()')",
1497             InvalidBase10(_) => "invalid base 10 number",
1498             InvalidBase16(_) => "invalid base 16 number",
1499             InvalidCaptureName(_) => "invalid capture name",
1500             InvalidClassRange{..} => "invalid character class range",
1501             InvalidClassEscape(_) => "invalid escape sequence in class",
1502             InvalidRepeatRange{..} => "invalid counted repetition range",
1503             InvalidScalarValue(_) => "invalid Unicode scalar value",
1504             MissingBase10 => "missing count in repetition operator",
1505             RepeaterExpectsExpr => "repetition operator missing expression",
1506             RepeaterUnexpectedExpr(_) => "expression cannot be repeated",
1507             UnclosedCaptureName(_) => "unclosed capture group name",
1508             UnclosedHex => "unclosed hexadecimal literal",
1509             UnclosedParen => "unclosed parenthesis",
1510             UnclosedRepeat => "unclosed counted repetition operator",
1511             UnclosedUnicodeName => "unclosed Unicode class literal",
1512             UnexpectedClassEof => "unexpected EOF in character class",
1513             UnexpectedEscapeEof => "unexpected EOF in escape sequence",
1514             UnexpectedFlagEof => "unexpected EOF in flags",
1515             UnexpectedTwoDigitHexEof => "unexpected EOF in hex literal",
1516             UnopenedParen => "unopened parenthesis",
1517             UnrecognizedEscape(_) => "unrecognized escape sequence",
1518             UnrecognizedFlag(_) => "unrecognized flag",
1519             UnrecognizedUnicodeClass(_) => "unrecognized Unicode class name",
1520             StackExhausted => "stack exhausted, too much nesting",
1521             FlagNotAllowed(_) => "flag not allowed",
1522             UnicodeNotAllowed => "Unicode features not allowed",
1523             InvalidUtf8 => "matching arbitrary bytes is not allowed",
1524             EmptyClass => "empty character class",
1525             UnsupportedClassChar(_) => "unsupported class notation",
1526             __Nonexhaustive => unreachable!(),
1527         }
1528     }
1529 }
1530 
1531 impl ::std::error::Error for Error {
description(&self) -> &str1532     fn description(&self) -> &str {
1533         self.kind.description()
1534     }
1535 }
1536 
1537 impl fmt::Display for Error {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result1538     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1539         if let ErrorKind::StackExhausted = self.kind {
1540             write!(f, "Error parsing regex: {}", self.kind)
1541         } else {
1542             write!(
1543                 f, "Error parsing regex near '{}' at character offset {}: {}",
1544                 self.surround, self.pos, self.kind)
1545         }
1546     }
1547 }
1548 
1549 impl fmt::Display for ErrorKind {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result1550     fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1551         use ErrorKind::*;
1552         match *self {
1553             DoubleFlagNegation =>
1554                 write!(f, "Only one negation symbol is allowed in flags."),
1555             DuplicateCaptureName(ref s) =>
1556                 write!(f, "Capture name '{}' is used more than once.", s),
1557             EmptyAlternate =>
1558                 write!(f, "Alternations cannot be empty."),
1559             EmptyCaptureName =>
1560                 write!(f, "Capture names cannot be empty."),
1561             EmptyFlagNegation =>
1562                 write!(f, "Flag negation requires setting at least one flag."),
1563             EmptyGroup =>
1564                 write!(f, "Empty regex groups (e.g., '()') are not allowed."),
1565             InvalidBase10(ref s) =>
1566                 write!(f, "Not a valid base 10 number: '{}'", s),
1567             InvalidBase16(ref s) =>
1568                 write!(f, "Not a valid base 16 number: '{}'", s),
1569             InvalidCaptureName(ref s) =>
1570                 write!(f, "Invalid capture name: '{}'. Capture names must \
1571                            consist of [_a-zA-Z0-9] and are not allowed to \
1572                            start with with a number.", s),
1573             InvalidClassRange { start, end } =>
1574                 write!(f, "Invalid character class range '{}-{}'. \
1575                            Character class ranges must start with the smaller \
1576                            character, but {} > {}", start, end, start, end),
1577             InvalidClassEscape(ref e) =>
1578                 write!(f, "Invalid escape sequence in character \
1579                            class: '{}'.", e),
1580             InvalidRepeatRange { min, max } =>
1581                 write!(f, "Invalid counted repetition range: {{{}, {}}}. \
1582                            Counted repetition ranges must start with the \
1583                            minimum, but {} > {}", min, max, min, max),
1584             InvalidScalarValue(c) =>
1585                 write!(f, "Number does not correspond to a Unicode scalar \
1586                            value: '{}'.", c),
1587             MissingBase10 =>
1588                 write!(f, "Missing maximum in counted
1589 repetition operator."),
1590             RepeaterExpectsExpr =>
1591                 write!(f, "Missing expression for repetition operator."),
1592             RepeaterUnexpectedExpr(ref e) =>
1593                 write!(f, "Invalid application of repetition operator to: \
1594                           '{}'.", e),
1595             UnclosedCaptureName(ref s) =>
1596                 write!(f, "Capture name group for '{}' is not closed. \
1597                            (Missing a '>'.)", s),
1598             UnclosedHex =>
1599                 write!(f, "Unclosed hexadecimal literal (missing a '}}')."),
1600             UnclosedParen =>
1601                 write!(f, "Unclosed parenthesis."),
1602             UnclosedRepeat =>
1603                 write!(f, "Unclosed counted repetition (missing a '}}')."),
1604             UnclosedUnicodeName =>
1605                 write!(f, "Unclosed Unicode literal (missing a '}}')."),
1606             UnexpectedClassEof =>
1607                 write!(f, "Character class was not closed before the end of \
1608                            the regex (missing a ']')."),
1609             UnexpectedEscapeEof =>
1610                 write!(f, "Started an escape sequence that didn't finish \
1611                            before the end of the regex."),
1612             UnexpectedFlagEof =>
1613                 write!(f, "Inline flag settings was not closed before the end \
1614                            of the regex (missing a ')' or ':')."),
1615             UnexpectedTwoDigitHexEof =>
1616                 write!(f, "Unexpected end of two digit hexadecimal literal."),
1617             UnopenedParen =>
1618                 write!(f, "Unopened parenthesis."),
1619             UnrecognizedEscape(c) =>
1620                 write!(f, "Unrecognized escape sequence: '\\{}'.", c),
1621             UnrecognizedFlag(c) =>
1622                 write!(f, "Unrecognized flag: '{}'. \
1623                            (Allowed flags: i, m, s, U, u, x.)", c),
1624             UnrecognizedUnicodeClass(ref s) =>
1625                 write!(f, "Unrecognized Unicode class name: '{}'.", s),
1626             StackExhausted =>
1627                 write!(f, "Exhausted space required to parse regex with too \
1628                            much nesting."),
1629             FlagNotAllowed(flag) =>
1630                 write!(f, "Use of the flag '{}' is not allowed.", flag),
1631             UnicodeNotAllowed =>
1632                 write!(f, "Unicode features are not allowed when the Unicode \
1633                            (u) flag is not set."),
1634             InvalidUtf8 =>
1635                 write!(f, "Matching arbitrary bytes is not allowed."),
1636             EmptyClass =>
1637                 write!(f, "Empty character classes are not allowed."),
1638             UnsupportedClassChar(c) =>
1639                 write!(f, "Use of unescaped '{}' in character class is \
1640                            not allowed.", c),
1641             __Nonexhaustive => unreachable!(),
1642         }
1643     }
1644 }
1645 
1646 /// The result of binary search on the simple case folding table.
1647 ///
1648 /// Note that this binary search is done on the "both" table, such that
1649 /// the index returned corresponds to the *first* location of `c1` in the
1650 /// table. The table can then be scanned linearly starting from the position
1651 /// returned to find other case mappings for `c1`.
simple_case_fold_both_result(c1: char) -> result::Result<usize, usize>1652 fn simple_case_fold_both_result(c1: char) -> result::Result<usize, usize> {
1653     let table = &case_folding::C_plus_S_both_table;
1654     let i = binary_search(table, |&(c2, _)| c1 <= c2);
1655     if i >= table.len() || table[i].0 != c1 {
1656         Err(i)
1657     } else {
1658         Ok(i)
1659     }
1660 }
1661 
1662 /// Binary search to find first element such that `pred(T) == true`.
1663 ///
1664 /// Assumes that if `pred(xs[i]) == true` then `pred(xs[i+1]) == true`.
1665 ///
1666 /// If all elements yield `pred(T) == false`, then `xs.len()` is returned.
binary_search<T, F>(xs: &[T], mut pred: F) -> usize where F: FnMut(&T) -> bool1667 fn binary_search<T, F>(xs: &[T], mut pred: F) -> usize
1668         where F: FnMut(&T) -> bool {
1669     let (mut left, mut right) = (0, xs.len());
1670     while left < right {
1671         let mid = (left + right) / 2;
1672         if pred(&xs[mid]) {
1673             right = mid;
1674         } else {
1675             left = mid + 1;
1676         }
1677     }
1678     left
1679 }
1680 
1681 /// Escapes all regular expression meta characters in `text`.
1682 ///
1683 /// The string returned may be safely used as a literal in a regular
1684 /// expression.
escape(text: &str) -> String1685 pub fn escape(text: &str) -> String {
1686     let mut quoted = String::with_capacity(text.len());
1687     for c in text.chars() {
1688         if parser::is_punct(c) {
1689             quoted.push('\\');
1690         }
1691         quoted.push(c);
1692     }
1693     quoted
1694 }
1695 
quote_char(c: char) -> String1696 fn quote_char(c: char) -> String {
1697     let mut s = String::new();
1698     if parser::is_punct(c) {
1699         s.push('\\');
1700     }
1701     s.push(c);
1702     s
1703 }
1704 
quote_byte(b: u8) -> String1705 fn quote_byte(b: u8) -> String {
1706     if parser::is_punct(b as char) || b == b'\'' || b == b'"' {
1707         quote_char(b as char)
1708     } else {
1709         let escaped: Vec<u8> = ascii::escape_default(b).collect();
1710         String::from_utf8(escaped).unwrap()
1711     }
1712 }
1713 
inc_char(c: char) -> char1714 fn inc_char(c: char) -> char {
1715     match c {
1716         char::MAX => char::MAX,
1717         '\u{D7FF}' => '\u{E000}',
1718         c => char::from_u32(c as u32 + 1).unwrap(),
1719     }
1720 }
1721 
dec_char(c: char) -> char1722 fn dec_char(c: char) -> char {
1723     match c {
1724         '\x00' => '\x00',
1725         '\u{E000}' => '\u{D7FF}',
1726         c => char::from_u32(c as u32 - 1).unwrap(),
1727     }
1728 }
1729 
1730 /// Returns true if and only if `c` is a word character.
1731 #[doc(hidden)]
is_word_char(c: char) -> bool1732 pub fn is_word_char(c: char) -> bool {
1733     match c {
1734         '_' | '0' ... '9' | 'a' ... 'z' | 'A' ... 'Z'  => true,
1735         _ => ::unicode::regex::PERLW.binary_search_by(|&(start, end)| {
1736             if c >= start && c <= end {
1737                 Ordering::Equal
1738             } else if start > c {
1739                 Ordering::Greater
1740             } else {
1741                 Ordering::Less
1742             }
1743         }).is_ok(),
1744     }
1745 }
1746 
1747 /// Returns true if and only if `c` is an ASCII word byte.
1748 #[doc(hidden)]
is_word_byte(b: u8) -> bool1749 pub fn is_word_byte(b: u8) -> bool {
1750     match b {
1751         b'_' | b'0' ... b'9' | b'a' ... b'z' | b'A' ... b'Z'  => true,
1752         _ => false,
1753     }
1754 }
1755 
1756 #[cfg(test)]
1757 mod properties;
1758 
1759 #[cfg(test)]
1760 mod tests {
1761     use {CharClass, ClassRange, ByteClass, ByteRange, Expr};
1762 
class(ranges: &[(char, char)]) -> CharClass1763     fn class(ranges: &[(char, char)]) -> CharClass {
1764         let ranges = ranges.iter().cloned()
1765                            .map(|(c1, c2)| ClassRange::new(c1, c2)).collect();
1766         CharClass::new(ranges)
1767     }
1768 
bclass(ranges: &[(u8, u8)]) -> ByteClass1769     fn bclass(ranges: &[(u8, u8)]) -> ByteClass {
1770         let ranges = ranges.iter().cloned()
1771                            .map(|(c1, c2)| ByteRange::new(c1, c2)).collect();
1772         ByteClass::new(ranges)
1773     }
1774 
e(re: &str) -> Expr1775     fn e(re: &str) -> Expr { Expr::parse(re).unwrap() }
1776 
1777     #[test]
stack_exhaustion()1778     fn stack_exhaustion() {
1779         use std::iter::repeat;
1780 
1781         let open: String = repeat('(').take(200).collect();
1782         let close: String = repeat(')').take(200).collect();
1783         assert!(Expr::parse(&format!("{}a{}", open, close)).is_ok());
1784 
1785         let open: String = repeat('(').take(200 + 1).collect();
1786         let close: String = repeat(')').take(200 + 1).collect();
1787         assert!(Expr::parse(&format!("{}a{}", open, close)).is_err());
1788     }
1789 
1790     #[test]
anchored_start()1791     fn anchored_start() {
1792         assert!(e("^a").is_anchored_start());
1793         assert!(e("(^a)").is_anchored_start());
1794         assert!(e("^a|^b").is_anchored_start());
1795         assert!(e("(^a)|(^b)").is_anchored_start());
1796         assert!(e("(^(a|b))").is_anchored_start());
1797 
1798         assert!(!e("^a|b").is_anchored_start());
1799         assert!(!e("a|^b").is_anchored_start());
1800     }
1801 
1802     #[test]
anchored_end()1803     fn anchored_end() {
1804         assert!(e("a$").is_anchored_end());
1805         assert!(e("(a$)").is_anchored_end());
1806         assert!(e("a$|b$").is_anchored_end());
1807         assert!(e("(a$)|(b$)").is_anchored_end());
1808         assert!(e("((a|b)$)").is_anchored_end());
1809 
1810         assert!(!e("a$|b").is_anchored_end());
1811         assert!(!e("a|b$").is_anchored_end());
1812     }
1813 
1814     #[test]
class_canon_no_change()1815     fn class_canon_no_change() {
1816         let cls = class(&[('a', 'c'), ('x', 'z')]);
1817         assert_eq!(cls.clone().canonicalize(), cls);
1818     }
1819 
1820     #[test]
class_canon_unordered()1821     fn class_canon_unordered() {
1822         let cls = class(&[('x', 'z'), ('a', 'c')]);
1823         assert_eq!(cls.canonicalize(), class(&[
1824             ('a', 'c'), ('x', 'z'),
1825         ]));
1826     }
1827 
1828     #[test]
class_canon_overlap()1829     fn class_canon_overlap() {
1830         let cls = class(&[('x', 'z'), ('w', 'y')]);
1831         assert_eq!(cls.canonicalize(), class(&[
1832             ('w', 'z'),
1833         ]));
1834     }
1835 
1836     #[test]
class_canon_overlap_many()1837     fn class_canon_overlap_many() {
1838         let cls = class(&[
1839             ('c', 'f'), ('a', 'g'), ('d', 'j'), ('a', 'c'),
1840             ('m', 'p'), ('l', 's'),
1841         ]);
1842         assert_eq!(cls.clone().canonicalize(), class(&[
1843             ('a', 'j'), ('l', 's'),
1844         ]));
1845     }
1846 
1847     #[test]
class_canon_overlap_boundary()1848     fn class_canon_overlap_boundary() {
1849         let cls = class(&[('x', 'z'), ('u', 'w')]);
1850         assert_eq!(cls.canonicalize(), class(&[
1851             ('u', 'z'),
1852         ]));
1853     }
1854 
1855     #[test]
class_canon_extreme_edge_case()1856     fn class_canon_extreme_edge_case() {
1857         let cls = class(&[('\x00', '\u{10FFFF}'), ('\x00', '\u{10FFFF}')]);
1858         assert_eq!(cls.canonicalize(), class(&[
1859             ('\x00', '\u{10FFFF}'),
1860         ]));
1861     }
1862 
1863     #[test]
class_canon_singles()1864     fn class_canon_singles() {
1865         let cls = class(&[('a', 'a'), ('b', 'b')]);
1866         assert_eq!(cls.canonicalize(), class(&[('a', 'b')]));
1867     }
1868 
1869     #[test]
class_negate_single()1870     fn class_negate_single() {
1871         let cls = class(&[('a', 'a')]);
1872         assert_eq!(cls.negate(), class(&[
1873             ('\x00', '\x60'), ('\x62', '\u{10FFFF}'),
1874         ]));
1875     }
1876 
1877     #[test]
class_negate_singles()1878     fn class_negate_singles() {
1879         let cls = class(&[('a', 'a'), ('b', 'b')]);
1880         assert_eq!(cls.negate(), class(&[
1881             ('\x00', '\x60'), ('\x63', '\u{10FFFF}'),
1882         ]));
1883     }
1884 
1885     #[test]
class_negate_multiples()1886     fn class_negate_multiples() {
1887         let cls = class(&[('a', 'c'), ('x', 'z')]);
1888         assert_eq!(cls.negate(), class(&[
1889             ('\x00', '\x60'), ('\x64', '\x77'), ('\x7b', '\u{10FFFF}'),
1890         ]));
1891     }
1892 
1893     #[test]
class_negate_min_scalar()1894     fn class_negate_min_scalar() {
1895         let cls = class(&[('\x00', 'a')]);
1896         assert_eq!(cls.negate(), class(&[
1897             ('\x62', '\u{10FFFF}'),
1898         ]));
1899     }
1900 
1901     #[test]
class_negate_max_scalar()1902     fn class_negate_max_scalar() {
1903         let cls = class(&[('a', '\u{10FFFF}')]);
1904         assert_eq!(cls.negate(), class(&[
1905             ('\x00', '\x60'),
1906         ]));
1907     }
1908 
1909     #[test]
class_negate_everything()1910     fn class_negate_everything() {
1911         let cls = class(&[('\x00', '\u{10FFFF}')]);
1912         assert_eq!(cls.negate(), class(&[]));
1913     }
1914 
1915     #[test]
class_negate_everything_sans_one()1916     fn class_negate_everything_sans_one() {
1917         let cls = class(&[
1918             ('\x00', '\u{10FFFD}'), ('\u{10FFFF}', '\u{10FFFF}')
1919         ]);
1920         assert_eq!(cls.negate(), class(&[
1921             ('\u{10FFFE}', '\u{10FFFE}'),
1922         ]));
1923     }
1924 
1925     #[test]
class_negate_surrogates_min()1926     fn class_negate_surrogates_min() {
1927         let cls = class(&[('\x00', '\u{D7FF}')]);
1928         assert_eq!(cls.negate(), class(&[
1929             ('\u{E000}', '\u{10FFFF}'),
1930         ]));
1931     }
1932 
1933     #[test]
class_negate_surrogates_min_edge()1934     fn class_negate_surrogates_min_edge() {
1935         let cls = class(&[('\x00', '\u{D7FE}')]);
1936         assert_eq!(cls.negate(), class(&[
1937             ('\u{D7FF}', '\u{10FFFF}'),
1938         ]));
1939     }
1940 
1941     #[test]
class_negate_surrogates_max()1942     fn class_negate_surrogates_max() {
1943         let cls = class(&[('\u{E000}', '\u{10FFFF}')]);
1944         assert_eq!(cls.negate(), class(&[
1945             ('\x00', '\u{D7FF}'),
1946         ]));
1947     }
1948 
1949     #[test]
class_negate_surrogates_max_edge()1950     fn class_negate_surrogates_max_edge() {
1951         let cls = class(&[('\u{E001}', '\u{10FFFF}')]);
1952         assert_eq!(cls.negate(), class(&[
1953             ('\x00', '\u{E000}'),
1954         ]));
1955     }
1956 
1957     #[test]
class_intersection_empty()1958     fn class_intersection_empty() {
1959         let cls1 = class(&[]);
1960         let cls2 = class(&[('a', 'a')]);
1961         assert_intersection(cls1, cls2, class(&[]));
1962     }
1963 
1964     #[test]
class_intersection_single_equal()1965     fn class_intersection_single_equal() {
1966         let cls1 = class(&[('a', 'a')]);
1967         let cls2 = class(&[('a', 'a')]);
1968         assert_intersection(cls1, cls2, class(&[('a', 'a')]));
1969     }
1970 
1971     #[test]
class_intersection_single_unequal()1972     fn class_intersection_single_unequal() {
1973         let cls1 = class(&[('a', 'a')]);
1974         let cls2 = class(&[('b', 'b')]);
1975         assert_intersection(cls1, cls2, class(&[]));
1976     }
1977 
1978     #[test]
class_intersection_single_in_other()1979     fn class_intersection_single_in_other() {
1980         let cls1 = class(&[('a', 'a')]);
1981         let cls2 = class(&[('a', 'c')]);
1982         assert_intersection(cls1, cls2, class(&[('a', 'a')]));
1983     }
1984 
1985     #[test]
class_intersection_range_in_other()1986     fn class_intersection_range_in_other() {
1987         let cls1 = class(&[('a', 'b')]);
1988         let cls2 = class(&[('a', 'c')]);
1989         assert_intersection(cls1, cls2, class(&[('a', 'b')]));
1990     }
1991 
1992     #[test]
class_intersection_range_intersection()1993     fn class_intersection_range_intersection() {
1994         let cls1 = class(&[('a', 'b')]);
1995         let cls2 = class(&[('b', 'c')]);
1996         assert_intersection(cls1, cls2, class(&[('b', 'b')]));
1997     }
1998 
1999     #[test]
class_intersection_only_adjacent()2000     fn class_intersection_only_adjacent() {
2001         let cls1 = class(&[('a', 'b')]);
2002         let cls2 = class(&[('c', 'd')]);
2003         assert_intersection(cls1, cls2, class(&[]));
2004     }
2005 
2006     #[test]
class_intersection_range_subset()2007     fn class_intersection_range_subset() {
2008         let cls1 = class(&[('b', 'c')]);
2009         let cls2 = class(&[('a', 'd')]);
2010         assert_intersection(cls1, cls2, class(&[('b', 'c')]));
2011     }
2012 
2013     #[test]
class_intersection_many_ranges_in_one_big()2014     fn class_intersection_many_ranges_in_one_big() {
2015         let cls1 = class(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
2016         let cls2 = class(&[('a', 'h')]);
2017         assert_intersection(cls1, cls2, class(&[
2018             ('a', 'b'), ('d', 'e'), ('g', 'h')
2019         ]));
2020     }
2021 
2022     #[test]
class_intersection_many_ranges_same()2023     fn class_intersection_many_ranges_same() {
2024         let cls1 = class(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
2025         let cls2 = class(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
2026         assert_intersection(cls1, cls2, class(&[
2027             ('a', 'b'), ('d', 'e'), ('g', 'h')
2028         ]));
2029     }
2030 
2031     #[test]
class_intersection_multiple_non_intersecting()2032     fn class_intersection_multiple_non_intersecting() {
2033         let cls1 = class(&[('a', 'b'), ('g', 'h')]);
2034         let cls2 = class(&[('d', 'e'), ('k', 'l')]);
2035         assert_intersection(cls1, cls2, class(&[]));
2036     }
2037 
2038     #[test]
class_intersection_non_intersecting_then_intersecting()2039     fn class_intersection_non_intersecting_then_intersecting() {
2040         let cls1 = class(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
2041         let cls2 = class(&[('h', 'h')]);
2042         assert_intersection(cls1, cls2, class(&[('h', 'h')]));
2043     }
2044 
2045     #[test]
class_intersection_adjacent_alternating()2046     fn class_intersection_adjacent_alternating() {
2047         let cls1 = class(&[('a', 'b'), ('e', 'f'), ('i', 'j')]);
2048         let cls2 = class(&[('c', 'd'), ('g', 'h'), ('k', 'l')]);
2049         assert_intersection(cls1, cls2, class(&[]));
2050     }
2051 
2052     #[test]
class_intersection_overlapping_alternating()2053     fn class_intersection_overlapping_alternating() {
2054         let cls1 = class(&[('a', 'b'), ('c', 'd'), ('e', 'f')]);
2055         let cls2 = class(&[('b', 'c'), ('d', 'e'), ('f', 'g')]);
2056         assert_intersection(cls1, cls2, class(&[('b', 'f')]));
2057     }
2058 
2059     #[test]
class_canon_overlap_many_case_fold()2060     fn class_canon_overlap_many_case_fold() {
2061         let cls = class(&[
2062             ('C', 'F'), ('A', 'G'), ('D', 'J'), ('A', 'C'),
2063             ('M', 'P'), ('L', 'S'), ('c', 'f'),
2064         ]);
2065         assert_eq!(cls.case_fold(), class(&[
2066             ('A', 'J'), ('L', 'S'),
2067             ('a', 'j'), ('l', 's'),
2068             ('\u{17F}', '\u{17F}'),
2069         ]));
2070 
2071         let cls = bclass(&[
2072             (b'C', b'F'), (b'A', b'G'), (b'D', b'J'), (b'A', b'C'),
2073             (b'M', b'P'), (b'L', b'S'), (b'c', b'f'),
2074         ]);
2075         assert_eq!(cls.case_fold(), bclass(&[
2076             (b'A', b'J'), (b'L', b'S'),
2077             (b'a', b'j'), (b'l', b's'),
2078         ]));
2079     }
2080 
2081     #[test]
class_fold_az()2082     fn class_fold_az() {
2083         let cls = class(&[('A', 'Z')]);
2084         assert_eq!(cls.case_fold(), class(&[
2085             ('A', 'Z'), ('a', 'z'),
2086             ('\u{17F}', '\u{17F}'),
2087             ('\u{212A}', '\u{212A}'),
2088         ]));
2089         let cls = class(&[('a', 'z')]);
2090         assert_eq!(cls.case_fold(), class(&[
2091             ('A', 'Z'), ('a', 'z'),
2092             ('\u{17F}', '\u{17F}'),
2093             ('\u{212A}', '\u{212A}'),
2094         ]));
2095 
2096         let cls = bclass(&[(b'A', b'Z')]);
2097         assert_eq!(cls.case_fold(), bclass(&[
2098             (b'A', b'Z'), (b'a', b'z'),
2099         ]));
2100         let cls = bclass(&[(b'a', b'z')]);
2101         assert_eq!(cls.case_fold(), bclass(&[
2102             (b'A', b'Z'), (b'a', b'z'),
2103         ]));
2104     }
2105 
2106     #[test]
class_fold_a_underscore()2107     fn class_fold_a_underscore() {
2108         let cls = class(&[('A', 'A'), ('_', '_')]);
2109         assert_eq!(cls.clone().canonicalize(), class(&[
2110             ('A', 'A'), ('_', '_'),
2111         ]));
2112         assert_eq!(cls.case_fold(), class(&[
2113             ('A', 'A'), ('_', '_'), ('a', 'a'),
2114         ]));
2115 
2116         let cls = bclass(&[(b'A', b'A'), (b'_', b'_')]);
2117         assert_eq!(cls.clone().canonicalize(), bclass(&[
2118             (b'A', b'A'), (b'_', b'_'),
2119         ]));
2120         assert_eq!(cls.case_fold(), bclass(&[
2121             (b'A', b'A'), (b'_', b'_'), (b'a', b'a'),
2122         ]));
2123     }
2124 
2125     #[test]
class_fold_a_equals()2126     fn class_fold_a_equals() {
2127         let cls = class(&[('A', 'A'), ('=', '=')]);
2128         assert_eq!(cls.clone().canonicalize(), class(&[
2129             ('=', '='), ('A', 'A'),
2130         ]));
2131         assert_eq!(cls.case_fold(), class(&[
2132             ('=', '='), ('A', 'A'), ('a', 'a'),
2133         ]));
2134 
2135         let cls = bclass(&[(b'A', b'A'), (b'=', b'=')]);
2136         assert_eq!(cls.clone().canonicalize(), bclass(&[
2137             (b'=', b'='), (b'A', b'A'),
2138         ]));
2139         assert_eq!(cls.case_fold(), bclass(&[
2140             (b'=', b'='), (b'A', b'A'), (b'a', b'a'),
2141         ]));
2142     }
2143 
2144     #[test]
class_fold_no_folding_needed()2145     fn class_fold_no_folding_needed() {
2146         let cls = class(&[('\x00', '\x10')]);
2147         assert_eq!(cls.case_fold(), class(&[
2148             ('\x00', '\x10'),
2149         ]));
2150 
2151         let cls = bclass(&[(b'\x00', b'\x10')]);
2152         assert_eq!(cls.case_fold(), bclass(&[
2153             (b'\x00', b'\x10'),
2154         ]));
2155     }
2156 
2157     #[test]
class_fold_negated()2158     fn class_fold_negated() {
2159         let cls = class(&[('x', 'x')]);
2160         assert_eq!(cls.clone().case_fold(), class(&[
2161             ('X', 'X'), ('x', 'x'),
2162         ]));
2163         assert_eq!(cls.case_fold().negate(), class(&[
2164             ('\x00', 'W'), ('Y', 'w'), ('y', '\u{10FFFF}'),
2165         ]));
2166 
2167         let cls = bclass(&[(b'x', b'x')]);
2168         assert_eq!(cls.clone().case_fold(), bclass(&[
2169             (b'X', b'X'), (b'x', b'x'),
2170         ]));
2171         assert_eq!(cls.case_fold().negate(), bclass(&[
2172             (b'\x00', b'W'), (b'Y', b'w'), (b'y', b'\xff'),
2173         ]));
2174     }
2175 
2176     #[test]
class_fold_single_to_multiple()2177     fn class_fold_single_to_multiple() {
2178         let cls = class(&[('k', 'k')]);
2179         assert_eq!(cls.case_fold(), class(&[
2180             ('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}'),
2181         ]));
2182 
2183         let cls = bclass(&[(b'k', b'k')]);
2184         assert_eq!(cls.case_fold(), bclass(&[
2185             (b'K', b'K'), (b'k', b'k'),
2186         ]));
2187     }
2188 
2189     #[test]
class_fold_at()2190     fn class_fold_at() {
2191         let cls = class(&[('@', '@')]);
2192         assert_eq!(cls.clone().canonicalize(), class(&[('@', '@')]));
2193         assert_eq!(cls.case_fold(), class(&[('@', '@')]));
2194 
2195         let cls = bclass(&[(b'@', b'@')]);
2196         assert_eq!(cls.clone().canonicalize(), bclass(&[(b'@', b'@')]));
2197         assert_eq!(cls.case_fold(), bclass(&[(b'@', b'@')]));
2198     }
2199 
2200     #[test]
roundtrip_class_hypen()2201     fn roundtrip_class_hypen() {
2202         let expr = e("[-./]");
2203         assert_eq!("(?u:[-\\.-/])", expr.to_string());
2204 
2205         let expr = e("(?-u)[-./]");
2206         assert_eq!("(?-u:[-\\.-/])", expr.to_string());
2207     }
2208 
assert_intersection(cls1: CharClass, cls2: CharClass, expected: CharClass)2209     fn assert_intersection(cls1: CharClass, cls2: CharClass, expected: CharClass) {
2210         // intersection operation should be commutative
2211         assert_eq!(cls1.intersection(&cls2), expected);
2212         assert_eq!(cls2.intersection(&cls1), expected);
2213     }
2214 }
2215