1 // Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
4 //
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
10
11 /*!
12 This crate provides a regular expression parser and an abstract syntax for
13 regular expressions. The abstract syntax is defined by the `Expr` type. The
14 concrete syntax is enumerated in the
15 [`regex`](../regex/index.html#syntax)
16 crate documentation.
17
18 Note that since this crate is first and foremost an implementation detail for
19 the `regex` crate, it may experience more frequent breaking changes. It is
20 exposed as a separate crate so that others may use it to do analysis on regular
21 expressions or even build their own matching engine.
22
23 # Example: parsing an expression
24
25 Parsing a regular expression can be done with the `Expr::parse` function.
26
27 ```rust
28 use regex_syntax::Expr;
29
30 assert_eq!(Expr::parse(r"ab|yz").unwrap(), Expr::Alternate(vec![
31 Expr::Literal { chars: vec!['a', 'b'], casei: false },
32 Expr::Literal { chars: vec!['y', 'z'], casei: false },
33 ]));
34 ```
35
36 # Example: inspecting an error
37
38 The parser in this crate provides very detailed error values. For example,
39 if an invalid character class range is given:
40
41 ```rust
42 use regex_syntax::{Expr, ErrorKind};
43
44 let err = Expr::parse(r"[z-a]").unwrap_err();
45 assert_eq!(err.position(), 4);
46 assert_eq!(err.kind(), &ErrorKind::InvalidClassRange {
47 start: 'z',
48 end: 'a',
49 });
50 ```
51
52 Or unbalanced parentheses:
53
54 ```rust
55 use regex_syntax::{Expr, ErrorKind};
56
57 let err = Expr::parse(r"ab(cd").unwrap_err();
58 assert_eq!(err.position(), 2);
59 assert_eq!(err.kind(), &ErrorKind::UnclosedParen);
60 ```
61 */
62
63 #![deny(missing_docs)]
64 #![cfg_attr(test, deny(warnings))]
65
66 #[cfg(test)] extern crate quickcheck;
67 #[cfg(test)] extern crate rand;
68
69 mod literals;
70 mod parser;
71 mod unicode;
72
73 use std::ascii;
74 use std::char;
75 use std::cmp::{Ordering, max, min};
76 use std::fmt;
77 use std::iter::IntoIterator;
78 use std::ops::Deref;
79 use std::result;
80 use std::slice;
81 use std::u8;
82 use std::vec;
83
84 use unicode::case_folding;
85
86 use self::Expr::*;
87 use self::Repeater::*;
88
89 use parser::{Flags, Parser};
90
91 pub use literals::{Literals, Lit};
92
93 /// A regular expression abstract syntax tree.
94 ///
95 /// An `Expr` represents the abstract syntax of a regular expression.
96 #[derive(Clone, Debug, PartialEq, Eq)]
97 pub enum Expr {
98 /// An empty regex (which never matches any text).
99 Empty,
100 /// A sequence of one or more literal characters to be matched.
101 Literal {
102 /// The characters.
103 chars: Vec<char>,
104 /// Whether to match case insensitively.
105 casei: bool,
106 },
107 /// A sequence of one or more literal bytes to be matched.
108 LiteralBytes {
109 /// The bytes.
110 bytes: Vec<u8>,
111 /// Whether to match case insensitively.
112 ///
113 /// The interpretation of "case insensitive" in this context is
114 /// ambiguous since `bytes` can be arbitrary. However, a good heuristic
115 /// is to assume that the bytes are ASCII-compatible and do simple
116 /// ASCII case folding.
117 casei: bool,
118 },
119 /// Match any character.
120 AnyChar,
121 /// Match any character, excluding new line (`0xA`).
122 AnyCharNoNL,
123 /// Match any byte.
124 AnyByte,
125 /// Match any byte, excluding new line (`0xA`).
126 AnyByteNoNL,
127 /// A character class.
128 Class(CharClass),
129 /// A character class with byte ranges only.
130 ClassBytes(ByteClass),
131 /// Match the start of a line or beginning of input.
132 StartLine,
133 /// Match the end of a line or end of input.
134 EndLine,
135 /// Match the beginning of input.
136 StartText,
137 /// Match the end of input.
138 EndText,
139 /// Match a word boundary (word character on one side and a non-word
140 /// character on the other).
141 WordBoundary,
142 /// Match a position that is not a word boundary (word or non-word
143 /// characters on both sides).
144 NotWordBoundary,
145 /// Match an ASCII word boundary.
146 WordBoundaryAscii,
147 /// Match a position that is not an ASCII word boundary.
148 NotWordBoundaryAscii,
149 /// A group, possibly non-capturing.
150 Group {
151 /// The expression inside the group.
152 e: Box<Expr>,
153 /// The capture index (starting at `1`) only for capturing groups.
154 i: Option<usize>,
155 /// The capture name, only for capturing named groups.
156 name: Option<String>,
157 },
158 /// A repeat operator (`?`, `*`, `+` or `{m,n}`).
159 Repeat {
160 /// The expression to be repeated. Limited to literals, `.`, classes
161 /// or grouped expressions.
162 e: Box<Expr>,
163 /// The type of repeat operator used.
164 r: Repeater,
165 /// Whether the repeat is greedy (match the most) or not (match the
166 /// least).
167 greedy: bool,
168 },
169 /// A concatenation of expressions. Must be matched one after the other.
170 ///
171 /// N.B. A concat expression can only appear at the top-level or
172 /// immediately inside a group expression.
173 Concat(Vec<Expr>),
174 /// An alternation of expressions. Only one must match.
175 ///
176 /// N.B. An alternate expression can only appear at the top-level or
177 /// immediately inside a group expression.
178 Alternate(Vec<Expr>),
179 }
180
181 type CaptureIndex = Option<usize>;
182
183 type CaptureName = Option<String>;
184
185 /// The type of a repeat operator expression.
186 #[derive(Clone, Copy, Debug, PartialEq, Eq)]
187 pub enum Repeater {
188 /// Match zero or one (`?`).
189 ZeroOrOne,
190 /// Match zero or more (`*`).
191 ZeroOrMore,
192 /// Match one or more (`+`).
193 OneOrMore,
194 /// Match for at least `min` and at most `max` (`{m,n}`).
195 ///
196 /// When `max` is `None`, there is no upper bound on the number of matches.
197 Range {
198 /// Lower bound on the number of matches.
199 min: u32,
200 /// Optional upper bound on the number of matches.
201 max: Option<u32>,
202 },
203 }
204
205 impl Repeater {
206 /// Returns true if and only if this repetition can match the empty string.
matches_empty(&self) -> bool207 fn matches_empty(&self) -> bool {
208 use self::Repeater::*;
209 match *self {
210 ZeroOrOne => true,
211 ZeroOrMore => true,
212 OneOrMore => false,
213 Range { min, .. } => min == 0,
214 }
215 }
216 }
217
218 /// A character class.
219 ///
220 /// A character class has a canonical format that the parser guarantees. Its
221 /// canonical format is defined by the following invariants:
222 ///
223 /// 1. Given any Unicode scalar value, it is matched by *at most* one character
224 /// range in a canonical character class.
225 /// 2. Every adjacent character range is separated by at least one Unicode
226 /// scalar value.
227 /// 3. Given any pair of character ranges `r1` and `r2`, if
228 /// `r1.end < r2.start`, then `r1` comes before `r2` in a canonical
229 /// character class.
230 ///
231 /// In sum, any `CharClass` produced by this crate's parser is a sorted
232 /// sequence of non-overlapping ranges. This makes it possible to test whether
233 /// a character is matched by a class with a binary search.
234 ///
235 /// If the case insensitive flag was set when parsing a character class, then
236 /// simple case folding is done automatically. For example, `(?i)[a-c]` is
237 /// automatically translated to `[a-cA-C]`.
238 #[derive(Clone, Debug, PartialEq, Eq)]
239 pub struct CharClass {
240 ranges: Vec<ClassRange>,
241 }
242
243 /// A single inclusive range in a character class.
244 ///
245 /// Since range boundaries are defined by Unicode scalar values, the boundaries
246 /// can never be in the open interval `(0xD7FF, 0xE000)`. However, a range may
247 /// *cover* codepoints that are not scalar values.
248 ///
249 /// Note that this has a few convenient impls on `PartialEq` and `PartialOrd`
250 /// for testing whether a character is contained inside a given range.
251 #[derive(Clone, Copy, Debug, PartialEq, PartialOrd, Eq, Ord)]
252 pub struct ClassRange {
253 /// The start character of the range.
254 ///
255 /// This must be less than or equal to `end`.
256 pub start: char,
257
258 /// The end character of the range.
259 ///
260 /// This must be greater than or equal to `start`.
261 pub end: char,
262 }
263
264 /// A byte class for byte ranges only.
265 ///
266 /// A byte class has a canonical format that the parser guarantees. Its
267 /// canonical format is defined by the following invariants:
268 ///
269 /// 1. Given any byte, it is matched by *at most* one byte range in a canonical
270 /// character class.
271 /// 2. Every adjacent byte range is separated by at least one byte.
272 /// 3. Given any pair of byte ranges `r1` and `r2`, if
273 /// `r1.end < r2.start`, then `r1` comes before `r2` in a canonical
274 /// character class.
275 ///
276 /// In sum, any `ByteClass` produced by this crate's parser is a sorted
277 /// sequence of non-overlapping ranges. This makes it possible to test whether
278 /// a byte is matched by a class with a binary search.
279 ///
280 /// If the case insensitive flag was set when parsing a character class,
281 /// then simple ASCII-only case folding is done automatically. For example,
282 /// `(?i)[a-c]` is automatically translated to `[a-cA-C]`.
283 #[derive(Clone, Debug, PartialEq, Eq)]
284 pub struct ByteClass {
285 ranges: Vec<ByteRange>,
286 }
287
288 /// A single inclusive range in a byte class.
289 ///
290 /// Note that this has a few convenient impls on `PartialEq` and `PartialOrd`
291 /// for testing whether a byte is contained inside a given range.
292 #[derive(Clone, Copy, Debug, PartialEq, PartialOrd, Eq, Ord)]
293 pub struct ByteRange {
294 /// The start byte of the range.
295 ///
296 /// This must be less than or equal to `end`.
297 pub start: u8,
298
299 /// The end byte of the range.
300 ///
301 /// This must be greater than or equal to `end`.
302 pub end: u8,
303 }
304
305 /// A builder for configuring regular expression parsing.
306 ///
307 /// This allows setting the default values of flags and other options, such
308 /// as the maximum nesting depth.
309 #[derive(Clone, Debug)]
310 pub struct ExprBuilder {
311 flags: Flags,
312 nest_limit: usize,
313 }
314
315 impl ExprBuilder {
316 /// Create a new builder for configuring expression parsing.
317 ///
318 /// Note that all flags are disabled by default.
new() -> ExprBuilder319 pub fn new() -> ExprBuilder {
320 ExprBuilder {
321 flags: Flags::default(),
322 nest_limit: 200,
323 }
324 }
325
326 /// Set the default value for the case insensitive (`i`) flag.
case_insensitive(mut self, yes: bool) -> ExprBuilder327 pub fn case_insensitive(mut self, yes: bool) -> ExprBuilder {
328 self.flags.casei = yes;
329 self
330 }
331
332 /// Set the default value for the multi-line matching (`m`) flag.
multi_line(mut self, yes: bool) -> ExprBuilder333 pub fn multi_line(mut self, yes: bool) -> ExprBuilder {
334 self.flags.multi = yes;
335 self
336 }
337
338 /// Set the default value for the any character (`s`) flag.
dot_matches_new_line(mut self, yes: bool) -> ExprBuilder339 pub fn dot_matches_new_line(mut self, yes: bool) -> ExprBuilder {
340 self.flags.dotnl = yes;
341 self
342 }
343
344 /// Set the default value for the greedy swap (`U`) flag.
swap_greed(mut self, yes: bool) -> ExprBuilder345 pub fn swap_greed(mut self, yes: bool) -> ExprBuilder {
346 self.flags.swap_greed = yes;
347 self
348 }
349
350 /// Set the default value for the ignore whitespace (`x`) flag.
ignore_whitespace(mut self, yes: bool) -> ExprBuilder351 pub fn ignore_whitespace(mut self, yes: bool) -> ExprBuilder {
352 self.flags.ignore_space = yes;
353 self
354 }
355
356 /// Set the default value for the Unicode (`u`) flag.
357 ///
358 /// If `yes` is false, then `allow_bytes` is set to true.
unicode(mut self, yes: bool) -> ExprBuilder359 pub fn unicode(mut self, yes: bool) -> ExprBuilder {
360 self.flags.unicode = yes;
361 if !yes {
362 self.allow_bytes(true)
363 } else {
364 self
365 }
366 }
367
368 /// Whether the parser allows matching arbitrary bytes or not.
369 ///
370 /// When the `u` flag is disabled (either with this builder or in the
371 /// expression itself), the parser switches to interpreting the expression
372 /// as matching arbitrary bytes instead of Unicode codepoints. For example,
373 /// the expression `(?u:\xFF)` matches the *codepoint* `\xFF`, which
374 /// corresponds to the UTF-8 byte sequence `\xCE\xBF`. Conversely,
375 /// `(?-u:\xFF)` matches the *byte* `\xFF`, which is not valid UTF-8.
376 ///
377 /// When `allow_bytes` is disabled (the default), an expression like
378 /// `(?-u:\xFF)` will cause the parser to return an error, since it would
379 /// otherwise match invalid UTF-8. When enabled, it will be allowed.
allow_bytes(mut self, yes: bool) -> ExprBuilder380 pub fn allow_bytes(mut self, yes: bool) -> ExprBuilder {
381 self.flags.allow_bytes = yes;
382 self
383 }
384
385 /// Set the nesting limit for regular expression parsing.
386 ///
387 /// Regular expressions that nest more than this limit will result in a
388 /// `StackExhausted` error.
nest_limit(mut self, limit: usize) -> ExprBuilder389 pub fn nest_limit(mut self, limit: usize) -> ExprBuilder {
390 self.nest_limit = limit;
391 self
392 }
393
394 /// Parse a string as a regular expression using the current configuraiton.
parse(self, s: &str) -> Result<Expr>395 pub fn parse(self, s: &str) -> Result<Expr> {
396 Parser::parse(s, self.flags).and_then(|e| e.simplify(self.nest_limit))
397 }
398 }
399
400 impl Expr {
401 /// Parses a string in a regular expression syntax tree.
402 ///
403 /// This is a convenience method for parsing an expression using the
404 /// default configuration. To tweak parsing options (such as which flags
405 /// are enabled by default), use the `ExprBuilder` type.
parse(s: &str) -> Result<Expr>406 pub fn parse(s: &str) -> Result<Expr> {
407 ExprBuilder::new().parse(s)
408 }
409
410 /// Returns true iff the expression can be repeated by a quantifier.
can_repeat(&self) -> bool411 fn can_repeat(&self) -> bool {
412 match *self {
413 Literal{..} | LiteralBytes{..}
414 | AnyChar | AnyCharNoNL | AnyByte | AnyByteNoNL
415 | Class(_) | ClassBytes(_)
416 | StartLine | EndLine | StartText | EndText
417 | WordBoundary | NotWordBoundary
418 | WordBoundaryAscii | NotWordBoundaryAscii
419 | Group{..}
420 => true,
421 _ => false,
422 }
423 }
424
simplify(self, nest_limit: usize) -> Result<Expr>425 fn simplify(self, nest_limit: usize) -> Result<Expr> {
426 fn combine_literals(es: &mut Vec<Expr>, e: Expr) {
427 match (es.pop(), e) {
428 (None, e) => es.push(e),
429 (Some(Literal { chars: mut chars1, casei: casei1 }),
430 Literal { chars: chars2, casei: casei2 }) => {
431 if casei1 == casei2 {
432 chars1.extend(chars2);
433 es.push(Literal { chars: chars1, casei: casei1 });
434 } else {
435 es.push(Literal { chars: chars1, casei: casei1 });
436 es.push(Literal { chars: chars2, casei: casei2 });
437 }
438 }
439 (Some(LiteralBytes { bytes: mut bytes1, casei: casei1 }),
440 LiteralBytes { bytes: bytes2, casei: casei2 }) => {
441 if casei1 == casei2 {
442 bytes1.extend(bytes2);
443 es.push(LiteralBytes { bytes: bytes1, casei: casei1 });
444 } else {
445 es.push(LiteralBytes { bytes: bytes1, casei: casei1 });
446 es.push(LiteralBytes { bytes: bytes2, casei: casei2 });
447 }
448 }
449 (Some(e1), e2) => {
450 es.push(e1);
451 es.push(e2);
452 }
453 }
454 }
455 fn simp(expr: Expr, recurse: usize, limit: usize) -> Result<Expr> {
456 if recurse > limit {
457 return Err(Error {
458 pos: 0,
459 surround: "".to_owned(),
460 kind: ErrorKind::StackExhausted,
461 });
462 }
463 let simplify = |e| simp(e, recurse + 1, limit);
464 Ok(match expr {
465 Repeat { e, r, greedy } => Repeat {
466 e: Box::new(try!(simplify(*e))),
467 r: r,
468 greedy: greedy,
469 },
470 Group { e, i, name } => {
471 let e = try!(simplify(*e));
472 if i.is_none() && name.is_none() && e.can_repeat() {
473 e
474 } else {
475 Group { e: Box::new(e), i: i, name: name }
476 }
477 }
478 Concat(es) => {
479 let mut new_es = Vec::with_capacity(es.len());
480 for e in es {
481 combine_literals(&mut new_es, try!(simplify(e)));
482 }
483 if new_es.len() == 1 {
484 new_es.pop().unwrap()
485 } else {
486 Concat(new_es)
487 }
488 }
489 Alternate(es) => {
490 let mut new_es = Vec::with_capacity(es.len());
491 for e in es {
492 new_es.push(try!(simplify(e)));
493 }
494 Alternate(new_es)
495 }
496 e => e,
497 })
498 }
499 simp(self, 0, nest_limit)
500 }
501
502 /// Returns a set of literal prefixes extracted from this expression.
prefixes(&self) -> Literals503 pub fn prefixes(&self) -> Literals {
504 let mut lits = Literals::empty();
505 lits.union_prefixes(self);
506 lits
507 }
508
509 /// Returns a set of literal suffixes extracted from this expression.
suffixes(&self) -> Literals510 pub fn suffixes(&self) -> Literals {
511 let mut lits = Literals::empty();
512 lits.union_suffixes(self);
513 lits
514 }
515
516 /// Returns true if and only if the expression is required to match from
517 /// the beginning of text.
is_anchored_start(&self) -> bool518 pub fn is_anchored_start(&self) -> bool {
519 match *self {
520 Repeat { ref e, r, .. } => {
521 !r.matches_empty() && e.is_anchored_start()
522 }
523 Group { ref e, .. } => e.is_anchored_start(),
524 Concat(ref es) => es[0].is_anchored_start(),
525 Alternate(ref es) => es.iter().all(|e| e.is_anchored_start()),
526 StartText => true,
527 _ => false,
528 }
529 }
530
531 /// Returns true if and only if the expression has at least one matchable
532 /// sub-expression that must match the beginning of text.
has_anchored_start(&self) -> bool533 pub fn has_anchored_start(&self) -> bool {
534 match *self {
535 Repeat { ref e, r, .. } => {
536 !r.matches_empty() && e.has_anchored_start()
537 }
538 Group { ref e, .. } => e.has_anchored_start(),
539 Concat(ref es) => es[0].has_anchored_start(),
540 Alternate(ref es) => es.iter().any(|e| e.has_anchored_start()),
541 StartText => true,
542 _ => false,
543 }
544 }
545
546 /// Returns true if and only if the expression is required to match at the
547 /// end of the text.
is_anchored_end(&self) -> bool548 pub fn is_anchored_end(&self) -> bool {
549 match *self {
550 Repeat { ref e, r, .. } => {
551 !r.matches_empty() && e.is_anchored_end()
552 }
553 Group { ref e, .. } => e.is_anchored_end(),
554 Concat(ref es) => es[es.len() - 1].is_anchored_end(),
555 Alternate(ref es) => es.iter().all(|e| e.is_anchored_end()),
556 EndText => true,
557 _ => false,
558 }
559 }
560
561 /// Returns true if and only if the expression has at least one matchable
562 /// sub-expression that must match the beginning of text.
has_anchored_end(&self) -> bool563 pub fn has_anchored_end(&self) -> bool {
564 match *self {
565 Repeat { ref e, r, .. } => {
566 !r.matches_empty() && e.has_anchored_end()
567 }
568 Group { ref e, .. } => e.has_anchored_end(),
569 Concat(ref es) => es[es.len() - 1].has_anchored_end(),
570 Alternate(ref es) => es.iter().any(|e| e.has_anchored_end()),
571 EndText => true,
572 _ => false,
573 }
574 }
575
576 /// Returns true if and only if the expression contains sub-expressions
577 /// that can match arbitrary bytes.
has_bytes(&self) -> bool578 pub fn has_bytes(&self) -> bool {
579 match *self {
580 Repeat { ref e, .. } => e.has_bytes(),
581 Group { ref e, .. } => e.has_bytes(),
582 Concat(ref es) => es.iter().any(|e| e.has_bytes()),
583 Alternate(ref es) => es.iter().any(|e| e.has_bytes()),
584 LiteralBytes{..} => true,
585 AnyByte | AnyByteNoNL => true,
586 ClassBytes(_) => true,
587 WordBoundaryAscii | NotWordBoundaryAscii => true,
588 _ => false,
589 }
590 }
591 }
592
593 impl Deref for CharClass {
594 type Target = Vec<ClassRange>;
deref(&self) -> &Vec<ClassRange>595 fn deref(&self) -> &Vec<ClassRange> { &self.ranges }
596 }
597
598 impl IntoIterator for CharClass {
599 type Item = ClassRange;
600 type IntoIter = vec::IntoIter<ClassRange>;
into_iter(self) -> vec::IntoIter<ClassRange>601 fn into_iter(self) -> vec::IntoIter<ClassRange> { self.ranges.into_iter() }
602 }
603
604 impl<'a> IntoIterator for &'a CharClass {
605 type Item = &'a ClassRange;
606 type IntoIter = slice::Iter<'a, ClassRange>;
into_iter(self) -> slice::Iter<'a, ClassRange>607 fn into_iter(self) -> slice::Iter<'a, ClassRange> { self.iter() }
608 }
609
610 impl CharClass {
611 /// Create a new class from an existing set of ranges.
new(ranges: Vec<ClassRange>) -> CharClass612 pub fn new(ranges: Vec<ClassRange>) -> CharClass {
613 CharClass { ranges: ranges }
614 }
615
616 /// Create an empty class.
empty() -> CharClass617 fn empty() -> CharClass {
618 CharClass::new(Vec::new())
619 }
620
621 /// Returns true if `c` is matched by this character class.
matches(&self, c: char) -> bool622 pub fn matches(&self, c: char) -> bool {
623 self.binary_search_by(|range| c.partial_cmp(range).unwrap()).is_ok()
624 }
625
626 /// Removes the given character from the class if it exists.
627 ///
628 /// Note that this takes `O(n)` time in the number of ranges.
remove(&mut self, c: char)629 pub fn remove(&mut self, c: char) {
630 let mut i = match self.binary_search_by(|r| c.partial_cmp(r).unwrap()) {
631 Ok(i) => i,
632 Err(_) => return,
633 };
634 let mut r = self.ranges.remove(i);
635 if r.start == c {
636 r.start = inc_char(c);
637 if r.start > r.end || c == char::MAX {
638 return;
639 }
640 self.ranges.insert(i, r);
641 } else if r.end == c {
642 r.end = dec_char(c);
643 if r.end < r.start || c == '\x00' {
644 return;
645 }
646 self.ranges.insert(0, r);
647 } else {
648 let (mut r1, mut r2) = (r.clone(), r.clone());
649 r1.end = dec_char(c);
650 if r1.start <= r1.end {
651 self.ranges.insert(i, r1);
652 i += 1;
653 }
654 r2.start = inc_char(c);
655 if r2.start <= r2.end {
656 self.ranges.insert(i, r2);
657 }
658 }
659 }
660
661 /// Create a new empty class from this one.
to_empty(&self) -> CharClass662 fn to_empty(&self) -> CharClass {
663 CharClass { ranges: Vec::with_capacity(self.len()) }
664 }
665
666 /// Create a byte class from this character class.
667 ///
668 /// Codepoints above 0xFF are removed.
to_byte_class(self) -> ByteClass669 fn to_byte_class(self) -> ByteClass {
670 ByteClass::new(
671 self.ranges.into_iter()
672 .filter_map(|r| r.to_byte_range())
673 .collect()).canonicalize()
674 }
675
676 /// Merge two classes and canonicalize them.
677 #[cfg(test)]
merge(mut self, other: CharClass) -> CharClass678 fn merge(mut self, other: CharClass) -> CharClass {
679 self.ranges.extend(other);
680 self.canonicalize()
681 }
682
683 /// Canonicalize any sequence of ranges.
684 ///
685 /// This is responsible for enforcing the canonical format invariants
686 /// as described on the docs for the `CharClass` type.
canonicalize(mut self) -> CharClass687 fn canonicalize(mut self) -> CharClass {
688 // TODO: Save some cycles here by checking if already canonicalized.
689 self.ranges.sort();
690 let mut ordered = self.to_empty(); // TODO: Do this in place?
691 for candidate in self {
692 // If the candidate overlaps with an existing range, then it must
693 // be the most recent range added because we process the candidates
694 // in order.
695 if let Some(or) = ordered.ranges.last_mut() {
696 if or.overlapping(candidate) {
697 *or = or.merge(candidate);
698 continue;
699 }
700 }
701 ordered.ranges.push(candidate);
702 }
703 ordered
704 }
705
706 /// Calculate the intersection of two canonical character classes.
707 ///
708 /// The returned intersection is canonical.
intersection(&self, other: &CharClass) -> CharClass709 fn intersection(&self, other: &CharClass) -> CharClass {
710 if self.ranges.is_empty() || other.ranges.is_empty() {
711 return CharClass::empty();
712 }
713
714 let mut intersection = CharClass::empty();
715
716 let mut iter_a = self.ranges.iter();
717 let mut iter_b = other.ranges.iter();
718 let mut a = iter_a.next().unwrap();
719 let mut b = iter_b.next().unwrap();
720 loop {
721 if let Some(i) = a.intersection(&b) {
722 intersection.ranges.push(i);
723 }
724
725 // If the range with the smaller end didn't match this time,
726 // it won't ever match, so move on to the next one.
727 let (iter, item) = if a.end < b.end {
728 (&mut iter_a, &mut a)
729 } else {
730 (&mut iter_b, &mut b)
731 };
732 match iter.next() {
733 Some(v) => *item = v,
734 None => break, // no more ranges to check, done
735 }
736 }
737
738 intersection.canonicalize()
739 }
740
741 /// Negates the character class.
742 ///
743 /// For all `c` where `c` is a Unicode scalar value, `c` matches `self`
744 /// if and only if `c` does not match `self.negate()`.
negate(mut self) -> CharClass745 pub fn negate(mut self) -> CharClass {
746 fn range(s: char, e: char) -> ClassRange { ClassRange::new(s, e) }
747
748 if self.is_empty() {
749 // Inverting an empty range yields all of Unicode.
750 return CharClass {
751 ranges: vec![ClassRange { start: '\x00', end: '\u{10ffff}' }],
752 };
753 }
754 self = self.canonicalize();
755 let mut inv = self.to_empty();
756 if self[0].start > '\x00' {
757 inv.ranges.push(range('\x00', dec_char(self[0].start)));
758 }
759 for win in self.windows(2) {
760 inv.ranges.push(range(inc_char(win[0].end),
761 dec_char(win[1].start)));
762 }
763 if self[self.len() - 1].end < char::MAX {
764 inv.ranges.push(range(inc_char(self[self.len() - 1].end),
765 char::MAX));
766 }
767 inv
768 }
769
770 /// Apply case folding to this character class.
771 ///
772 /// N.B. Applying case folding to a negated character class probably
773 /// won't produce the expected result. e.g., `(?i)[^x]` really should
774 /// match any character sans `x` and `X`, but if `[^x]` is negated
775 /// before being case folded, you'll end up matching any character.
case_fold(self) -> CharClass776 pub fn case_fold(self) -> CharClass {
777 let mut folded = self.to_empty();
778 for r in self {
779 // Applying case folding to a range is expensive because *every*
780 // character needs to be examined. Thus, we avoid that drudgery
781 // if no character in the current range is in our case folding
782 // table.
783 if r.needs_case_folding() {
784 folded.ranges.extend(r.case_fold());
785 }
786 folded.ranges.push(r);
787 }
788 folded.canonicalize()
789 }
790
791 /// Returns the number of characters that match this class.
num_chars(&self) -> usize792 fn num_chars(&self) -> usize {
793 self.ranges.iter()
794 .map(|&r| 1 + (r.end as u32) - (r.start as u32))
795 .fold(0, |acc, len| acc + len)
796 as usize
797 }
798 }
799
800 impl ClassRange {
801 /// Create a new class range.
802 ///
803 /// If `end < start`, then the two values are swapped so that
804 /// the invariant `start <= end` is preserved.
new(start: char, end: char) -> ClassRange805 fn new(start: char, end: char) -> ClassRange {
806 if start <= end {
807 ClassRange { start: start, end: end }
808 } else {
809 ClassRange { start: end, end: start }
810 }
811 }
812
813 /// Translate this to a byte class.
814 ///
815 /// If the start codepoint exceeds 0xFF, then this returns `None`.
816 ///
817 /// If the end codepoint exceeds 0xFF, then it is set to 0xFF.
to_byte_range(self) -> Option<ByteRange>818 fn to_byte_range(self) -> Option<ByteRange> {
819 if self.start > '\u{FF}' {
820 None
821 } else {
822 let s = self.start as u8;
823 let e = min('\u{FF}', self.end) as u8;
824 Some(ByteRange::new(s, e))
825 }
826 }
827
828 /// Create a range of one character.
one(c: char) -> ClassRange829 fn one(c: char) -> ClassRange {
830 ClassRange { start: c, end: c }
831 }
832
833 /// Returns true if and only if the two ranges are overlapping. Note that
834 /// since ranges are inclusive, `a-c` and `d-f` are overlapping!
overlapping(self, other: ClassRange) -> bool835 fn overlapping(self, other: ClassRange) -> bool {
836 max(self.start, other.start) <= inc_char(min(self.end, other.end))
837 }
838
839 /// Returns the intersection of the two ranges if they have common
840 /// characters, `None` otherwise.
intersection(&self, other: &ClassRange) -> Option<ClassRange>841 fn intersection(&self, other: &ClassRange) -> Option<ClassRange> {
842 let start = max(self.start, other.start);
843 let end = min(self.end, other.end);
844 if start <= end {
845 Some(ClassRange::new(start, end))
846 } else {
847 None
848 }
849 }
850
851 /// Creates a new range representing the union of `self` and `other.
merge(self, other: ClassRange) -> ClassRange852 fn merge(self, other: ClassRange) -> ClassRange {
853 ClassRange {
854 start: min(self.start, other.start),
855 end: max(self.end, other.end),
856 }
857 }
858
859 /// Returns true if and only if this range contains a character that is
860 /// in the case folding table.
needs_case_folding(self) -> bool861 fn needs_case_folding(self) -> bool {
862 case_folding::C_plus_S_both_table
863 .binary_search_by(|&(c, _)| self.partial_cmp(&c).unwrap()).is_ok()
864 }
865
866 /// Apply case folding to this range.
867 ///
868 /// Since case folding might add characters such that the range is no
869 /// longer contiguous, this returns multiple class ranges. They are in
870 /// canonical order.
case_fold(self) -> Vec<ClassRange>871 fn case_fold(self) -> Vec<ClassRange> {
872 let table = &case_folding::C_plus_S_both_table;
873 let (s, e) = (self.start as u32, self.end as u32 + 1);
874 let mut start = self.start;
875 let mut end = start;
876 let mut next_case_fold = '\x00';
877 let mut ranges = Vec::with_capacity(10);
878 for mut c in (s..e).filter_map(char::from_u32) {
879 if c >= next_case_fold {
880 c = match simple_case_fold_both_result(c) {
881 Ok(i) => {
882 for &(c1, c2) in &table[i..] {
883 if c1 != c {
884 break;
885 }
886 if c2 != inc_char(end) {
887 ranges.push(ClassRange::new(start, end));
888 start = c2;
889 }
890 end = c2;
891 }
892 continue;
893 }
894 Err(i) => {
895 if i < table.len() {
896 next_case_fold = table[i].0;
897 } else {
898 next_case_fold = '\u{10FFFF}';
899 }
900 c
901 }
902 };
903 }
904 // The fast path. We know this character doesn't have an entry
905 // in the case folding table.
906 if c != inc_char(end) {
907 ranges.push(ClassRange::new(start, end));
908 start = c;
909 }
910 end = c;
911 }
912 ranges.push(ClassRange::new(start, end));
913 ranges
914 }
915 }
916
917 impl PartialEq<char> for ClassRange {
918 #[inline]
eq(&self, other: &char) -> bool919 fn eq(&self, other: &char) -> bool {
920 self.start <= *other && *other <= self.end
921 }
922 }
923
924 impl PartialEq<ClassRange> for char {
925 #[inline]
eq(&self, other: &ClassRange) -> bool926 fn eq(&self, other: &ClassRange) -> bool {
927 other.eq(self)
928 }
929 }
930
931 impl PartialOrd<char> for ClassRange {
932 #[inline]
partial_cmp(&self, other: &char) -> Option<Ordering>933 fn partial_cmp(&self, other: &char) -> Option<Ordering> {
934 Some(if self == other {
935 Ordering::Equal
936 } else if *other > self.end {
937 Ordering::Greater
938 } else {
939 Ordering::Less
940 })
941 }
942 }
943
944 impl PartialOrd<ClassRange> for char {
945 #[inline]
partial_cmp(&self, other: &ClassRange) -> Option<Ordering>946 fn partial_cmp(&self, other: &ClassRange) -> Option<Ordering> {
947 other.partial_cmp(self).map(|o| o.reverse())
948 }
949 }
950
951 impl ByteClass {
952 /// Create a new class from an existing set of ranges.
new(ranges: Vec<ByteRange>) -> ByteClass953 pub fn new(ranges: Vec<ByteRange>) -> ByteClass {
954 ByteClass { ranges: ranges }
955 }
956
957 /// Returns true if `b` is matched by this byte class.
matches(&self, b: u8) -> bool958 pub fn matches(&self, b: u8) -> bool {
959 self.binary_search_by(|range| b.partial_cmp(range).unwrap()).is_ok()
960 }
961
962 /// Removes the given byte from the class if it exists.
963 ///
964 /// Note that this takes `O(n)` time in the number of ranges.
remove(&mut self, b: u8)965 pub fn remove(&mut self, b: u8) {
966 let mut i = match self.binary_search_by(|r| b.partial_cmp(r).unwrap()) {
967 Ok(i) => i,
968 Err(_) => return,
969 };
970 let mut r = self.ranges.remove(i);
971 if r.start == b {
972 r.start = b.saturating_add(1);
973 if r.start > r.end || b == u8::MAX {
974 return;
975 }
976 self.ranges.insert(i, r);
977 } else if r.end == b {
978 r.end = b.saturating_sub(1);
979 if r.end < r.start || b == b'\x00' {
980 return;
981 }
982 self.ranges.insert(0, r);
983 } else {
984 let (mut r1, mut r2) = (r.clone(), r.clone());
985 r1.end = b.saturating_sub(1);
986 if r1.start <= r1.end {
987 self.ranges.insert(i, r1);
988 i += 1;
989 }
990 r2.start = b.saturating_add(1);
991 if r2.start <= r2.end {
992 self.ranges.insert(i, r2);
993 }
994 }
995 }
996
997 /// Create a new empty class from this one.
to_empty(&self) -> ByteClass998 fn to_empty(&self) -> ByteClass {
999 ByteClass { ranges: Vec::with_capacity(self.len()) }
1000 }
1001
1002 /// Canonicalze any sequence of ranges.
1003 ///
1004 /// This is responsible for enforcing the canonical format invariants
1005 /// as described on the docs for the `ByteClass` type.
canonicalize(mut self) -> ByteClass1006 fn canonicalize(mut self) -> ByteClass {
1007 // TODO: Save some cycles here by checking if already canonicalized.
1008 self.ranges.sort();
1009 let mut ordered = self.to_empty(); // TODO: Do this in place?
1010 for candidate in self {
1011 // If the candidate overlaps with an existing range, then it must
1012 // be the most recent range added because we process the candidates
1013 // in order.
1014 if let Some(or) = ordered.ranges.last_mut() {
1015 if or.overlapping(candidate) {
1016 *or = or.merge(candidate);
1017 continue;
1018 }
1019 }
1020 ordered.ranges.push(candidate);
1021 }
1022 ordered
1023 }
1024
1025 /// Negates the byte class.
1026 ///
1027 /// For all `b` where `b` is a byte, `b` matches `self` if and only if `b`
1028 /// does not match `self.negate()`.
negate(mut self) -> ByteClass1029 pub fn negate(mut self) -> ByteClass {
1030 fn range(s: u8, e: u8) -> ByteRange { ByteRange::new(s, e) }
1031
1032 if self.is_empty() {
1033 // Inverting an empty range yields all bytes.
1034 return ByteClass {
1035 ranges: vec![ByteRange { start: b'\x00', end: b'\xff' }],
1036 };
1037 }
1038 self = self.canonicalize();
1039 let mut inv = self.to_empty();
1040 if self[0].start > b'\x00' {
1041 inv.ranges.push(range(b'\x00', self[0].start.saturating_sub(1)));
1042 }
1043 for win in self.windows(2) {
1044 inv.ranges.push(range(win[0].end.saturating_add(1),
1045 win[1].start.saturating_sub(1)));
1046 }
1047 if self[self.len() - 1].end < u8::MAX {
1048 inv.ranges.push(range(self[self.len() - 1].end.saturating_add(1),
1049 u8::MAX));
1050 }
1051 inv
1052 }
1053
1054 /// Apply case folding to this byte class.
1055 ///
1056 /// This assumes that the bytes in the ranges are ASCII compatible.
1057 ///
1058 /// N.B. Applying case folding to a negated character class probably
1059 /// won't produce the expected result. e.g., `(?i)[^x]` really should
1060 /// match any character sans `x` and `X`, but if `[^x]` is negated
1061 /// before being case folded, you'll end up matching any character.
case_fold(self) -> ByteClass1062 pub fn case_fold(self) -> ByteClass {
1063 let mut folded = self.to_empty();
1064 for r in self {
1065 folded.ranges.extend(r.case_fold());
1066 }
1067 folded.canonicalize()
1068 }
1069
1070 /// Returns the number of bytes that match this class.
num_bytes(&self) -> usize1071 fn num_bytes(&self) -> usize {
1072 self.ranges.iter()
1073 .map(|&r| 1 + (r.end as u32) - (r.start as u32))
1074 .fold(0, |acc, len| acc + len)
1075 as usize
1076 }
1077 }
1078
1079 impl ByteRange {
1080 /// Create a new class range.
1081 ///
1082 /// If `end < start`, then the two values are swapped so that
1083 /// the invariant `start <= end` is preserved.
new(start: u8, end: u8) -> ByteRange1084 fn new(start: u8, end: u8) -> ByteRange {
1085 if start <= end {
1086 ByteRange { start: start, end: end }
1087 } else {
1088 ByteRange { start: end, end: start }
1089 }
1090 }
1091
1092 /// Returns true if and only if the two ranges are overlapping. Note that
1093 /// since ranges are inclusive, `a-c` and `d-f` are overlapping!
overlapping(self, other: ByteRange) -> bool1094 fn overlapping(self, other: ByteRange) -> bool {
1095 max(self.start, other.start)
1096 <= min(self.end, other.end).saturating_add(1)
1097 }
1098
1099 /// Returns true if and only if the intersection of self and other is non
1100 /// empty.
is_intersect_empty(self, other: ByteRange) -> bool1101 fn is_intersect_empty(self, other: ByteRange) -> bool {
1102 max(self.start, other.start) > min(self.end, other.end)
1103 }
1104
1105 /// Creates a new range representing the union of `self` and `other.
merge(self, other: ByteRange) -> ByteRange1106 fn merge(self, other: ByteRange) -> ByteRange {
1107 ByteRange {
1108 start: min(self.start, other.start),
1109 end: max(self.end, other.end),
1110 }
1111 }
1112
1113 /// Apply case folding to this range.
1114 ///
1115 /// Since case folding might add bytes such that the range is no
1116 /// longer contiguous, this returns multiple byte ranges.
1117 ///
1118 /// This assumes that the bytes in this range are ASCII compatible.
case_fold(self) -> Vec<ByteRange>1119 fn case_fold(self) -> Vec<ByteRange> {
1120 // So much easier than Unicode case folding!
1121 let mut ranges = vec![self];
1122 if !ByteRange::new(b'a', b'z').is_intersect_empty(self) {
1123 let lower = max(self.start, b'a');
1124 let upper = min(self.end, b'z');
1125 ranges.push(ByteRange::new(lower - 32, upper - 32));
1126 }
1127 if !ByteRange::new(b'A', b'Z').is_intersect_empty(self) {
1128 let lower = max(self.start, b'A');
1129 let upper = min(self.end, b'Z');
1130 ranges.push(ByteRange::new(lower + 32, upper + 32));
1131 }
1132 ranges
1133 }
1134 }
1135
1136 impl Deref for ByteClass {
1137 type Target = Vec<ByteRange>;
deref(&self) -> &Vec<ByteRange>1138 fn deref(&self) -> &Vec<ByteRange> { &self.ranges }
1139 }
1140
1141 impl IntoIterator for ByteClass {
1142 type Item = ByteRange;
1143 type IntoIter = vec::IntoIter<ByteRange>;
into_iter(self) -> vec::IntoIter<ByteRange>1144 fn into_iter(self) -> vec::IntoIter<ByteRange> { self.ranges.into_iter() }
1145 }
1146
1147 impl<'a> IntoIterator for &'a ByteClass {
1148 type Item = &'a ByteRange;
1149 type IntoIter = slice::Iter<'a, ByteRange>;
into_iter(self) -> slice::Iter<'a, ByteRange>1150 fn into_iter(self) -> slice::Iter<'a, ByteRange> { self.iter() }
1151 }
1152
1153 impl PartialEq<u8> for ByteRange {
1154 #[inline]
eq(&self, other: &u8) -> bool1155 fn eq(&self, other: &u8) -> bool {
1156 self.start <= *other && *other <= self.end
1157 }
1158 }
1159
1160 impl PartialEq<ByteRange> for u8 {
1161 #[inline]
eq(&self, other: &ByteRange) -> bool1162 fn eq(&self, other: &ByteRange) -> bool {
1163 other.eq(self)
1164 }
1165 }
1166
1167 impl PartialOrd<u8> for ByteRange {
1168 #[inline]
partial_cmp(&self, other: &u8) -> Option<Ordering>1169 fn partial_cmp(&self, other: &u8) -> Option<Ordering> {
1170 Some(if self == other {
1171 Ordering::Equal
1172 } else if *other > self.end {
1173 Ordering::Greater
1174 } else {
1175 Ordering::Less
1176 })
1177 }
1178 }
1179
1180 impl PartialOrd<ByteRange> for u8 {
1181 #[inline]
partial_cmp(&self, other: &ByteRange) -> Option<Ordering>1182 fn partial_cmp(&self, other: &ByteRange) -> Option<Ordering> {
1183 other.partial_cmp(self).map(|o| o.reverse())
1184 }
1185 }
1186
1187 /// This implementation of `Display` will write a regular expression from the
1188 /// syntax tree. It does not write the original string parsed.
1189 impl fmt::Display for Expr {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result1190 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1191 match *self {
1192 Empty => write!(f, ""),
1193 Literal { ref chars, casei } => {
1194 if casei {
1195 try!(write!(f, "(?iu:"));
1196 } else {
1197 try!(write!(f, "(?u:"));
1198 }
1199 for &c in chars {
1200 try!(write!(f, "{}", quote_char(c)));
1201 }
1202 try!(write!(f, ")"));
1203 Ok(())
1204 }
1205 LiteralBytes { ref bytes, casei } => {
1206 if casei {
1207 try!(write!(f, "(?i-u:"));
1208 } else {
1209 try!(write!(f, "(?-u:"));
1210 }
1211 for &b in bytes {
1212 try!(write!(f, "{}", quote_byte(b)));
1213 }
1214 try!(write!(f, ")"));
1215 Ok(())
1216 }
1217 AnyChar => write!(f, "(?su:.)"),
1218 AnyCharNoNL => write!(f, "(?u:.)"),
1219 AnyByte => write!(f, "(?s-u:.)"),
1220 AnyByteNoNL => write!(f, "(?-u:.)"),
1221 Class(ref cls) => write!(f, "{}", cls),
1222 ClassBytes(ref cls) => write!(f, "{}", cls),
1223 StartLine => write!(f, "(?m:^)"),
1224 EndLine => write!(f, "(?m:$)"),
1225 StartText => write!(f, r"^"),
1226 EndText => write!(f, r"$"),
1227 WordBoundary => write!(f, r"(?u:\b)"),
1228 NotWordBoundary => write!(f, r"(?u:\B)"),
1229 WordBoundaryAscii => write!(f, r"(?-u:\b)"),
1230 NotWordBoundaryAscii => write!(f, r"(?-u:\B)"),
1231 Group { ref e, i: None, name: None } => write!(f, "(?:{})", e),
1232 Group { ref e, name: None, .. } => write!(f, "({})", e),
1233 Group { ref e, name: Some(ref n), .. } => {
1234 write!(f, "(?P<{}>{})", n, e)
1235 }
1236 Repeat { ref e, r, greedy } => {
1237 match &**e {
1238 &Literal { ref chars, .. } if chars.len() > 1 => {
1239 try!(write!(f, "(?:{}){}", e, r))
1240 }
1241 _ => try!(write!(f, "{}{}", e, r)),
1242 }
1243 if !greedy { try!(write!(f, "?")); }
1244 Ok(())
1245 }
1246 Concat(ref es) => {
1247 for e in es {
1248 try!(write!(f, "{}", e));
1249 }
1250 Ok(())
1251 }
1252 Alternate(ref es) => {
1253 for (i, e) in es.iter().enumerate() {
1254 if i > 0 { try!(write!(f, "|")); }
1255 try!(write!(f, "{}", e));
1256 }
1257 Ok(())
1258 }
1259 }
1260 }
1261 }
1262
1263 impl fmt::Display for Repeater {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result1264 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1265 match *self {
1266 ZeroOrOne => write!(f, "?"),
1267 ZeroOrMore => write!(f, "*"),
1268 OneOrMore => write!(f, "+"),
1269 Range { min: s, max: None } => write!(f, "{{{},}}", s),
1270 Range { min: s, max: Some(e) } if s == e => write!(f, "{{{}}}", s),
1271 Range { min: s, max: Some(e) } => write!(f, "{{{}, {}}}", s, e),
1272 }
1273 }
1274 }
1275
1276 impl fmt::Display for CharClass {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result1277 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1278 try!(write!(f, "(?u:["));
1279 for range in self.iter() {
1280 if range.start == '-' || range.end == '-' {
1281 try!(write!(f, "-"));
1282 break;
1283 }
1284 }
1285 for range in self.iter() {
1286 let mut range = *range;
1287 if range.start == '-' {
1288 range.start = ((range.start as u8) + 1) as char;
1289 }
1290 if range.end == '-' {
1291 range.end = ((range.end as u8) - 1) as char;
1292 }
1293 if range.start > range.end {
1294 continue;
1295 }
1296 try!(write!(f, "{}", range));
1297 }
1298 try!(write!(f, "])"));
1299 Ok(())
1300 }
1301 }
1302
1303 impl fmt::Display for ClassRange {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result1304 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1305 write!(f, "{}-{}", quote_char(self.start), quote_char(self.end))
1306 }
1307 }
1308
1309 impl fmt::Display for ByteClass {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result1310 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1311 try!(write!(f, "(?-u:["));
1312 for range in self.iter() {
1313 if range.start == b'-' || range.end == b'-' {
1314 try!(write!(f, "-"));
1315 break;
1316 }
1317 }
1318 for range in self.iter() {
1319 let mut range = *range;
1320 if range.start == b'-' {
1321 range.start += 1;
1322 }
1323 if range.end == b'-' {
1324 range.start -= 1;
1325 }
1326 if range.start > range.end {
1327 continue;
1328 }
1329 try!(write!(f, "{}", range));
1330 }
1331 try!(write!(f, "])"));
1332 Ok(())
1333 }
1334 }
1335
1336 impl fmt::Display for ByteRange {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result1337 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1338 write!(f, "{}-{}", quote_byte(self.start), quote_byte(self.end))
1339 }
1340 }
1341
1342 /// An alias for computations that can return a `Error`.
1343 pub type Result<T> = ::std::result::Result<T, Error>;
1344
1345 /// A parse error.
1346 ///
1347 /// This includes details about the specific type of error and a rough
1348 /// approximation of where it occurred.
1349 #[derive(Clone, Debug, PartialEq)]
1350 pub struct Error {
1351 pos: usize,
1352 surround: String,
1353 kind: ErrorKind,
1354 }
1355
1356 /// The specific type of parse error that can occur.
1357 #[derive(Clone, Debug, PartialEq)]
1358 pub enum ErrorKind {
1359 /// A negation symbol is used twice in flag settings.
1360 /// e.g., `(?-i-s)`.
1361 DoubleFlagNegation,
1362 /// The same capture name was used more than once.
1363 /// e.g., `(?P<a>.)(?P<a>.)`.
1364 DuplicateCaptureName(String),
1365 /// An alternate is empty. e.g., `(|a)`.
1366 EmptyAlternate,
1367 /// A capture group name is empty. e.g., `(?P<>a)`.
1368 EmptyCaptureName,
1369 /// A negation symbol was not proceded by any flags. e.g., `(?i-)`.
1370 EmptyFlagNegation,
1371 /// A group is empty. e.g., `()`.
1372 EmptyGroup,
1373 /// An invalid number was used in a counted repetition. e.g., `a{b}`.
1374 InvalidBase10(String),
1375 /// An invalid hexadecimal number was used in an escape sequence.
1376 /// e.g., `\xAG`.
1377 InvalidBase16(String),
1378 /// An invalid capture name was used. e.g., `(?P<0a>b)`.
1379 InvalidCaptureName(String),
1380 /// An invalid class range was givien. Specifically, when the start of the
1381 /// range is greater than the end. e.g., `[z-a]`.
1382 InvalidClassRange {
1383 /// The first character specified in the range.
1384 start: char,
1385 /// The second character specified in the range.
1386 end: char,
1387 },
1388 /// An escape sequence was used in a character class where it is not
1389 /// allowed. e.g., `[a-\pN]` or `[\A]`.
1390 InvalidClassEscape(Expr),
1391 /// An invalid counted repetition min/max was given. e.g., `a{2,1}`.
1392 InvalidRepeatRange {
1393 /// The first number specified in the repetition.
1394 min: u32,
1395 /// The second number specified in the repetition.
1396 max: u32,
1397 },
1398 /// An invalid Unicode scalar value was used in a long hexadecimal
1399 /// sequence. e.g., `\x{D800}`.
1400 InvalidScalarValue(u32),
1401 /// An empty counted repetition operator. e.g., `a{}`.
1402 MissingBase10,
1403 /// A repetition operator was not applied to an expression. e.g., `*`.
1404 RepeaterExpectsExpr,
1405 /// A repetition operator was applied to an expression that cannot be
1406 /// repeated. e.g., `a+*` or `a|*`.
1407 RepeaterUnexpectedExpr(Expr),
1408 /// A capture group name that is never closed. e.g., `(?P<a`.
1409 UnclosedCaptureName(String),
1410 /// An unclosed hexadecimal literal. e.g., `\x{a`.
1411 UnclosedHex,
1412 /// An unclosed parenthesis. e.g., `(a`.
1413 UnclosedParen,
1414 /// An unclosed counted repetition operator. e.g., `a{2`.
1415 UnclosedRepeat,
1416 /// An unclosed named Unicode class. e.g., `\p{Yi`.
1417 UnclosedUnicodeName,
1418 /// Saw end of regex before class was closed. e.g., `[a`.
1419 UnexpectedClassEof,
1420 /// Saw end of regex before escape sequence was closed. e.g., `\`.
1421 UnexpectedEscapeEof,
1422 /// Saw end of regex before flags were closed. e.g., `(?i`.
1423 UnexpectedFlagEof,
1424 /// Saw end of regex before two hexadecimal digits were seen. e.g., `\xA`.
1425 UnexpectedTwoDigitHexEof,
1426 /// Unopened parenthesis. e.g., `)`.
1427 UnopenedParen,
1428 /// Unrecognized escape sequence. e.g., `\q`.
1429 UnrecognizedEscape(char),
1430 /// Unrecognized flag. e.g., `(?a)`.
1431 UnrecognizedFlag(char),
1432 /// Unrecognized named Unicode class. e.g., `\p{Foo}`.
1433 UnrecognizedUnicodeClass(String),
1434 /// Indicates that the regex uses too much nesting.
1435 ///
1436 /// (N.B. This error exists because traversing the Expr is recursive and
1437 /// an explicit heap allocated stack is not (yet?) used. Regardless, some
1438 /// sort of limit must be applied to avoid unbounded memory growth.
1439 StackExhausted,
1440 /// A disallowed flag was found (e.g., `u`).
1441 FlagNotAllowed(char),
1442 /// A Unicode class was used when the Unicode (`u`) flag was disabled.
1443 UnicodeNotAllowed,
1444 /// InvalidUtf8 indicates that the expression may match non-UTF-8 bytes.
1445 /// This never returned if the parser is permitted to allow expressions
1446 /// that match arbitrary bytes.
1447 InvalidUtf8,
1448 /// A character class was constructed such that it is empty.
1449 /// e.g., `[^\d\D]`.
1450 EmptyClass,
1451 /// Indicates that unsupported notation was used in a character class.
1452 ///
1453 /// The char in this error corresponds to the illegal character.
1454 ///
1455 /// The intent of this error is to carve a path to support set notation
1456 /// as described in UTS#18 RL1.3. We do this by rejecting regexes that
1457 /// would use the notation.
1458 ///
1459 /// The work around for end users is to escape the character included in
1460 /// this error message.
1461 UnsupportedClassChar(char),
1462 /// Hints that destructuring should not be exhaustive.
1463 ///
1464 /// This enum may grow additional variants, so this makes sure clients
1465 /// don't count on exhaustive matching. (Otherwise, adding a new variant
1466 /// could break existing code.)
1467 #[doc(hidden)]
1468 __Nonexhaustive,
1469 }
1470
1471 impl Error {
1472 /// Returns an approximate *character* offset at which the error occurred.
1473 ///
1474 /// The character offset may be equal to the number of characters in the
1475 /// string, in which case it should be interpreted as pointing to the end
1476 /// of the regex.
position(&self) -> usize1477 pub fn position(&self) -> usize {
1478 self.pos
1479 }
1480
1481 /// Returns the type of the regex parse error.
kind(&self) -> &ErrorKind1482 pub fn kind(&self) -> &ErrorKind {
1483 &self.kind
1484 }
1485 }
1486
1487 impl ErrorKind {
description(&self) -> &str1488 fn description(&self) -> &str {
1489 use ErrorKind::*;
1490 match *self {
1491 DoubleFlagNegation => "double flag negation",
1492 DuplicateCaptureName(_) => "duplicate capture name",
1493 EmptyAlternate => "empty alternate",
1494 EmptyCaptureName => "empty capture name",
1495 EmptyFlagNegation => "flag negation without any flags",
1496 EmptyGroup => "empty group (e.g., '()')",
1497 InvalidBase10(_) => "invalid base 10 number",
1498 InvalidBase16(_) => "invalid base 16 number",
1499 InvalidCaptureName(_) => "invalid capture name",
1500 InvalidClassRange{..} => "invalid character class range",
1501 InvalidClassEscape(_) => "invalid escape sequence in class",
1502 InvalidRepeatRange{..} => "invalid counted repetition range",
1503 InvalidScalarValue(_) => "invalid Unicode scalar value",
1504 MissingBase10 => "missing count in repetition operator",
1505 RepeaterExpectsExpr => "repetition operator missing expression",
1506 RepeaterUnexpectedExpr(_) => "expression cannot be repeated",
1507 UnclosedCaptureName(_) => "unclosed capture group name",
1508 UnclosedHex => "unclosed hexadecimal literal",
1509 UnclosedParen => "unclosed parenthesis",
1510 UnclosedRepeat => "unclosed counted repetition operator",
1511 UnclosedUnicodeName => "unclosed Unicode class literal",
1512 UnexpectedClassEof => "unexpected EOF in character class",
1513 UnexpectedEscapeEof => "unexpected EOF in escape sequence",
1514 UnexpectedFlagEof => "unexpected EOF in flags",
1515 UnexpectedTwoDigitHexEof => "unexpected EOF in hex literal",
1516 UnopenedParen => "unopened parenthesis",
1517 UnrecognizedEscape(_) => "unrecognized escape sequence",
1518 UnrecognizedFlag(_) => "unrecognized flag",
1519 UnrecognizedUnicodeClass(_) => "unrecognized Unicode class name",
1520 StackExhausted => "stack exhausted, too much nesting",
1521 FlagNotAllowed(_) => "flag not allowed",
1522 UnicodeNotAllowed => "Unicode features not allowed",
1523 InvalidUtf8 => "matching arbitrary bytes is not allowed",
1524 EmptyClass => "empty character class",
1525 UnsupportedClassChar(_) => "unsupported class notation",
1526 __Nonexhaustive => unreachable!(),
1527 }
1528 }
1529 }
1530
1531 impl ::std::error::Error for Error {
description(&self) -> &str1532 fn description(&self) -> &str {
1533 self.kind.description()
1534 }
1535 }
1536
1537 impl fmt::Display for Error {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result1538 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1539 if let ErrorKind::StackExhausted = self.kind {
1540 write!(f, "Error parsing regex: {}", self.kind)
1541 } else {
1542 write!(
1543 f, "Error parsing regex near '{}' at character offset {}: {}",
1544 self.surround, self.pos, self.kind)
1545 }
1546 }
1547 }
1548
1549 impl fmt::Display for ErrorKind {
fmt(&self, f: &mut fmt::Formatter) -> fmt::Result1550 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
1551 use ErrorKind::*;
1552 match *self {
1553 DoubleFlagNegation =>
1554 write!(f, "Only one negation symbol is allowed in flags."),
1555 DuplicateCaptureName(ref s) =>
1556 write!(f, "Capture name '{}' is used more than once.", s),
1557 EmptyAlternate =>
1558 write!(f, "Alternations cannot be empty."),
1559 EmptyCaptureName =>
1560 write!(f, "Capture names cannot be empty."),
1561 EmptyFlagNegation =>
1562 write!(f, "Flag negation requires setting at least one flag."),
1563 EmptyGroup =>
1564 write!(f, "Empty regex groups (e.g., '()') are not allowed."),
1565 InvalidBase10(ref s) =>
1566 write!(f, "Not a valid base 10 number: '{}'", s),
1567 InvalidBase16(ref s) =>
1568 write!(f, "Not a valid base 16 number: '{}'", s),
1569 InvalidCaptureName(ref s) =>
1570 write!(f, "Invalid capture name: '{}'. Capture names must \
1571 consist of [_a-zA-Z0-9] and are not allowed to \
1572 start with with a number.", s),
1573 InvalidClassRange { start, end } =>
1574 write!(f, "Invalid character class range '{}-{}'. \
1575 Character class ranges must start with the smaller \
1576 character, but {} > {}", start, end, start, end),
1577 InvalidClassEscape(ref e) =>
1578 write!(f, "Invalid escape sequence in character \
1579 class: '{}'.", e),
1580 InvalidRepeatRange { min, max } =>
1581 write!(f, "Invalid counted repetition range: {{{}, {}}}. \
1582 Counted repetition ranges must start with the \
1583 minimum, but {} > {}", min, max, min, max),
1584 InvalidScalarValue(c) =>
1585 write!(f, "Number does not correspond to a Unicode scalar \
1586 value: '{}'.", c),
1587 MissingBase10 =>
1588 write!(f, "Missing maximum in counted
1589 repetition operator."),
1590 RepeaterExpectsExpr =>
1591 write!(f, "Missing expression for repetition operator."),
1592 RepeaterUnexpectedExpr(ref e) =>
1593 write!(f, "Invalid application of repetition operator to: \
1594 '{}'.", e),
1595 UnclosedCaptureName(ref s) =>
1596 write!(f, "Capture name group for '{}' is not closed. \
1597 (Missing a '>'.)", s),
1598 UnclosedHex =>
1599 write!(f, "Unclosed hexadecimal literal (missing a '}}')."),
1600 UnclosedParen =>
1601 write!(f, "Unclosed parenthesis."),
1602 UnclosedRepeat =>
1603 write!(f, "Unclosed counted repetition (missing a '}}')."),
1604 UnclosedUnicodeName =>
1605 write!(f, "Unclosed Unicode literal (missing a '}}')."),
1606 UnexpectedClassEof =>
1607 write!(f, "Character class was not closed before the end of \
1608 the regex (missing a ']')."),
1609 UnexpectedEscapeEof =>
1610 write!(f, "Started an escape sequence that didn't finish \
1611 before the end of the regex."),
1612 UnexpectedFlagEof =>
1613 write!(f, "Inline flag settings was not closed before the end \
1614 of the regex (missing a ')' or ':')."),
1615 UnexpectedTwoDigitHexEof =>
1616 write!(f, "Unexpected end of two digit hexadecimal literal."),
1617 UnopenedParen =>
1618 write!(f, "Unopened parenthesis."),
1619 UnrecognizedEscape(c) =>
1620 write!(f, "Unrecognized escape sequence: '\\{}'.", c),
1621 UnrecognizedFlag(c) =>
1622 write!(f, "Unrecognized flag: '{}'. \
1623 (Allowed flags: i, m, s, U, u, x.)", c),
1624 UnrecognizedUnicodeClass(ref s) =>
1625 write!(f, "Unrecognized Unicode class name: '{}'.", s),
1626 StackExhausted =>
1627 write!(f, "Exhausted space required to parse regex with too \
1628 much nesting."),
1629 FlagNotAllowed(flag) =>
1630 write!(f, "Use of the flag '{}' is not allowed.", flag),
1631 UnicodeNotAllowed =>
1632 write!(f, "Unicode features are not allowed when the Unicode \
1633 (u) flag is not set."),
1634 InvalidUtf8 =>
1635 write!(f, "Matching arbitrary bytes is not allowed."),
1636 EmptyClass =>
1637 write!(f, "Empty character classes are not allowed."),
1638 UnsupportedClassChar(c) =>
1639 write!(f, "Use of unescaped '{}' in character class is \
1640 not allowed.", c),
1641 __Nonexhaustive => unreachable!(),
1642 }
1643 }
1644 }
1645
1646 /// The result of binary search on the simple case folding table.
1647 ///
1648 /// Note that this binary search is done on the "both" table, such that
1649 /// the index returned corresponds to the *first* location of `c1` in the
1650 /// table. The table can then be scanned linearly starting from the position
1651 /// returned to find other case mappings for `c1`.
simple_case_fold_both_result(c1: char) -> result::Result<usize, usize>1652 fn simple_case_fold_both_result(c1: char) -> result::Result<usize, usize> {
1653 let table = &case_folding::C_plus_S_both_table;
1654 let i = binary_search(table, |&(c2, _)| c1 <= c2);
1655 if i >= table.len() || table[i].0 != c1 {
1656 Err(i)
1657 } else {
1658 Ok(i)
1659 }
1660 }
1661
1662 /// Binary search to find first element such that `pred(T) == true`.
1663 ///
1664 /// Assumes that if `pred(xs[i]) == true` then `pred(xs[i+1]) == true`.
1665 ///
1666 /// If all elements yield `pred(T) == false`, then `xs.len()` is returned.
binary_search<T, F>(xs: &[T], mut pred: F) -> usize where F: FnMut(&T) -> bool1667 fn binary_search<T, F>(xs: &[T], mut pred: F) -> usize
1668 where F: FnMut(&T) -> bool {
1669 let (mut left, mut right) = (0, xs.len());
1670 while left < right {
1671 let mid = (left + right) / 2;
1672 if pred(&xs[mid]) {
1673 right = mid;
1674 } else {
1675 left = mid + 1;
1676 }
1677 }
1678 left
1679 }
1680
1681 /// Escapes all regular expression meta characters in `text`.
1682 ///
1683 /// The string returned may be safely used as a literal in a regular
1684 /// expression.
escape(text: &str) -> String1685 pub fn escape(text: &str) -> String {
1686 let mut quoted = String::with_capacity(text.len());
1687 for c in text.chars() {
1688 if parser::is_punct(c) {
1689 quoted.push('\\');
1690 }
1691 quoted.push(c);
1692 }
1693 quoted
1694 }
1695
quote_char(c: char) -> String1696 fn quote_char(c: char) -> String {
1697 let mut s = String::new();
1698 if parser::is_punct(c) {
1699 s.push('\\');
1700 }
1701 s.push(c);
1702 s
1703 }
1704
quote_byte(b: u8) -> String1705 fn quote_byte(b: u8) -> String {
1706 if parser::is_punct(b as char) || b == b'\'' || b == b'"' {
1707 quote_char(b as char)
1708 } else {
1709 let escaped: Vec<u8> = ascii::escape_default(b).collect();
1710 String::from_utf8(escaped).unwrap()
1711 }
1712 }
1713
inc_char(c: char) -> char1714 fn inc_char(c: char) -> char {
1715 match c {
1716 char::MAX => char::MAX,
1717 '\u{D7FF}' => '\u{E000}',
1718 c => char::from_u32(c as u32 + 1).unwrap(),
1719 }
1720 }
1721
dec_char(c: char) -> char1722 fn dec_char(c: char) -> char {
1723 match c {
1724 '\x00' => '\x00',
1725 '\u{E000}' => '\u{D7FF}',
1726 c => char::from_u32(c as u32 - 1).unwrap(),
1727 }
1728 }
1729
1730 /// Returns true if and only if `c` is a word character.
1731 #[doc(hidden)]
is_word_char(c: char) -> bool1732 pub fn is_word_char(c: char) -> bool {
1733 match c {
1734 '_' | '0' ... '9' | 'a' ... 'z' | 'A' ... 'Z' => true,
1735 _ => ::unicode::regex::PERLW.binary_search_by(|&(start, end)| {
1736 if c >= start && c <= end {
1737 Ordering::Equal
1738 } else if start > c {
1739 Ordering::Greater
1740 } else {
1741 Ordering::Less
1742 }
1743 }).is_ok(),
1744 }
1745 }
1746
1747 /// Returns true if and only if `c` is an ASCII word byte.
1748 #[doc(hidden)]
is_word_byte(b: u8) -> bool1749 pub fn is_word_byte(b: u8) -> bool {
1750 match b {
1751 b'_' | b'0' ... b'9' | b'a' ... b'z' | b'A' ... b'Z' => true,
1752 _ => false,
1753 }
1754 }
1755
1756 #[cfg(test)]
1757 mod properties;
1758
1759 #[cfg(test)]
1760 mod tests {
1761 use {CharClass, ClassRange, ByteClass, ByteRange, Expr};
1762
class(ranges: &[(char, char)]) -> CharClass1763 fn class(ranges: &[(char, char)]) -> CharClass {
1764 let ranges = ranges.iter().cloned()
1765 .map(|(c1, c2)| ClassRange::new(c1, c2)).collect();
1766 CharClass::new(ranges)
1767 }
1768
bclass(ranges: &[(u8, u8)]) -> ByteClass1769 fn bclass(ranges: &[(u8, u8)]) -> ByteClass {
1770 let ranges = ranges.iter().cloned()
1771 .map(|(c1, c2)| ByteRange::new(c1, c2)).collect();
1772 ByteClass::new(ranges)
1773 }
1774
e(re: &str) -> Expr1775 fn e(re: &str) -> Expr { Expr::parse(re).unwrap() }
1776
1777 #[test]
stack_exhaustion()1778 fn stack_exhaustion() {
1779 use std::iter::repeat;
1780
1781 let open: String = repeat('(').take(200).collect();
1782 let close: String = repeat(')').take(200).collect();
1783 assert!(Expr::parse(&format!("{}a{}", open, close)).is_ok());
1784
1785 let open: String = repeat('(').take(200 + 1).collect();
1786 let close: String = repeat(')').take(200 + 1).collect();
1787 assert!(Expr::parse(&format!("{}a{}", open, close)).is_err());
1788 }
1789
1790 #[test]
anchored_start()1791 fn anchored_start() {
1792 assert!(e("^a").is_anchored_start());
1793 assert!(e("(^a)").is_anchored_start());
1794 assert!(e("^a|^b").is_anchored_start());
1795 assert!(e("(^a)|(^b)").is_anchored_start());
1796 assert!(e("(^(a|b))").is_anchored_start());
1797
1798 assert!(!e("^a|b").is_anchored_start());
1799 assert!(!e("a|^b").is_anchored_start());
1800 }
1801
1802 #[test]
anchored_end()1803 fn anchored_end() {
1804 assert!(e("a$").is_anchored_end());
1805 assert!(e("(a$)").is_anchored_end());
1806 assert!(e("a$|b$").is_anchored_end());
1807 assert!(e("(a$)|(b$)").is_anchored_end());
1808 assert!(e("((a|b)$)").is_anchored_end());
1809
1810 assert!(!e("a$|b").is_anchored_end());
1811 assert!(!e("a|b$").is_anchored_end());
1812 }
1813
1814 #[test]
class_canon_no_change()1815 fn class_canon_no_change() {
1816 let cls = class(&[('a', 'c'), ('x', 'z')]);
1817 assert_eq!(cls.clone().canonicalize(), cls);
1818 }
1819
1820 #[test]
class_canon_unordered()1821 fn class_canon_unordered() {
1822 let cls = class(&[('x', 'z'), ('a', 'c')]);
1823 assert_eq!(cls.canonicalize(), class(&[
1824 ('a', 'c'), ('x', 'z'),
1825 ]));
1826 }
1827
1828 #[test]
class_canon_overlap()1829 fn class_canon_overlap() {
1830 let cls = class(&[('x', 'z'), ('w', 'y')]);
1831 assert_eq!(cls.canonicalize(), class(&[
1832 ('w', 'z'),
1833 ]));
1834 }
1835
1836 #[test]
class_canon_overlap_many()1837 fn class_canon_overlap_many() {
1838 let cls = class(&[
1839 ('c', 'f'), ('a', 'g'), ('d', 'j'), ('a', 'c'),
1840 ('m', 'p'), ('l', 's'),
1841 ]);
1842 assert_eq!(cls.clone().canonicalize(), class(&[
1843 ('a', 'j'), ('l', 's'),
1844 ]));
1845 }
1846
1847 #[test]
class_canon_overlap_boundary()1848 fn class_canon_overlap_boundary() {
1849 let cls = class(&[('x', 'z'), ('u', 'w')]);
1850 assert_eq!(cls.canonicalize(), class(&[
1851 ('u', 'z'),
1852 ]));
1853 }
1854
1855 #[test]
class_canon_extreme_edge_case()1856 fn class_canon_extreme_edge_case() {
1857 let cls = class(&[('\x00', '\u{10FFFF}'), ('\x00', '\u{10FFFF}')]);
1858 assert_eq!(cls.canonicalize(), class(&[
1859 ('\x00', '\u{10FFFF}'),
1860 ]));
1861 }
1862
1863 #[test]
class_canon_singles()1864 fn class_canon_singles() {
1865 let cls = class(&[('a', 'a'), ('b', 'b')]);
1866 assert_eq!(cls.canonicalize(), class(&[('a', 'b')]));
1867 }
1868
1869 #[test]
class_negate_single()1870 fn class_negate_single() {
1871 let cls = class(&[('a', 'a')]);
1872 assert_eq!(cls.negate(), class(&[
1873 ('\x00', '\x60'), ('\x62', '\u{10FFFF}'),
1874 ]));
1875 }
1876
1877 #[test]
class_negate_singles()1878 fn class_negate_singles() {
1879 let cls = class(&[('a', 'a'), ('b', 'b')]);
1880 assert_eq!(cls.negate(), class(&[
1881 ('\x00', '\x60'), ('\x63', '\u{10FFFF}'),
1882 ]));
1883 }
1884
1885 #[test]
class_negate_multiples()1886 fn class_negate_multiples() {
1887 let cls = class(&[('a', 'c'), ('x', 'z')]);
1888 assert_eq!(cls.negate(), class(&[
1889 ('\x00', '\x60'), ('\x64', '\x77'), ('\x7b', '\u{10FFFF}'),
1890 ]));
1891 }
1892
1893 #[test]
class_negate_min_scalar()1894 fn class_negate_min_scalar() {
1895 let cls = class(&[('\x00', 'a')]);
1896 assert_eq!(cls.negate(), class(&[
1897 ('\x62', '\u{10FFFF}'),
1898 ]));
1899 }
1900
1901 #[test]
class_negate_max_scalar()1902 fn class_negate_max_scalar() {
1903 let cls = class(&[('a', '\u{10FFFF}')]);
1904 assert_eq!(cls.negate(), class(&[
1905 ('\x00', '\x60'),
1906 ]));
1907 }
1908
1909 #[test]
class_negate_everything()1910 fn class_negate_everything() {
1911 let cls = class(&[('\x00', '\u{10FFFF}')]);
1912 assert_eq!(cls.negate(), class(&[]));
1913 }
1914
1915 #[test]
class_negate_everything_sans_one()1916 fn class_negate_everything_sans_one() {
1917 let cls = class(&[
1918 ('\x00', '\u{10FFFD}'), ('\u{10FFFF}', '\u{10FFFF}')
1919 ]);
1920 assert_eq!(cls.negate(), class(&[
1921 ('\u{10FFFE}', '\u{10FFFE}'),
1922 ]));
1923 }
1924
1925 #[test]
class_negate_surrogates_min()1926 fn class_negate_surrogates_min() {
1927 let cls = class(&[('\x00', '\u{D7FF}')]);
1928 assert_eq!(cls.negate(), class(&[
1929 ('\u{E000}', '\u{10FFFF}'),
1930 ]));
1931 }
1932
1933 #[test]
class_negate_surrogates_min_edge()1934 fn class_negate_surrogates_min_edge() {
1935 let cls = class(&[('\x00', '\u{D7FE}')]);
1936 assert_eq!(cls.negate(), class(&[
1937 ('\u{D7FF}', '\u{10FFFF}'),
1938 ]));
1939 }
1940
1941 #[test]
class_negate_surrogates_max()1942 fn class_negate_surrogates_max() {
1943 let cls = class(&[('\u{E000}', '\u{10FFFF}')]);
1944 assert_eq!(cls.negate(), class(&[
1945 ('\x00', '\u{D7FF}'),
1946 ]));
1947 }
1948
1949 #[test]
class_negate_surrogates_max_edge()1950 fn class_negate_surrogates_max_edge() {
1951 let cls = class(&[('\u{E001}', '\u{10FFFF}')]);
1952 assert_eq!(cls.negate(), class(&[
1953 ('\x00', '\u{E000}'),
1954 ]));
1955 }
1956
1957 #[test]
class_intersection_empty()1958 fn class_intersection_empty() {
1959 let cls1 = class(&[]);
1960 let cls2 = class(&[('a', 'a')]);
1961 assert_intersection(cls1, cls2, class(&[]));
1962 }
1963
1964 #[test]
class_intersection_single_equal()1965 fn class_intersection_single_equal() {
1966 let cls1 = class(&[('a', 'a')]);
1967 let cls2 = class(&[('a', 'a')]);
1968 assert_intersection(cls1, cls2, class(&[('a', 'a')]));
1969 }
1970
1971 #[test]
class_intersection_single_unequal()1972 fn class_intersection_single_unequal() {
1973 let cls1 = class(&[('a', 'a')]);
1974 let cls2 = class(&[('b', 'b')]);
1975 assert_intersection(cls1, cls2, class(&[]));
1976 }
1977
1978 #[test]
class_intersection_single_in_other()1979 fn class_intersection_single_in_other() {
1980 let cls1 = class(&[('a', 'a')]);
1981 let cls2 = class(&[('a', 'c')]);
1982 assert_intersection(cls1, cls2, class(&[('a', 'a')]));
1983 }
1984
1985 #[test]
class_intersection_range_in_other()1986 fn class_intersection_range_in_other() {
1987 let cls1 = class(&[('a', 'b')]);
1988 let cls2 = class(&[('a', 'c')]);
1989 assert_intersection(cls1, cls2, class(&[('a', 'b')]));
1990 }
1991
1992 #[test]
class_intersection_range_intersection()1993 fn class_intersection_range_intersection() {
1994 let cls1 = class(&[('a', 'b')]);
1995 let cls2 = class(&[('b', 'c')]);
1996 assert_intersection(cls1, cls2, class(&[('b', 'b')]));
1997 }
1998
1999 #[test]
class_intersection_only_adjacent()2000 fn class_intersection_only_adjacent() {
2001 let cls1 = class(&[('a', 'b')]);
2002 let cls2 = class(&[('c', 'd')]);
2003 assert_intersection(cls1, cls2, class(&[]));
2004 }
2005
2006 #[test]
class_intersection_range_subset()2007 fn class_intersection_range_subset() {
2008 let cls1 = class(&[('b', 'c')]);
2009 let cls2 = class(&[('a', 'd')]);
2010 assert_intersection(cls1, cls2, class(&[('b', 'c')]));
2011 }
2012
2013 #[test]
class_intersection_many_ranges_in_one_big()2014 fn class_intersection_many_ranges_in_one_big() {
2015 let cls1 = class(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
2016 let cls2 = class(&[('a', 'h')]);
2017 assert_intersection(cls1, cls2, class(&[
2018 ('a', 'b'), ('d', 'e'), ('g', 'h')
2019 ]));
2020 }
2021
2022 #[test]
class_intersection_many_ranges_same()2023 fn class_intersection_many_ranges_same() {
2024 let cls1 = class(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
2025 let cls2 = class(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
2026 assert_intersection(cls1, cls2, class(&[
2027 ('a', 'b'), ('d', 'e'), ('g', 'h')
2028 ]));
2029 }
2030
2031 #[test]
class_intersection_multiple_non_intersecting()2032 fn class_intersection_multiple_non_intersecting() {
2033 let cls1 = class(&[('a', 'b'), ('g', 'h')]);
2034 let cls2 = class(&[('d', 'e'), ('k', 'l')]);
2035 assert_intersection(cls1, cls2, class(&[]));
2036 }
2037
2038 #[test]
class_intersection_non_intersecting_then_intersecting()2039 fn class_intersection_non_intersecting_then_intersecting() {
2040 let cls1 = class(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
2041 let cls2 = class(&[('h', 'h')]);
2042 assert_intersection(cls1, cls2, class(&[('h', 'h')]));
2043 }
2044
2045 #[test]
class_intersection_adjacent_alternating()2046 fn class_intersection_adjacent_alternating() {
2047 let cls1 = class(&[('a', 'b'), ('e', 'f'), ('i', 'j')]);
2048 let cls2 = class(&[('c', 'd'), ('g', 'h'), ('k', 'l')]);
2049 assert_intersection(cls1, cls2, class(&[]));
2050 }
2051
2052 #[test]
class_intersection_overlapping_alternating()2053 fn class_intersection_overlapping_alternating() {
2054 let cls1 = class(&[('a', 'b'), ('c', 'd'), ('e', 'f')]);
2055 let cls2 = class(&[('b', 'c'), ('d', 'e'), ('f', 'g')]);
2056 assert_intersection(cls1, cls2, class(&[('b', 'f')]));
2057 }
2058
2059 #[test]
class_canon_overlap_many_case_fold()2060 fn class_canon_overlap_many_case_fold() {
2061 let cls = class(&[
2062 ('C', 'F'), ('A', 'G'), ('D', 'J'), ('A', 'C'),
2063 ('M', 'P'), ('L', 'S'), ('c', 'f'),
2064 ]);
2065 assert_eq!(cls.case_fold(), class(&[
2066 ('A', 'J'), ('L', 'S'),
2067 ('a', 'j'), ('l', 's'),
2068 ('\u{17F}', '\u{17F}'),
2069 ]));
2070
2071 let cls = bclass(&[
2072 (b'C', b'F'), (b'A', b'G'), (b'D', b'J'), (b'A', b'C'),
2073 (b'M', b'P'), (b'L', b'S'), (b'c', b'f'),
2074 ]);
2075 assert_eq!(cls.case_fold(), bclass(&[
2076 (b'A', b'J'), (b'L', b'S'),
2077 (b'a', b'j'), (b'l', b's'),
2078 ]));
2079 }
2080
2081 #[test]
class_fold_az()2082 fn class_fold_az() {
2083 let cls = class(&[('A', 'Z')]);
2084 assert_eq!(cls.case_fold(), class(&[
2085 ('A', 'Z'), ('a', 'z'),
2086 ('\u{17F}', '\u{17F}'),
2087 ('\u{212A}', '\u{212A}'),
2088 ]));
2089 let cls = class(&[('a', 'z')]);
2090 assert_eq!(cls.case_fold(), class(&[
2091 ('A', 'Z'), ('a', 'z'),
2092 ('\u{17F}', '\u{17F}'),
2093 ('\u{212A}', '\u{212A}'),
2094 ]));
2095
2096 let cls = bclass(&[(b'A', b'Z')]);
2097 assert_eq!(cls.case_fold(), bclass(&[
2098 (b'A', b'Z'), (b'a', b'z'),
2099 ]));
2100 let cls = bclass(&[(b'a', b'z')]);
2101 assert_eq!(cls.case_fold(), bclass(&[
2102 (b'A', b'Z'), (b'a', b'z'),
2103 ]));
2104 }
2105
2106 #[test]
class_fold_a_underscore()2107 fn class_fold_a_underscore() {
2108 let cls = class(&[('A', 'A'), ('_', '_')]);
2109 assert_eq!(cls.clone().canonicalize(), class(&[
2110 ('A', 'A'), ('_', '_'),
2111 ]));
2112 assert_eq!(cls.case_fold(), class(&[
2113 ('A', 'A'), ('_', '_'), ('a', 'a'),
2114 ]));
2115
2116 let cls = bclass(&[(b'A', b'A'), (b'_', b'_')]);
2117 assert_eq!(cls.clone().canonicalize(), bclass(&[
2118 (b'A', b'A'), (b'_', b'_'),
2119 ]));
2120 assert_eq!(cls.case_fold(), bclass(&[
2121 (b'A', b'A'), (b'_', b'_'), (b'a', b'a'),
2122 ]));
2123 }
2124
2125 #[test]
class_fold_a_equals()2126 fn class_fold_a_equals() {
2127 let cls = class(&[('A', 'A'), ('=', '=')]);
2128 assert_eq!(cls.clone().canonicalize(), class(&[
2129 ('=', '='), ('A', 'A'),
2130 ]));
2131 assert_eq!(cls.case_fold(), class(&[
2132 ('=', '='), ('A', 'A'), ('a', 'a'),
2133 ]));
2134
2135 let cls = bclass(&[(b'A', b'A'), (b'=', b'=')]);
2136 assert_eq!(cls.clone().canonicalize(), bclass(&[
2137 (b'=', b'='), (b'A', b'A'),
2138 ]));
2139 assert_eq!(cls.case_fold(), bclass(&[
2140 (b'=', b'='), (b'A', b'A'), (b'a', b'a'),
2141 ]));
2142 }
2143
2144 #[test]
class_fold_no_folding_needed()2145 fn class_fold_no_folding_needed() {
2146 let cls = class(&[('\x00', '\x10')]);
2147 assert_eq!(cls.case_fold(), class(&[
2148 ('\x00', '\x10'),
2149 ]));
2150
2151 let cls = bclass(&[(b'\x00', b'\x10')]);
2152 assert_eq!(cls.case_fold(), bclass(&[
2153 (b'\x00', b'\x10'),
2154 ]));
2155 }
2156
2157 #[test]
class_fold_negated()2158 fn class_fold_negated() {
2159 let cls = class(&[('x', 'x')]);
2160 assert_eq!(cls.clone().case_fold(), class(&[
2161 ('X', 'X'), ('x', 'x'),
2162 ]));
2163 assert_eq!(cls.case_fold().negate(), class(&[
2164 ('\x00', 'W'), ('Y', 'w'), ('y', '\u{10FFFF}'),
2165 ]));
2166
2167 let cls = bclass(&[(b'x', b'x')]);
2168 assert_eq!(cls.clone().case_fold(), bclass(&[
2169 (b'X', b'X'), (b'x', b'x'),
2170 ]));
2171 assert_eq!(cls.case_fold().negate(), bclass(&[
2172 (b'\x00', b'W'), (b'Y', b'w'), (b'y', b'\xff'),
2173 ]));
2174 }
2175
2176 #[test]
class_fold_single_to_multiple()2177 fn class_fold_single_to_multiple() {
2178 let cls = class(&[('k', 'k')]);
2179 assert_eq!(cls.case_fold(), class(&[
2180 ('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}'),
2181 ]));
2182
2183 let cls = bclass(&[(b'k', b'k')]);
2184 assert_eq!(cls.case_fold(), bclass(&[
2185 (b'K', b'K'), (b'k', b'k'),
2186 ]));
2187 }
2188
2189 #[test]
class_fold_at()2190 fn class_fold_at() {
2191 let cls = class(&[('@', '@')]);
2192 assert_eq!(cls.clone().canonicalize(), class(&[('@', '@')]));
2193 assert_eq!(cls.case_fold(), class(&[('@', '@')]));
2194
2195 let cls = bclass(&[(b'@', b'@')]);
2196 assert_eq!(cls.clone().canonicalize(), bclass(&[(b'@', b'@')]));
2197 assert_eq!(cls.case_fold(), bclass(&[(b'@', b'@')]));
2198 }
2199
2200 #[test]
roundtrip_class_hypen()2201 fn roundtrip_class_hypen() {
2202 let expr = e("[-./]");
2203 assert_eq!("(?u:[-\\.-/])", expr.to_string());
2204
2205 let expr = e("(?-u)[-./]");
2206 assert_eq!("(?-u:[-\\.-/])", expr.to_string());
2207 }
2208
assert_intersection(cls1: CharClass, cls2: CharClass, expected: CharClass)2209 fn assert_intersection(cls1: CharClass, cls2: CharClass, expected: CharClass) {
2210 // intersection operation should be commutative
2211 assert_eq!(cls1.intersection(&cls2), expected);
2212 assert_eq!(cls2.intersection(&cls1), expected);
2213 }
2214 }
2215