1 // Copyright 2014-2015 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
4 //
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
10
11 use std::cmp::{max, min};
12 use std::u8;
13
14 use unicode::regex::UNICODE_CLASSES;
15
16 use {
17 Expr, Repeater, CharClass, ClassRange,
18 CaptureIndex, CaptureName,
19 Error, ErrorKind, Result,
20 };
21
22 /// Parser state.
23 ///
24 /// Keeps the entire input in memory and maintains a cursor (char offset).
25 ///
26 /// It also keeps an expression stack, which is responsible for managing
27 /// grouped expressions and flag state.
28 #[derive(Debug)]
29 pub struct Parser {
30 chars: Vec<char>,
31 chari: usize,
32 stack: Vec<Build>,
33 caps: usize,
34 names: Vec<String>, // to check for duplicates
35 flags: Flags,
36 }
37
38 /// Flag state used in the parser.
39 #[derive(Clone, Copy, Debug)]
40 pub struct Flags {
41 /// i
42 pub casei: bool,
43 /// m
44 pub multi: bool,
45 /// s
46 pub dotnl: bool,
47 /// U
48 pub swap_greed: bool,
49 /// x
50 pub ignore_space: bool,
51 /// u
52 pub unicode: bool,
53 /// Not actually a flag, but when disabled, every regex that may not match
54 /// UTF-8 exclusively will cause the parser to return an error.
55 pub allow_bytes: bool,
56 }
57
58 impl Default for Flags {
default() -> Self59 fn default() -> Self {
60 Flags {
61 casei: false,
62 multi: false,
63 dotnl: false,
64 swap_greed: false,
65 ignore_space: false,
66 unicode: true,
67 allow_bytes: false,
68 }
69 }
70 }
71
72 /// An ephemeral type for representing the expression stack.
73 ///
74 /// Everything on the stack is either a regular expression or a marker
75 /// indicating the opening of a group (possibly non-capturing). The opening
76 /// of a group copies the current flag state, which is reset on the parser
77 /// state once the group closes.
78 #[derive(Debug)]
79 enum Build {
80 Expr(Expr),
81 LeftParen {
82 i: CaptureIndex,
83 name: CaptureName,
84 chari: usize,
85 old_flags: Flags,
86 },
87 }
88
89 /// A type for representing the elements of a bracket stack used for parsing
90 /// character classes.
91 ///
92 /// This is for parsing nested character classes without recursion.
93 #[derive(Debug)]
94 enum Bracket {
95 /// The opening of a character class (possibly negated)
96 LeftBracket {
97 negated: bool,
98 },
99 /// A set of characters within a character class, e.g., `a-z`
100 Set(CharClass),
101 /// An intersection operator (`&&`)
102 Intersection,
103 }
104
105 // Primary expression parsing routines.
106 impl Parser {
parse(s: &str, flags: Flags) -> Result<Expr>107 pub fn parse(s: &str, flags: Flags) -> Result<Expr> {
108 Parser {
109 chars: s.chars().collect(),
110 chari: 0,
111 stack: vec![],
112 caps: 0,
113 names: vec![],
114 flags: flags,
115 }.parse_expr()
116 }
117
118 // Top-level expression parser.
119 //
120 // Starts at the beginning of the input and consumes until either the end
121 // of input or an error.
parse_expr(mut self) -> Result<Expr>122 fn parse_expr(mut self) -> Result<Expr> {
123 loop {
124 self.ignore_space();
125 if self.eof() {
126 break;
127 }
128 let build_expr = match self.cur() {
129 '\\' => try!(self.parse_escape()),
130 '|' => { let e = try!(self.alternate()); self.bump(); e }
131 '?' => try!(self.parse_simple_repeat(Repeater::ZeroOrOne)),
132 '*' => try!(self.parse_simple_repeat(Repeater::ZeroOrMore)),
133 '+' => try!(self.parse_simple_repeat(Repeater::OneOrMore)),
134 '{' => try!(self.parse_counted_repeat()),
135 '[' => try!(self.parse_class()),
136 '^' => {
137 if self.flags.multi {
138 self.parse_one(Expr::StartLine)
139 } else {
140 self.parse_one(Expr::StartText)
141 }
142 }
143 '$' => {
144 if self.flags.multi {
145 self.parse_one(Expr::EndLine)
146 } else {
147 self.parse_one(Expr::EndText)
148 }
149 }
150 '.' => {
151 if self.flags.dotnl {
152 if self.flags.unicode {
153 self.parse_one(Expr::AnyChar)
154 } else {
155 if !self.flags.allow_bytes {
156 return Err(self.err(ErrorKind::InvalidUtf8));
157 }
158 self.parse_one(Expr::AnyByte)
159 }
160 } else {
161 if self.flags.unicode {
162 self.parse_one(Expr::AnyCharNoNL)
163 } else {
164 if !self.flags.allow_bytes {
165 return Err(self.err(ErrorKind::InvalidUtf8));
166 }
167 self.parse_one(Expr::AnyByteNoNL)
168 }
169 }
170 }
171 '(' => try!(self.parse_group()),
172 ')' => {
173 let (old_flags, e) = try!(self.close_paren());
174 self.bump();
175 self.flags = old_flags;
176 e
177 }
178 _ => {
179 let c = self.bump();
180 try!(self.lit(c))
181 }
182 };
183 if !build_expr.is_empty() {
184 self.stack.push(build_expr);
185 }
186 }
187 self.finish_concat()
188 }
189
190 // Parses an escape sequence, e.g., \Ax
191 //
192 // Start: `\`
193 // End: `x`
parse_escape(&mut self) -> Result<Build>194 fn parse_escape(&mut self) -> Result<Build> {
195 self.bump();
196 if self.eof() {
197 return Err(self.err(ErrorKind::UnexpectedEscapeEof));
198 }
199 let c = self.cur();
200 if is_punct(c) || (self.flags.ignore_space && c.is_whitespace()) {
201 let c = self.bump();
202 return Ok(try!(self.lit(c)));
203 }
204 match c {
205 'a' => { self.bump(); Ok(try!(self.lit('\x07'))) }
206 'f' => { self.bump(); Ok(try!(self.lit('\x0C'))) }
207 't' => { self.bump(); Ok(try!(self.lit('\t'))) }
208 'n' => { self.bump(); Ok(try!(self.lit('\n'))) }
209 'r' => { self.bump(); Ok(try!(self.lit('\r'))) }
210 'v' => { self.bump(); Ok(try!(self.lit('\x0B'))) }
211 'A' => { self.bump(); Ok(Build::Expr(Expr::StartText)) }
212 'z' => { self.bump(); Ok(Build::Expr(Expr::EndText)) }
213 'b' => {
214 self.bump();
215 Ok(Build::Expr(if self.flags.unicode {
216 Expr::WordBoundary
217 } else {
218 Expr::WordBoundaryAscii
219 }))
220 }
221 'B' => {
222 self.bump();
223 Ok(Build::Expr(if self.flags.unicode {
224 Expr::NotWordBoundary
225 } else {
226 Expr::NotWordBoundaryAscii
227 }))
228 }
229 '0'|'1'|'2'|'3'|'4'|'5'|'6'|'7' => self.parse_octal(),
230 'x' => { self.bump(); self.parse_hex() }
231 'p'|'P' => {
232 self.bump();
233 self.parse_unicode_class(c == 'P')
234 .map(|cls| Build::Expr(Expr::Class(cls)))
235 }
236 'd'|'s'|'w'|'D'|'S'|'W' => {
237 self.bump();
238 Ok(Build::Expr(Expr::Class(self.parse_perl_class(c))))
239 }
240 c => Err(self.err(ErrorKind::UnrecognizedEscape(c))),
241 }
242 }
243
244 // Parses a group, e.g., `(abc)`.
245 //
246 // Start: `(`
247 // End: `a`
248 //
249 // A more interesting example, `(?P<foo>abc)`.
250 //
251 // Start: `(`
252 // End: `a`
parse_group(&mut self) -> Result<Build>253 fn parse_group(&mut self) -> Result<Build> {
254 let chari = self.chari;
255 let mut name: CaptureName = None;
256 self.bump();
257 self.ignore_space();
258 if self.bump_if("?P<") {
259 let n = try!(self.parse_group_name());
260 if self.names.iter().any(|n2| n2 == &n) {
261 return Err(self.err(ErrorKind::DuplicateCaptureName(n)));
262 }
263 self.names.push(n.clone());
264 name = Some(n);
265 } else if self.bump_if("?") {
266 // This can never be capturing. It's either setting flags for
267 // the current group, or it's opening a non-capturing group or
268 // it's opening a group with a specific set of flags (which is
269 // also non-capturing).
270 // Anything else is an error.
271 return self.parse_group_flags(chari);
272 }
273 self.caps = checkadd(self.caps, 1);
274 Ok(Build::LeftParen {
275 i: Some(self.caps),
276 name: name,
277 chari: chari,
278 old_flags: self.flags, // no flags changed if we're here
279 })
280 }
281
282 // Parses flags (inline or grouped), e.g., `(?s-i:abc)`.
283 //
284 // Start: `s`
285 // End: `a`
286 //
287 // Another example, `(?s-i)a`.
288 //
289 // Start: `s`
290 // End: `a`
parse_group_flags(&mut self, opening_chari: usize) -> Result<Build>291 fn parse_group_flags(&mut self, opening_chari: usize) -> Result<Build> {
292 let old_flags = self.flags;
293 let mut sign = true;
294 let mut saw_flag = false;
295 loop {
296 if self.eof() {
297 // e.g., (?i
298 return Err(self.err(ErrorKind::UnexpectedFlagEof));
299 }
300 match self.cur() {
301 'i' => { self.flags.casei = sign; saw_flag = true }
302 'm' => { self.flags.multi = sign; saw_flag = true }
303 's' => { self.flags.dotnl = sign; saw_flag = true }
304 'U' => { self.flags.swap_greed = sign; saw_flag = true }
305 'x' => { self.flags.ignore_space = sign; saw_flag = true }
306 'u' => { self.flags.unicode = sign; saw_flag = true }
307 '-' => {
308 if !sign {
309 // e.g., (?-i-s)
310 return Err(self.err(ErrorKind::DoubleFlagNegation));
311 }
312 sign = false;
313 saw_flag = false;
314 }
315 ')' => {
316 if !saw_flag {
317 // e.g., (?)
318 return Err(self.err(ErrorKind::EmptyFlagNegation));
319 }
320 // At this point, we're just changing the flags inside
321 // the current group, which means the old flags have
322 // been saved elsewhere. Our modifications in place are
323 // okey dokey!
324 //
325 // This particular flag expression only has a stateful
326 // impact on a regex's AST, so nothing gets explicitly
327 // added.
328 self.bump();
329 return Ok(Build::Expr(Expr::Empty));
330 }
331 ':' => {
332 if !sign && !saw_flag {
333 // e.g., (?i-:a)
334 // Note that if there's no negation, it's OK not
335 // to see flag, because you end up with a regular
336 // non-capturing group: `(?:a)`.
337 return Err(self.err(ErrorKind::EmptyFlagNegation));
338 }
339 self.bump();
340 return Ok(Build::LeftParen {
341 i: None,
342 name: None,
343 chari: opening_chari,
344 old_flags: old_flags,
345 });
346 }
347 // e.g., (?z:a)
348 c => return Err(self.err(ErrorKind::UnrecognizedFlag(c))),
349 }
350 self.bump();
351 }
352 }
353
354 // Parses a group name, e.g., `foo` in `(?P<foo>abc)`.
355 //
356 // Start: `f`
357 // End: `a`
parse_group_name(&mut self) -> Result<String>358 fn parse_group_name(&mut self) -> Result<String> {
359 let mut name = String::new();
360 while !self.eof() && !self.peek_is('>') {
361 name.push(self.bump());
362 }
363 if self.eof() {
364 // e.g., (?P<a
365 return Err(self.err(ErrorKind::UnclosedCaptureName(name)));
366 }
367 let all_valid = name.chars().all(is_valid_capture_char);
368 match name.chars().next() {
369 // e.g., (?P<>a)
370 None => Err(self.err(ErrorKind::EmptyCaptureName)),
371 Some(c) if (c >= '0' && c <= '9') || !all_valid => {
372 // e.g., (?P<a#>x)
373 // e.g., (?P<1a>x)
374 Err(self.err(ErrorKind::InvalidCaptureName(name)))
375 }
376 _ => {
377 self.bump(); // for `>`
378 Ok(name)
379 }
380 }
381 }
382
383 // Parses a counted repeition operator, e.g., `a{2,4}?z`.
384 //
385 // Start: `{`
386 // End: `z`
parse_counted_repeat(&mut self) -> Result<Build>387 fn parse_counted_repeat(&mut self) -> Result<Build> {
388 let e = try!(self.pop(ErrorKind::RepeaterExpectsExpr)); // e.g., ({5}
389 if !e.can_repeat() {
390 // e.g., a*{5}
391 return Err(self.err(ErrorKind::RepeaterUnexpectedExpr(e)));
392 }
393 self.bump();
394 self.ignore_space();
395 let min = try!(self.parse_decimal());
396 let mut max_opt = Some(min);
397 self.ignore_space();
398 if self.bump_if(',') {
399 self.ignore_space();
400 if self.peek_is('}') {
401 max_opt = None;
402 } else {
403 let max = try!(self.parse_decimal());
404 if min > max {
405 // e.g., a{2,1}
406 return Err(self.err(ErrorKind::InvalidRepeatRange {
407 min: min,
408 max: max,
409 }));
410 }
411 max_opt = Some(max);
412 }
413 }
414 self.ignore_space();
415 if !self.bump_if('}') {
416 Err(self.err(ErrorKind::UnclosedRepeat))
417 } else {
418 Ok(Build::Expr(Expr::Repeat {
419 e: Box::new(e),
420 r: Repeater::Range { min: min, max: max_opt },
421 greedy: !self.bump_if('?') ^ self.flags.swap_greed,
422 }))
423 }
424 }
425
426 // Parses a simple repetition operator, e.g., `a+?z`.
427 //
428 // Start: `+`
429 // End: `z`
430 //
431 // N.B. "simple" in this context means "not min/max repetition",
432 // e.g., `a{1,2}`.
parse_simple_repeat(&mut self, rep: Repeater) -> Result<Build>433 fn parse_simple_repeat(&mut self, rep: Repeater) -> Result<Build> {
434 let e = try!(self.pop(ErrorKind::RepeaterExpectsExpr)); // e.g., (*
435 if !e.can_repeat() {
436 // e.g., a**
437 return Err(self.err(ErrorKind::RepeaterUnexpectedExpr(e)));
438 }
439 self.bump();
440 Ok(Build::Expr(Expr::Repeat {
441 e: Box::new(e),
442 r: rep,
443 greedy: !self.bump_if('?') ^ self.flags.swap_greed,
444 }))
445 }
446
447 // Parses a decimal number until the given character, e.g., `a{123,456}`.
448 //
449 // Start: `1`
450 // End: `,` (where `until == ','`)
parse_decimal(&mut self) -> Result<u32>451 fn parse_decimal(&mut self) -> Result<u32> {
452 match self.bump_get(|c| is_ascii_word(c) || c.is_whitespace()) {
453 // e.g., a{}
454 None => Err(self.err(ErrorKind::MissingBase10)),
455 Some(n) => {
456 // e.g., a{xyz
457 // e.g., a{9999999999}
458 let n = n.trim();
459 u32::from_str_radix(n, 10)
460 .map_err(|_| self.err(ErrorKind::InvalidBase10(n.into())))
461 }
462 }
463 }
464
465 // Parses an octal number, up to 3 digits, e.g., `a\123b`
466 //
467 // Start: `1`
468 // End: `b`
parse_octal(&mut self) -> Result<Build>469 fn parse_octal(&mut self) -> Result<Build> {
470 use std::char;
471 let mut i = 0; // counter for limiting octal to 3 digits.
472 let n = self.bump_get(|c| { i += 1; i <= 3 && c >= '0' && c <= '7' })
473 .expect("octal string"); // guaranteed at least 1 digit
474 // I think both of the following unwraps are impossible to fail.
475 // We limit it to a three digit octal number, which maxes out at
476 // `0777` or `511` in decimal. Since all digits are in `0...7`, we'll
477 // always have a valid `u32` number. Moreover, since all numbers in
478 // the range `0...511` are valid Unicode scalar values, it will always
479 // be a valid `char`.
480 //
481 // Hence, we `unwrap` with reckless abandon.
482 let n = u32::from_str_radix(&n, 8).ok().expect("valid octal number");
483 if !self.flags.unicode {
484 return Ok(try!(self.u32_to_one_byte(n)));
485 }
486 let c = char::from_u32(n).expect("Unicode scalar value");
487 Ok(try!(self.lit(c)))
488 }
489
490 // Parses a hex number, e.g., `a\x5ab`.
491 //
492 // Start: `5`
493 // End: `b`
494 //
495 // And also, `a\x{2603}b`.
496 //
497 // Start: `{`
498 // End: `b`
parse_hex(&mut self) -> Result<Build>499 fn parse_hex(&mut self) -> Result<Build> {
500 self.ignore_space();
501 if self.bump_if('{') {
502 self.parse_hex_many_digits()
503 } else {
504 self.parse_hex_two_digits()
505 }
506 }
507
508 // Parses a many-digit hex number, e.g., `a\x{2603}b`.
509 //
510 // Start: `2`
511 // End: `b`
parse_hex_many_digits(&mut self) -> Result<Build>512 fn parse_hex_many_digits(&mut self) -> Result<Build> {
513 use std::char;
514
515 self.ignore_space();
516 let s = self.bump_get(is_ascii_word).unwrap_or("".into());
517 let n = try!(u32::from_str_radix(&s, 16)
518 .map_err(|_| self.err(ErrorKind::InvalidBase16(s))));
519 self.ignore_space();
520 if !self.bump_if('}') {
521 // e.g., a\x{d
522 return Err(self.err(ErrorKind::UnclosedHex));
523 }
524 if !self.flags.unicode {
525 return Ok(try!(self.u32_to_one_byte(n)));
526 }
527 let c = try!(char::from_u32(n)
528 .ok_or(self.err(ErrorKind::InvalidScalarValue(n))));
529 Ok(try!(self.lit(c)))
530 }
531
532 // Parses a two-digit hex number, e.g., `a\x5ab`.
533 //
534 // Start: `5`
535 // End: `b`
parse_hex_two_digits(&mut self) -> Result<Build>536 fn parse_hex_two_digits(&mut self) -> Result<Build> {
537 use std::char;
538
539 let mut i = 0;
540 let s = self.bump_get(|_| { i += 1; i <= 2 }).unwrap_or("".into());
541 if s.len() < 2 {
542 // e.g., a\x
543 // e.g., a\xf
544 return Err(self.err(ErrorKind::UnexpectedTwoDigitHexEof));
545 }
546 let n = try!(u32::from_str_radix(&s, 16)
547 .map_err(|_| self.err(ErrorKind::InvalidBase16(s))));
548 if !self.flags.unicode {
549 return Ok(try!(self.u32_to_one_byte(n)));
550 }
551 let c = char::from_u32(n).expect("Unicode scalar value");
552 Ok(try!(self.lit(c)))
553 }
554
555 // Parses a character class, e.g., `[^a-zA-Z0-9]+`.
556 //
557 // If the Unicode flag is enabled, the class is returned as a `CharClass`,
558 // otherwise it is converted to a `ByteClass`.
559 //
560 // Start: `[`
561 // End: `+`
parse_class(&mut self) -> Result<Build>562 fn parse_class(&mut self) -> Result<Build> {
563 let class = try!(self.parse_class_as_chars());
564 Ok(Build::Expr(if self.flags.unicode {
565 Expr::Class(class)
566 } else {
567 let byte_class = class.to_byte_class();
568
569 // If `class` was only non-empty due to multibyte characters, the
570 // corresponding byte class will now be empty.
571 //
572 // See https://github.com/rust-lang/regex/issues/303
573 if byte_class.is_empty() {
574 // e.g., (?-u)[^\x00-\xFF]
575 return Err(self.err(ErrorKind::EmptyClass));
576 }
577
578 Expr::ClassBytes(byte_class)
579 }))
580 }
581
582 // Parses a character class as a `CharClass`, e.g., `[^a-zA-Z0-9]+`.
583 //
584 // Start: `[`
585 // End: `+`
parse_class_as_chars(&mut self) -> Result<CharClass>586 fn parse_class_as_chars(&mut self) -> Result<CharClass> {
587 let mut bracket_stack = vec![];
588 bracket_stack.extend(self.parse_open_bracket());
589 loop {
590 self.ignore_space();
591 if self.eof() {
592 // e.g., [a
593 return Err(self.err(ErrorKind::UnexpectedClassEof));
594 }
595 match self.cur() {
596 '[' => {
597 if let Some(class) = self.maybe_parse_ascii() {
598 // e.g. `[:alnum:]`
599 bracket_stack.push(Bracket::Set(class));
600 } else {
601 // nested set, e.g. `[c-d]` in `[a-b[c-d]]`
602 bracket_stack.extend(self.parse_open_bracket());
603 }
604 }
605 ']' => {
606 self.bump();
607 let class = try!(self.close_bracket(&mut bracket_stack));
608 if bracket_stack.is_empty() {
609 // That was the outermost class, so stop now
610 return Ok(class);
611 }
612 bracket_stack.push(Bracket::Set(class));
613 }
614 '\\' => {
615 let class = try!(self.parse_class_escape());
616 bracket_stack.push(Bracket::Set(class));
617 }
618 '&' if self.peek_is("&&") => {
619 self.bump();
620 self.bump();
621 bracket_stack.push(Bracket::Intersection);
622 }
623 start => {
624 if !self.flags.unicode {
625 let _ = try!(self.codepoint_to_one_byte(start));
626 }
627 self.bump();
628 match start {
629 '~'|'-' => {
630 // Only report an error if we see ~~ or --.
631 if self.peek_is(start) {
632 return Err(self.err(
633 ErrorKind::UnsupportedClassChar(start)));
634 }
635 }
636 _ => {}
637 }
638 let class = try!(self.parse_class_range(start));
639 bracket_stack.push(Bracket::Set(class));
640 }
641 }
642 }
643 }
644
645 // Parses the start of a character class or a nested character class.
646 // That includes negation using `^` and unescaped `-` and `]` allowed at
647 // the start of the class.
648 //
649 // e.g., `[^a]` or `[-a]` or `[]a]`
650 //
651 // Start: `[`
652 // End: `a`
parse_open_bracket(&mut self) -> Vec<Bracket>653 fn parse_open_bracket(&mut self) -> Vec<Bracket> {
654 self.bump();
655 self.ignore_space();
656 let negated = self.bump_if('^');
657 self.ignore_space();
658
659 let mut class = CharClass::empty();
660 while self.bump_if('-') {
661 class.ranges.push(ClassRange::one('-'));
662 self.ignore_space();
663 }
664 if class.is_empty() {
665 if self.bump_if(']') {
666 class.ranges.push(ClassRange::one(']'));
667 self.ignore_space();
668 }
669 }
670
671 let bracket = Bracket::LeftBracket { negated: negated };
672 if class.is_empty() {
673 vec![bracket]
674 } else {
675 vec![bracket, Bracket::Set(class)]
676 }
677 }
678
679 // Parses an escape in a character class.
680 //
681 // This is a helper for `parse_class`. Instead of returning an `Ok` value,
682 // it either mutates the char class or returns an error.
683 //
684 // e.g., `\wx`
685 //
686 // Start: `\`
687 // End: `x`
parse_class_escape(&mut self) -> Result<CharClass>688 fn parse_class_escape(&mut self) -> Result<CharClass> {
689 match try!(self.parse_escape()) {
690 Build::Expr(Expr::Class(class)) => {
691 Ok(class)
692 }
693 Build::Expr(Expr::ClassBytes(class2)) => {
694 let mut class = CharClass::empty();
695 for byte_range in class2 {
696 let s = byte_range.start as char;
697 let e = byte_range.end as char;
698 class.ranges.push(ClassRange::new(s, e));
699 }
700 Ok(class)
701 }
702 Build::Expr(Expr::Literal { chars, .. }) => {
703 self.parse_class_range(chars[0])
704 }
705 Build::Expr(Expr::LiteralBytes { bytes, .. }) => {
706 let start = bytes[0] as char;
707 self.parse_class_range(start)
708 }
709 Build::Expr(e) => {
710 let err = ErrorKind::InvalidClassEscape(e);
711 Err(self.err(err))
712 }
713 // Because `parse_escape` can never return `LeftParen`.
714 _ => unreachable!(),
715 }
716 }
717
718 // Parses a single range in a character class.
719 //
720 // e.g., `[a-z]`
721 //
722 // Start: `-` (with start == `a`)
723 // End: `]`
parse_class_range(&mut self, start: char) -> Result<CharClass>724 fn parse_class_range(&mut self, start: char) -> Result<CharClass> {
725 self.ignore_space();
726 if !self.bump_if('-') {
727 // Not a range, so just return a singleton range.
728 return Ok(CharClass::new(vec![ClassRange::one(start)]));
729 }
730 self.ignore_space();
731 if self.eof() {
732 // e.g., [a-
733 return Err(self.err(ErrorKind::UnexpectedClassEof));
734 }
735 if self.peek_is(']') {
736 // This is the end of the class, so we permit use of `-` as a
737 // regular char (just like we do in the beginning).
738 return Ok(CharClass::new(vec![ClassRange::one(start), ClassRange::one('-')]));
739 }
740
741 // We have a real range. Just need to check to parse literal and
742 // make sure it's a valid range.
743 let end = match self.cur() {
744 '\\' => match try!(self.parse_escape()) {
745 Build::Expr(Expr::Literal { chars, .. }) => {
746 chars[0]
747 }
748 Build::Expr(Expr::LiteralBytes { bytes, .. }) => {
749 bytes[0] as char
750 }
751 Build::Expr(e) => {
752 return Err(self.err(ErrorKind::InvalidClassEscape(e)));
753 }
754 // Because `parse_escape` can never return `LeftParen`.
755 _ => unreachable!(),
756 },
757 c => {
758 self.bump();
759 if c == '-' {
760 return Err(self.err(ErrorKind::UnsupportedClassChar('-')));
761 }
762 if !self.flags.unicode {
763 let _ = try!(self.codepoint_to_one_byte(c));
764 }
765 c
766 }
767 };
768 if end < start {
769 // e.g., [z-a]
770 return Err(self.err(ErrorKind::InvalidClassRange {
771 start: start,
772 end: end,
773 }));
774 }
775 Ok(CharClass::new(vec![ClassRange::new(start, end)]))
776 }
777
778 // Parses an ASCII class, e.g., `[:alnum:]+`.
779 //
780 // Start: `[`
781 // End: `+`
782 //
783 // Also supports negation, e.g., `[:^alnum:]`.
784 //
785 // This parsing routine is distinct from the others in that it doesn't
786 // actually report any errors. Namely, if it fails, then the parser should
787 // fall back to parsing a regular class.
788 //
789 // This method will only make progress in the parser if it succeeds.
790 // Otherwise, the input remains where it started.
maybe_parse_ascii(&mut self) -> Option<CharClass>791 fn maybe_parse_ascii(&mut self) -> Option<CharClass> {
792 fn parse(p: &mut Parser) -> Option<CharClass> {
793 p.bump(); // the `[`
794 if !p.bump_if(':') { return None; }
795 let negate = p.bump_if('^');
796 let name = match p.bump_get(|c| c != ':') {
797 None => return None,
798 Some(name) => name,
799 };
800 if !p.bump_if(":]") { return None; }
801 ascii_class(&name).map(|cls| p.class_transform(negate, cls))
802 }
803 let start = self.chari;
804 match parse(self) {
805 None => { self.chari = start; None }
806 result => result,
807 }
808 }
809
810 // Parses a Uncode class name, e.g., `a\pLb`.
811 //
812 // Start: `L`
813 // End: `b`
814 //
815 // And also, `a\p{Greek}b`.
816 //
817 // Start: `{`
818 // End: `b`
819 //
820 // `negate` is true when the class name is used with `\P`.
parse_unicode_class(&mut self, neg: bool) -> Result<CharClass>821 fn parse_unicode_class(&mut self, neg: bool) -> Result<CharClass> {
822 self.ignore_space();
823 let name =
824 if self.bump_if('{') {
825 self.ignore_space();
826 let n = self.bump_get(is_ascii_word).unwrap_or("".into());
827 self.ignore_space();
828 if n.is_empty() || !self.bump_if('}') {
829 // e.g., \p{Greek
830 return Err(self.err(ErrorKind::UnclosedUnicodeName));
831 }
832 n
833 } else {
834 if self.eof() {
835 // e.g., \p
836 return Err(self.err(ErrorKind::UnexpectedEscapeEof));
837 }
838 self.bump().to_string()
839 };
840 match unicode_class(&name) {
841 None => Err(self.err(ErrorKind::UnrecognizedUnicodeClass(name))),
842 Some(cls) => {
843 if self.flags.unicode {
844 Ok(self.class_transform(neg, cls))
845 } else {
846 Err(self.err(ErrorKind::UnicodeNotAllowed))
847 }
848 }
849 }
850 }
851
852 // Parses a perl character class with Unicode support.
853 //
854 // `name` must be one of d, s, w, D, S, W. If not, this function panics.
855 //
856 // No parser state is changed.
parse_perl_class(&mut self, name: char) -> CharClass857 fn parse_perl_class(&mut self, name: char) -> CharClass {
858 use unicode::regex::{PERLD, PERLS, PERLW};
859 let (cls, negate) = match (self.flags.unicode, name) {
860 (true, 'd') => (raw_class_to_expr(PERLD), false),
861 (true, 'D') => (raw_class_to_expr(PERLD), true),
862 (true, 's') => (raw_class_to_expr(PERLS), false),
863 (true, 'S') => (raw_class_to_expr(PERLS), true),
864 (true, 'w') => (raw_class_to_expr(PERLW), false),
865 (true, 'W') => (raw_class_to_expr(PERLW), true),
866 (false, 'd') => (ascii_class("digit").unwrap(), false),
867 (false, 'D') => (ascii_class("digit").unwrap(), true),
868 (false, 's') => (ascii_class("space").unwrap(), false),
869 (false, 'S') => (ascii_class("space").unwrap(), true),
870 (false, 'w') => (ascii_class("word").unwrap(), false),
871 (false, 'W') => (ascii_class("word").unwrap(), true),
872 _ => unreachable!(),
873 };
874 self.class_transform(negate, cls)
875 }
876
877 // Always bump to the next input and return the given expression as a
878 // `Build`.
879 //
880 // This is mostly for convenience when the surrounding context implies
881 // that the next character corresponds to the given expression.
parse_one(&mut self, e: Expr) -> Build882 fn parse_one(&mut self, e: Expr) -> Build {
883 self.bump();
884 Build::Expr(e)
885 }
886 }
887
888 // Auxiliary helper methods.
889 impl Parser {
chars(&self) -> Chars890 fn chars(&self) -> Chars {
891 Chars::new(&self.chars[self.chari..])
892 }
893
ignore_space(&mut self)894 fn ignore_space(&mut self) {
895 if !self.flags.ignore_space {
896 return;
897 }
898 while !self.eof() {
899 match self.cur() {
900 '#' => {
901 self.bump();
902 while !self.eof() {
903 match self.bump() {
904 '\n' => break,
905 _ => continue,
906 }
907 }
908 },
909 c => if !c.is_whitespace() {
910 return;
911 } else {
912 self.bump();
913 }
914 }
915 }
916 }
917
bump(&mut self) -> char918 fn bump(&mut self) -> char {
919 let c = self.cur();
920 self.chari = checkadd(self.chari, self.chars().next_count());
921 c
922 }
923
cur(&self) -> char924 fn cur(&self) -> char { self.chars().next().unwrap() }
925
eof(&self) -> bool926 fn eof(&self) -> bool { self.chars().next().is_none() }
927
bump_get<B: Bumpable>(&mut self, s: B) -> Option<String>928 fn bump_get<B: Bumpable>(&mut self, s: B) -> Option<String> {
929 let n = s.match_end(self);
930 if n == 0 {
931 None
932 } else {
933 let end = checkadd(self.chari, n);
934 let s = self.chars[self.chari..end]
935 .iter().cloned().collect::<String>();
936 self.chari = end;
937 Some(s)
938 }
939 }
940
bump_if<B: Bumpable>(&mut self, s: B) -> bool941 fn bump_if<B: Bumpable>(&mut self, s: B) -> bool {
942 let n = s.match_end(self);
943 if n == 0 {
944 false
945 } else {
946 self.chari = checkadd(self.chari, n);
947 true
948 }
949 }
950
peek_is<B: Bumpable>(&self, s: B) -> bool951 fn peek_is<B: Bumpable>(&self, s: B) -> bool {
952 s.match_end(self) > 0
953 }
954
err(&self, kind: ErrorKind) -> Error955 fn err(&self, kind: ErrorKind) -> Error {
956 self.errat(self.chari, kind)
957 }
958
errat(&self, pos: usize, kind: ErrorKind) -> Error959 fn errat(&self, pos: usize, kind: ErrorKind) -> Error {
960 Error { pos: pos, surround: self.windowat(pos), kind: kind }
961 }
962
windowat(&self, pos: usize) -> String963 fn windowat(&self, pos: usize) -> String {
964 let s = max(5, pos) - 5;
965 let e = min(self.chars.len(), checkadd(pos, 5));
966 self.chars[s..e].iter().cloned().collect()
967 }
968
pop(&mut self, expected: ErrorKind) -> Result<Expr>969 fn pop(&mut self, expected: ErrorKind) -> Result<Expr> {
970 match self.stack.pop() {
971 None | Some(Build::LeftParen{..}) => Err(self.err(expected)),
972 Some(Build::Expr(e)) => Ok(e),
973 }
974 }
975
976 // If the current context calls for case insensitivity, then apply
977 // case folding. Similarly, if `negate` is `true`, then negate the
978 // class. (Negation always proceeds case folding.)
class_transform(&self, negate: bool, mut cls: CharClass) -> CharClass979 fn class_transform(&self, negate: bool, mut cls: CharClass) -> CharClass {
980 if self.flags.casei {
981 cls = cls.case_fold();
982 }
983 if negate {
984 cls = cls.negate();
985 }
986 cls
987 }
988
989 // Translates a Unicode codepoint into a single UTF-8 byte, and returns an
990 // error if it's not possible.
991 //
992 // This will panic if self.flags.unicode == true.
codepoint_to_one_byte(&self, c: char) -> Result<u8>993 fn codepoint_to_one_byte(&self, c: char) -> Result<u8> {
994 assert!(!self.flags.unicode);
995 let bytes = c.to_string().as_bytes().to_owned();
996 if bytes.len() > 1 {
997 return Err(self.err(ErrorKind::UnicodeNotAllowed));
998 }
999 Ok(bytes[0])
1000 }
1001
1002 // Creates a new byte literal from a single byte.
1003 //
1004 // If the given number can't fit into a single byte, then it is assumed
1005 // to be a Unicode codepoint and an error is returned.
1006 //
1007 // This should only be called when the bytes flag is enabled.
u32_to_one_byte(&self, b: u32) -> Result<Build>1008 fn u32_to_one_byte(&self, b: u32) -> Result<Build> {
1009 assert!(!self.flags.unicode);
1010 if b > u8::MAX as u32 {
1011 Err(self.err(ErrorKind::UnicodeNotAllowed))
1012 } else if !self.flags.allow_bytes && b > 0x7F {
1013 Err(self.err(ErrorKind::InvalidUtf8))
1014 } else {
1015 Ok(Build::Expr(Expr::LiteralBytes {
1016 bytes: vec![b as u8],
1017 casei: self.flags.casei,
1018 }))
1019 }
1020 }
1021
1022 // Creates a new literal expr from a Unicode codepoint.
1023 //
1024 // Creates a byte literal if the `bytes` flag is set.
lit(&self, c: char) -> Result<Build>1025 fn lit(&self, c: char) -> Result<Build> {
1026 Ok(Build::Expr(if self.flags.unicode {
1027 Expr::Literal {
1028 chars: vec![c],
1029 casei: self.flags.casei,
1030 }
1031 } else {
1032 Expr::LiteralBytes {
1033 bytes: vec![try!(self.codepoint_to_one_byte(c))],
1034 casei: self.flags.casei,
1035 }
1036 }))
1037 }
1038 }
1039
1040 struct Chars<'a> {
1041 chars: &'a [char],
1042 cur: usize,
1043 }
1044
1045 impl<'a> Iterator for Chars<'a> {
1046 type Item = char;
next(&mut self) -> Option<char>1047 fn next(&mut self) -> Option<char> {
1048 let x = self.c();
1049 self.advance();
1050 return x;
1051 }
1052 }
1053
1054 impl<'a> Chars<'a> {
new(chars: &[char]) -> Chars1055 fn new(chars: &[char]) -> Chars {
1056 Chars {
1057 chars: chars,
1058 cur: 0,
1059 }
1060 }
1061
c(&self) -> Option<char>1062 fn c(&self) -> Option<char> {
1063 self.chars.get(self.cur).map(|&c| c)
1064 }
1065
advance(&mut self)1066 fn advance(&mut self) {
1067 self.cur = checkadd(self.cur, 1);
1068 }
1069
next_count(&mut self) -> usize1070 fn next_count(&mut self) -> usize {
1071 self.next();
1072 self.cur
1073 }
1074 }
1075
1076 // Auxiliary methods for manipulating the expression stack.
1077 impl Parser {
1078 // Called whenever an alternate (`|`) is found.
1079 //
1080 // This pops the expression stack until:
1081 //
1082 // 1. The stack is empty. Pushes an alternation with one arm.
1083 // 2. An opening parenthesis is found. Leave the parenthesis
1084 // on the stack and push an alternation with one arm.
1085 // 3. An alternate (`|`) is found. Pop the existing alternation,
1086 // add an arm and push the modified alternation.
1087 //
1088 // Each "arm" in the above corresponds to the concatenation of all
1089 // popped expressions.
1090 //
1091 // In the first two cases, the stack is left in an invalid state
1092 // because an alternation with one arm is not allowed. This
1093 // particular state will be detected by `finish_concat` and an
1094 // error will be reported.
1095 //
1096 // In none of the cases is an empty arm allowed. If an empty arm
1097 // is found, an error is reported.
alternate(&mut self) -> Result<Build>1098 fn alternate(&mut self) -> Result<Build> {
1099 let mut concat = vec![];
1100 let alts = |es| Ok(Build::Expr(Expr::Alternate(es)));
1101 loop {
1102 match self.stack.pop() {
1103 None => {
1104 if concat.is_empty() {
1105 // e.g., |a
1106 return Err(self.err(ErrorKind::EmptyAlternate));
1107 }
1108 return alts(vec![rev_concat(concat)]);
1109 }
1110 Some(e @ Build::LeftParen{..}) => {
1111 if concat.is_empty() {
1112 // e.g., (|a)
1113 return Err(self.err(ErrorKind::EmptyAlternate));
1114 }
1115 self.stack.push(e);
1116 return alts(vec![rev_concat(concat)]);
1117 }
1118 Some(Build::Expr(Expr::Alternate(mut es))) => {
1119 if concat.is_empty() {
1120 // e.g., a||
1121 return Err(self.err(ErrorKind::EmptyAlternate));
1122 }
1123 es.push(rev_concat(concat));
1124 return alts(es);
1125 }
1126 Some(Build::Expr(e)) => { concat.push(e); }
1127 }
1128 }
1129 }
1130
1131 // Called whenever a closing parenthesis (`)`) is found.
1132 //
1133 // This pops the expression stack until:
1134 //
1135 // 1. The stack is empty. An error is reported because this
1136 // indicates an unopened parenthesis.
1137 // 2. An opening parenthesis is found. Pop the opening parenthesis
1138 // and push a `Group` expression.
1139 // 3. An alternate (`|`) is found. Pop the existing alternation
1140 // and an arm to it in place. Pop one more item from the stack.
1141 // If the stack was empty, then report an unopened parenthesis
1142 // error, otherwise assume it is an opening parenthesis and
1143 // push a `Group` expression with the popped alternation.
1144 // (We can assume this is an opening parenthesis because an
1145 // alternation either corresponds to the entire Regex or it
1146 // corresponds to an entire group. This is guaranteed by the
1147 // `alternate` method.)
1148 //
1149 // Each "arm" in the above corresponds to the concatenation of all
1150 // popped expressions.
1151 //
1152 // Empty arms nor empty groups are allowed.
close_paren(&mut self) -> Result<(Flags, Build)>1153 fn close_paren(&mut self) -> Result<(Flags, Build)> {
1154 let mut concat = vec![];
1155 loop {
1156 match self.stack.pop() {
1157 // e.g., )
1158 None => return Err(self.err(ErrorKind::UnopenedParen)),
1159 Some(Build::LeftParen { i, name, old_flags, .. }) => {
1160 if concat.is_empty() {
1161 // e.g., ()
1162 return Err(self.err(ErrorKind::EmptyGroup));
1163 }
1164 return Ok((old_flags, Build::Expr(Expr::Group {
1165 e: Box::new(rev_concat(concat)),
1166 i: i,
1167 name: name,
1168 })));
1169 }
1170 Some(Build::Expr(Expr::Alternate(mut es))) => {
1171 if concat.is_empty() {
1172 // e.g., (a|)
1173 return Err(self.err(ErrorKind::EmptyAlternate));
1174 }
1175 es.push(rev_concat(concat));
1176 match self.stack.pop() {
1177 // e.g., a|b)
1178 None => return Err(self.err(ErrorKind::UnopenedParen)),
1179 Some(Build::Expr(_)) => unreachable!(),
1180 Some(Build::LeftParen { i, name, old_flags, .. }) => {
1181 return Ok((old_flags, Build::Expr(Expr::Group {
1182 e: Box::new(Expr::Alternate(es)),
1183 i: i,
1184 name: name,
1185 })));
1186 }
1187 }
1188 }
1189 Some(Build::Expr(e)) => { concat.push(e); }
1190 }
1191 }
1192 }
1193
1194 // Called only when the parser reaches the end of input.
1195 //
1196 // This pops the expression stack until:
1197 //
1198 // 1. The stack is empty. Return concatenation of popped
1199 // expressions. This concatenation may be empty!
1200 // 2. An alternation is found. Pop the alternation and push
1201 // a new arm. Return the alternation as the entire Regex.
1202 // After this, the stack must be empty, or else there is
1203 // an unclosed paren.
1204 //
1205 // If an opening parenthesis is popped, then an error is
1206 // returned since it indicates an unclosed parenthesis.
finish_concat(&mut self) -> Result<Expr>1207 fn finish_concat(&mut self) -> Result<Expr> {
1208 let mut concat = vec![];
1209 loop {
1210 match self.stack.pop() {
1211 None => { return Ok(rev_concat(concat)); }
1212 Some(Build::LeftParen{ chari, ..}) => {
1213 // e.g., a(b
1214 return Err(self.errat(chari, ErrorKind::UnclosedParen));
1215 }
1216 Some(Build::Expr(Expr::Alternate(mut es))) => {
1217 if concat.is_empty() {
1218 // e.g., a|
1219 return Err(self.err(ErrorKind::EmptyAlternate));
1220 }
1221 es.push(rev_concat(concat));
1222 // Make sure there are no opening parens remaining.
1223 match self.stack.pop() {
1224 None => return Ok(Expr::Alternate(es)),
1225 Some(Build::LeftParen{ chari, ..}) => {
1226 // e.g., (a|b
1227 return Err(self.errat(
1228 chari, ErrorKind::UnclosedParen));
1229 }
1230 e => unreachable!("{:?}", e),
1231 }
1232 }
1233 Some(Build::Expr(e)) => { concat.push(e); }
1234 }
1235 }
1236 }
1237 }
1238
1239 // Methods for working with the bracket stack used for character class parsing.
1240 impl Parser {
1241
1242 // After parsing a closing bracket `]`, process elements of the bracket
1243 // stack until finding the corresponding opening bracket `[`, and return
1244 // the combined character class. E.g. with `[^b-f&&ab-c]`:
1245 //
1246 // 1. Adjacent sets are merged into a single union: `ab-c` -> `a-c`
1247 // 2. Unions separated by `&&` are intersected: `b-f` and `a-c` -> `b-c`
1248 // 3. Negation is applied if necessary: `b-c` -> negation of `b-c`
close_bracket(&self, stack: &mut Vec<Bracket>) -> Result<CharClass>1249 fn close_bracket(&self, stack: &mut Vec<Bracket>) -> Result<CharClass> {
1250 let mut union = CharClass::empty();
1251 let mut intersect = vec![];
1252 loop {
1253 match stack.pop() {
1254 Some(Bracket::Set(class)) => {
1255 union.ranges.extend(class);
1256 }
1257 Some(Bracket::Intersection) => {
1258 let class = self.class_union_transform(union);
1259 intersect.push(class);
1260 union = CharClass::empty();
1261 }
1262 Some(Bracket::LeftBracket { negated }) => {
1263 let mut class = self.class_union_transform(union);
1264 for c in intersect {
1265 class = class.intersection(&c);
1266 }
1267 // negate after combining all sets (`^` has lower precedence than `&&`)
1268 if negated {
1269 class = class.negate();
1270 }
1271 if class.is_empty() {
1272 // e.g., [^\d\D]
1273 return Err(self.err(ErrorKind::EmptyClass));
1274 }
1275 return Ok(class);
1276 }
1277 // The first element on the stack is a `LeftBracket`
1278 None => unreachable!()
1279 }
1280 }
1281 }
1282
1283 // Apply case folding if requested on the union character class, and
1284 // return a canonicalized class.
class_union_transform(&self, class: CharClass) -> CharClass1285 fn class_union_transform(&self, class: CharClass) -> CharClass {
1286 if self.flags.casei {
1287 // Case folding canonicalizes too
1288 class.case_fold()
1289 } else {
1290 class.canonicalize()
1291 }
1292 }
1293 }
1294
1295 impl Build {
is_empty(&self) -> bool1296 fn is_empty(&self) -> bool {
1297 match *self {
1298 Build::Expr(Expr::Empty) => true,
1299 _ => false,
1300 }
1301 }
1302 }
1303
1304 // Make it ergonomic to conditionally bump the parser.
1305 // i.e., `bump_if('a')` or `bump_if("abc")`.
1306 trait Bumpable {
match_end(self, p: &Parser) -> usize1307 fn match_end(self, p: &Parser) -> usize;
1308 }
1309
1310 impl Bumpable for char {
match_end(self, p: &Parser) -> usize1311 fn match_end(self, p: &Parser) -> usize {
1312 let mut chars = p.chars();
1313 if chars.next().map(|c| c == self).unwrap_or(false) {
1314 chars.cur
1315 } else {
1316 0
1317 }
1318 }
1319 }
1320
1321 impl<'a> Bumpable for &'a str {
match_end(self, p: &Parser) -> usize1322 fn match_end(self, p: &Parser) -> usize {
1323 let mut search = self.chars();
1324 let mut rest = p.chars();
1325 let mut count = 0;
1326 loop {
1327 match (rest.next(), search.next()) {
1328 (Some(c1), Some(c2)) if c1 == c2 => count = rest.cur,
1329 (_, None) => return count,
1330 _ => return 0,
1331 }
1332 }
1333 }
1334 }
1335
1336 impl<F: FnMut(char) -> bool> Bumpable for F {
match_end(mut self, p: &Parser) -> usize1337 fn match_end(mut self, p: &Parser) -> usize {
1338 let mut chars = p.chars();
1339 let mut count = 0;
1340 while let Some(c) = chars.next() {
1341 if !self(c) {
1342 break
1343 }
1344 count = chars.cur;
1345 }
1346 count
1347 }
1348 }
1349
1350 // Turn a sequence of expressions into a concatenation.
1351 // This only uses `Concat` if there are 2 or more expressions.
rev_concat(mut exprs: Vec<Expr>) -> Expr1352 fn rev_concat(mut exprs: Vec<Expr>) -> Expr {
1353 if exprs.len() == 0 {
1354 Expr::Empty
1355 } else if exprs.len() == 1 {
1356 exprs.pop().unwrap()
1357 } else {
1358 exprs.reverse();
1359 Expr::Concat(exprs)
1360 }
1361 }
1362
1363 // Returns true if and only if the given character is allowed in a capture
1364 // name. Note that the first char of a capture name must not be numeric.
is_valid_capture_char(c: char) -> bool1365 fn is_valid_capture_char(c: char) -> bool {
1366 c == '_' || (c >= '0' && c <= '9')
1367 || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
1368 }
1369
is_ascii_word(c: char) -> bool1370 fn is_ascii_word(c: char) -> bool {
1371 match c {
1372 'a' ... 'z' | 'A' ... 'Z' | '_' | '0' ... '9' => true,
1373 _ => false,
1374 }
1375 }
1376
1377 /// Returns true if the give character has significance in a regex.
is_punct(c: char) -> bool1378 pub fn is_punct(c: char) -> bool {
1379 match c {
1380 '\\' | '.' | '+' | '*' | '?' | '(' | ')' | '|' |
1381 '[' | ']' | '{' | '}' | '^' | '$' | '#' | '&' | '-' | '~' => true,
1382 _ => false,
1383 }
1384 }
1385
checkadd(x: usize, y: usize) -> usize1386 fn checkadd(x: usize, y: usize) -> usize {
1387 x.checked_add(y).expect("regex length overflow")
1388 }
1389
unicode_class(name: &str) -> Option<CharClass>1390 fn unicode_class(name: &str) -> Option<CharClass> {
1391 UNICODE_CLASSES.binary_search_by(|&(s, _)| s.cmp(name)).ok().map(|i| {
1392 raw_class_to_expr(UNICODE_CLASSES[i].1)
1393 })
1394 }
1395
ascii_class(name: &str) -> Option<CharClass>1396 fn ascii_class(name: &str) -> Option<CharClass> {
1397 ASCII_CLASSES.binary_search_by(|&(s, _)| s.cmp(name)).ok().map(|i| {
1398 raw_class_to_expr(ASCII_CLASSES[i].1)
1399 })
1400 }
1401
raw_class_to_expr(raw: &[(char, char)]) -> CharClass1402 fn raw_class_to_expr(raw: &[(char, char)]) -> CharClass {
1403 let range = |&(s, e)| ClassRange { start: s, end: e };
1404 CharClass::new(raw.iter().map(range).collect())
1405 }
1406
1407 type Class = &'static [(char, char)];
1408 type NamedClasses = &'static [(&'static str, Class)];
1409
1410 const ASCII_CLASSES: NamedClasses = &[
1411 // Classes must be in alphabetical order so that bsearch works.
1412 // [:alnum:] alphanumeric (== [0-9A-Za-z])
1413 // [:alpha:] alphabetic (== [A-Za-z])
1414 // [:ascii:] ASCII (== [\x00-\x7F])
1415 // [:blank:] blank (== [\t ])
1416 // [:cntrl:] control (== [\x00-\x1F\x7F])
1417 // [:digit:] digits (== [0-9])
1418 // [:graph:] graphical (== [!-~])
1419 // [:lower:] lower case (== [a-z])
1420 // [:print:] printable (== [ -~] == [ [:graph:]])
1421 // [:punct:] punctuation (== [!-/:-@[-`{-~])
1422 // [:space:] whitespace (== [\t\n\v\f\r ])
1423 // [:upper:] upper case (== [A-Z])
1424 // [:word:] word characters (== [0-9A-Za-z_])
1425 // [:xdigit:] hex digit (== [0-9A-Fa-f])
1426 // Taken from: http://golang.org/pkg/regex/syntax/
1427 ("alnum", &ALNUM),
1428 ("alpha", &ALPHA),
1429 ("ascii", &ASCII),
1430 ("blank", &BLANK),
1431 ("cntrl", &CNTRL),
1432 ("digit", &DIGIT),
1433 ("graph", &GRAPH),
1434 ("lower", &LOWER),
1435 ("print", &PRINT),
1436 ("punct", &PUNCT),
1437 ("space", &SPACE),
1438 ("upper", &UPPER),
1439 ("word", &WORD),
1440 ("xdigit", &XDIGIT),
1441 ];
1442
1443 const ALNUM: Class = &[('0', '9'), ('A', 'Z'), ('a', 'z')];
1444 const ALPHA: Class = &[('A', 'Z'), ('a', 'z')];
1445 const ASCII: Class = &[('\x00', '\x7F')];
1446 const BLANK: Class = &[(' ', ' '), ('\t', '\t')];
1447 const CNTRL: Class = &[('\x00', '\x1F'), ('\x7F', '\x7F')];
1448 const DIGIT: Class = &[('0', '9')];
1449 const GRAPH: Class = &[('!', '~')];
1450 const LOWER: Class = &[('a', 'z')];
1451 const PRINT: Class = &[(' ', '~')];
1452 const PUNCT: Class = &[('!', '/'), (':', '@'), ('[', '`'), ('{', '~')];
1453 const SPACE: Class = &[('\t', '\t'), ('\n', '\n'), ('\x0B', '\x0B'),
1454 ('\x0C', '\x0C'), ('\r', '\r'), (' ', ' ')];
1455 const UPPER: Class = &[('A', 'Z')];
1456 const WORD: Class = &[('0', '9'), ('A', 'Z'), ('_', '_'), ('a', 'z')];
1457 const XDIGIT: Class = &[('0', '9'), ('A', 'F'), ('a', 'f')];
1458
1459 #[cfg(test)]
1460 mod tests {
1461 use {
1462 CharClass, ClassRange, ByteClass, ByteRange,
1463 Expr, Repeater,
1464 ErrorKind,
1465 };
1466 use unicode::regex::{PERLD, PERLS, PERLW};
1467 use super::{LOWER, UPPER, WORD, Flags, Parser, ascii_class};
1468
1469 static YI: &'static [(char, char)] = &[
1470 ('\u{a000}', '\u{a48c}'), ('\u{a490}', '\u{a4c6}'),
1471 ];
1472
p(s: &str) -> Expr1473 fn p(s: &str) -> Expr { Parser::parse(s, Flags::default()).unwrap() }
pf(s: &str, flags: Flags) -> Expr1474 fn pf(s: &str, flags: Flags) -> Expr { Parser::parse(s, flags).unwrap() }
lit(c: char) -> Expr1475 fn lit(c: char) -> Expr { Expr::Literal { chars: vec![c], casei: false } }
liti(c: char) -> Expr1476 fn liti(c: char) -> Expr { Expr::Literal { chars: vec![c], casei: true } }
b<T>(v: T) -> Box<T>1477 fn b<T>(v: T) -> Box<T> { Box::new(v) }
c(es: &[Expr]) -> Expr1478 fn c(es: &[Expr]) -> Expr { Expr::Concat(es.to_vec()) }
1479
pb(s: &str) -> Expr1480 fn pb(s: &str) -> Expr {
1481 let flags = Flags { allow_bytes: true, .. Flags::default() };
1482 Parser::parse(s, flags).unwrap()
1483 }
1484
blit(b: u8) -> Expr1485 fn blit(b: u8) -> Expr {
1486 Expr::LiteralBytes {
1487 bytes: vec![b],
1488 casei: false,
1489 }
1490 }
1491
bliti(b: u8) -> Expr1492 fn bliti(b: u8) -> Expr {
1493 Expr::LiteralBytes {
1494 bytes: vec![b],
1495 casei: true,
1496 }
1497 }
1498
class(ranges: &[(char, char)]) -> CharClass1499 fn class(ranges: &[(char, char)]) -> CharClass {
1500 let ranges = ranges.iter().cloned()
1501 .map(|(c1, c2)| ClassRange::new(c1, c2)).collect();
1502 CharClass::new(ranges)
1503 }
1504
classes(classes: &[&[(char, char)]]) -> CharClass1505 fn classes(classes: &[&[(char, char)]]) -> CharClass {
1506 let mut cls = CharClass::empty();
1507 for &ranges in classes {
1508 cls.ranges.extend(class(ranges));
1509 }
1510 cls.canonicalize()
1511 }
1512
bclass(ranges: &[(u8, u8)]) -> ByteClass1513 fn bclass(ranges: &[(u8, u8)]) -> ByteClass {
1514 let ranges = ranges.iter().cloned()
1515 .map(|(c1, c2)| ByteRange::new(c1, c2)).collect();
1516 ByteClass::new(ranges)
1517 }
1518
asciid() -> CharClass1519 fn asciid() -> CharClass {
1520 ascii_class("digit").unwrap()
1521 }
1522
asciis() -> CharClass1523 fn asciis() -> CharClass {
1524 ascii_class("space").unwrap()
1525 }
1526
asciiw() -> CharClass1527 fn asciiw() -> CharClass {
1528 ascii_class("word").unwrap()
1529 }
1530
asciid_bytes() -> ByteClass1531 fn asciid_bytes() -> ByteClass {
1532 asciid().to_byte_class()
1533 }
1534
asciis_bytes() -> ByteClass1535 fn asciis_bytes() -> ByteClass {
1536 asciis().to_byte_class()
1537 }
1538
asciiw_bytes() -> ByteClass1539 fn asciiw_bytes() -> ByteClass {
1540 asciiw().to_byte_class()
1541 }
1542
1543 #[test]
empty()1544 fn empty() {
1545 assert_eq!(p(""), Expr::Empty);
1546 }
1547
1548 #[test]
literal()1549 fn literal() {
1550 assert_eq!(p("a"), lit('a'));
1551 assert_eq!(pb("(?-u)a"), blit(b'a'));
1552 }
1553
1554 #[test]
literal_string()1555 fn literal_string() {
1556 assert_eq!(p("ab"), Expr::Concat(vec![lit('a'), lit('b')]));
1557 assert_eq!(pb("(?-u)ab"), Expr::Concat(vec![blit(b'a'), blit(b'b')]));
1558 }
1559
1560 #[test]
start_literal()1561 fn start_literal() {
1562 assert_eq!(p("^a"), Expr::Concat(vec![
1563 Expr::StartText,
1564 Expr::Literal { chars: vec!['a'], casei: false },
1565 ]));
1566 }
1567
1568 #[test]
repeat_zero_or_one_greedy()1569 fn repeat_zero_or_one_greedy() {
1570 assert_eq!(p("a?"), Expr::Repeat {
1571 e: b(lit('a')),
1572 r: Repeater::ZeroOrOne,
1573 greedy: true,
1574 });
1575 }
1576
1577 #[test]
repeat_zero_or_one_greedy_concat()1578 fn repeat_zero_or_one_greedy_concat() {
1579 assert_eq!(p("ab?"), Expr::Concat(vec![
1580 lit('a'),
1581 Expr::Repeat {
1582 e: b(lit('b')),
1583 r: Repeater::ZeroOrOne,
1584 greedy: true,
1585 },
1586 ]));
1587 }
1588
1589 #[test]
repeat_zero_or_one_nongreedy()1590 fn repeat_zero_or_one_nongreedy() {
1591 assert_eq!(p("a??"), Expr::Repeat {
1592 e: b(lit('a')),
1593 r: Repeater::ZeroOrOne,
1594 greedy: false,
1595 });
1596 }
1597
1598 #[test]
repeat_one_or_more_greedy()1599 fn repeat_one_or_more_greedy() {
1600 assert_eq!(p("a+"), Expr::Repeat {
1601 e: b(lit('a')),
1602 r: Repeater::OneOrMore,
1603 greedy: true,
1604 });
1605 }
1606
1607 #[test]
repeat_one_or_more_nongreedy()1608 fn repeat_one_or_more_nongreedy() {
1609 assert_eq!(p("a+?"), Expr::Repeat {
1610 e: b(lit('a')),
1611 r: Repeater::OneOrMore,
1612 greedy: false,
1613 });
1614 }
1615
1616 #[test]
repeat_zero_or_more_greedy()1617 fn repeat_zero_or_more_greedy() {
1618 assert_eq!(p("a*"), Expr::Repeat {
1619 e: b(lit('a')),
1620 r: Repeater::ZeroOrMore,
1621 greedy: true,
1622 });
1623 }
1624
1625 #[test]
repeat_zero_or_more_nongreedy()1626 fn repeat_zero_or_more_nongreedy() {
1627 assert_eq!(p("a*?"), Expr::Repeat {
1628 e: b(lit('a')),
1629 r: Repeater::ZeroOrMore,
1630 greedy: false,
1631 });
1632 }
1633
1634 #[test]
repeat_counted_exact()1635 fn repeat_counted_exact() {
1636 assert_eq!(p("a{5}"), Expr::Repeat {
1637 e: b(lit('a')),
1638 r: Repeater::Range { min: 5, max: Some(5) },
1639 greedy: true,
1640 });
1641 }
1642
1643 #[test]
repeat_counted_min()1644 fn repeat_counted_min() {
1645 assert_eq!(p("a{5,}"), Expr::Repeat {
1646 e: b(lit('a')),
1647 r: Repeater::Range { min: 5, max: None },
1648 greedy: true,
1649 });
1650 }
1651
1652 #[test]
repeat_counted_min_max()1653 fn repeat_counted_min_max() {
1654 assert_eq!(p("a{5,10}"), Expr::Repeat {
1655 e: b(lit('a')),
1656 r: Repeater::Range { min: 5, max: Some(10) },
1657 greedy: true,
1658 });
1659 }
1660
1661 #[test]
repeat_counted_exact_nongreedy()1662 fn repeat_counted_exact_nongreedy() {
1663 assert_eq!(p("a{5}?"), Expr::Repeat {
1664 e: b(lit('a')),
1665 r: Repeater::Range { min: 5, max: Some(5) },
1666 greedy: false,
1667 });
1668 }
1669
1670 #[test]
repeat_counted_min_nongreedy()1671 fn repeat_counted_min_nongreedy() {
1672 assert_eq!(p("a{5,}?"), Expr::Repeat {
1673 e: b(lit('a')),
1674 r: Repeater::Range { min: 5, max: None },
1675 greedy: false,
1676 });
1677 }
1678
1679 #[test]
repeat_counted_min_max_nongreedy()1680 fn repeat_counted_min_max_nongreedy() {
1681 assert_eq!(p("a{5,10}?"), Expr::Repeat {
1682 e: b(lit('a')),
1683 r: Repeater::Range { min: 5, max: Some(10) },
1684 greedy: false,
1685 });
1686 }
1687
1688 #[test]
repeat_counted_whitespace()1689 fn repeat_counted_whitespace() {
1690 assert_eq!(p("a{ 5 }"), Expr::Repeat {
1691 e: b(lit('a')),
1692 r: Repeater::Range { min: 5, max: Some(5) },
1693 greedy: true,
1694 });
1695 assert_eq!(p("a{ 5 , 10 }"), Expr::Repeat {
1696 e: b(lit('a')),
1697 r: Repeater::Range { min: 5, max: Some(10) },
1698 greedy: true,
1699 });
1700 }
1701
1702 #[test]
group_literal()1703 fn group_literal() {
1704 assert_eq!(p("(a)"), Expr::Group {
1705 e: b(lit('a')),
1706 i: Some(1),
1707 name: None,
1708 });
1709 }
1710
1711 #[test]
group_literal_concat()1712 fn group_literal_concat() {
1713 assert_eq!(p("(ab)"), Expr::Group {
1714 e: b(c(&[lit('a'), lit('b')])),
1715 i: Some(1),
1716 name: None,
1717 });
1718 }
1719
1720 #[test]
alt_two()1721 fn alt_two() {
1722 assert_eq!(p("a|b"), Expr::Alternate(vec![lit('a'), lit('b')]));
1723 }
1724
1725 #[test]
alt_many()1726 fn alt_many() {
1727 assert_eq!(p("a|b|c"), Expr::Alternate(vec![
1728 lit('a'), lit('b'), lit('c'),
1729 ]));
1730 }
1731
1732 #[test]
alt_many_concat()1733 fn alt_many_concat() {
1734 assert_eq!(p("ab|bc|cd"), Expr::Alternate(vec![
1735 c(&[lit('a'), lit('b')]),
1736 c(&[lit('b'), lit('c')]),
1737 c(&[lit('c'), lit('d')]),
1738 ]));
1739 }
1740
1741 #[test]
alt_group_two()1742 fn alt_group_two() {
1743 assert_eq!(p("(a|b)"), Expr::Group {
1744 e: b(Expr::Alternate(vec![lit('a'), lit('b')])),
1745 i: Some(1),
1746 name: None,
1747 });
1748 }
1749
1750 #[test]
alt_group_many()1751 fn alt_group_many() {
1752 assert_eq!(p("(a|b|c)"), Expr::Group {
1753 e: b(Expr::Alternate(vec![lit('a'), lit('b'), lit('c')])),
1754 i: Some(1),
1755 name: None,
1756 });
1757 }
1758
1759 #[test]
alt_group_many_concat()1760 fn alt_group_many_concat() {
1761 assert_eq!(p("(ab|bc|cd)"), Expr::Group {
1762 e: b(Expr::Alternate(vec![
1763 c(&[lit('a'), lit('b')]),
1764 c(&[lit('b'), lit('c')]),
1765 c(&[lit('c'), lit('d')]),
1766 ])),
1767 i: Some(1),
1768 name: None,
1769 });
1770 }
1771
1772 #[test]
alt_group_nested()1773 fn alt_group_nested() {
1774 assert_eq!(p("(ab|(bc|(cd)))"), Expr::Group {
1775 e: b(Expr::Alternate(vec![
1776 c(&[lit('a'), lit('b')]),
1777 Expr::Group {
1778 e: b(Expr::Alternate(vec![
1779 c(&[lit('b'), lit('c')]),
1780 Expr::Group {
1781 e: b(c(&[lit('c'), lit('d')])),
1782 i: Some(3),
1783 name: None,
1784 }
1785 ])),
1786 i: Some(2),
1787 name: None,
1788 },
1789 ])),
1790 i: Some(1),
1791 name: None,
1792 });
1793 }
1794
1795 #[test]
group_name()1796 fn group_name() {
1797 assert_eq!(p("(?P<foo>a)"), Expr::Group {
1798 e: b(lit('a')),
1799 i: Some(1),
1800 name: Some("foo".into()),
1801 });
1802 }
1803
1804 #[test]
group_no_capture()1805 fn group_no_capture() {
1806 assert_eq!(p("(?:a)"), Expr::Group {
1807 e: b(lit('a')),
1808 i: None,
1809 name: None,
1810 });
1811 }
1812
1813 #[test]
group_flags()1814 fn group_flags() {
1815 assert_eq!(p("(?i:a)"), Expr::Group {
1816 e: b(liti('a')),
1817 i: None,
1818 name: None,
1819 });
1820 assert_eq!(pb("(?i-u:a)"), Expr::Group {
1821 e: b(bliti(b'a')),
1822 i: None,
1823 name: None,
1824 });
1825 }
1826
1827 #[test]
group_flags_returned()1828 fn group_flags_returned() {
1829 assert_eq!(p("(?i:a)a"), c(&[
1830 Expr::Group {
1831 e: b(liti('a')),
1832 i: None,
1833 name: None,
1834 },
1835 lit('a'),
1836 ]));
1837 assert_eq!(pb("(?i-u:a)a"), c(&[
1838 Expr::Group {
1839 e: b(bliti(b'a')),
1840 i: None,
1841 name: None,
1842 },
1843 lit('a'),
1844 ]));
1845 }
1846
1847 #[test]
group_flags_retained()1848 fn group_flags_retained() {
1849 assert_eq!(p("(?i)(?-i:a)a"), c(&[
1850 Expr::Group {
1851 e: b(lit('a')),
1852 i: None,
1853 name: None,
1854 },
1855 liti('a'),
1856 ]));
1857 assert_eq!(pb("(?i-u)(?u-i:a)a"), c(&[
1858 Expr::Group {
1859 e: b(lit('a')),
1860 i: None,
1861 name: None,
1862 },
1863 bliti(b'a'),
1864 ]));
1865 }
1866
1867 #[test]
flags_inline()1868 fn flags_inline() {
1869 assert_eq!(p("(?i)a"), liti('a'));
1870 }
1871
1872 #[test]
flags_inline_multiple()1873 fn flags_inline_multiple() {
1874 assert_eq!(p("(?is)a."), c(&[liti('a'), Expr::AnyChar]));
1875 }
1876
1877 #[test]
flags_inline_multiline()1878 fn flags_inline_multiline() {
1879 assert_eq!(p("(?m)^(?-m)$"), c(&[Expr::StartLine, Expr::EndText]));
1880 }
1881
1882 #[test]
flags_inline_swap_greed()1883 fn flags_inline_swap_greed() {
1884 assert_eq!(p("(?U)a*a*?(?i-U)a*a*?"), c(&[
1885 Expr::Repeat {
1886 e: b(lit('a')),
1887 r: Repeater::ZeroOrMore,
1888 greedy: false,
1889 },
1890 Expr::Repeat {
1891 e: b(lit('a')),
1892 r: Repeater::ZeroOrMore,
1893 greedy: true,
1894 },
1895 Expr::Repeat {
1896 e: b(liti('a')),
1897 r: Repeater::ZeroOrMore,
1898 greedy: true,
1899 },
1900 Expr::Repeat {
1901 e: b(liti('a')),
1902 r: Repeater::ZeroOrMore,
1903 greedy: false,
1904 },
1905 ]));
1906 }
1907
1908 #[test]
flags_inline_multiple_negate_one()1909 fn flags_inline_multiple_negate_one() {
1910 assert_eq!(p("(?is)a.(?i-s)a."), c(&[
1911 liti('a'), Expr::AnyChar, liti('a'), Expr::AnyCharNoNL,
1912 ]));
1913 }
1914
1915 #[test]
any_byte()1916 fn any_byte() {
1917 assert_eq!(
1918 pb("(?-u).(?u)."), c(&[Expr::AnyByteNoNL, Expr::AnyCharNoNL]));
1919 assert_eq!(
1920 pb("(?s)(?-u).(?u)."), c(&[Expr::AnyByte, Expr::AnyChar]));
1921 }
1922
1923 #[test]
flags_inline_negate()1924 fn flags_inline_negate() {
1925 assert_eq!(p("(?i)a(?-i)a"), c(&[liti('a'), lit('a')]));
1926 }
1927
1928 #[test]
flags_group_inline()1929 fn flags_group_inline() {
1930 assert_eq!(p("(a(?i)a)a"), c(&[
1931 Expr::Group {
1932 e: b(c(&[lit('a'), liti('a')])),
1933 i: Some(1),
1934 name: None,
1935 },
1936 lit('a'),
1937 ]));
1938 }
1939
1940 #[test]
flags_group_inline_retain()1941 fn flags_group_inline_retain() {
1942 assert_eq!(p("(?i)((?-i)a)a"), c(&[
1943 Expr::Group {
1944 e: b(lit('a')),
1945 i: Some(1),
1946 name: None,
1947 },
1948 liti('a'),
1949 ]));
1950 }
1951
1952 #[test]
flags_default_casei()1953 fn flags_default_casei() {
1954 let flags = Flags { casei: true, .. Flags::default() };
1955 assert_eq!(pf("a", flags), liti('a'));
1956 }
1957
1958 #[test]
flags_default_multi()1959 fn flags_default_multi() {
1960 let flags = Flags { multi: true, .. Flags::default() };
1961 assert_eq!(pf("^", flags), Expr::StartLine);
1962 }
1963
1964 #[test]
flags_default_dotnl()1965 fn flags_default_dotnl() {
1966 let flags = Flags { dotnl: true, .. Flags::default() };
1967 assert_eq!(pf(".", flags), Expr::AnyChar);
1968 }
1969
1970 #[test]
flags_default_swap_greed()1971 fn flags_default_swap_greed() {
1972 let flags = Flags { swap_greed: true, .. Flags::default() };
1973 assert_eq!(pf("a+", flags), Expr::Repeat {
1974 e: b(lit('a')),
1975 r: Repeater::OneOrMore,
1976 greedy: false,
1977 });
1978 }
1979
1980 #[test]
flags_default_ignore_space()1981 fn flags_default_ignore_space() {
1982 let flags = Flags { ignore_space: true, .. Flags::default() };
1983 assert_eq!(pf(" a ", flags), lit('a'));
1984 }
1985
1986 #[test]
escape_simple()1987 fn escape_simple() {
1988 assert_eq!(p(r"\a\f\t\n\r\v"), c(&[
1989 lit('\x07'), lit('\x0C'), lit('\t'),
1990 lit('\n'), lit('\r'), lit('\x0B'),
1991 ]));
1992 }
1993
1994 #[test]
escape_boundaries()1995 fn escape_boundaries() {
1996 assert_eq!(p(r"\A\z\b\B"), c(&[
1997 Expr::StartText, Expr::EndText,
1998 Expr::WordBoundary, Expr::NotWordBoundary,
1999 ]));
2000 assert_eq!(pb(r"(?-u)\b\B"), c(&[
2001 Expr::WordBoundaryAscii, Expr::NotWordBoundaryAscii,
2002 ]));
2003 }
2004
2005 #[test]
escape_punctuation()2006 fn escape_punctuation() {
2007 assert_eq!(p(r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#"), c(&[
2008 lit('\\'), lit('.'), lit('+'), lit('*'), lit('?'),
2009 lit('('), lit(')'), lit('|'), lit('['), lit(']'),
2010 lit('{'), lit('}'), lit('^'), lit('$'), lit('#'),
2011 ]));
2012 }
2013
2014 #[test]
escape_octal()2015 fn escape_octal() {
2016 assert_eq!(p(r"\123"), lit('S'));
2017 assert_eq!(p(r"\1234"), c(&[lit('S'), lit('4')]));
2018
2019 assert_eq!(pb(r"(?-u)\377"), blit(0xFF));
2020 }
2021
2022 #[test]
escape_hex2()2023 fn escape_hex2() {
2024 assert_eq!(p(r"\x53"), lit('S'));
2025 assert_eq!(p(r"\x534"), c(&[lit('S'), lit('4')]));
2026
2027 assert_eq!(pb(r"(?-u)\xff"), blit(0xFF));
2028 assert_eq!(pb(r"(?-u)\x00"), blit(0x0));
2029 assert_eq!(pb(r"(?-u)[\x00]"),
2030 Expr::ClassBytes(bclass(&[(b'\x00', b'\x00')])));
2031 assert_eq!(pb(r"(?-u)[^\x00]"),
2032 Expr::ClassBytes(bclass(&[(b'\x01', b'\xFF')])));
2033 }
2034
2035 #[test]
escape_hex()2036 fn escape_hex() {
2037 assert_eq!(p(r"\x{53}"), lit('S'));
2038 assert_eq!(p(r"\x{53}4"), c(&[lit('S'), lit('4')]));
2039 assert_eq!(p(r"\x{2603}"), lit('\u{2603}'));
2040
2041 assert_eq!(pb(r"(?-u)\x{00FF}"), blit(0xFF));
2042 }
2043
2044 #[test]
escape_unicode_name()2045 fn escape_unicode_name() {
2046 assert_eq!(p(r"\p{Yi}"), Expr::Class(class(YI)));
2047 }
2048
2049 #[test]
escape_unicode_letter()2050 fn escape_unicode_letter() {
2051 assert_eq!(p(r"\pZ"), Expr::Class(class(&[
2052 ('\u{20}', '\u{20}'), ('\u{a0}', '\u{a0}'),
2053 ('\u{1680}', '\u{1680}'), ('\u{2000}', '\u{200a}'),
2054 ('\u{2028}', '\u{2029}'), ('\u{202f}', '\u{202f}'),
2055 ('\u{205f}', '\u{205f}'), ('\u{3000}', '\u{3000}'),
2056 ])));
2057 }
2058
2059 #[test]
escape_unicode_name_case_fold()2060 fn escape_unicode_name_case_fold() {
2061 assert_eq!(p(r"(?i)\p{Yi}"), Expr::Class(class(YI).case_fold()));
2062 }
2063
2064 #[test]
escape_unicode_letter_case_fold()2065 fn escape_unicode_letter_case_fold() {
2066 assert_eq!(p(r"(?i)\pZ"), Expr::Class(class(&[
2067 ('\u{20}', '\u{20}'), ('\u{a0}', '\u{a0}'),
2068 ('\u{1680}', '\u{1680}'), ('\u{2000}', '\u{200a}'),
2069 ('\u{2028}', '\u{2029}'), ('\u{202f}', '\u{202f}'),
2070 ('\u{205f}', '\u{205f}'), ('\u{3000}', '\u{3000}'),
2071 ]).case_fold()));
2072 }
2073
2074 #[test]
escape_unicode_name_negate()2075 fn escape_unicode_name_negate() {
2076 assert_eq!(p(r"\P{Yi}"), Expr::Class(class(YI).negate()));
2077 }
2078
2079 #[test]
escape_unicode_letter_negate()2080 fn escape_unicode_letter_negate() {
2081 assert_eq!(p(r"\PZ"), Expr::Class(class(&[
2082 ('\u{20}', '\u{20}'), ('\u{a0}', '\u{a0}'),
2083 ('\u{1680}', '\u{1680}'), ('\u{2000}', '\u{200a}'),
2084 ('\u{2028}', '\u{2029}'), ('\u{202f}', '\u{202f}'),
2085 ('\u{205f}', '\u{205f}'), ('\u{3000}', '\u{3000}'),
2086 ]).negate()));
2087 }
2088
2089 #[test]
escape_unicode_name_negate_case_fold()2090 fn escape_unicode_name_negate_case_fold() {
2091 assert_eq!(p(r"(?i)\P{Yi}"),
2092 Expr::Class(class(YI).negate().case_fold()));
2093 }
2094
2095 #[test]
escape_unicode_letter_negate_case_fold()2096 fn escape_unicode_letter_negate_case_fold() {
2097 assert_eq!(p(r"(?i)\PZ"), Expr::Class(class(&[
2098 ('\u{20}', '\u{20}'), ('\u{a0}', '\u{a0}'),
2099 ('\u{1680}', '\u{1680}'), ('\u{2000}', '\u{200a}'),
2100 ('\u{2028}', '\u{2029}'), ('\u{202f}', '\u{202f}'),
2101 ('\u{205f}', '\u{205f}'), ('\u{3000}', '\u{3000}'),
2102 ]).negate().case_fold()));
2103 }
2104
2105 #[test]
escape_perl_d()2106 fn escape_perl_d() {
2107 assert_eq!(p(r"\d"), Expr::Class(class(PERLD)));
2108 assert_eq!(pb(r"(?-u)\d"), Expr::Class(asciid()));
2109 }
2110
2111 #[test]
escape_perl_s()2112 fn escape_perl_s() {
2113 assert_eq!(p(r"\s"), Expr::Class(class(PERLS)));
2114 assert_eq!(pb(r"(?-u)\s"), Expr::Class(asciis()));
2115 }
2116
2117 #[test]
escape_perl_w()2118 fn escape_perl_w() {
2119 assert_eq!(p(r"\w"), Expr::Class(class(PERLW)));
2120 assert_eq!(pb(r"(?-u)\w"), Expr::Class(asciiw()));
2121 }
2122
2123 #[test]
escape_perl_d_negate()2124 fn escape_perl_d_negate() {
2125 assert_eq!(p(r"\D"), Expr::Class(class(PERLD).negate()));
2126 assert_eq!(pb(r"(?-u)\D"), Expr::Class(asciid().negate()));
2127 }
2128
2129 #[test]
escape_perl_s_negate()2130 fn escape_perl_s_negate() {
2131 assert_eq!(p(r"\S"), Expr::Class(class(PERLS).negate()));
2132 assert_eq!(pb(r"(?-u)\S"), Expr::Class(asciis().negate()));
2133 }
2134
2135 #[test]
escape_perl_w_negate()2136 fn escape_perl_w_negate() {
2137 assert_eq!(p(r"\W"), Expr::Class(class(PERLW).negate()));
2138 assert_eq!(pb(r"(?-u)\W"), Expr::Class(asciiw().negate()));
2139 }
2140
2141 #[test]
escape_perl_d_case_fold()2142 fn escape_perl_d_case_fold() {
2143 assert_eq!(p(r"(?i)\d"), Expr::Class(class(PERLD).case_fold()));
2144 assert_eq!(pb(r"(?i-u)\d"), Expr::Class(asciid().case_fold()));
2145 }
2146
2147 #[test]
escape_perl_s_case_fold()2148 fn escape_perl_s_case_fold() {
2149 assert_eq!(p(r"(?i)\s"), Expr::Class(class(PERLS).case_fold()));
2150 assert_eq!(pb(r"(?i-u)\s"), Expr::Class(asciis().case_fold()));
2151 }
2152
2153 #[test]
escape_perl_w_case_fold()2154 fn escape_perl_w_case_fold() {
2155 assert_eq!(p(r"(?i)\w"), Expr::Class(class(PERLW).case_fold()));
2156 assert_eq!(pb(r"(?i-u)\w"), Expr::Class(asciiw().case_fold()));
2157 }
2158
2159 #[test]
escape_perl_d_case_fold_negate()2160 fn escape_perl_d_case_fold_negate() {
2161 assert_eq!(p(r"(?i)\D"),
2162 Expr::Class(class(PERLD).case_fold().negate()));
2163 let bytes = asciid().case_fold().negate();
2164 assert_eq!(pb(r"(?i-u)\D"), Expr::Class(bytes));
2165 }
2166
2167 #[test]
escape_perl_s_case_fold_negate()2168 fn escape_perl_s_case_fold_negate() {
2169 assert_eq!(p(r"(?i)\S"),
2170 Expr::Class(class(PERLS).case_fold().negate()));
2171 let bytes = asciis().case_fold().negate();
2172 assert_eq!(pb(r"(?i-u)\S"), Expr::Class(bytes));
2173 }
2174
2175 #[test]
escape_perl_w_case_fold_negate()2176 fn escape_perl_w_case_fold_negate() {
2177 assert_eq!(p(r"(?i)\W"),
2178 Expr::Class(class(PERLW).case_fold().negate()));
2179 let bytes = asciiw().case_fold().negate();
2180 assert_eq!(pb(r"(?i-u)\W"), Expr::Class(bytes));
2181 }
2182
2183 #[test]
class_singleton()2184 fn class_singleton() {
2185 assert_eq!(p(r"[a]"), Expr::Class(class(&[('a', 'a')])));
2186 assert_eq!(p(r"[\x00]"), Expr::Class(class(&[('\x00', '\x00')])));
2187 assert_eq!(p(r"[\n]"), Expr::Class(class(&[('\n', '\n')])));
2188 assert_eq!(p("[\n]"), Expr::Class(class(&[('\n', '\n')])));
2189
2190 assert_eq!(pb(r"(?-u)[a]"), Expr::ClassBytes(bclass(&[(b'a', b'a')])));
2191 assert_eq!(pb(r"(?-u)[\x00]"), Expr::ClassBytes(bclass(&[(0, 0)])));
2192 assert_eq!(pb(r"(?-u)[\xFF]"),
2193 Expr::ClassBytes(bclass(&[(0xFF, 0xFF)])));
2194 assert_eq!(pb("(?-u)[\n]"),
2195 Expr::ClassBytes(bclass(&[(b'\n', b'\n')])));
2196 assert_eq!(pb(r"(?-u)[\n]"),
2197 Expr::ClassBytes(bclass(&[(b'\n', b'\n')])));
2198 }
2199
2200 #[test]
class_singleton_negate()2201 fn class_singleton_negate() {
2202 assert_eq!(p(r"[^a]"), Expr::Class(class(&[
2203 ('\x00', '\x60'), ('\x62', '\u{10FFFF}'),
2204 ])));
2205 assert_eq!(p(r"[^\x00]"), Expr::Class(class(&[
2206 ('\x01', '\u{10FFFF}'),
2207 ])));
2208 assert_eq!(p(r"[^\n]"), Expr::Class(class(&[
2209 ('\x00', '\x09'), ('\x0b', '\u{10FFFF}'),
2210 ])));
2211 assert_eq!(p("[^\n]"), Expr::Class(class(&[
2212 ('\x00', '\x09'), ('\x0b', '\u{10FFFF}'),
2213 ])));
2214
2215 assert_eq!(pb(r"(?-u)[^a]"), Expr::ClassBytes(bclass(&[
2216 (0x00, 0x60), (0x62, 0xFF),
2217 ])));
2218 assert_eq!(pb(r"(?-u)[^\x00]"), Expr::ClassBytes(bclass(&[
2219 (0x01, 0xFF),
2220 ])));
2221 assert_eq!(pb(r"(?-u)[^\n]"), Expr::ClassBytes(bclass(&[
2222 (0x00, 0x09), (0x0B, 0xFF),
2223 ])));
2224 assert_eq!(pb("(?-u)[^\n]"), Expr::ClassBytes(bclass(&[
2225 (0x00, 0x09), (0x0B, 0xFF),
2226 ])));
2227 }
2228
2229 #[test]
class_singleton_class()2230 fn class_singleton_class() {
2231 assert_eq!(p(r"[\d]"), Expr::Class(class(PERLD)));
2232 assert_eq!(p(r"[\p{Yi}]"), Expr::Class(class(YI)));
2233
2234 let bytes = class(PERLD).to_byte_class();
2235 assert_eq!(pb(r"(?-u)[\d]"), Expr::ClassBytes(bytes));
2236 }
2237
2238 #[test]
class_singleton_class_negate()2239 fn class_singleton_class_negate() {
2240 assert_eq!(p(r"[^\d]"), Expr::Class(class(PERLD).negate()));
2241 assert_eq!(p(r"[^\w]"), Expr::Class(class(PERLW).negate()));
2242 assert_eq!(p(r"[^\s]"), Expr::Class(class(PERLS).negate()));
2243
2244 let bytes = asciid_bytes().negate();
2245 assert_eq!(pb(r"(?-u)[^\d]"), Expr::ClassBytes(bytes));
2246 let bytes = asciiw_bytes().negate();
2247 assert_eq!(pb(r"(?-u)[^\w]"), Expr::ClassBytes(bytes));
2248 let bytes = asciis_bytes().negate();
2249 assert_eq!(pb(r"(?-u)[^\s]"), Expr::ClassBytes(bytes));
2250 }
2251
2252 #[test]
class_singleton_class_negate_negate()2253 fn class_singleton_class_negate_negate() {
2254 assert_eq!(p(r"[^\D]"), Expr::Class(class(PERLD)));
2255 assert_eq!(p(r"[^\W]"), Expr::Class(class(PERLW)));
2256 assert_eq!(p(r"[^\S]"), Expr::Class(class(PERLS)));
2257
2258 assert_eq!(pb(r"(?-u)[^\D]"), Expr::ClassBytes(asciid_bytes()));
2259 assert_eq!(pb(r"(?-u)[^\W]"), Expr::ClassBytes(asciiw_bytes()));
2260 assert_eq!(pb(r"(?-u)[^\S]"), Expr::ClassBytes(asciis_bytes()));
2261 }
2262
2263 #[test]
class_singleton_class_casei()2264 fn class_singleton_class_casei() {
2265 assert_eq!(p(r"(?i)[\d]"), Expr::Class(class(PERLD).case_fold()));
2266 assert_eq!(p(r"(?i)[\p{Yi}]"), Expr::Class(class(YI).case_fold()));
2267
2268 assert_eq!(pb(r"(?i-u)[\d]"),
2269 Expr::ClassBytes(asciid_bytes().case_fold()));
2270 }
2271
2272 #[test]
class_singleton_class_negate_casei()2273 fn class_singleton_class_negate_casei() {
2274 assert_eq!(p(r"(?i)[^\d]"),
2275 Expr::Class(class(PERLD).case_fold().negate()));
2276 assert_eq!(p(r"(?i)[^\w]"),
2277 Expr::Class(class(PERLW).case_fold().negate()));
2278 assert_eq!(p(r"(?i)[^\s]"),
2279 Expr::Class(class(PERLS).case_fold().negate()));
2280
2281 let bytes = asciid_bytes().case_fold().negate();
2282 assert_eq!(pb(r"(?i-u)[^\d]"), Expr::ClassBytes(bytes));
2283 let bytes = asciiw_bytes().case_fold().negate();
2284 assert_eq!(pb(r"(?i-u)[^\w]"), Expr::ClassBytes(bytes));
2285 let bytes = asciis_bytes().case_fold().negate();
2286 assert_eq!(pb(r"(?i-u)[^\s]"), Expr::ClassBytes(bytes));
2287 }
2288
2289 #[test]
class_singleton_class_negate_negate_casei()2290 fn class_singleton_class_negate_negate_casei() {
2291 assert_eq!(p(r"(?i)[^\D]"), Expr::Class(class(PERLD).case_fold()));
2292 assert_eq!(p(r"(?i)[^\W]"), Expr::Class(class(PERLW).case_fold()));
2293 assert_eq!(p(r"(?i)[^\S]"), Expr::Class(class(PERLS).case_fold()));
2294
2295 assert_eq!(pb(r"(?i-u)[^\D]"),
2296 Expr::ClassBytes(asciid_bytes().case_fold()));
2297 assert_eq!(pb(r"(?i-u)[^\W]"),
2298 Expr::ClassBytes(asciiw_bytes().case_fold()));
2299 assert_eq!(pb(r"(?i-u)[^\S]"),
2300 Expr::ClassBytes(asciis_bytes().case_fold()));
2301 }
2302
2303 #[test]
class_multiple_class()2304 fn class_multiple_class() {
2305 assert_eq!(p(r"[\d\p{Yi}]"), Expr::Class(classes(&[
2306 PERLD, YI,
2307 ])));
2308 }
2309
2310 #[test]
class_multiple_class_negate()2311 fn class_multiple_class_negate() {
2312 assert_eq!(p(r"[^\d\p{Yi}]"), Expr::Class(classes(&[
2313 PERLD, YI,
2314 ]).negate()));
2315 }
2316
2317 #[test]
class_multiple_class_negate_negate()2318 fn class_multiple_class_negate_negate() {
2319 let nperlw = class(PERLW).negate();
2320 let nyi = class(YI).negate();
2321 let cls = CharClass::empty().merge(nperlw).merge(nyi);
2322 assert_eq!(p(r"[^\W\P{Yi}]"), Expr::Class(cls.negate()));
2323 }
2324
2325 #[test]
class_multiple_class_casei()2326 fn class_multiple_class_casei() {
2327 assert_eq!(p(r"(?i)[\d\p{Yi}]"), Expr::Class(classes(&[
2328 PERLD, YI,
2329 ]).case_fold()));
2330 }
2331
2332 #[test]
class_multiple_class_negate_casei()2333 fn class_multiple_class_negate_casei() {
2334 assert_eq!(p(r"(?i)[^\d\p{Yi}]"), Expr::Class(classes(&[
2335 PERLD, YI,
2336 ]).case_fold().negate()));
2337 }
2338
2339 #[test]
class_multiple_class_negate_negate_casei()2340 fn class_multiple_class_negate_negate_casei() {
2341 let nperlw = class(PERLW).negate();
2342 let nyi = class(YI).negate();
2343 let class = CharClass::empty().merge(nperlw).merge(nyi);
2344 assert_eq!(p(r"(?i)[^\W\P{Yi}]"),
2345 Expr::Class(class.case_fold().negate()));
2346 }
2347
2348 #[test]
class_class_hypen()2349 fn class_class_hypen() {
2350 assert_eq!(p(r"[\p{Yi}-]"), Expr::Class(classes(&[
2351 &[('-', '-')], YI,
2352 ])));
2353 assert_eq!(p(r"[\p{Yi}-a]"), Expr::Class(classes(&[
2354 &[('-', '-')], &[('a', 'a')], YI,
2355 ])));
2356 }
2357
2358 #[test]
class_brackets()2359 fn class_brackets() {
2360 assert_eq!(p(r"[]]"), Expr::Class(class(&[(']', ']')])));
2361 assert_eq!(p(r"[]\[]"), Expr::Class(class(&[('[', '['), (']', ']')])));
2362 assert_eq!(p(r"[\[]]"), Expr::Concat(vec![
2363 Expr::Class(class(&[('[', '[')])),
2364 lit(']'),
2365 ]));
2366 }
2367
2368 #[test]
class_brackets_hypen()2369 fn class_brackets_hypen() {
2370 assert_eq!(p("[]-]"), Expr::Class(class(&[('-', '-'), (']', ']')])));
2371 assert_eq!(p("[-]]"), Expr::Concat(vec![
2372 Expr::Class(class(&[('-', '-')])),
2373 lit(']'),
2374 ]));
2375 }
2376
2377 #[test]
class_nested_class_union()2378 fn class_nested_class_union() {
2379 assert_eq!(p(r"[c[a-b]]"), Expr::Class(class(&[('a', 'c')])));
2380 assert_eq!(p(r"[[a-b]]"), Expr::Class(class(&[('a', 'b')])));
2381 assert_eq!(p(r"[[c][a-b]]"), Expr::Class(class(&[('a', 'c')])));
2382
2383 assert_eq!(pb(r"(?-u)[c[a-b]]"),
2384 Expr::ClassBytes(bclass(&[(b'a', b'c')])));
2385 assert_eq!(pb(r"(?-u)[[a-b]]"),
2386 Expr::ClassBytes(bclass(&[(b'a', b'b')])));
2387 assert_eq!(pb(r"(?-u)[[c][a-b]]"),
2388 Expr::ClassBytes(bclass(&[(b'a', b'c')])));
2389 }
2390
2391 #[test]
class_nested_class_union_casei()2392 fn class_nested_class_union_casei() {
2393 assert_eq!(p(r"(?i)[c[a-b]]"),
2394 Expr::Class(class(&[('a', 'c')]).case_fold()));
2395 assert_eq!(p(r"(?i)[[a-b]]"),
2396 Expr::Class(class(&[('a', 'b')]).case_fold()));
2397 assert_eq!(p(r"(?i)[[c][a-b]]"),
2398 Expr::Class(class(&[('a', 'c')]).case_fold()));
2399
2400 assert_eq!(pb(r"(?i-u)[[\d]]"),
2401 Expr::ClassBytes(asciid_bytes().case_fold()));
2402 }
2403
2404 #[test]
class_nested_class_negate()2405 fn class_nested_class_negate() {
2406 assert_eq!(p(r"[^[\d]]"), Expr::Class(class(PERLD).negate()));
2407 assert_eq!(p(r"[[^\d]]"), Expr::Class(class(PERLD).negate()));
2408 assert_eq!(p(r"[^[^\d]]"), Expr::Class(class(PERLD)));
2409 assert_eq!(p(r"[^[\w]]"), Expr::Class(class(PERLW).negate()));
2410 assert_eq!(p(r"[[^\w]]"), Expr::Class(class(PERLW).negate()));
2411 assert_eq!(p(r"[^[^\w]]"), Expr::Class(class(PERLW)));
2412 assert_eq!(p(r"[a-b[^c]]"),
2413 Expr::Class(class(&[('\u{0}', 'b'), ('d', '\u{10FFFF}')])));
2414
2415 assert_eq!(pb(r"(?-u)[^[\d]]"),
2416 Expr::ClassBytes(asciid_bytes().negate()));
2417 assert_eq!(pb(r"(?-u)[[^\d]]"),
2418 Expr::ClassBytes(asciid_bytes().negate()));
2419 assert_eq!(pb(r"(?-u)[^[^\d]]"),
2420 Expr::ClassBytes(asciid_bytes()));
2421 assert_eq!(pb(r"(?-u)[^[\w]]"),
2422 Expr::ClassBytes(asciiw_bytes().negate()));
2423 assert_eq!(pb(r"(?-u)[[^\w]]"),
2424 Expr::ClassBytes(asciiw_bytes().negate()));
2425 assert_eq!(pb(r"(?-u)[^[^\w]]"),
2426 Expr::ClassBytes(asciiw_bytes()));
2427 assert_eq!(pb(r"(?-u)[a-b[^c]]"),
2428 Expr::ClassBytes(bclass(&[(b'\x00', b'b'), (b'd', b'\xFF')])))
2429 }
2430
2431 #[test]
class_nested_class_negate_casei()2432 fn class_nested_class_negate_casei() {
2433 assert_eq!(p(r"(?i)[^[\d]]"),
2434 Expr::Class(class(PERLD).case_fold().negate()));
2435 assert_eq!(p(r"(?i)[[^\d]]"),
2436 Expr::Class(class(PERLD).case_fold().negate()));
2437 assert_eq!(p(r"(?i)[^[^\d]]"),
2438 Expr::Class(class(PERLD).case_fold()));
2439 assert_eq!(p(r"(?i)[^[\w]]"),
2440 Expr::Class(class(PERLW).case_fold().negate()));
2441 assert_eq!(p(r"(?i)[[^\w]]"),
2442 Expr::Class(class(PERLW).case_fold().negate()));
2443 assert_eq!(p(r"(?i)[^[^\w]]"),
2444 Expr::Class(class(PERLW).case_fold()));
2445 let mut cls = CharClass::empty().negate();
2446 cls.remove('c');
2447 cls.remove('C');
2448 assert_eq!(p(r"(?i)[a-b[^c]]"), Expr::Class(cls));
2449
2450 assert_eq!(pb(r"(?i-u)[^[\d]]"),
2451 Expr::ClassBytes(asciid_bytes().case_fold().negate()));
2452 assert_eq!(pb(r"(?i-u)[[^\d]]"),
2453 Expr::ClassBytes(asciid_bytes().case_fold().negate()));
2454 assert_eq!(pb(r"(?i-u)[^[^\d]]"),
2455 Expr::ClassBytes(asciid_bytes().case_fold()));
2456 assert_eq!(pb(r"(?i-u)[^[\w]]"),
2457 Expr::ClassBytes(asciiw_bytes().case_fold().negate()));
2458 assert_eq!(pb(r"(?i-u)[[^\w]]"),
2459 Expr::ClassBytes(asciiw_bytes().case_fold().negate()));
2460 assert_eq!(pb(r"(?i-u)[^[^\w]]"),
2461 Expr::ClassBytes(asciiw_bytes().case_fold()));
2462 let mut bytes = ByteClass::new(vec![]).negate();
2463 bytes.remove(b'c');
2464 bytes.remove(b'C');
2465 assert_eq!(pb(r"(?i-u)[a-b[^c]]"), Expr::ClassBytes(bytes));
2466 }
2467
2468 #[test]
class_nested_class_brackets_hyphen()2469 fn class_nested_class_brackets_hyphen() {
2470 // This is confusing, but `]` is allowed if first character within a class
2471 // It parses as a nested class with the `]` and `-` characters
2472 assert_eq!(p(r"[[]-]]"), Expr::Class(class(&[('-', '-'), (']', ']')])));
2473 assert_eq!(p(r"[[\[]]"), Expr::Class(class(&[('[', '[')])));
2474 assert_eq!(p(r"[[\]]]"), Expr::Class(class(&[(']', ']')])));
2475 }
2476
2477 #[test]
class_nested_class_deep_nesting()2478 fn class_nested_class_deep_nesting() {
2479 // Makes sure that implementation can handle deep nesting.
2480 // With recursive parsing, this regex would blow the stack size.
2481 use std::iter::repeat;
2482 let nesting = 10_000;
2483 let open: String = repeat("[").take(nesting).collect();
2484 let close: String = repeat("]").take(nesting).collect();
2485 let s = format!("{}a{}", open, close);
2486 assert_eq!(p(&s), Expr::Class(class(&[('a', 'a')])));
2487 }
2488
2489 #[test]
class_intersection_ranges()2490 fn class_intersection_ranges() {
2491 assert_eq!(p(r"[abc&&b-c]"), Expr::Class(class(&[('b', 'c')])));
2492 assert_eq!(p(r"[abc&&[b-c]]"), Expr::Class(class(&[('b', 'c')])));
2493 assert_eq!(p(r"[[abc]&&[b-c]]"), Expr::Class(class(&[('b', 'c')])));
2494 assert_eq!(p(r"[a-z&&b-y&&c-x]"), Expr::Class(class(&[('c', 'x')])));
2495 assert_eq!(p(r"[c-da-b&&a-d]"), Expr::Class(class(&[('a', 'd')])));
2496 assert_eq!(p(r"[a-d&&c-da-b]"), Expr::Class(class(&[('a', 'd')])));
2497
2498 assert_eq!(pb(r"(?-u)[abc&&b-c]"),
2499 Expr::ClassBytes(bclass(&[(b'b', b'c')])));
2500 assert_eq!(pb(r"(?-u)[abc&&[b-c]]"),
2501 Expr::ClassBytes(bclass(&[(b'b', b'c')])));
2502 assert_eq!(pb(r"(?-u)[[abc]&&[b-c]]"),
2503 Expr::ClassBytes(bclass(&[(b'b', b'c')])));
2504 assert_eq!(pb(r"(?-u)[a-z&&b-y&&c-x]"),
2505 Expr::ClassBytes(bclass(&[(b'c', b'x')])));
2506 assert_eq!(pb(r"(?-u)[c-da-b&&a-d]"),
2507 Expr::ClassBytes(bclass(&[(b'a', b'd')])));
2508 }
2509
2510 #[test]
class_intersection_ranges_casei()2511 fn class_intersection_ranges_casei() {
2512 assert_eq!(p(r"(?i)[abc&&b-c]"),
2513 Expr::Class(class(&[('b', 'c')]).case_fold()));
2514 assert_eq!(p(r"(?i)[abc&&[b-c]]"),
2515 Expr::Class(class(&[('b', 'c')]).case_fold()));
2516 assert_eq!(p(r"(?i)[[abc]&&[b-c]]"),
2517 Expr::Class(class(&[('b', 'c')]).case_fold()));
2518 assert_eq!(p(r"(?i)[a-z&&b-y&&c-x]"),
2519 Expr::Class(class(&[('c', 'x')]).case_fold()));
2520 assert_eq!(p(r"(?i)[c-da-b&&a-d]"),
2521 Expr::Class(class(&[('a', 'd')]).case_fold()));
2522
2523 assert_eq!(pb(r"(?i-u)[abc&&b-c]"),
2524 Expr::ClassBytes(bclass(&[(b'b', b'c')]).case_fold()));
2525 assert_eq!(pb(r"(?i-u)[abc&&[b-c]]"),
2526 Expr::ClassBytes(bclass(&[(b'b', b'c')]).case_fold()));
2527 assert_eq!(pb(r"(?i-u)[[abc]&&[b-c]]"),
2528 Expr::ClassBytes(bclass(&[(b'b', b'c')]).case_fold()));
2529 assert_eq!(pb(r"(?i-u)[a-z&&b-y&&c-x]"),
2530 Expr::ClassBytes(bclass(&[(b'c', b'x')]).case_fold()));
2531 assert_eq!(pb(r"(?i-u)[c-da-b&&a-d]"),
2532 Expr::ClassBytes(bclass(&[(b'a', b'd')]).case_fold()));
2533 }
2534
2535 #[test]
class_intersection_classes()2536 fn class_intersection_classes() {
2537 assert_eq!(p(r"[\w&&\d]"), Expr::Class(class(PERLD)));
2538 assert_eq!(p(r"[\w&&[[:ascii:]]]"), Expr::Class(asciiw()));
2539 assert_eq!(p(r"[\x00-\xFF&&\pZ]"),
2540 Expr::Class(class(&[('\u{20}', '\u{20}'), ('\u{a0}', '\u{a0}')])));
2541
2542 assert_eq!(pb(r"(?-u)[\w&&\d]"), Expr::ClassBytes(asciid_bytes()));
2543 assert_eq!(pb(r"(?-u)[\w&&[[:ascii:]]]"), Expr::ClassBytes(asciiw_bytes()));
2544 }
2545
2546 #[test]
class_intersection_classes_casei()2547 fn class_intersection_classes_casei() {
2548 assert_eq!(p(r"(?i)[\w&&\d]"), Expr::Class(class(PERLD).case_fold()));
2549 assert_eq!(p(r"(?i)[\w&&[[:ascii:]]]"), Expr::Class(asciiw().case_fold()));
2550 assert_eq!(p(r"(?i)[\x00-\xFF&&\pZ]"),
2551 Expr::Class(class(&[('\u{20}', '\u{20}'), ('\u{a0}', '\u{a0}')])));
2552
2553 assert_eq!(pb(r"(?i-u)[\w&&\d]"), Expr::ClassBytes(asciid_bytes().case_fold()));
2554 assert_eq!(pb(r"(?i-u)[\w&&[[:ascii:]]]"), Expr::ClassBytes(asciiw_bytes().case_fold()));
2555 }
2556
2557 #[test]
class_intersection_negate()2558 fn class_intersection_negate() {
2559 assert_eq!(p(r"[^\w&&\d]"), Expr::Class(class(PERLD).negate()));
2560 assert_eq!(p(r"[^[\w&&\d]]"), Expr::Class(class(PERLD).negate()));
2561 assert_eq!(p(r"[^[^\w&&\d]]"), Expr::Class(class(PERLD)));
2562 assert_eq!(p(r"[\w&&[^\d]]"),
2563 Expr::Class(class(PERLW).intersection(&class(PERLD).negate())));
2564 assert_eq!(p(r"[[^\w]&&[^\d]]"),
2565 Expr::Class(class(PERLW).negate()));
2566
2567 assert_eq!(pb(r"(?-u)[^\w&&\d]"),
2568 Expr::ClassBytes(asciid_bytes().negate()));
2569 assert_eq!(pb(r"(?-u)[^[\w&&\d]]"),
2570 Expr::ClassBytes(asciid_bytes().negate()));
2571 assert_eq!(pb(r"(?-u)[^[^\w&&\d]]"),
2572 Expr::ClassBytes(asciid_bytes()));
2573 assert_eq!(pb(r"(?-u)[\w&&[^\d]]"),
2574 Expr::ClassBytes(asciiw().intersection(&asciid().negate()).to_byte_class()));
2575 assert_eq!(pb(r"(?-u)[[^\w]&&[^\d]]"),
2576 Expr::ClassBytes(asciiw_bytes().negate()));
2577 }
2578
2579 #[test]
class_intersection_negate_casei()2580 fn class_intersection_negate_casei() {
2581 assert_eq!(p(r"(?i)[^\w&&a-z]"),
2582 Expr::Class(class(&[('a', 'z')]).case_fold().negate()));
2583 assert_eq!(p(r"(?i)[^[\w&&a-z]]"),
2584 Expr::Class(class(&[('a', 'z')]).case_fold().negate()));
2585 assert_eq!(p(r"(?i)[^[^\w&&a-z]]"),
2586 Expr::Class(class(&[('a', 'z')]).case_fold()));
2587 assert_eq!(p(r"(?i)[\w&&[^a-z]]"),
2588 Expr::Class(
2589 class(PERLW).intersection(&class(&[('a', 'z')])
2590 .case_fold().negate())));
2591 assert_eq!(p(r"(?i)[[^\w]&&[^a-z]]"),
2592 Expr::Class(class(PERLW).negate()));
2593
2594 assert_eq!(pb(r"(?i-u)[^\w&&a-z]"),
2595 Expr::ClassBytes(bclass(&[(b'a', b'z')]).case_fold().negate()));
2596 assert_eq!(pb(r"(?i-u)[^[\w&&a-z]]"),
2597 Expr::ClassBytes(bclass(&[(b'a', b'z')]).case_fold().negate()));
2598 assert_eq!(pb(r"(?i-u)[^[^\w&&a-z]]"),
2599 Expr::ClassBytes(bclass(&[(b'a', b'z')]).case_fold()));
2600 assert_eq!(pb(r"(?i-u)[\w&&[^a-z]]"),
2601 Expr::ClassBytes(bclass(&[(b'0', b'9'), (b'_', b'_')])));
2602 assert_eq!(pb(r"(?i-u)[[^\w]&&[^a-z]]"),
2603 Expr::ClassBytes(asciiw_bytes().negate()));
2604 }
2605
2606 #[test]
class_intersection_caret()2607 fn class_intersection_caret() {
2608 // In `[a^]`, `^` does not need to be escaped, so it makes sense that
2609 // `^` is also allowed to be unescaped after `&&`.
2610 assert_eq!(p(r"[\^&&^]"), Expr::Class(class(&[('^', '^')])));
2611 }
2612
2613 #[test]
class_intersection_brackets_hyphen()2614 fn class_intersection_brackets_hyphen() {
2615 // `]` needs to be escaped after `&&` because it is not at the start of the class.
2616 assert_eq!(p(r"[]&&\]]"), Expr::Class(class(&[(']', ']')])));
2617
2618 assert_eq!(p(r"[-&&-]"), Expr::Class(class(&[('-', '-')])));
2619 }
2620
2621 #[test]
class_intersection_ampersand()2622 fn class_intersection_ampersand() {
2623 // Unescaped `&` after `&&`
2624 assert_eq!(p(r"[\&&&&]"), Expr::Class(class(&[('&', '&')])));
2625 assert_eq!(p(r"[\&&&\&]"), Expr::Class(class(&[('&', '&')])));
2626 }
2627
2628 #[test]
class_intersection_precedence()2629 fn class_intersection_precedence() {
2630 assert_eq!(p(r"[a-w&&[^c-g]z]"), Expr::Class(class(&[('a', 'b'), ('h', 'w')])));
2631 }
2632
2633 #[test]
class_special_escaped_set_chars()2634 fn class_special_escaped_set_chars() {
2635 // These tests ensure that some special characters require escaping
2636 // for use in character classes. The intention is to use these
2637 // characters to implement sets as described in UTS#18 RL1.3. Once
2638 // that's done, these tests should be removed and replaced with others.
2639 assert_eq!(p(r"[\[]"), Expr::Class(class(&[('[', '[')])));
2640 assert_eq!(p(r"[&]"), Expr::Class(class(&[('&', '&')])));
2641 assert_eq!(p(r"[\&]"), Expr::Class(class(&[('&', '&')])));
2642 assert_eq!(p(r"[\&\&]"), Expr::Class(class(&[('&', '&')])));
2643 assert_eq!(p(r"[\x00-&]"), Expr::Class(class(&[('\u{0}', '&')])));
2644 assert_eq!(p(r"[&-\xFF]"), Expr::Class(class(&[('&', '\u{FF}')])));
2645
2646 assert_eq!(p(r"[~]"), Expr::Class(class(&[('~', '~')])));
2647 assert_eq!(p(r"[\~]"), Expr::Class(class(&[('~', '~')])));
2648 assert_eq!(p(r"[\~\~]"), Expr::Class(class(&[('~', '~')])));
2649 assert_eq!(p(r"[\x00-~]"), Expr::Class(class(&[('\u{0}', '~')])));
2650 assert_eq!(p(r"[~-\xFF]"), Expr::Class(class(&[('~', '\u{FF}')])));
2651
2652 assert_eq!(p(r"[+-\-]"), Expr::Class(class(&[('+', '-')])));
2653 assert_eq!(p(r"[a-a\--\xFF]"), Expr::Class(class(&[
2654 ('-', '\u{FF}'),
2655 ])));
2656 }
2657
2658 #[test]
class_overlapping()2659 fn class_overlapping() {
2660 assert_eq!(p("[a-fd-h]"), Expr::Class(class(&[('a', 'h')])));
2661 assert_eq!(p("[a-fg-m]"), Expr::Class(class(&[('a', 'm')])));
2662
2663 assert_eq!(pb("(?-u)[a-fd-h]"),
2664 Expr::ClassBytes(bclass(&[(b'a', b'h')])));
2665 assert_eq!(pb("(?-u)[a-fg-m]"),
2666 Expr::ClassBytes(bclass(&[(b'a', b'm')])));
2667 }
2668
2669 #[test]
ascii_classes()2670 fn ascii_classes() {
2671 assert_eq!(p("[:blank:]"), Expr::Class(class(&[
2672 (':', ':'), ('a', 'b'), ('k', 'l'), ('n', 'n'),
2673 ])));
2674 assert_eq!(p("[[:upper:]]"), Expr::Class(class(UPPER)));
2675
2676 assert_eq!(pb("(?-u)[[:upper:]]"),
2677 Expr::ClassBytes(class(UPPER).to_byte_class()));
2678 }
2679
2680 #[test]
ascii_classes_not()2681 fn ascii_classes_not() {
2682 assert_eq!(p("[:abc:]"),
2683 Expr::Class(class(&[(':', ':'), ('a', 'c')])));
2684 assert_eq!(pb("(?-u)[:abc:]"),
2685 Expr::ClassBytes(bclass(&[(b':', b':'), (b'a', b'c')])));
2686 }
2687
2688 #[test]
ascii_classes_multiple()2689 fn ascii_classes_multiple() {
2690 assert_eq!(p("[[:lower:][:upper:]]"),
2691 Expr::Class(classes(&[UPPER, LOWER])));
2692
2693 assert_eq!(pb("(?-u)[[:lower:][:upper:]]"),
2694 Expr::ClassBytes(classes(&[UPPER, LOWER]).to_byte_class()));
2695 }
2696
2697 #[test]
ascii_classes_negate()2698 fn ascii_classes_negate() {
2699 assert_eq!(p("[[:^upper:]]"), Expr::Class(class(UPPER).negate()));
2700 assert_eq!(p("[^[:^upper:]]"), Expr::Class(class(UPPER)));
2701
2702 assert_eq!(pb("(?-u)[[:^upper:]]"),
2703 Expr::ClassBytes(class(UPPER).to_byte_class().negate()));
2704 assert_eq!(pb("(?-u)[^[:^upper:]]"),
2705 Expr::ClassBytes(class(UPPER).to_byte_class()));
2706 }
2707
2708 #[test]
ascii_classes_negate_multiple()2709 fn ascii_classes_negate_multiple() {
2710 let (nlower, nword) = (class(LOWER).negate(), class(WORD).negate());
2711 let cls = CharClass::empty().merge(nlower).merge(nword);
2712 assert_eq!(p("[[:^lower:][:^word:]]"), Expr::Class(cls.clone()));
2713 assert_eq!(p("[^[:^lower:][:^word:]]"), Expr::Class(cls.negate()));
2714 }
2715
2716 #[test]
ascii_classes_case_fold()2717 fn ascii_classes_case_fold() {
2718 assert_eq!(p("(?i)[[:upper:]]"),
2719 Expr::Class(class(UPPER).case_fold()));
2720
2721 assert_eq!(pb("(?i-u)[[:upper:]]"),
2722 Expr::ClassBytes(class(UPPER).to_byte_class().case_fold()));
2723 }
2724
2725 #[test]
ascii_classes_negate_case_fold()2726 fn ascii_classes_negate_case_fold() {
2727 assert_eq!(p("(?i)[[:^upper:]]"),
2728 Expr::Class(class(UPPER).case_fold().negate()));
2729 assert_eq!(p("(?i)[^[:^upper:]]"),
2730 Expr::Class(class(UPPER).case_fold()));
2731
2732 assert_eq!(pb("(?i-u)[[:^upper:]]"),
2733 Expr::ClassBytes(
2734 class(UPPER).to_byte_class().case_fold().negate()));
2735 assert_eq!(pb("(?i-u)[^[:^upper:]]"),
2736 Expr::ClassBytes(class(UPPER).to_byte_class().case_fold()));
2737 }
2738
2739 #[test]
single_class_negate_case_fold()2740 fn single_class_negate_case_fold() {
2741 assert_eq!(p("(?i)[^x]"),
2742 Expr::Class(class(&[('x', 'x')]).case_fold().negate()));
2743
2744 assert_eq!(pb("(?i-u)[^x]"),
2745 Expr::ClassBytes(
2746 class(&[('x', 'x')])
2747 .to_byte_class().case_fold().negate()));
2748 }
2749
2750 #[test]
ignore_space_empty()2751 fn ignore_space_empty() {
2752 assert_eq!(p("(?x) "), Expr::Empty);
2753 }
2754
2755 #[test]
ignore_space_literal()2756 fn ignore_space_literal() {
2757 assert_eq!(p("(?x) a b c"), Expr::Concat(vec![
2758 lit('a'), lit('b'), lit('c'),
2759 ]));
2760 }
2761
2762 #[test]
ignore_space_literal_off()2763 fn ignore_space_literal_off() {
2764 assert_eq!(p("(?x) a b c(?-x) a"), Expr::Concat(vec![
2765 lit('a'), lit('b'), lit('c'), lit(' '), lit('a'),
2766 ]));
2767 }
2768
2769 #[test]
ignore_space_class()2770 fn ignore_space_class() {
2771 assert_eq!(p("(?x)[a
2772 - z
2773 ]"), Expr::Class(class(&[('a', 'z')])));
2774 assert_eq!(p("(?x)[ ^ a
2775 - z
2776 ]"), Expr::Class(class(&[('a', 'z')]).negate()));
2777 }
2778
2779 #[test]
ignore_space_escape_octal()2780 fn ignore_space_escape_octal() {
2781 assert_eq!(p(r"(?x)\12 3"), Expr::Concat(vec![
2782 lit('\n'),
2783 lit('3'),
2784 ]));
2785 }
2786
2787 #[test]
ignore_space_escape_hex()2788 fn ignore_space_escape_hex() {
2789 assert_eq!(p(r"(?x)\x { 53 }"), lit('S'));
2790 assert_eq!(p(r"(?x)\x # comment
2791 { # comment
2792 53 # comment
2793 } # comment"), lit('S'));
2794 }
2795
2796 #[test]
ignore_space_escape_hex2()2797 fn ignore_space_escape_hex2() {
2798 assert_eq!(p(r"(?x)\x 53"), lit('S'));
2799 assert_eq!(p(r"(?x)\x # comment
2800 53 # comment"), lit('S'));
2801 }
2802
2803 #[test]
ignore_space_escape_unicode_name()2804 fn ignore_space_escape_unicode_name() {
2805 assert_eq!(p(r"(?x)\p # comment
2806 { # comment
2807 Yi # comment
2808 } # comment"), Expr::Class(class(YI)));
2809 }
2810
2811 #[test]
ignore_space_repeat_counted()2812 fn ignore_space_repeat_counted() {
2813 assert_eq!(p("(?x)a # comment
2814 { # comment
2815 5 # comment
2816 , # comment
2817 10 # comment
2818 }"), Expr::Repeat {
2819 e: b(lit('a')),
2820 r: Repeater::Range { min: 5, max: Some(10) },
2821 greedy: true,
2822 });
2823 }
2824
2825 #[test]
ignore_space_comments()2826 fn ignore_space_comments() {
2827 assert_eq!(p(r"(?x)(?P<foo>
2828 a # comment 1
2829 )(?P<bar>
2830 z # comment 2
2831 )"), Expr::Concat(vec![
2832 Expr::Group {
2833 e: Box::new(lit('a')),
2834 i: Some(1),
2835 name: Some("foo".into()),
2836 },
2837 Expr::Group {
2838 e: Box::new(lit('z')),
2839 i: Some(2),
2840 name: Some("bar".into()),
2841 },
2842 ]));
2843 }
2844
2845 #[test]
ignore_space_comments_re_enable()2846 fn ignore_space_comments_re_enable() {
2847 assert_eq!(p(r"(?x)a # hi
2848 (?-x:#) # sweet"), Expr::Concat(vec![
2849 lit('a'),
2850 Expr::Group {
2851 e: Box::new(lit('#')),
2852 i: None,
2853 name: None,
2854 },
2855 ]));
2856 }
2857
2858 #[test]
ignore_space_escape_punctuation()2859 fn ignore_space_escape_punctuation() {
2860 assert_eq!(p(r"(?x)\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#"), c(&[
2861 lit('\\'), lit('.'), lit('+'), lit('*'), lit('?'),
2862 lit('('), lit(')'), lit('|'), lit('['), lit(']'),
2863 lit('{'), lit('}'), lit('^'), lit('$'), lit('#'),
2864 ]));
2865 }
2866
2867 #[test]
ignore_space_escape_hash()2868 fn ignore_space_escape_hash() {
2869 assert_eq!(p(r"(?x)a\# # hi there"), Expr::Concat(vec![
2870 lit('a'),
2871 lit('#'),
2872 ]));
2873 }
2874
2875 #[test]
ignore_space_escape_space()2876 fn ignore_space_escape_space() {
2877 assert_eq!(p(r"(?x)a\ # hi there"), Expr::Concat(vec![
2878 lit('a'),
2879 lit(' '),
2880 ]));
2881 }
2882
2883 // Test every single possible error case.
2884
2885 macro_rules! test_err {
2886 ($re:expr, $pos:expr, $kind:expr) => {
2887 test_err!($re, $pos, $kind, Flags::default());
2888 };
2889 ($re:expr, $pos:expr, $kind:expr, $flags:expr) => {{
2890 let err = Parser::parse($re, $flags).unwrap_err();
2891 assert_eq!($pos, err.pos);
2892 assert_eq!($kind, err.kind);
2893 assert!($re.contains(&err.surround));
2894 }}
2895 }
2896
2897 #[test]
invalid_utf8_not_allowed()2898 fn invalid_utf8_not_allowed() {
2899 // let flags = Flags { unicode: false, .. Flags::default() };
2900 test_err!(r"(?-u)\xFF", 9, ErrorKind::InvalidUtf8);
2901 test_err!(r"(?-u).", 5, ErrorKind::InvalidUtf8);
2902 test_err!(r"(?-u)(?s).", 9, ErrorKind::InvalidUtf8);
2903 test_err!(r"(?-u)[\x00-\x80]", 15, ErrorKind::InvalidUtf8);
2904 test_err!(r"(?-u)\222", 9, ErrorKind::InvalidUtf8);
2905 test_err!(r"(?-u)\x{0080}", 13, ErrorKind::InvalidUtf8);
2906 }
2907
2908 #[test]
unicode_char_not_allowed()2909 fn unicode_char_not_allowed() {
2910 let flags = Flags { allow_bytes: true, .. Flags::default() };
2911 test_err!("☃(?-u:☃)", 7, ErrorKind::UnicodeNotAllowed, flags);
2912 }
2913
2914 #[test]
unicode_class_not_allowed()2915 fn unicode_class_not_allowed() {
2916 let flags = Flags { allow_bytes: true, .. Flags::default() };
2917 test_err!(r"☃(?-u:\pL)", 9, ErrorKind::UnicodeNotAllowed, flags);
2918 }
2919
2920 #[test]
unicode_class_literal_not_allowed()2921 fn unicode_class_literal_not_allowed() {
2922 let flags = Flags { allow_bytes: true, .. Flags::default() };
2923 test_err!(r"(?-u)[☃]", 6, ErrorKind::UnicodeNotAllowed, flags);
2924 test_err!(r"(?-u)[☃-☃]", 6, ErrorKind::UnicodeNotAllowed, flags);
2925 }
2926
2927 #[test]
unicode_hex_not_allowed()2928 fn unicode_hex_not_allowed() {
2929 let flags = Flags { allow_bytes: true, .. Flags::default() };
2930 test_err!(r"(?-u)\x{FFFF}", 13, ErrorKind::UnicodeNotAllowed, flags);
2931 test_err!(r"(?-u)\x{100}", 12, ErrorKind::UnicodeNotAllowed, flags);
2932 }
2933
2934 #[test]
unicode_octal_not_allowed()2935 fn unicode_octal_not_allowed() {
2936 let flags = Flags { allow_bytes: true, .. Flags::default() };
2937 test_err!(r"(?-u)\400", 9, ErrorKind::UnicodeNotAllowed, flags);
2938 }
2939
2940 #[test]
error_repeat_no_expr_simple()2941 fn error_repeat_no_expr_simple() {
2942 test_err!("(*", 1, ErrorKind::RepeaterExpectsExpr);
2943 }
2944
2945 #[test]
error_repeat_no_expr_counted()2946 fn error_repeat_no_expr_counted() {
2947 test_err!("({5}", 1, ErrorKind::RepeaterExpectsExpr);
2948 }
2949
2950 #[test]
error_repeat_beginning_counted()2951 fn error_repeat_beginning_counted() {
2952 test_err!("{5}", 0, ErrorKind::RepeaterExpectsExpr);
2953 }
2954
2955 #[test]
error_repeat_illegal_exprs_simple()2956 fn error_repeat_illegal_exprs_simple() {
2957 test_err!("a**", 2, ErrorKind::RepeaterUnexpectedExpr(Expr::Repeat {
2958 e: b(lit('a')),
2959 r: Repeater::ZeroOrMore,
2960 greedy: true,
2961 }));
2962 test_err!("a|*", 2,
2963 ErrorKind::RepeaterUnexpectedExpr(Expr::Alternate(vec![lit('a')]))
2964 );
2965 }
2966
2967 #[test]
error_repeat_illegal_exprs_counted()2968 fn error_repeat_illegal_exprs_counted() {
2969 test_err!("a*{5}", 2, ErrorKind::RepeaterUnexpectedExpr(Expr::Repeat {
2970 e: b(lit('a')),
2971 r: Repeater::ZeroOrMore,
2972 greedy: true,
2973 }));
2974 test_err!("a|{5}", 2,
2975 ErrorKind::RepeaterUnexpectedExpr(Expr::Alternate(vec![lit('a')]))
2976 );
2977 }
2978
2979 #[test]
error_repeat_empty_number()2980 fn error_repeat_empty_number() {
2981 test_err!("a{}", 2, ErrorKind::MissingBase10);
2982 }
2983
2984 #[test]
error_repeat_eof()2985 fn error_repeat_eof() {
2986 test_err!("a{5", 3, ErrorKind::UnclosedRepeat);
2987 }
2988
2989 #[test]
error_repeat_empty_number_eof()2990 fn error_repeat_empty_number_eof() {
2991 test_err!("a{xyz", 5, ErrorKind::InvalidBase10("xyz".into()));
2992 test_err!("a{12,xyz", 8, ErrorKind::InvalidBase10("xyz".into()));
2993 }
2994
2995 #[test]
error_repeat_invalid_number()2996 fn error_repeat_invalid_number() {
2997 test_err!("a{9999999999}", 12,
2998 ErrorKind::InvalidBase10("9999999999".into()));
2999 test_err!("a{1,9999999999}", 14,
3000 ErrorKind::InvalidBase10("9999999999".into()));
3001 }
3002
3003 #[test]
error_repeat_invalid_number_extra()3004 fn error_repeat_invalid_number_extra() {
3005 test_err!("a{12x}", 5, ErrorKind::InvalidBase10("12x".into()));
3006 test_err!("a{1,12x}", 7, ErrorKind::InvalidBase10("12x".into()));
3007 }
3008
3009 #[test]
error_repeat_invalid_range()3010 fn error_repeat_invalid_range() {
3011 test_err!("a{2,1}", 5,
3012 ErrorKind::InvalidRepeatRange { min: 2, max: 1 });
3013 }
3014
3015 #[test]
error_alternate_empty()3016 fn error_alternate_empty() {
3017 test_err!("|a", 0, ErrorKind::EmptyAlternate);
3018 }
3019
3020 #[test]
error_alternate_empty_with_group()3021 fn error_alternate_empty_with_group() {
3022 test_err!("(|a)", 1, ErrorKind::EmptyAlternate);
3023 }
3024
3025 #[test]
error_alternate_empty_with_alternate()3026 fn error_alternate_empty_with_alternate() {
3027 test_err!("a||", 2, ErrorKind::EmptyAlternate);
3028 }
3029
3030 #[test]
error_close_paren_unopened_empty()3031 fn error_close_paren_unopened_empty() {
3032 test_err!(")", 0, ErrorKind::UnopenedParen);
3033 }
3034
3035 #[test]
error_close_paren_unopened()3036 fn error_close_paren_unopened() {
3037 test_err!("ab)", 2, ErrorKind::UnopenedParen);
3038 }
3039
3040 #[test]
error_close_paren_unopened_with_alt()3041 fn error_close_paren_unopened_with_alt() {
3042 test_err!("a|b)", 3, ErrorKind::UnopenedParen);
3043 }
3044
3045 #[test]
error_close_paren_unclosed_with_alt()3046 fn error_close_paren_unclosed_with_alt() {
3047 test_err!("(a|b", 0, ErrorKind::UnclosedParen);
3048 }
3049
3050 #[test]
error_close_paren_empty_alt()3051 fn error_close_paren_empty_alt() {
3052 test_err!("(a|)", 3, ErrorKind::EmptyAlternate);
3053 }
3054
3055 #[test]
error_close_paren_empty_group()3056 fn error_close_paren_empty_group() {
3057 test_err!("()", 1, ErrorKind::EmptyGroup);
3058 }
3059
3060 #[test]
error_close_paren_empty_group_with_name()3061 fn error_close_paren_empty_group_with_name() {
3062 test_err!("(?P<foo>)", 8, ErrorKind::EmptyGroup);
3063 }
3064
3065 #[test]
error_finish_concat_unclosed()3066 fn error_finish_concat_unclosed() {
3067 test_err!("ab(xy", 2, ErrorKind::UnclosedParen);
3068 }
3069
3070 #[test]
error_finish_concat_empty_alt()3071 fn error_finish_concat_empty_alt() {
3072 test_err!("a|", 2, ErrorKind::EmptyAlternate);
3073 }
3074
3075 #[test]
error_group_name_invalid()3076 fn error_group_name_invalid() {
3077 test_err!("(?P<a#>x)", 6, ErrorKind::InvalidCaptureName("a#".into()));
3078 }
3079
3080 #[test]
error_group_name_invalid_leading()3081 fn error_group_name_invalid_leading() {
3082 test_err!("(?P<1a>a)", 6, ErrorKind::InvalidCaptureName("1a".into()));
3083 }
3084
3085 #[test]
error_group_name_unexpected_eof()3086 fn error_group_name_unexpected_eof() {
3087 test_err!("(?P<a", 5, ErrorKind::UnclosedCaptureName("a".into()));
3088 }
3089
3090 #[test]
error_group_name_empty()3091 fn error_group_name_empty() {
3092 test_err!("(?P<>a)", 4, ErrorKind::EmptyCaptureName);
3093 }
3094
3095 #[test]
error_group_opts_unrecognized_flag()3096 fn error_group_opts_unrecognized_flag() {
3097 test_err!("(?z:a)", 2, ErrorKind::UnrecognizedFlag('z'));
3098 }
3099
3100 #[test]
error_group_opts_unexpected_eof()3101 fn error_group_opts_unexpected_eof() {
3102 test_err!("(?i", 3, ErrorKind::UnexpectedFlagEof);
3103 }
3104
3105 #[test]
error_group_opts_double_negation()3106 fn error_group_opts_double_negation() {
3107 test_err!("(?-i-s:a)", 4, ErrorKind::DoubleFlagNegation);
3108 }
3109
3110 #[test]
error_group_opts_empty_negation()3111 fn error_group_opts_empty_negation() {
3112 test_err!("(?i-:a)", 4, ErrorKind::EmptyFlagNegation);
3113 }
3114
3115 #[test]
error_group_opts_empty()3116 fn error_group_opts_empty() {
3117 test_err!("(?)", 2, ErrorKind::EmptyFlagNegation);
3118 }
3119
3120 #[test]
error_escape_unexpected_eof()3121 fn error_escape_unexpected_eof() {
3122 test_err!(r"\", 1, ErrorKind::UnexpectedEscapeEof);
3123 }
3124
3125 #[test]
error_escape_unrecognized()3126 fn error_escape_unrecognized() {
3127 test_err!(r"\m", 1, ErrorKind::UnrecognizedEscape('m'));
3128 }
3129
3130 #[test]
error_escape_hex2_eof0()3131 fn error_escape_hex2_eof0() {
3132 test_err!(r"\x", 2, ErrorKind::UnexpectedTwoDigitHexEof);
3133 }
3134
3135 #[test]
error_escape_hex2_eof1()3136 fn error_escape_hex2_eof1() {
3137 test_err!(r"\xA", 3, ErrorKind::UnexpectedTwoDigitHexEof);
3138 }
3139
3140 #[test]
error_escape_hex2_invalid()3141 fn error_escape_hex2_invalid() {
3142 test_err!(r"\xAG", 4, ErrorKind::InvalidBase16("AG".into()));
3143 }
3144
3145 #[test]
error_escape_hex_eof0()3146 fn error_escape_hex_eof0() {
3147 test_err!(r"\x{", 3, ErrorKind::InvalidBase16("".into()));
3148 }
3149
3150 #[test]
error_escape_hex_eof1()3151 fn error_escape_hex_eof1() {
3152 test_err!(r"\x{A", 4, ErrorKind::UnclosedHex);
3153 }
3154
3155 #[test]
error_escape_hex_invalid()3156 fn error_escape_hex_invalid() {
3157 test_err!(r"\x{AG}", 5, ErrorKind::InvalidBase16("AG".into()));
3158 }
3159
3160 #[test]
error_escape_hex_invalid_scalar_value_surrogate()3161 fn error_escape_hex_invalid_scalar_value_surrogate() {
3162 test_err!(r"\x{D800}", 8, ErrorKind::InvalidScalarValue(0xD800));
3163 }
3164
3165 #[test]
error_escape_hex_invalid_scalar_value_high()3166 fn error_escape_hex_invalid_scalar_value_high() {
3167 test_err!(r"\x{110000}", 10, ErrorKind::InvalidScalarValue(0x110000));
3168 }
3169
3170 #[test]
error_escape_hex_invalid_u32()3171 fn error_escape_hex_invalid_u32() {
3172 test_err!(r"\x{9999999999}", 13,
3173 ErrorKind::InvalidBase16("9999999999".into()));
3174 }
3175
3176 #[test]
error_unicode_unclosed()3177 fn error_unicode_unclosed() {
3178 test_err!(r"\p{", 3, ErrorKind::UnclosedUnicodeName);
3179 test_err!(r"\p{Greek", 8, ErrorKind::UnclosedUnicodeName);
3180 }
3181
3182 #[test]
error_unicode_no_letter()3183 fn error_unicode_no_letter() {
3184 test_err!(r"\p", 2, ErrorKind::UnexpectedEscapeEof);
3185 }
3186
3187 #[test]
error_unicode_unknown_letter()3188 fn error_unicode_unknown_letter() {
3189 test_err!(r"\pA", 3, ErrorKind::UnrecognizedUnicodeClass("A".into()));
3190 }
3191
3192 #[test]
error_unicode_unknown_name()3193 fn error_unicode_unknown_name() {
3194 test_err!(r"\p{Yii}", 7,
3195 ErrorKind::UnrecognizedUnicodeClass("Yii".into()));
3196 }
3197
3198 #[test]
error_class_eof_empty()3199 fn error_class_eof_empty() {
3200 test_err!("[", 1, ErrorKind::UnexpectedClassEof);
3201 test_err!("[^", 2, ErrorKind::UnexpectedClassEof);
3202 }
3203
3204 #[test]
error_class_eof_non_empty()3205 fn error_class_eof_non_empty() {
3206 test_err!("[a", 2, ErrorKind::UnexpectedClassEof);
3207 test_err!("[^a", 3, ErrorKind::UnexpectedClassEof);
3208 }
3209
3210 #[test]
error_class_eof_range()3211 fn error_class_eof_range() {
3212 test_err!("[a-", 3, ErrorKind::UnexpectedClassEof);
3213 test_err!("[^a-", 4, ErrorKind::UnexpectedClassEof);
3214 test_err!("[---", 4, ErrorKind::UnexpectedClassEof);
3215 }
3216
3217 #[test]
error_class_invalid_escape()3218 fn error_class_invalid_escape() {
3219 test_err!(r"[\pA]", 4,
3220 ErrorKind::UnrecognizedUnicodeClass("A".into()));
3221 }
3222
3223 #[test]
error_class_valid_escape_not_allowed()3224 fn error_class_valid_escape_not_allowed() {
3225 test_err!(r"[\A]", 3, ErrorKind::InvalidClassEscape(Expr::StartText));
3226 }
3227
3228 #[test]
error_class_range_valid_escape_not_allowed()3229 fn error_class_range_valid_escape_not_allowed() {
3230 test_err!(r"[a-\d]", 5,
3231 ErrorKind::InvalidClassEscape(Expr::Class(class(PERLD))));
3232 test_err!(r"[a-\A]", 5,
3233 ErrorKind::InvalidClassEscape(Expr::StartText));
3234 test_err!(r"[\A-a]", 3,
3235 ErrorKind::InvalidClassEscape(Expr::StartText));
3236 }
3237
3238 #[test]
error_class_invalid_range()3239 fn error_class_invalid_range() {
3240 test_err!("[z-a]", 4, ErrorKind::InvalidClassRange {
3241 start: 'z',
3242 end: 'a',
3243 });
3244 }
3245
3246 #[test]
error_class_empty_range()3247 fn error_class_empty_range() {
3248 test_err!("[]", 2, ErrorKind::UnexpectedClassEof);
3249 test_err!("[^]", 3, ErrorKind::UnexpectedClassEof);
3250 test_err!(r"[^\d\D]", 7, ErrorKind::EmptyClass);
3251
3252 let flags = Flags { allow_bytes: true, .. Flags::default() };
3253 test_err!(r"(?-u)[^\x00-\xFF]", 17, ErrorKind::EmptyClass, flags);
3254 }
3255
3256 #[test]
error_class_unsupported_char()3257 fn error_class_unsupported_char() {
3258 // These tests ensure that some unescaped special characters are
3259 // rejected in character classes. The intention is to use these
3260 // characters to implement sets as described in UTS#18 RL1.3. Once
3261 // that's done, these tests should be removed and replaced with others.
3262 test_err!("[~~]", 2, ErrorKind::UnsupportedClassChar('~'));
3263 test_err!("[+--]", 4, ErrorKind::UnsupportedClassChar('-'));
3264 test_err!(r"[a-a--\xFF]", 5, ErrorKind::UnsupportedClassChar('-'));
3265 test_err!(r"[a&&~~]", 5, ErrorKind::UnsupportedClassChar('~'));
3266 test_err!(r"[a&&--]", 5, ErrorKind::UnsupportedClassChar('-'));
3267 }
3268
3269 #[test]
error_class_nested_class()3270 fn error_class_nested_class() {
3271 test_err!(r"[[]]", 4, ErrorKind::UnexpectedClassEof);
3272 test_err!(r"[[][]]", 6, ErrorKind::UnexpectedClassEof);
3273 test_err!(r"[[^\d\D]]", 8, ErrorKind::EmptyClass);
3274 test_err!(r"[[]", 3, ErrorKind::UnexpectedClassEof);
3275 test_err!(r"[[^]", 4, ErrorKind::UnexpectedClassEof);
3276 }
3277
3278 #[test]
error_class_intersection()3279 fn error_class_intersection() {
3280 test_err!(r"[&&]", 4, ErrorKind::EmptyClass);
3281 test_err!(r"[a&&]", 5, ErrorKind::EmptyClass);
3282 test_err!(r"[&&&&]", 6, ErrorKind::EmptyClass);
3283 // `]` after `&&` is not the same as in (`[]]`), because it's also not
3284 // allowed unescaped in `[a]]`.
3285 test_err!(r"[]&&]]", 5, ErrorKind::EmptyClass);
3286
3287 let flags = Flags { allow_bytes: true, .. Flags::default() };
3288 test_err!(r"(?-u)[a&&\pZ]", 12, ErrorKind::UnicodeNotAllowed, flags);
3289 }
3290
3291 #[test]
error_duplicate_capture_name()3292 fn error_duplicate_capture_name() {
3293 test_err!("(?P<a>.)(?P<a>.)", 14,
3294 ErrorKind::DuplicateCaptureName("a".into()));
3295 }
3296
3297 #[test]
error_ignore_space_escape_hex()3298 fn error_ignore_space_escape_hex() {
3299 test_err!(r"(?x)\x{ 5 3 }", 10, ErrorKind::UnclosedHex);
3300 }
3301
3302 #[test]
error_ignore_space_escape_hex2()3303 fn error_ignore_space_escape_hex2() {
3304 test_err!(r"(?x)\x 5 3", 9, ErrorKind::InvalidBase16("5 ".into()));
3305 }
3306
3307 #[test]
error_ignore_space_escape_unicode_name()3308 fn error_ignore_space_escape_unicode_name() {
3309 test_err!(r"(?x)\p{Y i}", 9, ErrorKind::UnclosedUnicodeName);
3310 }
3311 }
3312