1 /*!
2 This module provides a regular expression parser.
3 */
4
5 use std::borrow::Borrow;
6 use std::cell::{Cell, RefCell};
7 use std::mem;
8 use std::result;
9
10 use ast::{self, Ast, Position, Span};
11 use either::Either;
12
13 use is_meta_character;
14
15 type Result<T> = result::Result<T, ast::Error>;
16
17 /// A primitive is an expression with no sub-expressions. This includes
18 /// literals, assertions and non-set character classes. This representation
19 /// is used as intermediate state in the parser.
20 ///
21 /// This does not include ASCII character classes, since they can only appear
22 /// within a set character class.
23 #[derive(Clone, Debug, Eq, PartialEq)]
24 enum Primitive {
25 Literal(ast::Literal),
26 Assertion(ast::Assertion),
27 Dot(Span),
28 Perl(ast::ClassPerl),
29 Unicode(ast::ClassUnicode),
30 }
31
32 impl Primitive {
33 /// Return the span of this primitive.
span(&self) -> &Span34 fn span(&self) -> &Span {
35 match *self {
36 Primitive::Literal(ref x) => &x.span,
37 Primitive::Assertion(ref x) => &x.span,
38 Primitive::Dot(ref span) => span,
39 Primitive::Perl(ref x) => &x.span,
40 Primitive::Unicode(ref x) => &x.span,
41 }
42 }
43
44 /// Convert this primitive into a proper AST.
into_ast(self) -> Ast45 fn into_ast(self) -> Ast {
46 match self {
47 Primitive::Literal(lit) => Ast::Literal(lit),
48 Primitive::Assertion(assert) => Ast::Assertion(assert),
49 Primitive::Dot(span) => Ast::Dot(span),
50 Primitive::Perl(cls) => Ast::Class(ast::Class::Perl(cls)),
51 Primitive::Unicode(cls) => Ast::Class(ast::Class::Unicode(cls)),
52 }
53 }
54
55 /// Convert this primitive into an item in a character class.
56 ///
57 /// If this primitive is not a legal item (i.e., an assertion or a dot),
58 /// then return an error.
into_class_set_item<P: Borrow<Parser>>( self, p: &ParserI<P>, ) -> Result<ast::ClassSetItem>59 fn into_class_set_item<P: Borrow<Parser>>(
60 self,
61 p: &ParserI<P>,
62 ) -> Result<ast::ClassSetItem> {
63 use self::Primitive::*;
64 use ast::ClassSetItem;
65
66 match self {
67 Literal(lit) => Ok(ClassSetItem::Literal(lit)),
68 Perl(cls) => Ok(ClassSetItem::Perl(cls)),
69 Unicode(cls) => Ok(ClassSetItem::Unicode(cls)),
70 x => Err(p.error(*x.span(), ast::ErrorKind::ClassEscapeInvalid)),
71 }
72 }
73
74 /// Convert this primitive into a literal in a character class. In
75 /// particular, literals are the only valid items that can appear in
76 /// ranges.
77 ///
78 /// If this primitive is not a legal item (i.e., a class, assertion or a
79 /// dot), then return an error.
into_class_literal<P: Borrow<Parser>>( self, p: &ParserI<P>, ) -> Result<ast::Literal>80 fn into_class_literal<P: Borrow<Parser>>(
81 self,
82 p: &ParserI<P>,
83 ) -> Result<ast::Literal> {
84 use self::Primitive::*;
85
86 match self {
87 Literal(lit) => Ok(lit),
88 x => Err(p.error(*x.span(), ast::ErrorKind::ClassRangeLiteral)),
89 }
90 }
91 }
92
93 /// Returns true if the given character is a hexadecimal digit.
is_hex(c: char) -> bool94 fn is_hex(c: char) -> bool {
95 ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F')
96 }
97
98 /// Returns true if the given character is a valid in a capture group name.
99 ///
100 /// If `first` is true, then `c` is treated as the first character in the
101 /// group name (which is not allowed to be a digit).
is_capture_char(c: char, first: bool) -> bool102 fn is_capture_char(c: char, first: bool) -> bool {
103 c == '_'
104 || (!first && c >= '0' && c <= '9')
105 || (c >= 'a' && c <= 'z')
106 || (c >= 'A' && c <= 'Z')
107 }
108
109 /// A builder for a regular expression parser.
110 ///
111 /// This builder permits modifying configuration options for the parser.
112 #[derive(Clone, Debug)]
113 pub struct ParserBuilder {
114 ignore_whitespace: bool,
115 nest_limit: u32,
116 octal: bool,
117 }
118
119 impl Default for ParserBuilder {
default() -> ParserBuilder120 fn default() -> ParserBuilder {
121 ParserBuilder::new()
122 }
123 }
124
125 impl ParserBuilder {
126 /// Create a new parser builder with a default configuration.
new() -> ParserBuilder127 pub fn new() -> ParserBuilder {
128 ParserBuilder {
129 ignore_whitespace: false,
130 nest_limit: 250,
131 octal: false,
132 }
133 }
134
135 /// Build a parser from this configuration with the given pattern.
build(&self) -> Parser136 pub fn build(&self) -> Parser {
137 Parser {
138 pos: Cell::new(Position { offset: 0, line: 1, column: 1 }),
139 capture_index: Cell::new(0),
140 nest_limit: self.nest_limit,
141 octal: self.octal,
142 initial_ignore_whitespace: self.ignore_whitespace,
143 ignore_whitespace: Cell::new(self.ignore_whitespace),
144 comments: RefCell::new(vec![]),
145 stack_group: RefCell::new(vec![]),
146 stack_class: RefCell::new(vec![]),
147 capture_names: RefCell::new(vec![]),
148 scratch: RefCell::new(String::new()),
149 }
150 }
151
152 /// Set the nesting limit for this parser.
153 ///
154 /// The nesting limit controls how deep the abstract syntax tree is allowed
155 /// to be. If the AST exceeds the given limit (e.g., with too many nested
156 /// groups), then an error is returned by the parser.
157 ///
158 /// The purpose of this limit is to act as a heuristic to prevent stack
159 /// overflow for consumers that do structural induction on an `Ast` using
160 /// explicit recursion. While this crate never does this (instead using
161 /// constant stack space and moving the call stack to the heap), other
162 /// crates may.
163 ///
164 /// This limit is not checked until the entire Ast is parsed. Therefore,
165 /// if callers want to put a limit on the amount of heap space used, then
166 /// they should impose a limit on the length, in bytes, of the concrete
167 /// pattern string. In particular, this is viable since this parser
168 /// implementation will limit itself to heap space proportional to the
169 /// lenth of the pattern string.
170 ///
171 /// Note that a nest limit of `0` will return a nest limit error for most
172 /// patterns but not all. For example, a nest limit of `0` permits `a` but
173 /// not `ab`, since `ab` requires a concatenation, which results in a nest
174 /// depth of `1`. In general, a nest limit is not something that manifests
175 /// in an obvious way in the concrete syntax, therefore, it should not be
176 /// used in a granular way.
nest_limit(&mut self, limit: u32) -> &mut ParserBuilder177 pub fn nest_limit(&mut self, limit: u32) -> &mut ParserBuilder {
178 self.nest_limit = limit;
179 self
180 }
181
182 /// Whether to support octal syntax or not.
183 ///
184 /// Octal syntax is a little-known way of uttering Unicode codepoints in
185 /// a regular expression. For example, `a`, `\x61`, `\u0061` and
186 /// `\141` are all equivalent regular expressions, where the last example
187 /// shows octal syntax.
188 ///
189 /// While supporting octal syntax isn't in and of itself a problem, it does
190 /// make good error messages harder. That is, in PCRE based regex engines,
191 /// syntax like `\0` invokes a backreference, which is explicitly
192 /// unsupported in Rust's regex engine. However, many users expect it to
193 /// be supported. Therefore, when octal support is disabled, the error
194 /// message will explicitly mention that backreferences aren't supported.
195 ///
196 /// Octal syntax is disabled by default.
octal(&mut self, yes: bool) -> &mut ParserBuilder197 pub fn octal(&mut self, yes: bool) -> &mut ParserBuilder {
198 self.octal = yes;
199 self
200 }
201
202 /// Enable verbose mode in the regular expression.
203 ///
204 /// When enabled, verbose mode permits insigificant whitespace in many
205 /// places in the regular expression, as well as comments. Comments are
206 /// started using `#` and continue until the end of the line.
207 ///
208 /// By default, this is disabled. It may be selectively enabled in the
209 /// regular expression by using the `x` flag regardless of this setting.
ignore_whitespace(&mut self, yes: bool) -> &mut ParserBuilder210 pub fn ignore_whitespace(&mut self, yes: bool) -> &mut ParserBuilder {
211 self.ignore_whitespace = yes;
212 self
213 }
214 }
215
216 /// A regular expression parser.
217 ///
218 /// This parses a string representation of a regular expression into an
219 /// abstract syntax tree. The size of the tree is proportional to the length
220 /// of the regular expression pattern.
221 ///
222 /// A `Parser` can be configured in more detail via a
223 /// [`ParserBuilder`](struct.ParserBuilder.html).
224 #[derive(Clone, Debug)]
225 pub struct Parser {
226 /// The current position of the parser.
227 pos: Cell<Position>,
228 /// The current capture index.
229 capture_index: Cell<u32>,
230 /// The maximum number of open parens/brackets allowed. If the parser
231 /// exceeds this number, then an error is returned.
232 nest_limit: u32,
233 /// Whether to support octal syntax or not. When `false`, the parser will
234 /// return an error helpfully pointing out that backreferences are not
235 /// supported.
236 octal: bool,
237 /// The initial setting for `ignore_whitespace` as provided by
238 /// Th`ParserBuilder`. is is used when reseting the parser's state.
239 initial_ignore_whitespace: bool,
240 /// Whether whitespace should be ignored. When enabled, comments are
241 /// also permitted.
242 ignore_whitespace: Cell<bool>,
243 /// A list of comments, in order of appearance.
244 comments: RefCell<Vec<ast::Comment>>,
245 /// A stack of grouped sub-expressions, including alternations.
246 stack_group: RefCell<Vec<GroupState>>,
247 /// A stack of nested character classes. This is only non-empty when
248 /// parsing a class.
249 stack_class: RefCell<Vec<ClassState>>,
250 /// A sorted sequence of capture names. This is used to detect duplicate
251 /// capture names and report an error if one is detected.
252 capture_names: RefCell<Vec<ast::CaptureName>>,
253 /// A scratch buffer used in various places. Mostly this is used to
254 /// accumulate relevant characters from parts of a pattern.
255 scratch: RefCell<String>,
256 }
257
258 /// ParserI is the internal parser implementation.
259 ///
260 /// We use this separate type so that we can carry the provided pattern string
261 /// along with us. In particular, a `Parser` internal state is not tied to any
262 /// one pattern, but `ParserI` is.
263 ///
264 /// This type also lets us use `ParserI<&Parser>` in production code while
265 /// retaining the convenience of `ParserI<Parser>` for tests, which sometimes
266 /// work against the internal interface of the parser.
267 #[derive(Clone, Debug)]
268 struct ParserI<'s, P> {
269 /// The parser state/configuration.
270 parser: P,
271 /// The full regular expression provided by the user.
272 pattern: &'s str,
273 }
274
275 /// GroupState represents a single stack frame while parsing nested groups
276 /// and alternations. Each frame records the state up to an opening parenthesis
277 /// or a alternating bracket `|`.
278 #[derive(Clone, Debug)]
279 enum GroupState {
280 /// This state is pushed whenever an opening group is found.
281 Group {
282 /// The concatenation immediately preceding the opening group.
283 concat: ast::Concat,
284 /// The group that has been opened. Its sub-AST is always empty.
285 group: ast::Group,
286 /// Whether this group has the `x` flag enabled or not.
287 ignore_whitespace: bool,
288 },
289 /// This state is pushed whenever a new alternation branch is found. If
290 /// an alternation branch is found and this state is at the top of the
291 /// stack, then this state should be modified to include the new
292 /// alternation.
293 Alternation(ast::Alternation),
294 }
295
296 /// ClassState represents a single stack frame while parsing character classes.
297 /// Each frame records the state up to an intersection, difference, symmetric
298 /// difference or nested class.
299 ///
300 /// Note that a parser's character class stack is only non-empty when parsing
301 /// a character class. In all other cases, it is empty.
302 #[derive(Clone, Debug)]
303 enum ClassState {
304 /// This state is pushed whenever an opening bracket is found.
305 Open {
306 /// The union of class items immediately preceding this class.
307 union: ast::ClassSetUnion,
308 /// The class that has been opened. Typically this just corresponds
309 /// to the `[`, but it can also include `[^` since `^` indicates
310 /// negation of the class.
311 set: ast::ClassBracketed,
312 },
313 /// This state is pushed when a operator is seen. When popped, the stored
314 /// set becomes the left hand side of the operator.
315 Op {
316 /// The type of the operation, i.e., &&, -- or ~~.
317 kind: ast::ClassSetBinaryOpKind,
318 /// The left-hand side of the operator.
319 lhs: ast::ClassSet,
320 },
321 }
322
323 impl Parser {
324 /// Create a new parser with a default configuration.
325 ///
326 /// The parser can be run with either the `parse` or `parse_with_comments`
327 /// methods. The parse methods return an abstract syntax tree.
328 ///
329 /// To set configuration options on the parser, use
330 /// [`ParserBuilder`](struct.ParserBuilder.html).
new() -> Parser331 pub fn new() -> Parser {
332 ParserBuilder::new().build()
333 }
334
335 /// Parse the regular expression into an abstract syntax tree.
parse(&mut self, pattern: &str) -> Result<Ast>336 pub fn parse(&mut self, pattern: &str) -> Result<Ast> {
337 ParserI::new(self, pattern).parse()
338 }
339
340 /// Parse the regular expression and return an abstract syntax tree with
341 /// all of the comments found in the pattern.
parse_with_comments( &mut self, pattern: &str, ) -> Result<ast::WithComments>342 pub fn parse_with_comments(
343 &mut self,
344 pattern: &str,
345 ) -> Result<ast::WithComments> {
346 ParserI::new(self, pattern).parse_with_comments()
347 }
348
349 /// Reset the internal state of a parser.
350 ///
351 /// This is called at the beginning of every parse. This prevents the
352 /// parser from running with inconsistent state (say, if a previous
353 /// invocation returned an error and the parser is reused).
reset(&self)354 fn reset(&self) {
355 // These settings should be in line with the construction
356 // in `ParserBuilder::build`.
357 self.pos.set(Position { offset: 0, line: 1, column: 1 });
358 self.ignore_whitespace.set(self.initial_ignore_whitespace);
359 self.comments.borrow_mut().clear();
360 self.stack_group.borrow_mut().clear();
361 self.stack_class.borrow_mut().clear();
362 }
363 }
364
365 impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
366 /// Build an internal parser from a parser configuration and a pattern.
new(parser: P, pattern: &'s str) -> ParserI<'s, P>367 fn new(parser: P, pattern: &'s str) -> ParserI<'s, P> {
368 ParserI { parser: parser, pattern: pattern }
369 }
370
371 /// Return a reference to the parser state.
parser(&self) -> &Parser372 fn parser(&self) -> &Parser {
373 self.parser.borrow()
374 }
375
376 /// Return a reference to the pattern being parsed.
pattern(&self) -> &str377 fn pattern(&self) -> &str {
378 self.pattern.borrow()
379 }
380
381 /// Create a new error with the given span and error type.
error(&self, span: Span, kind: ast::ErrorKind) -> ast::Error382 fn error(&self, span: Span, kind: ast::ErrorKind) -> ast::Error {
383 ast::Error {
384 kind: kind,
385 pattern: self.pattern().to_string(),
386 span: span,
387 }
388 }
389
390 /// Return the current offset of the parser.
391 ///
392 /// The offset starts at `0` from the beginning of the regular expression
393 /// pattern string.
offset(&self) -> usize394 fn offset(&self) -> usize {
395 self.parser().pos.get().offset
396 }
397
398 /// Return the current line number of the parser.
399 ///
400 /// The line number starts at `1`.
line(&self) -> usize401 fn line(&self) -> usize {
402 self.parser().pos.get().line
403 }
404
405 /// Return the current column of the parser.
406 ///
407 /// The column number starts at `1` and is reset whenever a `\n` is seen.
column(&self) -> usize408 fn column(&self) -> usize {
409 self.parser().pos.get().column
410 }
411
412 /// Return the next capturing index. Each subsequent call increments the
413 /// internal index.
414 ///
415 /// The span given should correspond to the location of the opening
416 /// parenthesis.
417 ///
418 /// If the capture limit is exceeded, then an error is returned.
next_capture_index(&self, span: Span) -> Result<u32>419 fn next_capture_index(&self, span: Span) -> Result<u32> {
420 let current = self.parser().capture_index.get();
421 let i = current.checked_add(1).ok_or_else(|| {
422 self.error(span, ast::ErrorKind::CaptureLimitExceeded)
423 })?;
424 self.parser().capture_index.set(i);
425 Ok(i)
426 }
427
428 /// Adds the given capture name to this parser. If this capture name has
429 /// already been used, then an error is returned.
add_capture_name(&self, cap: &ast::CaptureName) -> Result<()>430 fn add_capture_name(&self, cap: &ast::CaptureName) -> Result<()> {
431 let mut names = self.parser().capture_names.borrow_mut();
432 match names
433 .binary_search_by_key(&cap.name.as_str(), |c| c.name.as_str())
434 {
435 Err(i) => {
436 names.insert(i, cap.clone());
437 Ok(())
438 }
439 Ok(i) => Err(self.error(
440 cap.span,
441 ast::ErrorKind::GroupNameDuplicate { original: names[i].span },
442 )),
443 }
444 }
445
446 /// Return whether the parser should ignore whitespace or not.
ignore_whitespace(&self) -> bool447 fn ignore_whitespace(&self) -> bool {
448 self.parser().ignore_whitespace.get()
449 }
450
451 /// Return the character at the current position of the parser.
452 ///
453 /// This panics if the current position does not point to a valid char.
char(&self) -> char454 fn char(&self) -> char {
455 self.char_at(self.offset())
456 }
457
458 /// Return the character at the given position.
459 ///
460 /// This panics if the given position does not point to a valid char.
char_at(&self, i: usize) -> char461 fn char_at(&self, i: usize) -> char {
462 self.pattern()[i..]
463 .chars()
464 .next()
465 .unwrap_or_else(|| panic!("expected char at offset {}", i))
466 }
467
468 /// Bump the parser to the next Unicode scalar value.
469 ///
470 /// If the end of the input has been reached, then `false` is returned.
bump(&self) -> bool471 fn bump(&self) -> bool {
472 if self.is_eof() {
473 return false;
474 }
475 let Position { mut offset, mut line, mut column } = self.pos();
476 if self.char() == '\n' {
477 line = line.checked_add(1).unwrap();
478 column = 1;
479 } else {
480 column = column.checked_add(1).unwrap();
481 }
482 offset += self.char().len_utf8();
483 self.parser().pos.set(Position {
484 offset: offset,
485 line: line,
486 column: column,
487 });
488 self.pattern()[self.offset()..].chars().next().is_some()
489 }
490
491 /// If the substring starting at the current position of the parser has
492 /// the given prefix, then bump the parser to the character immediately
493 /// following the prefix and return true. Otherwise, don't bump the parser
494 /// and return false.
bump_if(&self, prefix: &str) -> bool495 fn bump_if(&self, prefix: &str) -> bool {
496 if self.pattern()[self.offset()..].starts_with(prefix) {
497 for _ in 0..prefix.chars().count() {
498 self.bump();
499 }
500 true
501 } else {
502 false
503 }
504 }
505
506 /// Returns true if and only if the parser is positioned at a look-around
507 /// prefix. The conditions under which this returns true must always
508 /// correspond to a regular expression that would otherwise be consider
509 /// invalid.
510 ///
511 /// This should only be called immediately after parsing the opening of
512 /// a group or a set of flags.
is_lookaround_prefix(&self) -> bool513 fn is_lookaround_prefix(&self) -> bool {
514 self.bump_if("?=")
515 || self.bump_if("?!")
516 || self.bump_if("?<=")
517 || self.bump_if("?<!")
518 }
519
520 /// Bump the parser, and if the `x` flag is enabled, bump through any
521 /// subsequent spaces. Return true if and only if the parser is not at
522 /// EOF.
bump_and_bump_space(&self) -> bool523 fn bump_and_bump_space(&self) -> bool {
524 if !self.bump() {
525 return false;
526 }
527 self.bump_space();
528 !self.is_eof()
529 }
530
531 /// If the `x` flag is enabled (i.e., whitespace insensitivity with
532 /// comments), then this will advance the parser through all whitespace
533 /// and comments to the next non-whitespace non-comment byte.
534 ///
535 /// If the `x` flag is disabled, then this is a no-op.
536 ///
537 /// This should be used selectively throughout the parser where
538 /// arbitrary whitespace is permitted when the `x` flag is enabled. For
539 /// example, `{ 5 , 6}` is equivalent to `{5,6}`.
bump_space(&self)540 fn bump_space(&self) {
541 if !self.ignore_whitespace() {
542 return;
543 }
544 while !self.is_eof() {
545 if self.char().is_whitespace() {
546 self.bump();
547 } else if self.char() == '#' {
548 let start = self.pos();
549 let mut comment_text = String::new();
550 self.bump();
551 while !self.is_eof() {
552 let c = self.char();
553 self.bump();
554 if c == '\n' {
555 break;
556 }
557 comment_text.push(c);
558 }
559 let comment = ast::Comment {
560 span: Span::new(start, self.pos()),
561 comment: comment_text,
562 };
563 self.parser().comments.borrow_mut().push(comment);
564 } else {
565 break;
566 }
567 }
568 }
569
570 /// Peek at the next character in the input without advancing the parser.
571 ///
572 /// If the input has been exhausted, then this returns `None`.
peek(&self) -> Option<char>573 fn peek(&self) -> Option<char> {
574 if self.is_eof() {
575 return None;
576 }
577 self.pattern()[self.offset() + self.char().len_utf8()..].chars().next()
578 }
579
580 /// Like peek, but will ignore spaces when the parser is in whitespace
581 /// insensitive mode.
peek_space(&self) -> Option<char>582 fn peek_space(&self) -> Option<char> {
583 if !self.ignore_whitespace() {
584 return self.peek();
585 }
586 if self.is_eof() {
587 return None;
588 }
589 let mut start = self.offset() + self.char().len_utf8();
590 let mut in_comment = false;
591 for (i, c) in self.pattern()[start..].char_indices() {
592 if c.is_whitespace() {
593 continue;
594 } else if !in_comment && c == '#' {
595 in_comment = true;
596 } else if in_comment && c == '\n' {
597 in_comment = false;
598 } else {
599 start += i;
600 break;
601 }
602 }
603 self.pattern()[start..].chars().next()
604 }
605
606 /// Returns true if the next call to `bump` would return false.
is_eof(&self) -> bool607 fn is_eof(&self) -> bool {
608 self.offset() == self.pattern().len()
609 }
610
611 /// Return the current position of the parser, which includes the offset,
612 /// line and column.
pos(&self) -> Position613 fn pos(&self) -> Position {
614 self.parser().pos.get()
615 }
616
617 /// Create a span at the current position of the parser. Both the start
618 /// and end of the span are set.
span(&self) -> Span619 fn span(&self) -> Span {
620 Span::splat(self.pos())
621 }
622
623 /// Create a span that covers the current character.
span_char(&self) -> Span624 fn span_char(&self) -> Span {
625 let mut next = Position {
626 offset: self.offset().checked_add(self.char().len_utf8()).unwrap(),
627 line: self.line(),
628 column: self.column().checked_add(1).unwrap(),
629 };
630 if self.char() == '\n' {
631 next.line += 1;
632 next.column = 1;
633 }
634 Span::new(self.pos(), next)
635 }
636
637 /// Parse and push a single alternation on to the parser's internal stack.
638 /// If the top of the stack already has an alternation, then add to that
639 /// instead of pushing a new one.
640 ///
641 /// The concatenation given corresponds to a single alternation branch.
642 /// The concatenation returned starts the next branch and is empty.
643 ///
644 /// This assumes the parser is currently positioned at `|` and will advance
645 /// the parser to the character following `|`.
646 #[inline(never)]
push_alternate(&self, mut concat: ast::Concat) -> Result<ast::Concat>647 fn push_alternate(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
648 assert_eq!(self.char(), '|');
649 concat.span.end = self.pos();
650 self.push_or_add_alternation(concat);
651 self.bump();
652 Ok(ast::Concat { span: self.span(), asts: vec![] })
653 }
654
655 /// Pushes or adds the given branch of an alternation to the parser's
656 /// internal stack of state.
push_or_add_alternation(&self, concat: ast::Concat)657 fn push_or_add_alternation(&self, concat: ast::Concat) {
658 use self::GroupState::*;
659
660 let mut stack = self.parser().stack_group.borrow_mut();
661 if let Some(&mut Alternation(ref mut alts)) = stack.last_mut() {
662 alts.asts.push(concat.into_ast());
663 return;
664 }
665 stack.push(Alternation(ast::Alternation {
666 span: Span::new(concat.span.start, self.pos()),
667 asts: vec![concat.into_ast()],
668 }));
669 }
670
671 /// Parse and push a group AST (and its parent concatenation) on to the
672 /// parser's internal stack. Return a fresh concatenation corresponding
673 /// to the group's sub-AST.
674 ///
675 /// If a set of flags was found (with no group), then the concatenation
676 /// is returned with that set of flags added.
677 ///
678 /// This assumes that the parser is currently positioned on the opening
679 /// parenthesis. It advances the parser to the character at the start
680 /// of the sub-expression (or adjoining expression).
681 ///
682 /// If there was a problem parsing the start of the group, then an error
683 /// is returned.
684 #[inline(never)]
push_group(&self, mut concat: ast::Concat) -> Result<ast::Concat>685 fn push_group(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
686 assert_eq!(self.char(), '(');
687 match self.parse_group()? {
688 Either::Left(set) => {
689 let ignore = set.flags.flag_state(ast::Flag::IgnoreWhitespace);
690 if let Some(v) = ignore {
691 self.parser().ignore_whitespace.set(v);
692 }
693
694 concat.asts.push(Ast::Flags(set));
695 Ok(concat)
696 }
697 Either::Right(group) => {
698 let old_ignore_whitespace = self.ignore_whitespace();
699 let new_ignore_whitespace = group
700 .flags()
701 .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
702 .unwrap_or(old_ignore_whitespace);
703 self.parser().stack_group.borrow_mut().push(
704 GroupState::Group {
705 concat: concat,
706 group: group,
707 ignore_whitespace: old_ignore_whitespace,
708 },
709 );
710 self.parser().ignore_whitespace.set(new_ignore_whitespace);
711 Ok(ast::Concat { span: self.span(), asts: vec![] })
712 }
713 }
714 }
715
716 /// Pop a group AST from the parser's internal stack and set the group's
717 /// AST to the given concatenation. Return the concatenation containing
718 /// the group.
719 ///
720 /// This assumes that the parser is currently positioned on the closing
721 /// parenthesis and advances the parser to the character following the `)`.
722 ///
723 /// If no such group could be popped, then an unopened group error is
724 /// returned.
725 #[inline(never)]
pop_group(&self, mut group_concat: ast::Concat) -> Result<ast::Concat>726 fn pop_group(&self, mut group_concat: ast::Concat) -> Result<ast::Concat> {
727 use self::GroupState::*;
728
729 assert_eq!(self.char(), ')');
730 let mut stack = self.parser().stack_group.borrow_mut();
731 let (mut prior_concat, mut group, ignore_whitespace, alt) = match stack
732 .pop()
733 {
734 Some(Group { concat, group, ignore_whitespace }) => {
735 (concat, group, ignore_whitespace, None)
736 }
737 Some(Alternation(alt)) => match stack.pop() {
738 Some(Group { concat, group, ignore_whitespace }) => {
739 (concat, group, ignore_whitespace, Some(alt))
740 }
741 None | Some(Alternation(_)) => {
742 return Err(self.error(
743 self.span_char(),
744 ast::ErrorKind::GroupUnopened,
745 ));
746 }
747 },
748 None => {
749 return Err(self
750 .error(self.span_char(), ast::ErrorKind::GroupUnopened));
751 }
752 };
753 self.parser().ignore_whitespace.set(ignore_whitespace);
754 group_concat.span.end = self.pos();
755 self.bump();
756 group.span.end = self.pos();
757 match alt {
758 Some(mut alt) => {
759 alt.span.end = group_concat.span.end;
760 alt.asts.push(group_concat.into_ast());
761 group.ast = Box::new(alt.into_ast());
762 }
763 None => {
764 group.ast = Box::new(group_concat.into_ast());
765 }
766 }
767 prior_concat.asts.push(Ast::Group(group));
768 Ok(prior_concat)
769 }
770
771 /// Pop the last state from the parser's internal stack, if it exists, and
772 /// add the given concatenation to it. There either must be no state or a
773 /// single alternation item on the stack. Any other scenario produces an
774 /// error.
775 ///
776 /// This assumes that the parser has advanced to the end.
777 #[inline(never)]
pop_group_end(&self, mut concat: ast::Concat) -> Result<Ast>778 fn pop_group_end(&self, mut concat: ast::Concat) -> Result<Ast> {
779 concat.span.end = self.pos();
780 let mut stack = self.parser().stack_group.borrow_mut();
781 let ast = match stack.pop() {
782 None => Ok(concat.into_ast()),
783 Some(GroupState::Alternation(mut alt)) => {
784 alt.span.end = self.pos();
785 alt.asts.push(concat.into_ast());
786 Ok(Ast::Alternation(alt))
787 }
788 Some(GroupState::Group { group, .. }) => {
789 return Err(
790 self.error(group.span, ast::ErrorKind::GroupUnclosed)
791 );
792 }
793 };
794 // If we try to pop again, there should be nothing.
795 match stack.pop() {
796 None => ast,
797 Some(GroupState::Alternation(_)) => {
798 // This unreachable is unfortunate. This case can't happen
799 // because the only way we can be here is if there were two
800 // `GroupState::Alternation`s adjacent in the parser's stack,
801 // which we guarantee to never happen because we never push a
802 // `GroupState::Alternation` if one is already at the top of
803 // the stack.
804 unreachable!()
805 }
806 Some(GroupState::Group { group, .. }) => {
807 Err(self.error(group.span, ast::ErrorKind::GroupUnclosed))
808 }
809 }
810 }
811
812 /// Parse the opening of a character class and push the current class
813 /// parsing context onto the parser's stack. This assumes that the parser
814 /// is positioned at an opening `[`. The given union should correspond to
815 /// the union of set items built up before seeing the `[`.
816 ///
817 /// If there was a problem parsing the opening of the class, then an error
818 /// is returned. Otherwise, a new union of set items for the class is
819 /// returned (which may be populated with either a `]` or a `-`).
820 #[inline(never)]
push_class_open( &self, parent_union: ast::ClassSetUnion, ) -> Result<ast::ClassSetUnion>821 fn push_class_open(
822 &self,
823 parent_union: ast::ClassSetUnion,
824 ) -> Result<ast::ClassSetUnion> {
825 assert_eq!(self.char(), '[');
826
827 let (nested_set, nested_union) = self.parse_set_class_open()?;
828 self.parser()
829 .stack_class
830 .borrow_mut()
831 .push(ClassState::Open { union: parent_union, set: nested_set });
832 Ok(nested_union)
833 }
834
835 /// Parse the end of a character class set and pop the character class
836 /// parser stack. The union given corresponds to the last union built
837 /// before seeing the closing `]`. The union returned corresponds to the
838 /// parent character class set with the nested class added to it.
839 ///
840 /// This assumes that the parser is positioned at a `]` and will advance
841 /// the parser to the byte immediately following the `]`.
842 ///
843 /// If the stack is empty after popping, then this returns the final
844 /// "top-level" character class AST (where a "top-level" character class
845 /// is one that is not nested inside any other character class).
846 ///
847 /// If there is no corresponding opening bracket on the parser's stack,
848 /// then an error is returned.
849 #[inline(never)]
pop_class( &self, nested_union: ast::ClassSetUnion, ) -> Result<Either<ast::ClassSetUnion, ast::Class>>850 fn pop_class(
851 &self,
852 nested_union: ast::ClassSetUnion,
853 ) -> Result<Either<ast::ClassSetUnion, ast::Class>> {
854 assert_eq!(self.char(), ']');
855
856 let item = ast::ClassSet::Item(nested_union.into_item());
857 let prevset = self.pop_class_op(item);
858 let mut stack = self.parser().stack_class.borrow_mut();
859 match stack.pop() {
860 None => {
861 // We can never observe an empty stack:
862 //
863 // 1) We are guaranteed to start with a non-empty stack since
864 // the character class parser is only initiated when it sees
865 // a `[`.
866 // 2) If we ever observe an empty stack while popping after
867 // seeing a `]`, then we signal the character class parser
868 // to terminate.
869 panic!("unexpected empty character class stack")
870 }
871 Some(ClassState::Op { .. }) => {
872 // This panic is unfortunate, but this case is impossible
873 // since we already popped the Op state if one exists above.
874 // Namely, every push to the class parser stack is guarded by
875 // whether an existing Op is already on the top of the stack.
876 // If it is, the existing Op is modified. That is, the stack
877 // can never have consecutive Op states.
878 panic!("unexpected ClassState::Op")
879 }
880 Some(ClassState::Open { mut union, mut set }) => {
881 self.bump();
882 set.span.end = self.pos();
883 set.kind = prevset;
884 if stack.is_empty() {
885 Ok(Either::Right(ast::Class::Bracketed(set)))
886 } else {
887 union.push(ast::ClassSetItem::Bracketed(Box::new(set)));
888 Ok(Either::Left(union))
889 }
890 }
891 }
892 }
893
894 /// Return an "unclosed class" error whose span points to the most
895 /// recently opened class.
896 ///
897 /// This should only be called while parsing a character class.
898 #[inline(never)]
unclosed_class_error(&self) -> ast::Error899 fn unclosed_class_error(&self) -> ast::Error {
900 for state in self.parser().stack_class.borrow().iter().rev() {
901 match *state {
902 ClassState::Open { ref set, .. } => {
903 return self
904 .error(set.span, ast::ErrorKind::ClassUnclosed);
905 }
906 _ => {}
907 }
908 }
909 // We are guaranteed to have a non-empty stack with at least
910 // one open bracket, so we should never get here.
911 panic!("no open character class found")
912 }
913
914 /// Push the current set of class items on to the class parser's stack as
915 /// the left hand side of the given operator.
916 ///
917 /// A fresh set union is returned, which should be used to build the right
918 /// hand side of this operator.
919 #[inline(never)]
push_class_op( &self, next_kind: ast::ClassSetBinaryOpKind, next_union: ast::ClassSetUnion, ) -> ast::ClassSetUnion920 fn push_class_op(
921 &self,
922 next_kind: ast::ClassSetBinaryOpKind,
923 next_union: ast::ClassSetUnion,
924 ) -> ast::ClassSetUnion {
925 let item = ast::ClassSet::Item(next_union.into_item());
926 let new_lhs = self.pop_class_op(item);
927 self.parser()
928 .stack_class
929 .borrow_mut()
930 .push(ClassState::Op { kind: next_kind, lhs: new_lhs });
931 ast::ClassSetUnion { span: self.span(), items: vec![] }
932 }
933
934 /// Pop a character class set from the character class parser stack. If the
935 /// top of the stack is just an item (not an operation), then return the
936 /// given set unchanged. If the top of the stack is an operation, then the
937 /// given set will be used as the rhs of the operation on the top of the
938 /// stack. In that case, the binary operation is returned as a set.
939 #[inline(never)]
pop_class_op(&self, rhs: ast::ClassSet) -> ast::ClassSet940 fn pop_class_op(&self, rhs: ast::ClassSet) -> ast::ClassSet {
941 let mut stack = self.parser().stack_class.borrow_mut();
942 let (kind, lhs) = match stack.pop() {
943 Some(ClassState::Op { kind, lhs }) => (kind, lhs),
944 Some(state @ ClassState::Open { .. }) => {
945 stack.push(state);
946 return rhs;
947 }
948 None => unreachable!(),
949 };
950 let span = Span::new(lhs.span().start, rhs.span().end);
951 ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp {
952 span: span,
953 kind: kind,
954 lhs: Box::new(lhs),
955 rhs: Box::new(rhs),
956 })
957 }
958 }
959
960 impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
961 /// Parse the regular expression into an abstract syntax tree.
parse(&self) -> Result<Ast>962 fn parse(&self) -> Result<Ast> {
963 self.parse_with_comments().map(|astc| astc.ast)
964 }
965
966 /// Parse the regular expression and return an abstract syntax tree with
967 /// all of the comments found in the pattern.
parse_with_comments(&self) -> Result<ast::WithComments>968 fn parse_with_comments(&self) -> Result<ast::WithComments> {
969 assert_eq!(self.offset(), 0, "parser can only be used once");
970 self.parser().reset();
971 let mut concat = ast::Concat { span: self.span(), asts: vec![] };
972 loop {
973 self.bump_space();
974 if self.is_eof() {
975 break;
976 }
977 match self.char() {
978 '(' => concat = self.push_group(concat)?,
979 ')' => concat = self.pop_group(concat)?,
980 '|' => concat = self.push_alternate(concat)?,
981 '[' => {
982 let class = self.parse_set_class()?;
983 concat.asts.push(Ast::Class(class));
984 }
985 '?' => {
986 concat = self.parse_uncounted_repetition(
987 concat,
988 ast::RepetitionKind::ZeroOrOne,
989 )?;
990 }
991 '*' => {
992 concat = self.parse_uncounted_repetition(
993 concat,
994 ast::RepetitionKind::ZeroOrMore,
995 )?;
996 }
997 '+' => {
998 concat = self.parse_uncounted_repetition(
999 concat,
1000 ast::RepetitionKind::OneOrMore,
1001 )?;
1002 }
1003 '{' => {
1004 concat = self.parse_counted_repetition(concat)?;
1005 }
1006 _ => concat.asts.push(self.parse_primitive()?.into_ast()),
1007 }
1008 }
1009 let ast = self.pop_group_end(concat)?;
1010 NestLimiter::new(self).check(&ast)?;
1011 Ok(ast::WithComments {
1012 ast: ast,
1013 comments: mem::replace(
1014 &mut *self.parser().comments.borrow_mut(),
1015 vec![],
1016 ),
1017 })
1018 }
1019
1020 /// Parses an uncounted repetition operation. An uncounted repetition
1021 /// operator includes ?, * and +, but does not include the {m,n} syntax.
1022 /// The given `kind` should correspond to the operator observed by the
1023 /// caller.
1024 ///
1025 /// This assumes that the paser is currently positioned at the repetition
1026 /// operator and advances the parser to the first character after the
1027 /// operator. (Note that the operator may include a single additional `?`,
1028 /// which makes the operator ungreedy.)
1029 ///
1030 /// The caller should include the concatenation that is being built. The
1031 /// concatenation returned includes the repetition operator applied to the
1032 /// last expression in the given concatenation.
1033 #[inline(never)]
parse_uncounted_repetition( &self, mut concat: ast::Concat, kind: ast::RepetitionKind, ) -> Result<ast::Concat>1034 fn parse_uncounted_repetition(
1035 &self,
1036 mut concat: ast::Concat,
1037 kind: ast::RepetitionKind,
1038 ) -> Result<ast::Concat> {
1039 assert!(
1040 self.char() == '?' || self.char() == '*' || self.char() == '+'
1041 );
1042 let op_start = self.pos();
1043 let ast = match concat.asts.pop() {
1044 Some(ast) => ast,
1045 None => {
1046 return Err(
1047 self.error(self.span(), ast::ErrorKind::RepetitionMissing)
1048 )
1049 }
1050 };
1051 match ast {
1052 Ast::Empty(_) | Ast::Flags(_) => {
1053 return Err(
1054 self.error(self.span(), ast::ErrorKind::RepetitionMissing)
1055 )
1056 }
1057 _ => {}
1058 }
1059 let mut greedy = true;
1060 if self.bump() && self.char() == '?' {
1061 greedy = false;
1062 self.bump();
1063 }
1064 concat.asts.push(Ast::Repetition(ast::Repetition {
1065 span: ast.span().with_end(self.pos()),
1066 op: ast::RepetitionOp {
1067 span: Span::new(op_start, self.pos()),
1068 kind: kind,
1069 },
1070 greedy: greedy,
1071 ast: Box::new(ast),
1072 }));
1073 Ok(concat)
1074 }
1075
1076 /// Parses a counted repetition operation. A counted repetition operator
1077 /// corresponds to the {m,n} syntax, and does not include the ?, * or +
1078 /// operators.
1079 ///
1080 /// This assumes that the paser is currently positioned at the opening `{`
1081 /// and advances the parser to the first character after the operator.
1082 /// (Note that the operator may include a single additional `?`, which
1083 /// makes the operator ungreedy.)
1084 ///
1085 /// The caller should include the concatenation that is being built. The
1086 /// concatenation returned includes the repetition operator applied to the
1087 /// last expression in the given concatenation.
1088 #[inline(never)]
parse_counted_repetition( &self, mut concat: ast::Concat, ) -> Result<ast::Concat>1089 fn parse_counted_repetition(
1090 &self,
1091 mut concat: ast::Concat,
1092 ) -> Result<ast::Concat> {
1093 assert!(self.char() == '{');
1094 let start = self.pos();
1095 let ast = match concat.asts.pop() {
1096 Some(ast) => ast,
1097 None => {
1098 return Err(
1099 self.error(self.span(), ast::ErrorKind::RepetitionMissing)
1100 )
1101 }
1102 };
1103 match ast {
1104 Ast::Empty(_) | Ast::Flags(_) => {
1105 return Err(
1106 self.error(self.span(), ast::ErrorKind::RepetitionMissing)
1107 )
1108 }
1109 _ => {}
1110 }
1111 if !self.bump_and_bump_space() {
1112 return Err(self.error(
1113 Span::new(start, self.pos()),
1114 ast::ErrorKind::RepetitionCountUnclosed,
1115 ));
1116 }
1117 let count_start = specialize_err(
1118 self.parse_decimal(),
1119 ast::ErrorKind::DecimalEmpty,
1120 ast::ErrorKind::RepetitionCountDecimalEmpty,
1121 )?;
1122 let mut range = ast::RepetitionRange::Exactly(count_start);
1123 if self.is_eof() {
1124 return Err(self.error(
1125 Span::new(start, self.pos()),
1126 ast::ErrorKind::RepetitionCountUnclosed,
1127 ));
1128 }
1129 if self.char() == ',' {
1130 if !self.bump_and_bump_space() {
1131 return Err(self.error(
1132 Span::new(start, self.pos()),
1133 ast::ErrorKind::RepetitionCountUnclosed,
1134 ));
1135 }
1136 if self.char() != '}' {
1137 let count_end = specialize_err(
1138 self.parse_decimal(),
1139 ast::ErrorKind::DecimalEmpty,
1140 ast::ErrorKind::RepetitionCountDecimalEmpty,
1141 )?;
1142 range = ast::RepetitionRange::Bounded(count_start, count_end);
1143 } else {
1144 range = ast::RepetitionRange::AtLeast(count_start);
1145 }
1146 }
1147 if self.is_eof() || self.char() != '}' {
1148 return Err(self.error(
1149 Span::new(start, self.pos()),
1150 ast::ErrorKind::RepetitionCountUnclosed,
1151 ));
1152 }
1153
1154 let mut greedy = true;
1155 if self.bump_and_bump_space() && self.char() == '?' {
1156 greedy = false;
1157 self.bump();
1158 }
1159
1160 let op_span = Span::new(start, self.pos());
1161 if !range.is_valid() {
1162 return Err(
1163 self.error(op_span, ast::ErrorKind::RepetitionCountInvalid)
1164 );
1165 }
1166 concat.asts.push(Ast::Repetition(ast::Repetition {
1167 span: ast.span().with_end(self.pos()),
1168 op: ast::RepetitionOp {
1169 span: op_span,
1170 kind: ast::RepetitionKind::Range(range),
1171 },
1172 greedy: greedy,
1173 ast: Box::new(ast),
1174 }));
1175 Ok(concat)
1176 }
1177
1178 /// Parse a group (which contains a sub-expression) or a set of flags.
1179 ///
1180 /// If a group was found, then it is returned with an empty AST. If a set
1181 /// of flags is found, then that set is returned.
1182 ///
1183 /// The parser should be positioned at the opening parenthesis.
1184 ///
1185 /// This advances the parser to the character before the start of the
1186 /// sub-expression (in the case of a group) or to the closing parenthesis
1187 /// immediately following the set of flags.
1188 ///
1189 /// # Errors
1190 ///
1191 /// If flags are given and incorrectly specified, then a corresponding
1192 /// error is returned.
1193 ///
1194 /// If a capture name is given and it is incorrectly specified, then a
1195 /// corresponding error is returned.
1196 #[inline(never)]
parse_group(&self) -> Result<Either<ast::SetFlags, ast::Group>>1197 fn parse_group(&self) -> Result<Either<ast::SetFlags, ast::Group>> {
1198 assert_eq!(self.char(), '(');
1199 let open_span = self.span_char();
1200 self.bump();
1201 self.bump_space();
1202 if self.is_lookaround_prefix() {
1203 return Err(self.error(
1204 Span::new(open_span.start, self.span().end),
1205 ast::ErrorKind::UnsupportedLookAround,
1206 ));
1207 }
1208 let inner_span = self.span();
1209 if self.bump_if("?P<") {
1210 let capture_index = self.next_capture_index(open_span)?;
1211 let cap = self.parse_capture_name(capture_index)?;
1212 Ok(Either::Right(ast::Group {
1213 span: open_span,
1214 kind: ast::GroupKind::CaptureName(cap),
1215 ast: Box::new(Ast::Empty(self.span())),
1216 }))
1217 } else if self.bump_if("?") {
1218 if self.is_eof() {
1219 return Err(
1220 self.error(open_span, ast::ErrorKind::GroupUnclosed)
1221 );
1222 }
1223 let flags = self.parse_flags()?;
1224 let char_end = self.char();
1225 self.bump();
1226 if char_end == ')' {
1227 // We don't allow empty flags, e.g., `(?)`. We instead
1228 // interpret it as a repetition operator missing its argument.
1229 if flags.items.is_empty() {
1230 return Err(self.error(
1231 inner_span,
1232 ast::ErrorKind::RepetitionMissing,
1233 ));
1234 }
1235 Ok(Either::Left(ast::SetFlags {
1236 span: Span { end: self.pos(), ..open_span },
1237 flags: flags,
1238 }))
1239 } else {
1240 assert_eq!(char_end, ':');
1241 Ok(Either::Right(ast::Group {
1242 span: open_span,
1243 kind: ast::GroupKind::NonCapturing(flags),
1244 ast: Box::new(Ast::Empty(self.span())),
1245 }))
1246 }
1247 } else {
1248 let capture_index = self.next_capture_index(open_span)?;
1249 Ok(Either::Right(ast::Group {
1250 span: open_span,
1251 kind: ast::GroupKind::CaptureIndex(capture_index),
1252 ast: Box::new(Ast::Empty(self.span())),
1253 }))
1254 }
1255 }
1256
1257 /// Parses a capture group name. Assumes that the parser is positioned at
1258 /// the first character in the name following the opening `<` (and may
1259 /// possibly be EOF). This advances the parser to the first character
1260 /// following the closing `>`.
1261 ///
1262 /// The caller must provide the capture index of the group for this name.
1263 #[inline(never)]
parse_capture_name( &self, capture_index: u32, ) -> Result<ast::CaptureName>1264 fn parse_capture_name(
1265 &self,
1266 capture_index: u32,
1267 ) -> Result<ast::CaptureName> {
1268 if self.is_eof() {
1269 return Err(self
1270 .error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
1271 }
1272 let start = self.pos();
1273 loop {
1274 if self.char() == '>' {
1275 break;
1276 }
1277 if !is_capture_char(self.char(), self.pos() == start) {
1278 return Err(self.error(
1279 self.span_char(),
1280 ast::ErrorKind::GroupNameInvalid,
1281 ));
1282 }
1283 if !self.bump() {
1284 break;
1285 }
1286 }
1287 let end = self.pos();
1288 if self.is_eof() {
1289 return Err(self
1290 .error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
1291 }
1292 assert_eq!(self.char(), '>');
1293 self.bump();
1294 let name = &self.pattern()[start.offset..end.offset];
1295 if name.is_empty() {
1296 return Err(self.error(
1297 Span::new(start, start),
1298 ast::ErrorKind::GroupNameEmpty,
1299 ));
1300 }
1301 let capname = ast::CaptureName {
1302 span: Span::new(start, end),
1303 name: name.to_string(),
1304 index: capture_index,
1305 };
1306 self.add_capture_name(&capname)?;
1307 Ok(capname)
1308 }
1309
1310 /// Parse a sequence of flags starting at the current character.
1311 ///
1312 /// This advances the parser to the character immediately following the
1313 /// flags, which is guaranteed to be either `:` or `)`.
1314 ///
1315 /// # Errors
1316 ///
1317 /// If any flags are duplicated, then an error is returned.
1318 ///
1319 /// If the negation operator is used more than once, then an error is
1320 /// returned.
1321 ///
1322 /// If no flags could be found or if the negation operation is not followed
1323 /// by any flags, then an error is returned.
1324 #[inline(never)]
parse_flags(&self) -> Result<ast::Flags>1325 fn parse_flags(&self) -> Result<ast::Flags> {
1326 let mut flags = ast::Flags { span: self.span(), items: vec![] };
1327 let mut last_was_negation = None;
1328 while self.char() != ':' && self.char() != ')' {
1329 if self.char() == '-' {
1330 last_was_negation = Some(self.span_char());
1331 let item = ast::FlagsItem {
1332 span: self.span_char(),
1333 kind: ast::FlagsItemKind::Negation,
1334 };
1335 if let Some(i) = flags.add_item(item) {
1336 return Err(self.error(
1337 self.span_char(),
1338 ast::ErrorKind::FlagRepeatedNegation {
1339 original: flags.items[i].span,
1340 },
1341 ));
1342 }
1343 } else {
1344 last_was_negation = None;
1345 let item = ast::FlagsItem {
1346 span: self.span_char(),
1347 kind: ast::FlagsItemKind::Flag(self.parse_flag()?),
1348 };
1349 if let Some(i) = flags.add_item(item) {
1350 return Err(self.error(
1351 self.span_char(),
1352 ast::ErrorKind::FlagDuplicate {
1353 original: flags.items[i].span,
1354 },
1355 ));
1356 }
1357 }
1358 if !self.bump() {
1359 return Err(
1360 self.error(self.span(), ast::ErrorKind::FlagUnexpectedEof)
1361 );
1362 }
1363 }
1364 if let Some(span) = last_was_negation {
1365 return Err(self.error(span, ast::ErrorKind::FlagDanglingNegation));
1366 }
1367 flags.span.end = self.pos();
1368 Ok(flags)
1369 }
1370
1371 /// Parse the current character as a flag. Do not advance the parser.
1372 ///
1373 /// # Errors
1374 ///
1375 /// If the flag is not recognized, then an error is returned.
1376 #[inline(never)]
parse_flag(&self) -> Result<ast::Flag>1377 fn parse_flag(&self) -> Result<ast::Flag> {
1378 match self.char() {
1379 'i' => Ok(ast::Flag::CaseInsensitive),
1380 'm' => Ok(ast::Flag::MultiLine),
1381 's' => Ok(ast::Flag::DotMatchesNewLine),
1382 'U' => Ok(ast::Flag::SwapGreed),
1383 'u' => Ok(ast::Flag::Unicode),
1384 'x' => Ok(ast::Flag::IgnoreWhitespace),
1385 _ => {
1386 Err(self
1387 .error(self.span_char(), ast::ErrorKind::FlagUnrecognized))
1388 }
1389 }
1390 }
1391
1392 /// Parse a primitive AST. e.g., A literal, non-set character class or
1393 /// assertion.
1394 ///
1395 /// This assumes that the parser expects a primitive at the current
1396 /// location. i.e., All other non-primitive cases have been handled.
1397 /// For example, if the parser's position is at `|`, then `|` will be
1398 /// treated as a literal (e.g., inside a character class).
1399 ///
1400 /// This advances the parser to the first character immediately following
1401 /// the primitive.
parse_primitive(&self) -> Result<Primitive>1402 fn parse_primitive(&self) -> Result<Primitive> {
1403 match self.char() {
1404 '\\' => self.parse_escape(),
1405 '.' => {
1406 let ast = Primitive::Dot(self.span_char());
1407 self.bump();
1408 Ok(ast)
1409 }
1410 '^' => {
1411 let ast = Primitive::Assertion(ast::Assertion {
1412 span: self.span_char(),
1413 kind: ast::AssertionKind::StartLine,
1414 });
1415 self.bump();
1416 Ok(ast)
1417 }
1418 '$' => {
1419 let ast = Primitive::Assertion(ast::Assertion {
1420 span: self.span_char(),
1421 kind: ast::AssertionKind::EndLine,
1422 });
1423 self.bump();
1424 Ok(ast)
1425 }
1426 c => {
1427 let ast = Primitive::Literal(ast::Literal {
1428 span: self.span_char(),
1429 kind: ast::LiteralKind::Verbatim,
1430 c: c,
1431 });
1432 self.bump();
1433 Ok(ast)
1434 }
1435 }
1436 }
1437
1438 /// Parse an escape sequence as a primitive AST.
1439 ///
1440 /// This assumes the parser is positioned at the start of the escape
1441 /// sequence, i.e., `\`. It advances the parser to the first position
1442 /// immediately following the escape sequence.
1443 #[inline(never)]
parse_escape(&self) -> Result<Primitive>1444 fn parse_escape(&self) -> Result<Primitive> {
1445 assert_eq!(self.char(), '\\');
1446 let start = self.pos();
1447 if !self.bump() {
1448 return Err(self.error(
1449 Span::new(start, self.pos()),
1450 ast::ErrorKind::EscapeUnexpectedEof,
1451 ));
1452 }
1453 let c = self.char();
1454 // Put some of the more complicated routines into helpers.
1455 match c {
1456 '0'..='7' => {
1457 if !self.parser().octal {
1458 return Err(self.error(
1459 Span::new(start, self.span_char().end),
1460 ast::ErrorKind::UnsupportedBackreference,
1461 ));
1462 }
1463 let mut lit = self.parse_octal();
1464 lit.span.start = start;
1465 return Ok(Primitive::Literal(lit));
1466 }
1467 '8'..='9' if !self.parser().octal => {
1468 return Err(self.error(
1469 Span::new(start, self.span_char().end),
1470 ast::ErrorKind::UnsupportedBackreference,
1471 ));
1472 }
1473 'x' | 'u' | 'U' => {
1474 let mut lit = self.parse_hex()?;
1475 lit.span.start = start;
1476 return Ok(Primitive::Literal(lit));
1477 }
1478 'p' | 'P' => {
1479 let mut cls = self.parse_unicode_class()?;
1480 cls.span.start = start;
1481 return Ok(Primitive::Unicode(cls));
1482 }
1483 'd' | 's' | 'w' | 'D' | 'S' | 'W' => {
1484 let mut cls = self.parse_perl_class();
1485 cls.span.start = start;
1486 return Ok(Primitive::Perl(cls));
1487 }
1488 _ => {}
1489 }
1490
1491 // Handle all of the one letter sequences inline.
1492 self.bump();
1493 let span = Span::new(start, self.pos());
1494 if is_meta_character(c) {
1495 return Ok(Primitive::Literal(ast::Literal {
1496 span: span,
1497 kind: ast::LiteralKind::Punctuation,
1498 c: c,
1499 }));
1500 }
1501 let special = |kind, c| {
1502 Ok(Primitive::Literal(ast::Literal {
1503 span: span,
1504 kind: ast::LiteralKind::Special(kind),
1505 c: c,
1506 }))
1507 };
1508 match c {
1509 'a' => special(ast::SpecialLiteralKind::Bell, '\x07'),
1510 'f' => special(ast::SpecialLiteralKind::FormFeed, '\x0C'),
1511 't' => special(ast::SpecialLiteralKind::Tab, '\t'),
1512 'n' => special(ast::SpecialLiteralKind::LineFeed, '\n'),
1513 'r' => special(ast::SpecialLiteralKind::CarriageReturn, '\r'),
1514 'v' => special(ast::SpecialLiteralKind::VerticalTab, '\x0B'),
1515 ' ' if self.ignore_whitespace() => {
1516 special(ast::SpecialLiteralKind::Space, ' ')
1517 }
1518 'A' => Ok(Primitive::Assertion(ast::Assertion {
1519 span: span,
1520 kind: ast::AssertionKind::StartText,
1521 })),
1522 'z' => Ok(Primitive::Assertion(ast::Assertion {
1523 span: span,
1524 kind: ast::AssertionKind::EndText,
1525 })),
1526 'b' => Ok(Primitive::Assertion(ast::Assertion {
1527 span: span,
1528 kind: ast::AssertionKind::WordBoundary,
1529 })),
1530 'B' => Ok(Primitive::Assertion(ast::Assertion {
1531 span: span,
1532 kind: ast::AssertionKind::NotWordBoundary,
1533 })),
1534 _ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)),
1535 }
1536 }
1537
1538 /// Parse an octal representation of a Unicode codepoint up to 3 digits
1539 /// long. This expects the parser to be positioned at the first octal
1540 /// digit and advances the parser to the first character immediately
1541 /// following the octal number. This also assumes that parsing octal
1542 /// escapes is enabled.
1543 ///
1544 /// Assuming the preconditions are met, this routine can never fail.
1545 #[inline(never)]
parse_octal(&self) -> ast::Literal1546 fn parse_octal(&self) -> ast::Literal {
1547 use std::char;
1548 use std::u32;
1549
1550 assert!(self.parser().octal);
1551 assert!('0' <= self.char() && self.char() <= '7');
1552 let start = self.pos();
1553 // Parse up to two more digits.
1554 while self.bump()
1555 && '0' <= self.char()
1556 && self.char() <= '7'
1557 && self.pos().offset - start.offset <= 2
1558 {}
1559 let end = self.pos();
1560 let octal = &self.pattern()[start.offset..end.offset];
1561 // Parsing the octal should never fail since the above guarantees a
1562 // valid number.
1563 let codepoint =
1564 u32::from_str_radix(octal, 8).expect("valid octal number");
1565 // The max value for 3 digit octal is 0777 = 511 and [0, 511] has no
1566 // invalid Unicode scalar values.
1567 let c = char::from_u32(codepoint).expect("Unicode scalar value");
1568 ast::Literal {
1569 span: Span::new(start, end),
1570 kind: ast::LiteralKind::Octal,
1571 c: c,
1572 }
1573 }
1574
1575 /// Parse a hex representation of a Unicode codepoint. This handles both
1576 /// hex notations, i.e., `\xFF` and `\x{FFFF}`. This expects the parser to
1577 /// be positioned at the `x`, `u` or `U` prefix. The parser is advanced to
1578 /// the first character immediately following the hexadecimal literal.
1579 #[inline(never)]
parse_hex(&self) -> Result<ast::Literal>1580 fn parse_hex(&self) -> Result<ast::Literal> {
1581 assert!(
1582 self.char() == 'x' || self.char() == 'u' || self.char() == 'U'
1583 );
1584
1585 let hex_kind = match self.char() {
1586 'x' => ast::HexLiteralKind::X,
1587 'u' => ast::HexLiteralKind::UnicodeShort,
1588 _ => ast::HexLiteralKind::UnicodeLong,
1589 };
1590 if !self.bump_and_bump_space() {
1591 return Err(
1592 self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof)
1593 );
1594 }
1595 if self.char() == '{' {
1596 self.parse_hex_brace(hex_kind)
1597 } else {
1598 self.parse_hex_digits(hex_kind)
1599 }
1600 }
1601
1602 /// Parse an N-digit hex representation of a Unicode codepoint. This
1603 /// expects the parser to be positioned at the first digit and will advance
1604 /// the parser to the first character immediately following the escape
1605 /// sequence.
1606 ///
1607 /// The number of digits given must be 2 (for `\xNN`), 4 (for `\uNNNN`)
1608 /// or 8 (for `\UNNNNNNNN`).
1609 #[inline(never)]
parse_hex_digits( &self, kind: ast::HexLiteralKind, ) -> Result<ast::Literal>1610 fn parse_hex_digits(
1611 &self,
1612 kind: ast::HexLiteralKind,
1613 ) -> Result<ast::Literal> {
1614 use std::char;
1615 use std::u32;
1616
1617 let mut scratch = self.parser().scratch.borrow_mut();
1618 scratch.clear();
1619
1620 let start = self.pos();
1621 for i in 0..kind.digits() {
1622 if i > 0 && !self.bump_and_bump_space() {
1623 return Err(self
1624 .error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
1625 }
1626 if !is_hex(self.char()) {
1627 return Err(self.error(
1628 self.span_char(),
1629 ast::ErrorKind::EscapeHexInvalidDigit,
1630 ));
1631 }
1632 scratch.push(self.char());
1633 }
1634 // The final bump just moves the parser past the literal, which may
1635 // be EOF.
1636 self.bump_and_bump_space();
1637 let end = self.pos();
1638 let hex = scratch.as_str();
1639 match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
1640 None => Err(self.error(
1641 Span::new(start, end),
1642 ast::ErrorKind::EscapeHexInvalid,
1643 )),
1644 Some(c) => Ok(ast::Literal {
1645 span: Span::new(start, end),
1646 kind: ast::LiteralKind::HexFixed(kind),
1647 c: c,
1648 }),
1649 }
1650 }
1651
1652 /// Parse a hex representation of any Unicode scalar value. This expects
1653 /// the parser to be positioned at the opening brace `{` and will advance
1654 /// the parser to the first character following the closing brace `}`.
1655 #[inline(never)]
parse_hex_brace( &self, kind: ast::HexLiteralKind, ) -> Result<ast::Literal>1656 fn parse_hex_brace(
1657 &self,
1658 kind: ast::HexLiteralKind,
1659 ) -> Result<ast::Literal> {
1660 use std::char;
1661 use std::u32;
1662
1663 let mut scratch = self.parser().scratch.borrow_mut();
1664 scratch.clear();
1665
1666 let brace_pos = self.pos();
1667 let start = self.span_char().end;
1668 while self.bump_and_bump_space() && self.char() != '}' {
1669 if !is_hex(self.char()) {
1670 return Err(self.error(
1671 self.span_char(),
1672 ast::ErrorKind::EscapeHexInvalidDigit,
1673 ));
1674 }
1675 scratch.push(self.char());
1676 }
1677 if self.is_eof() {
1678 return Err(self.error(
1679 Span::new(brace_pos, self.pos()),
1680 ast::ErrorKind::EscapeUnexpectedEof,
1681 ));
1682 }
1683 let end = self.pos();
1684 let hex = scratch.as_str();
1685 assert_eq!(self.char(), '}');
1686 self.bump_and_bump_space();
1687
1688 if hex.is_empty() {
1689 return Err(self.error(
1690 Span::new(brace_pos, self.pos()),
1691 ast::ErrorKind::EscapeHexEmpty,
1692 ));
1693 }
1694 match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
1695 None => Err(self.error(
1696 Span::new(start, end),
1697 ast::ErrorKind::EscapeHexInvalid,
1698 )),
1699 Some(c) => Ok(ast::Literal {
1700 span: Span::new(start, self.pos()),
1701 kind: ast::LiteralKind::HexBrace(kind),
1702 c: c,
1703 }),
1704 }
1705 }
1706
1707 /// Parse a decimal number into a u32 while trimming leading and trailing
1708 /// whitespace.
1709 ///
1710 /// This expects the parser to be positioned at the first position where
1711 /// a decimal digit could occur. This will advance the parser to the byte
1712 /// immediately following the last contiguous decimal digit.
1713 ///
1714 /// If no decimal digit could be found or if there was a problem parsing
1715 /// the complete set of digits into a u32, then an error is returned.
parse_decimal(&self) -> Result<u32>1716 fn parse_decimal(&self) -> Result<u32> {
1717 let mut scratch = self.parser().scratch.borrow_mut();
1718 scratch.clear();
1719
1720 while !self.is_eof() && self.char().is_whitespace() {
1721 self.bump();
1722 }
1723 let start = self.pos();
1724 while !self.is_eof() && '0' <= self.char() && self.char() <= '9' {
1725 scratch.push(self.char());
1726 self.bump_and_bump_space();
1727 }
1728 let span = Span::new(start, self.pos());
1729 while !self.is_eof() && self.char().is_whitespace() {
1730 self.bump_and_bump_space();
1731 }
1732 let digits = scratch.as_str();
1733 if digits.is_empty() {
1734 return Err(self.error(span, ast::ErrorKind::DecimalEmpty));
1735 }
1736 match u32::from_str_radix(digits, 10).ok() {
1737 Some(n) => Ok(n),
1738 None => Err(self.error(span, ast::ErrorKind::DecimalInvalid)),
1739 }
1740 }
1741
1742 /// Parse a standard character class consisting primarily of characters or
1743 /// character ranges, but can also contain nested character classes of
1744 /// any type (sans `.`).
1745 ///
1746 /// This assumes the parser is positioned at the opening `[`. If parsing
1747 /// is successful, then the parser is advanced to the position immediately
1748 /// following the closing `]`.
1749 #[inline(never)]
parse_set_class(&self) -> Result<ast::Class>1750 fn parse_set_class(&self) -> Result<ast::Class> {
1751 assert_eq!(self.char(), '[');
1752
1753 let mut union =
1754 ast::ClassSetUnion { span: self.span(), items: vec![] };
1755 loop {
1756 self.bump_space();
1757 if self.is_eof() {
1758 return Err(self.unclosed_class_error());
1759 }
1760 match self.char() {
1761 '[' => {
1762 // If we've already parsed the opening bracket, then
1763 // attempt to treat this as the beginning of an ASCII
1764 // class. If ASCII class parsing fails, then the parser
1765 // backs up to `[`.
1766 if !self.parser().stack_class.borrow().is_empty() {
1767 if let Some(cls) = self.maybe_parse_ascii_class() {
1768 union.push(ast::ClassSetItem::Ascii(cls));
1769 continue;
1770 }
1771 }
1772 union = self.push_class_open(union)?;
1773 }
1774 ']' => match self.pop_class(union)? {
1775 Either::Left(nested_union) => {
1776 union = nested_union;
1777 }
1778 Either::Right(class) => return Ok(class),
1779 },
1780 '&' if self.peek() == Some('&') => {
1781 assert!(self.bump_if("&&"));
1782 union = self.push_class_op(
1783 ast::ClassSetBinaryOpKind::Intersection,
1784 union,
1785 );
1786 }
1787 '-' if self.peek() == Some('-') => {
1788 assert!(self.bump_if("--"));
1789 union = self.push_class_op(
1790 ast::ClassSetBinaryOpKind::Difference,
1791 union,
1792 );
1793 }
1794 '~' if self.peek() == Some('~') => {
1795 assert!(self.bump_if("~~"));
1796 union = self.push_class_op(
1797 ast::ClassSetBinaryOpKind::SymmetricDifference,
1798 union,
1799 );
1800 }
1801 _ => {
1802 union.push(self.parse_set_class_range()?);
1803 }
1804 }
1805 }
1806 }
1807
1808 /// Parse a single primitive item in a character class set. The item to
1809 /// be parsed can either be one of a simple literal character, a range
1810 /// between two simple literal characters or a "primitive" character
1811 /// class like \w or \p{Greek}.
1812 ///
1813 /// If an invalid escape is found, or if a character class is found where
1814 /// a simple literal is expected (e.g., in a range), then an error is
1815 /// returned.
1816 #[inline(never)]
parse_set_class_range(&self) -> Result<ast::ClassSetItem>1817 fn parse_set_class_range(&self) -> Result<ast::ClassSetItem> {
1818 let prim1 = self.parse_set_class_item()?;
1819 self.bump_space();
1820 if self.is_eof() {
1821 return Err(self.unclosed_class_error());
1822 }
1823 // If the next char isn't a `-`, then we don't have a range.
1824 // There are two exceptions. If the char after a `-` is a `]`, then
1825 // `-` is interpreted as a literal `-`. Alternatively, if the char
1826 // after a `-` is a `-`, then `--` corresponds to a "difference"
1827 // operation.
1828 if self.char() != '-'
1829 || self.peek_space() == Some(']')
1830 || self.peek_space() == Some('-')
1831 {
1832 return prim1.into_class_set_item(self);
1833 }
1834 // OK, now we're parsing a range, so bump past the `-` and parse the
1835 // second half of the range.
1836 if !self.bump_and_bump_space() {
1837 return Err(self.unclosed_class_error());
1838 }
1839 let prim2 = self.parse_set_class_item()?;
1840 let range = ast::ClassSetRange {
1841 span: Span::new(prim1.span().start, prim2.span().end),
1842 start: prim1.into_class_literal(self)?,
1843 end: prim2.into_class_literal(self)?,
1844 };
1845 if !range.is_valid() {
1846 return Err(
1847 self.error(range.span, ast::ErrorKind::ClassRangeInvalid)
1848 );
1849 }
1850 Ok(ast::ClassSetItem::Range(range))
1851 }
1852
1853 /// Parse a single item in a character class as a primitive, where the
1854 /// primitive either consists of a verbatim literal or a single escape
1855 /// sequence.
1856 ///
1857 /// This assumes the parser is positioned at the beginning of a primitive,
1858 /// and advances the parser to the first position after the primitive if
1859 /// successful.
1860 ///
1861 /// Note that it is the caller's responsibility to report an error if an
1862 /// illegal primitive was parsed.
1863 #[inline(never)]
parse_set_class_item(&self) -> Result<Primitive>1864 fn parse_set_class_item(&self) -> Result<Primitive> {
1865 if self.char() == '\\' {
1866 self.parse_escape()
1867 } else {
1868 let x = Primitive::Literal(ast::Literal {
1869 span: self.span_char(),
1870 kind: ast::LiteralKind::Verbatim,
1871 c: self.char(),
1872 });
1873 self.bump();
1874 Ok(x)
1875 }
1876 }
1877
1878 /// Parses the opening of a character class set. This includes the opening
1879 /// bracket along with `^` if present to indicate negation. This also
1880 /// starts parsing the opening set of unioned items if applicable, since
1881 /// there are special rules applied to certain characters in the opening
1882 /// of a character class. For example, `[^]]` is the class of all
1883 /// characters not equal to `]`. (`]` would need to be escaped in any other
1884 /// position.) Similarly for `-`.
1885 ///
1886 /// In all cases, the op inside the returned `ast::ClassBracketed` is an
1887 /// empty union. This empty union should be replaced with the actual item
1888 /// when it is popped from the parser's stack.
1889 ///
1890 /// This assumes the parser is positioned at the opening `[` and advances
1891 /// the parser to the first non-special byte of the character class.
1892 ///
1893 /// An error is returned if EOF is found.
1894 #[inline(never)]
parse_set_class_open( &self, ) -> Result<(ast::ClassBracketed, ast::ClassSetUnion)>1895 fn parse_set_class_open(
1896 &self,
1897 ) -> Result<(ast::ClassBracketed, ast::ClassSetUnion)> {
1898 assert_eq!(self.char(), '[');
1899 let start = self.pos();
1900 if !self.bump_and_bump_space() {
1901 return Err(self.error(
1902 Span::new(start, self.pos()),
1903 ast::ErrorKind::ClassUnclosed,
1904 ));
1905 }
1906
1907 let negated = if self.char() != '^' {
1908 false
1909 } else {
1910 if !self.bump_and_bump_space() {
1911 return Err(self.error(
1912 Span::new(start, self.pos()),
1913 ast::ErrorKind::ClassUnclosed,
1914 ));
1915 }
1916 true
1917 };
1918 // Accept any number of `-` as literal `-`.
1919 let mut union =
1920 ast::ClassSetUnion { span: self.span(), items: vec![] };
1921 while self.char() == '-' {
1922 union.push(ast::ClassSetItem::Literal(ast::Literal {
1923 span: self.span_char(),
1924 kind: ast::LiteralKind::Verbatim,
1925 c: '-',
1926 }));
1927 if !self.bump_and_bump_space() {
1928 return Err(self.error(
1929 Span::new(start, self.pos()),
1930 ast::ErrorKind::ClassUnclosed,
1931 ));
1932 }
1933 }
1934 // If `]` is the *first* char in a set, then interpret it as a literal
1935 // `]`. That is, an empty class is impossible to write.
1936 if union.items.is_empty() && self.char() == ']' {
1937 union.push(ast::ClassSetItem::Literal(ast::Literal {
1938 span: self.span_char(),
1939 kind: ast::LiteralKind::Verbatim,
1940 c: ']',
1941 }));
1942 if !self.bump_and_bump_space() {
1943 return Err(self.error(
1944 Span::new(start, self.pos()),
1945 ast::ErrorKind::ClassUnclosed,
1946 ));
1947 }
1948 }
1949 let set = ast::ClassBracketed {
1950 span: Span::new(start, self.pos()),
1951 negated: negated,
1952 kind: ast::ClassSet::union(ast::ClassSetUnion {
1953 span: Span::new(union.span.start, union.span.start),
1954 items: vec![],
1955 }),
1956 };
1957 Ok((set, union))
1958 }
1959
1960 /// Attempt to parse an ASCII character class, e.g., `[:alnum:]`.
1961 ///
1962 /// This assumes the parser is positioned at the opening `[`.
1963 ///
1964 /// If no valid ASCII character class could be found, then this does not
1965 /// advance the parser and `None` is returned. Otherwise, the parser is
1966 /// advanced to the first byte following the closing `]` and the
1967 /// corresponding ASCII class is returned.
1968 #[inline(never)]
maybe_parse_ascii_class(&self) -> Option<ast::ClassAscii>1969 fn maybe_parse_ascii_class(&self) -> Option<ast::ClassAscii> {
1970 // ASCII character classes are interesting from a parsing perspective
1971 // because parsing cannot fail with any interesting error. For example,
1972 // in order to use an ASCII character class, it must be enclosed in
1973 // double brackets, e.g., `[[:alnum:]]`. Alternatively, you might think
1974 // of it as "ASCII character characters have the syntax `[:NAME:]`
1975 // which can only appear within character brackets." This means that
1976 // things like `[[:lower:]A]` are legal constructs.
1977 //
1978 // However, if one types an incorrect ASCII character class, e.g.,
1979 // `[[:loower:]]`, then we treat that as a normal nested character
1980 // class containing the characters `:elorw`. One might argue that we
1981 // should return an error instead since the repeated colons give away
1982 // the intent to write an ASCII class. But what if the user typed
1983 // `[[:lower]]` instead? How can we tell that was intended to be an
1984 // ASCII class and not just a normal nested class?
1985 //
1986 // Reasonable people can probably disagree over this, but for better
1987 // or worse, we implement semantics that never fails at the expense
1988 // of better failure modes.
1989 assert_eq!(self.char(), '[');
1990 // If parsing fails, then we back up the parser to this starting point.
1991 let start = self.pos();
1992 let mut negated = false;
1993 if !self.bump() || self.char() != ':' {
1994 self.parser().pos.set(start);
1995 return None;
1996 }
1997 if !self.bump() {
1998 self.parser().pos.set(start);
1999 return None;
2000 }
2001 if self.char() == '^' {
2002 negated = true;
2003 if !self.bump() {
2004 self.parser().pos.set(start);
2005 return None;
2006 }
2007 }
2008 let name_start = self.offset();
2009 while self.char() != ':' && self.bump() {}
2010 if self.is_eof() {
2011 self.parser().pos.set(start);
2012 return None;
2013 }
2014 let name = &self.pattern()[name_start..self.offset()];
2015 if !self.bump_if(":]") {
2016 self.parser().pos.set(start);
2017 return None;
2018 }
2019 let kind = match ast::ClassAsciiKind::from_name(name) {
2020 Some(kind) => kind,
2021 None => {
2022 self.parser().pos.set(start);
2023 return None;
2024 }
2025 };
2026 Some(ast::ClassAscii {
2027 span: Span::new(start, self.pos()),
2028 kind: kind,
2029 negated: negated,
2030 })
2031 }
2032
2033 /// Parse a Unicode class in either the single character notation, `\pN`
2034 /// or the multi-character bracketed notation, `\p{Greek}`. This assumes
2035 /// the parser is positioned at the `p` (or `P` for negation) and will
2036 /// advance the parser to the character immediately following the class.
2037 ///
2038 /// Note that this does not check whether the class name is valid or not.
2039 #[inline(never)]
parse_unicode_class(&self) -> Result<ast::ClassUnicode>2040 fn parse_unicode_class(&self) -> Result<ast::ClassUnicode> {
2041 assert!(self.char() == 'p' || self.char() == 'P');
2042
2043 let mut scratch = self.parser().scratch.borrow_mut();
2044 scratch.clear();
2045
2046 let negated = self.char() == 'P';
2047 if !self.bump_and_bump_space() {
2048 return Err(
2049 self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof)
2050 );
2051 }
2052 let (start, kind) = if self.char() == '{' {
2053 let start = self.span_char().end;
2054 while self.bump_and_bump_space() && self.char() != '}' {
2055 scratch.push(self.char());
2056 }
2057 if self.is_eof() {
2058 return Err(self
2059 .error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2060 }
2061 assert_eq!(self.char(), '}');
2062 self.bump();
2063
2064 let name = scratch.as_str();
2065 if let Some(i) = name.find("!=") {
2066 (
2067 start,
2068 ast::ClassUnicodeKind::NamedValue {
2069 op: ast::ClassUnicodeOpKind::NotEqual,
2070 name: name[..i].to_string(),
2071 value: name[i + 2..].to_string(),
2072 },
2073 )
2074 } else if let Some(i) = name.find(':') {
2075 (
2076 start,
2077 ast::ClassUnicodeKind::NamedValue {
2078 op: ast::ClassUnicodeOpKind::Colon,
2079 name: name[..i].to_string(),
2080 value: name[i + 1..].to_string(),
2081 },
2082 )
2083 } else if let Some(i) = name.find('=') {
2084 (
2085 start,
2086 ast::ClassUnicodeKind::NamedValue {
2087 op: ast::ClassUnicodeOpKind::Equal,
2088 name: name[..i].to_string(),
2089 value: name[i + 1..].to_string(),
2090 },
2091 )
2092 } else {
2093 (start, ast::ClassUnicodeKind::Named(name.to_string()))
2094 }
2095 } else {
2096 let start = self.pos();
2097 let c = self.char();
2098 if c == '\\' {
2099 return Err(self.error(
2100 self.span_char(),
2101 ast::ErrorKind::UnicodeClassInvalid,
2102 ));
2103 }
2104 self.bump_and_bump_space();
2105 let kind = ast::ClassUnicodeKind::OneLetter(c);
2106 (start, kind)
2107 };
2108 Ok(ast::ClassUnicode {
2109 span: Span::new(start, self.pos()),
2110 negated: negated,
2111 kind: kind,
2112 })
2113 }
2114
2115 /// Parse a Perl character class, e.g., `\d` or `\W`. This assumes the
2116 /// parser is currently at a valid character class name and will be
2117 /// advanced to the character immediately following the class.
2118 #[inline(never)]
parse_perl_class(&self) -> ast::ClassPerl2119 fn parse_perl_class(&self) -> ast::ClassPerl {
2120 let c = self.char();
2121 let span = self.span_char();
2122 self.bump();
2123 let (negated, kind) = match c {
2124 'd' => (false, ast::ClassPerlKind::Digit),
2125 'D' => (true, ast::ClassPerlKind::Digit),
2126 's' => (false, ast::ClassPerlKind::Space),
2127 'S' => (true, ast::ClassPerlKind::Space),
2128 'w' => (false, ast::ClassPerlKind::Word),
2129 'W' => (true, ast::ClassPerlKind::Word),
2130 c => panic!("expected valid Perl class but got '{}'", c),
2131 };
2132 ast::ClassPerl { span: span, kind: kind, negated: negated }
2133 }
2134 }
2135
2136 /// A type that traverses a fully parsed Ast and checks whether its depth
2137 /// exceeds the specified nesting limit. If it does, then an error is returned.
2138 #[derive(Debug)]
2139 struct NestLimiter<'p, 's: 'p, P: 'p + 's> {
2140 /// The parser that is checking the nest limit.
2141 p: &'p ParserI<'s, P>,
2142 /// The current depth while walking an Ast.
2143 depth: u32,
2144 }
2145
2146 impl<'p, 's, P: Borrow<Parser>> NestLimiter<'p, 's, P> {
new(p: &'p ParserI<'s, P>) -> NestLimiter<'p, 's, P>2147 fn new(p: &'p ParserI<'s, P>) -> NestLimiter<'p, 's, P> {
2148 NestLimiter { p: p, depth: 0 }
2149 }
2150
2151 #[inline(never)]
check(self, ast: &Ast) -> Result<()>2152 fn check(self, ast: &Ast) -> Result<()> {
2153 ast::visit(ast, self)
2154 }
2155
increment_depth(&mut self, span: &Span) -> Result<()>2156 fn increment_depth(&mut self, span: &Span) -> Result<()> {
2157 let new = self.depth.checked_add(1).ok_or_else(|| {
2158 self.p.error(
2159 span.clone(),
2160 ast::ErrorKind::NestLimitExceeded(::std::u32::MAX),
2161 )
2162 })?;
2163 let limit = self.p.parser().nest_limit;
2164 if new > limit {
2165 return Err(self.p.error(
2166 span.clone(),
2167 ast::ErrorKind::NestLimitExceeded(limit),
2168 ));
2169 }
2170 self.depth = new;
2171 Ok(())
2172 }
2173
decrement_depth(&mut self)2174 fn decrement_depth(&mut self) {
2175 // Assuming the correctness of the visitor, this should never drop
2176 // below 0.
2177 self.depth = self.depth.checked_sub(1).unwrap();
2178 }
2179 }
2180
2181 impl<'p, 's, P: Borrow<Parser>> ast::Visitor for NestLimiter<'p, 's, P> {
2182 type Output = ();
2183 type Err = ast::Error;
2184
finish(self) -> Result<()>2185 fn finish(self) -> Result<()> {
2186 Ok(())
2187 }
2188
visit_pre(&mut self, ast: &Ast) -> Result<()>2189 fn visit_pre(&mut self, ast: &Ast) -> Result<()> {
2190 let span = match *ast {
2191 Ast::Empty(_)
2192 | Ast::Flags(_)
2193 | Ast::Literal(_)
2194 | Ast::Dot(_)
2195 | Ast::Assertion(_)
2196 | Ast::Class(ast::Class::Unicode(_))
2197 | Ast::Class(ast::Class::Perl(_)) => {
2198 // These are all base cases, so we don't increment depth.
2199 return Ok(());
2200 }
2201 Ast::Class(ast::Class::Bracketed(ref x)) => &x.span,
2202 Ast::Repetition(ref x) => &x.span,
2203 Ast::Group(ref x) => &x.span,
2204 Ast::Alternation(ref x) => &x.span,
2205 Ast::Concat(ref x) => &x.span,
2206 };
2207 self.increment_depth(span)
2208 }
2209
visit_post(&mut self, ast: &Ast) -> Result<()>2210 fn visit_post(&mut self, ast: &Ast) -> Result<()> {
2211 match *ast {
2212 Ast::Empty(_)
2213 | Ast::Flags(_)
2214 | Ast::Literal(_)
2215 | Ast::Dot(_)
2216 | Ast::Assertion(_)
2217 | Ast::Class(ast::Class::Unicode(_))
2218 | Ast::Class(ast::Class::Perl(_)) => {
2219 // These are all base cases, so we don't decrement depth.
2220 Ok(())
2221 }
2222 Ast::Class(ast::Class::Bracketed(_))
2223 | Ast::Repetition(_)
2224 | Ast::Group(_)
2225 | Ast::Alternation(_)
2226 | Ast::Concat(_) => {
2227 self.decrement_depth();
2228 Ok(())
2229 }
2230 }
2231 }
2232
visit_class_set_item_pre( &mut self, ast: &ast::ClassSetItem, ) -> Result<()>2233 fn visit_class_set_item_pre(
2234 &mut self,
2235 ast: &ast::ClassSetItem,
2236 ) -> Result<()> {
2237 let span = match *ast {
2238 ast::ClassSetItem::Empty(_)
2239 | ast::ClassSetItem::Literal(_)
2240 | ast::ClassSetItem::Range(_)
2241 | ast::ClassSetItem::Ascii(_)
2242 | ast::ClassSetItem::Unicode(_)
2243 | ast::ClassSetItem::Perl(_) => {
2244 // These are all base cases, so we don't increment depth.
2245 return Ok(());
2246 }
2247 ast::ClassSetItem::Bracketed(ref x) => &x.span,
2248 ast::ClassSetItem::Union(ref x) => &x.span,
2249 };
2250 self.increment_depth(span)
2251 }
2252
visit_class_set_item_post( &mut self, ast: &ast::ClassSetItem, ) -> Result<()>2253 fn visit_class_set_item_post(
2254 &mut self,
2255 ast: &ast::ClassSetItem,
2256 ) -> Result<()> {
2257 match *ast {
2258 ast::ClassSetItem::Empty(_)
2259 | ast::ClassSetItem::Literal(_)
2260 | ast::ClassSetItem::Range(_)
2261 | ast::ClassSetItem::Ascii(_)
2262 | ast::ClassSetItem::Unicode(_)
2263 | ast::ClassSetItem::Perl(_) => {
2264 // These are all base cases, so we don't decrement depth.
2265 Ok(())
2266 }
2267 ast::ClassSetItem::Bracketed(_) | ast::ClassSetItem::Union(_) => {
2268 self.decrement_depth();
2269 Ok(())
2270 }
2271 }
2272 }
2273
visit_class_set_binary_op_pre( &mut self, ast: &ast::ClassSetBinaryOp, ) -> Result<()>2274 fn visit_class_set_binary_op_pre(
2275 &mut self,
2276 ast: &ast::ClassSetBinaryOp,
2277 ) -> Result<()> {
2278 self.increment_depth(&ast.span)
2279 }
2280
visit_class_set_binary_op_post( &mut self, _ast: &ast::ClassSetBinaryOp, ) -> Result<()>2281 fn visit_class_set_binary_op_post(
2282 &mut self,
2283 _ast: &ast::ClassSetBinaryOp,
2284 ) -> Result<()> {
2285 self.decrement_depth();
2286 Ok(())
2287 }
2288 }
2289
2290 /// When the result is an error, transforms the ast::ErrorKind from the source
2291 /// Result into another one. This function is used to return clearer error
2292 /// messages when possible.
specialize_err<T>( result: Result<T>, from: ast::ErrorKind, to: ast::ErrorKind, ) -> Result<T>2293 fn specialize_err<T>(
2294 result: Result<T>,
2295 from: ast::ErrorKind,
2296 to: ast::ErrorKind,
2297 ) -> Result<T> {
2298 if let Err(e) = result {
2299 if e.kind == from {
2300 Err(ast::Error { kind: to, pattern: e.pattern, span: e.span })
2301 } else {
2302 Err(e)
2303 }
2304 } else {
2305 result
2306 }
2307 }
2308
2309 #[cfg(test)]
2310 mod tests {
2311 use std::ops::Range;
2312
2313 use super::{Parser, ParserBuilder, ParserI, Primitive};
2314 use ast::{self, Ast, Position, Span};
2315
2316 // Our own assert_eq, which has slightly better formatting (but honestly
2317 // still kind of crappy).
2318 macro_rules! assert_eq {
2319 ($left:expr, $right:expr) => {{
2320 match (&$left, &$right) {
2321 (left_val, right_val) => {
2322 if !(*left_val == *right_val) {
2323 panic!(
2324 "assertion failed: `(left == right)`\n\n\
2325 left: `{:?}`\nright: `{:?}`\n\n",
2326 left_val, right_val
2327 )
2328 }
2329 }
2330 }
2331 }};
2332 }
2333
2334 // We create these errors to compare with real ast::Errors in the tests.
2335 // We define equality between TestError and ast::Error to disregard the
2336 // pattern string in ast::Error, which is annoying to provide in tests.
2337 #[derive(Clone, Debug)]
2338 struct TestError {
2339 span: Span,
2340 kind: ast::ErrorKind,
2341 }
2342
2343 impl PartialEq<ast::Error> for TestError {
eq(&self, other: &ast::Error) -> bool2344 fn eq(&self, other: &ast::Error) -> bool {
2345 self.span == other.span && self.kind == other.kind
2346 }
2347 }
2348
2349 impl PartialEq<TestError> for ast::Error {
eq(&self, other: &TestError) -> bool2350 fn eq(&self, other: &TestError) -> bool {
2351 self.span == other.span && self.kind == other.kind
2352 }
2353 }
2354
s(str: &str) -> String2355 fn s(str: &str) -> String {
2356 str.to_string()
2357 }
2358
parser(pattern: &str) -> ParserI<Parser>2359 fn parser(pattern: &str) -> ParserI<Parser> {
2360 ParserI::new(Parser::new(), pattern)
2361 }
2362
parser_octal(pattern: &str) -> ParserI<Parser>2363 fn parser_octal(pattern: &str) -> ParserI<Parser> {
2364 let parser = ParserBuilder::new().octal(true).build();
2365 ParserI::new(parser, pattern)
2366 }
2367
parser_nest_limit(pattern: &str, nest_limit: u32) -> ParserI<Parser>2368 fn parser_nest_limit(pattern: &str, nest_limit: u32) -> ParserI<Parser> {
2369 let p = ParserBuilder::new().nest_limit(nest_limit).build();
2370 ParserI::new(p, pattern)
2371 }
2372
parser_ignore_whitespace(pattern: &str) -> ParserI<Parser>2373 fn parser_ignore_whitespace(pattern: &str) -> ParserI<Parser> {
2374 let p = ParserBuilder::new().ignore_whitespace(true).build();
2375 ParserI::new(p, pattern)
2376 }
2377
2378 /// Short alias for creating a new span.
nspan(start: Position, end: Position) -> Span2379 fn nspan(start: Position, end: Position) -> Span {
2380 Span::new(start, end)
2381 }
2382
2383 /// Short alias for creating a new position.
npos(offset: usize, line: usize, column: usize) -> Position2384 fn npos(offset: usize, line: usize, column: usize) -> Position {
2385 Position::new(offset, line, column)
2386 }
2387
2388 /// Create a new span from the given offset range. This assumes a single
2389 /// line and sets the columns based on the offsets. i.e., This only works
2390 /// out of the box for ASCII, which is fine for most tests.
span(range: Range<usize>) -> Span2391 fn span(range: Range<usize>) -> Span {
2392 let start = Position::new(range.start, 1, range.start + 1);
2393 let end = Position::new(range.end, 1, range.end + 1);
2394 Span::new(start, end)
2395 }
2396
2397 /// Create a new span for the corresponding byte range in the given string.
span_range(subject: &str, range: Range<usize>) -> Span2398 fn span_range(subject: &str, range: Range<usize>) -> Span {
2399 let start = Position {
2400 offset: range.start,
2401 line: 1 + subject[..range.start].matches('\n').count(),
2402 column: 1 + subject[..range.start]
2403 .chars()
2404 .rev()
2405 .position(|c| c == '\n')
2406 .unwrap_or(subject[..range.start].chars().count()),
2407 };
2408 let end = Position {
2409 offset: range.end,
2410 line: 1 + subject[..range.end].matches('\n').count(),
2411 column: 1 + subject[..range.end]
2412 .chars()
2413 .rev()
2414 .position(|c| c == '\n')
2415 .unwrap_or(subject[..range.end].chars().count()),
2416 };
2417 Span::new(start, end)
2418 }
2419
2420 /// Create a verbatim literal starting at the given position.
lit(c: char, start: usize) -> Ast2421 fn lit(c: char, start: usize) -> Ast {
2422 lit_with(c, span(start..start + c.len_utf8()))
2423 }
2424
2425 /// Create a punctuation literal starting at the given position.
punct_lit(c: char, span: Span) -> Ast2426 fn punct_lit(c: char, span: Span) -> Ast {
2427 Ast::Literal(ast::Literal {
2428 span: span,
2429 kind: ast::LiteralKind::Punctuation,
2430 c: c,
2431 })
2432 }
2433
2434 /// Create a verbatim literal with the given span.
lit_with(c: char, span: Span) -> Ast2435 fn lit_with(c: char, span: Span) -> Ast {
2436 Ast::Literal(ast::Literal {
2437 span: span,
2438 kind: ast::LiteralKind::Verbatim,
2439 c: c,
2440 })
2441 }
2442
2443 /// Create a concatenation with the given range.
concat(range: Range<usize>, asts: Vec<Ast>) -> Ast2444 fn concat(range: Range<usize>, asts: Vec<Ast>) -> Ast {
2445 concat_with(span(range), asts)
2446 }
2447
2448 /// Create a concatenation with the given span.
concat_with(span: Span, asts: Vec<Ast>) -> Ast2449 fn concat_with(span: Span, asts: Vec<Ast>) -> Ast {
2450 Ast::Concat(ast::Concat { span: span, asts: asts })
2451 }
2452
2453 /// Create an alternation with the given span.
alt(range: Range<usize>, asts: Vec<Ast>) -> Ast2454 fn alt(range: Range<usize>, asts: Vec<Ast>) -> Ast {
2455 Ast::Alternation(ast::Alternation { span: span(range), asts: asts })
2456 }
2457
2458 /// Create a capturing group with the given span.
group(range: Range<usize>, index: u32, ast: Ast) -> Ast2459 fn group(range: Range<usize>, index: u32, ast: Ast) -> Ast {
2460 Ast::Group(ast::Group {
2461 span: span(range),
2462 kind: ast::GroupKind::CaptureIndex(index),
2463 ast: Box::new(ast),
2464 })
2465 }
2466
2467 /// Create an ast::SetFlags.
2468 ///
2469 /// The given pattern should be the full pattern string. The range given
2470 /// should correspond to the byte offsets where the flag set occurs.
2471 ///
2472 /// If negated is true, then the set is interpreted as beginning with a
2473 /// negation.
flag_set( pat: &str, range: Range<usize>, flag: ast::Flag, negated: bool, ) -> Ast2474 fn flag_set(
2475 pat: &str,
2476 range: Range<usize>,
2477 flag: ast::Flag,
2478 negated: bool,
2479 ) -> Ast {
2480 let mut items = vec![ast::FlagsItem {
2481 span: span_range(pat, (range.end - 2)..(range.end - 1)),
2482 kind: ast::FlagsItemKind::Flag(flag),
2483 }];
2484 if negated {
2485 items.insert(
2486 0,
2487 ast::FlagsItem {
2488 span: span_range(pat, (range.start + 2)..(range.end - 2)),
2489 kind: ast::FlagsItemKind::Negation,
2490 },
2491 );
2492 }
2493 Ast::Flags(ast::SetFlags {
2494 span: span_range(pat, range.clone()),
2495 flags: ast::Flags {
2496 span: span_range(pat, (range.start + 2)..(range.end - 1)),
2497 items: items,
2498 },
2499 })
2500 }
2501
2502 #[test]
parse_nest_limit()2503 fn parse_nest_limit() {
2504 // A nest limit of 0 still allows some types of regexes.
2505 assert_eq!(
2506 parser_nest_limit("", 0).parse(),
2507 Ok(Ast::Empty(span(0..0)))
2508 );
2509 assert_eq!(parser_nest_limit("a", 0).parse(), Ok(lit('a', 0)));
2510
2511 // Test repetition operations, which require one level of nesting.
2512 assert_eq!(
2513 parser_nest_limit("a+", 0).parse().unwrap_err(),
2514 TestError {
2515 span: span(0..2),
2516 kind: ast::ErrorKind::NestLimitExceeded(0),
2517 }
2518 );
2519 assert_eq!(
2520 parser_nest_limit("a+", 1).parse(),
2521 Ok(Ast::Repetition(ast::Repetition {
2522 span: span(0..2),
2523 op: ast::RepetitionOp {
2524 span: span(1..2),
2525 kind: ast::RepetitionKind::OneOrMore,
2526 },
2527 greedy: true,
2528 ast: Box::new(lit('a', 0)),
2529 }))
2530 );
2531 assert_eq!(
2532 parser_nest_limit("(a)+", 1).parse().unwrap_err(),
2533 TestError {
2534 span: span(0..3),
2535 kind: ast::ErrorKind::NestLimitExceeded(1),
2536 }
2537 );
2538 assert_eq!(
2539 parser_nest_limit("a+*", 1).parse().unwrap_err(),
2540 TestError {
2541 span: span(0..2),
2542 kind: ast::ErrorKind::NestLimitExceeded(1),
2543 }
2544 );
2545 assert_eq!(
2546 parser_nest_limit("a+*", 2).parse(),
2547 Ok(Ast::Repetition(ast::Repetition {
2548 span: span(0..3),
2549 op: ast::RepetitionOp {
2550 span: span(2..3),
2551 kind: ast::RepetitionKind::ZeroOrMore,
2552 },
2553 greedy: true,
2554 ast: Box::new(Ast::Repetition(ast::Repetition {
2555 span: span(0..2),
2556 op: ast::RepetitionOp {
2557 span: span(1..2),
2558 kind: ast::RepetitionKind::OneOrMore,
2559 },
2560 greedy: true,
2561 ast: Box::new(lit('a', 0)),
2562 })),
2563 }))
2564 );
2565
2566 // Test concatenations. A concatenation requires one level of nesting.
2567 assert_eq!(
2568 parser_nest_limit("ab", 0).parse().unwrap_err(),
2569 TestError {
2570 span: span(0..2),
2571 kind: ast::ErrorKind::NestLimitExceeded(0),
2572 }
2573 );
2574 assert_eq!(
2575 parser_nest_limit("ab", 1).parse(),
2576 Ok(concat(0..2, vec![lit('a', 0), lit('b', 1)]))
2577 );
2578 assert_eq!(
2579 parser_nest_limit("abc", 1).parse(),
2580 Ok(concat(0..3, vec![lit('a', 0), lit('b', 1), lit('c', 2)]))
2581 );
2582
2583 // Test alternations. An alternation requires one level of nesting.
2584 assert_eq!(
2585 parser_nest_limit("a|b", 0).parse().unwrap_err(),
2586 TestError {
2587 span: span(0..3),
2588 kind: ast::ErrorKind::NestLimitExceeded(0),
2589 }
2590 );
2591 assert_eq!(
2592 parser_nest_limit("a|b", 1).parse(),
2593 Ok(alt(0..3, vec![lit('a', 0), lit('b', 2)]))
2594 );
2595 assert_eq!(
2596 parser_nest_limit("a|b|c", 1).parse(),
2597 Ok(alt(0..5, vec![lit('a', 0), lit('b', 2), lit('c', 4)]))
2598 );
2599
2600 // Test character classes. Classes form their own mini-recursive
2601 // syntax!
2602 assert_eq!(
2603 parser_nest_limit("[a]", 0).parse().unwrap_err(),
2604 TestError {
2605 span: span(0..3),
2606 kind: ast::ErrorKind::NestLimitExceeded(0),
2607 }
2608 );
2609 assert_eq!(
2610 parser_nest_limit("[a]", 1).parse(),
2611 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
2612 span: span(0..3),
2613 negated: false,
2614 kind: ast::ClassSet::Item(ast::ClassSetItem::Literal(
2615 ast::Literal {
2616 span: span(1..2),
2617 kind: ast::LiteralKind::Verbatim,
2618 c: 'a',
2619 }
2620 )),
2621 })))
2622 );
2623 assert_eq!(
2624 parser_nest_limit("[ab]", 1).parse().unwrap_err(),
2625 TestError {
2626 span: span(1..3),
2627 kind: ast::ErrorKind::NestLimitExceeded(1),
2628 }
2629 );
2630 assert_eq!(
2631 parser_nest_limit("[ab[cd]]", 2).parse().unwrap_err(),
2632 TestError {
2633 span: span(3..7),
2634 kind: ast::ErrorKind::NestLimitExceeded(2),
2635 }
2636 );
2637 assert_eq!(
2638 parser_nest_limit("[ab[cd]]", 3).parse().unwrap_err(),
2639 TestError {
2640 span: span(4..6),
2641 kind: ast::ErrorKind::NestLimitExceeded(3),
2642 }
2643 );
2644 assert_eq!(
2645 parser_nest_limit("[a--b]", 1).parse().unwrap_err(),
2646 TestError {
2647 span: span(1..5),
2648 kind: ast::ErrorKind::NestLimitExceeded(1),
2649 }
2650 );
2651 assert_eq!(
2652 parser_nest_limit("[a--bc]", 2).parse().unwrap_err(),
2653 TestError {
2654 span: span(4..6),
2655 kind: ast::ErrorKind::NestLimitExceeded(2),
2656 }
2657 );
2658 }
2659
2660 #[test]
parse_comments()2661 fn parse_comments() {
2662 let pat = "(?x)
2663 # This is comment 1.
2664 foo # This is comment 2.
2665 # This is comment 3.
2666 bar
2667 # This is comment 4.";
2668 let astc = parser(pat).parse_with_comments().unwrap();
2669 assert_eq!(
2670 astc.ast,
2671 concat_with(
2672 span_range(pat, 0..pat.len()),
2673 vec![
2674 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2675 lit_with('f', span_range(pat, 26..27)),
2676 lit_with('o', span_range(pat, 27..28)),
2677 lit_with('o', span_range(pat, 28..29)),
2678 lit_with('b', span_range(pat, 74..75)),
2679 lit_with('a', span_range(pat, 75..76)),
2680 lit_with('r', span_range(pat, 76..77)),
2681 ]
2682 )
2683 );
2684 assert_eq!(
2685 astc.comments,
2686 vec![
2687 ast::Comment {
2688 span: span_range(pat, 5..26),
2689 comment: s(" This is comment 1."),
2690 },
2691 ast::Comment {
2692 span: span_range(pat, 30..51),
2693 comment: s(" This is comment 2."),
2694 },
2695 ast::Comment {
2696 span: span_range(pat, 53..74),
2697 comment: s(" This is comment 3."),
2698 },
2699 ast::Comment {
2700 span: span_range(pat, 78..98),
2701 comment: s(" This is comment 4."),
2702 },
2703 ]
2704 );
2705 }
2706
2707 #[test]
parse_holistic()2708 fn parse_holistic() {
2709 assert_eq!(parser("]").parse(), Ok(lit(']', 0)));
2710 assert_eq!(
2711 parser(r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#\&\-\~").parse(),
2712 Ok(concat(
2713 0..36,
2714 vec![
2715 punct_lit('\\', span(0..2)),
2716 punct_lit('.', span(2..4)),
2717 punct_lit('+', span(4..6)),
2718 punct_lit('*', span(6..8)),
2719 punct_lit('?', span(8..10)),
2720 punct_lit('(', span(10..12)),
2721 punct_lit(')', span(12..14)),
2722 punct_lit('|', span(14..16)),
2723 punct_lit('[', span(16..18)),
2724 punct_lit(']', span(18..20)),
2725 punct_lit('{', span(20..22)),
2726 punct_lit('}', span(22..24)),
2727 punct_lit('^', span(24..26)),
2728 punct_lit('$', span(26..28)),
2729 punct_lit('#', span(28..30)),
2730 punct_lit('&', span(30..32)),
2731 punct_lit('-', span(32..34)),
2732 punct_lit('~', span(34..36)),
2733 ]
2734 ))
2735 );
2736 }
2737
2738 #[test]
parse_ignore_whitespace()2739 fn parse_ignore_whitespace() {
2740 // Test that basic whitespace insensitivity works.
2741 let pat = "(?x)a b";
2742 assert_eq!(
2743 parser(pat).parse(),
2744 Ok(concat_with(
2745 nspan(npos(0, 1, 1), npos(7, 1, 8)),
2746 vec![
2747 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2748 lit_with('a', nspan(npos(4, 1, 5), npos(5, 1, 6))),
2749 lit_with('b', nspan(npos(6, 1, 7), npos(7, 1, 8))),
2750 ]
2751 ))
2752 );
2753
2754 // Test that we can toggle whitespace insensitivity.
2755 let pat = "(?x)a b(?-x)a b";
2756 assert_eq!(
2757 parser(pat).parse(),
2758 Ok(concat_with(
2759 nspan(npos(0, 1, 1), npos(15, 1, 16)),
2760 vec![
2761 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2762 lit_with('a', nspan(npos(4, 1, 5), npos(5, 1, 6))),
2763 lit_with('b', nspan(npos(6, 1, 7), npos(7, 1, 8))),
2764 flag_set(pat, 7..12, ast::Flag::IgnoreWhitespace, true),
2765 lit_with('a', nspan(npos(12, 1, 13), npos(13, 1, 14))),
2766 lit_with(' ', nspan(npos(13, 1, 14), npos(14, 1, 15))),
2767 lit_with('b', nspan(npos(14, 1, 15), npos(15, 1, 16))),
2768 ]
2769 ))
2770 );
2771
2772 // Test that nesting whitespace insensitive flags works.
2773 let pat = "a (?x:a )a ";
2774 assert_eq!(
2775 parser(pat).parse(),
2776 Ok(concat_with(
2777 span_range(pat, 0..11),
2778 vec![
2779 lit_with('a', span_range(pat, 0..1)),
2780 lit_with(' ', span_range(pat, 1..2)),
2781 Ast::Group(ast::Group {
2782 span: span_range(pat, 2..9),
2783 kind: ast::GroupKind::NonCapturing(ast::Flags {
2784 span: span_range(pat, 4..5),
2785 items: vec![ast::FlagsItem {
2786 span: span_range(pat, 4..5),
2787 kind: ast::FlagsItemKind::Flag(
2788 ast::Flag::IgnoreWhitespace
2789 ),
2790 },],
2791 }),
2792 ast: Box::new(lit_with('a', span_range(pat, 6..7))),
2793 }),
2794 lit_with('a', span_range(pat, 9..10)),
2795 lit_with(' ', span_range(pat, 10..11)),
2796 ]
2797 ))
2798 );
2799
2800 // Test that whitespace after an opening paren is insignificant.
2801 let pat = "(?x)( ?P<foo> a )";
2802 assert_eq!(
2803 parser(pat).parse(),
2804 Ok(concat_with(
2805 span_range(pat, 0..pat.len()),
2806 vec![
2807 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2808 Ast::Group(ast::Group {
2809 span: span_range(pat, 4..pat.len()),
2810 kind: ast::GroupKind::CaptureName(ast::CaptureName {
2811 span: span_range(pat, 9..12),
2812 name: s("foo"),
2813 index: 1,
2814 }),
2815 ast: Box::new(lit_with('a', span_range(pat, 14..15))),
2816 }),
2817 ]
2818 ))
2819 );
2820 let pat = "(?x)( a )";
2821 assert_eq!(
2822 parser(pat).parse(),
2823 Ok(concat_with(
2824 span_range(pat, 0..pat.len()),
2825 vec![
2826 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2827 Ast::Group(ast::Group {
2828 span: span_range(pat, 4..pat.len()),
2829 kind: ast::GroupKind::CaptureIndex(1),
2830 ast: Box::new(lit_with('a', span_range(pat, 7..8))),
2831 }),
2832 ]
2833 ))
2834 );
2835 let pat = "(?x)( ?: a )";
2836 assert_eq!(
2837 parser(pat).parse(),
2838 Ok(concat_with(
2839 span_range(pat, 0..pat.len()),
2840 vec![
2841 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2842 Ast::Group(ast::Group {
2843 span: span_range(pat, 4..pat.len()),
2844 kind: ast::GroupKind::NonCapturing(ast::Flags {
2845 span: span_range(pat, 8..8),
2846 items: vec![],
2847 }),
2848 ast: Box::new(lit_with('a', span_range(pat, 11..12))),
2849 }),
2850 ]
2851 ))
2852 );
2853 let pat = r"(?x)\x { 53 }";
2854 assert_eq!(
2855 parser(pat).parse(),
2856 Ok(concat_with(
2857 span_range(pat, 0..pat.len()),
2858 vec![
2859 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2860 Ast::Literal(ast::Literal {
2861 span: span(4..13),
2862 kind: ast::LiteralKind::HexBrace(
2863 ast::HexLiteralKind::X
2864 ),
2865 c: 'S',
2866 }),
2867 ]
2868 ))
2869 );
2870
2871 // Test that whitespace after an escape is OK.
2872 let pat = r"(?x)\ ";
2873 assert_eq!(
2874 parser(pat).parse(),
2875 Ok(concat_with(
2876 span_range(pat, 0..pat.len()),
2877 vec![
2878 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2879 Ast::Literal(ast::Literal {
2880 span: span_range(pat, 4..6),
2881 kind: ast::LiteralKind::Special(
2882 ast::SpecialLiteralKind::Space
2883 ),
2884 c: ' ',
2885 }),
2886 ]
2887 ))
2888 );
2889 // ... but only when `x` mode is enabled.
2890 let pat = r"\ ";
2891 assert_eq!(
2892 parser(pat).parse().unwrap_err(),
2893 TestError {
2894 span: span_range(pat, 0..2),
2895 kind: ast::ErrorKind::EscapeUnrecognized,
2896 }
2897 );
2898 }
2899
2900 #[test]
parse_newlines()2901 fn parse_newlines() {
2902 let pat = ".\n.";
2903 assert_eq!(
2904 parser(pat).parse(),
2905 Ok(concat_with(
2906 span_range(pat, 0..3),
2907 vec![
2908 Ast::Dot(span_range(pat, 0..1)),
2909 lit_with('\n', span_range(pat, 1..2)),
2910 Ast::Dot(span_range(pat, 2..3)),
2911 ]
2912 ))
2913 );
2914
2915 let pat = "foobar\nbaz\nquux\n";
2916 assert_eq!(
2917 parser(pat).parse(),
2918 Ok(concat_with(
2919 span_range(pat, 0..pat.len()),
2920 vec![
2921 lit_with('f', nspan(npos(0, 1, 1), npos(1, 1, 2))),
2922 lit_with('o', nspan(npos(1, 1, 2), npos(2, 1, 3))),
2923 lit_with('o', nspan(npos(2, 1, 3), npos(3, 1, 4))),
2924 lit_with('b', nspan(npos(3, 1, 4), npos(4, 1, 5))),
2925 lit_with('a', nspan(npos(4, 1, 5), npos(5, 1, 6))),
2926 lit_with('r', nspan(npos(5, 1, 6), npos(6, 1, 7))),
2927 lit_with('\n', nspan(npos(6, 1, 7), npos(7, 2, 1))),
2928 lit_with('b', nspan(npos(7, 2, 1), npos(8, 2, 2))),
2929 lit_with('a', nspan(npos(8, 2, 2), npos(9, 2, 3))),
2930 lit_with('z', nspan(npos(9, 2, 3), npos(10, 2, 4))),
2931 lit_with('\n', nspan(npos(10, 2, 4), npos(11, 3, 1))),
2932 lit_with('q', nspan(npos(11, 3, 1), npos(12, 3, 2))),
2933 lit_with('u', nspan(npos(12, 3, 2), npos(13, 3, 3))),
2934 lit_with('u', nspan(npos(13, 3, 3), npos(14, 3, 4))),
2935 lit_with('x', nspan(npos(14, 3, 4), npos(15, 3, 5))),
2936 lit_with('\n', nspan(npos(15, 3, 5), npos(16, 4, 1))),
2937 ]
2938 ))
2939 );
2940 }
2941
2942 #[test]
parse_uncounted_repetition()2943 fn parse_uncounted_repetition() {
2944 assert_eq!(
2945 parser(r"a*").parse(),
2946 Ok(Ast::Repetition(ast::Repetition {
2947 span: span(0..2),
2948 op: ast::RepetitionOp {
2949 span: span(1..2),
2950 kind: ast::RepetitionKind::ZeroOrMore,
2951 },
2952 greedy: true,
2953 ast: Box::new(lit('a', 0)),
2954 }))
2955 );
2956 assert_eq!(
2957 parser(r"a+").parse(),
2958 Ok(Ast::Repetition(ast::Repetition {
2959 span: span(0..2),
2960 op: ast::RepetitionOp {
2961 span: span(1..2),
2962 kind: ast::RepetitionKind::OneOrMore,
2963 },
2964 greedy: true,
2965 ast: Box::new(lit('a', 0)),
2966 }))
2967 );
2968
2969 assert_eq!(
2970 parser(r"a?").parse(),
2971 Ok(Ast::Repetition(ast::Repetition {
2972 span: span(0..2),
2973 op: ast::RepetitionOp {
2974 span: span(1..2),
2975 kind: ast::RepetitionKind::ZeroOrOne,
2976 },
2977 greedy: true,
2978 ast: Box::new(lit('a', 0)),
2979 }))
2980 );
2981 assert_eq!(
2982 parser(r"a??").parse(),
2983 Ok(Ast::Repetition(ast::Repetition {
2984 span: span(0..3),
2985 op: ast::RepetitionOp {
2986 span: span(1..3),
2987 kind: ast::RepetitionKind::ZeroOrOne,
2988 },
2989 greedy: false,
2990 ast: Box::new(lit('a', 0)),
2991 }))
2992 );
2993 assert_eq!(
2994 parser(r"a?").parse(),
2995 Ok(Ast::Repetition(ast::Repetition {
2996 span: span(0..2),
2997 op: ast::RepetitionOp {
2998 span: span(1..2),
2999 kind: ast::RepetitionKind::ZeroOrOne,
3000 },
3001 greedy: true,
3002 ast: Box::new(lit('a', 0)),
3003 }))
3004 );
3005 assert_eq!(
3006 parser(r"a?b").parse(),
3007 Ok(concat(
3008 0..3,
3009 vec![
3010 Ast::Repetition(ast::Repetition {
3011 span: span(0..2),
3012 op: ast::RepetitionOp {
3013 span: span(1..2),
3014 kind: ast::RepetitionKind::ZeroOrOne,
3015 },
3016 greedy: true,
3017 ast: Box::new(lit('a', 0)),
3018 }),
3019 lit('b', 2),
3020 ]
3021 ))
3022 );
3023 assert_eq!(
3024 parser(r"a??b").parse(),
3025 Ok(concat(
3026 0..4,
3027 vec![
3028 Ast::Repetition(ast::Repetition {
3029 span: span(0..3),
3030 op: ast::RepetitionOp {
3031 span: span(1..3),
3032 kind: ast::RepetitionKind::ZeroOrOne,
3033 },
3034 greedy: false,
3035 ast: Box::new(lit('a', 0)),
3036 }),
3037 lit('b', 3),
3038 ]
3039 ))
3040 );
3041 assert_eq!(
3042 parser(r"ab?").parse(),
3043 Ok(concat(
3044 0..3,
3045 vec![
3046 lit('a', 0),
3047 Ast::Repetition(ast::Repetition {
3048 span: span(1..3),
3049 op: ast::RepetitionOp {
3050 span: span(2..3),
3051 kind: ast::RepetitionKind::ZeroOrOne,
3052 },
3053 greedy: true,
3054 ast: Box::new(lit('b', 1)),
3055 }),
3056 ]
3057 ))
3058 );
3059 assert_eq!(
3060 parser(r"(ab)?").parse(),
3061 Ok(Ast::Repetition(ast::Repetition {
3062 span: span(0..5),
3063 op: ast::RepetitionOp {
3064 span: span(4..5),
3065 kind: ast::RepetitionKind::ZeroOrOne,
3066 },
3067 greedy: true,
3068 ast: Box::new(group(
3069 0..4,
3070 1,
3071 concat(1..3, vec![lit('a', 1), lit('b', 2),])
3072 )),
3073 }))
3074 );
3075 assert_eq!(
3076 parser(r"|a?").parse(),
3077 Ok(alt(
3078 0..3,
3079 vec![
3080 Ast::Empty(span(0..0)),
3081 Ast::Repetition(ast::Repetition {
3082 span: span(1..3),
3083 op: ast::RepetitionOp {
3084 span: span(2..3),
3085 kind: ast::RepetitionKind::ZeroOrOne,
3086 },
3087 greedy: true,
3088 ast: Box::new(lit('a', 1)),
3089 }),
3090 ]
3091 ))
3092 );
3093
3094 assert_eq!(
3095 parser(r"*").parse().unwrap_err(),
3096 TestError {
3097 span: span(0..0),
3098 kind: ast::ErrorKind::RepetitionMissing,
3099 }
3100 );
3101 assert_eq!(
3102 parser(r"(?i)*").parse().unwrap_err(),
3103 TestError {
3104 span: span(4..4),
3105 kind: ast::ErrorKind::RepetitionMissing,
3106 }
3107 );
3108 assert_eq!(
3109 parser(r"(*)").parse().unwrap_err(),
3110 TestError {
3111 span: span(1..1),
3112 kind: ast::ErrorKind::RepetitionMissing,
3113 }
3114 );
3115 assert_eq!(
3116 parser(r"(?:?)").parse().unwrap_err(),
3117 TestError {
3118 span: span(3..3),
3119 kind: ast::ErrorKind::RepetitionMissing,
3120 }
3121 );
3122 assert_eq!(
3123 parser(r"+").parse().unwrap_err(),
3124 TestError {
3125 span: span(0..0),
3126 kind: ast::ErrorKind::RepetitionMissing,
3127 }
3128 );
3129 assert_eq!(
3130 parser(r"?").parse().unwrap_err(),
3131 TestError {
3132 span: span(0..0),
3133 kind: ast::ErrorKind::RepetitionMissing,
3134 }
3135 );
3136 assert_eq!(
3137 parser(r"(?)").parse().unwrap_err(),
3138 TestError {
3139 span: span(1..1),
3140 kind: ast::ErrorKind::RepetitionMissing,
3141 }
3142 );
3143 assert_eq!(
3144 parser(r"|*").parse().unwrap_err(),
3145 TestError {
3146 span: span(1..1),
3147 kind: ast::ErrorKind::RepetitionMissing,
3148 }
3149 );
3150 assert_eq!(
3151 parser(r"|+").parse().unwrap_err(),
3152 TestError {
3153 span: span(1..1),
3154 kind: ast::ErrorKind::RepetitionMissing,
3155 }
3156 );
3157 assert_eq!(
3158 parser(r"|?").parse().unwrap_err(),
3159 TestError {
3160 span: span(1..1),
3161 kind: ast::ErrorKind::RepetitionMissing,
3162 }
3163 );
3164 }
3165
3166 #[test]
parse_counted_repetition()3167 fn parse_counted_repetition() {
3168 assert_eq!(
3169 parser(r"a{5}").parse(),
3170 Ok(Ast::Repetition(ast::Repetition {
3171 span: span(0..4),
3172 op: ast::RepetitionOp {
3173 span: span(1..4),
3174 kind: ast::RepetitionKind::Range(
3175 ast::RepetitionRange::Exactly(5)
3176 ),
3177 },
3178 greedy: true,
3179 ast: Box::new(lit('a', 0)),
3180 }))
3181 );
3182 assert_eq!(
3183 parser(r"a{5,}").parse(),
3184 Ok(Ast::Repetition(ast::Repetition {
3185 span: span(0..5),
3186 op: ast::RepetitionOp {
3187 span: span(1..5),
3188 kind: ast::RepetitionKind::Range(
3189 ast::RepetitionRange::AtLeast(5)
3190 ),
3191 },
3192 greedy: true,
3193 ast: Box::new(lit('a', 0)),
3194 }))
3195 );
3196 assert_eq!(
3197 parser(r"a{5,9}").parse(),
3198 Ok(Ast::Repetition(ast::Repetition {
3199 span: span(0..6),
3200 op: ast::RepetitionOp {
3201 span: span(1..6),
3202 kind: ast::RepetitionKind::Range(
3203 ast::RepetitionRange::Bounded(5, 9)
3204 ),
3205 },
3206 greedy: true,
3207 ast: Box::new(lit('a', 0)),
3208 }))
3209 );
3210 assert_eq!(
3211 parser(r"a{5}?").parse(),
3212 Ok(Ast::Repetition(ast::Repetition {
3213 span: span(0..5),
3214 op: ast::RepetitionOp {
3215 span: span(1..5),
3216 kind: ast::RepetitionKind::Range(
3217 ast::RepetitionRange::Exactly(5)
3218 ),
3219 },
3220 greedy: false,
3221 ast: Box::new(lit('a', 0)),
3222 }))
3223 );
3224 assert_eq!(
3225 parser(r"ab{5}").parse(),
3226 Ok(concat(
3227 0..5,
3228 vec![
3229 lit('a', 0),
3230 Ast::Repetition(ast::Repetition {
3231 span: span(1..5),
3232 op: ast::RepetitionOp {
3233 span: span(2..5),
3234 kind: ast::RepetitionKind::Range(
3235 ast::RepetitionRange::Exactly(5)
3236 ),
3237 },
3238 greedy: true,
3239 ast: Box::new(lit('b', 1)),
3240 }),
3241 ]
3242 ))
3243 );
3244 assert_eq!(
3245 parser(r"ab{5}c").parse(),
3246 Ok(concat(
3247 0..6,
3248 vec![
3249 lit('a', 0),
3250 Ast::Repetition(ast::Repetition {
3251 span: span(1..5),
3252 op: ast::RepetitionOp {
3253 span: span(2..5),
3254 kind: ast::RepetitionKind::Range(
3255 ast::RepetitionRange::Exactly(5)
3256 ),
3257 },
3258 greedy: true,
3259 ast: Box::new(lit('b', 1)),
3260 }),
3261 lit('c', 5),
3262 ]
3263 ))
3264 );
3265
3266 assert_eq!(
3267 parser(r"a{ 5 }").parse(),
3268 Ok(Ast::Repetition(ast::Repetition {
3269 span: span(0..6),
3270 op: ast::RepetitionOp {
3271 span: span(1..6),
3272 kind: ast::RepetitionKind::Range(
3273 ast::RepetitionRange::Exactly(5)
3274 ),
3275 },
3276 greedy: true,
3277 ast: Box::new(lit('a', 0)),
3278 }))
3279 );
3280 assert_eq!(
3281 parser(r"a{ 5 , 9 }").parse(),
3282 Ok(Ast::Repetition(ast::Repetition {
3283 span: span(0..10),
3284 op: ast::RepetitionOp {
3285 span: span(1..10),
3286 kind: ast::RepetitionKind::Range(
3287 ast::RepetitionRange::Bounded(5, 9)
3288 ),
3289 },
3290 greedy: true,
3291 ast: Box::new(lit('a', 0)),
3292 }))
3293 );
3294 assert_eq!(
3295 parser_ignore_whitespace(r"a{5,9} ?").parse(),
3296 Ok(Ast::Repetition(ast::Repetition {
3297 span: span(0..8),
3298 op: ast::RepetitionOp {
3299 span: span(1..8),
3300 kind: ast::RepetitionKind::Range(
3301 ast::RepetitionRange::Bounded(5, 9)
3302 ),
3303 },
3304 greedy: false,
3305 ast: Box::new(lit('a', 0)),
3306 }))
3307 );
3308
3309 assert_eq!(
3310 parser(r"(?i){0}").parse().unwrap_err(),
3311 TestError {
3312 span: span(4..4),
3313 kind: ast::ErrorKind::RepetitionMissing,
3314 }
3315 );
3316 assert_eq!(
3317 parser(r"(?m){1,1}").parse().unwrap_err(),
3318 TestError {
3319 span: span(4..4),
3320 kind: ast::ErrorKind::RepetitionMissing,
3321 }
3322 );
3323 assert_eq!(
3324 parser(r"a{]}").parse().unwrap_err(),
3325 TestError {
3326 span: span(2..2),
3327 kind: ast::ErrorKind::RepetitionCountDecimalEmpty,
3328 }
3329 );
3330 assert_eq!(
3331 parser(r"a{1,]}").parse().unwrap_err(),
3332 TestError {
3333 span: span(4..4),
3334 kind: ast::ErrorKind::RepetitionCountDecimalEmpty,
3335 }
3336 );
3337 assert_eq!(
3338 parser(r"a{").parse().unwrap_err(),
3339 TestError {
3340 span: span(1..2),
3341 kind: ast::ErrorKind::RepetitionCountUnclosed,
3342 }
3343 );
3344 assert_eq!(
3345 parser(r"a{}").parse().unwrap_err(),
3346 TestError {
3347 span: span(2..2),
3348 kind: ast::ErrorKind::RepetitionCountDecimalEmpty,
3349 }
3350 );
3351 assert_eq!(
3352 parser(r"a{a").parse().unwrap_err(),
3353 TestError {
3354 span: span(2..2),
3355 kind: ast::ErrorKind::RepetitionCountDecimalEmpty,
3356 }
3357 );
3358 assert_eq!(
3359 parser(r"a{9999999999}").parse().unwrap_err(),
3360 TestError {
3361 span: span(2..12),
3362 kind: ast::ErrorKind::DecimalInvalid,
3363 }
3364 );
3365 assert_eq!(
3366 parser(r"a{9").parse().unwrap_err(),
3367 TestError {
3368 span: span(1..3),
3369 kind: ast::ErrorKind::RepetitionCountUnclosed,
3370 }
3371 );
3372 assert_eq!(
3373 parser(r"a{9,a").parse().unwrap_err(),
3374 TestError {
3375 span: span(4..4),
3376 kind: ast::ErrorKind::RepetitionCountDecimalEmpty,
3377 }
3378 );
3379 assert_eq!(
3380 parser(r"a{9,9999999999}").parse().unwrap_err(),
3381 TestError {
3382 span: span(4..14),
3383 kind: ast::ErrorKind::DecimalInvalid,
3384 }
3385 );
3386 assert_eq!(
3387 parser(r"a{9,").parse().unwrap_err(),
3388 TestError {
3389 span: span(1..4),
3390 kind: ast::ErrorKind::RepetitionCountUnclosed,
3391 }
3392 );
3393 assert_eq!(
3394 parser(r"a{9,11").parse().unwrap_err(),
3395 TestError {
3396 span: span(1..6),
3397 kind: ast::ErrorKind::RepetitionCountUnclosed,
3398 }
3399 );
3400 assert_eq!(
3401 parser(r"a{2,1}").parse().unwrap_err(),
3402 TestError {
3403 span: span(1..6),
3404 kind: ast::ErrorKind::RepetitionCountInvalid,
3405 }
3406 );
3407 assert_eq!(
3408 parser(r"{5}").parse().unwrap_err(),
3409 TestError {
3410 span: span(0..0),
3411 kind: ast::ErrorKind::RepetitionMissing,
3412 }
3413 );
3414 assert_eq!(
3415 parser(r"|{5}").parse().unwrap_err(),
3416 TestError {
3417 span: span(1..1),
3418 kind: ast::ErrorKind::RepetitionMissing,
3419 }
3420 );
3421 }
3422
3423 #[test]
parse_alternate()3424 fn parse_alternate() {
3425 assert_eq!(
3426 parser(r"a|b").parse(),
3427 Ok(Ast::Alternation(ast::Alternation {
3428 span: span(0..3),
3429 asts: vec![lit('a', 0), lit('b', 2)],
3430 }))
3431 );
3432 assert_eq!(
3433 parser(r"(a|b)").parse(),
3434 Ok(group(
3435 0..5,
3436 1,
3437 Ast::Alternation(ast::Alternation {
3438 span: span(1..4),
3439 asts: vec![lit('a', 1), lit('b', 3)],
3440 })
3441 ))
3442 );
3443
3444 assert_eq!(
3445 parser(r"a|b|c").parse(),
3446 Ok(Ast::Alternation(ast::Alternation {
3447 span: span(0..5),
3448 asts: vec![lit('a', 0), lit('b', 2), lit('c', 4)],
3449 }))
3450 );
3451 assert_eq!(
3452 parser(r"ax|by|cz").parse(),
3453 Ok(Ast::Alternation(ast::Alternation {
3454 span: span(0..8),
3455 asts: vec![
3456 concat(0..2, vec![lit('a', 0), lit('x', 1)]),
3457 concat(3..5, vec![lit('b', 3), lit('y', 4)]),
3458 concat(6..8, vec![lit('c', 6), lit('z', 7)]),
3459 ],
3460 }))
3461 );
3462 assert_eq!(
3463 parser(r"(ax|by|cz)").parse(),
3464 Ok(group(
3465 0..10,
3466 1,
3467 Ast::Alternation(ast::Alternation {
3468 span: span(1..9),
3469 asts: vec![
3470 concat(1..3, vec![lit('a', 1), lit('x', 2)]),
3471 concat(4..6, vec![lit('b', 4), lit('y', 5)]),
3472 concat(7..9, vec![lit('c', 7), lit('z', 8)]),
3473 ],
3474 })
3475 ))
3476 );
3477 assert_eq!(
3478 parser(r"(ax|(by|(cz)))").parse(),
3479 Ok(group(
3480 0..14,
3481 1,
3482 alt(
3483 1..13,
3484 vec![
3485 concat(1..3, vec![lit('a', 1), lit('x', 2)]),
3486 group(
3487 4..13,
3488 2,
3489 alt(
3490 5..12,
3491 vec![
3492 concat(
3493 5..7,
3494 vec![lit('b', 5), lit('y', 6)]
3495 ),
3496 group(
3497 8..12,
3498 3,
3499 concat(
3500 9..11,
3501 vec![lit('c', 9), lit('z', 10),]
3502 )
3503 ),
3504 ]
3505 )
3506 ),
3507 ]
3508 )
3509 ))
3510 );
3511
3512 assert_eq!(
3513 parser(r"|").parse(),
3514 Ok(alt(
3515 0..1,
3516 vec![Ast::Empty(span(0..0)), Ast::Empty(span(1..1)),]
3517 ))
3518 );
3519 assert_eq!(
3520 parser(r"||").parse(),
3521 Ok(alt(
3522 0..2,
3523 vec![
3524 Ast::Empty(span(0..0)),
3525 Ast::Empty(span(1..1)),
3526 Ast::Empty(span(2..2)),
3527 ]
3528 ))
3529 );
3530 assert_eq!(
3531 parser(r"a|").parse(),
3532 Ok(alt(0..2, vec![lit('a', 0), Ast::Empty(span(2..2)),]))
3533 );
3534 assert_eq!(
3535 parser(r"|a").parse(),
3536 Ok(alt(0..2, vec![Ast::Empty(span(0..0)), lit('a', 1),]))
3537 );
3538
3539 assert_eq!(
3540 parser(r"(|)").parse(),
3541 Ok(group(
3542 0..3,
3543 1,
3544 alt(
3545 1..2,
3546 vec![Ast::Empty(span(1..1)), Ast::Empty(span(2..2)),]
3547 )
3548 ))
3549 );
3550 assert_eq!(
3551 parser(r"(a|)").parse(),
3552 Ok(group(
3553 0..4,
3554 1,
3555 alt(1..3, vec![lit('a', 1), Ast::Empty(span(3..3)),])
3556 ))
3557 );
3558 assert_eq!(
3559 parser(r"(|a)").parse(),
3560 Ok(group(
3561 0..4,
3562 1,
3563 alt(1..3, vec![Ast::Empty(span(1..1)), lit('a', 2),])
3564 ))
3565 );
3566
3567 assert_eq!(
3568 parser(r"a|b)").parse().unwrap_err(),
3569 TestError {
3570 span: span(3..4),
3571 kind: ast::ErrorKind::GroupUnopened,
3572 }
3573 );
3574 assert_eq!(
3575 parser(r"(a|b").parse().unwrap_err(),
3576 TestError {
3577 span: span(0..1),
3578 kind: ast::ErrorKind::GroupUnclosed,
3579 }
3580 );
3581 }
3582
3583 #[test]
parse_unsupported_lookaround()3584 fn parse_unsupported_lookaround() {
3585 assert_eq!(
3586 parser(r"(?=a)").parse().unwrap_err(),
3587 TestError {
3588 span: span(0..3),
3589 kind: ast::ErrorKind::UnsupportedLookAround,
3590 }
3591 );
3592 assert_eq!(
3593 parser(r"(?!a)").parse().unwrap_err(),
3594 TestError {
3595 span: span(0..3),
3596 kind: ast::ErrorKind::UnsupportedLookAround,
3597 }
3598 );
3599 assert_eq!(
3600 parser(r"(?<=a)").parse().unwrap_err(),
3601 TestError {
3602 span: span(0..4),
3603 kind: ast::ErrorKind::UnsupportedLookAround,
3604 }
3605 );
3606 assert_eq!(
3607 parser(r"(?<!a)").parse().unwrap_err(),
3608 TestError {
3609 span: span(0..4),
3610 kind: ast::ErrorKind::UnsupportedLookAround,
3611 }
3612 );
3613 }
3614
3615 #[test]
parse_group()3616 fn parse_group() {
3617 assert_eq!(
3618 parser("(?i)").parse(),
3619 Ok(Ast::Flags(ast::SetFlags {
3620 span: span(0..4),
3621 flags: ast::Flags {
3622 span: span(2..3),
3623 items: vec![ast::FlagsItem {
3624 span: span(2..3),
3625 kind: ast::FlagsItemKind::Flag(
3626 ast::Flag::CaseInsensitive
3627 ),
3628 }],
3629 },
3630 }))
3631 );
3632 assert_eq!(
3633 parser("(?iU)").parse(),
3634 Ok(Ast::Flags(ast::SetFlags {
3635 span: span(0..5),
3636 flags: ast::Flags {
3637 span: span(2..4),
3638 items: vec![
3639 ast::FlagsItem {
3640 span: span(2..3),
3641 kind: ast::FlagsItemKind::Flag(
3642 ast::Flag::CaseInsensitive
3643 ),
3644 },
3645 ast::FlagsItem {
3646 span: span(3..4),
3647 kind: ast::FlagsItemKind::Flag(
3648 ast::Flag::SwapGreed
3649 ),
3650 },
3651 ],
3652 },
3653 }))
3654 );
3655 assert_eq!(
3656 parser("(?i-U)").parse(),
3657 Ok(Ast::Flags(ast::SetFlags {
3658 span: span(0..6),
3659 flags: ast::Flags {
3660 span: span(2..5),
3661 items: vec![
3662 ast::FlagsItem {
3663 span: span(2..3),
3664 kind: ast::FlagsItemKind::Flag(
3665 ast::Flag::CaseInsensitive
3666 ),
3667 },
3668 ast::FlagsItem {
3669 span: span(3..4),
3670 kind: ast::FlagsItemKind::Negation,
3671 },
3672 ast::FlagsItem {
3673 span: span(4..5),
3674 kind: ast::FlagsItemKind::Flag(
3675 ast::Flag::SwapGreed
3676 ),
3677 },
3678 ],
3679 },
3680 }))
3681 );
3682
3683 assert_eq!(
3684 parser("()").parse(),
3685 Ok(Ast::Group(ast::Group {
3686 span: span(0..2),
3687 kind: ast::GroupKind::CaptureIndex(1),
3688 ast: Box::new(Ast::Empty(span(1..1))),
3689 }))
3690 );
3691 assert_eq!(
3692 parser("(a)").parse(),
3693 Ok(Ast::Group(ast::Group {
3694 span: span(0..3),
3695 kind: ast::GroupKind::CaptureIndex(1),
3696 ast: Box::new(lit('a', 1)),
3697 }))
3698 );
3699 assert_eq!(
3700 parser("(())").parse(),
3701 Ok(Ast::Group(ast::Group {
3702 span: span(0..4),
3703 kind: ast::GroupKind::CaptureIndex(1),
3704 ast: Box::new(Ast::Group(ast::Group {
3705 span: span(1..3),
3706 kind: ast::GroupKind::CaptureIndex(2),
3707 ast: Box::new(Ast::Empty(span(2..2))),
3708 })),
3709 }))
3710 );
3711
3712 assert_eq!(
3713 parser("(?:a)").parse(),
3714 Ok(Ast::Group(ast::Group {
3715 span: span(0..5),
3716 kind: ast::GroupKind::NonCapturing(ast::Flags {
3717 span: span(2..2),
3718 items: vec![],
3719 }),
3720 ast: Box::new(lit('a', 3)),
3721 }))
3722 );
3723
3724 assert_eq!(
3725 parser("(?i:a)").parse(),
3726 Ok(Ast::Group(ast::Group {
3727 span: span(0..6),
3728 kind: ast::GroupKind::NonCapturing(ast::Flags {
3729 span: span(2..3),
3730 items: vec![ast::FlagsItem {
3731 span: span(2..3),
3732 kind: ast::FlagsItemKind::Flag(
3733 ast::Flag::CaseInsensitive
3734 ),
3735 },],
3736 }),
3737 ast: Box::new(lit('a', 4)),
3738 }))
3739 );
3740 assert_eq!(
3741 parser("(?i-U:a)").parse(),
3742 Ok(Ast::Group(ast::Group {
3743 span: span(0..8),
3744 kind: ast::GroupKind::NonCapturing(ast::Flags {
3745 span: span(2..5),
3746 items: vec![
3747 ast::FlagsItem {
3748 span: span(2..3),
3749 kind: ast::FlagsItemKind::Flag(
3750 ast::Flag::CaseInsensitive
3751 ),
3752 },
3753 ast::FlagsItem {
3754 span: span(3..4),
3755 kind: ast::FlagsItemKind::Negation,
3756 },
3757 ast::FlagsItem {
3758 span: span(4..5),
3759 kind: ast::FlagsItemKind::Flag(
3760 ast::Flag::SwapGreed
3761 ),
3762 },
3763 ],
3764 }),
3765 ast: Box::new(lit('a', 6)),
3766 }))
3767 );
3768
3769 assert_eq!(
3770 parser("(").parse().unwrap_err(),
3771 TestError {
3772 span: span(0..1),
3773 kind: ast::ErrorKind::GroupUnclosed,
3774 }
3775 );
3776 assert_eq!(
3777 parser("(?").parse().unwrap_err(),
3778 TestError {
3779 span: span(0..1),
3780 kind: ast::ErrorKind::GroupUnclosed,
3781 }
3782 );
3783 assert_eq!(
3784 parser("(?P").parse().unwrap_err(),
3785 TestError {
3786 span: span(2..3),
3787 kind: ast::ErrorKind::FlagUnrecognized,
3788 }
3789 );
3790 assert_eq!(
3791 parser("(?P<").parse().unwrap_err(),
3792 TestError {
3793 span: span(4..4),
3794 kind: ast::ErrorKind::GroupNameUnexpectedEof,
3795 }
3796 );
3797 assert_eq!(
3798 parser("(a").parse().unwrap_err(),
3799 TestError {
3800 span: span(0..1),
3801 kind: ast::ErrorKind::GroupUnclosed,
3802 }
3803 );
3804 assert_eq!(
3805 parser("(()").parse().unwrap_err(),
3806 TestError {
3807 span: span(0..1),
3808 kind: ast::ErrorKind::GroupUnclosed,
3809 }
3810 );
3811 assert_eq!(
3812 parser(")").parse().unwrap_err(),
3813 TestError {
3814 span: span(0..1),
3815 kind: ast::ErrorKind::GroupUnopened,
3816 }
3817 );
3818 assert_eq!(
3819 parser("a)").parse().unwrap_err(),
3820 TestError {
3821 span: span(1..2),
3822 kind: ast::ErrorKind::GroupUnopened,
3823 }
3824 );
3825 }
3826
3827 #[test]
parse_capture_name()3828 fn parse_capture_name() {
3829 assert_eq!(
3830 parser("(?P<a>z)").parse(),
3831 Ok(Ast::Group(ast::Group {
3832 span: span(0..8),
3833 kind: ast::GroupKind::CaptureName(ast::CaptureName {
3834 span: span(4..5),
3835 name: s("a"),
3836 index: 1,
3837 }),
3838 ast: Box::new(lit('z', 6)),
3839 }))
3840 );
3841 assert_eq!(
3842 parser("(?P<abc>z)").parse(),
3843 Ok(Ast::Group(ast::Group {
3844 span: span(0..10),
3845 kind: ast::GroupKind::CaptureName(ast::CaptureName {
3846 span: span(4..7),
3847 name: s("abc"),
3848 index: 1,
3849 }),
3850 ast: Box::new(lit('z', 8)),
3851 }))
3852 );
3853
3854 assert_eq!(
3855 parser("(?P<").parse().unwrap_err(),
3856 TestError {
3857 span: span(4..4),
3858 kind: ast::ErrorKind::GroupNameUnexpectedEof,
3859 }
3860 );
3861 assert_eq!(
3862 parser("(?P<>z)").parse().unwrap_err(),
3863 TestError {
3864 span: span(4..4),
3865 kind: ast::ErrorKind::GroupNameEmpty,
3866 }
3867 );
3868 assert_eq!(
3869 parser("(?P<a").parse().unwrap_err(),
3870 TestError {
3871 span: span(5..5),
3872 kind: ast::ErrorKind::GroupNameUnexpectedEof,
3873 }
3874 );
3875 assert_eq!(
3876 parser("(?P<ab").parse().unwrap_err(),
3877 TestError {
3878 span: span(6..6),
3879 kind: ast::ErrorKind::GroupNameUnexpectedEof,
3880 }
3881 );
3882 assert_eq!(
3883 parser("(?P<0a").parse().unwrap_err(),
3884 TestError {
3885 span: span(4..5),
3886 kind: ast::ErrorKind::GroupNameInvalid,
3887 }
3888 );
3889 assert_eq!(
3890 parser("(?P<~").parse().unwrap_err(),
3891 TestError {
3892 span: span(4..5),
3893 kind: ast::ErrorKind::GroupNameInvalid,
3894 }
3895 );
3896 assert_eq!(
3897 parser("(?P<abc~").parse().unwrap_err(),
3898 TestError {
3899 span: span(7..8),
3900 kind: ast::ErrorKind::GroupNameInvalid,
3901 }
3902 );
3903 assert_eq!(
3904 parser("(?P<a>y)(?P<a>z)").parse().unwrap_err(),
3905 TestError {
3906 span: span(12..13),
3907 kind: ast::ErrorKind::GroupNameDuplicate {
3908 original: span(4..5),
3909 },
3910 }
3911 );
3912 }
3913
3914 #[test]
parse_flags()3915 fn parse_flags() {
3916 assert_eq!(
3917 parser("i:").parse_flags(),
3918 Ok(ast::Flags {
3919 span: span(0..1),
3920 items: vec![ast::FlagsItem {
3921 span: span(0..1),
3922 kind: ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive),
3923 }],
3924 })
3925 );
3926 assert_eq!(
3927 parser("i)").parse_flags(),
3928 Ok(ast::Flags {
3929 span: span(0..1),
3930 items: vec![ast::FlagsItem {
3931 span: span(0..1),
3932 kind: ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive),
3933 }],
3934 })
3935 );
3936
3937 assert_eq!(
3938 parser("isU:").parse_flags(),
3939 Ok(ast::Flags {
3940 span: span(0..3),
3941 items: vec![
3942 ast::FlagsItem {
3943 span: span(0..1),
3944 kind: ast::FlagsItemKind::Flag(
3945 ast::Flag::CaseInsensitive
3946 ),
3947 },
3948 ast::FlagsItem {
3949 span: span(1..2),
3950 kind: ast::FlagsItemKind::Flag(
3951 ast::Flag::DotMatchesNewLine
3952 ),
3953 },
3954 ast::FlagsItem {
3955 span: span(2..3),
3956 kind: ast::FlagsItemKind::Flag(ast::Flag::SwapGreed),
3957 },
3958 ],
3959 })
3960 );
3961
3962 assert_eq!(
3963 parser("-isU:").parse_flags(),
3964 Ok(ast::Flags {
3965 span: span(0..4),
3966 items: vec![
3967 ast::FlagsItem {
3968 span: span(0..1),
3969 kind: ast::FlagsItemKind::Negation,
3970 },
3971 ast::FlagsItem {
3972 span: span(1..2),
3973 kind: ast::FlagsItemKind::Flag(
3974 ast::Flag::CaseInsensitive
3975 ),
3976 },
3977 ast::FlagsItem {
3978 span: span(2..3),
3979 kind: ast::FlagsItemKind::Flag(
3980 ast::Flag::DotMatchesNewLine
3981 ),
3982 },
3983 ast::FlagsItem {
3984 span: span(3..4),
3985 kind: ast::FlagsItemKind::Flag(ast::Flag::SwapGreed),
3986 },
3987 ],
3988 })
3989 );
3990 assert_eq!(
3991 parser("i-sU:").parse_flags(),
3992 Ok(ast::Flags {
3993 span: span(0..4),
3994 items: vec![
3995 ast::FlagsItem {
3996 span: span(0..1),
3997 kind: ast::FlagsItemKind::Flag(
3998 ast::Flag::CaseInsensitive
3999 ),
4000 },
4001 ast::FlagsItem {
4002 span: span(1..2),
4003 kind: ast::FlagsItemKind::Negation,
4004 },
4005 ast::FlagsItem {
4006 span: span(2..3),
4007 kind: ast::FlagsItemKind::Flag(
4008 ast::Flag::DotMatchesNewLine
4009 ),
4010 },
4011 ast::FlagsItem {
4012 span: span(3..4),
4013 kind: ast::FlagsItemKind::Flag(ast::Flag::SwapGreed),
4014 },
4015 ],
4016 })
4017 );
4018
4019 assert_eq!(
4020 parser("isU").parse_flags().unwrap_err(),
4021 TestError {
4022 span: span(3..3),
4023 kind: ast::ErrorKind::FlagUnexpectedEof,
4024 }
4025 );
4026 assert_eq!(
4027 parser("isUa:").parse_flags().unwrap_err(),
4028 TestError {
4029 span: span(3..4),
4030 kind: ast::ErrorKind::FlagUnrecognized,
4031 }
4032 );
4033 assert_eq!(
4034 parser("isUi:").parse_flags().unwrap_err(),
4035 TestError {
4036 span: span(3..4),
4037 kind: ast::ErrorKind::FlagDuplicate { original: span(0..1) },
4038 }
4039 );
4040 assert_eq!(
4041 parser("i-sU-i:").parse_flags().unwrap_err(),
4042 TestError {
4043 span: span(4..5),
4044 kind: ast::ErrorKind::FlagRepeatedNegation {
4045 original: span(1..2),
4046 },
4047 }
4048 );
4049 assert_eq!(
4050 parser("-)").parse_flags().unwrap_err(),
4051 TestError {
4052 span: span(0..1),
4053 kind: ast::ErrorKind::FlagDanglingNegation,
4054 }
4055 );
4056 assert_eq!(
4057 parser("i-)").parse_flags().unwrap_err(),
4058 TestError {
4059 span: span(1..2),
4060 kind: ast::ErrorKind::FlagDanglingNegation,
4061 }
4062 );
4063 assert_eq!(
4064 parser("iU-)").parse_flags().unwrap_err(),
4065 TestError {
4066 span: span(2..3),
4067 kind: ast::ErrorKind::FlagDanglingNegation,
4068 }
4069 );
4070 }
4071
4072 #[test]
parse_flag()4073 fn parse_flag() {
4074 assert_eq!(parser("i").parse_flag(), Ok(ast::Flag::CaseInsensitive));
4075 assert_eq!(parser("m").parse_flag(), Ok(ast::Flag::MultiLine));
4076 assert_eq!(parser("s").parse_flag(), Ok(ast::Flag::DotMatchesNewLine));
4077 assert_eq!(parser("U").parse_flag(), Ok(ast::Flag::SwapGreed));
4078 assert_eq!(parser("u").parse_flag(), Ok(ast::Flag::Unicode));
4079 assert_eq!(parser("x").parse_flag(), Ok(ast::Flag::IgnoreWhitespace));
4080
4081 assert_eq!(
4082 parser("a").parse_flag().unwrap_err(),
4083 TestError {
4084 span: span(0..1),
4085 kind: ast::ErrorKind::FlagUnrecognized,
4086 }
4087 );
4088 assert_eq!(
4089 parser("☃").parse_flag().unwrap_err(),
4090 TestError {
4091 span: span_range("☃", 0..3),
4092 kind: ast::ErrorKind::FlagUnrecognized,
4093 }
4094 );
4095 }
4096
4097 #[test]
parse_primitive_non_escape()4098 fn parse_primitive_non_escape() {
4099 assert_eq!(
4100 parser(r".").parse_primitive(),
4101 Ok(Primitive::Dot(span(0..1)))
4102 );
4103 assert_eq!(
4104 parser(r"^").parse_primitive(),
4105 Ok(Primitive::Assertion(ast::Assertion {
4106 span: span(0..1),
4107 kind: ast::AssertionKind::StartLine,
4108 }))
4109 );
4110 assert_eq!(
4111 parser(r"$").parse_primitive(),
4112 Ok(Primitive::Assertion(ast::Assertion {
4113 span: span(0..1),
4114 kind: ast::AssertionKind::EndLine,
4115 }))
4116 );
4117
4118 assert_eq!(
4119 parser(r"a").parse_primitive(),
4120 Ok(Primitive::Literal(ast::Literal {
4121 span: span(0..1),
4122 kind: ast::LiteralKind::Verbatim,
4123 c: 'a',
4124 }))
4125 );
4126 assert_eq!(
4127 parser(r"|").parse_primitive(),
4128 Ok(Primitive::Literal(ast::Literal {
4129 span: span(0..1),
4130 kind: ast::LiteralKind::Verbatim,
4131 c: '|',
4132 }))
4133 );
4134 assert_eq!(
4135 parser(r"☃").parse_primitive(),
4136 Ok(Primitive::Literal(ast::Literal {
4137 span: span_range("☃", 0..3),
4138 kind: ast::LiteralKind::Verbatim,
4139 c: '☃',
4140 }))
4141 );
4142 }
4143
4144 #[test]
parse_escape()4145 fn parse_escape() {
4146 assert_eq!(
4147 parser(r"\|").parse_primitive(),
4148 Ok(Primitive::Literal(ast::Literal {
4149 span: span(0..2),
4150 kind: ast::LiteralKind::Punctuation,
4151 c: '|',
4152 }))
4153 );
4154 let specials = &[
4155 (r"\a", '\x07', ast::SpecialLiteralKind::Bell),
4156 (r"\f", '\x0C', ast::SpecialLiteralKind::FormFeed),
4157 (r"\t", '\t', ast::SpecialLiteralKind::Tab),
4158 (r"\n", '\n', ast::SpecialLiteralKind::LineFeed),
4159 (r"\r", '\r', ast::SpecialLiteralKind::CarriageReturn),
4160 (r"\v", '\x0B', ast::SpecialLiteralKind::VerticalTab),
4161 ];
4162 for &(pat, c, ref kind) in specials {
4163 assert_eq!(
4164 parser(pat).parse_primitive(),
4165 Ok(Primitive::Literal(ast::Literal {
4166 span: span(0..2),
4167 kind: ast::LiteralKind::Special(kind.clone()),
4168 c: c,
4169 }))
4170 );
4171 }
4172 assert_eq!(
4173 parser(r"\A").parse_primitive(),
4174 Ok(Primitive::Assertion(ast::Assertion {
4175 span: span(0..2),
4176 kind: ast::AssertionKind::StartText,
4177 }))
4178 );
4179 assert_eq!(
4180 parser(r"\z").parse_primitive(),
4181 Ok(Primitive::Assertion(ast::Assertion {
4182 span: span(0..2),
4183 kind: ast::AssertionKind::EndText,
4184 }))
4185 );
4186 assert_eq!(
4187 parser(r"\b").parse_primitive(),
4188 Ok(Primitive::Assertion(ast::Assertion {
4189 span: span(0..2),
4190 kind: ast::AssertionKind::WordBoundary,
4191 }))
4192 );
4193 assert_eq!(
4194 parser(r"\B").parse_primitive(),
4195 Ok(Primitive::Assertion(ast::Assertion {
4196 span: span(0..2),
4197 kind: ast::AssertionKind::NotWordBoundary,
4198 }))
4199 );
4200
4201 assert_eq!(
4202 parser(r"\").parse_escape().unwrap_err(),
4203 TestError {
4204 span: span(0..1),
4205 kind: ast::ErrorKind::EscapeUnexpectedEof,
4206 }
4207 );
4208 assert_eq!(
4209 parser(r"\y").parse_escape().unwrap_err(),
4210 TestError {
4211 span: span(0..2),
4212 kind: ast::ErrorKind::EscapeUnrecognized,
4213 }
4214 );
4215 }
4216
4217 #[test]
parse_unsupported_backreference()4218 fn parse_unsupported_backreference() {
4219 assert_eq!(
4220 parser(r"\0").parse_escape().unwrap_err(),
4221 TestError {
4222 span: span(0..2),
4223 kind: ast::ErrorKind::UnsupportedBackreference,
4224 }
4225 );
4226 assert_eq!(
4227 parser(r"\9").parse_escape().unwrap_err(),
4228 TestError {
4229 span: span(0..2),
4230 kind: ast::ErrorKind::UnsupportedBackreference,
4231 }
4232 );
4233 }
4234
4235 #[test]
parse_octal()4236 fn parse_octal() {
4237 for i in 0..511 {
4238 let pat = format!(r"\{:o}", i);
4239 assert_eq!(
4240 parser_octal(&pat).parse_escape(),
4241 Ok(Primitive::Literal(ast::Literal {
4242 span: span(0..pat.len()),
4243 kind: ast::LiteralKind::Octal,
4244 c: ::std::char::from_u32(i).unwrap(),
4245 }))
4246 );
4247 }
4248 assert_eq!(
4249 parser_octal(r"\778").parse_escape(),
4250 Ok(Primitive::Literal(ast::Literal {
4251 span: span(0..3),
4252 kind: ast::LiteralKind::Octal,
4253 c: '?',
4254 }))
4255 );
4256 assert_eq!(
4257 parser_octal(r"\7777").parse_escape(),
4258 Ok(Primitive::Literal(ast::Literal {
4259 span: span(0..4),
4260 kind: ast::LiteralKind::Octal,
4261 c: '\u{01FF}',
4262 }))
4263 );
4264 assert_eq!(
4265 parser_octal(r"\778").parse(),
4266 Ok(Ast::Concat(ast::Concat {
4267 span: span(0..4),
4268 asts: vec![
4269 Ast::Literal(ast::Literal {
4270 span: span(0..3),
4271 kind: ast::LiteralKind::Octal,
4272 c: '?',
4273 }),
4274 Ast::Literal(ast::Literal {
4275 span: span(3..4),
4276 kind: ast::LiteralKind::Verbatim,
4277 c: '8',
4278 }),
4279 ],
4280 }))
4281 );
4282 assert_eq!(
4283 parser_octal(r"\7777").parse(),
4284 Ok(Ast::Concat(ast::Concat {
4285 span: span(0..5),
4286 asts: vec![
4287 Ast::Literal(ast::Literal {
4288 span: span(0..4),
4289 kind: ast::LiteralKind::Octal,
4290 c: '\u{01FF}',
4291 }),
4292 Ast::Literal(ast::Literal {
4293 span: span(4..5),
4294 kind: ast::LiteralKind::Verbatim,
4295 c: '7',
4296 }),
4297 ],
4298 }))
4299 );
4300
4301 assert_eq!(
4302 parser_octal(r"\8").parse_escape().unwrap_err(),
4303 TestError {
4304 span: span(0..2),
4305 kind: ast::ErrorKind::EscapeUnrecognized,
4306 }
4307 );
4308 }
4309
4310 #[test]
parse_hex_two()4311 fn parse_hex_two() {
4312 for i in 0..256 {
4313 let pat = format!(r"\x{:02x}", i);
4314 assert_eq!(
4315 parser(&pat).parse_escape(),
4316 Ok(Primitive::Literal(ast::Literal {
4317 span: span(0..pat.len()),
4318 kind: ast::LiteralKind::HexFixed(ast::HexLiteralKind::X),
4319 c: ::std::char::from_u32(i).unwrap(),
4320 }))
4321 );
4322 }
4323
4324 assert_eq!(
4325 parser(r"\xF").parse_escape().unwrap_err(),
4326 TestError {
4327 span: span(3..3),
4328 kind: ast::ErrorKind::EscapeUnexpectedEof,
4329 }
4330 );
4331 assert_eq!(
4332 parser(r"\xG").parse_escape().unwrap_err(),
4333 TestError {
4334 span: span(2..3),
4335 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4336 }
4337 );
4338 assert_eq!(
4339 parser(r"\xFG").parse_escape().unwrap_err(),
4340 TestError {
4341 span: span(3..4),
4342 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4343 }
4344 );
4345 }
4346
4347 #[test]
parse_hex_four()4348 fn parse_hex_four() {
4349 for i in 0..65536 {
4350 let c = match ::std::char::from_u32(i) {
4351 None => continue,
4352 Some(c) => c,
4353 };
4354 let pat = format!(r"\u{:04x}", i);
4355 assert_eq!(
4356 parser(&pat).parse_escape(),
4357 Ok(Primitive::Literal(ast::Literal {
4358 span: span(0..pat.len()),
4359 kind: ast::LiteralKind::HexFixed(
4360 ast::HexLiteralKind::UnicodeShort
4361 ),
4362 c: c,
4363 }))
4364 );
4365 }
4366
4367 assert_eq!(
4368 parser(r"\uF").parse_escape().unwrap_err(),
4369 TestError {
4370 span: span(3..3),
4371 kind: ast::ErrorKind::EscapeUnexpectedEof,
4372 }
4373 );
4374 assert_eq!(
4375 parser(r"\uG").parse_escape().unwrap_err(),
4376 TestError {
4377 span: span(2..3),
4378 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4379 }
4380 );
4381 assert_eq!(
4382 parser(r"\uFG").parse_escape().unwrap_err(),
4383 TestError {
4384 span: span(3..4),
4385 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4386 }
4387 );
4388 assert_eq!(
4389 parser(r"\uFFG").parse_escape().unwrap_err(),
4390 TestError {
4391 span: span(4..5),
4392 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4393 }
4394 );
4395 assert_eq!(
4396 parser(r"\uFFFG").parse_escape().unwrap_err(),
4397 TestError {
4398 span: span(5..6),
4399 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4400 }
4401 );
4402 assert_eq!(
4403 parser(r"\uD800").parse_escape().unwrap_err(),
4404 TestError {
4405 span: span(2..6),
4406 kind: ast::ErrorKind::EscapeHexInvalid,
4407 }
4408 );
4409 }
4410
4411 #[test]
parse_hex_eight()4412 fn parse_hex_eight() {
4413 for i in 0..65536 {
4414 let c = match ::std::char::from_u32(i) {
4415 None => continue,
4416 Some(c) => c,
4417 };
4418 let pat = format!(r"\U{:08x}", i);
4419 assert_eq!(
4420 parser(&pat).parse_escape(),
4421 Ok(Primitive::Literal(ast::Literal {
4422 span: span(0..pat.len()),
4423 kind: ast::LiteralKind::HexFixed(
4424 ast::HexLiteralKind::UnicodeLong
4425 ),
4426 c: c,
4427 }))
4428 );
4429 }
4430
4431 assert_eq!(
4432 parser(r"\UF").parse_escape().unwrap_err(),
4433 TestError {
4434 span: span(3..3),
4435 kind: ast::ErrorKind::EscapeUnexpectedEof,
4436 }
4437 );
4438 assert_eq!(
4439 parser(r"\UG").parse_escape().unwrap_err(),
4440 TestError {
4441 span: span(2..3),
4442 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4443 }
4444 );
4445 assert_eq!(
4446 parser(r"\UFG").parse_escape().unwrap_err(),
4447 TestError {
4448 span: span(3..4),
4449 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4450 }
4451 );
4452 assert_eq!(
4453 parser(r"\UFFG").parse_escape().unwrap_err(),
4454 TestError {
4455 span: span(4..5),
4456 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4457 }
4458 );
4459 assert_eq!(
4460 parser(r"\UFFFG").parse_escape().unwrap_err(),
4461 TestError {
4462 span: span(5..6),
4463 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4464 }
4465 );
4466 assert_eq!(
4467 parser(r"\UFFFFG").parse_escape().unwrap_err(),
4468 TestError {
4469 span: span(6..7),
4470 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4471 }
4472 );
4473 assert_eq!(
4474 parser(r"\UFFFFFG").parse_escape().unwrap_err(),
4475 TestError {
4476 span: span(7..8),
4477 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4478 }
4479 );
4480 assert_eq!(
4481 parser(r"\UFFFFFFG").parse_escape().unwrap_err(),
4482 TestError {
4483 span: span(8..9),
4484 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4485 }
4486 );
4487 assert_eq!(
4488 parser(r"\UFFFFFFFG").parse_escape().unwrap_err(),
4489 TestError {
4490 span: span(9..10),
4491 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4492 }
4493 );
4494 }
4495
4496 #[test]
parse_hex_brace()4497 fn parse_hex_brace() {
4498 assert_eq!(
4499 parser(r"\u{26c4}").parse_escape(),
4500 Ok(Primitive::Literal(ast::Literal {
4501 span: span(0..8),
4502 kind: ast::LiteralKind::HexBrace(
4503 ast::HexLiteralKind::UnicodeShort
4504 ),
4505 c: '⛄',
4506 }))
4507 );
4508 assert_eq!(
4509 parser(r"\U{26c4}").parse_escape(),
4510 Ok(Primitive::Literal(ast::Literal {
4511 span: span(0..8),
4512 kind: ast::LiteralKind::HexBrace(
4513 ast::HexLiteralKind::UnicodeLong
4514 ),
4515 c: '⛄',
4516 }))
4517 );
4518 assert_eq!(
4519 parser(r"\x{26c4}").parse_escape(),
4520 Ok(Primitive::Literal(ast::Literal {
4521 span: span(0..8),
4522 kind: ast::LiteralKind::HexBrace(ast::HexLiteralKind::X),
4523 c: '⛄',
4524 }))
4525 );
4526 assert_eq!(
4527 parser(r"\x{26C4}").parse_escape(),
4528 Ok(Primitive::Literal(ast::Literal {
4529 span: span(0..8),
4530 kind: ast::LiteralKind::HexBrace(ast::HexLiteralKind::X),
4531 c: '⛄',
4532 }))
4533 );
4534 assert_eq!(
4535 parser(r"\x{10fFfF}").parse_escape(),
4536 Ok(Primitive::Literal(ast::Literal {
4537 span: span(0..10),
4538 kind: ast::LiteralKind::HexBrace(ast::HexLiteralKind::X),
4539 c: '\u{10FFFF}',
4540 }))
4541 );
4542
4543 assert_eq!(
4544 parser(r"\x").parse_escape().unwrap_err(),
4545 TestError {
4546 span: span(2..2),
4547 kind: ast::ErrorKind::EscapeUnexpectedEof,
4548 }
4549 );
4550 assert_eq!(
4551 parser(r"\x{").parse_escape().unwrap_err(),
4552 TestError {
4553 span: span(2..3),
4554 kind: ast::ErrorKind::EscapeUnexpectedEof,
4555 }
4556 );
4557 assert_eq!(
4558 parser(r"\x{FF").parse_escape().unwrap_err(),
4559 TestError {
4560 span: span(2..5),
4561 kind: ast::ErrorKind::EscapeUnexpectedEof,
4562 }
4563 );
4564 assert_eq!(
4565 parser(r"\x{}").parse_escape().unwrap_err(),
4566 TestError {
4567 span: span(2..4),
4568 kind: ast::ErrorKind::EscapeHexEmpty,
4569 }
4570 );
4571 assert_eq!(
4572 parser(r"\x{FGF}").parse_escape().unwrap_err(),
4573 TestError {
4574 span: span(4..5),
4575 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4576 }
4577 );
4578 assert_eq!(
4579 parser(r"\x{FFFFFF}").parse_escape().unwrap_err(),
4580 TestError {
4581 span: span(3..9),
4582 kind: ast::ErrorKind::EscapeHexInvalid,
4583 }
4584 );
4585 assert_eq!(
4586 parser(r"\x{D800}").parse_escape().unwrap_err(),
4587 TestError {
4588 span: span(3..7),
4589 kind: ast::ErrorKind::EscapeHexInvalid,
4590 }
4591 );
4592 assert_eq!(
4593 parser(r"\x{FFFFFFFFF}").parse_escape().unwrap_err(),
4594 TestError {
4595 span: span(3..12),
4596 kind: ast::ErrorKind::EscapeHexInvalid,
4597 }
4598 );
4599 }
4600
4601 #[test]
parse_decimal()4602 fn parse_decimal() {
4603 assert_eq!(parser("123").parse_decimal(), Ok(123));
4604 assert_eq!(parser("0").parse_decimal(), Ok(0));
4605 assert_eq!(parser("01").parse_decimal(), Ok(1));
4606
4607 assert_eq!(
4608 parser("-1").parse_decimal().unwrap_err(),
4609 TestError { span: span(0..0), kind: ast::ErrorKind::DecimalEmpty }
4610 );
4611 assert_eq!(
4612 parser("").parse_decimal().unwrap_err(),
4613 TestError { span: span(0..0), kind: ast::ErrorKind::DecimalEmpty }
4614 );
4615 assert_eq!(
4616 parser("9999999999").parse_decimal().unwrap_err(),
4617 TestError {
4618 span: span(0..10),
4619 kind: ast::ErrorKind::DecimalInvalid,
4620 }
4621 );
4622 }
4623
4624 #[test]
parse_set_class()4625 fn parse_set_class() {
4626 fn union(span: Span, items: Vec<ast::ClassSetItem>) -> ast::ClassSet {
4627 ast::ClassSet::union(ast::ClassSetUnion {
4628 span: span,
4629 items: items,
4630 })
4631 }
4632
4633 fn intersection(
4634 span: Span,
4635 lhs: ast::ClassSet,
4636 rhs: ast::ClassSet,
4637 ) -> ast::ClassSet {
4638 ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp {
4639 span: span,
4640 kind: ast::ClassSetBinaryOpKind::Intersection,
4641 lhs: Box::new(lhs),
4642 rhs: Box::new(rhs),
4643 })
4644 }
4645
4646 fn difference(
4647 span: Span,
4648 lhs: ast::ClassSet,
4649 rhs: ast::ClassSet,
4650 ) -> ast::ClassSet {
4651 ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp {
4652 span: span,
4653 kind: ast::ClassSetBinaryOpKind::Difference,
4654 lhs: Box::new(lhs),
4655 rhs: Box::new(rhs),
4656 })
4657 }
4658
4659 fn symdifference(
4660 span: Span,
4661 lhs: ast::ClassSet,
4662 rhs: ast::ClassSet,
4663 ) -> ast::ClassSet {
4664 ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp {
4665 span: span,
4666 kind: ast::ClassSetBinaryOpKind::SymmetricDifference,
4667 lhs: Box::new(lhs),
4668 rhs: Box::new(rhs),
4669 })
4670 }
4671
4672 fn itemset(item: ast::ClassSetItem) -> ast::ClassSet {
4673 ast::ClassSet::Item(item)
4674 }
4675
4676 fn item_ascii(cls: ast::ClassAscii) -> ast::ClassSetItem {
4677 ast::ClassSetItem::Ascii(cls)
4678 }
4679
4680 fn item_unicode(cls: ast::ClassUnicode) -> ast::ClassSetItem {
4681 ast::ClassSetItem::Unicode(cls)
4682 }
4683
4684 fn item_perl(cls: ast::ClassPerl) -> ast::ClassSetItem {
4685 ast::ClassSetItem::Perl(cls)
4686 }
4687
4688 fn item_bracket(cls: ast::ClassBracketed) -> ast::ClassSetItem {
4689 ast::ClassSetItem::Bracketed(Box::new(cls))
4690 }
4691
4692 fn lit(span: Span, c: char) -> ast::ClassSetItem {
4693 ast::ClassSetItem::Literal(ast::Literal {
4694 span: span,
4695 kind: ast::LiteralKind::Verbatim,
4696 c: c,
4697 })
4698 }
4699
4700 fn empty(span: Span) -> ast::ClassSetItem {
4701 ast::ClassSetItem::Empty(span)
4702 }
4703
4704 fn range(span: Span, start: char, end: char) -> ast::ClassSetItem {
4705 let pos1 = Position {
4706 offset: span.start.offset + start.len_utf8(),
4707 column: span.start.column + 1,
4708 ..span.start
4709 };
4710 let pos2 = Position {
4711 offset: span.end.offset - end.len_utf8(),
4712 column: span.end.column - 1,
4713 ..span.end
4714 };
4715 ast::ClassSetItem::Range(ast::ClassSetRange {
4716 span: span,
4717 start: ast::Literal {
4718 span: Span { end: pos1, ..span },
4719 kind: ast::LiteralKind::Verbatim,
4720 c: start,
4721 },
4722 end: ast::Literal {
4723 span: Span { start: pos2, ..span },
4724 kind: ast::LiteralKind::Verbatim,
4725 c: end,
4726 },
4727 })
4728 }
4729
4730 fn alnum(span: Span, negated: bool) -> ast::ClassAscii {
4731 ast::ClassAscii {
4732 span: span,
4733 kind: ast::ClassAsciiKind::Alnum,
4734 negated: negated,
4735 }
4736 }
4737
4738 fn lower(span: Span, negated: bool) -> ast::ClassAscii {
4739 ast::ClassAscii {
4740 span: span,
4741 kind: ast::ClassAsciiKind::Lower,
4742 negated: negated,
4743 }
4744 }
4745
4746 assert_eq!(
4747 parser("[[:alnum:]]").parse(),
4748 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4749 span: span(0..11),
4750 negated: false,
4751 kind: itemset(item_ascii(alnum(span(1..10), false))),
4752 })))
4753 );
4754 assert_eq!(
4755 parser("[[[:alnum:]]]").parse(),
4756 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4757 span: span(0..13),
4758 negated: false,
4759 kind: itemset(item_bracket(ast::ClassBracketed {
4760 span: span(1..12),
4761 negated: false,
4762 kind: itemset(item_ascii(alnum(span(2..11), false))),
4763 })),
4764 })))
4765 );
4766 assert_eq!(
4767 parser("[[:alnum:]&&[:lower:]]").parse(),
4768 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4769 span: span(0..22),
4770 negated: false,
4771 kind: intersection(
4772 span(1..21),
4773 itemset(item_ascii(alnum(span(1..10), false))),
4774 itemset(item_ascii(lower(span(12..21), false))),
4775 ),
4776 })))
4777 );
4778 assert_eq!(
4779 parser("[[:alnum:]--[:lower:]]").parse(),
4780 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4781 span: span(0..22),
4782 negated: false,
4783 kind: difference(
4784 span(1..21),
4785 itemset(item_ascii(alnum(span(1..10), false))),
4786 itemset(item_ascii(lower(span(12..21), false))),
4787 ),
4788 })))
4789 );
4790 assert_eq!(
4791 parser("[[:alnum:]~~[:lower:]]").parse(),
4792 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4793 span: span(0..22),
4794 negated: false,
4795 kind: symdifference(
4796 span(1..21),
4797 itemset(item_ascii(alnum(span(1..10), false))),
4798 itemset(item_ascii(lower(span(12..21), false))),
4799 ),
4800 })))
4801 );
4802
4803 assert_eq!(
4804 parser("[a]").parse(),
4805 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4806 span: span(0..3),
4807 negated: false,
4808 kind: itemset(lit(span(1..2), 'a')),
4809 })))
4810 );
4811 assert_eq!(
4812 parser(r"[a\]]").parse(),
4813 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4814 span: span(0..5),
4815 negated: false,
4816 kind: union(
4817 span(1..4),
4818 vec![
4819 lit(span(1..2), 'a'),
4820 ast::ClassSetItem::Literal(ast::Literal {
4821 span: span(2..4),
4822 kind: ast::LiteralKind::Punctuation,
4823 c: ']',
4824 }),
4825 ]
4826 ),
4827 })))
4828 );
4829 assert_eq!(
4830 parser(r"[a\-z]").parse(),
4831 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4832 span: span(0..6),
4833 negated: false,
4834 kind: union(
4835 span(1..5),
4836 vec![
4837 lit(span(1..2), 'a'),
4838 ast::ClassSetItem::Literal(ast::Literal {
4839 span: span(2..4),
4840 kind: ast::LiteralKind::Punctuation,
4841 c: '-',
4842 }),
4843 lit(span(4..5), 'z'),
4844 ]
4845 ),
4846 })))
4847 );
4848 assert_eq!(
4849 parser("[ab]").parse(),
4850 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4851 span: span(0..4),
4852 negated: false,
4853 kind: union(
4854 span(1..3),
4855 vec![lit(span(1..2), 'a'), lit(span(2..3), 'b'),]
4856 ),
4857 })))
4858 );
4859 assert_eq!(
4860 parser("[a-]").parse(),
4861 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4862 span: span(0..4),
4863 negated: false,
4864 kind: union(
4865 span(1..3),
4866 vec![lit(span(1..2), 'a'), lit(span(2..3), '-'),]
4867 ),
4868 })))
4869 );
4870 assert_eq!(
4871 parser("[-a]").parse(),
4872 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4873 span: span(0..4),
4874 negated: false,
4875 kind: union(
4876 span(1..3),
4877 vec![lit(span(1..2), '-'), lit(span(2..3), 'a'),]
4878 ),
4879 })))
4880 );
4881 assert_eq!(
4882 parser(r"[\pL]").parse(),
4883 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4884 span: span(0..5),
4885 negated: false,
4886 kind: itemset(item_unicode(ast::ClassUnicode {
4887 span: span(1..4),
4888 negated: false,
4889 kind: ast::ClassUnicodeKind::OneLetter('L'),
4890 })),
4891 })))
4892 );
4893 assert_eq!(
4894 parser(r"[\w]").parse(),
4895 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4896 span: span(0..4),
4897 negated: false,
4898 kind: itemset(item_perl(ast::ClassPerl {
4899 span: span(1..3),
4900 kind: ast::ClassPerlKind::Word,
4901 negated: false,
4902 })),
4903 })))
4904 );
4905 assert_eq!(
4906 parser(r"[a\wz]").parse(),
4907 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4908 span: span(0..6),
4909 negated: false,
4910 kind: union(
4911 span(1..5),
4912 vec![
4913 lit(span(1..2), 'a'),
4914 item_perl(ast::ClassPerl {
4915 span: span(2..4),
4916 kind: ast::ClassPerlKind::Word,
4917 negated: false,
4918 }),
4919 lit(span(4..5), 'z'),
4920 ]
4921 ),
4922 })))
4923 );
4924
4925 assert_eq!(
4926 parser("[a-z]").parse(),
4927 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4928 span: span(0..5),
4929 negated: false,
4930 kind: itemset(range(span(1..4), 'a', 'z')),
4931 })))
4932 );
4933 assert_eq!(
4934 parser("[a-cx-z]").parse(),
4935 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4936 span: span(0..8),
4937 negated: false,
4938 kind: union(
4939 span(1..7),
4940 vec![
4941 range(span(1..4), 'a', 'c'),
4942 range(span(4..7), 'x', 'z'),
4943 ]
4944 ),
4945 })))
4946 );
4947 assert_eq!(
4948 parser(r"[\w&&a-cx-z]").parse(),
4949 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4950 span: span(0..12),
4951 negated: false,
4952 kind: intersection(
4953 span(1..11),
4954 itemset(item_perl(ast::ClassPerl {
4955 span: span(1..3),
4956 kind: ast::ClassPerlKind::Word,
4957 negated: false,
4958 })),
4959 union(
4960 span(5..11),
4961 vec![
4962 range(span(5..8), 'a', 'c'),
4963 range(span(8..11), 'x', 'z'),
4964 ]
4965 ),
4966 ),
4967 })))
4968 );
4969 assert_eq!(
4970 parser(r"[a-cx-z&&\w]").parse(),
4971 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4972 span: span(0..12),
4973 negated: false,
4974 kind: intersection(
4975 span(1..11),
4976 union(
4977 span(1..7),
4978 vec![
4979 range(span(1..4), 'a', 'c'),
4980 range(span(4..7), 'x', 'z'),
4981 ]
4982 ),
4983 itemset(item_perl(ast::ClassPerl {
4984 span: span(9..11),
4985 kind: ast::ClassPerlKind::Word,
4986 negated: false,
4987 })),
4988 ),
4989 })))
4990 );
4991 assert_eq!(
4992 parser(r"[a--b--c]").parse(),
4993 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4994 span: span(0..9),
4995 negated: false,
4996 kind: difference(
4997 span(1..8),
4998 difference(
4999 span(1..5),
5000 itemset(lit(span(1..2), 'a')),
5001 itemset(lit(span(4..5), 'b')),
5002 ),
5003 itemset(lit(span(7..8), 'c')),
5004 ),
5005 })))
5006 );
5007 assert_eq!(
5008 parser(r"[a~~b~~c]").parse(),
5009 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
5010 span: span(0..9),
5011 negated: false,
5012 kind: symdifference(
5013 span(1..8),
5014 symdifference(
5015 span(1..5),
5016 itemset(lit(span(1..2), 'a')),
5017 itemset(lit(span(4..5), 'b')),
5018 ),
5019 itemset(lit(span(7..8), 'c')),
5020 ),
5021 })))
5022 );
5023 assert_eq!(
5024 parser(r"[\^&&^]").parse(),
5025 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
5026 span: span(0..7),
5027 negated: false,
5028 kind: intersection(
5029 span(1..6),
5030 itemset(ast::ClassSetItem::Literal(ast::Literal {
5031 span: span(1..3),
5032 kind: ast::LiteralKind::Punctuation,
5033 c: '^',
5034 })),
5035 itemset(lit(span(5..6), '^')),
5036 ),
5037 })))
5038 );
5039 assert_eq!(
5040 parser(r"[\&&&&]").parse(),
5041 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
5042 span: span(0..7),
5043 negated: false,
5044 kind: intersection(
5045 span(1..6),
5046 itemset(ast::ClassSetItem::Literal(ast::Literal {
5047 span: span(1..3),
5048 kind: ast::LiteralKind::Punctuation,
5049 c: '&',
5050 })),
5051 itemset(lit(span(5..6), '&')),
5052 ),
5053 })))
5054 );
5055 assert_eq!(
5056 parser(r"[&&&&]").parse(),
5057 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
5058 span: span(0..6),
5059 negated: false,
5060 kind: intersection(
5061 span(1..5),
5062 intersection(
5063 span(1..3),
5064 itemset(empty(span(1..1))),
5065 itemset(empty(span(3..3))),
5066 ),
5067 itemset(empty(span(5..5))),
5068 ),
5069 })))
5070 );
5071
5072 let pat = "[☃-⛄]";
5073 assert_eq!(
5074 parser(pat).parse(),
5075 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
5076 span: span_range(pat, 0..9),
5077 negated: false,
5078 kind: itemset(ast::ClassSetItem::Range(ast::ClassSetRange {
5079 span: span_range(pat, 1..8),
5080 start: ast::Literal {
5081 span: span_range(pat, 1..4),
5082 kind: ast::LiteralKind::Verbatim,
5083 c: '☃',
5084 },
5085 end: ast::Literal {
5086 span: span_range(pat, 5..8),
5087 kind: ast::LiteralKind::Verbatim,
5088 c: '⛄',
5089 },
5090 })),
5091 })))
5092 );
5093
5094 assert_eq!(
5095 parser(r"[]]").parse(),
5096 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
5097 span: span(0..3),
5098 negated: false,
5099 kind: itemset(lit(span(1..2), ']')),
5100 })))
5101 );
5102 assert_eq!(
5103 parser(r"[]\[]").parse(),
5104 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
5105 span: span(0..5),
5106 negated: false,
5107 kind: union(
5108 span(1..4),
5109 vec![
5110 lit(span(1..2), ']'),
5111 ast::ClassSetItem::Literal(ast::Literal {
5112 span: span(2..4),
5113 kind: ast::LiteralKind::Punctuation,
5114 c: '[',
5115 }),
5116 ]
5117 ),
5118 })))
5119 );
5120 assert_eq!(
5121 parser(r"[\[]]").parse(),
5122 Ok(concat(
5123 0..5,
5124 vec![
5125 Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
5126 span: span(0..4),
5127 negated: false,
5128 kind: itemset(ast::ClassSetItem::Literal(
5129 ast::Literal {
5130 span: span(1..3),
5131 kind: ast::LiteralKind::Punctuation,
5132 c: '[',
5133 }
5134 )),
5135 })),
5136 Ast::Literal(ast::Literal {
5137 span: span(4..5),
5138 kind: ast::LiteralKind::Verbatim,
5139 c: ']',
5140 }),
5141 ]
5142 ))
5143 );
5144
5145 assert_eq!(
5146 parser("[").parse().unwrap_err(),
5147 TestError {
5148 span: span(0..1),
5149 kind: ast::ErrorKind::ClassUnclosed,
5150 }
5151 );
5152 assert_eq!(
5153 parser("[[").parse().unwrap_err(),
5154 TestError {
5155 span: span(1..2),
5156 kind: ast::ErrorKind::ClassUnclosed,
5157 }
5158 );
5159 assert_eq!(
5160 parser("[[-]").parse().unwrap_err(),
5161 TestError {
5162 span: span(0..1),
5163 kind: ast::ErrorKind::ClassUnclosed,
5164 }
5165 );
5166 assert_eq!(
5167 parser("[[[:alnum:]").parse().unwrap_err(),
5168 TestError {
5169 span: span(1..2),
5170 kind: ast::ErrorKind::ClassUnclosed,
5171 }
5172 );
5173 assert_eq!(
5174 parser(r"[\b]").parse().unwrap_err(),
5175 TestError {
5176 span: span(1..3),
5177 kind: ast::ErrorKind::ClassEscapeInvalid,
5178 }
5179 );
5180 assert_eq!(
5181 parser(r"[\w-a]").parse().unwrap_err(),
5182 TestError {
5183 span: span(1..3),
5184 kind: ast::ErrorKind::ClassRangeLiteral,
5185 }
5186 );
5187 assert_eq!(
5188 parser(r"[a-\w]").parse().unwrap_err(),
5189 TestError {
5190 span: span(3..5),
5191 kind: ast::ErrorKind::ClassRangeLiteral,
5192 }
5193 );
5194 assert_eq!(
5195 parser(r"[z-a]").parse().unwrap_err(),
5196 TestError {
5197 span: span(1..4),
5198 kind: ast::ErrorKind::ClassRangeInvalid,
5199 }
5200 );
5201
5202 assert_eq!(
5203 parser_ignore_whitespace("[a ").parse().unwrap_err(),
5204 TestError {
5205 span: span(0..1),
5206 kind: ast::ErrorKind::ClassUnclosed,
5207 }
5208 );
5209 assert_eq!(
5210 parser_ignore_whitespace("[a- ").parse().unwrap_err(),
5211 TestError {
5212 span: span(0..1),
5213 kind: ast::ErrorKind::ClassUnclosed,
5214 }
5215 );
5216 }
5217
5218 #[test]
parse_set_class_open()5219 fn parse_set_class_open() {
5220 assert_eq!(parser("[a]").parse_set_class_open(), {
5221 let set = ast::ClassBracketed {
5222 span: span(0..1),
5223 negated: false,
5224 kind: ast::ClassSet::union(ast::ClassSetUnion {
5225 span: span(1..1),
5226 items: vec![],
5227 }),
5228 };
5229 let union = ast::ClassSetUnion { span: span(1..1), items: vec![] };
5230 Ok((set, union))
5231 });
5232 assert_eq!(
5233 parser_ignore_whitespace("[ a]").parse_set_class_open(),
5234 {
5235 let set = ast::ClassBracketed {
5236 span: span(0..4),
5237 negated: false,
5238 kind: ast::ClassSet::union(ast::ClassSetUnion {
5239 span: span(4..4),
5240 items: vec![],
5241 }),
5242 };
5243 let union =
5244 ast::ClassSetUnion { span: span(4..4), items: vec![] };
5245 Ok((set, union))
5246 }
5247 );
5248 assert_eq!(parser("[^a]").parse_set_class_open(), {
5249 let set = ast::ClassBracketed {
5250 span: span(0..2),
5251 negated: true,
5252 kind: ast::ClassSet::union(ast::ClassSetUnion {
5253 span: span(2..2),
5254 items: vec![],
5255 }),
5256 };
5257 let union = ast::ClassSetUnion { span: span(2..2), items: vec![] };
5258 Ok((set, union))
5259 });
5260 assert_eq!(
5261 parser_ignore_whitespace("[ ^ a]").parse_set_class_open(),
5262 {
5263 let set = ast::ClassBracketed {
5264 span: span(0..4),
5265 negated: true,
5266 kind: ast::ClassSet::union(ast::ClassSetUnion {
5267 span: span(4..4),
5268 items: vec![],
5269 }),
5270 };
5271 let union =
5272 ast::ClassSetUnion { span: span(4..4), items: vec![] };
5273 Ok((set, union))
5274 }
5275 );
5276 assert_eq!(parser("[-a]").parse_set_class_open(), {
5277 let set = ast::ClassBracketed {
5278 span: span(0..2),
5279 negated: false,
5280 kind: ast::ClassSet::union(ast::ClassSetUnion {
5281 span: span(1..1),
5282 items: vec![],
5283 }),
5284 };
5285 let union = ast::ClassSetUnion {
5286 span: span(1..2),
5287 items: vec![ast::ClassSetItem::Literal(ast::Literal {
5288 span: span(1..2),
5289 kind: ast::LiteralKind::Verbatim,
5290 c: '-',
5291 })],
5292 };
5293 Ok((set, union))
5294 });
5295 assert_eq!(
5296 parser_ignore_whitespace("[ - a]").parse_set_class_open(),
5297 {
5298 let set = ast::ClassBracketed {
5299 span: span(0..4),
5300 negated: false,
5301 kind: ast::ClassSet::union(ast::ClassSetUnion {
5302 span: span(2..2),
5303 items: vec![],
5304 }),
5305 };
5306 let union = ast::ClassSetUnion {
5307 span: span(2..3),
5308 items: vec![ast::ClassSetItem::Literal(ast::Literal {
5309 span: span(2..3),
5310 kind: ast::LiteralKind::Verbatim,
5311 c: '-',
5312 })],
5313 };
5314 Ok((set, union))
5315 }
5316 );
5317 assert_eq!(parser("[^-a]").parse_set_class_open(), {
5318 let set = ast::ClassBracketed {
5319 span: span(0..3),
5320 negated: true,
5321 kind: ast::ClassSet::union(ast::ClassSetUnion {
5322 span: span(2..2),
5323 items: vec![],
5324 }),
5325 };
5326 let union = ast::ClassSetUnion {
5327 span: span(2..3),
5328 items: vec![ast::ClassSetItem::Literal(ast::Literal {
5329 span: span(2..3),
5330 kind: ast::LiteralKind::Verbatim,
5331 c: '-',
5332 })],
5333 };
5334 Ok((set, union))
5335 });
5336 assert_eq!(parser("[--a]").parse_set_class_open(), {
5337 let set = ast::ClassBracketed {
5338 span: span(0..3),
5339 negated: false,
5340 kind: ast::ClassSet::union(ast::ClassSetUnion {
5341 span: span(1..1),
5342 items: vec![],
5343 }),
5344 };
5345 let union = ast::ClassSetUnion {
5346 span: span(1..3),
5347 items: vec![
5348 ast::ClassSetItem::Literal(ast::Literal {
5349 span: span(1..2),
5350 kind: ast::LiteralKind::Verbatim,
5351 c: '-',
5352 }),
5353 ast::ClassSetItem::Literal(ast::Literal {
5354 span: span(2..3),
5355 kind: ast::LiteralKind::Verbatim,
5356 c: '-',
5357 }),
5358 ],
5359 };
5360 Ok((set, union))
5361 });
5362 assert_eq!(parser("[]a]").parse_set_class_open(), {
5363 let set = ast::ClassBracketed {
5364 span: span(0..2),
5365 negated: false,
5366 kind: ast::ClassSet::union(ast::ClassSetUnion {
5367 span: span(1..1),
5368 items: vec![],
5369 }),
5370 };
5371 let union = ast::ClassSetUnion {
5372 span: span(1..2),
5373 items: vec![ast::ClassSetItem::Literal(ast::Literal {
5374 span: span(1..2),
5375 kind: ast::LiteralKind::Verbatim,
5376 c: ']',
5377 })],
5378 };
5379 Ok((set, union))
5380 });
5381 assert_eq!(
5382 parser_ignore_whitespace("[ ] a]").parse_set_class_open(),
5383 {
5384 let set = ast::ClassBracketed {
5385 span: span(0..4),
5386 negated: false,
5387 kind: ast::ClassSet::union(ast::ClassSetUnion {
5388 span: span(2..2),
5389 items: vec![],
5390 }),
5391 };
5392 let union = ast::ClassSetUnion {
5393 span: span(2..3),
5394 items: vec![ast::ClassSetItem::Literal(ast::Literal {
5395 span: span(2..3),
5396 kind: ast::LiteralKind::Verbatim,
5397 c: ']',
5398 })],
5399 };
5400 Ok((set, union))
5401 }
5402 );
5403 assert_eq!(parser("[^]a]").parse_set_class_open(), {
5404 let set = ast::ClassBracketed {
5405 span: span(0..3),
5406 negated: true,
5407 kind: ast::ClassSet::union(ast::ClassSetUnion {
5408 span: span(2..2),
5409 items: vec![],
5410 }),
5411 };
5412 let union = ast::ClassSetUnion {
5413 span: span(2..3),
5414 items: vec![ast::ClassSetItem::Literal(ast::Literal {
5415 span: span(2..3),
5416 kind: ast::LiteralKind::Verbatim,
5417 c: ']',
5418 })],
5419 };
5420 Ok((set, union))
5421 });
5422 assert_eq!(parser("[-]a]").parse_set_class_open(), {
5423 let set = ast::ClassBracketed {
5424 span: span(0..2),
5425 negated: false,
5426 kind: ast::ClassSet::union(ast::ClassSetUnion {
5427 span: span(1..1),
5428 items: vec![],
5429 }),
5430 };
5431 let union = ast::ClassSetUnion {
5432 span: span(1..2),
5433 items: vec![ast::ClassSetItem::Literal(ast::Literal {
5434 span: span(1..2),
5435 kind: ast::LiteralKind::Verbatim,
5436 c: '-',
5437 })],
5438 };
5439 Ok((set, union))
5440 });
5441
5442 assert_eq!(
5443 parser("[").parse_set_class_open().unwrap_err(),
5444 TestError {
5445 span: span(0..1),
5446 kind: ast::ErrorKind::ClassUnclosed,
5447 }
5448 );
5449 assert_eq!(
5450 parser_ignore_whitespace("[ ")
5451 .parse_set_class_open()
5452 .unwrap_err(),
5453 TestError {
5454 span: span(0..5),
5455 kind: ast::ErrorKind::ClassUnclosed,
5456 }
5457 );
5458 assert_eq!(
5459 parser("[^").parse_set_class_open().unwrap_err(),
5460 TestError {
5461 span: span(0..2),
5462 kind: ast::ErrorKind::ClassUnclosed,
5463 }
5464 );
5465 assert_eq!(
5466 parser("[]").parse_set_class_open().unwrap_err(),
5467 TestError {
5468 span: span(0..2),
5469 kind: ast::ErrorKind::ClassUnclosed,
5470 }
5471 );
5472 assert_eq!(
5473 parser("[-").parse_set_class_open().unwrap_err(),
5474 TestError {
5475 span: span(0..2),
5476 kind: ast::ErrorKind::ClassUnclosed,
5477 }
5478 );
5479 assert_eq!(
5480 parser("[--").parse_set_class_open().unwrap_err(),
5481 TestError {
5482 span: span(0..3),
5483 kind: ast::ErrorKind::ClassUnclosed,
5484 }
5485 );
5486 }
5487
5488 #[test]
maybe_parse_ascii_class()5489 fn maybe_parse_ascii_class() {
5490 assert_eq!(
5491 parser(r"[:alnum:]").maybe_parse_ascii_class(),
5492 Some(ast::ClassAscii {
5493 span: span(0..9),
5494 kind: ast::ClassAsciiKind::Alnum,
5495 negated: false,
5496 })
5497 );
5498 assert_eq!(
5499 parser(r"[:alnum:]A").maybe_parse_ascii_class(),
5500 Some(ast::ClassAscii {
5501 span: span(0..9),
5502 kind: ast::ClassAsciiKind::Alnum,
5503 negated: false,
5504 })
5505 );
5506 assert_eq!(
5507 parser(r"[:^alnum:]").maybe_parse_ascii_class(),
5508 Some(ast::ClassAscii {
5509 span: span(0..10),
5510 kind: ast::ClassAsciiKind::Alnum,
5511 negated: true,
5512 })
5513 );
5514
5515 let p = parser(r"[:");
5516 assert_eq!(p.maybe_parse_ascii_class(), None);
5517 assert_eq!(p.offset(), 0);
5518
5519 let p = parser(r"[:^");
5520 assert_eq!(p.maybe_parse_ascii_class(), None);
5521 assert_eq!(p.offset(), 0);
5522
5523 let p = parser(r"[^:alnum:]");
5524 assert_eq!(p.maybe_parse_ascii_class(), None);
5525 assert_eq!(p.offset(), 0);
5526
5527 let p = parser(r"[:alnnum:]");
5528 assert_eq!(p.maybe_parse_ascii_class(), None);
5529 assert_eq!(p.offset(), 0);
5530
5531 let p = parser(r"[:alnum]");
5532 assert_eq!(p.maybe_parse_ascii_class(), None);
5533 assert_eq!(p.offset(), 0);
5534
5535 let p = parser(r"[:alnum:");
5536 assert_eq!(p.maybe_parse_ascii_class(), None);
5537 assert_eq!(p.offset(), 0);
5538 }
5539
5540 #[test]
parse_unicode_class()5541 fn parse_unicode_class() {
5542 assert_eq!(
5543 parser(r"\pN").parse_escape(),
5544 Ok(Primitive::Unicode(ast::ClassUnicode {
5545 span: span(0..3),
5546 negated: false,
5547 kind: ast::ClassUnicodeKind::OneLetter('N'),
5548 }))
5549 );
5550 assert_eq!(
5551 parser(r"\PN").parse_escape(),
5552 Ok(Primitive::Unicode(ast::ClassUnicode {
5553 span: span(0..3),
5554 negated: true,
5555 kind: ast::ClassUnicodeKind::OneLetter('N'),
5556 }))
5557 );
5558 assert_eq!(
5559 parser(r"\p{N}").parse_escape(),
5560 Ok(Primitive::Unicode(ast::ClassUnicode {
5561 span: span(0..5),
5562 negated: false,
5563 kind: ast::ClassUnicodeKind::Named(s("N")),
5564 }))
5565 );
5566 assert_eq!(
5567 parser(r"\P{N}").parse_escape(),
5568 Ok(Primitive::Unicode(ast::ClassUnicode {
5569 span: span(0..5),
5570 negated: true,
5571 kind: ast::ClassUnicodeKind::Named(s("N")),
5572 }))
5573 );
5574 assert_eq!(
5575 parser(r"\p{Greek}").parse_escape(),
5576 Ok(Primitive::Unicode(ast::ClassUnicode {
5577 span: span(0..9),
5578 negated: false,
5579 kind: ast::ClassUnicodeKind::Named(s("Greek")),
5580 }))
5581 );
5582
5583 assert_eq!(
5584 parser(r"\p{scx:Katakana}").parse_escape(),
5585 Ok(Primitive::Unicode(ast::ClassUnicode {
5586 span: span(0..16),
5587 negated: false,
5588 kind: ast::ClassUnicodeKind::NamedValue {
5589 op: ast::ClassUnicodeOpKind::Colon,
5590 name: s("scx"),
5591 value: s("Katakana"),
5592 },
5593 }))
5594 );
5595 assert_eq!(
5596 parser(r"\p{scx=Katakana}").parse_escape(),
5597 Ok(Primitive::Unicode(ast::ClassUnicode {
5598 span: span(0..16),
5599 negated: false,
5600 kind: ast::ClassUnicodeKind::NamedValue {
5601 op: ast::ClassUnicodeOpKind::Equal,
5602 name: s("scx"),
5603 value: s("Katakana"),
5604 },
5605 }))
5606 );
5607 assert_eq!(
5608 parser(r"\p{scx!=Katakana}").parse_escape(),
5609 Ok(Primitive::Unicode(ast::ClassUnicode {
5610 span: span(0..17),
5611 negated: false,
5612 kind: ast::ClassUnicodeKind::NamedValue {
5613 op: ast::ClassUnicodeOpKind::NotEqual,
5614 name: s("scx"),
5615 value: s("Katakana"),
5616 },
5617 }))
5618 );
5619
5620 assert_eq!(
5621 parser(r"\p{:}").parse_escape(),
5622 Ok(Primitive::Unicode(ast::ClassUnicode {
5623 span: span(0..5),
5624 negated: false,
5625 kind: ast::ClassUnicodeKind::NamedValue {
5626 op: ast::ClassUnicodeOpKind::Colon,
5627 name: s(""),
5628 value: s(""),
5629 },
5630 }))
5631 );
5632 assert_eq!(
5633 parser(r"\p{=}").parse_escape(),
5634 Ok(Primitive::Unicode(ast::ClassUnicode {
5635 span: span(0..5),
5636 negated: false,
5637 kind: ast::ClassUnicodeKind::NamedValue {
5638 op: ast::ClassUnicodeOpKind::Equal,
5639 name: s(""),
5640 value: s(""),
5641 },
5642 }))
5643 );
5644 assert_eq!(
5645 parser(r"\p{!=}").parse_escape(),
5646 Ok(Primitive::Unicode(ast::ClassUnicode {
5647 span: span(0..6),
5648 negated: false,
5649 kind: ast::ClassUnicodeKind::NamedValue {
5650 op: ast::ClassUnicodeOpKind::NotEqual,
5651 name: s(""),
5652 value: s(""),
5653 },
5654 }))
5655 );
5656
5657 assert_eq!(
5658 parser(r"\p").parse_escape().unwrap_err(),
5659 TestError {
5660 span: span(2..2),
5661 kind: ast::ErrorKind::EscapeUnexpectedEof,
5662 }
5663 );
5664 assert_eq!(
5665 parser(r"\p{").parse_escape().unwrap_err(),
5666 TestError {
5667 span: span(3..3),
5668 kind: ast::ErrorKind::EscapeUnexpectedEof,
5669 }
5670 );
5671 assert_eq!(
5672 parser(r"\p{N").parse_escape().unwrap_err(),
5673 TestError {
5674 span: span(4..4),
5675 kind: ast::ErrorKind::EscapeUnexpectedEof,
5676 }
5677 );
5678 assert_eq!(
5679 parser(r"\p{Greek").parse_escape().unwrap_err(),
5680 TestError {
5681 span: span(8..8),
5682 kind: ast::ErrorKind::EscapeUnexpectedEof,
5683 }
5684 );
5685
5686 assert_eq!(
5687 parser(r"\pNz").parse(),
5688 Ok(Ast::Concat(ast::Concat {
5689 span: span(0..4),
5690 asts: vec![
5691 Ast::Class(ast::Class::Unicode(ast::ClassUnicode {
5692 span: span(0..3),
5693 negated: false,
5694 kind: ast::ClassUnicodeKind::OneLetter('N'),
5695 })),
5696 Ast::Literal(ast::Literal {
5697 span: span(3..4),
5698 kind: ast::LiteralKind::Verbatim,
5699 c: 'z',
5700 }),
5701 ],
5702 }))
5703 );
5704 assert_eq!(
5705 parser(r"\p{Greek}z").parse(),
5706 Ok(Ast::Concat(ast::Concat {
5707 span: span(0..10),
5708 asts: vec![
5709 Ast::Class(ast::Class::Unicode(ast::ClassUnicode {
5710 span: span(0..9),
5711 negated: false,
5712 kind: ast::ClassUnicodeKind::Named(s("Greek")),
5713 })),
5714 Ast::Literal(ast::Literal {
5715 span: span(9..10),
5716 kind: ast::LiteralKind::Verbatim,
5717 c: 'z',
5718 }),
5719 ],
5720 }))
5721 );
5722 assert_eq!(
5723 parser(r"\p\{").parse().unwrap_err(),
5724 TestError {
5725 span: span(2..3),
5726 kind: ast::ErrorKind::UnicodeClassInvalid,
5727 }
5728 );
5729 assert_eq!(
5730 parser(r"\P\{").parse().unwrap_err(),
5731 TestError {
5732 span: span(2..3),
5733 kind: ast::ErrorKind::UnicodeClassInvalid,
5734 }
5735 );
5736 }
5737
5738 #[test]
parse_perl_class()5739 fn parse_perl_class() {
5740 assert_eq!(
5741 parser(r"\d").parse_escape(),
5742 Ok(Primitive::Perl(ast::ClassPerl {
5743 span: span(0..2),
5744 kind: ast::ClassPerlKind::Digit,
5745 negated: false,
5746 }))
5747 );
5748 assert_eq!(
5749 parser(r"\D").parse_escape(),
5750 Ok(Primitive::Perl(ast::ClassPerl {
5751 span: span(0..2),
5752 kind: ast::ClassPerlKind::Digit,
5753 negated: true,
5754 }))
5755 );
5756 assert_eq!(
5757 parser(r"\s").parse_escape(),
5758 Ok(Primitive::Perl(ast::ClassPerl {
5759 span: span(0..2),
5760 kind: ast::ClassPerlKind::Space,
5761 negated: false,
5762 }))
5763 );
5764 assert_eq!(
5765 parser(r"\S").parse_escape(),
5766 Ok(Primitive::Perl(ast::ClassPerl {
5767 span: span(0..2),
5768 kind: ast::ClassPerlKind::Space,
5769 negated: true,
5770 }))
5771 );
5772 assert_eq!(
5773 parser(r"\w").parse_escape(),
5774 Ok(Primitive::Perl(ast::ClassPerl {
5775 span: span(0..2),
5776 kind: ast::ClassPerlKind::Word,
5777 negated: false,
5778 }))
5779 );
5780 assert_eq!(
5781 parser(r"\W").parse_escape(),
5782 Ok(Primitive::Perl(ast::ClassPerl {
5783 span: span(0..2),
5784 kind: ast::ClassPerlKind::Word,
5785 negated: true,
5786 }))
5787 );
5788
5789 assert_eq!(
5790 parser(r"\d").parse(),
5791 Ok(Ast::Class(ast::Class::Perl(ast::ClassPerl {
5792 span: span(0..2),
5793 kind: ast::ClassPerlKind::Digit,
5794 negated: false,
5795 })))
5796 );
5797 assert_eq!(
5798 parser(r"\dz").parse(),
5799 Ok(Ast::Concat(ast::Concat {
5800 span: span(0..3),
5801 asts: vec![
5802 Ast::Class(ast::Class::Perl(ast::ClassPerl {
5803 span: span(0..2),
5804 kind: ast::ClassPerlKind::Digit,
5805 negated: false,
5806 })),
5807 Ast::Literal(ast::Literal {
5808 span: span(2..3),
5809 kind: ast::LiteralKind::Verbatim,
5810 c: 'z',
5811 }),
5812 ],
5813 }))
5814 );
5815 }
5816
5817 // This tests a bug fix where the nest limit checker wasn't decrementing
5818 // its depth during post-traversal, which causes long regexes to trip
5819 // the default limit too aggressively.
5820 #[test]
regression_454_nest_too_big()5821 fn regression_454_nest_too_big() {
5822 let pattern = r#"
5823 2(?:
5824 [45]\d{3}|
5825 7(?:
5826 1[0-267]|
5827 2[0-289]|
5828 3[0-29]|
5829 4[01]|
5830 5[1-3]|
5831 6[013]|
5832 7[0178]|
5833 91
5834 )|
5835 8(?:
5836 0[125]|
5837 [139][1-6]|
5838 2[0157-9]|
5839 41|
5840 6[1-35]|
5841 7[1-5]|
5842 8[1-8]|
5843 90
5844 )|
5845 9(?:
5846 0[0-2]|
5847 1[0-4]|
5848 2[568]|
5849 3[3-6]|
5850 5[5-7]|
5851 6[0167]|
5852 7[15]|
5853 8[0146-9]
5854 )
5855 )\d{4}
5856 "#;
5857 assert!(parser_nest_limit(pattern, 50).parse().is_ok());
5858 }
5859
5860 // This tests that we treat a trailing `-` in a character class as a
5861 // literal `-` even when whitespace mode is enabled and there is whitespace
5862 // after the trailing `-`.
5863 #[test]
regression_455_trailing_dash_ignore_whitespace()5864 fn regression_455_trailing_dash_ignore_whitespace() {
5865 assert!(parser("(?x)[ / - ]").parse().is_ok());
5866 assert!(parser("(?x)[ a - ]").parse().is_ok());
5867 assert!(parser(
5868 "(?x)[
5869 a
5870 - ]
5871 "
5872 )
5873 .parse()
5874 .is_ok());
5875 assert!(parser(
5876 "(?x)[
5877 a # wat
5878 - ]
5879 "
5880 )
5881 .parse()
5882 .is_ok());
5883
5884 assert!(parser("(?x)[ / -").parse().is_err());
5885 assert!(parser("(?x)[ / - ").parse().is_err());
5886 assert!(parser(
5887 "(?x)[
5888 / -
5889 "
5890 )
5891 .parse()
5892 .is_err());
5893 assert!(parser(
5894 "(?x)[
5895 / - # wat
5896 "
5897 )
5898 .parse()
5899 .is_err());
5900 }
5901 }
5902