1 /*!
2 This module provides a regular expression parser.
3 */
4
5 use std::borrow::Borrow;
6 use std::cell::{Cell, RefCell};
7 use std::mem;
8 use std::result;
9
10 use ast::{self, Ast, Position, Span};
11 use either::Either;
12
13 use is_meta_character;
14
15 type Result<T> = result::Result<T, ast::Error>;
16
17 /// A primitive is an expression with no sub-expressions. This includes
18 /// literals, assertions and non-set character classes. This representation
19 /// is used as intermediate state in the parser.
20 ///
21 /// This does not include ASCII character classes, since they can only appear
22 /// within a set character class.
23 #[derive(Clone, Debug, Eq, PartialEq)]
24 enum Primitive {
25 Literal(ast::Literal),
26 Assertion(ast::Assertion),
27 Dot(Span),
28 Perl(ast::ClassPerl),
29 Unicode(ast::ClassUnicode),
30 }
31
32 impl Primitive {
33 /// Return the span of this primitive.
span(&self) -> &Span34 fn span(&self) -> &Span {
35 match *self {
36 Primitive::Literal(ref x) => &x.span,
37 Primitive::Assertion(ref x) => &x.span,
38 Primitive::Dot(ref span) => span,
39 Primitive::Perl(ref x) => &x.span,
40 Primitive::Unicode(ref x) => &x.span,
41 }
42 }
43
44 /// Convert this primitive into a proper AST.
into_ast(self) -> Ast45 fn into_ast(self) -> Ast {
46 match self {
47 Primitive::Literal(lit) => Ast::Literal(lit),
48 Primitive::Assertion(assert) => Ast::Assertion(assert),
49 Primitive::Dot(span) => Ast::Dot(span),
50 Primitive::Perl(cls) => Ast::Class(ast::Class::Perl(cls)),
51 Primitive::Unicode(cls) => Ast::Class(ast::Class::Unicode(cls)),
52 }
53 }
54
55 /// Convert this primitive into an item in a character class.
56 ///
57 /// If this primitive is not a legal item (i.e., an assertion or a dot),
58 /// then return an error.
into_class_set_item<P: Borrow<Parser>>( self, p: &ParserI<P>, ) -> Result<ast::ClassSetItem>59 fn into_class_set_item<P: Borrow<Parser>>(
60 self,
61 p: &ParserI<P>,
62 ) -> Result<ast::ClassSetItem> {
63 use self::Primitive::*;
64 use ast::ClassSetItem;
65
66 match self {
67 Literal(lit) => Ok(ClassSetItem::Literal(lit)),
68 Perl(cls) => Ok(ClassSetItem::Perl(cls)),
69 Unicode(cls) => Ok(ClassSetItem::Unicode(cls)),
70 x => Err(p.error(*x.span(), ast::ErrorKind::ClassEscapeInvalid)),
71 }
72 }
73
74 /// Convert this primitive into a literal in a character class. In
75 /// particular, literals are the only valid items that can appear in
76 /// ranges.
77 ///
78 /// If this primitive is not a legal item (i.e., a class, assertion or a
79 /// dot), then return an error.
into_class_literal<P: Borrow<Parser>>( self, p: &ParserI<P>, ) -> Result<ast::Literal>80 fn into_class_literal<P: Borrow<Parser>>(
81 self,
82 p: &ParserI<P>,
83 ) -> Result<ast::Literal> {
84 use self::Primitive::*;
85
86 match self {
87 Literal(lit) => Ok(lit),
88 x => Err(p.error(*x.span(), ast::ErrorKind::ClassRangeLiteral)),
89 }
90 }
91 }
92
93 /// Returns true if the given character is a hexadecimal digit.
is_hex(c: char) -> bool94 fn is_hex(c: char) -> bool {
95 ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F')
96 }
97
98 /// Returns true if the given character is a valid in a capture group name.
99 ///
100 /// If `first` is true, then `c` is treated as the first character in the
101 /// group name (which is not allowed to be a digit).
is_capture_char(c: char, first: bool) -> bool102 fn is_capture_char(c: char, first: bool) -> bool {
103 c == '_'
104 || (!first && c >= '0' && c <= '9')
105 || (c >= 'a' && c <= 'z')
106 || (c >= 'A' && c <= 'Z')
107 }
108
109 /// A builder for a regular expression parser.
110 ///
111 /// This builder permits modifying configuration options for the parser.
112 #[derive(Clone, Debug)]
113 pub struct ParserBuilder {
114 ignore_whitespace: bool,
115 nest_limit: u32,
116 octal: bool,
117 }
118
119 impl Default for ParserBuilder {
default() -> ParserBuilder120 fn default() -> ParserBuilder {
121 ParserBuilder::new()
122 }
123 }
124
125 impl ParserBuilder {
126 /// Create a new parser builder with a default configuration.
new() -> ParserBuilder127 pub fn new() -> ParserBuilder {
128 ParserBuilder {
129 ignore_whitespace: false,
130 nest_limit: 250,
131 octal: false,
132 }
133 }
134
135 /// Build a parser from this configuration with the given pattern.
build(&self) -> Parser136 pub fn build(&self) -> Parser {
137 Parser {
138 pos: Cell::new(Position { offset: 0, line: 1, column: 1 }),
139 capture_index: Cell::new(0),
140 nest_limit: self.nest_limit,
141 octal: self.octal,
142 initial_ignore_whitespace: self.ignore_whitespace,
143 ignore_whitespace: Cell::new(self.ignore_whitespace),
144 comments: RefCell::new(vec![]),
145 stack_group: RefCell::new(vec![]),
146 stack_class: RefCell::new(vec![]),
147 capture_names: RefCell::new(vec![]),
148 scratch: RefCell::new(String::new()),
149 }
150 }
151
152 /// Set the nesting limit for this parser.
153 ///
154 /// The nesting limit controls how deep the abstract syntax tree is allowed
155 /// to be. If the AST exceeds the given limit (e.g., with too many nested
156 /// groups), then an error is returned by the parser.
157 ///
158 /// The purpose of this limit is to act as a heuristic to prevent stack
159 /// overflow for consumers that do structural induction on an `Ast` using
160 /// explicit recursion. While this crate never does this (instead using
161 /// constant stack space and moving the call stack to the heap), other
162 /// crates may.
163 ///
164 /// This limit is not checked until the entire Ast is parsed. Therefore,
165 /// if callers want to put a limit on the amount of heap space used, then
166 /// they should impose a limit on the length, in bytes, of the concrete
167 /// pattern string. In particular, this is viable since this parser
168 /// implementation will limit itself to heap space proportional to the
169 /// lenth of the pattern string.
170 ///
171 /// Note that a nest limit of `0` will return a nest limit error for most
172 /// patterns but not all. For example, a nest limit of `0` permits `a` but
173 /// not `ab`, since `ab` requires a concatenation, which results in a nest
174 /// depth of `1`. In general, a nest limit is not something that manifests
175 /// in an obvious way in the concrete syntax, therefore, it should not be
176 /// used in a granular way.
nest_limit(&mut self, limit: u32) -> &mut ParserBuilder177 pub fn nest_limit(&mut self, limit: u32) -> &mut ParserBuilder {
178 self.nest_limit = limit;
179 self
180 }
181
182 /// Whether to support octal syntax or not.
183 ///
184 /// Octal syntax is a little-known way of uttering Unicode codepoints in
185 /// a regular expression. For example, `a`, `\x61`, `\u0061` and
186 /// `\141` are all equivalent regular expressions, where the last example
187 /// shows octal syntax.
188 ///
189 /// While supporting octal syntax isn't in and of itself a problem, it does
190 /// make good error messages harder. That is, in PCRE based regex engines,
191 /// syntax like `\0` invokes a backreference, which is explicitly
192 /// unsupported in Rust's regex engine. However, many users expect it to
193 /// be supported. Therefore, when octal support is disabled, the error
194 /// message will explicitly mention that backreferences aren't supported.
195 ///
196 /// Octal syntax is disabled by default.
octal(&mut self, yes: bool) -> &mut ParserBuilder197 pub fn octal(&mut self, yes: bool) -> &mut ParserBuilder {
198 self.octal = yes;
199 self
200 }
201
202 /// Enable verbose mode in the regular expression.
203 ///
204 /// When enabled, verbose mode permits insigificant whitespace in many
205 /// places in the regular expression, as well as comments. Comments are
206 /// started using `#` and continue until the end of the line.
207 ///
208 /// By default, this is disabled. It may be selectively enabled in the
209 /// regular expression by using the `x` flag regardless of this setting.
ignore_whitespace(&mut self, yes: bool) -> &mut ParserBuilder210 pub fn ignore_whitespace(&mut self, yes: bool) -> &mut ParserBuilder {
211 self.ignore_whitespace = yes;
212 self
213 }
214 }
215
216 /// A regular expression parser.
217 ///
218 /// This parses a string representation of a regular expression into an
219 /// abstract syntax tree. The size of the tree is proportional to the length
220 /// of the regular expression pattern.
221 ///
222 /// A `Parser` can be configured in more detail via a
223 /// [`ParserBuilder`](struct.ParserBuilder.html).
224 #[derive(Clone, Debug)]
225 pub struct Parser {
226 /// The current position of the parser.
227 pos: Cell<Position>,
228 /// The current capture index.
229 capture_index: Cell<u32>,
230 /// The maximum number of open parens/brackets allowed. If the parser
231 /// exceeds this number, then an error is returned.
232 nest_limit: u32,
233 /// Whether to support octal syntax or not. When `false`, the parser will
234 /// return an error helpfully pointing out that backreferences are not
235 /// supported.
236 octal: bool,
237 /// The initial setting for `ignore_whitespace` as provided by
238 /// Th`ParserBuilder`. is is used when reseting the parser's state.
239 initial_ignore_whitespace: bool,
240 /// Whether whitespace should be ignored. When enabled, comments are
241 /// also permitted.
242 ignore_whitespace: Cell<bool>,
243 /// A list of comments, in order of appearance.
244 comments: RefCell<Vec<ast::Comment>>,
245 /// A stack of grouped sub-expressions, including alternations.
246 stack_group: RefCell<Vec<GroupState>>,
247 /// A stack of nested character classes. This is only non-empty when
248 /// parsing a class.
249 stack_class: RefCell<Vec<ClassState>>,
250 /// A sorted sequence of capture names. This is used to detect duplicate
251 /// capture names and report an error if one is detected.
252 capture_names: RefCell<Vec<ast::CaptureName>>,
253 /// A scratch buffer used in various places. Mostly this is used to
254 /// accumulate relevant characters from parts of a pattern.
255 scratch: RefCell<String>,
256 }
257
258 /// ParserI is the internal parser implementation.
259 ///
260 /// We use this separate type so that we can carry the provided pattern string
261 /// along with us. In particular, a `Parser` internal state is not tied to any
262 /// one pattern, but `ParserI` is.
263 ///
264 /// This type also lets us use `ParserI<&Parser>` in production code while
265 /// retaining the convenience of `ParserI<Parser>` for tests, which sometimes
266 /// work against the internal interface of the parser.
267 #[derive(Clone, Debug)]
268 struct ParserI<'s, P> {
269 /// The parser state/configuration.
270 parser: P,
271 /// The full regular expression provided by the user.
272 pattern: &'s str,
273 }
274
275 /// GroupState represents a single stack frame while parsing nested groups
276 /// and alternations. Each frame records the state up to an opening parenthesis
277 /// or a alternating bracket `|`.
278 #[derive(Clone, Debug)]
279 enum GroupState {
280 /// This state is pushed whenever an opening group is found.
281 Group {
282 /// The concatenation immediately preceding the opening group.
283 concat: ast::Concat,
284 /// The group that has been opened. Its sub-AST is always empty.
285 group: ast::Group,
286 /// Whether this group has the `x` flag enabled or not.
287 ignore_whitespace: bool,
288 },
289 /// This state is pushed whenever a new alternation branch is found. If
290 /// an alternation branch is found and this state is at the top of the
291 /// stack, then this state should be modified to include the new
292 /// alternation.
293 Alternation(ast::Alternation),
294 }
295
296 /// ClassState represents a single stack frame while parsing character classes.
297 /// Each frame records the state up to an intersection, difference, symmetric
298 /// difference or nested class.
299 ///
300 /// Note that a parser's character class stack is only non-empty when parsing
301 /// a character class. In all other cases, it is empty.
302 #[derive(Clone, Debug)]
303 enum ClassState {
304 /// This state is pushed whenever an opening bracket is found.
305 Open {
306 /// The union of class items immediately preceding this class.
307 union: ast::ClassSetUnion,
308 /// The class that has been opened. Typically this just corresponds
309 /// to the `[`, but it can also include `[^` since `^` indicates
310 /// negation of the class.
311 set: ast::ClassBracketed,
312 },
313 /// This state is pushed when a operator is seen. When popped, the stored
314 /// set becomes the left hand side of the operator.
315 Op {
316 /// The type of the operation, i.e., &&, -- or ~~.
317 kind: ast::ClassSetBinaryOpKind,
318 /// The left-hand side of the operator.
319 lhs: ast::ClassSet,
320 },
321 }
322
323 impl Parser {
324 /// Create a new parser with a default configuration.
325 ///
326 /// The parser can be run with either the `parse` or `parse_with_comments`
327 /// methods. The parse methods return an abstract syntax tree.
328 ///
329 /// To set configuration options on the parser, use
330 /// [`ParserBuilder`](struct.ParserBuilder.html).
new() -> Parser331 pub fn new() -> Parser {
332 ParserBuilder::new().build()
333 }
334
335 /// Parse the regular expression into an abstract syntax tree.
parse(&mut self, pattern: &str) -> Result<Ast>336 pub fn parse(&mut self, pattern: &str) -> Result<Ast> {
337 ParserI::new(self, pattern).parse()
338 }
339
340 /// Parse the regular expression and return an abstract syntax tree with
341 /// all of the comments found in the pattern.
parse_with_comments( &mut self, pattern: &str, ) -> Result<ast::WithComments>342 pub fn parse_with_comments(
343 &mut self,
344 pattern: &str,
345 ) -> Result<ast::WithComments> {
346 ParserI::new(self, pattern).parse_with_comments()
347 }
348
349 /// Reset the internal state of a parser.
350 ///
351 /// This is called at the beginning of every parse. This prevents the
352 /// parser from running with inconsistent state (say, if a previous
353 /// invocation returned an error and the parser is reused).
reset(&self)354 fn reset(&self) {
355 // These settings should be in line with the construction
356 // in `ParserBuilder::build`.
357 self.pos.set(Position { offset: 0, line: 1, column: 1 });
358 self.ignore_whitespace.set(self.initial_ignore_whitespace);
359 self.comments.borrow_mut().clear();
360 self.stack_group.borrow_mut().clear();
361 self.stack_class.borrow_mut().clear();
362 }
363 }
364
365 impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
366 /// Build an internal parser from a parser configuration and a pattern.
new(parser: P, pattern: &'s str) -> ParserI<'s, P>367 fn new(parser: P, pattern: &'s str) -> ParserI<'s, P> {
368 ParserI { parser: parser, pattern: pattern }
369 }
370
371 /// Return a reference to the parser state.
parser(&self) -> &Parser372 fn parser(&self) -> &Parser {
373 self.parser.borrow()
374 }
375
376 /// Return a reference to the pattern being parsed.
pattern(&self) -> &str377 fn pattern(&self) -> &str {
378 self.pattern.borrow()
379 }
380
381 /// Create a new error with the given span and error type.
error(&self, span: Span, kind: ast::ErrorKind) -> ast::Error382 fn error(&self, span: Span, kind: ast::ErrorKind) -> ast::Error {
383 ast::Error {
384 kind: kind,
385 pattern: self.pattern().to_string(),
386 span: span,
387 }
388 }
389
390 /// Return the current offset of the parser.
391 ///
392 /// The offset starts at `0` from the beginning of the regular expression
393 /// pattern string.
offset(&self) -> usize394 fn offset(&self) -> usize {
395 self.parser().pos.get().offset
396 }
397
398 /// Return the current line number of the parser.
399 ///
400 /// The line number starts at `1`.
line(&self) -> usize401 fn line(&self) -> usize {
402 self.parser().pos.get().line
403 }
404
405 /// Return the current column of the parser.
406 ///
407 /// The column number starts at `1` and is reset whenever a `\n` is seen.
column(&self) -> usize408 fn column(&self) -> usize {
409 self.parser().pos.get().column
410 }
411
412 /// Return the next capturing index. Each subsequent call increments the
413 /// internal index.
414 ///
415 /// The span given should correspond to the location of the opening
416 /// parenthesis.
417 ///
418 /// If the capture limit is exceeded, then an error is returned.
next_capture_index(&self, span: Span) -> Result<u32>419 fn next_capture_index(&self, span: Span) -> Result<u32> {
420 let current = self.parser().capture_index.get();
421 let i = current.checked_add(1).ok_or_else(|| {
422 self.error(span, ast::ErrorKind::CaptureLimitExceeded)
423 })?;
424 self.parser().capture_index.set(i);
425 Ok(i)
426 }
427
428 /// Adds the given capture name to this parser. If this capture name has
429 /// already been used, then an error is returned.
add_capture_name(&self, cap: &ast::CaptureName) -> Result<()>430 fn add_capture_name(&self, cap: &ast::CaptureName) -> Result<()> {
431 let mut names = self.parser().capture_names.borrow_mut();
432 match names
433 .binary_search_by_key(&cap.name.as_str(), |c| c.name.as_str())
434 {
435 Err(i) => {
436 names.insert(i, cap.clone());
437 Ok(())
438 }
439 Ok(i) => Err(self.error(
440 cap.span,
441 ast::ErrorKind::GroupNameDuplicate { original: names[i].span },
442 )),
443 }
444 }
445
446 /// Return whether the parser should ignore whitespace or not.
ignore_whitespace(&self) -> bool447 fn ignore_whitespace(&self) -> bool {
448 self.parser().ignore_whitespace.get()
449 }
450
451 /// Return the character at the current position of the parser.
452 ///
453 /// This panics if the current position does not point to a valid char.
char(&self) -> char454 fn char(&self) -> char {
455 self.char_at(self.offset())
456 }
457
458 /// Return the character at the given position.
459 ///
460 /// This panics if the given position does not point to a valid char.
char_at(&self, i: usize) -> char461 fn char_at(&self, i: usize) -> char {
462 self.pattern()[i..]
463 .chars()
464 .next()
465 .unwrap_or_else(|| panic!("expected char at offset {}", i))
466 }
467
468 /// Bump the parser to the next Unicode scalar value.
469 ///
470 /// If the end of the input has been reached, then `false` is returned.
bump(&self) -> bool471 fn bump(&self) -> bool {
472 if self.is_eof() {
473 return false;
474 }
475 let Position { mut offset, mut line, mut column } = self.pos();
476 if self.char() == '\n' {
477 line = line.checked_add(1).unwrap();
478 column = 1;
479 } else {
480 column = column.checked_add(1).unwrap();
481 }
482 offset += self.char().len_utf8();
483 self.parser().pos.set(Position {
484 offset: offset,
485 line: line,
486 column: column,
487 });
488 self.pattern()[self.offset()..].chars().next().is_some()
489 }
490
491 /// If the substring starting at the current position of the parser has
492 /// the given prefix, then bump the parser to the character immediately
493 /// following the prefix and return true. Otherwise, don't bump the parser
494 /// and return false.
bump_if(&self, prefix: &str) -> bool495 fn bump_if(&self, prefix: &str) -> bool {
496 if self.pattern()[self.offset()..].starts_with(prefix) {
497 for _ in 0..prefix.chars().count() {
498 self.bump();
499 }
500 true
501 } else {
502 false
503 }
504 }
505
506 /// Returns true if and only if the parser is positioned at a look-around
507 /// prefix. The conditions under which this returns true must always
508 /// correspond to a regular expression that would otherwise be consider
509 /// invalid.
510 ///
511 /// This should only be called immediately after parsing the opening of
512 /// a group or a set of flags.
is_lookaround_prefix(&self) -> bool513 fn is_lookaround_prefix(&self) -> bool {
514 self.bump_if("?=")
515 || self.bump_if("?!")
516 || self.bump_if("?<=")
517 || self.bump_if("?<!")
518 }
519
520 /// Bump the parser, and if the `x` flag is enabled, bump through any
521 /// subsequent spaces. Return true if and only if the parser is not at
522 /// EOF.
bump_and_bump_space(&self) -> bool523 fn bump_and_bump_space(&self) -> bool {
524 if !self.bump() {
525 return false;
526 }
527 self.bump_space();
528 !self.is_eof()
529 }
530
531 /// If the `x` flag is enabled (i.e., whitespace insensitivity with
532 /// comments), then this will advance the parser through all whitespace
533 /// and comments to the next non-whitespace non-comment byte.
534 ///
535 /// If the `x` flag is disabled, then this is a no-op.
536 ///
537 /// This should be used selectively throughout the parser where
538 /// arbitrary whitespace is permitted when the `x` flag is enabled. For
539 /// example, `{ 5 , 6}` is equivalent to `{5,6}`.
bump_space(&self)540 fn bump_space(&self) {
541 if !self.ignore_whitespace() {
542 return;
543 }
544 while !self.is_eof() {
545 if self.char().is_whitespace() {
546 self.bump();
547 } else if self.char() == '#' {
548 let start = self.pos();
549 let mut comment_text = String::new();
550 self.bump();
551 while !self.is_eof() {
552 let c = self.char();
553 self.bump();
554 if c == '\n' {
555 break;
556 }
557 comment_text.push(c);
558 }
559 let comment = ast::Comment {
560 span: Span::new(start, self.pos()),
561 comment: comment_text,
562 };
563 self.parser().comments.borrow_mut().push(comment);
564 } else {
565 break;
566 }
567 }
568 }
569
570 /// Peek at the next character in the input without advancing the parser.
571 ///
572 /// If the input has been exhausted, then this returns `None`.
peek(&self) -> Option<char>573 fn peek(&self) -> Option<char> {
574 if self.is_eof() {
575 return None;
576 }
577 self.pattern()[self.offset() + self.char().len_utf8()..].chars().next()
578 }
579
580 /// Like peek, but will ignore spaces when the parser is in whitespace
581 /// insensitive mode.
peek_space(&self) -> Option<char>582 fn peek_space(&self) -> Option<char> {
583 if !self.ignore_whitespace() {
584 return self.peek();
585 }
586 if self.is_eof() {
587 return None;
588 }
589 let mut start = self.offset() + self.char().len_utf8();
590 let mut in_comment = false;
591 for (i, c) in self.pattern()[start..].char_indices() {
592 if c.is_whitespace() {
593 continue;
594 } else if !in_comment && c == '#' {
595 in_comment = true;
596 } else if in_comment && c == '\n' {
597 in_comment = false;
598 } else {
599 start += i;
600 break;
601 }
602 }
603 self.pattern()[start..].chars().next()
604 }
605
606 /// Returns true if the next call to `bump` would return false.
is_eof(&self) -> bool607 fn is_eof(&self) -> bool {
608 self.offset() == self.pattern().len()
609 }
610
611 /// Return the current position of the parser, which includes the offset,
612 /// line and column.
pos(&self) -> Position613 fn pos(&self) -> Position {
614 self.parser().pos.get()
615 }
616
617 /// Create a span at the current position of the parser. Both the start
618 /// and end of the span are set.
span(&self) -> Span619 fn span(&self) -> Span {
620 Span::splat(self.pos())
621 }
622
623 /// Create a span that covers the current character.
span_char(&self) -> Span624 fn span_char(&self) -> Span {
625 let mut next = Position {
626 offset: self.offset().checked_add(self.char().len_utf8()).unwrap(),
627 line: self.line(),
628 column: self.column().checked_add(1).unwrap(),
629 };
630 if self.char() == '\n' {
631 next.line += 1;
632 next.column = 1;
633 }
634 Span::new(self.pos(), next)
635 }
636
637 /// Parse and push a single alternation on to the parser's internal stack.
638 /// If the top of the stack already has an alternation, then add to that
639 /// instead of pushing a new one.
640 ///
641 /// The concatenation given corresponds to a single alternation branch.
642 /// The concatenation returned starts the next branch and is empty.
643 ///
644 /// This assumes the parser is currently positioned at `|` and will advance
645 /// the parser to the character following `|`.
646 #[inline(never)]
push_alternate(&self, mut concat: ast::Concat) -> Result<ast::Concat>647 fn push_alternate(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
648 assert_eq!(self.char(), '|');
649 concat.span.end = self.pos();
650 self.push_or_add_alternation(concat);
651 self.bump();
652 Ok(ast::Concat { span: self.span(), asts: vec![] })
653 }
654
655 /// Pushes or adds the given branch of an alternation to the parser's
656 /// internal stack of state.
push_or_add_alternation(&self, concat: ast::Concat)657 fn push_or_add_alternation(&self, concat: ast::Concat) {
658 use self::GroupState::*;
659
660 let mut stack = self.parser().stack_group.borrow_mut();
661 if let Some(&mut Alternation(ref mut alts)) = stack.last_mut() {
662 alts.asts.push(concat.into_ast());
663 return;
664 }
665 stack.push(Alternation(ast::Alternation {
666 span: Span::new(concat.span.start, self.pos()),
667 asts: vec![concat.into_ast()],
668 }));
669 }
670
671 /// Parse and push a group AST (and its parent concatenation) on to the
672 /// parser's internal stack. Return a fresh concatenation corresponding
673 /// to the group's sub-AST.
674 ///
675 /// If a set of flags was found (with no group), then the concatenation
676 /// is returned with that set of flags added.
677 ///
678 /// This assumes that the parser is currently positioned on the opening
679 /// parenthesis. It advances the parser to the character at the start
680 /// of the sub-expression (or adjoining expression).
681 ///
682 /// If there was a problem parsing the start of the group, then an error
683 /// is returned.
684 #[inline(never)]
push_group(&self, mut concat: ast::Concat) -> Result<ast::Concat>685 fn push_group(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
686 assert_eq!(self.char(), '(');
687 match self.parse_group()? {
688 Either::Left(set) => {
689 let ignore = set.flags.flag_state(ast::Flag::IgnoreWhitespace);
690 if let Some(v) = ignore {
691 self.parser().ignore_whitespace.set(v);
692 }
693
694 concat.asts.push(Ast::Flags(set));
695 Ok(concat)
696 }
697 Either::Right(group) => {
698 let old_ignore_whitespace = self.ignore_whitespace();
699 let new_ignore_whitespace = group
700 .flags()
701 .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
702 .unwrap_or(old_ignore_whitespace);
703 self.parser().stack_group.borrow_mut().push(
704 GroupState::Group {
705 concat: concat,
706 group: group,
707 ignore_whitespace: old_ignore_whitespace,
708 },
709 );
710 self.parser().ignore_whitespace.set(new_ignore_whitespace);
711 Ok(ast::Concat { span: self.span(), asts: vec![] })
712 }
713 }
714 }
715
716 /// Pop a group AST from the parser's internal stack and set the group's
717 /// AST to the given concatenation. Return the concatenation containing
718 /// the group.
719 ///
720 /// This assumes that the parser is currently positioned on the closing
721 /// parenthesis and advances the parser to the character following the `)`.
722 ///
723 /// If no such group could be popped, then an unopened group error is
724 /// returned.
725 #[inline(never)]
pop_group(&self, mut group_concat: ast::Concat) -> Result<ast::Concat>726 fn pop_group(&self, mut group_concat: ast::Concat) -> Result<ast::Concat> {
727 use self::GroupState::*;
728
729 assert_eq!(self.char(), ')');
730 let mut stack = self.parser().stack_group.borrow_mut();
731 let (mut prior_concat, mut group, ignore_whitespace, alt) = match stack
732 .pop()
733 {
734 Some(Group { concat, group, ignore_whitespace }) => {
735 (concat, group, ignore_whitespace, None)
736 }
737 Some(Alternation(alt)) => match stack.pop() {
738 Some(Group { concat, group, ignore_whitespace }) => {
739 (concat, group, ignore_whitespace, Some(alt))
740 }
741 None | Some(Alternation(_)) => {
742 return Err(self.error(
743 self.span_char(),
744 ast::ErrorKind::GroupUnopened,
745 ));
746 }
747 },
748 None => {
749 return Err(self
750 .error(self.span_char(), ast::ErrorKind::GroupUnopened));
751 }
752 };
753 self.parser().ignore_whitespace.set(ignore_whitespace);
754 group_concat.span.end = self.pos();
755 self.bump();
756 group.span.end = self.pos();
757 match alt {
758 Some(mut alt) => {
759 alt.span.end = group_concat.span.end;
760 alt.asts.push(group_concat.into_ast());
761 group.ast = Box::new(alt.into_ast());
762 }
763 None => {
764 group.ast = Box::new(group_concat.into_ast());
765 }
766 }
767 prior_concat.asts.push(Ast::Group(group));
768 Ok(prior_concat)
769 }
770
771 /// Pop the last state from the parser's internal stack, if it exists, and
772 /// add the given concatenation to it. There either must be no state or a
773 /// single alternation item on the stack. Any other scenario produces an
774 /// error.
775 ///
776 /// This assumes that the parser has advanced to the end.
777 #[inline(never)]
pop_group_end(&self, mut concat: ast::Concat) -> Result<Ast>778 fn pop_group_end(&self, mut concat: ast::Concat) -> Result<Ast> {
779 concat.span.end = self.pos();
780 let mut stack = self.parser().stack_group.borrow_mut();
781 let ast = match stack.pop() {
782 None => Ok(concat.into_ast()),
783 Some(GroupState::Alternation(mut alt)) => {
784 alt.span.end = self.pos();
785 alt.asts.push(concat.into_ast());
786 Ok(Ast::Alternation(alt))
787 }
788 Some(GroupState::Group { group, .. }) => {
789 return Err(
790 self.error(group.span, ast::ErrorKind::GroupUnclosed)
791 );
792 }
793 };
794 // If we try to pop again, there should be nothing.
795 match stack.pop() {
796 None => ast,
797 Some(GroupState::Alternation(_)) => {
798 // This unreachable is unfortunate. This case can't happen
799 // because the only way we can be here is if there were two
800 // `GroupState::Alternation`s adjacent in the parser's stack,
801 // which we guarantee to never happen because we never push a
802 // `GroupState::Alternation` if one is already at the top of
803 // the stack.
804 unreachable!()
805 }
806 Some(GroupState::Group { group, .. }) => {
807 Err(self.error(group.span, ast::ErrorKind::GroupUnclosed))
808 }
809 }
810 }
811
812 /// Parse the opening of a character class and push the current class
813 /// parsing context onto the parser's stack. This assumes that the parser
814 /// is positioned at an opening `[`. The given union should correspond to
815 /// the union of set items built up before seeing the `[`.
816 ///
817 /// If there was a problem parsing the opening of the class, then an error
818 /// is returned. Otherwise, a new union of set items for the class is
819 /// returned (which may be populated with either a `]` or a `-`).
820 #[inline(never)]
push_class_open( &self, parent_union: ast::ClassSetUnion, ) -> Result<ast::ClassSetUnion>821 fn push_class_open(
822 &self,
823 parent_union: ast::ClassSetUnion,
824 ) -> Result<ast::ClassSetUnion> {
825 assert_eq!(self.char(), '[');
826
827 let (nested_set, nested_union) = self.parse_set_class_open()?;
828 self.parser()
829 .stack_class
830 .borrow_mut()
831 .push(ClassState::Open { union: parent_union, set: nested_set });
832 Ok(nested_union)
833 }
834
835 /// Parse the end of a character class set and pop the character class
836 /// parser stack. The union given corresponds to the last union built
837 /// before seeing the closing `]`. The union returned corresponds to the
838 /// parent character class set with the nested class added to it.
839 ///
840 /// This assumes that the parser is positioned at a `]` and will advance
841 /// the parser to the byte immediately following the `]`.
842 ///
843 /// If the stack is empty after popping, then this returns the final
844 /// "top-level" character class AST (where a "top-level" character class
845 /// is one that is not nested inside any other character class).
846 ///
847 /// If there is no corresponding opening bracket on the parser's stack,
848 /// then an error is returned.
849 #[inline(never)]
pop_class( &self, nested_union: ast::ClassSetUnion, ) -> Result<Either<ast::ClassSetUnion, ast::Class>>850 fn pop_class(
851 &self,
852 nested_union: ast::ClassSetUnion,
853 ) -> Result<Either<ast::ClassSetUnion, ast::Class>> {
854 assert_eq!(self.char(), ']');
855
856 let item = ast::ClassSet::Item(nested_union.into_item());
857 let prevset = self.pop_class_op(item);
858 let mut stack = self.parser().stack_class.borrow_mut();
859 match stack.pop() {
860 None => {
861 // We can never observe an empty stack:
862 //
863 // 1) We are guaranteed to start with a non-empty stack since
864 // the character class parser is only initiated when it sees
865 // a `[`.
866 // 2) If we ever observe an empty stack while popping after
867 // seeing a `]`, then we signal the character class parser
868 // to terminate.
869 panic!("unexpected empty character class stack")
870 }
871 Some(ClassState::Op { .. }) => {
872 // This panic is unfortunate, but this case is impossible
873 // since we already popped the Op state if one exists above.
874 // Namely, every push to the class parser stack is guarded by
875 // whether an existing Op is already on the top of the stack.
876 // If it is, the existing Op is modified. That is, the stack
877 // can never have consecutive Op states.
878 panic!("unexpected ClassState::Op")
879 }
880 Some(ClassState::Open { mut union, mut set }) => {
881 self.bump();
882 set.span.end = self.pos();
883 set.kind = prevset;
884 if stack.is_empty() {
885 Ok(Either::Right(ast::Class::Bracketed(set)))
886 } else {
887 union.push(ast::ClassSetItem::Bracketed(Box::new(set)));
888 Ok(Either::Left(union))
889 }
890 }
891 }
892 }
893
894 /// Return an "unclosed class" error whose span points to the most
895 /// recently opened class.
896 ///
897 /// This should only be called while parsing a character class.
898 #[inline(never)]
unclosed_class_error(&self) -> ast::Error899 fn unclosed_class_error(&self) -> ast::Error {
900 for state in self.parser().stack_class.borrow().iter().rev() {
901 match *state {
902 ClassState::Open { ref set, .. } => {
903 return self
904 .error(set.span, ast::ErrorKind::ClassUnclosed);
905 }
906 _ => {}
907 }
908 }
909 // We are guaranteed to have a non-empty stack with at least
910 // one open bracket, so we should never get here.
911 panic!("no open character class found")
912 }
913
914 /// Push the current set of class items on to the class parser's stack as
915 /// the left hand side of the given operator.
916 ///
917 /// A fresh set union is returned, which should be used to build the right
918 /// hand side of this operator.
919 #[inline(never)]
push_class_op( &self, next_kind: ast::ClassSetBinaryOpKind, next_union: ast::ClassSetUnion, ) -> ast::ClassSetUnion920 fn push_class_op(
921 &self,
922 next_kind: ast::ClassSetBinaryOpKind,
923 next_union: ast::ClassSetUnion,
924 ) -> ast::ClassSetUnion {
925 let item = ast::ClassSet::Item(next_union.into_item());
926 let new_lhs = self.pop_class_op(item);
927 self.parser()
928 .stack_class
929 .borrow_mut()
930 .push(ClassState::Op { kind: next_kind, lhs: new_lhs });
931 ast::ClassSetUnion { span: self.span(), items: vec![] }
932 }
933
934 /// Pop a character class set from the character class parser stack. If the
935 /// top of the stack is just an item (not an operation), then return the
936 /// given set unchanged. If the top of the stack is an operation, then the
937 /// given set will be used as the rhs of the operation on the top of the
938 /// stack. In that case, the binary operation is returned as a set.
939 #[inline(never)]
pop_class_op(&self, rhs: ast::ClassSet) -> ast::ClassSet940 fn pop_class_op(&self, rhs: ast::ClassSet) -> ast::ClassSet {
941 let mut stack = self.parser().stack_class.borrow_mut();
942 let (kind, lhs) = match stack.pop() {
943 Some(ClassState::Op { kind, lhs }) => (kind, lhs),
944 Some(state @ ClassState::Open { .. }) => {
945 stack.push(state);
946 return rhs;
947 }
948 None => unreachable!(),
949 };
950 let span = Span::new(lhs.span().start, rhs.span().end);
951 ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp {
952 span: span,
953 kind: kind,
954 lhs: Box::new(lhs),
955 rhs: Box::new(rhs),
956 })
957 }
958 }
959
960 impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
961 /// Parse the regular expression into an abstract syntax tree.
parse(&self) -> Result<Ast>962 fn parse(&self) -> Result<Ast> {
963 self.parse_with_comments().map(|astc| astc.ast)
964 }
965
966 /// Parse the regular expression and return an abstract syntax tree with
967 /// all of the comments found in the pattern.
parse_with_comments(&self) -> Result<ast::WithComments>968 fn parse_with_comments(&self) -> Result<ast::WithComments> {
969 assert_eq!(self.offset(), 0, "parser can only be used once");
970 self.parser().reset();
971 let mut concat = ast::Concat { span: self.span(), asts: vec![] };
972 loop {
973 self.bump_space();
974 if self.is_eof() {
975 break;
976 }
977 match self.char() {
978 '(' => concat = self.push_group(concat)?,
979 ')' => concat = self.pop_group(concat)?,
980 '|' => concat = self.push_alternate(concat)?,
981 '[' => {
982 let class = self.parse_set_class()?;
983 concat.asts.push(Ast::Class(class));
984 }
985 '?' => {
986 concat = self.parse_uncounted_repetition(
987 concat,
988 ast::RepetitionKind::ZeroOrOne,
989 )?;
990 }
991 '*' => {
992 concat = self.parse_uncounted_repetition(
993 concat,
994 ast::RepetitionKind::ZeroOrMore,
995 )?;
996 }
997 '+' => {
998 concat = self.parse_uncounted_repetition(
999 concat,
1000 ast::RepetitionKind::OneOrMore,
1001 )?;
1002 }
1003 '{' => {
1004 concat = self.parse_counted_repetition(concat)?;
1005 }
1006 _ => concat.asts.push(self.parse_primitive()?.into_ast()),
1007 }
1008 }
1009 let ast = self.pop_group_end(concat)?;
1010 NestLimiter::new(self).check(&ast)?;
1011 Ok(ast::WithComments {
1012 ast: ast,
1013 comments: mem::replace(
1014 &mut *self.parser().comments.borrow_mut(),
1015 vec![],
1016 ),
1017 })
1018 }
1019
1020 /// Parses an uncounted repetition operation. An uncounted repetition
1021 /// operator includes ?, * and +, but does not include the {m,n} syntax.
1022 /// The given `kind` should correspond to the operator observed by the
1023 /// caller.
1024 ///
1025 /// This assumes that the paser is currently positioned at the repetition
1026 /// operator and advances the parser to the first character after the
1027 /// operator. (Note that the operator may include a single additional `?`,
1028 /// which makes the operator ungreedy.)
1029 ///
1030 /// The caller should include the concatenation that is being built. The
1031 /// concatenation returned includes the repetition operator applied to the
1032 /// last expression in the given concatenation.
1033 #[inline(never)]
parse_uncounted_repetition( &self, mut concat: ast::Concat, kind: ast::RepetitionKind, ) -> Result<ast::Concat>1034 fn parse_uncounted_repetition(
1035 &self,
1036 mut concat: ast::Concat,
1037 kind: ast::RepetitionKind,
1038 ) -> Result<ast::Concat> {
1039 assert!(
1040 self.char() == '?' || self.char() == '*' || self.char() == '+'
1041 );
1042 let op_start = self.pos();
1043 let ast = match concat.asts.pop() {
1044 Some(ast) => ast,
1045 None => {
1046 return Err(
1047 self.error(self.span(), ast::ErrorKind::RepetitionMissing)
1048 )
1049 }
1050 };
1051 match ast {
1052 Ast::Empty(_) | Ast::Flags(_) => {
1053 return Err(
1054 self.error(self.span(), ast::ErrorKind::RepetitionMissing)
1055 )
1056 }
1057 _ => {}
1058 }
1059 let mut greedy = true;
1060 if self.bump() && self.char() == '?' {
1061 greedy = false;
1062 self.bump();
1063 }
1064 concat.asts.push(Ast::Repetition(ast::Repetition {
1065 span: ast.span().with_end(self.pos()),
1066 op: ast::RepetitionOp {
1067 span: Span::new(op_start, self.pos()),
1068 kind: kind,
1069 },
1070 greedy: greedy,
1071 ast: Box::new(ast),
1072 }));
1073 Ok(concat)
1074 }
1075
1076 /// Parses a counted repetition operation. A counted repetition operator
1077 /// corresponds to the {m,n} syntax, and does not include the ?, * or +
1078 /// operators.
1079 ///
1080 /// This assumes that the paser is currently positioned at the opening `{`
1081 /// and advances the parser to the first character after the operator.
1082 /// (Note that the operator may include a single additional `?`, which
1083 /// makes the operator ungreedy.)
1084 ///
1085 /// The caller should include the concatenation that is being built. The
1086 /// concatenation returned includes the repetition operator applied to the
1087 /// last expression in the given concatenation.
1088 #[inline(never)]
parse_counted_repetition( &self, mut concat: ast::Concat, ) -> Result<ast::Concat>1089 fn parse_counted_repetition(
1090 &self,
1091 mut concat: ast::Concat,
1092 ) -> Result<ast::Concat> {
1093 assert!(self.char() == '{');
1094 let start = self.pos();
1095 let ast = match concat.asts.pop() {
1096 Some(ast) => ast,
1097 None => {
1098 return Err(
1099 self.error(self.span(), ast::ErrorKind::RepetitionMissing)
1100 )
1101 }
1102 };
1103 match ast {
1104 Ast::Empty(_) | Ast::Flags(_) => {
1105 return Err(
1106 self.error(self.span(), ast::ErrorKind::RepetitionMissing)
1107 )
1108 }
1109 _ => {}
1110 }
1111 if !self.bump_and_bump_space() {
1112 return Err(self.error(
1113 Span::new(start, self.pos()),
1114 ast::ErrorKind::RepetitionCountUnclosed,
1115 ));
1116 }
1117 let count_start = specialize_err(
1118 self.parse_decimal(),
1119 ast::ErrorKind::DecimalEmpty,
1120 ast::ErrorKind::RepetitionCountDecimalEmpty,
1121 )?;
1122 let mut range = ast::RepetitionRange::Exactly(count_start);
1123 if self.is_eof() {
1124 return Err(self.error(
1125 Span::new(start, self.pos()),
1126 ast::ErrorKind::RepetitionCountUnclosed,
1127 ));
1128 }
1129 if self.char() == ',' {
1130 if !self.bump_and_bump_space() {
1131 return Err(self.error(
1132 Span::new(start, self.pos()),
1133 ast::ErrorKind::RepetitionCountUnclosed,
1134 ));
1135 }
1136 if self.char() != '}' {
1137 let count_end = specialize_err(
1138 self.parse_decimal(),
1139 ast::ErrorKind::DecimalEmpty,
1140 ast::ErrorKind::RepetitionCountDecimalEmpty,
1141 )?;
1142 range = ast::RepetitionRange::Bounded(count_start, count_end);
1143 } else {
1144 range = ast::RepetitionRange::AtLeast(count_start);
1145 }
1146 }
1147 if self.is_eof() || self.char() != '}' {
1148 return Err(self.error(
1149 Span::new(start, self.pos()),
1150 ast::ErrorKind::RepetitionCountUnclosed,
1151 ));
1152 }
1153
1154 let mut greedy = true;
1155 if self.bump_and_bump_space() && self.char() == '?' {
1156 greedy = false;
1157 self.bump();
1158 }
1159
1160 let op_span = Span::new(start, self.pos());
1161 if !range.is_valid() {
1162 return Err(
1163 self.error(op_span, ast::ErrorKind::RepetitionCountInvalid)
1164 );
1165 }
1166 concat.asts.push(Ast::Repetition(ast::Repetition {
1167 span: ast.span().with_end(self.pos()),
1168 op: ast::RepetitionOp {
1169 span: op_span,
1170 kind: ast::RepetitionKind::Range(range),
1171 },
1172 greedy: greedy,
1173 ast: Box::new(ast),
1174 }));
1175 Ok(concat)
1176 }
1177
1178 /// Parse a group (which contains a sub-expression) or a set of flags.
1179 ///
1180 /// If a group was found, then it is returned with an empty AST. If a set
1181 /// of flags is found, then that set is returned.
1182 ///
1183 /// The parser should be positioned at the opening parenthesis.
1184 ///
1185 /// This advances the parser to the character before the start of the
1186 /// sub-expression (in the case of a group) or to the closing parenthesis
1187 /// immediately following the set of flags.
1188 ///
1189 /// # Errors
1190 ///
1191 /// If flags are given and incorrectly specified, then a corresponding
1192 /// error is returned.
1193 ///
1194 /// If a capture name is given and it is incorrectly specified, then a
1195 /// corresponding error is returned.
1196 #[inline(never)]
parse_group(&self) -> Result<Either<ast::SetFlags, ast::Group>>1197 fn parse_group(&self) -> Result<Either<ast::SetFlags, ast::Group>> {
1198 assert_eq!(self.char(), '(');
1199 let open_span = self.span_char();
1200 self.bump();
1201 self.bump_space();
1202 if self.is_lookaround_prefix() {
1203 return Err(self.error(
1204 Span::new(open_span.start, self.span().end),
1205 ast::ErrorKind::UnsupportedLookAround,
1206 ));
1207 }
1208 let inner_span = self.span();
1209 if self.bump_if("?P<") {
1210 let capture_index = self.next_capture_index(open_span)?;
1211 let cap = self.parse_capture_name(capture_index)?;
1212 Ok(Either::Right(ast::Group {
1213 span: open_span,
1214 kind: ast::GroupKind::CaptureName(cap),
1215 ast: Box::new(Ast::Empty(self.span())),
1216 }))
1217 } else if self.bump_if("?") {
1218 if self.is_eof() {
1219 return Err(
1220 self.error(open_span, ast::ErrorKind::GroupUnclosed)
1221 );
1222 }
1223 let flags = self.parse_flags()?;
1224 let char_end = self.char();
1225 self.bump();
1226 if char_end == ')' {
1227 // We don't allow empty flags, e.g., `(?)`. We instead
1228 // interpret it as a repetition operator missing its argument.
1229 if flags.items.is_empty() {
1230 return Err(self.error(
1231 inner_span,
1232 ast::ErrorKind::RepetitionMissing,
1233 ));
1234 }
1235 Ok(Either::Left(ast::SetFlags {
1236 span: Span { end: self.pos(), ..open_span },
1237 flags: flags,
1238 }))
1239 } else {
1240 assert_eq!(char_end, ':');
1241 Ok(Either::Right(ast::Group {
1242 span: open_span,
1243 kind: ast::GroupKind::NonCapturing(flags),
1244 ast: Box::new(Ast::Empty(self.span())),
1245 }))
1246 }
1247 } else {
1248 let capture_index = self.next_capture_index(open_span)?;
1249 Ok(Either::Right(ast::Group {
1250 span: open_span,
1251 kind: ast::GroupKind::CaptureIndex(capture_index),
1252 ast: Box::new(Ast::Empty(self.span())),
1253 }))
1254 }
1255 }
1256
1257 /// Parses a capture group name. Assumes that the parser is positioned at
1258 /// the first character in the name following the opening `<` (and may
1259 /// possibly be EOF). This advances the parser to the first character
1260 /// following the closing `>`.
1261 ///
1262 /// The caller must provide the capture index of the group for this name.
1263 #[inline(never)]
parse_capture_name( &self, capture_index: u32, ) -> Result<ast::CaptureName>1264 fn parse_capture_name(
1265 &self,
1266 capture_index: u32,
1267 ) -> Result<ast::CaptureName> {
1268 if self.is_eof() {
1269 return Err(self
1270 .error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
1271 }
1272 let start = self.pos();
1273 loop {
1274 if self.char() == '>' {
1275 break;
1276 }
1277 if !is_capture_char(self.char(), self.pos() == start) {
1278 return Err(self.error(
1279 self.span_char(),
1280 ast::ErrorKind::GroupNameInvalid,
1281 ));
1282 }
1283 if !self.bump() {
1284 break;
1285 }
1286 }
1287 let end = self.pos();
1288 if self.is_eof() {
1289 return Err(self
1290 .error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
1291 }
1292 assert_eq!(self.char(), '>');
1293 self.bump();
1294 let name = &self.pattern()[start.offset..end.offset];
1295 if name.is_empty() {
1296 return Err(self.error(
1297 Span::new(start, start),
1298 ast::ErrorKind::GroupNameEmpty,
1299 ));
1300 }
1301 let capname = ast::CaptureName {
1302 span: Span::new(start, end),
1303 name: name.to_string(),
1304 index: capture_index,
1305 };
1306 self.add_capture_name(&capname)?;
1307 Ok(capname)
1308 }
1309
1310 /// Parse a sequence of flags starting at the current character.
1311 ///
1312 /// This advances the parser to the character immediately following the
1313 /// flags, which is guaranteed to be either `:` or `)`.
1314 ///
1315 /// # Errors
1316 ///
1317 /// If any flags are duplicated, then an error is returned.
1318 ///
1319 /// If the negation operator is used more than once, then an error is
1320 /// returned.
1321 ///
1322 /// If no flags could be found or if the negation operation is not followed
1323 /// by any flags, then an error is returned.
1324 #[inline(never)]
parse_flags(&self) -> Result<ast::Flags>1325 fn parse_flags(&self) -> Result<ast::Flags> {
1326 let mut flags = ast::Flags { span: self.span(), items: vec![] };
1327 let mut last_was_negation = None;
1328 while self.char() != ':' && self.char() != ')' {
1329 if self.char() == '-' {
1330 last_was_negation = Some(self.span_char());
1331 let item = ast::FlagsItem {
1332 span: self.span_char(),
1333 kind: ast::FlagsItemKind::Negation,
1334 };
1335 if let Some(i) = flags.add_item(item) {
1336 return Err(self.error(
1337 self.span_char(),
1338 ast::ErrorKind::FlagRepeatedNegation {
1339 original: flags.items[i].span,
1340 },
1341 ));
1342 }
1343 } else {
1344 last_was_negation = None;
1345 let item = ast::FlagsItem {
1346 span: self.span_char(),
1347 kind: ast::FlagsItemKind::Flag(self.parse_flag()?),
1348 };
1349 if let Some(i) = flags.add_item(item) {
1350 return Err(self.error(
1351 self.span_char(),
1352 ast::ErrorKind::FlagDuplicate {
1353 original: flags.items[i].span,
1354 },
1355 ));
1356 }
1357 }
1358 if !self.bump() {
1359 return Err(
1360 self.error(self.span(), ast::ErrorKind::FlagUnexpectedEof)
1361 );
1362 }
1363 }
1364 if let Some(span) = last_was_negation {
1365 return Err(self.error(span, ast::ErrorKind::FlagDanglingNegation));
1366 }
1367 flags.span.end = self.pos();
1368 Ok(flags)
1369 }
1370
1371 /// Parse the current character as a flag. Do not advance the parser.
1372 ///
1373 /// # Errors
1374 ///
1375 /// If the flag is not recognized, then an error is returned.
1376 #[inline(never)]
parse_flag(&self) -> Result<ast::Flag>1377 fn parse_flag(&self) -> Result<ast::Flag> {
1378 match self.char() {
1379 'i' => Ok(ast::Flag::CaseInsensitive),
1380 'm' => Ok(ast::Flag::MultiLine),
1381 's' => Ok(ast::Flag::DotMatchesNewLine),
1382 'U' => Ok(ast::Flag::SwapGreed),
1383 'u' => Ok(ast::Flag::Unicode),
1384 'x' => Ok(ast::Flag::IgnoreWhitespace),
1385 _ => {
1386 Err(self
1387 .error(self.span_char(), ast::ErrorKind::FlagUnrecognized))
1388 }
1389 }
1390 }
1391
1392 /// Parse a primitive AST. e.g., A literal, non-set character class or
1393 /// assertion.
1394 ///
1395 /// This assumes that the parser expects a primitive at the current
1396 /// location. i.e., All other non-primitive cases have been handled.
1397 /// For example, if the parser's position is at `|`, then `|` will be
1398 /// treated as a literal (e.g., inside a character class).
1399 ///
1400 /// This advances the parser to the first character immediately following
1401 /// the primitive.
parse_primitive(&self) -> Result<Primitive>1402 fn parse_primitive(&self) -> Result<Primitive> {
1403 match self.char() {
1404 '\\' => self.parse_escape(),
1405 '.' => {
1406 let ast = Primitive::Dot(self.span_char());
1407 self.bump();
1408 Ok(ast)
1409 }
1410 '^' => {
1411 let ast = Primitive::Assertion(ast::Assertion {
1412 span: self.span_char(),
1413 kind: ast::AssertionKind::StartLine,
1414 });
1415 self.bump();
1416 Ok(ast)
1417 }
1418 '$' => {
1419 let ast = Primitive::Assertion(ast::Assertion {
1420 span: self.span_char(),
1421 kind: ast::AssertionKind::EndLine,
1422 });
1423 self.bump();
1424 Ok(ast)
1425 }
1426 c => {
1427 let ast = Primitive::Literal(ast::Literal {
1428 span: self.span_char(),
1429 kind: ast::LiteralKind::Verbatim,
1430 c: c,
1431 });
1432 self.bump();
1433 Ok(ast)
1434 }
1435 }
1436 }
1437
1438 /// Parse an escape sequence as a primitive AST.
1439 ///
1440 /// This assumes the parser is positioned at the start of the escape
1441 /// sequence, i.e., `\`. It advances the parser to the first position
1442 /// immediately following the escape sequence.
1443 #[inline(never)]
parse_escape(&self) -> Result<Primitive>1444 fn parse_escape(&self) -> Result<Primitive> {
1445 assert_eq!(self.char(), '\\');
1446 let start = self.pos();
1447 if !self.bump() {
1448 return Err(self.error(
1449 Span::new(start, self.pos()),
1450 ast::ErrorKind::EscapeUnexpectedEof,
1451 ));
1452 }
1453 let c = self.char();
1454 // Put some of the more complicated routines into helpers.
1455 match c {
1456 '0'..='7' => {
1457 if !self.parser().octal {
1458 return Err(self.error(
1459 Span::new(start, self.span_char().end),
1460 ast::ErrorKind::UnsupportedBackreference,
1461 ));
1462 }
1463 let mut lit = self.parse_octal();
1464 lit.span.start = start;
1465 return Ok(Primitive::Literal(lit));
1466 }
1467 '8'..='9' if !self.parser().octal => {
1468 return Err(self.error(
1469 Span::new(start, self.span_char().end),
1470 ast::ErrorKind::UnsupportedBackreference,
1471 ));
1472 }
1473 'x' | 'u' | 'U' => {
1474 let mut lit = self.parse_hex()?;
1475 lit.span.start = start;
1476 return Ok(Primitive::Literal(lit));
1477 }
1478 'p' | 'P' => {
1479 let mut cls = self.parse_unicode_class()?;
1480 cls.span.start = start;
1481 return Ok(Primitive::Unicode(cls));
1482 }
1483 'd' | 's' | 'w' | 'D' | 'S' | 'W' => {
1484 let mut cls = self.parse_perl_class();
1485 cls.span.start = start;
1486 return Ok(Primitive::Perl(cls));
1487 }
1488 _ => {}
1489 }
1490
1491 // Handle all of the one letter sequences inline.
1492 self.bump();
1493 let span = Span::new(start, self.pos());
1494 if is_meta_character(c) {
1495 return Ok(Primitive::Literal(ast::Literal {
1496 span: span,
1497 kind: ast::LiteralKind::Punctuation,
1498 c: c,
1499 }));
1500 }
1501 let special = |kind, c| {
1502 Ok(Primitive::Literal(ast::Literal {
1503 span: span,
1504 kind: ast::LiteralKind::Special(kind),
1505 c: c,
1506 }))
1507 };
1508 match c {
1509 'a' => special(ast::SpecialLiteralKind::Bell, '\x07'),
1510 'f' => special(ast::SpecialLiteralKind::FormFeed, '\x0C'),
1511 't' => special(ast::SpecialLiteralKind::Tab, '\t'),
1512 'n' => special(ast::SpecialLiteralKind::LineFeed, '\n'),
1513 'r' => special(ast::SpecialLiteralKind::CarriageReturn, '\r'),
1514 'v' => special(ast::SpecialLiteralKind::VerticalTab, '\x0B'),
1515 ' ' if self.ignore_whitespace() => {
1516 special(ast::SpecialLiteralKind::Space, ' ')
1517 }
1518 'A' => Ok(Primitive::Assertion(ast::Assertion {
1519 span: span,
1520 kind: ast::AssertionKind::StartText,
1521 })),
1522 'z' => Ok(Primitive::Assertion(ast::Assertion {
1523 span: span,
1524 kind: ast::AssertionKind::EndText,
1525 })),
1526 'b' => Ok(Primitive::Assertion(ast::Assertion {
1527 span: span,
1528 kind: ast::AssertionKind::WordBoundary,
1529 })),
1530 'B' => Ok(Primitive::Assertion(ast::Assertion {
1531 span: span,
1532 kind: ast::AssertionKind::NotWordBoundary,
1533 })),
1534 _ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)),
1535 }
1536 }
1537
1538 /// Parse an octal representation of a Unicode codepoint up to 3 digits
1539 /// long. This expects the parser to be positioned at the first octal
1540 /// digit and advances the parser to the first character immediately
1541 /// following the octal number. This also assumes that parsing octal
1542 /// escapes is enabled.
1543 ///
1544 /// Assuming the preconditions are met, this routine can never fail.
1545 #[inline(never)]
parse_octal(&self) -> ast::Literal1546 fn parse_octal(&self) -> ast::Literal {
1547 use std::char;
1548 use std::u32;
1549
1550 assert!(self.parser().octal);
1551 assert!('0' <= self.char() && self.char() <= '7');
1552 let start = self.pos();
1553 // Parse up to two more digits.
1554 while self.bump()
1555 && '0' <= self.char()
1556 && self.char() <= '7'
1557 && self.pos().offset - start.offset <= 2
1558 {}
1559 let end = self.pos();
1560 let octal = &self.pattern()[start.offset..end.offset];
1561 // Parsing the octal should never fail since the above guarantees a
1562 // valid number.
1563 let codepoint =
1564 u32::from_str_radix(octal, 8).expect("valid octal number");
1565 // The max value for 3 digit octal is 0777 = 511 and [0, 511] has no
1566 // invalid Unicode scalar values.
1567 let c = char::from_u32(codepoint).expect("Unicode scalar value");
1568 ast::Literal {
1569 span: Span::new(start, end),
1570 kind: ast::LiteralKind::Octal,
1571 c: c,
1572 }
1573 }
1574
1575 /// Parse a hex representation of a Unicode codepoint. This handles both
1576 /// hex notations, i.e., `\xFF` and `\x{FFFF}`. This expects the parser to
1577 /// be positioned at the `x`, `u` or `U` prefix. The parser is advanced to
1578 /// the first character immediately following the hexadecimal literal.
1579 #[inline(never)]
parse_hex(&self) -> Result<ast::Literal>1580 fn parse_hex(&self) -> Result<ast::Literal> {
1581 assert!(
1582 self.char() == 'x' || self.char() == 'u' || self.char() == 'U'
1583 );
1584
1585 let hex_kind = match self.char() {
1586 'x' => ast::HexLiteralKind::X,
1587 'u' => ast::HexLiteralKind::UnicodeShort,
1588 _ => ast::HexLiteralKind::UnicodeLong,
1589 };
1590 if !self.bump_and_bump_space() {
1591 return Err(
1592 self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof)
1593 );
1594 }
1595 if self.char() == '{' {
1596 self.parse_hex_brace(hex_kind)
1597 } else {
1598 self.parse_hex_digits(hex_kind)
1599 }
1600 }
1601
1602 /// Parse an N-digit hex representation of a Unicode codepoint. This
1603 /// expects the parser to be positioned at the first digit and will advance
1604 /// the parser to the first character immediately following the escape
1605 /// sequence.
1606 ///
1607 /// The number of digits given must be 2 (for `\xNN`), 4 (for `\uNNNN`)
1608 /// or 8 (for `\UNNNNNNNN`).
1609 #[inline(never)]
parse_hex_digits( &self, kind: ast::HexLiteralKind, ) -> Result<ast::Literal>1610 fn parse_hex_digits(
1611 &self,
1612 kind: ast::HexLiteralKind,
1613 ) -> Result<ast::Literal> {
1614 use std::char;
1615 use std::u32;
1616
1617 let mut scratch = self.parser().scratch.borrow_mut();
1618 scratch.clear();
1619
1620 let start = self.pos();
1621 for i in 0..kind.digits() {
1622 if i > 0 && !self.bump_and_bump_space() {
1623 return Err(self
1624 .error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
1625 }
1626 if !is_hex(self.char()) {
1627 return Err(self.error(
1628 self.span_char(),
1629 ast::ErrorKind::EscapeHexInvalidDigit,
1630 ));
1631 }
1632 scratch.push(self.char());
1633 }
1634 // The final bump just moves the parser past the literal, which may
1635 // be EOF.
1636 self.bump_and_bump_space();
1637 let end = self.pos();
1638 let hex = scratch.as_str();
1639 match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
1640 None => Err(self.error(
1641 Span::new(start, end),
1642 ast::ErrorKind::EscapeHexInvalid,
1643 )),
1644 Some(c) => Ok(ast::Literal {
1645 span: Span::new(start, end),
1646 kind: ast::LiteralKind::HexFixed(kind),
1647 c: c,
1648 }),
1649 }
1650 }
1651
1652 /// Parse a hex representation of any Unicode scalar value. This expects
1653 /// the parser to be positioned at the opening brace `{` and will advance
1654 /// the parser to the first character following the closing brace `}`.
1655 #[inline(never)]
parse_hex_brace( &self, kind: ast::HexLiteralKind, ) -> Result<ast::Literal>1656 fn parse_hex_brace(
1657 &self,
1658 kind: ast::HexLiteralKind,
1659 ) -> Result<ast::Literal> {
1660 use std::char;
1661 use std::u32;
1662
1663 let mut scratch = self.parser().scratch.borrow_mut();
1664 scratch.clear();
1665
1666 let brace_pos = self.pos();
1667 let start = self.span_char().end;
1668 while self.bump_and_bump_space() && self.char() != '}' {
1669 if !is_hex(self.char()) {
1670 return Err(self.error(
1671 self.span_char(),
1672 ast::ErrorKind::EscapeHexInvalidDigit,
1673 ));
1674 }
1675 scratch.push(self.char());
1676 }
1677 if self.is_eof() {
1678 return Err(self.error(
1679 Span::new(brace_pos, self.pos()),
1680 ast::ErrorKind::EscapeUnexpectedEof,
1681 ));
1682 }
1683 let end = self.pos();
1684 let hex = scratch.as_str();
1685 assert_eq!(self.char(), '}');
1686 self.bump_and_bump_space();
1687
1688 if hex.is_empty() {
1689 return Err(self.error(
1690 Span::new(brace_pos, self.pos()),
1691 ast::ErrorKind::EscapeHexEmpty,
1692 ));
1693 }
1694 match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
1695 None => Err(self.error(
1696 Span::new(start, end),
1697 ast::ErrorKind::EscapeHexInvalid,
1698 )),
1699 Some(c) => Ok(ast::Literal {
1700 span: Span::new(start, self.pos()),
1701 kind: ast::LiteralKind::HexBrace(kind),
1702 c: c,
1703 }),
1704 }
1705 }
1706
1707 /// Parse a decimal number into a u32 while trimming leading and trailing
1708 /// whitespace.
1709 ///
1710 /// This expects the parser to be positioned at the first position where
1711 /// a decimal digit could occur. This will advance the parser to the byte
1712 /// immediately following the last contiguous decimal digit.
1713 ///
1714 /// If no decimal digit could be found or if there was a problem parsing
1715 /// the complete set of digits into a u32, then an error is returned.
parse_decimal(&self) -> Result<u32>1716 fn parse_decimal(&self) -> Result<u32> {
1717 let mut scratch = self.parser().scratch.borrow_mut();
1718 scratch.clear();
1719
1720 while !self.is_eof() && self.char().is_whitespace() {
1721 self.bump();
1722 }
1723 let start = self.pos();
1724 while !self.is_eof() && '0' <= self.char() && self.char() <= '9' {
1725 scratch.push(self.char());
1726 self.bump_and_bump_space();
1727 }
1728 let span = Span::new(start, self.pos());
1729 while !self.is_eof() && self.char().is_whitespace() {
1730 self.bump_and_bump_space();
1731 }
1732 let digits = scratch.as_str();
1733 if digits.is_empty() {
1734 return Err(self.error(span, ast::ErrorKind::DecimalEmpty));
1735 }
1736 match u32::from_str_radix(digits, 10).ok() {
1737 Some(n) => Ok(n),
1738 None => Err(self.error(span, ast::ErrorKind::DecimalInvalid)),
1739 }
1740 }
1741
1742 /// Parse a standard character class consisting primarily of characters or
1743 /// character ranges, but can also contain nested character classes of
1744 /// any type (sans `.`).
1745 ///
1746 /// This assumes the parser is positioned at the opening `[`. If parsing
1747 /// is successful, then the parser is advanced to the position immediately
1748 /// following the closing `]`.
1749 #[inline(never)]
parse_set_class(&self) -> Result<ast::Class>1750 fn parse_set_class(&self) -> Result<ast::Class> {
1751 assert_eq!(self.char(), '[');
1752
1753 let mut union =
1754 ast::ClassSetUnion { span: self.span(), items: vec![] };
1755 loop {
1756 self.bump_space();
1757 if self.is_eof() {
1758 return Err(self.unclosed_class_error());
1759 }
1760 match self.char() {
1761 '[' => {
1762 // If we've already parsed the opening bracket, then
1763 // attempt to treat this as the beginning of an ASCII
1764 // class. If ASCII class parsing fails, then the parser
1765 // backs up to `[`.
1766 if !self.parser().stack_class.borrow().is_empty() {
1767 if let Some(cls) = self.maybe_parse_ascii_class() {
1768 union.push(ast::ClassSetItem::Ascii(cls));
1769 continue;
1770 }
1771 }
1772 union = self.push_class_open(union)?;
1773 }
1774 ']' => match self.pop_class(union)? {
1775 Either::Left(nested_union) => {
1776 union = nested_union;
1777 }
1778 Either::Right(class) => return Ok(class),
1779 },
1780 '&' if self.peek() == Some('&') => {
1781 assert!(self.bump_if("&&"));
1782 union = self.push_class_op(
1783 ast::ClassSetBinaryOpKind::Intersection,
1784 union,
1785 );
1786 }
1787 '-' if self.peek() == Some('-') => {
1788 assert!(self.bump_if("--"));
1789 union = self.push_class_op(
1790 ast::ClassSetBinaryOpKind::Difference,
1791 union,
1792 );
1793 }
1794 '~' if self.peek() == Some('~') => {
1795 assert!(self.bump_if("~~"));
1796 union = self.push_class_op(
1797 ast::ClassSetBinaryOpKind::SymmetricDifference,
1798 union,
1799 );
1800 }
1801 _ => {
1802 union.push(self.parse_set_class_range()?);
1803 }
1804 }
1805 }
1806 }
1807
1808 /// Parse a single primitive item in a character class set. The item to
1809 /// be parsed can either be one of a simple literal character, a range
1810 /// between two simple literal characters or a "primitive" character
1811 /// class like \w or \p{Greek}.
1812 ///
1813 /// If an invalid escape is found, or if a character class is found where
1814 /// a simple literal is expected (e.g., in a range), then an error is
1815 /// returned.
1816 #[inline(never)]
parse_set_class_range(&self) -> Result<ast::ClassSetItem>1817 fn parse_set_class_range(&self) -> Result<ast::ClassSetItem> {
1818 let prim1 = self.parse_set_class_item()?;
1819 self.bump_space();
1820 if self.is_eof() {
1821 return Err(self.unclosed_class_error());
1822 }
1823 // If the next char isn't a `-`, then we don't have a range.
1824 // There are two exceptions. If the char after a `-` is a `]`, then
1825 // `-` is interpreted as a literal `-`. Alternatively, if the char
1826 // after a `-` is a `-`, then `--` corresponds to a "difference"
1827 // operation.
1828 if self.char() != '-'
1829 || self.peek_space() == Some(']')
1830 || self.peek_space() == Some('-')
1831 {
1832 return prim1.into_class_set_item(self);
1833 }
1834 // OK, now we're parsing a range, so bump past the `-` and parse the
1835 // second half of the range.
1836 if !self.bump_and_bump_space() {
1837 return Err(self.unclosed_class_error());
1838 }
1839 let prim2 = self.parse_set_class_item()?;
1840 let range = ast::ClassSetRange {
1841 span: Span::new(prim1.span().start, prim2.span().end),
1842 start: prim1.into_class_literal(self)?,
1843 end: prim2.into_class_literal(self)?,
1844 };
1845 if !range.is_valid() {
1846 return Err(
1847 self.error(range.span, ast::ErrorKind::ClassRangeInvalid)
1848 );
1849 }
1850 Ok(ast::ClassSetItem::Range(range))
1851 }
1852
1853 /// Parse a single item in a character class as a primitive, where the
1854 /// primitive either consists of a verbatim literal or a single escape
1855 /// sequence.
1856 ///
1857 /// This assumes the parser is positioned at the beginning of a primitive,
1858 /// and advances the parser to the first position after the primitive if
1859 /// successful.
1860 ///
1861 /// Note that it is the caller's responsibility to report an error if an
1862 /// illegal primitive was parsed.
1863 #[inline(never)]
parse_set_class_item(&self) -> Result<Primitive>1864 fn parse_set_class_item(&self) -> Result<Primitive> {
1865 if self.char() == '\\' {
1866 self.parse_escape()
1867 } else {
1868 let x = Primitive::Literal(ast::Literal {
1869 span: self.span_char(),
1870 kind: ast::LiteralKind::Verbatim,
1871 c: self.char(),
1872 });
1873 self.bump();
1874 Ok(x)
1875 }
1876 }
1877
1878 /// Parses the opening of a character class set. This includes the opening
1879 /// bracket along with `^` if present to indicate negation. This also
1880 /// starts parsing the opening set of unioned items if applicable, since
1881 /// there are special rules applied to certain characters in the opening
1882 /// of a character class. For example, `[^]]` is the class of all
1883 /// characters not equal to `]`. (`]` would need to be escaped in any other
1884 /// position.) Similarly for `-`.
1885 ///
1886 /// In all cases, the op inside the returned `ast::ClassBracketed` is an
1887 /// empty union. This empty union should be replaced with the actual item
1888 /// when it is popped from the parser's stack.
1889 ///
1890 /// This assumes the parser is positioned at the opening `[` and advances
1891 /// the parser to the first non-special byte of the character class.
1892 ///
1893 /// An error is returned if EOF is found.
1894 #[inline(never)]
parse_set_class_open( &self, ) -> Result<(ast::ClassBracketed, ast::ClassSetUnion)>1895 fn parse_set_class_open(
1896 &self,
1897 ) -> Result<(ast::ClassBracketed, ast::ClassSetUnion)> {
1898 assert_eq!(self.char(), '[');
1899 let start = self.pos();
1900 if !self.bump_and_bump_space() {
1901 return Err(self.error(
1902 Span::new(start, self.pos()),
1903 ast::ErrorKind::ClassUnclosed,
1904 ));
1905 }
1906
1907 let negated = if self.char() != '^' {
1908 false
1909 } else {
1910 if !self.bump_and_bump_space() {
1911 return Err(self.error(
1912 Span::new(start, self.pos()),
1913 ast::ErrorKind::ClassUnclosed,
1914 ));
1915 }
1916 true
1917 };
1918 // Accept any number of `-` as literal `-`.
1919 let mut union =
1920 ast::ClassSetUnion { span: self.span(), items: vec![] };
1921 while self.char() == '-' {
1922 union.push(ast::ClassSetItem::Literal(ast::Literal {
1923 span: self.span_char(),
1924 kind: ast::LiteralKind::Verbatim,
1925 c: '-',
1926 }));
1927 if !self.bump_and_bump_space() {
1928 return Err(self.error(
1929 Span::new(start, self.pos()),
1930 ast::ErrorKind::ClassUnclosed,
1931 ));
1932 }
1933 }
1934 // If `]` is the *first* char in a set, then interpret it as a literal
1935 // `]`. That is, an empty class is impossible to write.
1936 if union.items.is_empty() && self.char() == ']' {
1937 union.push(ast::ClassSetItem::Literal(ast::Literal {
1938 span: self.span_char(),
1939 kind: ast::LiteralKind::Verbatim,
1940 c: ']',
1941 }));
1942 if !self.bump_and_bump_space() {
1943 return Err(self.error(
1944 Span::new(start, self.pos()),
1945 ast::ErrorKind::ClassUnclosed,
1946 ));
1947 }
1948 }
1949 let set = ast::ClassBracketed {
1950 span: Span::new(start, self.pos()),
1951 negated: negated,
1952 kind: ast::ClassSet::union(ast::ClassSetUnion {
1953 span: Span::new(union.span.start, union.span.start),
1954 items: vec![],
1955 }),
1956 };
1957 Ok((set, union))
1958 }
1959
1960 /// Attempt to parse an ASCII character class, e.g., `[:alnum:]`.
1961 ///
1962 /// This assumes the parser is positioned at the opening `[`.
1963 ///
1964 /// If no valid ASCII character class could be found, then this does not
1965 /// advance the parser and `None` is returned. Otherwise, the parser is
1966 /// advanced to the first byte following the closing `]` and the
1967 /// corresponding ASCII class is returned.
1968 #[inline(never)]
maybe_parse_ascii_class(&self) -> Option<ast::ClassAscii>1969 fn maybe_parse_ascii_class(&self) -> Option<ast::ClassAscii> {
1970 // ASCII character classes are interesting from a parsing perspective
1971 // because parsing cannot fail with any interesting error. For example,
1972 // in order to use an ASCII character class, it must be enclosed in
1973 // double brackets, e.g., `[[:alnum:]]`. Alternatively, you might think
1974 // of it as "ASCII character characters have the syntax `[:NAME:]`
1975 // which can only appear within character brackets." This means that
1976 // things like `[[:lower:]A]` are legal constructs.
1977 //
1978 // However, if one types an incorrect ASCII character class, e.g.,
1979 // `[[:loower:]]`, then we treat that as a normal nested character
1980 // class containing the characters `:elorw`. One might argue that we
1981 // should return an error instead since the repeated colons give away
1982 // the intent to write an ASCII class. But what if the user typed
1983 // `[[:lower]]` instead? How can we tell that was intended to be an
1984 // ASCII class and not just a normal nested class?
1985 //
1986 // Reasonable people can probably disagree over this, but for better
1987 // or worse, we implement semantics that never fails at the expense
1988 // of better failure modes.
1989 assert_eq!(self.char(), '[');
1990 // If parsing fails, then we back up the parser to this starting point.
1991 let start = self.pos();
1992 let mut negated = false;
1993 if !self.bump() || self.char() != ':' {
1994 self.parser().pos.set(start);
1995 return None;
1996 }
1997 if !self.bump() {
1998 self.parser().pos.set(start);
1999 return None;
2000 }
2001 if self.char() == '^' {
2002 negated = true;
2003 if !self.bump() {
2004 self.parser().pos.set(start);
2005 return None;
2006 }
2007 }
2008 let name_start = self.offset();
2009 while self.char() != ':' && self.bump() {}
2010 if self.is_eof() {
2011 self.parser().pos.set(start);
2012 return None;
2013 }
2014 let name = &self.pattern()[name_start..self.offset()];
2015 if !self.bump_if(":]") {
2016 self.parser().pos.set(start);
2017 return None;
2018 }
2019 let kind = match ast::ClassAsciiKind::from_name(name) {
2020 Some(kind) => kind,
2021 None => {
2022 self.parser().pos.set(start);
2023 return None;
2024 }
2025 };
2026 Some(ast::ClassAscii {
2027 span: Span::new(start, self.pos()),
2028 kind: kind,
2029 negated: negated,
2030 })
2031 }
2032
2033 /// Parse a Unicode class in either the single character notation, `\pN`
2034 /// or the multi-character bracketed notation, `\p{Greek}`. This assumes
2035 /// the parser is positioned at the `p` (or `P` for negation) and will
2036 /// advance the parser to the character immediately following the class.
2037 ///
2038 /// Note that this does not check whether the class name is valid or not.
2039 #[inline(never)]
parse_unicode_class(&self) -> Result<ast::ClassUnicode>2040 fn parse_unicode_class(&self) -> Result<ast::ClassUnicode> {
2041 assert!(self.char() == 'p' || self.char() == 'P');
2042
2043 let mut scratch = self.parser().scratch.borrow_mut();
2044 scratch.clear();
2045
2046 let negated = self.char() == 'P';
2047 if !self.bump_and_bump_space() {
2048 return Err(
2049 self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof)
2050 );
2051 }
2052 let (start, kind) = if self.char() == '{' {
2053 let start = self.span_char().end;
2054 while self.bump_and_bump_space() && self.char() != '}' {
2055 scratch.push(self.char());
2056 }
2057 if self.is_eof() {
2058 return Err(self
2059 .error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2060 }
2061 assert_eq!(self.char(), '}');
2062 self.bump();
2063
2064 let name = scratch.as_str();
2065 if let Some(i) = name.find("!=") {
2066 (
2067 start,
2068 ast::ClassUnicodeKind::NamedValue {
2069 op: ast::ClassUnicodeOpKind::NotEqual,
2070 name: name[..i].to_string(),
2071 value: name[i + 2..].to_string(),
2072 },
2073 )
2074 } else if let Some(i) = name.find(':') {
2075 (
2076 start,
2077 ast::ClassUnicodeKind::NamedValue {
2078 op: ast::ClassUnicodeOpKind::Colon,
2079 name: name[..i].to_string(),
2080 value: name[i + 1..].to_string(),
2081 },
2082 )
2083 } else if let Some(i) = name.find('=') {
2084 (
2085 start,
2086 ast::ClassUnicodeKind::NamedValue {
2087 op: ast::ClassUnicodeOpKind::Equal,
2088 name: name[..i].to_string(),
2089 value: name[i + 1..].to_string(),
2090 },
2091 )
2092 } else {
2093 (start, ast::ClassUnicodeKind::Named(name.to_string()))
2094 }
2095 } else {
2096 let start = self.pos();
2097 let c = self.char();
2098 self.bump_and_bump_space();
2099 let kind = ast::ClassUnicodeKind::OneLetter(c);
2100 (start, kind)
2101 };
2102 Ok(ast::ClassUnicode {
2103 span: Span::new(start, self.pos()),
2104 negated: negated,
2105 kind: kind,
2106 })
2107 }
2108
2109 /// Parse a Perl character class, e.g., `\d` or `\W`. This assumes the
2110 /// parser is currently at a valid character class name and will be
2111 /// advanced to the character immediately following the class.
2112 #[inline(never)]
parse_perl_class(&self) -> ast::ClassPerl2113 fn parse_perl_class(&self) -> ast::ClassPerl {
2114 let c = self.char();
2115 let span = self.span_char();
2116 self.bump();
2117 let (negated, kind) = match c {
2118 'd' => (false, ast::ClassPerlKind::Digit),
2119 'D' => (true, ast::ClassPerlKind::Digit),
2120 's' => (false, ast::ClassPerlKind::Space),
2121 'S' => (true, ast::ClassPerlKind::Space),
2122 'w' => (false, ast::ClassPerlKind::Word),
2123 'W' => (true, ast::ClassPerlKind::Word),
2124 c => panic!("expected valid Perl class but got '{}'", c),
2125 };
2126 ast::ClassPerl { span: span, kind: kind, negated: negated }
2127 }
2128 }
2129
2130 /// A type that traverses a fully parsed Ast and checks whether its depth
2131 /// exceeds the specified nesting limit. If it does, then an error is returned.
2132 #[derive(Debug)]
2133 struct NestLimiter<'p, 's: 'p, P: 'p + 's> {
2134 /// The parser that is checking the nest limit.
2135 p: &'p ParserI<'s, P>,
2136 /// The current depth while walking an Ast.
2137 depth: u32,
2138 }
2139
2140 impl<'p, 's, P: Borrow<Parser>> NestLimiter<'p, 's, P> {
new(p: &'p ParserI<'s, P>) -> NestLimiter<'p, 's, P>2141 fn new(p: &'p ParserI<'s, P>) -> NestLimiter<'p, 's, P> {
2142 NestLimiter { p: p, depth: 0 }
2143 }
2144
2145 #[inline(never)]
check(self, ast: &Ast) -> Result<()>2146 fn check(self, ast: &Ast) -> Result<()> {
2147 ast::visit(ast, self)
2148 }
2149
increment_depth(&mut self, span: &Span) -> Result<()>2150 fn increment_depth(&mut self, span: &Span) -> Result<()> {
2151 let new = self.depth.checked_add(1).ok_or_else(|| {
2152 self.p.error(
2153 span.clone(),
2154 ast::ErrorKind::NestLimitExceeded(::std::u32::MAX),
2155 )
2156 })?;
2157 let limit = self.p.parser().nest_limit;
2158 if new > limit {
2159 return Err(self.p.error(
2160 span.clone(),
2161 ast::ErrorKind::NestLimitExceeded(limit),
2162 ));
2163 }
2164 self.depth = new;
2165 Ok(())
2166 }
2167
decrement_depth(&mut self)2168 fn decrement_depth(&mut self) {
2169 // Assuming the correctness of the visitor, this should never drop
2170 // below 0.
2171 self.depth = self.depth.checked_sub(1).unwrap();
2172 }
2173 }
2174
2175 impl<'p, 's, P: Borrow<Parser>> ast::Visitor for NestLimiter<'p, 's, P> {
2176 type Output = ();
2177 type Err = ast::Error;
2178
finish(self) -> Result<()>2179 fn finish(self) -> Result<()> {
2180 Ok(())
2181 }
2182
visit_pre(&mut self, ast: &Ast) -> Result<()>2183 fn visit_pre(&mut self, ast: &Ast) -> Result<()> {
2184 let span = match *ast {
2185 Ast::Empty(_)
2186 | Ast::Flags(_)
2187 | Ast::Literal(_)
2188 | Ast::Dot(_)
2189 | Ast::Assertion(_)
2190 | Ast::Class(ast::Class::Unicode(_))
2191 | Ast::Class(ast::Class::Perl(_)) => {
2192 // These are all base cases, so we don't increment depth.
2193 return Ok(());
2194 }
2195 Ast::Class(ast::Class::Bracketed(ref x)) => &x.span,
2196 Ast::Repetition(ref x) => &x.span,
2197 Ast::Group(ref x) => &x.span,
2198 Ast::Alternation(ref x) => &x.span,
2199 Ast::Concat(ref x) => &x.span,
2200 };
2201 self.increment_depth(span)
2202 }
2203
visit_post(&mut self, ast: &Ast) -> Result<()>2204 fn visit_post(&mut self, ast: &Ast) -> Result<()> {
2205 match *ast {
2206 Ast::Empty(_)
2207 | Ast::Flags(_)
2208 | Ast::Literal(_)
2209 | Ast::Dot(_)
2210 | Ast::Assertion(_)
2211 | Ast::Class(ast::Class::Unicode(_))
2212 | Ast::Class(ast::Class::Perl(_)) => {
2213 // These are all base cases, so we don't decrement depth.
2214 Ok(())
2215 }
2216 Ast::Class(ast::Class::Bracketed(_))
2217 | Ast::Repetition(_)
2218 | Ast::Group(_)
2219 | Ast::Alternation(_)
2220 | Ast::Concat(_) => {
2221 self.decrement_depth();
2222 Ok(())
2223 }
2224 }
2225 }
2226
visit_class_set_item_pre( &mut self, ast: &ast::ClassSetItem, ) -> Result<()>2227 fn visit_class_set_item_pre(
2228 &mut self,
2229 ast: &ast::ClassSetItem,
2230 ) -> Result<()> {
2231 let span = match *ast {
2232 ast::ClassSetItem::Empty(_)
2233 | ast::ClassSetItem::Literal(_)
2234 | ast::ClassSetItem::Range(_)
2235 | ast::ClassSetItem::Ascii(_)
2236 | ast::ClassSetItem::Unicode(_)
2237 | ast::ClassSetItem::Perl(_) => {
2238 // These are all base cases, so we don't increment depth.
2239 return Ok(());
2240 }
2241 ast::ClassSetItem::Bracketed(ref x) => &x.span,
2242 ast::ClassSetItem::Union(ref x) => &x.span,
2243 };
2244 self.increment_depth(span)
2245 }
2246
visit_class_set_item_post( &mut self, ast: &ast::ClassSetItem, ) -> Result<()>2247 fn visit_class_set_item_post(
2248 &mut self,
2249 ast: &ast::ClassSetItem,
2250 ) -> Result<()> {
2251 match *ast {
2252 ast::ClassSetItem::Empty(_)
2253 | ast::ClassSetItem::Literal(_)
2254 | ast::ClassSetItem::Range(_)
2255 | ast::ClassSetItem::Ascii(_)
2256 | ast::ClassSetItem::Unicode(_)
2257 | ast::ClassSetItem::Perl(_) => {
2258 // These are all base cases, so we don't decrement depth.
2259 Ok(())
2260 }
2261 ast::ClassSetItem::Bracketed(_) | ast::ClassSetItem::Union(_) => {
2262 self.decrement_depth();
2263 Ok(())
2264 }
2265 }
2266 }
2267
visit_class_set_binary_op_pre( &mut self, ast: &ast::ClassSetBinaryOp, ) -> Result<()>2268 fn visit_class_set_binary_op_pre(
2269 &mut self,
2270 ast: &ast::ClassSetBinaryOp,
2271 ) -> Result<()> {
2272 self.increment_depth(&ast.span)
2273 }
2274
visit_class_set_binary_op_post( &mut self, _ast: &ast::ClassSetBinaryOp, ) -> Result<()>2275 fn visit_class_set_binary_op_post(
2276 &mut self,
2277 _ast: &ast::ClassSetBinaryOp,
2278 ) -> Result<()> {
2279 self.decrement_depth();
2280 Ok(())
2281 }
2282 }
2283
2284 /// When the result is an error, transforms the ast::ErrorKind from the source
2285 /// Result into another one. This function is used to return clearer error
2286 /// messages when possible.
specialize_err<T>( result: Result<T>, from: ast::ErrorKind, to: ast::ErrorKind, ) -> Result<T>2287 fn specialize_err<T>(
2288 result: Result<T>,
2289 from: ast::ErrorKind,
2290 to: ast::ErrorKind,
2291 ) -> Result<T> {
2292 if let Err(e) = result {
2293 if e.kind == from {
2294 Err(ast::Error { kind: to, pattern: e.pattern, span: e.span })
2295 } else {
2296 Err(e)
2297 }
2298 } else {
2299 result
2300 }
2301 }
2302
2303 #[cfg(test)]
2304 mod tests {
2305 use std::ops::Range;
2306
2307 use super::{Parser, ParserBuilder, ParserI, Primitive};
2308 use ast::{self, Ast, Position, Span};
2309
2310 // Our own assert_eq, which has slightly better formatting (but honestly
2311 // still kind of crappy).
2312 macro_rules! assert_eq {
2313 ($left:expr, $right:expr) => {{
2314 match (&$left, &$right) {
2315 (left_val, right_val) => {
2316 if !(*left_val == *right_val) {
2317 panic!(
2318 "assertion failed: `(left == right)`\n\n\
2319 left: `{:?}`\nright: `{:?}`\n\n",
2320 left_val, right_val
2321 )
2322 }
2323 }
2324 }
2325 }};
2326 }
2327
2328 // We create these errors to compare with real ast::Errors in the tests.
2329 // We define equality between TestError and ast::Error to disregard the
2330 // pattern string in ast::Error, which is annoying to provide in tests.
2331 #[derive(Clone, Debug)]
2332 struct TestError {
2333 span: Span,
2334 kind: ast::ErrorKind,
2335 }
2336
2337 impl PartialEq<ast::Error> for TestError {
eq(&self, other: &ast::Error) -> bool2338 fn eq(&self, other: &ast::Error) -> bool {
2339 self.span == other.span && self.kind == other.kind
2340 }
2341 }
2342
2343 impl PartialEq<TestError> for ast::Error {
eq(&self, other: &TestError) -> bool2344 fn eq(&self, other: &TestError) -> bool {
2345 self.span == other.span && self.kind == other.kind
2346 }
2347 }
2348
s(str: &str) -> String2349 fn s(str: &str) -> String {
2350 str.to_string()
2351 }
2352
parser(pattern: &str) -> ParserI<Parser>2353 fn parser(pattern: &str) -> ParserI<Parser> {
2354 ParserI::new(Parser::new(), pattern)
2355 }
2356
parser_octal(pattern: &str) -> ParserI<Parser>2357 fn parser_octal(pattern: &str) -> ParserI<Parser> {
2358 let parser = ParserBuilder::new().octal(true).build();
2359 ParserI::new(parser, pattern)
2360 }
2361
parser_nest_limit(pattern: &str, nest_limit: u32) -> ParserI<Parser>2362 fn parser_nest_limit(pattern: &str, nest_limit: u32) -> ParserI<Parser> {
2363 let p = ParserBuilder::new().nest_limit(nest_limit).build();
2364 ParserI::new(p, pattern)
2365 }
2366
parser_ignore_whitespace(pattern: &str) -> ParserI<Parser>2367 fn parser_ignore_whitespace(pattern: &str) -> ParserI<Parser> {
2368 let p = ParserBuilder::new().ignore_whitespace(true).build();
2369 ParserI::new(p, pattern)
2370 }
2371
2372 /// Short alias for creating a new span.
nspan(start: Position, end: Position) -> Span2373 fn nspan(start: Position, end: Position) -> Span {
2374 Span::new(start, end)
2375 }
2376
2377 /// Short alias for creating a new position.
npos(offset: usize, line: usize, column: usize) -> Position2378 fn npos(offset: usize, line: usize, column: usize) -> Position {
2379 Position::new(offset, line, column)
2380 }
2381
2382 /// Create a new span from the given offset range. This assumes a single
2383 /// line and sets the columns based on the offsets. i.e., This only works
2384 /// out of the box for ASCII, which is fine for most tests.
span(range: Range<usize>) -> Span2385 fn span(range: Range<usize>) -> Span {
2386 let start = Position::new(range.start, 1, range.start + 1);
2387 let end = Position::new(range.end, 1, range.end + 1);
2388 Span::new(start, end)
2389 }
2390
2391 /// Create a new span for the corresponding byte range in the given string.
span_range(subject: &str, range: Range<usize>) -> Span2392 fn span_range(subject: &str, range: Range<usize>) -> Span {
2393 let start = Position {
2394 offset: range.start,
2395 line: 1 + subject[..range.start].matches('\n').count(),
2396 column: 1 + subject[..range.start]
2397 .chars()
2398 .rev()
2399 .position(|c| c == '\n')
2400 .unwrap_or(subject[..range.start].chars().count()),
2401 };
2402 let end = Position {
2403 offset: range.end,
2404 line: 1 + subject[..range.end].matches('\n').count(),
2405 column: 1 + subject[..range.end]
2406 .chars()
2407 .rev()
2408 .position(|c| c == '\n')
2409 .unwrap_or(subject[..range.end].chars().count()),
2410 };
2411 Span::new(start, end)
2412 }
2413
2414 /// Create a verbatim literal starting at the given position.
lit(c: char, start: usize) -> Ast2415 fn lit(c: char, start: usize) -> Ast {
2416 lit_with(c, span(start..start + c.len_utf8()))
2417 }
2418
2419 /// Create a punctuation literal starting at the given position.
punct_lit(c: char, span: Span) -> Ast2420 fn punct_lit(c: char, span: Span) -> Ast {
2421 Ast::Literal(ast::Literal {
2422 span: span,
2423 kind: ast::LiteralKind::Punctuation,
2424 c: c,
2425 })
2426 }
2427
2428 /// Create a verbatim literal with the given span.
lit_with(c: char, span: Span) -> Ast2429 fn lit_with(c: char, span: Span) -> Ast {
2430 Ast::Literal(ast::Literal {
2431 span: span,
2432 kind: ast::LiteralKind::Verbatim,
2433 c: c,
2434 })
2435 }
2436
2437 /// Create a concatenation with the given range.
concat(range: Range<usize>, asts: Vec<Ast>) -> Ast2438 fn concat(range: Range<usize>, asts: Vec<Ast>) -> Ast {
2439 concat_with(span(range), asts)
2440 }
2441
2442 /// Create a concatenation with the given span.
concat_with(span: Span, asts: Vec<Ast>) -> Ast2443 fn concat_with(span: Span, asts: Vec<Ast>) -> Ast {
2444 Ast::Concat(ast::Concat { span: span, asts: asts })
2445 }
2446
2447 /// Create an alternation with the given span.
alt(range: Range<usize>, asts: Vec<Ast>) -> Ast2448 fn alt(range: Range<usize>, asts: Vec<Ast>) -> Ast {
2449 Ast::Alternation(ast::Alternation { span: span(range), asts: asts })
2450 }
2451
2452 /// Create a capturing group with the given span.
group(range: Range<usize>, index: u32, ast: Ast) -> Ast2453 fn group(range: Range<usize>, index: u32, ast: Ast) -> Ast {
2454 Ast::Group(ast::Group {
2455 span: span(range),
2456 kind: ast::GroupKind::CaptureIndex(index),
2457 ast: Box::new(ast),
2458 })
2459 }
2460
2461 /// Create an ast::SetFlags.
2462 ///
2463 /// The given pattern should be the full pattern string. The range given
2464 /// should correspond to the byte offsets where the flag set occurs.
2465 ///
2466 /// If negated is true, then the set is interpreted as beginning with a
2467 /// negation.
flag_set( pat: &str, range: Range<usize>, flag: ast::Flag, negated: bool, ) -> Ast2468 fn flag_set(
2469 pat: &str,
2470 range: Range<usize>,
2471 flag: ast::Flag,
2472 negated: bool,
2473 ) -> Ast {
2474 let mut items = vec![ast::FlagsItem {
2475 span: span_range(pat, (range.end - 2)..(range.end - 1)),
2476 kind: ast::FlagsItemKind::Flag(flag),
2477 }];
2478 if negated {
2479 items.insert(
2480 0,
2481 ast::FlagsItem {
2482 span: span_range(pat, (range.start + 2)..(range.end - 2)),
2483 kind: ast::FlagsItemKind::Negation,
2484 },
2485 );
2486 }
2487 Ast::Flags(ast::SetFlags {
2488 span: span_range(pat, range.clone()),
2489 flags: ast::Flags {
2490 span: span_range(pat, (range.start + 2)..(range.end - 1)),
2491 items: items,
2492 },
2493 })
2494 }
2495
2496 #[test]
parse_nest_limit()2497 fn parse_nest_limit() {
2498 // A nest limit of 0 still allows some types of regexes.
2499 assert_eq!(
2500 parser_nest_limit("", 0).parse(),
2501 Ok(Ast::Empty(span(0..0)))
2502 );
2503 assert_eq!(parser_nest_limit("a", 0).parse(), Ok(lit('a', 0)));
2504
2505 // Test repetition operations, which require one level of nesting.
2506 assert_eq!(
2507 parser_nest_limit("a+", 0).parse().unwrap_err(),
2508 TestError {
2509 span: span(0..2),
2510 kind: ast::ErrorKind::NestLimitExceeded(0),
2511 }
2512 );
2513 assert_eq!(
2514 parser_nest_limit("a+", 1).parse(),
2515 Ok(Ast::Repetition(ast::Repetition {
2516 span: span(0..2),
2517 op: ast::RepetitionOp {
2518 span: span(1..2),
2519 kind: ast::RepetitionKind::OneOrMore,
2520 },
2521 greedy: true,
2522 ast: Box::new(lit('a', 0)),
2523 }))
2524 );
2525 assert_eq!(
2526 parser_nest_limit("(a)+", 1).parse().unwrap_err(),
2527 TestError {
2528 span: span(0..3),
2529 kind: ast::ErrorKind::NestLimitExceeded(1),
2530 }
2531 );
2532 assert_eq!(
2533 parser_nest_limit("a+*", 1).parse().unwrap_err(),
2534 TestError {
2535 span: span(0..2),
2536 kind: ast::ErrorKind::NestLimitExceeded(1),
2537 }
2538 );
2539 assert_eq!(
2540 parser_nest_limit("a+*", 2).parse(),
2541 Ok(Ast::Repetition(ast::Repetition {
2542 span: span(0..3),
2543 op: ast::RepetitionOp {
2544 span: span(2..3),
2545 kind: ast::RepetitionKind::ZeroOrMore,
2546 },
2547 greedy: true,
2548 ast: Box::new(Ast::Repetition(ast::Repetition {
2549 span: span(0..2),
2550 op: ast::RepetitionOp {
2551 span: span(1..2),
2552 kind: ast::RepetitionKind::OneOrMore,
2553 },
2554 greedy: true,
2555 ast: Box::new(lit('a', 0)),
2556 })),
2557 }))
2558 );
2559
2560 // Test concatenations. A concatenation requires one level of nesting.
2561 assert_eq!(
2562 parser_nest_limit("ab", 0).parse().unwrap_err(),
2563 TestError {
2564 span: span(0..2),
2565 kind: ast::ErrorKind::NestLimitExceeded(0),
2566 }
2567 );
2568 assert_eq!(
2569 parser_nest_limit("ab", 1).parse(),
2570 Ok(concat(0..2, vec![lit('a', 0), lit('b', 1)]))
2571 );
2572 assert_eq!(
2573 parser_nest_limit("abc", 1).parse(),
2574 Ok(concat(0..3, vec![lit('a', 0), lit('b', 1), lit('c', 2)]))
2575 );
2576
2577 // Test alternations. An alternation requires one level of nesting.
2578 assert_eq!(
2579 parser_nest_limit("a|b", 0).parse().unwrap_err(),
2580 TestError {
2581 span: span(0..3),
2582 kind: ast::ErrorKind::NestLimitExceeded(0),
2583 }
2584 );
2585 assert_eq!(
2586 parser_nest_limit("a|b", 1).parse(),
2587 Ok(alt(0..3, vec![lit('a', 0), lit('b', 2)]))
2588 );
2589 assert_eq!(
2590 parser_nest_limit("a|b|c", 1).parse(),
2591 Ok(alt(0..5, vec![lit('a', 0), lit('b', 2), lit('c', 4)]))
2592 );
2593
2594 // Test character classes. Classes form their own mini-recursive
2595 // syntax!
2596 assert_eq!(
2597 parser_nest_limit("[a]", 0).parse().unwrap_err(),
2598 TestError {
2599 span: span(0..3),
2600 kind: ast::ErrorKind::NestLimitExceeded(0),
2601 }
2602 );
2603 assert_eq!(
2604 parser_nest_limit("[a]", 1).parse(),
2605 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
2606 span: span(0..3),
2607 negated: false,
2608 kind: ast::ClassSet::Item(ast::ClassSetItem::Literal(
2609 ast::Literal {
2610 span: span(1..2),
2611 kind: ast::LiteralKind::Verbatim,
2612 c: 'a',
2613 }
2614 )),
2615 })))
2616 );
2617 assert_eq!(
2618 parser_nest_limit("[ab]", 1).parse().unwrap_err(),
2619 TestError {
2620 span: span(1..3),
2621 kind: ast::ErrorKind::NestLimitExceeded(1),
2622 }
2623 );
2624 assert_eq!(
2625 parser_nest_limit("[ab[cd]]", 2).parse().unwrap_err(),
2626 TestError {
2627 span: span(3..7),
2628 kind: ast::ErrorKind::NestLimitExceeded(2),
2629 }
2630 );
2631 assert_eq!(
2632 parser_nest_limit("[ab[cd]]", 3).parse().unwrap_err(),
2633 TestError {
2634 span: span(4..6),
2635 kind: ast::ErrorKind::NestLimitExceeded(3),
2636 }
2637 );
2638 assert_eq!(
2639 parser_nest_limit("[a--b]", 1).parse().unwrap_err(),
2640 TestError {
2641 span: span(1..5),
2642 kind: ast::ErrorKind::NestLimitExceeded(1),
2643 }
2644 );
2645 assert_eq!(
2646 parser_nest_limit("[a--bc]", 2).parse().unwrap_err(),
2647 TestError {
2648 span: span(4..6),
2649 kind: ast::ErrorKind::NestLimitExceeded(2),
2650 }
2651 );
2652 }
2653
2654 #[test]
parse_comments()2655 fn parse_comments() {
2656 let pat = "(?x)
2657 # This is comment 1.
2658 foo # This is comment 2.
2659 # This is comment 3.
2660 bar
2661 # This is comment 4.";
2662 let astc = parser(pat).parse_with_comments().unwrap();
2663 assert_eq!(
2664 astc.ast,
2665 concat_with(
2666 span_range(pat, 0..pat.len()),
2667 vec![
2668 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2669 lit_with('f', span_range(pat, 26..27)),
2670 lit_with('o', span_range(pat, 27..28)),
2671 lit_with('o', span_range(pat, 28..29)),
2672 lit_with('b', span_range(pat, 74..75)),
2673 lit_with('a', span_range(pat, 75..76)),
2674 lit_with('r', span_range(pat, 76..77)),
2675 ]
2676 )
2677 );
2678 assert_eq!(
2679 astc.comments,
2680 vec![
2681 ast::Comment {
2682 span: span_range(pat, 5..26),
2683 comment: s(" This is comment 1."),
2684 },
2685 ast::Comment {
2686 span: span_range(pat, 30..51),
2687 comment: s(" This is comment 2."),
2688 },
2689 ast::Comment {
2690 span: span_range(pat, 53..74),
2691 comment: s(" This is comment 3."),
2692 },
2693 ast::Comment {
2694 span: span_range(pat, 78..98),
2695 comment: s(" This is comment 4."),
2696 },
2697 ]
2698 );
2699 }
2700
2701 #[test]
parse_holistic()2702 fn parse_holistic() {
2703 assert_eq!(parser("]").parse(), Ok(lit(']', 0)));
2704 assert_eq!(
2705 parser(r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#\&\-\~").parse(),
2706 Ok(concat(
2707 0..36,
2708 vec![
2709 punct_lit('\\', span(0..2)),
2710 punct_lit('.', span(2..4)),
2711 punct_lit('+', span(4..6)),
2712 punct_lit('*', span(6..8)),
2713 punct_lit('?', span(8..10)),
2714 punct_lit('(', span(10..12)),
2715 punct_lit(')', span(12..14)),
2716 punct_lit('|', span(14..16)),
2717 punct_lit('[', span(16..18)),
2718 punct_lit(']', span(18..20)),
2719 punct_lit('{', span(20..22)),
2720 punct_lit('}', span(22..24)),
2721 punct_lit('^', span(24..26)),
2722 punct_lit('$', span(26..28)),
2723 punct_lit('#', span(28..30)),
2724 punct_lit('&', span(30..32)),
2725 punct_lit('-', span(32..34)),
2726 punct_lit('~', span(34..36)),
2727 ]
2728 ))
2729 );
2730 }
2731
2732 #[test]
parse_ignore_whitespace()2733 fn parse_ignore_whitespace() {
2734 // Test that basic whitespace insensitivity works.
2735 let pat = "(?x)a b";
2736 assert_eq!(
2737 parser(pat).parse(),
2738 Ok(concat_with(
2739 nspan(npos(0, 1, 1), npos(7, 1, 8)),
2740 vec![
2741 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2742 lit_with('a', nspan(npos(4, 1, 5), npos(5, 1, 6))),
2743 lit_with('b', nspan(npos(6, 1, 7), npos(7, 1, 8))),
2744 ]
2745 ))
2746 );
2747
2748 // Test that we can toggle whitespace insensitivity.
2749 let pat = "(?x)a b(?-x)a b";
2750 assert_eq!(
2751 parser(pat).parse(),
2752 Ok(concat_with(
2753 nspan(npos(0, 1, 1), npos(15, 1, 16)),
2754 vec![
2755 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2756 lit_with('a', nspan(npos(4, 1, 5), npos(5, 1, 6))),
2757 lit_with('b', nspan(npos(6, 1, 7), npos(7, 1, 8))),
2758 flag_set(pat, 7..12, ast::Flag::IgnoreWhitespace, true),
2759 lit_with('a', nspan(npos(12, 1, 13), npos(13, 1, 14))),
2760 lit_with(' ', nspan(npos(13, 1, 14), npos(14, 1, 15))),
2761 lit_with('b', nspan(npos(14, 1, 15), npos(15, 1, 16))),
2762 ]
2763 ))
2764 );
2765
2766 // Test that nesting whitespace insensitive flags works.
2767 let pat = "a (?x:a )a ";
2768 assert_eq!(
2769 parser(pat).parse(),
2770 Ok(concat_with(
2771 span_range(pat, 0..11),
2772 vec![
2773 lit_with('a', span_range(pat, 0..1)),
2774 lit_with(' ', span_range(pat, 1..2)),
2775 Ast::Group(ast::Group {
2776 span: span_range(pat, 2..9),
2777 kind: ast::GroupKind::NonCapturing(ast::Flags {
2778 span: span_range(pat, 4..5),
2779 items: vec![ast::FlagsItem {
2780 span: span_range(pat, 4..5),
2781 kind: ast::FlagsItemKind::Flag(
2782 ast::Flag::IgnoreWhitespace
2783 ),
2784 },],
2785 }),
2786 ast: Box::new(lit_with('a', span_range(pat, 6..7))),
2787 }),
2788 lit_with('a', span_range(pat, 9..10)),
2789 lit_with(' ', span_range(pat, 10..11)),
2790 ]
2791 ))
2792 );
2793
2794 // Test that whitespace after an opening paren is insignificant.
2795 let pat = "(?x)( ?P<foo> a )";
2796 assert_eq!(
2797 parser(pat).parse(),
2798 Ok(concat_with(
2799 span_range(pat, 0..pat.len()),
2800 vec![
2801 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2802 Ast::Group(ast::Group {
2803 span: span_range(pat, 4..pat.len()),
2804 kind: ast::GroupKind::CaptureName(ast::CaptureName {
2805 span: span_range(pat, 9..12),
2806 name: s("foo"),
2807 index: 1,
2808 }),
2809 ast: Box::new(lit_with('a', span_range(pat, 14..15))),
2810 }),
2811 ]
2812 ))
2813 );
2814 let pat = "(?x)( a )";
2815 assert_eq!(
2816 parser(pat).parse(),
2817 Ok(concat_with(
2818 span_range(pat, 0..pat.len()),
2819 vec![
2820 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2821 Ast::Group(ast::Group {
2822 span: span_range(pat, 4..pat.len()),
2823 kind: ast::GroupKind::CaptureIndex(1),
2824 ast: Box::new(lit_with('a', span_range(pat, 7..8))),
2825 }),
2826 ]
2827 ))
2828 );
2829 let pat = "(?x)( ?: a )";
2830 assert_eq!(
2831 parser(pat).parse(),
2832 Ok(concat_with(
2833 span_range(pat, 0..pat.len()),
2834 vec![
2835 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2836 Ast::Group(ast::Group {
2837 span: span_range(pat, 4..pat.len()),
2838 kind: ast::GroupKind::NonCapturing(ast::Flags {
2839 span: span_range(pat, 8..8),
2840 items: vec![],
2841 }),
2842 ast: Box::new(lit_with('a', span_range(pat, 11..12))),
2843 }),
2844 ]
2845 ))
2846 );
2847 let pat = r"(?x)\x { 53 }";
2848 assert_eq!(
2849 parser(pat).parse(),
2850 Ok(concat_with(
2851 span_range(pat, 0..pat.len()),
2852 vec![
2853 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2854 Ast::Literal(ast::Literal {
2855 span: span(4..13),
2856 kind: ast::LiteralKind::HexBrace(
2857 ast::HexLiteralKind::X
2858 ),
2859 c: 'S',
2860 }),
2861 ]
2862 ))
2863 );
2864
2865 // Test that whitespace after an escape is OK.
2866 let pat = r"(?x)\ ";
2867 assert_eq!(
2868 parser(pat).parse(),
2869 Ok(concat_with(
2870 span_range(pat, 0..pat.len()),
2871 vec![
2872 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2873 Ast::Literal(ast::Literal {
2874 span: span_range(pat, 4..6),
2875 kind: ast::LiteralKind::Special(
2876 ast::SpecialLiteralKind::Space
2877 ),
2878 c: ' ',
2879 }),
2880 ]
2881 ))
2882 );
2883 // ... but only when `x` mode is enabled.
2884 let pat = r"\ ";
2885 assert_eq!(
2886 parser(pat).parse().unwrap_err(),
2887 TestError {
2888 span: span_range(pat, 0..2),
2889 kind: ast::ErrorKind::EscapeUnrecognized,
2890 }
2891 );
2892 }
2893
2894 #[test]
parse_newlines()2895 fn parse_newlines() {
2896 let pat = ".\n.";
2897 assert_eq!(
2898 parser(pat).parse(),
2899 Ok(concat_with(
2900 span_range(pat, 0..3),
2901 vec![
2902 Ast::Dot(span_range(pat, 0..1)),
2903 lit_with('\n', span_range(pat, 1..2)),
2904 Ast::Dot(span_range(pat, 2..3)),
2905 ]
2906 ))
2907 );
2908
2909 let pat = "foobar\nbaz\nquux\n";
2910 assert_eq!(
2911 parser(pat).parse(),
2912 Ok(concat_with(
2913 span_range(pat, 0..pat.len()),
2914 vec![
2915 lit_with('f', nspan(npos(0, 1, 1), npos(1, 1, 2))),
2916 lit_with('o', nspan(npos(1, 1, 2), npos(2, 1, 3))),
2917 lit_with('o', nspan(npos(2, 1, 3), npos(3, 1, 4))),
2918 lit_with('b', nspan(npos(3, 1, 4), npos(4, 1, 5))),
2919 lit_with('a', nspan(npos(4, 1, 5), npos(5, 1, 6))),
2920 lit_with('r', nspan(npos(5, 1, 6), npos(6, 1, 7))),
2921 lit_with('\n', nspan(npos(6, 1, 7), npos(7, 2, 1))),
2922 lit_with('b', nspan(npos(7, 2, 1), npos(8, 2, 2))),
2923 lit_with('a', nspan(npos(8, 2, 2), npos(9, 2, 3))),
2924 lit_with('z', nspan(npos(9, 2, 3), npos(10, 2, 4))),
2925 lit_with('\n', nspan(npos(10, 2, 4), npos(11, 3, 1))),
2926 lit_with('q', nspan(npos(11, 3, 1), npos(12, 3, 2))),
2927 lit_with('u', nspan(npos(12, 3, 2), npos(13, 3, 3))),
2928 lit_with('u', nspan(npos(13, 3, 3), npos(14, 3, 4))),
2929 lit_with('x', nspan(npos(14, 3, 4), npos(15, 3, 5))),
2930 lit_with('\n', nspan(npos(15, 3, 5), npos(16, 4, 1))),
2931 ]
2932 ))
2933 );
2934 }
2935
2936 #[test]
parse_uncounted_repetition()2937 fn parse_uncounted_repetition() {
2938 assert_eq!(
2939 parser(r"a*").parse(),
2940 Ok(Ast::Repetition(ast::Repetition {
2941 span: span(0..2),
2942 op: ast::RepetitionOp {
2943 span: span(1..2),
2944 kind: ast::RepetitionKind::ZeroOrMore,
2945 },
2946 greedy: true,
2947 ast: Box::new(lit('a', 0)),
2948 }))
2949 );
2950 assert_eq!(
2951 parser(r"a+").parse(),
2952 Ok(Ast::Repetition(ast::Repetition {
2953 span: span(0..2),
2954 op: ast::RepetitionOp {
2955 span: span(1..2),
2956 kind: ast::RepetitionKind::OneOrMore,
2957 },
2958 greedy: true,
2959 ast: Box::new(lit('a', 0)),
2960 }))
2961 );
2962
2963 assert_eq!(
2964 parser(r"a?").parse(),
2965 Ok(Ast::Repetition(ast::Repetition {
2966 span: span(0..2),
2967 op: ast::RepetitionOp {
2968 span: span(1..2),
2969 kind: ast::RepetitionKind::ZeroOrOne,
2970 },
2971 greedy: true,
2972 ast: Box::new(lit('a', 0)),
2973 }))
2974 );
2975 assert_eq!(
2976 parser(r"a??").parse(),
2977 Ok(Ast::Repetition(ast::Repetition {
2978 span: span(0..3),
2979 op: ast::RepetitionOp {
2980 span: span(1..3),
2981 kind: ast::RepetitionKind::ZeroOrOne,
2982 },
2983 greedy: false,
2984 ast: Box::new(lit('a', 0)),
2985 }))
2986 );
2987 assert_eq!(
2988 parser(r"a?").parse(),
2989 Ok(Ast::Repetition(ast::Repetition {
2990 span: span(0..2),
2991 op: ast::RepetitionOp {
2992 span: span(1..2),
2993 kind: ast::RepetitionKind::ZeroOrOne,
2994 },
2995 greedy: true,
2996 ast: Box::new(lit('a', 0)),
2997 }))
2998 );
2999 assert_eq!(
3000 parser(r"a?b").parse(),
3001 Ok(concat(
3002 0..3,
3003 vec![
3004 Ast::Repetition(ast::Repetition {
3005 span: span(0..2),
3006 op: ast::RepetitionOp {
3007 span: span(1..2),
3008 kind: ast::RepetitionKind::ZeroOrOne,
3009 },
3010 greedy: true,
3011 ast: Box::new(lit('a', 0)),
3012 }),
3013 lit('b', 2),
3014 ]
3015 ))
3016 );
3017 assert_eq!(
3018 parser(r"a??b").parse(),
3019 Ok(concat(
3020 0..4,
3021 vec![
3022 Ast::Repetition(ast::Repetition {
3023 span: span(0..3),
3024 op: ast::RepetitionOp {
3025 span: span(1..3),
3026 kind: ast::RepetitionKind::ZeroOrOne,
3027 },
3028 greedy: false,
3029 ast: Box::new(lit('a', 0)),
3030 }),
3031 lit('b', 3),
3032 ]
3033 ))
3034 );
3035 assert_eq!(
3036 parser(r"ab?").parse(),
3037 Ok(concat(
3038 0..3,
3039 vec![
3040 lit('a', 0),
3041 Ast::Repetition(ast::Repetition {
3042 span: span(1..3),
3043 op: ast::RepetitionOp {
3044 span: span(2..3),
3045 kind: ast::RepetitionKind::ZeroOrOne,
3046 },
3047 greedy: true,
3048 ast: Box::new(lit('b', 1)),
3049 }),
3050 ]
3051 ))
3052 );
3053 assert_eq!(
3054 parser(r"(ab)?").parse(),
3055 Ok(Ast::Repetition(ast::Repetition {
3056 span: span(0..5),
3057 op: ast::RepetitionOp {
3058 span: span(4..5),
3059 kind: ast::RepetitionKind::ZeroOrOne,
3060 },
3061 greedy: true,
3062 ast: Box::new(group(
3063 0..4,
3064 1,
3065 concat(1..3, vec![lit('a', 1), lit('b', 2),])
3066 )),
3067 }))
3068 );
3069 assert_eq!(
3070 parser(r"|a?").parse(),
3071 Ok(alt(
3072 0..3,
3073 vec![
3074 Ast::Empty(span(0..0)),
3075 Ast::Repetition(ast::Repetition {
3076 span: span(1..3),
3077 op: ast::RepetitionOp {
3078 span: span(2..3),
3079 kind: ast::RepetitionKind::ZeroOrOne,
3080 },
3081 greedy: true,
3082 ast: Box::new(lit('a', 1)),
3083 }),
3084 ]
3085 ))
3086 );
3087
3088 assert_eq!(
3089 parser(r"*").parse().unwrap_err(),
3090 TestError {
3091 span: span(0..0),
3092 kind: ast::ErrorKind::RepetitionMissing,
3093 }
3094 );
3095 assert_eq!(
3096 parser(r"(?i)*").parse().unwrap_err(),
3097 TestError {
3098 span: span(4..4),
3099 kind: ast::ErrorKind::RepetitionMissing,
3100 }
3101 );
3102 assert_eq!(
3103 parser(r"(*)").parse().unwrap_err(),
3104 TestError {
3105 span: span(1..1),
3106 kind: ast::ErrorKind::RepetitionMissing,
3107 }
3108 );
3109 assert_eq!(
3110 parser(r"(?:?)").parse().unwrap_err(),
3111 TestError {
3112 span: span(3..3),
3113 kind: ast::ErrorKind::RepetitionMissing,
3114 }
3115 );
3116 assert_eq!(
3117 parser(r"+").parse().unwrap_err(),
3118 TestError {
3119 span: span(0..0),
3120 kind: ast::ErrorKind::RepetitionMissing,
3121 }
3122 );
3123 assert_eq!(
3124 parser(r"?").parse().unwrap_err(),
3125 TestError {
3126 span: span(0..0),
3127 kind: ast::ErrorKind::RepetitionMissing,
3128 }
3129 );
3130 assert_eq!(
3131 parser(r"(?)").parse().unwrap_err(),
3132 TestError {
3133 span: span(1..1),
3134 kind: ast::ErrorKind::RepetitionMissing,
3135 }
3136 );
3137 assert_eq!(
3138 parser(r"|*").parse().unwrap_err(),
3139 TestError {
3140 span: span(1..1),
3141 kind: ast::ErrorKind::RepetitionMissing,
3142 }
3143 );
3144 assert_eq!(
3145 parser(r"|+").parse().unwrap_err(),
3146 TestError {
3147 span: span(1..1),
3148 kind: ast::ErrorKind::RepetitionMissing,
3149 }
3150 );
3151 assert_eq!(
3152 parser(r"|?").parse().unwrap_err(),
3153 TestError {
3154 span: span(1..1),
3155 kind: ast::ErrorKind::RepetitionMissing,
3156 }
3157 );
3158 }
3159
3160 #[test]
parse_counted_repetition()3161 fn parse_counted_repetition() {
3162 assert_eq!(
3163 parser(r"a{5}").parse(),
3164 Ok(Ast::Repetition(ast::Repetition {
3165 span: span(0..4),
3166 op: ast::RepetitionOp {
3167 span: span(1..4),
3168 kind: ast::RepetitionKind::Range(
3169 ast::RepetitionRange::Exactly(5)
3170 ),
3171 },
3172 greedy: true,
3173 ast: Box::new(lit('a', 0)),
3174 }))
3175 );
3176 assert_eq!(
3177 parser(r"a{5,}").parse(),
3178 Ok(Ast::Repetition(ast::Repetition {
3179 span: span(0..5),
3180 op: ast::RepetitionOp {
3181 span: span(1..5),
3182 kind: ast::RepetitionKind::Range(
3183 ast::RepetitionRange::AtLeast(5)
3184 ),
3185 },
3186 greedy: true,
3187 ast: Box::new(lit('a', 0)),
3188 }))
3189 );
3190 assert_eq!(
3191 parser(r"a{5,9}").parse(),
3192 Ok(Ast::Repetition(ast::Repetition {
3193 span: span(0..6),
3194 op: ast::RepetitionOp {
3195 span: span(1..6),
3196 kind: ast::RepetitionKind::Range(
3197 ast::RepetitionRange::Bounded(5, 9)
3198 ),
3199 },
3200 greedy: true,
3201 ast: Box::new(lit('a', 0)),
3202 }))
3203 );
3204 assert_eq!(
3205 parser(r"a{5}?").parse(),
3206 Ok(Ast::Repetition(ast::Repetition {
3207 span: span(0..5),
3208 op: ast::RepetitionOp {
3209 span: span(1..5),
3210 kind: ast::RepetitionKind::Range(
3211 ast::RepetitionRange::Exactly(5)
3212 ),
3213 },
3214 greedy: false,
3215 ast: Box::new(lit('a', 0)),
3216 }))
3217 );
3218 assert_eq!(
3219 parser(r"ab{5}").parse(),
3220 Ok(concat(
3221 0..5,
3222 vec![
3223 lit('a', 0),
3224 Ast::Repetition(ast::Repetition {
3225 span: span(1..5),
3226 op: ast::RepetitionOp {
3227 span: span(2..5),
3228 kind: ast::RepetitionKind::Range(
3229 ast::RepetitionRange::Exactly(5)
3230 ),
3231 },
3232 greedy: true,
3233 ast: Box::new(lit('b', 1)),
3234 }),
3235 ]
3236 ))
3237 );
3238 assert_eq!(
3239 parser(r"ab{5}c").parse(),
3240 Ok(concat(
3241 0..6,
3242 vec![
3243 lit('a', 0),
3244 Ast::Repetition(ast::Repetition {
3245 span: span(1..5),
3246 op: ast::RepetitionOp {
3247 span: span(2..5),
3248 kind: ast::RepetitionKind::Range(
3249 ast::RepetitionRange::Exactly(5)
3250 ),
3251 },
3252 greedy: true,
3253 ast: Box::new(lit('b', 1)),
3254 }),
3255 lit('c', 5),
3256 ]
3257 ))
3258 );
3259
3260 assert_eq!(
3261 parser(r"a{ 5 }").parse(),
3262 Ok(Ast::Repetition(ast::Repetition {
3263 span: span(0..6),
3264 op: ast::RepetitionOp {
3265 span: span(1..6),
3266 kind: ast::RepetitionKind::Range(
3267 ast::RepetitionRange::Exactly(5)
3268 ),
3269 },
3270 greedy: true,
3271 ast: Box::new(lit('a', 0)),
3272 }))
3273 );
3274 assert_eq!(
3275 parser(r"a{ 5 , 9 }").parse(),
3276 Ok(Ast::Repetition(ast::Repetition {
3277 span: span(0..10),
3278 op: ast::RepetitionOp {
3279 span: span(1..10),
3280 kind: ast::RepetitionKind::Range(
3281 ast::RepetitionRange::Bounded(5, 9)
3282 ),
3283 },
3284 greedy: true,
3285 ast: Box::new(lit('a', 0)),
3286 }))
3287 );
3288 assert_eq!(
3289 parser_ignore_whitespace(r"a{5,9} ?").parse(),
3290 Ok(Ast::Repetition(ast::Repetition {
3291 span: span(0..8),
3292 op: ast::RepetitionOp {
3293 span: span(1..8),
3294 kind: ast::RepetitionKind::Range(
3295 ast::RepetitionRange::Bounded(5, 9)
3296 ),
3297 },
3298 greedy: false,
3299 ast: Box::new(lit('a', 0)),
3300 }))
3301 );
3302
3303 assert_eq!(
3304 parser(r"(?i){0}").parse().unwrap_err(),
3305 TestError {
3306 span: span(4..4),
3307 kind: ast::ErrorKind::RepetitionMissing,
3308 }
3309 );
3310 assert_eq!(
3311 parser(r"(?m){1,1}").parse().unwrap_err(),
3312 TestError {
3313 span: span(4..4),
3314 kind: ast::ErrorKind::RepetitionMissing,
3315 }
3316 );
3317 assert_eq!(
3318 parser(r"a{]}").parse().unwrap_err(),
3319 TestError {
3320 span: span(2..2),
3321 kind: ast::ErrorKind::RepetitionCountDecimalEmpty,
3322 }
3323 );
3324 assert_eq!(
3325 parser(r"a{1,]}").parse().unwrap_err(),
3326 TestError {
3327 span: span(4..4),
3328 kind: ast::ErrorKind::RepetitionCountDecimalEmpty,
3329 }
3330 );
3331 assert_eq!(
3332 parser(r"a{").parse().unwrap_err(),
3333 TestError {
3334 span: span(1..2),
3335 kind: ast::ErrorKind::RepetitionCountUnclosed,
3336 }
3337 );
3338 assert_eq!(
3339 parser(r"a{}").parse().unwrap_err(),
3340 TestError {
3341 span: span(2..2),
3342 kind: ast::ErrorKind::RepetitionCountDecimalEmpty,
3343 }
3344 );
3345 assert_eq!(
3346 parser(r"a{a").parse().unwrap_err(),
3347 TestError {
3348 span: span(2..2),
3349 kind: ast::ErrorKind::RepetitionCountDecimalEmpty,
3350 }
3351 );
3352 assert_eq!(
3353 parser(r"a{9999999999}").parse().unwrap_err(),
3354 TestError {
3355 span: span(2..12),
3356 kind: ast::ErrorKind::DecimalInvalid,
3357 }
3358 );
3359 assert_eq!(
3360 parser(r"a{9").parse().unwrap_err(),
3361 TestError {
3362 span: span(1..3),
3363 kind: ast::ErrorKind::RepetitionCountUnclosed,
3364 }
3365 );
3366 assert_eq!(
3367 parser(r"a{9,a").parse().unwrap_err(),
3368 TestError {
3369 span: span(4..4),
3370 kind: ast::ErrorKind::RepetitionCountDecimalEmpty,
3371 }
3372 );
3373 assert_eq!(
3374 parser(r"a{9,9999999999}").parse().unwrap_err(),
3375 TestError {
3376 span: span(4..14),
3377 kind: ast::ErrorKind::DecimalInvalid,
3378 }
3379 );
3380 assert_eq!(
3381 parser(r"a{9,").parse().unwrap_err(),
3382 TestError {
3383 span: span(1..4),
3384 kind: ast::ErrorKind::RepetitionCountUnclosed,
3385 }
3386 );
3387 assert_eq!(
3388 parser(r"a{9,11").parse().unwrap_err(),
3389 TestError {
3390 span: span(1..6),
3391 kind: ast::ErrorKind::RepetitionCountUnclosed,
3392 }
3393 );
3394 assert_eq!(
3395 parser(r"a{2,1}").parse().unwrap_err(),
3396 TestError {
3397 span: span(1..6),
3398 kind: ast::ErrorKind::RepetitionCountInvalid,
3399 }
3400 );
3401 assert_eq!(
3402 parser(r"{5}").parse().unwrap_err(),
3403 TestError {
3404 span: span(0..0),
3405 kind: ast::ErrorKind::RepetitionMissing,
3406 }
3407 );
3408 assert_eq!(
3409 parser(r"|{5}").parse().unwrap_err(),
3410 TestError {
3411 span: span(1..1),
3412 kind: ast::ErrorKind::RepetitionMissing,
3413 }
3414 );
3415 }
3416
3417 #[test]
parse_alternate()3418 fn parse_alternate() {
3419 assert_eq!(
3420 parser(r"a|b").parse(),
3421 Ok(Ast::Alternation(ast::Alternation {
3422 span: span(0..3),
3423 asts: vec![lit('a', 0), lit('b', 2)],
3424 }))
3425 );
3426 assert_eq!(
3427 parser(r"(a|b)").parse(),
3428 Ok(group(
3429 0..5,
3430 1,
3431 Ast::Alternation(ast::Alternation {
3432 span: span(1..4),
3433 asts: vec![lit('a', 1), lit('b', 3)],
3434 })
3435 ))
3436 );
3437
3438 assert_eq!(
3439 parser(r"a|b|c").parse(),
3440 Ok(Ast::Alternation(ast::Alternation {
3441 span: span(0..5),
3442 asts: vec![lit('a', 0), lit('b', 2), lit('c', 4)],
3443 }))
3444 );
3445 assert_eq!(
3446 parser(r"ax|by|cz").parse(),
3447 Ok(Ast::Alternation(ast::Alternation {
3448 span: span(0..8),
3449 asts: vec![
3450 concat(0..2, vec![lit('a', 0), lit('x', 1)]),
3451 concat(3..5, vec![lit('b', 3), lit('y', 4)]),
3452 concat(6..8, vec![lit('c', 6), lit('z', 7)]),
3453 ],
3454 }))
3455 );
3456 assert_eq!(
3457 parser(r"(ax|by|cz)").parse(),
3458 Ok(group(
3459 0..10,
3460 1,
3461 Ast::Alternation(ast::Alternation {
3462 span: span(1..9),
3463 asts: vec![
3464 concat(1..3, vec![lit('a', 1), lit('x', 2)]),
3465 concat(4..6, vec![lit('b', 4), lit('y', 5)]),
3466 concat(7..9, vec![lit('c', 7), lit('z', 8)]),
3467 ],
3468 })
3469 ))
3470 );
3471 assert_eq!(
3472 parser(r"(ax|(by|(cz)))").parse(),
3473 Ok(group(
3474 0..14,
3475 1,
3476 alt(
3477 1..13,
3478 vec![
3479 concat(1..3, vec![lit('a', 1), lit('x', 2)]),
3480 group(
3481 4..13,
3482 2,
3483 alt(
3484 5..12,
3485 vec![
3486 concat(
3487 5..7,
3488 vec![lit('b', 5), lit('y', 6)]
3489 ),
3490 group(
3491 8..12,
3492 3,
3493 concat(
3494 9..11,
3495 vec![lit('c', 9), lit('z', 10),]
3496 )
3497 ),
3498 ]
3499 )
3500 ),
3501 ]
3502 )
3503 ))
3504 );
3505
3506 assert_eq!(
3507 parser(r"|").parse(),
3508 Ok(alt(
3509 0..1,
3510 vec![Ast::Empty(span(0..0)), Ast::Empty(span(1..1)),]
3511 ))
3512 );
3513 assert_eq!(
3514 parser(r"||").parse(),
3515 Ok(alt(
3516 0..2,
3517 vec![
3518 Ast::Empty(span(0..0)),
3519 Ast::Empty(span(1..1)),
3520 Ast::Empty(span(2..2)),
3521 ]
3522 ))
3523 );
3524 assert_eq!(
3525 parser(r"a|").parse(),
3526 Ok(alt(0..2, vec![lit('a', 0), Ast::Empty(span(2..2)),]))
3527 );
3528 assert_eq!(
3529 parser(r"|a").parse(),
3530 Ok(alt(0..2, vec![Ast::Empty(span(0..0)), lit('a', 1),]))
3531 );
3532
3533 assert_eq!(
3534 parser(r"(|)").parse(),
3535 Ok(group(
3536 0..3,
3537 1,
3538 alt(
3539 1..2,
3540 vec![Ast::Empty(span(1..1)), Ast::Empty(span(2..2)),]
3541 )
3542 ))
3543 );
3544 assert_eq!(
3545 parser(r"(a|)").parse(),
3546 Ok(group(
3547 0..4,
3548 1,
3549 alt(1..3, vec![lit('a', 1), Ast::Empty(span(3..3)),])
3550 ))
3551 );
3552 assert_eq!(
3553 parser(r"(|a)").parse(),
3554 Ok(group(
3555 0..4,
3556 1,
3557 alt(1..3, vec![Ast::Empty(span(1..1)), lit('a', 2),])
3558 ))
3559 );
3560
3561 assert_eq!(
3562 parser(r"a|b)").parse().unwrap_err(),
3563 TestError {
3564 span: span(3..4),
3565 kind: ast::ErrorKind::GroupUnopened,
3566 }
3567 );
3568 assert_eq!(
3569 parser(r"(a|b").parse().unwrap_err(),
3570 TestError {
3571 span: span(0..1),
3572 kind: ast::ErrorKind::GroupUnclosed,
3573 }
3574 );
3575 }
3576
3577 #[test]
parse_unsupported_lookaround()3578 fn parse_unsupported_lookaround() {
3579 assert_eq!(
3580 parser(r"(?=a)").parse().unwrap_err(),
3581 TestError {
3582 span: span(0..3),
3583 kind: ast::ErrorKind::UnsupportedLookAround,
3584 }
3585 );
3586 assert_eq!(
3587 parser(r"(?!a)").parse().unwrap_err(),
3588 TestError {
3589 span: span(0..3),
3590 kind: ast::ErrorKind::UnsupportedLookAround,
3591 }
3592 );
3593 assert_eq!(
3594 parser(r"(?<=a)").parse().unwrap_err(),
3595 TestError {
3596 span: span(0..4),
3597 kind: ast::ErrorKind::UnsupportedLookAround,
3598 }
3599 );
3600 assert_eq!(
3601 parser(r"(?<!a)").parse().unwrap_err(),
3602 TestError {
3603 span: span(0..4),
3604 kind: ast::ErrorKind::UnsupportedLookAround,
3605 }
3606 );
3607 }
3608
3609 #[test]
parse_group()3610 fn parse_group() {
3611 assert_eq!(
3612 parser("(?i)").parse(),
3613 Ok(Ast::Flags(ast::SetFlags {
3614 span: span(0..4),
3615 flags: ast::Flags {
3616 span: span(2..3),
3617 items: vec![ast::FlagsItem {
3618 span: span(2..3),
3619 kind: ast::FlagsItemKind::Flag(
3620 ast::Flag::CaseInsensitive
3621 ),
3622 }],
3623 },
3624 }))
3625 );
3626 assert_eq!(
3627 parser("(?iU)").parse(),
3628 Ok(Ast::Flags(ast::SetFlags {
3629 span: span(0..5),
3630 flags: ast::Flags {
3631 span: span(2..4),
3632 items: vec![
3633 ast::FlagsItem {
3634 span: span(2..3),
3635 kind: ast::FlagsItemKind::Flag(
3636 ast::Flag::CaseInsensitive
3637 ),
3638 },
3639 ast::FlagsItem {
3640 span: span(3..4),
3641 kind: ast::FlagsItemKind::Flag(
3642 ast::Flag::SwapGreed
3643 ),
3644 },
3645 ],
3646 },
3647 }))
3648 );
3649 assert_eq!(
3650 parser("(?i-U)").parse(),
3651 Ok(Ast::Flags(ast::SetFlags {
3652 span: span(0..6),
3653 flags: ast::Flags {
3654 span: span(2..5),
3655 items: vec![
3656 ast::FlagsItem {
3657 span: span(2..3),
3658 kind: ast::FlagsItemKind::Flag(
3659 ast::Flag::CaseInsensitive
3660 ),
3661 },
3662 ast::FlagsItem {
3663 span: span(3..4),
3664 kind: ast::FlagsItemKind::Negation,
3665 },
3666 ast::FlagsItem {
3667 span: span(4..5),
3668 kind: ast::FlagsItemKind::Flag(
3669 ast::Flag::SwapGreed
3670 ),
3671 },
3672 ],
3673 },
3674 }))
3675 );
3676
3677 assert_eq!(
3678 parser("()").parse(),
3679 Ok(Ast::Group(ast::Group {
3680 span: span(0..2),
3681 kind: ast::GroupKind::CaptureIndex(1),
3682 ast: Box::new(Ast::Empty(span(1..1))),
3683 }))
3684 );
3685 assert_eq!(
3686 parser("(a)").parse(),
3687 Ok(Ast::Group(ast::Group {
3688 span: span(0..3),
3689 kind: ast::GroupKind::CaptureIndex(1),
3690 ast: Box::new(lit('a', 1)),
3691 }))
3692 );
3693 assert_eq!(
3694 parser("(())").parse(),
3695 Ok(Ast::Group(ast::Group {
3696 span: span(0..4),
3697 kind: ast::GroupKind::CaptureIndex(1),
3698 ast: Box::new(Ast::Group(ast::Group {
3699 span: span(1..3),
3700 kind: ast::GroupKind::CaptureIndex(2),
3701 ast: Box::new(Ast::Empty(span(2..2))),
3702 })),
3703 }))
3704 );
3705
3706 assert_eq!(
3707 parser("(?:a)").parse(),
3708 Ok(Ast::Group(ast::Group {
3709 span: span(0..5),
3710 kind: ast::GroupKind::NonCapturing(ast::Flags {
3711 span: span(2..2),
3712 items: vec![],
3713 }),
3714 ast: Box::new(lit('a', 3)),
3715 }))
3716 );
3717
3718 assert_eq!(
3719 parser("(?i:a)").parse(),
3720 Ok(Ast::Group(ast::Group {
3721 span: span(0..6),
3722 kind: ast::GroupKind::NonCapturing(ast::Flags {
3723 span: span(2..3),
3724 items: vec![ast::FlagsItem {
3725 span: span(2..3),
3726 kind: ast::FlagsItemKind::Flag(
3727 ast::Flag::CaseInsensitive
3728 ),
3729 },],
3730 }),
3731 ast: Box::new(lit('a', 4)),
3732 }))
3733 );
3734 assert_eq!(
3735 parser("(?i-U:a)").parse(),
3736 Ok(Ast::Group(ast::Group {
3737 span: span(0..8),
3738 kind: ast::GroupKind::NonCapturing(ast::Flags {
3739 span: span(2..5),
3740 items: vec![
3741 ast::FlagsItem {
3742 span: span(2..3),
3743 kind: ast::FlagsItemKind::Flag(
3744 ast::Flag::CaseInsensitive
3745 ),
3746 },
3747 ast::FlagsItem {
3748 span: span(3..4),
3749 kind: ast::FlagsItemKind::Negation,
3750 },
3751 ast::FlagsItem {
3752 span: span(4..5),
3753 kind: ast::FlagsItemKind::Flag(
3754 ast::Flag::SwapGreed
3755 ),
3756 },
3757 ],
3758 }),
3759 ast: Box::new(lit('a', 6)),
3760 }))
3761 );
3762
3763 assert_eq!(
3764 parser("(").parse().unwrap_err(),
3765 TestError {
3766 span: span(0..1),
3767 kind: ast::ErrorKind::GroupUnclosed,
3768 }
3769 );
3770 assert_eq!(
3771 parser("(?").parse().unwrap_err(),
3772 TestError {
3773 span: span(0..1),
3774 kind: ast::ErrorKind::GroupUnclosed,
3775 }
3776 );
3777 assert_eq!(
3778 parser("(?P").parse().unwrap_err(),
3779 TestError {
3780 span: span(2..3),
3781 kind: ast::ErrorKind::FlagUnrecognized,
3782 }
3783 );
3784 assert_eq!(
3785 parser("(?P<").parse().unwrap_err(),
3786 TestError {
3787 span: span(4..4),
3788 kind: ast::ErrorKind::GroupNameUnexpectedEof,
3789 }
3790 );
3791 assert_eq!(
3792 parser("(a").parse().unwrap_err(),
3793 TestError {
3794 span: span(0..1),
3795 kind: ast::ErrorKind::GroupUnclosed,
3796 }
3797 );
3798 assert_eq!(
3799 parser("(()").parse().unwrap_err(),
3800 TestError {
3801 span: span(0..1),
3802 kind: ast::ErrorKind::GroupUnclosed,
3803 }
3804 );
3805 assert_eq!(
3806 parser(")").parse().unwrap_err(),
3807 TestError {
3808 span: span(0..1),
3809 kind: ast::ErrorKind::GroupUnopened,
3810 }
3811 );
3812 assert_eq!(
3813 parser("a)").parse().unwrap_err(),
3814 TestError {
3815 span: span(1..2),
3816 kind: ast::ErrorKind::GroupUnopened,
3817 }
3818 );
3819 }
3820
3821 #[test]
parse_capture_name()3822 fn parse_capture_name() {
3823 assert_eq!(
3824 parser("(?P<a>z)").parse(),
3825 Ok(Ast::Group(ast::Group {
3826 span: span(0..8),
3827 kind: ast::GroupKind::CaptureName(ast::CaptureName {
3828 span: span(4..5),
3829 name: s("a"),
3830 index: 1,
3831 }),
3832 ast: Box::new(lit('z', 6)),
3833 }))
3834 );
3835 assert_eq!(
3836 parser("(?P<abc>z)").parse(),
3837 Ok(Ast::Group(ast::Group {
3838 span: span(0..10),
3839 kind: ast::GroupKind::CaptureName(ast::CaptureName {
3840 span: span(4..7),
3841 name: s("abc"),
3842 index: 1,
3843 }),
3844 ast: Box::new(lit('z', 8)),
3845 }))
3846 );
3847
3848 assert_eq!(
3849 parser("(?P<").parse().unwrap_err(),
3850 TestError {
3851 span: span(4..4),
3852 kind: ast::ErrorKind::GroupNameUnexpectedEof,
3853 }
3854 );
3855 assert_eq!(
3856 parser("(?P<>z)").parse().unwrap_err(),
3857 TestError {
3858 span: span(4..4),
3859 kind: ast::ErrorKind::GroupNameEmpty,
3860 }
3861 );
3862 assert_eq!(
3863 parser("(?P<a").parse().unwrap_err(),
3864 TestError {
3865 span: span(5..5),
3866 kind: ast::ErrorKind::GroupNameUnexpectedEof,
3867 }
3868 );
3869 assert_eq!(
3870 parser("(?P<ab").parse().unwrap_err(),
3871 TestError {
3872 span: span(6..6),
3873 kind: ast::ErrorKind::GroupNameUnexpectedEof,
3874 }
3875 );
3876 assert_eq!(
3877 parser("(?P<0a").parse().unwrap_err(),
3878 TestError {
3879 span: span(4..5),
3880 kind: ast::ErrorKind::GroupNameInvalid,
3881 }
3882 );
3883 assert_eq!(
3884 parser("(?P<~").parse().unwrap_err(),
3885 TestError {
3886 span: span(4..5),
3887 kind: ast::ErrorKind::GroupNameInvalid,
3888 }
3889 );
3890 assert_eq!(
3891 parser("(?P<abc~").parse().unwrap_err(),
3892 TestError {
3893 span: span(7..8),
3894 kind: ast::ErrorKind::GroupNameInvalid,
3895 }
3896 );
3897 assert_eq!(
3898 parser("(?P<a>y)(?P<a>z)").parse().unwrap_err(),
3899 TestError {
3900 span: span(12..13),
3901 kind: ast::ErrorKind::GroupNameDuplicate {
3902 original: span(4..5),
3903 },
3904 }
3905 );
3906 }
3907
3908 #[test]
parse_flags()3909 fn parse_flags() {
3910 assert_eq!(
3911 parser("i:").parse_flags(),
3912 Ok(ast::Flags {
3913 span: span(0..1),
3914 items: vec![ast::FlagsItem {
3915 span: span(0..1),
3916 kind: ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive),
3917 }],
3918 })
3919 );
3920 assert_eq!(
3921 parser("i)").parse_flags(),
3922 Ok(ast::Flags {
3923 span: span(0..1),
3924 items: vec![ast::FlagsItem {
3925 span: span(0..1),
3926 kind: ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive),
3927 }],
3928 })
3929 );
3930
3931 assert_eq!(
3932 parser("isU:").parse_flags(),
3933 Ok(ast::Flags {
3934 span: span(0..3),
3935 items: vec![
3936 ast::FlagsItem {
3937 span: span(0..1),
3938 kind: ast::FlagsItemKind::Flag(
3939 ast::Flag::CaseInsensitive
3940 ),
3941 },
3942 ast::FlagsItem {
3943 span: span(1..2),
3944 kind: ast::FlagsItemKind::Flag(
3945 ast::Flag::DotMatchesNewLine
3946 ),
3947 },
3948 ast::FlagsItem {
3949 span: span(2..3),
3950 kind: ast::FlagsItemKind::Flag(ast::Flag::SwapGreed),
3951 },
3952 ],
3953 })
3954 );
3955
3956 assert_eq!(
3957 parser("-isU:").parse_flags(),
3958 Ok(ast::Flags {
3959 span: span(0..4),
3960 items: vec![
3961 ast::FlagsItem {
3962 span: span(0..1),
3963 kind: ast::FlagsItemKind::Negation,
3964 },
3965 ast::FlagsItem {
3966 span: span(1..2),
3967 kind: ast::FlagsItemKind::Flag(
3968 ast::Flag::CaseInsensitive
3969 ),
3970 },
3971 ast::FlagsItem {
3972 span: span(2..3),
3973 kind: ast::FlagsItemKind::Flag(
3974 ast::Flag::DotMatchesNewLine
3975 ),
3976 },
3977 ast::FlagsItem {
3978 span: span(3..4),
3979 kind: ast::FlagsItemKind::Flag(ast::Flag::SwapGreed),
3980 },
3981 ],
3982 })
3983 );
3984 assert_eq!(
3985 parser("i-sU:").parse_flags(),
3986 Ok(ast::Flags {
3987 span: span(0..4),
3988 items: vec![
3989 ast::FlagsItem {
3990 span: span(0..1),
3991 kind: ast::FlagsItemKind::Flag(
3992 ast::Flag::CaseInsensitive
3993 ),
3994 },
3995 ast::FlagsItem {
3996 span: span(1..2),
3997 kind: ast::FlagsItemKind::Negation,
3998 },
3999 ast::FlagsItem {
4000 span: span(2..3),
4001 kind: ast::FlagsItemKind::Flag(
4002 ast::Flag::DotMatchesNewLine
4003 ),
4004 },
4005 ast::FlagsItem {
4006 span: span(3..4),
4007 kind: ast::FlagsItemKind::Flag(ast::Flag::SwapGreed),
4008 },
4009 ],
4010 })
4011 );
4012
4013 assert_eq!(
4014 parser("isU").parse_flags().unwrap_err(),
4015 TestError {
4016 span: span(3..3),
4017 kind: ast::ErrorKind::FlagUnexpectedEof,
4018 }
4019 );
4020 assert_eq!(
4021 parser("isUa:").parse_flags().unwrap_err(),
4022 TestError {
4023 span: span(3..4),
4024 kind: ast::ErrorKind::FlagUnrecognized,
4025 }
4026 );
4027 assert_eq!(
4028 parser("isUi:").parse_flags().unwrap_err(),
4029 TestError {
4030 span: span(3..4),
4031 kind: ast::ErrorKind::FlagDuplicate { original: span(0..1) },
4032 }
4033 );
4034 assert_eq!(
4035 parser("i-sU-i:").parse_flags().unwrap_err(),
4036 TestError {
4037 span: span(4..5),
4038 kind: ast::ErrorKind::FlagRepeatedNegation {
4039 original: span(1..2),
4040 },
4041 }
4042 );
4043 assert_eq!(
4044 parser("-)").parse_flags().unwrap_err(),
4045 TestError {
4046 span: span(0..1),
4047 kind: ast::ErrorKind::FlagDanglingNegation,
4048 }
4049 );
4050 assert_eq!(
4051 parser("i-)").parse_flags().unwrap_err(),
4052 TestError {
4053 span: span(1..2),
4054 kind: ast::ErrorKind::FlagDanglingNegation,
4055 }
4056 );
4057 assert_eq!(
4058 parser("iU-)").parse_flags().unwrap_err(),
4059 TestError {
4060 span: span(2..3),
4061 kind: ast::ErrorKind::FlagDanglingNegation,
4062 }
4063 );
4064 }
4065
4066 #[test]
parse_flag()4067 fn parse_flag() {
4068 assert_eq!(parser("i").parse_flag(), Ok(ast::Flag::CaseInsensitive));
4069 assert_eq!(parser("m").parse_flag(), Ok(ast::Flag::MultiLine));
4070 assert_eq!(parser("s").parse_flag(), Ok(ast::Flag::DotMatchesNewLine));
4071 assert_eq!(parser("U").parse_flag(), Ok(ast::Flag::SwapGreed));
4072 assert_eq!(parser("u").parse_flag(), Ok(ast::Flag::Unicode));
4073 assert_eq!(parser("x").parse_flag(), Ok(ast::Flag::IgnoreWhitespace));
4074
4075 assert_eq!(
4076 parser("a").parse_flag().unwrap_err(),
4077 TestError {
4078 span: span(0..1),
4079 kind: ast::ErrorKind::FlagUnrecognized,
4080 }
4081 );
4082 assert_eq!(
4083 parser("☃").parse_flag().unwrap_err(),
4084 TestError {
4085 span: span_range("☃", 0..3),
4086 kind: ast::ErrorKind::FlagUnrecognized,
4087 }
4088 );
4089 }
4090
4091 #[test]
parse_primitive_non_escape()4092 fn parse_primitive_non_escape() {
4093 assert_eq!(
4094 parser(r".").parse_primitive(),
4095 Ok(Primitive::Dot(span(0..1)))
4096 );
4097 assert_eq!(
4098 parser(r"^").parse_primitive(),
4099 Ok(Primitive::Assertion(ast::Assertion {
4100 span: span(0..1),
4101 kind: ast::AssertionKind::StartLine,
4102 }))
4103 );
4104 assert_eq!(
4105 parser(r"$").parse_primitive(),
4106 Ok(Primitive::Assertion(ast::Assertion {
4107 span: span(0..1),
4108 kind: ast::AssertionKind::EndLine,
4109 }))
4110 );
4111
4112 assert_eq!(
4113 parser(r"a").parse_primitive(),
4114 Ok(Primitive::Literal(ast::Literal {
4115 span: span(0..1),
4116 kind: ast::LiteralKind::Verbatim,
4117 c: 'a',
4118 }))
4119 );
4120 assert_eq!(
4121 parser(r"|").parse_primitive(),
4122 Ok(Primitive::Literal(ast::Literal {
4123 span: span(0..1),
4124 kind: ast::LiteralKind::Verbatim,
4125 c: '|',
4126 }))
4127 );
4128 assert_eq!(
4129 parser(r"☃").parse_primitive(),
4130 Ok(Primitive::Literal(ast::Literal {
4131 span: span_range("☃", 0..3),
4132 kind: ast::LiteralKind::Verbatim,
4133 c: '☃',
4134 }))
4135 );
4136 }
4137
4138 #[test]
parse_escape()4139 fn parse_escape() {
4140 assert_eq!(
4141 parser(r"\|").parse_primitive(),
4142 Ok(Primitive::Literal(ast::Literal {
4143 span: span(0..2),
4144 kind: ast::LiteralKind::Punctuation,
4145 c: '|',
4146 }))
4147 );
4148 let specials = &[
4149 (r"\a", '\x07', ast::SpecialLiteralKind::Bell),
4150 (r"\f", '\x0C', ast::SpecialLiteralKind::FormFeed),
4151 (r"\t", '\t', ast::SpecialLiteralKind::Tab),
4152 (r"\n", '\n', ast::SpecialLiteralKind::LineFeed),
4153 (r"\r", '\r', ast::SpecialLiteralKind::CarriageReturn),
4154 (r"\v", '\x0B', ast::SpecialLiteralKind::VerticalTab),
4155 ];
4156 for &(pat, c, ref kind) in specials {
4157 assert_eq!(
4158 parser(pat).parse_primitive(),
4159 Ok(Primitive::Literal(ast::Literal {
4160 span: span(0..2),
4161 kind: ast::LiteralKind::Special(kind.clone()),
4162 c: c,
4163 }))
4164 );
4165 }
4166 assert_eq!(
4167 parser(r"\A").parse_primitive(),
4168 Ok(Primitive::Assertion(ast::Assertion {
4169 span: span(0..2),
4170 kind: ast::AssertionKind::StartText,
4171 }))
4172 );
4173 assert_eq!(
4174 parser(r"\z").parse_primitive(),
4175 Ok(Primitive::Assertion(ast::Assertion {
4176 span: span(0..2),
4177 kind: ast::AssertionKind::EndText,
4178 }))
4179 );
4180 assert_eq!(
4181 parser(r"\b").parse_primitive(),
4182 Ok(Primitive::Assertion(ast::Assertion {
4183 span: span(0..2),
4184 kind: ast::AssertionKind::WordBoundary,
4185 }))
4186 );
4187 assert_eq!(
4188 parser(r"\B").parse_primitive(),
4189 Ok(Primitive::Assertion(ast::Assertion {
4190 span: span(0..2),
4191 kind: ast::AssertionKind::NotWordBoundary,
4192 }))
4193 );
4194
4195 assert_eq!(
4196 parser(r"\").parse_escape().unwrap_err(),
4197 TestError {
4198 span: span(0..1),
4199 kind: ast::ErrorKind::EscapeUnexpectedEof,
4200 }
4201 );
4202 assert_eq!(
4203 parser(r"\y").parse_escape().unwrap_err(),
4204 TestError {
4205 span: span(0..2),
4206 kind: ast::ErrorKind::EscapeUnrecognized,
4207 }
4208 );
4209 }
4210
4211 #[test]
parse_unsupported_backreference()4212 fn parse_unsupported_backreference() {
4213 assert_eq!(
4214 parser(r"\0").parse_escape().unwrap_err(),
4215 TestError {
4216 span: span(0..2),
4217 kind: ast::ErrorKind::UnsupportedBackreference,
4218 }
4219 );
4220 assert_eq!(
4221 parser(r"\9").parse_escape().unwrap_err(),
4222 TestError {
4223 span: span(0..2),
4224 kind: ast::ErrorKind::UnsupportedBackreference,
4225 }
4226 );
4227 }
4228
4229 #[test]
parse_octal()4230 fn parse_octal() {
4231 for i in 0..511 {
4232 let pat = format!(r"\{:o}", i);
4233 assert_eq!(
4234 parser_octal(&pat).parse_escape(),
4235 Ok(Primitive::Literal(ast::Literal {
4236 span: span(0..pat.len()),
4237 kind: ast::LiteralKind::Octal,
4238 c: ::std::char::from_u32(i).unwrap(),
4239 }))
4240 );
4241 }
4242 assert_eq!(
4243 parser_octal(r"\778").parse_escape(),
4244 Ok(Primitive::Literal(ast::Literal {
4245 span: span(0..3),
4246 kind: ast::LiteralKind::Octal,
4247 c: '?',
4248 }))
4249 );
4250 assert_eq!(
4251 parser_octal(r"\7777").parse_escape(),
4252 Ok(Primitive::Literal(ast::Literal {
4253 span: span(0..4),
4254 kind: ast::LiteralKind::Octal,
4255 c: '\u{01FF}',
4256 }))
4257 );
4258 assert_eq!(
4259 parser_octal(r"\778").parse(),
4260 Ok(Ast::Concat(ast::Concat {
4261 span: span(0..4),
4262 asts: vec![
4263 Ast::Literal(ast::Literal {
4264 span: span(0..3),
4265 kind: ast::LiteralKind::Octal,
4266 c: '?',
4267 }),
4268 Ast::Literal(ast::Literal {
4269 span: span(3..4),
4270 kind: ast::LiteralKind::Verbatim,
4271 c: '8',
4272 }),
4273 ],
4274 }))
4275 );
4276 assert_eq!(
4277 parser_octal(r"\7777").parse(),
4278 Ok(Ast::Concat(ast::Concat {
4279 span: span(0..5),
4280 asts: vec![
4281 Ast::Literal(ast::Literal {
4282 span: span(0..4),
4283 kind: ast::LiteralKind::Octal,
4284 c: '\u{01FF}',
4285 }),
4286 Ast::Literal(ast::Literal {
4287 span: span(4..5),
4288 kind: ast::LiteralKind::Verbatim,
4289 c: '7',
4290 }),
4291 ],
4292 }))
4293 );
4294
4295 assert_eq!(
4296 parser_octal(r"\8").parse_escape().unwrap_err(),
4297 TestError {
4298 span: span(0..2),
4299 kind: ast::ErrorKind::EscapeUnrecognized,
4300 }
4301 );
4302 }
4303
4304 #[test]
parse_hex_two()4305 fn parse_hex_two() {
4306 for i in 0..256 {
4307 let pat = format!(r"\x{:02x}", i);
4308 assert_eq!(
4309 parser(&pat).parse_escape(),
4310 Ok(Primitive::Literal(ast::Literal {
4311 span: span(0..pat.len()),
4312 kind: ast::LiteralKind::HexFixed(ast::HexLiteralKind::X),
4313 c: ::std::char::from_u32(i).unwrap(),
4314 }))
4315 );
4316 }
4317
4318 assert_eq!(
4319 parser(r"\xF").parse_escape().unwrap_err(),
4320 TestError {
4321 span: span(3..3),
4322 kind: ast::ErrorKind::EscapeUnexpectedEof,
4323 }
4324 );
4325 assert_eq!(
4326 parser(r"\xG").parse_escape().unwrap_err(),
4327 TestError {
4328 span: span(2..3),
4329 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4330 }
4331 );
4332 assert_eq!(
4333 parser(r"\xFG").parse_escape().unwrap_err(),
4334 TestError {
4335 span: span(3..4),
4336 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4337 }
4338 );
4339 }
4340
4341 #[test]
parse_hex_four()4342 fn parse_hex_four() {
4343 for i in 0..65536 {
4344 let c = match ::std::char::from_u32(i) {
4345 None => continue,
4346 Some(c) => c,
4347 };
4348 let pat = format!(r"\u{:04x}", i);
4349 assert_eq!(
4350 parser(&pat).parse_escape(),
4351 Ok(Primitive::Literal(ast::Literal {
4352 span: span(0..pat.len()),
4353 kind: ast::LiteralKind::HexFixed(
4354 ast::HexLiteralKind::UnicodeShort
4355 ),
4356 c: c,
4357 }))
4358 );
4359 }
4360
4361 assert_eq!(
4362 parser(r"\uF").parse_escape().unwrap_err(),
4363 TestError {
4364 span: span(3..3),
4365 kind: ast::ErrorKind::EscapeUnexpectedEof,
4366 }
4367 );
4368 assert_eq!(
4369 parser(r"\uG").parse_escape().unwrap_err(),
4370 TestError {
4371 span: span(2..3),
4372 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4373 }
4374 );
4375 assert_eq!(
4376 parser(r"\uFG").parse_escape().unwrap_err(),
4377 TestError {
4378 span: span(3..4),
4379 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4380 }
4381 );
4382 assert_eq!(
4383 parser(r"\uFFG").parse_escape().unwrap_err(),
4384 TestError {
4385 span: span(4..5),
4386 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4387 }
4388 );
4389 assert_eq!(
4390 parser(r"\uFFFG").parse_escape().unwrap_err(),
4391 TestError {
4392 span: span(5..6),
4393 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4394 }
4395 );
4396 assert_eq!(
4397 parser(r"\uD800").parse_escape().unwrap_err(),
4398 TestError {
4399 span: span(2..6),
4400 kind: ast::ErrorKind::EscapeHexInvalid,
4401 }
4402 );
4403 }
4404
4405 #[test]
parse_hex_eight()4406 fn parse_hex_eight() {
4407 for i in 0..65536 {
4408 let c = match ::std::char::from_u32(i) {
4409 None => continue,
4410 Some(c) => c,
4411 };
4412 let pat = format!(r"\U{:08x}", i);
4413 assert_eq!(
4414 parser(&pat).parse_escape(),
4415 Ok(Primitive::Literal(ast::Literal {
4416 span: span(0..pat.len()),
4417 kind: ast::LiteralKind::HexFixed(
4418 ast::HexLiteralKind::UnicodeLong
4419 ),
4420 c: c,
4421 }))
4422 );
4423 }
4424
4425 assert_eq!(
4426 parser(r"\UF").parse_escape().unwrap_err(),
4427 TestError {
4428 span: span(3..3),
4429 kind: ast::ErrorKind::EscapeUnexpectedEof,
4430 }
4431 );
4432 assert_eq!(
4433 parser(r"\UG").parse_escape().unwrap_err(),
4434 TestError {
4435 span: span(2..3),
4436 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4437 }
4438 );
4439 assert_eq!(
4440 parser(r"\UFG").parse_escape().unwrap_err(),
4441 TestError {
4442 span: span(3..4),
4443 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4444 }
4445 );
4446 assert_eq!(
4447 parser(r"\UFFG").parse_escape().unwrap_err(),
4448 TestError {
4449 span: span(4..5),
4450 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4451 }
4452 );
4453 assert_eq!(
4454 parser(r"\UFFFG").parse_escape().unwrap_err(),
4455 TestError {
4456 span: span(5..6),
4457 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4458 }
4459 );
4460 assert_eq!(
4461 parser(r"\UFFFFG").parse_escape().unwrap_err(),
4462 TestError {
4463 span: span(6..7),
4464 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4465 }
4466 );
4467 assert_eq!(
4468 parser(r"\UFFFFFG").parse_escape().unwrap_err(),
4469 TestError {
4470 span: span(7..8),
4471 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4472 }
4473 );
4474 assert_eq!(
4475 parser(r"\UFFFFFFG").parse_escape().unwrap_err(),
4476 TestError {
4477 span: span(8..9),
4478 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4479 }
4480 );
4481 assert_eq!(
4482 parser(r"\UFFFFFFFG").parse_escape().unwrap_err(),
4483 TestError {
4484 span: span(9..10),
4485 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4486 }
4487 );
4488 }
4489
4490 #[test]
parse_hex_brace()4491 fn parse_hex_brace() {
4492 assert_eq!(
4493 parser(r"\u{26c4}").parse_escape(),
4494 Ok(Primitive::Literal(ast::Literal {
4495 span: span(0..8),
4496 kind: ast::LiteralKind::HexBrace(
4497 ast::HexLiteralKind::UnicodeShort
4498 ),
4499 c: '⛄',
4500 }))
4501 );
4502 assert_eq!(
4503 parser(r"\U{26c4}").parse_escape(),
4504 Ok(Primitive::Literal(ast::Literal {
4505 span: span(0..8),
4506 kind: ast::LiteralKind::HexBrace(
4507 ast::HexLiteralKind::UnicodeLong
4508 ),
4509 c: '⛄',
4510 }))
4511 );
4512 assert_eq!(
4513 parser(r"\x{26c4}").parse_escape(),
4514 Ok(Primitive::Literal(ast::Literal {
4515 span: span(0..8),
4516 kind: ast::LiteralKind::HexBrace(ast::HexLiteralKind::X),
4517 c: '⛄',
4518 }))
4519 );
4520 assert_eq!(
4521 parser(r"\x{26C4}").parse_escape(),
4522 Ok(Primitive::Literal(ast::Literal {
4523 span: span(0..8),
4524 kind: ast::LiteralKind::HexBrace(ast::HexLiteralKind::X),
4525 c: '⛄',
4526 }))
4527 );
4528 assert_eq!(
4529 parser(r"\x{10fFfF}").parse_escape(),
4530 Ok(Primitive::Literal(ast::Literal {
4531 span: span(0..10),
4532 kind: ast::LiteralKind::HexBrace(ast::HexLiteralKind::X),
4533 c: '\u{10FFFF}',
4534 }))
4535 );
4536
4537 assert_eq!(
4538 parser(r"\x").parse_escape().unwrap_err(),
4539 TestError {
4540 span: span(2..2),
4541 kind: ast::ErrorKind::EscapeUnexpectedEof,
4542 }
4543 );
4544 assert_eq!(
4545 parser(r"\x{").parse_escape().unwrap_err(),
4546 TestError {
4547 span: span(2..3),
4548 kind: ast::ErrorKind::EscapeUnexpectedEof,
4549 }
4550 );
4551 assert_eq!(
4552 parser(r"\x{FF").parse_escape().unwrap_err(),
4553 TestError {
4554 span: span(2..5),
4555 kind: ast::ErrorKind::EscapeUnexpectedEof,
4556 }
4557 );
4558 assert_eq!(
4559 parser(r"\x{}").parse_escape().unwrap_err(),
4560 TestError {
4561 span: span(2..4),
4562 kind: ast::ErrorKind::EscapeHexEmpty,
4563 }
4564 );
4565 assert_eq!(
4566 parser(r"\x{FGF}").parse_escape().unwrap_err(),
4567 TestError {
4568 span: span(4..5),
4569 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4570 }
4571 );
4572 assert_eq!(
4573 parser(r"\x{FFFFFF}").parse_escape().unwrap_err(),
4574 TestError {
4575 span: span(3..9),
4576 kind: ast::ErrorKind::EscapeHexInvalid,
4577 }
4578 );
4579 assert_eq!(
4580 parser(r"\x{D800}").parse_escape().unwrap_err(),
4581 TestError {
4582 span: span(3..7),
4583 kind: ast::ErrorKind::EscapeHexInvalid,
4584 }
4585 );
4586 assert_eq!(
4587 parser(r"\x{FFFFFFFFF}").parse_escape().unwrap_err(),
4588 TestError {
4589 span: span(3..12),
4590 kind: ast::ErrorKind::EscapeHexInvalid,
4591 }
4592 );
4593 }
4594
4595 #[test]
parse_decimal()4596 fn parse_decimal() {
4597 assert_eq!(parser("123").parse_decimal(), Ok(123));
4598 assert_eq!(parser("0").parse_decimal(), Ok(0));
4599 assert_eq!(parser("01").parse_decimal(), Ok(1));
4600
4601 assert_eq!(
4602 parser("-1").parse_decimal().unwrap_err(),
4603 TestError { span: span(0..0), kind: ast::ErrorKind::DecimalEmpty }
4604 );
4605 assert_eq!(
4606 parser("").parse_decimal().unwrap_err(),
4607 TestError { span: span(0..0), kind: ast::ErrorKind::DecimalEmpty }
4608 );
4609 assert_eq!(
4610 parser("9999999999").parse_decimal().unwrap_err(),
4611 TestError {
4612 span: span(0..10),
4613 kind: ast::ErrorKind::DecimalInvalid,
4614 }
4615 );
4616 }
4617
4618 #[test]
parse_set_class()4619 fn parse_set_class() {
4620 fn union(span: Span, items: Vec<ast::ClassSetItem>) -> ast::ClassSet {
4621 ast::ClassSet::union(ast::ClassSetUnion {
4622 span: span,
4623 items: items,
4624 })
4625 }
4626
4627 fn intersection(
4628 span: Span,
4629 lhs: ast::ClassSet,
4630 rhs: ast::ClassSet,
4631 ) -> ast::ClassSet {
4632 ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp {
4633 span: span,
4634 kind: ast::ClassSetBinaryOpKind::Intersection,
4635 lhs: Box::new(lhs),
4636 rhs: Box::new(rhs),
4637 })
4638 }
4639
4640 fn difference(
4641 span: Span,
4642 lhs: ast::ClassSet,
4643 rhs: ast::ClassSet,
4644 ) -> ast::ClassSet {
4645 ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp {
4646 span: span,
4647 kind: ast::ClassSetBinaryOpKind::Difference,
4648 lhs: Box::new(lhs),
4649 rhs: Box::new(rhs),
4650 })
4651 }
4652
4653 fn symdifference(
4654 span: Span,
4655 lhs: ast::ClassSet,
4656 rhs: ast::ClassSet,
4657 ) -> ast::ClassSet {
4658 ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp {
4659 span: span,
4660 kind: ast::ClassSetBinaryOpKind::SymmetricDifference,
4661 lhs: Box::new(lhs),
4662 rhs: Box::new(rhs),
4663 })
4664 }
4665
4666 fn itemset(item: ast::ClassSetItem) -> ast::ClassSet {
4667 ast::ClassSet::Item(item)
4668 }
4669
4670 fn item_ascii(cls: ast::ClassAscii) -> ast::ClassSetItem {
4671 ast::ClassSetItem::Ascii(cls)
4672 }
4673
4674 fn item_unicode(cls: ast::ClassUnicode) -> ast::ClassSetItem {
4675 ast::ClassSetItem::Unicode(cls)
4676 }
4677
4678 fn item_perl(cls: ast::ClassPerl) -> ast::ClassSetItem {
4679 ast::ClassSetItem::Perl(cls)
4680 }
4681
4682 fn item_bracket(cls: ast::ClassBracketed) -> ast::ClassSetItem {
4683 ast::ClassSetItem::Bracketed(Box::new(cls))
4684 }
4685
4686 fn lit(span: Span, c: char) -> ast::ClassSetItem {
4687 ast::ClassSetItem::Literal(ast::Literal {
4688 span: span,
4689 kind: ast::LiteralKind::Verbatim,
4690 c: c,
4691 })
4692 }
4693
4694 fn empty(span: Span) -> ast::ClassSetItem {
4695 ast::ClassSetItem::Empty(span)
4696 }
4697
4698 fn range(span: Span, start: char, end: char) -> ast::ClassSetItem {
4699 let pos1 = Position {
4700 offset: span.start.offset + start.len_utf8(),
4701 column: span.start.column + 1,
4702 ..span.start
4703 };
4704 let pos2 = Position {
4705 offset: span.end.offset - end.len_utf8(),
4706 column: span.end.column - 1,
4707 ..span.end
4708 };
4709 ast::ClassSetItem::Range(ast::ClassSetRange {
4710 span: span,
4711 start: ast::Literal {
4712 span: Span { end: pos1, ..span },
4713 kind: ast::LiteralKind::Verbatim,
4714 c: start,
4715 },
4716 end: ast::Literal {
4717 span: Span { start: pos2, ..span },
4718 kind: ast::LiteralKind::Verbatim,
4719 c: end,
4720 },
4721 })
4722 }
4723
4724 fn alnum(span: Span, negated: bool) -> ast::ClassAscii {
4725 ast::ClassAscii {
4726 span: span,
4727 kind: ast::ClassAsciiKind::Alnum,
4728 negated: negated,
4729 }
4730 }
4731
4732 fn lower(span: Span, negated: bool) -> ast::ClassAscii {
4733 ast::ClassAscii {
4734 span: span,
4735 kind: ast::ClassAsciiKind::Lower,
4736 negated: negated,
4737 }
4738 }
4739
4740 assert_eq!(
4741 parser("[[:alnum:]]").parse(),
4742 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4743 span: span(0..11),
4744 negated: false,
4745 kind: itemset(item_ascii(alnum(span(1..10), false))),
4746 })))
4747 );
4748 assert_eq!(
4749 parser("[[[:alnum:]]]").parse(),
4750 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4751 span: span(0..13),
4752 negated: false,
4753 kind: itemset(item_bracket(ast::ClassBracketed {
4754 span: span(1..12),
4755 negated: false,
4756 kind: itemset(item_ascii(alnum(span(2..11), false))),
4757 })),
4758 })))
4759 );
4760 assert_eq!(
4761 parser("[[:alnum:]&&[:lower:]]").parse(),
4762 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4763 span: span(0..22),
4764 negated: false,
4765 kind: intersection(
4766 span(1..21),
4767 itemset(item_ascii(alnum(span(1..10), false))),
4768 itemset(item_ascii(lower(span(12..21), false))),
4769 ),
4770 })))
4771 );
4772 assert_eq!(
4773 parser("[[:alnum:]--[:lower:]]").parse(),
4774 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4775 span: span(0..22),
4776 negated: false,
4777 kind: difference(
4778 span(1..21),
4779 itemset(item_ascii(alnum(span(1..10), false))),
4780 itemset(item_ascii(lower(span(12..21), false))),
4781 ),
4782 })))
4783 );
4784 assert_eq!(
4785 parser("[[:alnum:]~~[:lower:]]").parse(),
4786 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4787 span: span(0..22),
4788 negated: false,
4789 kind: symdifference(
4790 span(1..21),
4791 itemset(item_ascii(alnum(span(1..10), false))),
4792 itemset(item_ascii(lower(span(12..21), false))),
4793 ),
4794 })))
4795 );
4796
4797 assert_eq!(
4798 parser("[a]").parse(),
4799 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4800 span: span(0..3),
4801 negated: false,
4802 kind: itemset(lit(span(1..2), 'a')),
4803 })))
4804 );
4805 assert_eq!(
4806 parser(r"[a\]]").parse(),
4807 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4808 span: span(0..5),
4809 negated: false,
4810 kind: union(
4811 span(1..4),
4812 vec![
4813 lit(span(1..2), 'a'),
4814 ast::ClassSetItem::Literal(ast::Literal {
4815 span: span(2..4),
4816 kind: ast::LiteralKind::Punctuation,
4817 c: ']',
4818 }),
4819 ]
4820 ),
4821 })))
4822 );
4823 assert_eq!(
4824 parser(r"[a\-z]").parse(),
4825 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4826 span: span(0..6),
4827 negated: false,
4828 kind: union(
4829 span(1..5),
4830 vec![
4831 lit(span(1..2), 'a'),
4832 ast::ClassSetItem::Literal(ast::Literal {
4833 span: span(2..4),
4834 kind: ast::LiteralKind::Punctuation,
4835 c: '-',
4836 }),
4837 lit(span(4..5), 'z'),
4838 ]
4839 ),
4840 })))
4841 );
4842 assert_eq!(
4843 parser("[ab]").parse(),
4844 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4845 span: span(0..4),
4846 negated: false,
4847 kind: union(
4848 span(1..3),
4849 vec![lit(span(1..2), 'a'), lit(span(2..3), 'b'),]
4850 ),
4851 })))
4852 );
4853 assert_eq!(
4854 parser("[a-]").parse(),
4855 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4856 span: span(0..4),
4857 negated: false,
4858 kind: union(
4859 span(1..3),
4860 vec![lit(span(1..2), 'a'), lit(span(2..3), '-'),]
4861 ),
4862 })))
4863 );
4864 assert_eq!(
4865 parser("[-a]").parse(),
4866 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4867 span: span(0..4),
4868 negated: false,
4869 kind: union(
4870 span(1..3),
4871 vec![lit(span(1..2), '-'), lit(span(2..3), 'a'),]
4872 ),
4873 })))
4874 );
4875 assert_eq!(
4876 parser(r"[\pL]").parse(),
4877 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4878 span: span(0..5),
4879 negated: false,
4880 kind: itemset(item_unicode(ast::ClassUnicode {
4881 span: span(1..4),
4882 negated: false,
4883 kind: ast::ClassUnicodeKind::OneLetter('L'),
4884 })),
4885 })))
4886 );
4887 assert_eq!(
4888 parser(r"[\w]").parse(),
4889 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4890 span: span(0..4),
4891 negated: false,
4892 kind: itemset(item_perl(ast::ClassPerl {
4893 span: span(1..3),
4894 kind: ast::ClassPerlKind::Word,
4895 negated: false,
4896 })),
4897 })))
4898 );
4899 assert_eq!(
4900 parser(r"[a\wz]").parse(),
4901 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4902 span: span(0..6),
4903 negated: false,
4904 kind: union(
4905 span(1..5),
4906 vec![
4907 lit(span(1..2), 'a'),
4908 item_perl(ast::ClassPerl {
4909 span: span(2..4),
4910 kind: ast::ClassPerlKind::Word,
4911 negated: false,
4912 }),
4913 lit(span(4..5), 'z'),
4914 ]
4915 ),
4916 })))
4917 );
4918
4919 assert_eq!(
4920 parser("[a-z]").parse(),
4921 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4922 span: span(0..5),
4923 negated: false,
4924 kind: itemset(range(span(1..4), 'a', 'z')),
4925 })))
4926 );
4927 assert_eq!(
4928 parser("[a-cx-z]").parse(),
4929 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4930 span: span(0..8),
4931 negated: false,
4932 kind: union(
4933 span(1..7),
4934 vec![
4935 range(span(1..4), 'a', 'c'),
4936 range(span(4..7), 'x', 'z'),
4937 ]
4938 ),
4939 })))
4940 );
4941 assert_eq!(
4942 parser(r"[\w&&a-cx-z]").parse(),
4943 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4944 span: span(0..12),
4945 negated: false,
4946 kind: intersection(
4947 span(1..11),
4948 itemset(item_perl(ast::ClassPerl {
4949 span: span(1..3),
4950 kind: ast::ClassPerlKind::Word,
4951 negated: false,
4952 })),
4953 union(
4954 span(5..11),
4955 vec![
4956 range(span(5..8), 'a', 'c'),
4957 range(span(8..11), 'x', 'z'),
4958 ]
4959 ),
4960 ),
4961 })))
4962 );
4963 assert_eq!(
4964 parser(r"[a-cx-z&&\w]").parse(),
4965 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4966 span: span(0..12),
4967 negated: false,
4968 kind: intersection(
4969 span(1..11),
4970 union(
4971 span(1..7),
4972 vec![
4973 range(span(1..4), 'a', 'c'),
4974 range(span(4..7), 'x', 'z'),
4975 ]
4976 ),
4977 itemset(item_perl(ast::ClassPerl {
4978 span: span(9..11),
4979 kind: ast::ClassPerlKind::Word,
4980 negated: false,
4981 })),
4982 ),
4983 })))
4984 );
4985 assert_eq!(
4986 parser(r"[a--b--c]").parse(),
4987 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4988 span: span(0..9),
4989 negated: false,
4990 kind: difference(
4991 span(1..8),
4992 difference(
4993 span(1..5),
4994 itemset(lit(span(1..2), 'a')),
4995 itemset(lit(span(4..5), 'b')),
4996 ),
4997 itemset(lit(span(7..8), 'c')),
4998 ),
4999 })))
5000 );
5001 assert_eq!(
5002 parser(r"[a~~b~~c]").parse(),
5003 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
5004 span: span(0..9),
5005 negated: false,
5006 kind: symdifference(
5007 span(1..8),
5008 symdifference(
5009 span(1..5),
5010 itemset(lit(span(1..2), 'a')),
5011 itemset(lit(span(4..5), 'b')),
5012 ),
5013 itemset(lit(span(7..8), 'c')),
5014 ),
5015 })))
5016 );
5017 assert_eq!(
5018 parser(r"[\^&&^]").parse(),
5019 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
5020 span: span(0..7),
5021 negated: false,
5022 kind: intersection(
5023 span(1..6),
5024 itemset(ast::ClassSetItem::Literal(ast::Literal {
5025 span: span(1..3),
5026 kind: ast::LiteralKind::Punctuation,
5027 c: '^',
5028 })),
5029 itemset(lit(span(5..6), '^')),
5030 ),
5031 })))
5032 );
5033 assert_eq!(
5034 parser(r"[\&&&&]").parse(),
5035 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
5036 span: span(0..7),
5037 negated: false,
5038 kind: intersection(
5039 span(1..6),
5040 itemset(ast::ClassSetItem::Literal(ast::Literal {
5041 span: span(1..3),
5042 kind: ast::LiteralKind::Punctuation,
5043 c: '&',
5044 })),
5045 itemset(lit(span(5..6), '&')),
5046 ),
5047 })))
5048 );
5049 assert_eq!(
5050 parser(r"[&&&&]").parse(),
5051 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
5052 span: span(0..6),
5053 negated: false,
5054 kind: intersection(
5055 span(1..5),
5056 intersection(
5057 span(1..3),
5058 itemset(empty(span(1..1))),
5059 itemset(empty(span(3..3))),
5060 ),
5061 itemset(empty(span(5..5))),
5062 ),
5063 })))
5064 );
5065
5066 let pat = "[☃-⛄]";
5067 assert_eq!(
5068 parser(pat).parse(),
5069 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
5070 span: span_range(pat, 0..9),
5071 negated: false,
5072 kind: itemset(ast::ClassSetItem::Range(ast::ClassSetRange {
5073 span: span_range(pat, 1..8),
5074 start: ast::Literal {
5075 span: span_range(pat, 1..4),
5076 kind: ast::LiteralKind::Verbatim,
5077 c: '☃',
5078 },
5079 end: ast::Literal {
5080 span: span_range(pat, 5..8),
5081 kind: ast::LiteralKind::Verbatim,
5082 c: '⛄',
5083 },
5084 })),
5085 })))
5086 );
5087
5088 assert_eq!(
5089 parser(r"[]]").parse(),
5090 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
5091 span: span(0..3),
5092 negated: false,
5093 kind: itemset(lit(span(1..2), ']')),
5094 })))
5095 );
5096 assert_eq!(
5097 parser(r"[]\[]").parse(),
5098 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
5099 span: span(0..5),
5100 negated: false,
5101 kind: union(
5102 span(1..4),
5103 vec![
5104 lit(span(1..2), ']'),
5105 ast::ClassSetItem::Literal(ast::Literal {
5106 span: span(2..4),
5107 kind: ast::LiteralKind::Punctuation,
5108 c: '[',
5109 }),
5110 ]
5111 ),
5112 })))
5113 );
5114 assert_eq!(
5115 parser(r"[\[]]").parse(),
5116 Ok(concat(
5117 0..5,
5118 vec![
5119 Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
5120 span: span(0..4),
5121 negated: false,
5122 kind: itemset(ast::ClassSetItem::Literal(
5123 ast::Literal {
5124 span: span(1..3),
5125 kind: ast::LiteralKind::Punctuation,
5126 c: '[',
5127 }
5128 )),
5129 })),
5130 Ast::Literal(ast::Literal {
5131 span: span(4..5),
5132 kind: ast::LiteralKind::Verbatim,
5133 c: ']',
5134 }),
5135 ]
5136 ))
5137 );
5138
5139 assert_eq!(
5140 parser("[").parse().unwrap_err(),
5141 TestError {
5142 span: span(0..1),
5143 kind: ast::ErrorKind::ClassUnclosed,
5144 }
5145 );
5146 assert_eq!(
5147 parser("[[").parse().unwrap_err(),
5148 TestError {
5149 span: span(1..2),
5150 kind: ast::ErrorKind::ClassUnclosed,
5151 }
5152 );
5153 assert_eq!(
5154 parser("[[-]").parse().unwrap_err(),
5155 TestError {
5156 span: span(0..1),
5157 kind: ast::ErrorKind::ClassUnclosed,
5158 }
5159 );
5160 assert_eq!(
5161 parser("[[[:alnum:]").parse().unwrap_err(),
5162 TestError {
5163 span: span(1..2),
5164 kind: ast::ErrorKind::ClassUnclosed,
5165 }
5166 );
5167 assert_eq!(
5168 parser(r"[\b]").parse().unwrap_err(),
5169 TestError {
5170 span: span(1..3),
5171 kind: ast::ErrorKind::ClassEscapeInvalid,
5172 }
5173 );
5174 assert_eq!(
5175 parser(r"[\w-a]").parse().unwrap_err(),
5176 TestError {
5177 span: span(1..3),
5178 kind: ast::ErrorKind::ClassRangeLiteral,
5179 }
5180 );
5181 assert_eq!(
5182 parser(r"[a-\w]").parse().unwrap_err(),
5183 TestError {
5184 span: span(3..5),
5185 kind: ast::ErrorKind::ClassRangeLiteral,
5186 }
5187 );
5188 assert_eq!(
5189 parser(r"[z-a]").parse().unwrap_err(),
5190 TestError {
5191 span: span(1..4),
5192 kind: ast::ErrorKind::ClassRangeInvalid,
5193 }
5194 );
5195
5196 assert_eq!(
5197 parser_ignore_whitespace("[a ").parse().unwrap_err(),
5198 TestError {
5199 span: span(0..1),
5200 kind: ast::ErrorKind::ClassUnclosed,
5201 }
5202 );
5203 assert_eq!(
5204 parser_ignore_whitespace("[a- ").parse().unwrap_err(),
5205 TestError {
5206 span: span(0..1),
5207 kind: ast::ErrorKind::ClassUnclosed,
5208 }
5209 );
5210 }
5211
5212 #[test]
parse_set_class_open()5213 fn parse_set_class_open() {
5214 assert_eq!(parser("[a]").parse_set_class_open(), {
5215 let set = ast::ClassBracketed {
5216 span: span(0..1),
5217 negated: false,
5218 kind: ast::ClassSet::union(ast::ClassSetUnion {
5219 span: span(1..1),
5220 items: vec![],
5221 }),
5222 };
5223 let union = ast::ClassSetUnion { span: span(1..1), items: vec![] };
5224 Ok((set, union))
5225 });
5226 assert_eq!(
5227 parser_ignore_whitespace("[ a]").parse_set_class_open(),
5228 {
5229 let set = ast::ClassBracketed {
5230 span: span(0..4),
5231 negated: false,
5232 kind: ast::ClassSet::union(ast::ClassSetUnion {
5233 span: span(4..4),
5234 items: vec![],
5235 }),
5236 };
5237 let union =
5238 ast::ClassSetUnion { span: span(4..4), items: vec![] };
5239 Ok((set, union))
5240 }
5241 );
5242 assert_eq!(parser("[^a]").parse_set_class_open(), {
5243 let set = ast::ClassBracketed {
5244 span: span(0..2),
5245 negated: true,
5246 kind: ast::ClassSet::union(ast::ClassSetUnion {
5247 span: span(2..2),
5248 items: vec![],
5249 }),
5250 };
5251 let union = ast::ClassSetUnion { span: span(2..2), items: vec![] };
5252 Ok((set, union))
5253 });
5254 assert_eq!(
5255 parser_ignore_whitespace("[ ^ a]").parse_set_class_open(),
5256 {
5257 let set = ast::ClassBracketed {
5258 span: span(0..4),
5259 negated: true,
5260 kind: ast::ClassSet::union(ast::ClassSetUnion {
5261 span: span(4..4),
5262 items: vec![],
5263 }),
5264 };
5265 let union =
5266 ast::ClassSetUnion { span: span(4..4), items: vec![] };
5267 Ok((set, union))
5268 }
5269 );
5270 assert_eq!(parser("[-a]").parse_set_class_open(), {
5271 let set = ast::ClassBracketed {
5272 span: span(0..2),
5273 negated: false,
5274 kind: ast::ClassSet::union(ast::ClassSetUnion {
5275 span: span(1..1),
5276 items: vec![],
5277 }),
5278 };
5279 let union = ast::ClassSetUnion {
5280 span: span(1..2),
5281 items: vec![ast::ClassSetItem::Literal(ast::Literal {
5282 span: span(1..2),
5283 kind: ast::LiteralKind::Verbatim,
5284 c: '-',
5285 })],
5286 };
5287 Ok((set, union))
5288 });
5289 assert_eq!(
5290 parser_ignore_whitespace("[ - a]").parse_set_class_open(),
5291 {
5292 let set = ast::ClassBracketed {
5293 span: span(0..4),
5294 negated: false,
5295 kind: ast::ClassSet::union(ast::ClassSetUnion {
5296 span: span(2..2),
5297 items: vec![],
5298 }),
5299 };
5300 let union = ast::ClassSetUnion {
5301 span: span(2..3),
5302 items: vec![ast::ClassSetItem::Literal(ast::Literal {
5303 span: span(2..3),
5304 kind: ast::LiteralKind::Verbatim,
5305 c: '-',
5306 })],
5307 };
5308 Ok((set, union))
5309 }
5310 );
5311 assert_eq!(parser("[^-a]").parse_set_class_open(), {
5312 let set = ast::ClassBracketed {
5313 span: span(0..3),
5314 negated: true,
5315 kind: ast::ClassSet::union(ast::ClassSetUnion {
5316 span: span(2..2),
5317 items: vec![],
5318 }),
5319 };
5320 let union = ast::ClassSetUnion {
5321 span: span(2..3),
5322 items: vec![ast::ClassSetItem::Literal(ast::Literal {
5323 span: span(2..3),
5324 kind: ast::LiteralKind::Verbatim,
5325 c: '-',
5326 })],
5327 };
5328 Ok((set, union))
5329 });
5330 assert_eq!(parser("[--a]").parse_set_class_open(), {
5331 let set = ast::ClassBracketed {
5332 span: span(0..3),
5333 negated: false,
5334 kind: ast::ClassSet::union(ast::ClassSetUnion {
5335 span: span(1..1),
5336 items: vec![],
5337 }),
5338 };
5339 let union = ast::ClassSetUnion {
5340 span: span(1..3),
5341 items: vec![
5342 ast::ClassSetItem::Literal(ast::Literal {
5343 span: span(1..2),
5344 kind: ast::LiteralKind::Verbatim,
5345 c: '-',
5346 }),
5347 ast::ClassSetItem::Literal(ast::Literal {
5348 span: span(2..3),
5349 kind: ast::LiteralKind::Verbatim,
5350 c: '-',
5351 }),
5352 ],
5353 };
5354 Ok((set, union))
5355 });
5356 assert_eq!(parser("[]a]").parse_set_class_open(), {
5357 let set = ast::ClassBracketed {
5358 span: span(0..2),
5359 negated: false,
5360 kind: ast::ClassSet::union(ast::ClassSetUnion {
5361 span: span(1..1),
5362 items: vec![],
5363 }),
5364 };
5365 let union = ast::ClassSetUnion {
5366 span: span(1..2),
5367 items: vec![ast::ClassSetItem::Literal(ast::Literal {
5368 span: span(1..2),
5369 kind: ast::LiteralKind::Verbatim,
5370 c: ']',
5371 })],
5372 };
5373 Ok((set, union))
5374 });
5375 assert_eq!(
5376 parser_ignore_whitespace("[ ] a]").parse_set_class_open(),
5377 {
5378 let set = ast::ClassBracketed {
5379 span: span(0..4),
5380 negated: false,
5381 kind: ast::ClassSet::union(ast::ClassSetUnion {
5382 span: span(2..2),
5383 items: vec![],
5384 }),
5385 };
5386 let union = ast::ClassSetUnion {
5387 span: span(2..3),
5388 items: vec![ast::ClassSetItem::Literal(ast::Literal {
5389 span: span(2..3),
5390 kind: ast::LiteralKind::Verbatim,
5391 c: ']',
5392 })],
5393 };
5394 Ok((set, union))
5395 }
5396 );
5397 assert_eq!(parser("[^]a]").parse_set_class_open(), {
5398 let set = ast::ClassBracketed {
5399 span: span(0..3),
5400 negated: true,
5401 kind: ast::ClassSet::union(ast::ClassSetUnion {
5402 span: span(2..2),
5403 items: vec![],
5404 }),
5405 };
5406 let union = ast::ClassSetUnion {
5407 span: span(2..3),
5408 items: vec![ast::ClassSetItem::Literal(ast::Literal {
5409 span: span(2..3),
5410 kind: ast::LiteralKind::Verbatim,
5411 c: ']',
5412 })],
5413 };
5414 Ok((set, union))
5415 });
5416 assert_eq!(parser("[-]a]").parse_set_class_open(), {
5417 let set = ast::ClassBracketed {
5418 span: span(0..2),
5419 negated: false,
5420 kind: ast::ClassSet::union(ast::ClassSetUnion {
5421 span: span(1..1),
5422 items: vec![],
5423 }),
5424 };
5425 let union = ast::ClassSetUnion {
5426 span: span(1..2),
5427 items: vec![ast::ClassSetItem::Literal(ast::Literal {
5428 span: span(1..2),
5429 kind: ast::LiteralKind::Verbatim,
5430 c: '-',
5431 })],
5432 };
5433 Ok((set, union))
5434 });
5435
5436 assert_eq!(
5437 parser("[").parse_set_class_open().unwrap_err(),
5438 TestError {
5439 span: span(0..1),
5440 kind: ast::ErrorKind::ClassUnclosed,
5441 }
5442 );
5443 assert_eq!(
5444 parser_ignore_whitespace("[ ")
5445 .parse_set_class_open()
5446 .unwrap_err(),
5447 TestError {
5448 span: span(0..5),
5449 kind: ast::ErrorKind::ClassUnclosed,
5450 }
5451 );
5452 assert_eq!(
5453 parser("[^").parse_set_class_open().unwrap_err(),
5454 TestError {
5455 span: span(0..2),
5456 kind: ast::ErrorKind::ClassUnclosed,
5457 }
5458 );
5459 assert_eq!(
5460 parser("[]").parse_set_class_open().unwrap_err(),
5461 TestError {
5462 span: span(0..2),
5463 kind: ast::ErrorKind::ClassUnclosed,
5464 }
5465 );
5466 assert_eq!(
5467 parser("[-").parse_set_class_open().unwrap_err(),
5468 TestError {
5469 span: span(0..2),
5470 kind: ast::ErrorKind::ClassUnclosed,
5471 }
5472 );
5473 assert_eq!(
5474 parser("[--").parse_set_class_open().unwrap_err(),
5475 TestError {
5476 span: span(0..3),
5477 kind: ast::ErrorKind::ClassUnclosed,
5478 }
5479 );
5480 }
5481
5482 #[test]
maybe_parse_ascii_class()5483 fn maybe_parse_ascii_class() {
5484 assert_eq!(
5485 parser(r"[:alnum:]").maybe_parse_ascii_class(),
5486 Some(ast::ClassAscii {
5487 span: span(0..9),
5488 kind: ast::ClassAsciiKind::Alnum,
5489 negated: false,
5490 })
5491 );
5492 assert_eq!(
5493 parser(r"[:alnum:]A").maybe_parse_ascii_class(),
5494 Some(ast::ClassAscii {
5495 span: span(0..9),
5496 kind: ast::ClassAsciiKind::Alnum,
5497 negated: false,
5498 })
5499 );
5500 assert_eq!(
5501 parser(r"[:^alnum:]").maybe_parse_ascii_class(),
5502 Some(ast::ClassAscii {
5503 span: span(0..10),
5504 kind: ast::ClassAsciiKind::Alnum,
5505 negated: true,
5506 })
5507 );
5508
5509 let p = parser(r"[:");
5510 assert_eq!(p.maybe_parse_ascii_class(), None);
5511 assert_eq!(p.offset(), 0);
5512
5513 let p = parser(r"[:^");
5514 assert_eq!(p.maybe_parse_ascii_class(), None);
5515 assert_eq!(p.offset(), 0);
5516
5517 let p = parser(r"[^:alnum:]");
5518 assert_eq!(p.maybe_parse_ascii_class(), None);
5519 assert_eq!(p.offset(), 0);
5520
5521 let p = parser(r"[:alnnum:]");
5522 assert_eq!(p.maybe_parse_ascii_class(), None);
5523 assert_eq!(p.offset(), 0);
5524
5525 let p = parser(r"[:alnum]");
5526 assert_eq!(p.maybe_parse_ascii_class(), None);
5527 assert_eq!(p.offset(), 0);
5528
5529 let p = parser(r"[:alnum:");
5530 assert_eq!(p.maybe_parse_ascii_class(), None);
5531 assert_eq!(p.offset(), 0);
5532 }
5533
5534 #[test]
parse_unicode_class()5535 fn parse_unicode_class() {
5536 assert_eq!(
5537 parser(r"\pN").parse_escape(),
5538 Ok(Primitive::Unicode(ast::ClassUnicode {
5539 span: span(0..3),
5540 negated: false,
5541 kind: ast::ClassUnicodeKind::OneLetter('N'),
5542 }))
5543 );
5544 assert_eq!(
5545 parser(r"\PN").parse_escape(),
5546 Ok(Primitive::Unicode(ast::ClassUnicode {
5547 span: span(0..3),
5548 negated: true,
5549 kind: ast::ClassUnicodeKind::OneLetter('N'),
5550 }))
5551 );
5552 assert_eq!(
5553 parser(r"\p{N}").parse_escape(),
5554 Ok(Primitive::Unicode(ast::ClassUnicode {
5555 span: span(0..5),
5556 negated: false,
5557 kind: ast::ClassUnicodeKind::Named(s("N")),
5558 }))
5559 );
5560 assert_eq!(
5561 parser(r"\P{N}").parse_escape(),
5562 Ok(Primitive::Unicode(ast::ClassUnicode {
5563 span: span(0..5),
5564 negated: true,
5565 kind: ast::ClassUnicodeKind::Named(s("N")),
5566 }))
5567 );
5568 assert_eq!(
5569 parser(r"\p{Greek}").parse_escape(),
5570 Ok(Primitive::Unicode(ast::ClassUnicode {
5571 span: span(0..9),
5572 negated: false,
5573 kind: ast::ClassUnicodeKind::Named(s("Greek")),
5574 }))
5575 );
5576
5577 assert_eq!(
5578 parser(r"\p{scx:Katakana}").parse_escape(),
5579 Ok(Primitive::Unicode(ast::ClassUnicode {
5580 span: span(0..16),
5581 negated: false,
5582 kind: ast::ClassUnicodeKind::NamedValue {
5583 op: ast::ClassUnicodeOpKind::Colon,
5584 name: s("scx"),
5585 value: s("Katakana"),
5586 },
5587 }))
5588 );
5589 assert_eq!(
5590 parser(r"\p{scx=Katakana}").parse_escape(),
5591 Ok(Primitive::Unicode(ast::ClassUnicode {
5592 span: span(0..16),
5593 negated: false,
5594 kind: ast::ClassUnicodeKind::NamedValue {
5595 op: ast::ClassUnicodeOpKind::Equal,
5596 name: s("scx"),
5597 value: s("Katakana"),
5598 },
5599 }))
5600 );
5601 assert_eq!(
5602 parser(r"\p{scx!=Katakana}").parse_escape(),
5603 Ok(Primitive::Unicode(ast::ClassUnicode {
5604 span: span(0..17),
5605 negated: false,
5606 kind: ast::ClassUnicodeKind::NamedValue {
5607 op: ast::ClassUnicodeOpKind::NotEqual,
5608 name: s("scx"),
5609 value: s("Katakana"),
5610 },
5611 }))
5612 );
5613
5614 assert_eq!(
5615 parser(r"\p{:}").parse_escape(),
5616 Ok(Primitive::Unicode(ast::ClassUnicode {
5617 span: span(0..5),
5618 negated: false,
5619 kind: ast::ClassUnicodeKind::NamedValue {
5620 op: ast::ClassUnicodeOpKind::Colon,
5621 name: s(""),
5622 value: s(""),
5623 },
5624 }))
5625 );
5626 assert_eq!(
5627 parser(r"\p{=}").parse_escape(),
5628 Ok(Primitive::Unicode(ast::ClassUnicode {
5629 span: span(0..5),
5630 negated: false,
5631 kind: ast::ClassUnicodeKind::NamedValue {
5632 op: ast::ClassUnicodeOpKind::Equal,
5633 name: s(""),
5634 value: s(""),
5635 },
5636 }))
5637 );
5638 assert_eq!(
5639 parser(r"\p{!=}").parse_escape(),
5640 Ok(Primitive::Unicode(ast::ClassUnicode {
5641 span: span(0..6),
5642 negated: false,
5643 kind: ast::ClassUnicodeKind::NamedValue {
5644 op: ast::ClassUnicodeOpKind::NotEqual,
5645 name: s(""),
5646 value: s(""),
5647 },
5648 }))
5649 );
5650
5651 assert_eq!(
5652 parser(r"\p").parse_escape().unwrap_err(),
5653 TestError {
5654 span: span(2..2),
5655 kind: ast::ErrorKind::EscapeUnexpectedEof,
5656 }
5657 );
5658 assert_eq!(
5659 parser(r"\p{").parse_escape().unwrap_err(),
5660 TestError {
5661 span: span(3..3),
5662 kind: ast::ErrorKind::EscapeUnexpectedEof,
5663 }
5664 );
5665 assert_eq!(
5666 parser(r"\p{N").parse_escape().unwrap_err(),
5667 TestError {
5668 span: span(4..4),
5669 kind: ast::ErrorKind::EscapeUnexpectedEof,
5670 }
5671 );
5672 assert_eq!(
5673 parser(r"\p{Greek").parse_escape().unwrap_err(),
5674 TestError {
5675 span: span(8..8),
5676 kind: ast::ErrorKind::EscapeUnexpectedEof,
5677 }
5678 );
5679
5680 assert_eq!(
5681 parser(r"\pNz").parse(),
5682 Ok(Ast::Concat(ast::Concat {
5683 span: span(0..4),
5684 asts: vec![
5685 Ast::Class(ast::Class::Unicode(ast::ClassUnicode {
5686 span: span(0..3),
5687 negated: false,
5688 kind: ast::ClassUnicodeKind::OneLetter('N'),
5689 })),
5690 Ast::Literal(ast::Literal {
5691 span: span(3..4),
5692 kind: ast::LiteralKind::Verbatim,
5693 c: 'z',
5694 }),
5695 ],
5696 }))
5697 );
5698 assert_eq!(
5699 parser(r"\p{Greek}z").parse(),
5700 Ok(Ast::Concat(ast::Concat {
5701 span: span(0..10),
5702 asts: vec![
5703 Ast::Class(ast::Class::Unicode(ast::ClassUnicode {
5704 span: span(0..9),
5705 negated: false,
5706 kind: ast::ClassUnicodeKind::Named(s("Greek")),
5707 })),
5708 Ast::Literal(ast::Literal {
5709 span: span(9..10),
5710 kind: ast::LiteralKind::Verbatim,
5711 c: 'z',
5712 }),
5713 ],
5714 }))
5715 );
5716 }
5717
5718 #[test]
parse_perl_class()5719 fn parse_perl_class() {
5720 assert_eq!(
5721 parser(r"\d").parse_escape(),
5722 Ok(Primitive::Perl(ast::ClassPerl {
5723 span: span(0..2),
5724 kind: ast::ClassPerlKind::Digit,
5725 negated: false,
5726 }))
5727 );
5728 assert_eq!(
5729 parser(r"\D").parse_escape(),
5730 Ok(Primitive::Perl(ast::ClassPerl {
5731 span: span(0..2),
5732 kind: ast::ClassPerlKind::Digit,
5733 negated: true,
5734 }))
5735 );
5736 assert_eq!(
5737 parser(r"\s").parse_escape(),
5738 Ok(Primitive::Perl(ast::ClassPerl {
5739 span: span(0..2),
5740 kind: ast::ClassPerlKind::Space,
5741 negated: false,
5742 }))
5743 );
5744 assert_eq!(
5745 parser(r"\S").parse_escape(),
5746 Ok(Primitive::Perl(ast::ClassPerl {
5747 span: span(0..2),
5748 kind: ast::ClassPerlKind::Space,
5749 negated: true,
5750 }))
5751 );
5752 assert_eq!(
5753 parser(r"\w").parse_escape(),
5754 Ok(Primitive::Perl(ast::ClassPerl {
5755 span: span(0..2),
5756 kind: ast::ClassPerlKind::Word,
5757 negated: false,
5758 }))
5759 );
5760 assert_eq!(
5761 parser(r"\W").parse_escape(),
5762 Ok(Primitive::Perl(ast::ClassPerl {
5763 span: span(0..2),
5764 kind: ast::ClassPerlKind::Word,
5765 negated: true,
5766 }))
5767 );
5768
5769 assert_eq!(
5770 parser(r"\d").parse(),
5771 Ok(Ast::Class(ast::Class::Perl(ast::ClassPerl {
5772 span: span(0..2),
5773 kind: ast::ClassPerlKind::Digit,
5774 negated: false,
5775 })))
5776 );
5777 assert_eq!(
5778 parser(r"\dz").parse(),
5779 Ok(Ast::Concat(ast::Concat {
5780 span: span(0..3),
5781 asts: vec![
5782 Ast::Class(ast::Class::Perl(ast::ClassPerl {
5783 span: span(0..2),
5784 kind: ast::ClassPerlKind::Digit,
5785 negated: false,
5786 })),
5787 Ast::Literal(ast::Literal {
5788 span: span(2..3),
5789 kind: ast::LiteralKind::Verbatim,
5790 c: 'z',
5791 }),
5792 ],
5793 }))
5794 );
5795 }
5796
5797 // This tests a bug fix where the nest limit checker wasn't decrementing
5798 // its depth during post-traversal, which causes long regexes to trip
5799 // the default limit too aggressively.
5800 #[test]
regression_454_nest_too_big()5801 fn regression_454_nest_too_big() {
5802 let pattern = r#"
5803 2(?:
5804 [45]\d{3}|
5805 7(?:
5806 1[0-267]|
5807 2[0-289]|
5808 3[0-29]|
5809 4[01]|
5810 5[1-3]|
5811 6[013]|
5812 7[0178]|
5813 91
5814 )|
5815 8(?:
5816 0[125]|
5817 [139][1-6]|
5818 2[0157-9]|
5819 41|
5820 6[1-35]|
5821 7[1-5]|
5822 8[1-8]|
5823 90
5824 )|
5825 9(?:
5826 0[0-2]|
5827 1[0-4]|
5828 2[568]|
5829 3[3-6]|
5830 5[5-7]|
5831 6[0167]|
5832 7[15]|
5833 8[0146-9]
5834 )
5835 )\d{4}
5836 "#;
5837 assert!(parser_nest_limit(pattern, 50).parse().is_ok());
5838 }
5839
5840 // This tests that we treat a trailing `-` in a character class as a
5841 // literal `-` even when whitespace mode is enabled and there is whitespace
5842 // after the trailing `-`.
5843 #[test]
regression_455_trailing_dash_ignore_whitespace()5844 fn regression_455_trailing_dash_ignore_whitespace() {
5845 assert!(parser("(?x)[ / - ]").parse().is_ok());
5846 assert!(parser("(?x)[ a - ]").parse().is_ok());
5847 assert!(parser(
5848 "(?x)[
5849 a
5850 - ]
5851 "
5852 )
5853 .parse()
5854 .is_ok());
5855 assert!(parser(
5856 "(?x)[
5857 a # wat
5858 - ]
5859 "
5860 )
5861 .parse()
5862 .is_ok());
5863
5864 assert!(parser("(?x)[ / -").parse().is_err());
5865 assert!(parser("(?x)[ / - ").parse().is_err());
5866 assert!(parser(
5867 "(?x)[
5868 / -
5869 "
5870 )
5871 .parse()
5872 .is_err());
5873 assert!(parser(
5874 "(?x)[
5875 / - # wat
5876 "
5877 )
5878 .parse()
5879 .is_err());
5880 }
5881 }
5882