1 /*!
2 This module provides a regular expression parser.
3 */
4
5 use std::borrow::Borrow;
6 use std::cell::{Cell, RefCell};
7 use std::mem;
8 use std::result;
9
10 use crate::ast::{self, Ast, Position, Span};
11 use crate::either::Either;
12
13 use crate::is_meta_character;
14
/// The result type used by this module: a standard `Result` whose error type
/// is fixed to `ast::Error`.
type Result<T> = result::Result<T, ast::Error>;
16
/// A primitive is an expression with no sub-expressions. This includes
/// literals, assertions and non-set character classes. This representation
/// is used as intermediate state in the parser.
///
/// This does not include ASCII character classes, since they can only appear
/// within a set character class.
#[derive(Clone, Debug, Eq, PartialEq)]
enum Primitive {
    /// A literal.
    Literal(ast::Literal),
    /// An assertion. Not permitted inside a character class.
    Assertion(ast::Assertion),
    /// A dot; only its span is stored. Not permitted inside a character
    /// class.
    Dot(Span),
    /// A Perl character class (`ast::ClassPerl`).
    Perl(ast::ClassPerl),
    /// A Unicode character class (`ast::ClassUnicode`).
    Unicode(ast::ClassUnicode),
}
31
32 impl Primitive {
33 /// Return the span of this primitive.
span(&self) -> &Span34 fn span(&self) -> &Span {
35 match *self {
36 Primitive::Literal(ref x) => &x.span,
37 Primitive::Assertion(ref x) => &x.span,
38 Primitive::Dot(ref span) => span,
39 Primitive::Perl(ref x) => &x.span,
40 Primitive::Unicode(ref x) => &x.span,
41 }
42 }
43
44 /// Convert this primitive into a proper AST.
into_ast(self) -> Ast45 fn into_ast(self) -> Ast {
46 match self {
47 Primitive::Literal(lit) => Ast::Literal(lit),
48 Primitive::Assertion(assert) => Ast::Assertion(assert),
49 Primitive::Dot(span) => Ast::Dot(span),
50 Primitive::Perl(cls) => Ast::Class(ast::Class::Perl(cls)),
51 Primitive::Unicode(cls) => Ast::Class(ast::Class::Unicode(cls)),
52 }
53 }
54
55 /// Convert this primitive into an item in a character class.
56 ///
57 /// If this primitive is not a legal item (i.e., an assertion or a dot),
58 /// then return an error.
into_class_set_item<P: Borrow<Parser>>( self, p: &ParserI<'_, P>, ) -> Result<ast::ClassSetItem>59 fn into_class_set_item<P: Borrow<Parser>>(
60 self,
61 p: &ParserI<'_, P>,
62 ) -> Result<ast::ClassSetItem> {
63 use self::Primitive::*;
64 use crate::ast::ClassSetItem;
65
66 match self {
67 Literal(lit) => Ok(ClassSetItem::Literal(lit)),
68 Perl(cls) => Ok(ClassSetItem::Perl(cls)),
69 Unicode(cls) => Ok(ClassSetItem::Unicode(cls)),
70 x => Err(p.error(*x.span(), ast::ErrorKind::ClassEscapeInvalid)),
71 }
72 }
73
74 /// Convert this primitive into a literal in a character class. In
75 /// particular, literals are the only valid items that can appear in
76 /// ranges.
77 ///
78 /// If this primitive is not a legal item (i.e., a class, assertion or a
79 /// dot), then return an error.
into_class_literal<P: Borrow<Parser>>( self, p: &ParserI<'_, P>, ) -> Result<ast::Literal>80 fn into_class_literal<P: Borrow<Parser>>(
81 self,
82 p: &ParserI<'_, P>,
83 ) -> Result<ast::Literal> {
84 use self::Primitive::*;
85
86 match self {
87 Literal(lit) => Ok(lit),
88 x => Err(p.error(*x.span(), ast::ErrorKind::ClassRangeLiteral)),
89 }
90 }
91 }
92
/// Reports whether `c` is an ASCII hexadecimal digit, i.e. one of `0-9`,
/// `a-f` or `A-F`.
fn is_hex(c: char) -> bool {
    c.is_ascii_hexdigit()
}
97
/// Reports whether `c` may appear in a capture group name.
///
/// ASCII letters and `_` are always allowed. ASCII digits, `.`, `[` and `]`
/// are allowed everywhere except in the first position, so they are rejected
/// when `first` is true. Every other character is rejected.
fn is_capture_char(c: char, first: bool) -> bool {
    match c {
        '_' | 'A'..='Z' | 'a'..='z' => true,
        '0'..='9' | '.' | '[' | ']' => !first,
        _ => false,
    }
}
109
/// A builder for a regular expression parser.
///
/// This builder permits modifying configuration options for the parser.
#[derive(Clone, Debug)]
pub struct ParserBuilder {
    /// Whether parsers built from this builder start out ignoring
    /// whitespace (verbose mode).
    ignore_whitespace: bool,
    /// The nesting limit copied into every parser that is built.
    nest_limit: u32,
    /// Whether parsers built from this builder support octal syntax.
    octal: bool,
}
119
120 impl Default for ParserBuilder {
default() -> ParserBuilder121 fn default() -> ParserBuilder {
122 ParserBuilder::new()
123 }
124 }
125
126 impl ParserBuilder {
127 /// Create a new parser builder with a default configuration.
new() -> ParserBuilder128 pub fn new() -> ParserBuilder {
129 ParserBuilder {
130 ignore_whitespace: false,
131 nest_limit: 250,
132 octal: false,
133 }
134 }
135
136 /// Build a parser from this configuration with the given pattern.
build(&self) -> Parser137 pub fn build(&self) -> Parser {
138 Parser {
139 pos: Cell::new(Position { offset: 0, line: 1, column: 1 }),
140 capture_index: Cell::new(0),
141 nest_limit: self.nest_limit,
142 octal: self.octal,
143 initial_ignore_whitespace: self.ignore_whitespace,
144 ignore_whitespace: Cell::new(self.ignore_whitespace),
145 comments: RefCell::new(vec![]),
146 stack_group: RefCell::new(vec![]),
147 stack_class: RefCell::new(vec![]),
148 capture_names: RefCell::new(vec![]),
149 scratch: RefCell::new(String::new()),
150 }
151 }
152
153 /// Set the nesting limit for this parser.
154 ///
155 /// The nesting limit controls how deep the abstract syntax tree is allowed
156 /// to be. If the AST exceeds the given limit (e.g., with too many nested
157 /// groups), then an error is returned by the parser.
158 ///
159 /// The purpose of this limit is to act as a heuristic to prevent stack
160 /// overflow for consumers that do structural induction on an `Ast` using
161 /// explicit recursion. While this crate never does this (instead using
162 /// constant stack space and moving the call stack to the heap), other
163 /// crates may.
164 ///
165 /// This limit is not checked until the entire Ast is parsed. Therefore,
166 /// if callers want to put a limit on the amount of heap space used, then
167 /// they should impose a limit on the length, in bytes, of the concrete
168 /// pattern string. In particular, this is viable since this parser
169 /// implementation will limit itself to heap space proportional to the
170 /// lenth of the pattern string.
171 ///
172 /// Note that a nest limit of `0` will return a nest limit error for most
173 /// patterns but not all. For example, a nest limit of `0` permits `a` but
174 /// not `ab`, since `ab` requires a concatenation, which results in a nest
175 /// depth of `1`. In general, a nest limit is not something that manifests
176 /// in an obvious way in the concrete syntax, therefore, it should not be
177 /// used in a granular way.
nest_limit(&mut self, limit: u32) -> &mut ParserBuilder178 pub fn nest_limit(&mut self, limit: u32) -> &mut ParserBuilder {
179 self.nest_limit = limit;
180 self
181 }
182
183 /// Whether to support octal syntax or not.
184 ///
185 /// Octal syntax is a little-known way of uttering Unicode codepoints in
186 /// a regular expression. For example, `a`, `\x61`, `\u0061` and
187 /// `\141` are all equivalent regular expressions, where the last example
188 /// shows octal syntax.
189 ///
190 /// While supporting octal syntax isn't in and of itself a problem, it does
191 /// make good error messages harder. That is, in PCRE based regex engines,
192 /// syntax like `\0` invokes a backreference, which is explicitly
193 /// unsupported in Rust's regex engine. However, many users expect it to
194 /// be supported. Therefore, when octal support is disabled, the error
195 /// message will explicitly mention that backreferences aren't supported.
196 ///
197 /// Octal syntax is disabled by default.
octal(&mut self, yes: bool) -> &mut ParserBuilder198 pub fn octal(&mut self, yes: bool) -> &mut ParserBuilder {
199 self.octal = yes;
200 self
201 }
202
203 /// Enable verbose mode in the regular expression.
204 ///
205 /// When enabled, verbose mode permits insigificant whitespace in many
206 /// places in the regular expression, as well as comments. Comments are
207 /// started using `#` and continue until the end of the line.
208 ///
209 /// By default, this is disabled. It may be selectively enabled in the
210 /// regular expression by using the `x` flag regardless of this setting.
ignore_whitespace(&mut self, yes: bool) -> &mut ParserBuilder211 pub fn ignore_whitespace(&mut self, yes: bool) -> &mut ParserBuilder {
212 self.ignore_whitespace = yes;
213 self
214 }
215 }
216
/// A regular expression parser.
///
/// This parses a string representation of a regular expression into an
/// abstract syntax tree. The size of the tree is proportional to the length
/// of the regular expression pattern.
///
/// A `Parser` can be configured in more detail via a
/// [`ParserBuilder`](struct.ParserBuilder.html).
#[derive(Clone, Debug)]
pub struct Parser {
    /// The current position of the parser.
    pos: Cell<Position>,
    /// The current capture index.
    capture_index: Cell<u32>,
    /// The maximum number of open parens/brackets allowed. If the parser
    /// exceeds this number, then an error is returned.
    nest_limit: u32,
    /// Whether to support octal syntax or not. When `false`, the parser will
    /// return an error helpfully pointing out that backreferences are not
    /// supported.
    octal: bool,
    /// The initial setting for `ignore_whitespace` as provided by
    /// `ParserBuilder`. This is used when resetting the parser's state.
    initial_ignore_whitespace: bool,
    /// Whether whitespace should be ignored. When enabled, comments are
    /// also permitted.
    ignore_whitespace: Cell<bool>,
    /// A list of comments, in order of appearance.
    comments: RefCell<Vec<ast::Comment>>,
    /// A stack of grouped sub-expressions, including alternations.
    stack_group: RefCell<Vec<GroupState>>,
    /// A stack of nested character classes. This is only non-empty when
    /// parsing a class.
    stack_class: RefCell<Vec<ClassState>>,
    /// A sorted sequence of capture names. This is used to detect duplicate
    /// capture names and report an error if one is detected.
    capture_names: RefCell<Vec<ast::CaptureName>>,
    /// A scratch buffer used in various places. Mostly this is used to
    /// accumulate relevant characters from parts of a pattern.
    scratch: RefCell<String>,
}
258
/// ParserI is the internal parser implementation.
///
/// We use this separate type so that we can carry the provided pattern string
/// along with us. In particular, a `Parser`'s internal state is not tied to
/// any one pattern, but a `ParserI` is.
///
/// This type also lets us use `ParserI<&Parser>` in production code while
/// retaining the convenience of `ParserI<Parser>` for tests, which sometimes
/// work against the internal interface of the parser.
#[derive(Clone, Debug)]
struct ParserI<'s, P> {
    /// The parser state/configuration.
    parser: P,
    /// The full regular expression provided by the user.
    pattern: &'s str,
}
275
/// GroupState represents a single stack frame while parsing nested groups
/// and alternations. Each frame records the state up to an opening
/// parenthesis or an alternation bar `|`.
#[derive(Clone, Debug)]
enum GroupState {
    /// This state is pushed whenever an opening group is found.
    Group {
        /// The concatenation immediately preceding the opening group.
        concat: ast::Concat,
        /// The group that has been opened. Its sub-AST is always empty.
        group: ast::Group,
        /// Whether whitespace was being ignored (the `x` flag) immediately
        /// before this group was opened. This value is restored when the
        /// group is closed.
        ignore_whitespace: bool,
    },
    /// This state is pushed whenever a new alternation branch is found. If
    /// an alternation branch is found and this state is at the top of the
    /// stack, then this state should be modified to include the new
    /// alternation.
    Alternation(ast::Alternation),
}
296
/// ClassState represents a single stack frame while parsing character classes.
/// Each frame records the state up to an intersection, difference, symmetric
/// difference or nested class.
///
/// Note that a parser's character class stack is only non-empty when parsing
/// a character class. In all other cases, it is empty.
#[derive(Clone, Debug)]
enum ClassState {
    /// This state is pushed whenever an opening bracket is found.
    Open {
        /// The union of class items immediately preceding this class.
        union: ast::ClassSetUnion,
        /// The class that has been opened. Typically this just corresponds
        /// to the `[`, but it can also include `[^` since `^` indicates
        /// negation of the class.
        set: ast::ClassBracketed,
    },
    /// This state is pushed when an operator is seen. When popped, the
    /// stored set becomes the left hand side of the operator.
    Op {
        /// The type of the operation, i.e., &&, -- or ~~.
        kind: ast::ClassSetBinaryOpKind,
        /// The left-hand side of the operator.
        lhs: ast::ClassSet,
    },
}
323
324 impl Parser {
325 /// Create a new parser with a default configuration.
326 ///
327 /// The parser can be run with either the `parse` or `parse_with_comments`
328 /// methods. The parse methods return an abstract syntax tree.
329 ///
330 /// To set configuration options on the parser, use
331 /// [`ParserBuilder`](struct.ParserBuilder.html).
new() -> Parser332 pub fn new() -> Parser {
333 ParserBuilder::new().build()
334 }
335
336 /// Parse the regular expression into an abstract syntax tree.
parse(&mut self, pattern: &str) -> Result<Ast>337 pub fn parse(&mut self, pattern: &str) -> Result<Ast> {
338 ParserI::new(self, pattern).parse()
339 }
340
341 /// Parse the regular expression and return an abstract syntax tree with
342 /// all of the comments found in the pattern.
parse_with_comments( &mut self, pattern: &str, ) -> Result<ast::WithComments>343 pub fn parse_with_comments(
344 &mut self,
345 pattern: &str,
346 ) -> Result<ast::WithComments> {
347 ParserI::new(self, pattern).parse_with_comments()
348 }
349
350 /// Reset the internal state of a parser.
351 ///
352 /// This is called at the beginning of every parse. This prevents the
353 /// parser from running with inconsistent state (say, if a previous
354 /// invocation returned an error and the parser is reused).
reset(&self)355 fn reset(&self) {
356 // These settings should be in line with the construction
357 // in `ParserBuilder::build`.
358 self.pos.set(Position { offset: 0, line: 1, column: 1 });
359 self.ignore_whitespace.set(self.initial_ignore_whitespace);
360 self.comments.borrow_mut().clear();
361 self.stack_group.borrow_mut().clear();
362 self.stack_class.borrow_mut().clear();
363 }
364 }
365
366 impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
367 /// Build an internal parser from a parser configuration and a pattern.
new(parser: P, pattern: &'s str) -> ParserI<'s, P>368 fn new(parser: P, pattern: &'s str) -> ParserI<'s, P> {
369 ParserI { parser: parser, pattern: pattern }
370 }
371
372 /// Return a reference to the parser state.
parser(&self) -> &Parser373 fn parser(&self) -> &Parser {
374 self.parser.borrow()
375 }
376
377 /// Return a reference to the pattern being parsed.
pattern(&self) -> &str378 fn pattern(&self) -> &str {
379 self.pattern.borrow()
380 }
381
382 /// Create a new error with the given span and error type.
error(&self, span: Span, kind: ast::ErrorKind) -> ast::Error383 fn error(&self, span: Span, kind: ast::ErrorKind) -> ast::Error {
384 ast::Error {
385 kind: kind,
386 pattern: self.pattern().to_string(),
387 span: span,
388 }
389 }
390
391 /// Return the current offset of the parser.
392 ///
393 /// The offset starts at `0` from the beginning of the regular expression
394 /// pattern string.
offset(&self) -> usize395 fn offset(&self) -> usize {
396 self.parser().pos.get().offset
397 }
398
399 /// Return the current line number of the parser.
400 ///
401 /// The line number starts at `1`.
line(&self) -> usize402 fn line(&self) -> usize {
403 self.parser().pos.get().line
404 }
405
406 /// Return the current column of the parser.
407 ///
408 /// The column number starts at `1` and is reset whenever a `\n` is seen.
column(&self) -> usize409 fn column(&self) -> usize {
410 self.parser().pos.get().column
411 }
412
413 /// Return the next capturing index. Each subsequent call increments the
414 /// internal index.
415 ///
416 /// The span given should correspond to the location of the opening
417 /// parenthesis.
418 ///
419 /// If the capture limit is exceeded, then an error is returned.
next_capture_index(&self, span: Span) -> Result<u32>420 fn next_capture_index(&self, span: Span) -> Result<u32> {
421 let current = self.parser().capture_index.get();
422 let i = current.checked_add(1).ok_or_else(|| {
423 self.error(span, ast::ErrorKind::CaptureLimitExceeded)
424 })?;
425 self.parser().capture_index.set(i);
426 Ok(i)
427 }
428
429 /// Adds the given capture name to this parser. If this capture name has
430 /// already been used, then an error is returned.
add_capture_name(&self, cap: &ast::CaptureName) -> Result<()>431 fn add_capture_name(&self, cap: &ast::CaptureName) -> Result<()> {
432 let mut names = self.parser().capture_names.borrow_mut();
433 match names
434 .binary_search_by_key(&cap.name.as_str(), |c| c.name.as_str())
435 {
436 Err(i) => {
437 names.insert(i, cap.clone());
438 Ok(())
439 }
440 Ok(i) => Err(self.error(
441 cap.span,
442 ast::ErrorKind::GroupNameDuplicate { original: names[i].span },
443 )),
444 }
445 }
446
447 /// Return whether the parser should ignore whitespace or not.
ignore_whitespace(&self) -> bool448 fn ignore_whitespace(&self) -> bool {
449 self.parser().ignore_whitespace.get()
450 }
451
452 /// Return the character at the current position of the parser.
453 ///
454 /// This panics if the current position does not point to a valid char.
char(&self) -> char455 fn char(&self) -> char {
456 self.char_at(self.offset())
457 }
458
459 /// Return the character at the given position.
460 ///
461 /// This panics if the given position does not point to a valid char.
char_at(&self, i: usize) -> char462 fn char_at(&self, i: usize) -> char {
463 self.pattern()[i..]
464 .chars()
465 .next()
466 .unwrap_or_else(|| panic!("expected char at offset {}", i))
467 }
468
469 /// Bump the parser to the next Unicode scalar value.
470 ///
471 /// If the end of the input has been reached, then `false` is returned.
bump(&self) -> bool472 fn bump(&self) -> bool {
473 if self.is_eof() {
474 return false;
475 }
476 let Position { mut offset, mut line, mut column } = self.pos();
477 if self.char() == '\n' {
478 line = line.checked_add(1).unwrap();
479 column = 1;
480 } else {
481 column = column.checked_add(1).unwrap();
482 }
483 offset += self.char().len_utf8();
484 self.parser().pos.set(Position {
485 offset: offset,
486 line: line,
487 column: column,
488 });
489 self.pattern()[self.offset()..].chars().next().is_some()
490 }
491
492 /// If the substring starting at the current position of the parser has
493 /// the given prefix, then bump the parser to the character immediately
494 /// following the prefix and return true. Otherwise, don't bump the parser
495 /// and return false.
bump_if(&self, prefix: &str) -> bool496 fn bump_if(&self, prefix: &str) -> bool {
497 if self.pattern()[self.offset()..].starts_with(prefix) {
498 for _ in 0..prefix.chars().count() {
499 self.bump();
500 }
501 true
502 } else {
503 false
504 }
505 }
506
507 /// Returns true if and only if the parser is positioned at a look-around
508 /// prefix. The conditions under which this returns true must always
509 /// correspond to a regular expression that would otherwise be consider
510 /// invalid.
511 ///
512 /// This should only be called immediately after parsing the opening of
513 /// a group or a set of flags.
is_lookaround_prefix(&self) -> bool514 fn is_lookaround_prefix(&self) -> bool {
515 self.bump_if("?=")
516 || self.bump_if("?!")
517 || self.bump_if("?<=")
518 || self.bump_if("?<!")
519 }
520
521 /// Bump the parser, and if the `x` flag is enabled, bump through any
522 /// subsequent spaces. Return true if and only if the parser is not at
523 /// EOF.
bump_and_bump_space(&self) -> bool524 fn bump_and_bump_space(&self) -> bool {
525 if !self.bump() {
526 return false;
527 }
528 self.bump_space();
529 !self.is_eof()
530 }
531
532 /// If the `x` flag is enabled (i.e., whitespace insensitivity with
533 /// comments), then this will advance the parser through all whitespace
534 /// and comments to the next non-whitespace non-comment byte.
535 ///
536 /// If the `x` flag is disabled, then this is a no-op.
537 ///
538 /// This should be used selectively throughout the parser where
539 /// arbitrary whitespace is permitted when the `x` flag is enabled. For
540 /// example, `{ 5 , 6}` is equivalent to `{5,6}`.
bump_space(&self)541 fn bump_space(&self) {
542 if !self.ignore_whitespace() {
543 return;
544 }
545 while !self.is_eof() {
546 if self.char().is_whitespace() {
547 self.bump();
548 } else if self.char() == '#' {
549 let start = self.pos();
550 let mut comment_text = String::new();
551 self.bump();
552 while !self.is_eof() {
553 let c = self.char();
554 self.bump();
555 if c == '\n' {
556 break;
557 }
558 comment_text.push(c);
559 }
560 let comment = ast::Comment {
561 span: Span::new(start, self.pos()),
562 comment: comment_text,
563 };
564 self.parser().comments.borrow_mut().push(comment);
565 } else {
566 break;
567 }
568 }
569 }
570
571 /// Peek at the next character in the input without advancing the parser.
572 ///
573 /// If the input has been exhausted, then this returns `None`.
peek(&self) -> Option<char>574 fn peek(&self) -> Option<char> {
575 if self.is_eof() {
576 return None;
577 }
578 self.pattern()[self.offset() + self.char().len_utf8()..].chars().next()
579 }
580
581 /// Like peek, but will ignore spaces when the parser is in whitespace
582 /// insensitive mode.
peek_space(&self) -> Option<char>583 fn peek_space(&self) -> Option<char> {
584 if !self.ignore_whitespace() {
585 return self.peek();
586 }
587 if self.is_eof() {
588 return None;
589 }
590 let mut start = self.offset() + self.char().len_utf8();
591 let mut in_comment = false;
592 for (i, c) in self.pattern()[start..].char_indices() {
593 if c.is_whitespace() {
594 continue;
595 } else if !in_comment && c == '#' {
596 in_comment = true;
597 } else if in_comment && c == '\n' {
598 in_comment = false;
599 } else {
600 start += i;
601 break;
602 }
603 }
604 self.pattern()[start..].chars().next()
605 }
606
607 /// Returns true if the next call to `bump` would return false.
is_eof(&self) -> bool608 fn is_eof(&self) -> bool {
609 self.offset() == self.pattern().len()
610 }
611
612 /// Return the current position of the parser, which includes the offset,
613 /// line and column.
pos(&self) -> Position614 fn pos(&self) -> Position {
615 self.parser().pos.get()
616 }
617
618 /// Create a span at the current position of the parser. Both the start
619 /// and end of the span are set.
span(&self) -> Span620 fn span(&self) -> Span {
621 Span::splat(self.pos())
622 }
623
624 /// Create a span that covers the current character.
span_char(&self) -> Span625 fn span_char(&self) -> Span {
626 let mut next = Position {
627 offset: self.offset().checked_add(self.char().len_utf8()).unwrap(),
628 line: self.line(),
629 column: self.column().checked_add(1).unwrap(),
630 };
631 if self.char() == '\n' {
632 next.line += 1;
633 next.column = 1;
634 }
635 Span::new(self.pos(), next)
636 }
637
638 /// Parse and push a single alternation on to the parser's internal stack.
639 /// If the top of the stack already has an alternation, then add to that
640 /// instead of pushing a new one.
641 ///
642 /// The concatenation given corresponds to a single alternation branch.
643 /// The concatenation returned starts the next branch and is empty.
644 ///
645 /// This assumes the parser is currently positioned at `|` and will advance
646 /// the parser to the character following `|`.
647 #[inline(never)]
push_alternate(&self, mut concat: ast::Concat) -> Result<ast::Concat>648 fn push_alternate(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
649 assert_eq!(self.char(), '|');
650 concat.span.end = self.pos();
651 self.push_or_add_alternation(concat);
652 self.bump();
653 Ok(ast::Concat { span: self.span(), asts: vec![] })
654 }
655
656 /// Pushes or adds the given branch of an alternation to the parser's
657 /// internal stack of state.
push_or_add_alternation(&self, concat: ast::Concat)658 fn push_or_add_alternation(&self, concat: ast::Concat) {
659 use self::GroupState::*;
660
661 let mut stack = self.parser().stack_group.borrow_mut();
662 if let Some(&mut Alternation(ref mut alts)) = stack.last_mut() {
663 alts.asts.push(concat.into_ast());
664 return;
665 }
666 stack.push(Alternation(ast::Alternation {
667 span: Span::new(concat.span.start, self.pos()),
668 asts: vec![concat.into_ast()],
669 }));
670 }
671
672 /// Parse and push a group AST (and its parent concatenation) on to the
673 /// parser's internal stack. Return a fresh concatenation corresponding
674 /// to the group's sub-AST.
675 ///
676 /// If a set of flags was found (with no group), then the concatenation
677 /// is returned with that set of flags added.
678 ///
679 /// This assumes that the parser is currently positioned on the opening
680 /// parenthesis. It advances the parser to the character at the start
681 /// of the sub-expression (or adjoining expression).
682 ///
683 /// If there was a problem parsing the start of the group, then an error
684 /// is returned.
685 #[inline(never)]
push_group(&self, mut concat: ast::Concat) -> Result<ast::Concat>686 fn push_group(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
687 assert_eq!(self.char(), '(');
688 match self.parse_group()? {
689 Either::Left(set) => {
690 let ignore = set.flags.flag_state(ast::Flag::IgnoreWhitespace);
691 if let Some(v) = ignore {
692 self.parser().ignore_whitespace.set(v);
693 }
694
695 concat.asts.push(Ast::Flags(set));
696 Ok(concat)
697 }
698 Either::Right(group) => {
699 let old_ignore_whitespace = self.ignore_whitespace();
700 let new_ignore_whitespace = group
701 .flags()
702 .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
703 .unwrap_or(old_ignore_whitespace);
704 self.parser().stack_group.borrow_mut().push(
705 GroupState::Group {
706 concat: concat,
707 group: group,
708 ignore_whitespace: old_ignore_whitespace,
709 },
710 );
711 self.parser().ignore_whitespace.set(new_ignore_whitespace);
712 Ok(ast::Concat { span: self.span(), asts: vec![] })
713 }
714 }
715 }
716
717 /// Pop a group AST from the parser's internal stack and set the group's
718 /// AST to the given concatenation. Return the concatenation containing
719 /// the group.
720 ///
721 /// This assumes that the parser is currently positioned on the closing
722 /// parenthesis and advances the parser to the character following the `)`.
723 ///
724 /// If no such group could be popped, then an unopened group error is
725 /// returned.
726 #[inline(never)]
pop_group(&self, mut group_concat: ast::Concat) -> Result<ast::Concat>727 fn pop_group(&self, mut group_concat: ast::Concat) -> Result<ast::Concat> {
728 use self::GroupState::*;
729
730 assert_eq!(self.char(), ')');
731 let mut stack = self.parser().stack_group.borrow_mut();
732 let (mut prior_concat, mut group, ignore_whitespace, alt) = match stack
733 .pop()
734 {
735 Some(Group { concat, group, ignore_whitespace }) => {
736 (concat, group, ignore_whitespace, None)
737 }
738 Some(Alternation(alt)) => match stack.pop() {
739 Some(Group { concat, group, ignore_whitespace }) => {
740 (concat, group, ignore_whitespace, Some(alt))
741 }
742 None | Some(Alternation(_)) => {
743 return Err(self.error(
744 self.span_char(),
745 ast::ErrorKind::GroupUnopened,
746 ));
747 }
748 },
749 None => {
750 return Err(self
751 .error(self.span_char(), ast::ErrorKind::GroupUnopened));
752 }
753 };
754 self.parser().ignore_whitespace.set(ignore_whitespace);
755 group_concat.span.end = self.pos();
756 self.bump();
757 group.span.end = self.pos();
758 match alt {
759 Some(mut alt) => {
760 alt.span.end = group_concat.span.end;
761 alt.asts.push(group_concat.into_ast());
762 group.ast = Box::new(alt.into_ast());
763 }
764 None => {
765 group.ast = Box::new(group_concat.into_ast());
766 }
767 }
768 prior_concat.asts.push(Ast::Group(group));
769 Ok(prior_concat)
770 }
771
772 /// Pop the last state from the parser's internal stack, if it exists, and
773 /// add the given concatenation to it. There either must be no state or a
774 /// single alternation item on the stack. Any other scenario produces an
775 /// error.
776 ///
777 /// This assumes that the parser has advanced to the end.
778 #[inline(never)]
pop_group_end(&self, mut concat: ast::Concat) -> Result<Ast>779 fn pop_group_end(&self, mut concat: ast::Concat) -> Result<Ast> {
780 concat.span.end = self.pos();
781 let mut stack = self.parser().stack_group.borrow_mut();
782 let ast = match stack.pop() {
783 None => Ok(concat.into_ast()),
784 Some(GroupState::Alternation(mut alt)) => {
785 alt.span.end = self.pos();
786 alt.asts.push(concat.into_ast());
787 Ok(Ast::Alternation(alt))
788 }
789 Some(GroupState::Group { group, .. }) => {
790 return Err(
791 self.error(group.span, ast::ErrorKind::GroupUnclosed)
792 );
793 }
794 };
795 // If we try to pop again, there should be nothing.
796 match stack.pop() {
797 None => ast,
798 Some(GroupState::Alternation(_)) => {
799 // This unreachable is unfortunate. This case can't happen
800 // because the only way we can be here is if there were two
801 // `GroupState::Alternation`s adjacent in the parser's stack,
802 // which we guarantee to never happen because we never push a
803 // `GroupState::Alternation` if one is already at the top of
804 // the stack.
805 unreachable!()
806 }
807 Some(GroupState::Group { group, .. }) => {
808 Err(self.error(group.span, ast::ErrorKind::GroupUnclosed))
809 }
810 }
811 }
812
813 /// Parse the opening of a character class and push the current class
814 /// parsing context onto the parser's stack. This assumes that the parser
815 /// is positioned at an opening `[`. The given union should correspond to
816 /// the union of set items built up before seeing the `[`.
817 ///
818 /// If there was a problem parsing the opening of the class, then an error
819 /// is returned. Otherwise, a new union of set items for the class is
820 /// returned (which may be populated with either a `]` or a `-`).
821 #[inline(never)]
push_class_open( &self, parent_union: ast::ClassSetUnion, ) -> Result<ast::ClassSetUnion>822 fn push_class_open(
823 &self,
824 parent_union: ast::ClassSetUnion,
825 ) -> Result<ast::ClassSetUnion> {
826 assert_eq!(self.char(), '[');
827
828 let (nested_set, nested_union) = self.parse_set_class_open()?;
829 self.parser()
830 .stack_class
831 .borrow_mut()
832 .push(ClassState::Open { union: parent_union, set: nested_set });
833 Ok(nested_union)
834 }
835
836 /// Parse the end of a character class set and pop the character class
837 /// parser stack. The union given corresponds to the last union built
838 /// before seeing the closing `]`. The union returned corresponds to the
839 /// parent character class set with the nested class added to it.
840 ///
841 /// This assumes that the parser is positioned at a `]` and will advance
842 /// the parser to the byte immediately following the `]`.
843 ///
844 /// If the stack is empty after popping, then this returns the final
845 /// "top-level" character class AST (where a "top-level" character class
846 /// is one that is not nested inside any other character class).
847 ///
848 /// If there is no corresponding opening bracket on the parser's stack,
849 /// then an error is returned.
850 #[inline(never)]
pop_class( &self, nested_union: ast::ClassSetUnion, ) -> Result<Either<ast::ClassSetUnion, ast::Class>>851 fn pop_class(
852 &self,
853 nested_union: ast::ClassSetUnion,
854 ) -> Result<Either<ast::ClassSetUnion, ast::Class>> {
855 assert_eq!(self.char(), ']');
856
857 let item = ast::ClassSet::Item(nested_union.into_item());
858 let prevset = self.pop_class_op(item);
859 let mut stack = self.parser().stack_class.borrow_mut();
860 match stack.pop() {
861 None => {
862 // We can never observe an empty stack:
863 //
864 // 1) We are guaranteed to start with a non-empty stack since
865 // the character class parser is only initiated when it sees
866 // a `[`.
867 // 2) If we ever observe an empty stack while popping after
868 // seeing a `]`, then we signal the character class parser
869 // to terminate.
870 panic!("unexpected empty character class stack")
871 }
872 Some(ClassState::Op { .. }) => {
873 // This panic is unfortunate, but this case is impossible
874 // since we already popped the Op state if one exists above.
875 // Namely, every push to the class parser stack is guarded by
876 // whether an existing Op is already on the top of the stack.
877 // If it is, the existing Op is modified. That is, the stack
878 // can never have consecutive Op states.
879 panic!("unexpected ClassState::Op")
880 }
881 Some(ClassState::Open { mut union, mut set }) => {
882 self.bump();
883 set.span.end = self.pos();
884 set.kind = prevset;
885 if stack.is_empty() {
886 Ok(Either::Right(ast::Class::Bracketed(set)))
887 } else {
888 union.push(ast::ClassSetItem::Bracketed(Box::new(set)));
889 Ok(Either::Left(union))
890 }
891 }
892 }
893 }
894
895 /// Return an "unclosed class" error whose span points to the most
896 /// recently opened class.
897 ///
898 /// This should only be called while parsing a character class.
899 #[inline(never)]
unclosed_class_error(&self) -> ast::Error900 fn unclosed_class_error(&self) -> ast::Error {
901 for state in self.parser().stack_class.borrow().iter().rev() {
902 match *state {
903 ClassState::Open { ref set, .. } => {
904 return self
905 .error(set.span, ast::ErrorKind::ClassUnclosed);
906 }
907 _ => {}
908 }
909 }
910 // We are guaranteed to have a non-empty stack with at least
911 // one open bracket, so we should never get here.
912 panic!("no open character class found")
913 }
914
915 /// Push the current set of class items on to the class parser's stack as
916 /// the left hand side of the given operator.
917 ///
918 /// A fresh set union is returned, which should be used to build the right
919 /// hand side of this operator.
920 #[inline(never)]
push_class_op( &self, next_kind: ast::ClassSetBinaryOpKind, next_union: ast::ClassSetUnion, ) -> ast::ClassSetUnion921 fn push_class_op(
922 &self,
923 next_kind: ast::ClassSetBinaryOpKind,
924 next_union: ast::ClassSetUnion,
925 ) -> ast::ClassSetUnion {
926 let item = ast::ClassSet::Item(next_union.into_item());
927 let new_lhs = self.pop_class_op(item);
928 self.parser()
929 .stack_class
930 .borrow_mut()
931 .push(ClassState::Op { kind: next_kind, lhs: new_lhs });
932 ast::ClassSetUnion { span: self.span(), items: vec![] }
933 }
934
935 /// Pop a character class set from the character class parser stack. If the
936 /// top of the stack is just an item (not an operation), then return the
937 /// given set unchanged. If the top of the stack is an operation, then the
938 /// given set will be used as the rhs of the operation on the top of the
939 /// stack. In that case, the binary operation is returned as a set.
940 #[inline(never)]
pop_class_op(&self, rhs: ast::ClassSet) -> ast::ClassSet941 fn pop_class_op(&self, rhs: ast::ClassSet) -> ast::ClassSet {
942 let mut stack = self.parser().stack_class.borrow_mut();
943 let (kind, lhs) = match stack.pop() {
944 Some(ClassState::Op { kind, lhs }) => (kind, lhs),
945 Some(state @ ClassState::Open { .. }) => {
946 stack.push(state);
947 return rhs;
948 }
949 None => unreachable!(),
950 };
951 let span = Span::new(lhs.span().start, rhs.span().end);
952 ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp {
953 span: span,
954 kind: kind,
955 lhs: Box::new(lhs),
956 rhs: Box::new(rhs),
957 })
958 }
959 }
960
961 impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
962 /// Parse the regular expression into an abstract syntax tree.
parse(&self) -> Result<Ast>963 fn parse(&self) -> Result<Ast> {
964 self.parse_with_comments().map(|astc| astc.ast)
965 }
966
967 /// Parse the regular expression and return an abstract syntax tree with
968 /// all of the comments found in the pattern.
parse_with_comments(&self) -> Result<ast::WithComments>969 fn parse_with_comments(&self) -> Result<ast::WithComments> {
970 assert_eq!(self.offset(), 0, "parser can only be used once");
971 self.parser().reset();
972 let mut concat = ast::Concat { span: self.span(), asts: vec![] };
973 loop {
974 self.bump_space();
975 if self.is_eof() {
976 break;
977 }
978 match self.char() {
979 '(' => concat = self.push_group(concat)?,
980 ')' => concat = self.pop_group(concat)?,
981 '|' => concat = self.push_alternate(concat)?,
982 '[' => {
983 let class = self.parse_set_class()?;
984 concat.asts.push(Ast::Class(class));
985 }
986 '?' => {
987 concat = self.parse_uncounted_repetition(
988 concat,
989 ast::RepetitionKind::ZeroOrOne,
990 )?;
991 }
992 '*' => {
993 concat = self.parse_uncounted_repetition(
994 concat,
995 ast::RepetitionKind::ZeroOrMore,
996 )?;
997 }
998 '+' => {
999 concat = self.parse_uncounted_repetition(
1000 concat,
1001 ast::RepetitionKind::OneOrMore,
1002 )?;
1003 }
1004 '{' => {
1005 concat = self.parse_counted_repetition(concat)?;
1006 }
1007 _ => concat.asts.push(self.parse_primitive()?.into_ast()),
1008 }
1009 }
1010 let ast = self.pop_group_end(concat)?;
1011 NestLimiter::new(self).check(&ast)?;
1012 Ok(ast::WithComments {
1013 ast: ast,
1014 comments: mem::replace(
1015 &mut *self.parser().comments.borrow_mut(),
1016 vec![],
1017 ),
1018 })
1019 }
1020
1021 /// Parses an uncounted repetition operation. An uncounted repetition
1022 /// operator includes ?, * and +, but does not include the {m,n} syntax.
1023 /// The given `kind` should correspond to the operator observed by the
1024 /// caller.
1025 ///
1026 /// This assumes that the paser is currently positioned at the repetition
1027 /// operator and advances the parser to the first character after the
1028 /// operator. (Note that the operator may include a single additional `?`,
1029 /// which makes the operator ungreedy.)
1030 ///
1031 /// The caller should include the concatenation that is being built. The
1032 /// concatenation returned includes the repetition operator applied to the
1033 /// last expression in the given concatenation.
1034 #[inline(never)]
parse_uncounted_repetition( &self, mut concat: ast::Concat, kind: ast::RepetitionKind, ) -> Result<ast::Concat>1035 fn parse_uncounted_repetition(
1036 &self,
1037 mut concat: ast::Concat,
1038 kind: ast::RepetitionKind,
1039 ) -> Result<ast::Concat> {
1040 assert!(
1041 self.char() == '?' || self.char() == '*' || self.char() == '+'
1042 );
1043 let op_start = self.pos();
1044 let ast = match concat.asts.pop() {
1045 Some(ast) => ast,
1046 None => {
1047 return Err(
1048 self.error(self.span(), ast::ErrorKind::RepetitionMissing)
1049 )
1050 }
1051 };
1052 match ast {
1053 Ast::Empty(_) | Ast::Flags(_) => {
1054 return Err(
1055 self.error(self.span(), ast::ErrorKind::RepetitionMissing)
1056 )
1057 }
1058 _ => {}
1059 }
1060 let mut greedy = true;
1061 if self.bump() && self.char() == '?' {
1062 greedy = false;
1063 self.bump();
1064 }
1065 concat.asts.push(Ast::Repetition(ast::Repetition {
1066 span: ast.span().with_end(self.pos()),
1067 op: ast::RepetitionOp {
1068 span: Span::new(op_start, self.pos()),
1069 kind: kind,
1070 },
1071 greedy: greedy,
1072 ast: Box::new(ast),
1073 }));
1074 Ok(concat)
1075 }
1076
1077 /// Parses a counted repetition operation. A counted repetition operator
1078 /// corresponds to the {m,n} syntax, and does not include the ?, * or +
1079 /// operators.
1080 ///
1081 /// This assumes that the paser is currently positioned at the opening `{`
1082 /// and advances the parser to the first character after the operator.
1083 /// (Note that the operator may include a single additional `?`, which
1084 /// makes the operator ungreedy.)
1085 ///
1086 /// The caller should include the concatenation that is being built. The
1087 /// concatenation returned includes the repetition operator applied to the
1088 /// last expression in the given concatenation.
1089 #[inline(never)]
parse_counted_repetition( &self, mut concat: ast::Concat, ) -> Result<ast::Concat>1090 fn parse_counted_repetition(
1091 &self,
1092 mut concat: ast::Concat,
1093 ) -> Result<ast::Concat> {
1094 assert!(self.char() == '{');
1095 let start = self.pos();
1096 let ast = match concat.asts.pop() {
1097 Some(ast) => ast,
1098 None => {
1099 return Err(
1100 self.error(self.span(), ast::ErrorKind::RepetitionMissing)
1101 )
1102 }
1103 };
1104 match ast {
1105 Ast::Empty(_) | Ast::Flags(_) => {
1106 return Err(
1107 self.error(self.span(), ast::ErrorKind::RepetitionMissing)
1108 )
1109 }
1110 _ => {}
1111 }
1112 if !self.bump_and_bump_space() {
1113 return Err(self.error(
1114 Span::new(start, self.pos()),
1115 ast::ErrorKind::RepetitionCountUnclosed,
1116 ));
1117 }
1118 let count_start = specialize_err(
1119 self.parse_decimal(),
1120 ast::ErrorKind::DecimalEmpty,
1121 ast::ErrorKind::RepetitionCountDecimalEmpty,
1122 )?;
1123 let mut range = ast::RepetitionRange::Exactly(count_start);
1124 if self.is_eof() {
1125 return Err(self.error(
1126 Span::new(start, self.pos()),
1127 ast::ErrorKind::RepetitionCountUnclosed,
1128 ));
1129 }
1130 if self.char() == ',' {
1131 if !self.bump_and_bump_space() {
1132 return Err(self.error(
1133 Span::new(start, self.pos()),
1134 ast::ErrorKind::RepetitionCountUnclosed,
1135 ));
1136 }
1137 if self.char() != '}' {
1138 let count_end = specialize_err(
1139 self.parse_decimal(),
1140 ast::ErrorKind::DecimalEmpty,
1141 ast::ErrorKind::RepetitionCountDecimalEmpty,
1142 )?;
1143 range = ast::RepetitionRange::Bounded(count_start, count_end);
1144 } else {
1145 range = ast::RepetitionRange::AtLeast(count_start);
1146 }
1147 }
1148 if self.is_eof() || self.char() != '}' {
1149 return Err(self.error(
1150 Span::new(start, self.pos()),
1151 ast::ErrorKind::RepetitionCountUnclosed,
1152 ));
1153 }
1154
1155 let mut greedy = true;
1156 if self.bump_and_bump_space() && self.char() == '?' {
1157 greedy = false;
1158 self.bump();
1159 }
1160
1161 let op_span = Span::new(start, self.pos());
1162 if !range.is_valid() {
1163 return Err(
1164 self.error(op_span, ast::ErrorKind::RepetitionCountInvalid)
1165 );
1166 }
1167 concat.asts.push(Ast::Repetition(ast::Repetition {
1168 span: ast.span().with_end(self.pos()),
1169 op: ast::RepetitionOp {
1170 span: op_span,
1171 kind: ast::RepetitionKind::Range(range),
1172 },
1173 greedy: greedy,
1174 ast: Box::new(ast),
1175 }));
1176 Ok(concat)
1177 }
1178
1179 /// Parse a group (which contains a sub-expression) or a set of flags.
1180 ///
1181 /// If a group was found, then it is returned with an empty AST. If a set
1182 /// of flags is found, then that set is returned.
1183 ///
1184 /// The parser should be positioned at the opening parenthesis.
1185 ///
1186 /// This advances the parser to the character before the start of the
1187 /// sub-expression (in the case of a group) or to the closing parenthesis
1188 /// immediately following the set of flags.
1189 ///
1190 /// # Errors
1191 ///
1192 /// If flags are given and incorrectly specified, then a corresponding
1193 /// error is returned.
1194 ///
1195 /// If a capture name is given and it is incorrectly specified, then a
1196 /// corresponding error is returned.
1197 #[inline(never)]
parse_group(&self) -> Result<Either<ast::SetFlags, ast::Group>>1198 fn parse_group(&self) -> Result<Either<ast::SetFlags, ast::Group>> {
1199 assert_eq!(self.char(), '(');
1200 let open_span = self.span_char();
1201 self.bump();
1202 self.bump_space();
1203 if self.is_lookaround_prefix() {
1204 return Err(self.error(
1205 Span::new(open_span.start, self.span().end),
1206 ast::ErrorKind::UnsupportedLookAround,
1207 ));
1208 }
1209 let inner_span = self.span();
1210 if self.bump_if("?P<") {
1211 let capture_index = self.next_capture_index(open_span)?;
1212 let cap = self.parse_capture_name(capture_index)?;
1213 Ok(Either::Right(ast::Group {
1214 span: open_span,
1215 kind: ast::GroupKind::CaptureName(cap),
1216 ast: Box::new(Ast::Empty(self.span())),
1217 }))
1218 } else if self.bump_if("?") {
1219 if self.is_eof() {
1220 return Err(
1221 self.error(open_span, ast::ErrorKind::GroupUnclosed)
1222 );
1223 }
1224 let flags = self.parse_flags()?;
1225 let char_end = self.char();
1226 self.bump();
1227 if char_end == ')' {
1228 // We don't allow empty flags, e.g., `(?)`. We instead
1229 // interpret it as a repetition operator missing its argument.
1230 if flags.items.is_empty() {
1231 return Err(self.error(
1232 inner_span,
1233 ast::ErrorKind::RepetitionMissing,
1234 ));
1235 }
1236 Ok(Either::Left(ast::SetFlags {
1237 span: Span { end: self.pos(), ..open_span },
1238 flags: flags,
1239 }))
1240 } else {
1241 assert_eq!(char_end, ':');
1242 Ok(Either::Right(ast::Group {
1243 span: open_span,
1244 kind: ast::GroupKind::NonCapturing(flags),
1245 ast: Box::new(Ast::Empty(self.span())),
1246 }))
1247 }
1248 } else {
1249 let capture_index = self.next_capture_index(open_span)?;
1250 Ok(Either::Right(ast::Group {
1251 span: open_span,
1252 kind: ast::GroupKind::CaptureIndex(capture_index),
1253 ast: Box::new(Ast::Empty(self.span())),
1254 }))
1255 }
1256 }
1257
1258 /// Parses a capture group name. Assumes that the parser is positioned at
1259 /// the first character in the name following the opening `<` (and may
1260 /// possibly be EOF). This advances the parser to the first character
1261 /// following the closing `>`.
1262 ///
1263 /// The caller must provide the capture index of the group for this name.
1264 #[inline(never)]
parse_capture_name( &self, capture_index: u32, ) -> Result<ast::CaptureName>1265 fn parse_capture_name(
1266 &self,
1267 capture_index: u32,
1268 ) -> Result<ast::CaptureName> {
1269 if self.is_eof() {
1270 return Err(self
1271 .error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
1272 }
1273 let start = self.pos();
1274 loop {
1275 if self.char() == '>' {
1276 break;
1277 }
1278 if !is_capture_char(self.char(), self.pos() == start) {
1279 return Err(self.error(
1280 self.span_char(),
1281 ast::ErrorKind::GroupNameInvalid,
1282 ));
1283 }
1284 if !self.bump() {
1285 break;
1286 }
1287 }
1288 let end = self.pos();
1289 if self.is_eof() {
1290 return Err(self
1291 .error(self.span(), ast::ErrorKind::GroupNameUnexpectedEof));
1292 }
1293 assert_eq!(self.char(), '>');
1294 self.bump();
1295 let name = &self.pattern()[start.offset..end.offset];
1296 if name.is_empty() {
1297 return Err(self.error(
1298 Span::new(start, start),
1299 ast::ErrorKind::GroupNameEmpty,
1300 ));
1301 }
1302 let capname = ast::CaptureName {
1303 span: Span::new(start, end),
1304 name: name.to_string(),
1305 index: capture_index,
1306 };
1307 self.add_capture_name(&capname)?;
1308 Ok(capname)
1309 }
1310
1311 /// Parse a sequence of flags starting at the current character.
1312 ///
1313 /// This advances the parser to the character immediately following the
1314 /// flags, which is guaranteed to be either `:` or `)`.
1315 ///
1316 /// # Errors
1317 ///
1318 /// If any flags are duplicated, then an error is returned.
1319 ///
1320 /// If the negation operator is used more than once, then an error is
1321 /// returned.
1322 ///
1323 /// If no flags could be found or if the negation operation is not followed
1324 /// by any flags, then an error is returned.
1325 #[inline(never)]
parse_flags(&self) -> Result<ast::Flags>1326 fn parse_flags(&self) -> Result<ast::Flags> {
1327 let mut flags = ast::Flags { span: self.span(), items: vec![] };
1328 let mut last_was_negation = None;
1329 while self.char() != ':' && self.char() != ')' {
1330 if self.char() == '-' {
1331 last_was_negation = Some(self.span_char());
1332 let item = ast::FlagsItem {
1333 span: self.span_char(),
1334 kind: ast::FlagsItemKind::Negation,
1335 };
1336 if let Some(i) = flags.add_item(item) {
1337 return Err(self.error(
1338 self.span_char(),
1339 ast::ErrorKind::FlagRepeatedNegation {
1340 original: flags.items[i].span,
1341 },
1342 ));
1343 }
1344 } else {
1345 last_was_negation = None;
1346 let item = ast::FlagsItem {
1347 span: self.span_char(),
1348 kind: ast::FlagsItemKind::Flag(self.parse_flag()?),
1349 };
1350 if let Some(i) = flags.add_item(item) {
1351 return Err(self.error(
1352 self.span_char(),
1353 ast::ErrorKind::FlagDuplicate {
1354 original: flags.items[i].span,
1355 },
1356 ));
1357 }
1358 }
1359 if !self.bump() {
1360 return Err(
1361 self.error(self.span(), ast::ErrorKind::FlagUnexpectedEof)
1362 );
1363 }
1364 }
1365 if let Some(span) = last_was_negation {
1366 return Err(self.error(span, ast::ErrorKind::FlagDanglingNegation));
1367 }
1368 flags.span.end = self.pos();
1369 Ok(flags)
1370 }
1371
1372 /// Parse the current character as a flag. Do not advance the parser.
1373 ///
1374 /// # Errors
1375 ///
1376 /// If the flag is not recognized, then an error is returned.
1377 #[inline(never)]
parse_flag(&self) -> Result<ast::Flag>1378 fn parse_flag(&self) -> Result<ast::Flag> {
1379 match self.char() {
1380 'i' => Ok(ast::Flag::CaseInsensitive),
1381 'm' => Ok(ast::Flag::MultiLine),
1382 's' => Ok(ast::Flag::DotMatchesNewLine),
1383 'U' => Ok(ast::Flag::SwapGreed),
1384 'u' => Ok(ast::Flag::Unicode),
1385 'x' => Ok(ast::Flag::IgnoreWhitespace),
1386 _ => {
1387 Err(self
1388 .error(self.span_char(), ast::ErrorKind::FlagUnrecognized))
1389 }
1390 }
1391 }
1392
1393 /// Parse a primitive AST. e.g., A literal, non-set character class or
1394 /// assertion.
1395 ///
1396 /// This assumes that the parser expects a primitive at the current
1397 /// location. i.e., All other non-primitive cases have been handled.
1398 /// For example, if the parser's position is at `|`, then `|` will be
1399 /// treated as a literal (e.g., inside a character class).
1400 ///
1401 /// This advances the parser to the first character immediately following
1402 /// the primitive.
parse_primitive(&self) -> Result<Primitive>1403 fn parse_primitive(&self) -> Result<Primitive> {
1404 match self.char() {
1405 '\\' => self.parse_escape(),
1406 '.' => {
1407 let ast = Primitive::Dot(self.span_char());
1408 self.bump();
1409 Ok(ast)
1410 }
1411 '^' => {
1412 let ast = Primitive::Assertion(ast::Assertion {
1413 span: self.span_char(),
1414 kind: ast::AssertionKind::StartLine,
1415 });
1416 self.bump();
1417 Ok(ast)
1418 }
1419 '$' => {
1420 let ast = Primitive::Assertion(ast::Assertion {
1421 span: self.span_char(),
1422 kind: ast::AssertionKind::EndLine,
1423 });
1424 self.bump();
1425 Ok(ast)
1426 }
1427 c => {
1428 let ast = Primitive::Literal(ast::Literal {
1429 span: self.span_char(),
1430 kind: ast::LiteralKind::Verbatim,
1431 c: c,
1432 });
1433 self.bump();
1434 Ok(ast)
1435 }
1436 }
1437 }
1438
1439 /// Parse an escape sequence as a primitive AST.
1440 ///
1441 /// This assumes the parser is positioned at the start of the escape
1442 /// sequence, i.e., `\`. It advances the parser to the first position
1443 /// immediately following the escape sequence.
1444 #[inline(never)]
parse_escape(&self) -> Result<Primitive>1445 fn parse_escape(&self) -> Result<Primitive> {
1446 assert_eq!(self.char(), '\\');
1447 let start = self.pos();
1448 if !self.bump() {
1449 return Err(self.error(
1450 Span::new(start, self.pos()),
1451 ast::ErrorKind::EscapeUnexpectedEof,
1452 ));
1453 }
1454 let c = self.char();
1455 // Put some of the more complicated routines into helpers.
1456 match c {
1457 '0'..='7' => {
1458 if !self.parser().octal {
1459 return Err(self.error(
1460 Span::new(start, self.span_char().end),
1461 ast::ErrorKind::UnsupportedBackreference,
1462 ));
1463 }
1464 let mut lit = self.parse_octal();
1465 lit.span.start = start;
1466 return Ok(Primitive::Literal(lit));
1467 }
1468 '8'..='9' if !self.parser().octal => {
1469 return Err(self.error(
1470 Span::new(start, self.span_char().end),
1471 ast::ErrorKind::UnsupportedBackreference,
1472 ));
1473 }
1474 'x' | 'u' | 'U' => {
1475 let mut lit = self.parse_hex()?;
1476 lit.span.start = start;
1477 return Ok(Primitive::Literal(lit));
1478 }
1479 'p' | 'P' => {
1480 let mut cls = self.parse_unicode_class()?;
1481 cls.span.start = start;
1482 return Ok(Primitive::Unicode(cls));
1483 }
1484 'd' | 's' | 'w' | 'D' | 'S' | 'W' => {
1485 let mut cls = self.parse_perl_class();
1486 cls.span.start = start;
1487 return Ok(Primitive::Perl(cls));
1488 }
1489 _ => {}
1490 }
1491
1492 // Handle all of the one letter sequences inline.
1493 self.bump();
1494 let span = Span::new(start, self.pos());
1495 if is_meta_character(c) {
1496 return Ok(Primitive::Literal(ast::Literal {
1497 span: span,
1498 kind: ast::LiteralKind::Punctuation,
1499 c: c,
1500 }));
1501 }
1502 let special = |kind, c| {
1503 Ok(Primitive::Literal(ast::Literal {
1504 span: span,
1505 kind: ast::LiteralKind::Special(kind),
1506 c: c,
1507 }))
1508 };
1509 match c {
1510 'a' => special(ast::SpecialLiteralKind::Bell, '\x07'),
1511 'f' => special(ast::SpecialLiteralKind::FormFeed, '\x0C'),
1512 't' => special(ast::SpecialLiteralKind::Tab, '\t'),
1513 'n' => special(ast::SpecialLiteralKind::LineFeed, '\n'),
1514 'r' => special(ast::SpecialLiteralKind::CarriageReturn, '\r'),
1515 'v' => special(ast::SpecialLiteralKind::VerticalTab, '\x0B'),
1516 ' ' if self.ignore_whitespace() => {
1517 special(ast::SpecialLiteralKind::Space, ' ')
1518 }
1519 'A' => Ok(Primitive::Assertion(ast::Assertion {
1520 span: span,
1521 kind: ast::AssertionKind::StartText,
1522 })),
1523 'z' => Ok(Primitive::Assertion(ast::Assertion {
1524 span: span,
1525 kind: ast::AssertionKind::EndText,
1526 })),
1527 'b' => Ok(Primitive::Assertion(ast::Assertion {
1528 span: span,
1529 kind: ast::AssertionKind::WordBoundary,
1530 })),
1531 'B' => Ok(Primitive::Assertion(ast::Assertion {
1532 span: span,
1533 kind: ast::AssertionKind::NotWordBoundary,
1534 })),
1535 _ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)),
1536 }
1537 }
1538
1539 /// Parse an octal representation of a Unicode codepoint up to 3 digits
1540 /// long. This expects the parser to be positioned at the first octal
1541 /// digit and advances the parser to the first character immediately
1542 /// following the octal number. This also assumes that parsing octal
1543 /// escapes is enabled.
1544 ///
1545 /// Assuming the preconditions are met, this routine can never fail.
1546 #[inline(never)]
parse_octal(&self) -> ast::Literal1547 fn parse_octal(&self) -> ast::Literal {
1548 use std::char;
1549 use std::u32;
1550
1551 assert!(self.parser().octal);
1552 assert!('0' <= self.char() && self.char() <= '7');
1553 let start = self.pos();
1554 // Parse up to two more digits.
1555 while self.bump()
1556 && '0' <= self.char()
1557 && self.char() <= '7'
1558 && self.pos().offset - start.offset <= 2
1559 {}
1560 let end = self.pos();
1561 let octal = &self.pattern()[start.offset..end.offset];
1562 // Parsing the octal should never fail since the above guarantees a
1563 // valid number.
1564 let codepoint =
1565 u32::from_str_radix(octal, 8).expect("valid octal number");
1566 // The max value for 3 digit octal is 0777 = 511 and [0, 511] has no
1567 // invalid Unicode scalar values.
1568 let c = char::from_u32(codepoint).expect("Unicode scalar value");
1569 ast::Literal {
1570 span: Span::new(start, end),
1571 kind: ast::LiteralKind::Octal,
1572 c: c,
1573 }
1574 }
1575
1576 /// Parse a hex representation of a Unicode codepoint. This handles both
1577 /// hex notations, i.e., `\xFF` and `\x{FFFF}`. This expects the parser to
1578 /// be positioned at the `x`, `u` or `U` prefix. The parser is advanced to
1579 /// the first character immediately following the hexadecimal literal.
1580 #[inline(never)]
parse_hex(&self) -> Result<ast::Literal>1581 fn parse_hex(&self) -> Result<ast::Literal> {
1582 assert!(
1583 self.char() == 'x' || self.char() == 'u' || self.char() == 'U'
1584 );
1585
1586 let hex_kind = match self.char() {
1587 'x' => ast::HexLiteralKind::X,
1588 'u' => ast::HexLiteralKind::UnicodeShort,
1589 _ => ast::HexLiteralKind::UnicodeLong,
1590 };
1591 if !self.bump_and_bump_space() {
1592 return Err(
1593 self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof)
1594 );
1595 }
1596 if self.char() == '{' {
1597 self.parse_hex_brace(hex_kind)
1598 } else {
1599 self.parse_hex_digits(hex_kind)
1600 }
1601 }
1602
1603 /// Parse an N-digit hex representation of a Unicode codepoint. This
1604 /// expects the parser to be positioned at the first digit and will advance
1605 /// the parser to the first character immediately following the escape
1606 /// sequence.
1607 ///
1608 /// The number of digits given must be 2 (for `\xNN`), 4 (for `\uNNNN`)
1609 /// or 8 (for `\UNNNNNNNN`).
1610 #[inline(never)]
parse_hex_digits( &self, kind: ast::HexLiteralKind, ) -> Result<ast::Literal>1611 fn parse_hex_digits(
1612 &self,
1613 kind: ast::HexLiteralKind,
1614 ) -> Result<ast::Literal> {
1615 use std::char;
1616 use std::u32;
1617
1618 let mut scratch = self.parser().scratch.borrow_mut();
1619 scratch.clear();
1620
1621 let start = self.pos();
1622 for i in 0..kind.digits() {
1623 if i > 0 && !self.bump_and_bump_space() {
1624 return Err(self
1625 .error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
1626 }
1627 if !is_hex(self.char()) {
1628 return Err(self.error(
1629 self.span_char(),
1630 ast::ErrorKind::EscapeHexInvalidDigit,
1631 ));
1632 }
1633 scratch.push(self.char());
1634 }
1635 // The final bump just moves the parser past the literal, which may
1636 // be EOF.
1637 self.bump_and_bump_space();
1638 let end = self.pos();
1639 let hex = scratch.as_str();
1640 match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
1641 None => Err(self.error(
1642 Span::new(start, end),
1643 ast::ErrorKind::EscapeHexInvalid,
1644 )),
1645 Some(c) => Ok(ast::Literal {
1646 span: Span::new(start, end),
1647 kind: ast::LiteralKind::HexFixed(kind),
1648 c: c,
1649 }),
1650 }
1651 }
1652
1653 /// Parse a hex representation of any Unicode scalar value. This expects
1654 /// the parser to be positioned at the opening brace `{` and will advance
1655 /// the parser to the first character following the closing brace `}`.
1656 #[inline(never)]
parse_hex_brace( &self, kind: ast::HexLiteralKind, ) -> Result<ast::Literal>1657 fn parse_hex_brace(
1658 &self,
1659 kind: ast::HexLiteralKind,
1660 ) -> Result<ast::Literal> {
1661 use std::char;
1662 use std::u32;
1663
1664 let mut scratch = self.parser().scratch.borrow_mut();
1665 scratch.clear();
1666
1667 let brace_pos = self.pos();
1668 let start = self.span_char().end;
1669 while self.bump_and_bump_space() && self.char() != '}' {
1670 if !is_hex(self.char()) {
1671 return Err(self.error(
1672 self.span_char(),
1673 ast::ErrorKind::EscapeHexInvalidDigit,
1674 ));
1675 }
1676 scratch.push(self.char());
1677 }
1678 if self.is_eof() {
1679 return Err(self.error(
1680 Span::new(brace_pos, self.pos()),
1681 ast::ErrorKind::EscapeUnexpectedEof,
1682 ));
1683 }
1684 let end = self.pos();
1685 let hex = scratch.as_str();
1686 assert_eq!(self.char(), '}');
1687 self.bump_and_bump_space();
1688
1689 if hex.is_empty() {
1690 return Err(self.error(
1691 Span::new(brace_pos, self.pos()),
1692 ast::ErrorKind::EscapeHexEmpty,
1693 ));
1694 }
1695 match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
1696 None => Err(self.error(
1697 Span::new(start, end),
1698 ast::ErrorKind::EscapeHexInvalid,
1699 )),
1700 Some(c) => Ok(ast::Literal {
1701 span: Span::new(start, self.pos()),
1702 kind: ast::LiteralKind::HexBrace(kind),
1703 c: c,
1704 }),
1705 }
1706 }
1707
1708 /// Parse a decimal number into a u32 while trimming leading and trailing
1709 /// whitespace.
1710 ///
1711 /// This expects the parser to be positioned at the first position where
1712 /// a decimal digit could occur. This will advance the parser to the byte
1713 /// immediately following the last contiguous decimal digit.
1714 ///
1715 /// If no decimal digit could be found or if there was a problem parsing
1716 /// the complete set of digits into a u32, then an error is returned.
parse_decimal(&self) -> Result<u32>1717 fn parse_decimal(&self) -> Result<u32> {
1718 let mut scratch = self.parser().scratch.borrow_mut();
1719 scratch.clear();
1720
1721 while !self.is_eof() && self.char().is_whitespace() {
1722 self.bump();
1723 }
1724 let start = self.pos();
1725 while !self.is_eof() && '0' <= self.char() && self.char() <= '9' {
1726 scratch.push(self.char());
1727 self.bump_and_bump_space();
1728 }
1729 let span = Span::new(start, self.pos());
1730 while !self.is_eof() && self.char().is_whitespace() {
1731 self.bump_and_bump_space();
1732 }
1733 let digits = scratch.as_str();
1734 if digits.is_empty() {
1735 return Err(self.error(span, ast::ErrorKind::DecimalEmpty));
1736 }
1737 match u32::from_str_radix(digits, 10).ok() {
1738 Some(n) => Ok(n),
1739 None => Err(self.error(span, ast::ErrorKind::DecimalInvalid)),
1740 }
1741 }
1742
1743 /// Parse a standard character class consisting primarily of characters or
1744 /// character ranges, but can also contain nested character classes of
1745 /// any type (sans `.`).
1746 ///
1747 /// This assumes the parser is positioned at the opening `[`. If parsing
1748 /// is successful, then the parser is advanced to the position immediately
1749 /// following the closing `]`.
1750 #[inline(never)]
parse_set_class(&self) -> Result<ast::Class>1751 fn parse_set_class(&self) -> Result<ast::Class> {
1752 assert_eq!(self.char(), '[');
1753
1754 let mut union =
1755 ast::ClassSetUnion { span: self.span(), items: vec![] };
1756 loop {
1757 self.bump_space();
1758 if self.is_eof() {
1759 return Err(self.unclosed_class_error());
1760 }
1761 match self.char() {
1762 '[' => {
1763 // If we've already parsed the opening bracket, then
1764 // attempt to treat this as the beginning of an ASCII
1765 // class. If ASCII class parsing fails, then the parser
1766 // backs up to `[`.
1767 if !self.parser().stack_class.borrow().is_empty() {
1768 if let Some(cls) = self.maybe_parse_ascii_class() {
1769 union.push(ast::ClassSetItem::Ascii(cls));
1770 continue;
1771 }
1772 }
1773 union = self.push_class_open(union)?;
1774 }
1775 ']' => match self.pop_class(union)? {
1776 Either::Left(nested_union) => {
1777 union = nested_union;
1778 }
1779 Either::Right(class) => return Ok(class),
1780 },
1781 '&' if self.peek() == Some('&') => {
1782 assert!(self.bump_if("&&"));
1783 union = self.push_class_op(
1784 ast::ClassSetBinaryOpKind::Intersection,
1785 union,
1786 );
1787 }
1788 '-' if self.peek() == Some('-') => {
1789 assert!(self.bump_if("--"));
1790 union = self.push_class_op(
1791 ast::ClassSetBinaryOpKind::Difference,
1792 union,
1793 );
1794 }
1795 '~' if self.peek() == Some('~') => {
1796 assert!(self.bump_if("~~"));
1797 union = self.push_class_op(
1798 ast::ClassSetBinaryOpKind::SymmetricDifference,
1799 union,
1800 );
1801 }
1802 _ => {
1803 union.push(self.parse_set_class_range()?);
1804 }
1805 }
1806 }
1807 }
1808
1809 /// Parse a single primitive item in a character class set. The item to
1810 /// be parsed can either be one of a simple literal character, a range
1811 /// between two simple literal characters or a "primitive" character
1812 /// class like \w or \p{Greek}.
1813 ///
1814 /// If an invalid escape is found, or if a character class is found where
1815 /// a simple literal is expected (e.g., in a range), then an error is
1816 /// returned.
1817 #[inline(never)]
parse_set_class_range(&self) -> Result<ast::ClassSetItem>1818 fn parse_set_class_range(&self) -> Result<ast::ClassSetItem> {
1819 let prim1 = self.parse_set_class_item()?;
1820 self.bump_space();
1821 if self.is_eof() {
1822 return Err(self.unclosed_class_error());
1823 }
1824 // If the next char isn't a `-`, then we don't have a range.
1825 // There are two exceptions. If the char after a `-` is a `]`, then
1826 // `-` is interpreted as a literal `-`. Alternatively, if the char
1827 // after a `-` is a `-`, then `--` corresponds to a "difference"
1828 // operation.
1829 if self.char() != '-'
1830 || self.peek_space() == Some(']')
1831 || self.peek_space() == Some('-')
1832 {
1833 return prim1.into_class_set_item(self);
1834 }
1835 // OK, now we're parsing a range, so bump past the `-` and parse the
1836 // second half of the range.
1837 if !self.bump_and_bump_space() {
1838 return Err(self.unclosed_class_error());
1839 }
1840 let prim2 = self.parse_set_class_item()?;
1841 let range = ast::ClassSetRange {
1842 span: Span::new(prim1.span().start, prim2.span().end),
1843 start: prim1.into_class_literal(self)?,
1844 end: prim2.into_class_literal(self)?,
1845 };
1846 if !range.is_valid() {
1847 return Err(
1848 self.error(range.span, ast::ErrorKind::ClassRangeInvalid)
1849 );
1850 }
1851 Ok(ast::ClassSetItem::Range(range))
1852 }
1853
1854 /// Parse a single item in a character class as a primitive, where the
1855 /// primitive either consists of a verbatim literal or a single escape
1856 /// sequence.
1857 ///
1858 /// This assumes the parser is positioned at the beginning of a primitive,
1859 /// and advances the parser to the first position after the primitive if
1860 /// successful.
1861 ///
1862 /// Note that it is the caller's responsibility to report an error if an
1863 /// illegal primitive was parsed.
1864 #[inline(never)]
parse_set_class_item(&self) -> Result<Primitive>1865 fn parse_set_class_item(&self) -> Result<Primitive> {
1866 if self.char() == '\\' {
1867 self.parse_escape()
1868 } else {
1869 let x = Primitive::Literal(ast::Literal {
1870 span: self.span_char(),
1871 kind: ast::LiteralKind::Verbatim,
1872 c: self.char(),
1873 });
1874 self.bump();
1875 Ok(x)
1876 }
1877 }
1878
1879 /// Parses the opening of a character class set. This includes the opening
1880 /// bracket along with `^` if present to indicate negation. This also
1881 /// starts parsing the opening set of unioned items if applicable, since
1882 /// there are special rules applied to certain characters in the opening
1883 /// of a character class. For example, `[^]]` is the class of all
1884 /// characters not equal to `]`. (`]` would need to be escaped in any other
1885 /// position.) Similarly for `-`.
1886 ///
1887 /// In all cases, the op inside the returned `ast::ClassBracketed` is an
1888 /// empty union. This empty union should be replaced with the actual item
1889 /// when it is popped from the parser's stack.
1890 ///
1891 /// This assumes the parser is positioned at the opening `[` and advances
1892 /// the parser to the first non-special byte of the character class.
1893 ///
1894 /// An error is returned if EOF is found.
1895 #[inline(never)]
parse_set_class_open( &self, ) -> Result<(ast::ClassBracketed, ast::ClassSetUnion)>1896 fn parse_set_class_open(
1897 &self,
1898 ) -> Result<(ast::ClassBracketed, ast::ClassSetUnion)> {
1899 assert_eq!(self.char(), '[');
1900 let start = self.pos();
1901 if !self.bump_and_bump_space() {
1902 return Err(self.error(
1903 Span::new(start, self.pos()),
1904 ast::ErrorKind::ClassUnclosed,
1905 ));
1906 }
1907
1908 let negated = if self.char() != '^' {
1909 false
1910 } else {
1911 if !self.bump_and_bump_space() {
1912 return Err(self.error(
1913 Span::new(start, self.pos()),
1914 ast::ErrorKind::ClassUnclosed,
1915 ));
1916 }
1917 true
1918 };
1919 // Accept any number of `-` as literal `-`.
1920 let mut union =
1921 ast::ClassSetUnion { span: self.span(), items: vec![] };
1922 while self.char() == '-' {
1923 union.push(ast::ClassSetItem::Literal(ast::Literal {
1924 span: self.span_char(),
1925 kind: ast::LiteralKind::Verbatim,
1926 c: '-',
1927 }));
1928 if !self.bump_and_bump_space() {
1929 return Err(self.error(
1930 Span::new(start, self.pos()),
1931 ast::ErrorKind::ClassUnclosed,
1932 ));
1933 }
1934 }
1935 // If `]` is the *first* char in a set, then interpret it as a literal
1936 // `]`. That is, an empty class is impossible to write.
1937 if union.items.is_empty() && self.char() == ']' {
1938 union.push(ast::ClassSetItem::Literal(ast::Literal {
1939 span: self.span_char(),
1940 kind: ast::LiteralKind::Verbatim,
1941 c: ']',
1942 }));
1943 if !self.bump_and_bump_space() {
1944 return Err(self.error(
1945 Span::new(start, self.pos()),
1946 ast::ErrorKind::ClassUnclosed,
1947 ));
1948 }
1949 }
1950 let set = ast::ClassBracketed {
1951 span: Span::new(start, self.pos()),
1952 negated: negated,
1953 kind: ast::ClassSet::union(ast::ClassSetUnion {
1954 span: Span::new(union.span.start, union.span.start),
1955 items: vec![],
1956 }),
1957 };
1958 Ok((set, union))
1959 }
1960
1961 /// Attempt to parse an ASCII character class, e.g., `[:alnum:]`.
1962 ///
1963 /// This assumes the parser is positioned at the opening `[`.
1964 ///
1965 /// If no valid ASCII character class could be found, then this does not
1966 /// advance the parser and `None` is returned. Otherwise, the parser is
1967 /// advanced to the first byte following the closing `]` and the
1968 /// corresponding ASCII class is returned.
1969 #[inline(never)]
maybe_parse_ascii_class(&self) -> Option<ast::ClassAscii>1970 fn maybe_parse_ascii_class(&self) -> Option<ast::ClassAscii> {
1971 // ASCII character classes are interesting from a parsing perspective
1972 // because parsing cannot fail with any interesting error. For example,
1973 // in order to use an ASCII character class, it must be enclosed in
1974 // double brackets, e.g., `[[:alnum:]]`. Alternatively, you might think
1975 // of it as "ASCII character characters have the syntax `[:NAME:]`
1976 // which can only appear within character brackets." This means that
1977 // things like `[[:lower:]A]` are legal constructs.
1978 //
1979 // However, if one types an incorrect ASCII character class, e.g.,
1980 // `[[:loower:]]`, then we treat that as a normal nested character
1981 // class containing the characters `:elorw`. One might argue that we
1982 // should return an error instead since the repeated colons give away
1983 // the intent to write an ASCII class. But what if the user typed
1984 // `[[:lower]]` instead? How can we tell that was intended to be an
1985 // ASCII class and not just a normal nested class?
1986 //
1987 // Reasonable people can probably disagree over this, but for better
1988 // or worse, we implement semantics that never fails at the expense
1989 // of better failure modes.
1990 assert_eq!(self.char(), '[');
1991 // If parsing fails, then we back up the parser to this starting point.
1992 let start = self.pos();
1993 let mut negated = false;
1994 if !self.bump() || self.char() != ':' {
1995 self.parser().pos.set(start);
1996 return None;
1997 }
1998 if !self.bump() {
1999 self.parser().pos.set(start);
2000 return None;
2001 }
2002 if self.char() == '^' {
2003 negated = true;
2004 if !self.bump() {
2005 self.parser().pos.set(start);
2006 return None;
2007 }
2008 }
2009 let name_start = self.offset();
2010 while self.char() != ':' && self.bump() {}
2011 if self.is_eof() {
2012 self.parser().pos.set(start);
2013 return None;
2014 }
2015 let name = &self.pattern()[name_start..self.offset()];
2016 if !self.bump_if(":]") {
2017 self.parser().pos.set(start);
2018 return None;
2019 }
2020 let kind = match ast::ClassAsciiKind::from_name(name) {
2021 Some(kind) => kind,
2022 None => {
2023 self.parser().pos.set(start);
2024 return None;
2025 }
2026 };
2027 Some(ast::ClassAscii {
2028 span: Span::new(start, self.pos()),
2029 kind: kind,
2030 negated: negated,
2031 })
2032 }
2033
2034 /// Parse a Unicode class in either the single character notation, `\pN`
2035 /// or the multi-character bracketed notation, `\p{Greek}`. This assumes
2036 /// the parser is positioned at the `p` (or `P` for negation) and will
2037 /// advance the parser to the character immediately following the class.
2038 ///
2039 /// Note that this does not check whether the class name is valid or not.
2040 #[inline(never)]
parse_unicode_class(&self) -> Result<ast::ClassUnicode>2041 fn parse_unicode_class(&self) -> Result<ast::ClassUnicode> {
2042 assert!(self.char() == 'p' || self.char() == 'P');
2043
2044 let mut scratch = self.parser().scratch.borrow_mut();
2045 scratch.clear();
2046
2047 let negated = self.char() == 'P';
2048 if !self.bump_and_bump_space() {
2049 return Err(
2050 self.error(self.span(), ast::ErrorKind::EscapeUnexpectedEof)
2051 );
2052 }
2053 let (start, kind) = if self.char() == '{' {
2054 let start = self.span_char().end;
2055 while self.bump_and_bump_space() && self.char() != '}' {
2056 scratch.push(self.char());
2057 }
2058 if self.is_eof() {
2059 return Err(self
2060 .error(self.span(), ast::ErrorKind::EscapeUnexpectedEof));
2061 }
2062 assert_eq!(self.char(), '}');
2063 self.bump();
2064
2065 let name = scratch.as_str();
2066 if let Some(i) = name.find("!=") {
2067 (
2068 start,
2069 ast::ClassUnicodeKind::NamedValue {
2070 op: ast::ClassUnicodeOpKind::NotEqual,
2071 name: name[..i].to_string(),
2072 value: name[i + 2..].to_string(),
2073 },
2074 )
2075 } else if let Some(i) = name.find(':') {
2076 (
2077 start,
2078 ast::ClassUnicodeKind::NamedValue {
2079 op: ast::ClassUnicodeOpKind::Colon,
2080 name: name[..i].to_string(),
2081 value: name[i + 1..].to_string(),
2082 },
2083 )
2084 } else if let Some(i) = name.find('=') {
2085 (
2086 start,
2087 ast::ClassUnicodeKind::NamedValue {
2088 op: ast::ClassUnicodeOpKind::Equal,
2089 name: name[..i].to_string(),
2090 value: name[i + 1..].to_string(),
2091 },
2092 )
2093 } else {
2094 (start, ast::ClassUnicodeKind::Named(name.to_string()))
2095 }
2096 } else {
2097 let start = self.pos();
2098 let c = self.char();
2099 if c == '\\' {
2100 return Err(self.error(
2101 self.span_char(),
2102 ast::ErrorKind::UnicodeClassInvalid,
2103 ));
2104 }
2105 self.bump_and_bump_space();
2106 let kind = ast::ClassUnicodeKind::OneLetter(c);
2107 (start, kind)
2108 };
2109 Ok(ast::ClassUnicode {
2110 span: Span::new(start, self.pos()),
2111 negated: negated,
2112 kind: kind,
2113 })
2114 }
2115
2116 /// Parse a Perl character class, e.g., `\d` or `\W`. This assumes the
2117 /// parser is currently at a valid character class name and will be
2118 /// advanced to the character immediately following the class.
2119 #[inline(never)]
parse_perl_class(&self) -> ast::ClassPerl2120 fn parse_perl_class(&self) -> ast::ClassPerl {
2121 let c = self.char();
2122 let span = self.span_char();
2123 self.bump();
2124 let (negated, kind) = match c {
2125 'd' => (false, ast::ClassPerlKind::Digit),
2126 'D' => (true, ast::ClassPerlKind::Digit),
2127 's' => (false, ast::ClassPerlKind::Space),
2128 'S' => (true, ast::ClassPerlKind::Space),
2129 'w' => (false, ast::ClassPerlKind::Word),
2130 'W' => (true, ast::ClassPerlKind::Word),
2131 c => panic!("expected valid Perl class but got '{}'", c),
2132 };
2133 ast::ClassPerl { span: span, kind: kind, negated: negated }
2134 }
2135 }
2136
/// A type that traverses a fully parsed Ast and checks whether its depth
/// exceeds the specified nesting limit. If it does, then an error is returned.
///
/// The limit is not stored here; it is read from the parser's `nest_limit`
/// setting each time the depth is increased.
#[derive(Debug)]
struct NestLimiter<'p, 's, P> {
    /// The parser that is checking the nest limit. It supplies the limit
    /// and is used to build the error values.
    p: &'p ParserI<'s, P>,
    /// The current depth while walking an Ast. Starts at zero.
    depth: u32,
}
2146
2147 impl<'p, 's, P: Borrow<Parser>> NestLimiter<'p, 's, P> {
new(p: &'p ParserI<'s, P>) -> NestLimiter<'p, 's, P>2148 fn new(p: &'p ParserI<'s, P>) -> NestLimiter<'p, 's, P> {
2149 NestLimiter { p: p, depth: 0 }
2150 }
2151
2152 #[inline(never)]
check(self, ast: &Ast) -> Result<()>2153 fn check(self, ast: &Ast) -> Result<()> {
2154 ast::visit(ast, self)
2155 }
2156
increment_depth(&mut self, span: &Span) -> Result<()>2157 fn increment_depth(&mut self, span: &Span) -> Result<()> {
2158 let new = self.depth.checked_add(1).ok_or_else(|| {
2159 self.p.error(
2160 span.clone(),
2161 ast::ErrorKind::NestLimitExceeded(::std::u32::MAX),
2162 )
2163 })?;
2164 let limit = self.p.parser().nest_limit;
2165 if new > limit {
2166 return Err(self.p.error(
2167 span.clone(),
2168 ast::ErrorKind::NestLimitExceeded(limit),
2169 ));
2170 }
2171 self.depth = new;
2172 Ok(())
2173 }
2174
decrement_depth(&mut self)2175 fn decrement_depth(&mut self) {
2176 // Assuming the correctness of the visitor, this should never drop
2177 // below 0.
2178 self.depth = self.depth.checked_sub(1).unwrap();
2179 }
2180 }
2181
2182 impl<'p, 's, P: Borrow<Parser>> ast::Visitor for NestLimiter<'p, 's, P> {
2183 type Output = ();
2184 type Err = ast::Error;
2185
finish(self) -> Result<()>2186 fn finish(self) -> Result<()> {
2187 Ok(())
2188 }
2189
visit_pre(&mut self, ast: &Ast) -> Result<()>2190 fn visit_pre(&mut self, ast: &Ast) -> Result<()> {
2191 let span = match *ast {
2192 Ast::Empty(_)
2193 | Ast::Flags(_)
2194 | Ast::Literal(_)
2195 | Ast::Dot(_)
2196 | Ast::Assertion(_)
2197 | Ast::Class(ast::Class::Unicode(_))
2198 | Ast::Class(ast::Class::Perl(_)) => {
2199 // These are all base cases, so we don't increment depth.
2200 return Ok(());
2201 }
2202 Ast::Class(ast::Class::Bracketed(ref x)) => &x.span,
2203 Ast::Repetition(ref x) => &x.span,
2204 Ast::Group(ref x) => &x.span,
2205 Ast::Alternation(ref x) => &x.span,
2206 Ast::Concat(ref x) => &x.span,
2207 };
2208 self.increment_depth(span)
2209 }
2210
visit_post(&mut self, ast: &Ast) -> Result<()>2211 fn visit_post(&mut self, ast: &Ast) -> Result<()> {
2212 match *ast {
2213 Ast::Empty(_)
2214 | Ast::Flags(_)
2215 | Ast::Literal(_)
2216 | Ast::Dot(_)
2217 | Ast::Assertion(_)
2218 | Ast::Class(ast::Class::Unicode(_))
2219 | Ast::Class(ast::Class::Perl(_)) => {
2220 // These are all base cases, so we don't decrement depth.
2221 Ok(())
2222 }
2223 Ast::Class(ast::Class::Bracketed(_))
2224 | Ast::Repetition(_)
2225 | Ast::Group(_)
2226 | Ast::Alternation(_)
2227 | Ast::Concat(_) => {
2228 self.decrement_depth();
2229 Ok(())
2230 }
2231 }
2232 }
2233
visit_class_set_item_pre( &mut self, ast: &ast::ClassSetItem, ) -> Result<()>2234 fn visit_class_set_item_pre(
2235 &mut self,
2236 ast: &ast::ClassSetItem,
2237 ) -> Result<()> {
2238 let span = match *ast {
2239 ast::ClassSetItem::Empty(_)
2240 | ast::ClassSetItem::Literal(_)
2241 | ast::ClassSetItem::Range(_)
2242 | ast::ClassSetItem::Ascii(_)
2243 | ast::ClassSetItem::Unicode(_)
2244 | ast::ClassSetItem::Perl(_) => {
2245 // These are all base cases, so we don't increment depth.
2246 return Ok(());
2247 }
2248 ast::ClassSetItem::Bracketed(ref x) => &x.span,
2249 ast::ClassSetItem::Union(ref x) => &x.span,
2250 };
2251 self.increment_depth(span)
2252 }
2253
visit_class_set_item_post( &mut self, ast: &ast::ClassSetItem, ) -> Result<()>2254 fn visit_class_set_item_post(
2255 &mut self,
2256 ast: &ast::ClassSetItem,
2257 ) -> Result<()> {
2258 match *ast {
2259 ast::ClassSetItem::Empty(_)
2260 | ast::ClassSetItem::Literal(_)
2261 | ast::ClassSetItem::Range(_)
2262 | ast::ClassSetItem::Ascii(_)
2263 | ast::ClassSetItem::Unicode(_)
2264 | ast::ClassSetItem::Perl(_) => {
2265 // These are all base cases, so we don't decrement depth.
2266 Ok(())
2267 }
2268 ast::ClassSetItem::Bracketed(_) | ast::ClassSetItem::Union(_) => {
2269 self.decrement_depth();
2270 Ok(())
2271 }
2272 }
2273 }
2274
visit_class_set_binary_op_pre( &mut self, ast: &ast::ClassSetBinaryOp, ) -> Result<()>2275 fn visit_class_set_binary_op_pre(
2276 &mut self,
2277 ast: &ast::ClassSetBinaryOp,
2278 ) -> Result<()> {
2279 self.increment_depth(&ast.span)
2280 }
2281
visit_class_set_binary_op_post( &mut self, _ast: &ast::ClassSetBinaryOp, ) -> Result<()>2282 fn visit_class_set_binary_op_post(
2283 &mut self,
2284 _ast: &ast::ClassSetBinaryOp,
2285 ) -> Result<()> {
2286 self.decrement_depth();
2287 Ok(())
2288 }
2289 }
2290
2291 /// When the result is an error, transforms the ast::ErrorKind from the source
2292 /// Result into another one. This function is used to return clearer error
2293 /// messages when possible.
specialize_err<T>( result: Result<T>, from: ast::ErrorKind, to: ast::ErrorKind, ) -> Result<T>2294 fn specialize_err<T>(
2295 result: Result<T>,
2296 from: ast::ErrorKind,
2297 to: ast::ErrorKind,
2298 ) -> Result<T> {
2299 if let Err(e) = result {
2300 if e.kind == from {
2301 Err(ast::Error { kind: to, pattern: e.pattern, span: e.span })
2302 } else {
2303 Err(e)
2304 }
2305 } else {
2306 result
2307 }
2308 }
2309
2310 #[cfg(test)]
2311 mod tests {
2312 use std::ops::Range;
2313
2314 use super::{Parser, ParserBuilder, ParserI, Primitive};
2315 use crate::ast::{self, Ast, Position, Span};
2316
// Our own assert_eq, which has slightly better formatting (but honestly
// still kind of crappy).
//
// Being defined under the same name, it takes the place of the standard
// `assert_eq!` for the rest of this module. It accepts exactly two
// expressions: no custom message and no trailing comma.
//
// The operands are compared with `==` through references, so they may be
// of different types as long as a matching `PartialEq` impl exists (the
// tests below compare `ast::Error` against `TestError`). Both operands
// must implement `Debug` for the failure message.
macro_rules! assert_eq {
    ($left:expr, $right:expr) => {{
        // Binding through `match` evaluates each operand exactly once and
        // keeps temporaries alive for the whole comparison.
        match (&$left, &$right) {
            (left_val, right_val) => {
                if !(*left_val == *right_val) {
                    panic!(
                        "assertion failed: `(left == right)`\n\n\
                         left: `{:?}`\nright: `{:?}`\n\n",
                        left_val, right_val
                    )
                }
            }
        }
    }};
}
2334
// We create these errors to compare with real ast::Errors in the tests.
// We define equality between TestError and ast::Error to disregard the
// pattern string in ast::Error, which is annoying to provide in tests.
// Only the span and the kind take part in the comparison.
#[derive(Clone, Debug)]
struct TestError {
    // Where in the pattern the error is expected to be reported.
    span: Span,
    // The expected kind of error.
    kind: ast::ErrorKind,
}
2343
2344 impl PartialEq<ast::Error> for TestError {
eq(&self, other: &ast::Error) -> bool2345 fn eq(&self, other: &ast::Error) -> bool {
2346 self.span == other.span && self.kind == other.kind
2347 }
2348 }
2349
2350 impl PartialEq<TestError> for ast::Error {
eq(&self, other: &TestError) -> bool2351 fn eq(&self, other: &TestError) -> bool {
2352 self.span == other.span && self.kind == other.kind
2353 }
2354 }
2355
/// Short alias for turning a string slice into an owned `String`.
fn s(str: &str) -> String {
    String::from(str)
}
2359
parser(pattern: &str) -> ParserI<'_, Parser>2360 fn parser(pattern: &str) -> ParserI<'_, Parser> {
2361 ParserI::new(Parser::new(), pattern)
2362 }
2363
parser_octal(pattern: &str) -> ParserI<'_, Parser>2364 fn parser_octal(pattern: &str) -> ParserI<'_, Parser> {
2365 let parser = ParserBuilder::new().octal(true).build();
2366 ParserI::new(parser, pattern)
2367 }
2368
parser_nest_limit( pattern: &str, nest_limit: u32, ) -> ParserI<'_, Parser>2369 fn parser_nest_limit(
2370 pattern: &str,
2371 nest_limit: u32,
2372 ) -> ParserI<'_, Parser> {
2373 let p = ParserBuilder::new().nest_limit(nest_limit).build();
2374 ParserI::new(p, pattern)
2375 }
2376
parser_ignore_whitespace(pattern: &str) -> ParserI<'_, Parser>2377 fn parser_ignore_whitespace(pattern: &str) -> ParserI<'_, Parser> {
2378 let p = ParserBuilder::new().ignore_whitespace(true).build();
2379 ParserI::new(p, pattern)
2380 }
2381
/// Short alias for creating a new span.
///
/// Forwards `start` and `end` unchanged to `Span::new`.
fn nspan(start: Position, end: Position) -> Span {
    Span::new(start, end)
}
2386
/// Short alias for creating a new position.
///
/// Forwards `offset`, `line` and `column` unchanged to `Position::new`.
fn npos(offset: usize, line: usize, column: usize) -> Position {
    Position::new(offset, line, column)
}
2391
2392 /// Create a new span from the given offset range. This assumes a single
2393 /// line and sets the columns based on the offsets. i.e., This only works
2394 /// out of the box for ASCII, which is fine for most tests.
span(range: Range<usize>) -> Span2395 fn span(range: Range<usize>) -> Span {
2396 let start = Position::new(range.start, 1, range.start + 1);
2397 let end = Position::new(range.end, 1, range.end + 1);
2398 Span::new(start, end)
2399 }
2400
2401 /// Create a new span for the corresponding byte range in the given string.
span_range(subject: &str, range: Range<usize>) -> Span2402 fn span_range(subject: &str, range: Range<usize>) -> Span {
2403 let start = Position {
2404 offset: range.start,
2405 line: 1 + subject[..range.start].matches('\n').count(),
2406 column: 1 + subject[..range.start]
2407 .chars()
2408 .rev()
2409 .position(|c| c == '\n')
2410 .unwrap_or(subject[..range.start].chars().count()),
2411 };
2412 let end = Position {
2413 offset: range.end,
2414 line: 1 + subject[..range.end].matches('\n').count(),
2415 column: 1 + subject[..range.end]
2416 .chars()
2417 .rev()
2418 .position(|c| c == '\n')
2419 .unwrap_or(subject[..range.end].chars().count()),
2420 };
2421 Span::new(start, end)
2422 }
2423
2424 /// Create a verbatim literal starting at the given position.
lit(c: char, start: usize) -> Ast2425 fn lit(c: char, start: usize) -> Ast {
2426 lit_with(c, span(start..start + c.len_utf8()))
2427 }
2428
2429 /// Create a punctuation literal starting at the given position.
punct_lit(c: char, span: Span) -> Ast2430 fn punct_lit(c: char, span: Span) -> Ast {
2431 Ast::Literal(ast::Literal {
2432 span: span,
2433 kind: ast::LiteralKind::Punctuation,
2434 c: c,
2435 })
2436 }
2437
2438 /// Create a verbatim literal with the given span.
lit_with(c: char, span: Span) -> Ast2439 fn lit_with(c: char, span: Span) -> Ast {
2440 Ast::Literal(ast::Literal {
2441 span: span,
2442 kind: ast::LiteralKind::Verbatim,
2443 c: c,
2444 })
2445 }
2446
2447 /// Create a concatenation with the given range.
concat(range: Range<usize>, asts: Vec<Ast>) -> Ast2448 fn concat(range: Range<usize>, asts: Vec<Ast>) -> Ast {
2449 concat_with(span(range), asts)
2450 }
2451
2452 /// Create a concatenation with the given span.
concat_with(span: Span, asts: Vec<Ast>) -> Ast2453 fn concat_with(span: Span, asts: Vec<Ast>) -> Ast {
2454 Ast::Concat(ast::Concat { span: span, asts: asts })
2455 }
2456
2457 /// Create an alternation with the given span.
alt(range: Range<usize>, asts: Vec<Ast>) -> Ast2458 fn alt(range: Range<usize>, asts: Vec<Ast>) -> Ast {
2459 Ast::Alternation(ast::Alternation { span: span(range), asts: asts })
2460 }
2461
2462 /// Create a capturing group with the given span.
group(range: Range<usize>, index: u32, ast: Ast) -> Ast2463 fn group(range: Range<usize>, index: u32, ast: Ast) -> Ast {
2464 Ast::Group(ast::Group {
2465 span: span(range),
2466 kind: ast::GroupKind::CaptureIndex(index),
2467 ast: Box::new(ast),
2468 })
2469 }
2470
2471 /// Create an ast::SetFlags.
2472 ///
2473 /// The given pattern should be the full pattern string. The range given
2474 /// should correspond to the byte offsets where the flag set occurs.
2475 ///
2476 /// If negated is true, then the set is interpreted as beginning with a
2477 /// negation.
flag_set( pat: &str, range: Range<usize>, flag: ast::Flag, negated: bool, ) -> Ast2478 fn flag_set(
2479 pat: &str,
2480 range: Range<usize>,
2481 flag: ast::Flag,
2482 negated: bool,
2483 ) -> Ast {
2484 let mut items = vec![ast::FlagsItem {
2485 span: span_range(pat, (range.end - 2)..(range.end - 1)),
2486 kind: ast::FlagsItemKind::Flag(flag),
2487 }];
2488 if negated {
2489 items.insert(
2490 0,
2491 ast::FlagsItem {
2492 span: span_range(pat, (range.start + 2)..(range.end - 2)),
2493 kind: ast::FlagsItemKind::Negation,
2494 },
2495 );
2496 }
2497 Ast::Flags(ast::SetFlags {
2498 span: span_range(pat, range.clone()),
2499 flags: ast::Flags {
2500 span: span_range(pat, (range.start + 2)..(range.end - 1)),
2501 items: items,
2502 },
2503 })
2504 }
2505
/// Checks how much nesting each kind of expression needs, by parsing small
/// patterns with a limit that is just too low and (where given) with one
/// that is just high enough.
#[test]
fn parse_nest_limit() {
    // Parses `pat` with the nest limit set to `limit`.
    let parse =
        |pat: &str, limit: u32| parser_nest_limit(pat, limit).parse();
    // The error expected when `limit` is exceeded at `range`.
    let exceeded = |range: Range<usize>, limit: u32| TestError {
        span: span(range),
        kind: ast::ErrorKind::NestLimitExceeded(limit),
    };
    // A greedy repetition of `inner`; `op` is the range of the operator.
    let repeat = |range: Range<usize>,
                  op: Range<usize>,
                  kind: ast::RepetitionKind,
                  inner: Ast| {
        Ast::Repetition(ast::Repetition {
            span: span(range),
            op: ast::RepetitionOp { span: span(op), kind },
            greedy: true,
            ast: Box::new(inner),
        })
    };

    // A nest limit of 0 still allows some types of regexes.
    assert_eq!(parse("", 0), Ok(Ast::Empty(span(0..0))));
    assert_eq!(parse("a", 0), Ok(lit('a', 0)));

    // Test repetition operations, which require one level of nesting.
    assert_eq!(parse("a+", 0).unwrap_err(), exceeded(0..2, 0));
    assert_eq!(
        parse("a+", 1),
        Ok(repeat(
            0..2,
            1..2,
            ast::RepetitionKind::OneOrMore,
            lit('a', 0)
        ))
    );
    assert_eq!(parse("(a)+", 1).unwrap_err(), exceeded(0..3, 1));
    assert_eq!(parse("a+*", 1).unwrap_err(), exceeded(0..2, 1));
    assert_eq!(
        parse("a+*", 2),
        Ok(repeat(
            0..3,
            2..3,
            ast::RepetitionKind::ZeroOrMore,
            repeat(0..2, 1..2, ast::RepetitionKind::OneOrMore, lit('a', 0))
        ))
    );

    // Test concatenations. A concatenation requires one level of nesting.
    assert_eq!(parse("ab", 0).unwrap_err(), exceeded(0..2, 0));
    assert_eq!(
        parse("ab", 1),
        Ok(concat(0..2, vec![lit('a', 0), lit('b', 1)]))
    );
    assert_eq!(
        parse("abc", 1),
        Ok(concat(0..3, vec![lit('a', 0), lit('b', 1), lit('c', 2)]))
    );

    // Test alternations. An alternation requires one level of nesting.
    assert_eq!(parse("a|b", 0).unwrap_err(), exceeded(0..3, 0));
    assert_eq!(
        parse("a|b", 1),
        Ok(alt(0..3, vec![lit('a', 0), lit('b', 2)]))
    );
    assert_eq!(
        parse("a|b|c", 1),
        Ok(alt(0..5, vec![lit('a', 0), lit('b', 2), lit('c', 4)]))
    );

    // Test character classes. Classes form their own mini-recursive
    // syntax!
    assert_eq!(parse("[a]", 0).unwrap_err(), exceeded(0..3, 0));
    let only_a = ast::ClassSetItem::Literal(ast::Literal {
        span: span(1..2),
        kind: ast::LiteralKind::Verbatim,
        c: 'a',
    });
    assert_eq!(
        parse("[a]", 1),
        Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
            span: span(0..3),
            negated: false,
            kind: ast::ClassSet::Item(only_a),
        })))
    );
    assert_eq!(parse("[ab]", 1).unwrap_err(), exceeded(1..3, 1));
    assert_eq!(parse("[ab[cd]]", 2).unwrap_err(), exceeded(3..7, 2));
    assert_eq!(parse("[ab[cd]]", 3).unwrap_err(), exceeded(4..6, 3));
    assert_eq!(parse("[a--b]", 1).unwrap_err(), exceeded(1..5, 1));
    assert_eq!(parse("[a--bc]", 2).unwrap_err(), exceeded(4..6, 2));
}
2663
/// Checks that comments in verbose (`x`) mode are skipped by the parser
/// and reported, with their spans, by `parse_with_comments`.
#[test]
fn parse_comments() {
    // The pattern is assembled line by line so that every byte, including
    // the leading whitespace of each line, is explicit: all offsets below
    // are byte offsets into this exact text (98 bytes in total). The third
    // comment is expected at 53..74, i.e. two bytes after the previous
    // line ends at 51; those two bytes are taken to be spaces here.
    let pat = concat!(
        "(?x)\n",
        "# This is comment 1.\n",
        "foo # This is comment 2.\n",
        "  # This is comment 3.\n",
        "bar\n",
        "# This is comment 4.",
    );
    let astc = parser(pat).parse_with_comments().unwrap();
    assert_eq!(
        astc.ast,
        concat_with(
            span_range(pat, 0..pat.len()),
            vec![
                flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
                lit_with('f', span_range(pat, 26..27)),
                lit_with('o', span_range(pat, 27..28)),
                lit_with('o', span_range(pat, 28..29)),
                lit_with('b', span_range(pat, 74..75)),
                lit_with('a', span_range(pat, 75..76)),
                lit_with('r', span_range(pat, 76..77)),
            ]
        )
    );
    // A comment's span starts at its `#`. For the first three comments it
    // extends through the newline that ends the comment; the last one ends
    // with the pattern. The recorded text excludes both `#` and newline.
    assert_eq!(
        astc.comments,
        vec![
            ast::Comment {
                span: span_range(pat, 5..26),
                comment: s(" This is comment 1."),
            },
            ast::Comment {
                span: span_range(pat, 30..51),
                comment: s(" This is comment 2."),
            },
            ast::Comment {
                span: span_range(pat, 53..74),
                comment: s(" This is comment 3."),
            },
            ast::Comment {
                span: span_range(pat, 78..98),
                comment: s(" This is comment 4."),
            },
        ]
    );
}
2710
/// Checks a lone `]` and a run of every escaped punctuation character.
#[test]
fn parse_holistic() {
    // A `]` outside of a class is an ordinary literal.
    assert_eq!(parser("]").parse(), Ok(lit(']', 0)));

    // Each escape is two bytes long (a backslash plus the character), so
    // the i-th punctuation literal covers bytes 2*i..2*i + 2.
    let escaped = r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#\&\-\~";
    let expected: Vec<Ast> = r"\.+*?()|[]{}^$#&-~"
        .chars()
        .enumerate()
        .map(|(i, c)| punct_lit(c, span(2 * i..2 * i + 2)))
        .collect();
    assert_eq!(parser(escaped).parse(), Ok(concat(0..36, expected)));
}
2741
/// Checks verbose (`x`) mode: whitespace is skipped while the flag is on,
/// is significant again once it is off, and may appear after an opening
/// paren, inside `\x{...}` and after a backslash.
///
/// All expected spans are byte offsets into the pattern literals, so the
/// exact number of spaces in each literal matters.
#[test]
fn parse_ignore_whitespace() {
    // Every pattern starting with `(?x)` yields this flag item first.
    let x_on = |pat: &str| {
        flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false)
    };

    // Test that basic whitespace insensitivity works.
    let pat = "(?x)a b";
    assert_eq!(
        parser(pat).parse(),
        Ok(concat_with(
            nspan(npos(0, 1, 1), npos(7, 1, 8)),
            vec![
                x_on(pat),
                lit_with('a', nspan(npos(4, 1, 5), npos(5, 1, 6))),
                lit_with('b', nspan(npos(6, 1, 7), npos(7, 1, 8))),
            ]
        ))
    );

    // Test that we can toggle whitespace insensitivity.
    let pat = "(?x)a b(?-x)a b";
    assert_eq!(
        parser(pat).parse(),
        Ok(concat_with(
            nspan(npos(0, 1, 1), npos(15, 1, 16)),
            vec![
                x_on(pat),
                lit_with('a', nspan(npos(4, 1, 5), npos(5, 1, 6))),
                lit_with('b', nspan(npos(6, 1, 7), npos(7, 1, 8))),
                flag_set(pat, 7..12, ast::Flag::IgnoreWhitespace, true),
                lit_with('a', nspan(npos(12, 1, 13), npos(13, 1, 14))),
                lit_with(' ', nspan(npos(13, 1, 14), npos(14, 1, 15))),
                lit_with('b', nspan(npos(14, 1, 15), npos(15, 1, 16))),
            ]
        ))
    );

    // Test that nesting whitespace insensitive flags works.
    let pat = "a (?x:a )a ";
    assert_eq!(
        parser(pat).parse(),
        Ok(concat_with(
            span_range(pat, 0..11),
            vec![
                lit_with('a', span_range(pat, 0..1)),
                lit_with(' ', span_range(pat, 1..2)),
                Ast::Group(ast::Group {
                    span: span_range(pat, 2..9),
                    kind: ast::GroupKind::NonCapturing(ast::Flags {
                        span: span_range(pat, 4..5),
                        items: vec![ast::FlagsItem {
                            span: span_range(pat, 4..5),
                            kind: ast::FlagsItemKind::Flag(
                                ast::Flag::IgnoreWhitespace
                            ),
                        }],
                    }),
                    ast: Box::new(lit_with('a', span_range(pat, 6..7))),
                }),
                lit_with('a', span_range(pat, 9..10)),
                lit_with(' ', span_range(pat, 10..11)),
            ]
        ))
    );

    // Test that whitespace after an opening paren is insignificant.
    let pat = "(?x)( ?P<foo> a )";
    assert_eq!(
        parser(pat).parse(),
        Ok(concat_with(
            span_range(pat, 0..pat.len()),
            vec![
                x_on(pat),
                Ast::Group(ast::Group {
                    span: span_range(pat, 4..pat.len()),
                    kind: ast::GroupKind::CaptureName(ast::CaptureName {
                        span: span_range(pat, 9..12),
                        name: s("foo"),
                        index: 1,
                    }),
                    ast: Box::new(lit_with('a', span_range(pat, 14..15))),
                }),
            ]
        ))
    );
    // Two spaces follow the paren, which puts `a` at byte 7 as asserted.
    let pat = "(?x)(  a )";
    assert_eq!(
        parser(pat).parse(),
        Ok(concat_with(
            span_range(pat, 0..pat.len()),
            vec![
                x_on(pat),
                Ast::Group(ast::Group {
                    span: span_range(pat, 4..pat.len()),
                    kind: ast::GroupKind::CaptureIndex(1),
                    ast: Box::new(lit_with('a', span_range(pat, 7..8))),
                }),
            ]
        ))
    );
    // Two spaces after the paren and two after the `:` put the `:` at
    // byte 8 (where the empty flag list is reported) and `a` at byte 11.
    let pat = "(?x)(  ?:  a )";
    assert_eq!(
        parser(pat).parse(),
        Ok(concat_with(
            span_range(pat, 0..pat.len()),
            vec![
                x_on(pat),
                Ast::Group(ast::Group {
                    span: span_range(pat, 4..pat.len()),
                    kind: ast::GroupKind::NonCapturing(ast::Flags {
                        span: span_range(pat, 8..8),
                        items: vec![],
                    }),
                    ast: Box::new(lit_with('a', span_range(pat, 11..12))),
                }),
            ]
        ))
    );
    let pat = r"(?x)\x { 53 }";
    assert_eq!(
        parser(pat).parse(),
        Ok(concat_with(
            span_range(pat, 0..pat.len()),
            vec![
                x_on(pat),
                Ast::Literal(ast::Literal {
                    span: span(4..13),
                    kind: ast::LiteralKind::HexBrace(
                        ast::HexLiteralKind::X
                    ),
                    c: 'S',
                }),
            ]
        ))
    );

    // Test that whitespace after an escape is OK.
    let pat = r"(?x)\ ";
    assert_eq!(
        parser(pat).parse(),
        Ok(concat_with(
            span_range(pat, 0..pat.len()),
            vec![
                x_on(pat),
                Ast::Literal(ast::Literal {
                    span: span_range(pat, 4..6),
                    kind: ast::LiteralKind::Special(
                        ast::SpecialLiteralKind::Space
                    ),
                    c: ' ',
                }),
            ]
        ))
    );
    // ... but only when `x` mode is enabled.
    let pat = r"\ ";
    assert_eq!(
        parser(pat).parse().unwrap_err(),
        TestError {
            span: span_range(pat, 0..2),
            kind: ast::ErrorKind::EscapeUnrecognized,
        }
    );
}
2903
// A bare `\n` in the pattern is parsed as an ordinary literal. The second
// case pins the positions reported around each newline: judging by the
// expected values, the position right after a `\n` is on the next line and
// its last component restarts at 1.
#[test]
fn parse_newlines() {
    // One newline sandwiched between two dots.
    let dots = ".\n.";
    let expected = concat_with(
        span_range(dots, 0..3),
        vec![
            Ast::Dot(span_range(dots, 0..1)),
            lit_with('\n', span_range(dots, 1..2)),
            Ast::Dot(span_range(dots, 2..3)),
        ],
    );
    assert_eq!(parser(dots).parse(), Ok(expected));

    // Several lines. Each row is (literal, start, end), where `start` and
    // `end` are the three arguments handed to `npos`.
    let lines = "foobar\nbaz\nquux\n";
    let rows: &[(char, (usize, usize, usize), (usize, usize, usize))] = &[
        ('f', (0, 1, 1), (1, 1, 2)),
        ('o', (1, 1, 2), (2, 1, 3)),
        ('o', (2, 1, 3), (3, 1, 4)),
        ('b', (3, 1, 4), (4, 1, 5)),
        ('a', (4, 1, 5), (5, 1, 6)),
        ('r', (5, 1, 6), (6, 1, 7)),
        ('\n', (6, 1, 7), (7, 2, 1)),
        ('b', (7, 2, 1), (8, 2, 2)),
        ('a', (8, 2, 2), (9, 2, 3)),
        ('z', (9, 2, 3), (10, 2, 4)),
        ('\n', (10, 2, 4), (11, 3, 1)),
        ('q', (11, 3, 1), (12, 3, 2)),
        ('u', (12, 3, 2), (13, 3, 3)),
        ('u', (13, 3, 3), (14, 3, 4)),
        ('x', (14, 3, 4), (15, 3, 5)),
        ('\n', (15, 3, 5), (16, 4, 1)),
    ];
    let literals: Vec<Ast> = rows
        .iter()
        .map(|&(c, (o1, l1, c1), (o2, l2, c2))| {
            lit_with(c, nspan(npos(o1, l1, c1), npos(o2, l2, c2)))
        })
        .collect();
    assert_eq!(
        parser(lines).parse(),
        Ok(concat_with(span_range(lines, 0..lines.len()), literals))
    );
}
2945
// Covers the `*`, `+` and `?` operators: greedy and lazy forms, how the
// operator binds inside concatenations, groups and alternations, and the
// error reported when an operator has nothing to repeat.
#[test]
fn parse_uncounted_repetition() {
    // Builds the repetition node expected from the parser. `whole` is the
    // span of the full repetition expression and `op` the span of the
    // operator alone (including a trailing lazy `?`, if any).
    fn rep(
        whole: std::ops::Range<usize>,
        op: std::ops::Range<usize>,
        kind: ast::RepetitionKind,
        greedy: bool,
        operand: Ast,
    ) -> Ast {
        Ast::Repetition(ast::Repetition {
            span: span(whole),
            op: ast::RepetitionOp { span: span(op), kind },
            greedy,
            ast: Box::new(operand),
        })
    }

    // Greedy forms of each operator.
    assert_eq!(
        parser(r"a*").parse(),
        Ok(rep(
            0..2,
            1..2,
            ast::RepetitionKind::ZeroOrMore,
            true,
            lit('a', 0)
        ))
    );
    assert_eq!(
        parser(r"a+").parse(),
        Ok(rep(
            0..2,
            1..2,
            ast::RepetitionKind::OneOrMore,
            true,
            lit('a', 0)
        ))
    );
    assert_eq!(
        parser(r"a?").parse(),
        Ok(rep(
            0..2,
            1..2,
            ast::RepetitionKind::ZeroOrOne,
            true,
            lit('a', 0)
        ))
    );

    // Lazy forms: a trailing `?` clears `greedy` and becomes part of the
    // operator span.
    assert_eq!(
        parser(r"a??").parse(),
        Ok(rep(
            0..3,
            1..3,
            ast::RepetitionKind::ZeroOrOne,
            false,
            lit('a', 0)
        ))
    );
    assert_eq!(
        parser(r"a*?").parse(),
        Ok(rep(
            0..3,
            1..3,
            ast::RepetitionKind::ZeroOrMore,
            false,
            lit('a', 0)
        ))
    );
    assert_eq!(
        parser(r"a+?").parse(),
        Ok(rep(
            0..3,
            1..3,
            ast::RepetitionKind::OneOrMore,
            false,
            lit('a', 0)
        ))
    );

    // The operator applies only to the expression immediately before it.
    assert_eq!(
        parser(r"a?b").parse(),
        Ok(concat(
            0..3,
            vec![
                rep(
                    0..2,
                    1..2,
                    ast::RepetitionKind::ZeroOrOne,
                    true,
                    lit('a', 0)
                ),
                lit('b', 2),
            ]
        ))
    );
    assert_eq!(
        parser(r"a??b").parse(),
        Ok(concat(
            0..4,
            vec![
                rep(
                    0..3,
                    1..3,
                    ast::RepetitionKind::ZeroOrOne,
                    false,
                    lit('a', 0)
                ),
                lit('b', 3),
            ]
        ))
    );
    assert_eq!(
        parser(r"ab?").parse(),
        Ok(concat(
            0..3,
            vec![
                lit('a', 0),
                rep(
                    1..3,
                    2..3,
                    ast::RepetitionKind::ZeroOrOne,
                    true,
                    lit('b', 1)
                ),
            ]
        ))
    );

    // A group is repeated as a whole.
    assert_eq!(
        parser(r"(ab)?").parse(),
        Ok(rep(
            0..5,
            4..5,
            ast::RepetitionKind::ZeroOrOne,
            true,
            group(0..4, 1, concat(1..3, vec![lit('a', 1), lit('b', 2)]))
        ))
    );

    // Repetition inside the second branch of an alternation.
    assert_eq!(
        parser(r"|a?").parse(),
        Ok(alt(
            0..3,
            vec![
                Ast::Empty(span(0..0)),
                rep(
                    1..3,
                    2..3,
                    ast::RepetitionKind::ZeroOrOne,
                    true,
                    lit('a', 1)
                ),
            ]
        ))
    );

    // An operator with nothing to repeat is an error. Each row is
    // (pattern, offset of the empty span the error is expected at).
    let missing = [
        (r"*", 0),
        (r"(?i)*", 4),
        (r"(*)", 1),
        (r"(?:?)", 3),
        (r"+", 0),
        (r"?", 0),
        (r"(?)", 1),
        (r"|*", 1),
        (r"|+", 1),
        (r"|?", 1),
    ];
    for &(pat, at) in missing.iter() {
        assert_eq!(
            parser(pat).parse().unwrap_err(),
            TestError {
                span: span(at..at),
                kind: ast::ErrorKind::RepetitionMissing,
            }
        );
    }
}
3169
// Covers the `{m}`, `{m,}` and `{m,n}` operators: the accepted spellings
// (including whitespace inside the braces and a lazy `?` suffix) and the
// error reported for each malformed form.
#[test]
fn parse_counted_repetition() {
    // Builds the counted-repetition node expected from the parser. `whole`
    // is the span of the full expression, `op` the span of the `{..}`
    // operator (plus a trailing `?`, if any).
    fn counted(
        whole: std::ops::Range<usize>,
        op: std::ops::Range<usize>,
        range: ast::RepetitionRange,
        greedy: bool,
        operand: Ast,
    ) -> Ast {
        Ast::Repetition(ast::Repetition {
            span: span(whole),
            op: ast::RepetitionOp {
                span: span(op),
                kind: ast::RepetitionKind::Range(range),
            },
            greedy,
            ast: Box::new(operand),
        })
    }

    // The three range shapes.
    assert_eq!(
        parser(r"a{5}").parse(),
        Ok(counted(
            0..4,
            1..4,
            ast::RepetitionRange::Exactly(5),
            true,
            lit('a', 0)
        ))
    );
    assert_eq!(
        parser(r"a{5,}").parse(),
        Ok(counted(
            0..5,
            1..5,
            ast::RepetitionRange::AtLeast(5),
            true,
            lit('a', 0)
        ))
    );
    assert_eq!(
        parser(r"a{5,9}").parse(),
        Ok(counted(
            0..6,
            1..6,
            ast::RepetitionRange::Bounded(5, 9),
            true,
            lit('a', 0)
        ))
    );

    // Lazy suffix.
    assert_eq!(
        parser(r"a{5}?").parse(),
        Ok(counted(
            0..5,
            1..5,
            ast::RepetitionRange::Exactly(5),
            false,
            lit('a', 0)
        ))
    );

    // The operator applies only to the expression immediately before it.
    assert_eq!(
        parser(r"ab{5}").parse(),
        Ok(concat(
            0..5,
            vec![
                lit('a', 0),
                counted(
                    1..5,
                    2..5,
                    ast::RepetitionRange::Exactly(5),
                    true,
                    lit('b', 1)
                ),
            ]
        ))
    );
    assert_eq!(
        parser(r"ab{5}c").parse(),
        Ok(concat(
            0..6,
            vec![
                lit('a', 0),
                counted(
                    1..5,
                    2..5,
                    ast::RepetitionRange::Exactly(5),
                    true,
                    lit('b', 1)
                ),
                lit('c', 5),
            ]
        ))
    );

    // Whitespace inside the braces is accepted.
    assert_eq!(
        parser(r"a{ 5 }").parse(),
        Ok(counted(
            0..6,
            1..6,
            ast::RepetitionRange::Exactly(5),
            true,
            lit('a', 0)
        ))
    );
    assert_eq!(
        parser(r"a{ 5 , 9 }").parse(),
        Ok(counted(
            0..10,
            1..10,
            ast::RepetitionRange::Bounded(5, 9),
            true,
            lit('a', 0)
        ))
    );
    // With the whitespace-ignoring parser, a space may separate the
    // operator from its lazy `?`; the operator span then covers both.
    assert_eq!(
        parser_ignore_whitespace(r"a{5,9} ?").parse(),
        Ok(counted(
            0..8,
            1..8,
            ast::RepetitionRange::Bounded(5, 9),
            false,
            lit('a', 0)
        ))
    );

    // Malformed input. Each row is (pattern, expected error span, kind).
    let failures: Vec<(&str, std::ops::Range<usize>, ast::ErrorKind)> = vec![
        (r"(?i){0}", 4..4, ast::ErrorKind::RepetitionMissing),
        (r"(?m){1,1}", 4..4, ast::ErrorKind::RepetitionMissing),
        (r"a{]}", 2..2, ast::ErrorKind::RepetitionCountDecimalEmpty),
        (r"a{1,]}", 4..4, ast::ErrorKind::RepetitionCountDecimalEmpty),
        (r"a{", 1..2, ast::ErrorKind::RepetitionCountUnclosed),
        (r"a{}", 2..2, ast::ErrorKind::RepetitionCountDecimalEmpty),
        (r"a{a", 2..2, ast::ErrorKind::RepetitionCountDecimalEmpty),
        (r"a{9999999999}", 2..12, ast::ErrorKind::DecimalInvalid),
        (r"a{9", 1..3, ast::ErrorKind::RepetitionCountUnclosed),
        (r"a{9,a", 4..4, ast::ErrorKind::RepetitionCountDecimalEmpty),
        (r"a{9,9999999999}", 4..14, ast::ErrorKind::DecimalInvalid),
        (r"a{9,", 1..4, ast::ErrorKind::RepetitionCountUnclosed),
        (r"a{9,11", 1..6, ast::ErrorKind::RepetitionCountUnclosed),
        (r"a{2,1}", 1..6, ast::ErrorKind::RepetitionCountInvalid),
        (r"{5}", 0..0, ast::ErrorKind::RepetitionMissing),
        (r"|{5}", 1..1, ast::ErrorKind::RepetitionMissing),
    ];
    for (pat, at, kind) in failures {
        assert_eq!(
            parser(pat).parse().unwrap_err(),
            TestError { span: span(at), kind }
        );
    }
}
3426
// Covers `|`: flat and nested alternations, empty branches on either side,
// and unbalanced parentheses around an alternation.
#[test]
fn parse_alternate() {
    // Builds an alternation node directly from its span and branches.
    fn alternation(range: std::ops::Range<usize>, asts: Vec<Ast>) -> Ast {
        Ast::Alternation(ast::Alternation { span: span(range), asts })
    }
    // Concatenation of two adjacent literals, the first one at `start`.
    fn pair(start: usize, first: char, second: char) -> Ast {
        concat(
            start..start + 2,
            vec![lit(first, start), lit(second, start + 1)],
        )
    }
    // Empty expression located at offset `at`.
    fn empty(at: usize) -> Ast {
        Ast::Empty(span(at..at))
    }

    // Single-literal branches, bare and grouped.
    assert_eq!(
        parser(r"a|b").parse(),
        Ok(alternation(0..3, vec![lit('a', 0), lit('b', 2)]))
    );
    assert_eq!(
        parser(r"(a|b)").parse(),
        Ok(group(
            0..5,
            1,
            alternation(1..4, vec![lit('a', 1), lit('b', 3)])
        ))
    );

    // Three branches.
    assert_eq!(
        parser(r"a|b|c").parse(),
        Ok(alternation(
            0..5,
            vec![lit('a', 0), lit('b', 2), lit('c', 4)]
        ))
    );

    // Multi-literal branches, bare and grouped.
    assert_eq!(
        parser(r"ax|by|cz").parse(),
        Ok(alternation(
            0..8,
            vec![pair(0, 'a', 'x'), pair(3, 'b', 'y'), pair(6, 'c', 'z')]
        ))
    );
    assert_eq!(
        parser(r"(ax|by|cz)").parse(),
        Ok(group(
            0..10,
            1,
            alternation(
                1..9,
                vec![pair(1, 'a', 'x'), pair(4, 'b', 'y'), pair(7, 'c', 'z')]
            )
        ))
    );

    // Alternations nested through groups; built inside-out.
    let innermost = group(8..12, 3, pair(9, 'c', 'z'));
    let middle = group(
        4..13,
        2,
        alt(5..12, vec![pair(5, 'b', 'y'), innermost]),
    );
    assert_eq!(
        parser(r"(ax|(by|(cz)))").parse(),
        Ok(group(
            0..14,
            1,
            alt(1..13, vec![pair(1, 'a', 'x'), middle])
        ))
    );

    // Empty branches at the top level.
    assert_eq!(
        parser(r"|").parse(),
        Ok(alt(0..1, vec![empty(0), empty(1)]))
    );
    assert_eq!(
        parser(r"||").parse(),
        Ok(alt(0..2, vec![empty(0), empty(1), empty(2)]))
    );
    assert_eq!(
        parser(r"a|").parse(),
        Ok(alt(0..2, vec![lit('a', 0), empty(2)]))
    );
    assert_eq!(
        parser(r"|a").parse(),
        Ok(alt(0..2, vec![empty(0), lit('a', 1)]))
    );

    // Empty branches inside a group.
    assert_eq!(
        parser(r"(|)").parse(),
        Ok(group(0..3, 1, alt(1..2, vec![empty(1), empty(2)])))
    );
    assert_eq!(
        parser(r"(a|)").parse(),
        Ok(group(0..4, 1, alt(1..3, vec![lit('a', 1), empty(3)])))
    );
    assert_eq!(
        parser(r"(|a)").parse(),
        Ok(group(0..4, 1, alt(1..3, vec![empty(1), lit('a', 2)])))
    );

    // Unbalanced parentheses. Each row is (pattern, error span, kind).
    let failures = vec![
        (r"a|b)", 3..4, ast::ErrorKind::GroupUnopened),
        (r"(a|b", 0..1, ast::ErrorKind::GroupUnclosed),
    ];
    for (pat, at, kind) in failures {
        assert_eq!(
            parser(pat).parse().unwrap_err(),
            TestError { span: span(at), kind }
        );
    }
}
3586
// Look-ahead and look-behind openers are rejected with
// `UnsupportedLookAround`; the expected error span starts at the opening
// parenthesis and ends after the look-around marker.
#[test]
fn parse_unsupported_lookaround() {
    // Each row is (pattern, end offset of the expected error span).
    let cases = [
        (r"(?=a)", 3),
        (r"(?!a)", 3),
        (r"(?<=a)", 4),
        (r"(?<!a)", 4),
    ];
    for &(pat, end) in cases.iter() {
        assert_eq!(
            parser(pat).parse().unwrap_err(),
            TestError {
                span: span(0..end),
                kind: ast::ErrorKind::UnsupportedLookAround,
            }
        );
    }
}
3618
// Covers parenthesized constructs: standalone flag settings `(?flags)`,
// capturing groups, non-capturing groups with and without flags, and the
// errors for unbalanced or truncated groups.
#[test]
fn parse_group() {
    // A single-character flag item located at offset `at`.
    fn on(at: usize, which: ast::Flag) -> ast::FlagsItem {
        ast::FlagsItem {
            span: span(at..at + 1),
            kind: ast::FlagsItemKind::Flag(which),
        }
    }
    // The `-` negation item located at offset `at`.
    fn minus(at: usize) -> ast::FlagsItem {
        ast::FlagsItem {
            span: span(at..at + 1),
            kind: ast::FlagsItemKind::Negation,
        }
    }
    // A flag list covering `range`.
    fn flag_list(
        range: std::ops::Range<usize>,
        items: Vec<ast::FlagsItem>,
    ) -> ast::Flags {
        ast::Flags { span: span(range), items }
    }
    // A standalone `(?flags)` directive spanning `0..end`.
    fn directive(end: usize, flags: ast::Flags) -> Ast {
        Ast::Flags(ast::SetFlags { span: span(0..end), flags })
    }
    // A capturing group with the given capture index.
    fn capture(
        range: std::ops::Range<usize>,
        index: u32,
        inner: Ast,
    ) -> Ast {
        Ast::Group(ast::Group {
            span: span(range),
            kind: ast::GroupKind::CaptureIndex(index),
            ast: Box::new(inner),
        })
    }
    // A non-capturing group starting at offset 0 and carrying `flags`.
    fn non_capturing(end: usize, flags: ast::Flags, inner: Ast) -> Ast {
        Ast::Group(ast::Group {
            span: span(0..end),
            kind: ast::GroupKind::NonCapturing(flags),
            ast: Box::new(inner),
        })
    }

    // Standalone flag directives.
    assert_eq!(
        parser("(?i)").parse(),
        Ok(directive(
            4,
            flag_list(2..3, vec![on(2, ast::Flag::CaseInsensitive)])
        ))
    );
    assert_eq!(
        parser("(?iU)").parse(),
        Ok(directive(
            5,
            flag_list(
                2..4,
                vec![
                    on(2, ast::Flag::CaseInsensitive),
                    on(3, ast::Flag::SwapGreed),
                ]
            )
        ))
    );
    assert_eq!(
        parser("(?i-U)").parse(),
        Ok(directive(
            6,
            flag_list(
                2..5,
                vec![
                    on(2, ast::Flag::CaseInsensitive),
                    minus(3),
                    on(4, ast::Flag::SwapGreed),
                ]
            )
        ))
    );

    // Capturing groups: empty, with a literal, and nested.
    assert_eq!(
        parser("()").parse(),
        Ok(capture(0..2, 1, Ast::Empty(span(1..1))))
    );
    assert_eq!(parser("(a)").parse(), Ok(capture(0..3, 1, lit('a', 1))));
    assert_eq!(
        parser("(())").parse(),
        Ok(capture(
            0..4,
            1,
            capture(1..3, 2, Ast::Empty(span(2..2)))
        ))
    );

    // Non-capturing groups; `(?:` carries an empty flag list.
    assert_eq!(
        parser("(?:a)").parse(),
        Ok(non_capturing(5, flag_list(2..2, vec![]), lit('a', 3)))
    );
    assert_eq!(
        parser("(?i:a)").parse(),
        Ok(non_capturing(
            6,
            flag_list(2..3, vec![on(2, ast::Flag::CaseInsensitive)]),
            lit('a', 4)
        ))
    );
    assert_eq!(
        parser("(?i-U:a)").parse(),
        Ok(non_capturing(
            8,
            flag_list(
                2..5,
                vec![
                    on(2, ast::Flag::CaseInsensitive),
                    minus(3),
                    on(4, ast::Flag::SwapGreed),
                ]
            ),
            lit('a', 6)
        ))
    );

    // Malformed groups. Each row is (pattern, error span, kind).
    let failures = vec![
        ("(", 0..1, ast::ErrorKind::GroupUnclosed),
        ("(?", 0..1, ast::ErrorKind::GroupUnclosed),
        ("(?P", 2..3, ast::ErrorKind::FlagUnrecognized),
        ("(?P<", 4..4, ast::ErrorKind::GroupNameUnexpectedEof),
        ("(a", 0..1, ast::ErrorKind::GroupUnclosed),
        ("(()", 0..1, ast::ErrorKind::GroupUnclosed),
        (")", 0..1, ast::ErrorKind::GroupUnopened),
        ("a)", 1..2, ast::ErrorKind::GroupUnopened),
    ];
    for (pat, at, kind) in failures {
        assert_eq!(
            parser(pat).parse().unwrap_err(),
            TestError { span: span(at), kind }
        );
    }
}
3830
// Covers `(?P<name>...)`: accepted names (letters, digits, `_`, `.`, `[`
// and `]` after the first character) and the errors for empty, truncated,
// invalid and duplicate names.
#[test]
fn parse_capture_name() {
    // Builds the expected named group for a pattern of the shape
    // `(?P<name>z)`: the name starts at offset 4 and the literal `z` sits
    // right after the closing `>`.
    fn named_z(name: &str) -> Ast {
        let name_end = 4 + name.len();
        Ast::Group(ast::Group {
            span: span(0..name_end + 3),
            kind: ast::GroupKind::CaptureName(ast::CaptureName {
                span: span(4..name_end),
                name: s(name),
                index: 1,
            }),
            ast: Box::new(lit('z', name_end + 1)),
        })
    }

    // Accepted names. Each row is (pattern, name).
    let accepted = [
        ("(?P<a>z)", "a"),
        ("(?P<abc>z)", "abc"),
        ("(?P<a_1>z)", "a_1"),
        ("(?P<a.1>z)", "a.1"),
        ("(?P<a[1]>z)", "a[1]"),
    ];
    for &(pat, name) in accepted.iter() {
        assert_eq!(parser(pat).parse(), Ok(named_z(name)));
    }

    // Rejected names. Each row is (pattern, error span, kind).
    let rejected = vec![
        ("(?P<", 4..4, ast::ErrorKind::GroupNameUnexpectedEof),
        ("(?P<>z)", 4..4, ast::ErrorKind::GroupNameEmpty),
        ("(?P<a", 5..5, ast::ErrorKind::GroupNameUnexpectedEof),
        ("(?P<ab", 6..6, ast::ErrorKind::GroupNameUnexpectedEof),
        ("(?P<0a", 4..5, ast::ErrorKind::GroupNameInvalid),
        ("(?P<~", 4..5, ast::ErrorKind::GroupNameInvalid),
        ("(?P<abc~", 7..8, ast::ErrorKind::GroupNameInvalid),
        (
            // The error points at the second use and records the first.
            "(?P<a>y)(?P<a>z)",
            12..13,
            ast::ErrorKind::GroupNameDuplicate { original: span(4..5) },
        ),
    ];
    for (pat, at, kind) in rejected {
        assert_eq!(
            parser(pat).parse().unwrap_err(),
            TestError { span: span(at), kind }
        );
    }
}
3956
// Exercises `parse_flags` directly on the text that follows `(?`: flag
// lists terminated by `:` or `)`, negation, and each flag-list error.
#[test]
fn parse_flags() {
    // A single-character flag item located at offset `at`.
    fn on(at: usize, which: ast::Flag) -> ast::FlagsItem {
        ast::FlagsItem {
            span: span(at..at + 1),
            kind: ast::FlagsItemKind::Flag(which),
        }
    }
    // The `-` negation item located at offset `at`.
    fn minus(at: usize) -> ast::FlagsItem {
        ast::FlagsItem {
            span: span(at..at + 1),
            kind: ast::FlagsItemKind::Negation,
        }
    }
    // A flag list spanning `0..end`.
    fn flag_list(end: usize, items: Vec<ast::FlagsItem>) -> ast::Flags {
        ast::Flags { span: span(0..end), items }
    }

    // A single flag, terminated either way; the terminator is not part of
    // the expected span.
    for &pat in ["i:", "i)"].iter() {
        assert_eq!(
            parser(pat).parse_flags(),
            Ok(flag_list(1, vec![on(0, ast::Flag::CaseInsensitive)]))
        );
    }

    // Several flags.
    assert_eq!(
        parser("isU:").parse_flags(),
        Ok(flag_list(
            3,
            vec![
                on(0, ast::Flag::CaseInsensitive),
                on(1, ast::Flag::DotMatchesNewLine),
                on(2, ast::Flag::SwapGreed),
            ]
        ))
    );

    // Negation at the front and in the middle.
    assert_eq!(
        parser("-isU:").parse_flags(),
        Ok(flag_list(
            4,
            vec![
                minus(0),
                on(1, ast::Flag::CaseInsensitive),
                on(2, ast::Flag::DotMatchesNewLine),
                on(3, ast::Flag::SwapGreed),
            ]
        ))
    );
    assert_eq!(
        parser("i-sU:").parse_flags(),
        Ok(flag_list(
            4,
            vec![
                on(0, ast::Flag::CaseInsensitive),
                minus(1),
                on(2, ast::Flag::DotMatchesNewLine),
                on(3, ast::Flag::SwapGreed),
            ]
        ))
    );

    // Malformed flag lists. Each row is (input, error span, kind).
    let failures = vec![
        ("isU", 3..3, ast::ErrorKind::FlagUnexpectedEof),
        ("isUa:", 3..4, ast::ErrorKind::FlagUnrecognized),
        (
            "isUi:",
            3..4,
            ast::ErrorKind::FlagDuplicate { original: span(0..1) },
        ),
        (
            "i-sU-i:",
            4..5,
            ast::ErrorKind::FlagRepeatedNegation { original: span(1..2) },
        ),
        ("-)", 0..1, ast::ErrorKind::FlagDanglingNegation),
        ("i-)", 1..2, ast::ErrorKind::FlagDanglingNegation),
        ("iU-)", 2..3, ast::ErrorKind::FlagDanglingNegation),
    ];
    for (pat, at, kind) in failures {
        assert_eq!(
            parser(pat).parse_flags().unwrap_err(),
            TestError { span: span(at), kind }
        );
    }
}
4114
// Exercises `parse_flag` on single characters: every recognized flag
// letter, plus an ASCII and a multi-byte character that are not flags.
#[test]
fn parse_flag() {
    // Each row is (input, expected flag).
    let recognized = vec![
        ("i", ast::Flag::CaseInsensitive),
        ("m", ast::Flag::MultiLine),
        ("s", ast::Flag::DotMatchesNewLine),
        ("U", ast::Flag::SwapGreed),
        ("u", ast::Flag::Unicode),
        ("x", ast::Flag::IgnoreWhitespace),
    ];
    for (pat, flag) in recognized {
        assert_eq!(parser(pat).parse_flag(), Ok(flag));
    }

    // Each row is (input, expected error span). The snowman is three bytes
    // long, so its span is built with `span_range` rather than `span`.
    let rejected = vec![
        ("a", span(0..1)),
        ("☃", span_range("☃", 0..3)),
    ];
    for (pat, at) in rejected {
        assert_eq!(
            parser(pat).parse_flag().unwrap_err(),
            TestError { span: at, kind: ast::ErrorKind::FlagUnrecognized }
        );
    }
}
4139
// Exercises `parse_primitive` on inputs that do not start with a
// backslash: the dot, the line anchors, and verbatim literals.
#[test]
fn parse_primitive_non_escape() {
    // A one-character assertion at the start of the pattern.
    fn anchor(kind: ast::AssertionKind) -> Primitive {
        Primitive::Assertion(ast::Assertion { span: span(0..1), kind })
    }
    // A verbatim literal `c` covering `at`.
    fn verbatim(c: char, at: ast::Span) -> Primitive {
        Primitive::Literal(ast::Literal {
            span: at,
            kind: ast::LiteralKind::Verbatim,
            c,
        })
    }

    assert_eq!(
        parser(r".").parse_primitive(),
        Ok(Primitive::Dot(span(0..1)))
    );
    assert_eq!(
        parser(r"^").parse_primitive(),
        Ok(anchor(ast::AssertionKind::StartLine))
    );
    assert_eq!(
        parser(r"$").parse_primitive(),
        Ok(anchor(ast::AssertionKind::EndLine))
    );

    // `|` is expected back as a plain literal from this entry point.
    let ascii = [(r"a", 'a'), (r"|", '|')];
    for &(pat, c) in ascii.iter() {
        assert_eq!(
            parser(pat).parse_primitive(),
            Ok(verbatim(c, span(0..1)))
        );
    }

    // A multi-byte literal: the span covers all three bytes.
    let snowman = r"☃";
    assert_eq!(
        parser(snowman).parse_primitive(),
        Ok(verbatim('☃', span_range("☃", 0..3)))
    );
}
4186
// Exercises two-character escapes: escaped punctuation, the special
// literals (`\a`, `\f`, ...), the escape-based assertions, and the errors
// for a dangling backslash and an unrecognized escape.
#[test]
fn parse_escape() {
    // A literal produced by a two-character escape at offset 0.
    fn escaped(kind: ast::LiteralKind, c: char) -> Primitive {
        Primitive::Literal(ast::Literal { span: span(0..2), kind, c })
    }

    assert_eq!(
        parser(r"\|").parse_primitive(),
        Ok(escaped(ast::LiteralKind::Punctuation, '|'))
    );

    // Each row is (pattern, expected character, special-literal kind).
    let specials = vec![
        (r"\a", '\x07', ast::SpecialLiteralKind::Bell),
        (r"\f", '\x0C', ast::SpecialLiteralKind::FormFeed),
        (r"\t", '\t', ast::SpecialLiteralKind::Tab),
        (r"\n", '\n', ast::SpecialLiteralKind::LineFeed),
        (r"\r", '\r', ast::SpecialLiteralKind::CarriageReturn),
        (r"\v", '\x0B', ast::SpecialLiteralKind::VerticalTab),
    ];
    for (pat, c, kind) in specials {
        assert_eq!(
            parser(pat).parse_primitive(),
            Ok(escaped(ast::LiteralKind::Special(kind), c))
        );
    }

    // Each row is (pattern, expected assertion kind).
    let assertions = vec![
        (r"\A", ast::AssertionKind::StartText),
        (r"\z", ast::AssertionKind::EndText),
        (r"\b", ast::AssertionKind::WordBoundary),
        (r"\B", ast::AssertionKind::NotWordBoundary),
    ];
    for (pat, kind) in assertions {
        assert_eq!(
            parser(pat).parse_primitive(),
            Ok(Primitive::Assertion(ast::Assertion {
                span: span(0..2),
                kind,
            }))
        );
    }

    // These go through `parse_escape` directly. Each row is
    // (pattern, error span, kind).
    let failures = vec![
        (r"\", 0..1, ast::ErrorKind::EscapeUnexpectedEof),
        (r"\y", 0..2, ast::ErrorKind::EscapeUnrecognized),
    ];
    for (pat, at, kind) in failures {
        assert_eq!(
            parser(pat).parse_escape().unwrap_err(),
            TestError { span: span(at), kind }
        );
    }
}
4259
/// Tests that a backslash followed by a decimal digit is rejected by
/// `parse_escape` with `UnsupportedBackreference`, with the error covering
/// the whole two-byte escape.
#[test]
fn parse_unsupported_backreference() {
    // The lowest and the highest single-digit forms.
    for &pat in &[r"\0", r"\9"] {
        assert_eq!(
            parser(pat).parse_escape().unwrap_err(),
            TestError {
                span: span(0..2),
                kind: ast::ErrorKind::UnsupportedBackreference,
            }
        );
    }
}
4277
/// Tests octal escapes using the `parser_octal` test helper (presumably a
/// parser configured to accept octal escapes -- confirm against the helper's
/// definition).
///
/// The assertions below establish that:
///
/// * every value from `\0` through `\777` round-trips to the corresponding
///   scalar value;
/// * an escape stops at the first non-octal digit (`\778` is `\77` then `8`);
/// * at most three digits are consumed (`\7777` is `\777` then `7`);
/// * a non-octal digit directly after the backslash is an unrecognized escape.
#[test]
fn parse_octal() {
    // The range is inclusive so that the largest three-digit escape, `\777`,
    // is exercised as well.
    for i in 0..=0o777 {
        let pat = format!(r"\{:o}", i);
        assert_eq!(
            parser_octal(&pat).parse_escape(),
            Ok(Primitive::Literal(ast::Literal {
                span: span(0..pat.len()),
                kind: ast::LiteralKind::Octal,
                c: ::std::char::from_u32(i).unwrap(),
            }))
        );
    }

    // `parse_escape` alone only consumes the octal prefix: `\77` is `?`.
    assert_eq!(
        parser_octal(r"\778").parse_escape(),
        Ok(Primitive::Literal(ast::Literal {
            span: span(0..3),
            kind: ast::LiteralKind::Octal,
            c: '?',
        }))
    );
    // Only the first three digits belong to the escape.
    assert_eq!(
        parser_octal(r"\7777").parse_escape(),
        Ok(Primitive::Literal(ast::Literal {
            span: span(0..4),
            kind: ast::LiteralKind::Octal,
            c: '\u{01FF}',
        }))
    );

    // A full parse turns whatever follows the escape into a verbatim literal.
    assert_eq!(
        parser_octal(r"\778").parse(),
        Ok(Ast::Concat(ast::Concat {
            span: span(0..4),
            asts: vec![
                Ast::Literal(ast::Literal {
                    span: span(0..3),
                    kind: ast::LiteralKind::Octal,
                    c: '?',
                }),
                Ast::Literal(ast::Literal {
                    span: span(3..4),
                    kind: ast::LiteralKind::Verbatim,
                    c: '8',
                }),
            ],
        }))
    );
    assert_eq!(
        parser_octal(r"\7777").parse(),
        Ok(Ast::Concat(ast::Concat {
            span: span(0..5),
            asts: vec![
                Ast::Literal(ast::Literal {
                    span: span(0..4),
                    kind: ast::LiteralKind::Octal,
                    c: '\u{01FF}',
                }),
                Ast::Literal(ast::Literal {
                    span: span(4..5),
                    kind: ast::LiteralKind::Verbatim,
                    c: '7',
                }),
            ],
        }))
    );

    // `8` is not an octal digit, so `\8` is not an octal escape at all.
    assert_eq!(
        parser_octal(r"\8").parse_escape().unwrap_err(),
        TestError {
            span: span(0..2),
            kind: ast::ErrorKind::EscapeUnrecognized,
        }
    );
}
4352
/// Tests the fixed-width `\xNN` escape: all 256 two-digit values parse to the
/// matching scalar value, and truncated or non-hex input is rejected.
#[test]
fn parse_hex_two() {
    // Asserts that `parse_escape` rejects `pat` with error `kind` at `at`.
    fn expect_error(pat: &str, at: Span, kind: ast::ErrorKind) {
        assert_eq!(
            parser(pat).parse_escape().unwrap_err(),
            TestError { span: at, kind }
        );
    }

    for value in 0..256 {
        let pat = format!(r"\x{:02x}", value);
        let want = ast::Literal {
            span: span(0..pat.len()),
            kind: ast::LiteralKind::HexFixed(ast::HexLiteralKind::X),
            c: ::std::char::from_u32(value).unwrap(),
        };
        assert_eq!(parser(&pat).parse_escape(), Ok(Primitive::Literal(want)));
    }

    // Input ends after a single digit.
    expect_error(r"\xF", span(3..3), ast::ErrorKind::EscapeUnexpectedEof);
    // A non-hex character in the first, then in the second digit position.
    expect_error(r"\xG", span(2..3), ast::ErrorKind::EscapeHexInvalidDigit);
    expect_error(r"\xFG", span(3..4), ast::ErrorKind::EscapeHexInvalidDigit);
}
4389
/// Tests the fixed-width `\uNNNN` escape: every four-digit value that is a
/// valid `char` parses to that `char`, while truncated input, non-hex digits
/// and a value that is not a valid `char` (`D800`) are rejected.
#[test]
fn parse_hex_four() {
    // Asserts that `parse_escape` rejects `pat` with error `kind` at `at`.
    fn expect_error(pat: &str, at: Span, kind: ast::ErrorKind) {
        assert_eq!(
            parser(pat).parse_escape().unwrap_err(),
            TestError { span: at, kind }
        );
    }

    // Values for which `char::from_u32` yields nothing are skipped.
    let scalars = (0..65536)
        .filter_map(|i| ::std::char::from_u32(i).map(|c| (i, c)));
    for (i, c) in scalars {
        let pat = format!(r"\u{:04x}", i);
        let want = ast::Literal {
            span: span(0..pat.len()),
            kind: ast::LiteralKind::HexFixed(
                ast::HexLiteralKind::UnicodeShort,
            ),
            c,
        };
        assert_eq!(parser(&pat).parse_escape(), Ok(Primitive::Literal(want)));
    }

    // Input ends before all four digits were seen.
    expect_error(r"\uF", span(3..3), ast::ErrorKind::EscapeUnexpectedEof);

    // A non-hex character in each of the four digit positions; the error
    // points at the offending character.
    let bad_digits = vec![
        (r"\uG", span(2..3)),
        (r"\uFG", span(3..4)),
        (r"\uFFG", span(4..5)),
        (r"\uFFFG", span(5..6)),
    ];
    for (pat, at) in bad_digits {
        expect_error(pat, at, ast::ErrorKind::EscapeHexInvalidDigit);
    }

    // Four well-formed digits whose value is rejected; the error covers the
    // digits only.
    expect_error(r"\uD800", span(2..6), ast::ErrorKind::EscapeHexInvalid);
}
4453
/// Tests the fixed-width `\UNNNNNNNN` escape: every value below 65536 that is
/// a valid `char` parses to that `char` when written with eight digits, and
/// truncated input or a non-hex character in any digit position is rejected.
#[test]
fn parse_hex_eight() {
    // Asserts that `parse_escape` rejects `pat` with error `kind` at `at`.
    fn expect_error(pat: &str, at: Span, kind: ast::ErrorKind) {
        assert_eq!(
            parser(pat).parse_escape().unwrap_err(),
            TestError { span: at, kind }
        );
    }

    // Values for which `char::from_u32` yields nothing are skipped.
    let scalars = (0..65536)
        .filter_map(|i| ::std::char::from_u32(i).map(|c| (i, c)));
    for (i, c) in scalars {
        let pat = format!(r"\U{:08x}", i);
        let want = ast::Literal {
            span: span(0..pat.len()),
            kind: ast::LiteralKind::HexFixed(
                ast::HexLiteralKind::UnicodeLong,
            ),
            c,
        };
        assert_eq!(parser(&pat).parse_escape(), Ok(Primitive::Literal(want)));
    }

    // Input ends before all eight digits were seen.
    expect_error(r"\UF", span(3..3), ast::ErrorKind::EscapeUnexpectedEof);

    // A non-hex character in each of the eight digit positions; the error
    // points at the offending character.
    let bad_digits = vec![
        (r"\UG", span(2..3)),
        (r"\UFG", span(3..4)),
        (r"\UFFG", span(4..5)),
        (r"\UFFFG", span(5..6)),
        (r"\UFFFFG", span(6..7)),
        (r"\UFFFFFG", span(7..8)),
        (r"\UFFFFFFG", span(8..9)),
        (r"\UFFFFFFFG", span(9..10)),
    ];
    for (pat, at) in bad_digits {
        expect_error(pat, at, ast::ErrorKind::EscapeHexInvalidDigit);
    }
}
4538
/// Tests brace-delimited hex escapes (`\x{...}`, `\u{...}`, `\U{...}`).
///
/// The successful cases show that all three prefixes accept the braced form,
/// that hex digits may be upper, lower or mixed case, and that `10FFFF` is
/// accepted. The failing cases cover truncated input, empty braces, a
/// non-hex digit and values that do not name a valid `char`.
#[test]
fn parse_hex_brace() {
    // Asserts that `pat` parses to a braced hex literal `c` spanning `at`.
    fn expect_literal(
        pat: &str,
        at: Span,
        prefix: ast::HexLiteralKind,
        c: char,
    ) {
        let want = ast::Literal {
            span: at,
            kind: ast::LiteralKind::HexBrace(prefix),
            c,
        };
        assert_eq!(parser(pat).parse_escape(), Ok(Primitive::Literal(want)));
    }

    // Asserts that `parse_escape` rejects `pat` with error `kind` at `at`.
    fn expect_error(pat: &str, at: Span, kind: ast::ErrorKind) {
        assert_eq!(
            parser(pat).parse_escape().unwrap_err(),
            TestError { span: at, kind }
        );
    }

    let accepted = vec![
        (r"\u{26c4}", span(0..8), ast::HexLiteralKind::UnicodeShort, '⛄'),
        (r"\U{26c4}", span(0..8), ast::HexLiteralKind::UnicodeLong, '⛄'),
        (r"\x{26c4}", span(0..8), ast::HexLiteralKind::X, '⛄'),
        (r"\x{26C4}", span(0..8), ast::HexLiteralKind::X, '⛄'),
        (r"\x{10fFfF}", span(0..10), ast::HexLiteralKind::X, '\u{10FFFF}'),
    ];
    for (pat, at, prefix, c) in accepted {
        expect_literal(pat, at, prefix, c);
    }

    // Input ends before the escape is complete.
    expect_error(r"\x", span(2..2), ast::ErrorKind::EscapeUnexpectedEof);
    expect_error(r"\x{", span(2..3), ast::ErrorKind::EscapeUnexpectedEof);
    expect_error(r"\x{FF", span(2..5), ast::ErrorKind::EscapeUnexpectedEof);

    // Braces with nothing between them; the error covers both braces.
    expect_error(r"\x{}", span(2..4), ast::ErrorKind::EscapeHexEmpty);

    // A non-hex character between the braces; the error points at it.
    expect_error(
        r"\x{FGF}",
        span(4..5),
        ast::ErrorKind::EscapeHexInvalidDigit,
    );

    // Well-formed digits whose value is rejected; the error covers the
    // digits only.
    expect_error(r"\x{FFFFFF}", span(3..9), ast::ErrorKind::EscapeHexInvalid);
    expect_error(r"\x{D800}", span(3..7), ast::ErrorKind::EscapeHexInvalid);
    expect_error(
        r"\x{FFFFFFFFF}",
        span(3..12),
        ast::ErrorKind::EscapeHexInvalid,
    );
}
4643
/// Tests `parse_decimal`: plain digit strings (including a leading zero)
/// yield their numeric value, input that does not start with a digit is
/// reported as empty, and a ten-digit run of nines is reported as invalid.
#[test]
fn parse_decimal() {
    // Asserts that `parse_decimal` rejects `pat` with error `kind` at `at`.
    fn expect_error(pat: &str, at: Span, kind: ast::ErrorKind) {
        assert_eq!(
            parser(pat).parse_decimal().unwrap_err(),
            TestError { span: at, kind }
        );
    }

    for &(pat, want) in &[("123", 123), ("0", 0), ("01", 1)] {
        assert_eq!(parser(pat).parse_decimal(), Ok(want));
    }

    // No digits at the current position.
    expect_error("-1", span(0..0), ast::ErrorKind::DecimalEmpty);
    expect_error("", span(0..0), ast::ErrorKind::DecimalEmpty);
    // Digits only, but the number is not accepted; the error covers all of
    // them.
    expect_error("9999999999", span(0..10), ast::ErrorKind::DecimalInvalid);
}
4666
/// Tests full parsing of bracketed character classes.
///
/// Covers ASCII classes, nested brackets, the `&&`, `--` and `~~` set
/// operators (including chaining and operands that are empty), literals,
/// escaped punctuation, `-` and `]` in positions where they are literal,
/// Unicode and Perl classes as items, ranges (including multi-byte
/// endpoints), and the errors for unclosed classes, invalid escapes and
/// invalid ranges.
#[test]
fn parse_set_class() {
    // ---- builders for the expected AST ----

    // A set consisting of exactly one item.
    fn itemset(item: ast::ClassSetItem) -> ast::ClassSet {
        ast::ClassSet::Item(item)
    }

    // A set that is the union of `items`.
    fn union(span: Span, items: Vec<ast::ClassSetItem>) -> ast::ClassSet {
        ast::ClassSet::union(ast::ClassSetUnion { span, items })
    }

    // A binary set operation of the given kind.
    fn binop(
        kind: ast::ClassSetBinaryOpKind,
        span: Span,
        lhs: ast::ClassSet,
        rhs: ast::ClassSet,
    ) -> ast::ClassSet {
        ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp {
            span,
            kind,
            lhs: Box::new(lhs),
            rhs: Box::new(rhs),
        })
    }

    fn intersection(
        span: Span,
        lhs: ast::ClassSet,
        rhs: ast::ClassSet,
    ) -> ast::ClassSet {
        binop(ast::ClassSetBinaryOpKind::Intersection, span, lhs, rhs)
    }

    fn difference(
        span: Span,
        lhs: ast::ClassSet,
        rhs: ast::ClassSet,
    ) -> ast::ClassSet {
        binop(ast::ClassSetBinaryOpKind::Difference, span, lhs, rhs)
    }

    fn symdifference(
        span: Span,
        lhs: ast::ClassSet,
        rhs: ast::ClassSet,
    ) -> ast::ClassSet {
        binop(ast::ClassSetBinaryOpKind::SymmetricDifference, span, lhs, rhs)
    }

    // A literal item of the given kind.
    fn literal(
        span: Span,
        kind: ast::LiteralKind,
        c: char,
    ) -> ast::ClassSetItem {
        ast::ClassSetItem::Literal(ast::Literal { span, kind, c })
    }

    // A character written as itself.
    fn lit(span: Span, c: char) -> ast::ClassSetItem {
        literal(span, ast::LiteralKind::Verbatim, c)
    }

    // A punctuation character written with a backslash.
    fn punct(span: Span, c: char) -> ast::ClassSetItem {
        literal(span, ast::LiteralKind::Punctuation, c)
    }

    fn empty(span: Span) -> ast::ClassSetItem {
        ast::ClassSetItem::Empty(span)
    }

    // A range `start-end` of verbatim literals occupying `span`. The spans
    // of the two endpoints are derived from `span`: the first literal ends
    // one character after the range starts and the second literal begins
    // one character before the range ends.
    fn range(span: Span, start: char, end: char) -> ast::ClassSetItem {
        let after_first = Position {
            offset: span.start.offset + start.len_utf8(),
            column: span.start.column + 1,
            ..span.start
        };
        let before_last = Position {
            offset: span.end.offset - end.len_utf8(),
            column: span.end.column - 1,
            ..span.end
        };
        let first = ast::Literal {
            span: Span { end: after_first, ..span },
            kind: ast::LiteralKind::Verbatim,
            c: start,
        };
        let last = ast::Literal {
            span: Span { start: before_last, ..span },
            kind: ast::LiteralKind::Verbatim,
            c: end,
        };
        ast::ClassSetItem::Range(ast::ClassSetRange {
            span,
            start: first,
            end: last,
        })
    }

    // A non-negated ASCII class item such as `[:alnum:]`.
    fn ascii(span: Span, kind: ast::ClassAsciiKind) -> ast::ClassSetItem {
        ast::ClassSetItem::Ascii(ast::ClassAscii {
            span,
            kind,
            negated: false,
        })
    }

    // The Perl class `\w` as an item.
    fn word(span: Span) -> ast::ClassSetItem {
        ast::ClassSetItem::Perl(ast::ClassPerl {
            span,
            kind: ast::ClassPerlKind::Word,
            negated: false,
        })
    }

    // A non-negated bracketed class.
    fn bracketed(span: Span, kind: ast::ClassSet) -> ast::ClassBracketed {
        ast::ClassBracketed { span, negated: false, kind }
    }

    // A non-negated bracketed class as a top-level AST.
    fn class(span: Span, kind: ast::ClassSet) -> Ast {
        Ast::Class(ast::Class::Bracketed(bracketed(span, kind)))
    }

    // ---- assertion helpers ----

    // Asserts that `pat` parses to a single non-negated bracketed class.
    fn expect_class(pat: &str, outer: Span, kind: ast::ClassSet) {
        assert_eq!(parser(pat).parse(), Ok(class(outer, kind)));
    }

    // Asserts that `pat` fails to parse with error `kind` at `at`.
    fn expect_error(pat: &str, at: Span, kind: ast::ErrorKind) {
        assert_eq!(
            parser(pat).parse().unwrap_err(),
            TestError { span: at, kind }
        );
    }

    // ---- ASCII classes, nesting and set operators ----

    expect_class(
        "[[:alnum:]]",
        span(0..11),
        itemset(ascii(span(1..10), ast::ClassAsciiKind::Alnum)),
    );
    expect_class(
        "[[[:alnum:]]]",
        span(0..13),
        itemset(ast::ClassSetItem::Bracketed(Box::new(bracketed(
            span(1..12),
            itemset(ascii(span(2..11), ast::ClassAsciiKind::Alnum)),
        )))),
    );
    expect_class(
        "[[:alnum:]&&[:lower:]]",
        span(0..22),
        intersection(
            span(1..21),
            itemset(ascii(span(1..10), ast::ClassAsciiKind::Alnum)),
            itemset(ascii(span(12..21), ast::ClassAsciiKind::Lower)),
        ),
    );
    expect_class(
        "[[:alnum:]--[:lower:]]",
        span(0..22),
        difference(
            span(1..21),
            itemset(ascii(span(1..10), ast::ClassAsciiKind::Alnum)),
            itemset(ascii(span(12..21), ast::ClassAsciiKind::Lower)),
        ),
    );
    expect_class(
        "[[:alnum:]~~[:lower:]]",
        span(0..22),
        symdifference(
            span(1..21),
            itemset(ascii(span(1..10), ast::ClassAsciiKind::Alnum)),
            itemset(ascii(span(12..21), ast::ClassAsciiKind::Lower)),
        ),
    );

    // ---- literals and class escapes as items ----

    expect_class("[a]", span(0..3), itemset(lit(span(1..2), 'a')));
    expect_class(
        r"[a\]]",
        span(0..5),
        union(
            span(1..4),
            vec![lit(span(1..2), 'a'), punct(span(2..4), ']')],
        ),
    );
    expect_class(
        r"[a\-z]",
        span(0..6),
        union(
            span(1..5),
            vec![
                lit(span(1..2), 'a'),
                punct(span(2..4), '-'),
                lit(span(4..5), 'z'),
            ],
        ),
    );
    expect_class(
        "[ab]",
        span(0..4),
        union(span(1..3), vec![lit(span(1..2), 'a'), lit(span(2..3), 'b')]),
    );
    // A `-` next to either bracket is a literal, not a range operator.
    expect_class(
        "[a-]",
        span(0..4),
        union(span(1..3), vec![lit(span(1..2), 'a'), lit(span(2..3), '-')]),
    );
    expect_class(
        "[-a]",
        span(0..4),
        union(span(1..3), vec![lit(span(1..2), '-'), lit(span(2..3), 'a')]),
    );
    expect_class(
        r"[\pL]",
        span(0..5),
        itemset(ast::ClassSetItem::Unicode(ast::ClassUnicode {
            span: span(1..4),
            negated: false,
            kind: ast::ClassUnicodeKind::OneLetter('L'),
        })),
    );
    expect_class(r"[\w]", span(0..4), itemset(word(span(1..3))));
    expect_class(
        r"[a\wz]",
        span(0..6),
        union(
            span(1..5),
            vec![
                lit(span(1..2), 'a'),
                word(span(2..4)),
                lit(span(4..5), 'z'),
            ],
        ),
    );

    // ---- ranges, alone and combined with operators ----

    expect_class("[a-z]", span(0..5), itemset(range(span(1..4), 'a', 'z')));
    expect_class(
        "[a-cx-z]",
        span(0..8),
        union(
            span(1..7),
            vec![range(span(1..4), 'a', 'c'), range(span(4..7), 'x', 'z')],
        ),
    );
    expect_class(
        r"[\w&&a-cx-z]",
        span(0..12),
        intersection(
            span(1..11),
            itemset(word(span(1..3))),
            union(
                span(5..11),
                vec![
                    range(span(5..8), 'a', 'c'),
                    range(span(8..11), 'x', 'z'),
                ],
            ),
        ),
    );
    expect_class(
        r"[a-cx-z&&\w]",
        span(0..12),
        intersection(
            span(1..11),
            union(
                span(1..7),
                vec![
                    range(span(1..4), 'a', 'c'),
                    range(span(4..7), 'x', 'z'),
                ],
            ),
            itemset(word(span(9..11))),
        ),
    );

    // Chained operators nest to the left.
    expect_class(
        r"[a--b--c]",
        span(0..9),
        difference(
            span(1..8),
            difference(
                span(1..5),
                itemset(lit(span(1..2), 'a')),
                itemset(lit(span(4..5), 'b')),
            ),
            itemset(lit(span(7..8), 'c')),
        ),
    );
    expect_class(
        r"[a~~b~~c]",
        span(0..9),
        symdifference(
            span(1..8),
            symdifference(
                span(1..5),
                itemset(lit(span(1..2), 'a')),
                itemset(lit(span(4..5), 'b')),
            ),
            itemset(lit(span(7..8), 'c')),
        ),
    );

    // `^` and `&` as operands of `&&`, and `&&` with no operands at all.
    expect_class(
        r"[\^&&^]",
        span(0..7),
        intersection(
            span(1..6),
            itemset(punct(span(1..3), '^')),
            itemset(lit(span(5..6), '^')),
        ),
    );
    expect_class(
        r"[\&&&&]",
        span(0..7),
        intersection(
            span(1..6),
            itemset(punct(span(1..3), '&')),
            itemset(lit(span(5..6), '&')),
        ),
    );
    expect_class(
        r"[&&&&]",
        span(0..6),
        intersection(
            span(1..5),
            intersection(
                span(1..3),
                itemset(empty(span(1..1))),
                itemset(empty(span(3..3))),
            ),
            itemset(empty(span(5..5))),
        ),
    );

    // A range whose endpoints are multi-byte characters. The expected spans
    // are computed from the pattern text via `span_range` rather than with
    // the `range` builder above.
    let pat = "[☃-⛄]";
    expect_class(
        pat,
        span_range(pat, 0..9),
        itemset(ast::ClassSetItem::Range(ast::ClassSetRange {
            span: span_range(pat, 1..8),
            start: ast::Literal {
                span: span_range(pat, 1..4),
                kind: ast::LiteralKind::Verbatim,
                c: '☃',
            },
            end: ast::Literal {
                span: span_range(pat, 5..8),
                kind: ast::LiteralKind::Verbatim,
                c: '⛄',
            },
        })),
    );

    // ---- `]` and `[` in unusual positions ----

    // A `]` directly after the opening bracket is a literal.
    expect_class(r"[]]", span(0..3), itemset(lit(span(1..2), ']')));
    expect_class(
        r"[]\[]",
        span(0..5),
        union(
            span(1..4),
            vec![lit(span(1..2), ']'), punct(span(2..4), '[')],
        ),
    );
    // Here the first `]` closes the class and the second is a plain literal
    // following it.
    assert_eq!(
        parser(r"[\[]]").parse(),
        Ok(concat(
            0..5,
            vec![
                class(span(0..4), itemset(punct(span(1..3), '['))),
                Ast::Literal(ast::Literal {
                    span: span(4..5),
                    kind: ast::LiteralKind::Verbatim,
                    c: ']',
                }),
            ]
        ))
    );

    // ---- errors ----

    expect_error("[", span(0..1), ast::ErrorKind::ClassUnclosed);
    expect_error("[[", span(1..2), ast::ErrorKind::ClassUnclosed);
    expect_error("[[-]", span(0..1), ast::ErrorKind::ClassUnclosed);
    expect_error("[[[:alnum:]", span(1..2), ast::ErrorKind::ClassUnclosed);
    expect_error(r"[\b]", span(1..3), ast::ErrorKind::ClassEscapeInvalid);
    expect_error(r"[\w-a]", span(1..3), ast::ErrorKind::ClassRangeLiteral);
    expect_error(r"[a-\w]", span(3..5), ast::ErrorKind::ClassRangeLiteral);
    expect_error(r"[z-a]", span(1..4), ast::ErrorKind::ClassRangeInvalid);

    // Unclosed classes that end in whitespace, parsed with the
    // `parser_ignore_whitespace` test parser.
    assert_eq!(
        parser_ignore_whitespace("[a ").parse().unwrap_err(),
        TestError {
            span: span(0..1),
            kind: ast::ErrorKind::ClassUnclosed,
        }
    );
    assert_eq!(
        parser_ignore_whitespace("[a- ").parse().unwrap_err(),
        TestError {
            span: span(0..1),
            kind: ast::ErrorKind::ClassUnclosed,
        }
    );
}
5260
/// Tests `parse_set_class_open`, which parses the opening of a bracketed
/// class and returns a `(ClassBracketed, ClassSetUnion)` pair.
///
/// As the expected values below show, the returned class spans everything
/// consumed so far and always carries an empty placeholder union positioned
/// where the class items begin, while the returned union holds any leading
/// `-` or `]` characters that were taken as literals.
///
/// Several cases use `parser_ignore_whitespace`; in those, the amount of
/// whitespace in the pattern is significant for the expected spans, so the
/// pattern strings must not be reformatted.
#[test]
fn parse_set_class_open() {
    // A verbatim literal item.
    fn lit(span: Span, c: char) -> ast::ClassSetItem {
        ast::ClassSetItem::Literal(ast::Literal {
            span,
            kind: ast::LiteralKind::Verbatim,
            c,
        })
    }

    // Builds the expected successful result.
    //
    // `set_span` is the span of the opened class, `items_start` is the
    // (empty) span of the placeholder union stored in the class, and
    // `union_span`/`items` describe the union of literals consumed so far.
    fn opened(
        set_span: Span,
        negated: bool,
        items_start: Span,
        union_span: Span,
        items: Vec<ast::ClassSetItem>,
    ) -> (ast::ClassBracketed, ast::ClassSetUnion) {
        let set = ast::ClassBracketed {
            span: set_span,
            negated,
            kind: ast::ClassSet::union(ast::ClassSetUnion {
                span: items_start,
                items: vec![],
            }),
        };
        let union = ast::ClassSetUnion { span: union_span, items };
        (set, union)
    }

    // Asserts that opening `pat` fails with `ClassUnclosed` at `at`.
    fn expect_unclosed(pat: &str, at: Span) {
        assert_eq!(
            parser(pat).parse_set_class_open().unwrap_err(),
            TestError { span: at, kind: ast::ErrorKind::ClassUnclosed }
        );
    }

    // ---- plain and negated openings ----

    assert_eq!(
        parser("[a]").parse_set_class_open(),
        Ok(opened(span(0..1), false, span(1..1), span(1..1), vec![]))
    );
    // Three spaces follow `[`, so `a` sits at offset 4; that is what the
    // expected spans encode.
    assert_eq!(
        parser_ignore_whitespace("[   a]").parse_set_class_open(),
        Ok(opened(span(0..4), false, span(4..4), span(4..4), vec![]))
    );
    assert_eq!(
        parser("[^a]").parse_set_class_open(),
        Ok(opened(span(0..2), true, span(2..2), span(2..2), vec![]))
    );
    assert_eq!(
        parser_ignore_whitespace("[ ^ a]").parse_set_class_open(),
        Ok(opened(span(0..4), true, span(4..4), span(4..4), vec![]))
    );

    // ---- leading `-` is taken as a literal ----

    assert_eq!(
        parser("[-a]").parse_set_class_open(),
        Ok(opened(
            span(0..2),
            false,
            span(1..1),
            span(1..2),
            vec![lit(span(1..2), '-')],
        ))
    );
    assert_eq!(
        parser_ignore_whitespace("[ - a]").parse_set_class_open(),
        Ok(opened(
            span(0..4),
            false,
            span(2..2),
            span(2..3),
            vec![lit(span(2..3), '-')],
        ))
    );
    assert_eq!(
        parser("[^-a]").parse_set_class_open(),
        Ok(opened(
            span(0..3),
            true,
            span(2..2),
            span(2..3),
            vec![lit(span(2..3), '-')],
        ))
    );
    assert_eq!(
        parser("[--a]").parse_set_class_open(),
        Ok(opened(
            span(0..3),
            false,
            span(1..1),
            span(1..3),
            vec![lit(span(1..2), '-'), lit(span(2..3), '-')],
        ))
    );

    // ---- leading `]` is taken as a literal ----

    assert_eq!(
        parser("[]a]").parse_set_class_open(),
        Ok(opened(
            span(0..2),
            false,
            span(1..1),
            span(1..2),
            vec![lit(span(1..2), ']')],
        ))
    );
    assert_eq!(
        parser_ignore_whitespace("[ ] a]").parse_set_class_open(),
        Ok(opened(
            span(0..4),
            false,
            span(2..2),
            span(2..3),
            vec![lit(span(2..3), ']')],
        ))
    );
    assert_eq!(
        parser("[^]a]").parse_set_class_open(),
        Ok(opened(
            span(0..3),
            true,
            span(2..2),
            span(2..3),
            vec![lit(span(2..3), ']')],
        ))
    );
    // After a leading `-`, a following `]` is not consumed as a literal.
    assert_eq!(
        parser("[-]a]").parse_set_class_open(),
        Ok(opened(
            span(0..2),
            false,
            span(1..1),
            span(1..2),
            vec![lit(span(1..2), '-')],
        ))
    );

    // ---- input ends while the class is still being opened ----

    expect_unclosed("[", span(0..1));
    // Four spaces follow `[`, so the input ends at offset 5; that is what
    // the expected span encodes.
    assert_eq!(
        parser_ignore_whitespace("[    ")
            .parse_set_class_open()
            .unwrap_err(),
        TestError {
            span: span(0..5),
            kind: ast::ErrorKind::ClassUnclosed,
        }
    );
    expect_unclosed("[^", span(0..2));
    expect_unclosed("[]", span(0..2));
    expect_unclosed("[-", span(0..2));
    expect_unclosed("[--", span(0..3));
}
5530
/// Tests `maybe_parse_ascii_class`: a well-formed `[:name:]` (optionally
/// negated with `^` after the first colon, optionally followed by more
/// input) yields the class, while malformed input yields `None` and leaves
/// the parser at offset 0.
#[test]
fn maybe_parse_ascii_class() {
    // Asserts that `pat` starts with an `alnum` ASCII class spanning `at`.
    fn expect_alnum(pat: &str, at: Span, negated: bool) {
        assert_eq!(
            parser(pat).maybe_parse_ascii_class(),
            Some(ast::ClassAscii {
                span: at,
                kind: ast::ClassAsciiKind::Alnum,
                negated,
            })
        );
    }

    expect_alnum(r"[:alnum:]", span(0..9), false);
    // Trailing input is not part of the class.
    expect_alnum(r"[:alnum:]A", span(0..9), false);
    expect_alnum(r"[:^alnum:]", span(0..10), true);

    // Truncated openings, a misplaced `^`, an unknown class name and missing
    // closing delimiters: none is a class, and the offset is still 0 after
    // each attempt.
    let rejected = [
        r"[:",
        r"[:^",
        r"[^:alnum:]",
        r"[:alnnum:]",
        r"[:alnum]",
        r"[:alnum:",
    ];
    for &pat in &rejected {
        let p = parser(pat);
        assert_eq!(p.maybe_parse_ascii_class(), None);
        assert_eq!(p.offset(), 0);
    }
}
5582
#[test]
fn parse_unicode_class() {
    // Expected primitive for a Unicode class spanning `0..end`.
    let unicode =
        |end: usize, negated: bool, kind: ast::ClassUnicodeKind| {
            Primitive::Unicode(ast::ClassUnicode {
                span: span(0..end),
                negated,
                kind,
            })
        };
    // Expected kind for the `name<op>value` forms inside `\p{...}`.
    let named_value =
        |op: ast::ClassUnicodeOpKind, name: &str, value: &str| {
            ast::ClassUnicodeKind::NamedValue {
                op,
                name: s(name),
                value: s(value),
            }
        };

    // Successful escapes: (pattern, end offset, negated, expected kind).
    let accepted = vec![
        // One-letter and braced names, plain and negated.
        (r"\pN", 3, false, ast::ClassUnicodeKind::OneLetter('N')),
        (r"\PN", 3, true, ast::ClassUnicodeKind::OneLetter('N')),
        (r"\p{N}", 5, false, ast::ClassUnicodeKind::Named(s("N"))),
        (r"\P{N}", 5, true, ast::ClassUnicodeKind::Named(s("N"))),
        (
            r"\p{Greek}",
            9,
            false,
            ast::ClassUnicodeKind::Named(s("Greek")),
        ),
        // `name:value`, `name=value` and `name!=value`.
        (
            r"\p{scx:Katakana}",
            16,
            false,
            named_value(ast::ClassUnicodeOpKind::Colon, "scx", "Katakana"),
        ),
        (
            r"\p{scx=Katakana}",
            16,
            false,
            named_value(ast::ClassUnicodeOpKind::Equal, "scx", "Katakana"),
        ),
        (
            r"\p{scx!=Katakana}",
            17,
            false,
            named_value(
                ast::ClassUnicodeOpKind::NotEqual,
                "scx",
                "Katakana",
            ),
        ),
        // The same operators with an empty name and an empty value.
        (
            r"\p{:}",
            5,
            false,
            named_value(ast::ClassUnicodeOpKind::Colon, "", ""),
        ),
        (
            r"\p{=}",
            5,
            false,
            named_value(ast::ClassUnicodeOpKind::Equal, "", ""),
        ),
        (
            r"\p{!=}",
            6,
            false,
            named_value(ast::ClassUnicodeOpKind::NotEqual, "", ""),
        ),
    ];
    for (pattern, end, negated, kind) in accepted {
        assert_eq!(
            parser(pattern).parse_escape(),
            Ok(unicode(end, negated, kind))
        );
    }

    // Truncated escapes: the error is an empty span at the end of input.
    let truncated: &[(&str, usize)] =
        &[(r"\p", 2), (r"\p{", 3), (r"\p{N", 4), (r"\p{Greek", 8)];
    for &(pattern, at) in truncated {
        assert_eq!(
            parser(pattern).parse_escape().unwrap_err(),
            TestError {
                span: span(at..at),
                kind: ast::ErrorKind::EscapeUnexpectedEof,
            }
        );
    }

    // A full parse of a non-negated class immediately followed by a
    // literal `z`; `class_end` is the offset at which the class ends.
    let followed_by_z = |class_end: usize, kind: ast::ClassUnicodeKind| {
        let total = class_end + 1;
        Ast::Concat(ast::Concat {
            span: span(0..total),
            asts: vec![
                Ast::Class(ast::Class::Unicode(ast::ClassUnicode {
                    span: span(0..class_end),
                    negated: false,
                    kind,
                })),
                Ast::Literal(ast::Literal {
                    span: span(class_end..total),
                    kind: ast::LiteralKind::Verbatim,
                    c: 'z',
                }),
            ],
        })
    };
    assert_eq!(
        parser(r"\pNz").parse(),
        Ok(followed_by_z(3, ast::ClassUnicodeKind::OneLetter('N')))
    );
    assert_eq!(
        parser(r"\p{Greek}z").parse(),
        Ok(followed_by_z(9, ast::ClassUnicodeKind::Named(s("Greek"))))
    );

    // A backslash directly after `\p` / `\P` is rejected, with the error
    // reported on that backslash.
    for &pattern in &[r"\p\{", r"\P\{"] {
        assert_eq!(
            parser(pattern).parse().unwrap_err(),
            TestError {
                span: span(2..3),
                kind: ast::ErrorKind::UnicodeClassInvalid,
            }
        );
    }
}
5780
#[test]
fn parse_perl_class() {
    // Every Perl class escape exercised here is two bytes long and sits at
    // the start of the pattern, so the span is always `0..2`.
    let perl = |kind: ast::ClassPerlKind, negated: bool| ast::ClassPerl {
        span: span(0..2),
        kind,
        negated,
    };

    // (pattern, expected kind, negated): the upper-case escape is the
    // negated form of the lower-case one.
    let escapes = vec![
        (r"\d", ast::ClassPerlKind::Digit, false),
        (r"\D", ast::ClassPerlKind::Digit, true),
        (r"\s", ast::ClassPerlKind::Space, false),
        (r"\S", ast::ClassPerlKind::Space, true),
        (r"\w", ast::ClassPerlKind::Word, false),
        (r"\W", ast::ClassPerlKind::Word, true),
    ];
    for (pattern, kind, negated) in escapes {
        assert_eq!(
            parser(pattern).parse_escape(),
            Ok(Primitive::Perl(perl(kind, negated)))
        );
    }

    // Through the full parser: `\d` alone, then `\d` followed by a literal.
    let digit = || {
        Ast::Class(ast::Class::Perl(perl(ast::ClassPerlKind::Digit, false)))
    };
    assert_eq!(parser(r"\d").parse(), Ok(digit()));

    let trailing_z = Ast::Literal(ast::Literal {
        span: span(2..3),
        kind: ast::LiteralKind::Verbatim,
        c: 'z',
    });
    assert_eq!(
        parser(r"\dz").parse(),
        Ok(Ast::Concat(ast::Concat {
            span: span(0..3),
            asts: vec![digit(), trailing_z],
        }))
    );
}
5859
// Regression test for a bug where the nest limit checker wasn't
// decrementing its depth during post-traversal, which caused long regexes
// to trip the default limit too aggressively.
//
// The pattern below is long (many alternation branches) but, as written,
// its groups are only nested two levels deep. It is parsed with a nest
// limit of 50 and must be accepted.
#[test]
fn regression_454_nest_too_big() {
    // The interior of this raw string, line breaks included, is the
    // pattern handed to the parser; do not reformat it.
    let pattern = r#"
2(?:
[45]\d{3}|
7(?:
1[0-267]|
2[0-289]|
3[0-29]|
4[01]|
5[1-3]|
6[013]|
7[0178]|
91
)|
8(?:
0[125]|
[139][1-6]|
2[0157-9]|
41|
6[1-35]|
7[1-5]|
8[1-8]|
90
)|
9(?:
0[0-2]|
1[0-4]|
2[568]|
3[3-6]|
5[5-7]|
6[0167]|
7[15]|
8[0146-9]
)
)\d{4}
"#;
    // Only success is asserted; the shape of the resulting AST is not
    // checked here.
    assert!(parser_nest_limit(pattern, 50).parse().is_ok());
}
5902
// Regression test checking that a trailing `-` in a character class is
// treated as a literal `-` even when whitespace mode (`(?x)`) is enabled
// and there is whitespace after the trailing `-`.
//
// Only the success or failure of the parse is asserted, not the AST.
#[test]
fn regression_455_trailing_dash_ignore_whitespace() {
    // Closed classes: the `-` is followed only by whitespace before the
    // closing `]`, so each of these must parse.
    assert!(parser("(?x)[ / - ]").parse().is_ok());
    assert!(parser("(?x)[ a - ]").parse().is_ok());
    // Same, with the class contents spread over several lines.
    assert!(parser(
        "(?x)[
a
- ]
"
    )
    .parse()
    .is_ok());
    // Same, with `# wat` text between the item and the trailing `-`
    // (presumably consumed as a comment under `(?x)` -- the test only
    // requires that the parse succeeds).
    assert!(parser(
        "(?x)[
a # wat
- ]
"
    )
    .parse()
    .is_ok());

    // Unclosed classes: without a closing `]`, the trailing `-` (with or
    // without whitespace, newlines or `# wat` text after it) must still
    // produce an error.
    assert!(parser("(?x)[ / -").parse().is_err());
    assert!(parser("(?x)[ / - ").parse().is_err());
    assert!(parser(
        "(?x)[
/ -
"
    )
    .parse()
    .is_err());
    assert!(parser(
        "(?x)[
/ - # wat
"
    )
    .parse()
    .is_err());
}
5944 }
5945