1 // Copyright 2018 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
4 //
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
10
11 /*!
12 This module provides a regular expression parser.
13 */
14
15 use std::borrow::Borrow;
16 use std::cell::{Cell, RefCell};
17 use std::mem;
18 use std::result;
19
20 use ast::{self, Ast, Position, Span};
21 use either::Either;
22
23 use is_meta_character;
24
25 type Result<T> = result::Result<T, ast::Error>;
26
27 /// A primitive is an expression with no sub-expressions. This includes
28 /// literals, assertions and non-set character classes. This representation
29 /// is used as intermediate state in the parser.
30 ///
31 /// This does not include ASCII character classes, since they can only appear
32 /// within a set character class.
33 #[derive(Clone, Debug, Eq, PartialEq)]
34 enum Primitive {
35 Literal(ast::Literal),
36 Assertion(ast::Assertion),
37 Dot(Span),
38 Perl(ast::ClassPerl),
39 Unicode(ast::ClassUnicode),
40 }
41
42 impl Primitive {
43 /// Return the span of this primitive.
span(&self) -> &Span44 fn span(&self) -> &Span {
45 match *self {
46 Primitive::Literal(ref x) => &x.span,
47 Primitive::Assertion(ref x) => &x.span,
48 Primitive::Dot(ref span) => span,
49 Primitive::Perl(ref x) => &x.span,
50 Primitive::Unicode(ref x) => &x.span,
51 }
52 }
53
54 /// Convert this primitive into a proper AST.
into_ast(self) -> Ast55 fn into_ast(self) -> Ast {
56 match self {
57 Primitive::Literal(lit) => Ast::Literal(lit),
58 Primitive::Assertion(assert) => Ast::Assertion(assert),
59 Primitive::Dot(span) => Ast::Dot(span),
60 Primitive::Perl(cls) => Ast::Class(ast::Class::Perl(cls)),
61 Primitive::Unicode(cls) => Ast::Class(ast::Class::Unicode(cls)),
62 }
63 }
64
65 /// Convert this primitive into an item in a character class.
66 ///
67 /// If this primitive is not a legal item (i.e., an assertion or a dot),
68 /// then return an error.
into_class_set_item<P: Borrow<Parser>>( self, p: &ParserI<P>, ) -> Result<ast::ClassSetItem>69 fn into_class_set_item<P: Borrow<Parser>>(
70 self,
71 p: &ParserI<P>,
72 ) -> Result<ast::ClassSetItem> {
73 use ast::ClassSetItem;
74 use self::Primitive::*;
75
76 match self {
77 Literal(lit) => Ok(ClassSetItem::Literal(lit)),
78 Perl(cls) => Ok(ClassSetItem::Perl(cls)),
79 Unicode(cls) => Ok(ClassSetItem::Unicode(cls)),
80 x => Err(p.error(*x.span(), ast::ErrorKind::ClassEscapeInvalid)),
81 }
82 }
83
84 /// Convert this primitive into a literal in a character class. In
85 /// particular, literals are the only valid items that can appear in
86 /// ranges.
87 ///
88 /// If this primitive is not a legal item (i.e., a class, assertion or a
89 /// dot), then return an error.
into_class_literal<P: Borrow<Parser>>( self, p: &ParserI<P>, ) -> Result<ast::Literal>90 fn into_class_literal<P: Borrow<Parser>>(
91 self,
92 p: &ParserI<P>,
93 ) -> Result<ast::Literal> {
94 use self::Primitive::*;
95
96 match self {
97 Literal(lit) => Ok(lit),
98 x => Err(p.error(*x.span(), ast::ErrorKind::ClassRangeLiteral)),
99 }
100 }
101 }
102
103 /// Returns true if the given character is a hexadecimal digit.
is_hex(c: char) -> bool104 fn is_hex(c: char) -> bool {
105 ('0' <= c && c <= '9') || ('a' <= c && c <= 'f') || ('A' <= c && c <= 'F')
106 }
107
108 /// Returns true if the given character is a valid in a capture group name.
109 ///
110 /// If `first` is true, then `c` is treated as the first character in the
111 /// group name (which is not allowed to be a digit).
is_capture_char(c: char, first: bool) -> bool112 fn is_capture_char(c: char, first: bool) -> bool {
113 c == '_' || (!first && c >= '0' && c <= '9')
114 || (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z')
115 }
116
117 /// A builder for a regular expression parser.
118 ///
119 /// This builder permits modifying configuration options for the parser.
120 #[derive(Clone, Debug)]
121 pub struct ParserBuilder {
122 ignore_whitespace: bool,
123 nest_limit: u32,
124 octal: bool,
125 }
126
127 impl Default for ParserBuilder {
default() -> ParserBuilder128 fn default() -> ParserBuilder {
129 ParserBuilder::new()
130 }
131 }
132
133 impl ParserBuilder {
134 /// Create a new parser builder with a default configuration.
new() -> ParserBuilder135 pub fn new() -> ParserBuilder {
136 ParserBuilder {
137 ignore_whitespace: false,
138 nest_limit: 250,
139 octal: false,
140 }
141 }
142
143 /// Build a parser from this configuration with the given pattern.
build(&self) -> Parser144 pub fn build(&self) -> Parser {
145 Parser {
146 pos: Cell::new(Position { offset: 0, line: 1, column: 1 }),
147 capture_index: Cell::new(0),
148 nest_limit: self.nest_limit,
149 octal: self.octal,
150 initial_ignore_whitespace: self.ignore_whitespace,
151 ignore_whitespace: Cell::new(self.ignore_whitespace),
152 comments: RefCell::new(vec![]),
153 stack_group: RefCell::new(vec![]),
154 stack_class: RefCell::new(vec![]),
155 capture_names: RefCell::new(vec![]),
156 scratch: RefCell::new(String::new()),
157 }
158 }
159
160 /// Set the nesting limit for this parser.
161 ///
162 /// The nesting limit controls how deep the abstract syntax tree is allowed
163 /// to be. If the AST exceeds the given limit (e.g., with too many nested
164 /// groups), then an error is returned by the parser.
165 ///
166 /// The purpose of this limit is to act as a heuristic to prevent stack
167 /// overflow for consumers that do structural induction on an `Ast` using
168 /// explicit recursion. While this crate never does this (instead using
169 /// constant stack space and moving the call stack to the heap), other
170 /// crates may.
171 ///
172 /// This limit is not checked until the entire Ast is parsed. Therefore,
173 /// if callers want to put a limit on the amount of heap space used, then
174 /// they should impose a limit on the length, in bytes, of the concrete
175 /// pattern string. In particular, this is viable since this parser
176 /// implementation will limit itself to heap space proportional to the
177 /// lenth of the pattern string.
178 ///
179 /// Note that a nest limit of `0` will return a nest limit error for most
180 /// patterns but not all. For example, a nest limit of `0` permits `a` but
181 /// not `ab`, since `ab` requires a concatenation, which results in a nest
182 /// depth of `1`. In general, a nest limit is not something that manifests
183 /// in an obvious way in the concrete syntax, therefore, it should not be
184 /// used in a granular way.
nest_limit(&mut self, limit: u32) -> &mut ParserBuilder185 pub fn nest_limit(&mut self, limit: u32) -> &mut ParserBuilder {
186 self.nest_limit = limit;
187 self
188 }
189
190 /// Whether to support octal syntax or not.
191 ///
192 /// Octal syntax is a little-known way of uttering Unicode codepoints in
193 /// a regular expression. For example, `a`, `\x61`, `\u0061` and
194 /// `\141` are all equivalent regular expressions, where the last example
195 /// shows octal syntax.
196 ///
197 /// While supporting octal syntax isn't in and of itself a problem, it does
198 /// make good error messages harder. That is, in PCRE based regex engines,
199 /// syntax like `\0` invokes a backreference, which is explicitly
200 /// unsupported in Rust's regex engine. However, many users expect it to
201 /// be supported. Therefore, when octal support is disabled, the error
202 /// message will explicitly mention that backreferences aren't supported.
203 ///
204 /// Octal syntax is disabled by default.
octal(&mut self, yes: bool) -> &mut ParserBuilder205 pub fn octal(&mut self, yes: bool) -> &mut ParserBuilder {
206 self.octal = yes;
207 self
208 }
209
210 /// Enable verbose mode in the regular expression.
211 ///
212 /// When enabled, verbose mode permits insigificant whitespace in many
213 /// places in the regular expression, as well as comments. Comments are
214 /// started using `#` and continue until the end of the line.
215 ///
216 /// By default, this is disabled. It may be selectively enabled in the
217 /// regular expression by using the `x` flag regardless of this setting.
ignore_whitespace(&mut self, yes: bool) -> &mut ParserBuilder218 pub fn ignore_whitespace(&mut self, yes: bool) -> &mut ParserBuilder {
219 self.ignore_whitespace = yes;
220 self
221 }
222 }
223
224 /// A regular expression parser.
225 ///
226 /// This parses a string representation of a regular expression into an
227 /// abstract syntax tree. The size of the tree is proportional to the length
228 /// of the regular expression pattern.
229 ///
230 /// A `Parser` can be configured in more detail via a
231 /// [`ParserBuilder`](struct.ParserBuilder.html).
232 #[derive(Clone, Debug)]
233 pub struct Parser {
234 /// The current position of the parser.
235 pos: Cell<Position>,
236 /// The current capture index.
237 capture_index: Cell<u32>,
238 /// The maximum number of open parens/brackets allowed. If the parser
239 /// exceeds this number, then an error is returned.
240 nest_limit: u32,
241 /// Whether to support octal syntax or not. When `false`, the parser will
242 /// return an error helpfully pointing out that backreferences are not
243 /// supported.
244 octal: bool,
245 /// The initial setting for `ignore_whitespace` as provided by
246 /// Th`ParserBuilder`. is is used when reseting the parser's state.
247 initial_ignore_whitespace: bool,
248 /// Whether whitespace should be ignored. When enabled, comments are
249 /// also permitted.
250 ignore_whitespace: Cell<bool>,
251 /// A list of comments, in order of appearance.
252 comments: RefCell<Vec<ast::Comment>>,
253 /// A stack of grouped sub-expressions, including alternations.
254 stack_group: RefCell<Vec<GroupState>>,
255 /// A stack of nested character classes. This is only non-empty when
256 /// parsing a class.
257 stack_class: RefCell<Vec<ClassState>>,
258 /// A sorted sequence of capture names. This is used to detect duplicate
259 /// capture names and report an error if one is detected.
260 capture_names: RefCell<Vec<ast::CaptureName>>,
261 /// A scratch buffer used in various places. Mostly this is used to
262 /// accumulate relevant characters from parts of a pattern.
263 scratch: RefCell<String>,
264 }
265
266 /// ParserI is the internal parser implementation.
267 ///
268 /// We use this separate type so that we can carry the provided pattern string
269 /// along with us. In particular, a `Parser` internal state is not tied to any
270 /// one pattern, but `ParserI` is.
271 ///
272 /// This type also lets us use `ParserI<&Parser>` in production code while
273 /// retaining the convenience of `ParserI<Parser>` for tests, which sometimes
274 /// work against the internal interface of the parser.
275 #[derive(Clone, Debug)]
276 struct ParserI<'s, P> {
277 /// The parser state/configuration.
278 parser: P,
279 /// The full regular expression provided by the user.
280 pattern: &'s str,
281 }
282
283 /// GroupState represents a single stack frame while parsing nested groups
284 /// and alternations. Each frame records the state up to an opening parenthesis
285 /// or a alternating bracket `|`.
286 #[derive(Clone, Debug)]
287 enum GroupState {
288 /// This state is pushed whenever an opening group is found.
289 Group {
290 /// The concatenation immediately preceding the opening group.
291 concat: ast::Concat,
292 /// The group that has been opened. Its sub-AST is always empty.
293 group: ast::Group,
294 /// Whether this group has the `x` flag enabled or not.
295 ignore_whitespace: bool,
296 },
297 /// This state is pushed whenever a new alternation branch is found. If
298 /// an alternation branch is found and this state is at the top of the
299 /// stack, then this state should be modified to include the new
300 /// alternation.
301 Alternation(ast::Alternation),
302 }
303
304 /// ClassState represents a single stack frame while parsing character classes.
305 /// Each frame records the state up to an intersection, difference, symmetric
306 /// difference or nested class.
307 ///
308 /// Note that a parser's character class stack is only non-empty when parsing
309 /// a character class. In all other cases, it is empty.
310 #[derive(Clone, Debug)]
311 enum ClassState {
312 /// This state is pushed whenever an opening bracket is found.
313 Open {
314 /// The union of class items immediately preceding this class.
315 union: ast::ClassSetUnion,
316 /// The class that has been opened. Typically this just corresponds
317 /// to the `[`, but it can also include `[^` since `^` indicates
318 /// negation of the class.
319 set: ast::ClassBracketed,
320 },
321 /// This state is pushed when a operator is seen. When popped, the stored
322 /// set becomes the left hand side of the operator.
323 Op {
324 /// The type of the operation, i.e., &&, -- or ~~.
325 kind: ast::ClassSetBinaryOpKind,
326 /// The left-hand side of the operator.
327 lhs: ast::ClassSet,
328 },
329 }
330
331 impl Parser {
332 /// Create a new parser with a default configuration.
333 ///
334 /// The parser can be run with either the `parse` or `parse_with_comments`
335 /// methods. The parse methods return an abstract syntax tree.
336 ///
337 /// To set configuration options on the parser, use
338 /// [`ParserBuilder`](struct.ParserBuilder.html).
new() -> Parser339 pub fn new() -> Parser {
340 ParserBuilder::new().build()
341 }
342
343 /// Parse the regular expression into an abstract syntax tree.
parse(&mut self, pattern: &str) -> Result<Ast>344 pub fn parse(&mut self, pattern: &str) -> Result<Ast> {
345 ParserI::new(self, pattern).parse()
346 }
347
348 /// Parse the regular expression and return an abstract syntax tree with
349 /// all of the comments found in the pattern.
parse_with_comments( &mut self, pattern: &str, ) -> Result<ast::WithComments>350 pub fn parse_with_comments(
351 &mut self,
352 pattern: &str,
353 ) -> Result<ast::WithComments> {
354 ParserI::new(self, pattern).parse_with_comments()
355 }
356
357 /// Reset the internal state of a parser.
358 ///
359 /// This is called at the beginning of every parse. This prevents the
360 /// parser from running with inconsistent state (say, if a previous
361 /// invocation returned an error and the parser is reused).
reset(&self)362 fn reset(&self) {
363 // These settings should be in line with the construction
364 // in `ParserBuilder::build`.
365 self.pos.set(Position { offset: 0, line: 1, column: 1});
366 self.ignore_whitespace.set(self.initial_ignore_whitespace);
367 self.comments.borrow_mut().clear();
368 self.stack_group.borrow_mut().clear();
369 self.stack_class.borrow_mut().clear();
370 }
371 }
372
373 impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
374 /// Build an internal parser from a parser configuration and a pattern.
new(parser: P, pattern: &'s str) -> ParserI<'s, P>375 fn new(parser: P, pattern: &'s str) -> ParserI<'s, P> {
376 ParserI { parser: parser, pattern: pattern }
377 }
378
379 /// Return a reference to the parser state.
parser(&self) -> &Parser380 fn parser(&self) -> &Parser {
381 self.parser.borrow()
382 }
383
384 /// Return a reference to the pattern being parsed.
pattern(&self) -> &str385 fn pattern(&self) -> &str {
386 self.pattern.borrow()
387 }
388
389 /// Create a new error with the given span and error type.
error(&self, span: Span, kind: ast::ErrorKind) -> ast::Error390 fn error(&self, span: Span, kind: ast::ErrorKind) -> ast::Error {
391 ast::Error {
392 kind: kind,
393 pattern: self.pattern().to_string(),
394 span: span,
395 }
396 }
397
398 /// Return the current offset of the parser.
399 ///
400 /// The offset starts at `0` from the beginning of the regular expression
401 /// pattern string.
offset(&self) -> usize402 fn offset(&self) -> usize {
403 self.parser().pos.get().offset
404 }
405
406 /// Return the current line number of the parser.
407 ///
408 /// The line number starts at `1`.
line(&self) -> usize409 fn line(&self) -> usize {
410 self.parser().pos.get().line
411 }
412
413 /// Return the current column of the parser.
414 ///
415 /// The column number starts at `1` and is reset whenever a `\n` is seen.
column(&self) -> usize416 fn column(&self) -> usize {
417 self.parser().pos.get().column
418 }
419
420 /// Return the next capturing index. Each subsequent call increments the
421 /// internal index.
422 ///
423 /// The span given should correspond to the location of the opening
424 /// parenthesis.
425 ///
426 /// If the capture limit is exceeded, then an error is returned.
next_capture_index(&self, span: Span) -> Result<u32>427 fn next_capture_index(&self, span: Span) -> Result<u32> {
428 let current = self.parser().capture_index.get();
429 let i = current.checked_add(1).ok_or_else(|| {
430 self.error(span, ast::ErrorKind::CaptureLimitExceeded)
431 })?;
432 self.parser().capture_index.set(i);
433 Ok(i)
434 }
435
436 /// Adds the given capture name to this parser. If this capture name has
437 /// already been used, then an error is returned.
add_capture_name(&self, cap: &ast::CaptureName) -> Result<()>438 fn add_capture_name(&self, cap: &ast::CaptureName) -> Result<()> {
439 let mut names = self.parser().capture_names.borrow_mut();
440 match names.binary_search_by_key(
441 &cap.name.as_str(),
442 |c| c.name.as_str(),
443 ) {
444 Err(i) => {
445 names.insert(i, cap.clone());
446 Ok(())
447 }
448 Ok(i) => {
449 Err(self.error(cap.span, ast::ErrorKind::GroupNameDuplicate {
450 original: names[i].span,
451 }))
452 }
453 }
454 }
455
456 /// Return whether the parser should ignore whitespace or not.
ignore_whitespace(&self) -> bool457 fn ignore_whitespace(&self) -> bool {
458 self.parser().ignore_whitespace.get()
459 }
460
461 /// Return the character at the current position of the parser.
462 ///
463 /// This panics if the current position does not point to a valid char.
char(&self) -> char464 fn char(&self) -> char {
465 self.char_at(self.offset())
466 }
467
468 /// Return the character at the given position.
469 ///
470 /// This panics if the given position does not point to a valid char.
char_at(&self, i: usize) -> char471 fn char_at(&self, i: usize) -> char {
472 self.pattern()[i..].chars().next()
473 .unwrap_or_else(|| {
474 panic!("expected char at offset {}", i)
475 })
476 }
477
478 /// Bump the parser to the next Unicode scalar value.
479 ///
480 /// If the end of the input has been reached, then `false` is returned.
bump(&self) -> bool481 fn bump(&self) -> bool {
482 if self.is_eof() {
483 return false;
484 }
485 let Position { mut offset, mut line, mut column } = self.pos();
486 if self.char() == '\n' {
487 line = line.checked_add(1).unwrap();
488 column = 1;
489 } else {
490 column = column.checked_add(1).unwrap();
491 }
492 offset += self.char().len_utf8();
493 self.parser().pos.set(Position {
494 offset: offset,
495 line: line,
496 column: column,
497 });
498 self.pattern()[self.offset()..].chars().next().is_some()
499 }
500
501 /// If the substring starting at the current position of the parser has
502 /// the given prefix, then bump the parser to the character immediately
503 /// following the prefix and return true. Otherwise, don't bump the parser
504 /// and return false.
bump_if(&self, prefix: &str) -> bool505 fn bump_if(&self, prefix: &str) -> bool {
506 if self.pattern()[self.offset()..].starts_with(prefix) {
507 for _ in 0..prefix.chars().count() {
508 self.bump();
509 }
510 true
511 } else {
512 false
513 }
514 }
515
516 /// Returns true if and only if the parser is positioned at a look-around
517 /// prefix. The conditions under which this returns true must always
518 /// correspond to a regular expression that would otherwise be consider
519 /// invalid.
520 ///
521 /// This should only be called immediately after parsing the opening of
522 /// a group or a set of flags.
is_lookaround_prefix(&self) -> bool523 fn is_lookaround_prefix(&self) -> bool {
524 self.bump_if("?=")
525 || self.bump_if("?!")
526 || self.bump_if("?<=")
527 || self.bump_if("?<!")
528 }
529
530 /// Bump the parser, and if the `x` flag is enabled, bump through any
531 /// subsequent spaces. Return true if and only if the parser is not at
532 /// EOF.
bump_and_bump_space(&self) -> bool533 fn bump_and_bump_space(&self) -> bool {
534 if !self.bump() {
535 return false;
536 }
537 self.bump_space();
538 !self.is_eof()
539 }
540
541 /// If the `x` flag is enabled (i.e., whitespace insensitivity with
542 /// comments), then this will advance the parser through all whitespace
543 /// and comments to the next non-whitespace non-comment byte.
544 ///
545 /// If the `x` flag is disabled, then this is a no-op.
546 ///
547 /// This should be used selectively throughout the parser where
548 /// arbitrary whitespace is permitted when the `x` flag is enabled. For
549 /// example, `{ 5 , 6}` is equivalent to `{5,6}`.
bump_space(&self)550 fn bump_space(&self) {
551 if !self.ignore_whitespace() {
552 return;
553 }
554 while !self.is_eof() {
555 if self.char().is_whitespace() {
556 self.bump();
557 } else if self.char() == '#' {
558 let start = self.pos();
559 let mut comment_text = String::new();
560 self.bump();
561 while !self.is_eof() {
562 let c = self.char();
563 self.bump();
564 if c == '\n' {
565 break;
566 }
567 comment_text.push(c);
568 }
569 let comment = ast::Comment {
570 span: Span::new(start, self.pos()),
571 comment: comment_text,
572 };
573 self.parser().comments.borrow_mut().push(comment);
574 } else {
575 break;
576 }
577 }
578 }
579
580 /// Peek at the next character in the input without advancing the parser.
581 ///
582 /// If the input has been exhausted, then this returns `None`.
peek(&self) -> Option<char>583 fn peek(&self) -> Option<char> {
584 if self.is_eof() {
585 return None;
586 }
587 self.pattern()[self.offset() + self.char().len_utf8()..].chars().next()
588 }
589
590 /// Like peek, but will ignore spaces when the parser is in whitespace
591 /// insensitive mode.
peek_space(&self) -> Option<char>592 fn peek_space(&self) -> Option<char> {
593 if !self.ignore_whitespace() {
594 return self.peek();
595 }
596 if self.is_eof() {
597 return None;
598 }
599 let mut start = self.offset() + self.char().len_utf8();
600 let mut in_comment = false;
601 for (i, c) in self.pattern()[start..].char_indices() {
602 if c.is_whitespace() {
603 continue;
604 } else if !in_comment && c == '#' {
605 in_comment = true;
606 } else if in_comment && c == '\n' {
607 in_comment = false;
608 } else {
609 start += i;
610 break;
611 }
612 }
613 self.pattern()[start..].chars().next()
614 }
615
616 /// Returns true if the next call to `bump` would return false.
is_eof(&self) -> bool617 fn is_eof(&self) -> bool {
618 self.offset() == self.pattern().len()
619 }
620
621 /// Return the current position of the parser, which includes the offset,
622 /// line and column.
pos(&self) -> Position623 fn pos(&self) -> Position {
624 self.parser().pos.get()
625 }
626
627 /// Create a span at the current position of the parser. Both the start
628 /// and end of the span are set.
span(&self) -> Span629 fn span(&self) -> Span {
630 Span::splat(self.pos())
631 }
632
633 /// Create a span that covers the current character.
span_char(&self) -> Span634 fn span_char(&self) -> Span {
635 let mut next = Position {
636 offset: self.offset().checked_add(self.char().len_utf8()).unwrap(),
637 line: self.line(),
638 column: self.column().checked_add(1).unwrap(),
639 };
640 if self.char() == '\n' {
641 next.line += 1;
642 next.column = 1;
643 }
644 Span::new(self.pos(), next)
645 }
646
647 /// Parse and push a single alternation on to the parser's internal stack.
648 /// If the top of the stack already has an alternation, then add to that
649 /// instead of pushing a new one.
650 ///
651 /// The concatenation given corresponds to a single alternation branch.
652 /// The concatenation returned starts the next branch and is empty.
653 ///
654 /// This assumes the parser is currently positioned at `|` and will advance
655 /// the parser to the character following `|`.
push_alternate(&self, mut concat: ast::Concat) -> Result<ast::Concat>656 fn push_alternate(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
657 assert_eq!(self.char(), '|');
658 concat.span.end = self.pos();
659 self.push_or_add_alternation(concat);
660 self.bump();
661 Ok(ast::Concat {
662 span: self.span(),
663 asts: vec![],
664 })
665 }
666
667 /// Pushes or adds the given branch of an alternation to the parser's
668 /// internal stack of state.
push_or_add_alternation(&self, concat: ast::Concat)669 fn push_or_add_alternation(&self, concat: ast::Concat) {
670 use self::GroupState::*;
671
672 let mut stack = self.parser().stack_group.borrow_mut();
673 if let Some(&mut Alternation(ref mut alts)) = stack.last_mut() {
674 alts.asts.push(concat.into_ast());
675 return;
676 }
677 stack.push(Alternation(ast::Alternation {
678 span: Span::new(concat.span.start, self.pos()),
679 asts: vec![concat.into_ast()],
680 }));
681 }
682
683 /// Parse and push a group AST (and its parent concatenation) on to the
684 /// parser's internal stack. Return a fresh concatenation corresponding
685 /// to the group's sub-AST.
686 ///
687 /// If a set of flags was found (with no group), then the concatenation
688 /// is returned with that set of flags added.
689 ///
690 /// This assumes that the parser is currently positioned on the opening
691 /// parenthesis. It advances the parser to the character at the start
692 /// of the sub-expression (or adjoining expression).
693 ///
694 /// If there was a problem parsing the start of the group, then an error
695 /// is returned.
push_group(&self, mut concat: ast::Concat) -> Result<ast::Concat>696 fn push_group(&self, mut concat: ast::Concat) -> Result<ast::Concat> {
697 assert_eq!(self.char(), '(');
698 match self.parse_group()? {
699 Either::Left(set) => {
700 let ignore = set.flags.flag_state(ast::Flag::IgnoreWhitespace);
701 if let Some(v) = ignore {
702 self.parser().ignore_whitespace.set(v);
703 }
704
705 concat.asts.push(Ast::Flags(set));
706 Ok(concat)
707 }
708 Either::Right(group) => {
709 let old_ignore_whitespace = self.ignore_whitespace();
710 let new_ignore_whitespace = group
711 .flags()
712 .and_then(|f| f.flag_state(ast::Flag::IgnoreWhitespace))
713 .unwrap_or(old_ignore_whitespace);
714 self.parser().stack_group.borrow_mut().push(GroupState::Group {
715 concat: concat,
716 group: group,
717 ignore_whitespace: old_ignore_whitespace,
718 });
719 self.parser().ignore_whitespace.set(new_ignore_whitespace);
720 Ok(ast::Concat {
721 span: self.span(),
722 asts: vec![],
723 })
724 }
725 }
726 }
727
728 /// Pop a group AST from the parser's internal stack and set the group's
729 /// AST to the given concatenation. Return the concatenation containing
730 /// the group.
731 ///
732 /// This assumes that the parser is currently positioned on the closing
733 /// parenthesis and advances the parser to the character following the `)`.
734 ///
735 /// If no such group could be popped, then an unopened group error is
736 /// returned.
pop_group(&self, mut group_concat: ast::Concat) -> Result<ast::Concat>737 fn pop_group(&self, mut group_concat: ast::Concat) -> Result<ast::Concat> {
738 use self::GroupState::*;
739
740 assert_eq!(self.char(), ')');
741 let mut stack = self.parser().stack_group.borrow_mut();
742 let (mut prior_concat, mut group, ignore_whitespace, alt) =
743 match stack.pop() {
744 Some(Group { concat, group, ignore_whitespace }) => {
745 (concat, group, ignore_whitespace, None)
746 }
747 Some(Alternation(alt)) => {
748 match stack.pop() {
749 Some(Group { concat, group, ignore_whitespace }) => {
750 (concat, group, ignore_whitespace, Some(alt))
751 }
752 None | Some(Alternation(_)) => {
753 return Err(self.error(
754 self.span_char(),
755 ast::ErrorKind::GroupUnopened,
756 ));
757 }
758 }
759 }
760 None => {
761 return Err(self.error(
762 self.span_char(),
763 ast::ErrorKind::GroupUnopened,
764 ));
765 }
766 };
767 self.parser().ignore_whitespace.set(ignore_whitespace);
768 group_concat.span.end = self.pos();
769 self.bump();
770 group.span.end = self.pos();
771 match alt {
772 Some(mut alt) => {
773 alt.span.end = group_concat.span.end;
774 alt.asts.push(group_concat.into_ast());
775 group.ast = Box::new(alt.into_ast());
776 }
777 None => {
778 group.ast = Box::new(group_concat.into_ast());
779 }
780 }
781 prior_concat.asts.push(Ast::Group(group));
782 Ok(prior_concat)
783 }
784
785 /// Pop the last state from the parser's internal stack, if it exists, and
786 /// add the given concatenation to it. There either must be no state or a
787 /// single alternation item on the stack. Any other scenario produces an
788 /// error.
789 ///
790 /// This assumes that the parser has advanced to the end.
pop_group_end(&self, mut concat: ast::Concat) -> Result<Ast>791 fn pop_group_end(&self, mut concat: ast::Concat) -> Result<Ast> {
792 concat.span.end = self.pos();
793 let mut stack = self.parser().stack_group.borrow_mut();
794 let ast = match stack.pop() {
795 None => Ok(concat.into_ast()),
796 Some(GroupState::Alternation(mut alt)) => {
797 alt.span.end = self.pos();
798 alt.asts.push(concat.into_ast());
799 Ok(Ast::Alternation(alt))
800 }
801 Some(GroupState::Group { group, .. }) => {
802 return Err(self.error(
803 group.span,
804 ast::ErrorKind::GroupUnclosed,
805 ));
806 }
807 };
808 // If we try to pop again, there should be nothing.
809 match stack.pop() {
810 None => ast,
811 Some(GroupState::Alternation(_)) => {
812 // This unreachable is unfortunate. This case can't happen
813 // because the only way we can be here is if there were two
814 // `GroupState::Alternation`s adjacent in the parser's stack,
815 // which we guarantee to never happen because we never push a
816 // `GroupState::Alternation` if one is already at the top of
817 // the stack.
818 unreachable!()
819 }
820 Some(GroupState::Group { group, .. }) => {
821 Err(self.error(group.span, ast::ErrorKind::GroupUnclosed))
822 }
823 }
824 }
825
826 /// Parse the opening of a character class and push the current class
827 /// parsing context onto the parser's stack. This assumes that the parser
828 /// is positioned at an opening `[`. The given union should correspond to
829 /// the union of set items built up before seeing the `[`.
830 ///
831 /// If there was a problem parsing the opening of the class, then an error
832 /// is returned. Otherwise, a new union of set items for the class is
833 /// returned (which may be populated with either a `]` or a `-`).
push_class_open( &self, parent_union: ast::ClassSetUnion, ) -> Result<ast::ClassSetUnion>834 fn push_class_open(
835 &self,
836 parent_union: ast::ClassSetUnion,
837 ) -> Result<ast::ClassSetUnion> {
838 assert_eq!(self.char(), '[');
839
840 let (nested_set, nested_union) = self.parse_set_class_open()?;
841 self.parser().stack_class.borrow_mut().push(ClassState::Open {
842 union: parent_union,
843 set: nested_set,
844 });
845 Ok(nested_union)
846 }
847
848 /// Parse the end of a character class set and pop the character class
849 /// parser stack. The union given corresponds to the last union built
850 /// before seeing the closing `]`. The union returned corresponds to the
851 /// parent character class set with the nested class added to it.
852 ///
853 /// This assumes that the parser is positioned at a `]` and will advance
854 /// the parser to the byte immediately following the `]`.
855 ///
856 /// If the stack is empty after popping, then this returns the final
857 /// "top-level" character class AST (where a "top-level" character class
858 /// is one that is not nested inside any other character class).
859 ///
860 /// If there is no corresponding opening bracket on the parser's stack,
861 /// then an error is returned.
pop_class( &self, nested_union: ast::ClassSetUnion, ) -> Result<Either<ast::ClassSetUnion, ast::Class>>862 fn pop_class(
863 &self,
864 nested_union: ast::ClassSetUnion,
865 ) -> Result<Either<ast::ClassSetUnion, ast::Class>> {
866 assert_eq!(self.char(), ']');
867
868 let item = ast::ClassSet::Item(nested_union.into_item());
869 let prevset = self.pop_class_op(item);
870 let mut stack = self.parser().stack_class.borrow_mut();
871 match stack.pop() {
872 None => {
873 // We can never observe an empty stack:
874 //
875 // 1) We are guaranteed to start with a non-empty stack since
876 // the character class parser is only initiated when it sees
877 // a `[`.
878 // 2) If we ever observe an empty stack while popping after
879 // seeing a `]`, then we signal the character class parser
880 // to terminate.
881 panic!("unexpected empty character class stack")
882 },
883 Some(ClassState::Op { .. }) => {
884 // This panic is unfortunate, but this case is impossible
885 // since we already popped the Op state if one exists above.
886 // Namely, every push to the class parser stack is guarded by
887 // whether an existing Op is already on the top of the stack.
888 // If it is, the existing Op is modified. That is, the stack
889 // can never have consecutive Op states.
890 panic!("unexpected ClassState::Op")
891 }
892 Some(ClassState::Open { mut union, mut set }) => {
893 self.bump();
894 set.span.end = self.pos();
895 set.kind = prevset;
896 if stack.is_empty() {
897 Ok(Either::Right(ast::Class::Bracketed(set)))
898 } else {
899 union.push(ast::ClassSetItem::Bracketed(Box::new(set)));
900 Ok(Either::Left(union))
901 }
902 }
903 }
904 }
905
906 /// Return an "unclosed class" error whose span points to the most
907 /// recently opened class.
908 ///
909 /// This should only be called while parsing a character class.
unclosed_class_error(&self) -> ast::Error910 fn unclosed_class_error(&self) -> ast::Error {
911 for state in self.parser().stack_class.borrow().iter().rev() {
912 match *state {
913 ClassState::Open { ref set, .. } => {
914 return self.error(set.span, ast::ErrorKind::ClassUnclosed);
915 }
916 _ => {}
917 }
918 }
919 // We are guaranteed to have a non-empty stack with at least
920 // one open bracket, so we should never get here.
921 panic!("no open character class found")
922 }
923
924 /// Push the current set of class items on to the class parser's stack as
925 /// the left hand side of the given operator.
926 ///
927 /// A fresh set union is returned, which should be used to build the right
928 /// hand side of this operator.
push_class_op( &self, next_kind: ast::ClassSetBinaryOpKind, next_union: ast::ClassSetUnion, ) -> ast::ClassSetUnion929 fn push_class_op(
930 &self,
931 next_kind: ast::ClassSetBinaryOpKind,
932 next_union: ast::ClassSetUnion,
933 ) -> ast::ClassSetUnion {
934
935 let item = ast::ClassSet::Item(next_union.into_item());
936 let new_lhs = self.pop_class_op(item);
937 self.parser().stack_class.borrow_mut().push(ClassState::Op {
938 kind: next_kind,
939 lhs: new_lhs,
940 });
941 ast::ClassSetUnion { span: self.span(), items: vec![] }
942 }
943
944 /// Pop a character class set from the character class parser stack. If the
945 /// top of the stack is just an item (not an operation), then return the
946 /// given set unchanged. If the top of the stack is an operation, then the
947 /// given set will be used as the rhs of the operation on the top of the
948 /// stack. In that case, the binary operation is returned as a set.
pop_class_op(&self, rhs: ast::ClassSet) -> ast::ClassSet949 fn pop_class_op(&self, rhs: ast::ClassSet) -> ast::ClassSet {
950 let mut stack = self.parser().stack_class.borrow_mut();
951 let (kind, lhs) = match stack.pop() {
952 Some(ClassState::Op { kind, lhs }) => (kind, lhs),
953 Some(state @ ClassState::Open { .. }) => {
954 stack.push(state);
955 return rhs;
956 }
957 None => unreachable!(),
958 };
959 let span = Span::new(lhs.span().start, rhs.span().end);
960 ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp {
961 span: span,
962 kind: kind,
963 lhs: Box::new(lhs),
964 rhs: Box::new(rhs),
965 })
966 }
967 }
968
969 impl<'s, P: Borrow<Parser>> ParserI<'s, P> {
970 /// Parse the regular expression into an abstract syntax tree.
parse(&self) -> Result<Ast>971 fn parse(&self) -> Result<Ast> {
972 self.parse_with_comments().map(|astc| astc.ast)
973 }
974
975 /// Parse the regular expression and return an abstract syntax tree with
976 /// all of the comments found in the pattern.
parse_with_comments(&self) -> Result<ast::WithComments>977 fn parse_with_comments(&self) -> Result<ast::WithComments> {
978 assert_eq!(self.offset(), 0, "parser can only be used once");
979 self.parser().reset();
980 let mut concat = ast::Concat {
981 span: self.span(),
982 asts: vec![],
983 };
984 loop {
985 self.bump_space();
986 if self.is_eof() {
987 break;
988 }
989 match self.char() {
990 '(' => concat = self.push_group(concat)?,
991 ')' => concat = self.pop_group(concat)?,
992 '|' => concat = self.push_alternate(concat)?,
993 '[' => {
994 let class = self.parse_set_class()?;
995 concat.asts.push(Ast::Class(class));
996 }
997 '?' => {
998 concat = self.parse_uncounted_repetition(
999 concat, ast::RepetitionKind::ZeroOrOne)?;
1000 }
1001 '*' => {
1002 concat = self.parse_uncounted_repetition(
1003 concat, ast::RepetitionKind::ZeroOrMore)?;
1004 }
1005 '+' => {
1006 concat = self.parse_uncounted_repetition(
1007 concat, ast::RepetitionKind::OneOrMore)?;
1008 }
1009 '{' => {
1010 concat = self.parse_counted_repetition(concat)?;
1011 }
1012 _ => concat.asts.push(self.parse_primitive()?.into_ast()),
1013 }
1014 }
1015 let ast = self.pop_group_end(concat)?;
1016 NestLimiter::new(self).check(&ast)?;
1017 Ok(ast::WithComments {
1018 ast: ast,
1019 comments: mem::replace(
1020 &mut *self.parser().comments.borrow_mut(),
1021 vec![],
1022 ),
1023 })
1024 }
1025
1026 /// Parses an uncounted repetition operation. An uncounted repetition
1027 /// operator includes ?, * and +, but does not include the {m,n} syntax.
1028 /// The given `kind` should correspond to the operator observed by the
1029 /// caller.
1030 ///
1031 /// This assumes that the paser is currently positioned at the repetition
1032 /// operator and advances the parser to the first character after the
1033 /// operator. (Note that the operator may include a single additional `?`,
1034 /// which makes the operator ungreedy.)
1035 ///
1036 /// The caller should include the concatenation that is being built. The
1037 /// concatenation returned includes the repetition operator applied to the
1038 /// last expression in the given concatenation.
parse_uncounted_repetition( &self, mut concat: ast::Concat, kind: ast::RepetitionKind, ) -> Result<ast::Concat>1039 fn parse_uncounted_repetition(
1040 &self,
1041 mut concat: ast::Concat,
1042 kind: ast::RepetitionKind,
1043 ) -> Result<ast::Concat> {
1044 assert!(
1045 self.char() == '?' || self.char() == '*' || self.char() == '+');
1046 let op_start = self.pos();
1047 let ast = match concat.asts.pop() {
1048 Some(ast) => ast,
1049 None => return Err(self.error(
1050 self.span(),
1051 ast::ErrorKind::RepetitionMissing,
1052 )),
1053 };
1054 match ast {
1055 Ast::Empty(_) | Ast::Flags(_) => return Err(self.error(
1056 self.span(),
1057 ast::ErrorKind::RepetitionMissing,
1058 )),
1059 _ => {}
1060 }
1061 let mut greedy = true;
1062 if self.bump() && self.char() == '?' {
1063 greedy = false;
1064 self.bump();
1065 }
1066 concat.asts.push(Ast::Repetition(ast::Repetition {
1067 span: ast.span().with_end(self.pos()),
1068 op: ast::RepetitionOp {
1069 span: Span::new(op_start, self.pos()),
1070 kind: kind,
1071 },
1072 greedy: greedy,
1073 ast: Box::new(ast),
1074 }));
1075 Ok(concat)
1076 }
1077
1078 /// Parses a counted repetition operation. A counted repetition operator
1079 /// corresponds to the {m,n} syntax, and does not include the ?, * or +
1080 /// operators.
1081 ///
1082 /// This assumes that the paser is currently positioned at the opening `{`
1083 /// and advances the parser to the first character after the operator.
1084 /// (Note that the operator may include a single additional `?`, which
1085 /// makes the operator ungreedy.)
1086 ///
1087 /// The caller should include the concatenation that is being built. The
1088 /// concatenation returned includes the repetition operator applied to the
1089 /// last expression in the given concatenation.
parse_counted_repetition( &self, mut concat: ast::Concat, ) -> Result<ast::Concat>1090 fn parse_counted_repetition(
1091 &self,
1092 mut concat: ast::Concat,
1093 ) -> Result<ast::Concat> {
1094 assert!(self.char() == '{');
1095 let start = self.pos();
1096 let ast = match concat.asts.pop() {
1097 Some(ast) => ast,
1098 None => return Err(self.error(
1099 self.span(),
1100 ast::ErrorKind::RepetitionMissing,
1101 )),
1102 };
1103 match ast {
1104 Ast::Empty(_) | Ast::Flags(_) => return Err(self.error(
1105 self.span(),
1106 ast::ErrorKind::RepetitionMissing,
1107 )),
1108 _ => {}
1109 }
1110 if !self.bump_and_bump_space() {
1111 return Err(self.error(
1112 Span::new(start, self.pos()),
1113 ast::ErrorKind::RepetitionCountUnclosed,
1114 ));
1115 }
1116 let count_start = self.parse_decimal()?;
1117 let mut range = ast::RepetitionRange::Exactly(count_start);
1118 if self.is_eof() {
1119 return Err(self.error(
1120 Span::new(start, self.pos()),
1121 ast::ErrorKind::RepetitionCountUnclosed,
1122 ));
1123 }
1124 if self.char() == ',' {
1125 if !self.bump_and_bump_space() {
1126 return Err(self.error(
1127 Span::new(start, self.pos()),
1128 ast::ErrorKind::RepetitionCountUnclosed,
1129 ));
1130 }
1131 if self.char() != '}' {
1132 let count_end = self.parse_decimal()?;
1133 range = ast::RepetitionRange::Bounded(count_start, count_end);
1134 } else {
1135 range = ast::RepetitionRange::AtLeast(count_start);
1136 }
1137 }
1138 if self.is_eof() || self.char() != '}' {
1139 return Err(self.error(
1140 Span::new(start, self.pos()),
1141 ast::ErrorKind::RepetitionCountUnclosed,
1142 ));
1143 }
1144
1145 let mut greedy = true;
1146 if self.bump_and_bump_space() && self.char() == '?' {
1147 greedy = false;
1148 self.bump();
1149 }
1150
1151 let op_span = Span::new(start, self.pos());
1152 if !range.is_valid() {
1153 return Err(self.error(
1154 op_span,
1155 ast::ErrorKind::RepetitionCountInvalid,
1156 ));
1157 }
1158 concat.asts.push(Ast::Repetition(ast::Repetition {
1159 span: ast.span().with_end(self.pos()),
1160 op: ast::RepetitionOp {
1161 span: op_span,
1162 kind: ast::RepetitionKind::Range(range),
1163 },
1164 greedy: greedy,
1165 ast: Box::new(ast),
1166 }));
1167 Ok(concat)
1168 }
1169
1170 /// Parse a group (which contains a sub-expression) or a set of flags.
1171 ///
1172 /// If a group was found, then it is returned with an empty AST. If a set
1173 /// of flags is found, then that set is returned.
1174 ///
1175 /// The parser should be positioned at the opening parenthesis.
1176 ///
1177 /// This advances the parser to the character before the start of the
1178 /// sub-expression (in the case of a group) or to the closing parenthesis
1179 /// immediately following the set of flags.
1180 ///
1181 /// # Errors
1182 ///
1183 /// If flags are given and incorrectly specified, then a corresponding
1184 /// error is returned.
1185 ///
1186 /// If a capture name is given and it is incorrectly specified, then a
1187 /// corresponding error is returned.
parse_group(&self) -> Result<Either<ast::SetFlags, ast::Group>>1188 fn parse_group(&self) -> Result<Either<ast::SetFlags, ast::Group>> {
1189 assert_eq!(self.char(), '(');
1190 let open_span = self.span_char();
1191 self.bump();
1192 self.bump_space();
1193 if self.is_lookaround_prefix() {
1194 return Err(self.error(
1195 Span::new(open_span.start, self.span().end),
1196 ast::ErrorKind::UnsupportedLookAround,
1197 ));
1198 }
1199 let inner_span = self.span();
1200 if self.bump_if("?P<") {
1201 let capture_index = self.next_capture_index(open_span)?;
1202 let cap = self.parse_capture_name(capture_index)?;
1203 Ok(Either::Right(ast::Group {
1204 span: open_span,
1205 kind: ast::GroupKind::CaptureName(cap),
1206 ast: Box::new(Ast::Empty(self.span())),
1207 }))
1208 } else if self.bump_if("?") {
1209 if self.is_eof() {
1210 return Err(self.error(
1211 open_span,
1212 ast::ErrorKind::GroupUnclosed,
1213 ));
1214 }
1215 let flags = self.parse_flags()?;
1216 let char_end = self.char();
1217 self.bump();
1218 if char_end == ')' {
1219 // We don't allow empty flags, e.g., `(?)`. We instead
1220 // interpret it as a repetition operator missing its argument.
1221 if flags.items.is_empty() {
1222 return Err(self.error(
1223 inner_span,
1224 ast::ErrorKind::RepetitionMissing,
1225 ));
1226 }
1227 Ok(Either::Left(ast::SetFlags {
1228 span: Span { end: self.pos(), ..open_span },
1229 flags: flags,
1230 }))
1231 } else {
1232 assert_eq!(char_end, ':');
1233 Ok(Either::Right(ast::Group {
1234 span: open_span,
1235 kind: ast::GroupKind::NonCapturing(flags),
1236 ast: Box::new(Ast::Empty(self.span())),
1237 }))
1238 }
1239 } else {
1240 let capture_index = self.next_capture_index(open_span)?;
1241 Ok(Either::Right(ast::Group {
1242 span: open_span,
1243 kind: ast::GroupKind::CaptureIndex(capture_index),
1244 ast: Box::new(Ast::Empty(self.span())),
1245 }))
1246 }
1247 }
1248
1249 /// Parses a capture group name. Assumes that the parser is positioned at
1250 /// the first character in the name following the opening `<` (and may
1251 /// possibly be EOF). This advances the parser to the first character
1252 /// following the closing `>`.
1253 ///
1254 /// The caller must provide the capture index of the group for this name.
parse_capture_name( &self, capture_index: u32, ) -> Result<ast::CaptureName>1255 fn parse_capture_name(
1256 &self,
1257 capture_index: u32,
1258 ) -> Result<ast::CaptureName> {
1259 if self.is_eof() {
1260 return Err(self.error(
1261 self.span(),
1262 ast::ErrorKind::GroupNameUnexpectedEof,
1263 ));
1264 }
1265 let start = self.pos();
1266 loop {
1267 if self.char() == '>' {
1268 break;
1269 }
1270 if !is_capture_char(self.char(), self.pos() == start) {
1271 return Err(self.error(
1272 self.span_char(),
1273 ast::ErrorKind::GroupNameInvalid,
1274 ));
1275 }
1276 if !self.bump() {
1277 break;
1278 }
1279 }
1280 let end = self.pos();
1281 if self.is_eof() {
1282 return Err(self.error(
1283 self.span(),
1284 ast::ErrorKind::GroupNameUnexpectedEof,
1285 ));
1286 }
1287 assert_eq!(self.char(), '>');
1288 self.bump();
1289 let name = &self.pattern()[start.offset..end.offset];
1290 if name.is_empty() {
1291 return Err(self.error(
1292 Span::new(start, start),
1293 ast::ErrorKind::GroupNameEmpty,
1294 ));
1295 }
1296 let capname = ast::CaptureName {
1297 span: Span::new(start, end),
1298 name: name.to_string(),
1299 index: capture_index,
1300 };
1301 self.add_capture_name(&capname)?;
1302 Ok(capname)
1303 }
1304
1305 /// Parse a sequence of flags starting at the current character.
1306 ///
1307 /// This advances the parser to the character immediately following the
1308 /// flags, which is guaranteed to be either `:` or `)`.
1309 ///
1310 /// # Errors
1311 ///
1312 /// If any flags are duplicated, then an error is returned.
1313 ///
1314 /// If the negation operator is used more than once, then an error is
1315 /// returned.
1316 ///
1317 /// If no flags could be found or if the negation operation is not followed
1318 /// by any flags, then an error is returned.
parse_flags(&self) -> Result<ast::Flags>1319 fn parse_flags(&self) -> Result<ast::Flags> {
1320 let mut flags = ast::Flags {
1321 span: self.span(),
1322 items: vec![],
1323 };
1324 let mut last_was_negation = None;
1325 while self.char() != ':' && self.char() != ')' {
1326 if self.char() == '-' {
1327 last_was_negation = Some(self.span_char());
1328 let item = ast::FlagsItem {
1329 span: self.span_char(),
1330 kind: ast::FlagsItemKind::Negation,
1331 };
1332 if let Some(i) = flags.add_item(item) {
1333 return Err(self.error(
1334 self.span_char(),
1335 ast::ErrorKind::FlagRepeatedNegation {
1336 original: flags.items[i].span,
1337 },
1338 ));
1339 }
1340 } else {
1341 last_was_negation = None;
1342 let item = ast::FlagsItem {
1343 span: self.span_char(),
1344 kind: ast::FlagsItemKind::Flag(self.parse_flag()?),
1345 };
1346 if let Some(i) = flags.add_item(item) {
1347 return Err(self.error(
1348 self.span_char(),
1349 ast::ErrorKind::FlagDuplicate {
1350 original: flags.items[i].span,
1351 },
1352 ));
1353 }
1354 }
1355 if !self.bump() {
1356 return Err(self.error(
1357 self.span(),
1358 ast::ErrorKind::FlagUnexpectedEof,
1359 ));
1360 }
1361 }
1362 if let Some(span) = last_was_negation {
1363 return Err(self.error(span, ast::ErrorKind::FlagDanglingNegation));
1364 }
1365 flags.span.end = self.pos();
1366 Ok(flags)
1367 }
1368
1369 /// Parse the current character as a flag. Do not advance the parser.
1370 ///
1371 /// # Errors
1372 ///
1373 /// If the flag is not recognized, then an error is returned.
parse_flag(&self) -> Result<ast::Flag>1374 fn parse_flag(&self) -> Result<ast::Flag> {
1375 match self.char() {
1376 'i' => Ok(ast::Flag::CaseInsensitive),
1377 'm' => Ok(ast::Flag::MultiLine),
1378 's' => Ok(ast::Flag::DotMatchesNewLine),
1379 'U' => Ok(ast::Flag::SwapGreed),
1380 'u' => Ok(ast::Flag::Unicode),
1381 'x' => Ok(ast::Flag::IgnoreWhitespace),
1382 _ => Err(self.error(
1383 self.span_char(),
1384 ast::ErrorKind::FlagUnrecognized,
1385 )),
1386 }
1387 }
1388
1389 /// Parse a primitive AST. e.g., A literal, non-set character class or
1390 /// assertion.
1391 ///
1392 /// This assumes that the parser expects a primitive at the current
1393 /// location. i.e., All other non-primitive cases have been handled.
1394 /// For example, if the parser's position is at `|`, then `|` will be
1395 /// treated as a literal (e.g., inside a character class).
1396 ///
1397 /// This advances the parser to the first character immediately following
1398 /// the primitive.
parse_primitive(&self) -> Result<Primitive>1399 fn parse_primitive(&self) -> Result<Primitive> {
1400 match self.char() {
1401 '\\' => self.parse_escape(),
1402 '.' => {
1403 let ast = Primitive::Dot(self.span_char());
1404 self.bump();
1405 Ok(ast)
1406 }
1407 '^' => {
1408 let ast = Primitive::Assertion(ast::Assertion {
1409 span: self.span_char(),
1410 kind: ast::AssertionKind::StartLine,
1411 });
1412 self.bump();
1413 Ok(ast)
1414 }
1415 '$' => {
1416 let ast = Primitive::Assertion(ast::Assertion {
1417 span: self.span_char(),
1418 kind: ast::AssertionKind::EndLine,
1419 });
1420 self.bump();
1421 Ok(ast)
1422 }
1423 c => {
1424 let ast = Primitive::Literal(ast::Literal {
1425 span: self.span_char(),
1426 kind: ast::LiteralKind::Verbatim,
1427 c: c,
1428 });
1429 self.bump();
1430 Ok(ast)
1431 }
1432 }
1433 }
1434
1435 /// Parse an escape sequence as a primitive AST.
1436 ///
1437 /// This assumes the parser is positioned at the start of the escape
1438 /// sequence, i.e., `\`. It advances the parser to the first position
1439 /// immediately following the escape sequence.
parse_escape(&self) -> Result<Primitive>1440 fn parse_escape(&self) -> Result<Primitive> {
1441 assert_eq!(self.char(), '\\');
1442 let start = self.pos();
1443 if !self.bump() {
1444 return Err(self.error(
1445 Span::new(start, self.pos()),
1446 ast::ErrorKind::EscapeUnexpectedEof,
1447 ));
1448 }
1449 let c = self.char();
1450 // Put some of the more complicated routines into helpers.
1451 match c {
1452 '0'...'7' => {
1453 if !self.parser().octal {
1454 return Err(self.error(
1455 Span::new(start, self.span_char().end),
1456 ast::ErrorKind::UnsupportedBackreference,
1457 ));
1458 }
1459 let mut lit = self.parse_octal();
1460 lit.span.start = start;
1461 return Ok(Primitive::Literal(lit));
1462 }
1463 '8'...'9' if !self.parser().octal => {
1464 return Err(self.error(
1465 Span::new(start, self.span_char().end),
1466 ast::ErrorKind::UnsupportedBackreference,
1467 ));
1468 }
1469 'x' | 'u' | 'U' => {
1470 let mut lit = self.parse_hex()?;
1471 lit.span.start = start;
1472 return Ok(Primitive::Literal(lit));
1473 }
1474 'p' | 'P' => {
1475 let mut cls = self.parse_unicode_class()?;
1476 cls.span.start = start;
1477 return Ok(Primitive::Unicode(cls));
1478 }
1479 'd' | 's' | 'w' | 'D' | 'S' | 'W' => {
1480 let mut cls = self.parse_perl_class();
1481 cls.span.start = start;
1482 return Ok(Primitive::Perl(cls));
1483 }
1484 _ => {}
1485 }
1486
1487 // Handle all of the one letter sequences inline.
1488 self.bump();
1489 let span = Span::new(start, self.pos());
1490 if is_meta_character(c) {
1491 return Ok(Primitive::Literal(ast::Literal {
1492 span: span,
1493 kind: ast::LiteralKind::Punctuation,
1494 c: c,
1495 }));
1496 }
1497 let special = |kind, c| Ok(Primitive::Literal(ast::Literal {
1498 span: span,
1499 kind: ast::LiteralKind::Special(kind),
1500 c: c,
1501 }));
1502 match c {
1503 'a' => special(ast::SpecialLiteralKind::Bell, '\x07'),
1504 'f' => special(ast::SpecialLiteralKind::FormFeed, '\x0C'),
1505 't' => special(ast::SpecialLiteralKind::Tab, '\t'),
1506 'n' => special(ast::SpecialLiteralKind::LineFeed, '\n'),
1507 'r' => special(ast::SpecialLiteralKind::CarriageReturn, '\r'),
1508 'v' => special(ast::SpecialLiteralKind::VerticalTab, '\x0B'),
1509 ' ' if self.ignore_whitespace() => {
1510 special(ast::SpecialLiteralKind::Space, ' ')
1511 }
1512 'A' => Ok(Primitive::Assertion(ast::Assertion {
1513 span: span,
1514 kind: ast::AssertionKind::StartText,
1515 })),
1516 'z' => Ok(Primitive::Assertion(ast::Assertion {
1517 span: span,
1518 kind: ast::AssertionKind::EndText,
1519 })),
1520 'b' => Ok(Primitive::Assertion(ast::Assertion {
1521 span: span,
1522 kind: ast::AssertionKind::WordBoundary,
1523 })),
1524 'B' => Ok(Primitive::Assertion(ast::Assertion {
1525 span: span,
1526 kind: ast::AssertionKind::NotWordBoundary,
1527 })),
1528 _ => Err(self.error(span, ast::ErrorKind::EscapeUnrecognized)),
1529 }
1530 }
1531
1532 /// Parse an octal representation of a Unicode codepoint up to 3 digits
1533 /// long. This expects the parser to be positioned at the first octal
1534 /// digit and advances the parser to the first character immediately
1535 /// following the octal number. This also assumes that parsing octal
1536 /// escapes is enabled.
1537 ///
1538 /// Assuming the preconditions are met, this routine can never fail.
parse_octal(&self) -> ast::Literal1539 fn parse_octal(&self) -> ast::Literal {
1540 use std::char;
1541 use std::u32;
1542
1543 assert!(self.parser().octal);
1544 assert!('0' <= self.char() && self.char() <= '7');
1545 let start = self.pos();
1546 // Parse up to two more digits.
1547 while
1548 self.bump() &&
1549 '0' <= self.char() && self.char() <= '7' &&
1550 self.pos().offset - start.offset <= 2
1551 {}
1552 let end = self.pos();
1553 let octal = &self.pattern()[start.offset..end.offset];
1554 // Parsing the octal should never fail since the above guarantees a
1555 // valid number.
1556 let codepoint =
1557 u32::from_str_radix(octal, 8).expect("valid octal number");
1558 // The max value for 3 digit octal is 0777 = 511 and [0, 511] has no
1559 // invalid Unicode scalar values.
1560 let c = char::from_u32(codepoint).expect("Unicode scalar value");
1561 ast::Literal {
1562 span: Span::new(start, end),
1563 kind: ast::LiteralKind::Octal,
1564 c: c,
1565 }
1566 }
1567
1568 /// Parse a hex representation of a Unicode codepoint. This handles both
1569 /// hex notations, i.e., `\xFF` and `\x{FFFF}`. This expects the parser to
1570 /// be positioned at the `x`, `u` or `U` prefix. The parser is advanced to
1571 /// the first character immediately following the hexadecimal literal.
parse_hex(&self) -> Result<ast::Literal>1572 fn parse_hex(&self) -> Result<ast::Literal> {
1573 assert!(self.char() == 'x'
1574 || self.char() == 'u'
1575 || self.char() == 'U');
1576
1577 let hex_kind = match self.char() {
1578 'x' => ast::HexLiteralKind::X,
1579 'u' => ast::HexLiteralKind::UnicodeShort,
1580 _ => ast::HexLiteralKind::UnicodeLong,
1581 };
1582 if !self.bump_and_bump_space() {
1583 return Err(self.error(
1584 self.span(),
1585 ast::ErrorKind::EscapeUnexpectedEof,
1586 ));
1587 }
1588 if self.char() == '{' {
1589 self.parse_hex_brace(hex_kind)
1590 } else {
1591 self.parse_hex_digits(hex_kind)
1592 }
1593 }
1594
1595 /// Parse an N-digit hex representation of a Unicode codepoint. This
1596 /// expects the parser to be positioned at the first digit and will advance
1597 /// the parser to the first character immediately following the escape
1598 /// sequence.
1599 ///
1600 /// The number of digits given must be 2 (for `\xNN`), 4 (for `\uNNNN`)
1601 /// or 8 (for `\UNNNNNNNN`).
parse_hex_digits( &self, kind: ast::HexLiteralKind, ) -> Result<ast::Literal>1602 fn parse_hex_digits(
1603 &self,
1604 kind: ast::HexLiteralKind,
1605 ) -> Result<ast::Literal> {
1606 use std::char;
1607 use std::u32;
1608
1609 let mut scratch = self.parser().scratch.borrow_mut();
1610 scratch.clear();
1611
1612 let start = self.pos();
1613 for i in 0..kind.digits() {
1614 if i > 0 && !self.bump_and_bump_space() {
1615 return Err(self.error(
1616 self.span(),
1617 ast::ErrorKind::EscapeUnexpectedEof,
1618 ));
1619 }
1620 if !is_hex(self.char()) {
1621 return Err(self.error(
1622 self.span_char(),
1623 ast::ErrorKind::EscapeHexInvalidDigit,
1624 ));
1625 }
1626 scratch.push(self.char());
1627 }
1628 // The final bump just moves the parser past the literal, which may
1629 // be EOF.
1630 self.bump_and_bump_space();
1631 let end = self.pos();
1632 let hex = scratch.as_str();
1633 match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
1634 None => Err(self.error(
1635 Span::new(start, end),
1636 ast::ErrorKind::EscapeHexInvalid,
1637 )),
1638 Some(c) => Ok(ast::Literal {
1639 span: Span::new(start, end),
1640 kind: ast::LiteralKind::HexFixed(kind),
1641 c: c,
1642 }),
1643 }
1644 }
1645
1646 /// Parse a hex representation of any Unicode scalar value. This expects
1647 /// the parser to be positioned at the opening brace `{` and will advance
1648 /// the parser to the first character following the closing brace `}`.
parse_hex_brace( &self, kind: ast::HexLiteralKind, ) -> Result<ast::Literal>1649 fn parse_hex_brace(
1650 &self,
1651 kind: ast::HexLiteralKind,
1652 ) -> Result<ast::Literal> {
1653 use std::char;
1654 use std::u32;
1655
1656 let mut scratch = self.parser().scratch.borrow_mut();
1657 scratch.clear();
1658
1659 let brace_pos = self.pos();
1660 let start = self.span_char().end;
1661 while self.bump_and_bump_space() && self.char() != '}' {
1662 if !is_hex(self.char()) {
1663 return Err(self.error(
1664 self.span_char(),
1665 ast::ErrorKind::EscapeHexInvalidDigit,
1666 ));
1667 }
1668 scratch.push(self.char());
1669 }
1670 if self.is_eof() {
1671 return Err(self.error(
1672 Span::new(brace_pos, self.pos()),
1673 ast::ErrorKind::EscapeUnexpectedEof,
1674 ));
1675 }
1676 let end = self.pos();
1677 let hex = scratch.as_str();
1678 assert_eq!(self.char(), '}');
1679 self.bump_and_bump_space();
1680
1681 if hex.is_empty() {
1682 return Err(self.error(
1683 Span::new(brace_pos, self.pos()),
1684 ast::ErrorKind::EscapeHexEmpty,
1685 ));
1686 }
1687 match u32::from_str_radix(hex, 16).ok().and_then(char::from_u32) {
1688 None => Err(self.error(
1689 Span::new(start, end),
1690 ast::ErrorKind::EscapeHexInvalid,
1691 )),
1692 Some(c) => Ok(ast::Literal {
1693 span: Span::new(start, self.pos()),
1694 kind: ast::LiteralKind::HexBrace(kind),
1695 c: c,
1696 }),
1697 }
1698 }
1699
1700 /// Parse a decimal number into a u32 while trimming leading and trailing
1701 /// whitespace.
1702 ///
1703 /// This expects the parser to be positioned at the first position where
1704 /// a decimal digit could occur. This will advance the parser to the byte
1705 /// immediately following the last contiguous decimal digit.
1706 ///
1707 /// If no decimal digit could be found or if there was a problem parsing
1708 /// the complete set of digits into a u32, then an error is returned.
parse_decimal(&self) -> Result<u32>1709 fn parse_decimal(&self) -> Result<u32> {
1710 let mut scratch = self.parser().scratch.borrow_mut();
1711 scratch.clear();
1712
1713 while !self.is_eof() && self.char().is_whitespace() {
1714 self.bump();
1715 }
1716 let start = self.pos();
1717 while !self.is_eof() && '0' <= self.char() && self.char() <= '9' {
1718 scratch.push(self.char());
1719 self.bump_and_bump_space();
1720 }
1721 let span = Span::new(start, self.pos());
1722 while !self.is_eof() && self.char().is_whitespace() {
1723 self.bump_and_bump_space();
1724 }
1725 let digits = scratch.as_str();
1726 if digits.is_empty() {
1727 return Err(self.error(span, ast::ErrorKind::DecimalEmpty));
1728 }
1729 match u32::from_str_radix(digits, 10).ok() {
1730 Some(n) => Ok(n),
1731 None => Err(self.error(span, ast::ErrorKind::DecimalInvalid)),
1732 }
1733 }
1734
1735 /// Parse a standard character class consisting primarily of characters or
1736 /// character ranges, but can also contain nested character classes of
1737 /// any type (sans `.`).
1738 ///
1739 /// This assumes the parser is positioned at the opening `[`. If parsing
1740 /// is successful, then the parser is advanced to the position immediately
1741 /// following the closing `]`.
parse_set_class(&self) -> Result<ast::Class>1742 fn parse_set_class(&self) -> Result<ast::Class> {
1743 assert_eq!(self.char(), '[');
1744
1745 let mut union = ast::ClassSetUnion {
1746 span: self.span(),
1747 items: vec![],
1748 };
1749 loop {
1750 self.bump_space();
1751 if self.is_eof() {
1752 return Err(self.unclosed_class_error());
1753 }
1754 match self.char() {
1755 '[' => {
1756 // If we've already parsed the opening bracket, then
1757 // attempt to treat this as the beginning of an ASCII
1758 // class. If ASCII class parsing fails, then the parser
1759 // backs up to `[`.
1760 if !self.parser().stack_class.borrow().is_empty() {
1761 if let Some(cls) = self.maybe_parse_ascii_class() {
1762 union.push(ast::ClassSetItem::Ascii(cls));
1763 continue;
1764 }
1765 }
1766 union = self.push_class_open(union)?;
1767 }
1768 ']' => {
1769 match self.pop_class(union)? {
1770 Either::Left(nested_union) => { union = nested_union; }
1771 Either::Right(class) => return Ok(class),
1772 }
1773 }
1774 '&' if self.peek() == Some('&') => {
1775 assert!(self.bump_if("&&"));
1776 union = self.push_class_op(
1777 ast::ClassSetBinaryOpKind::Intersection, union);
1778 }
1779 '-' if self.peek() == Some('-') => {
1780 assert!(self.bump_if("--"));
1781 union = self.push_class_op(
1782 ast::ClassSetBinaryOpKind::Difference, union);
1783 }
1784 '~' if self.peek() == Some('~') => {
1785 assert!(self.bump_if("~~"));
1786 union = self.push_class_op(
1787 ast::ClassSetBinaryOpKind::SymmetricDifference, union);
1788 }
1789 _ => {
1790 union.push(self.parse_set_class_range()?);
1791 }
1792 }
1793 }
1794 }
1795
1796 /// Parse a single primitive item in a character class set. The item to
1797 /// be parsed can either be one of a simple literal character, a range
1798 /// between two simple literal characters or a "primitive" character
1799 /// class like \w or \p{Greek}.
1800 ///
1801 /// If an invalid escape is found, or if a character class is found where
1802 /// a simple literal is expected (e.g., in a range), then an error is
1803 /// returned.
parse_set_class_range(&self) -> Result<ast::ClassSetItem>1804 fn parse_set_class_range(&self) -> Result<ast::ClassSetItem> {
1805 let prim1 = self.parse_set_class_item()?;
1806 self.bump_space();
1807 if self.is_eof() {
1808 return Err(self.unclosed_class_error());
1809 }
1810 // If the next char isn't a `-`, then we don't have a range.
1811 // There are two exceptions. If the char after a `-` is a `]`, then
1812 // `-` is interpreted as a literal `-`. Alternatively, if the char
1813 // after a `-` is a `-`, then `--` corresponds to a "difference"
1814 // operation.
1815 if self.char() != '-'
1816 || self.peek_space() == Some(']')
1817 || self.peek_space() == Some('-')
1818 {
1819 return prim1.into_class_set_item(self);
1820 }
1821 // OK, now we're parsing a range, so bump past the `-` and parse the
1822 // second half of the range.
1823 if !self.bump_and_bump_space() {
1824 return Err(self.unclosed_class_error());
1825 }
1826 let prim2 = self.parse_set_class_item()?;
1827 let range = ast::ClassSetRange {
1828 span: Span::new(prim1.span().start, prim2.span().end),
1829 start: prim1.into_class_literal(self)?,
1830 end: prim2.into_class_literal(self)?,
1831 };
1832 if !range.is_valid() {
1833 return Err(self.error(
1834 range.span,
1835 ast::ErrorKind::ClassRangeInvalid,
1836 ));
1837 }
1838 Ok(ast::ClassSetItem::Range(range))
1839 }
1840
1841 /// Parse a single item in a character class as a primitive, where the
1842 /// primitive either consists of a verbatim literal or a single escape
1843 /// sequence.
1844 ///
1845 /// This assumes the parser is positioned at the beginning of a primitive,
1846 /// and advances the parser to the first position after the primitive if
1847 /// successful.
1848 ///
1849 /// Note that it is the caller's responsibility to report an error if an
1850 /// illegal primitive was parsed.
parse_set_class_item(&self) -> Result<Primitive>1851 fn parse_set_class_item(&self) -> Result<Primitive> {
1852 if self.char() == '\\' {
1853 self.parse_escape()
1854 } else {
1855 let x = Primitive::Literal(ast::Literal {
1856 span: self.span_char(),
1857 kind: ast::LiteralKind::Verbatim,
1858 c: self.char(),
1859 });
1860 self.bump();
1861 Ok(x)
1862 }
1863 }
1864
1865 /// Parses the opening of a character class set. This includes the opening
1866 /// bracket along with `^` if present to indicate negation. This also
1867 /// starts parsing the opening set of unioned items if applicable, since
1868 /// there are special rules applied to certain characters in the opening
1869 /// of a character class. For example, `[^]]` is the class of all
1870 /// characters not equal to `]`. (`]` would need to be escaped in any other
1871 /// position.) Similarly for `-`.
1872 ///
1873 /// In all cases, the op inside the returned `ast::ClassBracketed` is an
1874 /// empty union. This empty union should be replaced with the actual item
1875 /// when it is popped from the parser's stack.
1876 ///
1877 /// This assumes the parser is positioned at the opening `[` and advances
1878 /// the parser to the first non-special byte of the character class.
1879 ///
1880 /// An error is returned if EOF is found.
parse_set_class_open( &self, ) -> Result<(ast::ClassBracketed, ast::ClassSetUnion)>1881 fn parse_set_class_open(
1882 &self,
1883 ) -> Result<(ast::ClassBracketed, ast::ClassSetUnion)> {
1884 assert_eq!(self.char(), '[');
1885 let start = self.pos();
1886 if !self.bump_and_bump_space() {
1887 return Err(self.error(
1888 Span::new(start, self.pos()),
1889 ast::ErrorKind::ClassUnclosed,
1890 ));
1891 }
1892
1893 let negated =
1894 if self.char() != '^' {
1895 false
1896 } else {
1897 if !self.bump_and_bump_space() {
1898 return Err(self.error(
1899 Span::new(start, self.pos()),
1900 ast::ErrorKind::ClassUnclosed,
1901 ));
1902 }
1903 true
1904 };
1905 // Accept any number of `-` as literal `-`.
1906 let mut union = ast::ClassSetUnion {
1907 span: self.span(),
1908 items: vec![],
1909 };
1910 while self.char() == '-' {
1911 union.push(ast::ClassSetItem::Literal(ast::Literal {
1912 span: self.span_char(),
1913 kind: ast::LiteralKind::Verbatim,
1914 c: '-',
1915 }));
1916 if !self.bump_and_bump_space() {
1917 return Err(self.error(
1918 Span::new(start, self.pos()),
1919 ast::ErrorKind::ClassUnclosed,
1920 ));
1921 }
1922 }
1923 // If `]` is the *first* char in a set, then interpret it as a literal
1924 // `]`. That is, an empty class is impossible to write.
1925 if union.items.is_empty() && self.char() == ']' {
1926 union.push(ast::ClassSetItem::Literal(ast::Literal {
1927 span: self.span_char(),
1928 kind: ast::LiteralKind::Verbatim,
1929 c: ']',
1930 }));
1931 if !self.bump_and_bump_space() {
1932 return Err(self.error(
1933 Span::new(start, self.pos()),
1934 ast::ErrorKind::ClassUnclosed,
1935 ));
1936 }
1937 }
1938 let set = ast::ClassBracketed {
1939 span: Span::new(start, self.pos()),
1940 negated: negated,
1941 kind: ast::ClassSet::union(ast::ClassSetUnion {
1942 span: Span::new(union.span.start, union.span.start),
1943 items: vec![],
1944 }),
1945 };
1946 Ok((set, union))
1947 }
1948
1949 /// Attempt to parse an ASCII character class, e.g., `[:alnum:]`.
1950 ///
1951 /// This assumes the parser is positioned at the opening `[`.
1952 ///
1953 /// If no valid ASCII character class could be found, then this does not
1954 /// advance the parser and `None` is returned. Otherwise, the parser is
1955 /// advanced to the first byte following the closing `]` and the
1956 /// corresponding ASCII class is returned.
maybe_parse_ascii_class(&self) -> Option<ast::ClassAscii>1957 fn maybe_parse_ascii_class(&self) -> Option<ast::ClassAscii> {
1958 // ASCII character classes are interesting from a parsing perspective
1959 // because parsing cannot fail with any interesting error. For example,
1960 // in order to use an ASCII character class, it must be enclosed in
1961 // double brackets, e.g., `[[:alnum:]]`. Alternatively, you might think
1962 // of it as "ASCII character characters have the syntax `[:NAME:]`
1963 // which can only appear within character brackets." This means that
1964 // things like `[[:lower:]A]` are legal constructs.
1965 //
1966 // However, if one types an incorrect ASCII character class, e.g.,
1967 // `[[:loower:]]`, then we treat that as a normal nested character
1968 // class containing the characters `:elorw`. One might argue that we
1969 // should return an error instead since the repeated colons give away
1970 // the intent to write an ASCII class. But what if the user typed
1971 // `[[:lower]]` instead? How can we tell that was intended to be an
1972 // ASCII class and not just a normal nested class?
1973 //
1974 // Reasonable people can probably disagree over this, but for better
1975 // or worse, we implement semantics that never fails at the expense
1976 // of better failure modes.
1977 assert_eq!(self.char(), '[');
1978 // If parsing fails, then we back up the parser to this starting point.
1979 let start = self.pos();
1980 let mut negated = false;
1981 if !self.bump() || self.char() != ':' {
1982 self.parser().pos.set(start);
1983 return None;
1984 }
1985 if !self.bump() {
1986 self.parser().pos.set(start);
1987 return None;
1988 }
1989 if self.char() == '^' {
1990 negated = true;
1991 if !self.bump() {
1992 self.parser().pos.set(start);
1993 return None;
1994 }
1995 }
1996 let name_start = self.offset();
1997 while self.char() != ':' && self.bump() {}
1998 if self.is_eof() {
1999 self.parser().pos.set(start);
2000 return None;
2001 }
2002 let name = &self.pattern()[name_start..self.offset()];
2003 if !self.bump_if(":]") {
2004 self.parser().pos.set(start);
2005 return None;
2006 }
2007 let kind = match ast::ClassAsciiKind::from_name(name) {
2008 Some(kind) => kind,
2009 None => {
2010 self.parser().pos.set(start);
2011 return None;
2012 }
2013 };
2014 Some(ast::ClassAscii {
2015 span: Span::new(start, self.pos()),
2016 kind: kind,
2017 negated: negated,
2018 })
2019 }
2020
2021 /// Parse a Unicode class in either the single character notation, `\pN`
2022 /// or the multi-character bracketed notation, `\p{Greek}`. This assumes
2023 /// the parser is positioned at the `p` (or `P` for negation) and will
2024 /// advance the parser to the character immediately following the class.
2025 ///
2026 /// Note that this does not check whether the class name is valid or not.
parse_unicode_class(&self) -> Result<ast::ClassUnicode>2027 fn parse_unicode_class(&self) -> Result<ast::ClassUnicode> {
2028 assert!(self.char() == 'p' || self.char() == 'P');
2029
2030 let mut scratch = self.parser().scratch.borrow_mut();
2031 scratch.clear();
2032
2033 let negated = self.char() == 'P';
2034 if !self.bump_and_bump_space() {
2035 return Err(self.error(
2036 self.span(),
2037 ast::ErrorKind::EscapeUnexpectedEof,
2038 ));
2039 }
2040 let (start, kind) =
2041 if self.char() == '{' {
2042 let start = self.span_char().end;
2043 while self.bump_and_bump_space() && self.char() != '}' {
2044 scratch.push(self.char());
2045 }
2046 if self.is_eof() {
2047 return Err(self.error(
2048 self.span(),
2049 ast::ErrorKind::EscapeUnexpectedEof,
2050 ));
2051 }
2052 assert_eq!(self.char(), '}');
2053 self.bump();
2054
2055 let name = scratch.as_str();
2056 if let Some(i) = name.find("!=") {
2057 (start, ast::ClassUnicodeKind::NamedValue {
2058 op: ast::ClassUnicodeOpKind::NotEqual,
2059 name: name[..i].to_string(),
2060 value: name[i+2..].to_string(),
2061 })
2062 } else if let Some(i) = name.find(':') {
2063 (start, ast::ClassUnicodeKind::NamedValue {
2064 op: ast::ClassUnicodeOpKind::Colon,
2065 name: name[..i].to_string(),
2066 value: name[i+1..].to_string(),
2067 })
2068 } else if let Some(i) = name.find('=') {
2069 (start, ast::ClassUnicodeKind::NamedValue {
2070 op: ast::ClassUnicodeOpKind::Equal,
2071 name: name[..i].to_string(),
2072 value: name[i+1..].to_string(),
2073 })
2074 } else {
2075 (start, ast::ClassUnicodeKind::Named(name.to_string()))
2076 }
2077 } else {
2078 let start = self.pos();
2079 let c = self.char();
2080 self.bump_and_bump_space();
2081 let kind = ast::ClassUnicodeKind::OneLetter(c);
2082 (start, kind)
2083 };
2084 Ok(ast::ClassUnicode {
2085 span: Span::new(start, self.pos()),
2086 negated: negated,
2087 kind: kind,
2088 })
2089 }
2090
2091 /// Parse a Perl character class, e.g., `\d` or `\W`. This assumes the
2092 /// parser is currently at a valid character class name and will be
2093 /// advanced to the character immediately following the class.
parse_perl_class(&self) -> ast::ClassPerl2094 fn parse_perl_class(&self) -> ast::ClassPerl {
2095 let c = self.char();
2096 let span = self.span_char();
2097 self.bump();
2098 let (negated, kind) = match c {
2099 'd' => (false, ast::ClassPerlKind::Digit),
2100 'D' => (true, ast::ClassPerlKind::Digit),
2101 's' => (false, ast::ClassPerlKind::Space),
2102 'S' => (true, ast::ClassPerlKind::Space),
2103 'w' => (false, ast::ClassPerlKind::Word),
2104 'W' => (true, ast::ClassPerlKind::Word),
2105 c => panic!("expected valid Perl class but got '{}'", c),
2106 };
2107 ast::ClassPerl { span: span, kind: kind, negated: negated }
2108 }
2109 }
2110
2111 /// A type that traverses a fully parsed Ast and checks whether its depth
2112 /// exceeds the specified nesting limit. If it does, then an error is returned.
2113 #[derive(Debug)]
2114 struct NestLimiter<'p, 's: 'p, P: 'p + 's> {
2115 /// The parser that is checking the nest limit.
2116 p: &'p ParserI<'s, P>,
2117 /// The current depth while walking an Ast.
2118 depth: u32,
2119 }
2120
2121 impl<'p, 's, P: Borrow<Parser>> NestLimiter<'p, 's, P> {
new(p: &'p ParserI<'s, P>) -> NestLimiter<'p, 's, P>2122 fn new(p: &'p ParserI<'s, P>) -> NestLimiter<'p, 's, P> {
2123 NestLimiter { p: p, depth: 0 }
2124 }
2125
check(self, ast: &Ast) -> Result<()>2126 fn check(self, ast: &Ast) -> Result<()> {
2127 ast::visit(ast, self)
2128 }
2129
increment_depth(&mut self, span: &Span) -> Result<()>2130 fn increment_depth(&mut self, span: &Span) -> Result<()> {
2131 let new = self.depth.checked_add(1).ok_or_else(|| self.p.error(
2132 span.clone(),
2133 ast::ErrorKind::NestLimitExceeded(::std::u32::MAX),
2134 ))?;
2135 let limit = self.p.parser().nest_limit;
2136 if new > limit {
2137 return Err(self.p.error(
2138 span.clone(),
2139 ast::ErrorKind::NestLimitExceeded(limit),
2140 ));
2141 }
2142 self.depth = new;
2143 Ok(())
2144 }
2145
decrement_depth(&mut self)2146 fn decrement_depth(&mut self) {
2147 // Assuming the correctness of the visitor, this should never drop
2148 // below 0.
2149 self.depth = self.depth.checked_sub(1).unwrap();
2150 }
2151 }
2152
2153 impl<'p, 's, P: Borrow<Parser>> ast::Visitor for NestLimiter<'p, 's, P> {
2154 type Output = ();
2155 type Err = ast::Error;
2156
finish(self) -> Result<()>2157 fn finish(self) -> Result<()> {
2158 Ok(())
2159 }
2160
visit_pre(&mut self, ast: &Ast) -> Result<()>2161 fn visit_pre(&mut self, ast: &Ast) -> Result<()> {
2162 let span = match *ast {
2163 Ast::Empty(_)
2164 | Ast::Flags(_)
2165 | Ast::Literal(_)
2166 | Ast::Dot(_)
2167 | Ast::Assertion(_)
2168 | Ast::Class(ast::Class::Unicode(_))
2169 | Ast::Class(ast::Class::Perl(_)) => {
2170 // These are all base cases, so we don't increment depth.
2171 return Ok(());
2172 }
2173 Ast::Class(ast::Class::Bracketed(ref x)) => &x.span,
2174 Ast::Repetition(ref x) => &x.span,
2175 Ast::Group(ref x) => &x.span,
2176 Ast::Alternation(ref x) => &x.span,
2177 Ast::Concat(ref x) => &x.span,
2178 };
2179 self.increment_depth(span)
2180 }
2181
visit_post(&mut self, ast: &Ast) -> Result<()>2182 fn visit_post(&mut self, ast: &Ast) -> Result<()> {
2183 match *ast {
2184 Ast::Empty(_)
2185 | Ast::Flags(_)
2186 | Ast::Literal(_)
2187 | Ast::Dot(_)
2188 | Ast::Assertion(_)
2189 | Ast::Class(ast::Class::Unicode(_))
2190 | Ast::Class(ast::Class::Perl(_)) => {
2191 // These are all base cases, so we don't decrement depth.
2192 Ok(())
2193 }
2194 Ast::Class(ast::Class::Bracketed(_))
2195 | Ast::Repetition(_)
2196 | Ast::Group(_)
2197 | Ast::Alternation(_)
2198 | Ast::Concat(_) => {
2199 self.decrement_depth();
2200 Ok(())
2201 }
2202 }
2203 }
2204
visit_class_set_item_pre( &mut self, ast: &ast::ClassSetItem, ) -> Result<()>2205 fn visit_class_set_item_pre(
2206 &mut self,
2207 ast: &ast::ClassSetItem,
2208 ) -> Result<()> {
2209 let span = match *ast {
2210 ast::ClassSetItem::Empty(_)
2211 | ast::ClassSetItem::Literal(_)
2212 | ast::ClassSetItem::Range(_)
2213 | ast::ClassSetItem::Ascii(_)
2214 | ast::ClassSetItem::Unicode(_)
2215 | ast::ClassSetItem::Perl(_) => {
2216 // These are all base cases, so we don't increment depth.
2217 return Ok(());
2218 }
2219 ast::ClassSetItem::Bracketed(ref x) => &x.span,
2220 ast::ClassSetItem::Union(ref x) => &x.span,
2221 };
2222 self.increment_depth(span)
2223 }
2224
visit_class_set_item_post( &mut self, ast: &ast::ClassSetItem, ) -> Result<()>2225 fn visit_class_set_item_post(
2226 &mut self,
2227 ast: &ast::ClassSetItem,
2228 ) -> Result<()> {
2229 match *ast {
2230 ast::ClassSetItem::Empty(_)
2231 | ast::ClassSetItem::Literal(_)
2232 | ast::ClassSetItem::Range(_)
2233 | ast::ClassSetItem::Ascii(_)
2234 | ast::ClassSetItem::Unicode(_)
2235 | ast::ClassSetItem::Perl(_) => {
2236 // These are all base cases, so we don't decrement depth.
2237 Ok(())
2238 }
2239 ast::ClassSetItem::Bracketed(_)
2240 | ast::ClassSetItem::Union(_) => {
2241 self.decrement_depth();
2242 Ok(())
2243 }
2244 }
2245 }
2246
visit_class_set_binary_op_pre( &mut self, ast: &ast::ClassSetBinaryOp, ) -> Result<()>2247 fn visit_class_set_binary_op_pre(
2248 &mut self,
2249 ast: &ast::ClassSetBinaryOp,
2250 ) -> Result<()> {
2251 self.increment_depth(&ast.span)
2252 }
2253
visit_class_set_binary_op_post( &mut self, _ast: &ast::ClassSetBinaryOp, ) -> Result<()>2254 fn visit_class_set_binary_op_post(
2255 &mut self,
2256 _ast: &ast::ClassSetBinaryOp,
2257 ) -> Result<()> {
2258 self.decrement_depth();
2259 Ok(())
2260 }
2261 }
2262
2263 #[cfg(test)]
2264 mod tests {
2265 use std::ops::Range;
2266
2267 use ast::{self, Ast, Position, Span};
2268 use super::{Parser, ParserI, ParserBuilder, Primitive};
2269
2270 // Our own assert_eq, which has slightly better formatting (but honestly
2271 // still kind of crappy).
2272 macro_rules! assert_eq {
2273 ($left:expr, $right:expr) => ({
2274 match (&$left, &$right) {
2275 (left_val, right_val) => {
2276 if !(*left_val == *right_val) {
2277 panic!("assertion failed: `(left == right)`\n\n\
2278 left: `{:?}`\nright: `{:?}`\n\n",
2279 left_val, right_val)
2280 }
2281 }
2282 }
2283 });
2284 }
2285
2286 // We create these errors to compare with real ast::Errors in the tests.
2287 // We define equality between TestError and ast::Error to disregard the
2288 // pattern string in ast::Error, which is annoying to provide in tests.
2289 #[derive(Clone, Debug)]
2290 struct TestError {
2291 span: Span,
2292 kind: ast::ErrorKind,
2293 }
2294
2295 impl PartialEq<ast::Error> for TestError {
eq(&self, other: &ast::Error) -> bool2296 fn eq(&self, other: &ast::Error) -> bool {
2297 self.span == other.span && self.kind == other.kind
2298 }
2299 }
2300
2301 impl PartialEq<TestError> for ast::Error {
eq(&self, other: &TestError) -> bool2302 fn eq(&self, other: &TestError) -> bool {
2303 self.span == other.span && self.kind == other.kind
2304 }
2305 }
2306
s(str: &str) -> String2307 fn s(str: &str) -> String {
2308 str.to_string()
2309 }
2310
parser(pattern: &str) -> ParserI<Parser>2311 fn parser(pattern: &str) -> ParserI<Parser> {
2312 ParserI::new(Parser::new(), pattern)
2313 }
2314
parser_octal(pattern: &str) -> ParserI<Parser>2315 fn parser_octal(pattern: &str) -> ParserI<Parser> {
2316 let parser = ParserBuilder::new().octal(true).build();
2317 ParserI::new(parser, pattern)
2318 }
2319
parser_nest_limit(pattern: &str, nest_limit: u32) -> ParserI<Parser>2320 fn parser_nest_limit(pattern: &str, nest_limit: u32) -> ParserI<Parser> {
2321 let p = ParserBuilder::new().nest_limit(nest_limit).build();
2322 ParserI::new(p, pattern)
2323 }
2324
parser_ignore_whitespace(pattern: &str) -> ParserI<Parser>2325 fn parser_ignore_whitespace(pattern: &str) -> ParserI<Parser> {
2326 let p = ParserBuilder::new().ignore_whitespace(true).build();
2327 ParserI::new(p, pattern)
2328 }
2329
2330 /// Short alias for creating a new span.
nspan(start: Position, end: Position) -> Span2331 fn nspan(start: Position, end: Position) -> Span {
2332 Span::new(start, end)
2333 }
2334
2335 /// Short alias for creating a new position.
npos(offset: usize, line: usize, column: usize) -> Position2336 fn npos(offset: usize, line: usize, column: usize) -> Position {
2337 Position::new(offset, line, column)
2338 }
2339
2340 /// Create a new span from the given offset range. This assumes a single
2341 /// line and sets the columns based on the offsets. i.e., This only works
2342 /// out of the box for ASCII, which is fine for most tests.
span(range: Range<usize>) -> Span2343 fn span(range: Range<usize>) -> Span {
2344 let start = Position::new(range.start, 1, range.start + 1);
2345 let end = Position::new(range.end, 1, range.end + 1);
2346 Span::new(start, end)
2347 }
2348
2349 /// Create a new span for the corresponding byte range in the given string.
span_range(subject: &str, range: Range<usize>) -> Span2350 fn span_range(subject: &str, range: Range<usize>) -> Span {
2351 let start = Position {
2352 offset: range.start,
2353 line: 1 + subject[..range.start].matches('\n').count(),
2354 column: 1 + subject[..range.start]
2355 .chars()
2356 .rev()
2357 .position(|c| c == '\n')
2358 .unwrap_or(subject[..range.start].chars().count()),
2359 };
2360 let end = Position {
2361 offset: range.end,
2362 line: 1 + subject[..range.end].matches('\n').count(),
2363 column: 1 + subject[..range.end]
2364 .chars()
2365 .rev()
2366 .position(|c| c == '\n')
2367 .unwrap_or(subject[..range.end].chars().count()),
2368 };
2369 Span::new(start, end)
2370 }
2371
2372 /// Create a verbatim literal starting at the given position.
lit(c: char, start: usize) -> Ast2373 fn lit(c: char, start: usize) -> Ast {
2374 lit_with(c, span(start..start + c.len_utf8()))
2375 }
2376
2377 /// Create a punctuation literal starting at the given position.
punct_lit(c: char, span: Span) -> Ast2378 fn punct_lit(c: char, span: Span) -> Ast {
2379 Ast::Literal(ast::Literal {
2380 span: span,
2381 kind: ast::LiteralKind::Punctuation,
2382 c: c,
2383 })
2384 }
2385
2386 /// Create a verbatim literal with the given span.
lit_with(c: char, span: Span) -> Ast2387 fn lit_with(c: char, span: Span) -> Ast {
2388 Ast::Literal(ast::Literal {
2389 span: span,
2390 kind: ast::LiteralKind::Verbatim,
2391 c: c,
2392 })
2393 }
2394
2395 /// Create a concatenation with the given range.
concat(range: Range<usize>, asts: Vec<Ast>) -> Ast2396 fn concat(range: Range<usize>, asts: Vec<Ast>) -> Ast {
2397 concat_with(span(range), asts)
2398 }
2399
2400 /// Create a concatenation with the given span.
concat_with(span: Span, asts: Vec<Ast>) -> Ast2401 fn concat_with(span: Span, asts: Vec<Ast>) -> Ast {
2402 Ast::Concat(ast::Concat { span: span, asts: asts })
2403 }
2404
2405 /// Create an alternation with the given span.
alt(range: Range<usize>, asts: Vec<Ast>) -> Ast2406 fn alt(range: Range<usize>, asts: Vec<Ast>) -> Ast {
2407 Ast::Alternation(ast::Alternation { span: span(range), asts: asts })
2408 }
2409
2410 /// Create a capturing group with the given span.
group(range: Range<usize>, index: u32, ast: Ast) -> Ast2411 fn group(range: Range<usize>, index: u32, ast: Ast) -> Ast {
2412 Ast::Group(ast::Group {
2413 span: span(range),
2414 kind: ast::GroupKind::CaptureIndex(index),
2415 ast: Box::new(ast),
2416 })
2417 }
2418
2419 /// Create an ast::SetFlags.
2420 ///
2421 /// The given pattern should be the full pattern string. The range given
2422 /// should correspond to the byte offsets where the flag set occurs.
2423 ///
2424 /// If negated is true, then the set is interpreted as beginning with a
2425 /// negation.
flag_set( pat: &str, range: Range<usize>, flag: ast::Flag, negated: bool, ) -> Ast2426 fn flag_set(
2427 pat: &str,
2428 range: Range<usize>,
2429 flag: ast::Flag,
2430 negated: bool,
2431 ) -> Ast {
2432 let mut items = vec![
2433 ast::FlagsItem {
2434 span: span_range(pat, (range.end - 2)..(range.end - 1)),
2435 kind: ast::FlagsItemKind::Flag(flag),
2436 },
2437 ];
2438 if negated {
2439 items.insert(0, ast::FlagsItem {
2440 span: span_range(pat, (range.start + 2)..(range.end - 2)),
2441 kind: ast::FlagsItemKind::Negation,
2442 });
2443 }
2444 Ast::Flags(ast::SetFlags {
2445 span: span_range(pat, range.clone()),
2446 flags: ast::Flags {
2447 span: span_range(pat, (range.start + 2)..(range.end - 1)),
2448 items: items,
2449 },
2450 })
2451 }
2452
2453 #[test]
parse_nest_limit()2454 fn parse_nest_limit() {
2455 // A nest limit of 0 still allows some types of regexes.
2456 assert_eq!(
2457 parser_nest_limit("", 0).parse(),
2458 Ok(Ast::Empty(span(0..0))));
2459 assert_eq!(
2460 parser_nest_limit("a", 0).parse(),
2461 Ok(lit('a', 0)));
2462
2463 // Test repetition operations, which require one level of nesting.
2464 assert_eq!(
2465 parser_nest_limit("a+", 0).parse().unwrap_err(),
2466 TestError {
2467 span: span(0..2),
2468 kind: ast::ErrorKind::NestLimitExceeded(0),
2469 });
2470 assert_eq!(
2471 parser_nest_limit("a+", 1).parse(),
2472 Ok(Ast::Repetition(ast::Repetition {
2473 span: span(0..2),
2474 op: ast::RepetitionOp {
2475 span: span(1..2),
2476 kind: ast::RepetitionKind::OneOrMore,
2477 },
2478 greedy: true,
2479 ast: Box::new(lit('a', 0)),
2480 })));
2481 assert_eq!(
2482 parser_nest_limit("(a)+", 1).parse().unwrap_err(),
2483 TestError {
2484 span: span(0..3),
2485 kind: ast::ErrorKind::NestLimitExceeded(1),
2486 });
2487 assert_eq!(
2488 parser_nest_limit("a+*", 1).parse().unwrap_err(),
2489 TestError {
2490 span: span(0..2),
2491 kind: ast::ErrorKind::NestLimitExceeded(1),
2492 });
2493 assert_eq!(
2494 parser_nest_limit("a+*", 2).parse(),
2495 Ok(Ast::Repetition(ast::Repetition {
2496 span: span(0..3),
2497 op: ast::RepetitionOp {
2498 span: span(2..3),
2499 kind: ast::RepetitionKind::ZeroOrMore,
2500 },
2501 greedy: true,
2502 ast: Box::new(Ast::Repetition(ast::Repetition {
2503 span: span(0..2),
2504 op: ast::RepetitionOp {
2505 span: span(1..2),
2506 kind: ast::RepetitionKind::OneOrMore,
2507 },
2508 greedy: true,
2509 ast: Box::new(lit('a', 0)),
2510 })),
2511 })));
2512
2513 // Test concatenations. A concatenation requires one level of nesting.
2514 assert_eq!(
2515 parser_nest_limit("ab", 0).parse().unwrap_err(),
2516 TestError {
2517 span: span(0..2),
2518 kind: ast::ErrorKind::NestLimitExceeded(0),
2519 });
2520 assert_eq!(
2521 parser_nest_limit("ab", 1).parse(),
2522 Ok(concat(0..2, vec![lit('a', 0), lit('b', 1)])));
2523 assert_eq!(
2524 parser_nest_limit("abc", 1).parse(),
2525 Ok(concat(0..3, vec![lit('a', 0), lit('b', 1), lit('c', 2)])));
2526
2527 // Test alternations. An alternation requires one level of nesting.
2528 assert_eq!(
2529 parser_nest_limit("a|b", 0).parse().unwrap_err(),
2530 TestError {
2531 span: span(0..3),
2532 kind: ast::ErrorKind::NestLimitExceeded(0),
2533 });
2534 assert_eq!(
2535 parser_nest_limit("a|b", 1).parse(),
2536 Ok(alt(0..3, vec![lit('a', 0), lit('b', 2)])));
2537 assert_eq!(
2538 parser_nest_limit("a|b|c", 1).parse(),
2539 Ok(alt(0..5, vec![lit('a', 0), lit('b', 2), lit('c', 4)])));
2540
2541 // Test character classes. Classes form their own mini-recursive
2542 // syntax!
2543 assert_eq!(
2544 parser_nest_limit("[a]", 0).parse().unwrap_err(),
2545 TestError {
2546 span: span(0..3),
2547 kind: ast::ErrorKind::NestLimitExceeded(0),
2548 });
2549 assert_eq!(
2550 parser_nest_limit("[a]", 1).parse(),
2551 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
2552 span: span(0..3),
2553 negated: false,
2554 kind: ast::ClassSet::Item(
2555 ast::ClassSetItem::Literal(ast::Literal {
2556 span: span(1..2),
2557 kind: ast::LiteralKind::Verbatim,
2558 c: 'a',
2559 })
2560 ),
2561 }))));
2562 assert_eq!(
2563 parser_nest_limit("[ab]", 1).parse().unwrap_err(),
2564 TestError {
2565 span: span(1..3),
2566 kind: ast::ErrorKind::NestLimitExceeded(1),
2567 });
2568 assert_eq!(
2569 parser_nest_limit("[ab[cd]]", 2).parse().unwrap_err(),
2570 TestError {
2571 span: span(3..7),
2572 kind: ast::ErrorKind::NestLimitExceeded(2),
2573 });
2574 assert_eq!(
2575 parser_nest_limit("[ab[cd]]", 3).parse().unwrap_err(),
2576 TestError {
2577 span: span(4..6),
2578 kind: ast::ErrorKind::NestLimitExceeded(3),
2579 });
2580 assert_eq!(
2581 parser_nest_limit("[a--b]", 1).parse().unwrap_err(),
2582 TestError {
2583 span: span(1..5),
2584 kind: ast::ErrorKind::NestLimitExceeded(1),
2585 });
2586 assert_eq!(
2587 parser_nest_limit("[a--bc]", 2).parse().unwrap_err(),
2588 TestError {
2589 span: span(4..6),
2590 kind: ast::ErrorKind::NestLimitExceeded(2),
2591 });
2592 }
2593
2594 #[test]
parse_comments()2595 fn parse_comments() {
2596 let pat = "(?x)
2597 # This is comment 1.
2598 foo # This is comment 2.
2599 # This is comment 3.
2600 bar
2601 # This is comment 4.";
2602 let astc = parser(pat).parse_with_comments().unwrap();
2603 assert_eq!(
2604 astc.ast,
2605 concat_with(span_range(pat, 0..pat.len()), vec![
2606 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2607 lit_with('f', span_range(pat, 26..27)),
2608 lit_with('o', span_range(pat, 27..28)),
2609 lit_with('o', span_range(pat, 28..29)),
2610 lit_with('b', span_range(pat, 74..75)),
2611 lit_with('a', span_range(pat, 75..76)),
2612 lit_with('r', span_range(pat, 76..77)),
2613 ]));
2614 assert_eq!(astc.comments, vec![
2615 ast::Comment {
2616 span: span_range(pat, 5..26),
2617 comment: s(" This is comment 1."),
2618 },
2619 ast::Comment {
2620 span: span_range(pat, 30..51),
2621 comment: s(" This is comment 2."),
2622 },
2623 ast::Comment {
2624 span: span_range(pat, 53..74),
2625 comment: s(" This is comment 3."),
2626 },
2627 ast::Comment {
2628 span: span_range(pat, 78..98),
2629 comment: s(" This is comment 4."),
2630 },
2631 ]);
2632 }
2633
2634 #[test]
parse_holistic()2635 fn parse_holistic() {
2636 assert_eq!(
2637 parser("]").parse(),
2638 Ok(lit(']', 0)));
2639 assert_eq!(
2640 parser(r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#\&\-\~").parse(),
2641 Ok(concat(0..36, vec![
2642 punct_lit('\\', span(0..2)),
2643 punct_lit('.', span(2..4)),
2644 punct_lit('+', span(4..6)),
2645 punct_lit('*', span(6..8)),
2646 punct_lit('?', span(8..10)),
2647 punct_lit('(', span(10..12)),
2648 punct_lit(')', span(12..14)),
2649 punct_lit('|', span(14..16)),
2650 punct_lit('[', span(16..18)),
2651 punct_lit(']', span(18..20)),
2652 punct_lit('{', span(20..22)),
2653 punct_lit('}', span(22..24)),
2654 punct_lit('^', span(24..26)),
2655 punct_lit('$', span(26..28)),
2656 punct_lit('#', span(28..30)),
2657 punct_lit('&', span(30..32)),
2658 punct_lit('-', span(32..34)),
2659 punct_lit('~', span(34..36)),
2660 ])));
2661 }
2662
2663 #[test]
parse_ignore_whitespace()2664 fn parse_ignore_whitespace() {
2665 // Test that basic whitespace insensitivity works.
2666 let pat = "(?x)a b";
2667 assert_eq!(
2668 parser(pat).parse(),
2669 Ok(concat_with(nspan(npos(0, 1, 1), npos(7, 1, 8)), vec![
2670 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2671 lit_with('a', nspan(npos(4, 1, 5), npos(5, 1, 6))),
2672 lit_with('b', nspan(npos(6, 1, 7), npos(7, 1, 8))),
2673 ])));
2674
2675 // Test that we can toggle whitespace insensitivity.
2676 let pat = "(?x)a b(?-x)a b";
2677 assert_eq!(
2678 parser(pat).parse(),
2679 Ok(concat_with(nspan(npos(0, 1, 1), npos(15, 1, 16)), vec![
2680 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2681 lit_with('a', nspan(npos(4, 1, 5), npos(5, 1, 6))),
2682 lit_with('b', nspan(npos(6, 1, 7), npos(7, 1, 8))),
2683 flag_set(pat, 7..12, ast::Flag::IgnoreWhitespace, true),
2684 lit_with('a', nspan(npos(12, 1, 13), npos(13, 1, 14))),
2685 lit_with(' ', nspan(npos(13, 1, 14), npos(14, 1, 15))),
2686 lit_with('b', nspan(npos(14, 1, 15), npos(15, 1, 16))),
2687 ])));
2688
2689 // Test that nesting whitespace insensitive flags works.
2690 let pat = "a (?x:a )a ";
2691 assert_eq!(
2692 parser(pat).parse(),
2693 Ok(concat_with(span_range(pat, 0..11), vec![
2694 lit_with('a', span_range(pat, 0..1)),
2695 lit_with(' ', span_range(pat, 1..2)),
2696 Ast::Group(ast::Group {
2697 span: span_range(pat, 2..9),
2698 kind: ast::GroupKind::NonCapturing(ast::Flags {
2699 span: span_range(pat, 4..5),
2700 items: vec![
2701 ast::FlagsItem {
2702 span: span_range(pat, 4..5),
2703 kind: ast::FlagsItemKind::Flag(
2704 ast::Flag::IgnoreWhitespace),
2705 },
2706 ],
2707 }),
2708 ast: Box::new(lit_with('a', span_range(pat, 6..7))),
2709 }),
2710 lit_with('a', span_range(pat, 9..10)),
2711 lit_with(' ', span_range(pat, 10..11)),
2712 ])));
2713
2714 // Test that whitespace after an opening paren is insignificant.
2715 let pat = "(?x)( ?P<foo> a )";
2716 assert_eq!(
2717 parser(pat).parse(),
2718 Ok(concat_with(span_range(pat, 0..pat.len()), vec![
2719 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2720 Ast::Group(ast::Group {
2721 span: span_range(pat, 4..pat.len()),
2722 kind: ast::GroupKind::CaptureName(ast::CaptureName {
2723 span: span_range(pat, 9..12),
2724 name: s("foo"),
2725 index: 1,
2726 }),
2727 ast: Box::new(lit_with('a', span_range(pat, 14..15))),
2728 }),
2729 ])));
2730 let pat = "(?x)( a )";
2731 assert_eq!(
2732 parser(pat).parse(),
2733 Ok(concat_with(span_range(pat, 0..pat.len()), vec![
2734 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2735 Ast::Group(ast::Group {
2736 span: span_range(pat, 4..pat.len()),
2737 kind: ast::GroupKind::CaptureIndex(1),
2738 ast: Box::new(lit_with('a', span_range(pat, 7..8))),
2739 }),
2740 ])));
2741 let pat = "(?x)( ?: a )";
2742 assert_eq!(
2743 parser(pat).parse(),
2744 Ok(concat_with(span_range(pat, 0..pat.len()), vec![
2745 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2746 Ast::Group(ast::Group {
2747 span: span_range(pat, 4..pat.len()),
2748 kind: ast::GroupKind::NonCapturing(ast::Flags {
2749 span: span_range(pat, 8..8),
2750 items: vec![],
2751 }),
2752 ast: Box::new(lit_with('a', span_range(pat, 11..12))),
2753 }),
2754 ])));
2755 let pat = r"(?x)\x { 53 }";
2756 assert_eq!(
2757 parser(pat).parse(),
2758 Ok(concat_with(span_range(pat, 0..pat.len()), vec![
2759 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2760 Ast::Literal(ast::Literal {
2761 span: span(4..13),
2762 kind: ast::LiteralKind::HexBrace(ast::HexLiteralKind::X),
2763 c: 'S',
2764 }),
2765 ])));
2766
2767 // Test that whitespace after an escape is OK.
2768 let pat = r"(?x)\ ";
2769 assert_eq!(
2770 parser(pat).parse(),
2771 Ok(concat_with(span_range(pat, 0..pat.len()), vec![
2772 flag_set(pat, 0..4, ast::Flag::IgnoreWhitespace, false),
2773 Ast::Literal(ast::Literal {
2774 span: span_range(pat, 4..6),
2775 kind: ast::LiteralKind::Special(
2776 ast::SpecialLiteralKind::Space),
2777 c: ' ',
2778 }),
2779 ])));
2780 // ... but only when `x` mode is enabled.
2781 let pat = r"\ ";
2782 assert_eq!(
2783 parser(pat).parse().unwrap_err(),
2784 TestError {
2785 span: span_range(pat, 0..2),
2786 kind: ast::ErrorKind::EscapeUnrecognized,
2787 });
2788 }
2789
2790 #[test]
parse_newlines()2791 fn parse_newlines() {
2792 let pat = ".\n.";
2793 assert_eq!(
2794 parser(pat).parse(),
2795 Ok(concat_with(span_range(pat, 0..3), vec![
2796 Ast::Dot(span_range(pat, 0..1)),
2797 lit_with('\n', span_range(pat, 1..2)),
2798 Ast::Dot(span_range(pat, 2..3)),
2799 ])));
2800
2801 let pat = "foobar\nbaz\nquux\n";
2802 assert_eq!(
2803 parser(pat).parse(),
2804 Ok(concat_with(span_range(pat, 0..pat.len()), vec![
2805 lit_with('f', nspan(npos(0, 1, 1), npos(1, 1, 2))),
2806 lit_with('o', nspan(npos(1, 1, 2), npos(2, 1, 3))),
2807 lit_with('o', nspan(npos(2, 1, 3), npos(3, 1, 4))),
2808 lit_with('b', nspan(npos(3, 1, 4), npos(4, 1, 5))),
2809 lit_with('a', nspan(npos(4, 1, 5), npos(5, 1, 6))),
2810 lit_with('r', nspan(npos(5, 1, 6), npos(6, 1, 7))),
2811 lit_with('\n', nspan(npos(6, 1, 7), npos(7, 2, 1))),
2812 lit_with('b', nspan(npos(7, 2, 1), npos(8, 2, 2))),
2813 lit_with('a', nspan(npos(8, 2, 2), npos(9, 2, 3))),
2814 lit_with('z', nspan(npos(9, 2, 3), npos(10, 2, 4))),
2815 lit_with('\n', nspan(npos(10, 2, 4), npos(11, 3, 1))),
2816 lit_with('q', nspan(npos(11, 3, 1), npos(12, 3, 2))),
2817 lit_with('u', nspan(npos(12, 3, 2), npos(13, 3, 3))),
2818 lit_with('u', nspan(npos(13, 3, 3), npos(14, 3, 4))),
2819 lit_with('x', nspan(npos(14, 3, 4), npos(15, 3, 5))),
2820 lit_with('\n', nspan(npos(15, 3, 5), npos(16, 4, 1))),
2821 ])));
2822 }
2823
2824 #[test]
parse_uncounted_repetition()2825 fn parse_uncounted_repetition() {
2826 assert_eq!(
2827 parser(r"a*").parse(),
2828 Ok(Ast::Repetition(ast::Repetition {
2829 span: span(0..2),
2830 op: ast::RepetitionOp {
2831 span: span(1..2),
2832 kind: ast::RepetitionKind::ZeroOrMore,
2833 },
2834 greedy: true,
2835 ast: Box::new(lit('a', 0)),
2836 })));
2837 assert_eq!(
2838 parser(r"a+").parse(),
2839 Ok(Ast::Repetition(ast::Repetition {
2840 span: span(0..2),
2841 op: ast::RepetitionOp {
2842 span: span(1..2),
2843 kind: ast::RepetitionKind::OneOrMore,
2844 },
2845 greedy: true,
2846 ast: Box::new(lit('a', 0)),
2847 })));
2848
2849 assert_eq!(
2850 parser(r"a?").parse(),
2851 Ok(Ast::Repetition(ast::Repetition {
2852 span: span(0..2),
2853 op: ast::RepetitionOp {
2854 span: span(1..2),
2855 kind: ast::RepetitionKind::ZeroOrOne,
2856 },
2857 greedy: true,
2858 ast: Box::new(lit('a', 0)),
2859 })));
2860 assert_eq!(
2861 parser(r"a??").parse(),
2862 Ok(Ast::Repetition(ast::Repetition {
2863 span: span(0..3),
2864 op: ast::RepetitionOp {
2865 span: span(1..3),
2866 kind: ast::RepetitionKind::ZeroOrOne,
2867 },
2868 greedy: false,
2869 ast: Box::new(lit('a', 0)),
2870 })));
2871 assert_eq!(
2872 parser(r"a?").parse(),
2873 Ok(Ast::Repetition(ast::Repetition {
2874 span: span(0..2),
2875 op: ast::RepetitionOp {
2876 span: span(1..2),
2877 kind: ast::RepetitionKind::ZeroOrOne,
2878 },
2879 greedy: true,
2880 ast: Box::new(lit('a', 0)),
2881 })));
2882 assert_eq!(
2883 parser(r"a?b").parse(),
2884 Ok(concat(0..3, vec![
2885 Ast::Repetition(ast::Repetition {
2886 span: span(0..2),
2887 op: ast::RepetitionOp {
2888 span: span(1..2),
2889 kind: ast::RepetitionKind::ZeroOrOne,
2890 },
2891 greedy: true,
2892 ast: Box::new(lit('a', 0)),
2893 }),
2894 lit('b', 2),
2895 ])));
2896 assert_eq!(
2897 parser(r"a??b").parse(),
2898 Ok(concat(0..4, vec![
2899 Ast::Repetition(ast::Repetition {
2900 span: span(0..3),
2901 op: ast::RepetitionOp {
2902 span: span(1..3),
2903 kind: ast::RepetitionKind::ZeroOrOne,
2904 },
2905 greedy: false,
2906 ast: Box::new(lit('a', 0)),
2907 }),
2908 lit('b', 3),
2909 ])));
2910 assert_eq!(
2911 parser(r"ab?").parse(),
2912 Ok(concat(0..3, vec![
2913 lit('a', 0),
2914 Ast::Repetition(ast::Repetition {
2915 span: span(1..3),
2916 op: ast::RepetitionOp {
2917 span: span(2..3),
2918 kind: ast::RepetitionKind::ZeroOrOne,
2919 },
2920 greedy: true,
2921 ast: Box::new(lit('b', 1)),
2922 }),
2923 ])));
2924 assert_eq!(
2925 parser(r"(ab)?").parse(),
2926 Ok(Ast::Repetition(ast::Repetition {
2927 span: span(0..5),
2928 op: ast::RepetitionOp {
2929 span: span(4..5),
2930 kind: ast::RepetitionKind::ZeroOrOne,
2931 },
2932 greedy: true,
2933 ast: Box::new(group(0..4, 1, concat(1..3, vec![
2934 lit('a', 1),
2935 lit('b', 2),
2936 ]))),
2937 })));
2938 assert_eq!(
2939 parser(r"|a?").parse(),
2940 Ok(alt(0..3, vec![
2941 Ast::Empty(span(0..0)),
2942 Ast::Repetition(ast::Repetition {
2943 span: span(1..3),
2944 op: ast::RepetitionOp {
2945 span: span(2..3),
2946 kind: ast::RepetitionKind::ZeroOrOne,
2947 },
2948 greedy: true,
2949 ast: Box::new(lit('a', 1)),
2950 }),
2951 ])));
2952
2953 assert_eq!(
2954 parser(r"*").parse().unwrap_err(),
2955 TestError {
2956 span: span(0..0),
2957 kind: ast::ErrorKind::RepetitionMissing,
2958 });
2959 assert_eq!(
2960 parser(r"(?i)*").parse().unwrap_err(),
2961 TestError {
2962 span: span(4..4),
2963 kind: ast::ErrorKind::RepetitionMissing,
2964 });
2965 assert_eq!(
2966 parser(r"(*)").parse().unwrap_err(),
2967 TestError {
2968 span: span(1..1),
2969 kind: ast::ErrorKind::RepetitionMissing,
2970 });
2971 assert_eq!(
2972 parser(r"(?:?)").parse().unwrap_err(),
2973 TestError {
2974 span: span(3..3),
2975 kind: ast::ErrorKind::RepetitionMissing,
2976 });
2977 assert_eq!(
2978 parser(r"+").parse().unwrap_err(),
2979 TestError {
2980 span: span(0..0),
2981 kind: ast::ErrorKind::RepetitionMissing,
2982 });
2983 assert_eq!(
2984 parser(r"?").parse().unwrap_err(),
2985 TestError {
2986 span: span(0..0),
2987 kind: ast::ErrorKind::RepetitionMissing,
2988 });
2989 assert_eq!(
2990 parser(r"(?)").parse().unwrap_err(),
2991 TestError {
2992 span: span(1..1),
2993 kind: ast::ErrorKind::RepetitionMissing,
2994 });
2995 assert_eq!(
2996 parser(r"|*").parse().unwrap_err(),
2997 TestError {
2998 span: span(1..1),
2999 kind: ast::ErrorKind::RepetitionMissing,
3000 });
3001 assert_eq!(
3002 parser(r"|+").parse().unwrap_err(),
3003 TestError {
3004 span: span(1..1),
3005 kind: ast::ErrorKind::RepetitionMissing,
3006 });
3007 assert_eq!(
3008 parser(r"|?").parse().unwrap_err(),
3009 TestError {
3010 span: span(1..1),
3011 kind: ast::ErrorKind::RepetitionMissing,
3012 });
3013 }
3014
3015 #[test]
parse_counted_repetition()3016 fn parse_counted_repetition() {
3017 assert_eq!(
3018 parser(r"a{5}").parse(),
3019 Ok(Ast::Repetition(ast::Repetition {
3020 span: span(0..4),
3021 op: ast::RepetitionOp {
3022 span: span(1..4),
3023 kind: ast::RepetitionKind::Range(
3024 ast::RepetitionRange::Exactly(5)),
3025 },
3026 greedy: true,
3027 ast: Box::new(lit('a', 0)),
3028 })));
3029 assert_eq!(
3030 parser(r"a{5,}").parse(),
3031 Ok(Ast::Repetition(ast::Repetition {
3032 span: span(0..5),
3033 op: ast::RepetitionOp {
3034 span: span(1..5),
3035 kind: ast::RepetitionKind::Range(
3036 ast::RepetitionRange::AtLeast(5)),
3037 },
3038 greedy: true,
3039 ast: Box::new(lit('a', 0)),
3040 })));
3041 assert_eq!(
3042 parser(r"a{5,9}").parse(),
3043 Ok(Ast::Repetition(ast::Repetition {
3044 span: span(0..6),
3045 op: ast::RepetitionOp {
3046 span: span(1..6),
3047 kind: ast::RepetitionKind::Range(
3048 ast::RepetitionRange::Bounded(5, 9)),
3049 },
3050 greedy: true,
3051 ast: Box::new(lit('a', 0)),
3052 })));
3053 assert_eq!(
3054 parser(r"a{5}?").parse(),
3055 Ok(Ast::Repetition(ast::Repetition {
3056 span: span(0..5),
3057 op: ast::RepetitionOp {
3058 span: span(1..5),
3059 kind: ast::RepetitionKind::Range(
3060 ast::RepetitionRange::Exactly(5)),
3061 },
3062 greedy: false,
3063 ast: Box::new(lit('a', 0)),
3064 })));
3065 assert_eq!(
3066 parser(r"ab{5}").parse(),
3067 Ok(concat(0..5, vec![
3068 lit('a', 0),
3069 Ast::Repetition(ast::Repetition {
3070 span: span(1..5),
3071 op: ast::RepetitionOp {
3072 span: span(2..5),
3073 kind: ast::RepetitionKind::Range(
3074 ast::RepetitionRange::Exactly(5)),
3075 },
3076 greedy: true,
3077 ast: Box::new(lit('b', 1)),
3078 }),
3079 ])));
3080 assert_eq!(
3081 parser(r"ab{5}c").parse(),
3082 Ok(concat(0..6, vec![
3083 lit('a', 0),
3084 Ast::Repetition(ast::Repetition {
3085 span: span(1..5),
3086 op: ast::RepetitionOp {
3087 span: span(2..5),
3088 kind: ast::RepetitionKind::Range(
3089 ast::RepetitionRange::Exactly(5)),
3090 },
3091 greedy: true,
3092 ast: Box::new(lit('b', 1)),
3093 }),
3094 lit('c', 5),
3095 ])));
3096
3097 assert_eq!(
3098 parser(r"a{ 5 }").parse(),
3099 Ok(Ast::Repetition(ast::Repetition {
3100 span: span(0..6),
3101 op: ast::RepetitionOp {
3102 span: span(1..6),
3103 kind: ast::RepetitionKind::Range(
3104 ast::RepetitionRange::Exactly(5)),
3105 },
3106 greedy: true,
3107 ast: Box::new(lit('a', 0)),
3108 })));
3109 assert_eq!(
3110 parser(r"a{ 5 , 9 }").parse(),
3111 Ok(Ast::Repetition(ast::Repetition {
3112 span: span(0..10),
3113 op: ast::RepetitionOp {
3114 span: span(1..10),
3115 kind: ast::RepetitionKind::Range(
3116 ast::RepetitionRange::Bounded(5, 9)),
3117 },
3118 greedy: true,
3119 ast: Box::new(lit('a', 0)),
3120 })));
3121 assert_eq!(
3122 parser_ignore_whitespace(r"a{5,9} ?").parse(),
3123 Ok(Ast::Repetition(ast::Repetition {
3124 span: span(0..8),
3125 op: ast::RepetitionOp {
3126 span: span(1..8),
3127 kind: ast::RepetitionKind::Range(
3128 ast::RepetitionRange::Bounded(5, 9)),
3129 },
3130 greedy: false,
3131 ast: Box::new(lit('a', 0)),
3132 })));
3133
3134 assert_eq!(
3135 parser(r"(?i){0}").parse().unwrap_err(),
3136 TestError {
3137 span: span(4..4),
3138 kind: ast::ErrorKind::RepetitionMissing,
3139 });
3140 assert_eq!(
3141 parser(r"(?m){1,1}").parse().unwrap_err(),
3142 TestError {
3143 span: span(4..4),
3144 kind: ast::ErrorKind::RepetitionMissing,
3145 });
3146 assert_eq!(
3147 parser(r"a{").parse().unwrap_err(),
3148 TestError {
3149 span: span(1..2),
3150 kind: ast::ErrorKind::RepetitionCountUnclosed,
3151 });
3152 assert_eq!(
3153 parser(r"a{}").parse().unwrap_err(),
3154 TestError {
3155 span: span(2..2),
3156 kind: ast::ErrorKind::DecimalEmpty,
3157 });
3158 assert_eq!(
3159 parser(r"a{a").parse().unwrap_err(),
3160 TestError {
3161 span: span(2..2),
3162 kind: ast::ErrorKind::DecimalEmpty,
3163 });
3164 assert_eq!(
3165 parser(r"a{9999999999}").parse().unwrap_err(),
3166 TestError {
3167 span: span(2..12),
3168 kind: ast::ErrorKind::DecimalInvalid,
3169 });
3170 assert_eq!(
3171 parser(r"a{9").parse().unwrap_err(),
3172 TestError {
3173 span: span(1..3),
3174 kind: ast::ErrorKind::RepetitionCountUnclosed,
3175 });
3176 assert_eq!(
3177 parser(r"a{9,a").parse().unwrap_err(),
3178 TestError {
3179 span: span(4..4),
3180 kind: ast::ErrorKind::DecimalEmpty,
3181 });
3182 assert_eq!(
3183 parser(r"a{9,9999999999}").parse().unwrap_err(),
3184 TestError {
3185 span: span(4..14),
3186 kind: ast::ErrorKind::DecimalInvalid,
3187 });
3188 assert_eq!(
3189 parser(r"a{9,").parse().unwrap_err(),
3190 TestError {
3191 span: span(1..4),
3192 kind: ast::ErrorKind::RepetitionCountUnclosed,
3193 });
3194 assert_eq!(
3195 parser(r"a{9,11").parse().unwrap_err(),
3196 TestError {
3197 span: span(1..6),
3198 kind: ast::ErrorKind::RepetitionCountUnclosed,
3199 });
3200 assert_eq!(
3201 parser(r"a{2,1}").parse().unwrap_err(),
3202 TestError {
3203 span: span(1..6),
3204 kind: ast::ErrorKind::RepetitionCountInvalid,
3205 });
3206 assert_eq!(
3207 parser(r"{5}").parse().unwrap_err(),
3208 TestError {
3209 span: span(0..0),
3210 kind: ast::ErrorKind::RepetitionMissing,
3211 });
3212 assert_eq!(
3213 parser(r"|{5}").parse().unwrap_err(),
3214 TestError {
3215 span: span(1..1),
3216 kind: ast::ErrorKind::RepetitionMissing,
3217 });
3218 }
3219
3220 #[test]
parse_alternate()3221 fn parse_alternate() {
3222 assert_eq!(
3223 parser(r"a|b").parse(),
3224 Ok(Ast::Alternation(ast::Alternation {
3225 span: span(0..3),
3226 asts: vec![lit('a', 0), lit('b', 2)],
3227 })));
3228 assert_eq!(
3229 parser(r"(a|b)").parse(),
3230 Ok(group(0..5, 1, Ast::Alternation(ast::Alternation {
3231 span: span(1..4),
3232 asts: vec![lit('a', 1), lit('b', 3)],
3233 }))));
3234
3235 assert_eq!(
3236 parser(r"a|b|c").parse(),
3237 Ok(Ast::Alternation(ast::Alternation {
3238 span: span(0..5),
3239 asts: vec![lit('a', 0), lit('b', 2), lit('c', 4)],
3240 })));
3241 assert_eq!(
3242 parser(r"ax|by|cz").parse(),
3243 Ok(Ast::Alternation(ast::Alternation {
3244 span: span(0..8),
3245 asts: vec![
3246 concat(0..2, vec![lit('a', 0), lit('x', 1)]),
3247 concat(3..5, vec![lit('b', 3), lit('y', 4)]),
3248 concat(6..8, vec![lit('c', 6), lit('z', 7)]),
3249 ],
3250 })));
3251 assert_eq!(
3252 parser(r"(ax|by|cz)").parse(),
3253 Ok(group(0..10, 1, Ast::Alternation(ast::Alternation {
3254 span: span(1..9),
3255 asts: vec![
3256 concat(1..3, vec![lit('a', 1), lit('x', 2)]),
3257 concat(4..6, vec![lit('b', 4), lit('y', 5)]),
3258 concat(7..9, vec![lit('c', 7), lit('z', 8)]),
3259 ],
3260 }))));
3261 assert_eq!(
3262 parser(r"(ax|(by|(cz)))").parse(),
3263 Ok(group(0..14, 1, alt(1..13, vec![
3264 concat(1..3, vec![lit('a', 1), lit('x', 2)]),
3265 group(4..13, 2, alt(5..12, vec![
3266 concat(5..7, vec![lit('b', 5), lit('y', 6)]),
3267 group(8..12, 3, concat(9..11, vec![
3268 lit('c', 9),
3269 lit('z', 10),
3270 ])),
3271 ])),
3272 ]))));
3273
3274 assert_eq!(
3275 parser(r"|").parse(), Ok(alt(0..1, vec![
3276 Ast::Empty(span(0..0)), Ast::Empty(span(1..1)),
3277 ])));
3278 assert_eq!(
3279 parser(r"||").parse(), Ok(alt(0..2, vec![
3280 Ast::Empty(span(0..0)),
3281 Ast::Empty(span(1..1)),
3282 Ast::Empty(span(2..2)),
3283 ])));
3284 assert_eq!(
3285 parser(r"a|").parse(), Ok(alt(0..2, vec![
3286 lit('a', 0), Ast::Empty(span(2..2)),
3287 ])));
3288 assert_eq!(
3289 parser(r"|a").parse(), Ok(alt(0..2, vec![
3290 Ast::Empty(span(0..0)), lit('a', 1),
3291 ])));
3292
3293 assert_eq!(
3294 parser(r"(|)").parse(), Ok(group(0..3, 1, alt(1..2, vec![
3295 Ast::Empty(span(1..1)), Ast::Empty(span(2..2)),
3296 ]))));
3297 assert_eq!(
3298 parser(r"(a|)").parse(), Ok(group(0..4, 1, alt(1..3, vec![
3299 lit('a', 1), Ast::Empty(span(3..3)),
3300 ]))));
3301 assert_eq!(
3302 parser(r"(|a)").parse(), Ok(group(0..4, 1, alt(1..3, vec![
3303 Ast::Empty(span(1..1)), lit('a', 2),
3304 ]))));
3305
3306 assert_eq!(
3307 parser(r"a|b)").parse().unwrap_err(),
3308 TestError {
3309 span: span(3..4),
3310 kind: ast::ErrorKind::GroupUnopened,
3311 });
3312 assert_eq!(
3313 parser(r"(a|b").parse().unwrap_err(),
3314 TestError {
3315 span: span(0..1),
3316 kind: ast::ErrorKind::GroupUnclosed,
3317 });
3318 }
3319
3320 #[test]
parse_unsupported_lookaround()3321 fn parse_unsupported_lookaround() {
3322 assert_eq!(
3323 parser(r"(?=a)").parse().unwrap_err(),
3324 TestError {
3325 span: span(0..3),
3326 kind: ast::ErrorKind::UnsupportedLookAround,
3327 });
3328 assert_eq!(
3329 parser(r"(?!a)").parse().unwrap_err(),
3330 TestError {
3331 span: span(0..3),
3332 kind: ast::ErrorKind::UnsupportedLookAround,
3333 });
3334 assert_eq!(
3335 parser(r"(?<=a)").parse().unwrap_err(),
3336 TestError {
3337 span: span(0..4),
3338 kind: ast::ErrorKind::UnsupportedLookAround,
3339 });
3340 assert_eq!(
3341 parser(r"(?<!a)").parse().unwrap_err(),
3342 TestError {
3343 span: span(0..4),
3344 kind: ast::ErrorKind::UnsupportedLookAround,
3345 });
3346 }
3347
3348 #[test]
parse_group()3349 fn parse_group() {
3350 assert_eq!(parser("(?i)").parse(), Ok(Ast::Flags(ast::SetFlags {
3351 span: span(0..4),
3352 flags: ast::Flags {
3353 span: span(2..3),
3354 items: vec![ast::FlagsItem {
3355 span: span(2..3),
3356 kind: ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive),
3357 }],
3358 },
3359 })));
3360 assert_eq!(parser("(?iU)").parse(), Ok(Ast::Flags(ast::SetFlags {
3361 span: span(0..5),
3362 flags: ast::Flags {
3363 span: span(2..4),
3364 items: vec![
3365 ast::FlagsItem {
3366 span: span(2..3),
3367 kind: ast::FlagsItemKind::Flag(
3368 ast::Flag::CaseInsensitive),
3369 },
3370 ast::FlagsItem {
3371 span: span(3..4),
3372 kind: ast::FlagsItemKind::Flag(ast::Flag::SwapGreed),
3373 },
3374 ],
3375 },
3376 })));
3377 assert_eq!(parser("(?i-U)").parse(), Ok(Ast::Flags(ast::SetFlags {
3378 span: span(0..6),
3379 flags: ast::Flags {
3380 span: span(2..5),
3381 items: vec![
3382 ast::FlagsItem {
3383 span: span(2..3),
3384 kind: ast::FlagsItemKind::Flag(
3385 ast::Flag::CaseInsensitive),
3386 },
3387 ast::FlagsItem {
3388 span: span(3..4),
3389 kind: ast::FlagsItemKind::Negation,
3390 },
3391 ast::FlagsItem {
3392 span: span(4..5),
3393 kind: ast::FlagsItemKind::Flag(ast::Flag::SwapGreed),
3394 },
3395 ],
3396 },
3397 })));
3398
3399 assert_eq!(parser("()").parse(), Ok(Ast::Group(ast::Group {
3400 span: span(0..2),
3401 kind: ast::GroupKind::CaptureIndex(1),
3402 ast: Box::new(Ast::Empty(span(1..1))),
3403 })));
3404 assert_eq!(parser("(a)").parse(), Ok(Ast::Group(ast::Group {
3405 span: span(0..3),
3406 kind: ast::GroupKind::CaptureIndex(1),
3407 ast: Box::new(lit('a', 1)),
3408 })));
3409 assert_eq!(parser("(())").parse(), Ok(Ast::Group(ast::Group {
3410 span: span(0..4),
3411 kind: ast::GroupKind::CaptureIndex(1),
3412 ast: Box::new(Ast::Group(ast::Group {
3413 span: span(1..3),
3414 kind: ast::GroupKind::CaptureIndex(2),
3415 ast: Box::new(Ast::Empty(span(2..2))),
3416 })),
3417 })));
3418
3419 assert_eq!(parser("(?:a)").parse(), Ok(Ast::Group(ast::Group {
3420 span: span(0..5),
3421 kind: ast::GroupKind::NonCapturing(ast::Flags {
3422 span: span(2..2),
3423 items: vec![],
3424 }),
3425 ast: Box::new(lit('a', 3)),
3426 })));
3427
3428 assert_eq!(parser("(?i:a)").parse(), Ok(Ast::Group(ast::Group {
3429 span: span(0..6),
3430 kind: ast::GroupKind::NonCapturing(ast::Flags {
3431 span: span(2..3),
3432 items: vec![
3433 ast::FlagsItem {
3434 span: span(2..3),
3435 kind: ast::FlagsItemKind::Flag(
3436 ast::Flag::CaseInsensitive),
3437 },
3438 ],
3439 }),
3440 ast: Box::new(lit('a', 4)),
3441 })));
3442 assert_eq!(parser("(?i-U:a)").parse(), Ok(Ast::Group(ast::Group {
3443 span: span(0..8),
3444 kind: ast::GroupKind::NonCapturing(ast::Flags {
3445 span: span(2..5),
3446 items: vec![
3447 ast::FlagsItem {
3448 span: span(2..3),
3449 kind: ast::FlagsItemKind::Flag(
3450 ast::Flag::CaseInsensitive),
3451 },
3452 ast::FlagsItem {
3453 span: span(3..4),
3454 kind: ast::FlagsItemKind::Negation,
3455 },
3456 ast::FlagsItem {
3457 span: span(4..5),
3458 kind: ast::FlagsItemKind::Flag(ast::Flag::SwapGreed),
3459 },
3460 ],
3461 }),
3462 ast: Box::new(lit('a', 6)),
3463 })));
3464
3465 assert_eq!(
3466 parser("(").parse().unwrap_err(),
3467 TestError {
3468 span: span(0..1),
3469 kind: ast::ErrorKind::GroupUnclosed,
3470 });
3471 assert_eq!(
3472 parser("(?").parse().unwrap_err(),
3473 TestError {
3474 span: span(0..1),
3475 kind: ast::ErrorKind::GroupUnclosed,
3476 });
3477 assert_eq!(
3478 parser("(?P").parse().unwrap_err(),
3479 TestError {
3480 span: span(2..3),
3481 kind: ast::ErrorKind::FlagUnrecognized,
3482 });
3483 assert_eq!(
3484 parser("(?P<").parse().unwrap_err(),
3485 TestError {
3486 span: span(4..4),
3487 kind: ast::ErrorKind::GroupNameUnexpectedEof,
3488 });
3489 assert_eq!(
3490 parser("(a").parse().unwrap_err(),
3491 TestError {
3492 span: span(0..1),
3493 kind: ast::ErrorKind::GroupUnclosed,
3494 });
3495 assert_eq!(
3496 parser("(()").parse().unwrap_err(),
3497 TestError {
3498 span: span(0..1),
3499 kind: ast::ErrorKind::GroupUnclosed,
3500 });
3501 assert_eq!(
3502 parser(")").parse().unwrap_err(),
3503 TestError {
3504 span: span(0..1),
3505 kind: ast::ErrorKind::GroupUnopened,
3506 });
3507 assert_eq!(
3508 parser("a)").parse().unwrap_err(),
3509 TestError {
3510 span: span(1..2),
3511 kind: ast::ErrorKind::GroupUnopened,
3512 });
3513 }
3514
3515 #[test]
parse_capture_name()3516 fn parse_capture_name() {
3517 assert_eq!(parser("(?P<a>z)").parse(), Ok(Ast::Group(ast::Group {
3518 span: span(0..8),
3519 kind: ast::GroupKind::CaptureName(ast::CaptureName {
3520 span: span(4..5),
3521 name: s("a"),
3522 index: 1,
3523 }),
3524 ast: Box::new(lit('z', 6)),
3525 })));
3526 assert_eq!(parser("(?P<abc>z)").parse(), Ok(Ast::Group(ast::Group {
3527 span: span(0..10),
3528 kind: ast::GroupKind::CaptureName(ast::CaptureName {
3529 span: span(4..7),
3530 name: s("abc"),
3531 index: 1,
3532 }),
3533 ast: Box::new(lit('z', 8)),
3534 })));
3535
3536 assert_eq!(
3537 parser("(?P<").parse().unwrap_err(),
3538 TestError {
3539 span: span(4..4),
3540 kind: ast::ErrorKind::GroupNameUnexpectedEof,
3541 });
3542 assert_eq!(
3543 parser("(?P<>z)").parse().unwrap_err(),
3544 TestError {
3545 span: span(4..4),
3546 kind: ast::ErrorKind::GroupNameEmpty,
3547 });
3548 assert_eq!(
3549 parser("(?P<a").parse().unwrap_err(),
3550 TestError {
3551 span: span(5..5),
3552 kind: ast::ErrorKind::GroupNameUnexpectedEof,
3553 });
3554 assert_eq!(
3555 parser("(?P<ab").parse().unwrap_err(),
3556 TestError {
3557 span: span(6..6),
3558 kind: ast::ErrorKind::GroupNameUnexpectedEof,
3559 });
3560 assert_eq!(
3561 parser("(?P<0a").parse().unwrap_err(),
3562 TestError {
3563 span: span(4..5),
3564 kind: ast::ErrorKind::GroupNameInvalid,
3565 });
3566 assert_eq!(
3567 parser("(?P<~").parse().unwrap_err(),
3568 TestError {
3569 span: span(4..5),
3570 kind: ast::ErrorKind::GroupNameInvalid,
3571 });
3572 assert_eq!(
3573 parser("(?P<abc~").parse().unwrap_err(),
3574 TestError {
3575 span: span(7..8),
3576 kind: ast::ErrorKind::GroupNameInvalid,
3577 });
3578 assert_eq!(
3579 parser("(?P<a>y)(?P<a>z)").parse().unwrap_err(),
3580 TestError {
3581 span: span(12..13),
3582 kind: ast::ErrorKind::GroupNameDuplicate {
3583 original: span(4..5),
3584 },
3585 });
3586 }
3587
3588 #[test]
parse_flags()3589 fn parse_flags() {
3590 assert_eq!(parser("i:").parse_flags(), Ok(ast::Flags {
3591 span: span(0..1),
3592 items: vec![ast::FlagsItem {
3593 span: span(0..1),
3594 kind: ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive),
3595 }],
3596 }));
3597 assert_eq!(parser("i)").parse_flags(), Ok(ast::Flags {
3598 span: span(0..1),
3599 items: vec![ast::FlagsItem {
3600 span: span(0..1),
3601 kind: ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive),
3602 }],
3603 }));
3604
3605 assert_eq!(parser("isU:").parse_flags(), Ok(ast::Flags {
3606 span: span(0..3),
3607 items: vec![
3608 ast::FlagsItem {
3609 span: span(0..1),
3610 kind: ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive),
3611 },
3612 ast::FlagsItem {
3613 span: span(1..2),
3614 kind: ast::FlagsItemKind::Flag(
3615 ast::Flag::DotMatchesNewLine),
3616 },
3617 ast::FlagsItem {
3618 span: span(2..3),
3619 kind: ast::FlagsItemKind::Flag(ast::Flag::SwapGreed),
3620 },
3621 ],
3622 }));
3623
3624 assert_eq!(parser("-isU:").parse_flags(), Ok(ast::Flags {
3625 span: span(0..4),
3626 items: vec![
3627 ast::FlagsItem {
3628 span: span(0..1),
3629 kind: ast::FlagsItemKind::Negation,
3630 },
3631 ast::FlagsItem {
3632 span: span(1..2),
3633 kind: ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive),
3634 },
3635 ast::FlagsItem {
3636 span: span(2..3),
3637 kind: ast::FlagsItemKind::Flag(
3638 ast::Flag::DotMatchesNewLine),
3639 },
3640 ast::FlagsItem {
3641 span: span(3..4),
3642 kind: ast::FlagsItemKind::Flag(ast::Flag::SwapGreed),
3643 },
3644 ],
3645 }));
3646 assert_eq!(parser("i-sU:").parse_flags(), Ok(ast::Flags {
3647 span: span(0..4),
3648 items: vec![
3649 ast::FlagsItem {
3650 span: span(0..1),
3651 kind: ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive),
3652 },
3653 ast::FlagsItem {
3654 span: span(1..2),
3655 kind: ast::FlagsItemKind::Negation,
3656 },
3657 ast::FlagsItem {
3658 span: span(2..3),
3659 kind: ast::FlagsItemKind::Flag(
3660 ast::Flag::DotMatchesNewLine),
3661 },
3662 ast::FlagsItem {
3663 span: span(3..4),
3664 kind: ast::FlagsItemKind::Flag(ast::Flag::SwapGreed),
3665 },
3666 ],
3667 }));
3668
3669 assert_eq!(
3670 parser("isU").parse_flags().unwrap_err(),
3671 TestError {
3672 span: span(3..3),
3673 kind: ast::ErrorKind::FlagUnexpectedEof,
3674 });
3675 assert_eq!(
3676 parser("isUa:").parse_flags().unwrap_err(),
3677 TestError {
3678 span: span(3..4),
3679 kind: ast::ErrorKind::FlagUnrecognized,
3680 });
3681 assert_eq!(
3682 parser("isUi:").parse_flags().unwrap_err(),
3683 TestError {
3684 span: span(3..4),
3685 kind: ast::ErrorKind::FlagDuplicate {
3686 original: span(0..1),
3687 },
3688 });
3689 assert_eq!(
3690 parser("i-sU-i:").parse_flags().unwrap_err(),
3691 TestError {
3692 span: span(4..5),
3693 kind: ast::ErrorKind::FlagRepeatedNegation {
3694 original: span(1..2),
3695 },
3696 });
3697 assert_eq!(
3698 parser("-)").parse_flags().unwrap_err(),
3699 TestError {
3700 span: span(0..1),
3701 kind: ast::ErrorKind::FlagDanglingNegation,
3702 });
3703 assert_eq!(
3704 parser("i-)").parse_flags().unwrap_err(),
3705 TestError {
3706 span: span(1..2),
3707 kind: ast::ErrorKind::FlagDanglingNegation,
3708 });
3709 assert_eq!(
3710 parser("iU-)").parse_flags().unwrap_err(),
3711 TestError {
3712 span: span(2..3),
3713 kind: ast::ErrorKind::FlagDanglingNegation,
3714 });
3715 }
3716
3717 #[test]
parse_flag()3718 fn parse_flag() {
3719 assert_eq!(parser("i").parse_flag(), Ok(ast::Flag::CaseInsensitive));
3720 assert_eq!(parser("m").parse_flag(), Ok(ast::Flag::MultiLine));
3721 assert_eq!(parser("s").parse_flag(), Ok(ast::Flag::DotMatchesNewLine));
3722 assert_eq!(parser("U").parse_flag(), Ok(ast::Flag::SwapGreed));
3723 assert_eq!(parser("u").parse_flag(), Ok(ast::Flag::Unicode));
3724 assert_eq!(parser("x").parse_flag(), Ok(ast::Flag::IgnoreWhitespace));
3725
3726 assert_eq!(
3727 parser("a").parse_flag().unwrap_err(),
3728 TestError {
3729 span: span(0..1),
3730 kind: ast::ErrorKind::FlagUnrecognized,
3731 });
3732 assert_eq!(
3733 parser("☃").parse_flag().unwrap_err(),
3734 TestError {
3735 span: span_range("☃", 0..3),
3736 kind: ast::ErrorKind::FlagUnrecognized,
3737 });
3738 }
3739
3740 #[test]
parse_primitive_non_escape()3741 fn parse_primitive_non_escape() {
3742 assert_eq!(
3743 parser(r".").parse_primitive(),
3744 Ok(Primitive::Dot(span(0..1))));
3745 assert_eq!(
3746 parser(r"^").parse_primitive(),
3747 Ok(Primitive::Assertion(ast::Assertion {
3748 span: span(0..1),
3749 kind: ast::AssertionKind::StartLine,
3750 })));
3751 assert_eq!(
3752 parser(r"$").parse_primitive(),
3753 Ok(Primitive::Assertion(ast::Assertion {
3754 span: span(0..1),
3755 kind: ast::AssertionKind::EndLine,
3756 })));
3757
3758 assert_eq!(
3759 parser(r"a").parse_primitive(),
3760 Ok(Primitive::Literal(ast::Literal {
3761 span: span(0..1),
3762 kind: ast::LiteralKind::Verbatim,
3763 c: 'a',
3764 })));
3765 assert_eq!(
3766 parser(r"|").parse_primitive(),
3767 Ok(Primitive::Literal(ast::Literal {
3768 span: span(0..1),
3769 kind: ast::LiteralKind::Verbatim,
3770 c: '|',
3771 })));
3772 assert_eq!(
3773 parser(r"☃").parse_primitive(),
3774 Ok(Primitive::Literal(ast::Literal {
3775 span: span_range("☃", 0..3),
3776 kind: ast::LiteralKind::Verbatim,
3777 c: '☃',
3778 })));
3779 }
3780
3781 #[test]
parse_escape()3782 fn parse_escape() {
3783 assert_eq!(
3784 parser(r"\|").parse_primitive(),
3785 Ok(Primitive::Literal(ast::Literal {
3786 span: span(0..2),
3787 kind: ast::LiteralKind::Punctuation,
3788 c: '|',
3789 })));
3790 let specials = &[
3791 (r"\a", '\x07', ast::SpecialLiteralKind::Bell),
3792 (r"\f", '\x0C', ast::SpecialLiteralKind::FormFeed),
3793 (r"\t", '\t', ast::SpecialLiteralKind::Tab),
3794 (r"\n", '\n', ast::SpecialLiteralKind::LineFeed),
3795 (r"\r", '\r', ast::SpecialLiteralKind::CarriageReturn),
3796 (r"\v", '\x0B', ast::SpecialLiteralKind::VerticalTab),
3797 ];
3798 for &(pat, c, ref kind) in specials {
3799 assert_eq!(
3800 parser(pat).parse_primitive(),
3801 Ok(Primitive::Literal(ast::Literal {
3802 span: span(0..2),
3803 kind: ast::LiteralKind::Special(kind.clone()),
3804 c: c,
3805 })));
3806 }
3807 assert_eq!(
3808 parser(r"\A").parse_primitive(),
3809 Ok(Primitive::Assertion(ast::Assertion {
3810 span: span(0..2),
3811 kind: ast::AssertionKind::StartText,
3812 })));
3813 assert_eq!(
3814 parser(r"\z").parse_primitive(),
3815 Ok(Primitive::Assertion(ast::Assertion {
3816 span: span(0..2),
3817 kind: ast::AssertionKind::EndText,
3818 })));
3819 assert_eq!(
3820 parser(r"\b").parse_primitive(),
3821 Ok(Primitive::Assertion(ast::Assertion {
3822 span: span(0..2),
3823 kind: ast::AssertionKind::WordBoundary,
3824 })));
3825 assert_eq!(
3826 parser(r"\B").parse_primitive(),
3827 Ok(Primitive::Assertion(ast::Assertion {
3828 span: span(0..2),
3829 kind: ast::AssertionKind::NotWordBoundary,
3830 })));
3831
3832 assert_eq!(
3833 parser(r"\").parse_escape().unwrap_err(),
3834 TestError {
3835 span: span(0..1),
3836 kind: ast::ErrorKind::EscapeUnexpectedEof,
3837 });
3838 assert_eq!(
3839 parser(r"\y").parse_escape().unwrap_err(),
3840 TestError {
3841 span: span(0..2),
3842 kind: ast::ErrorKind::EscapeUnrecognized,
3843 });
3844 }
3845
3846 #[test]
parse_unsupported_backreference()3847 fn parse_unsupported_backreference() {
3848 assert_eq!(
3849 parser(r"\0").parse_escape().unwrap_err(),
3850 TestError {
3851 span: span(0..2),
3852 kind: ast::ErrorKind::UnsupportedBackreference,
3853 });
3854 assert_eq!(
3855 parser(r"\9").parse_escape().unwrap_err(),
3856 TestError {
3857 span: span(0..2),
3858 kind: ast::ErrorKind::UnsupportedBackreference,
3859 });
3860 }
3861
3862 #[test]
parse_octal()3863 fn parse_octal() {
3864 for i in 0..511 {
3865 let pat = format!(r"\{:o}", i);
3866 assert_eq!(
3867 parser_octal(&pat).parse_escape(),
3868 Ok(Primitive::Literal(ast::Literal {
3869 span: span(0..pat.len()),
3870 kind: ast::LiteralKind::Octal,
3871 c: ::std::char::from_u32(i).unwrap(),
3872 })));
3873 }
3874 assert_eq!(
3875 parser_octal(r"\778").parse_escape(),
3876 Ok(Primitive::Literal(ast::Literal {
3877 span: span(0..3),
3878 kind: ast::LiteralKind::Octal,
3879 c: '?',
3880 })));
3881 assert_eq!(
3882 parser_octal(r"\7777").parse_escape(),
3883 Ok(Primitive::Literal(ast::Literal {
3884 span: span(0..4),
3885 kind: ast::LiteralKind::Octal,
3886 c: '\u{01FF}',
3887 })));
3888 assert_eq!(
3889 parser_octal(r"\778").parse(),
3890 Ok(Ast::Concat(ast::Concat {
3891 span: span(0..4),
3892 asts: vec![
3893 Ast::Literal(ast::Literal {
3894 span: span(0..3),
3895 kind: ast::LiteralKind::Octal,
3896 c: '?',
3897 }),
3898 Ast::Literal(ast::Literal {
3899 span: span(3..4),
3900 kind: ast::LiteralKind::Verbatim,
3901 c: '8',
3902 }),
3903 ],
3904 })));
3905 assert_eq!(
3906 parser_octal(r"\7777").parse(),
3907 Ok(Ast::Concat(ast::Concat {
3908 span: span(0..5),
3909 asts: vec![
3910 Ast::Literal(ast::Literal {
3911 span: span(0..4),
3912 kind: ast::LiteralKind::Octal,
3913 c: '\u{01FF}',
3914 }),
3915 Ast::Literal(ast::Literal {
3916 span: span(4..5),
3917 kind: ast::LiteralKind::Verbatim,
3918 c: '7',
3919 }),
3920 ],
3921 })));
3922
3923 assert_eq!(
3924 parser_octal(r"\8").parse_escape().unwrap_err(),
3925 TestError {
3926 span: span(0..2),
3927 kind: ast::ErrorKind::EscapeUnrecognized,
3928 });
3929 }
3930
3931 #[test]
parse_hex_two()3932 fn parse_hex_two() {
3933 for i in 0..256 {
3934 let pat = format!(r"\x{:02x}", i);
3935 assert_eq!(
3936 parser(&pat).parse_escape(),
3937 Ok(Primitive::Literal(ast::Literal {
3938 span: span(0..pat.len()),
3939 kind: ast::LiteralKind::HexFixed(ast::HexLiteralKind::X),
3940 c: ::std::char::from_u32(i).unwrap(),
3941 })));
3942 }
3943
3944 assert_eq!(
3945 parser(r"\xF").parse_escape().unwrap_err(),
3946 TestError {
3947 span: span(3..3),
3948 kind: ast::ErrorKind::EscapeUnexpectedEof,
3949 });
3950 assert_eq!(
3951 parser(r"\xG").parse_escape().unwrap_err(),
3952 TestError {
3953 span: span(2..3),
3954 kind: ast::ErrorKind::EscapeHexInvalidDigit,
3955 });
3956 assert_eq!(
3957 parser(r"\xFG").parse_escape().unwrap_err(),
3958 TestError {
3959 span: span(3..4),
3960 kind: ast::ErrorKind::EscapeHexInvalidDigit,
3961 });
3962 }
3963
3964 #[test]
parse_hex_four()3965 fn parse_hex_four() {
3966 for i in 0..65536 {
3967 let c = match ::std::char::from_u32(i) {
3968 None => continue,
3969 Some(c) => c,
3970 };
3971 let pat = format!(r"\u{:04x}", i);
3972 assert_eq!(
3973 parser(&pat).parse_escape(),
3974 Ok(Primitive::Literal(ast::Literal {
3975 span: span(0..pat.len()),
3976 kind: ast::LiteralKind::HexFixed(
3977 ast::HexLiteralKind::UnicodeShort),
3978 c: c,
3979 })));
3980 }
3981
3982 assert_eq!(
3983 parser(r"\uF").parse_escape().unwrap_err(),
3984 TestError {
3985 span: span(3..3),
3986 kind: ast::ErrorKind::EscapeUnexpectedEof,
3987 });
3988 assert_eq!(
3989 parser(r"\uG").parse_escape().unwrap_err(),
3990 TestError {
3991 span: span(2..3),
3992 kind: ast::ErrorKind::EscapeHexInvalidDigit,
3993 });
3994 assert_eq!(
3995 parser(r"\uFG").parse_escape().unwrap_err(),
3996 TestError {
3997 span: span(3..4),
3998 kind: ast::ErrorKind::EscapeHexInvalidDigit,
3999 });
4000 assert_eq!(
4001 parser(r"\uFFG").parse_escape().unwrap_err(),
4002 TestError {
4003 span: span(4..5),
4004 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4005 });
4006 assert_eq!(
4007 parser(r"\uFFFG").parse_escape().unwrap_err(),
4008 TestError {
4009 span: span(5..6),
4010 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4011 });
4012 assert_eq!(
4013 parser(r"\uD800").parse_escape().unwrap_err(),
4014 TestError {
4015 span: span(2..6),
4016 kind: ast::ErrorKind::EscapeHexInvalid,
4017 });
4018 }
4019
4020 #[test]
parse_hex_eight()4021 fn parse_hex_eight() {
4022 for i in 0..65536 {
4023 let c = match ::std::char::from_u32(i) {
4024 None => continue,
4025 Some(c) => c,
4026 };
4027 let pat = format!(r"\U{:08x}", i);
4028 assert_eq!(
4029 parser(&pat).parse_escape(),
4030 Ok(Primitive::Literal(ast::Literal {
4031 span: span(0..pat.len()),
4032 kind: ast::LiteralKind::HexFixed(
4033 ast::HexLiteralKind::UnicodeLong),
4034 c: c,
4035 })));
4036 }
4037
4038 assert_eq!(
4039 parser(r"\UF").parse_escape().unwrap_err(),
4040 TestError {
4041 span: span(3..3),
4042 kind: ast::ErrorKind::EscapeUnexpectedEof,
4043 });
4044 assert_eq!(
4045 parser(r"\UG").parse_escape().unwrap_err(),
4046 TestError {
4047 span: span(2..3),
4048 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4049 });
4050 assert_eq!(
4051 parser(r"\UFG").parse_escape().unwrap_err(),
4052 TestError {
4053 span: span(3..4),
4054 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4055 });
4056 assert_eq!(
4057 parser(r"\UFFG").parse_escape().unwrap_err(),
4058 TestError {
4059 span: span(4..5),
4060 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4061 });
4062 assert_eq!(
4063 parser(r"\UFFFG").parse_escape().unwrap_err(),
4064 TestError {
4065 span: span(5..6),
4066 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4067 });
4068 assert_eq!(
4069 parser(r"\UFFFFG").parse_escape().unwrap_err(),
4070 TestError {
4071 span: span(6..7),
4072 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4073 });
4074 assert_eq!(
4075 parser(r"\UFFFFFG").parse_escape().unwrap_err(),
4076 TestError {
4077 span: span(7..8),
4078 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4079 });
4080 assert_eq!(
4081 parser(r"\UFFFFFFG").parse_escape().unwrap_err(),
4082 TestError {
4083 span: span(8..9),
4084 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4085 });
4086 assert_eq!(
4087 parser(r"\UFFFFFFFG").parse_escape().unwrap_err(),
4088 TestError {
4089 span: span(9..10),
4090 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4091 });
4092 }
4093
4094 #[test]
parse_hex_brace()4095 fn parse_hex_brace() {
4096 assert_eq!(
4097 parser(r"\u{26c4}").parse_escape(),
4098 Ok(Primitive::Literal(ast::Literal {
4099 span: span(0..8),
4100 kind: ast::LiteralKind::HexBrace(
4101 ast::HexLiteralKind::UnicodeShort),
4102 c: '⛄',
4103 })));
4104 assert_eq!(
4105 parser(r"\U{26c4}").parse_escape(),
4106 Ok(Primitive::Literal(ast::Literal {
4107 span: span(0..8),
4108 kind: ast::LiteralKind::HexBrace(
4109 ast::HexLiteralKind::UnicodeLong),
4110 c: '⛄',
4111 })));
4112 assert_eq!(
4113 parser(r"\x{26c4}").parse_escape(),
4114 Ok(Primitive::Literal(ast::Literal {
4115 span: span(0..8),
4116 kind: ast::LiteralKind::HexBrace(ast::HexLiteralKind::X),
4117 c: '⛄',
4118 })));
4119 assert_eq!(
4120 parser(r"\x{26C4}").parse_escape(),
4121 Ok(Primitive::Literal(ast::Literal {
4122 span: span(0..8),
4123 kind: ast::LiteralKind::HexBrace(ast::HexLiteralKind::X),
4124 c: '⛄',
4125 })));
4126 assert_eq!(
4127 parser(r"\x{10fFfF}").parse_escape(),
4128 Ok(Primitive::Literal(ast::Literal {
4129 span: span(0..10),
4130 kind: ast::LiteralKind::HexBrace(ast::HexLiteralKind::X),
4131 c: '\u{10FFFF}',
4132 })));
4133
4134 assert_eq!(
4135 parser(r"\x").parse_escape().unwrap_err(),
4136 TestError {
4137 span: span(2..2),
4138 kind: ast::ErrorKind::EscapeUnexpectedEof,
4139 });
4140 assert_eq!(
4141 parser(r"\x{").parse_escape().unwrap_err(),
4142 TestError {
4143 span: span(2..3),
4144 kind: ast::ErrorKind::EscapeUnexpectedEof,
4145 });
4146 assert_eq!(
4147 parser(r"\x{FF").parse_escape().unwrap_err(),
4148 TestError {
4149 span: span(2..5),
4150 kind: ast::ErrorKind::EscapeUnexpectedEof,
4151 });
4152 assert_eq!(
4153 parser(r"\x{}").parse_escape().unwrap_err(),
4154 TestError {
4155 span: span(2..4),
4156 kind: ast::ErrorKind::EscapeHexEmpty,
4157 });
4158 assert_eq!(
4159 parser(r"\x{FGF}").parse_escape().unwrap_err(),
4160 TestError {
4161 span: span(4..5),
4162 kind: ast::ErrorKind::EscapeHexInvalidDigit,
4163 });
4164 assert_eq!(
4165 parser(r"\x{FFFFFF}").parse_escape().unwrap_err(),
4166 TestError {
4167 span: span(3..9),
4168 kind: ast::ErrorKind::EscapeHexInvalid,
4169 });
4170 assert_eq!(
4171 parser(r"\x{D800}").parse_escape().unwrap_err(),
4172 TestError {
4173 span: span(3..7),
4174 kind: ast::ErrorKind::EscapeHexInvalid,
4175 });
4176 assert_eq!(
4177 parser(r"\x{FFFFFFFFF}").parse_escape().unwrap_err(),
4178 TestError {
4179 span: span(3..12),
4180 kind: ast::ErrorKind::EscapeHexInvalid,
4181 });
4182 }
4183
4184 #[test]
parse_decimal()4185 fn parse_decimal() {
4186 assert_eq!(parser("123").parse_decimal(), Ok(123));
4187 assert_eq!(parser("0").parse_decimal(), Ok(0));
4188 assert_eq!(parser("01").parse_decimal(), Ok(1));
4189
4190 assert_eq!(
4191 parser("-1").parse_decimal().unwrap_err(),
4192 TestError {
4193 span: span(0..0),
4194 kind: ast::ErrorKind::DecimalEmpty,
4195 });
4196 assert_eq!(
4197 parser("").parse_decimal().unwrap_err(),
4198 TestError {
4199 span: span(0..0),
4200 kind: ast::ErrorKind::DecimalEmpty,
4201 });
4202 assert_eq!(
4203 parser("9999999999").parse_decimal().unwrap_err(),
4204 TestError {
4205 span: span(0..10),
4206 kind: ast::ErrorKind::DecimalInvalid,
4207 });
4208 }
4209
4210 #[test]
parse_set_class()4211 fn parse_set_class() {
4212 fn union(span: Span, items: Vec<ast::ClassSetItem>) -> ast::ClassSet {
4213 ast::ClassSet::union(ast::ClassSetUnion {
4214 span: span,
4215 items: items,
4216 })
4217 }
4218
4219 fn intersection(
4220 span: Span,
4221 lhs: ast::ClassSet,
4222 rhs: ast::ClassSet,
4223 ) -> ast::ClassSet {
4224 ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp {
4225 span: span,
4226 kind: ast::ClassSetBinaryOpKind::Intersection,
4227 lhs: Box::new(lhs),
4228 rhs: Box::new(rhs),
4229 })
4230 }
4231
4232 fn difference(
4233 span: Span,
4234 lhs: ast::ClassSet,
4235 rhs: ast::ClassSet,
4236 ) -> ast::ClassSet {
4237 ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp {
4238 span: span,
4239 kind: ast::ClassSetBinaryOpKind::Difference,
4240 lhs: Box::new(lhs),
4241 rhs: Box::new(rhs),
4242 })
4243 }
4244
4245 fn symdifference(
4246 span: Span,
4247 lhs: ast::ClassSet,
4248 rhs: ast::ClassSet,
4249 ) -> ast::ClassSet {
4250 ast::ClassSet::BinaryOp(ast::ClassSetBinaryOp {
4251 span: span,
4252 kind: ast::ClassSetBinaryOpKind::SymmetricDifference,
4253 lhs: Box::new(lhs),
4254 rhs: Box::new(rhs),
4255 })
4256 }
4257
4258 fn itemset(item: ast::ClassSetItem) -> ast::ClassSet {
4259 ast::ClassSet::Item(item)
4260 }
4261
4262 fn item_ascii(cls: ast::ClassAscii) -> ast::ClassSetItem {
4263 ast::ClassSetItem::Ascii(cls)
4264 }
4265
4266 fn item_unicode(cls: ast::ClassUnicode) -> ast::ClassSetItem {
4267 ast::ClassSetItem::Unicode(cls)
4268 }
4269
4270 fn item_perl(cls: ast::ClassPerl) -> ast::ClassSetItem {
4271 ast::ClassSetItem::Perl(cls)
4272 }
4273
4274 fn item_bracket(cls: ast::ClassBracketed) -> ast::ClassSetItem {
4275 ast::ClassSetItem::Bracketed(Box::new(cls))
4276 }
4277
4278 fn lit(span: Span, c: char) -> ast::ClassSetItem {
4279 ast::ClassSetItem::Literal(ast::Literal {
4280 span: span,
4281 kind: ast::LiteralKind::Verbatim,
4282 c: c,
4283 })
4284 }
4285
4286 fn empty(span: Span) -> ast::ClassSetItem {
4287 ast::ClassSetItem::Empty(span)
4288 }
4289
4290 fn range(span: Span, start: char, end: char) -> ast::ClassSetItem {
4291 let pos1 = Position {
4292 offset: span.start.offset + start.len_utf8(),
4293 column: span.start.column + 1,
4294 ..span.start
4295 };
4296 let pos2 = Position {
4297 offset: span.end.offset - end.len_utf8(),
4298 column: span.end.column - 1,
4299 ..span.end
4300 };
4301 ast::ClassSetItem::Range(ast::ClassSetRange {
4302 span: span,
4303 start: ast::Literal {
4304 span: Span { end: pos1, ..span },
4305 kind: ast::LiteralKind::Verbatim,
4306 c: start,
4307 },
4308 end: ast::Literal {
4309 span: Span { start: pos2, ..span },
4310 kind: ast::LiteralKind::Verbatim,
4311 c: end,
4312 },
4313 })
4314 }
4315
4316 fn alnum(span: Span, negated: bool) -> ast::ClassAscii {
4317 ast::ClassAscii {
4318 span: span,
4319 kind: ast::ClassAsciiKind::Alnum,
4320 negated: negated,
4321 }
4322 }
4323
4324 fn lower(span: Span, negated: bool) -> ast::ClassAscii {
4325 ast::ClassAscii {
4326 span: span,
4327 kind: ast::ClassAsciiKind::Lower,
4328 negated: negated,
4329 }
4330 }
4331
4332 assert_eq!(
4333 parser("[[:alnum:]]").parse(),
4334 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4335 span: span(0..11),
4336 negated: false,
4337 kind: itemset(item_ascii(alnum(span(1..10), false))),
4338 }))));
4339 assert_eq!(
4340 parser("[[[:alnum:]]]").parse(),
4341 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4342 span: span(0..13),
4343 negated: false,
4344 kind: itemset(item_bracket(ast::ClassBracketed {
4345 span: span(1..12),
4346 negated: false,
4347 kind: itemset(item_ascii(alnum(span(2..11), false))),
4348 })),
4349 }))));
4350 assert_eq!(
4351 parser("[[:alnum:]&&[:lower:]]").parse(),
4352 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4353 span: span(0..22),
4354 negated: false,
4355 kind: intersection(
4356 span(1..21),
4357 itemset(item_ascii(alnum(span(1..10), false))),
4358 itemset(item_ascii(lower(span(12..21), false))),
4359 ),
4360 }))));
4361 assert_eq!(
4362 parser("[[:alnum:]--[:lower:]]").parse(),
4363 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4364 span: span(0..22),
4365 negated: false,
4366 kind: difference(
4367 span(1..21),
4368 itemset(item_ascii(alnum(span(1..10), false))),
4369 itemset(item_ascii(lower(span(12..21), false))),
4370 ),
4371 }))));
4372 assert_eq!(
4373 parser("[[:alnum:]~~[:lower:]]").parse(),
4374 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4375 span: span(0..22),
4376 negated: false,
4377 kind: symdifference(
4378 span(1..21),
4379 itemset(item_ascii(alnum(span(1..10), false))),
4380 itemset(item_ascii(lower(span(12..21), false))),
4381 ),
4382 }))));
4383
4384 assert_eq!(
4385 parser("[a]").parse(),
4386 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4387 span: span(0..3),
4388 negated: false,
4389 kind: itemset(lit(span(1..2), 'a')),
4390 }))));
4391 assert_eq!(
4392 parser(r"[a\]]").parse(),
4393 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4394 span: span(0..5),
4395 negated: false,
4396 kind: union(span(1..4), vec![
4397 lit(span(1..2), 'a'),
4398 ast::ClassSetItem::Literal(ast::Literal {
4399 span: span(2..4),
4400 kind: ast::LiteralKind::Punctuation,
4401 c: ']',
4402 }),
4403 ]),
4404 }))));
4405 assert_eq!(
4406 parser(r"[a\-z]").parse(),
4407 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4408 span: span(0..6),
4409 negated: false,
4410 kind: union(span(1..5), vec![
4411 lit(span(1..2), 'a'),
4412 ast::ClassSetItem::Literal(ast::Literal {
4413 span: span(2..4),
4414 kind: ast::LiteralKind::Punctuation,
4415 c: '-',
4416 }),
4417 lit(span(4..5), 'z'),
4418 ]),
4419 }))));
4420 assert_eq!(
4421 parser("[ab]").parse(),
4422 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4423 span: span(0..4),
4424 negated: false,
4425 kind: union(span(1..3), vec![
4426 lit(span(1..2), 'a'),
4427 lit(span(2..3), 'b'),
4428 ]),
4429 }))));
4430 assert_eq!(
4431 parser("[a-]").parse(),
4432 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4433 span: span(0..4),
4434 negated: false,
4435 kind: union(span(1..3), vec![
4436 lit(span(1..2), 'a'),
4437 lit(span(2..3), '-'),
4438 ]),
4439 }))));
4440 assert_eq!(
4441 parser("[-a]").parse(),
4442 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4443 span: span(0..4),
4444 negated: false,
4445 kind: union(span(1..3), vec![
4446 lit(span(1..2), '-'),
4447 lit(span(2..3), 'a'),
4448 ]),
4449 }))));
4450 assert_eq!(
4451 parser(r"[\pL]").parse(),
4452 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4453 span: span(0..5),
4454 negated: false,
4455 kind: itemset(item_unicode(ast::ClassUnicode {
4456 span: span(1..4),
4457 negated: false,
4458 kind: ast::ClassUnicodeKind::OneLetter('L'),
4459 })),
4460 }))));
4461 assert_eq!(
4462 parser(r"[\w]").parse(),
4463 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4464 span: span(0..4),
4465 negated: false,
4466 kind: itemset(item_perl(ast::ClassPerl {
4467 span: span(1..3),
4468 kind: ast::ClassPerlKind::Word,
4469 negated: false,
4470 })),
4471 }))));
4472 assert_eq!(
4473 parser(r"[a\wz]").parse(),
4474 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4475 span: span(0..6),
4476 negated: false,
4477 kind: union(span(1..5), vec![
4478 lit(span(1..2), 'a'),
4479 item_perl(ast::ClassPerl {
4480 span: span(2..4),
4481 kind: ast::ClassPerlKind::Word,
4482 negated: false,
4483 }),
4484 lit(span(4..5), 'z'),
4485 ]),
4486 }))));
4487
4488 assert_eq!(
4489 parser("[a-z]").parse(),
4490 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4491 span: span(0..5),
4492 negated: false,
4493 kind: itemset(range(span(1..4), 'a', 'z')),
4494 }))));
4495 assert_eq!(
4496 parser("[a-cx-z]").parse(),
4497 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4498 span: span(0..8),
4499 negated: false,
4500 kind: union(span(1..7), vec![
4501 range(span(1..4), 'a', 'c'),
4502 range(span(4..7), 'x', 'z'),
4503 ]),
4504 }))));
4505 assert_eq!(
4506 parser(r"[\w&&a-cx-z]").parse(),
4507 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4508 span: span(0..12),
4509 negated: false,
4510 kind: intersection(
4511 span(1..11),
4512 itemset(item_perl(ast::ClassPerl {
4513 span: span(1..3),
4514 kind: ast::ClassPerlKind::Word,
4515 negated: false,
4516 })),
4517 union(span(5..11), vec![
4518 range(span(5..8), 'a', 'c'),
4519 range(span(8..11), 'x', 'z'),
4520 ]),
4521 ),
4522 }))));
4523 assert_eq!(
4524 parser(r"[a-cx-z&&\w]").parse(),
4525 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4526 span: span(0..12),
4527 negated: false,
4528 kind: intersection(
4529 span(1..11),
4530 union(span(1..7), vec![
4531 range(span(1..4), 'a', 'c'),
4532 range(span(4..7), 'x', 'z'),
4533 ]),
4534 itemset(item_perl(ast::ClassPerl {
4535 span: span(9..11),
4536 kind: ast::ClassPerlKind::Word,
4537 negated: false,
4538 })),
4539 ),
4540 }))));
4541 assert_eq!(
4542 parser(r"[a--b--c]").parse(),
4543 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4544 span: span(0..9),
4545 negated: false,
4546 kind: difference(
4547 span(1..8),
4548 difference(
4549 span(1..5),
4550 itemset(lit(span(1..2), 'a')),
4551 itemset(lit(span(4..5), 'b')),
4552 ),
4553 itemset(lit(span(7..8), 'c')),
4554 ),
4555 }))));
4556 assert_eq!(
4557 parser(r"[a~~b~~c]").parse(),
4558 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4559 span: span(0..9),
4560 negated: false,
4561 kind: symdifference(
4562 span(1..8),
4563 symdifference(
4564 span(1..5),
4565 itemset(lit(span(1..2), 'a')),
4566 itemset(lit(span(4..5), 'b')),
4567 ),
4568 itemset(lit(span(7..8), 'c')),
4569 ),
4570 }))));
4571 assert_eq!(
4572 parser(r"[\^&&^]").parse(),
4573 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4574 span: span(0..7),
4575 negated: false,
4576 kind: intersection(
4577 span(1..6),
4578 itemset(ast::ClassSetItem::Literal(ast::Literal {
4579 span: span(1..3),
4580 kind: ast::LiteralKind::Punctuation,
4581 c: '^',
4582 })),
4583 itemset(lit(span(5..6), '^')),
4584 ),
4585 }))));
4586 assert_eq!(
4587 parser(r"[\&&&&]").parse(),
4588 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4589 span: span(0..7),
4590 negated: false,
4591 kind: intersection(
4592 span(1..6),
4593 itemset(ast::ClassSetItem::Literal(ast::Literal {
4594 span: span(1..3),
4595 kind: ast::LiteralKind::Punctuation,
4596 c: '&',
4597 })),
4598 itemset(lit(span(5..6), '&')),
4599 ),
4600 }))));
4601 assert_eq!(
4602 parser(r"[&&&&]").parse(),
4603 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4604 span: span(0..6),
4605 negated: false,
4606 kind: intersection(
4607 span(1..5),
4608 intersection(
4609 span(1..3),
4610 itemset(empty(span(1..1))),
4611 itemset(empty(span(3..3))),
4612 ),
4613 itemset(empty(span(5..5))),
4614 ),
4615 }))));
4616
4617 let pat = "[☃-⛄]";
4618 assert_eq!(
4619 parser(pat).parse(),
4620 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4621 span: span_range(pat, 0..9),
4622 negated: false,
4623 kind: itemset(ast::ClassSetItem::Range(ast::ClassSetRange {
4624 span: span_range(pat, 1..8),
4625 start: ast::Literal {
4626 span: span_range(pat, 1..4),
4627 kind: ast::LiteralKind::Verbatim,
4628 c: '☃',
4629 },
4630 end: ast::Literal {
4631 span: span_range(pat, 5..8),
4632 kind: ast::LiteralKind::Verbatim,
4633 c: '⛄',
4634 },
4635 })),
4636 }))));
4637
4638 assert_eq!(
4639 parser(r"[]]").parse(),
4640 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4641 span: span(0..3),
4642 negated: false,
4643 kind: itemset(lit(span(1..2), ']')),
4644 }))));
4645 assert_eq!(
4646 parser(r"[]\[]").parse(),
4647 Ok(Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4648 span: span(0..5),
4649 negated: false,
4650 kind: union(span(1..4), vec![
4651 lit(span(1..2), ']'),
4652 ast::ClassSetItem::Literal(ast::Literal {
4653 span: span(2..4),
4654 kind: ast::LiteralKind::Punctuation,
4655 c: '[',
4656 }),
4657 ]),
4658 }))));
4659 assert_eq!(
4660 parser(r"[\[]]").parse(),
4661 Ok(concat(0..5, vec![
4662 Ast::Class(ast::Class::Bracketed(ast::ClassBracketed {
4663 span: span(0..4),
4664 negated: false,
4665 kind: itemset(ast::ClassSetItem::Literal(ast::Literal {
4666 span: span(1..3),
4667 kind: ast::LiteralKind::Punctuation,
4668 c: '[',
4669 })),
4670 })),
4671 Ast::Literal(ast::Literal {
4672 span: span(4..5),
4673 kind: ast::LiteralKind::Verbatim,
4674 c: ']',
4675 }),
4676 ])));
4677
4678 assert_eq!(
4679 parser("[").parse().unwrap_err(),
4680 TestError {
4681 span: span(0..1),
4682 kind: ast::ErrorKind::ClassUnclosed,
4683 });
4684 assert_eq!(
4685 parser("[[").parse().unwrap_err(),
4686 TestError {
4687 span: span(1..2),
4688 kind: ast::ErrorKind::ClassUnclosed,
4689 });
4690 assert_eq!(
4691 parser("[[-]").parse().unwrap_err(),
4692 TestError {
4693 span: span(0..1),
4694 kind: ast::ErrorKind::ClassUnclosed,
4695 });
4696 assert_eq!(
4697 parser("[[[:alnum:]").parse().unwrap_err(),
4698 TestError {
4699 span: span(1..2),
4700 kind: ast::ErrorKind::ClassUnclosed,
4701 });
4702 assert_eq!(
4703 parser(r"[\b]").parse().unwrap_err(),
4704 TestError {
4705 span: span(1..3),
4706 kind: ast::ErrorKind::ClassEscapeInvalid,
4707 });
4708 assert_eq!(
4709 parser(r"[\w-a]").parse().unwrap_err(),
4710 TestError {
4711 span: span(1..3),
4712 kind: ast::ErrorKind::ClassRangeLiteral,
4713 });
4714 assert_eq!(
4715 parser(r"[a-\w]").parse().unwrap_err(),
4716 TestError {
4717 span: span(3..5),
4718 kind: ast::ErrorKind::ClassRangeLiteral,
4719 });
4720 assert_eq!(
4721 parser(r"[z-a]").parse().unwrap_err(),
4722 TestError {
4723 span: span(1..4),
4724 kind: ast::ErrorKind::ClassRangeInvalid,
4725 });
4726
4727 assert_eq!(
4728 parser_ignore_whitespace("[a ").parse().unwrap_err(),
4729 TestError {
4730 span: span(0..1),
4731 kind: ast::ErrorKind::ClassUnclosed,
4732 });
4733 assert_eq!(
4734 parser_ignore_whitespace("[a- ").parse().unwrap_err(),
4735 TestError {
4736 span: span(0..1),
4737 kind: ast::ErrorKind::ClassUnclosed,
4738 });
4739 }
4740
4741 #[test]
parse_set_class_open()4742 fn parse_set_class_open() {
4743 assert_eq!(
4744 parser("[a]").parse_set_class_open(), {
4745 let set = ast::ClassBracketed {
4746 span: span(0..1),
4747 negated: false,
4748 kind: ast::ClassSet::union(ast::ClassSetUnion {
4749 span: span(1..1),
4750 items: vec![],
4751 }),
4752 };
4753 let union = ast::ClassSetUnion {
4754 span: span(1..1),
4755 items: vec![],
4756 };
4757 Ok((set, union))
4758 });
4759 assert_eq!(
4760 parser_ignore_whitespace("[ a]").parse_set_class_open(), {
4761 let set = ast::ClassBracketed {
4762 span: span(0..4),
4763 negated: false,
4764 kind: ast::ClassSet::union(ast::ClassSetUnion {
4765 span: span(4..4),
4766 items: vec![],
4767 }),
4768 };
4769 let union = ast::ClassSetUnion {
4770 span: span(4..4),
4771 items: vec![],
4772 };
4773 Ok((set, union))
4774 });
4775 assert_eq!(
4776 parser("[^a]").parse_set_class_open(), {
4777 let set = ast::ClassBracketed {
4778 span: span(0..2),
4779 negated: true,
4780 kind: ast::ClassSet::union(ast::ClassSetUnion {
4781 span: span(2..2),
4782 items: vec![],
4783 }),
4784 };
4785 let union = ast::ClassSetUnion {
4786 span: span(2..2),
4787 items: vec![],
4788 };
4789 Ok((set, union))
4790 });
4791 assert_eq!(
4792 parser_ignore_whitespace("[ ^ a]").parse_set_class_open(), {
4793 let set = ast::ClassBracketed {
4794 span: span(0..4),
4795 negated: true,
4796 kind: ast::ClassSet::union(ast::ClassSetUnion {
4797 span: span(4..4),
4798 items: vec![],
4799 }),
4800 };
4801 let union = ast::ClassSetUnion {
4802 span: span(4..4),
4803 items: vec![],
4804 };
4805 Ok((set, union))
4806 });
4807 assert_eq!(
4808 parser("[-a]").parse_set_class_open(), {
4809 let set = ast::ClassBracketed {
4810 span: span(0..2),
4811 negated: false,
4812 kind: ast::ClassSet::union(ast::ClassSetUnion {
4813 span: span(1..1),
4814 items: vec![],
4815 }),
4816 };
4817 let union = ast::ClassSetUnion {
4818 span: span(1..2),
4819 items: vec![
4820 ast::ClassSetItem::Literal(ast::Literal {
4821 span: span(1..2),
4822 kind: ast::LiteralKind::Verbatim,
4823 c: '-',
4824 }),
4825 ],
4826 };
4827 Ok((set, union))
4828 });
4829 assert_eq!(
4830 parser_ignore_whitespace("[ - a]").parse_set_class_open(), {
4831 let set = ast::ClassBracketed {
4832 span: span(0..4),
4833 negated: false,
4834 kind: ast::ClassSet::union(ast::ClassSetUnion {
4835 span: span(2..2),
4836 items: vec![],
4837 }),
4838 };
4839 let union = ast::ClassSetUnion {
4840 span: span(2..3),
4841 items: vec![
4842 ast::ClassSetItem::Literal(ast::Literal {
4843 span: span(2..3),
4844 kind: ast::LiteralKind::Verbatim,
4845 c: '-',
4846 }),
4847 ],
4848 };
4849 Ok((set, union))
4850 });
4851 assert_eq!(
4852 parser("[^-a]").parse_set_class_open(), {
4853 let set = ast::ClassBracketed {
4854 span: span(0..3),
4855 negated: true,
4856 kind: ast::ClassSet::union(ast::ClassSetUnion {
4857 span: span(2..2),
4858 items: vec![],
4859 }),
4860 };
4861 let union = ast::ClassSetUnion {
4862 span: span(2..3),
4863 items: vec![
4864 ast::ClassSetItem::Literal(ast::Literal {
4865 span: span(2..3),
4866 kind: ast::LiteralKind::Verbatim,
4867 c: '-',
4868 }),
4869 ],
4870 };
4871 Ok((set, union))
4872 });
4873 assert_eq!(
4874 parser("[--a]").parse_set_class_open(), {
4875 let set = ast::ClassBracketed {
4876 span: span(0..3),
4877 negated: false,
4878 kind: ast::ClassSet::union(ast::ClassSetUnion {
4879 span: span(1..1),
4880 items: vec![],
4881 }),
4882 };
4883 let union = ast::ClassSetUnion {
4884 span: span(1..3),
4885 items: vec![
4886 ast::ClassSetItem::Literal(ast::Literal {
4887 span: span(1..2),
4888 kind: ast::LiteralKind::Verbatim,
4889 c: '-',
4890 }),
4891 ast::ClassSetItem::Literal(ast::Literal {
4892 span: span(2..3),
4893 kind: ast::LiteralKind::Verbatim,
4894 c: '-',
4895 }),
4896 ],
4897 };
4898 Ok((set, union))
4899 });
4900 assert_eq!(
4901 parser("[]a]").parse_set_class_open(), {
4902 let set = ast::ClassBracketed {
4903 span: span(0..2),
4904 negated: false,
4905 kind: ast::ClassSet::union(ast::ClassSetUnion {
4906 span: span(1..1),
4907 items: vec![],
4908 }),
4909 };
4910 let union = ast::ClassSetUnion {
4911 span: span(1..2),
4912 items: vec![
4913 ast::ClassSetItem::Literal(ast::Literal {
4914 span: span(1..2),
4915 kind: ast::LiteralKind::Verbatim,
4916 c: ']',
4917 }),
4918 ],
4919 };
4920 Ok((set, union))
4921 });
4922 assert_eq!(
4923 parser_ignore_whitespace("[ ] a]").parse_set_class_open(), {
4924 let set = ast::ClassBracketed {
4925 span: span(0..4),
4926 negated: false,
4927 kind: ast::ClassSet::union(ast::ClassSetUnion {
4928 span: span(2..2),
4929 items: vec![],
4930 }),
4931 };
4932 let union = ast::ClassSetUnion {
4933 span: span(2..3),
4934 items: vec![
4935 ast::ClassSetItem::Literal(ast::Literal {
4936 span: span(2..3),
4937 kind: ast::LiteralKind::Verbatim,
4938 c: ']',
4939 }),
4940 ],
4941 };
4942 Ok((set, union))
4943 });
4944 assert_eq!(
4945 parser("[^]a]").parse_set_class_open(), {
4946 let set = ast::ClassBracketed {
4947 span: span(0..3),
4948 negated: true,
4949 kind: ast::ClassSet::union(ast::ClassSetUnion {
4950 span: span(2..2),
4951 items: vec![],
4952 }),
4953 };
4954 let union = ast::ClassSetUnion {
4955 span: span(2..3),
4956 items: vec![
4957 ast::ClassSetItem::Literal(ast::Literal {
4958 span: span(2..3),
4959 kind: ast::LiteralKind::Verbatim,
4960 c: ']',
4961 }),
4962 ],
4963 };
4964 Ok((set, union))
4965 });
4966 assert_eq!(
4967 parser("[-]a]").parse_set_class_open(), {
4968 let set = ast::ClassBracketed {
4969 span: span(0..2),
4970 negated: false,
4971 kind: ast::ClassSet::union(ast::ClassSetUnion {
4972 span: span(1..1),
4973 items: vec![],
4974 }),
4975 };
4976 let union = ast::ClassSetUnion {
4977 span: span(1..2),
4978 items: vec![
4979 ast::ClassSetItem::Literal(ast::Literal {
4980 span: span(1..2),
4981 kind: ast::LiteralKind::Verbatim,
4982 c: '-',
4983 }),
4984 ],
4985 };
4986 Ok((set, union))
4987 });
4988
4989 assert_eq!(
4990 parser("[").parse_set_class_open().unwrap_err(),
4991 TestError {
4992 span: span(0..1),
4993 kind: ast::ErrorKind::ClassUnclosed,
4994 });
4995 assert_eq!(
4996 parser_ignore_whitespace("[ ")
4997 .parse_set_class_open()
4998 .unwrap_err(),
4999 TestError {
5000 span: span(0..5),
5001 kind: ast::ErrorKind::ClassUnclosed,
5002 });
5003 assert_eq!(
5004 parser("[^").parse_set_class_open().unwrap_err(),
5005 TestError {
5006 span: span(0..2),
5007 kind: ast::ErrorKind::ClassUnclosed,
5008 });
5009 assert_eq!(
5010 parser("[]").parse_set_class_open().unwrap_err(),
5011 TestError {
5012 span: span(0..2),
5013 kind: ast::ErrorKind::ClassUnclosed,
5014 });
5015 assert_eq!(
5016 parser("[-").parse_set_class_open().unwrap_err(),
5017 TestError {
5018 span: span(0..2),
5019 kind: ast::ErrorKind::ClassUnclosed,
5020 });
5021 assert_eq!(
5022 parser("[--").parse_set_class_open().unwrap_err(),
5023 TestError {
5024 span: span(0..3),
5025 kind: ast::ErrorKind::ClassUnclosed,
5026 });
5027 }
5028
5029 #[test]
maybe_parse_ascii_class()5030 fn maybe_parse_ascii_class() {
5031 assert_eq!(
5032 parser(r"[:alnum:]").maybe_parse_ascii_class(),
5033 Some(ast::ClassAscii {
5034 span: span(0..9),
5035 kind: ast::ClassAsciiKind::Alnum,
5036 negated: false,
5037 }));
5038 assert_eq!(
5039 parser(r"[:alnum:]A").maybe_parse_ascii_class(),
5040 Some(ast::ClassAscii {
5041 span: span(0..9),
5042 kind: ast::ClassAsciiKind::Alnum,
5043 negated: false,
5044 }));
5045 assert_eq!(
5046 parser(r"[:^alnum:]").maybe_parse_ascii_class(),
5047 Some(ast::ClassAscii {
5048 span: span(0..10),
5049 kind: ast::ClassAsciiKind::Alnum,
5050 negated: true,
5051 }));
5052
5053 let p = parser(r"[:");
5054 assert_eq!(p.maybe_parse_ascii_class(), None);
5055 assert_eq!(p.offset(), 0);
5056
5057 let p = parser(r"[:^");
5058 assert_eq!(p.maybe_parse_ascii_class(), None);
5059 assert_eq!(p.offset(), 0);
5060
5061 let p = parser(r"[^:alnum:]");
5062 assert_eq!(p.maybe_parse_ascii_class(), None);
5063 assert_eq!(p.offset(), 0);
5064
5065 let p = parser(r"[:alnnum:]");
5066 assert_eq!(p.maybe_parse_ascii_class(), None);
5067 assert_eq!(p.offset(), 0);
5068
5069 let p = parser(r"[:alnum]");
5070 assert_eq!(p.maybe_parse_ascii_class(), None);
5071 assert_eq!(p.offset(), 0);
5072
5073 let p = parser(r"[:alnum:");
5074 assert_eq!(p.maybe_parse_ascii_class(), None);
5075 assert_eq!(p.offset(), 0);
5076 }
5077
5078 #[test]
parse_unicode_class()5079 fn parse_unicode_class() {
5080 assert_eq!(
5081 parser(r"\pN").parse_escape(),
5082 Ok(Primitive::Unicode(ast::ClassUnicode {
5083 span: span(0..3),
5084 negated: false,
5085 kind: ast::ClassUnicodeKind::OneLetter('N'),
5086 })));
5087 assert_eq!(
5088 parser(r"\PN").parse_escape(),
5089 Ok(Primitive::Unicode(ast::ClassUnicode {
5090 span: span(0..3),
5091 negated: true,
5092 kind: ast::ClassUnicodeKind::OneLetter('N'),
5093 })));
5094 assert_eq!(
5095 parser(r"\p{N}").parse_escape(),
5096 Ok(Primitive::Unicode(ast::ClassUnicode {
5097 span: span(0..5),
5098 negated: false,
5099 kind: ast::ClassUnicodeKind::Named(s("N")),
5100 })));
5101 assert_eq!(
5102 parser(r"\P{N}").parse_escape(),
5103 Ok(Primitive::Unicode(ast::ClassUnicode {
5104 span: span(0..5),
5105 negated: true,
5106 kind: ast::ClassUnicodeKind::Named(s("N")),
5107 })));
5108 assert_eq!(
5109 parser(r"\p{Greek}").parse_escape(),
5110 Ok(Primitive::Unicode(ast::ClassUnicode {
5111 span: span(0..9),
5112 negated: false,
5113 kind: ast::ClassUnicodeKind::Named(s("Greek")),
5114 })));
5115
5116 assert_eq!(
5117 parser(r"\p{scx:Katakana}").parse_escape(),
5118 Ok(Primitive::Unicode(ast::ClassUnicode {
5119 span: span(0..16),
5120 negated: false,
5121 kind: ast::ClassUnicodeKind::NamedValue {
5122 op: ast::ClassUnicodeOpKind::Colon,
5123 name: s("scx"),
5124 value: s("Katakana"),
5125 },
5126 })));
5127 assert_eq!(
5128 parser(r"\p{scx=Katakana}").parse_escape(),
5129 Ok(Primitive::Unicode(ast::ClassUnicode {
5130 span: span(0..16),
5131 negated: false,
5132 kind: ast::ClassUnicodeKind::NamedValue {
5133 op: ast::ClassUnicodeOpKind::Equal,
5134 name: s("scx"),
5135 value: s("Katakana"),
5136 },
5137 })));
5138 assert_eq!(
5139 parser(r"\p{scx!=Katakana}").parse_escape(),
5140 Ok(Primitive::Unicode(ast::ClassUnicode {
5141 span: span(0..17),
5142 negated: false,
5143 kind: ast::ClassUnicodeKind::NamedValue {
5144 op: ast::ClassUnicodeOpKind::NotEqual,
5145 name: s("scx"),
5146 value: s("Katakana"),
5147 },
5148 })));
5149
5150 assert_eq!(
5151 parser(r"\p{:}").parse_escape(),
5152 Ok(Primitive::Unicode(ast::ClassUnicode {
5153 span: span(0..5),
5154 negated: false,
5155 kind: ast::ClassUnicodeKind::NamedValue {
5156 op: ast::ClassUnicodeOpKind::Colon,
5157 name: s(""),
5158 value: s(""),
5159 },
5160 })));
5161 assert_eq!(
5162 parser(r"\p{=}").parse_escape(),
5163 Ok(Primitive::Unicode(ast::ClassUnicode {
5164 span: span(0..5),
5165 negated: false,
5166 kind: ast::ClassUnicodeKind::NamedValue {
5167 op: ast::ClassUnicodeOpKind::Equal,
5168 name: s(""),
5169 value: s(""),
5170 },
5171 })));
5172 assert_eq!(
5173 parser(r"\p{!=}").parse_escape(),
5174 Ok(Primitive::Unicode(ast::ClassUnicode {
5175 span: span(0..6),
5176 negated: false,
5177 kind: ast::ClassUnicodeKind::NamedValue {
5178 op: ast::ClassUnicodeOpKind::NotEqual,
5179 name: s(""),
5180 value: s(""),
5181 },
5182 })));
5183
5184 assert_eq!(
5185 parser(r"\p").parse_escape().unwrap_err(),
5186 TestError {
5187 span: span(2..2),
5188 kind: ast::ErrorKind::EscapeUnexpectedEof,
5189 });
5190 assert_eq!(
5191 parser(r"\p{").parse_escape().unwrap_err(),
5192 TestError {
5193 span: span(3..3),
5194 kind: ast::ErrorKind::EscapeUnexpectedEof,
5195 });
5196 assert_eq!(
5197 parser(r"\p{N").parse_escape().unwrap_err(),
5198 TestError {
5199 span: span(4..4),
5200 kind: ast::ErrorKind::EscapeUnexpectedEof,
5201 });
5202 assert_eq!(
5203 parser(r"\p{Greek").parse_escape().unwrap_err(),
5204 TestError {
5205 span: span(8..8),
5206 kind: ast::ErrorKind::EscapeUnexpectedEof,
5207 });
5208
5209 assert_eq!(
5210 parser(r"\pNz").parse(),
5211 Ok(Ast::Concat(ast::Concat {
5212 span: span(0..4),
5213 asts: vec![
5214 Ast::Class(ast::Class::Unicode(ast::ClassUnicode {
5215 span: span(0..3),
5216 negated: false,
5217 kind: ast::ClassUnicodeKind::OneLetter('N'),
5218 })),
5219 Ast::Literal(ast::Literal {
5220 span: span(3..4),
5221 kind: ast::LiteralKind::Verbatim,
5222 c: 'z',
5223 }),
5224 ],
5225 })));
5226 assert_eq!(
5227 parser(r"\p{Greek}z").parse(),
5228 Ok(Ast::Concat(ast::Concat {
5229 span: span(0..10),
5230 asts: vec![
5231 Ast::Class(ast::Class::Unicode(ast::ClassUnicode {
5232 span: span(0..9),
5233 negated: false,
5234 kind: ast::ClassUnicodeKind::Named(s("Greek")),
5235 })),
5236 Ast::Literal(ast::Literal {
5237 span: span(9..10),
5238 kind: ast::LiteralKind::Verbatim,
5239 c: 'z',
5240 }),
5241 ],
5242 })));
5243 }
5244
5245 #[test]
parse_perl_class()5246 fn parse_perl_class() {
5247 assert_eq!(
5248 parser(r"\d").parse_escape(),
5249 Ok(Primitive::Perl(ast::ClassPerl {
5250 span: span(0..2),
5251 kind: ast::ClassPerlKind::Digit,
5252 negated: false,
5253 })));
5254 assert_eq!(
5255 parser(r"\D").parse_escape(),
5256 Ok(Primitive::Perl(ast::ClassPerl {
5257 span: span(0..2),
5258 kind: ast::ClassPerlKind::Digit,
5259 negated: true,
5260 })));
5261 assert_eq!(
5262 parser(r"\s").parse_escape(),
5263 Ok(Primitive::Perl(ast::ClassPerl {
5264 span: span(0..2),
5265 kind: ast::ClassPerlKind::Space,
5266 negated: false,
5267 })));
5268 assert_eq!(
5269 parser(r"\S").parse_escape(),
5270 Ok(Primitive::Perl(ast::ClassPerl {
5271 span: span(0..2),
5272 kind: ast::ClassPerlKind::Space,
5273 negated: true,
5274 })));
5275 assert_eq!(
5276 parser(r"\w").parse_escape(),
5277 Ok(Primitive::Perl(ast::ClassPerl {
5278 span: span(0..2),
5279 kind: ast::ClassPerlKind::Word,
5280 negated: false,
5281 })));
5282 assert_eq!(
5283 parser(r"\W").parse_escape(),
5284 Ok(Primitive::Perl(ast::ClassPerl {
5285 span: span(0..2),
5286 kind: ast::ClassPerlKind::Word,
5287 negated: true,
5288 })));
5289
5290 assert_eq!(
5291 parser(r"\d").parse(),
5292 Ok(Ast::Class(ast::Class::Perl(ast::ClassPerl {
5293 span: span(0..2),
5294 kind: ast::ClassPerlKind::Digit,
5295 negated: false,
5296 }))));
5297 assert_eq!(
5298 parser(r"\dz").parse(),
5299 Ok(Ast::Concat(ast::Concat {
5300 span: span(0..3),
5301 asts: vec![
5302 Ast::Class(ast::Class::Perl(ast::ClassPerl {
5303 span: span(0..2),
5304 kind: ast::ClassPerlKind::Digit,
5305 negated: false,
5306 })),
5307 Ast::Literal(ast::Literal {
5308 span: span(2..3),
5309 kind: ast::LiteralKind::Verbatim,
5310 c: 'z',
5311 }),
5312 ],
5313 })));
5314 }
5315
5316 // This tests a bug fix where the nest limit checker wasn't decrementing
5317 // its depth during post-traversal, which causes long regexes to trip
5318 // the default limit too aggressively.
5319 #[test]
regression_454_nest_too_big()5320 fn regression_454_nest_too_big() {
5321 let pattern = r#"
5322 2(?:
5323 [45]\d{3}|
5324 7(?:
5325 1[0-267]|
5326 2[0-289]|
5327 3[0-29]|
5328 4[01]|
5329 5[1-3]|
5330 6[013]|
5331 7[0178]|
5332 91
5333 )|
5334 8(?:
5335 0[125]|
5336 [139][1-6]|
5337 2[0157-9]|
5338 41|
5339 6[1-35]|
5340 7[1-5]|
5341 8[1-8]|
5342 90
5343 )|
5344 9(?:
5345 0[0-2]|
5346 1[0-4]|
5347 2[568]|
5348 3[3-6]|
5349 5[5-7]|
5350 6[0167]|
5351 7[15]|
5352 8[0146-9]
5353 )
5354 )\d{4}
5355 "#;
5356 assert!(parser_nest_limit(pattern, 50).parse().is_ok());
5357 }
5358
5359 // This tests that we treat a trailing `-` in a character class as a
5360 // literal `-` even when whitespace mode is enabled and there is whitespace
5361 // after the trailing `-`.
5362 #[test]
regression_455_trailing_dash_ignore_whitespace()5363 fn regression_455_trailing_dash_ignore_whitespace() {
5364 assert!(parser("(?x)[ / - ]").parse().is_ok());
5365 assert!(parser("(?x)[ a - ]").parse().is_ok());
5366 assert!(parser("(?x)[
5367 a
5368 - ]
5369 ").parse().is_ok());
5370 assert!(parser("(?x)[
5371 a # wat
5372 - ]
5373 ").parse().is_ok());
5374
5375 assert!(parser("(?x)[ / -").parse().is_err());
5376 assert!(parser("(?x)[ / - ").parse().is_err());
5377 assert!(parser("(?x)[
5378 / -
5379 ").parse().is_err());
5380 assert!(parser("(?x)[
5381 / - # wat
5382 ").parse().is_err());
5383 }
5384 }
5385