1 // Copyright 2018 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
4 //
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
10 
11 /*!
12 Defines a translator that converts an `Ast` to an `Hir`.
13 */
14 
15 use std::cell::{Cell, RefCell};
16 use std::result;
17 
18 use ast::{self, Ast, Span, Visitor};
19 use hir::{self, Error, ErrorKind, Hir};
20 use unicode::{self, ClassQuery};
21 
/// A `Result` whose error type is fixed to the `Error` imported from `hir`.
type Result<T> = result::Result<T, Error>;
23 
/// A builder for constructing an AST->HIR translator.
///
/// The builder only stores configuration; `build` copies it into a new
/// `Translator`.
#[derive(Clone, Debug)]
pub struct TranslatorBuilder {
    /// Whether the built translator may produce HIR that can match invalid
    /// UTF-8. Set by `allow_invalid_utf8`.
    allow_invalid_utf8: bool,
    /// The flag settings the built translator starts out with.
    flags: Flags,
}
30 
31 impl Default for TranslatorBuilder {
default() -> TranslatorBuilder32     fn default() -> TranslatorBuilder {
33         TranslatorBuilder::new()
34     }
35 }
36 
37 impl TranslatorBuilder {
38     /// Create a new translator builder with a default c onfiguration.
new() -> TranslatorBuilder39     pub fn new() -> TranslatorBuilder {
40         TranslatorBuilder {
41             allow_invalid_utf8: false,
42             flags: Flags::default(),
43         }
44     }
45 
46     /// Build a translator using the current configuration.
build(&self) -> Translator47     pub fn build(&self) -> Translator {
48         Translator {
49             stack: RefCell::new(vec![]),
50             flags: Cell::new(self.flags),
51             allow_invalid_utf8: self.allow_invalid_utf8,
52         }
53     }
54 
55     /// When enabled, translation will permit the construction of a regular
56     /// expression that may match invalid UTF-8.
57     ///
58     /// When disabled (the default), the translator is guaranteed to produce
59     /// an expression that will only ever match valid UTF-8 (otherwise, the
60     /// translator will return an error).
61     ///
62     /// Perhaps surprisingly, when invalid UTF-8 isn't allowed, a negated ASCII
63     /// word boundary (uttered as `(?-u:\B)` in the concrete syntax) will cause
64     /// the parser to return an error. Namely, a negated ASCII word boundary
65     /// can result in matching positions that aren't valid UTF-8 boundaries.
allow_invalid_utf8( &mut self, yes: bool, ) -> &mut TranslatorBuilder66     pub fn allow_invalid_utf8(
67         &mut self,
68         yes: bool,
69     ) -> &mut TranslatorBuilder {
70         self.allow_invalid_utf8 = yes;
71         self
72     }
73 
74     /// Enable or disable the case insensitive flag (`i`) by default.
case_insensitive(&mut self, yes: bool) -> &mut TranslatorBuilder75     pub fn case_insensitive(&mut self, yes: bool) -> &mut TranslatorBuilder {
76         self.flags.case_insensitive = if yes { Some(true) } else { None };
77         self
78     }
79 
80     /// Enable or disable the multi-line matching flag (`m`) by default.
multi_line(&mut self, yes: bool) -> &mut TranslatorBuilder81     pub fn multi_line(&mut self, yes: bool) -> &mut TranslatorBuilder {
82         self.flags.multi_line = if yes { Some(true) } else { None };
83         self
84     }
85 
86     /// Enable or disable the "dot matches any character" flag (`s`) by
87     /// default.
dot_matches_new_line( &mut self, yes: bool, ) -> &mut TranslatorBuilder88     pub fn dot_matches_new_line(
89         &mut self,
90         yes: bool,
91     ) -> &mut TranslatorBuilder {
92         self.flags.dot_matches_new_line = if yes { Some(true) } else { None };
93         self
94     }
95 
96     /// Enable or disable the "swap greed" flag (`U`) by default.
swap_greed(&mut self, yes: bool) -> &mut TranslatorBuilder97     pub fn swap_greed(&mut self, yes: bool) -> &mut TranslatorBuilder {
98         self.flags.swap_greed = if yes { Some(true) } else { None };
99         self
100     }
101 
102     /// Enable or disable the Unicode flag (`u`) by default.
unicode(&mut self, yes: bool) -> &mut TranslatorBuilder103     pub fn unicode(&mut self, yes: bool) -> &mut TranslatorBuilder {
104         self.flags.unicode = if yes { None } else { Some(false) };
105         self
106     }
107 }
108 
/// A translator maps abstract syntax to a high level intermediate
/// representation.
///
/// A translator may benefit from reuse. That is, a single translator can
/// translate many abstract syntax trees.
///
/// A `Translator` can be configured in more detail via a
/// [`TranslatorBuilder`](struct.TranslatorBuilder.html).
#[derive(Clone, Debug)]
pub struct Translator {
    /// Our call stack, but on the heap.
    stack: RefCell<Vec<HirFrame>>,
    /// The current flag settings.
    flags: Cell<Flags>,
    /// Whether we're allowed to produce HIR that can match arbitrary bytes.
    allow_invalid_utf8: bool,
}
126 
127 impl Translator {
128     /// Create a new translator using the default configuration.
new() -> Translator129     pub fn new() -> Translator {
130         TranslatorBuilder::new().build()
131     }
132 
133     /// Translate the given abstract syntax tree (AST) into a high level
134     /// intermediate representation (HIR).
135     ///
136     /// If there was a problem doing the translation, then an HIR-specific
137     /// error is returned.
138     ///
139     /// The original pattern string used to produce the `Ast` *must* also be
140     /// provided. The translator does not use the pattern string during any
141     /// correct translation, but is used for error reporting.
translate(&mut self, pattern: &str, ast: &Ast) -> Result<Hir>142     pub fn translate(&mut self, pattern: &str, ast: &Ast) -> Result<Hir> {
143         ast::visit(ast, TranslatorI::new(self, pattern))
144     }
145 }
146 
/// An HirFrame is a single stack frame, represented explicitly, which is
/// created for each item in the Ast that we traverse.
///
/// Note that technically, this type doesn't represent our entire stack
/// frame. In particular, the Ast visitor represents any state associated with
/// traversing the Ast itself.
#[derive(Clone, Debug)]
enum HirFrame {
    /// An arbitrary HIR expression. These get pushed whenever we hit a base
    /// case in the Ast. They get popped after an inductive (i.e., recursive)
    /// step is complete.
    Expr(Hir),
    /// A Unicode character class. This frame is mutated as we descend into
    /// the Ast of a character class (which is itself its own mini recursive
    /// structure).
    ClassUnicode(hir::ClassUnicode),
    /// A byte-oriented character class. This frame is mutated as we descend
    /// into the Ast of a character class (which is itself its own mini
    /// recursive structure).
    ///
    /// Byte character classes are created when Unicode mode (`u`) is disabled.
    /// If `allow_invalid_utf8` is disabled (the default), then a byte
    /// character is only permitted to match ASCII text.
    ClassBytes(hir::ClassBytes),
    /// This is pushed on to the stack upon first seeing any kind of group,
    /// indicated by parentheses (including non-capturing groups). It is popped
    /// upon leaving a group.
    Group {
        /// The old active flags, if any, when this group was opened.
        ///
        /// If this group sets flags, then the new active flags are set to the
        /// result of merging the old flags with the flags introduced by this
        /// group.
        ///
        /// When this group is popped, the active flags should be restored to
        /// the flags set here.
        ///
        /// The "active" flags correspond to whatever flags are set in the
        /// Translator.
        old_flags: Option<Flags>,
    },
    /// A marker pushed whenever a concatenation is observed. After visiting
    /// every sub-expression in the concatenation, the translator's stack is
    /// popped until it sees a Concat frame.
    Concat,
    /// A marker pushed whenever an alternation is observed. After visiting
    /// every sub-expression in the alternation, the translator's stack is
    /// popped until it sees an Alternation frame.
    Alternation,
}
197 
198 impl HirFrame {
199     /// Assert that the current stack frame is an Hir expression and return it.
unwrap_expr(self) -> Hir200     fn unwrap_expr(self) -> Hir {
201         match self {
202             HirFrame::Expr(expr) => expr,
203             _ => panic!("tried to unwrap expr from HirFrame, got: {:?}", self)
204         }
205     }
206 
207     /// Assert that the current stack frame is a Unicode class expression and
208     /// return it.
unwrap_class_unicode(self) -> hir::ClassUnicode209     fn unwrap_class_unicode(self) -> hir::ClassUnicode {
210         match self {
211             HirFrame::ClassUnicode(cls) => cls,
212             _ => panic!("tried to unwrap Unicode class \
213                          from HirFrame, got: {:?}", self)
214         }
215     }
216 
217     /// Assert that the current stack frame is a byte class expression and
218     /// return it.
unwrap_class_bytes(self) -> hir::ClassBytes219     fn unwrap_class_bytes(self) -> hir::ClassBytes {
220         match self {
221             HirFrame::ClassBytes(cls) => cls,
222             _ => panic!("tried to unwrap byte class \
223                          from HirFrame, got: {:?}", self)
224         }
225     }
226 
227     /// Assert that the current stack frame is a group indicator and return
228     /// its corresponding flags (the flags that were active at the time the
229     /// group was entered) if they exist.
unwrap_group(self) -> Option<Flags>230     fn unwrap_group(self) -> Option<Flags> {
231         match self {
232             HirFrame::Group { old_flags } => old_flags,
233             _ => panic!("tried to unwrap group from HirFrame, got: {:?}", self)
234         }
235     }
236 }
237 
238 impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
239     type Output = Hir;
240     type Err = Error;
241 
finish(self) -> Result<Hir>242     fn finish(self) -> Result<Hir> {
243         // ... otherwise, we should have exactly one HIR on the stack.
244         assert_eq!(self.trans().stack.borrow().len(), 1);
245         Ok(self.pop().unwrap().unwrap_expr())
246     }
247 
visit_pre(&mut self, ast: &Ast) -> Result<()>248     fn visit_pre(&mut self, ast: &Ast) -> Result<()> {
249         match *ast {
250             Ast::Class(ast::Class::Bracketed(_)) => {
251                 if self.flags().unicode() {
252                     let cls = hir::ClassUnicode::empty();
253                     self.push(HirFrame::ClassUnicode(cls));
254                 } else {
255                     let cls = hir::ClassBytes::empty();
256                     self.push(HirFrame::ClassBytes(cls));
257                 }
258             }
259             Ast::Group(ref x) => {
260                 let old_flags = x.flags().map(|ast| self.set_flags(ast));
261                 self.push(HirFrame::Group {
262                     old_flags: old_flags,
263                 });
264             }
265             Ast::Concat(ref x) if x.asts.is_empty() => {}
266             Ast::Concat(_) => {
267                 self.push(HirFrame::Concat);
268             }
269             Ast::Alternation(ref x) if x.asts.is_empty() => {}
270             Ast::Alternation(_) => {
271                 self.push(HirFrame::Alternation);
272             }
273             _ => {}
274         }
275         Ok(())
276     }
277 
visit_post(&mut self, ast: &Ast) -> Result<()>278     fn visit_post(&mut self, ast: &Ast) -> Result<()> {
279         match *ast {
280             Ast::Empty(_) => {
281                 self.push(HirFrame::Expr(Hir::empty()));
282             }
283             Ast::Flags(ref x) => {
284                 self.set_flags(&x.flags);
285                 // Flags in the AST are generally considered directives and
286                 // not actual sub-expressions. However, they can be used in
287                 // the concrete syntax like `((?i))`, and we need some kind of
288                 // indication of an expression there, and Empty is the correct
289                 // choice.
290                 //
291                 // There can also be things like `(?i)+`, but we rule those out
292                 // in the parser. In the future, we might allow them for
293                 // consistency sake.
294                 self.push(HirFrame::Expr(Hir::empty()));
295             }
296             Ast::Literal(ref x) => {
297                 self.push(HirFrame::Expr(self.hir_literal(x)?));
298             }
299             Ast::Dot(span) => {
300                 self.push(HirFrame::Expr(self.hir_dot(span)?));
301             }
302             Ast::Assertion(ref x) => {
303                 self.push(HirFrame::Expr(self.hir_assertion(x)?));
304             }
305             Ast::Class(ast::Class::Perl(ref x)) => {
306                 if self.flags().unicode() {
307                     let cls = self.hir_perl_unicode_class(x);
308                     let hcls = hir::Class::Unicode(cls);
309                     self.push(HirFrame::Expr(Hir::class(hcls)));
310                 } else {
311                     let cls = self.hir_perl_byte_class(x);
312                     let hcls = hir::Class::Bytes(cls);
313                     self.push(HirFrame::Expr(Hir::class(hcls)));
314                 }
315             }
316             Ast::Class(ast::Class::Unicode(ref x)) => {
317                 let cls = hir::Class::Unicode(self.hir_unicode_class(x)?);
318                 self.push(HirFrame::Expr(Hir::class(cls)));
319             }
320             Ast::Class(ast::Class::Bracketed(ref ast)) => {
321                 if self.flags().unicode() {
322                     let mut cls = self.pop().unwrap().unwrap_class_unicode();
323                     self.unicode_fold_and_negate(ast.negated, &mut cls);
324                     if cls.iter().next().is_none() {
325                         return Err(self.error(
326                             ast.span, ErrorKind::EmptyClassNotAllowed));
327                     }
328                     let expr = Hir::class(hir::Class::Unicode(cls));
329                     self.push(HirFrame::Expr(expr));
330                 } else {
331                     let mut cls = self.pop().unwrap().unwrap_class_bytes();
332                     self.bytes_fold_and_negate(
333                         &ast.span, ast.negated, &mut cls)?;
334                     if cls.iter().next().is_none() {
335                         return Err(self.error(
336                             ast.span, ErrorKind::EmptyClassNotAllowed));
337                     }
338 
339                     let expr = Hir::class(hir::Class::Bytes(cls));
340                     self.push(HirFrame::Expr(expr));
341                 }
342             }
343             Ast::Repetition(ref x) => {
344                 let expr = self.pop().unwrap().unwrap_expr();
345                 self.push(HirFrame::Expr(self.hir_repetition(x, expr)));
346             }
347             Ast::Group(ref x) => {
348                 let expr = self.pop().unwrap().unwrap_expr();
349                 if let Some(flags) = self.pop().unwrap().unwrap_group() {
350                     self.trans().flags.set(flags);
351                 }
352                 self.push(HirFrame::Expr(self.hir_group(x, expr)));
353             }
354             Ast::Concat(_) => {
355                 let mut exprs = vec![];
356                 while let Some(HirFrame::Expr(expr)) = self.pop() {
357                     if !expr.kind().is_empty() {
358                         exprs.push(expr);
359                     }
360                 }
361                 exprs.reverse();
362                 self.push(HirFrame::Expr(Hir::concat(exprs)));
363             }
364             Ast::Alternation(_) => {
365                 let mut exprs = vec![];
366                 while let Some(HirFrame::Expr(expr)) = self.pop() {
367                     exprs.push(expr);
368                 }
369                 exprs.reverse();
370                 self.push(HirFrame::Expr(Hir::alternation(exprs)));
371             }
372         }
373         Ok(())
374     }
375 
visit_class_set_item_pre( &mut self, ast: &ast::ClassSetItem, ) -> Result<()>376     fn visit_class_set_item_pre(
377         &mut self,
378         ast: &ast::ClassSetItem,
379     ) -> Result<()> {
380         match *ast {
381             ast::ClassSetItem::Bracketed(_) => {
382                 if self.flags().unicode() {
383                     let cls = hir::ClassUnicode::empty();
384                     self.push(HirFrame::ClassUnicode(cls));
385                 } else {
386                     let cls = hir::ClassBytes::empty();
387                     self.push(HirFrame::ClassBytes(cls));
388                 }
389             }
390             // We needn't handle the Union case here since the visitor will
391             // do it for us.
392             _ => {}
393         }
394         Ok(())
395     }
396 
visit_class_set_item_post( &mut self, ast: &ast::ClassSetItem, ) -> Result<()>397     fn visit_class_set_item_post(
398         &mut self,
399         ast: &ast::ClassSetItem,
400     ) -> Result<()> {
401         match *ast {
402             ast::ClassSetItem::Empty(_) => {}
403             ast::ClassSetItem::Literal(ref x) => {
404                 if self.flags().unicode() {
405                     let mut cls = self.pop().unwrap().unwrap_class_unicode();
406                     cls.push(hir::ClassUnicodeRange::new(x.c, x.c));
407                     self.push(HirFrame::ClassUnicode(cls));
408                 } else {
409                     let mut cls = self.pop().unwrap().unwrap_class_bytes();
410                     let byte = self.class_literal_byte(x)?;
411                     cls.push(hir::ClassBytesRange::new(byte, byte));
412                     self.push(HirFrame::ClassBytes(cls));
413                 }
414             }
415             ast::ClassSetItem::Range(ref x) => {
416                 if self.flags().unicode() {
417                     let mut cls = self.pop().unwrap().unwrap_class_unicode();
418                     cls.push(hir::ClassUnicodeRange::new(x.start.c, x.end.c));
419                     self.push(HirFrame::ClassUnicode(cls));
420                 } else {
421                     let mut cls = self.pop().unwrap().unwrap_class_bytes();
422                     let start = self.class_literal_byte(&x.start)?;
423                     let end = self.class_literal_byte(&x.end)?;
424                     cls.push(hir::ClassBytesRange::new(start, end));
425                     self.push(HirFrame::ClassBytes(cls));
426                 }
427             }
428             ast::ClassSetItem::Ascii(ref x) => {
429                 if self.flags().unicode() {
430                     let mut cls = self.pop().unwrap().unwrap_class_unicode();
431                     for &(s, e) in ascii_class(&x.kind) {
432                         cls.push(hir::ClassUnicodeRange::new(s, e));
433                     }
434                     self.unicode_fold_and_negate(x.negated, &mut cls);
435                     self.push(HirFrame::ClassUnicode(cls));
436                 } else {
437                     let mut cls = self.pop().unwrap().unwrap_class_bytes();
438                     for &(s, e) in ascii_class(&x.kind) {
439                         cls.push(hir::ClassBytesRange::new(s as u8, e as u8));
440                     }
441                     self.bytes_fold_and_negate(
442                         &x.span, x.negated, &mut cls)?;
443                     self.push(HirFrame::ClassBytes(cls));
444                 }
445             }
446             ast::ClassSetItem::Unicode(ref x) => {
447                 let xcls = self.hir_unicode_class(x)?;
448                 let mut cls = self.pop().unwrap().unwrap_class_unicode();
449                 cls.union(&xcls);
450                 self.push(HirFrame::ClassUnicode(cls));
451             }
452             ast::ClassSetItem::Perl(ref x) => {
453                 if self.flags().unicode() {
454                     let xcls = self.hir_perl_unicode_class(x);
455                     let mut cls = self.pop().unwrap().unwrap_class_unicode();
456                     cls.union(&xcls);
457                     self.push(HirFrame::ClassUnicode(cls));
458                 } else {
459                     let xcls = self.hir_perl_byte_class(x);
460                     let mut cls = self.pop().unwrap().unwrap_class_bytes();
461                     cls.union(&xcls);
462                     self.push(HirFrame::ClassBytes(cls));
463                 }
464             }
465             ast::ClassSetItem::Bracketed(ref ast) => {
466                 if self.flags().unicode() {
467                     let mut cls1 = self.pop().unwrap().unwrap_class_unicode();
468                     self.unicode_fold_and_negate(ast.negated, &mut cls1);
469 
470                     let mut cls2 = self.pop().unwrap().unwrap_class_unicode();
471                     cls2.union(&cls1);
472                     self.push(HirFrame::ClassUnicode(cls2));
473                 } else {
474                     let mut cls1 = self.pop().unwrap().unwrap_class_bytes();
475                     self.bytes_fold_and_negate(
476                         &ast.span, ast.negated, &mut cls1)?;
477 
478                     let mut cls2 = self.pop().unwrap().unwrap_class_bytes();
479                     cls2.union(&cls1);
480                     self.push(HirFrame::ClassBytes(cls2));
481                 }
482             }
483             // This is handled automatically by the visitor.
484             ast::ClassSetItem::Union(_) => {}
485         }
486         Ok(())
487     }
488 
visit_class_set_binary_op_pre( &mut self, _op: &ast::ClassSetBinaryOp, ) -> Result<()>489     fn visit_class_set_binary_op_pre(
490         &mut self,
491         _op: &ast::ClassSetBinaryOp,
492     ) -> Result<()> {
493         if self.flags().unicode() {
494             let cls = hir::ClassUnicode::empty();
495             self.push(HirFrame::ClassUnicode(cls));
496         } else {
497             let cls = hir::ClassBytes::empty();
498             self.push(HirFrame::ClassBytes(cls));
499         }
500         Ok(())
501     }
502 
visit_class_set_binary_op_in( &mut self, _op: &ast::ClassSetBinaryOp, ) -> Result<()>503     fn visit_class_set_binary_op_in(
504         &mut self,
505         _op: &ast::ClassSetBinaryOp,
506     ) -> Result<()> {
507         if self.flags().unicode() {
508             let cls = hir::ClassUnicode::empty();
509             self.push(HirFrame::ClassUnicode(cls));
510         } else {
511             let cls = hir::ClassBytes::empty();
512             self.push(HirFrame::ClassBytes(cls));
513         }
514         Ok(())
515     }
516 
visit_class_set_binary_op_post( &mut self, op: &ast::ClassSetBinaryOp, ) -> Result<()>517     fn visit_class_set_binary_op_post(
518         &mut self,
519         op: &ast::ClassSetBinaryOp,
520     ) -> Result<()> {
521         use ast::ClassSetBinaryOpKind::*;
522 
523         if self.flags().unicode() {
524             let mut rhs = self.pop().unwrap().unwrap_class_unicode();
525             let mut lhs = self.pop().unwrap().unwrap_class_unicode();
526             let mut cls = self.pop().unwrap().unwrap_class_unicode();
527             if self.flags().case_insensitive() {
528                 rhs.case_fold_simple();
529                 lhs.case_fold_simple();
530             }
531             match op.kind {
532                 Intersection => lhs.intersect(&rhs),
533                 Difference => lhs.difference(&rhs),
534                 SymmetricDifference => lhs.symmetric_difference(&rhs),
535             }
536             cls.union(&lhs);
537             self.push(HirFrame::ClassUnicode(cls));
538         } else {
539             let mut rhs = self.pop().unwrap().unwrap_class_bytes();
540             let mut lhs = self.pop().unwrap().unwrap_class_bytes();
541             let mut cls = self.pop().unwrap().unwrap_class_bytes();
542             if self.flags().case_insensitive() {
543                 rhs.case_fold_simple();
544                 lhs.case_fold_simple();
545             }
546             match op.kind {
547                 Intersection => lhs.intersect(&rhs),
548                 Difference => lhs.difference(&rhs),
549                 SymmetricDifference => lhs.symmetric_difference(&rhs),
550             }
551             cls.union(&lhs);
552             self.push(HirFrame::ClassBytes(cls));
553         }
554         Ok(())
555     }
556 }
557 
/// The internal implementation of a translator.
///
/// This type is responsible for carrying around the original pattern string,
/// which is not tied to the internal state of a translator.
///
/// A TranslatorI exists for the time it takes to translate a single Ast.
#[derive(Clone, Debug)]
struct TranslatorI<'t, 'p> {
    /// The translator whose stack and flags this traversal reads and updates.
    trans: &'t Translator,
    /// The original pattern string; copied into any error that is created.
    pattern: &'p str,
}
569 
570 impl<'t, 'p> TranslatorI<'t, 'p> {
571     /// Build a new internal translator.
new(trans: &'t Translator, pattern: &'p str) -> TranslatorI<'t, 'p>572     fn new(trans: &'t Translator, pattern: &'p str) -> TranslatorI<'t, 'p> {
573         TranslatorI { trans: trans, pattern: pattern }
574     }
575 
576     /// Return a reference to the underlying translator.
trans(&self) -> &Translator577     fn trans(&self) -> &Translator {
578         &self.trans
579     }
580 
581     /// Push the given frame on to the call stack.
push(&self, frame: HirFrame)582     fn push(&self, frame: HirFrame) {
583         self.trans().stack.borrow_mut().push(frame);
584     }
585 
586     /// Pop the top of the call stack. If the call stack is empty, return None.
pop(&self) -> Option<HirFrame>587     fn pop(&self) -> Option<HirFrame> {
588         self.trans().stack.borrow_mut().pop()
589     }
590 
591     /// Create a new error with the given span and error type.
error(&self, span: Span, kind: ErrorKind) -> Error592     fn error(&self, span: Span, kind: ErrorKind) -> Error {
593         Error { kind: kind, pattern: self.pattern.to_string(), span: span }
594     }
595 
596     /// Return a copy of the active flags.
flags(&self) -> Flags597     fn flags(&self) -> Flags {
598         self.trans().flags.get()
599     }
600 
601     /// Set the flags of this translator from the flags set in the given AST.
602     /// Then, return the old flags.
set_flags(&self, ast_flags: &ast::Flags) -> Flags603     fn set_flags(&self, ast_flags: &ast::Flags) -> Flags {
604         let old_flags = self.flags();
605         let mut new_flags = Flags::from_ast(ast_flags);
606         new_flags.merge(&old_flags);
607         self.trans().flags.set(new_flags);
608         old_flags
609     }
610 
hir_literal(&self, lit: &ast::Literal) -> Result<Hir>611     fn hir_literal(&self, lit: &ast::Literal) -> Result<Hir> {
612         let ch = match self.literal_to_char(lit)? {
613             byte @ hir::Literal::Byte(_) => return Ok(Hir::literal(byte)),
614             hir::Literal::Unicode(ch) => ch,
615         };
616         if self.flags().case_insensitive() {
617             self.hir_from_char_case_insensitive(lit.span, ch)
618         } else {
619             self.hir_from_char(lit.span, ch)
620         }
621     }
622 
623     /// Convert an Ast literal to its scalar representation.
624     ///
625     /// When Unicode mode is enabled, then this always succeeds and returns a
626     /// `char` (Unicode scalar value).
627     ///
628     /// When Unicode mode is disabled, then a raw byte is returned. If that
629     /// byte is not ASCII and invalid UTF-8 is not allowed, then this returns
630     /// an error.
literal_to_char(&self, lit: &ast::Literal) -> Result<hir::Literal>631     fn literal_to_char(&self, lit: &ast::Literal) -> Result<hir::Literal> {
632         if self.flags().unicode() {
633             return Ok(hir::Literal::Unicode(lit.c));
634         }
635         let byte = match lit.byte() {
636             None => return Ok(hir::Literal::Unicode(lit.c)),
637             Some(byte) => byte,
638         };
639         if byte <= 0x7F {
640             return Ok(hir::Literal::Unicode(byte as char));
641         }
642         if !self.trans().allow_invalid_utf8 {
643             return Err(self.error(lit.span, ErrorKind::InvalidUtf8));
644         }
645         Ok(hir::Literal::Byte(byte))
646     }
647 
hir_from_char(&self, span: Span, c: char) -> Result<Hir>648     fn hir_from_char(&self, span: Span, c: char) -> Result<Hir> {
649         if !self.flags().unicode() && c.len_utf8() > 1 {
650             return Err(self.error(span, ErrorKind::UnicodeNotAllowed));
651         }
652         Ok(Hir::literal(hir::Literal::Unicode(c)))
653     }
654 
hir_from_char_case_insensitive( &self, span: Span, c: char, ) -> Result<Hir>655     fn hir_from_char_case_insensitive(
656         &self,
657         span: Span,
658         c: char,
659     ) -> Result<Hir> {
660         // If case folding won't do anything, then don't bother trying.
661         if !unicode::contains_simple_case_mapping(c, c) {
662             return self.hir_from_char(span, c);
663         }
664         if self.flags().unicode() {
665             let mut cls = hir::ClassUnicode::new(vec![
666                 hir::ClassUnicodeRange::new(c, c),
667             ]);
668             cls.case_fold_simple();
669             Ok(Hir::class(hir::Class::Unicode(cls)))
670         } else {
671             if c.len_utf8() > 1 {
672                 return Err(self.error(span, ErrorKind::UnicodeNotAllowed));
673             }
674             let mut cls = hir::ClassBytes::new(vec![
675                 hir::ClassBytesRange::new(c as u8, c as u8),
676             ]);
677             cls.case_fold_simple();
678             Ok(Hir::class(hir::Class::Bytes(cls)))
679         }
680     }
681 
hir_dot(&self, span: Span) -> Result<Hir>682     fn hir_dot(&self, span: Span) -> Result<Hir> {
683         let unicode = self.flags().unicode();
684         if !unicode && !self.trans().allow_invalid_utf8 {
685             return Err(self.error(span, ErrorKind::InvalidUtf8));
686         }
687         Ok(if self.flags().dot_matches_new_line() {
688             Hir::any(!unicode)
689         } else {
690             Hir::dot(!unicode)
691         })
692     }
693 
hir_assertion(&self, asst: &ast::Assertion) -> Result<Hir>694     fn hir_assertion(&self, asst: &ast::Assertion) -> Result<Hir> {
695         let unicode = self.flags().unicode();
696         let multi_line = self.flags().multi_line();
697         Ok(match asst.kind {
698             ast::AssertionKind::StartLine => {
699                 Hir::anchor(if multi_line {
700                     hir::Anchor::StartLine
701                 } else {
702                     hir::Anchor::StartText
703                 })
704             }
705             ast::AssertionKind::EndLine => {
706                 Hir::anchor(if multi_line {
707                     hir::Anchor::EndLine
708                 } else {
709                     hir::Anchor::EndText
710                 })
711             }
712             ast::AssertionKind::StartText => {
713                 Hir::anchor(hir::Anchor::StartText)
714             }
715             ast::AssertionKind::EndText => {
716                 Hir::anchor(hir::Anchor::EndText)
717             }
718             ast::AssertionKind::WordBoundary => {
719                 Hir::word_boundary(if unicode {
720                     hir::WordBoundary::Unicode
721                 } else {
722                     hir::WordBoundary::Ascii
723                 })
724             }
725             ast::AssertionKind::NotWordBoundary => {
726                 Hir::word_boundary(if unicode {
727                     hir::WordBoundary::UnicodeNegate
728                 } else {
729                     // It is possible for negated ASCII word boundaries to
730                     // match at invalid UTF-8 boundaries, even when searching
731                     // valid UTF-8.
732                     if !self.trans().allow_invalid_utf8 {
733                         return Err(self.error(
734                             asst.span, ErrorKind::InvalidUtf8));
735                     }
736                     hir::WordBoundary::AsciiNegate
737                 })
738             }
739         })
740     }
741 
hir_group(&self, group: &ast::Group, expr: Hir) -> Hir742     fn hir_group(&self, group: &ast::Group, expr: Hir) -> Hir {
743         let kind = match group.kind {
744             ast::GroupKind::CaptureIndex(idx) => {
745                 hir::GroupKind::CaptureIndex(idx)
746             }
747             ast::GroupKind::CaptureName(ref capname) => {
748                 hir::GroupKind::CaptureName {
749                     name: capname.name.clone(),
750                     index: capname.index,
751                 }
752             }
753             ast::GroupKind::NonCapturing(_) => hir::GroupKind::NonCapturing,
754         };
755         Hir::group(hir::Group {
756             kind: kind,
757             hir: Box::new(expr),
758         })
759     }
760 
hir_repetition(&self, rep: &ast::Repetition, expr: Hir) -> Hir761     fn hir_repetition(&self, rep: &ast::Repetition, expr: Hir) -> Hir {
762         let kind = match rep.op.kind {
763             ast::RepetitionKind::ZeroOrOne => hir::RepetitionKind::ZeroOrOne,
764             ast::RepetitionKind::ZeroOrMore => hir::RepetitionKind::ZeroOrMore,
765             ast::RepetitionKind::OneOrMore => hir::RepetitionKind::OneOrMore,
766             ast::RepetitionKind::Range(ast::RepetitionRange::Exactly(m)) => {
767                 hir::RepetitionKind::Range(hir::RepetitionRange::Exactly(m))
768             }
769             ast::RepetitionKind::Range(ast::RepetitionRange::AtLeast(m)) => {
770                 hir::RepetitionKind::Range(hir::RepetitionRange::AtLeast(m))
771             }
772             ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(m,n)) => {
773                 hir::RepetitionKind::Range(hir::RepetitionRange::Bounded(m, n))
774             }
775         };
776         let greedy =
777             if self.flags().swap_greed() {
778                 !rep.greedy
779             } else {
780                 rep.greedy
781             };
782         Hir::repetition(hir::Repetition {
783             kind: kind,
784             greedy: greedy,
785             hir: Box::new(expr),
786         })
787     }
788 
hir_unicode_class( &self, ast_class: &ast::ClassUnicode, ) -> Result<hir::ClassUnicode>789     fn hir_unicode_class(
790         &self,
791         ast_class: &ast::ClassUnicode,
792     ) -> Result<hir::ClassUnicode> {
793         use ast::ClassUnicodeKind::*;
794 
795         if !self.flags().unicode() {
796             return Err(self.error(
797                 ast_class.span,
798                 ErrorKind::UnicodeNotAllowed,
799             ));
800         }
801         let query = match ast_class.kind {
802             OneLetter(name) => ClassQuery::OneLetter(name),
803             Named(ref name) => ClassQuery::Binary(name),
804             NamedValue { ref name, ref value, .. } => {
805                 ClassQuery::ByValue {
806                     property_name: name,
807                     property_value: value,
808                 }
809             }
810         };
811         match unicode::class(query) {
812             Ok(mut class) => {
813                 self.unicode_fold_and_negate(ast_class.negated, &mut class);
814                 Ok(class)
815             }
816             Err(unicode::Error::PropertyNotFound) => {
817                 Err(self.error(
818                     ast_class.span,
819                     ErrorKind::UnicodePropertyNotFound,
820                 ))
821             }
822             Err(unicode::Error::PropertyValueNotFound) => {
823                 Err(self.error(
824                     ast_class.span,
825                     ErrorKind::UnicodePropertyValueNotFound,
826                 ))
827             }
828         }
829     }
830 
hir_perl_unicode_class( &self, ast_class: &ast::ClassPerl, ) -> hir::ClassUnicode831     fn hir_perl_unicode_class(
832         &self,
833         ast_class: &ast::ClassPerl,
834     ) -> hir::ClassUnicode {
835         use ast::ClassPerlKind::*;
836         use unicode_tables::perl_word::PERL_WORD;
837 
838         assert!(self.flags().unicode());
839         let mut class = match ast_class.kind {
840             Digit => {
841                 let query = ClassQuery::Binary("Decimal_Number");
842                 unicode::class(query).unwrap()
843             }
844             Space => {
845                 let query = ClassQuery::Binary("Whitespace");
846                 unicode::class(query).unwrap()
847             }
848             Word => unicode::hir_class(PERL_WORD),
849         };
850         // We needn't apply case folding here because the Perl Unicode classes
851         // are already closed under Unicode simple case folding.
852         if ast_class.negated {
853             class.negate();
854         }
855         class
856     }
857 
hir_perl_byte_class( &self, ast_class: &ast::ClassPerl, ) -> hir::ClassBytes858     fn hir_perl_byte_class(
859         &self,
860         ast_class: &ast::ClassPerl,
861     ) -> hir::ClassBytes {
862         use ast::ClassPerlKind::*;
863 
864         assert!(!self.flags().unicode());
865         let mut class = match ast_class.kind {
866             Digit => hir_ascii_class_bytes(&ast::ClassAsciiKind::Digit),
867             Space => hir_ascii_class_bytes(&ast::ClassAsciiKind::Space),
868             Word => hir_ascii_class_bytes(&ast::ClassAsciiKind::Word),
869         };
870         // We needn't apply case folding here because the Perl ASCII classes
871         // are already closed (under ASCII case folding).
872         if ast_class.negated {
873             class.negate();
874         }
875         class
876     }
877 
unicode_fold_and_negate( &self, negated: bool, class: &mut hir::ClassUnicode, )878     fn unicode_fold_and_negate(
879         &self,
880         negated: bool,
881         class: &mut hir::ClassUnicode,
882     ) {
883         // Note that we must apply case folding before negation!
884         // Consider `(?i)[^x]`. If we applied negation field, then
885         // the result would be the character class that matched any
886         // Unicode scalar value.
887         if self.flags().case_insensitive() {
888             class.case_fold_simple();
889         }
890         if negated {
891             class.negate();
892         }
893     }
894 
bytes_fold_and_negate( &self, span: &Span, negated: bool, class: &mut hir::ClassBytes, ) -> Result<()>895     fn bytes_fold_and_negate(
896         &self,
897         span: &Span,
898         negated: bool,
899         class: &mut hir::ClassBytes,
900     ) -> Result<()> {
901         // Note that we must apply case folding before negation!
902         // Consider `(?i)[^x]`. If we applied negation field, then
903         // the result would be the character class that matched any
904         // Unicode scalar value.
905         if self.flags().case_insensitive() {
906             class.case_fold_simple();
907         }
908         if negated {
909             class.negate();
910         }
911         if !self.trans().allow_invalid_utf8 && !class.is_all_ascii() {
912             return Err(self.error(span.clone(), ErrorKind::InvalidUtf8));
913         }
914         Ok(())
915     }
916 
917     /// Return a scalar byte value suitable for use as a literal in a byte
918     /// character class.
class_literal_byte(&self, ast: &ast::Literal) -> Result<u8>919     fn class_literal_byte(&self, ast: &ast::Literal) -> Result<u8> {
920         match self.literal_to_char(ast)? {
921             hir::Literal::Byte(byte) => Ok(byte),
922             hir::Literal::Unicode(ch) => {
923                 if ch <= 0x7F as char {
924                     Ok(ch as u8)
925                 } else {
926                     // We can't feasibly support Unicode in
927                     // byte oriented classes. Byte classes don't
928                     // do Unicode case folding.
929                     Err(self.error(ast.span, ErrorKind::UnicodeNotAllowed))
930                 }
931             }
932         }
933     }
934 }
935 
/// A translator's representation of a regular expression's flags at any given
/// moment in time.
///
/// Each flag can be in one of three states: absent, present but disabled or
/// present but enabled.
///
/// An absent flag (`None`) is filled in from the previous flags by `merge`.
/// If it is still absent when queried, the accessor methods fall back to a
/// default: `true` for `unicode` and `false` for every other flag.
#[derive(Clone, Copy, Debug, Default)]
struct Flags {
    // When enabled, character classes are case folded during translation.
    case_insensitive: Option<bool>,
    // When enabled, start/end-of-line assertions translate to line anchors
    // instead of text anchors.
    multi_line: Option<bool>,
    // When enabled, `.` is translated with `Hir::any` instead of `Hir::dot`.
    dot_matches_new_line: Option<bool>,
    // When enabled, the greediness of every repetition is inverted.
    swap_greed: Option<bool>,
    // Selects Unicode (enabled) or byte oriented (disabled) translation.
    unicode: Option<bool>,
    // Note that `ignore_whitespace` is omitted here because it is handled
    // entirely in the parser.
}
951 
952 impl Flags {
from_ast(ast: &ast::Flags) -> Flags953     fn from_ast(ast: &ast::Flags) -> Flags {
954         let mut flags = Flags::default();
955         let mut enable = true;
956         for item in &ast.items {
957             match item.kind {
958                 ast::FlagsItemKind::Negation => {
959                     enable = false;
960                 }
961                 ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive) => {
962                     flags.case_insensitive = Some(enable);
963                 }
964                 ast::FlagsItemKind::Flag(ast::Flag::MultiLine) => {
965                     flags.multi_line = Some(enable);
966                 }
967                 ast::FlagsItemKind::Flag(ast::Flag::DotMatchesNewLine) => {
968                     flags.dot_matches_new_line = Some(enable);
969                 }
970                 ast::FlagsItemKind::Flag(ast::Flag::SwapGreed) => {
971                     flags.swap_greed = Some(enable);
972                 }
973                 ast::FlagsItemKind::Flag(ast::Flag::Unicode) => {
974                     flags.unicode = Some(enable);
975                 }
976                 ast::FlagsItemKind::Flag(ast::Flag::IgnoreWhitespace) => {}
977             }
978         }
979         flags
980     }
981 
merge(&mut self, previous: &Flags)982     fn merge(&mut self, previous: &Flags) {
983         if self.case_insensitive.is_none() {
984             self.case_insensitive = previous.case_insensitive;
985         }
986         if self.multi_line.is_none() {
987             self.multi_line = previous.multi_line;
988         }
989         if self.dot_matches_new_line.is_none() {
990             self.dot_matches_new_line = previous.dot_matches_new_line;
991         }
992         if self.swap_greed.is_none() {
993             self.swap_greed = previous.swap_greed;
994         }
995         if self.unicode.is_none() {
996             self.unicode = previous.unicode;
997         }
998     }
999 
case_insensitive(&self) -> bool1000     fn case_insensitive(&self) -> bool {
1001         self.case_insensitive.unwrap_or(false)
1002     }
1003 
multi_line(&self) -> bool1004     fn multi_line(&self) -> bool {
1005         self.multi_line.unwrap_or(false)
1006     }
1007 
dot_matches_new_line(&self) -> bool1008     fn dot_matches_new_line(&self) -> bool {
1009         self.dot_matches_new_line.unwrap_or(false)
1010     }
1011 
swap_greed(&self) -> bool1012     fn swap_greed(&self) -> bool {
1013         self.swap_greed.unwrap_or(false)
1014     }
1015 
unicode(&self) -> bool1016     fn unicode(&self) -> bool {
1017         self.unicode.unwrap_or(true)
1018     }
1019 }
1020 
hir_ascii_class_bytes(kind: &ast::ClassAsciiKind) -> hir::ClassBytes1021 fn hir_ascii_class_bytes(kind: &ast::ClassAsciiKind) -> hir::ClassBytes {
1022     let ranges: Vec<_> = ascii_class(kind).iter().cloned().map(|(s, e)| {
1023         hir::ClassBytesRange::new(s as u8, e as u8)
1024     }).collect();
1025     hir::ClassBytes::new(ranges)
1026 }
1027 
ascii_class(kind: &ast::ClassAsciiKind) -> &'static [(char, char)]1028 fn ascii_class(kind: &ast::ClassAsciiKind) -> &'static [(char, char)] {
1029     use ast::ClassAsciiKind::*;
1030 
1031     // The contortions below with `const` appear necessary for older versions
1032     // of Rust.
1033     type T = &'static [(char, char)];
1034     match *kind {
1035         Alnum => {
1036             const X: T = &[('0', '9'), ('A', 'Z'), ('a', 'z')];
1037             X
1038         }
1039         Alpha => {
1040             const X: T = &[('A', 'Z'), ('a', 'z')];
1041             X
1042         }
1043         Ascii => {
1044             const X: T = &[('\x00', '\x7F')];
1045             X
1046         }
1047         Blank => {
1048             const X: T = &[('\t', '\t'), (' ', ' ')];
1049             X
1050         }
1051         Cntrl => {
1052             const X: T = &[('\x00', '\x1F'), ('\x7F', '\x7F')];
1053             X
1054         }
1055         Digit => {
1056             const X: T = &[('0', '9')];
1057             X
1058         }
1059         Graph => {
1060             const X: T = &[('!', '~')];
1061             X
1062         }
1063         Lower => {
1064             const X: T = &[('a', 'z')];
1065             X
1066         }
1067         Print => {
1068             const X: T = &[(' ', '~')];
1069             X
1070         }
1071         Punct => {
1072             const X: T = &[('!', '/'), (':', '@'), ('[', '`'), ('{', '~')];
1073             X
1074         }
1075         Space => {
1076             const X: T = &[
1077                 ('\t', '\t'), ('\n', '\n'), ('\x0B', '\x0B'), ('\x0C', '\x0C'),
1078                 ('\r', '\r'), (' ', ' '),
1079             ];
1080             X
1081         }
1082         Upper => {
1083             const X: T = &[('A', 'Z')];
1084             X
1085         }
1086         Word => {
1087             const X: T = &[('0', '9'), ('A', 'Z'), ('_', '_'), ('a', 'z')];
1088             X
1089         }
1090         Xdigit => {
1091             const X: T = &[('0', '9'), ('A', 'F'), ('a', 'f')];
1092             X
1093         }
1094     }
1095 }
1096 
1097 #[cfg(test)]
1098 mod tests {
1099     use ast::{self, Ast, Position, Span};
1100     use ast::parse::ParserBuilder;
1101     use hir::{self, Hir, HirKind};
1102     use unicode::{self, ClassQuery};
1103 
1104     use super::{TranslatorBuilder, ascii_class};
1105 
    // An expected translation error, used to compare against the real
    // hir::Error values produced in the tests.
    //
    // Equality between TestError and hir::Error is defined (in both
    // directions) so that it disregards the pattern string in hir::Error,
    // which is annoying to provide in tests.
    #[derive(Clone, Debug)]
    struct TestError {
        // Span the error is expected to be reported at.
        span: Span,
        // Expected kind of the error.
        kind: hir::ErrorKind,
    }
1114 
    // Lets `assert_eq!(test_error, hir_error)` compile. Only the span and
    // the kind take part in the comparison.
    impl PartialEq<hir::Error> for TestError {
        fn eq(&self, other: &hir::Error) -> bool {
            self.span == other.span && self.kind == other.kind
        }
    }
1120 
    // Lets `assert_eq!(hir_error, test_error)` compile, which is the
    // argument order the tests use. Only the span and the kind take part
    // in the comparison.
    impl PartialEq<TestError> for hir::Error {
        fn eq(&self, other: &TestError) -> bool {
            self.span == other.span && self.kind == other.kind
        }
    }
1126 
    /// Parse `pattern` into an `Ast` using a parser built with the `octal`
    /// option turned on.
    ///
    /// Panics if the pattern fails to parse.
    fn parse(pattern: &str) -> Ast {
        ParserBuilder::new().octal(true).build().parse(pattern).unwrap()
    }
1130 
t(pattern: &str) -> Hir1131     fn t(pattern: &str) -> Hir {
1132         TranslatorBuilder::new()
1133             .allow_invalid_utf8(false)
1134             .build()
1135             .translate(pattern, &parse(pattern))
1136             .unwrap()
1137     }
1138 
t_err(pattern: &str) -> hir::Error1139     fn t_err(pattern: &str) -> hir::Error {
1140         TranslatorBuilder::new()
1141             .allow_invalid_utf8(false)
1142             .build()
1143             .translate(pattern, &parse(pattern))
1144             .unwrap_err()
1145     }
1146 
t_bytes(pattern: &str) -> Hir1147     fn t_bytes(pattern: &str) -> Hir {
1148         TranslatorBuilder::new()
1149             .allow_invalid_utf8(true)
1150             .build()
1151             .translate(pattern, &parse(pattern))
1152             .unwrap()
1153     }
1154 
hir_lit(s: &str) -> Hir1155     fn hir_lit(s: &str) -> Hir {
1156         match s.len() {
1157             0 => Hir::empty(),
1158             _ => {
1159                 let lits = s
1160                     .chars()
1161                     .map(hir::Literal::Unicode)
1162                     .map(Hir::literal)
1163                     .collect();
1164                 Hir::concat(lits)
1165             }
1166         }
1167     }
1168 
hir_blit(s: &[u8]) -> Hir1169     fn hir_blit(s: &[u8]) -> Hir {
1170         match s.len() {
1171             0 => Hir::empty(),
1172             1 => Hir::literal(hir::Literal::Byte(s[0])),
1173             _ => {
1174                 let lits = s
1175                     .iter()
1176                     .cloned()
1177                     .map(hir::Literal::Byte)
1178                     .map(Hir::literal)
1179                     .collect();
1180                 Hir::concat(lits)
1181             }
1182         }
1183     }
1184 
hir_group(i: u32, expr: Hir) -> Hir1185     fn hir_group(i: u32, expr: Hir)  -> Hir {
1186         Hir::group(hir::Group {
1187             kind: hir::GroupKind::CaptureIndex(i),
1188             hir: Box::new(expr),
1189         })
1190     }
1191 
hir_group_name(i: u32, name: &str, expr: Hir) -> Hir1192     fn hir_group_name(i: u32, name: &str, expr: Hir)  -> Hir {
1193         Hir::group(hir::Group {
1194             kind: hir::GroupKind::CaptureName {
1195                 name: name.to_string(),
1196                 index: i,
1197             },
1198             hir: Box::new(expr),
1199         })
1200     }
1201 
hir_group_nocap(expr: Hir) -> Hir1202     fn hir_group_nocap(expr: Hir)  -> Hir {
1203         Hir::group(hir::Group {
1204             kind: hir::GroupKind::NonCapturing,
1205             hir: Box::new(expr),
1206         })
1207     }
1208 
hir_quest(greedy: bool, expr: Hir) -> Hir1209     fn hir_quest(greedy: bool, expr: Hir) -> Hir {
1210         Hir::repetition(hir::Repetition {
1211             kind: hir::RepetitionKind::ZeroOrOne,
1212             greedy: greedy,
1213             hir: Box::new(expr),
1214         })
1215     }
1216 
hir_star(greedy: bool, expr: Hir) -> Hir1217     fn hir_star(greedy: bool, expr: Hir) -> Hir {
1218         Hir::repetition(hir::Repetition {
1219             kind: hir::RepetitionKind::ZeroOrMore,
1220             greedy: greedy,
1221             hir: Box::new(expr),
1222         })
1223     }
1224 
hir_plus(greedy: bool, expr: Hir) -> Hir1225     fn hir_plus(greedy: bool, expr: Hir) -> Hir {
1226         Hir::repetition(hir::Repetition {
1227             kind: hir::RepetitionKind::OneOrMore,
1228             greedy: greedy,
1229             hir: Box::new(expr),
1230         })
1231     }
1232 
hir_range(greedy: bool, range: hir::RepetitionRange, expr: Hir) -> Hir1233     fn hir_range(greedy: bool, range: hir::RepetitionRange, expr: Hir) -> Hir {
1234         Hir::repetition(hir::Repetition {
1235             kind: hir::RepetitionKind::Range(range),
1236             greedy: greedy,
1237             hir: Box::new(expr),
1238         })
1239     }
1240 
    /// Build the expected HIR for an alternation of `alts`; a shorthand
    /// for `Hir::alternation`.
    fn hir_alt(alts: Vec<Hir>) -> Hir {
        Hir::alternation(alts)
    }
1244 
    /// Build the expected HIR for a concatenation of `exprs`; a shorthand
    /// for `Hir::concat`.
    fn hir_cat(exprs: Vec<Hir>) -> Hir {
        Hir::concat(exprs)
    }
1248 
hir_uclass_query(query: ClassQuery) -> Hir1249     fn hir_uclass_query(query: ClassQuery) -> Hir {
1250         Hir::class(hir::Class::Unicode(unicode::class(query).unwrap()))
1251     }
1252 
hir_uclass_perl_word() -> Hir1253     fn hir_uclass_perl_word() -> Hir {
1254         use unicode_tables::perl_word::PERL_WORD;
1255         Hir::class(hir::Class::Unicode(unicode::hir_class(PERL_WORD)))
1256     }
1257 
hir_uclass(ranges: &[(char, char)]) -> Hir1258     fn hir_uclass(ranges: &[(char, char)]) -> Hir {
1259         let ranges: Vec<hir::ClassUnicodeRange> = ranges
1260             .iter()
1261             .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e))
1262             .collect();
1263         Hir::class(hir::Class::Unicode(hir::ClassUnicode::new(ranges)))
1264     }
1265 
hir_bclass(ranges: &[(u8, u8)]) -> Hir1266     fn hir_bclass(ranges: &[(u8, u8)]) -> Hir {
1267         let ranges: Vec<hir::ClassBytesRange> = ranges
1268             .iter()
1269             .map(|&(s, e)| hir::ClassBytesRange::new(s, e))
1270             .collect();
1271         Hir::class(hir::Class::Bytes(hir::ClassBytes::new(ranges)))
1272     }
1273 
hir_bclass_from_char(ranges: &[(char, char)]) -> Hir1274     fn hir_bclass_from_char(ranges: &[(char, char)]) -> Hir {
1275         let ranges: Vec<hir::ClassBytesRange> = ranges
1276             .iter()
1277             .map(|&(s, e)| {
1278                 assert!(s as u32 <= 0x7F);
1279                 assert!(e as u32 <= 0x7F);
1280                 hir::ClassBytesRange::new(s as u8, e as u8)
1281             })
1282             .collect();
1283         Hir::class(hir::Class::Bytes(hir::ClassBytes::new(ranges)))
1284     }
1285 
hir_case_fold(expr: Hir) -> Hir1286     fn hir_case_fold(expr: Hir) -> Hir {
1287         match expr.into_kind() {
1288             HirKind::Class(mut cls) => {
1289                 cls.case_fold_simple();
1290                 Hir::class(cls)
1291             }
1292             _ => panic!("cannot case fold non-class Hir expr"),
1293         }
1294     }
1295 
hir_negate(expr: Hir) -> Hir1296     fn hir_negate(expr: Hir) -> Hir {
1297         match expr.into_kind() {
1298             HirKind::Class(mut cls) => {
1299                 cls.negate();
1300                 Hir::class(cls)
1301             }
1302             _ => panic!("cannot negate non-class Hir expr"),
1303         }
1304     }
1305 
hir_union(expr1: Hir, expr2: Hir) -> Hir1306     fn hir_union(expr1: Hir, expr2: Hir) -> Hir {
1307         use hir::Class::{Bytes, Unicode};
1308 
1309         match (expr1.into_kind(), expr2.into_kind()) {
1310             (
1311                 HirKind::Class(Unicode(mut c1)),
1312                 HirKind::Class(Unicode(c2)),
1313             ) => {
1314                 c1.union(&c2);
1315                 Hir::class(hir::Class::Unicode(c1))
1316             }
1317             (
1318                 HirKind::Class(Bytes(mut c1)),
1319                 HirKind::Class(Bytes(c2)),
1320             ) => {
1321                 c1.union(&c2);
1322                 Hir::class(hir::Class::Bytes(c1))
1323             }
1324             _ => panic!("cannot union non-class Hir exprs"),
1325         }
1326     }
1327 
hir_difference(expr1: Hir, expr2: Hir) -> Hir1328     fn hir_difference(expr1: Hir, expr2: Hir) -> Hir {
1329         use hir::Class::{Bytes, Unicode};
1330 
1331         match (expr1.into_kind(), expr2.into_kind()) {
1332             (
1333                 HirKind::Class(Unicode(mut c1)),
1334                 HirKind::Class(Unicode(c2)),
1335             ) => {
1336                 c1.difference(&c2);
1337                 Hir::class(hir::Class::Unicode(c1))
1338             }
1339             (
1340                 HirKind::Class(Bytes(mut c1)),
1341                 HirKind::Class(Bytes(c2)),
1342             ) => {
1343                 c1.difference(&c2);
1344                 Hir::class(hir::Class::Bytes(c1))
1345             }
1346             _ => panic!("cannot difference non-class Hir exprs"),
1347         }
1348     }
1349 
    /// Build the expected HIR for `anchor`; a shorthand for `Hir::anchor`.
    fn hir_anchor(anchor: hir::Anchor) -> Hir {
        Hir::anchor(anchor)
    }
1353 
    /// Build the expected HIR for the word boundary `wb`; a shorthand for
    /// `Hir::word_boundary`.
    fn hir_word(wb: hir::WordBoundary) -> Hir {
        Hir::word_boundary(wb)
    }
1357 
    // Patterns whose translation contains empty sub-expressions.
    #[test]
    fn empty() {
        // The empty pattern and a pattern consisting only of a flag
        // directive both translate to the empty HIR.
        assert_eq!(t(""), Hir::empty());
        assert_eq!(t("(?i)"), Hir::empty());
        // Groups of every kind may wrap an empty expression.
        assert_eq!(t("()"), hir_group(1, Hir::empty()));
        assert_eq!(t("(?:)"), hir_group_nocap(Hir::empty()));
        assert_eq!(t("(?P<wat>)"), hir_group_name(1, "wat", Hir::empty()));
        // Empty alternation branches are kept, in their original position.
        assert_eq!(t("|"), hir_alt(vec![Hir::empty(), Hir::empty()]));
        assert_eq!(t("()|()"), hir_alt(vec![
            hir_group(1, Hir::empty()),
            hir_group(2, Hir::empty()),
        ]));
        assert_eq!(t("(|b)"), hir_group(1, hir_alt(vec![
            Hir::empty(),
            hir_lit("b"),
        ])));
        assert_eq!(t("(a|)"), hir_group(1, hir_alt(vec![
            hir_lit("a"),
            Hir::empty(),
        ])));
        assert_eq!(t("(a||c)"), hir_group(1, hir_alt(vec![
            hir_lit("a"),
            Hir::empty(),
            hir_lit("c"),
        ])));
        assert_eq!(t("(||)"), hir_group(1, hir_alt(vec![
            Hir::empty(),
            Hir::empty(),
            Hir::empty(),
        ])));
    }
1389 
    // Translation of literals, in both Unicode and byte oriented mode.
    #[test]
    fn literal() {
        // ASCII literals translate to Unicode literals even with the
        // Unicode flag disabled.
        assert_eq!(t("a"), hir_lit("a"));
        assert_eq!(t("(?-u)a"), hir_lit("a"));
        assert_eq!(t("☃"), hir_lit("☃"));
        assert_eq!(t("abcd"), hir_lit("abcd"));

        // "\x61" in a normal Rust string already is the character `a`; in
        // the raw string the escape is left for the regex parser. Both
        // produce the same literal. `\xFF` becomes a byte literal.
        assert_eq!(t_bytes("(?-u)a"), hir_lit("a"));
        assert_eq!(t_bytes("(?-u)\x61"), hir_lit("a"));
        assert_eq!(t_bytes(r"(?-u)\x61"), hir_lit("a"));
        assert_eq!(t_bytes(r"(?-u)\xFF"), hir_blit(b"\xFF"));

        // Without the Unicode flag a non-ASCII literal is rejected, and
        // without permission for invalid UTF-8 so is the byte `\xFF`.
        assert_eq!(t_err("(?-u)☃"), TestError {
            kind: hir::ErrorKind::UnicodeNotAllowed,
            span: Span::new(Position::new(5, 1, 6), Position::new(8, 1, 7)),
        });
        assert_eq!(t_err(r"(?-u)\xFF"), TestError {
            kind: hir::ErrorKind::InvalidUtf8,
            span: Span::new(Position::new(5, 1, 6), Position::new(9, 1, 10)),
        });
    }
1411 
    // Translation of literals while the case-insensitive flag is set:
    // cased letters turn into classes, everything else stays a literal.
    #[test]
    fn literal_case_insensitive() {
        // Unicode mode: literals become Unicode classes.
        assert_eq!(t("(?i)a"), hir_uclass(&[
            ('A', 'A'), ('a', 'a'),
        ]));
        assert_eq!(t("(?i:a)"), hir_group_nocap(hir_uclass(&[
            ('A', 'A'), ('a', 'a')],
        )));
        // The flag only affects what follows it and can be switched off
        // again.
        assert_eq!(t("a(?i)a(?-i)a"), hir_cat(vec![
            hir_lit("a"),
            hir_uclass(&[('A', 'A'), ('a', 'a')]),
            hir_lit("a"),
        ]));
        // `@` has no case variants and therefore stays a literal.
        assert_eq!(t("(?i)ab@c"), hir_cat(vec![
            hir_uclass(&[('A', 'A'), ('a', 'a')]),
            hir_uclass(&[('B', 'B'), ('b', 'b')]),
            hir_lit("@"),
            hir_uclass(&[('C', 'C'), ('c', 'c')]),
        ]));
        // Folding is not limited to ASCII.
        assert_eq!(t("(?i)β"), hir_uclass(&[
            ('Β', 'Β'), ('β', 'β'), ('ϐ', 'ϐ'),
        ]));

        // Unicode flag disabled: literals become byte classes.
        assert_eq!(t("(?i-u)a"), hir_bclass(&[
            (b'A', b'A'), (b'a', b'a'),
        ]));
        assert_eq!(t("(?-u)a(?i)a(?-i)a"), hir_cat(vec![
            hir_lit("a"),
            hir_bclass(&[(b'A', b'A'), (b'a', b'a')]),
            hir_lit("a"),
        ]));
        assert_eq!(t("(?i-u)ab@c"), hir_cat(vec![
            hir_bclass(&[(b'A', b'A'), (b'a', b'a')]),
            hir_bclass(&[(b'B', b'B'), (b'b', b'b')]),
            hir_lit("@"),
            hir_bclass(&[(b'C', b'C'), (b'c', b'c')]),
        ]));

        // The same holds when invalid UTF-8 is allowed; `\xFF` stays a
        // plain byte literal.
        assert_eq!(t_bytes("(?i-u)a"), hir_bclass(&[
            (b'A', b'A'), (b'a', b'a'),
        ]));
        assert_eq!(t_bytes("(?i-u)\x61"), hir_bclass(&[
            (b'A', b'A'), (b'a', b'a'),
        ]));
        assert_eq!(t_bytes(r"(?i-u)\x61"), hir_bclass(&[
            (b'A', b'A'), (b'a', b'a'),
        ]));
        assert_eq!(t_bytes(r"(?i-u)\xFF"), hir_blit(b"\xFF"));

        // A non-ASCII literal is rejected once the Unicode flag is off.
        assert_eq!(t_err("(?i-u)β"), TestError {
            kind: hir::ErrorKind::UnicodeNotAllowed,
            span: Span::new(
                Position::new(6, 1, 7),
                Position::new(8, 1, 8),
            ),
        });
    }
1469 
    // Translation of `.` under the `s` and `u` flags.
    #[test]
    fn dot() {
        // By default `.` excludes only `\n` (U+000A); with `s` it covers
        // every Unicode scalar value.
        assert_eq!(t("."), hir_uclass(&[
            ('\0', '\t'),
            ('\x0B', '\u{10FFFF}'),
        ]));
        assert_eq!(t("(?s)."), hir_uclass(&[
            ('\0', '\u{10FFFF}'),
        ]));
        // With the Unicode flag off the same shapes are produced over
        // bytes.
        assert_eq!(t_bytes("(?-u)."), hir_bclass(&[
            (b'\0', b'\t'),
            (b'\x0B', b'\xFF'),
        ]));
        assert_eq!(t_bytes("(?s-u)."), hir_bclass(&[
            (b'\0', b'\xFF'),
        ]));

        // If invalid UTF-8 isn't allowed, then non-Unicode `.` isn't allowed.
        assert_eq!(t_err("(?-u)."), TestError {
            kind: hir::ErrorKind::InvalidUtf8,
            span: Span::new(Position::new(5, 1, 6), Position::new(6, 1, 7)),
        });
        assert_eq!(t_err("(?s-u)."), TestError {
            kind: hir::ErrorKind::InvalidUtf8,
            span: Span::new(Position::new(6, 1, 7), Position::new(7, 1, 8)),
        });
    }
1497 
    // Translation of anchors and word boundaries.
    #[test]
    fn assertions() {
        // Without the `m` flag, `^`/`$` are text anchors just like
        // `\A`/`\z`.
        assert_eq!(t("^"), hir_anchor(hir::Anchor::StartText));
        assert_eq!(t("$"), hir_anchor(hir::Anchor::EndText));
        assert_eq!(t(r"\A"), hir_anchor(hir::Anchor::StartText));
        assert_eq!(t(r"\z"), hir_anchor(hir::Anchor::EndText));
        // With `m`, only `^`/`$` turn into line anchors.
        assert_eq!(t("(?m)^"), hir_anchor(hir::Anchor::StartLine));
        assert_eq!(t("(?m)$"), hir_anchor(hir::Anchor::EndLine));
        assert_eq!(t(r"(?m)\A"), hir_anchor(hir::Anchor::StartText));
        assert_eq!(t(r"(?m)\z"), hir_anchor(hir::Anchor::EndText));

        // Word boundaries follow the Unicode flag.
        assert_eq!(t(r"\b"), hir_word(hir::WordBoundary::Unicode));
        assert_eq!(t(r"\B"), hir_word(hir::WordBoundary::UnicodeNegate));
        assert_eq!(t(r"(?-u)\b"), hir_word(hir::WordBoundary::Ascii));
        assert_eq!(
            t_bytes(r"(?-u)\B"),
            hir_word(hir::WordBoundary::AsciiNegate));

        // A negated ASCII word boundary requires that invalid UTF-8 is
        // allowed.
        assert_eq!(t_err(r"(?-u)\B"), TestError {
            kind: hir::ErrorKind::InvalidUtf8,
            span: Span::new(Position::new(5, 1, 6), Position::new(7, 1, 8)),
        });
    }
1521 
    // Translation of capturing, named and non-capturing groups.
    #[test]
    fn group() {
        // Indexed capturing groups are numbered from 1 in order of
        // appearance.
        assert_eq!(t("(a)"), hir_group(1, hir_lit("a")));
        assert_eq!(t("(a)(b)"), hir_cat(vec![
            hir_group(1, hir_lit("a")),
            hir_group(2, hir_lit("b")),
        ]));
        assert_eq!(t("(a)|(b)"), hir_alt(vec![
            hir_group(1, hir_lit("a")),
            hir_group(2, hir_lit("b")),
        ]));
        // Named groups carry both their name and their index.
        assert_eq!(t("(?P<foo>)"), hir_group_name(1, "foo", Hir::empty()));
        assert_eq!(t("(?P<foo>a)"), hir_group_name(1, "foo", hir_lit("a")));
        assert_eq!(t("(?P<foo>a)(?P<bar>b)"), hir_cat(vec![
            hir_group_name(1, "foo", hir_lit("a")),
            hir_group_name(2, "bar", hir_lit("b")),
        ]));
        // Non-capturing groups do not consume a capture index, whereas
        // named groups share the numbering with indexed ones.
        assert_eq!(t("(?:)"), hir_group_nocap(Hir::empty()));
        assert_eq!(t("(?:a)"), hir_group_nocap(hir_lit("a")));
        assert_eq!(t("(?:a)(b)"), hir_cat(vec![
            hir_group_nocap(hir_lit("a")),
            hir_group(1, hir_lit("b")),
        ]));
        assert_eq!(t("(a)(?:b)(c)"), hir_cat(vec![
            hir_group(1, hir_lit("a")),
            hir_group_nocap(hir_lit("b")),
            hir_group(2, hir_lit("c")),
        ]));
        assert_eq!(t("(a)(?P<foo>b)(c)"), hir_cat(vec![
            hir_group(1, hir_lit("a")),
            hir_group_name(2, "foo", hir_lit("b")),
            hir_group(3, hir_lit("c")),
        ]));
        // A flag directive inside a group leaves the group's contents
        // empty.
        assert_eq!(t("()"), hir_group(1, Hir::empty()));
        assert_eq!(t("((?i))"), hir_group(1, Hir::empty()));
        assert_eq!(t("((?x))"), hir_group(1, Hir::empty()));
        assert_eq!(t("(((?x)))"), hir_group(1, hir_group(2, Hir::empty())));
    }
1560 
1561     #[test]
flags()1562     fn flags() {
1563         assert_eq!(t("(?i:a)a"), hir_cat(vec![
1564             hir_group_nocap(hir_uclass(&[('A', 'A'), ('a', 'a')])),
1565             hir_lit("a"),
1566         ]));
1567         assert_eq!(t("(?i-u:a)β"), hir_cat(vec![
1568             hir_group_nocap(hir_bclass(&[(b'A', b'A'), (b'a', b'a')])),
1569             hir_lit("β"),
1570         ]));
1571         assert_eq!(t("(?i)(?-i:a)a"), hir_cat(vec![
1572             hir_group_nocap(hir_lit("a")),
1573             hir_uclass(&[('A', 'A'), ('a', 'a')]),
1574         ]));
1575         assert_eq!(t("(?im)a^"), hir_cat(vec![
1576             hir_uclass(&[('A', 'A'), ('a', 'a')]),
1577             hir_anchor(hir::Anchor::StartLine),
1578         ]));
1579         assert_eq!(t("(?im)a^(?i-m)a^"), hir_cat(vec![
1580             hir_uclass(&[('A', 'A'), ('a', 'a')]),
1581             hir_anchor(hir::Anchor::StartLine),
1582             hir_uclass(&[('A', 'A'), ('a', 'a')]),
1583             hir_anchor(hir::Anchor::StartText),
1584         ]));
1585         assert_eq!(t("(?U)a*a*?(?-U)a*a*?"), hir_cat(vec![
1586             hir_star(false, hir_lit("a")),
1587             hir_star(true, hir_lit("a")),
1588             hir_star(true, hir_lit("a")),
1589             hir_star(false, hir_lit("a")),
1590         ]));
1591         assert_eq!(t("(?:a(?i)a)a"), hir_cat(vec![
1592             hir_group_nocap(hir_cat(vec![
1593                 hir_lit("a"),
1594                 hir_uclass(&[('A', 'A'), ('a', 'a')]),
1595             ])),
1596             hir_lit("a"),
1597         ]));
1598         assert_eq!(t("(?i)(?:a(?-i)a)a"), hir_cat(vec![
1599             hir_group_nocap(hir_cat(vec![
1600                 hir_uclass(&[('A', 'A'), ('a', 'a')]),
1601                 hir_lit("a"),
1602             ])),
1603             hir_uclass(&[('A', 'A'), ('a', 'a')]),
1604         ]));
1605     }
1606 
1607     #[test]
escape()1608     fn escape() {
1609         assert_eq!(
1610             t(r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#"),
1611             hir_lit(r"\.+*?()|[]{}^$#"));
1612     }
1613 
1614     #[test]
repetition()1615     fn repetition() {
1616         assert_eq!(t("a?"), hir_quest(true, hir_lit("a")));
1617         assert_eq!(t("a*"), hir_star(true, hir_lit("a")));
1618         assert_eq!(t("a+"), hir_plus(true, hir_lit("a")));
1619         assert_eq!(t("a??"), hir_quest(false, hir_lit("a")));
1620         assert_eq!(t("a*?"), hir_star(false, hir_lit("a")));
1621         assert_eq!(t("a+?"), hir_plus(false, hir_lit("a")));
1622 
1623         assert_eq!(
1624             t("a{1}"),
1625             hir_range(
1626                 true,
1627                 hir::RepetitionRange::Exactly(1),
1628                 hir_lit("a"),
1629             ));
1630         assert_eq!(
1631             t("a{1,}"),
1632             hir_range(
1633                 true,
1634                 hir::RepetitionRange::AtLeast(1),
1635                 hir_lit("a"),
1636             ));
1637         assert_eq!(
1638             t("a{1,2}"),
1639             hir_range(
1640                 true,
1641                 hir::RepetitionRange::Bounded(1, 2),
1642                 hir_lit("a"),
1643             ));
1644         assert_eq!(
1645             t("a{1}?"),
1646             hir_range(
1647                 false,
1648                 hir::RepetitionRange::Exactly(1),
1649                 hir_lit("a"),
1650             ));
1651         assert_eq!(
1652             t("a{1,}?"),
1653             hir_range(
1654                 false,
1655                 hir::RepetitionRange::AtLeast(1),
1656                 hir_lit("a"),
1657             ));
1658         assert_eq!(
1659             t("a{1,2}?"),
1660             hir_range(
1661                 false,
1662                 hir::RepetitionRange::Bounded(1, 2),
1663                 hir_lit("a"),
1664             ));
1665 
1666         assert_eq!(t("ab?"), hir_cat(vec![
1667             hir_lit("a"),
1668             hir_quest(true, hir_lit("b")),
1669         ]));
1670         assert_eq!(t("(ab)?"), hir_quest(true, hir_group(1, hir_cat(vec![
1671             hir_lit("a"),
1672             hir_lit("b"),
1673         ]))));
1674         assert_eq!(t("a|b?"), hir_alt(vec![
1675             hir_lit("a"),
1676             hir_quest(true, hir_lit("b")),
1677         ]));
1678     }
1679 
1680     #[test]
cat_alt()1681     fn cat_alt() {
1682         assert_eq!(t("(ab)"), hir_group(1, hir_cat(vec![
1683             hir_lit("a"),
1684             hir_lit("b"),
1685         ])));
1686         assert_eq!(t("a|b"), hir_alt(vec![
1687             hir_lit("a"),
1688             hir_lit("b"),
1689         ]));
1690         assert_eq!(t("a|b|c"), hir_alt(vec![
1691             hir_lit("a"),
1692             hir_lit("b"),
1693             hir_lit("c"),
1694         ]));
1695         assert_eq!(t("ab|bc|cd"), hir_alt(vec![
1696             hir_lit("ab"),
1697             hir_lit("bc"),
1698             hir_lit("cd"),
1699         ]));
1700         assert_eq!(t("(a|b)"), hir_group(1, hir_alt(vec![
1701             hir_lit("a"),
1702             hir_lit("b"),
1703         ])));
1704         assert_eq!(t("(a|b|c)"), hir_group(1, hir_alt(vec![
1705             hir_lit("a"),
1706             hir_lit("b"),
1707             hir_lit("c"),
1708         ])));
1709         assert_eq!(t("(ab|bc|cd)"), hir_group(1, hir_alt(vec![
1710             hir_lit("ab"),
1711             hir_lit("bc"),
1712             hir_lit("cd"),
1713         ])));
1714         assert_eq!(t("(ab|(bc|(cd)))"), hir_group(1, hir_alt(vec![
1715             hir_lit("ab"),
1716             hir_group(2, hir_alt(vec![
1717                 hir_lit("bc"),
1718                 hir_group(3, hir_lit("cd")),
1719             ])),
1720         ])));
1721     }
1722 
1723     #[test]
class_ascii()1724     fn class_ascii() {
1725         assert_eq!(
1726             t("[[:alnum:]]"),
1727             hir_uclass(ascii_class(&ast::ClassAsciiKind::Alnum)));
1728         assert_eq!(
1729             t("[[:alpha:]]"),
1730             hir_uclass(ascii_class(&ast::ClassAsciiKind::Alpha)));
1731         assert_eq!(
1732             t("[[:ascii:]]"),
1733             hir_uclass(ascii_class(&ast::ClassAsciiKind::Ascii)));
1734         assert_eq!(
1735             t("[[:blank:]]"),
1736             hir_uclass(ascii_class(&ast::ClassAsciiKind::Blank)));
1737         assert_eq!(
1738             t("[[:cntrl:]]"),
1739             hir_uclass(ascii_class(&ast::ClassAsciiKind::Cntrl)));
1740         assert_eq!(
1741             t("[[:digit:]]"),
1742             hir_uclass(ascii_class(&ast::ClassAsciiKind::Digit)));
1743         assert_eq!(
1744             t("[[:graph:]]"),
1745             hir_uclass(ascii_class(&ast::ClassAsciiKind::Graph)));
1746         assert_eq!(
1747             t("[[:lower:]]"),
1748             hir_uclass(ascii_class(&ast::ClassAsciiKind::Lower)));
1749         assert_eq!(
1750             t("[[:print:]]"),
1751             hir_uclass(ascii_class(&ast::ClassAsciiKind::Print)));
1752         assert_eq!(
1753             t("[[:punct:]]"),
1754             hir_uclass(ascii_class(&ast::ClassAsciiKind::Punct)));
1755         assert_eq!(
1756             t("[[:space:]]"),
1757             hir_uclass(ascii_class(&ast::ClassAsciiKind::Space)));
1758         assert_eq!(
1759             t("[[:upper:]]"),
1760             hir_uclass(ascii_class(&ast::ClassAsciiKind::Upper)));
1761         assert_eq!(
1762             t("[[:word:]]"),
1763             hir_uclass(ascii_class(&ast::ClassAsciiKind::Word)));
1764         assert_eq!(
1765             t("[[:xdigit:]]"),
1766             hir_uclass(ascii_class(&ast::ClassAsciiKind::Xdigit)));
1767 
1768         assert_eq!(
1769             t("[[:^lower:]]"),
1770             hir_negate(hir_uclass(ascii_class(&ast::ClassAsciiKind::Lower))));
1771         assert_eq!(
1772             t("(?i)[[:lower:]]"),
1773             hir_uclass(&[
1774                 ('A', 'Z'), ('a', 'z'),
1775                 ('\u{17F}', '\u{17F}'),
1776                 ('\u{212A}', '\u{212A}'),
1777             ]));
1778 
1779         assert_eq!(
1780             t("(?-u)[[:lower:]]"),
1781             hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Lower)));
1782         assert_eq!(
1783             t("(?i-u)[[:lower:]]"),
1784             hir_case_fold(hir_bclass_from_char(ascii_class(
1785                 &ast::ClassAsciiKind::Lower))));
1786 
1787         assert_eq!(t_err("(?-u)[[:^lower:]]"), TestError {
1788             kind: hir::ErrorKind::InvalidUtf8,
1789             span: Span::new(Position::new(6, 1, 7), Position::new(16, 1, 17)),
1790         });
1791         assert_eq!(t_err("(?i-u)[[:^lower:]]"), TestError {
1792             kind: hir::ErrorKind::InvalidUtf8,
1793             span: Span::new(Position::new(7, 1, 8), Position::new(17, 1, 18)),
1794         });
1795     }
1796 
1797     #[test]
class_perl()1798     fn class_perl() {
1799         // Unicode
1800         assert_eq!(
1801             t(r"\d"),
1802             hir_uclass_query(ClassQuery::Binary("digit")));
1803         assert_eq!(
1804             t(r"\s"),
1805             hir_uclass_query(ClassQuery::Binary("space")));
1806         assert_eq!(
1807             t(r"\w"),
1808             hir_uclass_perl_word());
1809         assert_eq!(
1810             t(r"(?i)\d"),
1811             hir_uclass_query(ClassQuery::Binary("digit")));
1812         assert_eq!(
1813             t(r"(?i)\s"),
1814             hir_uclass_query(ClassQuery::Binary("space")));
1815         assert_eq!(
1816             t(r"(?i)\w"),
1817             hir_uclass_perl_word());
1818 
1819         // Unicode, negated
1820         assert_eq!(
1821             t(r"\D"),
1822             hir_negate(hir_uclass_query(ClassQuery::Binary("digit"))));
1823         assert_eq!(
1824             t(r"\S"),
1825             hir_negate(hir_uclass_query(ClassQuery::Binary("space"))));
1826         assert_eq!(
1827             t(r"\W"),
1828             hir_negate(hir_uclass_perl_word()));
1829         assert_eq!(
1830             t(r"(?i)\D"),
1831             hir_negate(hir_uclass_query(ClassQuery::Binary("digit"))));
1832         assert_eq!(
1833             t(r"(?i)\S"),
1834             hir_negate(hir_uclass_query(ClassQuery::Binary("space"))));
1835         assert_eq!(
1836             t(r"(?i)\W"),
1837             hir_negate(hir_uclass_perl_word()));
1838 
1839         // ASCII only
1840         assert_eq!(
1841             t(r"(?-u)\d"),
1842             hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Digit)));
1843         assert_eq!(
1844             t(r"(?-u)\s"),
1845             hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Space)));
1846         assert_eq!(
1847             t(r"(?-u)\w"),
1848             hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Word)));
1849         assert_eq!(
1850             t(r"(?i-u)\d"),
1851             hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Digit)));
1852         assert_eq!(
1853             t(r"(?i-u)\s"),
1854             hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Space)));
1855         assert_eq!(
1856             t(r"(?i-u)\w"),
1857             hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Word)));
1858 
1859         // ASCII only, negated
1860         assert_eq!(
1861             t(r"(?-u)\D"),
1862             hir_negate(hir_bclass_from_char(ascii_class(
1863                 &ast::ClassAsciiKind::Digit))));
1864         assert_eq!(
1865             t(r"(?-u)\S"),
1866             hir_negate(hir_bclass_from_char(ascii_class(
1867                 &ast::ClassAsciiKind::Space))));
1868         assert_eq!(
1869             t(r"(?-u)\W"),
1870             hir_negate(hir_bclass_from_char(ascii_class(
1871                 &ast::ClassAsciiKind::Word))));
1872         assert_eq!(
1873             t(r"(?i-u)\D"),
1874             hir_negate(hir_bclass_from_char(ascii_class(
1875                 &ast::ClassAsciiKind::Digit))));
1876         assert_eq!(
1877             t(r"(?i-u)\S"),
1878             hir_negate(hir_bclass_from_char(ascii_class(
1879                 &ast::ClassAsciiKind::Space))));
1880         assert_eq!(
1881             t(r"(?i-u)\W"),
1882             hir_negate(hir_bclass_from_char(ascii_class(
1883                 &ast::ClassAsciiKind::Word))));
1884     }
1885 
1886     #[test]
class_unicode()1887     fn class_unicode() {
1888         assert_eq!(
1889             t(r"\pZ"),
1890             hir_uclass_query(ClassQuery::Binary("Z")));
1891         assert_eq!(
1892             t(r"\pz"),
1893             hir_uclass_query(ClassQuery::Binary("Z")));
1894         assert_eq!(
1895             t(r"\p{Separator}"),
1896             hir_uclass_query(ClassQuery::Binary("Z")));
1897         assert_eq!(
1898             t(r"\p{se      PaRa ToR}"),
1899             hir_uclass_query(ClassQuery::Binary("Z")));
1900         assert_eq!(
1901             t(r"\p{gc:Separator}"),
1902             hir_uclass_query(ClassQuery::Binary("Z")));
1903         assert_eq!(
1904             t(r"\p{gc=Separator}"),
1905             hir_uclass_query(ClassQuery::Binary("Z")));
1906         assert_eq!(
1907             t(r"\p{Other}"),
1908             hir_uclass_query(ClassQuery::Binary("Other")));
1909         assert_eq!(
1910             t(r"\pC"),
1911             hir_uclass_query(ClassQuery::Binary("Other")));
1912 
1913         assert_eq!(
1914             t(r"\PZ"),
1915             hir_negate(hir_uclass_query(ClassQuery::Binary("Z"))));
1916         assert_eq!(
1917             t(r"\P{separator}"),
1918             hir_negate(hir_uclass_query(ClassQuery::Binary("Z"))));
1919         assert_eq!(
1920             t(r"\P{gc!=separator}"),
1921             hir_negate(hir_uclass_query(ClassQuery::Binary("Z"))));
1922 
1923         assert_eq!(
1924             t(r"\p{Greek}"),
1925             hir_uclass_query(ClassQuery::Binary("Greek")));
1926         assert_eq!(
1927             t(r"(?i)\p{Greek}"),
1928             hir_case_fold(hir_uclass_query(ClassQuery::Binary("Greek"))));
1929         assert_eq!(
1930             t(r"(?i)\P{Greek}"),
1931             hir_negate(hir_case_fold(hir_uclass_query(
1932                 ClassQuery::Binary("Greek")))));
1933 
1934         assert_eq!(
1935             t(r"\p{any}"),
1936             hir_uclass_query(ClassQuery::Binary("Any")));
1937         assert_eq!(
1938             t(r"\p{assigned}"),
1939             hir_uclass_query(ClassQuery::Binary("Assigned")));
1940         assert_eq!(
1941             t(r"\p{ascii}"),
1942             hir_uclass_query(ClassQuery::Binary("ASCII")));
1943         assert_eq!(
1944             t(r"\p{gc:any}"),
1945             hir_uclass_query(ClassQuery::Binary("Any")));
1946         assert_eq!(
1947             t(r"\p{gc:assigned}"),
1948             hir_uclass_query(ClassQuery::Binary("Assigned")));
1949         assert_eq!(
1950             t(r"\p{gc:ascii}"),
1951             hir_uclass_query(ClassQuery::Binary("ASCII")));
1952 
1953         assert_eq!(t_err(r"(?-u)\pZ"), TestError {
1954             kind: hir::ErrorKind::UnicodeNotAllowed,
1955             span: Span::new(Position::new(5, 1, 6), Position::new(8, 1, 9)),
1956         });
1957         assert_eq!(t_err(r"(?-u)\p{Separator}"), TestError {
1958             kind: hir::ErrorKind::UnicodeNotAllowed,
1959             span: Span::new(Position::new(5, 1, 6), Position::new(18, 1, 19)),
1960         });
1961         assert_eq!(t_err(r"\pE"), TestError {
1962             kind: hir::ErrorKind::UnicodePropertyNotFound,
1963             span: Span::new(Position::new(0, 1, 1), Position::new(3, 1, 4)),
1964         });
1965         assert_eq!(t_err(r"\p{Foo}"), TestError {
1966             kind: hir::ErrorKind::UnicodePropertyNotFound,
1967             span: Span::new(Position::new(0, 1, 1), Position::new(7, 1, 8)),
1968         });
1969         assert_eq!(t_err(r"\p{gc:Foo}"), TestError {
1970             kind: hir::ErrorKind::UnicodePropertyValueNotFound,
1971             span: Span::new(Position::new(0, 1, 1), Position::new(10, 1, 11)),
1972         });
1973         assert_eq!(t_err(r"\p{sc:Foo}"), TestError {
1974             kind: hir::ErrorKind::UnicodePropertyValueNotFound,
1975             span: Span::new(Position::new(0, 1, 1), Position::new(10, 1, 11)),
1976         });
1977         assert_eq!(t_err(r"\p{scx:Foo}"), TestError {
1978             kind: hir::ErrorKind::UnicodePropertyValueNotFound,
1979             span: Span::new(Position::new(0, 1, 1), Position::new(11, 1, 12)),
1980         });
1981         assert_eq!(t_err(r"\p{age:Foo}"), TestError {
1982             kind: hir::ErrorKind::UnicodePropertyValueNotFound,
1983             span: Span::new(Position::new(0, 1, 1), Position::new(11, 1, 12)),
1984         });
1985     }
1986 
1987     #[test]
class_bracketed()1988     fn class_bracketed() {
1989         assert_eq!(t("[a]"), hir_uclass(&[('a', 'a')]));
1990         assert_eq!(t("[^[a]]"), hir_negate(hir_uclass(&[('a', 'a')])));
1991         assert_eq!(t("[a-z]"), hir_uclass(&[('a', 'z')]));
1992         assert_eq!(t("[a-fd-h]"), hir_uclass(&[('a', 'h')]));
1993         assert_eq!(t("[a-fg-m]"), hir_uclass(&[('a', 'm')]));
1994         assert_eq!(t(r"[\x00]"), hir_uclass(&[('\0', '\0')]));
1995         assert_eq!(t(r"[\n]"), hir_uclass(&[('\n', '\n')]));
1996         assert_eq!(t("[\n]"), hir_uclass(&[('\n', '\n')]));
1997         assert_eq!(
1998             t(r"[\d]"),
1999             hir_uclass_query(ClassQuery::Binary("digit")));
2000         assert_eq!(
2001             t(r"[\pZ]"),
2002             hir_uclass_query(ClassQuery::Binary("separator")));
2003         assert_eq!(
2004             t(r"[\p{separator}]"),
2005             hir_uclass_query(ClassQuery::Binary("separator")));
2006         assert_eq!(
2007             t(r"[^\D]"),
2008             hir_uclass_query(ClassQuery::Binary("digit")));
2009         assert_eq!(
2010             t(r"[^\PZ]"),
2011             hir_uclass_query(ClassQuery::Binary("separator")));
2012         assert_eq!(
2013             t(r"[^\P{separator}]"),
2014             hir_uclass_query(ClassQuery::Binary("separator")));
2015         assert_eq!(
2016             t(r"(?i)[^\D]"),
2017             hir_uclass_query(ClassQuery::Binary("digit")));
2018         assert_eq!(
2019             t(r"(?i)[^\P{greek}]"),
2020             hir_case_fold(hir_uclass_query(ClassQuery::Binary("greek"))));
2021 
2022         assert_eq!(t("(?-u)[a]"), hir_bclass(&[(b'a', b'a')]));
2023         assert_eq!(t(r"(?-u)[\x00]"), hir_bclass(&[(b'\0', b'\0')]));
2024         assert_eq!(t_bytes(r"(?-u)[\xFF]"), hir_bclass(&[(b'\xFF', b'\xFF')]));
2025 
2026         assert_eq!(t("(?i)[a]"), hir_uclass(&[('A', 'A'), ('a', 'a')]));
2027         assert_eq!(t("(?i)[k]"), hir_uclass(&[
2028             ('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}'),
2029         ]));
2030         assert_eq!(t("(?i)[β]"), hir_uclass(&[
2031             ('Β', 'Β'), ('β', 'β'), ('ϐ', 'ϐ'),
2032         ]));
2033         assert_eq!(t("(?i-u)[k]"), hir_bclass(&[
2034             (b'K', b'K'), (b'k', b'k'),
2035         ]));
2036 
2037         assert_eq!(t("[^a]"), hir_negate(hir_uclass(&[('a', 'a')])));
2038         assert_eq!(t(r"[^\x00]"), hir_negate(hir_uclass(&[('\0', '\0')])));
2039         assert_eq!(
2040             t_bytes("(?-u)[^a]"),
2041             hir_negate(hir_bclass(&[(b'a', b'a')])));
2042         assert_eq!(
2043             t(r"[^\d]"),
2044             hir_negate(hir_uclass_query(ClassQuery::Binary("digit"))));
2045         assert_eq!(
2046             t(r"[^\pZ]"),
2047             hir_negate(hir_uclass_query(ClassQuery::Binary("separator"))));
2048         assert_eq!(
2049             t(r"[^\p{separator}]"),
2050             hir_negate(hir_uclass_query(ClassQuery::Binary("separator"))));
2051         assert_eq!(
2052             t(r"(?i)[^\p{greek}]"),
2053             hir_negate(hir_case_fold(hir_uclass_query(
2054                 ClassQuery::Binary("greek")))));
2055         assert_eq!(
2056             t(r"(?i)[\P{greek}]"),
2057             hir_negate(hir_case_fold(hir_uclass_query(
2058                 ClassQuery::Binary("greek")))));
2059 
2060         // Test some weird cases.
2061         assert_eq!(t(r"[\[]"), hir_uclass(&[('[', '[')]));
2062 
2063         assert_eq!(t(r"[&]"), hir_uclass(&[('&', '&')]));
2064         assert_eq!(t(r"[\&]"), hir_uclass(&[('&', '&')]));
2065         assert_eq!(t(r"[\&\&]"), hir_uclass(&[('&', '&')]));
2066         assert_eq!(t(r"[\x00-&]"), hir_uclass(&[('\0', '&')]));
2067         assert_eq!(t(r"[&-\xFF]"), hir_uclass(&[('&', '\u{FF}')]));
2068 
2069         assert_eq!(t(r"[~]"), hir_uclass(&[('~', '~')]));
2070         assert_eq!(t(r"[\~]"), hir_uclass(&[('~', '~')]));
2071         assert_eq!(t(r"[\~\~]"), hir_uclass(&[('~', '~')]));
2072         assert_eq!(t(r"[\x00-~]"), hir_uclass(&[('\0', '~')]));
2073         assert_eq!(t(r"[~-\xFF]"), hir_uclass(&[('~', '\u{FF}')]));
2074 
2075         assert_eq!(t(r"[-]"), hir_uclass(&[('-', '-')]));
2076         assert_eq!(t(r"[\-]"), hir_uclass(&[('-', '-')]));
2077         assert_eq!(t(r"[\-\-]"), hir_uclass(&[('-', '-')]));
2078         assert_eq!(t(r"[\x00-\-]"), hir_uclass(&[('\0', '-')]));
2079         assert_eq!(t(r"[\--\xFF]"), hir_uclass(&[('-', '\u{FF}')]));
2080 
2081         assert_eq!(t_err("(?-u)[^a]"), TestError {
2082             kind: hir::ErrorKind::InvalidUtf8,
2083             span: Span::new(Position::new(5, 1, 6), Position::new(9, 1, 10)),
2084         });
2085         assert_eq!(t_err(r"[^\s\S]"), TestError {
2086             kind: hir::ErrorKind::EmptyClassNotAllowed,
2087             span: Span::new(Position::new(0, 1, 1), Position::new(7, 1, 8)),
2088         });
2089         assert_eq!(t_err(r"(?-u)[^\s\S]"), TestError {
2090             kind: hir::ErrorKind::EmptyClassNotAllowed,
2091             span: Span::new(Position::new(5, 1, 6), Position::new(12, 1, 13)),
2092         });
2093     }
2094 
2095     #[test]
class_bracketed_union()2096     fn class_bracketed_union() {
2097         assert_eq!(
2098             t("[a-zA-Z]"),
2099             hir_uclass(&[('A', 'Z'), ('a', 'z')]));
2100         assert_eq!(
2101             t(r"[a\pZb]"),
2102             hir_union(
2103                 hir_uclass(&[('a', 'b')]),
2104                 hir_uclass_query(ClassQuery::Binary("separator"))));
2105         assert_eq!(
2106             t(r"[\pZ\p{Greek}]"),
2107             hir_union(
2108                 hir_uclass_query(ClassQuery::Binary("greek")),
2109                 hir_uclass_query(ClassQuery::Binary("separator"))));
2110         assert_eq!(
2111             t(r"[\p{age:3.0}\pZ\p{Greek}]"),
2112             hir_union(
2113                 hir_uclass_query(ClassQuery::ByValue {
2114                     property_name: "age",
2115                     property_value: "3.0",
2116                 }),
2117                 hir_union(
2118                     hir_uclass_query(ClassQuery::Binary("greek")),
2119                     hir_uclass_query(ClassQuery::Binary("separator")))));
2120         assert_eq!(
2121             t(r"[[[\p{age:3.0}\pZ]\p{Greek}][\p{Cyrillic}]]"),
2122             hir_union(
2123                 hir_uclass_query(ClassQuery::ByValue {
2124                     property_name: "age",
2125                     property_value: "3.0",
2126                 }),
2127                 hir_union(
2128                     hir_uclass_query(ClassQuery::Binary("cyrillic")),
2129                     hir_union(
2130                         hir_uclass_query(ClassQuery::Binary("greek")),
2131                         hir_uclass_query(ClassQuery::Binary("separator"))))));
2132 
2133         assert_eq!(
2134             t(r"(?i)[\p{age:3.0}\pZ\p{Greek}]"),
2135             hir_case_fold(hir_union(
2136                 hir_uclass_query(ClassQuery::ByValue {
2137                     property_name: "age",
2138                     property_value: "3.0",
2139                 }),
2140                 hir_union(
2141                     hir_uclass_query(ClassQuery::Binary("greek")),
2142                     hir_uclass_query(ClassQuery::Binary("separator"))))));
2143         assert_eq!(
2144             t(r"[^\p{age:3.0}\pZ\p{Greek}]"),
2145             hir_negate(hir_union(
2146                 hir_uclass_query(ClassQuery::ByValue {
2147                     property_name: "age",
2148                     property_value: "3.0",
2149                 }),
2150                 hir_union(
2151                     hir_uclass_query(ClassQuery::Binary("greek")),
2152                     hir_uclass_query(ClassQuery::Binary("separator"))))));
2153         assert_eq!(
2154             t(r"(?i)[^\p{age:3.0}\pZ\p{Greek}]"),
2155             hir_negate(hir_case_fold(hir_union(
2156                 hir_uclass_query(ClassQuery::ByValue {
2157                     property_name: "age",
2158                     property_value: "3.0",
2159                 }),
2160                 hir_union(
2161                     hir_uclass_query(ClassQuery::Binary("greek")),
2162                     hir_uclass_query(ClassQuery::Binary("separator")))))));
2163     }
2164 
2165     #[test]
class_bracketed_nested()2166     fn class_bracketed_nested() {
2167         assert_eq!(
2168             t(r"[a[^c]]"),
2169             hir_negate(hir_uclass(&[('c', 'c')])));
2170         assert_eq!(
2171             t(r"[a-b[^c]]"),
2172             hir_negate(hir_uclass(&[('c', 'c')])));
2173         assert_eq!(
2174             t(r"[a-c[^c]]"),
2175             hir_negate(hir_uclass(&[])));
2176 
2177         assert_eq!(
2178             t(r"[^a[^c]]"),
2179             hir_uclass(&[('c', 'c')]));
2180         assert_eq!(
2181             t(r"[^a-b[^c]]"),
2182             hir_uclass(&[('c', 'c')]));
2183 
2184         assert_eq!(
2185             t(r"(?i)[a[^c]]"),
2186             hir_negate(hir_case_fold(hir_uclass(&[('c', 'c')]))));
2187         assert_eq!(
2188             t(r"(?i)[a-b[^c]]"),
2189             hir_negate(hir_case_fold(hir_uclass(&[('c', 'c')]))));
2190 
2191         assert_eq!(
2192             t(r"(?i)[^a[^c]]"),
2193             hir_uclass(&[('C', 'C'), ('c', 'c')]));
2194         assert_eq!(
2195             t(r"(?i)[^a-b[^c]]"),
2196             hir_uclass(&[('C', 'C'), ('c', 'c')]));
2197 
2198         assert_eq!(t_err(r"[^a-c[^c]]"), TestError {
2199             kind: hir::ErrorKind::EmptyClassNotAllowed,
2200             span: Span::new(Position::new(0, 1, 1), Position::new(10, 1, 11)),
2201         });
2202         assert_eq!(t_err(r"(?i)[^a-c[^c]]"), TestError {
2203             kind: hir::ErrorKind::EmptyClassNotAllowed,
2204             span: Span::new(Position::new(4, 1, 5), Position::new(14, 1, 15)),
2205         });
2206     }
2207 
2208     #[test]
class_bracketed_intersect()2209     fn class_bracketed_intersect() {
2210         assert_eq!(t("[abc&&b-c]"), hir_uclass(&[('b', 'c')]));
2211         assert_eq!(t("[abc&&[b-c]]"), hir_uclass(&[('b', 'c')]));
2212         assert_eq!(t("[[abc]&&[b-c]]"), hir_uclass(&[('b', 'c')]));
2213         assert_eq!(t("[a-z&&b-y&&c-x]"), hir_uclass(&[('c', 'x')]));
2214         assert_eq!(t("[c-da-b&&a-d]"), hir_uclass(&[('a', 'd')]));
2215         assert_eq!(t("[a-d&&c-da-b]"), hir_uclass(&[('a', 'd')]));
2216         assert_eq!(t(r"[a-z&&a-c]"), hir_uclass(&[('a', 'c')]));
2217         assert_eq!(t(r"[[a-z&&a-c]]"), hir_uclass(&[('a', 'c')]));
2218         assert_eq!(t(r"[^[a-z&&a-c]]"), hir_negate(hir_uclass(&[('a', 'c')])));
2219 
2220         assert_eq!(t("(?-u)[abc&&b-c]"), hir_bclass(&[(b'b', b'c')]));
2221         assert_eq!(t("(?-u)[abc&&[b-c]]"), hir_bclass(&[(b'b', b'c')]));
2222         assert_eq!(t("(?-u)[[abc]&&[b-c]]"), hir_bclass(&[(b'b', b'c')]));
2223         assert_eq!(t("(?-u)[a-z&&b-y&&c-x]"), hir_bclass(&[(b'c', b'x')]));
2224         assert_eq!(t("(?-u)[c-da-b&&a-d]"), hir_bclass(&[(b'a', b'd')]));
2225         assert_eq!(t("(?-u)[a-d&&c-da-b]"), hir_bclass(&[(b'a', b'd')]));
2226 
2227         assert_eq!(
2228             t("(?i)[abc&&b-c]"),
2229             hir_case_fold(hir_uclass(&[('b', 'c')])));
2230         assert_eq!(
2231             t("(?i)[abc&&[b-c]]"),
2232             hir_case_fold(hir_uclass(&[('b', 'c')])));
2233         assert_eq!(
2234             t("(?i)[[abc]&&[b-c]]"),
2235             hir_case_fold(hir_uclass(&[('b', 'c')])));
2236         assert_eq!(
2237             t("(?i)[a-z&&b-y&&c-x]"),
2238             hir_case_fold(hir_uclass(&[('c', 'x')])));
2239         assert_eq!(
2240             t("(?i)[c-da-b&&a-d]"),
2241             hir_case_fold(hir_uclass(&[('a', 'd')])));
2242         assert_eq!(
2243             t("(?i)[a-d&&c-da-b]"),
2244             hir_case_fold(hir_uclass(&[('a', 'd')])));
2245 
2246         assert_eq!(
2247             t("(?i-u)[abc&&b-c]"),
2248             hir_case_fold(hir_bclass(&[(b'b', b'c')])));
2249         assert_eq!(
2250             t("(?i-u)[abc&&[b-c]]"),
2251             hir_case_fold(hir_bclass(&[(b'b', b'c')])));
2252         assert_eq!(
2253             t("(?i-u)[[abc]&&[b-c]]"),
2254             hir_case_fold(hir_bclass(&[(b'b', b'c')])));
2255         assert_eq!(
2256             t("(?i-u)[a-z&&b-y&&c-x]"),
2257             hir_case_fold(hir_bclass(&[(b'c', b'x')])));
2258         assert_eq!(
2259             t("(?i-u)[c-da-b&&a-d]"),
2260             hir_case_fold(hir_bclass(&[(b'a', b'd')])));
2261         assert_eq!(
2262             t("(?i-u)[a-d&&c-da-b]"),
2263             hir_case_fold(hir_bclass(&[(b'a', b'd')])));
2264 
2265         // In `[a^]`, `^` does not need to be escaped, so it makes sense that
2266         // `^` is also allowed to be unescaped after `&&`.
2267         assert_eq!(t(r"[\^&&^]"), hir_uclass(&[('^', '^')]));
2268         // `]` needs to be escaped after `&&` since it's not at start of class.
2269         assert_eq!(t(r"[]&&\]]"), hir_uclass(&[(']', ']')]));
2270         assert_eq!(t(r"[-&&-]"), hir_uclass(&[('-', '-')]));
2271         assert_eq!(t(r"[\&&&&]"), hir_uclass(&[('&', '&')]));
2272         assert_eq!(t(r"[\&&&\&]"), hir_uclass(&[('&', '&')]));
2273         // Test precedence.
2274         assert_eq!(
2275             t(r"[a-w&&[^c-g]z]"),
2276             hir_uclass(&[('a', 'b'), ('h', 'w')]));
2277     }
2278 
2279     #[test]
class_bracketed_intersect_negate()2280     fn class_bracketed_intersect_negate() {
2281         assert_eq!(
2282             t(r"[^\w&&\d]"),
2283             hir_negate(hir_uclass_query(ClassQuery::Binary("digit"))));
2284         assert_eq!(
2285             t(r"[^[a-z&&a-c]]"),
2286             hir_negate(hir_uclass(&[('a', 'c')])));
2287         assert_eq!(
2288             t(r"[^[\w&&\d]]"),
2289             hir_negate(hir_uclass_query(ClassQuery::Binary("digit"))));
2290         assert_eq!(
2291             t(r"[^[^\w&&\d]]"),
2292             hir_uclass_query(ClassQuery::Binary("digit")));
2293         assert_eq!(
2294             t(r"[[[^\w]&&[^\d]]]"),
2295             hir_negate(hir_uclass_perl_word()));
2296 
2297         assert_eq!(
2298             t_bytes(r"(?-u)[^\w&&\d]"),
2299             hir_negate(hir_bclass_from_char(ascii_class(
2300                 &ast::ClassAsciiKind::Digit))));
2301         assert_eq!(
2302             t_bytes(r"(?-u)[^[a-z&&a-c]]"),
2303             hir_negate(hir_bclass(&[(b'a', b'c')])));
2304         assert_eq!(
2305             t_bytes(r"(?-u)[^[\w&&\d]]"),
2306             hir_negate(hir_bclass_from_char(ascii_class(
2307                 &ast::ClassAsciiKind::Digit))));
2308         assert_eq!(
2309             t_bytes(r"(?-u)[^[^\w&&\d]]"),
2310             hir_bclass_from_char(ascii_class(
2311                 &ast::ClassAsciiKind::Digit)));
2312         assert_eq!(
2313             t_bytes(r"(?-u)[[[^\w]&&[^\d]]]"),
2314             hir_negate(hir_bclass_from_char(ascii_class(
2315                 &ast::ClassAsciiKind::Word))));
2316     }
2317 
2318     #[test]
class_bracketed_difference()2319     fn class_bracketed_difference() {
2320         assert_eq!(
2321             t(r"[\pL--[:ascii:]]"),
2322             hir_difference(
2323                 hir_uclass_query(ClassQuery::Binary("letter")),
2324                 hir_uclass(&[('\0', '\x7F')])));
2325 
2326         assert_eq!(
2327             t(r"(?-u)[[:alpha:]--[:lower:]]"),
2328             hir_bclass(&[(b'A', b'Z')]));
2329     }
2330 
2331     #[test]
class_bracketed_symmetric_difference()2332     fn class_bracketed_symmetric_difference() {
2333         assert_eq!(
2334             t(r"[\p{sc:Greek}~~\p{scx:Greek}]"),
2335             hir_uclass(&[
2336                 ('\u{0342}', '\u{0342}'),
2337                 ('\u{0345}', '\u{0345}'),
2338                 ('\u{1DC0}', '\u{1DC1}'),
2339             ]));
2340         assert_eq!(
2341             t(r"[a-g~~c-j]"),
2342             hir_uclass(&[('a', 'b'), ('h', 'j')]));
2343 
2344         assert_eq!(
2345             t(r"(?-u)[a-g~~c-j]"),
2346             hir_bclass(&[(b'a', b'b'), (b'h', b'j')]));
2347     }
2348 
2349     #[test]
ignore_whitespace()2350     fn ignore_whitespace() {
2351         assert_eq!(t(r"(?x)\12 3"), hir_lit("\n3"));
2352         assert_eq!(t(r"(?x)\x { 53 }"), hir_lit("S"));
2353         assert_eq!(t(r"(?x)\x # comment
2354 { # comment
2355     53 # comment
2356 } #comment"), hir_lit("S"));
2357 
2358         assert_eq!(t(r"(?x)\x 53"), hir_lit("S"));
2359         assert_eq!(t(r"(?x)\x # comment
2360         53 # comment"), hir_lit("S"));
2361         assert_eq!(t(r"(?x)\x5 3"), hir_lit("S"));
2362 
2363         assert_eq!(t(r"(?x)\p # comment
2364 { # comment
2365     Separator # comment
2366 } # comment"), hir_uclass_query(ClassQuery::Binary("separator")));
2367 
2368         assert_eq!(t(r"(?x)a # comment
2369 { # comment
2370     5 # comment
2371     , # comment
2372     10 # comment
2373 } # comment"),
2374             hir_range(
2375                 true, hir::RepetitionRange::Bounded(5, 10), hir_lit("a")));
2376 
2377         assert_eq!(t(r"(?x)a\  # hi there"), hir_lit("a "));
2378     }
2379 
2380     #[test]
analysis_is_always_utf8()2381     fn analysis_is_always_utf8() {
2382         // Positive examples.
2383         assert!(t_bytes(r"a").is_always_utf8());
2384         assert!(t_bytes(r"ab").is_always_utf8());
2385         assert!(t_bytes(r"(?-u)a").is_always_utf8());
2386         assert!(t_bytes(r"(?-u)ab").is_always_utf8());
2387         assert!(t_bytes(r"\xFF").is_always_utf8());
2388         assert!(t_bytes(r"\xFF\xFF").is_always_utf8());
2389         assert!(t_bytes(r"[^a]").is_always_utf8());
2390         assert!(t_bytes(r"[^a][^a]").is_always_utf8());
2391         assert!(t_bytes(r"\b").is_always_utf8());
2392         assert!(t_bytes(r"\B").is_always_utf8());
2393         assert!(t_bytes(r"(?-u)\b").is_always_utf8());
2394 
2395         // Negative examples.
2396         assert!(!t_bytes(r"(?-u)\xFF").is_always_utf8());
2397         assert!(!t_bytes(r"(?-u)\xFF\xFF").is_always_utf8());
2398         assert!(!t_bytes(r"(?-u)[^a]").is_always_utf8());
2399         assert!(!t_bytes(r"(?-u)[^a][^a]").is_always_utf8());
2400         assert!(!t_bytes(r"(?-u)\B").is_always_utf8());
2401     }
2402 
2403     #[test]
analysis_is_all_assertions()2404     fn analysis_is_all_assertions() {
2405         // Positive examples.
2406         assert!(t(r"\b").is_all_assertions());
2407         assert!(t(r"\B").is_all_assertions());
2408         assert!(t(r"^").is_all_assertions());
2409         assert!(t(r"$").is_all_assertions());
2410         assert!(t(r"\A").is_all_assertions());
2411         assert!(t(r"\z").is_all_assertions());
2412         assert!(t(r"$^\z\A\b\B").is_all_assertions());
2413         assert!(t(r"$|^|\z|\A|\b|\B").is_all_assertions());
2414         assert!(t(r"^$|$^").is_all_assertions());
2415         assert!(t(r"((\b)+())*^").is_all_assertions());
2416 
2417         // Negative examples.
2418         assert!(!t(r"^a").is_all_assertions());
2419     }
2420 
2421     #[test]
analysis_is_anchored()2422     fn analysis_is_anchored() {
2423         // Positive examples.
2424         assert!(t(r"^").is_anchored_start());
2425         assert!(t(r"$").is_anchored_end());
2426         assert!(t(r"^").is_line_anchored_start());
2427         assert!(t(r"$").is_line_anchored_end());
2428 
2429         assert!(t(r"^^").is_anchored_start());
2430         assert!(t(r"$$").is_anchored_end());
2431         assert!(t(r"^^").is_line_anchored_start());
2432         assert!(t(r"$$").is_line_anchored_end());
2433 
2434         assert!(t(r"^$").is_anchored_start());
2435         assert!(t(r"^$").is_anchored_end());
2436         assert!(t(r"^$").is_line_anchored_start());
2437         assert!(t(r"^$").is_line_anchored_end());
2438 
2439         assert!(t(r"^foo").is_anchored_start());
2440         assert!(t(r"foo$").is_anchored_end());
2441         assert!(t(r"^foo").is_line_anchored_start());
2442         assert!(t(r"foo$").is_line_anchored_end());
2443 
2444         assert!(t(r"^foo|^bar").is_anchored_start());
2445         assert!(t(r"foo$|bar$").is_anchored_end());
2446         assert!(t(r"^foo|^bar").is_line_anchored_start());
2447         assert!(t(r"foo$|bar$").is_line_anchored_end());
2448 
2449         assert!(t(r"^(foo|bar)").is_anchored_start());
2450         assert!(t(r"(foo|bar)$").is_anchored_end());
2451         assert!(t(r"^(foo|bar)").is_line_anchored_start());
2452         assert!(t(r"(foo|bar)$").is_line_anchored_end());
2453 
2454         assert!(t(r"^+").is_anchored_start());
2455         assert!(t(r"$+").is_anchored_end());
2456         assert!(t(r"^+").is_line_anchored_start());
2457         assert!(t(r"$+").is_line_anchored_end());
2458         assert!(t(r"^++").is_anchored_start());
2459         assert!(t(r"$++").is_anchored_end());
2460         assert!(t(r"^++").is_line_anchored_start());
2461         assert!(t(r"$++").is_line_anchored_end());
2462         assert!(t(r"(^)+").is_anchored_start());
2463         assert!(t(r"($)+").is_anchored_end());
2464         assert!(t(r"(^)+").is_line_anchored_start());
2465         assert!(t(r"($)+").is_line_anchored_end());
2466 
2467         assert!(t(r"$^").is_anchored_start());
2468         assert!(t(r"$^").is_anchored_start());
2469         assert!(t(r"$^").is_line_anchored_end());
2470         assert!(t(r"$^").is_line_anchored_end());
2471         assert!(t(r"$^|^$").is_anchored_start());
2472         assert!(t(r"$^|^$").is_anchored_end());
2473         assert!(t(r"$^|^$").is_line_anchored_start());
2474         assert!(t(r"$^|^$").is_line_anchored_end());
2475 
2476         assert!(t(r"\b^").is_anchored_start());
2477         assert!(t(r"$\b").is_anchored_end());
2478         assert!(t(r"\b^").is_line_anchored_start());
2479         assert!(t(r"$\b").is_line_anchored_end());
2480         assert!(t(r"^(?m:^)").is_anchored_start());
2481         assert!(t(r"(?m:$)$").is_anchored_end());
2482         assert!(t(r"^(?m:^)").is_line_anchored_start());
2483         assert!(t(r"(?m:$)$").is_line_anchored_end());
2484         assert!(t(r"(?m:^)^").is_anchored_start());
2485         assert!(t(r"$(?m:$)").is_anchored_end());
2486         assert!(t(r"(?m:^)^").is_line_anchored_start());
2487         assert!(t(r"$(?m:$)").is_line_anchored_end());
2488 
2489         // Negative examples.
2490         assert!(!t(r"(?m)^").is_anchored_start());
2491         assert!(!t(r"(?m)$").is_anchored_end());
2492         assert!(!t(r"(?m:^$)|$^").is_anchored_start());
2493         assert!(!t(r"(?m:^$)|$^").is_anchored_end());
2494         assert!(!t(r"$^|(?m:^$)").is_anchored_start());
2495         assert!(!t(r"$^|(?m:^$)").is_anchored_end());
2496 
2497         assert!(!t(r"a^").is_anchored_start());
2498         assert!(!t(r"$a").is_anchored_start());
2499         assert!(!t(r"a^").is_line_anchored_start());
2500         assert!(!t(r"$a").is_line_anchored_start());
2501 
2502         assert!(!t(r"a^").is_anchored_end());
2503         assert!(!t(r"$a").is_anchored_end());
2504         assert!(!t(r"a^").is_line_anchored_end());
2505         assert!(!t(r"$a").is_line_anchored_end());
2506 
2507         assert!(!t(r"^foo|bar").is_anchored_start());
2508         assert!(!t(r"foo|bar$").is_anchored_end());
2509         assert!(!t(r"^foo|bar").is_line_anchored_start());
2510         assert!(!t(r"foo|bar$").is_line_anchored_end());
2511 
2512         assert!(!t(r"^*").is_anchored_start());
2513         assert!(!t(r"$*").is_anchored_end());
2514         assert!(!t(r"^*").is_line_anchored_start());
2515         assert!(!t(r"$*").is_line_anchored_end());
2516         assert!(!t(r"^*+").is_anchored_start());
2517         assert!(!t(r"$*+").is_anchored_end());
2518         assert!(!t(r"^*+").is_line_anchored_start());
2519         assert!(!t(r"$*+").is_line_anchored_end());
2520         assert!(!t(r"^+*").is_anchored_start());
2521         assert!(!t(r"$+*").is_anchored_end());
2522         assert!(!t(r"^+*").is_line_anchored_start());
2523         assert!(!t(r"$+*").is_line_anchored_end());
2524         assert!(!t(r"(^)*").is_anchored_start());
2525         assert!(!t(r"($)*").is_anchored_end());
2526         assert!(!t(r"(^)*").is_line_anchored_start());
2527         assert!(!t(r"($)*").is_line_anchored_end());
2528     }
2529 
2530     #[test]
analysis_is_line_anchored()2531     fn analysis_is_line_anchored() {
2532         assert!(t(r"(?m)^(foo|bar)").is_line_anchored_start());
2533         assert!(t(r"(?m)(foo|bar)$").is_line_anchored_end());
2534 
2535         assert!(t(r"(?m)^foo|^bar").is_line_anchored_start());
2536         assert!(t(r"(?m)foo$|bar$").is_line_anchored_end());
2537 
2538         assert!(t(r"(?m)^").is_line_anchored_start());
2539         assert!(t(r"(?m)$").is_line_anchored_end());
2540 
2541         assert!(t(r"(?m:^$)|$^").is_line_anchored_start());
2542         assert!(t(r"(?m:^$)|$^").is_line_anchored_end());
2543 
2544         assert!(t(r"$^|(?m:^$)").is_line_anchored_start());
2545         assert!(t(r"$^|(?m:^$)").is_line_anchored_end());
2546     }
2547 
2548     #[test]
analysis_is_any_anchored()2549     fn analysis_is_any_anchored() {
2550         // Positive examples.
2551         assert!(t(r"^").is_any_anchored_start());
2552         assert!(t(r"$").is_any_anchored_end());
2553         assert!(t(r"\A").is_any_anchored_start());
2554         assert!(t(r"\z").is_any_anchored_end());
2555 
2556         // Negative examples.
2557         assert!(!t(r"(?m)^").is_any_anchored_start());
2558         assert!(!t(r"(?m)$").is_any_anchored_end());
2559         assert!(!t(r"$").is_any_anchored_start());
2560         assert!(!t(r"^").is_any_anchored_end());
2561     }
2562 
2563     #[test]
analysis_is_match_empty()2564     fn analysis_is_match_empty() {
2565         // Positive examples.
2566         assert!(t(r"").is_match_empty());
2567         assert!(t(r"()").is_match_empty());
2568         assert!(t(r"()*").is_match_empty());
2569         assert!(t(r"()+").is_match_empty());
2570         assert!(t(r"()?").is_match_empty());
2571         assert!(t(r"a*").is_match_empty());
2572         assert!(t(r"a?").is_match_empty());
2573         assert!(t(r"a{0}").is_match_empty());
2574         assert!(t(r"a{0,}").is_match_empty());
2575         assert!(t(r"a{0,1}").is_match_empty());
2576         assert!(t(r"a{0,10}").is_match_empty());
2577         assert!(t(r"\pL*").is_match_empty());
2578         assert!(t(r"a*|b").is_match_empty());
2579         assert!(t(r"b|a*").is_match_empty());
2580         assert!(t(r"a*a?(abcd)*").is_match_empty());
2581         assert!(t(r"^").is_match_empty());
2582         assert!(t(r"$").is_match_empty());
2583         assert!(t(r"(?m)^").is_match_empty());
2584         assert!(t(r"(?m)$").is_match_empty());
2585         assert!(t(r"\A").is_match_empty());
2586         assert!(t(r"\z").is_match_empty());
2587         assert!(t(r"\B").is_match_empty());
2588         assert!(t_bytes(r"(?-u)\B").is_match_empty());
2589 
2590         // Negative examples.
2591         assert!(!t(r"a+").is_match_empty());
2592         assert!(!t(r"a{1}").is_match_empty());
2593         assert!(!t(r"a{1,}").is_match_empty());
2594         assert!(!t(r"a{1,2}").is_match_empty());
2595         assert!(!t(r"a{1,10}").is_match_empty());
2596         assert!(!t(r"b|a").is_match_empty());
2597         assert!(!t(r"a*a+(abcd)*").is_match_empty());
2598         assert!(!t(r"\b").is_match_empty());
2599         assert!(!t(r"(?-u)\b").is_match_empty());
2600     }
2601 
2602     #[test]
analysis_is_literal()2603     fn analysis_is_literal() {
2604         // Positive examples.
2605         assert!(t(r"").is_literal());
2606         assert!(t(r"a").is_literal());
2607         assert!(t(r"ab").is_literal());
2608         assert!(t(r"abc").is_literal());
2609         assert!(t(r"(?m)abc").is_literal());
2610 
2611         // Negative examples.
2612         assert!(!t(r"^").is_literal());
2613         assert!(!t(r"a|b").is_literal());
2614         assert!(!t(r"(a)").is_literal());
2615         assert!(!t(r"a+").is_literal());
2616         assert!(!t(r"foo(a)").is_literal());
2617         assert!(!t(r"(a)foo").is_literal());
2618         assert!(!t(r"[a]").is_literal());
2619     }
2620 
2621     #[test]
analysis_is_alternation_literal()2622     fn analysis_is_alternation_literal() {
2623         // Positive examples.
2624         assert!(t(r"").is_alternation_literal());
2625         assert!(t(r"a").is_alternation_literal());
2626         assert!(t(r"ab").is_alternation_literal());
2627         assert!(t(r"abc").is_alternation_literal());
2628         assert!(t(r"(?m)abc").is_alternation_literal());
2629         assert!(t(r"a|b").is_alternation_literal());
2630         assert!(t(r"a|b|c").is_alternation_literal());
2631         assert!(t(r"foo|bar").is_alternation_literal());
2632         assert!(t(r"foo|bar|baz").is_alternation_literal());
2633 
2634         // Negative examples.
2635         assert!(!t(r"^").is_alternation_literal());
2636         assert!(!t(r"(a)").is_alternation_literal());
2637         assert!(!t(r"a+").is_alternation_literal());
2638         assert!(!t(r"foo(a)").is_alternation_literal());
2639         assert!(!t(r"(a)foo").is_alternation_literal());
2640         assert!(!t(r"[a]").is_alternation_literal());
2641         assert!(!t(r"[a]|b").is_alternation_literal());
2642         assert!(!t(r"a|[b]").is_alternation_literal());
2643         assert!(!t(r"(a)|b").is_alternation_literal());
2644         assert!(!t(r"a|(b)").is_alternation_literal());
2645     }
2646 }
2647