1 // Copyright 2018 The Rust Project Developers. See the COPYRIGHT
2 // file at the top-level directory of this distribution and at
3 // http://rust-lang.org/COPYRIGHT.
4 //
5 // Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
6 // http://www.apache.org/licenses/LICENSE-2.0> or the MIT license
7 // <LICENSE-MIT or http://opensource.org/licenses/MIT>, at your
8 // option. This file may not be copied, modified, or distributed
9 // except according to those terms.
10
11 /*!
12 Defines a translator that converts an `Ast` to an `Hir`.
13 */
14
15 use std::cell::{Cell, RefCell};
16 use std::result;
17
18 use ast::{self, Ast, Span, Visitor};
19 use hir::{self, Error, ErrorKind, Hir};
20 use unicode::{self, ClassQuery};
21
22 type Result<T> = result::Result<T, Error>;
23
24 /// A builder for constructing an AST->HIR translator.
25 #[derive(Clone, Debug)]
26 pub struct TranslatorBuilder {
27 allow_invalid_utf8: bool,
28 flags: Flags,
29 }
30
31 impl Default for TranslatorBuilder {
default() -> TranslatorBuilder32 fn default() -> TranslatorBuilder {
33 TranslatorBuilder::new()
34 }
35 }
36
37 impl TranslatorBuilder {
38 /// Create a new translator builder with a default c onfiguration.
new() -> TranslatorBuilder39 pub fn new() -> TranslatorBuilder {
40 TranslatorBuilder {
41 allow_invalid_utf8: false,
42 flags: Flags::default(),
43 }
44 }
45
46 /// Build a translator using the current configuration.
build(&self) -> Translator47 pub fn build(&self) -> Translator {
48 Translator {
49 stack: RefCell::new(vec![]),
50 flags: Cell::new(self.flags),
51 allow_invalid_utf8: self.allow_invalid_utf8,
52 }
53 }
54
55 /// When enabled, translation will permit the construction of a regular
56 /// expression that may match invalid UTF-8.
57 ///
58 /// When disabled (the default), the translator is guaranteed to produce
59 /// an expression that will only ever match valid UTF-8 (otherwise, the
60 /// translator will return an error).
61 ///
62 /// Perhaps surprisingly, when invalid UTF-8 isn't allowed, a negated ASCII
63 /// word boundary (uttered as `(?-u:\B)` in the concrete syntax) will cause
64 /// the parser to return an error. Namely, a negated ASCII word boundary
65 /// can result in matching positions that aren't valid UTF-8 boundaries.
allow_invalid_utf8( &mut self, yes: bool, ) -> &mut TranslatorBuilder66 pub fn allow_invalid_utf8(
67 &mut self,
68 yes: bool,
69 ) -> &mut TranslatorBuilder {
70 self.allow_invalid_utf8 = yes;
71 self
72 }
73
74 /// Enable or disable the case insensitive flag (`i`) by default.
case_insensitive(&mut self, yes: bool) -> &mut TranslatorBuilder75 pub fn case_insensitive(&mut self, yes: bool) -> &mut TranslatorBuilder {
76 self.flags.case_insensitive = if yes { Some(true) } else { None };
77 self
78 }
79
80 /// Enable or disable the multi-line matching flag (`m`) by default.
multi_line(&mut self, yes: bool) -> &mut TranslatorBuilder81 pub fn multi_line(&mut self, yes: bool) -> &mut TranslatorBuilder {
82 self.flags.multi_line = if yes { Some(true) } else { None };
83 self
84 }
85
86 /// Enable or disable the "dot matches any character" flag (`s`) by
87 /// default.
dot_matches_new_line( &mut self, yes: bool, ) -> &mut TranslatorBuilder88 pub fn dot_matches_new_line(
89 &mut self,
90 yes: bool,
91 ) -> &mut TranslatorBuilder {
92 self.flags.dot_matches_new_line = if yes { Some(true) } else { None };
93 self
94 }
95
96 /// Enable or disable the "swap greed" flag (`U`) by default.
swap_greed(&mut self, yes: bool) -> &mut TranslatorBuilder97 pub fn swap_greed(&mut self, yes: bool) -> &mut TranslatorBuilder {
98 self.flags.swap_greed = if yes { Some(true) } else { None };
99 self
100 }
101
102 /// Enable or disable the Unicode flag (`u`) by default.
unicode(&mut self, yes: bool) -> &mut TranslatorBuilder103 pub fn unicode(&mut self, yes: bool) -> &mut TranslatorBuilder {
104 self.flags.unicode = if yes { None } else { Some(false) };
105 self
106 }
107 }
108
109 /// A translator maps abstract syntax to a high level intermediate
110 /// representation.
111 ///
112 /// A translator may be benefit from reuse. That is, a translator can translate
113 /// many abstract syntax trees.
114 ///
115 /// A `Translator` can be configured in more detail via a
116 /// [`TranslatorBuilder`](struct.TranslatorBuilder.html).
117 #[derive(Clone, Debug)]
118 pub struct Translator {
119 /// Our call stack, but on the heap.
120 stack: RefCell<Vec<HirFrame>>,
121 /// The current flag settings.
122 flags: Cell<Flags>,
123 /// Whether we're allowed to produce HIR that can match arbitrary bytes.
124 allow_invalid_utf8: bool,
125 }
126
127 impl Translator {
128 /// Create a new translator using the default configuration.
new() -> Translator129 pub fn new() -> Translator {
130 TranslatorBuilder::new().build()
131 }
132
133 /// Translate the given abstract syntax tree (AST) into a high level
134 /// intermediate representation (HIR).
135 ///
136 /// If there was a problem doing the translation, then an HIR-specific
137 /// error is returned.
138 ///
139 /// The original pattern string used to produce the `Ast` *must* also be
140 /// provided. The translator does not use the pattern string during any
141 /// correct translation, but is used for error reporting.
translate(&mut self, pattern: &str, ast: &Ast) -> Result<Hir>142 pub fn translate(&mut self, pattern: &str, ast: &Ast) -> Result<Hir> {
143 ast::visit(ast, TranslatorI::new(self, pattern))
144 }
145 }
146
147 /// An HirFrame is a single stack frame, represented explicitly, which is
148 /// created for each item in the Ast that we traverse.
149 ///
150 /// Note that technically, this type doesn't represent our entire stack
151 /// frame. In particular, the Ast visitor represents any state associated with
152 /// traversing the Ast itself.
153 #[derive(Clone, Debug)]
154 enum HirFrame {
155 /// An arbitrary HIR expression. These get pushed whenever we hit a base
156 /// case in the Ast. They get popped after an inductive (i.e., recursive)
157 /// step is complete.
158 Expr(Hir),
159 /// A Unicode character class. This frame is mutated as we descend into
160 /// the Ast of a character class (which is itself its own mini recursive
161 /// structure).
162 ClassUnicode(hir::ClassUnicode),
163 /// A byte-oriented character class. This frame is mutated as we descend
164 /// into the Ast of a character class (which is itself its own mini
165 /// recursive structure).
166 ///
167 /// Byte character classes are created when Unicode mode (`u`) is disabled.
168 /// If `allow_invalid_utf8` is disabled (the default), then a byte
169 /// character is only permitted to match ASCII text.
170 ClassBytes(hir::ClassBytes),
171 /// This is pushed on to the stack upon first seeing any kind of group,
172 /// indicated by parentheses (including non-capturing groups). It is popped
173 /// upon leaving a group.
174 Group {
175 /// The old active flags, if any, when this group was opened.
176 ///
177 /// If this group sets flags, then the new active flags are set to the
178 /// result of merging the old flags with the flags introduced by this
179 /// group.
180 ///
181 /// When this group is popped, the active flags should be restored to
182 /// the flags set here.
183 ///
184 /// The "active" flags correspond to whatever flags are set in the
185 /// Translator.
186 old_flags: Option<Flags>,
187 },
188 /// This is pushed whenever a concatenation is observed. After visiting
189 /// every sub-expression in the concatenation, the translator's stack is
190 /// popped until it sees a Concat frame.
191 Concat,
192 /// This is pushed whenever an alternation is observed. After visiting
193 /// every sub-expression in the alternation, the translator's stack is
194 /// popped until it sees an Alternation frame.
195 Alternation,
196 }
197
198 impl HirFrame {
199 /// Assert that the current stack frame is an Hir expression and return it.
unwrap_expr(self) -> Hir200 fn unwrap_expr(self) -> Hir {
201 match self {
202 HirFrame::Expr(expr) => expr,
203 _ => panic!("tried to unwrap expr from HirFrame, got: {:?}", self)
204 }
205 }
206
207 /// Assert that the current stack frame is a Unicode class expression and
208 /// return it.
unwrap_class_unicode(self) -> hir::ClassUnicode209 fn unwrap_class_unicode(self) -> hir::ClassUnicode {
210 match self {
211 HirFrame::ClassUnicode(cls) => cls,
212 _ => panic!("tried to unwrap Unicode class \
213 from HirFrame, got: {:?}", self)
214 }
215 }
216
217 /// Assert that the current stack frame is a byte class expression and
218 /// return it.
unwrap_class_bytes(self) -> hir::ClassBytes219 fn unwrap_class_bytes(self) -> hir::ClassBytes {
220 match self {
221 HirFrame::ClassBytes(cls) => cls,
222 _ => panic!("tried to unwrap byte class \
223 from HirFrame, got: {:?}", self)
224 }
225 }
226
227 /// Assert that the current stack frame is a group indicator and return
228 /// its corresponding flags (the flags that were active at the time the
229 /// group was entered) if they exist.
unwrap_group(self) -> Option<Flags>230 fn unwrap_group(self) -> Option<Flags> {
231 match self {
232 HirFrame::Group { old_flags } => old_flags,
233 _ => panic!("tried to unwrap group from HirFrame, got: {:?}", self)
234 }
235 }
236 }
237
238 impl<'t, 'p> Visitor for TranslatorI<'t, 'p> {
239 type Output = Hir;
240 type Err = Error;
241
finish(self) -> Result<Hir>242 fn finish(self) -> Result<Hir> {
243 // ... otherwise, we should have exactly one HIR on the stack.
244 assert_eq!(self.trans().stack.borrow().len(), 1);
245 Ok(self.pop().unwrap().unwrap_expr())
246 }
247
visit_pre(&mut self, ast: &Ast) -> Result<()>248 fn visit_pre(&mut self, ast: &Ast) -> Result<()> {
249 match *ast {
250 Ast::Class(ast::Class::Bracketed(_)) => {
251 if self.flags().unicode() {
252 let cls = hir::ClassUnicode::empty();
253 self.push(HirFrame::ClassUnicode(cls));
254 } else {
255 let cls = hir::ClassBytes::empty();
256 self.push(HirFrame::ClassBytes(cls));
257 }
258 }
259 Ast::Group(ref x) => {
260 let old_flags = x.flags().map(|ast| self.set_flags(ast));
261 self.push(HirFrame::Group {
262 old_flags: old_flags,
263 });
264 }
265 Ast::Concat(ref x) if x.asts.is_empty() => {}
266 Ast::Concat(_) => {
267 self.push(HirFrame::Concat);
268 }
269 Ast::Alternation(ref x) if x.asts.is_empty() => {}
270 Ast::Alternation(_) => {
271 self.push(HirFrame::Alternation);
272 }
273 _ => {}
274 }
275 Ok(())
276 }
277
visit_post(&mut self, ast: &Ast) -> Result<()>278 fn visit_post(&mut self, ast: &Ast) -> Result<()> {
279 match *ast {
280 Ast::Empty(_) => {
281 self.push(HirFrame::Expr(Hir::empty()));
282 }
283 Ast::Flags(ref x) => {
284 self.set_flags(&x.flags);
285 // Flags in the AST are generally considered directives and
286 // not actual sub-expressions. However, they can be used in
287 // the concrete syntax like `((?i))`, and we need some kind of
288 // indication of an expression there, and Empty is the correct
289 // choice.
290 //
291 // There can also be things like `(?i)+`, but we rule those out
292 // in the parser. In the future, we might allow them for
293 // consistency sake.
294 self.push(HirFrame::Expr(Hir::empty()));
295 }
296 Ast::Literal(ref x) => {
297 self.push(HirFrame::Expr(self.hir_literal(x)?));
298 }
299 Ast::Dot(span) => {
300 self.push(HirFrame::Expr(self.hir_dot(span)?));
301 }
302 Ast::Assertion(ref x) => {
303 self.push(HirFrame::Expr(self.hir_assertion(x)?));
304 }
305 Ast::Class(ast::Class::Perl(ref x)) => {
306 if self.flags().unicode() {
307 let cls = self.hir_perl_unicode_class(x);
308 let hcls = hir::Class::Unicode(cls);
309 self.push(HirFrame::Expr(Hir::class(hcls)));
310 } else {
311 let cls = self.hir_perl_byte_class(x);
312 let hcls = hir::Class::Bytes(cls);
313 self.push(HirFrame::Expr(Hir::class(hcls)));
314 }
315 }
316 Ast::Class(ast::Class::Unicode(ref x)) => {
317 let cls = hir::Class::Unicode(self.hir_unicode_class(x)?);
318 self.push(HirFrame::Expr(Hir::class(cls)));
319 }
320 Ast::Class(ast::Class::Bracketed(ref ast)) => {
321 if self.flags().unicode() {
322 let mut cls = self.pop().unwrap().unwrap_class_unicode();
323 self.unicode_fold_and_negate(ast.negated, &mut cls);
324 if cls.iter().next().is_none() {
325 return Err(self.error(
326 ast.span, ErrorKind::EmptyClassNotAllowed));
327 }
328 let expr = Hir::class(hir::Class::Unicode(cls));
329 self.push(HirFrame::Expr(expr));
330 } else {
331 let mut cls = self.pop().unwrap().unwrap_class_bytes();
332 self.bytes_fold_and_negate(
333 &ast.span, ast.negated, &mut cls)?;
334 if cls.iter().next().is_none() {
335 return Err(self.error(
336 ast.span, ErrorKind::EmptyClassNotAllowed));
337 }
338
339 let expr = Hir::class(hir::Class::Bytes(cls));
340 self.push(HirFrame::Expr(expr));
341 }
342 }
343 Ast::Repetition(ref x) => {
344 let expr = self.pop().unwrap().unwrap_expr();
345 self.push(HirFrame::Expr(self.hir_repetition(x, expr)));
346 }
347 Ast::Group(ref x) => {
348 let expr = self.pop().unwrap().unwrap_expr();
349 if let Some(flags) = self.pop().unwrap().unwrap_group() {
350 self.trans().flags.set(flags);
351 }
352 self.push(HirFrame::Expr(self.hir_group(x, expr)));
353 }
354 Ast::Concat(_) => {
355 let mut exprs = vec![];
356 while let Some(HirFrame::Expr(expr)) = self.pop() {
357 if !expr.kind().is_empty() {
358 exprs.push(expr);
359 }
360 }
361 exprs.reverse();
362 self.push(HirFrame::Expr(Hir::concat(exprs)));
363 }
364 Ast::Alternation(_) => {
365 let mut exprs = vec![];
366 while let Some(HirFrame::Expr(expr)) = self.pop() {
367 exprs.push(expr);
368 }
369 exprs.reverse();
370 self.push(HirFrame::Expr(Hir::alternation(exprs)));
371 }
372 }
373 Ok(())
374 }
375
visit_class_set_item_pre( &mut self, ast: &ast::ClassSetItem, ) -> Result<()>376 fn visit_class_set_item_pre(
377 &mut self,
378 ast: &ast::ClassSetItem,
379 ) -> Result<()> {
380 match *ast {
381 ast::ClassSetItem::Bracketed(_) => {
382 if self.flags().unicode() {
383 let cls = hir::ClassUnicode::empty();
384 self.push(HirFrame::ClassUnicode(cls));
385 } else {
386 let cls = hir::ClassBytes::empty();
387 self.push(HirFrame::ClassBytes(cls));
388 }
389 }
390 // We needn't handle the Union case here since the visitor will
391 // do it for us.
392 _ => {}
393 }
394 Ok(())
395 }
396
visit_class_set_item_post( &mut self, ast: &ast::ClassSetItem, ) -> Result<()>397 fn visit_class_set_item_post(
398 &mut self,
399 ast: &ast::ClassSetItem,
400 ) -> Result<()> {
401 match *ast {
402 ast::ClassSetItem::Empty(_) => {}
403 ast::ClassSetItem::Literal(ref x) => {
404 if self.flags().unicode() {
405 let mut cls = self.pop().unwrap().unwrap_class_unicode();
406 cls.push(hir::ClassUnicodeRange::new(x.c, x.c));
407 self.push(HirFrame::ClassUnicode(cls));
408 } else {
409 let mut cls = self.pop().unwrap().unwrap_class_bytes();
410 let byte = self.class_literal_byte(x)?;
411 cls.push(hir::ClassBytesRange::new(byte, byte));
412 self.push(HirFrame::ClassBytes(cls));
413 }
414 }
415 ast::ClassSetItem::Range(ref x) => {
416 if self.flags().unicode() {
417 let mut cls = self.pop().unwrap().unwrap_class_unicode();
418 cls.push(hir::ClassUnicodeRange::new(x.start.c, x.end.c));
419 self.push(HirFrame::ClassUnicode(cls));
420 } else {
421 let mut cls = self.pop().unwrap().unwrap_class_bytes();
422 let start = self.class_literal_byte(&x.start)?;
423 let end = self.class_literal_byte(&x.end)?;
424 cls.push(hir::ClassBytesRange::new(start, end));
425 self.push(HirFrame::ClassBytes(cls));
426 }
427 }
428 ast::ClassSetItem::Ascii(ref x) => {
429 if self.flags().unicode() {
430 let mut cls = self.pop().unwrap().unwrap_class_unicode();
431 for &(s, e) in ascii_class(&x.kind) {
432 cls.push(hir::ClassUnicodeRange::new(s, e));
433 }
434 self.unicode_fold_and_negate(x.negated, &mut cls);
435 self.push(HirFrame::ClassUnicode(cls));
436 } else {
437 let mut cls = self.pop().unwrap().unwrap_class_bytes();
438 for &(s, e) in ascii_class(&x.kind) {
439 cls.push(hir::ClassBytesRange::new(s as u8, e as u8));
440 }
441 self.bytes_fold_and_negate(
442 &x.span, x.negated, &mut cls)?;
443 self.push(HirFrame::ClassBytes(cls));
444 }
445 }
446 ast::ClassSetItem::Unicode(ref x) => {
447 let xcls = self.hir_unicode_class(x)?;
448 let mut cls = self.pop().unwrap().unwrap_class_unicode();
449 cls.union(&xcls);
450 self.push(HirFrame::ClassUnicode(cls));
451 }
452 ast::ClassSetItem::Perl(ref x) => {
453 if self.flags().unicode() {
454 let xcls = self.hir_perl_unicode_class(x);
455 let mut cls = self.pop().unwrap().unwrap_class_unicode();
456 cls.union(&xcls);
457 self.push(HirFrame::ClassUnicode(cls));
458 } else {
459 let xcls = self.hir_perl_byte_class(x);
460 let mut cls = self.pop().unwrap().unwrap_class_bytes();
461 cls.union(&xcls);
462 self.push(HirFrame::ClassBytes(cls));
463 }
464 }
465 ast::ClassSetItem::Bracketed(ref ast) => {
466 if self.flags().unicode() {
467 let mut cls1 = self.pop().unwrap().unwrap_class_unicode();
468 self.unicode_fold_and_negate(ast.negated, &mut cls1);
469
470 let mut cls2 = self.pop().unwrap().unwrap_class_unicode();
471 cls2.union(&cls1);
472 self.push(HirFrame::ClassUnicode(cls2));
473 } else {
474 let mut cls1 = self.pop().unwrap().unwrap_class_bytes();
475 self.bytes_fold_and_negate(
476 &ast.span, ast.negated, &mut cls1)?;
477
478 let mut cls2 = self.pop().unwrap().unwrap_class_bytes();
479 cls2.union(&cls1);
480 self.push(HirFrame::ClassBytes(cls2));
481 }
482 }
483 // This is handled automatically by the visitor.
484 ast::ClassSetItem::Union(_) => {}
485 }
486 Ok(())
487 }
488
visit_class_set_binary_op_pre( &mut self, _op: &ast::ClassSetBinaryOp, ) -> Result<()>489 fn visit_class_set_binary_op_pre(
490 &mut self,
491 _op: &ast::ClassSetBinaryOp,
492 ) -> Result<()> {
493 if self.flags().unicode() {
494 let cls = hir::ClassUnicode::empty();
495 self.push(HirFrame::ClassUnicode(cls));
496 } else {
497 let cls = hir::ClassBytes::empty();
498 self.push(HirFrame::ClassBytes(cls));
499 }
500 Ok(())
501 }
502
visit_class_set_binary_op_in( &mut self, _op: &ast::ClassSetBinaryOp, ) -> Result<()>503 fn visit_class_set_binary_op_in(
504 &mut self,
505 _op: &ast::ClassSetBinaryOp,
506 ) -> Result<()> {
507 if self.flags().unicode() {
508 let cls = hir::ClassUnicode::empty();
509 self.push(HirFrame::ClassUnicode(cls));
510 } else {
511 let cls = hir::ClassBytes::empty();
512 self.push(HirFrame::ClassBytes(cls));
513 }
514 Ok(())
515 }
516
visit_class_set_binary_op_post( &mut self, op: &ast::ClassSetBinaryOp, ) -> Result<()>517 fn visit_class_set_binary_op_post(
518 &mut self,
519 op: &ast::ClassSetBinaryOp,
520 ) -> Result<()> {
521 use ast::ClassSetBinaryOpKind::*;
522
523 if self.flags().unicode() {
524 let mut rhs = self.pop().unwrap().unwrap_class_unicode();
525 let mut lhs = self.pop().unwrap().unwrap_class_unicode();
526 let mut cls = self.pop().unwrap().unwrap_class_unicode();
527 if self.flags().case_insensitive() {
528 rhs.case_fold_simple();
529 lhs.case_fold_simple();
530 }
531 match op.kind {
532 Intersection => lhs.intersect(&rhs),
533 Difference => lhs.difference(&rhs),
534 SymmetricDifference => lhs.symmetric_difference(&rhs),
535 }
536 cls.union(&lhs);
537 self.push(HirFrame::ClassUnicode(cls));
538 } else {
539 let mut rhs = self.pop().unwrap().unwrap_class_bytes();
540 let mut lhs = self.pop().unwrap().unwrap_class_bytes();
541 let mut cls = self.pop().unwrap().unwrap_class_bytes();
542 if self.flags().case_insensitive() {
543 rhs.case_fold_simple();
544 lhs.case_fold_simple();
545 }
546 match op.kind {
547 Intersection => lhs.intersect(&rhs),
548 Difference => lhs.difference(&rhs),
549 SymmetricDifference => lhs.symmetric_difference(&rhs),
550 }
551 cls.union(&lhs);
552 self.push(HirFrame::ClassBytes(cls));
553 }
554 Ok(())
555 }
556 }
557
558 /// The internal implementation of a translator.
559 ///
560 /// This type is responsible for carrying around the original pattern string,
561 /// which is not tied to the internal state of a translator.
562 ///
563 /// A TranslatorI exists for the time it takes to translate a single Ast.
564 #[derive(Clone, Debug)]
565 struct TranslatorI<'t, 'p> {
566 trans: &'t Translator,
567 pattern: &'p str,
568 }
569
570 impl<'t, 'p> TranslatorI<'t, 'p> {
571 /// Build a new internal translator.
new(trans: &'t Translator, pattern: &'p str) -> TranslatorI<'t, 'p>572 fn new(trans: &'t Translator, pattern: &'p str) -> TranslatorI<'t, 'p> {
573 TranslatorI { trans: trans, pattern: pattern }
574 }
575
576 /// Return a reference to the underlying translator.
trans(&self) -> &Translator577 fn trans(&self) -> &Translator {
578 &self.trans
579 }
580
581 /// Push the given frame on to the call stack.
push(&self, frame: HirFrame)582 fn push(&self, frame: HirFrame) {
583 self.trans().stack.borrow_mut().push(frame);
584 }
585
586 /// Pop the top of the call stack. If the call stack is empty, return None.
pop(&self) -> Option<HirFrame>587 fn pop(&self) -> Option<HirFrame> {
588 self.trans().stack.borrow_mut().pop()
589 }
590
591 /// Create a new error with the given span and error type.
error(&self, span: Span, kind: ErrorKind) -> Error592 fn error(&self, span: Span, kind: ErrorKind) -> Error {
593 Error { kind: kind, pattern: self.pattern.to_string(), span: span }
594 }
595
596 /// Return a copy of the active flags.
flags(&self) -> Flags597 fn flags(&self) -> Flags {
598 self.trans().flags.get()
599 }
600
601 /// Set the flags of this translator from the flags set in the given AST.
602 /// Then, return the old flags.
set_flags(&self, ast_flags: &ast::Flags) -> Flags603 fn set_flags(&self, ast_flags: &ast::Flags) -> Flags {
604 let old_flags = self.flags();
605 let mut new_flags = Flags::from_ast(ast_flags);
606 new_flags.merge(&old_flags);
607 self.trans().flags.set(new_flags);
608 old_flags
609 }
610
hir_literal(&self, lit: &ast::Literal) -> Result<Hir>611 fn hir_literal(&self, lit: &ast::Literal) -> Result<Hir> {
612 let ch = match self.literal_to_char(lit)? {
613 byte @ hir::Literal::Byte(_) => return Ok(Hir::literal(byte)),
614 hir::Literal::Unicode(ch) => ch,
615 };
616 if self.flags().case_insensitive() {
617 self.hir_from_char_case_insensitive(lit.span, ch)
618 } else {
619 self.hir_from_char(lit.span, ch)
620 }
621 }
622
623 /// Convert an Ast literal to its scalar representation.
624 ///
625 /// When Unicode mode is enabled, then this always succeeds and returns a
626 /// `char` (Unicode scalar value).
627 ///
628 /// When Unicode mode is disabled, then a raw byte is returned. If that
629 /// byte is not ASCII and invalid UTF-8 is not allowed, then this returns
630 /// an error.
literal_to_char(&self, lit: &ast::Literal) -> Result<hir::Literal>631 fn literal_to_char(&self, lit: &ast::Literal) -> Result<hir::Literal> {
632 if self.flags().unicode() {
633 return Ok(hir::Literal::Unicode(lit.c));
634 }
635 let byte = match lit.byte() {
636 None => return Ok(hir::Literal::Unicode(lit.c)),
637 Some(byte) => byte,
638 };
639 if byte <= 0x7F {
640 return Ok(hir::Literal::Unicode(byte as char));
641 }
642 if !self.trans().allow_invalid_utf8 {
643 return Err(self.error(lit.span, ErrorKind::InvalidUtf8));
644 }
645 Ok(hir::Literal::Byte(byte))
646 }
647
hir_from_char(&self, span: Span, c: char) -> Result<Hir>648 fn hir_from_char(&self, span: Span, c: char) -> Result<Hir> {
649 if !self.flags().unicode() && c.len_utf8() > 1 {
650 return Err(self.error(span, ErrorKind::UnicodeNotAllowed));
651 }
652 Ok(Hir::literal(hir::Literal::Unicode(c)))
653 }
654
hir_from_char_case_insensitive( &self, span: Span, c: char, ) -> Result<Hir>655 fn hir_from_char_case_insensitive(
656 &self,
657 span: Span,
658 c: char,
659 ) -> Result<Hir> {
660 // If case folding won't do anything, then don't bother trying.
661 if !unicode::contains_simple_case_mapping(c, c) {
662 return self.hir_from_char(span, c);
663 }
664 if self.flags().unicode() {
665 let mut cls = hir::ClassUnicode::new(vec![
666 hir::ClassUnicodeRange::new(c, c),
667 ]);
668 cls.case_fold_simple();
669 Ok(Hir::class(hir::Class::Unicode(cls)))
670 } else {
671 if c.len_utf8() > 1 {
672 return Err(self.error(span, ErrorKind::UnicodeNotAllowed));
673 }
674 let mut cls = hir::ClassBytes::new(vec![
675 hir::ClassBytesRange::new(c as u8, c as u8),
676 ]);
677 cls.case_fold_simple();
678 Ok(Hir::class(hir::Class::Bytes(cls)))
679 }
680 }
681
hir_dot(&self, span: Span) -> Result<Hir>682 fn hir_dot(&self, span: Span) -> Result<Hir> {
683 let unicode = self.flags().unicode();
684 if !unicode && !self.trans().allow_invalid_utf8 {
685 return Err(self.error(span, ErrorKind::InvalidUtf8));
686 }
687 Ok(if self.flags().dot_matches_new_line() {
688 Hir::any(!unicode)
689 } else {
690 Hir::dot(!unicode)
691 })
692 }
693
hir_assertion(&self, asst: &ast::Assertion) -> Result<Hir>694 fn hir_assertion(&self, asst: &ast::Assertion) -> Result<Hir> {
695 let unicode = self.flags().unicode();
696 let multi_line = self.flags().multi_line();
697 Ok(match asst.kind {
698 ast::AssertionKind::StartLine => {
699 Hir::anchor(if multi_line {
700 hir::Anchor::StartLine
701 } else {
702 hir::Anchor::StartText
703 })
704 }
705 ast::AssertionKind::EndLine => {
706 Hir::anchor(if multi_line {
707 hir::Anchor::EndLine
708 } else {
709 hir::Anchor::EndText
710 })
711 }
712 ast::AssertionKind::StartText => {
713 Hir::anchor(hir::Anchor::StartText)
714 }
715 ast::AssertionKind::EndText => {
716 Hir::anchor(hir::Anchor::EndText)
717 }
718 ast::AssertionKind::WordBoundary => {
719 Hir::word_boundary(if unicode {
720 hir::WordBoundary::Unicode
721 } else {
722 hir::WordBoundary::Ascii
723 })
724 }
725 ast::AssertionKind::NotWordBoundary => {
726 Hir::word_boundary(if unicode {
727 hir::WordBoundary::UnicodeNegate
728 } else {
729 // It is possible for negated ASCII word boundaries to
730 // match at invalid UTF-8 boundaries, even when searching
731 // valid UTF-8.
732 if !self.trans().allow_invalid_utf8 {
733 return Err(self.error(
734 asst.span, ErrorKind::InvalidUtf8));
735 }
736 hir::WordBoundary::AsciiNegate
737 })
738 }
739 })
740 }
741
hir_group(&self, group: &ast::Group, expr: Hir) -> Hir742 fn hir_group(&self, group: &ast::Group, expr: Hir) -> Hir {
743 let kind = match group.kind {
744 ast::GroupKind::CaptureIndex(idx) => {
745 hir::GroupKind::CaptureIndex(idx)
746 }
747 ast::GroupKind::CaptureName(ref capname) => {
748 hir::GroupKind::CaptureName {
749 name: capname.name.clone(),
750 index: capname.index,
751 }
752 }
753 ast::GroupKind::NonCapturing(_) => hir::GroupKind::NonCapturing,
754 };
755 Hir::group(hir::Group {
756 kind: kind,
757 hir: Box::new(expr),
758 })
759 }
760
hir_repetition(&self, rep: &ast::Repetition, expr: Hir) -> Hir761 fn hir_repetition(&self, rep: &ast::Repetition, expr: Hir) -> Hir {
762 let kind = match rep.op.kind {
763 ast::RepetitionKind::ZeroOrOne => hir::RepetitionKind::ZeroOrOne,
764 ast::RepetitionKind::ZeroOrMore => hir::RepetitionKind::ZeroOrMore,
765 ast::RepetitionKind::OneOrMore => hir::RepetitionKind::OneOrMore,
766 ast::RepetitionKind::Range(ast::RepetitionRange::Exactly(m)) => {
767 hir::RepetitionKind::Range(hir::RepetitionRange::Exactly(m))
768 }
769 ast::RepetitionKind::Range(ast::RepetitionRange::AtLeast(m)) => {
770 hir::RepetitionKind::Range(hir::RepetitionRange::AtLeast(m))
771 }
772 ast::RepetitionKind::Range(ast::RepetitionRange::Bounded(m,n)) => {
773 hir::RepetitionKind::Range(hir::RepetitionRange::Bounded(m, n))
774 }
775 };
776 let greedy =
777 if self.flags().swap_greed() {
778 !rep.greedy
779 } else {
780 rep.greedy
781 };
782 Hir::repetition(hir::Repetition {
783 kind: kind,
784 greedy: greedy,
785 hir: Box::new(expr),
786 })
787 }
788
hir_unicode_class( &self, ast_class: &ast::ClassUnicode, ) -> Result<hir::ClassUnicode>789 fn hir_unicode_class(
790 &self,
791 ast_class: &ast::ClassUnicode,
792 ) -> Result<hir::ClassUnicode> {
793 use ast::ClassUnicodeKind::*;
794
795 if !self.flags().unicode() {
796 return Err(self.error(
797 ast_class.span,
798 ErrorKind::UnicodeNotAllowed,
799 ));
800 }
801 let query = match ast_class.kind {
802 OneLetter(name) => ClassQuery::OneLetter(name),
803 Named(ref name) => ClassQuery::Binary(name),
804 NamedValue { ref name, ref value, .. } => {
805 ClassQuery::ByValue {
806 property_name: name,
807 property_value: value,
808 }
809 }
810 };
811 match unicode::class(query) {
812 Ok(mut class) => {
813 self.unicode_fold_and_negate(ast_class.negated, &mut class);
814 Ok(class)
815 }
816 Err(unicode::Error::PropertyNotFound) => {
817 Err(self.error(
818 ast_class.span,
819 ErrorKind::UnicodePropertyNotFound,
820 ))
821 }
822 Err(unicode::Error::PropertyValueNotFound) => {
823 Err(self.error(
824 ast_class.span,
825 ErrorKind::UnicodePropertyValueNotFound,
826 ))
827 }
828 }
829 }
830
hir_perl_unicode_class( &self, ast_class: &ast::ClassPerl, ) -> hir::ClassUnicode831 fn hir_perl_unicode_class(
832 &self,
833 ast_class: &ast::ClassPerl,
834 ) -> hir::ClassUnicode {
835 use ast::ClassPerlKind::*;
836 use unicode_tables::perl_word::PERL_WORD;
837
838 assert!(self.flags().unicode());
839 let mut class = match ast_class.kind {
840 Digit => {
841 let query = ClassQuery::Binary("Decimal_Number");
842 unicode::class(query).unwrap()
843 }
844 Space => {
845 let query = ClassQuery::Binary("Whitespace");
846 unicode::class(query).unwrap()
847 }
848 Word => unicode::hir_class(PERL_WORD),
849 };
850 // We needn't apply case folding here because the Perl Unicode classes
851 // are already closed under Unicode simple case folding.
852 if ast_class.negated {
853 class.negate();
854 }
855 class
856 }
857
hir_perl_byte_class( &self, ast_class: &ast::ClassPerl, ) -> hir::ClassBytes858 fn hir_perl_byte_class(
859 &self,
860 ast_class: &ast::ClassPerl,
861 ) -> hir::ClassBytes {
862 use ast::ClassPerlKind::*;
863
864 assert!(!self.flags().unicode());
865 let mut class = match ast_class.kind {
866 Digit => hir_ascii_class_bytes(&ast::ClassAsciiKind::Digit),
867 Space => hir_ascii_class_bytes(&ast::ClassAsciiKind::Space),
868 Word => hir_ascii_class_bytes(&ast::ClassAsciiKind::Word),
869 };
870 // We needn't apply case folding here because the Perl ASCII classes
871 // are already closed (under ASCII case folding).
872 if ast_class.negated {
873 class.negate();
874 }
875 class
876 }
877
unicode_fold_and_negate( &self, negated: bool, class: &mut hir::ClassUnicode, )878 fn unicode_fold_and_negate(
879 &self,
880 negated: bool,
881 class: &mut hir::ClassUnicode,
882 ) {
883 // Note that we must apply case folding before negation!
884 // Consider `(?i)[^x]`. If we applied negation field, then
885 // the result would be the character class that matched any
886 // Unicode scalar value.
887 if self.flags().case_insensitive() {
888 class.case_fold_simple();
889 }
890 if negated {
891 class.negate();
892 }
893 }
894
bytes_fold_and_negate( &self, span: &Span, negated: bool, class: &mut hir::ClassBytes, ) -> Result<()>895 fn bytes_fold_and_negate(
896 &self,
897 span: &Span,
898 negated: bool,
899 class: &mut hir::ClassBytes,
900 ) -> Result<()> {
901 // Note that we must apply case folding before negation!
902 // Consider `(?i)[^x]`. If we applied negation field, then
903 // the result would be the character class that matched any
904 // Unicode scalar value.
905 if self.flags().case_insensitive() {
906 class.case_fold_simple();
907 }
908 if negated {
909 class.negate();
910 }
911 if !self.trans().allow_invalid_utf8 && !class.is_all_ascii() {
912 return Err(self.error(span.clone(), ErrorKind::InvalidUtf8));
913 }
914 Ok(())
915 }
916
917 /// Return a scalar byte value suitable for use as a literal in a byte
918 /// character class.
class_literal_byte(&self, ast: &ast::Literal) -> Result<u8>919 fn class_literal_byte(&self, ast: &ast::Literal) -> Result<u8> {
920 match self.literal_to_char(ast)? {
921 hir::Literal::Byte(byte) => Ok(byte),
922 hir::Literal::Unicode(ch) => {
923 if ch <= 0x7F as char {
924 Ok(ch as u8)
925 } else {
926 // We can't feasibly support Unicode in
927 // byte oriented classes. Byte classes don't
928 // do Unicode case folding.
929 Err(self.error(ast.span, ErrorKind::UnicodeNotAllowed))
930 }
931 }
932 }
933 }
934 }
935
/// A translator's view of a regular expression's flags at a given moment
/// in time.
///
/// Every flag has three possible states: absent (`None`), present but
/// disabled (`Some(false)`) or present and enabled (`Some(true)`).
#[derive(Debug, Default, Clone, Copy)]
struct Flags {
    /// The `i` flag.
    case_insensitive: Option<bool>,
    /// The `m` flag.
    multi_line: Option<bool>,
    /// The `s` flag.
    dot_matches_new_line: Option<bool>,
    /// The `U` flag.
    swap_greed: Option<bool>,
    /// The `u` flag.
    unicode: Option<bool>,
    // `ignore_whitespace` has no field here because it is handled entirely
    // in the parser.
}
951
952 impl Flags {
from_ast(ast: &ast::Flags) -> Flags953 fn from_ast(ast: &ast::Flags) -> Flags {
954 let mut flags = Flags::default();
955 let mut enable = true;
956 for item in &ast.items {
957 match item.kind {
958 ast::FlagsItemKind::Negation => {
959 enable = false;
960 }
961 ast::FlagsItemKind::Flag(ast::Flag::CaseInsensitive) => {
962 flags.case_insensitive = Some(enable);
963 }
964 ast::FlagsItemKind::Flag(ast::Flag::MultiLine) => {
965 flags.multi_line = Some(enable);
966 }
967 ast::FlagsItemKind::Flag(ast::Flag::DotMatchesNewLine) => {
968 flags.dot_matches_new_line = Some(enable);
969 }
970 ast::FlagsItemKind::Flag(ast::Flag::SwapGreed) => {
971 flags.swap_greed = Some(enable);
972 }
973 ast::FlagsItemKind::Flag(ast::Flag::Unicode) => {
974 flags.unicode = Some(enable);
975 }
976 ast::FlagsItemKind::Flag(ast::Flag::IgnoreWhitespace) => {}
977 }
978 }
979 flags
980 }
981
merge(&mut self, previous: &Flags)982 fn merge(&mut self, previous: &Flags) {
983 if self.case_insensitive.is_none() {
984 self.case_insensitive = previous.case_insensitive;
985 }
986 if self.multi_line.is_none() {
987 self.multi_line = previous.multi_line;
988 }
989 if self.dot_matches_new_line.is_none() {
990 self.dot_matches_new_line = previous.dot_matches_new_line;
991 }
992 if self.swap_greed.is_none() {
993 self.swap_greed = previous.swap_greed;
994 }
995 if self.unicode.is_none() {
996 self.unicode = previous.unicode;
997 }
998 }
999
case_insensitive(&self) -> bool1000 fn case_insensitive(&self) -> bool {
1001 self.case_insensitive.unwrap_or(false)
1002 }
1003
multi_line(&self) -> bool1004 fn multi_line(&self) -> bool {
1005 self.multi_line.unwrap_or(false)
1006 }
1007
dot_matches_new_line(&self) -> bool1008 fn dot_matches_new_line(&self) -> bool {
1009 self.dot_matches_new_line.unwrap_or(false)
1010 }
1011
swap_greed(&self) -> bool1012 fn swap_greed(&self) -> bool {
1013 self.swap_greed.unwrap_or(false)
1014 }
1015
unicode(&self) -> bool1016 fn unicode(&self) -> bool {
1017 self.unicode.unwrap_or(true)
1018 }
1019 }
1020
hir_ascii_class_bytes(kind: &ast::ClassAsciiKind) -> hir::ClassBytes1021 fn hir_ascii_class_bytes(kind: &ast::ClassAsciiKind) -> hir::ClassBytes {
1022 let ranges: Vec<_> = ascii_class(kind).iter().cloned().map(|(s, e)| {
1023 hir::ClassBytesRange::new(s as u8, e as u8)
1024 }).collect();
1025 hir::ClassBytes::new(ranges)
1026 }
1027
ascii_class(kind: &ast::ClassAsciiKind) -> &'static [(char, char)]1028 fn ascii_class(kind: &ast::ClassAsciiKind) -> &'static [(char, char)] {
1029 use ast::ClassAsciiKind::*;
1030
1031 // The contortions below with `const` appear necessary for older versions
1032 // of Rust.
1033 type T = &'static [(char, char)];
1034 match *kind {
1035 Alnum => {
1036 const X: T = &[('0', '9'), ('A', 'Z'), ('a', 'z')];
1037 X
1038 }
1039 Alpha => {
1040 const X: T = &[('A', 'Z'), ('a', 'z')];
1041 X
1042 }
1043 Ascii => {
1044 const X: T = &[('\x00', '\x7F')];
1045 X
1046 }
1047 Blank => {
1048 const X: T = &[('\t', '\t'), (' ', ' ')];
1049 X
1050 }
1051 Cntrl => {
1052 const X: T = &[('\x00', '\x1F'), ('\x7F', '\x7F')];
1053 X
1054 }
1055 Digit => {
1056 const X: T = &[('0', '9')];
1057 X
1058 }
1059 Graph => {
1060 const X: T = &[('!', '~')];
1061 X
1062 }
1063 Lower => {
1064 const X: T = &[('a', 'z')];
1065 X
1066 }
1067 Print => {
1068 const X: T = &[(' ', '~')];
1069 X
1070 }
1071 Punct => {
1072 const X: T = &[('!', '/'), (':', '@'), ('[', '`'), ('{', '~')];
1073 X
1074 }
1075 Space => {
1076 const X: T = &[
1077 ('\t', '\t'), ('\n', '\n'), ('\x0B', '\x0B'), ('\x0C', '\x0C'),
1078 ('\r', '\r'), (' ', ' '),
1079 ];
1080 X
1081 }
1082 Upper => {
1083 const X: T = &[('A', 'Z')];
1084 X
1085 }
1086 Word => {
1087 const X: T = &[('0', '9'), ('A', 'Z'), ('_', '_'), ('a', 'z')];
1088 X
1089 }
1090 Xdigit => {
1091 const X: T = &[('0', '9'), ('A', 'F'), ('a', 'f')];
1092 X
1093 }
1094 }
1095 }
1096
1097 #[cfg(test)]
1098 mod tests {
1099 use ast::{self, Ast, Position, Span};
1100 use ast::parse::ParserBuilder;
1101 use hir::{self, Hir, HirKind};
1102 use unicode::{self, ClassQuery};
1103
1104 use super::{TranslatorBuilder, ascii_class};
1105
// We create these errors to compare with real hir::Errors in the tests.
// We define equality between TestError and hir::Error to disregard the
// pattern string in hir::Error, which is annoying to provide in tests.
// Only the span and the kind take part in the comparison.
#[derive(Clone, Debug)]
struct TestError {
    // The span the translation error is expected to point at.
    span: Span,
    // The kind of translation error that is expected.
    kind: hir::ErrorKind,
}
1114
1115 impl PartialEq<hir::Error> for TestError {
eq(&self, other: &hir::Error) -> bool1116 fn eq(&self, other: &hir::Error) -> bool {
1117 self.span == other.span && self.kind == other.kind
1118 }
1119 }
1120
1121 impl PartialEq<TestError> for hir::Error {
eq(&self, other: &TestError) -> bool1122 fn eq(&self, other: &TestError) -> bool {
1123 self.span == other.span && self.kind == other.kind
1124 }
1125 }
1126
parse(pattern: &str) -> Ast1127 fn parse(pattern: &str) -> Ast {
1128 ParserBuilder::new().octal(true).build().parse(pattern).unwrap()
1129 }
1130
t(pattern: &str) -> Hir1131 fn t(pattern: &str) -> Hir {
1132 TranslatorBuilder::new()
1133 .allow_invalid_utf8(false)
1134 .build()
1135 .translate(pattern, &parse(pattern))
1136 .unwrap()
1137 }
1138
t_err(pattern: &str) -> hir::Error1139 fn t_err(pattern: &str) -> hir::Error {
1140 TranslatorBuilder::new()
1141 .allow_invalid_utf8(false)
1142 .build()
1143 .translate(pattern, &parse(pattern))
1144 .unwrap_err()
1145 }
1146
t_bytes(pattern: &str) -> Hir1147 fn t_bytes(pattern: &str) -> Hir {
1148 TranslatorBuilder::new()
1149 .allow_invalid_utf8(true)
1150 .build()
1151 .translate(pattern, &parse(pattern))
1152 .unwrap()
1153 }
1154
hir_lit(s: &str) -> Hir1155 fn hir_lit(s: &str) -> Hir {
1156 match s.len() {
1157 0 => Hir::empty(),
1158 _ => {
1159 let lits = s
1160 .chars()
1161 .map(hir::Literal::Unicode)
1162 .map(Hir::literal)
1163 .collect();
1164 Hir::concat(lits)
1165 }
1166 }
1167 }
1168
hir_blit(s: &[u8]) -> Hir1169 fn hir_blit(s: &[u8]) -> Hir {
1170 match s.len() {
1171 0 => Hir::empty(),
1172 1 => Hir::literal(hir::Literal::Byte(s[0])),
1173 _ => {
1174 let lits = s
1175 .iter()
1176 .cloned()
1177 .map(hir::Literal::Byte)
1178 .map(Hir::literal)
1179 .collect();
1180 Hir::concat(lits)
1181 }
1182 }
1183 }
1184
hir_group(i: u32, expr: Hir) -> Hir1185 fn hir_group(i: u32, expr: Hir) -> Hir {
1186 Hir::group(hir::Group {
1187 kind: hir::GroupKind::CaptureIndex(i),
1188 hir: Box::new(expr),
1189 })
1190 }
1191
hir_group_name(i: u32, name: &str, expr: Hir) -> Hir1192 fn hir_group_name(i: u32, name: &str, expr: Hir) -> Hir {
1193 Hir::group(hir::Group {
1194 kind: hir::GroupKind::CaptureName {
1195 name: name.to_string(),
1196 index: i,
1197 },
1198 hir: Box::new(expr),
1199 })
1200 }
1201
hir_group_nocap(expr: Hir) -> Hir1202 fn hir_group_nocap(expr: Hir) -> Hir {
1203 Hir::group(hir::Group {
1204 kind: hir::GroupKind::NonCapturing,
1205 hir: Box::new(expr),
1206 })
1207 }
1208
hir_quest(greedy: bool, expr: Hir) -> Hir1209 fn hir_quest(greedy: bool, expr: Hir) -> Hir {
1210 Hir::repetition(hir::Repetition {
1211 kind: hir::RepetitionKind::ZeroOrOne,
1212 greedy: greedy,
1213 hir: Box::new(expr),
1214 })
1215 }
1216
hir_star(greedy: bool, expr: Hir) -> Hir1217 fn hir_star(greedy: bool, expr: Hir) -> Hir {
1218 Hir::repetition(hir::Repetition {
1219 kind: hir::RepetitionKind::ZeroOrMore,
1220 greedy: greedy,
1221 hir: Box::new(expr),
1222 })
1223 }
1224
hir_plus(greedy: bool, expr: Hir) -> Hir1225 fn hir_plus(greedy: bool, expr: Hir) -> Hir {
1226 Hir::repetition(hir::Repetition {
1227 kind: hir::RepetitionKind::OneOrMore,
1228 greedy: greedy,
1229 hir: Box::new(expr),
1230 })
1231 }
1232
hir_range(greedy: bool, range: hir::RepetitionRange, expr: Hir) -> Hir1233 fn hir_range(greedy: bool, range: hir::RepetitionRange, expr: Hir) -> Hir {
1234 Hir::repetition(hir::Repetition {
1235 kind: hir::RepetitionKind::Range(range),
1236 greedy: greedy,
1237 hir: Box::new(expr),
1238 })
1239 }
1240
/// Build an alternation of the given expressions via `Hir::alternation`.
fn hir_alt(alts: Vec<Hir>) -> Hir {
    Hir::alternation(alts)
}
1244
/// Build a concatenation of the given expressions via `Hir::concat`.
fn hir_cat(exprs: Vec<Hir>) -> Hir {
    Hir::concat(exprs)
}
1248
hir_uclass_query(query: ClassQuery) -> Hir1249 fn hir_uclass_query(query: ClassQuery) -> Hir {
1250 Hir::class(hir::Class::Unicode(unicode::class(query).unwrap()))
1251 }
1252
hir_uclass_perl_word() -> Hir1253 fn hir_uclass_perl_word() -> Hir {
1254 use unicode_tables::perl_word::PERL_WORD;
1255 Hir::class(hir::Class::Unicode(unicode::hir_class(PERL_WORD)))
1256 }
1257
hir_uclass(ranges: &[(char, char)]) -> Hir1258 fn hir_uclass(ranges: &[(char, char)]) -> Hir {
1259 let ranges: Vec<hir::ClassUnicodeRange> = ranges
1260 .iter()
1261 .map(|&(s, e)| hir::ClassUnicodeRange::new(s, e))
1262 .collect();
1263 Hir::class(hir::Class::Unicode(hir::ClassUnicode::new(ranges)))
1264 }
1265
hir_bclass(ranges: &[(u8, u8)]) -> Hir1266 fn hir_bclass(ranges: &[(u8, u8)]) -> Hir {
1267 let ranges: Vec<hir::ClassBytesRange> = ranges
1268 .iter()
1269 .map(|&(s, e)| hir::ClassBytesRange::new(s, e))
1270 .collect();
1271 Hir::class(hir::Class::Bytes(hir::ClassBytes::new(ranges)))
1272 }
1273
hir_bclass_from_char(ranges: &[(char, char)]) -> Hir1274 fn hir_bclass_from_char(ranges: &[(char, char)]) -> Hir {
1275 let ranges: Vec<hir::ClassBytesRange> = ranges
1276 .iter()
1277 .map(|&(s, e)| {
1278 assert!(s as u32 <= 0x7F);
1279 assert!(e as u32 <= 0x7F);
1280 hir::ClassBytesRange::new(s as u8, e as u8)
1281 })
1282 .collect();
1283 Hir::class(hir::Class::Bytes(hir::ClassBytes::new(ranges)))
1284 }
1285
hir_case_fold(expr: Hir) -> Hir1286 fn hir_case_fold(expr: Hir) -> Hir {
1287 match expr.into_kind() {
1288 HirKind::Class(mut cls) => {
1289 cls.case_fold_simple();
1290 Hir::class(cls)
1291 }
1292 _ => panic!("cannot case fold non-class Hir expr"),
1293 }
1294 }
1295
hir_negate(expr: Hir) -> Hir1296 fn hir_negate(expr: Hir) -> Hir {
1297 match expr.into_kind() {
1298 HirKind::Class(mut cls) => {
1299 cls.negate();
1300 Hir::class(cls)
1301 }
1302 _ => panic!("cannot negate non-class Hir expr"),
1303 }
1304 }
1305
hir_union(expr1: Hir, expr2: Hir) -> Hir1306 fn hir_union(expr1: Hir, expr2: Hir) -> Hir {
1307 use hir::Class::{Bytes, Unicode};
1308
1309 match (expr1.into_kind(), expr2.into_kind()) {
1310 (
1311 HirKind::Class(Unicode(mut c1)),
1312 HirKind::Class(Unicode(c2)),
1313 ) => {
1314 c1.union(&c2);
1315 Hir::class(hir::Class::Unicode(c1))
1316 }
1317 (
1318 HirKind::Class(Bytes(mut c1)),
1319 HirKind::Class(Bytes(c2)),
1320 ) => {
1321 c1.union(&c2);
1322 Hir::class(hir::Class::Bytes(c1))
1323 }
1324 _ => panic!("cannot union non-class Hir exprs"),
1325 }
1326 }
1327
hir_difference(expr1: Hir, expr2: Hir) -> Hir1328 fn hir_difference(expr1: Hir, expr2: Hir) -> Hir {
1329 use hir::Class::{Bytes, Unicode};
1330
1331 match (expr1.into_kind(), expr2.into_kind()) {
1332 (
1333 HirKind::Class(Unicode(mut c1)),
1334 HirKind::Class(Unicode(c2)),
1335 ) => {
1336 c1.difference(&c2);
1337 Hir::class(hir::Class::Unicode(c1))
1338 }
1339 (
1340 HirKind::Class(Bytes(mut c1)),
1341 HirKind::Class(Bytes(c2)),
1342 ) => {
1343 c1.difference(&c2);
1344 Hir::class(hir::Class::Bytes(c1))
1345 }
1346 _ => panic!("cannot difference non-class Hir exprs"),
1347 }
1348 }
1349
/// Build an anchor expression via `Hir::anchor`.
fn hir_anchor(anchor: hir::Anchor) -> Hir {
    Hir::anchor(anchor)
}
1353
/// Build a word boundary expression via `Hir::word_boundary`.
fn hir_word(wb: hir::WordBoundary) -> Hir {
    Hir::word_boundary(wb)
}
1357
// Patterns built entirely out of empty sub-expressions.
#[test]
fn empty() {
    // Shorthand for the empty expression, which shows up in every case.
    let e = Hir::empty;

    assert_eq!(t(""), e());
    assert_eq!(t("(?i)"), e());
    assert_eq!(t("()"), hir_group(1, e()));
    assert_eq!(t("(?:)"), hir_group_nocap(e()));
    assert_eq!(t("(?P<wat>)"), hir_group_name(1, "wat", e()));
    assert_eq!(t("|"), hir_alt(vec![e(), e()]));
    assert_eq!(
        t("()|()"),
        hir_alt(vec![hir_group(1, e()), hir_group(2, e())]));
    assert_eq!(
        t("(|b)"),
        hir_group(1, hir_alt(vec![e(), hir_lit("b")])));
    assert_eq!(
        t("(a|)"),
        hir_group(1, hir_alt(vec![hir_lit("a"), e()])));
    assert_eq!(
        t("(a||c)"),
        hir_group(1, hir_alt(vec![hir_lit("a"), e(), hir_lit("c")])));
    assert_eq!(
        t("(||)"),
        hir_group(1, hir_alt(vec![e(), e(), e()])));
}
1389
// Plain literals, in both Unicode and byte oriented modes.
#[test]
fn literal() {
    let a = || hir_lit("a");
    let err = |kind, start, end| TestError {
        kind: kind,
        span: Span::new(start, end),
    };

    assert_eq!(t("a"), a());
    assert_eq!(t("(?-u)a"), a());
    assert_eq!(t("☃"), hir_lit("☃"));
    assert_eq!(t("abcd"), hir_lit("abcd"));

    assert_eq!(t_bytes("(?-u)a"), a());
    assert_eq!(t_bytes("(?-u)\x61"), a());
    assert_eq!(t_bytes(r"(?-u)\x61"), a());
    assert_eq!(t_bytes(r"(?-u)\xFF"), hir_blit(b"\xFF"));

    assert_eq!(
        t_err("(?-u)☃"),
        err(
            hir::ErrorKind::UnicodeNotAllowed,
            Position::new(5, 1, 6),
            Position::new(8, 1, 7),
        ));
    assert_eq!(
        t_err(r"(?-u)\xFF"),
        err(
            hir::ErrorKind::InvalidUtf8,
            Position::new(5, 1, 6),
            Position::new(9, 1, 10),
        ));
}
1411
// Literals under the `i` flag, in both Unicode and byte oriented modes.
#[test]
fn literal_case_insensitive() {
    // Two-member classes made of single-character (or single-byte) ranges.
    let u2 = |x: char, y: char| hir_uclass(&[(x, x), (y, y)]);
    let b2 = |x: u8, y: u8| hir_bclass(&[(x, x), (y, y)]);

    assert_eq!(t("(?i)a"), u2('A', 'a'));
    assert_eq!(t("(?i:a)"), hir_group_nocap(u2('A', 'a')));
    assert_eq!(
        t("a(?i)a(?-i)a"),
        hir_cat(vec![hir_lit("a"), u2('A', 'a'), hir_lit("a")]));
    assert_eq!(
        t("(?i)ab@c"),
        hir_cat(vec![
            u2('A', 'a'),
            u2('B', 'b'),
            hir_lit("@"),
            u2('C', 'c'),
        ]));
    assert_eq!(
        t("(?i)β"),
        hir_uclass(&[('Β', 'Β'), ('β', 'β'), ('ϐ', 'ϐ')]));

    assert_eq!(t("(?i-u)a"), b2(b'A', b'a'));
    assert_eq!(
        t("(?-u)a(?i)a(?-i)a"),
        hir_cat(vec![hir_lit("a"), b2(b'A', b'a'), hir_lit("a")]));
    assert_eq!(
        t("(?i-u)ab@c"),
        hir_cat(vec![
            b2(b'A', b'a'),
            b2(b'B', b'b'),
            hir_lit("@"),
            b2(b'C', b'c'),
        ]));

    assert_eq!(t_bytes("(?i-u)a"), b2(b'A', b'a'));
    assert_eq!(t_bytes("(?i-u)\x61"), b2(b'A', b'a'));
    assert_eq!(t_bytes(r"(?i-u)\x61"), b2(b'A', b'a'));
    assert_eq!(t_bytes(r"(?i-u)\xFF"), hir_blit(b"\xFF"));

    let expected = TestError {
        kind: hir::ErrorKind::UnicodeNotAllowed,
        span: Span::new(Position::new(6, 1, 7), Position::new(8, 1, 8)),
    };
    assert_eq!(t_err("(?i-u)β"), expected);
}
1469
// The `.` operator, with and without the `s` and `u` flags.
#[test]
fn dot() {
    let invalid_utf8 = |start, end| TestError {
        kind: hir::ErrorKind::InvalidUtf8,
        span: Span::new(start, end),
    };

    assert_eq!(
        t("."),
        hir_uclass(&[('\0', '\t'), ('\x0B', '\u{10FFFF}')]));
    assert_eq!(t("(?s)."), hir_uclass(&[('\0', '\u{10FFFF}')]));
    assert_eq!(
        t_bytes("(?-u)."),
        hir_bclass(&[(b'\0', b'\t'), (b'\x0B', b'\xFF')]));
    assert_eq!(t_bytes("(?s-u)."), hir_bclass(&[(b'\0', b'\xFF')]));

    // If invalid UTF-8 isn't allowed, then non-Unicode `.` isn't allowed.
    assert_eq!(
        t_err("(?-u)."),
        invalid_utf8(Position::new(5, 1, 6), Position::new(6, 1, 7)));
    assert_eq!(
        t_err("(?s-u)."),
        invalid_utf8(Position::new(6, 1, 7), Position::new(7, 1, 8)));
}
1497
// Anchors and word boundaries.
#[test]
fn assertions() {
    use hir::{Anchor, WordBoundary};

    // Text anchors; `\A` and `\z` are unaffected by multi-line mode.
    assert_eq!(t("^"), hir_anchor(Anchor::StartText));
    assert_eq!(t("$"), hir_anchor(Anchor::EndText));
    assert_eq!(t(r"\A"), hir_anchor(Anchor::StartText));
    assert_eq!(t(r"\z"), hir_anchor(Anchor::EndText));
    assert_eq!(t("(?m)^"), hir_anchor(Anchor::StartLine));
    assert_eq!(t("(?m)$"), hir_anchor(Anchor::EndLine));
    assert_eq!(t(r"(?m)\A"), hir_anchor(Anchor::StartText));
    assert_eq!(t(r"(?m)\z"), hir_anchor(Anchor::EndText));

    // Word boundaries.
    assert_eq!(t(r"\b"), hir_word(WordBoundary::Unicode));
    assert_eq!(t(r"\B"), hir_word(WordBoundary::UnicodeNegate));
    assert_eq!(t(r"(?-u)\b"), hir_word(WordBoundary::Ascii));
    assert_eq!(t_bytes(r"(?-u)\B"), hir_word(WordBoundary::AsciiNegate));

    // A negated ASCII word boundary is rejected when invalid UTF-8 is
    // disallowed.
    let expected = TestError {
        kind: hir::ErrorKind::InvalidUtf8,
        span: Span::new(Position::new(5, 1, 6), Position::new(7, 1, 8)),
    };
    assert_eq!(t_err(r"(?-u)\B"), expected);
}
1521
// Capturing, named and non-capturing groups, and how they are numbered.
#[test]
fn group() {
    let a = || hir_lit("a");
    let b = || hir_lit("b");
    let c = || hir_lit("c");
    let e = Hir::empty;

    assert_eq!(t("(a)"), hir_group(1, a()));
    assert_eq!(
        t("(a)(b)"),
        hir_cat(vec![hir_group(1, a()), hir_group(2, b())]));
    assert_eq!(
        t("(a)|(b)"),
        hir_alt(vec![hir_group(1, a()), hir_group(2, b())]));
    assert_eq!(t("(?P<foo>)"), hir_group_name(1, "foo", e()));
    assert_eq!(t("(?P<foo>a)"), hir_group_name(1, "foo", a()));
    assert_eq!(
        t("(?P<foo>a)(?P<bar>b)"),
        hir_cat(vec![
            hir_group_name(1, "foo", a()),
            hir_group_name(2, "bar", b()),
        ]));
    assert_eq!(t("(?:)"), hir_group_nocap(e()));
    assert_eq!(t("(?:a)"), hir_group_nocap(a()));
    assert_eq!(
        t("(?:a)(b)"),
        hir_cat(vec![hir_group_nocap(a()), hir_group(1, b())]));
    assert_eq!(
        t("(a)(?:b)(c)"),
        hir_cat(vec![
            hir_group(1, a()),
            hir_group_nocap(b()),
            hir_group(2, c()),
        ]));
    assert_eq!(
        t("(a)(?P<foo>b)(c)"),
        hir_cat(vec![
            hir_group(1, a()),
            hir_group_name(2, "foo", b()),
            hir_group(3, c()),
        ]));
    assert_eq!(t("()"), hir_group(1, e()));
    assert_eq!(t("((?i))"), hir_group(1, e()));
    assert_eq!(t("((?x))"), hir_group(1, e()));
    assert_eq!(t("(((?x)))"), hir_group(1, hir_group(2, e())));
}
1560
// How flags are scoped by groups and toggled mid-pattern.
#[test]
fn flags() {
    let a = || hir_lit("a");
    // `a` under Unicode and byte oriented case insensitivity respectively.
    let ua = || hir_uclass(&[('A', 'A'), ('a', 'a')]);
    let ba = || hir_bclass(&[(b'A', b'A'), (b'a', b'a')]);

    assert_eq!(
        t("(?i:a)a"),
        hir_cat(vec![hir_group_nocap(ua()), a()]));
    assert_eq!(
        t("(?i-u:a)β"),
        hir_cat(vec![hir_group_nocap(ba()), hir_lit("β")]));
    assert_eq!(
        t("(?i)(?-i:a)a"),
        hir_cat(vec![hir_group_nocap(a()), ua()]));
    assert_eq!(
        t("(?im)a^"),
        hir_cat(vec![ua(), hir_anchor(hir::Anchor::StartLine)]));
    assert_eq!(
        t("(?im)a^(?i-m)a^"),
        hir_cat(vec![
            ua(),
            hir_anchor(hir::Anchor::StartLine),
            ua(),
            hir_anchor(hir::Anchor::StartText),
        ]));
    assert_eq!(
        t("(?U)a*a*?(?-U)a*a*?"),
        hir_cat(vec![
            hir_star(false, a()),
            hir_star(true, a()),
            hir_star(true, a()),
            hir_star(false, a()),
        ]));
    assert_eq!(
        t("(?:a(?i)a)a"),
        hir_cat(vec![
            hir_group_nocap(hir_cat(vec![a(), ua()])),
            a(),
        ]));
    assert_eq!(
        t("(?i)(?:a(?-i)a)a"),
        hir_cat(vec![
            hir_group_nocap(hir_cat(vec![ua(), a()])),
            ua(),
        ]));
}
1606
// Escaped meta characters translate to the corresponding literals.
#[test]
fn escape() {
    let pattern = r"\\\.\+\*\?\(\)\|\[\]\{\}\^\$\#";
    let actual = t(pattern);
    let expected = hir_lit(r"\.+*?()|[]{}^$#");
    assert_eq!(actual, expected);
}
1613
// Repetition operators, counted repetitions and their precedence.
#[test]
fn repetition() {
    use hir::RepetitionRange::{AtLeast, Bounded, Exactly};

    let a = || hir_lit("a");
    let b = || hir_lit("b");

    // Greedy, then lazy, forms of the basic operators.
    assert_eq!(t("a?"), hir_quest(true, a()));
    assert_eq!(t("a*"), hir_star(true, a()));
    assert_eq!(t("a+"), hir_plus(true, a()));
    assert_eq!(t("a??"), hir_quest(false, a()));
    assert_eq!(t("a*?"), hir_star(false, a()));
    assert_eq!(t("a+?"), hir_plus(false, a()));

    // Greedy, then lazy, forms of the counted repetitions.
    assert_eq!(t("a{1}"), hir_range(true, Exactly(1), a()));
    assert_eq!(t("a{1,}"), hir_range(true, AtLeast(1), a()));
    assert_eq!(t("a{1,2}"), hir_range(true, Bounded(1, 2), a()));
    assert_eq!(t("a{1}?"), hir_range(false, Exactly(1), a()));
    assert_eq!(t("a{1,}?"), hir_range(false, AtLeast(1), a()));
    assert_eq!(t("a{1,2}?"), hir_range(false, Bounded(1, 2), a()));

    // Interaction with concatenation, groups and alternation.
    assert_eq!(
        t("ab?"),
        hir_cat(vec![a(), hir_quest(true, b())]));
    assert_eq!(
        t("(ab)?"),
        hir_quest(true, hir_group(1, hir_cat(vec![a(), b()]))));
    assert_eq!(
        t("a|b?"),
        hir_alt(vec![a(), hir_quest(true, b())]));
}
1679
// Concatenations and alternations, with and without enclosing groups.
#[test]
fn cat_alt() {
    // Turn each string into its literal expression.
    let lits = |strs: &[&str]| -> Vec<Hir> {
        strs.iter().map(|s| hir_lit(s)).collect()
    };

    assert_eq!(t("(ab)"), hir_group(1, hir_cat(lits(&["a", "b"]))));
    assert_eq!(t("a|b"), hir_alt(lits(&["a", "b"])));
    assert_eq!(t("a|b|c"), hir_alt(lits(&["a", "b", "c"])));
    assert_eq!(t("ab|bc|cd"), hir_alt(lits(&["ab", "bc", "cd"])));
    assert_eq!(t("(a|b)"), hir_group(1, hir_alt(lits(&["a", "b"]))));
    assert_eq!(
        t("(a|b|c)"),
        hir_group(1, hir_alt(lits(&["a", "b", "c"]))));
    assert_eq!(
        t("(ab|bc|cd)"),
        hir_group(1, hir_alt(lits(&["ab", "bc", "cd"]))));
    assert_eq!(
        t("(ab|(bc|(cd)))"),
        hir_group(1, hir_alt(vec![
            hir_lit("ab"),
            hir_group(2, hir_alt(vec![
                hir_lit("bc"),
                hir_group(3, hir_lit("cd")),
            ])),
        ])));
}
1722
// Bracketed ASCII classes such as `[[:alnum:]]`.
#[test]
fn class_ascii() {
    // Every ASCII class, in Unicode mode, is a Unicode class made of the
    // ranges returned by `ascii_class`.
    let cases = [
        ("[[:alnum:]]", ast::ClassAsciiKind::Alnum),
        ("[[:alpha:]]", ast::ClassAsciiKind::Alpha),
        ("[[:ascii:]]", ast::ClassAsciiKind::Ascii),
        ("[[:blank:]]", ast::ClassAsciiKind::Blank),
        ("[[:cntrl:]]", ast::ClassAsciiKind::Cntrl),
        ("[[:digit:]]", ast::ClassAsciiKind::Digit),
        ("[[:graph:]]", ast::ClassAsciiKind::Graph),
        ("[[:lower:]]", ast::ClassAsciiKind::Lower),
        ("[[:print:]]", ast::ClassAsciiKind::Print),
        ("[[:punct:]]", ast::ClassAsciiKind::Punct),
        ("[[:space:]]", ast::ClassAsciiKind::Space),
        ("[[:upper:]]", ast::ClassAsciiKind::Upper),
        ("[[:word:]]", ast::ClassAsciiKind::Word),
        ("[[:xdigit:]]", ast::ClassAsciiKind::Xdigit),
    ];
    for case in cases.iter() {
        assert_eq!(t(case.0), hir_uclass(ascii_class(&case.1)));
    }

    let lower = || ascii_class(&ast::ClassAsciiKind::Lower);
    let invalid_utf8 = |start, end| TestError {
        kind: hir::ErrorKind::InvalidUtf8,
        span: Span::new(start, end),
    };

    assert_eq!(t("[[:^lower:]]"), hir_negate(hir_uclass(lower())));
    assert_eq!(
        t("(?i)[[:lower:]]"),
        hir_uclass(&[
            ('A', 'Z'), ('a', 'z'),
            ('\u{17F}', '\u{17F}'),
            ('\u{212A}', '\u{212A}'),
        ]));

    assert_eq!(t("(?-u)[[:lower:]]"), hir_bclass_from_char(lower()));
    assert_eq!(
        t("(?i-u)[[:lower:]]"),
        hir_case_fold(hir_bclass_from_char(lower())));

    assert_eq!(
        t_err("(?-u)[[:^lower:]]"),
        invalid_utf8(Position::new(6, 1, 7), Position::new(16, 1, 17)));
    assert_eq!(
        t_err("(?i-u)[[:^lower:]]"),
        invalid_utf8(Position::new(7, 1, 8), Position::new(17, 1, 18)));
}
1796
// Perl classes (`\d`, `\s`, `\w` and their negations).
#[test]
fn class_perl() {
    // Expected Unicode classes.
    let udigit = || hir_uclass_query(ClassQuery::Binary("digit"));
    let uspace = || hir_uclass_query(ClassQuery::Binary("space"));
    let uword = || hir_uclass_perl_word();
    // Expected ASCII-only (byte) classes.
    let bdigit = || {
        hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Digit))
    };
    let bspace = || {
        hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Space))
    };
    let bword = || {
        hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Word))
    };

    // Unicode
    assert_eq!(t(r"\d"), udigit());
    assert_eq!(t(r"\s"), uspace());
    assert_eq!(t(r"\w"), uword());
    assert_eq!(t(r"(?i)\d"), udigit());
    assert_eq!(t(r"(?i)\s"), uspace());
    assert_eq!(t(r"(?i)\w"), uword());

    // Unicode, negated
    assert_eq!(t(r"\D"), hir_negate(udigit()));
    assert_eq!(t(r"\S"), hir_negate(uspace()));
    assert_eq!(t(r"\W"), hir_negate(uword()));
    assert_eq!(t(r"(?i)\D"), hir_negate(udigit()));
    assert_eq!(t(r"(?i)\S"), hir_negate(uspace()));
    assert_eq!(t(r"(?i)\W"), hir_negate(uword()));

    // ASCII only
    assert_eq!(t(r"(?-u)\d"), bdigit());
    assert_eq!(t(r"(?-u)\s"), bspace());
    assert_eq!(t(r"(?-u)\w"), bword());
    assert_eq!(t(r"(?i-u)\d"), bdigit());
    assert_eq!(t(r"(?i-u)\s"), bspace());
    assert_eq!(t(r"(?i-u)\w"), bword());

    // ASCII only, negated
    assert_eq!(t(r"(?-u)\D"), hir_negate(bdigit()));
    assert_eq!(t(r"(?-u)\S"), hir_negate(bspace()));
    assert_eq!(t(r"(?-u)\W"), hir_negate(bword()));
    assert_eq!(t(r"(?i-u)\D"), hir_negate(bdigit()));
    assert_eq!(t(r"(?i-u)\S"), hir_negate(bspace()));
    assert_eq!(t(r"(?i-u)\W"), hir_negate(bword()));
}
1885
// Unicode classes written with `\p` and `\P`.
#[test]
fn class_unicode() {
    let q = |name| hir_uclass_query(ClassQuery::Binary(name));
    let err = |kind, start, end| TestError {
        kind: kind,
        span: Span::new(start, end),
    };

    // Each pattern is expected to equal the class for the query name it is
    // paired with.
    let plain = [
        (r"\pZ", "Z"),
        (r"\pz", "Z"),
        (r"\p{Separator}", "Z"),
        (r"\p{se PaRa ToR}", "Z"),
        (r"\p{gc:Separator}", "Z"),
        (r"\p{gc=Separator}", "Z"),
        (r"\p{Other}", "Other"),
        (r"\pC", "Other"),
    ];
    for &(pattern, name) in plain.iter() {
        assert_eq!(t(pattern), q(name));
    }

    // Negated forms.
    let negated = [r"\PZ", r"\P{separator}", r"\P{gc!=separator}"];
    for &pattern in negated.iter() {
        assert_eq!(t(pattern), hir_negate(q("Z")));
    }

    // Case folding is applied before negation.
    assert_eq!(t(r"\p{Greek}"), q("Greek"));
    assert_eq!(t(r"(?i)\p{Greek}"), hir_case_fold(q("Greek")));
    assert_eq!(
        t(r"(?i)\P{Greek}"),
        hir_negate(hir_case_fold(q("Greek"))));

    // The `any`, `assigned` and `ascii` names, with and without `gc:`.
    let special = [
        (r"\p{any}", "Any"),
        (r"\p{assigned}", "Assigned"),
        (r"\p{ascii}", "ASCII"),
        (r"\p{gc:any}", "Any"),
        (r"\p{gc:assigned}", "Assigned"),
        (r"\p{gc:ascii}", "ASCII"),
    ];
    for &(pattern, name) in special.iter() {
        assert_eq!(t(pattern), q(name));
    }

    // Unicode classes are rejected when Unicode mode is disabled.
    assert_eq!(
        t_err(r"(?-u)\pZ"),
        err(
            hir::ErrorKind::UnicodeNotAllowed,
            Position::new(5, 1, 6),
            Position::new(8, 1, 9),
        ));
    assert_eq!(
        t_err(r"(?-u)\p{Separator}"),
        err(
            hir::ErrorKind::UnicodeNotAllowed,
            Position::new(5, 1, 6),
            Position::new(18, 1, 19),
        ));

    // Unknown properties.
    assert_eq!(
        t_err(r"\pE"),
        err(
            hir::ErrorKind::UnicodePropertyNotFound,
            Position::new(0, 1, 1),
            Position::new(3, 1, 4),
        ));
    assert_eq!(
        t_err(r"\p{Foo}"),
        err(
            hir::ErrorKind::UnicodePropertyNotFound,
            Position::new(0, 1, 1),
            Position::new(7, 1, 8),
        ));

    // Unknown property values.
    assert_eq!(
        t_err(r"\p{gc:Foo}"),
        err(
            hir::ErrorKind::UnicodePropertyValueNotFound,
            Position::new(0, 1, 1),
            Position::new(10, 1, 11),
        ));
    assert_eq!(
        t_err(r"\p{sc:Foo}"),
        err(
            hir::ErrorKind::UnicodePropertyValueNotFound,
            Position::new(0, 1, 1),
            Position::new(10, 1, 11),
        ));
    assert_eq!(
        t_err(r"\p{scx:Foo}"),
        err(
            hir::ErrorKind::UnicodePropertyValueNotFound,
            Position::new(0, 1, 1),
            Position::new(11, 1, 12),
        ));
    assert_eq!(
        t_err(r"\p{age:Foo}"),
        err(
            hir::ErrorKind::UnicodePropertyValueNotFound,
            Position::new(0, 1, 1),
            Position::new(11, 1, 12),
        ));
}
1986
#[test]
fn class_bracketed() {
    // Builders for the expected values that recur throughout this test.
    let digit = || hir_uclass_query(ClassQuery::Binary("digit"));
    let separator = || hir_uclass_query(ClassQuery::Binary("separator"));
    let greek = || hir_uclass_query(ClassQuery::Binary("greek"));
    // A Unicode class holding exactly one codepoint.
    let single = |c: char| hir_uclass(&[(c, c)]);
    // A span on line 1 described by its start/end offsets (columns are
    // one-based, hence the `+ 1`).
    let span = |start: usize, end: usize| {
        Span::new(
            Position::new(start, 1, start + 1),
            Position::new(end, 1, end + 1),
        )
    };

    // Plain literals and ranges.
    assert_eq!(t("[a]"), single('a'));
    assert_eq!(t("[^[a]]"), hir_negate(single('a')));
    assert_eq!(t("[a-z]"), hir_uclass(&[('a', 'z')]));
    assert_eq!(t("[a-fd-h]"), hir_uclass(&[('a', 'h')]));
    assert_eq!(t("[a-fg-m]"), hir_uclass(&[('a', 'm')]));
    assert_eq!(t(r"[\x00]"), single('\0'));
    assert_eq!(t(r"[\n]"), single('\n'));
    assert_eq!(t("[\n]"), single('\n'));

    // Perl and Unicode property classes nested inside brackets.
    assert_eq!(t(r"[\d]"), digit());
    assert_eq!(t(r"[\pZ]"), separator());
    assert_eq!(t(r"[\p{separator}]"), separator());

    // A negated class wrapped in a negated bracket cancels out.
    assert_eq!(t(r"[^\D]"), digit());
    assert_eq!(t(r"[^\PZ]"), separator());
    assert_eq!(t(r"[^\P{separator}]"), separator());
    assert_eq!(t(r"(?i)[^\D]"), digit());
    assert_eq!(t(r"(?i)[^\P{greek}]"), hir_case_fold(greek()));

    // Byte-oriented classes.
    assert_eq!(t("(?-u)[a]"), hir_bclass(&[(b'a', b'a')]));
    assert_eq!(t(r"(?-u)[\x00]"), hir_bclass(&[(b'\0', b'\0')]));
    assert_eq!(
        t_bytes(r"(?-u)[\xFF]"),
        hir_bclass(&[(b'\xFF', b'\xFF')])
    );

    // Case insensitive classes.
    assert_eq!(t("(?i)[a]"), hir_uclass(&[('A', 'A'), ('a', 'a')]));
    assert_eq!(
        t("(?i)[k]"),
        hir_uclass(&[('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}')])
    );
    assert_eq!(
        t("(?i)[β]"),
        hir_uclass(&[('Β', 'Β'), ('β', 'β'), ('ϐ', 'ϐ')])
    );
    assert_eq!(
        t("(?i-u)[k]"),
        hir_bclass(&[(b'K', b'K'), (b'k', b'k')])
    );

    // Negated classes.
    assert_eq!(t("[^a]"), hir_negate(single('a')));
    assert_eq!(t(r"[^\x00]"), hir_negate(single('\0')));
    assert_eq!(
        t_bytes("(?-u)[^a]"),
        hir_negate(hir_bclass(&[(b'a', b'a')]))
    );
    assert_eq!(t(r"[^\d]"), hir_negate(digit()));
    assert_eq!(t(r"[^\pZ]"), hir_negate(separator()));
    assert_eq!(t(r"[^\p{separator}]"), hir_negate(separator()));
    assert_eq!(
        t(r"(?i)[^\p{greek}]"),
        hir_negate(hir_case_fold(greek()))
    );
    assert_eq!(
        t(r"(?i)[\P{greek}]"),
        hir_negate(hir_case_fold(greek()))
    );

    // Test some weird cases.
    assert_eq!(t(r"[\[]"), single('['));

    // Characters that double as set operators or range separators, used
    // as ordinary literals and range endpoints.
    let operator_literals = [
        (r"[&]", '&', '&'),
        (r"[\&]", '&', '&'),
        (r"[\&\&]", '&', '&'),
        (r"[\x00-&]", '\0', '&'),
        (r"[&-\xFF]", '&', '\u{FF}'),
        (r"[~]", '~', '~'),
        (r"[\~]", '~', '~'),
        (r"[\~\~]", '~', '~'),
        (r"[\x00-~]", '\0', '~'),
        (r"[~-\xFF]", '~', '\u{FF}'),
        (r"[-]", '-', '-'),
        (r"[\-]", '-', '-'),
        (r"[\-\-]", '-', '-'),
        (r"[\x00-\-]", '\0', '-'),
        (r"[\--\xFF]", '-', '\u{FF}'),
    ];
    for &(pattern, lo, hi) in operator_literals.iter() {
        assert_eq!(
            t(pattern),
            hir_uclass(&[(lo, hi)]),
            "pattern: {:?}",
            pattern
        );
    }

    // Classes that must be rejected.
    assert_eq!(
        t_err("(?-u)[^a]"),
        TestError {
            kind: hir::ErrorKind::InvalidUtf8,
            span: span(5, 9),
        }
    );
    assert_eq!(
        t_err(r"[^\s\S]"),
        TestError {
            kind: hir::ErrorKind::EmptyClassNotAllowed,
            span: span(0, 7),
        }
    );
    assert_eq!(
        t_err(r"(?-u)[^\s\S]"),
        TestError {
            kind: hir::ErrorKind::EmptyClassNotAllowed,
            span: span(5, 12),
        }
    );
}
2094
#[test]
fn class_bracketed_union() {
    // Builders for the individual Unicode classes used below.
    let separator = || hir_uclass_query(ClassQuery::Binary("separator"));
    let greek = || hir_uclass_query(ClassQuery::Binary("greek"));
    let cyrillic = || hir_uclass_query(ClassQuery::Binary("cyrillic"));
    let age_3_0 = || {
        hir_uclass_query(ClassQuery::ByValue {
            property_name: "age",
            property_value: "3.0",
        })
    };
    // The union of age 3.0, Greek and separators is expected by several
    // of the patterns below.
    let age_greek_separator =
        || hir_union(age_3_0(), hir_union(greek(), separator()));

    assert_eq!(t("[a-zA-Z]"), hir_uclass(&[('A', 'Z'), ('a', 'z')]));
    assert_eq!(
        t(r"[a\pZb]"),
        hir_union(hir_uclass(&[('a', 'b')]), separator())
    );
    assert_eq!(t(r"[\pZ\p{Greek}]"), hir_union(greek(), separator()));
    assert_eq!(t(r"[\p{age:3.0}\pZ\p{Greek}]"), age_greek_separator());

    // Nested bracketed classes are flattened into one union.
    assert_eq!(
        t(r"[[[\p{age:3.0}\pZ]\p{Greek}][\p{Cyrillic}]]"),
        hir_union(
            age_3_0(),
            hir_union(cyrillic(), hir_union(greek(), separator())),
        )
    );

    // The same union combined with case folding and/or negation.
    assert_eq!(
        t(r"(?i)[\p{age:3.0}\pZ\p{Greek}]"),
        hir_case_fold(age_greek_separator())
    );
    assert_eq!(
        t(r"[^\p{age:3.0}\pZ\p{Greek}]"),
        hir_negate(age_greek_separator())
    );
    assert_eq!(
        t(r"(?i)[^\p{age:3.0}\pZ\p{Greek}]"),
        hir_negate(hir_case_fold(age_greek_separator()))
    );
}
2164
#[test]
fn class_bracketed_nested() {
    // The class containing only `c`, which every pattern here nests in
    // negated form.
    let only_c = || hir_uclass(&[('c', 'c')]);
    // A span on line 1 described by its start/end offsets.
    let span = |start: usize, end: usize| {
        Span::new(
            Position::new(start, 1, start + 1),
            Position::new(end, 1, end + 1),
        )
    };

    // Items already covered by `[^c]` add nothing to the union.
    for pattern in [r"[a[^c]]", r"[a-b[^c]]"].iter() {
        assert_eq!(
            t(pattern),
            hir_negate(only_c()),
            "pattern: {:?}",
            pattern
        );
    }
    // Adding `c` back makes the union match everything.
    assert_eq!(t(r"[a-c[^c]]"), hir_negate(hir_uclass(&[])));

    // Negating the outer class leaves just `c`.
    for pattern in [r"[^a[^c]]", r"[^a-b[^c]]"].iter() {
        assert_eq!(t(pattern), only_c(), "pattern: {:?}", pattern);
    }

    // Case insensitive variants.
    for pattern in [r"(?i)[a[^c]]", r"(?i)[a-b[^c]]"].iter() {
        assert_eq!(
            t(pattern),
            hir_negate(hir_case_fold(only_c())),
            "pattern: {:?}",
            pattern
        );
    }
    for pattern in [r"(?i)[^a[^c]]", r"(?i)[^a-b[^c]]"].iter() {
        assert_eq!(
            t(pattern),
            hir_uclass(&[('C', 'C'), ('c', 'c')]),
            "pattern: {:?}",
            pattern
        );
    }

    // Negating a class that matches everything is an error.
    assert_eq!(
        t_err(r"[^a-c[^c]]"),
        TestError {
            kind: hir::ErrorKind::EmptyClassNotAllowed,
            span: span(0, 10),
        }
    );
    assert_eq!(
        t_err(r"(?i)[^a-c[^c]]"),
        TestError {
            kind: hir::ErrorKind::EmptyClassNotAllowed,
            span: span(4, 14),
        }
    );
}
2207
#[test]
fn class_bracketed_intersect() {
    // Each class below is translated four times: as written, with
    // `(?-u)`, with `(?i)` and with `(?i-u)`. The expected result is the
    // single ASCII range given here (case folded when `i` is set).
    let shared = [
        ("[abc&&b-c]", b'b', b'c'),
        ("[abc&&[b-c]]", b'b', b'c'),
        ("[[abc]&&[b-c]]", b'b', b'c'),
        ("[a-z&&b-y&&c-x]", b'c', b'x'),
        ("[c-da-b&&a-d]", b'a', b'd'),
        ("[a-d&&c-da-b]", b'a', b'd'),
    ];
    for &(class, lo, hi) in shared.iter() {
        let unicode = || hir_uclass(&[(lo as char, hi as char)]);
        let bytes = || hir_bclass(&[(lo, hi)]);

        assert_eq!(t(class), unicode(), "class: {:?}", class);
        assert_eq!(
            t(&format!("(?-u){}", class)),
            bytes(),
            "class: {:?}",
            class
        );
        assert_eq!(
            t(&format!("(?i){}", class)),
            hir_case_fold(unicode()),
            "class: {:?}",
            class
        );
        assert_eq!(
            t(&format!("(?i-u){}", class)),
            hir_case_fold(bytes()),
            "class: {:?}",
            class
        );
    }

    // Intersections nested inside another (possibly negated) class.
    let a_to_c = || hir_uclass(&[('a', 'c')]);
    assert_eq!(t(r"[a-z&&a-c]"), a_to_c());
    assert_eq!(t(r"[[a-z&&a-c]]"), a_to_c());
    assert_eq!(t(r"[^[a-z&&a-c]]"), hir_negate(a_to_c()));

    // In `[a^]`, `^` does not need to be escaped, so it makes sense that
    // `^` is also allowed to be unescaped after `&&`.
    assert_eq!(t(r"[\^&&^]"), hir_uclass(&[('^', '^')]));
    // `]` needs to be escaped after `&&` since it's not at start of class.
    assert_eq!(t(r"[]&&\]]"), hir_uclass(&[(']', ']')]));
    assert_eq!(t(r"[-&&-]"), hir_uclass(&[('-', '-')]));
    for pattern in [r"[\&&&&]", r"[\&&&\&]"].iter() {
        assert_eq!(
            t(pattern),
            hir_uclass(&[('&', '&')]),
            "pattern: {:?}",
            pattern
        );
    }

    // Test precedence.
    assert_eq!(
        t(r"[a-w&&[^c-g]z]"),
        hir_uclass(&[('a', 'b'), ('h', 'w')])
    );
}
2278
#[test]
fn class_bracketed_intersect_negate() {
    // Unicode expectations.
    let unicode_digit = || hir_uclass_query(ClassQuery::Binary("digit"));
    // Byte oriented expectations, built from the ASCII class tables.
    let byte_digit = || {
        hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Digit))
    };
    let byte_word = || {
        hir_bclass_from_char(ascii_class(&ast::ClassAsciiKind::Word))
    };

    // Unicode mode.
    assert_eq!(t(r"[^\w&&\d]"), hir_negate(unicode_digit()));
    assert_eq!(
        t(r"[^[a-z&&a-c]]"),
        hir_negate(hir_uclass(&[('a', 'c')]))
    );
    assert_eq!(t(r"[^[\w&&\d]]"), hir_negate(unicode_digit()));
    assert_eq!(t(r"[^[^\w&&\d]]"), unicode_digit());
    assert_eq!(
        t(r"[[[^\w]&&[^\d]]]"),
        hir_negate(hir_uclass_perl_word())
    );

    // The same patterns with Unicode disabled.
    assert_eq!(t_bytes(r"(?-u)[^\w&&\d]"), hir_negate(byte_digit()));
    assert_eq!(
        t_bytes(r"(?-u)[^[a-z&&a-c]]"),
        hir_negate(hir_bclass(&[(b'a', b'c')]))
    );
    assert_eq!(t_bytes(r"(?-u)[^[\w&&\d]]"), hir_negate(byte_digit()));
    assert_eq!(t_bytes(r"(?-u)[^[^\w&&\d]]"), byte_digit());
    assert_eq!(
        t_bytes(r"(?-u)[[[^\w]&&[^\d]]]"),
        hir_negate(byte_word())
    );
}
2317
#[test]
fn class_bracketed_difference() {
    // Unicode: letters with the ASCII range removed.
    let translated = t(r"[\pL--[:ascii:]]");
    let letters = hir_uclass_query(ClassQuery::Binary("letter"));
    let ascii = hir_uclass(&[('\0', '\x7F')]);
    assert_eq!(translated, hir_difference(letters, ascii));

    // Bytes: ASCII alphabetic minus ASCII lowercase.
    let translated = t(r"(?-u)[[:alpha:]--[:lower:]]");
    let uppercase = hir_bclass(&[(b'A', b'Z')]);
    assert_eq!(translated, uppercase);
}
2330
#[test]
fn class_bracketed_symmetric_difference() {
    // Codepoints expected to be in exactly one of `sc:Greek` and
    // `scx:Greek`.
    let translated = t(r"[\p{sc:Greek}~~\p{scx:Greek}]");
    let script_mismatch = hir_uclass(&[
        ('\u{0342}', '\u{0342}'),
        ('\u{0345}', '\u{0345}'),
        ('\u{1DC0}', '\u{1DC1}'),
    ]);
    assert_eq!(translated, script_mismatch);

    // Two overlapping ranges: only the non-overlapping ends remain, in
    // both Unicode and byte mode.
    let translated = t(r"[a-g~~c-j]");
    assert_eq!(translated, hir_uclass(&[('a', 'b'), ('h', 'j')]));

    let translated = t(r"(?-u)[a-g~~c-j]");
    assert_eq!(translated, hir_bclass(&[(b'a', b'b'), (b'h', b'j')]));
}
2348
// Checks translation of patterns that set the `x` flag and then spread an
// escape, a Unicode class or a counted repetition across whitespace and
// `# ...` comments. In every case the expected HIR contains none of that
// whitespace or comment text.
#[test]
fn ignore_whitespace() {
    // `\12` is expected to yield a newline; the space before `3` is not
    // part of the resulting literal.
    assert_eq!(t(r"(?x)\12 3"), hir_lit("\n3"));
    // Braced hex escape with whitespace around and inside the braces.
    assert_eq!(t(r"(?x)\x { 53 }"), hir_lit("S"));
    // Same escape, split over several lines with a comment on each.
    assert_eq!(t(r"(?x)\x # comment
{ # comment
53 # comment
} #comment"), hir_lit("S"));

    // Unbraced two digit hex escape separated from `\x` by whitespace,
    // by a comment and a newline, or with whitespace between the digits.
    assert_eq!(t(r"(?x)\x 53"), hir_lit("S"));
    assert_eq!(t(r"(?x)\x # comment
53 # comment"), hir_lit("S"));
    assert_eq!(t(r"(?x)\x5 3"), hir_lit("S"));

    // Unicode property class spread over several commented lines.
    assert_eq!(t(r"(?x)\p # comment
{ # comment
Separator # comment
} # comment"), hir_uclass_query(ClassQuery::Binary("separator")));

    // Counted repetition `{5,10}` spread over several commented lines.
    assert_eq!(t(r"(?x)a # comment
{ # comment
5 # comment
, # comment
10 # comment
} # comment"),
    hir_range(
        true, hir::RepetitionRange::Bounded(5, 10), hir_lit("a")));

    // An escaped space is kept as a literal space; the trailing comment
    // is not.
    assert_eq!(t(r"(?x)a\ # hi there"), hir_lit("a "));
}
2379
#[test]
fn analysis_is_always_utf8() {
    // Patterns whose translation must report `is_always_utf8`.
    let always_utf8 = [
        r"a",
        r"ab",
        r"(?-u)a",
        r"(?-u)ab",
        r"\xFF",
        r"\xFF\xFF",
        r"[^a]",
        r"[^a][^a]",
        r"\b",
        r"\B",
        r"(?-u)\b",
    ];
    for pattern in always_utf8.iter() {
        assert!(
            t_bytes(pattern).is_always_utf8(),
            "expected always UTF-8: {:?}",
            pattern
        );
    }

    // Patterns whose translation must not report `is_always_utf8`.
    let maybe_invalid = [
        r"(?-u)\xFF",
        r"(?-u)\xFF\xFF",
        r"(?-u)[^a]",
        r"(?-u)[^a][^a]",
        r"(?-u)\B",
    ];
    for pattern in maybe_invalid.iter() {
        assert!(
            !t_bytes(pattern).is_always_utf8(),
            "expected not always UTF-8: {:?}",
            pattern
        );
    }
}
2402
#[test]
fn analysis_is_all_assertions() {
    // Patterns built only from assertions (and groups/repetitions of
    // them).
    let only_assertions = [
        r"\b",
        r"\B",
        r"^",
        r"$",
        r"\A",
        r"\z",
        r"$^\z\A\b\B",
        r"$|^|\z|\A|\b|\B",
        r"^$|$^",
        r"((\b)+())*^",
    ];
    for pattern in only_assertions.iter() {
        assert!(
            t(pattern).is_all_assertions(),
            "expected all assertions: {:?}",
            pattern
        );
    }

    // A literal alongside an assertion disqualifies the pattern.
    assert!(!t(r"^a").is_all_assertions());
}
2420
// Checks the four anchoring analyses (`is_anchored_start`,
// `is_anchored_end`, `is_line_anchored_start`, `is_line_anchored_end`) on
// the HIR produced for a range of patterns. Each group of assertions below
// exercises the start/end and text/line variants for one pattern shape.
#[test]
fn analysis_is_anchored() {
    // Positive examples.
    assert!(t(r"^").is_anchored_start());
    assert!(t(r"$").is_anchored_end());
    assert!(t(r"^").is_line_anchored_start());
    assert!(t(r"$").is_line_anchored_end());

    assert!(t(r"^^").is_anchored_start());
    assert!(t(r"$$").is_anchored_end());
    assert!(t(r"^^").is_line_anchored_start());
    assert!(t(r"$$").is_line_anchored_end());

    assert!(t(r"^$").is_anchored_start());
    assert!(t(r"^$").is_anchored_end());
    assert!(t(r"^$").is_line_anchored_start());
    assert!(t(r"^$").is_line_anchored_end());

    assert!(t(r"^foo").is_anchored_start());
    assert!(t(r"foo$").is_anchored_end());
    assert!(t(r"^foo").is_line_anchored_start());
    assert!(t(r"foo$").is_line_anchored_end());

    // Alternations where every branch is anchored.
    assert!(t(r"^foo|^bar").is_anchored_start());
    assert!(t(r"foo$|bar$").is_anchored_end());
    assert!(t(r"^foo|^bar").is_line_anchored_start());
    assert!(t(r"foo$|bar$").is_line_anchored_end());

    assert!(t(r"^(foo|bar)").is_anchored_start());
    assert!(t(r"(foo|bar)$").is_anchored_end());
    assert!(t(r"^(foo|bar)").is_line_anchored_start());
    assert!(t(r"(foo|bar)$").is_line_anchored_end());

    // Repetitions that require at least one match of the anchor.
    assert!(t(r"^+").is_anchored_start());
    assert!(t(r"$+").is_anchored_end());
    assert!(t(r"^+").is_line_anchored_start());
    assert!(t(r"$+").is_line_anchored_end());
    assert!(t(r"^++").is_anchored_start());
    assert!(t(r"$++").is_anchored_end());
    assert!(t(r"^++").is_line_anchored_start());
    assert!(t(r"$++").is_line_anchored_end());
    assert!(t(r"(^)+").is_anchored_start());
    assert!(t(r"($)+").is_anchored_end());
    assert!(t(r"(^)+").is_line_anchored_start());
    assert!(t(r"($)+").is_line_anchored_end());

    // Anchors in "reverse" order still anchor both ends. All four
    // analyses are checked here, mirroring the `$^|^$` group that
    // follows.
    assert!(t(r"$^").is_anchored_start());
    assert!(t(r"$^").is_anchored_end());
    assert!(t(r"$^").is_line_anchored_start());
    assert!(t(r"$^").is_line_anchored_end());
    assert!(t(r"$^|^$").is_anchored_start());
    assert!(t(r"$^|^$").is_anchored_end());
    assert!(t(r"$^|^$").is_line_anchored_start());
    assert!(t(r"$^|^$").is_line_anchored_end());

    // Anchors next to other assertions, including multi-line anchors.
    assert!(t(r"\b^").is_anchored_start());
    assert!(t(r"$\b").is_anchored_end());
    assert!(t(r"\b^").is_line_anchored_start());
    assert!(t(r"$\b").is_line_anchored_end());
    assert!(t(r"^(?m:^)").is_anchored_start());
    assert!(t(r"(?m:$)$").is_anchored_end());
    assert!(t(r"^(?m:^)").is_line_anchored_start());
    assert!(t(r"(?m:$)$").is_line_anchored_end());
    assert!(t(r"(?m:^)^").is_anchored_start());
    assert!(t(r"$(?m:$)").is_anchored_end());
    assert!(t(r"(?m:^)^").is_line_anchored_start());
    assert!(t(r"$(?m:$)").is_line_anchored_end());

    // Negative examples.
    // Multi-line anchors on their own are not text anchors.
    assert!(!t(r"(?m)^").is_anchored_start());
    assert!(!t(r"(?m)$").is_anchored_end());
    assert!(!t(r"(?m:^$)|$^").is_anchored_start());
    assert!(!t(r"(?m:^$)|$^").is_anchored_end());
    assert!(!t(r"$^|(?m:^$)").is_anchored_start());
    assert!(!t(r"$^|(?m:^$)").is_anchored_end());

    // A literal on the outside of the anchor defeats it.
    assert!(!t(r"a^").is_anchored_start());
    assert!(!t(r"$a").is_anchored_start());
    assert!(!t(r"a^").is_line_anchored_start());
    assert!(!t(r"$a").is_line_anchored_start());

    assert!(!t(r"a^").is_anchored_end());
    assert!(!t(r"$a").is_anchored_end());
    assert!(!t(r"a^").is_line_anchored_end());
    assert!(!t(r"$a").is_line_anchored_end());

    // Alternations with an unanchored branch.
    assert!(!t(r"^foo|bar").is_anchored_start());
    assert!(!t(r"foo|bar$").is_anchored_end());
    assert!(!t(r"^foo|bar").is_line_anchored_start());
    assert!(!t(r"foo|bar$").is_line_anchored_end());

    // Repetitions that may match the anchor zero times.
    assert!(!t(r"^*").is_anchored_start());
    assert!(!t(r"$*").is_anchored_end());
    assert!(!t(r"^*").is_line_anchored_start());
    assert!(!t(r"$*").is_line_anchored_end());
    assert!(!t(r"^*+").is_anchored_start());
    assert!(!t(r"$*+").is_anchored_end());
    assert!(!t(r"^*+").is_line_anchored_start());
    assert!(!t(r"$*+").is_line_anchored_end());
    assert!(!t(r"^+*").is_anchored_start());
    assert!(!t(r"$+*").is_anchored_end());
    assert!(!t(r"^+*").is_line_anchored_start());
    assert!(!t(r"$+*").is_line_anchored_end());
    assert!(!t(r"(^)*").is_anchored_start());
    assert!(!t(r"($)*").is_anchored_end());
    assert!(!t(r"(^)*").is_line_anchored_start());
    assert!(!t(r"($)*").is_line_anchored_end());
}
2529
#[test]
fn analysis_is_line_anchored() {
    // Patterns expected to be line anchored at the start.
    let line_start = [
        r"(?m)^(foo|bar)",
        r"(?m)^foo|^bar",
        r"(?m)^",
        r"(?m:^$)|$^",
        r"$^|(?m:^$)",
    ];
    for pattern in line_start.iter() {
        assert!(
            t(pattern).is_line_anchored_start(),
            "expected line anchored start: {:?}",
            pattern
        );
    }

    // Patterns expected to be line anchored at the end.
    let line_end = [
        r"(?m)(foo|bar)$",
        r"(?m)foo$|bar$",
        r"(?m)$",
        r"(?m:^$)|$^",
        r"$^|(?m:^$)",
    ];
    for pattern in line_end.iter() {
        assert!(
            t(pattern).is_line_anchored_end(),
            "expected line anchored end: {:?}",
            pattern
        );
    }
}
2547
#[test]
fn analysis_is_any_anchored() {
    // Positive examples.
    for pattern in [r"^", r"\A"].iter() {
        assert!(
            t(pattern).is_any_anchored_start(),
            "expected any anchored start: {:?}",
            pattern
        );
    }
    for pattern in [r"$", r"\z"].iter() {
        assert!(
            t(pattern).is_any_anchored_end(),
            "expected any anchored end: {:?}",
            pattern
        );
    }

    // Negative examples.
    for pattern in [r"(?m)^", r"$"].iter() {
        assert!(
            !t(pattern).is_any_anchored_start(),
            "expected no anchored start: {:?}",
            pattern
        );
    }
    for pattern in [r"(?m)$", r"^"].iter() {
        assert!(
            !t(pattern).is_any_anchored_end(),
            "expected no anchored end: {:?}",
            pattern
        );
    }
}
2562
#[test]
fn analysis_is_match_empty() {
    // Patterns that must be reported as able to match the empty string.
    let matches_empty = [
        r"",
        r"()",
        r"()*",
        r"()+",
        r"()?",
        r"a*",
        r"a?",
        r"a{0}",
        r"a{0,}",
        r"a{0,1}",
        r"a{0,10}",
        r"\pL*",
        r"a*|b",
        r"b|a*",
        r"a*a?(abcd)*",
        r"^",
        r"$",
        r"(?m)^",
        r"(?m)$",
        r"\A",
        r"\z",
        r"\B",
    ];
    for pattern in matches_empty.iter() {
        assert!(
            t(pattern).is_match_empty(),
            "expected to match empty: {:?}",
            pattern
        );
    }
    // This one goes through `t_bytes` rather than `t`.
    assert!(t_bytes(r"(?-u)\B").is_match_empty());

    // Patterns that must not be reported as matching the empty string.
    let never_empty = [
        r"a+",
        r"a{1}",
        r"a{1,}",
        r"a{1,2}",
        r"a{1,10}",
        r"b|a",
        r"a*a+(abcd)*",
        r"\b",
        r"(?-u)\b",
    ];
    for pattern in never_empty.iter() {
        assert!(
            !t(pattern).is_match_empty(),
            "expected not to match empty: {:?}",
            pattern
        );
    }
}
2601
#[test]
fn analysis_is_literal() {
    // Patterns that translate to nothing but literals.
    let literals = [r"", r"a", r"ab", r"abc", r"(?m)abc"];
    for pattern in literals.iter() {
        assert!(
            t(pattern).is_literal(),
            "expected literal: {:?}",
            pattern
        );
    }

    // Anchors, alternations, groups, repetitions and classes are not
    // literals.
    let non_literals = [
        r"^",
        r"a|b",
        r"(a)",
        r"a+",
        r"foo(a)",
        r"(a)foo",
        r"[a]",
    ];
    for pattern in non_literals.iter() {
        assert!(
            !t(pattern).is_literal(),
            "expected non-literal: {:?}",
            pattern
        );
    }
}
2620
#[test]
fn analysis_is_alternation_literal() {
    // Plain literals and alternations made up solely of literals.
    let alternation_literals = [
        r"",
        r"a",
        r"ab",
        r"abc",
        r"(?m)abc",
        r"a|b",
        r"a|b|c",
        r"foo|bar",
        r"foo|bar|baz",
    ];
    for pattern in alternation_literals.iter() {
        assert!(
            t(pattern).is_alternation_literal(),
            "expected alternation literal: {:?}",
            pattern
        );
    }

    // Anything containing an anchor, group, repetition or class, whether
    // on its own or as a branch of an alternation.
    let rejected = [
        r"^",
        r"(a)",
        r"a+",
        r"foo(a)",
        r"(a)foo",
        r"[a]",
        r"[a]|b",
        r"a|[b]",
        r"(a)|b",
        r"a|(b)",
    ];
    for pattern in rejected.iter() {
        assert!(
            !t(pattern).is_alternation_literal(),
            "expected non alternation literal: {:?}",
            pattern
        );
    }
}
2646 }
2647