1 /*!
2 Defines a high-level intermediate representation for regular expressions.
3 */
4 use std::char;
5 use std::cmp;
6 use std::error;
7 use std::fmt;
8 use std::result;
9 use std::u8;
10 
11 use crate::ast::Span;
12 use crate::hir::interval::{Interval, IntervalSet, IntervalSetIter};
13 use crate::unicode;
14 
15 pub use crate::hir::visitor::{visit, Visitor};
16 pub use crate::unicode::CaseFoldError;
17 
18 mod interval;
19 pub mod literal;
20 pub mod print;
21 pub mod translate;
22 mod visitor;
23 
24 /// An error that can occur while translating an `Ast` to a `Hir`.
25 #[derive(Clone, Debug, Eq, PartialEq)]
26 pub struct Error {
27     /// The kind of error.
28     kind: ErrorKind,
29     /// The original pattern that the translator's Ast was parsed from. Every
30     /// span in an error is a valid range into this string.
31     pattern: String,
32     /// The span of this error, derived from the Ast given to the translator.
33     span: Span,
34 }
35 
36 impl Error {
37     /// Return the type of this error.
kind(&self) -> &ErrorKind38     pub fn kind(&self) -> &ErrorKind {
39         &self.kind
40     }
41 
42     /// The original pattern string in which this error occurred.
43     ///
44     /// Every span reported by this error is reported in terms of this string.
pattern(&self) -> &str45     pub fn pattern(&self) -> &str {
46         &self.pattern
47     }
48 
49     /// Return the span at which this error occurred.
span(&self) -> &Span50     pub fn span(&self) -> &Span {
51         &self.span
52     }
53 }
54 
55 /// The type of an error that occurred while building an `Hir`.
56 #[derive(Clone, Debug, Eq, PartialEq)]
57 pub enum ErrorKind {
58     /// This error occurs when a Unicode feature is used when Unicode
59     /// support is disabled. For example `(?-u:\pL)` would trigger this error.
60     UnicodeNotAllowed,
61     /// This error occurs when translating a pattern that could match a byte
62     /// sequence that isn't UTF-8 and `allow_invalid_utf8` was disabled.
63     InvalidUtf8,
64     /// This occurs when an unrecognized Unicode property name could not
65     /// be found.
66     UnicodePropertyNotFound,
67     /// This occurs when an unrecognized Unicode property value could not
68     /// be found.
69     UnicodePropertyValueNotFound,
70     /// This occurs when a Unicode-aware Perl character class (`\w`, `\s` or
71     /// `\d`) could not be found. This can occur when the `unicode-perl`
72     /// crate feature is not enabled.
73     UnicodePerlClassNotFound,
74     /// This occurs when the Unicode simple case mapping tables are not
75     /// available, and the regular expression required Unicode aware case
76     /// insensitivity.
77     UnicodeCaseUnavailable,
78     /// This occurs when the translator attempts to construct a character class
79     /// that is empty.
80     ///
81     /// Note that this restriction in the translator may be removed in the
82     /// future.
83     EmptyClassNotAllowed,
84     /// Hints that destructuring should not be exhaustive.
85     ///
86     /// This enum may grow additional variants, so this makes sure clients
87     /// don't count on exhaustive matching. (Otherwise, adding a new variant
88     /// could break existing code.)
89     #[doc(hidden)]
90     __Nonexhaustive,
91 }
92 
93 impl ErrorKind {
94     // TODO: Remove this method entirely on the next breaking semver release.
95     #[allow(deprecated)]
description(&self) -> &str96     fn description(&self) -> &str {
97         use self::ErrorKind::*;
98         match *self {
99             UnicodeNotAllowed => "Unicode not allowed here",
100             InvalidUtf8 => "pattern can match invalid UTF-8",
101             UnicodePropertyNotFound => "Unicode property not found",
102             UnicodePropertyValueNotFound => "Unicode property value not found",
103             UnicodePerlClassNotFound => {
104                 "Unicode-aware Perl class not found \
105                  (make sure the unicode-perl feature is enabled)"
106             }
107             UnicodeCaseUnavailable => {
108                 "Unicode-aware case insensitivity matching is not available \
109                  (make sure the unicode-case feature is enabled)"
110             }
111             EmptyClassNotAllowed => "empty character classes are not allowed",
112             __Nonexhaustive => unreachable!(),
113         }
114     }
115 }
116 
117 impl error::Error for Error {
118     // TODO: Remove this method entirely on the next breaking semver release.
119     #[allow(deprecated)]
description(&self) -> &str120     fn description(&self) -> &str {
121         self.kind.description()
122     }
123 }
124 
125 impl fmt::Display for Error {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result126     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
127         crate::error::Formatter::from(self).fmt(f)
128     }
129 }
130 
131 impl fmt::Display for ErrorKind {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result132     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
133         // TODO: Remove this on the next breaking semver release.
134         #[allow(deprecated)]
135         f.write_str(self.description())
136     }
137 }
138 
139 /// A high-level intermediate representation (HIR) for a regular expression.
140 ///
141 /// The HIR of a regular expression represents an intermediate step between its
142 /// abstract syntax (a structured description of the concrete syntax) and
143 /// compiled byte codes. The purpose of HIR is to make regular expressions
144 /// easier to analyze. In particular, the AST is much more complex than the
145 /// HIR. For example, while an AST supports arbitrarily nested character
146 /// classes, the HIR will flatten all nested classes into a single set. The HIR
147 /// will also "compile away" every flag present in the concrete syntax. For
148 /// example, users of HIR expressions never need to worry about case folding;
149 /// it is handled automatically by the translator (e.g., by translating `(?i)A`
150 /// to `[aA]`).
151 ///
152 /// If the HIR was produced by a translator that disallows invalid UTF-8, then
153 /// the HIR is guaranteed to match UTF-8 exclusively.
154 ///
155 /// This type defines its own destructor that uses constant stack space and
156 /// heap space proportional to the size of the HIR.
157 ///
158 /// The specific type of an HIR expression can be accessed via its `kind`
159 /// or `into_kind` methods. This extra level of indirection exists for two
160 /// reasons:
161 ///
162 /// 1. Construction of an HIR expression *must* use the constructor methods
163 ///    on this `Hir` type instead of building the `HirKind` values directly.
164 ///    This permits construction to enforce invariants like "concatenations
165 ///    always consist of two or more sub-expressions."
166 /// 2. Every HIR expression contains attributes that are defined inductively,
167 ///    and can be computed cheaply during the construction process. For
168 ///    example, one such attribute is whether the expression must match at the
169 ///    beginning of the text.
170 ///
171 /// Also, an `Hir`'s `fmt::Display` implementation prints an HIR as a regular
172 /// expression pattern string, and uses constant stack space and heap space
173 /// proportional to the size of the `Hir`.
174 #[derive(Clone, Debug, Eq, PartialEq)]
175 pub struct Hir {
176     /// The underlying HIR kind.
177     kind: HirKind,
178     /// Analysis info about this HIR, computed during construction.
179     info: HirInfo,
180 }
181 
182 /// The kind of an arbitrary `Hir` expression.
183 #[derive(Clone, Debug, Eq, PartialEq)]
184 pub enum HirKind {
185     /// The empty regular expression, which matches everything, including the
186     /// empty string.
187     Empty,
188     /// A single literal character that matches exactly this character.
189     Literal(Literal),
190     /// A single character class that matches any of the characters in the
191     /// class. A class can either consist of Unicode scalar values as
192     /// characters, or it can use bytes.
193     Class(Class),
194     /// An anchor assertion. An anchor assertion match always has zero length.
195     Anchor(Anchor),
196     /// A word boundary assertion, which may or may not be Unicode aware. A
197     /// word boundary assertion match always has zero length.
198     WordBoundary(WordBoundary),
199     /// A repetition operation applied to a child expression.
200     Repetition(Repetition),
201     /// A possibly capturing group, which contains a child expression.
202     Group(Group),
203     /// A concatenation of expressions. A concatenation always has at least two
204     /// child expressions.
205     ///
206     /// A concatenation matches only if each of its child expression matches
207     /// one after the other.
208     Concat(Vec<Hir>),
209     /// An alternation of expressions. An alternation always has at least two
210     /// child expressions.
211     ///
212     /// An alternation matches only if at least one of its child expression
213     /// matches. If multiple expressions match, then the leftmost is preferred.
214     Alternation(Vec<Hir>),
215 }
216 
217 impl Hir {
218     /// Returns a reference to the underlying HIR kind.
kind(&self) -> &HirKind219     pub fn kind(&self) -> &HirKind {
220         &self.kind
221     }
222 
223     /// Consumes ownership of this HIR expression and returns its underlying
224     /// `HirKind`.
into_kind(mut self) -> HirKind225     pub fn into_kind(mut self) -> HirKind {
226         use std::mem;
227         mem::replace(&mut self.kind, HirKind::Empty)
228     }
229 
230     /// Returns an empty HIR expression.
231     ///
232     /// An empty HIR expression always matches, including the empty string.
empty() -> Hir233     pub fn empty() -> Hir {
234         let mut info = HirInfo::new();
235         info.set_always_utf8(true);
236         info.set_all_assertions(true);
237         info.set_anchored_start(false);
238         info.set_anchored_end(false);
239         info.set_line_anchored_start(false);
240         info.set_line_anchored_end(false);
241         info.set_any_anchored_start(false);
242         info.set_any_anchored_end(false);
243         info.set_match_empty(true);
244         info.set_literal(false);
245         info.set_alternation_literal(false);
246         Hir { kind: HirKind::Empty, info: info }
247     }
248 
249     /// Creates a literal HIR expression.
250     ///
251     /// If the given literal has a `Byte` variant with an ASCII byte, then this
252     /// method panics. This enforces the invariant that `Byte` variants are
253     /// only used to express matching of invalid UTF-8.
literal(lit: Literal) -> Hir254     pub fn literal(lit: Literal) -> Hir {
255         if let Literal::Byte(b) = lit {
256             assert!(b > 0x7F);
257         }
258 
259         let mut info = HirInfo::new();
260         info.set_always_utf8(lit.is_unicode());
261         info.set_all_assertions(false);
262         info.set_anchored_start(false);
263         info.set_anchored_end(false);
264         info.set_line_anchored_start(false);
265         info.set_line_anchored_end(false);
266         info.set_any_anchored_start(false);
267         info.set_any_anchored_end(false);
268         info.set_match_empty(false);
269         info.set_literal(true);
270         info.set_alternation_literal(true);
271         Hir { kind: HirKind::Literal(lit), info: info }
272     }
273 
274     /// Creates a class HIR expression.
class(class: Class) -> Hir275     pub fn class(class: Class) -> Hir {
276         let mut info = HirInfo::new();
277         info.set_always_utf8(class.is_always_utf8());
278         info.set_all_assertions(false);
279         info.set_anchored_start(false);
280         info.set_anchored_end(false);
281         info.set_line_anchored_start(false);
282         info.set_line_anchored_end(false);
283         info.set_any_anchored_start(false);
284         info.set_any_anchored_end(false);
285         info.set_match_empty(false);
286         info.set_literal(false);
287         info.set_alternation_literal(false);
288         Hir { kind: HirKind::Class(class), info: info }
289     }
290 
291     /// Creates an anchor assertion HIR expression.
anchor(anchor: Anchor) -> Hir292     pub fn anchor(anchor: Anchor) -> Hir {
293         let mut info = HirInfo::new();
294         info.set_always_utf8(true);
295         info.set_all_assertions(true);
296         info.set_anchored_start(false);
297         info.set_anchored_end(false);
298         info.set_line_anchored_start(false);
299         info.set_line_anchored_end(false);
300         info.set_any_anchored_start(false);
301         info.set_any_anchored_end(false);
302         info.set_match_empty(true);
303         info.set_literal(false);
304         info.set_alternation_literal(false);
305         if let Anchor::StartText = anchor {
306             info.set_anchored_start(true);
307             info.set_line_anchored_start(true);
308             info.set_any_anchored_start(true);
309         }
310         if let Anchor::EndText = anchor {
311             info.set_anchored_end(true);
312             info.set_line_anchored_end(true);
313             info.set_any_anchored_end(true);
314         }
315         if let Anchor::StartLine = anchor {
316             info.set_line_anchored_start(true);
317         }
318         if let Anchor::EndLine = anchor {
319             info.set_line_anchored_end(true);
320         }
321         Hir { kind: HirKind::Anchor(anchor), info: info }
322     }
323 
324     /// Creates a word boundary assertion HIR expression.
word_boundary(word_boundary: WordBoundary) -> Hir325     pub fn word_boundary(word_boundary: WordBoundary) -> Hir {
326         let mut info = HirInfo::new();
327         info.set_always_utf8(true);
328         info.set_all_assertions(true);
329         info.set_anchored_start(false);
330         info.set_anchored_end(false);
331         info.set_line_anchored_start(false);
332         info.set_line_anchored_end(false);
333         info.set_any_anchored_start(false);
334         info.set_any_anchored_end(false);
335         info.set_literal(false);
336         info.set_alternation_literal(false);
337         // A negated word boundary matches the empty string, but a normal
338         // word boundary does not!
339         info.set_match_empty(word_boundary.is_negated());
340         // Negated ASCII word boundaries can match invalid UTF-8.
341         if let WordBoundary::AsciiNegate = word_boundary {
342             info.set_always_utf8(false);
343         }
344         Hir { kind: HirKind::WordBoundary(word_boundary), info: info }
345     }
346 
347     /// Creates a repetition HIR expression.
repetition(rep: Repetition) -> Hir348     pub fn repetition(rep: Repetition) -> Hir {
349         let mut info = HirInfo::new();
350         info.set_always_utf8(rep.hir.is_always_utf8());
351         info.set_all_assertions(rep.hir.is_all_assertions());
352         // If this operator can match the empty string, then it can never
353         // be anchored.
354         info.set_anchored_start(
355             !rep.is_match_empty() && rep.hir.is_anchored_start(),
356         );
357         info.set_anchored_end(
358             !rep.is_match_empty() && rep.hir.is_anchored_end(),
359         );
360         info.set_line_anchored_start(
361             !rep.is_match_empty() && rep.hir.is_anchored_start(),
362         );
363         info.set_line_anchored_end(
364             !rep.is_match_empty() && rep.hir.is_anchored_end(),
365         );
366         info.set_any_anchored_start(rep.hir.is_any_anchored_start());
367         info.set_any_anchored_end(rep.hir.is_any_anchored_end());
368         info.set_match_empty(rep.is_match_empty() || rep.hir.is_match_empty());
369         info.set_literal(false);
370         info.set_alternation_literal(false);
371         Hir { kind: HirKind::Repetition(rep), info: info }
372     }
373 
374     /// Creates a group HIR expression.
group(group: Group) -> Hir375     pub fn group(group: Group) -> Hir {
376         let mut info = HirInfo::new();
377         info.set_always_utf8(group.hir.is_always_utf8());
378         info.set_all_assertions(group.hir.is_all_assertions());
379         info.set_anchored_start(group.hir.is_anchored_start());
380         info.set_anchored_end(group.hir.is_anchored_end());
381         info.set_line_anchored_start(group.hir.is_line_anchored_start());
382         info.set_line_anchored_end(group.hir.is_line_anchored_end());
383         info.set_any_anchored_start(group.hir.is_any_anchored_start());
384         info.set_any_anchored_end(group.hir.is_any_anchored_end());
385         info.set_match_empty(group.hir.is_match_empty());
386         info.set_literal(false);
387         info.set_alternation_literal(false);
388         Hir { kind: HirKind::Group(group), info: info }
389     }
390 
391     /// Returns the concatenation of the given expressions.
392     ///
393     /// This flattens the concatenation as appropriate.
concat(mut exprs: Vec<Hir>) -> Hir394     pub fn concat(mut exprs: Vec<Hir>) -> Hir {
395         match exprs.len() {
396             0 => Hir::empty(),
397             1 => exprs.pop().unwrap(),
398             _ => {
399                 let mut info = HirInfo::new();
400                 info.set_always_utf8(true);
401                 info.set_all_assertions(true);
402                 info.set_any_anchored_start(false);
403                 info.set_any_anchored_end(false);
404                 info.set_match_empty(true);
405                 info.set_literal(true);
406                 info.set_alternation_literal(true);
407 
408                 // Some attributes require analyzing all sub-expressions.
409                 for e in &exprs {
410                     let x = info.is_always_utf8() && e.is_always_utf8();
411                     info.set_always_utf8(x);
412 
413                     let x = info.is_all_assertions() && e.is_all_assertions();
414                     info.set_all_assertions(x);
415 
416                     let x = info.is_any_anchored_start()
417                         || e.is_any_anchored_start();
418                     info.set_any_anchored_start(x);
419 
420                     let x =
421                         info.is_any_anchored_end() || e.is_any_anchored_end();
422                     info.set_any_anchored_end(x);
423 
424                     let x = info.is_match_empty() && e.is_match_empty();
425                     info.set_match_empty(x);
426 
427                     let x = info.is_literal() && e.is_literal();
428                     info.set_literal(x);
429 
430                     let x = info.is_alternation_literal()
431                         && e.is_alternation_literal();
432                     info.set_alternation_literal(x);
433                 }
434                 // Anchored attributes require something slightly more
435                 // sophisticated. Normally, WLOG, to determine whether an
436                 // expression is anchored to the start, we'd only need to check
437                 // the first expression of a concatenation. However,
438                 // expressions like `$\b^` are still anchored to the start,
439                 // but the first expression in the concatenation *isn't*
440                 // anchored to the start. So the "first" expression to look at
441                 // is actually one that is either not an assertion or is
442                 // specifically the StartText assertion.
443                 info.set_anchored_start(
444                     exprs
445                         .iter()
446                         .take_while(|e| {
447                             e.is_anchored_start() || e.is_all_assertions()
448                         })
449                         .any(|e| e.is_anchored_start()),
450                 );
451                 // Similarly for the end anchor, but in reverse.
452                 info.set_anchored_end(
453                     exprs
454                         .iter()
455                         .rev()
456                         .take_while(|e| {
457                             e.is_anchored_end() || e.is_all_assertions()
458                         })
459                         .any(|e| e.is_anchored_end()),
460                 );
461                 // Repeat the process for line anchors.
462                 info.set_line_anchored_start(
463                     exprs
464                         .iter()
465                         .take_while(|e| {
466                             e.is_line_anchored_start() || e.is_all_assertions()
467                         })
468                         .any(|e| e.is_line_anchored_start()),
469                 );
470                 info.set_line_anchored_end(
471                     exprs
472                         .iter()
473                         .rev()
474                         .take_while(|e| {
475                             e.is_line_anchored_end() || e.is_all_assertions()
476                         })
477                         .any(|e| e.is_line_anchored_end()),
478                 );
479                 Hir { kind: HirKind::Concat(exprs), info: info }
480             }
481         }
482     }
483 
484     /// Returns the alternation of the given expressions.
485     ///
486     /// This flattens the alternation as appropriate.
alternation(mut exprs: Vec<Hir>) -> Hir487     pub fn alternation(mut exprs: Vec<Hir>) -> Hir {
488         match exprs.len() {
489             0 => Hir::empty(),
490             1 => exprs.pop().unwrap(),
491             _ => {
492                 let mut info = HirInfo::new();
493                 info.set_always_utf8(true);
494                 info.set_all_assertions(true);
495                 info.set_anchored_start(true);
496                 info.set_anchored_end(true);
497                 info.set_line_anchored_start(true);
498                 info.set_line_anchored_end(true);
499                 info.set_any_anchored_start(false);
500                 info.set_any_anchored_end(false);
501                 info.set_match_empty(false);
502                 info.set_literal(false);
503                 info.set_alternation_literal(true);
504 
505                 // Some attributes require analyzing all sub-expressions.
506                 for e in &exprs {
507                     let x = info.is_always_utf8() && e.is_always_utf8();
508                     info.set_always_utf8(x);
509 
510                     let x = info.is_all_assertions() && e.is_all_assertions();
511                     info.set_all_assertions(x);
512 
513                     let x = info.is_anchored_start() && e.is_anchored_start();
514                     info.set_anchored_start(x);
515 
516                     let x = info.is_anchored_end() && e.is_anchored_end();
517                     info.set_anchored_end(x);
518 
519                     let x = info.is_line_anchored_start()
520                         && e.is_line_anchored_start();
521                     info.set_line_anchored_start(x);
522 
523                     let x = info.is_line_anchored_end()
524                         && e.is_line_anchored_end();
525                     info.set_line_anchored_end(x);
526 
527                     let x = info.is_any_anchored_start()
528                         || e.is_any_anchored_start();
529                     info.set_any_anchored_start(x);
530 
531                     let x =
532                         info.is_any_anchored_end() || e.is_any_anchored_end();
533                     info.set_any_anchored_end(x);
534 
535                     let x = info.is_match_empty() || e.is_match_empty();
536                     info.set_match_empty(x);
537 
538                     let x = info.is_alternation_literal() && e.is_literal();
539                     info.set_alternation_literal(x);
540                 }
541                 Hir { kind: HirKind::Alternation(exprs), info: info }
542             }
543         }
544     }
545 
546     /// Build an HIR expression for `.`.
547     ///
548     /// A `.` expression matches any character except for `\n`. To build an
549     /// expression that matches any character, including `\n`, use the `any`
550     /// method.
551     ///
552     /// If `bytes` is `true`, then this assumes characters are limited to a
553     /// single byte.
dot(bytes: bool) -> Hir554     pub fn dot(bytes: bool) -> Hir {
555         if bytes {
556             let mut cls = ClassBytes::empty();
557             cls.push(ClassBytesRange::new(b'\0', b'\x09'));
558             cls.push(ClassBytesRange::new(b'\x0B', b'\xFF'));
559             Hir::class(Class::Bytes(cls))
560         } else {
561             let mut cls = ClassUnicode::empty();
562             cls.push(ClassUnicodeRange::new('\0', '\x09'));
563             cls.push(ClassUnicodeRange::new('\x0B', '\u{10FFFF}'));
564             Hir::class(Class::Unicode(cls))
565         }
566     }
567 
568     /// Build an HIR expression for `(?s).`.
569     ///
570     /// A `(?s).` expression matches any character, including `\n`. To build an
571     /// expression that matches any character except for `\n`, then use the
572     /// `dot` method.
573     ///
574     /// If `bytes` is `true`, then this assumes characters are limited to a
575     /// single byte.
any(bytes: bool) -> Hir576     pub fn any(bytes: bool) -> Hir {
577         if bytes {
578             let mut cls = ClassBytes::empty();
579             cls.push(ClassBytesRange::new(b'\0', b'\xFF'));
580             Hir::class(Class::Bytes(cls))
581         } else {
582             let mut cls = ClassUnicode::empty();
583             cls.push(ClassUnicodeRange::new('\0', '\u{10FFFF}'));
584             Hir::class(Class::Unicode(cls))
585         }
586     }
587 
588     /// Return true if and only if this HIR will always match valid UTF-8.
589     ///
590     /// When this returns false, then it is possible for this HIR expression
591     /// to match invalid UTF-8.
is_always_utf8(&self) -> bool592     pub fn is_always_utf8(&self) -> bool {
593         self.info.is_always_utf8()
594     }
595 
596     /// Returns true if and only if this entire HIR expression is made up of
597     /// zero-width assertions.
598     ///
599     /// This includes expressions like `^$\b\A\z` and even `((\b)+())*^`, but
600     /// not `^a`.
is_all_assertions(&self) -> bool601     pub fn is_all_assertions(&self) -> bool {
602         self.info.is_all_assertions()
603     }
604 
605     /// Return true if and only if this HIR is required to match from the
606     /// beginning of text. This includes expressions like `^foo`, `^(foo|bar)`,
607     /// `^foo|^bar` but not `^foo|bar`.
is_anchored_start(&self) -> bool608     pub fn is_anchored_start(&self) -> bool {
609         self.info.is_anchored_start()
610     }
611 
612     /// Return true if and only if this HIR is required to match at the end
613     /// of text. This includes expressions like `foo$`, `(foo|bar)$`,
614     /// `foo$|bar$` but not `foo$|bar`.
is_anchored_end(&self) -> bool615     pub fn is_anchored_end(&self) -> bool {
616         self.info.is_anchored_end()
617     }
618 
619     /// Return true if and only if this HIR is required to match from the
620     /// beginning of text or the beginning of a line. This includes expressions
621     /// like `^foo`, `(?m)^foo`, `^(foo|bar)`, `^(foo|bar)`, `(?m)^foo|^bar`
622     /// but not `^foo|bar` or `(?m)^foo|bar`.
623     ///
624     /// Note that if `is_anchored_start` is `true`, then
625     /// `is_line_anchored_start` will also be `true`. The reverse implication
626     /// is not true. For example, `(?m)^foo` is line anchored, but not
627     /// `is_anchored_start`.
is_line_anchored_start(&self) -> bool628     pub fn is_line_anchored_start(&self) -> bool {
629         self.info.is_line_anchored_start()
630     }
631 
632     /// Return true if and only if this HIR is required to match at the
633     /// end of text or the end of a line. This includes expressions like
634     /// `foo$`, `(?m)foo$`, `(foo|bar)$`, `(?m)(foo|bar)$`, `foo$|bar$`,
635     /// `(?m)(foo|bar)$`, but not `foo$|bar` or `(?m)foo$|bar`.
636     ///
637     /// Note that if `is_anchored_end` is `true`, then
638     /// `is_line_anchored_end` will also be `true`. The reverse implication
639     /// is not true. For example, `(?m)foo$` is line anchored, but not
640     /// `is_anchored_end`.
is_line_anchored_end(&self) -> bool641     pub fn is_line_anchored_end(&self) -> bool {
642         self.info.is_line_anchored_end()
643     }
644 
645     /// Return true if and only if this HIR contains any sub-expression that
646     /// is required to match at the beginning of text. Specifically, this
647     /// returns true if the `^` symbol (when multiline mode is disabled) or the
648     /// `\A` escape appear anywhere in the regex.
is_any_anchored_start(&self) -> bool649     pub fn is_any_anchored_start(&self) -> bool {
650         self.info.is_any_anchored_start()
651     }
652 
653     /// Return true if and only if this HIR contains any sub-expression that is
654     /// required to match at the end of text. Specifically, this returns true
655     /// if the `$` symbol (when multiline mode is disabled) or the `\z` escape
656     /// appear anywhere in the regex.
is_any_anchored_end(&self) -> bool657     pub fn is_any_anchored_end(&self) -> bool {
658         self.info.is_any_anchored_end()
659     }
660 
661     /// Return true if and only if the empty string is part of the language
662     /// matched by this regular expression.
663     ///
664     /// This includes `a*`, `a?b*`, `a{0}`, `()`, `()+`, `^$`, `a|b?`, `\B`,
665     /// but not `a`, `a+` or `\b`.
is_match_empty(&self) -> bool666     pub fn is_match_empty(&self) -> bool {
667         self.info.is_match_empty()
668     }
669 
670     /// Return true if and only if this HIR is a simple literal. This is only
671     /// true when this HIR expression is either itself a `Literal` or a
672     /// concatenation of only `Literal`s.
673     ///
674     /// For example, `f` and `foo` are literals, but `f+`, `(foo)`, `foo()`,
675     /// `` are not (even though that contain sub-expressions that are literals).
is_literal(&self) -> bool676     pub fn is_literal(&self) -> bool {
677         self.info.is_literal()
678     }
679 
680     /// Return true if and only if this HIR is either a simple literal or an
681     /// alternation of simple literals. This is only
682     /// true when this HIR expression is either itself a `Literal` or a
683     /// concatenation of only `Literal`s or an alternation of only `Literal`s.
684     ///
685     /// For example, `f`, `foo`, `a|b|c`, and `foo|bar|baz` are alternation
686     /// literals, but `f+`, `(foo)`, `foo()`, ``
687     /// are not (even though that contain sub-expressions that are literals).
is_alternation_literal(&self) -> bool688     pub fn is_alternation_literal(&self) -> bool {
689         self.info.is_alternation_literal()
690     }
691 }
692 
693 impl HirKind {
694     /// Return true if and only if this HIR is the empty regular expression.
695     ///
696     /// Note that this is not defined inductively. That is, it only tests if
697     /// this kind is the `Empty` variant. To get the inductive definition,
698     /// use the `is_match_empty` method on [`Hir`](struct.Hir.html).
is_empty(&self) -> bool699     pub fn is_empty(&self) -> bool {
700         match *self {
701             HirKind::Empty => true,
702             _ => false,
703         }
704     }
705 
706     /// Returns true if and only if this kind has any (including possibly
707     /// empty) subexpressions.
has_subexprs(&self) -> bool708     pub fn has_subexprs(&self) -> bool {
709         match *self {
710             HirKind::Empty
711             | HirKind::Literal(_)
712             | HirKind::Class(_)
713             | HirKind::Anchor(_)
714             | HirKind::WordBoundary(_) => false,
715             HirKind::Group(_)
716             | HirKind::Repetition(_)
717             | HirKind::Concat(_)
718             | HirKind::Alternation(_) => true,
719         }
720     }
721 }
722 
723 /// Print a display representation of this Hir.
724 ///
725 /// The result of this is a valid regular expression pattern string.
726 ///
727 /// This implementation uses constant stack space and heap space proportional
728 /// to the size of the `Hir`.
729 impl fmt::Display for Hir {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result730     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
731         use crate::hir::print::Printer;
732         Printer::new().print(self, f)
733     }
734 }
735 
736 /// The high-level intermediate representation of a literal.
737 ///
738 /// A literal corresponds to a single character, where a character is either
739 /// defined by a Unicode scalar value or an arbitrary byte. Unicode characters
740 /// are preferred whenever possible. In particular, a `Byte` variant is only
741 /// ever produced when it could match invalid UTF-8.
742 #[derive(Clone, Debug, Eq, PartialEq)]
743 pub enum Literal {
744     /// A single character represented by a Unicode scalar value.
745     Unicode(char),
746     /// A single character represented by an arbitrary byte.
747     Byte(u8),
748 }
749 
750 impl Literal {
751     /// Returns true if and only if this literal corresponds to a Unicode
752     /// scalar value.
is_unicode(&self) -> bool753     pub fn is_unicode(&self) -> bool {
754         match *self {
755             Literal::Unicode(_) => true,
756             Literal::Byte(b) if b <= 0x7F => true,
757             Literal::Byte(_) => false,
758         }
759     }
760 }
761 
762 /// The high-level intermediate representation of a character class.
763 ///
764 /// A character class corresponds to a set of characters. A character is either
765 /// defined by a Unicode scalar value or a byte. Unicode characters are used
766 /// by default, while bytes are used when Unicode mode (via the `u` flag) is
767 /// disabled.
768 ///
769 /// A character class, regardless of its character type, is represented by a
770 /// sequence of non-overlapping non-adjacent ranges of characters.
771 ///
772 /// Note that unlike [`Literal`](enum.Literal.html), a `Bytes` variant may
773 /// be produced even when it exclusively matches valid UTF-8. This is because
774 /// a `Bytes` variant represents an intention by the author of the regular
775 /// expression to disable Unicode mode, which in turn impacts the semantics of
776 /// case insensitive matching. For example, `(?i)k` and `(?i-u)k` will not
777 /// match the same set of strings.
778 #[derive(Clone, Debug, Eq, PartialEq)]
779 pub enum Class {
780     /// A set of characters represented by Unicode scalar values.
781     Unicode(ClassUnicode),
782     /// A set of characters represented by arbitrary bytes (one byte per
783     /// character).
784     Bytes(ClassBytes),
785 }
786 
787 impl Class {
788     /// Apply Unicode simple case folding to this character class, in place.
789     /// The character class will be expanded to include all simple case folded
790     /// character variants.
791     ///
792     /// If this is a byte oriented character class, then this will be limited
793     /// to the ASCII ranges `A-Z` and `a-z`.
case_fold_simple(&mut self)794     pub fn case_fold_simple(&mut self) {
795         match *self {
796             Class::Unicode(ref mut x) => x.case_fold_simple(),
797             Class::Bytes(ref mut x) => x.case_fold_simple(),
798         }
799     }
800 
801     /// Negate this character class in place.
802     ///
803     /// After completion, this character class will contain precisely the
804     /// characters that weren't previously in the class.
negate(&mut self)805     pub fn negate(&mut self) {
806         match *self {
807             Class::Unicode(ref mut x) => x.negate(),
808             Class::Bytes(ref mut x) => x.negate(),
809         }
810     }
811 
812     /// Returns true if and only if this character class will only ever match
813     /// valid UTF-8.
814     ///
815     /// A character class can match invalid UTF-8 only when the following
816     /// conditions are met:
817     ///
818     /// 1. The translator was configured to permit generating an expression
819     ///    that can match invalid UTF-8. (By default, this is disabled.)
820     /// 2. Unicode mode (via the `u` flag) was disabled either in the concrete
821     ///    syntax or in the parser builder. By default, Unicode mode is
822     ///    enabled.
is_always_utf8(&self) -> bool823     pub fn is_always_utf8(&self) -> bool {
824         match *self {
825             Class::Unicode(_) => true,
826             Class::Bytes(ref x) => x.is_all_ascii(),
827         }
828     }
829 }
830 
831 /// A set of characters represented by Unicode scalar values.
832 #[derive(Clone, Debug, Eq, PartialEq)]
833 pub struct ClassUnicode {
834     set: IntervalSet<ClassUnicodeRange>,
835 }
836 
837 impl ClassUnicode {
838     /// Create a new class from a sequence of ranges.
839     ///
840     /// The given ranges do not need to be in any specific order, and ranges
841     /// may overlap.
new<I>(ranges: I) -> ClassUnicode where I: IntoIterator<Item = ClassUnicodeRange>,842     pub fn new<I>(ranges: I) -> ClassUnicode
843     where
844         I: IntoIterator<Item = ClassUnicodeRange>,
845     {
846         ClassUnicode { set: IntervalSet::new(ranges) }
847     }
848 
849     /// Create a new class with no ranges.
empty() -> ClassUnicode850     pub fn empty() -> ClassUnicode {
851         ClassUnicode::new(vec![])
852     }
853 
854     /// Add a new range to this set.
push(&mut self, range: ClassUnicodeRange)855     pub fn push(&mut self, range: ClassUnicodeRange) {
856         self.set.push(range);
857     }
858 
859     /// Return an iterator over all ranges in this class.
860     ///
861     /// The iterator yields ranges in ascending order.
iter(&self) -> ClassUnicodeIter<'_>862     pub fn iter(&self) -> ClassUnicodeIter<'_> {
863         ClassUnicodeIter(self.set.iter())
864     }
865 
866     /// Return the underlying ranges as a slice.
ranges(&self) -> &[ClassUnicodeRange]867     pub fn ranges(&self) -> &[ClassUnicodeRange] {
868         self.set.intervals()
869     }
870 
871     /// Expand this character class such that it contains all case folded
872     /// characters, according to Unicode's "simple" mapping. For example, if
873     /// this class consists of the range `a-z`, then applying case folding will
874     /// result in the class containing both the ranges `a-z` and `A-Z`.
875     ///
876     /// # Panics
877     ///
878     /// This routine panics when the case mapping data necessary for this
879     /// routine to complete is unavailable. This occurs when the `unicode-case`
880     /// feature is not enabled.
881     ///
882     /// Callers should prefer using `try_case_fold_simple` instead, which will
883     /// return an error instead of panicking.
case_fold_simple(&mut self)884     pub fn case_fold_simple(&mut self) {
885         self.set
886             .case_fold_simple()
887             .expect("unicode-case feature must be enabled");
888     }
889 
890     /// Expand this character class such that it contains all case folded
891     /// characters, according to Unicode's "simple" mapping. For example, if
892     /// this class consists of the range `a-z`, then applying case folding will
893     /// result in the class containing both the ranges `a-z` and `A-Z`.
894     ///
895     /// # Error
896     ///
897     /// This routine returns an error when the case mapping data necessary
898     /// for this routine to complete is unavailable. This occurs when the
899     /// `unicode-case` feature is not enabled.
try_case_fold_simple( &mut self, ) -> result::Result<(), CaseFoldError>900     pub fn try_case_fold_simple(
901         &mut self,
902     ) -> result::Result<(), CaseFoldError> {
903         self.set.case_fold_simple()
904     }
905 
906     /// Negate this character class.
907     ///
908     /// For all `c` where `c` is a Unicode scalar value, if `c` was in this
909     /// set, then it will not be in this set after negation.
negate(&mut self)910     pub fn negate(&mut self) {
911         self.set.negate();
912     }
913 
914     /// Union this character class with the given character class, in place.
union(&mut self, other: &ClassUnicode)915     pub fn union(&mut self, other: &ClassUnicode) {
916         self.set.union(&other.set);
917     }
918 
919     /// Intersect this character class with the given character class, in
920     /// place.
intersect(&mut self, other: &ClassUnicode)921     pub fn intersect(&mut self, other: &ClassUnicode) {
922         self.set.intersect(&other.set);
923     }
924 
925     /// Subtract the given character class from this character class, in place.
difference(&mut self, other: &ClassUnicode)926     pub fn difference(&mut self, other: &ClassUnicode) {
927         self.set.difference(&other.set);
928     }
929 
930     /// Compute the symmetric difference of the given character classes, in
931     /// place.
932     ///
933     /// This computes the symmetric difference of two character classes. This
934     /// removes all elements in this class that are also in the given class,
935     /// but all adds all elements from the given class that aren't in this
936     /// class. That is, the class will contain all elements in either class,
937     /// but will not contain any elements that are in both classes.
symmetric_difference(&mut self, other: &ClassUnicode)938     pub fn symmetric_difference(&mut self, other: &ClassUnicode) {
939         self.set.symmetric_difference(&other.set);
940     }
941 
942     /// Returns true if and only if this character class will either match
943     /// nothing or only ASCII bytes. Stated differently, this returns false
944     /// if and only if this class contains a non-ASCII codepoint.
is_all_ascii(&self) -> bool945     pub fn is_all_ascii(&self) -> bool {
946         self.set.intervals().last().map_or(true, |r| r.end <= '\x7F')
947     }
948 }
949 
950 /// An iterator over all ranges in a Unicode character class.
951 ///
952 /// The lifetime `'a` refers to the lifetime of the underlying class.
953 #[derive(Debug)]
954 pub struct ClassUnicodeIter<'a>(IntervalSetIter<'a, ClassUnicodeRange>);
955 
956 impl<'a> Iterator for ClassUnicodeIter<'a> {
957     type Item = &'a ClassUnicodeRange;
958 
next(&mut self) -> Option<&'a ClassUnicodeRange>959     fn next(&mut self) -> Option<&'a ClassUnicodeRange> {
960         self.0.next()
961     }
962 }
963 
964 /// A single range of characters represented by Unicode scalar values.
965 ///
966 /// The range is closed. That is, the start and end of the range are included
967 /// in the range.
968 #[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)]
969 pub struct ClassUnicodeRange {
970     start: char,
971     end: char,
972 }
973 
974 impl fmt::Debug for ClassUnicodeRange {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result975     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
976         let start = if !self.start.is_whitespace() && !self.start.is_control()
977         {
978             self.start.to_string()
979         } else {
980             format!("0x{:X}", self.start as u32)
981         };
982         let end = if !self.end.is_whitespace() && !self.end.is_control() {
983             self.end.to_string()
984         } else {
985             format!("0x{:X}", self.end as u32)
986         };
987         f.debug_struct("ClassUnicodeRange")
988             .field("start", &start)
989             .field("end", &end)
990             .finish()
991     }
992 }
993 
994 impl Interval for ClassUnicodeRange {
995     type Bound = char;
996 
997     #[inline]
lower(&self) -> char998     fn lower(&self) -> char {
999         self.start
1000     }
1001     #[inline]
upper(&self) -> char1002     fn upper(&self) -> char {
1003         self.end
1004     }
1005     #[inline]
set_lower(&mut self, bound: char)1006     fn set_lower(&mut self, bound: char) {
1007         self.start = bound;
1008     }
1009     #[inline]
set_upper(&mut self, bound: char)1010     fn set_upper(&mut self, bound: char) {
1011         self.end = bound;
1012     }
1013 
1014     /// Apply simple case folding to this Unicode scalar value range.
1015     ///
1016     /// Additional ranges are appended to the given vector. Canonical ordering
1017     /// is *not* maintained in the given vector.
case_fold_simple( &self, ranges: &mut Vec<ClassUnicodeRange>, ) -> Result<(), unicode::CaseFoldError>1018     fn case_fold_simple(
1019         &self,
1020         ranges: &mut Vec<ClassUnicodeRange>,
1021     ) -> Result<(), unicode::CaseFoldError> {
1022         if !unicode::contains_simple_case_mapping(self.start, self.end)? {
1023             return Ok(());
1024         }
1025         let start = self.start as u32;
1026         let end = (self.end as u32).saturating_add(1);
1027         let mut next_simple_cp = None;
1028         for cp in (start..end).filter_map(char::from_u32) {
1029             if next_simple_cp.map_or(false, |next| cp < next) {
1030                 continue;
1031             }
1032             let it = match unicode::simple_fold(cp)? {
1033                 Ok(it) => it,
1034                 Err(next) => {
1035                     next_simple_cp = next;
1036                     continue;
1037                 }
1038             };
1039             for cp_folded in it {
1040                 ranges.push(ClassUnicodeRange::new(cp_folded, cp_folded));
1041             }
1042         }
1043         Ok(())
1044     }
1045 }
1046 
1047 impl ClassUnicodeRange {
1048     /// Create a new Unicode scalar value range for a character class.
1049     ///
1050     /// The returned range is always in a canonical form. That is, the range
1051     /// returned always satisfies the invariant that `start <= end`.
new(start: char, end: char) -> ClassUnicodeRange1052     pub fn new(start: char, end: char) -> ClassUnicodeRange {
1053         ClassUnicodeRange::create(start, end)
1054     }
1055 
1056     /// Return the start of this range.
1057     ///
1058     /// The start of a range is always less than or equal to the end of the
1059     /// range.
start(&self) -> char1060     pub fn start(&self) -> char {
1061         self.start
1062     }
1063 
1064     /// Return the end of this range.
1065     ///
1066     /// The end of a range is always greater than or equal to the start of the
1067     /// range.
end(&self) -> char1068     pub fn end(&self) -> char {
1069         self.end
1070     }
1071 }
1072 
1073 /// A set of characters represented by arbitrary bytes (where one byte
1074 /// corresponds to one character).
1075 #[derive(Clone, Debug, Eq, PartialEq)]
1076 pub struct ClassBytes {
1077     set: IntervalSet<ClassBytesRange>,
1078 }
1079 
1080 impl ClassBytes {
1081     /// Create a new class from a sequence of ranges.
1082     ///
1083     /// The given ranges do not need to be in any specific order, and ranges
1084     /// may overlap.
new<I>(ranges: I) -> ClassBytes where I: IntoIterator<Item = ClassBytesRange>,1085     pub fn new<I>(ranges: I) -> ClassBytes
1086     where
1087         I: IntoIterator<Item = ClassBytesRange>,
1088     {
1089         ClassBytes { set: IntervalSet::new(ranges) }
1090     }
1091 
1092     /// Create a new class with no ranges.
empty() -> ClassBytes1093     pub fn empty() -> ClassBytes {
1094         ClassBytes::new(vec![])
1095     }
1096 
1097     /// Add a new range to this set.
push(&mut self, range: ClassBytesRange)1098     pub fn push(&mut self, range: ClassBytesRange) {
1099         self.set.push(range);
1100     }
1101 
1102     /// Return an iterator over all ranges in this class.
1103     ///
1104     /// The iterator yields ranges in ascending order.
iter(&self) -> ClassBytesIter<'_>1105     pub fn iter(&self) -> ClassBytesIter<'_> {
1106         ClassBytesIter(self.set.iter())
1107     }
1108 
1109     /// Return the underlying ranges as a slice.
ranges(&self) -> &[ClassBytesRange]1110     pub fn ranges(&self) -> &[ClassBytesRange] {
1111         self.set.intervals()
1112     }
1113 
1114     /// Expand this character class such that it contains all case folded
1115     /// characters. For example, if this class consists of the range `a-z`,
1116     /// then applying case folding will result in the class containing both the
1117     /// ranges `a-z` and `A-Z`.
1118     ///
1119     /// Note that this only applies ASCII case folding, which is limited to the
1120     /// characters `a-z` and `A-Z`.
case_fold_simple(&mut self)1121     pub fn case_fold_simple(&mut self) {
1122         self.set.case_fold_simple().expect("ASCII case folding never fails");
1123     }
1124 
1125     /// Negate this byte class.
1126     ///
1127     /// For all `b` where `b` is a any byte, if `b` was in this set, then it
1128     /// will not be in this set after negation.
negate(&mut self)1129     pub fn negate(&mut self) {
1130         self.set.negate();
1131     }
1132 
1133     /// Union this byte class with the given byte class, in place.
union(&mut self, other: &ClassBytes)1134     pub fn union(&mut self, other: &ClassBytes) {
1135         self.set.union(&other.set);
1136     }
1137 
1138     /// Intersect this byte class with the given byte class, in place.
intersect(&mut self, other: &ClassBytes)1139     pub fn intersect(&mut self, other: &ClassBytes) {
1140         self.set.intersect(&other.set);
1141     }
1142 
1143     /// Subtract the given byte class from this byte class, in place.
difference(&mut self, other: &ClassBytes)1144     pub fn difference(&mut self, other: &ClassBytes) {
1145         self.set.difference(&other.set);
1146     }
1147 
1148     /// Compute the symmetric difference of the given byte classes, in place.
1149     ///
1150     /// This computes the symmetric difference of two byte classes. This
1151     /// removes all elements in this class that are also in the given class,
1152     /// but all adds all elements from the given class that aren't in this
1153     /// class. That is, the class will contain all elements in either class,
1154     /// but will not contain any elements that are in both classes.
symmetric_difference(&mut self, other: &ClassBytes)1155     pub fn symmetric_difference(&mut self, other: &ClassBytes) {
1156         self.set.symmetric_difference(&other.set);
1157     }
1158 
1159     /// Returns true if and only if this character class will either match
1160     /// nothing or only ASCII bytes. Stated differently, this returns false
1161     /// if and only if this class contains a non-ASCII byte.
is_all_ascii(&self) -> bool1162     pub fn is_all_ascii(&self) -> bool {
1163         self.set.intervals().last().map_or(true, |r| r.end <= 0x7F)
1164     }
1165 }
1166 
1167 /// An iterator over all ranges in a byte character class.
1168 ///
1169 /// The lifetime `'a` refers to the lifetime of the underlying class.
1170 #[derive(Debug)]
1171 pub struct ClassBytesIter<'a>(IntervalSetIter<'a, ClassBytesRange>);
1172 
1173 impl<'a> Iterator for ClassBytesIter<'a> {
1174     type Item = &'a ClassBytesRange;
1175 
next(&mut self) -> Option<&'a ClassBytesRange>1176     fn next(&mut self) -> Option<&'a ClassBytesRange> {
1177         self.0.next()
1178     }
1179 }
1180 
1181 /// A single range of characters represented by arbitrary bytes.
1182 ///
1183 /// The range is closed. That is, the start and end of the range are included
1184 /// in the range.
1185 #[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)]
1186 pub struct ClassBytesRange {
1187     start: u8,
1188     end: u8,
1189 }
1190 
1191 impl Interval for ClassBytesRange {
1192     type Bound = u8;
1193 
1194     #[inline]
lower(&self) -> u81195     fn lower(&self) -> u8 {
1196         self.start
1197     }
1198     #[inline]
upper(&self) -> u81199     fn upper(&self) -> u8 {
1200         self.end
1201     }
1202     #[inline]
set_lower(&mut self, bound: u8)1203     fn set_lower(&mut self, bound: u8) {
1204         self.start = bound;
1205     }
1206     #[inline]
set_upper(&mut self, bound: u8)1207     fn set_upper(&mut self, bound: u8) {
1208         self.end = bound;
1209     }
1210 
1211     /// Apply simple case folding to this byte range. Only ASCII case mappings
1212     /// (for a-z) are applied.
1213     ///
1214     /// Additional ranges are appended to the given vector. Canonical ordering
1215     /// is *not* maintained in the given vector.
case_fold_simple( &self, ranges: &mut Vec<ClassBytesRange>, ) -> Result<(), unicode::CaseFoldError>1216     fn case_fold_simple(
1217         &self,
1218         ranges: &mut Vec<ClassBytesRange>,
1219     ) -> Result<(), unicode::CaseFoldError> {
1220         if !ClassBytesRange::new(b'a', b'z').is_intersection_empty(self) {
1221             let lower = cmp::max(self.start, b'a');
1222             let upper = cmp::min(self.end, b'z');
1223             ranges.push(ClassBytesRange::new(lower - 32, upper - 32));
1224         }
1225         if !ClassBytesRange::new(b'A', b'Z').is_intersection_empty(self) {
1226             let lower = cmp::max(self.start, b'A');
1227             let upper = cmp::min(self.end, b'Z');
1228             ranges.push(ClassBytesRange::new(lower + 32, upper + 32));
1229         }
1230         Ok(())
1231     }
1232 }
1233 
1234 impl ClassBytesRange {
1235     /// Create a new byte range for a character class.
1236     ///
1237     /// The returned range is always in a canonical form. That is, the range
1238     /// returned always satisfies the invariant that `start <= end`.
new(start: u8, end: u8) -> ClassBytesRange1239     pub fn new(start: u8, end: u8) -> ClassBytesRange {
1240         ClassBytesRange::create(start, end)
1241     }
1242 
1243     /// Return the start of this range.
1244     ///
1245     /// The start of a range is always less than or equal to the end of the
1246     /// range.
start(&self) -> u81247     pub fn start(&self) -> u8 {
1248         self.start
1249     }
1250 
1251     /// Return the end of this range.
1252     ///
1253     /// The end of a range is always greater than or equal to the start of the
1254     /// range.
end(&self) -> u81255     pub fn end(&self) -> u8 {
1256         self.end
1257     }
1258 }
1259 
1260 impl fmt::Debug for ClassBytesRange {
fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result1261     fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
1262         let mut debug = f.debug_struct("ClassBytesRange");
1263         if self.start <= 0x7F {
1264             debug.field("start", &(self.start as char));
1265         } else {
1266             debug.field("start", &self.start);
1267         }
1268         if self.end <= 0x7F {
1269             debug.field("end", &(self.end as char));
1270         } else {
1271             debug.field("end", &self.end);
1272         }
1273         debug.finish()
1274     }
1275 }
1276 
1277 /// The high-level intermediate representation for an anchor assertion.
1278 ///
1279 /// A matching anchor assertion is always zero-length.
1280 #[derive(Clone, Debug, Eq, PartialEq)]
1281 pub enum Anchor {
1282     /// Match the beginning of a line or the beginning of text. Specifically,
1283     /// this matches at the starting position of the input, or at the position
1284     /// immediately following a `\n` character.
1285     StartLine,
1286     /// Match the end of a line or the end of text. Specifically,
1287     /// this matches at the end position of the input, or at the position
1288     /// immediately preceding a `\n` character.
1289     EndLine,
1290     /// Match the beginning of text. Specifically, this matches at the starting
1291     /// position of the input.
1292     StartText,
1293     /// Match the end of text. Specifically, this matches at the ending
1294     /// position of the input.
1295     EndText,
1296 }
1297 
1298 /// The high-level intermediate representation for a word-boundary assertion.
1299 ///
1300 /// A matching word boundary assertion is always zero-length.
1301 #[derive(Clone, Debug, Eq, PartialEq)]
1302 pub enum WordBoundary {
1303     /// Match a Unicode-aware word boundary. That is, this matches a position
1304     /// where the left adjacent character and right adjacent character
1305     /// correspond to a word and non-word or a non-word and word character.
1306     Unicode,
1307     /// Match a Unicode-aware negation of a word boundary.
1308     UnicodeNegate,
1309     /// Match an ASCII-only word boundary. That is, this matches a position
1310     /// where the left adjacent character and right adjacent character
1311     /// correspond to a word and non-word or a non-word and word character.
1312     Ascii,
1313     /// Match an ASCII-only negation of a word boundary.
1314     AsciiNegate,
1315 }
1316 
1317 impl WordBoundary {
1318     /// Returns true if and only if this word boundary assertion is negated.
is_negated(&self) -> bool1319     pub fn is_negated(&self) -> bool {
1320         match *self {
1321             WordBoundary::Unicode | WordBoundary::Ascii => false,
1322             WordBoundary::UnicodeNegate | WordBoundary::AsciiNegate => true,
1323         }
1324     }
1325 }
1326 
1327 /// The high-level intermediate representation for a group.
1328 ///
1329 /// This represents one of three possible group types:
1330 ///
1331 /// 1. A non-capturing group (e.g., `(?:expr)`).
1332 /// 2. A capturing group (e.g., `(expr)`).
1333 /// 3. A named capturing group (e.g., `(?P<name>expr)`).
1334 #[derive(Clone, Debug, Eq, PartialEq)]
1335 pub struct Group {
1336     /// The kind of this group. If it is a capturing group, then the kind
1337     /// contains the capture group index (and the name, if it is a named
1338     /// group).
1339     pub kind: GroupKind,
1340     /// The expression inside the capturing group, which may be empty.
1341     pub hir: Box<Hir>,
1342 }
1343 
1344 /// The kind of group.
1345 #[derive(Clone, Debug, Eq, PartialEq)]
1346 pub enum GroupKind {
1347     /// A normal unnamed capturing group.
1348     ///
1349     /// The value is the capture index of the group.
1350     CaptureIndex(u32),
1351     /// A named capturing group.
1352     CaptureName {
1353         /// The name of the group.
1354         name: String,
1355         /// The capture index of the group.
1356         index: u32,
1357     },
1358     /// A non-capturing group.
1359     NonCapturing,
1360 }
1361 
1362 /// The high-level intermediate representation of a repetition operator.
1363 ///
1364 /// A repetition operator permits the repetition of an arbitrary
1365 /// sub-expression.
1366 #[derive(Clone, Debug, Eq, PartialEq)]
1367 pub struct Repetition {
1368     /// The kind of this repetition operator.
1369     pub kind: RepetitionKind,
1370     /// Whether this repetition operator is greedy or not. A greedy operator
1371     /// will match as much as it can. A non-greedy operator will match as
1372     /// little as it can.
1373     ///
1374     /// Typically, operators are greedy by default and are only non-greedy when
1375     /// a `?` suffix is used, e.g., `(expr)*` is greedy while `(expr)*?` is
1376     /// not. However, this can be inverted via the `U` "ungreedy" flag.
1377     pub greedy: bool,
1378     /// The expression being repeated.
1379     pub hir: Box<Hir>,
1380 }
1381 
1382 impl Repetition {
1383     /// Returns true if and only if this repetition operator makes it possible
1384     /// to match the empty string.
1385     ///
1386     /// Note that this is not defined inductively. For example, while `a*`
1387     /// will report `true`, `()+` will not, even though `()` matches the empty
1388     /// string and one or more occurrences of something that matches the empty
1389     /// string will always match the empty string. In order to get the
1390     /// inductive definition, see the corresponding method on
1391     /// [`Hir`](struct.Hir.html).
is_match_empty(&self) -> bool1392     pub fn is_match_empty(&self) -> bool {
1393         match self.kind {
1394             RepetitionKind::ZeroOrOne => true,
1395             RepetitionKind::ZeroOrMore => true,
1396             RepetitionKind::OneOrMore => false,
1397             RepetitionKind::Range(RepetitionRange::Exactly(m)) => m == 0,
1398             RepetitionKind::Range(RepetitionRange::AtLeast(m)) => m == 0,
1399             RepetitionKind::Range(RepetitionRange::Bounded(m, _)) => m == 0,
1400         }
1401     }
1402 }
1403 
1404 /// The kind of a repetition operator.
1405 #[derive(Clone, Debug, Eq, PartialEq)]
1406 pub enum RepetitionKind {
1407     /// Matches a sub-expression zero or one times.
1408     ZeroOrOne,
1409     /// Matches a sub-expression zero or more times.
1410     ZeroOrMore,
1411     /// Matches a sub-expression one or more times.
1412     OneOrMore,
1413     /// Matches a sub-expression within a bounded range of times.
1414     Range(RepetitionRange),
1415 }
1416 
1417 /// The kind of a counted repetition operator.
1418 #[derive(Clone, Debug, Eq, PartialEq)]
1419 pub enum RepetitionRange {
1420     /// Matches a sub-expression exactly this many times.
1421     Exactly(u32),
1422     /// Matches a sub-expression at least this many times.
1423     AtLeast(u32),
1424     /// Matches a sub-expression at least `m` times and at most `n` times.
1425     Bounded(u32, u32),
1426 }
1427 
1428 /// A custom `Drop` impl is used for `HirKind` such that it uses constant stack
1429 /// space but heap space proportional to the depth of the total `Hir`.
1430 impl Drop for Hir {
drop(&mut self)1431     fn drop(&mut self) {
1432         use std::mem;
1433 
1434         match *self.kind() {
1435             HirKind::Empty
1436             | HirKind::Literal(_)
1437             | HirKind::Class(_)
1438             | HirKind::Anchor(_)
1439             | HirKind::WordBoundary(_) => return,
1440             HirKind::Group(ref x) if !x.hir.kind.has_subexprs() => return,
1441             HirKind::Repetition(ref x) if !x.hir.kind.has_subexprs() => return,
1442             HirKind::Concat(ref x) if x.is_empty() => return,
1443             HirKind::Alternation(ref x) if x.is_empty() => return,
1444             _ => {}
1445         }
1446 
1447         let mut stack = vec![mem::replace(self, Hir::empty())];
1448         while let Some(mut expr) = stack.pop() {
1449             match expr.kind {
1450                 HirKind::Empty
1451                 | HirKind::Literal(_)
1452                 | HirKind::Class(_)
1453                 | HirKind::Anchor(_)
1454                 | HirKind::WordBoundary(_) => {}
1455                 HirKind::Group(ref mut x) => {
1456                     stack.push(mem::replace(&mut x.hir, Hir::empty()));
1457                 }
1458                 HirKind::Repetition(ref mut x) => {
1459                     stack.push(mem::replace(&mut x.hir, Hir::empty()));
1460                 }
1461                 HirKind::Concat(ref mut x) => {
1462                     stack.extend(x.drain(..));
1463                 }
1464                 HirKind::Alternation(ref mut x) => {
1465                     stack.extend(x.drain(..));
1466                 }
1467             }
1468         }
1469     }
1470 }
1471 
1472 /// A type that documents various attributes of an HIR expression.
1473 ///
1474 /// These attributes are typically defined inductively on the HIR.
1475 #[derive(Clone, Debug, Eq, PartialEq)]
1476 struct HirInfo {
1477     /// Represent yes/no questions by a bitfield to conserve space, since
1478     /// this is included in every HIR expression.
1479     ///
1480     /// If more attributes need to be added, it is OK to increase the size of
1481     /// this as appropriate.
1482     bools: u16,
1483 }
1484 
1485 // A simple macro for defining bitfield accessors/mutators.
1486 macro_rules! define_bool {
1487     ($bit:expr, $is_fn_name:ident, $set_fn_name:ident) => {
1488         fn $is_fn_name(&self) -> bool {
1489             self.bools & (0b1 << $bit) > 0
1490         }
1491 
1492         fn $set_fn_name(&mut self, yes: bool) {
1493             if yes {
1494                 self.bools |= 1 << $bit;
1495             } else {
1496                 self.bools &= !(1 << $bit);
1497             }
1498         }
1499     };
1500 }
1501 
1502 impl HirInfo {
new() -> HirInfo1503     fn new() -> HirInfo {
1504         HirInfo { bools: 0 }
1505     }
1506 
1507     define_bool!(0, is_always_utf8, set_always_utf8);
1508     define_bool!(1, is_all_assertions, set_all_assertions);
1509     define_bool!(2, is_anchored_start, set_anchored_start);
1510     define_bool!(3, is_anchored_end, set_anchored_end);
1511     define_bool!(4, is_line_anchored_start, set_line_anchored_start);
1512     define_bool!(5, is_line_anchored_end, set_line_anchored_end);
1513     define_bool!(6, is_any_anchored_start, set_any_anchored_start);
1514     define_bool!(7, is_any_anchored_end, set_any_anchored_end);
1515     define_bool!(8, is_match_empty, set_match_empty);
1516     define_bool!(9, is_literal, set_literal);
1517     define_bool!(10, is_alternation_literal, set_alternation_literal);
1518 }
1519 
1520 #[cfg(test)]
1521 mod tests {
1522     use super::*;
1523 
uclass(ranges: &[(char, char)]) -> ClassUnicode1524     fn uclass(ranges: &[(char, char)]) -> ClassUnicode {
1525         let ranges: Vec<ClassUnicodeRange> = ranges
1526             .iter()
1527             .map(|&(s, e)| ClassUnicodeRange::new(s, e))
1528             .collect();
1529         ClassUnicode::new(ranges)
1530     }
1531 
bclass(ranges: &[(u8, u8)]) -> ClassBytes1532     fn bclass(ranges: &[(u8, u8)]) -> ClassBytes {
1533         let ranges: Vec<ClassBytesRange> =
1534             ranges.iter().map(|&(s, e)| ClassBytesRange::new(s, e)).collect();
1535         ClassBytes::new(ranges)
1536     }
1537 
uranges(cls: &ClassUnicode) -> Vec<(char, char)>1538     fn uranges(cls: &ClassUnicode) -> Vec<(char, char)> {
1539         cls.iter().map(|x| (x.start(), x.end())).collect()
1540     }
1541 
1542     #[cfg(feature = "unicode-case")]
ucasefold(cls: &ClassUnicode) -> ClassUnicode1543     fn ucasefold(cls: &ClassUnicode) -> ClassUnicode {
1544         let mut cls_ = cls.clone();
1545         cls_.case_fold_simple();
1546         cls_
1547     }
1548 
uunion(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode1549     fn uunion(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
1550         let mut cls_ = cls1.clone();
1551         cls_.union(cls2);
1552         cls_
1553     }
1554 
uintersect(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode1555     fn uintersect(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
1556         let mut cls_ = cls1.clone();
1557         cls_.intersect(cls2);
1558         cls_
1559     }
1560 
udifference(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode1561     fn udifference(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode {
1562         let mut cls_ = cls1.clone();
1563         cls_.difference(cls2);
1564         cls_
1565     }
1566 
usymdifference( cls1: &ClassUnicode, cls2: &ClassUnicode, ) -> ClassUnicode1567     fn usymdifference(
1568         cls1: &ClassUnicode,
1569         cls2: &ClassUnicode,
1570     ) -> ClassUnicode {
1571         let mut cls_ = cls1.clone();
1572         cls_.symmetric_difference(cls2);
1573         cls_
1574     }
1575 
unegate(cls: &ClassUnicode) -> ClassUnicode1576     fn unegate(cls: &ClassUnicode) -> ClassUnicode {
1577         let mut cls_ = cls.clone();
1578         cls_.negate();
1579         cls_
1580     }
1581 
branges(cls: &ClassBytes) -> Vec<(u8, u8)>1582     fn branges(cls: &ClassBytes) -> Vec<(u8, u8)> {
1583         cls.iter().map(|x| (x.start(), x.end())).collect()
1584     }
1585 
bcasefold(cls: &ClassBytes) -> ClassBytes1586     fn bcasefold(cls: &ClassBytes) -> ClassBytes {
1587         let mut cls_ = cls.clone();
1588         cls_.case_fold_simple();
1589         cls_
1590     }
1591 
bunion(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes1592     fn bunion(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
1593         let mut cls_ = cls1.clone();
1594         cls_.union(cls2);
1595         cls_
1596     }
1597 
bintersect(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes1598     fn bintersect(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
1599         let mut cls_ = cls1.clone();
1600         cls_.intersect(cls2);
1601         cls_
1602     }
1603 
bdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes1604     fn bdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
1605         let mut cls_ = cls1.clone();
1606         cls_.difference(cls2);
1607         cls_
1608     }
1609 
bsymdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes1610     fn bsymdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes {
1611         let mut cls_ = cls1.clone();
1612         cls_.symmetric_difference(cls2);
1613         cls_
1614     }
1615 
bnegate(cls: &ClassBytes) -> ClassBytes1616     fn bnegate(cls: &ClassBytes) -> ClassBytes {
1617         let mut cls_ = cls.clone();
1618         cls_.negate();
1619         cls_
1620     }
1621 
1622     #[test]
class_range_canonical_unicode()1623     fn class_range_canonical_unicode() {
1624         let range = ClassUnicodeRange::new('\u{00FF}', '\0');
1625         assert_eq!('\0', range.start());
1626         assert_eq!('\u{00FF}', range.end());
1627     }
1628 
1629     #[test]
class_range_canonical_bytes()1630     fn class_range_canonical_bytes() {
1631         let range = ClassBytesRange::new(b'\xFF', b'\0');
1632         assert_eq!(b'\0', range.start());
1633         assert_eq!(b'\xFF', range.end());
1634     }
1635 
1636     #[test]
class_canonicalize_unicode()1637     fn class_canonicalize_unicode() {
1638         let cls = uclass(&[('a', 'c'), ('x', 'z')]);
1639         let expected = vec![('a', 'c'), ('x', 'z')];
1640         assert_eq!(expected, uranges(&cls));
1641 
1642         let cls = uclass(&[('x', 'z'), ('a', 'c')]);
1643         let expected = vec![('a', 'c'), ('x', 'z')];
1644         assert_eq!(expected, uranges(&cls));
1645 
1646         let cls = uclass(&[('x', 'z'), ('w', 'y')]);
1647         let expected = vec![('w', 'z')];
1648         assert_eq!(expected, uranges(&cls));
1649 
1650         let cls = uclass(&[
1651             ('c', 'f'),
1652             ('a', 'g'),
1653             ('d', 'j'),
1654             ('a', 'c'),
1655             ('m', 'p'),
1656             ('l', 's'),
1657         ]);
1658         let expected = vec![('a', 'j'), ('l', 's')];
1659         assert_eq!(expected, uranges(&cls));
1660 
1661         let cls = uclass(&[('x', 'z'), ('u', 'w')]);
1662         let expected = vec![('u', 'z')];
1663         assert_eq!(expected, uranges(&cls));
1664 
1665         let cls = uclass(&[('\x00', '\u{10FFFF}'), ('\x00', '\u{10FFFF}')]);
1666         let expected = vec![('\x00', '\u{10FFFF}')];
1667         assert_eq!(expected, uranges(&cls));
1668 
1669         let cls = uclass(&[('a', 'a'), ('b', 'b')]);
1670         let expected = vec![('a', 'b')];
1671         assert_eq!(expected, uranges(&cls));
1672     }
1673 
1674     #[test]
class_canonicalize_bytes()1675     fn class_canonicalize_bytes() {
1676         let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]);
1677         let expected = vec![(b'a', b'c'), (b'x', b'z')];
1678         assert_eq!(expected, branges(&cls));
1679 
1680         let cls = bclass(&[(b'x', b'z'), (b'a', b'c')]);
1681         let expected = vec![(b'a', b'c'), (b'x', b'z')];
1682         assert_eq!(expected, branges(&cls));
1683 
1684         let cls = bclass(&[(b'x', b'z'), (b'w', b'y')]);
1685         let expected = vec![(b'w', b'z')];
1686         assert_eq!(expected, branges(&cls));
1687 
1688         let cls = bclass(&[
1689             (b'c', b'f'),
1690             (b'a', b'g'),
1691             (b'd', b'j'),
1692             (b'a', b'c'),
1693             (b'm', b'p'),
1694             (b'l', b's'),
1695         ]);
1696         let expected = vec![(b'a', b'j'), (b'l', b's')];
1697         assert_eq!(expected, branges(&cls));
1698 
1699         let cls = bclass(&[(b'x', b'z'), (b'u', b'w')]);
1700         let expected = vec![(b'u', b'z')];
1701         assert_eq!(expected, branges(&cls));
1702 
1703         let cls = bclass(&[(b'\x00', b'\xFF'), (b'\x00', b'\xFF')]);
1704         let expected = vec![(b'\x00', b'\xFF')];
1705         assert_eq!(expected, branges(&cls));
1706 
1707         let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]);
1708         let expected = vec![(b'a', b'b')];
1709         assert_eq!(expected, branges(&cls));
1710     }
1711 
1712     #[test]
1713     #[cfg(feature = "unicode-case")]
class_case_fold_unicode()1714     fn class_case_fold_unicode() {
1715         let cls = uclass(&[
1716             ('C', 'F'),
1717             ('A', 'G'),
1718             ('D', 'J'),
1719             ('A', 'C'),
1720             ('M', 'P'),
1721             ('L', 'S'),
1722             ('c', 'f'),
1723         ]);
1724         let expected = uclass(&[
1725             ('A', 'J'),
1726             ('L', 'S'),
1727             ('a', 'j'),
1728             ('l', 's'),
1729             ('\u{17F}', '\u{17F}'),
1730         ]);
1731         assert_eq!(expected, ucasefold(&cls));
1732 
1733         let cls = uclass(&[('A', 'Z')]);
1734         let expected = uclass(&[
1735             ('A', 'Z'),
1736             ('a', 'z'),
1737             ('\u{17F}', '\u{17F}'),
1738             ('\u{212A}', '\u{212A}'),
1739         ]);
1740         assert_eq!(expected, ucasefold(&cls));
1741 
1742         let cls = uclass(&[('a', 'z')]);
1743         let expected = uclass(&[
1744             ('A', 'Z'),
1745             ('a', 'z'),
1746             ('\u{17F}', '\u{17F}'),
1747             ('\u{212A}', '\u{212A}'),
1748         ]);
1749         assert_eq!(expected, ucasefold(&cls));
1750 
1751         let cls = uclass(&[('A', 'A'), ('_', '_')]);
1752         let expected = uclass(&[('A', 'A'), ('_', '_'), ('a', 'a')]);
1753         assert_eq!(expected, ucasefold(&cls));
1754 
1755         let cls = uclass(&[('A', 'A'), ('=', '=')]);
1756         let expected = uclass(&[('=', '='), ('A', 'A'), ('a', 'a')]);
1757         assert_eq!(expected, ucasefold(&cls));
1758 
1759         let cls = uclass(&[('\x00', '\x10')]);
1760         assert_eq!(cls, ucasefold(&cls));
1761 
1762         let cls = uclass(&[('k', 'k')]);
1763         let expected =
1764             uclass(&[('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}')]);
1765         assert_eq!(expected, ucasefold(&cls));
1766 
1767         let cls = uclass(&[('@', '@')]);
1768         assert_eq!(cls, ucasefold(&cls));
1769     }
1770 
1771     #[test]
1772     #[cfg(not(feature = "unicode-case"))]
class_case_fold_unicode_disabled()1773     fn class_case_fold_unicode_disabled() {
1774         let mut cls = uclass(&[
1775             ('C', 'F'),
1776             ('A', 'G'),
1777             ('D', 'J'),
1778             ('A', 'C'),
1779             ('M', 'P'),
1780             ('L', 'S'),
1781             ('c', 'f'),
1782         ]);
1783         assert!(cls.try_case_fold_simple().is_err());
1784     }
1785 
1786     #[test]
1787     #[should_panic]
1788     #[cfg(not(feature = "unicode-case"))]
class_case_fold_unicode_disabled_panics()1789     fn class_case_fold_unicode_disabled_panics() {
1790         let mut cls = uclass(&[
1791             ('C', 'F'),
1792             ('A', 'G'),
1793             ('D', 'J'),
1794             ('A', 'C'),
1795             ('M', 'P'),
1796             ('L', 'S'),
1797             ('c', 'f'),
1798         ]);
1799         cls.case_fold_simple();
1800     }
1801 
1802     #[test]
class_case_fold_bytes()1803     fn class_case_fold_bytes() {
1804         let cls = bclass(&[
1805             (b'C', b'F'),
1806             (b'A', b'G'),
1807             (b'D', b'J'),
1808             (b'A', b'C'),
1809             (b'M', b'P'),
1810             (b'L', b'S'),
1811             (b'c', b'f'),
1812         ]);
1813         let expected =
1814             bclass(&[(b'A', b'J'), (b'L', b'S'), (b'a', b'j'), (b'l', b's')]);
1815         assert_eq!(expected, bcasefold(&cls));
1816 
1817         let cls = bclass(&[(b'A', b'Z')]);
1818         let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]);
1819         assert_eq!(expected, bcasefold(&cls));
1820 
1821         let cls = bclass(&[(b'a', b'z')]);
1822         let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]);
1823         assert_eq!(expected, bcasefold(&cls));
1824 
1825         let cls = bclass(&[(b'A', b'A'), (b'_', b'_')]);
1826         let expected = bclass(&[(b'A', b'A'), (b'_', b'_'), (b'a', b'a')]);
1827         assert_eq!(expected, bcasefold(&cls));
1828 
1829         let cls = bclass(&[(b'A', b'A'), (b'=', b'=')]);
1830         let expected = bclass(&[(b'=', b'='), (b'A', b'A'), (b'a', b'a')]);
1831         assert_eq!(expected, bcasefold(&cls));
1832 
1833         let cls = bclass(&[(b'\x00', b'\x10')]);
1834         assert_eq!(cls, bcasefold(&cls));
1835 
1836         let cls = bclass(&[(b'k', b'k')]);
1837         let expected = bclass(&[(b'K', b'K'), (b'k', b'k')]);
1838         assert_eq!(expected, bcasefold(&cls));
1839 
1840         let cls = bclass(&[(b'@', b'@')]);
1841         assert_eq!(cls, bcasefold(&cls));
1842     }
1843 
1844     #[test]
class_negate_unicode()1845     fn class_negate_unicode() {
1846         let cls = uclass(&[('a', 'a')]);
1847         let expected = uclass(&[('\x00', '\x60'), ('\x62', '\u{10FFFF}')]);
1848         assert_eq!(expected, unegate(&cls));
1849 
1850         let cls = uclass(&[('a', 'a'), ('b', 'b')]);
1851         let expected = uclass(&[('\x00', '\x60'), ('\x63', '\u{10FFFF}')]);
1852         assert_eq!(expected, unegate(&cls));
1853 
1854         let cls = uclass(&[('a', 'c'), ('x', 'z')]);
1855         let expected = uclass(&[
1856             ('\x00', '\x60'),
1857             ('\x64', '\x77'),
1858             ('\x7B', '\u{10FFFF}'),
1859         ]);
1860         assert_eq!(expected, unegate(&cls));
1861 
1862         let cls = uclass(&[('\x00', 'a')]);
1863         let expected = uclass(&[('\x62', '\u{10FFFF}')]);
1864         assert_eq!(expected, unegate(&cls));
1865 
1866         let cls = uclass(&[('a', '\u{10FFFF}')]);
1867         let expected = uclass(&[('\x00', '\x60')]);
1868         assert_eq!(expected, unegate(&cls));
1869 
1870         let cls = uclass(&[('\x00', '\u{10FFFF}')]);
1871         let expected = uclass(&[]);
1872         assert_eq!(expected, unegate(&cls));
1873 
1874         let cls = uclass(&[]);
1875         let expected = uclass(&[('\x00', '\u{10FFFF}')]);
1876         assert_eq!(expected, unegate(&cls));
1877 
1878         let cls =
1879             uclass(&[('\x00', '\u{10FFFD}'), ('\u{10FFFF}', '\u{10FFFF}')]);
1880         let expected = uclass(&[('\u{10FFFE}', '\u{10FFFE}')]);
1881         assert_eq!(expected, unegate(&cls));
1882 
1883         let cls = uclass(&[('\x00', '\u{D7FF}')]);
1884         let expected = uclass(&[('\u{E000}', '\u{10FFFF}')]);
1885         assert_eq!(expected, unegate(&cls));
1886 
1887         let cls = uclass(&[('\x00', '\u{D7FE}')]);
1888         let expected = uclass(&[('\u{D7FF}', '\u{10FFFF}')]);
1889         assert_eq!(expected, unegate(&cls));
1890 
1891         let cls = uclass(&[('\u{E000}', '\u{10FFFF}')]);
1892         let expected = uclass(&[('\x00', '\u{D7FF}')]);
1893         assert_eq!(expected, unegate(&cls));
1894 
1895         let cls = uclass(&[('\u{E001}', '\u{10FFFF}')]);
1896         let expected = uclass(&[('\x00', '\u{E000}')]);
1897         assert_eq!(expected, unegate(&cls));
1898     }
1899 
1900     #[test]
class_negate_bytes()1901     fn class_negate_bytes() {
1902         let cls = bclass(&[(b'a', b'a')]);
1903         let expected = bclass(&[(b'\x00', b'\x60'), (b'\x62', b'\xFF')]);
1904         assert_eq!(expected, bnegate(&cls));
1905 
1906         let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]);
1907         let expected = bclass(&[(b'\x00', b'\x60'), (b'\x63', b'\xFF')]);
1908         assert_eq!(expected, bnegate(&cls));
1909 
1910         let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]);
1911         let expected = bclass(&[
1912             (b'\x00', b'\x60'),
1913             (b'\x64', b'\x77'),
1914             (b'\x7B', b'\xFF'),
1915         ]);
1916         assert_eq!(expected, bnegate(&cls));
1917 
1918         let cls = bclass(&[(b'\x00', b'a')]);
1919         let expected = bclass(&[(b'\x62', b'\xFF')]);
1920         assert_eq!(expected, bnegate(&cls));
1921 
1922         let cls = bclass(&[(b'a', b'\xFF')]);
1923         let expected = bclass(&[(b'\x00', b'\x60')]);
1924         assert_eq!(expected, bnegate(&cls));
1925 
1926         let cls = bclass(&[(b'\x00', b'\xFF')]);
1927         let expected = bclass(&[]);
1928         assert_eq!(expected, bnegate(&cls));
1929 
1930         let cls = bclass(&[]);
1931         let expected = bclass(&[(b'\x00', b'\xFF')]);
1932         assert_eq!(expected, bnegate(&cls));
1933 
1934         let cls = bclass(&[(b'\x00', b'\xFD'), (b'\xFF', b'\xFF')]);
1935         let expected = bclass(&[(b'\xFE', b'\xFE')]);
1936         assert_eq!(expected, bnegate(&cls));
1937     }
1938 
1939     #[test]
class_union_unicode()1940     fn class_union_unicode() {
1941         let cls1 = uclass(&[('a', 'g'), ('m', 't'), ('A', 'C')]);
1942         let cls2 = uclass(&[('a', 'z')]);
1943         let expected = uclass(&[('a', 'z'), ('A', 'C')]);
1944         assert_eq!(expected, uunion(&cls1, &cls2));
1945     }
1946 
1947     #[test]
class_union_bytes()1948     fn class_union_bytes() {
1949         let cls1 = bclass(&[(b'a', b'g'), (b'm', b't'), (b'A', b'C')]);
1950         let cls2 = bclass(&[(b'a', b'z')]);
1951         let expected = bclass(&[(b'a', b'z'), (b'A', b'C')]);
1952         assert_eq!(expected, bunion(&cls1, &cls2));
1953     }
1954 
1955     #[test]
class_intersect_unicode()1956     fn class_intersect_unicode() {
1957         let cls1 = uclass(&[]);
1958         let cls2 = uclass(&[('a', 'a')]);
1959         let expected = uclass(&[]);
1960         assert_eq!(expected, uintersect(&cls1, &cls2));
1961 
1962         let cls1 = uclass(&[('a', 'a')]);
1963         let cls2 = uclass(&[('a', 'a')]);
1964         let expected = uclass(&[('a', 'a')]);
1965         assert_eq!(expected, uintersect(&cls1, &cls2));
1966 
1967         let cls1 = uclass(&[('a', 'a')]);
1968         let cls2 = uclass(&[('b', 'b')]);
1969         let expected = uclass(&[]);
1970         assert_eq!(expected, uintersect(&cls1, &cls2));
1971 
1972         let cls1 = uclass(&[('a', 'a')]);
1973         let cls2 = uclass(&[('a', 'c')]);
1974         let expected = uclass(&[('a', 'a')]);
1975         assert_eq!(expected, uintersect(&cls1, &cls2));
1976 
1977         let cls1 = uclass(&[('a', 'b')]);
1978         let cls2 = uclass(&[('a', 'c')]);
1979         let expected = uclass(&[('a', 'b')]);
1980         assert_eq!(expected, uintersect(&cls1, &cls2));
1981 
1982         let cls1 = uclass(&[('a', 'b')]);
1983         let cls2 = uclass(&[('b', 'c')]);
1984         let expected = uclass(&[('b', 'b')]);
1985         assert_eq!(expected, uintersect(&cls1, &cls2));
1986 
1987         let cls1 = uclass(&[('a', 'b')]);
1988         let cls2 = uclass(&[('c', 'd')]);
1989         let expected = uclass(&[]);
1990         assert_eq!(expected, uintersect(&cls1, &cls2));
1991 
1992         let cls1 = uclass(&[('b', 'c')]);
1993         let cls2 = uclass(&[('a', 'd')]);
1994         let expected = uclass(&[('b', 'c')]);
1995         assert_eq!(expected, uintersect(&cls1, &cls2));
1996 
1997         let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
1998         let cls2 = uclass(&[('a', 'h')]);
1999         let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
2000         assert_eq!(expected, uintersect(&cls1, &cls2));
2001 
2002         let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
2003         let cls2 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
2004         let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
2005         assert_eq!(expected, uintersect(&cls1, &cls2));
2006 
2007         let cls1 = uclass(&[('a', 'b'), ('g', 'h')]);
2008         let cls2 = uclass(&[('d', 'e'), ('k', 'l')]);
2009         let expected = uclass(&[]);
2010         assert_eq!(expected, uintersect(&cls1, &cls2));
2011 
2012         let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]);
2013         let cls2 = uclass(&[('h', 'h')]);
2014         let expected = uclass(&[('h', 'h')]);
2015         assert_eq!(expected, uintersect(&cls1, &cls2));
2016 
2017         let cls1 = uclass(&[('a', 'b'), ('e', 'f'), ('i', 'j')]);
2018         let cls2 = uclass(&[('c', 'd'), ('g', 'h'), ('k', 'l')]);
2019         let expected = uclass(&[]);
2020         assert_eq!(expected, uintersect(&cls1, &cls2));
2021 
2022         let cls1 = uclass(&[('a', 'b'), ('c', 'd'), ('e', 'f')]);
2023         let cls2 = uclass(&[('b', 'c'), ('d', 'e'), ('f', 'g')]);
2024         let expected = uclass(&[('b', 'f')]);
2025         assert_eq!(expected, uintersect(&cls1, &cls2));
2026     }
2027 
2028     #[test]
class_intersect_bytes()2029     fn class_intersect_bytes() {
2030         let cls1 = bclass(&[]);
2031         let cls2 = bclass(&[(b'a', b'a')]);
2032         let expected = bclass(&[]);
2033         assert_eq!(expected, bintersect(&cls1, &cls2));
2034 
2035         let cls1 = bclass(&[(b'a', b'a')]);
2036         let cls2 = bclass(&[(b'a', b'a')]);
2037         let expected = bclass(&[(b'a', b'a')]);
2038         assert_eq!(expected, bintersect(&cls1, &cls2));
2039 
2040         let cls1 = bclass(&[(b'a', b'a')]);
2041         let cls2 = bclass(&[(b'b', b'b')]);
2042         let expected = bclass(&[]);
2043         assert_eq!(expected, bintersect(&cls1, &cls2));
2044 
2045         let cls1 = bclass(&[(b'a', b'a')]);
2046         let cls2 = bclass(&[(b'a', b'c')]);
2047         let expected = bclass(&[(b'a', b'a')]);
2048         assert_eq!(expected, bintersect(&cls1, &cls2));
2049 
2050         let cls1 = bclass(&[(b'a', b'b')]);
2051         let cls2 = bclass(&[(b'a', b'c')]);
2052         let expected = bclass(&[(b'a', b'b')]);
2053         assert_eq!(expected, bintersect(&cls1, &cls2));
2054 
2055         let cls1 = bclass(&[(b'a', b'b')]);
2056         let cls2 = bclass(&[(b'b', b'c')]);
2057         let expected = bclass(&[(b'b', b'b')]);
2058         assert_eq!(expected, bintersect(&cls1, &cls2));
2059 
2060         let cls1 = bclass(&[(b'a', b'b')]);
2061         let cls2 = bclass(&[(b'c', b'd')]);
2062         let expected = bclass(&[]);
2063         assert_eq!(expected, bintersect(&cls1, &cls2));
2064 
2065         let cls1 = bclass(&[(b'b', b'c')]);
2066         let cls2 = bclass(&[(b'a', b'd')]);
2067         let expected = bclass(&[(b'b', b'c')]);
2068         assert_eq!(expected, bintersect(&cls1, &cls2));
2069 
2070         let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
2071         let cls2 = bclass(&[(b'a', b'h')]);
2072         let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
2073         assert_eq!(expected, bintersect(&cls1, &cls2));
2074 
2075         let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
2076         let cls2 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
2077         let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
2078         assert_eq!(expected, bintersect(&cls1, &cls2));
2079 
2080         let cls1 = bclass(&[(b'a', b'b'), (b'g', b'h')]);
2081         let cls2 = bclass(&[(b'd', b'e'), (b'k', b'l')]);
2082         let expected = bclass(&[]);
2083         assert_eq!(expected, bintersect(&cls1, &cls2));
2084 
2085         let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]);
2086         let cls2 = bclass(&[(b'h', b'h')]);
2087         let expected = bclass(&[(b'h', b'h')]);
2088         assert_eq!(expected, bintersect(&cls1, &cls2));
2089 
2090         let cls1 = bclass(&[(b'a', b'b'), (b'e', b'f'), (b'i', b'j')]);
2091         let cls2 = bclass(&[(b'c', b'd'), (b'g', b'h'), (b'k', b'l')]);
2092         let expected = bclass(&[]);
2093         assert_eq!(expected, bintersect(&cls1, &cls2));
2094 
2095         let cls1 = bclass(&[(b'a', b'b'), (b'c', b'd'), (b'e', b'f')]);
2096         let cls2 = bclass(&[(b'b', b'c'), (b'd', b'e'), (b'f', b'g')]);
2097         let expected = bclass(&[(b'b', b'f')]);
2098         assert_eq!(expected, bintersect(&cls1, &cls2));
2099     }
2100 
2101     #[test]
class_difference_unicode()2102     fn class_difference_unicode() {
2103         let cls1 = uclass(&[('a', 'a')]);
2104         let cls2 = uclass(&[('a', 'a')]);
2105         let expected = uclass(&[]);
2106         assert_eq!(expected, udifference(&cls1, &cls2));
2107 
2108         let cls1 = uclass(&[('a', 'a')]);
2109         let cls2 = uclass(&[]);
2110         let expected = uclass(&[('a', 'a')]);
2111         assert_eq!(expected, udifference(&cls1, &cls2));
2112 
2113         let cls1 = uclass(&[]);
2114         let cls2 = uclass(&[('a', 'a')]);
2115         let expected = uclass(&[]);
2116         assert_eq!(expected, udifference(&cls1, &cls2));
2117 
2118         let cls1 = uclass(&[('a', 'z')]);
2119         let cls2 = uclass(&[('a', 'a')]);
2120         let expected = uclass(&[('b', 'z')]);
2121         assert_eq!(expected, udifference(&cls1, &cls2));
2122 
2123         let cls1 = uclass(&[('a', 'z')]);
2124         let cls2 = uclass(&[('z', 'z')]);
2125         let expected = uclass(&[('a', 'y')]);
2126         assert_eq!(expected, udifference(&cls1, &cls2));
2127 
2128         let cls1 = uclass(&[('a', 'z')]);
2129         let cls2 = uclass(&[('m', 'm')]);
2130         let expected = uclass(&[('a', 'l'), ('n', 'z')]);
2131         assert_eq!(expected, udifference(&cls1, &cls2));
2132 
2133         let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
2134         let cls2 = uclass(&[('a', 'z')]);
2135         let expected = uclass(&[]);
2136         assert_eq!(expected, udifference(&cls1, &cls2));
2137 
2138         let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
2139         let cls2 = uclass(&[('d', 'v')]);
2140         let expected = uclass(&[('a', 'c')]);
2141         assert_eq!(expected, udifference(&cls1, &cls2));
2142 
2143         let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
2144         let cls2 = uclass(&[('b', 'g'), ('s', 'u')]);
2145         let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]);
2146         assert_eq!(expected, udifference(&cls1, &cls2));
2147 
2148         let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]);
2149         let cls2 = uclass(&[('b', 'd'), ('e', 'g'), ('s', 'u')]);
2150         let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]);
2151         assert_eq!(expected, udifference(&cls1, &cls2));
2152 
2153         let cls1 = uclass(&[('x', 'z')]);
2154         let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]);
2155         let expected = uclass(&[('x', 'z')]);
2156         assert_eq!(expected, udifference(&cls1, &cls2));
2157 
2158         let cls1 = uclass(&[('a', 'z')]);
2159         let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]);
2160         let expected = uclass(&[('d', 'd'), ('h', 'r'), ('v', 'z')]);
2161         assert_eq!(expected, udifference(&cls1, &cls2));
2162     }
2163 
2164     #[test]
class_difference_bytes()2165     fn class_difference_bytes() {
2166         let cls1 = bclass(&[(b'a', b'a')]);
2167         let cls2 = bclass(&[(b'a', b'a')]);
2168         let expected = bclass(&[]);
2169         assert_eq!(expected, bdifference(&cls1, &cls2));
2170 
2171         let cls1 = bclass(&[(b'a', b'a')]);
2172         let cls2 = bclass(&[]);
2173         let expected = bclass(&[(b'a', b'a')]);
2174         assert_eq!(expected, bdifference(&cls1, &cls2));
2175 
2176         let cls1 = bclass(&[]);
2177         let cls2 = bclass(&[(b'a', b'a')]);
2178         let expected = bclass(&[]);
2179         assert_eq!(expected, bdifference(&cls1, &cls2));
2180 
2181         let cls1 = bclass(&[(b'a', b'z')]);
2182         let cls2 = bclass(&[(b'a', b'a')]);
2183         let expected = bclass(&[(b'b', b'z')]);
2184         assert_eq!(expected, bdifference(&cls1, &cls2));
2185 
2186         let cls1 = bclass(&[(b'a', b'z')]);
2187         let cls2 = bclass(&[(b'z', b'z')]);
2188         let expected = bclass(&[(b'a', b'y')]);
2189         assert_eq!(expected, bdifference(&cls1, &cls2));
2190 
2191         let cls1 = bclass(&[(b'a', b'z')]);
2192         let cls2 = bclass(&[(b'm', b'm')]);
2193         let expected = bclass(&[(b'a', b'l'), (b'n', b'z')]);
2194         assert_eq!(expected, bdifference(&cls1, &cls2));
2195 
2196         let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
2197         let cls2 = bclass(&[(b'a', b'z')]);
2198         let expected = bclass(&[]);
2199         assert_eq!(expected, bdifference(&cls1, &cls2));
2200 
2201         let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
2202         let cls2 = bclass(&[(b'd', b'v')]);
2203         let expected = bclass(&[(b'a', b'c')]);
2204         assert_eq!(expected, bdifference(&cls1, &cls2));
2205 
2206         let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
2207         let cls2 = bclass(&[(b'b', b'g'), (b's', b'u')]);
2208         let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]);
2209         assert_eq!(expected, bdifference(&cls1, &cls2));
2210 
2211         let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]);
2212         let cls2 = bclass(&[(b'b', b'd'), (b'e', b'g'), (b's', b'u')]);
2213         let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]);
2214         assert_eq!(expected, bdifference(&cls1, &cls2));
2215 
2216         let cls1 = bclass(&[(b'x', b'z')]);
2217         let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]);
2218         let expected = bclass(&[(b'x', b'z')]);
2219         assert_eq!(expected, bdifference(&cls1, &cls2));
2220 
2221         let cls1 = bclass(&[(b'a', b'z')]);
2222         let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]);
2223         let expected = bclass(&[(b'd', b'd'), (b'h', b'r'), (b'v', b'z')]);
2224         assert_eq!(expected, bdifference(&cls1, &cls2));
2225     }
2226 
2227     #[test]
class_symmetric_difference_unicode()2228     fn class_symmetric_difference_unicode() {
2229         let cls1 = uclass(&[('a', 'm')]);
2230         let cls2 = uclass(&[('g', 't')]);
2231         let expected = uclass(&[('a', 'f'), ('n', 't')]);
2232         assert_eq!(expected, usymdifference(&cls1, &cls2));
2233     }
2234 
2235     #[test]
class_symmetric_difference_bytes()2236     fn class_symmetric_difference_bytes() {
2237         let cls1 = bclass(&[(b'a', b'm')]);
2238         let cls2 = bclass(&[(b'g', b't')]);
2239         let expected = bclass(&[(b'a', b'f'), (b'n', b't')]);
2240         assert_eq!(expected, bsymdifference(&cls1, &cls2));
2241     }
2242 
2243     #[test]
2244     #[should_panic]
hir_byte_literal_non_ascii()2245     fn hir_byte_literal_non_ascii() {
2246         Hir::literal(Literal::Byte(b'a'));
2247     }
2248 
2249     // We use a thread with an explicit stack size to test that our destructor
2250     // for Hir can handle arbitrarily sized expressions in constant stack
2251     // space. In case we run on a platform without threads (WASM?), we limit
2252     // this test to Windows/Unix.
2253     #[test]
2254     #[cfg(any(unix, windows))]
no_stack_overflow_on_drop()2255     fn no_stack_overflow_on_drop() {
2256         use std::thread;
2257 
2258         let run = || {
2259             let mut expr = Hir::empty();
2260             for _ in 0..100 {
2261                 expr = Hir::group(Group {
2262                     kind: GroupKind::NonCapturing,
2263                     hir: Box::new(expr),
2264                 });
2265                 expr = Hir::repetition(Repetition {
2266                     kind: RepetitionKind::ZeroOrOne,
2267                     greedy: true,
2268                     hir: Box::new(expr),
2269                 });
2270 
2271                 expr = Hir {
2272                     kind: HirKind::Concat(vec![expr]),
2273                     info: HirInfo::new(),
2274                 };
2275                 expr = Hir {
2276                     kind: HirKind::Alternation(vec![expr]),
2277                     info: HirInfo::new(),
2278                 };
2279             }
2280             assert!(!expr.kind.is_empty());
2281         };
2282 
2283         // We run our test on a thread with a small stack size so we can
2284         // force the issue more easily.
2285         thread::Builder::new()
2286             .stack_size(1 << 10)
2287             .spawn(run)
2288             .unwrap()
2289             .join()
2290             .unwrap();
2291     }
2292 }
2293