1 /*! 2 Defines a high-level intermediate representation for regular expressions. 3 */ 4 use std::char; 5 use std::cmp; 6 use std::error; 7 use std::fmt; 8 use std::result; 9 use std::u8; 10 11 use crate::ast::Span; 12 use crate::hir::interval::{Interval, IntervalSet, IntervalSetIter}; 13 use crate::unicode; 14 15 pub use crate::hir::visitor::{visit, Visitor}; 16 pub use crate::unicode::CaseFoldError; 17 18 mod interval; 19 pub mod literal; 20 pub mod print; 21 pub mod translate; 22 mod visitor; 23 24 /// An error that can occur while translating an `Ast` to a `Hir`. 25 #[derive(Clone, Debug, Eq, PartialEq)] 26 pub struct Error { 27 /// The kind of error. 28 kind: ErrorKind, 29 /// The original pattern that the translator's Ast was parsed from. Every 30 /// span in an error is a valid range into this string. 31 pattern: String, 32 /// The span of this error, derived from the Ast given to the translator. 33 span: Span, 34 } 35 36 impl Error { 37 /// Return the type of this error. kind(&self) -> &ErrorKind38 pub fn kind(&self) -> &ErrorKind { 39 &self.kind 40 } 41 42 /// The original pattern string in which this error occurred. 43 /// 44 /// Every span reported by this error is reported in terms of this string. pattern(&self) -> &str45 pub fn pattern(&self) -> &str { 46 &self.pattern 47 } 48 49 /// Return the span at which this error occurred. span(&self) -> &Span50 pub fn span(&self) -> &Span { 51 &self.span 52 } 53 } 54 55 /// The type of an error that occurred while building an `Hir`. 56 #[derive(Clone, Debug, Eq, PartialEq)] 57 pub enum ErrorKind { 58 /// This error occurs when a Unicode feature is used when Unicode 59 /// support is disabled. For example `(?-u:\pL)` would trigger this error. 60 UnicodeNotAllowed, 61 /// This error occurs when translating a pattern that could match a byte 62 /// sequence that isn't UTF-8 and `allow_invalid_utf8` was disabled. 63 InvalidUtf8, 64 /// This occurs when an unrecognized Unicode property name could not 65 /// be found. 66 UnicodePropertyNotFound, 67 /// This occurs when an unrecognized Unicode property value could not 68 /// be found. 69 UnicodePropertyValueNotFound, 70 /// This occurs when a Unicode-aware Perl character class (`\w`, `\s` or 71 /// `\d`) could not be found. This can occur when the `unicode-perl` 72 /// crate feature is not enabled. 73 UnicodePerlClassNotFound, 74 /// This occurs when the Unicode simple case mapping tables are not 75 /// available, and the regular expression required Unicode aware case 76 /// insensitivity. 77 UnicodeCaseUnavailable, 78 /// This occurs when the translator attempts to construct a character class 79 /// that is empty. 80 /// 81 /// Note that this restriction in the translator may be removed in the 82 /// future. 83 EmptyClassNotAllowed, 84 /// Hints that destructuring should not be exhaustive. 85 /// 86 /// This enum may grow additional variants, so this makes sure clients 87 /// don't count on exhaustive matching. (Otherwise, adding a new variant 88 /// could break existing code.) 89 #[doc(hidden)] 90 __Nonexhaustive, 91 } 92 93 impl ErrorKind { 94 // TODO: Remove this method entirely on the next breaking semver release. 95 #[allow(deprecated)] description(&self) -> &str96 fn description(&self) -> &str { 97 use self::ErrorKind::*; 98 match *self { 99 UnicodeNotAllowed => "Unicode not allowed here", 100 InvalidUtf8 => "pattern can match invalid UTF-8", 101 UnicodePropertyNotFound => "Unicode property not found", 102 UnicodePropertyValueNotFound => "Unicode property value not found", 103 UnicodePerlClassNotFound => { 104 "Unicode-aware Perl class not found \ 105 (make sure the unicode-perl feature is enabled)" 106 } 107 UnicodeCaseUnavailable => { 108 "Unicode-aware case insensitivity matching is not available \ 109 (make sure the unicode-case feature is enabled)" 110 } 111 EmptyClassNotAllowed => "empty character classes are not allowed", 112 __Nonexhaustive => unreachable!(), 113 } 114 } 115 } 116 117 impl error::Error for Error { 118 // TODO: Remove this method entirely on the next breaking semver release. 119 #[allow(deprecated)] description(&self) -> &str120 fn description(&self) -> &str { 121 self.kind.description() 122 } 123 } 124 125 impl fmt::Display for Error { fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result126 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 127 crate::error::Formatter::from(self).fmt(f) 128 } 129 } 130 131 impl fmt::Display for ErrorKind { fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result132 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 133 // TODO: Remove this on the next breaking semver release. 134 #[allow(deprecated)] 135 f.write_str(self.description()) 136 } 137 } 138 139 /// A high-level intermediate representation (HIR) for a regular expression. 140 /// 141 /// The HIR of a regular expression represents an intermediate step between its 142 /// abstract syntax (a structured description of the concrete syntax) and 143 /// compiled byte codes. The purpose of HIR is to make regular expressions 144 /// easier to analyze. In particular, the AST is much more complex than the 145 /// HIR. For example, while an AST supports arbitrarily nested character 146 /// classes, the HIR will flatten all nested classes into a single set. The HIR 147 /// will also "compile away" every flag present in the concrete syntax. For 148 /// example, users of HIR expressions never need to worry about case folding; 149 /// it is handled automatically by the translator (e.g., by translating `(?i)A` 150 /// to `[aA]`). 151 /// 152 /// If the HIR was produced by a translator that disallows invalid UTF-8, then 153 /// the HIR is guaranteed to match UTF-8 exclusively. 154 /// 155 /// This type defines its own destructor that uses constant stack space and 156 /// heap space proportional to the size of the HIR. 157 /// 158 /// The specific type of an HIR expression can be accessed via its `kind` 159 /// or `into_kind` methods. This extra level of indirection exists for two 160 /// reasons: 161 /// 162 /// 1. Construction of an HIR expression *must* use the constructor methods 163 /// on this `Hir` type instead of building the `HirKind` values directly. 164 /// This permits construction to enforce invariants like "concatenations 165 /// always consist of two or more sub-expressions." 166 /// 2. Every HIR expression contains attributes that are defined inductively, 167 /// and can be computed cheaply during the construction process. For 168 /// example, one such attribute is whether the expression must match at the 169 /// beginning of the text. 170 /// 171 /// Also, an `Hir`'s `fmt::Display` implementation prints an HIR as a regular 172 /// expression pattern string, and uses constant stack space and heap space 173 /// proportional to the size of the `Hir`. 174 #[derive(Clone, Debug, Eq, PartialEq)] 175 pub struct Hir { 176 /// The underlying HIR kind. 177 kind: HirKind, 178 /// Analysis info about this HIR, computed during construction. 179 info: HirInfo, 180 } 181 182 /// The kind of an arbitrary `Hir` expression. 183 #[derive(Clone, Debug, Eq, PartialEq)] 184 pub enum HirKind { 185 /// The empty regular expression, which matches everything, including the 186 /// empty string. 187 Empty, 188 /// A single literal character that matches exactly this character. 189 Literal(Literal), 190 /// A single character class that matches any of the characters in the 191 /// class. A class can either consist of Unicode scalar values as 192 /// characters, or it can use bytes. 193 Class(Class), 194 /// An anchor assertion. An anchor assertion match always has zero length. 195 Anchor(Anchor), 196 /// A word boundary assertion, which may or may not be Unicode aware. A 197 /// word boundary assertion match always has zero length. 198 WordBoundary(WordBoundary), 199 /// A repetition operation applied to a child expression. 200 Repetition(Repetition), 201 /// A possibly capturing group, which contains a child expression. 202 Group(Group), 203 /// A concatenation of expressions. A concatenation always has at least two 204 /// child expressions. 205 /// 206 /// A concatenation matches only if each of its child expression matches 207 /// one after the other. 208 Concat(Vec<Hir>), 209 /// An alternation of expressions. An alternation always has at least two 210 /// child expressions. 211 /// 212 /// An alternation matches only if at least one of its child expression 213 /// matches. If multiple expressions match, then the leftmost is preferred. 214 Alternation(Vec<Hir>), 215 } 216 217 impl Hir { 218 /// Returns a reference to the underlying HIR kind. kind(&self) -> &HirKind219 pub fn kind(&self) -> &HirKind { 220 &self.kind 221 } 222 223 /// Consumes ownership of this HIR expression and returns its underlying 224 /// `HirKind`. into_kind(mut self) -> HirKind225 pub fn into_kind(mut self) -> HirKind { 226 use std::mem; 227 mem::replace(&mut self.kind, HirKind::Empty) 228 } 229 230 /// Returns an empty HIR expression. 231 /// 232 /// An empty HIR expression always matches, including the empty string. empty() -> Hir233 pub fn empty() -> Hir { 234 let mut info = HirInfo::new(); 235 info.set_always_utf8(true); 236 info.set_all_assertions(true); 237 info.set_anchored_start(false); 238 info.set_anchored_end(false); 239 info.set_line_anchored_start(false); 240 info.set_line_anchored_end(false); 241 info.set_any_anchored_start(false); 242 info.set_any_anchored_end(false); 243 info.set_match_empty(true); 244 info.set_literal(false); 245 info.set_alternation_literal(false); 246 Hir { kind: HirKind::Empty, info: info } 247 } 248 249 /// Creates a literal HIR expression. 250 /// 251 /// If the given literal has a `Byte` variant with an ASCII byte, then this 252 /// method panics. This enforces the invariant that `Byte` variants are 253 /// only used to express matching of invalid UTF-8. literal(lit: Literal) -> Hir254 pub fn literal(lit: Literal) -> Hir { 255 if let Literal::Byte(b) = lit { 256 assert!(b > 0x7F); 257 } 258 259 let mut info = HirInfo::new(); 260 info.set_always_utf8(lit.is_unicode()); 261 info.set_all_assertions(false); 262 info.set_anchored_start(false); 263 info.set_anchored_end(false); 264 info.set_line_anchored_start(false); 265 info.set_line_anchored_end(false); 266 info.set_any_anchored_start(false); 267 info.set_any_anchored_end(false); 268 info.set_match_empty(false); 269 info.set_literal(true); 270 info.set_alternation_literal(true); 271 Hir { kind: HirKind::Literal(lit), info: info } 272 } 273 274 /// Creates a class HIR expression. class(class: Class) -> Hir275 pub fn class(class: Class) -> Hir { 276 let mut info = HirInfo::new(); 277 info.set_always_utf8(class.is_always_utf8()); 278 info.set_all_assertions(false); 279 info.set_anchored_start(false); 280 info.set_anchored_end(false); 281 info.set_line_anchored_start(false); 282 info.set_line_anchored_end(false); 283 info.set_any_anchored_start(false); 284 info.set_any_anchored_end(false); 285 info.set_match_empty(false); 286 info.set_literal(false); 287 info.set_alternation_literal(false); 288 Hir { kind: HirKind::Class(class), info: info } 289 } 290 291 /// Creates an anchor assertion HIR expression. anchor(anchor: Anchor) -> Hir292 pub fn anchor(anchor: Anchor) -> Hir { 293 let mut info = HirInfo::new(); 294 info.set_always_utf8(true); 295 info.set_all_assertions(true); 296 info.set_anchored_start(false); 297 info.set_anchored_end(false); 298 info.set_line_anchored_start(false); 299 info.set_line_anchored_end(false); 300 info.set_any_anchored_start(false); 301 info.set_any_anchored_end(false); 302 info.set_match_empty(true); 303 info.set_literal(false); 304 info.set_alternation_literal(false); 305 if let Anchor::StartText = anchor { 306 info.set_anchored_start(true); 307 info.set_line_anchored_start(true); 308 info.set_any_anchored_start(true); 309 } 310 if let Anchor::EndText = anchor { 311 info.set_anchored_end(true); 312 info.set_line_anchored_end(true); 313 info.set_any_anchored_end(true); 314 } 315 if let Anchor::StartLine = anchor { 316 info.set_line_anchored_start(true); 317 } 318 if let Anchor::EndLine = anchor { 319 info.set_line_anchored_end(true); 320 } 321 Hir { kind: HirKind::Anchor(anchor), info: info } 322 } 323 324 /// Creates a word boundary assertion HIR expression. word_boundary(word_boundary: WordBoundary) -> Hir325 pub fn word_boundary(word_boundary: WordBoundary) -> Hir { 326 let mut info = HirInfo::new(); 327 info.set_always_utf8(true); 328 info.set_all_assertions(true); 329 info.set_anchored_start(false); 330 info.set_anchored_end(false); 331 info.set_line_anchored_start(false); 332 info.set_line_anchored_end(false); 333 info.set_any_anchored_start(false); 334 info.set_any_anchored_end(false); 335 info.set_literal(false); 336 info.set_alternation_literal(false); 337 // A negated word boundary matches the empty string, but a normal 338 // word boundary does not! 339 info.set_match_empty(word_boundary.is_negated()); 340 // Negated ASCII word boundaries can match invalid UTF-8. 341 if let WordBoundary::AsciiNegate = word_boundary { 342 info.set_always_utf8(false); 343 } 344 Hir { kind: HirKind::WordBoundary(word_boundary), info: info } 345 } 346 347 /// Creates a repetition HIR expression. repetition(rep: Repetition) -> Hir348 pub fn repetition(rep: Repetition) -> Hir { 349 let mut info = HirInfo::new(); 350 info.set_always_utf8(rep.hir.is_always_utf8()); 351 info.set_all_assertions(rep.hir.is_all_assertions()); 352 // If this operator can match the empty string, then it can never 353 // be anchored. 354 info.set_anchored_start( 355 !rep.is_match_empty() && rep.hir.is_anchored_start(), 356 ); 357 info.set_anchored_end( 358 !rep.is_match_empty() && rep.hir.is_anchored_end(), 359 ); 360 info.set_line_anchored_start( 361 !rep.is_match_empty() && rep.hir.is_anchored_start(), 362 ); 363 info.set_line_anchored_end( 364 !rep.is_match_empty() && rep.hir.is_anchored_end(), 365 ); 366 info.set_any_anchored_start(rep.hir.is_any_anchored_start()); 367 info.set_any_anchored_end(rep.hir.is_any_anchored_end()); 368 info.set_match_empty(rep.is_match_empty() || rep.hir.is_match_empty()); 369 info.set_literal(false); 370 info.set_alternation_literal(false); 371 Hir { kind: HirKind::Repetition(rep), info: info } 372 } 373 374 /// Creates a group HIR expression. group(group: Group) -> Hir375 pub fn group(group: Group) -> Hir { 376 let mut info = HirInfo::new(); 377 info.set_always_utf8(group.hir.is_always_utf8()); 378 info.set_all_assertions(group.hir.is_all_assertions()); 379 info.set_anchored_start(group.hir.is_anchored_start()); 380 info.set_anchored_end(group.hir.is_anchored_end()); 381 info.set_line_anchored_start(group.hir.is_line_anchored_start()); 382 info.set_line_anchored_end(group.hir.is_line_anchored_end()); 383 info.set_any_anchored_start(group.hir.is_any_anchored_start()); 384 info.set_any_anchored_end(group.hir.is_any_anchored_end()); 385 info.set_match_empty(group.hir.is_match_empty()); 386 info.set_literal(false); 387 info.set_alternation_literal(false); 388 Hir { kind: HirKind::Group(group), info: info } 389 } 390 391 /// Returns the concatenation of the given expressions. 392 /// 393 /// This flattens the concatenation as appropriate. concat(mut exprs: Vec<Hir>) -> Hir394 pub fn concat(mut exprs: Vec<Hir>) -> Hir { 395 match exprs.len() { 396 0 => Hir::empty(), 397 1 => exprs.pop().unwrap(), 398 _ => { 399 let mut info = HirInfo::new(); 400 info.set_always_utf8(true); 401 info.set_all_assertions(true); 402 info.set_any_anchored_start(false); 403 info.set_any_anchored_end(false); 404 info.set_match_empty(true); 405 info.set_literal(true); 406 info.set_alternation_literal(true); 407 408 // Some attributes require analyzing all sub-expressions. 409 for e in &exprs { 410 let x = info.is_always_utf8() && e.is_always_utf8(); 411 info.set_always_utf8(x); 412 413 let x = info.is_all_assertions() && e.is_all_assertions(); 414 info.set_all_assertions(x); 415 416 let x = info.is_any_anchored_start() 417 || e.is_any_anchored_start(); 418 info.set_any_anchored_start(x); 419 420 let x = 421 info.is_any_anchored_end() || e.is_any_anchored_end(); 422 info.set_any_anchored_end(x); 423 424 let x = info.is_match_empty() && e.is_match_empty(); 425 info.set_match_empty(x); 426 427 let x = info.is_literal() && e.is_literal(); 428 info.set_literal(x); 429 430 let x = info.is_alternation_literal() 431 && e.is_alternation_literal(); 432 info.set_alternation_literal(x); 433 } 434 // Anchored attributes require something slightly more 435 // sophisticated. Normally, WLOG, to determine whether an 436 // expression is anchored to the start, we'd only need to check 437 // the first expression of a concatenation. However, 438 // expressions like `$\b^` are still anchored to the start, 439 // but the first expression in the concatenation *isn't* 440 // anchored to the start. So the "first" expression to look at 441 // is actually one that is either not an assertion or is 442 // specifically the StartText assertion. 443 info.set_anchored_start( 444 exprs 445 .iter() 446 .take_while(|e| { 447 e.is_anchored_start() || e.is_all_assertions() 448 }) 449 .any(|e| e.is_anchored_start()), 450 ); 451 // Similarly for the end anchor, but in reverse. 452 info.set_anchored_end( 453 exprs 454 .iter() 455 .rev() 456 .take_while(|e| { 457 e.is_anchored_end() || e.is_all_assertions() 458 }) 459 .any(|e| e.is_anchored_end()), 460 ); 461 // Repeat the process for line anchors. 462 info.set_line_anchored_start( 463 exprs 464 .iter() 465 .take_while(|e| { 466 e.is_line_anchored_start() || e.is_all_assertions() 467 }) 468 .any(|e| e.is_line_anchored_start()), 469 ); 470 info.set_line_anchored_end( 471 exprs 472 .iter() 473 .rev() 474 .take_while(|e| { 475 e.is_line_anchored_end() || e.is_all_assertions() 476 }) 477 .any(|e| e.is_line_anchored_end()), 478 ); 479 Hir { kind: HirKind::Concat(exprs), info: info } 480 } 481 } 482 } 483 484 /// Returns the alternation of the given expressions. 485 /// 486 /// This flattens the alternation as appropriate. alternation(mut exprs: Vec<Hir>) -> Hir487 pub fn alternation(mut exprs: Vec<Hir>) -> Hir { 488 match exprs.len() { 489 0 => Hir::empty(), 490 1 => exprs.pop().unwrap(), 491 _ => { 492 let mut info = HirInfo::new(); 493 info.set_always_utf8(true); 494 info.set_all_assertions(true); 495 info.set_anchored_start(true); 496 info.set_anchored_end(true); 497 info.set_line_anchored_start(true); 498 info.set_line_anchored_end(true); 499 info.set_any_anchored_start(false); 500 info.set_any_anchored_end(false); 501 info.set_match_empty(false); 502 info.set_literal(false); 503 info.set_alternation_literal(true); 504 505 // Some attributes require analyzing all sub-expressions. 506 for e in &exprs { 507 let x = info.is_always_utf8() && e.is_always_utf8(); 508 info.set_always_utf8(x); 509 510 let x = info.is_all_assertions() && e.is_all_assertions(); 511 info.set_all_assertions(x); 512 513 let x = info.is_anchored_start() && e.is_anchored_start(); 514 info.set_anchored_start(x); 515 516 let x = info.is_anchored_end() && e.is_anchored_end(); 517 info.set_anchored_end(x); 518 519 let x = info.is_line_anchored_start() 520 && e.is_line_anchored_start(); 521 info.set_line_anchored_start(x); 522 523 let x = info.is_line_anchored_end() 524 && e.is_line_anchored_end(); 525 info.set_line_anchored_end(x); 526 527 let x = info.is_any_anchored_start() 528 || e.is_any_anchored_start(); 529 info.set_any_anchored_start(x); 530 531 let x = 532 info.is_any_anchored_end() || e.is_any_anchored_end(); 533 info.set_any_anchored_end(x); 534 535 let x = info.is_match_empty() || e.is_match_empty(); 536 info.set_match_empty(x); 537 538 let x = info.is_alternation_literal() && e.is_literal(); 539 info.set_alternation_literal(x); 540 } 541 Hir { kind: HirKind::Alternation(exprs), info: info } 542 } 543 } 544 } 545 546 /// Build an HIR expression for `.`. 547 /// 548 /// A `.` expression matches any character except for `\n`. To build an 549 /// expression that matches any character, including `\n`, use the `any` 550 /// method. 551 /// 552 /// If `bytes` is `true`, then this assumes characters are limited to a 553 /// single byte. dot(bytes: bool) -> Hir554 pub fn dot(bytes: bool) -> Hir { 555 if bytes { 556 let mut cls = ClassBytes::empty(); 557 cls.push(ClassBytesRange::new(b'\0', b'\x09')); 558 cls.push(ClassBytesRange::new(b'\x0B', b'\xFF')); 559 Hir::class(Class::Bytes(cls)) 560 } else { 561 let mut cls = ClassUnicode::empty(); 562 cls.push(ClassUnicodeRange::new('\0', '\x09')); 563 cls.push(ClassUnicodeRange::new('\x0B', '\u{10FFFF}')); 564 Hir::class(Class::Unicode(cls)) 565 } 566 } 567 568 /// Build an HIR expression for `(?s).`. 569 /// 570 /// A `(?s).` expression matches any character, including `\n`. To build an 571 /// expression that matches any character except for `\n`, then use the 572 /// `dot` method. 573 /// 574 /// If `bytes` is `true`, then this assumes characters are limited to a 575 /// single byte. any(bytes: bool) -> Hir576 pub fn any(bytes: bool) -> Hir { 577 if bytes { 578 let mut cls = ClassBytes::empty(); 579 cls.push(ClassBytesRange::new(b'\0', b'\xFF')); 580 Hir::class(Class::Bytes(cls)) 581 } else { 582 let mut cls = ClassUnicode::empty(); 583 cls.push(ClassUnicodeRange::new('\0', '\u{10FFFF}')); 584 Hir::class(Class::Unicode(cls)) 585 } 586 } 587 588 /// Return true if and only if this HIR will always match valid UTF-8. 589 /// 590 /// When this returns false, then it is possible for this HIR expression 591 /// to match invalid UTF-8. is_always_utf8(&self) -> bool592 pub fn is_always_utf8(&self) -> bool { 593 self.info.is_always_utf8() 594 } 595 596 /// Returns true if and only if this entire HIR expression is made up of 597 /// zero-width assertions. 598 /// 599 /// This includes expressions like `^$\b\A\z` and even `((\b)+())*^`, but 600 /// not `^a`. is_all_assertions(&self) -> bool601 pub fn is_all_assertions(&self) -> bool { 602 self.info.is_all_assertions() 603 } 604 605 /// Return true if and only if this HIR is required to match from the 606 /// beginning of text. This includes expressions like `^foo`, `^(foo|bar)`, 607 /// `^foo|^bar` but not `^foo|bar`. is_anchored_start(&self) -> bool608 pub fn is_anchored_start(&self) -> bool { 609 self.info.is_anchored_start() 610 } 611 612 /// Return true if and only if this HIR is required to match at the end 613 /// of text. This includes expressions like `foo$`, `(foo|bar)$`, 614 /// `foo$|bar$` but not `foo$|bar`. is_anchored_end(&self) -> bool615 pub fn is_anchored_end(&self) -> bool { 616 self.info.is_anchored_end() 617 } 618 619 /// Return true if and only if this HIR is required to match from the 620 /// beginning of text or the beginning of a line. This includes expressions 621 /// like `^foo`, `(?m)^foo`, `^(foo|bar)`, `^(foo|bar)`, `(?m)^foo|^bar` 622 /// but not `^foo|bar` or `(?m)^foo|bar`. 623 /// 624 /// Note that if `is_anchored_start` is `true`, then 625 /// `is_line_anchored_start` will also be `true`. The reverse implication 626 /// is not true. For example, `(?m)^foo` is line anchored, but not 627 /// `is_anchored_start`. is_line_anchored_start(&self) -> bool628 pub fn is_line_anchored_start(&self) -> bool { 629 self.info.is_line_anchored_start() 630 } 631 632 /// Return true if and only if this HIR is required to match at the 633 /// end of text or the end of a line. This includes expressions like 634 /// `foo$`, `(?m)foo$`, `(foo|bar)$`, `(?m)(foo|bar)$`, `foo$|bar$`, 635 /// `(?m)(foo|bar)$`, but not `foo$|bar` or `(?m)foo$|bar`. 636 /// 637 /// Note that if `is_anchored_end` is `true`, then 638 /// `is_line_anchored_end` will also be `true`. The reverse implication 639 /// is not true. For example, `(?m)foo$` is line anchored, but not 640 /// `is_anchored_end`. is_line_anchored_end(&self) -> bool641 pub fn is_line_anchored_end(&self) -> bool { 642 self.info.is_line_anchored_end() 643 } 644 645 /// Return true if and only if this HIR contains any sub-expression that 646 /// is required to match at the beginning of text. Specifically, this 647 /// returns true if the `^` symbol (when multiline mode is disabled) or the 648 /// `\A` escape appear anywhere in the regex. is_any_anchored_start(&self) -> bool649 pub fn is_any_anchored_start(&self) -> bool { 650 self.info.is_any_anchored_start() 651 } 652 653 /// Return true if and only if this HIR contains any sub-expression that is 654 /// required to match at the end of text. Specifically, this returns true 655 /// if the `$` symbol (when multiline mode is disabled) or the `\z` escape 656 /// appear anywhere in the regex. is_any_anchored_end(&self) -> bool657 pub fn is_any_anchored_end(&self) -> bool { 658 self.info.is_any_anchored_end() 659 } 660 661 /// Return true if and only if the empty string is part of the language 662 /// matched by this regular expression. 663 /// 664 /// This includes `a*`, `a?b*`, `a{0}`, `()`, `()+`, `^$`, `a|b?`, `\B`, 665 /// but not `a`, `a+` or `\b`. is_match_empty(&self) -> bool666 pub fn is_match_empty(&self) -> bool { 667 self.info.is_match_empty() 668 } 669 670 /// Return true if and only if this HIR is a simple literal. This is only 671 /// true when this HIR expression is either itself a `Literal` or a 672 /// concatenation of only `Literal`s. 673 /// 674 /// For example, `f` and `foo` are literals, but `f+`, `(foo)`, `foo()`, 675 /// `` are not (even though that contain sub-expressions that are literals). is_literal(&self) -> bool676 pub fn is_literal(&self) -> bool { 677 self.info.is_literal() 678 } 679 680 /// Return true if and only if this HIR is either a simple literal or an 681 /// alternation of simple literals. This is only 682 /// true when this HIR expression is either itself a `Literal` or a 683 /// concatenation of only `Literal`s or an alternation of only `Literal`s. 684 /// 685 /// For example, `f`, `foo`, `a|b|c`, and `foo|bar|baz` are alternation 686 /// literals, but `f+`, `(foo)`, `foo()`, `` 687 /// are not (even though that contain sub-expressions that are literals). is_alternation_literal(&self) -> bool688 pub fn is_alternation_literal(&self) -> bool { 689 self.info.is_alternation_literal() 690 } 691 } 692 693 impl HirKind { 694 /// Return true if and only if this HIR is the empty regular expression. 695 /// 696 /// Note that this is not defined inductively. That is, it only tests if 697 /// this kind is the `Empty` variant. To get the inductive definition, 698 /// use the `is_match_empty` method on [`Hir`](struct.Hir.html). is_empty(&self) -> bool699 pub fn is_empty(&self) -> bool { 700 match *self { 701 HirKind::Empty => true, 702 _ => false, 703 } 704 } 705 706 /// Returns true if and only if this kind has any (including possibly 707 /// empty) subexpressions. has_subexprs(&self) -> bool708 pub fn has_subexprs(&self) -> bool { 709 match *self { 710 HirKind::Empty 711 | HirKind::Literal(_) 712 | HirKind::Class(_) 713 | HirKind::Anchor(_) 714 | HirKind::WordBoundary(_) => false, 715 HirKind::Group(_) 716 | HirKind::Repetition(_) 717 | HirKind::Concat(_) 718 | HirKind::Alternation(_) => true, 719 } 720 } 721 } 722 723 /// Print a display representation of this Hir. 724 /// 725 /// The result of this is a valid regular expression pattern string. 726 /// 727 /// This implementation uses constant stack space and heap space proportional 728 /// to the size of the `Hir`. 729 impl fmt::Display for Hir { fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result730 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 731 use crate::hir::print::Printer; 732 Printer::new().print(self, f) 733 } 734 } 735 736 /// The high-level intermediate representation of a literal. 737 /// 738 /// A literal corresponds to a single character, where a character is either 739 /// defined by a Unicode scalar value or an arbitrary byte. Unicode characters 740 /// are preferred whenever possible. In particular, a `Byte` variant is only 741 /// ever produced when it could match invalid UTF-8. 742 #[derive(Clone, Debug, Eq, PartialEq)] 743 pub enum Literal { 744 /// A single character represented by a Unicode scalar value. 745 Unicode(char), 746 /// A single character represented by an arbitrary byte. 747 Byte(u8), 748 } 749 750 impl Literal { 751 /// Returns true if and only if this literal corresponds to a Unicode 752 /// scalar value. is_unicode(&self) -> bool753 pub fn is_unicode(&self) -> bool { 754 match *self { 755 Literal::Unicode(_) => true, 756 Literal::Byte(b) if b <= 0x7F => true, 757 Literal::Byte(_) => false, 758 } 759 } 760 } 761 762 /// The high-level intermediate representation of a character class. 763 /// 764 /// A character class corresponds to a set of characters. A character is either 765 /// defined by a Unicode scalar value or a byte. Unicode characters are used 766 /// by default, while bytes are used when Unicode mode (via the `u` flag) is 767 /// disabled. 768 /// 769 /// A character class, regardless of its character type, is represented by a 770 /// sequence of non-overlapping non-adjacent ranges of characters. 771 /// 772 /// Note that unlike [`Literal`](enum.Literal.html), a `Bytes` variant may 773 /// be produced even when it exclusively matches valid UTF-8. This is because 774 /// a `Bytes` variant represents an intention by the author of the regular 775 /// expression to disable Unicode mode, which in turn impacts the semantics of 776 /// case insensitive matching. For example, `(?i)k` and `(?i-u)k` will not 777 /// match the same set of strings. 778 #[derive(Clone, Debug, Eq, PartialEq)] 779 pub enum Class { 780 /// A set of characters represented by Unicode scalar values. 781 Unicode(ClassUnicode), 782 /// A set of characters represented by arbitrary bytes (one byte per 783 /// character). 784 Bytes(ClassBytes), 785 } 786 787 impl Class { 788 /// Apply Unicode simple case folding to this character class, in place. 789 /// The character class will be expanded to include all simple case folded 790 /// character variants. 791 /// 792 /// If this is a byte oriented character class, then this will be limited 793 /// to the ASCII ranges `A-Z` and `a-z`. case_fold_simple(&mut self)794 pub fn case_fold_simple(&mut self) { 795 match *self { 796 Class::Unicode(ref mut x) => x.case_fold_simple(), 797 Class::Bytes(ref mut x) => x.case_fold_simple(), 798 } 799 } 800 801 /// Negate this character class in place. 802 /// 803 /// After completion, this character class will contain precisely the 804 /// characters that weren't previously in the class. negate(&mut self)805 pub fn negate(&mut self) { 806 match *self { 807 Class::Unicode(ref mut x) => x.negate(), 808 Class::Bytes(ref mut x) => x.negate(), 809 } 810 } 811 812 /// Returns true if and only if this character class will only ever match 813 /// valid UTF-8. 814 /// 815 /// A character class can match invalid UTF-8 only when the following 816 /// conditions are met: 817 /// 818 /// 1. The translator was configured to permit generating an expression 819 /// that can match invalid UTF-8. (By default, this is disabled.) 820 /// 2. Unicode mode (via the `u` flag) was disabled either in the concrete 821 /// syntax or in the parser builder. By default, Unicode mode is 822 /// enabled. is_always_utf8(&self) -> bool823 pub fn is_always_utf8(&self) -> bool { 824 match *self { 825 Class::Unicode(_) => true, 826 Class::Bytes(ref x) => x.is_all_ascii(), 827 } 828 } 829 } 830 831 /// A set of characters represented by Unicode scalar values. 832 #[derive(Clone, Debug, Eq, PartialEq)] 833 pub struct ClassUnicode { 834 set: IntervalSet<ClassUnicodeRange>, 835 } 836 837 impl ClassUnicode { 838 /// Create a new class from a sequence of ranges. 839 /// 840 /// The given ranges do not need to be in any specific order, and ranges 841 /// may overlap. new<I>(ranges: I) -> ClassUnicode where I: IntoIterator<Item = ClassUnicodeRange>,842 pub fn new<I>(ranges: I) -> ClassUnicode 843 where 844 I: IntoIterator<Item = ClassUnicodeRange>, 845 { 846 ClassUnicode { set: IntervalSet::new(ranges) } 847 } 848 849 /// Create a new class with no ranges. empty() -> ClassUnicode850 pub fn empty() -> ClassUnicode { 851 ClassUnicode::new(vec![]) 852 } 853 854 /// Add a new range to this set. push(&mut self, range: ClassUnicodeRange)855 pub fn push(&mut self, range: ClassUnicodeRange) { 856 self.set.push(range); 857 } 858 859 /// Return an iterator over all ranges in this class. 860 /// 861 /// The iterator yields ranges in ascending order. iter(&self) -> ClassUnicodeIter<'_>862 pub fn iter(&self) -> ClassUnicodeIter<'_> { 863 ClassUnicodeIter(self.set.iter()) 864 } 865 866 /// Return the underlying ranges as a slice. ranges(&self) -> &[ClassUnicodeRange]867 pub fn ranges(&self) -> &[ClassUnicodeRange] { 868 self.set.intervals() 869 } 870 871 /// Expand this character class such that it contains all case folded 872 /// characters, according to Unicode's "simple" mapping. For example, if 873 /// this class consists of the range `a-z`, then applying case folding will 874 /// result in the class containing both the ranges `a-z` and `A-Z`. 875 /// 876 /// # Panics 877 /// 878 /// This routine panics when the case mapping data necessary for this 879 /// routine to complete is unavailable. This occurs when the `unicode-case` 880 /// feature is not enabled. 881 /// 882 /// Callers should prefer using `try_case_fold_simple` instead, which will 883 /// return an error instead of panicking. case_fold_simple(&mut self)884 pub fn case_fold_simple(&mut self) { 885 self.set 886 .case_fold_simple() 887 .expect("unicode-case feature must be enabled"); 888 } 889 890 /// Expand this character class such that it contains all case folded 891 /// characters, according to Unicode's "simple" mapping. For example, if 892 /// this class consists of the range `a-z`, then applying case folding will 893 /// result in the class containing both the ranges `a-z` and `A-Z`. 894 /// 895 /// # Error 896 /// 897 /// This routine returns an error when the case mapping data necessary 898 /// for this routine to complete is unavailable. This occurs when the 899 /// `unicode-case` feature is not enabled. try_case_fold_simple( &mut self, ) -> result::Result<(), CaseFoldError>900 pub fn try_case_fold_simple( 901 &mut self, 902 ) -> result::Result<(), CaseFoldError> { 903 self.set.case_fold_simple() 904 } 905 906 /// Negate this character class. 907 /// 908 /// For all `c` where `c` is a Unicode scalar value, if `c` was in this 909 /// set, then it will not be in this set after negation. negate(&mut self)910 pub fn negate(&mut self) { 911 self.set.negate(); 912 } 913 914 /// Union this character class with the given character class, in place. union(&mut self, other: &ClassUnicode)915 pub fn union(&mut self, other: &ClassUnicode) { 916 self.set.union(&other.set); 917 } 918 919 /// Intersect this character class with the given character class, in 920 /// place. intersect(&mut self, other: &ClassUnicode)921 pub fn intersect(&mut self, other: &ClassUnicode) { 922 self.set.intersect(&other.set); 923 } 924 925 /// Subtract the given character class from this character class, in place. difference(&mut self, other: &ClassUnicode)926 pub fn difference(&mut self, other: &ClassUnicode) { 927 self.set.difference(&other.set); 928 } 929 930 /// Compute the symmetric difference of the given character classes, in 931 /// place. 932 /// 933 /// This computes the symmetric difference of two character classes. This 934 /// removes all elements in this class that are also in the given class, 935 /// but all adds all elements from the given class that aren't in this 936 /// class. That is, the class will contain all elements in either class, 937 /// but will not contain any elements that are in both classes. symmetric_difference(&mut self, other: &ClassUnicode)938 pub fn symmetric_difference(&mut self, other: &ClassUnicode) { 939 self.set.symmetric_difference(&other.set); 940 } 941 942 /// Returns true if and only if this character class will either match 943 /// nothing or only ASCII bytes. Stated differently, this returns false 944 /// if and only if this class contains a non-ASCII codepoint. is_all_ascii(&self) -> bool945 pub fn is_all_ascii(&self) -> bool { 946 self.set.intervals().last().map_or(true, |r| r.end <= '\x7F') 947 } 948 } 949 950 /// An iterator over all ranges in a Unicode character class. 951 /// 952 /// The lifetime `'a` refers to the lifetime of the underlying class. 953 #[derive(Debug)] 954 pub struct ClassUnicodeIter<'a>(IntervalSetIter<'a, ClassUnicodeRange>); 955 956 impl<'a> Iterator for ClassUnicodeIter<'a> { 957 type Item = &'a ClassUnicodeRange; 958 next(&mut self) -> Option<&'a ClassUnicodeRange>959 fn next(&mut self) -> Option<&'a ClassUnicodeRange> { 960 self.0.next() 961 } 962 } 963 964 /// A single range of characters represented by Unicode scalar values. 965 /// 966 /// The range is closed. That is, the start and end of the range are included 967 /// in the range. 968 #[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)] 969 pub struct ClassUnicodeRange { 970 start: char, 971 end: char, 972 } 973 974 impl fmt::Debug for ClassUnicodeRange { fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result975 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 976 let start = if !self.start.is_whitespace() && !self.start.is_control() 977 { 978 self.start.to_string() 979 } else { 980 format!("0x{:X}", self.start as u32) 981 }; 982 let end = if !self.end.is_whitespace() && !self.end.is_control() { 983 self.end.to_string() 984 } else { 985 format!("0x{:X}", self.end as u32) 986 }; 987 f.debug_struct("ClassUnicodeRange") 988 .field("start", &start) 989 .field("end", &end) 990 .finish() 991 } 992 } 993 994 impl Interval for ClassUnicodeRange { 995 type Bound = char; 996 997 #[inline] lower(&self) -> char998 fn lower(&self) -> char { 999 self.start 1000 } 1001 #[inline] upper(&self) -> char1002 fn upper(&self) -> char { 1003 self.end 1004 } 1005 #[inline] set_lower(&mut self, bound: char)1006 fn set_lower(&mut self, bound: char) { 1007 self.start = bound; 1008 } 1009 #[inline] set_upper(&mut self, bound: char)1010 fn set_upper(&mut self, bound: char) { 1011 self.end = bound; 1012 } 1013 1014 /// Apply simple case folding to this Unicode scalar value range. 1015 /// 1016 /// Additional ranges are appended to the given vector. Canonical ordering 1017 /// is *not* maintained in the given vector. case_fold_simple( &self, ranges: &mut Vec<ClassUnicodeRange>, ) -> Result<(), unicode::CaseFoldError>1018 fn case_fold_simple( 1019 &self, 1020 ranges: &mut Vec<ClassUnicodeRange>, 1021 ) -> Result<(), unicode::CaseFoldError> { 1022 if !unicode::contains_simple_case_mapping(self.start, self.end)? { 1023 return Ok(()); 1024 } 1025 let start = self.start as u32; 1026 let end = (self.end as u32).saturating_add(1); 1027 let mut next_simple_cp = None; 1028 for cp in (start..end).filter_map(char::from_u32) { 1029 if next_simple_cp.map_or(false, |next| cp < next) { 1030 continue; 1031 } 1032 let it = match unicode::simple_fold(cp)? { 1033 Ok(it) => it, 1034 Err(next) => { 1035 next_simple_cp = next; 1036 continue; 1037 } 1038 }; 1039 for cp_folded in it { 1040 ranges.push(ClassUnicodeRange::new(cp_folded, cp_folded)); 1041 } 1042 } 1043 Ok(()) 1044 } 1045 } 1046 1047 impl ClassUnicodeRange { 1048 /// Create a new Unicode scalar value range for a character class. 1049 /// 1050 /// The returned range is always in a canonical form. That is, the range 1051 /// returned always satisfies the invariant that `start <= end`. new(start: char, end: char) -> ClassUnicodeRange1052 pub fn new(start: char, end: char) -> ClassUnicodeRange { 1053 ClassUnicodeRange::create(start, end) 1054 } 1055 1056 /// Return the start of this range. 1057 /// 1058 /// The start of a range is always less than or equal to the end of the 1059 /// range. start(&self) -> char1060 pub fn start(&self) -> char { 1061 self.start 1062 } 1063 1064 /// Return the end of this range. 1065 /// 1066 /// The end of a range is always greater than or equal to the start of the 1067 /// range. end(&self) -> char1068 pub fn end(&self) -> char { 1069 self.end 1070 } 1071 } 1072 1073 /// A set of characters represented by arbitrary bytes (where one byte 1074 /// corresponds to one character). 1075 #[derive(Clone, Debug, Eq, PartialEq)] 1076 pub struct ClassBytes { 1077 set: IntervalSet<ClassBytesRange>, 1078 } 1079 1080 impl ClassBytes { 1081 /// Create a new class from a sequence of ranges. 1082 /// 1083 /// The given ranges do not need to be in any specific order, and ranges 1084 /// may overlap. new<I>(ranges: I) -> ClassBytes where I: IntoIterator<Item = ClassBytesRange>,1085 pub fn new<I>(ranges: I) -> ClassBytes 1086 where 1087 I: IntoIterator<Item = ClassBytesRange>, 1088 { 1089 ClassBytes { set: IntervalSet::new(ranges) } 1090 } 1091 1092 /// Create a new class with no ranges. empty() -> ClassBytes1093 pub fn empty() -> ClassBytes { 1094 ClassBytes::new(vec![]) 1095 } 1096 1097 /// Add a new range to this set. push(&mut self, range: ClassBytesRange)1098 pub fn push(&mut self, range: ClassBytesRange) { 1099 self.set.push(range); 1100 } 1101 1102 /// Return an iterator over all ranges in this class. 1103 /// 1104 /// The iterator yields ranges in ascending order. iter(&self) -> ClassBytesIter<'_>1105 pub fn iter(&self) -> ClassBytesIter<'_> { 1106 ClassBytesIter(self.set.iter()) 1107 } 1108 1109 /// Return the underlying ranges as a slice. ranges(&self) -> &[ClassBytesRange]1110 pub fn ranges(&self) -> &[ClassBytesRange] { 1111 self.set.intervals() 1112 } 1113 1114 /// Expand this character class such that it contains all case folded 1115 /// characters. For example, if this class consists of the range `a-z`, 1116 /// then applying case folding will result in the class containing both the 1117 /// ranges `a-z` and `A-Z`. 1118 /// 1119 /// Note that this only applies ASCII case folding, which is limited to the 1120 /// characters `a-z` and `A-Z`. case_fold_simple(&mut self)1121 pub fn case_fold_simple(&mut self) { 1122 self.set.case_fold_simple().expect("ASCII case folding never fails"); 1123 } 1124 1125 /// Negate this byte class. 1126 /// 1127 /// For all `b` where `b` is a any byte, if `b` was in this set, then it 1128 /// will not be in this set after negation. negate(&mut self)1129 pub fn negate(&mut self) { 1130 self.set.negate(); 1131 } 1132 1133 /// Union this byte class with the given byte class, in place. union(&mut self, other: &ClassBytes)1134 pub fn union(&mut self, other: &ClassBytes) { 1135 self.set.union(&other.set); 1136 } 1137 1138 /// Intersect this byte class with the given byte class, in place. intersect(&mut self, other: &ClassBytes)1139 pub fn intersect(&mut self, other: &ClassBytes) { 1140 self.set.intersect(&other.set); 1141 } 1142 1143 /// Subtract the given byte class from this byte class, in place. difference(&mut self, other: &ClassBytes)1144 pub fn difference(&mut self, other: &ClassBytes) { 1145 self.set.difference(&other.set); 1146 } 1147 1148 /// Compute the symmetric difference of the given byte classes, in place. 1149 /// 1150 /// This computes the symmetric difference of two byte classes. This 1151 /// removes all elements in this class that are also in the given class, 1152 /// but all adds all elements from the given class that aren't in this 1153 /// class. That is, the class will contain all elements in either class, 1154 /// but will not contain any elements that are in both classes. symmetric_difference(&mut self, other: &ClassBytes)1155 pub fn symmetric_difference(&mut self, other: &ClassBytes) { 1156 self.set.symmetric_difference(&other.set); 1157 } 1158 1159 /// Returns true if and only if this character class will either match 1160 /// nothing or only ASCII bytes. Stated differently, this returns false 1161 /// if and only if this class contains a non-ASCII byte. is_all_ascii(&self) -> bool1162 pub fn is_all_ascii(&self) -> bool { 1163 self.set.intervals().last().map_or(true, |r| r.end <= 0x7F) 1164 } 1165 } 1166 1167 /// An iterator over all ranges in a byte character class. 1168 /// 1169 /// The lifetime `'a` refers to the lifetime of the underlying class. 1170 #[derive(Debug)] 1171 pub struct ClassBytesIter<'a>(IntervalSetIter<'a, ClassBytesRange>); 1172 1173 impl<'a> Iterator for ClassBytesIter<'a> { 1174 type Item = &'a ClassBytesRange; 1175 next(&mut self) -> Option<&'a ClassBytesRange>1176 fn next(&mut self) -> Option<&'a ClassBytesRange> { 1177 self.0.next() 1178 } 1179 } 1180 1181 /// A single range of characters represented by arbitrary bytes. 1182 /// 1183 /// The range is closed. That is, the start and end of the range are included 1184 /// in the range. 1185 #[derive(Clone, Copy, Default, Eq, PartialEq, PartialOrd, Ord)] 1186 pub struct ClassBytesRange { 1187 start: u8, 1188 end: u8, 1189 } 1190 1191 impl Interval for ClassBytesRange { 1192 type Bound = u8; 1193 1194 #[inline] lower(&self) -> u81195 fn lower(&self) -> u8 { 1196 self.start 1197 } 1198 #[inline] upper(&self) -> u81199 fn upper(&self) -> u8 { 1200 self.end 1201 } 1202 #[inline] set_lower(&mut self, bound: u8)1203 fn set_lower(&mut self, bound: u8) { 1204 self.start = bound; 1205 } 1206 #[inline] set_upper(&mut self, bound: u8)1207 fn set_upper(&mut self, bound: u8) { 1208 self.end = bound; 1209 } 1210 1211 /// Apply simple case folding to this byte range. Only ASCII case mappings 1212 /// (for a-z) are applied. 1213 /// 1214 /// Additional ranges are appended to the given vector. Canonical ordering 1215 /// is *not* maintained in the given vector. case_fold_simple( &self, ranges: &mut Vec<ClassBytesRange>, ) -> Result<(), unicode::CaseFoldError>1216 fn case_fold_simple( 1217 &self, 1218 ranges: &mut Vec<ClassBytesRange>, 1219 ) -> Result<(), unicode::CaseFoldError> { 1220 if !ClassBytesRange::new(b'a', b'z').is_intersection_empty(self) { 1221 let lower = cmp::max(self.start, b'a'); 1222 let upper = cmp::min(self.end, b'z'); 1223 ranges.push(ClassBytesRange::new(lower - 32, upper - 32)); 1224 } 1225 if !ClassBytesRange::new(b'A', b'Z').is_intersection_empty(self) { 1226 let lower = cmp::max(self.start, b'A'); 1227 let upper = cmp::min(self.end, b'Z'); 1228 ranges.push(ClassBytesRange::new(lower + 32, upper + 32)); 1229 } 1230 Ok(()) 1231 } 1232 } 1233 1234 impl ClassBytesRange { 1235 /// Create a new byte range for a character class. 1236 /// 1237 /// The returned range is always in a canonical form. That is, the range 1238 /// returned always satisfies the invariant that `start <= end`. new(start: u8, end: u8) -> ClassBytesRange1239 pub fn new(start: u8, end: u8) -> ClassBytesRange { 1240 ClassBytesRange::create(start, end) 1241 } 1242 1243 /// Return the start of this range. 1244 /// 1245 /// The start of a range is always less than or equal to the end of the 1246 /// range. start(&self) -> u81247 pub fn start(&self) -> u8 { 1248 self.start 1249 } 1250 1251 /// Return the end of this range. 1252 /// 1253 /// The end of a range is always greater than or equal to the start of the 1254 /// range. end(&self) -> u81255 pub fn end(&self) -> u8 { 1256 self.end 1257 } 1258 } 1259 1260 impl fmt::Debug for ClassBytesRange { fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result1261 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { 1262 let mut debug = f.debug_struct("ClassBytesRange"); 1263 if self.start <= 0x7F { 1264 debug.field("start", &(self.start as char)); 1265 } else { 1266 debug.field("start", &self.start); 1267 } 1268 if self.end <= 0x7F { 1269 debug.field("end", &(self.end as char)); 1270 } else { 1271 debug.field("end", &self.end); 1272 } 1273 debug.finish() 1274 } 1275 } 1276 1277 /// The high-level intermediate representation for an anchor assertion. 1278 /// 1279 /// A matching anchor assertion is always zero-length. 1280 #[derive(Clone, Debug, Eq, PartialEq)] 1281 pub enum Anchor { 1282 /// Match the beginning of a line or the beginning of text. Specifically, 1283 /// this matches at the starting position of the input, or at the position 1284 /// immediately following a `\n` character. 1285 StartLine, 1286 /// Match the end of a line or the end of text. Specifically, 1287 /// this matches at the end position of the input, or at the position 1288 /// immediately preceding a `\n` character. 1289 EndLine, 1290 /// Match the beginning of text. Specifically, this matches at the starting 1291 /// position of the input. 1292 StartText, 1293 /// Match the end of text. Specifically, this matches at the ending 1294 /// position of the input. 1295 EndText, 1296 } 1297 1298 /// The high-level intermediate representation for a word-boundary assertion. 1299 /// 1300 /// A matching word boundary assertion is always zero-length. 1301 #[derive(Clone, Debug, Eq, PartialEq)] 1302 pub enum WordBoundary { 1303 /// Match a Unicode-aware word boundary. That is, this matches a position 1304 /// where the left adjacent character and right adjacent character 1305 /// correspond to a word and non-word or a non-word and word character. 1306 Unicode, 1307 /// Match a Unicode-aware negation of a word boundary. 1308 UnicodeNegate, 1309 /// Match an ASCII-only word boundary. That is, this matches a position 1310 /// where the left adjacent character and right adjacent character 1311 /// correspond to a word and non-word or a non-word and word character. 1312 Ascii, 1313 /// Match an ASCII-only negation of a word boundary. 1314 AsciiNegate, 1315 } 1316 1317 impl WordBoundary { 1318 /// Returns true if and only if this word boundary assertion is negated. is_negated(&self) -> bool1319 pub fn is_negated(&self) -> bool { 1320 match *self { 1321 WordBoundary::Unicode | WordBoundary::Ascii => false, 1322 WordBoundary::UnicodeNegate | WordBoundary::AsciiNegate => true, 1323 } 1324 } 1325 } 1326 1327 /// The high-level intermediate representation for a group. 1328 /// 1329 /// This represents one of three possible group types: 1330 /// 1331 /// 1. A non-capturing group (e.g., `(?:expr)`). 1332 /// 2. A capturing group (e.g., `(expr)`). 1333 /// 3. A named capturing group (e.g., `(?P<name>expr)`). 1334 #[derive(Clone, Debug, Eq, PartialEq)] 1335 pub struct Group { 1336 /// The kind of this group. If it is a capturing group, then the kind 1337 /// contains the capture group index (and the name, if it is a named 1338 /// group). 1339 pub kind: GroupKind, 1340 /// The expression inside the capturing group, which may be empty. 1341 pub hir: Box<Hir>, 1342 } 1343 1344 /// The kind of group. 1345 #[derive(Clone, Debug, Eq, PartialEq)] 1346 pub enum GroupKind { 1347 /// A normal unnamed capturing group. 1348 /// 1349 /// The value is the capture index of the group. 1350 CaptureIndex(u32), 1351 /// A named capturing group. 1352 CaptureName { 1353 /// The name of the group. 1354 name: String, 1355 /// The capture index of the group. 1356 index: u32, 1357 }, 1358 /// A non-capturing group. 1359 NonCapturing, 1360 } 1361 1362 /// The high-level intermediate representation of a repetition operator. 1363 /// 1364 /// A repetition operator permits the repetition of an arbitrary 1365 /// sub-expression. 1366 #[derive(Clone, Debug, Eq, PartialEq)] 1367 pub struct Repetition { 1368 /// The kind of this repetition operator. 1369 pub kind: RepetitionKind, 1370 /// Whether this repetition operator is greedy or not. A greedy operator 1371 /// will match as much as it can. A non-greedy operator will match as 1372 /// little as it can. 1373 /// 1374 /// Typically, operators are greedy by default and are only non-greedy when 1375 /// a `?` suffix is used, e.g., `(expr)*` is greedy while `(expr)*?` is 1376 /// not. However, this can be inverted via the `U` "ungreedy" flag. 1377 pub greedy: bool, 1378 /// The expression being repeated. 1379 pub hir: Box<Hir>, 1380 } 1381 1382 impl Repetition { 1383 /// Returns true if and only if this repetition operator makes it possible 1384 /// to match the empty string. 1385 /// 1386 /// Note that this is not defined inductively. For example, while `a*` 1387 /// will report `true`, `()+` will not, even though `()` matches the empty 1388 /// string and one or more occurrences of something that matches the empty 1389 /// string will always match the empty string. In order to get the 1390 /// inductive definition, see the corresponding method on 1391 /// [`Hir`](struct.Hir.html). is_match_empty(&self) -> bool1392 pub fn is_match_empty(&self) -> bool { 1393 match self.kind { 1394 RepetitionKind::ZeroOrOne => true, 1395 RepetitionKind::ZeroOrMore => true, 1396 RepetitionKind::OneOrMore => false, 1397 RepetitionKind::Range(RepetitionRange::Exactly(m)) => m == 0, 1398 RepetitionKind::Range(RepetitionRange::AtLeast(m)) => m == 0, 1399 RepetitionKind::Range(RepetitionRange::Bounded(m, _)) => m == 0, 1400 } 1401 } 1402 } 1403 1404 /// The kind of a repetition operator. 1405 #[derive(Clone, Debug, Eq, PartialEq)] 1406 pub enum RepetitionKind { 1407 /// Matches a sub-expression zero or one times. 1408 ZeroOrOne, 1409 /// Matches a sub-expression zero or more times. 1410 ZeroOrMore, 1411 /// Matches a sub-expression one or more times. 1412 OneOrMore, 1413 /// Matches a sub-expression within a bounded range of times. 1414 Range(RepetitionRange), 1415 } 1416 1417 /// The kind of a counted repetition operator. 1418 #[derive(Clone, Debug, Eq, PartialEq)] 1419 pub enum RepetitionRange { 1420 /// Matches a sub-expression exactly this many times. 1421 Exactly(u32), 1422 /// Matches a sub-expression at least this many times. 1423 AtLeast(u32), 1424 /// Matches a sub-expression at least `m` times and at most `n` times. 1425 Bounded(u32, u32), 1426 } 1427 1428 /// A custom `Drop` impl is used for `HirKind` such that it uses constant stack 1429 /// space but heap space proportional to the depth of the total `Hir`. 1430 impl Drop for Hir { drop(&mut self)1431 fn drop(&mut self) { 1432 use std::mem; 1433 1434 match *self.kind() { 1435 HirKind::Empty 1436 | HirKind::Literal(_) 1437 | HirKind::Class(_) 1438 | HirKind::Anchor(_) 1439 | HirKind::WordBoundary(_) => return, 1440 HirKind::Group(ref x) if !x.hir.kind.has_subexprs() => return, 1441 HirKind::Repetition(ref x) if !x.hir.kind.has_subexprs() => return, 1442 HirKind::Concat(ref x) if x.is_empty() => return, 1443 HirKind::Alternation(ref x) if x.is_empty() => return, 1444 _ => {} 1445 } 1446 1447 let mut stack = vec![mem::replace(self, Hir::empty())]; 1448 while let Some(mut expr) = stack.pop() { 1449 match expr.kind { 1450 HirKind::Empty 1451 | HirKind::Literal(_) 1452 | HirKind::Class(_) 1453 | HirKind::Anchor(_) 1454 | HirKind::WordBoundary(_) => {} 1455 HirKind::Group(ref mut x) => { 1456 stack.push(mem::replace(&mut x.hir, Hir::empty())); 1457 } 1458 HirKind::Repetition(ref mut x) => { 1459 stack.push(mem::replace(&mut x.hir, Hir::empty())); 1460 } 1461 HirKind::Concat(ref mut x) => { 1462 stack.extend(x.drain(..)); 1463 } 1464 HirKind::Alternation(ref mut x) => { 1465 stack.extend(x.drain(..)); 1466 } 1467 } 1468 } 1469 } 1470 } 1471 1472 /// A type that documents various attributes of an HIR expression. 1473 /// 1474 /// These attributes are typically defined inductively on the HIR. 1475 #[derive(Clone, Debug, Eq, PartialEq)] 1476 struct HirInfo { 1477 /// Represent yes/no questions by a bitfield to conserve space, since 1478 /// this is included in every HIR expression. 1479 /// 1480 /// If more attributes need to be added, it is OK to increase the size of 1481 /// this as appropriate. 1482 bools: u16, 1483 } 1484 1485 // A simple macro for defining bitfield accessors/mutators. 1486 macro_rules! define_bool { 1487 ($bit:expr, $is_fn_name:ident, $set_fn_name:ident) => { 1488 fn $is_fn_name(&self) -> bool { 1489 self.bools & (0b1 << $bit) > 0 1490 } 1491 1492 fn $set_fn_name(&mut self, yes: bool) { 1493 if yes { 1494 self.bools |= 1 << $bit; 1495 } else { 1496 self.bools &= !(1 << $bit); 1497 } 1498 } 1499 }; 1500 } 1501 1502 impl HirInfo { new() -> HirInfo1503 fn new() -> HirInfo { 1504 HirInfo { bools: 0 } 1505 } 1506 1507 define_bool!(0, is_always_utf8, set_always_utf8); 1508 define_bool!(1, is_all_assertions, set_all_assertions); 1509 define_bool!(2, is_anchored_start, set_anchored_start); 1510 define_bool!(3, is_anchored_end, set_anchored_end); 1511 define_bool!(4, is_line_anchored_start, set_line_anchored_start); 1512 define_bool!(5, is_line_anchored_end, set_line_anchored_end); 1513 define_bool!(6, is_any_anchored_start, set_any_anchored_start); 1514 define_bool!(7, is_any_anchored_end, set_any_anchored_end); 1515 define_bool!(8, is_match_empty, set_match_empty); 1516 define_bool!(9, is_literal, set_literal); 1517 define_bool!(10, is_alternation_literal, set_alternation_literal); 1518 } 1519 1520 #[cfg(test)] 1521 mod tests { 1522 use super::*; 1523 uclass(ranges: &[(char, char)]) -> ClassUnicode1524 fn uclass(ranges: &[(char, char)]) -> ClassUnicode { 1525 let ranges: Vec<ClassUnicodeRange> = ranges 1526 .iter() 1527 .map(|&(s, e)| ClassUnicodeRange::new(s, e)) 1528 .collect(); 1529 ClassUnicode::new(ranges) 1530 } 1531 bclass(ranges: &[(u8, u8)]) -> ClassBytes1532 fn bclass(ranges: &[(u8, u8)]) -> ClassBytes { 1533 let ranges: Vec<ClassBytesRange> = 1534 ranges.iter().map(|&(s, e)| ClassBytesRange::new(s, e)).collect(); 1535 ClassBytes::new(ranges) 1536 } 1537 uranges(cls: &ClassUnicode) -> Vec<(char, char)>1538 fn uranges(cls: &ClassUnicode) -> Vec<(char, char)> { 1539 cls.iter().map(|x| (x.start(), x.end())).collect() 1540 } 1541 1542 #[cfg(feature = "unicode-case")] ucasefold(cls: &ClassUnicode) -> ClassUnicode1543 fn ucasefold(cls: &ClassUnicode) -> ClassUnicode { 1544 let mut cls_ = cls.clone(); 1545 cls_.case_fold_simple(); 1546 cls_ 1547 } 1548 uunion(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode1549 fn uunion(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode { 1550 let mut cls_ = cls1.clone(); 1551 cls_.union(cls2); 1552 cls_ 1553 } 1554 uintersect(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode1555 fn uintersect(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode { 1556 let mut cls_ = cls1.clone(); 1557 cls_.intersect(cls2); 1558 cls_ 1559 } 1560 udifference(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode1561 fn udifference(cls1: &ClassUnicode, cls2: &ClassUnicode) -> ClassUnicode { 1562 let mut cls_ = cls1.clone(); 1563 cls_.difference(cls2); 1564 cls_ 1565 } 1566 usymdifference( cls1: &ClassUnicode, cls2: &ClassUnicode, ) -> ClassUnicode1567 fn usymdifference( 1568 cls1: &ClassUnicode, 1569 cls2: &ClassUnicode, 1570 ) -> ClassUnicode { 1571 let mut cls_ = cls1.clone(); 1572 cls_.symmetric_difference(cls2); 1573 cls_ 1574 } 1575 unegate(cls: &ClassUnicode) -> ClassUnicode1576 fn unegate(cls: &ClassUnicode) -> ClassUnicode { 1577 let mut cls_ = cls.clone(); 1578 cls_.negate(); 1579 cls_ 1580 } 1581 branges(cls: &ClassBytes) -> Vec<(u8, u8)>1582 fn branges(cls: &ClassBytes) -> Vec<(u8, u8)> { 1583 cls.iter().map(|x| (x.start(), x.end())).collect() 1584 } 1585 bcasefold(cls: &ClassBytes) -> ClassBytes1586 fn bcasefold(cls: &ClassBytes) -> ClassBytes { 1587 let mut cls_ = cls.clone(); 1588 cls_.case_fold_simple(); 1589 cls_ 1590 } 1591 bunion(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes1592 fn bunion(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes { 1593 let mut cls_ = cls1.clone(); 1594 cls_.union(cls2); 1595 cls_ 1596 } 1597 bintersect(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes1598 fn bintersect(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes { 1599 let mut cls_ = cls1.clone(); 1600 cls_.intersect(cls2); 1601 cls_ 1602 } 1603 bdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes1604 fn bdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes { 1605 let mut cls_ = cls1.clone(); 1606 cls_.difference(cls2); 1607 cls_ 1608 } 1609 bsymdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes1610 fn bsymdifference(cls1: &ClassBytes, cls2: &ClassBytes) -> ClassBytes { 1611 let mut cls_ = cls1.clone(); 1612 cls_.symmetric_difference(cls2); 1613 cls_ 1614 } 1615 bnegate(cls: &ClassBytes) -> ClassBytes1616 fn bnegate(cls: &ClassBytes) -> ClassBytes { 1617 let mut cls_ = cls.clone(); 1618 cls_.negate(); 1619 cls_ 1620 } 1621 1622 #[test] class_range_canonical_unicode()1623 fn class_range_canonical_unicode() { 1624 let range = ClassUnicodeRange::new('\u{00FF}', '\0'); 1625 assert_eq!('\0', range.start()); 1626 assert_eq!('\u{00FF}', range.end()); 1627 } 1628 1629 #[test] class_range_canonical_bytes()1630 fn class_range_canonical_bytes() { 1631 let range = ClassBytesRange::new(b'\xFF', b'\0'); 1632 assert_eq!(b'\0', range.start()); 1633 assert_eq!(b'\xFF', range.end()); 1634 } 1635 1636 #[test] class_canonicalize_unicode()1637 fn class_canonicalize_unicode() { 1638 let cls = uclass(&[('a', 'c'), ('x', 'z')]); 1639 let expected = vec![('a', 'c'), ('x', 'z')]; 1640 assert_eq!(expected, uranges(&cls)); 1641 1642 let cls = uclass(&[('x', 'z'), ('a', 'c')]); 1643 let expected = vec![('a', 'c'), ('x', 'z')]; 1644 assert_eq!(expected, uranges(&cls)); 1645 1646 let cls = uclass(&[('x', 'z'), ('w', 'y')]); 1647 let expected = vec![('w', 'z')]; 1648 assert_eq!(expected, uranges(&cls)); 1649 1650 let cls = uclass(&[ 1651 ('c', 'f'), 1652 ('a', 'g'), 1653 ('d', 'j'), 1654 ('a', 'c'), 1655 ('m', 'p'), 1656 ('l', 's'), 1657 ]); 1658 let expected = vec![('a', 'j'), ('l', 's')]; 1659 assert_eq!(expected, uranges(&cls)); 1660 1661 let cls = uclass(&[('x', 'z'), ('u', 'w')]); 1662 let expected = vec![('u', 'z')]; 1663 assert_eq!(expected, uranges(&cls)); 1664 1665 let cls = uclass(&[('\x00', '\u{10FFFF}'), ('\x00', '\u{10FFFF}')]); 1666 let expected = vec![('\x00', '\u{10FFFF}')]; 1667 assert_eq!(expected, uranges(&cls)); 1668 1669 let cls = uclass(&[('a', 'a'), ('b', 'b')]); 1670 let expected = vec![('a', 'b')]; 1671 assert_eq!(expected, uranges(&cls)); 1672 } 1673 1674 #[test] class_canonicalize_bytes()1675 fn class_canonicalize_bytes() { 1676 let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]); 1677 let expected = vec![(b'a', b'c'), (b'x', b'z')]; 1678 assert_eq!(expected, branges(&cls)); 1679 1680 let cls = bclass(&[(b'x', b'z'), (b'a', b'c')]); 1681 let expected = vec![(b'a', b'c'), (b'x', b'z')]; 1682 assert_eq!(expected, branges(&cls)); 1683 1684 let cls = bclass(&[(b'x', b'z'), (b'w', b'y')]); 1685 let expected = vec![(b'w', b'z')]; 1686 assert_eq!(expected, branges(&cls)); 1687 1688 let cls = bclass(&[ 1689 (b'c', b'f'), 1690 (b'a', b'g'), 1691 (b'd', b'j'), 1692 (b'a', b'c'), 1693 (b'm', b'p'), 1694 (b'l', b's'), 1695 ]); 1696 let expected = vec![(b'a', b'j'), (b'l', b's')]; 1697 assert_eq!(expected, branges(&cls)); 1698 1699 let cls = bclass(&[(b'x', b'z'), (b'u', b'w')]); 1700 let expected = vec![(b'u', b'z')]; 1701 assert_eq!(expected, branges(&cls)); 1702 1703 let cls = bclass(&[(b'\x00', b'\xFF'), (b'\x00', b'\xFF')]); 1704 let expected = vec![(b'\x00', b'\xFF')]; 1705 assert_eq!(expected, branges(&cls)); 1706 1707 let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]); 1708 let expected = vec![(b'a', b'b')]; 1709 assert_eq!(expected, branges(&cls)); 1710 } 1711 1712 #[test] 1713 #[cfg(feature = "unicode-case")] class_case_fold_unicode()1714 fn class_case_fold_unicode() { 1715 let cls = uclass(&[ 1716 ('C', 'F'), 1717 ('A', 'G'), 1718 ('D', 'J'), 1719 ('A', 'C'), 1720 ('M', 'P'), 1721 ('L', 'S'), 1722 ('c', 'f'), 1723 ]); 1724 let expected = uclass(&[ 1725 ('A', 'J'), 1726 ('L', 'S'), 1727 ('a', 'j'), 1728 ('l', 's'), 1729 ('\u{17F}', '\u{17F}'), 1730 ]); 1731 assert_eq!(expected, ucasefold(&cls)); 1732 1733 let cls = uclass(&[('A', 'Z')]); 1734 let expected = uclass(&[ 1735 ('A', 'Z'), 1736 ('a', 'z'), 1737 ('\u{17F}', '\u{17F}'), 1738 ('\u{212A}', '\u{212A}'), 1739 ]); 1740 assert_eq!(expected, ucasefold(&cls)); 1741 1742 let cls = uclass(&[('a', 'z')]); 1743 let expected = uclass(&[ 1744 ('A', 'Z'), 1745 ('a', 'z'), 1746 ('\u{17F}', '\u{17F}'), 1747 ('\u{212A}', '\u{212A}'), 1748 ]); 1749 assert_eq!(expected, ucasefold(&cls)); 1750 1751 let cls = uclass(&[('A', 'A'), ('_', '_')]); 1752 let expected = uclass(&[('A', 'A'), ('_', '_'), ('a', 'a')]); 1753 assert_eq!(expected, ucasefold(&cls)); 1754 1755 let cls = uclass(&[('A', 'A'), ('=', '=')]); 1756 let expected = uclass(&[('=', '='), ('A', 'A'), ('a', 'a')]); 1757 assert_eq!(expected, ucasefold(&cls)); 1758 1759 let cls = uclass(&[('\x00', '\x10')]); 1760 assert_eq!(cls, ucasefold(&cls)); 1761 1762 let cls = uclass(&[('k', 'k')]); 1763 let expected = 1764 uclass(&[('K', 'K'), ('k', 'k'), ('\u{212A}', '\u{212A}')]); 1765 assert_eq!(expected, ucasefold(&cls)); 1766 1767 let cls = uclass(&[('@', '@')]); 1768 assert_eq!(cls, ucasefold(&cls)); 1769 } 1770 1771 #[test] 1772 #[cfg(not(feature = "unicode-case"))] class_case_fold_unicode_disabled()1773 fn class_case_fold_unicode_disabled() { 1774 let mut cls = uclass(&[ 1775 ('C', 'F'), 1776 ('A', 'G'), 1777 ('D', 'J'), 1778 ('A', 'C'), 1779 ('M', 'P'), 1780 ('L', 'S'), 1781 ('c', 'f'), 1782 ]); 1783 assert!(cls.try_case_fold_simple().is_err()); 1784 } 1785 1786 #[test] 1787 #[should_panic] 1788 #[cfg(not(feature = "unicode-case"))] class_case_fold_unicode_disabled_panics()1789 fn class_case_fold_unicode_disabled_panics() { 1790 let mut cls = uclass(&[ 1791 ('C', 'F'), 1792 ('A', 'G'), 1793 ('D', 'J'), 1794 ('A', 'C'), 1795 ('M', 'P'), 1796 ('L', 'S'), 1797 ('c', 'f'), 1798 ]); 1799 cls.case_fold_simple(); 1800 } 1801 1802 #[test] class_case_fold_bytes()1803 fn class_case_fold_bytes() { 1804 let cls = bclass(&[ 1805 (b'C', b'F'), 1806 (b'A', b'G'), 1807 (b'D', b'J'), 1808 (b'A', b'C'), 1809 (b'M', b'P'), 1810 (b'L', b'S'), 1811 (b'c', b'f'), 1812 ]); 1813 let expected = 1814 bclass(&[(b'A', b'J'), (b'L', b'S'), (b'a', b'j'), (b'l', b's')]); 1815 assert_eq!(expected, bcasefold(&cls)); 1816 1817 let cls = bclass(&[(b'A', b'Z')]); 1818 let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]); 1819 assert_eq!(expected, bcasefold(&cls)); 1820 1821 let cls = bclass(&[(b'a', b'z')]); 1822 let expected = bclass(&[(b'A', b'Z'), (b'a', b'z')]); 1823 assert_eq!(expected, bcasefold(&cls)); 1824 1825 let cls = bclass(&[(b'A', b'A'), (b'_', b'_')]); 1826 let expected = bclass(&[(b'A', b'A'), (b'_', b'_'), (b'a', b'a')]); 1827 assert_eq!(expected, bcasefold(&cls)); 1828 1829 let cls = bclass(&[(b'A', b'A'), (b'=', b'=')]); 1830 let expected = bclass(&[(b'=', b'='), (b'A', b'A'), (b'a', b'a')]); 1831 assert_eq!(expected, bcasefold(&cls)); 1832 1833 let cls = bclass(&[(b'\x00', b'\x10')]); 1834 assert_eq!(cls, bcasefold(&cls)); 1835 1836 let cls = bclass(&[(b'k', b'k')]); 1837 let expected = bclass(&[(b'K', b'K'), (b'k', b'k')]); 1838 assert_eq!(expected, bcasefold(&cls)); 1839 1840 let cls = bclass(&[(b'@', b'@')]); 1841 assert_eq!(cls, bcasefold(&cls)); 1842 } 1843 1844 #[test] class_negate_unicode()1845 fn class_negate_unicode() { 1846 let cls = uclass(&[('a', 'a')]); 1847 let expected = uclass(&[('\x00', '\x60'), ('\x62', '\u{10FFFF}')]); 1848 assert_eq!(expected, unegate(&cls)); 1849 1850 let cls = uclass(&[('a', 'a'), ('b', 'b')]); 1851 let expected = uclass(&[('\x00', '\x60'), ('\x63', '\u{10FFFF}')]); 1852 assert_eq!(expected, unegate(&cls)); 1853 1854 let cls = uclass(&[('a', 'c'), ('x', 'z')]); 1855 let expected = uclass(&[ 1856 ('\x00', '\x60'), 1857 ('\x64', '\x77'), 1858 ('\x7B', '\u{10FFFF}'), 1859 ]); 1860 assert_eq!(expected, unegate(&cls)); 1861 1862 let cls = uclass(&[('\x00', 'a')]); 1863 let expected = uclass(&[('\x62', '\u{10FFFF}')]); 1864 assert_eq!(expected, unegate(&cls)); 1865 1866 let cls = uclass(&[('a', '\u{10FFFF}')]); 1867 let expected = uclass(&[('\x00', '\x60')]); 1868 assert_eq!(expected, unegate(&cls)); 1869 1870 let cls = uclass(&[('\x00', '\u{10FFFF}')]); 1871 let expected = uclass(&[]); 1872 assert_eq!(expected, unegate(&cls)); 1873 1874 let cls = uclass(&[]); 1875 let expected = uclass(&[('\x00', '\u{10FFFF}')]); 1876 assert_eq!(expected, unegate(&cls)); 1877 1878 let cls = 1879 uclass(&[('\x00', '\u{10FFFD}'), ('\u{10FFFF}', '\u{10FFFF}')]); 1880 let expected = uclass(&[('\u{10FFFE}', '\u{10FFFE}')]); 1881 assert_eq!(expected, unegate(&cls)); 1882 1883 let cls = uclass(&[('\x00', '\u{D7FF}')]); 1884 let expected = uclass(&[('\u{E000}', '\u{10FFFF}')]); 1885 assert_eq!(expected, unegate(&cls)); 1886 1887 let cls = uclass(&[('\x00', '\u{D7FE}')]); 1888 let expected = uclass(&[('\u{D7FF}', '\u{10FFFF}')]); 1889 assert_eq!(expected, unegate(&cls)); 1890 1891 let cls = uclass(&[('\u{E000}', '\u{10FFFF}')]); 1892 let expected = uclass(&[('\x00', '\u{D7FF}')]); 1893 assert_eq!(expected, unegate(&cls)); 1894 1895 let cls = uclass(&[('\u{E001}', '\u{10FFFF}')]); 1896 let expected = uclass(&[('\x00', '\u{E000}')]); 1897 assert_eq!(expected, unegate(&cls)); 1898 } 1899 1900 #[test] class_negate_bytes()1901 fn class_negate_bytes() { 1902 let cls = bclass(&[(b'a', b'a')]); 1903 let expected = bclass(&[(b'\x00', b'\x60'), (b'\x62', b'\xFF')]); 1904 assert_eq!(expected, bnegate(&cls)); 1905 1906 let cls = bclass(&[(b'a', b'a'), (b'b', b'b')]); 1907 let expected = bclass(&[(b'\x00', b'\x60'), (b'\x63', b'\xFF')]); 1908 assert_eq!(expected, bnegate(&cls)); 1909 1910 let cls = bclass(&[(b'a', b'c'), (b'x', b'z')]); 1911 let expected = bclass(&[ 1912 (b'\x00', b'\x60'), 1913 (b'\x64', b'\x77'), 1914 (b'\x7B', b'\xFF'), 1915 ]); 1916 assert_eq!(expected, bnegate(&cls)); 1917 1918 let cls = bclass(&[(b'\x00', b'a')]); 1919 let expected = bclass(&[(b'\x62', b'\xFF')]); 1920 assert_eq!(expected, bnegate(&cls)); 1921 1922 let cls = bclass(&[(b'a', b'\xFF')]); 1923 let expected = bclass(&[(b'\x00', b'\x60')]); 1924 assert_eq!(expected, bnegate(&cls)); 1925 1926 let cls = bclass(&[(b'\x00', b'\xFF')]); 1927 let expected = bclass(&[]); 1928 assert_eq!(expected, bnegate(&cls)); 1929 1930 let cls = bclass(&[]); 1931 let expected = bclass(&[(b'\x00', b'\xFF')]); 1932 assert_eq!(expected, bnegate(&cls)); 1933 1934 let cls = bclass(&[(b'\x00', b'\xFD'), (b'\xFF', b'\xFF')]); 1935 let expected = bclass(&[(b'\xFE', b'\xFE')]); 1936 assert_eq!(expected, bnegate(&cls)); 1937 } 1938 1939 #[test] class_union_unicode()1940 fn class_union_unicode() { 1941 let cls1 = uclass(&[('a', 'g'), ('m', 't'), ('A', 'C')]); 1942 let cls2 = uclass(&[('a', 'z')]); 1943 let expected = uclass(&[('a', 'z'), ('A', 'C')]); 1944 assert_eq!(expected, uunion(&cls1, &cls2)); 1945 } 1946 1947 #[test] class_union_bytes()1948 fn class_union_bytes() { 1949 let cls1 = bclass(&[(b'a', b'g'), (b'm', b't'), (b'A', b'C')]); 1950 let cls2 = bclass(&[(b'a', b'z')]); 1951 let expected = bclass(&[(b'a', b'z'), (b'A', b'C')]); 1952 assert_eq!(expected, bunion(&cls1, &cls2)); 1953 } 1954 1955 #[test] class_intersect_unicode()1956 fn class_intersect_unicode() { 1957 let cls1 = uclass(&[]); 1958 let cls2 = uclass(&[('a', 'a')]); 1959 let expected = uclass(&[]); 1960 assert_eq!(expected, uintersect(&cls1, &cls2)); 1961 1962 let cls1 = uclass(&[('a', 'a')]); 1963 let cls2 = uclass(&[('a', 'a')]); 1964 let expected = uclass(&[('a', 'a')]); 1965 assert_eq!(expected, uintersect(&cls1, &cls2)); 1966 1967 let cls1 = uclass(&[('a', 'a')]); 1968 let cls2 = uclass(&[('b', 'b')]); 1969 let expected = uclass(&[]); 1970 assert_eq!(expected, uintersect(&cls1, &cls2)); 1971 1972 let cls1 = uclass(&[('a', 'a')]); 1973 let cls2 = uclass(&[('a', 'c')]); 1974 let expected = uclass(&[('a', 'a')]); 1975 assert_eq!(expected, uintersect(&cls1, &cls2)); 1976 1977 let cls1 = uclass(&[('a', 'b')]); 1978 let cls2 = uclass(&[('a', 'c')]); 1979 let expected = uclass(&[('a', 'b')]); 1980 assert_eq!(expected, uintersect(&cls1, &cls2)); 1981 1982 let cls1 = uclass(&[('a', 'b')]); 1983 let cls2 = uclass(&[('b', 'c')]); 1984 let expected = uclass(&[('b', 'b')]); 1985 assert_eq!(expected, uintersect(&cls1, &cls2)); 1986 1987 let cls1 = uclass(&[('a', 'b')]); 1988 let cls2 = uclass(&[('c', 'd')]); 1989 let expected = uclass(&[]); 1990 assert_eq!(expected, uintersect(&cls1, &cls2)); 1991 1992 let cls1 = uclass(&[('b', 'c')]); 1993 let cls2 = uclass(&[('a', 'd')]); 1994 let expected = uclass(&[('b', 'c')]); 1995 assert_eq!(expected, uintersect(&cls1, &cls2)); 1996 1997 let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); 1998 let cls2 = uclass(&[('a', 'h')]); 1999 let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); 2000 assert_eq!(expected, uintersect(&cls1, &cls2)); 2001 2002 let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); 2003 let cls2 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); 2004 let expected = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); 2005 assert_eq!(expected, uintersect(&cls1, &cls2)); 2006 2007 let cls1 = uclass(&[('a', 'b'), ('g', 'h')]); 2008 let cls2 = uclass(&[('d', 'e'), ('k', 'l')]); 2009 let expected = uclass(&[]); 2010 assert_eq!(expected, uintersect(&cls1, &cls2)); 2011 2012 let cls1 = uclass(&[('a', 'b'), ('d', 'e'), ('g', 'h')]); 2013 let cls2 = uclass(&[('h', 'h')]); 2014 let expected = uclass(&[('h', 'h')]); 2015 assert_eq!(expected, uintersect(&cls1, &cls2)); 2016 2017 let cls1 = uclass(&[('a', 'b'), ('e', 'f'), ('i', 'j')]); 2018 let cls2 = uclass(&[('c', 'd'), ('g', 'h'), ('k', 'l')]); 2019 let expected = uclass(&[]); 2020 assert_eq!(expected, uintersect(&cls1, &cls2)); 2021 2022 let cls1 = uclass(&[('a', 'b'), ('c', 'd'), ('e', 'f')]); 2023 let cls2 = uclass(&[('b', 'c'), ('d', 'e'), ('f', 'g')]); 2024 let expected = uclass(&[('b', 'f')]); 2025 assert_eq!(expected, uintersect(&cls1, &cls2)); 2026 } 2027 2028 #[test] class_intersect_bytes()2029 fn class_intersect_bytes() { 2030 let cls1 = bclass(&[]); 2031 let cls2 = bclass(&[(b'a', b'a')]); 2032 let expected = bclass(&[]); 2033 assert_eq!(expected, bintersect(&cls1, &cls2)); 2034 2035 let cls1 = bclass(&[(b'a', b'a')]); 2036 let cls2 = bclass(&[(b'a', b'a')]); 2037 let expected = bclass(&[(b'a', b'a')]); 2038 assert_eq!(expected, bintersect(&cls1, &cls2)); 2039 2040 let cls1 = bclass(&[(b'a', b'a')]); 2041 let cls2 = bclass(&[(b'b', b'b')]); 2042 let expected = bclass(&[]); 2043 assert_eq!(expected, bintersect(&cls1, &cls2)); 2044 2045 let cls1 = bclass(&[(b'a', b'a')]); 2046 let cls2 = bclass(&[(b'a', b'c')]); 2047 let expected = bclass(&[(b'a', b'a')]); 2048 assert_eq!(expected, bintersect(&cls1, &cls2)); 2049 2050 let cls1 = bclass(&[(b'a', b'b')]); 2051 let cls2 = bclass(&[(b'a', b'c')]); 2052 let expected = bclass(&[(b'a', b'b')]); 2053 assert_eq!(expected, bintersect(&cls1, &cls2)); 2054 2055 let cls1 = bclass(&[(b'a', b'b')]); 2056 let cls2 = bclass(&[(b'b', b'c')]); 2057 let expected = bclass(&[(b'b', b'b')]); 2058 assert_eq!(expected, bintersect(&cls1, &cls2)); 2059 2060 let cls1 = bclass(&[(b'a', b'b')]); 2061 let cls2 = bclass(&[(b'c', b'd')]); 2062 let expected = bclass(&[]); 2063 assert_eq!(expected, bintersect(&cls1, &cls2)); 2064 2065 let cls1 = bclass(&[(b'b', b'c')]); 2066 let cls2 = bclass(&[(b'a', b'd')]); 2067 let expected = bclass(&[(b'b', b'c')]); 2068 assert_eq!(expected, bintersect(&cls1, &cls2)); 2069 2070 let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); 2071 let cls2 = bclass(&[(b'a', b'h')]); 2072 let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); 2073 assert_eq!(expected, bintersect(&cls1, &cls2)); 2074 2075 let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); 2076 let cls2 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); 2077 let expected = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); 2078 assert_eq!(expected, bintersect(&cls1, &cls2)); 2079 2080 let cls1 = bclass(&[(b'a', b'b'), (b'g', b'h')]); 2081 let cls2 = bclass(&[(b'd', b'e'), (b'k', b'l')]); 2082 let expected = bclass(&[]); 2083 assert_eq!(expected, bintersect(&cls1, &cls2)); 2084 2085 let cls1 = bclass(&[(b'a', b'b'), (b'd', b'e'), (b'g', b'h')]); 2086 let cls2 = bclass(&[(b'h', b'h')]); 2087 let expected = bclass(&[(b'h', b'h')]); 2088 assert_eq!(expected, bintersect(&cls1, &cls2)); 2089 2090 let cls1 = bclass(&[(b'a', b'b'), (b'e', b'f'), (b'i', b'j')]); 2091 let cls2 = bclass(&[(b'c', b'd'), (b'g', b'h'), (b'k', b'l')]); 2092 let expected = bclass(&[]); 2093 assert_eq!(expected, bintersect(&cls1, &cls2)); 2094 2095 let cls1 = bclass(&[(b'a', b'b'), (b'c', b'd'), (b'e', b'f')]); 2096 let cls2 = bclass(&[(b'b', b'c'), (b'd', b'e'), (b'f', b'g')]); 2097 let expected = bclass(&[(b'b', b'f')]); 2098 assert_eq!(expected, bintersect(&cls1, &cls2)); 2099 } 2100 2101 #[test] class_difference_unicode()2102 fn class_difference_unicode() { 2103 let cls1 = uclass(&[('a', 'a')]); 2104 let cls2 = uclass(&[('a', 'a')]); 2105 let expected = uclass(&[]); 2106 assert_eq!(expected, udifference(&cls1, &cls2)); 2107 2108 let cls1 = uclass(&[('a', 'a')]); 2109 let cls2 = uclass(&[]); 2110 let expected = uclass(&[('a', 'a')]); 2111 assert_eq!(expected, udifference(&cls1, &cls2)); 2112 2113 let cls1 = uclass(&[]); 2114 let cls2 = uclass(&[('a', 'a')]); 2115 let expected = uclass(&[]); 2116 assert_eq!(expected, udifference(&cls1, &cls2)); 2117 2118 let cls1 = uclass(&[('a', 'z')]); 2119 let cls2 = uclass(&[('a', 'a')]); 2120 let expected = uclass(&[('b', 'z')]); 2121 assert_eq!(expected, udifference(&cls1, &cls2)); 2122 2123 let cls1 = uclass(&[('a', 'z')]); 2124 let cls2 = uclass(&[('z', 'z')]); 2125 let expected = uclass(&[('a', 'y')]); 2126 assert_eq!(expected, udifference(&cls1, &cls2)); 2127 2128 let cls1 = uclass(&[('a', 'z')]); 2129 let cls2 = uclass(&[('m', 'm')]); 2130 let expected = uclass(&[('a', 'l'), ('n', 'z')]); 2131 assert_eq!(expected, udifference(&cls1, &cls2)); 2132 2133 let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]); 2134 let cls2 = uclass(&[('a', 'z')]); 2135 let expected = uclass(&[]); 2136 assert_eq!(expected, udifference(&cls1, &cls2)); 2137 2138 let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]); 2139 let cls2 = uclass(&[('d', 'v')]); 2140 let expected = uclass(&[('a', 'c')]); 2141 assert_eq!(expected, udifference(&cls1, &cls2)); 2142 2143 let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]); 2144 let cls2 = uclass(&[('b', 'g'), ('s', 'u')]); 2145 let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]); 2146 assert_eq!(expected, udifference(&cls1, &cls2)); 2147 2148 let cls1 = uclass(&[('a', 'c'), ('g', 'i'), ('r', 't')]); 2149 let cls2 = uclass(&[('b', 'd'), ('e', 'g'), ('s', 'u')]); 2150 let expected = uclass(&[('a', 'a'), ('h', 'i'), ('r', 'r')]); 2151 assert_eq!(expected, udifference(&cls1, &cls2)); 2152 2153 let cls1 = uclass(&[('x', 'z')]); 2154 let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]); 2155 let expected = uclass(&[('x', 'z')]); 2156 assert_eq!(expected, udifference(&cls1, &cls2)); 2157 2158 let cls1 = uclass(&[('a', 'z')]); 2159 let cls2 = uclass(&[('a', 'c'), ('e', 'g'), ('s', 'u')]); 2160 let expected = uclass(&[('d', 'd'), ('h', 'r'), ('v', 'z')]); 2161 assert_eq!(expected, udifference(&cls1, &cls2)); 2162 } 2163 2164 #[test] class_difference_bytes()2165 fn class_difference_bytes() { 2166 let cls1 = bclass(&[(b'a', b'a')]); 2167 let cls2 = bclass(&[(b'a', b'a')]); 2168 let expected = bclass(&[]); 2169 assert_eq!(expected, bdifference(&cls1, &cls2)); 2170 2171 let cls1 = bclass(&[(b'a', b'a')]); 2172 let cls2 = bclass(&[]); 2173 let expected = bclass(&[(b'a', b'a')]); 2174 assert_eq!(expected, bdifference(&cls1, &cls2)); 2175 2176 let cls1 = bclass(&[]); 2177 let cls2 = bclass(&[(b'a', b'a')]); 2178 let expected = bclass(&[]); 2179 assert_eq!(expected, bdifference(&cls1, &cls2)); 2180 2181 let cls1 = bclass(&[(b'a', b'z')]); 2182 let cls2 = bclass(&[(b'a', b'a')]); 2183 let expected = bclass(&[(b'b', b'z')]); 2184 assert_eq!(expected, bdifference(&cls1, &cls2)); 2185 2186 let cls1 = bclass(&[(b'a', b'z')]); 2187 let cls2 = bclass(&[(b'z', b'z')]); 2188 let expected = bclass(&[(b'a', b'y')]); 2189 assert_eq!(expected, bdifference(&cls1, &cls2)); 2190 2191 let cls1 = bclass(&[(b'a', b'z')]); 2192 let cls2 = bclass(&[(b'm', b'm')]); 2193 let expected = bclass(&[(b'a', b'l'), (b'n', b'z')]); 2194 assert_eq!(expected, bdifference(&cls1, &cls2)); 2195 2196 let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]); 2197 let cls2 = bclass(&[(b'a', b'z')]); 2198 let expected = bclass(&[]); 2199 assert_eq!(expected, bdifference(&cls1, &cls2)); 2200 2201 let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]); 2202 let cls2 = bclass(&[(b'd', b'v')]); 2203 let expected = bclass(&[(b'a', b'c')]); 2204 assert_eq!(expected, bdifference(&cls1, &cls2)); 2205 2206 let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]); 2207 let cls2 = bclass(&[(b'b', b'g'), (b's', b'u')]); 2208 let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]); 2209 assert_eq!(expected, bdifference(&cls1, &cls2)); 2210 2211 let cls1 = bclass(&[(b'a', b'c'), (b'g', b'i'), (b'r', b't')]); 2212 let cls2 = bclass(&[(b'b', b'd'), (b'e', b'g'), (b's', b'u')]); 2213 let expected = bclass(&[(b'a', b'a'), (b'h', b'i'), (b'r', b'r')]); 2214 assert_eq!(expected, bdifference(&cls1, &cls2)); 2215 2216 let cls1 = bclass(&[(b'x', b'z')]); 2217 let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]); 2218 let expected = bclass(&[(b'x', b'z')]); 2219 assert_eq!(expected, bdifference(&cls1, &cls2)); 2220 2221 let cls1 = bclass(&[(b'a', b'z')]); 2222 let cls2 = bclass(&[(b'a', b'c'), (b'e', b'g'), (b's', b'u')]); 2223 let expected = bclass(&[(b'd', b'd'), (b'h', b'r'), (b'v', b'z')]); 2224 assert_eq!(expected, bdifference(&cls1, &cls2)); 2225 } 2226 2227 #[test] class_symmetric_difference_unicode()2228 fn class_symmetric_difference_unicode() { 2229 let cls1 = uclass(&[('a', 'm')]); 2230 let cls2 = uclass(&[('g', 't')]); 2231 let expected = uclass(&[('a', 'f'), ('n', 't')]); 2232 assert_eq!(expected, usymdifference(&cls1, &cls2)); 2233 } 2234 2235 #[test] class_symmetric_difference_bytes()2236 fn class_symmetric_difference_bytes() { 2237 let cls1 = bclass(&[(b'a', b'm')]); 2238 let cls2 = bclass(&[(b'g', b't')]); 2239 let expected = bclass(&[(b'a', b'f'), (b'n', b't')]); 2240 assert_eq!(expected, bsymdifference(&cls1, &cls2)); 2241 } 2242 2243 #[test] 2244 #[should_panic] hir_byte_literal_non_ascii()2245 fn hir_byte_literal_non_ascii() { 2246 Hir::literal(Literal::Byte(b'a')); 2247 } 2248 2249 // We use a thread with an explicit stack size to test that our destructor 2250 // for Hir can handle arbitrarily sized expressions in constant stack 2251 // space. In case we run on a platform without threads (WASM?), we limit 2252 // this test to Windows/Unix. 2253 #[test] 2254 #[cfg(any(unix, windows))] no_stack_overflow_on_drop()2255 fn no_stack_overflow_on_drop() { 2256 use std::thread; 2257 2258 let run = || { 2259 let mut expr = Hir::empty(); 2260 for _ in 0..100 { 2261 expr = Hir::group(Group { 2262 kind: GroupKind::NonCapturing, 2263 hir: Box::new(expr), 2264 }); 2265 expr = Hir::repetition(Repetition { 2266 kind: RepetitionKind::ZeroOrOne, 2267 greedy: true, 2268 hir: Box::new(expr), 2269 }); 2270 2271 expr = Hir { 2272 kind: HirKind::Concat(vec![expr]), 2273 info: HirInfo::new(), 2274 }; 2275 expr = Hir { 2276 kind: HirKind::Alternation(vec![expr]), 2277 info: HirInfo::new(), 2278 }; 2279 } 2280 assert!(!expr.kind.is_empty()); 2281 }; 2282 2283 // We run our test on a thread with a small stack size so we can 2284 // force the issue more easily. 2285 thread::Builder::new() 2286 .stack_size(1 << 10) 2287 .spawn(run) 2288 .unwrap() 2289 .join() 2290 .unwrap(); 2291 } 2292 } 2293