1package syntax 2 3import ( 4 "fmt" 5 "math" 6 "os" 7 "sort" 8 "strconv" 9 "unicode" 10) 11 12type RegexOptions int32 13 14const ( 15 IgnoreCase RegexOptions = 0x0001 // "i" 16 Multiline = 0x0002 // "m" 17 ExplicitCapture = 0x0004 // "n" 18 Compiled = 0x0008 // "c" 19 Singleline = 0x0010 // "s" 20 IgnorePatternWhitespace = 0x0020 // "x" 21 RightToLeft = 0x0040 // "r" 22 Debug = 0x0080 // "d" 23 ECMAScript = 0x0100 // "e" 24 RE2 = 0x0200 // RE2 compat mode 25) 26 27func optionFromCode(ch rune) RegexOptions { 28 // case-insensitive 29 switch ch { 30 case 'i', 'I': 31 return IgnoreCase 32 case 'r', 'R': 33 return RightToLeft 34 case 'm', 'M': 35 return Multiline 36 case 'n', 'N': 37 return ExplicitCapture 38 case 's', 'S': 39 return Singleline 40 case 'x', 'X': 41 return IgnorePatternWhitespace 42 case 'd', 'D': 43 return Debug 44 case 'e', 'E': 45 return ECMAScript 46 default: 47 return 0 48 } 49} 50 51// An Error describes a failure to parse a regular expression 52// and gives the offending expression. 53type Error struct { 54 Code ErrorCode 55 Expr string 56 Args []interface{} 57} 58 59func (e *Error) Error() string { 60 if len(e.Args) == 0 { 61 return "error parsing regexp: " + e.Code.String() + " in `" + e.Expr + "`" 62 } 63 return "error parsing regexp: " + fmt.Sprintf(e.Code.String(), e.Args...) + " in `" + e.Expr + "`" 64} 65 66// An ErrorCode describes a failure to parse a regular expression. 67type ErrorCode string 68 69const ( 70 // internal issue 71 ErrInternalError ErrorCode = "regexp/syntax: internal error" 72 // Parser errors 73 ErrUnterminatedComment = "unterminated comment" 74 ErrInvalidCharRange = "invalid character class range" 75 ErrInvalidRepeatSize = "invalid repeat count" 76 ErrInvalidUTF8 = "invalid UTF-8" 77 ErrCaptureGroupOutOfRange = "capture group number out of range" 78 ErrUnexpectedParen = "unexpected )" 79 ErrMissingParen = "missing closing )" 80 ErrMissingBrace = "missing closing }" 81 ErrInvalidRepeatOp = "invalid nested repetition operator" 82 ErrMissingRepeatArgument = "missing argument to repetition operator" 83 ErrConditionalExpression = "illegal conditional (?(...)) expression" 84 ErrTooManyAlternates = "too many | in (?()|)" 85 ErrUnrecognizedGrouping = "unrecognized grouping construct: (%v" 86 ErrInvalidGroupName = "invalid group name: group names must begin with a word character and have a matching terminator" 87 ErrCapNumNotZero = "capture number cannot be zero" 88 ErrUndefinedBackRef = "reference to undefined group number %v" 89 ErrUndefinedNameRef = "reference to undefined group name %v" 90 ErrAlternationCantCapture = "alternation conditions do not capture and cannot be named" 91 ErrAlternationCantHaveComment = "alternation conditions cannot be comments" 92 ErrMalformedReference = "(?(%v) ) malformed" 93 ErrUndefinedReference = "(?(%v) ) reference to undefined group" 94 ErrIllegalEndEscape = "illegal \\ at end of pattern" 95 ErrMalformedSlashP = "malformed \\p{X} character escape" 96 ErrIncompleteSlashP = "incomplete \\p{X} character escape" 97 ErrUnknownSlashP = "unknown unicode category, script, or property '%v'" 98 ErrUnrecognizedEscape = "unrecognized escape sequence \\%v" 99 ErrMissingControl = "missing control character" 100 ErrUnrecognizedControl = "unrecognized control character" 101 ErrTooFewHex = "insufficient hexadecimal digits" 102 ErrInvalidHex = "hex values may not be larger than 0x10FFFF" 103 ErrMalformedNameRef = "malformed \\k<...> named back reference" 104 ErrBadClassInCharRange = "cannot include class \\%v in character range" 105 ErrUnterminatedBracket = "unterminated [] set" 106 ErrSubtractionMustBeLast = "a subtraction must be the last element in a character class" 107 ErrReversedCharRange = "[x-y] range in reverse order" 108) 109 110func (e ErrorCode) String() string { 111 return string(e) 112} 113 114type parser struct { 115 stack *regexNode 116 group *regexNode 117 alternation *regexNode 118 concatenation *regexNode 119 unit *regexNode 120 121 patternRaw string 122 pattern []rune 123 124 currentPos int 125 specialCase *unicode.SpecialCase 126 127 autocap int 128 capcount int 129 captop int 130 capsize int 131 132 caps map[int]int 133 capnames map[string]int 134 135 capnumlist []int 136 capnamelist []string 137 138 options RegexOptions 139 optionsStack []RegexOptions 140 ignoreNextParen bool 141} 142 143const ( 144 maxValueDiv10 int = math.MaxInt32 / 10 145 maxValueMod10 = math.MaxInt32 % 10 146) 147 148// Parse converts a regex string into a parse tree 149func Parse(re string, op RegexOptions) (*RegexTree, error) { 150 p := parser{ 151 options: op, 152 caps: make(map[int]int), 153 } 154 p.setPattern(re) 155 156 if err := p.countCaptures(); err != nil { 157 return nil, err 158 } 159 160 p.reset(op) 161 root, err := p.scanRegex() 162 163 if err != nil { 164 return nil, err 165 } 166 tree := &RegexTree{ 167 root: root, 168 caps: p.caps, 169 capnumlist: p.capnumlist, 170 captop: p.captop, 171 Capnames: p.capnames, 172 Caplist: p.capnamelist, 173 options: op, 174 } 175 176 if tree.options&Debug > 0 { 177 os.Stdout.WriteString(tree.Dump()) 178 } 179 180 return tree, nil 181} 182 183func (p *parser) setPattern(pattern string) { 184 p.patternRaw = pattern 185 p.pattern = make([]rune, 0, len(pattern)) 186 187 //populate our rune array to handle utf8 encoding 188 for _, r := range pattern { 189 p.pattern = append(p.pattern, r) 190 } 191} 192func (p *parser) getErr(code ErrorCode, args ...interface{}) error { 193 return &Error{Code: code, Expr: p.patternRaw, Args: args} 194} 195 196func (p *parser) noteCaptureSlot(i, pos int) { 197 if _, ok := p.caps[i]; !ok { 198 // the rhs of the hashtable isn't used in the parser 199 p.caps[i] = pos 200 p.capcount++ 201 202 if p.captop <= i { 203 if i == math.MaxInt32 { 204 p.captop = i 205 } else { 206 p.captop = i + 1 207 } 208 } 209 } 210} 211 212func (p *parser) noteCaptureName(name string, pos int) { 213 if p.capnames == nil { 214 p.capnames = make(map[string]int) 215 } 216 217 if _, ok := p.capnames[name]; !ok { 218 p.capnames[name] = pos 219 p.capnamelist = append(p.capnamelist, name) 220 } 221} 222 223func (p *parser) assignNameSlots() { 224 if p.capnames != nil { 225 for _, name := range p.capnamelist { 226 for p.isCaptureSlot(p.autocap) { 227 p.autocap++ 228 } 229 pos := p.capnames[name] 230 p.capnames[name] = p.autocap 231 p.noteCaptureSlot(p.autocap, pos) 232 233 p.autocap++ 234 } 235 } 236 237 // if the caps array has at least one gap, construct the list of used slots 238 if p.capcount < p.captop { 239 p.capnumlist = make([]int, p.capcount) 240 i := 0 241 242 for k := range p.caps { 243 p.capnumlist[i] = k 244 i++ 245 } 246 247 sort.Ints(p.capnumlist) 248 } 249 250 // merge capsnumlist into capnamelist 251 if p.capnames != nil || p.capnumlist != nil { 252 var oldcapnamelist []string 253 var next int 254 var k int 255 256 if p.capnames == nil { 257 oldcapnamelist = nil 258 p.capnames = make(map[string]int) 259 p.capnamelist = []string{} 260 next = -1 261 } else { 262 oldcapnamelist = p.capnamelist 263 p.capnamelist = []string{} 264 next = p.capnames[oldcapnamelist[0]] 265 } 266 267 for i := 0; i < p.capcount; i++ { 268 j := i 269 if p.capnumlist != nil { 270 j = p.capnumlist[i] 271 } 272 273 if next == j { 274 p.capnamelist = append(p.capnamelist, oldcapnamelist[k]) 275 k++ 276 277 if k == len(oldcapnamelist) { 278 next = -1 279 } else { 280 next = p.capnames[oldcapnamelist[k]] 281 } 282 283 } else { 284 //feature: culture? 285 str := strconv.Itoa(j) 286 p.capnamelist = append(p.capnamelist, str) 287 p.capnames[str] = j 288 } 289 } 290 } 291} 292 293func (p *parser) consumeAutocap() int { 294 r := p.autocap 295 p.autocap++ 296 return r 297} 298 299// CountCaptures is a prescanner for deducing the slots used for 300// captures by doing a partial tokenization of the pattern. 301func (p *parser) countCaptures() error { 302 var ch rune 303 304 p.noteCaptureSlot(0, 0) 305 306 p.autocap = 1 307 308 for p.charsRight() > 0 { 309 pos := p.textpos() 310 ch = p.moveRightGetChar() 311 switch ch { 312 case '\\': 313 if p.charsRight() > 0 { 314 p.scanBackslash(true) 315 } 316 317 case '#': 318 if p.useOptionX() { 319 p.moveLeft() 320 p.scanBlank() 321 } 322 323 case '[': 324 p.scanCharSet(false, true) 325 326 case ')': 327 if !p.emptyOptionsStack() { 328 p.popOptions() 329 } 330 331 case '(': 332 if p.charsRight() >= 2 && p.rightChar(1) == '#' && p.rightChar(0) == '?' { 333 p.moveLeft() 334 p.scanBlank() 335 } else { 336 p.pushOptions() 337 if p.charsRight() > 0 && p.rightChar(0) == '?' { 338 // we have (?... 339 p.moveRight(1) 340 341 if p.charsRight() > 1 && (p.rightChar(0) == '<' || p.rightChar(0) == '\'') { 342 // named group: (?<... or (?'... 343 344 p.moveRight(1) 345 ch = p.rightChar(0) 346 347 if ch != '0' && IsWordChar(ch) { 348 if ch >= '1' && ch <= '9' { 349 dec, err := p.scanDecimal() 350 if err != nil { 351 return err 352 } 353 p.noteCaptureSlot(dec, pos) 354 } else { 355 p.noteCaptureName(p.scanCapname(), pos) 356 } 357 } 358 } else if p.useRE2() && p.charsRight() > 2 && (p.rightChar(0) == 'P' && p.rightChar(1) == '<') { 359 // RE2-compat (?P<) 360 p.moveRight(2) 361 ch = p.rightChar(0) 362 if IsWordChar(ch) { 363 p.noteCaptureName(p.scanCapname(), pos) 364 } 365 366 } else { 367 // (?... 368 369 // get the options if it's an option construct (?cimsx-cimsx...) 370 p.scanOptions() 371 372 if p.charsRight() > 0 { 373 if p.rightChar(0) == ')' { 374 // (?cimsx-cimsx) 375 p.moveRight(1) 376 p.popKeepOptions() 377 } else if p.rightChar(0) == '(' { 378 // alternation construct: (?(foo)yes|no) 379 // ignore the next paren so we don't capture the condition 380 p.ignoreNextParen = true 381 382 // break from here so we don't reset ignoreNextParen 383 continue 384 } 385 } 386 } 387 } else { 388 if !p.useOptionN() && !p.ignoreNextParen { 389 p.noteCaptureSlot(p.consumeAutocap(), pos) 390 } 391 } 392 } 393 394 p.ignoreNextParen = false 395 396 } 397 } 398 399 p.assignNameSlots() 400 return nil 401} 402 403func (p *parser) reset(topopts RegexOptions) { 404 p.currentPos = 0 405 p.autocap = 1 406 p.ignoreNextParen = false 407 408 if len(p.optionsStack) > 0 { 409 p.optionsStack = p.optionsStack[:0] 410 } 411 412 p.options = topopts 413 p.stack = nil 414} 415 416func (p *parser) scanRegex() (*regexNode, error) { 417 ch := '@' // nonspecial ch, means at beginning 418 isQuant := false 419 420 p.startGroup(newRegexNodeMN(ntCapture, p.options, 0, -1)) 421 422 for p.charsRight() > 0 { 423 wasPrevQuantifier := isQuant 424 isQuant = false 425 426 if err := p.scanBlank(); err != nil { 427 return nil, err 428 } 429 430 startpos := p.textpos() 431 432 // move past all of the normal characters. We'll stop when we hit some kind of control character, 433 // or if IgnorePatternWhiteSpace is on, we'll stop when we see some whitespace. 434 if p.useOptionX() { 435 for p.charsRight() > 0 { 436 ch = p.rightChar(0) 437 //UGLY: clean up, this is ugly 438 if !(!isStopperX(ch) || (ch == '{' && !p.isTrueQuantifier())) { 439 break 440 } 441 p.moveRight(1) 442 } 443 } else { 444 for p.charsRight() > 0 { 445 ch = p.rightChar(0) 446 if !(!isSpecial(ch) || ch == '{' && !p.isTrueQuantifier()) { 447 break 448 } 449 p.moveRight(1) 450 } 451 } 452 453 endpos := p.textpos() 454 455 p.scanBlank() 456 457 if p.charsRight() == 0 { 458 ch = '!' // nonspecial, means at end 459 } else if ch = p.rightChar(0); isSpecial(ch) { 460 isQuant = isQuantifier(ch) 461 p.moveRight(1) 462 } else { 463 ch = ' ' // nonspecial, means at ordinary char 464 } 465 466 if startpos < endpos { 467 cchUnquantified := endpos - startpos 468 if isQuant { 469 cchUnquantified-- 470 } 471 wasPrevQuantifier = false 472 473 if cchUnquantified > 0 { 474 p.addToConcatenate(startpos, cchUnquantified, false) 475 } 476 477 if isQuant { 478 p.addUnitOne(p.charAt(endpos - 1)) 479 } 480 } 481 482 switch ch { 483 case '!': 484 goto BreakOuterScan 485 486 case ' ': 487 goto ContinueOuterScan 488 489 case '[': 490 cc, err := p.scanCharSet(p.useOptionI(), false) 491 if err != nil { 492 return nil, err 493 } 494 p.addUnitSet(cc) 495 496 case '(': 497 p.pushOptions() 498 499 if grouper, err := p.scanGroupOpen(); err != nil { 500 return nil, err 501 } else if grouper == nil { 502 p.popKeepOptions() 503 } else { 504 p.pushGroup() 505 p.startGroup(grouper) 506 } 507 508 continue 509 510 case '|': 511 p.addAlternate() 512 goto ContinueOuterScan 513 514 case ')': 515 if p.emptyStack() { 516 return nil, p.getErr(ErrUnexpectedParen) 517 } 518 519 if err := p.addGroup(); err != nil { 520 return nil, err 521 } 522 if err := p.popGroup(); err != nil { 523 return nil, err 524 } 525 p.popOptions() 526 527 if p.unit == nil { 528 goto ContinueOuterScan 529 } 530 531 case '\\': 532 n, err := p.scanBackslash(false) 533 if err != nil { 534 return nil, err 535 } 536 p.addUnitNode(n) 537 538 case '^': 539 if p.useOptionM() { 540 p.addUnitType(ntBol) 541 } else { 542 p.addUnitType(ntBeginning) 543 } 544 545 case '$': 546 if p.useOptionM() { 547 p.addUnitType(ntEol) 548 } else { 549 p.addUnitType(ntEndZ) 550 } 551 552 case '.': 553 if p.useOptionE() { 554 p.addUnitSet(ECMAAnyClass()) 555 } else if p.useOptionS() { 556 p.addUnitSet(AnyClass()) 557 } else { 558 p.addUnitNotone('\n') 559 } 560 561 case '{', '*', '+', '?': 562 if p.unit == nil { 563 if wasPrevQuantifier { 564 return nil, p.getErr(ErrInvalidRepeatOp) 565 } else { 566 return nil, p.getErr(ErrMissingRepeatArgument) 567 } 568 } 569 p.moveLeft() 570 571 default: 572 return nil, p.getErr(ErrInternalError) 573 } 574 575 if err := p.scanBlank(); err != nil { 576 return nil, err 577 } 578 579 if p.charsRight() > 0 { 580 isQuant = p.isTrueQuantifier() 581 } 582 if p.charsRight() == 0 || !isQuant { 583 //maintain odd C# assignment order -- not sure if required, could clean up? 584 p.addConcatenate() 585 goto ContinueOuterScan 586 } 587 588 ch = p.moveRightGetChar() 589 590 // Handle quantifiers 591 for p.unit != nil { 592 var min, max int 593 var lazy bool 594 595 switch ch { 596 case '*': 597 min = 0 598 max = math.MaxInt32 599 600 case '?': 601 min = 0 602 max = 1 603 604 case '+': 605 min = 1 606 max = math.MaxInt32 607 608 case '{': 609 { 610 var err error 611 startpos = p.textpos() 612 if min, err = p.scanDecimal(); err != nil { 613 return nil, err 614 } 615 max = min 616 if startpos < p.textpos() { 617 if p.charsRight() > 0 && p.rightChar(0) == ',' { 618 p.moveRight(1) 619 if p.charsRight() == 0 || p.rightChar(0) == '}' { 620 max = math.MaxInt32 621 } else { 622 if max, err = p.scanDecimal(); err != nil { 623 return nil, err 624 } 625 } 626 } 627 } 628 629 if startpos == p.textpos() || p.charsRight() == 0 || p.moveRightGetChar() != '}' { 630 p.addConcatenate() 631 p.textto(startpos - 1) 632 goto ContinueOuterScan 633 } 634 } 635 636 default: 637 return nil, p.getErr(ErrInternalError) 638 } 639 640 if err := p.scanBlank(); err != nil { 641 return nil, err 642 } 643 644 if p.charsRight() == 0 || p.rightChar(0) != '?' { 645 lazy = false 646 } else { 647 p.moveRight(1) 648 lazy = true 649 } 650 651 if min > max { 652 return nil, p.getErr(ErrInvalidRepeatSize) 653 } 654 655 p.addConcatenate3(lazy, min, max) 656 } 657 658 ContinueOuterScan: 659 } 660 661BreakOuterScan: 662 ; 663 664 if !p.emptyStack() { 665 return nil, p.getErr(ErrMissingParen) 666 } 667 668 if err := p.addGroup(); err != nil { 669 return nil, err 670 } 671 672 return p.unit, nil 673 674} 675 676/* 677 * Simple parsing for replacement patterns 678 */ 679func (p *parser) scanReplacement() (*regexNode, error) { 680 var c, startpos int 681 682 p.concatenation = newRegexNode(ntConcatenate, p.options) 683 684 for { 685 c = p.charsRight() 686 if c == 0 { 687 break 688 } 689 690 startpos = p.textpos() 691 692 for c > 0 && p.rightChar(0) != '$' { 693 p.moveRight(1) 694 c-- 695 } 696 697 p.addToConcatenate(startpos, p.textpos()-startpos, true) 698 699 if c > 0 { 700 if p.moveRightGetChar() == '$' { 701 n, err := p.scanDollar() 702 if err != nil { 703 return nil, err 704 } 705 p.addUnitNode(n) 706 } 707 p.addConcatenate() 708 } 709 } 710 711 return p.concatenation, nil 712} 713 714/* 715 * Scans $ patterns recognized within replacement patterns 716 */ 717func (p *parser) scanDollar() (*regexNode, error) { 718 if p.charsRight() == 0 { 719 return newRegexNodeCh(ntOne, p.options, '$'), nil 720 } 721 722 ch := p.rightChar(0) 723 angled := false 724 backpos := p.textpos() 725 lastEndPos := backpos 726 727 // Note angle 728 729 if ch == '{' && p.charsRight() > 1 { 730 angled = true 731 p.moveRight(1) 732 ch = p.rightChar(0) 733 } 734 735 // Try to parse backreference: \1 or \{1} or \{cap} 736 737 if ch >= '0' && ch <= '9' { 738 if !angled && p.useOptionE() { 739 capnum := -1 740 newcapnum := int(ch - '0') 741 p.moveRight(1) 742 if p.isCaptureSlot(newcapnum) { 743 capnum = newcapnum 744 lastEndPos = p.textpos() 745 } 746 747 for p.charsRight() > 0 { 748 ch = p.rightChar(0) 749 if ch < '0' || ch > '9' { 750 break 751 } 752 digit := int(ch - '0') 753 if newcapnum > maxValueDiv10 || (newcapnum == maxValueDiv10 && digit > maxValueMod10) { 754 return nil, p.getErr(ErrCaptureGroupOutOfRange) 755 } 756 757 newcapnum = newcapnum*10 + digit 758 759 p.moveRight(1) 760 if p.isCaptureSlot(newcapnum) { 761 capnum = newcapnum 762 lastEndPos = p.textpos() 763 } 764 } 765 p.textto(lastEndPos) 766 if capnum >= 0 { 767 return newRegexNodeM(ntRef, p.options, capnum), nil 768 } 769 } else { 770 capnum, err := p.scanDecimal() 771 if err != nil { 772 return nil, err 773 } 774 if !angled || p.charsRight() > 0 && p.moveRightGetChar() == '}' { 775 if p.isCaptureSlot(capnum) { 776 return newRegexNodeM(ntRef, p.options, capnum), nil 777 } 778 } 779 } 780 } else if angled && IsWordChar(ch) { 781 capname := p.scanCapname() 782 783 if p.charsRight() > 0 && p.moveRightGetChar() == '}' { 784 if p.isCaptureName(capname) { 785 return newRegexNodeM(ntRef, p.options, p.captureSlotFromName(capname)), nil 786 } 787 } 788 } else if !angled { 789 capnum := 1 790 791 switch ch { 792 case '$': 793 p.moveRight(1) 794 return newRegexNodeCh(ntOne, p.options, '$'), nil 795 case '&': 796 capnum = 0 797 case '`': 798 capnum = replaceLeftPortion 799 case '\'': 800 capnum = replaceRightPortion 801 case '+': 802 capnum = replaceLastGroup 803 case '_': 804 capnum = replaceWholeString 805 } 806 807 if capnum != 1 { 808 p.moveRight(1) 809 return newRegexNodeM(ntRef, p.options, capnum), nil 810 } 811 } 812 813 // unrecognized $: literalize 814 815 p.textto(backpos) 816 return newRegexNodeCh(ntOne, p.options, '$'), nil 817} 818 819// scanGroupOpen scans chars following a '(' (not counting the '('), and returns 820// a RegexNode for the type of group scanned, or nil if the group 821// simply changed options (?cimsx-cimsx) or was a comment (#...). 822func (p *parser) scanGroupOpen() (*regexNode, error) { 823 var ch rune 824 var nt nodeType 825 var err error 826 close := '>' 827 start := p.textpos() 828 829 // just return a RegexNode if we have: 830 // 1. "(" followed by nothing 831 // 2. "(x" where x != ? 832 // 3. "(?)" 833 if p.charsRight() == 0 || p.rightChar(0) != '?' || (p.rightChar(0) == '?' && (p.charsRight() > 1 && p.rightChar(1) == ')')) { 834 if p.useOptionN() || p.ignoreNextParen { 835 p.ignoreNextParen = false 836 return newRegexNode(ntGroup, p.options), nil 837 } 838 return newRegexNodeMN(ntCapture, p.options, p.consumeAutocap(), -1), nil 839 } 840 841 p.moveRight(1) 842 843 for { 844 if p.charsRight() == 0 { 845 break 846 } 847 848 switch ch = p.moveRightGetChar(); ch { 849 case ':': 850 nt = ntGroup 851 852 case '=': 853 p.options &= ^RightToLeft 854 nt = ntRequire 855 856 case '!': 857 p.options &= ^RightToLeft 858 nt = ntPrevent 859 860 case '>': 861 nt = ntGreedy 862 863 case '\'': 864 close = '\'' 865 fallthrough 866 867 case '<': 868 if p.charsRight() == 0 { 869 goto BreakRecognize 870 } 871 872 switch ch = p.moveRightGetChar(); ch { 873 case '=': 874 if close == '\'' { 875 goto BreakRecognize 876 } 877 878 p.options |= RightToLeft 879 nt = ntRequire 880 881 case '!': 882 if close == '\'' { 883 goto BreakRecognize 884 } 885 886 p.options |= RightToLeft 887 nt = ntPrevent 888 889 default: 890 p.moveLeft() 891 capnum := -1 892 uncapnum := -1 893 proceed := false 894 895 // grab part before - 896 897 if ch >= '0' && ch <= '9' { 898 if capnum, err = p.scanDecimal(); err != nil { 899 return nil, err 900 } 901 902 if !p.isCaptureSlot(capnum) { 903 capnum = -1 904 } 905 906 // check if we have bogus characters after the number 907 if p.charsRight() > 0 && !(p.rightChar(0) == close || p.rightChar(0) == '-') { 908 return nil, p.getErr(ErrInvalidGroupName) 909 } 910 if capnum == 0 { 911 return nil, p.getErr(ErrCapNumNotZero) 912 } 913 } else if IsWordChar(ch) { 914 capname := p.scanCapname() 915 916 if p.isCaptureName(capname) { 917 capnum = p.captureSlotFromName(capname) 918 } 919 920 // check if we have bogus character after the name 921 if p.charsRight() > 0 && !(p.rightChar(0) == close || p.rightChar(0) == '-') { 922 return nil, p.getErr(ErrInvalidGroupName) 923 } 924 } else if ch == '-' { 925 proceed = true 926 } else { 927 // bad group name - starts with something other than a word character and isn't a number 928 return nil, p.getErr(ErrInvalidGroupName) 929 } 930 931 // grab part after - if any 932 933 if (capnum != -1 || proceed == true) && p.charsRight() > 0 && p.rightChar(0) == '-' { 934 p.moveRight(1) 935 936 //no more chars left, no closing char, etc 937 if p.charsRight() == 0 { 938 return nil, p.getErr(ErrInvalidGroupName) 939 } 940 941 ch = p.rightChar(0) 942 if ch >= '0' && ch <= '9' { 943 if uncapnum, err = p.scanDecimal(); err != nil { 944 return nil, err 945 } 946 947 if !p.isCaptureSlot(uncapnum) { 948 return nil, p.getErr(ErrUndefinedBackRef, uncapnum) 949 } 950 951 // check if we have bogus characters after the number 952 if p.charsRight() > 0 && p.rightChar(0) != close { 953 return nil, p.getErr(ErrInvalidGroupName) 954 } 955 } else if IsWordChar(ch) { 956 uncapname := p.scanCapname() 957 958 if !p.isCaptureName(uncapname) { 959 return nil, p.getErr(ErrUndefinedNameRef, uncapname) 960 } 961 uncapnum = p.captureSlotFromName(uncapname) 962 963 // check if we have bogus character after the name 964 if p.charsRight() > 0 && p.rightChar(0) != close { 965 return nil, p.getErr(ErrInvalidGroupName) 966 } 967 } else { 968 // bad group name - starts with something other than a word character and isn't a number 969 return nil, p.getErr(ErrInvalidGroupName) 970 } 971 } 972 973 // actually make the node 974 975 if (capnum != -1 || uncapnum != -1) && p.charsRight() > 0 && p.moveRightGetChar() == close { 976 return newRegexNodeMN(ntCapture, p.options, capnum, uncapnum), nil 977 } 978 goto BreakRecognize 979 } 980 981 case '(': 982 // alternation construct (?(...) | ) 983 984 parenPos := p.textpos() 985 if p.charsRight() > 0 { 986 ch = p.rightChar(0) 987 988 // check if the alternation condition is a backref 989 if ch >= '0' && ch <= '9' { 990 var capnum int 991 if capnum, err = p.scanDecimal(); err != nil { 992 return nil, err 993 } 994 if p.charsRight() > 0 && p.moveRightGetChar() == ')' { 995 if p.isCaptureSlot(capnum) { 996 return newRegexNodeM(ntTestref, p.options, capnum), nil 997 } 998 return nil, p.getErr(ErrUndefinedReference, capnum) 999 } 1000 1001 return nil, p.getErr(ErrMalformedReference, capnum) 1002 1003 } else if IsWordChar(ch) { 1004 capname := p.scanCapname() 1005 1006 if p.isCaptureName(capname) && p.charsRight() > 0 && p.moveRightGetChar() == ')' { 1007 return newRegexNodeM(ntTestref, p.options, p.captureSlotFromName(capname)), nil 1008 } 1009 } 1010 } 1011 // not a backref 1012 nt = ntTestgroup 1013 p.textto(parenPos - 1) // jump to the start of the parentheses 1014 p.ignoreNextParen = true // but make sure we don't try to capture the insides 1015 1016 charsRight := p.charsRight() 1017 if charsRight >= 3 && p.rightChar(1) == '?' { 1018 rightchar2 := p.rightChar(2) 1019 // disallow comments in the condition 1020 if rightchar2 == '#' { 1021 return nil, p.getErr(ErrAlternationCantHaveComment) 1022 } 1023 1024 // disallow named capture group (?<..>..) in the condition 1025 if rightchar2 == '\'' { 1026 return nil, p.getErr(ErrAlternationCantCapture) 1027 } 1028 1029 if charsRight >= 4 && (rightchar2 == '<' && p.rightChar(3) != '!' && p.rightChar(3) != '=') { 1030 return nil, p.getErr(ErrAlternationCantCapture) 1031 } 1032 } 1033 1034 case 'P': 1035 if p.useRE2() { 1036 // support for P<name> syntax 1037 if p.charsRight() < 3 { 1038 goto BreakRecognize 1039 } 1040 1041 ch = p.moveRightGetChar() 1042 if ch != '<' { 1043 goto BreakRecognize 1044 } 1045 1046 ch = p.moveRightGetChar() 1047 p.moveLeft() 1048 1049 if IsWordChar(ch) { 1050 capnum := -1 1051 capname := p.scanCapname() 1052 1053 if p.isCaptureName(capname) { 1054 capnum = p.captureSlotFromName(capname) 1055 } 1056 1057 // check if we have bogus character after the name 1058 if p.charsRight() > 0 && p.rightChar(0) != '>' { 1059 return nil, p.getErr(ErrInvalidGroupName) 1060 } 1061 1062 // actually make the node 1063 1064 if capnum != -1 && p.charsRight() > 0 && p.moveRightGetChar() == '>' { 1065 return newRegexNodeMN(ntCapture, p.options, capnum, -1), nil 1066 } 1067 goto BreakRecognize 1068 1069 } else { 1070 // bad group name - starts with something other than a word character and isn't a number 1071 return nil, p.getErr(ErrInvalidGroupName) 1072 } 1073 } 1074 // if we're not using RE2 compat mode then 1075 // we just behave like normal 1076 fallthrough 1077 1078 default: 1079 p.moveLeft() 1080 1081 nt = ntGroup 1082 // disallow options in the children of a testgroup node 1083 if p.group.t != ntTestgroup { 1084 p.scanOptions() 1085 } 1086 if p.charsRight() == 0 { 1087 goto BreakRecognize 1088 } 1089 1090 if ch = p.moveRightGetChar(); ch == ')' { 1091 return nil, nil 1092 } 1093 1094 if ch != ':' { 1095 goto BreakRecognize 1096 } 1097 1098 } 1099 1100 return newRegexNode(nt, p.options), nil 1101 } 1102 1103BreakRecognize: 1104 1105 // break Recognize comes here 1106 1107 return nil, p.getErr(ErrUnrecognizedGrouping, string(p.pattern[start:p.textpos()])) 1108} 1109 1110// scans backslash specials and basics 1111func (p *parser) scanBackslash(scanOnly bool) (*regexNode, error) { 1112 1113 if p.charsRight() == 0 { 1114 return nil, p.getErr(ErrIllegalEndEscape) 1115 } 1116 1117 switch ch := p.rightChar(0); ch { 1118 case 'b', 'B', 'A', 'G', 'Z', 'z': 1119 p.moveRight(1) 1120 return newRegexNode(p.typeFromCode(ch), p.options), nil 1121 1122 case 'w': 1123 p.moveRight(1) 1124 if p.useOptionE() { 1125 return newRegexNodeSet(ntSet, p.options, ECMAWordClass()), nil 1126 } 1127 return newRegexNodeSet(ntSet, p.options, WordClass()), nil 1128 1129 case 'W': 1130 p.moveRight(1) 1131 if p.useOptionE() { 1132 return newRegexNodeSet(ntSet, p.options, NotECMAWordClass()), nil 1133 } 1134 return newRegexNodeSet(ntSet, p.options, NotWordClass()), nil 1135 1136 case 's': 1137 p.moveRight(1) 1138 if p.useOptionE() { 1139 return newRegexNodeSet(ntSet, p.options, ECMASpaceClass()), nil 1140 } 1141 return newRegexNodeSet(ntSet, p.options, SpaceClass()), nil 1142 1143 case 'S': 1144 p.moveRight(1) 1145 if p.useOptionE() { 1146 return newRegexNodeSet(ntSet, p.options, NotECMASpaceClass()), nil 1147 } 1148 return newRegexNodeSet(ntSet, p.options, NotSpaceClass()), nil 1149 1150 case 'd': 1151 p.moveRight(1) 1152 if p.useOptionE() { 1153 return newRegexNodeSet(ntSet, p.options, ECMADigitClass()), nil 1154 } 1155 return newRegexNodeSet(ntSet, p.options, DigitClass()), nil 1156 1157 case 'D': 1158 p.moveRight(1) 1159 if p.useOptionE() { 1160 return newRegexNodeSet(ntSet, p.options, NotECMADigitClass()), nil 1161 } 1162 return newRegexNodeSet(ntSet, p.options, NotDigitClass()), nil 1163 1164 case 'p', 'P': 1165 p.moveRight(1) 1166 prop, err := p.parseProperty() 1167 if err != nil { 1168 return nil, err 1169 } 1170 cc := &CharSet{} 1171 cc.addCategory(prop, (ch != 'p'), p.useOptionI(), p.patternRaw) 1172 if p.useOptionI() { 1173 cc.addLowercase() 1174 } 1175 1176 return newRegexNodeSet(ntSet, p.options, cc), nil 1177 1178 default: 1179 return p.scanBasicBackslash(scanOnly) 1180 } 1181} 1182 1183// Scans \-style backreferences and character escapes 1184func (p *parser) scanBasicBackslash(scanOnly bool) (*regexNode, error) { 1185 if p.charsRight() == 0 { 1186 return nil, p.getErr(ErrIllegalEndEscape) 1187 } 1188 angled := false 1189 close := '\x00' 1190 1191 backpos := p.textpos() 1192 ch := p.rightChar(0) 1193 1194 // allow \k<foo> instead of \<foo>, which is now deprecated 1195 1196 if ch == 'k' { 1197 if p.charsRight() >= 2 { 1198 p.moveRight(1) 1199 ch = p.moveRightGetChar() 1200 1201 if ch == '<' || ch == '\'' { 1202 angled = true 1203 if ch == '\'' { 1204 close = '\'' 1205 } else { 1206 close = '>' 1207 } 1208 } 1209 } 1210 1211 if !angled || p.charsRight() <= 0 { 1212 return nil, p.getErr(ErrMalformedNameRef) 1213 } 1214 1215 ch = p.rightChar(0) 1216 1217 } else if (ch == '<' || ch == '\'') && p.charsRight() > 1 { // Note angle without \g 1218 angled = true 1219 if ch == '\'' { 1220 close = '\'' 1221 } else { 1222 close = '>' 1223 } 1224 1225 p.moveRight(1) 1226 ch = p.rightChar(0) 1227 } 1228 1229 // Try to parse backreference: \<1> or \<cap> 1230 1231 if angled && ch >= '0' && ch <= '9' { 1232 capnum, err := p.scanDecimal() 1233 if err != nil { 1234 return nil, err 1235 } 1236 1237 if p.charsRight() > 0 && p.moveRightGetChar() == close { 1238 if p.isCaptureSlot(capnum) { 1239 return newRegexNodeM(ntRef, p.options, capnum), nil 1240 } 1241 return nil, p.getErr(ErrUndefinedBackRef, capnum) 1242 } 1243 } else if !angled && ch >= '1' && ch <= '9' { // Try to parse backreference or octal: \1 1244 capnum, err := p.scanDecimal() 1245 if err != nil { 1246 return nil, err 1247 } 1248 1249 if scanOnly { 1250 return nil, nil 1251 } 1252 1253 if p.useOptionE() || p.isCaptureSlot(capnum) { 1254 return newRegexNodeM(ntRef, p.options, capnum), nil 1255 } 1256 if capnum <= 9 { 1257 return nil, p.getErr(ErrUndefinedBackRef, capnum) 1258 } 1259 1260 } else if angled && IsWordChar(ch) { 1261 capname := p.scanCapname() 1262 1263 if p.charsRight() > 0 && p.moveRightGetChar() == close { 1264 if p.isCaptureName(capname) { 1265 return newRegexNodeM(ntRef, p.options, p.captureSlotFromName(capname)), nil 1266 } 1267 return nil, p.getErr(ErrUndefinedNameRef, capname) 1268 } 1269 } 1270 1271 // Not backreference: must be char code 1272 1273 p.textto(backpos) 1274 ch, err := p.scanCharEscape() 1275 if err != nil { 1276 return nil, err 1277 } 1278 1279 if p.useOptionI() { 1280 ch = unicode.ToLower(ch) 1281 } 1282 1283 return newRegexNodeCh(ntOne, p.options, ch), nil 1284} 1285 1286// Scans X for \p{X} or \P{X} 1287func (p *parser) parseProperty() (string, error) { 1288 if p.charsRight() < 3 { 1289 return "", p.getErr(ErrIncompleteSlashP) 1290 } 1291 ch := p.moveRightGetChar() 1292 if ch != '{' { 1293 return "", p.getErr(ErrMalformedSlashP) 1294 } 1295 1296 startpos := p.textpos() 1297 for p.charsRight() > 0 { 1298 ch = p.moveRightGetChar() 1299 if !(IsWordChar(ch) || ch == '-') { 1300 p.moveLeft() 1301 break 1302 } 1303 } 1304 capname := string(p.pattern[startpos:p.textpos()]) 1305 1306 if p.charsRight() == 0 || p.moveRightGetChar() != '}' { 1307 return "", p.getErr(ErrIncompleteSlashP) 1308 } 1309 1310 if !isValidUnicodeCat(capname) { 1311 return "", p.getErr(ErrUnknownSlashP, capname) 1312 } 1313 1314 return capname, nil 1315} 1316 1317// Returns ReNode type for zero-length assertions with a \ code. 1318func (p *parser) typeFromCode(ch rune) nodeType { 1319 switch ch { 1320 case 'b': 1321 if p.useOptionE() { 1322 return ntECMABoundary 1323 } 1324 return ntBoundary 1325 case 'B': 1326 if p.useOptionE() { 1327 return ntNonECMABoundary 1328 } 1329 return ntNonboundary 1330 case 'A': 1331 return ntBeginning 1332 case 'G': 1333 return ntStart 1334 case 'Z': 1335 return ntEndZ 1336 case 'z': 1337 return ntEnd 1338 default: 1339 return ntNothing 1340 } 1341} 1342 1343// Scans whitespace or x-mode comments. 1344func (p *parser) scanBlank() error { 1345 if p.useOptionX() { 1346 for { 1347 for p.charsRight() > 0 && isSpace(p.rightChar(0)) { 1348 p.moveRight(1) 1349 } 1350 1351 if p.charsRight() == 0 { 1352 break 1353 } 1354 1355 if p.rightChar(0) == '#' { 1356 for p.charsRight() > 0 && p.rightChar(0) != '\n' { 1357 p.moveRight(1) 1358 } 1359 } else if p.charsRight() >= 3 && p.rightChar(2) == '#' && 1360 p.rightChar(1) == '?' && p.rightChar(0) == '(' { 1361 for p.charsRight() > 0 && p.rightChar(0) != ')' { 1362 p.moveRight(1) 1363 } 1364 if p.charsRight() == 0 { 1365 return p.getErr(ErrUnterminatedComment) 1366 } 1367 p.moveRight(1) 1368 } else { 1369 break 1370 } 1371 } 1372 } else { 1373 for { 1374 if p.charsRight() < 3 || p.rightChar(2) != '#' || 1375 p.rightChar(1) != '?' || p.rightChar(0) != '(' { 1376 return nil 1377 } 1378 1379 for p.charsRight() > 0 && p.rightChar(0) != ')' { 1380 p.moveRight(1) 1381 } 1382 if p.charsRight() == 0 { 1383 return p.getErr(ErrUnterminatedComment) 1384 } 1385 p.moveRight(1) 1386 } 1387 } 1388 return nil 1389} 1390 1391func (p *parser) scanCapname() string { 1392 startpos := p.textpos() 1393 1394 for p.charsRight() > 0 { 1395 if !IsWordChar(p.moveRightGetChar()) { 1396 p.moveLeft() 1397 break 1398 } 1399 } 1400 1401 return string(p.pattern[startpos:p.textpos()]) 1402} 1403 1404//Scans contents of [] (not including []'s), and converts to a set. 1405func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) { 1406 ch := '\x00' 1407 chPrev := '\x00' 1408 inRange := false 1409 firstChar := true 1410 closed := false 1411 1412 var cc *CharSet 1413 if !scanOnly { 1414 cc = &CharSet{} 1415 } 1416 1417 if p.charsRight() > 0 && p.rightChar(0) == '^' { 1418 p.moveRight(1) 1419 if !scanOnly { 1420 cc.negate = true 1421 } 1422 } 1423 1424 for ; p.charsRight() > 0; firstChar = false { 1425 fTranslatedChar := false 1426 ch = p.moveRightGetChar() 1427 if ch == ']' { 1428 if !firstChar { 1429 closed = true 1430 break 1431 } else if p.useOptionE() { 1432 if !scanOnly { 1433 cc.addRanges(NoneClass().ranges) 1434 } 1435 closed = true 1436 break 1437 } 1438 1439 } else if ch == '\\' && p.charsRight() > 0 { 1440 switch ch = p.moveRightGetChar(); ch { 1441 case 'D', 'd': 1442 if !scanOnly { 1443 if inRange { 1444 return nil, p.getErr(ErrBadClassInCharRange, ch) 1445 } 1446 cc.addDigit(p.useOptionE(), ch == 'D', p.patternRaw) 1447 } 1448 continue 1449 1450 case 'S', 's': 1451 if !scanOnly { 1452 if inRange { 1453 return nil, p.getErr(ErrBadClassInCharRange, ch) 1454 } 1455 cc.addSpace(p.useOptionE(), ch == 'S') 1456 } 1457 continue 1458 1459 case 'W', 'w': 1460 if !scanOnly { 1461 if inRange { 1462 return nil, p.getErr(ErrBadClassInCharRange, ch) 1463 } 1464 1465 cc.addWord(p.useOptionE(), ch == 'W') 1466 } 1467 continue 1468 1469 case 'p', 'P': 1470 if !scanOnly { 1471 if inRange { 1472 return nil, p.getErr(ErrBadClassInCharRange, ch) 1473 } 1474 prop, err := p.parseProperty() 1475 if err != nil { 1476 return nil, err 1477 } 1478 cc.addCategory(prop, (ch != 'p'), caseInsensitive, p.patternRaw) 1479 } else { 1480 p.parseProperty() 1481 } 1482 1483 continue 1484 1485 case '-': 1486 if !scanOnly { 1487 cc.addRange(ch, ch) 1488 } 1489 continue 1490 1491 default: 1492 p.moveLeft() 1493 var err error 1494 ch, err = p.scanCharEscape() // non-literal character 1495 if err != nil { 1496 return nil, err 1497 } 1498 fTranslatedChar = true 1499 break // this break will only break out of the switch 1500 } 1501 } else if ch == '[' { 1502 // This is code for Posix style properties - [:Ll:] or [:IsTibetan:]. 1503 // It currently doesn't do anything other than skip the whole thing! 1504 if p.charsRight() > 0 && p.rightChar(0) == ':' && !inRange { 1505 savePos := p.textpos() 1506 1507 p.moveRight(1) 1508 negate := false 1509 if p.charsRight() > 1 && p.rightChar(0) == '^' { 1510 negate = true 1511 p.moveRight(1) 1512 } 1513 1514 nm := p.scanCapname() // snag the name 1515 if !scanOnly && p.useRE2() { 1516 // look up the name since these are valid for RE2 1517 // add the group based on the name 1518 if ok := cc.addNamedASCII(nm, negate); !ok { 1519 return nil, p.getErr(ErrInvalidCharRange) 1520 } 1521 } 1522 if p.charsRight() < 2 || p.moveRightGetChar() != ':' || p.moveRightGetChar() != ']' { 1523 p.textto(savePos) 1524 } else if p.useRE2() { 1525 // move on 1526 continue 1527 } 1528 } 1529 } 1530 1531 if inRange { 1532 inRange = false 1533 if !scanOnly { 1534 if ch == '[' && !fTranslatedChar && !firstChar { 1535 // We thought we were in a range, but we're actually starting a subtraction. 1536 // In that case, we'll add chPrev to our char class, skip the opening [, and 1537 // scan the new character class recursively. 1538 cc.addChar(chPrev) 1539 sub, err := p.scanCharSet(caseInsensitive, false) 1540 if err != nil { 1541 return nil, err 1542 } 1543 cc.addSubtraction(sub) 1544 1545 if p.charsRight() > 0 && p.rightChar(0) != ']' { 1546 return nil, p.getErr(ErrSubtractionMustBeLast) 1547 } 1548 } else { 1549 // a regular range, like a-z 1550 if chPrev > ch { 1551 return nil, p.getErr(ErrReversedCharRange) 1552 } 1553 cc.addRange(chPrev, ch) 1554 } 1555 } 1556 } else if p.charsRight() >= 2 && p.rightChar(0) == '-' && p.rightChar(1) != ']' { 1557 // this could be the start of a range 1558 chPrev = ch 1559 inRange = true 1560 p.moveRight(1) 1561 } else if p.charsRight() >= 1 && ch == '-' && !fTranslatedChar && p.rightChar(0) == '[' && !firstChar { 1562 // we aren't in a range, and now there is a subtraction. Usually this happens 1563 // only when a subtraction follows a range, like [a-z-[b]] 1564 if !scanOnly { 1565 p.moveRight(1) 1566 sub, err := p.scanCharSet(caseInsensitive, false) 1567 if err != nil { 1568 return nil, err 1569 } 1570 cc.addSubtraction(sub) 1571 1572 if p.charsRight() > 0 && p.rightChar(0) != ']' { 1573 return nil, p.getErr(ErrSubtractionMustBeLast) 1574 } 1575 } else { 1576 p.moveRight(1) 1577 p.scanCharSet(caseInsensitive, true) 1578 } 1579 } else { 1580 if !scanOnly { 1581 cc.addRange(ch, ch) 1582 } 1583 } 1584 } 1585 1586 if !closed { 1587 return nil, p.getErr(ErrUnterminatedBracket) 1588 } 1589 1590 if !scanOnly && caseInsensitive { 1591 cc.addLowercase() 1592 } 1593 1594 return cc, nil 1595} 1596 1597// Scans any number of decimal digits (pegs value at 2^31-1 if too large) 1598func (p *parser) scanDecimal() (int, error) { 1599 i := 0 1600 var d int 1601 1602 for p.charsRight() > 0 { 1603 d = int(p.rightChar(0) - '0') 1604 if d < 0 || d > 9 { 1605 break 1606 } 1607 p.moveRight(1) 1608 1609 if i > maxValueDiv10 || (i == maxValueDiv10 && d > maxValueMod10) { 1610 return 0, p.getErr(ErrCaptureGroupOutOfRange) 1611 } 1612 1613 i *= 10 1614 i += d 1615 } 1616 1617 return int(i), nil 1618} 1619 1620// Returns true for options allowed only at the top level 1621func isOnlyTopOption(option RegexOptions) bool { 1622 return option == RightToLeft || option == ECMAScript || option == RE2 1623} 1624 1625// Scans cimsx-cimsx option string, stops at the first unrecognized char. 1626func (p *parser) scanOptions() { 1627 1628 for off := false; p.charsRight() > 0; p.moveRight(1) { 1629 ch := p.rightChar(0) 1630 1631 if ch == '-' { 1632 off = true 1633 } else if ch == '+' { 1634 off = false 1635 } else { 1636 option := optionFromCode(ch) 1637 if option == 0 || isOnlyTopOption(option) { 1638 return 1639 } 1640 1641 if off { 1642 p.options &= ^option 1643 } else { 1644 p.options |= option 1645 } 1646 } 1647 } 1648} 1649 1650// Scans \ code for escape codes that map to single unicode chars. 1651func (p *parser) scanCharEscape() (rune, error) { 1652 1653 ch := p.moveRightGetChar() 1654 1655 if ch >= '0' && ch <= '7' { 1656 p.moveLeft() 1657 return p.scanOctal(), nil 1658 } 1659 1660 switch ch { 1661 case 'x': 1662 // support for \x{HEX} syntax from Perl and PCRE 1663 if p.charsRight() > 0 && p.rightChar(0) == '{' { 1664 p.moveRight(1) 1665 return p.scanHexUntilBrace() 1666 } 1667 return p.scanHex(2) 1668 case 'u': 1669 return p.scanHex(4) 1670 case 'a': 1671 return '\u0007', nil 1672 case 'b': 1673 return '\b', nil 1674 case 'e': 1675 return '\u001B', nil 1676 case 'f': 1677 return '\f', nil 1678 case 'n': 1679 return '\n', nil 1680 case 'r': 1681 return '\r', nil 1682 case 't': 1683 return '\t', nil 1684 case 'v': 1685 return '\u000B', nil 1686 case 'c': 1687 return p.scanControl() 1688 default: 1689 if !p.useOptionE() && IsWordChar(ch) { 1690 return 0, p.getErr(ErrUnrecognizedEscape, string(ch)) 1691 } 1692 return ch, nil 1693 } 1694} 1695 1696// Grabs and converts an ascii control character 1697func (p *parser) scanControl() (rune, error) { 1698 if p.charsRight() <= 0 { 1699 return 0, p.getErr(ErrMissingControl) 1700 } 1701 1702 ch := p.moveRightGetChar() 1703 1704 // \ca interpreted as \cA 1705 1706 if ch >= 'a' && ch <= 'z' { 1707 ch = (ch - ('a' - 'A')) 1708 } 1709 ch = (ch - '@') 1710 if ch >= 0 && ch < ' ' { 1711 return ch, nil 1712 } 1713 1714 return 0, p.getErr(ErrUnrecognizedControl) 1715 1716} 1717 1718// Scan hex digits until we hit a closing brace. 1719// Non-hex digits, hex value too large for UTF-8, or running out of chars are errors 1720func (p *parser) scanHexUntilBrace() (rune, error) { 1721 // PCRE spec reads like unlimited hex digits are allowed, but unicode has a limit 1722 // so we can enforce that 1723 i := 0 1724 hasContent := false 1725 1726 for p.charsRight() > 0 { 1727 ch := p.moveRightGetChar() 1728 if ch == '}' { 1729 // hit our close brace, we're done here 1730 // prevent \x{} 1731 if !hasContent { 1732 return 0, p.getErr(ErrTooFewHex) 1733 } 1734 return rune(i), nil 1735 } 1736 hasContent = true 1737 // no brace needs to be hex digit 1738 d := hexDigit(ch) 1739 if d < 0 { 1740 return 0, p.getErr(ErrMissingBrace) 1741 } 1742 1743 i *= 0x10 1744 i += d 1745 1746 if i > unicode.MaxRune { 1747 return 0, p.getErr(ErrInvalidHex) 1748 } 1749 } 1750 1751 // we only make it here if we run out of digits without finding the brace 1752 return 0, p.getErr(ErrMissingBrace) 1753} 1754 1755// Scans exactly c hex digits (c=2 for \xFF, c=4 for \uFFFF) 1756func (p *parser) scanHex(c int) (rune, error) { 1757 1758 i := 0 1759 1760 if p.charsRight() >= c { 1761 for c > 0 { 1762 d := hexDigit(p.moveRightGetChar()) 1763 if d < 0 { 1764 break 1765 } 1766 i *= 0x10 1767 i += d 1768 c-- 1769 } 1770 } 1771 1772 if c > 0 { 1773 return 0, p.getErr(ErrTooFewHex) 1774 } 1775 1776 return rune(i), nil 1777} 1778 1779// Returns n <= 0xF for a hex digit. 1780func hexDigit(ch rune) int { 1781 1782 if d := uint(ch - '0'); d <= 9 { 1783 return int(d) 1784 } 1785 1786 if d := uint(ch - 'a'); d <= 5 { 1787 return int(d + 0xa) 1788 } 1789 1790 if d := uint(ch - 'A'); d <= 5 { 1791 return int(d + 0xa) 1792 } 1793 1794 return -1 1795} 1796 1797// Scans up to three octal digits (stops before exceeding 0377). 1798func (p *parser) scanOctal() rune { 1799 // Consume octal chars only up to 3 digits and value 0377 1800 1801 c := 3 1802 1803 if c > p.charsRight() { 1804 c = p.charsRight() 1805 } 1806 1807 //we know the first char is good because the caller had to check 1808 i := 0 1809 d := int(p.rightChar(0) - '0') 1810 for c > 0 && d <= 7 { 1811 i *= 8 1812 i += d 1813 if p.useOptionE() && i >= 0x20 { 1814 break 1815 } 1816 c-- 1817 1818 p.moveRight(1) 1819 if !p.rightMost() { 1820 d = int(p.rightChar(0) - '0') 1821 } 1822 } 1823 1824 // Octal codes only go up to 255. Any larger and the behavior that Perl follows 1825 // is simply to truncate the high bits. 1826 i &= 0xFF 1827 1828 return rune(i) 1829} 1830 1831// Returns the current parsing position. 1832func (p *parser) textpos() int { 1833 return p.currentPos 1834} 1835 1836// Zaps to a specific parsing position. 1837func (p *parser) textto(pos int) { 1838 p.currentPos = pos 1839} 1840 1841// Returns the char at the right of the current parsing position and advances to the right. 1842func (p *parser) moveRightGetChar() rune { 1843 ch := p.pattern[p.currentPos] 1844 p.currentPos++ 1845 return ch 1846} 1847 1848// Moves the current position to the right. 1849func (p *parser) moveRight(i int) { 1850 // default would be 1 1851 p.currentPos += i 1852} 1853 1854// Moves the current parsing position one to the left. 1855func (p *parser) moveLeft() { 1856 p.currentPos-- 1857} 1858 1859// Returns the char left of the current parsing position. 1860func (p *parser) charAt(i int) rune { 1861 return p.pattern[i] 1862} 1863 1864// Returns the char i chars right of the current parsing position. 1865func (p *parser) rightChar(i int) rune { 1866 // default would be 0 1867 return p.pattern[p.currentPos+i] 1868} 1869 1870// Number of characters to the right of the current parsing position. 1871func (p *parser) charsRight() int { 1872 return len(p.pattern) - p.currentPos 1873} 1874 1875func (p *parser) rightMost() bool { 1876 return p.currentPos == len(p.pattern) 1877} 1878 1879// Looks up the slot number for a given name 1880func (p *parser) captureSlotFromName(capname string) int { 1881 return p.capnames[capname] 1882} 1883 1884// True if the capture slot was noted 1885func (p *parser) isCaptureSlot(i int) bool { 1886 if p.caps != nil { 1887 _, ok := p.caps[i] 1888 return ok 1889 } 1890 1891 return (i >= 0 && i < p.capsize) 1892} 1893 1894// Looks up the slot number for a given name 1895func (p *parser) isCaptureName(capname string) bool { 1896 if p.capnames == nil { 1897 return false 1898 } 1899 1900 _, ok := p.capnames[capname] 1901 return ok 1902} 1903 1904// option shortcuts 1905 1906// True if N option disabling '(' autocapture is on. 1907func (p *parser) useOptionN() bool { 1908 return (p.options & ExplicitCapture) != 0 1909} 1910 1911// True if I option enabling case-insensitivity is on. 1912func (p *parser) useOptionI() bool { 1913 return (p.options & IgnoreCase) != 0 1914} 1915 1916// True if M option altering meaning of $ and ^ is on. 1917func (p *parser) useOptionM() bool { 1918 return (p.options & Multiline) != 0 1919} 1920 1921// True if S option altering meaning of . is on. 1922func (p *parser) useOptionS() bool { 1923 return (p.options & Singleline) != 0 1924} 1925 1926// True if X option enabling whitespace/comment mode is on. 1927func (p *parser) useOptionX() bool { 1928 return (p.options & IgnorePatternWhitespace) != 0 1929} 1930 1931// True if E option enabling ECMAScript behavior on. 1932func (p *parser) useOptionE() bool { 1933 return (p.options & ECMAScript) != 0 1934} 1935 1936// true to use RE2 compatibility parsing behavior. 1937func (p *parser) useRE2() bool { 1938 return (p.options & RE2) != 0 1939} 1940 1941// True if options stack is empty. 1942func (p *parser) emptyOptionsStack() bool { 1943 return len(p.optionsStack) == 0 1944} 1945 1946// Finish the current quantifiable (when a quantifier is not found or is not possible) 1947func (p *parser) addConcatenate() { 1948 // The first (| inside a Testgroup group goes directly to the group 1949 p.concatenation.addChild(p.unit) 1950 p.unit = nil 1951} 1952 1953// Finish the current quantifiable (when a quantifier is found) 1954func (p *parser) addConcatenate3(lazy bool, min, max int) { 1955 p.concatenation.addChild(p.unit.makeQuantifier(lazy, min, max)) 1956 p.unit = nil 1957} 1958 1959// Sets the current unit to a single char node 1960func (p *parser) addUnitOne(ch rune) { 1961 if p.useOptionI() { 1962 ch = unicode.ToLower(ch) 1963 } 1964 1965 p.unit = newRegexNodeCh(ntOne, p.options, ch) 1966} 1967 1968// Sets the current unit to a single inverse-char node 1969func (p *parser) addUnitNotone(ch rune) { 1970 if p.useOptionI() { 1971 ch = unicode.ToLower(ch) 1972 } 1973 1974 p.unit = newRegexNodeCh(ntNotone, p.options, ch) 1975} 1976 1977// Sets the current unit to a single set node 1978func (p *parser) addUnitSet(set *CharSet) { 1979 p.unit = newRegexNodeSet(ntSet, p.options, set) 1980} 1981 1982// Sets the current unit to a subtree 1983func (p *parser) addUnitNode(node *regexNode) { 1984 p.unit = node 1985} 1986 1987// Sets the current unit to an assertion of the specified type 1988func (p *parser) addUnitType(t nodeType) { 1989 p.unit = newRegexNode(t, p.options) 1990} 1991 1992// Finish the current group (in response to a ')' or end) 1993func (p *parser) addGroup() error { 1994 if p.group.t == ntTestgroup || p.group.t == ntTestref { 1995 p.group.addChild(p.concatenation.reverseLeft()) 1996 if (p.group.t == ntTestref && len(p.group.children) > 2) || len(p.group.children) > 3 { 1997 return p.getErr(ErrTooManyAlternates) 1998 } 1999 } else { 2000 p.alternation.addChild(p.concatenation.reverseLeft()) 2001 p.group.addChild(p.alternation) 2002 } 2003 2004 p.unit = p.group 2005 return nil 2006} 2007 2008// Pops the option stack, but keeps the current options unchanged. 2009func (p *parser) popKeepOptions() { 2010 lastIdx := len(p.optionsStack) - 1 2011 p.optionsStack = p.optionsStack[:lastIdx] 2012} 2013 2014// Recalls options from the stack. 2015func (p *parser) popOptions() { 2016 lastIdx := len(p.optionsStack) - 1 2017 // get the last item on the stack and then remove it by reslicing 2018 p.options = p.optionsStack[lastIdx] 2019 p.optionsStack = p.optionsStack[:lastIdx] 2020} 2021 2022// Saves options on a stack. 2023func (p *parser) pushOptions() { 2024 p.optionsStack = append(p.optionsStack, p.options) 2025} 2026 2027// Add a string to the last concatenate. 2028func (p *parser) addToConcatenate(pos, cch int, isReplacement bool) { 2029 var node *regexNode 2030 2031 if cch == 0 { 2032 return 2033 } 2034 2035 if cch > 1 { 2036 str := p.pattern[pos : pos+cch] 2037 2038 if p.useOptionI() && !isReplacement { 2039 // We do the ToLower character by character for consistency. With surrogate chars, doing 2040 // a ToLower on the entire string could actually change the surrogate pair. This is more correct 2041 // linguistically, but since Regex doesn't support surrogates, it's more important to be 2042 // consistent. 2043 for i := 0; i < len(str); i++ { 2044 str[i] = unicode.ToLower(str[i]) 2045 } 2046 } 2047 2048 node = newRegexNodeStr(ntMulti, p.options, str) 2049 } else { 2050 ch := p.charAt(pos) 2051 2052 if p.useOptionI() && !isReplacement { 2053 ch = unicode.ToLower(ch) 2054 } 2055 2056 node = newRegexNodeCh(ntOne, p.options, ch) 2057 } 2058 2059 p.concatenation.addChild(node) 2060} 2061 2062// Push the parser state (in response to an open paren) 2063func (p *parser) pushGroup() { 2064 p.group.next = p.stack 2065 p.alternation.next = p.group 2066 p.concatenation.next = p.alternation 2067 p.stack = p.concatenation 2068} 2069 2070// Remember the pushed state (in response to a ')') 2071func (p *parser) popGroup() error { 2072 p.concatenation = p.stack 2073 p.alternation = p.concatenation.next 2074 p.group = p.alternation.next 2075 p.stack = p.group.next 2076 2077 // The first () inside a Testgroup group goes directly to the group 2078 if p.group.t == ntTestgroup && len(p.group.children) == 0 { 2079 if p.unit == nil { 2080 return p.getErr(ErrConditionalExpression) 2081 } 2082 2083 p.group.addChild(p.unit) 2084 p.unit = nil 2085 } 2086 return nil 2087} 2088 2089// True if the group stack is empty. 2090func (p *parser) emptyStack() bool { 2091 return p.stack == nil 2092} 2093 2094// Start a new round for the parser state (in response to an open paren or string start) 2095func (p *parser) startGroup(openGroup *regexNode) { 2096 p.group = openGroup 2097 p.alternation = newRegexNode(ntAlternate, p.options) 2098 p.concatenation = newRegexNode(ntConcatenate, p.options) 2099} 2100 2101// Finish the current concatenation (in response to a |) 2102func (p *parser) addAlternate() { 2103 // The | parts inside a Testgroup group go directly to the group 2104 2105 if p.group.t == ntTestgroup || p.group.t == ntTestref { 2106 p.group.addChild(p.concatenation.reverseLeft()) 2107 } else { 2108 p.alternation.addChild(p.concatenation.reverseLeft()) 2109 } 2110 2111 p.concatenation = newRegexNode(ntConcatenate, p.options) 2112} 2113 2114// For categorizing ascii characters. 2115 2116const ( 2117 Q byte = 5 // quantifier 2118 S = 4 // ordinary stopper 2119 Z = 3 // ScanBlank stopper 2120 X = 2 // whitespace 2121 E = 1 // should be escaped 2122) 2123 2124var _category = []byte{ 2125 //01 2 3 4 5 6 7 8 9 A B C D E F 0 1 2 3 4 5 6 7 8 9 A B C D E F 2126 0, 0, 0, 0, 0, 0, 0, 0, 0, X, X, X, X, X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2127 // ! " # $ % & ' ( ) * + , - . / 0 1 2 3 4 5 6 7 8 9 : ; < = > ? 2128 X, 0, 0, Z, S, 0, 0, 0, S, S, Q, Q, 0, 0, S, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, Q, 2129 //@A B C D E F G H I J K L M N O P Q R S T U V W X Y Z [ \ ] ^ _ 2130 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, S, S, 0, S, 0, 2131 //'a b c d e f g h i j k l m n o p q r s t u v w x y z { | } ~ 2132 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, Q, S, 0, 0, 0, 2133} 2134 2135func isSpace(ch rune) bool { 2136 return (ch <= ' ' && _category[ch] == X) 2137} 2138 2139// Returns true for those characters that terminate a string of ordinary chars. 2140func isSpecial(ch rune) bool { 2141 return (ch <= '|' && _category[ch] >= S) 2142} 2143 2144// Returns true for those characters that terminate a string of ordinary chars. 2145func isStopperX(ch rune) bool { 2146 return (ch <= '|' && _category[ch] >= X) 2147} 2148 2149// Returns true for those characters that begin a quantifier. 2150func isQuantifier(ch rune) bool { 2151 return (ch <= '{' && _category[ch] >= Q) 2152} 2153 2154func (p *parser) isTrueQuantifier() bool { 2155 nChars := p.charsRight() 2156 if nChars == 0 { 2157 return false 2158 } 2159 2160 startpos := p.textpos() 2161 ch := p.charAt(startpos) 2162 if ch != '{' { 2163 return ch <= '{' && _category[ch] >= Q 2164 } 2165 2166 //UGLY: this is ugly -- the original code was ugly too 2167 pos := startpos 2168 for { 2169 nChars-- 2170 if nChars <= 0 { 2171 break 2172 } 2173 pos++ 2174 ch = p.charAt(pos) 2175 if ch < '0' || ch > '9' { 2176 break 2177 } 2178 } 2179 2180 if nChars == 0 || pos-startpos == 1 { 2181 return false 2182 } 2183 if ch == '}' { 2184 return true 2185 } 2186 if ch != ',' { 2187 return false 2188 } 2189 for { 2190 nChars-- 2191 if nChars <= 0 { 2192 break 2193 } 2194 pos++ 2195 ch = p.charAt(pos) 2196 if ch < '0' || ch > '9' { 2197 break 2198 } 2199 } 2200 2201 return nChars > 0 && ch == '}' 2202} 2203