1package syntax
2
3import (
4	"fmt"
5	"math"
6	"os"
7	"sort"
8	"strconv"
9	"unicode"
10)
11
12type RegexOptions int32
13
14const (
15	IgnoreCase              RegexOptions = 0x0001 // "i"
16	Multiline                            = 0x0002 // "m"
17	ExplicitCapture                      = 0x0004 // "n"
18	Compiled                             = 0x0008 // "c"
19	Singleline                           = 0x0010 // "s"
20	IgnorePatternWhitespace              = 0x0020 // "x"
21	RightToLeft                          = 0x0040 // "r"
22	Debug                                = 0x0080 // "d"
23	ECMAScript                           = 0x0100 // "e"
24	RE2                                  = 0x0200 // RE2 compat mode
25)
26
27func optionFromCode(ch rune) RegexOptions {
28	// case-insensitive
29	switch ch {
30	case 'i', 'I':
31		return IgnoreCase
32	case 'r', 'R':
33		return RightToLeft
34	case 'm', 'M':
35		return Multiline
36	case 'n', 'N':
37		return ExplicitCapture
38	case 's', 'S':
39		return Singleline
40	case 'x', 'X':
41		return IgnorePatternWhitespace
42	case 'd', 'D':
43		return Debug
44	case 'e', 'E':
45		return ECMAScript
46	default:
47		return 0
48	}
49}
50
51// An Error describes a failure to parse a regular expression
52// and gives the offending expression.
53type Error struct {
54	Code ErrorCode
55	Expr string
56	Args []interface{}
57}
58
59func (e *Error) Error() string {
60	if len(e.Args) == 0 {
61		return "error parsing regexp: " + e.Code.String() + " in `" + e.Expr + "`"
62	}
63	return "error parsing regexp: " + fmt.Sprintf(e.Code.String(), e.Args...) + " in `" + e.Expr + "`"
64}
65
66// An ErrorCode describes a failure to parse a regular expression.
67type ErrorCode string
68
69const (
70	// internal issue
71	ErrInternalError ErrorCode = "regexp/syntax: internal error"
72	// Parser errors
73	ErrUnterminatedComment        = "unterminated comment"
74	ErrInvalidCharRange           = "invalid character class range"
75	ErrInvalidRepeatSize          = "invalid repeat count"
76	ErrInvalidUTF8                = "invalid UTF-8"
77	ErrCaptureGroupOutOfRange     = "capture group number out of range"
78	ErrUnexpectedParen            = "unexpected )"
79	ErrMissingParen               = "missing closing )"
80	ErrMissingBrace               = "missing closing }"
81	ErrInvalidRepeatOp            = "invalid nested repetition operator"
82	ErrMissingRepeatArgument      = "missing argument to repetition operator"
83	ErrConditionalExpression      = "illegal conditional (?(...)) expression"
84	ErrTooManyAlternates          = "too many | in (?()|)"
85	ErrUnrecognizedGrouping       = "unrecognized grouping construct: (%v"
86	ErrInvalidGroupName           = "invalid group name: group names must begin with a word character and have a matching terminator"
87	ErrCapNumNotZero              = "capture number cannot be zero"
88	ErrUndefinedBackRef           = "reference to undefined group number %v"
89	ErrUndefinedNameRef           = "reference to undefined group name %v"
90	ErrAlternationCantCapture     = "alternation conditions do not capture and cannot be named"
91	ErrAlternationCantHaveComment = "alternation conditions cannot be comments"
92	ErrMalformedReference         = "(?(%v) ) malformed"
93	ErrUndefinedReference         = "(?(%v) ) reference to undefined group"
94	ErrIllegalEndEscape           = "illegal \\ at end of pattern"
95	ErrMalformedSlashP            = "malformed \\p{X} character escape"
96	ErrIncompleteSlashP           = "incomplete \\p{X} character escape"
97	ErrUnknownSlashP              = "unknown unicode category, script, or property '%v'"
98	ErrUnrecognizedEscape         = "unrecognized escape sequence \\%v"
99	ErrMissingControl             = "missing control character"
100	ErrUnrecognizedControl        = "unrecognized control character"
101	ErrTooFewHex                  = "insufficient hexadecimal digits"
102	ErrInvalidHex                 = "hex values may not be larger than 0x10FFFF"
103	ErrMalformedNameRef           = "malformed \\k<...> named back reference"
104	ErrBadClassInCharRange        = "cannot include class \\%v in character range"
105	ErrUnterminatedBracket        = "unterminated [] set"
106	ErrSubtractionMustBeLast      = "a subtraction must be the last element in a character class"
107	ErrReversedCharRange          = "[x-y] range in reverse order"
108)
109
110func (e ErrorCode) String() string {
111	return string(e)
112}
113
114type parser struct {
115	stack         *regexNode
116	group         *regexNode
117	alternation   *regexNode
118	concatenation *regexNode
119	unit          *regexNode
120
121	patternRaw string
122	pattern    []rune
123
124	currentPos  int
125	specialCase *unicode.SpecialCase
126
127	autocap  int
128	capcount int
129	captop   int
130	capsize  int
131
132	caps     map[int]int
133	capnames map[string]int
134
135	capnumlist  []int
136	capnamelist []string
137
138	options         RegexOptions
139	optionsStack    []RegexOptions
140	ignoreNextParen bool
141}
142
143const (
144	maxValueDiv10 int = math.MaxInt32 / 10
145	maxValueMod10     = math.MaxInt32 % 10
146)
147
148// Parse converts a regex string into a parse tree
149func Parse(re string, op RegexOptions) (*RegexTree, error) {
150	p := parser{
151		options: op,
152		caps:    make(map[int]int),
153	}
154	p.setPattern(re)
155
156	if err := p.countCaptures(); err != nil {
157		return nil, err
158	}
159
160	p.reset(op)
161	root, err := p.scanRegex()
162
163	if err != nil {
164		return nil, err
165	}
166	tree := &RegexTree{
167		root:       root,
168		caps:       p.caps,
169		capnumlist: p.capnumlist,
170		captop:     p.captop,
171		Capnames:   p.capnames,
172		Caplist:    p.capnamelist,
173		options:    op,
174	}
175
176	if tree.options&Debug > 0 {
177		os.Stdout.WriteString(tree.Dump())
178	}
179
180	return tree, nil
181}
182
183func (p *parser) setPattern(pattern string) {
184	p.patternRaw = pattern
185	p.pattern = make([]rune, 0, len(pattern))
186
187	//populate our rune array to handle utf8 encoding
188	for _, r := range pattern {
189		p.pattern = append(p.pattern, r)
190	}
191}
192func (p *parser) getErr(code ErrorCode, args ...interface{}) error {
193	return &Error{Code: code, Expr: p.patternRaw, Args: args}
194}
195
196func (p *parser) noteCaptureSlot(i, pos int) {
197	if _, ok := p.caps[i]; !ok {
198		// the rhs of the hashtable isn't used in the parser
199		p.caps[i] = pos
200		p.capcount++
201
202		if p.captop <= i {
203			if i == math.MaxInt32 {
204				p.captop = i
205			} else {
206				p.captop = i + 1
207			}
208		}
209	}
210}
211
212func (p *parser) noteCaptureName(name string, pos int) {
213	if p.capnames == nil {
214		p.capnames = make(map[string]int)
215	}
216
217	if _, ok := p.capnames[name]; !ok {
218		p.capnames[name] = pos
219		p.capnamelist = append(p.capnamelist, name)
220	}
221}
222
223func (p *parser) assignNameSlots() {
224	if p.capnames != nil {
225		for _, name := range p.capnamelist {
226			for p.isCaptureSlot(p.autocap) {
227				p.autocap++
228			}
229			pos := p.capnames[name]
230			p.capnames[name] = p.autocap
231			p.noteCaptureSlot(p.autocap, pos)
232
233			p.autocap++
234		}
235	}
236
237	// if the caps array has at least one gap, construct the list of used slots
238	if p.capcount < p.captop {
239		p.capnumlist = make([]int, p.capcount)
240		i := 0
241
242		for k := range p.caps {
243			p.capnumlist[i] = k
244			i++
245		}
246
247		sort.Ints(p.capnumlist)
248	}
249
250	// merge capsnumlist into capnamelist
251	if p.capnames != nil || p.capnumlist != nil {
252		var oldcapnamelist []string
253		var next int
254		var k int
255
256		if p.capnames == nil {
257			oldcapnamelist = nil
258			p.capnames = make(map[string]int)
259			p.capnamelist = []string{}
260			next = -1
261		} else {
262			oldcapnamelist = p.capnamelist
263			p.capnamelist = []string{}
264			next = p.capnames[oldcapnamelist[0]]
265		}
266
267		for i := 0; i < p.capcount; i++ {
268			j := i
269			if p.capnumlist != nil {
270				j = p.capnumlist[i]
271			}
272
273			if next == j {
274				p.capnamelist = append(p.capnamelist, oldcapnamelist[k])
275				k++
276
277				if k == len(oldcapnamelist) {
278					next = -1
279				} else {
280					next = p.capnames[oldcapnamelist[k]]
281				}
282
283			} else {
284				//feature: culture?
285				str := strconv.Itoa(j)
286				p.capnamelist = append(p.capnamelist, str)
287				p.capnames[str] = j
288			}
289		}
290	}
291}
292
293func (p *parser) consumeAutocap() int {
294	r := p.autocap
295	p.autocap++
296	return r
297}
298
299// CountCaptures is a prescanner for deducing the slots used for
300// captures by doing a partial tokenization of the pattern.
301func (p *parser) countCaptures() error {
302	var ch rune
303
304	p.noteCaptureSlot(0, 0)
305
306	p.autocap = 1
307
308	for p.charsRight() > 0 {
309		pos := p.textpos()
310		ch = p.moveRightGetChar()
311		switch ch {
312		case '\\':
313			if p.charsRight() > 0 {
314				p.scanBackslash(true)
315			}
316
317		case '#':
318			if p.useOptionX() {
319				p.moveLeft()
320				p.scanBlank()
321			}
322
323		case '[':
324			p.scanCharSet(false, true)
325
326		case ')':
327			if !p.emptyOptionsStack() {
328				p.popOptions()
329			}
330
331		case '(':
332			if p.charsRight() >= 2 && p.rightChar(1) == '#' && p.rightChar(0) == '?' {
333				p.moveLeft()
334				p.scanBlank()
335			} else {
336				p.pushOptions()
337				if p.charsRight() > 0 && p.rightChar(0) == '?' {
338					// we have (?...
339					p.moveRight(1)
340
341					if p.charsRight() > 1 && (p.rightChar(0) == '<' || p.rightChar(0) == '\'') {
342						// named group: (?<... or (?'...
343
344						p.moveRight(1)
345						ch = p.rightChar(0)
346
347						if ch != '0' && IsWordChar(ch) {
348							if ch >= '1' && ch <= '9' {
349								dec, err := p.scanDecimal()
350								if err != nil {
351									return err
352								}
353								p.noteCaptureSlot(dec, pos)
354							} else {
355								p.noteCaptureName(p.scanCapname(), pos)
356							}
357						}
358					} else if p.useRE2() && p.charsRight() > 2 && (p.rightChar(0) == 'P' && p.rightChar(1) == '<') {
359						// RE2-compat (?P<)
360						p.moveRight(2)
361						ch = p.rightChar(0)
362						if IsWordChar(ch) {
363							p.noteCaptureName(p.scanCapname(), pos)
364						}
365
366					} else {
367						// (?...
368
369						// get the options if it's an option construct (?cimsx-cimsx...)
370						p.scanOptions()
371
372						if p.charsRight() > 0 {
373							if p.rightChar(0) == ')' {
374								// (?cimsx-cimsx)
375								p.moveRight(1)
376								p.popKeepOptions()
377							} else if p.rightChar(0) == '(' {
378								// alternation construct: (?(foo)yes|no)
379								// ignore the next paren so we don't capture the condition
380								p.ignoreNextParen = true
381
382								// break from here so we don't reset ignoreNextParen
383								continue
384							}
385						}
386					}
387				} else {
388					if !p.useOptionN() && !p.ignoreNextParen {
389						p.noteCaptureSlot(p.consumeAutocap(), pos)
390					}
391				}
392			}
393
394			p.ignoreNextParen = false
395
396		}
397	}
398
399	p.assignNameSlots()
400	return nil
401}
402
403func (p *parser) reset(topopts RegexOptions) {
404	p.currentPos = 0
405	p.autocap = 1
406	p.ignoreNextParen = false
407
408	if len(p.optionsStack) > 0 {
409		p.optionsStack = p.optionsStack[:0]
410	}
411
412	p.options = topopts
413	p.stack = nil
414}
415
416func (p *parser) scanRegex() (*regexNode, error) {
417	ch := '@' // nonspecial ch, means at beginning
418	isQuant := false
419
420	p.startGroup(newRegexNodeMN(ntCapture, p.options, 0, -1))
421
422	for p.charsRight() > 0 {
423		wasPrevQuantifier := isQuant
424		isQuant = false
425
426		if err := p.scanBlank(); err != nil {
427			return nil, err
428		}
429
430		startpos := p.textpos()
431
432		// move past all of the normal characters.  We'll stop when we hit some kind of control character,
433		// or if IgnorePatternWhiteSpace is on, we'll stop when we see some whitespace.
434		if p.useOptionX() {
435			for p.charsRight() > 0 {
436				ch = p.rightChar(0)
437				//UGLY: clean up, this is ugly
438				if !(!isStopperX(ch) || (ch == '{' && !p.isTrueQuantifier())) {
439					break
440				}
441				p.moveRight(1)
442			}
443		} else {
444			for p.charsRight() > 0 {
445				ch = p.rightChar(0)
446				if !(!isSpecial(ch) || ch == '{' && !p.isTrueQuantifier()) {
447					break
448				}
449				p.moveRight(1)
450			}
451		}
452
453		endpos := p.textpos()
454
455		p.scanBlank()
456
457		if p.charsRight() == 0 {
458			ch = '!' // nonspecial, means at end
459		} else if ch = p.rightChar(0); isSpecial(ch) {
460			isQuant = isQuantifier(ch)
461			p.moveRight(1)
462		} else {
463			ch = ' ' // nonspecial, means at ordinary char
464		}
465
466		if startpos < endpos {
467			cchUnquantified := endpos - startpos
468			if isQuant {
469				cchUnquantified--
470			}
471			wasPrevQuantifier = false
472
473			if cchUnquantified > 0 {
474				p.addToConcatenate(startpos, cchUnquantified, false)
475			}
476
477			if isQuant {
478				p.addUnitOne(p.charAt(endpos - 1))
479			}
480		}
481
482		switch ch {
483		case '!':
484			goto BreakOuterScan
485
486		case ' ':
487			goto ContinueOuterScan
488
489		case '[':
490			cc, err := p.scanCharSet(p.useOptionI(), false)
491			if err != nil {
492				return nil, err
493			}
494			p.addUnitSet(cc)
495
496		case '(':
497			p.pushOptions()
498
499			if grouper, err := p.scanGroupOpen(); err != nil {
500				return nil, err
501			} else if grouper == nil {
502				p.popKeepOptions()
503			} else {
504				p.pushGroup()
505				p.startGroup(grouper)
506			}
507
508			continue
509
510		case '|':
511			p.addAlternate()
512			goto ContinueOuterScan
513
514		case ')':
515			if p.emptyStack() {
516				return nil, p.getErr(ErrUnexpectedParen)
517			}
518
519			if err := p.addGroup(); err != nil {
520				return nil, err
521			}
522			if err := p.popGroup(); err != nil {
523				return nil, err
524			}
525			p.popOptions()
526
527			if p.unit == nil {
528				goto ContinueOuterScan
529			}
530
531		case '\\':
532			n, err := p.scanBackslash(false)
533			if err != nil {
534				return nil, err
535			}
536			p.addUnitNode(n)
537
538		case '^':
539			if p.useOptionM() {
540				p.addUnitType(ntBol)
541			} else {
542				p.addUnitType(ntBeginning)
543			}
544
545		case '$':
546			if p.useOptionM() {
547				p.addUnitType(ntEol)
548			} else {
549				p.addUnitType(ntEndZ)
550			}
551
552		case '.':
553			if p.useOptionE() {
554				p.addUnitSet(ECMAAnyClass())
555			} else if p.useOptionS() {
556				p.addUnitSet(AnyClass())
557			} else {
558				p.addUnitNotone('\n')
559			}
560
561		case '{', '*', '+', '?':
562			if p.unit == nil {
563				if wasPrevQuantifier {
564					return nil, p.getErr(ErrInvalidRepeatOp)
565				} else {
566					return nil, p.getErr(ErrMissingRepeatArgument)
567				}
568			}
569			p.moveLeft()
570
571		default:
572			return nil, p.getErr(ErrInternalError)
573		}
574
575		if err := p.scanBlank(); err != nil {
576			return nil, err
577		}
578
579		if p.charsRight() > 0 {
580			isQuant = p.isTrueQuantifier()
581		}
582		if p.charsRight() == 0 || !isQuant {
583			//maintain odd C# assignment order -- not sure if required, could clean up?
584			p.addConcatenate()
585			goto ContinueOuterScan
586		}
587
588		ch = p.moveRightGetChar()
589
590		// Handle quantifiers
591		for p.unit != nil {
592			var min, max int
593			var lazy bool
594
595			switch ch {
596			case '*':
597				min = 0
598				max = math.MaxInt32
599
600			case '?':
601				min = 0
602				max = 1
603
604			case '+':
605				min = 1
606				max = math.MaxInt32
607
608			case '{':
609				{
610					var err error
611					startpos = p.textpos()
612					if min, err = p.scanDecimal(); err != nil {
613						return nil, err
614					}
615					max = min
616					if startpos < p.textpos() {
617						if p.charsRight() > 0 && p.rightChar(0) == ',' {
618							p.moveRight(1)
619							if p.charsRight() == 0 || p.rightChar(0) == '}' {
620								max = math.MaxInt32
621							} else {
622								if max, err = p.scanDecimal(); err != nil {
623									return nil, err
624								}
625							}
626						}
627					}
628
629					if startpos == p.textpos() || p.charsRight() == 0 || p.moveRightGetChar() != '}' {
630						p.addConcatenate()
631						p.textto(startpos - 1)
632						goto ContinueOuterScan
633					}
634				}
635
636			default:
637				return nil, p.getErr(ErrInternalError)
638			}
639
640			if err := p.scanBlank(); err != nil {
641				return nil, err
642			}
643
644			if p.charsRight() == 0 || p.rightChar(0) != '?' {
645				lazy = false
646			} else {
647				p.moveRight(1)
648				lazy = true
649			}
650
651			if min > max {
652				return nil, p.getErr(ErrInvalidRepeatSize)
653			}
654
655			p.addConcatenate3(lazy, min, max)
656		}
657
658	ContinueOuterScan:
659	}
660
661BreakOuterScan:
662	;
663
664	if !p.emptyStack() {
665		return nil, p.getErr(ErrMissingParen)
666	}
667
668	if err := p.addGroup(); err != nil {
669		return nil, err
670	}
671
672	return p.unit, nil
673
674}
675
676/*
677 * Simple parsing for replacement patterns
678 */
679func (p *parser) scanReplacement() (*regexNode, error) {
680	var c, startpos int
681
682	p.concatenation = newRegexNode(ntConcatenate, p.options)
683
684	for {
685		c = p.charsRight()
686		if c == 0 {
687			break
688		}
689
690		startpos = p.textpos()
691
692		for c > 0 && p.rightChar(0) != '$' {
693			p.moveRight(1)
694			c--
695		}
696
697		p.addToConcatenate(startpos, p.textpos()-startpos, true)
698
699		if c > 0 {
700			if p.moveRightGetChar() == '$' {
701				n, err := p.scanDollar()
702				if err != nil {
703					return nil, err
704				}
705				p.addUnitNode(n)
706			}
707			p.addConcatenate()
708		}
709	}
710
711	return p.concatenation, nil
712}
713
714/*
715 * Scans $ patterns recognized within replacement patterns
716 */
717func (p *parser) scanDollar() (*regexNode, error) {
718	if p.charsRight() == 0 {
719		return newRegexNodeCh(ntOne, p.options, '$'), nil
720	}
721
722	ch := p.rightChar(0)
723	angled := false
724	backpos := p.textpos()
725	lastEndPos := backpos
726
727	// Note angle
728
729	if ch == '{' && p.charsRight() > 1 {
730		angled = true
731		p.moveRight(1)
732		ch = p.rightChar(0)
733	}
734
735	// Try to parse backreference: \1 or \{1} or \{cap}
736
737	if ch >= '0' && ch <= '9' {
738		if !angled && p.useOptionE() {
739			capnum := -1
740			newcapnum := int(ch - '0')
741			p.moveRight(1)
742			if p.isCaptureSlot(newcapnum) {
743				capnum = newcapnum
744				lastEndPos = p.textpos()
745			}
746
747			for p.charsRight() > 0 {
748				ch = p.rightChar(0)
749				if ch < '0' || ch > '9' {
750					break
751				}
752				digit := int(ch - '0')
753				if newcapnum > maxValueDiv10 || (newcapnum == maxValueDiv10 && digit > maxValueMod10) {
754					return nil, p.getErr(ErrCaptureGroupOutOfRange)
755				}
756
757				newcapnum = newcapnum*10 + digit
758
759				p.moveRight(1)
760				if p.isCaptureSlot(newcapnum) {
761					capnum = newcapnum
762					lastEndPos = p.textpos()
763				}
764			}
765			p.textto(lastEndPos)
766			if capnum >= 0 {
767				return newRegexNodeM(ntRef, p.options, capnum), nil
768			}
769		} else {
770			capnum, err := p.scanDecimal()
771			if err != nil {
772				return nil, err
773			}
774			if !angled || p.charsRight() > 0 && p.moveRightGetChar() == '}' {
775				if p.isCaptureSlot(capnum) {
776					return newRegexNodeM(ntRef, p.options, capnum), nil
777				}
778			}
779		}
780	} else if angled && IsWordChar(ch) {
781		capname := p.scanCapname()
782
783		if p.charsRight() > 0 && p.moveRightGetChar() == '}' {
784			if p.isCaptureName(capname) {
785				return newRegexNodeM(ntRef, p.options, p.captureSlotFromName(capname)), nil
786			}
787		}
788	} else if !angled {
789		capnum := 1
790
791		switch ch {
792		case '$':
793			p.moveRight(1)
794			return newRegexNodeCh(ntOne, p.options, '$'), nil
795		case '&':
796			capnum = 0
797		case '`':
798			capnum = replaceLeftPortion
799		case '\'':
800			capnum = replaceRightPortion
801		case '+':
802			capnum = replaceLastGroup
803		case '_':
804			capnum = replaceWholeString
805		}
806
807		if capnum != 1 {
808			p.moveRight(1)
809			return newRegexNodeM(ntRef, p.options, capnum), nil
810		}
811	}
812
813	// unrecognized $: literalize
814
815	p.textto(backpos)
816	return newRegexNodeCh(ntOne, p.options, '$'), nil
817}
818
819// scanGroupOpen scans chars following a '(' (not counting the '('), and returns
820// a RegexNode for the type of group scanned, or nil if the group
821// simply changed options (?cimsx-cimsx) or was a comment (#...).
822func (p *parser) scanGroupOpen() (*regexNode, error) {
823	var ch rune
824	var nt nodeType
825	var err error
826	close := '>'
827	start := p.textpos()
828
829	// just return a RegexNode if we have:
830	// 1. "(" followed by nothing
831	// 2. "(x" where x != ?
832	// 3. "(?)"
833	if p.charsRight() == 0 || p.rightChar(0) != '?' || (p.rightChar(0) == '?' && (p.charsRight() > 1 && p.rightChar(1) == ')')) {
834		if p.useOptionN() || p.ignoreNextParen {
835			p.ignoreNextParen = false
836			return newRegexNode(ntGroup, p.options), nil
837		}
838		return newRegexNodeMN(ntCapture, p.options, p.consumeAutocap(), -1), nil
839	}
840
841	p.moveRight(1)
842
843	for {
844		if p.charsRight() == 0 {
845			break
846		}
847
848		switch ch = p.moveRightGetChar(); ch {
849		case ':':
850			nt = ntGroup
851
852		case '=':
853			p.options &= ^RightToLeft
854			nt = ntRequire
855
856		case '!':
857			p.options &= ^RightToLeft
858			nt = ntPrevent
859
860		case '>':
861			nt = ntGreedy
862
863		case '\'':
864			close = '\''
865			fallthrough
866
867		case '<':
868			if p.charsRight() == 0 {
869				goto BreakRecognize
870			}
871
872			switch ch = p.moveRightGetChar(); ch {
873			case '=':
874				if close == '\'' {
875					goto BreakRecognize
876				}
877
878				p.options |= RightToLeft
879				nt = ntRequire
880
881			case '!':
882				if close == '\'' {
883					goto BreakRecognize
884				}
885
886				p.options |= RightToLeft
887				nt = ntPrevent
888
889			default:
890				p.moveLeft()
891				capnum := -1
892				uncapnum := -1
893				proceed := false
894
895				// grab part before -
896
897				if ch >= '0' && ch <= '9' {
898					if capnum, err = p.scanDecimal(); err != nil {
899						return nil, err
900					}
901
902					if !p.isCaptureSlot(capnum) {
903						capnum = -1
904					}
905
906					// check if we have bogus characters after the number
907					if p.charsRight() > 0 && !(p.rightChar(0) == close || p.rightChar(0) == '-') {
908						return nil, p.getErr(ErrInvalidGroupName)
909					}
910					if capnum == 0 {
911						return nil, p.getErr(ErrCapNumNotZero)
912					}
913				} else if IsWordChar(ch) {
914					capname := p.scanCapname()
915
916					if p.isCaptureName(capname) {
917						capnum = p.captureSlotFromName(capname)
918					}
919
920					// check if we have bogus character after the name
921					if p.charsRight() > 0 && !(p.rightChar(0) == close || p.rightChar(0) == '-') {
922						return nil, p.getErr(ErrInvalidGroupName)
923					}
924				} else if ch == '-' {
925					proceed = true
926				} else {
927					// bad group name - starts with something other than a word character and isn't a number
928					return nil, p.getErr(ErrInvalidGroupName)
929				}
930
931				// grab part after - if any
932
933				if (capnum != -1 || proceed == true) && p.charsRight() > 0 && p.rightChar(0) == '-' {
934					p.moveRight(1)
935
936					//no more chars left, no closing char, etc
937					if p.charsRight() == 0 {
938						return nil, p.getErr(ErrInvalidGroupName)
939					}
940
941					ch = p.rightChar(0)
942					if ch >= '0' && ch <= '9' {
943						if uncapnum, err = p.scanDecimal(); err != nil {
944							return nil, err
945						}
946
947						if !p.isCaptureSlot(uncapnum) {
948							return nil, p.getErr(ErrUndefinedBackRef, uncapnum)
949						}
950
951						// check if we have bogus characters after the number
952						if p.charsRight() > 0 && p.rightChar(0) != close {
953							return nil, p.getErr(ErrInvalidGroupName)
954						}
955					} else if IsWordChar(ch) {
956						uncapname := p.scanCapname()
957
958						if !p.isCaptureName(uncapname) {
959							return nil, p.getErr(ErrUndefinedNameRef, uncapname)
960						}
961						uncapnum = p.captureSlotFromName(uncapname)
962
963						// check if we have bogus character after the name
964						if p.charsRight() > 0 && p.rightChar(0) != close {
965							return nil, p.getErr(ErrInvalidGroupName)
966						}
967					} else {
968						// bad group name - starts with something other than a word character and isn't a number
969						return nil, p.getErr(ErrInvalidGroupName)
970					}
971				}
972
973				// actually make the node
974
975				if (capnum != -1 || uncapnum != -1) && p.charsRight() > 0 && p.moveRightGetChar() == close {
976					return newRegexNodeMN(ntCapture, p.options, capnum, uncapnum), nil
977				}
978				goto BreakRecognize
979			}
980
981		case '(':
982			// alternation construct (?(...) | )
983
984			parenPos := p.textpos()
985			if p.charsRight() > 0 {
986				ch = p.rightChar(0)
987
988				// check if the alternation condition is a backref
989				if ch >= '0' && ch <= '9' {
990					var capnum int
991					if capnum, err = p.scanDecimal(); err != nil {
992						return nil, err
993					}
994					if p.charsRight() > 0 && p.moveRightGetChar() == ')' {
995						if p.isCaptureSlot(capnum) {
996							return newRegexNodeM(ntTestref, p.options, capnum), nil
997						}
998						return nil, p.getErr(ErrUndefinedReference, capnum)
999					}
1000
1001					return nil, p.getErr(ErrMalformedReference, capnum)
1002
1003				} else if IsWordChar(ch) {
1004					capname := p.scanCapname()
1005
1006					if p.isCaptureName(capname) && p.charsRight() > 0 && p.moveRightGetChar() == ')' {
1007						return newRegexNodeM(ntTestref, p.options, p.captureSlotFromName(capname)), nil
1008					}
1009				}
1010			}
1011			// not a backref
1012			nt = ntTestgroup
1013			p.textto(parenPos - 1)   // jump to the start of the parentheses
1014			p.ignoreNextParen = true // but make sure we don't try to capture the insides
1015
1016			charsRight := p.charsRight()
1017			if charsRight >= 3 && p.rightChar(1) == '?' {
1018				rightchar2 := p.rightChar(2)
1019				// disallow comments in the condition
1020				if rightchar2 == '#' {
1021					return nil, p.getErr(ErrAlternationCantHaveComment)
1022				}
1023
1024				// disallow named capture group (?<..>..) in the condition
1025				if rightchar2 == '\'' {
1026					return nil, p.getErr(ErrAlternationCantCapture)
1027				}
1028
1029				if charsRight >= 4 && (rightchar2 == '<' && p.rightChar(3) != '!' && p.rightChar(3) != '=') {
1030					return nil, p.getErr(ErrAlternationCantCapture)
1031				}
1032			}
1033
1034		case 'P':
1035			if p.useRE2() {
1036				// support for P<name> syntax
1037				if p.charsRight() < 3 {
1038					goto BreakRecognize
1039				}
1040
1041				ch = p.moveRightGetChar()
1042				if ch != '<' {
1043					goto BreakRecognize
1044				}
1045
1046				ch = p.moveRightGetChar()
1047				p.moveLeft()
1048
1049				if IsWordChar(ch) {
1050					capnum := -1
1051					capname := p.scanCapname()
1052
1053					if p.isCaptureName(capname) {
1054						capnum = p.captureSlotFromName(capname)
1055					}
1056
1057					// check if we have bogus character after the name
1058					if p.charsRight() > 0 && p.rightChar(0) != '>' {
1059						return nil, p.getErr(ErrInvalidGroupName)
1060					}
1061
1062					// actually make the node
1063
1064					if capnum != -1 && p.charsRight() > 0 && p.moveRightGetChar() == '>' {
1065						return newRegexNodeMN(ntCapture, p.options, capnum, -1), nil
1066					}
1067					goto BreakRecognize
1068
1069				} else {
1070					// bad group name - starts with something other than a word character and isn't a number
1071					return nil, p.getErr(ErrInvalidGroupName)
1072				}
1073			}
1074			// if we're not using RE2 compat mode then
1075			// we just behave like normal
1076			fallthrough
1077
1078		default:
1079			p.moveLeft()
1080
1081			nt = ntGroup
1082			// disallow options in the children of a testgroup node
1083			if p.group.t != ntTestgroup {
1084				p.scanOptions()
1085			}
1086			if p.charsRight() == 0 {
1087				goto BreakRecognize
1088			}
1089
1090			if ch = p.moveRightGetChar(); ch == ')' {
1091				return nil, nil
1092			}
1093
1094			if ch != ':' {
1095				goto BreakRecognize
1096			}
1097
1098		}
1099
1100		return newRegexNode(nt, p.options), nil
1101	}
1102
1103BreakRecognize:
1104
1105	// break Recognize comes here
1106
1107	return nil, p.getErr(ErrUnrecognizedGrouping, string(p.pattern[start:p.textpos()]))
1108}
1109
1110// scans backslash specials and basics
1111func (p *parser) scanBackslash(scanOnly bool) (*regexNode, error) {
1112
1113	if p.charsRight() == 0 {
1114		return nil, p.getErr(ErrIllegalEndEscape)
1115	}
1116
1117	switch ch := p.rightChar(0); ch {
1118	case 'b', 'B', 'A', 'G', 'Z', 'z':
1119		p.moveRight(1)
1120		return newRegexNode(p.typeFromCode(ch), p.options), nil
1121
1122	case 'w':
1123		p.moveRight(1)
1124		if p.useOptionE() {
1125			return newRegexNodeSet(ntSet, p.options, ECMAWordClass()), nil
1126		}
1127		return newRegexNodeSet(ntSet, p.options, WordClass()), nil
1128
1129	case 'W':
1130		p.moveRight(1)
1131		if p.useOptionE() {
1132			return newRegexNodeSet(ntSet, p.options, NotECMAWordClass()), nil
1133		}
1134		return newRegexNodeSet(ntSet, p.options, NotWordClass()), nil
1135
1136	case 's':
1137		p.moveRight(1)
1138		if p.useOptionE() {
1139			return newRegexNodeSet(ntSet, p.options, ECMASpaceClass()), nil
1140		}
1141		return newRegexNodeSet(ntSet, p.options, SpaceClass()), nil
1142
1143	case 'S':
1144		p.moveRight(1)
1145		if p.useOptionE() {
1146			return newRegexNodeSet(ntSet, p.options, NotECMASpaceClass()), nil
1147		}
1148		return newRegexNodeSet(ntSet, p.options, NotSpaceClass()), nil
1149
1150	case 'd':
1151		p.moveRight(1)
1152		if p.useOptionE() {
1153			return newRegexNodeSet(ntSet, p.options, ECMADigitClass()), nil
1154		}
1155		return newRegexNodeSet(ntSet, p.options, DigitClass()), nil
1156
1157	case 'D':
1158		p.moveRight(1)
1159		if p.useOptionE() {
1160			return newRegexNodeSet(ntSet, p.options, NotECMADigitClass()), nil
1161		}
1162		return newRegexNodeSet(ntSet, p.options, NotDigitClass()), nil
1163
1164	case 'p', 'P':
1165		p.moveRight(1)
1166		prop, err := p.parseProperty()
1167		if err != nil {
1168			return nil, err
1169		}
1170		cc := &CharSet{}
1171		cc.addCategory(prop, (ch != 'p'), p.useOptionI(), p.patternRaw)
1172		if p.useOptionI() {
1173			cc.addLowercase()
1174		}
1175
1176		return newRegexNodeSet(ntSet, p.options, cc), nil
1177
1178	default:
1179		return p.scanBasicBackslash(scanOnly)
1180	}
1181}
1182
1183// Scans \-style backreferences and character escapes
1184func (p *parser) scanBasicBackslash(scanOnly bool) (*regexNode, error) {
1185	if p.charsRight() == 0 {
1186		return nil, p.getErr(ErrIllegalEndEscape)
1187	}
1188	angled := false
1189	close := '\x00'
1190
1191	backpos := p.textpos()
1192	ch := p.rightChar(0)
1193
1194	// allow \k<foo> instead of \<foo>, which is now deprecated
1195
1196	if ch == 'k' {
1197		if p.charsRight() >= 2 {
1198			p.moveRight(1)
1199			ch = p.moveRightGetChar()
1200
1201			if ch == '<' || ch == '\'' {
1202				angled = true
1203				if ch == '\'' {
1204					close = '\''
1205				} else {
1206					close = '>'
1207				}
1208			}
1209		}
1210
1211		if !angled || p.charsRight() <= 0 {
1212			return nil, p.getErr(ErrMalformedNameRef)
1213		}
1214
1215		ch = p.rightChar(0)
1216
1217	} else if (ch == '<' || ch == '\'') && p.charsRight() > 1 { // Note angle without \g
1218		angled = true
1219		if ch == '\'' {
1220			close = '\''
1221		} else {
1222			close = '>'
1223		}
1224
1225		p.moveRight(1)
1226		ch = p.rightChar(0)
1227	}
1228
1229	// Try to parse backreference: \<1> or \<cap>
1230
1231	if angled && ch >= '0' && ch <= '9' {
1232		capnum, err := p.scanDecimal()
1233		if err != nil {
1234			return nil, err
1235		}
1236
1237		if p.charsRight() > 0 && p.moveRightGetChar() == close {
1238			if p.isCaptureSlot(capnum) {
1239				return newRegexNodeM(ntRef, p.options, capnum), nil
1240			}
1241			return nil, p.getErr(ErrUndefinedBackRef, capnum)
1242		}
1243	} else if !angled && ch >= '1' && ch <= '9' { // Try to parse backreference or octal: \1
1244		capnum, err := p.scanDecimal()
1245		if err != nil {
1246			return nil, err
1247		}
1248
1249		if scanOnly {
1250			return nil, nil
1251		}
1252
1253		if p.useOptionE() || p.isCaptureSlot(capnum) {
1254			return newRegexNodeM(ntRef, p.options, capnum), nil
1255		}
1256		if capnum <= 9 {
1257			return nil, p.getErr(ErrUndefinedBackRef, capnum)
1258		}
1259
1260	} else if angled && IsWordChar(ch) {
1261		capname := p.scanCapname()
1262
1263		if p.charsRight() > 0 && p.moveRightGetChar() == close {
1264			if p.isCaptureName(capname) {
1265				return newRegexNodeM(ntRef, p.options, p.captureSlotFromName(capname)), nil
1266			}
1267			return nil, p.getErr(ErrUndefinedNameRef, capname)
1268		}
1269	}
1270
1271	// Not backreference: must be char code
1272
1273	p.textto(backpos)
1274	ch, err := p.scanCharEscape()
1275	if err != nil {
1276		return nil, err
1277	}
1278
1279	if p.useOptionI() {
1280		ch = unicode.ToLower(ch)
1281	}
1282
1283	return newRegexNodeCh(ntOne, p.options, ch), nil
1284}
1285
1286// Scans X for \p{X} or \P{X}
1287func (p *parser) parseProperty() (string, error) {
1288	if p.charsRight() < 3 {
1289		return "", p.getErr(ErrIncompleteSlashP)
1290	}
1291	ch := p.moveRightGetChar()
1292	if ch != '{' {
1293		return "", p.getErr(ErrMalformedSlashP)
1294	}
1295
1296	startpos := p.textpos()
1297	for p.charsRight() > 0 {
1298		ch = p.moveRightGetChar()
1299		if !(IsWordChar(ch) || ch == '-') {
1300			p.moveLeft()
1301			break
1302		}
1303	}
1304	capname := string(p.pattern[startpos:p.textpos()])
1305
1306	if p.charsRight() == 0 || p.moveRightGetChar() != '}' {
1307		return "", p.getErr(ErrIncompleteSlashP)
1308	}
1309
1310	if !isValidUnicodeCat(capname) {
1311		return "", p.getErr(ErrUnknownSlashP, capname)
1312	}
1313
1314	return capname, nil
1315}
1316
1317// Returns ReNode type for zero-length assertions with a \ code.
1318func (p *parser) typeFromCode(ch rune) nodeType {
1319	switch ch {
1320	case 'b':
1321		if p.useOptionE() {
1322			return ntECMABoundary
1323		}
1324		return ntBoundary
1325	case 'B':
1326		if p.useOptionE() {
1327			return ntNonECMABoundary
1328		}
1329		return ntNonboundary
1330	case 'A':
1331		return ntBeginning
1332	case 'G':
1333		return ntStart
1334	case 'Z':
1335		return ntEndZ
1336	case 'z':
1337		return ntEnd
1338	default:
1339		return ntNothing
1340	}
1341}
1342
1343// Scans whitespace or x-mode comments.
1344func (p *parser) scanBlank() error {
1345	if p.useOptionX() {
1346		for {
1347			for p.charsRight() > 0 && isSpace(p.rightChar(0)) {
1348				p.moveRight(1)
1349			}
1350
1351			if p.charsRight() == 0 {
1352				break
1353			}
1354
1355			if p.rightChar(0) == '#' {
1356				for p.charsRight() > 0 && p.rightChar(0) != '\n' {
1357					p.moveRight(1)
1358				}
1359			} else if p.charsRight() >= 3 && p.rightChar(2) == '#' &&
1360				p.rightChar(1) == '?' && p.rightChar(0) == '(' {
1361				for p.charsRight() > 0 && p.rightChar(0) != ')' {
1362					p.moveRight(1)
1363				}
1364				if p.charsRight() == 0 {
1365					return p.getErr(ErrUnterminatedComment)
1366				}
1367				p.moveRight(1)
1368			} else {
1369				break
1370			}
1371		}
1372	} else {
1373		for {
1374			if p.charsRight() < 3 || p.rightChar(2) != '#' ||
1375				p.rightChar(1) != '?' || p.rightChar(0) != '(' {
1376				return nil
1377			}
1378
1379			for p.charsRight() > 0 && p.rightChar(0) != ')' {
1380				p.moveRight(1)
1381			}
1382			if p.charsRight() == 0 {
1383				return p.getErr(ErrUnterminatedComment)
1384			}
1385			p.moveRight(1)
1386		}
1387	}
1388	return nil
1389}
1390
1391func (p *parser) scanCapname() string {
1392	startpos := p.textpos()
1393
1394	for p.charsRight() > 0 {
1395		if !IsWordChar(p.moveRightGetChar()) {
1396			p.moveLeft()
1397			break
1398		}
1399	}
1400
1401	return string(p.pattern[startpos:p.textpos()])
1402}
1403
1404//Scans contents of [] (not including []'s), and converts to a set.
1405func (p *parser) scanCharSet(caseInsensitive, scanOnly bool) (*CharSet, error) {
1406	ch := '\x00'
1407	chPrev := '\x00'
1408	inRange := false
1409	firstChar := true
1410	closed := false
1411
1412	var cc *CharSet
1413	if !scanOnly {
1414		cc = &CharSet{}
1415	}
1416
1417	if p.charsRight() > 0 && p.rightChar(0) == '^' {
1418		p.moveRight(1)
1419		if !scanOnly {
1420			cc.negate = true
1421		}
1422	}
1423
1424	for ; p.charsRight() > 0; firstChar = false {
1425		fTranslatedChar := false
1426		ch = p.moveRightGetChar()
1427		if ch == ']' {
1428			if !firstChar {
1429				closed = true
1430				break
1431			} else if p.useOptionE() {
1432				if !scanOnly {
1433					cc.addRanges(NoneClass().ranges)
1434				}
1435				closed = true
1436				break
1437			}
1438
1439		} else if ch == '\\' && p.charsRight() > 0 {
1440			switch ch = p.moveRightGetChar(); ch {
1441			case 'D', 'd':
1442				if !scanOnly {
1443					if inRange {
1444						return nil, p.getErr(ErrBadClassInCharRange, ch)
1445					}
1446					cc.addDigit(p.useOptionE(), ch == 'D', p.patternRaw)
1447				}
1448				continue
1449
1450			case 'S', 's':
1451				if !scanOnly {
1452					if inRange {
1453						return nil, p.getErr(ErrBadClassInCharRange, ch)
1454					}
1455					cc.addSpace(p.useOptionE(), ch == 'S')
1456				}
1457				continue
1458
1459			case 'W', 'w':
1460				if !scanOnly {
1461					if inRange {
1462						return nil, p.getErr(ErrBadClassInCharRange, ch)
1463					}
1464
1465					cc.addWord(p.useOptionE(), ch == 'W')
1466				}
1467				continue
1468
1469			case 'p', 'P':
1470				if !scanOnly {
1471					if inRange {
1472						return nil, p.getErr(ErrBadClassInCharRange, ch)
1473					}
1474					prop, err := p.parseProperty()
1475					if err != nil {
1476						return nil, err
1477					}
1478					cc.addCategory(prop, (ch != 'p'), caseInsensitive, p.patternRaw)
1479				} else {
1480					p.parseProperty()
1481				}
1482
1483				continue
1484
1485			case '-':
1486				if !scanOnly {
1487					cc.addRange(ch, ch)
1488				}
1489				continue
1490
1491			default:
1492				p.moveLeft()
1493				var err error
1494				ch, err = p.scanCharEscape() // non-literal character
1495				if err != nil {
1496					return nil, err
1497				}
1498				fTranslatedChar = true
1499				break // this break will only break out of the switch
1500			}
1501		} else if ch == '[' {
1502			// This is code for Posix style properties - [:Ll:] or [:IsTibetan:].
1503			// It currently doesn't do anything other than skip the whole thing!
1504			if p.charsRight() > 0 && p.rightChar(0) == ':' && !inRange {
1505				savePos := p.textpos()
1506
1507				p.moveRight(1)
1508				negate := false
1509				if p.charsRight() > 1 && p.rightChar(0) == '^' {
1510					negate = true
1511					p.moveRight(1)
1512				}
1513
1514				nm := p.scanCapname() // snag the name
1515				if !scanOnly && p.useRE2() {
1516					// look up the name since these are valid for RE2
1517					// add the group based on the name
1518					if ok := cc.addNamedASCII(nm, negate); !ok {
1519						return nil, p.getErr(ErrInvalidCharRange)
1520					}
1521				}
1522				if p.charsRight() < 2 || p.moveRightGetChar() != ':' || p.moveRightGetChar() != ']' {
1523					p.textto(savePos)
1524				} else if p.useRE2() {
1525					// move on
1526					continue
1527				}
1528			}
1529		}
1530
1531		if inRange {
1532			inRange = false
1533			if !scanOnly {
1534				if ch == '[' && !fTranslatedChar && !firstChar {
1535					// We thought we were in a range, but we're actually starting a subtraction.
1536					// In that case, we'll add chPrev to our char class, skip the opening [, and
1537					// scan the new character class recursively.
1538					cc.addChar(chPrev)
1539					sub, err := p.scanCharSet(caseInsensitive, false)
1540					if err != nil {
1541						return nil, err
1542					}
1543					cc.addSubtraction(sub)
1544
1545					if p.charsRight() > 0 && p.rightChar(0) != ']' {
1546						return nil, p.getErr(ErrSubtractionMustBeLast)
1547					}
1548				} else {
1549					// a regular range, like a-z
1550					if chPrev > ch {
1551						return nil, p.getErr(ErrReversedCharRange)
1552					}
1553					cc.addRange(chPrev, ch)
1554				}
1555			}
1556		} else if p.charsRight() >= 2 && p.rightChar(0) == '-' && p.rightChar(1) != ']' {
1557			// this could be the start of a range
1558			chPrev = ch
1559			inRange = true
1560			p.moveRight(1)
1561		} else if p.charsRight() >= 1 && ch == '-' && !fTranslatedChar && p.rightChar(0) == '[' && !firstChar {
1562			// we aren't in a range, and now there is a subtraction.  Usually this happens
1563			// only when a subtraction follows a range, like [a-z-[b]]
1564			if !scanOnly {
1565				p.moveRight(1)
1566				sub, err := p.scanCharSet(caseInsensitive, false)
1567				if err != nil {
1568					return nil, err
1569				}
1570				cc.addSubtraction(sub)
1571
1572				if p.charsRight() > 0 && p.rightChar(0) != ']' {
1573					return nil, p.getErr(ErrSubtractionMustBeLast)
1574				}
1575			} else {
1576				p.moveRight(1)
1577				p.scanCharSet(caseInsensitive, true)
1578			}
1579		} else {
1580			if !scanOnly {
1581				cc.addRange(ch, ch)
1582			}
1583		}
1584	}
1585
1586	if !closed {
1587		return nil, p.getErr(ErrUnterminatedBracket)
1588	}
1589
1590	if !scanOnly && caseInsensitive {
1591		cc.addLowercase()
1592	}
1593
1594	return cc, nil
1595}
1596
1597// Scans any number of decimal digits (pegs value at 2^31-1 if too large)
1598func (p *parser) scanDecimal() (int, error) {
1599	i := 0
1600	var d int
1601
1602	for p.charsRight() > 0 {
1603		d = int(p.rightChar(0) - '0')
1604		if d < 0 || d > 9 {
1605			break
1606		}
1607		p.moveRight(1)
1608
1609		if i > maxValueDiv10 || (i == maxValueDiv10 && d > maxValueMod10) {
1610			return 0, p.getErr(ErrCaptureGroupOutOfRange)
1611		}
1612
1613		i *= 10
1614		i += d
1615	}
1616
1617	return int(i), nil
1618}
1619
1620// Returns true for options allowed only at the top level
1621func isOnlyTopOption(option RegexOptions) bool {
1622	return option == RightToLeft || option == ECMAScript || option == RE2
1623}
1624
1625// Scans cimsx-cimsx option string, stops at the first unrecognized char.
1626func (p *parser) scanOptions() {
1627
1628	for off := false; p.charsRight() > 0; p.moveRight(1) {
1629		ch := p.rightChar(0)
1630
1631		if ch == '-' {
1632			off = true
1633		} else if ch == '+' {
1634			off = false
1635		} else {
1636			option := optionFromCode(ch)
1637			if option == 0 || isOnlyTopOption(option) {
1638				return
1639			}
1640
1641			if off {
1642				p.options &= ^option
1643			} else {
1644				p.options |= option
1645			}
1646		}
1647	}
1648}
1649
1650// Scans \ code for escape codes that map to single unicode chars.
1651func (p *parser) scanCharEscape() (rune, error) {
1652
1653	ch := p.moveRightGetChar()
1654
1655	if ch >= '0' && ch <= '7' {
1656		p.moveLeft()
1657		return p.scanOctal(), nil
1658	}
1659
1660	switch ch {
1661	case 'x':
1662		// support for \x{HEX} syntax from Perl and PCRE
1663		if p.charsRight() > 0 && p.rightChar(0) == '{' {
1664			p.moveRight(1)
1665			return p.scanHexUntilBrace()
1666		}
1667		return p.scanHex(2)
1668	case 'u':
1669		return p.scanHex(4)
1670	case 'a':
1671		return '\u0007', nil
1672	case 'b':
1673		return '\b', nil
1674	case 'e':
1675		return '\u001B', nil
1676	case 'f':
1677		return '\f', nil
1678	case 'n':
1679		return '\n', nil
1680	case 'r':
1681		return '\r', nil
1682	case 't':
1683		return '\t', nil
1684	case 'v':
1685		return '\u000B', nil
1686	case 'c':
1687		return p.scanControl()
1688	default:
1689		if !p.useOptionE() && IsWordChar(ch) {
1690			return 0, p.getErr(ErrUnrecognizedEscape, string(ch))
1691		}
1692		return ch, nil
1693	}
1694}
1695
1696// Grabs and converts an ascii control character
1697func (p *parser) scanControl() (rune, error) {
1698	if p.charsRight() <= 0 {
1699		return 0, p.getErr(ErrMissingControl)
1700	}
1701
1702	ch := p.moveRightGetChar()
1703
1704	// \ca interpreted as \cA
1705
1706	if ch >= 'a' && ch <= 'z' {
1707		ch = (ch - ('a' - 'A'))
1708	}
1709	ch = (ch - '@')
1710	if ch >= 0 && ch < ' ' {
1711		return ch, nil
1712	}
1713
1714	return 0, p.getErr(ErrUnrecognizedControl)
1715
1716}
1717
1718// Scan hex digits until we hit a closing brace.
1719// Non-hex digits, hex value too large for UTF-8, or running out of chars are errors
1720func (p *parser) scanHexUntilBrace() (rune, error) {
1721	// PCRE spec reads like unlimited hex digits are allowed, but unicode has a limit
1722	// so we can enforce that
1723	i := 0
1724	hasContent := false
1725
1726	for p.charsRight() > 0 {
1727		ch := p.moveRightGetChar()
1728		if ch == '}' {
1729			// hit our close brace, we're done here
1730			// prevent \x{}
1731			if !hasContent {
1732				return 0, p.getErr(ErrTooFewHex)
1733			}
1734			return rune(i), nil
1735		}
1736		hasContent = true
1737		// no brace needs to be hex digit
1738		d := hexDigit(ch)
1739		if d < 0 {
1740			return 0, p.getErr(ErrMissingBrace)
1741		}
1742
1743		i *= 0x10
1744		i += d
1745
1746		if i > unicode.MaxRune {
1747			return 0, p.getErr(ErrInvalidHex)
1748		}
1749	}
1750
1751	// we only make it here if we run out of digits without finding the brace
1752	return 0, p.getErr(ErrMissingBrace)
1753}
1754
1755// Scans exactly c hex digits (c=2 for \xFF, c=4 for \uFFFF)
1756func (p *parser) scanHex(c int) (rune, error) {
1757
1758	i := 0
1759
1760	if p.charsRight() >= c {
1761		for c > 0 {
1762			d := hexDigit(p.moveRightGetChar())
1763			if d < 0 {
1764				break
1765			}
1766			i *= 0x10
1767			i += d
1768			c--
1769		}
1770	}
1771
1772	if c > 0 {
1773		return 0, p.getErr(ErrTooFewHex)
1774	}
1775
1776	return rune(i), nil
1777}
1778
1779// Returns n <= 0xF for a hex digit.
1780func hexDigit(ch rune) int {
1781
1782	if d := uint(ch - '0'); d <= 9 {
1783		return int(d)
1784	}
1785
1786	if d := uint(ch - 'a'); d <= 5 {
1787		return int(d + 0xa)
1788	}
1789
1790	if d := uint(ch - 'A'); d <= 5 {
1791		return int(d + 0xa)
1792	}
1793
1794	return -1
1795}
1796
1797// Scans up to three octal digits (stops before exceeding 0377).
1798func (p *parser) scanOctal() rune {
1799	// Consume octal chars only up to 3 digits and value 0377
1800
1801	c := 3
1802
1803	if c > p.charsRight() {
1804		c = p.charsRight()
1805	}
1806
1807	//we know the first char is good because the caller had to check
1808	i := 0
1809	d := int(p.rightChar(0) - '0')
1810	for c > 0 && d <= 7 {
1811		i *= 8
1812		i += d
1813		if p.useOptionE() && i >= 0x20 {
1814			break
1815		}
1816		c--
1817
1818		p.moveRight(1)
1819		if !p.rightMost() {
1820			d = int(p.rightChar(0) - '0')
1821		}
1822	}
1823
1824	// Octal codes only go up to 255.  Any larger and the behavior that Perl follows
1825	// is simply to truncate the high bits.
1826	i &= 0xFF
1827
1828	return rune(i)
1829}
1830
1831// Returns the current parsing position.
1832func (p *parser) textpos() int {
1833	return p.currentPos
1834}
1835
1836// Zaps to a specific parsing position.
1837func (p *parser) textto(pos int) {
1838	p.currentPos = pos
1839}
1840
1841// Returns the char at the right of the current parsing position and advances to the right.
1842func (p *parser) moveRightGetChar() rune {
1843	ch := p.pattern[p.currentPos]
1844	p.currentPos++
1845	return ch
1846}
1847
1848// Moves the current position to the right.
1849func (p *parser) moveRight(i int) {
1850	// default would be 1
1851	p.currentPos += i
1852}
1853
1854// Moves the current parsing position one to the left.
1855func (p *parser) moveLeft() {
1856	p.currentPos--
1857}
1858
1859// Returns the char left of the current parsing position.
1860func (p *parser) charAt(i int) rune {
1861	return p.pattern[i]
1862}
1863
1864// Returns the char i chars right of the current parsing position.
1865func (p *parser) rightChar(i int) rune {
1866	// default would be 0
1867	return p.pattern[p.currentPos+i]
1868}
1869
1870// Number of characters to the right of the current parsing position.
1871func (p *parser) charsRight() int {
1872	return len(p.pattern) - p.currentPos
1873}
1874
1875func (p *parser) rightMost() bool {
1876	return p.currentPos == len(p.pattern)
1877}
1878
1879// Looks up the slot number for a given name
1880func (p *parser) captureSlotFromName(capname string) int {
1881	return p.capnames[capname]
1882}
1883
1884// True if the capture slot was noted
1885func (p *parser) isCaptureSlot(i int) bool {
1886	if p.caps != nil {
1887		_, ok := p.caps[i]
1888		return ok
1889	}
1890
1891	return (i >= 0 && i < p.capsize)
1892}
1893
1894// Looks up the slot number for a given name
1895func (p *parser) isCaptureName(capname string) bool {
1896	if p.capnames == nil {
1897		return false
1898	}
1899
1900	_, ok := p.capnames[capname]
1901	return ok
1902}
1903
1904// option shortcuts
1905
1906// True if N option disabling '(' autocapture is on.
1907func (p *parser) useOptionN() bool {
1908	return (p.options & ExplicitCapture) != 0
1909}
1910
1911// True if I option enabling case-insensitivity is on.
1912func (p *parser) useOptionI() bool {
1913	return (p.options & IgnoreCase) != 0
1914}
1915
1916// True if M option altering meaning of $ and ^ is on.
1917func (p *parser) useOptionM() bool {
1918	return (p.options & Multiline) != 0
1919}
1920
1921// True if S option altering meaning of . is on.
1922func (p *parser) useOptionS() bool {
1923	return (p.options & Singleline) != 0
1924}
1925
1926// True if X option enabling whitespace/comment mode is on.
1927func (p *parser) useOptionX() bool {
1928	return (p.options & IgnorePatternWhitespace) != 0
1929}
1930
1931// True if E option enabling ECMAScript behavior on.
1932func (p *parser) useOptionE() bool {
1933	return (p.options & ECMAScript) != 0
1934}
1935
1936// true to use RE2 compatibility parsing behavior.
1937func (p *parser) useRE2() bool {
1938	return (p.options & RE2) != 0
1939}
1940
1941// True if options stack is empty.
1942func (p *parser) emptyOptionsStack() bool {
1943	return len(p.optionsStack) == 0
1944}
1945
1946// Finish the current quantifiable (when a quantifier is not found or is not possible)
1947func (p *parser) addConcatenate() {
1948	// The first (| inside a Testgroup group goes directly to the group
1949	p.concatenation.addChild(p.unit)
1950	p.unit = nil
1951}
1952
1953// Finish the current quantifiable (when a quantifier is found)
1954func (p *parser) addConcatenate3(lazy bool, min, max int) {
1955	p.concatenation.addChild(p.unit.makeQuantifier(lazy, min, max))
1956	p.unit = nil
1957}
1958
1959// Sets the current unit to a single char node
1960func (p *parser) addUnitOne(ch rune) {
1961	if p.useOptionI() {
1962		ch = unicode.ToLower(ch)
1963	}
1964
1965	p.unit = newRegexNodeCh(ntOne, p.options, ch)
1966}
1967
1968// Sets the current unit to a single inverse-char node
1969func (p *parser) addUnitNotone(ch rune) {
1970	if p.useOptionI() {
1971		ch = unicode.ToLower(ch)
1972	}
1973
1974	p.unit = newRegexNodeCh(ntNotone, p.options, ch)
1975}
1976
1977// Sets the current unit to a single set node
1978func (p *parser) addUnitSet(set *CharSet) {
1979	p.unit = newRegexNodeSet(ntSet, p.options, set)
1980}
1981
1982// Sets the current unit to a subtree
1983func (p *parser) addUnitNode(node *regexNode) {
1984	p.unit = node
1985}
1986
1987// Sets the current unit to an assertion of the specified type
1988func (p *parser) addUnitType(t nodeType) {
1989	p.unit = newRegexNode(t, p.options)
1990}
1991
1992// Finish the current group (in response to a ')' or end)
1993func (p *parser) addGroup() error {
1994	if p.group.t == ntTestgroup || p.group.t == ntTestref {
1995		p.group.addChild(p.concatenation.reverseLeft())
1996		if (p.group.t == ntTestref && len(p.group.children) > 2) || len(p.group.children) > 3 {
1997			return p.getErr(ErrTooManyAlternates)
1998		}
1999	} else {
2000		p.alternation.addChild(p.concatenation.reverseLeft())
2001		p.group.addChild(p.alternation)
2002	}
2003
2004	p.unit = p.group
2005	return nil
2006}
2007
2008// Pops the option stack, but keeps the current options unchanged.
2009func (p *parser) popKeepOptions() {
2010	lastIdx := len(p.optionsStack) - 1
2011	p.optionsStack = p.optionsStack[:lastIdx]
2012}
2013
2014// Recalls options from the stack.
2015func (p *parser) popOptions() {
2016	lastIdx := len(p.optionsStack) - 1
2017	// get the last item on the stack and then remove it by reslicing
2018	p.options = p.optionsStack[lastIdx]
2019	p.optionsStack = p.optionsStack[:lastIdx]
2020}
2021
2022// Saves options on a stack.
2023func (p *parser) pushOptions() {
2024	p.optionsStack = append(p.optionsStack, p.options)
2025}
2026
2027// Add a string to the last concatenate.
2028func (p *parser) addToConcatenate(pos, cch int, isReplacement bool) {
2029	var node *regexNode
2030
2031	if cch == 0 {
2032		return
2033	}
2034
2035	if cch > 1 {
2036		str := p.pattern[pos : pos+cch]
2037
2038		if p.useOptionI() && !isReplacement {
2039			// We do the ToLower character by character for consistency.  With surrogate chars, doing
2040			// a ToLower on the entire string could actually change the surrogate pair.  This is more correct
2041			// linguistically, but since Regex doesn't support surrogates, it's more important to be
2042			// consistent.
2043			for i := 0; i < len(str); i++ {
2044				str[i] = unicode.ToLower(str[i])
2045			}
2046		}
2047
2048		node = newRegexNodeStr(ntMulti, p.options, str)
2049	} else {
2050		ch := p.charAt(pos)
2051
2052		if p.useOptionI() && !isReplacement {
2053			ch = unicode.ToLower(ch)
2054		}
2055
2056		node = newRegexNodeCh(ntOne, p.options, ch)
2057	}
2058
2059	p.concatenation.addChild(node)
2060}
2061
2062// Push the parser state (in response to an open paren)
2063func (p *parser) pushGroup() {
2064	p.group.next = p.stack
2065	p.alternation.next = p.group
2066	p.concatenation.next = p.alternation
2067	p.stack = p.concatenation
2068}
2069
2070// Remember the pushed state (in response to a ')')
2071func (p *parser) popGroup() error {
2072	p.concatenation = p.stack
2073	p.alternation = p.concatenation.next
2074	p.group = p.alternation.next
2075	p.stack = p.group.next
2076
2077	// The first () inside a Testgroup group goes directly to the group
2078	if p.group.t == ntTestgroup && len(p.group.children) == 0 {
2079		if p.unit == nil {
2080			return p.getErr(ErrConditionalExpression)
2081		}
2082
2083		p.group.addChild(p.unit)
2084		p.unit = nil
2085	}
2086	return nil
2087}
2088
2089// True if the group stack is empty.
2090func (p *parser) emptyStack() bool {
2091	return p.stack == nil
2092}
2093
2094// Start a new round for the parser state (in response to an open paren or string start)
2095func (p *parser) startGroup(openGroup *regexNode) {
2096	p.group = openGroup
2097	p.alternation = newRegexNode(ntAlternate, p.options)
2098	p.concatenation = newRegexNode(ntConcatenate, p.options)
2099}
2100
2101// Finish the current concatenation (in response to a |)
2102func (p *parser) addAlternate() {
2103	// The | parts inside a Testgroup group go directly to the group
2104
2105	if p.group.t == ntTestgroup || p.group.t == ntTestref {
2106		p.group.addChild(p.concatenation.reverseLeft())
2107	} else {
2108		p.alternation.addChild(p.concatenation.reverseLeft())
2109	}
2110
2111	p.concatenation = newRegexNode(ntConcatenate, p.options)
2112}
2113
2114// For categorizing ascii characters.
2115
2116const (
2117	Q byte = 5 // quantifier
2118	S      = 4 // ordinary stopper
2119	Z      = 3 // ScanBlank stopper
2120	X      = 2 // whitespace
2121	E      = 1 // should be escaped
2122)
2123
2124var _category = []byte{
2125	//01  2  3  4  5  6  7  8  9  A  B  C  D  E  F  0  1  2  3  4  5  6  7  8  9  A  B  C  D  E  F
2126	0, 0, 0, 0, 0, 0, 0, 0, 0, X, X, X, X, X, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
2127	// !  "  #  $  %  &  '  (  )  *  +  ,  -  .  /  0  1  2  3  4  5  6  7  8  9  :  ;  <  =  >  ?
2128	X, 0, 0, Z, S, 0, 0, 0, S, S, Q, Q, 0, 0, S, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, Q,
2129	//@A  B  C  D  E  F  G  H  I  J  K  L  M  N  O  P  Q  R  S  T  U  V  W  X  Y  Z  [  \  ]  ^  _
2130	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, S, S, 0, S, 0,
2131	//'a  b  c  d  e  f  g  h  i  j  k  l  m  n  o  p  q  r  s  t  u  v  w  x  y  z  {  |  }  ~
2132	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, Q, S, 0, 0, 0,
2133}
2134
2135func isSpace(ch rune) bool {
2136	return (ch <= ' ' && _category[ch] == X)
2137}
2138
2139// Returns true for those characters that terminate a string of ordinary chars.
2140func isSpecial(ch rune) bool {
2141	return (ch <= '|' && _category[ch] >= S)
2142}
2143
2144// Returns true for those characters that terminate a string of ordinary chars.
2145func isStopperX(ch rune) bool {
2146	return (ch <= '|' && _category[ch] >= X)
2147}
2148
2149// Returns true for those characters that begin a quantifier.
2150func isQuantifier(ch rune) bool {
2151	return (ch <= '{' && _category[ch] >= Q)
2152}
2153
2154func (p *parser) isTrueQuantifier() bool {
2155	nChars := p.charsRight()
2156	if nChars == 0 {
2157		return false
2158	}
2159
2160	startpos := p.textpos()
2161	ch := p.charAt(startpos)
2162	if ch != '{' {
2163		return ch <= '{' && _category[ch] >= Q
2164	}
2165
2166	//UGLY: this is ugly -- the original code was ugly too
2167	pos := startpos
2168	for {
2169		nChars--
2170		if nChars <= 0 {
2171			break
2172		}
2173		pos++
2174		ch = p.charAt(pos)
2175		if ch < '0' || ch > '9' {
2176			break
2177		}
2178	}
2179
2180	if nChars == 0 || pos-startpos == 1 {
2181		return false
2182	}
2183	if ch == '}' {
2184		return true
2185	}
2186	if ch != ',' {
2187		return false
2188	}
2189	for {
2190		nChars--
2191		if nChars <= 0 {
2192			break
2193		}
2194		pos++
2195		ch = p.charAt(pos)
2196		if ch < '0' || ch > '9' {
2197			break
2198		}
2199	}
2200
2201	return nChars > 0 && ch == '}'
2202}
2203