1// Copyright 2009 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Package regexp implements regular expression search.
6//
7// The syntax of the regular expressions accepted is the same
8// general syntax used by Perl, Python, and other languages.
9// More precisely, it is the syntax accepted by RE2 and described at
10// https://golang.org/s/re2syntax, except for \C.
11// For an overview of the syntax, run
12//   go doc regexp/syntax
13//
14// The regexp implementation provided by this package is
15// guaranteed to run in time linear in the size of the input.
16// (This is a property not guaranteed by most open source
17// implementations of regular expressions.) For more information
18// about this property, see
19//	http://swtch.com/~rsc/regexp/regexp1.html
20// or any book about automata theory.
21//
22// All characters are UTF-8-encoded code points.
23//
24// There are 16 methods of Regexp that match a regular expression and identify
25// the matched text. Their names are matched by this regular expression:
26//
27//	Find(All)?(String)?(Submatch)?(Index)?
28//
29// If 'All' is present, the routine matches successive non-overlapping
30// matches of the entire expression. Empty matches abutting a preceding
31// match are ignored. The return value is a slice containing the successive
32// return values of the corresponding non-'All' routine. These routines take
33// an extra integer argument, n; if n >= 0, the function returns at most n
34// matches/submatches.
35//
36// If 'String' is present, the argument is a string; otherwise it is a slice
37// of bytes; return values are adjusted as appropriate.
38//
39// If 'Submatch' is present, the return value is a slice identifying the
40// successive submatches of the expression. Submatches are matches of
41// parenthesized subexpressions (also known as capturing groups) within the
42// regular expression, numbered from left to right in order of opening
43// parenthesis. Submatch 0 is the match of the entire expression, submatch 1
44// the match of the first parenthesized subexpression, and so on.
45//
46// If 'Index' is present, matches and submatches are identified by byte index
47// pairs within the input string: result[2*n:2*n+1] identifies the indexes of
48// the nth submatch. The pair for n==0 identifies the match of the entire
49// expression. If 'Index' is not present, the match is identified by the
50// text of the match/submatch. If an index is negative, it means that
51// subexpression did not match any string in the input.
52//
53// There is also a subset of the methods that can be applied to text read
54// from a RuneReader:
55//
56//	MatchReader, FindReaderIndex, FindReaderSubmatchIndex
57//
58// This set may grow. Note that regular expression matches may need to
59// examine text beyond the text returned by a match, so the methods that
60// match text from a RuneReader may read arbitrarily far into the input
61// before returning.
62//
63// (There are a few other methods that do not match this pattern.)
64//
65package regexp
66
67import (
68	"bytes"
69	"io"
70	"regexp/syntax"
71	"strconv"
72	"strings"
73	"sync"
74	"unicode"
75	"unicode/utf8"
76)
77
78// Regexp is the representation of a compiled regular expression.
79// A Regexp is safe for concurrent use by multiple goroutines,
80// except for configuration methods, such as Longest.
81type Regexp struct {
82	// read-only after Compile
83	regexpRO
84
85	// cache of machines for running regexp
86	mu      sync.Mutex
87	machine []*machine
88}
89
90type regexpRO struct {
91	expr           string         // as passed to Compile
92	prog           *syntax.Prog   // compiled program
93	onepass        *onePassProg   // onepass program or nil
94	prefix         string         // required prefix in unanchored matches
95	prefixBytes    []byte         // prefix, as a []byte
96	prefixComplete bool           // prefix is the entire regexp
97	prefixRune     rune           // first rune in prefix
98	prefixEnd      uint32         // pc for last rune in prefix
99	cond           syntax.EmptyOp // empty-width conditions required at start of match
100	numSubexp      int
101	subexpNames    []string
102	longest        bool
103}
104
105// String returns the source text used to compile the regular expression.
106func (re *Regexp) String() string {
107	return re.expr
108}
109
110// Copy returns a new Regexp object copied from re.
111//
112// When using a Regexp in multiple goroutines, giving each goroutine
113// its own copy helps to avoid lock contention.
114func (re *Regexp) Copy() *Regexp {
115	// It is not safe to copy Regexp by value
116	// since it contains a sync.Mutex.
117	return &Regexp{
118		regexpRO: re.regexpRO,
119	}
120}
121
122// Compile parses a regular expression and returns, if successful,
123// a Regexp object that can be used to match against text.
124//
125// When matching against text, the regexp returns a match that
126// begins as early as possible in the input (leftmost), and among those
127// it chooses the one that a backtracking search would have found first.
128// This so-called leftmost-first matching is the same semantics
129// that Perl, Python, and other implementations use, although this
130// package implements it without the expense of backtracking.
131// For POSIX leftmost-longest matching, see CompilePOSIX.
132func Compile(expr string) (*Regexp, error) {
133	return compile(expr, syntax.Perl, false)
134}
135
136// CompilePOSIX is like Compile but restricts the regular expression
137// to POSIX ERE (egrep) syntax and changes the match semantics to
138// leftmost-longest.
139//
140// That is, when matching against text, the regexp returns a match that
141// begins as early as possible in the input (leftmost), and among those
142// it chooses a match that is as long as possible.
143// This so-called leftmost-longest matching is the same semantics
144// that early regular expression implementations used and that POSIX
145// specifies.
146//
147// However, there can be multiple leftmost-longest matches, with different
148// submatch choices, and here this package diverges from POSIX.
149// Among the possible leftmost-longest matches, this package chooses
150// the one that a backtracking search would have found first, while POSIX
151// specifies that the match be chosen to maximize the length of the first
152// subexpression, then the second, and so on from left to right.
153// The POSIX rule is computationally prohibitive and not even well-defined.
154// See http://swtch.com/~rsc/regexp/regexp2.html#posix for details.
155func CompilePOSIX(expr string) (*Regexp, error) {
156	return compile(expr, syntax.POSIX, true)
157}
158
159// Longest makes future searches prefer the leftmost-longest match.
160// That is, when matching against text, the regexp returns a match that
161// begins as early as possible in the input (leftmost), and among those
162// it chooses a match that is as long as possible.
163// This method modifies the Regexp and may not be called concurrently
164// with any other methods.
165func (re *Regexp) Longest() {
166	re.longest = true
167}
168
169func compile(expr string, mode syntax.Flags, longest bool) (*Regexp, error) {
170	re, err := syntax.Parse(expr, mode)
171	if err != nil {
172		return nil, err
173	}
174	maxCap := re.MaxCap()
175	capNames := re.CapNames()
176
177	re = re.Simplify()
178	prog, err := syntax.Compile(re)
179	if err != nil {
180		return nil, err
181	}
182	regexp := &Regexp{
183		regexpRO: regexpRO{
184			expr:        expr,
185			prog:        prog,
186			onepass:     compileOnePass(prog),
187			numSubexp:   maxCap,
188			subexpNames: capNames,
189			cond:        prog.StartCond(),
190			longest:     longest,
191		},
192	}
193	if regexp.onepass == notOnePass {
194		regexp.prefix, regexp.prefixComplete = prog.Prefix()
195	} else {
196		regexp.prefix, regexp.prefixComplete, regexp.prefixEnd = onePassPrefix(prog)
197	}
198	if regexp.prefix != "" {
199		// TODO(rsc): Remove this allocation by adding
200		// IndexString to package bytes.
201		regexp.prefixBytes = []byte(regexp.prefix)
202		regexp.prefixRune, _ = utf8.DecodeRuneInString(regexp.prefix)
203	}
204	return regexp, nil
205}
206
207// get returns a machine to use for matching re.
208// It uses the re's machine cache if possible, to avoid
209// unnecessary allocation.
210func (re *Regexp) get() *machine {
211	re.mu.Lock()
212	if n := len(re.machine); n > 0 {
213		z := re.machine[n-1]
214		re.machine = re.machine[:n-1]
215		re.mu.Unlock()
216		return z
217	}
218	re.mu.Unlock()
219	z := progMachine(re.prog, re.onepass)
220	z.re = re
221	return z
222}
223
224// put returns a machine to the re's machine cache.
225// There is no attempt to limit the size of the cache, so it will
226// grow to the maximum number of simultaneous matches
227// run using re.  (The cache empties when re gets garbage collected.)
228func (re *Regexp) put(z *machine) {
229	re.mu.Lock()
230	re.machine = append(re.machine, z)
231	re.mu.Unlock()
232}
233
234// MustCompile is like Compile but panics if the expression cannot be parsed.
235// It simplifies safe initialization of global variables holding compiled regular
236// expressions.
237func MustCompile(str string) *Regexp {
238	regexp, error := Compile(str)
239	if error != nil {
240		panic(`regexp: Compile(` + quote(str) + `): ` + error.Error())
241	}
242	return regexp
243}
244
245// MustCompilePOSIX is like CompilePOSIX but panics if the expression cannot be parsed.
246// It simplifies safe initialization of global variables holding compiled regular
247// expressions.
248func MustCompilePOSIX(str string) *Regexp {
249	regexp, error := CompilePOSIX(str)
250	if error != nil {
251		panic(`regexp: CompilePOSIX(` + quote(str) + `): ` + error.Error())
252	}
253	return regexp
254}
255
256func quote(s string) string {
257	if strconv.CanBackquote(s) {
258		return "`" + s + "`"
259	}
260	return strconv.Quote(s)
261}
262
263// NumSubexp returns the number of parenthesized subexpressions in this Regexp.
264func (re *Regexp) NumSubexp() int {
265	return re.numSubexp
266}
267
268// SubexpNames returns the names of the parenthesized subexpressions
269// in this Regexp. The name for the first sub-expression is names[1],
270// so that if m is a match slice, the name for m[i] is SubexpNames()[i].
271// Since the Regexp as a whole cannot be named, names[0] is always
272// the empty string. The slice should not be modified.
273func (re *Regexp) SubexpNames() []string {
274	return re.subexpNames
275}
276
277const endOfText rune = -1
278
279// input abstracts different representations of the input text. It provides
280// one-character lookahead.
281type input interface {
282	step(pos int) (r rune, width int) // advance one rune
283	canCheckPrefix() bool             // can we look ahead without losing info?
284	hasPrefix(re *Regexp) bool
285	index(re *Regexp, pos int) int
286	context(pos int) syntax.EmptyOp
287}
288
289// inputString scans a string.
290type inputString struct {
291	str string
292}
293
294func (i *inputString) step(pos int) (rune, int) {
295	if pos < len(i.str) {
296		c := i.str[pos]
297		if c < utf8.RuneSelf {
298			return rune(c), 1
299		}
300		return utf8.DecodeRuneInString(i.str[pos:])
301	}
302	return endOfText, 0
303}
304
305func (i *inputString) canCheckPrefix() bool {
306	return true
307}
308
309func (i *inputString) hasPrefix(re *Regexp) bool {
310	return strings.HasPrefix(i.str, re.prefix)
311}
312
313func (i *inputString) index(re *Regexp, pos int) int {
314	return strings.Index(i.str[pos:], re.prefix)
315}
316
317func (i *inputString) context(pos int) syntax.EmptyOp {
318	r1, r2 := endOfText, endOfText
319	// 0 < pos && pos <= len(i.str)
320	if uint(pos-1) < uint(len(i.str)) {
321		r1 = rune(i.str[pos-1])
322		if r1 >= utf8.RuneSelf {
323			r1, _ = utf8.DecodeLastRuneInString(i.str[:pos])
324		}
325	}
326	// 0 <= pos && pos < len(i.str)
327	if uint(pos) < uint(len(i.str)) {
328		r2 = rune(i.str[pos])
329		if r2 >= utf8.RuneSelf {
330			r2, _ = utf8.DecodeRuneInString(i.str[pos:])
331		}
332	}
333	return syntax.EmptyOpContext(r1, r2)
334}
335
336// inputBytes scans a byte slice.
337type inputBytes struct {
338	str []byte
339}
340
341func (i *inputBytes) step(pos int) (rune, int) {
342	if pos < len(i.str) {
343		c := i.str[pos]
344		if c < utf8.RuneSelf {
345			return rune(c), 1
346		}
347		return utf8.DecodeRune(i.str[pos:])
348	}
349	return endOfText, 0
350}
351
352func (i *inputBytes) canCheckPrefix() bool {
353	return true
354}
355
356func (i *inputBytes) hasPrefix(re *Regexp) bool {
357	return bytes.HasPrefix(i.str, re.prefixBytes)
358}
359
360func (i *inputBytes) index(re *Regexp, pos int) int {
361	return bytes.Index(i.str[pos:], re.prefixBytes)
362}
363
364func (i *inputBytes) context(pos int) syntax.EmptyOp {
365	r1, r2 := endOfText, endOfText
366	// 0 < pos && pos <= len(i.str)
367	if uint(pos-1) < uint(len(i.str)) {
368		r1 = rune(i.str[pos-1])
369		if r1 >= utf8.RuneSelf {
370			r1, _ = utf8.DecodeLastRune(i.str[:pos])
371		}
372	}
373	// 0 <= pos && pos < len(i.str)
374	if uint(pos) < uint(len(i.str)) {
375		r2 = rune(i.str[pos])
376		if r2 >= utf8.RuneSelf {
377			r2, _ = utf8.DecodeRune(i.str[pos:])
378		}
379	}
380	return syntax.EmptyOpContext(r1, r2)
381}
382
383// inputReader scans a RuneReader.
384type inputReader struct {
385	r     io.RuneReader
386	atEOT bool
387	pos   int
388}
389
390func (i *inputReader) step(pos int) (rune, int) {
391	if !i.atEOT && pos != i.pos {
392		return endOfText, 0
393
394	}
395	r, w, err := i.r.ReadRune()
396	if err != nil {
397		i.atEOT = true
398		return endOfText, 0
399	}
400	i.pos += w
401	return r, w
402}
403
404func (i *inputReader) canCheckPrefix() bool {
405	return false
406}
407
408func (i *inputReader) hasPrefix(re *Regexp) bool {
409	return false
410}
411
412func (i *inputReader) index(re *Regexp, pos int) int {
413	return -1
414}
415
416func (i *inputReader) context(pos int) syntax.EmptyOp {
417	return 0
418}
419
420// LiteralPrefix returns a literal string that must begin any match
421// of the regular expression re. It returns the boolean true if the
422// literal string comprises the entire regular expression.
423func (re *Regexp) LiteralPrefix() (prefix string, complete bool) {
424	return re.prefix, re.prefixComplete
425}
426
427// MatchReader reports whether the Regexp matches the text read by the
428// RuneReader.
429func (re *Regexp) MatchReader(r io.RuneReader) bool {
430	return re.doMatch(r, nil, "")
431}
432
433// MatchString reports whether the Regexp matches the string s.
434func (re *Regexp) MatchString(s string) bool {
435	return re.doMatch(nil, nil, s)
436}
437
438// Match reports whether the Regexp matches the byte slice b.
439func (re *Regexp) Match(b []byte) bool {
440	return re.doMatch(nil, b, "")
441}
442
443// MatchReader checks whether a textual regular expression matches the text
444// read by the RuneReader. More complicated queries need to use Compile and
445// the full Regexp interface.
446func MatchReader(pattern string, r io.RuneReader) (matched bool, err error) {
447	re, err := Compile(pattern)
448	if err != nil {
449		return false, err
450	}
451	return re.MatchReader(r), nil
452}
453
454// MatchString checks whether a textual regular expression
455// matches a string. More complicated queries need
456// to use Compile and the full Regexp interface.
457func MatchString(pattern string, s string) (matched bool, err error) {
458	re, err := Compile(pattern)
459	if err != nil {
460		return false, err
461	}
462	return re.MatchString(s), nil
463}
464
465// Match checks whether a textual regular expression
466// matches a byte slice. More complicated queries need
467// to use Compile and the full Regexp interface.
468func Match(pattern string, b []byte) (matched bool, err error) {
469	re, err := Compile(pattern)
470	if err != nil {
471		return false, err
472	}
473	return re.Match(b), nil
474}
475
476// ReplaceAllString returns a copy of src, replacing matches of the Regexp
477// with the replacement string repl. Inside repl, $ signs are interpreted as
478// in Expand, so for instance $1 represents the text of the first submatch.
479func (re *Regexp) ReplaceAllString(src, repl string) string {
480	n := 2
481	if strings.Contains(repl, "$") {
482		n = 2 * (re.numSubexp + 1)
483	}
484	b := re.replaceAll(nil, src, n, func(dst []byte, match []int) []byte {
485		return re.expand(dst, repl, nil, src, match)
486	})
487	return string(b)
488}
489
490// ReplaceAllLiteralString returns a copy of src, replacing matches of the Regexp
491// with the replacement string repl. The replacement repl is substituted directly,
492// without using Expand.
493func (re *Regexp) ReplaceAllLiteralString(src, repl string) string {
494	return string(re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte {
495		return append(dst, repl...)
496	}))
497}
498
499// ReplaceAllStringFunc returns a copy of src in which all matches of the
500// Regexp have been replaced by the return value of function repl applied
501// to the matched substring. The replacement returned by repl is substituted
502// directly, without using Expand.
503func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string {
504	b := re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte {
505		return append(dst, repl(src[match[0]:match[1]])...)
506	})
507	return string(b)
508}
509
510func (re *Regexp) replaceAll(bsrc []byte, src string, nmatch int, repl func(dst []byte, m []int) []byte) []byte {
511	lastMatchEnd := 0 // end position of the most recent match
512	searchPos := 0    // position where we next look for a match
513	var buf []byte
514	var endPos int
515	if bsrc != nil {
516		endPos = len(bsrc)
517	} else {
518		endPos = len(src)
519	}
520	if nmatch > re.prog.NumCap {
521		nmatch = re.prog.NumCap
522	}
523
524	var dstCap [2]int
525	for searchPos <= endPos {
526		a := re.doExecute(nil, bsrc, src, searchPos, nmatch, dstCap[:0])
527		if len(a) == 0 {
528			break // no more matches
529		}
530
531		// Copy the unmatched characters before this match.
532		if bsrc != nil {
533			buf = append(buf, bsrc[lastMatchEnd:a[0]]...)
534		} else {
535			buf = append(buf, src[lastMatchEnd:a[0]]...)
536		}
537
538		// Now insert a copy of the replacement string, but not for a
539		// match of the empty string immediately after another match.
540		// (Otherwise, we get double replacement for patterns that
541		// match both empty and nonempty strings.)
542		if a[1] > lastMatchEnd || a[0] == 0 {
543			buf = repl(buf, a)
544		}
545		lastMatchEnd = a[1]
546
547		// Advance past this match; always advance at least one character.
548		var width int
549		if bsrc != nil {
550			_, width = utf8.DecodeRune(bsrc[searchPos:])
551		} else {
552			_, width = utf8.DecodeRuneInString(src[searchPos:])
553		}
554		if searchPos+width > a[1] {
555			searchPos += width
556		} else if searchPos+1 > a[1] {
557			// This clause is only needed at the end of the input
558			// string. In that case, DecodeRuneInString returns width=0.
559			searchPos++
560		} else {
561			searchPos = a[1]
562		}
563	}
564
565	// Copy the unmatched characters after the last match.
566	if bsrc != nil {
567		buf = append(buf, bsrc[lastMatchEnd:]...)
568	} else {
569		buf = append(buf, src[lastMatchEnd:]...)
570	}
571
572	return buf
573}
574
575// ReplaceAll returns a copy of src, replacing matches of the Regexp
576// with the replacement text repl. Inside repl, $ signs are interpreted as
577// in Expand, so for instance $1 represents the text of the first submatch.
578func (re *Regexp) ReplaceAll(src, repl []byte) []byte {
579	n := 2
580	if bytes.IndexByte(repl, '$') >= 0 {
581		n = 2 * (re.numSubexp + 1)
582	}
583	srepl := ""
584	b := re.replaceAll(src, "", n, func(dst []byte, match []int) []byte {
585		if len(srepl) != len(repl) {
586			srepl = string(repl)
587		}
588		return re.expand(dst, srepl, src, "", match)
589	})
590	return b
591}
592
593// ReplaceAllLiteral returns a copy of src, replacing matches of the Regexp
594// with the replacement bytes repl. The replacement repl is substituted directly,
595// without using Expand.
596func (re *Regexp) ReplaceAllLiteral(src, repl []byte) []byte {
597	return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte {
598		return append(dst, repl...)
599	})
600}
601
602// ReplaceAllFunc returns a copy of src in which all matches of the
603// Regexp have been replaced by the return value of function repl applied
604// to the matched byte slice. The replacement returned by repl is substituted
605// directly, without using Expand.
606func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte {
607	return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte {
608		return append(dst, repl(src[match[0]:match[1]])...)
609	})
610}
611
612// Bitmap used by func special to check whether a character needs to be escaped.
613var specialBytes [16]byte
614
615// special reports whether byte b needs to be escaped by QuoteMeta.
616func special(b byte) bool {
617	return b < utf8.RuneSelf && specialBytes[b%16]&(1<<(b/16)) != 0
618}
619
620func init() {
621	for _, b := range []byte(`\.+*?()|[]{}^$`) {
622		specialBytes[b%16] |= 1 << (b / 16)
623	}
624}
625
626// QuoteMeta returns a string that quotes all regular expression metacharacters
627// inside the argument text; the returned string is a regular expression matching
628// the literal text. For example, QuoteMeta(`[foo]`) returns `\[foo\]`.
629func QuoteMeta(s string) string {
630	// A byte loop is correct because all metacharacters are ASCII.
631	var i int
632	for i = 0; i < len(s); i++ {
633		if special(s[i]) {
634			break
635		}
636	}
637	// No meta characters found, so return original string.
638	if i >= len(s) {
639		return s
640	}
641
642	b := make([]byte, 2*len(s)-i)
643	copy(b, s[:i])
644	j := i
645	for ; i < len(s); i++ {
646		if special(s[i]) {
647			b[j] = '\\'
648			j++
649		}
650		b[j] = s[i]
651		j++
652	}
653	return string(b[:j])
654}
655
656// The number of capture values in the program may correspond
657// to fewer capturing expressions than are in the regexp.
658// For example, "(a){0}" turns into an empty program, so the
659// maximum capture in the program is 0 but we need to return
660// an expression for \1.  Pad appends -1s to the slice a as needed.
661func (re *Regexp) pad(a []int) []int {
662	if a == nil {
663		// No match.
664		return nil
665	}
666	n := (1 + re.numSubexp) * 2
667	for len(a) < n {
668		a = append(a, -1)
669	}
670	return a
671}
672
673// Find matches in slice b if b is non-nil, otherwise find matches in string s.
674func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) {
675	var end int
676	if b == nil {
677		end = len(s)
678	} else {
679		end = len(b)
680	}
681
682	for pos, i, prevMatchEnd := 0, 0, -1; i < n && pos <= end; {
683		matches := re.doExecute(nil, b, s, pos, re.prog.NumCap, nil)
684		if len(matches) == 0 {
685			break
686		}
687
688		accept := true
689		if matches[1] == pos {
690			// We've found an empty match.
691			if matches[0] == prevMatchEnd {
692				// We don't allow an empty match right
693				// after a previous match, so ignore it.
694				accept = false
695			}
696			var width int
697			// TODO: use step()
698			if b == nil {
699				_, width = utf8.DecodeRuneInString(s[pos:end])
700			} else {
701				_, width = utf8.DecodeRune(b[pos:end])
702			}
703			if width > 0 {
704				pos += width
705			} else {
706				pos = end + 1
707			}
708		} else {
709			pos = matches[1]
710		}
711		prevMatchEnd = matches[1]
712
713		if accept {
714			deliver(re.pad(matches))
715			i++
716		}
717	}
718}
719
720// Find returns a slice holding the text of the leftmost match in b of the regular expression.
721// A return value of nil indicates no match.
722func (re *Regexp) Find(b []byte) []byte {
723	var dstCap [2]int
724	a := re.doExecute(nil, b, "", 0, 2, dstCap[:0])
725	if a == nil {
726		return nil
727	}
728	return b[a[0]:a[1]]
729}
730
731// FindIndex returns a two-element slice of integers defining the location of
732// the leftmost match in b of the regular expression. The match itself is at
733// b[loc[0]:loc[1]].
734// A return value of nil indicates no match.
735func (re *Regexp) FindIndex(b []byte) (loc []int) {
736	a := re.doExecute(nil, b, "", 0, 2, nil)
737	if a == nil {
738		return nil
739	}
740	return a[0:2]
741}
742
743// FindString returns a string holding the text of the leftmost match in s of the regular
744// expression. If there is no match, the return value is an empty string,
745// but it will also be empty if the regular expression successfully matches
746// an empty string. Use FindStringIndex or FindStringSubmatch if it is
747// necessary to distinguish these cases.
748func (re *Regexp) FindString(s string) string {
749	var dstCap [2]int
750	a := re.doExecute(nil, nil, s, 0, 2, dstCap[:0])
751	if a == nil {
752		return ""
753	}
754	return s[a[0]:a[1]]
755}
756
757// FindStringIndex returns a two-element slice of integers defining the
758// location of the leftmost match in s of the regular expression. The match
759// itself is at s[loc[0]:loc[1]].
760// A return value of nil indicates no match.
761func (re *Regexp) FindStringIndex(s string) (loc []int) {
762	a := re.doExecute(nil, nil, s, 0, 2, nil)
763	if a == nil {
764		return nil
765	}
766	return a[0:2]
767}
768
769// FindReaderIndex returns a two-element slice of integers defining the
770// location of the leftmost match of the regular expression in text read from
771// the RuneReader. The match text was found in the input stream at
772// byte offset loc[0] through loc[1]-1.
773// A return value of nil indicates no match.
774func (re *Regexp) FindReaderIndex(r io.RuneReader) (loc []int) {
775	a := re.doExecute(r, nil, "", 0, 2, nil)
776	if a == nil {
777		return nil
778	}
779	return a[0:2]
780}
781
782// FindSubmatch returns a slice of slices holding the text of the leftmost
783// match of the regular expression in b and the matches, if any, of its
784// subexpressions, as defined by the 'Submatch' descriptions in the package
785// comment.
786// A return value of nil indicates no match.
787func (re *Regexp) FindSubmatch(b []byte) [][]byte {
788	var dstCap [4]int
789	a := re.doExecute(nil, b, "", 0, re.prog.NumCap, dstCap[:0])
790	if a == nil {
791		return nil
792	}
793	ret := make([][]byte, 1+re.numSubexp)
794	for i := range ret {
795		if 2*i < len(a) && a[2*i] >= 0 {
796			ret[i] = b[a[2*i]:a[2*i+1]]
797		}
798	}
799	return ret
800}
801
802// Expand appends template to dst and returns the result; during the
803// append, Expand replaces variables in the template with corresponding
804// matches drawn from src. The match slice should have been returned by
805// FindSubmatchIndex.
806//
807// In the template, a variable is denoted by a substring of the form
808// $name or ${name}, where name is a non-empty sequence of letters,
809// digits, and underscores. A purely numeric name like $1 refers to
810// the submatch with the corresponding index; other names refer to
811// capturing parentheses named with the (?P<name>...) syntax. A
812// reference to an out of range or unmatched index or a name that is not
813// present in the regular expression is replaced with an empty slice.
814//
815// In the $name form, name is taken to be as long as possible: $1x is
816// equivalent to ${1x}, not ${1}x, and, $10 is equivalent to ${10}, not ${1}0.
817//
818// To insert a literal $ in the output, use $$ in the template.
819func (re *Regexp) Expand(dst []byte, template []byte, src []byte, match []int) []byte {
820	return re.expand(dst, string(template), src, "", match)
821}
822
823// ExpandString is like Expand but the template and source are strings.
824// It appends to and returns a byte slice in order to give the calling
825// code control over allocation.
826func (re *Regexp) ExpandString(dst []byte, template string, src string, match []int) []byte {
827	return re.expand(dst, template, nil, src, match)
828}
829
830func (re *Regexp) expand(dst []byte, template string, bsrc []byte, src string, match []int) []byte {
831	for len(template) > 0 {
832		i := strings.Index(template, "$")
833		if i < 0 {
834			break
835		}
836		dst = append(dst, template[:i]...)
837		template = template[i:]
838		if len(template) > 1 && template[1] == '$' {
839			// Treat $$ as $.
840			dst = append(dst, '$')
841			template = template[2:]
842			continue
843		}
844		name, num, rest, ok := extract(template)
845		if !ok {
846			// Malformed; treat $ as raw text.
847			dst = append(dst, '$')
848			template = template[1:]
849			continue
850		}
851		template = rest
852		if num >= 0 {
853			if 2*num+1 < len(match) && match[2*num] >= 0 {
854				if bsrc != nil {
855					dst = append(dst, bsrc[match[2*num]:match[2*num+1]]...)
856				} else {
857					dst = append(dst, src[match[2*num]:match[2*num+1]]...)
858				}
859			}
860		} else {
861			for i, namei := range re.subexpNames {
862				if name == namei && 2*i+1 < len(match) && match[2*i] >= 0 {
863					if bsrc != nil {
864						dst = append(dst, bsrc[match[2*i]:match[2*i+1]]...)
865					} else {
866						dst = append(dst, src[match[2*i]:match[2*i+1]]...)
867					}
868					break
869				}
870			}
871		}
872	}
873	dst = append(dst, template...)
874	return dst
875}
876
877// extract returns the name from a leading "$name" or "${name}" in str.
878// If it is a number, extract returns num set to that number; otherwise num = -1.
879func extract(str string) (name string, num int, rest string, ok bool) {
880	if len(str) < 2 || str[0] != '$' {
881		return
882	}
883	brace := false
884	if str[1] == '{' {
885		brace = true
886		str = str[2:]
887	} else {
888		str = str[1:]
889	}
890	i := 0
891	for i < len(str) {
892		rune, size := utf8.DecodeRuneInString(str[i:])
893		if !unicode.IsLetter(rune) && !unicode.IsDigit(rune) && rune != '_' {
894			break
895		}
896		i += size
897	}
898	if i == 0 {
899		// empty name is not okay
900		return
901	}
902	name = str[:i]
903	if brace {
904		if i >= len(str) || str[i] != '}' {
905			// missing closing brace
906			return
907		}
908		i++
909	}
910
911	// Parse number.
912	num = 0
913	for i := 0; i < len(name); i++ {
914		if name[i] < '0' || '9' < name[i] || num >= 1e8 {
915			num = -1
916			break
917		}
918		num = num*10 + int(name[i]) - '0'
919	}
920	// Disallow leading zeros.
921	if name[0] == '0' && len(name) > 1 {
922		num = -1
923	}
924
925	rest = str[i:]
926	ok = true
927	return
928}
929
930// FindSubmatchIndex returns a slice holding the index pairs identifying the
931// leftmost match of the regular expression in b and the matches, if any, of
932// its subexpressions, as defined by the 'Submatch' and 'Index' descriptions
933// in the package comment.
934// A return value of nil indicates no match.
935func (re *Regexp) FindSubmatchIndex(b []byte) []int {
936	return re.pad(re.doExecute(nil, b, "", 0, re.prog.NumCap, nil))
937}
938
939// FindStringSubmatch returns a slice of strings holding the text of the
940// leftmost match of the regular expression in s and the matches, if any, of
941// its subexpressions, as defined by the 'Submatch' description in the
942// package comment.
943// A return value of nil indicates no match.
944func (re *Regexp) FindStringSubmatch(s string) []string {
945	var dstCap [4]int
946	a := re.doExecute(nil, nil, s, 0, re.prog.NumCap, dstCap[:0])
947	if a == nil {
948		return nil
949	}
950	ret := make([]string, 1+re.numSubexp)
951	for i := range ret {
952		if 2*i < len(a) && a[2*i] >= 0 {
953			ret[i] = s[a[2*i]:a[2*i+1]]
954		}
955	}
956	return ret
957}
958
959// FindStringSubmatchIndex returns a slice holding the index pairs
960// identifying the leftmost match of the regular expression in s and the
961// matches, if any, of its subexpressions, as defined by the 'Submatch' and
962// 'Index' descriptions in the package comment.
963// A return value of nil indicates no match.
964func (re *Regexp) FindStringSubmatchIndex(s string) []int {
965	return re.pad(re.doExecute(nil, nil, s, 0, re.prog.NumCap, nil))
966}
967
968// FindReaderSubmatchIndex returns a slice holding the index pairs
969// identifying the leftmost match of the regular expression of text read by
970// the RuneReader, and the matches, if any, of its subexpressions, as defined
971// by the 'Submatch' and 'Index' descriptions in the package comment. A
972// return value of nil indicates no match.
973func (re *Regexp) FindReaderSubmatchIndex(r io.RuneReader) []int {
974	return re.pad(re.doExecute(r, nil, "", 0, re.prog.NumCap, nil))
975}
976
977const startSize = 10 // The size at which to start a slice in the 'All' routines.
978
979// FindAll is the 'All' version of Find; it returns a slice of all successive
980// matches of the expression, as defined by the 'All' description in the
981// package comment.
982// A return value of nil indicates no match.
983func (re *Regexp) FindAll(b []byte, n int) [][]byte {
984	if n < 0 {
985		n = len(b) + 1
986	}
987	result := make([][]byte, 0, startSize)
988	re.allMatches("", b, n, func(match []int) {
989		result = append(result, b[match[0]:match[1]])
990	})
991	if len(result) == 0 {
992		return nil
993	}
994	return result
995}
996
997// FindAllIndex is the 'All' version of FindIndex; it returns a slice of all
998// successive matches of the expression, as defined by the 'All' description
999// in the package comment.
1000// A return value of nil indicates no match.
1001func (re *Regexp) FindAllIndex(b []byte, n int) [][]int {
1002	if n < 0 {
1003		n = len(b) + 1
1004	}
1005	result := make([][]int, 0, startSize)
1006	re.allMatches("", b, n, func(match []int) {
1007		result = append(result, match[0:2])
1008	})
1009	if len(result) == 0 {
1010		return nil
1011	}
1012	return result
1013}
1014
1015// FindAllString is the 'All' version of FindString; it returns a slice of all
1016// successive matches of the expression, as defined by the 'All' description
1017// in the package comment.
1018// A return value of nil indicates no match.
1019func (re *Regexp) FindAllString(s string, n int) []string {
1020	if n < 0 {
1021		n = len(s) + 1
1022	}
1023	result := make([]string, 0, startSize)
1024	re.allMatches(s, nil, n, func(match []int) {
1025		result = append(result, s[match[0]:match[1]])
1026	})
1027	if len(result) == 0 {
1028		return nil
1029	}
1030	return result
1031}
1032
1033// FindAllStringIndex is the 'All' version of FindStringIndex; it returns a
1034// slice of all successive matches of the expression, as defined by the 'All'
1035// description in the package comment.
1036// A return value of nil indicates no match.
1037func (re *Regexp) FindAllStringIndex(s string, n int) [][]int {
1038	if n < 0 {
1039		n = len(s) + 1
1040	}
1041	result := make([][]int, 0, startSize)
1042	re.allMatches(s, nil, n, func(match []int) {
1043		result = append(result, match[0:2])
1044	})
1045	if len(result) == 0 {
1046		return nil
1047	}
1048	return result
1049}
1050
1051// FindAllSubmatch is the 'All' version of FindSubmatch; it returns a slice
1052// of all successive matches of the expression, as defined by the 'All'
1053// description in the package comment.
1054// A return value of nil indicates no match.
1055func (re *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte {
1056	if n < 0 {
1057		n = len(b) + 1
1058	}
1059	result := make([][][]byte, 0, startSize)
1060	re.allMatches("", b, n, func(match []int) {
1061		slice := make([][]byte, len(match)/2)
1062		for j := range slice {
1063			if match[2*j] >= 0 {
1064				slice[j] = b[match[2*j]:match[2*j+1]]
1065			}
1066		}
1067		result = append(result, slice)
1068	})
1069	if len(result) == 0 {
1070		return nil
1071	}
1072	return result
1073}
1074
1075// FindAllSubmatchIndex is the 'All' version of FindSubmatchIndex; it returns
1076// a slice of all successive matches of the expression, as defined by the
1077// 'All' description in the package comment.
1078// A return value of nil indicates no match.
1079func (re *Regexp) FindAllSubmatchIndex(b []byte, n int) [][]int {
1080	if n < 0 {
1081		n = len(b) + 1
1082	}
1083	result := make([][]int, 0, startSize)
1084	re.allMatches("", b, n, func(match []int) {
1085		result = append(result, match)
1086	})
1087	if len(result) == 0 {
1088		return nil
1089	}
1090	return result
1091}
1092
1093// FindAllStringSubmatch is the 'All' version of FindStringSubmatch; it
1094// returns a slice of all successive matches of the expression, as defined by
1095// the 'All' description in the package comment.
1096// A return value of nil indicates no match.
1097func (re *Regexp) FindAllStringSubmatch(s string, n int) [][]string {
1098	if n < 0 {
1099		n = len(s) + 1
1100	}
1101	result := make([][]string, 0, startSize)
1102	re.allMatches(s, nil, n, func(match []int) {
1103		slice := make([]string, len(match)/2)
1104		for j := range slice {
1105			if match[2*j] >= 0 {
1106				slice[j] = s[match[2*j]:match[2*j+1]]
1107			}
1108		}
1109		result = append(result, slice)
1110	})
1111	if len(result) == 0 {
1112		return nil
1113	}
1114	return result
1115}
1116
1117// FindAllStringSubmatchIndex is the 'All' version of
1118// FindStringSubmatchIndex; it returns a slice of all successive matches of
1119// the expression, as defined by the 'All' description in the package
1120// comment.
1121// A return value of nil indicates no match.
1122func (re *Regexp) FindAllStringSubmatchIndex(s string, n int) [][]int {
1123	if n < 0 {
1124		n = len(s) + 1
1125	}
1126	result := make([][]int, 0, startSize)
1127	re.allMatches(s, nil, n, func(match []int) {
1128		result = append(result, match)
1129	})
1130	if len(result) == 0 {
1131		return nil
1132	}
1133	return result
1134}
1135
1136// Split slices s into substrings separated by the expression and returns a slice of
1137// the substrings between those expression matches.
1138//
1139// The slice returned by this method consists of all the substrings of s
1140// not contained in the slice returned by FindAllString. When called on an expression
1141// that contains no metacharacters, it is equivalent to strings.SplitN.
1142//
1143// Example:
1144//   s := regexp.MustCompile("a*").Split("abaabaccadaaae", 5)
1145//   // s: ["", "b", "b", "c", "cadaaae"]
1146//
1147// The count determines the number of substrings to return:
1148//   n > 0: at most n substrings; the last substring will be the unsplit remainder.
1149//   n == 0: the result is nil (zero substrings)
1150//   n < 0: all substrings
1151func (re *Regexp) Split(s string, n int) []string {
1152
1153	if n == 0 {
1154		return nil
1155	}
1156
1157	if len(re.expr) > 0 && len(s) == 0 {
1158		return []string{""}
1159	}
1160
1161	matches := re.FindAllStringIndex(s, n)
1162	strings := make([]string, 0, len(matches))
1163
1164	beg := 0
1165	end := 0
1166	for _, match := range matches {
1167		if n > 0 && len(strings) >= n-1 {
1168			break
1169		}
1170
1171		end = match[0]
1172		if match[1] != 0 {
1173			strings = append(strings, s[beg:end])
1174		}
1175		beg = match[1]
1176	}
1177
1178	if end != len(s) {
1179		strings = append(strings, s[beg:])
1180	}
1181
1182	return strings
1183}
1184