1// Copyright 2009 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5// Package regexp implements regular expression search. 6// 7// The syntax of the regular expressions accepted is the same 8// general syntax used by Perl, Python, and other languages. 9// More precisely, it is the syntax accepted by RE2 and described at 10// https://golang.org/s/re2syntax, except for \C. 11// For an overview of the syntax, run 12// go doc regexp/syntax 13// 14// The regexp implementation provided by this package is 15// guaranteed to run in time linear in the size of the input. 16// (This is a property not guaranteed by most open source 17// implementations of regular expressions.) For more information 18// about this property, see 19// http://swtch.com/~rsc/regexp/regexp1.html 20// or any book about automata theory. 21// 22// All characters are UTF-8-encoded code points. 23// 24// There are 16 methods of Regexp that match a regular expression and identify 25// the matched text. Their names are matched by this regular expression: 26// 27// Find(All)?(String)?(Submatch)?(Index)? 28// 29// If 'All' is present, the routine matches successive non-overlapping 30// matches of the entire expression. Empty matches abutting a preceding 31// match are ignored. The return value is a slice containing the successive 32// return values of the corresponding non-'All' routine. These routines take 33// an extra integer argument, n; if n >= 0, the function returns at most n 34// matches/submatches. 35// 36// If 'String' is present, the argument is a string; otherwise it is a slice 37// of bytes; return values are adjusted as appropriate. 38// 39// If 'Submatch' is present, the return value is a slice identifying the 40// successive submatches of the expression. Submatches are matches of 41// parenthesized subexpressions (also known as capturing groups) within the 42// regular expression, numbered from left to right in order of opening 43// parenthesis. Submatch 0 is the match of the entire expression, submatch 1 44// the match of the first parenthesized subexpression, and so on. 45// 46// If 'Index' is present, matches and submatches are identified by byte index 47// pairs within the input string: result[2*n:2*n+1] identifies the indexes of 48// the nth submatch. The pair for n==0 identifies the match of the entire 49// expression. If 'Index' is not present, the match is identified by the 50// text of the match/submatch. If an index is negative, it means that 51// subexpression did not match any string in the input. 52// 53// There is also a subset of the methods that can be applied to text read 54// from a RuneReader: 55// 56// MatchReader, FindReaderIndex, FindReaderSubmatchIndex 57// 58// This set may grow. Note that regular expression matches may need to 59// examine text beyond the text returned by a match, so the methods that 60// match text from a RuneReader may read arbitrarily far into the input 61// before returning. 62// 63// (There are a few other methods that do not match this pattern.) 64// 65package regexp 66 67import ( 68 "bytes" 69 "io" 70 "regexp/syntax" 71 "strconv" 72 "strings" 73 "sync" 74 "unicode" 75 "unicode/utf8" 76) 77 78// Regexp is the representation of a compiled regular expression. 79// A Regexp is safe for concurrent use by multiple goroutines, 80// except for configuration methods, such as Longest. 81type Regexp struct { 82 // read-only after Compile 83 regexpRO 84 85 // cache of machines for running regexp 86 mu sync.Mutex 87 machine []*machine 88} 89 90type regexpRO struct { 91 expr string // as passed to Compile 92 prog *syntax.Prog // compiled program 93 onepass *onePassProg // onepass program or nil 94 prefix string // required prefix in unanchored matches 95 prefixBytes []byte // prefix, as a []byte 96 prefixComplete bool // prefix is the entire regexp 97 prefixRune rune // first rune in prefix 98 prefixEnd uint32 // pc for last rune in prefix 99 cond syntax.EmptyOp // empty-width conditions required at start of match 100 numSubexp int 101 subexpNames []string 102 longest bool 103} 104 105// String returns the source text used to compile the regular expression. 106func (re *Regexp) String() string { 107 return re.expr 108} 109 110// Copy returns a new Regexp object copied from re. 111// 112// When using a Regexp in multiple goroutines, giving each goroutine 113// its own copy helps to avoid lock contention. 114func (re *Regexp) Copy() *Regexp { 115 // It is not safe to copy Regexp by value 116 // since it contains a sync.Mutex. 117 return &Regexp{ 118 regexpRO: re.regexpRO, 119 } 120} 121 122// Compile parses a regular expression and returns, if successful, 123// a Regexp object that can be used to match against text. 124// 125// When matching against text, the regexp returns a match that 126// begins as early as possible in the input (leftmost), and among those 127// it chooses the one that a backtracking search would have found first. 128// This so-called leftmost-first matching is the same semantics 129// that Perl, Python, and other implementations use, although this 130// package implements it without the expense of backtracking. 131// For POSIX leftmost-longest matching, see CompilePOSIX. 132func Compile(expr string) (*Regexp, error) { 133 return compile(expr, syntax.Perl, false) 134} 135 136// CompilePOSIX is like Compile but restricts the regular expression 137// to POSIX ERE (egrep) syntax and changes the match semantics to 138// leftmost-longest. 139// 140// That is, when matching against text, the regexp returns a match that 141// begins as early as possible in the input (leftmost), and among those 142// it chooses a match that is as long as possible. 143// This so-called leftmost-longest matching is the same semantics 144// that early regular expression implementations used and that POSIX 145// specifies. 146// 147// However, there can be multiple leftmost-longest matches, with different 148// submatch choices, and here this package diverges from POSIX. 149// Among the possible leftmost-longest matches, this package chooses 150// the one that a backtracking search would have found first, while POSIX 151// specifies that the match be chosen to maximize the length of the first 152// subexpression, then the second, and so on from left to right. 153// The POSIX rule is computationally prohibitive and not even well-defined. 154// See http://swtch.com/~rsc/regexp/regexp2.html#posix for details. 155func CompilePOSIX(expr string) (*Regexp, error) { 156 return compile(expr, syntax.POSIX, true) 157} 158 159// Longest makes future searches prefer the leftmost-longest match. 160// That is, when matching against text, the regexp returns a match that 161// begins as early as possible in the input (leftmost), and among those 162// it chooses a match that is as long as possible. 163// This method modifies the Regexp and may not be called concurrently 164// with any other methods. 165func (re *Regexp) Longest() { 166 re.longest = true 167} 168 169func compile(expr string, mode syntax.Flags, longest bool) (*Regexp, error) { 170 re, err := syntax.Parse(expr, mode) 171 if err != nil { 172 return nil, err 173 } 174 maxCap := re.MaxCap() 175 capNames := re.CapNames() 176 177 re = re.Simplify() 178 prog, err := syntax.Compile(re) 179 if err != nil { 180 return nil, err 181 } 182 regexp := &Regexp{ 183 regexpRO: regexpRO{ 184 expr: expr, 185 prog: prog, 186 onepass: compileOnePass(prog), 187 numSubexp: maxCap, 188 subexpNames: capNames, 189 cond: prog.StartCond(), 190 longest: longest, 191 }, 192 } 193 if regexp.onepass == notOnePass { 194 regexp.prefix, regexp.prefixComplete = prog.Prefix() 195 } else { 196 regexp.prefix, regexp.prefixComplete, regexp.prefixEnd = onePassPrefix(prog) 197 } 198 if regexp.prefix != "" { 199 // TODO(rsc): Remove this allocation by adding 200 // IndexString to package bytes. 201 regexp.prefixBytes = []byte(regexp.prefix) 202 regexp.prefixRune, _ = utf8.DecodeRuneInString(regexp.prefix) 203 } 204 return regexp, nil 205} 206 207// get returns a machine to use for matching re. 208// It uses the re's machine cache if possible, to avoid 209// unnecessary allocation. 210func (re *Regexp) get() *machine { 211 re.mu.Lock() 212 if n := len(re.machine); n > 0 { 213 z := re.machine[n-1] 214 re.machine = re.machine[:n-1] 215 re.mu.Unlock() 216 return z 217 } 218 re.mu.Unlock() 219 z := progMachine(re.prog, re.onepass) 220 z.re = re 221 return z 222} 223 224// put returns a machine to the re's machine cache. 225// There is no attempt to limit the size of the cache, so it will 226// grow to the maximum number of simultaneous matches 227// run using re. (The cache empties when re gets garbage collected.) 228func (re *Regexp) put(z *machine) { 229 re.mu.Lock() 230 re.machine = append(re.machine, z) 231 re.mu.Unlock() 232} 233 234// MustCompile is like Compile but panics if the expression cannot be parsed. 235// It simplifies safe initialization of global variables holding compiled regular 236// expressions. 237func MustCompile(str string) *Regexp { 238 regexp, error := Compile(str) 239 if error != nil { 240 panic(`regexp: Compile(` + quote(str) + `): ` + error.Error()) 241 } 242 return regexp 243} 244 245// MustCompilePOSIX is like CompilePOSIX but panics if the expression cannot be parsed. 246// It simplifies safe initialization of global variables holding compiled regular 247// expressions. 248func MustCompilePOSIX(str string) *Regexp { 249 regexp, error := CompilePOSIX(str) 250 if error != nil { 251 panic(`regexp: CompilePOSIX(` + quote(str) + `): ` + error.Error()) 252 } 253 return regexp 254} 255 256func quote(s string) string { 257 if strconv.CanBackquote(s) { 258 return "`" + s + "`" 259 } 260 return strconv.Quote(s) 261} 262 263// NumSubexp returns the number of parenthesized subexpressions in this Regexp. 264func (re *Regexp) NumSubexp() int { 265 return re.numSubexp 266} 267 268// SubexpNames returns the names of the parenthesized subexpressions 269// in this Regexp. The name for the first sub-expression is names[1], 270// so that if m is a match slice, the name for m[i] is SubexpNames()[i]. 271// Since the Regexp as a whole cannot be named, names[0] is always 272// the empty string. The slice should not be modified. 273func (re *Regexp) SubexpNames() []string { 274 return re.subexpNames 275} 276 277const endOfText rune = -1 278 279// input abstracts different representations of the input text. It provides 280// one-character lookahead. 281type input interface { 282 step(pos int) (r rune, width int) // advance one rune 283 canCheckPrefix() bool // can we look ahead without losing info? 284 hasPrefix(re *Regexp) bool 285 index(re *Regexp, pos int) int 286 context(pos int) syntax.EmptyOp 287} 288 289// inputString scans a string. 290type inputString struct { 291 str string 292} 293 294func (i *inputString) step(pos int) (rune, int) { 295 if pos < len(i.str) { 296 c := i.str[pos] 297 if c < utf8.RuneSelf { 298 return rune(c), 1 299 } 300 return utf8.DecodeRuneInString(i.str[pos:]) 301 } 302 return endOfText, 0 303} 304 305func (i *inputString) canCheckPrefix() bool { 306 return true 307} 308 309func (i *inputString) hasPrefix(re *Regexp) bool { 310 return strings.HasPrefix(i.str, re.prefix) 311} 312 313func (i *inputString) index(re *Regexp, pos int) int { 314 return strings.Index(i.str[pos:], re.prefix) 315} 316 317func (i *inputString) context(pos int) syntax.EmptyOp { 318 r1, r2 := endOfText, endOfText 319 // 0 < pos && pos <= len(i.str) 320 if uint(pos-1) < uint(len(i.str)) { 321 r1 = rune(i.str[pos-1]) 322 if r1 >= utf8.RuneSelf { 323 r1, _ = utf8.DecodeLastRuneInString(i.str[:pos]) 324 } 325 } 326 // 0 <= pos && pos < len(i.str) 327 if uint(pos) < uint(len(i.str)) { 328 r2 = rune(i.str[pos]) 329 if r2 >= utf8.RuneSelf { 330 r2, _ = utf8.DecodeRuneInString(i.str[pos:]) 331 } 332 } 333 return syntax.EmptyOpContext(r1, r2) 334} 335 336// inputBytes scans a byte slice. 337type inputBytes struct { 338 str []byte 339} 340 341func (i *inputBytes) step(pos int) (rune, int) { 342 if pos < len(i.str) { 343 c := i.str[pos] 344 if c < utf8.RuneSelf { 345 return rune(c), 1 346 } 347 return utf8.DecodeRune(i.str[pos:]) 348 } 349 return endOfText, 0 350} 351 352func (i *inputBytes) canCheckPrefix() bool { 353 return true 354} 355 356func (i *inputBytes) hasPrefix(re *Regexp) bool { 357 return bytes.HasPrefix(i.str, re.prefixBytes) 358} 359 360func (i *inputBytes) index(re *Regexp, pos int) int { 361 return bytes.Index(i.str[pos:], re.prefixBytes) 362} 363 364func (i *inputBytes) context(pos int) syntax.EmptyOp { 365 r1, r2 := endOfText, endOfText 366 // 0 < pos && pos <= len(i.str) 367 if uint(pos-1) < uint(len(i.str)) { 368 r1 = rune(i.str[pos-1]) 369 if r1 >= utf8.RuneSelf { 370 r1, _ = utf8.DecodeLastRune(i.str[:pos]) 371 } 372 } 373 // 0 <= pos && pos < len(i.str) 374 if uint(pos) < uint(len(i.str)) { 375 r2 = rune(i.str[pos]) 376 if r2 >= utf8.RuneSelf { 377 r2, _ = utf8.DecodeRune(i.str[pos:]) 378 } 379 } 380 return syntax.EmptyOpContext(r1, r2) 381} 382 383// inputReader scans a RuneReader. 384type inputReader struct { 385 r io.RuneReader 386 atEOT bool 387 pos int 388} 389 390func (i *inputReader) step(pos int) (rune, int) { 391 if !i.atEOT && pos != i.pos { 392 return endOfText, 0 393 394 } 395 r, w, err := i.r.ReadRune() 396 if err != nil { 397 i.atEOT = true 398 return endOfText, 0 399 } 400 i.pos += w 401 return r, w 402} 403 404func (i *inputReader) canCheckPrefix() bool { 405 return false 406} 407 408func (i *inputReader) hasPrefix(re *Regexp) bool { 409 return false 410} 411 412func (i *inputReader) index(re *Regexp, pos int) int { 413 return -1 414} 415 416func (i *inputReader) context(pos int) syntax.EmptyOp { 417 return 0 418} 419 420// LiteralPrefix returns a literal string that must begin any match 421// of the regular expression re. It returns the boolean true if the 422// literal string comprises the entire regular expression. 423func (re *Regexp) LiteralPrefix() (prefix string, complete bool) { 424 return re.prefix, re.prefixComplete 425} 426 427// MatchReader reports whether the Regexp matches the text read by the 428// RuneReader. 429func (re *Regexp) MatchReader(r io.RuneReader) bool { 430 return re.doMatch(r, nil, "") 431} 432 433// MatchString reports whether the Regexp matches the string s. 434func (re *Regexp) MatchString(s string) bool { 435 return re.doMatch(nil, nil, s) 436} 437 438// Match reports whether the Regexp matches the byte slice b. 439func (re *Regexp) Match(b []byte) bool { 440 return re.doMatch(nil, b, "") 441} 442 443// MatchReader checks whether a textual regular expression matches the text 444// read by the RuneReader. More complicated queries need to use Compile and 445// the full Regexp interface. 446func MatchReader(pattern string, r io.RuneReader) (matched bool, err error) { 447 re, err := Compile(pattern) 448 if err != nil { 449 return false, err 450 } 451 return re.MatchReader(r), nil 452} 453 454// MatchString checks whether a textual regular expression 455// matches a string. More complicated queries need 456// to use Compile and the full Regexp interface. 457func MatchString(pattern string, s string) (matched bool, err error) { 458 re, err := Compile(pattern) 459 if err != nil { 460 return false, err 461 } 462 return re.MatchString(s), nil 463} 464 465// Match checks whether a textual regular expression 466// matches a byte slice. More complicated queries need 467// to use Compile and the full Regexp interface. 468func Match(pattern string, b []byte) (matched bool, err error) { 469 re, err := Compile(pattern) 470 if err != nil { 471 return false, err 472 } 473 return re.Match(b), nil 474} 475 476// ReplaceAllString returns a copy of src, replacing matches of the Regexp 477// with the replacement string repl. Inside repl, $ signs are interpreted as 478// in Expand, so for instance $1 represents the text of the first submatch. 479func (re *Regexp) ReplaceAllString(src, repl string) string { 480 n := 2 481 if strings.Contains(repl, "$") { 482 n = 2 * (re.numSubexp + 1) 483 } 484 b := re.replaceAll(nil, src, n, func(dst []byte, match []int) []byte { 485 return re.expand(dst, repl, nil, src, match) 486 }) 487 return string(b) 488} 489 490// ReplaceAllLiteralString returns a copy of src, replacing matches of the Regexp 491// with the replacement string repl. The replacement repl is substituted directly, 492// without using Expand. 493func (re *Regexp) ReplaceAllLiteralString(src, repl string) string { 494 return string(re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte { 495 return append(dst, repl...) 496 })) 497} 498 499// ReplaceAllStringFunc returns a copy of src in which all matches of the 500// Regexp have been replaced by the return value of function repl applied 501// to the matched substring. The replacement returned by repl is substituted 502// directly, without using Expand. 503func (re *Regexp) ReplaceAllStringFunc(src string, repl func(string) string) string { 504 b := re.replaceAll(nil, src, 2, func(dst []byte, match []int) []byte { 505 return append(dst, repl(src[match[0]:match[1]])...) 506 }) 507 return string(b) 508} 509 510func (re *Regexp) replaceAll(bsrc []byte, src string, nmatch int, repl func(dst []byte, m []int) []byte) []byte { 511 lastMatchEnd := 0 // end position of the most recent match 512 searchPos := 0 // position where we next look for a match 513 var buf []byte 514 var endPos int 515 if bsrc != nil { 516 endPos = len(bsrc) 517 } else { 518 endPos = len(src) 519 } 520 if nmatch > re.prog.NumCap { 521 nmatch = re.prog.NumCap 522 } 523 524 var dstCap [2]int 525 for searchPos <= endPos { 526 a := re.doExecute(nil, bsrc, src, searchPos, nmatch, dstCap[:0]) 527 if len(a) == 0 { 528 break // no more matches 529 } 530 531 // Copy the unmatched characters before this match. 532 if bsrc != nil { 533 buf = append(buf, bsrc[lastMatchEnd:a[0]]...) 534 } else { 535 buf = append(buf, src[lastMatchEnd:a[0]]...) 536 } 537 538 // Now insert a copy of the replacement string, but not for a 539 // match of the empty string immediately after another match. 540 // (Otherwise, we get double replacement for patterns that 541 // match both empty and nonempty strings.) 542 if a[1] > lastMatchEnd || a[0] == 0 { 543 buf = repl(buf, a) 544 } 545 lastMatchEnd = a[1] 546 547 // Advance past this match; always advance at least one character. 548 var width int 549 if bsrc != nil { 550 _, width = utf8.DecodeRune(bsrc[searchPos:]) 551 } else { 552 _, width = utf8.DecodeRuneInString(src[searchPos:]) 553 } 554 if searchPos+width > a[1] { 555 searchPos += width 556 } else if searchPos+1 > a[1] { 557 // This clause is only needed at the end of the input 558 // string. In that case, DecodeRuneInString returns width=0. 559 searchPos++ 560 } else { 561 searchPos = a[1] 562 } 563 } 564 565 // Copy the unmatched characters after the last match. 566 if bsrc != nil { 567 buf = append(buf, bsrc[lastMatchEnd:]...) 568 } else { 569 buf = append(buf, src[lastMatchEnd:]...) 570 } 571 572 return buf 573} 574 575// ReplaceAll returns a copy of src, replacing matches of the Regexp 576// with the replacement text repl. Inside repl, $ signs are interpreted as 577// in Expand, so for instance $1 represents the text of the first submatch. 578func (re *Regexp) ReplaceAll(src, repl []byte) []byte { 579 n := 2 580 if bytes.IndexByte(repl, '$') >= 0 { 581 n = 2 * (re.numSubexp + 1) 582 } 583 srepl := "" 584 b := re.replaceAll(src, "", n, func(dst []byte, match []int) []byte { 585 if len(srepl) != len(repl) { 586 srepl = string(repl) 587 } 588 return re.expand(dst, srepl, src, "", match) 589 }) 590 return b 591} 592 593// ReplaceAllLiteral returns a copy of src, replacing matches of the Regexp 594// with the replacement bytes repl. The replacement repl is substituted directly, 595// without using Expand. 596func (re *Regexp) ReplaceAllLiteral(src, repl []byte) []byte { 597 return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte { 598 return append(dst, repl...) 599 }) 600} 601 602// ReplaceAllFunc returns a copy of src in which all matches of the 603// Regexp have been replaced by the return value of function repl applied 604// to the matched byte slice. The replacement returned by repl is substituted 605// directly, without using Expand. 606func (re *Regexp) ReplaceAllFunc(src []byte, repl func([]byte) []byte) []byte { 607 return re.replaceAll(src, "", 2, func(dst []byte, match []int) []byte { 608 return append(dst, repl(src[match[0]:match[1]])...) 609 }) 610} 611 612// Bitmap used by func special to check whether a character needs to be escaped. 613var specialBytes [16]byte 614 615// special reports whether byte b needs to be escaped by QuoteMeta. 616func special(b byte) bool { 617 return b < utf8.RuneSelf && specialBytes[b%16]&(1<<(b/16)) != 0 618} 619 620func init() { 621 for _, b := range []byte(`\.+*?()|[]{}^$`) { 622 specialBytes[b%16] |= 1 << (b / 16) 623 } 624} 625 626// QuoteMeta returns a string that quotes all regular expression metacharacters 627// inside the argument text; the returned string is a regular expression matching 628// the literal text. For example, QuoteMeta(`[foo]`) returns `\[foo\]`. 629func QuoteMeta(s string) string { 630 // A byte loop is correct because all metacharacters are ASCII. 631 var i int 632 for i = 0; i < len(s); i++ { 633 if special(s[i]) { 634 break 635 } 636 } 637 // No meta characters found, so return original string. 638 if i >= len(s) { 639 return s 640 } 641 642 b := make([]byte, 2*len(s)-i) 643 copy(b, s[:i]) 644 j := i 645 for ; i < len(s); i++ { 646 if special(s[i]) { 647 b[j] = '\\' 648 j++ 649 } 650 b[j] = s[i] 651 j++ 652 } 653 return string(b[:j]) 654} 655 656// The number of capture values in the program may correspond 657// to fewer capturing expressions than are in the regexp. 658// For example, "(a){0}" turns into an empty program, so the 659// maximum capture in the program is 0 but we need to return 660// an expression for \1. Pad appends -1s to the slice a as needed. 661func (re *Regexp) pad(a []int) []int { 662 if a == nil { 663 // No match. 664 return nil 665 } 666 n := (1 + re.numSubexp) * 2 667 for len(a) < n { 668 a = append(a, -1) 669 } 670 return a 671} 672 673// Find matches in slice b if b is non-nil, otherwise find matches in string s. 674func (re *Regexp) allMatches(s string, b []byte, n int, deliver func([]int)) { 675 var end int 676 if b == nil { 677 end = len(s) 678 } else { 679 end = len(b) 680 } 681 682 for pos, i, prevMatchEnd := 0, 0, -1; i < n && pos <= end; { 683 matches := re.doExecute(nil, b, s, pos, re.prog.NumCap, nil) 684 if len(matches) == 0 { 685 break 686 } 687 688 accept := true 689 if matches[1] == pos { 690 // We've found an empty match. 691 if matches[0] == prevMatchEnd { 692 // We don't allow an empty match right 693 // after a previous match, so ignore it. 694 accept = false 695 } 696 var width int 697 // TODO: use step() 698 if b == nil { 699 _, width = utf8.DecodeRuneInString(s[pos:end]) 700 } else { 701 _, width = utf8.DecodeRune(b[pos:end]) 702 } 703 if width > 0 { 704 pos += width 705 } else { 706 pos = end + 1 707 } 708 } else { 709 pos = matches[1] 710 } 711 prevMatchEnd = matches[1] 712 713 if accept { 714 deliver(re.pad(matches)) 715 i++ 716 } 717 } 718} 719 720// Find returns a slice holding the text of the leftmost match in b of the regular expression. 721// A return value of nil indicates no match. 722func (re *Regexp) Find(b []byte) []byte { 723 var dstCap [2]int 724 a := re.doExecute(nil, b, "", 0, 2, dstCap[:0]) 725 if a == nil { 726 return nil 727 } 728 return b[a[0]:a[1]] 729} 730 731// FindIndex returns a two-element slice of integers defining the location of 732// the leftmost match in b of the regular expression. The match itself is at 733// b[loc[0]:loc[1]]. 734// A return value of nil indicates no match. 735func (re *Regexp) FindIndex(b []byte) (loc []int) { 736 a := re.doExecute(nil, b, "", 0, 2, nil) 737 if a == nil { 738 return nil 739 } 740 return a[0:2] 741} 742 743// FindString returns a string holding the text of the leftmost match in s of the regular 744// expression. If there is no match, the return value is an empty string, 745// but it will also be empty if the regular expression successfully matches 746// an empty string. Use FindStringIndex or FindStringSubmatch if it is 747// necessary to distinguish these cases. 748func (re *Regexp) FindString(s string) string { 749 var dstCap [2]int 750 a := re.doExecute(nil, nil, s, 0, 2, dstCap[:0]) 751 if a == nil { 752 return "" 753 } 754 return s[a[0]:a[1]] 755} 756 757// FindStringIndex returns a two-element slice of integers defining the 758// location of the leftmost match in s of the regular expression. The match 759// itself is at s[loc[0]:loc[1]]. 760// A return value of nil indicates no match. 761func (re *Regexp) FindStringIndex(s string) (loc []int) { 762 a := re.doExecute(nil, nil, s, 0, 2, nil) 763 if a == nil { 764 return nil 765 } 766 return a[0:2] 767} 768 769// FindReaderIndex returns a two-element slice of integers defining the 770// location of the leftmost match of the regular expression in text read from 771// the RuneReader. The match text was found in the input stream at 772// byte offset loc[0] through loc[1]-1. 773// A return value of nil indicates no match. 774func (re *Regexp) FindReaderIndex(r io.RuneReader) (loc []int) { 775 a := re.doExecute(r, nil, "", 0, 2, nil) 776 if a == nil { 777 return nil 778 } 779 return a[0:2] 780} 781 782// FindSubmatch returns a slice of slices holding the text of the leftmost 783// match of the regular expression in b and the matches, if any, of its 784// subexpressions, as defined by the 'Submatch' descriptions in the package 785// comment. 786// A return value of nil indicates no match. 787func (re *Regexp) FindSubmatch(b []byte) [][]byte { 788 var dstCap [4]int 789 a := re.doExecute(nil, b, "", 0, re.prog.NumCap, dstCap[:0]) 790 if a == nil { 791 return nil 792 } 793 ret := make([][]byte, 1+re.numSubexp) 794 for i := range ret { 795 if 2*i < len(a) && a[2*i] >= 0 { 796 ret[i] = b[a[2*i]:a[2*i+1]] 797 } 798 } 799 return ret 800} 801 802// Expand appends template to dst and returns the result; during the 803// append, Expand replaces variables in the template with corresponding 804// matches drawn from src. The match slice should have been returned by 805// FindSubmatchIndex. 806// 807// In the template, a variable is denoted by a substring of the form 808// $name or ${name}, where name is a non-empty sequence of letters, 809// digits, and underscores. A purely numeric name like $1 refers to 810// the submatch with the corresponding index; other names refer to 811// capturing parentheses named with the (?P<name>...) syntax. A 812// reference to an out of range or unmatched index or a name that is not 813// present in the regular expression is replaced with an empty slice. 814// 815// In the $name form, name is taken to be as long as possible: $1x is 816// equivalent to ${1x}, not ${1}x, and, $10 is equivalent to ${10}, not ${1}0. 817// 818// To insert a literal $ in the output, use $$ in the template. 819func (re *Regexp) Expand(dst []byte, template []byte, src []byte, match []int) []byte { 820 return re.expand(dst, string(template), src, "", match) 821} 822 823// ExpandString is like Expand but the template and source are strings. 824// It appends to and returns a byte slice in order to give the calling 825// code control over allocation. 826func (re *Regexp) ExpandString(dst []byte, template string, src string, match []int) []byte { 827 return re.expand(dst, template, nil, src, match) 828} 829 830func (re *Regexp) expand(dst []byte, template string, bsrc []byte, src string, match []int) []byte { 831 for len(template) > 0 { 832 i := strings.Index(template, "$") 833 if i < 0 { 834 break 835 } 836 dst = append(dst, template[:i]...) 837 template = template[i:] 838 if len(template) > 1 && template[1] == '$' { 839 // Treat $$ as $. 840 dst = append(dst, '$') 841 template = template[2:] 842 continue 843 } 844 name, num, rest, ok := extract(template) 845 if !ok { 846 // Malformed; treat $ as raw text. 847 dst = append(dst, '$') 848 template = template[1:] 849 continue 850 } 851 template = rest 852 if num >= 0 { 853 if 2*num+1 < len(match) && match[2*num] >= 0 { 854 if bsrc != nil { 855 dst = append(dst, bsrc[match[2*num]:match[2*num+1]]...) 856 } else { 857 dst = append(dst, src[match[2*num]:match[2*num+1]]...) 858 } 859 } 860 } else { 861 for i, namei := range re.subexpNames { 862 if name == namei && 2*i+1 < len(match) && match[2*i] >= 0 { 863 if bsrc != nil { 864 dst = append(dst, bsrc[match[2*i]:match[2*i+1]]...) 865 } else { 866 dst = append(dst, src[match[2*i]:match[2*i+1]]...) 867 } 868 break 869 } 870 } 871 } 872 } 873 dst = append(dst, template...) 874 return dst 875} 876 877// extract returns the name from a leading "$name" or "${name}" in str. 878// If it is a number, extract returns num set to that number; otherwise num = -1. 879func extract(str string) (name string, num int, rest string, ok bool) { 880 if len(str) < 2 || str[0] != '$' { 881 return 882 } 883 brace := false 884 if str[1] == '{' { 885 brace = true 886 str = str[2:] 887 } else { 888 str = str[1:] 889 } 890 i := 0 891 for i < len(str) { 892 rune, size := utf8.DecodeRuneInString(str[i:]) 893 if !unicode.IsLetter(rune) && !unicode.IsDigit(rune) && rune != '_' { 894 break 895 } 896 i += size 897 } 898 if i == 0 { 899 // empty name is not okay 900 return 901 } 902 name = str[:i] 903 if brace { 904 if i >= len(str) || str[i] != '}' { 905 // missing closing brace 906 return 907 } 908 i++ 909 } 910 911 // Parse number. 912 num = 0 913 for i := 0; i < len(name); i++ { 914 if name[i] < '0' || '9' < name[i] || num >= 1e8 { 915 num = -1 916 break 917 } 918 num = num*10 + int(name[i]) - '0' 919 } 920 // Disallow leading zeros. 921 if name[0] == '0' && len(name) > 1 { 922 num = -1 923 } 924 925 rest = str[i:] 926 ok = true 927 return 928} 929 930// FindSubmatchIndex returns a slice holding the index pairs identifying the 931// leftmost match of the regular expression in b and the matches, if any, of 932// its subexpressions, as defined by the 'Submatch' and 'Index' descriptions 933// in the package comment. 934// A return value of nil indicates no match. 935func (re *Regexp) FindSubmatchIndex(b []byte) []int { 936 return re.pad(re.doExecute(nil, b, "", 0, re.prog.NumCap, nil)) 937} 938 939// FindStringSubmatch returns a slice of strings holding the text of the 940// leftmost match of the regular expression in s and the matches, if any, of 941// its subexpressions, as defined by the 'Submatch' description in the 942// package comment. 943// A return value of nil indicates no match. 944func (re *Regexp) FindStringSubmatch(s string) []string { 945 var dstCap [4]int 946 a := re.doExecute(nil, nil, s, 0, re.prog.NumCap, dstCap[:0]) 947 if a == nil { 948 return nil 949 } 950 ret := make([]string, 1+re.numSubexp) 951 for i := range ret { 952 if 2*i < len(a) && a[2*i] >= 0 { 953 ret[i] = s[a[2*i]:a[2*i+1]] 954 } 955 } 956 return ret 957} 958 959// FindStringSubmatchIndex returns a slice holding the index pairs 960// identifying the leftmost match of the regular expression in s and the 961// matches, if any, of its subexpressions, as defined by the 'Submatch' and 962// 'Index' descriptions in the package comment. 963// A return value of nil indicates no match. 964func (re *Regexp) FindStringSubmatchIndex(s string) []int { 965 return re.pad(re.doExecute(nil, nil, s, 0, re.prog.NumCap, nil)) 966} 967 968// FindReaderSubmatchIndex returns a slice holding the index pairs 969// identifying the leftmost match of the regular expression of text read by 970// the RuneReader, and the matches, if any, of its subexpressions, as defined 971// by the 'Submatch' and 'Index' descriptions in the package comment. A 972// return value of nil indicates no match. 973func (re *Regexp) FindReaderSubmatchIndex(r io.RuneReader) []int { 974 return re.pad(re.doExecute(r, nil, "", 0, re.prog.NumCap, nil)) 975} 976 977const startSize = 10 // The size at which to start a slice in the 'All' routines. 978 979// FindAll is the 'All' version of Find; it returns a slice of all successive 980// matches of the expression, as defined by the 'All' description in the 981// package comment. 982// A return value of nil indicates no match. 983func (re *Regexp) FindAll(b []byte, n int) [][]byte { 984 if n < 0 { 985 n = len(b) + 1 986 } 987 result := make([][]byte, 0, startSize) 988 re.allMatches("", b, n, func(match []int) { 989 result = append(result, b[match[0]:match[1]]) 990 }) 991 if len(result) == 0 { 992 return nil 993 } 994 return result 995} 996 997// FindAllIndex is the 'All' version of FindIndex; it returns a slice of all 998// successive matches of the expression, as defined by the 'All' description 999// in the package comment. 1000// A return value of nil indicates no match. 1001func (re *Regexp) FindAllIndex(b []byte, n int) [][]int { 1002 if n < 0 { 1003 n = len(b) + 1 1004 } 1005 result := make([][]int, 0, startSize) 1006 re.allMatches("", b, n, func(match []int) { 1007 result = append(result, match[0:2]) 1008 }) 1009 if len(result) == 0 { 1010 return nil 1011 } 1012 return result 1013} 1014 1015// FindAllString is the 'All' version of FindString; it returns a slice of all 1016// successive matches of the expression, as defined by the 'All' description 1017// in the package comment. 1018// A return value of nil indicates no match. 1019func (re *Regexp) FindAllString(s string, n int) []string { 1020 if n < 0 { 1021 n = len(s) + 1 1022 } 1023 result := make([]string, 0, startSize) 1024 re.allMatches(s, nil, n, func(match []int) { 1025 result = append(result, s[match[0]:match[1]]) 1026 }) 1027 if len(result) == 0 { 1028 return nil 1029 } 1030 return result 1031} 1032 1033// FindAllStringIndex is the 'All' version of FindStringIndex; it returns a 1034// slice of all successive matches of the expression, as defined by the 'All' 1035// description in the package comment. 1036// A return value of nil indicates no match. 1037func (re *Regexp) FindAllStringIndex(s string, n int) [][]int { 1038 if n < 0 { 1039 n = len(s) + 1 1040 } 1041 result := make([][]int, 0, startSize) 1042 re.allMatches(s, nil, n, func(match []int) { 1043 result = append(result, match[0:2]) 1044 }) 1045 if len(result) == 0 { 1046 return nil 1047 } 1048 return result 1049} 1050 1051// FindAllSubmatch is the 'All' version of FindSubmatch; it returns a slice 1052// of all successive matches of the expression, as defined by the 'All' 1053// description in the package comment. 1054// A return value of nil indicates no match. 1055func (re *Regexp) FindAllSubmatch(b []byte, n int) [][][]byte { 1056 if n < 0 { 1057 n = len(b) + 1 1058 } 1059 result := make([][][]byte, 0, startSize) 1060 re.allMatches("", b, n, func(match []int) { 1061 slice := make([][]byte, len(match)/2) 1062 for j := range slice { 1063 if match[2*j] >= 0 { 1064 slice[j] = b[match[2*j]:match[2*j+1]] 1065 } 1066 } 1067 result = append(result, slice) 1068 }) 1069 if len(result) == 0 { 1070 return nil 1071 } 1072 return result 1073} 1074 1075// FindAllSubmatchIndex is the 'All' version of FindSubmatchIndex; it returns 1076// a slice of all successive matches of the expression, as defined by the 1077// 'All' description in the package comment. 1078// A return value of nil indicates no match. 1079func (re *Regexp) FindAllSubmatchIndex(b []byte, n int) [][]int { 1080 if n < 0 { 1081 n = len(b) + 1 1082 } 1083 result := make([][]int, 0, startSize) 1084 re.allMatches("", b, n, func(match []int) { 1085 result = append(result, match) 1086 }) 1087 if len(result) == 0 { 1088 return nil 1089 } 1090 return result 1091} 1092 1093// FindAllStringSubmatch is the 'All' version of FindStringSubmatch; it 1094// returns a slice of all successive matches of the expression, as defined by 1095// the 'All' description in the package comment. 1096// A return value of nil indicates no match. 1097func (re *Regexp) FindAllStringSubmatch(s string, n int) [][]string { 1098 if n < 0 { 1099 n = len(s) + 1 1100 } 1101 result := make([][]string, 0, startSize) 1102 re.allMatches(s, nil, n, func(match []int) { 1103 slice := make([]string, len(match)/2) 1104 for j := range slice { 1105 if match[2*j] >= 0 { 1106 slice[j] = s[match[2*j]:match[2*j+1]] 1107 } 1108 } 1109 result = append(result, slice) 1110 }) 1111 if len(result) == 0 { 1112 return nil 1113 } 1114 return result 1115} 1116 1117// FindAllStringSubmatchIndex is the 'All' version of 1118// FindStringSubmatchIndex; it returns a slice of all successive matches of 1119// the expression, as defined by the 'All' description in the package 1120// comment. 1121// A return value of nil indicates no match. 1122func (re *Regexp) FindAllStringSubmatchIndex(s string, n int) [][]int { 1123 if n < 0 { 1124 n = len(s) + 1 1125 } 1126 result := make([][]int, 0, startSize) 1127 re.allMatches(s, nil, n, func(match []int) { 1128 result = append(result, match) 1129 }) 1130 if len(result) == 0 { 1131 return nil 1132 } 1133 return result 1134} 1135 1136// Split slices s into substrings separated by the expression and returns a slice of 1137// the substrings between those expression matches. 1138// 1139// The slice returned by this method consists of all the substrings of s 1140// not contained in the slice returned by FindAllString. When called on an expression 1141// that contains no metacharacters, it is equivalent to strings.SplitN. 1142// 1143// Example: 1144// s := regexp.MustCompile("a*").Split("abaabaccadaaae", 5) 1145// // s: ["", "b", "b", "c", "cadaaae"] 1146// 1147// The count determines the number of substrings to return: 1148// n > 0: at most n substrings; the last substring will be the unsplit remainder. 1149// n == 0: the result is nil (zero substrings) 1150// n < 0: all substrings 1151func (re *Regexp) Split(s string, n int) []string { 1152 1153 if n == 0 { 1154 return nil 1155 } 1156 1157 if len(re.expr) > 0 && len(s) == 0 { 1158 return []string{""} 1159 } 1160 1161 matches := re.FindAllStringIndex(s, n) 1162 strings := make([]string, 0, len(matches)) 1163 1164 beg := 0 1165 end := 0 1166 for _, match := range matches { 1167 if n > 0 && len(strings) >= n-1 { 1168 break 1169 } 1170 1171 end = match[0] 1172 if match[1] != 0 { 1173 strings = append(strings, s[beg:end]) 1174 } 1175 beg = match[1] 1176 } 1177 1178 if end != len(s) { 1179 strings = append(strings, s[beg:]) 1180 } 1181 1182 return strings 1183} 1184