// Copyright 2010 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package regexp

import (
	"bufio"
	"compress/bzip2"
	"fmt"
	"io"
	"math/rand"
	"os"
	"path/filepath"
	"regexp/syntax"
	"strconv"
	"strings"
	"testing"
	"unicode/utf8"
)

// TestRE2 tests this package's regexp API against test cases
// considered during RE2's exhaustive tests, which run all possible
// regexps over a given set of atoms and operators, up to a given
// complexity, over all possible strings over a given alphabet,
// up to a given size. Rather than try to link with RE2, we read a
// log file containing the test cases and the expected matches.
// The log file, re2.txt, is generated by running 'make exhaustive-log'
// in the open source RE2 distribution. http://code.google.com/p/re2/
//
// The test file format is a sequence of stanzas like:
//
//	strings
//	"abc"
//	"123x"
//	regexps
//	"[a-z]+"
//	0-3;0-3
//	-;-
//	"([0-9])([0-9])([0-9])"
//	-;-
//	-;0-3 0-1 1-2 2-3
//
// The stanza begins by defining a set of strings, quoted
// using Go double-quote syntax, one per line. Then the
// regexps section gives a sequence of regexps to run on
// the strings. In the block that follows a regexp, each line
// gives the semicolon-separated match results of running
// the regexp on the corresponding string.
// Each match result is either a single -, meaning no match, or a
// space-separated sequence of pairs giving the match and
// submatch indices. An unmatched subexpression formats
// its pair as a single - (not illustrated above). For now
// each regexp run produces two match results, one for a
// ``full match'' that restricts the regexp to matching the entire
// string or nothing, and one for a ``partial match'' that gives
// the leftmost first match found in the string.
//
// Lines beginning with # are comments. Lines beginning with
// a capital letter are test names printed during RE2's test suite
// and are echoed into t but otherwise ignored.
//
// At time of writing, re2.txt is 32 MB but compresses to 760 kB,
// so we store re2-exhaustive.txt.bz2 in the repository and decompress it on the fly.
//
func TestRE2Search(t *testing.T) {
	testRE2(t, "testdata/re2-search.txt")
}

func TestRE2Exhaustive(t *testing.T) {
	if testing.Short() {
		t.Skip("skipping TestRE2Exhaustive during short test")
	}
	testRE2(t, "testdata/re2-exhaustive.txt.bz2")
}

func testRE2(t *testing.T, file string) {
	f, err := os.Open(file)
	if err != nil {
		t.Fatal(err)
	}
	defer f.Close()
	var txt io.Reader
	if strings.HasSuffix(file, ".bz2") {
		z := bzip2.NewReader(f)
		txt = z
		file = file[:len(file)-len(".bz2")] // for error messages
	} else {
		txt = f
	}
	lineno := 0
	r := bufio.NewReader(txt)
	var (
		str       []string
		input     []string
		inStrings bool
		re        *Regexp
		refull    *Regexp
		nfail     int
		ncase     int
	)
	for {
		line, err := r.ReadString('\n')
		if err != nil {
			if err == io.EOF {
				break
			}
			t.Fatalf("%s:%d: %v", file, lineno, err)
		}
		line = line[:len(line)-1] // chop \n
		lineno++
		switch {
		case line == "":
			t.Fatalf("%s:%d: unexpected blank line", file, lineno)
		case line[0] == '#':
			continue
		case 'A' <= line[0] && line[0] <= 'Z':
			// Test name.
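			// These are section headers printed by RE2's test run;
			// echo them for context in the test log, but they carry
			// no match data themselves.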
			t.Logf("%s\n", line)
			continue
		case line == "strings":
			str = str[:0]
			inStrings = true
		case line == "regexps":
			inStrings = false
		case line[0] == '"':
			q, err := strconv.Unquote(line)
			if err != nil {
				// Fatal because we'll get out of sync.
				t.Fatalf("%s:%d: unquote %s: %v", file, lineno, line, err)
			}
			if inStrings {
				str = append(str, q)
				continue
			}
			// Is a regexp.
			if len(input) != 0 {
				t.Fatalf("%s:%d: out of sync: have %d strings left before %#q", file, lineno, len(input), q)
			}
			re, err = tryCompile(q)
			if err != nil {
				if err.Error() == "error parsing regexp: invalid escape sequence: `\\C`" {
					// We don't and likely never will support \C; keep going.
					continue
				}
				t.Errorf("%s:%d: compile %#q: %v", file, lineno, q, err)
				if nfail++; nfail >= 100 {
					t.Fatalf("stopping after %d errors", nfail)
				}
				continue
			}
			full := `\A(?:` + q + `)\z`
			refull, err = tryCompile(full)
			if err != nil {
				// Fatal because q worked, so this should always work.
				t.Fatalf("%s:%d: compile full %#q: %v", file, lineno, full, err)
			}
			input = str
		case line[0] == '-' || '0' <= line[0] && line[0] <= '9':
			// A sequence of match results.
			ncase++
			if re == nil {
				// Failed to compile: skip results.
				continue
			}
			if len(input) == 0 {
				t.Fatalf("%s:%d: out of sync: no input remaining", file, lineno)
			}
			var text string
			text, input = input[0], input[1:]
			if !isSingleBytes(text) && strings.Contains(re.String(), `\B`) {
				// RE2's \B considers every byte position,
				// so it sees 'not word boundary' in the
				// middle of UTF-8 sequences. This package
				// only considers the positions between runes,
				// so it disagrees. Skip those cases.
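				// (For example, given a non-ASCII input such as "é",
				// RE2 can report a \B match between the rune's two
				// UTF-8 bytes, a position this package never visits.)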
				continue
			}
			res := strings.Split(line, ";")
			if len(res) != len(run) {
				t.Fatalf("%s:%d: have %d test results, want %d", file, lineno, len(res), len(run))
			}
			for i := range res {
				have, suffix := run[i](re, refull, text)
				want := parseResult(t, file, lineno, res[i])
				if !same(have, want) {
					t.Errorf("%s:%d: %#q%s.FindSubmatchIndex(%#q) = %v, want %v", file, lineno, re, suffix, text, have, want)
					if nfail++; nfail >= 100 {
						t.Fatalf("stopping after %d errors", nfail)
					}
					continue
				}
				b, suffix := match[i](re, refull, text)
				if b != (want != nil) {
					t.Errorf("%s:%d: %#q%s.MatchString(%#q) = %v, want %v", file, lineno, re, suffix, text, b, !b)
					if nfail++; nfail >= 100 {
						t.Fatalf("stopping after %d errors", nfail)
					}
					continue
				}
			}

		default:
			t.Fatalf("%s:%d: out of sync: %s\n", file, lineno, line)
		}
	}
	if len(input) != 0 {
		t.Fatalf("%s:%d: out of sync: have %d strings left at EOF", file, lineno, len(input))
	}
	t.Logf("%d cases tested", ncase)
}

var run = []func(*Regexp, *Regexp, string) ([]int, string){
	runFull,
	runPartial,
	runFullLongest,
	runPartialLongest,
}

func runFull(re, refull *Regexp, text string) ([]int, string) {
	refull.longest = false
	return refull.FindStringSubmatchIndex(text), "[full]"
}

func runPartial(re, refull *Regexp, text string) ([]int, string) {
	re.longest = false
	return re.FindStringSubmatchIndex(text), ""
}

func runFullLongest(re, refull *Regexp, text string) ([]int, string) {
	refull.longest = true
	return refull.FindStringSubmatchIndex(text), "[full,longest]"
}

func runPartialLongest(re, refull *Regexp, text string) ([]int, string) {
	re.longest = true
	return re.FindStringSubmatchIndex(text), "[longest]"
}

var match = []func(*Regexp, *Regexp, string) (bool, string){
	matchFull,
	matchPartial,
	matchFullLongest,
	matchPartialLongest,
}

func matchFull(re, refull *Regexp, text string) (bool, string) {
	refull.longest = false
	return refull.MatchString(text), "[full]"
}

func matchPartial(re, refull *Regexp, text string) (bool, string) {
	re.longest = false
	return re.MatchString(text), ""
}

func matchFullLongest(re, refull *Regexp, text string) (bool, string) {
	refull.longest = true
	return refull.MatchString(text), "[full,longest]"
}

func matchPartialLongest(re, refull *Regexp, text string) (bool, string) {
	re.longest = true
	return re.MatchString(text), "[longest]"
}

func isSingleBytes(s string) bool {
	for _, c := range s {
		if c >= utf8.RuneSelf {
			return false
		}
	}
	return true
}

func tryCompile(s string) (re *Regexp, err error) {
	// Protect against panic during Compile.
	defer func() {
		if r := recover(); r != nil {
			err = fmt.Errorf("panic: %v", r)
		}
	}()
	return Compile(s)
}

func parseResult(t *testing.T, file string, lineno int, res string) []int {
	// A single - indicates no match.
	if res == "-" {
		return nil
	}
	// Otherwise, a space-separated list of pairs.
	n := 1
	for j := 0; j < len(res); j++ {
		if res[j] == ' ' {
			n++
		}
	}
	out := make([]int, 2*n)
	i := 0
	n = 0
	for j := 0; j <= len(res); j++ {
		if j == len(res) || res[j] == ' ' {
			// Process a single pair. - means no submatch.
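			// Each pair has the form "lo-hi", e.g. "0-3", as described
			// in the format comment above TestRE2Search.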
			pair := res[i:j]
			if pair == "-" {
				out[n] = -1
				out[n+1] = -1
			} else {
				k := strings.Index(pair, "-")
				if k < 0 {
					t.Fatalf("%s:%d: invalid pair %s", file, lineno, pair)
				}
				lo, err1 := strconv.Atoi(pair[:k])
				hi, err2 := strconv.Atoi(pair[k+1:])
				if err1 != nil || err2 != nil || lo > hi {
					t.Fatalf("%s:%d: invalid pair %s", file, lineno, pair)
				}
				out[n] = lo
				out[n+1] = hi
			}
			n += 2
			i = j + 1
		}
	}
	return out
}

func same(x, y []int) bool {
	if len(x) != len(y) {
		return false
	}
	for i, xi := range x {
		if xi != y[i] {
			return false
		}
	}
	return true
}

// TestFowler runs this package's regexp API against the
// POSIX regular expression tests collected by Glenn Fowler
// at http://www2.research.att.com/~gsf/testregex/.
func TestFowler(t *testing.T) {
	files, err := filepath.Glob("testdata/*.dat")
	if err != nil {
		t.Fatal(err)
	}
	for _, file := range files {
		t.Log(file)
		testFowler(t, file)
	}
}

var notab = MustCompilePOSIX(`[^\t]+`)

func testFowler(t *testing.T, file string) {
	f, err := os.Open(file)
	if err != nil {
		t.Error(err)
		return
	}
	defer f.Close()
	b := bufio.NewReader(f)
	lineno := 0
	lastRegexp := ""
Reading:
	for {
		lineno++
		line, err := b.ReadString('\n')
		if err != nil {
			if err != io.EOF {
				t.Errorf("%s:%d: %v", file, lineno, err)
			}
			break Reading
		}

		// http://www2.research.att.com/~gsf/man/man1/testregex.html
		//
		// INPUT FORMAT
		//   Input lines may be blank, a comment beginning with #, or a test
		//   specification. A specification is five fields separated by one
		//   or more tabs. NULL denotes the empty string and NIL denotes the
		//   0 pointer.
		if line[0] == '#' || line[0] == '\n' {
			continue Reading
		}
		line = line[:len(line)-1]
		field := notab.FindAllString(line, -1)
		for i, f := range field {
			if f == "NULL" {
				field[i] = ""
			}
			if f == "NIL" {
				t.Logf("%s:%d: skip: %s", file, lineno, line)
				continue Reading
			}
		}
		if len(field) == 0 {
			continue Reading
		}

		// Field 1: the regex(3) flags to apply, one character per REG_feature
		// flag. The test is skipped if REG_feature is not supported by the
		// implementation. If the first character is not [BEASKLP] then the
		// specification is a global control line. One or more of [BEASKLP] may be
		// specified; the test will be repeated for each mode.
		//
		//   B  basic                    BRE  (grep, ed, sed)
		//   E  REG_EXTENDED             ERE  (egrep)
		//   A  REG_AUGMENTED            ARE  (egrep with negation)
		//   S  REG_SHELL                SRE  (sh glob)
		//   K  REG_SHELL|REG_AUGMENTED  KRE  (ksh glob)
		//   L  REG_LITERAL              LRE  (fgrep)
		//
		//   a  REG_LEFT|REG_RIGHT       implicit ^...$
		//   b  REG_NOTBOL               lhs does not match ^
		//   c  REG_COMMENT              ignore space and #...\n
		//   d  REG_SHELL_DOT            explicit leading . match
		//   e  REG_NOTEOL               rhs does not match $
		//   f  REG_MULTIPLE             multiple \n separated patterns
		//   g  FNM_LEADING_DIR          testfnmatch only -- match until /
		//   h  REG_MULTIREF             multiple digit backref
		//   i  REG_ICASE                ignore case
		//   j  REG_SPAN                 . matches \n
		//   k  REG_ESCAPE               \ to escape [...] delimiter
		//   l  REG_LEFT                 implicit ^...
		//   m  REG_MINIMAL              minimal match
		//   n  REG_NEWLINE              explicit \n match
		//   o  REG_ENCLOSED             (|&) magic inside [@|&](...)
		//   p  REG_SHELL_PATH           explicit / match
		//   q  REG_DELIMITED            delimited pattern
		//   r  REG_RIGHT                implicit ...$
		//   s  REG_SHELL_ESCAPED        \ not special
		//   t  REG_MUSTDELIM            all delimiters must be specified
		//   u  standard                 unspecified behavior -- errors not counted
		//   v  REG_CLASS_ESCAPE         \ special inside [...]
		//   w  REG_NOSUB                no subexpression match array
		//   x  REG_LENIENT              let some errors slide
		//   y  REG_LEFT                 regexec() implicit ^...
		//   z  REG_NULL                 NULL subexpressions ok
		//   $                           expand C \c escapes in fields 2 and 3
		//   /                           field 2 is a regsubcomp() expression
		//   =                           field 3 is a regdecomp() expression
		//
		// Field 1 control lines:
		//
		//   C                 set LC_COLLATE and LC_CTYPE to locale in field 2
		//
		//   ?test ...         output field 5 if passed and != EXPECTED, silent otherwise
		//   &test ...         output field 5 if current and previous passed
		//   |test ...         output field 5 if current passed and previous failed
		//   ; ...             output field 2 if previous failed
		//   {test ...         skip if failed until }
		//   }                 end of skip
		//
		//   : comment         comment copied as output NOTE
		//   :comment:test     :comment: ignored
		//   N[OTE] comment    comment copied as output NOTE
		//   T[EST] comment    comment
		//
		//   number            use number for nmatch (20 by default)
		flag := field[0]
		switch flag[0] {
		case '?', '&', '|', ';', '{', '}':
			// Ignore all the control operators.
			// Just run everything.
			flag = flag[1:]
			if flag == "" {
				continue Reading
			}
		case ':':
			i := strings.Index(flag[1:], ":")
			if i < 0 {
				t.Logf("skip: %s", line)
				continue Reading
			}
			flag = flag[1+i+1:]
		case 'C', 'N', 'T', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9':
			t.Logf("skip: %s", line)
			continue Reading
		}

		// Can check field count now that we've handled the myriad comment formats.
		if len(field) < 4 {
			t.Errorf("%s:%d: too few fields: %s", file, lineno, line)
			continue Reading
		}

		// Expand C escapes (a.k.a. Go escapes).
		if strings.Contains(flag, "$") {
			f := `"` + field[1] + `"`
			if field[1], err = strconv.Unquote(f); err != nil {
				t.Errorf("%s:%d: cannot unquote %s", file, lineno, f)
			}
			f = `"` + field[2] + `"`
			if field[2], err = strconv.Unquote(f); err != nil {
				t.Errorf("%s:%d: cannot unquote %s", file, lineno, f)
			}
		}

		// Field 2: the regular expression pattern; SAME uses the pattern from
		// the previous specification.
		//
		if field[1] == "SAME" {
			field[1] = lastRegexp
		}
		lastRegexp = field[1]

		// Field 3: the string to match.
		text := field[2]

		// Field 4: the test outcome...
		ok, shouldCompile, shouldMatch, pos := parseFowlerResult(field[3])
		if !ok {
			t.Errorf("%s:%d: cannot parse result %#q", file, lineno, field[3])
			continue Reading
		}

		// Field 5: optional comment appended to the report.

	Testing:
		// Run test once for each specified capital letter mode that we support.
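		// Note: only 'E' (POSIX extended) and 'L' (literal, via QuoteMeta)
		// are recognized below; any other mode letter is skipped by the
		// default case.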
		for _, c := range flag {
			pattern := field[1]
			syn := syntax.POSIX | syntax.ClassNL
			switch c {
			default:
				continue Testing
			case 'E':
				// extended regexp (what we support)
			case 'L':
				// literal
				pattern = QuoteMeta(pattern)
			}

			for _, c := range flag {
				switch c {
				case 'i':
					syn |= syntax.FoldCase
				}
			}

			re, err := compile(pattern, syn, true)
			if err != nil {
				if shouldCompile {
					t.Errorf("%s:%d: %#q did not compile", file, lineno, pattern)
				}
				continue Testing
			}
			if !shouldCompile {
				t.Errorf("%s:%d: %#q should not compile", file, lineno, pattern)
				continue Testing
			}
			match := re.MatchString(text)
			if match != shouldMatch {
				t.Errorf("%s:%d: %#q.Match(%#q) = %v, want %v", file, lineno, pattern, text, match, shouldMatch)
				continue Testing
			}
			have := re.FindStringSubmatchIndex(text)
			if (len(have) > 0) != match {
				t.Errorf("%s:%d: %#q.Match(%#q) = %v, but %#q.FindSubmatchIndex(%#q) = %v", file, lineno, pattern, text, match, pattern, text, have)
				continue Testing
			}
			if len(have) > len(pos) {
				have = have[:len(pos)]
			}
			if !same(have, pos) {
				t.Errorf("%s:%d: %#q.FindSubmatchIndex(%#q) = %v, want %v", file, lineno, pattern, text, have, pos)
			}
		}
	}
}

func parseFowlerResult(s string) (ok, compiled, matched bool, pos []int) {
	// Field 4: the test outcome. This is either one of the posix error
	// codes (with REG_ omitted) or the match array, a list of (m,n)
	// entries with m and n being first and last+1 positions in the
	// field 3 string, or NULL if REG_NOSUB is in effect and success
	// is expected. BADPAT is acceptable in place of any regcomp(3)
	// error code. The match[] array is initialized to (-2,-2) before
	// each test. All array elements from 0 to nmatch-1 must be specified
	// in the outcome. Unspecified endpoints (offset -1) are denoted by ?.
	// Unset endpoints (offset -2) are denoted by X. {x}(o:n) denotes a
	// matched (?{...}) expression, where x is the text enclosed by {...},
	// o is the expression ordinal counting from 1, and n is the length of
	// the unmatched portion of the subject string. If x starts with a
	// number then that is the return value of re_execf(), otherwise 0 is
	// returned.
	switch {
	case s == "":
		// Match with no position information.
		ok = true
		compiled = true
		matched = true
		return
	case s == "NOMATCH":
		// Match failure.
		ok = true
		compiled = true
		matched = false
		return
	case 'A' <= s[0] && s[0] <= 'Z':
		// All the other error codes are compile errors.
		ok = true
		compiled = false
		return
	}
	compiled = true

	var x []int
	for s != "" {
		var end byte = ')'
		if len(x)%2 == 0 {
			if s[0] != '(' {
				ok = false
				return
			}
			s = s[1:]
			end = ','
		}
		i := 0
		for i < len(s) && s[i] != end {
			i++
		}
		if i == 0 || i == len(s) {
			ok = false
			return
		}
		var v = -1
		var err error
		if s[:i] != "?" {
			v, err = strconv.Atoi(s[:i])
			if err != nil {
				ok = false
				return
			}
		}
		x = append(x, v)
		s = s[i+1:]
	}
	if len(x)%2 != 0 {
		ok = false
		return
	}
	ok = true
	matched = true
	pos = x
	return
}

var text []byte

func makeText(n int) []byte {
	if len(text) >= n {
		return text[:n]
	}
	text = make([]byte, n)
	for i := range text {
		if rand.Intn(30) == 0 {
			text[i] = '\n'
		} else {
			text[i] = byte(rand.Intn(0x7E+1-0x20) + 0x20)
		}
	}
	return text
}

func benchmark(b *testing.B, re string, n int) {
	r := MustCompile(re)
	t := makeText(n)
	b.ResetTimer()
	b.SetBytes(int64(n))
	for i := 0; i < b.N; i++ {
		if r.Match(t) {
			b.Fatal("match!")
		}
	}
}

const (
	easy0  = "ABCDEFGHIJKLMNOPQRSTUVWXYZ$"
	easy1  = "A[AB]B[BC]C[CD]D[DE]E[EF]F[FG]G[GH]H[HI]I[IJ]J$"
	medium = "[XYZ]ABCDEFGHIJKLMNOPQRSTUVWXYZ$"
	hard   = "[ -~]*ABCDEFGHIJKLMNOPQRSTUVWXYZ$"
	parens = "([ -~])*(A)(B)(C)(D)(E)(F)(G)(H)(I)(J)(K)(L)(M)" +
		"(N)(O)(P)(Q)(R)(S)(T)(U)(V)(W)(X)(Y)(Z)$"
)

func BenchmarkMatchEasy0_32(b *testing.B)   { benchmark(b, easy0, 32<<0) }
func BenchmarkMatchEasy0_1K(b *testing.B)   { benchmark(b, easy0, 1<<10) }
func BenchmarkMatchEasy0_32K(b *testing.B)  { benchmark(b, easy0, 32<<10) }
func BenchmarkMatchEasy0_1M(b *testing.B)   { benchmark(b, easy0, 1<<20) }
func BenchmarkMatchEasy0_32M(b *testing.B)  { benchmark(b, easy0, 32<<20) }
func BenchmarkMatchEasy1_32(b *testing.B)   { benchmark(b, easy1, 32<<0) }
func BenchmarkMatchEasy1_1K(b *testing.B)   { benchmark(b, easy1, 1<<10) }
func BenchmarkMatchEasy1_32K(b *testing.B)  { benchmark(b, easy1, 32<<10) }
func BenchmarkMatchEasy1_1M(b *testing.B)   { benchmark(b, easy1, 1<<20) }
func BenchmarkMatchEasy1_32M(b *testing.B)  { benchmark(b, easy1, 32<<20) }
func BenchmarkMatchMedium_32(b *testing.B)  { benchmark(b, medium, 32<<0) }
func BenchmarkMatchMedium_1K(b *testing.B)  { benchmark(b, medium, 1<<10) }
func BenchmarkMatchMedium_32K(b *testing.B) { benchmark(b, medium, 32<<10) }
func BenchmarkMatchMedium_1M(b *testing.B)  { benchmark(b, medium, 1<<20) }
func BenchmarkMatchMedium_32M(b *testing.B) { benchmark(b, medium, 32<<20) }
func BenchmarkMatchHard_32(b *testing.B)    { benchmark(b, hard, 32<<0) }
func BenchmarkMatchHard_1K(b *testing.B)    { benchmark(b, hard, 1<<10) }
func BenchmarkMatchHard_32K(b *testing.B)   { benchmark(b, hard, 32<<10) }
func BenchmarkMatchHard_1M(b *testing.B)    { benchmark(b, hard, 1<<20) }
func BenchmarkMatchHard_32M(b *testing.B)   { benchmark(b, hard, 32<<20) }
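// The benchmarks above can be run on their own with, for example,
// 'go test -run=NONE -bench=Match' in this package's directory.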