package regexp2 import ( "bufio" "bytes" "fmt" "log" "os" "regexp" "strconv" "strings" "testing" "time" ) // Process the file "testoutput1" from PCRE2 v10.21 (public domain) var totalCount, failCount = 0, 0 func TestPcre_Basics(t *testing.T) { defer func() { if failCount > 0 { t.Logf("%v of %v patterns failed", failCount, totalCount) } }() // open our test patterns file and run through it // validating results as we go file, err := os.Open("testoutput1") if err != nil { log.Fatal(err) } defer file.Close() // the high level structure of the file: // #comments - ignore only outside of the pattern // pattern (could be multi-line, could be surrounded by "" or //) after the / there are the options some we understand, some we don't // test case // 0: success case // \= Expect no match (ignored) // another test case // No Match // // another pattern ...etc scanner := bufio.NewScanner(file) // main pattern loop for scanner.Scan() { // reading the file a line at a time line := scanner.Text() if trim := strings.TrimSpace(line); trim == "" || strings.HasPrefix(trim, "#") { // skip blanks and comments continue } patternStart := line[0] if patternStart != '/' && patternStart != '"' { // an error! expected a pattern but we didn't understand what was in the file t.Fatalf("Unknown file format, expected line to start with '/' or '\"', line in: %v", line) } // start building our pattern, handling multi-line patterns pattern := line totalCount++ // keep appending the lines to our pattern string until we // find our closing tag, don't allow the first char to match on the // line start, but subsequent lines could end on the first char allowFirst := false for !containsEnder(line, patternStart, allowFirst) { if !scanner.Scan() { // an error! expected more pattern, but got eof t.Fatalf("Unknown file format, expected more pattern text, but got EOF, pattern so far: %v", pattern) } line = scanner.Text() pattern += fmt.Sprintf("\n%s", line) allowFirst = true } // we have our raw pattern! -- we need to convert this to a compiled regex re := compileRawPattern(t, pattern) var ( capsIdx map[int]int m *Match toMatch string ) // now we need to parse the test cases if there are any // they start with 4 spaces -- if we don't get a 4-space start then // we're back out to our next pattern for scanner.Scan() { line = scanner.Text() // blank line is our separator for a new pattern if strings.TrimSpace(line) == "" { break } // could be either " " or "\= Expect" if strings.HasPrefix(line, "\\= Expect") { continue } else if strings.HasPrefix(line, " ") { // trim off leading spaces for our text to match toMatch = line[4:] // trim off trailing spaces too toMatch = strings.TrimRight(toMatch, " ") m = matchString(t, re, toMatch) capsIdx = make(map[int]int) continue //t.Fatalf("Expected match text to start with 4 spaces, instead got: '%v'", line) } else if strings.HasPrefix(line, "No match") { validateNoMatch(t, re, m) // no match means we're done continue } else if subs := matchGroup.FindStringSubmatch(line); len(subs) == 3 { gIdx, _ := strconv.Atoi(subs[1]) if _, ok := capsIdx[gIdx]; !ok { capsIdx[gIdx] = 0 } validateMatch(t, re, m, toMatch, subs[2], gIdx, capsIdx[gIdx]) capsIdx[gIdx]++ continue } else { // no match -- problem t.Fatalf("Unknown file format, expected match or match group but got '%v'", line) } } } if err := scanner.Err(); err != nil { log.Fatal(err) } } var matchGroup = regexp.MustCompile(`^\s*(\d+): (.*)`) func problem(t *testing.T, input string, args ...interface{}) { failCount++ t.Errorf(input, args...) } func validateNoMatch(t *testing.T, re *Regexp, m *Match) { if re == nil || m == nil { return } problem(t, "Expected no match for pattern '%v', but got '%v'", re.pattern, m.String()) } func validateMatch(t *testing.T, re *Regexp, m *Match, toMatch, value string, idx, capIdx int) { if re == nil { // already error'd earlier up stream return } if m == nil { // we didn't match, but should have problem(t, "Expected match for pattern '%v' with input '%v', but got no match", re.pattern, toMatch) return } g := m.Groups() if len(g) <= idx { problem(t, "Expected group %v does not exist in pattern '%v' with input '%v'", idx, re.pattern, toMatch) return } if value == "" { // this means we shouldn't have a cap for this group if len(g[idx].Captures) > 0 { problem(t, "Expected no cap %v in group %v in pattern '%v' with input '%v'", g[idx].Captures[capIdx].String(), idx, re.pattern, toMatch) } return } if len(g[idx].Captures) <= capIdx { problem(t, "Expected cap %v does not exist in group %v in pattern '%v' with input '%v'", capIdx, idx, re.pattern, toMatch) return } escp := unEscapeGroup(g[idx].String()) //escp := unEscapeGroup(g[idx].Captures[capIdx].String()) if escp != value { problem(t, "Expected '%v' but got '%v' for cap %v, group %v for pattern '%v' with input '%v'", value, escp, capIdx, idx, re.pattern, toMatch) return } } func compileRawPattern(t *testing.T, pattern string) *Regexp { // check our end for RegexOptions -trim them off index := strings.LastIndexAny(pattern, "/\"") // // Append "= Debug" to compare details between corefx and regexp2 on the PCRE test suite // var opts RegexOptions if index+1 < len(pattern) { textOptions := pattern[index+1:] pattern = pattern[:index+1] // there are lots of complex options here for _, textOpt := range strings.Split(textOptions, ",") { switch textOpt { case "dupnames": // we don't know how to handle this... default: if strings.Contains(textOpt, "i") { opts |= IgnoreCase } if strings.Contains(textOpt, "s") { opts |= Singleline } if strings.Contains(textOpt, "m") { opts |= Multiline } if strings.Contains(textOpt, "x") { opts |= IgnorePatternWhitespace } } } } // trim off first and last char pattern = pattern[1 : len(pattern)-1] defer func() { if rec := recover(); rec != nil { problem(t, "PANIC in compiling \"%v\": %v", pattern, rec) } }() re, err := Compile(pattern, opts) if err != nil { problem(t, "Error parsing \"%v\": %v", pattern, err) } return re } func matchString(t *testing.T, re *Regexp, toMatch string) *Match { if re == nil { return nil } re.MatchTimeout = time.Second * 1 escp := "" var err error if toMatch != "\\" { escp = unEscapeToMatch(toMatch) } m, err := re.FindStringMatch(escp) if err != nil { problem(t, "Error matching \"%v\" in pattern \"%v\": %v", toMatch, re.pattern, err) } return m } func containsEnder(line string, ender byte, allowFirst bool) bool { index := strings.LastIndexByte(line, ender) if index > 0 { return true } else if index == 0 && allowFirst { return true } return false } func unEscapeToMatch(line string) string { idx := strings.IndexRune(line, '\\') // no slashes means no unescape needed if idx == -1 { return line } buf := bytes.NewBufferString(line[:idx]) // get the runes for the rest of the string -- we're going full parser scan on this inEscape := false // take any \'s and convert them for i := idx; i < len(line); i++ { ch := line[i] if ch == '\\' { if inEscape { buf.WriteByte(ch) } inEscape = !inEscape continue } if inEscape { switch ch { case 'x': buf.WriteByte(scanHex(line, &i)) case 'a': buf.WriteByte(0x07) case 'b': buf.WriteByte('\b') case 'e': buf.WriteByte(0x1b) case 'f': buf.WriteByte('\f') case 'n': buf.WriteByte('\n') case 'r': buf.WriteByte('\r') case 't': buf.WriteByte('\t') case 'v': buf.WriteByte(0x0b) default: if ch >= '0' && ch <= '7' { buf.WriteByte(scanOctal(line, &i)) } else { buf.WriteByte(ch) //panic(fmt.Sprintf("unexpected escape '%v' in %v", string(ch), line)) } } inEscape = false } else { buf.WriteByte(ch) } } return buf.String() } func unEscapeGroup(val string) string { // use hex for chars 0x00-0x1f, 0x7f-0xff buf := &bytes.Buffer{} for i := 0; i < len(val); i++ { ch := val[i] if ch <= 0x1f || ch >= 0x7f { //write it as a \x00 fmt.Fprintf(buf, "\\x%.2x", ch) } else { // write as-is buf.WriteByte(ch) } } return buf.String() } func scanHex(line string, idx *int) byte { if *idx >= len(line)-2 { panic(fmt.Sprintf("not enough hex chars in %v at %v", line, *idx)) } (*idx)++ d1 := hexDigit(line[*idx]) (*idx)++ d2 := hexDigit(line[*idx]) if d1 < 0 || d2 < 0 { panic("bad hex chars") } return byte(d1*0x10 + d2) } // Returns n <= 0xF for a hex digit. func hexDigit(ch byte) int { if d := uint(ch - '0'); d <= 9 { return int(d) } if d := uint(ch - 'a'); d <= 5 { return int(d + 0xa) } if d := uint(ch - 'A'); d <= 5 { return int(d + 0xa) } return -1 } // Scans up to three octal digits (stops before exceeding 0377). func scanOctal(line string, idx *int) byte { // Consume octal chars only up to 3 digits and value 0377 // octals can be 3,2, or 1 digit c := 3 if diff := len(line) - *idx; c > diff { c = diff } i := 0 d := int(line[*idx] - '0') for c > 0 && d <= 7 { i *= 8 i += d c-- (*idx)++ if *idx < len(line) { d = int(line[*idx] - '0') } } (*idx)-- // Octal codes only go up to 255. Any larger and the behavior that Perl follows // is simply to truncate the high bits. i &= 0xFF return byte(i) }