1package regexp2 2 3import ( 4 "bufio" 5 "bytes" 6 "fmt" 7 "log" 8 "os" 9 "regexp" 10 "strconv" 11 "strings" 12 "testing" 13 "time" 14) 15 16// Process the file "testoutput1" from PCRE2 v10.21 (public domain) 17var totalCount, failCount = 0, 0 18 19func TestPcre_Basics(t *testing.T) { 20 defer func() { 21 if failCount > 0 { 22 t.Logf("%v of %v patterns failed", failCount, totalCount) 23 } 24 }() 25 // open our test patterns file and run through it 26 // validating results as we go 27 file, err := os.Open("testoutput1") 28 if err != nil { 29 log.Fatal(err) 30 } 31 defer file.Close() 32 33 // the high level structure of the file: 34 // #comments - ignore only outside of the pattern 35 // pattern (could be multi-line, could be surrounded by "" or //) after the / there are the options some we understand, some we don't 36 // test case 37 // 0: success case 38 // \= Expect no match (ignored) 39 // another test case 40 // No Match 41 // 42 // another pattern ...etc 43 44 scanner := bufio.NewScanner(file) 45 // main pattern loop 46 for scanner.Scan() { 47 // reading the file a line at a time 48 line := scanner.Text() 49 50 if trim := strings.TrimSpace(line); trim == "" || strings.HasPrefix(trim, "#") { 51 // skip blanks and comments 52 continue 53 } 54 55 patternStart := line[0] 56 if patternStart != '/' && patternStart != '"' { 57 // an error! expected a pattern but we didn't understand what was in the file 58 t.Fatalf("Unknown file format, expected line to start with '/' or '\"', line in: %v", line) 59 } 60 61 // start building our pattern, handling multi-line patterns 62 pattern := line 63 totalCount++ 64 65 // keep appending the lines to our pattern string until we 66 // find our closing tag, don't allow the first char to match on the 67 // line start, but subsequent lines could end on the first char 68 allowFirst := false 69 for !containsEnder(line, patternStart, allowFirst) { 70 if !scanner.Scan() { 71 // an error! 
expected more pattern, but got eof 72 t.Fatalf("Unknown file format, expected more pattern text, but got EOF, pattern so far: %v", pattern) 73 } 74 line = scanner.Text() 75 pattern += fmt.Sprintf("\n%s", line) 76 allowFirst = true 77 } 78 79 // we have our raw pattern! -- we need to convert this to a compiled regex 80 re := compileRawPattern(t, pattern) 81 82 var ( 83 capsIdx map[int]int 84 m *Match 85 toMatch string 86 ) 87 // now we need to parse the test cases if there are any 88 // they start with 4 spaces -- if we don't get a 4-space start then 89 // we're back out to our next pattern 90 for scanner.Scan() { 91 line = scanner.Text() 92 93 // blank line is our separator for a new pattern 94 if strings.TrimSpace(line) == "" { 95 break 96 } 97 98 // could be either " " or "\= Expect" 99 if strings.HasPrefix(line, "\\= Expect") { 100 continue 101 } else if strings.HasPrefix(line, " ") { 102 // trim off leading spaces for our text to match 103 toMatch = line[4:] 104 // trim off trailing spaces too 105 toMatch = strings.TrimRight(toMatch, " ") 106 107 m = matchString(t, re, toMatch) 108 109 capsIdx = make(map[int]int) 110 continue 111 //t.Fatalf("Expected match text to start with 4 spaces, instead got: '%v'", line) 112 } else if strings.HasPrefix(line, "No match") { 113 validateNoMatch(t, re, m) 114 // no match means we're done 115 continue 116 } else if subs := matchGroup.FindStringSubmatch(line); len(subs) == 3 { 117 gIdx, _ := strconv.Atoi(subs[1]) 118 if _, ok := capsIdx[gIdx]; !ok { 119 capsIdx[gIdx] = 0 120 } 121 validateMatch(t, re, m, toMatch, subs[2], gIdx, capsIdx[gIdx]) 122 capsIdx[gIdx]++ 123 continue 124 } else { 125 // no match -- problem 126 t.Fatalf("Unknown file format, expected match or match group but got '%v'", line) 127 } 128 } 129 130 } 131 132 if err := scanner.Err(); err != nil { 133 log.Fatal(err) 134 } 135} 136 137var matchGroup = regexp.MustCompile(`^\s*(\d+): (.*)`) 138 139func problem(t *testing.T, input string, args ...interface{}) { 
140 failCount++ 141 t.Errorf(input, args...) 142} 143 144func validateNoMatch(t *testing.T, re *Regexp, m *Match) { 145 if re == nil || m == nil { 146 return 147 } 148 149 problem(t, "Expected no match for pattern '%v', but got '%v'", re.pattern, m.String()) 150} 151 152func validateMatch(t *testing.T, re *Regexp, m *Match, toMatch, value string, idx, capIdx int) { 153 if re == nil { 154 // already error'd earlier up stream 155 return 156 } 157 158 if m == nil { 159 // we didn't match, but should have 160 problem(t, "Expected match for pattern '%v' with input '%v', but got no match", re.pattern, toMatch) 161 return 162 } 163 164 g := m.Groups() 165 if len(g) <= idx { 166 problem(t, "Expected group %v does not exist in pattern '%v' with input '%v'", idx, re.pattern, toMatch) 167 return 168 } 169 170 if value == "<unset>" { 171 // this means we shouldn't have a cap for this group 172 if len(g[idx].Captures) > 0 { 173 problem(t, "Expected no cap %v in group %v in pattern '%v' with input '%v'", g[idx].Captures[capIdx].String(), idx, re.pattern, toMatch) 174 } 175 176 return 177 } 178 179 if len(g[idx].Captures) <= capIdx { 180 problem(t, "Expected cap %v does not exist in group %v in pattern '%v' with input '%v'", capIdx, idx, re.pattern, toMatch) 181 return 182 } 183 184 escp := unEscapeGroup(g[idx].String()) 185 //escp := unEscapeGroup(g[idx].Captures[capIdx].String()) 186 if escp != value { 187 problem(t, "Expected '%v' but got '%v' for cap %v, group %v for pattern '%v' with input '%v'", value, escp, capIdx, idx, re.pattern, toMatch) 188 return 189 } 190} 191 192func compileRawPattern(t *testing.T, pattern string) *Regexp { 193 // check our end for RegexOptions -trim them off 194 index := strings.LastIndexAny(pattern, "/\"") 195 // 196 // Append "= Debug" to compare details between corefx and regexp2 on the PCRE test suite 197 // 198 var opts RegexOptions 199 200 if index+1 < len(pattern) { 201 textOptions := pattern[index+1:] 202 pattern = pattern[:index+1] 203 // 
there are lots of complex options here 204 for _, textOpt := range strings.Split(textOptions, ",") { 205 switch textOpt { 206 case "dupnames": 207 // we don't know how to handle this... 208 default: 209 if strings.Contains(textOpt, "i") { 210 opts |= IgnoreCase 211 } 212 if strings.Contains(textOpt, "s") { 213 opts |= Singleline 214 } 215 if strings.Contains(textOpt, "m") { 216 opts |= Multiline 217 } 218 if strings.Contains(textOpt, "x") { 219 opts |= IgnorePatternWhitespace 220 } 221 } 222 } 223 224 } 225 226 // trim off first and last char 227 pattern = pattern[1 : len(pattern)-1] 228 229 defer func() { 230 if rec := recover(); rec != nil { 231 problem(t, "PANIC in compiling \"%v\": %v", pattern, rec) 232 } 233 }() 234 re, err := Compile(pattern, opts) 235 if err != nil { 236 problem(t, "Error parsing \"%v\": %v", pattern, err) 237 } 238 return re 239} 240 241func matchString(t *testing.T, re *Regexp, toMatch string) *Match { 242 if re == nil { 243 return nil 244 } 245 246 re.MatchTimeout = time.Second * 1 247 248 escp := "" 249 var err error 250 if toMatch != "\\" { 251 escp = unEscapeToMatch(toMatch) 252 } 253 m, err := re.FindStringMatch(escp) 254 if err != nil { 255 problem(t, "Error matching \"%v\" in pattern \"%v\": %v", toMatch, re.pattern, err) 256 } 257 return m 258} 259 260func containsEnder(line string, ender byte, allowFirst bool) bool { 261 index := strings.LastIndexByte(line, ender) 262 if index > 0 { 263 return true 264 } else if index == 0 && allowFirst { 265 return true 266 } 267 return false 268} 269 270func unEscapeToMatch(line string) string { 271 idx := strings.IndexRune(line, '\\') 272 // no slashes means no unescape needed 273 if idx == -1 { 274 return line 275 } 276 277 buf := bytes.NewBufferString(line[:idx]) 278 // get the runes for the rest of the string -- we're going full parser scan on this 279 280 inEscape := false 281 // take any \'s and convert them 282 for i := idx; i < len(line); i++ { 283 ch := line[i] 284 if ch == '\\' { 285 
if inEscape { 286 buf.WriteByte(ch) 287 } 288 inEscape = !inEscape 289 continue 290 } 291 if inEscape { 292 switch ch { 293 case 'x': 294 buf.WriteByte(scanHex(line, &i)) 295 case 'a': 296 buf.WriteByte(0x07) 297 case 'b': 298 buf.WriteByte('\b') 299 case 'e': 300 buf.WriteByte(0x1b) 301 case 'f': 302 buf.WriteByte('\f') 303 case 'n': 304 buf.WriteByte('\n') 305 case 'r': 306 buf.WriteByte('\r') 307 case 't': 308 buf.WriteByte('\t') 309 case 'v': 310 buf.WriteByte(0x0b) 311 default: 312 if ch >= '0' && ch <= '7' { 313 buf.WriteByte(scanOctal(line, &i)) 314 } else { 315 buf.WriteByte(ch) 316 //panic(fmt.Sprintf("unexpected escape '%v' in %v", string(ch), line)) 317 } 318 } 319 inEscape = false 320 } else { 321 buf.WriteByte(ch) 322 } 323 } 324 325 return buf.String() 326} 327 328func unEscapeGroup(val string) string { 329 // use hex for chars 0x00-0x1f, 0x7f-0xff 330 buf := &bytes.Buffer{} 331 332 for i := 0; i < len(val); i++ { 333 ch := val[i] 334 if ch <= 0x1f || ch >= 0x7f { 335 //write it as a \x00 336 fmt.Fprintf(buf, "\\x%.2x", ch) 337 } else { 338 // write as-is 339 buf.WriteByte(ch) 340 } 341 } 342 343 return buf.String() 344} 345 346func scanHex(line string, idx *int) byte { 347 if *idx >= len(line)-2 { 348 panic(fmt.Sprintf("not enough hex chars in %v at %v", line, *idx)) 349 } 350 (*idx)++ 351 d1 := hexDigit(line[*idx]) 352 (*idx)++ 353 d2 := hexDigit(line[*idx]) 354 if d1 < 0 || d2 < 0 { 355 panic("bad hex chars") 356 } 357 358 return byte(d1*0x10 + d2) 359} 360 361// Returns n <= 0xF for a hex digit. 362func hexDigit(ch byte) int { 363 364 if d := uint(ch - '0'); d <= 9 { 365 return int(d) 366 } 367 368 if d := uint(ch - 'a'); d <= 5 { 369 return int(d + 0xa) 370 } 371 372 if d := uint(ch - 'A'); d <= 5 { 373 return int(d + 0xa) 374 } 375 376 return -1 377} 378 379// Scans up to three octal digits (stops before exceeding 0377). 
func scanOctal(line string, idx *int) byte {
	// An octal escape is one to three digits long, and never longer than
	// what is left of the line.
	budget := len(line) - *idx
	if budget > 3 {
		budget = 3
	}

	pos := *idx
	value := 0

	// The subtraction is done on bytes and wraps around, so anything that is
	// not an octal digit ends up greater than 7 and stops the scan.
	digit := int(line[pos] - '0')
	for ; budget > 0 && digit <= 7; budget-- {
		value = value*8 + digit

		pos++
		if pos < len(line) {
			digit = int(line[pos] - '0')
		}
	}

	// Hand back the position of the last digit consumed; the caller's loop
	// performs the final step forward.
	*idx = pos - 1

	// Octal codes only go up to 255. Anything larger simply loses its high
	// bits, which is the behavior Perl follows.
	return byte(value & 0xFF)
}