1package regexp2
2
3import (
4	"bufio"
5	"bytes"
6	"fmt"
7	"log"
8	"os"
9	"regexp"
10	"strconv"
11	"strings"
12	"testing"
13	"time"
14)
15
// Running tallies for the pass over the file "testoutput1" from PCRE2 v10.21
// (public domain).
//
// totalCount is incremented once for every pattern read from the file.
// failCount is incremented once for every reported problem, so a single
// pattern with several failing checks contributes more than once.
var (
	totalCount int
	failCount  int
)
18
19func TestPcre_Basics(t *testing.T) {
20	defer func() {
21		if failCount > 0 {
22			t.Logf("%v of %v patterns failed", failCount, totalCount)
23		}
24	}()
25	// open our test patterns file and run through it
26	// validating results as we go
27	file, err := os.Open("testoutput1")
28	if err != nil {
29		log.Fatal(err)
30	}
31	defer file.Close()
32
33	// the high level structure of the file:
34	//		#comments - ignore only outside of the pattern
35	//		pattern (could be multi-line, could be surrounded by "" or //) after the / there are the options some we understand, some we don't
36	//		    test case
37	//		 0: success case
38	//		\= Expect no match (ignored)
39	//		    another test case
40	//		No Match
41	//
42	//		another pattern ...etc
43
44	scanner := bufio.NewScanner(file)
45	// main pattern loop
46	for scanner.Scan() {
47		// reading the file a line at a time
48		line := scanner.Text()
49
50		if trim := strings.TrimSpace(line); trim == "" || strings.HasPrefix(trim, "#") {
51			// skip blanks and comments
52			continue
53		}
54
55		patternStart := line[0]
56		if patternStart != '/' && patternStart != '"' {
57			// an error!  expected a pattern but we didn't understand what was in the file
58			t.Fatalf("Unknown file format, expected line to start with '/' or '\"', line in: %v", line)
59		}
60
61		// start building our pattern, handling multi-line patterns
62		pattern := line
63		totalCount++
64
65		// keep appending the lines to our pattern string until we
66		// find our closing tag, don't allow the first char to match on the
67		// line start, but subsequent lines could end on the first char
68		allowFirst := false
69		for !containsEnder(line, patternStart, allowFirst) {
70			if !scanner.Scan() {
71				// an error!  expected more pattern, but got eof
72				t.Fatalf("Unknown file format, expected more pattern text, but got EOF, pattern so far: %v", pattern)
73			}
74			line = scanner.Text()
75			pattern += fmt.Sprintf("\n%s", line)
76			allowFirst = true
77		}
78
79		// we have our raw pattern! -- we need to convert this to a compiled regex
80		re := compileRawPattern(t, pattern)
81
82		var (
83			capsIdx map[int]int
84			m       *Match
85			toMatch string
86		)
87		// now we need to parse the test cases if there are any
88		// they start with 4 spaces -- if we don't get a 4-space start then
89		// we're back out to our next pattern
90		for scanner.Scan() {
91			line = scanner.Text()
92
93			// blank line is our separator for a new pattern
94			if strings.TrimSpace(line) == "" {
95				break
96			}
97
98			// could be either "    " or "\= Expect"
99			if strings.HasPrefix(line, "\\= Expect") {
100				continue
101			} else if strings.HasPrefix(line, "    ") {
102				// trim off leading spaces for our text to match
103				toMatch = line[4:]
104				// trim off trailing spaces too
105				toMatch = strings.TrimRight(toMatch, " ")
106
107				m = matchString(t, re, toMatch)
108
109				capsIdx = make(map[int]int)
110				continue
111				//t.Fatalf("Expected match text to start with 4 spaces, instead got: '%v'", line)
112			} else if strings.HasPrefix(line, "No match") {
113				validateNoMatch(t, re, m)
114				// no match means we're done
115				continue
116			} else if subs := matchGroup.FindStringSubmatch(line); len(subs) == 3 {
117				gIdx, _ := strconv.Atoi(subs[1])
118				if _, ok := capsIdx[gIdx]; !ok {
119					capsIdx[gIdx] = 0
120				}
121				validateMatch(t, re, m, toMatch, subs[2], gIdx, capsIdx[gIdx])
122				capsIdx[gIdx]++
123				continue
124			} else {
125				// no match -- problem
126				t.Fatalf("Unknown file format, expected match or match group but got '%v'", line)
127			}
128		}
129
130	}
131
132	if err := scanner.Err(); err != nil {
133		log.Fatal(err)
134	}
135}
136
// matchGroup recognises a result line of the test file: optional leading
// whitespace, a decimal group number, a colon and a single space, then the
// expected text (which may be empty). Submatch 1 is the group number and
// submatch 2 is the expected text.
var (
	matchGroup = regexp.MustCompile(`^\s*(\d+): (.*)`)
)
138
139func problem(t *testing.T, input string, args ...interface{}) {
140	failCount++
141	t.Errorf(input, args...)
142}
143
144func validateNoMatch(t *testing.T, re *Regexp, m *Match) {
145	if re == nil || m == nil {
146		return
147	}
148
149	problem(t, "Expected no match for pattern '%v', but got '%v'", re.pattern, m.String())
150}
151
152func validateMatch(t *testing.T, re *Regexp, m *Match, toMatch, value string, idx, capIdx int) {
153	if re == nil {
154		// already error'd earlier up stream
155		return
156	}
157
158	if m == nil {
159		// we didn't match, but should have
160		problem(t, "Expected match for pattern '%v' with input '%v', but got no match", re.pattern, toMatch)
161		return
162	}
163
164	g := m.Groups()
165	if len(g) <= idx {
166		problem(t, "Expected group %v does not exist in pattern '%v' with input '%v'", idx, re.pattern, toMatch)
167		return
168	}
169
170	if value == "<unset>" {
171		// this means we shouldn't have a cap for this group
172		if len(g[idx].Captures) > 0 {
173			problem(t, "Expected no cap %v in group %v in pattern '%v' with input '%v'", g[idx].Captures[capIdx].String(), idx, re.pattern, toMatch)
174		}
175
176		return
177	}
178
179	if len(g[idx].Captures) <= capIdx {
180		problem(t, "Expected cap %v does not exist in group %v in pattern '%v' with input '%v'", capIdx, idx, re.pattern, toMatch)
181		return
182	}
183
184	escp := unEscapeGroup(g[idx].String())
185	//escp := unEscapeGroup(g[idx].Captures[capIdx].String())
186	if escp != value {
187		problem(t, "Expected '%v' but got '%v' for cap %v, group %v for pattern '%v' with input '%v'", value, escp, capIdx, idx, re.pattern, toMatch)
188		return
189	}
190}
191
192func compileRawPattern(t *testing.T, pattern string) *Regexp {
193	// check our end for RegexOptions -trim them off
194	index := strings.LastIndexAny(pattern, "/\"")
195	//
196	// Append "= Debug" to compare details between corefx and regexp2 on the PCRE test suite
197	//
198	var opts RegexOptions
199
200	if index+1 < len(pattern) {
201		textOptions := pattern[index+1:]
202		pattern = pattern[:index+1]
203		// there are lots of complex options here
204		for _, textOpt := range strings.Split(textOptions, ",") {
205			switch textOpt {
206			case "dupnames":
207				// we don't know how to handle this...
208			default:
209				if strings.Contains(textOpt, "i") {
210					opts |= IgnoreCase
211				}
212				if strings.Contains(textOpt, "s") {
213					opts |= Singleline
214				}
215				if strings.Contains(textOpt, "m") {
216					opts |= Multiline
217				}
218				if strings.Contains(textOpt, "x") {
219					opts |= IgnorePatternWhitespace
220				}
221			}
222		}
223
224	}
225
226	// trim off first and last char
227	pattern = pattern[1 : len(pattern)-1]
228
229	defer func() {
230		if rec := recover(); rec != nil {
231			problem(t, "PANIC in compiling \"%v\": %v", pattern, rec)
232		}
233	}()
234	re, err := Compile(pattern, opts)
235	if err != nil {
236		problem(t, "Error parsing \"%v\": %v", pattern, err)
237	}
238	return re
239}
240
241func matchString(t *testing.T, re *Regexp, toMatch string) *Match {
242	if re == nil {
243		return nil
244	}
245
246	re.MatchTimeout = time.Second * 1
247
248	escp := ""
249	var err error
250	if toMatch != "\\" {
251		escp = unEscapeToMatch(toMatch)
252	}
253	m, err := re.FindStringMatch(escp)
254	if err != nil {
255		problem(t, "Error matching \"%v\" in pattern \"%v\": %v", toMatch, re.pattern, err)
256	}
257	return m
258}
259
// containsEnder reports whether line holds the closing delimiter ender.
// An occurrence at index 0 only counts when allowFirst is true, which lets the
// caller ignore the opening delimiter on a pattern's first line.
func containsEnder(line string, ender byte, allowFirst bool) bool {
	switch last := strings.LastIndexByte(line, ender); {
	case last > 0:
		return true
	case last == 0:
		return allowFirst
	default:
		return false
	}
}
269
270func unEscapeToMatch(line string) string {
271	idx := strings.IndexRune(line, '\\')
272	// no slashes means no unescape needed
273	if idx == -1 {
274		return line
275	}
276
277	buf := bytes.NewBufferString(line[:idx])
278	// get the runes for the rest of the string -- we're going full parser scan on this
279
280	inEscape := false
281	// take any \'s and convert them
282	for i := idx; i < len(line); i++ {
283		ch := line[i]
284		if ch == '\\' {
285			if inEscape {
286				buf.WriteByte(ch)
287			}
288			inEscape = !inEscape
289			continue
290		}
291		if inEscape {
292			switch ch {
293			case 'x':
294				buf.WriteByte(scanHex(line, &i))
295			case 'a':
296				buf.WriteByte(0x07)
297			case 'b':
298				buf.WriteByte('\b')
299			case 'e':
300				buf.WriteByte(0x1b)
301			case 'f':
302				buf.WriteByte('\f')
303			case 'n':
304				buf.WriteByte('\n')
305			case 'r':
306				buf.WriteByte('\r')
307			case 't':
308				buf.WriteByte('\t')
309			case 'v':
310				buf.WriteByte(0x0b)
311			default:
312				if ch >= '0' && ch <= '7' {
313					buf.WriteByte(scanOctal(line, &i))
314				} else {
315					buf.WriteByte(ch)
316					//panic(fmt.Sprintf("unexpected escape '%v' in %v", string(ch), line))
317				}
318			}
319			inEscape = false
320		} else {
321			buf.WriteByte(ch)
322		}
323	}
324
325	return buf.String()
326}
327
// unEscapeGroup renders val in the notation used by the test file's result
// lines: every byte in 0x00-0x1f or 0x7f-0xff is written as \xhh (two
// lower-case hex digits) and all other bytes are copied unchanged. The input
// is processed byte by byte, so multi-byte UTF-8 sequences become one \xhh
// group per byte.
func unEscapeGroup(val string) string {
	var out bytes.Buffer

	for _, b := range []byte(val) {
		if b > 0x1f && b < 0x7f {
			// printable ASCII -- write as-is
			out.WriteByte(b)
			continue
		}
		// everything else is written as a \x00 style escape
		fmt.Fprintf(&out, "\\x%.2x", b)
	}

	return out.String()
}
345
346func scanHex(line string, idx *int) byte {
347	if *idx >= len(line)-2 {
348		panic(fmt.Sprintf("not enough hex chars in %v at %v", line, *idx))
349	}
350	(*idx)++
351	d1 := hexDigit(line[*idx])
352	(*idx)++
353	d2 := hexDigit(line[*idx])
354	if d1 < 0 || d2 < 0 {
355		panic("bad hex chars")
356	}
357
358	return byte(d1*0x10 + d2)
359}
360
// hexDigit returns the numeric value (0 to 0xF) of the hex digit ch, accepting
// both upper and lower case letters, or -1 when ch is not a hex digit.
func hexDigit(ch byte) int {
	switch {
	case '0' <= ch && ch <= '9':
		return int(ch - '0')
	case 'a' <= ch && ch <= 'f':
		return int(ch-'a') + 0xa
	case 'A' <= ch && ch <= 'F':
		return int(ch-'A') + 0xa
	}
	return -1
}
378
// scanOctal reads an octal escape of one, two or three digits beginning at
// position *idx of line and returns its value as a byte. On return *idx
// points at the last digit consumed.
//
// Values above 0377 are not rejected: only the low eight bits are kept, the
// truncation behaviour the original comment attributes to Perl.
func scanOctal(line string, idx *int) byte {
	// at most three digits, and never more than what is left of the line
	remaining := len(line) - *idx
	if remaining > 3 {
		remaining = 3
	}

	value := 0
	digit := int(line[*idx] - '0')
	for ; remaining > 0 && digit <= 7; remaining-- {
		value = value*8 + digit

		*idx++
		if *idx < len(line) {
			digit = int(line[*idx] - '0')
		}
	}
	// the loop leaves *idx one past the last digit; step back onto it
	*idx--

	return byte(value & 0xFF)
}
410