1// Copyright 2014 The Go Authors.  All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package regexp
6
7import (
8	"bytes"
9	"regexp/syntax"
10	"sort"
11	"unicode"
12)
13
14// "One-pass" regexp execution.
15// Some regexps can be analyzed to determine that they never need
16// backtracking: they are guaranteed to run in one pass over the string
17// without bothering to save all the usual NFA state.
18// Detect those and execute them more quickly.
19
20// A onePassProg is a compiled one-pass regular expression program.
21// It is the same as syntax.Prog except for the use of onePassInst.
22type onePassProg struct {
23	Inst   []onePassInst
24	Start  int // index of start instruction
25	NumCap int // number of InstCapture insts in re
26}
27
28// A onePassInst is a single instruction in a one-pass regular expression program.
29// It is the same as syntax.Inst except for the new 'Next' field.
30type onePassInst struct {
31	syntax.Inst
32	Next []uint32
33}
34
35// OnePassPrefix returns a literal string that all matches for the
36// regexp must start with.  Complete is true if the prefix
37// is the entire match. Pc is the index of the last rune instruction
38// in the string. The OnePassPrefix skips over the mandatory
39// EmptyBeginText
40func onePassPrefix(p *syntax.Prog) (prefix string, complete bool, pc uint32) {
41	i := &p.Inst[p.Start]
42	if i.Op != syntax.InstEmptyWidth || (syntax.EmptyOp(i.Arg))&syntax.EmptyBeginText == 0 {
43		return "", i.Op == syntax.InstMatch, uint32(p.Start)
44	}
45	pc = i.Out
46	i = &p.Inst[pc]
47	for i.Op == syntax.InstNop {
48		pc = i.Out
49		i = &p.Inst[pc]
50	}
51	// Avoid allocation of buffer if prefix is empty.
52	if iop(i) != syntax.InstRune || len(i.Rune) != 1 {
53		return "", i.Op == syntax.InstMatch, uint32(p.Start)
54	}
55
56	// Have prefix; gather characters.
57	var buf bytes.Buffer
58	for iop(i) == syntax.InstRune && len(i.Rune) == 1 && syntax.Flags(i.Arg)&syntax.FoldCase == 0 {
59		buf.WriteRune(i.Rune[0])
60		pc, i = i.Out, &p.Inst[i.Out]
61	}
62	if i.Op == syntax.InstEmptyWidth &&
63		syntax.EmptyOp(i.Arg)&syntax.EmptyEndText != 0 &&
64		p.Inst[i.Out].Op == syntax.InstMatch {
65		complete = true
66	}
67	return buf.String(), complete, pc
68}
69
70// OnePassNext selects the next actionable state of the prog, based on the input character.
71// It should only be called when i.Op == InstAlt or InstAltMatch, and from the one-pass machine.
72// One of the alternates may ultimately lead without input to end of line. If the instruction
73// is InstAltMatch the path to the InstMatch is in i.Out, the normal node in i.Next.
74func onePassNext(i *onePassInst, r rune) uint32 {
75	next := i.MatchRunePos(r)
76	if next >= 0 {
77		return i.Next[next]
78	}
79	if i.Op == syntax.InstAltMatch {
80		return i.Out
81	}
82	return 0
83}
84
85func iop(i *syntax.Inst) syntax.InstOp {
86	op := i.Op
87	switch op {
88	case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
89		op = syntax.InstRune
90	}
91	return op
92}
93
94// Sparse Array implementation is used as a queueOnePass.
95type queueOnePass struct {
96	sparse          []uint32
97	dense           []uint32
98	size, nextIndex uint32
99}
100
101func (q *queueOnePass) empty() bool {
102	return q.nextIndex >= q.size
103}
104
105func (q *queueOnePass) next() (n uint32) {
106	n = q.dense[q.nextIndex]
107	q.nextIndex++
108	return
109}
110
111func (q *queueOnePass) clear() {
112	q.size = 0
113	q.nextIndex = 0
114}
115
116func (q *queueOnePass) contains(u uint32) bool {
117	if u >= uint32(len(q.sparse)) {
118		return false
119	}
120	return q.sparse[u] < q.size && q.dense[q.sparse[u]] == u
121}
122
123func (q *queueOnePass) insert(u uint32) {
124	if !q.contains(u) {
125		q.insertNew(u)
126	}
127}
128
129func (q *queueOnePass) insertNew(u uint32) {
130	if u >= uint32(len(q.sparse)) {
131		return
132	}
133	q.sparse[u] = q.size
134	q.dense[q.size] = u
135	q.size++
136}
137
138func newQueue(size int) (q *queueOnePass) {
139	return &queueOnePass{
140		sparse: make([]uint32, size),
141		dense:  make([]uint32, size),
142	}
143}
144
145// mergeRuneSets merges two non-intersecting runesets, and returns the merged result,
146// and a NextIp array. The idea is that if a rune matches the OnePassRunes at index
147// i, NextIp[i/2] is the target. If the input sets intersect, an empty runeset and a
148// NextIp array with the single element mergeFailed is returned.
149// The code assumes that both inputs contain ordered and non-intersecting rune pairs.
150const mergeFailed = uint32(0xffffffff)
151
152var (
153	noRune = []rune{}
154	noNext = []uint32{mergeFailed}
155)
156
157func mergeRuneSets(leftRunes, rightRunes *[]rune, leftPC, rightPC uint32) ([]rune, []uint32) {
158	leftLen := len(*leftRunes)
159	rightLen := len(*rightRunes)
160	if leftLen&0x1 != 0 || rightLen&0x1 != 0 {
161		panic("mergeRuneSets odd length []rune")
162	}
163	var (
164		lx, rx int
165	)
166	merged := make([]rune, 0)
167	next := make([]uint32, 0)
168	ok := true
169	defer func() {
170		if !ok {
171			merged = nil
172			next = nil
173		}
174	}()
175
176	ix := -1
177	extend := func(newLow *int, newArray *[]rune, pc uint32) bool {
178		if ix > 0 && (*newArray)[*newLow] <= merged[ix] {
179			return false
180		}
181		merged = append(merged, (*newArray)[*newLow], (*newArray)[*newLow+1])
182		*newLow += 2
183		ix += 2
184		next = append(next, pc)
185		return true
186	}
187
188	for lx < leftLen || rx < rightLen {
189		switch {
190		case rx >= rightLen:
191			ok = extend(&lx, leftRunes, leftPC)
192		case lx >= leftLen:
193			ok = extend(&rx, rightRunes, rightPC)
194		case (*rightRunes)[rx] < (*leftRunes)[lx]:
195			ok = extend(&rx, rightRunes, rightPC)
196		default:
197			ok = extend(&lx, leftRunes, leftPC)
198		}
199		if !ok {
200			return noRune, noNext
201		}
202	}
203	return merged, next
204}
205
206// cleanupOnePass drops working memory, and restores certain shortcut instructions.
207func cleanupOnePass(prog *onePassProg, original *syntax.Prog) {
208	for ix, instOriginal := range original.Inst {
209		switch instOriginal.Op {
210		case syntax.InstAlt, syntax.InstAltMatch, syntax.InstRune:
211		case syntax.InstCapture, syntax.InstEmptyWidth, syntax.InstNop, syntax.InstMatch, syntax.InstFail:
212			prog.Inst[ix].Next = nil
213		case syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL:
214			prog.Inst[ix].Next = nil
215			prog.Inst[ix] = onePassInst{Inst: instOriginal}
216		}
217	}
218}
219
220// onePassCopy creates a copy of the original Prog, as we'll be modifying it
221func onePassCopy(prog *syntax.Prog) *onePassProg {
222	p := &onePassProg{
223		Start:  prog.Start,
224		NumCap: prog.NumCap,
225	}
226	for _, inst := range prog.Inst {
227		p.Inst = append(p.Inst, onePassInst{Inst: inst})
228	}
229
230	// rewrites one or more common Prog constructs that enable some otherwise
231	// non-onepass Progs to be onepass. A:BD (for example) means an InstAlt at
232	// ip A, that points to ips B & C.
233	// A:BC + B:DA => A:BC + B:CD
234	// A:BC + B:DC => A:DC + B:DC
235	for pc := range p.Inst {
236		switch p.Inst[pc].Op {
237		default:
238			continue
239		case syntax.InstAlt, syntax.InstAltMatch:
240			// A:Bx + B:Ay
241			p_A_Other := &p.Inst[pc].Out
242			p_A_Alt := &p.Inst[pc].Arg
243			// make sure a target is another Alt
244			instAlt := p.Inst[*p_A_Alt]
245			if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) {
246				p_A_Alt, p_A_Other = p_A_Other, p_A_Alt
247				instAlt = p.Inst[*p_A_Alt]
248				if !(instAlt.Op == syntax.InstAlt || instAlt.Op == syntax.InstAltMatch) {
249					continue
250				}
251			}
252			instOther := p.Inst[*p_A_Other]
253			// Analyzing both legs pointing to Alts is for another day
254			if instOther.Op == syntax.InstAlt || instOther.Op == syntax.InstAltMatch {
255				// too complicated
256				continue
257			}
258			// simple empty transition loop
259			// A:BC + B:DA => A:BC + B:DC
260			p_B_Alt := &p.Inst[*p_A_Alt].Out
261			p_B_Other := &p.Inst[*p_A_Alt].Arg
262			patch := false
263			if instAlt.Out == uint32(pc) {
264				patch = true
265			} else if instAlt.Arg == uint32(pc) {
266				patch = true
267				p_B_Alt, p_B_Other = p_B_Other, p_B_Alt
268			}
269			if patch {
270				*p_B_Alt = *p_A_Other
271			}
272
273			// empty transition to common target
274			// A:BC + B:DC => A:DC + B:DC
275			if *p_A_Other == *p_B_Alt {
276				*p_A_Alt = *p_B_Other
277			}
278		}
279	}
280	return p
281}
282
283// runeSlice exists to permit sorting the case-folded rune sets.
284type runeSlice []rune
285
286func (p runeSlice) Len() int           { return len(p) }
287func (p runeSlice) Less(i, j int) bool { return p[i] < p[j] }
288func (p runeSlice) Swap(i, j int)      { p[i], p[j] = p[j], p[i] }
289
290// Sort is a convenience method.
291func (p runeSlice) Sort() {
292	sort.Sort(p)
293}
294
295var anyRuneNotNL = []rune{0, '\n' - 1, '\n' + 1, unicode.MaxRune}
296var anyRune = []rune{0, unicode.MaxRune}
297
298// makeOnePass creates a onepass Prog, if possible. It is possible if at any alt,
299// the match engine can always tell which branch to take. The routine may modify
300// p if it is turned into a onepass Prog. If it isn't possible for this to be a
301// onepass Prog, the Prog notOnePass is returned. makeOnePass is recursive
302// to the size of the Prog.
303func makeOnePass(p *onePassProg) *onePassProg {
304	// If the machine is very long, it's not worth the time to check if we can use one pass.
305	if len(p.Inst) >= 1000 {
306		return notOnePass
307	}
308
309	var (
310		instQueue    = newQueue(len(p.Inst))
311		visitQueue   = newQueue(len(p.Inst))
312		check        func(uint32, map[uint32]bool) bool
313		onePassRunes = make([][]rune, len(p.Inst))
314	)
315
316	// check that paths from Alt instructions are unambiguous, and rebuild the new
317	// program as a onepass program
318	check = func(pc uint32, m map[uint32]bool) (ok bool) {
319		ok = true
320		inst := &p.Inst[pc]
321		if visitQueue.contains(pc) {
322			return
323		}
324		visitQueue.insert(pc)
325		switch inst.Op {
326		case syntax.InstAlt, syntax.InstAltMatch:
327			ok = check(inst.Out, m) && check(inst.Arg, m)
328			// check no-input paths to InstMatch
329			matchOut := m[inst.Out]
330			matchArg := m[inst.Arg]
331			if matchOut && matchArg {
332				ok = false
333				break
334			}
335			// Match on empty goes in inst.Out
336			if matchArg {
337				inst.Out, inst.Arg = inst.Arg, inst.Out
338				matchOut, matchArg = matchArg, matchOut
339			}
340			if matchOut {
341				m[pc] = true
342				inst.Op = syntax.InstAltMatch
343			}
344
345			// build a dispatch operator from the two legs of the alt.
346			onePassRunes[pc], inst.Next = mergeRuneSets(
347				&onePassRunes[inst.Out], &onePassRunes[inst.Arg], inst.Out, inst.Arg)
348			if len(inst.Next) > 0 && inst.Next[0] == mergeFailed {
349				ok = false
350				break
351			}
352		case syntax.InstCapture, syntax.InstNop:
353			ok = check(inst.Out, m)
354			m[pc] = m[inst.Out]
355			// pass matching runes back through these no-ops.
356			onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...)
357			inst.Next = []uint32{}
358			for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
359				inst.Next = append(inst.Next, inst.Out)
360			}
361		case syntax.InstEmptyWidth:
362			ok = check(inst.Out, m)
363			m[pc] = m[inst.Out]
364			onePassRunes[pc] = append([]rune{}, onePassRunes[inst.Out]...)
365			inst.Next = []uint32{}
366			for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
367				inst.Next = append(inst.Next, inst.Out)
368			}
369		case syntax.InstMatch, syntax.InstFail:
370			m[pc] = inst.Op == syntax.InstMatch
371			break
372		case syntax.InstRune:
373			m[pc] = false
374			if len(inst.Next) > 0 {
375				break
376			}
377			instQueue.insert(inst.Out)
378			if len(inst.Rune) == 0 {
379				onePassRunes[pc] = []rune{}
380				inst.Next = []uint32{inst.Out}
381				break
382			}
383			runes := make([]rune, 0)
384			if len(inst.Rune) == 1 && syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
385				r0 := inst.Rune[0]
386				runes = append(runes, r0, r0)
387				for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
388					runes = append(runes, r1, r1)
389				}
390				sort.Sort(runeSlice(runes))
391			} else {
392				runes = append(runes, inst.Rune...)
393			}
394			onePassRunes[pc] = runes
395			inst.Next = []uint32{}
396			for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
397				inst.Next = append(inst.Next, inst.Out)
398			}
399			inst.Op = syntax.InstRune
400		case syntax.InstRune1:
401			m[pc] = false
402			if len(inst.Next) > 0 {
403				break
404			}
405			instQueue.insert(inst.Out)
406			runes := []rune{}
407			// expand case-folded runes
408			if syntax.Flags(inst.Arg)&syntax.FoldCase != 0 {
409				r0 := inst.Rune[0]
410				runes = append(runes, r0, r0)
411				for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
412					runes = append(runes, r1, r1)
413				}
414				sort.Sort(runeSlice(runes))
415			} else {
416				runes = append(runes, inst.Rune[0], inst.Rune[0])
417			}
418			onePassRunes[pc] = runes
419			inst.Next = []uint32{}
420			for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
421				inst.Next = append(inst.Next, inst.Out)
422			}
423			inst.Op = syntax.InstRune
424		case syntax.InstRuneAny:
425			m[pc] = false
426			if len(inst.Next) > 0 {
427				break
428			}
429			instQueue.insert(inst.Out)
430			onePassRunes[pc] = append([]rune{}, anyRune...)
431			inst.Next = []uint32{inst.Out}
432		case syntax.InstRuneAnyNotNL:
433			m[pc] = false
434			if len(inst.Next) > 0 {
435				break
436			}
437			instQueue.insert(inst.Out)
438			onePassRunes[pc] = append([]rune{}, anyRuneNotNL...)
439			inst.Next = []uint32{}
440			for i := len(onePassRunes[pc]) / 2; i >= 0; i-- {
441				inst.Next = append(inst.Next, inst.Out)
442			}
443		}
444		return
445	}
446
447	instQueue.clear()
448	instQueue.insert(uint32(p.Start))
449	m := make(map[uint32]bool, len(p.Inst))
450	for !instQueue.empty() {
451		visitQueue.clear()
452		pc := instQueue.next()
453		if !check(uint32(pc), m) {
454			p = notOnePass
455			break
456		}
457	}
458	if p != notOnePass {
459		for i := range p.Inst {
460			p.Inst[i].Rune = onePassRunes[i]
461		}
462	}
463	return p
464}
465
466var notOnePass *onePassProg = nil
467
468// compileOnePass returns a new *syntax.Prog suitable for onePass execution if the original Prog
469// can be recharacterized as a one-pass regexp program, or syntax.notOnePass if the
470// Prog cannot be converted. For a one pass prog, the fundamental condition that must
471// be true is: at any InstAlt, there must be no ambiguity about what branch to  take.
472func compileOnePass(prog *syntax.Prog) (p *onePassProg) {
473	if prog.Start == 0 {
474		return notOnePass
475	}
476	// onepass regexp is anchored
477	if prog.Inst[prog.Start].Op != syntax.InstEmptyWidth ||
478		syntax.EmptyOp(prog.Inst[prog.Start].Arg)&syntax.EmptyBeginText != syntax.EmptyBeginText {
479		return notOnePass
480	}
481	// every instruction leading to InstMatch must be EmptyEndText
482	for _, inst := range prog.Inst {
483		opOut := prog.Inst[inst.Out].Op
484		switch inst.Op {
485		default:
486			if opOut == syntax.InstMatch {
487				return notOnePass
488			}
489		case syntax.InstAlt, syntax.InstAltMatch:
490			if opOut == syntax.InstMatch || prog.Inst[inst.Arg].Op == syntax.InstMatch {
491				return notOnePass
492			}
493		case syntax.InstEmptyWidth:
494			if opOut == syntax.InstMatch {
495				if syntax.EmptyOp(inst.Arg)&syntax.EmptyEndText == syntax.EmptyEndText {
496					continue
497				}
498				return notOnePass
499			}
500		}
501	}
502	// Creates a slightly optimized copy of the original Prog
503	// that cleans up some Prog idioms that block valid onepass programs
504	p = onePassCopy(prog)
505
506	// checkAmbiguity on InstAlts, build onepass Prog if possible
507	p = makeOnePass(p)
508
509	if p != notOnePass {
510		cleanupOnePass(p, prog)
511	}
512	return p
513}
514