1// Copyright 2011 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package syntax
6
7import (
8	"bytes"
9	"strconv"
10	"unicode"
11)
12
13// Compiled program.
14// May not belong in this package, but convenient for now.
15
16// A Prog is a compiled regular expression program.
17type Prog struct {
18	Inst   []Inst
19	Start  int // index of start instruction
20	NumCap int // number of InstCapture insts in re
21}
22
23// An InstOp is an instruction opcode.
24type InstOp uint8
25
26const (
27	InstAlt InstOp = iota
28	InstAltMatch
29	InstCapture
30	InstEmptyWidth
31	InstMatch
32	InstFail
33	InstNop
34	InstRune
35	InstRune1
36	InstRuneAny
37	InstRuneAnyNotNL
38)
39
40var instOpNames = []string{
41	"InstAlt",
42	"InstAltMatch",
43	"InstCapture",
44	"InstEmptyWidth",
45	"InstMatch",
46	"InstFail",
47	"InstNop",
48	"InstRune",
49	"InstRune1",
50	"InstRuneAny",
51	"InstRuneAnyNotNL",
52}
53
54func (i InstOp) String() string {
55	if uint(i) >= uint(len(instOpNames)) {
56		return ""
57	}
58	return instOpNames[i]
59}
60
61// An EmptyOp specifies a kind or mixture of zero-width assertions.
62type EmptyOp uint8
63
64const (
65	EmptyBeginLine EmptyOp = 1 << iota
66	EmptyEndLine
67	EmptyBeginText
68	EmptyEndText
69	EmptyWordBoundary
70	EmptyNoWordBoundary
71)
72
73// EmptyOpContext returns the zero-width assertions
74// satisfied at the position between the runes r1 and r2.
75// Passing r1 == -1 indicates that the position is
76// at the beginning of the text.
77// Passing r2 == -1 indicates that the position is
78// at the end of the text.
79func EmptyOpContext(r1, r2 rune) EmptyOp {
80	var op EmptyOp = EmptyNoWordBoundary
81	var boundary byte
82	switch {
83	case IsWordChar(r1):
84		boundary = 1
85	case r1 == '\n':
86		op |= EmptyBeginLine
87	case r1 < 0:
88		op |= EmptyBeginText | EmptyBeginLine
89	}
90	switch {
91	case IsWordChar(r2):
92		boundary ^= 1
93	case r2 == '\n':
94		op |= EmptyEndLine
95	case r2 < 0:
96		op |= EmptyEndText | EmptyEndLine
97	}
98	if boundary != 0 { // IsWordChar(r1) != IsWordChar(r2)
99		op ^= (EmptyWordBoundary | EmptyNoWordBoundary)
100	}
101	return op
102}
103
104// IsWordChar reports whether r is consider a ``word character''
105// during the evaluation of the \b and \B zero-width assertions.
106// These assertions are ASCII-only: the word characters are [A-Za-z0-9_].
107func IsWordChar(r rune) bool {
108	return 'A' <= r && r <= 'Z' || 'a' <= r && r <= 'z' || '0' <= r && r <= '9' || r == '_'
109}
110
111// An Inst is a single instruction in a regular expression program.
112type Inst struct {
113	Op   InstOp
114	Out  uint32 // all but InstMatch, InstFail
115	Arg  uint32 // InstAlt, InstAltMatch, InstCapture, InstEmptyWidth
116	Rune []rune
117}
118
119func (p *Prog) String() string {
120	var b bytes.Buffer
121	dumpProg(&b, p)
122	return b.String()
123}
124
125// skipNop follows any no-op or capturing instructions
126// and returns the resulting pc.
127func (p *Prog) skipNop(pc uint32) (*Inst, uint32) {
128	i := &p.Inst[pc]
129	for i.Op == InstNop || i.Op == InstCapture {
130		pc = i.Out
131		i = &p.Inst[pc]
132	}
133	return i, pc
134}
135
136// op returns i.Op but merges all the Rune special cases into InstRune
137func (i *Inst) op() InstOp {
138	op := i.Op
139	switch op {
140	case InstRune1, InstRuneAny, InstRuneAnyNotNL:
141		op = InstRune
142	}
143	return op
144}
145
146// Prefix returns a literal string that all matches for the
147// regexp must start with. Complete is true if the prefix
148// is the entire match.
149func (p *Prog) Prefix() (prefix string, complete bool) {
150	i, _ := p.skipNop(uint32(p.Start))
151
152	// Avoid allocation of buffer if prefix is empty.
153	if i.op() != InstRune || len(i.Rune) != 1 {
154		return "", i.Op == InstMatch
155	}
156
157	// Have prefix; gather characters.
158	var buf bytes.Buffer
159	for i.op() == InstRune && len(i.Rune) == 1 && Flags(i.Arg)&FoldCase == 0 {
160		buf.WriteRune(i.Rune[0])
161		i, _ = p.skipNop(i.Out)
162	}
163	return buf.String(), i.Op == InstMatch
164}
165
166// StartCond returns the leading empty-width conditions that must
167// be true in any match. It returns ^EmptyOp(0) if no matches are possible.
168func (p *Prog) StartCond() EmptyOp {
169	var flag EmptyOp
170	pc := uint32(p.Start)
171	i := &p.Inst[pc]
172Loop:
173	for {
174		switch i.Op {
175		case InstEmptyWidth:
176			flag |= EmptyOp(i.Arg)
177		case InstFail:
178			return ^EmptyOp(0)
179		case InstCapture, InstNop:
180			// skip
181		default:
182			break Loop
183		}
184		pc = i.Out
185		i = &p.Inst[pc]
186	}
187	return flag
188}
189
190const noMatch = -1
191
192// MatchRune reports whether the instruction matches (and consumes) r.
193// It should only be called when i.Op == InstRune.
194func (i *Inst) MatchRune(r rune) bool {
195	return i.MatchRunePos(r) != noMatch
196}
197
198// MatchRunePos checks whether the instruction matches (and consumes) r.
199// If so, MatchRunePos returns the index of the matching rune pair
200// (or, when len(i.Rune) == 1, rune singleton).
201// If not, MatchRunePos returns -1.
202// MatchRunePos should only be called when i.Op == InstRune.
203func (i *Inst) MatchRunePos(r rune) int {
204	rune := i.Rune
205
206	// Special case: single-rune slice is from literal string, not char class.
207	if len(rune) == 1 {
208		r0 := rune[0]
209		if r == r0 {
210			return 0
211		}
212		if Flags(i.Arg)&FoldCase != 0 {
213			for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) {
214				if r == r1 {
215					return 0
216				}
217			}
218		}
219		return noMatch
220	}
221
222	// Peek at the first few pairs.
223	// Should handle ASCII well.
224	for j := 0; j < len(rune) && j <= 8; j += 2 {
225		if r < rune[j] {
226			return noMatch
227		}
228		if r <= rune[j+1] {
229			return j / 2
230		}
231	}
232
233	// Otherwise binary search.
234	lo := 0
235	hi := len(rune) / 2
236	for lo < hi {
237		m := lo + (hi-lo)/2
238		if c := rune[2*m]; c <= r {
239			if r <= rune[2*m+1] {
240				return m
241			}
242			lo = m + 1
243		} else {
244			hi = m
245		}
246	}
247	return noMatch
248}
249
250// MatchEmptyWidth reports whether the instruction matches
251// an empty string between the runes before and after.
252// It should only be called when i.Op == InstEmptyWidth.
253func (i *Inst) MatchEmptyWidth(before rune, after rune) bool {
254	switch EmptyOp(i.Arg) {
255	case EmptyBeginLine:
256		return before == '\n' || before == -1
257	case EmptyEndLine:
258		return after == '\n' || after == -1
259	case EmptyBeginText:
260		return before == -1
261	case EmptyEndText:
262		return after == -1
263	case EmptyWordBoundary:
264		return IsWordChar(before) != IsWordChar(after)
265	case EmptyNoWordBoundary:
266		return IsWordChar(before) == IsWordChar(after)
267	}
268	panic("unknown empty width arg")
269}
270
271func (i *Inst) String() string {
272	var b bytes.Buffer
273	dumpInst(&b, i)
274	return b.String()
275}
276
277func bw(b *bytes.Buffer, args ...string) {
278	for _, s := range args {
279		b.WriteString(s)
280	}
281}
282
283func dumpProg(b *bytes.Buffer, p *Prog) {
284	for j := range p.Inst {
285		i := &p.Inst[j]
286		pc := strconv.Itoa(j)
287		if len(pc) < 3 {
288			b.WriteString("   "[len(pc):])
289		}
290		if j == p.Start {
291			pc += "*"
292		}
293		bw(b, pc, "\t")
294		dumpInst(b, i)
295		bw(b, "\n")
296	}
297}
298
299func u32(i uint32) string {
300	return strconv.FormatUint(uint64(i), 10)
301}
302
303func dumpInst(b *bytes.Buffer, i *Inst) {
304	switch i.Op {
305	case InstAlt:
306		bw(b, "alt -> ", u32(i.Out), ", ", u32(i.Arg))
307	case InstAltMatch:
308		bw(b, "altmatch -> ", u32(i.Out), ", ", u32(i.Arg))
309	case InstCapture:
310		bw(b, "cap ", u32(i.Arg), " -> ", u32(i.Out))
311	case InstEmptyWidth:
312		bw(b, "empty ", u32(i.Arg), " -> ", u32(i.Out))
313	case InstMatch:
314		bw(b, "match")
315	case InstFail:
316		bw(b, "fail")
317	case InstNop:
318		bw(b, "nop -> ", u32(i.Out))
319	case InstRune:
320		if i.Rune == nil {
321			// shouldn't happen
322			bw(b, "rune <nil>")
323		}
324		bw(b, "rune ", strconv.QuoteToASCII(string(i.Rune)))
325		if Flags(i.Arg)&FoldCase != 0 {
326			bw(b, "/i")
327		}
328		bw(b, " -> ", u32(i.Out))
329	case InstRune1:
330		bw(b, "rune1 ", strconv.QuoteToASCII(string(i.Rune)), " -> ", u32(i.Out))
331	case InstRuneAny:
332		bw(b, "any -> ", u32(i.Out))
333	case InstRuneAnyNotNL:
334		bw(b, "anynotnl -> ", u32(i.Out))
335	}
336}
337