1package syntax
2
3import (
4	"bytes"
5	"fmt"
6	"math"
7)
8
9// similar to prog.go in the go regex package...also with comment 'may not belong in this package'
10
11// File provides operator constants for use by the Builder and the Machine.
12
13// Implementation notes:
14//
15// Regexps are built into RegexCodes, which contain an operation array,
16// a string table, and some constants.
17//
18// Each operation is one of the codes below, followed by the integer
19// operands specified for each op.
20//
21// Strings and sets are indices into a string table.
22
23type InstOp int
24
25const (
26	// 					    lef/back operands        description
27
28	Onerep    InstOp = 0 // lef,back char,min,max    a {n}
29	Notonerep        = 1 // lef,back char,min,max    .{n}
30	Setrep           = 2 // lef,back set,min,max     [\d]{n}
31
32	Oneloop    = 3 // lef,back char,min,max    a {,n}
33	Notoneloop = 4 // lef,back char,min,max    .{,n}
34	Setloop    = 5 // lef,back set,min,max     [\d]{,n}
35
36	Onelazy    = 6 // lef,back char,min,max    a {,n}?
37	Notonelazy = 7 // lef,back char,min,max    .{,n}?
38	Setlazy    = 8 // lef,back set,min,max     [\d]{,n}?
39
40	One    = 9  // lef      char            a
41	Notone = 10 // lef      char            [^a]
42	Set    = 11 // lef      set             [a-z\s]  \w \s \d
43
44	Multi = 12 // lef      string          abcd
45	Ref   = 13 // lef      group           \#
46
47	Bol         = 14 //                          ^
48	Eol         = 15 //                          $
49	Boundary    = 16 //                          \b
50	Nonboundary = 17 //                          \B
51	Beginning   = 18 //                          \A
52	Start       = 19 //                          \G
53	EndZ        = 20 //                          \Z
54	End         = 21 //                          \Z
55
56	Nothing = 22 //                          Reject!
57
58	// Primitive control structures
59
60	Lazybranch      = 23 // back     jump            straight first
61	Branchmark      = 24 // back     jump            branch first for loop
62	Lazybranchmark  = 25 // back     jump            straight first for loop
63	Nullcount       = 26 // back     val             set counter, null mark
64	Setcount        = 27 // back     val             set counter, make mark
65	Branchcount     = 28 // back     jump,limit      branch++ if zero<=c<limit
66	Lazybranchcount = 29 // back     jump,limit      same, but straight first
67	Nullmark        = 30 // back                     save position
68	Setmark         = 31 // back                     save position
69	Capturemark     = 32 // back     group           define group
70	Getmark         = 33 // back                     recall position
71	Setjump         = 34 // back                     save backtrack state
72	Backjump        = 35 //                          zap back to saved state
73	Forejump        = 36 //                          zap backtracking state
74	Testref         = 37 //                          backtrack if ref undefined
75	Goto            = 38 //          jump            just go
76
77	Prune = 39 //                          prune it baby
78	Stop  = 40 //                          done!
79
80	ECMABoundary    = 41 //                          \b
81	NonECMABoundary = 42 //                          \B
82
83	// Modifiers for alternate modes
84
85	Mask  = 63  // Mask to get unmodified ordinary operator
86	Rtl   = 64  // bit to indicate that we're reverse scanning.
87	Back  = 128 // bit to indicate that we're backtracking.
88	Back2 = 256 // bit to indicate that we're backtracking on a second branch.
89	Ci    = 512 // bit to indicate that we're case-insensitive.
90)
91
92type Code struct {
93	Codes       []int       // the code
94	Strings     [][]rune    // string table
95	Sets        []*CharSet  //character set table
96	TrackCount  int         // how many instructions use backtracking
97	Caps        map[int]int // mapping of user group numbers -> impl group slots
98	Capsize     int         // number of impl group slots
99	FcPrefix    *Prefix     // the set of candidate first characters (may be null)
100	BmPrefix    *BmPrefix   // the fixed prefix string as a Boyer-Moore machine (may be null)
101	Anchors     AnchorLoc   // the set of zero-length start anchors (RegexFCD.Bol, etc)
102	RightToLeft bool        // true if right to left
103}
104
105func opcodeBacktracks(op InstOp) bool {
106	op &= Mask
107
108	switch op {
109	case Oneloop, Notoneloop, Setloop, Onelazy, Notonelazy, Setlazy, Lazybranch, Branchmark, Lazybranchmark,
110		Nullcount, Setcount, Branchcount, Lazybranchcount, Setmark, Capturemark, Getmark, Setjump, Backjump,
111		Forejump, Goto:
112		return true
113
114	default:
115		return false
116	}
117}
118
119func opcodeSize(op InstOp) int {
120	op &= Mask
121
122	switch op {
123	case Nothing, Bol, Eol, Boundary, Nonboundary, ECMABoundary, NonECMABoundary, Beginning, Start, EndZ,
124		End, Nullmark, Setmark, Getmark, Setjump, Backjump, Forejump, Stop:
125		return 1
126
127	case One, Notone, Multi, Ref, Testref, Goto, Nullcount, Setcount, Lazybranch, Branchmark, Lazybranchmark,
128		Prune, Set:
129		return 2
130
131	case Capturemark, Branchcount, Lazybranchcount, Onerep, Notonerep, Oneloop, Notoneloop, Onelazy, Notonelazy,
132		Setlazy, Setrep, Setloop:
133		return 3
134
135	default:
136		panic(fmt.Errorf("Unexpected op code: %v", op))
137	}
138}
139
140var codeStr = []string{
141	"Onerep", "Notonerep", "Setrep",
142	"Oneloop", "Notoneloop", "Setloop",
143	"Onelazy", "Notonelazy", "Setlazy",
144	"One", "Notone", "Set",
145	"Multi", "Ref",
146	"Bol", "Eol", "Boundary", "Nonboundary", "Beginning", "Start", "EndZ", "End",
147	"Nothing",
148	"Lazybranch", "Branchmark", "Lazybranchmark",
149	"Nullcount", "Setcount", "Branchcount", "Lazybranchcount",
150	"Nullmark", "Setmark", "Capturemark", "Getmark",
151	"Setjump", "Backjump", "Forejump", "Testref", "Goto",
152	"Prune", "Stop",
153	"ECMABoundary", "NonECMABoundary",
154}
155
156func operatorDescription(op InstOp) string {
157	desc := codeStr[op&Mask]
158	if (op & Ci) != 0 {
159		desc += "-Ci"
160	}
161	if (op & Rtl) != 0 {
162		desc += "-Rtl"
163	}
164	if (op & Back) != 0 {
165		desc += "-Back"
166	}
167	if (op & Back2) != 0 {
168		desc += "-Back2"
169	}
170
171	return desc
172}
173
174// OpcodeDescription is a humman readable string of the specific offset
175func (c *Code) OpcodeDescription(offset int) string {
176	buf := &bytes.Buffer{}
177
178	op := InstOp(c.Codes[offset])
179	fmt.Fprintf(buf, "%06d ", offset)
180
181	if opcodeBacktracks(op & Mask) {
182		buf.WriteString("*")
183	} else {
184		buf.WriteString(" ")
185	}
186	buf.WriteString(operatorDescription(op))
187	buf.WriteString("(")
188	op &= Mask
189
190	switch op {
191	case One, Notone, Onerep, Notonerep, Oneloop, Notoneloop, Onelazy, Notonelazy:
192		buf.WriteString("Ch = ")
193		buf.WriteString(CharDescription(rune(c.Codes[offset+1])))
194
195	case Set, Setrep, Setloop, Setlazy:
196		buf.WriteString("Set = ")
197		buf.WriteString(c.Sets[c.Codes[offset+1]].String())
198
199	case Multi:
200		fmt.Fprintf(buf, "String = %s", string(c.Strings[c.Codes[offset+1]]))
201
202	case Ref, Testref:
203		fmt.Fprintf(buf, "Index = %d", c.Codes[offset+1])
204
205	case Capturemark:
206		fmt.Fprintf(buf, "Index = %d", c.Codes[offset+1])
207		if c.Codes[offset+2] != -1 {
208			fmt.Fprintf(buf, ", Unindex = %d", c.Codes[offset+2])
209		}
210
211	case Nullcount, Setcount:
212		fmt.Fprintf(buf, "Value = %d", c.Codes[offset+1])
213
214	case Goto, Lazybranch, Branchmark, Lazybranchmark, Branchcount, Lazybranchcount:
215		fmt.Fprintf(buf, "Addr = %d", c.Codes[offset+1])
216	}
217
218	switch op {
219	case Onerep, Notonerep, Oneloop, Notoneloop, Onelazy, Notonelazy, Setrep, Setloop, Setlazy:
220		buf.WriteString(", Rep = ")
221		if c.Codes[offset+2] == math.MaxInt32 {
222			buf.WriteString("inf")
223		} else {
224			fmt.Fprintf(buf, "%d", c.Codes[offset+2])
225		}
226
227	case Branchcount, Lazybranchcount:
228		buf.WriteString(", Limit = ")
229		if c.Codes[offset+2] == math.MaxInt32 {
230			buf.WriteString("inf")
231		} else {
232			fmt.Fprintf(buf, "%d", c.Codes[offset+2])
233		}
234
235	}
236
237	buf.WriteString(")")
238
239	return buf.String()
240}
241
242func (c *Code) Dump() string {
243	buf := &bytes.Buffer{}
244
245	if c.RightToLeft {
246		fmt.Fprintln(buf, "Direction:  right-to-left")
247	} else {
248		fmt.Fprintln(buf, "Direction:  left-to-right")
249	}
250	if c.FcPrefix == nil {
251		fmt.Fprintln(buf, "Firstchars: n/a")
252	} else {
253		fmt.Fprintf(buf, "Firstchars: %v\n", c.FcPrefix.PrefixSet.String())
254	}
255
256	if c.BmPrefix == nil {
257		fmt.Fprintln(buf, "Prefix:     n/a")
258	} else {
259		fmt.Fprintf(buf, "Prefix:     %v\n", Escape(c.BmPrefix.String()))
260	}
261
262	fmt.Fprintf(buf, "Anchors:    %v\n", c.Anchors)
263	fmt.Fprintln(buf)
264
265	if c.BmPrefix != nil {
266		fmt.Fprintln(buf, "BoyerMoore:")
267		fmt.Fprintln(buf, c.BmPrefix.Dump("    "))
268	}
269	for i := 0; i < len(c.Codes); i += opcodeSize(InstOp(c.Codes[i])) {
270		fmt.Fprintln(buf, c.OpcodeDescription(i))
271	}
272
273	return buf.String()
274}
275