1//  Copyright (c) 2017 Couchbase, Inc.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// 		http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15package regexp
16
17import (
18	"encoding/binary"
19	"fmt"
20)
21
22// StateLimit is the maximum number of states allowed
23const StateLimit = 10000
24
25// ErrTooManyStates is returned if you attempt to build a Levenshtein
26// automaton which requires too many states.
27var ErrTooManyStates = fmt.Errorf("dfa contains more than %d states",
28	StateLimit)
29
30type dfaBuilder struct {
31	dfa    *dfa
32	cache  map[string]int
33	keyBuf []byte
34}
35
36func newDfaBuilder(insts prog) *dfaBuilder {
37	d := &dfaBuilder{
38		dfa: &dfa{
39			insts:  insts,
40			states: make([]state, 0, 16),
41		},
42		cache: make(map[string]int, 1024),
43	}
44	// add 0 state that is invalid
45	d.dfa.states = append(d.dfa.states, state{
46		next:  make([]int, 256),
47		match: false,
48	})
49	return d
50}
51
52func (d *dfaBuilder) build() (*dfa, error) {
53	cur := newSparseSet(uint(len(d.dfa.insts)))
54	next := newSparseSet(uint(len(d.dfa.insts)))
55
56	d.dfa.add(cur, 0)
57	ns, instsReuse := d.cachedState(cur, nil)
58	states := intStack{ns}
59	seen := make(map[int]struct{})
60	var s int
61	states, s = states.Pop()
62	for s != 0 {
63		for b := 0; b < 256; b++ {
64			var ns int
65			ns, instsReuse = d.runState(cur, next, s, byte(b), instsReuse)
66			if ns != 0 {
67				if _, ok := seen[ns]; !ok {
68					seen[ns] = struct{}{}
69					states = states.Push(ns)
70				}
71			}
72			if len(d.dfa.states) > StateLimit {
73				return nil, ErrTooManyStates
74			}
75		}
76		states, s = states.Pop()
77	}
78	return d.dfa, nil
79}
80
81func (d *dfaBuilder) runState(cur, next *sparseSet, state int, b byte, instsReuse []uint) (
82	int, []uint) {
83	cur.Clear()
84	for _, ip := range d.dfa.states[state].insts {
85		cur.Add(ip)
86	}
87	d.dfa.run(cur, next, b)
88	var nextState int
89	nextState, instsReuse = d.cachedState(next, instsReuse)
90	d.dfa.states[state].next[b] = nextState
91	return nextState, instsReuse
92}
93
94func instsKey(insts []uint, buf []byte) []byte {
95	if cap(buf) < 8*len(insts) {
96		buf = make([]byte, 8*len(insts))
97	} else {
98		buf = buf[0 : 8*len(insts)]
99	}
100	for i, inst := range insts {
101		binary.LittleEndian.PutUint64(buf[i*8:], uint64(inst))
102	}
103	return buf
104}
105
106func (d *dfaBuilder) cachedState(set *sparseSet,
107	instsReuse []uint) (int, []uint) {
108	insts := instsReuse[:0]
109	if cap(insts) == 0 {
110		insts = make([]uint, 0, set.Len())
111	}
112	var isMatch bool
113	for i := uint(0); i < uint(set.Len()); i++ {
114		ip := set.Get(i)
115		switch d.dfa.insts[ip].op {
116		case OpRange:
117			insts = append(insts, ip)
118		case OpMatch:
119			isMatch = true
120			insts = append(insts, ip)
121		}
122	}
123	if len(insts) == 0 {
124		return 0, insts
125	}
126	d.keyBuf = instsKey(insts, d.keyBuf)
127	v, ok := d.cache[string(d.keyBuf)]
128	if ok {
129		return v, insts
130	}
131	d.dfa.states = append(d.dfa.states, state{
132		insts: insts,
133		next:  make([]int, 256),
134		match: isMatch,
135	})
136	newV := len(d.dfa.states) - 1
137	d.cache[string(d.keyBuf)] = newV
138	return newV, nil
139}
140
141type dfa struct {
142	insts  prog
143	states []state
144}
145
146func (d *dfa) add(set *sparseSet, ip uint) {
147	if set.Contains(ip) {
148		return
149	}
150	set.Add(ip)
151	switch d.insts[ip].op {
152	case OpJmp:
153		d.add(set, d.insts[ip].to)
154	case OpSplit:
155		d.add(set, d.insts[ip].splitA)
156		d.add(set, d.insts[ip].splitB)
157	}
158}
159
160func (d *dfa) run(from, to *sparseSet, b byte) bool {
161	to.Clear()
162	var isMatch bool
163	for i := uint(0); i < uint(from.Len()); i++ {
164		ip := from.Get(i)
165		switch d.insts[ip].op {
166		case OpMatch:
167			isMatch = true
168		case OpRange:
169			if d.insts[ip].rangeStart <= b &&
170				b <= d.insts[ip].rangeEnd {
171				d.add(to, ip+1)
172			}
173		}
174	}
175	return isMatch
176}
177
178type state struct {
179	insts []uint
180	next  []int
181	match bool
182}
183
184type intStack []int
185
186func (s intStack) Push(v int) intStack {
187	return append(s, v)
188}
189
190func (s intStack) Pop() (intStack, int) {
191	l := len(s)
192	if l < 1 {
193		return s, 0
194	}
195	return s[:l-1], s[l-1]
196}
197