1// Copyright (c) 2017 Couchbase, Inc. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package regexp 16 17import ( 18 "encoding/binary" 19 "fmt" 20) 21 22// StateLimit is the maximum number of states allowed 23const StateLimit = 10000 24 25// ErrTooManyStates is returned if you attempt to build a Levenshtein 26// automaton which requires too many states. 27var ErrTooManyStates = fmt.Errorf("dfa contains more than %d states", 28 StateLimit) 29 30type dfaBuilder struct { 31 dfa *dfa 32 cache map[string]int 33 keyBuf []byte 34} 35 36func newDfaBuilder(insts prog) *dfaBuilder { 37 d := &dfaBuilder{ 38 dfa: &dfa{ 39 insts: insts, 40 states: make([]state, 0, 16), 41 }, 42 cache: make(map[string]int, 1024), 43 } 44 // add 0 state that is invalid 45 d.dfa.states = append(d.dfa.states, state{ 46 next: make([]int, 256), 47 match: false, 48 }) 49 return d 50} 51 52func (d *dfaBuilder) build() (*dfa, error) { 53 cur := newSparseSet(uint(len(d.dfa.insts))) 54 next := newSparseSet(uint(len(d.dfa.insts))) 55 56 d.dfa.add(cur, 0) 57 ns, instsReuse := d.cachedState(cur, nil) 58 states := intStack{ns} 59 seen := make(map[int]struct{}) 60 var s int 61 states, s = states.Pop() 62 for s != 0 { 63 for b := 0; b < 256; b++ { 64 var ns int 65 ns, instsReuse = d.runState(cur, next, s, byte(b), instsReuse) 66 if ns != 0 { 67 if _, ok := seen[ns]; !ok { 68 seen[ns] = struct{}{} 69 states = states.Push(ns) 70 } 71 } 72 if len(d.dfa.states) > StateLimit { 73 return nil, ErrTooManyStates 74 } 75 } 76 states, s = states.Pop() 77 } 78 return d.dfa, nil 79} 80 81func (d *dfaBuilder) runState(cur, next *sparseSet, state int, b byte, instsReuse []uint) ( 82 int, []uint) { 83 cur.Clear() 84 for _, ip := range d.dfa.states[state].insts { 85 cur.Add(ip) 86 } 87 d.dfa.run(cur, next, b) 88 var nextState int 89 nextState, instsReuse = d.cachedState(next, instsReuse) 90 d.dfa.states[state].next[b] = nextState 91 return nextState, instsReuse 92} 93 94func instsKey(insts []uint, buf []byte) []byte { 95 if cap(buf) < 8*len(insts) { 96 buf = make([]byte, 8*len(insts)) 97 } else { 98 buf = buf[0 : 8*len(insts)] 99 } 100 for i, inst := range insts { 101 binary.LittleEndian.PutUint64(buf[i*8:], uint64(inst)) 102 } 103 return buf 104} 105 106func (d *dfaBuilder) cachedState(set *sparseSet, 107 instsReuse []uint) (int, []uint) { 108 insts := instsReuse[:0] 109 if cap(insts) == 0 { 110 insts = make([]uint, 0, set.Len()) 111 } 112 var isMatch bool 113 for i := uint(0); i < uint(set.Len()); i++ { 114 ip := set.Get(i) 115 switch d.dfa.insts[ip].op { 116 case OpRange: 117 insts = append(insts, ip) 118 case OpMatch: 119 isMatch = true 120 insts = append(insts, ip) 121 } 122 } 123 if len(insts) == 0 { 124 return 0, insts 125 } 126 d.keyBuf = instsKey(insts, d.keyBuf) 127 v, ok := d.cache[string(d.keyBuf)] 128 if ok { 129 return v, insts 130 } 131 d.dfa.states = append(d.dfa.states, state{ 132 insts: insts, 133 next: make([]int, 256), 134 match: isMatch, 135 }) 136 newV := len(d.dfa.states) - 1 137 d.cache[string(d.keyBuf)] = newV 138 return newV, nil 139} 140 141type dfa struct { 142 insts prog 143 states []state 144} 145 146func (d *dfa) add(set *sparseSet, ip uint) { 147 if set.Contains(ip) { 148 return 149 } 150 set.Add(ip) 151 switch d.insts[ip].op { 152 case OpJmp: 153 d.add(set, d.insts[ip].to) 154 case OpSplit: 155 d.add(set, d.insts[ip].splitA) 156 d.add(set, d.insts[ip].splitB) 157 } 158} 159 160func (d *dfa) run(from, to *sparseSet, b byte) bool { 161 to.Clear() 162 var isMatch bool 163 for i := uint(0); i < uint(from.Len()); i++ { 164 ip := from.Get(i) 165 switch d.insts[ip].op { 166 case OpMatch: 167 isMatch = true 168 case OpRange: 169 if d.insts[ip].rangeStart <= b && 170 b <= d.insts[ip].rangeEnd { 171 d.add(to, ip+1) 172 } 173 } 174 } 175 return isMatch 176} 177 178type state struct { 179 insts []uint 180 next []int 181 match bool 182} 183 184type intStack []int 185 186func (s intStack) Push(v int) intStack { 187 return append(s, v) 188} 189 190func (s intStack) Pop() (intStack, int) { 191 l := len(s) 192 if l < 1 { 193 return s, 0 194 } 195 return s[:l-1], s[l-1] 196} 197