1// Copyright 2011 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5package regexp 6 7import ( 8 "io" 9 "regexp/syntax" 10) 11 12// A queue is a 'sparse array' holding pending threads of execution. 13// See http://research.swtch.com/2008/03/using-uninitialized-memory-for-fun-and.html 14type queue struct { 15 sparse []uint32 16 dense []entry 17} 18 19// An entry is an entry on a queue. 20// It holds both the instruction pc and the actual thread. 21// Some queue entries are just place holders so that the machine 22// knows it has considered that pc. Such entries have t == nil. 23type entry struct { 24 pc uint32 25 t *thread 26} 27 28// A thread is the state of a single path through the machine: 29// an instruction and a corresponding capture array. 30// See http://swtch.com/~rsc/regexp/regexp2.html 31type thread struct { 32 inst *syntax.Inst 33 cap []int 34} 35 36// A machine holds all the state during an NFA simulation for p. 37type machine struct { 38 re *Regexp // corresponding Regexp 39 p *syntax.Prog // compiled program 40 op *onePassProg // compiled onepass program, or notOnePass 41 maxBitStateLen int // max length of string to search with bitstate 42 b *bitState // state for backtracker, allocated lazily 43 q0, q1 queue // two queues for runq, nextq 44 pool []*thread // pool of available threads 45 matched bool // whether a match was found 46 matchcap []int // capture information for the match 47 48 // cached inputs, to avoid allocation 49 inputBytes inputBytes 50 inputString inputString 51 inputReader inputReader 52} 53 54func (m *machine) newInputBytes(b []byte) input { 55 m.inputBytes.str = b 56 return &m.inputBytes 57} 58 59func (m *machine) newInputString(s string) input { 60 m.inputString.str = s 61 return &m.inputString 62} 63 64func (m *machine) newInputReader(r io.RuneReader) input { 65 m.inputReader.r = r 66 m.inputReader.atEOT = false 67 m.inputReader.pos = 0 68 return &m.inputReader 69} 70 71// progMachine returns a new machine running the prog p. 72func progMachine(p *syntax.Prog, op *onePassProg) *machine { 73 m := &machine{p: p, op: op} 74 n := len(m.p.Inst) 75 m.q0 = queue{make([]uint32, n), make([]entry, 0, n)} 76 m.q1 = queue{make([]uint32, n), make([]entry, 0, n)} 77 ncap := p.NumCap 78 if ncap < 2 { 79 ncap = 2 80 } 81 if op == notOnePass { 82 m.maxBitStateLen = maxBitStateLen(p) 83 } 84 m.matchcap = make([]int, ncap) 85 return m 86} 87 88func (m *machine) init(ncap int) { 89 for _, t := range m.pool { 90 t.cap = t.cap[:ncap] 91 } 92 m.matchcap = m.matchcap[:ncap] 93} 94 95// alloc allocates a new thread with the given instruction. 96// It uses the free pool if possible. 97func (m *machine) alloc(i *syntax.Inst) *thread { 98 var t *thread 99 if n := len(m.pool); n > 0 { 100 t = m.pool[n-1] 101 m.pool = m.pool[:n-1] 102 } else { 103 t = new(thread) 104 t.cap = make([]int, len(m.matchcap), cap(m.matchcap)) 105 } 106 t.inst = i 107 return t 108} 109 110// match runs the machine over the input starting at pos. 111// It reports whether a match was found. 112// If so, m.matchcap holds the submatch information. 113func (m *machine) match(i input, pos int) bool { 114 startCond := m.re.cond 115 if startCond == ^syntax.EmptyOp(0) { // impossible 116 return false 117 } 118 m.matched = false 119 for i := range m.matchcap { 120 m.matchcap[i] = -1 121 } 122 runq, nextq := &m.q0, &m.q1 123 r, r1 := endOfText, endOfText 124 width, width1 := 0, 0 125 r, width = i.step(pos) 126 if r != endOfText { 127 r1, width1 = i.step(pos + width) 128 } 129 var flag syntax.EmptyOp 130 if pos == 0 { 131 flag = syntax.EmptyOpContext(-1, r) 132 } else { 133 flag = i.context(pos) 134 } 135 for { 136 if len(runq.dense) == 0 { 137 if startCond&syntax.EmptyBeginText != 0 && pos != 0 { 138 // Anchored match, past beginning of text. 139 break 140 } 141 if m.matched { 142 // Have match; finished exploring alternatives. 143 break 144 } 145 if len(m.re.prefix) > 0 && r1 != m.re.prefixRune && i.canCheckPrefix() { 146 // Match requires literal prefix; fast search for it. 147 advance := i.index(m.re, pos) 148 if advance < 0 { 149 break 150 } 151 pos += advance 152 r, width = i.step(pos) 153 r1, width1 = i.step(pos + width) 154 } 155 } 156 if !m.matched { 157 if len(m.matchcap) > 0 { 158 m.matchcap[0] = pos 159 } 160 m.add(runq, uint32(m.p.Start), pos, m.matchcap, flag, nil) 161 } 162 flag = syntax.EmptyOpContext(r, r1) 163 m.step(runq, nextq, pos, pos+width, r, flag) 164 if width == 0 { 165 break 166 } 167 if len(m.matchcap) == 0 && m.matched { 168 // Found a match and not paying attention 169 // to where it is, so any match will do. 170 break 171 } 172 pos += width 173 r, width = r1, width1 174 if r != endOfText { 175 r1, width1 = i.step(pos + width) 176 } 177 runq, nextq = nextq, runq 178 } 179 m.clear(nextq) 180 return m.matched 181} 182 183// clear frees all threads on the thread queue. 184func (m *machine) clear(q *queue) { 185 for _, d := range q.dense { 186 if d.t != nil { 187 m.pool = append(m.pool, d.t) 188 } 189 } 190 q.dense = q.dense[:0] 191} 192 193// step executes one step of the machine, running each of the threads 194// on runq and appending new threads to nextq. 195// The step processes the rune c (which may be endOfText), 196// which starts at position pos and ends at nextPos. 197// nextCond gives the setting for the empty-width flags after c. 198func (m *machine) step(runq, nextq *queue, pos, nextPos int, c rune, nextCond syntax.EmptyOp) { 199 longest := m.re.longest 200 for j := 0; j < len(runq.dense); j++ { 201 d := &runq.dense[j] 202 t := d.t 203 if t == nil { 204 continue 205 } 206 if longest && m.matched && len(t.cap) > 0 && m.matchcap[0] < t.cap[0] { 207 m.pool = append(m.pool, t) 208 continue 209 } 210 i := t.inst 211 add := false 212 switch i.Op { 213 default: 214 panic("bad inst") 215 216 case syntax.InstMatch: 217 if len(t.cap) > 0 && (!longest || !m.matched || m.matchcap[1] < pos) { 218 t.cap[1] = pos 219 copy(m.matchcap, t.cap) 220 } 221 if !longest { 222 // First-match mode: cut off all lower-priority threads. 223 for _, d := range runq.dense[j+1:] { 224 if d.t != nil { 225 m.pool = append(m.pool, d.t) 226 } 227 } 228 runq.dense = runq.dense[:0] 229 } 230 m.matched = true 231 232 case syntax.InstRune: 233 add = i.MatchRune(c) 234 case syntax.InstRune1: 235 add = c == i.Rune[0] 236 case syntax.InstRuneAny: 237 add = true 238 case syntax.InstRuneAnyNotNL: 239 add = c != '\n' 240 } 241 if add { 242 t = m.add(nextq, i.Out, nextPos, t.cap, nextCond, t) 243 } 244 if t != nil { 245 m.pool = append(m.pool, t) 246 } 247 } 248 runq.dense = runq.dense[:0] 249} 250 251// add adds an entry to q for pc, unless the q already has such an entry. 252// It also recursively adds an entry for all instructions reachable from pc by following 253// empty-width conditions satisfied by cond. pos gives the current position 254// in the input. 255func (m *machine) add(q *queue, pc uint32, pos int, cap []int, cond syntax.EmptyOp, t *thread) *thread { 256 if pc == 0 { 257 return t 258 } 259 if j := q.sparse[pc]; j < uint32(len(q.dense)) && q.dense[j].pc == pc { 260 return t 261 } 262 263 j := len(q.dense) 264 q.dense = q.dense[:j+1] 265 d := &q.dense[j] 266 d.t = nil 267 d.pc = pc 268 q.sparse[pc] = uint32(j) 269 270 i := &m.p.Inst[pc] 271 switch i.Op { 272 default: 273 panic("unhandled") 274 case syntax.InstFail: 275 // nothing 276 case syntax.InstAlt, syntax.InstAltMatch: 277 t = m.add(q, i.Out, pos, cap, cond, t) 278 t = m.add(q, i.Arg, pos, cap, cond, t) 279 case syntax.InstEmptyWidth: 280 if syntax.EmptyOp(i.Arg)&^cond == 0 { 281 t = m.add(q, i.Out, pos, cap, cond, t) 282 } 283 case syntax.InstNop: 284 t = m.add(q, i.Out, pos, cap, cond, t) 285 case syntax.InstCapture: 286 if int(i.Arg) < len(cap) { 287 opos := cap[i.Arg] 288 cap[i.Arg] = pos 289 m.add(q, i.Out, pos, cap, cond, nil) 290 cap[i.Arg] = opos 291 } else { 292 t = m.add(q, i.Out, pos, cap, cond, t) 293 } 294 case syntax.InstMatch, syntax.InstRune, syntax.InstRune1, syntax.InstRuneAny, syntax.InstRuneAnyNotNL: 295 if t == nil { 296 t = m.alloc(i) 297 } else { 298 t.inst = i 299 } 300 if len(cap) > 0 && &t.cap[0] != &cap[0] { 301 copy(t.cap, cap) 302 } 303 d.t = t 304 t = nil 305 } 306 return t 307} 308 309// onepass runs the machine over the input starting at pos. 310// It reports whether a match was found. 311// If so, m.matchcap holds the submatch information. 312// ncap is the number of captures. 313func (m *machine) onepass(i input, pos, ncap int) bool { 314 startCond := m.re.cond 315 if startCond == ^syntax.EmptyOp(0) { // impossible 316 return false 317 } 318 m.matched = false 319 m.matchcap = m.matchcap[:ncap] 320 for i := range m.matchcap { 321 m.matchcap[i] = -1 322 } 323 r, r1 := endOfText, endOfText 324 width, width1 := 0, 0 325 r, width = i.step(pos) 326 if r != endOfText { 327 r1, width1 = i.step(pos + width) 328 } 329 var flag syntax.EmptyOp 330 if pos == 0 { 331 flag = syntax.EmptyOpContext(-1, r) 332 } else { 333 flag = i.context(pos) 334 } 335 pc := m.op.Start 336 inst := m.op.Inst[pc] 337 // If there is a simple literal prefix, skip over it. 338 if pos == 0 && syntax.EmptyOp(inst.Arg)&^flag == 0 && 339 len(m.re.prefix) > 0 && i.canCheckPrefix() { 340 // Match requires literal prefix; fast search for it. 341 if !i.hasPrefix(m.re) { 342 return m.matched 343 } 344 pos += len(m.re.prefix) 345 r, width = i.step(pos) 346 r1, width1 = i.step(pos + width) 347 flag = i.context(pos) 348 pc = int(m.re.prefixEnd) 349 } 350 for { 351 inst = m.op.Inst[pc] 352 pc = int(inst.Out) 353 switch inst.Op { 354 default: 355 panic("bad inst") 356 case syntax.InstMatch: 357 m.matched = true 358 if len(m.matchcap) > 0 { 359 m.matchcap[0] = 0 360 m.matchcap[1] = pos 361 } 362 return m.matched 363 case syntax.InstRune: 364 if !inst.MatchRune(r) { 365 return m.matched 366 } 367 case syntax.InstRune1: 368 if r != inst.Rune[0] { 369 return m.matched 370 } 371 case syntax.InstRuneAny: 372 // Nothing 373 case syntax.InstRuneAnyNotNL: 374 if r == '\n' { 375 return m.matched 376 } 377 // peek at the input rune to see which branch of the Alt to take 378 case syntax.InstAlt, syntax.InstAltMatch: 379 pc = int(onePassNext(&inst, r)) 380 continue 381 case syntax.InstFail: 382 return m.matched 383 case syntax.InstNop: 384 continue 385 case syntax.InstEmptyWidth: 386 if syntax.EmptyOp(inst.Arg)&^flag != 0 { 387 return m.matched 388 } 389 continue 390 case syntax.InstCapture: 391 if int(inst.Arg) < len(m.matchcap) { 392 m.matchcap[inst.Arg] = pos 393 } 394 continue 395 } 396 if width == 0 { 397 break 398 } 399 flag = syntax.EmptyOpContext(r, r1) 400 pos += width 401 r, width = r1, width1 402 if r != endOfText { 403 r1, width1 = i.step(pos + width) 404 } 405 } 406 return m.matched 407} 408 409// doMatch reports whether either r, b or s match the regexp. 410func (re *Regexp) doMatch(r io.RuneReader, b []byte, s string) bool { 411 return re.doExecute(r, b, s, 0, 0, nil) != nil 412} 413 414// doExecute finds the leftmost match in the input, appends the position 415// of its subexpressions to dstCap and returns dstCap. 416// 417// nil is returned if no matches are found and non-nil if matches are found. 418func (re *Regexp) doExecute(r io.RuneReader, b []byte, s string, pos int, ncap int, dstCap []int) []int { 419 m := re.get() 420 var i input 421 var size int 422 if r != nil { 423 i = m.newInputReader(r) 424 } else if b != nil { 425 i = m.newInputBytes(b) 426 size = len(b) 427 } else { 428 i = m.newInputString(s) 429 size = len(s) 430 } 431 if m.op != notOnePass { 432 if !m.onepass(i, pos, ncap) { 433 re.put(m) 434 return nil 435 } 436 } else if size < m.maxBitStateLen && r == nil { 437 if m.b == nil { 438 m.b = newBitState(m.p) 439 } 440 if !m.backtrack(i, pos, size, ncap) { 441 re.put(m) 442 return nil 443 } 444 } else { 445 m.init(ncap) 446 if !m.match(i, pos) { 447 re.put(m) 448 return nil 449 } 450 } 451 dstCap = append(dstCap, m.matchcap...) 452 if dstCap == nil { 453 // Keep the promise of returning non-nil value on match. 454 dstCap = arrayNoInts[:0] 455 } 456 re.put(m) 457 return dstCap 458} 459 460// arrayNoInts is returned by doExecute match if nil dstCap is passed 461// to it with ncap=0. 462var arrayNoInts [0]int 463