1// Copyright 2011 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5package syntax 6 7import ( 8 "bytes" 9 "strconv" 10 "unicode" 11) 12 13// Compiled program. 14// May not belong in this package, but convenient for now. 15 16// A Prog is a compiled regular expression program. 17type Prog struct { 18 Inst []Inst 19 Start int // index of start instruction 20 NumCap int // number of InstCapture insts in re 21} 22 23// An InstOp is an instruction opcode. 24type InstOp uint8 25 26const ( 27 InstAlt InstOp = iota 28 InstAltMatch 29 InstCapture 30 InstEmptyWidth 31 InstMatch 32 InstFail 33 InstNop 34 InstRune 35 InstRune1 36 InstRuneAny 37 InstRuneAnyNotNL 38) 39 40var instOpNames = []string{ 41 "InstAlt", 42 "InstAltMatch", 43 "InstCapture", 44 "InstEmptyWidth", 45 "InstMatch", 46 "InstFail", 47 "InstNop", 48 "InstRune", 49 "InstRune1", 50 "InstRuneAny", 51 "InstRuneAnyNotNL", 52} 53 54func (i InstOp) String() string { 55 if uint(i) >= uint(len(instOpNames)) { 56 return "" 57 } 58 return instOpNames[i] 59} 60 61// An EmptyOp specifies a kind or mixture of zero-width assertions. 62type EmptyOp uint8 63 64const ( 65 EmptyBeginLine EmptyOp = 1 << iota 66 EmptyEndLine 67 EmptyBeginText 68 EmptyEndText 69 EmptyWordBoundary 70 EmptyNoWordBoundary 71) 72 73// EmptyOpContext returns the zero-width assertions 74// satisfied at the position between the runes r1 and r2. 75// Passing r1 == -1 indicates that the position is 76// at the beginning of the text. 77// Passing r2 == -1 indicates that the position is 78// at the end of the text. 79func EmptyOpContext(r1, r2 rune) EmptyOp { 80 var op EmptyOp = EmptyNoWordBoundary 81 var boundary byte 82 switch { 83 case IsWordChar(r1): 84 boundary = 1 85 case r1 == '\n': 86 op |= EmptyBeginLine 87 case r1 < 0: 88 op |= EmptyBeginText | EmptyBeginLine 89 } 90 switch { 91 case IsWordChar(r2): 92 boundary ^= 1 93 case r2 == '\n': 94 op |= EmptyEndLine 95 case r2 < 0: 96 op |= EmptyEndText | EmptyEndLine 97 } 98 if boundary != 0 { // IsWordChar(r1) != IsWordChar(r2) 99 op ^= (EmptyWordBoundary | EmptyNoWordBoundary) 100 } 101 return op 102} 103 104// IsWordChar reports whether r is consider a ``word character'' 105// during the evaluation of the \b and \B zero-width assertions. 106// These assertions are ASCII-only: the word characters are [A-Za-z0-9_]. 107func IsWordChar(r rune) bool { 108 return 'A' <= r && r <= 'Z' || 'a' <= r && r <= 'z' || '0' <= r && r <= '9' || r == '_' 109} 110 111// An Inst is a single instruction in a regular expression program. 112type Inst struct { 113 Op InstOp 114 Out uint32 // all but InstMatch, InstFail 115 Arg uint32 // InstAlt, InstAltMatch, InstCapture, InstEmptyWidth 116 Rune []rune 117} 118 119func (p *Prog) String() string { 120 var b bytes.Buffer 121 dumpProg(&b, p) 122 return b.String() 123} 124 125// skipNop follows any no-op or capturing instructions 126// and returns the resulting pc. 127func (p *Prog) skipNop(pc uint32) (*Inst, uint32) { 128 i := &p.Inst[pc] 129 for i.Op == InstNop || i.Op == InstCapture { 130 pc = i.Out 131 i = &p.Inst[pc] 132 } 133 return i, pc 134} 135 136// op returns i.Op but merges all the Rune special cases into InstRune 137func (i *Inst) op() InstOp { 138 op := i.Op 139 switch op { 140 case InstRune1, InstRuneAny, InstRuneAnyNotNL: 141 op = InstRune 142 } 143 return op 144} 145 146// Prefix returns a literal string that all matches for the 147// regexp must start with. Complete is true if the prefix 148// is the entire match. 149func (p *Prog) Prefix() (prefix string, complete bool) { 150 i, _ := p.skipNop(uint32(p.Start)) 151 152 // Avoid allocation of buffer if prefix is empty. 153 if i.op() != InstRune || len(i.Rune) != 1 { 154 return "", i.Op == InstMatch 155 } 156 157 // Have prefix; gather characters. 158 var buf bytes.Buffer 159 for i.op() == InstRune && len(i.Rune) == 1 && Flags(i.Arg)&FoldCase == 0 { 160 buf.WriteRune(i.Rune[0]) 161 i, _ = p.skipNop(i.Out) 162 } 163 return buf.String(), i.Op == InstMatch 164} 165 166// StartCond returns the leading empty-width conditions that must 167// be true in any match. It returns ^EmptyOp(0) if no matches are possible. 168func (p *Prog) StartCond() EmptyOp { 169 var flag EmptyOp 170 pc := uint32(p.Start) 171 i := &p.Inst[pc] 172Loop: 173 for { 174 switch i.Op { 175 case InstEmptyWidth: 176 flag |= EmptyOp(i.Arg) 177 case InstFail: 178 return ^EmptyOp(0) 179 case InstCapture, InstNop: 180 // skip 181 default: 182 break Loop 183 } 184 pc = i.Out 185 i = &p.Inst[pc] 186 } 187 return flag 188} 189 190const noMatch = -1 191 192// MatchRune reports whether the instruction matches (and consumes) r. 193// It should only be called when i.Op == InstRune. 194func (i *Inst) MatchRune(r rune) bool { 195 return i.MatchRunePos(r) != noMatch 196} 197 198// MatchRunePos checks whether the instruction matches (and consumes) r. 199// If so, MatchRunePos returns the index of the matching rune pair 200// (or, when len(i.Rune) == 1, rune singleton). 201// If not, MatchRunePos returns -1. 202// MatchRunePos should only be called when i.Op == InstRune. 203func (i *Inst) MatchRunePos(r rune) int { 204 rune := i.Rune 205 206 // Special case: single-rune slice is from literal string, not char class. 207 if len(rune) == 1 { 208 r0 := rune[0] 209 if r == r0 { 210 return 0 211 } 212 if Flags(i.Arg)&FoldCase != 0 { 213 for r1 := unicode.SimpleFold(r0); r1 != r0; r1 = unicode.SimpleFold(r1) { 214 if r == r1 { 215 return 0 216 } 217 } 218 } 219 return noMatch 220 } 221 222 // Peek at the first few pairs. 223 // Should handle ASCII well. 224 for j := 0; j < len(rune) && j <= 8; j += 2 { 225 if r < rune[j] { 226 return noMatch 227 } 228 if r <= rune[j+1] { 229 return j / 2 230 } 231 } 232 233 // Otherwise binary search. 234 lo := 0 235 hi := len(rune) / 2 236 for lo < hi { 237 m := lo + (hi-lo)/2 238 if c := rune[2*m]; c <= r { 239 if r <= rune[2*m+1] { 240 return m 241 } 242 lo = m + 1 243 } else { 244 hi = m 245 } 246 } 247 return noMatch 248} 249 250// MatchEmptyWidth reports whether the instruction matches 251// an empty string between the runes before and after. 252// It should only be called when i.Op == InstEmptyWidth. 253func (i *Inst) MatchEmptyWidth(before rune, after rune) bool { 254 switch EmptyOp(i.Arg) { 255 case EmptyBeginLine: 256 return before == '\n' || before == -1 257 case EmptyEndLine: 258 return after == '\n' || after == -1 259 case EmptyBeginText: 260 return before == -1 261 case EmptyEndText: 262 return after == -1 263 case EmptyWordBoundary: 264 return IsWordChar(before) != IsWordChar(after) 265 case EmptyNoWordBoundary: 266 return IsWordChar(before) == IsWordChar(after) 267 } 268 panic("unknown empty width arg") 269} 270 271func (i *Inst) String() string { 272 var b bytes.Buffer 273 dumpInst(&b, i) 274 return b.String() 275} 276 277func bw(b *bytes.Buffer, args ...string) { 278 for _, s := range args { 279 b.WriteString(s) 280 } 281} 282 283func dumpProg(b *bytes.Buffer, p *Prog) { 284 for j := range p.Inst { 285 i := &p.Inst[j] 286 pc := strconv.Itoa(j) 287 if len(pc) < 3 { 288 b.WriteString(" "[len(pc):]) 289 } 290 if j == p.Start { 291 pc += "*" 292 } 293 bw(b, pc, "\t") 294 dumpInst(b, i) 295 bw(b, "\n") 296 } 297} 298 299func u32(i uint32) string { 300 return strconv.FormatUint(uint64(i), 10) 301} 302 303func dumpInst(b *bytes.Buffer, i *Inst) { 304 switch i.Op { 305 case InstAlt: 306 bw(b, "alt -> ", u32(i.Out), ", ", u32(i.Arg)) 307 case InstAltMatch: 308 bw(b, "altmatch -> ", u32(i.Out), ", ", u32(i.Arg)) 309 case InstCapture: 310 bw(b, "cap ", u32(i.Arg), " -> ", u32(i.Out)) 311 case InstEmptyWidth: 312 bw(b, "empty ", u32(i.Arg), " -> ", u32(i.Out)) 313 case InstMatch: 314 bw(b, "match") 315 case InstFail: 316 bw(b, "fail") 317 case InstNop: 318 bw(b, "nop -> ", u32(i.Out)) 319 case InstRune: 320 if i.Rune == nil { 321 // shouldn't happen 322 bw(b, "rune <nil>") 323 } 324 bw(b, "rune ", strconv.QuoteToASCII(string(i.Rune))) 325 if Flags(i.Arg)&FoldCase != 0 { 326 bw(b, "/i") 327 } 328 bw(b, " -> ", u32(i.Out)) 329 case InstRune1: 330 bw(b, "rune1 ", strconv.QuoteToASCII(string(i.Rune)), " -> ", u32(i.Out)) 331 case InstRuneAny: 332 bw(b, "any -> ", u32(i.Out)) 333 case InstRuneAnyNotNL: 334 bw(b, "anynotnl -> ", u32(i.Out)) 335 } 336} 337