1package syntax 2 3import ( 4 "bytes" 5 "fmt" 6 "math" 7) 8 9// similar to prog.go in the go regex package...also with comment 'may not belong in this package' 10 11// File provides operator constants for use by the Builder and the Machine. 12 13// Implementation notes: 14// 15// Regexps are built into RegexCodes, which contain an operation array, 16// a string table, and some constants. 17// 18// Each operation is one of the codes below, followed by the integer 19// operands specified for each op. 20// 21// Strings and sets are indices into a string table. 22 23type InstOp int 24 25const ( 26 // lef/back operands description 27 28 Onerep InstOp = 0 // lef,back char,min,max a {n} 29 Notonerep = 1 // lef,back char,min,max .{n} 30 Setrep = 2 // lef,back set,min,max [\d]{n} 31 32 Oneloop = 3 // lef,back char,min,max a {,n} 33 Notoneloop = 4 // lef,back char,min,max .{,n} 34 Setloop = 5 // lef,back set,min,max [\d]{,n} 35 36 Onelazy = 6 // lef,back char,min,max a {,n}? 37 Notonelazy = 7 // lef,back char,min,max .{,n}? 38 Setlazy = 8 // lef,back set,min,max [\d]{,n}? 39 40 One = 9 // lef char a 41 Notone = 10 // lef char [^a] 42 Set = 11 // lef set [a-z\s] \w \s \d 43 44 Multi = 12 // lef string abcd 45 Ref = 13 // lef group \# 46 47 Bol = 14 // ^ 48 Eol = 15 // $ 49 Boundary = 16 // \b 50 Nonboundary = 17 // \B 51 Beginning = 18 // \A 52 Start = 19 // \G 53 EndZ = 20 // \Z 54 End = 21 // \Z 55 56 Nothing = 22 // Reject! 57 58 // Primitive control structures 59 60 Lazybranch = 23 // back jump straight first 61 Branchmark = 24 // back jump branch first for loop 62 Lazybranchmark = 25 // back jump straight first for loop 63 Nullcount = 26 // back val set counter, null mark 64 Setcount = 27 // back val set counter, make mark 65 Branchcount = 28 // back jump,limit branch++ if zero<=c<limit 66 Lazybranchcount = 29 // back jump,limit same, but straight first 67 Nullmark = 30 // back save position 68 Setmark = 31 // back save position 69 Capturemark = 32 // back group define group 70 Getmark = 33 // back recall position 71 Setjump = 34 // back save backtrack state 72 Backjump = 35 // zap back to saved state 73 Forejump = 36 // zap backtracking state 74 Testref = 37 // backtrack if ref undefined 75 Goto = 38 // jump just go 76 77 Prune = 39 // prune it baby 78 Stop = 40 // done! 79 80 ECMABoundary = 41 // \b 81 NonECMABoundary = 42 // \B 82 83 // Modifiers for alternate modes 84 85 Mask = 63 // Mask to get unmodified ordinary operator 86 Rtl = 64 // bit to indicate that we're reverse scanning. 87 Back = 128 // bit to indicate that we're backtracking. 88 Back2 = 256 // bit to indicate that we're backtracking on a second branch. 89 Ci = 512 // bit to indicate that we're case-insensitive. 90) 91 92type Code struct { 93 Codes []int // the code 94 Strings [][]rune // string table 95 Sets []*CharSet //character set table 96 TrackCount int // how many instructions use backtracking 97 Caps map[int]int // mapping of user group numbers -> impl group slots 98 Capsize int // number of impl group slots 99 FcPrefix *Prefix // the set of candidate first characters (may be null) 100 BmPrefix *BmPrefix // the fixed prefix string as a Boyer-Moore machine (may be null) 101 Anchors AnchorLoc // the set of zero-length start anchors (RegexFCD.Bol, etc) 102 RightToLeft bool // true if right to left 103} 104 105func opcodeBacktracks(op InstOp) bool { 106 op &= Mask 107 108 switch op { 109 case Oneloop, Notoneloop, Setloop, Onelazy, Notonelazy, Setlazy, Lazybranch, Branchmark, Lazybranchmark, 110 Nullcount, Setcount, Branchcount, Lazybranchcount, Setmark, Capturemark, Getmark, Setjump, Backjump, 111 Forejump, Goto: 112 return true 113 114 default: 115 return false 116 } 117} 118 119func opcodeSize(op InstOp) int { 120 op &= Mask 121 122 switch op { 123 case Nothing, Bol, Eol, Boundary, Nonboundary, ECMABoundary, NonECMABoundary, Beginning, Start, EndZ, 124 End, Nullmark, Setmark, Getmark, Setjump, Backjump, Forejump, Stop: 125 return 1 126 127 case One, Notone, Multi, Ref, Testref, Goto, Nullcount, Setcount, Lazybranch, Branchmark, Lazybranchmark, 128 Prune, Set: 129 return 2 130 131 case Capturemark, Branchcount, Lazybranchcount, Onerep, Notonerep, Oneloop, Notoneloop, Onelazy, Notonelazy, 132 Setlazy, Setrep, Setloop: 133 return 3 134 135 default: 136 panic(fmt.Errorf("Unexpected op code: %v", op)) 137 } 138} 139 140var codeStr = []string{ 141 "Onerep", "Notonerep", "Setrep", 142 "Oneloop", "Notoneloop", "Setloop", 143 "Onelazy", "Notonelazy", "Setlazy", 144 "One", "Notone", "Set", 145 "Multi", "Ref", 146 "Bol", "Eol", "Boundary", "Nonboundary", "Beginning", "Start", "EndZ", "End", 147 "Nothing", 148 "Lazybranch", "Branchmark", "Lazybranchmark", 149 "Nullcount", "Setcount", "Branchcount", "Lazybranchcount", 150 "Nullmark", "Setmark", "Capturemark", "Getmark", 151 "Setjump", "Backjump", "Forejump", "Testref", "Goto", 152 "Prune", "Stop", 153 "ECMABoundary", "NonECMABoundary", 154} 155 156func operatorDescription(op InstOp) string { 157 desc := codeStr[op&Mask] 158 if (op & Ci) != 0 { 159 desc += "-Ci" 160 } 161 if (op & Rtl) != 0 { 162 desc += "-Rtl" 163 } 164 if (op & Back) != 0 { 165 desc += "-Back" 166 } 167 if (op & Back2) != 0 { 168 desc += "-Back2" 169 } 170 171 return desc 172} 173 174// OpcodeDescription is a humman readable string of the specific offset 175func (c *Code) OpcodeDescription(offset int) string { 176 buf := &bytes.Buffer{} 177 178 op := InstOp(c.Codes[offset]) 179 fmt.Fprintf(buf, "%06d ", offset) 180 181 if opcodeBacktracks(op & Mask) { 182 buf.WriteString("*") 183 } else { 184 buf.WriteString(" ") 185 } 186 buf.WriteString(operatorDescription(op)) 187 buf.WriteString("(") 188 op &= Mask 189 190 switch op { 191 case One, Notone, Onerep, Notonerep, Oneloop, Notoneloop, Onelazy, Notonelazy: 192 buf.WriteString("Ch = ") 193 buf.WriteString(CharDescription(rune(c.Codes[offset+1]))) 194 195 case Set, Setrep, Setloop, Setlazy: 196 buf.WriteString("Set = ") 197 buf.WriteString(c.Sets[c.Codes[offset+1]].String()) 198 199 case Multi: 200 fmt.Fprintf(buf, "String = %s", string(c.Strings[c.Codes[offset+1]])) 201 202 case Ref, Testref: 203 fmt.Fprintf(buf, "Index = %d", c.Codes[offset+1]) 204 205 case Capturemark: 206 fmt.Fprintf(buf, "Index = %d", c.Codes[offset+1]) 207 if c.Codes[offset+2] != -1 { 208 fmt.Fprintf(buf, ", Unindex = %d", c.Codes[offset+2]) 209 } 210 211 case Nullcount, Setcount: 212 fmt.Fprintf(buf, "Value = %d", c.Codes[offset+1]) 213 214 case Goto, Lazybranch, Branchmark, Lazybranchmark, Branchcount, Lazybranchcount: 215 fmt.Fprintf(buf, "Addr = %d", c.Codes[offset+1]) 216 } 217 218 switch op { 219 case Onerep, Notonerep, Oneloop, Notoneloop, Onelazy, Notonelazy, Setrep, Setloop, Setlazy: 220 buf.WriteString(", Rep = ") 221 if c.Codes[offset+2] == math.MaxInt32 { 222 buf.WriteString("inf") 223 } else { 224 fmt.Fprintf(buf, "%d", c.Codes[offset+2]) 225 } 226 227 case Branchcount, Lazybranchcount: 228 buf.WriteString(", Limit = ") 229 if c.Codes[offset+2] == math.MaxInt32 { 230 buf.WriteString("inf") 231 } else { 232 fmt.Fprintf(buf, "%d", c.Codes[offset+2]) 233 } 234 235 } 236 237 buf.WriteString(")") 238 239 return buf.String() 240} 241 242func (c *Code) Dump() string { 243 buf := &bytes.Buffer{} 244 245 if c.RightToLeft { 246 fmt.Fprintln(buf, "Direction: right-to-left") 247 } else { 248 fmt.Fprintln(buf, "Direction: left-to-right") 249 } 250 if c.FcPrefix == nil { 251 fmt.Fprintln(buf, "Firstchars: n/a") 252 } else { 253 fmt.Fprintf(buf, "Firstchars: %v\n", c.FcPrefix.PrefixSet.String()) 254 } 255 256 if c.BmPrefix == nil { 257 fmt.Fprintln(buf, "Prefix: n/a") 258 } else { 259 fmt.Fprintf(buf, "Prefix: %v\n", Escape(c.BmPrefix.String())) 260 } 261 262 fmt.Fprintf(buf, "Anchors: %v\n", c.Anchors) 263 fmt.Fprintln(buf) 264 265 if c.BmPrefix != nil { 266 fmt.Fprintln(buf, "BoyerMoore:") 267 fmt.Fprintln(buf, c.BmPrefix.Dump(" ")) 268 } 269 for i := 0; i < len(c.Codes); i += opcodeSize(InstOp(c.Codes[i])) { 270 fmt.Fprintln(buf, c.OpcodeDescription(i)) 271 } 272 273 return buf.String() 274} 275