1/* 2Package regexp2 is a regexp package that has an interface similar to Go's framework regexp engine but uses a 3more feature full regex engine behind the scenes. 4 5It doesn't have constant time guarantees, but it allows backtracking and is compatible with Perl5 and .NET. 6You'll likely be better off with the RE2 engine from the regexp package and should only use this if you 7need to write very complex patterns or require compatibility with .NET. 8*/ 9package regexp2 10 11import ( 12 "errors" 13 "math" 14 "strconv" 15 "sync" 16 "time" 17 18 "github.com/dlclark/regexp2/syntax" 19) 20 21// Default timeout used when running regexp matches -- "forever" 22var DefaultMatchTimeout = time.Duration(math.MaxInt64) 23 24// Regexp is the representation of a compiled regular expression. 25// A Regexp is safe for concurrent use by multiple goroutines. 26type Regexp struct { 27 //timeout when trying to find matches 28 MatchTimeout time.Duration 29 30 // read-only after Compile 31 pattern string // as passed to Compile 32 options RegexOptions // options 33 34 caps map[int]int // capnum->index 35 capnames map[string]int //capture group name -> index 36 capslist []string //sorted list of capture group names 37 capsize int // size of the capture array 38 39 code *syntax.Code // compiled program 40 41 // cache of machines for running regexp 42 muRun sync.Mutex 43 runner []*runner 44} 45 46// Compile parses a regular expression and returns, if successful, 47// a Regexp object that can be used to match against text. 48func Compile(expr string, opt RegexOptions) (*Regexp, error) { 49 // parse it 50 tree, err := syntax.Parse(expr, syntax.RegexOptions(opt)) 51 if err != nil { 52 return nil, err 53 } 54 55 // translate it to code 56 code, err := syntax.Write(tree) 57 if err != nil { 58 return nil, err 59 } 60 61 // return it 62 return &Regexp{ 63 pattern: expr, 64 options: opt, 65 caps: code.Caps, 66 capnames: tree.Capnames, 67 capslist: tree.Caplist, 68 capsize: code.Capsize, 69 code: code, 70 MatchTimeout: DefaultMatchTimeout, 71 }, nil 72} 73 74// MustCompile is like Compile but panics if the expression cannot be parsed. 75// It simplifies safe initialization of global variables holding compiled regular 76// expressions. 77func MustCompile(str string, opt RegexOptions) *Regexp { 78 regexp, error := Compile(str, opt) 79 if error != nil { 80 panic(`regexp2: Compile(` + quote(str) + `): ` + error.Error()) 81 } 82 return regexp 83} 84 85// Escape adds backslashes to any special characters in the input string 86func Escape(input string) string { 87 return syntax.Escape(input) 88} 89 90// Unescape removes any backslashes from previously-escaped special characters in the input string 91func Unescape(input string) (string, error) { 92 return syntax.Unescape(input) 93} 94 95// String returns the source text used to compile the regular expression. 96func (re *Regexp) String() string { 97 return re.pattern 98} 99 100func quote(s string) string { 101 if strconv.CanBackquote(s) { 102 return "`" + s + "`" 103 } 104 return strconv.Quote(s) 105} 106 107// RegexOptions impact the runtime and parsing behavior 108// for each specific regex. They are setable in code as well 109// as in the regex pattern itself. 110type RegexOptions int32 111 112const ( 113 None RegexOptions = 0x0 114 IgnoreCase = 0x0001 // "i" 115 Multiline = 0x0002 // "m" 116 ExplicitCapture = 0x0004 // "n" 117 Compiled = 0x0008 // "c" 118 Singleline = 0x0010 // "s" 119 IgnorePatternWhitespace = 0x0020 // "x" 120 RightToLeft = 0x0040 // "r" 121 Debug = 0x0080 // "d" 122 ECMAScript = 0x0100 // "e" 123) 124 125func (re *Regexp) RightToLeft() bool { 126 return re.options&RightToLeft != 0 127} 128 129func (re *Regexp) Debug() bool { 130 return re.options&Debug != 0 131} 132 133// Replace searches the input string and replaces each match found with the replacement text. 134// Count will limit the number of matches attempted and startAt will allow 135// us to skip past possible matches at the start of the input (left or right depending on RightToLeft option). 136// Set startAt and count to -1 to go through the whole string 137func (re *Regexp) Replace(input, replacement string, startAt, count int) (string, error) { 138 data, err := syntax.NewReplacerData(replacement, re.caps, re.capsize, re.capnames, syntax.RegexOptions(re.options)) 139 if err != nil { 140 return "", err 141 } 142 //TODO: cache ReplacerData 143 144 return replace(re, data, nil, input, startAt, count) 145} 146 147// ReplaceFunc searches the input string and replaces each match found using the string from the evaluator 148// Count will limit the number of matches attempted and startAt will allow 149// us to skip past possible matches at the start of the input (left or right depending on RightToLeft option). 150// Set startAt and count to -1 to go through the whole string. 151func (re *Regexp) ReplaceFunc(input string, evaluator MatchEvaluator, startAt, count int) (string, error) { 152 return replace(re, nil, evaluator, input, startAt, count) 153} 154 155// FindStringMatch searches the input string for a Regexp match 156func (re *Regexp) FindStringMatch(s string) (*Match, error) { 157 // convert string to runes 158 return re.run(false, -1, getRunes(s)) 159} 160 161// FindRunesMatch searches the input rune slice for a Regexp match 162func (re *Regexp) FindRunesMatch(r []rune) (*Match, error) { 163 return re.run(false, -1, r) 164} 165 166// FindStringMatchStartingAt searches the input string for a Regexp match starting at the startAt index 167func (re *Regexp) FindStringMatchStartingAt(s string, startAt int) (*Match, error) { 168 if startAt > len(s) { 169 return nil, errors.New("startAt must be less than the length of the input string") 170 } 171 r, startAt := re.getRunesAndStart(s, startAt) 172 if startAt == -1 { 173 // we didn't find our start index in the string -- that's a problem 174 return nil, errors.New("startAt must align to the start of a valid rune in the input string") 175 } 176 177 return re.run(false, startAt, r) 178} 179 180// FindRunesMatchStartingAt searches the input rune slice for a Regexp match starting at the startAt index 181func (re *Regexp) FindRunesMatchStartingAt(r []rune, startAt int) (*Match, error) { 182 return re.run(false, startAt, r) 183} 184 185// FindNextMatch returns the next match in the same input string as the match parameter. 186// Will return nil if there is no next match or if given a nil match. 187func (re *Regexp) FindNextMatch(m *Match) (*Match, error) { 188 if m == nil { 189 return nil, nil 190 } 191 192 // If previous match was empty, advance by one before matching to prevent 193 // infinite loop 194 startAt := m.textpos 195 if m.Length == 0 { 196 if m.textpos == len(m.text) { 197 return nil, nil 198 } 199 200 if re.RightToLeft() { 201 startAt-- 202 } else { 203 startAt++ 204 } 205 } 206 return re.run(false, startAt, m.text) 207} 208 209// MatchString return true if the string matches the regex 210// error will be set if a timeout occurs 211func (re *Regexp) MatchString(s string) (bool, error) { 212 m, err := re.run(true, -1, getRunes(s)) 213 if err != nil { 214 return false, err 215 } 216 return m != nil, nil 217} 218 219func (re *Regexp) getRunesAndStart(s string, startAt int) ([]rune, int) { 220 if startAt < 0 { 221 if re.RightToLeft() { 222 r := getRunes(s) 223 return r, len(r) 224 } 225 return getRunes(s), 0 226 } 227 ret := make([]rune, len(s)) 228 i := 0 229 runeIdx := -1 230 for strIdx, r := range s { 231 if strIdx == startAt { 232 runeIdx = i 233 } 234 ret[i] = r 235 i++ 236 } 237 return ret[:i], runeIdx 238} 239 240func getRunes(s string) []rune { 241 ret := make([]rune, len(s)) 242 i := 0 243 for _, r := range s { 244 ret[i] = r 245 i++ 246 } 247 return ret[:i] 248} 249 250// MatchRunes return true if the runes matches the regex 251// error will be set if a timeout occurs 252func (re *Regexp) MatchRunes(r []rune) (bool, error) { 253 m, err := re.run(true, -1, r) 254 if err != nil { 255 return false, err 256 } 257 return m != nil, nil 258} 259 260// GetGroupNames Returns the set of strings used to name capturing groups in the expression. 261func (re *Regexp) GetGroupNames() []string { 262 var result []string 263 264 if re.capslist == nil { 265 result = make([]string, re.capsize) 266 267 for i := 0; i < re.capsize; i++ { 268 result[i] = strconv.Itoa(i) 269 } 270 } else { 271 result = make([]string, len(re.capslist)) 272 copy(result, re.capslist) 273 } 274 275 return result 276} 277 278// GetGroupNumbers returns the integer group numbers corresponding to a group name. 279func (re *Regexp) GetGroupNumbers() []int { 280 var result []int 281 282 if re.caps == nil { 283 result = make([]int, re.capsize) 284 285 for i := 0; i < re.capsize; i++ { 286 result[i] = i 287 } 288 } else { 289 result = make([]int, len(re.caps)) 290 291 for k, v := range re.caps { 292 result[v] = k 293 } 294 } 295 296 return result 297} 298 299// GroupNameFromNumber retrieves a group name that corresponds to a group number. 300// It will return "" for and unknown group number. Unnamed groups automatically 301// receive a name that is the decimal string equivalent of its number. 302func (re *Regexp) GroupNameFromNumber(i int) string { 303 if re.capslist == nil { 304 if i >= 0 && i < re.capsize { 305 return strconv.Itoa(i) 306 } 307 308 return "" 309 } 310 311 if re.caps != nil { 312 var ok bool 313 if i, ok = re.caps[i]; !ok { 314 return "" 315 } 316 } 317 318 if i >= 0 && i < len(re.capslist) { 319 return re.capslist[i] 320 } 321 322 return "" 323} 324 325// GroupNumberFromName returns a group number that corresponds to a group name. 326// Returns -1 if the name is not a recognized group name. Numbered groups 327// automatically get a group name that is the decimal string equivalent of its number. 328func (re *Regexp) GroupNumberFromName(name string) int { 329 // look up name if we have a hashtable of names 330 if re.capnames != nil { 331 if k, ok := re.capnames[name]; ok { 332 return k 333 } 334 335 return -1 336 } 337 338 // convert to an int if it looks like a number 339 result := 0 340 for i := 0; i < len(name); i++ { 341 ch := name[i] 342 343 if ch > '9' || ch < '0' { 344 return -1 345 } 346 347 result *= 10 348 result += int(ch - '0') 349 } 350 351 // return int if it's in range 352 if result >= 0 && result < re.capsize { 353 return result 354 } 355 356 return -1 357} 358