/*
Package regexp2 is a regexp package that has an interface similar to Go's standard regexp engine but uses a
more full-featured regex engine behind the scenes.

It doesn't have constant time guarantees, but it allows backtracking and is compatible with Perl5 and .NET.
You'll likely be better off with the RE2 engine from the regexp package and should only use this if you
need to write very complex patterns or require compatibility with .NET.
*/
package regexp2

import (
	"errors"
	"math"
	"strconv"
	"sync"
	"time"

	"github.com/dlclark/regexp2/syntax"
)

// DefaultMatchTimeout is the default timeout used when running regexp
// matches. It is the largest representable time.Duration, i.e. "forever".
var DefaultMatchTimeout = time.Duration(math.MaxInt64)

// Regexp is the representation of a compiled regular expression.
// A Regexp is safe for concurrent use by multiple goroutines.
type Regexp struct {
	// MatchTimeout is the timeout applied when trying to find matches.
	MatchTimeout time.Duration

	// read-only after Compile
	pattern string       // as passed to Compile
	options RegexOptions // options

	caps     map[int]int    // capnum->index
	capnames map[string]int // capture group name -> index
	capslist []string       // sorted list of capture group names
	capsize  int            // size of the capture array

	code *syntax.Code // compiled program

	// cache of machines for running regexp
	// NOTE(review): muRun presumably guards runner -- confirm against the
	// code that gets/puts runners.
	muRun  sync.Mutex
	runner []*runner
}

// Compile parses a regular expression and returns, if successful,
// a Regexp object that can be used to match against text.
48func Compile(expr string, opt RegexOptions) (*Regexp, error) { 49 // parse it 50 tree, err := syntax.Parse(expr, syntax.RegexOptions(opt)) 51 if err != nil { 52 return nil, err 53 } 54 55 // translate it to code 56 code, err := syntax.Write(tree) 57 if err != nil { 58 return nil, err 59 } 60 61 // return it 62 return &Regexp{ 63 pattern: expr, 64 options: opt, 65 caps: code.Caps, 66 capnames: tree.Capnames, 67 capslist: tree.Caplist, 68 capsize: code.Capsize, 69 code: code, 70 MatchTimeout: DefaultMatchTimeout, 71 }, nil 72} 73 74// MustCompile is like Compile but panics if the expression cannot be parsed. 75// It simplifies safe initialization of global variables holding compiled regular 76// expressions. 77func MustCompile(str string, opt RegexOptions) *Regexp { 78 regexp, error := Compile(str, opt) 79 if error != nil { 80 panic(`regexp2: Compile(` + quote(str) + `): ` + error.Error()) 81 } 82 return regexp 83} 84 85// Escape adds backslashes to any special characters in the input string 86func Escape(input string) string { 87 return syntax.Escape(input) 88} 89 90// Unescape removes any backslashes from previously-escaped special characters in the input string 91func Unescape(input string) (string, error) { 92 return syntax.Unescape(input) 93} 94 95// String returns the source text used to compile the regular expression. 96func (re *Regexp) String() string { 97 return re.pattern 98} 99 100func quote(s string) string { 101 if strconv.CanBackquote(s) { 102 return "`" + s + "`" 103 } 104 return strconv.Quote(s) 105} 106 107// RegexOptions impact the runtime and parsing behavior 108// for each specific regex. They are setable in code as well 109// as in the regex pattern itself. 
110type RegexOptions int32 111 112const ( 113 None RegexOptions = 0x0 114 IgnoreCase = 0x0001 // "i" 115 Multiline = 0x0002 // "m" 116 ExplicitCapture = 0x0004 // "n" 117 Compiled = 0x0008 // "c" 118 Singleline = 0x0010 // "s" 119 IgnorePatternWhitespace = 0x0020 // "x" 120 RightToLeft = 0x0040 // "r" 121 Debug = 0x0080 // "d" 122 ECMAScript = 0x0100 // "e" 123 RE2 = 0x0200 // RE2 (regexp package) compatibility mode 124) 125 126func (re *Regexp) RightToLeft() bool { 127 return re.options&RightToLeft != 0 128} 129 130func (re *Regexp) Debug() bool { 131 return re.options&Debug != 0 132} 133 134// Replace searches the input string and replaces each match found with the replacement text. 135// Count will limit the number of matches attempted and startAt will allow 136// us to skip past possible matches at the start of the input (left or right depending on RightToLeft option). 137// Set startAt and count to -1 to go through the whole string 138func (re *Regexp) Replace(input, replacement string, startAt, count int) (string, error) { 139 data, err := syntax.NewReplacerData(replacement, re.caps, re.capsize, re.capnames, syntax.RegexOptions(re.options)) 140 if err != nil { 141 return "", err 142 } 143 //TODO: cache ReplacerData 144 145 return replace(re, data, nil, input, startAt, count) 146} 147 148// ReplaceFunc searches the input string and replaces each match found using the string from the evaluator 149// Count will limit the number of matches attempted and startAt will allow 150// us to skip past possible matches at the start of the input (left or right depending on RightToLeft option). 151// Set startAt and count to -1 to go through the whole string. 
152func (re *Regexp) ReplaceFunc(input string, evaluator MatchEvaluator, startAt, count int) (string, error) { 153 return replace(re, nil, evaluator, input, startAt, count) 154} 155 156// FindStringMatch searches the input string for a Regexp match 157func (re *Regexp) FindStringMatch(s string) (*Match, error) { 158 // convert string to runes 159 return re.run(false, -1, getRunes(s)) 160} 161 162// FindRunesMatch searches the input rune slice for a Regexp match 163func (re *Regexp) FindRunesMatch(r []rune) (*Match, error) { 164 return re.run(false, -1, r) 165} 166 167// FindStringMatchStartingAt searches the input string for a Regexp match starting at the startAt index 168func (re *Regexp) FindStringMatchStartingAt(s string, startAt int) (*Match, error) { 169 if startAt > len(s) { 170 return nil, errors.New("startAt must be less than the length of the input string") 171 } 172 r, startAt := re.getRunesAndStart(s, startAt) 173 if startAt == -1 { 174 // we didn't find our start index in the string -- that's a problem 175 return nil, errors.New("startAt must align to the start of a valid rune in the input string") 176 } 177 178 return re.run(false, startAt, r) 179} 180 181// FindRunesMatchStartingAt searches the input rune slice for a Regexp match starting at the startAt index 182func (re *Regexp) FindRunesMatchStartingAt(r []rune, startAt int) (*Match, error) { 183 return re.run(false, startAt, r) 184} 185 186// FindNextMatch returns the next match in the same input string as the match parameter. 187// Will return nil if there is no next match or if given a nil match. 
188func (re *Regexp) FindNextMatch(m *Match) (*Match, error) { 189 if m == nil { 190 return nil, nil 191 } 192 193 // If previous match was empty, advance by one before matching to prevent 194 // infinite loop 195 startAt := m.textpos 196 if m.Length == 0 { 197 if m.textpos == len(m.text) { 198 return nil, nil 199 } 200 201 if re.RightToLeft() { 202 startAt-- 203 } else { 204 startAt++ 205 } 206 } 207 return re.run(false, startAt, m.text) 208} 209 210// MatchString return true if the string matches the regex 211// error will be set if a timeout occurs 212func (re *Regexp) MatchString(s string) (bool, error) { 213 m, err := re.run(true, -1, getRunes(s)) 214 if err != nil { 215 return false, err 216 } 217 return m != nil, nil 218} 219 220func (re *Regexp) getRunesAndStart(s string, startAt int) ([]rune, int) { 221 if startAt < 0 { 222 if re.RightToLeft() { 223 r := getRunes(s) 224 return r, len(r) 225 } 226 return getRunes(s), 0 227 } 228 ret := make([]rune, len(s)) 229 i := 0 230 runeIdx := -1 231 for strIdx, r := range s { 232 if strIdx == startAt { 233 runeIdx = i 234 } 235 ret[i] = r 236 i++ 237 } 238 if startAt == len(s) { 239 runeIdx = i 240 } 241 return ret[:i], runeIdx 242} 243 244func getRunes(s string) []rune { 245 return []rune(s) 246} 247 248// MatchRunes return true if the runes matches the regex 249// error will be set if a timeout occurs 250func (re *Regexp) MatchRunes(r []rune) (bool, error) { 251 m, err := re.run(true, -1, r) 252 if err != nil { 253 return false, err 254 } 255 return m != nil, nil 256} 257 258// GetGroupNames Returns the set of strings used to name capturing groups in the expression. 
259func (re *Regexp) GetGroupNames() []string { 260 var result []string 261 262 if re.capslist == nil { 263 result = make([]string, re.capsize) 264 265 for i := 0; i < len(result); i++ { 266 result[i] = strconv.Itoa(i) 267 } 268 } else { 269 result = make([]string, len(re.capslist)) 270 copy(result, re.capslist) 271 } 272 273 return result 274} 275 276// GetGroupNumbers returns the integer group numbers corresponding to a group name. 277func (re *Regexp) GetGroupNumbers() []int { 278 var result []int 279 280 if re.caps == nil { 281 result = make([]int, re.capsize) 282 283 for i := 0; i < len(result); i++ { 284 result[i] = i 285 } 286 } else { 287 result = make([]int, len(re.caps)) 288 289 for k, v := range re.caps { 290 result[v] = k 291 } 292 } 293 294 return result 295} 296 297// GroupNameFromNumber retrieves a group name that corresponds to a group number. 298// It will return "" for and unknown group number. Unnamed groups automatically 299// receive a name that is the decimal string equivalent of its number. 300func (re *Regexp) GroupNameFromNumber(i int) string { 301 if re.capslist == nil { 302 if i >= 0 && i < re.capsize { 303 return strconv.Itoa(i) 304 } 305 306 return "" 307 } 308 309 if re.caps != nil { 310 var ok bool 311 if i, ok = re.caps[i]; !ok { 312 return "" 313 } 314 } 315 316 if i >= 0 && i < len(re.capslist) { 317 return re.capslist[i] 318 } 319 320 return "" 321} 322 323// GroupNumberFromName returns a group number that corresponds to a group name. 324// Returns -1 if the name is not a recognized group name. Numbered groups 325// automatically get a group name that is the decimal string equivalent of its number. 
326func (re *Regexp) GroupNumberFromName(name string) int { 327 // look up name if we have a hashtable of names 328 if re.capnames != nil { 329 if k, ok := re.capnames[name]; ok { 330 return k 331 } 332 333 return -1 334 } 335 336 // convert to an int if it looks like a number 337 result := 0 338 for i := 0; i < len(name); i++ { 339 ch := name[i] 340 341 if ch > '9' || ch < '0' { 342 return -1 343 } 344 345 result *= 10 346 result += int(ch - '0') 347 } 348 349 // return int if it's in range 350 if result >= 0 && result < re.capsize { 351 return result 352 } 353 354 return -1 355} 356