1package goja 2 3import ( 4 "fmt" 5 "github.com/dlclark/regexp2" 6 "github.com/dop251/goja/unistring" 7 "io" 8 "regexp" 9 "sort" 10 "strings" 11 "unicode/utf16" 12) 13 14type regexp2MatchCache struct { 15 target valueString 16 runes []rune 17 posMap []int 18} 19 20// Not goroutine-safe. Use regexp2Wrapper.clone() 21type regexp2Wrapper struct { 22 rx *regexp2.Regexp 23 cache *regexp2MatchCache 24} 25 26type regexpWrapper regexp.Regexp 27 28type positionMapItem struct { 29 src, dst int 30} 31type positionMap []positionMapItem 32 33func (m positionMap) get(src int) int { 34 if src <= 0 { 35 return src 36 } 37 res := sort.Search(len(m), func(n int) bool { return m[n].src >= src }) 38 if res >= len(m) || m[res].src != src { 39 panic("index not found") 40 } 41 return m[res].dst 42} 43 44type arrayRuneReader struct { 45 runes []rune 46 pos int 47} 48 49func (rd *arrayRuneReader) ReadRune() (r rune, size int, err error) { 50 if rd.pos < len(rd.runes) { 51 r = rd.runes[rd.pos] 52 size = 1 53 rd.pos++ 54 } else { 55 err = io.EOF 56 } 57 return 58} 59 60// Not goroutine-safe. Use regexpPattern.clone() 61type regexpPattern struct { 62 src string 63 64 global, ignoreCase, multiline, sticky, unicode bool 65 66 regexpWrapper *regexpWrapper 67 regexp2Wrapper *regexp2Wrapper 68} 69 70func compileRegexp2(src string, multiline, ignoreCase bool) (*regexp2Wrapper, error) { 71 var opts regexp2.RegexOptions = regexp2.ECMAScript 72 if multiline { 73 opts |= regexp2.Multiline 74 } 75 if ignoreCase { 76 opts |= regexp2.IgnoreCase 77 } 78 regexp2Pattern, err1 := regexp2.Compile(src, opts) 79 if err1 != nil { 80 return nil, fmt.Errorf("Invalid regular expression (regexp2): %s (%v)", src, err1) 81 } 82 83 return ®exp2Wrapper{rx: regexp2Pattern}, nil 84} 85 86func (p *regexpPattern) createRegexp2() { 87 if p.regexp2Wrapper != nil { 88 return 89 } 90 rx, err := compileRegexp2(p.src, p.multiline, p.ignoreCase) 91 if err != nil { 92 // At this point the regexp should have been successfully converted to re2, if it fails now, it's a bug. 93 panic(err) 94 } 95 p.regexp2Wrapper = rx 96} 97 98func buildUTF8PosMap(s valueString) (positionMap, string) { 99 pm := make(positionMap, 0, s.length()) 100 rd := s.reader(0) 101 sPos, utf8Pos := 0, 0 102 var sb strings.Builder 103 for { 104 r, size, err := rd.ReadRune() 105 if err == io.EOF { 106 break 107 } 108 if err != nil { 109 // the string contains invalid UTF-16, bailing out 110 return nil, "" 111 } 112 utf8Size, _ := sb.WriteRune(r) 113 sPos += size 114 utf8Pos += utf8Size 115 pm = append(pm, positionMapItem{src: utf8Pos, dst: sPos}) 116 } 117 return pm, sb.String() 118} 119 120func (p *regexpPattern) findSubmatchIndex(s valueString, start int) []int { 121 if p.regexpWrapper == nil { 122 return p.regexp2Wrapper.findSubmatchIndex(s, start, p.unicode, p.global || p.sticky) 123 } 124 if start != 0 { 125 // Unfortunately Go's regexp library does not allow starting from an arbitrary position. 126 // If we just drop the first _start_ characters of the string the assertions (^, $, \b and \B) will not 127 // work correctly. 128 p.createRegexp2() 129 return p.regexp2Wrapper.findSubmatchIndex(s, start, p.unicode, p.global || p.sticky) 130 } 131 return p.regexpWrapper.findSubmatchIndex(s, p.unicode) 132} 133 134func (p *regexpPattern) findAllSubmatchIndex(s valueString, start int, limit int, sticky bool) [][]int { 135 if p.regexpWrapper == nil { 136 return p.regexp2Wrapper.findAllSubmatchIndex(s, start, limit, sticky, p.unicode) 137 } 138 if start == 0 { 139 if s, ok := s.(asciiString); ok { 140 return p.regexpWrapper.findAllSubmatchIndex(s.String(), limit, sticky) 141 } 142 if limit == 1 { 143 result := p.regexpWrapper.findSubmatchIndexUnicode(s.(unicodeString), p.unicode) 144 if result == nil { 145 return nil 146 } 147 return [][]int{result} 148 } 149 // Unfortunately Go's regexp library lacks FindAllReaderSubmatchIndex(), so we have to use a UTF-8 string as an 150 // input. 151 if p.unicode { 152 // Try to convert s to UTF-8. If it does not contain any invalid UTF-16 we can do the matching in UTF-8. 153 pm, str := buildUTF8PosMap(s) 154 if pm != nil { 155 res := p.regexpWrapper.findAllSubmatchIndex(str, limit, sticky) 156 for _, result := range res { 157 for i, idx := range result { 158 result[i] = pm.get(idx) 159 } 160 } 161 return res 162 } 163 } 164 } 165 166 p.createRegexp2() 167 return p.regexp2Wrapper.findAllSubmatchIndex(s, start, limit, sticky, p.unicode) 168} 169 170// clone creates a copy of the regexpPattern which can be used concurrently. 171func (p *regexpPattern) clone() *regexpPattern { 172 ret := ®expPattern{ 173 src: p.src, 174 global: p.global, 175 ignoreCase: p.ignoreCase, 176 multiline: p.multiline, 177 sticky: p.sticky, 178 unicode: p.unicode, 179 } 180 if p.regexpWrapper != nil { 181 ret.regexpWrapper = p.regexpWrapper.clone() 182 } 183 if p.regexp2Wrapper != nil { 184 ret.regexp2Wrapper = p.regexp2Wrapper.clone() 185 } 186 return ret 187} 188 189type regexpObject struct { 190 baseObject 191 pattern *regexpPattern 192 source valueString 193 194 standard bool 195} 196 197func (r *regexp2Wrapper) findSubmatchIndex(s valueString, start int, fullUnicode, doCache bool) (result []int) { 198 if fullUnicode { 199 return r.findSubmatchIndexUnicode(s, start, doCache) 200 } 201 return r.findSubmatchIndexUTF16(s, start, doCache) 202} 203 204func (r *regexp2Wrapper) findUTF16Cached(s valueString, start int, doCache bool) (match *regexp2.Match, runes []rune, err error) { 205 wrapped := r.rx 206 cache := r.cache 207 if cache != nil && cache.posMap == nil && cache.target.SameAs(s) { 208 runes = cache.runes 209 } else { 210 runes = s.utf16Runes() 211 cache = nil 212 } 213 match, err = wrapped.FindRunesMatchStartingAt(runes, start) 214 if doCache && match != nil && err == nil { 215 if cache == nil { 216 if r.cache == nil { 217 r.cache = new(regexp2MatchCache) 218 } 219 *r.cache = regexp2MatchCache{ 220 target: s, 221 runes: runes, 222 } 223 } 224 } else { 225 r.cache = nil 226 } 227 return 228} 229 230func (r *regexp2Wrapper) findSubmatchIndexUTF16(s valueString, start int, doCache bool) (result []int) { 231 match, _, err := r.findUTF16Cached(s, start, doCache) 232 if err != nil { 233 return 234 } 235 236 if match == nil { 237 return 238 } 239 groups := match.Groups() 240 241 result = make([]int, 0, len(groups)<<1) 242 for _, group := range groups { 243 if len(group.Captures) > 0 { 244 result = append(result, group.Index, group.Index+group.Length) 245 } else { 246 result = append(result, -1, 0) 247 } 248 } 249 return 250} 251 252func (r *regexp2Wrapper) findUnicodeCached(s valueString, start int, doCache bool) (match *regexp2.Match, posMap []int, err error) { 253 var ( 254 runes []rune 255 mappedStart int 256 splitPair bool 257 savedRune rune 258 ) 259 wrapped := r.rx 260 cache := r.cache 261 if cache != nil && cache.posMap != nil && cache.target.SameAs(s) { 262 runes, posMap = cache.runes, cache.posMap 263 mappedStart, splitPair = posMapReverseLookup(posMap, start) 264 } else { 265 posMap, runes, mappedStart, splitPair = buildPosMap(&lenientUtf16Decoder{utf16Reader: s.utf16Reader(0)}, s.length(), start) 266 cache = nil 267 } 268 if splitPair { 269 // temporarily set the rune at mappedStart to the second code point of the pair 270 _, second := utf16.EncodeRune(runes[mappedStart]) 271 savedRune, runes[mappedStart] = runes[mappedStart], second 272 } 273 match, err = wrapped.FindRunesMatchStartingAt(runes, mappedStart) 274 if doCache && match != nil && err == nil { 275 if splitPair { 276 runes[mappedStart] = savedRune 277 } 278 if cache == nil { 279 if r.cache == nil { 280 r.cache = new(regexp2MatchCache) 281 } 282 *r.cache = regexp2MatchCache{ 283 target: s, 284 runes: runes, 285 posMap: posMap, 286 } 287 } 288 } else { 289 r.cache = nil 290 } 291 292 return 293} 294 295func (r *regexp2Wrapper) findSubmatchIndexUnicode(s valueString, start int, doCache bool) (result []int) { 296 match, posMap, err := r.findUnicodeCached(s, start, doCache) 297 if match == nil || err != nil { 298 return 299 } 300 301 groups := match.Groups() 302 303 result = make([]int, 0, len(groups)<<1) 304 for _, group := range groups { 305 if len(group.Captures) > 0 { 306 result = append(result, posMap[group.Index], posMap[group.Index+group.Length]) 307 } else { 308 result = append(result, -1, 0) 309 } 310 } 311 return 312} 313 314func (r *regexp2Wrapper) findAllSubmatchIndexUTF16(s valueString, start, limit int, sticky bool) [][]int { 315 wrapped := r.rx 316 match, runes, err := r.findUTF16Cached(s, start, false) 317 if match == nil || err != nil { 318 return nil 319 } 320 if limit < 0 { 321 limit = len(runes) + 1 322 } 323 results := make([][]int, 0, limit) 324 for match != nil { 325 groups := match.Groups() 326 327 result := make([]int, 0, len(groups)<<1) 328 329 for _, group := range groups { 330 if len(group.Captures) > 0 { 331 startPos := group.Index 332 endPos := group.Index + group.Length 333 result = append(result, startPos, endPos) 334 } else { 335 result = append(result, -1, 0) 336 } 337 } 338 339 if sticky && len(result) > 1 { 340 if result[0] != start { 341 break 342 } 343 start = result[1] 344 } 345 346 results = append(results, result) 347 limit-- 348 if limit <= 0 { 349 break 350 } 351 match, err = wrapped.FindNextMatch(match) 352 if err != nil { 353 return nil 354 } 355 } 356 return results 357} 358 359func buildPosMap(rd io.RuneReader, l, start int) (posMap []int, runes []rune, mappedStart int, splitPair bool) { 360 posMap = make([]int, 0, l+1) 361 curPos := 0 362 runes = make([]rune, 0, l) 363 startFound := false 364 for { 365 if !startFound { 366 if curPos == start { 367 mappedStart = len(runes) 368 startFound = true 369 } 370 if curPos > start { 371 // start position splits a surrogate pair 372 mappedStart = len(runes) - 1 373 splitPair = true 374 startFound = true 375 } 376 } 377 rn, size, err := rd.ReadRune() 378 if err != nil { 379 break 380 } 381 runes = append(runes, rn) 382 posMap = append(posMap, curPos) 383 curPos += size 384 } 385 posMap = append(posMap, curPos) 386 return 387} 388 389func posMapReverseLookup(posMap []int, pos int) (int, bool) { 390 mapped := sort.SearchInts(posMap, pos) 391 if mapped < len(posMap) && posMap[mapped] != pos { 392 return mapped - 1, true 393 } 394 return mapped, false 395} 396 397func (r *regexp2Wrapper) findAllSubmatchIndexUnicode(s unicodeString, start, limit int, sticky bool) [][]int { 398 wrapped := r.rx 399 if limit < 0 { 400 limit = len(s) + 1 401 } 402 results := make([][]int, 0, limit) 403 match, posMap, err := r.findUnicodeCached(s, start, false) 404 if err != nil { 405 return nil 406 } 407 for match != nil { 408 groups := match.Groups() 409 410 result := make([]int, 0, len(groups)<<1) 411 412 for _, group := range groups { 413 if len(group.Captures) > 0 { 414 start := posMap[group.Index] 415 end := posMap[group.Index+group.Length] 416 result = append(result, start, end) 417 } else { 418 result = append(result, -1, 0) 419 } 420 } 421 422 if sticky && len(result) > 1 { 423 if result[0] != start { 424 break 425 } 426 start = result[1] 427 } 428 429 results = append(results, result) 430 match, err = wrapped.FindNextMatch(match) 431 if err != nil { 432 return nil 433 } 434 } 435 return results 436} 437 438func (r *regexp2Wrapper) findAllSubmatchIndex(s valueString, start, limit int, sticky, fullUnicode bool) [][]int { 439 switch s := s.(type) { 440 case asciiString: 441 return r.findAllSubmatchIndexUTF16(s, start, limit, sticky) 442 case unicodeString: 443 if fullUnicode { 444 return r.findAllSubmatchIndexUnicode(s, start, limit, sticky) 445 } 446 return r.findAllSubmatchIndexUTF16(s, start, limit, sticky) 447 default: 448 panic("Unsupported string type") 449 } 450} 451 452func (r *regexp2Wrapper) clone() *regexp2Wrapper { 453 return ®exp2Wrapper{ 454 rx: r.rx, 455 } 456} 457 458func (r *regexpWrapper) findAllSubmatchIndex(s string, limit int, sticky bool) (results [][]int) { 459 wrapped := (*regexp.Regexp)(r) 460 results = wrapped.FindAllStringSubmatchIndex(s, limit) 461 pos := 0 462 if sticky { 463 for i, result := range results { 464 if len(result) > 1 { 465 if result[0] != pos { 466 return results[:i] 467 } 468 pos = result[1] 469 } 470 } 471 } 472 return 473} 474 475func (r *regexpWrapper) findSubmatchIndex(s valueString, fullUnicode bool) []int { 476 switch s := s.(type) { 477 case asciiString: 478 return r.findSubmatchIndexASCII(string(s)) 479 case unicodeString: 480 return r.findSubmatchIndexUnicode(s, fullUnicode) 481 default: 482 panic("Unsupported string type") 483 } 484} 485 486func (r *regexpWrapper) findSubmatchIndexASCII(s string) []int { 487 wrapped := (*regexp.Regexp)(r) 488 return wrapped.FindStringSubmatchIndex(s) 489} 490 491func (r *regexpWrapper) findSubmatchIndexUnicode(s unicodeString, fullUnicode bool) (result []int) { 492 wrapped := (*regexp.Regexp)(r) 493 if fullUnicode { 494 posMap, runes, _, _ := buildPosMap(&lenientUtf16Decoder{utf16Reader: s.utf16Reader(0)}, s.length(), 0) 495 res := wrapped.FindReaderSubmatchIndex(&arrayRuneReader{runes: runes}) 496 for i, item := range res { 497 if item >= 0 { 498 res[i] = posMap[item] 499 } 500 } 501 return res 502 } 503 return wrapped.FindReaderSubmatchIndex(s.utf16Reader(0)) 504} 505 506func (r *regexpWrapper) clone() *regexpWrapper { 507 return r 508} 509 510func (r *regexpObject) execResultToArray(target valueString, result []int) Value { 511 captureCount := len(result) >> 1 512 valueArray := make([]Value, captureCount) 513 matchIndex := result[0] 514 lowerBound := matchIndex 515 for index := 0; index < captureCount; index++ { 516 offset := index << 1 517 if result[offset] >= lowerBound { 518 valueArray[index] = target.substring(result[offset], result[offset+1]) 519 lowerBound = result[offset] 520 } else { 521 valueArray[index] = _undefined 522 } 523 } 524 match := r.val.runtime.newArrayValues(valueArray) 525 match.self.setOwnStr("input", target, false) 526 match.self.setOwnStr("index", intToValue(int64(matchIndex)), false) 527 return match 528} 529 530func (r *regexpObject) getLastIndex() int64 { 531 lastIndex := toLength(r.getStr("lastIndex", nil)) 532 if !r.pattern.global && !r.pattern.sticky { 533 return 0 534 } 535 return lastIndex 536} 537 538func (r *regexpObject) updateLastIndex(index int64, firstResult, lastResult []int) bool { 539 if r.pattern.sticky { 540 if firstResult == nil || int64(firstResult[0]) != index { 541 r.setOwnStr("lastIndex", intToValue(0), true) 542 return false 543 } 544 } else { 545 if firstResult == nil { 546 if r.pattern.global { 547 r.setOwnStr("lastIndex", intToValue(0), true) 548 } 549 return false 550 } 551 } 552 553 if r.pattern.global || r.pattern.sticky { 554 r.setOwnStr("lastIndex", intToValue(int64(lastResult[1])), true) 555 } 556 return true 557} 558 559func (r *regexpObject) execRegexp(target valueString) (match bool, result []int) { 560 index := r.getLastIndex() 561 if index >= 0 && index <= int64(target.length()) { 562 result = r.pattern.findSubmatchIndex(target, int(index)) 563 } 564 match = r.updateLastIndex(index, result, result) 565 return 566} 567 568func (r *regexpObject) exec(target valueString) Value { 569 match, result := r.execRegexp(target) 570 if match { 571 return r.execResultToArray(target, result) 572 } 573 return _null 574} 575 576func (r *regexpObject) test(target valueString) bool { 577 match, _ := r.execRegexp(target) 578 return match 579} 580 581func (r *regexpObject) clone() *regexpObject { 582 r1 := r.val.runtime.newRegexpObject(r.prototype) 583 r1.source = r.source 584 r1.pattern = r.pattern 585 586 return r1 587} 588 589func (r *regexpObject) init() { 590 r.baseObject.init() 591 r.standard = true 592 r._putProp("lastIndex", intToValue(0), true, false, false) 593} 594 595func (r *regexpObject) setProto(proto *Object, throw bool) bool { 596 res := r.baseObject.setProto(proto, throw) 597 if res { 598 r.standard = false 599 } 600 return res 601} 602 603func (r *regexpObject) defineOwnPropertyStr(name unistring.String, desc PropertyDescriptor, throw bool) bool { 604 res := r.baseObject.defineOwnPropertyStr(name, desc, throw) 605 if res { 606 r.standard = false 607 } 608 return res 609} 610 611func (r *regexpObject) defineOwnPropertySym(name *Symbol, desc PropertyDescriptor, throw bool) bool { 612 res := r.baseObject.defineOwnPropertySym(name, desc, throw) 613 if res && r.standard { 614 switch name { 615 case SymMatch, SymMatchAll, SymSearch, SymSplit, SymReplace: 616 r.standard = false 617 } 618 } 619 return res 620} 621 622func (r *regexpObject) deleteStr(name unistring.String, throw bool) bool { 623 res := r.baseObject.deleteStr(name, throw) 624 if res { 625 r.standard = false 626 } 627 return res 628} 629 630func (r *regexpObject) setOwnStr(name unistring.String, value Value, throw bool) bool { 631 res := r.baseObject.setOwnStr(name, value, throw) 632 if res && r.standard && name == "exec" { 633 r.standard = false 634 } 635 return res 636} 637 638func (r *regexpObject) setOwnSym(name *Symbol, value Value, throw bool) bool { 639 res := r.baseObject.setOwnSym(name, value, throw) 640 if res && r.standard { 641 switch name { 642 case SymMatch, SymMatchAll, SymSearch, SymSplit, SymReplace: 643 r.standard = false 644 } 645 } 646 return res 647} 648