1package goja
2
3import (
4	"fmt"
5	"github.com/dlclark/regexp2"
6	"github.com/dop251/goja/unistring"
7	"io"
8	"regexp"
9	"sort"
10	"strings"
11	"unicode/utf16"
12)
13
// regexp2MatchCache keeps the rune form of the last target string that was
// matched by a regexp2Wrapper so that a follow-up search on the same string
// does not have to convert it again.
type regexp2MatchCache struct {
	// target is the string the cached data was derived from; a lookup only
	// hits when target.SameAs(s) holds for the new string.
	target valueString
	// runes is the rune slice that was handed to regexp2 for target.
	runes  []rune
	// posMap maps rune indices back to positions in target. It is nil when the
	// cache was filled by the UTF-16 search path and non-nil when it was filled
	// by the Unicode search path; the two paths use that to tell entries apart.
	posMap []int
}
19
// regexp2Wrapper pairs a compiled regexp2 expression with a single-entry
// match cache.
//
// Not goroutine-safe. Use regexp2Wrapper.clone()
type regexp2Wrapper struct {
	// rx is the compiled regexp2 expression.
	rx    *regexp2.Regexp
	// cache holds data from the most recent cached search, or nil.
	cache *regexp2MatchCache
}
25
// regexpWrapper is a named type over the standard library's regexp.Regexp.
// Its methods convert the receiver back with (*regexp.Regexp)(r) before use.
type regexpWrapper regexp.Regexp
27
// positionMapItem is one entry of a positionMap: it ties an offset in a
// converted string (src) to the matching offset in the original string (dst).
type positionMapItem struct {
	src, dst int
}

// positionMap is a list of positionMapItem ordered by ascending src, as
// produced by buildUTF8PosMap. Lookups rely on that ordering.
type positionMap []positionMapItem

// get translates src into the corresponding dst offset.
//
// Values less than or equal to zero are returned as they are (this covers
// both the start of the string and the -1 marker of an unmatched group).
// Any other value must be present in the map; otherwise get panics with
// "index not found".
func (m positionMap) get(src int) int {
	if src > 0 {
		idx := sort.Search(len(m), func(i int) bool { return m[i].src >= src })
		if idx < len(m) && m[idx].src == src {
			return m[idx].dst
		}
		panic("index not found")
	}
	return src
}
43
// arrayRuneReader serves the contents of a rune slice one rune at a time.
type arrayRuneReader struct {
	runes []rune
	pos   int
}

// ReadRune returns the next rune of the slice. The reported size is always 1,
// so offsets computed by a consumer are indices into the rune slice rather
// than byte offsets. Once the slice is exhausted it returns io.EOF.
func (rd *arrayRuneReader) ReadRune() (r rune, size int, err error) {
	if rd.pos >= len(rd.runes) {
		return 0, 0, io.EOF
	}
	r = rd.runes[rd.pos]
	rd.pos++
	return r, 1, nil
}
59
// regexpPattern is a compiled regular expression together with its flags.
// Matching is delegated to regexpWrapper (standard library regexp) and/or
// regexp2Wrapper (regexp2), depending on which of them is set and on the
// search parameters.
//
// Not goroutine-safe. Use regexpPattern.clone()
type regexpPattern struct {
	// src is the pattern source that createRegexp2 compiles with regexp2.
	src string

	// Flags of the regular expression.
	global, ignoreCase, multiline, sticky, unicode bool

	// regexpWrapper may be nil, in which case all searches go through
	// regexp2Wrapper.
	regexpWrapper  *regexpWrapper
	// regexp2Wrapper may be nil while regexpWrapper is set; it is then created
	// on demand by createRegexp2.
	regexp2Wrapper *regexp2Wrapper
}
69
70func compileRegexp2(src string, multiline, ignoreCase bool) (*regexp2Wrapper, error) {
71	var opts regexp2.RegexOptions = regexp2.ECMAScript
72	if multiline {
73		opts |= regexp2.Multiline
74	}
75	if ignoreCase {
76		opts |= regexp2.IgnoreCase
77	}
78	regexp2Pattern, err1 := regexp2.Compile(src, opts)
79	if err1 != nil {
80		return nil, fmt.Errorf("Invalid regular expression (regexp2): %s (%v)", src, err1)
81	}
82
83	return &regexp2Wrapper{rx: regexp2Pattern}, nil
84}
85
86func (p *regexpPattern) createRegexp2() {
87	if p.regexp2Wrapper != nil {
88		return
89	}
90	rx, err := compileRegexp2(p.src, p.multiline, p.ignoreCase)
91	if err != nil {
92		// At this point the regexp should have been successfully converted to re2, if it fails now, it's a bug.
93		panic(err)
94	}
95	p.regexp2Wrapper = rx
96}
97
98func buildUTF8PosMap(s valueString) (positionMap, string) {
99	pm := make(positionMap, 0, s.length())
100	rd := s.reader(0)
101	sPos, utf8Pos := 0, 0
102	var sb strings.Builder
103	for {
104		r, size, err := rd.ReadRune()
105		if err == io.EOF {
106			break
107		}
108		if err != nil {
109			// the string contains invalid UTF-16, bailing out
110			return nil, ""
111		}
112		utf8Size, _ := sb.WriteRune(r)
113		sPos += size
114		utf8Pos += utf8Size
115		pm = append(pm, positionMapItem{src: utf8Pos, dst: sPos})
116	}
117	return pm, sb.String()
118}
119
120func (p *regexpPattern) findSubmatchIndex(s valueString, start int) []int {
121	if p.regexpWrapper == nil {
122		return p.regexp2Wrapper.findSubmatchIndex(s, start, p.unicode, p.global || p.sticky)
123	}
124	if start != 0 {
125		// Unfortunately Go's regexp library does not allow starting from an arbitrary position.
126		// If we just drop the first _start_ characters of the string the assertions (^, $, \b and \B) will not
127		// work correctly.
128		p.createRegexp2()
129		return p.regexp2Wrapper.findSubmatchIndex(s, start, p.unicode, p.global || p.sticky)
130	}
131	return p.regexpWrapper.findSubmatchIndex(s, p.unicode)
132}
133
134func (p *regexpPattern) findAllSubmatchIndex(s valueString, start int, limit int, sticky bool) [][]int {
135	if p.regexpWrapper == nil {
136		return p.regexp2Wrapper.findAllSubmatchIndex(s, start, limit, sticky, p.unicode)
137	}
138	if start == 0 {
139		if s, ok := s.(asciiString); ok {
140			return p.regexpWrapper.findAllSubmatchIndex(s.String(), limit, sticky)
141		}
142		if limit == 1 {
143			result := p.regexpWrapper.findSubmatchIndexUnicode(s.(unicodeString), p.unicode)
144			if result == nil {
145				return nil
146			}
147			return [][]int{result}
148		}
149		// Unfortunately Go's regexp library lacks FindAllReaderSubmatchIndex(), so we have to use a UTF-8 string as an
150		// input.
151		if p.unicode {
152			// Try to convert s to UTF-8. If it does not contain any invalid UTF-16 we can do the matching in UTF-8.
153			pm, str := buildUTF8PosMap(s)
154			if pm != nil {
155				res := p.regexpWrapper.findAllSubmatchIndex(str, limit, sticky)
156				for _, result := range res {
157					for i, idx := range result {
158						result[i] = pm.get(idx)
159					}
160				}
161				return res
162			}
163		}
164	}
165
166	p.createRegexp2()
167	return p.regexp2Wrapper.findAllSubmatchIndex(s, start, limit, sticky, p.unicode)
168}
169
170// clone creates a copy of the regexpPattern which can be used concurrently.
171func (p *regexpPattern) clone() *regexpPattern {
172	ret := &regexpPattern{
173		src:        p.src,
174		global:     p.global,
175		ignoreCase: p.ignoreCase,
176		multiline:  p.multiline,
177		sticky:     p.sticky,
178		unicode:    p.unicode,
179	}
180	if p.regexpWrapper != nil {
181		ret.regexpWrapper = p.regexpWrapper.clone()
182	}
183	if p.regexp2Wrapper != nil {
184		ret.regexp2Wrapper = p.regexp2Wrapper.clone()
185	}
186	return ret
187}
188
// regexpObject is the object implementation backing a RegExp instance.
type regexpObject struct {
	baseObject
	// pattern is the compiled expression used by exec and test.
	pattern *regexpPattern
	// source is copied along with pattern by clone.
	// NOTE(review): presumably the original pattern text — confirm against the constructor.
	source  valueString

	// standard is set to true by init and cleared by the property and
	// prototype mutators of this type.
	// NOTE(review): presumably it marks an object whose built-in behaviour has
	// not been customised — confirm against the users of this flag.
	standard bool
}
196
197func (r *regexp2Wrapper) findSubmatchIndex(s valueString, start int, fullUnicode, doCache bool) (result []int) {
198	if fullUnicode {
199		return r.findSubmatchIndexUnicode(s, start, doCache)
200	}
201	return r.findSubmatchIndexUTF16(s, start, doCache)
202}
203
204func (r *regexp2Wrapper) findUTF16Cached(s valueString, start int, doCache bool) (match *regexp2.Match, runes []rune, err error) {
205	wrapped := r.rx
206	cache := r.cache
207	if cache != nil && cache.posMap == nil && cache.target.SameAs(s) {
208		runes = cache.runes
209	} else {
210		runes = s.utf16Runes()
211		cache = nil
212	}
213	match, err = wrapped.FindRunesMatchStartingAt(runes, start)
214	if doCache && match != nil && err == nil {
215		if cache == nil {
216			if r.cache == nil {
217				r.cache = new(regexp2MatchCache)
218			}
219			*r.cache = regexp2MatchCache{
220				target: s,
221				runes:  runes,
222			}
223		}
224	} else {
225		r.cache = nil
226	}
227	return
228}
229
230func (r *regexp2Wrapper) findSubmatchIndexUTF16(s valueString, start int, doCache bool) (result []int) {
231	match, _, err := r.findUTF16Cached(s, start, doCache)
232	if err != nil {
233		return
234	}
235
236	if match == nil {
237		return
238	}
239	groups := match.Groups()
240
241	result = make([]int, 0, len(groups)<<1)
242	for _, group := range groups {
243		if len(group.Captures) > 0 {
244			result = append(result, group.Index, group.Index+group.Length)
245		} else {
246			result = append(result, -1, 0)
247		}
248	}
249	return
250}
251
252func (r *regexp2Wrapper) findUnicodeCached(s valueString, start int, doCache bool) (match *regexp2.Match, posMap []int, err error) {
253	var (
254		runes       []rune
255		mappedStart int
256		splitPair   bool
257		savedRune   rune
258	)
259	wrapped := r.rx
260	cache := r.cache
261	if cache != nil && cache.posMap != nil && cache.target.SameAs(s) {
262		runes, posMap = cache.runes, cache.posMap
263		mappedStart, splitPair = posMapReverseLookup(posMap, start)
264	} else {
265		posMap, runes, mappedStart, splitPair = buildPosMap(&lenientUtf16Decoder{utf16Reader: s.utf16Reader(0)}, s.length(), start)
266		cache = nil
267	}
268	if splitPair {
269		// temporarily set the rune at mappedStart to the second code point of the pair
270		_, second := utf16.EncodeRune(runes[mappedStart])
271		savedRune, runes[mappedStart] = runes[mappedStart], second
272	}
273	match, err = wrapped.FindRunesMatchStartingAt(runes, mappedStart)
274	if doCache && match != nil && err == nil {
275		if splitPair {
276			runes[mappedStart] = savedRune
277		}
278		if cache == nil {
279			if r.cache == nil {
280				r.cache = new(regexp2MatchCache)
281			}
282			*r.cache = regexp2MatchCache{
283				target: s,
284				runes:  runes,
285				posMap: posMap,
286			}
287		}
288	} else {
289		r.cache = nil
290	}
291
292	return
293}
294
295func (r *regexp2Wrapper) findSubmatchIndexUnicode(s valueString, start int, doCache bool) (result []int) {
296	match, posMap, err := r.findUnicodeCached(s, start, doCache)
297	if match == nil || err != nil {
298		return
299	}
300
301	groups := match.Groups()
302
303	result = make([]int, 0, len(groups)<<1)
304	for _, group := range groups {
305		if len(group.Captures) > 0 {
306			result = append(result, posMap[group.Index], posMap[group.Index+group.Length])
307		} else {
308			result = append(result, -1, 0)
309		}
310	}
311	return
312}
313
314func (r *regexp2Wrapper) findAllSubmatchIndexUTF16(s valueString, start, limit int, sticky bool) [][]int {
315	wrapped := r.rx
316	match, runes, err := r.findUTF16Cached(s, start, false)
317	if match == nil || err != nil {
318		return nil
319	}
320	if limit < 0 {
321		limit = len(runes) + 1
322	}
323	results := make([][]int, 0, limit)
324	for match != nil {
325		groups := match.Groups()
326
327		result := make([]int, 0, len(groups)<<1)
328
329		for _, group := range groups {
330			if len(group.Captures) > 0 {
331				startPos := group.Index
332				endPos := group.Index + group.Length
333				result = append(result, startPos, endPos)
334			} else {
335				result = append(result, -1, 0)
336			}
337		}
338
339		if sticky && len(result) > 1 {
340			if result[0] != start {
341				break
342			}
343			start = result[1]
344		}
345
346		results = append(results, result)
347		limit--
348		if limit <= 0 {
349			break
350		}
351		match, err = wrapped.FindNextMatch(match)
352		if err != nil {
353			return nil
354		}
355	}
356	return results
357}
358
// buildPosMap drains rd and returns:
//
//   - runes: every rune read, in order;
//   - posMap: for each rune index the source position at which that rune
//     starts (the running sum of the sizes reported by rd), followed by one
//     final entry holding the total size, so len(posMap) == len(runes)+1;
//   - mappedStart: the rune index corresponding to source position start;
//   - splitPair: true when start does not fall on a rune boundary, in which
//     case mappedStart is the index of the rune that contains start.
//
// l is only a capacity hint for the returned slices. Reading stops at the
// first error returned by rd. If start lies beyond the end of the input,
// mappedStart stays 0 and splitPair stays false.
func buildPosMap(rd io.RuneReader, l, start int) (posMap []int, runes []rune, mappedStart int, splitPair bool) {
	posMap = make([]int, 0, l+1)
	runes = make([]rune, 0, l)

	offset := 0
	searching := true // start has not been located yet
	for {
		if searching {
			switch {
			case offset == start:
				mappedStart, searching = len(runes), false
			case offset > start:
				// start position splits a surrogate pair
				mappedStart, splitPair, searching = len(runes)-1, true, false
			}
		}

		rn, width, err := rd.ReadRune()
		if err != nil {
			posMap = append(posMap, offset)
			return posMap, runes, mappedStart, splitPair
		}
		posMap = append(posMap, offset)
		runes = append(runes, rn)
		offset += width
	}
}
388
// posMapReverseLookup finds the index in the ascending posMap whose value is
// pos. When pos is present, or is greater than every entry, it returns the
// binary-search insertion index and false. When pos falls strictly between
// entries (or before the first one) it returns the index just before the
// insertion point and true, meaning pos lies inside the element that starts
// at that index.
func posMapReverseLookup(posMap []int, pos int) (int, bool) {
	idx := sort.SearchInts(posMap, pos)
	if idx >= len(posMap) || posMap[idx] == pos {
		return idx, false
	}
	return idx - 1, true
}
396
397func (r *regexp2Wrapper) findAllSubmatchIndexUnicode(s unicodeString, start, limit int, sticky bool) [][]int {
398	wrapped := r.rx
399	if limit < 0 {
400		limit = len(s) + 1
401	}
402	results := make([][]int, 0, limit)
403	match, posMap, err := r.findUnicodeCached(s, start, false)
404	if err != nil {
405		return nil
406	}
407	for match != nil {
408		groups := match.Groups()
409
410		result := make([]int, 0, len(groups)<<1)
411
412		for _, group := range groups {
413			if len(group.Captures) > 0 {
414				start := posMap[group.Index]
415				end := posMap[group.Index+group.Length]
416				result = append(result, start, end)
417			} else {
418				result = append(result, -1, 0)
419			}
420		}
421
422		if sticky && len(result) > 1 {
423			if result[0] != start {
424				break
425			}
426			start = result[1]
427		}
428
429		results = append(results, result)
430		match, err = wrapped.FindNextMatch(match)
431		if err != nil {
432			return nil
433		}
434	}
435	return results
436}
437
438func (r *regexp2Wrapper) findAllSubmatchIndex(s valueString, start, limit int, sticky, fullUnicode bool) [][]int {
439	switch s := s.(type) {
440	case asciiString:
441		return r.findAllSubmatchIndexUTF16(s, start, limit, sticky)
442	case unicodeString:
443		if fullUnicode {
444			return r.findAllSubmatchIndexUnicode(s, start, limit, sticky)
445		}
446		return r.findAllSubmatchIndexUTF16(s, start, limit, sticky)
447	default:
448		panic("Unsupported string type")
449	}
450}
451
452func (r *regexp2Wrapper) clone() *regexp2Wrapper {
453	return &regexp2Wrapper{
454		rx: r.rx,
455	}
456}
457
458func (r *regexpWrapper) findAllSubmatchIndex(s string, limit int, sticky bool) (results [][]int) {
459	wrapped := (*regexp.Regexp)(r)
460	results = wrapped.FindAllStringSubmatchIndex(s, limit)
461	pos := 0
462	if sticky {
463		for i, result := range results {
464			if len(result) > 1 {
465				if result[0] != pos {
466					return results[:i]
467				}
468				pos = result[1]
469			}
470		}
471	}
472	return
473}
474
475func (r *regexpWrapper) findSubmatchIndex(s valueString, fullUnicode bool) []int {
476	switch s := s.(type) {
477	case asciiString:
478		return r.findSubmatchIndexASCII(string(s))
479	case unicodeString:
480		return r.findSubmatchIndexUnicode(s, fullUnicode)
481	default:
482		panic("Unsupported string type")
483	}
484}
485
486func (r *regexpWrapper) findSubmatchIndexASCII(s string) []int {
487	wrapped := (*regexp.Regexp)(r)
488	return wrapped.FindStringSubmatchIndex(s)
489}
490
491func (r *regexpWrapper) findSubmatchIndexUnicode(s unicodeString, fullUnicode bool) (result []int) {
492	wrapped := (*regexp.Regexp)(r)
493	if fullUnicode {
494		posMap, runes, _, _ := buildPosMap(&lenientUtf16Decoder{utf16Reader: s.utf16Reader(0)}, s.length(), 0)
495		res := wrapped.FindReaderSubmatchIndex(&arrayRuneReader{runes: runes})
496		for i, item := range res {
497			if item >= 0 {
498				res[i] = posMap[item]
499			}
500		}
501		return res
502	}
503	return wrapped.FindReaderSubmatchIndex(s.utf16Reader(0))
504}
505
// clone returns the receiver itself; no copy is made.
// NOTE(review): presumably sufficient because the standard library documents
// *regexp.Regexp as safe for concurrent use — confirm.
func (r *regexpWrapper) clone() *regexpWrapper {
	return r
}
509
510func (r *regexpObject) execResultToArray(target valueString, result []int) Value {
511	captureCount := len(result) >> 1
512	valueArray := make([]Value, captureCount)
513	matchIndex := result[0]
514	lowerBound := matchIndex
515	for index := 0; index < captureCount; index++ {
516		offset := index << 1
517		if result[offset] >= lowerBound {
518			valueArray[index] = target.substring(result[offset], result[offset+1])
519			lowerBound = result[offset]
520		} else {
521			valueArray[index] = _undefined
522		}
523	}
524	match := r.val.runtime.newArrayValues(valueArray)
525	match.self.setOwnStr("input", target, false)
526	match.self.setOwnStr("index", intToValue(int64(matchIndex)), false)
527	return match
528}
529
530func (r *regexpObject) getLastIndex() int64 {
531	lastIndex := toLength(r.getStr("lastIndex", nil))
532	if !r.pattern.global && !r.pattern.sticky {
533		return 0
534	}
535	return lastIndex
536}
537
538func (r *regexpObject) updateLastIndex(index int64, firstResult, lastResult []int) bool {
539	if r.pattern.sticky {
540		if firstResult == nil || int64(firstResult[0]) != index {
541			r.setOwnStr("lastIndex", intToValue(0), true)
542			return false
543		}
544	} else {
545		if firstResult == nil {
546			if r.pattern.global {
547				r.setOwnStr("lastIndex", intToValue(0), true)
548			}
549			return false
550		}
551	}
552
553	if r.pattern.global || r.pattern.sticky {
554		r.setOwnStr("lastIndex", intToValue(int64(lastResult[1])), true)
555	}
556	return true
557}
558
559func (r *regexpObject) execRegexp(target valueString) (match bool, result []int) {
560	index := r.getLastIndex()
561	if index >= 0 && index <= int64(target.length()) {
562		result = r.pattern.findSubmatchIndex(target, int(index))
563	}
564	match = r.updateLastIndex(index, result, result)
565	return
566}
567
568func (r *regexpObject) exec(target valueString) Value {
569	match, result := r.execRegexp(target)
570	if match {
571		return r.execResultToArray(target, result)
572	}
573	return _null
574}
575
576func (r *regexpObject) test(target valueString) bool {
577	match, _ := r.execRegexp(target)
578	return match
579}
580
581func (r *regexpObject) clone() *regexpObject {
582	r1 := r.val.runtime.newRegexpObject(r.prototype)
583	r1.source = r.source
584	r1.pattern = r.pattern
585
586	return r1
587}
588
// init initialises the embedded base object, marks the regexp object as
// standard and defines its "lastIndex" property with an initial value of 0.
func (r *regexpObject) init() {
	r.baseObject.init()
	r.standard = true
	// NOTE(review): the three flags are presumably writable=true,
	// enumerable=false, configurable=false — confirm against _putProp.
	r._putProp("lastIndex", intToValue(0), true, false, false)
}
594
595func (r *regexpObject) setProto(proto *Object, throw bool) bool {
596	res := r.baseObject.setProto(proto, throw)
597	if res {
598		r.standard = false
599	}
600	return res
601}
602
603func (r *regexpObject) defineOwnPropertyStr(name unistring.String, desc PropertyDescriptor, throw bool) bool {
604	res := r.baseObject.defineOwnPropertyStr(name, desc, throw)
605	if res {
606		r.standard = false
607	}
608	return res
609}
610
611func (r *regexpObject) defineOwnPropertySym(name *Symbol, desc PropertyDescriptor, throw bool) bool {
612	res := r.baseObject.defineOwnPropertySym(name, desc, throw)
613	if res && r.standard {
614		switch name {
615		case SymMatch, SymMatchAll, SymSearch, SymSplit, SymReplace:
616			r.standard = false
617		}
618	}
619	return res
620}
621
622func (r *regexpObject) deleteStr(name unistring.String, throw bool) bool {
623	res := r.baseObject.deleteStr(name, throw)
624	if res {
625		r.standard = false
626	}
627	return res
628}
629
630func (r *regexpObject) setOwnStr(name unistring.String, value Value, throw bool) bool {
631	res := r.baseObject.setOwnStr(name, value, throw)
632	if res && r.standard && name == "exec" {
633		r.standard = false
634	}
635	return res
636}
637
638func (r *regexpObject) setOwnSym(name *Symbol, value Value, throw bool) bool {
639	res := r.baseObject.setOwnSym(name, value, throw)
640	if res && r.standard {
641		switch name {
642		case SymMatch, SymMatchAll, SymSearch, SymSplit, SymReplace:
643			r.standard = false
644		}
645	}
646	return res
647}
648