1// Copyright 2015 Huan Du. All rights reserved.
2// Licensed under the MIT license that can be found in the LICENSE file.
3
4package xstrings
5
6import (
7	"unicode"
8	"unicode/utf8"
9)
10
11type runeRangeMap struct {
12	FromLo rune // Lower bound of range map.
13	FromHi rune // An inclusive higher bound of range map.
14	ToLo   rune
15	ToHi   rune
16}
17
18type runeDict struct {
19	Dict [unicode.MaxASCII + 1]rune
20}
21
22type runeMap map[rune]rune
23
24// Translator can translate string with pre-compiled from and to patterns.
25// If a from/to pattern pair needs to be used more than once, it's recommended
26// to create a Translator and reuse it.
27type Translator struct {
28	quickDict  *runeDict       // A quick dictionary to look up rune by index. Only available for latin runes.
29	runeMap    runeMap         // Rune map for translation.
30	ranges     []*runeRangeMap // Ranges of runes.
31	mappedRune rune            // If mappedRune >= 0, all matched runes are translated to the mappedRune.
32	reverted   bool            // If to pattern is empty, all matched characters will be deleted.
33	hasPattern bool
34}
35
36// NewTranslator creates new Translator through a from/to pattern pair.
37func NewTranslator(from, to string) *Translator {
38	tr := &Translator{}
39
40	if from == "" {
41		return tr
42	}
43
44	reverted := from[0] == '^'
45	deletion := len(to) == 0
46
47	if reverted {
48		from = from[1:]
49	}
50
51	var fromStart, fromEnd, fromRangeStep rune
52	var toStart, toEnd, toRangeStep rune
53	var fromRangeSize, toRangeSize rune
54	var singleRunes []rune
55
56	// Update the to rune range.
57	updateRange := func() {
58		// No more rune to read in the to rune pattern.
59		if toEnd == utf8.RuneError {
60			return
61		}
62
63		if toRangeStep == 0 {
64			to, toStart, toEnd, toRangeStep = nextRuneRange(to, toEnd)
65			return
66		}
67
68		// Current range is not empty. Consume 1 rune from start.
69		if toStart != toEnd {
70			toStart += toRangeStep
71			return
72		}
73
74		// No more rune. Repeat the last rune.
75		if to == "" {
76			toEnd = utf8.RuneError
77			return
78		}
79
80		// Both start and end are used. Read two more runes from the to pattern.
81		to, toStart, toEnd, toRangeStep = nextRuneRange(to, utf8.RuneError)
82	}
83
84	if deletion {
85		toStart = utf8.RuneError
86		toEnd = utf8.RuneError
87	} else {
88		// If from pattern is reverted, only the last rune in the to pattern will be used.
89		if reverted {
90			var size int
91
92			for len(to) > 0 {
93				toStart, size = utf8.DecodeRuneInString(to)
94				to = to[size:]
95			}
96
97			toEnd = utf8.RuneError
98		} else {
99			to, toStart, toEnd, toRangeStep = nextRuneRange(to, utf8.RuneError)
100		}
101	}
102
103	fromEnd = utf8.RuneError
104
105	for len(from) > 0 {
106		from, fromStart, fromEnd, fromRangeStep = nextRuneRange(from, fromEnd)
107
108		// fromStart is a single character. Just map it with a rune in the to pattern.
109		if fromRangeStep == 0 {
110			singleRunes = tr.addRune(fromStart, toStart, singleRunes)
111			updateRange()
112			continue
113		}
114
115		for toEnd != utf8.RuneError && fromStart != fromEnd {
116			// If mapped rune is a single character instead of a range, simply shift first
117			// rune in the range.
118			if toRangeStep == 0 {
119				singleRunes = tr.addRune(fromStart, toStart, singleRunes)
120				updateRange()
121				fromStart += fromRangeStep
122				continue
123			}
124
125			fromRangeSize = (fromEnd - fromStart) * fromRangeStep
126			toRangeSize = (toEnd - toStart) * toRangeStep
127
128			// Not enough runes in the to pattern. Need to read more.
129			if fromRangeSize > toRangeSize {
130				fromStart, toStart = tr.addRuneRange(fromStart, fromStart+toRangeSize*fromRangeStep, toStart, toEnd, singleRunes)
131				fromStart += fromRangeStep
132				updateRange()
133
134				// Edge case: If fromRangeSize == toRangeSize + 1, the last fromStart value needs be considered
135				// as a single rune.
136				if fromStart == fromEnd {
137					singleRunes = tr.addRune(fromStart, toStart, singleRunes)
138					updateRange()
139				}
140
141				continue
142			}
143
144			fromStart, toStart = tr.addRuneRange(fromStart, fromEnd, toStart, toStart+fromRangeSize*toRangeStep, singleRunes)
145			updateRange()
146			break
147		}
148
149		if fromStart == fromEnd {
150			fromEnd = utf8.RuneError
151			continue
152		}
153
154		_, toStart = tr.addRuneRange(fromStart, fromEnd, toStart, toStart, singleRunes)
155		fromEnd = utf8.RuneError
156	}
157
158	if fromEnd != utf8.RuneError {
159		tr.addRune(fromEnd, toStart, singleRunes)
160	}
161
162	tr.reverted = reverted
163	tr.mappedRune = -1
164	tr.hasPattern = true
165
166	// Translate RuneError only if in deletion or reverted mode.
167	if deletion || reverted {
168		tr.mappedRune = toStart
169	}
170
171	return tr
172}
173
174func (tr *Translator) addRune(from, to rune, singleRunes []rune) []rune {
175	if from <= unicode.MaxASCII {
176		if tr.quickDict == nil {
177			tr.quickDict = &runeDict{}
178		}
179
180		tr.quickDict.Dict[from] = to
181	} else {
182		if tr.runeMap == nil {
183			tr.runeMap = make(runeMap)
184		}
185
186		tr.runeMap[from] = to
187	}
188
189	singleRunes = append(singleRunes, from)
190	return singleRunes
191}
192
193func (tr *Translator) addRuneRange(fromLo, fromHi, toLo, toHi rune, singleRunes []rune) (rune, rune) {
194	var r rune
195	var rrm *runeRangeMap
196
197	if fromLo < fromHi {
198		rrm = &runeRangeMap{
199			FromLo: fromLo,
200			FromHi: fromHi,
201			ToLo:   toLo,
202			ToHi:   toHi,
203		}
204	} else {
205		rrm = &runeRangeMap{
206			FromLo: fromHi,
207			FromHi: fromLo,
208			ToLo:   toHi,
209			ToHi:   toLo,
210		}
211	}
212
213	// If there is any single rune conflicts with this rune range, clear single rune record.
214	for _, r = range singleRunes {
215		if rrm.FromLo <= r && r <= rrm.FromHi {
216			if r <= unicode.MaxASCII {
217				tr.quickDict.Dict[r] = 0
218			} else {
219				delete(tr.runeMap, r)
220			}
221		}
222	}
223
224	tr.ranges = append(tr.ranges, rrm)
225	return fromHi, toHi
226}
227
228func nextRuneRange(str string, last rune) (remaining string, start, end rune, rangeStep rune) {
229	var r rune
230	var size int
231
232	remaining = str
233	escaping := false
234	isRange := false
235
236	for len(remaining) > 0 {
237		r, size = utf8.DecodeRuneInString(remaining)
238		remaining = remaining[size:]
239
240		// Parse special characters.
241		if !escaping {
242			if r == '\\' {
243				escaping = true
244				continue
245			}
246
247			if r == '-' {
248				// Ignore slash at beginning of string.
249				if last == utf8.RuneError {
250					continue
251				}
252
253				start = last
254				isRange = true
255				continue
256			}
257		}
258
259		escaping = false
260
261		if last != utf8.RuneError {
262			// This is a range which start and end are the same.
263			// Considier it as a normal character.
264			if isRange && last == r {
265				isRange = false
266				continue
267			}
268
269			start = last
270			end = r
271
272			if isRange {
273				if start < end {
274					rangeStep = 1
275				} else {
276					rangeStep = -1
277				}
278			}
279
280			return
281		}
282
283		last = r
284	}
285
286	start = last
287	end = utf8.RuneError
288	return
289}
290
291// Translate str with a from/to pattern pair.
292//
293// See comment in Translate function for usage and samples.
294func (tr *Translator) Translate(str string) string {
295	if !tr.hasPattern || str == "" {
296		return str
297	}
298
299	var r rune
300	var size int
301	var needTr bool
302
303	orig := str
304
305	var output *stringBuilder
306
307	for len(str) > 0 {
308		r, size = utf8.DecodeRuneInString(str)
309		r, needTr = tr.TranslateRune(r)
310
311		if needTr && output == nil {
312			output = allocBuffer(orig, str)
313		}
314
315		if r != utf8.RuneError && output != nil {
316			output.WriteRune(r)
317		}
318
319		str = str[size:]
320	}
321
322	// No character is translated.
323	if output == nil {
324		return orig
325	}
326
327	return output.String()
328}
329
330// TranslateRune return translated rune and true if r matches the from pattern.
331// If r doesn't match the pattern, original r is returned and translated is false.
332func (tr *Translator) TranslateRune(r rune) (result rune, translated bool) {
333	switch {
334	case tr.quickDict != nil:
335		if r <= unicode.MaxASCII {
336			result = tr.quickDict.Dict[r]
337
338			if result != 0 {
339				translated = true
340
341				if tr.mappedRune >= 0 {
342					result = tr.mappedRune
343				}
344
345				break
346			}
347		}
348
349		fallthrough
350
351	case tr.runeMap != nil:
352		var ok bool
353
354		if result, ok = tr.runeMap[r]; ok {
355			translated = true
356
357			if tr.mappedRune >= 0 {
358				result = tr.mappedRune
359			}
360
361			break
362		}
363
364		fallthrough
365
366	default:
367		var rrm *runeRangeMap
368		ranges := tr.ranges
369
370		for i := len(ranges) - 1; i >= 0; i-- {
371			rrm = ranges[i]
372
373			if rrm.FromLo <= r && r <= rrm.FromHi {
374				translated = true
375
376				if tr.mappedRune >= 0 {
377					result = tr.mappedRune
378					break
379				}
380
381				if rrm.ToLo < rrm.ToHi {
382					result = rrm.ToLo + r - rrm.FromLo
383				} else if rrm.ToLo > rrm.ToHi {
384					// ToHi can be smaller than ToLo if range is from higher to lower.
385					result = rrm.ToLo - r + rrm.FromLo
386				} else {
387					result = rrm.ToLo
388				}
389
390				break
391			}
392		}
393	}
394
395	if tr.reverted {
396		if !translated {
397			result = tr.mappedRune
398		}
399
400		translated = !translated
401	}
402
403	if !translated {
404		result = r
405	}
406
407	return
408}
409
410// HasPattern returns true if Translator has one pattern at least.
411func (tr *Translator) HasPattern() bool {
412	return tr.hasPattern
413}
414
415// Translate str with the characters defined in from replaced by characters defined in to.
416//
417// From and to are patterns representing a set of characters. Pattern is defined as following.
418//
419//     * Special characters
420//       * '-' means a range of runes, e.g.
421//         * "a-z" means all characters from 'a' to 'z' inclusive;
422//         * "z-a" means all characters from 'z' to 'a' inclusive.
423//       * '^' as first character means a set of all runes excepted listed, e.g.
424//         * "^a-z" means all characters except 'a' to 'z' inclusive.
425//       * '\' escapes special characters.
426//     * Normal character represents itself, e.g. "abc" is a set including 'a', 'b' and 'c'.
427//
428// Translate will try to find a 1:1 mapping from from to to.
429// If to is smaller than from, last rune in to will be used to map "out of range" characters in from.
430//
431// Note that '^' only works in the from pattern. It will be considered as a normal character in the to pattern.
432//
433// If the to pattern is an empty string, Translate works exactly the same as Delete.
434//
435// Samples:
436//     Translate("hello", "aeiou", "12345")    => "h2ll4"
437//     Translate("hello", "a-z", "A-Z")        => "HELLO"
438//     Translate("hello", "z-a", "a-z")        => "svool"
439//     Translate("hello", "aeiou", "*")        => "h*ll*"
440//     Translate("hello", "^l", "*")           => "**ll*"
441//     Translate("hello ^ world", `\^lo`, "*") => "he*** * w*r*d"
442func Translate(str, from, to string) string {
443	tr := NewTranslator(from, to)
444	return tr.Translate(str)
445}
446
447// Delete runes in str matching the pattern.
448// Pattern is defined in Translate function.
449//
450// Samples:
451//     Delete("hello", "aeiou") => "hll"
452//     Delete("hello", "a-k")   => "llo"
453//     Delete("hello", "^a-k")  => "he"
454func Delete(str, pattern string) string {
455	tr := NewTranslator(pattern, "")
456	return tr.Translate(str)
457}
458
459// Count how many runes in str match the pattern.
460// Pattern is defined in Translate function.
461//
462// Samples:
463//     Count("hello", "aeiou") => 3
464//     Count("hello", "a-k")   => 3
465//     Count("hello", "^a-k")  => 2
466func Count(str, pattern string) int {
467	if pattern == "" || str == "" {
468		return 0
469	}
470
471	var r rune
472	var size int
473	var matched bool
474
475	tr := NewTranslator(pattern, "")
476	cnt := 0
477
478	for len(str) > 0 {
479		r, size = utf8.DecodeRuneInString(str)
480		str = str[size:]
481
482		if _, matched = tr.TranslateRune(r); matched {
483			cnt++
484		}
485	}
486
487	return cnt
488}
489
490// Squeeze deletes adjacent repeated runes in str.
491// If pattern is not empty, only runes matching the pattern will be squeezed.
492//
493// Samples:
494//     Squeeze("hello", "")             => "helo"
495//     Squeeze("hello", "m-z")          => "hello"
496//     Squeeze("hello   world", " ")    => "hello world"
497func Squeeze(str, pattern string) string {
498	var last, r rune
499	var size int
500	var skipSqueeze, matched bool
501	var tr *Translator
502	var output *stringBuilder
503
504	orig := str
505	last = -1
506
507	if len(pattern) > 0 {
508		tr = NewTranslator(pattern, "")
509	}
510
511	for len(str) > 0 {
512		r, size = utf8.DecodeRuneInString(str)
513
514		// Need to squeeze the str.
515		if last == r && !skipSqueeze {
516			if tr != nil {
517				if _, matched = tr.TranslateRune(r); !matched {
518					skipSqueeze = true
519				}
520			}
521
522			if output == nil {
523				output = allocBuffer(orig, str)
524			}
525
526			if skipSqueeze {
527				output.WriteRune(r)
528			}
529		} else {
530			if output != nil {
531				output.WriteRune(r)
532			}
533
534			last = r
535			skipSqueeze = false
536		}
537
538		str = str[size:]
539	}
540
541	if output == nil {
542		return orig
543	}
544
545	return output.String()
546}
547