1// Copyright 2009 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Package utf8string provides an efficient way to index strings by rune rather than by byte.
6package utf8string
7
8import (
9	"errors"
10	"unicode/utf8"
11)
12
13// String wraps a regular string with a small structure that provides more
14// efficient indexing by code point index, as opposed to byte index.
15// Scanning incrementally forwards or backwards is O(1) per index operation
16// (although not as fast a range clause going forwards).  Random access is
17// O(N) in the length of the string, but the overhead is less than always
18// scanning from the beginning.
19// If the string is ASCII, random access is O(1).
20// Unlike the built-in string type, String has internal mutable state and
21// is not thread-safe.
22type String struct {
23	str      string
24	numRunes int
25	// If width > 0, the rune at runePos starts at bytePos and has the specified width.
26	width    int
27	bytePos  int
28	runePos  int
29	nonASCII int // byte index of the first non-ASCII rune.
30}
31
32// NewString returns a new UTF-8 string with the provided contents.
33func NewString(contents string) *String {
34	return new(String).Init(contents)
35}
36
37// Init initializes an existing String to hold the provided contents.
38// It returns a pointer to the initialized String.
39func (s *String) Init(contents string) *String {
40	s.str = contents
41	s.bytePos = 0
42	s.runePos = 0
43	for i := 0; i < len(contents); i++ {
44		if contents[i] >= utf8.RuneSelf {
45			// Not ASCII.
46			s.numRunes = utf8.RuneCountInString(contents)
47			_, s.width = utf8.DecodeRuneInString(contents)
48			s.nonASCII = i
49			return s
50		}
51	}
52	// ASCII is simple.  Also, the empty string is ASCII.
53	s.numRunes = len(contents)
54	s.width = 0
55	s.nonASCII = len(contents)
56	return s
57}
58
59// String returns the contents of the String.  This method also means the
60// String is directly printable by fmt.Print.
61func (s *String) String() string {
62	return s.str
63}
64
65// RuneCount returns the number of runes (Unicode code points) in the String.
66func (s *String) RuneCount() int {
67	return s.numRunes
68}
69
70// IsASCII returns a boolean indicating whether the String contains only ASCII bytes.
71func (s *String) IsASCII() bool {
72	return s.width == 0
73}
74
75// Slice returns the string sliced at rune positions [i:j].
76func (s *String) Slice(i, j int) string {
77	// ASCII is easy.  Let the compiler catch the indexing error if there is one.
78	if j < s.nonASCII {
79		return s.str[i:j]
80	}
81	if i < 0 || j > s.numRunes || i > j {
82		panic(sliceOutOfRange)
83	}
84	if i == j {
85		return ""
86	}
87	// For non-ASCII, after At(i), bytePos is always the position of the indexed character.
88	var low, high int
89	switch {
90	case i < s.nonASCII:
91		low = i
92	case i == s.numRunes:
93		low = len(s.str)
94	default:
95		s.At(i)
96		low = s.bytePos
97	}
98	switch {
99	case j == s.numRunes:
100		high = len(s.str)
101	default:
102		s.At(j)
103		high = s.bytePos
104	}
105	return s.str[low:high]
106}
107
108// At returns the rune with index i in the String.  The sequence of runes is the same
109// as iterating over the contents with a "for range" clause.
110func (s *String) At(i int) rune {
111	// ASCII is easy.  Let the compiler catch the indexing error if there is one.
112	if i < s.nonASCII {
113		return rune(s.str[i])
114	}
115
116	// Now we do need to know the index is valid.
117	if i < 0 || i >= s.numRunes {
118		panic(outOfRange)
119	}
120
121	var r rune
122
123	// Five easy common cases: within 1 spot of bytePos/runePos, or the beginning, or the end.
124	// With these cases, all scans from beginning or end work in O(1) time per rune.
125	switch {
126
127	case i == s.runePos-1: // backing up one rune
128		r, s.width = utf8.DecodeLastRuneInString(s.str[0:s.bytePos])
129		s.runePos = i
130		s.bytePos -= s.width
131		return r
132	case i == s.runePos+1: // moving ahead one rune
133		s.runePos = i
134		s.bytePos += s.width
135		fallthrough
136	case i == s.runePos:
137		r, s.width = utf8.DecodeRuneInString(s.str[s.bytePos:])
138		return r
139	case i == 0: // start of string
140		r, s.width = utf8.DecodeRuneInString(s.str)
141		s.runePos = 0
142		s.bytePos = 0
143		return r
144
145	case i == s.numRunes-1: // last rune in string
146		r, s.width = utf8.DecodeLastRuneInString(s.str)
147		s.runePos = i
148		s.bytePos = len(s.str) - s.width
149		return r
150	}
151
152	// We need to do a linear scan.  There are three places to start from:
153	// 1) The beginning
154	// 2) bytePos/runePos.
155	// 3) The end
156	// Choose the closest in rune count, scanning backwards if necessary.
157	forward := true
158	if i < s.runePos {
159		// Between beginning and pos.  Which is closer?
160		// Since both i and runePos are guaranteed >= nonASCII, that's the
161		// lowest location we need to start from.
162		if i < (s.runePos-s.nonASCII)/2 {
163			// Scan forward from beginning
164			s.bytePos, s.runePos = s.nonASCII, s.nonASCII
165		} else {
166			// Scan backwards from where we are
167			forward = false
168		}
169	} else {
170		// Between pos and end.  Which is closer?
171		if i-s.runePos < (s.numRunes-s.runePos)/2 {
172			// Scan forward from pos
173		} else {
174			// Scan backwards from end
175			s.bytePos, s.runePos = len(s.str), s.numRunes
176			forward = false
177		}
178	}
179	if forward {
180		// TODO: Is it much faster to use a range loop for this scan?
181		for {
182			r, s.width = utf8.DecodeRuneInString(s.str[s.bytePos:])
183			if s.runePos == i {
184				break
185			}
186			s.runePos++
187			s.bytePos += s.width
188		}
189	} else {
190		for {
191			r, s.width = utf8.DecodeLastRuneInString(s.str[0:s.bytePos])
192			s.runePos--
193			s.bytePos -= s.width
194			if s.runePos == i {
195				break
196			}
197		}
198	}
199	return r
200}
201
202var outOfRange = errors.New("utf8.String: index out of range")
203var sliceOutOfRange = errors.New("utf8.String: slice index out of range")
204