1package uniseg
2
3import "unicode/utf8"
4
5// The states of the grapheme cluster parser.
6const (
7	grAny = iota
8	grCR
9	grControlLF
10	grL
11	grLVV
12	grLVTT
13	grPrepend
14	grExtendedPictographic
15	grExtendedPictographicZWJ
16	grRIOdd
17	grRIEven
18)
19
20// The grapheme cluster parser's breaking instructions.
21const (
22	grNoBoundary = iota
23	grBoundary
24)
25
26// The grapheme cluster parser's state transitions. Maps (state, property) to
27// (new state, breaking instruction, rule number). The breaking instruction
28// always refers to the boundary between the last and next code point.
29//
30// This map is queried as follows:
31//
32//   1. Find specific state + specific property. Stop if found.
33//   2. Find specific state + any property.
34//   3. Find any state + specific property.
35//   4. If only (2) or (3) (but not both) was found, stop.
36//   5. If both (2) and (3) were found, use state and breaking instruction from
37//      the transition with the lower rule number, prefer (3) if rule numbers
38//      are equal. Stop.
39//   6. Assume grAny and grBoundary.
40var grTransitions = map[[2]int][3]int{
41	// GB5
42	{grAny, prCR}:      {grCR, grBoundary, 50},
43	{grAny, prLF}:      {grControlLF, grBoundary, 50},
44	{grAny, prControl}: {grControlLF, grBoundary, 50},
45
46	// GB4
47	{grCR, prAny}:        {grAny, grBoundary, 40},
48	{grControlLF, prAny}: {grAny, grBoundary, 40},
49
50	// GB3.
51	{grCR, prLF}: {grAny, grNoBoundary, 30},
52
53	// GB6.
54	{grAny, prL}: {grL, grBoundary, 9990},
55	{grL, prL}:   {grL, grNoBoundary, 60},
56	{grL, prV}:   {grLVV, grNoBoundary, 60},
57	{grL, prLV}:  {grLVV, grNoBoundary, 60},
58	{grL, prLVT}: {grLVTT, grNoBoundary, 60},
59
60	// GB7.
61	{grAny, prLV}: {grLVV, grBoundary, 9990},
62	{grAny, prV}:  {grLVV, grBoundary, 9990},
63	{grLVV, prV}:  {grLVV, grNoBoundary, 70},
64	{grLVV, prT}:  {grLVTT, grNoBoundary, 70},
65
66	// GB8.
67	{grAny, prLVT}: {grLVTT, grBoundary, 9990},
68	{grAny, prT}:   {grLVTT, grBoundary, 9990},
69	{grLVTT, prT}:  {grLVTT, grNoBoundary, 80},
70
71	// GB9.
72	{grAny, prExtend}: {grAny, grNoBoundary, 90},
73	{grAny, prZWJ}:    {grAny, grNoBoundary, 90},
74
75	// GB9a.
76	{grAny, prSpacingMark}: {grAny, grNoBoundary, 91},
77
78	// GB9b.
79	{grAny, prPreprend}: {grPrepend, grBoundary, 9990},
80	{grPrepend, prAny}:  {grAny, grNoBoundary, 92},
81
82	// GB11.
83	{grAny, prExtendedPictographic}:                     {grExtendedPictographic, grBoundary, 9990},
84	{grExtendedPictographic, prExtend}:                  {grExtendedPictographic, grNoBoundary, 110},
85	{grExtendedPictographic, prZWJ}:                     {grExtendedPictographicZWJ, grNoBoundary, 110},
86	{grExtendedPictographicZWJ, prExtendedPictographic}: {grExtendedPictographic, grNoBoundary, 110},
87
88	// GB12 / GB13.
89	{grAny, prRegionalIndicator}:    {grRIOdd, grBoundary, 9990},
90	{grRIOdd, prRegionalIndicator}:  {grRIEven, grNoBoundary, 120},
91	{grRIEven, prRegionalIndicator}: {grRIOdd, grBoundary, 120},
92}
93
94// Graphemes implements an iterator over Unicode extended grapheme clusters,
95// specified in the Unicode Standard Annex #29. Grapheme clusters correspond to
96// "user-perceived characters". These characters often consist of multiple
97// code points (e.g. the "woman kissing woman" emoji consists of 8 code points:
98// woman + ZWJ + heavy black heart (2 code points) + ZWJ + kiss mark + ZWJ +
99// woman) and the rules described in Annex #29 must be applied to group those
100// code points into clusters perceived by the user as one character.
101type Graphemes struct {
102	// The code points over which this class iterates.
103	codePoints []rune
104
105	// The (byte-based) indices of the code points into the original string plus
106	// len(original string). Thus, len(indices) = len(codePoints) + 1.
107	indices []int
108
109	// The current grapheme cluster to be returned. These are indices into
110	// codePoints/indices. If start == end, we either haven't started iterating
111	// yet (0) or the iteration has already completed (1).
112	start, end int
113
114	// The index of the next code point to be parsed.
115	pos int
116
117	// The current state of the code point parser.
118	state int
119}
120
121// NewGraphemes returns a new grapheme cluster iterator.
122func NewGraphemes(s string) *Graphemes {
123	l := utf8.RuneCountInString(s)
124	codePoints := make([]rune, l)
125	indices := make([]int, l+1)
126	i := 0
127	for pos, r := range s {
128		codePoints[i] = r
129		indices[i] = pos
130		i++
131	}
132	indices[l] = len(s)
133	g := &Graphemes{
134		codePoints: codePoints,
135		indices:    indices,
136	}
137	g.Next() // Parse ahead.
138	return g
139}
140
141// Next advances the iterator by one grapheme cluster and returns false if no
142// clusters are left. This function must be called before the first cluster is
143// accessed.
144func (g *Graphemes) Next() bool {
145	g.start = g.end
146
147	// The state transition gives us a boundary instruction BEFORE the next code
148	// point so we always need to stay ahead by one code point.
149
150	// Parse the next code point.
151	for g.pos <= len(g.codePoints) {
152		// GB2.
153		if g.pos == len(g.codePoints) {
154			g.end = g.pos
155			g.pos++
156			break
157		}
158
159		// Determine the property of the next character.
160		nextProperty := property(g.codePoints[g.pos])
161		g.pos++
162
163		// Find the applicable transition.
164		var boundary bool
165		transition, ok := grTransitions[[2]int{g.state, nextProperty}]
166		if ok {
167			// We have a specific transition. We'll use it.
168			g.state = transition[0]
169			boundary = transition[1] == grBoundary
170		} else {
171			// No specific transition found. Try the less specific ones.
172			transAnyProp, okAnyProp := grTransitions[[2]int{g.state, prAny}]
173			transAnyState, okAnyState := grTransitions[[2]int{grAny, nextProperty}]
174			if okAnyProp && okAnyState {
175				// Both apply. We'll use a mix (see comments for grTransitions).
176				g.state = transAnyState[0]
177				boundary = transAnyState[1] == grBoundary
178				if transAnyProp[2] < transAnyState[2] {
179					g.state = transAnyProp[0]
180					boundary = transAnyProp[1] == grBoundary
181				}
182			} else if okAnyProp {
183				// We only have a specific state.
184				g.state = transAnyProp[0]
185				boundary = transAnyProp[1] == grBoundary
186				// This branch will probably never be reached because okAnyState will
187				// always be true given the current transition map. But we keep it here
188				// for future modifications to the transition map where this may not be
189				// true anymore.
190			} else if okAnyState {
191				// We only have a specific property.
192				g.state = transAnyState[0]
193				boundary = transAnyState[1] == grBoundary
194			} else {
195				// No known transition. GB999: Any x Any.
196				g.state = grAny
197				boundary = true
198			}
199		}
200
201		// If we found a cluster boundary, let's stop here. The current cluster will
202		// be the one that just ended.
203		if g.pos-1 == 0 /* GB1 */ || boundary {
204			g.end = g.pos - 1
205			break
206		}
207	}
208
209	return g.start != g.end
210}
211
212// Runes returns a slice of runes (code points) which corresponds to the current
213// grapheme cluster. If the iterator is already past the end or Next() has not
214// yet been called, nil is returned.
215func (g *Graphemes) Runes() []rune {
216	if g.start == g.end {
217		return nil
218	}
219	return g.codePoints[g.start:g.end]
220}
221
222// Str returns a substring of the original string which corresponds to the
223// current grapheme cluster. If the iterator is already past the end or Next()
224// has not yet been called, an empty string is returned.
225func (g *Graphemes) Str() string {
226	if g.start == g.end {
227		return ""
228	}
229	return string(g.codePoints[g.start:g.end])
230}
231
232// Bytes returns a byte slice which corresponds to the current grapheme cluster.
233// If the iterator is already past the end or Next() has not yet been called,
234// nil is returned.
235func (g *Graphemes) Bytes() []byte {
236	if g.start == g.end {
237		return nil
238	}
239	return []byte(string(g.codePoints[g.start:g.end]))
240}
241
242// Positions returns the interval of the current grapheme cluster as byte
243// positions into the original string. The first returned value "from" indexes
244// the first byte and the second returned value "to" indexes the first byte that
245// is not included anymore, i.e. str[from:to] is the current grapheme cluster of
246// the original string "str". If Next() has not yet been called, both values are
247// 0. If the iterator is already past the end, both values are 1.
248func (g *Graphemes) Positions() (int, int) {
249	return g.indices[g.start], g.indices[g.end]
250}
251
252// Reset puts the iterator into its initial state such that the next call to
253// Next() sets it to the first grapheme cluster again.
254func (g *Graphemes) Reset() {
255	g.start, g.end, g.pos, g.state = 0, 0, 0, grAny
256	g.Next() // Parse ahead again.
257}
258
259// GraphemeClusterCount returns the number of user-perceived characters
260// (grapheme clusters) for the given string. To calculate this number, it
261// iterates through the string using the Graphemes iterator.
262func GraphemeClusterCount(s string) (n int) {
263	g := NewGraphemes(s)
264	for g.Next() {
265		n++
266	}
267	return
268}
269