1// Copyright 2015 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package colltab
6
7// An Iter incrementally converts chunks of the input text to collation
8// elements, while ensuring that the collation elements are in normalized order
9// (that is, they are in the order as if the input text were normalized first).
10type Iter struct {
11	Weighter Weighter
12	Elems    []Elem
13	// N is the number of elements in Elems that will not be reordered on
14	// subsequent iterations, N <= len(Elems).
15	N int
16
17	bytes []byte
18	str   string
19	// Because the Elems buffer may contain collation elements that are needed
20	// for look-ahead, we need two positions in the text (bytes or str): one for
21	// the end position in the text for the current iteration and one for the
22	// start of the next call to appendNext.
23	pEnd  int // end position in text corresponding to N.
24	pNext int // pEnd <= pNext.
25}
26
27// Reset sets the position in the current input text to p and discards any
28// results obtained so far.
29func (i *Iter) Reset(p int) {
30	i.Elems = i.Elems[:0]
31	i.N = 0
32	i.pEnd = p
33	i.pNext = p
34}
35
36// Len returns the length of the input text.
37func (i *Iter) Len() int {
38	if i.bytes != nil {
39		return len(i.bytes)
40	}
41	return len(i.str)
42}
43
44// Discard removes the collation elements up to N.
45func (i *Iter) Discard() {
46	// TODO: change this such that only modifiers following starters will have
47	// to be copied.
48	i.Elems = i.Elems[:copy(i.Elems, i.Elems[i.N:])]
49	i.N = 0
50}
51
52// End returns the end position of the input text for which Next has returned
53// results.
54func (i *Iter) End() int {
55	return i.pEnd
56}
57
58// SetInput resets i to input s.
59func (i *Iter) SetInput(s []byte) {
60	i.bytes = s
61	i.str = ""
62	i.Reset(0)
63}
64
65// SetInputString resets i to input s.
66func (i *Iter) SetInputString(s string) {
67	i.str = s
68	i.bytes = nil
69	i.Reset(0)
70}
71
72func (i *Iter) done() bool {
73	return i.pNext >= len(i.str) && i.pNext >= len(i.bytes)
74}
75
76func (i *Iter) appendNext() bool {
77	if i.done() {
78		return false
79	}
80	var sz int
81	if i.bytes == nil {
82		i.Elems, sz = i.Weighter.AppendNextString(i.Elems, i.str[i.pNext:])
83	} else {
84		i.Elems, sz = i.Weighter.AppendNext(i.Elems, i.bytes[i.pNext:])
85	}
86	if sz == 0 {
87		sz = 1
88	}
89	i.pNext += sz
90	return true
91}
92
93// Next appends Elems to the internal array. On each iteration, it will either
94// add starters or modifiers. In the majority of cases, an Elem with a primary
95// value > 0 will have a CCC of 0. The CCC values of collation elements are also
96// used to detect if the input string was not normalized and to adjust the
97// result accordingly.
98func (i *Iter) Next() bool {
99	if i.N == len(i.Elems) && !i.appendNext() {
100		return false
101	}
102
103	// Check if the current segment starts with a starter.
104	prevCCC := i.Elems[len(i.Elems)-1].CCC()
105	if prevCCC == 0 {
106		i.N = len(i.Elems)
107		i.pEnd = i.pNext
108		return true
109	} else if i.Elems[i.N].CCC() == 0 {
110		// set i.N to only cover part of i.Elems for which prevCCC == 0 and
111		// use rest for the next call to next.
112		for i.N++; i.N < len(i.Elems) && i.Elems[i.N].CCC() == 0; i.N++ {
113		}
114		i.pEnd = i.pNext
115		return true
116	}
117
118	// The current (partial) segment starts with modifiers. We need to collect
119	// all successive modifiers to ensure that they are normalized.
120	for {
121		p := len(i.Elems)
122		i.pEnd = i.pNext
123		if !i.appendNext() {
124			break
125		}
126
127		if ccc := i.Elems[p].CCC(); ccc == 0 || len(i.Elems)-i.N > maxCombiningCharacters {
128			// Leave the starter for the next iteration. This ensures that we
129			// do not return sequences of collation elements that cross two
130			// segments.
131			//
132			// TODO: handle large number of combining characters by fully
133			// normalizing the input segment before iteration. This ensures
134			// results are consistent across the text repo.
135			i.N = p
136			return true
137		} else if ccc < prevCCC {
138			i.doNorm(p, ccc) // should be rare, never occurs for NFD and FCC.
139		} else {
140			prevCCC = ccc
141		}
142	}
143
144	done := len(i.Elems) != i.N
145	i.N = len(i.Elems)
146	return done
147}
148
149// nextNoNorm is the same as next, but does not "normalize" the collation
150// elements.
151func (i *Iter) nextNoNorm() bool {
152	// TODO: remove this function. Using this instead of next does not seem
153	// to improve performance in any significant way. We retain this until
154	// later for evaluation purposes.
155	if i.done() {
156		return false
157	}
158	i.appendNext()
159	i.N = len(i.Elems)
160	return true
161}
162
163const maxCombiningCharacters = 30
164
165// doNorm reorders the collation elements in i.Elems.
166// It assumes that blocks of collation elements added with appendNext
167// either start and end with the same CCC or start with CCC == 0.
168// This allows for a single insertion point for the entire block.
169// The correctness of this assumption is verified in builder.go.
170func (i *Iter) doNorm(p int, ccc uint8) {
171	n := len(i.Elems)
172	k := p
173	for p--; p > i.N && ccc < i.Elems[p-1].CCC(); p-- {
174	}
175	i.Elems = append(i.Elems, i.Elems[p:k]...)
176	copy(i.Elems[p:], i.Elems[k:])
177	i.Elems = i.Elems[:n]
178}
179