1// Copyright 2012 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package colltab
6
7import (
8	"unicode/utf8"
9
10	"golang.org/x/text/unicode/norm"
11)
12
13// Table holds all collation data for a given collation ordering.
14type Table struct {
15	Index Trie // main trie
16
17	// expansion info
18	ExpandElem []uint32
19
20	// contraction info
21	ContractTries  ContractTrieSet
22	ContractElem   []uint32
23	MaxContractLen int
24	VariableTop    uint32
25}
26
27func (t *Table) AppendNext(w []Elem, b []byte) (res []Elem, n int) {
28	return t.appendNext(w, source{bytes: b})
29}
30
31func (t *Table) AppendNextString(w []Elem, s string) (res []Elem, n int) {
32	return t.appendNext(w, source{str: s})
33}
34
35func (t *Table) Start(p int, b []byte) int {
36	// TODO: implement
37	panic("not implemented")
38}
39
40func (t *Table) StartString(p int, s string) int {
41	// TODO: implement
42	panic("not implemented")
43}
44
45func (t *Table) Domain() []string {
46	// TODO: implement
47	panic("not implemented")
48}
49
50func (t *Table) Top() uint32 {
51	return t.VariableTop
52}
53
54type source struct {
55	str   string
56	bytes []byte
57}
58
59func (src *source) lookup(t *Table) (ce Elem, sz int) {
60	if src.bytes == nil {
61		return t.Index.lookupString(src.str)
62	}
63	return t.Index.lookup(src.bytes)
64}
65
66func (src *source) tail(sz int) {
67	if src.bytes == nil {
68		src.str = src.str[sz:]
69	} else {
70		src.bytes = src.bytes[sz:]
71	}
72}
73
74func (src *source) nfd(buf []byte, end int) []byte {
75	if src.bytes == nil {
76		return norm.NFD.AppendString(buf[:0], src.str[:end])
77	}
78	return norm.NFD.Append(buf[:0], src.bytes[:end]...)
79}
80
81func (src *source) rune() (r rune, sz int) {
82	if src.bytes == nil {
83		return utf8.DecodeRuneInString(src.str)
84	}
85	return utf8.DecodeRune(src.bytes)
86}
87
88func (src *source) properties(f norm.Form) norm.Properties {
89	if src.bytes == nil {
90		return f.PropertiesString(src.str)
91	}
92	return f.Properties(src.bytes)
93}
94
95// appendNext appends the weights corresponding to the next rune or
96// contraction in s.  If a contraction is matched to a discontinuous
97// sequence of runes, the weights for the interstitial runes are
98// appended as well.  It returns a new slice that includes the appended
99// weights and the number of bytes consumed from s.
100func (t *Table) appendNext(w []Elem, src source) (res []Elem, n int) {
101	ce, sz := src.lookup(t)
102	tp := ce.ctype()
103	if tp == ceNormal {
104		if ce == 0 {
105			r, _ := src.rune()
106			const (
107				hangulSize  = 3
108				firstHangul = 0xAC00
109				lastHangul  = 0xD7A3
110			)
111			if r >= firstHangul && r <= lastHangul {
112				// TODO: performance can be considerably improved here.
113				n = sz
114				var buf [16]byte // Used for decomposing Hangul.
115				for b := src.nfd(buf[:0], hangulSize); len(b) > 0; b = b[sz:] {
116					ce, sz = t.Index.lookup(b)
117					w = append(w, ce)
118				}
119				return w, n
120			}
121			ce = makeImplicitCE(implicitPrimary(r))
122		}
123		w = append(w, ce)
124	} else if tp == ceExpansionIndex {
125		w = t.appendExpansion(w, ce)
126	} else if tp == ceContractionIndex {
127		n := 0
128		src.tail(sz)
129		if src.bytes == nil {
130			w, n = t.matchContractionString(w, ce, src.str)
131		} else {
132			w, n = t.matchContraction(w, ce, src.bytes)
133		}
134		sz += n
135	} else if tp == ceDecompose {
136		// Decompose using NFKD and replace tertiary weights.
137		t1, t2 := splitDecompose(ce)
138		i := len(w)
139		nfkd := src.properties(norm.NFKD).Decomposition()
140		for p := 0; len(nfkd) > 0; nfkd = nfkd[p:] {
141			w, p = t.appendNext(w, source{bytes: nfkd})
142		}
143		w[i] = w[i].updateTertiary(t1)
144		if i++; i < len(w) {
145			w[i] = w[i].updateTertiary(t2)
146			for i++; i < len(w); i++ {
147				w[i] = w[i].updateTertiary(maxTertiary)
148			}
149		}
150	}
151	return w, sz
152}
153
154func (t *Table) appendExpansion(w []Elem, ce Elem) []Elem {
155	i := splitExpandIndex(ce)
156	n := int(t.ExpandElem[i])
157	i++
158	for _, ce := range t.ExpandElem[i : i+n] {
159		w = append(w, Elem(ce))
160	}
161	return w
162}
163
164func (t *Table) matchContraction(w []Elem, ce Elem, suffix []byte) ([]Elem, int) {
165	index, n, offset := splitContractIndex(ce)
166
167	scan := t.ContractTries.scanner(index, n, suffix)
168	buf := [norm.MaxSegmentSize]byte{}
169	bufp := 0
170	p := scan.scan(0)
171
172	if !scan.done && p < len(suffix) && suffix[p] >= utf8.RuneSelf {
173		// By now we should have filtered most cases.
174		p0 := p
175		bufn := 0
176		rune := norm.NFD.Properties(suffix[p:])
177		p += rune.Size()
178		if rune.LeadCCC() != 0 {
179			prevCC := rune.TrailCCC()
180			// A gap may only occur in the last normalization segment.
181			// This also ensures that len(scan.s) < norm.MaxSegmentSize.
182			if end := norm.NFD.FirstBoundary(suffix[p:]); end != -1 {
183				scan.s = suffix[:p+end]
184			}
185			for p < len(suffix) && !scan.done && suffix[p] >= utf8.RuneSelf {
186				rune = norm.NFD.Properties(suffix[p:])
187				if ccc := rune.LeadCCC(); ccc == 0 || prevCC >= ccc {
188					break
189				}
190				prevCC = rune.TrailCCC()
191				if pp := scan.scan(p); pp != p {
192					// Copy the interstitial runes for later processing.
193					bufn += copy(buf[bufn:], suffix[p0:p])
194					if scan.pindex == pp {
195						bufp = bufn
196					}
197					p, p0 = pp, pp
198				} else {
199					p += rune.Size()
200				}
201			}
202		}
203	}
204	// Append weights for the matched contraction, which may be an expansion.
205	i, n := scan.result()
206	ce = Elem(t.ContractElem[i+offset])
207	if ce.ctype() == ceNormal {
208		w = append(w, ce)
209	} else {
210		w = t.appendExpansion(w, ce)
211	}
212	// Append weights for the runes in the segment not part of the contraction.
213	for b, p := buf[:bufp], 0; len(b) > 0; b = b[p:] {
214		w, p = t.appendNext(w, source{bytes: b})
215	}
216	return w, n
217}
218
219// TODO: unify the two implementations. This is best done after first simplifying
220// the algorithm taking into account the inclusion of both NFC and NFD forms
221// in the table.
222func (t *Table) matchContractionString(w []Elem, ce Elem, suffix string) ([]Elem, int) {
223	index, n, offset := splitContractIndex(ce)
224
225	scan := t.ContractTries.scannerString(index, n, suffix)
226	buf := [norm.MaxSegmentSize]byte{}
227	bufp := 0
228	p := scan.scan(0)
229
230	if !scan.done && p < len(suffix) && suffix[p] >= utf8.RuneSelf {
231		// By now we should have filtered most cases.
232		p0 := p
233		bufn := 0
234		rune := norm.NFD.PropertiesString(suffix[p:])
235		p += rune.Size()
236		if rune.LeadCCC() != 0 {
237			prevCC := rune.TrailCCC()
238			// A gap may only occur in the last normalization segment.
239			// This also ensures that len(scan.s) < norm.MaxSegmentSize.
240			if end := norm.NFD.FirstBoundaryInString(suffix[p:]); end != -1 {
241				scan.s = suffix[:p+end]
242			}
243			for p < len(suffix) && !scan.done && suffix[p] >= utf8.RuneSelf {
244				rune = norm.NFD.PropertiesString(suffix[p:])
245				if ccc := rune.LeadCCC(); ccc == 0 || prevCC >= ccc {
246					break
247				}
248				prevCC = rune.TrailCCC()
249				if pp := scan.scan(p); pp != p {
250					// Copy the interstitial runes for later processing.
251					bufn += copy(buf[bufn:], suffix[p0:p])
252					if scan.pindex == pp {
253						bufp = bufn
254					}
255					p, p0 = pp, pp
256				} else {
257					p += rune.Size()
258				}
259			}
260		}
261	}
262	// Append weights for the matched contraction, which may be an expansion.
263	i, n := scan.result()
264	ce = Elem(t.ContractElem[i+offset])
265	if ce.ctype() == ceNormal {
266		w = append(w, ce)
267	} else {
268		w = t.appendExpansion(w, ce)
269	}
270	// Append weights for the runes in the segment not part of the contraction.
271	for b, p := buf[:bufp], 0; len(b) > 0; b = b[p:] {
272		w, p = t.appendNext(w, source{bytes: b})
273	}
274	return w, n
275}
276