1// Copyright 2012 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// TODO: remove hard-coded versions when we have implemented fractional weights.
6// The current implementation is incompatible with later CLDR versions.
7//go:generate go run maketables.go -cldr=23 -unicode=6.2.0
8
9// Package collate contains types for comparing and sorting Unicode strings
10// according to a given collation order.
11package collate // import "golang.org/x/text/collate"
12
13import (
14	"bytes"
15	"strings"
16
17	"golang.org/x/text/internal/colltab"
18	"golang.org/x/text/language"
19)
20
21// Collator provides functionality for comparing strings for a given
22// collation order.
23type Collator struct {
24	options
25
26	sorter sorter
27
28	_iter [2]iter
29}
30
31func (c *Collator) iter(i int) *iter {
32	// TODO: evaluate performance for making the second iterator optional.
33	return &c._iter[i]
34}
35
36// Supported returns the list of languages for which collating differs from its parent.
37func Supported() []language.Tag {
38	// TODO: use language.Coverage instead.
39
40	t := make([]language.Tag, len(tags))
41	copy(t, tags)
42	return t
43}
44
45func init() {
46	ids := strings.Split(availableLocales, ",")
47	tags = make([]language.Tag, len(ids))
48	for i, s := range ids {
49		tags[i] = language.Raw.MustParse(s)
50	}
51}
52
53var tags []language.Tag
54
55// New returns a new Collator initialized for the given locale.
56func New(t language.Tag, o ...Option) *Collator {
57	index := colltab.MatchLang(t, tags)
58	c := newCollator(getTable(locales[index]))
59
60	// Set options from the user-supplied tag.
61	c.setFromTag(t)
62
63	// Set the user-supplied options.
64	c.setOptions(o)
65
66	c.init()
67	return c
68}
69
70// NewFromTable returns a new Collator for the given Weighter.
71func NewFromTable(w colltab.Weighter, o ...Option) *Collator {
72	c := newCollator(w)
73	c.setOptions(o)
74	c.init()
75	return c
76}
77
78func (c *Collator) init() {
79	if c.numeric {
80		c.t = colltab.NewNumericWeighter(c.t)
81	}
82	c._iter[0].init(c)
83	c._iter[1].init(c)
84}
85
86// Buffer holds keys generated by Key and KeyString.
87type Buffer struct {
88	buf [4096]byte
89	key []byte
90}
91
92func (b *Buffer) init() {
93	if b.key == nil {
94		b.key = b.buf[:0]
95	}
96}
97
98// Reset clears the buffer from previous results generated by Key and KeyString.
99func (b *Buffer) Reset() {
100	b.key = b.key[:0]
101}
102
103// Compare returns an integer comparing the two byte slices.
104// The result will be 0 if a==b, -1 if a < b, and +1 if a > b.
105func (c *Collator) Compare(a, b []byte) int {
106	// TODO: skip identical prefixes once we have a fast way to detect if a rune is
107	// part of a contraction. This would lead to roughly a 10% speedup for the colcmp regtest.
108	c.iter(0).SetInput(a)
109	c.iter(1).SetInput(b)
110	if res := c.compare(); res != 0 {
111		return res
112	}
113	if !c.ignore[colltab.Identity] {
114		return bytes.Compare(a, b)
115	}
116	return 0
117}
118
119// CompareString returns an integer comparing the two strings.
120// The result will be 0 if a==b, -1 if a < b, and +1 if a > b.
121func (c *Collator) CompareString(a, b string) int {
122	// TODO: skip identical prefixes once we have a fast way to detect if a rune is
123	// part of a contraction. This would lead to roughly a 10% speedup for the colcmp regtest.
124	c.iter(0).SetInputString(a)
125	c.iter(1).SetInputString(b)
126	if res := c.compare(); res != 0 {
127		return res
128	}
129	if !c.ignore[colltab.Identity] {
130		if a < b {
131			return -1
132		} else if a > b {
133			return 1
134		}
135	}
136	return 0
137}
138
139func compareLevel(f func(i *iter) int, a, b *iter) int {
140	a.pce = 0
141	b.pce = 0
142	for {
143		va := f(a)
144		vb := f(b)
145		if va != vb {
146			if va < vb {
147				return -1
148			}
149			return 1
150		} else if va == 0 {
151			break
152		}
153	}
154	return 0
155}
156
157func (c *Collator) compare() int {
158	ia, ib := c.iter(0), c.iter(1)
159	// Process primary level
160	if c.alternate != altShifted {
161		// TODO: implement script reordering
162		if res := compareLevel((*iter).nextPrimary, ia, ib); res != 0 {
163			return res
164		}
165	} else {
166		// TODO: handle shifted
167	}
168	if !c.ignore[colltab.Secondary] {
169		f := (*iter).nextSecondary
170		if c.backwards {
171			f = (*iter).prevSecondary
172		}
173		if res := compareLevel(f, ia, ib); res != 0 {
174			return res
175		}
176	}
177	// TODO: special case handling (Danish?)
178	if !c.ignore[colltab.Tertiary] || c.caseLevel {
179		if res := compareLevel((*iter).nextTertiary, ia, ib); res != 0 {
180			return res
181		}
182		if !c.ignore[colltab.Quaternary] {
183			if res := compareLevel((*iter).nextQuaternary, ia, ib); res != 0 {
184				return res
185			}
186		}
187	}
188	return 0
189}
190
191// Key returns the collation key for str.
192// Passing the buffer buf may avoid memory allocations.
193// The returned slice will point to an allocation in Buffer and will remain
194// valid until the next call to buf.Reset().
195func (c *Collator) Key(buf *Buffer, str []byte) []byte {
196	// See https://www.unicode.org/reports/tr10/#Main_Algorithm for more details.
197	buf.init()
198	return c.key(buf, c.getColElems(str))
199}
200
201// KeyFromString returns the collation key for str.
202// Passing the buffer buf may avoid memory allocations.
203// The returned slice will point to an allocation in Buffer and will retain
204// valid until the next call to buf.ResetKeys().
205func (c *Collator) KeyFromString(buf *Buffer, str string) []byte {
206	// See https://www.unicode.org/reports/tr10/#Main_Algorithm for more details.
207	buf.init()
208	return c.key(buf, c.getColElemsString(str))
209}
210
211func (c *Collator) key(buf *Buffer, w []colltab.Elem) []byte {
212	processWeights(c.alternate, c.t.Top(), w)
213	kn := len(buf.key)
214	c.keyFromElems(buf, w)
215	return buf.key[kn:]
216}
217
218func (c *Collator) getColElems(str []byte) []colltab.Elem {
219	i := c.iter(0)
220	i.SetInput(str)
221	for i.Next() {
222	}
223	return i.Elems
224}
225
226func (c *Collator) getColElemsString(str string) []colltab.Elem {
227	i := c.iter(0)
228	i.SetInputString(str)
229	for i.Next() {
230	}
231	return i.Elems
232}
233
234type iter struct {
235	wa [512]colltab.Elem
236
237	colltab.Iter
238	pce int
239}
240
241func (i *iter) init(c *Collator) {
242	i.Weighter = c.t
243	i.Elems = i.wa[:0]
244}
245
246func (i *iter) nextPrimary() int {
247	for {
248		for ; i.pce < i.N; i.pce++ {
249			if v := i.Elems[i.pce].Primary(); v != 0 {
250				i.pce++
251				return v
252			}
253		}
254		if !i.Next() {
255			return 0
256		}
257	}
258	panic("should not reach here")
259}
260
261func (i *iter) nextSecondary() int {
262	for ; i.pce < len(i.Elems); i.pce++ {
263		if v := i.Elems[i.pce].Secondary(); v != 0 {
264			i.pce++
265			return v
266		}
267	}
268	return 0
269}
270
271func (i *iter) prevSecondary() int {
272	for ; i.pce < len(i.Elems); i.pce++ {
273		if v := i.Elems[len(i.Elems)-i.pce-1].Secondary(); v != 0 {
274			i.pce++
275			return v
276		}
277	}
278	return 0
279}
280
281func (i *iter) nextTertiary() int {
282	for ; i.pce < len(i.Elems); i.pce++ {
283		if v := i.Elems[i.pce].Tertiary(); v != 0 {
284			i.pce++
285			return int(v)
286		}
287	}
288	return 0
289}
290
291func (i *iter) nextQuaternary() int {
292	for ; i.pce < len(i.Elems); i.pce++ {
293		if v := i.Elems[i.pce].Quaternary(); v != 0 {
294			i.pce++
295			return v
296		}
297	}
298	return 0
299}
300
301func appendPrimary(key []byte, p int) []byte {
302	// Convert to variable length encoding; supports up to 23 bits.
303	if p <= 0x7FFF {
304		key = append(key, uint8(p>>8), uint8(p))
305	} else {
306		key = append(key, uint8(p>>16)|0x80, uint8(p>>8), uint8(p))
307	}
308	return key
309}
310
311// keyFromElems converts the weights ws to a compact sequence of bytes.
312// The result will be appended to the byte buffer in buf.
313func (c *Collator) keyFromElems(buf *Buffer, ws []colltab.Elem) {
314	for _, v := range ws {
315		if w := v.Primary(); w > 0 {
316			buf.key = appendPrimary(buf.key, w)
317		}
318	}
319	if !c.ignore[colltab.Secondary] {
320		buf.key = append(buf.key, 0, 0)
321		// TODO: we can use one 0 if we can guarantee that all non-zero weights are > 0xFF.
322		if !c.backwards {
323			for _, v := range ws {
324				if w := v.Secondary(); w > 0 {
325					buf.key = append(buf.key, uint8(w>>8), uint8(w))
326				}
327			}
328		} else {
329			for i := len(ws) - 1; i >= 0; i-- {
330				if w := ws[i].Secondary(); w > 0 {
331					buf.key = append(buf.key, uint8(w>>8), uint8(w))
332				}
333			}
334		}
335	} else if c.caseLevel {
336		buf.key = append(buf.key, 0, 0)
337	}
338	if !c.ignore[colltab.Tertiary] || c.caseLevel {
339		buf.key = append(buf.key, 0, 0)
340		for _, v := range ws {
341			if w := v.Tertiary(); w > 0 {
342				buf.key = append(buf.key, uint8(w))
343			}
344		}
345		// Derive the quaternary weights from the options and other levels.
346		// Note that we represent MaxQuaternary as 0xFF. The first byte of the
347		// representation of a primary weight is always smaller than 0xFF,
348		// so using this single byte value will compare correctly.
349		if !c.ignore[colltab.Quaternary] && c.alternate >= altShifted {
350			if c.alternate == altShiftTrimmed {
351				lastNonFFFF := len(buf.key)
352				buf.key = append(buf.key, 0)
353				for _, v := range ws {
354					if w := v.Quaternary(); w == colltab.MaxQuaternary {
355						buf.key = append(buf.key, 0xFF)
356					} else if w > 0 {
357						buf.key = appendPrimary(buf.key, w)
358						lastNonFFFF = len(buf.key)
359					}
360				}
361				buf.key = buf.key[:lastNonFFFF]
362			} else {
363				buf.key = append(buf.key, 0)
364				for _, v := range ws {
365					if w := v.Quaternary(); w == colltab.MaxQuaternary {
366						buf.key = append(buf.key, 0xFF)
367					} else if w > 0 {
368						buf.key = appendPrimary(buf.key, w)
369					}
370				}
371			}
372		}
373	}
374}
375
376func processWeights(vw alternateHandling, top uint32, wa []colltab.Elem) {
377	ignore := false
378	vtop := int(top)
379	switch vw {
380	case altShifted, altShiftTrimmed:
381		for i := range wa {
382			if p := wa[i].Primary(); p <= vtop && p != 0 {
383				wa[i] = colltab.MakeQuaternary(p)
384				ignore = true
385			} else if p == 0 {
386				if ignore {
387					wa[i] = colltab.Ignore
388				}
389			} else {
390				ignore = false
391			}
392		}
393	case altBlanked:
394		for i := range wa {
395			if p := wa[i].Primary(); p <= vtop && (ignore || p != 0) {
396				wa[i] = colltab.Ignore
397				ignore = true
398			} else {
399				ignore = false
400			}
401		}
402	}
403}
404