1// Copyright 2012 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Package collate contains types for comparing and sorting Unicode strings
6// according to a given collation order.  Package locale provides a high-level
7// interface to collation. Users should typically use that package instead.
8package collate
9
10import (
11	"bytes"
12	"exp/norm"
13)
14
15// AlternateHandling identifies the various ways in which variables are handled.
16// A rune with a primary weight lower than the variable top is considered a
17// variable.
18// See http://www.unicode.org/reports/tr10/#Variable_Weighting for details.
19type AlternateHandling int
20
21const (
22	// AltNonIgnorable turns off special handling of variables.
23	AltNonIgnorable AlternateHandling = iota
24
25	// AltBlanked sets variables and all subsequent primary ignorables to be
26	// ignorable at all levels. This is identical to removing all variables
27	// and subsequent primary ignorables from the input.
28	AltBlanked
29
30	// AltShifted sets variables to be ignorable for levels one through three and
31	// adds a fourth level based on the values of the ignored levels.
32	AltShifted
33
34	// AltShiftTrimmed is a slight variant of AltShifted that is used to
35	// emulate POSIX.
36	AltShiftTrimmed
37)
38
39// Collator provides functionality for comparing strings for a given
40// collation order.
41type Collator struct {
42	// TODO: hide most of these options. Low-level options are set through the locale
43	// identifier (as defined by LDML) while high-level options are set through SetOptions.
44	// Using high-level options allows us to be more flexible (such as not ignoring
45	// Thai vowels for IgnoreDiacriticals) and more user-friendly (such as allowing
46	// diacritical marks to be ignored but not case without having to fiddle with levels).
47
48	// Strength sets the maximum level to use in comparison.
49	Strength Level
50
51	// Alternate specifies an alternative handling of variables.
52	Alternate AlternateHandling
53
54	// Backwards specifies the order of sorting at the secondary level.
55	// This option exists predominantly to support reverse sorting of accents in French.
56	Backwards bool
57
58	// TODO: implement:
59	// With HiraganaQuaternary enabled, Hiragana codepoints will get lower values
60	// than all the other non-variable code points. Strength must be greater or
61	// equal to Quaternary for this to take effect.
62	HiraganaQuaternary bool
63
64	// If CaseLevel is true, a level consisting only of case characteristics will
65	// be inserted in front of the tertiary level.  To ignore accents but take
66	// cases into account, set Strength to Primary and CaseLevel to true.
67	CaseLevel bool
68
69	// If Numeric is true, any sequence of decimal digits (category is Nd) is sorted
70	// at a primary level with its numeric value.  For example, "A-21" < "A-123".
71	Numeric bool
72
73	// The largest primary value that is considered to be variable.
74	variableTop uint32
75
76	f norm.Form
77
78	t Weigher
79
80	sorter sorter
81
82	_iter [2]iter
83}
84
85// An Option is used to change the behavior of Collator.  They override the
86// settings passed through the locale identifier.
87type Option int
88
89const (
90	Numeric          Option = 1 << iota // Sort numbers numerically ("2" < "12").
91	IgnoreCase                          // Case-insensitive search.
92	IgnoreDiacritics                    // Ignore diacritical marks. ("o" == "ö").
93	IgnoreWidth                         // Ignore full versus normal width.
94	UpperFirst                          // Sort upper case before lower case.
95	LowerFirst                          // Sort lower case before upper case.
96	Force                               // Force ordering if strings are equivalent but not equal.
97
98	Loose = IgnoreDiacritics | IgnoreWidth | IgnoreCase
99)
100
101// SetOptions accepts a Options or-ed together.  All previous calls to SetOptions are ignored.
102func (c *Collator) SetOptions(o Option) {
103	// TODO: implement
104}
105
106func (c *Collator) iter(i int) *iter {
107	// TODO: evaluate performance for making the second iterator optional.
108	return &c._iter[i]
109}
110
111// Locales returns the list of locales for which collating differs from its parent locale.
112// The returned value should not be modified.
113func Locales() []string {
114	return availableLocales
115}
116
117// New returns a new Collator initialized for the given locale.
118func New(loc string) *Collator {
119	// TODO: handle locale selection according to spec.
120	var t tableIndex
121	if loc != "" {
122		if idx, ok := locales[loc]; ok {
123			t = idx
124		} else {
125			t = locales["root"]
126		}
127	}
128	return NewFromTable(Init(t))
129}
130
131func NewFromTable(t Weigher) *Collator {
132	c := &Collator{
133		Strength: Tertiary,
134		f:        norm.NFD,
135		t:        t,
136	}
137	c._iter[0].init(c)
138	c._iter[1].init(c)
139	return c
140}
141
142// Buffer holds keys generated by Key and KeyString.
143type Buffer struct {
144	buf [4096]byte
145	key []byte
146}
147
148func (b *Buffer) init() {
149	if b.key == nil {
150		b.key = b.buf[:0]
151	}
152}
153
154// Reset clears the buffer from previous results generated by Key and KeyString.
155func (b *Buffer) Reset() {
156	b.key = b.key[:0]
157}
158
159// Compare returns an integer comparing the two byte slices.
160// The result will be 0 if a==b, -1 if a < b, and +1 if a > b.
161func (c *Collator) Compare(a, b []byte) int {
162	// TODO: skip identical prefixes once we have a fast way to detect if a rune is
163	// part of a contraction. This would lead to roughly a 10% speedup for the colcmp regtest.
164	c.iter(0).setInput(a)
165	c.iter(1).setInput(b)
166	if res := c.compare(); res != 0 {
167		return res
168	}
169	if Identity == c.Strength {
170		return bytes.Compare(a, b)
171	}
172	return 0
173}
174
175// CompareString returns an integer comparing the two strings.
176// The result will be 0 if a==b, -1 if a < b, and +1 if a > b.
177func (c *Collator) CompareString(a, b string) int {
178	// TODO: skip identical prefixes once we have a fast way to detect if a rune is
179	// part of a contraction. This would lead to roughly a 10% speedup for the colcmp regtest.
180	c.iter(0).setInputString(a)
181	c.iter(1).setInputString(b)
182	if res := c.compare(); res != 0 {
183		return res
184	}
185	if Identity == c.Strength {
186		if a < b {
187			return -1
188		} else if a > b {
189			return 1
190		}
191	}
192	return 0
193}
194
195func compareLevel(f func(i *iter) int, a, b *iter) int {
196	a.pce = 0
197	b.pce = 0
198	for {
199		va := f(a)
200		vb := f(b)
201		if va != vb {
202			if va < vb {
203				return -1
204			}
205			return 1
206		} else if va == 0 {
207			break
208		}
209	}
210	return 0
211}
212
213func (c *Collator) compare() int {
214	ia, ib := c.iter(0), c.iter(1)
215	// Process primary level
216	if c.Alternate != AltShifted {
217		// TODO: implement script reordering
218		// TODO: special hiragana handling
219		if res := compareLevel((*iter).nextPrimary, ia, ib); res != 0 {
220			return res
221		}
222	} else {
223		// TODO: handle shifted
224	}
225	if Secondary <= c.Strength {
226		f := (*iter).nextSecondary
227		if c.Backwards {
228			f = (*iter).prevSecondary
229		}
230		if res := compareLevel(f, ia, ib); res != 0 {
231			return res
232		}
233	}
234	// TODO: special case handling (Danish?)
235	if Tertiary <= c.Strength || c.CaseLevel {
236		if res := compareLevel((*iter).nextTertiary, ia, ib); res != 0 {
237			return res
238		}
239		// TODO: Not needed for the default value of AltNonIgnorable?
240		if Quaternary <= c.Strength {
241			if res := compareLevel((*iter).nextQuaternary, ia, ib); res != 0 {
242				return res
243			}
244		}
245	}
246	return 0
247}
248
249// Key returns the collation key for str.
250// Passing the buffer buf may avoid memory allocations.
251// The returned slice will point to an allocation in Buffer and will remain
252// valid until the next call to buf.Reset().
253func (c *Collator) Key(buf *Buffer, str []byte) []byte {
254	// See http://www.unicode.org/reports/tr10/#Main_Algorithm for more details.
255	buf.init()
256	return c.key(buf, c.getColElems(str))
257}
258
259// KeyFromString returns the collation key for str.
260// Passing the buffer buf may avoid memory allocations.
261// The returned slice will point to an allocation in Buffer and will retain
262// valid until the next call to buf.ResetKeys().
263func (c *Collator) KeyFromString(buf *Buffer, str string) []byte {
264	// See http://www.unicode.org/reports/tr10/#Main_Algorithm for more details.
265	buf.init()
266	return c.key(buf, c.getColElemsString(str))
267}
268
269func (c *Collator) key(buf *Buffer, w []Elem) []byte {
270	processWeights(c.Alternate, c.variableTop, w)
271	kn := len(buf.key)
272	c.keyFromElems(buf, w)
273	return buf.key[kn:]
274}
275
276func (c *Collator) getColElems(str []byte) []Elem {
277	i := c.iter(0)
278	i.setInput(str)
279	for i.next() {
280	}
281	return i.ce
282}
283
284func (c *Collator) getColElemsString(str string) []Elem {
285	i := c.iter(0)
286	i.setInputString(str)
287	for i.next() {
288	}
289	return i.ce
290}
291
292type iter struct {
293	bytes []byte
294	str   string
295
296	wa  [512]Elem
297	ce  []Elem
298	pce int
299	nce int // nce <= len(nce)
300
301	prevCCC  uint8
302	pStarter int
303
304	t Weigher
305}
306
307func (i *iter) init(c *Collator) {
308	i.t = c.t
309	i.ce = i.wa[:0]
310}
311
312func (i *iter) reset() {
313	i.ce = i.ce[:0]
314	i.nce = 0
315	i.prevCCC = 0
316	i.pStarter = 0
317}
318
319func (i *iter) setInput(s []byte) *iter {
320	i.bytes = s
321	i.str = ""
322	i.reset()
323	return i
324}
325
326func (i *iter) setInputString(s string) *iter {
327	i.str = s
328	i.bytes = nil
329	i.reset()
330	return i
331}
332
333func (i *iter) done() bool {
334	return len(i.str) == 0 && len(i.bytes) == 0
335}
336
337func (i *iter) tail(n int) {
338	if i.bytes == nil {
339		i.str = i.str[n:]
340	} else {
341		i.bytes = i.bytes[n:]
342	}
343}
344
345func (i *iter) appendNext() int {
346	var sz int
347	if i.bytes == nil {
348		i.ce, sz = i.t.AppendNextString(i.ce, i.str)
349	} else {
350		i.ce, sz = i.t.AppendNext(i.ce, i.bytes)
351	}
352	return sz
353}
354
355// next appends Elems to the internal array until it adds an element with CCC=0.
356// In the majority of cases, a Elem with a primary value > 0 will have
357// a CCC of 0. The CCC values of colation elements are also used to detect if the
358// input string was not normalized and to adjust the result accordingly.
359func (i *iter) next() bool {
360	for !i.done() {
361		p0 := len(i.ce)
362		sz := i.appendNext()
363		i.tail(sz)
364		last := len(i.ce) - 1
365		if ccc := i.ce[last].CCC(); ccc == 0 {
366			i.nce = len(i.ce)
367			i.pStarter = last
368			i.prevCCC = 0
369			return true
370		} else if p0 < last && i.ce[p0].CCC() == 0 {
371			// set i.nce to only cover part of i.ce for which ccc == 0 and
372			// use rest the next call to next.
373			for p0++; p0 < last && i.ce[p0].CCC() == 0; p0++ {
374			}
375			i.nce = p0
376			i.pStarter = p0 - 1
377			i.prevCCC = ccc
378			return true
379		} else if ccc < i.prevCCC {
380			i.doNorm(p0, ccc) // should be rare for most common cases
381		} else {
382			i.prevCCC = ccc
383		}
384	}
385	if len(i.ce) != i.nce {
386		i.nce = len(i.ce)
387		return true
388	}
389	return false
390}
391
392// nextPlain is the same as next, but does not "normalize" the collation
393// elements.
394// TODO: remove this function. Using this instead of next does not seem
395// to improve performance in any significant way. We retain this until
396// later for evaluation purposes.
397func (i *iter) nextPlain() bool {
398	if i.done() {
399		return false
400	}
401	sz := i.appendNext()
402	i.tail(sz)
403	i.nce = len(i.ce)
404	return true
405}
406
407const maxCombiningCharacters = 30
408
409// doNorm reorders the collation elements in i.ce.
410// It assumes that blocks of collation elements added with appendNext
411// either start and end with the same CCC or start with CCC == 0.
412// This allows for a single insertion point for the entire block.
413// The correctness of this assumption is verified in builder.go.
414func (i *iter) doNorm(p int, ccc uint8) {
415	if p-i.pStarter > maxCombiningCharacters {
416		i.prevCCC = i.ce[len(i.ce)-1].CCC()
417		i.pStarter = len(i.ce) - 1
418		return
419	}
420	n := len(i.ce)
421	k := p
422	for p--; p > i.pStarter && ccc < i.ce[p-1].CCC(); p-- {
423	}
424	i.ce = append(i.ce, i.ce[p:k]...)
425	copy(i.ce[p:], i.ce[k:])
426	i.ce = i.ce[:n]
427}
428
429func (i *iter) nextPrimary() int {
430	for {
431		for ; i.pce < i.nce; i.pce++ {
432			if v := i.ce[i.pce].Primary(); v != 0 {
433				i.pce++
434				return v
435			}
436		}
437		if !i.next() {
438			return 0
439		}
440	}
441	panic("should not reach here")
442}
443
444func (i *iter) nextSecondary() int {
445	for ; i.pce < len(i.ce); i.pce++ {
446		if v := i.ce[i.pce].Secondary(); v != 0 {
447			i.pce++
448			return v
449		}
450	}
451	return 0
452}
453
454func (i *iter) prevSecondary() int {
455	for ; i.pce < len(i.ce); i.pce++ {
456		if v := i.ce[len(i.ce)-i.pce-1].Secondary(); v != 0 {
457			i.pce++
458			return v
459		}
460	}
461	return 0
462}
463
464func (i *iter) nextTertiary() int {
465	for ; i.pce < len(i.ce); i.pce++ {
466		if v := i.ce[i.pce].Tertiary(); v != 0 {
467			i.pce++
468			return int(v)
469		}
470	}
471	return 0
472}
473
474func (i *iter) nextQuaternary() int {
475	for ; i.pce < len(i.ce); i.pce++ {
476		if v := i.ce[i.pce].Quaternary(); v != 0 {
477			i.pce++
478			return v
479		}
480	}
481	return 0
482}
483
484func appendPrimary(key []byte, p int) []byte {
485	// Convert to variable length encoding; supports up to 23 bits.
486	if p <= 0x7FFF {
487		key = append(key, uint8(p>>8), uint8(p))
488	} else {
489		key = append(key, uint8(p>>16)|0x80, uint8(p>>8), uint8(p))
490	}
491	return key
492}
493
494// keyFromElems converts the weights ws to a compact sequence of bytes.
495// The result will be appended to the byte buffer in buf.
496func (c *Collator) keyFromElems(buf *Buffer, ws []Elem) {
497	for _, v := range ws {
498		if w := v.Primary(); w > 0 {
499			buf.key = appendPrimary(buf.key, w)
500		}
501	}
502	if Secondary <= c.Strength {
503		buf.key = append(buf.key, 0, 0)
504		// TODO: we can use one 0 if we can guarantee that all non-zero weights are > 0xFF.
505		if !c.Backwards {
506			for _, v := range ws {
507				if w := v.Secondary(); w > 0 {
508					buf.key = append(buf.key, uint8(w>>8), uint8(w))
509				}
510			}
511		} else {
512			for i := len(ws) - 1; i >= 0; i-- {
513				if w := ws[i].Secondary(); w > 0 {
514					buf.key = append(buf.key, uint8(w>>8), uint8(w))
515				}
516			}
517		}
518	} else if c.CaseLevel {
519		buf.key = append(buf.key, 0, 0)
520	}
521	if Tertiary <= c.Strength || c.CaseLevel {
522		buf.key = append(buf.key, 0, 0)
523		for _, v := range ws {
524			if w := v.Tertiary(); w > 0 {
525				buf.key = append(buf.key, uint8(w))
526			}
527		}
528		// Derive the quaternary weights from the options and other levels.
529		// Note that we represent MaxQuaternary as 0xFF. The first byte of the
530		// representation of a primary weight is always smaller than 0xFF,
531		// so using this single byte value will compare correctly.
532		if Quaternary <= c.Strength && c.Alternate >= AltShifted {
533			if c.Alternate == AltShiftTrimmed {
534				lastNonFFFF := len(buf.key)
535				buf.key = append(buf.key, 0)
536				for _, v := range ws {
537					if w := v.Quaternary(); w == MaxQuaternary {
538						buf.key = append(buf.key, 0xFF)
539					} else if w > 0 {
540						buf.key = appendPrimary(buf.key, w)
541						lastNonFFFF = len(buf.key)
542					}
543				}
544				buf.key = buf.key[:lastNonFFFF]
545			} else {
546				buf.key = append(buf.key, 0)
547				for _, v := range ws {
548					if w := v.Quaternary(); w == MaxQuaternary {
549						buf.key = append(buf.key, 0xFF)
550					} else if w > 0 {
551						buf.key = appendPrimary(buf.key, w)
552					}
553				}
554			}
555		}
556	}
557}
558
559func processWeights(vw AlternateHandling, top uint32, wa []Elem) {
560	ignore := false
561	vtop := int(top)
562	switch vw {
563	case AltShifted, AltShiftTrimmed:
564		for i := range wa {
565			if p := wa[i].Primary(); p <= vtop && p != 0 {
566				wa[i] = MakeQuaternary(p)
567				ignore = true
568			} else if p == 0 {
569				if ignore {
570					wa[i] = ceIgnore
571				}
572			} else {
573				ignore = false
574			}
575		}
576	case AltBlanked:
577		for i := range wa {
578			if p := wa[i].Primary(); p <= vtop && (ignore || p != 0) {
579				wa[i] = ceIgnore
580				ignore = true
581			} else {
582				ignore = false
583			}
584		}
585	}
586}
587