1// Copyright 2012 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package colltab
6
7import (
8	"fmt"
9	"unicode"
10)
11
12// Level identifies the collation comparison level.
13// The primary level corresponds to the basic sorting of text.
14// The secondary level corresponds to accents and related linguistic elements.
15// The tertiary level corresponds to casing and related concepts.
16// The quaternary level is derived from the other levels by the
17// various algorithms for handling variable elements.
18type Level int
19
20const (
21	Primary Level = iota
22	Secondary
23	Tertiary
24	Quaternary
25	Identity
26
27	NumLevels
28)
29
30const (
31	defaultSecondary = 0x20
32	defaultTertiary  = 0x2
33	maxTertiary      = 0x1F
34	MaxQuaternary    = 0x1FFFFF // 21 bits.
35)
36
37// Elem is a representation of a collation element. This API provides ways to encode
38// and decode Elems. Implementations of collation tables may use values greater
39// or equal to PrivateUse for their own purposes.  However, these should never be
40// returned by AppendNext.
41type Elem uint32
42
43const (
44	maxCE       Elem = 0xAFFFFFFF
45	PrivateUse       = minContract
46	minContract      = 0xC0000000
47	maxContract      = 0xDFFFFFFF
48	minExpand        = 0xE0000000
49	maxExpand        = 0xEFFFFFFF
50	minDecomp        = 0xF0000000
51)
52
53type ceType int
54
55const (
56	ceNormal           ceType = iota // ceNormal includes implicits (ce == 0)
57	ceContractionIndex               // rune can be a start of a contraction
58	ceExpansionIndex                 // rune expands into a sequence of collation elements
59	ceDecompose                      // rune expands using NFKC decomposition
60)
61
62func (ce Elem) ctype() ceType {
63	if ce <= maxCE {
64		return ceNormal
65	}
66	if ce <= maxContract {
67		return ceContractionIndex
68	} else {
69		if ce <= maxExpand {
70			return ceExpansionIndex
71		}
72		return ceDecompose
73	}
74	panic("should not reach here")
75	return ceType(-1)
76}
77
78// For normal collation elements, we assume that a collation element either has
79// a primary or non-default secondary value, not both.
80// Collation elements with a primary value are of the form
81// 01pppppp pppppppp ppppppp0 ssssssss
82//   - p* is primary collation value
83//   - s* is the secondary collation value
84// 00pppppp pppppppp ppppppps sssttttt, where
85//   - p* is primary collation value
86//   - s* offset of secondary from default value.
87//   - t* is the tertiary collation value
88// 100ttttt cccccccc pppppppp pppppppp
89//   - t* is the tertiar collation value
90//   - c* is the canonical combining class
91//   - p* is the primary collation value
92// Collation elements with a secondary value are of the form
93// 1010cccc ccccssss ssssssss tttttttt, where
94//   - c* is the canonical combining class
95//   - s* is the secondary collation value
96//   - t* is the tertiary collation value
97// 11qqqqqq qqqqqqqq qqqqqqq0 00000000
98//   - q* quaternary value
99const (
100	ceTypeMask              = 0xC0000000
101	ceTypeMaskExt           = 0xE0000000
102	ceIgnoreMask            = 0xF00FFFFF
103	ceType1                 = 0x40000000
104	ceType2                 = 0x00000000
105	ceType3or4              = 0x80000000
106	ceType4                 = 0xA0000000
107	ceTypeQ                 = 0xC0000000
108	Ignore                  = ceType4
109	firstNonPrimary         = 0x80000000
110	lastSpecialPrimary      = 0xA0000000
111	secondaryMask           = 0x80000000
112	hasTertiaryMask         = 0x40000000
113	primaryValueMask        = 0x3FFFFE00
114	maxPrimaryBits          = 21
115	compactPrimaryBits      = 16
116	maxSecondaryBits        = 12
117	maxTertiaryBits         = 8
118	maxCCCBits              = 8
119	maxSecondaryCompactBits = 8
120	maxSecondaryDiffBits    = 4
121	maxTertiaryCompactBits  = 5
122	primaryShift            = 9
123	compactSecondaryShift   = 5
124	minCompactSecondary     = defaultSecondary - 4
125)
126
127func makeImplicitCE(primary int) Elem {
128	return ceType1 | Elem(primary<<primaryShift) | defaultSecondary
129}
130
131// MakeElem returns an Elem for the given values.  It will return an error
132// if the given combination of values is invalid.
133func MakeElem(primary, secondary, tertiary int, ccc uint8) (Elem, error) {
134	if w := primary; w >= 1<<maxPrimaryBits || w < 0 {
135		return 0, fmt.Errorf("makeCE: primary weight out of bounds: %x >= %x", w, 1<<maxPrimaryBits)
136	}
137	if w := secondary; w >= 1<<maxSecondaryBits || w < 0 {
138		return 0, fmt.Errorf("makeCE: secondary weight out of bounds: %x >= %x", w, 1<<maxSecondaryBits)
139	}
140	if w := tertiary; w >= 1<<maxTertiaryBits || w < 0 {
141		return 0, fmt.Errorf("makeCE: tertiary weight out of bounds: %x >= %x", w, 1<<maxTertiaryBits)
142	}
143	ce := Elem(0)
144	if primary != 0 {
145		if ccc != 0 {
146			if primary >= 1<<compactPrimaryBits {
147				return 0, fmt.Errorf("makeCE: primary weight with non-zero CCC out of bounds: %x >= %x", primary, 1<<compactPrimaryBits)
148			}
149			if secondary != defaultSecondary {
150				return 0, fmt.Errorf("makeCE: cannot combine non-default secondary value (%x) with non-zero CCC (%x)", secondary, ccc)
151			}
152			ce = Elem(tertiary << (compactPrimaryBits + maxCCCBits))
153			ce |= Elem(ccc) << compactPrimaryBits
154			ce |= Elem(primary)
155			ce |= ceType3or4
156		} else if tertiary == defaultTertiary {
157			if secondary >= 1<<maxSecondaryCompactBits {
158				return 0, fmt.Errorf("makeCE: secondary weight with non-zero primary out of bounds: %x >= %x", secondary, 1<<maxSecondaryCompactBits)
159			}
160			ce = Elem(primary<<(maxSecondaryCompactBits+1) + secondary)
161			ce |= ceType1
162		} else {
163			d := secondary - defaultSecondary + maxSecondaryDiffBits
164			if d >= 1<<maxSecondaryDiffBits || d < 0 {
165				return 0, fmt.Errorf("makeCE: secondary weight diff out of bounds: %x < 0 || %x > %x", d, d, 1<<maxSecondaryDiffBits)
166			}
167			if tertiary >= 1<<maxTertiaryCompactBits {
168				return 0, fmt.Errorf("makeCE: tertiary weight with non-zero primary out of bounds: %x > %x", tertiary, 1<<maxTertiaryCompactBits)
169			}
170			ce = Elem(primary<<maxSecondaryDiffBits + d)
171			ce = ce<<maxTertiaryCompactBits + Elem(tertiary)
172		}
173	} else {
174		ce = Elem(secondary<<maxTertiaryBits + tertiary)
175		ce += Elem(ccc) << (maxSecondaryBits + maxTertiaryBits)
176		ce |= ceType4
177	}
178	return ce, nil
179}
180
181// MakeQuaternary returns an Elem with the given quaternary value.
182func MakeQuaternary(v int) Elem {
183	return ceTypeQ | Elem(v<<primaryShift)
184}
185
186// Mask sets weights for any level smaller than l to 0.
187// The resulting Elem can be used to test for equality with
188// other Elems to which the same mask has been applied.
189func (ce Elem) Mask(l Level) uint32 {
190	return 0
191}
192
193// CCC returns the canonical combining class associated with the underlying character,
194// if applicable, or 0 otherwise.
195func (ce Elem) CCC() uint8 {
196	if ce&ceType3or4 != 0 {
197		if ce&ceType4 == ceType3or4 {
198			return uint8(ce >> 16)
199		}
200		return uint8(ce >> 20)
201	}
202	return 0
203}
204
205// Primary returns the primary collation weight for ce.
206func (ce Elem) Primary() int {
207	if ce >= firstNonPrimary {
208		if ce > lastSpecialPrimary {
209			return 0
210		}
211		return int(uint16(ce))
212	}
213	return int(ce&primaryValueMask) >> primaryShift
214}
215
216// Secondary returns the secondary collation weight for ce.
217func (ce Elem) Secondary() int {
218	switch ce & ceTypeMask {
219	case ceType1:
220		return int(uint8(ce))
221	case ceType2:
222		return minCompactSecondary + int((ce>>compactSecondaryShift)&0xF)
223	case ceType3or4:
224		if ce < ceType4 {
225			return defaultSecondary
226		}
227		return int(ce>>8) & 0xFFF
228	case ceTypeQ:
229		return 0
230	}
231	panic("should not reach here")
232}
233
234// Tertiary returns the tertiary collation weight for ce.
235func (ce Elem) Tertiary() uint8 {
236	if ce&hasTertiaryMask == 0 {
237		if ce&ceType3or4 == 0 {
238			return uint8(ce & 0x1F)
239		}
240		if ce&ceType4 == ceType4 {
241			return uint8(ce)
242		}
243		return uint8(ce>>24) & 0x1F // type 2
244	} else if ce&ceTypeMask == ceType1 {
245		return defaultTertiary
246	}
247	// ce is a quaternary value.
248	return 0
249}
250
251func (ce Elem) updateTertiary(t uint8) Elem {
252	if ce&ceTypeMask == ceType1 {
253		// convert to type 4
254		nce := ce & primaryValueMask
255		nce |= Elem(uint8(ce)-minCompactSecondary) << compactSecondaryShift
256		ce = nce
257	} else if ce&ceTypeMaskExt == ceType3or4 {
258		ce &= ^Elem(maxTertiary << 24)
259		return ce | (Elem(t) << 24)
260	} else {
261		// type 2 or 4
262		ce &= ^Elem(maxTertiary)
263	}
264	return ce | Elem(t)
265}
266
267// Quaternary returns the quaternary value if explicitly specified,
268// 0 if ce == Ignore, or MaxQuaternary otherwise.
269// Quaternary values are used only for shifted variants.
270func (ce Elem) Quaternary() int {
271	if ce&ceTypeMask == ceTypeQ {
272		return int(ce&primaryValueMask) >> primaryShift
273	} else if ce&ceIgnoreMask == Ignore {
274		return 0
275	}
276	return MaxQuaternary
277}
278
279// Weight returns the collation weight for the given level.
280func (ce Elem) Weight(l Level) int {
281	switch l {
282	case Primary:
283		return ce.Primary()
284	case Secondary:
285		return ce.Secondary()
286	case Tertiary:
287		return int(ce.Tertiary())
288	case Quaternary:
289		return ce.Quaternary()
290	}
291	return 0 // return 0 (ignore) for undefined levels.
292}
293
294// For contractions, collation elements are of the form
295// 110bbbbb bbbbbbbb iiiiiiii iiiinnnn, where
296//   - n* is the size of the first node in the contraction trie.
297//   - i* is the index of the first node in the contraction trie.
298//   - b* is the offset into the contraction collation element table.
299// See contract.go for details on the contraction trie.
300const (
301	maxNBits              = 4
302	maxTrieIndexBits      = 12
303	maxContractOffsetBits = 13
304)
305
306func splitContractIndex(ce Elem) (index, n, offset int) {
307	n = int(ce & (1<<maxNBits - 1))
308	ce >>= maxNBits
309	index = int(ce & (1<<maxTrieIndexBits - 1))
310	ce >>= maxTrieIndexBits
311	offset = int(ce & (1<<maxContractOffsetBits - 1))
312	return
313}
314
315// For expansions, Elems are of the form 11100000 00000000 bbbbbbbb bbbbbbbb,
316// where b* is the index into the expansion sequence table.
317const maxExpandIndexBits = 16
318
319func splitExpandIndex(ce Elem) (index int) {
320	return int(uint16(ce))
321}
322
323// Some runes can be expanded using NFKD decomposition. Instead of storing the full
324// sequence of collation elements, we decompose the rune and lookup the collation
325// elements for each rune in the decomposition and modify the tertiary weights.
326// The Elem, in this case, is of the form 11110000 00000000 wwwwwwww vvvvvvvv, where
327//   - v* is the replacement tertiary weight for the first rune,
328//   - w* is the replacement tertiary weight for the second rune,
329// Tertiary weights of subsequent runes should be replaced with maxTertiary.
330// See https://www.unicode.org/reports/tr10/#Compatibility_Decompositions for more details.
331func splitDecompose(ce Elem) (t1, t2 uint8) {
332	return uint8(ce), uint8(ce >> 8)
333}
334
335const (
336	// These constants were taken from https://www.unicode.org/versions/Unicode6.0.0/ch12.pdf.
337	minUnified       rune = 0x4E00
338	maxUnified            = 0x9FFF
339	minCompatibility      = 0xF900
340	maxCompatibility      = 0xFAFF
341	minRare               = 0x3400
342	maxRare               = 0x4DBF
343)
344const (
345	commonUnifiedOffset = 0x10000
346	rareUnifiedOffset   = 0x20000 // largest rune in common is U+FAFF
347	otherOffset         = 0x50000 // largest rune in rare is U+2FA1D
348	illegalOffset       = otherOffset + int(unicode.MaxRune)
349	maxPrimary          = illegalOffset + 1
350)
351
352// implicitPrimary returns the primary weight for the a rune
353// for which there is no entry for the rune in the collation table.
354// We take a different approach from the one specified in
355// https://unicode.org/reports/tr10/#Implicit_Weights,
356// but preserve the resulting relative ordering of the runes.
357func implicitPrimary(r rune) int {
358	if unicode.Is(unicode.Ideographic, r) {
359		if r >= minUnified && r <= maxUnified {
360			// The most common case for CJK.
361			return int(r) + commonUnifiedOffset
362		}
363		if r >= minCompatibility && r <= maxCompatibility {
364			// This will typically not hit. The DUCET explicitly specifies mappings
365			// for all characters that do not decompose.
366			return int(r) + commonUnifiedOffset
367		}
368		return int(r) + rareUnifiedOffset
369	}
370	return int(r) + otherOffset
371}
372