1// Copyright 2012 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package collate
6
7import (
8	"unicode"
9)
10
11// Level identifies the collation comparison level.
12// The primary level corresponds to the basic sorting of text.
13// The secondary level corresponds to accents and related linguistic elements.
14// The tertiary level corresponds to casing and related concepts.
15// The quaternary level is derived from the other levels by the
16// various algorithms for handling variable elements.
17type Level int
18
19const (
20	Primary Level = iota
21	Secondary
22	Tertiary
23	Quaternary
24	Identity
25)
26
27const (
28	defaultSecondary = 0x20
29	defaultTertiary  = 0x2
30	maxTertiary      = 0x1F
31	MaxQuaternary    = 0x1FFFFF // 21 bits.
32)
33
34// Elem is a representation of a collation element. This API provides ways to encode
35// and decode Elems. Implementations of collation tables may use values greater
36// or equal to PrivateUse for their own purposes.  However, these should never be
37// returned by AppendNext.
38type Elem uint32
39
40const (
41	maxCE       Elem = 0xAFFFFFFF
42	PrivateUse       = minContract
43	minContract      = 0xC0000000
44	maxContract      = 0xDFFFFFFF
45	minExpand        = 0xE0000000
46	maxExpand        = 0xEFFFFFFF
47	minDecomp        = 0xF0000000
48)
49
50type ceType int
51
52const (
53	ceNormal           ceType = iota // ceNormal includes implicits (ce == 0)
54	ceContractionIndex               // rune can be a start of a contraction
55	ceExpansionIndex                 // rune expands into a sequence of collation elements
56	ceDecompose                      // rune expands using NFKC decomposition
57)
58
59func (ce Elem) ctype() ceType {
60	if ce <= maxCE {
61		return ceNormal
62	}
63	if ce <= maxContract {
64		return ceContractionIndex
65	} else {
66		if ce <= maxExpand {
67			return ceExpansionIndex
68		}
69		return ceDecompose
70	}
71	panic("should not reach here")
72	return ceType(-1)
73}
74
75// For normal collation elements, we assume that a collation element either has
76// a primary or non-default secondary value, not both.
77// Collation elements with a primary value are of the form
78// 01pppppp pppppppp ppppppp0 ssssssss
79//   - p* is primary collation value
80//   - s* is the secondary collation value
81// 00pppppp pppppppp ppppppps sssttttt, where
82//   - p* is primary collation value
83//   - s* offset of secondary from default value.
84//   - t* is the tertiary collation value
85// 100ttttt cccccccc pppppppp pppppppp
86//   - t* is the tertiar collation value
87//   - c* is the cannonical combining class
88//   - p* is the primary collation value
89// Collation elements with a secondary value are of the form
90// 1010cccc ccccssss ssssssss tttttttt, where
91//   - c* is the canonical combining class
92//   - s* is the secondary collation value
93//   - t* is the tertiary collation value
94// 11qqqqqq qqqqqqqq qqqqqqq0 00000000
95//   - q* quaternary value
96const (
97	ceTypeMask            = 0xC0000000
98	ceTypeMaskExt         = 0xE0000000
99	ceType1               = 0x40000000
100	ceType2               = 0x00000000
101	ceType3or4            = 0x80000000
102	ceType4               = 0xA0000000
103	ceTypeQ               = 0xC0000000
104	ceIgnore              = ceType4
105	firstNonPrimary       = 0x80000000
106	lastSpecialPrimary    = 0xA0000000
107	secondaryMask         = 0x80000000
108	hasTertiaryMask       = 0x40000000
109	primaryValueMask      = 0x3FFFFE00
110	primaryShift          = 9
111	compactPrimaryBits    = 16
112	compactSecondaryShift = 5
113	minCompactSecondary   = defaultSecondary - 4
114)
115
116func makeImplicitCE(primary int) Elem {
117	return ceType1 | Elem(primary<<primaryShift) | defaultSecondary
118}
119
120// MakeElem returns an Elem for the given values.  It will return an error
121// if the given combination of values is invalid.
122func MakeElem(primary, secondary, tertiary int, ccc uint8) (Elem, error) {
123	// TODO: implement
124	return 0, nil
125}
126
127// MakeQuaternary returns an Elem with the given quaternary value.
128func MakeQuaternary(v int) Elem {
129	return ceTypeQ | Elem(v<<primaryShift)
130}
131
132// Mask sets weights for any level smaller than l to 0.
133// The resulting Elem can be used to test for equality with
134// other Elems to which the same mask has been applied.
135func (ce Elem) Mask(l Level) uint32 {
136	return 0
137}
138
139// CCC returns the canoncial combining class associated with the underlying character,
140// if applicable, or 0 otherwise.
141func (ce Elem) CCC() uint8 {
142	if ce&ceType3or4 != 0 {
143		if ce&ceType4 == ceType3or4 {
144			return uint8(ce >> 16)
145		}
146		return uint8(ce >> 20)
147	}
148	return 0
149}
150
151// Primary returns the primary collation weight for ce.
152func (ce Elem) Primary() int {
153	if ce >= firstNonPrimary {
154		if ce > lastSpecialPrimary {
155			return 0
156		}
157		return int(uint16(ce))
158	}
159	return int(ce&primaryValueMask) >> primaryShift
160}
161
162// Secondary returns the secondary collation weight for ce.
163func (ce Elem) Secondary() int {
164	switch ce & ceTypeMask {
165	case ceType1:
166		return int(uint8(ce))
167	case ceType2:
168		return minCompactSecondary + int((ce>>compactSecondaryShift)&0xF)
169	case ceType3or4:
170		if ce < ceType4 {
171			return defaultSecondary
172		}
173		return int(ce>>8) & 0xFFF
174	case ceTypeQ:
175		return 0
176	}
177	panic("should not reach here")
178}
179
180// Tertiary returns the tertiary collation weight for ce.
181func (ce Elem) Tertiary() uint8 {
182	if ce&hasTertiaryMask == 0 {
183		if ce&ceType3or4 == 0 {
184			return uint8(ce & 0x1F)
185		}
186		if ce&ceType4 == ceType4 {
187			return uint8(ce)
188		}
189		return uint8(ce>>24) & 0x1F // type 2
190	} else if ce&ceTypeMask == ceType1 {
191		return defaultTertiary
192	}
193	// ce is a quaternary value.
194	return 0
195}
196
197func (ce Elem) updateTertiary(t uint8) Elem {
198	if ce&ceTypeMask == ceType1 {
199		// convert to type 4
200		nce := ce & primaryValueMask
201		nce |= Elem(uint8(ce)-minCompactSecondary) << compactSecondaryShift
202		ce = nce
203	} else if ce&ceTypeMaskExt == ceType3or4 {
204		ce &= ^Elem(maxTertiary << 24)
205		return ce | (Elem(t) << 24)
206	} else {
207		// type 2 or 4
208		ce &= ^Elem(maxTertiary)
209	}
210	return ce | Elem(t)
211}
212
213// Quaternary returns the quaternary value if explicitly specified,
214// 0 if ce == ceIgnore, or MaxQuaternary otherwise.
215// Quaternary values are used only for shifted variants.
216func (ce Elem) Quaternary() int {
217	if ce&ceTypeMask == ceTypeQ {
218		return int(ce&primaryValueMask) >> primaryShift
219	} else if ce == ceIgnore {
220		return 0
221	}
222	return MaxQuaternary
223}
224
225// Weight returns the collation weight for the given level.
226func (ce Elem) Weight(l Level) int {
227	switch l {
228	case Primary:
229		return ce.Primary()
230	case Secondary:
231		return ce.Secondary()
232	case Tertiary:
233		return int(ce.Tertiary())
234	case Quaternary:
235		return ce.Quaternary()
236	}
237	return 0 // return 0 (ignore) for undefined levels.
238}
239
240// For contractions, collation elements are of the form
241// 110bbbbb bbbbbbbb iiiiiiii iiiinnnn, where
242//   - n* is the size of the first node in the contraction trie.
243//   - i* is the index of the first node in the contraction trie.
244//   - b* is the offset into the contraction collation element table.
245// See contract.go for details on the contraction trie.
246const (
247	maxNBits              = 4
248	maxTrieIndexBits      = 12
249	maxContractOffsetBits = 13
250)
251
252func splitContractIndex(ce Elem) (index, n, offset int) {
253	n = int(ce & (1<<maxNBits - 1))
254	ce >>= maxNBits
255	index = int(ce & (1<<maxTrieIndexBits - 1))
256	ce >>= maxTrieIndexBits
257	offset = int(ce & (1<<maxContractOffsetBits - 1))
258	return
259}
260
261// For expansions, Elems are of the form 11100000 00000000 bbbbbbbb bbbbbbbb,
262// where b* is the index into the expansion sequence table.
263const maxExpandIndexBits = 16
264
265func splitExpandIndex(ce Elem) (index int) {
266	return int(uint16(ce))
267}
268
269// Some runes can be expanded using NFKD decomposition. Instead of storing the full
270// sequence of collation elements, we decompose the rune and lookup the collation
271// elements for each rune in the decomposition and modify the tertiary weights.
272// The Elem, in this case, is of the form 11110000 00000000 wwwwwwww vvvvvvvv, where
273//   - v* is the replacement tertiary weight for the first rune,
274//   - w* is the replacement tertiary weight for the second rune,
275// Tertiary weights of subsequent runes should be replaced with maxTertiary.
276// See http://www.unicode.org/reports/tr10/#Compatibility_Decompositions for more details.
277func splitDecompose(ce Elem) (t1, t2 uint8) {
278	return uint8(ce), uint8(ce >> 8)
279}
280
281const (
282	// These constants were taken from http://www.unicode.org/versions/Unicode6.0.0/ch12.pdf.
283	minUnified       rune = 0x4E00
284	maxUnified            = 0x9FFF
285	minCompatibility      = 0xF900
286	maxCompatibility      = 0xFAFF
287	minRare               = 0x3400
288	maxRare               = 0x4DBF
289)
290const (
291	commonUnifiedOffset = 0x10000
292	rareUnifiedOffset   = 0x20000 // largest rune in common is U+FAFF
293	otherOffset         = 0x50000 // largest rune in rare is U+2FA1D
294	illegalOffset       = otherOffset + int(unicode.MaxRune)
295	maxPrimary          = illegalOffset + 1
296)
297
298// implicitPrimary returns the primary weight for the a rune
299// for which there is no entry for the rune in the collation table.
300// We take a different approach from the one specified in
301// http://unicode.org/reports/tr10/#Implicit_Weights,
302// but preserve the resulting relative ordering of the runes.
303func implicitPrimary(r rune) int {
304	if unicode.Is(unicode.Ideographic, r) {
305		if r >= minUnified && r <= maxUnified {
306			// The most common case for CJK.
307			return int(r) + commonUnifiedOffset
308		}
309		if r >= minCompatibility && r <= maxCompatibility {
310			// This will typically not hit. The DUCET explicitly specifies mappings
311			// for all characters that do not decompose.
312			return int(r) + commonUnifiedOffset
313		}
314		return int(r) + rareUnifiedOffset
315	}
316	return int(r) + otherOffset
317}
318