1// Copyright 2012 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package build
6
7import (
8	"fmt"
9	"unicode"
10
11	"golang.org/x/text/internal/colltab"
12)
13
14const (
15	defaultSecondary = 0x20
16	defaultTertiary  = 0x2
17	maxTertiary      = 0x1F
18)
19
20type rawCE struct {
21	w   []int
22	ccc uint8
23}
24
25func makeRawCE(w []int, ccc uint8) rawCE {
26	ce := rawCE{w: make([]int, 4), ccc: ccc}
27	copy(ce.w, w)
28	return ce
29}
30
31// A collation element is represented as an uint32.
32// In the typical case, a rune maps to a single collation element. If a rune
33// can be the start of a contraction or expands into multiple collation elements,
34// then the collation element that is associated with a rune will have a special
35// form to represent such m to n mappings.  Such special collation elements
36// have a value >= 0x80000000.
37
38const (
39	maxPrimaryBits   = 21
40	maxSecondaryBits = 12
41	maxTertiaryBits  = 8
42)
43
44func makeCE(ce rawCE) (uint32, error) {
45	v, e := colltab.MakeElem(ce.w[0], ce.w[1], ce.w[2], ce.ccc)
46	return uint32(v), e
47}
48
49// For contractions, collation elements are of the form
50// 110bbbbb bbbbbbbb iiiiiiii iiiinnnn, where
51//   - n* is the size of the first node in the contraction trie.
52//   - i* is the index of the first node in the contraction trie.
53//   - b* is the offset into the contraction collation element table.
54// See contract.go for details on the contraction trie.
55const (
56	contractID            = 0xC0000000
57	maxNBits              = 4
58	maxTrieIndexBits      = 12
59	maxContractOffsetBits = 13
60)
61
62func makeContractIndex(h ctHandle, offset int) (uint32, error) {
63	if h.n >= 1<<maxNBits {
64		return 0, fmt.Errorf("size of contraction trie node too large: %d >= %d", h.n, 1<<maxNBits)
65	}
66	if h.index >= 1<<maxTrieIndexBits {
67		return 0, fmt.Errorf("size of contraction trie offset too large: %d >= %d", h.index, 1<<maxTrieIndexBits)
68	}
69	if offset >= 1<<maxContractOffsetBits {
70		return 0, fmt.Errorf("contraction offset out of bounds: %x >= %x", offset, 1<<maxContractOffsetBits)
71	}
72	ce := uint32(contractID)
73	ce += uint32(offset << (maxNBits + maxTrieIndexBits))
74	ce += uint32(h.index << maxNBits)
75	ce += uint32(h.n)
76	return ce, nil
77}
78
79// For expansions, collation elements are of the form
80// 11100000 00000000 bbbbbbbb bbbbbbbb,
81// where b* is the index into the expansion sequence table.
82const (
83	expandID           = 0xE0000000
84	maxExpandIndexBits = 16
85)
86
87func makeExpandIndex(index int) (uint32, error) {
88	if index >= 1<<maxExpandIndexBits {
89		return 0, fmt.Errorf("expansion index out of bounds: %x >= %x", index, 1<<maxExpandIndexBits)
90	}
91	return expandID + uint32(index), nil
92}
93
94// Each list of collation elements corresponding to an expansion starts with
95// a header indicating the length of the sequence.
96func makeExpansionHeader(n int) (uint32, error) {
97	return uint32(n), nil
98}
99
100// Some runes can be expanded using NFKD decomposition. Instead of storing the full
101// sequence of collation elements, we decompose the rune and lookup the collation
102// elements for each rune in the decomposition and modify the tertiary weights.
103// The collation element, in this case, is of the form
104// 11110000 00000000 wwwwwwww vvvvvvvv, where
105//   - v* is the replacement tertiary weight for the first rune,
106//   - w* is the replacement tertiary weight for the second rune,
107// Tertiary weights of subsequent runes should be replaced with maxTertiary.
108// See https://www.unicode.org/reports/tr10/#Compatibility_Decompositions for more details.
109const (
110	decompID = 0xF0000000
111)
112
113func makeDecompose(t1, t2 int) (uint32, error) {
114	if t1 >= 256 || t1 < 0 {
115		return 0, fmt.Errorf("first tertiary weight out of bounds: %d >= 256", t1)
116	}
117	if t2 >= 256 || t2 < 0 {
118		return 0, fmt.Errorf("second tertiary weight out of bounds: %d >= 256", t2)
119	}
120	return uint32(t2<<8+t1) + decompID, nil
121}
122
123const (
124	// These constants were taken from https://www.unicode.org/versions/Unicode6.0.0/ch12.pdf.
125	minUnified       rune = 0x4E00
126	maxUnified            = 0x9FFF
127	minCompatibility      = 0xF900
128	maxCompatibility      = 0xFAFF
129	minRare               = 0x3400
130	maxRare               = 0x4DBF
131)
132const (
133	commonUnifiedOffset = 0x10000
134	rareUnifiedOffset   = 0x20000 // largest rune in common is U+FAFF
135	otherOffset         = 0x50000 // largest rune in rare is U+2FA1D
136	illegalOffset       = otherOffset + int(unicode.MaxRune)
137	maxPrimary          = illegalOffset + 1
138)
139
140// implicitPrimary returns the primary weight for the a rune
141// for which there is no entry for the rune in the collation table.
142// We take a different approach from the one specified in
143// https://unicode.org/reports/tr10/#Implicit_Weights,
144// but preserve the resulting relative ordering of the runes.
145func implicitPrimary(r rune) int {
146	if unicode.Is(unicode.Ideographic, r) {
147		if r >= minUnified && r <= maxUnified {
148			// The most common case for CJK.
149			return int(r) + commonUnifiedOffset
150		}
151		if r >= minCompatibility && r <= maxCompatibility {
152			// This will typically not hit. The DUCET explicitly specifies mappings
153			// for all characters that do not decompose.
154			return int(r) + commonUnifiedOffset
155		}
156		return int(r) + rareUnifiedOffset
157	}
158	return int(r) + otherOffset
159}
160
161// convertLargeWeights converts collation elements with large
162// primaries (either double primaries or for illegal runes)
163// to our own representation.
164// A CJK character C is represented in the DUCET as
165//   [.FBxx.0020.0002.C][.BBBB.0000.0000.C]
166// We will rewrite these characters to a single CE.
167// We assume the CJK values start at 0x8000.
168// See https://unicode.org/reports/tr10/#Implicit_Weights
169func convertLargeWeights(elems []rawCE) (res []rawCE, err error) {
170	const (
171		cjkPrimaryStart   = 0xFB40
172		rarePrimaryStart  = 0xFB80
173		otherPrimaryStart = 0xFBC0
174		illegalPrimary    = 0xFFFE
175		highBitsMask      = 0x3F
176		lowBitsMask       = 0x7FFF
177		lowBitsFlag       = 0x8000
178		shiftBits         = 15
179	)
180	for i := 0; i < len(elems); i++ {
181		ce := elems[i].w
182		p := ce[0]
183		if p < cjkPrimaryStart {
184			continue
185		}
186		if p > 0xFFFF {
187			return elems, fmt.Errorf("found primary weight %X; should be <= 0xFFFF", p)
188		}
189		if p >= illegalPrimary {
190			ce[0] = illegalOffset + p - illegalPrimary
191		} else {
192			if i+1 >= len(elems) {
193				return elems, fmt.Errorf("second part of double primary weight missing: %v", elems)
194			}
195			if elems[i+1].w[0]&lowBitsFlag == 0 {
196				return elems, fmt.Errorf("malformed second part of double primary weight: %v", elems)
197			}
198			np := ((p & highBitsMask) << shiftBits) + elems[i+1].w[0]&lowBitsMask
199			switch {
200			case p < rarePrimaryStart:
201				np += commonUnifiedOffset
202			case p < otherPrimaryStart:
203				np += rareUnifiedOffset
204			default:
205				p += otherOffset
206			}
207			ce[0] = np
208			for j := i + 1; j+1 < len(elems); j++ {
209				elems[j] = elems[j+1]
210			}
211			elems = elems[:len(elems)-1]
212		}
213	}
214	return elems, nil
215}
216
217// nextWeight computes the first possible collation weights following elems
218// for the given level.
219func nextWeight(level colltab.Level, elems []rawCE) []rawCE {
220	if level == colltab.Identity {
221		next := make([]rawCE, len(elems))
222		copy(next, elems)
223		return next
224	}
225	next := []rawCE{makeRawCE(elems[0].w, elems[0].ccc)}
226	next[0].w[level]++
227	if level < colltab.Secondary {
228		next[0].w[colltab.Secondary] = defaultSecondary
229	}
230	if level < colltab.Tertiary {
231		next[0].w[colltab.Tertiary] = defaultTertiary
232	}
233	// Filter entries that cannot influence ordering.
234	for _, ce := range elems[1:] {
235		skip := true
236		for i := colltab.Primary; i < level; i++ {
237			skip = skip && ce.w[i] == 0
238		}
239		if !skip {
240			next = append(next, ce)
241		}
242	}
243	return next
244}
245
246func nextVal(elems []rawCE, i int, level colltab.Level) (index, value int) {
247	for ; i < len(elems) && elems[i].w[level] == 0; i++ {
248	}
249	if i < len(elems) {
250		return i, elems[i].w[level]
251	}
252	return i, 0
253}
254
255// compareWeights returns -1 if a < b, 1 if a > b, or 0 otherwise.
256// It also returns the collation level at which the difference is found.
257func compareWeights(a, b []rawCE) (result int, level colltab.Level) {
258	for level := colltab.Primary; level < colltab.Identity; level++ {
259		var va, vb int
260		for ia, ib := 0, 0; ia < len(a) || ib < len(b); ia, ib = ia+1, ib+1 {
261			ia, va = nextVal(a, ia, level)
262			ib, vb = nextVal(b, ib, level)
263			if va != vb {
264				if va < vb {
265					return -1, level
266				} else {
267					return 1, level
268				}
269			}
270		}
271	}
272	return 0, colltab.Identity
273}
274
275func equalCE(a, b rawCE) bool {
276	for i := 0; i < 3; i++ {
277		if b.w[i] != a.w[i] {
278			return false
279		}
280	}
281	return true
282}
283
284func equalCEArrays(a, b []rawCE) bool {
285	if len(a) != len(b) {
286		return false
287	}
288	for i := range a {
289		if !equalCE(a[i], b[i]) {
290			return false
291		}
292	}
293	return true
294}
295