1// Copyright 2014 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// +build ignore
6
7package main
8
9// This file contains definitions for interpreting the trie value of the case
10// trie generated by "go run gen*.go". It is shared by both the generator
11// program and the resultant package. Sharing is achieved by the generator
12// copying gen_trieval.go to trieval.go and changing what's above this comment.
13
14// info holds case information for a single rune. It is the value returned
15// by a trie lookup. Most mapping information can be stored in a single 16-bit
16// value. If not, for example when a rune is mapped to multiple runes, the value
17// stores some basic case data and an index into an array with additional data.
18//
19// The per-rune values have the following format:
20//
21//   if (exception) {
22//     15..4  unsigned exception index
23//   } else {
24//     15..8  XOR pattern or index to XOR pattern for case mapping
25//            Only 13..8 are used for XOR patterns.
26//         7  inverseFold (fold to upper, not to lower)
27//         6  index: interpret the XOR pattern as an index
28//            or isMid if case mode is cIgnorableUncased.
29//      5..4  CCC: zero (normal or break), above or other
30//   }
31//      3  exception: interpret this value as an exception index
32//         (TODO: is this bit necessary? Probably implied from case mode.)
33//   2..0  case mode
34//
35// For the non-exceptional cases, a rune must be either uncased, lowercase or
36// uppercase. If the rune is cased, the XOR pattern maps either a lowercase
37// rune to uppercase or an uppercase rune to lowercase (applied to the 10
38// least-significant bits of the rune).
39//
40// See the definitions below for a more detailed description of the various
41// bits.
42type info uint16
43
44const (
45	casedMask      = 0x0003
46	fullCasedMask  = 0x0007
47	ignorableMask  = 0x0006
48	ignorableValue = 0x0004
49
50	inverseFoldBit = 1 << 7
51	isMidBit       = 1 << 6
52
53	exceptionBit     = 1 << 3
54	exceptionShift   = 4
55	numExceptionBits = 12
56
57	xorIndexBit = 1 << 6
58	xorShift    = 8
59
60	// There is no mapping if all xor bits and the exception bit are zero.
61	hasMappingMask = 0xff80 | exceptionBit
62)
63
64// The case mode bits encodes the case type of a rune. This includes uncased,
65// title, upper and lower case and case ignorable. (For a definition of these
66// terms see Chapter 3 of The Unicode Standard Core Specification.) In some rare
67// cases, a rune can be both cased and case-ignorable. This is encoded by
68// cIgnorableCased. A rune of this type is always lower case. Some runes are
69// cased while not having a mapping.
70//
71// A common pattern for scripts in the Unicode standard is for upper and lower
72// case runes to alternate for increasing rune values (e.g. the accented Latin
73// ranges starting from U+0100 and U+1E00 among others and some Cyrillic
74// characters). We use this property by defining a cXORCase mode, where the case
75// mode (always upper or lower case) is derived from the rune value. As the XOR
76// pattern for case mappings is often identical for successive runes, using
77// cXORCase can result in large series of identical trie values. This, in turn,
78// allows us to better compress the trie blocks.
79const (
80	cUncased          info = iota // 000
81	cTitle                        // 001
82	cLower                        // 010
83	cUpper                        // 011
84	cIgnorableUncased             // 100
85	cIgnorableCased               // 101 // lower case if mappings exist
86	cXORCase                      // 11x // case is cLower | ((rune&1) ^ x)
87
88	maxCaseMode = cUpper
89)
90
91func (c info) isCased() bool {
92	return c&casedMask != 0
93}
94
95func (c info) isCaseIgnorable() bool {
96	return c&ignorableMask == ignorableValue
97}
98
99func (c info) isNotCasedAndNotCaseIgnorable() bool {
100	return c&fullCasedMask == 0
101}
102
103func (c info) isCaseIgnorableAndNotCased() bool {
104	return c&fullCasedMask == cIgnorableUncased
105}
106
107func (c info) isMid() bool {
108	return c&(fullCasedMask|isMidBit) == isMidBit|cIgnorableUncased
109}
110
111// The case mapping implementation will need to know about various Canonical
112// Combining Class (CCC) values. We encode two of these in the trie value:
113// cccZero (0) and cccAbove (230). If the value is cccOther, it means that
114// CCC(r) > 0, but not 230. A value of cccBreak means that CCC(r) == 0 and that
115// the rune also has the break category Break (see below).
116const (
117	cccBreak info = iota << 4
118	cccZero
119	cccAbove
120	cccOther
121
122	cccMask = cccBreak | cccZero | cccAbove | cccOther
123)
124
125const (
126	starter       = 0
127	above         = 230
128	iotaSubscript = 240
129)
130
131// The exceptions slice holds data that does not fit in a normal info entry.
132// The entry is pointed to by the exception index in an entry. It has the
133// following format:
134//
135// Header
136// byte 0:
137//  7..6  unused
138//  5..4  CCC type (same bits as entry)
139//     3  unused
140//  2..0  length of fold
141//
142// byte 1:
143//   7..6  unused
144//   5..3  length of 1st mapping of case type
145//   2..0  length of 2nd mapping of case type
146//
147//   case     1st    2nd
148//   lower -> upper, title
149//   upper -> lower, title
150//   title -> lower, upper
151//
152// Lengths with the value 0x7 indicate no value and implies no change.
153// A length of 0 indicates a mapping to zero-length string.
154//
155// Body bytes:
156//   case folding bytes
157//   lowercase mapping bytes
158//   uppercase mapping bytes
159//   titlecase mapping bytes
160//   closure mapping bytes (for NFKC_Casefold). (TODO)
161//
162// Fallbacks:
163//   missing fold  -> lower
164//   missing title -> upper
165//   all missing   -> original rune
166//
167// exceptions starts with a dummy byte to enforce that there is no zero index
168// value.
169const (
170	lengthMask = 0x07
171	lengthBits = 3
172	noChange   = 0
173)
174
175// References to generated trie.
176
177var trie = newCaseTrie(0)
178
179var sparse = sparseBlocks{
180	values:  sparseValues[:],
181	offsets: sparseOffsets[:],
182}
183
184// Sparse block lookup code.
185
186// valueRange is an entry in a sparse block.
187type valueRange struct {
188	value  uint16
189	lo, hi byte
190}
191
192type sparseBlocks struct {
193	values  []valueRange
194	offsets []uint16
195}
196
197// lookup returns the value from values block n for byte b using binary search.
198func (s *sparseBlocks) lookup(n uint32, b byte) uint16 {
199	lo := s.offsets[n]
200	hi := s.offsets[n+1]
201	for lo < hi {
202		m := lo + (hi-lo)/2
203		r := s.values[m]
204		if r.lo <= b && b <= r.hi {
205			return r.value
206		}
207		if b < r.lo {
208			hi = m
209		} else {
210			lo = m + 1
211		}
212	}
213	return 0
214}
215
216// lastRuneForTesting is the last rune used for testing. Everything after this
217// is boring.
218const lastRuneForTesting = rune(0x1FFFF)
219