1// Copyright 2009 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Package unicode provides data and functions to test some properties of
6// Unicode code points.
7package unicode
8
9const (
10	MaxRune         = '\U0010FFFF' // Maximum valid Unicode code point.
11	ReplacementChar = '\uFFFD'     // Represents invalid code points.
12	MaxASCII        = '\u007F'     // maximum ASCII value.
13	MaxLatin1       = '\u00FF'     // maximum Latin-1 value.
14)
15
16// RangeTable defines a set of Unicode code points by listing the ranges of
17// code points within the set. The ranges are listed in two slices
18// to save space: a slice of 16-bit ranges and a slice of 32-bit ranges.
19// The two slices must be in sorted order and non-overlapping.
20// Also, R32 should contain only values >= 0x10000 (1<<16).
21type RangeTable struct {
22	R16         []Range16
23	R32         []Range32
24	LatinOffset int // number of entries in R16 with Hi <= MaxLatin1
25}
26
27// Range16 represents of a range of 16-bit Unicode code points. The range runs from Lo to Hi
28// inclusive and has the specified stride.
29type Range16 struct {
30	Lo     uint16
31	Hi     uint16
32	Stride uint16
33}
34
35// Range32 represents of a range of Unicode code points and is used when one or
36// more of the values will not fit in 16 bits. The range runs from Lo to Hi
37// inclusive and has the specified stride. Lo and Hi must always be >= 1<<16.
38type Range32 struct {
39	Lo     uint32
40	Hi     uint32
41	Stride uint32
42}
43
44// CaseRange represents a range of Unicode code points for simple (one
45// code point to one code point) case conversion.
46// The range runs from Lo to Hi inclusive, with a fixed stride of 1. Deltas
47// are the number to add to the code point to reach the code point for a
48// different case for that character. They may be negative. If zero, it
49// means the character is in the corresponding case. There is a special
50// case representing sequences of alternating corresponding Upper and Lower
51// pairs. It appears with a fixed Delta of
52//	{UpperLower, UpperLower, UpperLower}
53// The constant UpperLower has an otherwise impossible delta value.
54type CaseRange struct {
55	Lo    uint32
56	Hi    uint32
57	Delta d
58}
59
60// SpecialCase represents language-specific case mappings such as Turkish.
61// Methods of SpecialCase customize (by overriding) the standard mappings.
62type SpecialCase []CaseRange
63
64// BUG(r): There is no mechanism for full case folding, that is, for
65// characters that involve multiple runes in the input or output.
66
67// Indices into the Delta arrays inside CaseRanges for case mapping.
68const (
69	UpperCase = iota
70	LowerCase
71	TitleCase
72	MaxCase
73)
74
75type d [MaxCase]rune // to make the CaseRanges text shorter
76
77// If the Delta field of a CaseRange is UpperLower, it means
78// this CaseRange represents a sequence of the form (say)
79// Upper Lower Upper Lower.
80const (
81	UpperLower = MaxRune + 1 // (Cannot be a valid delta.)
82)
83
84// linearMax is the maximum size table for linear search for non-Latin1 rune.
85// Derived by running 'go test -calibrate'.
86const linearMax = 18
87
88// is16 reports whether r is in the sorted slice of 16-bit ranges.
89func is16(ranges []Range16, r uint16) bool {
90	if len(ranges) <= linearMax || r <= MaxLatin1 {
91		for i := range ranges {
92			range_ := &ranges[i]
93			if r < range_.Lo {
94				return false
95			}
96			if r <= range_.Hi {
97				return range_.Stride == 1 || (r-range_.Lo)%range_.Stride == 0
98			}
99		}
100		return false
101	}
102
103	// binary search over ranges
104	lo := 0
105	hi := len(ranges)
106	for lo < hi {
107		m := lo + (hi-lo)/2
108		range_ := &ranges[m]
109		if range_.Lo <= r && r <= range_.Hi {
110			return range_.Stride == 1 || (r-range_.Lo)%range_.Stride == 0
111		}
112		if r < range_.Lo {
113			hi = m
114		} else {
115			lo = m + 1
116		}
117	}
118	return false
119}
120
121// is32 reports whether r is in the sorted slice of 32-bit ranges.
122func is32(ranges []Range32, r uint32) bool {
123	if len(ranges) <= linearMax {
124		for i := range ranges {
125			range_ := &ranges[i]
126			if r < range_.Lo {
127				return false
128			}
129			if r <= range_.Hi {
130				return range_.Stride == 1 || (r-range_.Lo)%range_.Stride == 0
131			}
132		}
133		return false
134	}
135
136	// binary search over ranges
137	lo := 0
138	hi := len(ranges)
139	for lo < hi {
140		m := lo + (hi-lo)/2
141		range_ := ranges[m]
142		if range_.Lo <= r && r <= range_.Hi {
143			return range_.Stride == 1 || (r-range_.Lo)%range_.Stride == 0
144		}
145		if r < range_.Lo {
146			hi = m
147		} else {
148			lo = m + 1
149		}
150	}
151	return false
152}
153
154// Is reports whether the rune is in the specified table of ranges.
155func Is(rangeTab *RangeTable, r rune) bool {
156	r16 := rangeTab.R16
157	if len(r16) > 0 && r <= rune(r16[len(r16)-1].Hi) {
158		return is16(r16, uint16(r))
159	}
160	r32 := rangeTab.R32
161	if len(r32) > 0 && r >= rune(r32[0].Lo) {
162		return is32(r32, uint32(r))
163	}
164	return false
165}
166
167func isExcludingLatin(rangeTab *RangeTable, r rune) bool {
168	r16 := rangeTab.R16
169	if off := rangeTab.LatinOffset; len(r16) > off && r <= rune(r16[len(r16)-1].Hi) {
170		return is16(r16[off:], uint16(r))
171	}
172	r32 := rangeTab.R32
173	if len(r32) > 0 && r >= rune(r32[0].Lo) {
174		return is32(r32, uint32(r))
175	}
176	return false
177}
178
179// IsUpper reports whether the rune is an upper case letter.
180func IsUpper(r rune) bool {
181	// See comment in IsGraphic.
182	if uint32(r) <= MaxLatin1 {
183		return properties[uint8(r)]&pLmask == pLu
184	}
185	return isExcludingLatin(Upper, r)
186}
187
188// IsLower reports whether the rune is a lower case letter.
189func IsLower(r rune) bool {
190	// See comment in IsGraphic.
191	if uint32(r) <= MaxLatin1 {
192		return properties[uint8(r)]&pLmask == pLl
193	}
194	return isExcludingLatin(Lower, r)
195}
196
197// IsTitle reports whether the rune is a title case letter.
198func IsTitle(r rune) bool {
199	if r <= MaxLatin1 {
200		return false
201	}
202	return isExcludingLatin(Title, r)
203}
204
205// to maps the rune using the specified case mapping.
206// It additionally reports whether caseRange contained a mapping for r.
207func to(_case int, r rune, caseRange []CaseRange) (mappedRune rune, foundMapping bool) {
208	if _case < 0 || MaxCase <= _case {
209		return ReplacementChar, false // as reasonable an error as any
210	}
211	// binary search over ranges
212	lo := 0
213	hi := len(caseRange)
214	for lo < hi {
215		m := lo + (hi-lo)/2
216		cr := caseRange[m]
217		if rune(cr.Lo) <= r && r <= rune(cr.Hi) {
218			delta := cr.Delta[_case]
219			if delta > MaxRune {
220				// In an Upper-Lower sequence, which always starts with
221				// an UpperCase letter, the real deltas always look like:
222				//	{0, 1, 0}    UpperCase (Lower is next)
223				//	{-1, 0, -1}  LowerCase (Upper, Title are previous)
224				// The characters at even offsets from the beginning of the
225				// sequence are upper case; the ones at odd offsets are lower.
226				// The correct mapping can be done by clearing or setting the low
227				// bit in the sequence offset.
228				// The constants UpperCase and TitleCase are even while LowerCase
229				// is odd so we take the low bit from _case.
230				return rune(cr.Lo) + ((r-rune(cr.Lo))&^1 | rune(_case&1)), true
231			}
232			return r + delta, true
233		}
234		if r < rune(cr.Lo) {
235			hi = m
236		} else {
237			lo = m + 1
238		}
239	}
240	return r, false
241}
242
243// To maps the rune to the specified case: UpperCase, LowerCase, or TitleCase.
244func To(_case int, r rune) rune {
245	r, _ = to(_case, r, CaseRanges)
246	return r
247}
248
249// ToUpper maps the rune to upper case.
250func ToUpper(r rune) rune {
251	if r <= MaxASCII {
252		if 'a' <= r && r <= 'z' {
253			r -= 'a' - 'A'
254		}
255		return r
256	}
257	return To(UpperCase, r)
258}
259
260// ToLower maps the rune to lower case.
261func ToLower(r rune) rune {
262	if r <= MaxASCII {
263		if 'A' <= r && r <= 'Z' {
264			r += 'a' - 'A'
265		}
266		return r
267	}
268	return To(LowerCase, r)
269}
270
271// ToTitle maps the rune to title case.
272func ToTitle(r rune) rune {
273	if r <= MaxASCII {
274		if 'a' <= r && r <= 'z' { // title case is upper case for ASCII
275			r -= 'a' - 'A'
276		}
277		return r
278	}
279	return To(TitleCase, r)
280}
281
282// ToUpper maps the rune to upper case giving priority to the special mapping.
283func (special SpecialCase) ToUpper(r rune) rune {
284	r1, hadMapping := to(UpperCase, r, []CaseRange(special))
285	if r1 == r && !hadMapping {
286		r1 = ToUpper(r)
287	}
288	return r1
289}
290
291// ToTitle maps the rune to title case giving priority to the special mapping.
292func (special SpecialCase) ToTitle(r rune) rune {
293	r1, hadMapping := to(TitleCase, r, []CaseRange(special))
294	if r1 == r && !hadMapping {
295		r1 = ToTitle(r)
296	}
297	return r1
298}
299
300// ToLower maps the rune to lower case giving priority to the special mapping.
301func (special SpecialCase) ToLower(r rune) rune {
302	r1, hadMapping := to(LowerCase, r, []CaseRange(special))
303	if r1 == r && !hadMapping {
304		r1 = ToLower(r)
305	}
306	return r1
307}
308
309// caseOrbit is defined in tables.go as []foldPair. Right now all the
310// entries fit in uint16, so use uint16. If that changes, compilation
311// will fail (the constants in the composite literal will not fit in uint16)
312// and the types here can change to uint32.
313type foldPair struct {
314	From uint16
315	To   uint16
316}
317
318// SimpleFold iterates over Unicode code points equivalent under
319// the Unicode-defined simple case folding. Among the code points
320// equivalent to rune (including rune itself), SimpleFold returns the
321// smallest rune > r if one exists, or else the smallest rune >= 0.
322// If r is not a valid Unicode code point, SimpleFold(r) returns r.
323//
324// For example:
325//	SimpleFold('A') = 'a'
326//	SimpleFold('a') = 'A'
327//
328//	SimpleFold('K') = 'k'
329//	SimpleFold('k') = '\u212A' (Kelvin symbol, K)
330//	SimpleFold('\u212A') = 'K'
331//
332//	SimpleFold('1') = '1'
333//
334//	SimpleFold(-2) = -2
335//
336func SimpleFold(r rune) rune {
337	if r < 0 || r > MaxRune {
338		return r
339	}
340
341	if int(r) < len(asciiFold) {
342		return rune(asciiFold[r])
343	}
344
345	// Consult caseOrbit table for special cases.
346	lo := 0
347	hi := len(caseOrbit)
348	for lo < hi {
349		m := lo + (hi-lo)/2
350		if rune(caseOrbit[m].From) < r {
351			lo = m + 1
352		} else {
353			hi = m
354		}
355	}
356	if lo < len(caseOrbit) && rune(caseOrbit[lo].From) == r {
357		return rune(caseOrbit[lo].To)
358	}
359
360	// No folding specified. This is a one- or two-element
361	// equivalence class containing rune and ToLower(rune)
362	// and ToUpper(rune) if they are different from rune.
363	if l := ToLower(r); l != r {
364		return l
365	}
366	return ToUpper(r)
367}
368