1// Copyright 2013 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package language
6
7import (
8	"bytes"
9	"fmt"
10	"sort"
11	"strconv"
12
13	"golang.org/x/text/internal/tag"
14)
15
16// findIndex tries to find the given tag in idx and returns a standardized error
17// if it could not be found.
18func findIndex(idx tag.Index, key []byte, form string) (index int, err error) {
19	if !tag.FixCase(form, key) {
20		return 0, ErrSyntax
21	}
22	i := idx.Index(key)
23	if i == -1 {
24		return 0, NewValueError(key)
25	}
26	return i, nil
27}
28
// searchUint returns the smallest index i for which imap[i] >= key, or
// len(imap) if no element satisfies that. imap must be sorted in ascending
// order for the binary search to be meaningful.
func searchUint(imap []uint16, key uint16) int {
	atLeastKey := func(i int) bool { return key <= imap[i] }
	return sort.Search(len(imap), atLeastKey)
}
34
// Language is a compact numeric identifier for a language subtag.
// The zero value is rendered as "und" by String.
type Language uint16
36
37// getLangID returns the langID of s if s is a canonical subtag
38// or langUnknown if s is not a canonical subtag.
39func getLangID(s []byte) (Language, error) {
40	if len(s) == 2 {
41		return getLangISO2(s)
42	}
43	return getLangISO3(s)
44}
45
46// TODO language normalization as well as the AliasMaps could be moved to the
47// higher level package, but it is a bit tricky to separate the generation.
48
// Canonicalize returns the Language that id maps to in the alias table,
// together with the type of that alias, as computed by normLang. If id has no
// alias entry, id itself and AliasTypeUnknown are returned.
func (id Language) Canonicalize() (Language, AliasType) {
	return normLang(id)
}
52
53// mapLang returns the mapped langID of id according to mapping m.
54func normLang(id Language) (Language, AliasType) {
55	k := sort.Search(len(AliasMap), func(i int) bool {
56		return AliasMap[i].From >= uint16(id)
57	})
58	if k < len(AliasMap) && AliasMap[k].From == uint16(id) {
59		return Language(AliasMap[k].To), AliasTypes[k]
60	}
61	return id, AliasTypeUnknown
62}
63
64// getLangISO2 returns the langID for the given 2-letter ISO language code
65// or unknownLang if this does not exist.
66func getLangISO2(s []byte) (Language, error) {
67	if !tag.FixCase("zz", s) {
68		return 0, ErrSyntax
69	}
70	if i := lang.Index(s); i != -1 && lang.Elem(i)[3] != 0 {
71		return Language(i), nil
72	}
73	return 0, NewValueError(s)
74}
75
// base is the number of letters in the range 'a' through 'z'.
const base = 'z' - 'a' + 1

// strToInt interprets s as a big-endian number in the given base, where the
// digit value of each byte is its distance from 'a'. An empty s yields 0.
// The bytes of s are expected to be lower-case ASCII letters.
func strToInt(s []byte) uint {
	var v uint
	for _, c := range s {
		v = v*base + uint(c-'a')
	}
	return v
}
86
87// converts the given integer to the original ASCII string passed to strToInt.
88// len(s) must match the number of characters obtained.
89func intToStr(v uint, s []byte) {
90	for i := len(s) - 1; i >= 0; i-- {
91		s[i] = byte(v%base) + 'a'
92		v /= base
93	}
94}
95
96// getLangISO3 returns the langID for the given 3-letter ISO language code
97// or unknownLang if this does not exist.
98func getLangISO3(s []byte) (Language, error) {
99	if tag.FixCase("und", s) {
100		// first try to match canonical 3-letter entries
101		for i := lang.Index(s[:2]); i != -1; i = lang.Next(s[:2], i) {
102			if e := lang.Elem(i); e[3] == 0 && e[2] == s[2] {
103				// We treat "und" as special and always translate it to "unspecified".
104				// Note that ZZ and Zzzz are private use and are not treated as
105				// unspecified by default.
106				id := Language(i)
107				if id == nonCanonicalUnd {
108					return 0, nil
109				}
110				return id, nil
111			}
112		}
113		if i := altLangISO3.Index(s); i != -1 {
114			return Language(altLangIndex[altLangISO3.Elem(i)[3]]), nil
115		}
116		n := strToInt(s)
117		if langNoIndex[n/8]&(1<<(n%8)) != 0 {
118			return Language(n) + langNoIndexOffset, nil
119		}
120		// Check for non-canonical uses of ISO3.
121		for i := lang.Index(s[:1]); i != -1; i = lang.Next(s[:1], i) {
122			if e := lang.Elem(i); e[2] == s[1] && e[3] == s[2] {
123				return Language(i), nil
124			}
125		}
126		return 0, NewValueError(s)
127	}
128	return 0, ErrSyntax
129}
130
131// StringToBuf writes the string to b and returns the number of bytes
132// written.  cap(b) must be >= 3.
133func (id Language) StringToBuf(b []byte) int {
134	if id >= langNoIndexOffset {
135		intToStr(uint(id)-langNoIndexOffset, b[:3])
136		return 3
137	} else if id == 0 {
138		return copy(b, "und")
139	}
140	l := lang[id<<2:]
141	if l[3] == 0 {
142		return copy(b, l[:3])
143	}
144	return copy(b, l[:2])
145}
146
147// String returns the BCP 47 representation of the langID.
148// Use b as variable name, instead of id, to ensure the variable
149// used is consistent with that of Base in which this type is embedded.
150func (b Language) String() string {
151	if b == 0 {
152		return "und"
153	} else if b >= langNoIndexOffset {
154		b -= langNoIndexOffset
155		buf := [3]byte{}
156		intToStr(uint(b), buf[:])
157		return string(buf[:])
158	}
159	l := lang.Elem(int(b))
160	if l[3] == 0 {
161		return l[:3]
162	}
163	return l[:2]
164}
165
166// ISO3 returns the ISO 639-3 language code.
167func (b Language) ISO3() string {
168	if b == 0 || b >= langNoIndexOffset {
169		return b.String()
170	}
171	l := lang.Elem(int(b))
172	if l[3] == 0 {
173		return l[:3]
174	} else if l[2] == 0 {
175		return altLangISO3.Elem(int(l[3]))[:3]
176	}
177	// This allocation will only happen for 3-letter ISO codes
178	// that are non-canonical BCP 47 language identifiers.
179	return l[0:1] + l[2:4]
180}
181
182// IsPrivateUse reports whether this language code is reserved for private use.
183func (b Language) IsPrivateUse() bool {
184	return langPrivateStart <= b && b <= langPrivateEnd
185}
186
187// SuppressScript returns the script marked as SuppressScript in the IANA
188// language tag repository, or 0 if there is no such script.
189func (b Language) SuppressScript() Script {
190	if b < langNoIndexOffset {
191		return Script(suppressScript[b])
192	}
193	return 0
194}
195
// Region is a compact numeric identifier for a region. The zero value is
// rendered as "ZZ" by String; other values below isoRegionOffset are rendered
// as a 3-digit UN M.49 code.
type Region uint16
197
198// getRegionID returns the region id for s if s is a valid 2-letter region code
199// or unknownRegion.
200func getRegionID(s []byte) (Region, error) {
201	if len(s) == 3 {
202		if isAlpha(s[0]) {
203			return getRegionISO3(s)
204		}
205		if i, err := strconv.ParseUint(string(s), 10, 10); err == nil {
206			return getRegionM49(int(i))
207		}
208	}
209	return getRegionISO2(s)
210}
211
212// getRegionISO2 returns the regionID for the given 2-letter ISO country code
213// or unknownRegion if this does not exist.
214func getRegionISO2(s []byte) (Region, error) {
215	i, err := findIndex(regionISO, s, "ZZ")
216	if err != nil {
217		return 0, err
218	}
219	return Region(i) + isoRegionOffset, nil
220}
221
222// getRegionISO3 returns the regionID for the given 3-letter ISO country code
223// or unknownRegion if this does not exist.
224func getRegionISO3(s []byte) (Region, error) {
225	if tag.FixCase("ZZZ", s) {
226		for i := regionISO.Index(s[:1]); i != -1; i = regionISO.Next(s[:1], i) {
227			if e := regionISO.Elem(i); e[2] == s[1] && e[3] == s[2] {
228				return Region(i) + isoRegionOffset, nil
229			}
230		}
231		for i := 0; i < len(altRegionISO3); i += 3 {
232			if tag.Compare(altRegionISO3[i:i+3], s) == 0 {
233				return Region(altRegionIDs[i/3]), nil
234			}
235		}
236		return 0, NewValueError(s)
237	}
238	return 0, ErrSyntax
239}
240
241func getRegionM49(n int) (Region, error) {
242	if 0 < n && n <= 999 {
243		const (
244			searchBits = 7
245			regionBits = 9
246			regionMask = 1<<regionBits - 1
247		)
248		idx := n >> searchBits
249		buf := fromM49[m49Index[idx]:m49Index[idx+1]]
250		val := uint16(n) << regionBits // we rely on bits shifting out
251		i := sort.Search(len(buf), func(i int) bool {
252			return buf[i] >= val
253		})
254		if r := fromM49[int(m49Index[idx])+i]; r&^regionMask == val {
255			return Region(r & regionMask), nil
256		}
257	}
258	var e ValueError
259	fmt.Fprint(bytes.NewBuffer([]byte(e.v[:])), n)
260	return 0, e
261}
262
263// normRegion returns a region if r is deprecated or 0 otherwise.
264// TODO: consider supporting BYS (-> BLR), CSK (-> 200 or CZ), PHI (-> PHL) and AFI (-> DJ).
265// TODO: consider mapping split up regions to new most populous one (like CLDR).
266func normRegion(r Region) Region {
267	m := regionOldMap
268	k := sort.Search(len(m), func(i int) bool {
269		return m[i].From >= uint16(r)
270	})
271	if k < len(m) && m[k].From == uint16(r) {
272		return Region(m[k].To)
273	}
274	return 0
275}
276
// Bit flags describing a region. Each constant occupies its own bit
// (1, 2 and 4 respectively). iso3166UserAssigned is tested against the value
// returned by Region.typ in Region.IsPrivateUse.
// NOTE(review): ccTLD and bcp47Region are presumably tested against the same
// per-region flag byte elsewhere in the package — confirm.
const (
	iso3166UserAssigned = 1 << iota
	ccTLD
	bcp47Region
)
282
// typ returns the flag byte stored for r in regionTypes.
func (r Region) typ() byte {
	return regionTypes[r]
}
286
287// String returns the BCP 47 representation for the region.
288// It returns "ZZ" for an unspecified region.
289func (r Region) String() string {
290	if r < isoRegionOffset {
291		if r == 0 {
292			return "ZZ"
293		}
294		return fmt.Sprintf("%03d", r.M49())
295	}
296	r -= isoRegionOffset
297	return regionISO.Elem(int(r))[:2]
298}
299
300// ISO3 returns the 3-letter ISO code of r.
301// Note that not all regions have a 3-letter ISO code.
302// In such cases this method returns "ZZZ".
303func (r Region) ISO3() string {
304	if r < isoRegionOffset {
305		return "ZZZ"
306	}
307	r -= isoRegionOffset
308	reg := regionISO.Elem(int(r))
309	switch reg[2] {
310	case 0:
311		return altRegionISO3[reg[3]:][:3]
312	case ' ':
313		return "ZZZ"
314	}
315	return reg[0:1] + reg[2:4]
316}
317
// M49 returns the UN M.49 encoding of r, or 0 if this encoding
// is not defined for r. The value is read from the m49 table, which is
// indexed directly by r.
func (r Region) M49() int {
	return int(m49[r])
}
323
// IsPrivateUse reports whether r has the ISO 3166 User-assigned status. This
// may include private-use tags that are assigned by CLDR and used in this
// implementation. So IsPrivateUse and IsCountry can be simultaneously true.
func (r Region) IsPrivateUse() bool {
	// The status is a single bit in the flag byte returned by typ.
	return r.typ()&iso3166UserAssigned != 0
}
330
// Script is a compact numeric identifier for a script. The zero value is
// rendered as "Zzzz" by String.
type Script uint8
332
// getScriptID returns the script id for string s. It assumes that s
// is of the format [A-Z][a-z]{3}. The lookup is delegated to findIndex with
// the form "Zzzz"; its error is returned unchanged, in which case the
// returned Script is 0.
func getScriptID(idx tag.Index, s []byte) (Script, error) {
	i, err := findIndex(idx, s, "Zzzz")
	return Script(i), err
}
339
340// String returns the script code in title case.
341// It returns "Zzzz" for an unspecified script.
342func (s Script) String() string {
343	if s == 0 {
344		return "Zzzz"
345	}
346	return script.Elem(int(s))
347}
348
349// IsPrivateUse reports whether this script code is reserved for private use.
350func (s Script) IsPrivateUse() bool {
351	return _Qaaa <= s && s <= _Qabx
352}
353
// maxAltTaglen is the length of "en-US-POSIX", which is large enough to hold
// the longest key in grandfatheredMap. maxLen is the fixed size of the byte
// array used as the key type of that map.
const (
	maxAltTaglen = len("en-US-POSIX")
	maxLen       = maxAltTaglen
)
358
var (
	// grandfatheredMap holds a mapping from legacy and grandfathered tags to
	// their base language or index to more elaborate tag.
	//
	// Keys are the lower-cased tag, zero-padded to maxLen bytes. A
	// non-negative value is a language ID. A negative value -n selects the
	// n-th replacement tag (counting from 1) stored in altTags.
	grandfatheredMap = map[[maxLen]byte]int16{
		[maxLen]byte{'a', 'r', 't', '-', 'l', 'o', 'j', 'b', 'a', 'n'}: _jbo, // art-lojban
		[maxLen]byte{'i', '-', 'a', 'm', 'i'}:                          _ami, // i-ami
		[maxLen]byte{'i', '-', 'b', 'n', 'n'}:                          _bnn, // i-bnn
		[maxLen]byte{'i', '-', 'h', 'a', 'k'}:                          _hak, // i-hak
		[maxLen]byte{'i', '-', 'k', 'l', 'i', 'n', 'g', 'o', 'n'}:      _tlh, // i-klingon
		[maxLen]byte{'i', '-', 'l', 'u', 'x'}:                          _lb,  // i-lux
		[maxLen]byte{'i', '-', 'n', 'a', 'v', 'a', 'j', 'o'}:           _nv,  // i-navajo
		[maxLen]byte{'i', '-', 'p', 'w', 'n'}:                          _pwn, // i-pwn
		[maxLen]byte{'i', '-', 't', 'a', 'o'}:                          _tao, // i-tao
		[maxLen]byte{'i', '-', 't', 'a', 'y'}:                          _tay, // i-tay
		[maxLen]byte{'i', '-', 't', 's', 'u'}:                          _tsu, // i-tsu
		[maxLen]byte{'n', 'o', '-', 'b', 'o', 'k'}:                     _nb,  // no-bok
		[maxLen]byte{'n', 'o', '-', 'n', 'y', 'n'}:                     _nn,  // no-nyn
		[maxLen]byte{'s', 'g', 'n', '-', 'b', 'e', '-', 'f', 'r'}:      _sfb, // sgn-BE-FR
		[maxLen]byte{'s', 'g', 'n', '-', 'b', 'e', '-', 'n', 'l'}:      _vgt, // sgn-BE-NL
		[maxLen]byte{'s', 'g', 'n', '-', 'c', 'h', '-', 'd', 'e'}:      _sgg, // sgn-CH-DE
		[maxLen]byte{'z', 'h', '-', 'g', 'u', 'o', 'y', 'u'}:           _cmn, // zh-guoyu
		[maxLen]byte{'z', 'h', '-', 'h', 'a', 'k', 'k', 'a'}:           _hak, // zh-hakka
		[maxLen]byte{'z', 'h', '-', 'm', 'i', 'n', '-', 'n', 'a', 'n'}: _nan, // zh-min-nan
		[maxLen]byte{'z', 'h', '-', 'x', 'i', 'a', 'n', 'g'}:           _hsn, // zh-xiang

		// Grandfathered tags with no modern replacement will be converted as
		// follows:
		[maxLen]byte{'c', 'e', 'l', '-', 'g', 'a', 'u', 'l', 'i', 's', 'h'}: -1, // cel-gaulish
		[maxLen]byte{'e', 'n', '-', 'g', 'b', '-', 'o', 'e', 'd'}:           -2, // en-GB-oed
		[maxLen]byte{'i', '-', 'd', 'e', 'f', 'a', 'u', 'l', 't'}:           -3, // i-default
		[maxLen]byte{'i', '-', 'e', 'n', 'o', 'c', 'h', 'i', 'a', 'n'}:      -4, // i-enochian
		[maxLen]byte{'i', '-', 'm', 'i', 'n', 'g', 'o'}:                     -5, // i-mingo
		[maxLen]byte{'z', 'h', '-', 'm', 'i', 'n'}:                          -6, // zh-min

		// CLDR-specific tag.
		[maxLen]byte{'r', 'o', 'o', 't'}:                                    0,  // root
		[maxLen]byte{'e', 'n', '-', 'u', 's', '-', 'p', 'o', 's', 'i', 'x'}: -7, // en_US_POSIX
	}

	// altTagIndex holds the byte offsets that delimit the individual tags
	// inside altTags: tag n (counting from 1) spans
	// altTags[altTagIndex[n-1]:altTagIndex[n]].
	altTagIndex = [...]uint8{0, 17, 31, 45, 61, 74, 86, 102}

	// altTags is the concatenation of the replacement tags referred to by the
	// negative values in grandfatheredMap.
	altTags = "xtg-x-cel-gaulishen-GB-oxendicten-x-i-defaultund-x-i-enochiansee-x-i-mingonan-x-zh-minen-US-u-va-posix"
)
402
403func grandfathered(s [maxAltTaglen]byte) (t Tag, ok bool) {
404	if v, ok := grandfatheredMap[s]; ok {
405		if v < 0 {
406			return Make(altTags[altTagIndex[-v-1]:altTagIndex[-v]]), true
407		}
408		t.LangID = Language(v)
409		return t, true
410	}
411	return t, false
412}
413