// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package language

import (
	"bytes"
	"errors"
	"fmt"
	"sort"

	"golang.org/x/text/internal/tag"
)

// isAlpha returns true if the byte is not a digit.
// b must be an ASCII letter or digit.
func isAlpha(b byte) bool {
	return b > '9'
}

// isAlphaNum returns true if the string contains only ASCII letters or digits.
func isAlphaNum(s []byte) bool {
	for _, c := range s {
		if !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9') {
			return false
		}
	}
	return true
}

// ErrSyntax is returned by any of the parsing functions when the
// input is not well-formed, according to BCP 47.
// TODO: return the position at which the syntax error occurred?
var ErrSyntax = errors.New("language: tag is not well-formed")

// ErrDuplicateKey is returned when a tag contains the same key twice with
// different values in the -u section.
var ErrDuplicateKey = errors.New("language: different values for same key in -u extension")

// ValueError is returned by any of the parsing functions when the
// input is well-formed but the respective subtag is not recognized
// as a valid value.
type ValueError struct {
	v [8]byte
}

// NewValueError creates a new ValueError.
func NewValueError(tag []byte) ValueError {
	var e ValueError
	copy(e.v[:], tag)
	return e
}

func (e ValueError) tag() []byte {
	n := bytes.IndexByte(e.v[:], 0)
	if n == -1 {
		n = 8
	}
	return e.v[:n]
}

// Error implements the error interface.
func (e ValueError) Error() string {
	return fmt.Sprintf("language: subtag %q is well-formed but unknown", e.tag())
}

// Subtag returns the subtag for which the error occurred.
func (e ValueError) Subtag() string {
	return string(e.tag())
}
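
// A ValueError can be told apart from a plain syntax error by a type
// assertion on the result of one of the parsing functions. A sketch
// (the tag literal is hypothetical and assumes its 8-letter variant
// subtag is not a registered variant):
//
//	_, err := Parse("en-abcdefgh")
//	if verr, ok := err.(ValueError); ok {
//		fmt.Println("unknown subtag:", verr.Subtag())
//	}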

// scanner is used to scan BCP 47 tokens, which are separated by _ or -.
type scanner struct {
	b     []byte
	bytes [max99thPercentileSize]byte
	token []byte
	start int // start position of the current token
	end   int // end position of the current token
	next  int // next point for scan
	err   error
	done  bool
}

func makeScannerString(s string) scanner {
	scan := scanner{}
	if len(s) <= len(scan.bytes) {
		scan.b = scan.bytes[:copy(scan.bytes[:], s)]
	} else {
		scan.b = []byte(s)
	}
	scan.init()
	return scan
}

// makeScanner returns a scanner using b as the input buffer.
// b is not copied and may be modified by the scanner routines.
func makeScanner(b []byte) scanner {
	scan := scanner{b: b}
	scan.init()
	return scan
}

func (s *scanner) init() {
	for i, c := range s.b {
		if c == '_' {
			s.b[i] = '-'
		}
	}
	s.scan()
}

// toLower converts the string between start and end to lower case.
func (s *scanner) toLower(start, end int) {
	for i := start; i < end; i++ {
		c := s.b[i]
		if 'A' <= c && c <= 'Z' {
			s.b[i] += 'a' - 'A'
		}
	}
}

func (s *scanner) setError(e error) {
	if s.err == nil || (e == ErrSyntax && s.err != ErrSyntax) {
		s.err = e
	}
}

// resizeRange shrinks or grows the array at position oldStart such that
// a new string of size newSize can fit between oldStart and oldEnd.
// Sets the scan point to after the resized range.
func (s *scanner) resizeRange(oldStart, oldEnd, newSize int) {
	s.start = oldStart
	if end := oldStart + newSize; end != oldEnd {
		diff := end - oldEnd
		var b []byte
		if n := len(s.b) + diff; n > cap(s.b) {
			b = make([]byte, n)
			copy(b, s.b[:oldStart])
		} else {
			b = s.b[:n]
		}
		copy(b[end:], s.b[oldEnd:])
		s.b = b
		s.next = end + (s.next - s.end)
		s.end = end
	}
}

// replace replaces the current token with repl.
func (s *scanner) replace(repl string) {
	s.resizeRange(s.start, s.end, len(repl))
	copy(s.b[s.start:], repl)
}

// gobble removes the current token from the input.
// Caller must call scan after calling gobble.
func (s *scanner) gobble(e error) {
	s.setError(e)
	if s.start == 0 {
		s.b = s.b[:+copy(s.b, s.b[s.next:])]
		s.end = 0
	} else {
		s.b = s.b[:s.start-1+copy(s.b[s.start-1:], s.b[s.end:])]
		s.end = s.start - 1
	}
	s.next = s.start
}

// deleteRange removes the given range from s.b before the current token.
func (s *scanner) deleteRange(start, end int) {
	s.b = s.b[:start+copy(s.b[start:], s.b[end:])]
	diff := end - start
	s.next -= diff
	s.start -= diff
	s.end -= diff
}

// scan parses the next token of a BCP 47 string. Tokens that are larger
// than 8 characters or include non-alphanumeric characters result in an error
// and are gobbled and removed from the output.
// It returns the end position of the last token consumed.
func (s *scanner) scan() (end int) {
	end = s.end
	s.token = nil
	for s.start = s.next; s.next < len(s.b); {
		i := bytes.IndexByte(s.b[s.next:], '-')
		if i == -1 {
			s.end = len(s.b)
			s.next = len(s.b)
			i = s.end - s.start
		} else {
			s.end = s.next + i
			s.next = s.end + 1
		}
		token := s.b[s.start:s.end]
		if i < 1 || i > 8 || !isAlphaNum(token) {
			s.gobble(ErrSyntax)
			continue
		}
		s.token = token
		return end
	}
	if n := len(s.b); n > 0 && s.b[n-1] == '-' {
		s.setError(ErrSyntax)
		s.b = s.b[:len(s.b)-1]
	}
	s.done = true
	return end
}
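
// Tying the scanner together: a typical walk over the subtags of an input
// looks roughly like the sketch below. Note that scan does not change case;
// lower-casing is done separately via toLower.
//
//	sc := makeScannerString("en-Latn-US")
//	for !sc.done {
//		_ = sc.token // "en", then "Latn", then "US"
//		sc.scan()
//	}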

// acceptMinSize parses multiple tokens of the given size or greater.
// It returns the end position of the last token consumed.
func (s *scanner) acceptMinSize(min int) (end int) {
	end = s.end
	s.scan()
	for ; len(s.token) >= min; s.scan() {
		end = s.end
	}
	return end
}

// Parse parses the given BCP 47 string and returns a valid Tag. If parsing
// fails, it returns an error and any part of the tag that could be parsed.
// If parsing succeeds but an unknown value is found, it returns a
// ValueError. The Tag returned in this case is simply stripped of the unknown
// value; all other values are preserved. It accepts tags in the BCP 47 format
// and extensions to this standard defined in
// https://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
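//
// A minimal usage sketch (illustrative; the exact outcome depends on the
// data tables compiled into this package):
//
//	t, err := Parse("en_us") // '_' is accepted as an alternative separator
//	switch {
//	case err == nil:
//		// t holds the parsed and normalized tag.
//	case err == ErrSyntax:
//		// The input was not well-formed.
//	default:
//		// For example a ValueError for a well-formed but unknown subtag.
//	}
//	_ = t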
func Parse(s string) (t Tag, err error) {
	// TODO: consider supporting old-style locale key-value pairs.
	if s == "" {
		return Und, ErrSyntax
	}
	defer func() {
		if recover() != nil {
			t = Und
			err = ErrSyntax
			return
		}
	}()
	if len(s) <= maxAltTaglen {
		b := [maxAltTaglen]byte{}
		for i, c := range s {
			// Generating invalid UTF-8 is okay as it won't match.
			if 'A' <= c && c <= 'Z' {
				c += 'a' - 'A'
			} else if c == '_' {
				c = '-'
			}
			b[i] = byte(c)
		}
		if t, ok := grandfathered(b); ok {
			return t, nil
		}
	}
	scan := makeScannerString(s)
	return parse(&scan, s)
}

func parse(scan *scanner, s string) (t Tag, err error) {
	t = Und
	var end int
	if n := len(scan.token); n <= 1 {
		scan.toLower(0, len(scan.b))
		if n == 0 || scan.token[0] != 'x' {
			return t, ErrSyntax
		}
		end = parseExtensions(scan)
	} else if n >= 4 {
		return Und, ErrSyntax
	} else { // the usual case
		t, end = parseTag(scan)
		if n := len(scan.token); n == 1 {
			t.pExt = uint16(end)
			end = parseExtensions(scan)
		} else if end < len(scan.b) {
			scan.setError(ErrSyntax)
			scan.b = scan.b[:end]
		}
	}
	if int(t.pVariant) < len(scan.b) {
		if end < len(s) {
			s = s[:end]
		}
		if len(s) > 0 && tag.Compare(s, scan.b) == 0 {
			t.str = s
		} else {
			t.str = string(scan.b)
		}
	} else {
		t.pVariant, t.pExt = 0, 0
	}
	return t, scan.err
}

// parseTag parses language, script, region and variants.
// It returns a Tag and the end position in the input that was parsed.
func parseTag(scan *scanner) (t Tag, end int) {
	var e error
	// TODO: set an error if an unknown lang, script or region is encountered.
	t.LangID, e = getLangID(scan.token)
	scan.setError(e)
	scan.replace(t.LangID.String())
	langStart := scan.start
	end = scan.scan()
	for len(scan.token) == 3 && isAlpha(scan.token[0]) {
		// From http://tools.ietf.org/html/bcp47, <lang>-<extlang> tags are equivalent
		// to a tag of the form <extlang>.
		lang, e := getLangID(scan.token)
		if lang != 0 {
			t.LangID = lang
			copy(scan.b[langStart:], lang.String())
			scan.b[langStart+3] = '-'
			scan.start = langStart + 4
		}
		scan.gobble(e)
		end = scan.scan()
	}
	if len(scan.token) == 4 && isAlpha(scan.token[0]) {
		t.ScriptID, e = getScriptID(script, scan.token)
		if t.ScriptID == 0 {
			scan.gobble(e)
		}
		end = scan.scan()
	}
	if n := len(scan.token); n >= 2 && n <= 3 {
		t.RegionID, e = getRegionID(scan.token)
		if t.RegionID == 0 {
			scan.gobble(e)
		} else {
			scan.replace(t.RegionID.String())
		}
		end = scan.scan()
	}
	scan.toLower(scan.start, len(scan.b))
	t.pVariant = byte(end)
	end = parseVariants(scan, end, t)
	t.pExt = uint16(end)
	return t, end
}

var separator = []byte{'-'}

// parseVariants scans tokens as long as each token is a valid variant string.
// Duplicate variants are removed.
func parseVariants(scan *scanner, end int, t Tag) int {
	start := scan.start
	varIDBuf := [4]uint8{}
	variantBuf := [4][]byte{}
	varID := varIDBuf[:0]
	variant := variantBuf[:0]
	last := -1
	needSort := false
	for ; len(scan.token) >= 4; scan.scan() {
		// TODO: measure the impact of needing this conversion and redesign
		// the data structure if there is an issue.
		v, ok := variantIndex[string(scan.token)]
		if !ok {
			// unknown variant
			// TODO: allow user-defined variants?
			scan.gobble(NewValueError(scan.token))
			continue
		}
		varID = append(varID, v)
		variant = append(variant, scan.token)
		if !needSort {
			if last < int(v) {
				last = int(v)
			} else {
				needSort = true
				// There are no legal combinations of more than 7 variants
				// (and this is by no means a useful sequence).
				const maxVariants = 8
				if len(varID) > maxVariants {
					break
				}
			}
		}
		end = scan.end
	}
	if needSort {
		sort.Sort(variantsSort{varID, variant})
		k, l := 0, -1
		for i, v := range varID {
			w := int(v)
			if l == w {
				// Remove duplicates.
				continue
			}
			varID[k] = varID[i]
			variant[k] = variant[i]
			k++
			l = w
		}
		if str := bytes.Join(variant[:k], separator); len(str) == 0 {
			end = start - 1
		} else {
			scan.resizeRange(start, end, len(str))
			copy(scan.b[scan.start:], str)
			end = scan.end
		}
	}
	return end
}
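
// For illustration, duplicate and unknown variants are handled as follows
// (a sketch; it assumes "1996" is a registered variant and "abcdefgh" is not):
//
//	t, _ := Parse("de-1996-1996")  // the duplicate variant is dropped: "de-1996"
//	_, err := Parse("de-abcdefgh") // err is a ValueError and the variant is removed
//	_, _ = t, err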

type variantsSort struct {
	i []uint8
	v [][]byte
}

func (s variantsSort) Len() int {
	return len(s.i)
}

func (s variantsSort) Swap(i, j int) {
	s.i[i], s.i[j] = s.i[j], s.i[i]
	s.v[i], s.v[j] = s.v[j], s.v[i]
}

func (s variantsSort) Less(i, j int) bool {
	return s.i[i] < s.i[j]
}

type bytesSort struct {
	b [][]byte
	n int // first n bytes to compare
}

func (b bytesSort) Len() int {
	return len(b.b)
}

func (b bytesSort) Swap(i, j int) {
	b.b[i], b.b[j] = b.b[j], b.b[i]
}

func (b bytesSort) Less(i, j int) bool {
	for k := 0; k < b.n; k++ {
		if b.b[i][k] == b.b[j][k] {
			continue
		}
		return b.b[i][k] < b.b[j][k]
	}
	return false
}

// parseExtensions parses and normalizes the extensions in the buffer.
// It returns the last position of scan.b that is part of any extension.
// It also trims scan.b to remove excess parts accordingly.
func parseExtensions(scan *scanner) int {
	start := scan.start
	exts := [][]byte{}
	private := []byte{}
	end := scan.end
	for len(scan.token) == 1 {
		extStart := scan.start
		ext := scan.token[0]
		end = parseExtension(scan)
		extension := scan.b[extStart:end]
		if len(extension) < 3 || (ext != 'x' && len(extension) < 4) {
			scan.setError(ErrSyntax)
			end = extStart
			continue
		} else if start == extStart && (ext == 'x' || scan.start == len(scan.b)) {
			scan.b = scan.b[:end]
			return end
		} else if ext == 'x' {
			private = extension
			break
		}
		exts = append(exts, extension)
	}
	sort.Sort(bytesSort{exts, 1})
	if len(private) > 0 {
		exts = append(exts, private)
	}
	scan.b = scan.b[:start]
	if len(exts) > 0 {
		scan.b = append(scan.b, bytes.Join(exts, separator)...)
	} else if start > 0 {
		// Strip trailing '-'.
		scan.b = scan.b[:start-1]
	}
	return end
}
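
// The net effect is that extensions come out sorted by their singleton, with
// any private use (-x-) part kept last. A sketch (the extension contents are
// hypothetical but well-formed):
//
//	t, _ := Parse("und-b-bbb-a-aaa-x-priv")
//	// The resulting tag string reads "und-a-aaa-b-bbb-x-priv".
//	_ = t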

// parseExtension parses a single extension and returns the position of
// the extension end.
func parseExtension(scan *scanner) int {
	start, end := scan.start, scan.end
	switch scan.token[0] {
	case 'u': // https://www.ietf.org/rfc/rfc6067.txt
		attrStart := end
		scan.scan()
		for last := []byte{}; len(scan.token) > 2; scan.scan() {
			if bytes.Compare(scan.token, last) != -1 {
				// Attributes are unsorted. Start over from scratch.
				p := attrStart + 1
				scan.next = p
				attrs := [][]byte{}
				for scan.scan(); len(scan.token) > 2; scan.scan() {
					attrs = append(attrs, scan.token)
					end = scan.end
				}
				sort.Sort(bytesSort{attrs, 3})
				copy(scan.b[p:], bytes.Join(attrs, separator))
				break
			}
			last = scan.token
			end = scan.end
		}
		// Scan key-type sequences. A key is of length 2 and may be followed
		// by zero or more "type" subtags of 3 to 8 alphanumeric characters each.
		var last, key []byte
		for attrEnd := end; len(scan.token) == 2; last = key {
			key = scan.token
			end = scan.end
			for scan.scan(); end < scan.end && len(scan.token) > 2; scan.scan() {
				end = scan.end
			}
			// TODO: check key value validity
			if bytes.Compare(key, last) != 1 || scan.err != nil {
				// We have an invalid key or the keys are not sorted.
				// Start scanning keys from scratch and reorder.
				p := attrEnd + 1
				scan.next = p
				keys := [][]byte{}
				for scan.scan(); len(scan.token) == 2; {
					keyStart := scan.start
					end = scan.end
					for scan.scan(); end < scan.end && len(scan.token) > 2; scan.scan() {
						end = scan.end
					}
					keys = append(keys, scan.b[keyStart:end])
				}
				sort.Stable(bytesSort{keys, 2})
				if n := len(keys); n > 0 {
					k := 0
					for i := 1; i < n; i++ {
						if !bytes.Equal(keys[k][:2], keys[i][:2]) {
							k++
							keys[k] = keys[i]
						} else if !bytes.Equal(keys[k], keys[i]) {
							scan.setError(ErrDuplicateKey)
						}
					}
					keys = keys[:k+1]
				}
				reordered := bytes.Join(keys, separator)
				if e := p + len(reordered); e < end {
					scan.deleteRange(e, end)
					end = e
				}
				copy(scan.b[p:], reordered)
				break
			}
		}
	case 't': // https://www.ietf.org/rfc/rfc6497.txt
		scan.scan()
		if n := len(scan.token); n >= 2 && n <= 3 && isAlpha(scan.token[1]) {
			_, end = parseTag(scan)
			scan.toLower(start, end)
		}
		for len(scan.token) == 2 && !isAlpha(scan.token[1]) {
			end = scan.acceptMinSize(3)
		}
	case 'x':
		end = scan.acceptMinSize(1)
	default:
		end = scan.acceptMinSize(2)
	}
	return end
}
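
// For the -u- extension this means attributes and key-type pairs come out
// sorted, and a key that appears twice with different values triggers
// ErrDuplicateKey. A sketch (the keys and types below are only checked for
// well-formedness by this function, not against a registry):
//
//	t, _ := Parse("en-u-nu-latn-ca-gregory")
//	// The -u- keys are reordered: "en-u-ca-gregory-nu-latn".
//	_, err := Parse("en-u-ca-gregory-ca-buddhist")
//	// err == ErrDuplicateKey; only the first value for "ca" is kept.
//	_, _ = t, err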

// getExtension returns the extension (its singleton name and subtags)
// starting at position p and the end position of that extension within s.
func getExtension(s string, p int) (end int, ext string) {
	if s[p] == '-' {
		p++
	}
	if s[p] == 'x' {
		return len(s), s[p:]
	}
	end = nextExtension(s, p)
	return end, s[p:end]
}
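
// For instance (a sketch; offsets are byte positions into s):
//
//	end, ext := getExtension("en-a-bbb-x-private", 2)
//	// ext == "a-bbb" and end == 8; the private use part "-x-private" starts at end.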

// nextExtension finds the next extension within the string, searching
// for the -<char>- pattern from position p.
// In the vast majority of cases, language tags will have at most
// one extension and extensions tend to be small.
func nextExtension(s string, p int) int {
	for n := len(s) - 3; p < n; {
		if s[p] == '-' {
			if s[p+2] == '-' {
				return p
			}
			p += 3
		} else {
			p++
		}
	}
	return len(s)
}