1// Copyright 2013 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package cldr
6
7import (
8	"bufio"
9	"encoding/xml"
10	"errors"
11	"fmt"
12	"strconv"
13	"strings"
14	"unicode"
15	"unicode/utf8"
16)
17
// RuleProcessor receives the rules found by Collation's Process method,
// which parses the tailoring and calls one method per rule.
type RuleProcessor interface {
	// Reset is called for a reset rule. anchor is the reset position and
	// before is the requested "before" level, or 0 when none was given.
	Reset(anchor string, before int) error

	// Insert is called for an ordering relation. level is the strength of
	// the relation as determined by the parser; str is the string being
	// ordered, and context and extend hold the optional context and
	// extension strings (empty when absent).
	Insert(level int, str string, context string, extend string) error

	// Index is called instead of Insert for a value that starts with the
	// index sentinel; id is the value with the sentinel removed.
	Index(id string)
}
25
// cldrIndex is a Unicode-reserved sentinel value used to mark the start
// of a grouping within an index. A rule value that begins with it is not
// inserted as an ordinary rule; it is reported through RuleProcessor.Index.
// See https://unicode.org/reports/tr35/#Collation_Elements for details.
const cldrIndex = "\uFDD0"

// specialAnchor is the fmt layout used to represent logical reset
// positions, such as "first tertiary ignorable", as an anchor string.
const specialAnchor = "<%s/>"
37
38// Process parses the rules for the tailorings of this collation
39// and calls the respective methods of p for each rule found.
40func (c Collation) Process(p RuleProcessor) (err error) {
41	if len(c.Cr) > 0 {
42		if len(c.Cr) > 1 {
43			return fmt.Errorf("multiple cr elements, want 0 or 1")
44		}
45		return processRules(p, c.Cr[0].Data())
46	}
47	if c.Rules.Any != nil {
48		return c.processXML(p)
49	}
50	return errors.New("no tailoring data")
51}
52
53// processRules parses rules in the Collation Rule Syntax defined in
54// https://www.unicode.org/reports/tr35/tr35-collation.html#Collation_Tailorings.
55func processRules(p RuleProcessor, s string) (err error) {
56	chk := func(s string, e error) string {
57		if err == nil {
58			err = e
59		}
60		return s
61	}
62	i := 0 // Save the line number for use after the loop.
63	scanner := bufio.NewScanner(strings.NewReader(s))
64	for ; scanner.Scan() && err == nil; i++ {
65		for s := skipSpace(scanner.Text()); s != "" && s[0] != '#'; s = skipSpace(s) {
66			level := 5
67			var ch byte
68			switch ch, s = s[0], s[1:]; ch {
69			case '&': // followed by <anchor> or '[' <key> ']'
70				if s = skipSpace(s); consume(&s, '[') {
71					s = chk(parseSpecialAnchor(p, s))
72				} else {
73					s = chk(parseAnchor(p, 0, s))
74				}
75			case '<': // sort relation '<'{1,4}, optionally followed by '*'.
76				for level = 1; consume(&s, '<'); level++ {
77				}
78				if level > 4 {
79					err = fmt.Errorf("level %d > 4", level)
80				}
81				fallthrough
82			case '=': // identity relation, optionally followed by *.
83				if consume(&s, '*') {
84					s = chk(parseSequence(p, level, s))
85				} else {
86					s = chk(parseOrder(p, level, s))
87				}
88			default:
89				chk("", fmt.Errorf("illegal operator %q", ch))
90				break
91			}
92		}
93	}
94	if chk("", scanner.Err()); err != nil {
95		return fmt.Errorf("%d: %v", i, err)
96	}
97	return nil
98}
99
100// parseSpecialAnchor parses the anchor syntax which is either of the form
101//    ['before' <level>] <anchor>
102// or
103//    [<label>]
104// The starting should already be consumed.
105func parseSpecialAnchor(p RuleProcessor, s string) (tail string, err error) {
106	i := strings.IndexByte(s, ']')
107	if i == -1 {
108		return "", errors.New("unmatched bracket")
109	}
110	a := strings.TrimSpace(s[:i])
111	s = s[i+1:]
112	if strings.HasPrefix(a, "before ") {
113		l, err := strconv.ParseUint(skipSpace(a[len("before "):]), 10, 3)
114		if err != nil {
115			return s, err
116		}
117		return parseAnchor(p, int(l), s)
118	}
119	return s, p.Reset(fmt.Sprintf(specialAnchor, a), 0)
120}
121
122func parseAnchor(p RuleProcessor, level int, s string) (tail string, err error) {
123	anchor, s, err := scanString(s)
124	if err != nil {
125		return s, err
126	}
127	return s, p.Reset(anchor, level)
128}
129
130func parseOrder(p RuleProcessor, level int, s string) (tail string, err error) {
131	var value, context, extend string
132	if value, s, err = scanString(s); err != nil {
133		return s, err
134	}
135	if strings.HasPrefix(value, cldrIndex) {
136		p.Index(value[len(cldrIndex):])
137		return
138	}
139	if consume(&s, '|') {
140		if context, s, err = scanString(s); err != nil {
141			return s, errors.New("missing string after context")
142		}
143	}
144	if consume(&s, '/') {
145		if extend, s, err = scanString(s); err != nil {
146			return s, errors.New("missing string after extension")
147		}
148	}
149	return s, p.Insert(level, value, context, extend)
150}
151
152// scanString scans a single input string.
153func scanString(s string) (str, tail string, err error) {
154	if s = skipSpace(s); s == "" {
155		return s, s, errors.New("missing string")
156	}
157	buf := [16]byte{} // small but enough to hold most cases.
158	value := buf[:0]
159	for s != "" {
160		if consume(&s, '\'') {
161			i := strings.IndexByte(s, '\'')
162			if i == -1 {
163				return "", "", errors.New(`unmatched single quote`)
164			}
165			if i == 0 {
166				value = append(value, '\'')
167			} else {
168				value = append(value, s[:i]...)
169			}
170			s = s[i+1:]
171			continue
172		}
173		r, sz := utf8.DecodeRuneInString(s)
174		if unicode.IsSpace(r) || strings.ContainsRune("&<=#", r) {
175			break
176		}
177		value = append(value, s[:sz]...)
178		s = s[sz:]
179	}
180	return string(value), skipSpace(s), nil
181}
182
183func parseSequence(p RuleProcessor, level int, s string) (tail string, err error) {
184	if s = skipSpace(s); s == "" {
185		return s, errors.New("empty sequence")
186	}
187	last := rune(0)
188	for s != "" {
189		r, sz := utf8.DecodeRuneInString(s)
190		s = s[sz:]
191
192		if r == '-' {
193			// We have a range. The first element was already written.
194			if last == 0 {
195				return s, errors.New("range without starter value")
196			}
197			r, sz = utf8.DecodeRuneInString(s)
198			s = s[sz:]
199			if r == utf8.RuneError || r < last {
200				return s, fmt.Errorf("invalid range %q-%q", last, r)
201			}
202			for i := last + 1; i <= r; i++ {
203				if err := p.Insert(level, string(i), "", ""); err != nil {
204					return s, err
205				}
206			}
207			last = 0
208			continue
209		}
210
211		if unicode.IsSpace(r) || unicode.IsPunct(r) {
212			break
213		}
214
215		// normal case
216		if err := p.Insert(level, string(r), "", ""); err != nil {
217			return s, err
218		}
219		last = r
220	}
221	return s, nil
222}
223
// skipSpace returns s without its leading Unicode white space.
func skipSpace(s string) string {
	start := strings.IndexFunc(s, func(r rune) bool { return !unicode.IsSpace(r) })
	if start < 0 {
		// Nothing but white space (or nothing at all).
		return ""
	}
	return s[start:]
}
227
// consume reports whether the next byte of *s is ch. If so, it gobbles
// that byte by advancing *s past it; otherwise *s is left unchanged.
func consume(s *string, ch byte) (ok bool) {
	if str := *s; len(str) > 0 && str[0] == ch {
		*s = str[1:]
		return true
	}
	return false
}
237
238// The following code parses Collation rules of CLDR version 24 and before.
239
// lmap maps the first letter of an XML rule tag to the level passed to
// RuleProcessor.Insert for that tag.
var lmap = map[byte]int{'p': 1, 's': 2, 't': 3, 'i': 5}
246
247type rulesElem struct {
248	Rules struct {
249		Common
250		Any []*struct {
251			XMLName xml.Name
252			rule
253		} `xml:",any"`
254	} `xml:"rules"`
255}
256
// rule is the decoded form of a single XML collation rule element.
type rule struct {
	// Value is the character data of the element.
	Value string `xml:",chardata"`

	// Before is the value of the element's "before" attribute.
	Before string `xml:"before,attr"`

	// Any collects the child elements, each decoded as a nested rule
	// together with the name of the element it came from.
	Any []*struct {
		XMLName xml.Name
		rule
	} `xml:",any"`
}
265
// emptyValueError is returned by (*rule).value for a rule that has no
// character data and does not have exactly one child element.
var emptyValueError = errors.New("cldr: empty rule value")
267
268func (r *rule) value() (string, error) {
269	// Convert hexadecimal Unicode codepoint notation to a string.
270	s := charRe.ReplaceAllStringFunc(r.Value, replaceUnicode)
271	r.Value = s
272	if s == "" {
273		if len(r.Any) != 1 {
274			return "", emptyValueError
275		}
276		r.Value = fmt.Sprintf(specialAnchor, r.Any[0].XMLName.Local)
277		r.Any = nil
278	} else if len(r.Any) != 0 {
279		return "", fmt.Errorf("cldr: XML elements found in collation rule: %v", r.Any)
280	}
281	return r.Value, nil
282}
283
284func (r rule) process(p RuleProcessor, name, context, extend string) error {
285	v, err := r.value()
286	if err != nil {
287		return err
288	}
289	switch name {
290	case "p", "s", "t", "i":
291		if strings.HasPrefix(v, cldrIndex) {
292			p.Index(v[len(cldrIndex):])
293			return nil
294		}
295		if err := p.Insert(lmap[name[0]], v, context, extend); err != nil {
296			return err
297		}
298	case "pc", "sc", "tc", "ic":
299		level := lmap[name[0]]
300		for _, s := range v {
301			if err := p.Insert(level, string(s), context, extend); err != nil {
302				return err
303			}
304		}
305	default:
306		return fmt.Errorf("cldr: unsupported tag: %q", name)
307	}
308	return nil
309}
310
311// processXML parses the format of CLDR versions 24 and older.
312func (c Collation) processXML(p RuleProcessor) (err error) {
313	// Collation is generated and defined in xml.go.
314	var v string
315	for _, r := range c.Rules.Any {
316		switch r.XMLName.Local {
317		case "reset":
318			level := 0
319			switch r.Before {
320			case "primary", "1":
321				level = 1
322			case "secondary", "2":
323				level = 2
324			case "tertiary", "3":
325				level = 3
326			case "":
327			default:
328				return fmt.Errorf("cldr: unknown level %q", r.Before)
329			}
330			v, err = r.value()
331			if err == nil {
332				err = p.Reset(v, level)
333			}
334		case "x":
335			var context, extend string
336			for _, r1 := range r.Any {
337				v, err = r1.value()
338				switch r1.XMLName.Local {
339				case "context":
340					context = v
341				case "extend":
342					extend = v
343				}
344			}
345			for _, r1 := range r.Any {
346				if t := r1.XMLName.Local; t == "context" || t == "extend" {
347					continue
348				}
349				r1.rule.process(p, r1.XMLName.Local, context, extend)
350			}
351		default:
352			err = r.rule.process(p, r.XMLName.Local, "", "")
353		}
354		if err != nil {
355			return err
356		}
357	}
358	return nil
359}
360