1// Copyright 2017 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// Package cldrtree builds and generates a CLDR index file, including all
6// inheritance.
7//
8package cldrtree
9
10//go:generate go test -gen
11
12// cldrtree stores CLDR data in a tree-like structure called Tree. In the CLDR
13// data each branch in the tree is indicated by either an element name or an
14// attribute value. A Tree does not distinguish between these two cases, but
15// rather assumes that all branches can be accessed by an enum with a compact
16// range of positive integer values starting from 0.
17//
18// Each Tree consists of three parts:
19//    - a slice mapping compact language identifiers to an offset into a set of
20//      indices,
21//    - a set of indices, stored as a large blob of uint16 values that encode
22//      the actual tree structure of data, and
23//    - a set of buckets that each holds a collection of strings.
24// each of which is explained in more detail below.
25//
26//
27// Tree lookup
28// A tree lookup is done by providing a locale and a "path", which is a
29// sequence of enum values. The search starts with getting the index for the
30// given locale and then incrementally jumping into the index using the path
31// values. If an element cannot be found in the index, the search starts anew
32// for the locale's parent locale. The path may change during lookup by means
33// of aliasing, described below.
34//
35// Buckets
36// Buckets hold the actual string data of the leaf values of the CLDR tree.
37// This data is stored in buckets, rather than one large string, for multiple
38// reasons:
39//   - it allows representing leaf values more compactly, by storing all leaf
40//     values in a single bucket and then needing only needing a uint16 to index
41//     into this bucket for all leaf values,
42//   - (TBD) allow multiple trees to share subsets of buckets, mostly to allow
43//     linking in a smaller amount of data if only a subset of the buckets is
44//     needed,
45//   - to be nice to go fmt and the compiler.
46//
47// indices
48// An index is a slice of uint16 for which the values are interpreted in one of
49// two ways: as a node or a set of leaf values.
50// A set of leaf values has the following form:
51//      <max_size>, <bucket>, <offset>...
52// max_size indicates the maximum enum value for which an offset is defined.
53// An offset value of 0xFFFF (missingValue) also indicates an undefined value.
54// If defined offset indicates the offset within the given bucket of the string.
55// A node value has the following form:
56//      <max_size>, <offset_or_alias>...
57// max_size indicates the maximum value for which an offset is defined.
58// A missing offset may also be indicated with 0. If the high bit (0x8000, or
59// inheritMask) is not set, the offset points to the offset within the index
60// for the current locale.
61// An offset with high bit set is an alias. In this case the uint16 has the form
62//       bits:
63//         15: 1
64//      14-12: negative offset into path relative to current position
65//       0-11: new enum value for path element.
66// On encountering an alias, the path is modified accordingly and the lookup is
67// restarted for the given locale.
68
69import (
70	"fmt"
71	"reflect"
72	"regexp"
73	"strings"
74	"unicode/utf8"
75
76	"golang.org/x/text/internal/gen"
77	"golang.org/x/text/language"
78	"golang.org/x/text/unicode/cldr"
79)
80
81// TODO:
82// - allow two Trees to share the same set of buckets.
83
84// A Builder allows storing CLDR data in compact form.
85type Builder struct {
86	table []string
87
88	rootMeta    *metaData
89	locales     []locale
90	strToBucket map[string]stringInfo
91	buckets     [][]byte
92	enums       []*enum
93	err         error
94
95	// Stats
96	size        int
97	sizeAll     int
98	bucketWaste int
99}
100
101const (
102	maxBucketSize = 8 * 1024 // 8K
103	maxStrlen     = 254      // allow 0xFF sentinel
104)
105
106func (b *Builder) setError(err error) {
107	if b.err == nil {
108		b.err = err
109	}
110}
111
112func (b *Builder) addString(data string) stringInfo {
113	data = b.makeString(data)
114	info, ok := b.strToBucket[data]
115	if !ok {
116		b.size += len(data)
117		x := len(b.buckets) - 1
118		bucket := b.buckets[x]
119		if len(bucket)+len(data) < maxBucketSize {
120			info.bucket = uint16(x)
121			info.bucketPos = uint16(len(bucket))
122			b.buckets[x] = append(bucket, data...)
123		} else {
124			info.bucket = uint16(len(b.buckets))
125			info.bucketPos = 0
126			b.buckets = append(b.buckets, []byte(data))
127		}
128		b.strToBucket[data] = info
129	}
130	return info
131}
132
133func (b *Builder) addStringToBucket(data string, bucket uint16) stringInfo {
134	data = b.makeString(data)
135	info, ok := b.strToBucket[data]
136	if !ok || info.bucket != bucket {
137		if ok {
138			b.bucketWaste += len(data)
139		}
140		b.size += len(data)
141		bk := b.buckets[bucket]
142		info.bucket = bucket
143		info.bucketPos = uint16(len(bk))
144		b.buckets[bucket] = append(bk, data...)
145		b.strToBucket[data] = info
146	}
147	return info
148}
149
150func (b *Builder) makeString(data string) string {
151	if len(data) > maxStrlen {
152		b.setError(fmt.Errorf("string %q exceeds maximum length of %d", data, maxStrlen))
153		data = data[:maxStrlen]
154		for i := len(data) - 1; i > len(data)-4; i-- {
155			if utf8.RuneStart(data[i]) {
156				data = data[:i]
157				break
158			}
159		}
160	}
161	data = string([]byte{byte(len(data))}) + data
162	b.sizeAll += len(data)
163	return data
164}
165
166type stringInfo struct {
167	bufferPos uint32
168	bucket    uint16
169	bucketPos uint16
170}
171
172// New creates a new Builder.
173func New(tableName string) *Builder {
174	b := &Builder{
175		strToBucket: map[string]stringInfo{},
176		buckets:     [][]byte{nil}, // initialize with first bucket.
177	}
178	b.rootMeta = &metaData{
179		b:        b,
180		typeInfo: &typeInfo{},
181	}
182	return b
183}
184
185// Gen writes all the tables and types for the collected data.
186func (b *Builder) Gen(w *gen.CodeWriter) error {
187	t, err := build(b)
188	if err != nil {
189		return err
190	}
191	return generate(b, t, w)
192}
193
194// GenTestData generates tables useful for testing data generated with Gen.
195func (b *Builder) GenTestData(w *gen.CodeWriter) error {
196	return generateTestData(b, w)
197}
198
199type locale struct {
200	tag  language.Tag
201	root *Index
202}
203
204// Locale creates an index for the given locale.
205func (b *Builder) Locale(t language.Tag) *Index {
206	index := &Index{
207		meta: b.rootMeta,
208	}
209	b.locales = append(b.locales, locale{tag: t, root: index})
210	return index
211}
212
213// An Index holds a map of either leaf values or other indices.
214type Index struct {
215	meta *metaData
216
217	subIndex []*Index
218	values   []keyValue
219}
220
221func (i *Index) setError(err error) { i.meta.b.setError(err) }
222
223type keyValue struct {
224	key   enumIndex
225	value stringInfo
226}
227
228// Element is a CLDR XML element.
229type Element interface {
230	GetCommon() *cldr.Common
231}
232
233// Index creates a subindex where the type and enum values are not shared
234// with siblings by default. The name is derived from the elem. If elem is
235// an alias reference, the alias will be resolved and linked. If elem is nil
236// Index returns nil.
237func (i *Index) Index(elem Element, opt ...Option) *Index {
238	if elem == nil || reflect.ValueOf(elem).IsNil() {
239		return nil
240	}
241	c := elem.GetCommon()
242	o := &options{
243		parent: i,
244		name:   c.GetCommon().Element(),
245	}
246	o.fill(opt)
247	o.setAlias(elem)
248	return i.subIndexForKey(o)
249}
250
251// IndexWithName is like Section but derives the name from the given name.
252func (i *Index) IndexWithName(name string, opt ...Option) *Index {
253	o := &options{parent: i, name: name}
254	o.fill(opt)
255	return i.subIndexForKey(o)
256}
257
258// IndexFromType creates a subindex the value of tye type attribute as key. It
259// will also configure the Index to share the enumeration values with all
260// sibling values. If elem is an alias, it will be resolved and linked.
261func (i *Index) IndexFromType(elem Element, opts ...Option) *Index {
262	o := &options{
263		parent: i,
264		name:   elem.GetCommon().Type,
265	}
266	o.fill(opts)
267	o.setAlias(elem)
268	useSharedType()(o)
269	return i.subIndexForKey(o)
270}
271
272// IndexFromAlt creates a subindex the value of tye alt attribute as key. It
273// will also configure the Index to share the enumeration values with all
274// sibling values. If elem is an alias, it will be resolved and linked.
275func (i *Index) IndexFromAlt(elem Element, opts ...Option) *Index {
276	o := &options{
277		parent: i,
278		name:   elem.GetCommon().Alt,
279	}
280	o.fill(opts)
281	o.setAlias(elem)
282	useSharedType()(o)
283	return i.subIndexForKey(o)
284}
285
286func (i *Index) subIndexForKey(opts *options) *Index {
287	key := opts.name
288	if len(i.values) > 0 {
289		panic(fmt.Errorf("cldrtree: adding Index for %q when value already exists", key))
290	}
291	meta := i.meta.sub(key, opts)
292	for _, x := range i.subIndex {
293		if x.meta == meta {
294			return x
295		}
296	}
297	if alias := opts.alias; alias != nil {
298		if a := alias.GetCommon().Alias; a != nil {
299			if a.Source != "locale" {
300				i.setError(fmt.Errorf("cldrtree: non-locale alias not supported %v", a.Path))
301			}
302			if meta.inheritOffset < 0 {
303				i.setError(fmt.Errorf("cldrtree: alias was already set %v", a.Path))
304			}
305			path := a.Path
306			for ; strings.HasPrefix(path, "../"); path = path[len("../"):] {
307				meta.inheritOffset--
308			}
309			m := aliasRe.FindStringSubmatch(path)
310			if m == nil {
311				i.setError(fmt.Errorf("cldrtree: could not parse alias %q", a.Path))
312			} else {
313				key := m[4]
314				if key == "" {
315					key = m[1]
316				}
317				meta.inheritIndex = key
318			}
319		}
320	}
321	x := &Index{meta: meta}
322	i.subIndex = append(i.subIndex, x)
323	return x
324}
325
326var aliasRe = regexp.MustCompile(`^([a-zA-Z]+)(\[@([a-zA-Z-]+)='([a-zA-Z-]+)'\])?`)
327
328// SetValue sets the value, the data from a CLDR XML element, for the given key.
329func (i *Index) SetValue(key string, value Element, opt ...Option) {
330	if len(i.subIndex) > 0 {
331		panic(fmt.Errorf("adding value for key %q when index already exists", key))
332	}
333	o := &options{parent: i}
334	o.fill(opt)
335	c := value.GetCommon()
336	if c.Alias != nil {
337		i.setError(fmt.Errorf("cldrtree: alias not supported for SetValue %v", c.Alias.Path))
338	}
339	i.setValue(key, c.Data(), o)
340}
341
342func (i *Index) setValue(key, data string, o *options) {
343	index, _ := i.meta.typeInfo.lookupSubtype(key, o)
344	kv := keyValue{key: index}
345	if len(i.values) > 0 {
346		// Add string to the same bucket as the other values.
347		bucket := i.values[0].value.bucket
348		kv.value = i.meta.b.addStringToBucket(data, bucket)
349	} else {
350		kv.value = i.meta.b.addString(data)
351	}
352	i.values = append(i.values, kv)
353}
354