1// Copyright 2017 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5// Package cldrtree builds and generates a CLDR index file, including all 6// inheritance. 7// 8package cldrtree 9 10//go:generate go test -gen 11 12// cldrtree stores CLDR data in a tree-like structure called Tree. In the CLDR 13// data each branch in the tree is indicated by either an element name or an 14// attribute value. A Tree does not distinguish between these two cases, but 15// rather assumes that all branches can be accessed by an enum with a compact 16// range of positive integer values starting from 0. 17// 18// Each Tree consists of three parts: 19// - a slice mapping compact language identifiers to an offset into a set of 20// indices, 21// - a set of indices, stored as a large blob of uint16 values that encode 22// the actual tree structure of data, and 23// - a set of buckets that each holds a collection of strings. 24// each of which is explained in more detail below. 25// 26// 27// Tree lookup 28// A tree lookup is done by providing a locale and a "path", which is a 29// sequence of enum values. The search starts with getting the index for the 30// given locale and then incrementally jumping into the index using the path 31// values. If an element cannot be found in the index, the search starts anew 32// for the locale's parent locale. The path may change during lookup by means 33// of aliasing, described below. 34// 35// Buckets 36// Buckets hold the actual string data of the leaf values of the CLDR tree. 37// This data is stored in buckets, rather than one large string, for multiple 38// reasons: 39// - it allows representing leaf values more compactly, by storing all leaf 40// values in a single bucket and then needing only needing a uint16 to index 41// into this bucket for all leaf values, 42// - (TBD) allow multiple trees to share subsets of buckets, mostly to allow 43// linking in a smaller amount of data if only a subset of the buckets is 44// needed, 45// - to be nice to go fmt and the compiler. 46// 47// indices 48// An index is a slice of uint16 for which the values are interpreted in one of 49// two ways: as a node or a set of leaf values. 50// A set of leaf values has the following form: 51// <max_size>, <bucket>, <offset>... 52// max_size indicates the maximum enum value for which an offset is defined. 53// An offset value of 0xFFFF (missingValue) also indicates an undefined value. 54// If defined offset indicates the offset within the given bucket of the string. 55// A node value has the following form: 56// <max_size>, <offset_or_alias>... 57// max_size indicates the maximum value for which an offset is defined. 58// A missing offset may also be indicated with 0. If the high bit (0x8000, or 59// inheritMask) is not set, the offset points to the offset within the index 60// for the current locale. 61// An offset with high bit set is an alias. In this case the uint16 has the form 62// bits: 63// 15: 1 64// 14-12: negative offset into path relative to current position 65// 0-11: new enum value for path element. 66// On encountering an alias, the path is modified accordingly and the lookup is 67// restarted for the given locale. 68 69import ( 70 "fmt" 71 "reflect" 72 "regexp" 73 "strings" 74 "unicode/utf8" 75 76 "golang.org/x/text/internal/gen" 77 "golang.org/x/text/language" 78 "golang.org/x/text/unicode/cldr" 79) 80 81// TODO: 82// - allow two Trees to share the same set of buckets. 83 84// A Builder allows storing CLDR data in compact form. 85type Builder struct { 86 table []string 87 88 rootMeta *metaData 89 locales []locale 90 strToBucket map[string]stringInfo 91 buckets [][]byte 92 enums []*enum 93 err error 94 95 // Stats 96 size int 97 sizeAll int 98 bucketWaste int 99} 100 101const ( 102 maxBucketSize = 8 * 1024 // 8K 103 maxStrlen = 254 // allow 0xFF sentinel 104) 105 106func (b *Builder) setError(err error) { 107 if b.err == nil { 108 b.err = err 109 } 110} 111 112func (b *Builder) addString(data string) stringInfo { 113 data = b.makeString(data) 114 info, ok := b.strToBucket[data] 115 if !ok { 116 b.size += len(data) 117 x := len(b.buckets) - 1 118 bucket := b.buckets[x] 119 if len(bucket)+len(data) < maxBucketSize { 120 info.bucket = uint16(x) 121 info.bucketPos = uint16(len(bucket)) 122 b.buckets[x] = append(bucket, data...) 123 } else { 124 info.bucket = uint16(len(b.buckets)) 125 info.bucketPos = 0 126 b.buckets = append(b.buckets, []byte(data)) 127 } 128 b.strToBucket[data] = info 129 } 130 return info 131} 132 133func (b *Builder) addStringToBucket(data string, bucket uint16) stringInfo { 134 data = b.makeString(data) 135 info, ok := b.strToBucket[data] 136 if !ok || info.bucket != bucket { 137 if ok { 138 b.bucketWaste += len(data) 139 } 140 b.size += len(data) 141 bk := b.buckets[bucket] 142 info.bucket = bucket 143 info.bucketPos = uint16(len(bk)) 144 b.buckets[bucket] = append(bk, data...) 145 b.strToBucket[data] = info 146 } 147 return info 148} 149 150func (b *Builder) makeString(data string) string { 151 if len(data) > maxStrlen { 152 b.setError(fmt.Errorf("string %q exceeds maximum length of %d", data, maxStrlen)) 153 data = data[:maxStrlen] 154 for i := len(data) - 1; i > len(data)-4; i-- { 155 if utf8.RuneStart(data[i]) { 156 data = data[:i] 157 break 158 } 159 } 160 } 161 data = string([]byte{byte(len(data))}) + data 162 b.sizeAll += len(data) 163 return data 164} 165 166type stringInfo struct { 167 bufferPos uint32 168 bucket uint16 169 bucketPos uint16 170} 171 172// New creates a new Builder. 173func New(tableName string) *Builder { 174 b := &Builder{ 175 strToBucket: map[string]stringInfo{}, 176 buckets: [][]byte{nil}, // initialize with first bucket. 177 } 178 b.rootMeta = &metaData{ 179 b: b, 180 typeInfo: &typeInfo{}, 181 } 182 return b 183} 184 185// Gen writes all the tables and types for the collected data. 186func (b *Builder) Gen(w *gen.CodeWriter) error { 187 t, err := build(b) 188 if err != nil { 189 return err 190 } 191 return generate(b, t, w) 192} 193 194// GenTestData generates tables useful for testing data generated with Gen. 195func (b *Builder) GenTestData(w *gen.CodeWriter) error { 196 return generateTestData(b, w) 197} 198 199type locale struct { 200 tag language.Tag 201 root *Index 202} 203 204// Locale creates an index for the given locale. 205func (b *Builder) Locale(t language.Tag) *Index { 206 index := &Index{ 207 meta: b.rootMeta, 208 } 209 b.locales = append(b.locales, locale{tag: t, root: index}) 210 return index 211} 212 213// An Index holds a map of either leaf values or other indices. 214type Index struct { 215 meta *metaData 216 217 subIndex []*Index 218 values []keyValue 219} 220 221func (i *Index) setError(err error) { i.meta.b.setError(err) } 222 223type keyValue struct { 224 key enumIndex 225 value stringInfo 226} 227 228// Element is a CLDR XML element. 229type Element interface { 230 GetCommon() *cldr.Common 231} 232 233// Index creates a subindex where the type and enum values are not shared 234// with siblings by default. The name is derived from the elem. If elem is 235// an alias reference, the alias will be resolved and linked. If elem is nil 236// Index returns nil. 237func (i *Index) Index(elem Element, opt ...Option) *Index { 238 if elem == nil || reflect.ValueOf(elem).IsNil() { 239 return nil 240 } 241 c := elem.GetCommon() 242 o := &options{ 243 parent: i, 244 name: c.GetCommon().Element(), 245 } 246 o.fill(opt) 247 o.setAlias(elem) 248 return i.subIndexForKey(o) 249} 250 251// IndexWithName is like Section but derives the name from the given name. 252func (i *Index) IndexWithName(name string, opt ...Option) *Index { 253 o := &options{parent: i, name: name} 254 o.fill(opt) 255 return i.subIndexForKey(o) 256} 257 258// IndexFromType creates a subindex the value of tye type attribute as key. It 259// will also configure the Index to share the enumeration values with all 260// sibling values. If elem is an alias, it will be resolved and linked. 261func (i *Index) IndexFromType(elem Element, opts ...Option) *Index { 262 o := &options{ 263 parent: i, 264 name: elem.GetCommon().Type, 265 } 266 o.fill(opts) 267 o.setAlias(elem) 268 useSharedType()(o) 269 return i.subIndexForKey(o) 270} 271 272// IndexFromAlt creates a subindex the value of tye alt attribute as key. It 273// will also configure the Index to share the enumeration values with all 274// sibling values. If elem is an alias, it will be resolved and linked. 275func (i *Index) IndexFromAlt(elem Element, opts ...Option) *Index { 276 o := &options{ 277 parent: i, 278 name: elem.GetCommon().Alt, 279 } 280 o.fill(opts) 281 o.setAlias(elem) 282 useSharedType()(o) 283 return i.subIndexForKey(o) 284} 285 286func (i *Index) subIndexForKey(opts *options) *Index { 287 key := opts.name 288 if len(i.values) > 0 { 289 panic(fmt.Errorf("cldrtree: adding Index for %q when value already exists", key)) 290 } 291 meta := i.meta.sub(key, opts) 292 for _, x := range i.subIndex { 293 if x.meta == meta { 294 return x 295 } 296 } 297 if alias := opts.alias; alias != nil { 298 if a := alias.GetCommon().Alias; a != nil { 299 if a.Source != "locale" { 300 i.setError(fmt.Errorf("cldrtree: non-locale alias not supported %v", a.Path)) 301 } 302 if meta.inheritOffset < 0 { 303 i.setError(fmt.Errorf("cldrtree: alias was already set %v", a.Path)) 304 } 305 path := a.Path 306 for ; strings.HasPrefix(path, "../"); path = path[len("../"):] { 307 meta.inheritOffset-- 308 } 309 m := aliasRe.FindStringSubmatch(path) 310 if m == nil { 311 i.setError(fmt.Errorf("cldrtree: could not parse alias %q", a.Path)) 312 } else { 313 key := m[4] 314 if key == "" { 315 key = m[1] 316 } 317 meta.inheritIndex = key 318 } 319 } 320 } 321 x := &Index{meta: meta} 322 i.subIndex = append(i.subIndex, x) 323 return x 324} 325 326var aliasRe = regexp.MustCompile(`^([a-zA-Z]+)(\[@([a-zA-Z-]+)='([a-zA-Z-]+)'\])?`) 327 328// SetValue sets the value, the data from a CLDR XML element, for the given key. 329func (i *Index) SetValue(key string, value Element, opt ...Option) { 330 if len(i.subIndex) > 0 { 331 panic(fmt.Errorf("adding value for key %q when index already exists", key)) 332 } 333 o := &options{parent: i} 334 o.fill(opt) 335 c := value.GetCommon() 336 if c.Alias != nil { 337 i.setError(fmt.Errorf("cldrtree: alias not supported for SetValue %v", c.Alias.Path)) 338 } 339 i.setValue(key, c.Data(), o) 340} 341 342func (i *Index) setValue(key, data string, o *options) { 343 index, _ := i.meta.typeInfo.lookupSubtype(key, o) 344 kv := keyValue{key: index} 345 if len(i.values) > 0 { 346 // Add string to the same bucket as the other values. 347 bucket := i.values[0].value.bucket 348 kv.value = i.meta.b.addStringToBucket(data, bucket) 349 } else { 350 kv.value = i.meta.b.addString(data) 351 } 352 i.values = append(i.values, kv) 353} 354