1// Copyright 2016 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5//go:build ignore 6// +build ignore 7 8package main 9 10// This file generates data for the CLDR plural rules, as defined in 11// https://unicode.org/reports/tr35/tr35-numbers.html#Language_Plural_Rules 12// 13// We assume a slightly simplified grammar: 14// 15// condition = and_condition ('or' and_condition)* samples 16// and_condition = relation ('and' relation)* 17// relation = expr ('=' | '!=') range_list 18// expr = operand ('%' '10' '0'* )? 19// operand = 'n' | 'i' | 'f' | 't' | 'v' | 'w' 20// range_list = (range | value) (',' range_list)* 21// range = value'..'value 22// value = digit+ 23// digit = 0|1|2|3|4|5|6|7|8|9 24// 25// samples = ('@integer' sampleList)? 26// ('@decimal' sampleList)? 27// sampleList = sampleRange (',' sampleRange)* (',' ('…'|'...'))? 28// sampleRange = decimalValue ('~' decimalValue)? 29// decimalValue = value ('.' value)? 30// 31// Symbol Value 32// n absolute value of the source number (integer and decimals). 33// i integer digits of n. 34// v number of visible fraction digits in n, with trailing zeros. 35// w number of visible fraction digits in n, without trailing zeros. 36// f visible fractional digits in n, with trailing zeros. 37// t visible fractional digits in n, without trailing zeros. 38// 39// The algorithm for which the data is generated is based on the following 40// observations 41// 42// - the number of different sets of numbers which the plural rules use to 43// test inclusion is limited, 44// - most numbers that are tested on are < 100 45// 46// This allows us to define a bitmap for each number < 100 where a bit i 47// indicates whether this number is included in some defined set i. 48// The function matchPlural in plural.go defines how we can subsequently use 49// this data to determine inclusion. 50// 51// There are a few languages for which this doesn't work. For one Italian and 52// Azerbaijan, which both test against numbers > 100 for ordinals and Breton, 53// which considers whether numbers are multiples of hundreds. The model here 54// could be extended to handle Italian and Azerbaijan fairly easily (by 55// considering the numbers 100, 200, 300, ..., 800, 900 in addition to the first 56// 100), but for now it seems easier to just hard-code these cases. 57 58import ( 59 "bufio" 60 "bytes" 61 "flag" 62 "fmt" 63 "log" 64 "strconv" 65 "strings" 66 67 "golang.org/x/text/internal/gen" 68 "golang.org/x/text/internal/language" 69 "golang.org/x/text/internal/language/compact" 70 "golang.org/x/text/unicode/cldr" 71) 72 73var ( 74 test = flag.Bool("test", false, 75 "test existing tables; can be used to compare web data with package data.") 76 outputFile = flag.String("output", "tables.go", "output file") 77 outputTestFile = flag.String("testoutput", "data_test.go", "output file") 78 79 draft = flag.String("draft", 80 "contributed", 81 `Minimal draft requirements (approved, contributed, provisional, unconfirmed).`) 82) 83 84func main() { 85 gen.Init() 86 87 const pkg = "plural" 88 89 gen.Repackage("gen_common.go", "common.go", pkg) 90 // Read the CLDR zip file. 91 r := gen.OpenCLDRCoreZip() 92 defer r.Close() 93 94 d := &cldr.Decoder{} 95 d.SetDirFilter("supplemental", "main") 96 d.SetSectionFilter("numbers", "plurals") 97 data, err := d.DecodeZip(r) 98 if err != nil { 99 log.Fatalf("DecodeZip: %v", err) 100 } 101 102 w := gen.NewCodeWriter() 103 defer w.WriteGoFile(*outputFile, pkg) 104 105 gen.WriteCLDRVersion(w) 106 107 genPlurals(w, data) 108 109 w = gen.NewCodeWriter() 110 defer w.WriteGoFile(*outputTestFile, pkg) 111 112 genPluralsTests(w, data) 113} 114 115type pluralTest struct { 116 locales string // space-separated list of locales for this test 117 form int // Use int instead of Form to simplify generation. 118 integer []string // Entries of the form \d+ or \d+~\d+ 119 decimal []string // Entries of the form \f+ or \f+ +~\f+, where f is \d+\.\d+ 120} 121 122func genPluralsTests(w *gen.CodeWriter, data *cldr.CLDR) { 123 w.WriteType(pluralTest{}) 124 125 for _, plurals := range data.Supplemental().Plurals { 126 if plurals.Type == "" { 127 // The empty type is reserved for plural ranges. 128 continue 129 } 130 tests := []pluralTest{} 131 132 for _, pRules := range plurals.PluralRules { 133 for _, rule := range pRules.PluralRule { 134 test := pluralTest{ 135 locales: pRules.Locales, 136 form: int(countMap[rule.Count]), 137 } 138 scan := bufio.NewScanner(strings.NewReader(rule.Data())) 139 scan.Split(splitTokens) 140 var p *[]string 141 for scan.Scan() { 142 switch t := scan.Text(); t { 143 case "@integer": 144 p = &test.integer 145 case "@decimal": 146 p = &test.decimal 147 case ",", "…": 148 default: 149 if p != nil { 150 *p = append(*p, t) 151 } 152 } 153 } 154 tests = append(tests, test) 155 } 156 } 157 w.WriteVar(plurals.Type+"Tests", tests) 158 } 159} 160 161func genPlurals(w *gen.CodeWriter, data *cldr.CLDR) { 162 for _, plurals := range data.Supplemental().Plurals { 163 if plurals.Type == "" { 164 continue 165 } 166 // Initialize setMap and inclusionMasks. They are already populated with 167 // a few entries to serve as an example and to assign nice numbers to 168 // common cases. 169 170 // setMap contains sets of numbers represented by boolean arrays where 171 // a true value for element i means that the number i is included. 172 setMap := map[[numN]bool]int{ 173 // The above init func adds an entry for including all numbers. 174 [numN]bool{1: true}: 1, // fix {1} to a nice value 175 [numN]bool{2: true}: 2, // fix {2} to a nice value 176 [numN]bool{0: true}: 3, // fix {0} to a nice value 177 } 178 179 // inclusionMasks contains bit masks for every number under numN to 180 // indicate in which set the number is included. Bit 1 << x will be set 181 // if it is included in set x. 182 inclusionMasks := [numN]uint64{ 183 // Note: these entries are not complete: more bits will be set along the way. 184 0: 1 << 3, 185 1: 1 << 1, 186 2: 1 << 2, 187 } 188 189 // Create set {0..99}. We will assign this set the identifier 0. 190 var all [numN]bool 191 for i := range all { 192 // Mark number i as being included in the set (which has identifier 0). 193 inclusionMasks[i] |= 1 << 0 194 // Mark number i as included in the set. 195 all[i] = true 196 } 197 // Register the identifier for the set. 198 setMap[all] = 0 199 200 rules := []pluralCheck{} 201 index := []byte{0} 202 langMap := map[compact.ID]byte{0: 0} 203 204 for _, pRules := range plurals.PluralRules { 205 // Parse the rules. 206 var conds []orCondition 207 for _, rule := range pRules.PluralRule { 208 form := countMap[rule.Count] 209 conds = parsePluralCondition(conds, rule.Data(), form) 210 } 211 // Encode the rules. 212 for _, c := range conds { 213 // If an or condition only has filters, we create an entry for 214 // this filter and the set that contains all values. 215 empty := true 216 for _, b := range c.used { 217 empty = empty && !b 218 } 219 if empty { 220 rules = append(rules, pluralCheck{ 221 cat: byte(opMod<<opShift) | byte(c.form), 222 setID: 0, // all values 223 }) 224 continue 225 } 226 // We have some entries with values. 227 for i, set := range c.set { 228 if !c.used[i] { 229 continue 230 } 231 index, ok := setMap[set] 232 if !ok { 233 index = len(setMap) 234 setMap[set] = index 235 for i := range inclusionMasks { 236 if set[i] { 237 inclusionMasks[i] |= 1 << uint64(index) 238 } 239 } 240 } 241 rules = append(rules, pluralCheck{ 242 cat: byte(i<<opShift | andNext), 243 setID: byte(index), 244 }) 245 } 246 // Now set the last entry to the plural form the rule matches. 247 rules[len(rules)-1].cat &^= formMask 248 rules[len(rules)-1].cat |= byte(c.form) 249 } 250 // Point the relevant locales to the created entries. 251 for _, loc := range strings.Split(pRules.Locales, " ") { 252 if strings.TrimSpace(loc) == "" { 253 continue 254 } 255 lang, ok := compact.FromTag(language.MustParse(loc)) 256 if !ok { 257 log.Printf("No compact index for locale %q", loc) 258 } 259 langMap[lang] = byte(len(index) - 1) 260 } 261 index = append(index, byte(len(rules))) 262 } 263 w.WriteVar(plurals.Type+"Rules", rules) 264 w.WriteVar(plurals.Type+"Index", index) 265 // Expand the values: first by using the parent relationship. 266 langToIndex := make([]byte, compact.NumCompactTags) 267 for i := range langToIndex { 268 for p := compact.ID(i); ; p = p.Parent() { 269 if x, ok := langMap[p]; ok { 270 langToIndex[i] = x 271 break 272 } 273 } 274 } 275 // Now expand by including entries with identical languages for which 276 // one isn't set. 277 for i, v := range langToIndex { 278 if v == 0 { 279 id, _ := compact.FromTag(language.Tag{ 280 LangID: compact.ID(i).Tag().LangID, 281 }) 282 if p := langToIndex[id]; p != 0 { 283 langToIndex[i] = p 284 } 285 } 286 } 287 w.WriteVar(plurals.Type+"LangToIndex", langToIndex) 288 // Need to convert array to slice because of golang.org/issue/7651. 289 // This will allow tables to be dropped when unused. This is especially 290 // relevant for the ordinal data, which I suspect won't be used as much. 291 w.WriteVar(plurals.Type+"InclusionMasks", inclusionMasks[:]) 292 293 if len(rules) > 0xFF { 294 log.Fatalf("Too many entries for rules: %#x", len(rules)) 295 } 296 if len(index) > 0xFF { 297 log.Fatalf("Too many entries for index: %#x", len(index)) 298 } 299 if len(setMap) > 64 { // maximum number of bits. 300 log.Fatalf("Too many entries for setMap: %d", len(setMap)) 301 } 302 w.WriteComment( 303 "Slots used for %s: %X of 0xFF rules; %X of 0xFF indexes; %d of 64 sets", 304 plurals.Type, len(rules), len(index), len(setMap)) 305 // Prevent comment from attaching to the next entry. 306 fmt.Fprint(w, "\n\n") 307 } 308} 309 310type orCondition struct { 311 original string // for debugging 312 313 form Form 314 used [32]bool 315 set [32][numN]bool 316} 317 318func (o *orCondition) add(op opID, mod int, v []int) (ok bool) { 319 ok = true 320 for _, x := range v { 321 if x >= maxMod { 322 ok = false 323 break 324 } 325 } 326 for i := 0; i < numN; i++ { 327 m := i 328 if mod != 0 { 329 m = i % mod 330 } 331 if !intIn(m, v) { 332 o.set[op][i] = false 333 } 334 } 335 if ok { 336 o.used[op] = true 337 } 338 return ok 339} 340 341func intIn(x int, a []int) bool { 342 for _, y := range a { 343 if x == y { 344 return true 345 } 346 } 347 return false 348} 349 350var operandIndex = map[string]opID{ 351 "i": opI, 352 "n": opN, 353 "f": opF, 354 "v": opV, 355 "w": opW, 356} 357 358// parsePluralCondition parses the condition of a single pluralRule and appends 359// the resulting or conditions to conds. 360// 361// Example rules: 362// // Category "one" in English: only allow 1 with no visible fraction 363// i = 1 and v = 0 @integer 1 364// 365// // Category "few" in Czech: all numbers with visible fractions 366// v != 0 @decimal ... 367// 368// // Category "zero" in Latvian: all multiples of 10 or the numbers 11-19 or 369// // numbers with a fraction 11..19 and no trailing zeros. 370// n % 10 = 0 or n % 100 = 11..19 or v = 2 and f % 100 = 11..19 @integer ... 371// 372// @integer and @decimal are followed by examples and are not relevant for the 373// rule itself. The are used here to signal the termination of the rule. 374func parsePluralCondition(conds []orCondition, s string, f Form) []orCondition { 375 scan := bufio.NewScanner(strings.NewReader(s)) 376 scan.Split(splitTokens) 377 for { 378 cond := orCondition{original: s, form: f} 379 // Set all numbers to be allowed for all number classes and restrict 380 // from here on. 381 for i := range cond.set { 382 for j := range cond.set[i] { 383 cond.set[i][j] = true 384 } 385 } 386 andLoop: 387 for { 388 var token string 389 scan.Scan() // Must exist. 390 switch class := scan.Text(); class { 391 case "t": 392 class = "w" // equal to w for t == 0 393 fallthrough 394 case "n", "i", "f", "v", "w": 395 op := scanToken(scan) 396 opCode := operandIndex[class] 397 mod := 0 398 if op == "%" { 399 opCode |= opMod 400 401 switch v := scanUint(scan); v { 402 case 10, 100: 403 mod = v 404 case 1000: 405 // A more general solution would be to allow checking 406 // against multiples of 100 and include entries for the 407 // numbers 100..900 in the inclusion masks. At the 408 // moment this would only help Azerbaijan and Italian. 409 410 // Italian doesn't use '%', so this must be Azerbaijan. 411 cond.used[opAzerbaijan00s] = true 412 return append(conds, cond) 413 414 case 1000000: 415 cond.used[opBretonM] = true 416 return append(conds, cond) 417 418 default: 419 log.Fatalf("Modulo value not supported %d", v) 420 } 421 op = scanToken(scan) 422 } 423 if op != "=" && op != "!=" { 424 log.Fatalf("Unexpected op %q", op) 425 } 426 if op == "!=" { 427 opCode |= opNotEqual 428 } 429 a := []int{} 430 v := scanUint(scan) 431 if class == "w" && v != 0 { 432 log.Fatalf("Must compare against zero for operand type %q", class) 433 } 434 token = scanToken(scan) 435 for { 436 switch token { 437 case "..": 438 end := scanUint(scan) 439 for ; v <= end; v++ { 440 a = append(a, v) 441 } 442 token = scanToken(scan) 443 default: // ",", "or", "and", "@..." 444 a = append(a, v) 445 } 446 if token != "," { 447 break 448 } 449 v = scanUint(scan) 450 token = scanToken(scan) 451 } 452 if !cond.add(opCode, mod, a) { 453 // Detected large numbers. As we ruled out Azerbaijan, this 454 // must be the many rule for Italian ordinals. 455 cond.set[opItalian800] = cond.set[opN] 456 cond.used[opItalian800] = true 457 } 458 459 case "@integer", "@decimal": // "other" entry: tests only. 460 return conds 461 default: 462 log.Fatalf("Unexpected operand class %q (%s)", class, s) 463 } 464 switch token { 465 case "or": 466 conds = append(conds, cond) 467 break andLoop 468 case "@integer", "@decimal": // examples 469 // There is always an example in practice, so we always terminate here. 470 if err := scan.Err(); err != nil { 471 log.Fatal(err) 472 } 473 return append(conds, cond) 474 case "and": 475 // keep accumulating 476 default: 477 log.Fatalf("Unexpected token %q", token) 478 } 479 } 480 } 481} 482 483func scanToken(scan *bufio.Scanner) string { 484 scan.Scan() 485 return scan.Text() 486} 487 488func scanUint(scan *bufio.Scanner) int { 489 scan.Scan() 490 val, err := strconv.ParseUint(scan.Text(), 10, 32) 491 if err != nil { 492 log.Fatal(err) 493 } 494 return int(val) 495} 496 497// splitTokens can be used with bufio.Scanner to tokenize CLDR plural rules. 498func splitTokens(data []byte, atEOF bool) (advance int, token []byte, err error) { 499 condTokens := [][]byte{ 500 []byte(".."), 501 []byte(","), 502 []byte("!="), 503 []byte("="), 504 } 505 advance, token, err = bufio.ScanWords(data, atEOF) 506 for _, t := range condTokens { 507 if len(t) >= len(token) { 508 continue 509 } 510 switch p := bytes.Index(token, t); { 511 case p == -1: 512 case p == 0: 513 advance = len(t) 514 token = token[:len(t)] 515 return advance - len(token) + len(t), token[:len(t)], err 516 case p < advance: 517 // Don't split when "=" overlaps "!=". 518 if t[0] == '=' && token[p-1] == '!' { 519 continue 520 } 521 advance = p 522 token = token[:p] 523 } 524 } 525 return advance, token, err 526} 527