1// Copyright 2016 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5// +build ignore 6 7package main 8 9// This file generates data for the CLDR plural rules, as defined in 10// https://unicode.org/reports/tr35/tr35-numbers.html#Language_Plural_Rules 11// 12// We assume a slightly simplified grammar: 13// 14// condition = and_condition ('or' and_condition)* samples 15// and_condition = relation ('and' relation)* 16// relation = expr ('=' | '!=') range_list 17// expr = operand ('%' '10' '0'* )? 18// operand = 'n' | 'i' | 'f' | 't' | 'v' | 'w' 19// range_list = (range | value) (',' range_list)* 20// range = value'..'value 21// value = digit+ 22// digit = 0|1|2|3|4|5|6|7|8|9 23// 24// samples = ('@integer' sampleList)? 25// ('@decimal' sampleList)? 26// sampleList = sampleRange (',' sampleRange)* (',' ('…'|'...'))? 27// sampleRange = decimalValue ('~' decimalValue)? 28// decimalValue = value ('.' value)? 29// 30// Symbol Value 31// n absolute value of the source number (integer and decimals). 32// i integer digits of n. 33// v number of visible fraction digits in n, with trailing zeros. 34// w number of visible fraction digits in n, without trailing zeros. 35// f visible fractional digits in n, with trailing zeros. 36// t visible fractional digits in n, without trailing zeros. 37// 38// The algorithm for which the data is generated is based on the following 39// observations 40// 41// - the number of different sets of numbers which the plural rules use to 42// test inclusion is limited, 43// - most numbers that are tested on are < 100 44// 45// This allows us to define a bitmap for each number < 100 where a bit i 46// indicates whether this number is included in some defined set i. 47// The function matchPlural in plural.go defines how we can subsequently use 48// this data to determine inclusion. 49// 50// There are a few languages for which this doesn't work. For one Italian and 51// Azerbaijan, which both test against numbers > 100 for ordinals and Breton, 52// which considers whether numbers are multiples of hundreds. The model here 53// could be extended to handle Italian and Azerbaijan fairly easily (by 54// considering the numbers 100, 200, 300, ..., 800, 900 in addition to the first 55// 100), but for now it seems easier to just hard-code these cases. 56 57import ( 58 "bufio" 59 "bytes" 60 "flag" 61 "fmt" 62 "log" 63 "strconv" 64 "strings" 65 66 "golang.org/x/text/internal/gen" 67 "golang.org/x/text/internal/language" 68 "golang.org/x/text/internal/language/compact" 69 "golang.org/x/text/unicode/cldr" 70) 71 72var ( 73 test = flag.Bool("test", false, 74 "test existing tables; can be used to compare web data with package data.") 75 outputFile = flag.String("output", "tables.go", "output file") 76 outputTestFile = flag.String("testoutput", "data_test.go", "output file") 77 78 draft = flag.String("draft", 79 "contributed", 80 `Minimal draft requirements (approved, contributed, provisional, unconfirmed).`) 81) 82 83func main() { 84 gen.Init() 85 86 const pkg = "plural" 87 88 gen.Repackage("gen_common.go", "common.go", pkg) 89 // Read the CLDR zip file. 90 r := gen.OpenCLDRCoreZip() 91 defer r.Close() 92 93 d := &cldr.Decoder{} 94 d.SetDirFilter("supplemental", "main") 95 d.SetSectionFilter("numbers", "plurals") 96 data, err := d.DecodeZip(r) 97 if err != nil { 98 log.Fatalf("DecodeZip: %v", err) 99 } 100 101 w := gen.NewCodeWriter() 102 defer w.WriteGoFile(*outputFile, pkg) 103 104 gen.WriteCLDRVersion(w) 105 106 genPlurals(w, data) 107 108 w = gen.NewCodeWriter() 109 defer w.WriteGoFile(*outputTestFile, pkg) 110 111 genPluralsTests(w, data) 112} 113 114type pluralTest struct { 115 locales string // space-separated list of locales for this test 116 form int // Use int instead of Form to simplify generation. 117 integer []string // Entries of the form \d+ or \d+~\d+ 118 decimal []string // Entries of the form \f+ or \f+ +~\f+, where f is \d+\.\d+ 119} 120 121func genPluralsTests(w *gen.CodeWriter, data *cldr.CLDR) { 122 w.WriteType(pluralTest{}) 123 124 for _, plurals := range data.Supplemental().Plurals { 125 if plurals.Type == "" { 126 // The empty type is reserved for plural ranges. 127 continue 128 } 129 tests := []pluralTest{} 130 131 for _, pRules := range plurals.PluralRules { 132 for _, rule := range pRules.PluralRule { 133 test := pluralTest{ 134 locales: pRules.Locales, 135 form: int(countMap[rule.Count]), 136 } 137 scan := bufio.NewScanner(strings.NewReader(rule.Data())) 138 scan.Split(splitTokens) 139 var p *[]string 140 for scan.Scan() { 141 switch t := scan.Text(); t { 142 case "@integer": 143 p = &test.integer 144 case "@decimal": 145 p = &test.decimal 146 case ",", "…": 147 default: 148 if p != nil { 149 *p = append(*p, t) 150 } 151 } 152 } 153 tests = append(tests, test) 154 } 155 } 156 w.WriteVar(plurals.Type+"Tests", tests) 157 } 158} 159 160func genPlurals(w *gen.CodeWriter, data *cldr.CLDR) { 161 for _, plurals := range data.Supplemental().Plurals { 162 if plurals.Type == "" { 163 continue 164 } 165 // Initialize setMap and inclusionMasks. They are already populated with 166 // a few entries to serve as an example and to assign nice numbers to 167 // common cases. 168 169 // setMap contains sets of numbers represented by boolean arrays where 170 // a true value for element i means that the number i is included. 171 setMap := map[[numN]bool]int{ 172 // The above init func adds an entry for including all numbers. 173 [numN]bool{1: true}: 1, // fix {1} to a nice value 174 [numN]bool{2: true}: 2, // fix {2} to a nice value 175 [numN]bool{0: true}: 3, // fix {0} to a nice value 176 } 177 178 // inclusionMasks contains bit masks for every number under numN to 179 // indicate in which set the number is included. Bit 1 << x will be set 180 // if it is included in set x. 181 inclusionMasks := [numN]uint64{ 182 // Note: these entries are not complete: more bits will be set along the way. 183 0: 1 << 3, 184 1: 1 << 1, 185 2: 1 << 2, 186 } 187 188 // Create set {0..99}. We will assign this set the identifier 0. 189 var all [numN]bool 190 for i := range all { 191 // Mark number i as being included in the set (which has identifier 0). 192 inclusionMasks[i] |= 1 << 0 193 // Mark number i as included in the set. 194 all[i] = true 195 } 196 // Register the identifier for the set. 197 setMap[all] = 0 198 199 rules := []pluralCheck{} 200 index := []byte{0} 201 langMap := map[compact.ID]byte{0: 0} 202 203 for _, pRules := range plurals.PluralRules { 204 // Parse the rules. 205 var conds []orCondition 206 for _, rule := range pRules.PluralRule { 207 form := countMap[rule.Count] 208 conds = parsePluralCondition(conds, rule.Data(), form) 209 } 210 // Encode the rules. 211 for _, c := range conds { 212 // If an or condition only has filters, we create an entry for 213 // this filter and the set that contains all values. 214 empty := true 215 for _, b := range c.used { 216 empty = empty && !b 217 } 218 if empty { 219 rules = append(rules, pluralCheck{ 220 cat: byte(opMod<<opShift) | byte(c.form), 221 setID: 0, // all values 222 }) 223 continue 224 } 225 // We have some entries with values. 226 for i, set := range c.set { 227 if !c.used[i] { 228 continue 229 } 230 index, ok := setMap[set] 231 if !ok { 232 index = len(setMap) 233 setMap[set] = index 234 for i := range inclusionMasks { 235 if set[i] { 236 inclusionMasks[i] |= 1 << uint64(index) 237 } 238 } 239 } 240 rules = append(rules, pluralCheck{ 241 cat: byte(i<<opShift | andNext), 242 setID: byte(index), 243 }) 244 } 245 // Now set the last entry to the plural form the rule matches. 246 rules[len(rules)-1].cat &^= formMask 247 rules[len(rules)-1].cat |= byte(c.form) 248 } 249 // Point the relevant locales to the created entries. 250 for _, loc := range strings.Split(pRules.Locales, " ") { 251 if strings.TrimSpace(loc) == "" { 252 continue 253 } 254 lang, ok := compact.FromTag(language.MustParse(loc)) 255 if !ok { 256 log.Printf("No compact index for locale %q", loc) 257 } 258 langMap[lang] = byte(len(index) - 1) 259 } 260 index = append(index, byte(len(rules))) 261 } 262 w.WriteVar(plurals.Type+"Rules", rules) 263 w.WriteVar(plurals.Type+"Index", index) 264 // Expand the values: first by using the parent relationship. 265 langToIndex := make([]byte, compact.NumCompactTags) 266 for i := range langToIndex { 267 for p := compact.ID(i); ; p = p.Parent() { 268 if x, ok := langMap[p]; ok { 269 langToIndex[i] = x 270 break 271 } 272 } 273 } 274 // Now expand by including entries with identical languages for which 275 // one isn't set. 276 for i, v := range langToIndex { 277 if v == 0 { 278 id, _ := compact.FromTag(language.Tag{ 279 LangID: compact.ID(i).Tag().LangID, 280 }) 281 if p := langToIndex[id]; p != 0 { 282 langToIndex[i] = p 283 } 284 } 285 } 286 w.WriteVar(plurals.Type+"LangToIndex", langToIndex) 287 // Need to convert array to slice because of golang.org/issue/7651. 288 // This will allow tables to be dropped when unused. This is especially 289 // relevant for the ordinal data, which I suspect won't be used as much. 290 w.WriteVar(plurals.Type+"InclusionMasks", inclusionMasks[:]) 291 292 if len(rules) > 0xFF { 293 log.Fatalf("Too many entries for rules: %#x", len(rules)) 294 } 295 if len(index) > 0xFF { 296 log.Fatalf("Too many entries for index: %#x", len(index)) 297 } 298 if len(setMap) > 64 { // maximum number of bits. 299 log.Fatalf("Too many entries for setMap: %d", len(setMap)) 300 } 301 w.WriteComment( 302 "Slots used for %s: %X of 0xFF rules; %X of 0xFF indexes; %d of 64 sets", 303 plurals.Type, len(rules), len(index), len(setMap)) 304 // Prevent comment from attaching to the next entry. 305 fmt.Fprint(w, "\n\n") 306 } 307} 308 309type orCondition struct { 310 original string // for debugging 311 312 form Form 313 used [32]bool 314 set [32][numN]bool 315} 316 317func (o *orCondition) add(op opID, mod int, v []int) (ok bool) { 318 ok = true 319 for _, x := range v { 320 if x >= maxMod { 321 ok = false 322 break 323 } 324 } 325 for i := 0; i < numN; i++ { 326 m := i 327 if mod != 0 { 328 m = i % mod 329 } 330 if !intIn(m, v) { 331 o.set[op][i] = false 332 } 333 } 334 if ok { 335 o.used[op] = true 336 } 337 return ok 338} 339 340func intIn(x int, a []int) bool { 341 for _, y := range a { 342 if x == y { 343 return true 344 } 345 } 346 return false 347} 348 349var operandIndex = map[string]opID{ 350 "i": opI, 351 "n": opN, 352 "f": opF, 353 "v": opV, 354 "w": opW, 355} 356 357// parsePluralCondition parses the condition of a single pluralRule and appends 358// the resulting or conditions to conds. 359// 360// Example rules: 361// // Category "one" in English: only allow 1 with no visible fraction 362// i = 1 and v = 0 @integer 1 363// 364// // Category "few" in Czech: all numbers with visible fractions 365// v != 0 @decimal ... 366// 367// // Category "zero" in Latvian: all multiples of 10 or the numbers 11-19 or 368// // numbers with a fraction 11..19 and no trailing zeros. 369// n % 10 = 0 or n % 100 = 11..19 or v = 2 and f % 100 = 11..19 @integer ... 370// 371// @integer and @decimal are followed by examples and are not relevant for the 372// rule itself. The are used here to signal the termination of the rule. 373func parsePluralCondition(conds []orCondition, s string, f Form) []orCondition { 374 scan := bufio.NewScanner(strings.NewReader(s)) 375 scan.Split(splitTokens) 376 for { 377 cond := orCondition{original: s, form: f} 378 // Set all numbers to be allowed for all number classes and restrict 379 // from here on. 380 for i := range cond.set { 381 for j := range cond.set[i] { 382 cond.set[i][j] = true 383 } 384 } 385 andLoop: 386 for { 387 var token string 388 scan.Scan() // Must exist. 389 switch class := scan.Text(); class { 390 case "t": 391 class = "w" // equal to w for t == 0 392 fallthrough 393 case "n", "i", "f", "v", "w": 394 op := scanToken(scan) 395 opCode := operandIndex[class] 396 mod := 0 397 if op == "%" { 398 opCode |= opMod 399 400 switch v := scanUint(scan); v { 401 case 10, 100: 402 mod = v 403 case 1000: 404 // A more general solution would be to allow checking 405 // against multiples of 100 and include entries for the 406 // numbers 100..900 in the inclusion masks. At the 407 // moment this would only help Azerbaijan and Italian. 408 409 // Italian doesn't use '%', so this must be Azerbaijan. 410 cond.used[opAzerbaijan00s] = true 411 return append(conds, cond) 412 413 case 1000000: 414 cond.used[opBretonM] = true 415 return append(conds, cond) 416 417 default: 418 log.Fatalf("Modulo value not supported %d", v) 419 } 420 op = scanToken(scan) 421 } 422 if op != "=" && op != "!=" { 423 log.Fatalf("Unexpected op %q", op) 424 } 425 if op == "!=" { 426 opCode |= opNotEqual 427 } 428 a := []int{} 429 v := scanUint(scan) 430 if class == "w" && v != 0 { 431 log.Fatalf("Must compare against zero for operand type %q", class) 432 } 433 token = scanToken(scan) 434 for { 435 switch token { 436 case "..": 437 end := scanUint(scan) 438 for ; v <= end; v++ { 439 a = append(a, v) 440 } 441 token = scanToken(scan) 442 default: // ",", "or", "and", "@..." 443 a = append(a, v) 444 } 445 if token != "," { 446 break 447 } 448 v = scanUint(scan) 449 token = scanToken(scan) 450 } 451 if !cond.add(opCode, mod, a) { 452 // Detected large numbers. As we ruled out Azerbaijan, this 453 // must be the many rule for Italian ordinals. 454 cond.set[opItalian800] = cond.set[opN] 455 cond.used[opItalian800] = true 456 } 457 458 case "@integer", "@decimal": // "other" entry: tests only. 459 return conds 460 default: 461 log.Fatalf("Unexpected operand class %q (%s)", class, s) 462 } 463 switch token { 464 case "or": 465 conds = append(conds, cond) 466 break andLoop 467 case "@integer", "@decimal": // examples 468 // There is always an example in practice, so we always terminate here. 469 if err := scan.Err(); err != nil { 470 log.Fatal(err) 471 } 472 return append(conds, cond) 473 case "and": 474 // keep accumulating 475 default: 476 log.Fatalf("Unexpected token %q", token) 477 } 478 } 479 } 480} 481 482func scanToken(scan *bufio.Scanner) string { 483 scan.Scan() 484 return scan.Text() 485} 486 487func scanUint(scan *bufio.Scanner) int { 488 scan.Scan() 489 val, err := strconv.ParseUint(scan.Text(), 10, 32) 490 if err != nil { 491 log.Fatal(err) 492 } 493 return int(val) 494} 495 496// splitTokens can be used with bufio.Scanner to tokenize CLDR plural rules. 497func splitTokens(data []byte, atEOF bool) (advance int, token []byte, err error) { 498 condTokens := [][]byte{ 499 []byte(".."), 500 []byte(","), 501 []byte("!="), 502 []byte("="), 503 } 504 advance, token, err = bufio.ScanWords(data, atEOF) 505 for _, t := range condTokens { 506 if len(t) >= len(token) { 507 continue 508 } 509 switch p := bytes.Index(token, t); { 510 case p == -1: 511 case p == 0: 512 advance = len(t) 513 token = token[:len(t)] 514 return advance - len(token) + len(t), token[:len(t)], err 515 case p < advance: 516 // Don't split when "=" overlaps "!=". 517 if t[0] == '=' && token[p-1] == '!' { 518 continue 519 } 520 advance = p 521 token = token[:p] 522 } 523 } 524 return advance, token, err 525} 526