1// Copyright 2016 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5//go:build ignore
6// +build ignore
7
8package main
9
10// This file generates data for the CLDR plural rules, as defined in
11//    https://unicode.org/reports/tr35/tr35-numbers.html#Language_Plural_Rules
12//
13// We assume a slightly simplified grammar:
14//
15// 		condition     = and_condition ('or' and_condition)* samples
16// 		and_condition = relation ('and' relation)*
17// 		relation      = expr ('=' | '!=') range_list
18// 		expr          = operand ('%' '10' '0'* )?
19// 		operand       = 'n' | 'i' | 'f' | 't' | 'v' | 'w'
20// 		range_list    = (range | value) (',' range_list)*
21// 		range         = value'..'value
22// 		value         = digit+
23// 		digit         = 0|1|2|3|4|5|6|7|8|9
24//
25// 		samples       = ('@integer' sampleList)?
26// 		                ('@decimal' sampleList)?
27// 		sampleList    = sampleRange (',' sampleRange)* (',' ('…'|'...'))?
28// 		sampleRange   = decimalValue ('~' decimalValue)?
29// 		decimalValue  = value ('.' value)?
30//
31//		Symbol	Value
32//		n	absolute value of the source number (integer and decimals).
33//		i	integer digits of n.
34//		v	number of visible fraction digits in n, with trailing zeros.
35//		w	number of visible fraction digits in n, without trailing zeros.
36//		f	visible fractional digits in n, with trailing zeros.
37//		t	visible fractional digits in n, without trailing zeros.
38//
39// The algorithm for which the data is generated is based on the following
40// observations
41//
42//    - the number of different sets of numbers which the plural rules use to
43//      test inclusion is limited,
44//    - most numbers that are tested on are < 100
45//
46// This allows us to define a bitmap for each number < 100 where a bit i
47// indicates whether this number is included in some defined set i.
48// The function matchPlural in plural.go defines how we can subsequently use
49// this data to determine inclusion.
50//
51// There are a few languages for which this doesn't work. For one Italian and
52// Azerbaijan, which both test against numbers > 100 for ordinals and Breton,
53// which considers whether numbers are multiples of hundreds. The model here
54// could be extended to handle Italian and Azerbaijan fairly easily (by
55// considering the numbers 100, 200, 300, ..., 800, 900 in addition to the first
56// 100), but for now it seems easier to just hard-code these cases.
57
58import (
59	"bufio"
60	"bytes"
61	"flag"
62	"fmt"
63	"log"
64	"strconv"
65	"strings"
66
67	"golang.org/x/text/internal/gen"
68	"golang.org/x/text/internal/language"
69	"golang.org/x/text/internal/language/compact"
70	"golang.org/x/text/unicode/cldr"
71)
72
73var (
74	test = flag.Bool("test", false,
75		"test existing tables; can be used to compare web data with package data.")
76	outputFile     = flag.String("output", "tables.go", "output file")
77	outputTestFile = flag.String("testoutput", "data_test.go", "output file")
78
79	draft = flag.String("draft",
80		"contributed",
81		`Minimal draft requirements (approved, contributed, provisional, unconfirmed).`)
82)
83
84func main() {
85	gen.Init()
86
87	const pkg = "plural"
88
89	gen.Repackage("gen_common.go", "common.go", pkg)
90	// Read the CLDR zip file.
91	r := gen.OpenCLDRCoreZip()
92	defer r.Close()
93
94	d := &cldr.Decoder{}
95	d.SetDirFilter("supplemental", "main")
96	d.SetSectionFilter("numbers", "plurals")
97	data, err := d.DecodeZip(r)
98	if err != nil {
99		log.Fatalf("DecodeZip: %v", err)
100	}
101
102	w := gen.NewCodeWriter()
103	defer w.WriteGoFile(*outputFile, pkg)
104
105	gen.WriteCLDRVersion(w)
106
107	genPlurals(w, data)
108
109	w = gen.NewCodeWriter()
110	defer w.WriteGoFile(*outputTestFile, pkg)
111
112	genPluralsTests(w, data)
113}
114
115type pluralTest struct {
116	locales string   // space-separated list of locales for this test
117	form    int      // Use int instead of Form to simplify generation.
118	integer []string // Entries of the form \d+ or \d+~\d+
119	decimal []string // Entries of the form \f+ or \f+ +~\f+, where f is \d+\.\d+
120}
121
122func genPluralsTests(w *gen.CodeWriter, data *cldr.CLDR) {
123	w.WriteType(pluralTest{})
124
125	for _, plurals := range data.Supplemental().Plurals {
126		if plurals.Type == "" {
127			// The empty type is reserved for plural ranges.
128			continue
129		}
130		tests := []pluralTest{}
131
132		for _, pRules := range plurals.PluralRules {
133			for _, rule := range pRules.PluralRule {
134				test := pluralTest{
135					locales: pRules.Locales,
136					form:    int(countMap[rule.Count]),
137				}
138				scan := bufio.NewScanner(strings.NewReader(rule.Data()))
139				scan.Split(splitTokens)
140				var p *[]string
141				for scan.Scan() {
142					switch t := scan.Text(); t {
143					case "@integer":
144						p = &test.integer
145					case "@decimal":
146						p = &test.decimal
147					case ",", "…":
148					default:
149						if p != nil {
150							*p = append(*p, t)
151						}
152					}
153				}
154				tests = append(tests, test)
155			}
156		}
157		w.WriteVar(plurals.Type+"Tests", tests)
158	}
159}
160
161func genPlurals(w *gen.CodeWriter, data *cldr.CLDR) {
162	for _, plurals := range data.Supplemental().Plurals {
163		if plurals.Type == "" {
164			continue
165		}
166		// Initialize setMap and inclusionMasks. They are already populated with
167		// a few entries to serve as an example and to assign nice numbers to
168		// common cases.
169
170		// setMap contains sets of numbers represented by boolean arrays where
171		// a true value for element i means that the number i is included.
172		setMap := map[[numN]bool]int{
173			// The above init func adds an entry for including all numbers.
174			[numN]bool{1: true}: 1, // fix {1} to a nice value
175			[numN]bool{2: true}: 2, // fix {2} to a nice value
176			[numN]bool{0: true}: 3, // fix {0} to a nice value
177		}
178
179		// inclusionMasks contains bit masks for every number under numN to
180		// indicate in which set the number is included. Bit 1 << x will be set
181		// if it is included in set x.
182		inclusionMasks := [numN]uint64{
183			// Note: these entries are not complete: more bits will be set along the way.
184			0: 1 << 3,
185			1: 1 << 1,
186			2: 1 << 2,
187		}
188
189		// Create set {0..99}. We will assign this set the identifier 0.
190		var all [numN]bool
191		for i := range all {
192			// Mark number i as being included in the set (which has identifier 0).
193			inclusionMasks[i] |= 1 << 0
194			// Mark number i as included in the set.
195			all[i] = true
196		}
197		// Register the identifier for the set.
198		setMap[all] = 0
199
200		rules := []pluralCheck{}
201		index := []byte{0}
202		langMap := map[compact.ID]byte{0: 0}
203
204		for _, pRules := range plurals.PluralRules {
205			// Parse the rules.
206			var conds []orCondition
207			for _, rule := range pRules.PluralRule {
208				form := countMap[rule.Count]
209				conds = parsePluralCondition(conds, rule.Data(), form)
210			}
211			// Encode the rules.
212			for _, c := range conds {
213				// If an or condition only has filters, we create an entry for
214				// this filter and the set that contains all values.
215				empty := true
216				for _, b := range c.used {
217					empty = empty && !b
218				}
219				if empty {
220					rules = append(rules, pluralCheck{
221						cat:   byte(opMod<<opShift) | byte(c.form),
222						setID: 0, // all values
223					})
224					continue
225				}
226				// We have some entries with values.
227				for i, set := range c.set {
228					if !c.used[i] {
229						continue
230					}
231					index, ok := setMap[set]
232					if !ok {
233						index = len(setMap)
234						setMap[set] = index
235						for i := range inclusionMasks {
236							if set[i] {
237								inclusionMasks[i] |= 1 << uint64(index)
238							}
239						}
240					}
241					rules = append(rules, pluralCheck{
242						cat:   byte(i<<opShift | andNext),
243						setID: byte(index),
244					})
245				}
246				// Now set the last entry to the plural form the rule matches.
247				rules[len(rules)-1].cat &^= formMask
248				rules[len(rules)-1].cat |= byte(c.form)
249			}
250			// Point the relevant locales to the created entries.
251			for _, loc := range strings.Split(pRules.Locales, " ") {
252				if strings.TrimSpace(loc) == "" {
253					continue
254				}
255				lang, ok := compact.FromTag(language.MustParse(loc))
256				if !ok {
257					log.Printf("No compact index for locale %q", loc)
258				}
259				langMap[lang] = byte(len(index) - 1)
260			}
261			index = append(index, byte(len(rules)))
262		}
263		w.WriteVar(plurals.Type+"Rules", rules)
264		w.WriteVar(plurals.Type+"Index", index)
265		// Expand the values: first by using the parent relationship.
266		langToIndex := make([]byte, compact.NumCompactTags)
267		for i := range langToIndex {
268			for p := compact.ID(i); ; p = p.Parent() {
269				if x, ok := langMap[p]; ok {
270					langToIndex[i] = x
271					break
272				}
273			}
274		}
275		// Now expand by including entries with identical languages for which
276		// one isn't set.
277		for i, v := range langToIndex {
278			if v == 0 {
279				id, _ := compact.FromTag(language.Tag{
280					LangID: compact.ID(i).Tag().LangID,
281				})
282				if p := langToIndex[id]; p != 0 {
283					langToIndex[i] = p
284				}
285			}
286		}
287		w.WriteVar(plurals.Type+"LangToIndex", langToIndex)
288		// Need to convert array to slice because of golang.org/issue/7651.
289		// This will allow tables to be dropped when unused. This is especially
290		// relevant for the ordinal data, which I suspect won't be used as much.
291		w.WriteVar(plurals.Type+"InclusionMasks", inclusionMasks[:])
292
293		if len(rules) > 0xFF {
294			log.Fatalf("Too many entries for rules: %#x", len(rules))
295		}
296		if len(index) > 0xFF {
297			log.Fatalf("Too many entries for index: %#x", len(index))
298		}
299		if len(setMap) > 64 { // maximum number of bits.
300			log.Fatalf("Too many entries for setMap: %d", len(setMap))
301		}
302		w.WriteComment(
303			"Slots used for %s: %X of 0xFF rules; %X of 0xFF indexes; %d of 64 sets",
304			plurals.Type, len(rules), len(index), len(setMap))
305		// Prevent comment from attaching to the next entry.
306		fmt.Fprint(w, "\n\n")
307	}
308}
309
310type orCondition struct {
311	original string // for debugging
312
313	form Form
314	used [32]bool
315	set  [32][numN]bool
316}
317
318func (o *orCondition) add(op opID, mod int, v []int) (ok bool) {
319	ok = true
320	for _, x := range v {
321		if x >= maxMod {
322			ok = false
323			break
324		}
325	}
326	for i := 0; i < numN; i++ {
327		m := i
328		if mod != 0 {
329			m = i % mod
330		}
331		if !intIn(m, v) {
332			o.set[op][i] = false
333		}
334	}
335	if ok {
336		o.used[op] = true
337	}
338	return ok
339}
340
341func intIn(x int, a []int) bool {
342	for _, y := range a {
343		if x == y {
344			return true
345		}
346	}
347	return false
348}
349
350var operandIndex = map[string]opID{
351	"i": opI,
352	"n": opN,
353	"f": opF,
354	"v": opV,
355	"w": opW,
356}
357
358// parsePluralCondition parses the condition of a single pluralRule and appends
359// the resulting or conditions to conds.
360//
361// Example rules:
362//   // Category "one" in English: only allow 1 with no visible fraction
363//   i = 1 and v = 0 @integer 1
364//
365//   // Category "few" in Czech: all numbers with visible fractions
366//   v != 0   @decimal ...
367//
368//   // Category "zero" in Latvian: all multiples of 10 or the numbers 11-19 or
369//   // numbers with a fraction 11..19 and no trailing zeros.
370//   n % 10 = 0 or n % 100 = 11..19 or v = 2 and f % 100 = 11..19 @integer ...
371//
372// @integer and @decimal are followed by examples and are not relevant for the
373// rule itself. The are used here to signal the termination of the rule.
374func parsePluralCondition(conds []orCondition, s string, f Form) []orCondition {
375	scan := bufio.NewScanner(strings.NewReader(s))
376	scan.Split(splitTokens)
377	for {
378		cond := orCondition{original: s, form: f}
379		// Set all numbers to be allowed for all number classes and restrict
380		// from here on.
381		for i := range cond.set {
382			for j := range cond.set[i] {
383				cond.set[i][j] = true
384			}
385		}
386	andLoop:
387		for {
388			var token string
389			scan.Scan() // Must exist.
390			switch class := scan.Text(); class {
391			case "t":
392				class = "w" // equal to w for t == 0
393				fallthrough
394			case "n", "i", "f", "v", "w":
395				op := scanToken(scan)
396				opCode := operandIndex[class]
397				mod := 0
398				if op == "%" {
399					opCode |= opMod
400
401					switch v := scanUint(scan); v {
402					case 10, 100:
403						mod = v
404					case 1000:
405						// A more general solution would be to allow checking
406						// against multiples of 100 and include entries for the
407						// numbers 100..900 in the inclusion masks. At the
408						// moment this would only help Azerbaijan and Italian.
409
410						// Italian doesn't use '%', so this must be Azerbaijan.
411						cond.used[opAzerbaijan00s] = true
412						return append(conds, cond)
413
414					case 1000000:
415						cond.used[opBretonM] = true
416						return append(conds, cond)
417
418					default:
419						log.Fatalf("Modulo value not supported %d", v)
420					}
421					op = scanToken(scan)
422				}
423				if op != "=" && op != "!=" {
424					log.Fatalf("Unexpected op %q", op)
425				}
426				if op == "!=" {
427					opCode |= opNotEqual
428				}
429				a := []int{}
430				v := scanUint(scan)
431				if class == "w" && v != 0 {
432					log.Fatalf("Must compare against zero for operand type %q", class)
433				}
434				token = scanToken(scan)
435				for {
436					switch token {
437					case "..":
438						end := scanUint(scan)
439						for ; v <= end; v++ {
440							a = append(a, v)
441						}
442						token = scanToken(scan)
443					default: // ",", "or", "and", "@..."
444						a = append(a, v)
445					}
446					if token != "," {
447						break
448					}
449					v = scanUint(scan)
450					token = scanToken(scan)
451				}
452				if !cond.add(opCode, mod, a) {
453					// Detected large numbers. As we ruled out Azerbaijan, this
454					// must be the many rule for Italian ordinals.
455					cond.set[opItalian800] = cond.set[opN]
456					cond.used[opItalian800] = true
457				}
458
459			case "@integer", "@decimal": // "other" entry: tests only.
460				return conds
461			default:
462				log.Fatalf("Unexpected operand class %q (%s)", class, s)
463			}
464			switch token {
465			case "or":
466				conds = append(conds, cond)
467				break andLoop
468			case "@integer", "@decimal": // examples
469				// There is always an example in practice, so we always terminate here.
470				if err := scan.Err(); err != nil {
471					log.Fatal(err)
472				}
473				return append(conds, cond)
474			case "and":
475				// keep accumulating
476			default:
477				log.Fatalf("Unexpected token %q", token)
478			}
479		}
480	}
481}
482
483func scanToken(scan *bufio.Scanner) string {
484	scan.Scan()
485	return scan.Text()
486}
487
488func scanUint(scan *bufio.Scanner) int {
489	scan.Scan()
490	val, err := strconv.ParseUint(scan.Text(), 10, 32)
491	if err != nil {
492		log.Fatal(err)
493	}
494	return int(val)
495}
496
497// splitTokens can be used with bufio.Scanner to tokenize CLDR plural rules.
498func splitTokens(data []byte, atEOF bool) (advance int, token []byte, err error) {
499	condTokens := [][]byte{
500		[]byte(".."),
501		[]byte(","),
502		[]byte("!="),
503		[]byte("="),
504	}
505	advance, token, err = bufio.ScanWords(data, atEOF)
506	for _, t := range condTokens {
507		if len(t) >= len(token) {
508			continue
509		}
510		switch p := bytes.Index(token, t); {
511		case p == -1:
512		case p == 0:
513			advance = len(t)
514			token = token[:len(t)]
515			return advance - len(token) + len(t), token[:len(t)], err
516		case p < advance:
517			// Don't split when "=" overlaps "!=".
518			if t[0] == '=' && token[p-1] == '!' {
519				continue
520			}
521			advance = p
522			token = token[:p]
523		}
524	}
525	return advance, token, err
526}
527