1// Copyright 2016 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// +build ignore
6
7package main
8
9// This file generates data for the CLDR plural rules, as defined in
10//    https://unicode.org/reports/tr35/tr35-numbers.html#Language_Plural_Rules
11//
12// We assume a slightly simplified grammar:
13//
14// 		condition     = and_condition ('or' and_condition)* samples
15// 		and_condition = relation ('and' relation)*
16// 		relation      = expr ('=' | '!=') range_list
17// 		expr          = operand ('%' '10' '0'* )?
18// 		operand       = 'n' | 'i' | 'f' | 't' | 'v' | 'w'
19// 		range_list    = (range | value) (',' range_list)*
20// 		range         = value'..'value
21// 		value         = digit+
22// 		digit         = 0|1|2|3|4|5|6|7|8|9
23//
24// 		samples       = ('@integer' sampleList)?
25// 		                ('@decimal' sampleList)?
26// 		sampleList    = sampleRange (',' sampleRange)* (',' ('…'|'...'))?
27// 		sampleRange   = decimalValue ('~' decimalValue)?
28// 		decimalValue  = value ('.' value)?
29//
30//		Symbol	Value
31//		n	absolute value of the source number (integer and decimals).
32//		i	integer digits of n.
33//		v	number of visible fraction digits in n, with trailing zeros.
34//		w	number of visible fraction digits in n, without trailing zeros.
35//		f	visible fractional digits in n, with trailing zeros.
36//		t	visible fractional digits in n, without trailing zeros.
37//
38// The algorithm for which the data is generated is based on the following
39// observations
40//
41//    - the number of different sets of numbers which the plural rules use to
42//      test inclusion is limited,
43//    - most numbers that are tested on are < 100
44//
45// This allows us to define a bitmap for each number < 100 where a bit i
46// indicates whether this number is included in some defined set i.
47// The function matchPlural in plural.go defines how we can subsequently use
48// this data to determine inclusion.
49//
50// There are a few languages for which this doesn't work. For one Italian and
51// Azerbaijan, which both test against numbers > 100 for ordinals and Breton,
52// which considers whether numbers are multiples of hundreds. The model here
53// could be extended to handle Italian and Azerbaijan fairly easily (by
54// considering the numbers 100, 200, 300, ..., 800, 900 in addition to the first
55// 100), but for now it seems easier to just hard-code these cases.
56
57import (
58	"bufio"
59	"bytes"
60	"flag"
61	"fmt"
62	"log"
63	"strconv"
64	"strings"
65
66	"golang.org/x/text/internal/gen"
67	"golang.org/x/text/internal/language"
68	"golang.org/x/text/internal/language/compact"
69	"golang.org/x/text/unicode/cldr"
70)
71
72var (
73	test = flag.Bool("test", false,
74		"test existing tables; can be used to compare web data with package data.")
75	outputFile     = flag.String("output", "tables.go", "output file")
76	outputTestFile = flag.String("testoutput", "data_test.go", "output file")
77
78	draft = flag.String("draft",
79		"contributed",
80		`Minimal draft requirements (approved, contributed, provisional, unconfirmed).`)
81)
82
83func main() {
84	gen.Init()
85
86	const pkg = "plural"
87
88	gen.Repackage("gen_common.go", "common.go", pkg)
89	// Read the CLDR zip file.
90	r := gen.OpenCLDRCoreZip()
91	defer r.Close()
92
93	d := &cldr.Decoder{}
94	d.SetDirFilter("supplemental", "main")
95	d.SetSectionFilter("numbers", "plurals")
96	data, err := d.DecodeZip(r)
97	if err != nil {
98		log.Fatalf("DecodeZip: %v", err)
99	}
100
101	w := gen.NewCodeWriter()
102	defer w.WriteGoFile(*outputFile, pkg)
103
104	gen.WriteCLDRVersion(w)
105
106	genPlurals(w, data)
107
108	w = gen.NewCodeWriter()
109	defer w.WriteGoFile(*outputTestFile, pkg)
110
111	genPluralsTests(w, data)
112}
113
114type pluralTest struct {
115	locales string   // space-separated list of locales for this test
116	form    int      // Use int instead of Form to simplify generation.
117	integer []string // Entries of the form \d+ or \d+~\d+
118	decimal []string // Entries of the form \f+ or \f+ +~\f+, where f is \d+\.\d+
119}
120
121func genPluralsTests(w *gen.CodeWriter, data *cldr.CLDR) {
122	w.WriteType(pluralTest{})
123
124	for _, plurals := range data.Supplemental().Plurals {
125		if plurals.Type == "" {
126			// The empty type is reserved for plural ranges.
127			continue
128		}
129		tests := []pluralTest{}
130
131		for _, pRules := range plurals.PluralRules {
132			for _, rule := range pRules.PluralRule {
133				test := pluralTest{
134					locales: pRules.Locales,
135					form:    int(countMap[rule.Count]),
136				}
137				scan := bufio.NewScanner(strings.NewReader(rule.Data()))
138				scan.Split(splitTokens)
139				var p *[]string
140				for scan.Scan() {
141					switch t := scan.Text(); t {
142					case "@integer":
143						p = &test.integer
144					case "@decimal":
145						p = &test.decimal
146					case ",", "…":
147					default:
148						if p != nil {
149							*p = append(*p, t)
150						}
151					}
152				}
153				tests = append(tests, test)
154			}
155		}
156		w.WriteVar(plurals.Type+"Tests", tests)
157	}
158}
159
160func genPlurals(w *gen.CodeWriter, data *cldr.CLDR) {
161	for _, plurals := range data.Supplemental().Plurals {
162		if plurals.Type == "" {
163			continue
164		}
165		// Initialize setMap and inclusionMasks. They are already populated with
166		// a few entries to serve as an example and to assign nice numbers to
167		// common cases.
168
169		// setMap contains sets of numbers represented by boolean arrays where
170		// a true value for element i means that the number i is included.
171		setMap := map[[numN]bool]int{
172			// The above init func adds an entry for including all numbers.
173			[numN]bool{1: true}: 1, // fix {1} to a nice value
174			[numN]bool{2: true}: 2, // fix {2} to a nice value
175			[numN]bool{0: true}: 3, // fix {0} to a nice value
176		}
177
178		// inclusionMasks contains bit masks for every number under numN to
179		// indicate in which set the number is included. Bit 1 << x will be set
180		// if it is included in set x.
181		inclusionMasks := [numN]uint64{
182			// Note: these entries are not complete: more bits will be set along the way.
183			0: 1 << 3,
184			1: 1 << 1,
185			2: 1 << 2,
186		}
187
188		// Create set {0..99}. We will assign this set the identifier 0.
189		var all [numN]bool
190		for i := range all {
191			// Mark number i as being included in the set (which has identifier 0).
192			inclusionMasks[i] |= 1 << 0
193			// Mark number i as included in the set.
194			all[i] = true
195		}
196		// Register the identifier for the set.
197		setMap[all] = 0
198
199		rules := []pluralCheck{}
200		index := []byte{0}
201		langMap := map[compact.ID]byte{0: 0}
202
203		for _, pRules := range plurals.PluralRules {
204			// Parse the rules.
205			var conds []orCondition
206			for _, rule := range pRules.PluralRule {
207				form := countMap[rule.Count]
208				conds = parsePluralCondition(conds, rule.Data(), form)
209			}
210			// Encode the rules.
211			for _, c := range conds {
212				// If an or condition only has filters, we create an entry for
213				// this filter and the set that contains all values.
214				empty := true
215				for _, b := range c.used {
216					empty = empty && !b
217				}
218				if empty {
219					rules = append(rules, pluralCheck{
220						cat:   byte(opMod<<opShift) | byte(c.form),
221						setID: 0, // all values
222					})
223					continue
224				}
225				// We have some entries with values.
226				for i, set := range c.set {
227					if !c.used[i] {
228						continue
229					}
230					index, ok := setMap[set]
231					if !ok {
232						index = len(setMap)
233						setMap[set] = index
234						for i := range inclusionMasks {
235							if set[i] {
236								inclusionMasks[i] |= 1 << uint64(index)
237							}
238						}
239					}
240					rules = append(rules, pluralCheck{
241						cat:   byte(i<<opShift | andNext),
242						setID: byte(index),
243					})
244				}
245				// Now set the last entry to the plural form the rule matches.
246				rules[len(rules)-1].cat &^= formMask
247				rules[len(rules)-1].cat |= byte(c.form)
248			}
249			// Point the relevant locales to the created entries.
250			for _, loc := range strings.Split(pRules.Locales, " ") {
251				if strings.TrimSpace(loc) == "" {
252					continue
253				}
254				lang, ok := compact.FromTag(language.MustParse(loc))
255				if !ok {
256					log.Printf("No compact index for locale %q", loc)
257				}
258				langMap[lang] = byte(len(index) - 1)
259			}
260			index = append(index, byte(len(rules)))
261		}
262		w.WriteVar(plurals.Type+"Rules", rules)
263		w.WriteVar(plurals.Type+"Index", index)
264		// Expand the values: first by using the parent relationship.
265		langToIndex := make([]byte, compact.NumCompactTags)
266		for i := range langToIndex {
267			for p := compact.ID(i); ; p = p.Parent() {
268				if x, ok := langMap[p]; ok {
269					langToIndex[i] = x
270					break
271				}
272			}
273		}
274		// Now expand by including entries with identical languages for which
275		// one isn't set.
276		for i, v := range langToIndex {
277			if v == 0 {
278				id, _ := compact.FromTag(language.Tag{
279					LangID: compact.ID(i).Tag().LangID,
280				})
281				if p := langToIndex[id]; p != 0 {
282					langToIndex[i] = p
283				}
284			}
285		}
286		w.WriteVar(plurals.Type+"LangToIndex", langToIndex)
287		// Need to convert array to slice because of golang.org/issue/7651.
288		// This will allow tables to be dropped when unused. This is especially
289		// relevant for the ordinal data, which I suspect won't be used as much.
290		w.WriteVar(plurals.Type+"InclusionMasks", inclusionMasks[:])
291
292		if len(rules) > 0xFF {
293			log.Fatalf("Too many entries for rules: %#x", len(rules))
294		}
295		if len(index) > 0xFF {
296			log.Fatalf("Too many entries for index: %#x", len(index))
297		}
298		if len(setMap) > 64 { // maximum number of bits.
299			log.Fatalf("Too many entries for setMap: %d", len(setMap))
300		}
301		w.WriteComment(
302			"Slots used for %s: %X of 0xFF rules; %X of 0xFF indexes; %d of 64 sets",
303			plurals.Type, len(rules), len(index), len(setMap))
304		// Prevent comment from attaching to the next entry.
305		fmt.Fprint(w, "\n\n")
306	}
307}
308
309type orCondition struct {
310	original string // for debugging
311
312	form Form
313	used [32]bool
314	set  [32][numN]bool
315}
316
317func (o *orCondition) add(op opID, mod int, v []int) (ok bool) {
318	ok = true
319	for _, x := range v {
320		if x >= maxMod {
321			ok = false
322			break
323		}
324	}
325	for i := 0; i < numN; i++ {
326		m := i
327		if mod != 0 {
328			m = i % mod
329		}
330		if !intIn(m, v) {
331			o.set[op][i] = false
332		}
333	}
334	if ok {
335		o.used[op] = true
336	}
337	return ok
338}
339
340func intIn(x int, a []int) bool {
341	for _, y := range a {
342		if x == y {
343			return true
344		}
345	}
346	return false
347}
348
349var operandIndex = map[string]opID{
350	"i": opI,
351	"n": opN,
352	"f": opF,
353	"v": opV,
354	"w": opW,
355}
356
357// parsePluralCondition parses the condition of a single pluralRule and appends
358// the resulting or conditions to conds.
359//
360// Example rules:
361//   // Category "one" in English: only allow 1 with no visible fraction
362//   i = 1 and v = 0 @integer 1
363//
364//   // Category "few" in Czech: all numbers with visible fractions
365//   v != 0   @decimal ...
366//
367//   // Category "zero" in Latvian: all multiples of 10 or the numbers 11-19 or
368//   // numbers with a fraction 11..19 and no trailing zeros.
369//   n % 10 = 0 or n % 100 = 11..19 or v = 2 and f % 100 = 11..19 @integer ...
370//
371// @integer and @decimal are followed by examples and are not relevant for the
372// rule itself. The are used here to signal the termination of the rule.
373func parsePluralCondition(conds []orCondition, s string, f Form) []orCondition {
374	scan := bufio.NewScanner(strings.NewReader(s))
375	scan.Split(splitTokens)
376	for {
377		cond := orCondition{original: s, form: f}
378		// Set all numbers to be allowed for all number classes and restrict
379		// from here on.
380		for i := range cond.set {
381			for j := range cond.set[i] {
382				cond.set[i][j] = true
383			}
384		}
385	andLoop:
386		for {
387			var token string
388			scan.Scan() // Must exist.
389			switch class := scan.Text(); class {
390			case "t":
391				class = "w" // equal to w for t == 0
392				fallthrough
393			case "n", "i", "f", "v", "w":
394				op := scanToken(scan)
395				opCode := operandIndex[class]
396				mod := 0
397				if op == "%" {
398					opCode |= opMod
399
400					switch v := scanUint(scan); v {
401					case 10, 100:
402						mod = v
403					case 1000:
404						// A more general solution would be to allow checking
405						// against multiples of 100 and include entries for the
406						// numbers 100..900 in the inclusion masks. At the
407						// moment this would only help Azerbaijan and Italian.
408
409						// Italian doesn't use '%', so this must be Azerbaijan.
410						cond.used[opAzerbaijan00s] = true
411						return append(conds, cond)
412
413					case 1000000:
414						cond.used[opBretonM] = true
415						return append(conds, cond)
416
417					default:
418						log.Fatalf("Modulo value not supported %d", v)
419					}
420					op = scanToken(scan)
421				}
422				if op != "=" && op != "!=" {
423					log.Fatalf("Unexpected op %q", op)
424				}
425				if op == "!=" {
426					opCode |= opNotEqual
427				}
428				a := []int{}
429				v := scanUint(scan)
430				if class == "w" && v != 0 {
431					log.Fatalf("Must compare against zero for operand type %q", class)
432				}
433				token = scanToken(scan)
434				for {
435					switch token {
436					case "..":
437						end := scanUint(scan)
438						for ; v <= end; v++ {
439							a = append(a, v)
440						}
441						token = scanToken(scan)
442					default: // ",", "or", "and", "@..."
443						a = append(a, v)
444					}
445					if token != "," {
446						break
447					}
448					v = scanUint(scan)
449					token = scanToken(scan)
450				}
451				if !cond.add(opCode, mod, a) {
452					// Detected large numbers. As we ruled out Azerbaijan, this
453					// must be the many rule for Italian ordinals.
454					cond.set[opItalian800] = cond.set[opN]
455					cond.used[opItalian800] = true
456				}
457
458			case "@integer", "@decimal": // "other" entry: tests only.
459				return conds
460			default:
461				log.Fatalf("Unexpected operand class %q (%s)", class, s)
462			}
463			switch token {
464			case "or":
465				conds = append(conds, cond)
466				break andLoop
467			case "@integer", "@decimal": // examples
468				// There is always an example in practice, so we always terminate here.
469				if err := scan.Err(); err != nil {
470					log.Fatal(err)
471				}
472				return append(conds, cond)
473			case "and":
474				// keep accumulating
475			default:
476				log.Fatalf("Unexpected token %q", token)
477			}
478		}
479	}
480}
481
482func scanToken(scan *bufio.Scanner) string {
483	scan.Scan()
484	return scan.Text()
485}
486
487func scanUint(scan *bufio.Scanner) int {
488	scan.Scan()
489	val, err := strconv.ParseUint(scan.Text(), 10, 32)
490	if err != nil {
491		log.Fatal(err)
492	}
493	return int(val)
494}
495
496// splitTokens can be used with bufio.Scanner to tokenize CLDR plural rules.
497func splitTokens(data []byte, atEOF bool) (advance int, token []byte, err error) {
498	condTokens := [][]byte{
499		[]byte(".."),
500		[]byte(","),
501		[]byte("!="),
502		[]byte("="),
503	}
504	advance, token, err = bufio.ScanWords(data, atEOF)
505	for _, t := range condTokens {
506		if len(t) >= len(token) {
507			continue
508		}
509		switch p := bytes.Index(token, t); {
510		case p == -1:
511		case p == 0:
512			advance = len(t)
513			token = token[:len(t)]
514			return advance - len(token) + len(t), token[:len(t)], err
515		case p < advance:
516			// Don't split when "=" overlaps "!=".
517			if t[0] == '=' && token[p-1] == '!' {
518				continue
519			}
520			advance = p
521			token = token[:p]
522		}
523	}
524	return advance, token, err
525}
526