1// Copyright 2015 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5//go:build ignore
6// +build ignore
7
8// This program generates the trie for width operations. The generated table
9// includes width category information as well as the normalization mappings.
10package main
11
12import (
13	"bytes"
14	"fmt"
15	"io"
16	"log"
17	"math"
18	"unicode/utf8"
19
20	"golang.org/x/text/internal/gen"
21	"golang.org/x/text/internal/triegen"
22)
23
24// See gen_common.go for flags.
25
26func main() {
27	gen.Init()
28	genTables()
29	genTests()
30	gen.Repackage("gen_trieval.go", "trieval.go", "width")
31	gen.Repackage("gen_common.go", "common_test.go", "width")
32}
33
34func genTables() {
35	t := triegen.NewTrie("width")
36	// fold and inverse mappings. See mapComment for a description of the format
37	// of each entry. Add dummy value to make an index of 0 mean no mapping.
38	inverse := [][4]byte{{}}
39	mapping := map[[4]byte]int{[4]byte{}: 0}
40
41	getWidthData(func(r rune, tag elem, alt rune) {
42		idx := 0
43		if alt != 0 {
44			var buf [4]byte
45			buf[0] = byte(utf8.EncodeRune(buf[1:], alt))
46			s := string(r)
47			buf[buf[0]] ^= s[len(s)-1]
48			var ok bool
49			if idx, ok = mapping[buf]; !ok {
50				idx = len(mapping)
51				if idx > math.MaxUint8 {
52					log.Fatalf("Index %d does not fit in a byte.", idx)
53				}
54				mapping[buf] = idx
55				inverse = append(inverse, buf)
56			}
57		}
58		t.Insert(r, uint64(tag|elem(idx)))
59	})
60
61	w := &bytes.Buffer{}
62	gen.WriteUnicodeVersion(w)
63
64	sz, err := t.Gen(w)
65	if err != nil {
66		log.Fatal(err)
67	}
68
69	sz += writeMappings(w, inverse)
70
71	fmt.Fprintf(w, "// Total table size %d bytes (%dKiB)\n", sz, sz/1024)
72
73	gen.WriteVersionedGoFile(*outputFile, "width", w.Bytes())
74}
75
// inverseDataComment is the comment that writeMappings prints ahead of the
// generated inverseData table. It describes the layout of each 4-byte entry
// and the xor trick used to share entries between mappings. The example
// lines show the fullwidth source letters (U+FF21, U+FF22) on the left and
// their ASCII counterparts on the right.
const inverseDataComment = `
// inverseData contains 4-byte entries of the following format:
//   <length> <modified UTF-8-encoded rune> <0 padding>
// The last byte of the UTF-8-encoded rune is xor-ed with the last byte of the
// UTF-8 encoding of the original rune. Mappings often have the following
// pattern:
//   Ａ -> A  (U+FF21 -> U+0041)
//   Ｂ -> B  (U+FF22 -> U+0042)
//   ...
// By xor-ing the last byte the same entry can be shared by many mappings. This
// reduces the total number of distinct entries by about two thirds.
// The resulting entry for the aforementioned mappings is
//   { 0x01, 0xE0, 0x00, 0x00 }
// Using this entry to map U+FF21 (UTF-8 [EF BC A1]), we get
//   E0 ^ A1 = 41.
// Similarly, for U+FF22 (UTF-8 [EF BC A2]), we get
//   E0 ^ A2 = 42.
// Note that because of the xor-ing, the byte sequence stored in the entry is
// not valid UTF-8.`
95
96func writeMappings(w io.Writer, data [][4]byte) int {
97	fmt.Fprintln(w, inverseDataComment)
98	fmt.Fprintf(w, "var inverseData = [%d][4]byte{\n", len(data))
99	for _, x := range data {
100		fmt.Fprintf(w, "{ 0x%02x, 0x%02x, 0x%02x, 0x%02x },\n", x[0], x[1], x[2], x[3])
101	}
102	fmt.Fprintln(w, "}")
103	return len(data) * 4
104}
105
106func genTests() {
107	w := &bytes.Buffer{}
108	fmt.Fprintf(w, "\nvar mapRunes = map[rune]struct{r rune; e elem}{\n")
109	getWidthData(func(r rune, tag elem, alt rune) {
110		if alt != 0 {
111			fmt.Fprintf(w, "\t0x%X: {0x%X, 0x%X},\n", r, alt, tag)
112		}
113	})
114	fmt.Fprintln(w, "}")
115	gen.WriteGoFile("runes_test.go", "width", w.Bytes())
116}
117