1// Copyright 2015 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5//go:build ignore 6// +build ignore 7 8// This program generates the trie for width operations. The generated table 9// includes width category information as well as the normalization mappings. 10package main 11 12import ( 13 "bytes" 14 "fmt" 15 "io" 16 "log" 17 "math" 18 "unicode/utf8" 19 20 "golang.org/x/text/internal/gen" 21 "golang.org/x/text/internal/triegen" 22) 23 24// See gen_common.go for flags. 25 26func main() { 27 gen.Init() 28 genTables() 29 genTests() 30 gen.Repackage("gen_trieval.go", "trieval.go", "width") 31 gen.Repackage("gen_common.go", "common_test.go", "width") 32} 33 34func genTables() { 35 t := triegen.NewTrie("width") 36 // fold and inverse mappings. See mapComment for a description of the format 37 // of each entry. Add dummy value to make an index of 0 mean no mapping. 38 inverse := [][4]byte{{}} 39 mapping := map[[4]byte]int{[4]byte{}: 0} 40 41 getWidthData(func(r rune, tag elem, alt rune) { 42 idx := 0 43 if alt != 0 { 44 var buf [4]byte 45 buf[0] = byte(utf8.EncodeRune(buf[1:], alt)) 46 s := string(r) 47 buf[buf[0]] ^= s[len(s)-1] 48 var ok bool 49 if idx, ok = mapping[buf]; !ok { 50 idx = len(mapping) 51 if idx > math.MaxUint8 { 52 log.Fatalf("Index %d does not fit in a byte.", idx) 53 } 54 mapping[buf] = idx 55 inverse = append(inverse, buf) 56 } 57 } 58 t.Insert(r, uint64(tag|elem(idx))) 59 }) 60 61 w := &bytes.Buffer{} 62 gen.WriteUnicodeVersion(w) 63 64 sz, err := t.Gen(w) 65 if err != nil { 66 log.Fatal(err) 67 } 68 69 sz += writeMappings(w, inverse) 70 71 fmt.Fprintf(w, "// Total table size %d bytes (%dKiB)\n", sz, sz/1024) 72 73 gen.WriteVersionedGoFile(*outputFile, "width", w.Bytes()) 74} 75 76const inverseDataComment = ` 77// inverseData contains 4-byte entries of the following format: 78// <length> <modified UTF-8-encoded rune> <0 padding> 79// The last byte of the UTF-8-encoded rune is xor-ed with the last byte of the 80// UTF-8 encoding of the original rune. Mappings often have the following 81// pattern: 82// A -> A (U+FF21 -> U+0041) 83// B -> B (U+FF22 -> U+0042) 84// ... 85// By xor-ing the last byte the same entry can be shared by many mappings. This 86// reduces the total number of distinct entries by about two thirds. 87// The resulting entry for the aforementioned mappings is 88// { 0x01, 0xE0, 0x00, 0x00 } 89// Using this entry to map U+FF21 (UTF-8 [EF BC A1]), we get 90// E0 ^ A1 = 41. 91// Similarly, for U+FF22 (UTF-8 [EF BC A2]), we get 92// E0 ^ A2 = 42. 93// Note that because of the xor-ing, the byte sequence stored in the entry is 94// not valid UTF-8.` 95 96func writeMappings(w io.Writer, data [][4]byte) int { 97 fmt.Fprintln(w, inverseDataComment) 98 fmt.Fprintf(w, "var inverseData = [%d][4]byte{\n", len(data)) 99 for _, x := range data { 100 fmt.Fprintf(w, "{ 0x%02x, 0x%02x, 0x%02x, 0x%02x },\n", x[0], x[1], x[2], x[3]) 101 } 102 fmt.Fprintln(w, "}") 103 return len(data) * 4 104} 105 106func genTests() { 107 w := &bytes.Buffer{} 108 fmt.Fprintf(w, "\nvar mapRunes = map[rune]struct{r rune; e elem}{\n") 109 getWidthData(func(r rune, tag elem, alt rune) { 110 if alt != 0 { 111 fmt.Fprintf(w, "\t0x%X: {0x%X, 0x%X},\n", r, alt, tag) 112 } 113 }) 114 fmt.Fprintln(w, "}") 115 gen.WriteGoFile("runes_test.go", "width", w.Bytes()) 116} 117