1// Copyright 2015 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5// +build ignore 6 7// This program generates the trie for width operations. The generated table 8// includes width category information as well as the normalization mappings. 9package main 10 11import ( 12 "bytes" 13 "fmt" 14 "io" 15 "log" 16 "math" 17 "unicode/utf8" 18 19 "golang.org/x/text/internal/gen" 20 "golang.org/x/text/internal/triegen" 21) 22 23// See gen_common.go for flags. 24 25func main() { 26 gen.Init() 27 genTables() 28 genTests() 29 gen.Repackage("gen_trieval.go", "trieval.go", "width") 30 gen.Repackage("gen_common.go", "common_test.go", "width") 31} 32 33func genTables() { 34 t := triegen.NewTrie("width") 35 // fold and inverse mappings. See mapComment for a description of the format 36 // of each entry. Add dummy value to make an index of 0 mean no mapping. 37 inverse := [][4]byte{{}} 38 mapping := map[[4]byte]int{[4]byte{}: 0} 39 40 getWidthData(func(r rune, tag elem, alt rune) { 41 idx := 0 42 if alt != 0 { 43 var buf [4]byte 44 buf[0] = byte(utf8.EncodeRune(buf[1:], alt)) 45 s := string(r) 46 buf[buf[0]] ^= s[len(s)-1] 47 var ok bool 48 if idx, ok = mapping[buf]; !ok { 49 idx = len(mapping) 50 if idx > math.MaxUint8 { 51 log.Fatalf("Index %d does not fit in a byte.", idx) 52 } 53 mapping[buf] = idx 54 inverse = append(inverse, buf) 55 } 56 } 57 t.Insert(r, uint64(tag|elem(idx))) 58 }) 59 60 w := &bytes.Buffer{} 61 gen.WriteUnicodeVersion(w) 62 63 sz, err := t.Gen(w) 64 if err != nil { 65 log.Fatal(err) 66 } 67 68 sz += writeMappings(w, inverse) 69 70 fmt.Fprintf(w, "// Total table size %d bytes (%dKiB)\n", sz, sz/1024) 71 72 gen.WriteVersionedGoFile(*outputFile, "width", w.Bytes()) 73} 74 75const inverseDataComment = ` 76// inverseData contains 4-byte entries of the following format: 77// <length> <modified UTF-8-encoded rune> <0 padding> 78// The last byte of the UTF-8-encoded rune is xor-ed with the last byte of the 79// UTF-8 encoding of the original rune. Mappings often have the following 80// pattern: 81// A -> A (U+FF21 -> U+0041) 82// B -> B (U+FF22 -> U+0042) 83// ... 84// By xor-ing the last byte the same entry can be shared by many mappings. This 85// reduces the total number of distinct entries by about two thirds. 86// The resulting entry for the aforementioned mappings is 87// { 0x01, 0xE0, 0x00, 0x00 } 88// Using this entry to map U+FF21 (UTF-8 [EF BC A1]), we get 89// E0 ^ A1 = 41. 90// Similarly, for U+FF22 (UTF-8 [EF BC A2]), we get 91// E0 ^ A2 = 42. 92// Note that because of the xor-ing, the byte sequence stored in the entry is 93// not valid UTF-8.` 94 95func writeMappings(w io.Writer, data [][4]byte) int { 96 fmt.Fprintln(w, inverseDataComment) 97 fmt.Fprintf(w, "var inverseData = [%d][4]byte{\n", len(data)) 98 for _, x := range data { 99 fmt.Fprintf(w, "{ 0x%02x, 0x%02x, 0x%02x, 0x%02x },\n", x[0], x[1], x[2], x[3]) 100 } 101 fmt.Fprintln(w, "}") 102 return len(data) * 4 103} 104 105func genTests() { 106 w := &bytes.Buffer{} 107 fmt.Fprintf(w, "\nvar mapRunes = map[rune]struct{r rune; e elem}{\n") 108 getWidthData(func(r rune, tag elem, alt rune) { 109 if alt != 0 { 110 fmt.Fprintf(w, "\t0x%X: {0x%X, 0x%X},\n", r, alt, tag) 111 } 112 }) 113 fmt.Fprintln(w, "}") 114 gen.WriteGoFile("runes_test.go", "width", w.Bytes()) 115} 116