1// Copyright 2011 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// +build ignore
6
7// Trie table generator.
8// Used by make*tables tools to generate a go file with trie data structures
9// for mapping UTF-8 to a 16-bit value. All but the last byte in a UTF-8 byte
10// sequence are used to lookup offsets in the index table to be used for the
11// next byte. The last byte is used to index into a table with 16-bit values.
12
13package main
14
15import (
16	"fmt"
17	"hash/crc32"
18	"log"
19	"unicode/utf8"
20)
21
22const (
23	blockSize        = 64
24	blockOffset      = 2 // Subtract two blocks to compensate for the 0x80 added to continuation bytes.
25	maxSparseEntries = 16
26)
27
28// Intermediate trie structure
29type trieNode struct {
30	table [256]*trieNode
31	value int
32	b     byte
33	leaf  bool
34}
35
36func newNode() *trieNode {
37	return new(trieNode)
38}
39
40func (n trieNode) String() string {
41	s := fmt.Sprint("trieNode{table: { non-nil at index: ")
42	for i, v := range n.table {
43		if v != nil {
44			s += fmt.Sprintf("%d, ", i)
45		}
46	}
47	s += fmt.Sprintf("}, value:%#x, b:%#x leaf:%v}", n.value, n.b, n.leaf)
48	return s
49}
50
51func (n trieNode) isInternal() bool {
52	internal := true
53	for i := 0; i < 256; i++ {
54		if nn := n.table[i]; nn != nil {
55			if !internal && !nn.leaf {
56				log.Fatalf("triegen: isInternal: node contains both leaf and non-leaf children (%v)", n)
57			}
58			internal = internal && !nn.leaf
59		}
60	}
61	return internal
62}
63
64func (n trieNode) mostFrequentStride() int {
65	counts := make(map[int]int)
66	v := 0
67	for _, t := range n.table[0x80 : 0x80+blockSize] {
68		if t != nil {
69			if stride := t.value - v; v != 0 && stride >= 0 {
70				counts[stride]++
71			}
72			v = t.value
73		} else {
74			v = 0
75		}
76	}
77	var maxs, maxc int
78	for stride, cnt := range counts {
79		if cnt > maxc || (cnt == maxc && stride < maxs) {
80			maxs, maxc = stride, cnt
81		}
82	}
83	return maxs
84}
85
86func (n trieNode) countSparseEntries() int {
87	stride := n.mostFrequentStride()
88	var count, v int
89	for _, t := range n.table[0x80 : 0x80+blockSize] {
90		tv := 0
91		if t != nil {
92			tv = t.value
93		}
94		if tv-v != stride {
95			if tv != 0 {
96				count++
97			}
98		}
99		v = tv
100	}
101	return count
102}
103
104func (n *trieNode) insert(r rune, value uint16) {
105	var p [utf8.UTFMax]byte
106	sz := utf8.EncodeRune(p[:], r)
107
108	for i := 0; i < sz; i++ {
109		if n.leaf {
110			log.Fatalf("triegen: insert: node (%#v) should not be a leaf", n)
111		}
112		nn := n.table[p[i]]
113		if nn == nil {
114			nn = newNode()
115			nn.b = p[i]
116			n.table[p[i]] = nn
117		}
118		n = nn
119	}
120	n.value = int(value)
121	n.leaf = true
122}
123
124type nodeIndex struct {
125	lookupBlocks []*trieNode
126	valueBlocks  []*trieNode
127	sparseBlocks []*trieNode
128	sparseOffset []uint16
129	sparseCount  int
130
131	lookupBlockIdx map[uint32]int
132	valueBlockIdx  map[uint32]int
133}
134
135func newIndex() *nodeIndex {
136	index := &nodeIndex{}
137	index.lookupBlocks = make([]*trieNode, 0)
138	index.valueBlocks = make([]*trieNode, 0)
139	index.sparseBlocks = make([]*trieNode, 0)
140	index.sparseOffset = make([]uint16, 1)
141	index.lookupBlockIdx = make(map[uint32]int)
142	index.valueBlockIdx = make(map[uint32]int)
143	return index
144}
145
146func computeOffsets(index *nodeIndex, n *trieNode) int {
147	if n.leaf {
148		return n.value
149	}
150	hasher := crc32.New(crc32.MakeTable(crc32.IEEE))
151	// We only index continuation bytes.
152	for i := 0; i < blockSize; i++ {
153		v := 0
154		if nn := n.table[0x80+i]; nn != nil {
155			v = computeOffsets(index, nn)
156		}
157		hasher.Write([]byte{uint8(v >> 8), uint8(v)})
158	}
159	h := hasher.Sum32()
160	if n.isInternal() {
161		v, ok := index.lookupBlockIdx[h]
162		if !ok {
163			v = len(index.lookupBlocks) - blockOffset
164			index.lookupBlocks = append(index.lookupBlocks, n)
165			index.lookupBlockIdx[h] = v
166		}
167		n.value = v
168	} else {
169		v, ok := index.valueBlockIdx[h]
170		if !ok {
171			if c := n.countSparseEntries(); c > maxSparseEntries {
172				v = len(index.valueBlocks) - blockOffset
173				index.valueBlocks = append(index.valueBlocks, n)
174				index.valueBlockIdx[h] = v
175			} else {
176				v = -len(index.sparseOffset)
177				index.sparseBlocks = append(index.sparseBlocks, n)
178				index.sparseOffset = append(index.sparseOffset, uint16(index.sparseCount))
179				index.sparseCount += c + 1
180				index.valueBlockIdx[h] = v
181			}
182		}
183		n.value = v
184	}
185	return n.value
186}
187
188func printValueBlock(nr int, n *trieNode, offset int) {
189	boff := nr * blockSize
190	fmt.Printf("\n// Block %#x, offset %#x", nr, boff)
191	var printnewline bool
192	for i := 0; i < blockSize; i++ {
193		if i%6 == 0 {
194			printnewline = true
195		}
196		v := 0
197		if nn := n.table[i+offset]; nn != nil {
198			v = nn.value
199		}
200		if v != 0 {
201			if printnewline {
202				fmt.Printf("\n")
203				printnewline = false
204			}
205			fmt.Printf("%#04x:%#04x, ", boff+i, v)
206		}
207	}
208}
209
210func printSparseBlock(nr int, n *trieNode) {
211	boff := -n.value
212	fmt.Printf("\n// Block %#x, offset %#x", nr, boff)
213	v := 0
214	//stride := f(n)
215	stride := n.mostFrequentStride()
216	c := n.countSparseEntries()
217	fmt.Printf("\n{value:%#04x,lo:%#02x},", stride, uint8(c))
218	for i, nn := range n.table[0x80 : 0x80+blockSize] {
219		nv := 0
220		if nn != nil {
221			nv = nn.value
222		}
223		if nv-v != stride {
224			if v != 0 {
225				fmt.Printf(",hi:%#02x},", 0x80+i-1)
226			}
227			if nv != 0 {
228				fmt.Printf("\n{value:%#04x,lo:%#02x", nv, nn.b)
229			}
230		}
231		v = nv
232	}
233	if v != 0 {
234		fmt.Printf(",hi:%#02x},", 0x80+blockSize-1)
235	}
236}
237
238func printLookupBlock(nr int, n *trieNode, offset, cutoff int) {
239	boff := nr * blockSize
240	fmt.Printf("\n// Block %#x, offset %#x", nr, boff)
241	var printnewline bool
242	for i := 0; i < blockSize; i++ {
243		if i%8 == 0 {
244			printnewline = true
245		}
246		v := 0
247		if nn := n.table[i+offset]; nn != nil {
248			v = nn.value
249		}
250		if v != 0 {
251			if v < 0 {
252				v = -v - 1 + cutoff
253			}
254			if printnewline {
255				fmt.Printf("\n")
256				printnewline = false
257			}
258			fmt.Printf("%#03x:%#02x, ", boff+i, v)
259		}
260	}
261}
262
263// printTables returns the size in bytes of the generated tables.
264func (t *trieNode) printTables(name string) int {
265	index := newIndex()
266	// Values for 7-bit ASCII are stored in first two block, followed by nil block.
267	index.valueBlocks = append(index.valueBlocks, nil, nil, nil)
268	// First byte of multi-byte UTF-8 codepoints are indexed in 4th block.
269	index.lookupBlocks = append(index.lookupBlocks, nil, nil, nil, nil)
270	// Index starter bytes of multi-byte UTF-8.
271	for i := 0xC0; i < 0x100; i++ {
272		if t.table[i] != nil {
273			computeOffsets(index, t.table[i])
274		}
275	}
276
277	nv := len(index.valueBlocks) * blockSize
278	fmt.Printf("// %sValues: %d entries, %d bytes\n", name, nv, nv*2)
279	fmt.Printf("// Block 2 is the null block.\n")
280	fmt.Printf("var %sValues = [%d]uint16 {", name, nv)
281	printValueBlock(0, t, 0)
282	printValueBlock(1, t, 64)
283	printValueBlock(2, newNode(), 0)
284	for i := 3; i < len(index.valueBlocks); i++ {
285		printValueBlock(i, index.valueBlocks[i], 0x80)
286	}
287	fmt.Print("\n}\n\n")
288
289	ls := len(index.sparseBlocks)
290	fmt.Printf("// %sSparseOffset: %d entries, %d bytes\n", name, ls, ls*2)
291	fmt.Printf("var %sSparseOffset = %#v\n\n", name, index.sparseOffset[1:])
292
293	ns := index.sparseCount
294	fmt.Printf("// %sSparseValues: %d entries, %d bytes\n", name, ns, ns*4)
295	fmt.Printf("var %sSparseValues = [%d]valueRange {", name, ns)
296	for i, n := range index.sparseBlocks {
297		printSparseBlock(i, n)
298	}
299	fmt.Print("\n}\n\n")
300
301	cutoff := len(index.valueBlocks) - blockOffset
302	ni := len(index.lookupBlocks) * blockSize
303	fmt.Printf("// %sLookup: %d bytes\n", name, ni)
304	fmt.Printf("// Block 0 is the null block.\n")
305	fmt.Printf("var %sLookup = [%d]uint8 {", name, ni)
306	printLookupBlock(0, newNode(), 0, cutoff)
307	printLookupBlock(1, newNode(), 0, cutoff)
308	printLookupBlock(2, newNode(), 0, cutoff)
309	printLookupBlock(3, t, 0xC0, cutoff)
310	for i := 4; i < len(index.lookupBlocks); i++ {
311		printLookupBlock(i, index.lookupBlocks[i], 0x80, cutoff)
312	}
313	fmt.Print("\n}\n\n")
314	fmt.Printf("var %sTrie = trie{ %sLookup[:], %sValues[:], %sSparseValues[:], %sSparseOffset[:], %d}\n\n",
315		name, name, name, name, name, cutoff)
316	return nv*2 + ns*4 + ni + ls*2
317}
318