1// Copyright 2012 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5// The trie in this file is used to associate the first full character
6// in a UTF-8 string to a collation element.
7// All but the last byte in a UTF-8 byte sequence are
8// used to look up offsets in the index table to be used for the next byte.
9// The last byte is used to index into a table of collation elements.
10// This file contains the code for the generation of the trie.
11
12package build
13
14import (
15	"fmt"
16	"hash/fnv"
17	"io"
18	"reflect"
19)
20
21const (
22	blockSize   = 64
23	blockOffset = 2 // Subtract 2 blocks to compensate for the 0x80 added to continuation bytes.
24)
25
26type trieHandle struct {
27	lookupStart uint16 // offset in table for first byte
28	valueStart  uint16 // offset in table for first byte
29}
30
31type trie struct {
32	index  []uint16
33	values []uint32
34}
35
36// trieNode is the intermediate trie structure used for generating a trie.
37type trieNode struct {
38	index    []*trieNode
39	value    []uint32
40	b        byte
41	refValue uint16
42	refIndex uint16
43}
44
45func newNode() *trieNode {
46	return &trieNode{
47		index: make([]*trieNode, 64),
48		value: make([]uint32, 128), // root node size is 128 instead of 64
49	}
50}
51
52func (n *trieNode) isInternal() bool {
53	return n.value != nil
54}
55
56func (n *trieNode) insert(r rune, value uint32) {
57	const maskx = 0x3F // mask out two most-significant bits
58	str := string(r)
59	if len(str) == 1 {
60		n.value[str[0]] = value
61		return
62	}
63	for i := 0; i < len(str)-1; i++ {
64		b := str[i] & maskx
65		if n.index == nil {
66			n.index = make([]*trieNode, blockSize)
67		}
68		nn := n.index[b]
69		if nn == nil {
70			nn = &trieNode{}
71			nn.b = b
72			n.index[b] = nn
73		}
74		n = nn
75	}
76	if n.value == nil {
77		n.value = make([]uint32, blockSize)
78	}
79	b := str[len(str)-1] & maskx
80	n.value[b] = value
81}
82
83type trieBuilder struct {
84	t *trie
85
86	roots []*trieHandle
87
88	lookupBlocks []*trieNode
89	valueBlocks  []*trieNode
90
91	lookupBlockIdx map[uint32]*trieNode
92	valueBlockIdx  map[uint32]*trieNode
93}
94
95func newTrieBuilder() *trieBuilder {
96	index := &trieBuilder{}
97	index.lookupBlocks = make([]*trieNode, 0)
98	index.valueBlocks = make([]*trieNode, 0)
99	index.lookupBlockIdx = make(map[uint32]*trieNode)
100	index.valueBlockIdx = make(map[uint32]*trieNode)
101	// The third nil is the default null block.  The other two blocks
102	// are used to guarantee an offset of at least 3 for each block.
103	index.lookupBlocks = append(index.lookupBlocks, nil, nil, nil)
104	index.t = &trie{}
105	return index
106}
107
108func (b *trieBuilder) computeOffsets(n *trieNode) *trieNode {
109	hasher := fnv.New32()
110	if n.index != nil {
111		for i, nn := range n.index {
112			var vi, vv uint16
113			if nn != nil {
114				nn = b.computeOffsets(nn)
115				n.index[i] = nn
116				vi = nn.refIndex
117				vv = nn.refValue
118			}
119			hasher.Write([]byte{byte(vi >> 8), byte(vi)})
120			hasher.Write([]byte{byte(vv >> 8), byte(vv)})
121		}
122		h := hasher.Sum32()
123		nn, ok := b.lookupBlockIdx[h]
124		if !ok {
125			n.refIndex = uint16(len(b.lookupBlocks)) - blockOffset
126			b.lookupBlocks = append(b.lookupBlocks, n)
127			b.lookupBlockIdx[h] = n
128		} else {
129			n = nn
130		}
131	} else {
132		for _, v := range n.value {
133			hasher.Write([]byte{byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)})
134		}
135		h := hasher.Sum32()
136		nn, ok := b.valueBlockIdx[h]
137		if !ok {
138			n.refValue = uint16(len(b.valueBlocks)) - blockOffset
139			n.refIndex = n.refValue
140			b.valueBlocks = append(b.valueBlocks, n)
141			b.valueBlockIdx[h] = n
142		} else {
143			n = nn
144		}
145	}
146	return n
147}
148
149func (b *trieBuilder) addStartValueBlock(n *trieNode) uint16 {
150	hasher := fnv.New32()
151	for _, v := range n.value[:2*blockSize] {
152		hasher.Write([]byte{byte(v >> 24), byte(v >> 16), byte(v >> 8), byte(v)})
153	}
154	h := hasher.Sum32()
155	nn, ok := b.valueBlockIdx[h]
156	if !ok {
157		n.refValue = uint16(len(b.valueBlocks))
158		n.refIndex = n.refValue
159		b.valueBlocks = append(b.valueBlocks, n)
160		// Add a dummy block to accommodate the double block size.
161		b.valueBlocks = append(b.valueBlocks, nil)
162		b.valueBlockIdx[h] = n
163	} else {
164		n = nn
165	}
166	return n.refValue
167}
168
169func genValueBlock(t *trie, n *trieNode) {
170	if n != nil {
171		for _, v := range n.value {
172			t.values = append(t.values, v)
173		}
174	}
175}
176
177func genLookupBlock(t *trie, n *trieNode) {
178	for _, nn := range n.index {
179		v := uint16(0)
180		if nn != nil {
181			if n.index != nil {
182				v = nn.refIndex
183			} else {
184				v = nn.refValue
185			}
186		}
187		t.index = append(t.index, v)
188	}
189}
190
191func (b *trieBuilder) addTrie(n *trieNode) *trieHandle {
192	h := &trieHandle{}
193	b.roots = append(b.roots, h)
194	h.valueStart = b.addStartValueBlock(n)
195	if len(b.roots) == 1 {
196		// We insert a null block after the first start value block.
197		// This ensures that continuation bytes UTF-8 sequences of length
198		// greater than 2 will automatically hit a null block if there
199		// was an undefined entry.
200		b.valueBlocks = append(b.valueBlocks, nil)
201	}
202	n = b.computeOffsets(n)
203	// Offset by one extra block as the first byte starts at 0xC0 instead of 0x80.
204	h.lookupStart = n.refIndex - 1
205	return h
206}
207
208// generate generates and returns the trie for n.
209func (b *trieBuilder) generate() (t *trie, err error) {
210	t = b.t
211	if len(b.valueBlocks) >= 1<<16 {
212		return nil, fmt.Errorf("maximum number of value blocks exceeded (%d > %d)", len(b.valueBlocks), 1<<16)
213	}
214	if len(b.lookupBlocks) >= 1<<16 {
215		return nil, fmt.Errorf("maximum number of lookup blocks exceeded (%d > %d)", len(b.lookupBlocks), 1<<16)
216	}
217	genValueBlock(t, b.valueBlocks[0])
218	genValueBlock(t, &trieNode{value: make([]uint32, 64)})
219	for i := 2; i < len(b.valueBlocks); i++ {
220		genValueBlock(t, b.valueBlocks[i])
221	}
222	n := &trieNode{index: make([]*trieNode, 64)}
223	genLookupBlock(t, n)
224	genLookupBlock(t, n)
225	genLookupBlock(t, n)
226	for i := 3; i < len(b.lookupBlocks); i++ {
227		genLookupBlock(t, b.lookupBlocks[i])
228	}
229	return b.t, nil
230}
231
232func (t *trie) printArrays(w io.Writer, name string) (n, size int, err error) {
233	p := func(f string, a ...interface{}) {
234		nn, e := fmt.Fprintf(w, f, a...)
235		n += nn
236		if err == nil {
237			err = e
238		}
239	}
240	nv := len(t.values)
241	p("// %sValues: %d entries, %d bytes\n", name, nv, nv*4)
242	p("// Block 2 is the null block.\n")
243	p("var %sValues = [%d]uint32 {", name, nv)
244	var printnewline bool
245	for i, v := range t.values {
246		if i%blockSize == 0 {
247			p("\n\t// Block %#x, offset %#x", i/blockSize, i)
248		}
249		if i%4 == 0 {
250			printnewline = true
251		}
252		if v != 0 {
253			if printnewline {
254				p("\n\t")
255				printnewline = false
256			}
257			p("%#04x:%#08x, ", i, v)
258		}
259	}
260	p("\n}\n\n")
261	ni := len(t.index)
262	p("// %sLookup: %d entries, %d bytes\n", name, ni, ni*2)
263	p("// Block 0 is the null block.\n")
264	p("var %sLookup = [%d]uint16 {", name, ni)
265	printnewline = false
266	for i, v := range t.index {
267		if i%blockSize == 0 {
268			p("\n\t// Block %#x, offset %#x", i/blockSize, i)
269		}
270		if i%8 == 0 {
271			printnewline = true
272		}
273		if v != 0 {
274			if printnewline {
275				p("\n\t")
276				printnewline = false
277			}
278			p("%#03x:%#02x, ", i, v)
279		}
280	}
281	p("\n}\n\n")
282	return n, nv*4 + ni*2, err
283}
284
285func (t *trie) printStruct(w io.Writer, handle *trieHandle, name string) (n, sz int, err error) {
286	const msg = "trie{ %sLookup[%d:], %sValues[%d:], %sLookup[:], %sValues[:]}"
287	n, err = fmt.Fprintf(w, msg, name, handle.lookupStart*blockSize, name, handle.valueStart*blockSize, name, name)
288	sz += int(reflect.TypeOf(trie{}).Size())
289	return
290}
291