1// Copyright 2014 The Go Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style
3// license that can be found in the LICENSE file.
4
5package triegen
6
7import (
8	"bytes"
9	"fmt"
10	"io"
11	"strings"
12	"text/template"
13)
14
15// print writes all the data structures as well as the code necessary to use the
16// trie to w.
17func (b *builder) print(w io.Writer) error {
18	b.Stats.NValueEntries = len(b.ValueBlocks) * blockSize
19	b.Stats.NValueBytes = len(b.ValueBlocks) * blockSize * b.ValueSize
20	b.Stats.NIndexEntries = len(b.IndexBlocks) * blockSize
21	b.Stats.NIndexBytes = len(b.IndexBlocks) * blockSize * b.IndexSize
22	b.Stats.NHandleBytes = len(b.Trie) * 2 * b.IndexSize
23
24	// If we only have one root trie, all starter blocks are at position 0 and
25	// we can access the arrays directly.
26	if len(b.Trie) == 1 {
27		// At this point we cannot refer to the generated tables directly.
28		b.ASCIIBlock = b.Name + "Values"
29		b.StarterBlock = b.Name + "Index"
30	} else {
31		// Otherwise we need to have explicit starter indexes in the trie
32		// structure.
33		b.ASCIIBlock = "t.ascii"
34		b.StarterBlock = "t.utf8Start"
35	}
36
37	b.SourceType = "[]byte"
38	if err := lookupGen.Execute(w, b); err != nil {
39		return err
40	}
41
42	b.SourceType = "string"
43	if err := lookupGen.Execute(w, b); err != nil {
44		return err
45	}
46
47	if err := trieGen.Execute(w, b); err != nil {
48		return err
49	}
50
51	for _, c := range b.Compactions {
52		if err := c.c.Print(w); err != nil {
53			return err
54		}
55	}
56
57	return nil
58}
59
60func printValues(n int, values []uint64) string {
61	w := &bytes.Buffer{}
62	boff := n * blockSize
63	fmt.Fprintf(w, "\t// Block %#x, offset %#x", n, boff)
64	var newline bool
65	for i, v := range values {
66		if i%6 == 0 {
67			newline = true
68		}
69		if v != 0 {
70			if newline {
71				fmt.Fprintf(w, "\n")
72				newline = false
73			}
74			fmt.Fprintf(w, "\t%#02x:%#04x, ", boff+i, v)
75		}
76	}
77	return w.String()
78}
79
80func printIndex(b *builder, nr int, n *node) string {
81	w := &bytes.Buffer{}
82	boff := nr * blockSize
83	fmt.Fprintf(w, "\t// Block %#x, offset %#x", nr, boff)
84	var newline bool
85	for i, c := range n.children {
86		if i%8 == 0 {
87			newline = true
88		}
89		if c != nil {
90			v := b.Compactions[c.index.compaction].Offset + uint32(c.index.index)
91			if v != 0 {
92				if newline {
93					fmt.Fprintf(w, "\n")
94					newline = false
95				}
96				fmt.Fprintf(w, "\t%#02x:%#02x, ", boff+i, v)
97			}
98		}
99	}
100	return w.String()
101}
102
103var (
104	trieGen = template.Must(template.New("trie").Funcs(template.FuncMap{
105		"printValues": printValues,
106		"printIndex":  printIndex,
107		"title":       strings.Title,
108		"dec":         func(x int) int { return x - 1 },
109		"psize": func(n int) string {
110			return fmt.Sprintf("%d bytes (%.2f KiB)", n, float64(n)/1024)
111		},
112	}).Parse(trieTemplate))
113	lookupGen = template.Must(template.New("lookup").Parse(lookupTemplate))
114)
115
116// TODO: consider the return type of lookup. It could be uint64, even if the
117// internal value type is smaller. We will have to verify this with the
118// performance of unicode/norm, which is very sensitive to such changes.
119const trieTemplate = `{{$b := .}}{{$multi := gt (len .Trie) 1}}
120// {{.Name}}Trie. Total size: {{psize .Size}}. Checksum: {{printf "%08x" .Checksum}}.
121type {{.Name}}Trie struct { {{if $multi}}
122	ascii []{{.ValueType}} // index for ASCII bytes
123	utf8Start  []{{.IndexType}} // index for UTF-8 bytes >= 0xC0
124{{end}}}
125
126func new{{title .Name}}Trie(i int) *{{.Name}}Trie { {{if $multi}}
127	h := {{.Name}}TrieHandles[i]
128	return &{{.Name}}Trie{ {{.Name}}Values[uint32(h.ascii)<<6:], {{.Name}}Index[uint32(h.multi)<<6:] }
129}
130
131type {{.Name}}TrieHandle struct {
132	ascii, multi {{.IndexType}}
133}
134
135// {{.Name}}TrieHandles: {{len .Trie}} handles, {{.Stats.NHandleBytes}} bytes
136var {{.Name}}TrieHandles = [{{len .Trie}}]{{.Name}}TrieHandle{
137{{range .Trie}}	{ {{.ASCIIIndex}}, {{.StarterIndex}} }, // {{printf "%08x" .Checksum}}: {{.Name}}
138{{end}}}{{else}}
139	return &{{.Name}}Trie{}
140}
141{{end}}
142// lookupValue determines the type of block n and looks up the value for b.
143func (t *{{.Name}}Trie) lookupValue(n uint32, b byte) {{.ValueType}}{{$last := dec (len .Compactions)}} {
144	switch { {{range $i, $c := .Compactions}}
145		{{if eq $i $last}}default{{else}}case n < {{$c.Cutoff}}{{end}}:{{if ne $i 0}}
146			n -= {{$c.Offset}}{{end}}
147			return {{print $b.ValueType}}({{$c.Handler}}){{end}}
148	}
149}
150
151// {{.Name}}Values: {{len .ValueBlocks}} blocks, {{.Stats.NValueEntries}} entries, {{.Stats.NValueBytes}} bytes
152// The third block is the zero block.
153var {{.Name}}Values = [{{.Stats.NValueEntries}}]{{.ValueType}} {
154{{range $i, $v := .ValueBlocks}}{{printValues $i $v}}
155{{end}}}
156
157// {{.Name}}Index: {{len .IndexBlocks}} blocks, {{.Stats.NIndexEntries}} entries, {{.Stats.NIndexBytes}} bytes
158// Block 0 is the zero block.
159var {{.Name}}Index = [{{.Stats.NIndexEntries}}]{{.IndexType}} {
160{{range $i, $v := .IndexBlocks}}{{printIndex $b $i $v}}
161{{end}}}
162`
163
164// TODO: consider allowing zero-length strings after evaluating performance with
165// unicode/norm.
166const lookupTemplate = `
167// lookup{{if eq .SourceType "string"}}String{{end}} returns the trie value for the first UTF-8 encoding in s and
168// the width in bytes of this encoding. The size will be 0 if s does not
169// hold enough bytes to complete the encoding. len(s) must be greater than 0.
170func (t *{{.Name}}Trie) lookup{{if eq .SourceType "string"}}String{{end}}(s {{.SourceType}}) (v {{.ValueType}}, sz int) {
171	c0 := s[0]
172	switch {
173	case c0 < 0x80: // is ASCII
174		return {{.ASCIIBlock}}[c0], 1
175	case c0 < 0xC2:
176		return 0, 1  // Illegal UTF-8: not a starter, not ASCII.
177	case c0 < 0xE0: // 2-byte UTF-8
178		if len(s) < 2 {
179			return 0, 0
180		}
181		i := {{.StarterBlock}}[c0]
182		c1 := s[1]
183		if c1 < 0x80 || 0xC0 <= c1 {
184			return 0, 1 // Illegal UTF-8: not a continuation byte.
185		}
186		return t.lookupValue(uint32(i), c1), 2
187	case c0 < 0xF0: // 3-byte UTF-8
188		if len(s) < 3 {
189			return 0, 0
190		}
191		i := {{.StarterBlock}}[c0]
192		c1 := s[1]
193		if c1 < 0x80 || 0xC0 <= c1 {
194			return 0, 1 // Illegal UTF-8: not a continuation byte.
195		}
196		o := uint32(i)<<6 + uint32(c1)
197		i = {{.Name}}Index[o]
198		c2 := s[2]
199		if c2 < 0x80 || 0xC0 <= c2 {
200			return 0, 2 // Illegal UTF-8: not a continuation byte.
201		}
202		return t.lookupValue(uint32(i), c2), 3
203	case c0 < 0xF8: // 4-byte UTF-8
204		if len(s) < 4 {
205			return 0, 0
206		}
207		i := {{.StarterBlock}}[c0]
208		c1 := s[1]
209		if c1 < 0x80 || 0xC0 <= c1 {
210			return 0, 1 // Illegal UTF-8: not a continuation byte.
211		}
212		o := uint32(i)<<6 + uint32(c1)
213		i = {{.Name}}Index[o]
214		c2 := s[2]
215		if c2 < 0x80 || 0xC0 <= c2 {
216			return 0, 2 // Illegal UTF-8: not a continuation byte.
217		}
218		o = uint32(i)<<6 + uint32(c2)
219		i = {{.Name}}Index[o]
220		c3 := s[3]
221		if c3 < 0x80 || 0xC0 <= c3 {
222			return 0, 3 // Illegal UTF-8: not a continuation byte.
223		}
224		return t.lookupValue(uint32(i), c3), 4
225	}
226	// Illegal rune
227	return 0, 1
228}
229
230// lookup{{if eq .SourceType "string"}}String{{end}}Unsafe returns the trie value for the first UTF-8 encoding in s.
231// s must start with a full and valid UTF-8 encoded rune.
232func (t *{{.Name}}Trie) lookup{{if eq .SourceType "string"}}String{{end}}Unsafe(s {{.SourceType}}) {{.ValueType}} {
233	c0 := s[0]
234	if c0 < 0x80 { // is ASCII
235		return {{.ASCIIBlock}}[c0]
236	}
237	i := {{.StarterBlock}}[c0]
238	if c0 < 0xE0 { // 2-byte UTF-8
239		return t.lookupValue(uint32(i), s[1])
240	}
241	i = {{.Name}}Index[uint32(i)<<6+uint32(s[1])]
242	if c0 < 0xF0 { // 3-byte UTF-8
243		return t.lookupValue(uint32(i), s[2])
244	}
245	i = {{.Name}}Index[uint32(i)<<6+uint32(s[2])]
246	if c0 < 0xF8 { // 4-byte UTF-8
247		return t.lookupValue(uint32(i), s[3])
248	}
249	return 0
250}
251`
252