1// Copyright 2011 The Go Authors. All rights reserved. 2// Use of this source code is governed by a BSD-style 3// license that can be found in the LICENSE file. 4 5// +build ignore 6 7// Trie table generator. 8// Used by make*tables tools to generate a go file with trie data structures 9// for mapping UTF-8 to a 16-bit value. All but the last byte in a UTF-8 byte 10// sequence are used to lookup offsets in the index table to be used for the 11// next byte. The last byte is used to index into a table with 16-bit values. 12 13package main 14 15import ( 16 "fmt" 17 "hash/crc32" 18 "log" 19 "unicode/utf8" 20) 21 22const ( 23 blockSize = 64 24 blockOffset = 2 // Subtract two blocks to compensate for the 0x80 added to continuation bytes. 25 maxSparseEntries = 16 26) 27 28// Intermediate trie structure 29type trieNode struct { 30 table [256]*trieNode 31 value int 32 b byte 33 leaf bool 34} 35 36func newNode() *trieNode { 37 return new(trieNode) 38} 39 40func (n trieNode) String() string { 41 s := fmt.Sprint("trieNode{table: { non-nil at index: ") 42 for i, v := range n.table { 43 if v != nil { 44 s += fmt.Sprintf("%d, ", i) 45 } 46 } 47 s += fmt.Sprintf("}, value:%#x, b:%#x leaf:%v}", n.value, n.b, n.leaf) 48 return s 49} 50 51func (n trieNode) isInternal() bool { 52 internal := true 53 for i := 0; i < 256; i++ { 54 if nn := n.table[i]; nn != nil { 55 if !internal && !nn.leaf { 56 log.Fatalf("triegen: isInternal: node contains both leaf and non-leaf children (%v)", n) 57 } 58 internal = internal && !nn.leaf 59 } 60 } 61 return internal 62} 63 64func (n trieNode) mostFrequentStride() int { 65 counts := make(map[int]int) 66 v := 0 67 for _, t := range n.table[0x80 : 0x80+blockSize] { 68 if t != nil { 69 if stride := t.value - v; v != 0 && stride >= 0 { 70 counts[stride]++ 71 } 72 v = t.value 73 } else { 74 v = 0 75 } 76 } 77 var maxs, maxc int 78 for stride, cnt := range counts { 79 if cnt > maxc || (cnt == maxc && stride < maxs) { 80 maxs, maxc = stride, cnt 81 } 82 } 83 return maxs 84} 85 86func (n trieNode) countSparseEntries() int { 87 stride := n.mostFrequentStride() 88 var count, v int 89 for _, t := range n.table[0x80 : 0x80+blockSize] { 90 tv := 0 91 if t != nil { 92 tv = t.value 93 } 94 if tv-v != stride { 95 if tv != 0 { 96 count++ 97 } 98 } 99 v = tv 100 } 101 return count 102} 103 104func (n *trieNode) insert(r rune, value uint16) { 105 var p [utf8.UTFMax]byte 106 sz := utf8.EncodeRune(p[:], r) 107 108 for i := 0; i < sz; i++ { 109 if n.leaf { 110 log.Fatalf("triegen: insert: node (%#v) should not be a leaf", n) 111 } 112 nn := n.table[p[i]] 113 if nn == nil { 114 nn = newNode() 115 nn.b = p[i] 116 n.table[p[i]] = nn 117 } 118 n = nn 119 } 120 n.value = int(value) 121 n.leaf = true 122} 123 124type nodeIndex struct { 125 lookupBlocks []*trieNode 126 valueBlocks []*trieNode 127 sparseBlocks []*trieNode 128 sparseOffset []uint16 129 sparseCount int 130 131 lookupBlockIdx map[uint32]int 132 valueBlockIdx map[uint32]int 133} 134 135func newIndex() *nodeIndex { 136 index := &nodeIndex{} 137 index.lookupBlocks = make([]*trieNode, 0) 138 index.valueBlocks = make([]*trieNode, 0) 139 index.sparseBlocks = make([]*trieNode, 0) 140 index.sparseOffset = make([]uint16, 1) 141 index.lookupBlockIdx = make(map[uint32]int) 142 index.valueBlockIdx = make(map[uint32]int) 143 return index 144} 145 146func computeOffsets(index *nodeIndex, n *trieNode) int { 147 if n.leaf { 148 return n.value 149 } 150 hasher := crc32.New(crc32.MakeTable(crc32.IEEE)) 151 // We only index continuation bytes. 152 for i := 0; i < blockSize; i++ { 153 v := 0 154 if nn := n.table[0x80+i]; nn != nil { 155 v = computeOffsets(index, nn) 156 } 157 hasher.Write([]byte{uint8(v >> 8), uint8(v)}) 158 } 159 h := hasher.Sum32() 160 if n.isInternal() { 161 v, ok := index.lookupBlockIdx[h] 162 if !ok { 163 v = len(index.lookupBlocks) - blockOffset 164 index.lookupBlocks = append(index.lookupBlocks, n) 165 index.lookupBlockIdx[h] = v 166 } 167 n.value = v 168 } else { 169 v, ok := index.valueBlockIdx[h] 170 if !ok { 171 if c := n.countSparseEntries(); c > maxSparseEntries { 172 v = len(index.valueBlocks) - blockOffset 173 index.valueBlocks = append(index.valueBlocks, n) 174 index.valueBlockIdx[h] = v 175 } else { 176 v = -len(index.sparseOffset) 177 index.sparseBlocks = append(index.sparseBlocks, n) 178 index.sparseOffset = append(index.sparseOffset, uint16(index.sparseCount)) 179 index.sparseCount += c + 1 180 index.valueBlockIdx[h] = v 181 } 182 } 183 n.value = v 184 } 185 return n.value 186} 187 188func printValueBlock(nr int, n *trieNode, offset int) { 189 boff := nr * blockSize 190 fmt.Printf("\n// Block %#x, offset %#x", nr, boff) 191 var printnewline bool 192 for i := 0; i < blockSize; i++ { 193 if i%6 == 0 { 194 printnewline = true 195 } 196 v := 0 197 if nn := n.table[i+offset]; nn != nil { 198 v = nn.value 199 } 200 if v != 0 { 201 if printnewline { 202 fmt.Printf("\n") 203 printnewline = false 204 } 205 fmt.Printf("%#04x:%#04x, ", boff+i, v) 206 } 207 } 208} 209 210func printSparseBlock(nr int, n *trieNode) { 211 boff := -n.value 212 fmt.Printf("\n// Block %#x, offset %#x", nr, boff) 213 v := 0 214 //stride := f(n) 215 stride := n.mostFrequentStride() 216 c := n.countSparseEntries() 217 fmt.Printf("\n{value:%#04x,lo:%#02x},", stride, uint8(c)) 218 for i, nn := range n.table[0x80 : 0x80+blockSize] { 219 nv := 0 220 if nn != nil { 221 nv = nn.value 222 } 223 if nv-v != stride { 224 if v != 0 { 225 fmt.Printf(",hi:%#02x},", 0x80+i-1) 226 } 227 if nv != 0 { 228 fmt.Printf("\n{value:%#04x,lo:%#02x", nv, nn.b) 229 } 230 } 231 v = nv 232 } 233 if v != 0 { 234 fmt.Printf(",hi:%#02x},", 0x80+blockSize-1) 235 } 236} 237 238func printLookupBlock(nr int, n *trieNode, offset, cutoff int) { 239 boff := nr * blockSize 240 fmt.Printf("\n// Block %#x, offset %#x", nr, boff) 241 var printnewline bool 242 for i := 0; i < blockSize; i++ { 243 if i%8 == 0 { 244 printnewline = true 245 } 246 v := 0 247 if nn := n.table[i+offset]; nn != nil { 248 v = nn.value 249 } 250 if v != 0 { 251 if v < 0 { 252 v = -v - 1 + cutoff 253 } 254 if printnewline { 255 fmt.Printf("\n") 256 printnewline = false 257 } 258 fmt.Printf("%#03x:%#02x, ", boff+i, v) 259 } 260 } 261} 262 263// printTables returns the size in bytes of the generated tables. 264func (t *trieNode) printTables(name string) int { 265 index := newIndex() 266 // Values for 7-bit ASCII are stored in first two block, followed by nil block. 267 index.valueBlocks = append(index.valueBlocks, nil, nil, nil) 268 // First byte of multi-byte UTF-8 codepoints are indexed in 4th block. 269 index.lookupBlocks = append(index.lookupBlocks, nil, nil, nil, nil) 270 // Index starter bytes of multi-byte UTF-8. 271 for i := 0xC0; i < 0x100; i++ { 272 if t.table[i] != nil { 273 computeOffsets(index, t.table[i]) 274 } 275 } 276 277 nv := len(index.valueBlocks) * blockSize 278 fmt.Printf("// %sValues: %d entries, %d bytes\n", name, nv, nv*2) 279 fmt.Printf("// Block 2 is the null block.\n") 280 fmt.Printf("var %sValues = [%d]uint16 {", name, nv) 281 printValueBlock(0, t, 0) 282 printValueBlock(1, t, 64) 283 printValueBlock(2, newNode(), 0) 284 for i := 3; i < len(index.valueBlocks); i++ { 285 printValueBlock(i, index.valueBlocks[i], 0x80) 286 } 287 fmt.Print("\n}\n\n") 288 289 ls := len(index.sparseBlocks) 290 fmt.Printf("// %sSparseOffset: %d entries, %d bytes\n", name, ls, ls*2) 291 fmt.Printf("var %sSparseOffset = %#v\n\n", name, index.sparseOffset[1:]) 292 293 ns := index.sparseCount 294 fmt.Printf("// %sSparseValues: %d entries, %d bytes\n", name, ns, ns*4) 295 fmt.Printf("var %sSparseValues = [%d]valueRange {", name, ns) 296 for i, n := range index.sparseBlocks { 297 printSparseBlock(i, n) 298 } 299 fmt.Print("\n}\n\n") 300 301 cutoff := len(index.valueBlocks) - blockOffset 302 ni := len(index.lookupBlocks) * blockSize 303 fmt.Printf("// %sLookup: %d bytes\n", name, ni) 304 fmt.Printf("// Block 0 is the null block.\n") 305 fmt.Printf("var %sLookup = [%d]uint8 {", name, ni) 306 printLookupBlock(0, newNode(), 0, cutoff) 307 printLookupBlock(1, newNode(), 0, cutoff) 308 printLookupBlock(2, newNode(), 0, cutoff) 309 printLookupBlock(3, t, 0xC0, cutoff) 310 for i := 4; i < len(index.lookupBlocks); i++ { 311 printLookupBlock(i, index.lookupBlocks[i], 0x80, cutoff) 312 } 313 fmt.Print("\n}\n\n") 314 fmt.Printf("var %sTrie = trie{ %sLookup[:], %sValues[:], %sSparseValues[:], %sSparseOffset[:], %d}\n\n", 315 name, name, name, name, name, cutoff) 316 return nv*2 + ns*4 + ni + ls*2 317} 318