1// Copyright (c) 2017 Couchbase, Inc. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// http://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15package vellum 16 17import ( 18 "bytes" 19 "io" 20) 21 22var defaultBuilderOpts = &BuilderOpts{ 23 Encoder: 1, 24 RegistryTableSize: 10000, 25 RegistryMRUSize: 2, 26} 27 28// A Builder is used to build a new FST. When possible data is 29// streamed out to the underlying Writer as soon as possible. 30type Builder struct { 31 unfinished *unfinishedNodes 32 registry *registry 33 last []byte 34 len int 35 36 lastAddr int 37 38 encoder encoder 39 opts *BuilderOpts 40 41 builderNodePool *builderNodePool 42} 43 44const noneAddr = 1 45const emptyAddr = 0 46 47// NewBuilder returns a new Builder which will stream out the 48// underlying representation to the provided Writer as the set is built. 49func newBuilder(w io.Writer, opts *BuilderOpts) (*Builder, error) { 50 if opts == nil { 51 opts = defaultBuilderOpts 52 } 53 builderNodePool := &builderNodePool{} 54 rv := &Builder{ 55 unfinished: newUnfinishedNodes(builderNodePool), 56 registry: newRegistry(builderNodePool, opts.RegistryTableSize, opts.RegistryMRUSize), 57 builderNodePool: builderNodePool, 58 opts: opts, 59 lastAddr: noneAddr, 60 } 61 62 var err error 63 rv.encoder, err = loadEncoder(opts.Encoder, w) 64 if err != nil { 65 return nil, err 66 } 67 err = rv.encoder.start() 68 if err != nil { 69 return nil, err 70 } 71 return rv, nil 72} 73 74func (b *Builder) Reset(w io.Writer) error { 75 b.unfinished.Reset() 76 b.registry.Reset() 77 b.lastAddr = noneAddr 78 b.encoder.reset(w) 79 b.last = nil 80 b.len = 0 81 82 err := b.encoder.start() 83 if err != nil { 84 return err 85 } 86 return nil 87} 88 89// Insert the provided value to the set being built. 90// NOTE: values must be inserted in lexicographical order. 91func (b *Builder) Insert(key []byte, val uint64) error { 92 // ensure items are added in lexicographic order 93 if bytes.Compare(key, b.last) < 0 { 94 return ErrOutOfOrder 95 } 96 if len(key) == 0 { 97 b.len = 1 98 b.unfinished.setRootOutput(val) 99 return nil 100 } 101 102 prefixLen, out := b.unfinished.findCommonPrefixAndSetOutput(key, val) 103 b.len++ 104 err := b.compileFrom(prefixLen) 105 if err != nil { 106 return err 107 } 108 b.copyLastKey(key) 109 b.unfinished.addSuffix(key[prefixLen:], out) 110 111 return nil 112} 113 114func (b *Builder) copyLastKey(key []byte) { 115 if b.last == nil { 116 b.last = make([]byte, 0, 64) 117 } else { 118 b.last = b.last[:0] 119 } 120 b.last = append(b.last, key...) 121} 122 123// Close MUST be called after inserting all values. 124func (b *Builder) Close() error { 125 err := b.compileFrom(0) 126 if err != nil { 127 return err 128 } 129 root := b.unfinished.popRoot() 130 rootAddr, err := b.compile(root) 131 if err != nil { 132 return err 133 } 134 return b.encoder.finish(b.len, rootAddr) 135} 136 137func (b *Builder) compileFrom(iState int) error { 138 addr := noneAddr 139 for iState+1 < len(b.unfinished.stack) { 140 var node *builderNode 141 if addr == noneAddr { 142 node = b.unfinished.popEmpty() 143 } else { 144 node = b.unfinished.popFreeze(addr) 145 } 146 var err error 147 addr, err = b.compile(node) 148 if err != nil { 149 return nil 150 } 151 } 152 b.unfinished.topLastFreeze(addr) 153 return nil 154} 155 156func (b *Builder) compile(node *builderNode) (int, error) { 157 if node.final && len(node.trans) == 0 && 158 node.finalOutput == 0 { 159 return 0, nil 160 } 161 found, addr, entry := b.registry.entry(node) 162 if found { 163 return addr, nil 164 } 165 addr, err := b.encoder.encodeState(node, b.lastAddr) 166 if err != nil { 167 return 0, err 168 } 169 170 b.lastAddr = addr 171 entry.addr = addr 172 return addr, nil 173} 174 175type unfinishedNodes struct { 176 stack []*builderNodeUnfinished 177 178 // cache allocates a reasonable number of builderNodeUnfinished 179 // objects up front and tries to keep reusing them 180 // because the main data structure is a stack, we assume the 181 // same access pattern, and don't track items separately 182 // this means calls get() and pushXYZ() must be paired, 183 // as well as calls put() and popXYZ() 184 cache []builderNodeUnfinished 185 186 builderNodePool *builderNodePool 187} 188 189func (u *unfinishedNodes) Reset() { 190 u.stack = u.stack[:0] 191 for i := 0; i < len(u.cache); i++ { 192 u.cache[i] = builderNodeUnfinished{} 193 } 194 u.pushEmpty(false) 195} 196 197func newUnfinishedNodes(p *builderNodePool) *unfinishedNodes { 198 rv := &unfinishedNodes{ 199 stack: make([]*builderNodeUnfinished, 0, 64), 200 cache: make([]builderNodeUnfinished, 64), 201 builderNodePool: p, 202 } 203 rv.pushEmpty(false) 204 return rv 205} 206 207// get new builderNodeUnfinished, reusing cache if possible 208func (u *unfinishedNodes) get() *builderNodeUnfinished { 209 if len(u.stack) < len(u.cache) { 210 return &u.cache[len(u.stack)] 211 } 212 // full now allocate a new one 213 return &builderNodeUnfinished{} 214} 215 216// return builderNodeUnfinished, clearing it for reuse 217func (u *unfinishedNodes) put() { 218 if len(u.stack) >= len(u.cache) { 219 return 220 // do nothing, not part of cache 221 } 222 u.cache[len(u.stack)] = builderNodeUnfinished{} 223} 224 225func (u *unfinishedNodes) findCommonPrefixAndSetOutput(key []byte, 226 out uint64) (int, uint64) { 227 var i int 228 for i < len(key) { 229 if i >= len(u.stack) { 230 break 231 } 232 var addPrefix uint64 233 if !u.stack[i].hasLastT { 234 break 235 } 236 if u.stack[i].lastIn == key[i] { 237 commonPre := outputPrefix(u.stack[i].lastOut, out) 238 addPrefix = outputSub(u.stack[i].lastOut, commonPre) 239 out = outputSub(out, commonPre) 240 u.stack[i].lastOut = commonPre 241 i++ 242 } else { 243 break 244 } 245 246 if addPrefix != 0 { 247 u.stack[i].addOutputPrefix(addPrefix) 248 } 249 } 250 251 return i, out 252} 253 254func (u *unfinishedNodes) pushEmpty(final bool) { 255 next := u.get() 256 next.node = u.builderNodePool.Get() 257 next.node.final = final 258 u.stack = append(u.stack, next) 259} 260 261func (u *unfinishedNodes) popRoot() *builderNode { 262 l := len(u.stack) 263 var unfinished *builderNodeUnfinished 264 u.stack, unfinished = u.stack[:l-1], u.stack[l-1] 265 rv := unfinished.node 266 u.put() 267 return rv 268} 269 270func (u *unfinishedNodes) popFreeze(addr int) *builderNode { 271 l := len(u.stack) 272 var unfinished *builderNodeUnfinished 273 u.stack, unfinished = u.stack[:l-1], u.stack[l-1] 274 unfinished.lastCompiled(addr) 275 rv := unfinished.node 276 u.put() 277 return rv 278} 279 280func (u *unfinishedNodes) popEmpty() *builderNode { 281 l := len(u.stack) 282 var unfinished *builderNodeUnfinished 283 u.stack, unfinished = u.stack[:l-1], u.stack[l-1] 284 rv := unfinished.node 285 u.put() 286 return rv 287} 288 289func (u *unfinishedNodes) setRootOutput(out uint64) { 290 u.stack[0].node.final = true 291 u.stack[0].node.finalOutput = out 292} 293 294func (u *unfinishedNodes) topLastFreeze(addr int) { 295 last := len(u.stack) - 1 296 u.stack[last].lastCompiled(addr) 297} 298 299func (u *unfinishedNodes) addSuffix(bs []byte, out uint64) { 300 if len(bs) == 0 { 301 return 302 } 303 last := len(u.stack) - 1 304 u.stack[last].hasLastT = true 305 u.stack[last].lastIn = bs[0] 306 u.stack[last].lastOut = out 307 for _, b := range bs[1:] { 308 next := u.get() 309 next.node = u.builderNodePool.Get() 310 next.hasLastT = true 311 next.lastIn = b 312 next.lastOut = 0 313 u.stack = append(u.stack, next) 314 } 315 u.pushEmpty(true) 316} 317 318type builderNodeUnfinished struct { 319 node *builderNode 320 lastOut uint64 321 lastIn byte 322 hasLastT bool 323} 324 325func (b *builderNodeUnfinished) lastCompiled(addr int) { 326 if b.hasLastT { 327 transIn := b.lastIn 328 transOut := b.lastOut 329 b.hasLastT = false 330 b.lastOut = 0 331 b.node.trans = append(b.node.trans, transition{ 332 in: transIn, 333 out: transOut, 334 addr: addr, 335 }) 336 } 337} 338 339func (b *builderNodeUnfinished) addOutputPrefix(prefix uint64) { 340 if b.node.final { 341 b.node.finalOutput = outputCat(prefix, b.node.finalOutput) 342 } 343 for i := range b.node.trans { 344 b.node.trans[i].out = outputCat(prefix, b.node.trans[i].out) 345 } 346 if b.hasLastT { 347 b.lastOut = outputCat(prefix, b.lastOut) 348 } 349} 350 351type builderNode struct { 352 finalOutput uint64 353 trans []transition 354 final bool 355 356 // intrusive linked list 357 next *builderNode 358} 359 360// reset resets the receiver builderNode to a re-usable state. 361func (n *builderNode) reset() { 362 n.final = false 363 n.finalOutput = 0 364 for i := range n.trans { 365 n.trans[i] = emptyTransition 366 } 367 n.trans = n.trans[:0] 368 n.next = nil 369} 370 371func (n *builderNode) equiv(o *builderNode) bool { 372 if n.final != o.final { 373 return false 374 } 375 if n.finalOutput != o.finalOutput { 376 return false 377 } 378 if len(n.trans) != len(o.trans) { 379 return false 380 } 381 for i, ntrans := range n.trans { 382 otrans := o.trans[i] 383 if ntrans.in != otrans.in { 384 return false 385 } 386 if ntrans.addr != otrans.addr { 387 return false 388 } 389 if ntrans.out != otrans.out { 390 return false 391 } 392 } 393 return true 394} 395 396var emptyTransition = transition{} 397 398type transition struct { 399 out uint64 400 addr int 401 in byte 402} 403 404func outputPrefix(l, r uint64) uint64 { 405 if l < r { 406 return l 407 } 408 return r 409} 410 411func outputSub(l, r uint64) uint64 { 412 return l - r 413} 414 415func outputCat(l, r uint64) uint64 { 416 return l + r 417} 418 419// builderNodePool pools builderNodes using a singly linked list. 420// 421// NB: builderNode lifecylce is described by the following interactions - 422// +------------------------+ +----------------------+ 423// | Unfinished Nodes | Transfer once | Registry | 424// |(not frozen builderNode)|-----builderNode is ------->| (frozen builderNode) | 425// +------------------------+ marked frozen +----------------------+ 426// ^ | 427// | | 428// | Put() 429// | Get() on +-------------------+ when 430// +-new char--------| builderNode Pool |<-----------evicted 431// +-------------------+ 432type builderNodePool struct { 433 head *builderNode 434} 435 436func (p *builderNodePool) Get() *builderNode { 437 if p.head == nil { 438 return &builderNode{} 439 } 440 head := p.head 441 p.head = p.head.next 442 return head 443} 444 445func (p *builderNodePool) Put(v *builderNode) { 446 if v == nil { 447 return 448 } 449 v.reset() 450 v.next = p.head 451 p.head = v 452} 453