//  Copyright (c) 2017 Couchbase, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// 		http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package vellum

import (
	"bytes"
	"io"
)

var defaultBuilderOpts = &BuilderOpts{
	Encoder:           1,
	RegistryTableSize: 10000,
	RegistryMRUSize:   2,
}
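
// customOptsSketch is an illustrative sketch added for documentation and is
// not part of the original file. It shows how a caller might override the
// defaults above, for example with a larger registry table to improve node
// reuse at the cost of memory. The concrete values are assumptions, not
// recommendations.
func customOptsSketch(w io.Writer) (*Builder, error) {
	return newBuilder(w, &BuilderOpts{
		Encoder:           1,      // same encoding version as the default
		RegistryTableSize: 100000, // hypothetical larger table
		RegistryMRUSize:   2,
	})
}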

// A Builder is used to build a new FST.  Data is streamed out to the
// underlying Writer as soon as possible.
type Builder struct {
	unfinished *unfinishedNodes
	registry   *registry
	last       []byte
	len        int

	lastAddr int

	encoder encoder
	opts    *BuilderOpts

	builderNodePool *builderNodePool
}

const noneAddr = 1
const emptyAddr = 0

// newBuilder returns a new Builder which will stream out the
// underlying representation to the provided Writer as the set is built.
func newBuilder(w io.Writer, opts *BuilderOpts) (*Builder, error) {
	if opts == nil {
		opts = defaultBuilderOpts
	}
	builderNodePool := &builderNodePool{}
	rv := &Builder{
		unfinished:      newUnfinishedNodes(builderNodePool),
		registry:        newRegistry(builderNodePool, opts.RegistryTableSize, opts.RegistryMRUSize),
		builderNodePool: builderNodePool,
		opts:            opts,
		lastAddr:        noneAddr,
	}

	var err error
	rv.encoder, err = loadEncoder(opts.Encoder, w)
	if err != nil {
		return nil, err
	}
	err = rv.encoder.start()
	if err != nil {
		return nil, err
	}
	return rv, nil
}
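
// exampleBuilderUsage is an illustrative sketch added for documentation and
// is not part of the original file. It shows the intended call sequence:
// construct a Builder over a Writer, Insert keys in lexicographic order,
// then Close to compile the remaining nodes and write the trailer. The key
// and value literals are arbitrary.
func exampleBuilderUsage(w io.Writer) error {
	b, err := newBuilder(w, nil) // nil opts falls back to defaultBuilderOpts
	if err != nil {
		return err
	}
	if err = b.Insert([]byte("cat"), 1); err != nil {
		return err
	}
	if err = b.Insert([]byte("dog"), 2); err != nil {
		return err
	}
	// Close MUST be called after the last Insert.
	return b.Close()
}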

// Reset resets the builder state so the Builder can be reused to build
// another FST against the provided Writer.
func (b *Builder) Reset(w io.Writer) error {
	b.unfinished.Reset()
	b.registry.Reset()
	b.lastAddr = noneAddr
	b.encoder.reset(w)
	b.last = nil
	b.len = 0

	err := b.encoder.start()
	if err != nil {
		return err
	}
	return nil
}

// Insert the provided key/value pair into the set being built.
// NOTE: keys must be inserted in lexicographic order.
func (b *Builder) Insert(key []byte, val uint64) error {
	// ensure items are added in lexicographic order
	if bytes.Compare(key, b.last) < 0 {
		return ErrOutOfOrder
	}
	if len(key) == 0 {
		b.len = 1
		b.unfinished.setRootOutput(val)
		return nil
	}

	prefixLen, out := b.unfinished.findCommonPrefixAndSetOutput(key, val)
	b.len++
	err := b.compileFrom(prefixLen)
	if err != nil {
		return err
	}
	b.copyLastKey(key)
	b.unfinished.addSuffix(key[prefixLen:], out)

	return nil
}
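
// insertOrderSketch is an illustrative sketch added for documentation and is
// not part of the original file. It demonstrates the ordering contract on
// Insert: a key that sorts before the previously inserted key is expected to
// be rejected with ErrOutOfOrder.
func insertOrderSketch(b *Builder) error {
	if err := b.Insert([]byte("dog"), 2); err != nil {
		return err
	}
	// "cat" sorts before "dog", so this call should return ErrOutOfOrder.
	return b.Insert([]byte("cat"), 1)
}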

func (b *Builder) copyLastKey(key []byte) {
	if b.last == nil {
		b.last = make([]byte, 0, 64)
	} else {
		b.last = b.last[:0]
	}
	b.last = append(b.last, key...)
}

// Close MUST be called after inserting all values.
func (b *Builder) Close() error {
	err := b.compileFrom(0)
	if err != nil {
		return err
	}
	root := b.unfinished.popRoot()
	rootAddr, err := b.compile(root)
	if err != nil {
		return err
	}
	return b.encoder.finish(b.len, rootAddr)
}

// compileFrom compiles all unfinished nodes above position iState on the
// stack, then freezes the resulting address into the node at iState.
func (b *Builder) compileFrom(iState int) error {
	addr := noneAddr
	for iState+1 < len(b.unfinished.stack) {
		var node *builderNode
		if addr == noneAddr {
			node = b.unfinished.popEmpty()
		} else {
			node = b.unfinished.popFreeze(addr)
		}
		var err error
		addr, err = b.compile(node)
		if err != nil {
			return err
		}
	}
	b.unfinished.topLastFreeze(addr)
	return nil
}

// compile encodes the node unless an equivalent node is already present in
// the registry, in which case the existing address is reused (minimization).
func (b *Builder) compile(node *builderNode) (int, error) {
	if node.final && len(node.trans) == 0 &&
		node.finalOutput == 0 {
		return emptyAddr, nil
	}
	found, addr, entry := b.registry.entry(node)
	if found {
		return addr, nil
	}
	addr, err := b.encoder.encodeState(node, b.lastAddr)
	if err != nil {
		return 0, err
	}

	b.lastAddr = addr
	entry.addr = addr
	return addr, nil
}

type unfinishedNodes struct {
	stack []*builderNodeUnfinished

	// cache allocates a reasonable number of builderNodeUnfinished
	// objects up front and tries to keep reusing them.
	// Because the main data structure is a stack, we assume the
	// same access pattern and don't track items separately;
	// this means calls to get() and pushXYZ() must be paired,
	// as must calls to put() and popXYZ().
	cache []builderNodeUnfinished

	builderNodePool *builderNodePool
}

func (u *unfinishedNodes) Reset() {
	u.stack = u.stack[:0]
	for i := 0; i < len(u.cache); i++ {
		u.cache[i] = builderNodeUnfinished{}
	}
	u.pushEmpty(false)
}

func newUnfinishedNodes(p *builderNodePool) *unfinishedNodes {
	rv := &unfinishedNodes{
		stack:           make([]*builderNodeUnfinished, 0, 64),
		cache:           make([]builderNodeUnfinished, 64),
		builderNodePool: p,
	}
	rv.pushEmpty(false)
	return rv
}

// get returns a new builderNodeUnfinished, reusing the cache if possible
func (u *unfinishedNodes) get() *builderNodeUnfinished {
	if len(u.stack) < len(u.cache) {
		return &u.cache[len(u.stack)]
	}
	// cache is full, allocate a new one
	return &builderNodeUnfinished{}
}

// put returns the most recently popped builderNodeUnfinished to the cache,
// clearing its slot for reuse
func (u *unfinishedNodes) put() {
	if len(u.stack) >= len(u.cache) {
		// do nothing, not part of the cache
		return
	}
	u.cache[len(u.stack)] = builderNodeUnfinished{}
}
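
// pairedGetPut is an illustrative sketch added for documentation and is not
// part of the original file. It spells out the pairing contract noted above:
// get() hands out the cache slot indexed by the current stack depth, the
// paired push occupies it, and after the matching pop, put() clears that
// same slot so it can be handed out again.
func pairedGetPut(u *unfinishedNodes) {
	next := u.get()                 // reserves cache slot len(u.stack), if one is free
	u.stack = append(u.stack, next) // push pairs with the get() above

	u.stack = u.stack[:len(u.stack)-1] // pop leaves the slot's contents stale
	u.put()                            // paired put() clears the slot for reuse
}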

func (u *unfinishedNodes) findCommonPrefixAndSetOutput(key []byte,
	out uint64) (int, uint64) {
	var i int
	for i < len(key) {
		if i >= len(u.stack) {
			break
		}
		var addPrefix uint64
		if !u.stack[i].hasLastT {
			break
		}
		if u.stack[i].lastIn == key[i] {
			commonPre := outputPrefix(u.stack[i].lastOut, out)
			addPrefix = outputSub(u.stack[i].lastOut, commonPre)
			out = outputSub(out, commonPre)
			u.stack[i].lastOut = commonPre
			i++
		} else {
			break
		}

		if addPrefix != 0 {
			u.stack[i].addOutputPrefix(addPrefix)
		}
	}

	return i, out
}
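
// exampleOutputRedistribution is an illustrative sketch added for
// documentation and is not part of the original file. It walks the prefix
// arithmetic above with concrete numbers: after "cab" is added with output 5,
// the root's 'c' transition carries 5. Sharing the prefix "ca" with "cat"
// (output 3) keeps min(5, 3) = 3 on 'c', pushes the excess 2 down toward
// "cab"'s tail, and leaves "cat" with a remaining output of 0, so each key
// still sums to its value along its path.
func exampleOutputRedistribution() (prefixLen int, remaining uint64) {
	u := newUnfinishedNodes(&builderNodePool{})

	// Simulate the unfinished stack after Insert([]byte("cab"), 5).
	u.addSuffix([]byte("cab"), 5)

	// Find the common prefix for Insert([]byte("cat"), 3).
	// Expected result: prefixLen == 2, remaining == 0.
	return u.findCommonPrefixAndSetOutput([]byte("cat"), 3)
}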

func (u *unfinishedNodes) pushEmpty(final bool) {
	next := u.get()
	next.node = u.builderNodePool.Get()
	next.node.final = final
	u.stack = append(u.stack, next)
}

func (u *unfinishedNodes) popRoot() *builderNode {
	l := len(u.stack)
	var unfinished *builderNodeUnfinished
	u.stack, unfinished = u.stack[:l-1], u.stack[l-1]
	rv := unfinished.node
	u.put()
	return rv
}

func (u *unfinishedNodes) popFreeze(addr int) *builderNode {
	l := len(u.stack)
	var unfinished *builderNodeUnfinished
	u.stack, unfinished = u.stack[:l-1], u.stack[l-1]
	unfinished.lastCompiled(addr)
	rv := unfinished.node
	u.put()
	return rv
}

func (u *unfinishedNodes) popEmpty() *builderNode {
	l := len(u.stack)
	var unfinished *builderNodeUnfinished
	u.stack, unfinished = u.stack[:l-1], u.stack[l-1]
	rv := unfinished.node
	u.put()
	return rv
}

func (u *unfinishedNodes) setRootOutput(out uint64) {
	u.stack[0].node.final = true
	u.stack[0].node.finalOutput = out
}

func (u *unfinishedNodes) topLastFreeze(addr int) {
	last := len(u.stack) - 1
	u.stack[last].lastCompiled(addr)
}

func (u *unfinishedNodes) addSuffix(bs []byte, out uint64) {
	if len(bs) == 0 {
		return
	}
	last := len(u.stack) - 1
	u.stack[last].hasLastT = true
	u.stack[last].lastIn = bs[0]
	u.stack[last].lastOut = out
	for _, b := range bs[1:] {
		next := u.get()
		next.node = u.builderNodePool.Get()
		next.hasLastT = true
		next.lastIn = b
		next.lastOut = 0
		u.stack = append(u.stack, next)
	}
	u.pushEmpty(true)
}

type builderNodeUnfinished struct {
	node     *builderNode
	lastOut  uint64
	lastIn   byte
	hasLastT bool
}

func (b *builderNodeUnfinished) lastCompiled(addr int) {
	if b.hasLastT {
		transIn := b.lastIn
		transOut := b.lastOut
		b.hasLastT = false
		b.lastOut = 0
		b.node.trans = append(b.node.trans, transition{
			in:   transIn,
			out:  transOut,
			addr: addr,
		})
	}
}

func (b *builderNodeUnfinished) addOutputPrefix(prefix uint64) {
	if b.node.final {
		b.node.finalOutput = outputCat(prefix, b.node.finalOutput)
	}
	for i := range b.node.trans {
		b.node.trans[i].out = outputCat(prefix, b.node.trans[i].out)
	}
	if b.hasLastT {
		b.lastOut = outputCat(prefix, b.lastOut)
	}
}

type builderNode struct {
	finalOutput uint64
	trans       []transition
	final       bool

	// intrusive linked list
	next *builderNode
}

// reset resets the receiver builderNode to a re-usable state.
func (n *builderNode) reset() {
	n.final = false
	n.finalOutput = 0
	for i := range n.trans {
		n.trans[i] = emptyTransition
	}
	n.trans = n.trans[:0]
	n.next = nil
}

func (n *builderNode) equiv(o *builderNode) bool {
	if n.final != o.final {
		return false
	}
	if n.finalOutput != o.finalOutput {
		return false
	}
	if len(n.trans) != len(o.trans) {
		return false
	}
	for i, ntrans := range n.trans {
		otrans := o.trans[i]
		if ntrans.in != otrans.in {
			return false
		}
		if ntrans.addr != otrans.addr {
			return false
		}
		if ntrans.out != otrans.out {
			return false
		}
	}
	return true
}

var emptyTransition = transition{}

type transition struct {
	out  uint64
	addr int
	in   byte
}

func outputPrefix(l, r uint64) uint64 {
	if l < r {
		return l
	}
	return r
}

func outputSub(l, r uint64) uint64 {
	return l - r
}

func outputCat(l, r uint64) uint64 {
	return l + r
}
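
// outputAlgebraSketch is an illustrative sketch added for documentation and
// is not part of the original file. It shows the identities the three
// helpers above rely on: outputPrefix is a min, so the shared part of two
// outputs can be kept near the root, and the remainders obtained by
// subtraction concatenate (add) back to the originals.
func outputAlgebraSketch() bool {
	l, r := uint64(5), uint64(3)
	common := outputPrefix(l, r)  // min(5, 3) == 3
	lRest := outputSub(l, common) // 5 - 3 == 2
	rRest := outputSub(r, common) // 3 - 3 == 0
	// Concatenation (addition) reconstructs the original outputs.
	return outputCat(common, lRest) == l && outputCat(common, rRest) == r
}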

// builderNodePool pools builderNodes using a singly linked list.
//
// NB: the builderNode lifecycle is described by the following interactions -
// +------------------------+                            +----------------------+
// |    Unfinished Nodes    |      Transfer once         |        Registry      |
// |(not frozen builderNode)|-----builderNode is ------->| (frozen builderNode) |
// +------------------------+      marked frozen         +----------------------+
//              ^                                                     |
//              |                                                     |
//              |                                                   Put()
//              | Get() on        +-------------------+             when
//              +-new char--------| builderNode Pool  |<-----------evicted
//                                +-------------------+
type builderNodePool struct {
	head *builderNode
}

func (p *builderNodePool) Get() *builderNode {
	if p.head == nil {
		return &builderNode{}
	}
	head := p.head
	p.head = p.head.next
	return head
}

func (p *builderNodePool) Put(v *builderNode) {
	if v == nil {
		return
	}
	v.reset()
	v.next = p.head
	p.head = v
}
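
// poolReuseSketch is an illustrative sketch added for documentation and is
// not part of the original file. It demonstrates the lifecycle pictured
// above at its smallest: a node obtained from Get, returned with Put, is
// reset and handed out again by the next Get instead of a fresh allocation.
func poolReuseSketch() bool {
	pool := &builderNodePool{}
	n := pool.Get() // pool is empty: allocates a brand new builderNode
	n.final = true
	pool.Put(n)          // reset() clears the node and pushes it onto the list
	reused := pool.Get() // pops the same node back off the list
	return reused == n && !reused.final
}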