// Copyright 2014 The lldb Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Storage space management.

package lldb

import (
	"bytes"
	"errors"
	"fmt"
	"io"
	"sort"
	"strings"
	"sync"

	"github.com/cznic/internal/buffer"
	"github.com/cznic/mathutil"
	"github.com/cznic/zappy"
)

const (
	maxBuf = maxRq + 20
)

// Options are passed to NewAllocator to amend its default configuration.  The
// compatibility promise is the same as that of struct types in the Go
// standard library - changes can be made only by adding new exported fields,
// which is backward compatible as long as client code uses field names when
// assigning values in struct literals of imported types.
//
// NOTE: No options are currently defined.
type Options struct{}

// AllocStats records statistics about a Filer. It can be optionally filled in
// by Allocator.Verify, if successful.
type AllocStats struct {
	Handles     int64           // total valid handles in use
	Compression int64           // number of compressed blocks
	TotalAtoms  int64           // total number of atoms == AllocAtoms + FreeAtoms
	AllocBytes  int64           // bytes allocated (after decompression, if/where used)
	AllocAtoms  int64           // atoms allocated/used, including relocation atoms
	Relocations int64           // number of relocated used blocks
	FreeAtoms   int64           // atoms unused
	AllocMap    map[int64]int64 // allocated block size in atoms -> count of such blocks
	FreeMap     map[int64]int64 // free block size in atoms -> count of such blocks
}

/*

Allocator implements "raw" storage space management (allocation and
deallocation) for a low level of a DB engine.  The storage is an abstraction
provided by a Filer.

The terms MUST or MUST NOT, if/where used in the documentation of Allocator,
written in all caps as seen here, are a requirement for any possible
alternative implementations aiming for compatibility with this one.

Filer file

A Filer file, or simply 'file', is a linear, contiguous sequence of blocks.
Blocks may be either free (currently unused) or allocated (currently used).
Some blocks may eventually become virtual in the sense that they may not be
realized in the storage (sparse files).

Free Lists Table

The file starts with a FLT. This table records the heads of 14 doubly linked
free lists. The zero based index (I) of a slot determines the minimal size, in
atoms, of the free blocks in that list, except for the last slot, which
registers free blocks of size 4112+ atoms:

	MinSize == 2^I

	For example 0 -> 1, 1 -> 2, ... 12 -> 4096.

Each entry in the FLT is 8 bytes in network byte order, MSB MUST be zero, i.e.
the slot value is effectively only 7 bytes. The value is the handle of the
head of the respective doubly linked free list. The FLT size is 14*8 ==
112(0x70) bytes. If the free block list for any particular size is empty, the
respective FLT slot is zero. Sizes of free blocks in one list MUST NOT overlap
with sizes of free blocks in any other list. For example, even though a free
block of size 2 technically is of minimal size >= 1, it MUST NOT be put in the
list for slot 0 (minimal size 1), but in slot 1 (minimal size 2).

	slot 0:		sizes [1, 2)
	slot 1:		sizes [2, 4)
	slot 2:		sizes [4, 8)
	...
	slot 11:	sizes [2048, 4096)
	slot 12:	sizes [4096, 4112)
	slot 13:	sizes [4112, inf)

The last FLT slot collects all free blocks bigger than its minimal size. That
still respects the 'no overlap' invariant.
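
A minimal sketch of the slot computation implied by the table above (the
function name is illustrative only, not part of the package API):

	func fltSlot(atoms int64) int {
		if atoms >= 4112 {
			return 13
		}
		if atoms >= 4096 {
			return 12
		}
		slot := 0
		for atoms > 1 { // floor(log2(atoms))
			atoms >>= 1
			slot++
		}
		return slot
	}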

File blocks

A block is a linear, contiguous sequence of atoms. The first and last atoms of
a block provide information about, for example, whether the block is free or
used, what the size of the block is, etc.  Details are discussed elsewhere.
The first block of a file starts immediately after the FLT, i.e. at file
offset 112(0x70).

Block atoms

An atom is a fixed size piece of a block (and thus of a file too); it is 16
bytes long. A consequence is that for a valid file:

 filesize == 0 (mod 16)

The first atom of the first block is considered to be atom #1.

Block handles

A handle is an integer referring to a block. The reference is the number of
the atom the block starts with. Put another way:

 handle == offset/16 - 6
 offset == 16 * (handle + 6)

`offset` is the offset of the first byte of the block, measured in bytes - as
in fseek(3). A handle has type `int64`, but only the lower 7 bytes may be
nonzero while referring to a block, both in code as well as when persisted in
the file's internal bookkeeping structures - see 'Block types' below. So a
handle is effectively only a `uint56`.  This also means that the maximum
usable size of a file is 2^56 atoms.  That is 2^60 bytes, about one exabyte
(10^18 bytes).
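
The package implements these conversions in the internal helpers h2off and
off2h (defined elsewhere in the package); a sketch matching the formulas
above:

	func h2off(h int64) int64  { return (h + 6) * 16 }
	func off2h(off int64) int64 { return off/16 - 6 }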

Nil handles

A handle with a numeric value of '0' refers to no block.

Zero padding

Padding is used to round up a block size to a whole number of atoms. Any
padding, if present, MUST be all zero bytes. Note that the size of the padding
is in [0, 15].

Content wiping

When a block is deallocated, its data content is not wiped, as the added
overhead may be substantial while not necessarily needed. Client code should
however overwrite the content of any block holding sensitive data with, e.g.,
zeros (which compress well) before deallocating the block.

Block tags

Every block is tagged in its first byte (the head tag) and last byte (the tail
tag). Block types are:

 1. Short content used block (head tags 0x00-0xFB)
 2. Long content used block (head tag 0xFC)
 3. Relocated used block (head tag 0xFD)
 4. Short, single atom, free block (head tag 0xFE)
 5. Long free block (head tag 0xFF)

Note: A relocated used block, 3. above (head tag 0xFD), MUST NOT refer to
blocks other than 1. or 2. above (head tags 0x00-0xFC).

Content blocks

The tail tag of a used block (head tags 0x00-0xFC) distinguishes used from
free blocks and whether the content is compressed or not.

Content compression

The tail tag of a used block is one of

	CC == 0 // Content is not compressed.
	CC == 1 // Content is in zappy compression format.

If compression of written content is enabled, there are two cases: if the
compressed size is less than the original size, the compressed content should
be written, provided it saves at least one atom of the block. If the
compressed size is greater than or equal to the original size, the compressed
content should not be used.
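
A sketch of that decision, mirroring what Allocator.makeUsedBlock (below)
does:

	z, err := zappy.Encode(nil, content)
	if err == nil && n2atoms(len(z)) < n2atoms(len(content)) {
		// write z with CC == 1: compression saves at least one atom
	} else {
		// write content with CC == 0
	}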

It's recommended to use compression. For example the BTrees implementation
assumes compression is used. Using compression may cause a slowdown in some
cases while in other cases it may cause a speedup.

Short content block

A short content block carries content of length between N == 0(0x00) and N ==
251(0xFB) bytes.

	|<-first atom start  ...  last atom end->|
	+---++--   ...   --+--   ...   --++------+
	| 0 ||    1...     |  0x*...0x*E || 0x*F |
	+---++--   ...   --+--   ...   --++------+
	| N ||   content   |   padding   ||  CC  |
	+---++--   ...   --+--   ...   --++------+

	A == (N+1)/16 + 1        // The number of atoms in the block [1, 16]
	padding == 15 - (N+1)%16 // Length of the zero padding
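
	For example, N == 20: A == 21/16 + 1 == 2 atoms (32 bytes), consisting
	of 1 head byte, 20 content bytes, padding == 15 - 21%16 == 10 zero
	bytes and 1 CC byte.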

Long content block

A long content block carries content of length between N == 252(0xFC) and N ==
65787(0x100FB) bytes.

	|<-first atom start    ...     last atom end->|
	+------++------+-- ... --+--  ...   --++------+
	|  0   || 1..2 |   3...  | 0x*...0x*E || 0x*F |
	+------++------+-- ... --+--  ...   --++------+
	| 0xFC ||  M   | content |  padding   ||  CC  |
	+------++------+-- ... --+--  ...   --++------+

	A == (N+3)/16 + 1        // The number of atoms in the block [16, 4112]
	M == N % 0x10000         // Stored as 2 bytes in network byte order
	padding == 15 - (N+3)%16 // Length of the zero padding
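
	For example, N == 300: A == 303/16 + 1 == 19 atoms (304 bytes),
	consisting of 1 head byte, 2 M bytes (M == 300), 300 content bytes,
	padding == 15 - 303%16 == 0 zero bytes and 1 CC byte.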

Relocated used block

A relocated block allows one to permanently assign a handle to some content
and to resize the content any time afterwards without having to update all the
possibly existing references; the handle can stay constant while the content
size may be dynamic. When relocating a block, any space left by the original
block content, above this single atom block, MUST be reclaimed.

Relocations MUST point only to a used short or long block == blocks with tags
0x00...0xFC.

	+------++------+---------++----+
	|  0   || 1..7 | 8...14  || 15 |
	+------++------+---------++----+
	| 0xFD ||  H   | padding || 0  |
	+------++------+---------++----+

H is the handle of the relocated block in network byte order.

Free blocks

Free blocks are the result of space deallocation. Free blocks are organized in
one or more doubly linked lists, abstracted by the FLT interface. Free blocks
MUST be "registered" by putting them in such a list. Allocator MUST reuse a
big enough free block, if one exists, before growing the file size. When a
free block is created by deallocation or reallocation it MUST be joined with
any adjacent free blocks before "registering". If the resulting free block is
now the last block of the file, the free block MUST be discarded and the file
size MUST be truncated accordingly instead. Put differently, there MUST NOT
ever be a free block at the file end.
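
A sketch of those deallocation rules (mirroring Allocator.free2 below; the
helper names are illustrative only):

	left, right := freeNeighbors(h)        // adjacent free blocks, if any
	h, atoms = join(h, atoms, left, right) // coalesce into one free block
	if isFileTail(h, atoms) {
		truncate(h2off(h)) // never leave a free block at the file end
	} else {
		register(h, atoms) // link into the proper FLT list
	}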

A single free atom

Is an unused block of size 1 atom.

	+------++------+--------++------+
	|  0   || 1..7 | 8...14 ||  15  |
	+------++------+--------++------+
	| 0xFE ||  P   |   N    || 0xFE |
	+------++------+--------++------+

P and N, stored in network byte order, are the previous and next free block
handles in the doubly linked list to which this free block belongs.

A long unused block

Is an unused block of size > 1 atom.

	+------++------+-------+---------+- ... -+----------++------+
	|  0   || 1..7 | 8..14 | 15...21 |       | Z-7..Z-1 ||  Z   |
	+------++------+-------+---------+- ... -+----------++------+
	| 0xFF ||  S   |   P   |    N    | Leak  |    S     || 0xFF |
	+------++------+-------+---------+- ... -+----------++------+

	Z == 16 * S - 1

S is the size of this unused block in atoms. P and N are the previous and next
free block handles in the doubly linked list to which this free block belongs.
Leak contains any data the block had before it was deallocated.  See also the
subtitle 'Content wiping' above. S, P and N are stored in network byte order.
Large free blocks may trigger a consideration of file hole punching of the
Leak field - for some value of 'large'.

Note: Allocator methods vs CRUD[1]:

	Alloc	[C]reate
	Get	[R]ead
	Realloc	[U]pdate
	Free	[D]elete

Note: No Allocator method returns io.EOF.

  [1]: http://en.wikipedia.org/wiki/Create,_read,_update_and_delete

*/
type Allocator struct {
	f        Filer
	flt      flt
	cache    cache
	m        map[int64]*node
	lru      lst
	mu       sync.Mutex
	expHit   int64
	expMiss  int64
	cacheSz  int
	hit      uint16
	miss     uint16
	Compress bool // enables content compression
}

// NewAllocator returns a new Allocator. To open an existing file, pass its
// Filer. To create a "new" file, pass a Filer whose file has zero size.
func NewAllocator(f Filer, opts *Options) (a *Allocator, err error) {
	if opts == nil { // Enforce *Options is always passed
		return nil, errors.New("NewAllocator: nil opts passed")
	}

	a = &Allocator{
		f:       f,
		cacheSz: 10,
	}

	a.cinit()
	switch x := f.(type) {
	case *RollbackFiler:
		x.afterRollback = func() error {
			a.cinit()
			return a.flt.load(a.f, 0)
		}
	case *ACIDFiler0:
		x.RollbackFiler.afterRollback = func() error {
			a.cinit()
			return a.flt.load(a.f, 0)
		}
	}

	sz, err := f.Size()
	if err != nil {
		return
	}

	a.flt.init()
	if sz == 0 {
		var b [fltSz]byte
		if err = a.f.BeginUpdate(); err != nil {
			return
		}

		if _, err = f.WriteAt(b[:], 0); err != nil {
			_ = a.f.Rollback()
			return
		}

		return a, a.f.EndUpdate()
	}

	return a, a.flt.load(f, 0)
}
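
// A minimal usage sketch, assuming a MemFiler as the backing storage
// (illustrative only, error handling elided):
//
//	f := NewMemFiler()
//	a, err := NewAllocator(f, &Options{})
//	if err != nil {
//		...
//	}
//	// a is ready; the first Alloc on the empty file returns handle 1.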

// CacheStats reports cache statistics.
//
//TODO return a struct perhaps.
func (a *Allocator) CacheStats() (buffersUsed, buffersTotal int, bytesUsed, bytesTotal, hits, misses int64) {
	buffersUsed = len(a.m)
	buffersTotal = buffersUsed + len(a.cache)
	bytesUsed = a.lru.size()
	bytesTotal = bytesUsed + a.cache.size()
	hits = a.expHit
	misses = a.expMiss
	return
}

func (a *Allocator) cinit() {
	for h, n := range a.m {
		a.cache.put(a.lru.remove(n))
		delete(a.m, h)
	}
	if a.m == nil {
		a.m = map[int64]*node{}
	}
}

func (a *Allocator) cadd(b []byte, h int64) {
	if len(a.m) < a.cacheSz {
		n := a.cache.get(len(b))
		n.h = h
		copy(n.b, b)
		a.m[h] = a.lru.pushFront(n)
		return
	}

	// cache full
	delete(a.m, a.cache.put(a.lru.removeBack()).h)
	n := a.cache.get(len(b))
	n.h = h
	copy(n.b, b)
	a.m[h] = a.lru.pushFront(n)
}

func (a *Allocator) cfree(h int64) {
	n, ok := a.m[h]
	if !ok { // must have been evicted
		return
	}

	a.cache.put(a.lru.remove(n))
	delete(a.m, h)
}

// Alloc allocates storage space for b and returns the handle of the new block
// with content set to b or an error, if any. The returned handle is valid only
// while the block is used - until the block is deallocated. No two valid
// handles share the same value within the same Filer, but any value of a
// handle not referring to any used block may become valid at any time as a
// result of Alloc.
//
// Invoking Alloc on an empty Allocator is guaranteed to return a handle with
// value 1. The intended use of the content of handle 1 is as a root
// "directory" of other data held by the Allocator.
//
// Passing handles not obtained initially from Alloc, or no longer valid, to
// any other Allocator method can result in an irreparably corrupted database.
func (a *Allocator) Alloc(b []byte) (handle int64, err error) {
	pbuf := buffer.Get(zappy.MaxEncodedLen(len(b)))
	defer buffer.Put(pbuf)
	buf := *pbuf
	buf, _, cc, err := a.makeUsedBlock(buf, b)
	if err != nil {
		return
	}

	if handle, err = a.alloc(buf, cc); err == nil {
		a.cadd(b, handle)
	}
	return
}
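
// A round-trip sketch of the Allocator's CRUD surface (illustrative only,
// error handling elided):
//
//	h, _ := a.Alloc([]byte("payload")) // [C]reate
//	b, _ := a.Get(nil, h)              // [R]ead: b == []byte("payload")
//	_ = a.Realloc(h, []byte("new"))    // [U]pdate; h stays valid
//	_ = a.Free(h)                      // [D]elete; h is now invalid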

func (a *Allocator) alloc(b []byte, cc byte) (h int64, err error) {
	rqAtoms := n2atoms(len(b))
	if h = a.flt.find(rqAtoms); h == 0 { // must grow
		var sz int64
		if sz, err = a.f.Size(); err != nil {
			return
		}

		h = off2h(sz)
		err = a.writeUsedBlock(h, cc, b)
		return
	}

	// Handle is the first item of a free blocks list.
	tag, s, prev, next, err := a.nfo(h)
	if err != nil {
		return
	}

	if tag != tagFreeShort && tag != tagFreeLong {
		err = &ErrILSEQ{Type: ErrExpFreeTag, Off: h2off(h), Arg: int64(tag)}
		return
	}

	if prev != 0 {
		err = &ErrILSEQ{Type: ErrHead, Off: h2off(h), Arg: prev}
		return
	}

	if s < int64(rqAtoms) {
		err = &ErrILSEQ{Type: ErrSmall, Arg: int64(rqAtoms), Arg2: s, Off: h2off(h)}
		return
	}

	if err = a.unlink(h, s, prev, next); err != nil {
		return
	}

	if s > int64(rqAtoms) {
		freeH := h + int64(rqAtoms)
		freeAtoms := s - int64(rqAtoms)
		if err = a.link(freeH, freeAtoms); err != nil {
			return
		}
	}
	return h, a.writeUsedBlock(h, cc, b)
}

// Free deallocates the block referred to by handle or returns an error, if
// any.
//
// After Free succeeds, handle is invalid and must not be used.
//
// Handle must have been obtained initially from Alloc and must be still valid,
// otherwise a database may get irreparably corrupted.
func (a *Allocator) Free(handle int64) (err error) {
	if handle <= 0 || handle > maxHandle {
		return &ErrINVAL{"Allocator.Free: handle out of limits", handle}
	}

	a.cfree(handle)
	return a.free(handle, 0, true)
}

func (a *Allocator) free(h, from int64, acceptRelocs bool) (err error) {
	tag, atoms, _, n, err := a.nfo(h)
	if err != nil {
		return
	}

	switch tag {
	default:
		// nop
	case tagUsedLong:
		// nop
	case tagUsedRelocated:
		if !acceptRelocs {
			return &ErrILSEQ{Type: ErrUnexpReloc, Off: h2off(h), Arg: h2off(from)}
		}

		if err = a.free(n, h, false); err != nil {
			return
		}
	case tagFreeShort, tagFreeLong:
		return &ErrINVAL{"Allocator.Free: attempt to free a free block at off", h2off(h)}
	}

	return a.free2(h, atoms)
}

func (a *Allocator) free2(h, atoms int64) (err error) {
	sz, err := a.f.Size()
	if err != nil {
		return
	}

	ltag, latoms, lp, ln, err := a.leftNfo(h)
	if err != nil {
		return
	}

	if ltag != tagFreeShort && ltag != tagFreeLong {
		latoms = 0
	}

	var rtag byte
	var ratoms, rp, rn int64

	isTail := h2off(h)+atoms*16 == sz
	if !isTail {
		if rtag, ratoms, rp, rn, err = a.nfo(h + atoms); err != nil {
			return
		}
	}

	if rtag != tagFreeShort && rtag != tagFreeLong {
		ratoms = 0
	}

	switch {
	case latoms == 0 && ratoms == 0:
		// -> isolated <-
		if isTail { // cut tail
			return a.f.Truncate(h2off(h))
		}

		return a.link(h, atoms)
	case latoms == 0 && ratoms != 0:
		// right join ->
		if err = a.unlink(h+atoms, ratoms, rp, rn); err != nil {
			return
		}

		return a.link(h, atoms+ratoms)
	case latoms != 0 && ratoms == 0:
		// <- left join
		if err = a.unlink(h-latoms, latoms, lp, ln); err != nil {
			return
		}

		if isTail {
			return a.f.Truncate(h2off(h - latoms))
		}

		return a.link(h-latoms, latoms+atoms)
	}

	// case latoms != 0 && ratoms != 0:
	// <- middle join ->
	lh, rh := h-latoms, h+atoms
	if err = a.unlink(lh, latoms, lp, ln); err != nil {
		return
	}

	// Prev unlink may have invalidated rp or rn
	if _, _, rp, rn, err = a.nfo(rh); err != nil {
		return
	}

	if err = a.unlink(rh, ratoms, rp, rn); err != nil {
		return
	}

	return a.link(h-latoms, latoms+atoms+ratoms)
}

// link adds a free block h to the appropriate free list.
func (a *Allocator) link(h, atoms int64) (err error) {
	if err = a.makeFree(h, atoms, 0, a.flt.head(atoms)); err != nil {
		return
	}

	return a.flt.setHead(h, atoms, a.f)
}

// unlink removes the free block h from its free list.
func (a *Allocator) unlink(h, atoms, p, n int64) (err error) {
	switch {
	case p == 0 && n == 0:
		// single item list, must be head
		return a.flt.setHead(0, atoms, a.f)
	case p == 0 && n != 0:
		// head of list (has next item[s])
		if err = a.prev(n, 0); err != nil {
			return
		}

		// new head
		return a.flt.setHead(n, atoms, a.f)
	case p != 0 && n == 0:
		// last item in list
		return a.next(p, 0)
	}
	// case p != 0 && n != 0:
	// intermediate item in a list
	if err = a.next(p, n); err != nil {
		return
	}

	return a.prev(n, p)
}

//TODO remove ?
// need returns a slice of length n, reusing src if it has sufficient
// capacity.
func need(n int, src []byte) []byte {
	if cap(src) < n {
		return *buffer.Get(n)
	}

	return src[:n]
}

// Get returns the data content of a block referred to by handle or an error if
// any.  The returned slice may be a sub-slice of buf if buf was large enough
// to hold the entire content.  Otherwise, a newly allocated slice will be
// returned.  It is valid to pass a nil buf.
//
// If the content was stored using compression then it is transparently
// returned decompressed.
//
// Handle must have been obtained initially from Alloc and must be still valid,
// otherwise invalid data may be returned without detecting the error.
//
// Get is safe for concurrent access by multiple goroutines iff no other
// goroutine mutates the DB.
func (a *Allocator) Get(buf []byte, handle int64) (b []byte, err error) {
	buf = buf[:cap(buf)]
	a.mu.Lock() // X1+
	if n, ok := a.m[handle]; ok {
		a.lru.moveToFront(n)
		b = need(len(n.b), buf)
		copy(b, n.b)
		a.expHit++
		a.hit++
		a.mu.Unlock() // X1-
		return
	}

	a.expMiss++
	a.miss++
	if a.miss > 10 && len(a.m) < 500 {
		if 100*a.hit/a.miss < 95 {
			a.cacheSz++
		}
		a.hit, a.miss = 0, 0
	}
	a.mu.Unlock() // X1-

	defer func(h int64) {
		if err == nil {
			a.mu.Lock() // X2+
			a.cadd(b, h)
			a.mu.Unlock() // X2-
		}
	}(handle)

	pfirst := buffer.Get(16)
	defer buffer.Put(pfirst)
	first := *pfirst
	relocated := false
	relocSrc := handle
reloc:
	if handle <= 0 || handle > maxHandle {
		return nil, &ErrINVAL{"Allocator.Get: handle out of limits", handle}
	}

	off := h2off(handle)
	if err = a.read(first, off); err != nil {
		return
	}

	switch tag := first[0]; tag {
	default:
		dlen := int(tag)
		atoms := n2atoms(dlen)
		switch atoms {
		case 1:
			switch tag = first[15]; tag {
			default:
				return nil, &ErrILSEQ{Type: ErrTailTag, Off: off, Arg: int64(tag)}
			case tagNotCompressed:
				b = need(dlen, buf)
				copy(b, first[1:])
				return
			case tagCompressed:
				return zappy.Decode(buf, first[1:dlen+1])
			}
		default:
			pcc := buffer.Get(1)
			defer buffer.Put(pcc)
			cc := *pcc
			dlen := int(tag)
			atoms := n2atoms(dlen)
			tailOff := off + 16*int64(atoms) - 1
			if err = a.read(cc, tailOff); err != nil {
				return
			}

			switch tag = cc[0]; tag {
			default:
				return nil, &ErrILSEQ{Type: ErrTailTag, Off: off, Arg: int64(tag)}
			case tagNotCompressed:
				b = need(dlen, buf)
				off++
				if err = a.read(b, off); err != nil {
					b = buf[:0]
				}
				return
			case tagCompressed:
				pzbuf := buffer.Get(dlen)
				defer buffer.Put(pzbuf)
				zbuf := *pzbuf
				off++
				if err = a.read(zbuf, off); err != nil {
					return buf[:0], err
				}

				return zappy.Decode(buf, zbuf)
			}
		}
	case 0:
		return buf[:0], nil
	case tagUsedLong:
		pcc := buffer.Get(1)
		defer buffer.Put(pcc)
		cc := *pcc
		dlen := m2n(int(first[1])<<8 | int(first[2]))
		atoms := n2atoms(dlen)
		tailOff := off + 16*int64(atoms) - 1
		if err = a.read(cc, tailOff); err != nil {
			return
		}

		switch tag = cc[0]; tag {
		default:
			return nil, &ErrILSEQ{Type: ErrTailTag, Off: off, Arg: int64(tag)}
		case tagNotCompressed:
			b = need(dlen, buf)
			off += 3
			if err = a.read(b, off); err != nil {
				b = buf[:0]
			}
			return
		case tagCompressed:
			pzbuf := buffer.Get(dlen)
			defer buffer.Put(pzbuf)
			zbuf := *pzbuf
			off += 3
			if err = a.read(zbuf, off); err != nil {
				return buf[:0], err
			}

			return zappy.Decode(buf, zbuf)
		}
	case tagFreeShort, tagFreeLong:
		return nil, &ErrILSEQ{Type: ErrExpUsedTag, Off: off, Arg: int64(tag)}
	case tagUsedRelocated:
		if relocated {
			return nil, &ErrILSEQ{Type: ErrUnexpReloc, Off: off, Arg: relocSrc}
		}

		handle = b2h(first[1:])
		relocated = true
		goto reloc
	}
}
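
// A sketch of buffer reuse with Get (illustrative only):
//
//	buf := make([]byte, 0, 4096)
//	b, err := a.Get(buf, h) // b aliases buf when the content fits in cap(buf)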

var reallocTestHook bool

// Realloc sets the content of a block referred to by handle or returns an
// error, if any.
//
// Handle must have been obtained initially from Alloc and must be still valid,
// otherwise a database may get irreparably corrupted.
func (a *Allocator) Realloc(handle int64, b []byte) (err error) {
	if handle <= 0 || handle > maxHandle {
		return &ErrINVAL{"Realloc: handle out of limits", handle}
	}

	a.cfree(handle)
	if err = a.realloc(handle, b); err != nil {
		return
	}

	if reallocTestHook {
		if err = cacheAudit(a.m, &a.lru); err != nil {
			return
		}
	}

	a.cadd(b, handle)
	return
}

func (a *Allocator) realloc(handle int64, b []byte) (err error) {
	var dlen, needAtoms0 int

	pb8 := buffer.Get(8)
	defer buffer.Put(pb8)
	b8 := *pb8
	pdst := buffer.Get(zappy.MaxEncodedLen(len(b)))
	defer buffer.Put(pdst)
	dst := *pdst
	b, needAtoms0, cc, err := a.makeUsedBlock(dst, b)
	if err != nil {
		return
	}

	needAtoms := int64(needAtoms0)
	off := h2off(handle)
	if err = a.read(b8[:], off); err != nil {
		return
	}

	switch tag := b8[0]; tag {
	default:
		dlen = int(b8[0])
	case tagUsedLong:
		dlen = m2n(int(b8[1])<<8 | int(b8[2]))
	case tagUsedRelocated:
		if err = a.free(b2h(b8[1:]), handle, false); err != nil {
			return err
		}

		dlen = 0
	case tagFreeShort, tagFreeLong:
		return &ErrINVAL{"Allocator.Realloc: invalid handle", handle}
	}

	atoms := int64(n2atoms(dlen))
retry:
	switch {
	case needAtoms < atoms:
		// in place shrink
		if err = a.writeUsedBlock(handle, cc, b); err != nil {
			return
		}

		fh, fa := handle+needAtoms, atoms-needAtoms
		var sz int64
		if sz, err = a.f.Size(); err != nil {
			return err
		}

		if h2off(fh)+16*fa == sz {
			return a.f.Truncate(h2off(fh))
		}

		return a.free2(fh, fa)
	case needAtoms == atoms:
		// in place replace
		return a.writeUsedBlock(handle, cc, b)
	}

	// case needAtoms > atoms:
	// in place extend or relocate
	var sz int64
	if sz, err = a.f.Size(); err != nil {
		return
	}

	off = h2off(handle)
	switch {
	case off+atoms*16 == sz:
		// relocating tail block - shortcut
		return a.writeUsedBlock(handle, cc, b)
	default:
		if off+atoms*16 < sz {
			// handle is not a tail block, check right neighbour
			rh := handle + atoms
			rtag, ratoms, p, n, e := a.nfo(rh)
			if e != nil {
				return e
			}

			if rtag == tagFreeShort || rtag == tagFreeLong {
				// Right neighbour is a free block
				if needAtoms <= atoms+ratoms {
					// can expand in place
					if err = a.unlink(rh, ratoms, p, n); err != nil {
						return
					}

					atoms += ratoms
					goto retry
				}
			}
		}
	}

	if atoms > 1 {
		if err = a.realloc(handle, nil); err != nil {
			return
		}
	}

	var newH int64
	if newH, err = a.alloc(b, cc); err != nil {
		return err
	}

	prb := buffer.CGet(16)
	defer buffer.Put(prb)
	rb := *prb
	rb[0] = tagUsedRelocated
	h2b(rb[1:], newH)
	if err = a.writeAt(rb[:], h2off(handle)); err != nil {
		return
	}

	return a.writeUsedBlock(newH, cc, b)
}

func (a *Allocator) writeAt(b []byte, off int64) (err error) {
	var n int
	if n, err = a.f.WriteAt(b, off); err != nil {
		return
	}

	if n != len(b) {
		err = io.ErrShortWrite
	}
	return
}

func (a *Allocator) write(off int64, b ...[]byte) (err error) {
	rq := 0
	for _, part := range b {
		rq += len(part)
	}
	pbuf := buffer.Get(rq)
	defer buffer.Put(pbuf)
	buf := *pbuf
	buf = buf[:0]
	for _, part := range b {
		buf = append(buf, part...)
	}
	return a.writeAt(buf, off)
}

func (a *Allocator) read(b []byte, off int64) (err error) {
	var rn int
	if rn, err = a.f.ReadAt(b, off); rn != len(b) {
		return &ErrILSEQ{Type: ErrOther, Off: off, More: err}
	}

	return nil
}

// nfo returns h's tag. If it's a free block then it also returns the (s)ize
// (in atoms), (p)rev and (n)ext. If it's a used block then only the (s)ize is
// returned (again in atoms). If it's a used relocated block then (n)ext is
// set to the relocation target handle.
func (a *Allocator) nfo(h int64) (tag byte, s, p, n int64, err error) {
	off := h2off(h)
	rq := int64(22)
	sz, err := a.f.Size()
	if err != nil {
		return
	}

	if off+rq >= sz {
		if rq = sz - off; rq < 15 {
			err = io.ErrUnexpectedEOF
			return
		}
	}

	pbuf := buffer.Get(22)
	defer buffer.Put(pbuf)
	buf := *pbuf
	if err = a.read(buf[:rq], off); err != nil {
		return
	}

	switch tag = buf[0]; tag {
	default:
		s = int64(n2atoms(int(tag)))
	case tagUsedLong:
		s = int64(n2atoms(m2n(int(buf[1])<<8 | int(buf[2]))))
	case tagFreeLong:
		if rq < 22 {
			err = io.ErrUnexpectedEOF
			return
		}

		s, p, n = b2h(buf[1:]), b2h(buf[8:]), b2h(buf[15:])
	case tagUsedRelocated:
		s, n = 1, b2h(buf[1:])
	case tagFreeShort:
		s, p, n = 1, b2h(buf[1:]), b2h(buf[8:])
	}
	return
}

// leftNfo returns nfo for h's left neighbor if h > 1 and the left neighbor is
// a free block. Otherwise all zero values are returned instead.
func (a *Allocator) leftNfo(h int64) (tag byte, s, p, n int64, err error) {
	if h <= 1 {
		return
	}

	pbuf := buffer.Get(8)
	defer buffer.Put(pbuf)
	buf := *pbuf
	off := h2off(h)
	if err = a.read(buf[:], off-8); err != nil {
		return
	}

	switch tag := buf[7]; tag {
	case tagFreeShort:
		return a.nfo(h - 1)
	case tagFreeLong:
		return a.nfo(h - b2h(buf[:]))
	}
	return
}

// prev sets h.prev = p.
func (a *Allocator) prev(h, p int64) (err error) {
	pb := buffer.Get(7)
	defer buffer.Put(pb)
	b := *pb
	off := h2off(h)
	if err = a.read(b[:1], off); err != nil {
		return
	}

	switch tag := b[0]; tag {
	default:
		return &ErrILSEQ{Type: ErrExpFreeTag, Off: off, Arg: int64(tag)}
	case tagFreeShort:
		off++
	case tagFreeLong:
		off += 8
	}
	return a.writeAt(h2b(b[:7], p), off)
}

// next sets h.next = n.
func (a *Allocator) next(h, n int64) (err error) {
	pb := buffer.Get(7)
	defer buffer.Put(pb)
	b := *pb
	off := h2off(h)
	if err = a.read(b[:1], off); err != nil {
		return
	}

	switch tag := b[0]; tag {
	default:
		return &ErrILSEQ{Type: ErrExpFreeTag, Off: off, Arg: int64(tag)}
	case tagFreeShort:
		off += 8
	case tagFreeLong:
		off += 15
	}
	return a.writeAt(h2b(b[:7], n), off)
}

// makeFree makes the filer image @h a free block.
func (a *Allocator) makeFree(h, atoms, prev, next int64) (err error) {
	pbuf := buffer.Get(22)
	defer buffer.Put(pbuf)
	buf := *pbuf
	switch {
	case atoms == 1:
		buf[0], buf[15] = tagFreeShort, tagFreeShort
		h2b(buf[1:], prev)
		h2b(buf[8:], next)
		if err = a.write(h2off(h), buf[:16]); err != nil {
			return
		}
	default:
		buf[0] = tagFreeLong
		h2b(buf[1:], atoms)
		h2b(buf[8:], prev)
		h2b(buf[15:], next)
		if err = a.write(h2off(h), buf[:22]); err != nil {
			return
		}

		h2b(buf[:], atoms)
		buf[7] = tagFreeLong
		if err = a.write(h2off(h+atoms)-8, buf[:8]); err != nil {
			return
		}
	}
	if prev != 0 {
		if err = a.next(prev, h); err != nil {
			return
		}
	}

	if next != 0 {
		err = a.prev(next, h)
	}
	return
}

func (a *Allocator) makeUsedBlock(dst []byte, b []byte) (w []byte, rqAtoms int, cc byte, err error) {
	cc = tagNotCompressed
	w = b

	var n int
	if n = len(b); n > maxRq {
		return nil, 0, 0, &ErrINVAL{"Allocator.makeUsedBlock: content size out of limits", n}
	}

	rqAtoms = n2atoms(n)
	if a.Compress && n > 14 { // attempt compression
		if dst, err = zappy.Encode(dst, b); err != nil {
			return
		}

		n2 := len(dst)
		if rqAtoms2 := n2atoms(n2); rqAtoms2 < rqAtoms { // compression saved at least a single atom
			w, rqAtoms, cc = dst, rqAtoms2, tagCompressed
		}
	}
	return
}

func (a *Allocator) writeUsedBlock(h int64, cc byte, b []byte) (err error) {
	n := len(b)
	rq := n2atoms(n) << 4
	pbuf := buffer.Get(rq)
	defer buffer.Put(pbuf)
	buf := *pbuf
	if n <= maxShort {
		buf[0] = byte(n)
		copy(buf[1:], b)
	} else {
		m := n2m(n)
		buf[0], buf[1], buf[2] = tagUsedLong, byte(m>>8), byte(m)
		copy(buf[3:], b)
	}
	if p := n2padding(n); p != 0 {
		copy(buf[rq-1-p:], zeros[:])
	}
	buf[rq-1] = cc
	return a.writeAt(buf, h2off(h))
}

func (a *Allocator) verifyUnused(h, totalAtoms int64, tag byte, log func(error) bool, fast bool) (atoms, prev, next int64, err error) {
	switch tag {
	default:
		panic("internal error")
	case tagFreeShort:
		var b [16]byte
		off := h2off(h)
		if err = a.read(b[:], off); err != nil {
			return
		}

		if b[15] != tagFreeShort {
			err = &ErrILSEQ{Type: ErrShortFreeTailTag, Off: off, Arg: int64(b[15])}
			log(err)
			return
		}

		atoms, prev, next = 1, b2h(b[1:]), b2h(b[8:])
	case tagFreeLong:
		var b [22]byte
		off := h2off(h)
		if err = a.read(b[:], off); err != nil {
			return
		}

		atoms, prev, next = b2h(b[1:]), b2h(b[8:]), b2h(b[15:])
		if fast {
			return
		}

		if atoms < 2 {
			err = &ErrILSEQ{Type: ErrLongFreeBlkTooShort, Off: off, Arg: atoms}
			break
		}

		if h+atoms-1 > totalAtoms {
			err = &ErrILSEQ{Type: ErrLongFreeBlkTooLong, Off: off, Arg: atoms}
			break
		}

		if prev > totalAtoms {
			err = &ErrILSEQ{Type: ErrLongFreePrevBeyondEOF, Off: off, Arg: prev}
			break
		}

		if next > totalAtoms {
			err = &ErrILSEQ{Type: ErrLongFreeNextBeyondEOF, Off: off, Arg: next}
			break
		}

		toff := h2off(h+atoms) - 8
		if err = a.read(b[:8], toff); err != nil {
			return
		}

		if b[7] != tag {
			err = &ErrILSEQ{Type: ErrLongFreeTailTag, Off: off, Arg: int64(b[7])}
			break
		}

		if s2 := b2h(b[:]); s2 != atoms {
			err = &ErrILSEQ{Type: ErrVerifyTailSize, Off: off, Arg: atoms, Arg2: s2}
			break
		}
	}
	if err != nil {
		log(err)
	}
	return
}

func (a *Allocator) verifyUsed(h, totalAtoms int64, tag byte, buf, ubuf []byte, log func(error) bool, fast bool) (compressed bool, dlen int, atoms, link int64, err error) {
	var (
		padding  int
		doff     int64
		padZeros [15]byte
		tailBuf  [16]byte
	)

	switch tag {
	default: // Short used
		dlen = int(tag)
		atoms = int64((dlen+1)/16) + 1
		padding = 15 - (dlen+1)%16
		doff = h2off(h) + 1
	case tagUsedLong:
		off := h2off(h) + 1
		var b2 [2]byte
		if err = a.read(b2[:], off); err != nil {
			return
		}

		dlen = m2n(int(b2[0])<<8 | int(b2[1]))
		atoms = int64((dlen+3)/16) + 1
		padding = 15 - (dlen+3)%16
		doff = h2off(h) + 3
	case tagUsedRelocated:
		dlen = 7
		atoms = 1
		padding = 7
		doff = h2off(h) + 1
	case tagFreeShort, tagFreeLong:
		panic("internal error")
	}

	if fast {
		if tag == tagUsedRelocated {
			dlen = 0
			if err = a.read(buf[:7], doff); err != nil {
				return
			}

			link = b2h(buf)
		}

		return false, dlen, atoms, link, nil
	}

	if ok := h+atoms-1 <= totalAtoms; !ok { // invalid last block
		err = &ErrILSEQ{Type: ErrVerifyUsedSpan, Off: h2off(h), Arg: atoms}
		log(err)
		return
	}

	tailsz := 1 + padding
	off := h2off(h) + 16*atoms - int64(tailsz)
	if err = a.read(tailBuf[:tailsz], off); err != nil {
		return false, 0, 0, 0, err
	}

	if ok := bytes.Equal(padZeros[:padding], tailBuf[:padding]); !ok {
		err = &ErrILSEQ{Type: ErrVerifyPadding, Off: h2off(h)}
		log(err)
		return
	}

	var cc byte
	switch cc = tailBuf[padding]; cc {
	default:
		err = &ErrILSEQ{Type: ErrTailTag, Off: h2off(h)}
		log(err)
		return
	case tagCompressed:
		compressed = true
		if tag == tagUsedRelocated {
			err = &ErrILSEQ{Type: ErrTailTag, Off: h2off(h)}
			log(err)
			return
		}

		fallthrough
	case tagNotCompressed:
		if err = a.read(buf[:dlen], doff); err != nil {
			return false, 0, 0, 0, err
		}
	}

	if cc == tagCompressed {
		if ubuf, err = zappy.Decode(ubuf, buf[:dlen]); err != nil || len(ubuf) > maxRq {
			err = &ErrILSEQ{Type: ErrDecompress, Off: h2off(h)}
			log(err)
			return
		}

		dlen = len(ubuf)
	}

	if tag == tagUsedRelocated {
		link = b2h(buf)
		if link == 0 {
			err = &ErrILSEQ{Type: ErrNullReloc, Off: h2off(h)}
			log(err)
			return
		}

		if link > totalAtoms { // invalid last block
			err = &ErrILSEQ{Type: ErrRelocBeyondEOF, Off: h2off(h), Arg: link}
			log(err)
			return
		}
	}

	return
}

var nolog = func(error) bool { return false }

// Verify attempts to find any structural errors in a Filer with respect to
// its organization as defined by Allocator. 'bitmap' is a scratch pad for
// necessary bookkeeping and will grow to at most Allocator's Filer.Size()/128
// (0.78%).  Any problems found are reported to 'log' except non verify
// related errors like disk read failures etc.  If 'log' returns false or the
// error doesn't allow the process to (reliably) continue, the verification
// process is stopped and an error is returned from the Verify function.
// Passing a nil log works like providing a log function always returning
// false. Any non-structural errors, like for instance Filer read errors, are
// NOT reported to 'log', but returned as Verify's return value, because
// Verify cannot proceed in such cases.  Verify returns nil only if it fully
// completed verifying Allocator's Filer without detecting any error.
//
// It is recommended to limit the number of reported problems by returning
// false from 'log' after reaching some limit. A huge and corrupted DB can
// produce an overwhelming error report dataset.
//
// The verifying process will scan the whole DB at least 3 times (a trade-off
// between the space and the time consumed). It doesn't read the content of
// free blocks above the head/tail info bytes. If the 3rd phase detects lost
// free blocks, then a 4th scan (a faster one) is performed to precisely
// report all of them.
//
// If the DB/Filer to be verified is reasonably small, i.e. if its size/128
// can comfortably fit within the process's free memory, then it is
// recommended to consider using a MemFiler for the bit map.
//
// Statistics are returned via 'stats' if non nil. The statistics are valid
// only if Verify succeeded, i.e. it didn't report anything to log and it
// returned a nil error.
func (a *Allocator) Verify(bitmap Filer, log func(error) bool, stats *AllocStats) (err error) {
	if log == nil {
		log = nolog
	}

	n, err := bitmap.Size()
	if err != nil {
		return
	}

	if n != 0 {
		return &ErrINVAL{"Allocator.Verify: bit map initial size non zero (%d)", n}
	}

	var bits int64
	bitMask := [8]byte{1, 2, 4, 8, 16, 32, 64, 128}
	byteBuf := []byte{0}

	//DONE
	// +performance, this implementation is hopefully correct but _very_
	// naive, probably good as a prototype only. Use maybe a MemFiler
	// "cache" etc.
	// ----
	// Turns out the OS caching is as effective as it can probably get.
	bit := func(on bool, h int64) (wasOn bool, err error) {
		m := bitMask[h&7]
		off := h >> 3
		var v byte
		sz, err := bitmap.Size()
		if err != nil {
			return
		}

		if off < sz {
			if n, err := bitmap.ReadAt(byteBuf, off); n != 1 {
				return false, &ErrILSEQ{Type: ErrOther, Off: off, More: fmt.Errorf("Allocator.Verify - reading bitmap: %s", err)}
			}

			v = byteBuf[0]
		}
		switch wasOn = v&m != 0; on {
		case true:
			if !wasOn {
				v |= m
				bits++
			}
		case false:
			if wasOn {
				v ^= m
				bits--
			}
		}
		byteBuf[0] = v
		if n, err := bitmap.WriteAt(byteBuf, off); n != 1 || err != nil {
			return false, &ErrILSEQ{Type: ErrOther, Off: off, More: fmt.Errorf("Allocator.Verify - writing bitmap: %s", err)}
		}

		return
	}

	// Phase 1 - sequentially scan a.f to reliably determine block
	// boundaries. Set a bit for every block start.
	var (
		buf, ubuf       [maxRq]byte
		prevH, h, atoms int64
		wasOn           bool
		tag             byte
		st              = AllocStats{
			AllocMap: map[int64]int64{},
			FreeMap:  map[int64]int64{},
		}
		dlen int
	)

	fsz, err := a.f.Size()
	if err != nil {
		return
	}

	ok := fsz%16 == 0
	totalAtoms := (fsz - fltSz) / atomLen
	if !ok {
		err = &ErrILSEQ{Type: ErrFileSize, Name: a.f.Name(), Arg: fsz}
		log(err)
		return
	}

	st.TotalAtoms = totalAtoms
	prevTag := -1
	lastH := int64(-1)

	for h = 1; h <= totalAtoms; h += atoms {
		prevH = h // For checking last block == used

		off := h2off(h)
		if err = a.read(buf[:1], off); err != nil {
			return
		}

		switch tag = buf[0]; tag {
		default: // Short used
			fallthrough
		case tagUsedLong, tagUsedRelocated:
			var compressed bool
			if compressed, dlen, atoms, _, err = a.verifyUsed(h, totalAtoms, tag, buf[:], ubuf[:], log, false); err != nil {
				return
			}

			if compressed {
				st.Compression++
			}
			st.AllocAtoms += atoms
			switch {
			case tag == tagUsedRelocated:
				st.AllocMap[1]++
				st.Relocations++
			default:
				st.AllocMap[atoms]++
				st.AllocBytes += int64(dlen)
				st.Handles++
			}
		case tagFreeShort, tagFreeLong:
			if prevTag == tagFreeShort || prevTag == tagFreeLong {
				err = &ErrILSEQ{Type: ErrAdjacentFree, Off: h2off(lastH), Arg: off}
				log(err)
				return
			}

			if atoms, _, _, err = a.verifyUnused(h, totalAtoms, tag, log, false); err != nil {
				return
			}

			st.FreeMap[atoms]++
			st.FreeAtoms += atoms
		}

		if wasOn, err = bit(true, h); err != nil {
			return
		}

		if wasOn {
			panic("internal error")
		}

		prevTag = int(tag)
		lastH = h
	}

	if totalAtoms != 0 && (tag == tagFreeShort || tag == tagFreeLong) {
		err = &ErrILSEQ{Type: ErrFreeTailBlock, Off: h2off(prevH)}
		log(err)
		return
	}

	// Phase 2 - check used blocks, turn off the map bit for every used
	// block.
	for h = 1; h <= totalAtoms; h += atoms {
		off := h2off(h)
		if err = a.read(buf[:1], off); err != nil {
			return
		}

		var link int64
		switch tag = buf[0]; tag {
		default: // Short used
			fallthrough
		case tagUsedLong, tagUsedRelocated:
			if _, _, atoms, link, err = a.verifyUsed(h, totalAtoms, tag, buf[:], ubuf[:], log, true); err != nil {
				return
			}
		case tagFreeShort, tagFreeLong:
			if atoms, _, _, err = a.verifyUnused(h, totalAtoms, tag, log, true); err != nil {
				return
			}
		}

		turnoff := true
		switch tag {
		case tagUsedRelocated:
			if err = a.read(buf[:1], h2off(link)); err != nil {
				return
			}

			switch linkedTag := buf[0]; linkedTag {
			case tagFreeShort, tagFreeLong, tagUsedRelocated:
				err = &ErrILSEQ{Type: ErrInvalidRelocTarget, Off: off, Arg: link}
				log(err)
				return
			}

		case tagFreeShort, tagFreeLong:
			turnoff = false
		}

		if !turnoff {
			continue
		}

		if wasOn, err = bit(false, h); err != nil {
			return
		}

		if !wasOn {
			panic("internal error")
		}
	}

	// Phase 3 - using the flt, check that the heads link to proper free
	// blocks.  For every free block, walk the list, verify the {next,
	// prev} links and turn the respective map bit off. After processing
	// all free lists, the map bit count should be zero. Otherwise there
	// are "lost" free blocks.

	var prev, next, fprev, fnext int64
	rep := a.flt

	for _, list := range rep {
		prev, next = 0, list.head
		for ; next != 0; prev, next = next, fnext {
			if wasOn, err = bit(false, next); err != nil {
				return
			}

			if !wasOn {
				err = &ErrILSEQ{Type: ErrFLT, Off: h2off(next), Arg: h}
				log(err)
				return
			}

			off := h2off(next)
			if err = a.read(buf[:1], off); err != nil {
				return
			}

			switch tag = buf[0]; tag {
			default:
				panic("internal error")
			case tagFreeShort, tagFreeLong:
				if atoms, fprev, fnext, err = a.verifyUnused(next, totalAtoms, tag, log, true); err != nil {
					return
				}

				if min := list.minSize; atoms < min {
					err = &ErrILSEQ{Type: ErrFLTSize, Off: h2off(next), Arg: atoms, Arg2: min}
					log(err)
					return
				}

				if fprev != prev {
					err = &ErrILSEQ{Type: ErrFreeChaining, Off: h2off(next)}
					log(err)
					return
				}
			}
		}
	}

	if bits == 0 { // Verify succeeded
		if stats != nil {
			*stats = st
		}
		return
	}

	// Phase 4 - if after phase 3 there are lost free blocks, report all of
	// them to 'log'.
	for i := range ubuf { // setup zeros for compares
		ubuf[i] = 0
	}

	var off, lh int64
	rem, err := bitmap.Size()
	if err != nil {
		return err
	}

	for rem != 0 {
		rq := int(mathutil.MinInt64(64*1024, rem))
		var n int
		if n, err = bitmap.ReadAt(buf[:rq], off); n != rq {
			return &ErrILSEQ{Type: ErrOther, Off: off, More: fmt.Errorf("bitmap ReadAt(size %d, off %#x): %s", rq, off, err)}
		}

		if !bytes.Equal(buf[:rq], ubuf[:rq]) {
			for d, v := range buf[:rq] {
				if v != 0 {
					for i, m := range bitMask {
						if v&m != 0 {
							lh = 8*(off+int64(d)) + int64(i)
							err = &ErrILSEQ{Type: ErrLostFreeBlock, Off: h2off(lh)}
							log(err)
							return
						}
					}
				}
			}
		}

		off += int64(rq)
		rem -= int64(rq)
	}

	return
}
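
// A sketch of a typical Verify call, using a MemFiler as the scratch bitmap
// (illustrative only):
//
//	var stats AllocStats
//	err := a.Verify(NewMemFiler(), func(e error) bool {
//		fmt.Println(e)
//		return true // keep reporting further problems
//	}, &stats)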

type fltSlot struct {
	head    int64
	minSize int64
}

func (f fltSlot) String() string {
	return fmt.Sprintf("head %#x, minSize %#x\n", f.head, f.minSize)
}

type flt [14]fltSlot

func (f *flt) init() {
	sz := 1
	for i := range *f {
		f[i].minSize, f[i].head = int64(sz), 0
		sz <<= 1
	}
	f[13].minSize = 4112
}
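
// Note: after init the slots hold minSize 1, 2, 4, ..., 4096 for indices 0-12
// and 4112 for index 13, matching the FLT layout documented above.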

func (f *flt) load(fi Filer, off int64) (err error) {
	pb := buffer.Get(fltSz)
	defer buffer.Put(pb)
	b := *pb
	if _, err = fi.ReadAt(b[:], off); err != nil {
		return
	}

	for i := range *f {
		off := 8*i + 1
		f[i].head = b2h(b[off:])
	}
	return
}

func (f *flt) find(rq int) (h int64) {
	switch {
	case rq < 1:
		panic(rq)
	case rq >= maxFLTRq:
		h, f[13].head = f[13].head, 0
		return
	default:
		g := f[mathutil.Log2Uint16(uint16(rq)):]
		for i := range g {
			p := &g[i]
			if rq <= int(p.minSize) {
				if h = p.head; h != 0 {
					p.head = 0
					return
				}
			}
		}
		return
	}
}

func (f *flt) head(atoms int64) (h int64) {
	switch {
	case atoms < 1:
		panic(atoms)
	case atoms >= maxFLTRq:
		return f[13].head
	default:
		lg := mathutil.Log2Uint16(uint16(atoms))
		g := f[lg:]
		for i := range g {
			if atoms < g[i+1].minSize {
				return g[i].head
			}
		}
		panic("internal error")
	}
}

func (f *flt) setHead(h, atoms int64, fi Filer) (err error) {
	switch {
	case atoms < 1:
		panic(atoms)
	case atoms >= maxFLTRq:
		pb := buffer.Get(7)
		defer buffer.Put(pb)
		b := *pb
		if _, err = fi.WriteAt(h2b(b[:], h), 8*13+1); err != nil {
			return
		}

		f[13].head = h
		return
	default:
		lg := mathutil.Log2Uint16(uint16(atoms))
		g := f[lg:]
		for i := range f {
			if atoms < g[i+1].minSize {
				pb := buffer.Get(7)
				defer buffer.Put(pb)
				b := *pb
				if _, err = fi.WriteAt(h2b(b[:], h), 8*int64(i+lg)+1); err != nil {
					return
				}

				g[i].head = h
				return
			}
		}
		panic("internal error")
	}
}

func (f *flt) String() string {
	a := []string{}
	for i, v := range *f {
		a = append(a, fmt.Sprintf("[%2d] %s", i, v))
	}
	return strings.Join(a, "")
}

type node struct {
	b          []byte
	h          int64
	prev, next *node
}

type cache []*node

func (c *cache) get(n int) *node {
	r, _ := c.get2(n)
	return r
}

func (c *cache) get2(n int) (r *node, isZeroed bool) {
	s := *c
	lens := len(s)
	if lens == 0 {
		return &node{b: make([]byte, n, mathutil.Min(2*n, maxBuf))}, true
	}

	i := sort.Search(lens, func(x int) bool { return len(s[x].b) >= n })
	if i == lens {
		i--
		s[i].b, isZeroed = make([]byte, n, mathutil.Min(2*n, maxBuf)), true
	}

	r = s[i]
	r.b = r.b[:n]
	copy(s[i:], s[i+1:])
	s = s[:lens-1]
	*c = s
	return
}

func (c *cache) cget(n int) (r *node) {
	r, ok := c.get2(n)
	if ok {
		return
	}

	for i := range r.b {
		r.b[i] = 0
	}
	return
}

func (c *cache) size() (sz int64) {
	for _, n := range *c {
		sz += int64(cap(n.b))
	}
	return
}

func (c *cache) put(n *node) *node {
	s := *c
	n.b = n.b[:cap(n.b)]
	lenb := len(n.b)
	lens := len(s)
	i := sort.Search(lens, func(x int) bool { return len(s[x].b) >= lenb })
	s = append(s, nil)
	copy(s[i+1:], s[i:])
	s[i] = n
	*c = s
	return n
}

type lst struct {
	front, back *node
}

func (l *lst) pushFront(n *node) *node {
	if l.front == nil {
		l.front, l.back, n.prev, n.next = n, n, nil, nil
		return n
	}

	n.prev, n.next, l.front.prev, l.front = nil, l.front, n, n
	return n
}

func (l *lst) remove(n *node) *node {
	if n.prev == nil {
		l.front = n.next
	} else {
		n.prev.next = n.next
	}
	if n.next == nil {
		l.back = n.prev
	} else {
		n.next.prev = n.prev
	}
	n.prev, n.next = nil, nil
	return n
}

func (l *lst) removeBack() *node {
	return l.remove(l.back)
}

func (l *lst) moveToFront(n *node) *node {
	return l.pushFront(l.remove(n))
}

func (l *lst) size() (sz int64) {
	for n := l.front; n != nil; n = n.next {
		sz += int64(cap(n.b))
	}
	return
}

func cacheAudit(m map[int64]*node, l *lst) (err error) {
	cnt := 0
	for h, n := range m {
		if g, e := n.h, h; g != e {
			return fmt.Errorf("cacheAudit: invalid node handle %d != %d", g, e)
		}

		if cnt, err = l.audit(n, true); err != nil {
			return
		}
	}

	if g, e := cnt, len(m); g != e {
		return fmt.Errorf("cacheAudit: invalid cache size %d != %d", g, e)
	}

	return
}

func (l *lst) audit(n *node, onList bool) (cnt int, err error) {
	if !onList && (n.prev != nil || n.next != nil) {
		return -1, fmt.Errorf("lst.audit: free node with non nil linkage")
	}

	if l.front == nil && l.back != nil || l.back == nil && l.front != nil {
		return -1, fmt.Errorf("lst.audit: one of .front/.back is nil while the other is non nil")
	}

	if l.front == l.back && l.front != nil {
		x := l.front
		if x.prev != nil || x.next != nil {
			return -1, fmt.Errorf("lst.audit: single node has non nil linkage")
		}

		if onList && x != n {
			return -1, fmt.Errorf("lst.audit: single node is alien")
		}
	}

	seen := false
	var prev *node
	x := l.front
	for x != nil {
		cnt++
		if x.prev != prev {
			return -1, fmt.Errorf("lst.audit: broken .prev linkage")
		}

		if x == n {
			seen = true
		}

		prev = x
		x = x.next
	}

	if prev != l.back {
		return -1, fmt.Errorf("lst.audit: broken .back linkage")
	}

	if onList && !seen {
		return -1, fmt.Errorf("lst.audit: node missing in list")
	}

	if !onList && seen {
		return -1, fmt.Errorf("lst.audit: node should not be on the list")
	}

	return
}
