1package bolt
2
3import (
4	"errors"
5	"fmt"
6	"hash/fnv"
7	"log"
8	"os"
9	"runtime"
10	"sort"
11	"sync"
12	"time"
13	"unsafe"
14)
15
// maxMmapStep is the largest step that can be taken when remapping the mmap.
// mmapSize rounds sizes above 1GB up to a multiple of this value.
const maxMmapStep = 1 << 30 // 1GB

// version is the data file format version written into each meta page.
const version = 2

// magic is a marker value written into each meta page to indicate that a
// file is a Bolt DB.
const magic uint32 = 0xED0CDAED

// pgidNoFreelist is the sentinel freelist page id: when the current meta
// page's freelist field equals this value, hasSyncedFreelist reports false
// and the freelist is rebuilt by scanning the database.
const pgidNoFreelist pgid = 0xffffffffffffffff

// IgnoreNoSync specifies whether the NoSync field of a DB is ignored when
// syncing changes to a file.  This is required as some operating systems,
// such as OpenBSD, do not have a unified buffer cache (UBC) and writes
// must be synchronized using the msync(2) syscall.
const IgnoreNoSync = runtime.GOOS == "openbsd"

// Default values if not set in a DB instance. Open copies each of them
// into the corresponding DB field.
const (
	DefaultMaxBatchSize  int = 1000
	DefaultMaxBatchDelay     = 10 * time.Millisecond
	DefaultAllocSize         = 16 * 1024 * 1024
)

// defaultPageSize is the page size used by Open when Options.PageSize is
// zero; it is the OS page size.
var defaultPageSize = os.Getpagesize()

// flockRetryTimeout is the time elapsed between consecutive file locking
// attempts.
const flockRetryTimeout = 50 * time.Millisecond

// FreelistType is the type of the freelist backend.
type FreelistType string

const (
	// FreelistArrayType indicates backend freelist type is array.
	FreelistArrayType = FreelistType("array")
	// FreelistMapType indicates backend freelist type is hashmap.
	FreelistMapType = FreelistType("hashmap")
)
55
// DB represents a collection of buckets persisted to a file on disk.
// All data access is performed through transactions which can be obtained through the DB.
// All the functions on DB will return a ErrDatabaseNotOpen if accessed before Open() is called.
type DB struct {
	// When enabled, the database will perform a Check() after every commit.
	// A panic is issued if the database is in an inconsistent state. This
	// flag has a large performance impact so it should only be used for
	// debugging purposes.
	StrictMode bool

	// Setting the NoSync flag will cause the database to skip fsync()
	// calls after each commit. This can be useful when bulk loading data
	// into a database and you can restart the bulk load in the event of
	// a system failure or database corruption. Do not set this flag for
	// normal use.
	//
	// If the package global IgnoreNoSync constant is true, this value is
	// ignored.  See the comment on that constant for more details.
	//
	// THIS IS UNSAFE. PLEASE USE WITH CAUTION.
	NoSync bool

	// When true, skips syncing freelist to disk. This improves the database
	// write performance under normal operation, but requires a full database
	// re-sync during recovery.
	NoFreelistSync bool

	// FreelistType sets the backend freelist type. There are two options. Array, which is simple but
	// suffers dramatic performance degradation if the database is large and fragmentation in the freelist
	// is common. The alternative is hashmap, which is faster in almost all circumstances
	// but does not guarantee that it offers the smallest page id available. In the normal case it is safe.
	// The default type is array.
	FreelistType FreelistType

	// When true, skips the truncate call when growing the database.
	// Setting this to true is only safe on non-ext3/ext4 systems.
	// Skipping truncation avoids preallocation of hard drive space and
	// bypasses a truncate() and fsync() syscall on remapping.
	//
	// https://github.com/ooni/psiphon/oopsi/github.com/boltdb/bolt/issues/284
	NoGrowSync bool

	// If you want to read the entire database fast, you can set MmapFlags to
	// syscall.MAP_POPULATE on Linux 2.6.23+ for sequential read-ahead.
	MmapFlags int

	// MaxBatchSize is the maximum size of a batch. Default value is
	// copied from DefaultMaxBatchSize in Open.
	//
	// If <=0, disables batching.
	//
	// Do not change concurrently with calls to Batch.
	MaxBatchSize int

	// MaxBatchDelay is the maximum delay before a batch starts.
	// Default value is copied from DefaultMaxBatchDelay in Open.
	//
	// If <=0, effectively disables batching.
	//
	// Do not change concurrently with calls to Batch.
	MaxBatchDelay time.Duration

	// AllocSize is the amount of space allocated when the database
	// needs to create new pages. This is done to amortize the cost
	// of truncate() and fsync() when growing the data file.
	AllocSize int

	// path is the name of the open data file (returned by Path); openFile is
	// the function Open uses to open it (os.OpenFile unless Options.OpenFile
	// is set) and file is the resulting handle.
	path     string
	openFile func(string, int, os.FileMode) (*os.File, error)
	file     *os.File
	// dataref, data and datasz describe the current memory mapping.
	// NOTE(review): they are presumably populated by the platform-specific
	// mmap helper, which is not visible in this file -- confirm there.
	dataref  []byte // mmap'ed readonly, write throws SEGV
	data     *[maxMapSize]byte
	datasz   int
	filesz   int // current on disk file size
	// meta0 and meta1 reference the meta pages at page ids 0 and 1 inside the
	// mapping; they are refreshed on every (re)mmap.
	meta0    *meta
	meta1    *meta
	pageSize int
	// opened is set when Open starts and cleared by close.
	opened   bool
	// rwtx is the writable transaction most recently started by beginRWTx;
	// txs holds the read-only transactions that are currently open.
	rwtx     *Tx
	txs      []*Tx
	stats    Stats

	// [Psiphon]
	// https://github.com/ooni/psiphon/oopsi/github.com/etcd-io/bbolt/commit/b3e98dcb3752e0a8d5db6503b80fe19e462fdb73
	mmapErr error // set on mmap failure; subsequently returned by all methods

	// freelist is built lazily by loadFreelist; freelistLoad ensures that
	// happens at most once.
	freelist     *freelist
	freelistLoad sync.Once

	// pagePool recycles single-page buffers (see allocate).
	pagePool sync.Pool

	// batchMu guards batch, the batch currently accepting calls from Batch.
	batchMu sync.Mutex
	batch   *batch

	rwlock   sync.Mutex   // Allows only one writer at a time.
	metalock sync.Mutex   // Protects meta page access.
	mmaplock sync.RWMutex // Protects mmap access during remapping.
	statlock sync.RWMutex // Protects stats access.

	// ops holds test hooks; writeAt defaults to the data file's WriteAt.
	ops struct {
		writeAt func(b []byte, off int64) (n int, err error)
	}

	// Read only mode.
	// When true, Update() and Begin(true) return ErrDatabaseReadOnly immediately.
	readOnly bool
}
163
164// Path returns the path to currently open database file.
165func (db *DB) Path() string {
166	return db.path
167}
168
169// GoString returns the Go string representation of the database.
170func (db *DB) GoString() string {
171	return fmt.Sprintf("bolt.DB{path:%q}", db.path)
172}
173
174// String returns the string representation of the database.
175func (db *DB) String() string {
176	return fmt.Sprintf("DB<%q>", db.path)
177}
178
179// Open creates and opens a database at the given path.
180// If the file does not exist then it will be created automatically.
181// Passing in nil options will cause Bolt to open the database with the default options.
182func Open(path string, mode os.FileMode, options *Options) (*DB, error) {
183	db := &DB{
184		opened: true,
185	}
186
187	// [Psiphon]
188	// Ensure cleanup on panic so recovery can reset a locked file.
189	defer func() {
190		if r := recover(); r != nil {
191			_ = db.close()
192			panic(r)
193		}
194	}()
195
196	// Set default options if no options are provided.
197	if options == nil {
198		options = DefaultOptions
199	}
200	db.NoSync = options.NoSync
201	db.NoGrowSync = options.NoGrowSync
202	db.MmapFlags = options.MmapFlags
203	db.NoFreelistSync = options.NoFreelistSync
204	db.FreelistType = options.FreelistType
205
206	// Set default values for later DB operations.
207	db.MaxBatchSize = DefaultMaxBatchSize
208	db.MaxBatchDelay = DefaultMaxBatchDelay
209	db.AllocSize = DefaultAllocSize
210
211	flag := os.O_RDWR
212	if options.ReadOnly {
213		flag = os.O_RDONLY
214		db.readOnly = true
215	}
216
217	db.openFile = options.OpenFile
218	if db.openFile == nil {
219		db.openFile = os.OpenFile
220	}
221
222	// Open data file and separate sync handler for metadata writes.
223	var err error
224	if db.file, err = db.openFile(path, flag|os.O_CREATE, mode); err != nil {
225		_ = db.close()
226		return nil, err
227	}
228	db.path = db.file.Name()
229
230	// Lock file so that other processes using Bolt in read-write mode cannot
231	// use the database  at the same time. This would cause corruption since
232	// the two processes would write meta pages and free pages separately.
233	// The database file is locked exclusively (only one process can grab the lock)
234	// if !options.ReadOnly.
235	// The database file is locked using the shared lock (more than one process may
236	// hold a lock at the same time) otherwise (options.ReadOnly is set).
237	if err := flock(db, !db.readOnly, options.Timeout); err != nil {
238		_ = db.close()
239		return nil, err
240	}
241
242	// Default values for test hooks
243	db.ops.writeAt = db.file.WriteAt
244
245	if db.pageSize = options.PageSize; db.pageSize == 0 {
246		// Set the default page size to the OS page size.
247		db.pageSize = defaultPageSize
248	}
249
250	// Initialize the database if it doesn't exist.
251	if info, err := db.file.Stat(); err != nil {
252		_ = db.close()
253		return nil, err
254	} else if info.Size() == 0 {
255		// Initialize new files with meta pages.
256		if err := db.init(); err != nil {
257			// clean up file descriptor on initialization fail
258			_ = db.close()
259			return nil, err
260		}
261	} else {
262		// Read the first meta page to determine the page size.
263		var buf [0x1000]byte
264		// If we can't read the page size, but can read a page, assume
265		// it's the same as the OS or one given -- since that's how the
266		// page size was chosen in the first place.
267		//
268		// If the first page is invalid and this OS uses a different
269		// page size than what the database was created with then we
270		// are out of luck and cannot access the database.
271		//
272		// TODO: scan for next page
273		if bw, err := db.file.ReadAt(buf[:], 0); err == nil && bw == len(buf) {
274			if m := db.pageInBuffer(buf[:], 0).meta(); m.validate() == nil {
275				db.pageSize = int(m.pageSize)
276			}
277		} else {
278			_ = db.close()
279			return nil, ErrInvalid
280		}
281	}
282
283	// Initialize page pool.
284	db.pagePool = sync.Pool{
285		New: func() interface{} {
286			return make([]byte, db.pageSize)
287		},
288	}
289
290	// Memory map the data file.
291	if err := db.mmap(options.InitialMmapSize); err != nil {
292		_ = db.close()
293		return nil, err
294	}
295
296	if db.readOnly {
297		return db, nil
298	}
299
300	db.loadFreelist()
301
302	// Flush freelist when transitioning from no sync to sync so
303	// NoFreelistSync unaware boltdb can open the db later.
304	if !db.NoFreelistSync && !db.hasSyncedFreelist() {
305		tx, err := db.Begin(true)
306		if tx != nil {
307			err = tx.Commit()
308		}
309		if err != nil {
310			_ = db.close()
311			return nil, err
312		}
313	}
314
315	// Mark the database as opened and return.
316	return db, nil
317}
318
319// loadFreelist reads the freelist if it is synced, or reconstructs it
320// by scanning the DB if it is not synced. It assumes there are no
321// concurrent accesses being made to the freelist.
322func (db *DB) loadFreelist() {
323	db.freelistLoad.Do(func() {
324		db.freelist = newFreelist(db.FreelistType)
325		if !db.hasSyncedFreelist() {
326			// Reconstruct free list by scanning the DB.
327			db.freelist.readIDs(db.freepages())
328		} else {
329			// Read free list from freelist page.
330			db.freelist.read(db.page(db.meta().freelist))
331		}
332		db.stats.FreePageN = db.freelist.free_count()
333	})
334}
335
336func (db *DB) hasSyncedFreelist() bool {
337	return db.meta().freelist != pgidNoFreelist
338}
339
340// mmap opens the underlying memory-mapped file and initializes the meta references.
341// minsz is the minimum size that the new mmap can be.
342func (db *DB) mmap(minsz int) error {
343	db.mmaplock.Lock()
344	defer db.mmaplock.Unlock()
345
346	info, err := db.file.Stat()
347	if err != nil {
348		return fmt.Errorf("mmap stat error: %s", err)
349	} else if int(info.Size()) < db.pageSize*2 {
350		return fmt.Errorf("file size too small")
351	}
352
353	// Ensure the size is at least the minimum size.
354	var size = int(info.Size())
355	if size < minsz {
356		size = minsz
357	}
358	size, err = db.mmapSize(size)
359	if err != nil {
360		return err
361	}
362
363	// Dereference all mmap references before unmapping.
364	if db.rwtx != nil {
365		db.rwtx.root.dereference()
366	}
367
368	// Unmap existing data before continuing.
369	if err := db.munmap(); err != nil {
370		return err
371	}
372
373	// Memory-map the data file as a byte slice.
374	if err := mmap(db, size); err != nil {
375
376		// [Psiphon]
377		// https://github.com/ooni/psiphon/oopsi/github.com/etcd-io/bbolt/commit/b3e98dcb3752e0a8d5db6503b80fe19e462fdb73
378		// If mmap fails, we cannot safely continue. Mark the db as unusable,
379		// causing all future calls to return the mmap error.
380		db.mmapErr = MmapError(err.Error())
381		return db.mmapErr
382	}
383
384	// Save references to the meta pages.
385	db.meta0 = db.page(0).meta()
386	db.meta1 = db.page(1).meta()
387
388	// Validate the meta pages. We only return an error if both meta pages fail
389	// validation, since meta0 failing validation means that it wasn't saved
390	// properly -- but we can recover using meta1. And vice-versa.
391	err0 := db.meta0.validate()
392	err1 := db.meta1.validate()
393	if err0 != nil && err1 != nil {
394		return err0
395	}
396
397	return nil
398}
399
400// munmap unmaps the data file from memory.
401func (db *DB) munmap() error {
402	if err := munmap(db); err != nil {
403		return fmt.Errorf("unmap error: " + err.Error())
404	}
405	return nil
406}
407
408// mmapSize determines the appropriate size for the mmap given the current size
409// of the database. The minimum size is 32KB and doubles until it reaches 1GB.
410// Returns an error if the new mmap size is greater than the max allowed.
411func (db *DB) mmapSize(size int) (int, error) {
412	// Double the size from 32KB until 1GB.
413	for i := uint(15); i <= 30; i++ {
414		if size <= 1<<i {
415			return 1 << i, nil
416		}
417	}
418
419	// Verify the requested size is not above the maximum allowed.
420	if size > maxMapSize {
421		return 0, fmt.Errorf("mmap too large")
422	}
423
424	// If larger than 1GB then grow by 1GB at a time.
425	sz := int64(size)
426	if remainder := sz % int64(maxMmapStep); remainder > 0 {
427		sz += int64(maxMmapStep) - remainder
428	}
429
430	// Ensure that the mmap size is a multiple of the page size.
431	// This should always be true since we're incrementing in MBs.
432	pageSize := int64(db.pageSize)
433	if (sz % pageSize) != 0 {
434		sz = ((sz / pageSize) + 1) * pageSize
435	}
436
437	// If we've exceeded the max size then only grow up to the max size.
438	if sz > maxMapSize {
439		sz = maxMapSize
440	}
441
442	return int(sz), nil
443}
444
445// init creates a new database file and initializes its meta pages.
446func (db *DB) init() error {
447	// Create two meta pages on a buffer.
448	buf := make([]byte, db.pageSize*4)
449	for i := 0; i < 2; i++ {
450		p := db.pageInBuffer(buf[:], pgid(i))
451		p.id = pgid(i)
452		p.flags = metaPageFlag
453
454		// Initialize the meta page.
455		m := p.meta()
456		m.magic = magic
457		m.version = version
458		m.pageSize = uint32(db.pageSize)
459		m.freelist = 2
460		m.root = bucket{root: 3}
461		m.pgid = 4
462		m.txid = txid(i)
463		m.checksum = m.sum64()
464	}
465
466	// Write an empty freelist at page 3.
467	p := db.pageInBuffer(buf[:], pgid(2))
468	p.id = pgid(2)
469	p.flags = freelistPageFlag
470	p.count = 0
471
472	// Write an empty leaf page at page 4.
473	p = db.pageInBuffer(buf[:], pgid(3))
474	p.id = pgid(3)
475	p.flags = leafPageFlag
476	p.count = 0
477
478	// Write the buffer to our data file.
479	if _, err := db.ops.writeAt(buf, 0); err != nil {
480		return err
481	}
482	if err := fdatasync(db); err != nil {
483		return err
484	}
485
486	return nil
487}
488
489// Close releases all database resources.
490// It will block waiting for any open transactions to finish
491// before closing the database and returning.
492func (db *DB) Close() error {
493	db.rwlock.Lock()
494	defer db.rwlock.Unlock()
495
496	db.metalock.Lock()
497	defer db.metalock.Unlock()
498
499	db.mmaplock.Lock()
500	defer db.mmaplock.Unlock()
501
502	return db.close()
503}
504
505func (db *DB) close() error {
506	if !db.opened {
507		return nil
508	}
509
510	db.opened = false
511
512	db.freelist = nil
513
514	// Clear ops.
515	db.ops.writeAt = nil
516
517	// Close the mmap.
518	if err := db.munmap(); err != nil {
519		return err
520	}
521
522	// Close file handles.
523	if db.file != nil {
524		// No need to unlock read-only file.
525		if !db.readOnly {
526			// Unlock the file.
527			if err := funlock(db); err != nil {
528				log.Printf("bolt.Close(): funlock error: %s", err)
529			}
530		}
531
532		// Close the file descriptor.
533		if err := db.file.Close(); err != nil {
534			return fmt.Errorf("db file close: %s", err)
535		}
536		db.file = nil
537	}
538
539	db.path = ""
540	return nil
541}
542
543// Begin starts a new transaction.
544// Multiple read-only transactions can be used concurrently but only one
545// write transaction can be used at a time. Starting multiple write transactions
546// will cause the calls to block and be serialized until the current write
547// transaction finishes.
548//
549// Transactions should not be dependent on one another. Opening a read
550// transaction and a write transaction in the same goroutine can cause the
551// writer to deadlock because the database periodically needs to re-mmap itself
552// as it grows and it cannot do that while a read transaction is open.
553//
554// If a long running read transaction (for example, a snapshot transaction) is
555// needed, you might want to set DB.InitialMmapSize to a large enough value
556// to avoid potential blocking of write transaction.
557//
558// IMPORTANT: You must close read-only transactions after you are finished or
559// else the database will not reclaim old pages.
560func (db *DB) Begin(writable bool) (*Tx, error) {
561	if writable {
562		return db.beginRWTx()
563	}
564	return db.beginTx()
565}
566
567func (db *DB) beginTx() (*Tx, error) {
568	// Lock the meta pages while we initialize the transaction. We obtain
569	// the meta lock before the mmap lock because that's the order that the
570	// write transaction will obtain them.
571	db.metalock.Lock()
572
573	// Obtain a read-only lock on the mmap. When the mmap is remapped it will
574	// obtain a write lock so all transactions must finish before it can be
575	// remapped.
576	db.mmaplock.RLock()
577
578	// Exit if the database is not open yet.
579	if !db.opened {
580		db.mmaplock.RUnlock()
581		db.metalock.Unlock()
582		return nil, ErrDatabaseNotOpen
583	}
584
585	// [Psiphon]
586	// https://github.com/ooni/psiphon/oopsi/github.com/etcd-io/bbolt/commit/b3e98dcb3752e0a8d5db6503b80fe19e462fdb73
587	// Return mmap error if a previous mmap failed.
588	if db.mmapErr != nil {
589		db.mmaplock.RUnlock()
590		db.metalock.Unlock()
591		return nil, db.mmapErr
592	}
593
594	// Create a transaction associated with the database.
595	t := &Tx{}
596	t.init(db)
597
598	// Keep track of transaction until it closes.
599	db.txs = append(db.txs, t)
600	n := len(db.txs)
601
602	// Unlock the meta pages.
603	db.metalock.Unlock()
604
605	// Update the transaction stats.
606	db.statlock.Lock()
607	db.stats.TxN++
608	db.stats.OpenTxN = n
609	db.statlock.Unlock()
610
611	return t, nil
612}
613
614func (db *DB) beginRWTx() (*Tx, error) {
615	// If the database was opened with Options.ReadOnly, return an error.
616	if db.readOnly {
617		return nil, ErrDatabaseReadOnly
618	}
619
620	// Obtain writer lock. This is released by the transaction when it closes.
621	// This enforces only one writer transaction at a time.
622	db.rwlock.Lock()
623
624	// Once we have the writer lock then we can lock the meta pages so that
625	// we can set up the transaction.
626	db.metalock.Lock()
627	defer db.metalock.Unlock()
628
629	// Exit if the database is not open yet.
630	if !db.opened {
631		db.rwlock.Unlock()
632		return nil, ErrDatabaseNotOpen
633	}
634
635	// [Psiphon]
636	// https://github.com/ooni/psiphon/oopsi/github.com/etcd-io/bbolt/commit/b3e98dcb3752e0a8d5db6503b80fe19e462fdb73
637	// Return mmap error if a previous mmap failed.
638	if db.mmapErr != nil {
639		db.rwlock.Unlock()
640		return nil, db.mmapErr
641	}
642
643	// Create a transaction associated with the database.
644	t := &Tx{writable: true}
645	t.init(db)
646	db.rwtx = t
647	db.freePages()
648	return t, nil
649}
650
651// freePages releases any pages associated with closed read-only transactions.
652func (db *DB) freePages() {
653	// Free all pending pages prior to earliest open transaction.
654	sort.Sort(txsById(db.txs))
655	minid := txid(0xFFFFFFFFFFFFFFFF)
656	if len(db.txs) > 0 {
657		minid = db.txs[0].meta.txid
658	}
659	if minid > 0 {
660		db.freelist.release(minid - 1)
661	}
662	// Release unused txid extents.
663	for _, t := range db.txs {
664		db.freelist.releaseRange(minid, t.meta.txid-1)
665		minid = t.meta.txid + 1
666	}
667	db.freelist.releaseRange(minid, txid(0xFFFFFFFFFFFFFFFF))
668	// Any page both allocated and freed in an extent is safe to release.
669}
670
671type txsById []*Tx
672
673func (t txsById) Len() int           { return len(t) }
674func (t txsById) Swap(i, j int)      { t[i], t[j] = t[j], t[i] }
675func (t txsById) Less(i, j int) bool { return t[i].meta.txid < t[j].meta.txid }
676
677// removeTx removes a transaction from the database.
678func (db *DB) removeTx(tx *Tx) {
679	// Release the read lock on the mmap.
680	db.mmaplock.RUnlock()
681
682	// Use the meta lock to restrict access to the DB object.
683	db.metalock.Lock()
684
685	// Remove the transaction.
686	for i, t := range db.txs {
687		if t == tx {
688			last := len(db.txs) - 1
689			db.txs[i] = db.txs[last]
690			db.txs[last] = nil
691			db.txs = db.txs[:last]
692			break
693		}
694	}
695	n := len(db.txs)
696
697	// Unlock the meta pages.
698	db.metalock.Unlock()
699
700	// Merge statistics.
701	db.statlock.Lock()
702	db.stats.OpenTxN = n
703	db.stats.TxStats.add(&tx.stats)
704	db.statlock.Unlock()
705}
706
707// Update executes a function within the context of a read-write managed transaction.
708// If no error is returned from the function then the transaction is committed.
709// If an error is returned then the entire transaction is rolled back.
710// Any error that is returned from the function or returned from the commit is
711// returned from the Update() method.
712//
713// Attempting to manually commit or rollback within the function will cause a panic.
714func (db *DB) Update(fn func(*Tx) error) error {
715	t, err := db.Begin(true)
716	if err != nil {
717		return err
718	}
719
720	// Make sure the transaction rolls back in the event of a panic.
721	defer func() {
722		if t.db != nil {
723			t.rollback()
724		}
725	}()
726
727	// Mark as a managed tx so that the inner function cannot manually commit.
728	t.managed = true
729
730	// If an error is returned from the function then rollback and return error.
731	err = fn(t)
732	t.managed = false
733	if err != nil {
734		_ = t.Rollback()
735		return err
736	}
737
738	return t.Commit()
739}
740
741// View executes a function within the context of a managed read-only transaction.
742// Any error that is returned from the function is returned from the View() method.
743//
744// Attempting to manually rollback within the function will cause a panic.
745func (db *DB) View(fn func(*Tx) error) error {
746	t, err := db.Begin(false)
747	if err != nil {
748		return err
749	}
750
751	// Make sure the transaction rolls back in the event of a panic.
752	defer func() {
753		if t.db != nil {
754			t.rollback()
755		}
756	}()
757
758	// Mark as a managed tx so that the inner function cannot manually rollback.
759	t.managed = true
760
761	// If an error is returned from the function then pass it through.
762	err = fn(t)
763	t.managed = false
764	if err != nil {
765		_ = t.Rollback()
766		return err
767	}
768
769	return t.Rollback()
770}
771
772// Batch calls fn as part of a batch. It behaves similar to Update,
773// except:
774//
775// 1. concurrent Batch calls can be combined into a single Bolt
776// transaction.
777//
778// 2. the function passed to Batch may be called multiple times,
779// regardless of whether it returns error or not.
780//
781// This means that Batch function side effects must be idempotent and
782// take permanent effect only after a successful return is seen in
783// caller.
784//
785// The maximum batch size and delay can be adjusted with DB.MaxBatchSize
786// and DB.MaxBatchDelay, respectively.
787//
788// Batch is only useful when there are multiple goroutines calling it.
789func (db *DB) Batch(fn func(*Tx) error) error {
790	errCh := make(chan error, 1)
791
792	db.batchMu.Lock()
793	if (db.batch == nil) || (db.batch != nil && len(db.batch.calls) >= db.MaxBatchSize) {
794		// There is no existing batch, or the existing batch is full; start a new one.
795		db.batch = &batch{
796			db: db,
797		}
798		db.batch.timer = time.AfterFunc(db.MaxBatchDelay, db.batch.trigger)
799	}
800	db.batch.calls = append(db.batch.calls, call{fn: fn, err: errCh})
801	if len(db.batch.calls) >= db.MaxBatchSize {
802		// wake up batch, it's ready to run
803		go db.batch.trigger()
804	}
805	db.batchMu.Unlock()
806
807	err := <-errCh
808	if err == trySolo {
809		err = db.Update(fn)
810	}
811	return err
812}
813
// call is one function queued by DB.Batch together with the channel on
// which its result is delivered.
type call struct {
	fn  func(*Tx) error
	err chan<- error
}

// batch is a group of calls that run together in one Update transaction.
type batch struct {
	db    *DB
	timer *time.Timer // fires trigger after DB.MaxBatchDelay
	start sync.Once   // ensures run executes at most once
	calls []call
}
825
826// trigger runs the batch if it hasn't already been run.
827func (b *batch) trigger() {
828	b.start.Do(b.run)
829}
830
831// run performs the transactions in the batch and communicates results
832// back to DB.Batch.
833func (b *batch) run() {
834	b.db.batchMu.Lock()
835	b.timer.Stop()
836	// Make sure no new work is added to this batch, but don't break
837	// other batches.
838	if b.db.batch == b {
839		b.db.batch = nil
840	}
841	b.db.batchMu.Unlock()
842
843retry:
844	for len(b.calls) > 0 {
845		var failIdx = -1
846		err := b.db.Update(func(tx *Tx) error {
847			for i, c := range b.calls {
848				if err := safelyCall(c.fn, tx); err != nil {
849					failIdx = i
850					return err
851				}
852			}
853			return nil
854		})
855
856		if failIdx >= 0 {
857			// take the failing transaction out of the batch. it's
858			// safe to shorten b.calls here because db.batch no longer
859			// points to us, and we hold the mutex anyway.
860			c := b.calls[failIdx]
861			b.calls[failIdx], b.calls = b.calls[len(b.calls)-1], b.calls[:len(b.calls)-1]
862			// tell the submitter re-run it solo, continue with the rest of the batch
863			c.err <- trySolo
864			continue retry
865		}
866
867		// pass success, or bolt internal errors, to all callers
868		for _, c := range b.calls {
869			c.err <- err
870		}
871		break retry
872	}
873}
874
875// trySolo is a special sentinel error value used for signaling that a
876// transaction function should be re-run. It should never be seen by
877// callers.
878var trySolo = errors.New("batch function returned an error and should be re-run solo")
879
880type panicked struct {
881	reason interface{}
882}
883
884func (p panicked) Error() string {
885	if err, ok := p.reason.(error); ok {
886		return err.Error()
887	}
888	return fmt.Sprintf("panic: %v", p.reason)
889}
890
891func safelyCall(fn func(*Tx) error, tx *Tx) (err error) {
892	defer func() {
893		if p := recover(); p != nil {
894			err = panicked{p}
895		}
896	}()
897	return fn(tx)
898}
899
900// Sync executes fdatasync() against the database file handle.
901//
902// This is not necessary under normal operation, however, if you use NoSync
903// then it allows you to force the database file to sync against the disk.
904func (db *DB) Sync() error { return fdatasync(db) }
905
906// Stats retrieves ongoing performance stats for the database.
907// This is only updated when a transaction closes.
908func (db *DB) Stats() Stats {
909	db.statlock.RLock()
910	defer db.statlock.RUnlock()
911	return db.stats
912}
913
914// This is for internal access to the raw data bytes from the C cursor, use
915// carefully, or not at all.
916func (db *DB) Info() *Info {
917	return &Info{uintptr(unsafe.Pointer(&db.data[0])), db.pageSize}
918}
919
920// page retrieves a page reference from the mmap based on the current page size.
921func (db *DB) page(id pgid) *page {
922	pos := id * pgid(db.pageSize)
923	return (*page)(unsafe.Pointer(&db.data[pos]))
924}
925
926// pageInBuffer retrieves a page reference from a given byte array based on the current page size.
927func (db *DB) pageInBuffer(b []byte, id pgid) *page {
928	return (*page)(unsafe.Pointer(&b[id*pgid(db.pageSize)]))
929}
930
931// meta retrieves the current meta page reference.
932func (db *DB) meta() *meta {
933	// We have to return the meta with the highest txid which doesn't fail
934	// validation. Otherwise, we can cause errors when in fact the database is
935	// in a consistent state. metaA is the one with the higher txid.
936	metaA := db.meta0
937	metaB := db.meta1
938	if db.meta1.txid > db.meta0.txid {
939		metaA = db.meta1
940		metaB = db.meta0
941	}
942
943	// Use higher meta page if valid. Otherwise fallback to previous, if valid.
944	if err := metaA.validate(); err == nil {
945		return metaA
946	} else if err := metaB.validate(); err == nil {
947		return metaB
948	}
949
950	// This should never be reached, because both meta1 and meta0 were validated
951	// on mmap() and we do fsync() on every write.
952	panic("bolt.DB.meta(): invalid meta pages")
953}
954
955// allocate returns a contiguous block of memory starting at a given page.
956func (db *DB) allocate(txid txid, count int) (*page, error) {
957	// Allocate a temporary buffer for the page.
958	var buf []byte
959	if count == 1 {
960		buf = db.pagePool.Get().([]byte)
961	} else {
962		buf = make([]byte, count*db.pageSize)
963	}
964	p := (*page)(unsafe.Pointer(&buf[0]))
965	p.overflow = uint32(count - 1)
966
967	// Use pages from the freelist if they are available.
968	if p.id = db.freelist.allocate(txid, count); p.id != 0 {
969		return p, nil
970	}
971
972	// Resize mmap() if we're at the end.
973	p.id = db.rwtx.meta.pgid
974	var minsz = int((p.id+pgid(count))+1) * db.pageSize
975	if minsz >= db.datasz {
976		if err := db.mmap(minsz); err != nil {
977			return nil, fmt.Errorf("mmap allocate error: %s", err)
978		}
979	}
980
981	// Move the page id high water mark.
982	db.rwtx.meta.pgid += pgid(count)
983
984	return p, nil
985}
986
987// grow grows the size of the database to the given sz.
988func (db *DB) grow(sz int) error {
989	// Ignore if the new size is less than available file size.
990	if sz <= db.filesz {
991		return nil
992	}
993
994	// If the data is smaller than the alloc size then only allocate what's needed.
995	// Once it goes over the allocation size then allocate in chunks.
996	if db.datasz < db.AllocSize {
997		sz = db.datasz
998	} else {
999		sz += db.AllocSize
1000	}
1001
1002	// Truncate and fsync to ensure file size metadata is flushed.
1003	// https://github.com/ooni/psiphon/oopsi/github.com/boltdb/bolt/issues/284
1004	if !db.NoGrowSync && !db.readOnly {
1005		if runtime.GOOS != "windows" {
1006			if err := db.file.Truncate(int64(sz)); err != nil {
1007				return fmt.Errorf("file resize error: %s", err)
1008			}
1009		}
1010		if err := db.file.Sync(); err != nil {
1011			return fmt.Errorf("file sync error: %s", err)
1012		}
1013	}
1014
1015	db.filesz = sz
1016	return nil
1017}
1018
1019func (db *DB) IsReadOnly() bool {
1020	return db.readOnly
1021}
1022
1023func (db *DB) freepages() []pgid {
1024	tx, err := db.beginTx()
1025	defer func() {
1026		err = tx.Rollback()
1027		if err != nil {
1028			panic("freepages: failed to rollback tx")
1029		}
1030	}()
1031	if err != nil {
1032		panic("freepages: failed to open read only tx")
1033	}
1034
1035	reachable := make(map[pgid]*page)
1036	nofreed := make(map[pgid]bool)
1037
1038	// [Psiphon]
1039	// Use single-error checkBucket.
1040	err = tx.checkBucket(&tx.root, reachable, nofreed)
1041	if err != nil {
1042		panic(fmt.Sprintf("freepages: failed to get all reachable pages (%s)", err))
1043	}
1044
1045	var fids []pgid
1046	for i := pgid(2); i < db.meta().pgid; i++ {
1047		if _, ok := reachable[i]; !ok {
1048			fids = append(fids, i)
1049		}
1050	}
1051	return fids
1052}
1053
// Options represents the options that can be set when opening a database.
type Options struct {
	// Timeout is the amount of time to wait to obtain a file lock.
	// When set to zero it will wait indefinitely. This option is only
	// available on Darwin and Linux.
	Timeout time.Duration

	// NoGrowSync sets the DB.NoGrowSync flag before memory mapping the file.
	NoGrowSync bool

	// NoFreelistSync disables syncing the freelist to disk. This improves
	// database write performance under normal operation, but requires a full
	// database re-sync during recovery.
	NoFreelistSync bool

	// FreelistType sets the backend freelist type. There are two options.
	// The array type is simple but suffers dramatic performance degradation
	// when the database is large and fragmentation in the freelist is common.
	// The alternative, hashmap, is faster in almost all circumstances but
	// does not guarantee that it offers the smallest page id available.
	// In the normal case it is safe.
	// The default type is array.
	FreelistType FreelistType

	// ReadOnly opens the database in read-only mode. Uses
	// flock(..., LOCK_SH |LOCK_NB) to grab a shared lock (UNIX).
	ReadOnly bool

	// MmapFlags sets the DB.MmapFlags flag before memory mapping the file.
	MmapFlags int

	// InitialMmapSize is the initial mmap size of the database
	// in bytes. Read transactions won't block write transactions
	// if the InitialMmapSize is large enough to hold the database mmap
	// size. (See DB.Begin for more information.)
	//
	// If <=0, the initial map size is 0.
	// If InitialMmapSize is smaller than the previous database size,
	// it has no effect.
	InitialMmapSize int

	// PageSize overrides the default OS page size.
	PageSize int

	// NoSync sets the initial value of DB.NoSync. Normally this can just be
	// set directly on the DB itself when returned from Open(), but this option
	// is useful in APIs which expose Options but not the underlying DB.
	NoSync bool

	// OpenFile is used to open files. It defaults to os.OpenFile. This option
	// is useful for writing hermetic tests.
	OpenFile func(string, int, os.FileMode) (*os.File, error)
}
1104
// DefaultOptions represents the options used if nil options are passed into Open().
// No timeout is used, which will cause Bolt to wait indefinitely for a lock.
// The freelist backend defaults to the array type.
var DefaultOptions = &Options{
	Timeout:      0,
	NoGrowSync:   false,
	FreelistType: FreelistArrayType,
}
1112
// Stats represents statistics about the database.
// Use Sub to compute the difference between two snapshots.
type Stats struct {
	// Freelist stats
	FreePageN     int // total number of free pages on the freelist
	PendingPageN  int // total number of pending pages on the freelist
	FreeAlloc     int // total bytes allocated in free pages
	FreelistInuse int // total bytes used by the freelist

	// Transaction stats
	TxN     int // total number of started read transactions
	OpenTxN int // number of currently open read transactions

	TxStats TxStats // global, ongoing stats.
}
1127
1128// Sub calculates and returns the difference between two sets of database stats.
1129// This is useful when obtaining stats at two different points and time and
1130// you need the performance counters that occurred within that time span.
1131func (s *Stats) Sub(other *Stats) Stats {
1132	if other == nil {
1133		return *s
1134	}
1135	var diff Stats
1136	diff.FreePageN = s.FreePageN
1137	diff.PendingPageN = s.PendingPageN
1138	diff.FreeAlloc = s.FreeAlloc
1139	diff.FreelistInuse = s.FreelistInuse
1140	diff.TxN = s.TxN - other.TxN
1141	diff.TxStats = s.TxStats.Sub(&other.TxStats)
1142	return diff
1143}
1144
// Info carries a raw data address together with a page size.
// NOTE(review): presumably returned by a DB accessor, with Data being the
// address of the start of the memory-mapped file — confirm against the
// producer, which is outside this part of the file.
type Info struct {
	Data     uintptr
	PageSize int
}
1149
// meta is the content of a database meta page. write copies it verbatim onto
// a page, and sum64 hashes the raw bytes of every field that precedes
// checksum, so the order and size of the fields must not change.
type meta struct {
	magic    uint32 // must equal the magic constant (checked by validate)
	version  uint32 // must equal the version constant (checked by validate)
	pageSize uint32 // NOTE(review): presumably the page size of the data file — confirm
	flags    uint32 // not read or written in this part of the file
	root     bucket // root bucket; root.root must be below pgid (enforced by write)
	freelist pgid   // freelist page id or pgidNoFreelist; otherwise must be below pgid (enforced by write)
	pgid     pgid   // page id high water mark
	txid     txid   // transaction id; txid % 2 picks the meta page id in write
	checksum uint64 // sum64 of the preceding fields; zero disables the check in validate
}
1161
1162// validate checks the marker bytes and version of the meta page to ensure it matches this binary.
1163func (m *meta) validate() error {
1164	if m.magic != magic {
1165		return ErrInvalid
1166	} else if m.version != version {
1167		return ErrVersionMismatch
1168	} else if m.checksum != 0 && m.checksum != m.sum64() {
1169		return ErrChecksum
1170	}
1171	return nil
1172}
1173
// copy copies one meta object to another.
// Every field of *dest is overwritten with the corresponding field of *m.
func (m *meta) copy(dest *meta) {
	*dest = *m
}
1178
1179// write writes the meta onto a page.
1180func (m *meta) write(p *page) {
1181	if m.root.root >= m.pgid {
1182		panic(fmt.Sprintf("root bucket pgid (%d) above high water mark (%d)", m.root.root, m.pgid))
1183	} else if m.freelist >= m.pgid && m.freelist != pgidNoFreelist {
1184		// TODO: reject pgidNoFreeList if !NoFreelistSync
1185		panic(fmt.Sprintf("freelist pgid (%d) above high water mark (%d)", m.freelist, m.pgid))
1186	}
1187
1188	// Page id is either going to be 0 or 1 which we can determine by the transaction ID.
1189	p.id = pgid(m.txid % 2)
1190	p.flags |= metaPageFlag
1191
1192	// Calculate the checksum.
1193	m.checksum = m.sum64()
1194
1195	m.copy(p.meta())
1196}
1197
1198// generates the checksum for the meta.
1199func (m *meta) sum64() uint64 {
1200	var h = fnv.New64a()
1201	_, _ = h.Write((*[unsafe.Offsetof(meta{}.checksum)]byte)(unsafe.Pointer(m))[:])
1202	return h.Sum64()
1203}
1204
// _assert panics when condition is false. The panic value is the string
// "assertion failed: " followed by msg formatted with v in fmt.Sprintf style.
// It does nothing when condition is true.
func _assert(condition bool, msg string, v ...interface{}) {
	if condition {
		return
	}
	panic(fmt.Sprintf("assertion failed: "+msg, v...))
}
1211