package bolt

import (
	"errors"
	"fmt"
	"hash/fnv"
	"log"
	"os"
	"runtime"
	"sort"
	"sync"
	"time"
	"unsafe"
)

// The largest step that can be taken when remapping the mmap.
const maxMmapStep = 1 << 30 // 1GB

// The data file format version.
const version = 2

// Represents a marker value to indicate that a file is a Bolt DB.
const magic uint32 = 0xED0CDAED

// pgidNoFreelist is the freelist page id recorded in a meta page when no
// freelist page is stored; hasSyncedFreelist compares the meta's freelist
// field against it.
const pgidNoFreelist pgid = 0xffffffffffffffff

// IgnoreNoSync specifies whether the NoSync field of a DB is ignored when
// syncing changes to a file. This is required as some operating systems,
// such as OpenBSD, do not have a unified buffer cache (UBC) and writes
// must be synchronized using the msync(2) syscall.
const IgnoreNoSync = runtime.GOOS == "openbsd"

// Default values if not set in a DB instance.
const (
	DefaultMaxBatchSize  int = 1000
	DefaultMaxBatchDelay     = 10 * time.Millisecond
	DefaultAllocSize         = 16 * 1024 * 1024
)

// defaultPageSize is the page size used when Options.PageSize is zero; it is
// set to the OS page size.
var defaultPageSize = os.Getpagesize()

// The time elapsed between consecutive file locking attempts.
const flockRetryTimeout = 50 * time.Millisecond

// FreelistType is the type of the freelist backend.
type FreelistType string

const (
	// FreelistArrayType indicates backend freelist type is array.
	FreelistArrayType = FreelistType("array")
	// FreelistMapType indicates backend freelist type is hashmap.
	FreelistMapType = FreelistType("hashmap")
)

// DB represents a collection of buckets persisted to a file on disk.
// All data access is performed through transactions which can be obtained through the DB.
// All the functions on DB will return ErrDatabaseNotOpen if accessed before Open() is called.
type DB struct {
	// When enabled, the database will perform a Check() after every commit.
	// A panic is issued if the database is in an inconsistent state. This
	// flag has a large performance impact so it should only be used for
	// debugging purposes.
	StrictMode bool

	// Setting the NoSync flag will cause the database to skip fsync()
	// calls after each commit. This can be useful when bulk loading data
	// into a database and you can restart the bulk load in the event of
	// a system failure or database corruption. Do not set this flag for
	// normal use.
	//
	// If the package global IgnoreNoSync constant is true, this value is
	// ignored. See the comment on that constant for more details.
	//
	// THIS IS UNSAFE. PLEASE USE WITH CAUTION.
	NoSync bool

	// When true, skips syncing freelist to disk. This improves the database
	// write performance under normal operation, but requires a full database
	// re-sync during recovery.
	NoFreelistSync bool

	// FreelistType sets the backend freelist type. There are two options. Array which is simple but endures
	// dramatic performance degradation if database is large and fragmentation in freelist is common.
	// The alternative one is using hashmap, it is faster in almost all circumstances
	// but it doesn't guarantee that it offers the smallest page id available. In normal case it is safe.
	// The default type is array.
	FreelistType FreelistType

	// When true, skips the truncate call when growing the database.
	// Setting this to true is only safe on non-ext3/ext4 systems.
	// Skipping truncation avoids preallocation of hard drive space and
	// bypasses a truncate() and fsync() syscall on remapping.
	//
	// https://github.com/ooni/psiphon/oopsi/github.com/boltdb/bolt/issues/284
	NoGrowSync bool

	// If you want to read the entire database fast, you can set MmapFlags to
	// syscall.MAP_POPULATE on Linux 2.6.23+ for sequential read-ahead.
	MmapFlags int

	// MaxBatchSize is the maximum size of a batch. Default value is
	// copied from DefaultMaxBatchSize in Open.
	//
	// If <=0, disables batching.
	//
	// Do not change concurrently with calls to Batch.
	MaxBatchSize int

	// MaxBatchDelay is the maximum delay before a batch starts.
	// Default value is copied from DefaultMaxBatchDelay in Open.
	//
	// If <=0, effectively disables batching.
	//
	// Do not change concurrently with calls to Batch.
	MaxBatchDelay time.Duration

	// AllocSize is the amount of space allocated when the database
	// needs to create new pages. This is done to amortize the cost
	// of truncate() and fsync() when growing the data file.
	AllocSize int

	path     string
	openFile func(string, int, os.FileMode) (*os.File, error)
	file     *os.File
	dataref  []byte // mmap'ed readonly, write throws SEGV
	data     *[maxMapSize]byte
	datasz   int
	filesz   int // current on disk file size
	meta0    *meta
	meta1    *meta
	pageSize int
	opened   bool
	rwtx     *Tx
	txs      []*Tx
	stats    Stats

	// [Psiphon]
	// https://github.com/ooni/psiphon/oopsi/github.com/etcd-io/bbolt/commit/b3e98dcb3752e0a8d5db6503b80fe19e462fdb73
	mmapErr error // set on mmap failure; subsequently returned by all methods

	freelist     *freelist
	freelistLoad sync.Once

	pagePool sync.Pool

	batchMu sync.Mutex
	batch   *batch

	rwlock   sync.Mutex   // Allows only one writer at a time.
	metalock sync.Mutex   // Protects meta page access.
	mmaplock sync.RWMutex // Protects mmap access during remapping.
	statlock sync.RWMutex // Protects stats access.

	// ops holds I/O hooks; Open points writeAt at the data file's WriteAt
	// ("Default values for test hooks").
	ops struct {
		writeAt func(b []byte, off int64) (n int, err error)
	}

	// Read only mode.
	// When true, Update() and Begin(true) return ErrDatabaseReadOnly immediately.
	readOnly bool
}

// Path returns the path to currently open database file.
func (db *DB) Path() string {
	return db.path
}

// GoString returns the Go string representation of the database.
170func (db *DB) GoString() string { 171 return fmt.Sprintf("bolt.DB{path:%q}", db.path) 172} 173 174// String returns the string representation of the database. 175func (db *DB) String() string { 176 return fmt.Sprintf("DB<%q>", db.path) 177} 178 179// Open creates and opens a database at the given path. 180// If the file does not exist then it will be created automatically. 181// Passing in nil options will cause Bolt to open the database with the default options. 182func Open(path string, mode os.FileMode, options *Options) (*DB, error) { 183 db := &DB{ 184 opened: true, 185 } 186 187 // [Psiphon] 188 // Ensure cleanup on panic so recovery can reset a locked file. 189 defer func() { 190 if r := recover(); r != nil { 191 _ = db.close() 192 panic(r) 193 } 194 }() 195 196 // Set default options if no options are provided. 197 if options == nil { 198 options = DefaultOptions 199 } 200 db.NoSync = options.NoSync 201 db.NoGrowSync = options.NoGrowSync 202 db.MmapFlags = options.MmapFlags 203 db.NoFreelistSync = options.NoFreelistSync 204 db.FreelistType = options.FreelistType 205 206 // Set default values for later DB operations. 207 db.MaxBatchSize = DefaultMaxBatchSize 208 db.MaxBatchDelay = DefaultMaxBatchDelay 209 db.AllocSize = DefaultAllocSize 210 211 flag := os.O_RDWR 212 if options.ReadOnly { 213 flag = os.O_RDONLY 214 db.readOnly = true 215 } 216 217 db.openFile = options.OpenFile 218 if db.openFile == nil { 219 db.openFile = os.OpenFile 220 } 221 222 // Open data file and separate sync handler for metadata writes. 223 var err error 224 if db.file, err = db.openFile(path, flag|os.O_CREATE, mode); err != nil { 225 _ = db.close() 226 return nil, err 227 } 228 db.path = db.file.Name() 229 230 // Lock file so that other processes using Bolt in read-write mode cannot 231 // use the database at the same time. This would cause corruption since 232 // the two processes would write meta pages and free pages separately. 
233 // The database file is locked exclusively (only one process can grab the lock) 234 // if !options.ReadOnly. 235 // The database file is locked using the shared lock (more than one process may 236 // hold a lock at the same time) otherwise (options.ReadOnly is set). 237 if err := flock(db, !db.readOnly, options.Timeout); err != nil { 238 _ = db.close() 239 return nil, err 240 } 241 242 // Default values for test hooks 243 db.ops.writeAt = db.file.WriteAt 244 245 if db.pageSize = options.PageSize; db.pageSize == 0 { 246 // Set the default page size to the OS page size. 247 db.pageSize = defaultPageSize 248 } 249 250 // Initialize the database if it doesn't exist. 251 if info, err := db.file.Stat(); err != nil { 252 _ = db.close() 253 return nil, err 254 } else if info.Size() == 0 { 255 // Initialize new files with meta pages. 256 if err := db.init(); err != nil { 257 // clean up file descriptor on initialization fail 258 _ = db.close() 259 return nil, err 260 } 261 } else { 262 // Read the first meta page to determine the page size. 263 var buf [0x1000]byte 264 // If we can't read the page size, but can read a page, assume 265 // it's the same as the OS or one given -- since that's how the 266 // page size was chosen in the first place. 267 // 268 // If the first page is invalid and this OS uses a different 269 // page size than what the database was created with then we 270 // are out of luck and cannot access the database. 271 // 272 // TODO: scan for next page 273 if bw, err := db.file.ReadAt(buf[:], 0); err == nil && bw == len(buf) { 274 if m := db.pageInBuffer(buf[:], 0).meta(); m.validate() == nil { 275 db.pageSize = int(m.pageSize) 276 } 277 } else { 278 _ = db.close() 279 return nil, ErrInvalid 280 } 281 } 282 283 // Initialize page pool. 284 db.pagePool = sync.Pool{ 285 New: func() interface{} { 286 return make([]byte, db.pageSize) 287 }, 288 } 289 290 // Memory map the data file. 
291 if err := db.mmap(options.InitialMmapSize); err != nil { 292 _ = db.close() 293 return nil, err 294 } 295 296 if db.readOnly { 297 return db, nil 298 } 299 300 db.loadFreelist() 301 302 // Flush freelist when transitioning from no sync to sync so 303 // NoFreelistSync unaware boltdb can open the db later. 304 if !db.NoFreelistSync && !db.hasSyncedFreelist() { 305 tx, err := db.Begin(true) 306 if tx != nil { 307 err = tx.Commit() 308 } 309 if err != nil { 310 _ = db.close() 311 return nil, err 312 } 313 } 314 315 // Mark the database as opened and return. 316 return db, nil 317} 318 319// loadFreelist reads the freelist if it is synced, or reconstructs it 320// by scanning the DB if it is not synced. It assumes there are no 321// concurrent accesses being made to the freelist. 322func (db *DB) loadFreelist() { 323 db.freelistLoad.Do(func() { 324 db.freelist = newFreelist(db.FreelistType) 325 if !db.hasSyncedFreelist() { 326 // Reconstruct free list by scanning the DB. 327 db.freelist.readIDs(db.freepages()) 328 } else { 329 // Read free list from freelist page. 330 db.freelist.read(db.page(db.meta().freelist)) 331 } 332 db.stats.FreePageN = db.freelist.free_count() 333 }) 334} 335 336func (db *DB) hasSyncedFreelist() bool { 337 return db.meta().freelist != pgidNoFreelist 338} 339 340// mmap opens the underlying memory-mapped file and initializes the meta references. 341// minsz is the minimum size that the new mmap can be. 342func (db *DB) mmap(minsz int) error { 343 db.mmaplock.Lock() 344 defer db.mmaplock.Unlock() 345 346 info, err := db.file.Stat() 347 if err != nil { 348 return fmt.Errorf("mmap stat error: %s", err) 349 } else if int(info.Size()) < db.pageSize*2 { 350 return fmt.Errorf("file size too small") 351 } 352 353 // Ensure the size is at least the minimum size. 
354 var size = int(info.Size()) 355 if size < minsz { 356 size = minsz 357 } 358 size, err = db.mmapSize(size) 359 if err != nil { 360 return err 361 } 362 363 // Dereference all mmap references before unmapping. 364 if db.rwtx != nil { 365 db.rwtx.root.dereference() 366 } 367 368 // Unmap existing data before continuing. 369 if err := db.munmap(); err != nil { 370 return err 371 } 372 373 // Memory-map the data file as a byte slice. 374 if err := mmap(db, size); err != nil { 375 376 // [Psiphon] 377 // https://github.com/ooni/psiphon/oopsi/github.com/etcd-io/bbolt/commit/b3e98dcb3752e0a8d5db6503b80fe19e462fdb73 378 // If mmap fails, we cannot safely continue. Mark the db as unusable, 379 // causing all future calls to return the mmap error. 380 db.mmapErr = MmapError(err.Error()) 381 return db.mmapErr 382 } 383 384 // Save references to the meta pages. 385 db.meta0 = db.page(0).meta() 386 db.meta1 = db.page(1).meta() 387 388 // Validate the meta pages. We only return an error if both meta pages fail 389 // validation, since meta0 failing validation means that it wasn't saved 390 // properly -- but we can recover using meta1. And vice-versa. 391 err0 := db.meta0.validate() 392 err1 := db.meta1.validate() 393 if err0 != nil && err1 != nil { 394 return err0 395 } 396 397 return nil 398} 399 400// munmap unmaps the data file from memory. 401func (db *DB) munmap() error { 402 if err := munmap(db); err != nil { 403 return fmt.Errorf("unmap error: " + err.Error()) 404 } 405 return nil 406} 407 408// mmapSize determines the appropriate size for the mmap given the current size 409// of the database. The minimum size is 32KB and doubles until it reaches 1GB. 410// Returns an error if the new mmap size is greater than the max allowed. 411func (db *DB) mmapSize(size int) (int, error) { 412 // Double the size from 32KB until 1GB. 
413 for i := uint(15); i <= 30; i++ { 414 if size <= 1<<i { 415 return 1 << i, nil 416 } 417 } 418 419 // Verify the requested size is not above the maximum allowed. 420 if size > maxMapSize { 421 return 0, fmt.Errorf("mmap too large") 422 } 423 424 // If larger than 1GB then grow by 1GB at a time. 425 sz := int64(size) 426 if remainder := sz % int64(maxMmapStep); remainder > 0 { 427 sz += int64(maxMmapStep) - remainder 428 } 429 430 // Ensure that the mmap size is a multiple of the page size. 431 // This should always be true since we're incrementing in MBs. 432 pageSize := int64(db.pageSize) 433 if (sz % pageSize) != 0 { 434 sz = ((sz / pageSize) + 1) * pageSize 435 } 436 437 // If we've exceeded the max size then only grow up to the max size. 438 if sz > maxMapSize { 439 sz = maxMapSize 440 } 441 442 return int(sz), nil 443} 444 445// init creates a new database file and initializes its meta pages. 446func (db *DB) init() error { 447 // Create two meta pages on a buffer. 448 buf := make([]byte, db.pageSize*4) 449 for i := 0; i < 2; i++ { 450 p := db.pageInBuffer(buf[:], pgid(i)) 451 p.id = pgid(i) 452 p.flags = metaPageFlag 453 454 // Initialize the meta page. 455 m := p.meta() 456 m.magic = magic 457 m.version = version 458 m.pageSize = uint32(db.pageSize) 459 m.freelist = 2 460 m.root = bucket{root: 3} 461 m.pgid = 4 462 m.txid = txid(i) 463 m.checksum = m.sum64() 464 } 465 466 // Write an empty freelist at page 3. 467 p := db.pageInBuffer(buf[:], pgid(2)) 468 p.id = pgid(2) 469 p.flags = freelistPageFlag 470 p.count = 0 471 472 // Write an empty leaf page at page 4. 473 p = db.pageInBuffer(buf[:], pgid(3)) 474 p.id = pgid(3) 475 p.flags = leafPageFlag 476 p.count = 0 477 478 // Write the buffer to our data file. 479 if _, err := db.ops.writeAt(buf, 0); err != nil { 480 return err 481 } 482 if err := fdatasync(db); err != nil { 483 return err 484 } 485 486 return nil 487} 488 489// Close releases all database resources. 
// It will block waiting for any open transactions to finish
// before closing the database and returning.
func (db *DB) Close() error {
	// Lock order: writer lock, then meta lock, then mmap lock.
	db.rwlock.Lock()
	defer db.rwlock.Unlock()

	db.metalock.Lock()
	defer db.metalock.Unlock()

	db.mmaplock.Lock()
	defer db.mmaplock.Unlock()

	return db.close()
}

// close unmaps the data, unlocks and closes the data file, and clears the
// path. It acquires no locks itself and is a no-op when the DB is not marked
// as opened.
func (db *DB) close() error {
	if !db.opened {
		return nil
	}

	db.opened = false

	db.freelist = nil

	// Clear ops.
	db.ops.writeAt = nil

	// Close the mmap.
	if err := db.munmap(); err != nil {
		return err
	}

	// Close file handles.
	if db.file != nil {
		// No need to unlock read-only file.
		if !db.readOnly {
			// Unlock the file.
			if err := funlock(db); err != nil {
				log.Printf("bolt.Close(): funlock error: %s", err)
			}
		}

		// Close the file descriptor.
		if err := db.file.Close(); err != nil {
			return fmt.Errorf("db file close: %s", err)
		}
		db.file = nil
	}

	db.path = ""
	return nil
}

// Begin starts a new transaction.
// Multiple read-only transactions can be used concurrently but only one
// write transaction can be used at a time. Starting multiple write transactions
// will cause the calls to block and be serialized until the current write
// transaction finishes.
//
// Transactions should not be dependent on one another. Opening a read
// transaction and a write transaction in the same goroutine can cause the
// writer to deadlock because the database periodically needs to re-mmap itself
// as it grows and it cannot do that while a read transaction is open.
//
// If a long running read transaction (for example, a snapshot transaction) is
// needed, you might want to set Options.InitialMmapSize to a large enough value
// to avoid potential blocking of write transaction.
//
// IMPORTANT: You must close read-only transactions after you are finished or
// else the database will not reclaim old pages.
func (db *DB) Begin(writable bool) (*Tx, error) {
	if writable {
		return db.beginRWTx()
	}
	return db.beginTx()
}

// beginTx starts a read-only transaction. On success the read lock on the
// mmap stays held; removeTx releases it.
func (db *DB) beginTx() (*Tx, error) {
	// Lock the meta pages while we initialize the transaction. We obtain
	// the meta lock before the mmap lock because that's the order that the
	// write transaction will obtain them.
	db.metalock.Lock()

	// Obtain a read-only lock on the mmap. When the mmap is remapped it will
	// obtain a write lock so all transactions must finish before it can be
	// remapped.
	db.mmaplock.RLock()

	// Exit if the database is not open yet.
	if !db.opened {
		db.mmaplock.RUnlock()
		db.metalock.Unlock()
		return nil, ErrDatabaseNotOpen
	}

	// [Psiphon]
	// https://github.com/ooni/psiphon/oopsi/github.com/etcd-io/bbolt/commit/b3e98dcb3752e0a8d5db6503b80fe19e462fdb73
	// Return mmap error if a previous mmap failed.
	if db.mmapErr != nil {
		db.mmaplock.RUnlock()
		db.metalock.Unlock()
		return nil, db.mmapErr
	}

	// Create a transaction associated with the database.
	t := &Tx{}
	t.init(db)

	// Keep track of transaction until it closes.
	db.txs = append(db.txs, t)
	n := len(db.txs)

	// Unlock the meta pages.
	db.metalock.Unlock()

	// Update the transaction stats.
	db.statlock.Lock()
	db.stats.TxN++
	db.stats.OpenTxN = n
	db.statlock.Unlock()

	return t, nil
}

// beginRWTx starts a read-write transaction. On success the writer lock
// stays held; per the comment below it is released by the transaction when
// it closes.
func (db *DB) beginRWTx() (*Tx, error) {
	// If the database was opened with Options.ReadOnly, return an error.
	if db.readOnly {
		return nil, ErrDatabaseReadOnly
	}

	// Obtain writer lock. This is released by the transaction when it closes.
	// This enforces only one writer transaction at a time.
	db.rwlock.Lock()

	// Once we have the writer lock then we can lock the meta pages so that
	// we can set up the transaction.
	db.metalock.Lock()
	defer db.metalock.Unlock()

	// Exit if the database is not open yet.
	if !db.opened {
		db.rwlock.Unlock()
		return nil, ErrDatabaseNotOpen
	}

	// [Psiphon]
	// https://github.com/ooni/psiphon/oopsi/github.com/etcd-io/bbolt/commit/b3e98dcb3752e0a8d5db6503b80fe19e462fdb73
	// Return mmap error if a previous mmap failed.
	if db.mmapErr != nil {
		db.rwlock.Unlock()
		return nil, db.mmapErr
	}

	// Create a transaction associated with the database.
	t := &Tx{writable: true}
	t.init(db)
	db.rwtx = t
	db.freePages()
	return t, nil
}

// freePages releases any pages associated with closed read-only transactions.
// It sorts db.txs in place by transaction id; its only caller in this file,
// beginRWTx, holds the meta lock.
func (db *DB) freePages() {
	// Free all pending pages prior to earliest open transaction.
	sort.Sort(txsById(db.txs))
	minid := txid(0xFFFFFFFFFFFFFFFF)
	if len(db.txs) > 0 {
		minid = db.txs[0].meta.txid
	}
	if minid > 0 {
		db.freelist.release(minid - 1)
	}
	// Release unused txid extents.
	for _, t := range db.txs {
		db.freelist.releaseRange(minid, t.meta.txid-1)
		minid = t.meta.txid + 1
	}
	db.freelist.releaseRange(minid, txid(0xFFFFFFFFFFFFFFFF))
	// Any page both allocated and freed in an extent is safe to release.
}

// txsById implements sort.Interface, ordering transactions by ascending
// meta.txid.
type txsById []*Tx

func (t txsById) Len() int           { return len(t) }
func (t txsById) Swap(i, j int)      { t[i], t[j] = t[j], t[i] }
func (t txsById) Less(i, j int) bool { return t[i].meta.txid < t[j].meta.txid }

// removeTx removes a transaction from the database.
// It releases the mmap read lock taken in beginTx, drops tx from db.txs and
// merges the transaction's stats into the DB stats.
func (db *DB) removeTx(tx *Tx) {
	// Release the read lock on the mmap.
	db.mmaplock.RUnlock()

	// Use the meta lock to restrict access to the DB object.
	db.metalock.Lock()

	// Remove the transaction.
	// Swap-with-last removal: the order of db.txs is not preserved.
	for i, t := range db.txs {
		if t == tx {
			last := len(db.txs) - 1
			db.txs[i] = db.txs[last]
			db.txs[last] = nil
			db.txs = db.txs[:last]
			break
		}
	}
	n := len(db.txs)

	// Unlock the meta pages.
	db.metalock.Unlock()

	// Merge statistics.
	db.statlock.Lock()
	db.stats.OpenTxN = n
	db.stats.TxStats.add(&tx.stats)
	db.statlock.Unlock()
}

// Update executes a function within the context of a read-write managed transaction.
// If no error is returned from the function then the transaction is committed.
// If an error is returned then the entire transaction is rolled back.
// Any error that is returned from the function or returned from the commit is
// returned from the Update() method.
//
// Attempting to manually commit or rollback within the function will cause a panic.
func (db *DB) Update(fn func(*Tx) error) error {
	t, err := db.Begin(true)
	if err != nil {
		return err
	}

	// Make sure the transaction rolls back in the event of a panic.
	defer func() {
		if t.db != nil {
			t.rollback()
		}
	}()

	// Mark as a managed tx so that the inner function cannot manually commit.
	t.managed = true

	// If an error is returned from the function then rollback and return error.
	err = fn(t)
	t.managed = false
	if err != nil {
		_ = t.Rollback()
		return err
	}

	return t.Commit()
}

// View executes a function within the context of a managed read-only transaction.
// Any error that is returned from the function is returned from the View() method.
//
// Attempting to manually rollback within the function will cause a panic.
func (db *DB) View(fn func(*Tx) error) error {
	t, err := db.Begin(false)
	if err != nil {
		return err
	}

	// Make sure the transaction rolls back in the event of a panic.
	defer func() {
		if t.db != nil {
			t.rollback()
		}
	}()

	// Mark as a managed tx so that the inner function cannot manually rollback.
	t.managed = true

	// If an error is returned from the function then pass it through.
	err = fn(t)
	t.managed = false
	if err != nil {
		_ = t.Rollback()
		return err
	}

	return t.Rollback()
}

// Batch calls fn as part of a batch. It behaves similar to Update,
// except:
//
// 1. concurrent Batch calls can be combined into a single Bolt
// transaction.
//
// 2. the function passed to Batch may be called multiple times,
// regardless of whether it returns error or not.
//
// This means that Batch function side effects must be idempotent and
// take permanent effect only after a successful return is seen in
// caller.
//
// The maximum batch size and delay can be adjusted with DB.MaxBatchSize
// and DB.MaxBatchDelay, respectively.
//
// Batch is only useful when there are multiple goroutines calling it.
func (db *DB) Batch(fn func(*Tx) error) error {
	// Buffered so the batch runner can deliver the result without blocking.
	errCh := make(chan error, 1)

	db.batchMu.Lock()
	if (db.batch == nil) || (db.batch != nil && len(db.batch.calls) >= db.MaxBatchSize) {
		// There is no existing batch, or the existing batch is full; start a new one.
		db.batch = &batch{
			db: db,
		}
		db.batch.timer = time.AfterFunc(db.MaxBatchDelay, db.batch.trigger)
	}
	db.batch.calls = append(db.batch.calls, call{fn: fn, err: errCh})
	if len(db.batch.calls) >= db.MaxBatchSize {
		// wake up batch, it's ready to run
		go db.batch.trigger()
	}
	db.batchMu.Unlock()

	err := <-errCh
	if err == trySolo {
		// fn failed inside the batch; run it alone in its own transaction.
		err = db.Update(fn)
	}
	return err
}

// call pairs a function submitted to Batch with the channel on which its
// result is delivered.
type call struct {
	fn  func(*Tx) error
	err chan<- error
}

// batch is a group of calls collected by DB.Batch to be run together; start
// ensures run executes at most once.
type batch struct {
	db    *DB
	timer *time.Timer
	start sync.Once
	calls []call
}

// trigger runs the batch if it hasn't already been run.
func (b *batch) trigger() {
	b.start.Do(b.run)
}

// run performs the transactions in the batch and communicates results
// back to DB.Batch.
func (b *batch) run() {
	b.db.batchMu.Lock()
	b.timer.Stop()
	// Make sure no new work is added to this batch, but don't break
	// other batches.
	if b.db.batch == b {
		b.db.batch = nil
	}
	b.db.batchMu.Unlock()

retry:
	for len(b.calls) > 0 {
		var failIdx = -1
		err := b.db.Update(func(tx *Tx) error {
			for i, c := range b.calls {
				if err := safelyCall(c.fn, tx); err != nil {
					failIdx = i
					return err
				}
			}
			return nil
		})

		if failIdx >= 0 {
			// take the failing transaction out of the batch. it's
			// safe to shorten b.calls here because db.batch no longer
			// points to us, so Batch cannot append further calls to this
			// batch. (batchMu itself was released above, before the loop.)
			c := b.calls[failIdx]
			b.calls[failIdx], b.calls = b.calls[len(b.calls)-1], b.calls[:len(b.calls)-1]
			// tell the submitter re-run it solo, continue with the rest of the batch
			c.err <- trySolo
			continue retry
		}

		// pass success, or bolt internal errors, to all callers
		for _, c := range b.calls {
			c.err <- err
		}
		break retry
	}
}

// trySolo is a special sentinel error value used for signaling that a
// transaction function should be re-run. It should never be seen by
// callers.
var trySolo = errors.New("batch function returned an error and should be re-run solo")

// panicked is the error produced by safelyCall when the called function
// panics; reason holds the recovered value.
type panicked struct {
	reason interface{}
}

// Error returns the recovered value's own message when it is an error, and
// "panic: <value>" otherwise.
func (p panicked) Error() string {
	if err, ok := p.reason.(error); ok {
		return err.Error()
	}
	return fmt.Sprintf("panic: %v", p.reason)
}

// safelyCall invokes fn(tx) and returns its error; a panic inside fn is
// recovered and returned as a panicked error instead of propagating.
func safelyCall(fn func(*Tx) error, tx *Tx) (err error) {
	defer func() {
		if p := recover(); p != nil {
			err = panicked{p}
		}
	}()
	return fn(tx)
}

// Sync executes fdatasync() against the database file handle.
//
// This is not necessary under normal operation, however, if you use NoSync
// then it allows you to force the database file to sync against the disk.
func (db *DB) Sync() error { return fdatasync(db) }

// Stats retrieves ongoing performance stats for the database.
// This is only updated when a transaction closes.
func (db *DB) Stats() Stats {
	db.statlock.RLock()
	defer db.statlock.RUnlock()
	return db.stats
}

// Info returns the address of the first byte of the mapped data together
// with the page size.
//
// This is for internal access to the raw data bytes from the C cursor, use
// carefully, or not at all.
func (db *DB) Info() *Info {
	return &Info{uintptr(unsafe.Pointer(&db.data[0])), db.pageSize}
}

// page retrieves a page reference from the mmap based on the current page size.
921func (db *DB) page(id pgid) *page { 922 pos := id * pgid(db.pageSize) 923 return (*page)(unsafe.Pointer(&db.data[pos])) 924} 925 926// pageInBuffer retrieves a page reference from a given byte array based on the current page size. 927func (db *DB) pageInBuffer(b []byte, id pgid) *page { 928 return (*page)(unsafe.Pointer(&b[id*pgid(db.pageSize)])) 929} 930 931// meta retrieves the current meta page reference. 932func (db *DB) meta() *meta { 933 // We have to return the meta with the highest txid which doesn't fail 934 // validation. Otherwise, we can cause errors when in fact the database is 935 // in a consistent state. metaA is the one with the higher txid. 936 metaA := db.meta0 937 metaB := db.meta1 938 if db.meta1.txid > db.meta0.txid { 939 metaA = db.meta1 940 metaB = db.meta0 941 } 942 943 // Use higher meta page if valid. Otherwise fallback to previous, if valid. 944 if err := metaA.validate(); err == nil { 945 return metaA 946 } else if err := metaB.validate(); err == nil { 947 return metaB 948 } 949 950 // This should never be reached, because both meta1 and meta0 were validated 951 // on mmap() and we do fsync() on every write. 952 panic("bolt.DB.meta(): invalid meta pages") 953} 954 955// allocate returns a contiguous block of memory starting at a given page. 956func (db *DB) allocate(txid txid, count int) (*page, error) { 957 // Allocate a temporary buffer for the page. 958 var buf []byte 959 if count == 1 { 960 buf = db.pagePool.Get().([]byte) 961 } else { 962 buf = make([]byte, count*db.pageSize) 963 } 964 p := (*page)(unsafe.Pointer(&buf[0])) 965 p.overflow = uint32(count - 1) 966 967 // Use pages from the freelist if they are available. 968 if p.id = db.freelist.allocate(txid, count); p.id != 0 { 969 return p, nil 970 } 971 972 // Resize mmap() if we're at the end. 
973 p.id = db.rwtx.meta.pgid 974 var minsz = int((p.id+pgid(count))+1) * db.pageSize 975 if minsz >= db.datasz { 976 if err := db.mmap(minsz); err != nil { 977 return nil, fmt.Errorf("mmap allocate error: %s", err) 978 } 979 } 980 981 // Move the page id high water mark. 982 db.rwtx.meta.pgid += pgid(count) 983 984 return p, nil 985} 986 987// grow grows the size of the database to the given sz. 988func (db *DB) grow(sz int) error { 989 // Ignore if the new size is less than available file size. 990 if sz <= db.filesz { 991 return nil 992 } 993 994 // If the data is smaller than the alloc size then only allocate what's needed. 995 // Once it goes over the allocation size then allocate in chunks. 996 if db.datasz < db.AllocSize { 997 sz = db.datasz 998 } else { 999 sz += db.AllocSize 1000 } 1001 1002 // Truncate and fsync to ensure file size metadata is flushed. 1003 // https://github.com/ooni/psiphon/oopsi/github.com/boltdb/bolt/issues/284 1004 if !db.NoGrowSync && !db.readOnly { 1005 if runtime.GOOS != "windows" { 1006 if err := db.file.Truncate(int64(sz)); err != nil { 1007 return fmt.Errorf("file resize error: %s", err) 1008 } 1009 } 1010 if err := db.file.Sync(); err != nil { 1011 return fmt.Errorf("file sync error: %s", err) 1012 } 1013 } 1014 1015 db.filesz = sz 1016 return nil 1017} 1018 1019func (db *DB) IsReadOnly() bool { 1020 return db.readOnly 1021} 1022 1023func (db *DB) freepages() []pgid { 1024 tx, err := db.beginTx() 1025 defer func() { 1026 err = tx.Rollback() 1027 if err != nil { 1028 panic("freepages: failed to rollback tx") 1029 } 1030 }() 1031 if err != nil { 1032 panic("freepages: failed to open read only tx") 1033 } 1034 1035 reachable := make(map[pgid]*page) 1036 nofreed := make(map[pgid]bool) 1037 1038 // [Psiphon] 1039 // Use single-error checkBucket. 
1040 err = tx.checkBucket(&tx.root, reachable, nofreed) 1041 if err != nil { 1042 panic(fmt.Sprintf("freepages: failed to get all reachable pages (%s)", err)) 1043 } 1044 1045 var fids []pgid 1046 for i := pgid(2); i < db.meta().pgid; i++ { 1047 if _, ok := reachable[i]; !ok { 1048 fids = append(fids, i) 1049 } 1050 } 1051 return fids 1052} 1053 1054// Options represents the options that can be set when opening a database. 1055type Options struct { 1056 // Timeout is the amount of time to wait to obtain a file lock. 1057 // When set to zero it will wait indefinitely. This option is only 1058 // available on Darwin and Linux. 1059 Timeout time.Duration 1060 1061 // Sets the DB.NoGrowSync flag before memory mapping the file. 1062 NoGrowSync bool 1063 1064 // Do not sync freelist to disk. This improves the database write performance 1065 // under normal operation, but requires a full database re-sync during recovery. 1066 NoFreelistSync bool 1067 1068 // FreelistType sets the backend freelist type. There are two options. Array which is simple but endures 1069 // dramatic performance degradation if database is large and framentation in freelist is common. 1070 // The alternative one is using hashmap, it is faster in almost all circumstances 1071 // but it doesn't guarantee that it offers the smallest page id available. In normal case it is safe. 1072 // The default type is array 1073 FreelistType FreelistType 1074 1075 // Open database in read-only mode. Uses flock(..., LOCK_SH |LOCK_NB) to 1076 // grab a shared lock (UNIX). 1077 ReadOnly bool 1078 1079 // Sets the DB.MmapFlags flag before memory mapping the file. 1080 MmapFlags int 1081 1082 // InitialMmapSize is the initial mmap size of the database 1083 // in bytes. Read transactions won't block write transaction 1084 // if the InitialMmapSize is large enough to hold database mmap 1085 // size. (See DB.Begin for more information) 1086 // 1087 // If <=0, the initial map size is 0. 
1088 // If initialMmapSize is smaller than the previous database size, 1089 // it takes no effect. 1090 InitialMmapSize int 1091 1092 // PageSize overrides the default OS page size. 1093 PageSize int 1094 1095 // NoSync sets the initial value of DB.NoSync. Normally this can just be 1096 // set directly on the DB itself when returned from Open(), but this option 1097 // is useful in APIs which expose Options but not the underlying DB. 1098 NoSync bool 1099 1100 // OpenFile is used to open files. It defaults to os.OpenFile. This option 1101 // is useful for writing hermetic tests. 1102 OpenFile func(string, int, os.FileMode) (*os.File, error) 1103} 1104 1105// DefaultOptions represent the options used if nil options are passed into Open(). 1106// No timeout is used which will cause Bolt to wait indefinitely for a lock. 1107var DefaultOptions = &Options{ 1108 Timeout: 0, 1109 NoGrowSync: false, 1110 FreelistType: FreelistArrayType, 1111} 1112 1113// Stats represents statistics about the database. 1114type Stats struct { 1115 // Freelist stats 1116 FreePageN int // total number of free pages on the freelist 1117 PendingPageN int // total number of pending pages on the freelist 1118 FreeAlloc int // total bytes allocated in free pages 1119 FreelistInuse int // total bytes used by the freelist 1120 1121 // Transaction stats 1122 TxN int // total number of started read transactions 1123 OpenTxN int // number of currently open read transactions 1124 1125 TxStats TxStats // global, ongoing stats. 1126} 1127 1128// Sub calculates and returns the difference between two sets of database stats. 1129// This is useful when obtaining stats at two different points and time and 1130// you need the performance counters that occurred within that time span. 
1131func (s *Stats) Sub(other *Stats) Stats { 1132 if other == nil { 1133 return *s 1134 } 1135 var diff Stats 1136 diff.FreePageN = s.FreePageN 1137 diff.PendingPageN = s.PendingPageN 1138 diff.FreeAlloc = s.FreeAlloc 1139 diff.FreelistInuse = s.FreelistInuse 1140 diff.TxN = s.TxN - other.TxN 1141 diff.TxStats = s.TxStats.Sub(&other.TxStats) 1142 return diff 1143} 1144 1145type Info struct { 1146 Data uintptr 1147 PageSize int 1148} 1149 1150type meta struct { 1151 magic uint32 1152 version uint32 1153 pageSize uint32 1154 flags uint32 1155 root bucket 1156 freelist pgid 1157 pgid pgid 1158 txid txid 1159 checksum uint64 1160} 1161 1162// validate checks the marker bytes and version of the meta page to ensure it matches this binary. 1163func (m *meta) validate() error { 1164 if m.magic != magic { 1165 return ErrInvalid 1166 } else if m.version != version { 1167 return ErrVersionMismatch 1168 } else if m.checksum != 0 && m.checksum != m.sum64() { 1169 return ErrChecksum 1170 } 1171 return nil 1172} 1173 1174// copy copies one meta object to another. 1175func (m *meta) copy(dest *meta) { 1176 *dest = *m 1177} 1178 1179// write writes the meta onto a page. 1180func (m *meta) write(p *page) { 1181 if m.root.root >= m.pgid { 1182 panic(fmt.Sprintf("root bucket pgid (%d) above high water mark (%d)", m.root.root, m.pgid)) 1183 } else if m.freelist >= m.pgid && m.freelist != pgidNoFreelist { 1184 // TODO: reject pgidNoFreeList if !NoFreelistSync 1185 panic(fmt.Sprintf("freelist pgid (%d) above high water mark (%d)", m.freelist, m.pgid)) 1186 } 1187 1188 // Page id is either going to be 0 or 1 which we can determine by the transaction ID. 1189 p.id = pgid(m.txid % 2) 1190 p.flags |= metaPageFlag 1191 1192 // Calculate the checksum. 1193 m.checksum = m.sum64() 1194 1195 m.copy(p.meta()) 1196} 1197 1198// generates the checksum for the meta. 
1199func (m *meta) sum64() uint64 { 1200 var h = fnv.New64a() 1201 _, _ = h.Write((*[unsafe.Offsetof(meta{}.checksum)]byte)(unsafe.Pointer(m))[:]) 1202 return h.Sum64() 1203} 1204 1205// _assert will panic with a given formatted message if the given condition is false. 1206func _assert(condition bool, msg string, v ...interface{}) { 1207 if !condition { 1208 panic(fmt.Sprintf("assertion failed: "+msg, v...)) 1209 } 1210} 1211