1 /* 2 * Copyright 2005 Sun Microsystems, Inc. All rights reserved. 3 * Use is subject to license terms. 4 */ 5 6 #pragma ident "%Z%%M% %I% %E% SMI" 7 8 /* 9 ** 2001 September 15 10 ** 11 ** The author disclaims copyright to this source code. In place of 12 ** a legal notice, here is a blessing: 13 ** 14 ** May you do good and not evil. 15 ** May you find forgiveness for yourself and forgive others. 16 ** May you share freely, never taking more than you give. 17 ** 18 ************************************************************************* 19 ** $Id: btree.c,v 1.103 2004/03/10 13:42:38 drh Exp $ 20 ** 21 ** This file implements a external (disk-based) database using BTrees. 22 ** For a detailed discussion of BTrees, refer to 23 ** 24 ** Donald E. Knuth, THE ART OF COMPUTER PROGRAMMING, Volume 3: 25 ** "Sorting And Searching", pages 473-480. Addison-Wesley 26 ** Publishing Company, Reading, Massachusetts. 27 ** 28 ** The basic idea is that each page of the file contains N database 29 ** entries and N+1 pointers to subpages. 30 ** 31 ** ---------------------------------------------------------------- 32 ** | Ptr(0) | Key(0) | Ptr(1) | Key(1) | ... | Key(N) | Ptr(N+1) | 33 ** ---------------------------------------------------------------- 34 ** 35 ** All of the keys on the page that Ptr(0) points to have values less 36 ** than Key(0). All of the keys on page Ptr(1) and its subpages have 37 ** values greater than Key(0) and less than Key(1). All of the keys 38 ** on Ptr(N+1) and its subpages have values greater than Key(N). And 39 ** so forth. 40 ** 41 ** Finding a particular key requires reading O(log(M)) pages from the 42 ** disk where M is the number of entries in the tree. 43 ** 44 ** In this implementation, a single file can hold one or more separate 45 ** BTrees. Each BTree is identified by the index of its root page. The 46 ** key and data for any entry are combined to form the "payload". Up to 47 ** MX_LOCAL_PAYLOAD bytes of payload can be carried directly on the 48 ** database page. If the payload is larger than MX_LOCAL_PAYLOAD bytes 49 ** then surplus bytes are stored on overflow pages. The payload for an 50 ** entry and the preceding pointer are combined to form a "Cell". Each 51 ** page has a small header which contains the Ptr(N+1) pointer. 52 ** 53 ** The first page of the file contains a magic string used to verify that 54 ** the file really is a valid BTree database, a pointer to a list of unused 55 ** pages in the file, and some meta information. The root of the first 56 ** BTree begins on page 2 of the file. (Pages are numbered beginning with 57 ** 1, not 0.) Thus a minimum database contains 2 pages. 58 */ 59 #include "sqliteInt.h" 60 #include "pager.h" 61 #include "btree.h" 62 #include <assert.h> 63 64 /* Forward declarations */ 65 static BtOps sqliteBtreeOps; 66 static BtCursorOps sqliteBtreeCursorOps; 67 68 /* 69 ** Macros used for byteswapping. B is a pointer to the Btree 70 ** structure. This is needed to access the Btree.needSwab boolean 71 ** in order to tell if byte swapping is needed or not. 72 ** X is an unsigned integer. SWAB16 byte swaps a 16-bit integer. 73 ** SWAB32 byteswaps a 32-bit integer. 74 */ 75 #define SWAB16(B,X) ((B)->needSwab? swab16((u16)X) : ((u16)X)) 76 #define SWAB32(B,X) ((B)->needSwab? swab32(X) : (X)) 77 #define SWAB_ADD(B,X,A) \ 78 if((B)->needSwab){ X=swab32(swab32(X)+A); }else{ X += (A); } 79 80 /* 81 ** The following global variable - available only if SQLITE_TEST is 82 ** defined - is used to determine whether new databases are created in 83 ** native byte order or in non-native byte order. Non-native byte order 84 ** databases are created for testing purposes only. Under normal operation, 85 ** only native byte-order databases should be created, but we should be 86 ** able to read or write existing databases regardless of the byteorder. 87 */ 88 #ifdef SQLITE_TEST 89 int btree_native_byte_order = 1; 90 #else 91 # define btree_native_byte_order 1 92 #endif 93 94 /* 95 ** Forward declarations of structures used only in this file. 96 */ 97 typedef struct PageOne PageOne; 98 typedef struct MemPage MemPage; 99 typedef struct PageHdr PageHdr; 100 typedef struct Cell Cell; 101 typedef struct CellHdr CellHdr; 102 typedef struct FreeBlk FreeBlk; 103 typedef struct OverflowPage OverflowPage; 104 typedef struct FreelistInfo FreelistInfo; 105 106 /* 107 ** All structures on a database page are aligned to 4-byte boundries. 108 ** This routine rounds up a number of bytes to the next multiple of 4. 109 ** 110 ** This might need to change for computer architectures that require 111 ** and 8-byte alignment boundry for structures. 112 */ 113 #define ROUNDUP(X) ((X+3) & ~3) 114 115 /* 116 ** This is a magic string that appears at the beginning of every 117 ** SQLite database in order to identify the file as a real database. 118 */ 119 static const char zMagicHeader[] = 120 "** This file contains an SQLite 2.1 database **"; 121 #define MAGIC_SIZE (sizeof(zMagicHeader)) 122 123 /* 124 ** This is a magic integer also used to test the integrity of the database 125 ** file. This integer is used in addition to the string above so that 126 ** if the file is written on a little-endian architecture and read 127 ** on a big-endian architectures (or vice versa) we can detect the 128 ** problem. 129 ** 130 ** The number used was obtained at random and has no special 131 ** significance other than the fact that it represents a different 132 ** integer on little-endian and big-endian machines. 133 */ 134 #define MAGIC 0xdae37528 135 136 /* 137 ** The first page of the database file contains a magic header string 138 ** to identify the file as an SQLite database file. It also contains 139 ** a pointer to the first free page of the file. Page 2 contains the 140 ** root of the principle BTree. The file might contain other BTrees 141 ** rooted on pages above 2. 142 ** 143 ** The first page also contains SQLITE_N_BTREE_META integers that 144 ** can be used by higher-level routines. 145 ** 146 ** Remember that pages are numbered beginning with 1. (See pager.c 147 ** for additional information.) Page 0 does not exist and a page 148 ** number of 0 is used to mean "no such page". 149 */ 150 struct PageOne { 151 char zMagic[MAGIC_SIZE]; /* String that identifies the file as a database */ 152 int iMagic; /* Integer to verify correct byte order */ 153 Pgno freeList; /* First free page in a list of all free pages */ 154 int nFree; /* Number of pages on the free list */ 155 int aMeta[SQLITE_N_BTREE_META-1]; /* User defined integers */ 156 }; 157 158 /* 159 ** Each database page has a header that is an instance of this 160 ** structure. 161 ** 162 ** PageHdr.firstFree is 0 if there is no free space on this page. 163 ** Otherwise, PageHdr.firstFree is the index in MemPage.u.aDisk[] of a 164 ** FreeBlk structure that describes the first block of free space. 165 ** All free space is defined by a linked list of FreeBlk structures. 166 ** 167 ** Data is stored in a linked list of Cell structures. PageHdr.firstCell 168 ** is the index into MemPage.u.aDisk[] of the first cell on the page. The 169 ** Cells are kept in sorted order. 170 ** 171 ** A Cell contains all information about a database entry and a pointer 172 ** to a child page that contains other entries less than itself. In 173 ** other words, the i-th Cell contains both Ptr(i) and Key(i). The 174 ** right-most pointer of the page is contained in PageHdr.rightChild. 175 */ 176 struct PageHdr { 177 Pgno rightChild; /* Child page that comes after all cells on this page */ 178 u16 firstCell; /* Index in MemPage.u.aDisk[] of the first cell */ 179 u16 firstFree; /* Index in MemPage.u.aDisk[] of the first free block */ 180 }; 181 182 /* 183 ** Entries on a page of the database are called "Cells". Each Cell 184 ** has a header and data. This structure defines the header. The 185 ** key and data (collectively the "payload") follow this header on 186 ** the database page. 187 ** 188 ** A definition of the complete Cell structure is given below. The 189 ** header for the cell must be defined first in order to do some 190 ** of the sizing #defines that follow. 191 */ 192 struct CellHdr { 193 Pgno leftChild; /* Child page that comes before this cell */ 194 u16 nKey; /* Number of bytes in the key */ 195 u16 iNext; /* Index in MemPage.u.aDisk[] of next cell in sorted order */ 196 u8 nKeyHi; /* Upper 8 bits of key size for keys larger than 64K bytes */ 197 u8 nDataHi; /* Upper 8 bits of data size when the size is more than 64K */ 198 u16 nData; /* Number of bytes of data */ 199 }; 200 201 /* 202 ** The key and data size are split into a lower 16-bit segment and an 203 ** upper 8-bit segment in order to pack them together into a smaller 204 ** space. The following macros reassembly a key or data size back 205 ** into an integer. 206 */ 207 #define NKEY(b,h) (SWAB16(b,h.nKey) + h.nKeyHi*65536) 208 #define NDATA(b,h) (SWAB16(b,h.nData) + h.nDataHi*65536) 209 210 /* 211 ** The minimum size of a complete Cell. The Cell must contain a header 212 ** and at least 4 bytes of payload. 213 */ 214 #define MIN_CELL_SIZE (sizeof(CellHdr)+4) 215 216 /* 217 ** The maximum number of database entries that can be held in a single 218 ** page of the database. 219 */ 220 #define MX_CELL ((SQLITE_USABLE_SIZE-sizeof(PageHdr))/MIN_CELL_SIZE) 221 222 /* 223 ** The amount of usable space on a single page of the BTree. This is the 224 ** page size minus the overhead of the page header. 225 */ 226 #define USABLE_SPACE (SQLITE_USABLE_SIZE - sizeof(PageHdr)) 227 228 /* 229 ** The maximum amount of payload (in bytes) that can be stored locally for 230 ** a database entry. If the entry contains more data than this, the 231 ** extra goes onto overflow pages. 232 ** 233 ** This number is chosen so that at least 4 cells will fit on every page. 234 */ 235 #define MX_LOCAL_PAYLOAD ((USABLE_SPACE/4-(sizeof(CellHdr)+sizeof(Pgno)))&~3) 236 237 /* 238 ** Data on a database page is stored as a linked list of Cell structures. 239 ** Both the key and the data are stored in aPayload[]. The key always comes 240 ** first. The aPayload[] field grows as necessary to hold the key and data, 241 ** up to a maximum of MX_LOCAL_PAYLOAD bytes. If the size of the key and 242 ** data combined exceeds MX_LOCAL_PAYLOAD bytes, then Cell.ovfl is the 243 ** page number of the first overflow page. 244 ** 245 ** Though this structure is fixed in size, the Cell on the database 246 ** page varies in size. Every cell has a CellHdr and at least 4 bytes 247 ** of payload space. Additional payload bytes (up to the maximum of 248 ** MX_LOCAL_PAYLOAD) and the Cell.ovfl value are allocated only as 249 ** needed. 250 */ 251 struct Cell { 252 CellHdr h; /* The cell header */ 253 char aPayload[MX_LOCAL_PAYLOAD]; /* Key and data */ 254 Pgno ovfl; /* The first overflow page */ 255 }; 256 257 /* 258 ** Free space on a page is remembered using a linked list of the FreeBlk 259 ** structures. Space on a database page is allocated in increments of 260 ** at least 4 bytes and is always aligned to a 4-byte boundry. The 261 ** linked list of FreeBlks is always kept in order by address. 262 */ 263 struct FreeBlk { 264 u16 iSize; /* Number of bytes in this block of free space */ 265 u16 iNext; /* Index in MemPage.u.aDisk[] of the next free block */ 266 }; 267 268 /* 269 ** The number of bytes of payload that will fit on a single overflow page. 270 */ 271 #define OVERFLOW_SIZE (SQLITE_USABLE_SIZE-sizeof(Pgno)) 272 273 /* 274 ** When the key and data for a single entry in the BTree will not fit in 275 ** the MX_LOCAL_PAYLOAD bytes of space available on the database page, 276 ** then all extra bytes are written to a linked list of overflow pages. 277 ** Each overflow page is an instance of the following structure. 278 ** 279 ** Unused pages in the database are also represented by instances of 280 ** the OverflowPage structure. The PageOne.freeList field is the 281 ** page number of the first page in a linked list of unused database 282 ** pages. 283 */ 284 struct OverflowPage { 285 Pgno iNext; 286 char aPayload[OVERFLOW_SIZE]; 287 }; 288 289 /* 290 ** The PageOne.freeList field points to a linked list of overflow pages 291 ** hold information about free pages. The aPayload section of each 292 ** overflow page contains an instance of the following structure. The 293 ** aFree[] array holds the page number of nFree unused pages in the disk 294 ** file. 295 */ 296 struct FreelistInfo { 297 int nFree; 298 Pgno aFree[(OVERFLOW_SIZE-sizeof(int))/sizeof(Pgno)]; 299 }; 300 301 /* 302 ** For every page in the database file, an instance of the following structure 303 ** is stored in memory. The u.aDisk[] array contains the raw bits read from 304 ** the disk. The rest is auxiliary information held in memory only. The 305 ** auxiliary info is only valid for regular database pages - it is not 306 ** used for overflow pages and pages on the freelist. 307 ** 308 ** Of particular interest in the auxiliary info is the apCell[] entry. Each 309 ** apCell[] entry is a pointer to a Cell structure in u.aDisk[]. The cells are 310 ** put in this array so that they can be accessed in constant time, rather 311 ** than in linear time which would be needed if we had to walk the linked 312 ** list on every access. 313 ** 314 ** Note that apCell[] contains enough space to hold up to two more Cells 315 ** than can possibly fit on one page. In the steady state, every apCell[] 316 ** points to memory inside u.aDisk[]. But in the middle of an insert 317 ** operation, some apCell[] entries may temporarily point to data space 318 ** outside of u.aDisk[]. This is a transient situation that is quickly 319 ** resolved. But while it is happening, it is possible for a database 320 ** page to hold as many as two more cells than it might otherwise hold. 321 ** The extra two entries in apCell[] are an allowance for this situation. 322 ** 323 ** The pParent field points back to the parent page. This allows us to 324 ** walk up the BTree from any leaf to the root. Care must be taken to 325 ** unref() the parent page pointer when this page is no longer referenced. 326 ** The pageDestructor() routine handles that chore. 327 */ 328 struct MemPage { 329 union u_page_data { 330 char aDisk[SQLITE_PAGE_SIZE]; /* Page data stored on disk */ 331 PageHdr hdr; /* Overlay page header */ 332 } u; 333 u8 isInit; /* True if auxiliary data is initialized */ 334 u8 idxShift; /* True if apCell[] indices have changed */ 335 u8 isOverfull; /* Some apCell[] points outside u.aDisk[] */ 336 MemPage *pParent; /* The parent of this page. NULL for root */ 337 int idxParent; /* Index in pParent->apCell[] of this node */ 338 int nFree; /* Number of free bytes in u.aDisk[] */ 339 int nCell; /* Number of entries on this page */ 340 Cell *apCell[MX_CELL+2]; /* All data entires in sorted order */ 341 }; 342 343 /* 344 ** The in-memory image of a disk page has the auxiliary information appended 345 ** to the end. EXTRA_SIZE is the number of bytes of space needed to hold 346 ** that extra information. 347 */ 348 #define EXTRA_SIZE (sizeof(MemPage)-sizeof(union u_page_data)) 349 350 /* 351 ** Everything we need to know about an open database 352 */ 353 struct Btree { 354 BtOps *pOps; /* Function table */ 355 Pager *pPager; /* The page cache */ 356 BtCursor *pCursor; /* A list of all open cursors */ 357 PageOne *page1; /* First page of the database */ 358 u8 inTrans; /* True if a transaction is in progress */ 359 u8 inCkpt; /* True if there is a checkpoint on the transaction */ 360 u8 readOnly; /* True if the underlying file is readonly */ 361 u8 needSwab; /* Need to byte-swapping */ 362 }; 363 typedef Btree Bt; 364 365 /* 366 ** A cursor is a pointer to a particular entry in the BTree. 367 ** The entry is identified by its MemPage and the index in 368 ** MemPage.apCell[] of the entry. 369 */ 370 struct BtCursor { 371 BtCursorOps *pOps; /* Function table */ 372 Btree *pBt; /* The Btree to which this cursor belongs */ 373 BtCursor *pNext, *pPrev; /* Forms a linked list of all cursors */ 374 BtCursor *pShared; /* Loop of cursors with the same root page */ 375 Pgno pgnoRoot; /* The root page of this tree */ 376 MemPage *pPage; /* Page that contains the entry */ 377 int idx; /* Index of the entry in pPage->apCell[] */ 378 u8 wrFlag; /* True if writable */ 379 u8 eSkip; /* Determines if next step operation is a no-op */ 380 u8 iMatch; /* compare result from last sqliteBtreeMoveto() */ 381 }; 382 383 /* 384 ** Legal values for BtCursor.eSkip. 385 */ 386 #define SKIP_NONE 0 /* Always step the cursor */ 387 #define SKIP_NEXT 1 /* The next sqliteBtreeNext() is a no-op */ 388 #define SKIP_PREV 2 /* The next sqliteBtreePrevious() is a no-op */ 389 #define SKIP_INVALID 3 /* Calls to Next() and Previous() are invalid */ 390 391 /* Forward declarations */ 392 static int fileBtreeCloseCursor(BtCursor *pCur); 393 394 /* 395 ** Routines for byte swapping. 396 */ 397 u16 swab16(u16 x){ 398 return ((x & 0xff)<<8) | ((x>>8)&0xff); 399 } 400 u32 swab32(u32 x){ 401 return ((x & 0xff)<<24) | ((x & 0xff00)<<8) | 402 ((x>>8) & 0xff00) | ((x>>24)&0xff); 403 } 404 405 /* 406 ** Compute the total number of bytes that a Cell needs on the main 407 ** database page. The number returned includes the Cell header, 408 ** local payload storage, and the pointer to overflow pages (if 409 ** applicable). Additional space allocated on overflow pages 410 ** is NOT included in the value returned from this routine. 411 */ 412 static int cellSize(Btree *pBt, Cell *pCell){ 413 int n = NKEY(pBt, pCell->h) + NDATA(pBt, pCell->h); 414 if( n>MX_LOCAL_PAYLOAD ){ 415 n = MX_LOCAL_PAYLOAD + sizeof(Pgno); 416 }else{ 417 n = ROUNDUP(n); 418 } 419 n += sizeof(CellHdr); 420 return n; 421 } 422 423 /* 424 ** Defragment the page given. All Cells are moved to the 425 ** beginning of the page and all free space is collected 426 ** into one big FreeBlk at the end of the page. 427 */ 428 static void defragmentPage(Btree *pBt, MemPage *pPage){ 429 int pc, i, n; 430 FreeBlk *pFBlk; 431 char newPage[SQLITE_USABLE_SIZE]; 432 433 assert( sqlitepager_iswriteable(pPage) ); 434 assert( pPage->isInit ); 435 pc = sizeof(PageHdr); 436 pPage->u.hdr.firstCell = SWAB16(pBt, pc); 437 memcpy(newPage, pPage->u.aDisk, pc); 438 for(i=0; i<pPage->nCell; i++){ 439 Cell *pCell = pPage->apCell[i]; 440 441 /* This routine should never be called on an overfull page. The 442 ** following asserts verify that constraint. */ 443 assert( Addr(pCell) > Addr(pPage) ); 444 assert( Addr(pCell) < Addr(pPage) + SQLITE_USABLE_SIZE ); 445 446 n = cellSize(pBt, pCell); 447 pCell->h.iNext = SWAB16(pBt, pc + n); 448 memcpy(&newPage[pc], pCell, n); 449 pPage->apCell[i] = (Cell*)&pPage->u.aDisk[pc]; 450 pc += n; 451 } 452 assert( pPage->nFree==SQLITE_USABLE_SIZE-pc ); 453 memcpy(pPage->u.aDisk, newPage, pc); 454 if( pPage->nCell>0 ){ 455 pPage->apCell[pPage->nCell-1]->h.iNext = 0; 456 } 457 pFBlk = (FreeBlk*)&pPage->u.aDisk[pc]; 458 pFBlk->iSize = SWAB16(pBt, SQLITE_USABLE_SIZE - pc); 459 pFBlk->iNext = 0; 460 pPage->u.hdr.firstFree = SWAB16(pBt, pc); 461 memset(&pFBlk[1], 0, SQLITE_USABLE_SIZE - pc - sizeof(FreeBlk)); 462 } 463 464 /* 465 ** Allocate nByte bytes of space on a page. nByte must be a 466 ** multiple of 4. 467 ** 468 ** Return the index into pPage->u.aDisk[] of the first byte of 469 ** the new allocation. Or return 0 if there is not enough free 470 ** space on the page to satisfy the allocation request. 471 ** 472 ** If the page contains nBytes of free space but does not contain 473 ** nBytes of contiguous free space, then this routine automatically 474 ** calls defragementPage() to consolidate all free space before 475 ** allocating the new chunk. 476 */ 477 static int allocateSpace(Btree *pBt, MemPage *pPage, int nByte){ 478 FreeBlk *p; 479 u16 *pIdx; 480 int start; 481 int iSize; 482 #ifndef NDEBUG 483 int cnt = 0; 484 #endif 485 486 assert( sqlitepager_iswriteable(pPage) ); 487 assert( nByte==ROUNDUP(nByte) ); 488 assert( pPage->isInit ); 489 if( pPage->nFree<nByte || pPage->isOverfull ) return 0; 490 pIdx = &pPage->u.hdr.firstFree; 491 p = (FreeBlk*)&pPage->u.aDisk[SWAB16(pBt, *pIdx)]; 492 while( (iSize = SWAB16(pBt, p->iSize))<nByte ){ 493 assert( cnt++ < SQLITE_USABLE_SIZE/4 ); 494 if( p->iNext==0 ){ 495 defragmentPage(pBt, pPage); 496 pIdx = &pPage->u.hdr.firstFree; 497 }else{ 498 pIdx = &p->iNext; 499 } 500 p = (FreeBlk*)&pPage->u.aDisk[SWAB16(pBt, *pIdx)]; 501 } 502 if( iSize==nByte ){ 503 start = SWAB16(pBt, *pIdx); 504 *pIdx = p->iNext; 505 }else{ 506 FreeBlk *pNew; 507 start = SWAB16(pBt, *pIdx); 508 pNew = (FreeBlk*)&pPage->u.aDisk[start + nByte]; 509 pNew->iNext = p->iNext; 510 pNew->iSize = SWAB16(pBt, iSize - nByte); 511 *pIdx = SWAB16(pBt, start + nByte); 512 } 513 pPage->nFree -= nByte; 514 return start; 515 } 516 517 /* 518 ** Return a section of the MemPage.u.aDisk[] to the freelist. 519 ** The first byte of the new free block is pPage->u.aDisk[start] 520 ** and the size of the block is "size" bytes. Size must be 521 ** a multiple of 4. 522 ** 523 ** Most of the effort here is involved in coalesing adjacent 524 ** free blocks into a single big free block. 525 */ 526 static void freeSpace(Btree *pBt, MemPage *pPage, int start, int size){ 527 int end = start + size; 528 u16 *pIdx, idx; 529 FreeBlk *pFBlk; 530 FreeBlk *pNew; 531 FreeBlk *pNext; 532 int iSize; 533 534 assert( sqlitepager_iswriteable(pPage) ); 535 assert( size == ROUNDUP(size) ); 536 assert( start == ROUNDUP(start) ); 537 assert( pPage->isInit ); 538 pIdx = &pPage->u.hdr.firstFree; 539 idx = SWAB16(pBt, *pIdx); 540 while( idx!=0 && idx<start ){ 541 pFBlk = (FreeBlk*)&pPage->u.aDisk[idx]; 542 iSize = SWAB16(pBt, pFBlk->iSize); 543 if( idx + iSize == start ){ 544 pFBlk->iSize = SWAB16(pBt, iSize + size); 545 if( idx + iSize + size == SWAB16(pBt, pFBlk->iNext) ){ 546 pNext = (FreeBlk*)&pPage->u.aDisk[idx + iSize + size]; 547 if( pBt->needSwab ){ 548 pFBlk->iSize = swab16((u16)swab16(pNext->iSize)+iSize+size); 549 }else{ 550 pFBlk->iSize += pNext->iSize; 551 } 552 pFBlk->iNext = pNext->iNext; 553 } 554 pPage->nFree += size; 555 return; 556 } 557 pIdx = &pFBlk->iNext; 558 idx = SWAB16(pBt, *pIdx); 559 } 560 pNew = (FreeBlk*)&pPage->u.aDisk[start]; 561 if( idx != end ){ 562 pNew->iSize = SWAB16(pBt, size); 563 pNew->iNext = SWAB16(pBt, idx); 564 }else{ 565 pNext = (FreeBlk*)&pPage->u.aDisk[idx]; 566 pNew->iSize = SWAB16(pBt, size + SWAB16(pBt, pNext->iSize)); 567 pNew->iNext = pNext->iNext; 568 } 569 *pIdx = SWAB16(pBt, start); 570 pPage->nFree += size; 571 } 572 573 /* 574 ** Initialize the auxiliary information for a disk block. 575 ** 576 ** The pParent parameter must be a pointer to the MemPage which 577 ** is the parent of the page being initialized. The root of the 578 ** BTree (usually page 2) has no parent and so for that page, 579 ** pParent==NULL. 580 ** 581 ** Return SQLITE_OK on success. If we see that the page does 582 ** not contain a well-formed database page, then return 583 ** SQLITE_CORRUPT. Note that a return of SQLITE_OK does not 584 ** guarantee that the page is well-formed. It only shows that 585 ** we failed to detect any corruption. 586 */ 587 static int initPage(Bt *pBt, MemPage *pPage, Pgno pgnoThis, MemPage *pParent){ 588 int idx; /* An index into pPage->u.aDisk[] */ 589 Cell *pCell; /* A pointer to a Cell in pPage->u.aDisk[] */ 590 FreeBlk *pFBlk; /* A pointer to a free block in pPage->u.aDisk[] */ 591 int sz; /* The size of a Cell in bytes */ 592 int freeSpace; /* Amount of free space on the page */ 593 594 if( pPage->pParent ){ 595 assert( pPage->pParent==pParent ); 596 return SQLITE_OK; 597 } 598 if( pParent ){ 599 pPage->pParent = pParent; 600 sqlitepager_ref(pParent); 601 } 602 if( pPage->isInit ) return SQLITE_OK; 603 pPage->isInit = 1; 604 pPage->nCell = 0; 605 freeSpace = USABLE_SPACE; 606 idx = SWAB16(pBt, pPage->u.hdr.firstCell); 607 while( idx!=0 ){ 608 if( idx>SQLITE_USABLE_SIZE-MIN_CELL_SIZE ) goto page_format_error; 609 if( idx<sizeof(PageHdr) ) goto page_format_error; 610 if( idx!=ROUNDUP(idx) ) goto page_format_error; 611 pCell = (Cell*)&pPage->u.aDisk[idx]; 612 sz = cellSize(pBt, pCell); 613 if( idx+sz > SQLITE_USABLE_SIZE ) goto page_format_error; 614 freeSpace -= sz; 615 pPage->apCell[pPage->nCell++] = pCell; 616 idx = SWAB16(pBt, pCell->h.iNext); 617 } 618 pPage->nFree = 0; 619 idx = SWAB16(pBt, pPage->u.hdr.firstFree); 620 while( idx!=0 ){ 621 int iNext; 622 if( idx>SQLITE_USABLE_SIZE-sizeof(FreeBlk) ) goto page_format_error; 623 if( idx<sizeof(PageHdr) ) goto page_format_error; 624 pFBlk = (FreeBlk*)&pPage->u.aDisk[idx]; 625 pPage->nFree += SWAB16(pBt, pFBlk->iSize); 626 iNext = SWAB16(pBt, pFBlk->iNext); 627 if( iNext>0 && iNext <= idx ) goto page_format_error; 628 idx = iNext; 629 } 630 if( pPage->nCell==0 && pPage->nFree==0 ){ 631 /* As a special case, an uninitialized root page appears to be 632 ** an empty database */ 633 return SQLITE_OK; 634 } 635 if( pPage->nFree!=freeSpace ) goto page_format_error; 636 return SQLITE_OK; 637 638 page_format_error: 639 return SQLITE_CORRUPT; 640 } 641 642 /* 643 ** Set up a raw page so that it looks like a database page holding 644 ** no entries. 645 */ 646 static void zeroPage(Btree *pBt, MemPage *pPage){ 647 PageHdr *pHdr; 648 FreeBlk *pFBlk; 649 assert( sqlitepager_iswriteable(pPage) ); 650 memset(pPage, 0, SQLITE_USABLE_SIZE); 651 pHdr = &pPage->u.hdr; 652 pHdr->firstCell = 0; 653 pHdr->firstFree = SWAB16(pBt, sizeof(*pHdr)); 654 pFBlk = (FreeBlk*)&pHdr[1]; 655 pFBlk->iNext = 0; 656 pPage->nFree = SQLITE_USABLE_SIZE - sizeof(*pHdr); 657 pFBlk->iSize = SWAB16(pBt, pPage->nFree); 658 pPage->nCell = 0; 659 pPage->isOverfull = 0; 660 } 661 662 /* 663 ** This routine is called when the reference count for a page 664 ** reaches zero. We need to unref the pParent pointer when that 665 ** happens. 666 */ 667 static void pageDestructor(void *pData){ 668 MemPage *pPage = (MemPage*)pData; 669 if( pPage->pParent ){ 670 MemPage *pParent = pPage->pParent; 671 pPage->pParent = 0; 672 sqlitepager_unref(pParent); 673 } 674 } 675 676 /* 677 ** Open a new database. 678 ** 679 ** Actually, this routine just sets up the internal data structures 680 ** for accessing the database. We do not open the database file 681 ** until the first page is loaded. 682 ** 683 ** zFilename is the name of the database file. If zFilename is NULL 684 ** a new database with a random name is created. This randomly named 685 ** database file will be deleted when sqliteBtreeClose() is called. 686 */ 687 int sqliteBtreeOpen( 688 const char *zFilename, /* Name of the file containing the BTree database */ 689 int omitJournal, /* if TRUE then do not journal this file */ 690 int nCache, /* How many pages in the page cache */ 691 Btree **ppBtree /* Pointer to new Btree object written here */ 692 ){ 693 Btree *pBt; 694 int rc; 695 696 /* 697 ** The following asserts make sure that structures used by the btree are 698 ** the right size. This is to guard against size changes that result 699 ** when compiling on a different architecture. 700 */ 701 assert( sizeof(u32)==4 ); 702 assert( sizeof(u16)==2 ); 703 assert( sizeof(Pgno)==4 ); 704 assert( sizeof(PageHdr)==8 ); 705 assert( sizeof(CellHdr)==12 ); 706 assert( sizeof(FreeBlk)==4 ); 707 assert( sizeof(OverflowPage)==SQLITE_USABLE_SIZE ); 708 assert( sizeof(FreelistInfo)==OVERFLOW_SIZE ); 709 assert( sizeof(ptr)==sizeof(char*) ); 710 assert( sizeof(uptr)==sizeof(ptr) ); 711 712 pBt = sqliteMalloc( sizeof(*pBt) ); 713 if( pBt==0 ){ 714 *ppBtree = 0; 715 return SQLITE_NOMEM; 716 } 717 if( nCache<10 ) nCache = 10; 718 rc = sqlitepager_open(&pBt->pPager, zFilename, nCache, EXTRA_SIZE, 719 !omitJournal); 720 if( rc!=SQLITE_OK ){ 721 if( pBt->pPager ) sqlitepager_close(pBt->pPager); 722 sqliteFree(pBt); 723 *ppBtree = 0; 724 return rc; 725 } 726 sqlitepager_set_destructor(pBt->pPager, pageDestructor); 727 pBt->pCursor = 0; 728 pBt->page1 = 0; 729 pBt->readOnly = sqlitepager_isreadonly(pBt->pPager); 730 pBt->pOps = &sqliteBtreeOps; 731 *ppBtree = pBt; 732 return SQLITE_OK; 733 } 734 735 /* 736 ** Close an open database and invalidate all cursors. 737 */ 738 static int fileBtreeClose(Btree *pBt){ 739 while( pBt->pCursor ){ 740 fileBtreeCloseCursor(pBt->pCursor); 741 } 742 sqlitepager_close(pBt->pPager); 743 sqliteFree(pBt); 744 return SQLITE_OK; 745 } 746 747 /* 748 ** Change the limit on the number of pages allowed in the cache. 749 ** 750 ** The maximum number of cache pages is set to the absolute 751 ** value of mxPage. If mxPage is negative, the pager will 752 ** operate asynchronously - it will not stop to do fsync()s 753 ** to insure data is written to the disk surface before 754 ** continuing. Transactions still work if synchronous is off, 755 ** and the database cannot be corrupted if this program 756 ** crashes. But if the operating system crashes or there is 757 ** an abrupt power failure when synchronous is off, the database 758 ** could be left in an inconsistent and unrecoverable state. 759 ** Synchronous is on by default so database corruption is not 760 ** normally a worry. 761 */ 762 static int fileBtreeSetCacheSize(Btree *pBt, int mxPage){ 763 sqlitepager_set_cachesize(pBt->pPager, mxPage); 764 return SQLITE_OK; 765 } 766 767 /* 768 ** Change the way data is synced to disk in order to increase or decrease 769 ** how well the database resists damage due to OS crashes and power 770 ** failures. Level 1 is the same as asynchronous (no syncs() occur and 771 ** there is a high probability of damage) Level 2 is the default. There 772 ** is a very low but non-zero probability of damage. Level 3 reduces the 773 ** probability of damage to near zero but with a write performance reduction. 774 */ 775 static int fileBtreeSetSafetyLevel(Btree *pBt, int level){ 776 sqlitepager_set_safety_level(pBt->pPager, level); 777 return SQLITE_OK; 778 } 779 780 /* 781 ** Get a reference to page1 of the database file. This will 782 ** also acquire a readlock on that file. 783 ** 784 ** SQLITE_OK is returned on success. If the file is not a 785 ** well-formed database file, then SQLITE_CORRUPT is returned. 786 ** SQLITE_BUSY is returned if the database is locked. SQLITE_NOMEM 787 ** is returned if we run out of memory. SQLITE_PROTOCOL is returned 788 ** if there is a locking protocol violation. 789 */ 790 static int lockBtree(Btree *pBt){ 791 int rc; 792 if( pBt->page1 ) return SQLITE_OK; 793 rc = sqlitepager_get(pBt->pPager, 1, (void**)&pBt->page1); 794 if( rc!=SQLITE_OK ) return rc; 795 796 /* Do some checking to help insure the file we opened really is 797 ** a valid database file. 798 */ 799 if( sqlitepager_pagecount(pBt->pPager)>0 ){ 800 PageOne *pP1 = pBt->page1; 801 if( strcmp(pP1->zMagic,zMagicHeader)!=0 || 802 (pP1->iMagic!=MAGIC && swab32(pP1->iMagic)!=MAGIC) ){ 803 rc = SQLITE_NOTADB; 804 goto page1_init_failed; 805 } 806 pBt->needSwab = pP1->iMagic!=MAGIC; 807 } 808 return rc; 809 810 page1_init_failed: 811 sqlitepager_unref(pBt->page1); 812 pBt->page1 = 0; 813 return rc; 814 } 815 816 /* 817 ** If there are no outstanding cursors and we are not in the middle 818 ** of a transaction but there is a read lock on the database, then 819 ** this routine unrefs the first page of the database file which 820 ** has the effect of releasing the read lock. 821 ** 822 ** If there are any outstanding cursors, this routine is a no-op. 823 ** 824 ** If there is a transaction in progress, this routine is a no-op. 825 */ 826 static void unlockBtreeIfUnused(Btree *pBt){ 827 if( pBt->inTrans==0 && pBt->pCursor==0 && pBt->page1!=0 ){ 828 sqlitepager_unref(pBt->page1); 829 pBt->page1 = 0; 830 pBt->inTrans = 0; 831 pBt->inCkpt = 0; 832 } 833 } 834 835 /* 836 ** Create a new database by initializing the first two pages of the 837 ** file. 838 */ 839 static int newDatabase(Btree *pBt){ 840 MemPage *pRoot; 841 PageOne *pP1; 842 int rc; 843 if( sqlitepager_pagecount(pBt->pPager)>1 ) return SQLITE_OK; 844 pP1 = pBt->page1; 845 rc = sqlitepager_write(pBt->page1); 846 if( rc ) return rc; 847 rc = sqlitepager_get(pBt->pPager, 2, (void**)&pRoot); 848 if( rc ) return rc; 849 rc = sqlitepager_write(pRoot); 850 if( rc ){ 851 sqlitepager_unref(pRoot); 852 return rc; 853 } 854 strcpy(pP1->zMagic, zMagicHeader); 855 if( btree_native_byte_order ){ 856 pP1->iMagic = MAGIC; 857 pBt->needSwab = 0; 858 }else{ 859 pP1->iMagic = swab32(MAGIC); 860 pBt->needSwab = 1; 861 } 862 zeroPage(pBt, pRoot); 863 sqlitepager_unref(pRoot); 864 return SQLITE_OK; 865 } 866 867 /* 868 ** Attempt to start a new transaction. 869 ** 870 ** A transaction must be started before attempting any changes 871 ** to the database. None of the following routines will work 872 ** unless a transaction is started first: 873 ** 874 ** sqliteBtreeCreateTable() 875 ** sqliteBtreeCreateIndex() 876 ** sqliteBtreeClearTable() 877 ** sqliteBtreeDropTable() 878 ** sqliteBtreeInsert() 879 ** sqliteBtreeDelete() 880 ** sqliteBtreeUpdateMeta() 881 */ 882 static int fileBtreeBeginTrans(Btree *pBt){ 883 int rc; 884 if( pBt->inTrans ) return SQLITE_ERROR; 885 if( pBt->readOnly ) return SQLITE_READONLY; 886 if( pBt->page1==0 ){ 887 rc = lockBtree(pBt); 888 if( rc!=SQLITE_OK ){ 889 return rc; 890 } 891 } 892 rc = sqlitepager_begin(pBt->page1); 893 if( rc==SQLITE_OK ){ 894 rc = newDatabase(pBt); 895 } 896 if( rc==SQLITE_OK ){ 897 pBt->inTrans = 1; 898 pBt->inCkpt = 0; 899 }else{ 900 unlockBtreeIfUnused(pBt); 901 } 902 return rc; 903 } 904 905 /* 906 ** Commit the transaction currently in progress. 907 ** 908 ** This will release the write lock on the database file. If there 909 ** are no active cursors, it also releases the read lock. 910 */ 911 static int fileBtreeCommit(Btree *pBt){ 912 int rc; 913 rc = pBt->readOnly ? SQLITE_OK : sqlitepager_commit(pBt->pPager); 914 pBt->inTrans = 0; 915 pBt->inCkpt = 0; 916 unlockBtreeIfUnused(pBt); 917 return rc; 918 } 919 920 /* 921 ** Rollback the transaction in progress. All cursors will be 922 ** invalided by this operation. Any attempt to use a cursor 923 ** that was open at the beginning of this operation will result 924 ** in an error. 925 ** 926 ** This will release the write lock on the database file. If there 927 ** are no active cursors, it also releases the read lock. 928 */ 929 static int fileBtreeRollback(Btree *pBt){ 930 int rc; 931 BtCursor *pCur; 932 if( pBt->inTrans==0 ) return SQLITE_OK; 933 pBt->inTrans = 0; 934 pBt->inCkpt = 0; 935 rc = pBt->readOnly ? SQLITE_OK : sqlitepager_rollback(pBt->pPager); 936 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){ 937 if( pCur->pPage && pCur->pPage->isInit==0 ){ 938 sqlitepager_unref(pCur->pPage); 939 pCur->pPage = 0; 940 } 941 } 942 unlockBtreeIfUnused(pBt); 943 return rc; 944 } 945 946 /* 947 ** Set the checkpoint for the current transaction. The checkpoint serves 948 ** as a sub-transaction that can be rolled back independently of the 949 ** main transaction. You must start a transaction before starting a 950 ** checkpoint. The checkpoint is ended automatically if the transaction 951 ** commits or rolls back. 952 ** 953 ** Only one checkpoint may be active at a time. It is an error to try 954 ** to start a new checkpoint if another checkpoint is already active. 955 */ 956 static int fileBtreeBeginCkpt(Btree *pBt){ 957 int rc; 958 if( !pBt->inTrans || pBt->inCkpt ){ 959 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR; 960 } 961 rc = pBt->readOnly ? SQLITE_OK : sqlitepager_ckpt_begin(pBt->pPager); 962 pBt->inCkpt = 1; 963 return rc; 964 } 965 966 967 /* 968 ** Commit a checkpoint to transaction currently in progress. If no 969 ** checkpoint is active, this is a no-op. 970 */ 971 static int fileBtreeCommitCkpt(Btree *pBt){ 972 int rc; 973 if( pBt->inCkpt && !pBt->readOnly ){ 974 rc = sqlitepager_ckpt_commit(pBt->pPager); 975 }else{ 976 rc = SQLITE_OK; 977 } 978 pBt->inCkpt = 0; 979 return rc; 980 } 981 982 /* 983 ** Rollback the checkpoint to the current transaction. If there 984 ** is no active checkpoint or transaction, this routine is a no-op. 985 ** 986 ** All cursors will be invalided by this operation. Any attempt 987 ** to use a cursor that was open at the beginning of this operation 988 ** will result in an error. 989 */ 990 static int fileBtreeRollbackCkpt(Btree *pBt){ 991 int rc; 992 BtCursor *pCur; 993 if( pBt->inCkpt==0 || pBt->readOnly ) return SQLITE_OK; 994 rc = sqlitepager_ckpt_rollback(pBt->pPager); 995 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){ 996 if( pCur->pPage && pCur->pPage->isInit==0 ){ 997 sqlitepager_unref(pCur->pPage); 998 pCur->pPage = 0; 999 } 1000 } 1001 pBt->inCkpt = 0; 1002 return rc; 1003 } 1004 1005 /* 1006 ** Create a new cursor for the BTree whose root is on the page 1007 ** iTable. The act of acquiring a cursor gets a read lock on 1008 ** the database file. 1009 ** 1010 ** If wrFlag==0, then the cursor can only be used for reading. 1011 ** If wrFlag==1, then the cursor can be used for reading or for 1012 ** writing if other conditions for writing are also met. These 1013 ** are the conditions that must be met in order for writing to 1014 ** be allowed: 1015 ** 1016 ** 1: The cursor must have been opened with wrFlag==1 1017 ** 1018 ** 2: No other cursors may be open with wrFlag==0 on the same table 1019 ** 1020 ** 3: The database must be writable (not on read-only media) 1021 ** 1022 ** 4: There must be an active transaction. 1023 ** 1024 ** Condition 2 warrants further discussion. If any cursor is opened 1025 ** on a table with wrFlag==0, that prevents all other cursors from 1026 ** writing to that table. This is a kind of "read-lock". When a cursor 1027 ** is opened with wrFlag==0 it is guaranteed that the table will not 1028 ** change as long as the cursor is open. This allows the cursor to 1029 ** do a sequential scan of the table without having to worry about 1030 ** entries being inserted or deleted during the scan. Cursors should 1031 ** be opened with wrFlag==0 only if this read-lock property is needed. 1032 ** That is to say, cursors should be opened with wrFlag==0 only if they 1033 ** intend to use the sqliteBtreeNext() system call. All other cursors 1034 ** should be opened with wrFlag==1 even if they never really intend 1035 ** to write. 1036 ** 1037 ** No checking is done to make sure that page iTable really is the 1038 ** root page of a b-tree. If it is not, then the cursor acquired 1039 ** will not work correctly. 1040 */ 1041 static 1042 int fileBtreeCursor(Btree *pBt, int iTable, int wrFlag, BtCursor **ppCur){ 1043 int rc; 1044 BtCursor *pCur, *pRing; 1045 1046 if( pBt->readOnly && wrFlag ){ 1047 *ppCur = 0; 1048 return SQLITE_READONLY; 1049 } 1050 if( pBt->page1==0 ){ 1051 rc = lockBtree(pBt); 1052 if( rc!=SQLITE_OK ){ 1053 *ppCur = 0; 1054 return rc; 1055 } 1056 } 1057 pCur = sqliteMalloc( sizeof(*pCur) ); 1058 if( pCur==0 ){ 1059 rc = SQLITE_NOMEM; 1060 goto create_cursor_exception; 1061 } 1062 pCur->pgnoRoot = (Pgno)iTable; 1063 rc = sqlitepager_get(pBt->pPager, pCur->pgnoRoot, (void**)&pCur->pPage); 1064 if( rc!=SQLITE_OK ){ 1065 goto create_cursor_exception; 1066 } 1067 rc = initPage(pBt, pCur->pPage, pCur->pgnoRoot, 0); 1068 if( rc!=SQLITE_OK ){ 1069 goto create_cursor_exception; 1070 } 1071 pCur->pOps = &sqliteBtreeCursorOps; 1072 pCur->pBt = pBt; 1073 pCur->wrFlag = wrFlag; 1074 pCur->idx = 0; 1075 pCur->eSkip = SKIP_INVALID; 1076 pCur->pNext = pBt->pCursor; 1077 if( pCur->pNext ){ 1078 pCur->pNext->pPrev = pCur; 1079 } 1080 pCur->pPrev = 0; 1081 pRing = pBt->pCursor; 1082 while( pRing && pRing->pgnoRoot!=pCur->pgnoRoot ){ pRing = pRing->pNext; } 1083 if( pRing ){ 1084 pCur->pShared = pRing->pShared; 1085 pRing->pShared = pCur; 1086 }else{ 1087 pCur->pShared = pCur; 1088 } 1089 pBt->pCursor = pCur; 1090 *ppCur = pCur; 1091 return SQLITE_OK; 1092 1093 create_cursor_exception: 1094 *ppCur = 0; 1095 if( pCur ){ 1096 if( pCur->pPage ) sqlitepager_unref(pCur->pPage); 1097 sqliteFree(pCur); 1098 } 1099 unlockBtreeIfUnused(pBt); 1100 return rc; 1101 } 1102 1103 /* 1104 ** Close a cursor. The read lock on the database file is released 1105 ** when the last cursor is closed. 1106 */ 1107 static int fileBtreeCloseCursor(BtCursor *pCur){ 1108 Btree *pBt = pCur->pBt; 1109 if( pCur->pPrev ){ 1110 pCur->pPrev->pNext = pCur->pNext; 1111 }else{ 1112 pBt->pCursor = pCur->pNext; 1113 } 1114 if( pCur->pNext ){ 1115 pCur->pNext->pPrev = pCur->pPrev; 1116 } 1117 if( pCur->pPage ){ 1118 sqlitepager_unref(pCur->pPage); 1119 } 1120 if( pCur->pShared!=pCur ){ 1121 BtCursor *pRing = pCur->pShared; 1122 while( pRing->pShared!=pCur ){ pRing = pRing->pShared; } 1123 pRing->pShared = pCur->pShared; 1124 } 1125 unlockBtreeIfUnused(pBt); 1126 sqliteFree(pCur); 1127 return SQLITE_OK; 1128 } 1129 1130 /* 1131 ** Make a temporary cursor by filling in the fields of pTempCur. 1132 ** The temporary cursor is not on the cursor list for the Btree. 1133 */ 1134 static void getTempCursor(BtCursor *pCur, BtCursor *pTempCur){ 1135 memcpy(pTempCur, pCur, sizeof(*pCur)); 1136 pTempCur->pNext = 0; 1137 pTempCur->pPrev = 0; 1138 if( pTempCur->pPage ){ 1139 sqlitepager_ref(pTempCur->pPage); 1140 } 1141 } 1142 1143 /* 1144 ** Delete a temporary cursor such as was made by the CreateTemporaryCursor() 1145 ** function above. 1146 */ 1147 static void releaseTempCursor(BtCursor *pCur){ 1148 if( pCur->pPage ){ 1149 sqlitepager_unref(pCur->pPage); 1150 } 1151 } 1152 1153 /* 1154 ** Set *pSize to the number of bytes of key in the entry the 1155 ** cursor currently points to. Always return SQLITE_OK. 1156 ** Failure is not possible. If the cursor is not currently 1157 ** pointing to an entry (which can happen, for example, if 1158 ** the database is empty) then *pSize is set to 0. 1159 */ 1160 static int fileBtreeKeySize(BtCursor *pCur, int *pSize){ 1161 Cell *pCell; 1162 MemPage *pPage; 1163 1164 pPage = pCur->pPage; 1165 assert( pPage!=0 ); 1166 if( pCur->idx >= pPage->nCell ){ 1167 *pSize = 0; 1168 }else{ 1169 pCell = pPage->apCell[pCur->idx]; 1170 *pSize = NKEY(pCur->pBt, pCell->h); 1171 } 1172 return SQLITE_OK; 1173 } 1174 1175 /* 1176 ** Read payload information from the entry that the pCur cursor is 1177 ** pointing to. Begin reading the payload at "offset" and read 1178 ** a total of "amt" bytes. Put the result in zBuf. 1179 ** 1180 ** This routine does not make a distinction between key and data. 1181 ** It just reads bytes from the payload area. 1182 */ 1183 static int getPayload(BtCursor *pCur, int offset, int amt, char *zBuf){ 1184 char *aPayload; 1185 Pgno nextPage; 1186 int rc; 1187 Btree *pBt = pCur->pBt; 1188 assert( pCur!=0 && pCur->pPage!=0 ); 1189 assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell ); 1190 aPayload = pCur->pPage->apCell[pCur->idx]->aPayload; 1191 if( offset<MX_LOCAL_PAYLOAD ){ 1192 int a = amt; 1193 if( a+offset>MX_LOCAL_PAYLOAD ){ 1194 a = MX_LOCAL_PAYLOAD - offset; 1195 } 1196 memcpy(zBuf, &aPayload[offset], a); 1197 if( a==amt ){ 1198 return SQLITE_OK; 1199 } 1200 offset = 0; 1201 zBuf += a; 1202 amt -= a; 1203 }else{ 1204 offset -= MX_LOCAL_PAYLOAD; 1205 } 1206 if( amt>0 ){ 1207 nextPage = SWAB32(pBt, pCur->pPage->apCell[pCur->idx]->ovfl); 1208 } 1209 while( amt>0 && nextPage ){ 1210 OverflowPage *pOvfl; 1211 rc = sqlitepager_get(pBt->pPager, nextPage, (void**)&pOvfl); 1212 if( rc!=0 ){ 1213 return rc; 1214 } 1215 nextPage = SWAB32(pBt, pOvfl->iNext); 1216 if( offset<OVERFLOW_SIZE ){ 1217 int a = amt; 1218 if( a + offset > OVERFLOW_SIZE ){ 1219 a = OVERFLOW_SIZE - offset; 1220 } 1221 memcpy(zBuf, &pOvfl->aPayload[offset], a); 1222 offset = 0; 1223 amt -= a; 1224 zBuf += a; 1225 }else{ 1226 offset -= OVERFLOW_SIZE; 1227 } 1228 sqlitepager_unref(pOvfl); 1229 } 1230 if( amt>0 ){ 1231 return SQLITE_CORRUPT; 1232 } 1233 return SQLITE_OK; 1234 } 1235 1236 /* 1237 ** Read part of the key associated with cursor pCur. A maximum 1238 ** of "amt" bytes will be transfered into zBuf[]. The transfer 1239 ** begins at "offset". The number of bytes actually read is 1240 ** returned. 1241 ** 1242 ** Change: It used to be that the amount returned will be smaller 1243 ** than the amount requested if there are not enough bytes in the key 1244 ** to satisfy the request. But now, it must be the case that there 1245 ** is enough data available to satisfy the request. If not, an exception 1246 ** is raised. The change was made in an effort to boost performance 1247 ** by eliminating unneeded tests. 1248 */ 1249 static int fileBtreeKey(BtCursor *pCur, int offset, int amt, char *zBuf){ 1250 MemPage *pPage; 1251 1252 assert( amt>=0 ); 1253 assert( offset>=0 ); 1254 assert( pCur->pPage!=0 ); 1255 pPage = pCur->pPage; 1256 if( pCur->idx >= pPage->nCell ){ 1257 return 0; 1258 } 1259 assert( amt+offset <= NKEY(pCur->pBt, pPage->apCell[pCur->idx]->h) ); 1260 getPayload(pCur, offset, amt, zBuf); 1261 return amt; 1262 } 1263 1264 /* 1265 ** Set *pSize to the number of bytes of data in the entry the 1266 ** cursor currently points to. Always return SQLITE_OK. 1267 ** Failure is not possible. If the cursor is not currently 1268 ** pointing to an entry (which can happen, for example, if 1269 ** the database is empty) then *pSize is set to 0. 1270 */ 1271 static int fileBtreeDataSize(BtCursor *pCur, int *pSize){ 1272 Cell *pCell; 1273 MemPage *pPage; 1274 1275 pPage = pCur->pPage; 1276 assert( pPage!=0 ); 1277 if( pCur->idx >= pPage->nCell ){ 1278 *pSize = 0; 1279 }else{ 1280 pCell = pPage->apCell[pCur->idx]; 1281 *pSize = NDATA(pCur->pBt, pCell->h); 1282 } 1283 return SQLITE_OK; 1284 } 1285 1286 /* 1287 ** Read part of the data associated with cursor pCur. A maximum 1288 ** of "amt" bytes will be transfered into zBuf[]. The transfer 1289 ** begins at "offset". The number of bytes actually read is 1290 ** returned. The amount returned will be smaller than the 1291 ** amount requested if there are not enough bytes in the data 1292 ** to satisfy the request. 1293 */ 1294 static int fileBtreeData(BtCursor *pCur, int offset, int amt, char *zBuf){ 1295 Cell *pCell; 1296 MemPage *pPage; 1297 1298 assert( amt>=0 ); 1299 assert( offset>=0 ); 1300 assert( pCur->pPage!=0 ); 1301 pPage = pCur->pPage; 1302 if( pCur->idx >= pPage->nCell ){ 1303 return 0; 1304 } 1305 pCell = pPage->apCell[pCur->idx]; 1306 assert( amt+offset <= NDATA(pCur->pBt, pCell->h) ); 1307 getPayload(pCur, offset + NKEY(pCur->pBt, pCell->h), amt, zBuf); 1308 return amt; 1309 } 1310 1311 /* 1312 ** Compare an external key against the key on the entry that pCur points to. 1313 ** 1314 ** The external key is pKey and is nKey bytes long. The last nIgnore bytes 1315 ** of the key associated with pCur are ignored, as if they do not exist. 1316 ** (The normal case is for nIgnore to be zero in which case the entire 1317 ** internal key is used in the comparison.) 1318 ** 1319 ** The comparison result is written to *pRes as follows: 1320 ** 1321 ** *pRes<0 This means pCur<pKey 1322 ** 1323 ** *pRes==0 This means pCur==pKey for all nKey bytes 1324 ** 1325 ** *pRes>0 This means pCur>pKey 1326 ** 1327 ** When one key is an exact prefix of the other, the shorter key is 1328 ** considered less than the longer one. In order to be equal the 1329 ** keys must be exactly the same length. (The length of the pCur key 1330 ** is the actual key length minus nIgnore bytes.) 1331 */ 1332 static int fileBtreeKeyCompare( 1333 BtCursor *pCur, /* Pointer to entry to compare against */ 1334 const void *pKey, /* Key to compare against entry that pCur points to */ 1335 int nKey, /* Number of bytes in pKey */ 1336 int nIgnore, /* Ignore this many bytes at the end of pCur */ 1337 int *pResult /* Write the result here */ 1338 ){ 1339 Pgno nextPage; 1340 int n, c, rc, nLocal; 1341 Cell *pCell; 1342 Btree *pBt = pCur->pBt; 1343 const char *zKey = (const char*)pKey; 1344 1345 assert( pCur->pPage ); 1346 assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell ); 1347 pCell = pCur->pPage->apCell[pCur->idx]; 1348 nLocal = NKEY(pBt, pCell->h) - nIgnore; 1349 if( nLocal<0 ) nLocal = 0; 1350 n = nKey<nLocal ? nKey : nLocal; 1351 if( n>MX_LOCAL_PAYLOAD ){ 1352 n = MX_LOCAL_PAYLOAD; 1353 } 1354 c = memcmp(pCell->aPayload, zKey, n); 1355 if( c!=0 ){ 1356 *pResult = c; 1357 return SQLITE_OK; 1358 } 1359 zKey += n; 1360 nKey -= n; 1361 nLocal -= n; 1362 nextPage = SWAB32(pBt, pCell->ovfl); 1363 while( nKey>0 && nLocal>0 ){ 1364 OverflowPage *pOvfl; 1365 if( nextPage==0 ){ 1366 return SQLITE_CORRUPT; 1367 } 1368 rc = sqlitepager_get(pBt->pPager, nextPage, (void**)&pOvfl); 1369 if( rc ){ 1370 return rc; 1371 } 1372 nextPage = SWAB32(pBt, pOvfl->iNext); 1373 n = nKey<nLocal ? nKey : nLocal; 1374 if( n>OVERFLOW_SIZE ){ 1375 n = OVERFLOW_SIZE; 1376 } 1377 c = memcmp(pOvfl->aPayload, zKey, n); 1378 sqlitepager_unref(pOvfl); 1379 if( c!=0 ){ 1380 *pResult = c; 1381 return SQLITE_OK; 1382 } 1383 nKey -= n; 1384 nLocal -= n; 1385 zKey += n; 1386 } 1387 if( c==0 ){ 1388 c = nLocal - nKey; 1389 } 1390 *pResult = c; 1391 return SQLITE_OK; 1392 } 1393 1394 /* 1395 ** Move the cursor down to a new child page. The newPgno argument is the 1396 ** page number of the child page in the byte order of the disk image. 1397 */ 1398 static int moveToChild(BtCursor *pCur, int newPgno){ 1399 int rc; 1400 MemPage *pNewPage; 1401 Btree *pBt = pCur->pBt; 1402 1403 newPgno = SWAB32(pBt, newPgno); 1404 rc = sqlitepager_get(pBt->pPager, newPgno, (void**)&pNewPage); 1405 if( rc ) return rc; 1406 rc = initPage(pBt, pNewPage, newPgno, pCur->pPage); 1407 if( rc ) return rc; 1408 assert( pCur->idx>=pCur->pPage->nCell 1409 || pCur->pPage->apCell[pCur->idx]->h.leftChild==SWAB32(pBt,newPgno) ); 1410 assert( pCur->idx<pCur->pPage->nCell 1411 || pCur->pPage->u.hdr.rightChild==SWAB32(pBt,newPgno) ); 1412 pNewPage->idxParent = pCur->idx; 1413 pCur->pPage->idxShift = 0; 1414 sqlitepager_unref(pCur->pPage); 1415 pCur->pPage = pNewPage; 1416 pCur->idx = 0; 1417 if( pNewPage->nCell<1 ){ 1418 return SQLITE_CORRUPT; 1419 } 1420 return SQLITE_OK; 1421 } 1422 1423 /* 1424 ** Move the cursor up to the parent page. 1425 ** 1426 ** pCur->idx is set to the cell index that contains the pointer 1427 ** to the page we are coming from. If we are coming from the 1428 ** right-most child page then pCur->idx is set to one more than 1429 ** the largest cell index. 1430 */ 1431 static void moveToParent(BtCursor *pCur){ 1432 Pgno oldPgno; 1433 MemPage *pParent; 1434 MemPage *pPage; 1435 int idxParent; 1436 pPage = pCur->pPage; 1437 assert( pPage!=0 ); 1438 pParent = pPage->pParent; 1439 assert( pParent!=0 ); 1440 idxParent = pPage->idxParent; 1441 sqlitepager_ref(pParent); 1442 sqlitepager_unref(pPage); 1443 pCur->pPage = pParent; 1444 assert( pParent->idxShift==0 ); 1445 if( pParent->idxShift==0 ){ 1446 pCur->idx = idxParent; 1447 #ifndef NDEBUG 1448 /* Verify that pCur->idx is the correct index to point back to the child 1449 ** page we just came from 1450 */ 1451 oldPgno = SWAB32(pCur->pBt, sqlitepager_pagenumber(pPage)); 1452 if( pCur->idx<pParent->nCell ){ 1453 assert( pParent->apCell[idxParent]->h.leftChild==oldPgno ); 1454 }else{ 1455 assert( pParent->u.hdr.rightChild==oldPgno ); 1456 } 1457 #endif 1458 }else{ 1459 /* The MemPage.idxShift flag indicates that cell indices might have 1460 ** changed since idxParent was set and hence idxParent might be out 1461 ** of date. So recompute the parent cell index by scanning all cells 1462 ** and locating the one that points to the child we just came from. 1463 */ 1464 int i; 1465 pCur->idx = pParent->nCell; 1466 oldPgno = SWAB32(pCur->pBt, sqlitepager_pagenumber(pPage)); 1467 for(i=0; i<pParent->nCell; i++){ 1468 if( pParent->apCell[i]->h.leftChild==oldPgno ){ 1469 pCur->idx = i; 1470 break; 1471 } 1472 } 1473 } 1474 } 1475 1476 /* 1477 ** Move the cursor to the root page 1478 */ 1479 static int moveToRoot(BtCursor *pCur){ 1480 MemPage *pNew; 1481 int rc; 1482 Btree *pBt = pCur->pBt; 1483 1484 rc = sqlitepager_get(pBt->pPager, pCur->pgnoRoot, (void**)&pNew); 1485 if( rc ) return rc; 1486 rc = initPage(pBt, pNew, pCur->pgnoRoot, 0); 1487 if( rc ) return rc; 1488 sqlitepager_unref(pCur->pPage); 1489 pCur->pPage = pNew; 1490 pCur->idx = 0; 1491 return SQLITE_OK; 1492 } 1493 1494 /* 1495 ** Move the cursor down to the left-most leaf entry beneath the 1496 ** entry to which it is currently pointing. 1497 */ 1498 static int moveToLeftmost(BtCursor *pCur){ 1499 Pgno pgno; 1500 int rc; 1501 1502 while( (pgno = pCur->pPage->apCell[pCur->idx]->h.leftChild)!=0 ){ 1503 rc = moveToChild(pCur, pgno); 1504 if( rc ) return rc; 1505 } 1506 return SQLITE_OK; 1507 } 1508 1509 /* 1510 ** Move the cursor down to the right-most leaf entry beneath the 1511 ** page to which it is currently pointing. Notice the difference 1512 ** between moveToLeftmost() and moveToRightmost(). moveToLeftmost() 1513 ** finds the left-most entry beneath the *entry* whereas moveToRightmost() 1514 ** finds the right-most entry beneath the *page*. 1515 */ 1516 static int moveToRightmost(BtCursor *pCur){ 1517 Pgno pgno; 1518 int rc; 1519 1520 while( (pgno = pCur->pPage->u.hdr.rightChild)!=0 ){ 1521 pCur->idx = pCur->pPage->nCell; 1522 rc = moveToChild(pCur, pgno); 1523 if( rc ) return rc; 1524 } 1525 pCur->idx = pCur->pPage->nCell - 1; 1526 return SQLITE_OK; 1527 } 1528 1529 /* Move the cursor to the first entry in the table. Return SQLITE_OK 1530 ** on success. Set *pRes to 0 if the cursor actually points to something 1531 ** or set *pRes to 1 if the table is empty. 1532 */ 1533 static int fileBtreeFirst(BtCursor *pCur, int *pRes){ 1534 int rc; 1535 if( pCur->pPage==0 ) return SQLITE_ABORT; 1536 rc = moveToRoot(pCur); 1537 if( rc ) return rc; 1538 if( pCur->pPage->nCell==0 ){ 1539 *pRes = 1; 1540 return SQLITE_OK; 1541 } 1542 *pRes = 0; 1543 rc = moveToLeftmost(pCur); 1544 pCur->eSkip = SKIP_NONE; 1545 return rc; 1546 } 1547 1548 /* Move the cursor to the last entry in the table. Return SQLITE_OK 1549 ** on success. Set *pRes to 0 if the cursor actually points to something 1550 ** or set *pRes to 1 if the table is empty. 1551 */ 1552 static int fileBtreeLast(BtCursor *pCur, int *pRes){ 1553 int rc; 1554 if( pCur->pPage==0 ) return SQLITE_ABORT; 1555 rc = moveToRoot(pCur); 1556 if( rc ) return rc; 1557 assert( pCur->pPage->isInit ); 1558 if( pCur->pPage->nCell==0 ){ 1559 *pRes = 1; 1560 return SQLITE_OK; 1561 } 1562 *pRes = 0; 1563 rc = moveToRightmost(pCur); 1564 pCur->eSkip = SKIP_NONE; 1565 return rc; 1566 } 1567 1568 /* Move the cursor so that it points to an entry near pKey. 1569 ** Return a success code. 1570 ** 1571 ** If an exact match is not found, then the cursor is always 1572 ** left pointing at a leaf page which would hold the entry if it 1573 ** were present. The cursor might point to an entry that comes 1574 ** before or after the key. 1575 ** 1576 ** The result of comparing the key with the entry to which the 1577 ** cursor is left pointing is stored in pCur->iMatch. The same 1578 ** value is also written to *pRes if pRes!=NULL. The meaning of 1579 ** this value is as follows: 1580 ** 1581 ** *pRes<0 The cursor is left pointing at an entry that 1582 ** is smaller than pKey or if the table is empty 1583 ** and the cursor is therefore left point to nothing. 1584 ** 1585 ** *pRes==0 The cursor is left pointing at an entry that 1586 ** exactly matches pKey. 1587 ** 1588 ** *pRes>0 The cursor is left pointing at an entry that 1589 ** is larger than pKey. 1590 */ 1591 static 1592 int fileBtreeMoveto(BtCursor *pCur, const void *pKey, int nKey, int *pRes){ 1593 int rc; 1594 if( pCur->pPage==0 ) return SQLITE_ABORT; 1595 pCur->eSkip = SKIP_NONE; 1596 rc = moveToRoot(pCur); 1597 if( rc ) return rc; 1598 for(;;){ 1599 int lwr, upr; 1600 Pgno chldPg; 1601 MemPage *pPage = pCur->pPage; 1602 int c = -1; /* pRes return if table is empty must be -1 */ 1603 lwr = 0; 1604 upr = pPage->nCell-1; 1605 while( lwr<=upr ){ 1606 pCur->idx = (lwr+upr)/2; 1607 rc = fileBtreeKeyCompare(pCur, pKey, nKey, 0, &c); 1608 if( rc ) return rc; 1609 if( c==0 ){ 1610 pCur->iMatch = c; 1611 if( pRes ) *pRes = 0; 1612 return SQLITE_OK; 1613 } 1614 if( c<0 ){ 1615 lwr = pCur->idx+1; 1616 }else{ 1617 upr = pCur->idx-1; 1618 } 1619 } 1620 assert( lwr==upr+1 ); 1621 assert( pPage->isInit ); 1622 if( lwr>=pPage->nCell ){ 1623 chldPg = pPage->u.hdr.rightChild; 1624 }else{ 1625 chldPg = pPage->apCell[lwr]->h.leftChild; 1626 } 1627 if( chldPg==0 ){ 1628 pCur->iMatch = c; 1629 if( pRes ) *pRes = c; 1630 return SQLITE_OK; 1631 } 1632 pCur->idx = lwr; 1633 rc = moveToChild(pCur, chldPg); 1634 if( rc ) return rc; 1635 } 1636 /* NOT REACHED */ 1637 } 1638 1639 /* 1640 ** Advance the cursor to the next entry in the database. If 1641 ** successful then set *pRes=0. If the cursor 1642 ** was already pointing to the last entry in the database before 1643 ** this routine was called, then set *pRes=1. 1644 */ 1645 static int fileBtreeNext(BtCursor *pCur, int *pRes){ 1646 int rc; 1647 MemPage *pPage = pCur->pPage; 1648 assert( pRes!=0 ); 1649 if( pPage==0 ){ 1650 *pRes = 1; 1651 return SQLITE_ABORT; 1652 } 1653 assert( pPage->isInit ); 1654 assert( pCur->eSkip!=SKIP_INVALID ); 1655 if( pPage->nCell==0 ){ 1656 *pRes = 1; 1657 return SQLITE_OK; 1658 } 1659 assert( pCur->idx<pPage->nCell ); 1660 if( pCur->eSkip==SKIP_NEXT ){ 1661 pCur->eSkip = SKIP_NONE; 1662 *pRes = 0; 1663 return SQLITE_OK; 1664 } 1665 pCur->eSkip = SKIP_NONE; 1666 pCur->idx++; 1667 if( pCur->idx>=pPage->nCell ){ 1668 if( pPage->u.hdr.rightChild ){ 1669 rc = moveToChild(pCur, pPage->u.hdr.rightChild); 1670 if( rc ) return rc; 1671 rc = moveToLeftmost(pCur); 1672 *pRes = 0; 1673 return rc; 1674 } 1675 do{ 1676 if( pPage->pParent==0 ){ 1677 *pRes = 1; 1678 return SQLITE_OK; 1679 } 1680 moveToParent(pCur); 1681 pPage = pCur->pPage; 1682 }while( pCur->idx>=pPage->nCell ); 1683 *pRes = 0; 1684 return SQLITE_OK; 1685 } 1686 *pRes = 0; 1687 if( pPage->u.hdr.rightChild==0 ){ 1688 return SQLITE_OK; 1689 } 1690 rc = moveToLeftmost(pCur); 1691 return rc; 1692 } 1693 1694 /* 1695 ** Step the cursor to the back to the previous entry in the database. If 1696 ** successful then set *pRes=0. If the cursor 1697 ** was already pointing to the first entry in the database before 1698 ** this routine was called, then set *pRes=1. 1699 */ 1700 static int fileBtreePrevious(BtCursor *pCur, int *pRes){ 1701 int rc; 1702 Pgno pgno; 1703 MemPage *pPage; 1704 pPage = pCur->pPage; 1705 if( pPage==0 ){ 1706 *pRes = 1; 1707 return SQLITE_ABORT; 1708 } 1709 assert( pPage->isInit ); 1710 assert( pCur->eSkip!=SKIP_INVALID ); 1711 if( pPage->nCell==0 ){ 1712 *pRes = 1; 1713 return SQLITE_OK; 1714 } 1715 if( pCur->eSkip==SKIP_PREV ){ 1716 pCur->eSkip = SKIP_NONE; 1717 *pRes = 0; 1718 return SQLITE_OK; 1719 } 1720 pCur->eSkip = SKIP_NONE; 1721 assert( pCur->idx>=0 ); 1722 if( (pgno = pPage->apCell[pCur->idx]->h.leftChild)!=0 ){ 1723 rc = moveToChild(pCur, pgno); 1724 if( rc ) return rc; 1725 rc = moveToRightmost(pCur); 1726 }else{ 1727 while( pCur->idx==0 ){ 1728 if( pPage->pParent==0 ){ 1729 if( pRes ) *pRes = 1; 1730 return SQLITE_OK; 1731 } 1732 moveToParent(pCur); 1733 pPage = pCur->pPage; 1734 } 1735 pCur->idx--; 1736 rc = SQLITE_OK; 1737 } 1738 *pRes = 0; 1739 return rc; 1740 } 1741 1742 /* 1743 ** Allocate a new page from the database file. 1744 ** 1745 ** The new page is marked as dirty. (In other words, sqlitepager_write() 1746 ** has already been called on the new page.) The new page has also 1747 ** been referenced and the calling routine is responsible for calling 1748 ** sqlitepager_unref() on the new page when it is done. 1749 ** 1750 ** SQLITE_OK is returned on success. Any other return value indicates 1751 ** an error. *ppPage and *pPgno are undefined in the event of an error. 1752 ** Do not invoke sqlitepager_unref() on *ppPage if an error is returned. 1753 ** 1754 ** If the "nearby" parameter is not 0, then a (feeble) effort is made to 1755 ** locate a page close to the page number "nearby". This can be used in an 1756 ** attempt to keep related pages close to each other in the database file, 1757 ** which in turn can make database access faster. 1758 */ 1759 static int allocatePage(Btree *pBt, MemPage **ppPage, Pgno *pPgno, Pgno nearby){ 1760 PageOne *pPage1 = pBt->page1; 1761 int rc; 1762 if( pPage1->freeList ){ 1763 OverflowPage *pOvfl; 1764 FreelistInfo *pInfo; 1765 1766 rc = sqlitepager_write(pPage1); 1767 if( rc ) return rc; 1768 SWAB_ADD(pBt, pPage1->nFree, -1); 1769 rc = sqlitepager_get(pBt->pPager, SWAB32(pBt, pPage1->freeList), 1770 (void**)&pOvfl); 1771 if( rc ) return rc; 1772 rc = sqlitepager_write(pOvfl); 1773 if( rc ){ 1774 sqlitepager_unref(pOvfl); 1775 return rc; 1776 } 1777 pInfo = (FreelistInfo*)pOvfl->aPayload; 1778 if( pInfo->nFree==0 ){ 1779 *pPgno = SWAB32(pBt, pPage1->freeList); 1780 pPage1->freeList = pOvfl->iNext; 1781 *ppPage = (MemPage*)pOvfl; 1782 }else{ 1783 int closest, n; 1784 n = SWAB32(pBt, pInfo->nFree); 1785 if( n>1 && nearby>0 ){ 1786 int i, dist; 1787 closest = 0; 1788 dist = SWAB32(pBt, pInfo->aFree[0]) - nearby; 1789 if( dist<0 ) dist = -dist; 1790 for(i=1; i<n; i++){ 1791 int d2 = SWAB32(pBt, pInfo->aFree[i]) - nearby; 1792 if( d2<0 ) d2 = -d2; 1793 if( d2<dist ) closest = i; 1794 } 1795 }else{ 1796 closest = 0; 1797 } 1798 SWAB_ADD(pBt, pInfo->nFree, -1); 1799 *pPgno = SWAB32(pBt, pInfo->aFree[closest]); 1800 pInfo->aFree[closest] = pInfo->aFree[n-1]; 1801 rc = sqlitepager_get(pBt->pPager, *pPgno, (void**)ppPage); 1802 sqlitepager_unref(pOvfl); 1803 if( rc==SQLITE_OK ){ 1804 sqlitepager_dont_rollback(*ppPage); 1805 rc = sqlitepager_write(*ppPage); 1806 } 1807 } 1808 }else{ 1809 *pPgno = sqlitepager_pagecount(pBt->pPager) + 1; 1810 rc = sqlitepager_get(pBt->pPager, *pPgno, (void**)ppPage); 1811 if( rc ) return rc; 1812 rc = sqlitepager_write(*ppPage); 1813 } 1814 return rc; 1815 } 1816 1817 /* 1818 ** Add a page of the database file to the freelist. Either pgno or 1819 ** pPage but not both may be 0. 1820 ** 1821 ** sqlitepager_unref() is NOT called for pPage. 1822 */ 1823 static int freePage(Btree *pBt, void *pPage, Pgno pgno){ 1824 PageOne *pPage1 = pBt->page1; 1825 OverflowPage *pOvfl = (OverflowPage*)pPage; 1826 int rc; 1827 int needUnref = 0; 1828 MemPage *pMemPage; 1829 1830 if( pgno==0 ){ 1831 assert( pOvfl!=0 ); 1832 pgno = sqlitepager_pagenumber(pOvfl); 1833 } 1834 assert( pgno>2 ); 1835 assert( sqlitepager_pagenumber(pOvfl)==pgno ); 1836 pMemPage = (MemPage*)pPage; 1837 pMemPage->isInit = 0; 1838 if( pMemPage->pParent ){ 1839 sqlitepager_unref(pMemPage->pParent); 1840 pMemPage->pParent = 0; 1841 } 1842 rc = sqlitepager_write(pPage1); 1843 if( rc ){ 1844 return rc; 1845 } 1846 SWAB_ADD(pBt, pPage1->nFree, 1); 1847 if( pPage1->nFree!=0 && pPage1->freeList!=0 ){ 1848 OverflowPage *pFreeIdx; 1849 rc = sqlitepager_get(pBt->pPager, SWAB32(pBt, pPage1->freeList), 1850 (void**)&pFreeIdx); 1851 if( rc==SQLITE_OK ){ 1852 FreelistInfo *pInfo = (FreelistInfo*)pFreeIdx->aPayload; 1853 int n = SWAB32(pBt, pInfo->nFree); 1854 if( n<(sizeof(pInfo->aFree)/sizeof(pInfo->aFree[0])) ){ 1855 rc = sqlitepager_write(pFreeIdx); 1856 if( rc==SQLITE_OK ){ 1857 pInfo->aFree[n] = SWAB32(pBt, pgno); 1858 SWAB_ADD(pBt, pInfo->nFree, 1); 1859 sqlitepager_unref(pFreeIdx); 1860 sqlitepager_dont_write(pBt->pPager, pgno); 1861 return rc; 1862 } 1863 } 1864 sqlitepager_unref(pFreeIdx); 1865 } 1866 } 1867 if( pOvfl==0 ){ 1868 assert( pgno>0 ); 1869 rc = sqlitepager_get(pBt->pPager, pgno, (void**)&pOvfl); 1870 if( rc ) return rc; 1871 needUnref = 1; 1872 } 1873 rc = sqlitepager_write(pOvfl); 1874 if( rc ){ 1875 if( needUnref ) sqlitepager_unref(pOvfl); 1876 return rc; 1877 } 1878 pOvfl->iNext = pPage1->freeList; 1879 pPage1->freeList = SWAB32(pBt, pgno); 1880 memset(pOvfl->aPayload, 0, OVERFLOW_SIZE); 1881 if( needUnref ) rc = sqlitepager_unref(pOvfl); 1882 return rc; 1883 } 1884 1885 /* 1886 ** Erase all the data out of a cell. This involves returning overflow 1887 ** pages back the freelist. 1888 */ 1889 static int clearCell(Btree *pBt, Cell *pCell){ 1890 Pager *pPager = pBt->pPager; 1891 OverflowPage *pOvfl; 1892 Pgno ovfl, nextOvfl; 1893 int rc; 1894 1895 if( NKEY(pBt, pCell->h) + NDATA(pBt, pCell->h) <= MX_LOCAL_PAYLOAD ){ 1896 return SQLITE_OK; 1897 } 1898 ovfl = SWAB32(pBt, pCell->ovfl); 1899 pCell->ovfl = 0; 1900 while( ovfl ){ 1901 rc = sqlitepager_get(pPager, ovfl, (void**)&pOvfl); 1902 if( rc ) return rc; 1903 nextOvfl = SWAB32(pBt, pOvfl->iNext); 1904 rc = freePage(pBt, pOvfl, ovfl); 1905 if( rc ) return rc; 1906 sqlitepager_unref(pOvfl); 1907 ovfl = nextOvfl; 1908 } 1909 return SQLITE_OK; 1910 } 1911 1912 /* 1913 ** Create a new cell from key and data. Overflow pages are allocated as 1914 ** necessary and linked to this cell. 1915 */ 1916 static int fillInCell( 1917 Btree *pBt, /* The whole Btree. Needed to allocate pages */ 1918 Cell *pCell, /* Populate this Cell structure */ 1919 const void *pKey, int nKey, /* The key */ 1920 const void *pData,int nData /* The data */ 1921 ){ 1922 OverflowPage *pOvfl, *pPrior; 1923 Pgno *pNext; 1924 int spaceLeft; 1925 int n, rc; 1926 int nPayload; 1927 const char *pPayload; 1928 char *pSpace; 1929 Pgno nearby = 0; 1930 1931 pCell->h.leftChild = 0; 1932 pCell->h.nKey = SWAB16(pBt, nKey & 0xffff); 1933 pCell->h.nKeyHi = nKey >> 16; 1934 pCell->h.nData = SWAB16(pBt, nData & 0xffff); 1935 pCell->h.nDataHi = nData >> 16; 1936 pCell->h.iNext = 0; 1937 1938 pNext = &pCell->ovfl; 1939 pSpace = pCell->aPayload; 1940 spaceLeft = MX_LOCAL_PAYLOAD; 1941 pPayload = pKey; 1942 pKey = 0; 1943 nPayload = nKey; 1944 pPrior = 0; 1945 while( nPayload>0 ){ 1946 if( spaceLeft==0 ){ 1947 rc = allocatePage(pBt, (MemPage**)&pOvfl, pNext, nearby); 1948 if( rc ){ 1949 *pNext = 0; 1950 }else{ 1951 nearby = *pNext; 1952 } 1953 if( pPrior ) sqlitepager_unref(pPrior); 1954 if( rc ){ 1955 clearCell(pBt, pCell); 1956 return rc; 1957 } 1958 if( pBt->needSwab ) *pNext = swab32(*pNext); 1959 pPrior = pOvfl; 1960 spaceLeft = OVERFLOW_SIZE; 1961 pSpace = pOvfl->aPayload; 1962 pNext = &pOvfl->iNext; 1963 } 1964 n = nPayload; 1965 if( n>spaceLeft ) n = spaceLeft; 1966 memcpy(pSpace, pPayload, n); 1967 nPayload -= n; 1968 if( nPayload==0 && pData ){ 1969 pPayload = pData; 1970 nPayload = nData; 1971 pData = 0; 1972 }else{ 1973 pPayload += n; 1974 } 1975 spaceLeft -= n; 1976 pSpace += n; 1977 } 1978 *pNext = 0; 1979 if( pPrior ){ 1980 sqlitepager_unref(pPrior); 1981 } 1982 return SQLITE_OK; 1983 } 1984 1985 /* 1986 ** Change the MemPage.pParent pointer on the page whose number is 1987 ** given in the second argument so that MemPage.pParent holds the 1988 ** pointer in the third argument. 1989 */ 1990 static void reparentPage(Pager *pPager, Pgno pgno, MemPage *pNewParent,int idx){ 1991 MemPage *pThis; 1992 1993 if( pgno==0 ) return; 1994 assert( pPager!=0 ); 1995 pThis = sqlitepager_lookup(pPager, pgno); 1996 if( pThis && pThis->isInit ){ 1997 if( pThis->pParent!=pNewParent ){ 1998 if( pThis->pParent ) sqlitepager_unref(pThis->pParent); 1999 pThis->pParent = pNewParent; 2000 if( pNewParent ) sqlitepager_ref(pNewParent); 2001 } 2002 pThis->idxParent = idx; 2003 sqlitepager_unref(pThis); 2004 } 2005 } 2006 2007 /* 2008 ** Reparent all children of the given page to be the given page. 2009 ** In other words, for every child of pPage, invoke reparentPage() 2010 ** to make sure that each child knows that pPage is its parent. 2011 ** 2012 ** This routine gets called after you memcpy() one page into 2013 ** another. 2014 */ 2015 static void reparentChildPages(Btree *pBt, MemPage *pPage){ 2016 int i; 2017 Pager *pPager = pBt->pPager; 2018 for(i=0; i<pPage->nCell; i++){ 2019 reparentPage(pPager, SWAB32(pBt, pPage->apCell[i]->h.leftChild), pPage, i); 2020 } 2021 reparentPage(pPager, SWAB32(pBt, pPage->u.hdr.rightChild), pPage, i); 2022 pPage->idxShift = 0; 2023 } 2024 2025 /* 2026 ** Remove the i-th cell from pPage. This routine effects pPage only. 2027 ** The cell content is not freed or deallocated. It is assumed that 2028 ** the cell content has been copied someplace else. This routine just 2029 ** removes the reference to the cell from pPage. 2030 ** 2031 ** "sz" must be the number of bytes in the cell. 2032 ** 2033 ** Do not bother maintaining the integrity of the linked list of Cells. 2034 ** Only the pPage->apCell[] array is important. The relinkCellList() 2035 ** routine will be called soon after this routine in order to rebuild 2036 ** the linked list. 2037 */ 2038 static void dropCell(Btree *pBt, MemPage *pPage, int idx, int sz){ 2039 int j; 2040 assert( idx>=0 && idx<pPage->nCell ); 2041 assert( sz==cellSize(pBt, pPage->apCell[idx]) ); 2042 assert( sqlitepager_iswriteable(pPage) ); 2043 freeSpace(pBt, pPage, Addr(pPage->apCell[idx]) - Addr(pPage), sz); 2044 for(j=idx; j<pPage->nCell-1; j++){ 2045 pPage->apCell[j] = pPage->apCell[j+1]; 2046 } 2047 pPage->nCell--; 2048 pPage->idxShift = 1; 2049 } 2050 2051 /* 2052 ** Insert a new cell on pPage at cell index "i". pCell points to the 2053 ** content of the cell. 2054 ** 2055 ** If the cell content will fit on the page, then put it there. If it 2056 ** will not fit, then just make pPage->apCell[i] point to the content 2057 ** and set pPage->isOverfull. 2058 ** 2059 ** Do not bother maintaining the integrity of the linked list of Cells. 2060 ** Only the pPage->apCell[] array is important. The relinkCellList() 2061 ** routine will be called soon after this routine in order to rebuild 2062 ** the linked list. 2063 */ 2064 static void insertCell(Btree *pBt, MemPage *pPage, int i, Cell *pCell, int sz){ 2065 int idx, j; 2066 assert( i>=0 && i<=pPage->nCell ); 2067 assert( sz==cellSize(pBt, pCell) ); 2068 assert( sqlitepager_iswriteable(pPage) ); 2069 idx = allocateSpace(pBt, pPage, sz); 2070 for(j=pPage->nCell; j>i; j--){ 2071 pPage->apCell[j] = pPage->apCell[j-1]; 2072 } 2073 pPage->nCell++; 2074 if( idx<=0 ){ 2075 pPage->isOverfull = 1; 2076 pPage->apCell[i] = pCell; 2077 }else{ 2078 memcpy(&pPage->u.aDisk[idx], pCell, sz); 2079 pPage->apCell[i] = (Cell*)&pPage->u.aDisk[idx]; 2080 } 2081 pPage->idxShift = 1; 2082 } 2083 2084 /* 2085 ** Rebuild the linked list of cells on a page so that the cells 2086 ** occur in the order specified by the pPage->apCell[] array. 2087 ** Invoke this routine once to repair damage after one or more 2088 ** invocations of either insertCell() or dropCell(). 2089 */ 2090 static void relinkCellList(Btree *pBt, MemPage *pPage){ 2091 int i; 2092 u16 *pIdx; 2093 assert( sqlitepager_iswriteable(pPage) ); 2094 pIdx = &pPage->u.hdr.firstCell; 2095 for(i=0; i<pPage->nCell; i++){ 2096 int idx = Addr(pPage->apCell[i]) - Addr(pPage); 2097 assert( idx>0 && idx<SQLITE_USABLE_SIZE ); 2098 *pIdx = SWAB16(pBt, idx); 2099 pIdx = &pPage->apCell[i]->h.iNext; 2100 } 2101 *pIdx = 0; 2102 } 2103 2104 /* 2105 ** Make a copy of the contents of pFrom into pTo. The pFrom->apCell[] 2106 ** pointers that point into pFrom->u.aDisk[] must be adjusted to point 2107 ** into pTo->u.aDisk[] instead. But some pFrom->apCell[] entries might 2108 ** not point to pFrom->u.aDisk[]. Those are unchanged. 2109 */ 2110 static void copyPage(MemPage *pTo, MemPage *pFrom){ 2111 uptr from, to; 2112 int i; 2113 memcpy(pTo->u.aDisk, pFrom->u.aDisk, SQLITE_USABLE_SIZE); 2114 pTo->pParent = 0; 2115 pTo->isInit = 1; 2116 pTo->nCell = pFrom->nCell; 2117 pTo->nFree = pFrom->nFree; 2118 pTo->isOverfull = pFrom->isOverfull; 2119 to = Addr(pTo); 2120 from = Addr(pFrom); 2121 for(i=0; i<pTo->nCell; i++){ 2122 uptr x = Addr(pFrom->apCell[i]); 2123 if( x>from && x<from+SQLITE_USABLE_SIZE ){ 2124 *((uptr*)&pTo->apCell[i]) = x + to - from; 2125 }else{ 2126 pTo->apCell[i] = pFrom->apCell[i]; 2127 } 2128 } 2129 } 2130 2131 /* 2132 ** The following parameters determine how many adjacent pages get involved 2133 ** in a balancing operation. NN is the number of neighbors on either side 2134 ** of the page that participate in the balancing operation. NB is the 2135 ** total number of pages that participate, including the target page and 2136 ** NN neighbors on either side. 2137 ** 2138 ** The minimum value of NN is 1 (of course). Increasing NN above 1 2139 ** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance 2140 ** in exchange for a larger degradation in INSERT and UPDATE performance. 2141 ** The value of NN appears to give the best results overall. 2142 */ 2143 #define NN 1 /* Number of neighbors on either side of pPage */ 2144 #define NB (NN*2+1) /* Total pages involved in the balance */ 2145 2146 /* 2147 ** This routine redistributes Cells on pPage and up to two siblings 2148 ** of pPage so that all pages have about the same amount of free space. 2149 ** Usually one sibling on either side of pPage is used in the balancing, 2150 ** though both siblings might come from one side if pPage is the first 2151 ** or last child of its parent. If pPage has fewer than two siblings 2152 ** (something which can only happen if pPage is the root page or a 2153 ** child of root) then all available siblings participate in the balancing. 2154 ** 2155 ** The number of siblings of pPage might be increased or decreased by 2156 ** one in an effort to keep pages between 66% and 100% full. The root page 2157 ** is special and is allowed to be less than 66% full. If pPage is 2158 ** the root page, then the depth of the tree might be increased 2159 ** or decreased by one, as necessary, to keep the root page from being 2160 ** overfull or empty. 2161 ** 2162 ** This routine calls relinkCellList() on its input page regardless of 2163 ** whether or not it does any real balancing. Client routines will typically 2164 ** invoke insertCell() or dropCell() before calling this routine, so we 2165 ** need to call relinkCellList() to clean up the mess that those other 2166 ** routines left behind. 2167 ** 2168 ** pCur is left pointing to the same cell as when this routine was called 2169 ** even if that cell gets moved to a different page. pCur may be NULL. 2170 ** Set the pCur parameter to NULL if you do not care about keeping track 2171 ** of a cell as that will save this routine the work of keeping track of it. 2172 ** 2173 ** Note that when this routine is called, some of the Cells on pPage 2174 ** might not actually be stored in pPage->u.aDisk[]. This can happen 2175 ** if the page is overfull. Part of the job of this routine is to 2176 ** make sure all Cells for pPage once again fit in pPage->u.aDisk[]. 2177 ** 2178 ** In the course of balancing the siblings of pPage, the parent of pPage 2179 ** might become overfull or underfull. If that happens, then this routine 2180 ** is called recursively on the parent. 2181 ** 2182 ** If this routine fails for any reason, it might leave the database 2183 ** in a corrupted state. So if this routine fails, the database should 2184 ** be rolled back. 2185 */ 2186 static int balance(Btree *pBt, MemPage *pPage, BtCursor *pCur){ 2187 MemPage *pParent; /* The parent of pPage */ 2188 int nCell; /* Number of cells in apCell[] */ 2189 int nOld; /* Number of pages in apOld[] */ 2190 int nNew; /* Number of pages in apNew[] */ 2191 int nDiv; /* Number of cells in apDiv[] */ 2192 int i, j, k; /* Loop counters */ 2193 int idx; /* Index of pPage in pParent->apCell[] */ 2194 int nxDiv; /* Next divider slot in pParent->apCell[] */ 2195 int rc; /* The return code */ 2196 int iCur; /* apCell[iCur] is the cell of the cursor */ 2197 MemPage *pOldCurPage; /* The cursor originally points to this page */ 2198 int subtotal; /* Subtotal of bytes in cells on one page */ 2199 MemPage *extraUnref = 0; /* A page that needs to be unref-ed */ 2200 MemPage *apOld[NB]; /* pPage and up to two siblings */ 2201 Pgno pgnoOld[NB]; /* Page numbers for each page in apOld[] */ 2202 MemPage *apNew[NB+1]; /* pPage and up to NB siblings after balancing */ 2203 Pgno pgnoNew[NB+1]; /* Page numbers for each page in apNew[] */ 2204 int idxDiv[NB]; /* Indices of divider cells in pParent */ 2205 Cell *apDiv[NB]; /* Divider cells in pParent */ 2206 Cell aTemp[NB]; /* Temporary holding area for apDiv[] */ 2207 int cntNew[NB+1]; /* Index in apCell[] of cell after i-th page */ 2208 int szNew[NB+1]; /* Combined size of cells place on i-th page */ 2209 MemPage aOld[NB]; /* Temporary copies of pPage and its siblings */ 2210 Cell *apCell[(MX_CELL+2)*NB]; /* All cells from pages being balanced */ 2211 int szCell[(MX_CELL+2)*NB]; /* Local size of all cells */ 2212 2213 /* 2214 ** Return without doing any work if pPage is neither overfull nor 2215 ** underfull. 2216 */ 2217 assert( sqlitepager_iswriteable(pPage) ); 2218 if( !pPage->isOverfull && pPage->nFree<SQLITE_USABLE_SIZE/2 2219 && pPage->nCell>=2){ 2220 relinkCellList(pBt, pPage); 2221 return SQLITE_OK; 2222 } 2223 2224 /* 2225 ** Find the parent of the page to be balanceed. 2226 ** If there is no parent, it means this page is the root page and 2227 ** special rules apply. 2228 */ 2229 pParent = pPage->pParent; 2230 if( pParent==0 ){ 2231 Pgno pgnoChild; 2232 MemPage *pChild; 2233 assert( pPage->isInit ); 2234 if( pPage->nCell==0 ){ 2235 if( pPage->u.hdr.rightChild ){ 2236 /* 2237 ** The root page is empty. Copy the one child page 2238 ** into the root page and return. This reduces the depth 2239 ** of the BTree by one. 2240 */ 2241 pgnoChild = SWAB32(pBt, pPage->u.hdr.rightChild); 2242 rc = sqlitepager_get(pBt->pPager, pgnoChild, (void**)&pChild); 2243 if( rc ) return rc; 2244 memcpy(pPage, pChild, SQLITE_USABLE_SIZE); 2245 pPage->isInit = 0; 2246 rc = initPage(pBt, pPage, sqlitepager_pagenumber(pPage), 0); 2247 assert( rc==SQLITE_OK ); 2248 reparentChildPages(pBt, pPage); 2249 if( pCur && pCur->pPage==pChild ){ 2250 sqlitepager_unref(pChild); 2251 pCur->pPage = pPage; 2252 sqlitepager_ref(pPage); 2253 } 2254 freePage(pBt, pChild, pgnoChild); 2255 sqlitepager_unref(pChild); 2256 }else{ 2257 relinkCellList(pBt, pPage); 2258 } 2259 return SQLITE_OK; 2260 } 2261 if( !pPage->isOverfull ){ 2262 /* It is OK for the root page to be less than half full. 2263 */ 2264 relinkCellList(pBt, pPage); 2265 return SQLITE_OK; 2266 } 2267 /* 2268 ** If we get to here, it means the root page is overfull. 2269 ** When this happens, Create a new child page and copy the 2270 ** contents of the root into the child. Then make the root 2271 ** page an empty page with rightChild pointing to the new 2272 ** child. Then fall thru to the code below which will cause 2273 ** the overfull child page to be split. 2274 */ 2275 rc = sqlitepager_write(pPage); 2276 if( rc ) return rc; 2277 rc = allocatePage(pBt, &pChild, &pgnoChild, sqlitepager_pagenumber(pPage)); 2278 if( rc ) return rc; 2279 assert( sqlitepager_iswriteable(pChild) ); 2280 copyPage(pChild, pPage); 2281 pChild->pParent = pPage; 2282 pChild->idxParent = 0; 2283 sqlitepager_ref(pPage); 2284 pChild->isOverfull = 1; 2285 if( pCur && pCur->pPage==pPage ){ 2286 sqlitepager_unref(pPage); 2287 pCur->pPage = pChild; 2288 }else{ 2289 extraUnref = pChild; 2290 } 2291 zeroPage(pBt, pPage); 2292 pPage->u.hdr.rightChild = SWAB32(pBt, pgnoChild); 2293 pParent = pPage; 2294 pPage = pChild; 2295 } 2296 rc = sqlitepager_write(pParent); 2297 if( rc ) return rc; 2298 assert( pParent->isInit ); 2299 2300 /* 2301 ** Find the Cell in the parent page whose h.leftChild points back 2302 ** to pPage. The "idx" variable is the index of that cell. If pPage 2303 ** is the rightmost child of pParent then set idx to pParent->nCell 2304 */ 2305 if( pParent->idxShift ){ 2306 Pgno pgno, swabPgno; 2307 pgno = sqlitepager_pagenumber(pPage); 2308 swabPgno = SWAB32(pBt, pgno); 2309 for(idx=0; idx<pParent->nCell; idx++){ 2310 if( pParent->apCell[idx]->h.leftChild==swabPgno ){ 2311 break; 2312 } 2313 } 2314 assert( idx<pParent->nCell || pParent->u.hdr.rightChild==swabPgno ); 2315 }else{ 2316 idx = pPage->idxParent; 2317 } 2318 2319 /* 2320 ** Initialize variables so that it will be safe to jump 2321 ** directly to balance_cleanup at any moment. 2322 */ 2323 nOld = nNew = 0; 2324 sqlitepager_ref(pParent); 2325 2326 /* 2327 ** Find sibling pages to pPage and the Cells in pParent that divide 2328 ** the siblings. An attempt is made to find NN siblings on either 2329 ** side of pPage. More siblings are taken from one side, however, if 2330 ** pPage there are fewer than NN siblings on the other side. If pParent 2331 ** has NB or fewer children then all children of pParent are taken. 2332 */ 2333 nxDiv = idx - NN; 2334 if( nxDiv + NB > pParent->nCell ){ 2335 nxDiv = pParent->nCell - NB + 1; 2336 } 2337 if( nxDiv<0 ){ 2338 nxDiv = 0; 2339 } 2340 nDiv = 0; 2341 for(i=0, k=nxDiv; i<NB; i++, k++){ 2342 if( k<pParent->nCell ){ 2343 idxDiv[i] = k; 2344 apDiv[i] = pParent->apCell[k]; 2345 nDiv++; 2346 pgnoOld[i] = SWAB32(pBt, apDiv[i]->h.leftChild); 2347 }else if( k==pParent->nCell ){ 2348 pgnoOld[i] = SWAB32(pBt, pParent->u.hdr.rightChild); 2349 }else{ 2350 break; 2351 } 2352 rc = sqlitepager_get(pBt->pPager, pgnoOld[i], (void**)&apOld[i]); 2353 if( rc ) goto balance_cleanup; 2354 rc = initPage(pBt, apOld[i], pgnoOld[i], pParent); 2355 if( rc ) goto balance_cleanup; 2356 apOld[i]->idxParent = k; 2357 nOld++; 2358 } 2359 2360 /* 2361 ** Set iCur to be the index in apCell[] of the cell that the cursor 2362 ** is pointing to. We will need this later on in order to keep the 2363 ** cursor pointing at the same cell. If pCur points to a page that 2364 ** has no involvement with this rebalancing, then set iCur to a large 2365 ** number so that the iCur==j tests always fail in the main cell 2366 ** distribution loop below. 2367 */ 2368 if( pCur ){ 2369 iCur = 0; 2370 for(i=0; i<nOld; i++){ 2371 if( pCur->pPage==apOld[i] ){ 2372 iCur += pCur->idx; 2373 break; 2374 } 2375 iCur += apOld[i]->nCell; 2376 if( i<nOld-1 && pCur->pPage==pParent && pCur->idx==idxDiv[i] ){ 2377 break; 2378 } 2379 iCur++; 2380 } 2381 pOldCurPage = pCur->pPage; 2382 } 2383 2384 /* 2385 ** Make copies of the content of pPage and its siblings into aOld[]. 2386 ** The rest of this function will use data from the copies rather 2387 ** that the original pages since the original pages will be in the 2388 ** process of being overwritten. 2389 */ 2390 for(i=0; i<nOld; i++){ 2391 copyPage(&aOld[i], apOld[i]); 2392 } 2393 2394 /* 2395 ** Load pointers to all cells on sibling pages and the divider cells 2396 ** into the local apCell[] array. Make copies of the divider cells 2397 ** into aTemp[] and remove the the divider Cells from pParent. 2398 */ 2399 nCell = 0; 2400 for(i=0; i<nOld; i++){ 2401 MemPage *pOld = &aOld[i]; 2402 for(j=0; j<pOld->nCell; j++){ 2403 apCell[nCell] = pOld->apCell[j]; 2404 szCell[nCell] = cellSize(pBt, apCell[nCell]); 2405 nCell++; 2406 } 2407 if( i<nOld-1 ){ 2408 szCell[nCell] = cellSize(pBt, apDiv[i]); 2409 memcpy(&aTemp[i], apDiv[i], szCell[nCell]); 2410 apCell[nCell] = &aTemp[i]; 2411 dropCell(pBt, pParent, nxDiv, szCell[nCell]); 2412 assert( SWAB32(pBt, apCell[nCell]->h.leftChild)==pgnoOld[i] ); 2413 apCell[nCell]->h.leftChild = pOld->u.hdr.rightChild; 2414 nCell++; 2415 } 2416 } 2417 2418 /* 2419 ** Figure out the number of pages needed to hold all nCell cells. 2420 ** Store this number in "k". Also compute szNew[] which is the total 2421 ** size of all cells on the i-th page and cntNew[] which is the index 2422 ** in apCell[] of the cell that divides path i from path i+1. 2423 ** cntNew[k] should equal nCell. 2424 ** 2425 ** This little patch of code is critical for keeping the tree 2426 ** balanced. 2427 */ 2428 for(subtotal=k=i=0; i<nCell; i++){ 2429 subtotal += szCell[i]; 2430 if( subtotal > USABLE_SPACE ){ 2431 szNew[k] = subtotal - szCell[i]; 2432 cntNew[k] = i; 2433 subtotal = 0; 2434 k++; 2435 } 2436 } 2437 szNew[k] = subtotal; 2438 cntNew[k] = nCell; 2439 k++; 2440 for(i=k-1; i>0; i--){ 2441 while( szNew[i]<USABLE_SPACE/2 ){ 2442 cntNew[i-1]--; 2443 assert( cntNew[i-1]>0 ); 2444 szNew[i] += szCell[cntNew[i-1]]; 2445 szNew[i-1] -= szCell[cntNew[i-1]-1]; 2446 } 2447 } 2448 assert( cntNew[0]>0 ); 2449 2450 /* 2451 ** Allocate k new pages. Reuse old pages where possible. 2452 */ 2453 for(i=0; i<k; i++){ 2454 if( i<nOld ){ 2455 apNew[i] = apOld[i]; 2456 pgnoNew[i] = pgnoOld[i]; 2457 apOld[i] = 0; 2458 rc = sqlitepager_write(apNew[i]); 2459 if( rc ) goto balance_cleanup; 2460 }else{ 2461 rc = allocatePage(pBt, &apNew[i], &pgnoNew[i], pgnoNew[i-1]); 2462 if( rc ) goto balance_cleanup; 2463 } 2464 nNew++; 2465 zeroPage(pBt, apNew[i]); 2466 apNew[i]->isInit = 1; 2467 } 2468 2469 /* Free any old pages that were not reused as new pages. 2470 */ 2471 while( i<nOld ){ 2472 rc = freePage(pBt, apOld[i], pgnoOld[i]); 2473 if( rc ) goto balance_cleanup; 2474 sqlitepager_unref(apOld[i]); 2475 apOld[i] = 0; 2476 i++; 2477 } 2478 2479 /* 2480 ** Put the new pages in accending order. This helps to 2481 ** keep entries in the disk file in order so that a scan 2482 ** of the table is a linear scan through the file. That 2483 ** in turn helps the operating system to deliver pages 2484 ** from the disk more rapidly. 2485 ** 2486 ** An O(n^2) insertion sort algorithm is used, but since 2487 ** n is never more than NB (a small constant), that should 2488 ** not be a problem. 2489 ** 2490 ** When NB==3, this one optimization makes the database 2491 ** about 25% faster for large insertions and deletions. 2492 */ 2493 for(i=0; i<k-1; i++){ 2494 int minV = pgnoNew[i]; 2495 int minI = i; 2496 for(j=i+1; j<k; j++){ 2497 if( pgnoNew[j]<(unsigned)minV ){ 2498 minI = j; 2499 minV = pgnoNew[j]; 2500 } 2501 } 2502 if( minI>i ){ 2503 int t; 2504 MemPage *pT; 2505 t = pgnoNew[i]; 2506 pT = apNew[i]; 2507 pgnoNew[i] = pgnoNew[minI]; 2508 apNew[i] = apNew[minI]; 2509 pgnoNew[minI] = t; 2510 apNew[minI] = pT; 2511 } 2512 } 2513 2514 /* 2515 ** Evenly distribute the data in apCell[] across the new pages. 2516 ** Insert divider cells into pParent as necessary. 2517 */ 2518 j = 0; 2519 for(i=0; i<nNew; i++){ 2520 MemPage *pNew = apNew[i]; 2521 while( j<cntNew[i] ){ 2522 assert( pNew->nFree>=szCell[j] ); 2523 if( pCur && iCur==j ){ pCur->pPage = pNew; pCur->idx = pNew->nCell; } 2524 insertCell(pBt, pNew, pNew->nCell, apCell[j], szCell[j]); 2525 j++; 2526 } 2527 assert( pNew->nCell>0 ); 2528 assert( !pNew->isOverfull ); 2529 relinkCellList(pBt, pNew); 2530 if( i<nNew-1 && j<nCell ){ 2531 pNew->u.hdr.rightChild = apCell[j]->h.leftChild; 2532 apCell[j]->h.leftChild = SWAB32(pBt, pgnoNew[i]); 2533 if( pCur && iCur==j ){ pCur->pPage = pParent; pCur->idx = nxDiv; } 2534 insertCell(pBt, pParent, nxDiv, apCell[j], szCell[j]); 2535 j++; 2536 nxDiv++; 2537 } 2538 } 2539 assert( j==nCell ); 2540 apNew[nNew-1]->u.hdr.rightChild = aOld[nOld-1].u.hdr.rightChild; 2541 if( nxDiv==pParent->nCell ){ 2542 pParent->u.hdr.rightChild = SWAB32(pBt, pgnoNew[nNew-1]); 2543 }else{ 2544 pParent->apCell[nxDiv]->h.leftChild = SWAB32(pBt, pgnoNew[nNew-1]); 2545 } 2546 if( pCur ){ 2547 if( j<=iCur && pCur->pPage==pParent && pCur->idx>idxDiv[nOld-1] ){ 2548 assert( pCur->pPage==pOldCurPage ); 2549 pCur->idx += nNew - nOld; 2550 }else{ 2551 assert( pOldCurPage!=0 ); 2552 sqlitepager_ref(pCur->pPage); 2553 sqlitepager_unref(pOldCurPage); 2554 } 2555 } 2556 2557 /* 2558 ** Reparent children of all cells. 2559 */ 2560 for(i=0; i<nNew; i++){ 2561 reparentChildPages(pBt, apNew[i]); 2562 } 2563 reparentChildPages(pBt, pParent); 2564 2565 /* 2566 ** balance the parent page. 2567 */ 2568 rc = balance(pBt, pParent, pCur); 2569 2570 /* 2571 ** Cleanup before returning. 2572 */ 2573 balance_cleanup: 2574 if( extraUnref ){ 2575 sqlitepager_unref(extraUnref); 2576 } 2577 for(i=0; i<nOld; i++){ 2578 if( apOld[i]!=0 && apOld[i]!=&aOld[i] ) sqlitepager_unref(apOld[i]); 2579 } 2580 for(i=0; i<nNew; i++){ 2581 sqlitepager_unref(apNew[i]); 2582 } 2583 if( pCur && pCur->pPage==0 ){ 2584 pCur->pPage = pParent; 2585 pCur->idx = 0; 2586 }else{ 2587 sqlitepager_unref(pParent); 2588 } 2589 return rc; 2590 } 2591 2592 /* 2593 ** This routine checks all cursors that point to the same table 2594 ** as pCur points to. If any of those cursors were opened with 2595 ** wrFlag==0 then this routine returns SQLITE_LOCKED. If all 2596 ** cursors point to the same table were opened with wrFlag==1 2597 ** then this routine returns SQLITE_OK. 2598 ** 2599 ** In addition to checking for read-locks (where a read-lock 2600 ** means a cursor opened with wrFlag==0) this routine also moves 2601 ** all cursors other than pCur so that they are pointing to the 2602 ** first Cell on root page. This is necessary because an insert 2603 ** or delete might change the number of cells on a page or delete 2604 ** a page entirely and we do not want to leave any cursors 2605 ** pointing to non-existant pages or cells. 2606 */ 2607 static int checkReadLocks(BtCursor *pCur){ 2608 BtCursor *p; 2609 assert( pCur->wrFlag ); 2610 for(p=pCur->pShared; p!=pCur; p=p->pShared){ 2611 assert( p ); 2612 assert( p->pgnoRoot==pCur->pgnoRoot ); 2613 if( p->wrFlag==0 ) return SQLITE_LOCKED; 2614 if( sqlitepager_pagenumber(p->pPage)!=p->pgnoRoot ){ 2615 moveToRoot(p); 2616 } 2617 } 2618 return SQLITE_OK; 2619 } 2620 2621 /* 2622 ** Insert a new record into the BTree. The key is given by (pKey,nKey) 2623 ** and the data is given by (pData,nData). The cursor is used only to 2624 ** define what database the record should be inserted into. The cursor 2625 ** is left pointing at the new record. 2626 */ 2627 static int fileBtreeInsert( 2628 BtCursor *pCur, /* Insert data into the table of this cursor */ 2629 const void *pKey, int nKey, /* The key of the new record */ 2630 const void *pData, int nData /* The data of the new record */ 2631 ){ 2632 Cell newCell; 2633 int rc; 2634 int loc; 2635 int szNew; 2636 MemPage *pPage; 2637 Btree *pBt = pCur->pBt; 2638 2639 if( pCur->pPage==0 ){ 2640 return SQLITE_ABORT; /* A rollback destroyed this cursor */ 2641 } 2642 if( !pBt->inTrans || nKey+nData==0 ){ 2643 /* Must start a transaction before doing an insert */ 2644 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR; 2645 } 2646 assert( !pBt->readOnly ); 2647 if( !pCur->wrFlag ){ 2648 return SQLITE_PERM; /* Cursor not open for writing */ 2649 } 2650 if( checkReadLocks(pCur) ){ 2651 return SQLITE_LOCKED; /* The table pCur points to has a read lock */ 2652 } 2653 rc = fileBtreeMoveto(pCur, pKey, nKey, &loc); 2654 if( rc ) return rc; 2655 pPage = pCur->pPage; 2656 assert( pPage->isInit ); 2657 rc = sqlitepager_write(pPage); 2658 if( rc ) return rc; 2659 rc = fillInCell(pBt, &newCell, pKey, nKey, pData, nData); 2660 if( rc ) return rc; 2661 szNew = cellSize(pBt, &newCell); 2662 if( loc==0 ){ 2663 newCell.h.leftChild = pPage->apCell[pCur->idx]->h.leftChild; 2664 rc = clearCell(pBt, pPage->apCell[pCur->idx]); 2665 if( rc ) return rc; 2666 dropCell(pBt, pPage, pCur->idx, cellSize(pBt, pPage->apCell[pCur->idx])); 2667 }else if( loc<0 && pPage->nCell>0 ){ 2668 assert( pPage->u.hdr.rightChild==0 ); /* Must be a leaf page */ 2669 pCur->idx++; 2670 }else{ 2671 assert( pPage->u.hdr.rightChild==0 ); /* Must be a leaf page */ 2672 } 2673 insertCell(pBt, pPage, pCur->idx, &newCell, szNew); 2674 rc = balance(pCur->pBt, pPage, pCur); 2675 /* sqliteBtreePageDump(pCur->pBt, pCur->pgnoRoot, 1); */ 2676 /* fflush(stdout); */ 2677 pCur->eSkip = SKIP_INVALID; 2678 return rc; 2679 } 2680 2681 /* 2682 ** Delete the entry that the cursor is pointing to. 2683 ** 2684 ** The cursor is left pointing at either the next or the previous 2685 ** entry. If the cursor is left pointing to the next entry, then 2686 ** the pCur->eSkip flag is set to SKIP_NEXT which forces the next call to 2687 ** sqliteBtreeNext() to be a no-op. That way, you can always call 2688 ** sqliteBtreeNext() after a delete and the cursor will be left 2689 ** pointing to the first entry after the deleted entry. Similarly, 2690 ** pCur->eSkip is set to SKIP_PREV is the cursor is left pointing to 2691 ** the entry prior to the deleted entry so that a subsequent call to 2692 ** sqliteBtreePrevious() will always leave the cursor pointing at the 2693 ** entry immediately before the one that was deleted. 2694 */ 2695 static int fileBtreeDelete(BtCursor *pCur){ 2696 MemPage *pPage = pCur->pPage; 2697 Cell *pCell; 2698 int rc; 2699 Pgno pgnoChild; 2700 Btree *pBt = pCur->pBt; 2701 2702 assert( pPage->isInit ); 2703 if( pCur->pPage==0 ){ 2704 return SQLITE_ABORT; /* A rollback destroyed this cursor */ 2705 } 2706 if( !pBt->inTrans ){ 2707 /* Must start a transaction before doing a delete */ 2708 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR; 2709 } 2710 assert( !pBt->readOnly ); 2711 if( pCur->idx >= pPage->nCell ){ 2712 return SQLITE_ERROR; /* The cursor is not pointing to anything */ 2713 } 2714 if( !pCur->wrFlag ){ 2715 return SQLITE_PERM; /* Did not open this cursor for writing */ 2716 } 2717 if( checkReadLocks(pCur) ){ 2718 return SQLITE_LOCKED; /* The table pCur points to has a read lock */ 2719 } 2720 rc = sqlitepager_write(pPage); 2721 if( rc ) return rc; 2722 pCell = pPage->apCell[pCur->idx]; 2723 pgnoChild = SWAB32(pBt, pCell->h.leftChild); 2724 rc = clearCell(pBt, pCell); 2725 if( rc ) return rc; 2726 if( pgnoChild ){ 2727 /* 2728 ** The entry we are about to delete is not a leaf so if we do not 2729 ** do something we will leave a hole on an internal page. 2730 ** We have to fill the hole by moving in a cell from a leaf. The 2731 ** next Cell after the one to be deleted is guaranteed to exist and 2732 ** to be a leaf so we can use it. 2733 */ 2734 BtCursor leafCur; 2735 Cell *pNext; 2736 int szNext; 2737 int notUsed; 2738 getTempCursor(pCur, &leafCur); 2739 rc = fileBtreeNext(&leafCur, ¬Used); 2740 if( rc!=SQLITE_OK ){ 2741 if( rc!=SQLITE_NOMEM ) rc = SQLITE_CORRUPT; 2742 return rc; 2743 } 2744 rc = sqlitepager_write(leafCur.pPage); 2745 if( rc ) return rc; 2746 dropCell(pBt, pPage, pCur->idx, cellSize(pBt, pCell)); 2747 pNext = leafCur.pPage->apCell[leafCur.idx]; 2748 szNext = cellSize(pBt, pNext); 2749 pNext->h.leftChild = SWAB32(pBt, pgnoChild); 2750 insertCell(pBt, pPage, pCur->idx, pNext, szNext); 2751 rc = balance(pBt, pPage, pCur); 2752 if( rc ) return rc; 2753 pCur->eSkip = SKIP_NEXT; 2754 dropCell(pBt, leafCur.pPage, leafCur.idx, szNext); 2755 rc = balance(pBt, leafCur.pPage, pCur); 2756 releaseTempCursor(&leafCur); 2757 }else{ 2758 dropCell(pBt, pPage, pCur->idx, cellSize(pBt, pCell)); 2759 if( pCur->idx>=pPage->nCell ){ 2760 pCur->idx = pPage->nCell-1; 2761 if( pCur->idx<0 ){ 2762 pCur->idx = 0; 2763 pCur->eSkip = SKIP_NEXT; 2764 }else{ 2765 pCur->eSkip = SKIP_PREV; 2766 } 2767 }else{ 2768 pCur->eSkip = SKIP_NEXT; 2769 } 2770 rc = balance(pBt, pPage, pCur); 2771 } 2772 return rc; 2773 } 2774 2775 /* 2776 ** Create a new BTree table. Write into *piTable the page 2777 ** number for the root page of the new table. 2778 ** 2779 ** In the current implementation, BTree tables and BTree indices are the 2780 ** the same. In the future, we may change this so that BTree tables 2781 ** are restricted to having a 4-byte integer key and arbitrary data and 2782 ** BTree indices are restricted to having an arbitrary key and no data. 2783 ** But for now, this routine also serves to create indices. 2784 */ 2785 static int fileBtreeCreateTable(Btree *pBt, int *piTable){ 2786 MemPage *pRoot; 2787 Pgno pgnoRoot; 2788 int rc; 2789 if( !pBt->inTrans ){ 2790 /* Must start a transaction first */ 2791 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR; 2792 } 2793 if( pBt->readOnly ){ 2794 return SQLITE_READONLY; 2795 } 2796 rc = allocatePage(pBt, &pRoot, &pgnoRoot, 0); 2797 if( rc ) return rc; 2798 assert( sqlitepager_iswriteable(pRoot) ); 2799 zeroPage(pBt, pRoot); 2800 sqlitepager_unref(pRoot); 2801 *piTable = (int)pgnoRoot; 2802 return SQLITE_OK; 2803 } 2804 2805 /* 2806 ** Erase the given database page and all its children. Return 2807 ** the page to the freelist. 2808 */ 2809 static int clearDatabasePage(Btree *pBt, Pgno pgno, int freePageFlag){ 2810 MemPage *pPage; 2811 int rc; 2812 Cell *pCell; 2813 int idx; 2814 2815 rc = sqlitepager_get(pBt->pPager, pgno, (void**)&pPage); 2816 if( rc ) return rc; 2817 rc = sqlitepager_write(pPage); 2818 if( rc ) return rc; 2819 rc = initPage(pBt, pPage, pgno, 0); 2820 if( rc ) return rc; 2821 idx = SWAB16(pBt, pPage->u.hdr.firstCell); 2822 while( idx>0 ){ 2823 pCell = (Cell*)&pPage->u.aDisk[idx]; 2824 idx = SWAB16(pBt, pCell->h.iNext); 2825 if( pCell->h.leftChild ){ 2826 rc = clearDatabasePage(pBt, SWAB32(pBt, pCell->h.leftChild), 1); 2827 if( rc ) return rc; 2828 } 2829 rc = clearCell(pBt, pCell); 2830 if( rc ) return rc; 2831 } 2832 if( pPage->u.hdr.rightChild ){ 2833 rc = clearDatabasePage(pBt, SWAB32(pBt, pPage->u.hdr.rightChild), 1); 2834 if( rc ) return rc; 2835 } 2836 if( freePageFlag ){ 2837 rc = freePage(pBt, pPage, pgno); 2838 }else{ 2839 zeroPage(pBt, pPage); 2840 } 2841 sqlitepager_unref(pPage); 2842 return rc; 2843 } 2844 2845 /* 2846 ** Delete all information from a single table in the database. 2847 */ 2848 static int fileBtreeClearTable(Btree *pBt, int iTable){ 2849 int rc; 2850 BtCursor *pCur; 2851 if( !pBt->inTrans ){ 2852 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR; 2853 } 2854 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){ 2855 if( pCur->pgnoRoot==(Pgno)iTable ){ 2856 if( pCur->wrFlag==0 ) return SQLITE_LOCKED; 2857 moveToRoot(pCur); 2858 } 2859 } 2860 rc = clearDatabasePage(pBt, (Pgno)iTable, 0); 2861 if( rc ){ 2862 fileBtreeRollback(pBt); 2863 } 2864 return rc; 2865 } 2866 2867 /* 2868 ** Erase all information in a table and add the root of the table to 2869 ** the freelist. Except, the root of the principle table (the one on 2870 ** page 2) is never added to the freelist. 2871 */ 2872 static int fileBtreeDropTable(Btree *pBt, int iTable){ 2873 int rc; 2874 MemPage *pPage; 2875 BtCursor *pCur; 2876 if( !pBt->inTrans ){ 2877 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR; 2878 } 2879 for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){ 2880 if( pCur->pgnoRoot==(Pgno)iTable ){ 2881 return SQLITE_LOCKED; /* Cannot drop a table that has a cursor */ 2882 } 2883 } 2884 rc = sqlitepager_get(pBt->pPager, (Pgno)iTable, (void**)&pPage); 2885 if( rc ) return rc; 2886 rc = fileBtreeClearTable(pBt, iTable); 2887 if( rc ) return rc; 2888 if( iTable>2 ){ 2889 rc = freePage(pBt, pPage, iTable); 2890 }else{ 2891 zeroPage(pBt, pPage); 2892 } 2893 sqlitepager_unref(pPage); 2894 return rc; 2895 } 2896 2897 #if 0 /* UNTESTED */ 2898 /* 2899 ** Copy all cell data from one database file into another. 2900 ** pages back the freelist. 2901 */ 2902 static int copyCell(Btree *pBtFrom, BTree *pBtTo, Cell *pCell){ 2903 Pager *pFromPager = pBtFrom->pPager; 2904 OverflowPage *pOvfl; 2905 Pgno ovfl, nextOvfl; 2906 Pgno *pPrev; 2907 int rc = SQLITE_OK; 2908 MemPage *pNew, *pPrevPg; 2909 Pgno new; 2910 2911 if( NKEY(pBtTo, pCell->h) + NDATA(pBtTo, pCell->h) <= MX_LOCAL_PAYLOAD ){ 2912 return SQLITE_OK; 2913 } 2914 pPrev = &pCell->ovfl; 2915 pPrevPg = 0; 2916 ovfl = SWAB32(pBtTo, pCell->ovfl); 2917 while( ovfl && rc==SQLITE_OK ){ 2918 rc = sqlitepager_get(pFromPager, ovfl, (void**)&pOvfl); 2919 if( rc ) return rc; 2920 nextOvfl = SWAB32(pBtFrom, pOvfl->iNext); 2921 rc = allocatePage(pBtTo, &pNew, &new, 0); 2922 if( rc==SQLITE_OK ){ 2923 rc = sqlitepager_write(pNew); 2924 if( rc==SQLITE_OK ){ 2925 memcpy(pNew, pOvfl, SQLITE_USABLE_SIZE); 2926 *pPrev = SWAB32(pBtTo, new); 2927 if( pPrevPg ){ 2928 sqlitepager_unref(pPrevPg); 2929 } 2930 pPrev = &pOvfl->iNext; 2931 pPrevPg = pNew; 2932 } 2933 } 2934 sqlitepager_unref(pOvfl); 2935 ovfl = nextOvfl; 2936 } 2937 if( pPrevPg ){ 2938 sqlitepager_unref(pPrevPg); 2939 } 2940 return rc; 2941 } 2942 #endif 2943 2944 2945 #if 0 /* UNTESTED */ 2946 /* 2947 ** Copy a page of data from one database over to another. 2948 */ 2949 static int copyDatabasePage( 2950 Btree *pBtFrom, 2951 Pgno pgnoFrom, 2952 Btree *pBtTo, 2953 Pgno *pTo 2954 ){ 2955 MemPage *pPageFrom, *pPage; 2956 Pgno to; 2957 int rc; 2958 Cell *pCell; 2959 int idx; 2960 2961 rc = sqlitepager_get(pBtFrom->pPager, pgno, (void**)&pPageFrom); 2962 if( rc ) return rc; 2963 rc = allocatePage(pBt, &pPage, pTo, 0); 2964 if( rc==SQLITE_OK ){ 2965 rc = sqlitepager_write(pPage); 2966 } 2967 if( rc==SQLITE_OK ){ 2968 memcpy(pPage, pPageFrom, SQLITE_USABLE_SIZE); 2969 idx = SWAB16(pBt, pPage->u.hdr.firstCell); 2970 while( idx>0 ){ 2971 pCell = (Cell*)&pPage->u.aDisk[idx]; 2972 idx = SWAB16(pBt, pCell->h.iNext); 2973 if( pCell->h.leftChild ){ 2974 Pgno newChld; 2975 rc = copyDatabasePage(pBtFrom, SWAB32(pBtFrom, pCell->h.leftChild), 2976 pBtTo, &newChld); 2977 if( rc ) return rc; 2978 pCell->h.leftChild = SWAB32(pBtFrom, newChld); 2979 } 2980 rc = copyCell(pBtFrom, pBtTo, pCell); 2981 if( rc ) return rc; 2982 } 2983 if( pPage->u.hdr.rightChild ){ 2984 Pgno newChld; 2985 rc = copyDatabasePage(pBtFrom, SWAB32(pBtFrom, pPage->u.hdr.rightChild), 2986 pBtTo, &newChld); 2987 if( rc ) return rc; 2988 pPage->u.hdr.rightChild = SWAB32(pBtTo, newChild); 2989 } 2990 } 2991 sqlitepager_unref(pPage); 2992 return rc; 2993 } 2994 #endif 2995 2996 /* 2997 ** Read the meta-information out of a database file. 2998 */ 2999 static int fileBtreeGetMeta(Btree *pBt, int *aMeta){ 3000 PageOne *pP1; 3001 int rc; 3002 int i; 3003 3004 rc = sqlitepager_get(pBt->pPager, 1, (void**)&pP1); 3005 if( rc ) return rc; 3006 aMeta[0] = SWAB32(pBt, pP1->nFree); 3007 for(i=0; i<sizeof(pP1->aMeta)/sizeof(pP1->aMeta[0]); i++){ 3008 aMeta[i+1] = SWAB32(pBt, pP1->aMeta[i]); 3009 } 3010 sqlitepager_unref(pP1); 3011 return SQLITE_OK; 3012 } 3013 3014 /* 3015 ** Write meta-information back into the database. 3016 */ 3017 static int fileBtreeUpdateMeta(Btree *pBt, int *aMeta){ 3018 PageOne *pP1; 3019 int rc, i; 3020 if( !pBt->inTrans ){ 3021 return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR; 3022 } 3023 pP1 = pBt->page1; 3024 rc = sqlitepager_write(pP1); 3025 if( rc ) return rc; 3026 for(i=0; i<sizeof(pP1->aMeta)/sizeof(pP1->aMeta[0]); i++){ 3027 pP1->aMeta[i] = SWAB32(pBt, aMeta[i+1]); 3028 } 3029 return SQLITE_OK; 3030 } 3031 3032 /****************************************************************************** 3033 ** The complete implementation of the BTree subsystem is above this line. 3034 ** All the code the follows is for testing and troubleshooting the BTree 3035 ** subsystem. None of the code that follows is used during normal operation. 3036 ******************************************************************************/ 3037 3038 /* 3039 ** Print a disassembly of the given page on standard output. This routine 3040 ** is used for debugging and testing only. 3041 */ 3042 #ifdef SQLITE_TEST 3043 static int fileBtreePageDump(Btree *pBt, int pgno, int recursive){ 3044 int rc; 3045 MemPage *pPage; 3046 int i, j; 3047 int nFree; 3048 u16 idx; 3049 char range[20]; 3050 unsigned char payload[20]; 3051 rc = sqlitepager_get(pBt->pPager, (Pgno)pgno, (void**)&pPage); 3052 if( rc ){ 3053 return rc; 3054 } 3055 if( recursive ) printf("PAGE %d:\n", pgno); 3056 i = 0; 3057 idx = SWAB16(pBt, pPage->u.hdr.firstCell); 3058 while( idx>0 && idx<=SQLITE_USABLE_SIZE-MIN_CELL_SIZE ){ 3059 Cell *pCell = (Cell*)&pPage->u.aDisk[idx]; 3060 int sz = cellSize(pBt, pCell); 3061 sprintf(range,"%d..%d", idx, idx+sz-1); 3062 sz = NKEY(pBt, pCell->h) + NDATA(pBt, pCell->h); 3063 if( sz>sizeof(payload)-1 ) sz = sizeof(payload)-1; 3064 memcpy(payload, pCell->aPayload, sz); 3065 for(j=0; j<sz; j++){ 3066 if( payload[j]<0x20 || payload[j]>0x7f ) payload[j] = '.'; 3067 } 3068 payload[sz] = 0; 3069 printf( 3070 "cell %2d: i=%-10s chld=%-4d nk=%-4d nd=%-4d payload=%s\n", 3071 i, range, (int)pCell->h.leftChild, 3072 NKEY(pBt, pCell->h), NDATA(pBt, pCell->h), 3073 payload 3074 ); 3075 if( pPage->isInit && pPage->apCell[i]!=pCell ){ 3076 printf("**** apCell[%d] does not match on prior entry ****\n", i); 3077 } 3078 i++; 3079 idx = SWAB16(pBt, pCell->h.iNext); 3080 } 3081 if( idx!=0 ){ 3082 printf("ERROR: next cell index out of range: %d\n", idx); 3083 } 3084 printf("right_child: %d\n", SWAB32(pBt, pPage->u.hdr.rightChild)); 3085 nFree = 0; 3086 i = 0; 3087 idx = SWAB16(pBt, pPage->u.hdr.firstFree); 3088 while( idx>0 && idx<SQLITE_USABLE_SIZE ){ 3089 FreeBlk *p = (FreeBlk*)&pPage->u.aDisk[idx]; 3090 sprintf(range,"%d..%d", idx, idx+p->iSize-1); 3091 nFree += SWAB16(pBt, p->iSize); 3092 printf("freeblock %2d: i=%-10s size=%-4d total=%d\n", 3093 i, range, SWAB16(pBt, p->iSize), nFree); 3094 idx = SWAB16(pBt, p->iNext); 3095 i++; 3096 } 3097 if( idx!=0 ){ 3098 printf("ERROR: next freeblock index out of range: %d\n", idx); 3099 } 3100 if( recursive && pPage->u.hdr.rightChild!=0 ){ 3101 idx = SWAB16(pBt, pPage->u.hdr.firstCell); 3102 while( idx>0 && idx<SQLITE_USABLE_SIZE-MIN_CELL_SIZE ){ 3103 Cell *pCell = (Cell*)&pPage->u.aDisk[idx]; 3104 fileBtreePageDump(pBt, SWAB32(pBt, pCell->h.leftChild), 1); 3105 idx = SWAB16(pBt, pCell->h.iNext); 3106 } 3107 fileBtreePageDump(pBt, SWAB32(pBt, pPage->u.hdr.rightChild), 1); 3108 } 3109 sqlitepager_unref(pPage); 3110 return SQLITE_OK; 3111 } 3112 #endif 3113 3114 #ifdef SQLITE_TEST 3115 /* 3116 ** Fill aResult[] with information about the entry and page that the 3117 ** cursor is pointing to. 3118 ** 3119 ** aResult[0] = The page number 3120 ** aResult[1] = The entry number 3121 ** aResult[2] = Total number of entries on this page 3122 ** aResult[3] = Size of this entry 3123 ** aResult[4] = Number of free bytes on this page 3124 ** aResult[5] = Number of free blocks on the page 3125 ** aResult[6] = Page number of the left child of this entry 3126 ** aResult[7] = Page number of the right child for the whole page 3127 ** 3128 ** This routine is used for testing and debugging only. 3129 */ 3130 static int fileBtreeCursorDump(BtCursor *pCur, int *aResult){ 3131 int cnt, idx; 3132 MemPage *pPage = pCur->pPage; 3133 Btree *pBt = pCur->pBt; 3134 aResult[0] = sqlitepager_pagenumber(pPage); 3135 aResult[1] = pCur->idx; 3136 aResult[2] = pPage->nCell; 3137 if( pCur->idx>=0 && pCur->idx<pPage->nCell ){ 3138 aResult[3] = cellSize(pBt, pPage->apCell[pCur->idx]); 3139 aResult[6] = SWAB32(pBt, pPage->apCell[pCur->idx]->h.leftChild); 3140 }else{ 3141 aResult[3] = 0; 3142 aResult[6] = 0; 3143 } 3144 aResult[4] = pPage->nFree; 3145 cnt = 0; 3146 idx = SWAB16(pBt, pPage->u.hdr.firstFree); 3147 while( idx>0 && idx<SQLITE_USABLE_SIZE ){ 3148 cnt++; 3149 idx = SWAB16(pBt, ((FreeBlk*)&pPage->u.aDisk[idx])->iNext); 3150 } 3151 aResult[5] = cnt; 3152 aResult[7] = SWAB32(pBt, pPage->u.hdr.rightChild); 3153 return SQLITE_OK; 3154 } 3155 #endif 3156 3157 /* 3158 ** Return the pager associated with a BTree. This routine is used for 3159 ** testing and debugging only. 3160 */ 3161 static Pager *fileBtreePager(Btree *pBt){ 3162 return pBt->pPager; 3163 } 3164 3165 /* 3166 ** This structure is passed around through all the sanity checking routines 3167 ** in order to keep track of some global state information. 3168 */ 3169 typedef struct IntegrityCk IntegrityCk; 3170 struct IntegrityCk { 3171 Btree *pBt; /* The tree being checked out */ 3172 Pager *pPager; /* The associated pager. Also accessible by pBt->pPager */ 3173 int nPage; /* Number of pages in the database */ 3174 int *anRef; /* Number of times each page is referenced */ 3175 char *zErrMsg; /* An error message. NULL of no errors seen. */ 3176 }; 3177 3178 /* 3179 ** Append a message to the error message string. 3180 */ 3181 static void checkAppendMsg(IntegrityCk *pCheck, char *zMsg1, char *zMsg2){ 3182 if( pCheck->zErrMsg ){ 3183 char *zOld = pCheck->zErrMsg; 3184 pCheck->zErrMsg = 0; 3185 sqliteSetString(&pCheck->zErrMsg, zOld, "\n", zMsg1, zMsg2, (char*)0); 3186 sqliteFree(zOld); 3187 }else{ 3188 sqliteSetString(&pCheck->zErrMsg, zMsg1, zMsg2, (char*)0); 3189 } 3190 } 3191 3192 /* 3193 ** Add 1 to the reference count for page iPage. If this is the second 3194 ** reference to the page, add an error message to pCheck->zErrMsg. 3195 ** Return 1 if there are 2 ore more references to the page and 0 if 3196 ** if this is the first reference to the page. 3197 ** 3198 ** Also check that the page number is in bounds. 3199 */ 3200 static int checkRef(IntegrityCk *pCheck, int iPage, char *zContext){ 3201 if( iPage==0 ) return 1; 3202 if( iPage>pCheck->nPage || iPage<0 ){ 3203 char zBuf[100]; 3204 sprintf(zBuf, "invalid page number %d", iPage); 3205 checkAppendMsg(pCheck, zContext, zBuf); 3206 return 1; 3207 } 3208 if( pCheck->anRef[iPage]==1 ){ 3209 char zBuf[100]; 3210 sprintf(zBuf, "2nd reference to page %d", iPage); 3211 checkAppendMsg(pCheck, zContext, zBuf); 3212 return 1; 3213 } 3214 return (pCheck->anRef[iPage]++)>1; 3215 } 3216 3217 /* 3218 ** Check the integrity of the freelist or of an overflow page list. 3219 ** Verify that the number of pages on the list is N. 3220 */ 3221 static void checkList( 3222 IntegrityCk *pCheck, /* Integrity checking context */ 3223 int isFreeList, /* True for a freelist. False for overflow page list */ 3224 int iPage, /* Page number for first page in the list */ 3225 int N, /* Expected number of pages in the list */ 3226 char *zContext /* Context for error messages */ 3227 ){ 3228 int i; 3229 char zMsg[100]; 3230 while( N-- > 0 ){ 3231 OverflowPage *pOvfl; 3232 if( iPage<1 ){ 3233 sprintf(zMsg, "%d pages missing from overflow list", N+1); 3234 checkAppendMsg(pCheck, zContext, zMsg); 3235 break; 3236 } 3237 if( checkRef(pCheck, iPage, zContext) ) break; 3238 if( sqlitepager_get(pCheck->pPager, (Pgno)iPage, (void**)&pOvfl) ){ 3239 sprintf(zMsg, "failed to get page %d", iPage); 3240 checkAppendMsg(pCheck, zContext, zMsg); 3241 break; 3242 } 3243 if( isFreeList ){ 3244 FreelistInfo *pInfo = (FreelistInfo*)pOvfl->aPayload; 3245 int n = SWAB32(pCheck->pBt, pInfo->nFree); 3246 for(i=0; i<n; i++){ 3247 checkRef(pCheck, SWAB32(pCheck->pBt, pInfo->aFree[i]), zContext); 3248 } 3249 N -= n; 3250 } 3251 iPage = SWAB32(pCheck->pBt, pOvfl->iNext); 3252 sqlitepager_unref(pOvfl); 3253 } 3254 } 3255 3256 /* 3257 ** Return negative if zKey1<zKey2. 3258 ** Return zero if zKey1==zKey2. 3259 ** Return positive if zKey1>zKey2. 3260 */ 3261 static int keyCompare( 3262 const char *zKey1, int nKey1, 3263 const char *zKey2, int nKey2 3264 ){ 3265 int min = nKey1>nKey2 ? nKey2 : nKey1; 3266 int c = memcmp(zKey1, zKey2, min); 3267 if( c==0 ){ 3268 c = nKey1 - nKey2; 3269 } 3270 return c; 3271 } 3272 3273 /* 3274 ** Do various sanity checks on a single page of a tree. Return 3275 ** the tree depth. Root pages return 0. Parents of root pages 3276 ** return 1, and so forth. 3277 ** 3278 ** These checks are done: 3279 ** 3280 ** 1. Make sure that cells and freeblocks do not overlap 3281 ** but combine to completely cover the page. 3282 ** 2. Make sure cell keys are in order. 3283 ** 3. Make sure no key is less than or equal to zLowerBound. 3284 ** 4. Make sure no key is greater than or equal to zUpperBound. 3285 ** 5. Check the integrity of overflow pages. 3286 ** 6. Recursively call checkTreePage on all children. 3287 ** 7. Verify that the depth of all children is the same. 3288 ** 8. Make sure this page is at least 33% full or else it is 3289 ** the root of the tree. 3290 */ 3291 static int checkTreePage( 3292 IntegrityCk *pCheck, /* Context for the sanity check */ 3293 int iPage, /* Page number of the page to check */ 3294 MemPage *pParent, /* Parent page */ 3295 char *zParentContext, /* Parent context */ 3296 char *zLowerBound, /* All keys should be greater than this, if not NULL */ 3297 int nLower, /* Number of characters in zLowerBound */ 3298 char *zUpperBound, /* All keys should be less than this, if not NULL */ 3299 int nUpper /* Number of characters in zUpperBound */ 3300 ){ 3301 MemPage *pPage; 3302 int i, rc, depth, d2, pgno; 3303 char *zKey1, *zKey2; 3304 int nKey1, nKey2; 3305 BtCursor cur; 3306 Btree *pBt; 3307 char zMsg[100]; 3308 char zContext[100]; 3309 char hit[SQLITE_USABLE_SIZE]; 3310 3311 /* Check that the page exists 3312 */ 3313 cur.pBt = pBt = pCheck->pBt; 3314 if( iPage==0 ) return 0; 3315 if( checkRef(pCheck, iPage, zParentContext) ) return 0; 3316 sprintf(zContext, "On tree page %d: ", iPage); 3317 if( (rc = sqlitepager_get(pCheck->pPager, (Pgno)iPage, (void**)&pPage))!=0 ){ 3318 sprintf(zMsg, "unable to get the page. error code=%d", rc); 3319 checkAppendMsg(pCheck, zContext, zMsg); 3320 return 0; 3321 } 3322 if( (rc = initPage(pBt, pPage, (Pgno)iPage, pParent))!=0 ){ 3323 sprintf(zMsg, "initPage() returns error code %d", rc); 3324 checkAppendMsg(pCheck, zContext, zMsg); 3325 sqlitepager_unref(pPage); 3326 return 0; 3327 } 3328 3329 /* Check out all the cells. 3330 */ 3331 depth = 0; 3332 if( zLowerBound ){ 3333 zKey1 = sqliteMalloc( nLower+1 ); 3334 memcpy(zKey1, zLowerBound, nLower); 3335 zKey1[nLower] = 0; 3336 }else{ 3337 zKey1 = 0; 3338 } 3339 nKey1 = nLower; 3340 cur.pPage = pPage; 3341 for(i=0; i<pPage->nCell; i++){ 3342 Cell *pCell = pPage->apCell[i]; 3343 int sz; 3344 3345 /* Check payload overflow pages 3346 */ 3347 nKey2 = NKEY(pBt, pCell->h); 3348 sz = nKey2 + NDATA(pBt, pCell->h); 3349 sprintf(zContext, "On page %d cell %d: ", iPage, i); 3350 if( sz>MX_LOCAL_PAYLOAD ){ 3351 int nPage = (sz - MX_LOCAL_PAYLOAD + OVERFLOW_SIZE - 1)/OVERFLOW_SIZE; 3352 checkList(pCheck, 0, SWAB32(pBt, pCell->ovfl), nPage, zContext); 3353 } 3354 3355 /* Check that keys are in the right order 3356 */ 3357 cur.idx = i; 3358 zKey2 = sqliteMallocRaw( nKey2+1 ); 3359 getPayload(&cur, 0, nKey2, zKey2); 3360 if( zKey1 && keyCompare(zKey1, nKey1, zKey2, nKey2)>=0 ){ 3361 checkAppendMsg(pCheck, zContext, "Key is out of order"); 3362 } 3363 3364 /* Check sanity of left child page. 3365 */ 3366 pgno = SWAB32(pBt, pCell->h.leftChild); 3367 d2 = checkTreePage(pCheck, pgno, pPage, zContext, zKey1,nKey1,zKey2,nKey2); 3368 if( i>0 && d2!=depth ){ 3369 checkAppendMsg(pCheck, zContext, "Child page depth differs"); 3370 } 3371 depth = d2; 3372 sqliteFree(zKey1); 3373 zKey1 = zKey2; 3374 nKey1 = nKey2; 3375 } 3376 pgno = SWAB32(pBt, pPage->u.hdr.rightChild); 3377 sprintf(zContext, "On page %d at right child: ", iPage); 3378 checkTreePage(pCheck, pgno, pPage, zContext, zKey1,nKey1,zUpperBound,nUpper); 3379 sqliteFree(zKey1); 3380 3381 /* Check for complete coverage of the page 3382 */ 3383 memset(hit, 0, sizeof(hit)); 3384 memset(hit, 1, sizeof(PageHdr)); 3385 for(i=SWAB16(pBt, pPage->u.hdr.firstCell); i>0 && i<SQLITE_USABLE_SIZE; ){ 3386 Cell *pCell = (Cell*)&pPage->u.aDisk[i]; 3387 int j; 3388 for(j=i+cellSize(pBt, pCell)-1; j>=i; j--) hit[j]++; 3389 i = SWAB16(pBt, pCell->h.iNext); 3390 } 3391 for(i=SWAB16(pBt,pPage->u.hdr.firstFree); i>0 && i<SQLITE_USABLE_SIZE; ){ 3392 FreeBlk *pFBlk = (FreeBlk*)&pPage->u.aDisk[i]; 3393 int j; 3394 for(j=i+SWAB16(pBt,pFBlk->iSize)-1; j>=i; j--) hit[j]++; 3395 i = SWAB16(pBt,pFBlk->iNext); 3396 } 3397 for(i=0; i<SQLITE_USABLE_SIZE; i++){ 3398 if( hit[i]==0 ){ 3399 sprintf(zMsg, "Unused space at byte %d of page %d", i, iPage); 3400 checkAppendMsg(pCheck, zMsg, 0); 3401 break; 3402 }else if( hit[i]>1 ){ 3403 sprintf(zMsg, "Multiple uses for byte %d of page %d", i, iPage); 3404 checkAppendMsg(pCheck, zMsg, 0); 3405 break; 3406 } 3407 } 3408 3409 /* Check that free space is kept to a minimum 3410 */ 3411 #if 0 3412 if( pParent && pParent->nCell>2 && pPage->nFree>3*SQLITE_USABLE_SIZE/4 ){ 3413 sprintf(zMsg, "free space (%d) greater than max (%d)", pPage->nFree, 3414 SQLITE_USABLE_SIZE/3); 3415 checkAppendMsg(pCheck, zContext, zMsg); 3416 } 3417 #endif 3418 3419 sqlitepager_unref(pPage); 3420 return depth; 3421 } 3422 3423 /* 3424 ** This routine does a complete check of the given BTree file. aRoot[] is 3425 ** an array of pages numbers were each page number is the root page of 3426 ** a table. nRoot is the number of entries in aRoot. 3427 ** 3428 ** If everything checks out, this routine returns NULL. If something is 3429 ** amiss, an error message is written into memory obtained from malloc() 3430 ** and a pointer to that error message is returned. The calling function 3431 ** is responsible for freeing the error message when it is done. 3432 */ 3433 char *fileBtreeIntegrityCheck(Btree *pBt, int *aRoot, int nRoot){ 3434 int i; 3435 int nRef; 3436 IntegrityCk sCheck; 3437 3438 nRef = *sqlitepager_stats(pBt->pPager); 3439 if( lockBtree(pBt)!=SQLITE_OK ){ 3440 return sqliteStrDup("Unable to acquire a read lock on the database"); 3441 } 3442 sCheck.pBt = pBt; 3443 sCheck.pPager = pBt->pPager; 3444 sCheck.nPage = sqlitepager_pagecount(sCheck.pPager); 3445 if( sCheck.nPage==0 ){ 3446 unlockBtreeIfUnused(pBt); 3447 return 0; 3448 } 3449 sCheck.anRef = sqliteMallocRaw( (sCheck.nPage+1)*sizeof(sCheck.anRef[0]) ); 3450 sCheck.anRef[1] = 1; 3451 for(i=2; i<=sCheck.nPage; i++){ sCheck.anRef[i] = 0; } 3452 sCheck.zErrMsg = 0; 3453 3454 /* Check the integrity of the freelist 3455 */ 3456 checkList(&sCheck, 1, SWAB32(pBt, pBt->page1->freeList), 3457 SWAB32(pBt, pBt->page1->nFree), "Main freelist: "); 3458 3459 /* Check all the tables. 3460 */ 3461 for(i=0; i<nRoot; i++){ 3462 if( aRoot[i]==0 ) continue; 3463 checkTreePage(&sCheck, aRoot[i], 0, "List of tree roots: ", 0,0,0,0); 3464 } 3465 3466 /* Make sure every page in the file is referenced 3467 */ 3468 for(i=1; i<=sCheck.nPage; i++){ 3469 if( sCheck.anRef[i]==0 ){ 3470 char zBuf[100]; 3471 sprintf(zBuf, "Page %d is never used", i); 3472 checkAppendMsg(&sCheck, zBuf, 0); 3473 } 3474 } 3475 3476 /* Make sure this analysis did not leave any unref() pages 3477 */ 3478 unlockBtreeIfUnused(pBt); 3479 if( nRef != *sqlitepager_stats(pBt->pPager) ){ 3480 char zBuf[100]; 3481 sprintf(zBuf, 3482 "Outstanding page count goes from %d to %d during this analysis", 3483 nRef, *sqlitepager_stats(pBt->pPager) 3484 ); 3485 checkAppendMsg(&sCheck, zBuf, 0); 3486 } 3487 3488 /* Clean up and report errors. 3489 */ 3490 sqliteFree(sCheck.anRef); 3491 return sCheck.zErrMsg; 3492 } 3493 3494 /* 3495 ** Return the full pathname of the underlying database file. 3496 */ 3497 static const char *fileBtreeGetFilename(Btree *pBt){ 3498 assert( pBt->pPager!=0 ); 3499 return sqlitepager_filename(pBt->pPager); 3500 } 3501 3502 /* 3503 ** Copy the complete content of pBtFrom into pBtTo. A transaction 3504 ** must be active for both files. 3505 ** 3506 ** The size of file pBtFrom may be reduced by this operation. 3507 ** If anything goes wrong, the transaction on pBtFrom is rolled back. 3508 */ 3509 static int fileBtreeCopyFile(Btree *pBtTo, Btree *pBtFrom){ 3510 int rc = SQLITE_OK; 3511 Pgno i, nPage, nToPage; 3512 3513 if( !pBtTo->inTrans || !pBtFrom->inTrans ) return SQLITE_ERROR; 3514 if( pBtTo->needSwab!=pBtFrom->needSwab ) return SQLITE_ERROR; 3515 if( pBtTo->pCursor ) return SQLITE_BUSY; 3516 memcpy(pBtTo->page1, pBtFrom->page1, SQLITE_USABLE_SIZE); 3517 rc = sqlitepager_overwrite(pBtTo->pPager, 1, pBtFrom->page1); 3518 nToPage = sqlitepager_pagecount(pBtTo->pPager); 3519 nPage = sqlitepager_pagecount(pBtFrom->pPager); 3520 for(i=2; rc==SQLITE_OK && i<=nPage; i++){ 3521 void *pPage; 3522 rc = sqlitepager_get(pBtFrom->pPager, i, &pPage); 3523 if( rc ) break; 3524 rc = sqlitepager_overwrite(pBtTo->pPager, i, pPage); 3525 if( rc ) break; 3526 sqlitepager_unref(pPage); 3527 } 3528 for(i=nPage+1; rc==SQLITE_OK && i<=nToPage; i++){ 3529 void *pPage; 3530 rc = sqlitepager_get(pBtTo->pPager, i, &pPage); 3531 if( rc ) break; 3532 rc = sqlitepager_write(pPage); 3533 sqlitepager_unref(pPage); 3534 sqlitepager_dont_write(pBtTo->pPager, i); 3535 } 3536 if( !rc && nPage<nToPage ){ 3537 rc = sqlitepager_truncate(pBtTo->pPager, nPage); 3538 } 3539 if( rc ){ 3540 fileBtreeRollback(pBtTo); 3541 } 3542 return rc; 3543 } 3544 3545 /* 3546 ** The following tables contain pointers to all of the interface 3547 ** routines for this implementation of the B*Tree backend. To 3548 ** substitute a different implemention of the backend, one has merely 3549 ** to provide pointers to alternative functions in similar tables. 3550 */ 3551 static BtOps sqliteBtreeOps = { 3552 fileBtreeClose, 3553 fileBtreeSetCacheSize, 3554 fileBtreeSetSafetyLevel, 3555 fileBtreeBeginTrans, 3556 fileBtreeCommit, 3557 fileBtreeRollback, 3558 fileBtreeBeginCkpt, 3559 fileBtreeCommitCkpt, 3560 fileBtreeRollbackCkpt, 3561 fileBtreeCreateTable, 3562 fileBtreeCreateTable, /* Really sqliteBtreeCreateIndex() */ 3563 fileBtreeDropTable, 3564 fileBtreeClearTable, 3565 fileBtreeCursor, 3566 fileBtreeGetMeta, 3567 fileBtreeUpdateMeta, 3568 fileBtreeIntegrityCheck, 3569 fileBtreeGetFilename, 3570 fileBtreeCopyFile, 3571 fileBtreePager, 3572 #ifdef SQLITE_TEST 3573 fileBtreePageDump, 3574 #endif 3575 }; 3576 static BtCursorOps sqliteBtreeCursorOps = { 3577 fileBtreeMoveto, 3578 fileBtreeDelete, 3579 fileBtreeInsert, 3580 fileBtreeFirst, 3581 fileBtreeLast, 3582 fileBtreeNext, 3583 fileBtreePrevious, 3584 fileBtreeKeySize, 3585 fileBtreeKey, 3586 fileBtreeKeyCompare, 3587 fileBtreeDataSize, 3588 fileBtreeData, 3589 fileBtreeCloseCursor, 3590 #ifdef SQLITE_TEST 3591 fileBtreeCursorDump, 3592 #endif 3593 }; 3594