1 /*
2 ** 2001 September 15
3 **
4 ** The author disclaims copyright to this source code.  In place of
5 ** a legal notice, here is a blessing:
6 **
7 **    May you do good and not evil.
8 **    May you find forgiveness for yourself and forgive others.
9 **    May you share freely, never taking more than you give.
10 **
11 *************************************************************************
12 ** $Id: btree.c,v 1.1.1.1 2004/08/08 15:03:57 matt Exp $
13 **
14 ** This file implements a external (disk-based) database using BTrees.
15 ** For a detailed discussion of BTrees, refer to
16 **
17 **     Donald E. Knuth, THE ART OF COMPUTER PROGRAMMING, Volume 3:
18 **     "Sorting And Searching", pages 473-480. Addison-Wesley
19 **     Publishing Company, Reading, Massachusetts.
20 **
21 ** The basic idea is that each page of the file contains N database
22 ** entries and N+1 pointers to subpages.
23 **
24 **   ----------------------------------------------------------------
25 **   |  Ptr(0) | Key(0) | Ptr(1) | Key(1) | ... | Key(N) | Ptr(N+1) |
26 **   ----------------------------------------------------------------
27 **
28 ** All of the keys on the page that Ptr(0) points to have values less
29 ** than Key(0).  All of the keys on page Ptr(1) and its subpages have
30 ** values greater than Key(0) and less than Key(1).  All of the keys
31 ** on Ptr(N+1) and its subpages have values greater than Key(N).  And
32 ** so forth.
33 **
34 ** Finding a particular key requires reading O(log(M)) pages from the
35 ** disk where M is the number of entries in the tree.
36 **
37 ** In this implementation, a single file can hold one or more separate
38 ** BTrees.  Each BTree is identified by the index of its root page.  The
39 ** key and data for any entry are combined to form the "payload".  Up to
40 ** MX_LOCAL_PAYLOAD bytes of payload can be carried directly on the
41 ** database page.  If the payload is larger than MX_LOCAL_PAYLOAD bytes
42 ** then surplus bytes are stored on overflow pages.  The payload for an
43 ** entry and the preceding pointer are combined to form a "Cell".  Each
44 ** page has a small header which contains the Ptr(N+1) pointer.
45 **
46 ** The first page of the file contains a magic string used to verify that
47 ** the file really is a valid BTree database, a pointer to a list of unused
48 ** pages in the file, and some meta information.  The root of the first
49 ** BTree begins on page 2 of the file.  (Pages are numbered beginning with
50 ** 1, not 0.)  Thus a minimum database contains 2 pages.
51 */
52 #include "sqliteInt.h"
53 #include "pager.h"
54 #include "btree.h"
55 #include <assert.h>
56 
57 /* Forward declarations */
58 static BtOps sqliteBtreeOps;
59 static BtCursorOps sqliteBtreeCursorOps;
60 
61 /*
62 ** Macros used for byteswapping.  B is a pointer to the Btree
63 ** structure.  This is needed to access the Btree.needSwab boolean
64 ** in order to tell if byte swapping is needed or not.
65 ** X is an unsigned integer.  SWAB16 byte swaps a 16-bit integer.
66 ** SWAB32 byteswaps a 32-bit integer.
67 */
68 #define SWAB16(B,X)   ((B)->needSwab? swab16((u16)X) : ((u16)X))
69 #define SWAB32(B,X)   ((B)->needSwab? swab32(X) : (X))
70 #define SWAB_ADD(B,X,A) \
71    if((B)->needSwab){ X=swab32(swab32(X)+A); }else{ X += (A); }
72 
73 /*
74 ** The following global variable - available only if SQLITE_TEST is
75 ** defined - is used to determine whether new databases are created in
76 ** native byte order or in non-native byte order.  Non-native byte order
77 ** databases are created for testing purposes only.  Under normal operation,
78 ** only native byte-order databases should be created, but we should be
79 ** able to read or write existing databases regardless of the byteorder.
80 */
81 #ifdef SQLITE_TEST
82 int btree_native_byte_order = 1;
83 #else
84 # define btree_native_byte_order 1
85 #endif
86 
87 /*
88 ** Forward declarations of structures used only in this file.
89 */
90 typedef struct PageOne PageOne;
91 typedef struct MemPage MemPage;
92 typedef struct PageHdr PageHdr;
93 typedef struct Cell Cell;
94 typedef struct CellHdr CellHdr;
95 typedef struct FreeBlk FreeBlk;
96 typedef struct OverflowPage OverflowPage;
97 typedef struct FreelistInfo FreelistInfo;
98 
99 /*
100 ** All structures on a database page are aligned to 4-byte boundries.
101 ** This routine rounds up a number of bytes to the next multiple of 4.
102 **
103 ** This might need to change for computer architectures that require
104 ** and 8-byte alignment boundry for structures.
105 */
106 #define ROUNDUP(X)  ((X+3) & ~3)
107 
108 /*
109 ** This is a magic string that appears at the beginning of every
110 ** SQLite database in order to identify the file as a real database.
111 */
112 static const char zMagicHeader[] =
113    "** This file contains an SQLite 2.1 database **";
114 #define MAGIC_SIZE (sizeof(zMagicHeader))
115 
116 /*
117 ** This is a magic integer also used to test the integrity of the database
118 ** file.  This integer is used in addition to the string above so that
119 ** if the file is written on a little-endian architecture and read
120 ** on a big-endian architectures (or vice versa) we can detect the
121 ** problem.
122 **
123 ** The number used was obtained at random and has no special
124 ** significance other than the fact that it represents a different
125 ** integer on little-endian and big-endian machines.
126 */
127 #define MAGIC 0xdae37528
128 
129 /*
130 ** The first page of the database file contains a magic header string
131 ** to identify the file as an SQLite database file.  It also contains
132 ** a pointer to the first free page of the file.  Page 2 contains the
133 ** root of the principle BTree.  The file might contain other BTrees
134 ** rooted on pages above 2.
135 **
136 ** The first page also contains SQLITE_N_BTREE_META integers that
137 ** can be used by higher-level routines.
138 **
139 ** Remember that pages are numbered beginning with 1.  (See pager.c
140 ** for additional information.)  Page 0 does not exist and a page
141 ** number of 0 is used to mean "no such page".
142 */
143 struct PageOne {
144   char zMagic[MAGIC_SIZE]; /* String that identifies the file as a database */
145   int iMagic;              /* Integer to verify correct byte order */
146   Pgno freeList;           /* First free page in a list of all free pages */
147   int nFree;               /* Number of pages on the free list */
148   int aMeta[SQLITE_N_BTREE_META-1];  /* User defined integers */
149 };
150 
151 /*
152 ** Each database page has a header that is an instance of this
153 ** structure.
154 **
155 ** PageHdr.firstFree is 0 if there is no free space on this page.
156 ** Otherwise, PageHdr.firstFree is the index in MemPage.u.aDisk[] of a
157 ** FreeBlk structure that describes the first block of free space.
158 ** All free space is defined by a linked list of FreeBlk structures.
159 **
160 ** Data is stored in a linked list of Cell structures.  PageHdr.firstCell
161 ** is the index into MemPage.u.aDisk[] of the first cell on the page.  The
162 ** Cells are kept in sorted order.
163 **
164 ** A Cell contains all information about a database entry and a pointer
165 ** to a child page that contains other entries less than itself.  In
166 ** other words, the i-th Cell contains both Ptr(i) and Key(i).  The
167 ** right-most pointer of the page is contained in PageHdr.rightChild.
168 */
169 struct PageHdr {
170   Pgno rightChild;  /* Child page that comes after all cells on this page */
171   u16 firstCell;    /* Index in MemPage.u.aDisk[] of the first cell */
172   u16 firstFree;    /* Index in MemPage.u.aDisk[] of the first free block */
173 };
174 
175 /*
176 ** Entries on a page of the database are called "Cells".  Each Cell
177 ** has a header and data.  This structure defines the header.  The
178 ** key and data (collectively the "payload") follow this header on
179 ** the database page.
180 **
181 ** A definition of the complete Cell structure is given below.  The
182 ** header for the cell must be defined first in order to do some
183 ** of the sizing #defines that follow.
184 */
185 struct CellHdr {
186   Pgno leftChild; /* Child page that comes before this cell */
187   u16 nKey;       /* Number of bytes in the key */
188   u16 iNext;      /* Index in MemPage.u.aDisk[] of next cell in sorted order */
189   u8 nKeyHi;      /* Upper 8 bits of key size for keys larger than 64K bytes */
190   u8 nDataHi;     /* Upper 8 bits of data size when the size is more than 64K */
191   u16 nData;      /* Number of bytes of data */
192 };
193 
194 /*
195 ** The key and data size are split into a lower 16-bit segment and an
196 ** upper 8-bit segment in order to pack them together into a smaller
197 ** space.  The following macros reassembly a key or data size back
198 ** into an integer.
199 */
200 #define NKEY(b,h)  (SWAB16(b,h.nKey) + h.nKeyHi*65536)
201 #define NDATA(b,h) (SWAB16(b,h.nData) + h.nDataHi*65536)
202 
203 /*
204 ** The minimum size of a complete Cell.  The Cell must contain a header
205 ** and at least 4 bytes of payload.
206 */
207 #define MIN_CELL_SIZE  (sizeof(CellHdr)+4)
208 
209 /*
210 ** The maximum number of database entries that can be held in a single
211 ** page of the database.
212 */
213 #define MX_CELL ((SQLITE_USABLE_SIZE-sizeof(PageHdr))/MIN_CELL_SIZE)
214 
215 /*
216 ** The amount of usable space on a single page of the BTree.  This is the
217 ** page size minus the overhead of the page header.
218 */
219 #define USABLE_SPACE  (SQLITE_USABLE_SIZE - sizeof(PageHdr))
220 
221 /*
222 ** The maximum amount of payload (in bytes) that can be stored locally for
223 ** a database entry.  If the entry contains more data than this, the
224 ** extra goes onto overflow pages.
225 **
226 ** This number is chosen so that at least 4 cells will fit on every page.
227 */
228 #define MX_LOCAL_PAYLOAD ((USABLE_SPACE/4-(sizeof(CellHdr)+sizeof(Pgno)))&~3)
229 
230 /*
231 ** Data on a database page is stored as a linked list of Cell structures.
232 ** Both the key and the data are stored in aPayload[].  The key always comes
233 ** first.  The aPayload[] field grows as necessary to hold the key and data,
234 ** up to a maximum of MX_LOCAL_PAYLOAD bytes.  If the size of the key and
235 ** data combined exceeds MX_LOCAL_PAYLOAD bytes, then Cell.ovfl is the
236 ** page number of the first overflow page.
237 **
238 ** Though this structure is fixed in size, the Cell on the database
239 ** page varies in size.  Every cell has a CellHdr and at least 4 bytes
240 ** of payload space.  Additional payload bytes (up to the maximum of
241 ** MX_LOCAL_PAYLOAD) and the Cell.ovfl value are allocated only as
242 ** needed.
243 */
244 struct Cell {
245   CellHdr h;                        /* The cell header */
246   char aPayload[MX_LOCAL_PAYLOAD];  /* Key and data */
247   Pgno ovfl;                        /* The first overflow page */
248 };
249 
250 /*
251 ** Free space on a page is remembered using a linked list of the FreeBlk
252 ** structures.  Space on a database page is allocated in increments of
253 ** at least 4 bytes and is always aligned to a 4-byte boundry.  The
254 ** linked list of FreeBlks is always kept in order by address.
255 */
256 struct FreeBlk {
257   u16 iSize;      /* Number of bytes in this block of free space */
258   u16 iNext;      /* Index in MemPage.u.aDisk[] of the next free block */
259 };
260 
261 /*
262 ** The number of bytes of payload that will fit on a single overflow page.
263 */
264 #define OVERFLOW_SIZE (SQLITE_USABLE_SIZE-sizeof(Pgno))
265 
266 /*
267 ** When the key and data for a single entry in the BTree will not fit in
268 ** the MX_LOCAL_PAYLOAD bytes of space available on the database page,
269 ** then all extra bytes are written to a linked list of overflow pages.
270 ** Each overflow page is an instance of the following structure.
271 **
272 ** Unused pages in the database are also represented by instances of
273 ** the OverflowPage structure.  The PageOne.freeList field is the
274 ** page number of the first page in a linked list of unused database
275 ** pages.
276 */
277 struct OverflowPage {
278   Pgno iNext;
279   char aPayload[OVERFLOW_SIZE];
280 };
281 
282 /*
283 ** The PageOne.freeList field points to a linked list of overflow pages
284 ** hold information about free pages.  The aPayload section of each
285 ** overflow page contains an instance of the following structure.  The
286 ** aFree[] array holds the page number of nFree unused pages in the disk
287 ** file.
288 */
289 struct FreelistInfo {
290   int nFree;
291   Pgno aFree[(OVERFLOW_SIZE-sizeof(int))/sizeof(Pgno)];
292 };
293 
294 /*
295 ** For every page in the database file, an instance of the following structure
296 ** is stored in memory.  The u.aDisk[] array contains the raw bits read from
297 ** the disk.  The rest is auxiliary information held in memory only. The
298 ** auxiliary info is only valid for regular database pages - it is not
299 ** used for overflow pages and pages on the freelist.
300 **
301 ** Of particular interest in the auxiliary info is the apCell[] entry.  Each
302 ** apCell[] entry is a pointer to a Cell structure in u.aDisk[].  The cells are
303 ** put in this array so that they can be accessed in constant time, rather
304 ** than in linear time which would be needed if we had to walk the linked
305 ** list on every access.
306 **
307 ** Note that apCell[] contains enough space to hold up to two more Cells
308 ** than can possibly fit on one page.  In the steady state, every apCell[]
309 ** points to memory inside u.aDisk[].  But in the middle of an insert
310 ** operation, some apCell[] entries may temporarily point to data space
311 ** outside of u.aDisk[].  This is a transient situation that is quickly
312 ** resolved.  But while it is happening, it is possible for a database
313 ** page to hold as many as two more cells than it might otherwise hold.
314 ** The extra two entries in apCell[] are an allowance for this situation.
315 **
316 ** The pParent field points back to the parent page.  This allows us to
317 ** walk up the BTree from any leaf to the root.  Care must be taken to
318 ** unref() the parent page pointer when this page is no longer referenced.
319 ** The pageDestructor() routine handles that chore.
320 */
321 struct MemPage {
322   union u_page_data {
323     char aDisk[SQLITE_PAGE_SIZE];  /* Page data stored on disk */
324     PageHdr hdr;                   /* Overlay page header */
325   } u;
326   u8 isInit;                     /* True if auxiliary data is initialized */
327   u8 idxShift;                   /* True if apCell[] indices have changed */
328   u8 isOverfull;                 /* Some apCell[] points outside u.aDisk[] */
329   MemPage *pParent;              /* The parent of this page.  NULL for root */
330   int idxParent;                 /* Index in pParent->apCell[] of this node */
331   int nFree;                     /* Number of free bytes in u.aDisk[] */
332   int nCell;                     /* Number of entries on this page */
333   Cell *apCell[MX_CELL+2];       /* All data entires in sorted order */
334 };
335 
336 /*
337 ** The in-memory image of a disk page has the auxiliary information appended
338 ** to the end.  EXTRA_SIZE is the number of bytes of space needed to hold
339 ** that extra information.
340 */
341 #define EXTRA_SIZE (sizeof(MemPage)-sizeof(union u_page_data))
342 
343 /*
344 ** Everything we need to know about an open database
345 */
346 struct Btree {
347   BtOps *pOps;          /* Function table */
348   Pager *pPager;        /* The page cache */
349   BtCursor *pCursor;    /* A list of all open cursors */
350   PageOne *page1;       /* First page of the database */
351   u8 inTrans;           /* True if a transaction is in progress */
352   u8 inCkpt;            /* True if there is a checkpoint on the transaction */
353   u8 readOnly;          /* True if the underlying file is readonly */
354   u8 needSwab;          /* Need to byte-swapping */
355 };
356 typedef Btree Bt;
357 
358 /*
359 ** A cursor is a pointer to a particular entry in the BTree.
360 ** The entry is identified by its MemPage and the index in
361 ** MemPage.apCell[] of the entry.
362 */
363 struct BtCursor {
364   BtCursorOps *pOps;        /* Function table */
365   Btree *pBt;               /* The Btree to which this cursor belongs */
366   BtCursor *pNext, *pPrev;  /* Forms a linked list of all cursors */
367   BtCursor *pShared;        /* Loop of cursors with the same root page */
368   Pgno pgnoRoot;            /* The root page of this tree */
369   MemPage *pPage;           /* Page that contains the entry */
370   int idx;                  /* Index of the entry in pPage->apCell[] */
371   u8 wrFlag;                /* True if writable */
372   u8 eSkip;                 /* Determines if next step operation is a no-op */
373   u8 iMatch;                /* compare result from last sqliteBtreeMoveto() */
374 };
375 
376 /*
377 ** Legal values for BtCursor.eSkip.
378 */
379 #define SKIP_NONE     0   /* Always step the cursor */
380 #define SKIP_NEXT     1   /* The next sqliteBtreeNext() is a no-op */
381 #define SKIP_PREV     2   /* The next sqliteBtreePrevious() is a no-op */
382 #define SKIP_INVALID  3   /* Calls to Next() and Previous() are invalid */
383 
384 /* Forward declarations */
385 static int fileBtreeCloseCursor(BtCursor *pCur);
386 
387 /*
388 ** Routines for byte swapping.
389 */
swab16(u16 x)390 u16 swab16(u16 x){
391   return ((x & 0xff)<<8) | ((x>>8)&0xff);
392 }
swab32(u32 x)393 u32 swab32(u32 x){
394   return ((x & 0xff)<<24) | ((x & 0xff00)<<8) |
395          ((x>>8) & 0xff00) | ((x>>24)&0xff);
396 }
397 
398 /*
399 ** Compute the total number of bytes that a Cell needs on the main
400 ** database page.  The number returned includes the Cell header,
401 ** local payload storage, and the pointer to overflow pages (if
402 ** applicable).  Additional space allocated on overflow pages
403 ** is NOT included in the value returned from this routine.
404 */
cellSize(Btree * pBt,Cell * pCell)405 static int cellSize(Btree *pBt, Cell *pCell){
406   int n = NKEY(pBt, pCell->h) + NDATA(pBt, pCell->h);
407   if( n>MX_LOCAL_PAYLOAD ){
408     n = MX_LOCAL_PAYLOAD + sizeof(Pgno);
409   }else{
410     n = ROUNDUP(n);
411   }
412   n += sizeof(CellHdr);
413   return n;
414 }
415 
416 /*
417 ** Defragment the page given.  All Cells are moved to the
418 ** beginning of the page and all free space is collected
419 ** into one big FreeBlk at the end of the page.
420 */
defragmentPage(Btree * pBt,MemPage * pPage)421 static void defragmentPage(Btree *pBt, MemPage *pPage){
422   int pc, i, n;
423   FreeBlk *pFBlk;
424   char newPage[SQLITE_USABLE_SIZE];
425 
426   assert( sqlitepager_iswriteable(pPage) );
427   assert( pPage->isInit );
428   pc = sizeof(PageHdr);
429   pPage->u.hdr.firstCell = SWAB16(pBt, pc);
430   memcpy(newPage, pPage->u.aDisk, pc);
431   for(i=0; i<pPage->nCell; i++){
432     Cell *pCell = pPage->apCell[i];
433 
434     /* This routine should never be called on an overfull page.  The
435     ** following asserts verify that constraint. */
436     assert( Addr(pCell) > Addr(pPage) );
437     assert( Addr(pCell) < Addr(pPage) + SQLITE_USABLE_SIZE );
438 
439     n = cellSize(pBt, pCell);
440     pCell->h.iNext = SWAB16(pBt, pc + n);
441     memcpy(&newPage[pc], pCell, n);
442     pPage->apCell[i] = (Cell*)&pPage->u.aDisk[pc];
443     pc += n;
444   }
445   assert( pPage->nFree==SQLITE_USABLE_SIZE-pc );
446   memcpy(pPage->u.aDisk, newPage, pc);
447   if( pPage->nCell>0 ){
448     pPage->apCell[pPage->nCell-1]->h.iNext = 0;
449   }
450   pFBlk = (FreeBlk*)&pPage->u.aDisk[pc];
451   pFBlk->iSize = SWAB16(pBt, SQLITE_USABLE_SIZE - pc);
452   pFBlk->iNext = 0;
453   pPage->u.hdr.firstFree = SWAB16(pBt, pc);
454   memset(&pFBlk[1], 0, SQLITE_USABLE_SIZE - pc - sizeof(FreeBlk));
455 }
456 
457 /*
458 ** Allocate nByte bytes of space on a page.  nByte must be a
459 ** multiple of 4.
460 **
461 ** Return the index into pPage->u.aDisk[] of the first byte of
462 ** the new allocation. Or return 0 if there is not enough free
463 ** space on the page to satisfy the allocation request.
464 **
465 ** If the page contains nBytes of free space but does not contain
466 ** nBytes of contiguous free space, then this routine automatically
467 ** calls defragementPage() to consolidate all free space before
468 ** allocating the new chunk.
469 */
allocateSpace(Btree * pBt,MemPage * pPage,int nByte)470 static int allocateSpace(Btree *pBt, MemPage *pPage, int nByte){
471   FreeBlk *p;
472   u16 *pIdx;
473   int start;
474   int iSize;
475 #ifndef NDEBUG
476   int cnt = 0;
477 #endif
478 
479   assert( sqlitepager_iswriteable(pPage) );
480   assert( nByte==ROUNDUP(nByte) );
481   assert( pPage->isInit );
482   if( pPage->nFree<nByte || pPage->isOverfull ) return 0;
483   pIdx = &pPage->u.hdr.firstFree;
484   p = (FreeBlk*)&pPage->u.aDisk[SWAB16(pBt, *pIdx)];
485   while( (iSize = SWAB16(pBt, p->iSize))<nByte ){
486     assert( cnt++ < SQLITE_USABLE_SIZE/4 );
487     if( p->iNext==0 ){
488       defragmentPage(pBt, pPage);
489       pIdx = &pPage->u.hdr.firstFree;
490     }else{
491       pIdx = &p->iNext;
492     }
493     p = (FreeBlk*)&pPage->u.aDisk[SWAB16(pBt, *pIdx)];
494   }
495   if( iSize==nByte ){
496     start = SWAB16(pBt, *pIdx);
497     *pIdx = p->iNext;
498   }else{
499     FreeBlk *pNew;
500     start = SWAB16(pBt, *pIdx);
501     pNew = (FreeBlk*)&pPage->u.aDisk[start + nByte];
502     pNew->iNext = p->iNext;
503     pNew->iSize = SWAB16(pBt, iSize - nByte);
504     *pIdx = SWAB16(pBt, start + nByte);
505   }
506   pPage->nFree -= nByte;
507   return start;
508 }
509 
510 /*
511 ** Return a section of the MemPage.u.aDisk[] to the freelist.
512 ** The first byte of the new free block is pPage->u.aDisk[start]
513 ** and the size of the block is "size" bytes.  Size must be
514 ** a multiple of 4.
515 **
516 ** Most of the effort here is involved in coalesing adjacent
517 ** free blocks into a single big free block.
518 */
freeSpace(Btree * pBt,MemPage * pPage,int start,int size)519 static void freeSpace(Btree *pBt, MemPage *pPage, int start, int size){
520   int end = start + size;
521   u16 *pIdx, idx;
522   FreeBlk *pFBlk;
523   FreeBlk *pNew;
524   FreeBlk *pNext;
525   int iSize;
526 
527   assert( sqlitepager_iswriteable(pPage) );
528   assert( size == ROUNDUP(size) );
529   assert( start == ROUNDUP(start) );
530   assert( pPage->isInit );
531   pIdx = &pPage->u.hdr.firstFree;
532   idx = SWAB16(pBt, *pIdx);
533   while( idx!=0 && idx<start ){
534     pFBlk = (FreeBlk*)&pPage->u.aDisk[idx];
535     iSize = SWAB16(pBt, pFBlk->iSize);
536     if( idx + iSize == start ){
537       pFBlk->iSize = SWAB16(pBt, iSize + size);
538       if( idx + iSize + size == SWAB16(pBt, pFBlk->iNext) ){
539         pNext = (FreeBlk*)&pPage->u.aDisk[idx + iSize + size];
540         if( pBt->needSwab ){
541           pFBlk->iSize = swab16((u16)swab16(pNext->iSize)+iSize+size);
542         }else{
543           pFBlk->iSize += pNext->iSize;
544         }
545         pFBlk->iNext = pNext->iNext;
546       }
547       pPage->nFree += size;
548       return;
549     }
550     pIdx = &pFBlk->iNext;
551     idx = SWAB16(pBt, *pIdx);
552   }
553   pNew = (FreeBlk*)&pPage->u.aDisk[start];
554   if( idx != end ){
555     pNew->iSize = SWAB16(pBt, size);
556     pNew->iNext = SWAB16(pBt, idx);
557   }else{
558     pNext = (FreeBlk*)&pPage->u.aDisk[idx];
559     pNew->iSize = SWAB16(pBt, size + SWAB16(pBt, pNext->iSize));
560     pNew->iNext = pNext->iNext;
561   }
562   *pIdx = SWAB16(pBt, start);
563   pPage->nFree += size;
564 }
565 
566 /*
567 ** Initialize the auxiliary information for a disk block.
568 **
569 ** The pParent parameter must be a pointer to the MemPage which
570 ** is the parent of the page being initialized.  The root of the
571 ** BTree (usually page 2) has no parent and so for that page,
572 ** pParent==NULL.
573 **
574 ** Return SQLITE_OK on success.  If we see that the page does
575 ** not contain a well-formed database page, then return
576 ** SQLITE_CORRUPT.  Note that a return of SQLITE_OK does not
577 ** guarantee that the page is well-formed.  It only shows that
578 ** we failed to detect any corruption.
579 */
initPage(Bt * pBt,MemPage * pPage,Pgno pgnoThis,MemPage * pParent)580 static int initPage(Bt *pBt, MemPage *pPage, Pgno pgnoThis, MemPage *pParent){
581   int idx;           /* An index into pPage->u.aDisk[] */
582   Cell *pCell;       /* A pointer to a Cell in pPage->u.aDisk[] */
583   FreeBlk *pFBlk;    /* A pointer to a free block in pPage->u.aDisk[] */
584   int sz;            /* The size of a Cell in bytes */
585   int freeSpace;     /* Amount of free space on the page */
586 
587   if( pPage->pParent ){
588     assert( pPage->pParent==pParent );
589     return SQLITE_OK;
590   }
591   if( pParent ){
592     pPage->pParent = pParent;
593     sqlitepager_ref(pParent);
594   }
595   if( pPage->isInit ) return SQLITE_OK;
596   pPage->isInit = 1;
597   pPage->nCell = 0;
598   freeSpace = USABLE_SPACE;
599   idx = SWAB16(pBt, pPage->u.hdr.firstCell);
600   while( idx!=0 ){
601     if( idx>SQLITE_USABLE_SIZE-MIN_CELL_SIZE ) goto page_format_error;
602     if( idx<sizeof(PageHdr) ) goto page_format_error;
603     if( idx!=ROUNDUP(idx) ) goto page_format_error;
604     pCell = (Cell*)&pPage->u.aDisk[idx];
605     sz = cellSize(pBt, pCell);
606     if( idx+sz > SQLITE_USABLE_SIZE ) goto page_format_error;
607     freeSpace -= sz;
608     pPage->apCell[pPage->nCell++] = pCell;
609     idx = SWAB16(pBt, pCell->h.iNext);
610   }
611   pPage->nFree = 0;
612   idx = SWAB16(pBt, pPage->u.hdr.firstFree);
613   while( idx!=0 ){
614     int iNext;
615     if( idx>SQLITE_USABLE_SIZE-sizeof(FreeBlk) ) goto page_format_error;
616     if( idx<sizeof(PageHdr) ) goto page_format_error;
617     pFBlk = (FreeBlk*)&pPage->u.aDisk[idx];
618     pPage->nFree += SWAB16(pBt, pFBlk->iSize);
619     iNext = SWAB16(pBt, pFBlk->iNext);
620     if( iNext>0 && iNext <= idx ) goto page_format_error;
621     idx = iNext;
622   }
623   if( pPage->nCell==0 && pPage->nFree==0 ){
624     /* As a special case, an uninitialized root page appears to be
625     ** an empty database */
626     return SQLITE_OK;
627   }
628   if( pPage->nFree!=freeSpace ) goto page_format_error;
629   return SQLITE_OK;
630 
631 page_format_error:
632   return SQLITE_CORRUPT;
633 }
634 
635 /*
636 ** Set up a raw page so that it looks like a database page holding
637 ** no entries.
638 */
zeroPage(Btree * pBt,MemPage * pPage)639 static void zeroPage(Btree *pBt, MemPage *pPage){
640   PageHdr *pHdr;
641   FreeBlk *pFBlk;
642   assert( sqlitepager_iswriteable(pPage) );
643   memset(pPage, 0, SQLITE_USABLE_SIZE);
644   pHdr = &pPage->u.hdr;
645   pHdr->firstCell = 0;
646   pHdr->firstFree = SWAB16(pBt, sizeof(*pHdr));
647   pFBlk = (FreeBlk*)&pHdr[1];
648   pFBlk->iNext = 0;
649   pPage->nFree = SQLITE_USABLE_SIZE - sizeof(*pHdr);
650   pFBlk->iSize = SWAB16(pBt, pPage->nFree);
651   pPage->nCell = 0;
652   pPage->isOverfull = 0;
653 }
654 
655 /*
656 ** This routine is called when the reference count for a page
657 ** reaches zero.  We need to unref the pParent pointer when that
658 ** happens.
659 */
pageDestructor(void * pData)660 static void pageDestructor(void *pData){
661   MemPage *pPage = (MemPage*)pData;
662   if( pPage->pParent ){
663     MemPage *pParent = pPage->pParent;
664     pPage->pParent = 0;
665     sqlitepager_unref(pParent);
666   }
667 }
668 
669 /*
670 ** Open a new database.
671 **
672 ** Actually, this routine just sets up the internal data structures
673 ** for accessing the database.  We do not open the database file
674 ** until the first page is loaded.
675 **
676 ** zFilename is the name of the database file.  If zFilename is NULL
677 ** a new database with a random name is created.  This randomly named
678 ** database file will be deleted when sqliteBtreeClose() is called.
679 */
sqliteBtreeOpen(const char * zFilename,int omitJournal,int nCache,Btree ** ppBtree)680 int sqliteBtreeOpen(
681   const char *zFilename,    /* Name of the file containing the BTree database */
682   int omitJournal,          /* if TRUE then do not journal this file */
683   int nCache,               /* How many pages in the page cache */
684   Btree **ppBtree           /* Pointer to new Btree object written here */
685 ){
686   Btree *pBt;
687   int rc;
688 
689   /*
690   ** The following asserts make sure that structures used by the btree are
691   ** the right size.  This is to guard against size changes that result
692   ** when compiling on a different architecture.
693   */
694   assert( sizeof(u32)==4 );
695   assert( sizeof(u16)==2 );
696   assert( sizeof(Pgno)==4 );
697   assert( sizeof(PageHdr)==8 );
698   assert( sizeof(CellHdr)==12 );
699   assert( sizeof(FreeBlk)==4 );
700   assert( sizeof(OverflowPage)==SQLITE_USABLE_SIZE );
701   assert( sizeof(FreelistInfo)==OVERFLOW_SIZE );
702   assert( sizeof(ptr)==sizeof(char*) );
703   assert( sizeof(uptr)==sizeof(ptr) );
704 
705   pBt = sqliteMalloc( sizeof(*pBt) );
706   if( pBt==0 ){
707     *ppBtree = 0;
708     return SQLITE_NOMEM;
709   }
710   if( nCache<10 ) nCache = 10;
711   rc = sqlitepager_open(&pBt->pPager, zFilename, nCache, EXTRA_SIZE,
712                         !omitJournal);
713   if( rc!=SQLITE_OK ){
714     if( pBt->pPager ) sqlitepager_close(pBt->pPager);
715     sqliteFree(pBt);
716     *ppBtree = 0;
717     return rc;
718   }
719   sqlitepager_set_destructor(pBt->pPager, pageDestructor);
720   pBt->pCursor = 0;
721   pBt->page1 = 0;
722   pBt->readOnly = sqlitepager_isreadonly(pBt->pPager);
723   pBt->pOps = &sqliteBtreeOps;
724   *ppBtree = pBt;
725   return SQLITE_OK;
726 }
727 
728 /*
729 ** Close an open database and invalidate all cursors.
730 */
fileBtreeClose(Btree * pBt)731 static int fileBtreeClose(Btree *pBt){
732   while( pBt->pCursor ){
733     fileBtreeCloseCursor(pBt->pCursor);
734   }
735   sqlitepager_close(pBt->pPager);
736   sqliteFree(pBt);
737   return SQLITE_OK;
738 }
739 
740 /*
741 ** Change the limit on the number of pages allowed in the cache.
742 **
743 ** The maximum number of cache pages is set to the absolute
744 ** value of mxPage.  If mxPage is negative, the pager will
745 ** operate asynchronously - it will not stop to do fsync()s
746 ** to insure data is written to the disk surface before
747 ** continuing.  Transactions still work if synchronous is off,
748 ** and the database cannot be corrupted if this program
749 ** crashes.  But if the operating system crashes or there is
750 ** an abrupt power failure when synchronous is off, the database
751 ** could be left in an inconsistent and unrecoverable state.
752 ** Synchronous is on by default so database corruption is not
753 ** normally a worry.
754 */
fileBtreeSetCacheSize(Btree * pBt,int mxPage)755 static int fileBtreeSetCacheSize(Btree *pBt, int mxPage){
756   sqlitepager_set_cachesize(pBt->pPager, mxPage);
757   return SQLITE_OK;
758 }
759 
760 /*
761 ** Change the way data is synced to disk in order to increase or decrease
762 ** how well the database resists damage due to OS crashes and power
763 ** failures.  Level 1 is the same as asynchronous (no syncs() occur and
764 ** there is a high probability of damage)  Level 2 is the default.  There
765 ** is a very low but non-zero probability of damage.  Level 3 reduces the
766 ** probability of damage to near zero but with a write performance reduction.
767 */
fileBtreeSetSafetyLevel(Btree * pBt,int level)768 static int fileBtreeSetSafetyLevel(Btree *pBt, int level){
769   sqlitepager_set_safety_level(pBt->pPager, level);
770   return SQLITE_OK;
771 }
772 
773 /*
774 ** Get a reference to page1 of the database file.  This will
775 ** also acquire a readlock on that file.
776 **
777 ** SQLITE_OK is returned on success.  If the file is not a
778 ** well-formed database file, then SQLITE_CORRUPT is returned.
779 ** SQLITE_BUSY is returned if the database is locked.  SQLITE_NOMEM
780 ** is returned if we run out of memory.  SQLITE_PROTOCOL is returned
781 ** if there is a locking protocol violation.
782 */
lockBtree(Btree * pBt)783 static int lockBtree(Btree *pBt){
784   int rc;
785   if( pBt->page1 ) return SQLITE_OK;
786   rc = sqlitepager_get(pBt->pPager, 1, (void**)&pBt->page1);
787   if( rc!=SQLITE_OK ) return rc;
788 
789   /* Do some checking to help insure the file we opened really is
790   ** a valid database file.
791   */
792   if( sqlitepager_pagecount(pBt->pPager)>0 ){
793     PageOne *pP1 = pBt->page1;
794     if( strcmp(pP1->zMagic,zMagicHeader)!=0 ||
795           (pP1->iMagic!=MAGIC && swab32(pP1->iMagic)!=MAGIC) ){
796       rc = SQLITE_NOTADB;
797       goto page1_init_failed;
798     }
799     pBt->needSwab = pP1->iMagic!=MAGIC;
800   }
801   return rc;
802 
803 page1_init_failed:
804   sqlitepager_unref(pBt->page1);
805   pBt->page1 = 0;
806   return rc;
807 }
808 
809 /*
810 ** If there are no outstanding cursors and we are not in the middle
811 ** of a transaction but there is a read lock on the database, then
812 ** this routine unrefs the first page of the database file which
813 ** has the effect of releasing the read lock.
814 **
815 ** If there are any outstanding cursors, this routine is a no-op.
816 **
817 ** If there is a transaction in progress, this routine is a no-op.
818 */
unlockBtreeIfUnused(Btree * pBt)819 static void unlockBtreeIfUnused(Btree *pBt){
820   if( pBt->inTrans==0 && pBt->pCursor==0 && pBt->page1!=0 ){
821     sqlitepager_unref(pBt->page1);
822     pBt->page1 = 0;
823     pBt->inTrans = 0;
824     pBt->inCkpt = 0;
825   }
826 }
827 
828 /*
829 ** Create a new database by initializing the first two pages of the
830 ** file.
831 */
newDatabase(Btree * pBt)832 static int newDatabase(Btree *pBt){
833   MemPage *pRoot;
834   PageOne *pP1;
835   int rc;
836   if( sqlitepager_pagecount(pBt->pPager)>1 ) return SQLITE_OK;
837   pP1 = pBt->page1;
838   rc = sqlitepager_write(pBt->page1);
839   if( rc ) return rc;
840   rc = sqlitepager_get(pBt->pPager, 2, (void**)&pRoot);
841   if( rc ) return rc;
842   rc = sqlitepager_write(pRoot);
843   if( rc ){
844     sqlitepager_unref(pRoot);
845     return rc;
846   }
847   strcpy(pP1->zMagic, zMagicHeader);
848   if( btree_native_byte_order ){
849     pP1->iMagic = MAGIC;
850     pBt->needSwab = 0;
851   }else{
852     pP1->iMagic = swab32(MAGIC);
853     pBt->needSwab = 1;
854   }
855   zeroPage(pBt, pRoot);
856   sqlitepager_unref(pRoot);
857   return SQLITE_OK;
858 }
859 
860 /*
861 ** Attempt to start a new transaction.
862 **
863 ** A transaction must be started before attempting any changes
864 ** to the database.  None of the following routines will work
865 ** unless a transaction is started first:
866 **
867 **      sqliteBtreeCreateTable()
868 **      sqliteBtreeCreateIndex()
869 **      sqliteBtreeClearTable()
870 **      sqliteBtreeDropTable()
871 **      sqliteBtreeInsert()
872 **      sqliteBtreeDelete()
873 **      sqliteBtreeUpdateMeta()
874 */
fileBtreeBeginTrans(Btree * pBt)875 static int fileBtreeBeginTrans(Btree *pBt){
876   int rc;
877   if( pBt->inTrans ) return SQLITE_ERROR;
878   if( pBt->readOnly ) return SQLITE_READONLY;
879   if( pBt->page1==0 ){
880     rc = lockBtree(pBt);
881     if( rc!=SQLITE_OK ){
882       return rc;
883     }
884   }
885   rc = sqlitepager_begin(pBt->page1);
886   if( rc==SQLITE_OK ){
887     rc = newDatabase(pBt);
888   }
889   if( rc==SQLITE_OK ){
890     pBt->inTrans = 1;
891     pBt->inCkpt = 0;
892   }else{
893     unlockBtreeIfUnused(pBt);
894   }
895   return rc;
896 }
897 
898 /*
899 ** Commit the transaction currently in progress.
900 **
901 ** This will release the write lock on the database file.  If there
902 ** are no active cursors, it also releases the read lock.
903 */
fileBtreeCommit(Btree * pBt)904 static int fileBtreeCommit(Btree *pBt){
905   int rc;
906   rc = pBt->readOnly ? SQLITE_OK : sqlitepager_commit(pBt->pPager);
907   pBt->inTrans = 0;
908   pBt->inCkpt = 0;
909   unlockBtreeIfUnused(pBt);
910   return rc;
911 }
912 
913 /*
914 ** Rollback the transaction in progress.  All cursors will be
915 ** invalided by this operation.  Any attempt to use a cursor
916 ** that was open at the beginning of this operation will result
917 ** in an error.
918 **
919 ** This will release the write lock on the database file.  If there
920 ** are no active cursors, it also releases the read lock.
921 */
fileBtreeRollback(Btree * pBt)922 static int fileBtreeRollback(Btree *pBt){
923   int rc;
924   BtCursor *pCur;
925   if( pBt->inTrans==0 ) return SQLITE_OK;
926   pBt->inTrans = 0;
927   pBt->inCkpt = 0;
928   rc = pBt->readOnly ? SQLITE_OK : sqlitepager_rollback(pBt->pPager);
929   for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
930     if( pCur->pPage && pCur->pPage->isInit==0 ){
931       sqlitepager_unref(pCur->pPage);
932       pCur->pPage = 0;
933     }
934   }
935   unlockBtreeIfUnused(pBt);
936   return rc;
937 }
938 
939 /*
940 ** Set the checkpoint for the current transaction.  The checkpoint serves
941 ** as a sub-transaction that can be rolled back independently of the
942 ** main transaction.  You must start a transaction before starting a
943 ** checkpoint.  The checkpoint is ended automatically if the transaction
944 ** commits or rolls back.
945 **
946 ** Only one checkpoint may be active at a time.  It is an error to try
947 ** to start a new checkpoint if another checkpoint is already active.
948 */
fileBtreeBeginCkpt(Btree * pBt)949 static int fileBtreeBeginCkpt(Btree *pBt){
950   int rc;
951   if( !pBt->inTrans || pBt->inCkpt ){
952     return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
953   }
954   rc = pBt->readOnly ? SQLITE_OK : sqlitepager_ckpt_begin(pBt->pPager);
955   pBt->inCkpt = 1;
956   return rc;
957 }
958 
959 
960 /*
961 ** Commit a checkpoint to transaction currently in progress.  If no
962 ** checkpoint is active, this is a no-op.
963 */
fileBtreeCommitCkpt(Btree * pBt)964 static int fileBtreeCommitCkpt(Btree *pBt){
965   int rc;
966   if( pBt->inCkpt && !pBt->readOnly ){
967     rc = sqlitepager_ckpt_commit(pBt->pPager);
968   }else{
969     rc = SQLITE_OK;
970   }
971   pBt->inCkpt = 0;
972   return rc;
973 }
974 
975 /*
976 ** Rollback the checkpoint to the current transaction.  If there
977 ** is no active checkpoint or transaction, this routine is a no-op.
978 **
979 ** All cursors will be invalided by this operation.  Any attempt
980 ** to use a cursor that was open at the beginning of this operation
981 ** will result in an error.
982 */
fileBtreeRollbackCkpt(Btree * pBt)983 static int fileBtreeRollbackCkpt(Btree *pBt){
984   int rc;
985   BtCursor *pCur;
986   if( pBt->inCkpt==0 || pBt->readOnly ) return SQLITE_OK;
987   rc = sqlitepager_ckpt_rollback(pBt->pPager);
988   for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
989     if( pCur->pPage && pCur->pPage->isInit==0 ){
990       sqlitepager_unref(pCur->pPage);
991       pCur->pPage = 0;
992     }
993   }
994   pBt->inCkpt = 0;
995   return rc;
996 }
997 
998 /*
999 ** Create a new cursor for the BTree whose root is on the page
1000 ** iTable.  The act of acquiring a cursor gets a read lock on
1001 ** the database file.
1002 **
1003 ** If wrFlag==0, then the cursor can only be used for reading.
1004 ** If wrFlag==1, then the cursor can be used for reading or for
1005 ** writing if other conditions for writing are also met.  These
1006 ** are the conditions that must be met in order for writing to
1007 ** be allowed:
1008 **
1009 ** 1:  The cursor must have been opened with wrFlag==1
1010 **
1011 ** 2:  No other cursors may be open with wrFlag==0 on the same table
1012 **
1013 ** 3:  The database must be writable (not on read-only media)
1014 **
1015 ** 4:  There must be an active transaction.
1016 **
1017 ** Condition 2 warrants further discussion.  If any cursor is opened
1018 ** on a table with wrFlag==0, that prevents all other cursors from
1019 ** writing to that table.  This is a kind of "read-lock".  When a cursor
1020 ** is opened with wrFlag==0 it is guaranteed that the table will not
1021 ** change as long as the cursor is open.  This allows the cursor to
1022 ** do a sequential scan of the table without having to worry about
1023 ** entries being inserted or deleted during the scan.  Cursors should
1024 ** be opened with wrFlag==0 only if this read-lock property is needed.
1025 ** That is to say, cursors should be opened with wrFlag==0 only if they
1026 ** intend to use the sqliteBtreeNext() system call.  All other cursors
1027 ** should be opened with wrFlag==1 even if they never really intend
1028 ** to write.
1029 **
1030 ** No checking is done to make sure that page iTable really is the
1031 ** root page of a b-tree.  If it is not, then the cursor acquired
1032 ** will not work correctly.
1033 */
1034 static
fileBtreeCursor(Btree * pBt,int iTable,int wrFlag,BtCursor ** ppCur)1035 int fileBtreeCursor(Btree *pBt, int iTable, int wrFlag, BtCursor **ppCur){
1036   int rc;
1037   BtCursor *pCur, *pRing;
1038 
1039   if( pBt->readOnly && wrFlag ){
1040     *ppCur = 0;
1041     return SQLITE_READONLY;
1042   }
1043   if( pBt->page1==0 ){
1044     rc = lockBtree(pBt);
1045     if( rc!=SQLITE_OK ){
1046       *ppCur = 0;
1047       return rc;
1048     }
1049   }
1050   pCur = sqliteMalloc( sizeof(*pCur) );
1051   if( pCur==0 ){
1052     rc = SQLITE_NOMEM;
1053     goto create_cursor_exception;
1054   }
1055   pCur->pgnoRoot = (Pgno)iTable;
1056   rc = sqlitepager_get(pBt->pPager, pCur->pgnoRoot, (void**)&pCur->pPage);
1057   if( rc!=SQLITE_OK ){
1058     goto create_cursor_exception;
1059   }
1060   rc = initPage(pBt, pCur->pPage, pCur->pgnoRoot, 0);
1061   if( rc!=SQLITE_OK ){
1062     goto create_cursor_exception;
1063   }
1064   pCur->pOps = &sqliteBtreeCursorOps;
1065   pCur->pBt = pBt;
1066   pCur->wrFlag = wrFlag;
1067   pCur->idx = 0;
1068   pCur->eSkip = SKIP_INVALID;
1069   pCur->pNext = pBt->pCursor;
1070   if( pCur->pNext ){
1071     pCur->pNext->pPrev = pCur;
1072   }
1073   pCur->pPrev = 0;
1074   pRing = pBt->pCursor;
1075   while( pRing && pRing->pgnoRoot!=pCur->pgnoRoot ){ pRing = pRing->pNext; }
1076   if( pRing ){
1077     pCur->pShared = pRing->pShared;
1078     pRing->pShared = pCur;
1079   }else{
1080     pCur->pShared = pCur;
1081   }
1082   pBt->pCursor = pCur;
1083   *ppCur = pCur;
1084   return SQLITE_OK;
1085 
1086 create_cursor_exception:
1087   *ppCur = 0;
1088   if( pCur ){
1089     if( pCur->pPage ) sqlitepager_unref(pCur->pPage);
1090     sqliteFree(pCur);
1091   }
1092   unlockBtreeIfUnused(pBt);
1093   return rc;
1094 }
1095 
1096 /*
1097 ** Close a cursor.  The read lock on the database file is released
1098 ** when the last cursor is closed.
1099 */
fileBtreeCloseCursor(BtCursor * pCur)1100 static int fileBtreeCloseCursor(BtCursor *pCur){
1101   Btree *pBt = pCur->pBt;
1102   if( pCur->pPrev ){
1103     pCur->pPrev->pNext = pCur->pNext;
1104   }else{
1105     pBt->pCursor = pCur->pNext;
1106   }
1107   if( pCur->pNext ){
1108     pCur->pNext->pPrev = pCur->pPrev;
1109   }
1110   if( pCur->pPage ){
1111     sqlitepager_unref(pCur->pPage);
1112   }
1113   if( pCur->pShared!=pCur ){
1114     BtCursor *pRing = pCur->pShared;
1115     while( pRing->pShared!=pCur ){ pRing = pRing->pShared; }
1116     pRing->pShared = pCur->pShared;
1117   }
1118   unlockBtreeIfUnused(pBt);
1119   sqliteFree(pCur);
1120   return SQLITE_OK;
1121 }
1122 
1123 /*
1124 ** Make a temporary cursor by filling in the fields of pTempCur.
1125 ** The temporary cursor is not on the cursor list for the Btree.
1126 */
getTempCursor(BtCursor * pCur,BtCursor * pTempCur)1127 static void getTempCursor(BtCursor *pCur, BtCursor *pTempCur){
1128   memcpy(pTempCur, pCur, sizeof(*pCur));
1129   pTempCur->pNext = 0;
1130   pTempCur->pPrev = 0;
1131   if( pTempCur->pPage ){
1132     sqlitepager_ref(pTempCur->pPage);
1133   }
1134 }
1135 
1136 /*
1137 ** Delete a temporary cursor such as was made by the CreateTemporaryCursor()
1138 ** function above.
1139 */
releaseTempCursor(BtCursor * pCur)1140 static void releaseTempCursor(BtCursor *pCur){
1141   if( pCur->pPage ){
1142     sqlitepager_unref(pCur->pPage);
1143   }
1144 }
1145 
1146 /*
1147 ** Set *pSize to the number of bytes of key in the entry the
1148 ** cursor currently points to.  Always return SQLITE_OK.
1149 ** Failure is not possible.  If the cursor is not currently
1150 ** pointing to an entry (which can happen, for example, if
1151 ** the database is empty) then *pSize is set to 0.
1152 */
fileBtreeKeySize(BtCursor * pCur,int * pSize)1153 static int fileBtreeKeySize(BtCursor *pCur, int *pSize){
1154   Cell *pCell;
1155   MemPage *pPage;
1156 
1157   pPage = pCur->pPage;
1158   assert( pPage!=0 );
1159   if( pCur->idx >= pPage->nCell ){
1160     *pSize = 0;
1161   }else{
1162     pCell = pPage->apCell[pCur->idx];
1163     *pSize = NKEY(pCur->pBt, pCell->h);
1164   }
1165   return SQLITE_OK;
1166 }
1167 
1168 /*
1169 ** Read payload information from the entry that the pCur cursor is
1170 ** pointing to.  Begin reading the payload at "offset" and read
1171 ** a total of "amt" bytes.  Put the result in zBuf.
1172 **
1173 ** This routine does not make a distinction between key and data.
1174 ** It just reads bytes from the payload area.
1175 */
getPayload(BtCursor * pCur,int offset,int amt,char * zBuf)1176 static int getPayload(BtCursor *pCur, int offset, int amt, char *zBuf){
1177   char *aPayload;
1178   Pgno nextPage;
1179   int rc;
1180   Btree *pBt = pCur->pBt;
1181   assert( pCur!=0 && pCur->pPage!=0 );
1182   assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell );
1183   aPayload = pCur->pPage->apCell[pCur->idx]->aPayload;
1184   if( offset<MX_LOCAL_PAYLOAD ){
1185     int a = amt;
1186     if( a+offset>MX_LOCAL_PAYLOAD ){
1187       a = MX_LOCAL_PAYLOAD - offset;
1188     }
1189     memcpy(zBuf, &aPayload[offset], a);
1190     if( a==amt ){
1191       return SQLITE_OK;
1192     }
1193     offset = 0;
1194     zBuf += a;
1195     amt -= a;
1196   }else{
1197     offset -= MX_LOCAL_PAYLOAD;
1198   }
1199   if( amt>0 ){
1200     nextPage = SWAB32(pBt, pCur->pPage->apCell[pCur->idx]->ovfl);
1201   }
1202   while( amt>0 && nextPage ){
1203     OverflowPage *pOvfl;
1204     rc = sqlitepager_get(pBt->pPager, nextPage, (void**)&pOvfl);
1205     if( rc!=0 ){
1206       return rc;
1207     }
1208     nextPage = SWAB32(pBt, pOvfl->iNext);
1209     if( offset<OVERFLOW_SIZE ){
1210       int a = amt;
1211       if( a + offset > OVERFLOW_SIZE ){
1212         a = OVERFLOW_SIZE - offset;
1213       }
1214       memcpy(zBuf, &pOvfl->aPayload[offset], a);
1215       offset = 0;
1216       amt -= a;
1217       zBuf += a;
1218     }else{
1219       offset -= OVERFLOW_SIZE;
1220     }
1221     sqlitepager_unref(pOvfl);
1222   }
1223   if( amt>0 ){
1224     return SQLITE_CORRUPT;
1225   }
1226   return SQLITE_OK;
1227 }
1228 
1229 /*
1230 ** Read part of the key associated with cursor pCur.  A maximum
1231 ** of "amt" bytes will be transfered into zBuf[].  The transfer
1232 ** begins at "offset".  The number of bytes actually read is
1233 ** returned.
1234 **
1235 ** Change:  It used to be that the amount returned will be smaller
1236 ** than the amount requested if there are not enough bytes in the key
1237 ** to satisfy the request.  But now, it must be the case that there
1238 ** is enough data available to satisfy the request.  If not, an exception
1239 ** is raised.  The change was made in an effort to boost performance
1240 ** by eliminating unneeded tests.
1241 */
fileBtreeKey(BtCursor * pCur,int offset,int amt,char * zBuf)1242 static int fileBtreeKey(BtCursor *pCur, int offset, int amt, char *zBuf){
1243   MemPage *pPage;
1244 
1245   assert( amt>=0 );
1246   assert( offset>=0 );
1247   assert( pCur->pPage!=0 );
1248   pPage = pCur->pPage;
1249   if( pCur->idx >= pPage->nCell ){
1250     return 0;
1251   }
1252   assert( amt+offset <= NKEY(pCur->pBt, pPage->apCell[pCur->idx]->h) );
1253   getPayload(pCur, offset, amt, zBuf);
1254   return amt;
1255 }
1256 
1257 /*
1258 ** Set *pSize to the number of bytes of data in the entry the
1259 ** cursor currently points to.  Always return SQLITE_OK.
1260 ** Failure is not possible.  If the cursor is not currently
1261 ** pointing to an entry (which can happen, for example, if
1262 ** the database is empty) then *pSize is set to 0.
1263 */
fileBtreeDataSize(BtCursor * pCur,int * pSize)1264 static int fileBtreeDataSize(BtCursor *pCur, int *pSize){
1265   Cell *pCell;
1266   MemPage *pPage;
1267 
1268   pPage = pCur->pPage;
1269   assert( pPage!=0 );
1270   if( pCur->idx >= pPage->nCell ){
1271     *pSize = 0;
1272   }else{
1273     pCell = pPage->apCell[pCur->idx];
1274     *pSize = NDATA(pCur->pBt, pCell->h);
1275   }
1276   return SQLITE_OK;
1277 }
1278 
1279 /*
1280 ** Read part of the data associated with cursor pCur.  A maximum
1281 ** of "amt" bytes will be transfered into zBuf[].  The transfer
1282 ** begins at "offset".  The number of bytes actually read is
1283 ** returned.  The amount returned will be smaller than the
1284 ** amount requested if there are not enough bytes in the data
1285 ** to satisfy the request.
1286 */
fileBtreeData(BtCursor * pCur,int offset,int amt,char * zBuf)1287 static int fileBtreeData(BtCursor *pCur, int offset, int amt, char *zBuf){
1288   Cell *pCell;
1289   MemPage *pPage;
1290 
1291   assert( amt>=0 );
1292   assert( offset>=0 );
1293   assert( pCur->pPage!=0 );
1294   pPage = pCur->pPage;
1295   if( pCur->idx >= pPage->nCell ){
1296     return 0;
1297   }
1298   pCell = pPage->apCell[pCur->idx];
1299   assert( amt+offset <= NDATA(pCur->pBt, pCell->h) );
1300   getPayload(pCur, offset + NKEY(pCur->pBt, pCell->h), amt, zBuf);
1301   return amt;
1302 }
1303 
1304 /*
1305 ** Compare an external key against the key on the entry that pCur points to.
1306 **
1307 ** The external key is pKey and is nKey bytes long.  The last nIgnore bytes
1308 ** of the key associated with pCur are ignored, as if they do not exist.
1309 ** (The normal case is for nIgnore to be zero in which case the entire
1310 ** internal key is used in the comparison.)
1311 **
1312 ** The comparison result is written to *pRes as follows:
1313 **
1314 **    *pRes<0    This means pCur<pKey
1315 **
1316 **    *pRes==0   This means pCur==pKey for all nKey bytes
1317 **
1318 **    *pRes>0    This means pCur>pKey
1319 **
1320 ** When one key is an exact prefix of the other, the shorter key is
1321 ** considered less than the longer one.  In order to be equal the
1322 ** keys must be exactly the same length. (The length of the pCur key
1323 ** is the actual key length minus nIgnore bytes.)
1324 */
fileBtreeKeyCompare(BtCursor * pCur,const void * pKey,int nKey,int nIgnore,int * pResult)1325 static int fileBtreeKeyCompare(
1326   BtCursor *pCur,       /* Pointer to entry to compare against */
1327   const void *pKey,     /* Key to compare against entry that pCur points to */
1328   int nKey,             /* Number of bytes in pKey */
1329   int nIgnore,          /* Ignore this many bytes at the end of pCur */
1330   int *pResult          /* Write the result here */
1331 ){
1332   Pgno nextPage;
1333   int n, c, rc, nLocal;
1334   Cell *pCell;
1335   Btree *pBt = pCur->pBt;
1336   const char *zKey  = (const char*)pKey;
1337 
1338   assert( pCur->pPage );
1339   assert( pCur->idx>=0 && pCur->idx<pCur->pPage->nCell );
1340   pCell = pCur->pPage->apCell[pCur->idx];
1341   nLocal = NKEY(pBt, pCell->h) - nIgnore;
1342   if( nLocal<0 ) nLocal = 0;
1343   n = nKey<nLocal ? nKey : nLocal;
1344   if( n>MX_LOCAL_PAYLOAD ){
1345     n = MX_LOCAL_PAYLOAD;
1346   }
1347   c = memcmp(pCell->aPayload, zKey, n);
1348   if( c!=0 ){
1349     *pResult = c;
1350     return SQLITE_OK;
1351   }
1352   zKey += n;
1353   nKey -= n;
1354   nLocal -= n;
1355   nextPage = SWAB32(pBt, pCell->ovfl);
1356   while( nKey>0 && nLocal>0 ){
1357     OverflowPage *pOvfl;
1358     if( nextPage==0 ){
1359       return SQLITE_CORRUPT;
1360     }
1361     rc = sqlitepager_get(pBt->pPager, nextPage, (void**)&pOvfl);
1362     if( rc ){
1363       return rc;
1364     }
1365     nextPage = SWAB32(pBt, pOvfl->iNext);
1366     n = nKey<nLocal ? nKey : nLocal;
1367     if( n>OVERFLOW_SIZE ){
1368       n = OVERFLOW_SIZE;
1369     }
1370     c = memcmp(pOvfl->aPayload, zKey, n);
1371     sqlitepager_unref(pOvfl);
1372     if( c!=0 ){
1373       *pResult = c;
1374       return SQLITE_OK;
1375     }
1376     nKey -= n;
1377     nLocal -= n;
1378     zKey += n;
1379   }
1380   if( c==0 ){
1381     c = nLocal - nKey;
1382   }
1383   *pResult = c;
1384   return SQLITE_OK;
1385 }
1386 
1387 /*
1388 ** Move the cursor down to a new child page.  The newPgno argument is the
1389 ** page number of the child page in the byte order of the disk image.
1390 */
moveToChild(BtCursor * pCur,int newPgno)1391 static int moveToChild(BtCursor *pCur, int newPgno){
1392   int rc;
1393   MemPage *pNewPage;
1394   Btree *pBt = pCur->pBt;
1395 
1396   newPgno = SWAB32(pBt, newPgno);
1397   rc = sqlitepager_get(pBt->pPager, newPgno, (void**)&pNewPage);
1398   if( rc ) return rc;
1399   rc = initPage(pBt, pNewPage, newPgno, pCur->pPage);
1400   if( rc ) return rc;
1401   assert( pCur->idx>=pCur->pPage->nCell
1402           || pCur->pPage->apCell[pCur->idx]->h.leftChild==SWAB32(pBt,newPgno) );
1403   assert( pCur->idx<pCur->pPage->nCell
1404           || pCur->pPage->u.hdr.rightChild==SWAB32(pBt,newPgno) );
1405   pNewPage->idxParent = pCur->idx;
1406   pCur->pPage->idxShift = 0;
1407   sqlitepager_unref(pCur->pPage);
1408   pCur->pPage = pNewPage;
1409   pCur->idx = 0;
1410   if( pNewPage->nCell<1 ){
1411     return SQLITE_CORRUPT;
1412   }
1413   return SQLITE_OK;
1414 }
1415 
1416 /*
1417 ** Move the cursor up to the parent page.
1418 **
1419 ** pCur->idx is set to the cell index that contains the pointer
1420 ** to the page we are coming from.  If we are coming from the
1421 ** right-most child page then pCur->idx is set to one more than
1422 ** the largest cell index.
1423 */
moveToParent(BtCursor * pCur)1424 static void moveToParent(BtCursor *pCur){
1425   Pgno oldPgno;
1426   MemPage *pParent;
1427   MemPage *pPage;
1428   int idxParent;
1429   pPage = pCur->pPage;
1430   assert( pPage!=0 );
1431   pParent = pPage->pParent;
1432   assert( pParent!=0 );
1433   idxParent = pPage->idxParent;
1434   sqlitepager_ref(pParent);
1435   sqlitepager_unref(pPage);
1436   pCur->pPage = pParent;
1437   assert( pParent->idxShift==0 );
1438   if( pParent->idxShift==0 ){
1439     pCur->idx = idxParent;
1440 #ifndef NDEBUG
1441     /* Verify that pCur->idx is the correct index to point back to the child
1442     ** page we just came from
1443     */
1444     oldPgno = SWAB32(pCur->pBt, sqlitepager_pagenumber(pPage));
1445     if( pCur->idx<pParent->nCell ){
1446       assert( pParent->apCell[idxParent]->h.leftChild==oldPgno );
1447     }else{
1448       assert( pParent->u.hdr.rightChild==oldPgno );
1449     }
1450 #endif
1451   }else{
1452     /* The MemPage.idxShift flag indicates that cell indices might have
1453     ** changed since idxParent was set and hence idxParent might be out
1454     ** of date.  So recompute the parent cell index by scanning all cells
1455     ** and locating the one that points to the child we just came from.
1456     */
1457     int i;
1458     pCur->idx = pParent->nCell;
1459     oldPgno = SWAB32(pCur->pBt, sqlitepager_pagenumber(pPage));
1460     for(i=0; i<pParent->nCell; i++){
1461       if( pParent->apCell[i]->h.leftChild==oldPgno ){
1462         pCur->idx = i;
1463         break;
1464       }
1465     }
1466   }
1467 }
1468 
1469 /*
1470 ** Move the cursor to the root page
1471 */
moveToRoot(BtCursor * pCur)1472 static int moveToRoot(BtCursor *pCur){
1473   MemPage *pNew;
1474   int rc;
1475   Btree *pBt = pCur->pBt;
1476 
1477   rc = sqlitepager_get(pBt->pPager, pCur->pgnoRoot, (void**)&pNew);
1478   if( rc ) return rc;
1479   rc = initPage(pBt, pNew, pCur->pgnoRoot, 0);
1480   if( rc ) return rc;
1481   sqlitepager_unref(pCur->pPage);
1482   pCur->pPage = pNew;
1483   pCur->idx = 0;
1484   return SQLITE_OK;
1485 }
1486 
1487 /*
1488 ** Move the cursor down to the left-most leaf entry beneath the
1489 ** entry to which it is currently pointing.
1490 */
moveToLeftmost(BtCursor * pCur)1491 static int moveToLeftmost(BtCursor *pCur){
1492   Pgno pgno;
1493   int rc;
1494 
1495   while( (pgno = pCur->pPage->apCell[pCur->idx]->h.leftChild)!=0 ){
1496     rc = moveToChild(pCur, pgno);
1497     if( rc ) return rc;
1498   }
1499   return SQLITE_OK;
1500 }
1501 
1502 /*
1503 ** Move the cursor down to the right-most leaf entry beneath the
1504 ** page to which it is currently pointing.  Notice the difference
1505 ** between moveToLeftmost() and moveToRightmost().  moveToLeftmost()
1506 ** finds the left-most entry beneath the *entry* whereas moveToRightmost()
1507 ** finds the right-most entry beneath the *page*.
1508 */
moveToRightmost(BtCursor * pCur)1509 static int moveToRightmost(BtCursor *pCur){
1510   Pgno pgno;
1511   int rc;
1512 
1513   while( (pgno = pCur->pPage->u.hdr.rightChild)!=0 ){
1514     pCur->idx = pCur->pPage->nCell;
1515     rc = moveToChild(pCur, pgno);
1516     if( rc ) return rc;
1517   }
1518   pCur->idx = pCur->pPage->nCell - 1;
1519   return SQLITE_OK;
1520 }
1521 
1522 /* Move the cursor to the first entry in the table.  Return SQLITE_OK
1523 ** on success.  Set *pRes to 0 if the cursor actually points to something
1524 ** or set *pRes to 1 if the table is empty.
1525 */
fileBtreeFirst(BtCursor * pCur,int * pRes)1526 static int fileBtreeFirst(BtCursor *pCur, int *pRes){
1527   int rc;
1528   if( pCur->pPage==0 ) return SQLITE_ABORT;
1529   rc = moveToRoot(pCur);
1530   if( rc ) return rc;
1531   if( pCur->pPage->nCell==0 ){
1532     *pRes = 1;
1533     return SQLITE_OK;
1534   }
1535   *pRes = 0;
1536   rc = moveToLeftmost(pCur);
1537   pCur->eSkip = SKIP_NONE;
1538   return rc;
1539 }
1540 
1541 /* Move the cursor to the last entry in the table.  Return SQLITE_OK
1542 ** on success.  Set *pRes to 0 if the cursor actually points to something
1543 ** or set *pRes to 1 if the table is empty.
1544 */
fileBtreeLast(BtCursor * pCur,int * pRes)1545 static int fileBtreeLast(BtCursor *pCur, int *pRes){
1546   int rc;
1547   if( pCur->pPage==0 ) return SQLITE_ABORT;
1548   rc = moveToRoot(pCur);
1549   if( rc ) return rc;
1550   assert( pCur->pPage->isInit );
1551   if( pCur->pPage->nCell==0 ){
1552     *pRes = 1;
1553     return SQLITE_OK;
1554   }
1555   *pRes = 0;
1556   rc = moveToRightmost(pCur);
1557   pCur->eSkip = SKIP_NONE;
1558   return rc;
1559 }
1560 
1561 /* Move the cursor so that it points to an entry near pKey.
1562 ** Return a success code.
1563 **
1564 ** If an exact match is not found, then the cursor is always
1565 ** left pointing at a leaf page which would hold the entry if it
1566 ** were present.  The cursor might point to an entry that comes
1567 ** before or after the key.
1568 **
1569 ** The result of comparing the key with the entry to which the
1570 ** cursor is left pointing is stored in pCur->iMatch.  The same
1571 ** value is also written to *pRes if pRes!=NULL.  The meaning of
1572 ** this value is as follows:
1573 **
1574 **     *pRes<0      The cursor is left pointing at an entry that
1575 **                  is smaller than pKey or if the table is empty
1576 **                  and the cursor is therefore left point to nothing.
1577 **
1578 **     *pRes==0     The cursor is left pointing at an entry that
1579 **                  exactly matches pKey.
1580 **
1581 **     *pRes>0      The cursor is left pointing at an entry that
1582 **                  is larger than pKey.
1583 */
1584 static
fileBtreeMoveto(BtCursor * pCur,const void * pKey,int nKey,int * pRes)1585 int fileBtreeMoveto(BtCursor *pCur, const void *pKey, int nKey, int *pRes){
1586   int rc;
1587   if( pCur->pPage==0 ) return SQLITE_ABORT;
1588   pCur->eSkip = SKIP_NONE;
1589   rc = moveToRoot(pCur);
1590   if( rc ) return rc;
1591   for(;;){
1592     int lwr, upr;
1593     Pgno chldPg;
1594     MemPage *pPage = pCur->pPage;
1595     int c = -1;  /* pRes return if table is empty must be -1 */
1596     lwr = 0;
1597     upr = pPage->nCell-1;
1598     while( lwr<=upr ){
1599       pCur->idx = (lwr+upr)/2;
1600       rc = fileBtreeKeyCompare(pCur, pKey, nKey, 0, &c);
1601       if( rc ) return rc;
1602       if( c==0 ){
1603         pCur->iMatch = c;
1604         if( pRes ) *pRes = 0;
1605         return SQLITE_OK;
1606       }
1607       if( c<0 ){
1608         lwr = pCur->idx+1;
1609       }else{
1610         upr = pCur->idx-1;
1611       }
1612     }
1613     assert( lwr==upr+1 );
1614     assert( pPage->isInit );
1615     if( lwr>=pPage->nCell ){
1616       chldPg = pPage->u.hdr.rightChild;
1617     }else{
1618       chldPg = pPage->apCell[lwr]->h.leftChild;
1619     }
1620     if( chldPg==0 ){
1621       pCur->iMatch = c;
1622       if( pRes ) *pRes = c;
1623       return SQLITE_OK;
1624     }
1625     pCur->idx = lwr;
1626     rc = moveToChild(pCur, chldPg);
1627     if( rc ) return rc;
1628   }
1629   /* NOT REACHED */
1630 }
1631 
1632 /*
1633 ** Advance the cursor to the next entry in the database.  If
1634 ** successful then set *pRes=0.  If the cursor
1635 ** was already pointing to the last entry in the database before
1636 ** this routine was called, then set *pRes=1.
1637 */
fileBtreeNext(BtCursor * pCur,int * pRes)1638 static int fileBtreeNext(BtCursor *pCur, int *pRes){
1639   int rc;
1640   MemPage *pPage = pCur->pPage;
1641   assert( pRes!=0 );
1642   if( pPage==0 ){
1643     *pRes = 1;
1644     return SQLITE_ABORT;
1645   }
1646   assert( pPage->isInit );
1647   assert( pCur->eSkip!=SKIP_INVALID );
1648   if( pPage->nCell==0 ){
1649     *pRes = 1;
1650     return SQLITE_OK;
1651   }
1652   assert( pCur->idx<pPage->nCell );
1653   if( pCur->eSkip==SKIP_NEXT ){
1654     pCur->eSkip = SKIP_NONE;
1655     *pRes = 0;
1656     return SQLITE_OK;
1657   }
1658   pCur->eSkip = SKIP_NONE;
1659   pCur->idx++;
1660   if( pCur->idx>=pPage->nCell ){
1661     if( pPage->u.hdr.rightChild ){
1662       rc = moveToChild(pCur, pPage->u.hdr.rightChild);
1663       if( rc ) return rc;
1664       rc = moveToLeftmost(pCur);
1665       *pRes = 0;
1666       return rc;
1667     }
1668     do{
1669       if( pPage->pParent==0 ){
1670         *pRes = 1;
1671         return SQLITE_OK;
1672       }
1673       moveToParent(pCur);
1674       pPage = pCur->pPage;
1675     }while( pCur->idx>=pPage->nCell );
1676     *pRes = 0;
1677     return SQLITE_OK;
1678   }
1679   *pRes = 0;
1680   if( pPage->u.hdr.rightChild==0 ){
1681     return SQLITE_OK;
1682   }
1683   rc = moveToLeftmost(pCur);
1684   return rc;
1685 }
1686 
1687 /*
1688 ** Step the cursor to the back to the previous entry in the database.  If
1689 ** successful then set *pRes=0.  If the cursor
1690 ** was already pointing to the first entry in the database before
1691 ** this routine was called, then set *pRes=1.
1692 */
fileBtreePrevious(BtCursor * pCur,int * pRes)1693 static int fileBtreePrevious(BtCursor *pCur, int *pRes){
1694   int rc;
1695   Pgno pgno;
1696   MemPage *pPage;
1697   pPage = pCur->pPage;
1698   if( pPage==0 ){
1699     *pRes = 1;
1700     return SQLITE_ABORT;
1701   }
1702   assert( pPage->isInit );
1703   assert( pCur->eSkip!=SKIP_INVALID );
1704   if( pPage->nCell==0 ){
1705     *pRes = 1;
1706     return SQLITE_OK;
1707   }
1708   if( pCur->eSkip==SKIP_PREV ){
1709     pCur->eSkip = SKIP_NONE;
1710     *pRes = 0;
1711     return SQLITE_OK;
1712   }
1713   pCur->eSkip = SKIP_NONE;
1714   assert( pCur->idx>=0 );
1715   if( (pgno = pPage->apCell[pCur->idx]->h.leftChild)!=0 ){
1716     rc = moveToChild(pCur, pgno);
1717     if( rc ) return rc;
1718     rc = moveToRightmost(pCur);
1719   }else{
1720     while( pCur->idx==0 ){
1721       if( pPage->pParent==0 ){
1722         if( pRes ) *pRes = 1;
1723         return SQLITE_OK;
1724       }
1725       moveToParent(pCur);
1726       pPage = pCur->pPage;
1727     }
1728     pCur->idx--;
1729     rc = SQLITE_OK;
1730   }
1731   *pRes = 0;
1732   return rc;
1733 }
1734 
1735 /*
1736 ** Allocate a new page from the database file.
1737 **
1738 ** The new page is marked as dirty.  (In other words, sqlitepager_write()
1739 ** has already been called on the new page.)  The new page has also
1740 ** been referenced and the calling routine is responsible for calling
1741 ** sqlitepager_unref() on the new page when it is done.
1742 **
1743 ** SQLITE_OK is returned on success.  Any other return value indicates
1744 ** an error.  *ppPage and *pPgno are undefined in the event of an error.
1745 ** Do not invoke sqlitepager_unref() on *ppPage if an error is returned.
1746 **
1747 ** If the "nearby" parameter is not 0, then a (feeble) effort is made to
1748 ** locate a page close to the page number "nearby".  This can be used in an
1749 ** attempt to keep related pages close to each other in the database file,
1750 ** which in turn can make database access faster.
1751 */
allocatePage(Btree * pBt,MemPage ** ppPage,Pgno * pPgno,Pgno nearby)1752 static int allocatePage(Btree *pBt, MemPage **ppPage, Pgno *pPgno, Pgno nearby){
1753   PageOne *pPage1 = pBt->page1;
1754   int rc;
1755   if( pPage1->freeList ){
1756     OverflowPage *pOvfl;
1757     FreelistInfo *pInfo;
1758 
1759     rc = sqlitepager_write(pPage1);
1760     if( rc ) return rc;
1761     SWAB_ADD(pBt, pPage1->nFree, -1);
1762     rc = sqlitepager_get(pBt->pPager, SWAB32(pBt, pPage1->freeList),
1763                         (void**)&pOvfl);
1764     if( rc ) return rc;
1765     rc = sqlitepager_write(pOvfl);
1766     if( rc ){
1767       sqlitepager_unref(pOvfl);
1768       return rc;
1769     }
1770     pInfo = (FreelistInfo*)pOvfl->aPayload;
1771     if( pInfo->nFree==0 ){
1772       *pPgno = SWAB32(pBt, pPage1->freeList);
1773       pPage1->freeList = pOvfl->iNext;
1774       *ppPage = (MemPage*)pOvfl;
1775     }else{
1776       int closest, n;
1777       n = SWAB32(pBt, pInfo->nFree);
1778       if( n>1 && nearby>0 ){
1779         int i, dist;
1780         closest = 0;
1781         dist = SWAB32(pBt, pInfo->aFree[0]) - nearby;
1782         if( dist<0 ) dist = -dist;
1783         for(i=1; i<n; i++){
1784           int d2 = SWAB32(pBt, pInfo->aFree[i]) - nearby;
1785           if( d2<0 ) d2 = -d2;
1786           if( d2<dist ) closest = i;
1787         }
1788       }else{
1789         closest = 0;
1790       }
1791       SWAB_ADD(pBt, pInfo->nFree, -1);
1792       *pPgno = SWAB32(pBt, pInfo->aFree[closest]);
1793       pInfo->aFree[closest] = pInfo->aFree[n-1];
1794       rc = sqlitepager_get(pBt->pPager, *pPgno, (void**)ppPage);
1795       sqlitepager_unref(pOvfl);
1796       if( rc==SQLITE_OK ){
1797         sqlitepager_dont_rollback(*ppPage);
1798         rc = sqlitepager_write(*ppPage);
1799       }
1800     }
1801   }else{
1802     *pPgno = sqlitepager_pagecount(pBt->pPager) + 1;
1803     rc = sqlitepager_get(pBt->pPager, *pPgno, (void**)ppPage);
1804     if( rc ) return rc;
1805     rc = sqlitepager_write(*ppPage);
1806   }
1807   return rc;
1808 }
1809 
1810 /*
1811 ** Add a page of the database file to the freelist.  Either pgno or
1812 ** pPage but not both may be 0.
1813 **
1814 ** sqlitepager_unref() is NOT called for pPage.
1815 */
freePage(Btree * pBt,void * pPage,Pgno pgno)1816 static int freePage(Btree *pBt, void *pPage, Pgno pgno){
1817   PageOne *pPage1 = pBt->page1;
1818   OverflowPage *pOvfl = (OverflowPage*)pPage;
1819   int rc;
1820   int needUnref = 0;
1821   MemPage *pMemPage;
1822 
1823   if( pgno==0 ){
1824     assert( pOvfl!=0 );
1825     pgno = sqlitepager_pagenumber(pOvfl);
1826   }
1827   assert( pgno>2 );
1828   assert( sqlitepager_pagenumber(pOvfl)==pgno );
1829   pMemPage = (MemPage*)pPage;
1830   pMemPage->isInit = 0;
1831   if( pMemPage->pParent ){
1832     sqlitepager_unref(pMemPage->pParent);
1833     pMemPage->pParent = 0;
1834   }
1835   rc = sqlitepager_write(pPage1);
1836   if( rc ){
1837     return rc;
1838   }
1839   SWAB_ADD(pBt, pPage1->nFree, 1);
1840   if( pPage1->nFree!=0 && pPage1->freeList!=0 ){
1841     OverflowPage *pFreeIdx;
1842     rc = sqlitepager_get(pBt->pPager, SWAB32(pBt, pPage1->freeList),
1843                         (void**)&pFreeIdx);
1844     if( rc==SQLITE_OK ){
1845       FreelistInfo *pInfo = (FreelistInfo*)pFreeIdx->aPayload;
1846       int n = SWAB32(pBt, pInfo->nFree);
1847       if( n<(sizeof(pInfo->aFree)/sizeof(pInfo->aFree[0])) ){
1848         rc = sqlitepager_write(pFreeIdx);
1849         if( rc==SQLITE_OK ){
1850           pInfo->aFree[n] = SWAB32(pBt, pgno);
1851           SWAB_ADD(pBt, pInfo->nFree, 1);
1852           sqlitepager_unref(pFreeIdx);
1853           sqlitepager_dont_write(pBt->pPager, pgno);
1854           return rc;
1855         }
1856       }
1857       sqlitepager_unref(pFreeIdx);
1858     }
1859   }
1860   if( pOvfl==0 ){
1861     assert( pgno>0 );
1862     rc = sqlitepager_get(pBt->pPager, pgno, (void**)&pOvfl);
1863     if( rc ) return rc;
1864     needUnref = 1;
1865   }
1866   rc = sqlitepager_write(pOvfl);
1867   if( rc ){
1868     if( needUnref ) sqlitepager_unref(pOvfl);
1869     return rc;
1870   }
1871   pOvfl->iNext = pPage1->freeList;
1872   pPage1->freeList = SWAB32(pBt, pgno);
1873   memset(pOvfl->aPayload, 0, OVERFLOW_SIZE);
1874   if( needUnref ) rc = sqlitepager_unref(pOvfl);
1875   return rc;
1876 }
1877 
1878 /*
1879 ** Erase all the data out of a cell.  This involves returning overflow
1880 ** pages back the freelist.
1881 */
clearCell(Btree * pBt,Cell * pCell)1882 static int clearCell(Btree *pBt, Cell *pCell){
1883   Pager *pPager = pBt->pPager;
1884   OverflowPage *pOvfl;
1885   Pgno ovfl, nextOvfl;
1886   int rc;
1887 
1888   if( NKEY(pBt, pCell->h) + NDATA(pBt, pCell->h) <= MX_LOCAL_PAYLOAD ){
1889     return SQLITE_OK;
1890   }
1891   ovfl = SWAB32(pBt, pCell->ovfl);
1892   pCell->ovfl = 0;
1893   while( ovfl ){
1894     rc = sqlitepager_get(pPager, ovfl, (void**)&pOvfl);
1895     if( rc ) return rc;
1896     nextOvfl = SWAB32(pBt, pOvfl->iNext);
1897     rc = freePage(pBt, pOvfl, ovfl);
1898     if( rc ) return rc;
1899     sqlitepager_unref(pOvfl);
1900     ovfl = nextOvfl;
1901   }
1902   return SQLITE_OK;
1903 }
1904 
1905 /*
1906 ** Create a new cell from key and data.  Overflow pages are allocated as
1907 ** necessary and linked to this cell.
1908 */
fillInCell(Btree * pBt,Cell * pCell,const void * pKey,int nKey,const void * pData,int nData)1909 static int fillInCell(
1910   Btree *pBt,              /* The whole Btree.  Needed to allocate pages */
1911   Cell *pCell,             /* Populate this Cell structure */
1912   const void *pKey, int nKey,    /* The key */
1913   const void *pData,int nData    /* The data */
1914 ){
1915   OverflowPage *pOvfl, *pPrior;
1916   Pgno *pNext;
1917   int spaceLeft;
1918   int n, rc;
1919   int nPayload;
1920   const char *pPayload;
1921   char *pSpace;
1922   Pgno nearby = 0;
1923 
1924   pCell->h.leftChild = 0;
1925   pCell->h.nKey = SWAB16(pBt, nKey & 0xffff);
1926   pCell->h.nKeyHi = nKey >> 16;
1927   pCell->h.nData = SWAB16(pBt, nData & 0xffff);
1928   pCell->h.nDataHi = nData >> 16;
1929   pCell->h.iNext = 0;
1930 
1931   pNext = &pCell->ovfl;
1932   pSpace = pCell->aPayload;
1933   spaceLeft = MX_LOCAL_PAYLOAD;
1934   pPayload = pKey;
1935   pKey = 0;
1936   nPayload = nKey;
1937   pPrior = 0;
1938   while( nPayload>0 ){
1939     if( spaceLeft==0 ){
1940       rc = allocatePage(pBt, (MemPage**)&pOvfl, pNext, nearby);
1941       if( rc ){
1942         *pNext = 0;
1943       }else{
1944         nearby = *pNext;
1945       }
1946       if( pPrior ) sqlitepager_unref(pPrior);
1947       if( rc ){
1948         clearCell(pBt, pCell);
1949         return rc;
1950       }
1951       if( pBt->needSwab ) *pNext = swab32(*pNext);
1952       pPrior = pOvfl;
1953       spaceLeft = OVERFLOW_SIZE;
1954       pSpace = pOvfl->aPayload;
1955       pNext = &pOvfl->iNext;
1956     }
1957     n = nPayload;
1958     if( n>spaceLeft ) n = spaceLeft;
1959     memcpy(pSpace, pPayload, n);
1960     nPayload -= n;
1961     if( nPayload==0 && pData ){
1962       pPayload = pData;
1963       nPayload = nData;
1964       pData = 0;
1965     }else{
1966       pPayload += n;
1967     }
1968     spaceLeft -= n;
1969     pSpace += n;
1970   }
1971   *pNext = 0;
1972   if( pPrior ){
1973     sqlitepager_unref(pPrior);
1974   }
1975   return SQLITE_OK;
1976 }
1977 
1978 /*
1979 ** Change the MemPage.pParent pointer on the page whose number is
1980 ** given in the second argument so that MemPage.pParent holds the
1981 ** pointer in the third argument.
1982 */
reparentPage(Pager * pPager,Pgno pgno,MemPage * pNewParent,int idx)1983 static void reparentPage(Pager *pPager, Pgno pgno, MemPage *pNewParent,int idx){
1984   MemPage *pThis;
1985 
1986   if( pgno==0 ) return;
1987   assert( pPager!=0 );
1988   pThis = sqlitepager_lookup(pPager, pgno);
1989   if( pThis && pThis->isInit ){
1990     if( pThis->pParent!=pNewParent ){
1991       if( pThis->pParent ) sqlitepager_unref(pThis->pParent);
1992       pThis->pParent = pNewParent;
1993       if( pNewParent ) sqlitepager_ref(pNewParent);
1994     }
1995     pThis->idxParent = idx;
1996     sqlitepager_unref(pThis);
1997   }
1998 }
1999 
2000 /*
2001 ** Reparent all children of the given page to be the given page.
2002 ** In other words, for every child of pPage, invoke reparentPage()
2003 ** to make sure that each child knows that pPage is its parent.
2004 **
2005 ** This routine gets called after you memcpy() one page into
2006 ** another.
2007 */
reparentChildPages(Btree * pBt,MemPage * pPage)2008 static void reparentChildPages(Btree *pBt, MemPage *pPage){
2009   int i;
2010   Pager *pPager = pBt->pPager;
2011   for(i=0; i<pPage->nCell; i++){
2012     reparentPage(pPager, SWAB32(pBt, pPage->apCell[i]->h.leftChild), pPage, i);
2013   }
2014   reparentPage(pPager, SWAB32(pBt, pPage->u.hdr.rightChild), pPage, i);
2015   pPage->idxShift = 0;
2016 }
2017 
2018 /*
2019 ** Remove the i-th cell from pPage.  This routine effects pPage only.
2020 ** The cell content is not freed or deallocated.  It is assumed that
2021 ** the cell content has been copied someplace else.  This routine just
2022 ** removes the reference to the cell from pPage.
2023 **
2024 ** "sz" must be the number of bytes in the cell.
2025 **
2026 ** Do not bother maintaining the integrity of the linked list of Cells.
2027 ** Only the pPage->apCell[] array is important.  The relinkCellList()
2028 ** routine will be called soon after this routine in order to rebuild
2029 ** the linked list.
2030 */
dropCell(Btree * pBt,MemPage * pPage,int idx,int sz)2031 static void dropCell(Btree *pBt, MemPage *pPage, int idx, int sz){
2032   int j;
2033   assert( idx>=0 && idx<pPage->nCell );
2034   assert( sz==cellSize(pBt, pPage->apCell[idx]) );
2035   assert( sqlitepager_iswriteable(pPage) );
2036   freeSpace(pBt, pPage, Addr(pPage->apCell[idx]) - Addr(pPage), sz);
2037   for(j=idx; j<pPage->nCell-1; j++){
2038     pPage->apCell[j] = pPage->apCell[j+1];
2039   }
2040   pPage->nCell--;
2041   pPage->idxShift = 1;
2042 }
2043 
2044 /*
2045 ** Insert a new cell on pPage at cell index "i".  pCell points to the
2046 ** content of the cell.
2047 **
2048 ** If the cell content will fit on the page, then put it there.  If it
2049 ** will not fit, then just make pPage->apCell[i] point to the content
2050 ** and set pPage->isOverfull.
2051 **
2052 ** Do not bother maintaining the integrity of the linked list of Cells.
2053 ** Only the pPage->apCell[] array is important.  The relinkCellList()
2054 ** routine will be called soon after this routine in order to rebuild
2055 ** the linked list.
2056 */
insertCell(Btree * pBt,MemPage * pPage,int i,Cell * pCell,int sz)2057 static void insertCell(Btree *pBt, MemPage *pPage, int i, Cell *pCell, int sz){
2058   int idx, j;
2059   assert( i>=0 && i<=pPage->nCell );
2060   assert( sz==cellSize(pBt, pCell) );
2061   assert( sqlitepager_iswriteable(pPage) );
2062   idx = allocateSpace(pBt, pPage, sz);
2063   for(j=pPage->nCell; j>i; j--){
2064     pPage->apCell[j] = pPage->apCell[j-1];
2065   }
2066   pPage->nCell++;
2067   if( idx<=0 ){
2068     pPage->isOverfull = 1;
2069     pPage->apCell[i] = pCell;
2070   }else{
2071     memcpy(&pPage->u.aDisk[idx], pCell, sz);
2072     pPage->apCell[i] = (Cell*)&pPage->u.aDisk[idx];
2073   }
2074   pPage->idxShift = 1;
2075 }
2076 
2077 /*
2078 ** Rebuild the linked list of cells on a page so that the cells
2079 ** occur in the order specified by the pPage->apCell[] array.
2080 ** Invoke this routine once to repair damage after one or more
2081 ** invocations of either insertCell() or dropCell().
2082 */
relinkCellList(Btree * pBt,MemPage * pPage)2083 static void relinkCellList(Btree *pBt, MemPage *pPage){
2084   int i;
2085   u16 *pIdx;
2086   assert( sqlitepager_iswriteable(pPage) );
2087   pIdx = &pPage->u.hdr.firstCell;
2088   for(i=0; i<pPage->nCell; i++){
2089     int idx = Addr(pPage->apCell[i]) - Addr(pPage);
2090     assert( idx>0 && idx<SQLITE_USABLE_SIZE );
2091     *pIdx = SWAB16(pBt, idx);
2092     pIdx = &pPage->apCell[i]->h.iNext;
2093   }
2094   *pIdx = 0;
2095 }
2096 
2097 /*
2098 ** Make a copy of the contents of pFrom into pTo.  The pFrom->apCell[]
2099 ** pointers that point into pFrom->u.aDisk[] must be adjusted to point
2100 ** into pTo->u.aDisk[] instead.  But some pFrom->apCell[] entries might
2101 ** not point to pFrom->u.aDisk[].  Those are unchanged.
2102 */
copyPage(MemPage * pTo,MemPage * pFrom)2103 static void copyPage(MemPage *pTo, MemPage *pFrom){
2104   uptr from, to;
2105   int i;
2106   memcpy(pTo->u.aDisk, pFrom->u.aDisk, SQLITE_USABLE_SIZE);
2107   pTo->pParent = 0;
2108   pTo->isInit = 1;
2109   pTo->nCell = pFrom->nCell;
2110   pTo->nFree = pFrom->nFree;
2111   pTo->isOverfull = pFrom->isOverfull;
2112   to = Addr(pTo);
2113   from = Addr(pFrom);
2114   for(i=0; i<pTo->nCell; i++){
2115     uptr x = Addr(pFrom->apCell[i]);
2116     if( x>from && x<from+SQLITE_USABLE_SIZE ){
2117       *((uptr*)&pTo->apCell[i]) = x + to - from;
2118     }else{
2119       pTo->apCell[i] = pFrom->apCell[i];
2120     }
2121   }
2122 }
2123 
2124 /*
2125 ** The following parameters determine how many adjacent pages get involved
2126 ** in a balancing operation.  NN is the number of neighbors on either side
2127 ** of the page that participate in the balancing operation.  NB is the
2128 ** total number of pages that participate, including the target page and
2129 ** NN neighbors on either side.
2130 **
2131 ** The minimum value of NN is 1 (of course).  Increasing NN above 1
2132 ** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance
2133 ** in exchange for a larger degradation in INSERT and UPDATE performance.
2134 ** The value of NN appears to give the best results overall.
2135 */
2136 #define NN 1             /* Number of neighbors on either side of pPage */
2137 #define NB (NN*2+1)      /* Total pages involved in the balance */
2138 
2139 /*
2140 ** This routine redistributes Cells on pPage and up to two siblings
2141 ** of pPage so that all pages have about the same amount of free space.
2142 ** Usually one sibling on either side of pPage is used in the balancing,
2143 ** though both siblings might come from one side if pPage is the first
2144 ** or last child of its parent.  If pPage has fewer than two siblings
2145 ** (something which can only happen if pPage is the root page or a
2146 ** child of root) then all available siblings participate in the balancing.
2147 **
2148 ** The number of siblings of pPage might be increased or decreased by
2149 ** one in an effort to keep pages between 66% and 100% full. The root page
2150 ** is special and is allowed to be less than 66% full. If pPage is
2151 ** the root page, then the depth of the tree might be increased
2152 ** or decreased by one, as necessary, to keep the root page from being
2153 ** overfull or empty.
2154 **
2155 ** This routine calls relinkCellList() on its input page regardless of
2156 ** whether or not it does any real balancing.  Client routines will typically
2157 ** invoke insertCell() or dropCell() before calling this routine, so we
2158 ** need to call relinkCellList() to clean up the mess that those other
2159 ** routines left behind.
2160 **
2161 ** pCur is left pointing to the same cell as when this routine was called
2162 ** even if that cell gets moved to a different page.  pCur may be NULL.
2163 ** Set the pCur parameter to NULL if you do not care about keeping track
2164 ** of a cell as that will save this routine the work of keeping track of it.
2165 **
2166 ** Note that when this routine is called, some of the Cells on pPage
2167 ** might not actually be stored in pPage->u.aDisk[].  This can happen
2168 ** if the page is overfull.  Part of the job of this routine is to
2169 ** make sure all Cells for pPage once again fit in pPage->u.aDisk[].
2170 **
2171 ** In the course of balancing the siblings of pPage, the parent of pPage
2172 ** might become overfull or underfull.  If that happens, then this routine
2173 ** is called recursively on the parent.
2174 **
2175 ** If this routine fails for any reason, it might leave the database
2176 ** in a corrupted state.  So if this routine fails, the database should
2177 ** be rolled back.
2178 */
balance(Btree * pBt,MemPage * pPage,BtCursor * pCur)2179 static int balance(Btree *pBt, MemPage *pPage, BtCursor *pCur){
2180   MemPage *pParent;            /* The parent of pPage */
2181   int nCell;                   /* Number of cells in apCell[] */
2182   int nOld;                    /* Number of pages in apOld[] */
2183   int nNew;                    /* Number of pages in apNew[] */
2184   int nDiv;                    /* Number of cells in apDiv[] */
2185   int i, j, k;                 /* Loop counters */
2186   int idx;                     /* Index of pPage in pParent->apCell[] */
2187   int nxDiv;                   /* Next divider slot in pParent->apCell[] */
2188   int rc;                      /* The return code */
2189   int iCur;                    /* apCell[iCur] is the cell of the cursor */
2190   MemPage *pOldCurPage;        /* The cursor originally points to this page */
2191   int subtotal;                /* Subtotal of bytes in cells on one page */
2192   MemPage *extraUnref = 0;     /* A page that needs to be unref-ed */
2193   MemPage *apOld[NB];          /* pPage and up to two siblings */
2194   Pgno pgnoOld[NB];            /* Page numbers for each page in apOld[] */
2195   MemPage *apNew[NB+1];        /* pPage and up to NB siblings after balancing */
2196   Pgno pgnoNew[NB+1];          /* Page numbers for each page in apNew[] */
2197   int idxDiv[NB];              /* Indices of divider cells in pParent */
2198   Cell *apDiv[NB];             /* Divider cells in pParent */
2199   Cell aTemp[NB];              /* Temporary holding area for apDiv[] */
2200   int cntNew[NB+1];            /* Index in apCell[] of cell after i-th page */
2201   int szNew[NB+1];             /* Combined size of cells place on i-th page */
2202   MemPage aOld[NB];            /* Temporary copies of pPage and its siblings */
2203   Cell *apCell[(MX_CELL+2)*NB]; /* All cells from pages being balanced */
2204   int szCell[(MX_CELL+2)*NB];  /* Local size of all cells */
2205 
2206   /*
2207   ** Return without doing any work if pPage is neither overfull nor
2208   ** underfull.
2209   */
2210   assert( sqlitepager_iswriteable(pPage) );
2211   if( !pPage->isOverfull && pPage->nFree<SQLITE_USABLE_SIZE/2
2212         && pPage->nCell>=2){
2213     relinkCellList(pBt, pPage);
2214     return SQLITE_OK;
2215   }
2216 
2217   /*
2218   ** Find the parent of the page to be balanceed.
2219   ** If there is no parent, it means this page is the root page and
2220   ** special rules apply.
2221   */
2222   pParent = pPage->pParent;
2223   if( pParent==0 ){
2224     Pgno pgnoChild;
2225     MemPage *pChild;
2226     assert( pPage->isInit );
2227     if( pPage->nCell==0 ){
2228       if( pPage->u.hdr.rightChild ){
2229         /*
2230         ** The root page is empty.  Copy the one child page
2231         ** into the root page and return.  This reduces the depth
2232         ** of the BTree by one.
2233         */
2234         pgnoChild = SWAB32(pBt, pPage->u.hdr.rightChild);
2235         rc = sqlitepager_get(pBt->pPager, pgnoChild, (void**)&pChild);
2236         if( rc ) return rc;
2237         memcpy(pPage, pChild, SQLITE_USABLE_SIZE);
2238         pPage->isInit = 0;
2239         rc = initPage(pBt, pPage, sqlitepager_pagenumber(pPage), 0);
2240         assert( rc==SQLITE_OK );
2241         reparentChildPages(pBt, pPage);
2242         if( pCur && pCur->pPage==pChild ){
2243           sqlitepager_unref(pChild);
2244           pCur->pPage = pPage;
2245           sqlitepager_ref(pPage);
2246         }
2247         freePage(pBt, pChild, pgnoChild);
2248         sqlitepager_unref(pChild);
2249       }else{
2250         relinkCellList(pBt, pPage);
2251       }
2252       return SQLITE_OK;
2253     }
2254     if( !pPage->isOverfull ){
2255       /* It is OK for the root page to be less than half full.
2256       */
2257       relinkCellList(pBt, pPage);
2258       return SQLITE_OK;
2259     }
2260     /*
2261     ** If we get to here, it means the root page is overfull.
2262     ** When this happens, Create a new child page and copy the
2263     ** contents of the root into the child.  Then make the root
2264     ** page an empty page with rightChild pointing to the new
2265     ** child.  Then fall thru to the code below which will cause
2266     ** the overfull child page to be split.
2267     */
2268     rc = sqlitepager_write(pPage);
2269     if( rc ) return rc;
2270     rc = allocatePage(pBt, &pChild, &pgnoChild, sqlitepager_pagenumber(pPage));
2271     if( rc ) return rc;
2272     assert( sqlitepager_iswriteable(pChild) );
2273     copyPage(pChild, pPage);
2274     pChild->pParent = pPage;
2275     pChild->idxParent = 0;
2276     sqlitepager_ref(pPage);
2277     pChild->isOverfull = 1;
2278     if( pCur && pCur->pPage==pPage ){
2279       sqlitepager_unref(pPage);
2280       pCur->pPage = pChild;
2281     }else{
2282       extraUnref = pChild;
2283     }
2284     zeroPage(pBt, pPage);
2285     pPage->u.hdr.rightChild = SWAB32(pBt, pgnoChild);
2286     pParent = pPage;
2287     pPage = pChild;
2288   }
2289   rc = sqlitepager_write(pParent);
2290   if( rc ) return rc;
2291   assert( pParent->isInit );
2292 
2293   /*
2294   ** Find the Cell in the parent page whose h.leftChild points back
2295   ** to pPage.  The "idx" variable is the index of that cell.  If pPage
2296   ** is the rightmost child of pParent then set idx to pParent->nCell
2297   */
2298   if( pParent->idxShift ){
2299     Pgno pgno, swabPgno;
2300     pgno = sqlitepager_pagenumber(pPage);
2301     swabPgno = SWAB32(pBt, pgno);
2302     for(idx=0; idx<pParent->nCell; idx++){
2303       if( pParent->apCell[idx]->h.leftChild==swabPgno ){
2304         break;
2305       }
2306     }
2307     assert( idx<pParent->nCell || pParent->u.hdr.rightChild==swabPgno );
2308   }else{
2309     idx = pPage->idxParent;
2310   }
2311 
2312   /*
2313   ** Initialize variables so that it will be safe to jump
2314   ** directly to balance_cleanup at any moment.
2315   */
2316   nOld = nNew = 0;
2317   sqlitepager_ref(pParent);
2318 
2319   /*
2320   ** Find sibling pages to pPage and the Cells in pParent that divide
2321   ** the siblings.  An attempt is made to find NN siblings on either
2322   ** side of pPage.  More siblings are taken from one side, however, if
2323   ** pPage there are fewer than NN siblings on the other side.  If pParent
2324   ** has NB or fewer children then all children of pParent are taken.
2325   */
2326   nxDiv = idx - NN;
2327   if( nxDiv + NB > pParent->nCell ){
2328     nxDiv = pParent->nCell - NB + 1;
2329   }
2330   if( nxDiv<0 ){
2331     nxDiv = 0;
2332   }
2333   nDiv = 0;
2334   for(i=0, k=nxDiv; i<NB; i++, k++){
2335     if( k<pParent->nCell ){
2336       idxDiv[i] = k;
2337       apDiv[i] = pParent->apCell[k];
2338       nDiv++;
2339       pgnoOld[i] = SWAB32(pBt, apDiv[i]->h.leftChild);
2340     }else if( k==pParent->nCell ){
2341       pgnoOld[i] = SWAB32(pBt, pParent->u.hdr.rightChild);
2342     }else{
2343       break;
2344     }
2345     rc = sqlitepager_get(pBt->pPager, pgnoOld[i], (void**)&apOld[i]);
2346     if( rc ) goto balance_cleanup;
2347     rc = initPage(pBt, apOld[i], pgnoOld[i], pParent);
2348     if( rc ) goto balance_cleanup;
2349     apOld[i]->idxParent = k;
2350     nOld++;
2351   }
2352 
2353   /*
2354   ** Set iCur to be the index in apCell[] of the cell that the cursor
2355   ** is pointing to.  We will need this later on in order to keep the
2356   ** cursor pointing at the same cell.  If pCur points to a page that
2357   ** has no involvement with this rebalancing, then set iCur to a large
2358   ** number so that the iCur==j tests always fail in the main cell
2359   ** distribution loop below.
2360   */
2361   if( pCur ){
2362     iCur = 0;
2363     for(i=0; i<nOld; i++){
2364       if( pCur->pPage==apOld[i] ){
2365         iCur += pCur->idx;
2366         break;
2367       }
2368       iCur += apOld[i]->nCell;
2369       if( i<nOld-1 && pCur->pPage==pParent && pCur->idx==idxDiv[i] ){
2370         break;
2371       }
2372       iCur++;
2373     }
2374     pOldCurPage = pCur->pPage;
2375   }
2376 
2377   /*
2378   ** Make copies of the content of pPage and its siblings into aOld[].
2379   ** The rest of this function will use data from the copies rather
2380   ** that the original pages since the original pages will be in the
2381   ** process of being overwritten.
2382   */
2383   for(i=0; i<nOld; i++){
2384     copyPage(&aOld[i], apOld[i]);
2385   }
2386 
2387   /*
2388   ** Load pointers to all cells on sibling pages and the divider cells
2389   ** into the local apCell[] array.  Make copies of the divider cells
2390   ** into aTemp[] and remove the the divider Cells from pParent.
2391   */
2392   nCell = 0;
2393   for(i=0; i<nOld; i++){
2394     MemPage *pOld = &aOld[i];
2395     for(j=0; j<pOld->nCell; j++){
2396       apCell[nCell] = pOld->apCell[j];
2397       szCell[nCell] = cellSize(pBt, apCell[nCell]);
2398       nCell++;
2399     }
2400     if( i<nOld-1 ){
2401       szCell[nCell] = cellSize(pBt, apDiv[i]);
2402       memcpy(&aTemp[i], apDiv[i], szCell[nCell]);
2403       apCell[nCell] = &aTemp[i];
2404       dropCell(pBt, pParent, nxDiv, szCell[nCell]);
2405       assert( SWAB32(pBt, apCell[nCell]->h.leftChild)==pgnoOld[i] );
2406       apCell[nCell]->h.leftChild = pOld->u.hdr.rightChild;
2407       nCell++;
2408     }
2409   }
2410 
2411   /*
2412   ** Figure out the number of pages needed to hold all nCell cells.
2413   ** Store this number in "k".  Also compute szNew[] which is the total
2414   ** size of all cells on the i-th page and cntNew[] which is the index
2415   ** in apCell[] of the cell that divides path i from path i+1.
2416   ** cntNew[k] should equal nCell.
2417   **
2418   ** This little patch of code is critical for keeping the tree
2419   ** balanced.
2420   */
2421   for(subtotal=k=i=0; i<nCell; i++){
2422     subtotal += szCell[i];
2423     if( subtotal > USABLE_SPACE ){
2424       szNew[k] = subtotal - szCell[i];
2425       cntNew[k] = i;
2426       subtotal = 0;
2427       k++;
2428     }
2429   }
2430   szNew[k] = subtotal;
2431   cntNew[k] = nCell;
2432   k++;
2433   for(i=k-1; i>0; i--){
2434     while( szNew[i]<USABLE_SPACE/2 ){
2435       cntNew[i-1]--;
2436       assert( cntNew[i-1]>0 );
2437       szNew[i] += szCell[cntNew[i-1]];
2438       szNew[i-1] -= szCell[cntNew[i-1]-1];
2439     }
2440   }
2441   assert( cntNew[0]>0 );
2442 
2443   /*
2444   ** Allocate k new pages.  Reuse old pages where possible.
2445   */
2446   for(i=0; i<k; i++){
2447     if( i<nOld ){
2448       apNew[i] = apOld[i];
2449       pgnoNew[i] = pgnoOld[i];
2450       apOld[i] = 0;
2451       sqlitepager_write(apNew[i]);
2452     }else{
2453       rc = allocatePage(pBt, &apNew[i], &pgnoNew[i], pgnoNew[i-1]);
2454       if( rc ) goto balance_cleanup;
2455     }
2456     nNew++;
2457     zeroPage(pBt, apNew[i]);
2458     apNew[i]->isInit = 1;
2459   }
2460 
2461   /* Free any old pages that were not reused as new pages.
2462   */
2463   while( i<nOld ){
2464     rc = freePage(pBt, apOld[i], pgnoOld[i]);
2465     if( rc ) goto balance_cleanup;
2466     sqlitepager_unref(apOld[i]);
2467     apOld[i] = 0;
2468     i++;
2469   }
2470 
2471   /*
2472   ** Put the new pages in accending order.  This helps to
2473   ** keep entries in the disk file in order so that a scan
2474   ** of the table is a linear scan through the file.  That
2475   ** in turn helps the operating system to deliver pages
2476   ** from the disk more rapidly.
2477   **
2478   ** An O(n^2) insertion sort algorithm is used, but since
2479   ** n is never more than NB (a small constant), that should
2480   ** not be a problem.
2481   **
2482   ** When NB==3, this one optimization makes the database
2483   ** about 25% faster for large insertions and deletions.
2484   */
2485   for(i=0; i<k-1; i++){
2486     int minV = pgnoNew[i];
2487     int minI = i;
2488     for(j=i+1; j<k; j++){
2489       if( pgnoNew[j]<(unsigned)minV ){
2490         minI = j;
2491         minV = pgnoNew[j];
2492       }
2493     }
2494     if( minI>i ){
2495       int t;
2496       MemPage *pT;
2497       t = pgnoNew[i];
2498       pT = apNew[i];
2499       pgnoNew[i] = pgnoNew[minI];
2500       apNew[i] = apNew[minI];
2501       pgnoNew[minI] = t;
2502       apNew[minI] = pT;
2503     }
2504   }
2505 
2506   /*
2507   ** Evenly distribute the data in apCell[] across the new pages.
2508   ** Insert divider cells into pParent as necessary.
2509   */
2510   j = 0;
2511   for(i=0; i<nNew; i++){
2512     MemPage *pNew = apNew[i];
2513     while( j<cntNew[i] ){
2514       assert( pNew->nFree>=szCell[j] );
2515       if( pCur && iCur==j ){ pCur->pPage = pNew; pCur->idx = pNew->nCell; }
2516       insertCell(pBt, pNew, pNew->nCell, apCell[j], szCell[j]);
2517       j++;
2518     }
2519     assert( pNew->nCell>0 );
2520     assert( !pNew->isOverfull );
2521     relinkCellList(pBt, pNew);
2522     if( i<nNew-1 && j<nCell ){
2523       pNew->u.hdr.rightChild = apCell[j]->h.leftChild;
2524       apCell[j]->h.leftChild = SWAB32(pBt, pgnoNew[i]);
2525       if( pCur && iCur==j ){ pCur->pPage = pParent; pCur->idx = nxDiv; }
2526       insertCell(pBt, pParent, nxDiv, apCell[j], szCell[j]);
2527       j++;
2528       nxDiv++;
2529     }
2530   }
2531   assert( j==nCell );
2532   apNew[nNew-1]->u.hdr.rightChild = aOld[nOld-1].u.hdr.rightChild;
2533   if( nxDiv==pParent->nCell ){
2534     pParent->u.hdr.rightChild = SWAB32(pBt, pgnoNew[nNew-1]);
2535   }else{
2536     pParent->apCell[nxDiv]->h.leftChild = SWAB32(pBt, pgnoNew[nNew-1]);
2537   }
2538   if( pCur ){
2539     if( j<=iCur && pCur->pPage==pParent && pCur->idx>idxDiv[nOld-1] ){
2540       assert( pCur->pPage==pOldCurPage );
2541       pCur->idx += nNew - nOld;
2542     }else{
2543       assert( pOldCurPage!=0 );
2544       sqlitepager_ref(pCur->pPage);
2545       sqlitepager_unref(pOldCurPage);
2546     }
2547   }
2548 
2549   /*
2550   ** Reparent children of all cells.
2551   */
2552   for(i=0; i<nNew; i++){
2553     reparentChildPages(pBt, apNew[i]);
2554   }
2555   reparentChildPages(pBt, pParent);
2556 
2557   /*
2558   ** balance the parent page.
2559   */
2560   rc = balance(pBt, pParent, pCur);
2561 
2562   /*
2563   ** Cleanup before returning.
2564   */
2565 balance_cleanup:
2566   if( extraUnref ){
2567     sqlitepager_unref(extraUnref);
2568   }
2569   for(i=0; i<nOld; i++){
2570     if( apOld[i]!=0 && apOld[i]!=&aOld[i] ) sqlitepager_unref(apOld[i]);
2571   }
2572   for(i=0; i<nNew; i++){
2573     sqlitepager_unref(apNew[i]);
2574   }
2575   if( pCur && pCur->pPage==0 ){
2576     pCur->pPage = pParent;
2577     pCur->idx = 0;
2578   }else{
2579     sqlitepager_unref(pParent);
2580   }
2581   return rc;
2582 }
2583 
2584 /*
2585 ** This routine checks all cursors that point to the same table
2586 ** as pCur points to.  If any of those cursors were opened with
2587 ** wrFlag==0 then this routine returns SQLITE_LOCKED.  If all
2588 ** cursors point to the same table were opened with wrFlag==1
2589 ** then this routine returns SQLITE_OK.
2590 **
2591 ** In addition to checking for read-locks (where a read-lock
2592 ** means a cursor opened with wrFlag==0) this routine also moves
2593 ** all cursors other than pCur so that they are pointing to the
2594 ** first Cell on root page.  This is necessary because an insert
2595 ** or delete might change the number of cells on a page or delete
2596 ** a page entirely and we do not want to leave any cursors
2597 ** pointing to non-existant pages or cells.
2598 */
checkReadLocks(BtCursor * pCur)2599 static int checkReadLocks(BtCursor *pCur){
2600   BtCursor *p;
2601   assert( pCur->wrFlag );
2602   for(p=pCur->pShared; p!=pCur; p=p->pShared){
2603     assert( p );
2604     assert( p->pgnoRoot==pCur->pgnoRoot );
2605     if( p->wrFlag==0 ) return SQLITE_LOCKED;
2606     if( sqlitepager_pagenumber(p->pPage)!=p->pgnoRoot ){
2607       moveToRoot(p);
2608     }
2609   }
2610   return SQLITE_OK;
2611 }
2612 
2613 /*
2614 ** Insert a new record into the BTree.  The key is given by (pKey,nKey)
2615 ** and the data is given by (pData,nData).  The cursor is used only to
2616 ** define what database the record should be inserted into.  The cursor
2617 ** is left pointing at the new record.
2618 */
fileBtreeInsert(BtCursor * pCur,const void * pKey,int nKey,const void * pData,int nData)2619 static int fileBtreeInsert(
2620   BtCursor *pCur,                /* Insert data into the table of this cursor */
2621   const void *pKey, int nKey,    /* The key of the new record */
2622   const void *pData, int nData   /* The data of the new record */
2623 ){
2624   Cell newCell;
2625   int rc;
2626   int loc;
2627   int szNew;
2628   MemPage *pPage;
2629   Btree *pBt = pCur->pBt;
2630 
2631   if( pCur->pPage==0 ){
2632     return SQLITE_ABORT;  /* A rollback destroyed this cursor */
2633   }
2634   if( !pBt->inTrans || nKey+nData==0 ){
2635     /* Must start a transaction before doing an insert */
2636     return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
2637   }
2638   assert( !pBt->readOnly );
2639   if( !pCur->wrFlag ){
2640     return SQLITE_PERM;   /* Cursor not open for writing */
2641   }
2642   if( checkReadLocks(pCur) ){
2643     return SQLITE_LOCKED; /* The table pCur points to has a read lock */
2644   }
2645   rc = fileBtreeMoveto(pCur, pKey, nKey, &loc);
2646   if( rc ) return rc;
2647   pPage = pCur->pPage;
2648   assert( pPage->isInit );
2649   rc = sqlitepager_write(pPage);
2650   if( rc ) return rc;
2651   rc = fillInCell(pBt, &newCell, pKey, nKey, pData, nData);
2652   if( rc ) return rc;
2653   szNew = cellSize(pBt, &newCell);
2654   if( loc==0 ){
2655     newCell.h.leftChild = pPage->apCell[pCur->idx]->h.leftChild;
2656     rc = clearCell(pBt, pPage->apCell[pCur->idx]);
2657     if( rc ) return rc;
2658     dropCell(pBt, pPage, pCur->idx, cellSize(pBt, pPage->apCell[pCur->idx]));
2659   }else if( loc<0 && pPage->nCell>0 ){
2660     assert( pPage->u.hdr.rightChild==0 );  /* Must be a leaf page */
2661     pCur->idx++;
2662   }else{
2663     assert( pPage->u.hdr.rightChild==0 );  /* Must be a leaf page */
2664   }
2665   insertCell(pBt, pPage, pCur->idx, &newCell, szNew);
2666   rc = balance(pCur->pBt, pPage, pCur);
2667   /* sqliteBtreePageDump(pCur->pBt, pCur->pgnoRoot, 1); */
2668   /* fflush(stdout); */
2669   pCur->eSkip = SKIP_INVALID;
2670   return rc;
2671 }
2672 
2673 /*
2674 ** Delete the entry that the cursor is pointing to.
2675 **
2676 ** The cursor is left pointing at either the next or the previous
2677 ** entry.  If the cursor is left pointing to the next entry, then
2678 ** the pCur->eSkip flag is set to SKIP_NEXT which forces the next call to
2679 ** sqliteBtreeNext() to be a no-op.  That way, you can always call
2680 ** sqliteBtreeNext() after a delete and the cursor will be left
2681 ** pointing to the first entry after the deleted entry.  Similarly,
2682 ** pCur->eSkip is set to SKIP_PREV is the cursor is left pointing to
2683 ** the entry prior to the deleted entry so that a subsequent call to
2684 ** sqliteBtreePrevious() will always leave the cursor pointing at the
2685 ** entry immediately before the one that was deleted.
2686 */
fileBtreeDelete(BtCursor * pCur)2687 static int fileBtreeDelete(BtCursor *pCur){
2688   MemPage *pPage = pCur->pPage;
2689   Cell *pCell;
2690   int rc;
2691   Pgno pgnoChild;
2692   Btree *pBt = pCur->pBt;
2693 
2694   assert( pPage->isInit );
2695   if( pCur->pPage==0 ){
2696     return SQLITE_ABORT;  /* A rollback destroyed this cursor */
2697   }
2698   if( !pBt->inTrans ){
2699     /* Must start a transaction before doing a delete */
2700     return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
2701   }
2702   assert( !pBt->readOnly );
2703   if( pCur->idx >= pPage->nCell ){
2704     return SQLITE_ERROR;  /* The cursor is not pointing to anything */
2705   }
2706   if( !pCur->wrFlag ){
2707     return SQLITE_PERM;   /* Did not open this cursor for writing */
2708   }
2709   if( checkReadLocks(pCur) ){
2710     return SQLITE_LOCKED; /* The table pCur points to has a read lock */
2711   }
2712   rc = sqlitepager_write(pPage);
2713   if( rc ) return rc;
2714   pCell = pPage->apCell[pCur->idx];
2715   pgnoChild = SWAB32(pBt, pCell->h.leftChild);
2716   clearCell(pBt, pCell);
2717   if( pgnoChild ){
2718     /*
2719     ** The entry we are about to delete is not a leaf so if we do not
2720     ** do something we will leave a hole on an internal page.
2721     ** We have to fill the hole by moving in a cell from a leaf.  The
2722     ** next Cell after the one to be deleted is guaranteed to exist and
2723     ** to be a leaf so we can use it.
2724     */
2725     BtCursor leafCur;
2726     Cell *pNext;
2727     int szNext;
2728     int notUsed;
2729     getTempCursor(pCur, &leafCur);
2730     rc = fileBtreeNext(&leafCur, &notUsed);
2731     if( rc!=SQLITE_OK ){
2732       if( rc!=SQLITE_NOMEM ) rc = SQLITE_CORRUPT;
2733       return rc;
2734     }
2735     rc = sqlitepager_write(leafCur.pPage);
2736     if( rc ) return rc;
2737     dropCell(pBt, pPage, pCur->idx, cellSize(pBt, pCell));
2738     pNext = leafCur.pPage->apCell[leafCur.idx];
2739     szNext = cellSize(pBt, pNext);
2740     pNext->h.leftChild = SWAB32(pBt, pgnoChild);
2741     insertCell(pBt, pPage, pCur->idx, pNext, szNext);
2742     rc = balance(pBt, pPage, pCur);
2743     if( rc ) return rc;
2744     pCur->eSkip = SKIP_NEXT;
2745     dropCell(pBt, leafCur.pPage, leafCur.idx, szNext);
2746     rc = balance(pBt, leafCur.pPage, pCur);
2747     releaseTempCursor(&leafCur);
2748   }else{
2749     dropCell(pBt, pPage, pCur->idx, cellSize(pBt, pCell));
2750     if( pCur->idx>=pPage->nCell ){
2751       pCur->idx = pPage->nCell-1;
2752       if( pCur->idx<0 ){
2753         pCur->idx = 0;
2754         pCur->eSkip = SKIP_NEXT;
2755       }else{
2756         pCur->eSkip = SKIP_PREV;
2757       }
2758     }else{
2759       pCur->eSkip = SKIP_NEXT;
2760     }
2761     rc = balance(pBt, pPage, pCur);
2762   }
2763   return rc;
2764 }
2765 
2766 /*
2767 ** Create a new BTree table.  Write into *piTable the page
2768 ** number for the root page of the new table.
2769 **
2770 ** In the current implementation, BTree tables and BTree indices are the
2771 ** the same.  In the future, we may change this so that BTree tables
2772 ** are restricted to having a 4-byte integer key and arbitrary data and
2773 ** BTree indices are restricted to having an arbitrary key and no data.
2774 ** But for now, this routine also serves to create indices.
2775 */
fileBtreeCreateTable(Btree * pBt,int * piTable)2776 static int fileBtreeCreateTable(Btree *pBt, int *piTable){
2777   MemPage *pRoot;
2778   Pgno pgnoRoot;
2779   int rc;
2780   if( !pBt->inTrans ){
2781     /* Must start a transaction first */
2782     return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
2783   }
2784   if( pBt->readOnly ){
2785     return SQLITE_READONLY;
2786   }
2787   rc = allocatePage(pBt, &pRoot, &pgnoRoot, 0);
2788   if( rc ) return rc;
2789   assert( sqlitepager_iswriteable(pRoot) );
2790   zeroPage(pBt, pRoot);
2791   sqlitepager_unref(pRoot);
2792   *piTable = (int)pgnoRoot;
2793   return SQLITE_OK;
2794 }
2795 
2796 /*
2797 ** Erase the given database page and all its children.  Return
2798 ** the page to the freelist.
2799 */
clearDatabasePage(Btree * pBt,Pgno pgno,int freePageFlag)2800 static int clearDatabasePage(Btree *pBt, Pgno pgno, int freePageFlag){
2801   MemPage *pPage;
2802   int rc;
2803   Cell *pCell;
2804   int idx;
2805 
2806   rc = sqlitepager_get(pBt->pPager, pgno, (void**)&pPage);
2807   if( rc ) return rc;
2808   rc = sqlitepager_write(pPage);
2809   if( rc ) return rc;
2810   rc = initPage(pBt, pPage, pgno, 0);
2811   if( rc ) return rc;
2812   idx = SWAB16(pBt, pPage->u.hdr.firstCell);
2813   while( idx>0 ){
2814     pCell = (Cell*)&pPage->u.aDisk[idx];
2815     idx = SWAB16(pBt, pCell->h.iNext);
2816     if( pCell->h.leftChild ){
2817       rc = clearDatabasePage(pBt, SWAB32(pBt, pCell->h.leftChild), 1);
2818       if( rc ) return rc;
2819     }
2820     rc = clearCell(pBt, pCell);
2821     if( rc ) return rc;
2822   }
2823   if( pPage->u.hdr.rightChild ){
2824     rc = clearDatabasePage(pBt, SWAB32(pBt, pPage->u.hdr.rightChild), 1);
2825     if( rc ) return rc;
2826   }
2827   if( freePageFlag ){
2828     rc = freePage(pBt, pPage, pgno);
2829   }else{
2830     zeroPage(pBt, pPage);
2831   }
2832   sqlitepager_unref(pPage);
2833   return rc;
2834 }
2835 
2836 /*
2837 ** Delete all information from a single table in the database.
2838 */
fileBtreeClearTable(Btree * pBt,int iTable)2839 static int fileBtreeClearTable(Btree *pBt, int iTable){
2840   int rc;
2841   BtCursor *pCur;
2842   if( !pBt->inTrans ){
2843     return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
2844   }
2845   for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
2846     if( pCur->pgnoRoot==(Pgno)iTable ){
2847       if( pCur->wrFlag==0 ) return SQLITE_LOCKED;
2848       moveToRoot(pCur);
2849     }
2850   }
2851   rc = clearDatabasePage(pBt, (Pgno)iTable, 0);
2852   if( rc ){
2853     fileBtreeRollback(pBt);
2854   }
2855   return rc;
2856 }
2857 
2858 /*
2859 ** Erase all information in a table and add the root of the table to
2860 ** the freelist.  Except, the root of the principle table (the one on
2861 ** page 2) is never added to the freelist.
2862 */
fileBtreeDropTable(Btree * pBt,int iTable)2863 static int fileBtreeDropTable(Btree *pBt, int iTable){
2864   int rc;
2865   MemPage *pPage;
2866   BtCursor *pCur;
2867   if( !pBt->inTrans ){
2868     return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
2869   }
2870   for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
2871     if( pCur->pgnoRoot==(Pgno)iTable ){
2872       return SQLITE_LOCKED;  /* Cannot drop a table that has a cursor */
2873     }
2874   }
2875   rc = sqlitepager_get(pBt->pPager, (Pgno)iTable, (void**)&pPage);
2876   if( rc ) return rc;
2877   rc = fileBtreeClearTable(pBt, iTable);
2878   if( rc ) return rc;
2879   if( iTable>2 ){
2880     rc = freePage(pBt, pPage, iTable);
2881   }else{
2882     zeroPage(pBt, pPage);
2883   }
2884   sqlitepager_unref(pPage);
2885   return rc;
2886 }
2887 
2888 #if 0 /* UNTESTED */
2889 /*
2890 ** Copy all cell data from one database file into another.
2891 ** pages back the freelist.
2892 */
2893 static int copyCell(Btree *pBtFrom, BTree *pBtTo, Cell *pCell){
2894   Pager *pFromPager = pBtFrom->pPager;
2895   OverflowPage *pOvfl;
2896   Pgno ovfl, nextOvfl;
2897   Pgno *pPrev;
2898   int rc = SQLITE_OK;
2899   MemPage *pNew, *pPrevPg;
2900   Pgno new;
2901 
2902   if( NKEY(pBtTo, pCell->h) + NDATA(pBtTo, pCell->h) <= MX_LOCAL_PAYLOAD ){
2903     return SQLITE_OK;
2904   }
2905   pPrev = &pCell->ovfl;
2906   pPrevPg = 0;
2907   ovfl = SWAB32(pBtTo, pCell->ovfl);
2908   while( ovfl && rc==SQLITE_OK ){
2909     rc = sqlitepager_get(pFromPager, ovfl, (void**)&pOvfl);
2910     if( rc ) return rc;
2911     nextOvfl = SWAB32(pBtFrom, pOvfl->iNext);
2912     rc = allocatePage(pBtTo, &pNew, &new, 0);
2913     if( rc==SQLITE_OK ){
2914       rc = sqlitepager_write(pNew);
2915       if( rc==SQLITE_OK ){
2916         memcpy(pNew, pOvfl, SQLITE_USABLE_SIZE);
2917         *pPrev = SWAB32(pBtTo, new);
2918         if( pPrevPg ){
2919           sqlitepager_unref(pPrevPg);
2920         }
2921         pPrev = &pOvfl->iNext;
2922         pPrevPg = pNew;
2923       }
2924     }
2925     sqlitepager_unref(pOvfl);
2926     ovfl = nextOvfl;
2927   }
2928   if( pPrevPg ){
2929     sqlitepager_unref(pPrevPg);
2930   }
2931   return rc;
2932 }
2933 #endif
2934 
2935 
2936 #if 0 /* UNTESTED */
2937 /*
2938 ** Copy a page of data from one database over to another.
2939 */
2940 static int copyDatabasePage(
2941   Btree *pBtFrom,
2942   Pgno pgnoFrom,
2943   Btree *pBtTo,
2944   Pgno *pTo
2945 ){
2946   MemPage *pPageFrom, *pPage;
2947   Pgno to;
2948   int rc;
2949   Cell *pCell;
2950   int idx;
2951 
2952   rc = sqlitepager_get(pBtFrom->pPager, pgno, (void**)&pPageFrom);
2953   if( rc ) return rc;
2954   rc = allocatePage(pBt, &pPage, pTo, 0);
2955   if( rc==SQLITE_OK ){
2956     rc = sqlitepager_write(pPage);
2957   }
2958   if( rc==SQLITE_OK ){
2959     memcpy(pPage, pPageFrom, SQLITE_USABLE_SIZE);
2960     idx = SWAB16(pBt, pPage->u.hdr.firstCell);
2961     while( idx>0 ){
2962       pCell = (Cell*)&pPage->u.aDisk[idx];
2963       idx = SWAB16(pBt, pCell->h.iNext);
2964       if( pCell->h.leftChild ){
2965         Pgno newChld;
2966         rc = copyDatabasePage(pBtFrom, SWAB32(pBtFrom, pCell->h.leftChild),
2967                               pBtTo, &newChld);
2968         if( rc ) return rc;
2969         pCell->h.leftChild = SWAB32(pBtFrom, newChld);
2970       }
2971       rc = copyCell(pBtFrom, pBtTo, pCell);
2972       if( rc ) return rc;
2973     }
2974     if( pPage->u.hdr.rightChild ){
2975       Pgno newChld;
2976       rc = copyDatabasePage(pBtFrom, SWAB32(pBtFrom, pPage->u.hdr.rightChild),
2977                             pBtTo, &newChld);
2978       if( rc ) return rc;
2979       pPage->u.hdr.rightChild = SWAB32(pBtTo, newChild);
2980     }
2981   }
2982   sqlitepager_unref(pPage);
2983   return rc;
2984 }
2985 #endif
2986 
2987 /*
2988 ** Read the meta-information out of a database file.
2989 */
fileBtreeGetMeta(Btree * pBt,int * aMeta)2990 static int fileBtreeGetMeta(Btree *pBt, int *aMeta){
2991   PageOne *pP1;
2992   int rc;
2993   int i;
2994 
2995   rc = sqlitepager_get(pBt->pPager, 1, (void**)&pP1);
2996   if( rc ) return rc;
2997   aMeta[0] = SWAB32(pBt, pP1->nFree);
2998   for(i=0; i<sizeof(pP1->aMeta)/sizeof(pP1->aMeta[0]); i++){
2999     aMeta[i+1] = SWAB32(pBt, pP1->aMeta[i]);
3000   }
3001   sqlitepager_unref(pP1);
3002   return SQLITE_OK;
3003 }
3004 
3005 /*
3006 ** Write meta-information back into the database.
3007 */
fileBtreeUpdateMeta(Btree * pBt,int * aMeta)3008 static int fileBtreeUpdateMeta(Btree *pBt, int *aMeta){
3009   PageOne *pP1;
3010   int rc, i;
3011   if( !pBt->inTrans ){
3012     return pBt->readOnly ? SQLITE_READONLY : SQLITE_ERROR;
3013   }
3014   pP1 = pBt->page1;
3015   rc = sqlitepager_write(pP1);
3016   if( rc ) return rc;
3017   for(i=0; i<sizeof(pP1->aMeta)/sizeof(pP1->aMeta[0]); i++){
3018     pP1->aMeta[i] = SWAB32(pBt, aMeta[i+1]);
3019   }
3020   return SQLITE_OK;
3021 }
3022 
3023 /******************************************************************************
3024 ** The complete implementation of the BTree subsystem is above this line.
3025 ** All the code the follows is for testing and troubleshooting the BTree
3026 ** subsystem.  None of the code that follows is used during normal operation.
3027 ******************************************************************************/
3028 
3029 /*
3030 ** Print a disassembly of the given page on standard output.  This routine
3031 ** is used for debugging and testing only.
3032 */
3033 #ifdef SQLITE_TEST
fileBtreePageDump(Btree * pBt,int pgno,int recursive)3034 static int fileBtreePageDump(Btree *pBt, int pgno, int recursive){
3035   int rc;
3036   MemPage *pPage;
3037   int i, j;
3038   int nFree;
3039   u16 idx;
3040   char range[20];
3041   unsigned char payload[20];
3042   rc = sqlitepager_get(pBt->pPager, (Pgno)pgno, (void**)&pPage);
3043   if( rc ){
3044     return rc;
3045   }
3046   if( recursive ) printf("PAGE %d:\n", pgno);
3047   i = 0;
3048   idx = SWAB16(pBt, pPage->u.hdr.firstCell);
3049   while( idx>0 && idx<=SQLITE_USABLE_SIZE-MIN_CELL_SIZE ){
3050     Cell *pCell = (Cell*)&pPage->u.aDisk[idx];
3051     int sz = cellSize(pBt, pCell);
3052     sprintf(range,"%d..%d", idx, idx+sz-1);
3053     sz = NKEY(pBt, pCell->h) + NDATA(pBt, pCell->h);
3054     if( sz>sizeof(payload)-1 ) sz = sizeof(payload)-1;
3055     memcpy(payload, pCell->aPayload, sz);
3056     for(j=0; j<sz; j++){
3057       if( payload[j]<0x20 || payload[j]>0x7f ) payload[j] = '.';
3058     }
3059     payload[sz] = 0;
3060     printf(
3061       "cell %2d: i=%-10s chld=%-4d nk=%-4d nd=%-4d payload=%s\n",
3062       i, range, (int)pCell->h.leftChild,
3063       NKEY(pBt, pCell->h), NDATA(pBt, pCell->h),
3064       payload
3065     );
3066     if( pPage->isInit && pPage->apCell[i]!=pCell ){
3067       printf("**** apCell[%d] does not match on prior entry ****\n", i);
3068     }
3069     i++;
3070     idx = SWAB16(pBt, pCell->h.iNext);
3071   }
3072   if( idx!=0 ){
3073     printf("ERROR: next cell index out of range: %d\n", idx);
3074   }
3075   printf("right_child: %d\n", SWAB32(pBt, pPage->u.hdr.rightChild));
3076   nFree = 0;
3077   i = 0;
3078   idx = SWAB16(pBt, pPage->u.hdr.firstFree);
3079   while( idx>0 && idx<SQLITE_USABLE_SIZE ){
3080     FreeBlk *p = (FreeBlk*)&pPage->u.aDisk[idx];
3081     sprintf(range,"%d..%d", idx, idx+p->iSize-1);
3082     nFree += SWAB16(pBt, p->iSize);
3083     printf("freeblock %2d: i=%-10s size=%-4d total=%d\n",
3084        i, range, SWAB16(pBt, p->iSize), nFree);
3085     idx = SWAB16(pBt, p->iNext);
3086     i++;
3087   }
3088   if( idx!=0 ){
3089     printf("ERROR: next freeblock index out of range: %d\n", idx);
3090   }
3091   if( recursive && pPage->u.hdr.rightChild!=0 ){
3092     idx = SWAB16(pBt, pPage->u.hdr.firstCell);
3093     while( idx>0 && idx<SQLITE_USABLE_SIZE-MIN_CELL_SIZE ){
3094       Cell *pCell = (Cell*)&pPage->u.aDisk[idx];
3095       fileBtreePageDump(pBt, SWAB32(pBt, pCell->h.leftChild), 1);
3096       idx = SWAB16(pBt, pCell->h.iNext);
3097     }
3098     fileBtreePageDump(pBt, SWAB32(pBt, pPage->u.hdr.rightChild), 1);
3099   }
3100   sqlitepager_unref(pPage);
3101   return SQLITE_OK;
3102 }
3103 #endif
3104 
3105 #ifdef SQLITE_TEST
3106 /*
3107 ** Fill aResult[] with information about the entry and page that the
3108 ** cursor is pointing to.
3109 **
3110 **   aResult[0] =  The page number
3111 **   aResult[1] =  The entry number
3112 **   aResult[2] =  Total number of entries on this page
3113 **   aResult[3] =  Size of this entry
3114 **   aResult[4] =  Number of free bytes on this page
3115 **   aResult[5] =  Number of free blocks on the page
3116 **   aResult[6] =  Page number of the left child of this entry
3117 **   aResult[7] =  Page number of the right child for the whole page
3118 **
3119 ** This routine is used for testing and debugging only.
3120 */
fileBtreeCursorDump(BtCursor * pCur,int * aResult)3121 static int fileBtreeCursorDump(BtCursor *pCur, int *aResult){
3122   int cnt, idx;
3123   MemPage *pPage = pCur->pPage;
3124   Btree *pBt = pCur->pBt;
3125   aResult[0] = sqlitepager_pagenumber(pPage);
3126   aResult[1] = pCur->idx;
3127   aResult[2] = pPage->nCell;
3128   if( pCur->idx>=0 && pCur->idx<pPage->nCell ){
3129     aResult[3] = cellSize(pBt, pPage->apCell[pCur->idx]);
3130     aResult[6] = SWAB32(pBt, pPage->apCell[pCur->idx]->h.leftChild);
3131   }else{
3132     aResult[3] = 0;
3133     aResult[6] = 0;
3134   }
3135   aResult[4] = pPage->nFree;
3136   cnt = 0;
3137   idx = SWAB16(pBt, pPage->u.hdr.firstFree);
3138   while( idx>0 && idx<SQLITE_USABLE_SIZE ){
3139     cnt++;
3140     idx = SWAB16(pBt, ((FreeBlk*)&pPage->u.aDisk[idx])->iNext);
3141   }
3142   aResult[5] = cnt;
3143   aResult[7] = SWAB32(pBt, pPage->u.hdr.rightChild);
3144   return SQLITE_OK;
3145 }
3146 #endif
3147 
3148 /*
3149 ** Return the pager associated with a BTree.  This routine is used for
3150 ** testing and debugging only.
3151 */
fileBtreePager(Btree * pBt)3152 static Pager *fileBtreePager(Btree *pBt){
3153   return pBt->pPager;
3154 }
3155 
3156 /*
3157 ** This structure is passed around through all the sanity checking routines
3158 ** in order to keep track of some global state information.
3159 */
3160 typedef struct IntegrityCk IntegrityCk;
3161 struct IntegrityCk {
3162   Btree *pBt;    /* The tree being checked out */
3163   Pager *pPager; /* The associated pager.  Also accessible by pBt->pPager */
3164   int nPage;     /* Number of pages in the database */
3165   int *anRef;    /* Number of times each page is referenced */
3166   char *zErrMsg; /* An error message.  NULL of no errors seen. */
3167 };
3168 
3169 /*
3170 ** Append a message to the error message string.
3171 */
checkAppendMsg(IntegrityCk * pCheck,char * zMsg1,char * zMsg2)3172 static void checkAppendMsg(IntegrityCk *pCheck, char *zMsg1, char *zMsg2){
3173   if( pCheck->zErrMsg ){
3174     char *zOld = pCheck->zErrMsg;
3175     pCheck->zErrMsg = 0;
3176     sqliteSetString(&pCheck->zErrMsg, zOld, "\n", zMsg1, zMsg2, (char*)0);
3177     sqliteFree(zOld);
3178   }else{
3179     sqliteSetString(&pCheck->zErrMsg, zMsg1, zMsg2, (char*)0);
3180   }
3181 }
3182 
3183 /*
3184 ** Add 1 to the reference count for page iPage.  If this is the second
3185 ** reference to the page, add an error message to pCheck->zErrMsg.
3186 ** Return 1 if there are 2 ore more references to the page and 0 if
3187 ** if this is the first reference to the page.
3188 **
3189 ** Also check that the page number is in bounds.
3190 */
checkRef(IntegrityCk * pCheck,int iPage,char * zContext)3191 static int checkRef(IntegrityCk *pCheck, int iPage, char *zContext){
3192   if( iPage==0 ) return 1;
3193   if( iPage>pCheck->nPage || iPage<0 ){
3194     char zBuf[100];
3195     sprintf(zBuf, "invalid page number %d", iPage);
3196     checkAppendMsg(pCheck, zContext, zBuf);
3197     return 1;
3198   }
3199   if( pCheck->anRef[iPage]==1 ){
3200     char zBuf[100];
3201     sprintf(zBuf, "2nd reference to page %d", iPage);
3202     checkAppendMsg(pCheck, zContext, zBuf);
3203     return 1;
3204   }
3205   return  (pCheck->anRef[iPage]++)>1;
3206 }
3207 
3208 /*
3209 ** Check the integrity of the freelist or of an overflow page list.
3210 ** Verify that the number of pages on the list is N.
3211 */
checkList(IntegrityCk * pCheck,int isFreeList,int iPage,int N,char * zContext)3212 static void checkList(
3213   IntegrityCk *pCheck,  /* Integrity checking context */
3214   int isFreeList,       /* True for a freelist.  False for overflow page list */
3215   int iPage,            /* Page number for first page in the list */
3216   int N,                /* Expected number of pages in the list */
3217   char *zContext        /* Context for error messages */
3218 ){
3219   int i;
3220   char zMsg[100];
3221   while( N-- > 0 ){
3222     OverflowPage *pOvfl;
3223     if( iPage<1 ){
3224       sprintf(zMsg, "%d pages missing from overflow list", N+1);
3225       checkAppendMsg(pCheck, zContext, zMsg);
3226       break;
3227     }
3228     if( checkRef(pCheck, iPage, zContext) ) break;
3229     if( sqlitepager_get(pCheck->pPager, (Pgno)iPage, (void**)&pOvfl) ){
3230       sprintf(zMsg, "failed to get page %d", iPage);
3231       checkAppendMsg(pCheck, zContext, zMsg);
3232       break;
3233     }
3234     if( isFreeList ){
3235       FreelistInfo *pInfo = (FreelistInfo*)pOvfl->aPayload;
3236       int n = SWAB32(pCheck->pBt, pInfo->nFree);
3237       for(i=0; i<n; i++){
3238         checkRef(pCheck, SWAB32(pCheck->pBt, pInfo->aFree[i]), zContext);
3239       }
3240       N -= n;
3241     }
3242     iPage = SWAB32(pCheck->pBt, pOvfl->iNext);
3243     sqlitepager_unref(pOvfl);
3244   }
3245 }
3246 
3247 /*
3248 ** Return negative if zKey1<zKey2.
3249 ** Return zero if zKey1==zKey2.
3250 ** Return positive if zKey1>zKey2.
3251 */
keyCompare(const char * zKey1,int nKey1,const char * zKey2,int nKey2)3252 static int keyCompare(
3253   const char *zKey1, int nKey1,
3254   const char *zKey2, int nKey2
3255 ){
3256   int min = nKey1>nKey2 ? nKey2 : nKey1;
3257   int c = memcmp(zKey1, zKey2, min);
3258   if( c==0 ){
3259     c = nKey1 - nKey2;
3260   }
3261   return c;
3262 }
3263 
3264 /*
3265 ** Do various sanity checks on a single page of a tree.  Return
3266 ** the tree depth.  Root pages return 0.  Parents of root pages
3267 ** return 1, and so forth.
3268 **
3269 ** These checks are done:
3270 **
3271 **      1.  Make sure that cells and freeblocks do not overlap
3272 **          but combine to completely cover the page.
3273 **      2.  Make sure cell keys are in order.
3274 **      3.  Make sure no key is less than or equal to zLowerBound.
3275 **      4.  Make sure no key is greater than or equal to zUpperBound.
3276 **      5.  Check the integrity of overflow pages.
3277 **      6.  Recursively call checkTreePage on all children.
3278 **      7.  Verify that the depth of all children is the same.
3279 **      8.  Make sure this page is at least 33% full or else it is
3280 **          the root of the tree.
3281 */
checkTreePage(IntegrityCk * pCheck,int iPage,MemPage * pParent,char * zParentContext,char * zLowerBound,int nLower,char * zUpperBound,int nUpper)3282 static int checkTreePage(
3283   IntegrityCk *pCheck,  /* Context for the sanity check */
3284   int iPage,            /* Page number of the page to check */
3285   MemPage *pParent,     /* Parent page */
3286   char *zParentContext, /* Parent context */
3287   char *zLowerBound,    /* All keys should be greater than this, if not NULL */
3288   int nLower,           /* Number of characters in zLowerBound */
3289   char *zUpperBound,    /* All keys should be less than this, if not NULL */
3290   int nUpper            /* Number of characters in zUpperBound */
3291 ){
3292   MemPage *pPage;
3293   int i, rc, depth, d2, pgno;
3294   char *zKey1, *zKey2;
3295   int nKey1, nKey2;
3296   BtCursor cur;
3297   Btree *pBt;
3298   char zMsg[100];
3299   char zContext[100];
3300   char hit[SQLITE_USABLE_SIZE];
3301 
3302   /* Check that the page exists
3303   */
3304   cur.pBt = pBt = pCheck->pBt;
3305   if( iPage==0 ) return 0;
3306   if( checkRef(pCheck, iPage, zParentContext) ) return 0;
3307   sprintf(zContext, "On tree page %d: ", iPage);
3308   if( (rc = sqlitepager_get(pCheck->pPager, (Pgno)iPage, (void**)&pPage))!=0 ){
3309     sprintf(zMsg, "unable to get the page. error code=%d", rc);
3310     checkAppendMsg(pCheck, zContext, zMsg);
3311     return 0;
3312   }
3313   if( (rc = initPage(pBt, pPage, (Pgno)iPage, pParent))!=0 ){
3314     sprintf(zMsg, "initPage() returns error code %d", rc);
3315     checkAppendMsg(pCheck, zContext, zMsg);
3316     sqlitepager_unref(pPage);
3317     return 0;
3318   }
3319 
3320   /* Check out all the cells.
3321   */
3322   depth = 0;
3323   if( zLowerBound ){
3324     zKey1 = sqliteMalloc( nLower+1 );
3325     memcpy(zKey1, zLowerBound, nLower);
3326     zKey1[nLower] = 0;
3327   }else{
3328     zKey1 = 0;
3329   }
3330   nKey1 = nLower;
3331   cur.pPage = pPage;
3332   for(i=0; i<pPage->nCell; i++){
3333     Cell *pCell = pPage->apCell[i];
3334     int sz;
3335 
3336     /* Check payload overflow pages
3337     */
3338     nKey2 = NKEY(pBt, pCell->h);
3339     sz = nKey2 + NDATA(pBt, pCell->h);
3340     sprintf(zContext, "On page %d cell %d: ", iPage, i);
3341     if( sz>MX_LOCAL_PAYLOAD ){
3342       int nPage = (sz - MX_LOCAL_PAYLOAD + OVERFLOW_SIZE - 1)/OVERFLOW_SIZE;
3343       checkList(pCheck, 0, SWAB32(pBt, pCell->ovfl), nPage, zContext);
3344     }
3345 
3346     /* Check that keys are in the right order
3347     */
3348     cur.idx = i;
3349     zKey2 = sqliteMallocRaw( nKey2+1 );
3350     getPayload(&cur, 0, nKey2, zKey2);
3351     if( zKey1 && keyCompare(zKey1, nKey1, zKey2, nKey2)>=0 ){
3352       checkAppendMsg(pCheck, zContext, "Key is out of order");
3353     }
3354 
3355     /* Check sanity of left child page.
3356     */
3357     pgno = SWAB32(pBt, pCell->h.leftChild);
3358     d2 = checkTreePage(pCheck, pgno, pPage, zContext, zKey1,nKey1,zKey2,nKey2);
3359     if( i>0 && d2!=depth ){
3360       checkAppendMsg(pCheck, zContext, "Child page depth differs");
3361     }
3362     depth = d2;
3363     sqliteFree(zKey1);
3364     zKey1 = zKey2;
3365     nKey1 = nKey2;
3366   }
3367   pgno = SWAB32(pBt, pPage->u.hdr.rightChild);
3368   sprintf(zContext, "On page %d at right child: ", iPage);
3369   checkTreePage(pCheck, pgno, pPage, zContext, zKey1,nKey1,zUpperBound,nUpper);
3370   sqliteFree(zKey1);
3371 
3372   /* Check for complete coverage of the page
3373   */
3374   memset(hit, 0, sizeof(hit));
3375   memset(hit, 1, sizeof(PageHdr));
3376   for(i=SWAB16(pBt, pPage->u.hdr.firstCell); i>0 && i<SQLITE_USABLE_SIZE; ){
3377     Cell *pCell = (Cell*)&pPage->u.aDisk[i];
3378     int j;
3379     for(j=i+cellSize(pBt, pCell)-1; j>=i; j--) hit[j]++;
3380     i = SWAB16(pBt, pCell->h.iNext);
3381   }
3382   for(i=SWAB16(pBt,pPage->u.hdr.firstFree); i>0 && i<SQLITE_USABLE_SIZE; ){
3383     FreeBlk *pFBlk = (FreeBlk*)&pPage->u.aDisk[i];
3384     int j;
3385     for(j=i+SWAB16(pBt,pFBlk->iSize)-1; j>=i; j--) hit[j]++;
3386     i = SWAB16(pBt,pFBlk->iNext);
3387   }
3388   for(i=0; i<SQLITE_USABLE_SIZE; i++){
3389     if( hit[i]==0 ){
3390       sprintf(zMsg, "Unused space at byte %d of page %d", i, iPage);
3391       checkAppendMsg(pCheck, zMsg, 0);
3392       break;
3393     }else if( hit[i]>1 ){
3394       sprintf(zMsg, "Multiple uses for byte %d of page %d", i, iPage);
3395       checkAppendMsg(pCheck, zMsg, 0);
3396       break;
3397     }
3398   }
3399 
3400   /* Check that free space is kept to a minimum
3401   */
3402 #if 0
3403   if( pParent && pParent->nCell>2 && pPage->nFree>3*SQLITE_USABLE_SIZE/4 ){
3404     sprintf(zMsg, "free space (%d) greater than max (%d)", pPage->nFree,
3405        SQLITE_USABLE_SIZE/3);
3406     checkAppendMsg(pCheck, zContext, zMsg);
3407   }
3408 #endif
3409 
3410   sqlitepager_unref(pPage);
3411   return depth;
3412 }
3413 
3414 /*
3415 ** This routine does a complete check of the given BTree file.  aRoot[] is
3416 ** an array of pages numbers were each page number is the root page of
3417 ** a table.  nRoot is the number of entries in aRoot.
3418 **
3419 ** If everything checks out, this routine returns NULL.  If something is
3420 ** amiss, an error message is written into memory obtained from malloc()
3421 ** and a pointer to that error message is returned.  The calling function
3422 ** is responsible for freeing the error message when it is done.
3423 */
fileBtreeIntegrityCheck(Btree * pBt,int * aRoot,int nRoot)3424 char *fileBtreeIntegrityCheck(Btree *pBt, int *aRoot, int nRoot){
3425   int i;
3426   int nRef;
3427   IntegrityCk sCheck;
3428 
3429   nRef = *sqlitepager_stats(pBt->pPager);
3430   if( lockBtree(pBt)!=SQLITE_OK ){
3431     return sqliteStrDup("Unable to acquire a read lock on the database");
3432   }
3433   sCheck.pBt = pBt;
3434   sCheck.pPager = pBt->pPager;
3435   sCheck.nPage = sqlitepager_pagecount(sCheck.pPager);
3436   if( sCheck.nPage==0 ){
3437     unlockBtreeIfUnused(pBt);
3438     return 0;
3439   }
3440   sCheck.anRef = sqliteMallocRaw( (sCheck.nPage+1)*sizeof(sCheck.anRef[0]) );
3441   sCheck.anRef[1] = 1;
3442   for(i=2; i<=sCheck.nPage; i++){ sCheck.anRef[i] = 0; }
3443   sCheck.zErrMsg = 0;
3444 
3445   /* Check the integrity of the freelist
3446   */
3447   checkList(&sCheck, 1, SWAB32(pBt, pBt->page1->freeList),
3448             SWAB32(pBt, pBt->page1->nFree), "Main freelist: ");
3449 
3450   /* Check all the tables.
3451   */
3452   for(i=0; i<nRoot; i++){
3453     if( aRoot[i]==0 ) continue;
3454     checkTreePage(&sCheck, aRoot[i], 0, "List of tree roots: ", 0,0,0,0);
3455   }
3456 
3457   /* Make sure every page in the file is referenced
3458   */
3459   for(i=1; i<=sCheck.nPage; i++){
3460     if( sCheck.anRef[i]==0 ){
3461       char zBuf[100];
3462       sprintf(zBuf, "Page %d is never used", i);
3463       checkAppendMsg(&sCheck, zBuf, 0);
3464     }
3465   }
3466 
3467   /* Make sure this analysis did not leave any unref() pages
3468   */
3469   unlockBtreeIfUnused(pBt);
3470   if( nRef != *sqlitepager_stats(pBt->pPager) ){
3471     char zBuf[100];
3472     sprintf(zBuf,
3473       "Outstanding page count goes from %d to %d during this analysis",
3474       nRef, *sqlitepager_stats(pBt->pPager)
3475     );
3476     checkAppendMsg(&sCheck, zBuf, 0);
3477   }
3478 
3479   /* Clean  up and report errors.
3480   */
3481   sqliteFree(sCheck.anRef);
3482   return sCheck.zErrMsg;
3483 }
3484 
3485 /*
3486 ** Return the full pathname of the underlying database file.
3487 */
fileBtreeGetFilename(Btree * pBt)3488 static const char *fileBtreeGetFilename(Btree *pBt){
3489   assert( pBt->pPager!=0 );
3490   return sqlitepager_filename(pBt->pPager);
3491 }
3492 
3493 /*
3494 ** Copy the complete content of pBtFrom into pBtTo.  A transaction
3495 ** must be active for both files.
3496 **
3497 ** The size of file pBtFrom may be reduced by this operation.
3498 ** If anything goes wrong, the transaction on pBtFrom is rolled back.
3499 */
fileBtreeCopyFile(Btree * pBtTo,Btree * pBtFrom)3500 static int fileBtreeCopyFile(Btree *pBtTo, Btree *pBtFrom){
3501   int rc = SQLITE_OK;
3502   Pgno i, nPage, nToPage;
3503 
3504   if( !pBtTo->inTrans || !pBtFrom->inTrans ) return SQLITE_ERROR;
3505   if( pBtTo->needSwab!=pBtFrom->needSwab ) return SQLITE_ERROR;
3506   if( pBtTo->pCursor ) return SQLITE_BUSY;
3507   memcpy(pBtTo->page1, pBtFrom->page1, SQLITE_USABLE_SIZE);
3508   rc = sqlitepager_overwrite(pBtTo->pPager, 1, pBtFrom->page1);
3509   nToPage = sqlitepager_pagecount(pBtTo->pPager);
3510   nPage = sqlitepager_pagecount(pBtFrom->pPager);
3511   for(i=2; rc==SQLITE_OK && i<=nPage; i++){
3512     void *pPage;
3513     rc = sqlitepager_get(pBtFrom->pPager, i, &pPage);
3514     if( rc ) break;
3515     rc = sqlitepager_overwrite(pBtTo->pPager, i, pPage);
3516     if( rc ) break;
3517     sqlitepager_unref(pPage);
3518   }
3519   for(i=nPage+1; rc==SQLITE_OK && i<=nToPage; i++){
3520     void *pPage;
3521     rc = sqlitepager_get(pBtTo->pPager, i, &pPage);
3522     if( rc ) break;
3523     rc = sqlitepager_write(pPage);
3524     sqlitepager_unref(pPage);
3525     sqlitepager_dont_write(pBtTo->pPager, i);
3526   }
3527   if( !rc && nPage<nToPage ){
3528     rc = sqlitepager_truncate(pBtTo->pPager, nPage);
3529   }
3530   if( rc ){
3531     fileBtreeRollback(pBtTo);
3532   }
3533   return rc;
3534 }
3535 
3536 /*
3537 ** The following tables contain pointers to all of the interface
3538 ** routines for this implementation of the B*Tree backend.  To
3539 ** substitute a different implemention of the backend, one has merely
3540 ** to provide pointers to alternative functions in similar tables.
3541 */
3542 static BtOps sqliteBtreeOps = {
3543     fileBtreeClose,
3544     fileBtreeSetCacheSize,
3545     fileBtreeSetSafetyLevel,
3546     fileBtreeBeginTrans,
3547     fileBtreeCommit,
3548     fileBtreeRollback,
3549     fileBtreeBeginCkpt,
3550     fileBtreeCommitCkpt,
3551     fileBtreeRollbackCkpt,
3552     fileBtreeCreateTable,
3553     fileBtreeCreateTable,  /* Really sqliteBtreeCreateIndex() */
3554     fileBtreeDropTable,
3555     fileBtreeClearTable,
3556     fileBtreeCursor,
3557     fileBtreeGetMeta,
3558     fileBtreeUpdateMeta,
3559     fileBtreeIntegrityCheck,
3560     fileBtreeGetFilename,
3561     fileBtreeCopyFile,
3562     fileBtreePager,
3563 #ifdef SQLITE_TEST
3564     fileBtreePageDump,
3565 #endif
3566 };
3567 static BtCursorOps sqliteBtreeCursorOps = {
3568     fileBtreeMoveto,
3569     fileBtreeDelete,
3570     fileBtreeInsert,
3571     fileBtreeFirst,
3572     fileBtreeLast,
3573     fileBtreeNext,
3574     fileBtreePrevious,
3575     fileBtreeKeySize,
3576     fileBtreeKey,
3577     fileBtreeKeyCompare,
3578     fileBtreeDataSize,
3579     fileBtreeData,
3580     fileBtreeCloseCursor,
3581 #ifdef SQLITE_TEST
3582     fileBtreeCursorDump,
3583 #endif
3584 };
3585