1 /*
2 ** 2004 April 6
3 **
4 ** The author disclaims copyright to this source code.  In place of
5 ** a legal notice, here is a blessing:
6 **
7 **    May you do good and not evil.
8 **    May you find forgiveness for yourself and forgive others.
9 **    May you share freely, never taking more than you give.
10 **
11 *************************************************************************
12 ** This file implements an external (disk-based) database using BTrees.
** See the header comment on "btreeInt.h" for additional information,
** including a description of the file format and an overview of operation.
15 */
16 #include "btreeInt.h"
17 
18 /*
19 ** The header string that appears at the beginning of every
20 ** SQLite database.
21 */
22 static const char zMagicHeader[] = SQLITE_FILE_HEADER;
23 
24 /*
25 ** Set this global variable to 1 to enable tracing using the TRACE
26 ** macro.
27 */
28 #if 0
29 int sqlite3BtreeTrace=1;  /* True to enable tracing */
30 # define TRACE(X)  if(sqlite3BtreeTrace){printf X;fflush(stdout);}
31 #else
32 # define TRACE(X)
33 #endif
34 
/*
** Read a 2-byte big-endian integer from the unsigned byte array X, mapping
** a stored value of zero to 65536.
**
** The "offset to cell content area" field in a btree page header is read
** with this macro.  On an empty page of a 65536-byte database the true
** offset is 65536, which does not fit in two bytes and so is stored as
** zero.  Adding 0xffff and masking to 16 bits is the same as subtracting
** one modulo 65536: values 1..65535 become 0..65534 and zero becomes
** 65535.  The final +1 then yields 1..65535 unchanged and 65536 for zero.
*/
#define get2byteNotZero(X)  (1+((((int)get2byte(X))+0xffff)&0xffff))
45 
46 /*
47 ** Values passed as the 5th argument to allocateBtreePage()
48 */
49 #define BTALLOC_ANY   0           /* Allocate any page */
50 #define BTALLOC_EXACT 1           /* Allocate exact page if possible */
51 #define BTALLOC_LE    2           /* Allocate any page <= the parameter */
52 
53 /*
54 ** Macro IfNotOmitAV(x) returns (x) if SQLITE_OMIT_AUTOVACUUM is not
55 ** defined, or 0 if it is. For example:
56 **
57 **   bIncrVacuum = IfNotOmitAV(pBtShared->incrVacuum);
58 */
59 #ifndef SQLITE_OMIT_AUTOVACUUM
60 #define IfNotOmitAV(expr) (expr)
61 #else
62 #define IfNotOmitAV(expr) 0
63 #endif
64 
65 #ifndef SQLITE_OMIT_SHARED_CACHE
66 /*
67 ** A list of BtShared objects that are eligible for participation
68 ** in shared cache.  This variable has file scope during normal builds,
69 ** but the test harness needs to access it so we make it global for
70 ** test builds.
71 **
72 ** Access to this variable is protected by SQLITE_MUTEX_STATIC_MAIN.
73 */
74 #ifdef SQLITE_TEST
75 BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
76 #else
77 static BtShared *SQLITE_WSD sqlite3SharedCacheList = 0;
78 #endif
79 #endif /* SQLITE_OMIT_SHARED_CACHE */
80 
81 #ifndef SQLITE_OMIT_SHARED_CACHE
82 /*
83 ** Enable or disable the shared pager and schema features.
84 **
85 ** This routine has no effect on existing database connections.
86 ** The shared cache setting effects only future calls to
87 ** sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2().
88 */
sqlite3_enable_shared_cache(int enable)89 int sqlite3_enable_shared_cache(int enable){
90   sqlite3GlobalConfig.sharedCacheEnabled = enable;
91   return SQLITE_OK;
92 }
93 #endif
94 
95 
96 
97 #ifdef SQLITE_OMIT_SHARED_CACHE
98   /*
99   ** The functions querySharedCacheTableLock(), setSharedCacheTableLock(),
100   ** and clearAllSharedCacheTableLocks()
101   ** manipulate entries in the BtShared.pLock linked list used to store
102   ** shared-cache table level locks. If the library is compiled with the
103   ** shared-cache feature disabled, then there is only ever one user
104   ** of each BtShared structure and so this locking is not necessary.
105   ** So define the lock related functions as no-ops.
106   */
107   #define querySharedCacheTableLock(a,b,c) SQLITE_OK
108   #define setSharedCacheTableLock(a,b,c) SQLITE_OK
109   #define clearAllSharedCacheTableLocks(a)
110   #define downgradeAllSharedCacheTableLocks(a)
111   #define hasSharedCacheTableLock(a,b,c,d) 1
112   #define hasReadConflicts(a, b) 0
113 #endif
114 
115 #ifdef SQLITE_DEBUG
116 /*
117 ** Return and reset the seek counter for a Btree object.
118 */
sqlite3BtreeSeekCount(Btree * pBt)119 sqlite3_uint64 sqlite3BtreeSeekCount(Btree *pBt){
120   u64 n =  pBt->nSeek;
121   pBt->nSeek = 0;
122   return n;
123 }
124 #endif
125 
126 /*
127 ** Implementation of the SQLITE_CORRUPT_PAGE() macro. Takes a single
128 ** (MemPage*) as an argument. The (MemPage*) must not be NULL.
129 **
130 ** If SQLITE_DEBUG is not defined, then this macro is equivalent to
131 ** SQLITE_CORRUPT_BKPT. Or, if SQLITE_DEBUG is set, then the log message
132 ** normally produced as a side-effect of SQLITE_CORRUPT_BKPT is augmented
133 ** with the page number and filename associated with the (MemPage*).
134 */
135 #ifdef SQLITE_DEBUG
int corruptPageError(int lineno, MemPage *p){
  char *zReport;     /* Formatted text of the corruption report */

  /* The message is built inside a benign-malloc section.  If the
  ** allocation fails, the report is simply omitted. */
  sqlite3BeginBenignMalloc();
  zReport = sqlite3_mprintf("database corruption page %d of %s",
      (int)p->pgno, sqlite3PagerFilename(p->pBt->pPager, 0)
  );
  sqlite3EndBenignMalloc();

  if( zReport!=0 ){
    sqlite3ReportError(SQLITE_CORRUPT, lineno, zReport);
    sqlite3_free(zReport);
  }
  return SQLITE_CORRUPT_BKPT;
}
149 # define SQLITE_CORRUPT_PAGE(pMemPage) corruptPageError(__LINE__, pMemPage)
150 #else
151 # define SQLITE_CORRUPT_PAGE(pMemPage) SQLITE_CORRUPT_PGNO(pMemPage->pgno)
152 #endif
153 
154 #ifndef SQLITE_OMIT_SHARED_CACHE
155 
156 #ifdef SQLITE_DEBUG
157 /*
158 **** This function is only used as part of an assert() statement. ***
159 **
160 ** Check to see if pBtree holds the required locks to read or write to the
161 ** table with root page iRoot.   Return 1 if it does and 0 if not.
162 **
163 ** For example, when writing to a table with root-page iRoot via
164 ** Btree connection pBtree:
165 **
166 **    assert( hasSharedCacheTableLock(pBtree, iRoot, 0, WRITE_LOCK) );
167 **
168 ** When writing to an index that resides in a sharable database, the
169 ** caller should have first obtained a lock specifying the root page of
170 ** the corresponding table. This makes things a bit more complicated,
171 ** as this module treats each table as a separate structure. To determine
172 ** the table corresponding to the index being written, this
173 ** function has to search through the database schema.
174 **
175 ** Instead of a lock on the table/index rooted at page iRoot, the caller may
176 ** hold a write-lock on the schema table (root page 1). This is also
177 ** acceptable.
178 */
static int hasSharedCacheTableLock(
  Btree *pBtree,         /* Handle that must hold lock */
  Pgno iRoot,            /* Root page of b-tree */
  int isIndex,           /* True if iRoot is the root of an index b-tree */
  int eLockType          /* Required lock type (READ_LOCK or WRITE_LOCK) */
){
  Schema *pSchema = (Schema *)pBtree->pBt->pSchema;
  Pgno iTab = 0;         /* Root page on which the lock must be held */
  BtLock *pLock;         /* Used to walk the BtShared.pLock list */

  /* A database that is not sharable never needs a table lock. */
  if( pBtree->sharable==0 ) return 1;

  /* A reader with the read-uncommitted flag set needs no lock either. */
  if( eLockType==READ_LOCK
   && (pBtree->db->flags & SQLITE_ReadUncommit)!=0
  ){
    return 1;
  }

  if( !isIndex ){
    /* For a table b-tree the lock is held on the b-tree's own root. */
    iTab = iRoot;
  }else{
    HashElem *pElem;     /* Iterator over the schema's index hash */
    int nMatch = 0;      /* Becomes 1 once an index rooted at iRoot is seen */

    /* Without a loaded schema the owning table cannot be determined.
    ** Checking is too difficult in that case, so report success. */
    if( pSchema==0 || (pSchema->schemaFlags & DB_SchemaLoaded)==0 ){
      return 1;
    }

    /* The lock for an index is held on the root page of its table.
    ** Scan the schema for the index whose root page is iRoot. */
    for(pElem=sqliteHashFirst(&pSchema->idxHash);
        pElem;
        pElem=sqliteHashNext(pElem)
    ){
      Index *pIdx = (Index *)sqliteHashData(pElem);
      if( pIdx->tnum!=(int)iRoot ) continue;
      if( nMatch ){
        /* More than one index uses this root page, so imposter tables
        ** must be present.  The check is not useful then; return true. */
        return 1;
      }
      iTab = pIdx->pTable->tnum;
      nMatch = 1;
    }
  }

  /* Look for a lock owned by pBtree that is at least as strong as
  ** eLockType and that covers either root page iTab or, if it is a
  ** write-lock, the schema table (root page 1). */
  for(pLock=pBtree->pBt->pLock; pLock; pLock=pLock->pNext){
    if( pLock->pBtree!=pBtree ) continue;
    if( pLock->eLock<eLockType ) continue;
    if( pLock->iTable==iTab ) return 1;
    if( pLock->eLock==WRITE_LOCK && pLock->iTable==1 ) return 1;
  }

  /* No suitable lock is held. */
  return 0;
}
247 #endif /* SQLITE_DEBUG */
248 
249 #ifdef SQLITE_DEBUG
250 /*
251 **** This function may be used as part of assert() statements only. ****
252 **
253 ** Return true if it would be illegal for pBtree to write into the
254 ** table or index rooted at iRoot because other shared connections are
255 ** simultaneously reading that same table or index.
256 **
257 ** It is illegal for pBtree to write if some other Btree object that
258 ** shares the same BtShared object is currently reading or writing
259 ** the iRoot table.  Except, if the other Btree object has the
260 ** read-uncommitted flag set, then it is OK for the other object to
261 ** have a read cursor.
262 **
263 ** For example, before writing to any part of the table or index
264 ** rooted at page iRoot, one should call:
265 **
266 **    assert( !hasReadConflicts(pBtree, iRoot) );
267 */
static int hasReadConflicts(Btree *pBtree, Pgno iRoot){
  BtCursor *pCsr = pBtree->pBt->pCursor;   /* Walks every cursor on pBt */
  while( pCsr ){
    /* Only cursors on the same root page that belong to some other
    ** Btree handle can conflict ... */
    if( pCsr->pgnoRoot==iRoot && pCsr->pBtree!=pBtree ){
      /* ... and then only if that handle is not in read-uncommitted mode */
      if( (pCsr->pBtree->db->flags & SQLITE_ReadUncommit)==0 ){
        return 1;
      }
    }
    pCsr = pCsr->pNext;
  }
  return 0;
}
280 #endif    /* #ifdef SQLITE_DEBUG */
281 
282 /*
283 ** Query to see if Btree handle p may obtain a lock of type eLock
284 ** (READ_LOCK or WRITE_LOCK) on the table with root-page iTab. Return
285 ** SQLITE_OK if the lock may be obtained (by calling
286 ** setSharedCacheTableLock()), or SQLITE_LOCKED if not.
287 */
querySharedCacheTableLock(Btree * p,Pgno iTab,u8 eLock)288 static int querySharedCacheTableLock(Btree *p, Pgno iTab, u8 eLock){
289   BtShared *pBt = p->pBt;
290   BtLock *pIter;
291 
292   assert( sqlite3BtreeHoldsMutex(p) );
293   assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
294   assert( p->db!=0 );
295   assert( !(p->db->flags&SQLITE_ReadUncommit)||eLock==WRITE_LOCK||iTab==1 );
296 
297   /* If requesting a write-lock, then the Btree must have an open write
298   ** transaction on this file. And, obviously, for this to be so there
299   ** must be an open write transaction on the file itself.
300   */
301   assert( eLock==READ_LOCK || (p==pBt->pWriter && p->inTrans==TRANS_WRITE) );
302   assert( eLock==READ_LOCK || pBt->inTransaction==TRANS_WRITE );
303 
304   /* This routine is a no-op if the shared-cache is not enabled */
305   if( !p->sharable ){
306     return SQLITE_OK;
307   }
308 
309   /* If some other connection is holding an exclusive lock, the
310   ** requested lock may not be obtained.
311   */
312   if( pBt->pWriter!=p && (pBt->btsFlags & BTS_EXCLUSIVE)!=0 ){
313     sqlite3ConnectionBlocked(p->db, pBt->pWriter->db);
314     return SQLITE_LOCKED_SHAREDCACHE;
315   }
316 
317   for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
318     /* The condition (pIter->eLock!=eLock) in the following if(...)
319     ** statement is a simplification of:
320     **
321     **   (eLock==WRITE_LOCK || pIter->eLock==WRITE_LOCK)
322     **
323     ** since we know that if eLock==WRITE_LOCK, then no other connection
324     ** may hold a WRITE_LOCK on any table in this file (since there can
325     ** only be a single writer).
326     */
327     assert( pIter->eLock==READ_LOCK || pIter->eLock==WRITE_LOCK );
328     assert( eLock==READ_LOCK || pIter->pBtree==p || pIter->eLock==READ_LOCK);
329     if( pIter->pBtree!=p && pIter->iTable==iTab && pIter->eLock!=eLock ){
330       sqlite3ConnectionBlocked(p->db, pIter->pBtree->db);
331       if( eLock==WRITE_LOCK ){
332         assert( p==pBt->pWriter );
333         pBt->btsFlags |= BTS_PENDING;
334       }
335       return SQLITE_LOCKED_SHAREDCACHE;
336     }
337   }
338   return SQLITE_OK;
339 }
340 #endif /* !SQLITE_OMIT_SHARED_CACHE */
341 
342 #ifndef SQLITE_OMIT_SHARED_CACHE
343 /*
344 ** Add a lock on the table with root-page iTable to the shared-btree used
345 ** by Btree handle p. Parameter eLock must be either READ_LOCK or
346 ** WRITE_LOCK.
347 **
348 ** This function assumes the following:
349 **
350 **   (a) The specified Btree object p is connected to a sharable
351 **       database (one with the BtShared.sharable flag set), and
352 **
353 **   (b) No other Btree objects hold a lock that conflicts
354 **       with the requested lock (i.e. querySharedCacheTableLock() has
355 **       already been called and returned SQLITE_OK).
356 **
357 ** SQLITE_OK is returned if the lock is added successfully. SQLITE_NOMEM
358 ** is returned if a malloc attempt fails.
359 */
static int setSharedCacheTableLock(Btree *p, Pgno iTable, u8 eLock){
  BtShared *pBt = p->pBt;
  BtLock *pLock;           /* The lock entry for (p, iTable) */

  assert( sqlite3BtreeHoldsMutex(p) );
  assert( eLock==READ_LOCK || eLock==WRITE_LOCK );
  assert( p->db!=0 );

  /* Connections in read-uncommitted mode never request a read-lock
  ** through this routine.  Their only read-lock is the one on the
  ** sqlite_schema table, which is taken in BtreeBeginTrans().  */
  assert( 0==(p->db->flags&SQLITE_ReadUncommit) || eLock==WRITE_LOCK );

  /* The b-tree must be sharable, and the caller must already have
  ** established that no other b-tree holds a conflicting lock.  */
  assert( p->sharable );
  assert( SQLITE_OK==querySharedCacheTableLock(p, iTable, eLock) );

  /* Scan for a lock entry that already ties Btree p to table iTable. */
  pLock = pBt->pLock;
  while( pLock && (pLock->iTable!=iTable || pLock->pBtree!=p) ){
    pLock = pLock->pNext;
  }

  /* No existing entry: allocate a zeroed one and push it onto the front
  ** of the BtShared.pLock list. */
  if( pLock==0 ){
    pLock = (BtLock *)sqlite3MallocZero(sizeof(BtLock));
    if( pLock==0 ) return SQLITE_NOMEM_BKPT;
    pLock->iTable = iTable;
    pLock->pBtree = p;
    pLock->pNext = pBt->pLock;
    pBt->pLock = pLock;
  }

  /* Only ever strengthen the lock.  A read-lock request made while a
  ** write-lock is already held must leave the write-lock in place. */
  assert( WRITE_LOCK>READ_LOCK );
  if( pLock->eLock<eLock ){
    pLock->eLock = eLock;
  }

  return SQLITE_OK;
}
413 #endif /* !SQLITE_OMIT_SHARED_CACHE */
414 
415 #ifndef SQLITE_OMIT_SHARED_CACHE
416 /*
417 ** Release all the table locks (locks obtained via calls to
418 ** the setSharedCacheTableLock() procedure) held by Btree object p.
419 **
420 ** This function assumes that Btree p has an open read or write
421 ** transaction. If it does not, then the BTS_PENDING flag
422 ** may be incorrectly cleared.
423 */
clearAllSharedCacheTableLocks(Btree * p)424 static void clearAllSharedCacheTableLocks(Btree *p){
425   BtShared *pBt = p->pBt;
426   BtLock **ppIter = &pBt->pLock;
427 
428   assert( sqlite3BtreeHoldsMutex(p) );
429   assert( p->sharable || 0==*ppIter );
430   assert( p->inTrans>0 );
431 
432   while( *ppIter ){
433     BtLock *pLock = *ppIter;
434     assert( (pBt->btsFlags & BTS_EXCLUSIVE)==0 || pBt->pWriter==pLock->pBtree );
435     assert( pLock->pBtree->inTrans>=pLock->eLock );
436     if( pLock->pBtree==p ){
437       *ppIter = pLock->pNext;
438       assert( pLock->iTable!=1 || pLock==&p->lock );
439       if( pLock->iTable!=1 ){
440         sqlite3_free(pLock);
441       }
442     }else{
443       ppIter = &pLock->pNext;
444     }
445   }
446 
447   assert( (pBt->btsFlags & BTS_PENDING)==0 || pBt->pWriter );
448   if( pBt->pWriter==p ){
449     pBt->pWriter = 0;
450     pBt->btsFlags &= ~(BTS_EXCLUSIVE|BTS_PENDING);
451   }else if( pBt->nTransaction==2 ){
452     /* This function is called when Btree p is concluding its
453     ** transaction. If there currently exists a writer, and p is not
454     ** that writer, then the number of locks held by connections other
455     ** than the writer must be about to drop to zero. In this case
456     ** set the BTS_PENDING flag to 0.
457     **
458     ** If there is not currently a writer, then BTS_PENDING must
459     ** be zero already. So this next line is harmless in that case.
460     */
461     pBt->btsFlags &= ~BTS_PENDING;
462   }
463 }
464 
465 /*
466 ** This function changes all write-locks held by Btree p into read-locks.
467 */
downgradeAllSharedCacheTableLocks(Btree * p)468 static void downgradeAllSharedCacheTableLocks(Btree *p){
469   BtShared *pBt = p->pBt;
470   if( pBt->pWriter==p ){
471     BtLock *pLock;
472     pBt->pWriter = 0;
473     pBt->btsFlags &= ~(BTS_EXCLUSIVE|BTS_PENDING);
474     for(pLock=pBt->pLock; pLock; pLock=pLock->pNext){
475       assert( pLock->eLock==READ_LOCK || pLock->pBtree==p );
476       pLock->eLock = READ_LOCK;
477     }
478   }
479 }
480 
481 #endif /* SQLITE_OMIT_SHARED_CACHE */
482 
483 static void releasePage(MemPage *pPage);         /* Forward reference */
484 static void releasePageOne(MemPage *pPage);      /* Forward reference */
485 static void releasePageNotNull(MemPage *pPage);  /* Forward reference */
486 
487 /*
488 ***** This routine is used inside of assert() only ****
489 **
490 ** Verify that the cursor holds the mutex on its BtShared
491 */
492 #ifdef SQLITE_DEBUG
static int cursorHoldsMutex(BtCursor *p){
  /* The mutex of interest is the one on the BtShared the cursor uses */
  BtShared *pShared = p->pBt;
  return sqlite3_mutex_held(pShared->mutex);
}
496 
497 /* Verify that the cursor and the BtShared agree about what is the current
498 ** database connetion. This is important in shared-cache mode. If the database
499 ** connection pointers get out-of-sync, it is possible for routines like
500 ** btreeInitPage() to reference an stale connection pointer that references a
501 ** a connection that has already closed.  This routine is used inside assert()
502 ** statements only and for the purpose of double-checking that the btree code
503 ** does keep the database connection pointers up-to-date.
504 */
cursorOwnsBtShared(BtCursor * p)505 static int cursorOwnsBtShared(BtCursor *p){
506   assert( cursorHoldsMutex(p) );
507   return (p->pBtree->db==p->pBt->db);
508 }
509 #endif
510 
/*
** Invalidate the overflow page-list cache of the single cursor passed as
** the argument, by clearing the BTCF_ValidOvfl bit of its curFlags.
**
** The argument is wrapped in parentheses so that any pointer-valued
** expression (for example "p+1" or "bFlag ? pA : pB") expands correctly.
** The value of the macro is the updated curFlags.
*/
#define invalidateOverflowCache(pCur) ((pCur)->curFlags &= ~BTCF_ValidOvfl)
516 
517 /*
518 ** Invalidate the overflow page-list cache for all cursors opened
519 ** on the shared btree structure pBt.
520 */
invalidateAllOverflowCache(BtShared * pBt)521 static void invalidateAllOverflowCache(BtShared *pBt){
522   BtCursor *p;
523   assert( sqlite3_mutex_held(pBt->mutex) );
524   for(p=pBt->pCursor; p; p=p->pNext){
525     invalidateOverflowCache(p);
526   }
527 }
528 
529 #ifndef SQLITE_OMIT_INCRBLOB
530 /*
531 ** This function is called before modifying the contents of a table
532 ** to invalidate any incrblob cursors that are open on the
533 ** row or one of the rows being modified.
534 **
535 ** If argument isClearTable is true, then the entire contents of the
536 ** table is about to be deleted. In this case invalidate all incrblob
537 ** cursors open on any row within the table with root-page pgnoRoot.
538 **
539 ** Otherwise, if argument isClearTable is false, then the row with
540 ** rowid iRow is being replaced or deleted. In this case invalidate
541 ** only those incrblob cursors open on that specific row.
542 */
invalidateIncrblobCursors(Btree * pBtree,Pgno pgnoRoot,i64 iRow,int isClearTable)543 static void invalidateIncrblobCursors(
544   Btree *pBtree,          /* The database file to check */
545   Pgno pgnoRoot,          /* The table that might be changing */
546   i64 iRow,               /* The rowid that might be changing */
547   int isClearTable        /* True if all rows are being deleted */
548 ){
549   BtCursor *p;
550   if( pBtree->hasIncrblobCur==0 ) return;
551   assert( sqlite3BtreeHoldsMutex(pBtree) );
552   pBtree->hasIncrblobCur = 0;
553   for(p=pBtree->pBt->pCursor; p; p=p->pNext){
554     if( (p->curFlags & BTCF_Incrblob)!=0 ){
555       pBtree->hasIncrblobCur = 1;
556       if( p->pgnoRoot==pgnoRoot && (isClearTable || p->info.nKey==iRow) ){
557         p->eState = CURSOR_INVALID;
558       }
559     }
560   }
561 }
562 
563 #else
564   /* Stub function when INCRBLOB is omitted */
565   #define invalidateIncrblobCursors(w,x,y,z)
566 #endif /* SQLITE_OMIT_INCRBLOB */
567 
568 /*
569 ** Set bit pgno of the BtShared.pHasContent bitvec. This is called
570 ** when a page that previously contained data becomes a free-list leaf
571 ** page.
572 **
573 ** The BtShared.pHasContent bitvec exists to work around an obscure
574 ** bug caused by the interaction of two useful IO optimizations surrounding
575 ** free-list leaf pages:
576 **
577 **   1) When all data is deleted from a page and the page becomes
578 **      a free-list leaf page, the page is not written to the database
579 **      (as free-list leaf pages contain no meaningful data). Sometimes
580 **      such a page is not even journalled (as it will not be modified,
581 **      why bother journalling it?).
582 **
583 **   2) When a free-list leaf page is reused, its content is not read
584 **      from the database or written to the journal file (why should it
585 **      be, if it is not at all meaningful?).
586 **
587 ** By themselves, these optimizations work fine and provide a handy
588 ** performance boost to bulk delete or insert operations. However, if
589 ** a page is moved to the free-list and then reused within the same
590 ** transaction, a problem comes up. If the page is not journalled when
591 ** it is moved to the free-list and it is also not journalled when it
592 ** is extracted from the free-list and reused, then the original data
593 ** may be lost. In the event of a rollback, it may not be possible
594 ** to restore the database to its original configuration.
595 **
596 ** The solution is the BtShared.pHasContent bitvec. Whenever a page is
597 ** moved to become a free-list leaf page, the corresponding bit is
598 ** set in the bitvec. Whenever a leaf page is extracted from the free-list,
599 ** optimization 2 above is omitted if the corresponding bit is already
600 ** set in BtShared.pHasContent. The contents of the bitvec are cleared
601 ** at the end of every transaction.
602 */
static int btreeSetHasContent(BtShared *pBt, Pgno pgno){
  /* Create the bitvec on first use, sized to the current page count */
  if( pBt->pHasContent==0 ){
    assert( pgno<=pBt->nPage );
    pBt->pHasContent = sqlite3BitvecCreate(pBt->nPage);
    if( pBt->pHasContent==0 ){
      return SQLITE_NOMEM_BKPT;
    }
  }

  /* Pages beyond the end of the bitvec are not recorded */
  if( pgno>sqlite3BitvecSize(pBt->pHasContent) ){
    return SQLITE_OK;
  }
  return sqlite3BitvecSet(pBt->pHasContent, pgno);
}
617 
618 /*
619 ** Query the BtShared.pHasContent vector.
620 **
621 ** This function is called when a free-list leaf page is removed from the
622 ** free-list for reuse. It returns false if it is safe to retrieve the
623 ** page from the pager layer with the 'no-content' flag set. True otherwise.
624 */
btreeGetHasContent(BtShared * pBt,Pgno pgno)625 static int btreeGetHasContent(BtShared *pBt, Pgno pgno){
626   Bitvec *p = pBt->pHasContent;
627   return p && (pgno>sqlite3BitvecSize(p) || sqlite3BitvecTestNotNull(p, pgno));
628 }
629 
630 /*
631 ** Clear (destroy) the BtShared.pHasContent bitvec. This should be
632 ** invoked at the conclusion of each write-transaction.
633 */
btreeClearHasContent(BtShared * pBt)634 static void btreeClearHasContent(BtShared *pBt){
635   sqlite3BitvecDestroy(pBt->pHasContent);
636   pBt->pHasContent = 0;
637 }
638 
639 /*
640 ** Release all of the apPage[] pages for a cursor.
641 */
btreeReleaseAllCursorPages(BtCursor * pCur)642 static void btreeReleaseAllCursorPages(BtCursor *pCur){
643   int i;
644   if( pCur->iPage>=0 ){
645     for(i=0; i<pCur->iPage; i++){
646       releasePageNotNull(pCur->apPage[i]);
647     }
648     releasePageNotNull(pCur->pPage);
649     pCur->iPage = -1;
650   }
651 }
652 
653 /*
654 ** The cursor passed as the only argument must point to a valid entry
655 ** when this function is called (i.e. have eState==CURSOR_VALID). This
656 ** function saves the current cursor key in variables pCur->nKey and
657 ** pCur->pKey. SQLITE_OK is returned if successful or an SQLite error
658 ** code otherwise.
659 **
660 ** If the cursor is open on an intkey table, then the integer key
661 ** (the rowid) is stored in pCur->nKey and pCur->pKey is left set to
662 ** NULL. If the cursor is open on a non-intkey table, then pCur->pKey is
663 ** set to point to a malloced buffer pCur->nKey bytes in size containing
664 ** the key.
665 */
static int saveCursorKey(BtCursor *pCur){
  int rc;            /* Result of reading the key payload */
  void *pBuf;        /* Buffer to hold a copy of an index key */

  assert( CURSOR_VALID==pCur->eState );
  assert( 0==pCur->pKey );
  assert( cursorHoldsMutex(pCur) );

  if( pCur->curIntKey ){
    /* Table btree: the rowid alone identifies the entry */
    pCur->nKey = sqlite3BtreeIntegerKey(pCur);
    assert( !pCur->pKey );
    return SQLITE_OK;
  }

  /* Index btree: take a copy of the entire key.  The key might be
  ** corrupt, in which case sqlite3VdbeRecordUnpack() could read past the
  ** end of the buffer, by as much as one varint plus one 8-byte value,
  ** when the cursor position is restored.  For that reason 9+8 bytes of
  ** zeroed padding follow the key content. */
  pCur->nKey = sqlite3BtreePayloadSize(pCur);
  pBuf = sqlite3Malloc( pCur->nKey + 9 + 8 );
  if( pBuf==0 ){
    return SQLITE_NOMEM_BKPT;
  }
  rc = sqlite3BtreePayload(pCur, 0, (int)pCur->nKey, pBuf);
  if( rc!=SQLITE_OK ){
    sqlite3_free(pBuf);
    return rc;
  }
  memset(((u8*)pBuf)+pCur->nKey, 0, 9+8);
  pCur->pKey = pBuf;
  return SQLITE_OK;
}
700 
701 /*
702 ** Save the current cursor position in the variables BtCursor.nKey
703 ** and BtCursor.pKey. The cursor's state is set to CURSOR_REQUIRESEEK.
704 **
705 ** The caller must ensure that the cursor is valid (has eState==CURSOR_VALID)
706 ** prior to calling this routine.
707 */
saveCursorPosition(BtCursor * pCur)708 static int saveCursorPosition(BtCursor *pCur){
709   int rc;
710 
711   assert( CURSOR_VALID==pCur->eState || CURSOR_SKIPNEXT==pCur->eState );
712   assert( 0==pCur->pKey );
713   assert( cursorHoldsMutex(pCur) );
714 
715   if( pCur->curFlags & BTCF_Pinned ){
716     return SQLITE_CONSTRAINT_PINNED;
717   }
718   if( pCur->eState==CURSOR_SKIPNEXT ){
719     pCur->eState = CURSOR_VALID;
720   }else{
721     pCur->skipNext = 0;
722   }
723 
724   rc = saveCursorKey(pCur);
725   if( rc==SQLITE_OK ){
726     btreeReleaseAllCursorPages(pCur);
727     pCur->eState = CURSOR_REQUIRESEEK;
728   }
729 
730   pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl|BTCF_AtLast);
731   return rc;
732 }
733 
734 /* Forward reference */
735 static int SQLITE_NOINLINE saveCursorsOnList(BtCursor*,Pgno,BtCursor*);
736 
737 /*
738 ** Save the positions of all cursors (except pExcept) that are open on
739 ** the table with root-page iRoot.  "Saving the cursor position" means that
740 ** the location in the btree is remembered in such a way that it can be
741 ** moved back to the same spot after the btree has been modified.  This
742 ** routine is called just before cursor pExcept is used to modify the
743 ** table, for example in BtreeDelete() or BtreeInsert().
744 **
** If there are two or more cursors on the same btree, then all such
** cursors should have their BTCF_Multiple flag set.  The btreeCursor()
** routine enforces that rule.  This routine only needs to be called in
** the uncommon case when pExcept has the BTCF_Multiple flag set.
749 **
** If pExcept!=NULL and if no other cursors are found on the same root-page,
** then the BTCF_Multiple flag on pExcept is cleared, to avoid another
** pointless call to this routine.
753 **
** Implementation note:  This routine merely checks to see if any cursors
** need to be saved.  It calls out to saveCursorsOnList() in the (unusual)
** event that cursors are in need of being saved.
757 */
static int saveAllCursors(BtShared *pBt, Pgno iRoot, BtCursor *pExcept){
  BtCursor *pCsr = pBt->pCursor;
  assert( sqlite3_mutex_held(pBt->mutex) );
  assert( pExcept==0 || pExcept->pBt==pBt );

  /* Skip forward past pExcept itself and, when iRoot is non-zero, past
  ** cursors that are open on some other root page. */
  while( pCsr && (pCsr==pExcept || (iRoot!=0 && pCsr->pgnoRoot!=iRoot)) ){
    pCsr = pCsr->pNext;
  }

  if( pCsr!=0 ){
    /* At least one cursor qualifies.  Do the real work out of line. */
    return saveCursorsOnList(pCsr, iRoot, pExcept);
  }
  if( pExcept!=0 ){
    /* No other cursor qualifies, so pExcept need not come here again */
    pExcept->curFlags &= ~BTCF_Multiple;
  }
  return SQLITE_OK;
}
769 
770 /* This helper routine to saveAllCursors does the actual work of saving
771 ** the cursors if and when a cursor is found that actually requires saving.
772 ** The common case is that no cursors need to be saved, so this routine is
773 ** broken out from its caller to avoid unnecessary stack pointer movement.
774 */
saveCursorsOnList(BtCursor * p,Pgno iRoot,BtCursor * pExcept)775 static int SQLITE_NOINLINE saveCursorsOnList(
776   BtCursor *p,         /* The first cursor that needs saving */
777   Pgno iRoot,          /* Only save cursor with this iRoot. Save all if zero */
778   BtCursor *pExcept    /* Do not save this cursor */
779 ){
780   do{
781     if( p!=pExcept && (0==iRoot || p->pgnoRoot==iRoot) ){
782       if( p->eState==CURSOR_VALID || p->eState==CURSOR_SKIPNEXT ){
783         int rc = saveCursorPosition(p);
784         if( SQLITE_OK!=rc ){
785           return rc;
786         }
787       }else{
788         testcase( p->iPage>=0 );
789         btreeReleaseAllCursorPages(p);
790       }
791     }
792     p = p->pNext;
793   }while( p );
794   return SQLITE_OK;
795 }
796 
797 /*
798 ** Clear the current cursor position.
799 */
sqlite3BtreeClearCursor(BtCursor * pCur)800 void sqlite3BtreeClearCursor(BtCursor *pCur){
801   assert( cursorHoldsMutex(pCur) );
802   sqlite3_free(pCur->pKey);
803   pCur->pKey = 0;
804   pCur->eState = CURSOR_INVALID;
805 }
806 
807 /*
808 ** In this version of BtreeMoveto, pKey is a packed index record
809 ** such as is generated by the OP_MakeRecord opcode.  Unpack the
810 ** record and then call BtreeMovetoUnpacked() to do the work.
811 */
btreeMoveto(BtCursor * pCur,const void * pKey,i64 nKey,int bias,int * pRes)812 static int btreeMoveto(
813   BtCursor *pCur,     /* Cursor open on the btree to be searched */
814   const void *pKey,   /* Packed key if the btree is an index */
815   i64 nKey,           /* Integer key for tables.  Size of pKey for indices */
816   int bias,           /* Bias search to the high end */
817   int *pRes           /* Write search results here */
818 ){
819   int rc;                    /* Status code */
820   UnpackedRecord *pIdxKey;   /* Unpacked index key */
821 
822   if( pKey ){
823     KeyInfo *pKeyInfo = pCur->pKeyInfo;
824     assert( nKey==(i64)(int)nKey );
825     pIdxKey = sqlite3VdbeAllocUnpackedRecord(pKeyInfo);
826     if( pIdxKey==0 ) return SQLITE_NOMEM_BKPT;
827     sqlite3VdbeRecordUnpack(pKeyInfo, (int)nKey, pKey, pIdxKey);
828     if( pIdxKey->nField==0 || pIdxKey->nField>pKeyInfo->nAllField ){
829       rc = SQLITE_CORRUPT_BKPT;
830       goto moveto_done;
831     }
832   }else{
833     pIdxKey = 0;
834   }
835   rc = sqlite3BtreeMovetoUnpacked(pCur, pIdxKey, nKey, bias, pRes);
836 moveto_done:
837   if( pIdxKey ){
838     sqlite3DbFree(pCur->pKeyInfo->db, pIdxKey);
839   }
840   return rc;
841 }
842 
843 /*
844 ** Restore the cursor to the position it was in (or as close to as possible)
845 ** when saveCursorPosition() was called. Note that this call deletes the
846 ** saved position info stored by saveCursorPosition(), so there can be
847 ** at most one effective restoreCursorPosition() call after each
848 ** saveCursorPosition().
849 */
btreeRestoreCursorPosition(BtCursor * pCur)850 static int btreeRestoreCursorPosition(BtCursor *pCur){
851   int rc;
852   int skipNext = 0;
853   assert( cursorOwnsBtShared(pCur) );
854   assert( pCur->eState>=CURSOR_REQUIRESEEK );
855   if( pCur->eState==CURSOR_FAULT ){
856     return pCur->skipNext;
857   }
858   pCur->eState = CURSOR_INVALID;
859   if( sqlite3FaultSim(410) ){
860     rc = SQLITE_IOERR;
861   }else{
862     rc = btreeMoveto(pCur, pCur->pKey, pCur->nKey, 0, &skipNext);
863   }
864   if( rc==SQLITE_OK ){
865     sqlite3_free(pCur->pKey);
866     pCur->pKey = 0;
867     assert( pCur->eState==CURSOR_VALID || pCur->eState==CURSOR_INVALID );
868     if( skipNext ) pCur->skipNext = skipNext;
869     if( pCur->skipNext && pCur->eState==CURSOR_VALID ){
870       pCur->eState = CURSOR_SKIPNEXT;
871     }
872   }
873   return rc;
874 }
875 
/*
** Restore cursor p to its saved position if its state says a seek is
** required (eState>=CURSOR_REQUIRESEEK).  Otherwise evaluate to SQLITE_OK
** without doing any work.
**
** The macro argument is parenthesized so that any pointer-valued
** expression (for example "pArray+1") may be passed safely.  The argument
** is evaluated twice when a restore is needed, so it must not have side
** effects.
*/
#define restoreCursorPosition(p) \
  ((p)->eState>=CURSOR_REQUIRESEEK ? \
         btreeRestoreCursorPosition(p) : \
         SQLITE_OK)
880 
881 /*
882 ** Determine whether or not a cursor has moved from the position where
883 ** it was last placed, or has been invalidated for any other reason.
884 ** Cursors can move when the row they are pointing at is deleted out
885 ** from under them, for example.  Cursor might also move if a btree
886 ** is rebalanced.
887 **
888 ** Calling this routine with a NULL cursor pointer returns false.
889 **
890 ** Use the separate sqlite3BtreeCursorRestore() routine to restore a cursor
891 ** back to where it ought to be if this routine returns true.
892 */
sqlite3BtreeCursorHasMoved(BtCursor * pCur)893 int sqlite3BtreeCursorHasMoved(BtCursor *pCur){
894   assert( EIGHT_BYTE_ALIGNMENT(pCur)
895        || pCur==sqlite3BtreeFakeValidCursor() );
896   assert( offsetof(BtCursor, eState)==0 );
897   assert( sizeof(pCur->eState)==1 );
898   return CURSOR_VALID != *(u8*)pCur;
899 }
900 
901 /*
902 ** Return a pointer to a fake BtCursor object that will always answer
903 ** false to the sqlite3BtreeCursorHasMoved() routine above.  The fake
904 ** cursor returned must not be used with any other Btree interface.
905 */
sqlite3BtreeFakeValidCursor(void)906 BtCursor *sqlite3BtreeFakeValidCursor(void){
907   static u8 fakeCursor = CURSOR_VALID;
908   assert( offsetof(BtCursor, eState)==0 );
909   return (BtCursor*)&fakeCursor;
910 }
911 
912 /*
913 ** This routine restores a cursor back to its original position after it
914 ** has been moved by some outside activity (such as a btree rebalance or
915 ** a row having been deleted out from under the cursor).
916 **
917 ** On success, the *pDifferentRow parameter is false if the cursor is left
918 ** pointing at exactly the same row.  *pDifferntRow is the row the cursor
919 ** was pointing to has been deleted, forcing the cursor to point to some
920 ** nearby row.
921 **
922 ** This routine should only be called for a cursor that just returned
923 ** TRUE from sqlite3BtreeCursorHasMoved().
924 */
sqlite3BtreeCursorRestore(BtCursor * pCur,int * pDifferentRow)925 int sqlite3BtreeCursorRestore(BtCursor *pCur, int *pDifferentRow){
926   int rc;
927 
928   assert( pCur!=0 );
929   assert( pCur->eState!=CURSOR_VALID );
930   rc = restoreCursorPosition(pCur);
931   if( rc ){
932     *pDifferentRow = 1;
933     return rc;
934   }
935   if( pCur->eState!=CURSOR_VALID ){
936     *pDifferentRow = 1;
937   }else{
938     *pDifferentRow = 0;
939   }
940   return SQLITE_OK;
941 }
942 
#ifdef SQLITE_ENABLE_CURSOR_HINTS
/*
** Accept a hint for cursor pCur.  The kind of hint, and the number and
** types of the variadic arguments that follow, are selected by eHintType.
** See the definitions of the BTREE_HINT_* macros for details.
**
** This implementation ignores every hint.  The interface is used only by
** systems that substitute their own storage engine.
*/
void sqlite3BtreeCursorHint(BtCursor *pCur, int eHintType, ...){
  (void)pCur;         /* Hint target is not consulted */
  (void)eHintType;    /* Hint type is not consulted */
}
#endif
953 
954 /*
955 ** Provide flag hints to the cursor.
956 */
sqlite3BtreeCursorHintFlags(BtCursor * pCur,unsigned x)957 void sqlite3BtreeCursorHintFlags(BtCursor *pCur, unsigned x){
958   assert( x==BTREE_SEEK_EQ || x==BTREE_BULKLOAD || x==0 );
959   pCur->hints = x;
960 }
961 
962 
963 #ifndef SQLITE_OMIT_AUTOVACUUM
964 /*
965 ** Given a page number of a regular database page, return the page
966 ** number for the pointer-map page that contains the entry for the
967 ** input page number.
968 **
969 ** Return 0 (not a valid page) for pgno==1 since there is
970 ** no pointer map associated with page 1.  The integrity_check logic
971 ** requires that ptrmapPageno(*,1)!=1.
972 */
ptrmapPageno(BtShared * pBt,Pgno pgno)973 static Pgno ptrmapPageno(BtShared *pBt, Pgno pgno){
974   int nPagesPerMapPage;
975   Pgno iPtrMap, ret;
976   assert( sqlite3_mutex_held(pBt->mutex) );
977   if( pgno<2 ) return 0;
978   nPagesPerMapPage = (pBt->usableSize/5)+1;
979   iPtrMap = (pgno-2)/nPagesPerMapPage;
980   ret = (iPtrMap*nPagesPerMapPage) + 2;
981   if( ret==PENDING_BYTE_PAGE(pBt) ){
982     ret++;
983   }
984   return ret;
985 }
986 
987 /*
988 ** Write an entry into the pointer map.
989 **
990 ** This routine updates the pointer map entry for page number 'key'
991 ** so that it maps to type 'eType' and parent page number 'pgno'.
992 **
993 ** If *pRC is initially non-zero (non-SQLITE_OK) then this routine is
994 ** a no-op.  If an error occurs, the appropriate error code is written
995 ** into *pRC.
996 */
ptrmapPut(BtShared * pBt,Pgno key,u8 eType,Pgno parent,int * pRC)997 static void ptrmapPut(BtShared *pBt, Pgno key, u8 eType, Pgno parent, int *pRC){
998   DbPage *pDbPage;  /* The pointer map page */
999   u8 *pPtrmap;      /* The pointer map data */
1000   Pgno iPtrmap;     /* The pointer map page number */
1001   int offset;       /* Offset in pointer map page */
1002   int rc;           /* Return code from subfunctions */
1003 
1004   if( *pRC ) return;
1005 
1006   assert( sqlite3_mutex_held(pBt->mutex) );
1007   /* The super-journal page number must never be used as a pointer map page */
1008   assert( 0==PTRMAP_ISPAGE(pBt, PENDING_BYTE_PAGE(pBt)) );
1009 
1010   assert( pBt->autoVacuum );
1011   if( key==0 ){
1012     *pRC = SQLITE_CORRUPT_BKPT;
1013     return;
1014   }
1015   iPtrmap = PTRMAP_PAGENO(pBt, key);
1016   rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage, 0);
1017   if( rc!=SQLITE_OK ){
1018     *pRC = rc;
1019     return;
1020   }
1021   if( ((char*)sqlite3PagerGetExtra(pDbPage))[0]!=0 ){
1022     /* The first byte of the extra data is the MemPage.isInit byte.
1023     ** If that byte is set, it means this page is also being used
1024     ** as a btree page. */
1025     *pRC = SQLITE_CORRUPT_BKPT;
1026     goto ptrmap_exit;
1027   }
1028   offset = PTRMAP_PTROFFSET(iPtrmap, key);
1029   if( offset<0 ){
1030     *pRC = SQLITE_CORRUPT_BKPT;
1031     goto ptrmap_exit;
1032   }
1033   assert( offset <= (int)pBt->usableSize-5 );
1034   pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
1035 
1036   if( eType!=pPtrmap[offset] || get4byte(&pPtrmap[offset+1])!=parent ){
1037     TRACE(("PTRMAP_UPDATE: %d->(%d,%d)\n", key, eType, parent));
1038     *pRC= rc = sqlite3PagerWrite(pDbPage);
1039     if( rc==SQLITE_OK ){
1040       pPtrmap[offset] = eType;
1041       put4byte(&pPtrmap[offset+1], parent);
1042     }
1043   }
1044 
1045 ptrmap_exit:
1046   sqlite3PagerUnref(pDbPage);
1047 }
1048 
1049 /*
1050 ** Read an entry from the pointer map.
1051 **
1052 ** This routine retrieves the pointer map entry for page 'key', writing
1053 ** the type and parent page number to *pEType and *pPgno respectively.
1054 ** An error code is returned if something goes wrong, otherwise SQLITE_OK.
1055 */
ptrmapGet(BtShared * pBt,Pgno key,u8 * pEType,Pgno * pPgno)1056 static int ptrmapGet(BtShared *pBt, Pgno key, u8 *pEType, Pgno *pPgno){
1057   DbPage *pDbPage;   /* The pointer map page */
1058   int iPtrmap;       /* Pointer map page index */
1059   u8 *pPtrmap;       /* Pointer map page data */
1060   int offset;        /* Offset of entry in pointer map */
1061   int rc;
1062 
1063   assert( sqlite3_mutex_held(pBt->mutex) );
1064 
1065   iPtrmap = PTRMAP_PAGENO(pBt, key);
1066   rc = sqlite3PagerGet(pBt->pPager, iPtrmap, &pDbPage, 0);
1067   if( rc!=0 ){
1068     return rc;
1069   }
1070   pPtrmap = (u8 *)sqlite3PagerGetData(pDbPage);
1071 
1072   offset = PTRMAP_PTROFFSET(iPtrmap, key);
1073   if( offset<0 ){
1074     sqlite3PagerUnref(pDbPage);
1075     return SQLITE_CORRUPT_BKPT;
1076   }
1077   assert( offset <= (int)pBt->usableSize-5 );
1078   assert( pEType!=0 );
1079   *pEType = pPtrmap[offset];
1080   if( pPgno ) *pPgno = get4byte(&pPtrmap[offset+1]);
1081 
1082   sqlite3PagerUnref(pDbPage);
1083   if( *pEType<1 || *pEType>5 ) return SQLITE_CORRUPT_PGNO(iPtrmap);
1084   return SQLITE_OK;
1085 }
1086 
1087 #else /* if defined SQLITE_OMIT_AUTOVACUUM */
  /* With SQLITE_OMIT_AUTOVACUUM defined, the pointer-map routines become
  ** macros: ptrmapPut() and ptrmapPutOvflPtr() expand to nothing and
  ** ptrmapGet() expands to SQLITE_OK.  None of the macro arguments are
  ** evaluated. */
  #define ptrmapPut(w,x,y,z,rc)
  #define ptrmapGet(w,x,y,z) SQLITE_OK
  #define ptrmapPutOvflPtr(x, y, z, rc)
1091 #endif
1092 
/*
** Given a btree page and a cell index (0 means the first cell on
** the page, 1 means the second cell, and so forth) return a pointer
** to the cell content.
**
** findCell(P,I) reads the 2-byte entry at P->aCellIdx[2*I], masks it
** with P->maskPage, and adds the result to P->aData.
**
** findCellPastPtr() does the same except it skips past the initial
** 4-byte child pointer found on interior pages, if there is one.  It
** does this by adding the masked offset to P->aDataOfst instead of
** P->aData.
**
** Both macros expand P more than once, so P must not have side effects.
**
** This routine works only for pages that do not contain overflow cells.
*/
#define findCell(P,I) \
  ((P)->aData + ((P)->maskPage & get2byteAligned(&(P)->aCellIdx[2*(I)])))
#define findCellPastPtr(P,I) \
  ((P)->aDataOfst + ((P)->maskPage & get2byteAligned(&(P)->aCellIdx[2*(I)])))
1107 
1108 
1109 /*
1110 ** This is common tail processing for btreeParseCellPtr() and
1111 ** btreeParseCellPtrIndex() for the case when the cell does not fit entirely
1112 ** on a single B-tree page.  Make necessary adjustments to the CellInfo
1113 ** structure.
1114 */
btreeParseCellAdjustSizeForOverflow(MemPage * pPage,u8 * pCell,CellInfo * pInfo)1115 static SQLITE_NOINLINE void btreeParseCellAdjustSizeForOverflow(
1116   MemPage *pPage,         /* Page containing the cell */
1117   u8 *pCell,              /* Pointer to the cell text. */
1118   CellInfo *pInfo         /* Fill in this structure */
1119 ){
1120   /* If the payload will not fit completely on the local page, we have
1121   ** to decide how much to store locally and how much to spill onto
1122   ** overflow pages.  The strategy is to minimize the amount of unused
1123   ** space on overflow pages while keeping the amount of local storage
1124   ** in between minLocal and maxLocal.
1125   **
1126   ** Warning:  changing the way overflow payload is distributed in any
1127   ** way will result in an incompatible file format.
1128   */
1129   int minLocal;  /* Minimum amount of payload held locally */
1130   int maxLocal;  /* Maximum amount of payload held locally */
1131   int surplus;   /* Overflow payload available for local storage */
1132 
1133   minLocal = pPage->minLocal;
1134   maxLocal = pPage->maxLocal;
1135   surplus = minLocal + (pInfo->nPayload - minLocal)%(pPage->pBt->usableSize-4);
1136   testcase( surplus==maxLocal );
1137   testcase( surplus==maxLocal+1 );
1138   if( surplus <= maxLocal ){
1139     pInfo->nLocal = (u16)surplus;
1140   }else{
1141     pInfo->nLocal = (u16)minLocal;
1142   }
1143   pInfo->nSize = (u16)(&pInfo->pPayload[pInfo->nLocal] - pCell) + 4;
1144 }
1145 
1146 /*
1147 ** The following routines are implementations of the MemPage.xParseCell()
1148 ** method.
1149 **
1150 ** Parse a cell content block and fill in the CellInfo structure.
1151 **
1152 ** btreeParseCellPtr()        =>   table btree leaf nodes
1153 ** btreeParseCellNoPayload()  =>   table btree internal nodes
1154 ** btreeParseCellPtrIndex()   =>   index btree nodes
1155 **
1156 ** There is also a wrapper function btreeParseCell() that works for
1157 ** all MemPage types and that references the cell by index rather than
1158 ** by pointer.
1159 */
btreeParseCellPtrNoPayload(MemPage * pPage,u8 * pCell,CellInfo * pInfo)1160 static void btreeParseCellPtrNoPayload(
1161   MemPage *pPage,         /* Page containing the cell */
1162   u8 *pCell,              /* Pointer to the cell text. */
1163   CellInfo *pInfo         /* Fill in this structure */
1164 ){
1165   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1166   assert( pPage->leaf==0 );
1167   assert( pPage->childPtrSize==4 );
1168 #ifndef SQLITE_DEBUG
1169   UNUSED_PARAMETER(pPage);
1170 #endif
1171   pInfo->nSize = 4 + getVarint(&pCell[4], (u64*)&pInfo->nKey);
1172   pInfo->nPayload = 0;
1173   pInfo->nLocal = 0;
1174   pInfo->pPayload = 0;
1175   return;
1176 }
/*
** xParseCell() implementation for pages on which intKeyLeaf is set and
** childPtrSize is zero.  A cell on such a page is a payload-size varint,
** followed by an integer-key varint, followed by the payload.
**
** On return pInfo->nKey holds the integer key, pInfo->nPayload the total
** payload size and pInfo->pPayload points at the first payload byte.
** If the payload is no larger than pPage->maxLocal, it is all local and
** nSize is the header length plus payload (but never less than 4).
** Otherwise btreeParseCellAdjustSizeForOverflow() computes nLocal and
** nSize.
*/
static void btreeParseCellPtr(
  MemPage *pPage,         /* Page containing the cell */
  u8 *pCell,              /* Pointer to the start of the cell */
  CellInfo *pInfo         /* Structure to fill in */
){
  u8 *pIter = pCell;      /* Read position within the cell */
  u32 nPayload;           /* Decoded payload size */
  u64 iKey;               /* Decoded integer key */

  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  assert( pPage->leaf==0 || pPage->leaf==1 );
  assert( pPage->intKeyLeaf );
  assert( pPage->childPtrSize==0 );

  /* Decode the payload size.  This is an inlined equivalent of:
  **
  **     pIter += getVarint32(pIter, nPayload);
  */
  nPayload = *pIter;
  if( nPayload>=0x80 ){
    u8 *pLimit = &pIter[8];
    nPayload &= 0x7f;
    do{
      pIter++;
      nPayload = (nPayload<<7) | (*pIter & 0x7f);
    }while( *pIter>=0x80 && pIter<pLimit );
  }
  pIter++;

  /* Decode the integer key.  This is an inlined equivalent of:
  **
  **     pIter += getVarint(pIter, (u64*)&pInfo->nKey);
  **
  ** Each of the first eight bytes contributes 7 bits.  If a ninth byte
  ** is reached, it contributes all 8 of its bits.
  */
  iKey = *pIter;
  if( iKey>=0x80 ){
    u8 *pLimit = &pIter[7];
    iKey &= 0x7f;
    for(;;){
      pIter++;
      iKey = (iKey<<7) | (*pIter & 0x7f);
      if( *pIter<0x80 ) break;
      if( pIter>=pLimit ){
        pIter++;
        iKey = (iKey<<8) | *pIter;
        break;
      }
    }
  }
  pIter++;

  pInfo->nKey = *(i64*)&iKey;
  pInfo->nPayload = nPayload;
  pInfo->pPayload = pIter;
  testcase( nPayload==pPage->maxLocal );
  testcase( nPayload==pPage->maxLocal+1 );
  if( nPayload>pPage->maxLocal ){
    /* Part of the payload spills onto overflow pages */
    btreeParseCellAdjustSizeForOverflow(pPage, pCell, pInfo);
    return;
  }

  /* Common case: the whole payload is stored locally */
  pInfo->nSize = nPayload + (u16)(pIter - pCell);
  if( pInfo->nSize<4 ) pInfo->nSize = 4;
  pInfo->nLocal = (u16)nPayload;
}
/*
** xParseCell() implementation for pages on which intKeyLeaf is clear.
** A cell on such a page is pPage->childPtrSize bytes of child pointer,
** followed by a payload-size varint, followed by the payload.
**
** On return both pInfo->nKey and pInfo->nPayload hold the payload size
** and pInfo->pPayload points at the first payload byte.  If the payload
** is no larger than pPage->maxLocal, it is all local and nSize is the
** header length plus payload (but never less than 4).  Otherwise
** btreeParseCellAdjustSizeForOverflow() computes nLocal and nSize.
*/
static void btreeParseCellPtrIndex(
  MemPage *pPage,         /* Page containing the cell */
  u8 *pCell,              /* Pointer to the start of the cell */
  CellInfo *pInfo         /* Structure to fill in */
){
  u8 *pIter;              /* Read position within the cell */
  u32 nPayload;           /* Decoded payload size */

  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  assert( pPage->leaf==0 || pPage->leaf==1 );
  assert( pPage->intKeyLeaf==0 );

  /* Step over the child pointer (if any) and decode the payload size */
  pIter = pCell + pPage->childPtrSize;
  nPayload = *pIter;
  if( nPayload>=0x80 ){
    u8 *pLimit = &pIter[8];
    nPayload &= 0x7f;
    do{
      pIter++;
      nPayload = (nPayload<<7) | (*pIter & 0x7f);
    }while( *pIter>=0x80 && pIter<pLimit );
  }
  pIter++;

  pInfo->nKey = nPayload;
  pInfo->nPayload = nPayload;
  pInfo->pPayload = pIter;
  testcase( nPayload==pPage->maxLocal );
  testcase( nPayload==pPage->maxLocal+1 );
  if( nPayload>pPage->maxLocal ){
    /* Part of the payload spills onto overflow pages */
    btreeParseCellAdjustSizeForOverflow(pPage, pCell, pInfo);
    return;
  }

  /* Common case: the whole payload is stored locally */
  pInfo->nSize = nPayload + (u16)(pIter - pCell);
  if( pInfo->nSize<4 ) pInfo->nSize = 4;
  pInfo->nLocal = (u16)nPayload;
}
/*
** Parse the iCell-th cell of pPage (0 is the first cell) into *pInfo by
** locating the cell with findCell() and invoking the page's xParseCell
** method on it.
*/
static void btreeParseCell(
  MemPage *pPage,         /* Page containing the cell */
  int iCell,              /* Index of the cell.  First cell is 0 */
  CellInfo *pInfo         /* Structure to fill in */
){
  u8 *pCell = findCell(pPage, iCell);
  pPage->xParseCell(pPage, pCell, pInfo);
}
1289 
1290 /*
1291 ** The following routines are implementations of the MemPage.xCellSize
1292 ** method.
1293 **
1294 ** Compute the total number of bytes that a Cell needs in the cell
1295 ** data area of the btree-page.  The return number includes the cell
1296 ** data header and the local payload, but not any overflow page or
1297 ** the space used by the cell pointer.
1298 **
1299 ** cellSizePtrNoPayload()    =>   table internal nodes
1300 ** cellSizePtr()             =>   all index nodes & table leaf nodes
1301 */
cellSizePtr(MemPage * pPage,u8 * pCell)1302 static u16 cellSizePtr(MemPage *pPage, u8 *pCell){
1303   u8 *pIter = pCell + pPage->childPtrSize; /* For looping over bytes of pCell */
1304   u8 *pEnd;                                /* End mark for a varint */
1305   u32 nSize;                               /* Size value to return */
1306 
1307 #ifdef SQLITE_DEBUG
1308   /* The value returned by this function should always be the same as
1309   ** the (CellInfo.nSize) value found by doing a full parse of the
1310   ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of
1311   ** this function verifies that this invariant is not violated. */
1312   CellInfo debuginfo;
1313   pPage->xParseCell(pPage, pCell, &debuginfo);
1314 #endif
1315 
1316   nSize = *pIter;
1317   if( nSize>=0x80 ){
1318     pEnd = &pIter[8];
1319     nSize &= 0x7f;
1320     do{
1321       nSize = (nSize<<7) | (*++pIter & 0x7f);
1322     }while( *(pIter)>=0x80 && pIter<pEnd );
1323   }
1324   pIter++;
1325   if( pPage->intKey ){
1326     /* pIter now points at the 64-bit integer key value, a variable length
1327     ** integer. The following block moves pIter to point at the first byte
1328     ** past the end of the key value. */
1329     pEnd = &pIter[9];
1330     while( (*pIter++)&0x80 && pIter<pEnd );
1331   }
1332   testcase( nSize==pPage->maxLocal );
1333   testcase( nSize==pPage->maxLocal+1 );
1334   if( nSize<=pPage->maxLocal ){
1335     nSize += (u32)(pIter - pCell);
1336     if( nSize<4 ) nSize = 4;
1337   }else{
1338     int minLocal = pPage->minLocal;
1339     nSize = minLocal + (nSize - minLocal) % (pPage->pBt->usableSize - 4);
1340     testcase( nSize==pPage->maxLocal );
1341     testcase( nSize==pPage->maxLocal+1 );
1342     if( nSize>pPage->maxLocal ){
1343       nSize = minLocal;
1344     }
1345     nSize += 4 + (u16)(pIter - pCell);
1346   }
1347   assert( nSize==debuginfo.nSize || CORRUPT_DB );
1348   return (u16)nSize;
1349 }
cellSizePtrNoPayload(MemPage * pPage,u8 * pCell)1350 static u16 cellSizePtrNoPayload(MemPage *pPage, u8 *pCell){
1351   u8 *pIter = pCell + 4; /* For looping over bytes of pCell */
1352   u8 *pEnd;              /* End mark for a varint */
1353 
1354 #ifdef SQLITE_DEBUG
1355   /* The value returned by this function should always be the same as
1356   ** the (CellInfo.nSize) value found by doing a full parse of the
1357   ** cell. If SQLITE_DEBUG is defined, an assert() at the bottom of
1358   ** this function verifies that this invariant is not violated. */
1359   CellInfo debuginfo;
1360   pPage->xParseCell(pPage, pCell, &debuginfo);
1361 #else
1362   UNUSED_PARAMETER(pPage);
1363 #endif
1364 
1365   assert( pPage->childPtrSize==4 );
1366   pEnd = pIter + 9;
1367   while( (*pIter++)&0x80 && pIter<pEnd );
1368   assert( debuginfo.nSize==(u16)(pIter - pCell) || CORRUPT_DB );
1369   return (u16)(pIter - pCell);
1370 }
1371 
1372 
#ifdef SQLITE_DEBUG
/* Return the size of the iCell-th cell on pPage by locating the cell with
** findCell() and invoking the page's xCellSize method.  This variation on
** cellSizePtr() is used inside of assert() statements only. */
static u16 cellSize(MemPage *pPage, int iCell){
  u8 *pCell = findCell(pPage, iCell);
  return pPage->xCellSize(pPage, pCell);
}
#endif
1380 
1381 #ifndef SQLITE_OMIT_AUTOVACUUM
1382 /*
1383 ** The cell pCell is currently part of page pSrc but will ultimately be part
1384 ** of pPage.  (pSrc and pPager are often the same.)  If pCell contains a
1385 ** pointer to an overflow page, insert an entry into the pointer-map for
1386 ** the overflow page that will be valid after pCell has been moved to pPage.
1387 */
ptrmapPutOvflPtr(MemPage * pPage,MemPage * pSrc,u8 * pCell,int * pRC)1388 static void ptrmapPutOvflPtr(MemPage *pPage, MemPage *pSrc, u8 *pCell,int *pRC){
1389   CellInfo info;
1390   if( *pRC ) return;
1391   assert( pCell!=0 );
1392   pPage->xParseCell(pPage, pCell, &info);
1393   if( info.nLocal<info.nPayload ){
1394     Pgno ovfl;
1395     if( SQLITE_WITHIN(pSrc->aDataEnd, pCell, pCell+info.nLocal) ){
1396       testcase( pSrc!=pPage );
1397       *pRC = SQLITE_CORRUPT_BKPT;
1398       return;
1399     }
1400     ovfl = get4byte(&pCell[info.nSize-4]);
1401     ptrmapPut(pPage->pBt, ovfl, PTRMAP_OVERFLOW1, pPage->pgno, pRC);
1402   }
1403 }
1404 #endif
1405 
1406 
/*
** Defragment the page given. This routine reorganizes cells within the
** page so that there are no free-blocks on the free-block list.
**
** Parameter nMaxFrag is the maximum amount of fragmented space that may be
** present in the page after this routine returns.
**
** Two strategies are used:
**
**   (1) If the page's fragmented-byte count (header byte 7) is no more
**       than nMaxFrag and the free-block list has at most two entries,
**       the cell content below the free block(s) is slid upward with
**       memmove() and the affected cell pointers are adjusted.
**
**   (2) Otherwise every cell is copied, in cell-pointer order, to the end
**       of the usable area, working downward.  A copy of the original
**       content is taken in the pager's temp space the first time a cell
**       actually has to move.
**
** Returns SQLITE_OK on success, or SQLITE_CORRUPT_PAGE(pPage) if any
** offset or size read from the page is out of range, or if the amount of
** free space after defragmentation disagrees with pPage->nFree.
**
** EVIDENCE-OF: R-44582-60138 SQLite may from time to time reorganize a
** b-tree page so that there are no freeblocks or fragment bytes, all
** unused bytes are contained in the unallocated space region, and all
** cells are packed tightly at the end of the page.
*/
static int defragmentPage(MemPage *pPage, int nMaxFrag){
  int i;                     /* Loop counter */
  int pc;                    /* Address of the i-th cell */
  int hdr;                   /* Offset to the page header */
  int size;                  /* Size of a cell */
  int usableSize;            /* Number of usable bytes on a page */
  int cellOffset;            /* Offset to the cell pointer array */
  int cbrk;                  /* Offset to the cell content area */
  int nCell;                 /* Number of cells on the page */
  unsigned char *data;       /* The page data */
  unsigned char *temp;       /* Temp area for cell content */
  unsigned char *src;        /* Source of content */
  int iCellFirst;            /* First allowable cell index */
  int iCellLast;             /* Last possible cell index */

  assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  assert( pPage->pBt!=0 );
  assert( pPage->pBt->usableSize <= SQLITE_MAX_PAGE_SIZE );
  assert( pPage->nOverflow==0 );
  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  temp = 0;
  src = data = pPage->aData;
  hdr = pPage->hdrOffset;
  cellOffset = pPage->cellOffset;
  nCell = pPage->nCell;
  assert( nCell==get2byte(&data[hdr+3]) || CORRUPT_DB );
  /* First byte past the end of the cell pointer array */
  iCellFirst = cellOffset + 2*nCell;
  usableSize = pPage->pBt->usableSize;

  /* This block handles pages with two or fewer free blocks and nMaxFrag
  ** or fewer fragmented bytes. In this case it is faster to move the
  ** two (or one) blocks of cells using memmove() and add the required
  ** offsets to each pointer in the cell-pointer array than it is to
  ** reconstruct the entire page.  */
  if( (int)data[hdr+7]<=nMaxFrag ){
    /* iFree is the offset of the first free block, or 0 if none */
    int iFree = get2byte(&data[hdr+1]);
    if( iFree>usableSize-4 ) return SQLITE_CORRUPT_PAGE(pPage);
    if( iFree ){
      /* iFree2 is the offset of the second free block, or 0 if none */
      int iFree2 = get2byte(&data[iFree]);
      if( iFree2>usableSize-4 ) return SQLITE_CORRUPT_PAGE(pPage);
      /* Proceed only if there is no third free block */
      if( 0==iFree2 || (data[iFree2]==0 && data[iFree2+1]==0) ){
        u8 *pEnd = &data[cellOffset + nCell*2];
        u8 *pAddr;
        int sz2 = 0;                           /* Size of second free block */
        int sz = get2byte(&data[iFree+2]);     /* Size of first free block */
        int top = get2byte(&data[hdr+5]);      /* Start of cell content */
        if( top>=iFree ){
          return SQLITE_CORRUPT_PAGE(pPage);
        }
        if( iFree2 ){
          if( iFree+sz>iFree2 ) return SQLITE_CORRUPT_PAGE(pPage);
          sz2 = get2byte(&data[iFree2+2]);
          if( iFree2+sz2 > usableSize ) return SQLITE_CORRUPT_PAGE(pPage);
          /* Slide the content lying between the two free blocks upward by
          ** sz2 bytes so that the free blocks become one contiguous
          ** region of sz+sz2 bytes starting at iFree. */
          memmove(&data[iFree+sz+sz2], &data[iFree+sz], iFree2-(iFree+sz));
          sz += sz2;
        }else if( NEVER(iFree+sz>usableSize) ){
          return SQLITE_CORRUPT_PAGE(pPage);
        }

        /* Slide everything between the old start of the content area and
        ** the (merged) free block upward by sz bytes. */
        cbrk = top+sz;
        assert( cbrk+(iFree-top) <= usableSize );
        memmove(&data[cbrk], &data[top], iFree-top);
        /* Fix up cell pointers: cells that were below iFree moved by sz,
        ** cells that were between iFree and iFree2 moved by sz2. */
        for(pAddr=&data[cellOffset]; pAddr<pEnd; pAddr+=2){
          pc = get2byte(pAddr);
          if( pc<iFree ){ put2byte(pAddr, pc+sz); }
          else if( pc<iFree2 ){ put2byte(pAddr, pc+sz2); }
        }
        goto defragment_out;
      }
    }
  }

  /* General case: rebuild the cell content area from the end of the
  ** usable space downward, one cell at a time. */
  cbrk = usableSize;
  iCellLast = usableSize - 4;
  for(i=0; i<nCell; i++){
    u8 *pAddr;     /* The i-th cell pointer */
    pAddr = &data[cellOffset + i*2];
    pc = get2byte(pAddr);
    testcase( pc==iCellFirst );
    testcase( pc==iCellLast );
    /* These conditions have already been verified in btreeInitPage()
    ** if PRAGMA cell_size_check=ON.
    */
    if( pc<iCellFirst || pc>iCellLast ){
      return SQLITE_CORRUPT_PAGE(pPage);
    }
    assert( pc>=iCellFirst && pc<=iCellLast );
    size = pPage->xCellSize(pPage, &src[pc]);
    cbrk -= size;
    if( cbrk<iCellFirst || pc+size>usableSize ){
      return SQLITE_CORRUPT_PAGE(pPage);
    }
    assert( cbrk+size<=usableSize && cbrk>=iCellFirst );
    testcase( cbrk+size==usableSize );
    testcase( pc+size==usableSize );
    put2byte(pAddr, cbrk);
    if( temp==0 ){
      int x;
      /* While cells are already in their final position nothing needs
      ** to be copied.  The first cell that must move triggers a snapshot
      ** of the not-yet-processed content into temp space, and all
      ** subsequent reads come from that snapshot. */
      if( cbrk==pc ) continue;
      temp = sqlite3PagerTempSpace(pPage->pBt->pPager);
      x = get2byte(&data[hdr+5]);
      memcpy(&temp[x], &data[x], (cbrk+size) - x);
      src = temp;
    }
    memcpy(&data[cbrk], &src[pc], size);
  }
  /* The rebuilt page has no fragmented bytes */
  data[hdr+7] = 0;

 defragment_out:
  assert( pPage->nFree>=0 );
  /* Fragment bytes plus the gap between the cell pointer array and the
  ** cell content must equal the page's recorded free space. */
  if( data[hdr+7]+cbrk-iCellFirst!=pPage->nFree ){
    return SQLITE_CORRUPT_PAGE(pPage);
  }
  assert( cbrk>=iCellFirst );
  /* Record the new start of the cell content area, clear the free-block
  ** list head, and zero the unallocated gap. */
  put2byte(&data[hdr+5], cbrk);
  data[hdr+1] = 0;
  data[hdr+2] = 0;
  memset(&data[iCellFirst], 0, cbrk-iCellFirst);
  assert( sqlite3PagerIswriteable(pPage->pDbPage) );
  return SQLITE_OK;
}
1540 
1541 /*
1542 ** Search the free-list on page pPg for space to store a cell nByte bytes in
1543 ** size. If one can be found, return a pointer to the space and remove it
1544 ** from the free-list.
1545 **
1546 ** If no suitable space can be found on the free-list, return NULL.
1547 **
1548 ** This function may detect corruption within pPg.  If corruption is
1549 ** detected then *pRc is set to SQLITE_CORRUPT and NULL is returned.
1550 **
1551 ** Slots on the free list that are between 1 and 3 bytes larger than nByte
1552 ** will be ignored if adding the extra space to the fragmentation count
1553 ** causes the fragmentation count to exceed 60.
1554 */
pageFindSlot(MemPage * pPg,int nByte,int * pRc)1555 static u8 *pageFindSlot(MemPage *pPg, int nByte, int *pRc){
1556   const int hdr = pPg->hdrOffset;            /* Offset to page header */
1557   u8 * const aData = pPg->aData;             /* Page data */
1558   int iAddr = hdr + 1;                       /* Address of ptr to pc */
1559   int pc = get2byte(&aData[iAddr]);          /* Address of a free slot */
1560   int x;                                     /* Excess size of the slot */
1561   int maxPC = pPg->pBt->usableSize - nByte;  /* Max address for a usable slot */
1562   int size;                                  /* Size of the free slot */
1563 
1564   assert( pc>0 );
1565   while( pc<=maxPC ){
1566     /* EVIDENCE-OF: R-22710-53328 The third and fourth bytes of each
1567     ** freeblock form a big-endian integer which is the size of the freeblock
1568     ** in bytes, including the 4-byte header. */
1569     size = get2byte(&aData[pc+2]);
1570     if( (x = size - nByte)>=0 ){
1571       testcase( x==4 );
1572       testcase( x==3 );
1573       if( x<4 ){
1574         /* EVIDENCE-OF: R-11498-58022 In a well-formed b-tree page, the total
1575         ** number of bytes in fragments may not exceed 60. */
1576         if( aData[hdr+7]>57 ) return 0;
1577 
1578         /* Remove the slot from the free-list. Update the number of
1579         ** fragmented bytes within the page. */
1580         memcpy(&aData[iAddr], &aData[pc], 2);
1581         aData[hdr+7] += (u8)x;
1582       }else if( x+pc > maxPC ){
1583         /* This slot extends off the end of the usable part of the page */
1584         *pRc = SQLITE_CORRUPT_PAGE(pPg);
1585         return 0;
1586       }else{
1587         /* The slot remains on the free-list. Reduce its size to account
1588         ** for the portion used by the new allocation. */
1589         put2byte(&aData[pc+2], x);
1590       }
1591       return &aData[pc + x];
1592     }
1593     iAddr = pc;
1594     pc = get2byte(&aData[pc]);
1595     if( pc<=iAddr+size ){
1596       if( pc ){
1597         /* The next slot in the chain is not past the end of the current slot */
1598         *pRc = SQLITE_CORRUPT_PAGE(pPg);
1599       }
1600       return 0;
1601     }
1602   }
1603   if( pc>maxPC+nByte-4 ){
1604     /* The free slot chain extends off the end of the page */
1605     *pRc = SQLITE_CORRUPT_PAGE(pPg);
1606   }
1607   return 0;
1608 }
1609 
1610 /*
1611 ** Allocate nByte bytes of space from within the B-Tree page passed
1612 ** as the first argument. Write into *pIdx the index into pPage->aData[]
1613 ** of the first byte of allocated space. Return either SQLITE_OK or
1614 ** an error code (usually SQLITE_CORRUPT).
1615 **
1616 ** The caller guarantees that there is sufficient space to make the
1617 ** allocation.  This routine might need to defragment in order to bring
1618 ** all the space together, however.  This routine will avoid using
1619 ** the first two bytes past the cell pointer area since presumably this
1620 ** allocation is being made in order to insert a new cell, so we will
1621 ** also end up needing a new cell pointer.
1622 */
allocateSpace(MemPage * pPage,int nByte,int * pIdx)1623 static int allocateSpace(MemPage *pPage, int nByte, int *pIdx){
1624   const int hdr = pPage->hdrOffset;    /* Local cache of pPage->hdrOffset */
1625   u8 * const data = pPage->aData;      /* Local cache of pPage->aData */
1626   int top;                             /* First byte of cell content area */
1627   int rc = SQLITE_OK;                  /* Integer return code */
1628   int gap;        /* First byte of gap between cell pointers and cell content */
1629 
1630   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1631   assert( pPage->pBt );
1632   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1633   assert( nByte>=0 );  /* Minimum cell size is 4 */
1634   assert( pPage->nFree>=nByte );
1635   assert( pPage->nOverflow==0 );
1636   assert( nByte < (int)(pPage->pBt->usableSize-8) );
1637 
1638   assert( pPage->cellOffset == hdr + 12 - 4*pPage->leaf );
1639   gap = pPage->cellOffset + 2*pPage->nCell;
1640   assert( gap<=65536 );
1641   /* EVIDENCE-OF: R-29356-02391 If the database uses a 65536-byte page size
1642   ** and the reserved space is zero (the usual value for reserved space)
1643   ** then the cell content offset of an empty page wants to be 65536.
1644   ** However, that integer is too large to be stored in a 2-byte unsigned
1645   ** integer, so a value of 0 is used in its place. */
1646   top = get2byte(&data[hdr+5]);
1647   assert( top<=(int)pPage->pBt->usableSize ); /* by btreeComputeFreeSpace() */
1648   if( gap>top ){
1649     if( top==0 && pPage->pBt->usableSize==65536 ){
1650       top = 65536;
1651     }else{
1652       return SQLITE_CORRUPT_PAGE(pPage);
1653     }
1654   }
1655 
1656   /* If there is enough space between gap and top for one more cell pointer,
1657   ** and if the freelist is not empty, then search the
1658   ** freelist looking for a slot big enough to satisfy the request.
1659   */
1660   testcase( gap+2==top );
1661   testcase( gap+1==top );
1662   testcase( gap==top );
1663   if( (data[hdr+2] || data[hdr+1]) && gap+2<=top ){
1664     u8 *pSpace = pageFindSlot(pPage, nByte, &rc);
1665     if( pSpace ){
1666       int g2;
1667       assert( pSpace+nByte<=data+pPage->pBt->usableSize );
1668       *pIdx = g2 = (int)(pSpace-data);
1669       if( NEVER(g2<=gap) ){
1670         return SQLITE_CORRUPT_PAGE(pPage);
1671       }else{
1672         return SQLITE_OK;
1673       }
1674     }else if( rc ){
1675       return rc;
1676     }
1677   }
1678 
1679   /* The request could not be fulfilled using a freelist slot.  Check
1680   ** to see if defragmentation is necessary.
1681   */
1682   testcase( gap+2+nByte==top );
1683   if( gap+2+nByte>top ){
1684     assert( pPage->nCell>0 || CORRUPT_DB );
1685     assert( pPage->nFree>=0 );
1686     rc = defragmentPage(pPage, MIN(4, pPage->nFree - (2+nByte)));
1687     if( rc ) return rc;
1688     top = get2byteNotZero(&data[hdr+5]);
1689     assert( gap+2+nByte<=top );
1690   }
1691 
1692 
1693   /* Allocate memory from the gap in between the cell pointer array
1694   ** and the cell content area.  The btreeComputeFreeSpace() call has already
1695   ** validated the freelist.  Given that the freelist is valid, there
1696   ** is no way that the allocation can extend off the end of the page.
1697   ** The assert() below verifies the previous sentence.
1698   */
1699   top -= nByte;
1700   put2byte(&data[hdr+5], top);
1701   assert( top+nByte <= (int)pPage->pBt->usableSize );
1702   *pIdx = top;
1703   return SQLITE_OK;
1704 }
1705 
1706 /*
1707 ** Return a section of the pPage->aData to the freelist.
1708 ** The first byte of the new free block is pPage->aData[iStart]
1709 ** and the size of the block is iSize bytes.
1710 **
1711 ** Adjacent freeblocks are coalesced.
1712 **
1713 ** Even though the freeblock list was checked by btreeComputeFreeSpace(),
1714 ** that routine will not detect overlap between cells or freeblocks.  Nor
1715 ** does it detect cells or freeblocks that encrouch into the reserved bytes
1716 ** at the end of the page.  So do additional corruption checks inside this
1717 ** routine and return SQLITE_CORRUPT if any problems are found.
1718 */
freeSpace(MemPage * pPage,u16 iStart,u16 iSize)1719 static int freeSpace(MemPage *pPage, u16 iStart, u16 iSize){
1720   u16 iPtr;                             /* Address of ptr to next freeblock */
1721   u16 iFreeBlk;                         /* Address of the next freeblock */
1722   u8 hdr;                               /* Page header size.  0 or 100 */
1723   u8 nFrag = 0;                         /* Reduction in fragmentation */
1724   u16 iOrigSize = iSize;                /* Original value of iSize */
1725   u16 x;                                /* Offset to cell content area */
1726   u32 iEnd = iStart + iSize;            /* First byte past the iStart buffer */
1727   unsigned char *data = pPage->aData;   /* Page content */
1728 
1729   assert( pPage->pBt!=0 );
1730   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
1731   assert( CORRUPT_DB || iStart>=pPage->hdrOffset+6+pPage->childPtrSize );
1732   assert( CORRUPT_DB || iEnd <= pPage->pBt->usableSize );
1733   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1734   assert( iSize>=4 );   /* Minimum cell size is 4 */
1735   assert( iStart<=pPage->pBt->usableSize-4 );
1736 
1737   /* The list of freeblocks must be in ascending order.  Find the
1738   ** spot on the list where iStart should be inserted.
1739   */
1740   hdr = pPage->hdrOffset;
1741   iPtr = hdr + 1;
1742   if( data[iPtr+1]==0 && data[iPtr]==0 ){
1743     iFreeBlk = 0;  /* Shortcut for the case when the freelist is empty */
1744   }else{
1745     while( (iFreeBlk = get2byte(&data[iPtr]))<iStart ){
1746       if( iFreeBlk<iPtr+4 ){
1747         if( iFreeBlk==0 ) break; /* TH3: corrupt082.100 */
1748         return SQLITE_CORRUPT_PAGE(pPage);
1749       }
1750       iPtr = iFreeBlk;
1751     }
1752     if( iFreeBlk>pPage->pBt->usableSize-4 ){ /* TH3: corrupt081.100 */
1753       return SQLITE_CORRUPT_PAGE(pPage);
1754     }
1755     assert( iFreeBlk>iPtr || iFreeBlk==0 );
1756 
1757     /* At this point:
1758     **    iFreeBlk:   First freeblock after iStart, or zero if none
1759     **    iPtr:       The address of a pointer to iFreeBlk
1760     **
1761     ** Check to see if iFreeBlk should be coalesced onto the end of iStart.
1762     */
1763     if( iFreeBlk && iEnd+3>=iFreeBlk ){
1764       nFrag = iFreeBlk - iEnd;
1765       if( iEnd>iFreeBlk ) return SQLITE_CORRUPT_PAGE(pPage);
1766       iEnd = iFreeBlk + get2byte(&data[iFreeBlk+2]);
1767       if( iEnd > pPage->pBt->usableSize ){
1768         return SQLITE_CORRUPT_PAGE(pPage);
1769       }
1770       iSize = iEnd - iStart;
1771       iFreeBlk = get2byte(&data[iFreeBlk]);
1772     }
1773 
1774     /* If iPtr is another freeblock (that is, if iPtr is not the freelist
1775     ** pointer in the page header) then check to see if iStart should be
1776     ** coalesced onto the end of iPtr.
1777     */
1778     if( iPtr>hdr+1 ){
1779       int iPtrEnd = iPtr + get2byte(&data[iPtr+2]);
1780       if( iPtrEnd+3>=iStart ){
1781         if( iPtrEnd>iStart ) return SQLITE_CORRUPT_PAGE(pPage);
1782         nFrag += iStart - iPtrEnd;
1783         iSize = iEnd - iPtr;
1784         iStart = iPtr;
1785       }
1786     }
1787     if( nFrag>data[hdr+7] ) return SQLITE_CORRUPT_PAGE(pPage);
1788     data[hdr+7] -= nFrag;
1789   }
1790   x = get2byte(&data[hdr+5]);
1791   if( iStart<=x ){
1792     /* The new freeblock is at the beginning of the cell content area,
1793     ** so just extend the cell content area rather than create another
1794     ** freelist entry */
1795     if( iStart<x ) return SQLITE_CORRUPT_PAGE(pPage);
1796     if( iPtr!=hdr+1 ) return SQLITE_CORRUPT_PAGE(pPage);
1797     put2byte(&data[hdr+1], iFreeBlk);
1798     put2byte(&data[hdr+5], iEnd);
1799   }else{
1800     /* Insert the new freeblock into the freelist */
1801     put2byte(&data[iPtr], iStart);
1802   }
1803   if( pPage->pBt->btsFlags & BTS_FAST_SECURE ){
1804     /* Overwrite deleted information with zeros when the secure_delete
1805     ** option is enabled */
1806     memset(&data[iStart], 0, iSize);
1807   }
1808   put2byte(&data[iStart], iFreeBlk);
1809   put2byte(&data[iStart+2], iSize);
1810   pPage->nFree += iOrigSize;
1811   return SQLITE_OK;
1812 }
1813 
1814 /*
1815 ** Decode the flags byte (the first byte of the header) for a page
1816 ** and initialize fields of the MemPage structure accordingly.
1817 **
1818 ** Only the following combinations are supported.  Anything different
1819 ** indicates a corrupt database files:
1820 **
1821 **         PTF_ZERODATA
1822 **         PTF_ZERODATA | PTF_LEAF
1823 **         PTF_LEAFDATA | PTF_INTKEY
1824 **         PTF_LEAFDATA | PTF_INTKEY | PTF_LEAF
1825 */
decodeFlags(MemPage * pPage,int flagByte)1826 static int decodeFlags(MemPage *pPage, int flagByte){
1827   BtShared *pBt;     /* A copy of pPage->pBt */
1828 
1829   assert( pPage->hdrOffset==(pPage->pgno==1 ? 100 : 0) );
1830   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1831   pPage->leaf = (u8)(flagByte>>3);  assert( PTF_LEAF == 1<<3 );
1832   flagByte &= ~PTF_LEAF;
1833   pPage->childPtrSize = 4-4*pPage->leaf;
1834   pPage->xCellSize = cellSizePtr;
1835   pBt = pPage->pBt;
1836   if( flagByte==(PTF_LEAFDATA | PTF_INTKEY) ){
1837     /* EVIDENCE-OF: R-07291-35328 A value of 5 (0x05) means the page is an
1838     ** interior table b-tree page. */
1839     assert( (PTF_LEAFDATA|PTF_INTKEY)==5 );
1840     /* EVIDENCE-OF: R-26900-09176 A value of 13 (0x0d) means the page is a
1841     ** leaf table b-tree page. */
1842     assert( (PTF_LEAFDATA|PTF_INTKEY|PTF_LEAF)==13 );
1843     pPage->intKey = 1;
1844     if( pPage->leaf ){
1845       pPage->intKeyLeaf = 1;
1846       pPage->xParseCell = btreeParseCellPtr;
1847     }else{
1848       pPage->intKeyLeaf = 0;
1849       pPage->xCellSize = cellSizePtrNoPayload;
1850       pPage->xParseCell = btreeParseCellPtrNoPayload;
1851     }
1852     pPage->maxLocal = pBt->maxLeaf;
1853     pPage->minLocal = pBt->minLeaf;
1854   }else if( flagByte==PTF_ZERODATA ){
1855     /* EVIDENCE-OF: R-43316-37308 A value of 2 (0x02) means the page is an
1856     ** interior index b-tree page. */
1857     assert( (PTF_ZERODATA)==2 );
1858     /* EVIDENCE-OF: R-59615-42828 A value of 10 (0x0a) means the page is a
1859     ** leaf index b-tree page. */
1860     assert( (PTF_ZERODATA|PTF_LEAF)==10 );
1861     pPage->intKey = 0;
1862     pPage->intKeyLeaf = 0;
1863     pPage->xParseCell = btreeParseCellPtrIndex;
1864     pPage->maxLocal = pBt->maxLocal;
1865     pPage->minLocal = pBt->minLocal;
1866   }else{
1867     /* EVIDENCE-OF: R-47608-56469 Any other value for the b-tree page type is
1868     ** an error. */
1869     return SQLITE_CORRUPT_PAGE(pPage);
1870   }
1871   pPage->max1bytePayload = pBt->max1bytePayload;
1872   return SQLITE_OK;
1873 }
1874 
1875 /*
1876 ** Compute the amount of freespace on the page.  In other words, fill
1877 ** in the pPage->nFree field.
1878 */
btreeComputeFreeSpace(MemPage * pPage)1879 static int btreeComputeFreeSpace(MemPage *pPage){
1880   int pc;            /* Address of a freeblock within pPage->aData[] */
1881   u8 hdr;            /* Offset to beginning of page header */
1882   u8 *data;          /* Equal to pPage->aData */
1883   int usableSize;    /* Amount of usable space on each page */
1884   int nFree;         /* Number of unused bytes on the page */
1885   int top;           /* First byte of the cell content area */
1886   int iCellFirst;    /* First allowable cell or freeblock offset */
1887   int iCellLast;     /* Last possible cell or freeblock offset */
1888 
1889   assert( pPage->pBt!=0 );
1890   assert( pPage->pBt->db!=0 );
1891   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
1892   assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) );
1893   assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) );
1894   assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) );
1895   assert( pPage->isInit==1 );
1896   assert( pPage->nFree<0 );
1897 
1898   usableSize = pPage->pBt->usableSize;
1899   hdr = pPage->hdrOffset;
1900   data = pPage->aData;
1901   /* EVIDENCE-OF: R-58015-48175 The two-byte integer at offset 5 designates
1902   ** the start of the cell content area. A zero value for this integer is
1903   ** interpreted as 65536. */
1904   top = get2byteNotZero(&data[hdr+5]);
1905   iCellFirst = hdr + 8 + pPage->childPtrSize + 2*pPage->nCell;
1906   iCellLast = usableSize - 4;
1907 
1908   /* Compute the total free space on the page
1909   ** EVIDENCE-OF: R-23588-34450 The two-byte integer at offset 1 gives the
1910   ** start of the first freeblock on the page, or is zero if there are no
1911   ** freeblocks. */
1912   pc = get2byte(&data[hdr+1]);
1913   nFree = data[hdr+7] + top;  /* Init nFree to non-freeblock free space */
1914   if( pc>0 ){
1915     u32 next, size;
1916     if( pc<top ){
1917       /* EVIDENCE-OF: R-55530-52930 In a well-formed b-tree page, there will
1918       ** always be at least one cell before the first freeblock.
1919       */
1920       return SQLITE_CORRUPT_PAGE(pPage);
1921     }
1922     while( 1 ){
1923       if( pc>iCellLast ){
1924         /* Freeblock off the end of the page */
1925         return SQLITE_CORRUPT_PAGE(pPage);
1926       }
1927       next = get2byte(&data[pc]);
1928       size = get2byte(&data[pc+2]);
1929       nFree = nFree + size;
1930       if( next<=pc+size+3 ) break;
1931       pc = next;
1932     }
1933     if( next>0 ){
1934       /* Freeblock not in ascending order */
1935       return SQLITE_CORRUPT_PAGE(pPage);
1936     }
1937     if( pc+size>(unsigned int)usableSize ){
1938       /* Last freeblock extends past page end */
1939       return SQLITE_CORRUPT_PAGE(pPage);
1940     }
1941   }
1942 
1943   /* At this point, nFree contains the sum of the offset to the start
1944   ** of the cell-content area plus the number of free bytes within
1945   ** the cell-content area. If this is greater than the usable-size
1946   ** of the page, then the page must be corrupted. This check also
1947   ** serves to verify that the offset to the start of the cell-content
1948   ** area, according to the page header, lies within the page.
1949   */
1950   if( nFree>usableSize || nFree<iCellFirst ){
1951     return SQLITE_CORRUPT_PAGE(pPage);
1952   }
1953   pPage->nFree = (u16)(nFree - iCellFirst);
1954   return SQLITE_OK;
1955 }
1956 
1957 /*
1958 ** Do additional sanity check after btreeInitPage() if
1959 ** PRAGMA cell_size_check=ON
1960 */
btreeCellSizeCheck(MemPage * pPage)1961 static SQLITE_NOINLINE int btreeCellSizeCheck(MemPage *pPage){
1962   int iCellFirst;    /* First allowable cell or freeblock offset */
1963   int iCellLast;     /* Last possible cell or freeblock offset */
1964   int i;             /* Index into the cell pointer array */
1965   int sz;            /* Size of a cell */
1966   int pc;            /* Address of a freeblock within pPage->aData[] */
1967   u8 *data;          /* Equal to pPage->aData */
1968   int usableSize;    /* Maximum usable space on the page */
1969   int cellOffset;    /* Start of cell content area */
1970 
1971   iCellFirst = pPage->cellOffset + 2*pPage->nCell;
1972   usableSize = pPage->pBt->usableSize;
1973   iCellLast = usableSize - 4;
1974   data = pPage->aData;
1975   cellOffset = pPage->cellOffset;
1976   if( !pPage->leaf ) iCellLast--;
1977   for(i=0; i<pPage->nCell; i++){
1978     pc = get2byteAligned(&data[cellOffset+i*2]);
1979     testcase( pc==iCellFirst );
1980     testcase( pc==iCellLast );
1981     if( pc<iCellFirst || pc>iCellLast ){
1982       return SQLITE_CORRUPT_PAGE(pPage);
1983     }
1984     sz = pPage->xCellSize(pPage, &data[pc]);
1985     testcase( pc+sz==usableSize );
1986     if( pc+sz>usableSize ){
1987       return SQLITE_CORRUPT_PAGE(pPage);
1988     }
1989   }
1990   return SQLITE_OK;
1991 }
1992 
1993 /*
1994 ** Initialize the auxiliary information for a disk block.
1995 **
1996 ** Return SQLITE_OK on success.  If we see that the page does
1997 ** not contain a well-formed database page, then return
1998 ** SQLITE_CORRUPT.  Note that a return of SQLITE_OK does not
1999 ** guarantee that the page is well-formed.  It only shows that
2000 ** we failed to detect any corruption.
2001 */
btreeInitPage(MemPage * pPage)2002 static int btreeInitPage(MemPage *pPage){
2003   u8 *data;          /* Equal to pPage->aData */
2004   BtShared *pBt;        /* The main btree structure */
2005 
2006   assert( pPage->pBt!=0 );
2007   assert( pPage->pBt->db!=0 );
2008   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
2009   assert( pPage->pgno==sqlite3PagerPagenumber(pPage->pDbPage) );
2010   assert( pPage == sqlite3PagerGetExtra(pPage->pDbPage) );
2011   assert( pPage->aData == sqlite3PagerGetData(pPage->pDbPage) );
2012   assert( pPage->isInit==0 );
2013 
2014   pBt = pPage->pBt;
2015   data = pPage->aData + pPage->hdrOffset;
2016   /* EVIDENCE-OF: R-28594-02890 The one-byte flag at offset 0 indicating
2017   ** the b-tree page type. */
2018   if( decodeFlags(pPage, data[0]) ){
2019     return SQLITE_CORRUPT_PAGE(pPage);
2020   }
2021   assert( pBt->pageSize>=512 && pBt->pageSize<=65536 );
2022   pPage->maskPage = (u16)(pBt->pageSize - 1);
2023   pPage->nOverflow = 0;
2024   pPage->cellOffset = pPage->hdrOffset + 8 + pPage->childPtrSize;
2025   pPage->aCellIdx = data + pPage->childPtrSize + 8;
2026   pPage->aDataEnd = pPage->aData + pBt->usableSize;
2027   pPage->aDataOfst = pPage->aData + pPage->childPtrSize;
2028   /* EVIDENCE-OF: R-37002-32774 The two-byte integer at offset 3 gives the
2029   ** number of cells on the page. */
2030   pPage->nCell = get2byte(&data[3]);
2031   if( pPage->nCell>MX_CELL(pBt) ){
2032     /* To many cells for a single page.  The page must be corrupt */
2033     return SQLITE_CORRUPT_PAGE(pPage);
2034   }
2035   testcase( pPage->nCell==MX_CELL(pBt) );
2036   /* EVIDENCE-OF: R-24089-57979 If a page contains no cells (which is only
2037   ** possible for a root page of a table that contains no rows) then the
2038   ** offset to the cell content area will equal the page size minus the
2039   ** bytes of reserved space. */
2040   assert( pPage->nCell>0
2041        || get2byteNotZero(&data[5])==(int)pBt->usableSize
2042        || CORRUPT_DB );
2043   pPage->nFree = -1;  /* Indicate that this value is yet uncomputed */
2044   pPage->isInit = 1;
2045   if( pBt->db->flags & SQLITE_CellSizeCk ){
2046     return btreeCellSizeCheck(pPage);
2047   }
2048   return SQLITE_OK;
2049 }
2050 
2051 /*
2052 ** Set up a raw page so that it looks like a database page holding
2053 ** no entries.
2054 */
zeroPage(MemPage * pPage,int flags)2055 static void zeroPage(MemPage *pPage, int flags){
2056   unsigned char *data = pPage->aData;
2057   BtShared *pBt = pPage->pBt;
2058   u8 hdr = pPage->hdrOffset;
2059   u16 first;
2060 
2061   assert( sqlite3PagerPagenumber(pPage->pDbPage)==pPage->pgno );
2062   assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
2063   assert( sqlite3PagerGetData(pPage->pDbPage) == data );
2064   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
2065   assert( sqlite3_mutex_held(pBt->mutex) );
2066   if( pBt->btsFlags & BTS_FAST_SECURE ){
2067     memset(&data[hdr], 0, pBt->usableSize - hdr);
2068   }
2069   data[hdr] = (char)flags;
2070   first = hdr + ((flags&PTF_LEAF)==0 ? 12 : 8);
2071   memset(&data[hdr+1], 0, 4);
2072   data[hdr+7] = 0;
2073   put2byte(&data[hdr+5], pBt->usableSize);
2074   pPage->nFree = (u16)(pBt->usableSize - first);
2075   decodeFlags(pPage, flags);
2076   pPage->cellOffset = first;
2077   pPage->aDataEnd = &data[pBt->usableSize];
2078   pPage->aCellIdx = &data[first];
2079   pPage->aDataOfst = &data[pPage->childPtrSize];
2080   pPage->nOverflow = 0;
2081   assert( pBt->pageSize>=512 && pBt->pageSize<=65536 );
2082   pPage->maskPage = (u16)(pBt->pageSize - 1);
2083   pPage->nCell = 0;
2084   pPage->isInit = 1;
2085 }
2086 
2087 
2088 /*
2089 ** Convert a DbPage obtained from the pager into a MemPage used by
2090 ** the btree layer.
2091 */
btreePageFromDbPage(DbPage * pDbPage,Pgno pgno,BtShared * pBt)2092 static MemPage *btreePageFromDbPage(DbPage *pDbPage, Pgno pgno, BtShared *pBt){
2093   MemPage *pPage = (MemPage*)sqlite3PagerGetExtra(pDbPage);
2094   if( pgno!=pPage->pgno ){
2095     pPage->aData = sqlite3PagerGetData(pDbPage);
2096     pPage->pDbPage = pDbPage;
2097     pPage->pBt = pBt;
2098     pPage->pgno = pgno;
2099     pPage->hdrOffset = pgno==1 ? 100 : 0;
2100   }
2101   assert( pPage->aData==sqlite3PagerGetData(pDbPage) );
2102   return pPage;
2103 }
2104 
2105 /*
2106 ** Get a page from the pager.  Initialize the MemPage.pBt and
2107 ** MemPage.aData elements if needed.  See also: btreeGetUnusedPage().
2108 **
2109 ** If the PAGER_GET_NOCONTENT flag is set, it means that we do not care
2110 ** about the content of the page at this time.  So do not go to the disk
2111 ** to fetch the content.  Just fill in the content with zeros for now.
2112 ** If in the future we call sqlite3PagerWrite() on this page, that
2113 ** means we have started to be concerned about content and the disk
2114 ** read should occur at that point.
2115 */
btreeGetPage(BtShared * pBt,Pgno pgno,MemPage ** ppPage,int flags)2116 static int btreeGetPage(
2117   BtShared *pBt,       /* The btree */
2118   Pgno pgno,           /* Number of the page to fetch */
2119   MemPage **ppPage,    /* Return the page in this parameter */
2120   int flags            /* PAGER_GET_NOCONTENT or PAGER_GET_READONLY */
2121 ){
2122   int rc;
2123   DbPage *pDbPage;
2124 
2125   assert( flags==0 || flags==PAGER_GET_NOCONTENT || flags==PAGER_GET_READONLY );
2126   assert( sqlite3_mutex_held(pBt->mutex) );
2127   rc = sqlite3PagerGet(pBt->pPager, pgno, (DbPage**)&pDbPage, flags);
2128   if( rc ) return rc;
2129   *ppPage = btreePageFromDbPage(pDbPage, pgno, pBt);
2130   return SQLITE_OK;
2131 }
2132 
2133 /*
2134 ** Retrieve a page from the pager cache. If the requested page is not
2135 ** already in the pager cache return NULL. Initialize the MemPage.pBt and
2136 ** MemPage.aData elements if needed.
2137 */
btreePageLookup(BtShared * pBt,Pgno pgno)2138 static MemPage *btreePageLookup(BtShared *pBt, Pgno pgno){
2139   DbPage *pDbPage;
2140   assert( sqlite3_mutex_held(pBt->mutex) );
2141   pDbPage = sqlite3PagerLookup(pBt->pPager, pgno);
2142   if( pDbPage ){
2143     return btreePageFromDbPage(pDbPage, pgno, pBt);
2144   }
2145   return 0;
2146 }
2147 
2148 /*
2149 ** Return the size of the database file in pages. If there is any kind of
2150 ** error, return ((unsigned int)-1).
2151 */
btreePagecount(BtShared * pBt)2152 static Pgno btreePagecount(BtShared *pBt){
2153   return pBt->nPage;
2154 }
sqlite3BtreeLastPage(Btree * p)2155 Pgno sqlite3BtreeLastPage(Btree *p){
2156   assert( sqlite3BtreeHoldsMutex(p) );
2157   return btreePagecount(p->pBt);
2158 }
2159 
2160 /*
2161 ** Get a page from the pager and initialize it.
2162 **
2163 ** If pCur!=0 then the page is being fetched as part of a moveToChild()
2164 ** call.  Do additional sanity checking on the page in this case.
2165 ** And if the fetch fails, this routine must decrement pCur->iPage.
2166 **
2167 ** The page is fetched as read-write unless pCur is not NULL and is
2168 ** a read-only cursor.
2169 **
2170 ** If an error occurs, then *ppPage is undefined. It
2171 ** may remain unchanged, or it may be set to an invalid value.
2172 */
getAndInitPage(BtShared * pBt,Pgno pgno,MemPage ** ppPage,BtCursor * pCur,int bReadOnly)2173 static int getAndInitPage(
2174   BtShared *pBt,                  /* The database file */
2175   Pgno pgno,                      /* Number of the page to get */
2176   MemPage **ppPage,               /* Write the page pointer here */
2177   BtCursor *pCur,                 /* Cursor to receive the page, or NULL */
2178   int bReadOnly                   /* True for a read-only page */
2179 ){
2180   int rc;
2181   DbPage *pDbPage;
2182   assert( sqlite3_mutex_held(pBt->mutex) );
2183   assert( pCur==0 || ppPage==&pCur->pPage );
2184   assert( pCur==0 || bReadOnly==pCur->curPagerFlags );
2185   assert( pCur==0 || pCur->iPage>0 );
2186 
2187   if( pgno>btreePagecount(pBt) ){
2188     rc = SQLITE_CORRUPT_BKPT;
2189     goto getAndInitPage_error1;
2190   }
2191   rc = sqlite3PagerGet(pBt->pPager, pgno, (DbPage**)&pDbPage, bReadOnly);
2192   if( rc ){
2193     goto getAndInitPage_error1;
2194   }
2195   *ppPage = (MemPage*)sqlite3PagerGetExtra(pDbPage);
2196   if( (*ppPage)->isInit==0 ){
2197     btreePageFromDbPage(pDbPage, pgno, pBt);
2198     rc = btreeInitPage(*ppPage);
2199     if( rc!=SQLITE_OK ){
2200       goto getAndInitPage_error2;
2201     }
2202   }
2203   assert( (*ppPage)->pgno==pgno );
2204   assert( (*ppPage)->aData==sqlite3PagerGetData(pDbPage) );
2205 
2206   /* If obtaining a child page for a cursor, we must verify that the page is
2207   ** compatible with the root page. */
2208   if( pCur && ((*ppPage)->nCell<1 || (*ppPage)->intKey!=pCur->curIntKey) ){
2209     rc = SQLITE_CORRUPT_PGNO(pgno);
2210     goto getAndInitPage_error2;
2211   }
2212   return SQLITE_OK;
2213 
2214 getAndInitPage_error2:
2215   releasePage(*ppPage);
2216 getAndInitPage_error1:
2217   if( pCur ){
2218     pCur->iPage--;
2219     pCur->pPage = pCur->apPage[pCur->iPage];
2220   }
2221   testcase( pgno==0 );
2222   assert( pgno!=0 || rc==SQLITE_CORRUPT );
2223   return rc;
2224 }
2225 
2226 /*
2227 ** Release a MemPage.  This should be called once for each prior
2228 ** call to btreeGetPage.
2229 **
2230 ** Page1 is a special case and must be released using releasePageOne().
2231 */
releasePageNotNull(MemPage * pPage)2232 static void releasePageNotNull(MemPage *pPage){
2233   assert( pPage->aData );
2234   assert( pPage->pBt );
2235   assert( pPage->pDbPage!=0 );
2236   assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
2237   assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData );
2238   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
2239   sqlite3PagerUnrefNotNull(pPage->pDbPage);
2240 }
/*
** Same as releasePageNotNull(), except that a NULL pPage is accepted
** and silently ignored.
*/
static void releasePage(MemPage *pPage){
  if( pPage==0 ) return;
  releasePageNotNull(pPage);
}
/*
** Drop one reference to pPage using sqlite3PagerUnrefPageOne().  pPage
** must not be NULL.
*/
static void releasePageOne(MemPage *pPage){
  assert( pPage!=0 );
  /* The MemPage and its pager page must still refer to each other */
  assert( pPage->pDbPage!=0 );
  assert( pPage->pBt );
  assert( pPage->aData );
  assert( sqlite3PagerGetData(pPage->pDbPage)==pPage->aData );
  assert( sqlite3PagerGetExtra(pPage->pDbPage) == (void*)pPage );
  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  sqlite3PagerUnrefPageOne(pPage->pDbPage);
}
2254 
2255 /*
2256 ** Get an unused page.
2257 **
2258 ** This works just like btreeGetPage() with the addition:
2259 **
2260 **   *  If the page is already in use for some other purpose, immediately
2261 **      release it and return an SQLITE_CURRUPT error.
2262 **   *  Make sure the isInit flag is clear
2263 */
btreeGetUnusedPage(BtShared * pBt,Pgno pgno,MemPage ** ppPage,int flags)2264 static int btreeGetUnusedPage(
2265   BtShared *pBt,       /* The btree */
2266   Pgno pgno,           /* Number of the page to fetch */
2267   MemPage **ppPage,    /* Return the page in this parameter */
2268   int flags            /* PAGER_GET_NOCONTENT or PAGER_GET_READONLY */
2269 ){
2270   int rc = btreeGetPage(pBt, pgno, ppPage, flags);
2271   if( rc==SQLITE_OK ){
2272     if( sqlite3PagerPageRefcount((*ppPage)->pDbPage)>1 ){
2273       releasePage(*ppPage);
2274       *ppPage = 0;
2275       return SQLITE_CORRUPT_BKPT;
2276     }
2277     (*ppPage)->isInit = 0;
2278   }else{
2279     *ppPage = 0;
2280   }
2281   return rc;
2282 }
2283 
2284 
2285 /*
2286 ** During a rollback, when the pager reloads information into the cache
2287 ** so that the cache is restored to its original state at the start of
2288 ** the transaction, for each page restored this routine is called.
2289 **
2290 ** This routine needs to reset the extra data section at the end of the
2291 ** page to agree with the restored data.
2292 */
pageReinit(DbPage * pData)2293 static void pageReinit(DbPage *pData){
2294   MemPage *pPage;
2295   pPage = (MemPage *)sqlite3PagerGetExtra(pData);
2296   assert( sqlite3PagerPageRefcount(pData)>0 );
2297   if( pPage->isInit ){
2298     assert( sqlite3_mutex_held(pPage->pBt->mutex) );
2299     pPage->isInit = 0;
2300     if( sqlite3PagerPageRefcount(pData)>1 ){
2301       /* pPage might not be a btree page;  it might be an overflow page
2302       ** or ptrmap page or a free page.  In those cases, the following
2303       ** call to btreeInitPage() will likely return SQLITE_CORRUPT.
2304       ** But no harm is done by this.  And it is very important that
2305       ** btreeInitPage() be called on every btree page so we make
2306       ** the call for every page that comes in for re-initing. */
2307       btreeInitPage(pPage);
2308     }
2309   }
2310 }
2311 
2312 /*
2313 ** Invoke the busy handler for a btree.
2314 */
btreeInvokeBusyHandler(void * pArg)2315 static int btreeInvokeBusyHandler(void *pArg){
2316   BtShared *pBt = (BtShared*)pArg;
2317   assert( pBt->db );
2318   assert( sqlite3_mutex_held(pBt->db->mutex) );
2319   return sqlite3InvokeBusyHandler(&pBt->db->busyHandler);
2320 }
2321 
/*
** Open a database file.
**
** zFilename is the name of the database file.  If zFilename is NULL
** or an empty string then an ephemeral database is created.  The
** ephemeral database might be exclusively in memory, or it might use a
** disk-based memory cache.  Either way, the ephemeral database will be
** automatically deleted when sqlite3BtreeClose() is called.
**
** If zFilename is ":memory:" then an in-memory database is created
** that is automatically destroyed when it is closed.
**
** The "flags" parameter is a bitmask that might contain bits like
** BTREE_OMIT_JOURNAL and/or BTREE_MEMORY.  It must fit in 8 bits.
**
** If the database is already opened in the same database connection
** and we are in shared cache mode, then the open will fail with an
** SQLITE_CONSTRAINT error.  We cannot allow two or more BtShared
** objects in the same database connection since doing so will lead
** to problems with locking.
**
** Return value: SQLITE_OK on success, in which case *ppBtree holds the
** new Btree handle.  Failures that reach the common exit label free the
** partially built objects and set *ppBtree to 0.  The early-return
** failure paths (allocation failure of the handle or of the full
** pathname buffer, failure of sqlite3OsFullPathname(), and
** SQLITE_CONSTRAINT) return without writing to *ppBtree.
*/
int sqlite3BtreeOpen(
  sqlite3_vfs *pVfs,      /* VFS to use for this b-tree */
  const char *zFilename,  /* Name of the file containing the BTree database */
  sqlite3 *db,            /* Associated database handle */
  Btree **ppBtree,        /* Pointer to new Btree object written here */
  int flags,              /* Options */
  int vfsFlags            /* Flags passed through to sqlite3_vfs.xOpen() */
){
  BtShared *pBt = 0;             /* Shared part of btree structure */
  Btree *p;                      /* Handle to return */
  sqlite3_mutex *mutexOpen = 0;  /* Prevents a race condition. Ticket #3537 */
  int rc = SQLITE_OK;            /* Result code from this function */
  u8 nReserve;                   /* Byte of unused space on each page */
  unsigned char zDbHeader[100];  /* Database header content */

  /* True if opening an ephemeral, temporary database */
  const int isTempDb = zFilename==0 || zFilename[0]==0;

  /* Set the variable isMemdb to true for an in-memory database, or
  ** false for a file-based database.
  */
#ifdef SQLITE_OMIT_MEMORYDB
  const int isMemdb = 0;
#else
  const int isMemdb = (zFilename && strcmp(zFilename, ":memory:")==0)
                       || (isTempDb && sqlite3TempInMemory(db))
                       || (vfsFlags & SQLITE_OPEN_MEMORY)!=0;
#endif

  assert( db!=0 );
  assert( pVfs!=0 );
  assert( sqlite3_mutex_held(db->mutex) );
  assert( (flags&0xff)==flags );   /* flags fit in 8 bits */

  /* Only a BTREE_SINGLE database can be BTREE_UNORDERED */
  assert( (flags & BTREE_UNORDERED)==0 || (flags & BTREE_SINGLE)!=0 );

  /* A BTREE_SINGLE database is always a temporary and/or ephemeral */
  assert( (flags & BTREE_SINGLE)==0 || isTempDb );

  if( isMemdb ){
    flags |= BTREE_MEMORY;
  }
  /* An in-memory or temporary database is not opened as a main database
  ** file: replace the MAIN_DB bit with TEMP_DB before vfsFlags is handed
  ** to sqlite3PagerOpen() below. */
  if( (vfsFlags & SQLITE_OPEN_MAIN_DB)!=0 && (isMemdb || isTempDb) ){
    vfsFlags = (vfsFlags & ~SQLITE_OPEN_MAIN_DB) | SQLITE_OPEN_TEMP_DB;
  }
  p = sqlite3MallocZero(sizeof(Btree));
  if( !p ){
    return SQLITE_NOMEM_BKPT;
  }
  p->inTrans = TRANS_NONE;
  p->db = db;
#ifndef SQLITE_OMIT_SHARED_CACHE
  p->lock.pBtree = p;
  p->lock.iTable = 1;
#endif

#if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
  /*
  ** If this Btree is a candidate for shared cache, try to find an
  ** existing BtShared object that we can share with
  */
  if( isTempDb==0 && (isMemdb==0 || (vfsFlags&SQLITE_OPEN_URI)!=0) ){
    if( vfsFlags & SQLITE_OPEN_SHAREDCACHE ){
      int nFilename = sqlite3Strlen30(zFilename)+1;
      int nFullPathname = pVfs->mxPathname+1;
      /* The buffer is sized for whichever is larger, because for an
      ** in-memory database zFilename is copied into it verbatim. */
      char *zFullPathname = sqlite3Malloc(MAX(nFullPathname,nFilename));
      MUTEX_LOGIC( sqlite3_mutex *mutexShared; )

      p->sharable = 1;
      if( !zFullPathname ){
        sqlite3_free(p);
        return SQLITE_NOMEM_BKPT;
      }
      if( isMemdb ){
        memcpy(zFullPathname, zFilename, nFilename);
      }else{
        rc = sqlite3OsFullPathname(pVfs, zFilename,
                                   nFullPathname, zFullPathname);
        if( rc ){
          /* SQLITE_OK_SYMLINK is treated as success here; any other
          ** non-zero code aborts the open. */
          if( rc==SQLITE_OK_SYMLINK ){
            rc = SQLITE_OK;
          }else{
            sqlite3_free(zFullPathname);
            sqlite3_free(p);
            return rc;
          }
        }
      }
#if SQLITE_THREADSAFE
      mutexOpen = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_OPEN);
      sqlite3_mutex_enter(mutexOpen);
      mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MAIN);
      sqlite3_mutex_enter(mutexShared);
#endif
      /* Scan the global list of sharable BtShared objects for one that has
      ** the same full pathname and the same VFS. */
      for(pBt=GLOBAL(BtShared*,sqlite3SharedCacheList); pBt; pBt=pBt->pNext){
        assert( pBt->nRef>0 );
        if( 0==strcmp(zFullPathname, sqlite3PagerFilename(pBt->pPager, 0))
                 && sqlite3PagerVfs(pBt->pPager)==pVfs ){
          int iDb;
          /* Refuse to attach the same BtShared twice to one connection. */
          for(iDb=db->nDb-1; iDb>=0; iDb--){
            Btree *pExisting = db->aDb[iDb].pBt;
            if( pExisting && pExisting->pBt==pBt ){
              sqlite3_mutex_leave(mutexShared);
              sqlite3_mutex_leave(mutexOpen);
              sqlite3_free(zFullPathname);
              sqlite3_free(p);
              return SQLITE_CONSTRAINT;
            }
          }
          p->pBt = pBt;
          pBt->nRef++;
          break;
        }
      }
      /* Only mutexShared is released here.  mutexOpen stays held and is
      ** released at the very end of this function. */
      sqlite3_mutex_leave(mutexShared);
      sqlite3_free(zFullPathname);
    }
#ifdef SQLITE_DEBUG
    else{
      /* In debug mode, we mark all persistent databases as sharable
      ** even when they are not.  This exercises the locking code and
      ** gives more opportunity for asserts(sqlite3_mutex_held())
      ** statements to find locking problems.
      */
      p->sharable = 1;
    }
#endif
  }
#endif
  /* pBt is still 0 if no existing BtShared was found above (or if sharing
  ** was not attempted at all).  In that case build a new one. */
  if( pBt==0 ){
    /*
    ** The following asserts make sure that structures used by the btree are
    ** the right size.  This is to guard against size changes that result
    ** when compiling on a different architecture.
    */
    assert( sizeof(i64)==8 );
    assert( sizeof(u64)==8 );
    assert( sizeof(u32)==4 );
    assert( sizeof(u16)==2 );
    assert( sizeof(Pgno)==4 );

    pBt = sqlite3MallocZero( sizeof(*pBt) );
    if( pBt==0 ){
      rc = SQLITE_NOMEM_BKPT;
      goto btree_open_out;
    }
    rc = sqlite3PagerOpen(pVfs, &pBt->pPager, zFilename,
                          sizeof(MemPage), flags, vfsFlags, pageReinit);
    if( rc==SQLITE_OK ){
      sqlite3PagerSetMmapLimit(pBt->pPager, db->szMmap);
      rc = sqlite3PagerReadFileheader(pBt->pPager,sizeof(zDbHeader),zDbHeader);
    }
    if( rc!=SQLITE_OK ){
      goto btree_open_out;
    }
    pBt->openFlags = (u8)flags;
    pBt->db = db;
    sqlite3PagerSetBusyHandler(pBt->pPager, btreeInvokeBusyHandler, pBt);
    p->pBt = pBt;

    pBt->pCursor = 0;
    pBt->pPage1 = 0;
    if( sqlite3PagerIsreadonly(pBt->pPager) ) pBt->btsFlags |= BTS_READ_ONLY;
#if defined(SQLITE_SECURE_DELETE)
    pBt->btsFlags |= BTS_SECURE_DELETE;
#elif defined(SQLITE_FAST_SECURE_DELETE)
    pBt->btsFlags |= BTS_OVERWRITE;
#endif
    /* EVIDENCE-OF: R-51873-39618 The page size for a database file is
    ** determined by the 2-byte integer located at an offset of 16 bytes from
    ** the beginning of the database file.
    **
    ** Byte 16 is shifted left by 8 and byte 17 by 16, so the header bytes
    ** 0x00 0x01 decode to a page size of 65536, which does not fit in the
    ** 2-byte field directly. */
    pBt->pageSize = (zDbHeader[16]<<8) | (zDbHeader[17]<<16);
    if( pBt->pageSize<512 || pBt->pageSize>SQLITE_MAX_PAGE_SIZE
         || ((pBt->pageSize-1)&pBt->pageSize)!=0 ){
      /* The header does not hold a usable page size.  A page size of 0 is
      ** passed to sqlite3PagerSetPagesize() below -- presumably that lets
      ** the pager keep its own default; confirm against the pager code. */
      pBt->pageSize = 0;
#ifndef SQLITE_OMIT_AUTOVACUUM
      /* If the magic name ":memory:" will create an in-memory database, then
      ** leave the autoVacuum mode at 0 (do not auto-vacuum), even if
      ** SQLITE_DEFAULT_AUTOVACUUM is true. On the other hand, if
      ** SQLITE_OMIT_MEMORYDB has been defined, then ":memory:" is just a
      ** regular file-name. In this case the auto-vacuum applies as per normal.
      */
      if( zFilename && !isMemdb ){
        pBt->autoVacuum = (SQLITE_DEFAULT_AUTOVACUUM ? 1 : 0);
        pBt->incrVacuum = (SQLITE_DEFAULT_AUTOVACUUM==2 ? 1 : 0);
      }
#endif
      nReserve = 0;
    }else{
      /* EVIDENCE-OF: R-37497-42412 The size of the reserved region is
      ** determined by the one-byte unsigned integer found at an offset of 20
      ** into the database file header. */
      nReserve = zDbHeader[20];
      pBt->btsFlags |= BTS_PAGESIZE_FIXED;
#ifndef SQLITE_OMIT_AUTOVACUUM
      pBt->autoVacuum = (get4byte(&zDbHeader[36 + 4*4])?1:0);
      pBt->incrVacuum = (get4byte(&zDbHeader[36 + 7*4])?1:0);
#endif
    }
    /* sqlite3PagerSetPagesize() receives a pointer to pBt->pageSize, so it
    ** may update that field; usableSize is computed only afterwards. */
    rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve);
    if( rc ) goto btree_open_out;
    pBt->usableSize = pBt->pageSize - nReserve;
    assert( (pBt->pageSize & 7)==0 );  /* 8-byte alignment of pageSize */

#if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
    /* Add the new BtShared object to the linked list sharable BtShareds.
    */
    pBt->nRef = 1;
    if( p->sharable ){
      MUTEX_LOGIC( sqlite3_mutex *mutexShared; )
      MUTEX_LOGIC( mutexShared = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MAIN);)
      if( SQLITE_THREADSAFE && sqlite3GlobalConfig.bCoreMutex ){
        pBt->mutex = sqlite3MutexAlloc(SQLITE_MUTEX_FAST);
        if( pBt->mutex==0 ){
          rc = SQLITE_NOMEM_BKPT;
          goto btree_open_out;
        }
      }
      /* Push the new object onto the head of the global list. */
      sqlite3_mutex_enter(mutexShared);
      pBt->pNext = GLOBAL(BtShared*,sqlite3SharedCacheList);
      GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt;
      sqlite3_mutex_leave(mutexShared);
    }
#endif
  }

#if !defined(SQLITE_OMIT_SHARED_CACHE) && !defined(SQLITE_OMIT_DISKIO)
  /* If the new Btree uses a sharable pBtShared, then link the new
  ** Btree into the list of all sharable Btrees for the same connection.
  ** The list is kept in ascending order by pBt address.
  */
  if( p->sharable ){
    int i;
    Btree *pSib;
    for(i=0; i<db->nDb; i++){
      if( (pSib = db->aDb[i].pBt)!=0 && pSib->sharable ){
        /* Rewind to the head of the sibling list, then either insert p in
        ** front of the head or walk forward to the insertion point. */
        while( pSib->pPrev ){ pSib = pSib->pPrev; }
        if( (uptr)p->pBt<(uptr)pSib->pBt ){
          p->pNext = pSib;
          p->pPrev = 0;
          pSib->pPrev = p;
        }else{
          while( pSib->pNext && (uptr)pSib->pNext->pBt<(uptr)p->pBt ){
            pSib = pSib->pNext;
          }
          p->pNext = pSib->pNext;
          p->pPrev = pSib;
          if( p->pNext ){
            p->pNext->pPrev = p;
          }
          pSib->pNext = p;
        }
        /* One sharable sibling is enough to locate the list. */
        break;
      }
    }
  }
#endif
  *ppBtree = p;

btree_open_out:
  if( rc!=SQLITE_OK ){
    /* Failure: release whatever was built and report no handle. */
    if( pBt && pBt->pPager ){
      sqlite3PagerClose(pBt->pPager, 0);
    }
    sqlite3_free(pBt);
    sqlite3_free(p);
    *ppBtree = 0;
  }else{
    sqlite3_file *pFile;

    /* If the B-Tree was successfully opened, set the pager-cache size to the
    ** default value. Except, when opening on an existing shared pager-cache,
    ** do not change the pager-cache size.
    */
    if( sqlite3BtreeSchema(p, 0, 0)==0 ){
      sqlite3BtreeSetCacheSize(p, SQLITE_DEFAULT_CACHE_SIZE);
    }

    /* Pass the address of pBt->db to the file object via the
    ** SQLITE_FCNTL_PDB hint, but only if the file has methods. */
    pFile = sqlite3PagerFile(pBt->pPager);
    if( pFile->pMethods ){
      sqlite3OsFileControlHint(pFile, SQLITE_FCNTL_PDB, (void*)&pBt->db);
    }
  }
  /* mutexOpen is non-zero only if it was entered in the shared-cache
  ** lookup above; release it on every path that reaches this point. */
  if( mutexOpen ){
    assert( sqlite3_mutex_held(mutexOpen) );
    sqlite3_mutex_leave(mutexOpen);
  }
  assert( rc!=SQLITE_OK || sqlite3BtreeConnectionCount(*ppBtree)>0 );
  return rc;
}
2634 
2635 /*
2636 ** Decrement the BtShared.nRef counter.  When it reaches zero,
2637 ** remove the BtShared structure from the sharing list.  Return
2638 ** true if the BtShared.nRef counter reaches zero and return
2639 ** false if it is still positive.
2640 */
removeFromSharingList(BtShared * pBt)2641 static int removeFromSharingList(BtShared *pBt){
2642 #ifndef SQLITE_OMIT_SHARED_CACHE
2643   MUTEX_LOGIC( sqlite3_mutex *pMainMtx; )
2644   BtShared *pList;
2645   int removed = 0;
2646 
2647   assert( sqlite3_mutex_notheld(pBt->mutex) );
2648   MUTEX_LOGIC( pMainMtx = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_MAIN); )
2649   sqlite3_mutex_enter(pMainMtx);
2650   pBt->nRef--;
2651   if( pBt->nRef<=0 ){
2652     if( GLOBAL(BtShared*,sqlite3SharedCacheList)==pBt ){
2653       GLOBAL(BtShared*,sqlite3SharedCacheList) = pBt->pNext;
2654     }else{
2655       pList = GLOBAL(BtShared*,sqlite3SharedCacheList);
2656       while( ALWAYS(pList) && pList->pNext!=pBt ){
2657         pList=pList->pNext;
2658       }
2659       if( ALWAYS(pList) ){
2660         pList->pNext = pBt->pNext;
2661       }
2662     }
2663     if( SQLITE_THREADSAFE ){
2664       sqlite3_mutex_free(pBt->mutex);
2665     }
2666     removed = 1;
2667   }
2668   sqlite3_mutex_leave(pMainMtx);
2669   return removed;
2670 #else
2671   return 1;
2672 #endif
2673 }
2674 
2675 /*
2676 ** Make sure pBt->pTmpSpace points to an allocation of
2677 ** MX_CELL_SIZE(pBt) bytes with a 4-byte prefix for a left-child
2678 ** pointer.
2679 */
allocateTempSpace(BtShared * pBt)2680 static void allocateTempSpace(BtShared *pBt){
2681   if( !pBt->pTmpSpace ){
2682     pBt->pTmpSpace = sqlite3PageMalloc( pBt->pageSize );
2683 
2684     /* One of the uses of pBt->pTmpSpace is to format cells before
2685     ** inserting them into a leaf page (function fillInCell()). If
2686     ** a cell is less than 4 bytes in size, it is rounded up to 4 bytes
2687     ** by the various routines that manipulate binary cells. Which
2688     ** can mean that fillInCell() only initializes the first 2 or 3
2689     ** bytes of pTmpSpace, but that the first 4 bytes are copied from
2690     ** it into a database page. This is not actually a problem, but it
2691     ** does cause a valgrind error when the 1 or 2 bytes of unitialized
2692     ** data is passed to system call write(). So to avoid this error,
2693     ** zero the first 4 bytes of temp space here.
2694     **
2695     ** Also:  Provide four bytes of initialized space before the
2696     ** beginning of pTmpSpace as an area available to prepend the
2697     ** left-child pointer to the beginning of a cell.
2698     */
2699     if( pBt->pTmpSpace ){
2700       memset(pBt->pTmpSpace, 0, 8);
2701       pBt->pTmpSpace += 4;
2702     }
2703   }
2704 }
2705 
2706 /*
2707 ** Free the pBt->pTmpSpace allocation
2708 */
freeTempSpace(BtShared * pBt)2709 static void freeTempSpace(BtShared *pBt){
2710   if( pBt->pTmpSpace ){
2711     pBt->pTmpSpace -= 4;
2712     sqlite3PageFree(pBt->pTmpSpace);
2713     pBt->pTmpSpace = 0;
2714   }
2715 }
2716 
2717 /*
2718 ** Close an open database and invalidate all cursors.
2719 */
sqlite3BtreeClose(Btree * p)2720 int sqlite3BtreeClose(Btree *p){
2721   BtShared *pBt = p->pBt;
2722   BtCursor *pCur;
2723 
2724   /* Close all cursors opened via this handle.  */
2725   assert( sqlite3_mutex_held(p->db->mutex) );
2726   sqlite3BtreeEnter(p);
2727   pCur = pBt->pCursor;
2728   while( pCur ){
2729     BtCursor *pTmp = pCur;
2730     pCur = pCur->pNext;
2731     if( pTmp->pBtree==p ){
2732       sqlite3BtreeCloseCursor(pTmp);
2733     }
2734   }
2735 
2736   /* Rollback any active transaction and free the handle structure.
2737   ** The call to sqlite3BtreeRollback() drops any table-locks held by
2738   ** this handle.
2739   */
2740   sqlite3BtreeRollback(p, SQLITE_OK, 0);
2741   sqlite3BtreeLeave(p);
2742 
2743   /* If there are still other outstanding references to the shared-btree
2744   ** structure, return now. The remainder of this procedure cleans
2745   ** up the shared-btree.
2746   */
2747   assert( p->wantToLock==0 && p->locked==0 );
2748   if( !p->sharable || removeFromSharingList(pBt) ){
2749     /* The pBt is no longer on the sharing list, so we can access
2750     ** it without having to hold the mutex.
2751     **
2752     ** Clean out and delete the BtShared object.
2753     */
2754     assert( !pBt->pCursor );
2755     sqlite3PagerClose(pBt->pPager, p->db);
2756     if( pBt->xFreeSchema && pBt->pSchema ){
2757       pBt->xFreeSchema(pBt->pSchema);
2758     }
2759     sqlite3DbFree(0, pBt->pSchema);
2760     freeTempSpace(pBt);
2761     sqlite3_free(pBt);
2762   }
2763 
2764 #ifndef SQLITE_OMIT_SHARED_CACHE
2765   assert( p->wantToLock==0 );
2766   assert( p->locked==0 );
2767   if( p->pPrev ) p->pPrev->pNext = p->pNext;
2768   if( p->pNext ) p->pNext->pPrev = p->pPrev;
2769 #endif
2770 
2771   sqlite3_free(p);
2772   return SQLITE_OK;
2773 }
2774 
2775 /*
2776 ** Change the "soft" limit on the number of pages in the cache.
2777 ** Unused and unmodified pages will be recycled when the number of
2778 ** pages in the cache exceeds this soft limit.  But the size of the
2779 ** cache is allowed to grow larger than this limit if it contains
2780 ** dirty pages or pages still in active use.
2781 */
sqlite3BtreeSetCacheSize(Btree * p,int mxPage)2782 int sqlite3BtreeSetCacheSize(Btree *p, int mxPage){
2783   BtShared *pBt = p->pBt;
2784   assert( sqlite3_mutex_held(p->db->mutex) );
2785   sqlite3BtreeEnter(p);
2786   sqlite3PagerSetCachesize(pBt->pPager, mxPage);
2787   sqlite3BtreeLeave(p);
2788   return SQLITE_OK;
2789 }
2790 
2791 /*
2792 ** Change the "spill" limit on the number of pages in the cache.
2793 ** If the number of pages exceeds this limit during a write transaction,
2794 ** the pager might attempt to "spill" pages to the journal early in
2795 ** order to free up memory.
2796 **
2797 ** The value returned is the current spill size.  If zero is passed
2798 ** as an argument, no changes are made to the spill size setting, so
2799 ** using mxPage of 0 is a way to query the current spill size.
2800 */
sqlite3BtreeSetSpillSize(Btree * p,int mxPage)2801 int sqlite3BtreeSetSpillSize(Btree *p, int mxPage){
2802   BtShared *pBt = p->pBt;
2803   int res;
2804   assert( sqlite3_mutex_held(p->db->mutex) );
2805   sqlite3BtreeEnter(p);
2806   res = sqlite3PagerSetSpillsize(pBt->pPager, mxPage);
2807   sqlite3BtreeLeave(p);
2808   return res;
2809 }
2810 
2811 #if SQLITE_MAX_MMAP_SIZE>0
2812 /*
2813 ** Change the limit on the amount of the database file that may be
2814 ** memory mapped.
2815 */
sqlite3BtreeSetMmapLimit(Btree * p,sqlite3_int64 szMmap)2816 int sqlite3BtreeSetMmapLimit(Btree *p, sqlite3_int64 szMmap){
2817   BtShared *pBt = p->pBt;
2818   assert( sqlite3_mutex_held(p->db->mutex) );
2819   sqlite3BtreeEnter(p);
2820   sqlite3PagerSetMmapLimit(pBt->pPager, szMmap);
2821   sqlite3BtreeLeave(p);
2822   return SQLITE_OK;
2823 }
2824 #endif /* SQLITE_MAX_MMAP_SIZE>0 */
2825 
2826 /*
2827 ** Change the way data is synced to disk in order to increase or decrease
2828 ** how well the database resists damage due to OS crashes and power
2829 ** failures.  Level 1 is the same as asynchronous (no syncs() occur and
2830 ** there is a high probability of damage)  Level 2 is the default.  There
2831 ** is a very low but non-zero probability of damage.  Level 3 reduces the
2832 ** probability of damage to near zero but with a write performance reduction.
2833 */
2834 #ifndef SQLITE_OMIT_PAGER_PRAGMAS
/* Forward pgFlags to sqlite3PagerSetFlags() for the pager that backs p,
** while holding the btree mutex.  Always returns SQLITE_OK. */
int sqlite3BtreeSetPagerFlags(
  Btree *p,              /* The btree to set the safety level on */
  unsigned pgFlags       /* Various PAGER_* flags */
){
  Pager *pPager = p->pBt->pPager;
  assert( sqlite3_mutex_held(p->db->mutex) );
  sqlite3BtreeEnter(p);
  sqlite3PagerSetFlags(pPager, pgFlags);
  sqlite3BtreeLeave(p);
  return SQLITE_OK;
}
2846 #endif
2847 
2848 /*
2849 ** Change the default pages size and the number of reserved bytes per page.
2850 ** Or, if the page size has already been fixed, return SQLITE_READONLY
2851 ** without changing anything.
2852 **
2853 ** The page size must be a power of 2 between 512 and 65536.  If the page
2854 ** size supplied does not meet this constraint then the page size is not
2855 ** changed.
2856 **
2857 ** Page sizes are constrained to be a power of two so that the region
2858 ** of the database file used for locking (beginning at PENDING_BYTE,
2859 ** the first byte past the 1GB boundary, 0x40000000) needs to occur
2860 ** at the beginning of a page.
2861 **
2862 ** If parameter nReserve is less than zero, then the number of reserved
2863 ** bytes per page is left unchanged.
2864 **
2865 ** If the iFix!=0 then the BTS_PAGESIZE_FIXED flag is set so that the page size
2866 ** and autovacuum mode can no longer be changed.
2867 */
sqlite3BtreeSetPageSize(Btree * p,int pageSize,int nReserve,int iFix)2868 int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve, int iFix){
2869   int rc = SQLITE_OK;
2870   int x;
2871   BtShared *pBt = p->pBt;
2872   assert( nReserve>=0 && nReserve<=255 );
2873   sqlite3BtreeEnter(p);
2874   pBt->nReserveWanted = nReserve;
2875   x = pBt->pageSize - pBt->usableSize;
2876   if( nReserve<x ) nReserve = x;
2877   if( pBt->btsFlags & BTS_PAGESIZE_FIXED ){
2878     sqlite3BtreeLeave(p);
2879     return SQLITE_READONLY;
2880   }
2881   assert( nReserve>=0 && nReserve<=255 );
2882   if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE &&
2883         ((pageSize-1)&pageSize)==0 ){
2884     assert( (pageSize & 7)==0 );
2885     assert( !pBt->pCursor );
2886     pBt->pageSize = (u32)pageSize;
2887     freeTempSpace(pBt);
2888   }
2889   rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize, nReserve);
2890   pBt->usableSize = pBt->pageSize - (u16)nReserve;
2891   if( iFix ) pBt->btsFlags |= BTS_PAGESIZE_FIXED;
2892   sqlite3BtreeLeave(p);
2893   return rc;
2894 }
2895 
2896 /*
2897 ** Return the currently defined page size
2898 */
sqlite3BtreeGetPageSize(Btree * p)2899 int sqlite3BtreeGetPageSize(Btree *p){
2900   return p->pBt->pageSize;
2901 }
2902 
2903 /*
2904 ** This function is similar to sqlite3BtreeGetReserve(), except that it
2905 ** may only be called if it is guaranteed that the b-tree mutex is already
2906 ** held.
2907 **
2908 ** This is useful in one special case in the backup API code where it is
2909 ** known that the shared b-tree mutex is held, but the mutex on the
2910 ** database handle that owns *p is not. In this case if sqlite3BtreeEnter()
2911 ** were to be called, it might collide with some other operation on the
2912 ** database handle that owns *p, causing undefined behavior.
2913 */
sqlite3BtreeGetReserveNoMutex(Btree * p)2914 int sqlite3BtreeGetReserveNoMutex(Btree *p){
2915   int n;
2916   assert( sqlite3_mutex_held(p->pBt->mutex) );
2917   n = p->pBt->pageSize - p->pBt->usableSize;
2918   return n;
2919 }
2920 
2921 /*
2922 ** Return the number of bytes of space at the end of every page that
2923 ** are intentually left unused.  This is the "reserved" space that is
2924 ** sometimes used by extensions.
2925 **
2926 ** The value returned is the larger of the current reserve size and
2927 ** the latest reserve size requested by SQLITE_FILECTRL_RESERVE_BYTES.
2928 ** The amount of reserve can only grow - never shrink.
2929 */
sqlite3BtreeGetRequestedReserve(Btree * p)2930 int sqlite3BtreeGetRequestedReserve(Btree *p){
2931   int n1, n2;
2932   sqlite3BtreeEnter(p);
2933   n1 = (int)p->pBt->nReserveWanted;
2934   n2 = sqlite3BtreeGetReserveNoMutex(p);
2935   sqlite3BtreeLeave(p);
2936   return n1>n2 ? n1 : n2;
2937 }
2938 
2939 
2940 /*
2941 ** Set the maximum page count for a database if mxPage is positive.
2942 ** No changes are made if mxPage is 0 or negative.
2943 ** Regardless of the value of mxPage, return the maximum page count.
2944 */
sqlite3BtreeMaxPageCount(Btree * p,Pgno mxPage)2945 Pgno sqlite3BtreeMaxPageCount(Btree *p, Pgno mxPage){
2946   Pgno n;
2947   sqlite3BtreeEnter(p);
2948   n = sqlite3PagerMaxPageCount(p->pBt->pPager, mxPage);
2949   sqlite3BtreeLeave(p);
2950   return n;
2951 }
2952 
2953 /*
2954 ** Change the values for the BTS_SECURE_DELETE and BTS_OVERWRITE flags:
2955 **
2956 **    newFlag==0       Both BTS_SECURE_DELETE and BTS_OVERWRITE are cleared
2957 **    newFlag==1       BTS_SECURE_DELETE set and BTS_OVERWRITE is cleared
2958 **    newFlag==2       BTS_SECURE_DELETE cleared and BTS_OVERWRITE is set
2959 **    newFlag==(-1)    No changes
2960 **
2961 ** This routine acts as a query if newFlag is less than zero
2962 **
2963 ** With BTS_OVERWRITE set, deleted content is overwritten by zeros, but
2964 ** freelist leaf pages are not written back to the database.  Thus in-page
2965 ** deleted content is cleared, but freelist deleted content is not.
2966 **
2967 ** With BTS_SECURE_DELETE, operation is like BTS_OVERWRITE with the addition
2968 ** that freelist leaf pages are written back into the database, increasing
2969 ** the amount of disk I/O.
2970 */
sqlite3BtreeSecureDelete(Btree * p,int newFlag)2971 int sqlite3BtreeSecureDelete(Btree *p, int newFlag){
2972   int b;
2973   if( p==0 ) return 0;
2974   sqlite3BtreeEnter(p);
2975   assert( BTS_OVERWRITE==BTS_SECURE_DELETE*2 );
2976   assert( BTS_FAST_SECURE==(BTS_OVERWRITE|BTS_SECURE_DELETE) );
2977   if( newFlag>=0 ){
2978     p->pBt->btsFlags &= ~BTS_FAST_SECURE;
2979     p->pBt->btsFlags |= BTS_SECURE_DELETE*newFlag;
2980   }
2981   b = (p->pBt->btsFlags & BTS_FAST_SECURE)/BTS_SECURE_DELETE;
2982   sqlite3BtreeLeave(p);
2983   return b;
2984 }
2985 
2986 /*
2987 ** Change the 'auto-vacuum' property of the database. If the 'autoVacuum'
2988 ** parameter is non-zero, then auto-vacuum mode is enabled. If zero, it
2989 ** is disabled. The default value for the auto-vacuum property is
2990 ** determined by the SQLITE_DEFAULT_AUTOVACUUM macro.
2991 */
sqlite3BtreeSetAutoVacuum(Btree * p,int autoVacuum)2992 int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum){
2993 #ifdef SQLITE_OMIT_AUTOVACUUM
2994   return SQLITE_READONLY;
2995 #else
2996   BtShared *pBt = p->pBt;
2997   int rc = SQLITE_OK;
2998   u8 av = (u8)autoVacuum;
2999 
3000   sqlite3BtreeEnter(p);
3001   if( (pBt->btsFlags & BTS_PAGESIZE_FIXED)!=0 && (av ?1:0)!=pBt->autoVacuum ){
3002     rc = SQLITE_READONLY;
3003   }else{
3004     pBt->autoVacuum = av ?1:0;
3005     pBt->incrVacuum = av==2 ?1:0;
3006   }
3007   sqlite3BtreeLeave(p);
3008   return rc;
3009 #endif
3010 }
3011 
3012 /*
3013 ** Return the value of the 'auto-vacuum' property. If auto-vacuum is
3014 ** enabled 1 is returned. Otherwise 0.
3015 */
sqlite3BtreeGetAutoVacuum(Btree * p)3016 int sqlite3BtreeGetAutoVacuum(Btree *p){
3017 #ifdef SQLITE_OMIT_AUTOVACUUM
3018   return BTREE_AUTOVACUUM_NONE;
3019 #else
3020   int rc;
3021   sqlite3BtreeEnter(p);
3022   rc = (
3023     (!p->pBt->autoVacuum)?BTREE_AUTOVACUUM_NONE:
3024     (!p->pBt->incrVacuum)?BTREE_AUTOVACUUM_FULL:
3025     BTREE_AUTOVACUUM_INCR
3026   );
3027   sqlite3BtreeLeave(p);
3028   return rc;
3029 #endif
3030 }
3031 
3032 /*
3033 ** If the user has not set the safety-level for this database connection
3034 ** using "PRAGMA synchronous", and if the safety-level is not already
3035 ** set to the value passed to this function as the second parameter,
3036 ** set it so.
3037 */
3038 #if SQLITE_DEFAULT_SYNCHRONOUS!=SQLITE_DEFAULT_WAL_SYNCHRONOUS \
3039     && !defined(SQLITE_OMIT_WAL)
static void setDefaultSyncFlag(BtShared *pBt, u8 safety_level){
  sqlite3 *db = pBt->db;
  Db *pDb;

  if( db==0 ) return;
  pDb = db->aDb;
  if( pDb==0 ) return;

  /* Advance to the aDb[] entry whose Btree uses pBt.  The loop has no
  ** bound of its own; it relies on such an entry being present. */
  while( pDb->pBt==0 || pDb->pBt->pBt!=pBt ){
    pDb++;
  }

  /* Leave the entry alone if bSyncSet is non-zero, if the level already
  ** matches, or if the entry is aDb[1]. */
  if( pDb->bSyncSet!=0 ) return;
  if( pDb->safety_level==safety_level ) return;
  if( pDb==&db->aDb[1] ) return;

  pDb->safety_level = safety_level;
  sqlite3PagerSetFlags(pBt->pPager,
      pDb->safety_level | (db->flags & PAGER_FLAGS_MASK));
}
3055 #else
3056 # define setDefaultSyncFlag(pBt,safety_level)
3057 #endif
3058 
3059 /* Forward declaration */
3060 static int newDatabase(BtShared*);
3061 
3062 
3063 /*
3064 ** Get a reference to pPage1 of the database file.  This will
3065 ** also acquire a readlock on that file.
3066 **
3067 ** SQLITE_OK is returned on success.  If the file is not a
3068 ** well-formed database file, then SQLITE_CORRUPT is returned.
3069 ** SQLITE_BUSY is returned if the database is locked.  SQLITE_NOMEM
3070 ** is returned if we run out of memory.
3071 */
lockBtree(BtShared * pBt)3072 static int lockBtree(BtShared *pBt){
3073   int rc;              /* Result code from subfunctions */
3074   MemPage *pPage1;     /* Page 1 of the database file */
3075   u32 nPage;           /* Number of pages in the database */
3076   u32 nPageFile = 0;   /* Number of pages in the database file */
3077   u32 nPageHeader;     /* Number of pages in the database according to hdr */
3078 
3079   assert( sqlite3_mutex_held(pBt->mutex) );
3080   assert( pBt->pPage1==0 );
3081   rc = sqlite3PagerSharedLock(pBt->pPager);
3082   if( rc!=SQLITE_OK ) return rc;
3083   rc = btreeGetPage(pBt, 1, &pPage1, 0);
3084   if( rc!=SQLITE_OK ) return rc;
3085 
3086   /* Do some checking to help insure the file we opened really is
3087   ** a valid database file.
3088   */
3089   nPage = nPageHeader = get4byte(28+(u8*)pPage1->aData);
3090   sqlite3PagerPagecount(pBt->pPager, (int*)&nPageFile);
3091   if( nPage==0 || memcmp(24+(u8*)pPage1->aData, 92+(u8*)pPage1->aData,4)!=0 ){
3092     nPage = nPageFile;
3093   }
3094   if( (pBt->db->flags & SQLITE_ResetDatabase)!=0 ){
3095     nPage = 0;
3096   }
3097   if( nPage>0 ){
3098     u32 pageSize;
3099     u32 usableSize;
3100     u8 *page1 = pPage1->aData;
3101     rc = SQLITE_NOTADB;
3102     /* EVIDENCE-OF: R-43737-39999 Every valid SQLite database file begins
3103     ** with the following 16 bytes (in hex): 53 51 4c 69 74 65 20 66 6f 72 6d
3104     ** 61 74 20 33 00. */
3105     if( memcmp(page1, zMagicHeader, 16)!=0 ){
3106       goto page1_init_failed;
3107     }
3108 
3109 #ifdef SQLITE_OMIT_WAL
3110     if( page1[18]>1 ){
3111       pBt->btsFlags |= BTS_READ_ONLY;
3112     }
3113     if( page1[19]>1 ){
3114       goto page1_init_failed;
3115     }
3116 #else
3117     if( page1[18]>2 ){
3118       pBt->btsFlags |= BTS_READ_ONLY;
3119     }
3120     if( page1[19]>2 ){
3121       goto page1_init_failed;
3122     }
3123 
3124     /* If the write version is set to 2, this database should be accessed
3125     ** in WAL mode. If the log is not already open, open it now. Then
3126     ** return SQLITE_OK and return without populating BtShared.pPage1.
3127     ** The caller detects this and calls this function again. This is
3128     ** required as the version of page 1 currently in the page1 buffer
3129     ** may not be the latest version - there may be a newer one in the log
3130     ** file.
3131     */
3132     if( page1[19]==2 && (pBt->btsFlags & BTS_NO_WAL)==0 ){
3133       int isOpen = 0;
3134       rc = sqlite3PagerOpenWal(pBt->pPager, &isOpen);
3135       if( rc!=SQLITE_OK ){
3136         goto page1_init_failed;
3137       }else{
3138         setDefaultSyncFlag(pBt, SQLITE_DEFAULT_WAL_SYNCHRONOUS+1);
3139         if( isOpen==0 ){
3140           releasePageOne(pPage1);
3141           return SQLITE_OK;
3142         }
3143       }
3144       rc = SQLITE_NOTADB;
3145     }else{
3146       setDefaultSyncFlag(pBt, SQLITE_DEFAULT_SYNCHRONOUS+1);
3147     }
3148 #endif
3149 
3150     /* EVIDENCE-OF: R-15465-20813 The maximum and minimum embedded payload
3151     ** fractions and the leaf payload fraction values must be 64, 32, and 32.
3152     **
3153     ** The original design allowed these amounts to vary, but as of
3154     ** version 3.6.0, we require them to be fixed.
3155     */
3156     if( memcmp(&page1[21], "\100\040\040",3)!=0 ){
3157       goto page1_init_failed;
3158     }
3159     /* EVIDENCE-OF: R-51873-39618 The page size for a database file is
3160     ** determined by the 2-byte integer located at an offset of 16 bytes from
3161     ** the beginning of the database file. */
3162     pageSize = (page1[16]<<8) | (page1[17]<<16);
3163     /* EVIDENCE-OF: R-25008-21688 The size of a page is a power of two
3164     ** between 512 and 65536 inclusive. */
3165     if( ((pageSize-1)&pageSize)!=0
3166      || pageSize>SQLITE_MAX_PAGE_SIZE
3167      || pageSize<=256
3168     ){
3169       goto page1_init_failed;
3170     }
3171     pBt->btsFlags |= BTS_PAGESIZE_FIXED;
3172     assert( (pageSize & 7)==0 );
3173     /* EVIDENCE-OF: R-59310-51205 The "reserved space" size in the 1-byte
3174     ** integer at offset 20 is the number of bytes of space at the end of
3175     ** each page to reserve for extensions.
3176     **
3177     ** EVIDENCE-OF: R-37497-42412 The size of the reserved region is
3178     ** determined by the one-byte unsigned integer found at an offset of 20
3179     ** into the database file header. */
3180     usableSize = pageSize - page1[20];
3181     if( (u32)pageSize!=pBt->pageSize ){
3182       /* After reading the first page of the database assuming a page size
3183       ** of BtShared.pageSize, we have discovered that the page-size is
3184       ** actually pageSize. Unlock the database, leave pBt->pPage1 at
3185       ** zero and return SQLITE_OK. The caller will call this function
3186       ** again with the correct page-size.
3187       */
3188       releasePageOne(pPage1);
3189       pBt->usableSize = usableSize;
3190       pBt->pageSize = pageSize;
3191       freeTempSpace(pBt);
3192       rc = sqlite3PagerSetPagesize(pBt->pPager, &pBt->pageSize,
3193                                    pageSize-usableSize);
3194       return rc;
3195     }
3196     if( sqlite3WritableSchema(pBt->db)==0 && nPage>nPageFile ){
3197       rc = SQLITE_CORRUPT_BKPT;
3198       goto page1_init_failed;
3199     }
3200     /* EVIDENCE-OF: R-28312-64704 However, the usable size is not allowed to
3201     ** be less than 480. In other words, if the page size is 512, then the
3202     ** reserved space size cannot exceed 32. */
3203     if( usableSize<480 ){
3204       goto page1_init_failed;
3205     }
3206     pBt->pageSize = pageSize;
3207     pBt->usableSize = usableSize;
3208 #ifndef SQLITE_OMIT_AUTOVACUUM
3209     pBt->autoVacuum = (get4byte(&page1[36 + 4*4])?1:0);
3210     pBt->incrVacuum = (get4byte(&page1[36 + 7*4])?1:0);
3211 #endif
3212   }
3213 
3214   /* maxLocal is the maximum amount of payload to store locally for
3215   ** a cell.  Make sure it is small enough so that at least minFanout
  ** cells will fit on one page.  We assume a 10-byte page header.
3217   ** Besides the payload, the cell must store:
3218   **     2-byte pointer to the cell
3219   **     4-byte child pointer
3220   **     9-byte nKey value
3221   **     4-byte nData value
3222   **     4-byte overflow page pointer
3223   ** So a cell consists of a 2-byte pointer, a header which is as much as
3224   ** 17 bytes long, 0 to N bytes of payload, and an optional 4 byte overflow
3225   ** page pointer.
3226   */
3227   pBt->maxLocal = (u16)((pBt->usableSize-12)*64/255 - 23);
3228   pBt->minLocal = (u16)((pBt->usableSize-12)*32/255 - 23);
3229   pBt->maxLeaf = (u16)(pBt->usableSize - 35);
3230   pBt->minLeaf = (u16)((pBt->usableSize-12)*32/255 - 23);
3231   if( pBt->maxLocal>127 ){
3232     pBt->max1bytePayload = 127;
3233   }else{
3234     pBt->max1bytePayload = (u8)pBt->maxLocal;
3235   }
3236   assert( pBt->maxLeaf + 23 <= MX_CELL_SIZE(pBt) );
3237   pBt->pPage1 = pPage1;
3238   pBt->nPage = nPage;
3239   return SQLITE_OK;
3240 
3241 page1_init_failed:
3242   releasePageOne(pPage1);
3243   pBt->pPage1 = 0;
3244   return rc;
3245 }
3246 
3247 #ifndef NDEBUG
3248 /*
3249 ** Return the number of cursors open on pBt. This is for use
3250 ** in assert() expressions, so it is only compiled if NDEBUG is not
3251 ** defined.
3252 **
3253 ** Only write cursors are counted if wrOnly is true.  If wrOnly is
3254 ** false then all cursors are counted.
3255 **
3256 ** For the purposes of this routine, a cursor is any cursor that
3257 ** is capable of reading or writing to the database.  Cursors that
3258 ** have been tripped into the CURSOR_FAULT state are not counted.
3259 */
countValidCursors(BtShared * pBt,int wrOnly)3260 static int countValidCursors(BtShared *pBt, int wrOnly){
3261   BtCursor *pCur;
3262   int r = 0;
3263   for(pCur=pBt->pCursor; pCur; pCur=pCur->pNext){
3264     if( (wrOnly==0 || (pCur->curFlags & BTCF_WriteFlag)!=0)
3265      && pCur->eState!=CURSOR_FAULT ) r++;
3266   }
3267   return r;
3268 }
3269 #endif
3270 
3271 /*
3272 ** If there are no outstanding cursors and we are not in the middle
3273 ** of a transaction but there is a read lock on the database, then
3274 ** this routine unrefs the first page of the database file which
3275 ** has the effect of releasing the read lock.
3276 **
3277 ** If there is a transaction in progress, this routine is a no-op.
3278 */
unlockBtreeIfUnused(BtShared * pBt)3279 static void unlockBtreeIfUnused(BtShared *pBt){
3280   assert( sqlite3_mutex_held(pBt->mutex) );
3281   assert( countValidCursors(pBt,0)==0 || pBt->inTransaction>TRANS_NONE );
3282   if( pBt->inTransaction==TRANS_NONE && pBt->pPage1!=0 ){
3283     MemPage *pPage1 = pBt->pPage1;
3284     assert( pPage1->aData );
3285     assert( sqlite3PagerRefcount(pBt->pPager)==1 );
3286     pBt->pPage1 = 0;
3287     releasePageOne(pPage1);
3288   }
3289 }
3290 
3291 /*
3292 ** If pBt points to an empty file then convert that empty file
3293 ** into a new empty database by initializing the first page of
3294 ** the database.
3295 */
newDatabase(BtShared * pBt)3296 static int newDatabase(BtShared *pBt){
3297   MemPage *pP1;
3298   unsigned char *data;
3299   int rc;
3300 
3301   assert( sqlite3_mutex_held(pBt->mutex) );
3302   if( pBt->nPage>0 ){
3303     return SQLITE_OK;
3304   }
3305   pP1 = pBt->pPage1;
3306   assert( pP1!=0 );
3307   data = pP1->aData;
3308   rc = sqlite3PagerWrite(pP1->pDbPage);
3309   if( rc ) return rc;
3310   memcpy(data, zMagicHeader, sizeof(zMagicHeader));
3311   assert( sizeof(zMagicHeader)==16 );
3312   data[16] = (u8)((pBt->pageSize>>8)&0xff);
3313   data[17] = (u8)((pBt->pageSize>>16)&0xff);
3314   data[18] = 1;
3315   data[19] = 1;
3316   assert( pBt->usableSize<=pBt->pageSize && pBt->usableSize+255>=pBt->pageSize);
3317   data[20] = (u8)(pBt->pageSize - pBt->usableSize);
3318   data[21] = 64;
3319   data[22] = 32;
3320   data[23] = 32;
3321   memset(&data[24], 0, 100-24);
3322   zeroPage(pP1, PTF_INTKEY|PTF_LEAF|PTF_LEAFDATA );
3323   pBt->btsFlags |= BTS_PAGESIZE_FIXED;
3324 #ifndef SQLITE_OMIT_AUTOVACUUM
3325   assert( pBt->autoVacuum==1 || pBt->autoVacuum==0 );
3326   assert( pBt->incrVacuum==1 || pBt->incrVacuum==0 );
3327   put4byte(&data[36 + 4*4], pBt->autoVacuum);
3328   put4byte(&data[36 + 7*4], pBt->incrVacuum);
3329 #endif
3330   pBt->nPage = 1;
3331   data[31] = 1;
3332   return SQLITE_OK;
3333 }
3334 
3335 /*
3336 ** Initialize the first page of the database file (creating a database
3337 ** consisting of a single page and no schema objects). Return SQLITE_OK
3338 ** if successful, or an SQLite error code otherwise.
3339 */
sqlite3BtreeNewDb(Btree * p)3340 int sqlite3BtreeNewDb(Btree *p){
3341   int rc;
3342   sqlite3BtreeEnter(p);
3343   p->pBt->nPage = 0;
3344   rc = newDatabase(p->pBt);
3345   sqlite3BtreeLeave(p);
3346   return rc;
3347 }
3348 
/*
** Attempt to start a new transaction. A write-transaction
** is started if the second argument is nonzero, otherwise a read-
** transaction.  If the second argument is 2 or more, an exclusive
** transaction is started, meaning that no other process is allowed
** to access the database.  A preexisting transaction may not be
** upgraded to exclusive by calling this routine a second time - the
** exclusivity flag only works for a new transaction.
**
** A write-transaction must be started before attempting any
** changes to the database.  None of the following routines
** will work unless a transaction is started first:
**
**      sqlite3BtreeCreateTable()
**      sqlite3BtreeCreateIndex()
**      sqlite3BtreeClearTable()
**      sqlite3BtreeDropTable()
**      sqlite3BtreeInsert()
**      sqlite3BtreeDelete()
**      sqlite3BtreeUpdateMeta()
**
** If an initial attempt to acquire the lock fails because of lock contention
** and the database was previously unlocked, then invoke the busy handler
** if there is one.  But if there was previously a read-lock, do not
** invoke the busy handler - just return SQLITE_BUSY.  SQLITE_BUSY is
** returned when there is already a read-lock in order to avoid a deadlock.
**
** Suppose there are two processes A and B.  A has a read lock and B has
** a reserved lock.  B tries to promote to exclusive but is blocked because
** of A's read lock.  A tries to promote to reserved but is blocked by B.
** One or the other of the two processes must give way or there can be
** no progress.  By returning SQLITE_BUSY and not invoking the busy callback
** when A already has a read lock, we encourage A to give up and let B
** proceed.
**
** Parameters:
**
**   p               The Btree handle on which to open the transaction.
**   wrflag          0 for a read transaction, non-zero for a write
**                   transaction, 2 or more for an exclusive transaction.
**   pSchemaVersion  If not NULL and the call succeeds, *pSchemaVersion is
**                   set to the 4-byte integer at offset 40 of page 1.
**
** Returns SQLITE_OK on success.  Error codes produced directly by this
** routine are SQLITE_READONLY, SQLITE_LOCKED_SHAREDCACHE and SQLITE_BUSY;
** any other error code comes from one of the routines it calls.
*/
int sqlite3BtreeBeginTrans(Btree *p, int wrflag, int *pSchemaVersion){
  BtShared *pBt = p->pBt;
  Pager *pPager = pBt->pPager;
  int rc = SQLITE_OK;

  sqlite3BtreeEnter(p);
  btreeIntegrity(p);

  /* If the btree is already in a write-transaction, or it
  ** is already in a read-transaction and a read-transaction
  ** is requested, this is a no-op.
  */
  if( p->inTrans==TRANS_WRITE || (p->inTrans==TRANS_READ && !wrflag) ){
    goto trans_begun;
  }
  assert( pBt->inTransaction==TRANS_WRITE || IfNotOmitAV(pBt->bDoTruncate)==0 );

  /* When the connection has SQLITE_ResetDatabase set and the pager itself
  ** is not read-only, clear the BTS_READ_ONLY flag so that the write
  ** check below does not reject the transaction. */
  if( (p->db->flags & SQLITE_ResetDatabase)
   && sqlite3PagerIsreadonly(pPager)==0
  ){
    pBt->btsFlags &= ~BTS_READ_ONLY;
  }

  /* Write transactions are not possible on a read-only database */
  if( (pBt->btsFlags & BTS_READ_ONLY)!=0 && wrflag ){
    rc = SQLITE_READONLY;
    goto trans_begun;
  }

#ifndef SQLITE_OMIT_SHARED_CACHE
  {
    sqlite3 *pBlock = 0;
    /* If another database handle has already opened a write transaction
    ** on this shared-btree structure and a second write transaction is
    ** requested, return SQLITE_LOCKED.
    */
    if( (wrflag && pBt->inTransaction==TRANS_WRITE)
     || (pBt->btsFlags & BTS_PENDING)!=0
    ){
      pBlock = pBt->pWriter->db;
    }else if( wrflag>1 ){
      /* An exclusive transaction is blocked by a lock held by any other
      ** Btree handle on this BtShared. */
      BtLock *pIter;
      for(pIter=pBt->pLock; pIter; pIter=pIter->pNext){
        if( pIter->pBtree!=p ){
          pBlock = pIter->pBtree->db;
          break;
        }
      }
    }
    if( pBlock ){
      sqlite3ConnectionBlocked(p->db, pBlock);
      rc = SQLITE_LOCKED_SHAREDCACHE;
      goto trans_begun;
    }
  }
#endif

  /* Any read-only or read-write transaction implies a read-lock on
  ** page 1. So if some other shared-cache client already has a write-lock
  ** on page 1, the transaction cannot be opened. */
  rc = querySharedCacheTableLock(p, SCHEMA_ROOT, READ_LOCK);
  if( SQLITE_OK!=rc ) goto trans_begun;

  /* Record whether the database had zero pages when the transaction
  ** was started. */
  pBt->btsFlags &= ~BTS_INITIALLY_EMPTY;
  if( pBt->nPage==0 ) pBt->btsFlags |= BTS_INITIALLY_EMPTY;
  do {
    sqlite3PagerWalDb(pPager, p->db);

#ifdef SQLITE_ENABLE_SETLK_TIMEOUT
    /* If transitioning from no transaction directly to a write transaction,
    ** block for the WRITER lock first if possible. */
    if( pBt->pPage1==0 && wrflag ){
      assert( pBt->inTransaction==TRANS_NONE );
      rc = sqlite3PagerWalWriteLock(pPager, 1);
      if( rc!=SQLITE_BUSY && rc!=SQLITE_OK ) break;
    }
#endif

    /* Call lockBtree() until either pBt->pPage1 is populated or
    ** lockBtree() returns something other than SQLITE_OK. lockBtree()
    ** may return SQLITE_OK but leave pBt->pPage1 set to 0 if after
    ** reading page 1 it discovers that the page-size of the database
    ** file is not pBt->pageSize. In this case lockBtree() will update
    ** pBt->pageSize to the page-size of the file on disk.
    */
    while( pBt->pPage1==0 && SQLITE_OK==(rc = lockBtree(pBt)) );

    if( rc==SQLITE_OK && wrflag ){
      if( (pBt->btsFlags & BTS_READ_ONLY)!=0 ){
        rc = SQLITE_READONLY;
      }else{
        rc = sqlite3PagerBegin(pPager, wrflag>1, sqlite3TempInMemory(p->db));
        if( rc==SQLITE_OK ){
          rc = newDatabase(pBt);
        }else if( rc==SQLITE_BUSY_SNAPSHOT && pBt->inTransaction==TRANS_NONE ){
          /* if there was no transaction opened when this function was
          ** called and SQLITE_BUSY_SNAPSHOT is returned, change the error
          ** code to SQLITE_BUSY. */
          rc = SQLITE_BUSY;
        }
      }
    }

    /* On any failure, give back the WAL write lock requested above and
    ** release page 1 if nothing else is using it. */
    if( rc!=SQLITE_OK ){
      (void)sqlite3PagerWalWriteLock(pPager, 0);
      unlockBtreeIfUnused(pBt);
    }
  }while( (rc&0xFF)==SQLITE_BUSY && pBt->inTransaction==TRANS_NONE &&
          btreeInvokeBusyHandler(pBt) );
  sqlite3PagerWalDb(pPager, 0);
#ifdef SQLITE_ENABLE_SETLK_TIMEOUT
  if( rc==SQLITE_BUSY_TIMEOUT ) rc = SQLITE_BUSY;
#endif

  if( rc==SQLITE_OK ){
    if( p->inTrans==TRANS_NONE ){
      /* This handle is opening its first transaction: count it and, for
      ** a sharable handle, link its page 1 read lock into pBt->pLock. */
      pBt->nTransaction++;
#ifndef SQLITE_OMIT_SHARED_CACHE
      if( p->sharable ){
        assert( p->lock.pBtree==p && p->lock.iTable==1 );
        p->lock.eLock = READ_LOCK;
        p->lock.pNext = pBt->pLock;
        pBt->pLock = &p->lock;
      }
#endif
    }
    p->inTrans = (wrflag?TRANS_WRITE:TRANS_READ);
    if( p->inTrans>pBt->inTransaction ){
      pBt->inTransaction = p->inTrans;
    }
    if( wrflag ){
      MemPage *pPage1 = pBt->pPage1;
#ifndef SQLITE_OMIT_SHARED_CACHE
      assert( !pBt->pWriter );
      pBt->pWriter = p;
      pBt->btsFlags &= ~BTS_EXCLUSIVE;
      if( wrflag>1 ) pBt->btsFlags |= BTS_EXCLUSIVE;
#endif

      /* If the db-size header field is incorrect (as it may be if an old
      ** client has been writing the database file), update it now. Doing
      ** this sooner rather than later means the database size can safely
      ** be re-read from page 1 if a savepoint or transaction rollback
      ** occurs within the transaction.
      */
      if( pBt->nPage!=get4byte(&pPage1->aData[28]) ){
        rc = sqlite3PagerWrite(pPage1->pDbPage);
        if( rc==SQLITE_OK ){
          put4byte(&pPage1->aData[28], pBt->nPage);
        }
      }
    }
  }

  /* Common exit.  This label is reached both by the early-out jumps above
  ** and by falling through after a transaction was actually opened. */
trans_begun:
  if( rc==SQLITE_OK ){
    if( pSchemaVersion ){
      *pSchemaVersion = get4byte(&pBt->pPage1->aData[40]);
    }
    if( wrflag ){
      /* This call makes sure that the pager has the correct number of
      ** open savepoints. If the second parameter is greater than 0 and
      ** the sub-journal is not already open, then it will be opened here.
      */
      rc = sqlite3PagerOpenSavepoint(pPager, p->db->nSavepoint);
    }
  }

  btreeIntegrity(p);
  sqlite3BtreeLeave(p);
  return rc;
}
3556 
3557 #ifndef SQLITE_OMIT_AUTOVACUUM
3558 
3559 /*
3560 ** Set the pointer-map entries for all children of page pPage. Also, if
3561 ** pPage contains cells that point to overflow pages, set the pointer
3562 ** map entries for the overflow pages as well.
3563 */
setChildPtrmaps(MemPage * pPage)3564 static int setChildPtrmaps(MemPage *pPage){
3565   int i;                             /* Counter variable */
3566   int nCell;                         /* Number of cells in page pPage */
3567   int rc;                            /* Return code */
3568   BtShared *pBt = pPage->pBt;
3569   Pgno pgno = pPage->pgno;
3570 
3571   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
3572   rc = pPage->isInit ? SQLITE_OK : btreeInitPage(pPage);
3573   if( rc!=SQLITE_OK ) return rc;
3574   nCell = pPage->nCell;
3575 
3576   for(i=0; i<nCell; i++){
3577     u8 *pCell = findCell(pPage, i);
3578 
3579     ptrmapPutOvflPtr(pPage, pPage, pCell, &rc);
3580 
3581     if( !pPage->leaf ){
3582       Pgno childPgno = get4byte(pCell);
3583       ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);
3584     }
3585   }
3586 
3587   if( !pPage->leaf ){
3588     Pgno childPgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
3589     ptrmapPut(pBt, childPgno, PTRMAP_BTREE, pgno, &rc);
3590   }
3591 
3592   return rc;
3593 }
3594 
3595 /*
3596 ** Somewhere on pPage is a pointer to page iFrom.  Modify this pointer so
3597 ** that it points to iTo. Parameter eType describes the type of pointer to
3598 ** be modified, as  follows:
3599 **
3600 ** PTRMAP_BTREE:     pPage is a btree-page. The pointer points at a child
3601 **                   page of pPage.
3602 **
3603 ** PTRMAP_OVERFLOW1: pPage is a btree-page. The pointer points at an overflow
3604 **                   page pointed to by one of the cells on pPage.
3605 **
3606 ** PTRMAP_OVERFLOW2: pPage is an overflow-page. The pointer points at the next
3607 **                   overflow page in the list.
3608 */
modifyPagePointer(MemPage * pPage,Pgno iFrom,Pgno iTo,u8 eType)3609 static int modifyPagePointer(MemPage *pPage, Pgno iFrom, Pgno iTo, u8 eType){
3610   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
3611   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
3612   if( eType==PTRMAP_OVERFLOW2 ){
3613     /* The pointer is always the first 4 bytes of the page in this case.  */
3614     if( get4byte(pPage->aData)!=iFrom ){
3615       return SQLITE_CORRUPT_PAGE(pPage);
3616     }
3617     put4byte(pPage->aData, iTo);
3618   }else{
3619     int i;
3620     int nCell;
3621     int rc;
3622 
3623     rc = pPage->isInit ? SQLITE_OK : btreeInitPage(pPage);
3624     if( rc ) return rc;
3625     nCell = pPage->nCell;
3626 
3627     for(i=0; i<nCell; i++){
3628       u8 *pCell = findCell(pPage, i);
3629       if( eType==PTRMAP_OVERFLOW1 ){
3630         CellInfo info;
3631         pPage->xParseCell(pPage, pCell, &info);
3632         if( info.nLocal<info.nPayload ){
3633           if( pCell+info.nSize > pPage->aData+pPage->pBt->usableSize ){
3634             return SQLITE_CORRUPT_PAGE(pPage);
3635           }
3636           if( iFrom==get4byte(pCell+info.nSize-4) ){
3637             put4byte(pCell+info.nSize-4, iTo);
3638             break;
3639           }
3640         }
3641       }else{
3642         if( get4byte(pCell)==iFrom ){
3643           put4byte(pCell, iTo);
3644           break;
3645         }
3646       }
3647     }
3648 
3649     if( i==nCell ){
3650       if( eType!=PTRMAP_BTREE ||
3651           get4byte(&pPage->aData[pPage->hdrOffset+8])!=iFrom ){
3652         return SQLITE_CORRUPT_PAGE(pPage);
3653       }
3654       put4byte(&pPage->aData[pPage->hdrOffset+8], iTo);
3655     }
3656   }
3657   return SQLITE_OK;
3658 }
3659 
3660 
3661 /*
3662 ** Move the open database page pDbPage to location iFreePage in the
3663 ** database. The pDbPage reference remains valid.
3664 **
3665 ** The isCommit flag indicates that there is no need to remember that
3666 ** the journal needs to be sync()ed before database page pDbPage->pgno
3667 ** can be written to. The caller has already promised not to write to that
3668 ** page.
3669 */
relocatePage(BtShared * pBt,MemPage * pDbPage,u8 eType,Pgno iPtrPage,Pgno iFreePage,int isCommit)3670 static int relocatePage(
3671   BtShared *pBt,           /* Btree */
3672   MemPage *pDbPage,        /* Open page to move */
3673   u8 eType,                /* Pointer map 'type' entry for pDbPage */
3674   Pgno iPtrPage,           /* Pointer map 'page-no' entry for pDbPage */
3675   Pgno iFreePage,          /* The location to move pDbPage to */
3676   int isCommit             /* isCommit flag passed to sqlite3PagerMovepage */
3677 ){
3678   MemPage *pPtrPage;   /* The page that contains a pointer to pDbPage */
3679   Pgno iDbPage = pDbPage->pgno;
3680   Pager *pPager = pBt->pPager;
3681   int rc;
3682 
3683   assert( eType==PTRMAP_OVERFLOW2 || eType==PTRMAP_OVERFLOW1 ||
3684       eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE );
3685   assert( sqlite3_mutex_held(pBt->mutex) );
3686   assert( pDbPage->pBt==pBt );
3687   if( iDbPage<3 ) return SQLITE_CORRUPT_BKPT;
3688 
3689   /* Move page iDbPage from its current location to page number iFreePage */
3690   TRACE(("AUTOVACUUM: Moving %d to free page %d (ptr page %d type %d)\n",
3691       iDbPage, iFreePage, iPtrPage, eType));
3692   rc = sqlite3PagerMovepage(pPager, pDbPage->pDbPage, iFreePage, isCommit);
3693   if( rc!=SQLITE_OK ){
3694     return rc;
3695   }
3696   pDbPage->pgno = iFreePage;
3697 
3698   /* If pDbPage was a btree-page, then it may have child pages and/or cells
3699   ** that point to overflow pages. The pointer map entries for all these
3700   ** pages need to be changed.
3701   **
3702   ** If pDbPage is an overflow page, then the first 4 bytes may store a
3703   ** pointer to a subsequent overflow page. If this is the case, then
3704   ** the pointer map needs to be updated for the subsequent overflow page.
3705   */
3706   if( eType==PTRMAP_BTREE || eType==PTRMAP_ROOTPAGE ){
3707     rc = setChildPtrmaps(pDbPage);
3708     if( rc!=SQLITE_OK ){
3709       return rc;
3710     }
3711   }else{
3712     Pgno nextOvfl = get4byte(pDbPage->aData);
3713     if( nextOvfl!=0 ){
3714       ptrmapPut(pBt, nextOvfl, PTRMAP_OVERFLOW2, iFreePage, &rc);
3715       if( rc!=SQLITE_OK ){
3716         return rc;
3717       }
3718     }
3719   }
3720 
3721   /* Fix the database pointer on page iPtrPage that pointed at iDbPage so
3722   ** that it points at iFreePage. Also fix the pointer map entry for
3723   ** iPtrPage.
3724   */
3725   if( eType!=PTRMAP_ROOTPAGE ){
3726     rc = btreeGetPage(pBt, iPtrPage, &pPtrPage, 0);
3727     if( rc!=SQLITE_OK ){
3728       return rc;
3729     }
3730     rc = sqlite3PagerWrite(pPtrPage->pDbPage);
3731     if( rc!=SQLITE_OK ){
3732       releasePage(pPtrPage);
3733       return rc;
3734     }
3735     rc = modifyPagePointer(pPtrPage, iDbPage, iFreePage, eType);
3736     releasePage(pPtrPage);
3737     if( rc==SQLITE_OK ){
3738       ptrmapPut(pBt, iFreePage, eType, iPtrPage, &rc);
3739     }
3740   }
3741   return rc;
3742 }
3743 
3744 /* Forward declaration required by incrVacuumStep(). */
3745 static int allocateBtreePage(BtShared *, MemPage **, Pgno *, Pgno, u8);
3746 
3747 /*
3748 ** Perform a single step of an incremental-vacuum. If successful, return
3749 ** SQLITE_OK. If there is no work to do (and therefore no point in
3750 ** calling this function again), return SQLITE_DONE. Or, if an error
3751 ** occurs, return some other error code.
3752 **
3753 ** More specifically, this function attempts to re-organize the database so
3754 ** that the last page of the file currently in use is no longer in use.
3755 **
3756 ** Parameter nFin is the number of pages that this database would contain
3757 ** were this function called until it returns SQLITE_DONE.
3758 **
3759 ** If the bCommit parameter is non-zero, this function assumes that the
3760 ** caller will keep calling incrVacuumStep() until it returns SQLITE_DONE
3761 ** or an error. bCommit is passed true for an auto-vacuum-on-commit
3762 ** operation, or false for an incremental vacuum.
3763 */
incrVacuumStep(BtShared * pBt,Pgno nFin,Pgno iLastPg,int bCommit)3764 static int incrVacuumStep(BtShared *pBt, Pgno nFin, Pgno iLastPg, int bCommit){
3765   Pgno nFreeList;           /* Number of pages still on the free-list */
3766   int rc;
3767 
3768   assert( sqlite3_mutex_held(pBt->mutex) );
3769   assert( iLastPg>nFin );
3770 
3771   if( !PTRMAP_ISPAGE(pBt, iLastPg) && iLastPg!=PENDING_BYTE_PAGE(pBt) ){
3772     u8 eType;
3773     Pgno iPtrPage;
3774 
3775     nFreeList = get4byte(&pBt->pPage1->aData[36]);
3776     if( nFreeList==0 ){
3777       return SQLITE_DONE;
3778     }
3779 
3780     rc = ptrmapGet(pBt, iLastPg, &eType, &iPtrPage);
3781     if( rc!=SQLITE_OK ){
3782       return rc;
3783     }
3784     if( eType==PTRMAP_ROOTPAGE ){
3785       return SQLITE_CORRUPT_BKPT;
3786     }
3787 
3788     if( eType==PTRMAP_FREEPAGE ){
3789       if( bCommit==0 ){
3790         /* Remove the page from the files free-list. This is not required
3791         ** if bCommit is non-zero. In that case, the free-list will be
3792         ** truncated to zero after this function returns, so it doesn't
3793         ** matter if it still contains some garbage entries.
3794         */
3795         Pgno iFreePg;
3796         MemPage *pFreePg;
3797         rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iLastPg, BTALLOC_EXACT);
3798         if( rc!=SQLITE_OK ){
3799           return rc;
3800         }
3801         assert( iFreePg==iLastPg );
3802         releasePage(pFreePg);
3803       }
3804     } else {
3805       Pgno iFreePg;             /* Index of free page to move pLastPg to */
3806       MemPage *pLastPg;
3807       u8 eMode = BTALLOC_ANY;   /* Mode parameter for allocateBtreePage() */
3808       Pgno iNear = 0;           /* nearby parameter for allocateBtreePage() */
3809 
3810       rc = btreeGetPage(pBt, iLastPg, &pLastPg, 0);
3811       if( rc!=SQLITE_OK ){
3812         return rc;
3813       }
3814 
3815       /* If bCommit is zero, this loop runs exactly once and page pLastPg
3816       ** is swapped with the first free page pulled off the free list.
3817       **
3818       ** On the other hand, if bCommit is greater than zero, then keep
3819       ** looping until a free-page located within the first nFin pages
3820       ** of the file is found.
3821       */
3822       if( bCommit==0 ){
3823         eMode = BTALLOC_LE;
3824         iNear = nFin;
3825       }
3826       do {
3827         MemPage *pFreePg;
3828         rc = allocateBtreePage(pBt, &pFreePg, &iFreePg, iNear, eMode);
3829         if( rc!=SQLITE_OK ){
3830           releasePage(pLastPg);
3831           return rc;
3832         }
3833         releasePage(pFreePg);
3834       }while( bCommit && iFreePg>nFin );
3835       assert( iFreePg<iLastPg );
3836 
3837       rc = relocatePage(pBt, pLastPg, eType, iPtrPage, iFreePg, bCommit);
3838       releasePage(pLastPg);
3839       if( rc!=SQLITE_OK ){
3840         return rc;
3841       }
3842     }
3843   }
3844 
3845   if( bCommit==0 ){
3846     do {
3847       iLastPg--;
3848     }while( iLastPg==PENDING_BYTE_PAGE(pBt) || PTRMAP_ISPAGE(pBt, iLastPg) );
3849     pBt->bDoTruncate = 1;
3850     pBt->nPage = iLastPg;
3851   }
3852   return SQLITE_OK;
3853 }
3854 
3855 /*
3856 ** The database opened by the first argument is an auto-vacuum database
3857 ** nOrig pages in size containing nFree free pages. Return the expected
3858 ** size of the database in pages following an auto-vacuum operation.
3859 */
finalDbSize(BtShared * pBt,Pgno nOrig,Pgno nFree)3860 static Pgno finalDbSize(BtShared *pBt, Pgno nOrig, Pgno nFree){
3861   int nEntry;                     /* Number of entries on one ptrmap page */
3862   Pgno nPtrmap;                   /* Number of PtrMap pages to be freed */
3863   Pgno nFin;                      /* Return value */
3864 
3865   nEntry = pBt->usableSize/5;
3866   nPtrmap = (nFree-nOrig+PTRMAP_PAGENO(pBt, nOrig)+nEntry)/nEntry;
3867   nFin = nOrig - nFree - nPtrmap;
3868   if( nOrig>PENDING_BYTE_PAGE(pBt) && nFin<PENDING_BYTE_PAGE(pBt) ){
3869     nFin--;
3870   }
3871   while( PTRMAP_ISPAGE(pBt, nFin) || nFin==PENDING_BYTE_PAGE(pBt) ){
3872     nFin--;
3873   }
3874 
3875   return nFin;
3876 }
3877 
3878 /*
3879 ** A write-transaction must be opened before calling this function.
3880 ** It performs a single unit of work towards an incremental vacuum.
3881 **
3882 ** If the incremental vacuum is finished after this function has run,
3883 ** SQLITE_DONE is returned. If it is not finished, but no error occurred,
3884 ** SQLITE_OK is returned. Otherwise an SQLite error code.
3885 */
sqlite3BtreeIncrVacuum(Btree * p)3886 int sqlite3BtreeIncrVacuum(Btree *p){
3887   int rc;
3888   BtShared *pBt = p->pBt;
3889 
3890   sqlite3BtreeEnter(p);
3891   assert( pBt->inTransaction==TRANS_WRITE && p->inTrans==TRANS_WRITE );
3892   if( !pBt->autoVacuum ){
3893     rc = SQLITE_DONE;
3894   }else{
3895     Pgno nOrig = btreePagecount(pBt);
3896     Pgno nFree = get4byte(&pBt->pPage1->aData[36]);
3897     Pgno nFin = finalDbSize(pBt, nOrig, nFree);
3898 
3899     if( nOrig<nFin || nFree>=nOrig ){
3900       rc = SQLITE_CORRUPT_BKPT;
3901     }else if( nFree>0 ){
3902       rc = saveAllCursors(pBt, 0, 0);
3903       if( rc==SQLITE_OK ){
3904         invalidateAllOverflowCache(pBt);
3905         rc = incrVacuumStep(pBt, nFin, nOrig, 0);
3906       }
3907       if( rc==SQLITE_OK ){
3908         rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
3909         put4byte(&pBt->pPage1->aData[28], pBt->nPage);
3910       }
3911     }else{
3912       rc = SQLITE_DONE;
3913     }
3914   }
3915   sqlite3BtreeLeave(p);
3916   return rc;
3917 }
3918 
3919 /*
3920 ** This routine is called prior to sqlite3PagerCommit when a transaction
3921 ** is committed for an auto-vacuum database.
3922 **
3923 ** If SQLITE_OK is returned, then *pnTrunc is set to the number of pages
3924 ** the database file should be truncated to during the commit process.
3925 ** i.e. the database has been reorganized so that only the first *pnTrunc
3926 ** pages are in use.
3927 */
autoVacuumCommit(BtShared * pBt)3928 static int autoVacuumCommit(BtShared *pBt){
3929   int rc = SQLITE_OK;
3930   Pager *pPager = pBt->pPager;
3931   VVA_ONLY( int nRef = sqlite3PagerRefcount(pPager); )
3932 
3933   assert( sqlite3_mutex_held(pBt->mutex) );
3934   invalidateAllOverflowCache(pBt);
3935   assert(pBt->autoVacuum);
3936   if( !pBt->incrVacuum ){
3937     Pgno nFin;         /* Number of pages in database after autovacuuming */
3938     Pgno nFree;        /* Number of pages on the freelist initially */
3939     Pgno iFree;        /* The next page to be freed */
3940     Pgno nOrig;        /* Database size before freeing */
3941 
3942     nOrig = btreePagecount(pBt);
3943     if( PTRMAP_ISPAGE(pBt, nOrig) || nOrig==PENDING_BYTE_PAGE(pBt) ){
3944       /* It is not possible to create a database for which the final page
3945       ** is either a pointer-map page or the pending-byte page. If one
3946       ** is encountered, this indicates corruption.
3947       */
3948       return SQLITE_CORRUPT_BKPT;
3949     }
3950 
3951     nFree = get4byte(&pBt->pPage1->aData[36]);
3952     nFin = finalDbSize(pBt, nOrig, nFree);
3953     if( nFin>nOrig ) return SQLITE_CORRUPT_BKPT;
3954     if( nFin<nOrig ){
3955       rc = saveAllCursors(pBt, 0, 0);
3956     }
3957     for(iFree=nOrig; iFree>nFin && rc==SQLITE_OK; iFree--){
3958       rc = incrVacuumStep(pBt, nFin, iFree, 1);
3959     }
3960     if( (rc==SQLITE_DONE || rc==SQLITE_OK) && nFree>0 ){
3961       rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
3962       put4byte(&pBt->pPage1->aData[32], 0);
3963       put4byte(&pBt->pPage1->aData[36], 0);
3964       put4byte(&pBt->pPage1->aData[28], nFin);
3965       pBt->bDoTruncate = 1;
3966       pBt->nPage = nFin;
3967     }
3968     if( rc!=SQLITE_OK ){
3969       sqlite3PagerRollback(pPager);
3970     }
3971   }
3972 
3973   assert( nRef>=sqlite3PagerRefcount(pPager) );
3974   return rc;
3975 }
3976 
3977 #else /* ifndef SQLITE_OMIT_AUTOVACUUM */
/* When SQLITE_OMIT_AUTOVACUUM is defined the setChildPtrmaps() function
** is not compiled; uses of it expand to a constant SQLITE_OK instead. */
# define setChildPtrmaps(x) SQLITE_OK
3979 #endif
3980 
3981 /*
3982 ** This routine does the first phase of a two-phase commit.  This routine
3983 ** causes a rollback journal to be created (if it does not already exist)
3984 ** and populated with enough information so that if a power loss occurs
3985 ** the database can be restored to its original state by playing back
3986 ** the journal.  Then the contents of the journal are flushed out to
3987 ** the disk.  After the journal is safely on oxide, the changes to the
3988 ** database are written into the database file and flushed to oxide.
3989 ** At the end of this call, the rollback journal still exists on the
3990 ** disk and we are still holding all locks, so the transaction has not
3991 ** committed.  See sqlite3BtreeCommitPhaseTwo() for the second phase of the
3992 ** commit process.
3993 **
3994 ** This call is a no-op if no write-transaction is currently active on pBt.
3995 **
3996 ** Otherwise, sync the database file for the btree pBt. zSuperJrnl points to
3997 ** the name of a super-journal file that should be written into the
3998 ** individual journal file, or is NULL, indicating no super-journal file
3999 ** (single database transaction).
4000 **
4001 ** When this is called, the super-journal should already have been
4002 ** created, populated with this journal pointer and synced to disk.
4003 **
** Once this routine has returned, the only thing required to commit
** the write-transaction for this database file is to delete the journal.
4006 */
int sqlite3BtreeCommitPhaseOne(Btree *p, const char *zSuperJrnl){
  BtShared *pShared;       /* Shared b-tree state behind handle p */
  int rcCommit;            /* Result code handed back to the caller */

  /* Nothing to do unless this handle has a write-transaction open. */
  if( p->inTrans!=TRANS_WRITE ) return SQLITE_OK;

  pShared = p->pBt;
  sqlite3BtreeEnter(p);
#ifndef SQLITE_OMIT_AUTOVACUUM
  /* For an auto-vacuum database, run autoVacuumCommit() first.  If it
  ** fails, drop the mutex and report that error without going any
  ** further. */
  if( pShared->autoVacuum ){
    rcCommit = autoVacuumCommit(pShared);
    if( rcCommit!=SQLITE_OK ){
      sqlite3BtreeLeave(p);
      return rcCommit;
    }
  }
  /* If truncation has been flagged, pass the current page count on to
  ** the pager. */
  if( pShared->bDoTruncate ){
    sqlite3PagerTruncateImage(pShared->pPager, pShared->nPage);
  }
#endif
  rcCommit = sqlite3PagerCommitPhaseOne(pShared->pPager, zSuperJrnl, 0);
  sqlite3BtreeLeave(p);
  return rcCommit;
}
4029 
4030 /*
4031 ** This function is called from both BtreeCommitPhaseTwo() and BtreeRollback()
4032 ** at the conclusion of a transaction.
4033 */
static void btreeEndTransaction(Btree *p){
  BtShared *pShared = p->pBt;
  sqlite3 *pConn = p->db;
  assert( sqlite3BtreeHoldsMutex(p) );

#ifndef SQLITE_OMIT_AUTOVACUUM
  pShared->bDoTruncate = 0;
#endif
  if( p->inTrans<=TRANS_NONE || pConn->nVdbeRead<=1 ){
    /* No transaction was open on this handle, or there are no other
    ** active readers on the connection.  If a transaction was open,
    ** release its shared-cache table locks and drop the transaction
    ** count on the shared btree, resetting the shared state to
    ** TRANS_NONE once that count reaches zero. */
    if( p->inTrans!=TRANS_NONE ){
      clearAllSharedCacheTableLocks(p);
      pShared->nTransaction--;
      if( pShared->nTransaction==0 ){
        pShared->inTransaction = TRANS_NONE;
      }
    }

    /* The handle now has no transaction.  unlockBtreeIfUnused() is
    ** given the chance to unlock the pager if this was the last one. */
    p->inTrans = TRANS_NONE;
    unlockBtreeIfUnused(pShared);
  }else{
    /* Other statements on this database handle are still active and
    ** may be reading, so fall back to a read transaction rather than
    ** closing the transaction outright. */
    downgradeAllSharedCacheTableLocks(p);
    p->inTrans = TRANS_READ;
  }

  btreeIntegrity(p);
}
4069 
4070 /*
4071 ** Commit the transaction currently in progress.
4072 **
4073 ** This routine implements the second phase of a 2-phase commit.  The
4074 ** sqlite3BtreeCommitPhaseOne() routine does the first phase and should
4075 ** be invoked prior to calling this routine.  The sqlite3BtreeCommitPhaseOne()
4076 ** routine did all the work of writing information out to disk and flushing the
4077 ** contents so that they are written onto the disk platter.  All this
** routine has to do is delete or truncate or zero the header in
** the rollback journal (which causes the transaction to commit) and
4080 ** drop locks.
4081 **
4082 ** Normally, if an error occurs while the pager layer is attempting to
4083 ** finalize the underlying journal file, this function returns an error and
4084 ** the upper layer will attempt a rollback. However, if the second argument
4085 ** is non-zero then this b-tree transaction is part of a multi-file
4086 ** transaction. In this case, the transaction has already been committed
** (by deleting a super-journal file) and the caller will ignore this
** function's return code. So, even if an error occurs in the pager layer,
** reset the b-tree object's internal state to indicate that the write
4090 ** transaction has been closed. This is quite safe, as the pager will have
4091 ** transitioned to the error state.
4092 **
4093 ** This will release the write lock on the database file.  If there
4094 ** are no active cursors, it also releases the read lock.
4095 */
int sqlite3BtreeCommitPhaseTwo(Btree *p, int bCleanup){
  /* With no transaction open on this handle there is nothing to
  ** finish; fall straight through to the SQLITE_OK return. */
  if( p->inTrans!=TRANS_NONE ){
    sqlite3BtreeEnter(p);
    btreeIntegrity(p);

    /* For a write-transaction, have the pager complete its second
    ** commit phase and then move the shared state back to TRANS_READ. */
    if( p->inTrans==TRANS_WRITE ){
      BtShared *pShared = p->pBt;
      int rcPager;
      assert( pShared->inTransaction==TRANS_WRITE );
      assert( pShared->nTransaction>0 );
      rcPager = sqlite3PagerCommitPhaseTwo(pShared->pPager);
      if( bCleanup==0 && rcPager!=SQLITE_OK ){
        /* A pager error is only reported when bCleanup is zero.  In that
        ** case the b-tree transaction state is left untouched. */
        sqlite3BtreeLeave(p);
        return rcPager;
      }
      p->iDataVersion--;  /* Compensate for pPager->iDataVersion++; */
      pShared->inTransaction = TRANS_READ;
      btreeClearHasContent(pShared);
    }

    btreeEndTransaction(p);
    sqlite3BtreeLeave(p);
  }
  return SQLITE_OK;
}
4124 
4125 /*
4126 ** Do both phases of a commit.
4127 */
int sqlite3BtreeCommit(Btree *p){
  int rcResult;
  sqlite3BtreeEnter(p);
  /* Phase two is attempted only if phase one succeeded.  No
  ** super-journal name is supplied and bCleanup is zero. */
  rcResult = sqlite3BtreeCommitPhaseOne(p, 0);
  if( rcResult==SQLITE_OK ) rcResult = sqlite3BtreeCommitPhaseTwo(p, 0);
  sqlite3BtreeLeave(p);
  return rcResult;
}
4138 
4139 /*
4140 ** This routine sets the state to CURSOR_FAULT and the error
4141 ** code to errCode for every cursor on any BtShared that pBtree
4142 ** references.  Or if the writeOnly flag is set to 1, then only
4143 ** trip write cursors and leave read cursors unchanged.
4144 **
4145 ** Every cursor is a candidate to be tripped, including cursors
4146 ** that belong to other database connections that happen to be
4147 ** sharing the cache with pBtree.
4148 **
4149 ** This routine gets called when a rollback occurs. If the writeOnly
4150 ** flag is true, then only write-cursors need be tripped - read-only
4151 ** cursors save their current positions so that they may continue
4152 ** following the rollback. Or, if writeOnly is false, all cursors are
4153 ** tripped. In general, writeOnly is false if the transaction being
4154 ** rolled back modified the database schema. In this case b-tree root
4155 ** pages may be moved or deleted from the database altogether, making
4156 ** it unsafe for read cursors to continue.
4157 **
4158 ** If the writeOnly flag is true and an error is encountered while
4159 ** saving the current position of a read-only cursor, all cursors,
4160 ** including all read-cursors are tripped.
4161 **
4162 ** SQLITE_OK is returned if successful, or if an error occurs while
4163 ** saving a cursor position, an SQLite error code.
4164 */
int sqlite3BtreeTripAllCursors(Btree *pBtree, int errCode, int writeOnly){
  BtCursor *pCsr;          /* Cursor currently being visited */
  int rc = SQLITE_OK;      /* Result of saving read-cursor positions */

  assert( (writeOnly==0 || writeOnly==1) && BTCF_WriteFlag==1 );
  if( pBtree==0 ) return rc;

  sqlite3BtreeEnter(pBtree);
  pCsr = pBtree->pBt->pCursor;
  while( pCsr ){
    /* A cursor is spared only when writeOnly is set and the cursor
    ** does not carry the write flag. */
    int bSpare = writeOnly && (pCsr->curFlags & BTCF_WriteFlag)==0;
    if( !bSpare ){
      sqlite3BtreeClearCursor(pCsr);
      pCsr->eState = CURSOR_FAULT;
      pCsr->skipNext = errCode;
    }else if( pCsr->eState==CURSOR_VALID || pCsr->eState==CURSOR_SKIPNEXT ){
      rc = saveCursorPosition(pCsr);
      if( rc!=SQLITE_OK ){
        /* Could not save the position: trip every cursor, read
        ** cursors included, using the failure code, then stop. */
        (void)sqlite3BtreeTripAllCursors(pBtree, rc, 0);
        break;
      }
    }
    btreeReleaseAllCursorPages(pCsr);
    pCsr = pCsr->pNext;
  }
  sqlite3BtreeLeave(pBtree);
  return rc;
}
4192 
4193 /*
4194 ** Set the pBt->nPage field correctly, according to the current
4195 ** state of the database.  Assume pBt->pPage1 is valid.
4196 */
static void btreeSetNPage(BtShared *pBt, MemPage *pPage1){
  int nDbPage;             /* Page count to store in pBt->nPage */

  /* Start with the 4-byte value at offset 28 of page 1.  If that value
  ** is zero, ask the pager for the page count instead. */
  nDbPage = get4byte(&pPage1->aData[28]);
  testcase( nDbPage==0 );
  if( nDbPage==0 ){
    sqlite3PagerPagecount(pBt->pPager, &nDbPage);
  }
  testcase( pBt->nPage!=nDbPage );
  pBt->nPage = nDbPage;
}
4204 
4205 /*
4206 ** Rollback the transaction in progress.
4207 **
4208 ** If tripCode is not SQLITE_OK then cursors will be invalidated (tripped).
4209 ** Only write cursors are tripped if writeOnly is true but all cursors are
4210 ** tripped if writeOnly is false.  Any attempt to use
4211 ** a tripped cursor will result in an error.
4212 **
4213 ** This will release the write lock on the database file.  If there
4214 ** are no active cursors, it also releases the read lock.
4215 */
int sqlite3BtreeRollback(Btree *p, int tripCode, int writeOnly){
  int rc = SQLITE_OK;            /* Value returned to the caller */
  BtShared *pShared = p->pBt;    /* Shared b-tree state */
  MemPage *pFirst;               /* Page 1, re-fetched after the rollback */

  assert( writeOnly==1 || writeOnly==0 );
  assert( tripCode==SQLITE_ABORT_ROLLBACK || tripCode==SQLITE_OK );
  sqlite3BtreeEnter(p);

  /* When no trip code was supplied, try to save every cursor.  A failure
  ** there becomes the trip code and forces all cursors, not just write
  ** cursors, to be tripped below. */
  if( tripCode==SQLITE_OK ){
    rc = saveAllCursors(pShared, 0, 0);
    tripCode = rc;
    if( rc!=0 ) writeOnly = 0;
  }
  if( tripCode!=0 ){
    int rcTrip = sqlite3BtreeTripAllCursors(p, tripCode, writeOnly);
    assert( rc==SQLITE_OK || (writeOnly==0 && rcTrip==SQLITE_OK) );
    if( rcTrip!=SQLITE_OK ) rc = rcTrip;
  }
  btreeIntegrity(p);

  if( p->inTrans==TRANS_WRITE ){
    int rcPager;

    assert( TRANS_WRITE==pShared->inTransaction );
    rcPager = sqlite3PagerRollback(pShared->pPager);
    if( rcPager!=SQLITE_OK ) rc = rcPager;

    /* The rollback may have destroyed the pPage1->aData value, so page 1
    ** is fetched again with btreeGetPage() and pBt->nPage is recomputed
    ** from it.  If the fetch fails, nPage is left as it was. */
    if( btreeGetPage(pShared, 1, &pFirst, 0)==SQLITE_OK ){
      btreeSetNPage(pShared, pFirst);
      releasePageOne(pFirst);
    }
    assert( countValidCursors(pShared, 1)==0 );
    pShared->inTransaction = TRANS_READ;
    btreeClearHasContent(pShared);
  }

  btreeEndTransaction(p);
  sqlite3BtreeLeave(p);
  return rc;
}
4262 
4263 /*
4264 ** Start a statement subtransaction. The subtransaction can be rolled
4265 ** back independently of the main transaction. You must start a transaction
4266 ** before starting a subtransaction. The subtransaction is ended automatically
4267 ** if the main transaction commits or rolls back.
4268 **
4269 ** Statement subtransactions are used around individual SQL statements
4270 ** that are contained within a BEGIN...COMMIT block.  If a constraint
4271 ** error occurs within the statement, the effect of that one statement
4272 ** can be rolled back without having to rollback the entire transaction.
4273 **
4274 ** A statement sub-transaction is implemented as an anonymous savepoint. The
4275 ** value passed as the second parameter is the total number of savepoints,
4276 ** including the new anonymous savepoint, open on the B-Tree. i.e. if there
4277 ** are no active savepoints and no other statement-transactions open,
4278 ** iStatement is 1. This anonymous savepoint can be released or rolled back
4279 ** using the sqlite3BtreeSavepoint() function.
4280 */
int sqlite3BtreeBeginStmt(Btree *p, int iStatement){
  BtShared *pShared = p->pBt;
  int rcOpen;

  sqlite3BtreeEnter(p);
  assert( p->inTrans==TRANS_WRITE );
  assert( (pShared->btsFlags & BTS_READ_ONLY)==0 );
  assert( iStatement>0 );
  assert( iStatement>p->db->nSavepoint );
  assert( pShared->inTransaction==TRANS_WRITE );

  /* To the pager, a statement transaction is just a savepoint whose
  ** index exceeds that of every savepoint created explicitly by SQL.
  ** While the statement savepoint is active, none of those explicit
  ** savepoints may be opened, released or rolled back. */
  rcOpen = sqlite3PagerOpenSavepoint(pShared->pPager, iStatement);
  sqlite3BtreeLeave(p);
  return rcOpen;
}
4299 
4300 /*
4301 ** The second argument to this function, op, is always SAVEPOINT_ROLLBACK
4302 ** or SAVEPOINT_RELEASE. This function either releases or rolls back the
4303 ** savepoint identified by parameter iSavepoint, depending on the value
4304 ** of op.
4305 **
4306 ** Normally, iSavepoint is greater than or equal to zero. However, if op is
4307 ** SAVEPOINT_ROLLBACK, then iSavepoint may also be -1. In this case the
4308 ** contents of the entire transaction are rolled back. This is different
4309 ** from a normal transaction rollback, as no locks are released and the
4310 ** transaction remains open.
4311 */
int sqlite3BtreeSavepoint(Btree *p, int op, int iSavepoint){
  BtShared *pShared;
  int rc = SQLITE_OK;

  /* A NULL handle, or one without a write-transaction, is a no-op. */
  if( p==0 || p->inTrans!=TRANS_WRITE ) return SQLITE_OK;

  pShared = p->pBt;
  assert( op==SAVEPOINT_RELEASE || op==SAVEPOINT_ROLLBACK );
  assert( iSavepoint>=0 || (iSavepoint==-1 && op==SAVEPOINT_ROLLBACK) );
  sqlite3BtreeEnter(p);

  /* Cursors are saved before a rollback, but not before a release. */
  if( op==SAVEPOINT_ROLLBACK ){
    rc = saveAllCursors(pShared, 0, 0);
  }
  if( rc==SQLITE_OK ){
    rc = sqlite3PagerSavepoint(pShared->pPager, op, iSavepoint);
    if( rc==SQLITE_OK ){
      int bInitEmpty = (pShared->btsFlags & BTS_INITIALLY_EMPTY)!=0;
      if( iSavepoint<0 && bInitEmpty ){
        pShared->nPage = 0;
      }
      rc = newDatabase(pShared);
      btreeSetNPage(pShared, pShared->pPage1);

      /* nPage can only be zero here if the database was already corrupt
      ** when the transaction began; otherwise it is at least 1. */
      assert( CORRUPT_DB || pShared->nPage>0 );
    }
  }
  sqlite3BtreeLeave(p);
  return rc;
}
4340 
4341 /*
4342 ** Create a new cursor for the BTree whose root is on the page
4343 ** iTable. If a read-only cursor is requested, it is assumed that
4344 ** the caller already has at least a read-only transaction open
4345 ** on the database already. If a write-cursor is requested, then
4346 ** the caller is assumed to have an open write transaction.
4347 **
4348 ** If the BTREE_WRCSR bit of wrFlag is clear, then the cursor can only
4349 ** be used for reading.  If the BTREE_WRCSR bit is set, then the cursor
4350 ** can be used for reading or for writing if other conditions for writing
4351 ** are also met.  These are the conditions that must be met in order
4352 ** for writing to be allowed:
4353 **
4354 ** 1:  The cursor must have been opened with wrFlag containing BTREE_WRCSR
4355 **
4356 ** 2:  Other database connections that share the same pager cache
4357 **     but which are not in the READ_UNCOMMITTED state may not have
4358 **     cursors open with wrFlag==0 on the same table.  Otherwise
4359 **     the changes made by this write cursor would be visible to
4360 **     the read cursors in the other database connection.
4361 **
4362 ** 3:  The database must be writable (not on read-only media)
4363 **
4364 ** 4:  There must be an active transaction.
4365 **
4366 ** The BTREE_FORDELETE bit of wrFlag may optionally be set if BTREE_WRCSR
4367 ** is set.  If FORDELETE is set, that is a hint to the implementation that
4368 ** this cursor will only be used to seek to and delete entries of an index
4369 ** as part of a larger DELETE statement.  The FORDELETE hint is not used by
4370 ** this implementation.  But in a hypothetical alternative storage engine
4371 ** in which index entries are automatically deleted when corresponding table
4372 ** rows are deleted, the FORDELETE flag is a hint that all SEEK and DELETE
4373 ** operations on this cursor can be no-ops and all READ operations can
4374 ** return a null row (2-bytes: 0x01 0x00).
4375 **
4376 ** No checking is done to make sure that page iTable really is the
4377 ** root page of a b-tree.  If it is not, then the cursor acquired
4378 ** will not work correctly.
4379 **
4380 ** It is assumed that the sqlite3BtreeCursorZero() has been called
4381 ** on pCur to initialize the memory space prior to invoking this routine.
4382 */
static int btreeCursor(
  Btree *p,                              /* The btree */
  Pgno iTable,                           /* Root page of table to open */
  int wrFlag,                            /* 0 for read-only, else BTREE_WRCSR */
  struct KeyInfo *pKeyInfo,              /* First arg to comparison function */
  BtCursor *pCur                         /* Space for new cursor */
){
  BtShared *pBt = p->pBt;                /* Shared b-tree handle */
  BtCursor *pOther;                      /* Iterator over all other cursors */

  assert( sqlite3BtreeHoldsMutex(p) );
  assert( wrFlag==0
       || wrFlag==BTREE_WRCSR
       || wrFlag==(BTREE_WRCSR|BTREE_FORDELETE)
  );

  /* For a sharable b-tree database, check that this connection holds the
  ** table locks it needs and that no other connection has an open cursor
  ** that conflicts with them.  The iTable<1 term switches the check off
  ** for corrupt schemas. */
  assert( hasSharedCacheTableLock(p, iTable, pKeyInfo!=0, (wrFlag?2:1))
          || iTable<1 );
  assert( wrFlag==0 || !hasReadConflicts(p, iTable) );

  /* The caller must already have the right kind of transaction open. */
  assert( p->inTrans>TRANS_NONE );
  assert( wrFlag==0 || p->inTrans==TRANS_WRITE );
  assert( pBt->pPage1 && pBt->pPage1->aData );
  assert( wrFlag==0 || (pBt->btsFlags & BTS_READ_ONLY)==0 );

  /* Write cursors need pBt->pTmpSpace to be available. */
  if( wrFlag ){
    allocateTempSpace(pBt);
    if( pBt->pTmpSpace==0 ) return SQLITE_NOMEM_BKPT;
  }

  /* A root page number below 1 is reported as corruption.  Root page 1
  ** on a database with a page count of zero is recorded as root 0. */
  if( iTable<1 ){
    return SQLITE_CORRUPT_BKPT;
  }
  if( iTable==1 && btreePagecount(pBt)==0 ){
    assert( wrFlag==0 );
    iTable = 0;
  }

  /* No further errors are possible.  Fill in the cursor fields. */
  pCur->pBtree = p;
  pCur->pBt = pBt;
  pCur->pKeyInfo = pKeyInfo;
  pCur->pgnoRoot = iTable;
  pCur->iPage = -1;
  pCur->eState = CURSOR_INVALID;
  if( wrFlag ){
    pCur->curFlags = BTCF_WriteFlag;
    pCur->curPagerFlags = 0;
  }else{
    pCur->curFlags = 0;
    pCur->curPagerFlags = PAGER_GET_READONLY;
  }

  /* Whenever two or more cursors share a root page, every one of them
  ** must carry BTCF_Multiple.  Mark both the new cursor and each
  ** existing cursor on the same root. */
  pOther = pBt->pCursor;
  while( pOther ){
    if( pOther->pgnoRoot==iTable ){
      pOther->curFlags |= BTCF_Multiple;
      pCur->curFlags |= BTCF_Multiple;
    }
    pOther = pOther->pNext;
  }

  /* Push the new cursor onto the front of the BtShared cursor list. */
  pCur->pNext = pBt->pCursor;
  pBt->pCursor = pCur;
  return SQLITE_OK;
}
/*
** Wrapper around btreeCursor() that enters the b-tree mutex before the
** call and leaves it afterwards.  Arguments and the return value are
** passed through unchanged.
*/
static int btreeCursorWithLock(
  Btree *p,                              /* The btree */
  Pgno iTable,                           /* Root page of table to open */
  int wrFlag,                            /* 1 to write. 0 read-only */
  struct KeyInfo *pKeyInfo,              /* First arg to comparison function */
  BtCursor *pCur                         /* Space for new cursor */
){
  int rcCursor;
  sqlite3BtreeEnter(p);
  rcCursor = btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur);
  sqlite3BtreeLeave(p);
  return rcCursor;
}
/*
** Open a cursor on the b-tree rooted at iTable.  A sharable b-tree goes
** through btreeCursorWithLock(); otherwise btreeCursor() is called
** directly.
*/
int sqlite3BtreeCursor(
  Btree *p,                                   /* The btree */
  Pgno iTable,                                /* Root page of table to open */
  int wrFlag,                                 /* 1 to write. 0 read-only */
  struct KeyInfo *pKeyInfo,                   /* First arg to xCompare() */
  BtCursor *pCur                              /* Write new cursor here */
){
  if( !p->sharable ){
    return btreeCursor(p, iTable, wrFlag, pKeyInfo, pCur);
  }
  return btreeCursorWithLock(p, iTable, wrFlag, pKeyInfo, pCur);
}
4474 
4475 /*
4476 ** Return the size of a BtCursor object in bytes.
4477 **
** This interface is needed so that users of cursors can preallocate
4479 ** sufficient storage to hold a cursor.  The BtCursor object is opaque
4480 ** to users so they cannot do the sizeof() themselves - they must call
4481 ** this routine.
4482 */
sqlite3BtreeCursorSize(void)4483 int sqlite3BtreeCursorSize(void){
4484   return ROUND8(sizeof(BtCursor));
4485 }
4486 
4487 /*
4488 ** Initialize memory that will be converted into a BtCursor object.
4489 **
4490 ** The simple approach here would be to memset() the entire object
4491 ** to zero.  But it turns out that the apPage[] and aiIdx[] arrays
4492 ** do not need to be zeroed and they are large, so we can save a lot
4493 ** of run-time by skipping the initialization of those elements.
4494 */
void sqlite3BtreeCursorZero(BtCursor *p){
  /* Only the bytes ahead of the BTCURSOR_FIRST_UNINIT member are cleared;
  ** everything from that member onward is left untouched. */
  const size_t nZero = offsetof(BtCursor, BTCURSOR_FIRST_UNINIT);
  memset(p, 0, nZero);
}
4498 
4499 /*
4500 ** Close a cursor.  The read lock on the database file is released
4501 ** when the last cursor is closed.
4502 */
int sqlite3BtreeCloseCursor(BtCursor *pCur){
  Btree *pOwner = pCur->pBtree;
  BtShared *pShared;

  /* A cursor whose pBtree is zero has nothing to unlink or free. */
  if( pOwner==0 ) return SQLITE_OK;

  pShared = pCur->pBt;
  sqlite3BtreeEnter(pOwner);
  assert( pShared->pCursor!=0 );

  /* Unlink pCur from the singly-linked cursor list of the BtShared. */
  if( pShared->pCursor!=pCur ){
    BtCursor *pScan = pShared->pCursor;
    do{
      if( pScan->pNext==pCur ){
        pScan->pNext = pCur->pNext;
        break;
      }
      pScan = pScan->pNext;
    }while( ALWAYS(pScan) );
  }else{
    pShared->pCursor = pCur->pNext;
  }

  /* Release the pages the cursor holds and free its allocations. */
  btreeReleaseAllCursorPages(pCur);
  unlockBtreeIfUnused(pShared);
  sqlite3_free(pCur->aOverflow);
  sqlite3_free(pCur->pKey);
  sqlite3BtreeLeave(pOwner);

  /* Zeroing pBtree makes a repeated close of this cursor a no-op. */
  pCur->pBtree = 0;
  return SQLITE_OK;
}
4530 
4531 /*
4532 ** Make sure the BtCursor* given in the argument has a valid
4533 ** BtCursor.info structure.  If it is not already valid, call
4534 ** btreeParseCell() to fill it in.
4535 **
4536 ** BtCursor.info is a cache of the information in the current cell.
4537 ** Using this cache reduces the number of calls to btreeParseCell().
4538 */
4539 #ifndef NDEBUG
cellInfoEqual(CellInfo * a,CellInfo * b)4540   static int cellInfoEqual(CellInfo *a, CellInfo *b){
4541     if( a->nKey!=b->nKey ) return 0;
4542     if( a->pPayload!=b->pPayload ) return 0;
4543     if( a->nPayload!=b->nPayload ) return 0;
4544     if( a->nLocal!=b->nLocal ) return 0;
4545     if( a->nSize!=b->nSize ) return 0;
4546     return 1;
4547   }
assertCellInfo(BtCursor * pCur)4548   static void assertCellInfo(BtCursor *pCur){
4549     CellInfo info;
4550     memset(&info, 0, sizeof(info));
4551     btreeParseCell(pCur->pPage, pCur->ix, &info);
4552     assert( CORRUPT_DB || cellInfoEqual(&info, &pCur->info) );
4553   }
4554 #else
4555   #define assertCellInfo(x)
4556 #endif
getCellInfo(BtCursor * pCur)4557 static SQLITE_NOINLINE void getCellInfo(BtCursor *pCur){
4558   if( pCur->info.nSize==0 ){
4559     pCur->curFlags |= BTCF_ValidNKey;
4560     btreeParseCell(pCur->pPage,pCur->ix,&pCur->info);
4561   }else{
4562     assertCellInfo(pCur);
4563   }
4564 }
4565 
4566 #ifndef NDEBUG  /* The next routine used only within assert() statements */
4567 /*
4568 ** Return true if the given BtCursor is valid.  A valid cursor is one
4569 ** that is currently pointing to a row in a (non-empty) table.
4570 ** This is a verification routine is used only within assert() statements.
4571 */
sqlite3BtreeCursorIsValid(BtCursor * pCur)4572 int sqlite3BtreeCursorIsValid(BtCursor *pCur){
4573   return pCur && pCur->eState==CURSOR_VALID;
4574 }
4575 #endif /* NDEBUG */
/*
** Same test as sqlite3BtreeCursorIsValid(), for callers that guarantee
** pCur is not NULL.
*/
int sqlite3BtreeCursorIsValidNN(BtCursor *pCur){
  int bValid;
  assert( pCur!=0 );
  bValid = (pCur->eState==CURSOR_VALID);
  return bValid;
}
4580 
4581 /*
4582 ** Return the value of the integer key or "rowid" for a table btree.
** This routine is only valid for a cursor that is pointing into an
** ordinary table btree.  If the cursor points to an index btree or
4585 ** is invalid, the result of this routine is undefined.
4586 */
sqlite3BtreeIntegerKey(BtCursor * pCur)4587 i64 sqlite3BtreeIntegerKey(BtCursor *pCur){
4588   assert( cursorHoldsMutex(pCur) );
4589   assert( pCur->eState==CURSOR_VALID );
4590   assert( pCur->curIntKey );
4591   getCellInfo(pCur);
4592   return pCur->info.nKey;
4593 }
4594 
4595 /*
4596 ** Pin or unpin a cursor.
4597 */
void sqlite3BtreeCursorPin(BtCursor *pCur){
  /* Set BTCF_Pinned.  The cursor must not already be pinned. */
  assert( (pCur->curFlags & BTCF_Pinned)==0 );
  pCur->curFlags = pCur->curFlags | BTCF_Pinned;
}
void sqlite3BtreeCursorUnpin(BtCursor *pCur){
  /* Clear BTCF_Pinned.  The cursor must currently be pinned. */
  assert( (pCur->curFlags & BTCF_Pinned)!=0 );
  pCur->curFlags = pCur->curFlags & ~BTCF_Pinned;
}
4606 
4607 #ifdef SQLITE_ENABLE_OFFSET_SQL_FUNC
4608 /*
4609 ** Return the offset into the database file for the start of the
4610 ** payload to which the cursor is pointing.
4611 */
sqlite3BtreeOffset(BtCursor * pCur)4612 i64 sqlite3BtreeOffset(BtCursor *pCur){
4613   assert( cursorHoldsMutex(pCur) );
4614   assert( pCur->eState==CURSOR_VALID );
4615   getCellInfo(pCur);
4616   return (i64)pCur->pBt->pageSize*((i64)pCur->pPage->pgno - 1) +
4617          (i64)(pCur->info.pPayload - pCur->pPage->aData);
4618 }
4619 #endif /* SQLITE_ENABLE_OFFSET_SQL_FUNC */
4620 
4621 /*
4622 ** Return the number of bytes of payload for the entry that pCur is
4623 ** currently pointing to.  For table btrees, this will be the amount
4624 ** of data.  For index btrees, this will be the size of the key.
4625 **
4626 ** The caller must guarantee that the cursor is pointing to a non-NULL
4627 ** valid entry.  In other words, the calling procedure must guarantee
4628 ** that the cursor has Cursor.eState==CURSOR_VALID.
4629 */
sqlite3BtreePayloadSize(BtCursor * pCur)4630 u32 sqlite3BtreePayloadSize(BtCursor *pCur){
4631   assert( cursorHoldsMutex(pCur) );
4632   assert( pCur->eState==CURSOR_VALID );
4633   getCellInfo(pCur);
4634   return pCur->info.nPayload;
4635 }
4636 
4637 /*
4638 ** Return an upper bound on the size of any record for the table
4639 ** that the cursor is pointing into.
4640 **
4641 ** This is an optimization.  Everything will still work if this
4642 ** routine always returns 2147483647 (which is the largest record
4643 ** that SQLite can handle) or more.  But returning a smaller value might
4644 ** prevent large memory allocations when trying to interpret a
** corrupt database.
4646 **
4647 ** The current implementation merely returns the size of the underlying
4648 ** database file.
4649 */
sqlite3BtreeMaxRecordSize(BtCursor * pCur)4650 sqlite3_int64 sqlite3BtreeMaxRecordSize(BtCursor *pCur){
4651   assert( cursorHoldsMutex(pCur) );
4652   assert( pCur->eState==CURSOR_VALID );
4653   return pCur->pBt->pageSize * (sqlite3_int64)pCur->pBt->nPage;
4654 }
4655 
4656 /*
4657 ** Given the page number of an overflow page in the database (parameter
4658 ** ovfl), this function finds the page number of the next page in the
4659 ** linked list of overflow pages. If possible, it uses the auto-vacuum
4660 ** pointer-map data instead of reading the content of page ovfl to do so.
4661 **
4662 ** If an error occurs an SQLite error code is returned. Otherwise:
4663 **
4664 ** The page number of the next overflow page in the linked list is
4665 ** written to *pPgnoNext. If page ovfl is the last page in its linked
4666 ** list, *pPgnoNext is set to zero.
4667 **
4668 ** If ppPage is not NULL, and a reference to the MemPage object corresponding
** to page number ovfl was obtained, then *ppPage is set to point to that
** reference. It is the responsibility of the caller to call releasePage()
** on *ppPage to free the reference. If no reference was obtained (because
4672 ** the pointer-map was used to obtain the value for *pPgnoNext), then
4673 ** *ppPage is set to zero.
4674 */
static int getOverflowPage(
  BtShared *pBt,               /* The database file */
  Pgno ovfl,                   /* Current overflow page number */
  MemPage **ppPage,            /* OUT: MemPage handle (may be NULL) */
  Pgno *pPgnoNext              /* OUT: Next overflow page number */
){
  Pgno iFollow = 0;            /* Page number to report through *pPgnoNext */
  MemPage *pLoaded = 0;        /* Page ovfl, if it had to be fetched */
  int rc = SQLITE_OK;          /* SQLITE_DONE once the pointer-map answers */

  assert( sqlite3_mutex_held(pBt->mutex) );
  assert(pPgnoNext);

#ifndef SQLITE_OMIT_AUTOVACUUM
  /* In auto-vacuum mode, first guess that the successor is the page right
  ** after ovfl, stepping over pointer-map pages and the pending-byte page.
  ** If the pointer-map entry for the guess says it is an overflow page
  ** whose parent is ovfl, the guess is right and page ovfl need not be
  ** read.  Otherwise fall through and read page ovfl. */
  if( pBt->autoVacuum ){
    Pgno iCand;                /* Candidate successor page */
    Pgno iParent;              /* Parent page recorded in the pointer-map */
    u8 ePtrType;               /* Pointer-map entry type for iCand */

    for(iCand=ovfl+1;
        PTRMAP_ISPAGE(pBt, iCand) || iCand==PENDING_BYTE_PAGE(pBt);
        iCand++
    ){}

    if( iCand<=btreePagecount(pBt) ){
      rc = ptrmapGet(pBt, iCand, &ePtrType, &iParent);
      if( rc==SQLITE_OK && ePtrType==PTRMAP_OVERFLOW2 && iParent==ovfl ){
        iFollow = iCand;
        rc = SQLITE_DONE;
      }
    }
  }
#endif

  assert( iFollow==0 || rc==SQLITE_DONE );
  if( rc==SQLITE_OK ){
    /* The page is fetched read-only when the caller does not want the
    ** page reference back. */
    int getFlags = ppPage ? 0 : PAGER_GET_READONLY;
    rc = btreeGetPage(pBt, ovfl, &pLoaded, getFlags);
    assert( rc==SQLITE_OK || pLoaded==0 );
    if( rc==SQLITE_OK ){
      iFollow = get4byte(pLoaded->aData);
    }
  }

  *pPgnoNext = iFollow;
  if( ppPage==0 ){
    releasePage(pLoaded);
  }else{
    *ppPage = pLoaded;
  }

  /* SQLITE_DONE is only an internal marker; callers see SQLITE_OK. */
  if( rc==SQLITE_DONE ) rc = SQLITE_OK;
  return rc;
}
4731 
4732 /*
4733 ** Copy data from a buffer to a page, or from a page to a buffer.
4734 **
4735 ** pPayload is a pointer to data stored on database page pDbPage.
4736 ** If argument eOp is false, then nByte bytes of data are copied
4737 ** from pPayload to the buffer pointed at by pBuf. If eOp is true,
4738 ** then sqlite3PagerWrite() is called on pDbPage and nByte bytes
4739 ** of data are copied from the buffer pBuf to pPayload.
4740 **
4741 ** SQLITE_OK is returned on success, otherwise an error code.
4742 */
static int copyPayload(
  void *pPayload,           /* Pointer to page data */
  void *pBuf,               /* Pointer to buffer */
  int nByte,                /* Number of bytes to copy */
  int eOp,                  /* 0 -> copy from page, 1 -> copy to page */
  DbPage *pDbPage           /* Page containing pPayload */
){
  int rc = SQLITE_OK;
  if( eOp==0 ){
    /* Read: page content goes into the caller's buffer. */
    memcpy(pBuf, pPayload, nByte);
  }else{
    /* Write: sqlite3PagerWrite() must succeed on the page before any
    ** bytes are copied into it.  On failure nothing is copied. */
    rc = sqlite3PagerWrite(pDbPage);
    if( rc==SQLITE_OK ){
      memcpy(pPayload, pBuf, nByte);
    }
  }
  return rc;
}
4763 
/*
** This function is used to read or overwrite payload information
** for the entry that the pCur cursor is pointing to. The eOp
** argument is interpreted as follows:
**
**   0: The operation is a read. Populate the overflow cache.
**   1: The operation is a write. Populate the overflow cache.
**
** A total of "amt" bytes are read or written beginning at "offset".
** Data is read to or from the buffer pBuf.
**
** The content being read or written might appear on the main page
** or be scattered out on multiple overflow pages.
**
** If the current cursor entry uses one or more overflow pages
** this function may allocate space for and lazily populate
** the overflow page-list cache array (BtCursor.aOverflow).
** Subsequent calls use this cache to make seeking to the supplied offset
** more efficient.
**
** Once an overflow page-list cache has been allocated, it must be
** invalidated if some other cursor writes to the same table, or if
** the cursor is moved to a different row. Additionally, in auto-vacuum
** mode, the following events may invalidate an overflow page-list cache.
**
**   * An incremental vacuum,
**   * A commit in auto_vacuum="full" mode,
**   * Creating a table (may require moving an overflow page).
**
** Return values, as visible in the body of this routine:
**
**   * SQLITE_OK if all "amt" bytes were transferred.
**   * SQLITE_CORRUPT_PAGE(pPage) if the local payload would extend past
**     the usable area of the page, or if the overflow chain ends before
**     "amt" bytes have been transferred.
**   * SQLITE_CORRUPT_BKPT if an overflow page number exceeds pBt->nPage.
**   * SQLITE_NOMEM_BKPT if the overflow cache cannot be (re)allocated.
**   * Otherwise, whatever error copyPayload(), getOverflowPage(),
**     sqlite3PagerGet() or sqlite3OsRead() reported.
*/
static int accessPayload(
  BtCursor *pCur,      /* Cursor pointing to entry to read or write */
  u32 offset,          /* Begin the transfer this far into payload */
  u32 amt,             /* Transfer this many bytes */
  unsigned char *pBuf, /* Destination (read) or source (write) buffer */
  int eOp              /* zero to read. non-zero to write. */
){
  unsigned char *aPayload;                    /* Payload on the page in hand */
  int rc = SQLITE_OK;
  int iIdx = 0;                               /* Index into pCur->aOverflow[] */
  MemPage *pPage = pCur->pPage;               /* Btree page of current entry */
  BtShared *pBt = pCur->pBt;                  /* Btree this cursor belongs to */
#ifdef SQLITE_DIRECT_OVERFLOW_READ
  unsigned char * const pBufStart = pBuf;     /* Start of original out buffer */
#endif

  assert( pPage );
  assert( eOp==0 || eOp==1 );
  assert( pCur->eState==CURSOR_VALID );
  assert( pCur->ix<pPage->nCell );
  assert( cursorHoldsMutex(pCur) );

  getCellInfo(pCur);
  aPayload = pCur->info.pPayload;
  assert( offset+amt <= pCur->info.nPayload );

  assert( aPayload > pPage->aData );
  if( (uptr)(aPayload - pPage->aData) > (pBt->usableSize - pCur->info.nLocal) ){
    /* Trying to read or write past the end of the data is an error.  The
    ** conditional above is really:
    **    &aPayload[pCur->info.nLocal] > &pPage->aData[pBt->usableSize]
    ** but is recast into its current form to avoid integer overflow problems
    */
    return SQLITE_CORRUPT_PAGE(pPage);
  }

  /* Check if data must be read/written to/from the btree page itself. */
  if( offset<pCur->info.nLocal ){
    /* Transfer the part of the request that lies in the first nLocal
    ** bytes of the payload.  "a" is the request size clipped to that
    ** local region. */
    int a = amt;
    if( a+offset>pCur->info.nLocal ){
      a = pCur->info.nLocal - offset;
    }
    rc = copyPayload(&aPayload[offset], pBuf, a, eOp, pPage->pDbPage);
    offset = 0;    /* Anything left begins at the start of the overflow data */
    pBuf += a;
    amt -= a;
  }else{
    /* Request starts beyond the local content.  From here on "offset" is
    ** measured from the start of the overflow data. */
    offset -= pCur->info.nLocal;
  }


  if( rc==SQLITE_OK && amt>0 ){
    /* The first 4 bytes of each overflow page are read below as the page
    ** number of the next page in the chain, hence the "- 4". */
    const u32 ovflSize = pBt->usableSize - 4;  /* Bytes content per ovfl page */
    Pgno nextPage;

    /* The page number of the first overflow page is read from the 4 bytes
    ** that follow the local payload. */
    nextPage = get4byte(&aPayload[pCur->info.nLocal]);

    /* If the BtCursor.aOverflow[] has not been allocated, allocate it now.
    **
    ** The aOverflow[] array is sized at one entry for each overflow page
    ** in the overflow chain. The page number of the first overflow page is
    ** stored in aOverflow[0], etc. A value of 0 in the aOverflow[] array
    ** means "not yet known" (the cache is lazily populated).
    */
    if( (pCur->curFlags & BTCF_ValidOvfl)==0 ){
      int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize;
      if( pCur->aOverflow==0
       || nOvfl*(int)sizeof(Pgno) > sqlite3MallocSize(pCur->aOverflow)
      ){
        /* Space for twice the required number of entries is requested,
        ** although only the first nOvfl entries are cleared below. */
        Pgno *aNew = (Pgno*)sqlite3Realloc(
            pCur->aOverflow, nOvfl*2*sizeof(Pgno)
        );
        if( aNew==0 ){
          return SQLITE_NOMEM_BKPT;
        }else{
          pCur->aOverflow = aNew;
        }
      }
      memset(pCur->aOverflow, 0, nOvfl*sizeof(Pgno));
      pCur->curFlags |= BTCF_ValidOvfl;
    }else{
      /* If the overflow page-list cache has been allocated and the
      ** entry for the first required overflow page is valid, skip
      ** directly to it.
      */
      if( pCur->aOverflow[offset/ovflSize] ){
        iIdx = (offset/ovflSize);
        nextPage = pCur->aOverflow[iIdx];
        offset = (offset%ovflSize);
      }
    }

    /* Walk the overflow chain.  On each iteration nextPage is the page
    ** about to be visited and iIdx is its position in the chain. */
    assert( rc==SQLITE_OK && amt>0 );
    while( nextPage ){
      /* If required, populate the overflow page-list cache. */
      if( nextPage > pBt->nPage ) return SQLITE_CORRUPT_BKPT;
      assert( pCur->aOverflow[iIdx]==0
              || pCur->aOverflow[iIdx]==nextPage
              || CORRUPT_DB );
      pCur->aOverflow[iIdx] = nextPage;

      if( offset>=ovflSize ){
        /* The only reason to read this page is to obtain the page
        ** number for the next page in the overflow chain. The page
        ** data is not required. So first try to lookup the overflow
        ** page-list cache, if any, then fall back to the getOverflowPage()
        ** function.
        */
        assert( pCur->curFlags & BTCF_ValidOvfl );
        assert( pCur->pBtree->db==pBt->db );
        if( pCur->aOverflow[iIdx+1] ){
          nextPage = pCur->aOverflow[iIdx+1];
        }else{
          rc = getOverflowPage(pBt, nextPage, 0, &nextPage);
        }
        offset -= ovflSize;
      }else{
        /* Need to read this page properly. It contains some of the
        ** range of data that is being read (eOp==0) or written (eOp!=0).
        */
        int a = amt;   /* Bytes to transfer on this page */
        if( a + offset > ovflSize ){
          a = ovflSize - offset;
        }

#ifdef SQLITE_DIRECT_OVERFLOW_READ
        /* If all the following are true:
        **
        **   1) this is a read operation, and
        **   2) data is required from the start of this overflow page, and
        **   3) there are no dirty pages in the page-cache
        **   4) the database is file-backed, and
        **   5) the page is not in the WAL file
        **   6) at least 4 bytes have already been read into the output buffer
        **
        ** then data can be read directly from the database file into the
        ** output buffer, bypassing the page-cache altogether. This speeds
        ** up loading large records that span many overflow pages.
        */
        if( eOp==0                                             /* (1) */
         && offset==0                                          /* (2) */
         && sqlite3PagerDirectReadOk(pBt->pPager, nextPage)    /* (3,4,5) */
         && &pBuf[-4]>=pBufStart                               /* (6) */
        ){
          /* The read below covers the 4-byte next-page pointer plus "a"
          ** bytes of content, and it is aimed 4 bytes before pBuf so that
          ** the content lands exactly at pBuf.  The 4 output bytes that
          ** get overwritten by the pointer are saved in aSave[] and put
          ** back once the pointer has been decoded. */
          sqlite3_file *fd = sqlite3PagerFile(pBt->pPager);
          u8 aSave[4];
          u8 *aWrite = &pBuf[-4];
          assert( aWrite>=pBufStart );                         /* due to (6) */
          memcpy(aSave, aWrite, 4);
          rc = sqlite3OsRead(fd, aWrite, a+4, (i64)pBt->pageSize*(nextPage-1));
          /* NOTE(review): nextPage was already checked against pBt->nPage
          ** at the top of this loop iteration and has not been changed
          ** since, so this test looks like it can never be true - confirm. */
          if( rc && nextPage>pBt->nPage ) rc = SQLITE_CORRUPT_BKPT;
          nextPage = get4byte(aWrite);
          memcpy(aWrite, aSave, 4);
        }else
#endif

        {
          /* Go through the pager.  The page is requested read-only for a
          ** read; copyPayload() takes care of sqlite3PagerWrite() for a
          ** write. */
          DbPage *pDbPage;
          rc = sqlite3PagerGet(pBt->pPager, nextPage, &pDbPage,
              (eOp==0 ? PAGER_GET_READONLY : 0)
          );
          if( rc==SQLITE_OK ){
            aPayload = sqlite3PagerGetData(pDbPage);
            nextPage = get4byte(aPayload);
            rc = copyPayload(&aPayload[offset+4], pBuf, a, eOp, pDbPage);
            sqlite3PagerUnref(pDbPage);
            offset = 0;
          }
        }
        amt -= a;
        if( amt==0 ) return rc;   /* Nothing further was requested */
        pBuf += a;
      }
      if( rc ) break;
      iIdx++;
    }
  }

  if( rc==SQLITE_OK && amt>0 ){
    /* Overflow chain ends prematurely */
    return SQLITE_CORRUPT_PAGE(pPage);
  }
  return rc;
}
4977 
/*
** Read part of the payload for the row at which the cursor pCur is currently
** pointing.  "amt" bytes will be transferred into pBuf[].  The transfer
** begins at "offset".
**
** pCur can be pointing to either a table or an index b-tree.
** If pointing to a table btree, then the content section is read.  If
** pCur is pointing to an index b-tree then the key section is read.
**
** For sqlite3BtreePayload(), the caller must ensure that pCur is pointing
** to a valid row in the table.  For sqlite3BtreePayloadChecked(), the
** cursor might be invalid or might need to be restored before being read.
**
** This routine is a read-only front end to accessPayload(): it passes
** eOp==0 and returns whatever accessPayload() returns - SQLITE_OK on
** success or an error code if anything goes wrong.
**
** NOTE(review): accessPayload() only assert()s that "offset+amt" does not
** exceed the payload size.  In builds without assert() an over-long
** request presumably surfaces as an error when the overflow chain runs
** out - confirm before relying on an error return for that case.
*/
int sqlite3BtreePayload(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
  /* Preconditions: mutex held, cursor valid and positioned on a cell */
  assert( cursorHoldsMutex(pCur) );
  assert( pCur->eState==CURSOR_VALID );
  assert( pCur->iPage>=0 && pCur->pPage );
  assert( pCur->ix<pCur->pPage->nCell );
  return accessPayload(pCur, offset, amt, (unsigned char*)pBuf, 0);
}
5002 
/*
** This variant of sqlite3BtreePayload() works even if the cursor is not
** in the CURSOR_VALID state.  It is only used by the sqlite3_blob_read()
** interface.
*/
5008 #ifndef SQLITE_OMIT_INCRBLOB
accessPayloadChecked(BtCursor * pCur,u32 offset,u32 amt,void * pBuf)5009 static SQLITE_NOINLINE int accessPayloadChecked(
5010   BtCursor *pCur,
5011   u32 offset,
5012   u32 amt,
5013   void *pBuf
5014 ){
5015   int rc;
5016   if ( pCur->eState==CURSOR_INVALID ){
5017     return SQLITE_ABORT;
5018   }
5019   assert( cursorOwnsBtShared(pCur) );
5020   rc = btreeRestoreCursorPosition(pCur);
5021   return rc ? rc : accessPayload(pCur, offset, amt, pBuf, 0);
5022 }
/*
** Read amt bytes of payload, starting at offset, into pBuf.  Unlike
** sqlite3BtreePayload(), the cursor need not be in the CURSOR_VALID
** state on entry: any other state is handed to accessPayloadChecked().
*/
int sqlite3BtreePayloadChecked(BtCursor *pCur, u32 offset, u32 amt, void *pBuf){
  if( pCur->eState!=CURSOR_VALID ){
    /* Uncommon case, handled out of line */
    return accessPayloadChecked(pCur, offset, amt, pBuf);
  }
  assert( cursorOwnsBtShared(pCur) );
  return accessPayload(pCur, offset, amt, pBuf, 0);
}
5031 #endif /* SQLITE_OMIT_INCRBLOB */
5032 
5033 /*
5034 ** Return a pointer to payload information from the entry that the
5035 ** pCur cursor is pointing to.  The pointer is to the beginning of
5036 ** the key if index btrees (pPage->intKey==0) and is the data for
5037 ** table btrees (pPage->intKey==1). The number of bytes of available
5038 ** key/data is written into *pAmt.  If *pAmt==0, then the value
5039 ** returned will not be a valid pointer.
5040 **
5041 ** This routine is an optimization.  It is common for the entire key
5042 ** and data to fit on the local page and for there to be no overflow
5043 ** pages.  When that is so, this routine can be used to access the
5044 ** key and data without making a copy.  If the key and/or data spills
5045 ** onto overflow pages, then accessPayload() must be used to reassemble
5046 ** the key/data and copy it into a preallocated buffer.
5047 **
5048 ** The pointer returned by this routine looks directly into the cached
5049 ** page of the database.  The data might change or move the next time
5050 ** any btree routine is called.
5051 */
fetchPayload(BtCursor * pCur,u32 * pAmt)5052 static const void *fetchPayload(
5053   BtCursor *pCur,      /* Cursor pointing to entry to read from */
5054   u32 *pAmt            /* Write the number of available bytes here */
5055 ){
5056   int amt;
5057   assert( pCur!=0 && pCur->iPage>=0 && pCur->pPage);
5058   assert( pCur->eState==CURSOR_VALID );
5059   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
5060   assert( cursorOwnsBtShared(pCur) );
5061   assert( pCur->ix<pCur->pPage->nCell );
5062   assert( pCur->info.nSize>0 );
5063   assert( pCur->info.pPayload>pCur->pPage->aData || CORRUPT_DB );
5064   assert( pCur->info.pPayload<pCur->pPage->aDataEnd ||CORRUPT_DB);
5065   amt = pCur->info.nLocal;
5066   if( amt>(int)(pCur->pPage->aDataEnd - pCur->info.pPayload) ){
5067     /* There is too little space on the page for the expected amount
5068     ** of local content. Database must be corrupt. */
5069     assert( CORRUPT_DB );
5070     amt = MAX(0, (int)(pCur->pPage->aDataEnd - pCur->info.pPayload));
5071   }
5072   *pAmt = (u32)amt;
5073   return (void*)pCur->info.pPayload;
5074 }
5075 
5076 
/*
** For the entry that cursor pCur is pointing to, return as
** many bytes of the key or data as are available on the local
** b-tree page.  Write the number of available bytes into *pAmt.
**
** The pointer returned is ephemeral.  The key/data may move
** or be destroyed on the next call to any Btree routine,
** including calls from other threads against the same cache.
** Hence, a mutex on the BtShared should be held prior to calling
** this routine.
**
** This routine is used to get quick access to key and data
** in the common case where no overflow pages are used.  It is a
** direct pass-through to fetchPayload().
*/
const void *sqlite3BtreePayloadFetch(BtCursor *pCur, u32 *pAmt){
  return fetchPayload(pCur, pAmt);
}
5094 
5095 
5096 /*
5097 ** Move the cursor down to a new child page.  The newPgno argument is the
5098 ** page number of the child page to move to.
5099 **
5100 ** This function returns SQLITE_CORRUPT if the page-header flags field of
5101 ** the new child page does not match the flags field of the parent (i.e.
5102 ** if an intkey page appears to be the parent of a non-intkey page, or
5103 ** vice-versa).
5104 */
moveToChild(BtCursor * pCur,u32 newPgno)5105 static int moveToChild(BtCursor *pCur, u32 newPgno){
5106   BtShared *pBt = pCur->pBt;
5107 
5108   assert( cursorOwnsBtShared(pCur) );
5109   assert( pCur->eState==CURSOR_VALID );
5110   assert( pCur->iPage<BTCURSOR_MAX_DEPTH );
5111   assert( pCur->iPage>=0 );
5112   if( pCur->iPage>=(BTCURSOR_MAX_DEPTH-1) ){
5113     return SQLITE_CORRUPT_BKPT;
5114   }
5115   pCur->info.nSize = 0;
5116   pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
5117   pCur->aiIdx[pCur->iPage] = pCur->ix;
5118   pCur->apPage[pCur->iPage] = pCur->pPage;
5119   pCur->ix = 0;
5120   pCur->iPage++;
5121   return getAndInitPage(pBt, newPgno, &pCur->pPage, pCur, pCur->curPagerFlags);
5122 }
5123 
#ifdef SQLITE_DEBUG
/*
** Debugging aid.  Page pParent is an internal (non-leaf) tree page.
** Assert that page number iChild is the left-child of the iIdx'th
** cell in page pParent.  Or, if iIdx is equal to the total number of
** cells in pParent, assert that iChild is the right-child of the page
** (the 4-byte page number stored at offset 8 of the page header).
**
** Nothing is checked when CORRUPT_DB is true.  When SQLITE_DEBUG is not
** defined this routine compiles to nothing.
*/
static void assertParentIndex(MemPage *pParent, int iIdx, Pgno iChild){
  if( CORRUPT_DB ){
    /* The conditions tested below might not hold in a corrupt database */
    return;
  }
  assert( iIdx<=pParent->nCell );
  if( iIdx<pParent->nCell ){
    /* Ordinary cell: its first 4 bytes are the left-child page number */
    assert( get4byte(findCell(pParent, iIdx))==iChild );
  }else{
    /* One past the last cell: the right-child pointer in the header */
    assert( get4byte(&pParent->aData[pParent->hdrOffset+8])==iChild );
  }
}
#else
#  define assertParentIndex(x,y,z)
#endif
5145 
5146 /*
5147 ** Move the cursor up to the parent page.
5148 **
5149 ** pCur->idx is set to the cell index that contains the pointer
5150 ** to the page we are coming from.  If we are coming from the
5151 ** right-most child page then pCur->idx is set to one more than
5152 ** the largest cell index.
5153 */
moveToParent(BtCursor * pCur)5154 static void moveToParent(BtCursor *pCur){
5155   MemPage *pLeaf;
5156   assert( cursorOwnsBtShared(pCur) );
5157   assert( pCur->eState==CURSOR_VALID );
5158   assert( pCur->iPage>0 );
5159   assert( pCur->pPage );
5160   assertParentIndex(
5161     pCur->apPage[pCur->iPage-1],
5162     pCur->aiIdx[pCur->iPage-1],
5163     pCur->pPage->pgno
5164   );
5165   testcase( pCur->aiIdx[pCur->iPage-1] > pCur->apPage[pCur->iPage-1]->nCell );
5166   pCur->info.nSize = 0;
5167   pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
5168   pCur->ix = pCur->aiIdx[pCur->iPage-1];
5169   pLeaf = pCur->pPage;
5170   pCur->pPage = pCur->apPage[--pCur->iPage];
5171   releasePageNotNull(pLeaf);
5172 }
5173 
/*
** Move the cursor to point to the root page of its b-tree structure.
**
** If the table has a virtual root page, then the cursor is moved to point
** to the virtual root page instead of the actual root page. A table has a
** virtual root page when the actual root page contains no cells and a
** single child page. This can only happen with the table rooted at page 1.
**
** If the b-tree structure is empty, the cursor state is set to
** CURSOR_INVALID and this routine returns SQLITE_EMPTY. Otherwise,
** the cursor is set to point to the first cell located on the root
** (or virtual root) page and the cursor state is set to CURSOR_VALID.
**
** If this function returns successfully, it may be assumed that the
** page-header flags indicate that the [virtual] root-page is the expected
** kind of b-tree page (i.e. if when opening the cursor the caller did not
** specify a KeyInfo structure the flags byte is set to 0x05 or 0x0D,
** indicating a table b-tree, or if the caller did specify a KeyInfo
** structure the flags byte is set to 0x02 or 0x0A, indicating an index
** b-tree).
**
** Other return values visible in this routine:
**
**   * pCur->skipNext, if the cursor holds no page and is in the
**     CURSOR_FAULT state.
**   * The error from getAndInitPage() if the root page cannot be loaded
**     (the cursor is left CURSOR_INVALID).
**   * SQLITE_CORRUPT_PAGE() if the root page is not initialized or is
**     the wrong kind (table vs. index) of page.
**   * SQLITE_CORRUPT_BKPT if a root page other than page 1 has no cells
**     but is not a leaf.
**   * The result of moveToChild() when descending to a virtual root.
*/
static int moveToRoot(BtCursor *pCur){
  MemPage *pRoot;
  int rc = SQLITE_OK;

  assert( cursorOwnsBtShared(pCur) );
  assert( CURSOR_INVALID < CURSOR_REQUIRESEEK );
  assert( CURSOR_VALID   < CURSOR_REQUIRESEEK );
  assert( CURSOR_FAULT   > CURSOR_REQUIRESEEK );
  assert( pCur->eState < CURSOR_REQUIRESEEK || pCur->iPage<0 );
  assert( pCur->pgnoRoot>0 || pCur->iPage<0 );

  if( pCur->iPage>=0 ){
    /* The cursor already holds a stack of pages with the root at
    ** apPage[0].  If the cursor is below the root, release the current
    ** page and every stacked page other than apPage[0], make apPage[0]
    ** the current page, and jump past the root-page sanity check below.
    ** If the cursor is already on the root (iPage==0), fall through to
    ** that check. */
    if( pCur->iPage ){
      releasePageNotNull(pCur->pPage);
      while( --pCur->iPage ){
        releasePageNotNull(pCur->apPage[pCur->iPage]);
      }
      pCur->pPage = pCur->apPage[0];
      goto skip_init;
    }
  }else if( pCur->pgnoRoot==0 ){
    /* No page is held and there is no root page at all: treat as empty */
    pCur->eState = CURSOR_INVALID;
    return SQLITE_EMPTY;
  }else{
    /* No page is held.  Deal with any pending fault or saved position,
    ** then load the root page. */
    assert( pCur->iPage==(-1) );
    if( pCur->eState>=CURSOR_REQUIRESEEK ){
      if( pCur->eState==CURSOR_FAULT ){
        assert( pCur->skipNext!=SQLITE_OK );
        return pCur->skipNext;
      }
      sqlite3BtreeClearCursor(pCur);
    }
    rc = getAndInitPage(pCur->pBtree->pBt, pCur->pgnoRoot, &pCur->pPage,
                        0, pCur->curPagerFlags);
    if( rc!=SQLITE_OK ){
      pCur->eState = CURSOR_INVALID;
      return rc;
    }
    pCur->iPage = 0;
    pCur->curIntKey = pCur->pPage->intKey;
  }
  pRoot = pCur->pPage;
  assert( pRoot->pgno==pCur->pgnoRoot );

  /* If pCur->pKeyInfo is not NULL, then the caller that opened this cursor
  ** expected to open it on an index b-tree. Otherwise, if pKeyInfo is
  ** NULL, the caller expects a table b-tree. If this is not the case,
  ** return an SQLITE_CORRUPT error.
  **
  ** Earlier versions of SQLite assumed that this test could not fail
  ** if the root page was already loaded when this function was called (i.e.
  ** if pCur->iPage>=0). But this is not so if the database is corrupted
  ** in such a way that page pRoot is linked into a second b-tree table
  ** (or the freelist).  */
  assert( pRoot->intKey==1 || pRoot->intKey==0 );
  if( pRoot->isInit==0 || (pCur->pKeyInfo==0)!=pRoot->intKey ){
    return SQLITE_CORRUPT_PAGE(pCur->pPage);
  }

skip_init:
  /* The cursor is now on the root page.  Point it at cell 0 and discard
  ** everything cached about the previous position. */
  pCur->ix = 0;
  pCur->info.nSize = 0;
  pCur->curFlags &= ~(BTCF_AtLast|BTCF_ValidNKey|BTCF_ValidOvfl);

  pRoot = pCur->pPage;
  if( pRoot->nCell>0 ){
    pCur->eState = CURSOR_VALID;
  }else if( !pRoot->leaf ){
    /* A root with no cells that is not a leaf is a "virtual root": only
    ** accepted for page 1.  Descend to its single child, whose page
    ** number is the 4-byte value at offset 8 of the page header. */
    Pgno subpage;
    if( pRoot->pgno!=1 ) return SQLITE_CORRUPT_BKPT;
    subpage = get4byte(&pRoot->aData[pRoot->hdrOffset+8]);
    pCur->eState = CURSOR_VALID;   /* moveToChild() asserts a valid cursor */
    rc = moveToChild(pCur, subpage);
  }else{
    /* A leaf root page with no cells: the b-tree is empty */
    pCur->eState = CURSOR_INVALID;
    rc = SQLITE_EMPTY;
  }
  return rc;
}
5274 
5275 /*
5276 ** Move the cursor down to the left-most leaf entry beneath the
5277 ** entry to which it is currently pointing.
5278 **
5279 ** The left-most leaf is the one with the smallest key - the first
5280 ** in ascending order.
5281 */
moveToLeftmost(BtCursor * pCur)5282 static int moveToLeftmost(BtCursor *pCur){
5283   Pgno pgno;
5284   int rc = SQLITE_OK;
5285   MemPage *pPage;
5286 
5287   assert( cursorOwnsBtShared(pCur) );
5288   assert( pCur->eState==CURSOR_VALID );
5289   while( rc==SQLITE_OK && !(pPage = pCur->pPage)->leaf ){
5290     assert( pCur->ix<pPage->nCell );
5291     pgno = get4byte(findCell(pPage, pCur->ix));
5292     rc = moveToChild(pCur, pgno);
5293   }
5294   return rc;
5295 }
5296 
5297 /*
5298 ** Move the cursor down to the right-most leaf entry beneath the
5299 ** page to which it is currently pointing.  Notice the difference
5300 ** between moveToLeftmost() and moveToRightmost().  moveToLeftmost()
5301 ** finds the left-most entry beneath the *entry* whereas moveToRightmost()
5302 ** finds the right-most entry beneath the *page*.
5303 **
5304 ** The right-most entry is the one with the largest key - the last
5305 ** key in ascending order.
5306 */
moveToRightmost(BtCursor * pCur)5307 static int moveToRightmost(BtCursor *pCur){
5308   Pgno pgno;
5309   int rc = SQLITE_OK;
5310   MemPage *pPage = 0;
5311 
5312   assert( cursorOwnsBtShared(pCur) );
5313   assert( pCur->eState==CURSOR_VALID );
5314   while( !(pPage = pCur->pPage)->leaf ){
5315     pgno = get4byte(&pPage->aData[pPage->hdrOffset+8]);
5316     pCur->ix = pPage->nCell;
5317     rc = moveToChild(pCur, pgno);
5318     if( rc ) return rc;
5319   }
5320   pCur->ix = pPage->nCell-1;
5321   assert( pCur->info.nSize==0 );
5322   assert( (pCur->curFlags & BTCF_ValidNKey)==0 );
5323   return SQLITE_OK;
5324 }
5325 
5326 /* Move the cursor to the first entry in the table.  Return SQLITE_OK
5327 ** on success.  Set *pRes to 0 if the cursor actually points to something
5328 ** or set *pRes to 1 if the table is empty.
5329 */
sqlite3BtreeFirst(BtCursor * pCur,int * pRes)5330 int sqlite3BtreeFirst(BtCursor *pCur, int *pRes){
5331   int rc;
5332 
5333   assert( cursorOwnsBtShared(pCur) );
5334   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
5335   rc = moveToRoot(pCur);
5336   if( rc==SQLITE_OK ){
5337     assert( pCur->pPage->nCell>0 );
5338     *pRes = 0;
5339     rc = moveToLeftmost(pCur);
5340   }else if( rc==SQLITE_EMPTY ){
5341     assert( pCur->pgnoRoot==0 || pCur->pPage->nCell==0 );
5342     *pRes = 1;
5343     rc = SQLITE_OK;
5344   }
5345   return rc;
5346 }
5347 
5348 /* Move the cursor to the last entry in the table.  Return SQLITE_OK
5349 ** on success.  Set *pRes to 0 if the cursor actually points to something
5350 ** or set *pRes to 1 if the table is empty.
5351 */
sqlite3BtreeLast(BtCursor * pCur,int * pRes)5352 int sqlite3BtreeLast(BtCursor *pCur, int *pRes){
5353   int rc;
5354 
5355   assert( cursorOwnsBtShared(pCur) );
5356   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
5357 
5358   /* If the cursor already points to the last entry, this is a no-op. */
5359   if( CURSOR_VALID==pCur->eState && (pCur->curFlags & BTCF_AtLast)!=0 ){
5360 #ifdef SQLITE_DEBUG
5361     /* This block serves to assert() that the cursor really does point
5362     ** to the last entry in the b-tree. */
5363     int ii;
5364     for(ii=0; ii<pCur->iPage; ii++){
5365       assert( pCur->aiIdx[ii]==pCur->apPage[ii]->nCell );
5366     }
5367     assert( pCur->ix==pCur->pPage->nCell-1 );
5368     assert( pCur->pPage->leaf );
5369 #endif
5370     *pRes = 0;
5371     return SQLITE_OK;
5372   }
5373 
5374   rc = moveToRoot(pCur);
5375   if( rc==SQLITE_OK ){
5376     assert( pCur->eState==CURSOR_VALID );
5377     *pRes = 0;
5378     rc = moveToRightmost(pCur);
5379     if( rc==SQLITE_OK ){
5380       pCur->curFlags |= BTCF_AtLast;
5381     }else{
5382       pCur->curFlags &= ~BTCF_AtLast;
5383     }
5384   }else if( rc==SQLITE_EMPTY ){
5385     assert( pCur->pgnoRoot==0 || pCur->pPage->nCell==0 );
5386     *pRes = 1;
5387     rc = SQLITE_OK;
5388   }
5389   return rc;
5390 }
5391 
5392 /* Move the cursor so that it points to an entry near the key
5393 ** specified by pIdxKey or intKey.   Return a success code.
5394 **
5395 ** For INTKEY tables, the intKey parameter is used.  pIdxKey
5396 ** must be NULL.  For index tables, pIdxKey is used and intKey
5397 ** is ignored.
5398 **
5399 ** If an exact match is not found, then the cursor is always
5400 ** left pointing at a leaf page which would hold the entry if it
5401 ** were present.  The cursor might point to an entry that comes
5402 ** before or after the key.
5403 **
5404 ** An integer is written into *pRes which is the result of
5405 ** comparing the key with the entry to which the cursor is
5406 ** pointing.  The meaning of the integer written into
5407 ** *pRes is as follows:
5408 **
**     *pRes<0      The cursor is left pointing at an entry that
**                  is smaller than intKey/pIdxKey or if the table is empty
**                  and the cursor is therefore left pointing to nothing.
5412 **
5413 **     *pRes==0     The cursor is left pointing at an entry that
5414 **                  exactly matches intKey/pIdxKey.
5415 **
5416 **     *pRes>0      The cursor is left pointing at an entry that
5417 **                  is larger than intKey/pIdxKey.
5418 **
5419 ** For index tables, the pIdxKey->eqSeen field is set to 1 if there
5420 ** exists an entry in the table that exactly matches pIdxKey.
5421 */
sqlite3BtreeMovetoUnpacked(BtCursor * pCur,UnpackedRecord * pIdxKey,i64 intKey,int biasRight,int * pRes)5422 int sqlite3BtreeMovetoUnpacked(
5423   BtCursor *pCur,          /* The cursor to be moved */
5424   UnpackedRecord *pIdxKey, /* Unpacked index key */
5425   i64 intKey,              /* The table key */
5426   int biasRight,           /* If true, bias the search to the high end */
5427   int *pRes                /* Write search results here */
5428 ){
5429   int rc;
5430   RecordCompare xRecordCompare;
5431 
5432   assert( cursorOwnsBtShared(pCur) );
5433   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
5434   assert( pRes );
5435   assert( (pIdxKey==0)==(pCur->pKeyInfo==0) );
5436   assert( pCur->eState!=CURSOR_VALID || (pIdxKey==0)==(pCur->curIntKey!=0) );
5437 
5438   /* If the cursor is already positioned at the point we are trying
5439   ** to move to, then just return without doing any work */
5440   if( pIdxKey==0
5441    && pCur->eState==CURSOR_VALID && (pCur->curFlags & BTCF_ValidNKey)!=0
5442   ){
5443     if( pCur->info.nKey==intKey ){
5444       *pRes = 0;
5445       return SQLITE_OK;
5446     }
5447     if( pCur->info.nKey<intKey ){
5448       if( (pCur->curFlags & BTCF_AtLast)!=0 ){
5449         *pRes = -1;
5450         return SQLITE_OK;
5451       }
5452       /* If the requested key is one more than the previous key, then
5453       ** try to get there using sqlite3BtreeNext() rather than a full
5454       ** binary search.  This is an optimization only.  The correct answer
      ** is still obtained without this case, only a little more slowly */
5456       if( pCur->info.nKey+1==intKey ){
5457         *pRes = 0;
5458         rc = sqlite3BtreeNext(pCur, 0);
5459         if( rc==SQLITE_OK ){
5460           getCellInfo(pCur);
5461           if( pCur->info.nKey==intKey ){
5462             return SQLITE_OK;
5463           }
5464         }else if( rc==SQLITE_DONE ){
5465           rc = SQLITE_OK;
5466         }else{
5467           return rc;
5468         }
5469       }
5470     }
5471   }
5472 
5473 #ifdef SQLITE_DEBUG
5474   pCur->pBtree->nSeek++;   /* Performance measurement during testing */
5475 #endif
5476 
5477   if( pIdxKey ){
5478     xRecordCompare = sqlite3VdbeFindCompare(pIdxKey);
5479     pIdxKey->errCode = 0;
5480     assert( pIdxKey->default_rc==1
5481          || pIdxKey->default_rc==0
5482          || pIdxKey->default_rc==-1
5483     );
5484   }else{
5485     xRecordCompare = 0; /* All keys are integers */
5486   }
5487 
5488   rc = moveToRoot(pCur);
5489   if( rc ){
5490     if( rc==SQLITE_EMPTY ){
5491       assert( pCur->pgnoRoot==0 || pCur->pPage->nCell==0 );
5492       *pRes = -1;
5493       return SQLITE_OK;
5494     }
5495     return rc;
5496   }
5497   assert( pCur->pPage );
5498   assert( pCur->pPage->isInit );
5499   assert( pCur->eState==CURSOR_VALID );
5500   assert( pCur->pPage->nCell > 0 );
5501   assert( pCur->iPage==0 || pCur->apPage[0]->intKey==pCur->curIntKey );
5502   assert( pCur->curIntKey || pIdxKey );
5503   for(;;){
5504     int lwr, upr, idx, c;
5505     Pgno chldPg;
5506     MemPage *pPage = pCur->pPage;
5507     u8 *pCell;                          /* Pointer to current cell in pPage */
5508 
5509     /* pPage->nCell must be greater than zero. If this is the root-page
5510     ** the cursor would have been INVALID above and this for(;;) loop
5511     ** not run. If this is not the root-page, then the moveToChild() routine
5512     ** would have already detected db corruption. Similarly, pPage must
5513     ** be the right kind (index or table) of b-tree page. Otherwise
5514     ** a moveToChild() or moveToRoot() call would have detected corruption.  */
5515     assert( pPage->nCell>0 );
5516     assert( pPage->intKey==(pIdxKey==0) );
5517     lwr = 0;
5518     upr = pPage->nCell-1;
5519     assert( biasRight==0 || biasRight==1 );
5520     idx = upr>>(1-biasRight); /* idx = biasRight ? upr : (lwr+upr)/2; */
5521     pCur->ix = (u16)idx;
5522     if( xRecordCompare==0 ){
5523       for(;;){
5524         i64 nCellKey;
5525         pCell = findCellPastPtr(pPage, idx);
5526         if( pPage->intKeyLeaf ){
5527           while( 0x80 <= *(pCell++) ){
5528             if( pCell>=pPage->aDataEnd ){
5529               return SQLITE_CORRUPT_PAGE(pPage);
5530             }
5531           }
5532         }
5533         getVarint(pCell, (u64*)&nCellKey);
5534         if( nCellKey<intKey ){
5535           lwr = idx+1;
5536           if( lwr>upr ){ c = -1; break; }
5537         }else if( nCellKey>intKey ){
5538           upr = idx-1;
5539           if( lwr>upr ){ c = +1; break; }
5540         }else{
5541           assert( nCellKey==intKey );
5542           pCur->ix = (u16)idx;
5543           if( !pPage->leaf ){
5544             lwr = idx;
5545             goto moveto_next_layer;
5546           }else{
5547             pCur->curFlags |= BTCF_ValidNKey;
5548             pCur->info.nKey = nCellKey;
5549             pCur->info.nSize = 0;
5550             *pRes = 0;
5551             return SQLITE_OK;
5552           }
5553         }
5554         assert( lwr+upr>=0 );
5555         idx = (lwr+upr)>>1;  /* idx = (lwr+upr)/2; */
5556       }
5557     }else{
5558       for(;;){
5559         int nCell;  /* Size of the pCell cell in bytes */
5560         pCell = findCellPastPtr(pPage, idx);
5561 
5562         /* The maximum supported page-size is 65536 bytes. This means that
5563         ** the maximum number of record bytes stored on an index B-Tree
5564         ** page is less than 16384 bytes and may be stored as a 2-byte
5565         ** varint. This information is used to attempt to avoid parsing
5566         ** the entire cell by checking for the cases where the record is
5567         ** stored entirely within the b-tree page by inspecting the first
5568         ** 2 bytes of the cell.
5569         */
5570         nCell = pCell[0];
5571         if( nCell<=pPage->max1bytePayload ){
5572           /* This branch runs if the record-size field of the cell is a
5573           ** single byte varint and the record fits entirely on the main
5574           ** b-tree page.  */
5575           testcase( pCell+nCell+1==pPage->aDataEnd );
5576           c = xRecordCompare(nCell, (void*)&pCell[1], pIdxKey);
5577         }else if( !(pCell[1] & 0x80)
5578           && (nCell = ((nCell&0x7f)<<7) + pCell[1])<=pPage->maxLocal
5579         ){
5580           /* The record-size field is a 2 byte varint and the record
5581           ** fits entirely on the main b-tree page.  */
5582           testcase( pCell+nCell+2==pPage->aDataEnd );
5583           c = xRecordCompare(nCell, (void*)&pCell[2], pIdxKey);
5584         }else{
5585           /* The record flows over onto one or more overflow pages. In
5586           ** this case the whole cell needs to be parsed, a buffer allocated
5587           ** and accessPayload() used to retrieve the record into the
5588           ** buffer before VdbeRecordCompare() can be called.
5589           **
5590           ** If the record is corrupt, the xRecordCompare routine may read
5591           ** up to two varints past the end of the buffer. An extra 18
5592           ** bytes of padding is allocated at the end of the buffer in
5593           ** case this happens.  */
5594           void *pCellKey;
5595           u8 * const pCellBody = pCell - pPage->childPtrSize;
5596           const int nOverrun = 18;  /* Size of the overrun padding */
5597           pPage->xParseCell(pPage, pCellBody, &pCur->info);
5598           nCell = (int)pCur->info.nKey;
5599           testcase( nCell<0 );   /* True if key size is 2^32 or more */
5600           testcase( nCell==0 );  /* Invalid key size:  0x80 0x80 0x00 */
5601           testcase( nCell==1 );  /* Invalid key size:  0x80 0x80 0x01 */
5602           testcase( nCell==2 );  /* Minimum legal index key size */
5603           if( nCell<2 || nCell/pCur->pBt->usableSize>pCur->pBt->nPage ){
5604             rc = SQLITE_CORRUPT_PAGE(pPage);
5605             goto moveto_finish;
5606           }
5607           pCellKey = sqlite3Malloc( nCell+nOverrun );
5608           if( pCellKey==0 ){
5609             rc = SQLITE_NOMEM_BKPT;
5610             goto moveto_finish;
5611           }
5612           pCur->ix = (u16)idx;
5613           rc = accessPayload(pCur, 0, nCell, (unsigned char*)pCellKey, 0);
5614           memset(((u8*)pCellKey)+nCell,0,nOverrun); /* Fix uninit warnings */
5615           pCur->curFlags &= ~BTCF_ValidOvfl;
5616           if( rc ){
5617             sqlite3_free(pCellKey);
5618             goto moveto_finish;
5619           }
5620           c = sqlite3VdbeRecordCompare(nCell, pCellKey, pIdxKey);
5621           sqlite3_free(pCellKey);
5622         }
5623         assert(
5624             (pIdxKey->errCode!=SQLITE_CORRUPT || c==0)
5625          && (pIdxKey->errCode!=SQLITE_NOMEM || pCur->pBtree->db->mallocFailed)
5626         );
5627         if( c<0 ){
5628           lwr = idx+1;
5629         }else if( c>0 ){
5630           upr = idx-1;
5631         }else{
5632           assert( c==0 );
5633           *pRes = 0;
5634           rc = SQLITE_OK;
5635           pCur->ix = (u16)idx;
5636           if( pIdxKey->errCode ) rc = SQLITE_CORRUPT_BKPT;
5637           goto moveto_finish;
5638         }
5639         if( lwr>upr ) break;
5640         assert( lwr+upr>=0 );
5641         idx = (lwr+upr)>>1;  /* idx = (lwr+upr)/2 */
5642       }
5643     }
5644     assert( lwr==upr+1 || (pPage->intKey && !pPage->leaf) );
5645     assert( pPage->isInit );
5646     if( pPage->leaf ){
5647       assert( pCur->ix<pCur->pPage->nCell );
5648       pCur->ix = (u16)idx;
5649       *pRes = c;
5650       rc = SQLITE_OK;
5651       goto moveto_finish;
5652     }
5653 moveto_next_layer:
5654     if( lwr>=pPage->nCell ){
5655       chldPg = get4byte(&pPage->aData[pPage->hdrOffset+8]);
5656     }else{
5657       chldPg = get4byte(findCell(pPage, lwr));
5658     }
5659     pCur->ix = (u16)lwr;
5660     rc = moveToChild(pCur, chldPg);
5661     if( rc ) break;
5662   }
5663 moveto_finish:
5664   pCur->info.nSize = 0;
5665   assert( (pCur->curFlags & BTCF_ValidOvfl)==0 );
5666   return rc;
5667 }
5668 
5669 
5670 /*
5671 ** Return TRUE if the cursor is not pointing at an entry of the table.
5672 **
5673 ** TRUE will be returned after a call to sqlite3BtreeNext() moves
5674 ** past the last entry in the table or sqlite3BtreePrev() moves past
5675 ** the first entry.  TRUE is also returned if the table is empty.
5676 */
sqlite3BtreeEof(BtCursor * pCur)5677 int sqlite3BtreeEof(BtCursor *pCur){
5678   /* TODO: What if the cursor is in CURSOR_REQUIRESEEK but all table entries
5679   ** have been deleted? This API will need to change to return an error code
5680   ** as well as the boolean result value.
5681   */
5682   return (CURSOR_VALID!=pCur->eState);
5683 }
5684 
5685 /*
5686 ** Return an estimate for the number of rows in the table that pCur is
5687 ** pointing to.  Return a negative number if no estimate is currently
5688 ** available.
5689 */
sqlite3BtreeRowCountEst(BtCursor * pCur)5690 i64 sqlite3BtreeRowCountEst(BtCursor *pCur){
5691   i64 n;
5692   u8 i;
5693 
5694   assert( cursorOwnsBtShared(pCur) );
5695   assert( sqlite3_mutex_held(pCur->pBtree->db->mutex) );
5696 
5697   /* Currently this interface is only called by the OP_IfSmaller
5698   ** opcode, and it that case the cursor will always be valid and
5699   ** will always point to a leaf node. */
5700   if( NEVER(pCur->eState!=CURSOR_VALID) ) return -1;
5701   if( NEVER(pCur->pPage->leaf==0) ) return -1;
5702 
5703   n = pCur->pPage->nCell;
5704   for(i=0; i<pCur->iPage; i++){
5705     n *= pCur->apPage[i]->nCell;
5706   }
5707   return n;
5708 }
5709 
5710 /*
5711 ** Advance the cursor to the next entry in the database.
5712 ** Return value:
5713 **
5714 **    SQLITE_OK        success
5715 **    SQLITE_DONE      cursor is already pointing at the last element
5716 **    otherwise        some kind of error occurred
5717 **
5718 ** The main entry point is sqlite3BtreeNext().  That routine is optimized
5719 ** for the common case of merely incrementing the cell counter BtCursor.aiIdx
5720 ** to the next cell on the current page.  The (slower) btreeNext() helper
5721 ** routine is called when it is necessary to move to a different page or
5722 ** to restore the cursor.
5723 **
5724 ** If bit 0x01 of the F argument in sqlite3BtreeNext(C,F) is 1, then the
5725 ** cursor corresponds to an SQL index and this routine could have been
5726 ** skipped if the SQL index had been a unique index.  The F argument
5727 ** is a hint to the implement.  SQLite btree implementation does not use
5728 ** this hint, but COMDB2 does.
5729 */
btreeNext(BtCursor * pCur)5730 static SQLITE_NOINLINE int btreeNext(BtCursor *pCur){
5731   int rc;
5732   int idx;
5733   MemPage *pPage;
5734 
5735   assert( cursorOwnsBtShared(pCur) );
5736   if( pCur->eState!=CURSOR_VALID ){
5737     assert( (pCur->curFlags & BTCF_ValidOvfl)==0 );
5738     rc = restoreCursorPosition(pCur);
5739     if( rc!=SQLITE_OK ){
5740       return rc;
5741     }
5742     if( CURSOR_INVALID==pCur->eState ){
5743       return SQLITE_DONE;
5744     }
5745     if( pCur->eState==CURSOR_SKIPNEXT ){
5746       pCur->eState = CURSOR_VALID;
5747       if( pCur->skipNext>0 ) return SQLITE_OK;
5748     }
5749   }
5750 
5751   pPage = pCur->pPage;
5752   idx = ++pCur->ix;
5753   if( !pPage->isInit || sqlite3FaultSim(412) ){
5754     /* The only known way for this to happen is for there to be a
5755     ** recursive SQL function that does a DELETE operation as part of a
5756     ** SELECT which deletes content out from under an active cursor
5757     ** in a corrupt database file where the table being DELETE-ed from
5758     ** has pages in common with the table being queried.  See TH3
5759     ** module cov1/btree78.test testcase 220 (2018-06-08) for an
5760     ** example. */
5761     return SQLITE_CORRUPT_BKPT;
5762   }
5763 
5764   /* If the database file is corrupt, it is possible for the value of idx
5765   ** to be invalid here. This can only occur if a second cursor modifies
5766   ** the page while cursor pCur is holding a reference to it. Which can
5767   ** only happen if the database is corrupt in such a way as to link the
5768   ** page into more than one b-tree structure.
5769   **
5770   ** Update 2019-12-23: appears to long longer be possible after the
5771   ** addition of anotherValidCursor() condition on balance_deeper().  */
5772   harmless( idx>pPage->nCell );
5773 
5774   if( idx>=pPage->nCell ){
5775     if( !pPage->leaf ){
5776       rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
5777       if( rc ) return rc;
5778       return moveToLeftmost(pCur);
5779     }
5780     do{
5781       if( pCur->iPage==0 ){
5782         pCur->eState = CURSOR_INVALID;
5783         return SQLITE_DONE;
5784       }
5785       moveToParent(pCur);
5786       pPage = pCur->pPage;
5787     }while( pCur->ix>=pPage->nCell );
5788     if( pPage->intKey ){
5789       return sqlite3BtreeNext(pCur, 0);
5790     }else{
5791       return SQLITE_OK;
5792     }
5793   }
5794   if( pPage->leaf ){
5795     return SQLITE_OK;
5796   }else{
5797     return moveToLeftmost(pCur);
5798   }
5799 }
/*
** Advance cursor pCur to the next entry.  The case in which the next cell
** is on the current page is handled here.  Everything else (a cursor that
** is not in the CURSOR_VALID state, or an index that runs off the end of
** the page) is delegated to btreeNext().  The flags argument must be 0
** or 1 and is otherwise ignored.
*/
int sqlite3BtreeNext(BtCursor *pCur, int flags){
  MemPage *pPg;
  UNUSED_PARAMETER( flags );  /* Used in COMDB2 but not native SQLite */
  assert( cursorOwnsBtShared(pCur) );
  assert( flags==0 || flags==1 );
  pCur->curFlags &= ~(BTCF_ValidNKey|BTCF_ValidOvfl);
  pCur->info.nSize = 0;
  if( pCur->eState!=CURSOR_VALID ){
    return btreeNext(pCur);
  }
  pPg = pCur->pPage;
  pCur->ix++;
  if( pCur->ix>=pPg->nCell ){
    /* Undo the increment.  btreeNext() performs its own. */
    pCur->ix--;
    return btreeNext(pCur);
  }
  return pPg->leaf ? SQLITE_OK : moveToLeftmost(pCur);
}
5819 
5820 /*
5821 ** Step the cursor to the back to the previous entry in the database.
5822 ** Return values:
5823 **
5824 **     SQLITE_OK     success
5825 **     SQLITE_DONE   the cursor is already on the first element of the table
5826 **     otherwise     some kind of error occurred
5827 **
5828 ** The main entry point is sqlite3BtreePrevious().  That routine is optimized
5829 ** for the common case of merely decrementing the cell counter BtCursor.aiIdx
5830 ** to the previous cell on the current page.  The (slower) btreePrevious()
5831 ** helper routine is called when it is necessary to move to a different page
5832 ** or to restore the cursor.
5833 **
5834 ** If bit 0x01 of the F argument to sqlite3BtreePrevious(C,F) is 1, then
5835 ** the cursor corresponds to an SQL index and this routine could have been
5836 ** skipped if the SQL index had been a unique index.  The F argument is a
5837 ** hint to the implement.  The native SQLite btree implementation does not
5838 ** use this hint, but COMDB2 does.
5839 */
btreePrevious(BtCursor * pCur)5840 static SQLITE_NOINLINE int btreePrevious(BtCursor *pCur){
5841   int rc;
5842   MemPage *pPage;
5843 
5844   assert( cursorOwnsBtShared(pCur) );
5845   assert( (pCur->curFlags & (BTCF_AtLast|BTCF_ValidOvfl|BTCF_ValidNKey))==0 );
5846   assert( pCur->info.nSize==0 );
5847   if( pCur->eState!=CURSOR_VALID ){
5848     rc = restoreCursorPosition(pCur);
5849     if( rc!=SQLITE_OK ){
5850       return rc;
5851     }
5852     if( CURSOR_INVALID==pCur->eState ){
5853       return SQLITE_DONE;
5854     }
5855     if( CURSOR_SKIPNEXT==pCur->eState ){
5856       pCur->eState = CURSOR_VALID;
5857       if( pCur->skipNext<0 ) return SQLITE_OK;
5858     }
5859   }
5860 
5861   pPage = pCur->pPage;
5862   assert( pPage->isInit );
5863   if( !pPage->leaf ){
5864     int idx = pCur->ix;
5865     rc = moveToChild(pCur, get4byte(findCell(pPage, idx)));
5866     if( rc ) return rc;
5867     rc = moveToRightmost(pCur);
5868   }else{
5869     while( pCur->ix==0 ){
5870       if( pCur->iPage==0 ){
5871         pCur->eState = CURSOR_INVALID;
5872         return SQLITE_DONE;
5873       }
5874       moveToParent(pCur);
5875     }
5876     assert( pCur->info.nSize==0 );
5877     assert( (pCur->curFlags & (BTCF_ValidOvfl))==0 );
5878 
5879     pCur->ix--;
5880     pPage = pCur->pPage;
5881     if( pPage->intKey && !pPage->leaf ){
5882       rc = sqlite3BtreePrevious(pCur, 0);
5883     }else{
5884       rc = SQLITE_OK;
5885     }
5886   }
5887   return rc;
5888 }
/*
** Step cursor pCur back to the previous entry.  The case of a valid
** cursor on a leaf page with at least one earlier cell is handled here by
** decrementing the cell index.  Every other case is delegated to
** btreePrevious().  The flags argument must be 0 or 1 and is otherwise
** ignored.
*/
int sqlite3BtreePrevious(BtCursor *pCur, int flags){
  assert( cursorOwnsBtShared(pCur) );
  assert( flags==0 || flags==1 );
  UNUSED_PARAMETER( flags );  /* Used in COMDB2 but not native SQLite */
  pCur->info.nSize = 0;
  pCur->curFlags &= ~(BTCF_AtLast|BTCF_ValidOvfl|BTCF_ValidNKey);
  if( pCur->eState==CURSOR_VALID
   && pCur->ix>0
   && pCur->pPage->leaf
  ){
    pCur->ix--;
    return SQLITE_OK;
  }
  return btreePrevious(pCur);
}
5904 
5905 /*
5906 ** Allocate a new page from the database file.
5907 **
5908 ** The new page is marked as dirty.  (In other words, sqlite3PagerWrite()
5909 ** has already been called on the new page.)  The new page has also
5910 ** been referenced and the calling routine is responsible for calling
5911 ** sqlite3PagerUnref() on the new page when it is done.
5912 **
5913 ** SQLITE_OK is returned on success.  Any other return value indicates
5914 ** an error.  *ppPage is set to NULL in the event of an error.
5915 **
5916 ** If the "nearby" parameter is not 0, then an effort is made to
5917 ** locate a page close to the page number "nearby".  This can be used in an
5918 ** attempt to keep related pages close to each other in the database file,
5919 ** which in turn can make database access faster.
5920 **
5921 ** If the eMode parameter is BTALLOC_EXACT and the nearby page exists
5922 ** anywhere on the free-list, then it is guaranteed to be returned.  If
5923 ** eMode is BTALLOC_LT then the page returned will be less than or equal
5924 ** to nearby if any such page exists.  If eMode is BTALLOC_ANY then there
5925 ** are no restrictions on which page is returned.
5926 */
allocateBtreePage(BtShared * pBt,MemPage ** ppPage,Pgno * pPgno,Pgno nearby,u8 eMode)5927 static int allocateBtreePage(
5928   BtShared *pBt,         /* The btree */
5929   MemPage **ppPage,      /* Store pointer to the allocated page here */
5930   Pgno *pPgno,           /* Store the page number here */
5931   Pgno nearby,           /* Search for a page near this one */
5932   u8 eMode               /* BTALLOC_EXACT, BTALLOC_LT, or BTALLOC_ANY */
5933 ){
5934   MemPage *pPage1;
5935   int rc;
5936   u32 n;     /* Number of pages on the freelist */
5937   u32 k;     /* Number of leaves on the trunk of the freelist */
5938   MemPage *pTrunk = 0;
5939   MemPage *pPrevTrunk = 0;
5940   Pgno mxPage;     /* Total size of the database file */
5941 
5942   assert( sqlite3_mutex_held(pBt->mutex) );
5943   assert( eMode==BTALLOC_ANY || (nearby>0 && IfNotOmitAV(pBt->autoVacuum)) );
5944   pPage1 = pBt->pPage1;
5945   mxPage = btreePagecount(pBt);
5946   /* EVIDENCE-OF: R-05119-02637 The 4-byte big-endian integer at offset 36
5947   ** stores stores the total number of pages on the freelist. */
5948   n = get4byte(&pPage1->aData[36]);
5949   testcase( n==mxPage-1 );
5950   if( n>=mxPage ){
5951     return SQLITE_CORRUPT_BKPT;
5952   }
5953   if( n>0 ){
5954     /* There are pages on the freelist.  Reuse one of those pages. */
5955     Pgno iTrunk;
5956     u8 searchList = 0; /* If the free-list must be searched for 'nearby' */
5957     u32 nSearch = 0;   /* Count of the number of search attempts */
5958 
5959     /* If eMode==BTALLOC_EXACT and a query of the pointer-map
5960     ** shows that the page 'nearby' is somewhere on the free-list, then
5961     ** the entire-list will be searched for that page.
5962     */
5963 #ifndef SQLITE_OMIT_AUTOVACUUM
5964     if( eMode==BTALLOC_EXACT ){
5965       if( nearby<=mxPage ){
5966         u8 eType;
5967         assert( nearby>0 );
5968         assert( pBt->autoVacuum );
5969         rc = ptrmapGet(pBt, nearby, &eType, 0);
5970         if( rc ) return rc;
5971         if( eType==PTRMAP_FREEPAGE ){
5972           searchList = 1;
5973         }
5974       }
5975     }else if( eMode==BTALLOC_LE ){
5976       searchList = 1;
5977     }
5978 #endif
5979 
5980     /* Decrement the free-list count by 1. Set iTrunk to the index of the
5981     ** first free-list trunk page. iPrevTrunk is initially 1.
5982     */
5983     rc = sqlite3PagerWrite(pPage1->pDbPage);
5984     if( rc ) return rc;
5985     put4byte(&pPage1->aData[36], n-1);
5986 
5987     /* The code within this loop is run only once if the 'searchList' variable
5988     ** is not true. Otherwise, it runs once for each trunk-page on the
5989     ** free-list until the page 'nearby' is located (eMode==BTALLOC_EXACT)
5990     ** or until a page less than 'nearby' is located (eMode==BTALLOC_LT)
5991     */
5992     do {
5993       pPrevTrunk = pTrunk;
5994       if( pPrevTrunk ){
5995         /* EVIDENCE-OF: R-01506-11053 The first integer on a freelist trunk page
5996         ** is the page number of the next freelist trunk page in the list or
5997         ** zero if this is the last freelist trunk page. */
5998         iTrunk = get4byte(&pPrevTrunk->aData[0]);
5999       }else{
6000         /* EVIDENCE-OF: R-59841-13798 The 4-byte big-endian integer at offset 32
6001         ** stores the page number of the first page of the freelist, or zero if
6002         ** the freelist is empty. */
6003         iTrunk = get4byte(&pPage1->aData[32]);
6004       }
6005       testcase( iTrunk==mxPage );
6006       if( iTrunk>mxPage || nSearch++ > n ){
6007         rc = SQLITE_CORRUPT_PGNO(pPrevTrunk ? pPrevTrunk->pgno : 1);
6008       }else{
6009         rc = btreeGetUnusedPage(pBt, iTrunk, &pTrunk, 0);
6010       }
6011       if( rc ){
6012         pTrunk = 0;
6013         goto end_allocate_page;
6014       }
6015       assert( pTrunk!=0 );
6016       assert( pTrunk->aData!=0 );
6017       /* EVIDENCE-OF: R-13523-04394 The second integer on a freelist trunk page
6018       ** is the number of leaf page pointers to follow. */
6019       k = get4byte(&pTrunk->aData[4]);
6020       if( k==0 && !searchList ){
6021         /* The trunk has no leaves and the list is not being searched.
6022         ** So extract the trunk page itself and use it as the newly
6023         ** allocated page */
6024         assert( pPrevTrunk==0 );
6025         rc = sqlite3PagerWrite(pTrunk->pDbPage);
6026         if( rc ){
6027           goto end_allocate_page;
6028         }
6029         *pPgno = iTrunk;
6030         memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
6031         *ppPage = pTrunk;
6032         pTrunk = 0;
6033         TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
6034       }else if( k>(u32)(pBt->usableSize/4 - 2) ){
6035         /* Value of k is out of range.  Database corruption */
6036         rc = SQLITE_CORRUPT_PGNO(iTrunk);
6037         goto end_allocate_page;
6038 #ifndef SQLITE_OMIT_AUTOVACUUM
6039       }else if( searchList
6040             && (nearby==iTrunk || (iTrunk<nearby && eMode==BTALLOC_LE))
6041       ){
6042         /* The list is being searched and this trunk page is the page
6043         ** to allocate, regardless of whether it has leaves.
6044         */
6045         *pPgno = iTrunk;
6046         *ppPage = pTrunk;
6047         searchList = 0;
6048         rc = sqlite3PagerWrite(pTrunk->pDbPage);
6049         if( rc ){
6050           goto end_allocate_page;
6051         }
6052         if( k==0 ){
6053           if( !pPrevTrunk ){
6054             memcpy(&pPage1->aData[32], &pTrunk->aData[0], 4);
6055           }else{
6056             rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
6057             if( rc!=SQLITE_OK ){
6058               goto end_allocate_page;
6059             }
6060             memcpy(&pPrevTrunk->aData[0], &pTrunk->aData[0], 4);
6061           }
6062         }else{
6063           /* The trunk page is required by the caller but it contains
6064           ** pointers to free-list leaves. The first leaf becomes a trunk
6065           ** page in this case.
6066           */
6067           MemPage *pNewTrunk;
6068           Pgno iNewTrunk = get4byte(&pTrunk->aData[8]);
6069           if( iNewTrunk>mxPage ){
6070             rc = SQLITE_CORRUPT_PGNO(iTrunk);
6071             goto end_allocate_page;
6072           }
6073           testcase( iNewTrunk==mxPage );
6074           rc = btreeGetUnusedPage(pBt, iNewTrunk, &pNewTrunk, 0);
6075           if( rc!=SQLITE_OK ){
6076             goto end_allocate_page;
6077           }
6078           rc = sqlite3PagerWrite(pNewTrunk->pDbPage);
6079           if( rc!=SQLITE_OK ){
6080             releasePage(pNewTrunk);
6081             goto end_allocate_page;
6082           }
6083           memcpy(&pNewTrunk->aData[0], &pTrunk->aData[0], 4);
6084           put4byte(&pNewTrunk->aData[4], k-1);
6085           memcpy(&pNewTrunk->aData[8], &pTrunk->aData[12], (k-1)*4);
6086           releasePage(pNewTrunk);
6087           if( !pPrevTrunk ){
6088             assert( sqlite3PagerIswriteable(pPage1->pDbPage) );
6089             put4byte(&pPage1->aData[32], iNewTrunk);
6090           }else{
6091             rc = sqlite3PagerWrite(pPrevTrunk->pDbPage);
6092             if( rc ){
6093               goto end_allocate_page;
6094             }
6095             put4byte(&pPrevTrunk->aData[0], iNewTrunk);
6096           }
6097         }
6098         pTrunk = 0;
6099         TRACE(("ALLOCATE: %d trunk - %d free pages left\n", *pPgno, n-1));
6100 #endif
6101       }else if( k>0 ){
6102         /* Extract a leaf from the trunk */
6103         u32 closest;
6104         Pgno iPage;
6105         unsigned char *aData = pTrunk->aData;
6106         if( nearby>0 ){
6107           u32 i;
6108           closest = 0;
6109           if( eMode==BTALLOC_LE ){
6110             for(i=0; i<k; i++){
6111               iPage = get4byte(&aData[8+i*4]);
6112               if( iPage<=nearby ){
6113                 closest = i;
6114                 break;
6115               }
6116             }
6117           }else{
6118             int dist;
6119             dist = sqlite3AbsInt32(get4byte(&aData[8]) - nearby);
6120             for(i=1; i<k; i++){
6121               int d2 = sqlite3AbsInt32(get4byte(&aData[8+i*4]) - nearby);
6122               if( d2<dist ){
6123                 closest = i;
6124                 dist = d2;
6125               }
6126             }
6127           }
6128         }else{
6129           closest = 0;
6130         }
6131 
6132         iPage = get4byte(&aData[8+closest*4]);
6133         testcase( iPage==mxPage );
6134         if( iPage>mxPage ){
6135           rc = SQLITE_CORRUPT_PGNO(iTrunk);
6136           goto end_allocate_page;
6137         }
6138         testcase( iPage==mxPage );
6139         if( !searchList
6140          || (iPage==nearby || (iPage<nearby && eMode==BTALLOC_LE))
6141         ){
6142           int noContent;
6143           *pPgno = iPage;
6144           TRACE(("ALLOCATE: %d was leaf %d of %d on trunk %d"
6145                  ": %d more free pages\n",
6146                  *pPgno, closest+1, k, pTrunk->pgno, n-1));
6147           rc = sqlite3PagerWrite(pTrunk->pDbPage);
6148           if( rc ) goto end_allocate_page;
6149           if( closest<k-1 ){
6150             memcpy(&aData[8+closest*4], &aData[4+k*4], 4);
6151           }
6152           put4byte(&aData[4], k-1);
6153           noContent = !btreeGetHasContent(pBt, *pPgno)? PAGER_GET_NOCONTENT : 0;
6154           rc = btreeGetUnusedPage(pBt, *pPgno, ppPage, noContent);
6155           if( rc==SQLITE_OK ){
6156             rc = sqlite3PagerWrite((*ppPage)->pDbPage);
6157             if( rc!=SQLITE_OK ){
6158               releasePage(*ppPage);
6159               *ppPage = 0;
6160             }
6161           }
6162           searchList = 0;
6163         }
6164       }
6165       releasePage(pPrevTrunk);
6166       pPrevTrunk = 0;
6167     }while( searchList );
6168   }else{
6169     /* There are no pages on the freelist, so append a new page to the
6170     ** database image.
6171     **
6172     ** Normally, new pages allocated by this block can be requested from the
6173     ** pager layer with the 'no-content' flag set. This prevents the pager
6174     ** from trying to read the pages content from disk. However, if the
6175     ** current transaction has already run one or more incremental-vacuum
6176     ** steps, then the page we are about to allocate may contain content
6177     ** that is required in the event of a rollback. In this case, do
6178     ** not set the no-content flag. This causes the pager to load and journal
6179     ** the current page content before overwriting it.
6180     **
6181     ** Note that the pager will not actually attempt to load or journal
6182     ** content for any page that really does lie past the end of the database
6183     ** file on disk. So the effects of disabling the no-content optimization
6184     ** here are confined to those pages that lie between the end of the
6185     ** database image and the end of the database file.
6186     */
6187     int bNoContent = (0==IfNotOmitAV(pBt->bDoTruncate))? PAGER_GET_NOCONTENT:0;
6188 
6189     rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
6190     if( rc ) return rc;
6191     pBt->nPage++;
6192     if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ) pBt->nPage++;
6193 
6194 #ifndef SQLITE_OMIT_AUTOVACUUM
6195     if( pBt->autoVacuum && PTRMAP_ISPAGE(pBt, pBt->nPage) ){
6196       /* If *pPgno refers to a pointer-map page, allocate two new pages
6197       ** at the end of the file instead of one. The first allocated page
6198       ** becomes a new pointer-map page, the second is used by the caller.
6199       */
6200       MemPage *pPg = 0;
6201       TRACE(("ALLOCATE: %d from end of file (pointer-map page)\n", pBt->nPage));
6202       assert( pBt->nPage!=PENDING_BYTE_PAGE(pBt) );
6203       rc = btreeGetUnusedPage(pBt, pBt->nPage, &pPg, bNoContent);
6204       if( rc==SQLITE_OK ){
6205         rc = sqlite3PagerWrite(pPg->pDbPage);
6206         releasePage(pPg);
6207       }
6208       if( rc ) return rc;
6209       pBt->nPage++;
6210       if( pBt->nPage==PENDING_BYTE_PAGE(pBt) ){ pBt->nPage++; }
6211     }
6212 #endif
6213     put4byte(28 + (u8*)pBt->pPage1->aData, pBt->nPage);
6214     *pPgno = pBt->nPage;
6215 
6216     assert( *pPgno!=PENDING_BYTE_PAGE(pBt) );
6217     rc = btreeGetUnusedPage(pBt, *pPgno, ppPage, bNoContent);
6218     if( rc ) return rc;
6219     rc = sqlite3PagerWrite((*ppPage)->pDbPage);
6220     if( rc!=SQLITE_OK ){
6221       releasePage(*ppPage);
6222       *ppPage = 0;
6223     }
6224     TRACE(("ALLOCATE: %d from end of file\n", *pPgno));
6225   }
6226 
6227   assert( CORRUPT_DB || *pPgno!=PENDING_BYTE_PAGE(pBt) );
6228 
6229 end_allocate_page:
6230   releasePage(pTrunk);
6231   releasePage(pPrevTrunk);
6232   assert( rc!=SQLITE_OK || sqlite3PagerPageRefcount((*ppPage)->pDbPage)<=1 );
6233   assert( rc!=SQLITE_OK || (*ppPage)->isInit==0 );
6234   return rc;
6235 }
6236 
6237 /*
6238 ** This function is used to add page iPage to the database file free-list.
6239 ** It is assumed that the page is not already a part of the free-list.
6240 **
6241 ** The value passed as the second argument to this function is optional.
6242 ** If the caller happens to have a pointer to the MemPage object
6243 ** corresponding to page iPage handy, it may pass it as the second value.
6244 ** Otherwise, it may pass NULL.
6245 **
6246 ** If a pointer to a MemPage object is passed as the second argument,
6247 ** its reference count is not altered by this function.
6248 */
freePage2(BtShared * pBt,MemPage * pMemPage,Pgno iPage)6249 static int freePage2(BtShared *pBt, MemPage *pMemPage, Pgno iPage){
6250   MemPage *pTrunk = 0;                /* Free-list trunk page */
6251   Pgno iTrunk = 0;                    /* Page number of free-list trunk page */
6252   MemPage *pPage1 = pBt->pPage1;      /* Local reference to page 1 */
6253   MemPage *pPage;                     /* Page being freed. May be NULL. */
6254   int rc;                             /* Return Code */
6255   u32 nFree;                          /* Initial number of pages on free-list */
6256 
6257   assert( sqlite3_mutex_held(pBt->mutex) );
6258   assert( CORRUPT_DB || iPage>1 );
6259   assert( !pMemPage || pMemPage->pgno==iPage );
6260 
6261   if( iPage<2 || iPage>pBt->nPage ){
6262     return SQLITE_CORRUPT_BKPT;
6263   }
6264   if( pMemPage ){
6265     pPage = pMemPage;
6266     sqlite3PagerRef(pPage->pDbPage);
6267   }else{
6268     pPage = btreePageLookup(pBt, iPage);
6269   }
6270 
6271   /* Increment the free page count on pPage1 */
6272   rc = sqlite3PagerWrite(pPage1->pDbPage);
6273   if( rc ) goto freepage_out;
6274   nFree = get4byte(&pPage1->aData[36]);
6275   put4byte(&pPage1->aData[36], nFree+1);
6276 
6277   if( pBt->btsFlags & BTS_SECURE_DELETE ){
6278     /* If the secure_delete option is enabled, then
6279     ** always fully overwrite deleted information with zeros.
6280     */
6281     if( (!pPage && ((rc = btreeGetPage(pBt, iPage, &pPage, 0))!=0) )
6282      ||            ((rc = sqlite3PagerWrite(pPage->pDbPage))!=0)
6283     ){
6284       goto freepage_out;
6285     }
6286     memset(pPage->aData, 0, pPage->pBt->pageSize);
6287   }
6288 
6289   /* If the database supports auto-vacuum, write an entry in the pointer-map
6290   ** to indicate that the page is free.
6291   */
6292   if( ISAUTOVACUUM ){
6293     ptrmapPut(pBt, iPage, PTRMAP_FREEPAGE, 0, &rc);
6294     if( rc ) goto freepage_out;
6295   }
6296 
6297   /* Now manipulate the actual database free-list structure. There are two
6298   ** possibilities. If the free-list is currently empty, or if the first
6299   ** trunk page in the free-list is full, then this page will become a
6300   ** new free-list trunk page. Otherwise, it will become a leaf of the
6301   ** first trunk page in the current free-list. This block tests if it
6302   ** is possible to add the page as a new free-list leaf.
6303   */
6304   if( nFree!=0 ){
6305     u32 nLeaf;                /* Initial number of leaf cells on trunk page */
6306 
6307     iTrunk = get4byte(&pPage1->aData[32]);
6308     if( iTrunk>btreePagecount(pBt) ){
6309       rc = SQLITE_CORRUPT_BKPT;
6310       goto freepage_out;
6311     }
6312     rc = btreeGetPage(pBt, iTrunk, &pTrunk, 0);
6313     if( rc!=SQLITE_OK ){
6314       goto freepage_out;
6315     }
6316 
6317     nLeaf = get4byte(&pTrunk->aData[4]);
6318     assert( pBt->usableSize>32 );
6319     if( nLeaf > (u32)pBt->usableSize/4 - 2 ){
6320       rc = SQLITE_CORRUPT_BKPT;
6321       goto freepage_out;
6322     }
6323     if( nLeaf < (u32)pBt->usableSize/4 - 8 ){
6324       /* In this case there is room on the trunk page to insert the page
6325       ** being freed as a new leaf.
6326       **
6327       ** Note that the trunk page is not really full until it contains
6328       ** usableSize/4 - 2 entries, not usableSize/4 - 8 entries as we have
6329       ** coded.  But due to a coding error in versions of SQLite prior to
6330       ** 3.6.0, databases with freelist trunk pages holding more than
6331       ** usableSize/4 - 8 entries will be reported as corrupt.  In order
6332       ** to maintain backwards compatibility with older versions of SQLite,
6333       ** we will continue to restrict the number of entries to usableSize/4 - 8
6334       ** for now.  At some point in the future (once everyone has upgraded
6335       ** to 3.6.0 or later) we should consider fixing the conditional above
6336       ** to read "usableSize/4-2" instead of "usableSize/4-8".
6337       **
6338       ** EVIDENCE-OF: R-19920-11576 However, newer versions of SQLite still
6339       ** avoid using the last six entries in the freelist trunk page array in
6340       ** order that database files created by newer versions of SQLite can be
6341       ** read by older versions of SQLite.
6342       */
6343       rc = sqlite3PagerWrite(pTrunk->pDbPage);
6344       if( rc==SQLITE_OK ){
6345         put4byte(&pTrunk->aData[4], nLeaf+1);
6346         put4byte(&pTrunk->aData[8+nLeaf*4], iPage);
6347         if( pPage && (pBt->btsFlags & BTS_SECURE_DELETE)==0 ){
6348           sqlite3PagerDontWrite(pPage->pDbPage);
6349         }
6350         rc = btreeSetHasContent(pBt, iPage);
6351       }
6352       TRACE(("FREE-PAGE: %d leaf on trunk page %d\n",pPage->pgno,pTrunk->pgno));
6353       goto freepage_out;
6354     }
6355   }
6356 
  /* If control flows to this point, then it was not possible to add the
  ** page being freed as a leaf page of the first trunk in the free-list.
  ** Possibly because the free-list is empty, or possibly because the
  ** first trunk in the free-list is full. Either way, the page being freed
  ** will become the new first trunk page in the free-list.
  */
6363   if( pPage==0 && SQLITE_OK!=(rc = btreeGetPage(pBt, iPage, &pPage, 0)) ){
6364     goto freepage_out;
6365   }
6366   rc = sqlite3PagerWrite(pPage->pDbPage);
6367   if( rc!=SQLITE_OK ){
6368     goto freepage_out;
6369   }
6370   put4byte(pPage->aData, iTrunk);
6371   put4byte(&pPage->aData[4], 0);
6372   put4byte(&pPage1->aData[32], iPage);
6373   TRACE(("FREE-PAGE: %d new trunk page replacing %d\n", pPage->pgno, iTrunk));
6374 
6375 freepage_out:
6376   if( pPage ){
6377     pPage->isInit = 0;
6378   }
6379   releasePage(pPage);
6380   releasePage(pTrunk);
6381   return rc;
6382 }
/*
** Convenience wrapper around freePage2() that frees page pPage of the
** b-tree pPage->pBt.
**
** If *pRC already holds an error code when this routine is called, it
** does nothing.  Otherwise the return code of freePage2() is stored
** in *pRC.
*/
static void freePage(MemPage *pPage, int *pRC){
  if( (*pRC)!=SQLITE_OK ) return;
  *pRC = freePage2(pPage->pBt, pPage, pPage->pgno);
}
6388 
6389 /*
6390 ** Free any overflow pages associated with the given Cell.  Store
6391 ** size information about the cell in pInfo.
6392 */
clearCell(MemPage * pPage,unsigned char * pCell,CellInfo * pInfo)6393 static int clearCell(
6394   MemPage *pPage,          /* The page that contains the Cell */
6395   unsigned char *pCell,    /* First byte of the Cell */
6396   CellInfo *pInfo          /* Size information about the cell */
6397 ){
6398   BtShared *pBt;
6399   Pgno ovflPgno;
6400   int rc;
6401   int nOvfl;
6402   u32 ovflPageSize;
6403 
6404   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
6405   pPage->xParseCell(pPage, pCell, pInfo);
6406   if( pInfo->nLocal==pInfo->nPayload ){
6407     return SQLITE_OK;  /* No overflow pages. Return without doing anything */
6408   }
6409   testcase( pCell + pInfo->nSize == pPage->aDataEnd );
6410   testcase( pCell + (pInfo->nSize-1) == pPage->aDataEnd );
6411   if( pCell + pInfo->nSize > pPage->aDataEnd ){
6412     /* Cell extends past end of page */
6413     return SQLITE_CORRUPT_PAGE(pPage);
6414   }
6415   ovflPgno = get4byte(pCell + pInfo->nSize - 4);
6416   pBt = pPage->pBt;
6417   assert( pBt->usableSize > 4 );
6418   ovflPageSize = pBt->usableSize - 4;
6419   nOvfl = (pInfo->nPayload - pInfo->nLocal + ovflPageSize - 1)/ovflPageSize;
6420   assert( nOvfl>0 ||
6421     (CORRUPT_DB && (pInfo->nPayload + ovflPageSize)<ovflPageSize)
6422   );
6423   while( nOvfl-- ){
6424     Pgno iNext = 0;
6425     MemPage *pOvfl = 0;
6426     if( ovflPgno<2 || ovflPgno>btreePagecount(pBt) ){
6427       /* 0 is not a legal page number and page 1 cannot be an
6428       ** overflow page. Therefore if ovflPgno<2 or past the end of the
6429       ** file the database must be corrupt. */
6430       return SQLITE_CORRUPT_BKPT;
6431     }
6432     if( nOvfl ){
6433       rc = getOverflowPage(pBt, ovflPgno, &pOvfl, &iNext);
6434       if( rc ) return rc;
6435     }
6436 
6437     if( ( pOvfl || ((pOvfl = btreePageLookup(pBt, ovflPgno))!=0) )
6438      && sqlite3PagerPageRefcount(pOvfl->pDbPage)!=1
6439     ){
6440       /* There is no reason any cursor should have an outstanding reference
6441       ** to an overflow page belonging to a cell that is being deleted/updated.
6442       ** So if there exists more than one reference to this page, then it
6443       ** must not really be an overflow page and the database must be corrupt.
6444       ** It is helpful to detect this before calling freePage2(), as
6445       ** freePage2() may zero the page contents if secure-delete mode is
6446       ** enabled. If this 'overflow' page happens to be a page that the
6447       ** caller is iterating through or using in some other way, this
6448       ** can be problematic.
6449       */
6450       rc = SQLITE_CORRUPT_BKPT;
6451     }else{
6452       rc = freePage2(pBt, pOvfl, ovflPgno);
6453     }
6454 
6455     if( pOvfl ){
6456       sqlite3PagerUnref(pOvfl->pDbPage);
6457     }
6458     if( rc ) return rc;
6459     ovflPgno = iNext;
6460   }
6461   return SQLITE_OK;
6462 }
6463 
/*
** Create the byte sequence used to represent a cell on page pPage
** and write that byte sequence into pCell[].  Overflow pages are
** allocated and filled in as necessary.  The calling procedure
** is responsible for making sure sufficient space has been allocated
** for pCell[].
**
** Note that pCell does not necessarily need to point to the pPage->aData
** area.  pCell might point to some temporary storage.  The cell will
** be constructed in this temporary area then copied into pPage->aData
** later.
**
** On success SQLITE_OK is returned and *pnSize holds the size of the
** local part of the cell: header plus local payload (but never less
** than 4) when everything fits on the page, or header plus local payload
** plus the 4-byte overflow page number when the payload spills.  If
** allocating an overflow page or writing its pointer-map entry fails,
** that error code is returned instead.
*/
static int fillInCell(
  MemPage *pPage,                /* The page that contains the cell */
  unsigned char *pCell,          /* Complete text of the cell */
  const BtreePayload *pX,        /* Payload with which to construct the cell */
  int *pnSize                    /* Write cell size here */
){
  int nPayload;
  const u8 *pSrc;
  int nSrc, n, rc, mn;
  int spaceLeft;
  MemPage *pToRelease;
  unsigned char *pPrior;
  unsigned char *pPayload;
  BtShared *pBt;
  Pgno pgnoOvfl;
  int nHeader;

  assert( sqlite3_mutex_held(pPage->pBt->mutex) );

  /* pPage is not necessarily writeable since pCell might be auxiliary
  ** buffer space that is separate from the pPage buffer area */
  assert( pCell<pPage->aData || pCell>=&pPage->aData[pPage->pBt->pageSize]
            || sqlite3PagerIswriteable(pPage->pDbPage) );

  /* Fill in the header.  The first childPtrSize bytes are skipped over
  ** here; this routine does not write them. */
  nHeader = pPage->childPtrSize;
  if( pPage->intKey ){
    /* intKey page: the payload is pData followed by nZero zero bytes, and
    ** the header is the payload size followed by the integer key. */
    nPayload = pX->nData + pX->nZero;
    pSrc = pX->pData;
    nSrc = pX->nData;
    assert( pPage->intKeyLeaf ); /* fillInCell() only called for leaves */
    nHeader += putVarint32(&pCell[nHeader], nPayload);
    nHeader += putVarint(&pCell[nHeader], *(u64*)&pX->nKey);
  }else{
    /* Otherwise the payload is the key itself and the header holds
    ** only the payload size. */
    assert( pX->nKey<=0x7fffffff && pX->pKey!=0 );
    nSrc = nPayload = (int)pX->nKey;
    pSrc = pX->pKey;
    nHeader += putVarint32(&pCell[nHeader], nPayload);
  }

  /* Fill in the payload */
  pPayload = &pCell[nHeader];
  if( nPayload<=pPage->maxLocal ){
    /* This is the common case where everything fits on the btree page
    ** and no overflow pages are required. */
    n = nHeader + nPayload;
    testcase( n==3 );
    testcase( n==4 );
    if( n<4 ) n = 4;     /* The reported cell size is never less than 4 */
    *pnSize = n;
    assert( nSrc<=nPayload );
    testcase( nSrc<nPayload );
    memcpy(pPayload, pSrc, nSrc);
    memset(pPayload+nSrc, 0, nPayload-nSrc);   /* Zero-filled tail, if any */
    return SQLITE_OK;
  }

  /* If we reach this point, it means that some of the content will need
  ** to spill onto overflow pages.
  **
  ** The number of bytes kept locally is minLocal plus the remainder left
  ** over after dividing the rest of the payload into (usableSize-4)-byte
  ** overflow pages, unless that sum exceeds maxLocal, in which case only
  ** minLocal bytes are kept locally.
  */
  mn = pPage->minLocal;
  n = mn + (nPayload - mn) % (pPage->pBt->usableSize - 4);
  testcase( n==pPage->maxLocal );
  testcase( n==pPage->maxLocal+1 );
  if( n > pPage->maxLocal ) n = mn;
  spaceLeft = n;
  *pnSize = n + nHeader + 4;
  pPrior = &pCell[nHeader+n];
  pToRelease = 0;
  pgnoOvfl = 0;
  pBt = pPage->pBt;

  /* At this point variables should be set as follows:
  **
  **   nPayload           Total payload size in bytes
  **   pPayload           Begin writing payload here
  **   spaceLeft          Space available at pPayload.  If nPayload>spaceLeft,
  **                      that means content must spill into overflow pages.
  **   *pnSize            Size of the local cell (not counting overflow pages)
  **   pPrior             Where to write the pgno of the first overflow page
  **
  ** Use a call to btreeParseCellPtr() to verify that the values above
  ** were computed correctly.
  */
#ifdef SQLITE_DEBUG
  {
    CellInfo info;
    pPage->xParseCell(pPage, pCell, &info);
    assert( nHeader==(int)(info.pPayload - pCell) );
    assert( info.nKey==pX->nKey );
    assert( *pnSize == info.nSize );
    assert( spaceLeft == info.nLocal );
  }
#endif

  /* Write the payload into the local Cell and any extra into overflow pages */
  while( 1 ){
    /* n is the number of bytes written by this pass: no more than what
    ** remains of the payload and no more than the space left at pPayload */
    n = nPayload;
    if( n>spaceLeft ) n = spaceLeft;

    /* If pToRelease is not zero then pPayload points into the data area
    ** of pToRelease.  Make sure pToRelease is still writeable. */
    assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) );

    /* If pPayload is part of the data area of pPage, then make sure pPage
    ** is still writeable */
    assert( pPayload<pPage->aData || pPayload>=&pPage->aData[pBt->pageSize]
            || sqlite3PagerIswriteable(pPage->pDbPage) );

    if( nSrc>=n ){
      /* Enough source bytes remain to fill this pass completely */
      memcpy(pPayload, pSrc, n);
    }else if( nSrc>0 ){
      /* Only nSrc source bytes remain: copy just those on this pass */
      n = nSrc;
      memcpy(pPayload, pSrc, n);
    }else{
      /* The source is exhausted: the rest of the payload is zeros */
      memset(pPayload, 0, n);
    }
    nPayload -= n;
    if( nPayload<=0 ) break;
    pPayload += n;
    pSrc += n;
    nSrc -= n;
    spaceLeft -= n;
    if( spaceLeft==0 ){
      /* The current destination is full.  Allocate another overflow page
      ** and continue writing there. */
      MemPage *pOvfl = 0;
#ifndef SQLITE_OMIT_AUTOVACUUM
      Pgno pgnoPtrmap = pgnoOvfl; /* Overflow page pointer-map entry page */
      if( pBt->autoVacuum ){
        /* Advance pgnoOvfl past any page for which PTRMAP_ISPAGE() is true
        ** and past PENDING_BYTE_PAGE before passing it to the allocator */
        do{
          pgnoOvfl++;
        } while(
          PTRMAP_ISPAGE(pBt, pgnoOvfl) || pgnoOvfl==PENDING_BYTE_PAGE(pBt)
        );
      }
#endif
      /* The final argument, 0, is BTALLOC_ANY */
      rc = allocateBtreePage(pBt, &pOvfl, &pgnoOvfl, pgnoOvfl, 0);
#ifndef SQLITE_OMIT_AUTOVACUUM
      /* If the database supports auto-vacuum, and the second or subsequent
      ** overflow page is being allocated, add an entry to the pointer-map
      ** for that page now.
      **
      ** If this is the first overflow page, then write a partial entry
      ** to the pointer-map. If we write nothing to this pointer-map slot,
      ** then the optimistic overflow chain processing in clearCell()
      ** may misinterpret the uninitialized values and delete the
      ** wrong pages from the database.
      */
      if( pBt->autoVacuum && rc==SQLITE_OK ){
        u8 eType = (pgnoPtrmap?PTRMAP_OVERFLOW2:PTRMAP_OVERFLOW1);
        ptrmapPut(pBt, pgnoOvfl, eType, pgnoPtrmap, &rc);
        if( rc ){
          releasePage(pOvfl);
        }
      }
#endif
      if( rc ){
        releasePage(pToRelease);
        return rc;
      }

      /* If pToRelease is not zero then pPrior points into the data area
      ** of pToRelease.  Make sure pToRelease is still writeable. */
      assert( pToRelease==0 || sqlite3PagerIswriteable(pToRelease->pDbPage) );

      /* If pPrior is part of the data area of pPage, then make sure pPage
      ** is still writeable */
      assert( pPrior<pPage->aData || pPrior>=&pPage->aData[pBt->pageSize]
            || sqlite3PagerIswriteable(pPage->pDbPage) );

      /* Link the new page into the chain, then make it the current
      ** destination: its first 4 bytes are the (initially zero) pointer to
      ** the next overflow page and payload follows from byte 4 onward. */
      put4byte(pPrior, pgnoOvfl);
      releasePage(pToRelease);
      pToRelease = pOvfl;
      pPrior = pOvfl->aData;
      put4byte(pPrior, 0);
      pPayload = &pOvfl->aData[4];
      spaceLeft = pBt->usableSize - 4;
    }
  }
  releasePage(pToRelease);
  return SQLITE_OK;
}
6657 
6658 /*
6659 ** Remove the i-th cell from pPage.  This routine effects pPage only.
6660 ** The cell content is not freed or deallocated.  It is assumed that
6661 ** the cell content has been copied someplace else.  This routine just
6662 ** removes the reference to the cell from pPage.
6663 **
6664 ** "sz" must be the number of bytes in the cell.
6665 */
dropCell(MemPage * pPage,int idx,int sz,int * pRC)6666 static void dropCell(MemPage *pPage, int idx, int sz, int *pRC){
6667   u32 pc;         /* Offset to cell content of cell being deleted */
6668   u8 *data;       /* pPage->aData */
6669   u8 *ptr;        /* Used to move bytes around within data[] */
6670   int rc;         /* The return code */
6671   int hdr;        /* Beginning of the header.  0 most pages.  100 page 1 */
6672 
6673   if( *pRC ) return;
6674   assert( idx>=0 && idx<pPage->nCell );
6675   assert( CORRUPT_DB || sz==cellSize(pPage, idx) );
6676   assert( sqlite3PagerIswriteable(pPage->pDbPage) );
6677   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
6678   assert( pPage->nFree>=0 );
6679   data = pPage->aData;
6680   ptr = &pPage->aCellIdx[2*idx];
6681   pc = get2byte(ptr);
6682   hdr = pPage->hdrOffset;
6683   testcase( pc==get2byte(&data[hdr+5]) );
6684   testcase( pc+sz==pPage->pBt->usableSize );
6685   if( pc+sz > pPage->pBt->usableSize ){
6686     *pRC = SQLITE_CORRUPT_BKPT;
6687     return;
6688   }
6689   rc = freeSpace(pPage, pc, sz);
6690   if( rc ){
6691     *pRC = rc;
6692     return;
6693   }
6694   pPage->nCell--;
6695   if( pPage->nCell==0 ){
6696     memset(&data[hdr+1], 0, 4);
6697     data[hdr+7] = 0;
6698     put2byte(&data[hdr+5], pPage->pBt->usableSize);
6699     pPage->nFree = pPage->pBt->usableSize - pPage->hdrOffset
6700                        - pPage->childPtrSize - 8;
6701   }else{
6702     memmove(ptr, ptr+2, 2*(pPage->nCell - idx));
6703     put2byte(&data[hdr+3], pPage->nCell);
6704     pPage->nFree += 2;
6705   }
6706 }
6707 
6708 /*
6709 ** Insert a new cell on pPage at cell index "i".  pCell points to the
6710 ** content of the cell.
6711 **
6712 ** If the cell content will fit on the page, then put it there.  If it
6713 ** will not fit, then make a copy of the cell content into pTemp if
6714 ** pTemp is not null.  Regardless of pTemp, allocate a new entry
6715 ** in pPage->apOvfl[] and make it point to the cell content (either
6716 ** in pTemp or the original pCell) and also record its index.
6717 ** Allocating a new entry in pPage->aCell[] implies that
6718 ** pPage->nOverflow is incremented.
6719 **
6720 ** *pRC must be SQLITE_OK when this routine is called.
6721 */
insertCell(MemPage * pPage,int i,u8 * pCell,int sz,u8 * pTemp,Pgno iChild,int * pRC)6722 static void insertCell(
6723   MemPage *pPage,   /* Page into which we are copying */
6724   int i,            /* New cell becomes the i-th cell of the page */
6725   u8 *pCell,        /* Content of the new cell */
6726   int sz,           /* Bytes of content in pCell */
6727   u8 *pTemp,        /* Temp storage space for pCell, if needed */
6728   Pgno iChild,      /* If non-zero, replace first 4 bytes with this value */
6729   int *pRC          /* Read and write return code from here */
6730 ){
6731   int idx = 0;      /* Where to write new cell content in data[] */
6732   int j;            /* Loop counter */
6733   u8 *data;         /* The content of the whole page */
6734   u8 *pIns;         /* The point in pPage->aCellIdx[] where no cell inserted */
6735 
6736   assert( *pRC==SQLITE_OK );
6737   assert( i>=0 && i<=pPage->nCell+pPage->nOverflow );
6738   assert( MX_CELL(pPage->pBt)<=10921 );
6739   assert( pPage->nCell<=MX_CELL(pPage->pBt) || CORRUPT_DB );
6740   assert( pPage->nOverflow<=ArraySize(pPage->apOvfl) );
6741   assert( ArraySize(pPage->apOvfl)==ArraySize(pPage->aiOvfl) );
6742   assert( sqlite3_mutex_held(pPage->pBt->mutex) );
6743   assert( sz==pPage->xCellSize(pPage, pCell) || CORRUPT_DB );
6744   assert( pPage->nFree>=0 );
6745   if( pPage->nOverflow || sz+2>pPage->nFree ){
6746     if( pTemp ){
6747       memcpy(pTemp, pCell, sz);
6748       pCell = pTemp;
6749     }
6750     if( iChild ){
6751       put4byte(pCell, iChild);
6752     }
6753     j = pPage->nOverflow++;
6754     /* Comparison against ArraySize-1 since we hold back one extra slot
6755     ** as a contingency.  In other words, never need more than 3 overflow
6756     ** slots but 4 are allocated, just to be safe. */
6757     assert( j < ArraySize(pPage->apOvfl)-1 );
6758     pPage->apOvfl[j] = pCell;
6759     pPage->aiOvfl[j] = (u16)i;
6760 
6761     /* When multiple overflows occur, they are always sequential and in
6762     ** sorted order.  This invariants arise because multiple overflows can
6763     ** only occur when inserting divider cells into the parent page during
6764     ** balancing, and the dividers are adjacent and sorted.
6765     */
6766     assert( j==0 || pPage->aiOvfl[j-1]<(u16)i ); /* Overflows in sorted order */
6767     assert( j==0 || i==pPage->aiOvfl[j-1]+1 );   /* Overflows are sequential */
6768   }else{
6769     int rc = sqlite3PagerWrite(pPage->pDbPage);
6770     if( rc!=SQLITE_OK ){
6771       *pRC = rc;
6772       return;
6773     }
6774     assert( sqlite3PagerIswriteable(pPage->pDbPage) );
6775     data = pPage->aData;
6776     assert( &data[pPage->cellOffset]==pPage->aCellIdx );
6777     rc = allocateSpace(pPage, sz, &idx);
6778     if( rc ){ *pRC = rc; return; }
6779     /* The allocateSpace() routine guarantees the following properties
6780     ** if it returns successfully */
6781     assert( idx >= 0 );
6782     assert( idx >= pPage->cellOffset+2*pPage->nCell+2 || CORRUPT_DB );
6783     assert( idx+sz <= (int)pPage->pBt->usableSize );
6784     pPage->nFree -= (u16)(2 + sz);
6785     if( iChild ){
6786       /* In a corrupt database where an entry in the cell index section of
6787       ** a btree page has a value of 3 or less, the pCell value might point
6788       ** as many as 4 bytes in front of the start of the aData buffer for
6789       ** the source page.  Make sure this does not cause problems by not
6790       ** reading the first 4 bytes */
6791       memcpy(&data[idx+4], pCell+4, sz-4);
6792       put4byte(&data[idx], iChild);
6793     }else{
6794       memcpy(&data[idx], pCell, sz);
6795     }
6796     pIns = pPage->aCellIdx + i*2;
6797     memmove(pIns+2, pIns, 2*(pPage->nCell - i));
6798     put2byte(pIns, idx);
6799     pPage->nCell++;
6800     /* increment the cell count */
6801     if( (++data[pPage->hdrOffset+4])==0 ) data[pPage->hdrOffset+3]++;
6802     assert( get2byte(&data[pPage->hdrOffset+3])==pPage->nCell || CORRUPT_DB );
6803 #ifndef SQLITE_OMIT_AUTOVACUUM
6804     if( pPage->pBt->autoVacuum ){
6805       /* The cell may contain a pointer to an overflow page. If so, write
6806       ** the entry for the overflow page into the pointer map.
6807       */
6808       ptrmapPutOvflPtr(pPage, pPage, pCell, pRC);
6809     }
6810 #endif
6811   }
6812 }
6813 
/*
** The following parameters determine how many adjacent pages get involved
** in a balancing operation.  NN is the number of neighbors on either side
** of the page that participate in the balancing operation.  NB is the
** total number of pages that participate, including the target page and
** NN neighbors on either side.
**
** The minimum value of NN is 1 (of course).  Increasing NN above 1
** (to 2 or 3) gives a modest improvement in SELECT and DELETE performance
** in exchange for a larger degradation in INSERT and UPDATE performance.
** The current value of NN (1) appears to give the best results overall.
**
** (Later:) The description above makes it seem as if these values are
** tunable - as if you could change them and recompile and it would all work.
** But that is unlikely.  NB has been 3 since the inception of SQLite and
** we have never tested any other value.
*/
#define NN 1             /* Number of neighbors on either side of pPage */
#define NB 3             /* (NN*2+1): Total pages involved in the balance */
6833 
/*
** A CellArray object contains a cache of pointers and sizes for a
** consecutive sequence of cells that might be held on multiple pages.
**
** The cells in this array are the divider cell or cells from the pParent
** page plus up to three child pages.  There are a total of nCell cells.
**
** pRef is a pointer to one of the pages that contributes cells.  This is
** used to access information such as MemPage.intKey and MemPage.pBt->pageSize
** which should be common to all pages that contribute cells to this array.
**
** apCell[] and szCell[] hold, respectively, pointers to the start of each
** cell and the size of each cell.  Some of the apCell[] pointers might refer
** to overflow cells.  In other words, some apCell[] pointers might not point
** to the content area of the pages.
**
** A szCell[] of zero means the size of that cell has not yet been computed.
**
** The cells come from as many as four different pages:
**
**             -----------
**             | Parent  |
**             -----------
**            /     |     \
**           /      |      \
**  ---------   ---------   ---------
**  |Child-1|   |Child-2|   |Child-3|
**  ---------   ---------   ---------
**
** The order of cells in the array for an index btree is:
**
**       1.  All cells from Child-1 in order
**       2.  The first divider cell from Parent
**       3.  All cells from Child-2 in order
**       4.  The second divider cell from Parent
**       5.  All cells from Child-3 in order
**
** For a table-btree (with rowids) the items 2 and 4 are empty because
** content exists only in leaves and there are no divider cells.
**
** For an index btree, the apEnd[] array holds pointers to the end of the
** page for Child-1, the Parent, Child-2, the Parent (again), and Child-3,
** respectively. The ixNx[] array holds the number of cells contained in
** each of these 5 stages, and all stages to the left.  Hence:
**
**    ixNx[0] = Number of cells in Child-1.
**    ixNx[1] = Number of cells in Child-1 plus 1 for first divider.
**    ixNx[2] = Number of cells in Child-1 and Child-2 + 1 for 1st divider.
**    ixNx[3] = Number of cells in Child-1 and Child-2 + both divider cells
**    ixNx[4] = Total number of cells.
**
** For a table-btree, the concept is similar, except only apEnd[0]..apEnd[2]
** are used and they point to the leaf pages only, and the ixNx values are:
**
**    ixNx[0] = Number of cells in Child-1.
**    ixNx[1] = Number of cells in Child-1 and Child-2.
**    ixNx[2] = Total number of cells.
**
** Sometimes when deleting, a child page can have zero cells.  In those
** cases, ixNx[] entries with higher indexes, and the corresponding apEnd[]
** entries, shift down.  The end result is that each ixNx[] entry should
** be larger than the previous one.
*/
typedef struct CellArray CellArray;
struct CellArray {
  int nCell;              /* Number of cells in apCell[] */
  MemPage *pRef;          /* Reference page */
  u8 **apCell;            /* All cells being balanced */
  u16 *szCell;            /* Local size of all cells in apCell[] */
  u8 *apEnd[NB*2];        /* MemPage.aDataEnd values */
  int ixNx[NB*2];         /* Index at which we move to the next apEnd[] */
};
6906 
6907 /*
6908 ** Make sure the cell sizes at idx, idx+1, ..., idx+N-1 have been
6909 ** computed.
6910 */
populateCellCache(CellArray * p,int idx,int N)6911 static void populateCellCache(CellArray *p, int idx, int N){
6912   assert( idx>=0 && idx+N<=p->nCell );
6913   while( N>0 ){
6914     assert( p->apCell[idx]!=0 );
6915     if( p->szCell[idx]==0 ){
6916       p->szCell[idx] = p->pRef->xCellSize(p->pRef, p->apCell[idx]);
6917     }else{
6918       assert( CORRUPT_DB ||
6919               p->szCell[idx]==p->pRef->xCellSize(p->pRef, p->apCell[idx]) );
6920     }
6921     idx++;
6922     N--;
6923   }
6924 }
6925 
6926 /*
6927 ** Return the size of the Nth element of the cell array
6928 */
computeCellSize(CellArray * p,int N)6929 static SQLITE_NOINLINE u16 computeCellSize(CellArray *p, int N){
6930   assert( N>=0 && N<p->nCell );
6931   assert( p->szCell[N]==0 );
6932   p->szCell[N] = p->pRef->xCellSize(p->pRef, p->apCell[N]);
6933   return p->szCell[N];
6934 }
cachedCellSize(CellArray * p,int N)6935 static u16 cachedCellSize(CellArray *p, int N){
6936   assert( N>=0 && N<p->nCell );
6937   if( p->szCell[N] ) return p->szCell[N];
6938   return computeCellSize(p, N);
6939 }
6940 
6941 /*
6942 ** Array apCell[] contains pointers to nCell b-tree page cells. The
6943 ** szCell[] array contains the size in bytes of each cell. This function
6944 ** replaces the current contents of page pPg with the contents of the cell
6945 ** array.
6946 **
6947 ** Some of the cells in apCell[] may currently be stored in pPg. This
6948 ** function works around problems caused by this by making a copy of any
6949 ** such cells before overwriting the page data.
6950 **
6951 ** The MemPage.nFree field is invalidated by this function. It is the
6952 ** responsibility of the caller to set it correctly.
6953 */
rebuildPage(CellArray * pCArray,int iFirst,int nCell,MemPage * pPg)6954 static int rebuildPage(
6955   CellArray *pCArray,             /* Content to be added to page pPg */
6956   int iFirst,                     /* First cell in pCArray to use */
6957   int nCell,                      /* Final number of cells on page */
6958   MemPage *pPg                    /* The page to be reconstructed */
6959 ){
6960   const int hdr = pPg->hdrOffset;          /* Offset of header on pPg */
6961   u8 * const aData = pPg->aData;           /* Pointer to data for pPg */
6962   const int usableSize = pPg->pBt->usableSize;
6963   u8 * const pEnd = &aData[usableSize];
6964   int i = iFirst;                 /* Which cell to copy from pCArray*/
6965   u32 j;                          /* Start of cell content area */
6966   int iEnd = i+nCell;             /* Loop terminator */
6967   u8 *pCellptr = pPg->aCellIdx;
6968   u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager);
6969   u8 *pData;
6970   int k;                          /* Current slot in pCArray->apEnd[] */
6971   u8 *pSrcEnd;                    /* Current pCArray->apEnd[k] value */
6972 
6973   assert( i<iEnd );
6974   j = get2byte(&aData[hdr+5]);
6975   if( NEVER(j>(u32)usableSize) ){ j = 0; }
6976   memcpy(&pTmp[j], &aData[j], usableSize - j);
6977 
6978   for(k=0; pCArray->ixNx[k]<=i && ALWAYS(k<NB*2); k++){}
6979   pSrcEnd = pCArray->apEnd[k];
6980 
6981   pData = pEnd;
6982   while( 1/*exit by break*/ ){
6983     u8 *pCell = pCArray->apCell[i];
6984     u16 sz = pCArray->szCell[i];
6985     assert( sz>0 );
6986     if( SQLITE_WITHIN(pCell,aData,pEnd) ){
6987       if( ((uptr)(pCell+sz))>(uptr)pEnd ) return SQLITE_CORRUPT_BKPT;
6988       pCell = &pTmp[pCell - aData];
6989     }else if( (uptr)(pCell+sz)>(uptr)pSrcEnd
6990            && (uptr)(pCell)<(uptr)pSrcEnd
6991     ){
6992       return SQLITE_CORRUPT_BKPT;
6993     }
6994 
6995     pData -= sz;
6996     put2byte(pCellptr, (pData - aData));
6997     pCellptr += 2;
6998     if( pData < pCellptr ) return SQLITE_CORRUPT_BKPT;
6999     memcpy(pData, pCell, sz);
7000     assert( sz==pPg->xCellSize(pPg, pCell) || CORRUPT_DB );
7001     testcase( sz!=pPg->xCellSize(pPg,pCell) )
7002     i++;
7003     if( i>=iEnd ) break;
7004     if( pCArray->ixNx[k]<=i ){
7005       k++;
7006       pSrcEnd = pCArray->apEnd[k];
7007     }
7008   }
7009 
7010   /* The pPg->nFree field is now set incorrectly. The caller will fix it. */
7011   pPg->nCell = nCell;
7012   pPg->nOverflow = 0;
7013 
7014   put2byte(&aData[hdr+1], 0);
7015   put2byte(&aData[hdr+3], pPg->nCell);
7016   put2byte(&aData[hdr+5], pData - aData);
7017   aData[hdr+7] = 0x00;
7018   return SQLITE_OK;
7019 }
7020 
7021 /*
7022 ** The pCArray objects contains pointers to b-tree cells and the cell sizes.
7023 ** This function attempts to add the cells stored in the array to page pPg.
7024 ** If it cannot (because the page needs to be defragmented before the cells
7025 ** will fit), non-zero is returned. Otherwise, if the cells are added
7026 ** successfully, zero is returned.
7027 **
7028 ** Argument pCellptr points to the first entry in the cell-pointer array
7029 ** (part of page pPg) to populate. After cell apCell[0] is written to the
7030 ** page body, a 16-bit offset is written to pCellptr. And so on, for each
7031 ** cell in the array. It is the responsibility of the caller to ensure
7032 ** that it is safe to overwrite this part of the cell-pointer array.
7033 **
7034 ** When this function is called, *ppData points to the start of the
7035 ** content area on page pPg. If the size of the content area is extended,
7036 ** *ppData is updated to point to the new start of the content area
7037 ** before returning.
7038 **
7039 ** Finally, argument pBegin points to the byte immediately following the
7040 ** end of the space required by this page for the cell-pointer area (for
7041 ** all cells - not just those inserted by the current call). If the content
7042 ** area must be extended to before this point in order to accomodate all
7043 ** cells in apCell[], then the cells do not fit and non-zero is returned.
7044 */
static int pageInsertArray(
  MemPage *pPg,                   /* Page to add cells to */
  u8 *pBegin,                     /* End of cell-pointer array */
  u8 **ppData,                    /* IN/OUT: Page content-area pointer */
  u8 *pCellptr,                   /* Pointer to cell-pointer area */
  int iFirst,                     /* Index of first cell to add */
  int nCell,                      /* Number of cells to add to pPg */
  CellArray *pCArray              /* Array of cells */
){
  int i = iFirst;                 /* Loop counter - cell index to insert */
  u8 *aData = pPg->aData;         /* Complete page */
  u8 *pData = *ppData;            /* Content area.  A subset of aData[] */
  int iEnd = iFirst + nCell;      /* End of loop. One past last cell to ins */
  int k;                          /* Current slot in pCArray->apEnd[] */
  u8 *pEnd;                       /* Maximum extent of cell data */
  assert( CORRUPT_DB || pPg->hdrOffset==0 );    /* Never called on page 1 */
  if( iEnd<=iFirst ) return 0;    /* Nothing to insert */

  /* Find the first slot k for which ixNx[k] is greater than i.  apEnd[k]
  ** is then the upper limit for the content of cell i.  The range check on
  ** k is evaluated before ixNx[k] is read so that the check actually
  ** protects the array access. */
  for(k=0; ALWAYS(k<NB*2) && pCArray->ixNx[k]<=i; k++){}
  pEnd = pCArray->apEnd[k];

  while( 1 /*Exit by break*/ ){
    int sz, rc;
    u8 *pSlot;
    assert( pCArray->szCell[i]!=0 );
    sz = pCArray->szCell[i];

    /* Bytes 1 and 2 of the page header hold the offset of the first
    ** freeblock (see the file-format documentation).  If that offset is
    ** zero, or if pageFindSlot() finds no usable slot, carve the space
    ** from the gap between the cell-pointer array and the content area. */
    if( (aData[1]==0 && aData[2]==0) || (pSlot = pageFindSlot(pPg,sz,&rc))==0 ){
      if( (pData - pBegin)<sz ) return 1;   /* Does not fit */
      pData -= sz;
      pSlot = pData;
    }
    /* pSlot and pCArray->apCell[i] will never overlap on a well-formed
    ** database.  But they might for a corrupt database.  Hence use memmove()
    ** since memcpy() sends SIGABORT with overlapping buffers on OpenBSD */
    assert( (pSlot+sz)<=pCArray->apCell[i]
         || pSlot>=(pCArray->apCell[i]+sz)
         || CORRUPT_DB );

    /* A cell that begins before pEnd but extends past it straddles the
    ** end of its source buffer.  Report failure rather than copy it. */
    if( (uptr)(pCArray->apCell[i]+sz)>(uptr)pEnd
     && (uptr)(pCArray->apCell[i])<(uptr)pEnd
    ){
      assert( CORRUPT_DB );
      (void)SQLITE_CORRUPT_BKPT;
      return 1;
    }
    memmove(pSlot, pCArray->apCell[i], sz);

    /* Record the offset of the new cell in the cell-pointer array */
    put2byte(pCellptr, (pSlot - aData));
    pCellptr += 2;
    i++;
    if( i>=iEnd ) break;

    /* Advance to the next apEnd[] slot once i reaches the boundary
    ** recorded in ixNx[k]. */
    if( pCArray->ixNx[k]<=i ){
      k++;
      pEnd = pCArray->apEnd[k];
    }
  }
  *ppData = pData;
  return 0;
}
7100 
7101 /*
7102 ** The pCArray object contains pointers to b-tree cells and their sizes.
7103 **
7104 ** This function adds the space associated with each cell in the array
7105 ** that is currently stored within the body of pPg to the pPg free-list.
7106 ** The cell-pointers and other fields of the page are not updated.
7107 **
7108 ** This function returns the total number of cells added to the free-list.
7109 */
pageFreeArray(MemPage * pPg,int iFirst,int nCell,CellArray * pCArray)7110 static int pageFreeArray(
7111   MemPage *pPg,                   /* Page to edit */
7112   int iFirst,                     /* First cell to delete */
7113   int nCell,                      /* Cells to delete */
7114   CellArray *pCArray              /* Array of cells */
7115 ){
7116   u8 * const aData = pPg->aData;
7117   u8 * const pEnd = &aData[pPg->pBt->usableSize];
7118   u8 * const pStart = &aData[pPg->hdrOffset + 8 + pPg->childPtrSize];
7119   int nRet = 0;
7120   int i;
7121   int iEnd = iFirst + nCell;
7122   u8 *pFree = 0;
7123   int szFree = 0;
7124 
7125   for(i=iFirst; i<iEnd; i++){
7126     u8 *pCell = pCArray->apCell[i];
7127     if( SQLITE_WITHIN(pCell, pStart, pEnd) ){
7128       int sz;
7129       /* No need to use cachedCellSize() here.  The sizes of all cells that
7130       ** are to be freed have already been computing while deciding which
7131       ** cells need freeing */
7132       sz = pCArray->szCell[i];  assert( sz>0 );
7133       if( pFree!=(pCell + sz) ){
7134         if( pFree ){
7135           assert( pFree>aData && (pFree - aData)<65536 );
7136           freeSpace(pPg, (u16)(pFree - aData), szFree);
7137         }
7138         pFree = pCell;
7139         szFree = sz;
7140         if( pFree+sz>pEnd ) return 0;
7141       }else{
7142         pFree = pCell;
7143         szFree += sz;
7144       }
7145       nRet++;
7146     }
7147   }
7148   if( pFree ){
7149     assert( pFree>aData && (pFree - aData)<65536 );
7150     freeSpace(pPg, (u16)(pFree - aData), szFree);
7151   }
7152   return nRet;
7153 }
7154 
/*
** pCArray contains pointers to and sizes of all cells in the page being
** balanced.  The current page, pPg, has pPg->nCell cells starting with
** pCArray->apCell[iOld].  After balancing, this page should hold nNew cells
** starting at apCell[iNew].
**
** This routine makes the necessary adjustments to pPg so that it contains
** the correct cells after being balanced.  The edit is done in place, in
** this order:
**
**   (1) Cells that lie before iNew or at/after iNew+nNew are released
**       using pageFreeArray().
**   (2) Cells that belong ahead of the surviving cells are inserted using
**       pageInsertArray().
**   (3) Overflow cells that fall inside the new range are inserted.
**   (4) Any remaining cells are appended.
**
** If any insertion reports that the cells do not fit, or if the start of
** the content area is found to precede the end of the new cell-pointer
** array, the in-place edit is abandoned and the page is rebuilt from
** scratch using rebuildPage().
**
** Return SQLITE_OK if the in-place edit succeeds.  Return
** SQLITE_CORRUPT_BKPT if pageFreeArray() reports releasing more leading
** cells than the page holds.  On the fallback path the return value is
** whatever rebuildPage() returns.
**
** The pPg->nFree field is invalid when this function returns. It is the
** responsibility of the caller to set it correctly.
*/
static int editPage(
  MemPage *pPg,                   /* Edit this page */
  int iOld,                       /* Index of first cell currently on page */
  int iNew,                       /* Index of new first cell on page */
  int nNew,                       /* Final number of cells on page */
  CellArray *pCArray              /* Array of cells and sizes */
){
  u8 * const aData = pPg->aData;
  const int hdr = pPg->hdrOffset;
  /* pBegin is the first byte past the cell-pointer array once that array
  ** holds all nNew entries.  Cell content may not extend below it. */
  u8 *pBegin = &pPg->aCellIdx[nNew * 2];
  int nCell = pPg->nCell;       /* Cells stored on pPg */
  u8 *pData;                    /* Current start of the cell content area */
  u8 *pCellptr;                 /* Next cell-pointer slot to populate */
  int i;
  int iOldEnd = iOld + pPg->nCell + pPg->nOverflow;  /* One past last old cell */
  int iNewEnd = iNew + nNew;                         /* One past last new cell */

#ifdef SQLITE_DEBUG
  /* Keep a copy of the original page image so that the final content can
  ** be checked against it at the end of this routine. */
  u8 *pTmp = sqlite3PagerTempSpace(pPg->pBt->pPager);
  memcpy(pTmp, aData, pPg->pBt->usableSize);
#endif

  /* Remove cells from the start and end of the page */
  assert( nCell>=0 );
  if( iOld<iNew ){
    /* nShift cells were released from the front.  Slide the cell-pointer
    ** array down by that many entries. */
    int nShift = pageFreeArray(pPg, iOld, iNew-iOld, pCArray);
    if( NEVER(nShift>nCell) ) return SQLITE_CORRUPT_BKPT;
    memmove(pPg->aCellIdx, &pPg->aCellIdx[nShift*2], nCell*2);
    nCell -= nShift;
  }
  if( iNewEnd < iOldEnd ){
    /* Cells released from the tail need no pointer shuffling */
    int nTail = pageFreeArray(pPg, iNewEnd, iOldEnd - iNewEnd, pCArray);
    assert( nCell>=nTail );
    nCell -= nTail;
  }

  /* Read the offset to the cell content area from the page header.  If the
  ** content area already begins before the end of the new cell-pointer
  ** array, an in-place edit is not possible. */
  pData = &aData[get2byteNotZero(&aData[hdr+5])];
  if( pData<pBegin ) goto editpage_fail;

  /* Add cells to the start of the page */
  if( iNew<iOld ){
    int nAdd = MIN(nNew,iOld-iNew);
    assert( (iOld-iNew)<nNew || nCell==0 || CORRUPT_DB );
    assert( nAdd>=0 );
    pCellptr = pPg->aCellIdx;
    /* Open a gap of nAdd entries at the front of the cell-pointer array */
    memmove(&pCellptr[nAdd*2], pCellptr, nCell*2);
    if( pageInsertArray(
          pPg, pBegin, &pData, pCellptr,
          iNew, nAdd, pCArray
    ) ) goto editpage_fail;
    nCell += nAdd;
  }

  /* Add any overflow cells */
  for(i=0; i<pPg->nOverflow; i++){
    /* iCell is the position of this overflow cell relative to the new
    ** first cell of the page.  Overflow cells outside the new range are
    ** not inserted. */
    int iCell = (iOld + pPg->aiOvfl[i]) - iNew;
    if( iCell>=0 && iCell<nNew ){
      pCellptr = &pPg->aCellIdx[iCell * 2];
      if( nCell>iCell ){
        /* Make room for one entry at position iCell */
        memmove(&pCellptr[2], pCellptr, (nCell - iCell) * 2);
      }
      nCell++;
      cachedCellSize(pCArray, iCell+iNew);
      if( pageInsertArray(
            pPg, pBegin, &pData, pCellptr,
            iCell+iNew, 1, pCArray
      ) ) goto editpage_fail;
    }
  }

  /* Append cells to the end of the page */
  assert( nCell>=0 );
  pCellptr = &pPg->aCellIdx[nCell*2];
  if( pageInsertArray(
        pPg, pBegin, &pData, pCellptr,
        iNew+nCell, nNew-nCell, pCArray
  ) ) goto editpage_fail;

  pPg->nCell = nNew;
  pPg->nOverflow = 0;

  /* Store the new cell count and the new offset to the cell content area
  ** in the page header. */
  put2byte(&aData[hdr+3], pPg->nCell);
  put2byte(&aData[hdr+5], pData - aData);

#ifdef SQLITE_DEBUG
  /* Verify that each cell on the edited page matches the corresponding
  ** cell in pCArray.  A cell that pointed into this page is compared
  ** using the copy of the original page image saved in pTmp. */
  for(i=0; i<nNew && !CORRUPT_DB; i++){
    u8 *pCell = pCArray->apCell[i+iNew];
    int iOff = get2byteAligned(&pPg->aCellIdx[i*2]);
    if( SQLITE_WITHIN(pCell, aData, &aData[pPg->pBt->usableSize]) ){
      pCell = &pTmp[pCell - aData];
    }
    assert( 0==memcmp(pCell, &aData[iOff],
            pCArray->pRef->xCellSize(pCArray->pRef, pCArray->apCell[i+iNew])) );
  }
#endif

  return SQLITE_OK;
 editpage_fail:
  /* Unable to edit this page. Rebuild it from scratch instead. */
  populateCellCache(pCArray, iNew, nNew);
  return rebuildPage(pCArray, iNew, nNew, pPg);
}
7269 
7270 
#ifndef SQLITE_OMIT_QUICKBALANCE
/*
** This version of balance() handles the common special case where
** a new entry is being inserted on the extreme right-end of the
** tree, in other words, when the new entry will become the largest
** entry in the tree.
**
** Instead of trying to balance the 3 right-most leaf pages, just add
** a new page to the right-hand side and put the one new entry in
** that page.  This leaves the right side of the tree somewhat
** unbalanced.  But odds are that we will be inserting new entries
** at the end soon afterwards so the nearly empty page will quickly
** fill up.  On average.
**
** pPage is the leaf page which is the right-most page in the tree.
** pParent is its parent.  pPage must have a single overflow entry
** which is also the right-most entry on the page.
**
** The pSpace buffer is used to store a temporary copy of the divider
** cell that will be inserted into pParent. Such a cell consists of a 4
** byte page number followed by a variable length integer. In other
** words, at most 13 bytes. Hence the pSpace buffer must be at
** least 13 bytes in size.
**
** Return SQLITE_OK on success.  Return SQLITE_CORRUPT_BKPT if pPage holds
** no cells.  Otherwise return the error code reported by page allocation,
** by the pointer-map updates, or by the insert into pParent.
*/
static int balance_quick(MemPage *pParent, MemPage *pPage, u8 *pSpace){
  BtShared *const pBt = pPage->pBt;    /* B-Tree Database */
  MemPage *pNew;                       /* Newly allocated page */
  int rc;                              /* Return Code */
  Pgno pgnoNew;                        /* Page number of pNew */

  assert( sqlite3_mutex_held(pPage->pBt->mutex) );
  assert( sqlite3PagerIswriteable(pParent->pDbPage) );
  assert( pPage->nOverflow==1 );

  if( pPage->nCell==0 ) return SQLITE_CORRUPT_BKPT;  /* dbfuzz001.test */
  assert( pPage->nFree>=0 );
  assert( pParent->nFree>=0 );

  /* Allocate a new page. This page will become the right-sibling of
  ** pPage. The parent page must already be writable (see the assert()
  ** above) so that the new divider cell may be inserted. If the
  ** allocation is successful, proceed.
  */
  rc = allocateBtreePage(pBt, &pNew, &pgnoNew, 0, 0);

  if( rc==SQLITE_OK ){

    u8 *pOut = &pSpace[4];            /* Write the divider key after 4 bytes */
    u8 *pCell = pPage->apOvfl[0];     /* The single overflow cell of pPage */
    u16 szCell = pPage->xCellSize(pPage, pCell);
    u8 *pStop;                        /* Limit for the varint loops below */
    CellArray b;                      /* One-cell array passed to rebuildPage() */

    assert( sqlite3PagerIswriteable(pNew->pDbPage) );
    assert( CORRUPT_DB || pPage->aData[0]==(PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF) );
    zeroPage(pNew, PTF_INTKEY|PTF_LEAFDATA|PTF_LEAF);

    /* Describe the overflow cell as a CellArray holding exactly one cell,
    ** then build pNew so that it contains just that cell. */
    b.nCell = 1;
    b.pRef = pPage;
    b.apCell = &pCell;
    b.szCell = &szCell;
    b.apEnd[0] = pPage->aDataEnd;
    b.ixNx[0] = 2;
    rc = rebuildPage(&b, 0, 1, pNew);
    if( NEVER(rc) ){
      releasePage(pNew);
      return rc;
    }
    /* Free space on pNew: the usable size less the bytes before the
    ** cell-pointer array, one 2-byte cell pointer, and the cell itself. */
    pNew->nFree = pBt->usableSize - pNew->cellOffset - 2 - szCell;

    /* If this is an auto-vacuum database, update the pointer map
    ** with entries for the new page, and any pointer from the
    ** cell on the page to an overflow page. If either of these
    ** operations fails, the return code is set, but the contents
    ** of the parent page are still manipulated by the code below.
    ** That is Ok, at this point the parent page is guaranteed to
    ** be marked as dirty. Returning an error code will cause a
    ** rollback, undoing any changes made to the parent page.
    */
    if( ISAUTOVACUUM ){
      ptrmapPut(pBt, pgnoNew, PTRMAP_BTREE, pParent->pgno, &rc);
      if( szCell>pNew->minLocal ){
        ptrmapPutOvflPtr(pNew, pNew, pCell, &rc);
      }
    }

    /* Create a divider cell to insert into pParent. The divider cell
    ** consists of a 4-byte page number (the page number of pPage) and
    ** a variable length key value (which must be the same value as the
    ** largest key on pPage).
    **
    ** To find the largest key value on pPage, first find the right-most
    ** cell on pPage. The first two fields of this cell are the
    ** record-length (a variable length integer at most 32-bits in size)
    ** and the key value (a variable length integer, may have any value).
    ** The first of the while(...) loops below skips over the record-length
    ** field. The second while(...) loop copies the key value from the
    ** cell on pPage into the pSpace buffer.
    **
    ** Each loop is bounded by pStop so that no more than 9 bytes are
    ** consumed, even if every byte has its high bit set.
    */
    pCell = findCell(pPage, pPage->nCell-1);
    pStop = &pCell[9];
    while( (*(pCell++)&0x80) && pCell<pStop );
    pStop = &pCell[9];
    while( ((*(pOut++) = *(pCell++))&0x80) && pCell<pStop );

    /* Insert the new divider cell into pParent. */
    if( rc==SQLITE_OK ){
      insertCell(pParent, pParent->nCell, pSpace, (int)(pOut-pSpace),
                   0, pPage->pgno, &rc);
    }

    /* Set the right-child pointer of pParent to point to the new page. */
    put4byte(&pParent->aData[pParent->hdrOffset+8], pgnoNew);

    /* Release the reference to the new page. */
    releasePage(pNew);
  }

  return rc;
}
#endif /* SQLITE_OMIT_QUICKBALANCE */
7390 
#if 0
/*
** This function is normally compiled out and plays no part in the
** operation of SQLite.  It is enabled by hand from time to time while
** debugging the code that maintains pointer-map entries.
**
** For each of the nPage pages in apPage[], use assert() to check that the
** pointer-map entry for every overflow chain and every child page
** referenced by the page names that page as the parent and carries the
** expected entry type.  Always returns 1.
*/
static int ptrmapCheckPages(MemPage **apPage, int nPage){
  int iPg;                      /* Index into apPage[] */
  for(iPg=0; iPg<nPage; iPg++){
    MemPage *pPage = apPage[iPg];
    BtShared *pBt = pPage->pBt;
    Pgno pgnoParent;            /* Parent page read from the pointer map */
    u8 eType;                   /* Entry type read from the pointer map */
    int iCell;                  /* Index of the cell being checked */
    assert( pPage->isInit );

    for(iCell=0; iCell<pPage->nCell; iCell++){
      CellInfo info;
      u8 *pCell = findCell(pPage, iCell);

      pPage->xParseCell(pPage, pCell, &info);
      if( info.nLocal<info.nPayload ){
        /* The last 4 bytes of the cell hold the first overflow page */
        ptrmapGet(pBt, get4byte(&pCell[info.nSize-4]), &eType, &pgnoParent);
        assert( pgnoParent==pPage->pgno && eType==PTRMAP_OVERFLOW1 );
      }
      if( !pPage->leaf ){
        /* The first 4 bytes of the cell hold the child page number */
        ptrmapGet(pBt, get4byte(pCell), &eType, &pgnoParent);
        assert( pgnoParent==pPage->pgno && eType==PTRMAP_BTREE );
      }
    }
    if( !pPage->leaf ){
      /* Also check the right-child pointer stored in the page header */
      ptrmapGet(pBt, get4byte(&pPage->aData[pPage->hdrOffset+8]),
                &eType, &pgnoParent);
      assert( pgnoParent==pPage->pgno && eType==PTRMAP_BTREE );
    }
  }
  return 1;
}
#endif
7432 
7433 /*
7434 ** This function is used to copy the contents of the b-tree node stored
7435 ** on page pFrom to page pTo. If page pFrom was not a leaf page, then
7436 ** the pointer-map entries for each child page are updated so that the
7437 ** parent page stored in the pointer map is page pTo. If pFrom contained
7438 ** any cells with overflow page pointers, then the corresponding pointer
7439 ** map entries are also updated so that the parent page is page pTo.
7440 **
7441 ** If pFrom is currently carrying any overflow cells (entries in the
7442 ** MemPage.apOvfl[] array), they are not copied to pTo.
7443 **
7444 ** Before returning, page pTo is reinitialized using btreeInitPage().
7445 **
7446 ** The performance of this function is not critical. It is only used by
7447 ** the balance_shallower() and balance_deeper() procedures, neither of
7448 ** which are called often under normal circumstances.
7449 */
copyNodeContent(MemPage * pFrom,MemPage * pTo,int * pRC)7450 static void copyNodeContent(MemPage *pFrom, MemPage *pTo, int *pRC){
7451   if( (*pRC)==SQLITE_OK ){
7452     BtShared * const pBt = pFrom->pBt;
7453     u8 * const aFrom = pFrom->aData;
7454     u8 * const aTo = pTo->aData;
7455     int const iFromHdr = pFrom->hdrOffset;
7456     int const iToHdr = ((pTo->pgno==1) ? 100 : 0);
7457     int rc;
7458     int iData;
7459 
7460 
7461     assert( pFrom->isInit );
7462     assert( pFrom->nFree>=iToHdr );
7463     assert( get2byte(&aFrom[iFromHdr+5]) <= (int)pBt->usableSize );
7464 
7465     /* Copy the b-tree node content from page pFrom to page pTo. */
7466     iData = get2byte(&aFrom[iFromHdr+5]);
7467     memcpy(&aTo[iData], &aFrom[iData], pBt->usableSize-iData);
7468     memcpy(&aTo[iToHdr], &aFrom[iFromHdr], pFrom->cellOffset + 2*pFrom->nCell);
7469 
7470     /* Reinitialize page pTo so that the contents of the MemPage structure
7471     ** match the new data. The initialization of pTo can actually fail under
7472     ** fairly obscure circumstances, even though it is a copy of initialized
7473     ** page pFrom.
7474     */
7475     pTo->isInit = 0;
7476     rc = btreeInitPage(pTo);
7477     if( rc==SQLITE_OK ) rc = btreeComputeFreeSpace(pTo);
7478     if( rc!=SQLITE_OK ){
7479       *pRC = rc;
7480       return;
7481     }
7482 
7483     /* If this is an auto-vacuum database, update the pointer-map entries
7484     ** for any b-tree or overflow pages that pTo now contains the pointers to.
7485     */
7486     if( ISAUTOVACUUM ){
7487       *pRC = setChildPtrmaps(pTo);
7488     }
7489   }
7490 }
7491 
7492 /*
7493 ** This routine redistributes cells on the iParentIdx'th child of pParent
7494 ** (hereafter "the page") and up to 2 siblings so that all pages have about the
7495 ** same amount of free space. Usually a single sibling on either side of the
** page is used in the balancing, though both siblings might come from one
7497 ** side if the page is the first or last child of its parent. If the page
7498 ** has fewer than 2 siblings (something which can only happen if the page
7499 ** is a root page or a child of a root page) then all available siblings
7500 ** participate in the balancing.
7501 **
7502 ** The number of siblings of the page might be increased or decreased by
7503 ** one or two in an effort to keep pages nearly full but not over full.
7504 **
7505 ** Note that when this routine is called, some of the cells on the page
7506 ** might not actually be stored in MemPage.aData[]. This can happen
7507 ** if the page is overfull. This routine ensures that all cells allocated
7508 ** to the page and its siblings fit into MemPage.aData[] before returning.
7509 **
7510 ** In the course of balancing the page and its siblings, cells may be
7511 ** inserted into or removed from the parent page (pParent). Doing so
7512 ** may cause the parent page to become overfull or underfull. If this
7513 ** happens, it is the responsibility of the caller to invoke the correct
7514 ** balancing routine to fix this problem (see the balance() routine).
7515 **
7516 ** If this routine fails for any reason, it might leave the database
7517 ** in a corrupted state. So if this routine fails, the database should
7518 ** be rolled back.
7519 **
7520 ** The third argument to this function, aOvflSpace, is a pointer to a
7521 ** buffer big enough to hold one page. If while inserting cells into the parent
7522 ** page (pParent) the parent page becomes overfull, this buffer is
7523 ** used to store the parent's overflow cells. Because this function inserts
7524 ** a maximum of four divider cells into the parent page, and the maximum
7525 ** size of a cell stored within an internal node is always less than 1/4
7526 ** of the page-size, the aOvflSpace[] buffer is guaranteed to be large
7527 ** enough for all overflow cells.
7528 **
7529 ** If aOvflSpace is set to a null pointer, this function returns
7530 ** SQLITE_NOMEM.
7531 */
balance_nonroot(MemPage * pParent,int iParentIdx,u8 * aOvflSpace,int isRoot,int bBulk)7532 static int balance_nonroot(
7533   MemPage *pParent,               /* Parent page of siblings being balanced */
7534   int iParentIdx,                 /* Index of "the page" in pParent */
7535   u8 *aOvflSpace,                 /* page-size bytes of space for parent ovfl */
7536   int isRoot,                     /* True if pParent is a root-page */
7537   int bBulk                       /* True if this call is part of a bulk load */
7538 ){
7539   BtShared *pBt;               /* The whole database */
7540   int nMaxCells = 0;           /* Allocated size of apCell, szCell, aFrom. */
7541   int nNew = 0;                /* Number of pages in apNew[] */
7542   int nOld;                    /* Number of pages in apOld[] */
7543   int i, j, k;                 /* Loop counters */
7544   int nxDiv;                   /* Next divider slot in pParent->aCell[] */
7545   int rc = SQLITE_OK;          /* The return code */
7546   u16 leafCorrection;          /* 4 if pPage is a leaf.  0 if not */
7547   int leafData;                /* True if pPage is a leaf of a LEAFDATA tree */
7548   int usableSpace;             /* Bytes in pPage beyond the header */
7549   int pageFlags;               /* Value of pPage->aData[0] */
7550   int iSpace1 = 0;             /* First unused byte of aSpace1[] */
7551   int iOvflSpace = 0;          /* First unused byte of aOvflSpace[] */
7552   int szScratch;               /* Size of scratch memory requested */
7553   MemPage *apOld[NB];          /* pPage and up to two siblings */
7554   MemPage *apNew[NB+2];        /* pPage and up to NB siblings after balancing */
7555   u8 *pRight;                  /* Location in parent of right-sibling pointer */
7556   u8 *apDiv[NB-1];             /* Divider cells in pParent */
7557   int cntNew[NB+2];            /* Index in b.paCell[] of cell after i-th page */
7558   int cntOld[NB+2];            /* Old index in b.apCell[] */
7559   int szNew[NB+2];             /* Combined size of cells placed on i-th page */
7560   u8 *aSpace1;                 /* Space for copies of dividers cells */
7561   Pgno pgno;                   /* Temp var to store a page number in */
7562   u8 abDone[NB+2];             /* True after i'th new page is populated */
7563   Pgno aPgno[NB+2];            /* Page numbers of new pages before shuffling */
7564   Pgno aPgOrder[NB+2];         /* Copy of aPgno[] used for sorting pages */
7565   u16 aPgFlags[NB+2];          /* flags field of new pages before shuffling */
7566   CellArray b;                  /* Parsed information on cells being balanced */
7567 
7568   memset(abDone, 0, sizeof(abDone));
7569   b.nCell = 0;
7570   b.apCell = 0;
7571   pBt = pParent->pBt;
7572   assert( sqlite3_mutex_held(pBt->mutex) );
7573   assert( sqlite3PagerIswriteable(pParent->pDbPage) );
7574 
7575   /* At this point pParent may have at most one overflow cell. And if
7576   ** this overflow cell is present, it must be the cell with
7577   ** index iParentIdx. This scenario comes about when this function
7578   ** is called (indirectly) from sqlite3BtreeDelete().
7579   */
7580   assert( pParent->nOverflow==0 || pParent->nOverflow==1 );
7581   assert( pParent->nOverflow==0 || pParent->aiOvfl[0]==iParentIdx );
7582 
7583   if( !aOvflSpace ){
7584     return SQLITE_NOMEM_BKPT;
7585   }
7586   assert( pParent->nFree>=0 );
7587 
7588   /* Find the sibling pages to balance. Also locate the cells in pParent
7589   ** that divide the siblings. An attempt is made to find NN siblings on
7590   ** either side of pPage. More siblings are taken from one side, however,
7591   ** if there are fewer than NN siblings on the other side. If pParent
7592   ** has NB or fewer children then all children of pParent are taken.
7593   **
7594   ** This loop also drops the divider cells from the parent page. This
7595   ** way, the remainder of the function does not have to deal with any
7596   ** overflow cells in the parent page, since if any existed they will
7597   ** have already been removed.
7598   */
7599   i = pParent->nOverflow + pParent->nCell;
7600   if( i<2 ){
7601     nxDiv = 0;
7602   }else{
7603     assert( bBulk==0 || bBulk==1 );
7604     if( iParentIdx==0 ){
7605       nxDiv = 0;
7606     }else if( iParentIdx==i ){
7607       nxDiv = i-2+bBulk;
7608     }else{
7609       nxDiv = iParentIdx-1;
7610     }
7611     i = 2-bBulk;
7612   }
7613   nOld = i+1;
7614   if( (i+nxDiv-pParent->nOverflow)==pParent->nCell ){
7615     pRight = &pParent->aData[pParent->hdrOffset+8];
7616   }else{
7617     pRight = findCell(pParent, i+nxDiv-pParent->nOverflow);
7618   }
7619   pgno = get4byte(pRight);
7620   while( 1 ){
7621     rc = getAndInitPage(pBt, pgno, &apOld[i], 0, 0);
7622     if( rc ){
7623       memset(apOld, 0, (i+1)*sizeof(MemPage*));
7624       goto balance_cleanup;
7625     }
7626     if( apOld[i]->nFree<0 ){
7627       rc = btreeComputeFreeSpace(apOld[i]);
7628       if( rc ){
7629         memset(apOld, 0, (i)*sizeof(MemPage*));
7630         goto balance_cleanup;
7631       }
7632     }
7633     if( (i--)==0 ) break;
7634 
7635     if( pParent->nOverflow && i+nxDiv==pParent->aiOvfl[0] ){
7636       apDiv[i] = pParent->apOvfl[0];
7637       pgno = get4byte(apDiv[i]);
7638       szNew[i] = pParent->xCellSize(pParent, apDiv[i]);
7639       pParent->nOverflow = 0;
7640     }else{
7641       apDiv[i] = findCell(pParent, i+nxDiv-pParent->nOverflow);
7642       pgno = get4byte(apDiv[i]);
7643       szNew[i] = pParent->xCellSize(pParent, apDiv[i]);
7644 
7645       /* Drop the cell from the parent page. apDiv[i] still points to
7646       ** the cell within the parent, even though it has been dropped.
7647       ** This is safe because dropping a cell only overwrites the first
7648       ** four bytes of it, and this function does not need the first
7649       ** four bytes of the divider cell. So the pointer is safe to use
7650       ** later on.
7651       **
7652       ** But not if we are in secure-delete mode. In secure-delete mode,
7653       ** the dropCell() routine will overwrite the entire cell with zeroes.
7654       ** In this case, temporarily copy the cell into the aOvflSpace[]
7655       ** buffer. It will be copied out again as soon as the aSpace[] buffer
7656       ** is allocated.  */
7657       if( pBt->btsFlags & BTS_FAST_SECURE ){
7658         int iOff;
7659 
7660         iOff = SQLITE_PTR_TO_INT(apDiv[i]) - SQLITE_PTR_TO_INT(pParent->aData);
7661         if( (iOff+szNew[i])>(int)pBt->usableSize ){
7662           rc = SQLITE_CORRUPT_BKPT;
7663           memset(apOld, 0, (i+1)*sizeof(MemPage*));
7664           goto balance_cleanup;
7665         }else{
7666           memcpy(&aOvflSpace[iOff], apDiv[i], szNew[i]);
7667           apDiv[i] = &aOvflSpace[apDiv[i]-pParent->aData];
7668         }
7669       }
7670       dropCell(pParent, i+nxDiv-pParent->nOverflow, szNew[i], &rc);
7671     }
7672   }
7673 
7674   /* Make nMaxCells a multiple of 4 in order to preserve 8-byte
7675   ** alignment */
7676   nMaxCells = nOld*(MX_CELL(pBt) + ArraySize(pParent->apOvfl));
7677   nMaxCells = (nMaxCells + 3)&~3;
7678 
7679   /*
7680   ** Allocate space for memory structures
7681   */
7682   szScratch =
7683        nMaxCells*sizeof(u8*)                       /* b.apCell */
7684      + nMaxCells*sizeof(u16)                       /* b.szCell */
7685      + pBt->pageSize;                              /* aSpace1 */
7686 
7687   assert( szScratch<=7*(int)pBt->pageSize );
7688   b.apCell = sqlite3StackAllocRaw(0, szScratch );
7689   if( b.apCell==0 ){
7690     rc = SQLITE_NOMEM_BKPT;
7691     goto balance_cleanup;
7692   }
7693   b.szCell = (u16*)&b.apCell[nMaxCells];
7694   aSpace1 = (u8*)&b.szCell[nMaxCells];
7695   assert( EIGHT_BYTE_ALIGNMENT(aSpace1) );
7696 
7697   /*
7698   ** Load pointers to all cells on sibling pages and the divider cells
7699   ** into the local b.apCell[] array.  Make copies of the divider cells
7700   ** into space obtained from aSpace1[]. The divider cells have already
7701   ** been removed from pParent.
7702   **
7703   ** If the siblings are on leaf pages, then the child pointers of the
7704   ** divider cells are stripped from the cells before they are copied
7705   ** into aSpace1[].  In this way, all cells in b.apCell[] are without
7706   ** child pointers.  If siblings are not leaves, then all cell in
7707   ** b.apCell[] include child pointers.  Either way, all cells in b.apCell[]
7708   ** are alike.
7709   **
7710   ** leafCorrection:  4 if pPage is a leaf.  0 if pPage is not a leaf.
7711   **       leafData:  1 if pPage holds key+data and pParent holds only keys.
7712   */
7713   b.pRef = apOld[0];
7714   leafCorrection = b.pRef->leaf*4;
7715   leafData = b.pRef->intKeyLeaf;
7716   for(i=0; i<nOld; i++){
7717     MemPage *pOld = apOld[i];
7718     int limit = pOld->nCell;
7719     u8 *aData = pOld->aData;
7720     u16 maskPage = pOld->maskPage;
7721     u8 *piCell = aData + pOld->cellOffset;
7722     u8 *piEnd;
7723     VVA_ONLY( int nCellAtStart = b.nCell; )
7724 
7725     /* Verify that all sibling pages are of the same "type" (table-leaf,
7726     ** table-interior, index-leaf, or index-interior).
7727     */
7728     if( pOld->aData[0]!=apOld[0]->aData[0] ){
7729       rc = SQLITE_CORRUPT_BKPT;
7730       goto balance_cleanup;
7731     }
7732 
7733     /* Load b.apCell[] with pointers to all cells in pOld.  If pOld
7734     ** contains overflow cells, include them in the b.apCell[] array
7735     ** in the correct spot.
7736     **
7737     ** Note that when there are multiple overflow cells, it is always the
7738     ** case that they are sequential and adjacent.  This invariant arises
7739     ** because multiple overflows can only occurs when inserting divider
7740     ** cells into a parent on a prior balance, and divider cells are always
7741     ** adjacent and are inserted in order.  There is an assert() tagged
7742     ** with "NOTE 1" in the overflow cell insertion loop to prove this
7743     ** invariant.
7744     **
7745     ** This must be done in advance.  Once the balance starts, the cell
7746     ** offset section of the btree page will be overwritten and we will no
7747     ** long be able to find the cells if a pointer to each cell is not saved
7748     ** first.
7749     */
7750     memset(&b.szCell[b.nCell], 0, sizeof(b.szCell[0])*(limit+pOld->nOverflow));
7751     if( pOld->nOverflow>0 ){
7752       if( NEVER(limit<pOld->aiOvfl[0]) ){
7753         rc = SQLITE_CORRUPT_BKPT;
7754         goto balance_cleanup;
7755       }
7756       limit = pOld->aiOvfl[0];
7757       for(j=0; j<limit; j++){
7758         b.apCell[b.nCell] = aData + (maskPage & get2byteAligned(piCell));
7759         piCell += 2;
7760         b.nCell++;
7761       }
7762       for(k=0; k<pOld->nOverflow; k++){
7763         assert( k==0 || pOld->aiOvfl[k-1]+1==pOld->aiOvfl[k] );/* NOTE 1 */
7764         b.apCell[b.nCell] = pOld->apOvfl[k];
7765         b.nCell++;
7766       }
7767     }
7768     piEnd = aData + pOld->cellOffset + 2*pOld->nCell;
7769     while( piCell<piEnd ){
7770       assert( b.nCell<nMaxCells );
7771       b.apCell[b.nCell] = aData + (maskPage & get2byteAligned(piCell));
7772       piCell += 2;
7773       b.nCell++;
7774     }
7775     assert( (b.nCell-nCellAtStart)==(pOld->nCell+pOld->nOverflow) );
7776 
7777     cntOld[i] = b.nCell;
7778     if( i<nOld-1 && !leafData){
7779       u16 sz = (u16)szNew[i];
7780       u8 *pTemp;
7781       assert( b.nCell<nMaxCells );
7782       b.szCell[b.nCell] = sz;
7783       pTemp = &aSpace1[iSpace1];
7784       iSpace1 += sz;
7785       assert( sz<=pBt->maxLocal+23 );
7786       assert( iSpace1 <= (int)pBt->pageSize );
7787       memcpy(pTemp, apDiv[i], sz);
7788       b.apCell[b.nCell] = pTemp+leafCorrection;
7789       assert( leafCorrection==0 || leafCorrection==4 );
7790       b.szCell[b.nCell] = b.szCell[b.nCell] - leafCorrection;
7791       if( !pOld->leaf ){
7792         assert( leafCorrection==0 );
7793         assert( pOld->hdrOffset==0 );
7794         /* The right pointer of the child page pOld becomes the left
7795         ** pointer of the divider cell */
7796         memcpy(b.apCell[b.nCell], &pOld->aData[8], 4);
7797       }else{
7798         assert( leafCorrection==4 );
7799         while( b.szCell[b.nCell]<4 ){
7800           /* Do not allow any cells smaller than 4 bytes. If a smaller cell
7801           ** does exist, pad it with 0x00 bytes. */
7802           assert( b.szCell[b.nCell]==3 || CORRUPT_DB );
7803           assert( b.apCell[b.nCell]==&aSpace1[iSpace1-3] || CORRUPT_DB );
7804           aSpace1[iSpace1++] = 0x00;
7805           b.szCell[b.nCell]++;
7806         }
7807       }
7808       b.nCell++;
7809     }
7810   }
7811 
7812   /*
7813   ** Figure out the number of pages needed to hold all b.nCell cells.
7814   ** Store this number in "k".  Also compute szNew[] which is the total
7815   ** size of all cells on the i-th page and cntNew[] which is the index
7816   ** in b.apCell[] of the cell that divides page i from page i+1.
7817   ** cntNew[k] should equal b.nCell.
7818   **
7819   ** Values computed by this block:
7820   **
7821   **           k: The total number of sibling pages
  **    szNew[i]: Space used on the i-th sibling page.
7823   **   cntNew[i]: Index in b.apCell[] and b.szCell[] for the first cell to
7824   **              the right of the i-th sibling page.
7825   ** usableSpace: Number of bytes of space available on each sibling.
7826   **
7827   */
7828   usableSpace = pBt->usableSize - 12 + leafCorrection;
7829   for(i=k=0; i<nOld; i++, k++){
7830     MemPage *p = apOld[i];
7831     b.apEnd[k] = p->aDataEnd;
7832     b.ixNx[k] = cntOld[i];
7833     if( k && b.ixNx[k]==b.ixNx[k-1] ){
7834       k--;  /* Omit b.ixNx[] entry for child pages with no cells */
7835     }
7836     if( !leafData ){
7837       k++;
7838       b.apEnd[k] = pParent->aDataEnd;
7839       b.ixNx[k] = cntOld[i]+1;
7840     }
7841     assert( p->nFree>=0 );
7842     szNew[i] = usableSpace - p->nFree;
7843     for(j=0; j<p->nOverflow; j++){
7844       szNew[i] += 2 + p->xCellSize(p, p->apOvfl[j]);
7845     }
7846     cntNew[i] = cntOld[i];
7847   }
7848   k = nOld;
7849   for(i=0; i<k; i++){
7850     int sz;
7851     while( szNew[i]>usableSpace ){
7852       if( i+1>=k ){
7853         k = i+2;
7854         if( k>NB+2 ){ rc = SQLITE_CORRUPT_BKPT; goto balance_cleanup; }
7855         szNew[k-1] = 0;
7856         cntNew[k-1] = b.nCell;
7857       }
7858       sz = 2 + cachedCellSize(&b, cntNew[i]-1);
7859       szNew[i] -= sz;
7860       if( !leafData ){
7861         if( cntNew[i]<b.nCell ){
7862           sz = 2 + cachedCellSize(&b, cntNew[i]);
7863         }else{
7864           sz = 0;
7865         }
7866       }
7867       szNew[i+1] += sz;
7868       cntNew[i]--;
7869     }
7870     while( cntNew[i]<b.nCell ){
7871       sz = 2 + cachedCellSize(&b, cntNew[i]);
7872       if( szNew[i]+sz>usableSpace ) break;
7873       szNew[i] += sz;
7874       cntNew[i]++;
7875       if( !leafData ){
7876         if( cntNew[i]<b.nCell ){
7877           sz = 2 + cachedCellSize(&b, cntNew[i]);
7878         }else{
7879           sz = 0;
7880         }
7881       }
7882       szNew[i+1] -= sz;
7883     }
7884     if( cntNew[i]>=b.nCell ){
7885       k = i+1;
7886     }else if( cntNew[i] <= (i>0 ? cntNew[i-1] : 0) ){
7887       rc = SQLITE_CORRUPT_BKPT;
7888       goto balance_cleanup;
7889     }
7890   }
7891 
7892   /*
7893   ** The packing computed by the previous block is biased toward the siblings
7894   ** on the left side (siblings with smaller keys). The left siblings are
7895   ** always nearly full, while the right-most sibling might be nearly empty.
7896   ** The next block of code attempts to adjust the packing of siblings to
7897   ** get a better balance.
7898   **
7899   ** This adjustment is more than an optimization.  The packing above might
7900   ** be so out of balance as to be illegal.  For example, the right-most
7901   ** sibling might be completely empty.  This adjustment is not optional.
7902   */
7903   for(i=k-1; i>0; i--){
7904     int szRight = szNew[i];  /* Size of sibling on the right */
7905     int szLeft = szNew[i-1]; /* Size of sibling on the left */
7906     int r;              /* Index of right-most cell in left sibling */
7907     int d;              /* Index of first cell to the left of right sibling */
7908 
7909     r = cntNew[i-1] - 1;
7910     d = r + 1 - leafData;
7911     (void)cachedCellSize(&b, d);
7912     do{
7913       assert( d<nMaxCells );
7914       assert( r<nMaxCells );
7915       (void)cachedCellSize(&b, r);
7916       if( szRight!=0
7917        && (bBulk || szRight+b.szCell[d]+2 > szLeft-(b.szCell[r]+(i==k-1?0:2)))){
7918         break;
7919       }
7920       szRight += b.szCell[d] + 2;
7921       szLeft -= b.szCell[r] + 2;
7922       cntNew[i-1] = r;
7923       r--;
7924       d--;
7925     }while( r>=0 );
7926     szNew[i] = szRight;
7927     szNew[i-1] = szLeft;
7928     if( cntNew[i-1] <= (i>1 ? cntNew[i-2] : 0) ){
7929       rc = SQLITE_CORRUPT_BKPT;
7930       goto balance_cleanup;
7931     }
7932   }
7933 
  /* Sanity check:  For a non-corrupt database file one of the following
7935   ** must be true:
7936   **    (1) We found one or more cells (cntNew[0])>0), or
7937   **    (2) pPage is a virtual root page.  A virtual root page is when
7938   **        the real root page is page 1 and we are the only child of
7939   **        that page.
7940   */
7941   assert( cntNew[0]>0 || (pParent->pgno==1 && pParent->nCell==0) || CORRUPT_DB);
7942   TRACE(("BALANCE: old: %d(nc=%d) %d(nc=%d) %d(nc=%d)\n",
7943     apOld[0]->pgno, apOld[0]->nCell,
7944     nOld>=2 ? apOld[1]->pgno : 0, nOld>=2 ? apOld[1]->nCell : 0,
7945     nOld>=3 ? apOld[2]->pgno : 0, nOld>=3 ? apOld[2]->nCell : 0
7946   ));
7947 
7948   /*
7949   ** Allocate k new pages.  Reuse old pages where possible.
7950   */
7951   pageFlags = apOld[0]->aData[0];
7952   for(i=0; i<k; i++){
7953     MemPage *pNew;
7954     if( i<nOld ){
7955       pNew = apNew[i] = apOld[i];
7956       apOld[i] = 0;
7957       rc = sqlite3PagerWrite(pNew->pDbPage);
7958       nNew++;
7959       if( rc ) goto balance_cleanup;
7960     }else{
7961       assert( i>0 );
7962       rc = allocateBtreePage(pBt, &pNew, &pgno, (bBulk ? 1 : pgno), 0);
7963       if( rc ) goto balance_cleanup;
7964       zeroPage(pNew, pageFlags);
7965       apNew[i] = pNew;
7966       nNew++;
7967       cntOld[i] = b.nCell;
7968 
7969       /* Set the pointer-map entry for the new sibling page. */
7970       if( ISAUTOVACUUM ){
7971         ptrmapPut(pBt, pNew->pgno, PTRMAP_BTREE, pParent->pgno, &rc);
7972         if( rc!=SQLITE_OK ){
7973           goto balance_cleanup;
7974         }
7975       }
7976     }
7977   }
7978 
7979   /*
7980   ** Reassign page numbers so that the new pages are in ascending order.
7981   ** This helps to keep entries in the disk file in order so that a scan
7982   ** of the table is closer to a linear scan through the file. That in turn
7983   ** helps the operating system to deliver pages from the disk more rapidly.
7984   **
7985   ** An O(n^2) insertion sort algorithm is used, but since n is never more
7986   ** than (NB+2) (a small constant), that should not be a problem.
7987   **
7988   ** When NB==3, this one optimization makes the database about 25% faster
7989   ** for large insertions and deletions.
7990   */
7991   for(i=0; i<nNew; i++){
7992     aPgOrder[i] = aPgno[i] = apNew[i]->pgno;
7993     aPgFlags[i] = apNew[i]->pDbPage->flags;
7994     for(j=0; j<i; j++){
7995       if( aPgno[j]==aPgno[i] ){
7996         /* This branch is taken if the set of sibling pages somehow contains
7997         ** duplicate entries. This can happen if the database is corrupt.
7998         ** It would be simpler to detect this as part of the loop below, but
7999         ** we do the detection here in order to avoid populating the pager
8000         ** cache with two separate objects associated with the same
8001         ** page number.  */
8002         assert( CORRUPT_DB );
8003         rc = SQLITE_CORRUPT_BKPT;
8004         goto balance_cleanup;
8005       }
8006     }
8007   }
8008   for(i=0; i<nNew; i++){
8009     int iBest = 0;                /* aPgno[] index of page number to use */
8010     for(j=1; j<nNew; j++){
8011       if( aPgOrder[j]<aPgOrder[iBest] ) iBest = j;
8012     }
8013     pgno = aPgOrder[iBest];
8014     aPgOrder[iBest] = 0xffffffff;
8015     if( iBest!=i ){
8016       if( iBest>i ){
8017         sqlite3PagerRekey(apNew[iBest]->pDbPage, pBt->nPage+iBest+1, 0);
8018       }
8019       sqlite3PagerRekey(apNew[i]->pDbPage, pgno, aPgFlags[iBest]);
8020       apNew[i]->pgno = pgno;
8021     }
8022   }
8023 
8024   TRACE(("BALANCE: new: %d(%d nc=%d) %d(%d nc=%d) %d(%d nc=%d) "
8025          "%d(%d nc=%d) %d(%d nc=%d)\n",
8026     apNew[0]->pgno, szNew[0], cntNew[0],
8027     nNew>=2 ? apNew[1]->pgno : 0, nNew>=2 ? szNew[1] : 0,
8028     nNew>=2 ? cntNew[1] - cntNew[0] - !leafData : 0,
8029     nNew>=3 ? apNew[2]->pgno : 0, nNew>=3 ? szNew[2] : 0,
8030     nNew>=3 ? cntNew[2] - cntNew[1] - !leafData : 0,
8031     nNew>=4 ? apNew[3]->pgno : 0, nNew>=4 ? szNew[3] : 0,
8032     nNew>=4 ? cntNew[3] - cntNew[2] - !leafData : 0,
8033     nNew>=5 ? apNew[4]->pgno : 0, nNew>=5 ? szNew[4] : 0,
8034     nNew>=5 ? cntNew[4] - cntNew[3] - !leafData : 0
8035   ));
8036 
8037   assert( sqlite3PagerIswriteable(pParent->pDbPage) );
8038   assert( nNew>=1 && nNew<=ArraySize(apNew) );
8039   assert( apNew[nNew-1]!=0 );
8040   put4byte(pRight, apNew[nNew-1]->pgno);
8041 
8042   /* If the sibling pages are not leaves, ensure that the right-child pointer
8043   ** of the right-most new sibling page is set to the value that was
8044   ** originally in the same field of the right-most old sibling page. */
8045   if( (pageFlags & PTF_LEAF)==0 && nOld!=nNew ){
8046     MemPage *pOld = (nNew>nOld ? apNew : apOld)[nOld-1];
8047     memcpy(&apNew[nNew-1]->aData[8], &pOld->aData[8], 4);
8048   }
8049 
8050   /* Make any required updates to pointer map entries associated with
8051   ** cells stored on sibling pages following the balance operation. Pointer
8052   ** map entries associated with divider cells are set by the insertCell()
8053   ** routine. The associated pointer map entries are:
8054   **
8055   **   a) if the cell contains a reference to an overflow chain, the
8056   **      entry associated with the first page in the overflow chain, and
8057   **
8058   **   b) if the sibling pages are not leaves, the child page associated
8059   **      with the cell.
8060   **
8061   ** If the sibling pages are not leaves, then the pointer map entry
8062   ** associated with the right-child of each sibling may also need to be
8063   ** updated. This happens below, after the sibling pages have been
8064   ** populated, not here.
8065   */
8066   if( ISAUTOVACUUM ){
8067     MemPage *pOld;
8068     MemPage *pNew = pOld = apNew[0];
8069     int cntOldNext = pNew->nCell + pNew->nOverflow;
8070     int iNew = 0;
8071     int iOld = 0;
8072 
8073     for(i=0; i<b.nCell; i++){
8074       u8 *pCell = b.apCell[i];
8075       while( i==cntOldNext ){
8076         iOld++;
8077         assert( iOld<nNew || iOld<nOld );
8078         assert( iOld>=0 && iOld<NB );
8079         pOld = iOld<nNew ? apNew[iOld] : apOld[iOld];
8080         cntOldNext += pOld->nCell + pOld->nOverflow + !leafData;
8081       }
8082       if( i==cntNew[iNew] ){
8083         pNew = apNew[++iNew];
8084         if( !leafData ) continue;
8085       }
8086 
8087       /* Cell pCell is destined for new sibling page pNew. Originally, it
8088       ** was either part of sibling page iOld (possibly an overflow cell),
8089       ** or else the divider cell to the left of sibling page iOld. So,
8090       ** if sibling page iOld had the same page number as pNew, and if
8091       ** pCell really was a part of sibling page iOld (not a divider or
8092       ** overflow cell), we can skip updating the pointer map entries.  */
8093       if( iOld>=nNew
8094        || pNew->pgno!=aPgno[iOld]
8095        || !SQLITE_WITHIN(pCell,pOld->aData,pOld->aDataEnd)
8096       ){
8097         if( !leafCorrection ){
8098           ptrmapPut(pBt, get4byte(pCell), PTRMAP_BTREE, pNew->pgno, &rc);
8099         }
8100         if( cachedCellSize(&b,i)>pNew->minLocal ){
8101           ptrmapPutOvflPtr(pNew, pOld, pCell, &rc);
8102         }
8103         if( rc ) goto balance_cleanup;
8104       }
8105     }
8106   }
8107 
8108   /* Insert new divider cells into pParent. */
8109   for(i=0; i<nNew-1; i++){
8110     u8 *pCell;
8111     u8 *pTemp;
8112     int sz;
8113     MemPage *pNew = apNew[i];
8114     j = cntNew[i];
8115 
8116     assert( j<nMaxCells );
8117     assert( b.apCell[j]!=0 );
8118     pCell = b.apCell[j];
8119     sz = b.szCell[j] + leafCorrection;
8120     pTemp = &aOvflSpace[iOvflSpace];
8121     if( !pNew->leaf ){
8122       memcpy(&pNew->aData[8], pCell, 4);
8123     }else if( leafData ){
8124       /* If the tree is a leaf-data tree, and the siblings are leaves,
8125       ** then there is no divider cell in b.apCell[]. Instead, the divider
8126       ** cell consists of the integer key for the right-most cell of
8127       ** the sibling-page assembled above only.
8128       */
8129       CellInfo info;
8130       j--;
8131       pNew->xParseCell(pNew, b.apCell[j], &info);
8132       pCell = pTemp;
8133       sz = 4 + putVarint(&pCell[4], info.nKey);
8134       pTemp = 0;
8135     }else{
8136       pCell -= 4;
8137       /* Obscure case for non-leaf-data trees: If the cell at pCell was
8138       ** previously stored on a leaf node, and its reported size was 4
8139       ** bytes, then it may actually be smaller than this
8140       ** (see btreeParseCellPtr(), 4 bytes is the minimum size of
8141       ** any cell). But it is important to pass the correct size to
8142       ** insertCell(), so reparse the cell now.
8143       **
8144       ** This can only happen for b-trees used to evaluate "IN (SELECT ...)"
8145       ** and WITHOUT ROWID tables with exactly one column which is the
8146       ** primary key.
8147       */
8148       if( b.szCell[j]==4 ){
8149         assert(leafCorrection==4);
8150         sz = pParent->xCellSize(pParent, pCell);
8151       }
8152     }
8153     iOvflSpace += sz;
8154     assert( sz<=pBt->maxLocal+23 );
8155     assert( iOvflSpace <= (int)pBt->pageSize );
8156     insertCell(pParent, nxDiv+i, pCell, sz, pTemp, pNew->pgno, &rc);
8157     if( rc!=SQLITE_OK ) goto balance_cleanup;
8158     assert( sqlite3PagerIswriteable(pParent->pDbPage) );
8159   }
8160 
8161   /* Now update the actual sibling pages. The order in which they are updated
8162   ** is important, as this code needs to avoid disrupting any page from which
  ** cells may still need to be read. In practice, this means:
8164   **
8165   **  (1) If cells are moving left (from apNew[iPg] to apNew[iPg-1])
8166   **      then it is not safe to update page apNew[iPg] until after
8167   **      the left-hand sibling apNew[iPg-1] has been updated.
8168   **
8169   **  (2) If cells are moving right (from apNew[iPg] to apNew[iPg+1])
8170   **      then it is not safe to update page apNew[iPg] until after
8171   **      the right-hand sibling apNew[iPg+1] has been updated.
8172   **
8173   ** If neither of the above apply, the page is safe to update.
8174   **
8175   ** The iPg value in the following loop starts at nNew-1 goes down
8176   ** to 0, then back up to nNew-1 again, thus making two passes over
8177   ** the pages.  On the initial downward pass, only condition (1) above
8178   ** needs to be tested because (2) will always be true from the previous
8179   ** step.  On the upward pass, both conditions are always true, so the
8180   ** upwards pass simply processes pages that were missed on the downward
8181   ** pass.
8182   */
8183   for(i=1-nNew; i<nNew; i++){
8184     int iPg = i<0 ? -i : i;
8185     assert( iPg>=0 && iPg<nNew );
8186     if( abDone[iPg] ) continue;         /* Skip pages already processed */
8187     if( i>=0                            /* On the upwards pass, or... */
8188      || cntOld[iPg-1]>=cntNew[iPg-1]    /* Condition (1) is true */
8189     ){
8190       int iNew;
8191       int iOld;
8192       int nNewCell;
8193 
8194       /* Verify condition (1):  If cells are moving left, update iPg
8195       ** only after iPg-1 has already been updated. */
8196       assert( iPg==0 || cntOld[iPg-1]>=cntNew[iPg-1] || abDone[iPg-1] );
8197 
8198       /* Verify condition (2):  If cells are moving right, update iPg
8199       ** only after iPg+1 has already been updated. */
8200       assert( cntNew[iPg]>=cntOld[iPg] || abDone[iPg+1] );
8201 
8202       if( iPg==0 ){
8203         iNew = iOld = 0;
8204         nNewCell = cntNew[0];
8205       }else{
8206         iOld = iPg<nOld ? (cntOld[iPg-1] + !leafData) : b.nCell;
8207         iNew = cntNew[iPg-1] + !leafData;
8208         nNewCell = cntNew[iPg] - iNew;
8209       }
8210 
8211       rc = editPage(apNew[iPg], iOld, iNew, nNewCell, &b);
8212       if( rc ) goto balance_cleanup;
8213       abDone[iPg]++;
8214       apNew[iPg]->nFree = usableSpace-szNew[iPg];
8215       assert( apNew[iPg]->nOverflow==0 );
8216       assert( apNew[iPg]->nCell==nNewCell );
8217     }
8218   }
8219 
8220   /* All pages have been processed exactly once */
8221   assert( memcmp(abDone, "\01\01\01\01\01", nNew)==0 );
8222 
8223   assert( nOld>0 );
8224   assert( nNew>0 );
8225 
8226   if( isRoot && pParent->nCell==0 && pParent->hdrOffset<=apNew[0]->nFree ){
8227     /* The root page of the b-tree now contains no cells. The only sibling
8228     ** page is the right-child of the parent. Copy the contents of the
8229     ** child page into the parent, decreasing the overall height of the
8230     ** b-tree structure by one. This is described as the "balance-shallower"
8231     ** sub-algorithm in some documentation.
8232     **
8233     ** If this is an auto-vacuum database, the call to copyNodeContent()
8234     ** sets all pointer-map entries corresponding to database image pages
8235     ** for which the pointer is stored within the content being copied.
8236     **
8237     ** It is critical that the child page be defragmented before being
8238     ** copied into the parent, because if the parent is page 1 then it will
    ** be smaller than the child due to the database header, and so all the
8240     ** free space needs to be up front.
8241     */
8242     assert( nNew==1 || CORRUPT_DB );
8243     rc = defragmentPage(apNew[0], -1);
8244     testcase( rc!=SQLITE_OK );
8245     assert( apNew[0]->nFree ==
8246         (get2byteNotZero(&apNew[0]->aData[5]) - apNew[0]->cellOffset
8247           - apNew[0]->nCell*2)
8248       || rc!=SQLITE_OK
8249     );
8250     copyNodeContent(apNew[0], pParent, &rc);
8251     freePage(apNew[0], &rc);
8252   }else if( ISAUTOVACUUM && !leafCorrection ){
8253     /* Fix the pointer map entries associated with the right-child of each
8254     ** sibling page. All other pointer map entries have already been taken
8255     ** care of.  */
8256     for(i=0; i<nNew; i++){
8257       u32 key = get4byte(&apNew[i]->aData[8]);
8258       ptrmapPut(pBt, key, PTRMAP_BTREE, apNew[i]->pgno, &rc);
8259     }
8260   }
8261 
8262   assert( pParent->isInit );
8263   TRACE(("BALANCE: finished: old=%d new=%d cells=%d\n",
8264           nOld, nNew, b.nCell));
8265 
8266   /* Free any old pages that were not reused as new pages.
8267   */
8268   for(i=nNew; i<nOld; i++){
8269     freePage(apOld[i], &rc);
8270   }
8271 
8272 #if 0
8273   if( ISAUTOVACUUM && rc==SQLITE_OK && apNew[0]->isInit ){
8274     /* The ptrmapCheckPages() contains assert() statements that verify that
8275     ** all pointer map pages are set correctly. This is helpful while
8276     ** debugging. This is usually disabled because a corrupt database may
8277     ** cause an assert() statement to fail.  */
8278     ptrmapCheckPages(apNew, nNew);
8279     ptrmapCheckPages(&pParent, 1);
8280   }
8281 #endif
8282 
8283   /*
8284   ** Cleanup before returning.
8285   */
8286 balance_cleanup:
8287   sqlite3StackFree(0, b.apCell);
8288   for(i=0; i<nOld; i++){
8289     releasePage(apOld[i]);
8290   }
8291   for(i=0; i<nNew; i++){
8292     releasePage(apNew[i]);
8293   }
8294 
8295   return rc;
8296 }
8297 
8298 
8299 /*
8300 ** This function is called when the root page of a b-tree structure is
8301 ** overfull (has one or more overflow pages).
8302 **
8303 ** A new child page is allocated and the contents of the current root
8304 ** page, including overflow cells, are copied into the child. The root
8305 ** page is then overwritten to make it an empty page with the right-child
8306 ** pointer pointing to the new page.
8307 **
8308 ** Before returning, all pointer-map entries corresponding to pages
8309 ** that the new child-page now contains pointers to are updated. The
8310 ** entry corresponding to the new right-child pointer of the root
8311 ** page is also updated.
8312 **
8313 ** If successful, *ppChild is set to contain a reference to the child
8314 ** page and SQLITE_OK is returned. In this case the caller is required
8315 ** to call releasePage() on *ppChild exactly once. If an error occurs,
8316 ** an error code is returned and *ppChild is set to 0.
8317 */
static int balance_deeper(MemPage *pRoot, MemPage **ppChild){
  BtShared *pBt = pRoot->pBt;    /* Shared b-tree state that owns pRoot */
  MemPage *pKid = 0;             /* Freshly allocated child page */
  Pgno iKid = 0;                 /* Page number assigned to pKid */
  int rc;                        /* Result code from the helper routines */
  int ii;                        /* Index over the overflow-cell arrays */

  assert( pRoot->nOverflow>0 );
  assert( sqlite3_mutex_held(pBt->mutex) );

  /* Journal the root, obtain a new page, and clone the root's node content
  ** into it.  copyNodeContent() and ptrmapPut() both receive &rc, so any
  ** failure they report lands in rc and is handled just below.
  */
  rc = sqlite3PagerWrite(pRoot->pDbPage);
  if( rc==SQLITE_OK ){
    rc = allocateBtreePage(pBt, &pKid, &iKid, pRoot->pgno, 0);
    copyNodeContent(pRoot, pKid, &rc);
    if( ISAUTOVACUUM ){
      ptrmapPut(pBt, iKid, PTRMAP_BTREE, pRoot->pgno, &rc);
    }
  }
  if( rc!=SQLITE_OK ){
    /* Failure: hand back no child and drop whatever page was obtained. */
    releasePage(pKid);
    *ppChild = 0;
    return rc;
  }
  assert( sqlite3PagerIswriteable(pKid->pDbPage) );
  assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
  assert( pKid->nCell==pRoot->nCell || CORRUPT_DB );

  TRACE(("BALANCE: copy root %d into %d\n", pRoot->pgno, pKid->pgno));

  /* Transfer the pending overflow cells (index and pointer of each) from
  ** the root to the child, one entry at a time. */
  for(ii=0; ii<pRoot->nOverflow; ii++){
    pKid->aiOvfl[ii] = pRoot->aiOvfl[ii];
    pKid->apOvfl[ii] = pRoot->apOvfl[ii];
  }
  pKid->nOverflow = pRoot->nOverflow;

  /* Reinitialize the root with the child's page flags minus PTF_LEAF, then
  ** store the child's page number at offset hdrOffset+8 of the root, which
  ** is the right-child slot. */
  zeroPage(pRoot, pKid->aData[0] & ~PTF_LEAF);
  put4byte(&pRoot->aData[pRoot->hdrOffset+8], iKid);

  *ppChild = pKid;
  return SQLITE_OK;
}
8364 
8365 /*
8366 ** Return SQLITE_CORRUPT if any cursor other than pCur is currently valid
8367 ** on the same B-tree as pCur.
8368 **
** This can happen if a database is corrupt with two or more SQL tables
8370 ** pointing to the same b-tree.  If an insert occurs on one SQL table
8371 ** and causes a BEFORE TRIGGER to do a secondary insert on the other SQL
8372 ** table linked to the same b-tree.  If the secondary insert causes a
8373 ** rebalance, that can change content out from under the cursor on the
8374 ** first SQL table, violating invariants on the first insert.
8375 */
static int anotherValidCursor(BtCursor *pCur){
  BtCursor *pScan;     /* Cursor currently being examined */

  /* Walk every cursor registered on the shared b-tree.  A cursor counts as
  ** a conflict only if it is a different cursor, is in the CURSOR_VALID
  ** state, and is positioned on the same page object as pCur. */
  for(pScan=pCur->pBt->pCursor; pScan!=0; pScan=pScan->pNext){
    if( pScan==pCur ) continue;
    if( pScan->eState!=CURSOR_VALID ) continue;
    if( pScan->pPage!=pCur->pPage ) continue;
    return SQLITE_CORRUPT_BKPT;
  }
  return SQLITE_OK;
}
8388 
8389 /*
8390 ** The page that pCur currently points to has just been modified in
8391 ** some way. This function figures out if this modification means the
8392 ** tree needs to be balanced, and if so calls the appropriate balancing
8393 ** routine. Balancing routines are:
8394 **
8395 **   balance_quick()
8396 **   balance_deeper()
8397 **   balance_nonroot()
8398 */
static int balance(BtCursor *pCur){
  /* Walk from the page pCur points at up toward the root, rebalancing one
  ** level per loop iteration, until a page needs no balancing or an error
  ** occurs.  Returns SQLITE_OK or the first error code encountered.
  **
  ** On the non-root path the cursor's reference to the balanced page is
  ** released and pCur->iPage / pCur->pPage are moved to the parent, so on
  ** return the cursor may sit higher in its page stack than on entry.
  */
  int rc = SQLITE_OK;
  /* Free-space threshold: a page with no overflow cells and no more than
  ** nMin free bytes is left alone (see the first test in the loop). */
  const int nMin = pCur->pBt->usableSize * 2 / 3;
  /* Scratch buffer handed to balance_quick(); it must stay alive until
  ** the loop finishes because the parent's overflow cell may live here. */
  u8 aBalanceQuickSpace[13];
  /* pSpace buffer from the most recent balance_nonroot() call, if any.
  ** Freed after the next balance_nonroot() call or at function exit. */
  u8 *pFree = 0;

  /* Debug-only call counters used by the assert()s below. */
  VVA_ONLY( int balance_quick_called = 0 );
  VVA_ONLY( int balance_deeper_called = 0 );

  do {
    int iPage;
    MemPage *pPage = pCur->pPage;

    if( NEVER(pPage->nFree<0) && btreeComputeFreeSpace(pPage) ) break;
    if( pPage->nOverflow==0 && pPage->nFree<=nMin ){
      /* Neither overfull nor underfull enough: nothing to do. */
      break;
    }else if( (iPage = pCur->iPage)==0 ){
      /* pPage is at the bottom of the cursor's page stack (the root).
      ** Only an overfull root is handled here; a root without overflow
      ** cells, or a conflict reported by anotherValidCursor(), ends the
      ** loop (in the latter case rc carries the error). */
      if( pPage->nOverflow && (rc = anotherValidCursor(pCur))==SQLITE_OK ){
        /* The root page of the b-tree is overfull. In this case call the
        ** balance_deeper() function to create a new child for the root-page
        ** and copy the current contents of the root-page to it. The
        ** next iteration of the do-loop will balance the child page.
        */
        assert( balance_deeper_called==0 );
        VVA_ONLY( balance_deeper_called++ );
        rc = balance_deeper(pPage, &pCur->apPage[1]);
        if( rc==SQLITE_OK ){
          /* Push the new child onto the cursor's page stack so that the
          ** next iteration sees it as the current page with the old root
          ** as its parent at index 0. */
          pCur->iPage = 1;
          pCur->ix = 0;
          pCur->aiIdx[0] = 0;
          pCur->apPage[0] = pPage;
          pCur->pPage = pCur->apPage[1];
          assert( pCur->pPage->nOverflow );
        }
      }else{
        break;
      }
    }else{
      /* Non-root page: balance it against its parent, which is the entry
      ** one level up in the cursor's page stack.  iIdx is the index the
      ** cursor recorded within that parent. */
      MemPage * const pParent = pCur->apPage[iPage-1];
      int const iIdx = pCur->aiIdx[iPage-1];

      rc = sqlite3PagerWrite(pParent->pDbPage);
      if( rc==SQLITE_OK && pParent->nFree<0 ){
        /* A negative nFree is treated as "not yet computed". */
        rc = btreeComputeFreeSpace(pParent);
      }
      if( rc==SQLITE_OK ){
#ifndef SQLITE_OMIT_QUICKBALANCE
        /* Quick-balance applies only when pPage is an integer-key leaf
        ** with exactly one overflow cell whose index equals nCell (it
        ** follows every cell already on the page), the parent is not
        ** page 1, and iIdx equals the parent's cell count.
        ** NOTE(review): iIdx==pParent->nCell presumably means pPage is the
        ** parent's right-most child -- confirm against moveto logic. */
        if( pPage->intKeyLeaf
         && pPage->nOverflow==1
         && pPage->aiOvfl[0]==pPage->nCell
         && pParent->pgno!=1
         && pParent->nCell==iIdx
        ){
          /* Call balance_quick() to create a new sibling of pPage on which
          ** to store the overflow cell. balance_quick() inserts a new cell
          ** into pParent, which may cause pParent overflow. If this
          ** happens, the next iteration of the do-loop will balance pParent
          ** using either balance_nonroot() or balance_deeper(). Until this
          ** happens, the overflow cell is stored in the aBalanceQuickSpace[]
          ** buffer.
          **
          ** The purpose of the following assert() is to check that only a
          ** single call to balance_quick() is made for each call to this
          ** function. If this were not verified, a subtle bug involving reuse
          ** of the aBalanceQuickSpace[] might sneak in.
          */
          assert( balance_quick_called==0 );
          VVA_ONLY( balance_quick_called++ );
          rc = balance_quick(pParent, pPage, aBalanceQuickSpace);
        }else
#endif
        {
          /* In this case, call balance_nonroot() to redistribute cells
          ** between pPage and up to 2 of its sibling pages. This involves
          ** modifying the contents of pParent, which may cause pParent to
          ** become overfull or underfull. The next iteration of the do-loop
          ** will balance the parent page to correct this.
          **
          ** If the parent page becomes overfull, the overflow cell or cells
          ** are stored in the pSpace buffer allocated immediately below.
          ** A subsequent iteration of the do-loop will deal with this by
          ** calling balance_nonroot() (balance_deeper() may be called first,
          ** but it doesn't deal with overflow cells - just moves them to a
          ** different page). Once this subsequent call to balance_nonroot()
          ** has completed, it is safe to release the pSpace buffer used by
          ** the previous call, as the overflow cell data will have been
          ** copied either into the body of a database page or into the new
          ** pSpace buffer passed to the latter call to balance_nonroot().
          **
          ** NOTE(review): the result of sqlite3PageMalloc() is not checked
          ** here; presumably balance_nonroot() rejects a NULL buffer --
          ** confirm.  The fourth argument (iPage==1) is true exactly when
          ** pParent is apPage[0], the bottom of the cursor's page stack.
          */
          u8 *pSpace = sqlite3PageMalloc(pCur->pBt->pageSize);
          rc = balance_nonroot(pParent, iIdx, pSpace, iPage==1,
                               pCur->hints&BTREE_BULKLOAD);
          if( pFree ){
            /* If pFree is not NULL, it points to the pSpace buffer used
            ** by a previous call to balance_nonroot(). Its contents are
            ** now stored either on real database pages or within the
            ** new pSpace buffer, so it may be safely freed here. */
            sqlite3PageFree(pFree);
          }

          /* The pSpace buffer will be freed after the next call to
          ** balance_nonroot(), or just before this function returns, whichever
          ** comes first. */
          pFree = pSpace;
        }
      }

      /* Executed whether or not the balance succeeded: clear the overflow
      ** count on the child before dropping the cursor's reference to it. */
      pPage->nOverflow = 0;

      /* The next iteration of the do-loop balances the parent page. */
      releasePage(pPage);
      pCur->iPage--;
      assert( pCur->iPage>=0 );
      pCur->pPage = pCur->apPage[pCur->iPage];
    }
  }while( rc==SQLITE_OK );

  /* Release the last balance_nonroot() scratch buffer, if one is pending. */
  if( pFree ){
    sqlite3PageFree(pFree);
  }
  return rc;
}
8521 
8522 /* Overwrite content from pX into pDest.  Only do the write if the
8523 ** content is different from what is already there.
8524 */
static int btreeOverwriteContent(
  MemPage *pPage,           /* Page passed to sqlite3PagerWrite() before writing */
  u8 *pDest,                /* First byte of the region to overwrite */
  const BtreePayload *pX,   /* Payload supplying the replacement bytes */
  int iOffset,              /* Payload offset that corresponds to pDest[0] */
  int iAmt                  /* Number of bytes in the destination region */
){
  const int nAvail = pX->nData - iOffset;  /* Bytes of pX->pData from iOffset */
  const u8 *pSrc;                          /* Source bytes within pX->pData */
  int rc;                                  /* Result of subroutine calls */

  if( nAvail<=0 ){
    /* The whole region lies beyond pX->pData, so it must become all zeros.
    ** Skip the leading bytes that are already zero; if that covers the
    ** entire region there is nothing to write. */
    int iFirst = 0;
    while( iFirst<iAmt && pDest[iFirst]==0 ){
      iFirst++;
    }
    if( iFirst>=iAmt ) return SQLITE_OK;
    rc = sqlite3PagerWrite(pPage->pDbPage);
    if( rc ) return rc;
    memset(&pDest[iFirst], 0, iAmt-iFirst);
    return SQLITE_OK;
  }

  if( nAvail<iAmt ){
    /* The region straddles the end of pX->pData.  Handle the zero-filled
    ** tail first via a recursive call, then shrink the request so that
    ** the code below copies only the real data. */
    rc = btreeOverwriteContent(pPage, &pDest[nAvail], pX, iOffset+nAvail,
                               iAmt-nAvail);
    if( rc ) return rc;
    iAmt = nAvail;
  }

  /* Copy real data, but only touch the page if the bytes actually differ. */
  pSrc = &((const u8*)pX->pData)[iOffset];
  if( memcmp(pDest, pSrc, iAmt)==0 ) return SQLITE_OK;
  rc = sqlite3PagerWrite(pPage->pDbPage);
  if( rc ) return rc;
  /* With a corrupt database the source and destination may overlap.  That
  ** is harmless for an already-corrupt file, but memcpy() would trigger
  ** valgrind/ASAN reports, hence memmove(). */
  memmove(pDest, pSrc, iAmt);
  return SQLITE_OK;
}
8563 
8564 /*
8565 ** Overwrite the cell that cursor pCur is pointing to with fresh content
8566 ** contained in pX.
8567 */
btreeOverwriteCell(BtCursor * pCur,const BtreePayload * pX)8568 static int btreeOverwriteCell(BtCursor *pCur, const BtreePayload *pX){
8569   int iOffset;                        /* Next byte of pX->pData to write */
8570   int nTotal = pX->nData + pX->nZero; /* Total bytes of to write */
8571   int rc;                             /* Return code */
8572   MemPage *pPage = pCur->pPage;       /* Page being written */
8573   BtShared *pBt;                      /* Btree */
8574   Pgno ovflPgno;                      /* Next overflow page to write */
8575   u32 ovflPageSize;                   /* Size to write on overflow page */
8576 
8577   if( pCur->info.pPayload + pCur->info.nLocal > pPage->aDataEnd
8578    || pCur->info.pPayload < pPage->aData + pPage->cellOffset
8579   ){
8580     return SQLITE_CORRUPT_BKPT;
8581   }
8582   /* Overwrite the local portion first */
8583   rc = btreeOverwriteContent(pPage, pCur->info.pPayload, pX,
8584                              0, pCur->info.nLocal);
8585   if( rc ) return rc;
8586   if( pCur->info.nLocal==nTotal ) return SQLITE_OK;
8587 
8588   /* Now overwrite the overflow pages */
8589   iOffset = pCur->info.nLocal;
8590   assert( nTotal>=0 );
8591   assert( iOffset>=0 );
8592   ovflPgno = get4byte(pCur->info.pPayload + iOffset);
8593   pBt = pPage->pBt;
8594   ovflPageSize = pBt->usableSize - 4;
8595   do{
8596     rc = btreeGetPage(pBt, ovflPgno, &pPage, 0);
8597     if( rc ) return rc;
8598     if( sqlite3PagerPageRefcount(pPage->pDbPage)!=1 ){
8599       rc = SQLITE_CORRUPT_BKPT;
8600     }else{
8601       if( iOffset+ovflPageSize<(u32)nTotal ){
8602         ovflPgno = get4byte(pPage->aData);
8603       }else{
8604         ovflPageSize = nTotal - iOffset;
8605       }
8606       rc = btreeOverwriteContent(pPage, pPage->aData+4, pX,
8607                                  iOffset, ovflPageSize);
8608     }
8609     sqlite3PagerUnref(pPage->pDbPage);
8610     if( rc ) return rc;
8611     iOffset += ovflPageSize;
8612   }while( iOffset<nTotal );
8613   return SQLITE_OK;
8614 }
8615 
8616 
/*
** Insert a new record into the BTree.  The content of the new record
** is described by the pX object.  The pCur cursor is used only to
** define what table the record should be inserted into, and is left
** pointing at a random location.  The exception is when the
** BTREE_SAVEPOSITION flag is set and the insert required a call to
** balance(): in that case the cursor is left in the CURSOR_REQUIRESEEK
** state with the inserted key recorded in the cursor.
**
** For a table btree (used for rowid tables), only the pX.nKey value of
** the key is used. The pX.pKey value must be NULL.  The pX.nKey is the
** rowid or INTEGER PRIMARY KEY of the row.  The pX.nData,pData,nZero fields
** hold the content of the row.
**
** For an index btree (used for indexes and WITHOUT ROWID tables), the
** key is an arbitrary byte sequence stored in pX.pKey,nKey.  The
** pX.pData,nData,nZero fields must be zero.
**
** The flags parameter may contain only the BTREE_SAVEPOSITION and
** BTREE_APPEND bits (this is asserted).  Any non-zero flags value is
** also passed as a true fourth argument to the seek routines below.
**
** If the seekResult parameter is non-zero, then a successful call to
** MovetoUnpacked() to seek cursor pCur to (pKey,nKey) has already
** been performed.  In other words, if seekResult!=0 then the cursor
** is currently pointing to a cell that will be adjacent to the cell
** to be inserted.  If seekResult<0 then pCur points to a cell that is
** smaller than (pKey,nKey).  If seekResult>0 then pCur points to a cell
** that is larger than (pKey,nKey).
**
** If seekResult==0, that means pCur is pointing at some unknown location.
** In that case, this routine must seek the cursor to the correct insertion
** point for (pKey,nKey) before doing the insertion.  For index btrees,
** if pX->nMem is non-zero, then pX->aMem contains pointers to the unpacked
** key values and pX->aMem can be used instead of pX->pKey to avoid having
** to decode the key.
**
** The return value is SQLITE_OK on success or an SQLite error code.
*/
int sqlite3BtreeInsert(
  BtCursor *pCur,                /* Insert data into the table of this cursor */
  const BtreePayload *pX,        /* Content of the row to be inserted */
  int flags,                     /* Mask of BTREE_SAVEPOSITION, BTREE_APPEND */
  int seekResult                 /* Result of prior MovetoUnpacked() call */
){
  int rc;
  int loc = seekResult;          /* -1: before desired location  +1: after */
  int szNew = 0;                 /* Size in bytes of the new cell */
  int idx;                       /* Index on pPage at which to insert */
  MemPage *pPage;                /* Page that receives the new cell */
  Btree *p = pCur->pBtree;
  BtShared *pBt = p->pBt;
  unsigned char *oldCell;        /* Existing cell being replaced, if any */
  unsigned char *newCell = 0;    /* New cell, built in pBt->pTmpSpace */

  assert( (flags & (BTREE_SAVEPOSITION|BTREE_APPEND))==flags );

  /* A cursor in the fault state holds its error code in skipNext.
  ** Report that error rather than attempting the insert. */
  if( pCur->eState==CURSOR_FAULT ){
    assert( pCur->skipNext!=SQLITE_OK );
    return pCur->skipNext;
  }

  assert( cursorOwnsBtShared(pCur) );
  assert( (pCur->curFlags & BTCF_WriteFlag)!=0
              && pBt->inTransaction==TRANS_WRITE
              && (pBt->btsFlags & BTS_READ_ONLY)==0 );
  assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) );

  /* Assert that the caller has been consistent. If this cursor was opened
  ** expecting an index b-tree, then the caller should be inserting blob
  ** keys with no associated data. If the cursor was opened expecting an
  ** intkey table, the caller should be inserting integer keys with a
  ** blob of associated data.  */
  assert( (pX->pKey==0)==(pCur->pKeyInfo==0) );

  /* Save the positions of any other cursors open on this table.
  **
  ** In some cases, the call to btreeMoveto() below is a no-op. For
  ** example, when inserting data into a table with auto-generated integer
  ** keys, the VDBE layer invokes sqlite3BtreeLast() to figure out the
  ** integer key to use. It then calls this function to actually insert the
  ** data into the intkey B-Tree. In this case btreeMoveto() recognizes
  ** that the cursor is already where it needs to be and returns without
  ** doing any work. To avoid thwarting these optimizations, it is important
  ** not to clear the cursor here.
  */
  if( pCur->curFlags & BTCF_Multiple ){
    rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur);
    if( rc ) return rc;
  }

  if( pCur->pKeyInfo==0 ){
    /* This is a table b-tree (integer keys) */
    assert( pX->pKey==0 );
    /* If this is an insert into a table b-tree, invalidate any incrblob
    ** cursors open on the row being replaced */
    invalidateIncrblobCursors(p, pCur->pgnoRoot, pX->nKey, 0);

    /* If BTREE_SAVEPOSITION is set, the cursor must already be pointing
    ** to a row with the same key as the new entry being inserted.
    */
#ifdef SQLITE_DEBUG
    if( flags & BTREE_SAVEPOSITION ){
      assert( pCur->curFlags & BTCF_ValidNKey );
      assert( pX->nKey==pCur->info.nKey );
      assert( loc==0 );
    }
#endif

    /* On the other hand, BTREE_SAVEPOSITION==0 does not imply
    ** that the cursor is not pointing to a row to be overwritten.
    ** So do a complete check.
    */
    if( (pCur->curFlags&BTCF_ValidNKey)!=0 && pX->nKey==pCur->info.nKey ){
      /* The cursor is pointing to the entry that is to be
      ** overwritten */
      assert( pX->nData>=0 && pX->nZero>=0 );
      if( pCur->info.nSize!=0
       && pCur->info.nPayload==(u32)pX->nData+pX->nZero
      ){
        /* New entry is the same size as the old.  Do an overwrite */
        return btreeOverwriteCell(pCur, pX);
      }
      assert( loc==0 );
    }else if( loc==0 ){
      /* The cursor is *not* pointing to the cell to be overwritten, nor
      ** to an adjacent cell.  Move the cursor so that it is pointing either
      ** to the cell to be overwritten or an adjacent cell.
      */
      rc = sqlite3BtreeMovetoUnpacked(pCur, 0, pX->nKey, flags!=0, &loc);
      if( rc ) return rc;
    }
  }else{
    /* This is an index or a WITHOUT ROWID table */

    /* If BTREE_SAVEPOSITION is set, the cursor must already be pointing
    ** to a row with the same key as the new entry being inserted.
    */
    assert( (flags & BTREE_SAVEPOSITION)==0 || loc==0 );

    /* If the cursor is not already pointing either to the cell to be
    ** overwritten, or if a new cell is being inserted, if the cursor is
    ** not pointing to an immediately adjacent cell, then move the cursor
    ** so that it does.
    */
    if( loc==0 && (flags & BTREE_SAVEPOSITION)==0 ){
      if( pX->nMem ){
        /* The caller supplied the key in unpacked form.  Wrap it in an
        ** UnpackedRecord so that the seek does not need to decode
        ** pX->pKey. */
        UnpackedRecord r;
        r.pKeyInfo = pCur->pKeyInfo;
        r.aMem = pX->aMem;
        r.nField = pX->nMem;
        r.default_rc = 0;
        r.errCode = 0;
        r.r1 = 0;
        r.r2 = 0;
        r.eqSeen = 0;
        rc = sqlite3BtreeMovetoUnpacked(pCur, &r, 0, flags!=0, &loc);
      }else{
        rc = btreeMoveto(pCur, pX->pKey, pX->nKey, flags!=0, &loc);
      }
      if( rc ) return rc;
    }

    /* If the cursor is currently pointing to an entry to be overwritten
    ** and the new content is the same size as the old, then use the
    ** overwrite optimization.  For an index the key itself is the
    ** payload, so it is handed to btreeOverwriteCell() as data.
    */
    if( loc==0 ){
      getCellInfo(pCur);
      if( pCur->info.nKey==pX->nKey ){
        BtreePayload x2;
        x2.pData = pX->pKey;
        x2.nData = pX->nKey;
        x2.nZero = 0;
        return btreeOverwriteCell(pCur, &x2);
      }
    }

  }
  assert( pCur->eState==CURSOR_VALID
       || (pCur->eState==CURSOR_INVALID && loc)
       || CORRUPT_DB );

  pPage = pCur->pPage;
  assert( pPage->intKey || pX->nKey>=0 );
  assert( pPage->leaf || !pPage->intKey );
  /* NOTE(review): a negative nFree appears to mean that the free-space
  ** total for the page has not been computed yet - confirm against the
  ** MemPage definition.  In that state the insert is refused as corrupt
  ** when eState>CURSOR_INVALID; otherwise the value is computed now. */
  if( pPage->nFree<0 ){
    if( pCur->eState>CURSOR_INVALID ){
      rc = SQLITE_CORRUPT_BKPT;
    }else{
      rc = btreeComputeFreeSpace(pPage);
    }
    if( rc ) return rc;
  }

  TRACE(("INSERT: table=%d nkey=%lld ndata=%d page=%d %s\n",
          pCur->pgnoRoot, pX->nKey, pX->nData, pPage->pgno,
          loc==0 ? "overwrite" : "new entry"));
  assert( pPage->isInit );
  /* The new cell is assembled in the pBt->pTmpSpace buffer.  This
  ** routine allocates nothing for it, so the direct returns further
  ** down have no buffer of their own to release. */
  newCell = pBt->pTmpSpace;
  assert( newCell!=0 );
  rc = fillInCell(pPage, newCell, pX, &szNew);
  if( rc ) goto end_insert;
  assert( szNew==pPage->xCellSize(pPage, newCell) );
  assert( szNew <= MX_CELL_SIZE(pBt) );
  idx = pCur->ix;
  if( loc==0 ){
    /* An entry with the same key already exists at idx.  Replace it. */
    CellInfo info;
    assert( idx<pPage->nCell );
    rc = sqlite3PagerWrite(pPage->pDbPage);
    if( rc ){
      goto end_insert;
    }
    oldCell = findCell(pPage, idx);
    if( !pPage->leaf ){
      /* On a non-leaf page, keep the first 4 bytes of the old cell.
      ** NOTE(review): presumably the left-child page number - confirm. */
      memcpy(newCell, oldCell, 4);
    }
    rc = clearCell(pPage, oldCell, &info);
    testcase( pCur->curFlags & BTCF_ValidOvfl );
    invalidateOverflowCache(pCur);
    if( info.nSize==szNew && info.nLocal==info.nPayload
     && (!ISAUTOVACUUM || szNew<pPage->minLocal)
    ){
      /* Overwrite the old cell with the new if they are the same size.
      ** We could also try to do this if the old cell is smaller, then add
      ** the leftover space to the free list.  But experiments show that
      ** doing that is no faster than skipping this optimization and just
      ** calling dropCell() and insertCell().
      **
      ** This optimization cannot be used on an autovacuum database if the
      ** new entry uses overflow pages, as the insertCell() call below is
      ** necessary to add the PTRMAP_OVERFLOW1 pointer-map entry.  */
      assert( rc==SQLITE_OK ); /* clearCell never fails when nLocal==nPayload */
      /* Refuse to write if the old cell does not lie within the part of
      ** the page that can hold cell content. */
      if( oldCell < pPage->aData+pPage->hdrOffset+10 ){
        return SQLITE_CORRUPT_BKPT;
      }
      if( oldCell+szNew > pPage->aDataEnd ){
        return SQLITE_CORRUPT_BKPT;
      }
      memcpy(oldCell, newCell, szNew);
      return SQLITE_OK;
    }
    /* Sizes differ (or overflow is involved): remove the old cell and
    ** fall through to insert the new one at the same index. */
    dropCell(pPage, idx, info.nSize, &rc);
    if( rc ) goto end_insert;
  }else if( loc<0 && pPage->nCell>0 ){
    /* The cursor points at the entry just before the insertion point,
    ** so the new cell goes at the following index. */
    assert( pPage->leaf );
    idx = ++pCur->ix;
    pCur->curFlags &= ~BTCF_ValidNKey;
  }else{
    /* The cursor points at the entry just after the insertion point (or
    ** the page is empty), so the new cell goes at the cursor index. */
    assert( pPage->leaf );
  }
  insertCell(pPage, idx, newCell, szNew, 0, 0, &rc);
  assert( pPage->nOverflow==0 || rc==SQLITE_OK );
  assert( rc!=SQLITE_OK || pPage->nCell>0 || pPage->nOverflow>0 );

  /* If no error has occurred and pPage has an overflow cell, call balance()
  ** to redistribute the cells within the tree. Since balance() may move
  ** the cursor, zero the BtCursor.info.nSize and BTCF_ValidNKey
  ** variables.
  **
  ** Previous versions of SQLite called moveToRoot() to move the cursor
  ** back to the root page as balance() used to invalidate the contents
  ** of BtCursor.apPage[] and BtCursor.aiIdx[]. Instead of doing that,
  ** set the cursor state to "invalid". This makes common insert operations
  ** slightly faster.
  **
  ** There is a subtle but important optimization here too. When inserting
  ** multiple records into an intkey b-tree using a single cursor (as can
  ** happen while processing an "INSERT INTO ... SELECT" statement), it
  ** is advantageous to leave the cursor pointing to the last entry in
  ** the b-tree if possible. If the cursor is left pointing to the last
  ** entry in the table, and the next row inserted has an integer key
  ** larger than the largest existing key, it is possible to insert the
  ** row without seeking the cursor. This can be a big performance boost.
  */
  pCur->info.nSize = 0;
  if( pPage->nOverflow ){
    assert( rc==SQLITE_OK );
    pCur->curFlags &= ~(BTCF_ValidNKey);
    rc = balance(pCur);

    /* Must make sure nOverflow is reset to zero even if the balance()
    ** fails. Internal data structure corruption will result otherwise.
    ** Also, set the cursor state to invalid. This stops saveCursorPosition()
    ** from trying to save the current position of the cursor.  */
    pCur->pPage->nOverflow = 0;
    pCur->eState = CURSOR_INVALID;
    if( (flags & BTREE_SAVEPOSITION) && rc==SQLITE_OK ){
      /* The caller wants the cursor to stay on the inserted entry.
      ** Release the cursor's pages and record the key so that the
      ** position can be restored by a later seek.  For an index b-tree
      ** a private copy of the key blob is made. */
      btreeReleaseAllCursorPages(pCur);
      if( pCur->pKeyInfo ){
        assert( pCur->pKey==0 );
        pCur->pKey = sqlite3Malloc( pX->nKey );
        if( pCur->pKey==0 ){
          rc = SQLITE_NOMEM;
        }else{
          memcpy(pCur->pKey, pX->pKey, pX->nKey);
        }
      }
      pCur->eState = CURSOR_REQUIRESEEK;
      pCur->nKey = pX->nKey;
    }
  }
  assert( pCur->iPage<0 || pCur->pPage->nOverflow==0 );

end_insert:
  return rc;
}
8914 
/*
** Delete the entry that the cursor is pointing to.
**
** If the BTREE_SAVEPOSITION bit of the flags parameter is zero, then
** the cursor is left pointing at an arbitrary location after the delete.
** But if that bit is set, then the cursor is left in a state such that
** the next call to BtreeNext() or BtreePrev() moves it to the same row
** as it would have been on if the call to BtreeDelete() had been omitted.
**
** The BTREE_AUXDELETE bit of flags indicates that this is one of several
** deletes associated with a single table entry and its indexes.  Only one
** of those deletes is considered the "primary" delete.  The primary delete
** occurs on a cursor that is not a BTREE_FORDELETE cursor.  All but one
** delete operation on non-FORDELETE cursors is tagged with the AUXDELETE
** flag.  The BTREE_AUXDELETE bit is a hint that is not used by this
** implementation, but which might be used by alternative storage engines.
**
** No bits other than BTREE_SAVEPOSITION and BTREE_AUXDELETE may be set
** in flags (this is asserted).
**
** The return value is SQLITE_OK on success or an SQLite error code.
*/
int sqlite3BtreeDelete(BtCursor *pCur, u8 flags){
  Btree *p = pCur->pBtree;
  BtShared *pBt = p->pBt;
  int rc;                              /* Return code */
  MemPage *pPage;                      /* Page to delete cell from */
  unsigned char *pCell;                /* Pointer to cell to delete */
  int iCellIdx;                        /* Index of cell to delete */
  int iCellDepth;                      /* Depth of node containing pCell */
  CellInfo info;                       /* Size of the cell being deleted */
  int bSkipnext = 0;                   /* Leaf cursor in SKIPNEXT state */
  u8 bPreserve = flags & BTREE_SAVEPOSITION;  /* Keep cursor valid */

  assert( cursorOwnsBtShared(pCur) );
  assert( pBt->inTransaction==TRANS_WRITE );
  assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );
  assert( pCur->curFlags & BTCF_WriteFlag );
  assert( hasSharedCacheTableLock(p, pCur->pgnoRoot, pCur->pKeyInfo!=0, 2) );
  assert( !hasReadConflicts(p, pCur->pgnoRoot) );
  assert( (flags & ~(BTREE_SAVEPOSITION | BTREE_AUXDELETE))==0 );
  /* A cursor whose position was saved must be put back on its entry
  ** before that entry can be deleted. */
  if( pCur->eState==CURSOR_REQUIRESEEK ){
    rc = btreeRestoreCursorPosition(pCur);
    if( rc ) return rc;
  }
  assert( pCur->eState==CURSOR_VALID );

  /* Remember where the doomed cell lives.  The cursor itself may be moved
  ** to a leaf further down before the cell is removed. */
  iCellDepth = pCur->iPage;
  iCellIdx = pCur->ix;
  pPage = pCur->pPage;
  pCell = findCell(pPage, iCellIdx);
  /* NOTE(review): any error from btreeComputeFreeSpace() is reported as
  ** plain SQLITE_CORRUPT here; the specific code is discarded. */
  if( pPage->nFree<0 && btreeComputeFreeSpace(pPage) ) return SQLITE_CORRUPT;

  /* If the bPreserve flag is set to true, then the cursor position must
  ** be preserved following this delete operation. If the current delete
  ** will cause a b-tree rebalance, then this is done by saving the cursor
  ** key and leaving the cursor in CURSOR_REQUIRESEEK state before
  ** returning.
  **
  ** Or, if the current delete will not cause a rebalance, then the cursor
  ** will be left in CURSOR_SKIPNEXT state pointing to the entry immediately
  ** before or after the deleted entry. In this case set bSkipnext to true.
  **
  ** A rebalance is anticipated when the cell is on a non-leaf page, when
  ** removing the cell (plus 2 more bytes) would leave more than two thirds
  ** of the usable page size free, or when it is the only cell on the
  ** page.  */
  if( bPreserve ){
    if( !pPage->leaf
     || (pPage->nFree+cellSizePtr(pPage,pCell)+2)>(int)(pBt->usableSize*2/3)
     || pPage->nCell==1  /* See dbfuzz001.test for a test case */
    ){
      /* A b-tree rebalance will be required after deleting this entry.
      ** Save the cursor key.  */
      rc = saveCursorKey(pCur);
      if( rc ) return rc;
    }else{
      bSkipnext = 1;
    }
  }

  /* If the page containing the entry to delete is not a leaf page, move
  ** the cursor to the largest entry in the tree that is smaller than
  ** the entry being deleted. This cell will replace the cell being deleted
  ** from the internal node. The 'previous' entry is used for this instead
  ** of the 'next' entry, as the previous entry is always a part of the
  ** sub-tree headed by the child page of the cell being deleted. This makes
  ** balancing the tree following the delete operation easier.  */
  if( !pPage->leaf ){
    rc = sqlite3BtreePrevious(pCur, 0);
    assert( rc!=SQLITE_DONE );
    if( rc ) return rc;
  }

  /* Save the positions of any other cursors open on this table before
  ** making any modifications.  */
  if( pCur->curFlags & BTCF_Multiple ){
    rc = saveAllCursors(pBt, pCur->pgnoRoot, pCur);
    if( rc ) return rc;
  }

  /* If this is a delete operation to remove a row from a table b-tree,
  ** invalidate any incrblob cursors open on the row being deleted.  */
  if( pCur->pKeyInfo==0 ){
    invalidateIncrblobCursors(p, pCur->pgnoRoot, pCur->info.nKey, 0);
  }

  /* Make the page containing the entry to be deleted writable. Then free any
  ** overflow pages associated with the entry and finally remove the cell
  ** itself from within the page.  An error from clearCell() is carried
  ** into dropCell() through &rc and checked afterwards.  */
  rc = sqlite3PagerWrite(pPage->pDbPage);
  if( rc ) return rc;
  rc = clearCell(pPage, pCell, &info);
  dropCell(pPage, iCellIdx, info.nSize, &rc);
  if( rc ) return rc;

  /* If the cell deleted was not located on a leaf page, then the cursor
  ** is currently pointing to the largest entry in the sub-tree headed
  ** by the child-page of the cell that was just deleted from an internal
  ** node. The cell from the leaf node needs to be moved to the internal
  ** node to replace the deleted cell.  */
  if( !pPage->leaf ){
    MemPage *pLeaf = pCur->pPage;      /* Leaf supplying the replacement */
    int nCell;                         /* Size of the replacement cell */
    Pgno n;                            /* Page number handed to insertCell() */
    unsigned char *pTmp;               /* Buffer handed to insertCell() */

    if( pLeaf->nFree<0 ){
      rc = btreeComputeFreeSpace(pLeaf);
      if( rc ) return rc;
    }
    /* n is the page directly below the internal node on the path from
    ** that node down to the leaf. */
    if( iCellDepth<pCur->iPage-1 ){
      n = pCur->apPage[iCellDepth+1]->pgno;
    }else{
      n = pCur->pPage->pgno;
    }
    pCell = findCell(pLeaf, pLeaf->nCell-1);
    /* The insertCell() call below reads from pCell-4, so the cell must
    ** start at least 4 bytes into the page buffer. */
    if( pCell<&pLeaf->aData[4] ) return SQLITE_CORRUPT_BKPT;
    nCell = pLeaf->xCellSize(pLeaf, pCell);
    assert( MX_CELL_SIZE(pBt) >= nCell );
    pTmp = pBt->pTmpSpace;
    assert( pTmp!=0 );
    rc = sqlite3PagerWrite(pLeaf->pDbPage);
    if( rc==SQLITE_OK ){
      /* NOTE(review): presumably insertCell() stores n as the child page
      ** number in the 4 leading bytes and uses pTmp as scratch space -
      ** confirm against insertCell(). */
      insertCell(pPage, iCellIdx, pCell-4, nCell+4, pTmp, n, &rc);
    }
    dropCell(pLeaf, pLeaf->nCell-1, nCell, &rc);
    if( rc ) return rc;
  }

  /* Balance the tree. If the entry deleted was located on a leaf page,
  ** then the cursor still points to that page. In this case the first
  ** call to balance() repairs the tree, and the if(...) condition is
  ** never true.
  **
  ** Otherwise, if the entry deleted was on an internal node page, then
  ** pCur is pointing to the leaf page from which a cell was removed to
  ** replace the cell deleted from the internal node. This is slightly
  ** tricky as the leaf node may be underfull, and the internal node may
  ** be either under or overfull. In this case run the balancing algorithm
  ** on the leaf node first. If the balance proceeds far enough up the
  ** tree that we can be sure that any problem in the internal node has
  ** been corrected, so be it. Otherwise, after balancing the leaf node,
  ** walk the cursor up the tree to the internal node and balance it as
  ** well.  */
  rc = balance(pCur);
  if( rc==SQLITE_OK && pCur->iPage>iCellDepth ){
    /* Release every page below the internal node, leaving the cursor
    ** positioned on the internal node itself. */
    releasePageNotNull(pCur->pPage);
    pCur->iPage--;
    while( pCur->iPage>iCellDepth ){
      releasePage(pCur->apPage[pCur->iPage--]);
    }
    pCur->pPage = pCur->apPage[pCur->iPage];
    rc = balance(pCur);
  }

  if( rc==SQLITE_OK ){
    if( bSkipnext ){
      /* No rebalance was expected.  Leave the cursor on the page next to
      ** the slot of the deleted entry.  If the deleted cell was the last
      ** one on the page, step back to the new last cell and set
      ** skipNext to -1; otherwise the cell now at iCellIdx is the
      ** successor and skipNext is set to +1. */
      assert( bPreserve && (pCur->iPage==iCellDepth || CORRUPT_DB) );
      assert( pPage==pCur->pPage || CORRUPT_DB );
      assert( (pPage->nCell>0 || CORRUPT_DB) && iCellIdx<=pPage->nCell );
      pCur->eState = CURSOR_SKIPNEXT;
      if( iCellIdx>=pPage->nCell ){
        pCur->skipNext = -1;
        pCur->ix = pPage->nCell-1;
      }else{
        pCur->skipNext = 1;
      }
    }else{
      rc = moveToRoot(pCur);
      if( bPreserve ){
        /* The key was saved earlier by saveCursorKey().  Mark the cursor
        ** as needing a seek to restore its position. */
        btreeReleaseAllCursorPages(pCur);
        pCur->eState = CURSOR_REQUIRESEEK;
      }
      /* SQLITE_EMPTY from moveToRoot() is not treated as an error */
      if( rc==SQLITE_EMPTY ) rc = SQLITE_OK;
    }
  }
  return rc;
}
9104 
/*
** Create a new BTree table.  Write into *piTable the page
** number for the root page of the new table.
**
** The type of table is determined by the flags parameter.  Only the
** following values of flags are currently in use.  Other values for
** flags might not work:
**
**     BTREE_INTKEY|BTREE_LEAFDATA     Used for SQL tables with rowid keys
**     BTREE_ZERODATA                  Used for SQL indices
**
** Only the BTREE_INTKEY bit of createTabFlags is examined by this
** routine.  If it is set the new root page is an empty intkey leaf,
** otherwise it is an empty zerodata (index) leaf.
**
** Returns SQLITE_OK on success or an SQLite error code.  *piTable is
** written only on success.
*/
static int btreeCreateTable(Btree *p, Pgno *piTable, int createTabFlags){
  BtShared *pBt = p->pBt;
  MemPage *pRoot;        /* The new root page */
  Pgno pgnoRoot;         /* Page number of the new root page */
  int rc;
  int ptfFlags;          /* Page-type flags for the root page of new table */

  assert( sqlite3BtreeHoldsMutex(p) );
  assert( pBt->inTransaction==TRANS_WRITE );
  assert( (pBt->btsFlags & BTS_READ_ONLY)==0 );

#ifdef SQLITE_OMIT_AUTOVACUUM
  /* Without auto-vacuum support any page will do as the root page */
  rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
  if( rc ){
    return rc;
  }
#else
  if( pBt->autoVacuum ){
    Pgno pgnoMove;      /* Move a page here to make room for the root-page */
    MemPage *pPageMove; /* The page to move to. */

    /* Creating a new table may probably require moving an existing database
    ** page to make room for the new table's root page. In case this page
    ** turns out to be an overflow page, delete all overflow page-map caches
    ** held by open cursors.
    */
    invalidateAllOverflowCache(pBt);

    /* Read the value of meta[3] from the database to determine where the
    ** root page of the new table should go. meta[3] is the largest root-page
    ** created so far, so the new root-page is (meta[3]+1).
    */
    sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &pgnoRoot);
    if( pgnoRoot>btreePagecount(pBt) ){
      return SQLITE_CORRUPT_BKPT;
    }
    pgnoRoot++;

    /* The new root-page may not be allocated on a pointer-map page, or the
    ** PENDING_BYTE page.
    */
    while( pgnoRoot==PTRMAP_PAGENO(pBt, pgnoRoot) ||
        pgnoRoot==PENDING_BYTE_PAGE(pBt) ){
      pgnoRoot++;
    }
    assert( pgnoRoot>=3 );

    /* Allocate a page. The page that currently resides at pgnoRoot will
    ** be moved to the allocated page (unless the allocated page happens
    ** to reside at pgnoRoot).
    */
    rc = allocateBtreePage(pBt, &pPageMove, &pgnoMove, pgnoRoot, BTALLOC_EXACT);
    if( rc!=SQLITE_OK ){
      return rc;
    }

    if( pgnoMove!=pgnoRoot ){
      /* pgnoRoot is the page that will be used for the root-page of
      ** the new table (assuming an error did not occur). But we were
      ** allocated pgnoMove. If required (i.e. if it was not allocated
      ** by extending the file), the current page at position pgnoMove
      ** is already journaled.
      */
      u8 eType = 0;          /* Pointer-map type of the page at pgnoRoot */
      Pgno iPtrPage = 0;     /* Pointer-map parent of the page at pgnoRoot */

      /* Save the positions of any open cursors. This is required in
      ** case they are holding a reference to an xFetch reference
      ** corresponding to page pgnoRoot.  The reference to pPageMove is
      ** dropped whether or not saveAllCursors() succeeds. */
      rc = saveAllCursors(pBt, 0, 0);
      releasePage(pPageMove);
      if( rc!=SQLITE_OK ){
        return rc;
      }

      /* Move the page currently at pgnoRoot to pgnoMove. */
      rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0);
      if( rc!=SQLITE_OK ){
        return rc;
      }
      rc = ptrmapGet(pBt, pgnoRoot, &eType, &iPtrPage);
      /* A page that the pointer-map records as a root page or a free page
      ** cannot be the page being displaced: treat that as corruption. */
      if( eType==PTRMAP_ROOTPAGE || eType==PTRMAP_FREEPAGE ){
        rc = SQLITE_CORRUPT_BKPT;
      }
      if( rc!=SQLITE_OK ){
        releasePage(pRoot);
        return rc;
      }
      assert( eType!=PTRMAP_ROOTPAGE );
      assert( eType!=PTRMAP_FREEPAGE );
      rc = relocatePage(pBt, pRoot, eType, iPtrPage, pgnoMove, 0);
      releasePage(pRoot);

      /* Obtain the page at pgnoRoot */
      if( rc!=SQLITE_OK ){
        return rc;
      }
      rc = btreeGetPage(pBt, pgnoRoot, &pRoot, 0);
      if( rc!=SQLITE_OK ){
        return rc;
      }
      rc = sqlite3PagerWrite(pRoot->pDbPage);
      if( rc!=SQLITE_OK ){
        releasePage(pRoot);
        return rc;
      }
    }else{
      /* The allocator returned exactly the page that was asked for */
      pRoot = pPageMove;
    }

    /* Update the pointer-map and meta-data with the new root-page number. */
    ptrmapPut(pBt, pgnoRoot, PTRMAP_ROOTPAGE, 0, &rc);
    if( rc ){
      releasePage(pRoot);
      return rc;
    }

    /* When the new root page was allocated, page 1 was made writable in
    ** order either to increase the database filesize, or to decrement the
    ** freelist count.  Hence, the sqlite3BtreeUpdateMeta() call cannot fail.
    **
    ** NOTE(review): the literal 4 is presumably the same slot as
    ** BTREE_LARGEST_ROOT_PAGE, which is read above - confirm.
    */
    assert( sqlite3PagerIswriteable(pBt->pPage1->pDbPage) );
    rc = sqlite3BtreeUpdateMeta(p, 4, pgnoRoot);
    if( NEVER(rc) ){
      releasePage(pRoot);
      return rc;
    }

  }else{
    /* Not an auto-vacuum database: any page will do as the root page */
    rc = allocateBtreePage(pBt, &pRoot, &pgnoRoot, 1, 0);
    if( rc ) return rc;
  }
#endif
  assert( sqlite3PagerIswriteable(pRoot->pDbPage) );
  /* Initialize the root as an empty leaf page of the requested kind */
  if( createTabFlags & BTREE_INTKEY ){
    ptfFlags = PTF_INTKEY | PTF_LEAFDATA | PTF_LEAF;
  }else{
    ptfFlags = PTF_ZERODATA | PTF_LEAF;
  }
  zeroPage(pRoot, ptfFlags);
  sqlite3PagerUnref(pRoot->pDbPage);
  assert( (pBt->openFlags & BTREE_SINGLE)==0 || pgnoRoot==2 );
  *piTable = pgnoRoot;
  return SQLITE_OK;
}
/*
** Public entry point for creating a new table or index b-tree.  This
** brackets btreeCreateTable() with sqlite3BtreeEnter() and
** sqlite3BtreeLeave() and returns whatever btreeCreateTable() returns.
*/
int sqlite3BtreeCreateTable(Btree *p, Pgno *piTable, int flags){
  int rcCreate;                 /* Result of btreeCreateTable() */

  sqlite3BtreeEnter(p);
  rcCreate = btreeCreateTable(p, piTable, flags);
  sqlite3BtreeLeave(p);

  return rcCreate;
}
9268 
9269 /*
9270 ** Erase the given database page and all its children.  Return
9271 ** the page to the freelist.
9272 */
clearDatabasePage(BtShared * pBt,Pgno pgno,int freePageFlag,int * pnChange)9273 static int clearDatabasePage(
9274   BtShared *pBt,           /* The BTree that contains the table */
9275   Pgno pgno,               /* Page number to clear */
9276   int freePageFlag,        /* Deallocate page if true */
9277   int *pnChange            /* Add number of Cells freed to this counter */
9278 ){
9279   MemPage *pPage;
9280   int rc;
9281   unsigned char *pCell;
9282   int i;
9283   int hdr;
9284   CellInfo info;
9285 
9286   assert( sqlite3_mutex_held(pBt->mutex) );
9287   if( pgno>btreePagecount(pBt) ){
9288     return SQLITE_CORRUPT_BKPT;
9289   }
9290   rc = getAndInitPage(pBt, pgno, &pPage, 0, 0);
9291   if( rc ) return rc;
9292   if( pPage->bBusy ){
9293     rc = SQLITE_CORRUPT_BKPT;
9294     goto cleardatabasepage_out;
9295   }
9296   pPage->bBusy = 1;
9297   hdr = pPage->hdrOffset;
9298   for(i=0; i<pPage->nCell; i++){
9299     pCell = findCell(pPage, i);
9300     if( !pPage->leaf ){
9301       rc = clearDatabasePage(pBt, get4byte(pCell), 1, pnChange);
9302       if( rc ) goto cleardatabasepage_out;
9303     }
9304     rc = clearCell(pPage, pCell, &info);
9305     if( rc ) goto cleardatabasepage_out;
9306   }
9307   if( !pPage->leaf ){
9308     rc = clearDatabasePage(pBt, get4byte(&pPage->aData[hdr+8]), 1, pnChange);
9309     if( rc ) goto cleardatabasepage_out;
9310   }else if( pnChange ){
9311     assert( pPage->intKey || CORRUPT_DB );
9312     testcase( !pPage->intKey );
9313     *pnChange += pPage->nCell;
9314   }
9315   if( freePageFlag ){
9316     freePage(pPage, &rc);
9317   }else if( (rc = sqlite3PagerWrite(pPage->pDbPage))==0 ){
9318     zeroPage(pPage, pPage->aData[hdr] | PTF_LEAF);
9319   }
9320 
9321 cleardatabasepage_out:
9322   pPage->bBusy = 0;
9323   releasePage(pPage);
9324   return rc;
9325 }
9326 
9327 /*
9328 ** Delete all information from a single table in the database.  iTable is
9329 ** the page number of the root of the table.  After this routine returns,
9330 ** the root page is empty, but still exists.
9331 **
9332 ** This routine will fail with SQLITE_LOCKED if there are any open
9333 ** read cursors on the table.  Open write cursors are moved to the
9334 ** root of the table.
9335 **
9336 ** If pnChange is not NULL, then table iTable must be an intkey table. The
9337 ** integer value pointed to by pnChange is incremented by the number of
9338 ** entries in the table.
9339 */
sqlite3BtreeClearTable(Btree * p,int iTable,int * pnChange)9340 int sqlite3BtreeClearTable(Btree *p, int iTable, int *pnChange){
9341   int rc;
9342   BtShared *pBt = p->pBt;
9343   sqlite3BtreeEnter(p);
9344   assert( p->inTrans==TRANS_WRITE );
9345 
9346   rc = saveAllCursors(pBt, (Pgno)iTable, 0);
9347 
9348   if( SQLITE_OK==rc ){
9349     /* Invalidate all incrblob cursors open on table iTable (assuming iTable
9350     ** is the root of a table b-tree - if it is not, the following call is
9351     ** a no-op).  */
9352     invalidateIncrblobCursors(p, (Pgno)iTable, 0, 1);
9353     rc = clearDatabasePage(pBt, (Pgno)iTable, 0, pnChange);
9354   }
9355   sqlite3BtreeLeave(p);
9356   return rc;
9357 }
9358 
9359 /*
9360 ** Delete all information from the single table that pCur is open on.
9361 **
9362 ** This routine only work for pCur on an ephemeral table.
9363 */
sqlite3BtreeClearTableOfCursor(BtCursor * pCur)9364 int sqlite3BtreeClearTableOfCursor(BtCursor *pCur){
9365   return sqlite3BtreeClearTable(pCur->pBtree, pCur->pgnoRoot, 0);
9366 }
9367 
9368 /*
9369 ** Erase all information in a table and add the root of the table to
9370 ** the freelist.  Except, the root of the principle table (the one on
9371 ** page 1) is never added to the freelist.
9372 **
9373 ** This routine will fail with SQLITE_LOCKED if there are any open
9374 ** cursors on the table.
9375 **
9376 ** If AUTOVACUUM is enabled and the page at iTable is not the last
9377 ** root page in the database file, then the last root page
9378 ** in the database file is moved into the slot formerly occupied by
9379 ** iTable and that last slot formerly occupied by the last root page
9380 ** is added to the freelist instead of iTable.  In this say, all
9381 ** root pages are kept at the beginning of the database file, which
9382 ** is necessary for AUTOVACUUM to work right.  *piMoved is set to the
9383 ** page number that used to be the last root page in the file before
9384 ** the move.  If no page gets moved, *piMoved is set to 0.
9385 ** The last root page is recorded in meta[3] and the value of
9386 ** meta[3] is updated by this procedure.
9387 */
btreeDropTable(Btree * p,Pgno iTable,int * piMoved)9388 static int btreeDropTable(Btree *p, Pgno iTable, int *piMoved){
9389   int rc;
9390   MemPage *pPage = 0;
9391   BtShared *pBt = p->pBt;
9392 
9393   assert( sqlite3BtreeHoldsMutex(p) );
9394   assert( p->inTrans==TRANS_WRITE );
9395   assert( iTable>=2 );
9396   if( iTable>btreePagecount(pBt) ){
9397     return SQLITE_CORRUPT_BKPT;
9398   }
9399 
9400   rc = btreeGetPage(pBt, (Pgno)iTable, &pPage, 0);
9401   if( rc ) return rc;
9402   rc = sqlite3BtreeClearTable(p, iTable, 0);
9403   if( rc ){
9404     releasePage(pPage);
9405     return rc;
9406   }
9407 
9408   *piMoved = 0;
9409 
9410 #ifdef SQLITE_OMIT_AUTOVACUUM
9411   freePage(pPage, &rc);
9412   releasePage(pPage);
9413 #else
9414   if( pBt->autoVacuum ){
9415     Pgno maxRootPgno;
9416     sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &maxRootPgno);
9417 
9418     if( iTable==maxRootPgno ){
9419       /* If the table being dropped is the table with the largest root-page
9420       ** number in the database, put the root page on the free list.
9421       */
9422       freePage(pPage, &rc);
9423       releasePage(pPage);
9424       if( rc!=SQLITE_OK ){
9425         return rc;
9426       }
9427     }else{
9428       /* The table being dropped does not have the largest root-page
9429       ** number in the database. So move the page that does into the
9430       ** gap left by the deleted root-page.
9431       */
9432       MemPage *pMove;
9433       releasePage(pPage);
9434       rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0);
9435       if( rc!=SQLITE_OK ){
9436         return rc;
9437       }
9438       rc = relocatePage(pBt, pMove, PTRMAP_ROOTPAGE, 0, iTable, 0);
9439       releasePage(pMove);
9440       if( rc!=SQLITE_OK ){
9441         return rc;
9442       }
9443       pMove = 0;
9444       rc = btreeGetPage(pBt, maxRootPgno, &pMove, 0);
9445       freePage(pMove, &rc);
9446       releasePage(pMove);
9447       if( rc!=SQLITE_OK ){
9448         return rc;
9449       }
9450       *piMoved = maxRootPgno;
9451     }
9452 
9453     /* Set the new 'max-root-page' value in the database header. This
9454     ** is the old value less one, less one more if that happens to
9455     ** be a root-page number, less one again if that is the
9456     ** PENDING_BYTE_PAGE.
9457     */
9458     maxRootPgno--;
9459     while( maxRootPgno==PENDING_BYTE_PAGE(pBt)
9460            || PTRMAP_ISPAGE(pBt, maxRootPgno) ){
9461       maxRootPgno--;
9462     }
9463     assert( maxRootPgno!=PENDING_BYTE_PAGE(pBt) );
9464 
9465     rc = sqlite3BtreeUpdateMeta(p, 4, maxRootPgno);
9466   }else{
9467     freePage(pPage, &rc);
9468     releasePage(pPage);
9469   }
9470 #endif
9471   return rc;
9472 }
/*
** Public entry point for dropping a table.  The call to btreeDropTable()
** is bracketed by sqlite3BtreeEnter()/sqlite3BtreeLeave() on p, and the
** result code of btreeDropTable() is returned unchanged.
*/
int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved){
  int rcDrop;                       /* Result of btreeDropTable() */

  sqlite3BtreeEnter(p);
  rcDrop = btreeDropTable(p, iTable, piMoved);
  sqlite3BtreeLeave(p);

  return rcDrop;
}
9480 
9481 
9482 /*
9483 ** This function may only be called if the b-tree connection already
9484 ** has a read or write transaction open on the database.
9485 **
9486 ** Read the meta-information out of a database file.  Meta[0]
9487 ** is the number of free pages currently in the database.  Meta[1]
9488 ** through meta[15] are available for use by higher layers.  Meta[0]
9489 ** is read-only, the others are read/write.
9490 **
9491 ** The schema layer numbers meta values differently.  At the schema
9492 ** layer (and the SetCookie and ReadCookie opcodes) the number of
9493 ** free pages is not visible.  So Cookie[0] is the same as Meta[1].
9494 **
9495 ** This routine treats Meta[BTREE_DATA_VERSION] as a special case.  Instead
9496 ** of reading the value out of the header, it instead loads the "DataVersion"
9497 ** from the pager.  The BTREE_DATA_VERSION value is not actually stored in the
9498 ** database file.  It is a number computed by the pager.  But its access
9499 ** pattern is the same as header meta values, and so it is convenient to
9500 ** read it from this routine.
9501 */
sqlite3BtreeGetMeta(Btree * p,int idx,u32 * pMeta)9502 void sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta){
9503   BtShared *pBt = p->pBt;
9504 
9505   sqlite3BtreeEnter(p);
9506   assert( p->inTrans>TRANS_NONE );
9507   assert( SQLITE_OK==querySharedCacheTableLock(p, SCHEMA_ROOT, READ_LOCK) );
9508   assert( pBt->pPage1 );
9509   assert( idx>=0 && idx<=15 );
9510 
9511   if( idx==BTREE_DATA_VERSION ){
9512     *pMeta = sqlite3PagerDataVersion(pBt->pPager) + p->iDataVersion;
9513   }else{
9514     *pMeta = get4byte(&pBt->pPage1->aData[36 + idx*4]);
9515   }
9516 
9517   /* If auto-vacuum is disabled in this build and this is an auto-vacuum
9518   ** database, mark the database as read-only.  */
9519 #ifdef SQLITE_OMIT_AUTOVACUUM
9520   if( idx==BTREE_LARGEST_ROOT_PAGE && *pMeta>0 ){
9521     pBt->btsFlags |= BTS_READ_ONLY;
9522   }
9523 #endif
9524 
9525   sqlite3BtreeLeave(p);
9526 }
9527 
9528 /*
9529 ** Write meta-information back into the database.  Meta[0] is
9530 ** read-only and may not be written.
9531 */
sqlite3BtreeUpdateMeta(Btree * p,int idx,u32 iMeta)9532 int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta){
9533   BtShared *pBt = p->pBt;
9534   unsigned char *pP1;
9535   int rc;
9536   assert( idx>=1 && idx<=15 );
9537   sqlite3BtreeEnter(p);
9538   assert( p->inTrans==TRANS_WRITE );
9539   assert( pBt->pPage1!=0 );
9540   pP1 = pBt->pPage1->aData;
9541   rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
9542   if( rc==SQLITE_OK ){
9543     put4byte(&pP1[36 + idx*4], iMeta);
9544 #ifndef SQLITE_OMIT_AUTOVACUUM
9545     if( idx==BTREE_INCR_VACUUM ){
9546       assert( pBt->autoVacuum || iMeta==0 );
9547       assert( iMeta==0 || iMeta==1 );
9548       pBt->incrVacuum = (u8)iMeta;
9549     }
9550 #endif
9551   }
9552   sqlite3BtreeLeave(p);
9553   return rc;
9554 }
9555 
9556 /*
9557 ** The first argument, pCur, is a cursor opened on some b-tree. Count the
9558 ** number of entries in the b-tree and write the result to *pnEntry.
9559 **
9560 ** SQLITE_OK is returned if the operation is successfully executed.
9561 ** Otherwise, if an error is encountered (i.e. an IO error or database
9562 ** corruption) an SQLite error code is returned.
9563 */
sqlite3BtreeCount(sqlite3 * db,BtCursor * pCur,i64 * pnEntry)9564 int sqlite3BtreeCount(sqlite3 *db, BtCursor *pCur, i64 *pnEntry){
9565   i64 nEntry = 0;                      /* Value to return in *pnEntry */
9566   int rc;                              /* Return code */
9567 
9568   rc = moveToRoot(pCur);
9569   if( rc==SQLITE_EMPTY ){
9570     *pnEntry = 0;
9571     return SQLITE_OK;
9572   }
9573 
9574   /* Unless an error occurs, the following loop runs one iteration for each
9575   ** page in the B-Tree structure (not including overflow pages).
9576   */
9577   while( rc==SQLITE_OK && !AtomicLoad(&db->u1.isInterrupted) ){
9578     int iIdx;                          /* Index of child node in parent */
9579     MemPage *pPage;                    /* Current page of the b-tree */
9580 
9581     /* If this is a leaf page or the tree is not an int-key tree, then
9582     ** this page contains countable entries. Increment the entry counter
9583     ** accordingly.
9584     */
9585     pPage = pCur->pPage;
9586     if( pPage->leaf || !pPage->intKey ){
9587       nEntry += pPage->nCell;
9588     }
9589 
9590     /* pPage is a leaf node. This loop navigates the cursor so that it
9591     ** points to the first interior cell that it points to the parent of
9592     ** the next page in the tree that has not yet been visited. The
9593     ** pCur->aiIdx[pCur->iPage] value is set to the index of the parent cell
9594     ** of the page, or to the number of cells in the page if the next page
9595     ** to visit is the right-child of its parent.
9596     **
9597     ** If all pages in the tree have been visited, return SQLITE_OK to the
9598     ** caller.
9599     */
9600     if( pPage->leaf ){
9601       do {
9602         if( pCur->iPage==0 ){
9603           /* All pages of the b-tree have been visited. Return successfully. */
9604           *pnEntry = nEntry;
9605           return moveToRoot(pCur);
9606         }
9607         moveToParent(pCur);
9608       }while ( pCur->ix>=pCur->pPage->nCell );
9609 
9610       pCur->ix++;
9611       pPage = pCur->pPage;
9612     }
9613 
9614     /* Descend to the child node of the cell that the cursor currently
9615     ** points at. This is the right-child if (iIdx==pPage->nCell).
9616     */
9617     iIdx = pCur->ix;
9618     if( iIdx==pPage->nCell ){
9619       rc = moveToChild(pCur, get4byte(&pPage->aData[pPage->hdrOffset+8]));
9620     }else{
9621       rc = moveToChild(pCur, get4byte(findCell(pPage, iIdx)));
9622     }
9623   }
9624 
9625   /* An error has occurred. Return an error code. */
9626   return rc;
9627 }
9628 
9629 /*
9630 ** Return the pager associated with a BTree.  This routine is used for
9631 ** testing and debugging only.
9632 */
sqlite3BtreePager(Btree * p)9633 Pager *sqlite3BtreePager(Btree *p){
9634   return p->pBt->pPager;
9635 }
9636 
9637 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
9638 /*
9639 ** Append a message to the error message string.
9640 */
checkAppendMsg(IntegrityCk * pCheck,const char * zFormat,...)9641 static void checkAppendMsg(
9642   IntegrityCk *pCheck,
9643   const char *zFormat,
9644   ...
9645 ){
9646   va_list ap;
9647   if( !pCheck->mxErr ) return;
9648   pCheck->mxErr--;
9649   pCheck->nErr++;
9650   va_start(ap, zFormat);
9651   if( pCheck->errMsg.nChar ){
9652     sqlite3_str_append(&pCheck->errMsg, "\n", 1);
9653   }
9654   if( pCheck->zPfx ){
9655     sqlite3_str_appendf(&pCheck->errMsg, pCheck->zPfx, pCheck->v1, pCheck->v2);
9656   }
9657   sqlite3_str_vappendf(&pCheck->errMsg, zFormat, ap);
9658   va_end(ap);
9659   if( pCheck->errMsg.accError==SQLITE_NOMEM ){
9660     pCheck->bOomFault = 1;
9661   }
9662 }
9663 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
9664 
9665 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
9666 
9667 /*
9668 ** Return non-zero if the bit in the IntegrityCk.aPgRef[] array that
9669 ** corresponds to page iPg is already set.
9670 */
getPageReferenced(IntegrityCk * pCheck,Pgno iPg)9671 static int getPageReferenced(IntegrityCk *pCheck, Pgno iPg){
9672   assert( iPg<=pCheck->nPage && sizeof(pCheck->aPgRef[0])==1 );
9673   return (pCheck->aPgRef[iPg/8] & (1 << (iPg & 0x07)));
9674 }
9675 
9676 /*
9677 ** Set the bit in the IntegrityCk.aPgRef[] array that corresponds to page iPg.
9678 */
setPageReferenced(IntegrityCk * pCheck,Pgno iPg)9679 static void setPageReferenced(IntegrityCk *pCheck, Pgno iPg){
9680   assert( iPg<=pCheck->nPage && sizeof(pCheck->aPgRef[0])==1 );
9681   pCheck->aPgRef[iPg/8] |= (1 << (iPg & 0x07));
9682 }
9683 
9684 
9685 /*
9686 ** Add 1 to the reference count for page iPage.  If this is the second
9687 ** reference to the page, add an error message to pCheck->zErrMsg.
9688 ** Return 1 if there are 2 or more references to the page and 0 if
9689 ** if this is the first reference to the page.
9690 **
9691 ** Also check that the page number is in bounds.
9692 */
checkRef(IntegrityCk * pCheck,Pgno iPage)9693 static int checkRef(IntegrityCk *pCheck, Pgno iPage){
9694   if( iPage>pCheck->nPage || iPage==0 ){
9695     checkAppendMsg(pCheck, "invalid page number %d", iPage);
9696     return 1;
9697   }
9698   if( getPageReferenced(pCheck, iPage) ){
9699     checkAppendMsg(pCheck, "2nd reference to page %d", iPage);
9700     return 1;
9701   }
9702   if( AtomicLoad(&pCheck->db->u1.isInterrupted) ) return 1;
9703   setPageReferenced(pCheck, iPage);
9704   return 0;
9705 }
9706 
9707 #ifndef SQLITE_OMIT_AUTOVACUUM
9708 /*
9709 ** Check that the entry in the pointer-map for page iChild maps to
9710 ** page iParent, pointer type ptrType. If not, append an error message
9711 ** to pCheck.
9712 */
checkPtrmap(IntegrityCk * pCheck,Pgno iChild,u8 eType,Pgno iParent)9713 static void checkPtrmap(
9714   IntegrityCk *pCheck,   /* Integrity check context */
9715   Pgno iChild,           /* Child page number */
9716   u8 eType,              /* Expected pointer map type */
9717   Pgno iParent           /* Expected pointer map parent page number */
9718 ){
9719   int rc;
9720   u8 ePtrmapType;
9721   Pgno iPtrmapParent;
9722 
9723   rc = ptrmapGet(pCheck->pBt, iChild, &ePtrmapType, &iPtrmapParent);
9724   if( rc!=SQLITE_OK ){
9725     if( rc==SQLITE_NOMEM || rc==SQLITE_IOERR_NOMEM ) pCheck->bOomFault = 1;
9726     checkAppendMsg(pCheck, "Failed to read ptrmap key=%d", iChild);
9727     return;
9728   }
9729 
9730   if( ePtrmapType!=eType || iPtrmapParent!=iParent ){
9731     checkAppendMsg(pCheck,
9732       "Bad ptr map entry key=%d expected=(%d,%d) got=(%d,%d)",
9733       iChild, eType, iParent, ePtrmapType, iPtrmapParent);
9734   }
9735 }
9736 #endif
9737 
9738 /*
9739 ** Check the integrity of the freelist or of an overflow page list.
9740 ** Verify that the number of pages on the list is N.
9741 */
checkList(IntegrityCk * pCheck,int isFreeList,Pgno iPage,u32 N)9742 static void checkList(
9743   IntegrityCk *pCheck,  /* Integrity checking context */
9744   int isFreeList,       /* True for a freelist.  False for overflow page list */
9745   Pgno iPage,           /* Page number for first page in the list */
9746   u32 N                 /* Expected number of pages in the list */
9747 ){
9748   int i;
9749   u32 expected = N;
9750   int nErrAtStart = pCheck->nErr;
9751   while( iPage!=0 && pCheck->mxErr ){
9752     DbPage *pOvflPage;
9753     unsigned char *pOvflData;
9754     if( checkRef(pCheck, iPage) ) break;
9755     N--;
9756     if( sqlite3PagerGet(pCheck->pPager, (Pgno)iPage, &pOvflPage, 0) ){
9757       checkAppendMsg(pCheck, "failed to get page %d", iPage);
9758       break;
9759     }
9760     pOvflData = (unsigned char *)sqlite3PagerGetData(pOvflPage);
9761     if( isFreeList ){
9762       u32 n = (u32)get4byte(&pOvflData[4]);
9763 #ifndef SQLITE_OMIT_AUTOVACUUM
9764       if( pCheck->pBt->autoVacuum ){
9765         checkPtrmap(pCheck, iPage, PTRMAP_FREEPAGE, 0);
9766       }
9767 #endif
9768       if( n>pCheck->pBt->usableSize/4-2 ){
9769         checkAppendMsg(pCheck,
9770            "freelist leaf count too big on page %d", iPage);
9771         N--;
9772       }else{
9773         for(i=0; i<(int)n; i++){
9774           Pgno iFreePage = get4byte(&pOvflData[8+i*4]);
9775 #ifndef SQLITE_OMIT_AUTOVACUUM
9776           if( pCheck->pBt->autoVacuum ){
9777             checkPtrmap(pCheck, iFreePage, PTRMAP_FREEPAGE, 0);
9778           }
9779 #endif
9780           checkRef(pCheck, iFreePage);
9781         }
9782         N -= n;
9783       }
9784     }
9785 #ifndef SQLITE_OMIT_AUTOVACUUM
9786     else{
9787       /* If this database supports auto-vacuum and iPage is not the last
9788       ** page in this overflow list, check that the pointer-map entry for
9789       ** the following page matches iPage.
9790       */
9791       if( pCheck->pBt->autoVacuum && N>0 ){
9792         i = get4byte(pOvflData);
9793         checkPtrmap(pCheck, i, PTRMAP_OVERFLOW2, iPage);
9794       }
9795     }
9796 #endif
9797     iPage = get4byte(pOvflData);
9798     sqlite3PagerUnref(pOvflPage);
9799   }
9800   if( N && nErrAtStart==pCheck->nErr ){
9801     checkAppendMsg(pCheck,
9802       "%s is %d but should be %d",
9803       isFreeList ? "size" : "overflow list length",
9804       expected-N, expected);
9805   }
9806 }
9807 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
9808 
/*
** A min-heap of unsigned 32-bit values stored in a flat array.
**
** aHeap[0] holds the number of elements currently on the heap.  The
** root element is aHeap[1], and the two daughters of aHeap[N] are
** aHeap[N*2] and aHeap[N*2+1].
**
** Heap property:  every node is less than or equal to both of its
** daughters.  Consequently the root aHeap[1] is always the smallest
** value on the heap.
**
** btreeHeapInsert() adds a value while preserving the heap property.
** btreeHeapPull() removes the root (the minimum) and rearranges the
** remaining nodes so that the heap property still holds.
**
** This heap is used for cell overlap and coverage testing.  Each u32
** entry represents the span of a cell or freeblock on a btree page.
** The upper 16 bits are the index of the first byte of a range and the
** lower 16 bits are the index of the last byte of that range.
**
** Insert x:  store it in the first unused slot, then move it toward
** the root for as long as its parent holds a larger value.
*/
static void btreeHeapInsert(u32 *aHeap, u32 x){
  u32 iNode = ++aHeap[0];           /* Slot currently holding x */
  aHeap[iNode] = x;
  while( iNode>1 ){
    u32 iParent = iNode/2;
    u32 parentVal = aHeap[iParent];
    if( parentVal<=x ) break;
    aHeap[iParent] = x;
    aHeap[iNode] = parentVal;
    iNode = iParent;
  }
}
/*
** Remove the smallest value from the min-heap aHeap[] (layout: aHeap[0]
** is the element count, aHeap[1] is the root) and write it to *pOut.
** Return 1 if a value was removed, or 0 if the heap was empty, in which
** case *pOut is left untouched.
**
** The last element is moved to the root and the vacated slot is filled
** with 0xffffffff.  That filler lets the sift-down loop read the
** right-hand daughter unconditionally, even when only the left-hand
** daughter is still part of the heap.
*/
static int btreeHeapPull(u32 *aHeap, u32 *pOut){
  u32 nOld = aHeap[0];              /* Element count before the pull */
  u32 iNode;                        /* Node being sifted down */
  u32 iChild;                       /* Smaller daughter of iNode */

  if( nOld==0 ) return 0;
  *pOut = aHeap[1];
  aHeap[1] = aHeap[nOld];
  aHeap[nOld] = 0xffffffff;
  aHeap[0]--;

  for(iNode=1; (iChild = iNode*2)<=aHeap[0]; iNode=iChild){
    u32 tmp;
    if( aHeap[iChild]>aHeap[iChild+1] ) iChild++;
    if( aHeap[iNode]<aHeap[iChild] ) break;
    tmp = aHeap[iNode];
    aHeap[iNode] = aHeap[iChild];
    aHeap[iChild] = tmp;
  }
  return 1;
}
9859 
9860 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
9861 /*
9862 ** Do various sanity checks on a single page of a tree.  Return
9863 ** the tree depth.  Root pages return 0.  Parents of root pages
9864 ** return 1, and so forth.
9865 **
9866 ** These checks are done:
9867 **
9868 **      1.  Make sure that cells and freeblocks do not overlap
9869 **          but combine to completely cover the page.
9870 **      2.  Make sure integer cell keys are in order.
9871 **      3.  Check the integrity of overflow pages.
9872 **      4.  Recursively call checkTreePage on all children.
9873 **      5.  Verify that the depth of all children is the same.
9874 */
checkTreePage(IntegrityCk * pCheck,Pgno iPage,i64 * piMinKey,i64 maxKey)9875 static int checkTreePage(
9876   IntegrityCk *pCheck,  /* Context for the sanity check */
9877   Pgno iPage,           /* Page number of the page to check */
9878   i64 *piMinKey,        /* Write minimum integer primary key here */
9879   i64 maxKey            /* Error if integer primary key greater than this */
9880 ){
9881   MemPage *pPage = 0;      /* The page being analyzed */
9882   int i;                   /* Loop counter */
9883   int rc;                  /* Result code from subroutine call */
9884   int depth = -1, d2;      /* Depth of a subtree */
9885   int pgno;                /* Page number */
9886   int nFrag;               /* Number of fragmented bytes on the page */
9887   int hdr;                 /* Offset to the page header */
9888   int cellStart;           /* Offset to the start of the cell pointer array */
9889   int nCell;               /* Number of cells */
9890   int doCoverageCheck = 1; /* True if cell coverage checking should be done */
9891   int keyCanBeEqual = 1;   /* True if IPK can be equal to maxKey
9892                            ** False if IPK must be strictly less than maxKey */
9893   u8 *data;                /* Page content */
9894   u8 *pCell;               /* Cell content */
9895   u8 *pCellIdx;            /* Next element of the cell pointer array */
9896   BtShared *pBt;           /* The BtShared object that owns pPage */
9897   u32 pc;                  /* Address of a cell */
9898   u32 usableSize;          /* Usable size of the page */
9899   u32 contentOffset;       /* Offset to the start of the cell content area */
9900   u32 *heap = 0;           /* Min-heap used for checking cell coverage */
9901   u32 x, prev = 0;         /* Next and previous entry on the min-heap */
9902   const char *saved_zPfx = pCheck->zPfx;
9903   int saved_v1 = pCheck->v1;
9904   int saved_v2 = pCheck->v2;
9905   u8 savedIsInit = 0;
9906 
9907   /* Check that the page exists
9908   */
9909   pBt = pCheck->pBt;
9910   usableSize = pBt->usableSize;
9911   if( iPage==0 ) return 0;
9912   if( checkRef(pCheck, iPage) ) return 0;
9913   pCheck->zPfx = "Page %u: ";
9914   pCheck->v1 = iPage;
9915   if( (rc = btreeGetPage(pBt, iPage, &pPage, 0))!=0 ){
9916     checkAppendMsg(pCheck,
9917        "unable to get the page. error code=%d", rc);
9918     goto end_of_check;
9919   }
9920 
9921   /* Clear MemPage.isInit to make sure the corruption detection code in
9922   ** btreeInitPage() is executed.  */
9923   savedIsInit = pPage->isInit;
9924   pPage->isInit = 0;
9925   if( (rc = btreeInitPage(pPage))!=0 ){
9926     assert( rc==SQLITE_CORRUPT );  /* The only possible error from InitPage */
9927     checkAppendMsg(pCheck,
9928                    "btreeInitPage() returns error code %d", rc);
9929     goto end_of_check;
9930   }
9931   if( (rc = btreeComputeFreeSpace(pPage))!=0 ){
9932     assert( rc==SQLITE_CORRUPT );
9933     checkAppendMsg(pCheck, "free space corruption", rc);
9934     goto end_of_check;
9935   }
9936   data = pPage->aData;
9937   hdr = pPage->hdrOffset;
9938 
9939   /* Set up for cell analysis */
9940   pCheck->zPfx = "On tree page %u cell %d: ";
9941   contentOffset = get2byteNotZero(&data[hdr+5]);
9942   assert( contentOffset<=usableSize );  /* Enforced by btreeInitPage() */
9943 
9944   /* EVIDENCE-OF: R-37002-32774 The two-byte integer at offset 3 gives the
9945   ** number of cells on the page. */
9946   nCell = get2byte(&data[hdr+3]);
9947   assert( pPage->nCell==nCell );
9948 
9949   /* EVIDENCE-OF: R-23882-45353 The cell pointer array of a b-tree page
9950   ** immediately follows the b-tree page header. */
9951   cellStart = hdr + 12 - 4*pPage->leaf;
9952   assert( pPage->aCellIdx==&data[cellStart] );
9953   pCellIdx = &data[cellStart + 2*(nCell-1)];
9954 
9955   if( !pPage->leaf ){
9956     /* Analyze the right-child page of internal pages */
9957     pgno = get4byte(&data[hdr+8]);
9958 #ifndef SQLITE_OMIT_AUTOVACUUM
9959     if( pBt->autoVacuum ){
9960       pCheck->zPfx = "On page %u at right child: ";
9961       checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage);
9962     }
9963 #endif
9964     depth = checkTreePage(pCheck, pgno, &maxKey, maxKey);
9965     keyCanBeEqual = 0;
9966   }else{
9967     /* For leaf pages, the coverage check will occur in the same loop
9968     ** as the other cell checks, so initialize the heap.  */
9969     heap = pCheck->heap;
9970     heap[0] = 0;
9971   }
9972 
9973   /* EVIDENCE-OF: R-02776-14802 The cell pointer array consists of K 2-byte
9974   ** integer offsets to the cell contents. */
9975   for(i=nCell-1; i>=0 && pCheck->mxErr; i--){
9976     CellInfo info;
9977 
9978     /* Check cell size */
9979     pCheck->v2 = i;
9980     assert( pCellIdx==&data[cellStart + i*2] );
9981     pc = get2byteAligned(pCellIdx);
9982     pCellIdx -= 2;
9983     if( pc<contentOffset || pc>usableSize-4 ){
9984       checkAppendMsg(pCheck, "Offset %d out of range %d..%d",
9985                              pc, contentOffset, usableSize-4);
9986       doCoverageCheck = 0;
9987       continue;
9988     }
9989     pCell = &data[pc];
9990     pPage->xParseCell(pPage, pCell, &info);
9991     if( pc+info.nSize>usableSize ){
9992       checkAppendMsg(pCheck, "Extends off end of page");
9993       doCoverageCheck = 0;
9994       continue;
9995     }
9996 
9997     /* Check for integer primary key out of range */
9998     if( pPage->intKey ){
9999       if( keyCanBeEqual ? (info.nKey > maxKey) : (info.nKey >= maxKey) ){
10000         checkAppendMsg(pCheck, "Rowid %lld out of order", info.nKey);
10001       }
10002       maxKey = info.nKey;
10003       keyCanBeEqual = 0;     /* Only the first key on the page may ==maxKey */
10004     }
10005 
10006     /* Check the content overflow list */
10007     if( info.nPayload>info.nLocal ){
10008       u32 nPage;       /* Number of pages on the overflow chain */
10009       Pgno pgnoOvfl;   /* First page of the overflow chain */
10010       assert( pc + info.nSize - 4 <= usableSize );
10011       nPage = (info.nPayload - info.nLocal + usableSize - 5)/(usableSize - 4);
10012       pgnoOvfl = get4byte(&pCell[info.nSize - 4]);
10013 #ifndef SQLITE_OMIT_AUTOVACUUM
10014       if( pBt->autoVacuum ){
10015         checkPtrmap(pCheck, pgnoOvfl, PTRMAP_OVERFLOW1, iPage);
10016       }
10017 #endif
10018       checkList(pCheck, 0, pgnoOvfl, nPage);
10019     }
10020 
10021     if( !pPage->leaf ){
10022       /* Check sanity of left child page for internal pages */
10023       pgno = get4byte(pCell);
10024 #ifndef SQLITE_OMIT_AUTOVACUUM
10025       if( pBt->autoVacuum ){
10026         checkPtrmap(pCheck, pgno, PTRMAP_BTREE, iPage);
10027       }
10028 #endif
10029       d2 = checkTreePage(pCheck, pgno, &maxKey, maxKey);
10030       keyCanBeEqual = 0;
10031       if( d2!=depth ){
10032         checkAppendMsg(pCheck, "Child page depth differs");
10033         depth = d2;
10034       }
10035     }else{
10036       /* Populate the coverage-checking heap for leaf pages */
10037       btreeHeapInsert(heap, (pc<<16)|(pc+info.nSize-1));
10038     }
10039   }
10040   *piMinKey = maxKey;
10041 
10042   /* Check for complete coverage of the page
10043   */
10044   pCheck->zPfx = 0;
10045   if( doCoverageCheck && pCheck->mxErr>0 ){
10046     /* For leaf pages, the min-heap has already been initialized and the
10047     ** cells have already been inserted.  But for internal pages, that has
10048     ** not yet been done, so do it now */
10049     if( !pPage->leaf ){
10050       heap = pCheck->heap;
10051       heap[0] = 0;
10052       for(i=nCell-1; i>=0; i--){
10053         u32 size;
10054         pc = get2byteAligned(&data[cellStart+i*2]);
10055         size = pPage->xCellSize(pPage, &data[pc]);
10056         btreeHeapInsert(heap, (pc<<16)|(pc+size-1));
10057       }
10058     }
10059     /* Add the freeblocks to the min-heap
10060     **
10061     ** EVIDENCE-OF: R-20690-50594 The second field of the b-tree page header
10062     ** is the offset of the first freeblock, or zero if there are no
10063     ** freeblocks on the page.
10064     */
10065     i = get2byte(&data[hdr+1]);
10066     while( i>0 ){
10067       int size, j;
10068       assert( (u32)i<=usableSize-4 ); /* Enforced by btreeComputeFreeSpace() */
10069       size = get2byte(&data[i+2]);
10070       assert( (u32)(i+size)<=usableSize ); /* due to btreeComputeFreeSpace() */
10071       btreeHeapInsert(heap, (((u32)i)<<16)|(i+size-1));
10072       /* EVIDENCE-OF: R-58208-19414 The first 2 bytes of a freeblock are a
10073       ** big-endian integer which is the offset in the b-tree page of the next
10074       ** freeblock in the chain, or zero if the freeblock is the last on the
10075       ** chain. */
10076       j = get2byte(&data[i]);
10077       /* EVIDENCE-OF: R-06866-39125 Freeblocks are always connected in order of
10078       ** increasing offset. */
10079       assert( j==0 || j>i+size );     /* Enforced by btreeComputeFreeSpace() */
10080       assert( (u32)j<=usableSize-4 ); /* Enforced by btreeComputeFreeSpace() */
10081       i = j;
10082     }
10083     /* Analyze the min-heap looking for overlap between cells and/or
10084     ** freeblocks, and counting the number of untracked bytes in nFrag.
10085     **
10086     ** Each min-heap entry is of the form:    (start_address<<16)|end_address.
10087     ** There is an implied first entry the covers the page header, the cell
10088     ** pointer index, and the gap between the cell pointer index and the start
10089     ** of cell content.
10090     **
10091     ** The loop below pulls entries from the min-heap in order and compares
10092     ** the start_address against the previous end_address.  If there is an
10093     ** overlap, that means bytes are used multiple times.  If there is a gap,
10094     ** that gap is added to the fragmentation count.
10095     */
10096     nFrag = 0;
10097     prev = contentOffset - 1;   /* Implied first min-heap entry */
10098     while( btreeHeapPull(heap,&x) ){
10099       if( (prev&0xffff)>=(x>>16) ){
10100         checkAppendMsg(pCheck,
10101           "Multiple uses for byte %u of page %u", x>>16, iPage);
10102         break;
10103       }else{
10104         nFrag += (x>>16) - (prev&0xffff) - 1;
10105         prev = x;
10106       }
10107     }
10108     nFrag += usableSize - (prev&0xffff) - 1;
10109     /* EVIDENCE-OF: R-43263-13491 The total number of bytes in all fragments
10110     ** is stored in the fifth field of the b-tree page header.
10111     ** EVIDENCE-OF: R-07161-27322 The one-byte integer at offset 7 gives the
10112     ** number of fragmented free bytes within the cell content area.
10113     */
10114     if( heap[0]==0 && nFrag!=data[hdr+7] ){
10115       checkAppendMsg(pCheck,
10116           "Fragmentation of %d bytes reported as %d on page %u",
10117           nFrag, data[hdr+7], iPage);
10118     }
10119   }
10120 
10121 end_of_check:
10122   if( !doCoverageCheck ) pPage->isInit = savedIsInit;
10123   releasePage(pPage);
10124   pCheck->zPfx = saved_zPfx;
10125   pCheck->v1 = saved_v1;
10126   pCheck->v2 = saved_v2;
10127   return depth+1;
10128 }
10129 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
10130 
10131 #ifndef SQLITE_OMIT_INTEGRITY_CHECK
10132 /*
10133 ** This routine does a complete check of the given BTree file.  aRoot[] is
10134 ** an array of pages numbers were each page number is the root page of
10135 ** a table.  nRoot is the number of entries in aRoot.
10136 **
10137 ** A read-only or read-write transaction must be opened before calling
10138 ** this function.
10139 **
10140 ** Write the number of error seen in *pnErr.  Except for some memory
10141 ** allocation errors,  an error message held in memory obtained from
10142 ** malloc is returned if *pnErr is non-zero.  If *pnErr==0 then NULL is
10143 ** returned.  If a memory allocation error occurs, NULL is returned.
10144 **
10145 ** If the first entry in aRoot[] is 0, that indicates that the list of
10146 ** root pages is incomplete.  This is a "partial integrity-check".  This
10147 ** happens when performing an integrity check on a single table.  The
10148 ** zero is skipped, of course.  But in addition, the freelist checks
10149 ** and the checks to make sure every page is referenced are also skipped,
10150 ** since obviously it is not possible to know which pages are covered by
10151 ** the unverified btrees.  Except, if aRoot[1] is 1, then the freelist
10152 ** checks are still performed.
10153 */
sqlite3BtreeIntegrityCheck(sqlite3 * db,Btree * p,Pgno * aRoot,int nRoot,int mxErr,int * pnErr)10154 char *sqlite3BtreeIntegrityCheck(
10155   sqlite3 *db,  /* Database connection that is running the check */
10156   Btree *p,     /* The btree to be checked */
10157   Pgno *aRoot,  /* An array of root pages numbers for individual trees */
10158   int nRoot,    /* Number of entries in aRoot[] */
10159   int mxErr,    /* Stop reporting errors after this many */
10160   int *pnErr    /* Write number of errors seen to this variable */
10161 ){
10162   Pgno i;
10163   IntegrityCk sCheck;
10164   BtShared *pBt = p->pBt;
10165   u64 savedDbFlags = pBt->db->flags;
10166   char zErr[100];
10167   int bPartial = 0;            /* True if not checking all btrees */
10168   int bCkFreelist = 1;         /* True to scan the freelist */
10169   VVA_ONLY( int nRef );
10170   assert( nRoot>0 );
10171 
10172   /* aRoot[0]==0 means this is a partial check */
10173   if( aRoot[0]==0 ){
10174     assert( nRoot>1 );
10175     bPartial = 1;
10176     if( aRoot[1]!=1 ) bCkFreelist = 0;
10177   }
10178 
10179   sqlite3BtreeEnter(p);
10180   assert( p->inTrans>TRANS_NONE && pBt->inTransaction>TRANS_NONE );
10181   VVA_ONLY( nRef = sqlite3PagerRefcount(pBt->pPager) );
10182   assert( nRef>=0 );
10183   sCheck.db = db;
10184   sCheck.pBt = pBt;
10185   sCheck.pPager = pBt->pPager;
10186   sCheck.nPage = btreePagecount(sCheck.pBt);
10187   sCheck.mxErr = mxErr;
10188   sCheck.nErr = 0;
10189   sCheck.bOomFault = 0;
10190   sCheck.zPfx = 0;
10191   sCheck.v1 = 0;
10192   sCheck.v2 = 0;
10193   sCheck.aPgRef = 0;
10194   sCheck.heap = 0;
10195   sqlite3StrAccumInit(&sCheck.errMsg, 0, zErr, sizeof(zErr), SQLITE_MAX_LENGTH);
10196   sCheck.errMsg.printfFlags = SQLITE_PRINTF_INTERNAL;
10197   if( sCheck.nPage==0 ){
10198     goto integrity_ck_cleanup;
10199   }
10200 
10201   sCheck.aPgRef = sqlite3MallocZero((sCheck.nPage / 8)+ 1);
10202   if( !sCheck.aPgRef ){
10203     sCheck.bOomFault = 1;
10204     goto integrity_ck_cleanup;
10205   }
10206   sCheck.heap = (u32*)sqlite3PageMalloc( pBt->pageSize );
10207   if( sCheck.heap==0 ){
10208     sCheck.bOomFault = 1;
10209     goto integrity_ck_cleanup;
10210   }
10211 
10212   i = PENDING_BYTE_PAGE(pBt);
10213   if( i<=sCheck.nPage ) setPageReferenced(&sCheck, i);
10214 
10215   /* Check the integrity of the freelist
10216   */
10217   if( bCkFreelist ){
10218     sCheck.zPfx = "Main freelist: ";
10219     checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]),
10220               get4byte(&pBt->pPage1->aData[36]));
10221     sCheck.zPfx = 0;
10222   }
10223 
10224   /* Check all the tables.
10225   */
10226 #ifndef SQLITE_OMIT_AUTOVACUUM
10227   if( !bPartial ){
10228     if( pBt->autoVacuum ){
10229       Pgno mx = 0;
10230       Pgno mxInHdr;
10231       for(i=0; (int)i<nRoot; i++) if( mx<aRoot[i] ) mx = aRoot[i];
10232       mxInHdr = get4byte(&pBt->pPage1->aData[52]);
10233       if( mx!=mxInHdr ){
10234         checkAppendMsg(&sCheck,
10235           "max rootpage (%d) disagrees with header (%d)",
10236           mx, mxInHdr
10237         );
10238       }
10239     }else if( get4byte(&pBt->pPage1->aData[64])!=0 ){
10240       checkAppendMsg(&sCheck,
10241         "incremental_vacuum enabled with a max rootpage of zero"
10242       );
10243     }
10244   }
10245 #endif
10246   testcase( pBt->db->flags & SQLITE_CellSizeCk );
10247   pBt->db->flags &= ~(u64)SQLITE_CellSizeCk;
10248   for(i=0; (int)i<nRoot && sCheck.mxErr; i++){
10249     i64 notUsed;
10250     if( aRoot[i]==0 ) continue;
10251 #ifndef SQLITE_OMIT_AUTOVACUUM
10252     if( pBt->autoVacuum && aRoot[i]>1 && !bPartial ){
10253       checkPtrmap(&sCheck, aRoot[i], PTRMAP_ROOTPAGE, 0);
10254     }
10255 #endif
10256     checkTreePage(&sCheck, aRoot[i], &notUsed, LARGEST_INT64);
10257   }
10258   pBt->db->flags = savedDbFlags;
10259 
10260   /* Make sure every page in the file is referenced
10261   */
10262   if( !bPartial ){
10263     for(i=1; i<=sCheck.nPage && sCheck.mxErr; i++){
10264 #ifdef SQLITE_OMIT_AUTOVACUUM
10265       if( getPageReferenced(&sCheck, i)==0 ){
10266         checkAppendMsg(&sCheck, "Page %d is never used", i);
10267       }
10268 #else
10269       /* If the database supports auto-vacuum, make sure no tables contain
10270       ** references to pointer-map pages.
10271       */
10272       if( getPageReferenced(&sCheck, i)==0 &&
10273          (PTRMAP_PAGENO(pBt, i)!=i || !pBt->autoVacuum) ){
10274         checkAppendMsg(&sCheck, "Page %d is never used", i);
10275       }
10276       if( getPageReferenced(&sCheck, i)!=0 &&
10277          (PTRMAP_PAGENO(pBt, i)==i && pBt->autoVacuum) ){
10278         checkAppendMsg(&sCheck, "Pointer map page %d is referenced", i);
10279       }
10280 #endif
10281     }
10282   }
10283 
10284   /* Clean  up and report errors.
10285   */
10286 integrity_ck_cleanup:
10287   sqlite3PageFree(sCheck.heap);
10288   sqlite3_free(sCheck.aPgRef);
10289   if( sCheck.bOomFault ){
10290     sqlite3_str_reset(&sCheck.errMsg);
10291     sCheck.nErr++;
10292   }
10293   *pnErr = sCheck.nErr;
10294   if( sCheck.nErr==0 ) sqlite3_str_reset(&sCheck.errMsg);
10295   /* Make sure this analysis did not leave any unref() pages. */
10296   assert( nRef==sqlite3PagerRefcount(pBt->pPager) );
10297   sqlite3BtreeLeave(p);
10298   return sqlite3StrAccumFinish(&sCheck.errMsg);
10299 }
10300 #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
10301 
10302 /*
10303 ** Return the full pathname of the underlying database file.  Return
10304 ** an empty string if the database is in-memory or a TEMP database.
10305 **
10306 ** The pager filename is invariant as long as the pager is
10307 ** open so it is safe to access without the BtShared mutex.
10308 */
sqlite3BtreeGetFilename(Btree * p)10309 const char *sqlite3BtreeGetFilename(Btree *p){
10310   assert( p->pBt->pPager!=0 );
10311   return sqlite3PagerFilename(p->pBt->pPager, 1);
10312 }
10313 
10314 /*
10315 ** Return the pathname of the journal file for this database. The return
10316 ** value of this routine is the same regardless of whether the journal file
10317 ** has been created or not.
10318 **
10319 ** The pager journal filename is invariant as long as the pager is
10320 ** open so it is safe to access without the BtShared mutex.
10321 */
sqlite3BtreeGetJournalname(Btree * p)10322 const char *sqlite3BtreeGetJournalname(Btree *p){
10323   assert( p->pBt->pPager!=0 );
10324   return sqlite3PagerJournalname(p->pBt->pPager);
10325 }
10326 
10327 /*
10328 ** Return one of SQLITE_TXN_NONE, SQLITE_TXN_READ, or SQLITE_TXN_WRITE
10329 ** to describe the current transaction state of Btree p.
10330 */
sqlite3BtreeTxnState(Btree * p)10331 int sqlite3BtreeTxnState(Btree *p){
10332   assert( p==0 || sqlite3_mutex_held(p->db->mutex) );
10333   return p ? p->inTrans : 0;
10334 }
10335 
10336 #ifndef SQLITE_OMIT_WAL
10337 /*
10338 ** Run a checkpoint on the Btree passed as the first argument.
10339 **
10340 ** Return SQLITE_LOCKED if this or any other connection has an open
10341 ** transaction on the shared-cache the argument Btree is connected to.
10342 **
10343 ** Parameter eMode is one of SQLITE_CHECKPOINT_PASSIVE, FULL or RESTART.
10344 */
sqlite3BtreeCheckpoint(Btree * p,int eMode,int * pnLog,int * pnCkpt)10345 int sqlite3BtreeCheckpoint(Btree *p, int eMode, int *pnLog, int *pnCkpt){
10346   int rc = SQLITE_OK;
10347   if( p ){
10348     BtShared *pBt = p->pBt;
10349     sqlite3BtreeEnter(p);
10350     if( pBt->inTransaction!=TRANS_NONE ){
10351       rc = SQLITE_LOCKED;
10352     }else{
10353       rc = sqlite3PagerCheckpoint(pBt->pPager, p->db, eMode, pnLog, pnCkpt);
10354     }
10355     sqlite3BtreeLeave(p);
10356   }
10357   return rc;
10358 }
10359 #endif
10360 
10361 /*
10362 ** Return true if there is currently a backup running on Btree p.
10363 */
sqlite3BtreeIsInBackup(Btree * p)10364 int sqlite3BtreeIsInBackup(Btree *p){
10365   assert( p );
10366   assert( sqlite3_mutex_held(p->db->mutex) );
10367   return p->nBackup!=0;
10368 }
10369 
10370 /*
10371 ** This function returns a pointer to a blob of memory associated with
10372 ** a single shared-btree. The memory is used by client code for its own
10373 ** purposes (for example, to store a high-level schema associated with
10374 ** the shared-btree). The btree layer manages reference counting issues.
10375 **
10376 ** The first time this is called on a shared-btree, nBytes bytes of memory
10377 ** are allocated, zeroed, and returned to the caller. For each subsequent
10378 ** call the nBytes parameter is ignored and a pointer to the same blob
10379 ** of memory returned.
10380 **
10381 ** If the nBytes parameter is 0 and the blob of memory has not yet been
10382 ** allocated, a null pointer is returned. If the blob has already been
10383 ** allocated, it is returned as normal.
10384 **
10385 ** Just before the shared-btree is closed, the function passed as the
10386 ** xFree argument when the memory allocation was made is invoked on the
10387 ** blob of allocated memory. The xFree function should not call sqlite3_free()
10388 ** on the memory, the btree layer does that.
10389 */
sqlite3BtreeSchema(Btree * p,int nBytes,void (* xFree)(void *))10390 void *sqlite3BtreeSchema(Btree *p, int nBytes, void(*xFree)(void *)){
10391   BtShared *pBt = p->pBt;
10392   sqlite3BtreeEnter(p);
10393   if( !pBt->pSchema && nBytes ){
10394     pBt->pSchema = sqlite3DbMallocZero(0, nBytes);
10395     pBt->xFreeSchema = xFree;
10396   }
10397   sqlite3BtreeLeave(p);
10398   return pBt->pSchema;
10399 }
10400 
10401 /*
10402 ** Return SQLITE_LOCKED_SHAREDCACHE if another user of the same shared
10403 ** btree as the argument handle holds an exclusive lock on the
10404 ** sqlite_schema table. Otherwise SQLITE_OK.
10405 */
sqlite3BtreeSchemaLocked(Btree * p)10406 int sqlite3BtreeSchemaLocked(Btree *p){
10407   int rc;
10408   assert( sqlite3_mutex_held(p->db->mutex) );
10409   sqlite3BtreeEnter(p);
10410   rc = querySharedCacheTableLock(p, SCHEMA_ROOT, READ_LOCK);
10411   assert( rc==SQLITE_OK || rc==SQLITE_LOCKED_SHAREDCACHE );
10412   sqlite3BtreeLeave(p);
10413   return rc;
10414 }
10415 
10416 
10417 #ifndef SQLITE_OMIT_SHARED_CACHE
10418 /*
10419 ** Obtain a lock on the table whose root page is iTab.  The
10420 ** lock is a write lock if isWritelock is true or a read lock
10421 ** if it is false.
10422 */
sqlite3BtreeLockTable(Btree * p,int iTab,u8 isWriteLock)10423 int sqlite3BtreeLockTable(Btree *p, int iTab, u8 isWriteLock){
10424   int rc = SQLITE_OK;
10425   assert( p->inTrans!=TRANS_NONE );
10426   if( p->sharable ){
10427     u8 lockType = READ_LOCK + isWriteLock;
10428     assert( READ_LOCK+1==WRITE_LOCK );
10429     assert( isWriteLock==0 || isWriteLock==1 );
10430 
10431     sqlite3BtreeEnter(p);
10432     rc = querySharedCacheTableLock(p, iTab, lockType);
10433     if( rc==SQLITE_OK ){
10434       rc = setSharedCacheTableLock(p, iTab, lockType);
10435     }
10436     sqlite3BtreeLeave(p);
10437   }
10438   return rc;
10439 }
10440 #endif
10441 
10442 #ifndef SQLITE_OMIT_INCRBLOB
10443 /*
10444 ** Argument pCsr must be a cursor opened for writing on an
10445 ** INTKEY table currently pointing at a valid table entry.
10446 ** This function modifies the data stored as part of that entry.
10447 **
10448 ** Only the data content may only be modified, it is not possible to
10449 ** change the length of the data stored. If this function is called with
10450 ** parameters that attempt to write past the end of the existing data,
10451 ** no modifications are made and SQLITE_CORRUPT is returned.
10452 */
sqlite3BtreePutData(BtCursor * pCsr,u32 offset,u32 amt,void * z)10453 int sqlite3BtreePutData(BtCursor *pCsr, u32 offset, u32 amt, void *z){
10454   int rc;
10455   assert( cursorOwnsBtShared(pCsr) );
10456   assert( sqlite3_mutex_held(pCsr->pBtree->db->mutex) );
10457   assert( pCsr->curFlags & BTCF_Incrblob );
10458 
10459   rc = restoreCursorPosition(pCsr);
10460   if( rc!=SQLITE_OK ){
10461     return rc;
10462   }
10463   assert( pCsr->eState!=CURSOR_REQUIRESEEK );
10464   if( pCsr->eState!=CURSOR_VALID ){
10465     return SQLITE_ABORT;
10466   }
10467 
10468   /* Save the positions of all other cursors open on this table. This is
10469   ** required in case any of them are holding references to an xFetch
10470   ** version of the b-tree page modified by the accessPayload call below.
10471   **
10472   ** Note that pCsr must be open on a INTKEY table and saveCursorPosition()
10473   ** and hence saveAllCursors() cannot fail on a BTREE_INTKEY table, hence
10474   ** saveAllCursors can only return SQLITE_OK.
10475   */
10476   VVA_ONLY(rc =) saveAllCursors(pCsr->pBt, pCsr->pgnoRoot, pCsr);
10477   assert( rc==SQLITE_OK );
10478 
10479   /* Check some assumptions:
10480   **   (a) the cursor is open for writing,
10481   **   (b) there is a read/write transaction open,
10482   **   (c) the connection holds a write-lock on the table (if required),
10483   **   (d) there are no conflicting read-locks, and
10484   **   (e) the cursor points at a valid row of an intKey table.
10485   */
10486   if( (pCsr->curFlags & BTCF_WriteFlag)==0 ){
10487     return SQLITE_READONLY;
10488   }
10489   assert( (pCsr->pBt->btsFlags & BTS_READ_ONLY)==0
10490               && pCsr->pBt->inTransaction==TRANS_WRITE );
10491   assert( hasSharedCacheTableLock(pCsr->pBtree, pCsr->pgnoRoot, 0, 2) );
10492   assert( !hasReadConflicts(pCsr->pBtree, pCsr->pgnoRoot) );
10493   assert( pCsr->pPage->intKey );
10494 
10495   return accessPayload(pCsr, offset, amt, (unsigned char *)z, 1);
10496 }
10497 
10498 /*
10499 ** Mark this cursor as an incremental blob cursor.
10500 */
sqlite3BtreeIncrblobCursor(BtCursor * pCur)10501 void sqlite3BtreeIncrblobCursor(BtCursor *pCur){
10502   pCur->curFlags |= BTCF_Incrblob;
10503   pCur->pBtree->hasIncrblobCur = 1;
10504 }
10505 #endif
10506 
10507 /*
10508 ** Set both the "read version" (single byte at byte offset 18) and
10509 ** "write version" (single byte at byte offset 19) fields in the database
10510 ** header to iVersion.
10511 */
sqlite3BtreeSetVersion(Btree * pBtree,int iVersion)10512 int sqlite3BtreeSetVersion(Btree *pBtree, int iVersion){
10513   BtShared *pBt = pBtree->pBt;
10514   int rc;                         /* Return code */
10515 
10516   assert( iVersion==1 || iVersion==2 );
10517 
10518   /* If setting the version fields to 1, do not automatically open the
10519   ** WAL connection, even if the version fields are currently set to 2.
10520   */
10521   pBt->btsFlags &= ~BTS_NO_WAL;
10522   if( iVersion==1 ) pBt->btsFlags |= BTS_NO_WAL;
10523 
10524   rc = sqlite3BtreeBeginTrans(pBtree, 0, 0);
10525   if( rc==SQLITE_OK ){
10526     u8 *aData = pBt->pPage1->aData;
10527     if( aData[18]!=(u8)iVersion || aData[19]!=(u8)iVersion ){
10528       rc = sqlite3BtreeBeginTrans(pBtree, 2, 0);
10529       if( rc==SQLITE_OK ){
10530         rc = sqlite3PagerWrite(pBt->pPage1->pDbPage);
10531         if( rc==SQLITE_OK ){
10532           aData[18] = (u8)iVersion;
10533           aData[19] = (u8)iVersion;
10534         }
10535       }
10536     }
10537   }
10538 
10539   pBt->btsFlags &= ~BTS_NO_WAL;
10540   return rc;
10541 }
10542 
10543 /*
10544 ** Return true if the cursor has a hint specified.  This routine is
10545 ** only used from within assert() statements
10546 */
sqlite3BtreeCursorHasHint(BtCursor * pCsr,unsigned int mask)10547 int sqlite3BtreeCursorHasHint(BtCursor *pCsr, unsigned int mask){
10548   return (pCsr->hints & mask)!=0;
10549 }
10550 
10551 /*
10552 ** Return true if the given Btree is read-only.
10553 */
sqlite3BtreeIsReadonly(Btree * p)10554 int sqlite3BtreeIsReadonly(Btree *p){
10555   return (p->pBt->btsFlags & BTS_READ_ONLY)!=0;
10556 }
10557 
10558 /*
10559 ** Return the size of the header added to each page by this module.
10560 */
sqlite3HeaderSizeBtree(void)10561 int sqlite3HeaderSizeBtree(void){ return ROUND8(sizeof(MemPage)); }
10562 
10563 #if !defined(SQLITE_OMIT_SHARED_CACHE)
10564 /*
10565 ** Return true if the Btree passed as the only argument is sharable.
10566 */
sqlite3BtreeSharable(Btree * p)10567 int sqlite3BtreeSharable(Btree *p){
10568   return p->sharable;
10569 }
10570 
10571 /*
10572 ** Return the number of connections to the BtShared object accessed by
10573 ** the Btree handle passed as the only argument. For private caches
10574 ** this is always 1. For shared caches it may be 1 or greater.
10575 */
sqlite3BtreeConnectionCount(Btree * p)10576 int sqlite3BtreeConnectionCount(Btree *p){
10577   testcase( p->sharable );
10578   return p->pBt->nRef;
10579 }
10580 #endif
10581