1 /*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 2010, 2013 Oracle and/or its affiliates. All rights reserved.
5 */
6
7 /*
8 ** This file implements the sqlite btree.h interface for Berkeley DB.
9 **
10 ** Build-time options:
11 **
12 ** BDBSQL_AUTO_PAGE_SIZE -- Let Berkeley DB choose a default page size.
13 ** BDBSQL_CONCURRENT_CONNECTIONS -- If there are going to be multiple
14 ** connections to the same database, this can be used
15 ** to disable a locking optimization.
16 ** BDBSQL_CONVERT_SQLITE -- If an attempt is made to open a SQLite database,
17 ** convert it on the fly to Berkeley DB.
18 ** BDBSQL_FILE_PER_TABLE -- Don't use sub-databases, use a file per table.
19 ** BDBSQL_OMIT_LEAKCHECK -- Omit combined sqlite and BDB memory allocation.
20 ** BDBSQL_SINGLE_PROCESS -- Keep all environment on the heap (necessary on
21 ** platforms without mmap).
22 ** BDBSQL_PRELOAD_HANDLES -- Open all tables when first connecting.
23 ** BDBSQL_SINGLE_THREAD -- Omit support for multithreading.
24 ** BDBSQL_SHARE_PRIVATE -- Implies BDBSQL_SINGLE_PROCESS and implements
25 ** inter-process sharing and synchronization of
26 ** databases.
27 ** BDBSQL_TXN_SNAPSHOTS_DEFAULT -- Always enable concurrency between read
28 ** and write transactions.
29 ** BDBSQL_MEMORY_MAX -- Define the maximum amount of memory (bytes) to be used
30 ** by shared structures in the main environment region.
31 ** BDBSQL_LOCK_TABLESIZE -- Define the number of buckets in the lock object
32 ** hash table in the Berkeley DB environment.
33 */
34
35 #if defined(BDBSQL_CONVERT_SQLITE) && defined(BDBSQL_FILE_PER_TABLE)
36 #error BDBSQL_CONVERT_SQLITE is incompatible with BDBSQL_FILE_PER_TABLE
37 #endif
38
39 #ifdef BDBSQL_OMIT_SHARING
40 #error BDBSQL_OMIT_SHARING has been replaced by BDBSQL_SINGLE_PROCESS
41 #endif
42
43 #include <assert.h>
44
45 #include "sqliteInt.h"
46 #include "btreeInt.h"
47 #include "vdbeInt.h"
48 #include <db.h>
49 #ifdef BDBSQL_SHARE_PRIVATE
50 #include <sys/mman.h>
51 #include <fcntl.h>
52 #endif
53
54 #ifdef BDBSQL_OMIT_LEAKCHECK
55 #define sqlite3_malloc malloc
56 #define sqlite3_free free
57 #define sqlite3_strdup strdup
58 #else
59 #define sqlite3_strdup btreeStrdup
60 #endif
61
62 /*
63 * We use the following internal DB functions.
64 */
65 extern void __os_dirfree(ENV *env, char **namesp, int cnt);
66 extern int __os_dirlist(ENV *env,
67 const char *dir, int returndir, char ***namesp, int *cntp);
68 extern int __os_exists (ENV *, const char *, int *);
69 extern int __os_fileid(ENV *, const char *, int, u_int8_t *);
70 extern int __os_mkdir (ENV *, const char *, int);
71 extern int __os_unlink (ENV *, const char *, int);
72 extern void __os_yield (ENV *, u_long, u_long);
73
74 /*
75 * The DB_SQL_LOCKER structure is used to unlock a DB handle. The id field must
76 * be compatible with the id field of the DB_LOCKER struct. We know the first
77 * field will be a "u_int32_t id", define enough of a structure here so that
78 * we can use the id field without including lock.h.
79 */
80 typedef struct {
81 u_int32_t id;
82 } DB_SQL_LOCKER;
83
84 #define DB_MIN_CACHESIZE 20 /* pages */
85
86 #define US_PER_SEC 1000000 /* Microseconds in a second */
87
88 /* The rowid is never longer than 9 bytes.*/
89 #define ROWIDMAXSIZE 10
90
91 /* Forward declarations for internal functions. */
92 static int btreeCleanupCachedHandles(Btree *p, cleanup_mode_t cleanup);
93 static int btreeCloseCursor(BtCursor *pCur, int removeList);
94 static int btreeCompressInt(u_int8_t *buf, u_int64_t i);
95 static int btreeConfigureDbHandle(Btree *p, int iTable, DB **dbpp);
96 static int btreeCreateDataTable(Btree *, int, CACHED_DB **);
97 static int btreeCreateSharedBtree(
98 Btree *, const char *, u_int8_t *, sqlite3 *, int, storage_mode_t);
99 static int btreeCreateTable(Btree *p, int *piTable, int flags);
100 static void btreeHandleDbError(
101 const DB_ENV *dbenv, const char *errpfx, const char *msg);
102 static int btreeDbHandleIsLocked(CACHED_DB *cached_db);
103 static int btreeDbHandleLock(Btree *p, CACHED_DB *cached_db);
104 static int btreeDbHandleUnlock(Btree *p, CACHED_DB *cached_db);
105 static int btreeDecompressInt(const u_int8_t *buf, u_int64_t *i);
106 static void btreeFreeSharedBtree(BtShared *p, int clear_cache);
107 static int btreeGetSharedBtree(
108 BtShared **, u_int8_t *, sqlite3 *, storage_mode_t, int);
109 static int btreeInvalidateHandleCache(Btree *p);
110 static int btreeLoadBufferIntoTable(BtCursor *pCur);
111 static int btreeMoveto(BtCursor *pCur,
112 const void *pKey, i64 nKey, int bias, int *pRes);
113 static int btreePrepareEnvironment(Btree *p);
114 static int btreeRepIsClient(Btree *p);
115 static int btreeRepStartupFinished(Btree *p);
116 static int btreeRestoreCursorPosition(BtCursor *pCur, int skipMoveto);
117 static int btreeSetUpReplication(Btree *p, int master, u8 *replicate);
118 static int btreeTripAll(Btree *p, int iTable, int incrblobUpdate);
119 static int btreeTripWatchers(BtCursor *pBt, int incrblobUpdate);
120 static int indexIsCollated(KeyInfo *keyInfo);
121 static int supportsDuplicates(DB *db);
122 #ifdef BDBSQL_SHARE_PRIVATE
123 static int btreeFileLock(Btree *p);
124 static int btreeFileUnlock(Btree *p);
125 static int btreeReopenPrivateEnvironment(Btree *p);
126 static int btreeSetupLockfile(Btree *p, int *createdFile);
127 #endif
128
129 /*
130 * Flags for btreeFindOrCreateDataTable
131 * Defined in btree.h:
132 * #define BTREE_INTKEY 1
133 * #define BTREE_BLOBKEY 2
134 */
135 #define BTREE_CREATE 4 /* If we want to create the table */
136
137 /* Globals are protected by the static "open" mutex (SQLITE_MUTEX_STATIC_OPEN).
138 */
139
140 /* The head of the linked list of shared Btree objects */
141 struct BtShared *g_shared_btrees = NULL;
142
143 /* The environment handle used for temporary environments (NULL or open). */
144 DB_ENV *g_tmp_env;
145
146 /* The unique id for the next shared Btree object created. */
147 u_int32_t g_uid_next = 0;
148
149 /* Number of times we're prepared to try multiple gets. */
150 #define MAX_SMALLS 100
151
152 /* Number of times to retry operations that return a "busy" error. */
153 #define BUSY_RETRY_COUNT 100
154
155 /* TODO: This should probably be '\' on Windows. */
156 #define PATH_SEPARATOR "/"
157
158 #define pBDb (pCur->cached_db->dbp)
159 #define pDbc (pCur->dbc)
160 #define pIntKey ((pCur->flags & BTREE_INTKEY) != 0)
161 #define pIsBuffer (pCur->pBtree->pBt->resultsBuffer)
162
/*
 * Format the Berkeley DB name of table number i into buffer b, which holds
 * sz bytes, prepending prefix.  The expansion reads a variable named pBt
 * from the enclosing scope.  Named storage produces "<prefix>tableNNNNN";
 * in-memory storage produces "<prefix>tempUUUUU_NNNNN" using pBt->uid.  For
 * any other storage mode b is set to NULL, so b must be a modifiable pointer.
 */
#define	GET_TABLENAME(b, sz, i, prefix) do {				\
	if (pBt->dbStorage == DB_STORE_NAMED)				\
		sqlite3_snprintf((sz), (b), "%stable%05d",		\
		    (prefix), (i));					\
	else if (pBt->dbStorage == DB_STORE_INMEM)			\
		sqlite3_snprintf((sz), (b), "%stemp%05d_%05d",		\
		    (prefix), pBt->uid, (i));				\
	else								\
		(b) = NULL;						\
} while (0)
173
174 #define GET_DURABLE(pBt) \
175 ((pBt)->dbStorage == DB_STORE_NAMED && \
176 ((pBt)->flags & BTREE_OMIT_JOURNAL) == 0)
177
/*
 * Read-only tests.  Arguments are parenthesized so that expressions such as
 * "&shared" or "array + 1" can be passed safely.
 *
 * IS_ENV_READONLY(pBt)   -- 1 if the shared btree is flagged read-only.
 * GET_ENV_READONLY(pBt)  -- DB_RDONLY if the shared btree is read-only, else 0.
 * IS_BTREE_READONLY(p)   -- 1 if either the handle or its shared btree is
 *                           flagged read-only.
 */
#define	IS_ENV_READONLY(pBt)						\
	((pBt)->readonly ? 1 : 0)
#define	GET_ENV_READONLY(pBt)						\
	(IS_ENV_READONLY(pBt) ? DB_RDONLY : 0)
#define	IS_BTREE_READONLY(p)						\
	(((p)->readonly || IS_ENV_READONLY((p)->pBt)) ? 1 : 0)
184
/*
 * Flag to pass on cursor reads: DB_RMW when the cursor was opened for writing
 * on a named (on-disk) database, otherwise 0.  Always 0 in single-threaded
 * builds.  The argument is parenthesized so any pointer expression works.
 */
#ifndef BDBSQL_SINGLE_THREAD
#define	RMW(pCur)							\
	(((pCur)->wrFlag &&						\
	    (pCur)->pBtree->pBt->dbStorage == DB_STORE_NAMED) ?		\
	    DB_RMW : 0)
#else
#define	RMW(pCur) 0
#endif
192
/*
 * Isolation flags for handle p.  Yields 0 when the shared btree is not
 * transactional (and always in single-threaded builds).  Otherwise it yields
 * DB_READ_UNCOMMITTED or DB_READ_COMMITTED, chosen by the connection's
 * SQLITE_ReadUncommitted flag, OR'd with DB_TXN_SNAPSHOT when that bit is
 * set in read_txn_flags.
 *
 * The whole expansion is parenthesized so that operators applied to the
 * macro at the use site (&, ==, ...) act on the complete value rather than
 * on its last sub-expression.
 */
#ifdef BDBSQL_SINGLE_THREAD
#define	GET_BTREE_ISOLATION(p) 0
#else
#define	GET_BTREE_ISOLATION(p)						\
	(!(p)->pBt->transactional ? 0 :					\
	    ((((p)->db->flags & SQLITE_ReadUncommitted) ?		\
		DB_READ_UNCOMMITTED : DB_READ_COMMITTED) |		\
	    (((p)->pBt->read_txn_flags & DB_TXN_SNAPSHOT) ?		\
		DB_TXN_SNAPSHOT : 0)))
#endif
202
/*
 * The transaction for incrblobs is held in the cursor, so when a deadlock
 * happens the cursor transaction must be aborted instead of the statement
 * transaction.
 *
 * When ret is DB_LOCK_DEADLOCK and the cursor is an incrblob handle, this
 * clears the Btree's read_txn (read cursors only), pops savepoint_txn to its
 * parent if it is the cursor's transaction, aborts the cursor transaction and
 * RETURNS SQLITE_LOCKED from the enclosing function.
 *
 * The expansion is a bare "if" statement (not do/while) so that existing
 * uses keep compiling whether or not they add a trailing semicolon; do not
 * use it as the unbraced body of an if/else.
 */
#define	HANDLE_INCRBLOB_DEADLOCK(ret, pCur)				\
	if ((ret) == DB_LOCK_DEADLOCK && (pCur)->isIncrblobHandle) {	\
		if (!(pCur)->wrFlag)					\
			(pCur)->pBtree->read_txn = NULL;		\
		if ((pCur)->txn == (pCur)->pBtree->savepoint_txn)	\
			(pCur)->pBtree->savepoint_txn =			\
			    (pCur)->pBtree->savepoint_txn->parent;	\
		(pCur)->txn->abort((pCur)->txn);			\
		(pCur)->txn = NULL;					\
		return SQLITE_LOCKED;					\
	}
217
/*
 * Decide which transaction to use when reading the meta data table: the
 * savepoint transaction when the handle holds an exclusive transaction,
 * otherwise the read transaction if there is one, otherwise the family
 * transaction.
 */
#define	GET_META_TXN(p)							\
	((p)->txn_excl ? pSavepointTxn :				\
	    (pReadTxn ? pReadTxn : pFamilyTxn))

/*
 * Decide which flags to use when reading the meta data table: DB_RMW under
 * an exclusive transaction, plus the handle's isolation flags with
 * DB_TXN_SNAPSHOT masked off.  GET_BTREE_ISOLATION is wrapped in its own
 * parentheses so the mask applies to its whole value.
 */
#define	GET_META_FLAGS(p)						\
	(((p)->txn_excl ? DB_RMW : 0) |					\
	    ((GET_BTREE_ISOLATION(p)) & ~DB_TXN_SNAPSHOT))
227
/*
 * Translate a Berkeley DB / errno style return code into a SQLite result
 * code.
 *
 * err -- the code to translate; 0 maps to SQLITE_OK and any value without an
 *        explicit mapping becomes SQLITE_ERROR.
 * p   -- optional Btree handle.  When it is non-NULL and its shared btree
 *        holds a pending err_msg, that message is attached to the
 *        connection (or the connection's error is cleared when the result
 *        is SQLITE_OK) and the stored message is freed.
 *
 * Returns the SQLite result code.
 */
int dberr2sqlite(int err, Btree *p)
{
	BtShared *pBt;
	int ret;

	switch (err) {
	case 0:
		ret = SQLITE_OK;
		break;
	case DB_LOCK_DEADLOCK:
	case DB_LOCK_NOTGRANTED:
	case DB_REP_JOIN_FAILURE:
		ret = SQLITE_BUSY;
		break;
	case DB_NOTFOUND:
		ret = SQLITE_NOTFOUND;
		break;
	case DB_RUNRECOVERY:
		ret = SQLITE_CORRUPT;
		break;
	case EACCES:
		ret = SQLITE_READONLY;
		break;
	case EIO:
		ret = SQLITE_IOERR;
		break;
	case EPERM:
		ret = SQLITE_PERM;
		break;
	case ENOMEM:
		ret = SQLITE_NOMEM;
		break;
	case ENOENT:
		ret = SQLITE_CANTOPEN;
		break;
	case ENOSPC:
		ret = SQLITE_FULL;
		break;
	default:
		ret = SQLITE_ERROR;
		break;
	}

	if (p == NULL)
		return ret;

	pBt = p->pBt;
	if (pBt != NULL && pBt->err_msg != NULL) {
		/*
		 * The stored message is arbitrary text and may contain '%',
		 * so it must be passed as an argument, never as the format.
		 */
		if (ret != SQLITE_OK)
			sqlite3Error(p->db, ret, "%s", pBt->err_msg);
		else
			sqlite3Error(p->db, ret, NULL);
		sqlite3_free(pBt->err_msg);
		pBt->err_msg = NULL;
	}
	return ret;
}
284
285 /*
286 * Close db handle and cleanup resource (e.g.: remove in-memory db)
287 * automatically.
288 *
289 * Note: closeDB is more dangerous than dbp->close since it would remove
290 * in-memory db. Generally, closeDB should only be used instead of dbp->close
291 * when:
292 * 1. Cleanup cached handles.
293 * 2. DB handle creating fails. Safe because no one own this uncreated handle.
294 * 3. Drop Tables.
295 *
296 * In other cases (error handlers, vacuum , backup, etc.), closeDB should not
297 * be called anyway. That's because the db might be required by other
298 * connections.
299 */
closeDB(Btree * p,DB * dbp,u_int32_t flags)300 int closeDB(Btree *p, DB *dbp, u_int32_t flags)
301 {
302 char *tableName, *fileName, tableNameBuf[DBNAME_SIZE];
303 u_int32_t remove_flags;
304 int ret, needRemove;
305 BtShared *pBt;
306
307 tableName = NULL;
308 fileName = NULL;
309 needRemove = 0;
310
311 if (p == NULL || (pBt = p->pBt) == NULL || dbp == NULL)
312 return 0;
313
314 /*
315 * In MPOOL, Named in-memory databases get an artificially bumped
316 * reference count so they don't disappear on close; they need a
317 * remove to make them disappear.
318 */
319 if (pBt->dbStorage == DB_STORE_INMEM &&
320 (dbp->flags & DB_AM_OPEN_CALLED))
321 needRemove = 1;
322
323 /*
324 * Save tableName into buf for subsquent dbremove. The buf is required
325 * since tableName would be destroyed after db is closed.
326 */
327 if (needRemove && (dbp->get_dbname(dbp, (const char **)&fileName,
328 (const char**)&tableName) == 0)) {
329 strncpy(tableNameBuf, tableName, sizeof(tableNameBuf) - 1);
330 tableName = tableNameBuf;
331 }
332
333 ret = dbp->close(dbp, flags);
334
335 /*
336 * Do removes as needed to prevent mpool leak. pSavepointTxn is
337 * required since the operations might be rollbacked.
338 */
339 if (needRemove) {
340 remove_flags = DB_NOSYNC;
341 if (!GET_DURABLE(pBt))
342 remove_flags |= DB_TXN_NOT_DURABLE;
343 if (pSavepointTxn == NULL)
344 remove_flags |= (DB_AUTO_COMMIT | DB_LOG_NO_DATA);
345 (void)pDbEnv->dbremove(pDbEnv, pSavepointTxn, fileName,
346 tableName, remove_flags);
347 }
348
349 return ret;
350 }
351
352 #define ERR_FILE_NAME "sql-errors.txt"
btreeGetErrorFile(const BtShared * pBt,char * fname)353 void btreeGetErrorFile(const BtShared *pBt, char *fname) {
354 if (pBt == NULL)
355 /* No env directory, use the current working directory. */
356 sqlite3_snprintf(BT_MAX_PATH, fname, ERR_FILE_NAME);
357 else {
358 sqlite3_mutex_enter(pBt->mutex);
359 if (pBt->err_file == NULL)
360 sqlite3_snprintf(BT_MAX_PATH, fname,
361 "%s/%s", pBt->dir_name, ERR_FILE_NAME);
362 else
363 sqlite3_snprintf(BT_MAX_PATH, fname,
364 "%s", pBt->err_file);
365 sqlite3_mutex_leave(pBt->mutex);
366 }
367 }
368
btreeHandleDbError(const DB_ENV * dbenv,const char * errpfx,const char * msg)369 static void btreeHandleDbError(
370 const DB_ENV *dbenv,
371 const char *errpfx,
372 const char *msg
373 ) {
374 BtShared *pBt;
375 FILE *fp;
376 char fname[BT_MAX_PATH];
377
378 /* Store the error msg to pBt->err_msg for future use. */
379 pBt = (BtShared *)dbenv->app_private;
380 if (pBt && (errpfx || msg)) {
381 if (pBt->err_msg != NULL)
382 sqlite3_free(pBt->err_msg);
383 pBt->err_msg = sqlite3_mprintf("%s:%s", errpfx, msg);
384 }
385
386 /*
387 * If error_file is set, flush the error to the error file. Else flush
388 * the error msg to stderr.
389 * Simply igore the error return from btreeGetErrorFile since we're
390 * in the error handle routine.
391 */
392 btreeGetErrorFile(pBt, fname);
393 fp = fopen(fname, "a");
394 if (fp == NULL)
395 fp = stderr;
396
397 fprintf(fp, "%s:%s\n", errpfx, msg);
398 if (fp != stderr) {
399 fflush(fp);
400 fclose(fp);
401 }
402 }
403
404 /*
405 * Used in cases where SQLITE_LOCKED should be returned instead of
406 * SQLITE_BUSY.
407 */
dberr2sqlitelocked(int err,Btree * p)408 static int dberr2sqlitelocked(int err, Btree *p)
409 {
410 int rc = dberr2sqlite(err, p);
411 if (rc == SQLITE_BUSY)
412 rc = SQLITE_LOCKED;
413 return rc;
414 }
415
416 #ifndef NDEBUG
/*
 * Debug logging: print a printf-style message followed by a newline to
 * stdout and flush it.  Messages whose level is below CURRENT_LOG_LEVEL are
 * dropped.
 */
void log_msg(loglevel_t level, const char *fmt, ...)
{
	va_list args;

	if (level < CURRENT_LOG_LEVEL)
		return;

	va_start(args, fmt);
	vfprintf(stdout, fmt, args);
	va_end(args);
	fputc('\n', stdout);
	fflush(stdout);
}
428 #endif
429
430 #ifdef BDBSQL_FILE_PER_TABLE
getMetaDataFileName(const char * full_name,char ** filename)431 int getMetaDataFileName(const char *full_name, char **filename)
432 {
433 *filename = sqlite3_malloc(strlen(full_name) +
434 strlen(BDBSQL_META_DATA_TABLE) + 2);
435 if (*filename == NULL)
436 return SQLITE_NOMEM;
437 strcpy(*filename, full_name);
438 strcpy(*filename + strlen(full_name), PATH_SEPARATOR);
439 strcpy(*filename + strlen(full_name) + 1, BDBSQL_META_DATA_TABLE);
440 return SQLITE_OK;
441 }
442 #endif
443
444 #ifndef BDBSQL_OMIT_LEAKCHECK
/*
 * Wrap the SQLite malloc and realloc APIs before handing them to Berkeley
 * DB, because their parameter types differ from those of the standard
 * malloc and realloc.
 * The signature of free matches, so it does not need a wrapper.
 */
/*
 * malloc-compatible wrapper around sqlite3_malloc, which takes an int size.
 * Returns NULL when the requested size does not survive the round trip
 * through int.
 */
static void *btreeMalloc(size_t size)
{
	int nBytes;

	nBytes = (int)size;
	if ((size_t)nBytes != size)
		return NULL;

	return sqlite3_malloc(nBytes);
}
458
/*
 * realloc-compatible wrapper around sqlite3_realloc, which takes an int size.
 *
 * Returns NULL and leaves buff untouched when size exceeds the largest
 * positive int.  The range test is made on the size_t value itself: a
 * round-trip test through int also accepts values that convert to a
 * negative int, and sqlite3_realloc treats a negative size as a request to
 * free the buffer, which would break the realloc contract that the original
 * block stays valid on failure.
 */
static void *btreeRealloc(void * buff, size_t size)
{
	/* (~0u >> 1) is the largest positive int, i.e. INT_MAX. */
	if (size > (size_t)(~0u >> 1))
		return NULL;

	return sqlite3_realloc(buff, (int)size);
}
466
/*
 * strdup replacement that allocates the copy through SQLite's allocator by
 * formatting the source string with sqlite3_mprintf.  Returns whatever
 * sqlite3_mprintf returns.
 */
static char *btreeStrdup(const char *sq)
{
	char *copy;

	copy = sqlite3_mprintf("%s", sq);
	return copy;
}
471 #endif
472
/*
 * Key comparison callback for integer-keyed tables.  Both keys must be
 * exactly sizeof(i64) bytes; they are copied out with memcpy (the key data
 * need not be aligned) and compared as signed 64-bit integers.
 *
 * Returns -1, 0 or 1 when the first key is less than, equal to or greater
 * than the second.  The dbp argument is unused.
 */
static int btreeCompareIntKey(DB *dbp, const DBT *dbt1, const DBT *dbt2)
{
	i64 lhs, rhs;

	assert(dbt1->size == sizeof(i64));
	assert(dbt2->size == sizeof(i64));

	memcpy(&lhs, dbt1->data, sizeof(lhs));
	memcpy(&rhs, dbt2->data, sizeof(rhs));

	return (lhs > rhs) - (lhs < rhs);
}
485
486 #ifdef BDBSQL_CONVERT_SQLITE
/*
 * Convert the SQLite database at pBt->full_name to Berkeley DB format in
 * place by running a shell pipeline, then reset the LSNs of the converted
 * file using tmp_env.
 *
 * Returns 0 on success.  Returns EINVAL without touching the file when the
 * path cannot be embedded safely in the shell command (it contains a single
 * quote) or when the command does not fit in the buffer; otherwise returns
 * the non-zero result of system() or of lsn_reset.
 */
static int btreeConvertSqlite(BtShared *pBt, DB_ENV *tmp_env)
{
	char convert_cmd[BT_MAX_PATH + 200];
	int ret;
#ifdef ANDROID
	const char* dbsql_shell = "sqlite3";
	const char* sqlite_shell = "sqlite3orig";
#else
	const char* dbsql_shell = "dbsql";
	const char* sqlite_shell = "sqlite3";
#endif

	log_msg(LOG_NORMAL, "Attempting to convert %s", pBt->full_name);

	/*
	 * The file name is placed inside a single-quoted shell string.  A
	 * single quote in the name would end the quoting and let the rest of
	 * the name be interpreted by the shell, so refuse such names.
	 */
	if (strchr(pBt->full_name, '\'') != NULL)
		return (EINVAL);

	/*
	 * We're going to attempt to convert a SQLite database to Berkeley DB.
	 * The main complication is that we may have already created an
	 * environment in the journal directory.  This will prevent SQLite from
	 * accessing the database with the same name.  Also, if we try to start
	 * a dbsql with that name to create the new file, that will destroy the
	 * environment we just created.
	 *
	 * So, the process is:
	 *  1. rename the file
	 *  2. dump / load to another name (in Berkeley DB format)
	 *  3. rename file 2 to the original name
	 *  4. if everything worked, remove file 1
	 *  5. if anything went wrong, rename file 1 back to
	 *     the original name.
	 *
	 * Use variables in the script to avoid sending in the filename
	 * lots of times.
	 */
	sqlite3_snprintf(sizeof(convert_cmd), convert_cmd,
	    "f='%s' ; t=\"$f-bdbtmp\" ; mv \"$f\" \"$t-1\" || exit $? "
	    "; ((echo PRAGMA txn_bulk=1';' PRAGMA user_version="
	    "`%s \"$t-1\" 'pragma user_version'`';'"
	    " ; %s \"$t-1\" .dump) | %s \"$t-2\""
	    " && mv \"$t-2\" \"$f\" && rm -r \"$t-2-journal\" && rm \"$t-1\")"
	    "|| mv \"$t-1\" \"$f\"",
	    pBt->full_name, sqlite_shell, sqlite_shell, dbsql_shell);

	/*
	 * sqlite3_snprintf truncates silently.  A completely full buffer
	 * means the script may be cut short, and running a partial script
	 * could rename the file without ever restoring it.
	 */
	if (strlen(convert_cmd) >= sizeof(convert_cmd) - 1)
		return (EINVAL);

	if ((ret = system(convert_cmd)) != 0)
		return (ret);

	/*
	 * If all of that worked, we need to reset LSNs before we can
	 * open that database file in our environment.  That has to be
	 * done in a temporary environment to avoid LSN checks...
	 */
	log_msg(LOG_NORMAL, "Resetting LSNs in %s", pBt->full_name);
	ret = tmp_env->lsn_reset(tmp_env, pBt->full_name, 0);

	return (ret);
}
542 #endif
543
544 /*
545 * An internal function that opens the metadata database that is present for
546 * every SQLite Btree, and the special "tables" database maintained by Berkeley
547 * DB that lists all of the subdatabases in a file.
548 *
549 * This is split out into a separate function so that it will be easy to change
550 * the Btree layer to create Berkeley DB database handles per Btree object,
551 * rather than per BtShared object.
552 */
btreeOpenMetaTables(Btree * p,int * pCreating)553 int btreeOpenMetaTables(Btree *p, int *pCreating)
554 {
555 BtShared *pBt;
556 DBC *dbc;
557 DBT key, data;
558 DB_ENV *tmp_env;
559 char *fileName;
560 int i, idx, rc, ret, t_ret;
561 u32 val;
562 #ifdef BDBSQL_FILE_PER_TABLE
563 char **dirnames;
564 int cnt;
565 #endif
566
567 pBt = p->pBt;
568 rc = SQLITE_OK;
569 ret = t_ret = 0;
570
571 if (pBt->lsn_reset != NO_LSN_RESET) {
572 /*
573 * Reset the LSNs in the database, so that we can open the
574 * database in a new environment.
575 *
576 * This is the first time we try to open the database file, so
577 * an EINVAL error may indicate an attempt to open a SQLite
578 * database.
579 */
580 ret = db_env_create(&tmp_env, 0);
581 if (ret != 0)
582 goto err;
583 tmp_env->set_errcall(tmp_env, NULL);
584 if (pBt->encrypted) {
585 ret = tmp_env->set_encrypt(tmp_env,
586 pBt->encrypt_pwd, DB_ENCRYPT_AES);
587 if (ret != 0)
588 goto err;
589 }
590 ret = tmp_env->open(
591 tmp_env, NULL, DB_CREATE | DB_PRIVATE | DB_INIT_MPOOL, 0);
592 while (ret == 0 && pBt->lsn_reset == LSN_RESET_FILE) {
593 ret = tmp_env->lsn_reset(tmp_env, pBt->full_name, 0);
594 #ifdef BDBSQL_CONVERT_SQLITE
595 if (ret == EINVAL &&
596 btreeConvertSqlite(pBt, tmp_env) == 0) {
597 ret = 0;
598 continue;
599 }
600 #endif
601 break;
602 }
603 if (ret == EINVAL)
604 rc = SQLITE_NOTADB;
605 #ifdef BDBSQL_FILE_PER_TABLE
606 __os_dirlist(NULL, pBt->full_name, 0, &dirnames, &cnt);
607 for (i = 0; i < cnt; i++)
608 (void)tmp_env->lsn_reset(tmp_env, dirnames[i], 0);
609 __os_dirfree(NULL, dirnames, cnt);
610 #endif
611 if ((t_ret = tmp_env->close(tmp_env, 0)) != 0 &&
612 ret == 0)
613 ret = t_ret;
614 if (ret != 0)
615 goto err;
616 pBt->lsn_reset = NO_LSN_RESET;
617 }
618
619 if (pMetaDb != NULL) {
620 *pCreating = 0;
621 goto addmeta;
622 }
623
624 /*
625 * We open the metadata and tables databases in auto-commit
626 * transactions. These may deadlock or conflict, and should be safe to
627 * retry, but for safety we limit how many times we'll do that before
628 * returning the error.
629 */
630 i = 0;
631 do {
632 if ((ret = db_create(&pMetaDb, pDbEnv, 0)) != 0)
633 goto err;
634
635 if (pBt->encrypted &&
636 ((ret = pMetaDb->set_flags(pMetaDb, DB_ENCRYPT)) != 0))
637 goto err;
638
639 if (!GET_DURABLE(pBt)) {
640 /* Ensure that log records are not written to disk. */
641 if ((ret =
642 pMetaDb->set_flags(pMetaDb, DB_TXN_NOT_DURABLE))
643 != 0)
644 goto err;
645 }
646
647 /*
648 * The metadata DB is the first one opened in the file, so it
649 * is sufficient to set the page size on it -- other databases
650 * in the same file will inherit the same pagesize. We must
651 * open it before the table DB because this open call may be
652 * creating the file.
653 */
654 if (pBt->pageSize != 0 &&
655 (ret = pMetaDb->set_pagesize(pMetaDb, pBt->pageSize)) != 0)
656 goto err;
657
658 pBt->pageSizeFixed = 1;
659
660 #ifdef BDBSQL_FILE_PER_TABLE
661 fileName = BDBSQL_META_DATA_TABLE;
662 #else
663 fileName = pBt->short_name;
664 #endif
665 ret = pMetaDb->open(pMetaDb, NULL, fileName,
666 pBt->dbStorage == DB_STORE_NAMED ? "metadb" : NULL,
667 DB_BTREE,
668 pBt->db_oflags | GET_AUTO_COMMIT(pBt, NULL) |
669 GET_ENV_READONLY(pBt), 0);
670
671 if (ret == DB_LOCK_DEADLOCK || ret == DB_LOCK_NOTGRANTED) {
672 (void)pMetaDb->close(pMetaDb, DB_NOSYNC);
673 pMetaDb = NULL;
674 }
675 } while ((ret == DB_LOCK_DEADLOCK || ret == DB_LOCK_NOTGRANTED) &&
676 ++i < BUSY_RETRY_COUNT);
677
678 if (ret != 0) {
679 if (ret == EACCES && IS_ENV_READONLY(pBt))
680 rc = SQLITE_READONLY;
681 else if (ret == EINVAL)
682 rc = SQLITE_NOTADB;
683 goto err;
684 }
685
686 /* Set the default max_page_count */
687 sqlite3BtreeMaxPageCount(p, pBt->pageCount);
688
689 if (pBt->dbStorage != DB_STORE_NAMED)
690 goto addmeta;
691
692 i = 0;
693 do {
694 /* Named databases use a db to track new table names. */
695 if ((ret = db_create(&pTablesDb, pDbEnv, 0)) != 0)
696 goto err;
697
698 if (pBt->encrypted &&
699 ((ret = pTablesDb->set_flags(pTablesDb, DB_ENCRYPT)) != 0))
700 goto err;
701 #ifdef BDBSQL_FILE_PER_TABLE
702 /*
703 * When opening a file-per-table we need an additional table to
704 * track the names of tables within the database.
705 */
706 ret = pTablesDb->open(pTablesDb, NULL, fileName,
707 "tables", DB_BTREE, (pBt->db_oflags) |
708 GET_AUTO_COMMIT(pBt, NULL), 0);
709 /*
710 * Insert an entry for the metadata table, so the usage of
711 * this table matches the sub-db cursor in the non-split case.
712 */
713 memset(&key, 0, sizeof(key));
714 memset(&data, 0, sizeof(data));
715 key.data = "metadb";
716 key.size = 6;
717 pTablesDb->put(pTablesDb, NULL, &key, &data, 0);
718 #else
719 ret = pTablesDb->open(pTablesDb, NULL, fileName,
720 NULL, DB_BTREE, (pBt->db_oflags & ~DB_CREATE) |
721 DB_RDONLY | GET_AUTO_COMMIT(pBt, NULL), 0);
722 #endif
723 if (ret == DB_LOCK_DEADLOCK || ret == DB_LOCK_NOTGRANTED) {
724 (void)pTablesDb->close(pTablesDb, DB_NOSYNC);
725 pTablesDb = NULL;
726 }
727 } while ((ret == DB_LOCK_DEADLOCK || ret == DB_LOCK_NOTGRANTED) &&
728 ++i < BUSY_RETRY_COUNT);
729
730 if (ret != 0)
731 goto err;
732
733 /* Check whether we're creating the database */
734 if ((ret = pTablesDb->cursor(pTablesDb, pFamilyTxn, &dbc, 0)) != 0)
735 goto err;
736
737 memset(&key, 0, sizeof(key));
738 memset(&data, 0, sizeof(data));
739 data.flags = DB_DBT_PARTIAL | DB_DBT_USERMEM;
740 ret = dbc->get(dbc, &key, &data, DB_LAST);
741 if (ret == 0)
742 *pCreating =
743 (strncmp((const char *)key.data, "metadb", key.size) == 0);
744 if ((t_ret = dbc->close(dbc)) != 0 && ret == 0)
745 ret = t_ret;
746 if (ret != 0)
747 goto err;
748
749 addmeta:/*
750 * Populate the MetaDb with any values that were set prior to
751 * the sqlite3BtreeOpen that triggers this.
752 */
753 for (idx = 0; idx < NUMMETA; idx++) {
754 if (pBt->meta[idx].cached)
755 val = pBt->meta[idx].value;
756 else if (idx == BTREE_LARGEST_ROOT_PAGE && *pCreating)
757 val = pBt->autoVacuum;
758 else if (idx == BTREE_INCR_VACUUM && *pCreating)
759 val = pBt->incrVacuum;
760 else
761 continue;
762 if ((rc = sqlite3BtreeUpdateMeta(p, idx, val)) != SQLITE_OK)
763 goto err;
764 }
765
766 if (!*pCreating) {
767 /* This matches SQLite, I don't understand the naming. */
768 sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &val);
769 if (p->db->errCode == SQLITE_BUSY) {
770 rc = SQLITE_BUSY;
771 goto err;
772 }
773 pBt->autoVacuum = (u8)val;
774 sqlite3BtreeGetMeta(p, BTREE_INCR_VACUUM, &val);
775 if (p->db->errCode == SQLITE_BUSY) {
776 rc = SQLITE_BUSY;
777 goto err;
778 }
779 pBt->incrVacuum = (u8)val;
780 }
781
782 err: if (rc != SQLITE_OK || ret != 0) {
783 if (pTablesDb != NULL)
784 (void)pTablesDb->close(pTablesDb, DB_NOSYNC);
785 if (pMetaDb != NULL)
786 (void)pMetaDb->close(pMetaDb, DB_NOSYNC);
787 pTablesDb = pMetaDb = NULL;
788 }
789
790 return MAP_ERR(rc, ret, p);
791 }
792
/*
 * Berkeley DB doesn't NUL-terminate database names, do the conversion
 * manually to avoid making a copy just in order to call strtol.
 *
 * subdb -- name of the form "table<digits>"; only the first len bytes are
 *          read.
 * len   -- number of valid bytes in subdb; must be greater than 5.
 * pid   -- receives the decoded table number on success and is left
 *          untouched on failure.
 *
 * Returns 0 on success, or EINVAL when a non-digit follows the prefix or the
 * number does not fit in an int.
 */
int btreeTableNameToId(const char *subdb, int len, int *pid)
{
	/* Largest positive int, computed without needing <limits.h>. */
	const int max_id = (int)(~0u >> 1);
	const char *p;
	int digit, id;

	assert(len > 5);
	assert(strncmp(subdb, "table", 5) == 0);

	id = 0;
	for (p = subdb + 5; p < subdb + len; p++) {
		if (*p < '0' || *p > '9')
			return (EINVAL);
		digit = *p - '0';
		/* Reject the digit if id * 10 + digit would overflow. */
		if (id > (max_id - digit) / 10)
			return (EINVAL);
		id = (id * 10) + digit;
	}
	*pid = id;
	return (0);
}
814
815 #ifdef BDBSQL_PRELOAD_HANDLES
/*
 * Walk the tables database and open a cached handle for every entry whose
 * name is "table" followed by a number.
 *
 * Entries that do not start with "table", or that are too short to carry a
 * number, are skipped.  The scan stops at the first name whose suffix is not
 * a valid number.  Failures of btreeCreateDataTable are ignored.  The shared
 * btree mutex is held for the duration of the scan.
 *
 * Returns 0 when the scan reaches the end of the database, otherwise the
 * Berkeley DB error that stopped it.
 */
static int btreePreloadHandles(Btree *p)
{
	BtShared *pBt;
	CACHED_DB *cached_db;
	DBC *dbc;
	DBT key, data;
	int iTable, ret;

	pBt = p->pBt;
	dbc = NULL;

	if ((ret = pTablesDb->cursor(pTablesDb, NULL, &dbc, 0)) != 0)
		goto err;

	memset(&key, 0, sizeof(key));
	memset(&data, 0, sizeof(data));
	/* Zero-length partial read: only the keys are of interest. */
	data.flags = DB_DBT_PARTIAL | DB_DBT_USERMEM;

	sqlite3_mutex_enter(pBt->mutex);
	while ((ret = dbc->get(dbc, &key, &data, DB_NEXT)) == 0) {
		/*
		 * Keys are not NUL-terminated, so make sure the key is long
		 * enough before comparing the 5-byte prefix; it must also
		 * hold at least one more byte for the table number.
		 */
		if (key.size <= 5 ||
		    strncmp((const char *)key.data, "table", 5) != 0)
			continue;
		if ((ret = btreeTableNameToId(
		    (const char *)key.data, key.size, &iTable)) != 0)
			break;
		cached_db = NULL;
		(void)btreeCreateDataTable(p, iTable, &cached_db);
	}
	sqlite3_mutex_leave(pBt->mutex);

err:	if (ret == DB_NOTFOUND)
		ret = 0;
	if (dbc != NULL)
		(void)dbc->close(dbc);
	return (ret);
}
852 #endif /* BDBSQL_PRELOAD_HANDLES */
853
854 /*
855 ** Free an allocated BtShared and any dependent allocated objects.
856 */
btreeFreeSharedBtree(BtShared * p,int clear_cache)857 static void btreeFreeSharedBtree(BtShared *p, int clear_cache)
858 {
859 BtShared *tmp_bt;
860
861 if (p == NULL)
862 return;
863
864 #ifdef BDBSQL_SHARE_PRIVATE
865 /* close the shared lockfile */
866 if (p->lockfile.fd > 0)
867 (void)close(p->lockfile.fd);
868 if (p->lockfile.mutex != NULL)
869 sqlite3_mutex_free(p->lockfile.mutex);
870 #endif
871 if (clear_cache) {
872 if (p == g_shared_btrees && p->pNextDb == NULL)
873 g_shared_btrees = NULL;
874 else if (p == g_shared_btrees) {
875 g_shared_btrees = p->pNextDb;
876 g_shared_btrees->pPrevDb = NULL;
877 } else if (p->pNextDb == NULL)
878 p->pPrevDb->pNextDb = NULL;
879 else {
880 tmp_bt = p->pPrevDb;
881 p->pPrevDb->pNextDb = p->pNextDb;
882 p->pNextDb->pPrevDb = tmp_bt;
883 }
884 }
885 if (p->encrypt_pwd != NULL)
886 CLEAR_PWD(p);
887 if (p->mutex != NULL)
888 sqlite3_mutex_free(p->mutex);
889 if (p->dir_name != NULL)
890 sqlite3_free(p->dir_name);
891 if (p->full_name != NULL)
892 sqlite3_free(p->full_name);
893 if (p->orig_name != NULL)
894 sqlite3_free(p->orig_name);
895 if (p->err_file != NULL)
896 sqlite3_free(p->err_file);
897 if (p->err_msg != NULL)
898 sqlite3_free(p->err_msg);
899
900 sqlite3_free(p);
901 }
902
/*
 * Inspect the database file before the environment is opened.
 *
 * Records in pBt->database_existed whether pBt->full_name exists.  When it
 * does not exist:
 *   - a read-only open fails with SQLITE_READONLY;
 *   - an open without SQLITE_OPEN_CREATE fails with SQLITE_CANTOPEN;
 *   - otherwise nothing more is done and SQLITE_OK is returned.
 * When it exists, the file is probed through the VFS (unless built with
 * BDBSQL_FILE_PER_TABLE) and pBt->readonly is set if the VFS reports that
 * it could only be opened read-only; then DB_CREATE is added to the
 * environment open flags and need_open is set.
 *
 * Returns SQLITE_OK, one of the codes above, SQLITE_NOMEM, or the error
 * returned by sqlite3OsOpen.
 */
static int btreeCheckEnvPrepare(Btree *p)
{
	BtShared *pBt;
	int f_exists, f_isdir, rc;
#ifndef BDBSQL_FILE_PER_TABLE
	int attrs;
	sqlite3_file *fp;
#endif

	pBt = p->pBt;
	rc = SQLITE_OK;
	f_exists = f_isdir = 0;

	assert(pBt->dbStorage == DB_STORE_NAMED);
	assert(pBt->dir_name != NULL);
	f_exists = !__os_exists(NULL, pBt->full_name, &f_isdir);
	pBt->database_existed = f_exists;

	if (!f_exists) {
		if ((p->vfsFlags & SQLITE_OPEN_READONLY) != 0)
			rc = SQLITE_READONLY;
		else if (!(p->vfsFlags & SQLITE_OPEN_CREATE))
			rc = SQLITE_CANTOPEN;
		goto err;
	}

#ifndef BDBSQL_FILE_PER_TABLE
	/*
	 * If we don't have write permission for a file,
	 * automatically open any databases read-only.
	 */
	fp = (sqlite3_file *)sqlite3_malloc(p->db->pVfs->szOsFile);
	if (fp == NULL) {
		rc = SQLITE_NOMEM;
		goto err;
	}
	memset(fp, 0, p->db->pVfs->szOsFile);
	/* The VFS may leave the out-flags untouched when the open fails. */
	attrs = 0;
	rc = sqlite3OsOpen(p->db->pVfs, pBt->full_name, fp,
	    SQLITE_OPEN_MAIN_DB | SQLITE_OPEN_READWRITE,
	    &attrs);
	if (attrs & SQLITE_OPEN_READONLY)
		pBt->readonly = 1;
	if (rc == SQLITE_OK)
		(void)sqlite3OsClose(fp);
	sqlite3_free(fp);
#endif
	/*
	 * Always open existing tables, even if the matching
	 * env does not exist (yet).
	 */
	pBt->env_oflags |= DB_CREATE;
	pBt->need_open = 1;

err:	return rc;
}
964
/*
 * Work out the flags used to open the Berkeley DB environment.
 *
 * p          -- the Btree being opened (named storage only).
 * createdDir -- non-zero if the environment directory was created by this
 *               open, in which case the environment is treated as absent.
 * replicate  -- non-zero if replication is configured.
 *
 * Updates pBt->env_oflags and, depending on the case, pBt->lsn_reset,
 * pBt->transactional and pBt->single_process.  Always returns SQLITE_OK.
 */
static int btreeCheckEnvOpen(Btree *p, int createdDir, u8 replicate)
{
	BtShared *pBt = p->pBt;
	int dbFileExists, envDirExists, isDir;

	assert(pBt->dbStorage == DB_STORE_NAMED);
	assert(pBt->dir_name != NULL);

	dbFileExists = pBt->database_existed;
	envDirExists = (__os_exists(NULL, pBt->dir_name, NULL) == 0);
	if (createdDir)
		envDirExists = 0;

	/*
	 * There may have been a race for database creation.  Recheck
	 * file existence before destroying the environment.
	 */
	if (envDirExists && !dbFileExists)
		dbFileExists =
		    (__os_exists(NULL, pBt->full_name, &isDir) == 0);

	if (!envDirExists && IS_ENV_READONLY(pBt)) {
		/*
		 * Opening a database read-only with no environment in
		 * place: use a non-transactional private environment,
		 * otherwise we run into issues with mismatching LSNs.
		 */
		pBt->env_oflags |= DB_PRIVATE;
		pBt->transactional = 0;
	} else {
		/*
		 * A writable database file without an environment needs its
		 * LSNs reset before it can join a new one.
		 */
		if (!envDirExists && dbFileExists)
			pBt->lsn_reset = LSN_RESET_FILE;

		pBt->env_oflags |= DB_INIT_LOG | DB_INIT_TXN;
		if (replicate)
			pBt->env_oflags |= DB_INIT_REP;
#ifndef BDBSQL_SINGLE_THREAD
		pBt->env_oflags |= DB_INIT_LOCK;
#endif
#ifdef BDBSQL_SINGLE_PROCESS
		/*
		 * In this build configuration single_process always takes
		 * effect, no matter what the pragma setting is.
		 */
		pBt->single_process = 1;
#endif
		if (pBt->single_process)
			pBt->env_oflags |= DB_PRIVATE | DB_CREATE;
		else if (!replicate && !pBt->repForceRecover)
			/*
			 * FAILCHK_ISALIVE doesn't currently work with
			 * replication.  Also, replication can't use
			 * DB_REGISTER because it assumes actual recoveries
			 * between sessions.  Avoid adding these flags if we
			 * are running with replication or if this is the
			 * first time we are opening the env after turning
			 * off replication (repForceRecover).
			 */
			pBt->env_oflags |= DB_FAILCHK_ISALIVE | DB_REGISTER;
	}

	/*
	 * If we're prepared to create the environment, do that now.
	 * Otherwise, if the table is being created, SQLite will call
	 * sqlite3BtreeCursor and expect a "SQLITE_EMPTY" return, then
	 * call sqlite3BtreeCreateTable.  The result of this open is
	 * recorded in the Btree object passed in.
	 */
	pBt->env_oflags |= DB_CREATE;

	/* Transactional environments always run recovery on open. */
	if ((pBt->env_oflags & DB_INIT_TXN) != 0)
		pBt->env_oflags |= DB_RECOVER;

	return SQLITE_OK;
}
1041
1042 /*
1043 * Determine whether replication is configured and make all needed
1044 * replication calls prior to opening environment.
1045 */
btreeSetUpReplication(Btree * p,int master,u8 * replicate)1046 static int btreeSetUpReplication(Btree *p, int master, u8 *replicate)
1047 {
1048 BtShared *pBt;
1049 sqlite3 *db;
1050 char *value, *value2;
1051 DB_SITE *lsite, *rsite;
1052 char *host, *msg;
1053 u_int port = 0;
1054 int rc, rc2, ret;
1055
1056 pBt = p->pBt;
1057 db = p->db;
1058 rc = SQLITE_OK;
1059 *replicate = ret = 0;
1060
1061 value = NULL;
1062 if ((rc = getPersistentPragma(p, "replication",
1063 &value, NULL)) == SQLITE_OK && value)
1064 *replicate = atoi(value);
1065 if (value)
1066 sqlite3_free(value);
1067
1068 if (*replicate) {
1069 value = NULL;
1070 value2 = NULL;
1071 if ((rc = getPersistentPragma(p, "replication_verbose_output",
1072 &value, NULL)) == SQLITE_OK && value && atoi(value)) {
1073 if (pDbEnv->set_verbose(pDbEnv,
1074 DB_VERB_REPLICATION, 1) != 0) {
1075 sqlite3Error(db, SQLITE_ERROR, "Error in "
1076 "replication set_verbose call");
1077 rc = SQLITE_ERROR;
1078 }
1079 else if ((rc = getPersistentPragma(p,
1080 "replication_verbose_file",
1081 &value2, NULL)) == SQLITE_OK && value && value2) {
1082 if ((rc = unsetRepVerboseFile(
1083 pBt, pDbEnv, &msg)) != SQLITE_OK)
1084 sqlite3Error(db, rc, msg);
1085 if (rc == SQLITE_OK && strlen(value2) > 0 &&
1086 (rc = setRepVerboseFile(
1087 pBt, pDbEnv, value2, msg)) != SQLITE_OK)
1088 sqlite3Error(db, rc, msg);
1089 }
1090 }
1091 if (value)
1092 sqlite3_free(value);
1093 if (value2)
1094 sqlite3_free(value2);
1095 if (rc != SQLITE_OK)
1096 goto err;
1097
1098 /* There must be a local_site value. */
1099 lsite = NULL;
1100 value = NULL;
1101 if ((rc = getPersistentPragma(p, "replication_local_site",
1102 &value, NULL)) == SQLITE_OK && value) {
1103 /* Pragma code already syntax-checked the value. */
1104 rc2 = getHostPort(value, &host, &port);
1105 if (pDbEnv->repmgr_site(pDbEnv,
1106 host, port, &lsite, 0) != 0) {
1107 sqlite3Error(db, SQLITE_ERROR, "Error in "
1108 "replication call repmgr_site LOCAL");
1109 rc = SQLITE_ERROR;
1110 }
1111 if (rc != SQLITE_ERROR &&
1112 lsite->set_config(lsite, DB_LOCAL_SITE, 1) != 0) {
1113 sqlite3Error(db, SQLITE_ERROR, "Error in "
1114 "replication call site config LOCAL");
1115 rc = SQLITE_ERROR;
1116 }
1117 if (rc != SQLITE_ERROR && master &&
1118 lsite->set_config(lsite,
1119 DB_GROUP_CREATOR, 1) != 0) {
1120 sqlite3Error(db, SQLITE_ERROR, "Error in "
1121 "replication call site config CREATOR");
1122 rc = SQLITE_ERROR;
1123 }
1124 if (lsite != NULL && lsite->close(lsite) != 0) {
1125 sqlite3Error(db, SQLITE_ERROR, "Error in "
1126 "replication call site close LOCAL");
1127 rc = SQLITE_ERROR;
1128 }
1129 if (rc2 == SQLITE_OK)
1130 sqlite3_free(host);
1131 } else {
1132 sqlite3Error(db, SQLITE_ERROR, "Must specify local "
1133 "site before starting replication");
1134 rc = SQLITE_ERROR;
1135 }
1136 if (value)
1137 sqlite3_free(value);
1138 if (rc != SQLITE_OK)
1139 goto err;
1140
1141 /* It is optional to have a remote_site value. */
1142 rsite = NULL;
1143 value = NULL;
1144 if (getPersistentPragma(p, "replication_remote_site",
1145 &value, NULL) == SQLITE_OK && value) {
1146 /* Pragma code already syntax-checked the value. */
1147 rc2 = getHostPort(value, &host, &port);
1148 if (pDbEnv->repmgr_site(pDbEnv,
1149 host, port, &rsite, 0) != 0) {
1150 sqlite3Error(db, SQLITE_ERROR, "Error in "
1151 "replication call repmgr_site REMOTE");
1152 rc = SQLITE_ERROR;
1153 }
1154 if (rc != SQLITE_ERROR &&
1155 rsite->set_config(rsite,
1156 DB_BOOTSTRAP_HELPER, 1) != 0)
1157 sqlite3Error(db, SQLITE_ERROR, "Error in "
1158 "replication call site config HELPER");
1159 if (rsite != NULL && rsite->close(rsite) != 0)
1160 sqlite3Error(db, SQLITE_ERROR, "Error in "
1161 "replication call site close REMOTE");
1162 if (rc2 == SQLITE_OK)
1163 sqlite3_free(host);
1164 }
1165 if (value)
1166 sqlite3_free(value);
1167
1168 /* Set 2SITE_STRICT to ensure data durability. */
1169 if (pDbEnv->rep_set_config(pDbEnv,
1170 DB_REPMGR_CONF_2SITE_STRICT, 1) != 0) {
1171 sqlite3Error(db, SQLITE_ERROR, "Error in "
1172 "replication call rep_set_config");
1173 rc = SQLITE_ERROR;
1174 goto err;
1175 }
1176
1177 /*
1178 * Set up heartbeats to detect when client loses connection
1179 * to master and to enable rerequest processing.
1180 */
1181 if (pDbEnv->rep_set_timeout(pDbEnv,
1182 DB_REP_HEARTBEAT_MONITOR, 7000000) != 0) {
1183 sqlite3Error(db, SQLITE_ERROR, "Error in replication "
1184 "call rep_set_timeout heartbeat monitor");
1185 rc = SQLITE_ERROR;
1186 goto err;
1187 }
1188 if (pDbEnv->rep_set_timeout(pDbEnv,
1189 DB_REP_HEARTBEAT_SEND, 5000000) != 0) {
1190 sqlite3Error(db, SQLITE_ERROR, "Error in replication "
1191 "call rep_set_timeout heartbeat send");
1192 rc = SQLITE_ERROR;
1193 goto err;
1194 }
1195 }
1196
1197 err:
1198 return rc;
1199 }
1200
1201 /* See if environment is currently configured as a replication client. */
btreeRepIsClient(Btree * p)1202 static int btreeRepIsClient(Btree *p)
1203 {
1204 DB_REP_STAT *rep_stat;
1205 BtShared *pBt;
1206 int is_client;
1207
1208 pBt = p->pBt;
1209 is_client = 0;
1210
1211 if (!pBt->repStarted)
1212 return (0);
1213
1214 if (pDbEnv->rep_stat(pDbEnv, &rep_stat, 0) != 0) {
1215 sqlite3Error(p->db, SQLITE_ERROR,
1216 "Unable to determine if site is a replication client");
1217 return (0);
1218 }
1219 if (rep_stat->st_status == DB_REP_CLIENT)
1220 is_client = 1;
1221 sqlite3_free(rep_stat);
1222 return (is_client);
1223 }
1224
1225 /*
1226 * See if replication startup is finished by polling replication statistics.
1227 * Returns 1 if replication startup is finished; 0 otherwise. Note that
1228 * this function waits a finite amount of time for a replication election
1229 * to complete but it waits indefinitely for a replication client to
1230 * synchronize with the master after the election.
1231 */
btreeRepStartupFinished(Btree * p)1232 static int btreeRepStartupFinished(Btree *p)
1233 {
1234 DB_REP_STAT *repStat;
1235 BtShared *pBt;
1236 sqlite3 *db;
1237 u_int32_t electRetry, electTimeout, slept;
1238 int clientSyncComplete, startupComplete;
1239
1240 pBt = p->pBt;
1241 db = p->db;
1242 clientSyncComplete = slept = startupComplete = 0;
1243 electRetry = electTimeout = 0;
1244
1245 if (pDbEnv->rep_get_timeout(pDbEnv,
1246 DB_REP_ELECTION_RETRY, &electRetry) != 0) {
1247 sqlite3Error(db, SQLITE_ERROR, "Error in "
1248 "replication call rep_get_timeout election retry");
1249 goto err;
1250 }
1251 if (pDbEnv->rep_get_timeout(pDbEnv,
1252 DB_REP_ELECTION_TIMEOUT, &electTimeout) != 0) {
1253 sqlite3Error(db, SQLITE_ERROR, "Error in "
1254 "replication call rep_get_timeout election timeout");
1255 goto err;
1256 }
1257 electRetry = electRetry / US_PER_SEC;
1258 electTimeout = electTimeout / US_PER_SEC;
1259
1260 /*
1261 * Wait to see if election and replication site startup finishes.
1262 * If this site has been elected master or if it is a client that
1263 * has finished its synchronization with the master, startup is
1264 * finished. Wait long enough to allow time for many election
1265 * attempts. Using default timeout values, the wait is 15 minutes.
1266 */
1267 do {
1268 __os_yield(pDbEnv->env, 1, 0);
1269 if (pDbEnv->rep_stat(pDbEnv, &repStat, 0) != 0) {
1270 sqlite3Error(db, SQLITE_ERROR, "Error in "
1271 "replication call rep_stat election");
1272 goto err;
1273 }
1274 if (repStat->st_status == DB_REP_MASTER ||
1275 repStat->st_startup_complete)
1276 startupComplete = 1;
1277 sqlite3_free(repStat);
1278 } while (!startupComplete &&
1279 ++slept < (electTimeout + electRetry) * 75);
1280
1281 /*
1282 * If startup isn't finished yet but this site is a client with
1283 * a known master, the client is still synchronizing with the master.
1284 * Wait indefinitely because this can take a very long time if a full
1285 * internal initialization is needed.
1286 */
1287 if (!startupComplete && repStat->st_status == DB_REP_CLIENT &&
1288 repStat->st_master != DB_EID_INVALID)
1289 do {
1290 __os_yield(pDbEnv->env, 2, 0);
1291 if (pDbEnv->rep_stat(pDbEnv, &repStat, 0) != 0) {
1292 sqlite3Error(db, SQLITE_ERROR, "Error in "
1293 "replication call rep_stat client sync");
1294 goto err;
1295 }
1296 if (repStat->st_startup_complete)
1297 clientSyncComplete = 1;
1298 sqlite3_free(repStat);
1299 } while (!clientSyncComplete);
1300
1301 err: if (startupComplete || clientSyncComplete)
1302 return (1);
1303 else
1304 return (0);
1305 }
1306
1307 /*
1308 * This function finds, opens or creates the Berkeley DB environment associated
1309 * with a database opened using sqlite3BtreeOpen. There are a few different
1310 * cases:
1311 * * Temporary and transient databases share a single environment. If the
1312 * shared handle exists, return it, otherwise create a shared handle.
1313 * * For named databases, attempt to open an existing environment, if one
1314 * exists, otherwise create a new environment.
1315 */
btreePrepareEnvironment(Btree * p)1316 static int btreePrepareEnvironment(Btree *p)
1317 {
1318 BtShared *pBt;
1319 #ifdef BDBSQL_FILE_PER_TABLE
1320 char *dirPathName, dirPathBuf[BT_MAX_PATH];
1321 #endif
1322 int rc, ret;
1323
1324 pBt = p->pBt;
1325 ret = 0;
1326 rc = SQLITE_OK;
1327
1328 pBt->env_oflags = DB_INIT_MPOOL |
1329 ((pBt->dbStorage == DB_STORE_NAMED) ? 0 : DB_PRIVATE)
1330 #ifndef BDBSQL_SINGLE_THREAD
1331 | DB_THREAD
1332 #endif
1333 ;
1334
1335 if (pBt->dbStorage == DB_STORE_NAMED) {
1336 if ((rc = btreeCheckEnvPrepare(p)) != SQLITE_OK)
1337 goto err;
1338
1339 if ((ret = db_env_create(&pDbEnv, 0)) != 0)
1340 goto err;
1341 pDbEnv->set_errpfx(pDbEnv, pBt->full_name);
1342 pDbEnv->app_private = pBt;
1343 pDbEnv->set_errcall(pDbEnv, btreeHandleDbError);
1344 #ifndef BDBSQL_SINGLE_THREAD
1345 #ifndef BDBSQL_CONCURRENT_CONNECTIONS
1346 pDbEnv->set_flags(pDbEnv, DB_DATABASE_LOCKING, 1);
1347 #endif
1348 pDbEnv->set_lk_detect(pDbEnv, DB_LOCK_DEFAULT);
1349 pDbEnv->set_lk_tablesize(pDbEnv, 20000);
1350 pDbEnv->set_memory_max(pDbEnv, 0, 16 * 1024 * 1024);
1351 #ifdef BDBSQL_TXN_SNAPSHOTS_DEFAULT
1352 pBt->env_oflags |= DB_MULTIVERSION;
1353 pBt->read_txn_flags |= DB_TXN_SNAPSHOT;
1354 #endif
1355 #endif
1356 pDbEnv->set_lg_regionmax(pDbEnv, BDBSQL_LOG_REGIONMAX);
1357 #ifdef BDBSQL_MEMORY_MAX
1358 pDbEnv->set_memory_max(pDbEnv, BDBSQL_MEMORY_MAX / GIGABYTE,
1359 BDBSQL_MEMORY_MAX % GIGABYTE);
1360 #endif
1361 #ifdef BDBSQL_LOCK_TABLESIZE
1362 pDbEnv->set_lk_tablesize(pDbEnv, BDBSQL_LOCK_TABLESIZE);
1363 #endif
1364 #ifndef BDBSQL_OMIT_LEAKCHECK
1365 pDbEnv->set_alloc(pDbEnv, btreeMalloc, btreeRealloc,
1366 sqlite3_free);
1367 #endif
1368 if ((ret = pDbEnv->set_lg_max(pDbEnv, pBt->logFileSize)) != 0)
1369 goto err;
1370 #ifndef BDBSQL_OMIT_LOG_REMOVE
1371 if ((ret = pDbEnv->log_set_config(pDbEnv,
1372 DB_LOG_AUTO_REMOVE, 1)) != 0)
1373 goto err;
1374 #endif
1375 /*
1376 * Set the directory where the database file will be created
1377 * to the parent of the environment directory.
1378 */
1379 #ifdef BDBSQL_FILE_PER_TABLE
1380 /* Reuse envDirNameBuf. */
1381 dirPathName = dirPathBuf;
1382 memset(dirPathName, 0, BT_MAX_PATH);
1383 sqlite3_snprintf(sizeof(dirPathName), dirPathName,
1384 "../%s", pBt->short_name);
1385 pDbEnv->set_data_dir(pDbEnv, dirPathName);
1386 pDbEnv->set_create_dir(pDbEnv, dirPathName);
1387 #else
1388 pDbEnv->set_data_dir(pDbEnv, "..");
1389 #endif
1390 #ifdef BDBSQL_SHARE_PRIVATE
1391 /*
1392 * set mpool mutex count to 10/core. This significantly
1393 * reduces the cost of environment open/close
1394 */
1395 if (pBt->mp_mutex_count == 0)
1396 pBt->mp_mutex_count = 10 * __os_cpu_count();
1397 pDbEnv->set_mp_mtxcount(pDbEnv, pBt->mp_mutex_count);
1398 #endif
1399
1400 } else if (g_tmp_env == NULL) {
1401 /*
1402 * Creating environment shared by temp and transient tables.
1403 * We're just creating a handle here, so it doesn't matter if
1404 * we race with some other thread at this point, as long as
1405 * only one of the environment handles is opened.
1406 */
1407 if ((ret = db_env_create(&pDbEnv, 0)) != 0)
1408 goto err;
1409 pDbEnv->set_errpfx(pDbEnv, "<temp>");
1410 pDbEnv->app_private = pBt;
1411 pDbEnv->set_errcall(pDbEnv, btreeHandleDbError);
1412 pBt->env_oflags |= DB_CREATE | DB_INIT_TXN | DB_PRIVATE;
1413
1414 /*
1415 * Never create log files. We mark all databases non-durable,
1416 * but BDB still occasionally writes log records (e.g., for
1417 * checkpoints). This guarantees that those log records aren't
1418 * written to files. A small buffer should be fine.
1419 */
1420 pDbEnv->set_lg_bsize(pDbEnv, 64 * 1024);
1421 pDbEnv->set_lg_max(pDbEnv, 32 * 1024);
1422 #ifndef BDBSQL_OMIT_LEAKCHECK
1423 pDbEnv->set_alloc(pDbEnv, btreeMalloc, btreeRealloc,
1424 sqlite3_free);
1425 #endif
1426 pDbEnv->log_set_config(pDbEnv, DB_LOG_IN_MEMORY, 1);
1427 } else
1428 rc = btreeOpenEnvironment(p, 0);
1429
1430 err: return MAP_ERR(rc, ret, p);
1431 }
1432
1433 /*
1434 * The function finds an opened BtShared handle if one exists in the cache.
1435 * It assumes that the global SQLITE_MUTEX_STATIC_OPEN lock is held.
1436 */
btreeUpdateBtShared(Btree * p,int needLock)1437 int btreeUpdateBtShared(Btree *p, int needLock)
1438 {
1439 BtShared *pBt, *next_bt;
1440 sqlite3_mutex *mutexOpen;
1441 u_int8_t new_fileid[DB_FILE_ID_LEN];
1442 char *filename;
1443 int rc, ret;
1444
1445 pBt = p->pBt;
1446 rc = SQLITE_OK;
1447 ret = 0;
1448
1449 if (pBt->dbStorage != DB_STORE_NAMED)
1450 return SQLITE_OK;
1451
1452 #ifdef BDBSQL_FILE_PER_TABLE
1453 rc = getMetaDataFileName(pBt->full_name, &filename);
1454 if (rc != SQLITE_OK)
1455 return rc;
1456 #else
1457 filename = pBt->full_name;
1458 #endif
1459
1460 if (needLock) {
1461 mutexOpen = sqlite3MutexAlloc(OPEN_MUTEX(pBt->dbStorage));
1462 sqlite3_mutex_enter(mutexOpen);
1463 #ifdef SQLITE_DEBUG
1464 } else {
1465 mutexOpen = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_OPEN);
1466 assert(sqlite3_mutex_held(mutexOpen));
1467 mutexOpen = NULL;
1468 #endif
1469 }
1470 /*
1471 * Check to see if a connection has been opened to the same database
1472 * using a different BtShared. If so, switch to using that BtShared.
1473 *
1474 * It's safe to do this shuffle, since it only ever happens for
1475 * named databases, and we are always holding the global
1476 * SQLITE_MUTEX_STATIC_OPEN mutex in that case.
1477 */
1478 if (pBt->dbStorage == DB_STORE_NAMED && !pBt->env_opened &&
1479 !(ret = __os_exists(NULL, filename, NULL)) &&
1480 __os_fileid(NULL, filename, 0, new_fileid) == 0) {
1481 for (next_bt = g_shared_btrees; next_bt != NULL;
1482 next_bt = next_bt->pNextDb) {
1483 if (pBt != next_bt && memcmp(
1484 new_fileid, next_bt->fileid, DB_FILE_ID_LEN) == 0)
1485 break;
1486 }
1487 if (next_bt != pBt && next_bt != NULL) {
1488 /* Found a different BtShared to use. "upgrade" */
1489 ++next_bt->nRef;
1490 if (--pBt->nRef == 0) {
1491 (void)btreeFreeSharedBtree(pBt, 1);
1492 }
1493 p->pBt = next_bt;
1494 pBt = next_bt;
1495 }
1496 } else {
1497 if (ret != ENOENT && ret != 0)
1498 rc = dberr2sqlite(ret, p);
1499 }
1500 if (needLock)
1501 sqlite3_mutex_leave(mutexOpen);
1502
1503 #ifdef BDBSQL_FILE_PER_TABLE
1504 sqlite3_free(filename);
1505 #endif
1506 return rc;
1507 }
1508
1509 /*
1510 * Closes and re-opens a Berkeley DB environment handle.
1511 * Required when enabling or disabling replication on an existing database.
1512 * Assumes that the required open flags have been set in BtShared.
1513 */
btreeReopenEnvironment(Btree * p,int removingRep)1514 int btreeReopenEnvironment(Btree *p, int removingRep)
1515 {
1516 int idx, rc, ret;
1517 sqlite3_mutex *mutexOpen;
1518 BtShared *pBt;
1519
1520 rc = SQLITE_OK;
1521 ret = 0;
1522 pBt = p->pBt;
1523
1524 if (pBt->transactional == 0 || pBt->first_cursor != NULL ||
1525 pMainTxn != NULL || pBt->dbStorage != DB_STORE_NAMED)
1526 return SQLITE_ERROR;
1527
1528 /* commit family txn; it will be null when shutting down */
1529 if (pFamilyTxn != NULL) {
1530 ret = pFamilyTxn->commit(pFamilyTxn, 0);
1531 pFamilyTxn = NULL;
1532 /* p->inTrans = TRANS_NONE; don't change state of this */
1533 if (ret != 0)
1534 rc = dberr2sqlite(ret, p);
1535 if (rc != SQLITE_OK)
1536 return (rc);
1537 }
1538
1539 /*
1540 * Acquire mutexOpen lock while closing down cached db handles.
1541 */
1542 mutexOpen = sqlite3MutexAlloc(OPEN_MUTEX(pBt->dbStorage));
1543 sqlite3_mutex_enter(mutexOpen);
1544 /* Close open DB handles and clear related hash table */
1545 if ((rc = btreeCleanupCachedHandles(p, CLEANUP_CLOSE)) != SQLITE_OK)
1546 goto err;
1547 sqlite3HashClear(&pBt->db_cache);
1548 /* close tables and meta databases */
1549 if (pTablesDb != NULL &&
1550 (ret = pTablesDb->close(pTablesDb, DB_NOSYNC)) != 0)
1551 goto err;
1552 if (pMetaDb != NULL &&
1553 (ret = pMetaDb->close(pMetaDb, DB_NOSYNC)) != 0)
1554 goto err;
1555 pTablesDb = pMetaDb = NULL;
1556
1557 /* Flush the cache of metadata values */
1558 for (idx = 0; idx < NUMMETA; idx++)
1559 pBt->meta[idx].cached = 0;
1560 /*
1561 * Close environment, ignore DB_RUNRECOVERY errors.
1562 */
1563 if ((ret = pDbEnv->close(pDbEnv, 0)) != 0 && ret != DB_RUNRECOVERY)
1564 goto err;
1565 pDbEnv = NULL;
1566 pBt->env_opened = 0;
1567 p->connected = 0;
1568
1569 /* Configure and open a new environment. */
1570 if ((rc = btreePrepareEnvironment(p)) != 0)
1571 goto err;
1572 /*
1573 * Make thread count match the default value that env_open() sets
1574 * with FAILCHK so that the thread region is initialized correctly
1575 * for use with FAILCHK when reopening without replication.
1576 */
1577 if (removingRep &&
1578 (ret = pDbEnv->set_thread_count(pDbEnv, 50)) != 0)
1579 goto err;
1580 rc = btreeOpenEnvironment(p, 0);
1581
1582 /* Release the lock now. */
1583 err: sqlite3_mutex_leave(mutexOpen);
1584 if (rc == SQLITE_OK && ret != 0)
1585 rc = dberr2sqlite(ret, p);
1586 return rc;
1587 }
1588
1589 /*
1590 * Called from sqlite3BtreeCreateTable, if it the Berkeley DB environment
1591 * did not already exist when sqlite3BtreeOpen was called.
1592 */
btreeOpenEnvironment(Btree * p,int needLock)1593 int btreeOpenEnvironment(Btree *p, int needLock)
1594 {
1595 BtShared *pBt;
1596 sqlite3 *db;
1597 CACHED_DB *cached_db;
1598 int creating, iTable, newEnv, rc, ret, reuse_env, writeLock;
1599 sqlite3_mutex *mutexOpen;
1600 txn_mode_t txn_mode;
1601 i64 cache_sz;
1602 int createdDir = 0;
1603 #ifdef BDBSQL_SHARE_PRIVATE
1604 int createdFile = 0;
1605 #endif
1606 int i;
1607 u8 replicate = 0;
1608
1609 newEnv = ret = reuse_env = 0;
1610 rc = SQLITE_OK;
1611 cached_db = NULL;
1612 mutexOpen = NULL;
1613 pBt = p->pBt;
1614 db = p->db;
1615
1616 /*
1617 * The open (and setting pBt->env_opened) is protected by the open
1618 * mutex, to prevent concurrent threads trying to call DB_ENV->open
1619 * simultaneously.
1620 */
1621 if (needLock) {
1622 mutexOpen = sqlite3MutexAlloc(OPEN_MUTEX(pBt->dbStorage));
1623 sqlite3_mutex_enter(mutexOpen);
1624 #ifdef SQLITE_DEBUG
1625 } else if (pBt->dbStorage == DB_STORE_NAMED) {
1626 mutexOpen = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_OPEN);
1627 assert(sqlite3_mutex_held(mutexOpen));
1628 mutexOpen = NULL;
1629 #endif
1630 }
1631
1632 /*
1633 * If we already created a handle and someone has opened the global
1634 * handle in the meantime, close our handle to free the memory.
1635 */
1636 if (pBt->dbStorage != DB_STORE_NAMED && g_tmp_env != NULL) {
1637 assert(!pBt->env_opened);
1638 assert(pDbEnv != g_tmp_env);
1639 if (pDbEnv != NULL)
1640 (void)pDbEnv->close(pDbEnv, 0);
1641
1642 pDbEnv = g_tmp_env;
1643 pBt->env_opened = newEnv = reuse_env = 1;
1644 }
1645 /*
1646 * Check to see if the table has been opened to the same database
1647 * using a different name. If so, switch to using that BtShared.
1648 */
1649 if ((rc = btreeUpdateBtShared(p, 0)) != SQLITE_OK)
1650 goto err;
1651 pBt = p->pBt;
1652
1653 if (!pBt->env_opened) {
1654 cache_sz = (i64)pBt->cacheSize;
1655 if (cache_sz < DB_MIN_CACHESIZE)
1656 cache_sz = DB_MIN_CACHESIZE;
1657 cache_sz *= (pBt->dbStorage == DB_STORE_NAMED &&
1658 pBt->pageSize > 0) ?
1659 pBt->pageSize : SQLITE_DEFAULT_PAGE_SIZE;
1660 pDbEnv->set_cachesize(pDbEnv,
1661 (u_int32_t)(cache_sz / GIGABYTE),
1662 (u_int32_t)(cache_sz % GIGABYTE), 0);
1663 if (pBt->pageSize != 0 &&
1664 (ret = pDbEnv->set_mp_pagesize(pDbEnv, pBt->pageSize)) != 0)
1665 goto err;
1666 pDbEnv->set_mp_mmapsize(pDbEnv, 0);
1667 pDbEnv->set_errcall(pDbEnv, btreeHandleDbError);
1668 if (pBt->dir_name != NULL) {
1669 createdDir =
1670 (__os_mkdir(NULL, pBt->dir_name, 0777) == 0);
1671 #ifdef BDBSQL_FILE_PER_TABLE
1672 createdDir =
1673 (__os_mkdir(NULL, pBt->full_name, 0777) == 0);
1674 #endif
1675 }
1676
1677 if (pBt->dbStorage == DB_STORE_NAMED) {
1678 #ifdef BDBSQL_SHARE_PRIVATE
1679 if ((ret = btreeSetupLockfile(p, &createdFile)) != 0)
1680 goto err;
1681 /*
1682 * if lock isn't held, take read lock for open,
1683 * but do not reopen env
1684 */
1685 if (!createdFile) {
1686 btreeScopedFileLock(p, 0, 1);
1687 /*
1688 * don't checkpoint; it'd confuse
1689 * active writers
1690 */
1691 pBt->env_oflags |= DB_NO_CHECKPOINT;
1692 }
1693 #endif
1694 if ((rc = btreeSetUpReplication(p, pBt->repStartMaster,
1695 &replicate)) != SQLITE_OK)
1696 goto err;
1697 if ((rc = btreeCheckEnvOpen(p,
1698 createdDir, replicate)) != SQLITE_OK)
1699 goto err;
1700 }
1701 if ((ret = pDbEnv->open(
1702 pDbEnv, pBt->dir_name, pBt->env_oflags, 0)) != 0) {
1703 #ifdef BDBSQL_SHARE_PRIVATE
1704 if (pBt->dbStorage == DB_STORE_NAMED)
1705 btreeScopedFileUnlock(p, createdFile);
1706 #endif
1707 if (ret == ENOENT && (pBt->env_oflags & DB_CREATE) == 0)
1708 return SQLITE_OK;
1709 goto err;
1710 }
1711 pBt->env_opened = newEnv = 1;
1712 /*
1713 * repForceRecover is set when turning off replication and
1714 * used to set env open flags. Clear it here after opening
1715 * the environment.
1716 */
1717 pBt->repForceRecover = 0;
1718 if (pBt->dbStorage != DB_STORE_NAMED) {
1719 g_tmp_env = pDbEnv;
1720 reuse_env = 1;
1721 } else {
1722 #ifdef BDBSQL_SHARE_PRIVATE
1723 btreeScopedFileUnlock(p, createdFile);
1724 #endif
1725 }
1726 }
1727
1728 assert(!p->connected);
1729 p->connected = 1;
1730
1731 /*
1732 * If the environment was already open, drop the open mutex before
1733 * proceeding. Some other thread may be holding a schema lock and
1734 * be waiting for the open mutex, which would lead to a latch deadlock.
1735 *
1736 * On the other hand, if we are creating the environment, this thread
1737 * is expecting to find the schema table empty, so we need to hold
1738 * onto the open mutex and get an exclusive schema lock, to prevent
1739 * some other thread getting in ahead of us.
1740 */
1741 if (!newEnv && needLock) {
1742 assert(sqlite3_mutex_held(mutexOpen));
1743 sqlite3_mutex_leave(mutexOpen);
1744 needLock = 0;
1745 }
1746
1747 /*
1748 * Start replication. If we are not starting as the initial master,
1749 * do not try to create SQL metadata because we will use a
1750 * replicated copy that should already exist or get sent to us
1751 * shortly during replication client synchronization.
1752 */
1753 if (replicate) {
1754 if ((ret = pDbEnv->repmgr_start(pDbEnv, 1,
1755 pBt->repStartMaster ?
1756 DB_REP_MASTER : DB_REP_ELECTION)) != 0) {
1757 sqlite3Error(db, SQLITE_CANTOPEN, "Error in "
1758 "replication call repmgr_start");
1759 rc = SQLITE_CANTOPEN;
1760 goto err;
1761 }
1762 pBt->repStarted = 1;
1763
1764 if (!pBt->repStartMaster) {
1765 /*
1766 * Allow time for replication client to hold an
1767 * election and synchronize with the master.
1768 */
1769 if (!btreeRepStartupFinished(p)) {
1770 sqlite3Error(db, SQLITE_CANTOPEN, "Error "
1771 "starting as replication client");
1772 rc = SQLITE_CANTOPEN;
1773 goto err;
1774 }
1775 creating = i = 0;
1776 /*
1777 * There is a slight possibility that some of the
1778 * replicated SQL metadata may lag behind the end
1779 * of client synchronization, so retry opening the
1780 * SQL metadata a few times if there are errors.
1781 */
1782 do {
1783 rc = btreeOpenMetaTables(p, &creating);
1784 } while ((rc != SQLITE_OK) && ++i < BUSY_RETRY_COUNT);
1785 if (rc == SQLITE_OK)
1786 goto aftercreatemeta;
1787 else {
1788 sqlite3Error(db, SQLITE_CANTOPEN, "Error "
1789 "opening replicated SQL metadata");
1790 rc = SQLITE_CANTOPEN;
1791 goto err;
1792 }
1793 }
1794 }
1795 pBt->repStartMaster = 0;
1796
1797 if ((!IS_ENV_READONLY(pBt) && p->vfsFlags & SQLITE_OPEN_CREATE) ||
1798 pBt->dbStorage == DB_STORE_INMEM)
1799 pBt->db_oflags |= DB_CREATE;
1800
1801 creating = 1;
1802 if (pBt->dbStorage == DB_STORE_NAMED &&
1803 (rc = btreeOpenMetaTables(p, &creating)) != SQLITE_OK)
1804 goto err;
1805 if (creating) {
1806 /*
1807 * Update the fileid now that the file has been created.
1808 * Ignore error returns - the fileid isn't critical.
1809 */
1810 if (pBt->dbStorage == DB_STORE_NAMED) {
1811 char *filename;
1812 #ifdef BDBSQL_FILE_PER_TABLE
1813 rc = getMetaDataFileName(pBt->full_name, &filename);
1814 if (rc != SQLITE_OK)
1815 goto err;
1816 #else
1817 filename = pBt->full_name;
1818 #endif
1819 (void)__os_fileid(NULL, filename, 0, pBt->fileid);
1820 #ifdef BDBSQL_FILE_PER_TABLE
1821 if (filename != NULL)
1822 sqlite3_free(filename);
1823 #endif
1824 }
1825
1826 if ((rc = btreeCreateTable(p, &iTable,
1827 BTREE_INTKEY)) != SQLITE_OK)
1828 goto err;
1829
1830 assert(iTable == MASTER_ROOT);
1831 }
1832 aftercreatemeta:
1833
1834 #ifdef BDBSQL_PRELOAD_HANDLES
1835 if (newEnv && !creating && pBt->dbStorage == DB_STORE_NAMED)
1836 (void)btreePreloadHandles(p);
1837 #endif
1838
1839 /*
1840 * If transactions were started before the environment was opened,
1841 * start them now. Also, if creating a new environment, take a write
1842 * lock to prevent races setting up the metadata tables. Always start
1843 * the ultimate parent by starting a read transaction.
1844 */
1845 writeLock = (p->schemaLockMode == LOCKMODE_WRITE) ||
1846 (newEnv && !IS_BTREE_READONLY(p));
1847
1848 if (pBt->transactional) {
1849 txn_mode = p->inTrans;
1850 p->inTrans = TRANS_NONE;
1851
1852 if ((ret = pDbEnv->txn_begin(pDbEnv,
1853 NULL, &pFamilyTxn, DB_TXN_FAMILY)) != 0)
1854 return dberr2sqlite(ret, p);
1855 #ifdef BDBSQL_SHARE_PRIVATE
1856 pBt->lockfile.in_env_open = 1;
1857 #endif
1858 if ((writeLock || txn_mode == TRANS_WRITE) &&
1859 !btreeRepIsClient(p) &&
1860 (rc = sqlite3BtreeBeginTrans(p,
1861 (writeLock || txn_mode == TRANS_WRITE))) != SQLITE_OK)
1862 goto err;
1863 }
1864
1865 if (p->schemaLockMode != LOCKMODE_NONE) {
1866 p->schemaLockMode = LOCKMODE_NONE;
1867 rc = sqlite3BtreeLockTable(p, MASTER_ROOT, writeLock);
1868 if (rc != SQLITE_OK)
1869 goto err;
1870 }
1871
1872 /*
1873 * It is now okay for other threads to use this BtShared handle.
1874 */
1875 err: if (rc != SQLITE_OK || ret != 0) {
1876 pBt->panic = 1;
1877 p->connected = 0;
1878 }
1879 #ifdef BDBSQL_SHARE_PRIVATE
1880 pBt->lockfile.in_env_open = 0;
1881 #endif
1882 if (needLock) {
1883 assert(sqlite3_mutex_held(mutexOpen));
1884 sqlite3_mutex_leave(mutexOpen);
1885 }
1886 return MAP_ERR(rc, ret, p);
1887 }
1888
/*
 * Search the list of registered BtShared objects for one that matches the
 * request and, if found, take a reference on it.
 *
 * ppBt     - Out: the matching BtShared, or NULL if there is none.
 * fileid   - File id to match when store is DB_STORE_NAMED.
 * db       - Connection whose attached databases are checked for a
 *            duplicate when SQLITE_OPEN_SHAREDCACHE is set in vfsFlags.
 * store    - Storage mode; anything other than DB_STORE_NAMED matches an
 *            entry that has no full_name.
 * vfsFlags - VFS open flags.
 *
 * Returns SQLITE_OK (whether or not a match was found), or
 * SQLITE_CONSTRAINT if the match is already attached to db in shared-cache
 * mode.  The caller must hold SQLITE_MUTEX_STATIC_OPEN (asserted in debug
 * builds).
 *
 * SQLite itself only shares when sqlite3GlobalConfig.sharedCacheEnabled is
 * set; that test is deliberately not made here because Berkeley DB always
 * operates with a shared cache.
 */
static int btreeGetSharedBtree(
    BtShared **ppBt,
    u_int8_t *fileid,
    sqlite3 *db,
    storage_mode_t store,
    int vfsFlags)
{
	Btree *attached;
	BtShared *cand;
	int idx, matches;

#ifdef SQLITE_DEBUG
	sqlite3_mutex *mutexOpen = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_OPEN);
	assert(sqlite3_mutex_held(mutexOpen));
#endif

	*ppBt = NULL;
	for (cand = g_shared_btrees; cand != NULL; cand = cand->pNextDb) {
		assert(cand->nRef > 0);

		if (store == DB_STORE_NAMED)
			matches = (memcmp(fileid,
			    cand->fileid, DB_FILE_ID_LEN) == 0);
		else
			matches = (cand->full_name == NULL);
		if (!matches)
			continue;

		/*
		 * If the application thinks we are in shared cache
		 * mode, check that the btree handle being added does
		 * not already exist in the list of handles.
		 */
		if (vfsFlags & SQLITE_OPEN_SHAREDCACHE) {
			for (idx = db->nDb; idx-- > 0; ) {
				attached = db->aDb[idx].pBt;
				if (attached != NULL && attached->pBt == cand)
					return SQLITE_CONSTRAINT;
			}
		}

		/* First match wins: hand it out with an extra reference. */
		*ppBt = cand;
		sqlite3_mutex_enter(cand->mutex);
		cand->nRef++;
		sqlite3_mutex_leave(cand->mutex);
		break;
	}

	return SQLITE_OK;
}
1943
/*
 * Allocate and initialize a new BtShared for p.
 *
 * p         - Btree that receives the new BtShared in p->pBt on success.
 * zFilename - Database file name as given by the caller; only used when
 *             store is DB_STORE_NAMED.
 * fileid    - File id copied into the new BtShared (DB_FILE_ID_LEN bytes).
 * db        - Connection; its VFS resolves zFilename to a full path.
 * flags     - Stored in the new BtShared unchanged.
 * store     - Storage mode.  DB_STORE_TMP databases are non-transactional
 *             and buffer results; all others are transactional.
 *
 * For named databases the full path, the original name, the environment
 * directory ("<full path>-journal") and the bare file name are recorded.
 *
 * Returns SQLITE_OK, SQLITE_NOMEM on allocation failure, or the error
 * reported by the VFS if the full path cannot be determined.  Nothing is
 * stored in p on failure.  For named databases the caller must hold
 * SQLITE_MUTEX_STATIC_OPEN (asserted in debug builds).
 */
static int btreeCreateSharedBtree(
    Btree *p,
    const char *zFilename,
    u_int8_t *fileid,
    sqlite3 *db,
    int flags,
    storage_mode_t store)
{
	BtShared *new_bt;
	char *fwdSlash, *backSlash, *lastSep;
	char dirPathBuf[BT_MAX_PATH];
	int rc;

#ifdef SQLITE_DEBUG
	if (store == DB_STORE_NAMED) {
		sqlite3_mutex *mutexOpen =
		    sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_OPEN);
		assert(sqlite3_mutex_held(mutexOpen));
	}
#endif

	if ((new_bt = (struct BtShared *)sqlite3_malloc(
	    sizeof(struct BtShared))) == NULL)
		return SQLITE_NOMEM;
	memset(new_bt, 0, sizeof(struct BtShared));

	/* Every failure below is an allocation failure unless stated. */
	rc = SQLITE_NOMEM;

	new_bt->dbStorage = store;
	if (store == DB_STORE_TMP) {
		new_bt->transactional = 0;
		new_bt->resultsBuffer = 1;
	} else {
		new_bt->transactional = 1;
		new_bt->resultsBuffer = 0;
	}
#ifndef BDBSQL_AUTO_PAGE_SIZE
	new_bt->pageSize = SQLITE_DEFAULT_PAGE_SIZE;
#endif
	new_bt->flags = flags;
	new_bt->mutex = sqlite3MutexAlloc(SQLITE_MUTEX_FAST);
	if (new_bt->mutex == NULL && sqlite3GlobalConfig.bCoreMutex)
		goto err;
	memcpy(new_bt->fileid, fileid, DB_FILE_ID_LEN);

	/*
	 * Always open database with read-uncommitted enabled
	 * since SQLite allows DB_READ_UNCOMMITTED cursors to
	 * be created on any table.
	 */
#ifndef BDBSQL_SINGLE_THREAD
	new_bt->db_oflags = DB_THREAD |
	    (new_bt->transactional ? DB_READ_UNCOMMITTED : 0);
#endif
	sqlite3HashInit(&new_bt->db_cache);
	if (store == DB_STORE_NAMED) {
		/*
		 * Store full path of zFilename.  Do not use the buffer if
		 * the VFS could not fill it in.
		 */
		dirPathBuf[0] = '\0';
		if ((rc = sqlite3OsFullPathname(db->pVfs, zFilename,
		    sizeof(dirPathBuf), dirPathBuf)) != SQLITE_OK)
			goto err;
		rc = SQLITE_NOMEM;
		if ((new_bt->full_name = sqlite3_strdup(dirPathBuf)) == NULL)
			goto err;
		if ((new_bt->orig_name = sqlite3_strdup(zFilename)) == NULL)
			goto err;
		sqlite3_snprintf(sizeof(dirPathBuf), dirPathBuf,
		    "%s-journal", new_bt->full_name);
		if ((new_bt->dir_name = sqlite3_strdup(dirPathBuf)) == NULL)
			goto err;

		/*
		 * Extract just the file name component: whatever follows
		 * the last '/' or '\\', or the whole name if there is no
		 * separator at all.
		 */
		fwdSlash = strrchr(new_bt->orig_name, '/');
		backSlash = strrchr(new_bt->orig_name, '\\');
		if (fwdSlash == NULL ||
		    (backSlash != NULL && backSlash > fwdSlash))
			lastSep = backSlash;
		else
			lastSep = fwdSlash;
		if (lastSep == NULL)
			new_bt->short_name = new_bt->orig_name;
		else
			/* Move past actual path separator. */
			new_bt->short_name = lastSep + 1;
	}

	new_bt->cacheSize = SQLITE_DEFAULT_CACHE_SIZE;
	new_bt->pageCount = SQLITE_MAX_PAGE_COUNT;
	new_bt->nRef = 1;
	new_bt->uid = g_uid_next++;
	new_bt->logFileSize = SQLITE_DEFAULT_JOURNAL_SIZE_LIMIT;
#ifdef SQLITE_SECURE_DELETE
	new_bt->secureDelete = 1;
#endif

	p->pBt = new_bt;

	return SQLITE_OK;

err:
	btreeFreeSharedBtree(new_bt, 0);
	return rc;
}
2039
/*
** Open a new database.
**
** zFilename is the name of the database file.  The storage mode is chosen
** as follows (first match wins):
**   - SQLITE_OPEN_TRANSIENT_DB set in vfsFlags      -> DB_STORE_TMP
**   - zFilename NULL, empty or ":memory:", or
**     BTREE_MEMORY set in flags                     -> DB_STORE_INMEM
**   - anything else                                 -> DB_STORE_NAMED
**
** Only named databases look for (and are entered into) the list of shared
** BtShared objects headed by g_shared_btrees.
**
** On success a newly allocated Btree is stored in *ppBtree and SQLITE_OK is
** returned.  On failure the Btree is freed, *ppBtree is left unwritten and
** an error code is returned (SQLITE_NOMEM if the Btree itself cannot be
** allocated).
*/
int sqlite3BtreeOpen(
    const char *zFilename,	/* Name of the file containing the database */
    sqlite3 *db,		/* Associated database connection */
    Btree **ppBtree,		/* Pointer to new Btree object written here */
    int flags,			/* Options */
    int vfsFlags)		/* Flags passed through to VFS open */
{
	Btree *p, *next_btree;
	BtShared *pBt, *next_bt;
	int rc;
	sqlite3_mutex *mutexOpen;
	storage_mode_t store;
	u_int8_t fileid[DB_FILE_ID_LEN];
	char *filename;

	log_msg(LOG_VERBOSE, "sqlite3BtreeOpen(%s, %p, %p, %u, %u)", zFilename,
	    db, ppBtree, flags, vfsFlags);

	pBt = NULL;
	rc = SQLITE_OK;
	mutexOpen = NULL;
	filename = NULL;

	/* Allocate and zero the per-connection handle, then set defaults. */
	if ((p = (Btree *)sqlite3_malloc(sizeof(Btree))) == NULL)
		return SQLITE_NOMEM;
	memset(p, 0, sizeof(Btree));
	memset(&fileid[0], 0, DB_FILE_ID_LEN);
	p->db = db;
	p->vfsFlags = vfsFlags;
	p->pBt = NULL;
	p->readonly = 0;
	p->txn_bulk = BDBSQL_TXN_BULK_DEFAULT;
	p->vacuumPages = BDBSQL_INCR_VACUUM_PAGES;
	p->fillPercent = BDBSQL_VACUUM_FILLPERCENT;

	/* Decide which storage mode this open request maps to. */
	if ((vfsFlags & SQLITE_OPEN_TRANSIENT_DB) != 0) {
		log_msg(LOG_DEBUG, "sqlite3BtreeOpen creating temporary DB.");
		store = DB_STORE_TMP;
	} else if (zFilename == NULL ||
	    (zFilename[0] == '\0' || strcmp(zFilename, ":memory:") == 0) ||
	    (flags & BTREE_MEMORY) != 0) {
		/*
		 * Berkeley DB treats in-memory and temporary databases the
		 * same way: if there is not enough space in cache, pages
		 * overflow to temporary files.
		 */
		log_msg(LOG_DEBUG, "sqlite3BtreeOpen creating in-memory DB.");
		store = DB_STORE_INMEM;
	} else {
		log_msg(LOG_DEBUG, "sqlite3BtreeOpen creating named DB.");
		store = DB_STORE_NAMED;
		/*
		 * We always use the shared cache of handles, but SQLite
		 * performs additional checks for conflicting table locks
		 * when it is in shared cache mode, and aborts early.
		 * We use the sharable flag to control that behavior.
		 */
		if (vfsFlags & SQLITE_OPEN_SHAREDCACHE)
			p->sharable = 1;
	}

	/*
	 * Everything from here to the "err" label runs with the open mutex
	 * for this storage mode held; it is released on every exit path
	 * below.
	 */
	mutexOpen = sqlite3MutexAlloc(OPEN_MUTEX(store));
	sqlite3_mutex_enter(mutexOpen);

#ifdef BDBSQL_FILE_PER_TABLE
	/*
	 * getMetaDataFileName() hands back a name that is released with
	 * sqlite3_free() at the end of this function.
	 */
	if (store == DB_STORE_NAMED) {
		rc = getMetaDataFileName(zFilename, &filename);
		if (rc != SQLITE_OK)
			goto err;
	}
#else
	filename = (char *)zFilename;
#endif

	/*
	 * Non-named databases never share any content in BtShared.
	 * For a named database, an existing BtShared is only looked up
	 * when both __os_exists() and __os_fileid() return 0; otherwise
	 * pBt stays NULL and a new BtShared is created below.
	 */
	if (store == DB_STORE_NAMED &&
	    !__os_exists(NULL, filename, NULL) &&
	    __os_fileid(NULL, filename, 0, fileid) == 0) {
		if ((rc = btreeGetSharedBtree(&pBt,
		    fileid, db, store, vfsFlags)) != SQLITE_OK)
			goto err;
	}

	if (pBt != NULL) {
		/* Reuse the BtShared found for this file id. */
		p->pBt = pBt;
		if ((rc = btreeOpenEnvironment(p, 0)) != SQLITE_OK) {
			/*
			 * clean up ref. from btreeGetSharedBtree() [#18767]
			 */
			assert(pBt->nRef > 1);
			sqlite3_mutex_enter(pBt->mutex);
			pBt->nRef--;
			sqlite3_mutex_leave(pBt->mutex);
			goto err;
		}
		/* The btreeOpenEnvironment call might have updated pBt. */
		pBt = p->pBt;
	} else {
		/* No BtShared to reuse: build a fresh one for this Btree. */
		if ((rc = btreeCreateSharedBtree(p,
		    zFilename, fileid, db, flags, store)) != 0)
			goto err;
		pBt = p->pBt;
		/* The environment is prepared unless resultsBuffer is set. */
		if (!pBt->resultsBuffer &&
		    (rc = btreePrepareEnvironment(p)) != 0) {
			btreeFreeSharedBtree(pBt, 0);
			goto err;
		}
		/* Only named databases are in the shared btree cache. */
		if (store == DB_STORE_NAMED) {
			if (g_shared_btrees == NULL) {
				pBt->pPrevDb = NULL;
				g_shared_btrees = pBt;
			} else {
				/* Walk to the tail and append pBt there. */
				for (next_bt = g_shared_btrees;
				    next_bt->pNextDb != NULL;
				    next_bt = next_bt->pNextDb) {}
				next_bt->pNextDb = pBt;
				pBt->pPrevDb = next_bt;
			}
		}
	}

	/* Add this Btree object to the list of Btrees seen by the BtShared */
	for (next_btree = pBt->btrees; next_btree != NULL;
	    next_btree = next_btree->pNext) {
		if (next_btree == p)
			break;
	}
	/* p was not found in the list: push it on the front. */
	if (next_btree == NULL) {
		if (pBt->btrees == NULL)
			pBt->btrees = p;
		else {
			p->pNext = pBt->btrees;
			pBt->btrees->pPrev = p;
			pBt->btrees = p;
		}
	}
	p->readonly = (p->vfsFlags & SQLITE_OPEN_READONLY) ? 1 : 0;
	*ppBtree = p;

	/*
	 * Common exit for success and failure: the Btree is released only
	 * when an error occurred.
	 */
err:	if (rc != SQLITE_OK)
		sqlite3_free(p);
	if (mutexOpen != NULL) {
		assert(sqlite3_mutex_held(mutexOpen));
		sqlite3_mutex_leave(mutexOpen);
	}
#ifdef BDBSQL_FILE_PER_TABLE
	if (filename != NULL)
		sqlite3_free(filename);
#endif
	return rc;
}
2199
2200 /* Close all cursors for the given transaction. */
btreeCloseAllCursors(Btree * p,DB_TXN * txn)2201 static int btreeCloseAllCursors(Btree *p, DB_TXN *txn)
2202 {
2203 BtCursor *c, *nextc, *prevc, *free_cursors;
2204 BtShared *pBt;
2205 DB_TXN *db_txn, *dbc_txn;
2206 int rc, ret, t_rc;
2207
2208 log_msg(LOG_VERBOSE, "btreeCloseAllCursors(%p, %p)", p, txn);
2209
2210 free_cursors = NULL;
2211 pBt = p->pBt;
2212 rc = SQLITE_OK;
2213
2214 sqlite3_mutex_enter(pBt->mutex);
2215 for (c = pBt->first_cursor, prevc = NULL;
2216 c != NULL;
2217 prevc = c, c = nextc) {
2218 nextc = c->next;
2219 if (p != c->pBtree)
2220 continue;
2221 if (txn != NULL) {
2222 if (c->dbc == NULL)
2223 continue;
2224 dbc_txn = c->dbc->txn;
2225 db_txn = c->dbc->dbp->cur_txn;
2226 while (dbc_txn != NULL && dbc_txn != txn)
2227 dbc_txn = dbc_txn->parent;
2228 while (db_txn != NULL && db_txn != txn)
2229 db_txn = db_txn->parent;
2230 if (dbc_txn != txn && db_txn != txn)
2231 continue;
2232 }
2233
2234 /*
2235 * Detach the cursor from the main list and add it to the free
2236 * list.
2237 */
2238 if (prevc == NULL)
2239 pBt->first_cursor = nextc;
2240 else
2241 prevc->next = nextc;
2242
2243 c->next = free_cursors;
2244 free_cursors = c;
2245 c = prevc;
2246 }
2247 sqlite3_mutex_leave(pBt->mutex);
2248
2249 for (c = free_cursors; c != NULL; c = c->next) {
2250 t_rc = btreeCloseCursor(c, 0);
2251 if (t_rc != SQLITE_OK && rc == SQLITE_OK)
2252 rc = t_rc;
2253 }
2254
2255 if (p->compact_cursor != NULL) {
2256 if ((ret = p->compact_cursor->close(p->compact_cursor)) != 0 &&
2257 rc == SQLITE_OK)
2258 rc = dberr2sqlite(ret, p);
2259 p->compact_cursor = NULL;
2260 }
2261
2262 if (p->schemaLock != NULL && txn != NULL) {
2263 dbc_txn = p->schemaLock->txn;
2264 while (dbc_txn != NULL && dbc_txn != txn)
2265 dbc_txn = dbc_txn->parent;
2266 if (dbc_txn == txn &&
2267 (t_rc = btreeLockSchema(p, LOCKMODE_NONE)) != SQLITE_OK &&
2268 rc == SQLITE_OK)
2269 rc = t_rc;
2270 }
2271
2272 return rc;
2273 }
2274
/*
 * Walk the handle cache of p's BtShared (pBt->db_cache) and process every
 * entry according to the requested mode.
 *
 * CLEANUP_GET_LOCKS / CLEANUP_DROP_LOCKS:
 *	Nothing is closed.  With pBt->mutex held, btreeDbHandleLock() or
 *	btreeDbHandleUnlock() is called for each cached entry that is not a
 *	sequence, has an open handle and is not keyed "1".  The call is a
 *	no-op returning SQLITE_OK while p->nBackup > 0.
 *
 * CLEANUP_ABORT:
 *	Sequences are closed; a sequence whose name is no longer present in
 *	the metadata database is also dropped from the cache.  Database
 *	handles that still carry DB_AM_OPEN_CALLED are kept untouched, all
 *	others are closed and dropped from the cache.
 *
 * CLEANUP_CLOSE:
 *	Every handle is closed and every CACHED_DB is freed.  Entries that
 *	were not explicitly unlinked stay in the hash with a stale data
 *	pointer, so the caller must clear pBt->db_cache afterwards.
 *
 * Any other mode closes sequences and database handles as above; entries
 * that survive have their dbp pointer reset to NULL.
 *
 * Returns SQLITE_OK, or the first error reported while closing a handle.
 */
static int btreeCleanupCachedHandles(Btree *p, cleanup_mode_t cleanup)
{
	DB *dbp;
	DB_SEQUENCE *seq;
	DBT key;
	CACHED_DB *cached_db;
	BtShared *pBt;
	HashElem *e, *e_next;
	SEQ_COOKIE *sc;
	int locks_only, remove, ret, rc;

	log_msg(LOG_VERBOSE, "btreeCleanupCachedHandles(%p, %d)",
	    p, (int)cleanup);

	pBt = p->pBt;
	e = NULL;
	rc = SQLITE_OK;
	remove = 0;
	locks_only =
	    (cleanup == CLEANUP_GET_LOCKS || cleanup == CLEANUP_DROP_LOCKS);

	/* If a backup is in progress, we can't drop handle locks. */
	if (locks_only && p->nBackup > 0)
		return (SQLITE_OK);

	if (locks_only)
		sqlite3_mutex_enter(pBt->mutex);

	for (e = sqliteHashFirst(&pBt->db_cache); e != NULL;
	    e = e_next) {
		/*
		 * Grab the next value now rather than in the for loop so that
		 * it's possible to remove elements from the list inline.
		 */
		e_next = sqliteHashNext(e);
		cached_db = sqliteHashData(e);

		if (cached_db == NULL)
			continue;

		if (locks_only) {
			if (cached_db->is_sequence || cached_db->dbp == NULL ||
			    strcmp(cached_db->key, "1") == 0)
				continue;
			if (cleanup == CLEANUP_GET_LOCKS)
				btreeDbHandleLock(p, cached_db);
			else
				btreeDbHandleUnlock(p, cached_db);
			continue;
		}

		if (cached_db->is_sequence) {
			sc = (SEQ_COOKIE *)cached_db->cookie;
			if (cleanup == CLEANUP_ABORT && sc != NULL) {
				memset(&key, 0, sizeof(key));
				key.data = sc->name;
				key.size = key.ulen = sc->name_len;
				key.flags = DB_DBT_USERMEM;
				if (pMetaDb->exists(pMetaDb,
				    pFamilyTxn, &key, 0) == DB_NOTFOUND) {
					/*
					 * This abort removed a sequence -
					 * remove the matching cache entry.
					 */
					remove = 1;
				}
			}
			seq = (DB_SEQUENCE *)cached_db->dbp;
			if (seq != NULL && (ret = seq->close(seq, 0)) != 0 &&
			    rc == SQLITE_OK)
				rc = dberr2sqlite(ret, p);
		} else if ((dbp = cached_db->dbp) != NULL) {
			/*
			 * We have to clear the cache of any stale DB handles.
			 * If a transaction has been aborted, the handle will
			 * no longer be open.  We peek inside the handle at
			 * the flags to find out: otherwise, we would need to
			 * track all parent / child relationships when
			 * rolling back transactions.
			 */
			if (cleanup == CLEANUP_ABORT &&
			    (dbp->flags & DB_AM_OPEN_CALLED) != 0)
				continue;

#ifndef BDBSQL_SINGLE_THREAD
			if (dbp->app_private != NULL)
				sqlite3_free(dbp->app_private);
#endif
			/*
			 * Report the first close failure.  (The test used to
			 * be "== 0", which discarded every closeDB() error.)
			 */
			if ((ret = closeDB(p, dbp, DB_NOSYNC)) != 0 &&
			    rc == SQLITE_OK)
				rc = dberr2sqlite(ret, p);
			remove = 1;
		}

		if (cleanup == CLEANUP_CLOSE || remove) {
			/* Unlink from the hash before the key is freed. */
			if (remove)
				sqlite3HashInsert(&pBt->db_cache,
				    cached_db->key,
				    (int)strlen(cached_db->key), NULL);
			if (cached_db->cookie != NULL)
				sqlite3_free(cached_db->cookie);
			sqlite3_free(cached_db);
			remove = 0;
		} else
			cached_db->dbp = NULL;
	}

	if (locks_only)
		sqlite3_mutex_leave(pBt->mutex);

	return rc;
}
2387
/*
** Close an open database and invalidate all cursors.
**
** The Btree p is always freed before returning.  If p has no BtShared only
** that free happens.  Otherwise all cursors of p are closed, an active main
** transaction is rolled back, the family transaction is committed and the
** schema is released.  p is then unlinked from its BtShared; when that was
** the last reference, the cached handles, the tables and metadata
** databases, (for named databases) the environment, and the BtShared itself
** are closed and freed as well.
**
** Two error channels are kept: rc holds SQLite result codes and ret holds
** Berkeley DB error numbers.  The first SQLite error wins; otherwise the
** first Berkeley DB error is translated with dberr2sqlite().
*/
int sqlite3BtreeClose(Btree *p)
{
	Btree *next_btree;
	BtShared *pBt;
	int ret, rc, t_rc, t_ret;
	sqlite3_mutex *mutexOpen;
#ifdef BDBSQL_SHARE_PRIVATE
	int needsunlock = 0;
#endif

	log_msg(LOG_VERBOSE, "sqlite3BtreeClose(%p)", p);

	ret = 0;
	pBt = p->pBt;
	rc = SQLITE_OK;

	/* Nothing shared was ever attached: just free the handle. */
	if (pBt == NULL)
		goto done;
#ifdef BDBSQL_SHARE_PRIVATE
	/*
	 * It is useful to checkpoint when closing but in the case of
	 * BDBSQL_SHARE_PRIVATE the write lock is required to ensure
	 * that the current data is written.  That must be acquired while
	 * the environment is still intact in case of a re-open.
	 *
	 * NOTE(review): needsunlock is only acted upon inside the
	 * "nRef reached zero" branch below -- confirm the lock taken here
	 * is released elsewhere when other Btrees still reference pBt.
	 */
	if (pBt->dbStorage == DB_STORE_NAMED && pDbEnv) {
		if (pBt->transactional && pBt->env_opened) {
			btreeScopedFileLock(p, 1, 0);
			needsunlock = 1;
			/* checkpoint happens below */
		}
	}
#endif

	/* A NULL transaction closes every cursor that belongs to p. */
	rc = btreeCloseAllCursors(p, NULL);

#ifndef SQLITE_OMIT_AUTOVACUUM
	/*
	 * Btree might keep some incremental vacuum info with an internal
	 * link list.  Need to free the link when Btree is closed.
	 */
	btreeFreeVacuumInfo(p);
#endif

	/* Anything still uncommitted in the main transaction is discarded. */
	if (pMainTxn != NULL &&
	    (t_rc = sqlite3BtreeRollback(p)) != SQLITE_OK && rc == SQLITE_OK)
		rc = t_rc;
	assert(pMainTxn == NULL);

	/* The family transaction is committed, not aborted. */
	if (pFamilyTxn != NULL) {
		ret = pFamilyTxn->commit(pFamilyTxn, 0);
		pFamilyTxn = NULL;
		p->inTrans = TRANS_NONE;
		p->txn_excl = 0;
		if (ret != 0 && rc == SQLITE_OK)
			rc = dberr2sqlite(ret, p);
	}

	if (p->schema != NULL) {
		if (p->free_schema != NULL)
			p->free_schema(p->schema);
		/* This needs to be a real call to sqlite3_free. */
#ifdef BDBSQL_OMIT_LEAKCHECK
#undef	sqlite3_free
#endif
		sqlite3_free(p->schema);
#ifdef BDBSQL_OMIT_LEAKCHECK
#define	sqlite3_free free
#endif
	}

	/*
	 * #18538 -- another thread may be attempting to open this BtShared at
	 * the same time that we are closing it.
	 *
	 * To avoid a race, we need to hold the open mutex until the
	 * environment is closed.  Otherwise, the opening thread might open its
	 * handle before this one is completely closed, and DB_REGISTER doesn't
	 * support that.
	 */
	mutexOpen = sqlite3MutexAlloc(OPEN_MUTEX(pBt->dbStorage));
	sqlite3_mutex_enter(mutexOpen);

	/*
	 * Remove this Btree from the BtShared's list of btrees.  The loop
	 * keeps walking after p has been unlinked; p's own pNext pointer is
	 * not modified, so the walk simply runs to the end of the list.
	 */
	for (next_btree = pBt->btrees; next_btree != NULL;
	    next_btree = next_btree->pNext) {
		if (next_btree == p) {
			if (next_btree == pBt->btrees) {
				pBt->btrees = next_btree->pNext;
				if (pBt->btrees != NULL)
					pBt->btrees->pPrev = NULL;
			} else {
				p->pPrev->pNext = p->pNext;
				if (p->pNext != NULL)
					p->pNext->pPrev = p->pPrev;
			}
		}
	}

	/* Last reference gone: tear down everything the BtShared owns. */
	if (--pBt->nRef == 0) {
		assert (pBt->btrees == NULL);
		if (pBt->dbStorage == DB_STORE_NAMED) {
			/* Remove it from the linked list of shared envs. */
			assert(pBt == g_shared_btrees || pBt->pPrevDb != NULL);
			if (pBt == g_shared_btrees)
				g_shared_btrees = pBt->pNextDb;
			else
				pBt->pPrevDb->pNextDb = pBt->pNextDb;
			if (pBt->pNextDb != NULL)
				pBt->pNextDb->pPrevDb = pBt->pPrevDb;
		}

		/*
		 * At this point, the BtShared has been removed from the shared
		 * list, so it cannot be reused and it is safe to close any
		 * handles.
		 */
		t_rc = btreeCleanupCachedHandles(p, CLEANUP_CLOSE);
		if (t_rc != SQLITE_OK && rc == SQLITE_OK)
			rc = t_rc;
		/* The cached entries were freed above; drop the hash itself. */
		sqlite3HashClear(&pBt->db_cache);

		/* Delete any memory held by the pragma cache. */
		cleanPragmaCache(p);

		if (pTablesDb != NULL && (t_ret =
		    pTablesDb->close(pTablesDb, DB_NOSYNC)) != 0 && ret == 0)
			ret = t_ret;
		if (pMetaDb != NULL && (t_ret =
		    pMetaDb->close(pMetaDb, DB_NOSYNC)) != 0 && ret == 0)
			ret = t_ret;
		pTablesDb = pMetaDb = NULL;

		/* We never close down the shared tmp environment. */
		if (pBt->dbStorage == DB_STORE_NAMED && pDbEnv) {
			/*
			 * Checkpoint when closing.  This allows log file
			 * auto-removal, which keeps the size of the
			 * environment directory small and also
			 * bounds the time we would have to spend in
			 * recovery.
			 */
			if (pBt->transactional && pBt->env_opened) {
				if ((t_ret = pDbEnv->txn_checkpoint(pDbEnv,
				    0, 0, 0)) != 0 && ret == 0)
					ret = t_ret;
			}
#ifdef BDBSQL_SHARE_PRIVATE
			/* don't flush the cache; checkpoint has been done */
			pDbEnv->set_errcall(pDbEnv, NULL);
			pDbEnv->set_flags(pDbEnv, DB_NOFLUSH, 1);
#endif
			if ((t_ret = pDbEnv->close(pDbEnv, 0)) != 0 && ret == 0)
				ret = t_ret;
			pBt->repStarted = 0;
		}
#ifdef BDBSQL_SHARE_PRIVATE
		/* this must happen before the pBt disappears */
		if (needsunlock)
			btreeScopedFileUnlock(p, 1);
#endif
		btreeFreeSharedBtree(pBt, 0);
	}
	sqlite3_mutex_leave(mutexOpen);

	/*
	 * Merge the two error channels: a SQLite error recorded in rc takes
	 * precedence over a Berkeley DB error recorded in ret.
	 */
done:	rc = (rc != SQLITE_OK) ?
	    rc : (ret == 0) ? SQLITE_OK : dberr2sqlite(ret, p);
	sqlite3_free(p);
	return rc;
}
2561
2562 /*
2563 ** Change the limit on the number of pages allowed in the cache.
2564 **
2565 ** The maximum number of cache pages is set to the absolute value of mxPage.
2566 ** If mxPage is negative in SQLite, the pager will operate asynchronously - it
2567 ** will not stop to do fsync()s to insure data is written to the disk surface
2568 ** before continuing.
2569 **
2570 ** The Berkeley DB cache always operates in asynchronously (except when writing
2571 ** a checkpoint), but log writes are triggered to maintain write-ahead logging
2572 ** semantics.
2573 */
sqlite3BtreeSetCacheSize(Btree * p,int mxPage)2574 int sqlite3BtreeSetCacheSize(Btree *p, int mxPage)
2575 {
2576 BtShared *pBt;
2577 log_msg(LOG_VERBOSE, "sqlite3BtreeSetCacheSize(%p, %u)", p, mxPage);
2578
2579 pBt = p->pBt;
2580 if (mxPage < 0)
2581 mxPage = -mxPage;
2582
2583 if (!p->connected)
2584 pBt->cacheSize = mxPage;
2585 return SQLITE_OK;
2586 }
2587
2588 /*
2589 ** Change the way data is synced to disk in order to increase or decrease how
2590 ** well the database resists damage due to OS crashes and power failures.
2591 ** Level 1 is the same as asynchronous (no syncs() occur and there is a high
2592 ** probability of damage) Level 2 is the default. There is a very low but
2593 ** non-zero probability of damage. Level 3 reduces the probability of damage
2594 ** to near zero but with a write performance reduction.
2595 **
2596 ** Berkeley DB always does the equivalent of "fullSync".
2597 */
sqlite3BtreeSetSafetyLevel(Btree * p,int level,int fullSync,int ckptFullSync)2598 int sqlite3BtreeSetSafetyLevel(
2599 Btree *p,
2600 int level,
2601 int fullSync,
2602 int ckptFullSync)
2603 {
2604 BtShared *pBt;
2605 log_msg(LOG_VERBOSE,
2606 "sqlite3BtreeSetSafetyLevel(%p, %u, %u, %u)",
2607 p, level, fullSync, ckptFullSync);
2608
2609 pBt = p->pBt;
2610
2611 /* TODO: Ignore ckptFullSync for now - it corresponds to:
2612 * PRAGMA checkpoint_fullfsync
2613 * Berkeley DB doesn't allow you to disable that, so ignore the pragma.
2614 */
2615 if (GET_DURABLE(p->pBt)) {
2616 pDbEnv->set_flags(pDbEnv, DB_TXN_NOSYNC, (level == 1));
2617 pDbEnv->set_flags(pDbEnv, DB_TXN_WRITE_NOSYNC, (level == 2));
2618 }
2619 return SQLITE_OK;
2620 }
2621
/*
 * Refresh the handle cache of p.  When schema_changed is non-zero the cache
 * is first run through btreeInvalidateHandleCache(); a failure there is
 * returned immediately.  Afterwards the cached handles are processed with
 * CLEANUP_GET_LOCKS and that call's result is returned.
 */
int sqlite3BtreeHandleCacheUpdate(Btree *p, int schema_changed)
{
	int rc;

	if (schema_changed != 0) {
		rc = btreeInvalidateHandleCache(p);
		if (rc != 0)
			return rc;
	}

	return btreeCleanupCachedHandles(p, CLEANUP_GET_LOCKS);
}
2630
2631 /*
2632 * If the schema version has changed since the last transaction we need to
2633 * close all handles in the handle cache that aren't holding a handle lock.
2634 * Ideally we could do this via the sqlite3ResetInternalSchema method
2635 * but there is no obvious hook there, and.. since we do the GET_LOCKS
2636 * call here, we need to close handles now or we can't tell if they need to be
2637 * closed.
2638 * TODO: We'll probably be best altering the sqlite code to make this work
2639 * more efficiently.
2640 */
btreeInvalidateHandleCache(Btree * p)2641 static int btreeInvalidateHandleCache(Btree *p) {
2642 BtShared *pBt;
2643 int cookie, i, rc, ret;
2644 CACHED_DB *cached_db, **tables_to_close;
2645 DB *dbp;
2646 HashElem *e, *e_next;
2647 u_int32_t flags;
2648
2649 rc = ret = 0;
2650 pBt = p->pBt;
2651
2652 if (p->inTrans == TRANS_NONE && p->db != NULL && p->db->aDb != NULL) {
2653 sqlite3BtreeGetMeta(p, BTREE_SCHEMA_VERSION, (u32 *)&cookie);
2654 if (p->db->aDb[0].pSchema != NULL &&
2655 p->db->aDb[0].pSchema->schema_cookie != cookie) {
2656 /*
2657 * TODO: Is it possible that this function is called
2658 * while already holding the mutex? Maybe from the
2659 * sequence code.
2660 */
2661 sqlite3_mutex_enter(pBt->mutex);
2662 /*
2663 * We can't call DB->close while holding the mutex, so
2664 * record which handles we want to close and do the
2665 * actual close after the mutex is released.
2666 */
2667 for (e = sqliteHashFirst(&pBt->db_cache), i = 0;
2668 e != NULL; e = sqliteHashNext(e), i++) {}
2669
2670 if (i == 0) {
2671 sqlite3_mutex_leave(pBt->mutex);
2672 return (0);
2673 }
2674
2675 tables_to_close =
2676 sqlite3_malloc(i * sizeof(CACHED_DB *));
2677 if (tables_to_close == NULL) {
2678 sqlite3_mutex_leave(pBt->mutex);
2679 return SQLITE_NOMEM;
2680 }
2681 memset(tables_to_close, 0, i * sizeof(CACHED_DB *));
2682 /*
2683 * Ideally we'd be able to find out if the Berkeley DB
2684 * fileid is still valid, but that's not currently
2685 * simple, so close all handles.
2686 */
2687 for (e = sqliteHashFirst(&pBt->db_cache), i = 0;
2688 e != NULL; e = e_next) {
2689 e_next = sqliteHashNext(e);
2690 cached_db = sqliteHashData(e);
2691
2692 /* Skip table name db and in memory tables. */
2693 if (cached_db == NULL ||
2694 strcmp(cached_db->key, "1") == 0 ||
2695 cached_db->dbp == NULL)
2696 continue;
2697 dbp = cached_db->dbp;
2698 dbp->dbenv->get_open_flags(dbp->dbenv, &flags);
2699 if (flags & DB_PRIVATE)
2700 continue;
2701 if (btreeDbHandleIsLocked(cached_db))
2702 continue;
2703 tables_to_close[i++] = cached_db;
2704 sqlite3HashInsert(&pBt->db_cache,
2705 cached_db->key,
2706 (int)strlen(cached_db->key), NULL);
2707 }
2708 sqlite3_mutex_leave(pBt->mutex);
2709 for (i = 0; tables_to_close[i] != NULL; i++) {
2710 cached_db = tables_to_close[i];
2711 dbp = cached_db->dbp;
2712 #ifndef BDBSQL_SINGLE_THREAD
2713 if (dbp->app_private != NULL)
2714 sqlite3_free(dbp->app_private);
2715 #endif
2716 if ((ret = closeDB(p, dbp, DB_NOSYNC)) == 0 &&
2717 rc == SQLITE_OK)
2718 rc = dberr2sqlite(ret, p);
2719 if (cached_db->cookie != NULL)
2720 sqlite3_free(cached_db->cookie);
2721 sqlite3_free(cached_db);
2722 }
2723 sqlite3_free(tables_to_close);
2724 if (rc != 0)
2725 return (rc);
2726 }
2727 }
2728 return (0);
2729 }
2730
/*
 * Begin a transaction on behalf of internal callers: the cached handles are
 * first processed with CLEANUP_GET_LOCKS (the result of that step is not
 * examined), then the request is passed on to sqlite3BtreeBeginTrans(),
 * whose result is returned.
 */
int btreeBeginTransInternal(Btree *p, int wrflag)
{
	int rc;

	(void)btreeCleanupCachedHandles(p, CLEANUP_GET_LOCKS);
	rc = sqlite3BtreeBeginTrans(p, wrflag);
	return rc;
}
2736
2737 /*
2738 ** Attempt to start a new transaction. A write-transaction is started if the
2739 ** second argument is true, otherwise a read-transaction. No-op if a
2740 ** transaction is already in progress.
2741 **
2742 ** A write-transaction must be started before attempting any changes to the
2743 ** database. None of the following routines will work unless a transaction
2744 ** is started first:
2745 **
2746 ** sqlite3BtreeCreateTable()
2747 ** sqlite3BtreeCreateIndex()
2748 ** sqlite3BtreeClearTable()
2749 ** sqlite3BtreeDropTable()
2750 ** sqlite3BtreeInsert()
2751 ** sqlite3BtreeDelete()
2752 ** sqlite3BtreeUpdateMeta()
2753 */
sqlite3BtreeBeginTrans(Btree * p,int wrflag)2754 int sqlite3BtreeBeginTrans(Btree *p, int wrflag)
2755 {
2756 BtShared *pBt;
2757 int rc;
2758 u_int32_t txn_exclPriority;
2759 u32 temp;
2760
2761 log_msg(LOG_VERBOSE,
2762 "sqlite3BtreeBeginTrans(%p, %u) -- writer %s",
2763 p, wrflag, pReadTxn ? "active" : "inactive");
2764
2765 /*
2766 * The BtShared is not in a usable state. Return NOMEM, since it
2767 * is the most consistently well handled error return from SQLite code.
2768 */
2769 if (p->pBt->panic)
2770 return SQLITE_NOMEM;
2771
2772 pBt = p->pBt;
2773 rc = SQLITE_OK;
2774 txn_exclPriority = -1;
2775
2776 /* A replication client should not start write transactions. */
2777 if (wrflag && (IS_BTREE_READONLY(p) || btreeRepIsClient(p)))
2778 return SQLITE_READONLY;
2779
2780 if (!p->connected) {
2781 if (wrflag != 2) {
2782 p->inTrans = (wrflag || p->inTrans == TRANS_WRITE) ?
2783 TRANS_WRITE : TRANS_READ;
2784 if (!pBt->need_open)
2785 return SQLITE_OK;
2786 }
2787 if ((rc = btreeOpenEnvironment(p, 1)) != SQLITE_OK)
2788 return rc;
2789 /* The btreeOpenEnvironment call might have updated pBt. */
2790 pBt = p->pBt;
2791 }
2792
2793 if (wrflag == 2)
2794 p->txn_excl = 1;
2795 if (pBt->transactional) {
2796 if (wrflag && p->inTrans != TRANS_WRITE)
2797 p->inTrans = TRANS_WRITE;
2798 else if (p->inTrans == TRANS_NONE)
2799 p->inTrans = TRANS_READ;
2800
2801 if (pReadTxn == NULL || p->nSavepoint <= p->db->nSavepoint)
2802 rc = sqlite3BtreeBeginStmt(p, p->db->nSavepoint);
2803
2804 /* Exclusive transaction. */
2805 if (wrflag == 2 && rc == SQLITE_OK) {
2806 pSavepointTxn->set_priority(pSavepointTxn,
2807 txn_exclPriority);
2808 pReadTxn->set_priority(pReadTxn, txn_exclPriority);
2809 pMainTxn->set_priority(pMainTxn, txn_exclPriority);
2810 pFamilyTxn->set_priority(pFamilyTxn, txn_exclPriority);
2811 sqlite3BtreeGetMeta(p, 1, &temp);
2812 } else if (p->txn_priority != 0) {
2813 pSavepointTxn->set_priority(pSavepointTxn,
2814 p->txn_priority);
2815 pReadTxn->set_priority(pReadTxn, p->txn_priority);
2816 pMainTxn->set_priority(pMainTxn, p->txn_priority);
2817 pFamilyTxn->set_priority(pFamilyTxn, p->txn_priority);
2818 }
2819 }
2820 return rc;
2821 }
2822
2823 /***************************************************************************
2824 ** This routine does the first phase of a two-phase commit. This routine
2825 ** causes a rollback journal to be created (if it does not already exist)
2826 ** and populated with enough information so that if a power loss occurs the
2827 ** database can be restored to its original state by playing back the journal.
2828 ** Then the contents of the journal are flushed out to the disk. After the
2829 ** journal is safely on oxide, the changes to the database are written into
2830 ** the database file and flushed to oxide. At the end of this call, the
2831 ** rollback journal still exists on the disk and we are still holding all
2832 ** locks, so the transaction has not committed. See sqlite3BtreeCommit() for
2833 ** the second phase of the commit process.
2834 **
2835 ** This call is a no-op if no write-transaction is currently active on pBt.
2836 **
2837 ** Otherwise, sync the database file for the engine pBt. zMaster points to
2838 ** the name of a master journal file that should be written into the
2839 ** individual journal file, or is NULL, indicating no master journal file
2840 ** (single database transaction).
2841 **
2842 ** When this is called, the master journal should already have been created,
2843 ** populated with this journal pointer and synced to disk.
2844 **
2845 ** Once this is routine has returned, the only thing required to commit the
2846 ** write-transaction for this database file is to delete the journal.
2847 */
sqlite3BtreeCommitPhaseOne(Btree * p,const char * zMaster)2848 int sqlite3BtreeCommitPhaseOne(Btree *p, const char *zMaster)
2849 {
2850 log_msg(LOG_VERBOSE,
2851 "sqlite3BtreeCommitPhaseOne(%p, %s)", p, zMaster);
2852 return SQLITE_OK;
2853 }
2854
2855 /***************************************************************************
2856 ** Commit the transaction currently in progress.
2857 **
2858 ** This routine implements the second phase of a 2-phase commit. The
2859 ** sqlite3BtreeCommitPhaseOne() routine does the first phase and should
2860 ** be invoked prior to calling this routine. The sqlite3BtreeCommitPhaseOne()
2861 ** routine did all the work of writing information out to disk and flushing the
2862 ** contents so that they are written onto the disk platter. All this
2863 ** routine has to do is delete or truncate or zero the header in the
2864 ** the rollback journal (which causes the transaction to commit) and
2865 ** drop locks.
2866 **
2867 ** Normally, if an error occurs while the pager layer is attempting to
2868 ** finalize the underlying journal file, this function returns an error and
2869 ** the upper layer will attempt a rollback. However, if the second argument
2870 ** is non-zero then this b-tree transaction is part of a multi-file
2871 ** transaction. In this case, the transaction has already been committed
2872 ** (by deleting a master journal file) and the caller will ignore this
2873 ** functions return code. So, even if an error occurs in the pager layer,
2874 ** reset the b-tree objects internal state to indicate that the write
2875 ** transaction has been closed. This is quite safe, as the pager will have
2876 ** transitioned to the error state.
2877 **
2878 ** This will release the write lock on the database file. If there
2879 ** are no active cursors, it also releases the read lock.
2880 **
2881 ** NOTE: It's OK for Berkeley DB to ignore the bCleanup flag - it is only used
2882 ** by SQLite when it is safe for it to ignore stray journal files. That's not
2883 ** a relevant consideration for Berkele DB.
2884 */
sqlite3BtreeCommitPhaseTwo(Btree * p,int bCleanup)2885 int sqlite3BtreeCommitPhaseTwo(Btree *p, int bCleanup)
2886 {
2887 Btree *next_btree;
2888 BtShared *pBt;
2889 DELETED_TABLE *dtable, *next;
2890 char *tableName, tableNameBuf[DBNAME_SIZE];
2891 char *oldTableName, oldTableNameBuf[DBNAME_SIZE], *fileName;
2892 int needVacuum, rc, ret, t_rc;
2893 int in_trans, removeFlags;
2894 u_int32_t defaultTxnPriority;
2895 #ifdef BDBSQL_SHARE_PRIVATE
2896 int deleted = 0; /* indicates tables were deleted */
2897 int needsunlock = 0;
2898 #endif
2899 #ifdef BDBSQL_FILE_PER_TABLE
2900 DBT key;
2901 #endif
2902 log_msg(LOG_VERBOSE,
2903 "sqlite3BtreeCommitPhaseTwo(%p) -- writer %s",
2904 p, pReadTxn ? "active" : "inactive");
2905
2906 pBt = p->pBt;
2907 rc = SQLITE_OK;
2908 defaultTxnPriority = 100;
2909 needVacuum = 0;
2910 removeFlags = DB_AUTO_COMMIT | DB_LOG_NO_DATA | DB_NOSYNC | \
2911 (GET_DURABLE(pBt) ? 0 : DB_TXN_NOT_DURABLE);
2912
2913 if (pMainTxn && p->db->activeVdbeCnt <= 1) {
2914 #ifdef BDBSQL_SHARE_PRIVATE
2915 needsunlock = 1;
2916 #endif
2917 /* Mark the end of an exclusive transaction. */
2918 p->txn_excl = 0;
2919 t_rc = btreeCloseAllCursors(p, pMainTxn);
2920 if (t_rc != SQLITE_OK && rc == SQLITE_OK)
2921 rc = t_rc;
2922
2923 /*
2924 * Even if we get an error, we can't use the
2925 * transaction handle again, so we should keep going
2926 * and clear out the Btree fields.
2927 */
2928 ret = pMainTxn->commit(pMainTxn, 0);
2929 if (ret != 0 && rc == SQLITE_OK)
2930 rc = dberr2sqlite(ret, p);
2931
2932 pMainTxn = pSavepointTxn = pReadTxn = NULL;
2933 p->nSavepoint = 0;
2934
2935 for (dtable = p->deleted_tables;
2936 dtable != NULL;
2937 dtable = next) {
2938 #ifdef BDBSQL_SHARE_PRIVATE
2939 deleted = 1;
2940 #endif
2941 tableName = tableNameBuf;
2942 GET_TABLENAME(tableName, sizeof(tableNameBuf),
2943 dtable->iTable, "");
2944 FIX_TABLENAME(pBt, fileName, tableName);
2945
2946 /*
2947 * In memory db was not renamed. Just do a quick remove
2948 * in this case.
2949 */
2950 if (pBt->dbStorage == DB_STORE_INMEM) {
2951 ret = pDbEnv->dbremove(pDbEnv, NULL, fileName,
2952 tableName, removeFlags);
2953 goto next;
2954 }
2955 #ifndef BDBSQL_FILE_PER_TABLE
2956 oldTableName = oldTableNameBuf;
2957 GET_TABLENAME(oldTableName, sizeof(oldTableNameBuf),
2958 dtable->iTable, "old-");
2959
2960 ret = pDbEnv->dbremove(pDbEnv, NULL, fileName,
2961 oldTableName, removeFlags);
2962 #else
2963 if (dtable->flag == DTF_DELETE) {
2964 oldTableName = oldTableNameBuf;
2965 GET_TABLENAME(oldTableName,
2966 sizeof(oldTableNameBuf),
2967 dtable->iTable, "old-");
2968
2969 ret = pDbEnv->dbremove(pDbEnv, NULL, fileName,
2970 oldTableName, removeFlags);
2971 } else {
2972 ret = pDbEnv->dbremove(pDbEnv, NULL, fileName,
2973 NULL, removeFlags);
2974 if (ret != 0 && rc == SQLITE_OK)
2975 rc = dberr2sqlite(ret, p);
2976
2977 memset(&key, 0, sizeof(key));
2978 key.flags = DB_DBT_USERMEM;
2979 key.data = tableName;
2980 key.size = strlen(tableName);
2981 ret = pTablesDb->del(pTablesDb, NULL, &key, 0);
2982 }
2983 #endif
2984 next: if (ret != 0 && rc == SQLITE_OK)
2985 rc = dberr2sqlite(ret, p);
2986
2987 next = dtable->next;
2988 sqlite3_free(dtable);
2989 }
2990 p->deleted_tables = NULL;
2991
2992 /* Execute vacuum if auto-vacuum mode is FULL or incremental */
2993 needVacuum = (pBt->dbStorage == DB_STORE_NAMED &&
2994 p->inTrans == TRANS_WRITE &&
2995 (sqlite3BtreeGetAutoVacuum(p) == BTREE_AUTOVACUUM_FULL ||
2996 p->needVacuum));
2997 } else if (p->inTrans == TRANS_WRITE)
2998 rc = sqlite3BtreeSavepoint(p, SAVEPOINT_RELEASE, 0);
2999
3000 #ifdef BDBSQL_SHARE_PRIVATE
3001 if (pBt->dbStorage == DB_STORE_NAMED && needsunlock) {
3002 /* need to checkpoint if databases were removed */
3003 if (deleted) {
3004 assert(btreeHasFileLock(p, 1)); /* write lock */
3005 rc = dberr2sqlite(pDbEnv->txn_checkpoint(
3006 pDbEnv, 0, 0, 0), p);
3007 }
3008 btreeFileUnlock(p);
3009 }
3010 #endif
3011 if (pFamilyTxn)
3012 pFamilyTxn->set_priority(pFamilyTxn, defaultTxnPriority);
3013
3014 if (p->db->activeVdbeCnt > 1)
3015 p->inTrans = TRANS_READ;
3016 else {
3017 p->inTrans = TRANS_NONE;
3018 if (p->schemaLockMode > LOCKMODE_NONE &&
3019 (t_rc = btreeLockSchema(p, LOCKMODE_NONE)) != SQLITE_OK &&
3020 rc == SQLITE_OK)
3021 rc = t_rc;
3022
3023 /*
3024 * Only release the handle locks if no transactions are active
3025 * in any Btree.
3026 */
3027 in_trans = 0;
3028 for (next_btree = pBt->btrees; next_btree != NULL;
3029 next_btree = next_btree->pNext) {
3030 if (next_btree->inTrans != TRANS_NONE) {
3031 in_trans = 1;
3032 break;
3033 }
3034 }
3035
3036 /* Drop any handle locks if this was the only active txn. */
3037 if (in_trans == 0)
3038 btreeCleanupCachedHandles(p, CLEANUP_DROP_LOCKS);
3039 }
3040
3041 if (needVacuum && rc == SQLITE_OK)
3042 rc = btreeVacuum(p, &p->db->zErrMsg);
3043
3044 return rc;
3045 }
3046
3047 /*
3048 ** Do both phases of the commit.
3049 */
sqlite3BtreeCommit(Btree * p)3050 int sqlite3BtreeCommit(Btree *p)
3051 {
3052 BtShared *pBt;
3053 int rc;
3054
3055 log_msg(LOG_VERBOSE, "sqlite3BtreeCommit(%p)", p);
3056
3057 pBt = p->pBt;
3058 rc = sqlite3BtreeCommitPhaseOne(p, NULL);
3059 if (rc == SQLITE_OK)
3060 rc = sqlite3BtreeCommitPhaseTwo(p, 0);
3061
3062 return (rc);
3063 }
3064
3065 /*
3066 ** Rollback the transaction in progress. All cursors will be invalidated
3067 ** by this operation. Any attempt to use a cursor that was open at the
3068 ** beginning of this operation will result in an error.
3069 **
3070 ** This will release the write lock on the database file. If there are no
3071 ** active cursors, it also releases the read lock.
3072 */
sqlite3BtreeRollback(Btree * p)3073 int sqlite3BtreeRollback(Btree *p)
3074 {
3075 BtShared *pBt;
3076 int rc, t_rc;
3077
3078 log_msg(LOG_VERBOSE, "sqlite3BtreeRollback(%p)", p);
3079
3080 rc = SQLITE_OK;
3081 pBt = p->pBt;
3082 if (pMainTxn != NULL)
3083 rc = sqlite3BtreeSavepoint(p, SAVEPOINT_ROLLBACK, -1);
3084 if (p->schemaLockMode > LOCKMODE_NONE &&
3085 (t_rc = btreeLockSchema(p, LOCKMODE_NONE)) != SQLITE_OK &&
3086 rc == SQLITE_OK)
3087 rc = t_rc;
3088
3089 /* Clear failure state if rollback is done successfully. */
3090 if (rc == SQLITE_OK)
3091 pBt->panic = 0;
3092
3093 return rc;
3094 }
3095
3096 /*
3097 ** Start a statement subtransaction. The subtransaction can be rolled back
3098 ** independently of the main transaction. You must start a transaction
3099 ** before starting a subtransaction. The subtransaction is ended automatically
3100 ** if the main transaction commits or rolls back.
3101 **
3102 ** Only one subtransaction may be active at a time. It is an error to try
3103 ** to start a new subtransaction if another subtransaction is already active.
3104 **
3105 ** Statement subtransactions are used around individual SQL statements that
3106 ** are contained within a BEGIN...COMMIT block. If a constraint error
3107 ** occurs within the statement, the effect of that one statement can be
3108 ** rolled back without having to rollback the entire transaction.
3109 */
sqlite3BtreeBeginStmt(Btree * p,int iStatement)3110 int sqlite3BtreeBeginStmt(Btree *p, int iStatement)
3111 {
3112 BtShared *pBt;
3113 int ret;
3114
3115 log_msg(LOG_VERBOSE, "sqlite3BtreeBeginStmt(%p, %d)", p, iStatement);
3116
3117 pBt = p->pBt;
3118 ret = 0;
3119
3120 if (pBt->transactional && p->inTrans != TRANS_NONE &&
3121 pFamilyTxn != NULL) {
3122
3123 if (!pMainTxn) {
3124 #ifdef BDBSQL_SHARE_PRIVATE
3125 /* btree{Read,Write}lock may reopen the environment */
3126 if (pBt->dbStorage == DB_STORE_NAMED)
3127 btreeFileLock(p);
3128 #endif
3129 if ((ret = pDbEnv->txn_begin(pDbEnv, pFamilyTxn,
3130 &pMainTxn, p->txn_bulk ? DB_TXN_BULK :
3131 pBt->read_txn_flags)) != 0) {
3132 #ifdef BDBSQL_SHARE_PRIVATE
3133 if (pBt->dbStorage == DB_STORE_NAMED)
3134 btreeFileUnlock(p);
3135 #endif
3136 return dberr2sqlite(ret, p);
3137 }
3138 pSavepointTxn = pMainTxn;
3139 }
3140
3141 if (!pReadTxn) {
3142 if (p->txn_bulk)
3143 pReadTxn = pMainTxn;
3144 else if ((ret = pDbEnv->txn_begin(pDbEnv, pMainTxn,
3145 &pReadTxn, pBt->read_txn_flags)) != 0)
3146 return dberr2sqlite(ret, p);
3147 }
3148
3149 while (p->nSavepoint <= iStatement && !p->txn_bulk) {
3150 if ((ret = pDbEnv->txn_begin(pDbEnv, pSavepointTxn,
3151 &pSavepointTxn, 0)) != 0)
3152 return dberr2sqlite(ret, p);
3153 p->nSavepoint++;
3154 }
3155 }
3156 return SQLITE_OK;
3157 }
3158
/*
 * Key comparison shared by the btreeCompareKeyInfo and btreeCompareShared
 * comparators.
 *
 * If either DBT carries an already-unpacked record in app_data, the other
 * DBT's raw bytes are compared against it; the result is negated when the
 * unpacked record came from dbt1, because the arguments are then passed to
 * sqlite3VdbeRecordCompare in swapped order.
 *
 * Otherwise dbt2 is unpacked into a temporary record using keyInfo.  When
 * keyInfo is NULL (multi-threaded builds only) the KeyInfo is borrowed from
 * a cursor of the current thread that is open on the same table.
 *
 * Returns the comparison result; 0 is also returned when both DBTs share
 * the same data pointer or when the record cannot be unpacked.
 */
static int btreeCompare(
    DB *dbp,
    const DBT *dbt1,
    const DBT *dbt2,
    struct KeyInfo *keyInfo)
{
	int res;

	log_msg(LOG_VERBOSE, "btreeCompare(%p, %p, %p)", dbp, dbt1, dbt2);

	if (dbt1->app_data != NULL)
		/* Use the unpacked key from dbt1 */
		res = -sqlite3VdbeRecordCompare(dbt2->size, dbt2->data,
		    dbt1->app_data);
	else if (dbt2->app_data != NULL)
		/* Use the unpacked key from dbt2 */
		res = sqlite3VdbeRecordCompare(dbt1->size, dbt1->data,
		    dbt2->app_data);
	else {
		/*
		 * We don't have an unpacked key cached, generate one.
		 *
		 * This code should only execute if we are inside
		 * DB->sort_multiple, or some uncommon paths inside Berkeley
		 * DB, such as deferred delete of an item in a Btree.
		 */
		BtShared *pBt = NULL;
		UnpackedRecord *p;
		/* Scratch space handed to sqlite3VdbeRecordUnpack. */
		char aSpace[40 * sizeof(void *)];
		/* Set once pBt->mutex has been entered below. */
		int locked = 0;

		/* This case can happen when searching temporary tables. */
		if (dbt1->data == dbt2->data)
			return 0;

#ifndef BDBSQL_SINGLE_THREAD
		if (keyInfo == NULL) {
			/* Find a cursor for this table, and use its keyInfo. */
			TableInfo *tableInfo = dbp->app_private;
			BtCursor *pCur = NULL;
			int iTable = tableInfo->iTable;

			pBt = tableInfo->pBt;

			/*
			 * We can end up in here while closing a cursor, but we
			 * take care not to be holding the BtShared mutex.
			 * Keep the mutex until we are done so that some other
			 * thread can't free the keyInfo from under us.
			 */
			if (!pBt->resultsBuffer) {
				sqlite3_mutex_enter(pBt->mutex);
				locked = 1;
			}

			/*
			 * Only a cursor on the same table that belongs to the
			 * calling thread is acceptable.
			 */
			for (pCur = pBt->first_cursor;
			    pCur != NULL;
			    pCur = pCur->next)
				if (pCur->tableIndex == iTable &&
				    isCurrentThread(pCur->threadID))
					break;

			/*
			 * NOTE(review): when assertions are compiled out, a
			 * missing cursor is dereferenced on the next line.
			 */
			assert(pCur);
			keyInfo = pCur->keyInfo;
		}
#endif

		p = sqlite3VdbeRecordUnpack(keyInfo, dbt2->size, dbt2->data,
		    aSpace, sizeof(aSpace));

		/*
		 * XXX If we are out of memory, the call to unpack the record
		 * may have returned NULL.  The out-of-memory error has been
		 * noted and will be handled by the VM, but we really want to
		 * return that error to Berkeley DB.  There is no way to do
		 * that through the callback, so return zero.
		 *
		 * We choose zero because it makes loops terminate (e.g., if
		 * we're called as part of a sort).
		 */
		res = (p == NULL) ? 0 :
		    sqlite3VdbeRecordCompare(dbt1->size, dbt1->data, p);
		if (p != NULL)
			sqlite3VdbeDeleteUnpackedRecord(p);

		/* Release the mutex only if this call took it. */
		if (locked)
			sqlite3_mutex_leave(pBt->mutex);
	}
	return res;
}
3249
/*
 * Comparator for handles whose app_private slot holds the KeyInfo itself:
 * fetch it and defer to btreeCompare.
 */
static int btreeCompareKeyInfo(DB *dbp, const DBT *dbt1, const DBT *dbt2)
{
	struct KeyInfo *keyInfo;

	assert(dbp->app_private != NULL);
	keyInfo = (struct KeyInfo *)dbp->app_private;

	return btreeCompare(dbp, dbt1, dbt2, keyInfo);
}
3256
3257 #ifndef BDBSQL_SINGLE_THREAD
/*
 * Comparator for handles whose app_private slot holds a TableInfo.
 *
 * In some cases (e.g., vacuum), a KeyInfo may have been stashed inside the
 * TableInfo, because the comparator cannot be switched to
 * btreeCompareKeyInfo on an open DB handle.  Whatever is stored there
 * (possibly NULL) is passed on to btreeCompare, which searches for a
 * KeyInfo itself when given NULL.
 */
static int btreeCompareShared(DB *dbp, const DBT *dbt1, const DBT *dbt2)
{
	TableInfo *tableInfo;

	tableInfo = (TableInfo *)dbp->app_private;

	return btreeCompare(dbp, dbt1, dbt2, tableInfo->pKeyInfo);
}
3269 #endif
3270
3271 /*
3272 * Configures a Berkeley DB database handle prior to calling open.
3273 */
btreeConfigureDbHandle(Btree * p,int iTable,DB ** dbpp)3274 static int btreeConfigureDbHandle(Btree *p, int iTable, DB **dbpp)
3275 {
3276 BtShared *pBt;
3277 DB *dbp;
3278 DB_MPOOLFILE *pMpf;
3279 int ret;
3280 u_int32_t flags;
3281 #ifndef BDBSQL_SINGLE_THREAD
3282 TableInfo *tableInfo;
3283
3284 tableInfo = NULL;
3285 #endif
3286
3287 pBt = p->pBt;
3288 /* Odd-numbered tables have integer keys. */
3289 flags = (iTable & 1) ? BTREE_INTKEY : 0;
3290
3291 if ((ret = db_create(&dbp, pDbEnv, 0)) != 0)
3292 goto err;
3293 if ((flags & BTREE_INTKEY) == 0) {
3294 #ifdef BDBSQL_SINGLE_THREAD
3295 dbp->set_bt_compare(dbp, btreeCompareKeyInfo);
3296 #else
3297 if ((tableInfo = sqlite3_malloc(sizeof(TableInfo))) == NULL) {
3298 ret = ENOMEM;
3299 goto err;
3300 }
3301 tableInfo->pBt = pBt;
3302 tableInfo->pKeyInfo = NULL;
3303 tableInfo->iTable = iTable;
3304 dbp->app_private = tableInfo;
3305 dbp->set_bt_compare(dbp, btreeCompareShared);
3306 #endif
3307 } else
3308 dbp->set_bt_compare(dbp, btreeCompareIntKey);
3309
3310 if (pBt->pageSize != 0 &&
3311 (ret = dbp->set_pagesize(dbp, pBt->pageSize)) != 0)
3312 goto err;
3313 if (pBt->dbStorage == DB_STORE_INMEM) {
3314 /* Make sure the cache does not overflow to disk. */
3315 pMpf = dbp->get_mpf(dbp);
3316 pMpf->set_flags(pMpf, DB_MPOOL_NOFILE, 1);
3317 }
3318 if (!GET_DURABLE(pBt) &&
3319 (ret = dbp->set_flags(dbp, DB_TXN_NOT_DURABLE)) != 0)
3320 goto err;
3321 if (pBt->encrypted && (ret = dbp->set_flags(dbp, DB_ENCRYPT)) != 0)
3322 goto err;
3323 err: if (ret != 0) {
3324 #ifndef BDBSQL_SINGLE_THREAD
3325 if (tableInfo != NULL)
3326 sqlite3_free(tableInfo);
3327 #endif
3328 if (dbp != NULL)
3329 (void)closeDB(p, dbp, DB_NOSYNC);
3330 *dbpp = NULL;
3331 } else {
3332 *dbpp = dbp;
3333 }
3334 return (ret);
3335 }
3336
/*
 * Look up the cached handle for a table, opening (and caching) it when it
 * is not available yet.
 *
 * Parameters:
 *   p          - the btree.
 *   piTable    - in: table number to look up; with BTREE_CREATE it is the
 *                base from which a new number is derived (pBt->last_table
 *                is used instead unless storage is DB_STORE_NAMED).
 *                out: the table number actually used (success only).
 *   ppCachedDb - out: the cache entry for the table (success only).
 *   flags      - BTREE_CREATE selects a new table number; BTREE_INTKEY
 *                forces that number to be odd, otherwise it is made even.
 *
 * With BTREE_CREATE, a cache entry already present under the new number is
 * discarded first (its handle closed, the entry removed and freed).
 *
 * pBt->mutex is taken on entry and is released on every return path.  The
 * handle itself is opened without the mutex held, so the cache is searched
 * again afterwards: if another entry appeared in the meantime it wins, and
 * the handle opened here is either donated to it (when it has none) or
 * closed.
 *
 * Returns SQLITE_OK or an SQLite error code.
 */
int btreeFindOrCreateDataTable(
    Btree *p,			/* The btree */
    int *piTable,		/* Root page of table to create */
    CACHED_DB **ppCachedDb,
    int flags)
{
	BtShared *pBt;
	CACHED_DB *cached_db, *create_db;
	DB *dbp;
	char cached_db_key[CACHE_KEY_SIZE];
	int iTable, rc, ret;

	pBt = p->pBt;
	rc = SQLITE_OK;
	ret = 0;
	create_db = NULL;

	iTable = *piTable;
	sqlite3_mutex_enter(pBt->mutex);

	if (flags & BTREE_CREATE) {
		if (pBt->dbStorage != DB_STORE_NAMED)
			iTable = pBt->last_table;

		iTable++;

		/* Make sure (iTable & 1) iff BTREE_INTKEY is set */
		if ((flags & BTREE_INTKEY) != 0) {
			if ((iTable & 1) == 0)
				iTable += 1;
		} else if ((iTable & 1) == 1)
			iTable += 1;
		pBt->last_table = iTable;
	}

	sqlite3_snprintf(sizeof(cached_db_key), cached_db_key, "%x", iTable);
	cached_db = sqlite3HashFind(&pBt->db_cache,
	    cached_db_key, (int)strlen(cached_db_key));
	if ((flags & BTREE_CREATE) && cached_db != NULL) {
		/*
		 * If the table already exists in the cache, it's a
		 * hang-over from a table that was deleted in another
		 * process.  Close the handle now.
		 */
		if ((dbp = cached_db->dbp) != NULL) {
#ifndef BDBSQL_SINGLE_THREAD
			if (dbp->app_private != NULL)
				sqlite3_free(dbp->app_private);
#endif
			ret = closeDB(p, dbp, DB_NOSYNC);
			cached_db->dbp = NULL;
			if (ret != 0) {
				/*
				 * The error exit does not release the mutex,
				 * so it must be dropped before jumping.
				 */
				sqlite3_mutex_leave(pBt->mutex);
				goto err;
			}
		}
		sqlite3HashInsert(&pBt->db_cache,
		    cached_db_key, (int)strlen(cached_db_key), NULL);
		sqlite3_free(cached_db);
		cached_db = NULL;
	}
	if (cached_db == NULL || cached_db->dbp == NULL) {
		/* Open the handle without holding the mutex. */
		sqlite3_mutex_leave(pBt->mutex);
		if ((create_db = (CACHED_DB *)sqlite3_malloc(
		    sizeof(CACHED_DB))) == NULL)
		{
			ret = ENOMEM;
			goto err;
		}
		memset(create_db, 0, sizeof(CACHED_DB));
		rc = btreeCreateDataTable(p, iTable, &create_db);
		if (rc != SQLITE_OK)
			goto err;
		sqlite3_mutex_enter(pBt->mutex);
		cached_db = sqlite3HashFind(&pBt->db_cache,
		    cached_db_key, (int)strlen(cached_db_key));
		/* if its not there, then insert it. */
		if (cached_db == NULL) {
			/*
			 * create_db now carries an open handle, so this call
			 * only inserts it into the cache.  If the insert
			 * fails the callee has already freed create_db,
			 * which is why the pointer is dropped either way.
			 */
			rc = btreeCreateDataTable(p, iTable, &create_db);
			sqlite3_mutex_leave(pBt->mutex);
			cached_db = create_db;
			create_db = NULL;
		} else {
			if (cached_db->dbp == NULL) {
				cached_db->dbp = create_db->dbp;
				create_db->dbp = NULL;
			}
			sqlite3_mutex_leave(pBt->mutex);
			/*
			 * The existing entry already had a handle: ours is
			 * redundant.  Release its TableInfo along with it,
			 * as the other close sites in this file do.
			 */
			if ((dbp = create_db->dbp) != NULL) {
#ifndef BDBSQL_SINGLE_THREAD
				if (dbp->app_private != NULL)
					sqlite3_free(dbp->app_private);
#endif
				ret = dbp->close(dbp, DB_NOSYNC);
				create_db->dbp = NULL;
			}
			if (ret != 0)
				goto err;
		}
		if (rc != SQLITE_OK)
			goto err;
	} else
		sqlite3_mutex_leave(pBt->mutex);

	*ppCachedDb = cached_db;
	*piTable = iTable;
err:
	if (ret != 0)
		rc = dberr2sqlite(ret, p);
	if (create_db != NULL)
		sqlite3_free(create_db);
	return (rc);
}
3444
3445 /*
3446 * A utility function to create the table containing the actual data.
3447 * There are 3 modes:
3448 * 1) *ppCacheDb == NULL -> create/open the db and put it in the cache.
3449 * 2) *ppCacheDb != NULL && (*ppCacheDb)->dbp == NULL ->
3450 * create/open the db but don't cache.
3451 * 3) *ppCacheDb != NULL && (*ppCacheDb)->dbp != NULL ->
3452 * Put the db in the cache.
3453 */
btreeCreateDataTable(Btree * p,int iTable,CACHED_DB ** ppCachedDb)3454 static int btreeCreateDataTable(
3455 Btree *p, /* The btree */
3456 int iTable, /* Root page of table to create */
3457 CACHED_DB **ppCachedDb)
3458 {
3459 BtShared *pBt;
3460 CACHED_DB *cached_db, *stale_db;
3461 DB *dbp;
3462 #ifdef BDBSQL_FILE_PER_TABLE
3463 DBT d, k;
3464 #endif
3465 char *fileName, *tableName, tableNameBuf[DBNAME_SIZE];
3466 int ret, t_ret;
3467
3468 log_msg(LOG_VERBOSE, "sqlite3BtreeCreateDataTable(%p, %u, %p)",
3469 p, iTable, ppCachedDb);
3470
3471 pBt = p->pBt;
3472 assert(!pBt->resultsBuffer);
3473
3474 dbp = NULL;
3475 assert(ppCachedDb != NULL);
3476 cached_db = *ppCachedDb;
3477
3478 tableName = tableNameBuf;
3479 GET_TABLENAME(tableName, sizeof(tableNameBuf), iTable, "");
3480 log_msg(LOG_VERBOSE,
3481 "sqlite3BtreeCursor creating the actual DB: file name:"
3482 "%s, table name: %s type: %u.",
3483 pBt->full_name, tableName, pBt->dbStorage);
3484
3485 FIX_TABLENAME(pBt, fileName, tableName);
3486 if (cached_db != NULL && cached_db->dbp != NULL) {
3487 dbp = cached_db->dbp;
3488 cached_db->dbp = NULL;
3489 goto insert_db;
3490 }
3491
3492 /*
3493 * First try without DB_CREATE, in auto-commit mode, so the
3494 * handle can be safely shared in the cache. If we are really
3495 * creating the table, we should be holding the schema lock,
3496 * which will protect the handle in cache until we are done.
3497 */
3498 if ((ret = btreeConfigureDbHandle(p, iTable, &dbp)) != 0)
3499 goto err;
3500 ret = ENOENT;
3501 if (pBt->dbStorage == DB_STORE_NAMED &&
3502 (pBt->db_oflags & DB_CREATE) != 0) {
3503 ret = dbp->open(dbp, pFamilyTxn, fileName, tableName, DB_BTREE,
3504 (pBt->db_oflags & ~DB_CREATE) | GET_ENV_READONLY(pBt) |
3505 GET_AUTO_COMMIT(pBt, pFamilyTxn), 0);
3506 /* Close and re-configure handle. */
3507 if (ret == ENOENT) {
3508 #ifndef BDBSQL_SINGLE_THREAD
3509 if (dbp->app_private != NULL)
3510 sqlite3_free(dbp->app_private);
3511 #endif
3512 if ((t_ret = dbp->close(dbp, DB_NOSYNC)) != 0) {
3513 ret = t_ret;
3514 goto err;
3515 }
3516 if ((t_ret =
3517 btreeConfigureDbHandle(p, iTable, &dbp)) != 0) {
3518 ret = t_ret;
3519 goto err;
3520 }
3521 }
3522 }
3523 if (ret == ENOENT) {
3524 /*
3525 * Indices in files should be configured with DB_DUPSORT.
3526 * Only do this once we are sure we are creating the database
3527 * so that we can open v5.0 database files without error.
3528 */
3529 if (pBt->dbStorage == DB_STORE_NAMED && (iTable & 1) == 0)
3530 dbp->set_flags(dbp, DB_DUPSORT);
3531
3532 ret = dbp->open(dbp, pSavepointTxn, fileName, tableName,
3533 DB_BTREE, pBt->db_oflags | GET_ENV_READONLY(pBt) |
3534 GET_AUTO_COMMIT(pBt, pSavepointTxn), 0);
3535 #ifdef BDBSQL_FILE_PER_TABLE
3536 if (ret == 0 && pBt->dbStorage == DB_STORE_NAMED) {
3537 memset(&k, 0, sizeof(k));
3538 memset(&d, 0, sizeof(d));
3539 k.data = fileName;
3540 k.size = strlen(fileName);
3541 if ((t_ret = pTablesDb->put(
3542 pTablesDb, pSavepointTxn, &k, &d, 0)) != 0)
3543 ret = t_ret;
3544 }
3545 #endif
3546 }
3547 if (ret != 0)
3548 goto err;
3549
3550 if (cached_db == NULL) {
3551 if ((cached_db = (CACHED_DB *)sqlite3_malloc(
3552 sizeof(CACHED_DB))) == NULL)
3553 {
3554 ret = ENOMEM;
3555 goto err;
3556 }
3557 memset(cached_db, 0, sizeof(CACHED_DB));
3558 insert_db:
3559 sqlite3_snprintf(sizeof(cached_db->key),
3560 cached_db->key, "%x", iTable);
3561
3562 assert(sqlite3_mutex_held(pBt->mutex));
3563 stale_db = sqlite3HashInsert(&pBt->db_cache, cached_db->key,
3564 (int)strlen(cached_db->key), cached_db);
3565 if (stale_db) {
3566 sqlite3_free(stale_db);
3567 /*
3568 * Hash table out of memory when returned pointer is
3569 * same as the original value pointer.
3570 */
3571 if (stale_db == cached_db) {
3572 ret = ENOMEM;
3573 goto err;
3574 }
3575 }
3576 }
3577
3578 assert(cached_db->dbp == NULL);
3579 cached_db->dbp = dbp;
3580 cached_db->created = 1;
3581 *ppCachedDb = cached_db;
3582 return SQLITE_OK;
3583
3584 err: if (dbp != NULL) {
3585 #ifndef BDBSQL_SINGLE_THREAD
3586 if (dbp->app_private != NULL)
3587 sqlite3_free(dbp->app_private);
3588 #endif
3589 (void)dbp->close(dbp, DB_NOSYNC);
3590 dbp = NULL;
3591 }
3592 return (ret == 0) ? SQLITE_OK : dberr2sqlite(ret, p);
3593 }
3594
3595 /*
3596 * Only persisent uncollated indexes use the 1 key, duplicate
3597 * data structure, because the space saving is not worth the
3598 * overhead in temperary indexes, and collated (other than binary
3599 * collation) indexes lose data because different values can be
3600 * stored under the same key if the collation reads them as
3601 * identical.
3602 */
isDupIndex(int flags,int storage,KeyInfo * keyInfo,DB * db)3603 int isDupIndex(int flags, int storage, KeyInfo *keyInfo, DB *db)
3604 {
3605 return (!(flags & BTREE_INTKEY) && (storage == DB_STORE_NAMED) &&
3606 !indexIsCollated(keyInfo) && supportsDuplicates(db));
3607 }
3608
/*
** Create a new cursor for the BTree whose root is on the page iTable. The act
** of acquiring a cursor gets a read lock on the database file.
**
** If wrFlag==0, then the cursor can only be used for reading.
** If wrFlag==1, then the cursor can be used for reading or for writing if
** other conditions for writing are also met.  These are the conditions that
** must be met in order for writing to be allowed:
**
** 1:  The cursor must have been opened with wrFlag==1
**
** 2:  No other cursors may be open with wrFlag==0 on the same table
**
** 3:  The database must be writable (not on read-only media)
**
** 4:  There must be an active transaction.
**
** Condition 2 warrants further discussion.  If any cursor is opened on a table
** with wrFlag==0, that prevents all other cursors from writing to that table.
** This is a kind of "read-lock".  When a cursor is opened with wrFlag==0
** it is guaranteed that the table will not change as long as the cursor
** is open.  This allows the cursor to do a sequential scan of the table
** without having to worry about entries being inserted or deleted during the
** scan.  Cursors should be opened with wrFlag==0 only if this read-lock
** property is needed.  That is to say, cursors should be opened with
** wrFlag==0 only if they intend to use sqlite3BtreeNext() system call.
** All other cursors should be opened with wrFlag==1 even if they never really
** intend to write.
**
** No checking is done to make sure that page iTable really is the root page
** of a b-tree.  If it is not, then the cursor acquired will not work
** correctly.
**
** The comparison function must be logically the same for every cursor on a
** particular table.  Changing the comparison function will result in
** incorrect operations.  If the comparison function is NULL, a default
** comparison function is used.  The comparison function is always ignored
** for INTKEY tables.
**
** Return values in this implementation: SQLITE_EMPTY when a read cursor is
** requested on a non-temporary database whose environment has not been
** opened, SQLITE_READONLY when a write cursor is requested on a read-only
** btree, and SQLITE_OK otherwise.  Note that failures which reach the
** "err" label also return SQLITE_OK: the error code is stored in
** pCur->error and the cursor is left in the CURSOR_FAULT state.
*/
int sqlite3BtreeCursor(
    Btree *p,				/* The btree */
    int iTable,				/* Root page of table to open */
    int wrFlag,				/* 1 to write. 0 read-only */
    struct KeyInfo *keyInfo,		/* First argument to compare function */
    BtCursor *pCur)			/* Write new cursor here */
{
	BtShared *pBt;
	CACHED_DB *cached_db;
	int rc, ret;

	log_msg(LOG_VERBOSE, "sqlite3BtreeCursor(%p, %u, %u, %p, %p)",
	    p, iTable, wrFlag, keyInfo, pCur);

	pBt = p->pBt;
	rc = SQLITE_OK;
	ret = 0;
	cached_db = NULL;
	/* The error path frees threadID, so give it a defined value first. */
	pCur->threadID = NULL;

	if (!p->connected) {
		if ((rc = btreeUpdateBtShared(p, 1)) != SQLITE_OK)
			goto err;
		/* Re-read p->pBt after the update call. */
		pBt = p->pBt;
		/*
		 * If the table is temporary, vdbe expects the table to be
		 * created automatically when the first cursor is opened.
		 * Otherwise, if the database does not exist yet, the caller
		 * expects a SQLITE_EMPTY return, vdbe will then call
		 * sqlite3BtreeCreateTable directly.
		 * If the code created the temporary environment the first time
		 * sqlite3BtreeOpen is called, it would not be possible to
		 * honor cache size setting pragmas.
		 */
		if (pBt->need_open &&
		    (rc = btreeOpenEnvironment(p, 1)) != SQLITE_OK)
			goto err;
		else if (pBt->dbStorage == DB_STORE_NAMED && !pBt->env_opened &&
		    !__os_exists(NULL, pBt->full_name, 0)) {
			/*
			 * The file didn't exist when sqlite3BtreeOpen was
			 * called, but has since been created.  Open the
			 * existing database now.
			 * Don't fold the open into the if clause, since this
			 * situation can match following statements as well.
			 */
			if ((rc = btreeOpenEnvironment(p, 1)) != SQLITE_OK)
				goto err;
		} else if (pBt->dbStorage != DB_STORE_TMP &&
		    !wrFlag && !pBt->env_opened)
			return SQLITE_EMPTY;
		else if (!pBt->resultsBuffer &&
		    (rc = btreeOpenEnvironment(p, 1)) != SQLITE_OK)
			goto err;
	}

	if (wrFlag && IS_BTREE_READONLY(p))
		return SQLITE_READONLY;

	assert(p->connected || pBt->resultsBuffer);
	assert(!pBt->transactional || p->inTrans != TRANS_NONE);

	/*
	 * A NULL thread ID is treated as an error only when the connection
	 * also reports a failed allocation.
	 */
	pCur->threadID = getThreadID(p->db);
	if (pCur->threadID == NULL && p->db->mallocFailed) {
		rc = SQLITE_NOMEM;
		goto err;
	}

	pCur->pBtree = p;
	pCur->tableIndex = iTable;

	/* SQLite should guarantee that an appropriate transaction is active. */
	assert(!pBt->transactional || pMainTxn != NULL);
	assert(!pBt->transactional || !wrFlag || pSavepointTxn != NULL);

	/*
	 * Always use the savepoint transaction for write cursors, or the
	 * top-level cursor for read-only cursors (to avoid tripping and
	 * re-opening the read cursor for updates within a select).
	 */
	pCur->txn = wrFlag ? pSavepointTxn : pReadTxn;

	/* With a results buffer no database handle or DB cursor is opened. */
	if (pBt->resultsBuffer)
		goto setup_cursor;

	/* Retrieve the matching handle from the cache. */
	rc = btreeFindOrCreateDataTable(p, &iTable, &cached_db, 0);
	if (rc != SQLITE_OK)
		goto err;
	assert(cached_db != NULL && cached_db->dbp != NULL);

	pCur->cached_db = cached_db;

	/* DB_READ_COMMITTED is masked out of the cursor's isolation flags. */
	ret = pBDb->cursor(pBDb, pCur->txn, &pDbc,
	    GET_BTREE_ISOLATION(p) & ~DB_READ_COMMITTED);
	if (ret != 0) {
		rc = dberr2sqlite(ret, p);
		goto err;
	}

	if (!wrFlag) {
		/*
		 * The sqlite btree API doesn't care about the position of
		 * cursors on error.  Setting this flag avoids cursor
		 * duplication inside Berkeley DB.  We can only do it for
		 * read-only cursors, however: deletes don't complete until the
		 * cursor is closed.
		 */
		pDbc->flags |= DBC_TRANSIENT;
	}

setup_cursor:
	/* Odd-numbered tables have integer keys. */
	pCur->flags = (iTable & 1) ? BTREE_INTKEY : 0;
	pCur->keyInfo = keyInfo;
	pCur->skipMulti = 1;
	pCur->multiData.data = NULL;
	pCur->wrFlag = wrFlag;
	pCur->eState = CURSOR_INVALID;
	pCur->lastRes = 0;
	/* isDupIndex is only computed when a database handle is attached. */
	if (pCur->cached_db)
		pCur->isDupIndex = isDupIndex(pCur->flags,
		    pCur->pBtree->pBt->dbStorage, pCur->keyInfo,
		    pCur->cached_db->dbp);

#ifdef BDBSQL_SINGLE_THREAD
	/* Single-threaded builds keep the KeyInfo directly on the handle. */
	if (cached_db != NULL)
		pBDb->app_private = keyInfo;
#endif

	/* Push the new cursor onto the front of the shared cursor list. */
	sqlite3_mutex_enter(pBt->mutex);
	assert(pCur != pBt->first_cursor);
	pCur->next = pBt->first_cursor;
	pBt->first_cursor = pCur;
	sqlite3_mutex_leave(pBt->mutex);
	return SQLITE_OK;

	/*
	 * Error exit: release what was acquired, then record the failure on
	 * the cursor itself.  SQLITE_OK is still returned to the caller.
	 */
err:	if (pDbc != NULL) {
		(void)pDbc->close(pDbc);
		pDbc = NULL;
	}
	if (pCur->threadID != NULL) {
		sqlite3DbFree(p->db, pCur->threadID);
		pCur->threadID = NULL;
	}
	pCur->eState = CURSOR_FAULT;
	pCur->error = rc;
	return SQLITE_OK;
}
3796
3797 /*
3798 ** Return the size of a BtCursor object in bytes.
3799 **
3800 ** This interfaces is needed so that users of cursors can preallocate
3801 ** sufficient storage to hold a cursor. The BtCursor object is opaque
3802 ** to users so they cannot do the sizeof() themselves - they must call
3803 ** this routine.
3804 */
sqlite3BtreeCursorSize(void)3805 int sqlite3BtreeCursorSize(void)
3806 {
3807 return (sizeof(BtCursor));
3808 }
3809
3810 /*
3811 ** Initialize memory that will be converted into a BtCursor object.
3812 **
3813 ** The simple approach here would be to memset() the entire object
3814 ** to zero. But if there are large parts that can be skipped, do
3815 ** that here to save time.
3816 */
sqlite3BtreeCursorZero(BtCursor * pCur)3817 void sqlite3BtreeCursorZero(BtCursor *pCur)
3818 {
3819 memset(pCur, 0, sizeof(BtCursor));
3820 pCur->index.data = pCur->indexKeyBuf;
3821 pCur->index.ulen = CURSOR_BUFSIZE;
3822 pCur->index.flags = DB_DBT_USERMEM;
3823 }
3824
/*
 * Close a cursor and release everything it owns.
 *
 * Parameters:
 *   pCur       - the cursor; pCur->pBtree must be non-NULL (asserted).
 *   listRemove - when non-zero the cursor is also unlinked from the
 *                BtShared cursor list.
 *
 * Steps, in order: mark the cursor CURSOR_FAULT/SQLITE_ABORT under the
 * BtShared mutex, close the Berkeley DB cursor, optionally unlink the
 * cursor from the list, free the key/multiData/index buffers it allocated,
 * commit the dedicated transaction of an incrblob write cursor, and free
 * the thread ID.  pCur->pBtree is cleared last.
 *
 * Returns the first Berkeley DB error seen (cursor close, then transaction
 * commit) converted by dberr2sqlite().
 */
static int btreeCloseCursor(BtCursor *pCur, int listRemove)
{
	BtCursor *c, *prev;
	Btree *p;
	BtShared *pBt;
	int ret, t_ret;

	assert(pCur->pBtree != NULL);
	p = pCur->pBtree;
	pBt = p->pBt;
	ret = 0;

	/*
	 * Change the cursor's state to invalid before closing it, and do
	 * so holding the BtShared mutex, so that no other thread will attempt
	 * to access this cursor while it is being closed.
	 */
	sqlite3_mutex_enter(pBt->mutex);
	pCur->eState = CURSOR_FAULT;
	pCur->error = SQLITE_ABORT;
	sqlite3_mutex_leave(pBt->mutex);

	/*
	 * Warning: it is important that we call DBC->close while the cursor
	 * is still on the list.  It is possible that closing a cursor will
	 * result in the comparison callback being called, which in turn
	 * may go looking on the list for a matching cursor, in order to find
	 * a KeyInfo pointer it can use.
	 */
	if (pDbc) {
		ret = pDbc->close(pDbc);
		pDbc = NULL;
	}

	if (listRemove) {
		sqlite3_mutex_enter(pBt->mutex);
		for (prev = NULL, c = pBt->first_cursor; c != NULL;
		    prev = c, c = c->next)
			if (c == pCur) {
				if (prev == NULL)
					pBt->first_cursor = c->next;
				else
					prev->next = c->next;
				break;
			}
		sqlite3_mutex_leave(pBt->mutex);
	}

	if ((pCur->key.flags & DB_DBT_APPMALLOC) != 0) {
		sqlite3_free(pCur->key.data);
		pCur->key.data = NULL;
		pCur->key.flags &= ~DB_DBT_APPMALLOC;
	}
	if (pCur->multiData.data != NULL) {
		sqlite3_free(pCur->multiData.data);
		pCur->multiData.data = NULL;
	}
	/* The index buffer is heap memory only if it was moved off indexKeyBuf. */
	if (pCur->index.data != pCur->indexKeyBuf) {
		sqlite3_free(pCur->index.data);
		pCur->index.data = NULL;
	}

	/* Incrblob write cursors have their own dedicated transactions. */
	if (pCur->isIncrblobHandle && pCur->txn && pCur->wrFlag &&
	    pSavepointTxn != NULL && pCur->txn != pSavepointTxn) {
		t_ret = pCur->txn->commit(pCur->txn, DB_TXN_NOSYNC);
		/* Don't let the commit result hide a cursor-close failure. */
		if (t_ret != 0 && ret == 0)
			ret = t_ret;
		pCur->txn = 0;
	}

	sqlite3DbFree(p->db, pCur->threadID);
	pCur->threadID = NULL;

	ret = dberr2sqlite(ret, p);
	pCur->pBtree = NULL;
	return ret;
}
3900
3901 /*
3902 ** Close a cursor.
3903 */
sqlite3BtreeCloseCursor(BtCursor * pCur)3904 int sqlite3BtreeCloseCursor(BtCursor *pCur)
3905 {
3906 log_msg(LOG_VERBOSE, "sqlite3BtreeCloseCursor(%p)", pCur);
3907
3908 if (!pCur || !pCur->pBtree)
3909 return SQLITE_OK;
3910
3911 return btreeCloseCursor(pCur, 1);
3912 }
3913
/*
 * Report whether any field of the index uses a collation other than
 * binary.
 *
 * Returns 1 as soon as a field with a non-NULL collating sequence whose
 * type is not SQLITE_COLL_BINARY is found; returns 0 when there is no such
 * field or when keyInfo is NULL.
 */
int indexIsCollated(KeyInfo *keyInfo)
{
	u32 i;

	if (keyInfo == NULL)
		return 0;

	for (i = 0; i < keyInfo->nField; i++) {
		if (keyInfo->aColl[i] == NULL)
			continue;
		if (keyInfo->aColl[i]->type != SQLITE_COLL_BINARY)
			return 1;
	}
	return 0;
}
3928
3929 /* Indexes created before 5.1 do not support duplicates.*/
supportsDuplicates(DB * db)3930 int supportsDuplicates(DB *db)
3931 {
3932 u_int32_t val;
3933 db->get_flags(db, &val);
3934 return (val & DB_DUPSORT);
3935 }
3936
/* Store the rowid in the index as data
 * instead of as part of the key, so rows
 * that have the same indexed value have only one
 * key in the index.
 * The original index key looks like:
 * hdrSize_column1Size_columnNSize_rowIdSize_column1Data_columnNData_rowid
 * The new index key looks like:
 * hdrSize_column1Size_columnNSize_column1Data_columnNData
 * With a data section that looks like:
 * rowIdSize_rowid
 *
 * The split is done in place inside pCur->key.data: on return
 * pCur->key.size covers only the shortened key and pCur->data points at the
 * bytes that immediately follow it in the same buffer.  Always returns 0.
 *
 * NOTE(review): the index arithmetic appears to rely on the header size
 * and the rowid type each being stored as a one-byte varint -- confirm.
 */
int splitIndexKey(BtCursor *pCur)
{
	u32 hdrSize, rowidType;
	unsigned char *aKey = (unsigned char *)pCur->key.data;
	assert(pCur->isDupIndex);
	/* The record starts with its header size. */
	getVarint32(aKey, hdrSize);
	/* The rowid's type entry is read from the last byte of the header. */
	getVarint32(&aKey[hdrSize-1], rowidType);
	/* Data section = one type byte + the serialized rowid. */
	pCur->data.size = sqlite3VdbeSerialTypeLen(rowidType) + 1;
	pCur->key.size = pCur->key.size - pCur->data.size;
	/*
	 * Drop the rowid type entry from the header by shifting the bytes
	 * that follow it down by one position.
	 */
	memmove(&aKey[hdrSize-1], &aKey[hdrSize], pCur->key.size-(hdrSize-1));
	/* Write the rowid type immediately after the shortened key. */
	putVarint32(&aKey[pCur->key.size], rowidType);
	/* The header is now one byte shorter. */
	putVarint32(aKey, hdrSize-1);
	pCur->data.data = &aKey[pCur->key.size];
	return 0;
}
3963
3964 /* Move the cursor so that it points to an entry near pUnKey/nKey.
3965 ** Return a success code.
3966 **
3967 ** For INTKEY tables, only the nKey parameter is used. pUnKey is ignored. For
3968 ** other tables, nKey is the number of bytes of data in nKey. The comparison
3969 ** function specified when the cursor was created is used to compare keys.
3970 **
3971 ** If an exact match is not found, then the cursor is always left pointing at
3972 ** a leaf page which would hold the entry if it were present. The cursor
3973 ** might point to an entry that comes before or after the key.
3974 **
** The result of comparing the key with the entry to which the cursor is
** left pointing is written to *pRes if pRes!=NULL.  The meaning of this
** value is as follows:
3977 **
3978 ** *pRes<0 The cursor is left pointing at an entry that is smaller
3979 ** than pUnKey or if the table is empty and the cursor is
3980 ** therefore left point to nothing.
3981 **
3982 ** *pRes==0 The cursor is left pointing at an entry that exactly
3983 ** matches pUnKey.
3984 **
3985 ** *pRes>0 The cursor is left pointing at an entry that is larger
3986 ** than pUnKey.
3987 */
sqlite3BtreeMovetoUnpacked(BtCursor * pCur,UnpackedRecord * pUnKey,i64 nKey,int bias,int * pRes)3988 int sqlite3BtreeMovetoUnpacked(
3989 BtCursor *pCur, UnpackedRecord *pUnKey, i64 nKey, int bias, int *pRes)
3990 {
3991 int rc, res, ret;
3992 unsigned char buf[ROWIDMAXSIZE];
3993
3994 log_msg(LOG_VERBOSE, "sqlite3BtreeMovetoUnpacked(%p, %p, %u, %u, %p)",
3995 pCur, pUnKey, (int)nKey, bias, pRes);
3996
3997 res = -1;
3998 ret = DB_NOTFOUND;
3999
4000 /* Invalidate current cursor state. */
4001 if (pDbc == NULL &&
4002 (rc = btreeRestoreCursorPosition(pCur, 1)) != SQLITE_OK)
4003 return rc;
4004
4005 if (pCur->eState == CURSOR_VALID &&
4006 pIntKey && pCur->savedIntKey == nKey) {
4007 *pRes = 0;
4008 return SQLITE_OK;
4009 }
4010
4011 pCur->multiGetPtr = pCur->multiPutPtr = NULL;
4012 pCur->isFirst = 0;
4013 memset(&pCur->key, 0, sizeof(pCur->key));
4014 memset(&pCur->data, 0, sizeof(pCur->data));
4015 pCur->skipMulti = 1;
4016
4017 if (pIntKey) {
4018 pCur->key.size = sizeof(i64);
4019 pCur->nKey = nKey;
4020 pCur->key.data = &(pCur->nKey);
4021
4022 if (pCur->lastKey != 0 && nKey > pCur->lastKey) {
4023 pCur->eState = CURSOR_INVALID;
4024 ret = 0;
4025 goto done;
4026 }
4027 } else {
4028 assert(pUnKey != NULL);
4029 pCur->key.app_data = pUnKey;
4030 /*
4031 * If looking for an entry in an index with duplicates then the
4032 * rowid part of the key needs to be put in the data DBT.
4033 */
4034 if (pCur->isDupIndex &&
4035 (pUnKey->nField > pCur->keyInfo->nField)) {
4036 u8 serial_type;
4037 Mem *rowid = &pUnKey->aMem[pUnKey->nField - 1];
4038 int file_format =
4039 pCur->pBtree->db->pVdbe->minWriteFileFormat;
4040 serial_type = sqlite3VdbeSerialType(rowid, file_format);
4041 pCur->data.size =
4042 sqlite3VdbeSerialTypeLen(serial_type) + 1;
4043 assert(pCur->data.size < ROWIDMAXSIZE);
4044 pCur->data.data = &buf;
4045 putVarint32(buf, serial_type);
4046 sqlite3VdbeSerialPut(&buf[1], ROWIDMAXSIZE - 1,
4047 rowid, file_format);
4048 ret = pDbc->get(pDbc, &pCur->key, &pCur->data,
4049 DB_GET_BOTH_RANGE | RMW(pCur));
4050 /*
4051 * If not looking for a specific key in the index (just
4052 * looking at the value part of the key) then do a
4053 * bulk get since the search likely wants all
4054 * entries that have that value.
4055 */
4056 } else if (!pCur->isDupIndex ||
4057 (pUnKey->nField < pCur->keyInfo->nField))
4058 pCur->skipMulti = 0;
4059 }
4060
4061 if (ret == DB_NOTFOUND)
4062 ret = pDbc->get(pDbc, &pCur->key, &pCur->data,
4063 DB_SET_RANGE | RMW(pCur));
4064
4065 if (ret == DB_NOTFOUND) {
4066 ret = pDbc->get(pDbc,
4067 &pCur->key, &pCur->data, DB_LAST | RMW(pCur));
4068
4069 if (ret == 0 && pIntKey)
4070 memcpy(&(pCur->lastKey), pCur->key.data, sizeof(i64));
4071 }
4072
4073 if (ret == 0) {
4074 pCur->eState = CURSOR_VALID;
4075 /* Check whether we got an exact match. */
4076 if (pIntKey) {
4077 memcpy(&(pCur->savedIntKey), pCur->key.data,
4078 sizeof(i64));
4079 res = (pCur->savedIntKey == nKey) ?
4080 0 : (pCur->savedIntKey < nKey) ? -1 : 1;
4081 } else {
4082 DBT target, index;
4083 memset(&target, 0, sizeof(target));
4084 memset(&index, 0, sizeof(index));
4085 target.app_data = pUnKey;
4086 /* paranoia */
4087 pCur->key.app_data = NULL;
4088 if (pCur->isDupIndex) {
4089 btreeCreateIndexKey(pCur);
4090 index = pCur->index;
4091 } else
4092 index = pCur->key;
4093 if (index.data) {
4094 #ifdef BDBSQL_SINGLE_THREAD
4095 res = btreeCompareKeyInfo(
4096 pBDb, &index, &target);
4097 #else
4098 res = btreeCompareShared(pBDb, &index, &target);
4099 #endif
4100 } else {
4101 ret = ENOMEM;
4102 pCur->eState = CURSOR_FAULT;
4103 pCur->error = ret;
4104 }
4105 }
4106 } else if (ret == DB_NOTFOUND) {
4107 /* The table is empty. */
4108 log_msg(LOG_VERBOSE, "sqlite3BtreeMoveto the table is empty.");
4109 ret = 0;
4110 pCur->eState = CURSOR_INVALID;
4111 pCur->lastKey = -1;
4112 } else {
4113 pCur->eState = CURSOR_FAULT;
4114 pCur->error = ret;
4115 }
4116
4117 done: if (pRes != NULL)
4118 *pRes = res;
4119 HANDLE_INCRBLOB_DEADLOCK(ret, pCur)
4120 return (ret == 0) ? SQLITE_OK : dberr2sqlitelocked(ret, pCur->pBtree);
4121 }
4122
/*
 * Position pCur near the packed record pKey/nKey.
 *
 * The record is unpacked once so that comparisons made during the search
 * do not each have to unpack it, then the work is delegated to
 * sqlite3BtreeMovetoUnpacked().  bias and pRes are passed through.
 *
 * Returns the result of sqlite3BtreeMovetoUnpacked(), or SQLITE_NOMEM if
 * the record could not be unpacked.
 */
int btreeMoveto(BtCursor *pCur, const void *pKey, i64 nKey, int bias, int *pRes)
{
	UnpackedRecord *p;
	char aSpace[150];
	int res;

	/*
	 * Cache an unpacked key in the DBT so we don't have to unpack
	 * it on every comparison.
	 */
	p = sqlite3VdbeRecordUnpack(pCur->keyInfo, (int)nKey, pKey, aSpace,
	    sizeof(aSpace));
	/* Unpacking allocates when aSpace is too small and can fail. */
	if (p == NULL)
		return SQLITE_NOMEM;

	res = sqlite3BtreeMovetoUnpacked(pCur, p, nKey, bias, pRes);

	sqlite3VdbeDeleteUnpackedRecord(p);
	/* The unpacked record is gone; don't leave a pointer to it. */
	pCur->key.app_data = NULL;

	return res;
}
4143
/*
 * Close the Berkeley DB cursor underneath pCur so that a change made
 * through another cursor is not hidden by it.
 *
 * A cursor that was CURSOR_VALID becomes CURSOR_REQUIRESEEK, or
 * CURSOR_INVALID when it is an incrblob handle and incrBlobUpdate is zero.
 * For non-integer-key, non-duplicate-index cursors the current key is
 * copied into sqlite3_malloc'd memory (flagged DB_DBT_APPMALLOC) first.
 *
 * Returns SQLITE_OK, SQLITE_NOMEM if the key copy cannot be allocated (the
 * cursor is then left untouched), or the mapped error from closing the
 * cursor.  The caller must hold the BtShared mutex.
 */
static int btreeTripCursor(BtCursor *pCur, int incrBlobUpdate)
{
	DBC *dbc;
	int ret;
	void *keyCopy;

	/*
	 * This is protected by the BtShared mutex so that other threads won't
	 * attempt to access the cursor in btreeTripWatchers while we are
	 * closing it.
	 */
	assert(sqlite3_mutex_held(pCur->pBtree->pBt->mutex));

	/*
	 * Need to close here so that the update happens unambiguously in
	 * the primary cursor.  That means the memory holding our copy of the
	 * key will be freed, so take a copy here.
	 *
	 * The copy is made before the handle is detached: if the allocation
	 * fails we return with the cursor unchanged instead of dropping the
	 * only reference to a still-open DBC.
	 */
	if (!pIntKey && !pCur->isDupIndex) {
		if ((keyCopy = sqlite3_malloc(pCur->key.size)) == NULL)
			return SQLITE_NOMEM;
		memcpy(keyCopy, pCur->key.data, pCur->key.size);
		pCur->key.data = keyCopy;
		pCur->key.flags |= DB_DBT_APPMALLOC;
	}

	dbc = pDbc;
	pDbc = NULL;

	if (pCur->eState == CURSOR_VALID)
		pCur->eState = (pCur->isIncrblobHandle && !incrBlobUpdate) ?
		    CURSOR_INVALID : CURSOR_REQUIRESEEK;

	ret = dbc->close(dbc);
	pCur->multiGetPtr = NULL;
	pCur->isFirst = 0;
	return (ret == 0) ? SQLITE_OK : dberr2sqlite(ret, pCur->pBtree);
}
4184
/*
 * Trip every other valid cursor of the same Btree handle and table that
 * could be affected by a change made through pCur.
 *
 * A cursor holding a bulk buffer (multiGetPtr set) is always tripped; any
 * other cursor is tripped only when DBC->cmp reports that it sits on the
 * same position as pCur.  The scan stops at the first failure and that
 * code is returned.  Takes and releases the BtShared mutex.
 */
static int btreeTripWatchers(BtCursor *pCur, int incrBlobUpdate)
{
	BtShared *pBt;
	BtCursor *pOther;
	int samePos, rc;

	pBt = pCur->pBtree->pBt;
	rc = SQLITE_OK;

	sqlite3_mutex_enter(pBt->mutex);
	pOther = pBt->first_cursor;
	while (pOther != NULL && rc == SQLITE_OK) {
		/* The call to ->cmp does not do any locking. */
		if (pOther != pCur &&
		    pOther->pBtree == pCur->pBtree &&
		    pOther->tableIndex == pCur->tableIndex &&
		    pOther->eState == CURSOR_VALID &&
		    (pOther->multiGetPtr != NULL ||
		    (pDbc->cmp(pDbc, pOther->dbc, &samePos, 0) == 0 &&
		    samePos == 0)))
			rc = btreeTripCursor(pOther, incrBlobUpdate);
		pOther = pOther->next;
	}
	sqlite3_mutex_leave(pBt->mutex);

	return rc;
}
4213
/*
 * Trip every cursor with an open Berkeley DB handle on table iTable.
 *
 * Returns SQLITE_LOCKED_SHAREDCACHE as soon as such a cursor is found that
 * belongs to a Btree handle other than p; otherwise returns the first
 * failure from btreeTripCursor(), or SQLITE_OK.  The caller must hold the
 * BtShared mutex.
 */
static int btreeTripAll(Btree *p, int iTable, int incrBlobUpdate)
{
	BtCursor *pScan;
	int rc;

	assert(sqlite3_mutex_held(p->pBt->mutex));

	rc = SQLITE_OK;
	for (pScan = p->pBt->first_cursor; pScan != NULL;
	    pScan = pScan->next) {
		if (pScan->tableIndex != iTable || pScan->dbc == NULL)
			continue;
		if (pScan->pBtree != p)
			return SQLITE_LOCKED_SHAREDCACHE;
		rc = btreeTripCursor(pScan, incrBlobUpdate);
		if (rc != SQLITE_OK)
			break;
	}

	return rc;
}
4236
/*
 * Re-open the Berkeley DB cursor underneath pCur and, unless skipMoveto is
 * set, seek it back to the position that was saved when it was closed.
 *
 * Returns pCur->error for a faulted cursor, and SQLITE_ABORT when the
 * cursor has no Btree or is CURSOR_INVALID and a seek was requested.
 * With skipMoveto set, any saved key copy is released and the cursor is
 * left CURSOR_INVALID with a fresh handle.  Otherwise the saved integer
 * key or saved record is searched for again and the comparison result is
 * stored in pCur->lastRes.
 */
static int btreeRestoreCursorPosition(BtCursor *pCur, int skipMoveto)
{
	Btree *p;
	BtShared *pBt;
	void *savedKey;
	int rc, ret, savedLen;

	if (pCur->eState == CURSOR_FAULT)
		return pCur->error;
	if (pCur->pBtree == NULL ||
	    (pCur->eState == CURSOR_INVALID && !skipMoveto))
		return SQLITE_ABORT;

	p = pCur->pBtree;
	pBt = p->pBt;

	assert(pDbc == NULL);

	if (!pIsBuffer) {
		/*
		 * SQLite should guarantee that an appropriate transaction is
		 * active.
		 */
		assert(!pBt->transactional || pReadTxn != NULL);
		assert(!pBt->transactional || !pCur->wrFlag ||
		    pSavepointTxn != NULL);

		pCur->txn = pCur->wrFlag ? pSavepointTxn : pReadTxn;

		ret = pBDb->cursor(pBDb, pCur->txn, &pDbc,
		    GET_BTREE_ISOLATION(p) & ~DB_READ_COMMITTED);
		if (ret != 0)
			return dberr2sqlite(ret, p);
	} else {
		/* Buffered results must be moved into a real table first. */
		rc = btreeLoadBufferIntoTable(pCur);
		if (rc != SQLITE_OK)
			return rc;
	}

	if (skipMoveto) {
		/* The saved key will not be needed: drop our copy. */
		if ((pCur->key.flags & DB_DBT_APPMALLOC) != 0) {
			sqlite3_free(pCur->key.data);
			pCur->key.data = NULL;
			pCur->key.flags &= ~DB_DBT_APPMALLOC;
		}
		pCur->eState = CURSOR_INVALID;
		return SQLITE_OK;
	}

	if (pIntKey)
		return sqlite3BtreeMovetoUnpacked(pCur, NULL,
		    pCur->savedIntKey, 0, &pCur->lastRes);

	/*
	 * The pointer in pCur->key.data will be overwritten when we
	 * reposition, so we need to take a copy.
	 */
	if (!pCur->isDupIndex) {
		assert((pCur->key.flags & DB_DBT_APPMALLOC) != 0);
		savedKey = pCur->key.data;
		savedLen = pCur->key.size;
		pCur->key.flags &= ~DB_DBT_APPMALLOC;
	} else {
		/*
		 * Take over the combined key buffer; clearing pCur->index
		 * makes the search below allocate a new one.
		 */
		savedKey = btreeCreateIndexKey(pCur);
		savedLen = pCur->index.size;
		memset(&pCur->index, 0, sizeof(DBT));
		if (savedKey == NULL)
			return SQLITE_NOMEM;
	}

	rc = btreeMoveto(pCur, savedKey, savedLen, 0, &pCur->lastRes);

	/* The cursor's built-in buffer is not heap memory. */
	if (savedKey != pCur->indexKeyBuf)
		sqlite3_free(savedKey);
	return rc;
}
4311
4312 /*
4313 * Create a temporary table and load the contents of the multi buffer into it.
4314 */
btreeLoadBufferIntoTable(BtCursor * pCur)4315 static int btreeLoadBufferIntoTable(BtCursor *pCur)
4316 {
4317 Btree *p;
4318 BtShared *pBt;
4319 int rc, ret;
4320 void *temp;
4321 sqlite3_mutex *mutexOpen;
4322
4323 assert(pCur->cached_db == NULL);
4324
4325 p = pCur->pBtree;
4326 pBt = p->pBt;
4327 ret = 0;
4328
4329 UPDATE_DURING_BACKUP(p)
4330
4331 temp = pCur->multiData.data;
4332 pCur->multiData.data = NULL;
4333 assert(pIsBuffer);
4334 pIsBuffer = 0;
4335
4336 if ((rc = btreeCloseCursor(pCur, 1)) != SQLITE_OK)
4337 goto err;
4338
4339 if (pBt->dbenv == NULL) {
4340 mutexOpen = sqlite3MutexAlloc(OPEN_MUTEX(pBt->dbStorage));
4341 sqlite3_mutex_enter(mutexOpen);
4342 rc = btreePrepareEnvironment(p);
4343 sqlite3_mutex_leave(mutexOpen);
4344 if (rc != SQLITE_OK)
4345 goto err;
4346 }
4347 rc = sqlite3BtreeCursor(p, pCur->tableIndex, 1, pCur->keyInfo, pCur);
4348 if (pCur->eState == CURSOR_FAULT)
4349 rc = pCur->error;
4350 if (rc != SQLITE_OK)
4351 goto err;
4352 assert(!pCur->isDupIndex);
4353 pCur->multiData.data = temp;
4354 temp = NULL;
4355 if (pCur->multiData.data != NULL) {
4356 if ((ret = pBDb->sort_multiple(pBDb, &pCur->multiData, NULL,
4357 DB_MULTIPLE_KEY)) != 0)
4358 goto err;
4359 if ((ret = pBDb->put(pBDb, pCur->txn, &pCur->multiData, NULL,
4360 DB_MULTIPLE_KEY)) != 0)
4361 goto err;
4362 }
4363
4364 err: /*
4365 * If we get to here and we haven't set up the newly-opened cursor
4366 * properly, free the buffer it was holding now. SQLite may not close
4367 * the cursor explicitly, and it is no longer in the list of open
4368 * cursors for the environment, so it will not be cleaned up on close.
4369 */
4370 if (temp != NULL) {
4371 assert(rc != SQLITE_OK || ret != 0);
4372 sqlite3_free(temp);
4373 }
4374 return MAP_ERR(rc, ret, p);
4375 }
4376
4377 /*
4378 ** Set *pSize to the size of the buffer needed to hold the value of the key
4379 ** for the current entry. If the cursor is not pointing to a valid entry,
4380 ** *pSize is set to 0.
4381 **
4382 ** For a table with the INTKEY flag set, this routine returns the key itself,
4383 ** not the number of bytes in the key.
4384 */
sqlite3BtreeKeySize(BtCursor * pCur,i64 * pSize)4385 int sqlite3BtreeKeySize(BtCursor *pCur, i64 *pSize)
4386 {
4387 int rc;
4388
4389 log_msg(LOG_VERBOSE, "sqlite3BtreeKeySize(%p, %p)", pCur, pSize);
4390
4391 if (pCur->eState != CURSOR_VALID &&
4392 (rc = btreeRestoreCursorPosition(pCur, 0)) != SQLITE_OK)
4393 return rc;
4394
4395 if (pIntKey)
4396 *pSize = pCur->savedIntKey;
4397 else {
4398 if (pCur->isDupIndex)
4399 *pSize = (pCur->eState == CURSOR_VALID) ?
4400 pCur->index.size : 0;
4401 else
4402 *pSize = (pCur->eState == CURSOR_VALID) ?
4403 pCur->key.size : 0;
4404 }
4405
4406 return SQLITE_OK;
4407 }
4408
4409 /*
4410 ** Set *pSize to the number of bytes of data in the entry the cursor currently
4411 ** points to. Always return SQLITE_OK. Failure is not possible. If the cursor
4412 ** is not currently pointing to an entry (which can happen, for example, if
4413 ** the database is empty) then *pSize is set to 0.
4414 */
sqlite3BtreeDataSize(BtCursor * pCur,u32 * pSize)4415 int sqlite3BtreeDataSize(BtCursor *pCur, u32 *pSize)
4416 {
4417 int rc;
4418
4419 log_msg(LOG_VERBOSE, "sqlite3BtreeDataSize(%p, %p)", pCur, pSize);
4420
4421 if (pCur->eState != CURSOR_VALID &&
4422 (rc = btreeRestoreCursorPosition(pCur, 0)) != SQLITE_OK)
4423 return rc;
4424
4425 if (pCur->isDupIndex)
4426 *pSize = 0;
4427 else
4428 *pSize = (pCur->eState == CURSOR_VALID) ? pCur->data.size : 0;
4429 return SQLITE_OK;
4430 }
4431
4432 /*
4433 ** Read part of the key associated with cursor pCur. Exactly "amt" bytes will
4434 ** be transfered into pBuf[]. The transfer begins at "offset".
4435 **
4436 ** Return SQLITE_OK on success or an error code if anything goes wrong. An
4437 ** error is returned if "offset+amt" is larger than the available payload.
4438 */
sqlite3BtreeKey(BtCursor * pCur,u32 offset,u32 amt,void * pBuf)4439 int sqlite3BtreeKey(BtCursor *pCur, u32 offset, u32 amt, void *pBuf)
4440 {
4441 int rc;
4442
4443 log_msg(LOG_VERBOSE, "sqlite3BtreeKey(%p, %u, %u, %p)",
4444 pCur, offset, amt, pBuf);
4445
4446 if (pCur->eState != CURSOR_VALID &&
4447 (rc = btreeRestoreCursorPosition(pCur, 0)) != SQLITE_OK)
4448 return rc;
4449
4450 assert(pCur->eState == CURSOR_VALID);
4451 /* The rowid part of the key in an index is stored in the
4452 * data part of the cursor.*/
4453 if (pCur->isDupIndex)
4454 memcpy(pBuf, (u_int8_t *)pCur->index.data + offset, amt);
4455 else
4456 memcpy(pBuf, (u_int8_t *)pCur->key.data + offset, amt);
4457 return SQLITE_OK;
4458 }
4459
4460 /*
4461 ** Read part of the data associated with cursor pCur. Exactly "amt" bytes
4462 ** will be transfered into pBuf[]. The transfer begins at "offset".
4463 **
4464 ** Return SQLITE_OK on success or an error code if anything goes wrong. An
4465 ** error is returned if "offset+amt" is larger than the available payload.
4466 */
sqlite3BtreeData(BtCursor * pCur,u32 offset,u32 amt,void * pBuf)4467 int sqlite3BtreeData(BtCursor *pCur, u32 offset, u32 amt, void *pBuf)
4468 {
4469 int rc;
4470
4471 log_msg(LOG_VERBOSE, "sqlite3BtreeData(%p, %u, %u, %p)",
4472 pCur, offset, amt, pBuf);
4473
4474 if (pCur->eState != CURSOR_VALID &&
4475 (rc = btreeRestoreCursorPosition(pCur, 0)) != SQLITE_OK)
4476 return rc;
4477
4478 assert(pCur->eState == CURSOR_VALID);
4479 memcpy(pBuf, (u_int8_t *)pCur->data.data + offset, amt);
4480 return SQLITE_OK;
4481 }
4482
/*
 * Make sure pCur->index can hold at least "amount" bytes and return the
 * buffer.
 *
 * When the current capacity (index.ulen) is too small a buffer of twice
 * the requested size is allocated; the old one is freed unless it is the
 * cursor's built-in indexKeyBuf.  Existing contents are not preserved.
 * On allocation failure the cursor is put into CURSOR_FAULT with
 * SQLITE_NOMEM and NULL is returned.
 */
void *allocateCursorIndex(BtCursor *pCur, u_int32_t amount)
{
	/* Big enough already. */
	if (amount <= pCur->index.ulen)
		return pCur->index.data;

	pCur->index.ulen = amount * 2;
	if (pCur->index.data != pCur->indexKeyBuf)
		sqlite3_free(pCur->index.data);

	pCur->index.data = sqlite3_malloc(pCur->index.ulen);
	if (pCur->index.data == NULL) {
		pCur->error = SQLITE_NOMEM;
		pCur->eState = CURSOR_FAULT;
	}
	return pCur->index.data;
}
4498
4499 /* The rowid part of an index key is actually stored as data
4500 * in a Berkeley DB database, so it needs to be appended to the
4501 * key. */
btreeCreateIndexKey(BtCursor * pCur)4502 void *btreeCreateIndexKey(BtCursor *pCur)
4503 {
4504 u32 hdrSize;
4505 u_int32_t amount;
4506 unsigned char *aKey = (unsigned char *)pCur->key.data;
4507 unsigned char *data = (unsigned char *)pCur->data.data;
4508 unsigned char *newKey;
4509
4510 amount = pCur->key.size + pCur->data.size;
4511 if (!allocateCursorIndex(pCur, amount))
4512 return NULL;
4513 newKey = (unsigned char *)pCur->index.data;
4514 getVarint32(aKey, hdrSize);
4515 /*
4516 * The first byte contains the size of the record header,
4517 * which will change anyway so no need to copy it now. We
4518 * are trying to minimize the number of times memcpy is called
4519 * in the common path.
4520 */
4521 if ((hdrSize - 1) == 1)
4522 newKey[1] = aKey[1];
4523 else
4524 memcpy(&newKey[1], &aKey[1], hdrSize - 1);
4525 if (pCur->key.size != hdrSize) {
4526 memcpy(&newKey[hdrSize+1], &aKey[hdrSize],
4527 pCur->key.size - hdrSize);
4528 }
4529 memcpy(&newKey[pCur->key.size+1], &data[1], pCur->data.size - 1);
4530 newKey[hdrSize] = data[0];
4531 putVarint32(newKey, hdrSize+1);
4532 pCur->index.size = amount;
4533 return newKey;
4534 }
4535
4536 /*
4537 ** For the entry that cursor pCur is point to, return as many bytes of the
4538 ** key or data as are available on the local b-tree page. Write the number
4539 ** of available bytes into *pAmt.
4540 **
4541 ** The pointer returned is ephemeral. The key/data may move or be destroyed
4542 ** on the next call to any Btree routine.
4543 **
4544 ** These routines is used to get quick access to key and data in the common
4545 ** case where no overflow pages are used.
4546 */
sqlite3BtreeKeyFetch(BtCursor * pCur,int * pAmt)4547 const void *sqlite3BtreeKeyFetch(BtCursor *pCur, int *pAmt)
4548 {
4549 log_msg(LOG_VERBOSE, "sqlite3BtreeKeyFetch(%p, %p)", pCur, pAmt);
4550
4551 assert(pCur->eState == CURSOR_VALID);
4552 if (pCur->isDupIndex) {
4553 *pAmt = pCur->index.size;
4554 return pCur->index.data;
4555 }
4556 *pAmt = pCur->key.size;
4557 return pCur->key.data;
4558 }
4559
/*
** Return a pointer to the data of the entry the cursor is pointing to and
** write its size in bytes into *pAmt.  The pointer refers to the cursor's
** own data DBT and is not copied.
*/
const void *sqlite3BtreeDataFetch(BtCursor *pCur, int *pAmt)
{
	const void *payload;

	log_msg(LOG_VERBOSE, "sqlite3BtreeDataFetch(%p, %p)", pCur, pAmt);

	assert(pCur->eState == CURSOR_VALID);
	payload = pCur->data.data;
	*pAmt = pCur->data.size;
	return payload;
}
4568
4569 /*
4570 ** Clear the current cursor position.
4571 */
sqlite3BtreeClearCursor(BtCursor * pCur)4572 void sqlite3BtreeClearCursor(BtCursor *pCur)
4573 {
4574 log_msg(LOG_VERBOSE, "sqlite3BtreeClearCursor(%p)", pCur);
4575
4576 pCur->eState = CURSOR_INVALID;
4577 }
4578
/*
 * Post-process the key/data just fetched into the cursor: copy the integer
 * key into savedIntKey for INTKEY tables, or build the combined index key
 * for indexes stored with duplicates.  Returns SQLITE_NOMEM if the combined
 * key cannot be built, SQLITE_OK otherwise.
 */
static int decodeResults(BtCursor *pCur)
{
	if (pIntKey) {
		memcpy(&(pCur->savedIntKey), pCur->key.data, sizeof(i64));
		return SQLITE_OK;
	}
	if (!pCur->isDupIndex)
		return SQLITE_OK;
	return (btreeCreateIndexKey(pCur) != NULL) ? SQLITE_OK : SQLITE_NOMEM;
}
4587
/*
 * Fetch an entry for the cursor according to the Berkeley DB cursor
 * operation "op" (DB_FIRST, DB_LAST, DB_NEXT or DB_PREV are used by the
 * callers in this file).
 *
 * Entries are served from the bulk buffer when one is active; DB_FIRST, and
 * DB_NEXT when skipMulti is clear, try a bulk get first and fall back to a
 * single get if the buffer is too small.
 *
 * On success *pRes is 0 and the cursor is valid.  When there is no entry,
 * or on error, *pRes is 1, the key/data sizes are zeroed and the cursor is
 * CURSOR_INVALID.  DB_NOTFOUND is not an error.
 */
static int cursorGet(BtCursor *pCur, int op, int *pRes)
{
	static int numMultiGets, numBufferGets, numBufferSmalls;
	DBT prevKey, lastKey, lastData;
	int ret, differs;

	log_msg(LOG_VERBOSE, "cursorGet(%p, %u, %p)", pCur, op, pRes);
	ret = 0;

	if (op == DB_NEXT && pCur->multiGetPtr != NULL) {
		/*
		 * Get the next record, skipping duplicates in buffered
		 * indices/transient table.  Note that when we store an
		 * index in a buffer, it is always configured with
		 * BTREE_ZERODATA and we don't configure transient indices
		 * with DB_DUPSORT.  So the data part will always be empty,
		 * and we don't need to check it.
		 */
		prevKey = pCur->key;
		for (;;) {
			DB_MULTIPLE_KEY_NEXT(pCur->multiGetPtr,
			    &pCur->multiData, pCur->key.data, pCur->key.size,
			    pCur->data.data, pCur->data.size);
			if (!pIsBuffer || pCur->multiGetPtr == NULL ||
			    prevKey.size != pCur->key.size)
				break;
			if (pCur->keyInfo == NULL)
				differs = memcmp(pCur->key.data, prevKey.data,
				    prevKey.size);
			else
				differs = btreeCompare(NULL, &pCur->key,
				    &prevKey, pCur->keyInfo);
			if (differs != 0)
				break;
			prevKey = pCur->key;
		}

		if (pCur->multiGetPtr != NULL) {
			++numBufferGets;
			*pRes = 0;
			return decodeResults(pCur);
		}
		/* A pure buffer has nothing behind it: end of data. */
		if (pIsBuffer)
			goto err;
	}

	if (pIsBuffer && op == DB_LAST) {
		memset(&lastKey, 0, sizeof(lastKey));
		memset(&lastData, 0, sizeof(lastData));
		if (pCur->multiGetPtr == NULL)
			goto err;
		/* Walk the buffer, remembering the final entry seen. */
		do {
			DB_MULTIPLE_KEY_NEXT(pCur->multiGetPtr,
			    &pCur->multiData, lastKey.data, lastKey.size,
			    lastData.data, lastData.size);
			if (pCur->multiGetPtr != NULL) {
				pCur->key = lastKey;
				pCur->data = lastData;
			}
		} while (pCur->multiGetPtr != NULL);
		*pRes = 0;
		return decodeResults(pCur);
	}

	assert(!pIsBuffer);

	if (op == DB_FIRST || (op == DB_NEXT && !pCur->skipMulti)) {
		++numMultiGets;

		if (pCur->multiData.data == NULL) {
			pCur->multiData.data = sqlite3_malloc(MULTI_BUFSIZE);
			if (pCur->multiData.data == NULL)
				return SQLITE_NOMEM;
			pCur->multiData.flags = DB_DBT_USERMEM;
			pCur->multiData.ulen = MULTI_BUFSIZE;
		}

		/*
		 * We can't keep DBC_TRANSIENT set on a bulk get
		 * cursor: if the buffer turns out to be too small, we
		 * have no way to restore the position.
		 */
		pDbc->flags &= ~DBC_TRANSIENT;
		ret = pDbc->get(pDbc, &pCur->key, &pCur->multiData,
		    op | DB_MULTIPLE_KEY);
		if (!pCur->wrFlag)
			pDbc->flags |= DBC_TRANSIENT;

		if (ret == 0) {
			pCur->isFirst = (op == DB_FIRST);
			DB_MULTIPLE_INIT(pCur->multiGetPtr, &pCur->multiData);
			DB_MULTIPLE_KEY_NEXT(pCur->multiGetPtr,
			    &pCur->multiData, pCur->key.data, pCur->key.size,
			    pCur->data.data, pCur->data.size);
			pCur->eState = CURSOR_VALID;
			*pRes = 0;
			return decodeResults(pCur);
		}
		if (ret != DB_BUFFER_SMALL)
			goto err;
		/* Buffer too small: fall through to a single get. */
		++numBufferSmalls;
	} else if (op == DB_NEXT)
		pCur->skipMulti = 0;

	pCur->lastRes = 0;
	pCur->isFirst = 0;

	ret = pDbc->get(pDbc, &pCur->key, &pCur->data, op | RMW(pCur));
	if (ret == 0) {
		pCur->eState = CURSOR_VALID;
		*pRes = 0;
		return decodeResults(pCur);
	}

err:	if (ret == DB_NOTFOUND)
		ret = 0;
	if (ret != 0 && ret != DB_LOCK_DEADLOCK)
		log_msg(LOG_NORMAL, "cursorGet get returned error: %s",
		    db_strerror(ret));
	pCur->key.size = pCur->data.size = 0;
	pCur->eState = CURSOR_INVALID;
	*pRes = 1;
	return (ret == 0) ? SQLITE_OK : dberr2sqlitelocked(ret, pCur->pBtree);
}
4715
4716 /* Move the cursor to the first entry in the table. Return SQLITE_OK on
4717 ** success. Set *pRes to 0 if the cursor actually points to something or set
4718 ** *pRes to 1 if the table is empty.
4719 */
sqlite3BtreeFirst(BtCursor * pCur,int * pRes)4720 int sqlite3BtreeFirst(BtCursor *pCur, int *pRes)
4721 {
4722 DB *tmp_db;
4723 u_int32_t get_flag;
4724 int rc, ret;
4725
4726 log_msg(LOG_VERBOSE, "sqlite3BtreeFirst(%p, %p)", pCur, pRes);
4727
4728 get_flag = DB_FIRST;
4729
4730 if (pCur->eState == CURSOR_FAULT)
4731 return pCur->error;
4732
4733 /*
4734 * We might be lucky, and be holding all of a table in the bulk buffer.
4735 */
4736 if (pCur->multiData.data != NULL && (pIsBuffer || pCur->isFirst)) {
4737 /*
4738 * If we've just finished constructing a transient table, sort
4739 * and retrieve.
4740 */
4741 if (pCur->multiPutPtr != NULL) {
4742 if (pCur->eState == CURSOR_FAULT)
4743 return pCur->error;
4744
4745 if ((ret = db_create(&tmp_db,
4746 pCur->pBtree->pBt->dbenv, 0)) != 0)
4747 return dberr2sqlite(ret, pCur->pBtree);
4748 tmp_db->app_private = pCur->keyInfo;
4749 if (!pIntKey)
4750 tmp_db->set_bt_compare(tmp_db,
4751 btreeCompareKeyInfo);
4752 else
4753 tmp_db->set_bt_compare(tmp_db,
4754 btreeCompareIntKey);
4755 tmp_db->sort_multiple(tmp_db, &pCur->multiData,
4756 NULL, DB_MULTIPLE_KEY);
4757 if ((ret = tmp_db->close(tmp_db, 0)) != 0)
4758 return dberr2sqlite(ret, pCur->pBtree);
4759 pCur->multiPutPtr = NULL;
4760 }
4761
4762 DB_MULTIPLE_INIT(pCur->multiGetPtr, &pCur->multiData);
4763 memset(&pCur->key, 0, sizeof(pCur->key));
4764 pCur->isFirst = 1;
4765 pCur->eState = CURSOR_VALID;
4766 get_flag = DB_NEXT;
4767 } else if (pIsBuffer) {
4768 *pRes = 1;
4769 return SQLITE_OK;
4770 } else {
4771 pCur->multiGetPtr = NULL;
4772
4773 if (pDbc == NULL &&
4774 (rc = btreeRestoreCursorPosition(pCur, 1)) != SQLITE_OK)
4775 return rc;
4776 }
4777
4778 return cursorGet(pCur, get_flag, pRes);
4779 }
4780
4781 /*
4782 ** Move the cursor to the last entry in the table. Return SQLITE_OK on
4783 ** success. Set *pRes to 0 if the cursor actually points to something or set
4784 ** *pRes to 1 if the table is empty.
4785 */
sqlite3BtreeLast(BtCursor * pCur,int * pRes)4786 int sqlite3BtreeLast(BtCursor *pCur, int *pRes)
4787 {
4788 DB *tmp_db;
4789 int rc, ret;
4790
4791 log_msg(LOG_VERBOSE, "sqlite3BtreeLast(%p, %p)", pCur, pRes);
4792
4793 if (pCur->eState == CURSOR_FAULT)
4794 return pCur->error;
4795
4796 if (pCur->multiData.data != NULL && pIsBuffer) {
4797 if (pCur->multiPutPtr != NULL) {
4798 if ((ret = db_create(&tmp_db,
4799 pCur->pBtree->pBt->dbenv, 0)) != 0)
4800 return dberr2sqlite(ret, pCur->pBtree);
4801 tmp_db->app_private = pCur->keyInfo;
4802 if (!pIntKey)
4803 tmp_db->set_bt_compare(tmp_db,
4804 btreeCompareKeyInfo);
4805 else
4806 tmp_db->set_bt_compare(tmp_db,
4807 btreeCompareIntKey);
4808 tmp_db->sort_multiple(tmp_db, &pCur->multiData,
4809 NULL, DB_MULTIPLE_KEY);
4810 if ((ret = tmp_db->close(tmp_db, 0)) != 0)
4811 return dberr2sqlite(ret, pCur->pBtree);
4812 pCur->multiPutPtr = NULL;
4813 }
4814
4815 DB_MULTIPLE_INIT(pCur->multiGetPtr, &pCur->multiData);
4816 memset(&pCur->key, 0, sizeof(pCur->key));
4817 pCur->eState = CURSOR_VALID;
4818 } else if (pIsBuffer) {
4819 *pRes = 1;
4820 return SQLITE_OK;
4821 } else {
4822 if (pDbc == NULL &&
4823 (rc = btreeRestoreCursorPosition(pCur, 1)) != SQLITE_OK)
4824 return rc;
4825
4826 pCur->multiGetPtr = NULL;
4827 }
4828
4829 return cursorGet(pCur, DB_LAST, pRes);
4830 }
4831
4832 /*
4833 ** Return TRUE if the cursor is not pointing at an entry of the table.
4834 **
4835 ** TRUE will be returned after a call to sqlite3BtreeNext() moves past the last
4836 ** entry in the table or sqlite3BtreePrev() moves past the first entry. TRUE
4837 ** is also returned if the table is empty.
4838 */
sqlite3BtreeEof(BtCursor * pCur)4839 int sqlite3BtreeEof(BtCursor *pCur)
4840 {
4841 log_msg(LOG_VERBOSE, "sqlite3BtreeEof(%p)", pCur);
4842
4843 return pCur->eState == CURSOR_INVALID;
4844 }
4845
4846 /*
4847 ** Advance the cursor to the next entry in the database. If successful then
4848 ** set *pRes=0. If the cursor was already pointing to the last entry in the
4849 ** database before this routine was called, then set *pRes=1.
4850 */
sqlite3BtreeNext(BtCursor * pCur,int * pRes)4851 int sqlite3BtreeNext(BtCursor *pCur, int *pRes)
4852 {
4853 int rc;
4854 log_msg(LOG_VERBOSE, "sqlite3BtreeNext(%p, %p)", pCur, pRes);
4855
4856 if (pCur->pBtree != NULL && pCur->eState == CURSOR_INVALID) {
4857 *pRes = 1;
4858 return SQLITE_OK;
4859 }
4860
4861 if (pCur->eState != CURSOR_VALID &&
4862 (rc = btreeRestoreCursorPosition(pCur, 0)) != SQLITE_OK)
4863 return rc;
4864
4865 if (pCur->lastRes > 0) {
4866 pCur->lastRes = 0;
4867 *pRes = 0;
4868 return SQLITE_OK;
4869 }
4870
4871 return cursorGet(pCur, DB_NEXT, pRes);
4872 }
4873
4874 /*
4875 ** Step the cursor to the back to the previous entry in the database. If
4876 ** successful then set *pRes=0. If the cursor was already pointing to the
4877 ** first entry in the database before this routine was called, then set *pRes=1.
4878 */
sqlite3BtreePrevious(BtCursor * pCur,int * pRes)4879 int sqlite3BtreePrevious(BtCursor *pCur, int *pRes)
4880 {
4881 int rc;
4882 log_msg(LOG_VERBOSE, "sqlite3BtreePrevious(%p, %p)", pCur, pRes);
4883
4884 if (pCur->eState != CURSOR_VALID &&
4885 (rc = btreeRestoreCursorPosition(pCur, 0)) != SQLITE_OK)
4886 return rc;
4887
4888 if (pCur->eState == CURSOR_INVALID) {
4889 *pRes = 1;
4890 return SQLITE_OK;
4891 }
4892
4893 if (pCur->lastRes < 0) {
4894 pCur->lastRes = 0;
4895 *pRes = 0;
4896 return SQLITE_OK;
4897 }
4898
4899 return cursorGet(pCur, DB_PREV, pRes);
4900 }
4901
/*
 * Write the key/data currently held in the cursor's DBTs through its
 * Berkeley DB cursor: DB_NODUPDATA for an index stored with duplicates,
 * DB_KEYLAST otherwise.
 *
 * When nZero is positive a single zero byte is then written with a partial
 * put at offset nData + nZero - 1 of the record just stored.  Returns the
 * Berkeley DB result of the last put attempted.
 */
static int insertData(BtCursor *pCur, int nZero, int nData)
{
	DBT tailByte;
	u8 zeroByte;
	u_int32_t putFlag;
	int ret;

	UPDATE_DURING_BACKUP(pCur->pBtree);

	putFlag = (pCur->isDupIndex) ? DB_NODUPDATA : DB_KEYLAST;
	ret = pDbc->put(pDbc, &pCur->key, &pCur->data, putFlag);
	if (ret != 0 || nZero <= 0)
		return ret;

	zeroByte = 0;
	memset(&tailByte, 0, sizeof(tailByte));
	tailByte.data = &zeroByte;
	tailByte.size = tailByte.dlen = tailByte.ulen = 1;
	tailByte.doff = nData + nZero - 1;
	tailByte.flags = DB_DBT_PARTIAL | DB_DBT_USERMEM;

	return pDbc->put(pDbc, &pCur->key, &tailByte, DB_CURRENT);
}
4925
4926 /*
4927 ** Insert a new record into the BTree. The key is given by (pKey,nKey) and
4928 ** the data is given by (pData,nData). The cursor is used only to define
4929 ** what table the record should be inserted into. The cursor is left
4930 ** pointing at a random location.
4931 **
4932 ** For an INTKEY table, only the nKey value of the key is used. pKey is
4933 ** ignored. For a ZERODATA table, the pData and nData are both ignored.
4934 */
sqlite3BtreeInsert(BtCursor * pCur,const void * pKey,i64 nKey,const void * pData,int nData,int nZero,int appendBias,int seekResult)4935 int sqlite3BtreeInsert(
4936 BtCursor *pCur, /* Insert data into the table of this cursor */
4937 const void *pKey, i64 nKey, /* The key of the new record */
4938 const void *pData, int nData, /* The data of the new record */
4939 int nZero, /* Number of extra 0 bytes */
4940 int appendBias, /* True if this likely an append */
4941 int seekResult) /* Result of prior sqlite3BtreeMoveto() call */
4942 {
4943 int rc, ret;
4944 i64 encKey;
4945 UnpackedRecord *p;
4946 char aSpace[150];
4947
4948 log_msg(LOG_VERBOSE,
4949 "sqlite3BtreeInsert(%p, %p, %u, %p, %u, %u, %u, %u)",
4950 pCur, pKey, (int)nKey, pData, nData, nZero, appendBias, seekResult);
4951
4952 if (!pCur->wrFlag)
4953 return SQLITE_READONLY;
4954
4955 p = NULL;
4956 rc = SQLITE_OK;
4957
4958 /* Invalidate current cursor state. */
4959 pCur->multiGetPtr = NULL;
4960 pCur->isFirst = 0;
4961 pCur->lastKey = 0;
4962 memset(&pCur->key, 0, sizeof(pCur->key));
4963 memset(&pCur->data, 0, sizeof(pCur->data));
4964
4965 if (pIntKey) {
4966 pCur->key.size = sizeof(i64);
4967 encKey = nKey;
4968 pCur->key.data = &encKey;
4969 } else {
4970 pCur->key.data = (void *)pKey;
4971 pCur->key.size = (u_int32_t)nKey;
4972 }
4973 if (pCur->isDupIndex)
4974 splitIndexKey(pCur);
4975 else {
4976 pCur->data.data = (void *)pData;
4977 pCur->data.size = nData;
4978 }
4979
4980 if (pIsBuffer) {
4981 ret = 0;
4982 if (nZero == 0) {
4983 if (pCur->multiData.data == NULL) {
4984 if ((pCur->multiData.data =
4985 sqlite3_malloc(MULTI_BUFSIZE)) == NULL) {
4986 ret = ENOMEM;
4987 goto err;
4988 }
4989 pCur->multiData.flags = DB_DBT_USERMEM;
4990 pCur->multiData.ulen = MULTI_BUFSIZE;
4991 DB_MULTIPLE_WRITE_INIT(pCur->multiPutPtr,
4992 &pCur->multiData);
4993 }
4994 /*
4995 * It is possible for temporary results to be written,
4996 * read, then written again. In that case just load
4997 * the results into a table.
4998 */
4999 if (pCur->multiPutPtr != NULL) {
5000 DB_MULTIPLE_KEY_WRITE_NEXT(pCur->multiPutPtr,
5001 &pCur->multiData,
5002 pCur->key.data, pCur->key.size,
5003 pCur->data.data, pCur->data.size);
5004 }
5005 } else
5006 pCur->multiPutPtr = NULL;
5007 if (pCur->multiPutPtr == NULL) {
5008 rc = btreeLoadBufferIntoTable(pCur);
5009 if (rc != SQLITE_OK)
5010 return rc;
5011 ret = insertData(pCur, nZero, nData);
5012 }
5013 goto err;
5014 }
5015 if (!pIntKey && pKey != NULL) {
5016 /*
5017 * Cache an unpacked key in the DBT so we don't have to unpack
5018 * it on every comparison.
5019 */
5020 pCur->key.app_data = p = sqlite3VdbeRecordUnpack(pCur->keyInfo,
5021 (int)nKey, pKey, aSpace, sizeof(aSpace));
5022 }
5023
5024 ret = insertData(pCur, nZero, nData);
5025
5026 if (ret == 0) {
5027 /*
5028 * We may have updated a record or inserted into a range that
5029 * is cached by another cursor.
5030 */
5031 if ((rc = btreeTripWatchers(pCur, 0)) != SQLITE_OK)
5032 goto err;
5033 pCur->skipMulti = 0;
5034 } else
5035 pCur->eState = CURSOR_INVALID;
5036 err: if (p != NULL)
5037 sqlite3VdbeDeleteUnpackedRecord(p);
5038 pCur->key.app_data = NULL;
5039 return MAP_ERR_LOCKED(rc, ret, pCur->pBtree);
5040 }
5041
5042 /*
5043 ** Delete the entry that the cursor is pointing to. The cursor is left
5044 ** pointing at a random location.
5045 */
sqlite3BtreeDelete(BtCursor * pCur)5046 int sqlite3BtreeDelete(BtCursor *pCur)
5047 {
5048 DBC *tmpc;
5049 int rc, ret;
5050
5051 log_msg(LOG_VERBOSE, "sqlite3BtreeDelete(%p)", pCur);
5052
5053 ret = 0;
5054 if (!pCur->wrFlag)
5055 return SQLITE_READONLY;
5056
5057 if (pIsBuffer) {
5058 int res;
5059 rc = btreeMoveto(pCur, pCur->key.data, pCur->key.size, 0, &res);
5060 if (rc != SQLITE_OK)
5061 return rc;
5062 }
5063
5064 assert(!pIsBuffer);
5065
5066 if (pCur->multiGetPtr != NULL) {
5067 DBT dummy;
5068 pCur->multiGetPtr = NULL;
5069 pCur->isFirst = 0;
5070 memset(&dummy, 0, sizeof(dummy));
5071 dummy.flags = DB_DBT_USERMEM | DB_DBT_PARTIAL;
5072 if ((ret = pDbc->get(pDbc,
5073 &pCur->key, &dummy, DB_SET | RMW(pCur))) != 0)
5074 return dberr2sqlitelocked(ret, pCur->pBtree);
5075 pCur->eState = CURSOR_VALID;
5076 }
5077
5078 if ((rc = btreeTripWatchers(pCur, 0)) != SQLITE_OK)
5079 return rc;
5080 ret = pDbc->del(pDbc, 0);
5081
5082 /*
5083 * We now de-position the cursor to ensure that the record is
5084 * really deleted. [#18667]
5085 *
5086 * Since we tripped all watchers before doing the delete, there can be
5087 * no other open cursors pointing to this record. SQLite's record
5088 * comparator will behave incorrectly if it sees a record that is
5089 * marked for deletion (see the UNPACKED_PREFIX_SEARCH flag), so this
5090 * makes sure that never happens.
5091 */
5092 if (ret == 0 && (ret = pDbc->dup(pDbc, &tmpc, 0)) == 0) {
5093 ret = pDbc->close(pDbc);
5094 pDbc = tmpc;
5095 }
5096 pCur->eState = CURSOR_INVALID;
5097
5098 return (ret == 0) ? SQLITE_OK : dberr2sqlitelocked(ret, pCur->pBtree);
5099 }
5100
5101 /*
5102 ** Create a new BTree table. Write into *piTable the page number for the root
5103 ** page of the new table.
5104 **
5105 ** The type of type is determined by the flags parameter. Only the following
5106 ** values of flags are currently in use. Other values for flags might not
5107 ** work:
5108 **
5109 ** BTREE_INTKEY Used for SQL tables with rowid keys
5110 ** BTREE_BLOBKEY Used for SQL indices
5111 */
btreeCreateTable(Btree * p,int * piTable,int flags)5112 static int btreeCreateTable(Btree *p, int *piTable, int flags)
5113 {
5114 BtShared *pBt;
5115 CACHED_DB *cached_db;
5116 DBC *dbc;
5117 DBT key, data;
5118 int lastTable, rc, ret, t_ret;
5119
5120 cached_db = NULL;
5121 pBt = p->pBt;
5122 rc = SQLITE_OK;
5123 lastTable = 0;
5124 ret = 0;
5125
5126 dbc = NULL;
5127 if (pBt->dbStorage == DB_STORE_NAMED) {
5128 ret = pTablesDb->cursor(pTablesDb, pFamilyTxn, &dbc, 0);
5129 if (ret != 0)
5130 goto err;
5131
5132 memset(&key, 0, sizeof(key));
5133 memset(&data, 0, sizeof(data));
5134 data.flags = DB_DBT_PARTIAL | DB_DBT_USERMEM;
5135
5136 if ((ret = dbc->get(dbc, &key, &data, DB_LAST)) != 0)
5137 goto err;
5138
5139 if (strncmp((const char *)key.data, "table", 5) == 0 &&
5140 (ret = btreeTableNameToId(
5141 (const char *)key.data, key.size, &lastTable)) != 0)
5142 goto err;
5143
5144 ret = dbc->close(dbc);
5145 dbc = NULL;
5146 if (ret != 0)
5147 goto err;
5148 }
5149
5150 cached_db = NULL;
5151 rc = btreeFindOrCreateDataTable(p,
5152 &lastTable, &cached_db, flags | BTREE_CREATE);
5153 if (rc == SQLITE_OK)
5154 *piTable = lastTable;
5155
5156 err: if (dbc != NULL)
5157 if ((t_ret = dbc->close(dbc)) != 0 && ret == 0)
5158 ret = t_ret;
5159
5160 return MAP_ERR(rc, ret, p);
5161 }
5162
/*
** Public entry point for creating a table; see btreeCreateTable() for the
** meaning of flags.
**
** When the btree is a results buffer no table is created: *piTable is set
** to 2 and SQLITE_OK is returned.  Otherwise the environment is opened if
** this handle is not connected yet, and the work is delegated to
** btreeCreateTable().
*/
int sqlite3BtreeCreateTable(Btree *p, int *piTable, int flags)
{
	BtShared *pBt;
	int rc;

	log_msg(LOG_VERBOSE, "sqlite3BtreeCreateTable(%p, %p, %u)",
	    p, piTable, flags);

	pBt = p->pBt;

	/*
	 * With ephemeral tables, there are at most two tables created: the
	 * initial master table, which is used for INTKEY tables, or, for
	 * indices, a second table is opened and the master table is unused.
	 */
	if (pBt->resultsBuffer) {
		assert(!(flags & BTREE_INTKEY));
		*piTable = 2;
		return SQLITE_OK;
	}

	if (!p->connected) {
		rc = btreeOpenEnvironment(p, 1);
		if (rc != SQLITE_OK)
			return rc;
	}

	return btreeCreateTable(p, piTable, flags);
}
5190
5191 /*
5192 ** Delete all information from a single table in the database. iTable is the
5193 ** page number of the root of the table. After this routine returns, the root
5194 ** page is empty, but still exists.
5195 **
5196 ** This routine will fail with SQLITE_LOCKED if there are any open read
5197 ** cursors on the table. Open write cursors are moved to the root of the
5198 ** table.
5199 **
5200 ** If pnChange is not NULL, then table iTable must be an intkey table. The
5201 ** integer value pointed to by pnChange is incremented by the number of
5202 ** entries in the table.
5203 */
sqlite3BtreeClearTable(Btree * p,int iTable,int * pnChange)5204 int sqlite3BtreeClearTable(Btree *p, int iTable, int *pnChange)
5205 {
5206 BtShared *pBt;
5207 CACHED_DB *cached_db;
5208 DELETED_TABLE *dtable;
5209 char *tableName, tableNameBuf[DBNAME_SIZE];
5210 char *oldTableName, oldTableNameBuf[DBNAME_SIZE], *fileName;
5211 int need_truncate, rc, ret, tryfast;
5212 u_int32_t count;
5213
5214 log_msg(LOG_VERBOSE, "sqlite3BtreeClearTable(%p, %u, %p)",
5215 p, iTable, pnChange);
5216
5217 pBt = p->pBt;
5218 count = 0;
5219 ret = tryfast = 0;
5220 rc = SQLITE_OK;
5221 need_truncate = 1;
5222 if (IS_BTREE_READONLY(p))
5223 return SQLITE_READONLY;
5224
5225 /* Close any open cursors. */
5226 sqlite3_mutex_enter(pBt->mutex);
5227
5228 /*
5229 * SQLite expects all cursors apart from read-uncommitted cursors to be
5230 * closed. However, Berkeley DB cannot truncate unless *all* cursors
5231 * are closed. This call to btreeTripAll will fail if there are any
5232 * cursors open on other connections with * SQLITE_LOCKED_SHAREDCACHE,
5233 * which makes tests shared2-1.[23] fail with "table locked" errors.
5234 */
5235 if ((rc = btreeTripAll(p, iTable, 0)) != SQLITE_OK) {
5236 sqlite3_mutex_leave(pBt->mutex);
5237 return rc;
5238 }
5239 sqlite3_mutex_leave(pBt->mutex);
5240
5241 rc = btreeFindOrCreateDataTable(p, &iTable, &cached_db, 0);
5242
5243 if (rc != SQLITE_OK)
5244 return rc;
5245
5246 assert(cached_db != NULL && cached_db->dbp != NULL);
5247
5248 /*
5249 * The motivation here is that logging all of the contents of pages
5250 * we want to clear is slow. Instead, we can transactionally create
5251 * a new, empty table, and rename the old one. If this transaction
5252 * goes on to commit, we can non-transactionally free the old pages
5253 * at that point.
5254 *
5255 * Steps are:
5256 * 1. do a transactional rename of the old table
5257 * 2. do a transactional create of a new table with the same name
5258 * 3. if/when this transaction commits, do a non-transactional
5259 * remove of the old table.
5260 */
5261 if (pBt->dbStorage == DB_STORE_NAMED) {
5262 /* TODO: count the records */
5263 DB_BTREE_STAT *stat;
5264
5265 if ((ret = cached_db->dbp->stat(cached_db->dbp,
5266 pFamilyTxn, &stat, GET_BTREE_ISOLATION(p) &
5267 ~DB_TXN_SNAPSHOT)) != 0)
5268 goto err;
5269 count = stat->bt_ndata;
5270
5271 /*
5272 * Try the fast path (minimal logging) approach to truncating
5273 * for all but the smallest databases.
5274 */
5275 tryfast =
5276 (stat->bt_leaf_pg + stat->bt_dup_pg + stat->bt_over_pg) > 4;
5277 sqlite3_free(stat);
5278 }
5279
5280 if (tryfast) {
5281 #ifndef BDBSQL_SINGLE_THREAD
5282 if (cached_db->dbp->app_private != NULL)
5283 sqlite3_free(cached_db->dbp->app_private);
5284 #endif
5285 ret = cached_db->dbp->close(cached_db->dbp, DB_NOSYNC);
5286 cached_db->dbp = NULL;
5287 if (ret != 0)
5288 goto err;
5289
5290 tableName = tableNameBuf;
5291 GET_TABLENAME(tableName, sizeof(tableNameBuf), iTable, "");
5292 oldTableName = oldTableNameBuf;
5293 GET_TABLENAME(oldTableName, sizeof(oldTableNameBuf), iTable,
5294 "old-");
5295
5296 FIX_TABLENAME(pBt, fileName, tableName);
5297 if ((ret = pDbEnv->dbrename(pDbEnv, pSavepointTxn,
5298 fileName, tableName, oldTableName, DB_NOSYNC)) == 0) {
5299 need_truncate = 0;
5300 dtable = (DELETED_TABLE *)sqlite3_malloc(
5301 sizeof(DELETED_TABLE));
5302 if (dtable == NULL)
5303 return SQLITE_NOMEM;
5304 dtable->iTable = iTable;
5305 dtable->txn = pSavepointTxn;
5306 #ifdef BDBSQL_FILE_PER_TABLE
5307 dtable->flag = DTF_DELETE;
5308 #endif
5309 dtable->next = p->deleted_tables;
5310 p->deleted_tables = dtable;
5311 } else if (ret != EEXIST)
5312 goto err;
5313
5314 sqlite3_mutex_enter(pBt->mutex);
5315 rc = btreeCreateDataTable(p, iTable, &cached_db);
5316 sqlite3_mutex_leave(pBt->mutex);
5317 if (rc != SQLITE_OK)
5318 goto err;
5319 }
5320
5321 if (need_truncate) {
5322 assert(cached_db != NULL && cached_db->dbp != NULL);
5323 ret = cached_db->dbp->truncate(cached_db->dbp,
5324 pSavepointTxn, &count, 0);
5325 }
5326
5327 if (ret == 0 && pnChange != NULL)
5328 *pnChange += count;
5329
5330 err: return MAP_ERR(rc, ret, p);
5331 }
5332
5333 /*
5334 ** Erase all information in a table and add the root of the table to the
5335 ** freelist. Except, the root of the principle table (the one on page 1) is
5336 ** never added to the freelist.
5337 **
5338 ** This routine will fail with SQLITE_LOCKED if there are any open cursors on
5339 ** the table.
5340 */
sqlite3BtreeDropTable(Btree * p,int iTable,int * piMoved)5341 int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved)
5342 {
5343 char cached_db_key[CACHE_KEY_SIZE];
5344 BtShared *pBt;
5345 CACHED_DB *cached_db;
5346 DB *dbp;
5347 DELETED_TABLE *dtable;
5348 char *fileName, *tableName, tableNameBuf[DBNAME_SIZE];
5349 char *oldTableName, oldTableNameBuf[DBNAME_SIZE];
5350 int need_remove, ret;
5351 DBT key;
5352 int skip_rename;
5353
5354 log_msg(LOG_VERBOSE, "sqlite3BtreeDropTable(%p, %u, %p)",
5355 p, iTable, piMoved);
5356
5357 skip_rename = 0;
5358 pBt = p->pBt;
5359 *piMoved = 0;
5360 ret = 0;
5361 need_remove = 1;
5362
5363 /* Close any cached handle */
5364 sqlite3_snprintf(sizeof(cached_db_key), cached_db_key, "%x", iTable);
5365 sqlite3_mutex_enter(pBt->mutex);
5366 cached_db = sqlite3HashFind(&pBt->db_cache,
5367 cached_db_key, (int)strlen(cached_db_key));
5368 if (cached_db != NULL && (dbp = cached_db->dbp) != NULL) {
5369 #ifndef BDBSQL_SINGLE_THREAD
5370 if (dbp->app_private != NULL)
5371 sqlite3_free(dbp->app_private);
5372 #endif
5373 ret = dbp->close(dbp, DB_NOSYNC);
5374 cached_db->dbp = NULL;
5375 if (ret != 0)
5376 goto err;
5377 }
5378 sqlite3HashInsert(
5379 &pBt->db_cache, cached_db_key, (int)strlen(cached_db_key), NULL);
5380 sqlite3_mutex_leave(pBt->mutex);
5381 sqlite3_free(cached_db);
5382
5383 if (pBt->dbStorage == DB_STORE_NAMED) {
5384 tableName = tableNameBuf;
5385 GET_TABLENAME(tableName, sizeof(tableNameBuf), iTable, "");
5386 FIX_TABLENAME(pBt, fileName, tableName);
5387
5388 oldTableName = oldTableNameBuf;
5389 GET_TABLENAME(oldTableName, sizeof(oldTableNameBuf), iTable,
5390 "old-");
5391
5392 memset(&key, 0, sizeof(key));
5393 key.data = oldTableName;
5394 key.size = (u_int32_t)strlen(oldTableName);
5395 key.flags = DB_DBT_USERMEM;
5396 /* If the renamed table already exists, we could be in one of
5397 * two possible situations:
5398 * 1) This is the second table within the same transaction
5399 * that has the same table ID that has been dropped.
5400 * 2) There was a crash in the middle of
5401 * sqlite3BtreeCommitPhaseTwo, meaning the dbrename was
5402 * committed, but the dbremove was not completed.
5403 * In the first situation, we want the first table to be the
5404 * one that is in the deleted_tables list. In the second case,
5405 * it's safe to remove the old-* table before proceeding.
5406 *
5407 * TODO: If the error message Berkeley DB generates when
5408 * renaming to a table that already exists is removed,
5409 * We could remove this exists check, and move the logic
5410 * below into an if (ret == EEXIST) clause.
5411 */
5412 if (pTablesDb->exists(pTablesDb, pSavepointTxn, &key, 0) == 0) {
5413 for (dtable = p->deleted_tables;
5414 dtable != NULL && iTable != dtable->iTable;
5415 dtable = dtable->next) {}
5416 /* Case 2, remove the table. */
5417 if (dtable == NULL) {
5418 if ((ret = pDbEnv->dbremove(pDbEnv,
5419 pSavepointTxn, pBt->short_name,
5420 oldTableName, DB_NOSYNC)) != 0)
5421 goto err;
5422 } else
5423 skip_rename = 1;
5424 }
5425
5426 if (!skip_rename) {
5427 ret = pDbEnv->dbrename(pDbEnv, pSavepointTxn, fileName,
5428 tableName, oldTableName, DB_NOSYNC);
5429 if (ret != 0)
5430 goto err;
5431 need_remove = 0;
5432 dtable = (DELETED_TABLE *)sqlite3_malloc(
5433 sizeof(DELETED_TABLE));
5434 if (dtable == NULL)
5435 return SQLITE_NOMEM;
5436 dtable->iTable = iTable;
5437 dtable->txn = pSavepointTxn;
5438 #ifdef BDBSQL_FILE_PER_TABLE
5439 dtable->flag = DTF_DROP;
5440 #endif
5441 dtable->next = p->deleted_tables;
5442 p->deleted_tables = dtable;
5443 }
5444
5445 if (need_remove) {
5446 ret = pDbEnv->dbremove(pDbEnv, pSavepointTxn,
5447 fileName, tableName, DB_NOSYNC);
5448 if (ret != 0)
5449 goto err;
5450 #ifdef BDBSQL_FILE_PER_TABLE
5451 memset(&key, 0, sizeof(key));
5452 key.flags = DB_DBT_USERMEM;
5453 key.data = tableName;
5454 key.size = strlen(tableName);
5455 ret = pTablesDb->del(pTablesDb, pSavepointTxn, &key, 0);
5456 #endif
5457 }
5458
5459 } else if (pBt->dbStorage == DB_STORE_INMEM) {
5460 /*
5461 * Add the in-memory tables into deleted_tables. Don't do the
5462 * remove now since the operation might be rollbacked. The
5463 * deleted_tables will be removed when commit.
5464 *
5465 * We don't rename the in-memory db as above DB_STORE_NAMED
5466 * case because:
5467 * 1) In memory table names are always unique.
5468 * 2) Can not rename a in-memory db since dbrename can not
5469 * accept DB_TXN_NOT_DURABLE.
5470 */
5471 dtable = (DELETED_TABLE *)sqlite3_malloc(sizeof(DELETED_TABLE));
5472 if (dtable == NULL)
5473 return SQLITE_NOMEM;
5474 dtable->iTable = iTable;
5475 dtable->txn = pSavepointTxn;
5476 dtable->next = p->deleted_tables;
5477 p->deleted_tables = dtable;
5478 }
5479
5480 err: return (ret == 0) ? SQLITE_OK : dberr2sqlitelocked(ret, p);
5481 }
5482
5483 /*
5484 ** Read the meta-information out of a database file. Meta[0] is the number
5485 ** of free pages currently in the database. Meta[1] through meta[15] are
5486 ** available for use by higher layers. Meta[0] is read-only, the others are
5487 ** read/write.
5488 **
5489 ** The schema layer numbers meta values differently. At the schema layer (and
5490 ** the SetCookie and ReadCookie opcodes) the number of free pages is not
5491 ** visible. So Cookie[0] is the same as Meta[1].
5492 */
sqlite3BtreeGetMeta(Btree * p,int idx,u32 * pMeta)5493 void sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta)
5494 {
5495 BtShared *pBt;
5496 int ret;
5497 DBT key, data;
5498 i64 metaKey, metaData;
5499
5500 log_msg(LOG_VERBOSE, "sqlite3BtreeGetMeta(%p, %u, %p)",
5501 p, idx, pMeta);
5502
5503 pBt = p->pBt;
5504 assert(idx >= 0 && idx < NUMMETA);
5505
5506 /*
5507 * Under some (odd) circumstances SQLite expects a database to be
5508 * opened here: If it didn't exist when the connection was opened, but
5509 * was created by another connection since then. If we don't open the
5510 * table now, some virtual table operations fail - altermalloc.test
5511 * has such a scenario.
5512 */
5513 if (!p->connected && pBt->dbStorage == DB_STORE_NAMED &&
5514 !pBt->database_existed && !__os_exists(NULL, pBt->full_name, 0)) {
5515 btreeUpdateBtShared(p, 1);
5516 pBt = p->pBt;
5517 ret = btreeOpenEnvironment(p, 1);
5518 /*
5519 * Ignore failures. There's not much else we can do. A failure
5520 * here will likely leave the connection in a bad state.
5521 * This path is tested by altermalloc.
5522 */
5523 }
5524 /* Once connected to a shared environment, don't trust the cache. */
5525 if (idx > 0 && idx < NUMMETA && pBt->meta[idx].cached &&
5526 (!p->connected || pBt->dbStorage != DB_STORE_NAMED)) {
5527 *pMeta = pBt->meta[idx].value;
5528 return;
5529 } else if (idx == 0 || !p->connected ||
5530 pBt->dbStorage != DB_STORE_NAMED) {
5531 *pMeta = 0;
5532 return;
5533 }
5534
5535 assert(p->pBt->dbStorage == DB_STORE_NAMED);
5536
5537 memset(&key, 0, sizeof(key));
5538 metaKey = idx;
5539 key.data = &metaKey;
5540 key.size = key.ulen = sizeof(metaKey);
5541 key.flags = DB_DBT_USERMEM;
5542 memset(&data, 0, sizeof(data));
5543 data.data = &metaData;
5544 data.size = data.ulen = sizeof(metaData);
5545 data.flags = DB_DBT_USERMEM;
5546
5547 /*
5548 * Trigger a read-modify-write get from the metadata table to stop
5549 * other connections from being able to proceed while an exclusive
5550 * transaction is active.
5551 */
5552 if ((ret = pMetaDb->get(pMetaDb, GET_META_TXN(p), &key, &data,
5553 GET_META_FLAGS(p))) == 0) {
5554 assert(data.size == sizeof(i64));
5555 *pMeta = (u32)(metaData);
5556 if (idx < NUMMETA) {
5557 pBt->meta[idx].value = *pMeta;
5558 pBt->meta[idx].cached = 1;
5559 }
5560 } else if (ret == DB_NOTFOUND || ret == DB_KEYEMPTY) {
5561 *pMeta = 0;
5562 ret = 0;
5563 } else if (ret == DB_LOCK_DEADLOCK || ret == DB_LOCK_NOTGRANTED) {
5564 p->db->errCode = SQLITE_BUSY;
5565 ret = 0;
5566 *pMeta = 0;
5567 sqlite3BtreeRollback(p);
5568 }
5569
5570 assert(ret == 0);
5571 }
5572
5573 /*
5574 ** Write meta-information back into the database. Meta[0] is read-only and
5575 ** may not be written.
5576 */
sqlite3BtreeUpdateMeta(Btree * p,int idx,u32 iMeta)5577 int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta)
5578 {
5579 BtShared *pBt;
5580 int rc, ret;
5581 DBT key, data;
5582 i64 metaKey, metaData;
5583
5584 log_msg(LOG_VERBOSE, "sqlite3BtreeUpdateMeta(%p, %u, %u)",
5585 p, idx, iMeta);
5586
5587 pBt = p->pBt;
5588 if (IS_BTREE_READONLY(p))
5589 return SQLITE_READONLY;
5590
5591 assert(idx > 0 && idx < NUMMETA);
5592
5593 sqlite3_mutex_enter(pBt->mutex);
5594 pBt->meta[idx].value = iMeta;
5595 pBt->meta[idx].cached = 1;
5596
5597 #ifndef SQLITE_OMIT_AUTOVACUUM
5598 if (idx == BTREE_INCR_VACUUM) {
5599 assert(iMeta == 0 || iMeta == 1);
5600 pBt->incrVacuum = (u8)iMeta;
5601 }
5602 #endif
5603 sqlite3_mutex_leave(pBt->mutex);
5604
5605 /* Skip the database update for private environments. */
5606 if (pBt->dbStorage != DB_STORE_NAMED)
5607 return SQLITE_OK;
5608
5609 if (!p->connected && (rc = btreeOpenEnvironment(p, 1)) != SQLITE_OK)
5610 return rc;
5611 /* OpenEnvironment might have changed the pBt, update it. */
5612 pBt = p->pBt;
5613
5614 memset(&key, 0, sizeof(key));
5615 metaKey = idx;
5616 key.data = &metaKey;
5617 key.size = key.ulen = sizeof(metaKey);
5618 key.flags = DB_DBT_USERMEM;
5619 memset(&data, 0, sizeof(data));
5620 metaData = iMeta;
5621 data.data = &metaData;
5622 data.size = data.ulen = sizeof(metaData);
5623 data.flags = DB_DBT_USERMEM;
5624
5625 ret = pMetaDb->put(pMetaDb, pSavepointTxn, &key, &data, 0);
5626
5627 return (ret == 0) ? SQLITE_OK : dberr2sqlite(ret, p);
5628 }
5629
5630 #ifndef SQLITE_OMIT_BTREECOUNT
5631 /*
5632 ** The first argument, pCur, is a cursor opened on some b-tree. Count the
5633 ** number of entries in the b-tree and write the result to *pnEntry.
5634 **
5635 ** SQLITE_OK is returned if the operation is successfully executed.
5636 ** Otherwise, if an error is encountered (i.e. an IO error or database
5637 ** corruption) an SQLite error code is returned.
5638 */
sqlite3BtreeCount(BtCursor * pCur,i64 * pnEntry)5639 int sqlite3BtreeCount(BtCursor *pCur, i64 *pnEntry)
5640 {
5641 Btree *p;
5642 DB_BTREE_STAT *stat;
5643 int ret;
5644
5645 if (pCur->eState == CURSOR_FAULT || pCur->cached_db->dbp == NULL)
5646 return (pCur->error == 0 ? SQLITE_ERROR : pCur->error);
5647
5648 p = pCur->pBtree;
5649
5650 if ((ret = pBDb->stat(pBDb, pReadTxn ? pReadTxn : pFamilyTxn, &stat,
5651 GET_BTREE_ISOLATION(p) & ~DB_TXN_SNAPSHOT)) == 0) {
5652 *pnEntry = stat->bt_ndata;
5653 sqlite3_free(stat);
5654 }
5655
5656 return (ret == 0) ? SQLITE_OK : dberr2sqlite(ret, p);
5657 }
5658 #endif
5659
5660 /*
5661 ** This routine does a complete check of the given BTree file. aRoot[] is
5662 ** an array of pages numbers were each page number is the root page of a table.
5663 ** nRoot is the number of entries in aRoot.
5664 **
5665 ** If everything checks out, this routine returns NULL. If something is amiss,
5666 ** an error message is written into memory obtained from malloc() and a
5667 ** pointer to that error message is returned. The calling function is
5668 ** responsible for freeing the error message when it is done.
5669 */
sqlite3BtreeIntegrityCheck(Btree * pBt,int * aRoot,int nRoot,int mxErr,int * pnErr)5670 char *sqlite3BtreeIntegrityCheck(
5671 Btree *pBt, /* The btree to be checked */
5672 int *aRoot, /* An array of root page numbers for individual trees */
5673 int nRoot, /* Number of entries in aRoot[] */
5674 int mxErr, /* Stop reporting errors after this many */
5675 int *pnErr) /* Write number of errors seen to this variable */
5676 {
5677 int ret;
5678
5679 log_msg(LOG_VERBOSE, "sqlite3BtreeIntegrityCheck(%p, %p, %u, %u, %p)",
5680 pBt, aRoot, nRoot, mxErr, pnErr);
5681
5682 ret = 0;
5683 *pnErr = 0;
5684 #if 0
5685 DB *db;
5686 int i;
5687 char *tableName, tableNameBuf[DBNAME_SIZE];
5688 /*
5689 * XXX: Have to do this outside the environment, verify doesn't play
5690 * nice with locking.
5691 */
5692 for (i = 0; i < nRoot && ret == 0; i++) {
5693 tableName = tableNameBuf;
5694 GET_TABLENAME(tableName, sizeof(tableNameBuf), aRoot[i], "");
5695 if ((ret = db_create(&db, pDbEnv, 0)) == 0)
5696 ret = db->verify(db, tableName,
5697 NULL, NULL, DB_NOORDERCHK);
5698 }
5699
5700 #endif
5701 return (ret == 0) ? NULL : sqlite3_strdup(db_strerror(ret));
5702 }
5703
5704 /*
5705 ** Return the full pathname of the underlying database file.
5706 */
sqlite3BtreeGetFilename(Btree * p)5707 const char *sqlite3BtreeGetFilename(Btree *p)
5708 {
5709 log_msg(LOG_VERBOSE, "sqlite3BtreeGetFilename(%p) (%s)",
5710 p, p->pBt->full_name);
5711
5712 return (p->pBt->full_name != NULL) ? p->pBt->full_name : "";
5713 }
5714
5715 /*
5716 ** Return non-zero if a transaction is active.
5717 */
sqlite3BtreeIsInTrans(Btree * p)5718 int sqlite3BtreeIsInTrans(Btree *p)
5719 {
5720 return (p && p->inTrans == TRANS_WRITE);
5721 }
5722
5723 /*
5724 * Berkeley DB always uses WAL, but the SQLite flag is disabled on Windows
5725 * Mobile (CE) because some of the SQLite WAL code doesn't build with the flag
5726 * enabled.
5727 */
5728 #ifndef SQLITE_OMIT_WAL
5729 /*
5730 ** Run a checkpoint on the Btree passed as the first argument.
5731 **
5732 ** Return SQLITE_LOCKED if this or any other connection has an open
5733 ** transaction on the shared-cache the argument Btree is connected to.
5734 **
5735 ** Parameter eMode is one of SQLITE_CHECKPOINT_PASSIVE, FULL or RESTART.
5736 */
sqlite3BtreeCheckpoint(Btree * p,int eMode,int * pnLog,int * pnCkpt)5737 int sqlite3BtreeCheckpoint(Btree *p, int eMode, int *pnLog, int *pnCkpt)
5738 {
5739 BtShared *pBt;
5740 int rc;
5741
5742 /*
5743 * TODO: Investigate eMode. In SQLite there are three possible modes
5744 * SQLITE_CHECKPOINT_PASSIVE - return instead of blocking on locks
5745 * SQLITE_CHECKPOINT_FULL - Wait to get an exclusive lock.
5746 * SQLITE_CHECKPOINT_RESTART - as for full, except force a new log file
5747 *
5748 * Berkeley DB checkpoints really work like FULL. It might be possible
5749 * to mimic PASSIVE (default in SQLite) with lock no-wait, but do we
5750 * care?
5751 */
5752 rc = SQLITE_OK;
5753 if (p != NULL) {
5754 pBt = p->pBt;
5755 if (p->inTrans != TRANS_NONE)
5756 rc = SQLITE_LOCKED;
5757 else
5758 rc = sqlite3PagerCheckpoint((Pager *)p);
5759 }
5760 /*
5761 * The following two variables are used to return information via
5762 * the sqlite_wal_checkoint_v2 database. They don't map well onto
5763 * Berkeley DB, so return 0 for now.
5764 * pnLog: Size of WAL log in frames.
5765 * pnCkpt: Total number of frames checkpointed.
5766 */
5767 if (pnLog != 0)
5768 *pnLog = 0;
5769 if (pnCkpt != 0)
5770 *pnCkpt = 0;
5771 return rc;
5772 }
5773 #endif
5774
5775 /*
5776 * Determine whether or not a cursor has moved from the position it was last
5777 * placed at.
5778 */
sqlite3BtreeCursorHasMoved(BtCursor * pCur,int * pHasMoved)5779 int sqlite3BtreeCursorHasMoved(BtCursor *pCur, int *pHasMoved)
5780 {
5781 int rc;
5782
5783 /* Set this here in case of error. */
5784 *pHasMoved = 1;
5785
5786 /*
5787 * We only want to return an error if the cursor is faulted, not just
5788 * if it is not pointing at anything.
5789 */
5790 if (pCur->eState != CURSOR_VALID && pCur->eState != CURSOR_INVALID &&
5791 (rc = btreeRestoreCursorPosition(pCur, 0)) != SQLITE_OK)
5792 return rc;
5793
5794 if (pCur->eState == CURSOR_VALID && pCur->lastRes == 0)
5795 *pHasMoved = 0;
5796 return SQLITE_OK;
5797 }
5798
5799 #ifndef NDEBUG
5800 /*
5801 ** Return true if the given BtCursor is valid. A valid cursor is one that is
5802 ** currently pointing to a row in a (non-empty) table.
5803 **
5804 ** This is a verification routine, it is used only within assert() statements.
5805 */
sqlite3BtreeCursorIsValid(BtCursor * pCur)5806 int sqlite3BtreeCursorIsValid(BtCursor *pCur)
5807 {
5808 return (pCur != NULL && pCur->eState == CURSOR_VALID);
5809 }
5810 #endif /* NDEBUG */
5811
5812 /*****************************************************************
5813 ** Argument pCsr must be a cursor opened for writing on an INTKEY table
5814 ** currently pointing at a valid table entry. This function modifies the
5815 ** data stored as part of that entry. Only the data content may be modified,
5816 ** it is not possible to change the length of the data stored.
5817 */
sqlite3BtreePutData(BtCursor * pCur,u32 offset,u32 amt,void * z)5818 int sqlite3BtreePutData(BtCursor *pCur, u32 offset, u32 amt, void *z)
5819 {
5820 DBT pdata;
5821 int rc, ret;
5822 log_msg(LOG_VERBOSE, "sqlite3BtreePutData(%p, %u, %u, %p)",
5823 pCur, offset, amt, z);
5824
5825 /*
5826 * Check that the cursor is open for writing and the cursor points at a
5827 * valid row of an intKey table.
5828 */
5829 if (!pCur->wrFlag)
5830 return SQLITE_READONLY;
5831
5832 UPDATE_DURING_BACKUP(pCur->pBtree)
5833
5834 if (pDbc == NULL &&
5835 (rc = btreeRestoreCursorPosition(pCur, 0)) != SQLITE_OK)
5836 return rc;
5837
5838 if (pCur->eState != CURSOR_VALID)
5839 return SQLITE_ABORT;
5840
5841 assert(!pCur->multiGetPtr);
5842
5843 #ifndef SQLITE_OMIT_INCRBLOB
5844 assert(pCur);
5845 assert(pDbc);
5846
5847 rc = SQLITE_OK;
5848 memcpy((u_int8_t *)pCur->data.data + offset, z, amt);
5849
5850 memset(&pdata, 0, sizeof(DBT));
5851 pdata.data = (void *)z;
5852 pdata.size = pdata.dlen = amt;
5853 pdata.doff = offset;
5854 pdata.flags |= DB_DBT_PARTIAL;
5855
5856 if ((rc = btreeTripWatchers(pCur, 1)) != SQLITE_OK)
5857 return rc;
5858
5859 ret = pDbc->put(pDbc, &pCur->key, &pdata, DB_CURRENT);
5860 if (ret != 0) {
5861 HANDLE_INCRBLOB_DEADLOCK(ret, pCur)
5862 rc = dberr2sqlitelocked(ret, pCur->pBtree);
5863 }
5864 #endif
5865 return rc;
5866 }
5867
5868 /*****************************************************************
5869 ** Set a flag on this cursor to indicate that it is an incremental blob
5870 ** cursor. Incrblob cursors are invalidated differently to ordinary cursors:
5871 ** if the value under an incrblob cursor is modified, attempts to access
5872 ** the cursor again will result in an error.
5873 */
sqlite3BtreeCacheOverflow(BtCursor * pCur)5874 void sqlite3BtreeCacheOverflow(BtCursor *pCur)
5875 {
5876 Btree *p;
5877
5878 log_msg(LOG_VERBOSE, "sqlite3BtreeCacheOverflow(%p)", pCur);
5879
5880 pCur->isIncrblobHandle = 1;
5881 p = pCur->pBtree;
5882
5883 /*
5884 * Give the transaction to the incrblob cursor, since it has to live
5885 * the lifetime of the cursor.
5886 */
5887 if (p && p->connected && p->pBt->transactional && pCur->wrFlag) {
5888 /* XXX error handling */
5889 p->pBt->dbenv->txn_begin(p->pBt->dbenv, pSavepointTxn->parent,
5890 &pSavepointTxn, 0);
5891 }
5892 }
5893
5894 /*****************************************************************
5895 ** Return non-zero if a read (or write) transaction is active.
5896 */
sqlite3BtreeIsInReadTrans(Btree * p)5897 int sqlite3BtreeIsInReadTrans(Btree *p)
5898 {
5899 log_msg(LOG_VERBOSE, "sqlite3BtreeIsInReadTrans(%p)", p);
5900 return (p && p->inTrans != TRANS_NONE);
5901 }
5902
5903 /***************************************************************************
5904 ** This routine sets the state to CURSOR_FAULT and the error code to errCode
5905 ** for every cursor on BtShared that pengine references.
5906 **
5907 ** Every cursor is tripped, including cursors that belong to other databases
5908 ** connections that happen to be sharing the cache with pengine.
5909 **
5910 ** This routine gets called when a rollback occurs. All cursors using the same
5911 ** cache must be tripped to prevent them from trying to use the engine after
5912 ** the rollback. The rollback may have deleted tables or moved root pages, so
5913 ** it is not sufficient to save the state of the cursor. The cursor must be
5914 ** invalidated.
5915 */
sqlite3BtreeTripAllCursors(Btree * p,int errCode)5916 void sqlite3BtreeTripAllCursors(Btree* p, int errCode)
5917 {
5918 BtShared *pBt;
5919 BtCursor *pCur;
5920
5921 log_msg(LOG_VERBOSE, "sqlite3BtreeTripAllCursors(%p, %u)", p, errCode);
5922
5923 pBt = p->pBt;
5924
5925 sqlite3_mutex_enter(pBt->mutex);
5926 for (pCur = pBt->first_cursor; pCur != NULL; pCur = pCur->next) {
5927 pCur->eState = CURSOR_FAULT;
5928 pCur->error = errCode;
5929 }
5930 sqlite3_mutex_leave(pBt->mutex);
5931 }
5932
/*
 * Acquire, change or release this handle's schema lock.
 *
 * While the handle is not connected only the requested mode is recorded:
 * LOCKMODE_NONE always replaces the stored mode, any other mode replaces it
 * only when it is higher.
 *
 * Once connected, a mode other than LOCKMODE_NONE opens a temporary cursor
 * on MASTER_ROOT (for writing when lockMode is LOCKMODE_WRITE) and moves it
 * to the last entry.  Any previously held schemaLock cursor is then closed.
 * On success the temporary cursor's DBC is kept in p->schemaLock and the
 * mode is recorded; on failure the recorded mode is reset to LOCKMODE_NONE.
 *
 * Returns SQLITE_OK or the first error encountered.
 */
int btreeLockSchema(Btree *p, lock_mode_t lockMode)
{
	BtCursor *pCur, tmpCursor;
	BtShared *pBt;
	DBC *oldCur;
	int opened, rc, res, ret;

	pBt = p->pBt;
	pCur = &tmpCursor;
	oldCur = NULL;
	opened = 0;
	rc = SQLITE_OK;

	if (!p->connected) {
		if (lockMode == LOCKMODE_NONE || lockMode > p->schemaLockMode)
			p->schemaLockMode = lockMode;
		return SQLITE_OK;
	}

	if (lockMode != LOCKMODE_NONE) {
		sqlite3BtreeCursorZero(pCur);
		rc = sqlite3BtreeCursor(p, MASTER_ROOT,
		    lockMode == LOCKMODE_WRITE, NULL, pCur);
		opened = (rc == SQLITE_OK);
		if (pCur->eState == CURSOR_FAULT)
			rc = pCur->error;

		/*
		 * Any repeatable operation would do: we get the last item
		 * just because it doesn't try to do a bulk get.
		 */
		if (rc == SQLITE_OK)
			rc = sqlite3BtreeLast(pCur, &res);
	}

	/* Release the previous lock cursor, keeping the first error seen. */
	if (p->schemaLock != NULL) {
		ret = p->schemaLock->close(p->schemaLock);
		if (ret != 0 && rc == SQLITE_OK)
			rc = dberr2sqlite(ret, p);
		p->schemaLock = NULL;
	}

	if (opened && rc == SQLITE_OK) {
		/*
		 * Hand the DBC over to the Btree; clearing it on the
		 * temporary cursor leaves the close below with no DBC.
		 */
		p->schemaLockMode = lockMode;
		p->schemaLock = pDbc;
		pDbc = NULL;
	} else
		p->schemaLockMode = LOCKMODE_NONE;

	if (opened)
		(void)sqlite3BtreeCloseCursor(pCur);

	return rc;
}
5987
5988 /*****************************************************************
5989 ** Obtain a lock on the table whose root page is iTab. The lock is a write
5990 ** lock if isWritelock is true or a read lock if it is false.
5991 */
sqlite3BtreeLockTable(Btree * p,int iTable,u8 isWriteLock)5992 int sqlite3BtreeLockTable(Btree *p, int iTable, u8 isWriteLock)
5993 {
5994 lock_mode_t lockMode;
5995 int rc;
5996
5997 log_msg(LOG_VERBOSE, "sqlite3BtreeLockTable(%p, %u, %u)",
5998 p, iTable, isWriteLock);
5999
6000 lockMode = isWriteLock ? LOCKMODE_WRITE : LOCKMODE_READ;
6001
6002 if (iTable != MASTER_ROOT || !p->pBt->transactional ||
6003 p->schemaLockMode >= lockMode)
6004 return SQLITE_OK;
6005
6006 rc = btreeLockSchema(p, lockMode);
6007
6008 if (!p->connected && rc != SQLITE_NOMEM) {
6009 p->schemaLockMode = lockMode;
6010 return SQLITE_OK;
6011 }
6012
6013 if (rc == SQLITE_BUSY)
6014 rc = SQLITE_LOCKED;
6015
6016 return rc;
6017 }
6018
6019 /*****************************************************************
6020 ** Return true if another user of the same shared engine as the argument
6021 ** handle holds an exclusive lock on the sqlite_master table.
6022 */
sqlite3BtreeSchemaLocked(Btree * p)6023 int sqlite3BtreeSchemaLocked(Btree *p)
6024 {
6025 BtCursor *pCur;
6026 BtShared *pBt;
6027
6028 log_msg(LOG_VERBOSE, "sqlite3BtreeSchemaLocked(%p)", p);
6029
6030 pBt = p->pBt;
6031
6032 if (p->sharable) {
6033 sqlite3_mutex_enter(pBt->mutex);
6034 for (pCur = pBt->first_cursor;
6035 pCur != NULL;
6036 pCur = pCur->next) {
6037 if (pCur->pBtree != p && pCur->pBtree->connected &&
6038 pCur->pBtree->schemaLockMode == LOCKMODE_WRITE) {
6039 sqlite3_mutex_leave(pBt->mutex);
6040 return SQLITE_LOCKED_SHAREDCACHE;
6041 }
6042 }
6043 sqlite3_mutex_leave(pBt->mutex);
6044 }
6045
6046 return SQLITE_OK;
6047 }
6048
6049 /*****************************************************************
6050 ** No op.
6051 */
sqlite3BtreeSyncDisabled(Btree * p)6052 int sqlite3BtreeSyncDisabled(Btree *p)
6053 {
6054 log_msg(LOG_VERBOSE, "sqlite3BtreeSyncDisabled(%p)", p);
6055 return (0);
6056 }
6057
6058 #if !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM)
6059 /*
6060 ** Change the default pages size and the number of reserved bytes per page.
6061 ** Or, if the page size has already been fixed, return SQLITE_READONLY
6062 ** without changing anything.
6063 **
6064 ** The page size must be a power of 2 between 512 and 65536. If the page
6065 ** size supplied does not meet this constraint then the page size is not
6066 ** changed.
6067 **
6068 ** Page sizes are constrained to be a power of two so that the region of the
6069 ** database file used for locking (beginning at PENDING_BYTE, the first byte
6070 ** past the 1GB boundary, 0x40000000) needs to occur at the beginning of a page.
6071 **
6072 ** If parameter nReserve is less than zero, then the number of reserved bytes
6073 ** per page is left unchanged.
6074 **
6075 ** If the iFix!=0 then the pageSizeFixed flag is set so that the page size
6076 ** and autovacuum mode can no longer be changed.
6077 */
sqlite3BtreeSetPageSize(Btree * p,int pageSize,int nReserve,int iFix)6078 int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve, int iFix)
6079 {
6080 BtShared *pBt;
6081
6082 log_msg(LOG_VERBOSE, "sqlite3BtreeSetPageSize(%p, %u, %u)",
6083 p, pageSize, nReserve);
6084
6085 if (pageSize != 0 && (pageSize < 512 || pageSize > 65536 ||
6086 ((pageSize - 1) & pageSize) != 0))
6087 return SQLITE_OK;
6088
6089 pBt = p->pBt;
6090 if (pBt->pageSizeFixed)
6091 return SQLITE_READONLY;
6092
6093 /* Can't set the page size once a table has been created. */
6094 if (pMetaDb != NULL)
6095 return SQLITE_OK;
6096
6097 pBt->pageSize = pageSize;
6098 if (iFix)
6099 pBt->pageSizeFixed = 1;
6100
6101 return SQLITE_OK;
6102 }
6103
6104 /***************************************************************************
6105 ** Return the currently defined page size.
6106 */
sqlite3BtreeGetPageSize(Btree * p)6107 int sqlite3BtreeGetPageSize(Btree *p)
6108 {
6109 BtShared *pBt;
6110 u_int32_t pagesize;
6111
6112 log_msg(LOG_VERBOSE, "sqlite3BtreeGetPageSize(%p)", p);
6113
6114 pBt = p->pBt;
6115 if (!p->connected && pBt->need_open)
6116 btreeOpenEnvironment(p, 1);
6117
6118 if (pMetaDb != NULL &&
6119 pMetaDb->get_pagesize(pMetaDb, &pagesize) == 0)
6120 return (int)pagesize;
6121 if (pBt->pageSize == 0)
6122 return SQLITE_DEFAULT_PAGE_SIZE;
6123 return p->pBt->pageSize;
6124 }
6125
6126 /***************************************************************************
6127 ** No op.
6128 */
sqlite3BtreeGetReserve(Btree * p)6129 int sqlite3BtreeGetReserve(Btree *p)
6130 {
6131 log_msg(LOG_VERBOSE, "sqlite3BtreeGetReserve(%p)", p);
6132 /* FIXME: Need to check how this is used by SQLite. */
6133 return (0);
6134 }
6135
sqlite3BtreeLastPage(Btree * p)6136 u32 sqlite3BtreeLastPage(Btree *p)
6137 {
6138 log_msg(LOG_VERBOSE, "sqlite3BtreeLastPage(%p)", p);
6139 /* FIXME: Is there a cheap way to do this? */
6140 return (0);
6141 }
6142
6143 /*
6144 ** Set both the "read version" (single byte at byte offset 18) and
6145 ** "write version" (single byte at byte offset 19) fields in the database
6146 ** header to iVersion.
6147 ** This function is only called by OP_JournalMode, when changing to or from
6148 ** WAL journaling. We are always WAL, so it's safe to return OK.
6149 */
sqlite3BtreeSetVersion(Btree * pBtree,int iVersion)6150 int sqlite3BtreeSetVersion(Btree *pBtree, int iVersion)
6151 {
6152 pBtree = NULL;
6153 iVersion = 0;
6154 return (SQLITE_OK);
6155 }
6156
6157 /***************************************************************************
6158 **
6159 ** Set the maximum page count for a database if mxPage is positive.
6160 ** No changes are made if mxPage is 0 or negative.
6161 ** Regardless of the value of mxPage, return the current maximum page count.
6162 **
6163 ** If mxPage <= minimum page count, set it to the minimum possible value.
6164 */
sqlite3BtreeMaxPageCount(Btree * p,int mxPage)6165 int sqlite3BtreeMaxPageCount(Btree *p, int mxPage)
6166 {
6167 int defPgCnt, newPgCnt;
6168 BtShared *pBt;
6169 CACHED_DB *cached_db;
6170 DB_MPOOLFILE *pMpf;
6171 u_int32_t gBytes, bytes;
6172 u_int32_t pgSize;
6173 db_pgno_t minPgNo;
6174 HashElem *e;
6175
6176 log_msg(LOG_VERBOSE, "sqlite3BtreeMaxPageCount(%p, %u)", p, mxPage);
6177
6178 pBt = p->pBt;
6179 if (!pMetaDb) {
6180 if (mxPage > 0)
6181 pBt->pageCount = mxPage;
6182 return pBt->pageCount;
6183 }
6184
6185 pMpf = pMetaDb->get_mpf(pMetaDb);
6186 assert(pMpf);
6187 gBytes = bytes = pgSize = 0;
6188
6189 /* Get the current maximum page number. */
6190 pMetaDb->get_pagesize(pMetaDb, &pgSize);
6191 pMpf->get_maxsize(pMpf, &gBytes, &bytes);
6192 defPgCnt = (int)(gBytes * (GIGABYTE / pgSize) + bytes / pgSize);
6193
6194 if (mxPage <= 0 || IS_BTREE_READONLY(p))
6195 return defPgCnt;
6196
6197 /*
6198 * Retrieve the current last page number, so we can avoid setting a
6199 * value smaller than that.
6200 */
6201 minPgNo = 0;
6202 if (pMpf->get_last_pgno(pMpf, &minPgNo) != 0)
6203 return defPgCnt;
6204
6205 /*
6206 * If sqlite3BtreeCreateTable has been called, but the table has not
6207 * yet been created, reserve an additional two pages for the table.
6208 * This is a bit of a hack, otherwise sqlite3BtreeCursor can return
6209 * SQLITE_FULL, which the VDBE code does not expect.
6210 */
6211 for (e = sqliteHashFirst(&pBt->db_cache); e != NULL;
6212 e = sqliteHashNext(e)) {
6213 cached_db = sqliteHashData(e);
6214 if (cached_db == NULL)
6215 continue;
6216 if (cached_db->created == 0)
6217 minPgNo += 2;
6218 }
6219 /*
6220 * If mxPage is less than the current last page, set the maximum
6221 * page number to the current last page number.
6222 */
6223 newPgCnt = (mxPage < (int)minPgNo) ? (int)minPgNo : mxPage;
6224
6225 gBytes = (u_int32_t) (newPgCnt / (GIGABYTE / pgSize));
6226 bytes = (u_int32_t) ((newPgCnt % (GIGABYTE / pgSize)) * pgSize);
6227 if (pMpf->set_maxsize(pMpf, gBytes, bytes) != 0)
6228 return defPgCnt;
6229
6230 return newPgCnt;
6231 }
6232
6233 /*
6234 ** Set the secureDelete flag if newFlag is 0 or 1. If newFlag is -1,
6235 ** then make no changes. Always return the value of the secureDelete
6236 ** setting after the change.
6237 */
sqlite3BtreeSecureDelete(Btree * p,int newFlag)6238 int sqlite3BtreeSecureDelete(Btree *p, int newFlag)
6239 {
6240 int oldFlag;
6241
6242 oldFlag = 0;
6243 if (p != NULL) {
6244 sqlite3_mutex_enter(p->pBt->mutex);
6245 if (newFlag >= 0)
6246 p->pBt->secureDelete = (newFlag != 0);
6247 oldFlag = p->pBt->secureDelete;
6248 sqlite3_mutex_leave(p->pBt->mutex);
6249 }
6250
6251 return oldFlag;
6252 }
#endif /* !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM) */
6254
6255 /*****************************************************************
6256 ** Return the pathname of the journal file for this database. The return
6257 ** value of this routine is the same regardless of whether the journal file
6258 ** has been created or not.
6259 **
6260 ** The pager journal filename is invariant as long as the pager is open so
6261 ** it is safe to access without the BtShared mutex.
6262 */
sqlite3BtreeGetJournalname(Btree * p)6263 const char *sqlite3BtreeGetJournalname(Btree *p)
6264 {
6265 BtShared *pBt;
6266
6267 log_msg(LOG_VERBOSE, "sqlite3BtreeGetJournalname(%p)", p);
6268 pBt = p->pBt;
6269 return (pBt->dir_name != 0 ? pBt->dir_name : "");
6270 }
6271
6272 /*****************************************************************
6273 ** This function returns a pointer to a blob of memory associated with a
6274 ** single shared-engine. The memory is used by client code for its own
6275 ** purposes (for example, to store a high-level schema associated with the
6276 ** shared-engine). The engine layer manages reference counting issues.
6277 **
6278 ** The first time this is called on a shared-engine, nBytes bytes of memory
6279 ** are allocated, zeroed, and returned to the caller. For each subsequent call
6280 ** the nBytes parameter is ignored and a pointer to the same blob of memory
6281 ** returned.
6282 **
6283 ** Just before the shared-engine is closed, the function passed as the xFree
6284 ** argument when the memory allocation was made is invoked on the blob of
6285 ** allocated memory. This function should not call sqlite3_free() on the
6286 ** memory, the engine layer does that.
6287 */
sqlite3BtreeSchema(Btree * p,int nBytes,void (* xFree)(void *))6288 void *sqlite3BtreeSchema(Btree *p, int nBytes, void (*xFree)(void *))
6289 {
6290 log_msg(LOG_VERBOSE, "sqlite3BtreeSchema(%p, %u, fn_ptr)", p, nBytes);
6291 /* This was happening when an environment open failed in bigfile.
6292 if (p == NULL || p->pBt == NULL)
6293 return NULL;*/
6294
6295 if (p->schema == NULL && nBytes > 0) {
6296 p->schema = sqlite3MallocZero(nBytes);
6297 p->free_schema = xFree;
6298 }
6299 return (p->schema);
6300 }
6301
btreeGetIndex(Btree * p,int iTable)6302 Index *btreeGetIndex(Btree *p, int iTable)
6303 {
6304 sqlite3 *db = p->db;
6305 HashElem *e;
6306 Index *index;
6307 Schema *pSchema;
6308 int i;
6309
6310 index = NULL;
6311
6312 assert(sqlite3_mutex_held(db->mutex));
6313 for (i = 0; i < db->nDb; i++) {
6314 if (db->aDb[i].pBt != p)
6315 continue;
6316 pSchema = db->aDb[i].pSchema;
6317 assert(pSchema);
6318 for (e = sqliteHashFirst(&pSchema->idxHash); e != NULL;
6319 e = sqliteHashNext(e)) {
6320 index = sqliteHashData(e);
6321 if (index->tnum == iTable)
6322 goto done;
6323 index = NULL;
6324 }
6325 }
6326 done: return index;
6327 }
6328
/*
** Build the KeyInfo for the index rooted at iTable and store it in
** *pKeyInfo.
**
** Only positive, even iTable values are treated as indexes; for anything
** else *pKeyInfo is set to 0 and SQLITE_OK is returned. Returns
** SQLITE_ERROR if no matching Index is known and SQLITE_NOMEM if the
** KeyInfo could not be created.
*/
int btreeGetKeyInfo(Btree *p, int iTable, KeyInfo **pKeyInfo)
{
	Index *pIdx;
	KeyInfo *info;
	Parse parse;

	*pKeyInfo = 0;

	/* Tables have no KeyInfo; only indexes do. */
	if (iTable <= 0 || (iTable & 1) != 0)
		return SQLITE_OK;

	pIdx = btreeGetIndex(p, iTable);
	if (pIdx == NULL)
		return SQLITE_ERROR;

	/*
	 * sqlite3IndexKeyinfo wants a Parse, but only these two fields are
	 * accessed inside it, so a partially filled dummy is enough.
	 */
	parse.db = p->db;
	parse.nErr = 0;

	info = sqlite3IndexKeyinfo(&parse, pIdx);
	*pKeyInfo = info;
	if (!info)
		return SQLITE_NOMEM;

	info->enc = ENC(p->db);
	return SQLITE_OK;
}
6358
6359 #ifndef SQLITE_OMIT_AUTOVACUUM
/*
** Request an incremental vacuum.
**
** No pages are moved here. When auto-vacuum is enabled and the database
** is stored as DB_STORE_NAMED, the handle is flagged with needVacuum; per
** the original note, sqlite3BtreeCommitPhaseTwo then performs the vacuum.
**
** SQLITE_DONE is always returned so that OP_IncrVacuum ends immediately;
** the "N" of PRAGMA incremental_vacuum(N) is ignored.
*/
int sqlite3BtreeIncrVacuum(Btree *p)
{
	BtShared *pBt;

	assert(p && p->inTrans >= TRANS_READ);

	pBt = p->pBt;

	if (pBt->autoVacuum && pBt->dbStorage == DB_STORE_NAMED)
		p->needVacuum = 1;

	return SQLITE_DONE;
}
6379 #endif
6380
/*
** Return the nBackup counter of the handle.
*/
int sqlite3BtreeIsInBackup(Btree *p)
{
	int nActive = p->nBackup;

	return nActive;
}
6385
/*
** Return the auto-vacuum mode of the shared btree state as one of
** BTREE_AUTOVACUUM_NONE, BTREE_AUTOVACUUM_FULL or BTREE_AUTOVACUUM_INCR.
** When auto-vacuum support is compiled out the answer is always
** BTREE_AUTOVACUUM_NONE.
*/
int sqlite3BtreeGetAutoVacuum(Btree *p)
{
#ifdef SQLITE_OMIT_AUTOVACUUM
	return BTREE_AUTOVACUUM_NONE;
#else
	BtShared *pBt;
	int mode;

	pBt = p->pBt;

	/* Both flags are read under the BtShared mutex. */
	sqlite3_mutex_enter(pBt->mutex);
	if (!pBt->autoVacuum)
		mode = BTREE_AUTOVACUUM_NONE;
	else if (pBt->incrVacuum)
		mode = BTREE_AUTOVACUUM_INCR;
	else
		mode = BTREE_AUTOVACUUM_FULL;
	sqlite3_mutex_leave(pBt->mutex);

	return mode;
#endif
}
6405
/*
** Set the auto-vacuum mode of the shared btree state.
**
** autoVacuum == 0 disables auto-vacuum, any non-zero value enables it, and
** the value 2 additionally selects incremental mode. Unlike SQLite, the
** mode may be changed at any time. If the incremental setting changes, the
** cached incremental-vacuum information is released. If the handle is not
** yet connected and has no results buffer, the environment is opened and
** the result of that open is returned; otherwise SQLITE_OK is returned.
**
** When auto-vacuum support is compiled out SQLITE_READONLY is returned.
*/
int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum)
{
#ifdef SQLITE_OMIT_AUTOVACUUM
	return SQLITE_READONLY;
#else
	BtShared *pBt = p->pBt;
	int rc = SQLITE_OK;
	int incrChanged;
	u8 newIncrVacuum;

	newIncrVacuum = (autoVacuum == 2);

	/*
	 * Compare against the previous setting and install the new one in
	 * a single critical section, so the shared flags are never read
	 * without the mutex held.
	 */
	sqlite3_mutex_enter(pBt->mutex);
	incrChanged = (pBt->incrVacuum != newIncrVacuum);
	pBt->autoVacuum = (autoVacuum != 0);
	pBt->incrVacuum = newIncrVacuum;
	sqlite3_mutex_leave(pBt->mutex);

	/* If the setting changed, the incremental vacuum info is stale. */
	if (incrChanged)
		btreeFreeVacuumInfo(p);

	if (!p->connected && !pBt->resultsBuffer)
		rc = btreeOpenEnvironment(p, 1);

	return rc;
#endif
}
6432
sqlite3BtreeGetCachedRowid(BtCursor * pCur)6433 sqlite3_int64 sqlite3BtreeGetCachedRowid(BtCursor *pCur)
6434 {
6435 return pCur->cachedRowid;
6436 }
6437
/*
** Store iRowid as the cached rowid of every cursor in the shared cursor
** list that refers to the same cached_db as pCur. The list is walked under
** the BtShared mutex.
*/
void sqlite3BtreeSetCachedRowid(BtCursor *pCur, sqlite3_int64 iRowid)
{
	BtShared *pBt;
	BtCursor *pC;

	pBt = pCur->pBtree->pBt;

	sqlite3_mutex_enter(pBt->mutex);
	pC = pBt->first_cursor;
	while (pC != NULL) {
		if (pC->cached_db == pCur->cached_db)
			pC->cachedRowid = iRowid;
		pC = pC->next;
	}
	sqlite3_mutex_leave(pBt->mutex);
}
6451
/*
** Release or roll back the transaction that backs savepoint iSavepoint.
**
** op         - SAVEPOINT_RELEASE commits the selected transaction; any
**              other value (SAVEPOINT_ROLLBACK in practice) aborts it.
** iSavepoint - savepoint index; a negative value selects the main
**              transaction, i.e. all savepoints.
**
** Returns SQLITE_OK when there is nothing to do or on success, an SQLite
** error code from btreeCloseAllCursors() or btreeCleanupCachedHandles(),
** SQLITE_ABORT after a rollback inside a bulk transaction with
** iSavepoint >= 0, or the Berkeley DB commit/abort error mapped through
** dberr2sqlite().
*/
int sqlite3BtreeSavepoint(Btree *p, int op, int iSavepoint)
{
	BtShared *pBt;
	DB_TXN *txn;
	DB_TXN *ttxn;
	DELETED_TABLE *dtable, *prev, *next;
#ifdef BDBSQL_SHARE_PRIVATE
	int isMain = 0;
#endif
	int rc, ret;

	log_msg(LOG_VERBOSE, "sqlite3BtreeSavepoint(%p,%d,%d)",
	    p, op, iSavepoint);

	/*
	 * If iSavepoint + 2 > p->nSavepoint and this is not a rollback,
	 * then the savepoint has been created, but sqlite3BtreeBeginStmt
	 * has not been called to create the actual child transaction. If
	 * this is a rollback and iSavepoint + 2 > p->nSavepoint, then
	 * the read transaction lost its locks due to deadlock in an
	 * update transaction and needs to be aborted.
	 */
	if (p && op == SAVEPOINT_ROLLBACK &&
	    (p->txn_bulk ||
	    (((iSavepoint + 2 > p->nSavepoint) || (p->inTrans == TRANS_READ)) &&
	    pReadTxn))) {
		/* Abort a read or bulk transaction, handled below. */
	} else if (!p ||
	    pSavepointTxn == NULL || iSavepoint + 2 > p->nSavepoint)
		return SQLITE_OK;

	pBt = p->pBt;

	/*
	 * Select the transaction to resolve.
	 *
	 * Note that iSavepoint can be negative, meaning that all savepoints
	 * should be released or rolled back.
	 */
	if (iSavepoint < 0) {
		txn = pMainTxn;
#ifdef BDBSQL_SHARE_PRIVATE
		isMain = 1;
#endif
	} else if (op == SAVEPOINT_ROLLBACK &&
	    ((iSavepoint + 2 > p->nSavepoint) || p->inTrans == TRANS_READ)) {
		/* Rolling back the read transaction itself. */
		txn = pReadTxn;
		pReadTxn = NULL;
	} else {
		/*
		 * Start at the innermost savepoint transaction and climb
		 * the parent chain, decrementing nSavepoint on each test.
		 */
		txn = pSavepointTxn;
		while (--p->nSavepoint > iSavepoint + 1 && txn->parent != NULL)
			txn = txn->parent;
	}

	/*
	 * Fix up the deleted-tables list for every transaction from the
	 * innermost savepoint up to and including txn. On rollback, entries
	 * recorded by the visited transaction are unlinked and freed. On
	 * release, every entry that is kept is re-tagged with txn->parent.
	 */
	if (p->deleted_tables != NULL && p->inTrans == TRANS_WRITE) {
		for (ttxn = pSavepointTxn;
		    ttxn != txn->parent;
		    ttxn = ttxn->parent) {
			prev = NULL;
			for (dtable = p->deleted_tables;
			    dtable != NULL;
			    dtable = next) {
				/* Saved first: dtable may be freed below. */
				next = dtable->next;
				if (dtable->txn == ttxn &&
				    op == SAVEPOINT_ROLLBACK) {
					sqlite3_free(dtable);
					if (prev)
						prev->next = next;
					else
						p->deleted_tables = next;
				} else {
					prev = dtable;
					if (op == SAVEPOINT_RELEASE)
						dtable->txn = txn->parent;
				}
			}
		}
	}

	/*
	 * Update the handle's transaction bookkeeping before the transaction
	 * is resolved. A transaction without a parent ends everything.
	 */
	if (txn->parent == NULL) {
		assert(iSavepoint < 0 || p->txn_bulk);
		pMainTxn = pReadTxn = pSavepointTxn = NULL;
		p->nSavepoint = 0;
		p->inTrans = TRANS_NONE;
		p->txn_excl = 0;
	/* pReadTxn is only NULL if the read txn is being aborted */
	} else if (p->inTrans == TRANS_WRITE && pReadTxn)
		pSavepointTxn = txn->parent;

	/*
	 * NOTE(review): on failure this returns before txn is committed or
	 * aborted, although the bookkeeping above has already been updated.
	 */
	rc = btreeCloseAllCursors(p, txn);
	if (rc != SQLITE_OK)
		return rc;

	ret = (op == SAVEPOINT_RELEASE) ?
	    txn->commit(txn, DB_TXN_NOSYNC) : txn->abort(txn);
#ifdef BDBSQL_SHARE_PRIVATE
	if (isMain && pBt->dbStorage == DB_STORE_NAMED)
		btreeFileUnlock(p);
#endif
	if (ret != 0)
		goto err;

	/* After an abort the cached handles are cleaned up. */
	if (op == SAVEPOINT_ROLLBACK &&
	    (rc = btreeCleanupCachedHandles(p, CLEANUP_ABORT)) != SQLITE_OK)
		return rc;

	/* A savepoint rollback inside a bulk transaction reports SQLITE_ABORT. */
	if (op == SAVEPOINT_ROLLBACK && p->txn_bulk && iSavepoint >= 0)
		return SQLITE_ABORT;

err:	return (ret == 0) ? SQLITE_OK : dberr2sqlite(ret, p);
}
6561
6562 /* Stub out enough to make sqlite3_file_control fail gracefully. */
sqlite3BtreePager(Btree * p)6563 Pager *sqlite3BtreePager(Btree *p)
6564 {
6565 return (Pager *)p;
6566 }
6567
6568 #ifndef SQLITE_OMIT_SHARED_CACHE
6569 /*
6570 ** Enable or disable the shared pager and schema features.
6571 **
6572 ** This routine has no effect on existing database connections.
6573 ** The shared cache setting effects only future calls to
6574 ** sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2().
6575 */
sqlite3_enable_shared_cache(int enable)6576 int sqlite3_enable_shared_cache(int enable)
6577 {
6578 sqlite3GlobalConfig.sharedCacheEnabled = enable;
6579 return SQLITE_OK;
6580 }
6581 #endif
6582
6583 /*
6584 * Returns the Berkeley DB* struct for the user created
6585 * table with the given iTable value.
6586 */
btreeGetUserTable(Btree * p,DB_TXN * pTxn,DB ** pDb,int iTable)6587 int btreeGetUserTable(Btree *p, DB_TXN *pTxn, DB **pDb, int iTable)
6588 {
6589 char *fileName, *tableName, tableNameBuf[DBNAME_SIZE];
6590 int ret, rc;
6591 BtShared *pBt;
6592 DB *dbp;
6593 KeyInfo *keyInfo;
6594 void *app;
6595
6596 rc = SQLITE_OK;
6597 pBt = p->pBt;
6598 dbp = *pDb;
6599 keyInfo = NULL;
6600 /* Is the metadata table. */
6601 if (iTable < 1) {
6602 *pDb = NULL;
6603 return SQLITE_OK;
6604 }
6605
6606 /* If the handle is not in the cache, open it. */
6607 tableName = tableNameBuf;
6608 GET_TABLENAME(tableName, sizeof(tableNameBuf), iTable, "");
6609 FIX_TABLENAME(pBt, fileName, tableName);
6610
6611 /* Open a DB handle on that table. */
6612 if ((ret = db_create(&dbp, pDbEnv, 0)) != 0)
6613 return dberr2sqlite(ret, p);
6614
6615 if (!GET_DURABLE(pBt) &&
6616 (ret = dbp->set_flags(dbp, DB_TXN_NOT_DURABLE)) != 0)
6617 goto err;
6618 if (pBt->encrypted && (ret = dbp->set_flags(dbp, DB_ENCRYPT)) != 0)
6619 goto err;
6620
6621 if (!(iTable & 1)) {
6622 /* Get the KeyInfo for the index */
6623 if ((rc = btreeGetKeyInfo(p, iTable, &keyInfo)) != SQLITE_OK)
6624 goto err;
6625
6626 if (keyInfo) {
6627 dbp->app_private = keyInfo;
6628 dbp->set_bt_compare(dbp, btreeCompareKeyInfo);
6629 }
6630 } else
6631 dbp->set_bt_compare(dbp, btreeCompareIntKey);
6632
6633 tableName = tableNameBuf;
6634 FIX_TABLENAME(pBt, fileName, tableName);
6635 if ((ret = dbp->open(dbp, pTxn, fileName, tableName, DB_BTREE,
6636 (pBt->db_oflags & ~DB_CREATE) | GET_ENV_READONLY(pBt), 0) |
6637 GET_AUTO_COMMIT(pBt, pTxn)) != 0)
6638 goto err;
6639
6640 *pDb = dbp;
6641 return rc;
6642
6643 err: app = dbp->app_private;
6644 dbp->app_private = NULL;
6645 dbp->close(dbp, 0);
6646 if (app)
6647 sqlite3DbFree(p->db, app);
6648 return MAP_ERR(rc, ret, p);
6649 }
6650
6651 /*
6652 * Gets a list of all the iTable values of the tables in the given database,
6653 * and allocates and sets that list into iTables. The caller must free iTables
6654 * using sqlite3_free().
6655 * iTables - Contains the list iTable values for all tables in the database. A
6656 * value of -1 marks the end of the list. The caller must use sqlit3_free() to
6657 * deallocate the list.
6658 */
btreeGetTables(Btree * p,int ** iTables,DB_TXN * txn)6659 int btreeGetTables(Btree *p, int **iTables, DB_TXN *txn)
6660 {
6661 DB *dbp;
6662 DBC *dbc;
6663 DB_BTREE_STAT *stats;
6664 DBT key, data;
6665 Mem iTable;
6666 int current, entries, i, inTrans, rc, ret;
6667 int *tables, *ptr;
6668 u32 hdrSize, type;
6669 unsigned char *endHdr, *record, *ptr2;
6670
6671 memset(&key, 0, sizeof(key));
6672 memset(&data, 0, sizeof(data));
6673 ret = inTrans = 0;
6674 dbp = NULL;
6675 dbc = NULL;
6676 tables = ptr = NULL;
6677
6678 /* Get the sqlite master db handle and count the entries in it. */
6679 if ((rc = btreeGetUserTable(p, txn, &dbp, MASTER_ROOT)) != SQLITE_OK)
6680 goto err;
6681 assert(dbp != NULL);
6682
6683 if ((ret = dbp->stat(dbp, txn, &stats, 0)) != 0)
6684 goto err;
6685
6686 entries = stats->bt_nkeys;
6687 #ifdef BDBSQL_OMIT_LEAKCHECK
6688 free(stats);
6689 #else
6690 sqlite3_free(stats);
6691 #endif
6692
6693 /*
6694 * Add room for the sqlite master and a value of -1 to
6695 * mark the end of the table. The sqlite master may include
6696 * views, which will not be recored in the tables entry.
6697 */
6698 entries += 2;
6699 tables = sqlite3Malloc(entries * sizeof(tables));
6700 if (!tables) {
6701 rc = SQLITE_NOMEM;
6702 goto err;
6703 }
6704 ptr = tables;
6705 /* Sqlite master table. */
6706 tables[0] = MASTER_ROOT;
6707 tables++;
6708
6709 /* Read each iTable value from the sqlite master */
6710 if ((ret = dbp->cursor(dbp, txn, &dbc, 0)) != 0)
6711 goto err;
6712 current = 0;
6713 while ((ret = dbc->get(dbc, &key, &data, DB_NEXT)) == 0) {
6714 /* The iTable value is the 4th entry in the record. */
6715 assert(current < entries);
6716 memset(&iTable, 0, sizeof(iTable));
6717 record = (unsigned char *)data.data;
6718 getVarint32(record, hdrSize);
6719 endHdr = record + hdrSize;
6720 ptr2 = record;
6721 record = endHdr;
6722 ptr2++;
6723 for (i = 0; i < 3; i++) {
6724 assert(ptr2 < endHdr);
6725 ptr2 += getVarint32(ptr2, type);
6726 record += sqlite3VdbeSerialTypeLen(type);
6727 }
6728 assert(ptr2 < endHdr);
6729 ptr2 += getVarint32(ptr2, type);
6730 sqlite3VdbeSerialGet(record, type, &iTable);
6731 assert(iTable.flags & MEM_Int);
6732 /* Do not count veiws and triggers. */
6733 if (iTable.u.i > 0) {
6734 tables[0] = (int)iTable.u.i;
6735 tables++;
6736 current++;
6737 }
6738 }
6739 if (ret != DB_NOTFOUND)
6740 goto err;
6741 else
6742 ret = 0;
6743
6744 /* Mark the end of the list. */
6745 tables[0] = -1;
6746 *iTables = ptr;
6747
6748 err: if ((ret != 0 || rc != SQLITE_OK) && ptr)
6749 sqlite3_free(ptr);
6750 if (dbc)
6751 dbc->close(dbc);
6752 if (dbp) {
6753 void *app = dbp->app_private;
6754 dbp->close(dbp, DB_NOSYNC);
6755 if (app)
6756 sqlite3DbFree(p->db, app);
6757 }
6758 return MAP_ERR(rc, ret, p);
6759 }
6760
6761 /*
6762 * Gets the number of pages in all user tables in the database.
6763 * p - Btree of the database.
6764 * name - Name of the database, such as main or temp.
6765 * tables - A list of the iTable values of all tables in the database is
6766 * allocated and returned in this variable, the caller must use
6767 * sqlite3_free() to free the memory when done.
6768 * pageCount - Is set to the number of pages in the database.
6769 */
btreeGetPageCount(Btree * p,int ** tables,u32 * pageCount,DB_TXN * txn)6770 int btreeGetPageCount(Btree *p, int **tables, u32 *pageCount, DB_TXN *txn)
6771 {
6772 DB *dbp;
6773 DB_BTREE_STAT *stats;
6774 DBC *dbc;
6775 DB_TXN *txnChild;
6776 BtShared *pBt;
6777 int i, ret, ret2, rc;
6778 void *app;
6779
6780 ret = ret2 = 0;
6781 dbp = NULL;
6782 *pageCount = 0;
6783 rc = SQLITE_OK;
6784 dbc = NULL;
6785 pBt = p->pBt;
6786 txnChild = NULL;
6787
6788 /*
6789 * Get a list of all the iTable values for all tables in
6790 * the database.
6791 */
6792 if ((rc = btreeGetTables(p, tables, txn)) != SQLITE_OK)
6793 goto err;
6794
6795 /*
6796 * Do not want to keep the locks on all the tables, but
6797 * also do not want to commit or abort the transaction.
6798 */
6799 ret = pDbEnv->txn_begin(pDbEnv, txn, &txnChild, DB_TXN_NOSYNC);
6800 if (ret != 0)
6801 goto err;
6802
6803 /*
6804 * For each table, get a DB handle and use the stat() function
6805 * to get the page count.
6806 */
6807 i = 0;
6808 while ((*tables)[i] > -1) {
6809 rc = btreeGetUserTable(p, txnChild, &dbp, (*tables)[i]);
6810 if (rc != SQLITE_OK)
6811 goto err;
6812 assert(dbp);
6813
6814 ret = dbp->stat(dbp, txnChild, (void *)&stats, DB_FAST_STAT);
6815 if (ret != 0)
6816 goto err;
6817
6818 *pageCount += stats->bt_pagecnt;
6819
6820 app = dbp->app_private;
6821 dbp->close(dbp, DB_NOSYNC);
6822 if (app)
6823 sqlite3DbFree(p->db, app);
6824 dbp = 0;
6825 #ifdef BDBSQL_OMIT_LEAKCHECK
6826 free(stats);
6827 #else
6828 sqlite3_free(stats);
6829 #endif
6830 i++;
6831 }
6832
6833 err: if (dbp) {
6834 app = dbp->app_private;
6835 dbp->close(dbp, DB_NOSYNC);
6836 if (app)
6837 sqlite3DbFree(p->db, app);
6838 }
6839
6840 /* Was only used for reading, so safe to abort. */
6841 if (txnChild) {
6842 if ((ret2 = txnChild->abort(txnChild)) != 0 && ret == 0)
6843 ret = ret2;
6844 }
6845
6846 return MAP_ERR(rc, ret, p);
6847 }
6848
6849 /*
6850 * This pair of functions manages the handle lock held by Berkeley DB for
6851 * database (DB) handles. Berkeley DB holds those locks so that a remove can't
6852 * succeed while a handle is still open. The SQL API needs that remove to
6853 * succeed if the handle is "just cached" - that is not actively in use.
6854 * Consequently we reach into the DB handle and unlock the handle_lock when the
6855 * handle is only being held cached.
6856 * We re-get the lock when the handle is accessed again. A handle shouldn't be
6857 * accessed after a remove, but we'll be a bit paranoid and do checks for that
6858 * situation anyway.
6859 */
btreeDbHandleLock(Btree * p,CACHED_DB * cached_db)6860 static int btreeDbHandleLock(Btree *p, CACHED_DB *cached_db)
6861 {
6862 BtShared *pBt;
6863 DB *dbp;
6864 DBT fileobj;
6865 DB_LOCK_ILOCK lock_desc;
6866 int ret;
6867
6868 pBt = p->pBt;
6869 ret = 0;
6870 dbp = cached_db->dbp;
6871
6872 if (btreeDbHandleIsLocked(cached_db))
6873 return (0);
6874
6875 /* Ensure we're going to ask for a reasonable lock. */
6876 if (cached_db->lock_mode == DB_LOCK_NG)
6877 return (0);
6878
6879 memcpy(lock_desc.fileid, dbp->fileid, DB_FILE_ID_LEN);
6880 lock_desc.pgno = dbp->meta_pgno;
6881 lock_desc.type = DB_HANDLE_LOCK;
6882
6883 memset(&fileobj, 0, sizeof(fileobj));
6884 fileobj.data = &lock_desc;
6885 fileobj.size = sizeof(lock_desc);
6886
6887 if (dbp != NULL && dbp->locker != NULL) {
6888 ret = pDbEnv->lock_get(pDbEnv,
6889 ((DB_SQL_LOCKER*)dbp->locker)->id, 0, &fileobj,
6890 cached_db->lock_mode, &(dbp->handle_lock));
6891 /* Avoid getting the lock again, until it's been dropped. */
6892 cached_db->lock_mode = DB_LOCK_NG;
6893 }
6894
6895 return (ret);
6896 }
6897
/*
 * Drop the Berkeley DB handle lock of a cached handle, remembering its
 * mode in cached_db->lock_mode first. Returns 0 if the handle lock is not
 * currently held, otherwise the result of DB_ENV->lock_put().
 */
static int btreeDbHandleUnlock(Btree *p, CACHED_DB *cached_db)
{
	BtShared *pBt;
	DB *dbp;

	pBt = p->pBt;
	if (!btreeDbHandleIsLocked(cached_db))
		return (0);

	dbp = cached_db->dbp;
	cached_db->lock_mode = dbp->handle_lock.mode;
	return (pDbEnv->lock_put(pDbEnv, &dbp->handle_lock));
}
6909
/*
 * Return non-zero if the DB handle of cached_db currently holds its handle
 * lock, i.e. the lock's offset differs from LOCK_INVALID.
 */
static int btreeDbHandleIsLocked(CACHED_DB *cached_db)
{
#define	LOCK_INVALID 0
	DB *dbp = cached_db->dbp;

	return (dbp->handle_lock.off != LOCK_INVALID);
}
6915
6916 /*
6917 * Integer compression
6918 *
6919 * First byte | Next | Maximum
6920 * byte | bytes| value
6921 * ------------+------+---------------------------------------------------------
6922 * [0 xxxxxxx] | 0 | 2^7 - 1
6923 * [10 xxxxxx] | 1 | 2^14 + 2^7 - 1
6924 * [110 xxxxx] | 2 | 2^21 + 2^14 + 2^7 - 1
6925 * [1110 xxxx] | 3 | 2^28 + 2^21 + 2^14 + 2^7 - 1
6926 * [11110 xxx] | 4 | 2^35 + 2^28 + 2^21 + 2^14 + 2^7 - 1
6927 * [11111 000] | 5 | 2^40 + 2^35 + 2^28 + 2^21 + 2^14 + 2^7 - 1
6928 * [11111 001] | 6 | 2^48 + 2^40 + 2^35 + 2^28 + 2^21 + 2^14 + 2^7 - 1
6929 * [11111 010] | 7 | 2^56 + 2^48 + 2^40 + 2^35 + 2^28 + 2^21 + 2^14 + 2^7 - 1
6930 * [11111 011] | 8 | 2^64 + 2^56 + 2^48 + 2^40 + 2^35 + 2^28 + 2^21 + 2^14 +
6931 * | | 2^7 - 1
6932 *
6933 * NOTE: this compression algorithm depends
6934 * on big-endian order, so swap if necessary.
6935 */
/* Berkeley DB helper used by btreeCompressInt to pick the byte order. */
extern int __db_isbigendian(void);

/*
 * CMP_INT_nBYTE_MAX: largest value that is encoded in n bytes. These match
 * the "Maximum value" column of the table above (for example
 * 0x407F == 2^14 + 2^7 - 1).
 */
#define CMP_INT_1BYTE_MAX 0x7F
#define CMP_INT_2BYTE_MAX 0x407F
#define CMP_INT_3BYTE_MAX 0x20407F
#define CMP_INT_4BYTE_MAX 0x1020407F

/* Old MSVC compilers need the i64 suffix for 64-bit constants. */
#if defined(_MSC_VER) && _MSC_VER < 1300
#define CMP_INT_5BYTE_MAX 0x081020407Fi64
#define CMP_INT_6BYTE_MAX 0x01081020407Fi64
#define CMP_INT_7BYTE_MAX 0x0101081020407Fi64
#define CMP_INT_8BYTE_MAX 0x010101081020407Fi64
#else
#define CMP_INT_5BYTE_MAX 0x081020407FLL
#define CMP_INT_6BYTE_MAX 0x01081020407FLL
#define CMP_INT_7BYTE_MAX 0x0101081020407FLL
#define CMP_INT_8BYTE_MAX 0x010101081020407FLL
#endif

/*
 * CMP_INT_nBYTE_VAL: marker for the first byte of an n-byte encoding. For
 * 2 to 5 bytes it is OR-ed with the leading payload bits; from 6 bytes on
 * the first byte is the marker alone.
 */
#define CMP_INT_2BYTE_VAL 0x80
#define CMP_INT_3BYTE_VAL 0xC0
#define CMP_INT_4BYTE_VAL 0xE0
#define CMP_INT_5BYTE_VAL 0xF0
#define CMP_INT_6BYTE_VAL 0xF8
#define CMP_INT_7BYTE_VAL 0xF9
#define CMP_INT_8BYTE_VAL 0xFA
#define CMP_INT_9BYTE_VAL 0xFB

/*
 * CMP_INT_nBYTE_MASK: the bits of the first byte not covered by the
 * corresponding marker; presumably used to extract the payload bits when
 * decoding -- TODO confirm against the decompression code.
 */
#define CMP_INT_2BYTE_MASK 0x3F
#define CMP_INT_3BYTE_MASK 0x1F
#define CMP_INT_4BYTE_MASK 0x0F
#define CMP_INT_5BYTE_MASK 0x07
6968
/*
 * Encoded length, in bytes, of a compressed integer for each of the 256
 * possible byte values. The layout follows the first-byte patterns of the
 * encoding: 0x00-0x7F -> 1, 0x80-0xBF -> 2, 0xC0-0xDF -> 3, 0xE0-0xEF -> 4,
 * 0xF0-0xF7 -> 5, 0xF8-0xFB -> 6 to 9, and 0xFF for the unused values
 * 0xFC-0xFF. Presumably indexed by the first byte of an encoded integer --
 * TODO confirm against the code that reads this table.
 */
static const u_int8_t __dbsql_marshaled_int_size[] = {
	0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
	0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
	0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
	0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
	0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
	0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
	0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
	0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
	0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
	0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
	0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
	0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
	0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
	0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
	0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
	0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,

	0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
	0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
	0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
	0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
	0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
	0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
	0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
	0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,

	0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
	0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
	0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
	0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,

	0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04,
	0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04,

	0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
	0x06, 0x07, 0x08, 0x09, 0xFF, 0xFF, 0xFF, 0xFF
};
7007
7008 /*
7009 * btreeCompressInt --
7010 * Compresses the integer into the buffer, returning the number of
7011 * bytes occupied.
7012 *
7013 * An exact copy of __db_compress_int
7014 */
btreeCompressInt(u_int8_t * buf,u_int64_t i)7015 static int btreeCompressInt(u_int8_t *buf, u_int64_t i)
7016 {
7017 if (i <= CMP_INT_1BYTE_MAX) {
7018 /* no swapping for one byte value */
7019 buf[0] = (u_int8_t)i;
7020 return 1;
7021 } else {
7022 u_int8_t *p = (u_int8_t*)&i;
7023 if (i <= CMP_INT_2BYTE_MAX) {
7024 i -= CMP_INT_1BYTE_MAX + 1;
7025 if (__db_isbigendian() != 0) {
7026 buf[0] = p[6] | CMP_INT_2BYTE_VAL;
7027 buf[1] = p[7];
7028 } else {
7029 buf[0] = p[1] | CMP_INT_2BYTE_VAL;
7030 buf[1] = p[0];
7031 }
7032 return 2;
7033 } else if (i <= CMP_INT_3BYTE_MAX) {
7034 i -= CMP_INT_2BYTE_MAX + 1;
7035 if (__db_isbigendian() != 0) {
7036 buf[0] = p[5] | CMP_INT_3BYTE_VAL;
7037 buf[1] = p[6];
7038 buf[2] = p[7];
7039 } else {
7040 buf[0] = p[2] | CMP_INT_3BYTE_VAL;
7041 buf[1] = p[1];
7042 buf[2] = p[0];
7043 }
7044 return 3;
7045 } else if (i <= CMP_INT_4BYTE_MAX) {
7046 i -= CMP_INT_3BYTE_MAX + 1;
7047 if (__db_isbigendian() != 0) {
7048 buf[0] = p[4] | CMP_INT_4BYTE_VAL;
7049 buf[1] = p[5];
7050 buf[2] = p[6];
7051 buf[3] = p[7];
7052 } else {
7053 buf[0] = p[3] | CMP_INT_4BYTE_VAL;
7054 buf[1] = p[2];
7055 buf[2] = p[1];
7056 buf[3] = p[0];
7057 }
7058 return 4;
7059 } else if (i <= CMP_INT_5BYTE_MAX) {
7060 i -= CMP_INT_4BYTE_MAX + 1;
7061 if (__db_isbigendian() != 0) {
7062 buf[0] = p[3] | CMP_INT_5BYTE_VAL;
7063 buf[1] = p[4];
7064 buf[2] = p[5];
7065 buf[3] = p[6];
7066 buf[4] = p[7];
7067 } else {
7068 buf[0] = p[4] | CMP_INT_5BYTE_VAL;
7069 buf[1] = p[3];
7070 buf[2] = p[2];
7071 buf[3] = p[1];
7072 buf[4] = p[0];
7073 }
7074 return 5;
7075 } else if (i <= CMP_INT_6BYTE_MAX) {
7076 i -= CMP_INT_5BYTE_MAX + 1;
7077 if (__db_isbigendian() != 0) {
7078 buf[0] = CMP_INT_6BYTE_VAL;
7079 buf[1] = p[3];
7080 buf[2] = p[4];
7081 buf[3] = p[5];
7082 buf[4] = p[6];
7083 buf[5] = p[7];
7084 } else {
7085 buf[0] = CMP_INT_6BYTE_VAL;
7086 buf[1] = p[4];
7087 buf[2] = p[3];
7088 buf[3] = p[2];
7089 buf[4] = p[1];
7090 buf[5] = p[0];
7091 }
7092 return 6;
7093 } else if (i <= CMP_INT_7BYTE_MAX) {
7094 i -= CMP_INT_6BYTE_MAX + 1;
7095 if (__db_isbigendian() != 0) {
7096 buf[0] = CMP_INT_7BYTE_VAL;
7097 buf[1] = p[2];
7098 buf[2] = p[3];
7099 buf[3] = p[4];
7100 buf[4] = p[5];
7101 buf[5] = p[6];
7102 buf[6] = p[7];
7103 } else {
7104 buf[0] = CMP_INT_7BYTE_VAL;
7105 buf[1] = p[5];
7106 buf[2] = p[4];
7107 buf[3] = p[3];
7108 buf[4] = p[2];
7109 buf[5] = p[1];
7110 buf[6] = p[0];
7111 }
7112 return 7;
7113 } else if (i <= CMP_INT_8BYTE_MAX) {
7114 i -= CMP_INT_7BYTE_MAX + 1;
7115 if (__db_isbigendian() != 0) {
7116 buf[0] = CMP_INT_8BYTE_VAL;
7117 buf[1] = p[1];
7118 buf[2] = p[2];
7119 buf[3] = p[3];
7120 buf[4] = p[4];
7121 buf[5] = p[5];
7122 buf[6] = p[6];
7123 buf[7] = p[7];
7124 } else {
7125 buf[0] = CMP_INT_8BYTE_VAL;
7126 buf[1] = p[6];
7127 buf[2] = p[5];
7128 buf[3] = p[4];
7129 buf[4] = p[3];
7130 buf[5] = p[2];
7131 buf[6] = p[1];
7132 buf[7] = p[0];
7133 }
7134 return 8;
7135 } else {
7136 i -= CMP_INT_8BYTE_MAX + 1;
7137 if (__db_isbigendian() != 0) {
7138 buf[0] = CMP_INT_9BYTE_VAL;
7139 buf[1] = p[0];
7140 buf[2] = p[1];
7141 buf[3] = p[2];
7142 buf[4] = p[3];
7143 buf[5] = p[4];
7144 buf[6] = p[5];
7145 buf[7] = p[6];
7146 buf[8] = p[7];
7147 } else {
7148 buf[0] = CMP_INT_9BYTE_VAL;
7149 buf[1] = p[7];
7150 buf[2] = p[6];
7151 buf[3] = p[5];
7152 buf[4] = p[4];
7153 buf[5] = p[3];
7154 buf[6] = p[2];
7155 buf[7] = p[1];
7156 buf[8] = p[0];
7157 }
7158 return 9;
7159 }
7160 }
7161 }
7162
7163 /*
7164 * btreeDecompressInt --
7165 * Decompresses the compressed integer pointer to by buf into i,
7166 * returning the number of bytes read.
7167 *
7168 * An exact copy of __db_decompress_int
7169 */
btreeDecompressInt(const u_int8_t * buf,u_int64_t * i)7170 static int btreeDecompressInt(const u_int8_t *buf, u_int64_t *i)
7171 {
7172 int len;
7173 u_int64_t tmp;
7174 u_int8_t *p;
7175 u_int8_t c;
7176
7177 tmp = 0;
7178 p = (u_int8_t*)&tmp;
7179 c = buf[0];
7180 len = __dbsql_marshaled_int_size[c];
7181
7182 switch (len) {
7183 case 1:
7184 *i = c;
7185 return 1;
7186 case 2:
7187 if (__db_isbigendian() != 0) {
7188 p[6] = (c & CMP_INT_2BYTE_MASK);
7189 p[7] = buf[1];
7190 } else {
7191 p[1] = (c & CMP_INT_2BYTE_MASK);
7192 p[0] = buf[1];
7193 }
7194 tmp += CMP_INT_1BYTE_MAX + 1;
7195 break;
7196 case 3:
7197 if (__db_isbigendian() != 0) {
7198 p[5] = (c & CMP_INT_3BYTE_MASK);
7199 p[6] = buf[1];
7200 p[7] = buf[2];
7201 } else {
7202 p[2] = (c & CMP_INT_3BYTE_MASK);
7203 p[1] = buf[1];
7204 p[0] = buf[2];
7205 }
7206 tmp += CMP_INT_2BYTE_MAX + 1;
7207 break;
7208 case 4:
7209 if (__db_isbigendian() != 0) {
7210 p[4] = (c & CMP_INT_4BYTE_MASK);
7211 p[5] = buf[1];
7212 p[6] = buf[2];
7213 p[7] = buf[3];
7214 } else {
7215 p[3] = (c & CMP_INT_4BYTE_MASK);
7216 p[2] = buf[1];
7217 p[1] = buf[2];
7218 p[0] = buf[3];
7219 }
7220 tmp += CMP_INT_3BYTE_MAX + 1;
7221 break;
7222 case 5:
7223 if (__db_isbigendian() != 0) {
7224 p[3] = (c & CMP_INT_5BYTE_MASK);
7225 p[4] = buf[1];
7226 p[5] = buf[2];
7227 p[6] = buf[3];
7228 p[7] = buf[4];
7229 } else {
7230 p[4] = (c & CMP_INT_5BYTE_MASK);
7231 p[3] = buf[1];
7232 p[2] = buf[2];
7233 p[1] = buf[3];
7234 p[0] = buf[4];
7235 }
7236 tmp += CMP_INT_4BYTE_MAX + 1;
7237 break;
7238 case 6:
7239 if (__db_isbigendian() != 0) {
7240 p[3] = buf[1];
7241 p[4] = buf[2];
7242 p[5] = buf[3];
7243 p[6] = buf[4];
7244 p[7] = buf[5];
7245 } else {
7246 p[4] = buf[1];
7247 p[3] = buf[2];
7248 p[2] = buf[3];
7249 p[1] = buf[4];
7250 p[0] = buf[5];
7251 }
7252 tmp += CMP_INT_5BYTE_MAX + 1;
7253 break;
7254 case 7:
7255 if (__db_isbigendian() != 0) {
7256 p[2] = buf[1];
7257 p[3] = buf[2];
7258 p[4] = buf[3];
7259 p[5] = buf[4];
7260 p[6] = buf[5];
7261 p[7] = buf[6];
7262 } else {
7263 p[5] = buf[1];
7264 p[4] = buf[2];
7265 p[3] = buf[3];
7266 p[2] = buf[4];
7267 p[1] = buf[5];
7268 p[0] = buf[6];
7269 }
7270 tmp += CMP_INT_6BYTE_MAX + 1;
7271 break;
7272 case 8:
7273 if (__db_isbigendian() != 0) {
7274 p[1] = buf[1];
7275 p[2] = buf[2];
7276 p[3] = buf[3];
7277 p[4] = buf[4];
7278 p[5] = buf[5];
7279 p[6] = buf[6];
7280 p[7] = buf[7];
7281 } else {
7282 p[6] = buf[1];
7283 p[5] = buf[2];
7284 p[4] = buf[3];
7285 p[3] = buf[4];
7286 p[2] = buf[5];
7287 p[1] = buf[6];
7288 p[0] = buf[7];
7289 }
7290 tmp += CMP_INT_7BYTE_MAX + 1;
7291 break;
7292 case 9:
7293 if (__db_isbigendian() != 0) {
7294 p[0] = buf[1];
7295 p[1] = buf[2];
7296 p[2] = buf[3];
7297 p[3] = buf[4];
7298 p[4] = buf[5];
7299 p[5] = buf[6];
7300 p[6] = buf[7];
7301 p[7] = buf[8];
7302 } else {
7303 p[7] = buf[1];
7304 p[6] = buf[2];
7305 p[5] = buf[3];
7306 p[4] = buf[4];
7307 p[3] = buf[5];
7308 p[2] = buf[6];
7309 p[1] = buf[7];
7310 p[0] = buf[8];
7311 }
7312 tmp += CMP_INT_8BYTE_MAX + 1;
7313 break;
7314 default:
7315 break;
7316 }
7317
7318 *i = tmp;
7319 return len;
7320 }
7321
/*
 * When BDBSQL_OMIT_LEAKCHECK is defined, remove any macro definitions of
 * these SQLite allocator names for the remainder of the file.
 * NOTE(review): presumably they were redefined earlier in this file under
 * the same option -- confirm.
 */
#ifdef BDBSQL_OMIT_LEAKCHECK
#undef sqlite3_malloc
#undef sqlite3_free
#undef sqlite3_strdup
#endif
7327
7328 #ifdef BDBSQL_SHARE_PRIVATE
7329
7330 /*
7331 * Platform requirements:
7332 * -- must have mmap()
7333 * -- must have fcntl() for posix file locking
7334 * -- must support full posix open() semantics (e.g. VXWORKS does not)
7335 */
7336
/*
 * openPrivateEnvironment --
 *	A very stripped down version of btreeOpenEnvironment().  Configures
 *	the cache and error callback on the environment handle, opens the
 *	environment with the flags in pBt->env_oflags, marks the Btree as
 *	connected, opens the meta tables and, when startFamily is non-zero,
 *	begins the family transaction.
 *
 *	Returns MAP_ERR(rc, ret, p), except that a failure to begin the
 *	family transaction returns dberr2sqlite(ret, p) directly.
 *
 *	NOTE(review): cached_db, iTable, reuse_env, writeLock and txn_mode
 *	are declared (some initialized) but never read in this function.
 */
static int openPrivateEnvironment(Btree *p, int startFamily)
{
	BtShared *pBt;
	CACHED_DB *cached_db;
	int creating, iTable, newEnv, rc, ret, reuse_env, writeLock;
	txn_mode_t txn_mode;
	i64 cache_sz;

	newEnv = ret = reuse_env = 0;
	rc = SQLITE_OK;
	cached_db = NULL;
	/*
	 * btreeOpenEnvironment() now does this here:
	 * (void)btreeUpdateBtShared(p, 0);
	 * Need to consider how multiple opens with different paths
	 * affects BDBSQL_SHARE_PRIVATE
	 */
	pBt = p->pBt;
	assert(pBt->dbStorage == DB_STORE_NAMED);

	/* open mutex is held */
	/*
	 * pBt->cacheSize is raised to at least DB_MIN_CACHESIZE and then
	 * multiplied by the page size (SQLITE_DEFAULT_PAGE_SIZE when
	 * pBt->pageSize is not positive) before being split into the two
	 * size arguments of set_cachesize().
	 */
	cache_sz = (i64)pBt->cacheSize;
	if (cache_sz < DB_MIN_CACHESIZE)
		cache_sz = DB_MIN_CACHESIZE;
	cache_sz *= (pBt->pageSize > 0) ?
	    pBt->pageSize : SQLITE_DEFAULT_PAGE_SIZE;
	pDbEnv->set_cachesize(pDbEnv,
	    (u_int32_t)(cache_sz / GIGABYTE),
	    (u_int32_t)(cache_sz % GIGABYTE), 0);
	/* Only an explicitly configured page size is pushed to the env. */
	if (pBt->pageSize != 0 &&
	    (ret = pDbEnv->set_mp_pagesize(pDbEnv, pBt->pageSize)) != 0)
		goto err;
	pDbEnv->set_mp_mmapsize(pDbEnv, 0);
	pDbEnv->set_mp_mtxcount(pDbEnv, pBt->mp_mutex_count);
	pDbEnv->app_private = pBt;
	pDbEnv->set_errcall(pDbEnv, btreeHandleDbError);

	ret = pDbEnv->open(pDbEnv, pBt->dir_name, pBt->env_oflags, 0);
	/* There is no acceptable failure for this reopen. */
	if (ret != 0)
		goto err;

	pBt->env_opened = newEnv = 1;
	assert(!p->connected);
	p->connected = 1;

	/* Allow database creation only for writable, create-mode opens. */
	if (!IS_ENV_READONLY(pBt) && p->vfsFlags & SQLITE_OPEN_CREATE)
		pBt->db_oflags |= DB_CREATE;

	creating = 0;
	if ((rc = btreeOpenMetaTables(p, &creating)) != SQLITE_OK)
		goto err;
	/* If this assertion trips, get code from btreeOpenEnvironment(). */
	assert(!creating); /* TBD */

#ifdef BDBSQL_PRELOAD_HANDLES
	if (newEnv && !creating)
		(void)btreePreloadHandles(p);
#endif
	/* need to start the family txn */
	/*
	 * NOTE(review): this early return bypasses the err label, so
	 * p->connected is left at 1 when txn_begin fails -- confirm that
	 * is intended.
	 */
	if (startFamily && (ret = pDbEnv->txn_begin(pDbEnv, NULL, &pFamilyTxn,
	    DB_TXN_FAMILY|(p->txn_bulk ? DB_TXN_BULK:0))) != 0)
		return dberr2sqlite(ret, p);

	/* Reached on success as well; connected is cleared only on error. */
err:	if (rc != SQLITE_OK || ret != 0) {
		p->connected = 0;
	}
	return MAP_ERR(rc, ret, p);
}
7407
7408 /*
7409 * btreeReopenPrivateEnvironment()
7410 * For shared private environments this function does work from
7411 * both sqlite3BtreeClose() and btreePrepareEnvironment().
7412 * - close any open databases
7413 * - close the environment, but prevent cache flush
7414 * - set up opening the new environment.
7415 */
btreeReopenPrivateEnvironment(Btree * p)7416 static int btreeReopenPrivateEnvironment(Btree *p)
7417 {
7418 BtShared *pBt;
7419 #ifdef BDBSQL_FILE_PER_TABLE
7420 char *dirPathName, dirPathBuf[BT_MAX_PATH];
7421 #endif
7422 int ret, rc, t_rc, t_ret, startFamily, idx;
7423 sqlite3_mutex *mutexOpen;
7424
7425 log_msg(LOG_VERBOSE, "btreeReopenPrivateEnvironment(%p)", p);
7426
7427 ret = 0;
7428 pBt = p->pBt;
7429 rc = SQLITE_OK;
7430
7431 /*
7432 * do not reopen if pBt->nRef is 0. That means the environment
7433 * is being closed.
7434 */
7435 if (pBt == NULL || pBt->nRef == 0)
7436 goto done;
7437
7438 /* make some state assertions (TBD -- remove these eventually) */
7439 assert(pBt->transactional); /* must be transactional */
7440 assert(pBt->first_cursor == NULL); /* no active cursors */
7441 assert(pMainTxn == NULL); /* only at top-level txn */
7442 assert(pBt->dbStorage == DB_STORE_NAMED); /* not temp */
7443
7444 /* commit family txn; it will be null when shutting down */
7445 if (pFamilyTxn != NULL) {
7446 startFamily = 1;
7447 ret = pFamilyTxn->commit(pFamilyTxn, 0);
7448 pFamilyTxn = NULL;
7449 /* p->inTrans = TRANS_NONE; don't change state of this */
7450 if (ret != 0 && rc == SQLITE_OK)
7451 rc = dberr2sqlite(ret, p);
7452 } else
7453 startFamily = 0;
7454
7455 /*
7456 * acquire mutexOpen lock while closing down cached db handles.
7457 * There is a case where the call could be from
7458 * btreeOpenEnvironment() in which case the mutex is already
7459 * held. It's inefficient to close/reopen in that path but
7460 * it should be infrequent and it's more consistent to do that
7461 * than just return.
7462 */
7463 mutexOpen = sqlite3MutexAlloc(OPEN_MUTEX(pBt->dbStorage));
7464 if (!pBt->lockfile.in_env_open)
7465 sqlite3_mutex_enter(mutexOpen);
7466 /* close open DB handles and clear related hash table */
7467 t_rc = btreeCleanupCachedHandles(p, CLEANUP_CLOSE);
7468 if (t_rc != SQLITE_OK && rc == SQLITE_OK)
7469 rc = t_rc;
7470 sqlite3HashClear(&pBt->db_cache);
7471 /* close tables and meta databases */
7472 if (pTablesDb != NULL &&
7473 (t_ret = pTablesDb->close(pTablesDb, DB_NOSYNC)) != 0 && ret == 0)
7474 ret = t_ret;
7475 if (pMetaDb != NULL &&
7476 (t_ret = pMetaDb->close(pMetaDb, DB_NOSYNC)) != 0 && ret == 0)
7477 ret = t_ret;
7478 pTablesDb = pMetaDb = NULL;
7479
7480 /* flush the cache of metadata values */
7481 for (idx = 0; idx < NUMMETA; idx++)
7482 pBt->meta[idx].cached = 0;
7483 /*
7484 * close environment:
7485 * - set the error call to nothing to quiet any errors
7486 * - set DB_NOFLUSH to prevent the cache from flushing
7487 * - ignore a DB_RUNRECOVERY error
7488 */
7489 pDbEnv->set_errcall(pDbEnv, NULL);
7490 pDbEnv->set_flags(pDbEnv, DB_NOFLUSH, 1);
7491 if ((t_ret = pDbEnv->close(pDbEnv, 0)) != 0 && ret == 0) {
7492 if (t_ret != DB_RUNRECOVERY) /* ignore runrecovery */
7493 ret = t_ret;
7494 }
7495
7496 /* hold onto openMutex until done with open */
7497 if (ret != 0)
7498 goto err;
7499
7500 pBt->lsn_reset = NO_LSN_RESET;
7501
7502 /* do some work from btreePrepareEnvironment */
7503 if ((ret = db_env_create(&pDbEnv, 0)) != 0)
7504 goto err;
7505 pDbEnv->set_errpfx(pDbEnv, pBt->full_name);
7506 #ifndef BDBSQL_SINGLE_THREAD
7507 pDbEnv->set_flags(pDbEnv, DB_DATABASE_LOCKING, 1);
7508 pDbEnv->set_lk_detect(pDbEnv, DB_LOCK_DEFAULT);
7509 #endif
7510 pDbEnv->set_lg_regionmax(pDbEnv, BDBSQL_LOG_REGIONMAX);
7511 #ifndef BDBSQL_OMIT_LEAKCHECK
7512 pDbEnv->set_alloc(pDbEnv, btreeMalloc, btreeRealloc,
7513 sqlite3_free);
7514 #endif
7515 if ((ret = pDbEnv->set_lg_max(pDbEnv, pBt->logFileSize)) != 0)
7516 goto err;
7517 #ifndef BDBSQL_OMIT_LOG_REMOVE
7518 if ((ret = pDbEnv->log_set_config(pDbEnv,
7519 DB_LOG_AUTO_REMOVE, 1)) != 0)
7520 goto err;
7521 #endif
7522 #ifdef BDBSQL_FILE_PER_TABLE
7523 /* Reuse dirPathBuf. */
7524 dirPathName = dirPathBuf;
7525 memset(dirPathName, 0, BT_MAX_PATH);
7526 sqlite3_snprintf(sizeof(dirPathName), dirPathName,
7527 "%s/..", pBt->full_name);
7528 pDbEnv->add_data_dir(pDbEnv, dirPathName);
7529 pDbEnv->set_create_dir(pDbEnv, dirPathName);
7530 #else
7531 pDbEnv->add_data_dir(pDbEnv, "..");
7532 #endif
7533 /*
7534 * by definition this function is only called
7535 * for DB_PRIVATE, transactional environments.
7536 * If we hold the write lock it is OK to checkpoint
7537 * during recovery; otherwise do not.
7538 */
7539 pBt->env_oflags = DB_INIT_MPOOL | DB_INIT_LOG | DB_INIT_TXN |
7540 DB_INIT_LOCK | DB_PRIVATE | DB_CREATE | DB_THREAD | DB_RECOVER;
7541 if (!btreeHasFileLock(p, 1))
7542 pBt->env_oflags |= DB_NO_CHECKPOINT;
7543
7544 p->connected = 0;
7545 /* do the open */
7546 rc = openPrivateEnvironment(p, startFamily);
7547 err:
7548 if (!pBt->lockfile.in_env_open)
7549 sqlite3_mutex_leave(mutexOpen);
7550 done:
7551 return MAP_ERR(rc, ret, p);
7552 }
7553
/*
 * lockFile --
 *	Take a POSIX advisory lock on the whole file, blocking until it is
 *	granted: a read (shared) lock when isread is non-zero, otherwise a
 *	write (exclusive) lock.
 *
 *	A wait interrupted by a signal (EINTR) is retried instead of being
 *	reported, because some callers do not check the result and would
 *	otherwise continue without holding the lock.
 *
 *	Returns 0 on success, otherwise the errno value set by fcntl().
 */
static int lockFile(int fd, int isread)
{
	struct flock fl;

	memset(&fl, 0, sizeof(fl));
	fl.l_type = (isread ? F_RDLCK : F_WRLCK);
	fl.l_whence = SEEK_SET;
	fl.l_start = 0;
	fl.l_len = 0;		/* 0 means lock the whole file */

	while (fcntl(fd, F_SETLKW, &fl) < 0) {
		/* TBD -- deal with error better */
		if (errno != EINTR)
			return errno;
	}
	return 0;
}
7568
/*
 * unlockFile --
 *	Release this process's POSIX advisory lock on the whole file.
 *
 *	Returns 0 on success, otherwise the errno value set by fcntl().
 */
static int unlockFile(int fd)
{
	struct flock region;

	memset(&region, 0, sizeof(region));
	region.l_type = F_UNLCK;
	region.l_whence = SEEK_SET;
	region.l_start = 0;
	region.l_len = 0;	/* 0 covers the whole file */

	/* TBD -- deal with error better */
	return (fcntl(fd, F_SETLKW, &region) < 0) ? errno : 0;
}
7583
7584 /*
7585 * create/open the shared lock file, protected by openMutex
7586 * - open or create file
7587 * - initialize file if creating
7588 * - map the file
7589 * - allocate/initialize mutex for the LockFileInfo
7590 * - if the file was created, return with it locked to
7591 * synchronize environment creation as well
7592 */
btreeSetupLockfile(Btree * p,int * createdFile)7593 static int btreeSetupLockfile(Btree *p, int *createdFile)
7594 {
7595 BtShared *pBt;
7596 int fd, ret;
7597 char fname[BT_MAX_PATH];
7598 char initial_bytes[30];
7599 int *ptr;
7600
7601 pBt = p->pBt;
7602 if (pBt->lockfile.fd != 0)
7603 return 0; /* already done */
7604
7605 *createdFile = 0;
7606 /* file is envdir/.lck */
7607 sqlite3_snprintf(sizeof(fname), fname,
7608 "%s/.lck", pBt->dir_name);
7609
7610 /* try a simple open for the common case -- the file exists */
7611 fd = open(fname, O_RDWR , 0);
7612 if (fd < 0) {
7613 /* handle file creation/initialization */
7614 if (errno != ENOENT)
7615 goto err;
7616 fd = open(fname, O_CREAT|O_RDWR, 0666);
7617 if (fd < 0)
7618 goto err;
7619 /* write lock the file to handle initialization race */
7620 lockFile(fd, 0);
7621
7622 /* if the file is non-zero we lost the race -- nothing to do */
7623 if (read(fd, initial_bytes, 4) != 4) {
7624 /* write some data to extend the file size */
7625 sqlite3_snprintf(sizeof(initial_bytes), initial_bytes,
7626 "00000000dontwritehere", 0);
7627 *createdFile = 1;
7628 if (write(fd, initial_bytes, strlen(initial_bytes))
7629 != strlen(initial_bytes))
7630 goto err;
7631 } else
7632 unlockFile(fd);
7633 }
7634
7635 /* allocate mutex for the thread-shared structure */
7636 assert(pBt->lockfile.mutex == 0);
7637 pBt->lockfile.mutex = sqlite3MutexAlloc(SQLITE_MUTEX_FAST);
7638 if (pBt->lockfile.mutex == NULL && sqlite3GlobalConfig.bCoreMutex) {
7639 errno = ENOMEM;
7640 goto err;
7641 }
7642
7643 /* map the file */
7644 if ((pBt->lockfile.mapAddr = mmap(NULL, 4096, PROT_READ|PROT_WRITE,
7645 MAP_SHARED, fd, 0)) == 0)
7646 goto err;
7647
7648 ptr = (int *)(pBt->lockfile.mapAddr);
7649 if (*createdFile) {
7650 ptr[0] = 0;
7651 ptr[1] = 0xdeadbeef; /* for debugging */
7652 *((int *)(pBt->lockfile.mapAddr)) = 0;
7653 pBt->lockfile.writelock_count = 1;
7654 /* returning with lock held */
7655 } else {
7656 assert(ptr[1] == 0xdeadbeef);
7657 }
7658
7659 pBt->lockfile.fd = fd;
7660 pBt->lockfile.generation = ptr[0];
7661 return 0;
7662 err:
7663 if (*createdFile)
7664 unlockFile(fd);
7665 if (fd >= 0)
7666 close(fd);
7667 return errno;
7668 }
7669
/*
 * btreeReadlock --
 *	Register one more reader on the shared lock file.  Only the first
 *	reader, and only when no writer is counted, takes the read file
 *	lock; it then compares the generation number stored in the mapped
 *	file with the cached one and, unless dontreopen is non-zero, reopens
 *	the private environment on a mismatch.
 *
 *	Returns 0 when no work was needed or the work succeeded, otherwise
 *	the value returned by lockFile() or
 *	btreeReopenPrivateEnvironment().
 *
 *	NOTE(review): readlock_count stays incremented when lockFile()
 *	fails -- confirm that callers account for this.
 */
static int btreeReadlock(Btree *p, int dontreopen)
{
	int curGen, ret;
	LockFileInfo *linfo = &p->pBt->lockfile;

	assert(linfo->fd > 0);
	assert(p->pBt->dbStorage == DB_STORE_NAMED);

	/* Nothing to do is success; ret must not be returned unset. */
	ret = 0;

	sqlite3_mutex_enter(linfo->mutex);
	++linfo->readlock_count;

	/*
	 * a waiting writer means writelock_count is non-zero, which
	 * means a free pass -- the readlock will have been locked
	 * by a previous reader.
	 */
	if (linfo->readlock_count == 1 && linfo->writelock_count == 0) {
		if ((ret = lockFile(linfo->fd, 1)) != 0)
			goto err;
		/* check generation number, reopen if mismatch */
		curGen = *((int *)(linfo->mapAddr));
		if (curGen != linfo->generation && dontreopen == 0) {
			/* hold the mutex to lock out racing threads */
			ret = btreeReopenPrivateEnvironment(p);
		}
		linfo->generation = curGen;
	}
err:
	sqlite3_mutex_leave(linfo->mutex);
	return ret;
}
7702
/*
 * btreeWritelock --
 *	Register one more writer on the shared lock file.  The first writer,
 *	or any writer arriving while write_waiting is set, takes the write
 *	file lock, increments the generation number stored in the mapped
 *	file and, unless dontReopen is non-zero, reopens the private
 *	environment when the previous generation differs from the cached
 *	one.
 *
 *	Returns 0 when no work was needed or the work succeeded, otherwise
 *	the errno value from lockFile() or the result of
 *	btreeReopenPrivateEnvironment().
 *
 *	NOTE(review): writelock_count and write_waiting are left set when
 *	lockFile() fails -- confirm that callers account for this.
 */
static int btreeWritelock(Btree *p, int dontReopen)
{
	int curGen, ret;
	int reacquire = 0;
	LockFileInfo *linfo = &p->pBt->lockfile;

	assert(linfo->fd > 0);
	assert(p->pBt->dbStorage == DB_STORE_NAMED);

	/* Nothing to do is success; ret must not be returned unset. */
	ret = 0;

	sqlite3_mutex_enter(linfo->mutex);
	++linfo->writelock_count;
	/* check write_waiting also, to serialize new write lock requests */
	if (linfo->writelock_count == 1 || linfo->write_waiting) {
		/*
		 * indicate that a writer *may* be waiting for a lock
		 * by setting write_waiting.  This will cause future
		 * writers to enter this clause as well.  They will
		 * back up on the lock if it's not yet been acquired.
		 */
		linfo->write_waiting = 1;

		/*
		 * release the mutex if there are active readers; this
		 * allows them to unlock.  Otherwise block future
		 * readers/writers on the mutex while waiting for the file lock
		 */
		if (linfo->readlock_count != 0) {
			reacquire = 1;
			sqlite3_mutex_leave(linfo->mutex);
		}

		/*
		 * The assignment is parenthesized so that ret receives the
		 * errno value from lockFile(), not the result of "!= 0".
		 */
		if ((ret = lockFile(linfo->fd, 0)) != 0)
			goto err;

		if (reacquire) {
			reacquire = 0;
			sqlite3_mutex_enter(linfo->mutex);
		}
		/* clear this flag unconditionally, we have the lock */
		linfo->write_waiting = 0;

		/* get and increment current generation number */
		curGen = *((int *)(linfo->mapAddr));
		*((int *)(linfo->mapAddr)) = curGen+1;
		if (curGen != linfo->generation && dontReopen == 0) {
			/* hold the mutex to lock out racing threads */
			ret = btreeReopenPrivateEnvironment(p);
		}
		linfo->generation = curGen+1;
	}
err:
	/*
	 * reacquire is still set only when lockFile() failed after the
	 * mutex was dropped above, in which case it must not be left again.
	 */
	if (!reacquire)
		sqlite3_mutex_leave(linfo->mutex);
	return ret;
}
7759
/*
 * btreeScopedFileLock --
 *	Dispatch to btreeWritelock() when iswrite is non-zero, otherwise to
 *	btreeReadlock(), passing dontreopen through.  Returns the result of
 *	whichever routine was called.
 */
int btreeScopedFileLock(Btree *p, int iswrite, int dontreopen)
{
	if (iswrite)
		return btreeWritelock(p, dontreopen);
	return btreeReadlock(p, dontreopen);
}
7765
/*
 * btreeFileLock --
 *	Record in p->maintxn_is_write whether the Btree is currently in a
 *	write transaction, then take the matching file lock with reopening
 *	allowed.  Returns the result of btreeScopedFileLock().
 */
static int btreeFileLock(Btree *p)
{
	int inWrite;

	inWrite = (p->inTrans == TRANS_WRITE);
	p->maintxn_is_write = inWrite;
	return btreeScopedFileLock(p, p->maintxn_is_write, 0);
}
7771
/*
 * btreeScopedFileUnlock --
 *	Drop one writer (iswrite non-zero) or one reader from the lock file
 *	counts.  Once no writers remain, the file lock is released if there
 *	are no readers either, or re-requested as a read lock if readers
 *	remain.
 *
 *	Returns 0, or the errno value from unlockFile()/lockFile().
 */
int btreeScopedFileUnlock(Btree *p, int iswrite)
{
	int ret = 0;
	LockFileInfo *linfo = &p->pBt->lockfile;

	assert(linfo->fd > 0);
	assert(p->pBt->dbStorage == DB_STORE_NAMED);

	sqlite3_mutex_enter(linfo->mutex);
	if (iswrite) {
		assert(linfo->writelock_count > 0);
		--linfo->writelock_count;
	} else {
		assert(linfo->readlock_count > 0);
		--linfo->readlock_count;
	}
	/*
	 * if a writer is waiting, writelock_count will be non-zero, which
	 * is enough to suppress the unlock.
	 */
	if (linfo->writelock_count == 0) {
		if (linfo->readlock_count == 0)
			ret = unlockFile(linfo->fd);
		else /* downgrade */
			ret = lockFile(linfo->fd, 1);
	}
	sqlite3_mutex_leave(linfo->mutex);
	return ret;
}
7802
/*
 * btreeFileUnlock --
 *	Release the file lock using the read/write mode recorded in
 *	p->maintxn_is_write.  Returns the result of
 *	btreeScopedFileUnlock().
 */
static int btreeFileUnlock(Btree *p)
{
	int wasWrite = (p->maintxn_is_write != 0);

	return btreeScopedFileUnlock(p, wasWrite);
}
7807
7808 /*
7809 * method to check for some sort of lock.
7810 * do this without acquiring the mutex. It can only be
7811 * called safely when it is known that the process has the
7812 * file lock (either read or write).
7813 */
btreeHasFileLock(Btree * p,int iswrite)7814 int btreeHasFileLock(Btree *p, int iswrite)
7815 {
7816 LockFileInfo *linfo = &p->pBt->lockfile;
7817 if (iswrite)
7818 return (linfo->writelock_count);
7819 else
7820 return (linfo->readlock_count);
7821 }
7822
7823 #endif /* BDBSQL_SHARE_PRIVATE */
7824
/*
 * Berkeley DB needs to be able to compare threads so that we can look up
 * structures that are thread specific.  The implementations are based on the
 * platform-specific SQLite sqlite3_mutex_held implementations.
 */
7830 #ifdef SQLITE_MUTEX_OS2
7831
/*
 * getThreadID -- (OS/2)
 *	Return a copy of the calling thread's TID in memory obtained from
 *	sqlite3DbMallocRaw(db, ...).  On allocation failure, sets
 *	db->mallocFailed and returns NULL.
 *	NOTE(review): the caller presumably owns and frees the result --
 *	confirm.
 */
void *getThreadID(sqlite3 *db)
{
	TID *tid;
	PTIB ptib;	/* thread information block, filled in below */

	tid = (TID *)sqlite3DbMallocRaw(db, sizeof(TID));
	if (tid != NULL) {
		DosGetInfoBlocks(&ptib, NULL);
		memcpy(tid, &ptib->tib_ptib2->tib2_ultid, sizeof(TID));
	} else
		db->mallocFailed = 1;
	return tid;
}
7846
isCurrentThread(void * tid)7847 int isCurrentThread(void *tid)
7848 {
7849 TID threadid;
7850 PTID ptib;
7851
7852 threadid = *((TID *)tid);
7853 DosGetInfoBlocks(&ptib, NULL);
7854 return threadid == ptib->tib_ptib2->tib2_ultid;
7855 }
7856
7857 #elif defined(SQLITE_MUTEX_PTHREADS)
7858
/*
 * getThreadID -- (pthreads)
 *	Return a copy of pthread_self() in memory obtained from
 *	sqlite3DbMallocRaw(db, ...).  On allocation failure, sets
 *	db->mallocFailed and returns NULL.
 *	NOTE(review): the caller presumably owns and frees the result --
 *	confirm.
 */
void *getThreadID(sqlite3 *db)
{
	pthread_t self, *slot;

	slot = (pthread_t *)sqlite3DbMallocRaw(db, sizeof(pthread_t));
	if (slot == NULL) {
		db->mallocFailed = 1;
		return NULL;
	}

	self = pthread_self();
	memcpy(slot, &self, sizeof(pthread_t));
	return slot;
}
7872
/*
 * isCurrentThread -- (pthreads)
 *	Return non-zero if the pthread_t stored at tid identifies the
 *	calling thread, as decided by pthread_equal().
 */
int isCurrentThread(void *tid)
{
	pthread_t owner = *((pthread_t *)tid);

	return pthread_equal(owner, pthread_self());
}
7877
7878 #elif defined(SQLITE_MUTEX_W32)
7879
/*
 * getThreadID -- (Win32)
 *	Return a copy of GetCurrentThreadId() in memory obtained from
 *	sqlite3DbMallocRaw(db, ...).  On allocation failure, sets
 *	db->mallocFailed and returns NULL.
 *	NOTE(review): the caller presumably owns and frees the result --
 *	confirm.
 */
void *getThreadID(sqlite3 *db)
{
	DWORD current, *slot;

	slot = (DWORD *)sqlite3DbMallocRaw(db, sizeof(DWORD));
	if (slot == NULL) {
		db->mallocFailed = 1;
		return NULL;
	}

	current = GetCurrentThreadId();
	memcpy(slot, &current, sizeof(DWORD));
	return slot;
}
7893
isCurrentThread(void * tid)7894 int isCurrentThread(void *tid)
7895 {
7896 DWORD threadid;
7897
7898 threadid = *((DWORD *)tid);
7899 return (threadid == GetCurrentThreadId());
7900 }
7901
7902 #else
7903
/*
 * getThreadID -- (no mutex implementation selected)
 *	There is no thread identity to record here; always returns NULL and
 *	does not use db.
 */
void *getThreadID(sqlite3 *db)
{
	(void)db;
	return NULL;
}
7908
/*
 * isCurrentThread -- (no mutex implementation selected)
 *	Always reports a match (returns 1); tid is not examined.
 */
int isCurrentThread(void *tid)
{
	(void)tid;
	return 1;
}
7913
7914 #endif
7915