1 /*-
2  * See the file LICENSE for redistribution information.
3  *
4  * Copyright (c) 2010, 2013 Oracle and/or its affiliates.  All rights reserved.
5  */
6 
7 /*
8 ** This file implements the sqlite btree.h interface for Berkeley DB.
9 **
10 ** Build-time options:
11 **
12 **  BDBSQL_AUTO_PAGE_SIZE -- Let Berkeley DB choose a default page size.
13 **  BDBSQL_CONCURRENT_CONNECTIONS -- If there are going to be multiple
14 **                           connections to the same database, this can be used
15 **                           to disable a locking optimization.
16 **  BDBSQL_CONVERT_SQLITE -- If an attempt is made to open a SQLite database,
17 **                           convert it on the fly to Berkeley DB.
18 **  BDBSQL_FILE_PER_TABLE -- Don't use sub-databases, use a file per table.
19 **  BDBSQL_OMIT_LEAKCHECK -- Omit combined sqlite and BDB memory allocation.
20 **  BDBSQL_SINGLE_PROCESS -- Keep all environment on the heap (necessary on
21 **                         platforms without mmap).
22 **  BDBSQL_PRELOAD_HANDLES -- Open all tables when first connecting.
23 **  BDBSQL_SINGLE_THREAD -- Omit support for multithreading.
24 **  BDBSQL_SHARE_PRIVATE -- Implies BDBSQL_SINGLE_PROCESS and implements
25 **                          inter-process sharing and synchronization of
26 **                          databases.
27 **  BDBSQL_TXN_SNAPSHOTS_DEFAULT -- Always enable concurrency between read
28 **                                  and write transactions.
29 **  BDBSQL_MEMORY_MAX -- Define the maximum amount of memory (bytes) to be used
30 **                       by shared structures in the main environment region.
31 **  BDBSQL_LOCK_TABLESIZE -- Define the number of buckets in the lock object
32 **                           hash table in the Berkeley DB environment.
33 */
34 
35 #if defined(BDBSQL_CONVERT_SQLITE) && defined(BDBSQL_FILE_PER_TABLE)
36 #error BDBSQL_CONVERT_SQLITE is incompatible with BDBSQL_FILE_PER_TABLE
37 #endif
38 
39 #ifdef BDBSQL_OMIT_SHARING
40 #error BDBSQL_OMIT_SHARING has been replaced by BDBSQL_SINGLE_PROCESS
41 #endif
42 
43 #include <assert.h>
44 
45 #include "sqliteInt.h"
46 #include "btreeInt.h"
47 #include "vdbeInt.h"
48 #include <db.h>
49 #ifdef BDBSQL_SHARE_PRIVATE
50 #include <sys/mman.h>
51 #include <fcntl.h>
52 #endif
53 
54 #ifdef BDBSQL_OMIT_LEAKCHECK
55 #define	sqlite3_malloc malloc
56 #define	sqlite3_free free
57 #define	sqlite3_strdup strdup
58 #else
59 #define	sqlite3_strdup btreeStrdup
60 #endif
61 
62 /*
63  * We use the following internal DB functions.
64  */
65 extern void __os_dirfree(ENV *env, char **namesp, int cnt);
66 extern int __os_dirlist(ENV *env,
67     const char *dir, int returndir, char ***namesp, int *cntp);
68 extern int __os_exists (ENV *, const char *, int *);
69 extern int __os_fileid(ENV *, const char *, int, u_int8_t *);
70 extern int __os_mkdir (ENV *, const char *, int);
71 extern int __os_unlink (ENV *, const char *, int);
72 extern void __os_yield (ENV *, u_long, u_long);
73 
/*
 * The DB_SQL_LOCKER structure is used to unlock a DB handle. The id field must
 * be compatible with the id field of the DB_LOCKER struct. We know the first
 * field will be a "u_int32_t id", so we define just enough of the structure
 * here to use the id field without including lock.h.
 */
80 typedef struct {
81 	u_int32_t id;
82 } DB_SQL_LOCKER;
83 
84 #define	DB_MIN_CACHESIZE 20		/* pages */
85 
86 #define	US_PER_SEC 1000000		/* Microseconds in a second */
87 
88 /* The rowid is never longer than 9 bytes.*/
89 #define	ROWIDMAXSIZE 10
90 
91 /* Forward declarations for internal functions. */
92 static int btreeCleanupCachedHandles(Btree *p, cleanup_mode_t cleanup);
93 static int btreeCloseCursor(BtCursor *pCur, int removeList);
94 static int btreeCompressInt(u_int8_t *buf, u_int64_t i);
95 static int btreeConfigureDbHandle(Btree *p, int iTable, DB **dbpp);
96 static int btreeCreateDataTable(Btree *, int, CACHED_DB **);
97 static int btreeCreateSharedBtree(
98     Btree *, const char *, u_int8_t *, sqlite3 *, int, storage_mode_t);
99 static int btreeCreateTable(Btree *p, int *piTable, int flags);
100 static void btreeHandleDbError(
101     const DB_ENV *dbenv, const char *errpfx, const char *msg);
102 static int btreeDbHandleIsLocked(CACHED_DB *cached_db);
103 static int btreeDbHandleLock(Btree *p, CACHED_DB *cached_db);
104 static int btreeDbHandleUnlock(Btree *p, CACHED_DB *cached_db);
105 static int btreeDecompressInt(const u_int8_t *buf, u_int64_t *i);
106 static void btreeFreeSharedBtree(BtShared *p, int clear_cache);
107 static int btreeGetSharedBtree(
108     BtShared **, u_int8_t *, sqlite3 *, storage_mode_t, int);
109 static int btreeInvalidateHandleCache(Btree *p);
110 static int btreeLoadBufferIntoTable(BtCursor *pCur);
111 static int btreeMoveto(BtCursor *pCur,
112     const void *pKey, i64 nKey, int bias, int *pRes);
113 static int btreePrepareEnvironment(Btree *p);
114 static int btreeRepIsClient(Btree *p);
115 static int btreeRepStartupFinished(Btree *p);
116 static int btreeRestoreCursorPosition(BtCursor *pCur, int skipMoveto);
117 static int btreeSetUpReplication(Btree *p, int master, u8 *replicate);
118 static int btreeTripAll(Btree *p, int iTable, int incrblobUpdate);
119 static int btreeTripWatchers(BtCursor *pBt, int incrblobUpdate);
120 static int indexIsCollated(KeyInfo *keyInfo);
121 static int supportsDuplicates(DB *db);
122 #ifdef BDBSQL_SHARE_PRIVATE
123 static int btreeFileLock(Btree *p);
124 static int btreeFileUnlock(Btree *p);
125 static int btreeReopenPrivateEnvironment(Btree *p);
126 static int btreeSetupLockfile(Btree *p, int *createdFile);
127 #endif
128 
129 /*
130  * Flags for btreeFindOrCreateDataTable
131  * Defined in btree.h:
132  * #define BTREE_INTKEY     1
133  * #define BTREE_BLOBKEY    2
134  */
135 #define BTREE_CREATE 4	/* If we want to create the table */
136 
137 /* Globals are protected by the static "open" mutex (SQLITE_MUTEX_STATIC_OPEN).
138  */
139 
140 /* The head of the linked list of shared Btree objects */
141 struct BtShared *g_shared_btrees = NULL;
142 
143 /* The environment handle used for temporary environments (NULL or open). */
144 DB_ENV *g_tmp_env;
145 
146 /* The unique id for the next shared Btree object created. */
147 u_int32_t g_uid_next = 0;
148 
149 /* Number of times we're prepared to try multiple gets. */
150 #define	MAX_SMALLS 100
151 
152 /* Number of times to retry operations that return a "busy" error. */
153 #define	BUSY_RETRY_COUNT	100
154 
155 /* TODO: This should probably be '\' on Windows. */
156 #define	PATH_SEPARATOR	"/"
157 
/*
 * Shorthand accessors.  These expand in terms of an identifier named "pCur",
 * so they can only be used where a cursor variable of that name is in scope.
 */
#define	pBDb	(pCur->cached_db->dbp)
#define	pDbc	(pCur->dbc)
#define	pIntKey	((pCur->flags & BTREE_INTKEY) != 0)
#define	pIsBuffer	(pCur->pBtree->pBt->resultsBuffer)

/*
 * Format the name of table "i" into buffer "b" ("sz" bytes), prepending
 * "prefix".  Named storage uses "<prefix>tableNNNNN", in-memory storage uses
 * "<prefix>tempUUUUU_NNNNN" (UUUUU being pBt->uid); for any other storage
 * mode "b" is set to NULL.  Expands in terms of a variable named "pBt" that
 * must be in scope at the point of use, and "b" must be an assignable
 * pointer.
 */
#define	GET_TABLENAME(b, sz, i, prefix)	do {			\
	if (pBt->dbStorage == DB_STORE_NAMED)			\
		sqlite3_snprintf((sz), (b), "%stable%05d",	\
		(prefix), (i));					\
	else if (pBt->dbStorage == DB_STORE_INMEM)		\
		sqlite3_snprintf((sz), (b), "%stemp%05d_%05d",	\
		    (prefix), pBt->uid, (i));			\
	else							\
		(b) = NULL;					\
} while (0)

/* True when the storage is named and the journal has not been omitted. */
#define	GET_DURABLE(pBt)					\
	((pBt)->dbStorage == DB_STORE_NAMED &&			\
	((pBt)->flags & BTREE_OMIT_JOURNAL) == 0)

/* 1 if the shared btree is marked read-only, otherwise 0. */
#define	IS_ENV_READONLY(pBt)					\
	((pBt)->readonly ? 1 : 0)
/* DB_RDONLY if the shared btree is marked read-only, otherwise 0. */
#define	GET_ENV_READONLY(pBt)					\
	(IS_ENV_READONLY(pBt) ? DB_RDONLY : 0)
/* 1 if either the Btree handle or its shared btree is read-only. */
#define	IS_BTREE_READONLY(p)					\
	(((p)->readonly || IS_ENV_READONLY((p)->pBt)) ? 1 : 0)

/*
 * DB_RMW for write cursors on named storage, otherwise 0.  Always 0 in
 * single-threaded builds.
 */
#ifndef BDBSQL_SINGLE_THREAD
#define	RMW(pCur)						\
    ((pCur)->wrFlag &&						\
    (pCur)->pBtree->pBt->dbStorage == DB_STORE_NAMED ?		\
    DB_RMW : 0)
#else
#define	RMW(pCur) 0
#endif

/*
 * Isolation flags for reads made through Btree "p": 0 when the shared btree
 * is not transactional; otherwise DB_READ_UNCOMMITTED or DB_READ_COMMITTED
 * (chosen by the connection's SQLITE_ReadUncommitted flag), OR'ed with
 * DB_TXN_SNAPSHOT when that bit is set in read_txn_flags.  Always 0 in
 * single-threaded builds.
 */
#ifdef BDBSQL_SINGLE_THREAD
#define	GET_BTREE_ISOLATION(p)	0
#else
#define	GET_BTREE_ISOLATION(p) (!(p)->pBt->transactional ? 0 :	\
	(((p)->db->flags & SQLITE_ReadUncommitted) ?		\
	DB_READ_UNCOMMITTED : DB_READ_COMMITTED) |		\
	(((p)->pBt->read_txn_flags & DB_TXN_SNAPSHOT) ?		\
	DB_TXN_SNAPSHOT : 0))
#endif

/*
 * The transaction for incrblobs is held in the cursor, so when a deadlock
 * happens the cursor transaction must be aborted instead of the statement
 * transaction.
 *
 * Note that this expands to a bare "if" statement that executes
 * "return SQLITE_LOCKED", so it may only be used as a statement inside a
 * function returning an SQLite result code.
 */
#define	HANDLE_INCRBLOB_DEADLOCK(ret, pCur)			\
	if ((ret) == DB_LOCK_DEADLOCK &&			\
	    (pCur)->isIncrblobHandle) {				\
		if (!(pCur)->wrFlag)				\
			(pCur)->pBtree->read_txn = NULL;	\
		if ((pCur)->txn == (pCur)->pBtree->savepoint_txn)	\
			(pCur)->pBtree->savepoint_txn =		\
			    (pCur)->pBtree->savepoint_txn->parent;	\
		(pCur)->txn->abort((pCur)->txn);		\
		(pCur)->txn = NULL;				\
		return SQLITE_LOCKED;				\
	}

/* Decide which transaction to use when reading the meta data table. */
#define	GET_META_TXN(p)					\
	((p)->txn_excl ? pSavepointTxn :		\
		(pReadTxn ? pReadTxn : pFamilyTxn))

/* Decide which flags to use when reading the meta data table. */
#define	GET_META_FLAGS(p)				\
	(((p)->txn_excl ? DB_RMW : 0) |			\
	    (GET_BTREE_ISOLATION(p) & ~DB_TXN_SNAPSHOT))
227 
/*
 * Translate a Berkeley DB (or errno-style) return value into an SQLite
 * result code.
 *
 * err -- the value returned by Berkeley DB; 0 means success.
 * p   -- the Btree the operation was performed on, or NULL.
 *
 * Values without a specific mapping become SQLITE_ERROR.
 *
 * When p is non-NULL and its shared btree holds a pending error message
 * (pBt->err_msg), the connection's error state is updated: on failure the
 * message is attached to the error, on success the error is cleared.  In
 * both cases the pending message is then freed and reset to NULL.
 */
int dberr2sqlite(int err, Btree *p)
{
	BtShared *pBt;
	int ret;

	switch (err) {
	case 0:
		ret = SQLITE_OK;
		break;
	case DB_LOCK_DEADLOCK:
	case DB_LOCK_NOTGRANTED:
	case DB_REP_JOIN_FAILURE:
		ret = SQLITE_BUSY;
		break;
	case DB_NOTFOUND:
		ret = SQLITE_NOTFOUND;
		break;
	case DB_RUNRECOVERY:
		ret = SQLITE_CORRUPT;
		break;
	case EACCES:
		ret = SQLITE_READONLY;
		break;
	case EIO:
		ret = SQLITE_IOERR;
		break;
	case EPERM:
		ret = SQLITE_PERM;
		break;
	case ENOMEM:
		ret = SQLITE_NOMEM;
		break;
	case ENOENT:
		ret = SQLITE_CANTOPEN;
		break;
	case ENOSPC:
		ret = SQLITE_FULL;
		break;
	default:
		ret = SQLITE_ERROR;
		break;
	}

	if (p == NULL)
		return ret;

	pBt = p->pBt;
	if (pBt != NULL && pBt->err_msg != NULL) {
		/*
		 * The third argument of sqlite3Error() is a format string.
		 * The stored message is arbitrary text, so pass it as data
		 * through "%s" rather than as the format itself.
		 */
		if (ret != SQLITE_OK)
			sqlite3Error(p->db, ret, "%s", pBt->err_msg);
		else
			sqlite3Error(p->db, ret, NULL);
		sqlite3_free(pBt->err_msg);
		pBt->err_msg = NULL;
	}
	return ret;
}
284 
285 /*
286  * Close db handle and cleanup resource (e.g.: remove in-memory db)
287  * automatically.
288  *
289  * Note: closeDB is more dangerous than dbp->close since it would remove
290  * in-memory db. Generally, closeDB should only be used instead of dbp->close
291  * when:
292  *   1. Cleanup cached handles.
293  *   2. DB handle creating fails. Safe because no one own this uncreated handle.
294  *   3. Drop Tables.
295  *
296  * In other cases (error handlers, vacuum , backup, etc.), closeDB should not
297  * be called anyway. That's because the db might be required by other
298  * connections.
299  */
closeDB(Btree * p,DB * dbp,u_int32_t flags)300 int closeDB(Btree *p, DB *dbp, u_int32_t flags)
301 {
302 	char *tableName, *fileName, tableNameBuf[DBNAME_SIZE];
303 	u_int32_t remove_flags;
304 	int ret, needRemove;
305 	BtShared *pBt;
306 
307 	tableName = NULL;
308 	fileName = NULL;
309 	needRemove = 0;
310 
311 	if (p == NULL || (pBt = p->pBt) == NULL || dbp == NULL)
312 		return 0;
313 
314 	/*
315 	 * In MPOOL, Named in-memory databases get an artificially bumped
316 	 * reference count so they don't disappear on close; they need a
317 	 * remove to make them disappear.
318 	 */
319 	if (pBt->dbStorage == DB_STORE_INMEM &&
320 	    (dbp->flags & DB_AM_OPEN_CALLED))
321 		needRemove = 1;
322 
323 	/*
324 	 * Save tableName into buf for subsquent dbremove. The buf is required
325 	 * since tableName would be destroyed after db is closed.
326 	 */
327 	if (needRemove && (dbp->get_dbname(dbp, (const char **)&fileName,
328 	   (const char**)&tableName) == 0)) {
329 		strncpy(tableNameBuf, tableName, sizeof(tableNameBuf) - 1);
330 		tableName = tableNameBuf;
331 	}
332 
333 	ret = dbp->close(dbp, flags);
334 
335 	/*
336 	 * Do removes as needed to prevent mpool leak. pSavepointTxn is
337 	 * required since the operations might be rollbacked.
338 	 */
339 	if (needRemove) {
340 		remove_flags = DB_NOSYNC;
341 		if (!GET_DURABLE(pBt))
342 			remove_flags |= DB_TXN_NOT_DURABLE;
343 		if (pSavepointTxn == NULL)
344 			remove_flags |= (DB_AUTO_COMMIT | DB_LOG_NO_DATA);
345 		(void)pDbEnv->dbremove(pDbEnv, pSavepointTxn, fileName,
346 			tableName, remove_flags);
347 	}
348 
349 	return ret;
350 }
351 
352 #define ERR_FILE_NAME "sql-errors.txt"
btreeGetErrorFile(const BtShared * pBt,char * fname)353 void btreeGetErrorFile(const BtShared *pBt, char *fname) {
354 	if (pBt == NULL)
355 		/* No env directory, use the current working directory. */
356                 sqlite3_snprintf(BT_MAX_PATH, fname, ERR_FILE_NAME);
357 	else {
358 		sqlite3_mutex_enter(pBt->mutex);
359 		if (pBt->err_file == NULL)
360 			sqlite3_snprintf(BT_MAX_PATH, fname,
361 			    "%s/%s", pBt->dir_name, ERR_FILE_NAME);
362 		else
363 			sqlite3_snprintf(BT_MAX_PATH, fname,
364 			    "%s", pBt->err_file);
365 		sqlite3_mutex_leave(pBt->mutex);
366 	}
367 }
368 
btreeHandleDbError(const DB_ENV * dbenv,const char * errpfx,const char * msg)369 static void btreeHandleDbError(
370 	const DB_ENV *dbenv,
371 	const char *errpfx,
372 	const char *msg
373 ) {
374 	BtShared *pBt;
375 	FILE *fp;
376 	char fname[BT_MAX_PATH];
377 
378 	/* Store the error msg to pBt->err_msg for future use. */
379 	pBt = (BtShared *)dbenv->app_private;
380 	if (pBt && (errpfx || msg)) {
381 		if (pBt->err_msg != NULL)
382 			sqlite3_free(pBt->err_msg);
383 		pBt->err_msg = sqlite3_mprintf("%s:%s", errpfx, msg);
384 	}
385 
386 	/*
387 	 * If error_file is set, flush the error to the error file. Else flush
388 	 * the error msg to stderr.
389 	 * Simply igore the error return from btreeGetErrorFile since we're
390 	 * in the error handle routine.
391 	 */
392 	btreeGetErrorFile(pBt, fname);
393 	fp = fopen(fname, "a");
394 	if (fp == NULL)
395 		fp = stderr;
396 
397 	fprintf(fp, "%s:%s\n", errpfx, msg);
398 	if (fp != stderr) {
399 		fflush(fp);
400 		fclose(fp);
401 	}
402 }
403 
404 /*
405  * Used in cases where SQLITE_LOCKED should be returned instead of
406  * SQLITE_BUSY.
407  */
dberr2sqlitelocked(int err,Btree * p)408 static int dberr2sqlitelocked(int err, Btree *p)
409 {
410 	int rc = dberr2sqlite(err, p);
411 	if (rc == SQLITE_BUSY)
412 		rc = SQLITE_LOCKED;
413 	return rc;
414 }
415 
416 #ifndef NDEBUG
/*
 * Debug logging: print a printf-style message followed by a newline to
 * stdout and flush it, provided level is at least CURRENT_LOG_LEVEL.
 */
void log_msg(loglevel_t level, const char *fmt, ...)
{
	va_list args;

	/* Messages below the current threshold are dropped. */
	if (level < CURRENT_LOG_LEVEL)
		return;

	va_start(args, fmt);
	vfprintf(stdout, fmt, args);
	va_end(args);

	fputc('\n', stdout);
	fflush(stdout);
}
428 #endif
429 
430 #ifdef BDBSQL_FILE_PER_TABLE
getMetaDataFileName(const char * full_name,char ** filename)431 int getMetaDataFileName(const char *full_name, char **filename)
432 {
433 	*filename = sqlite3_malloc(strlen(full_name) +
434 		strlen(BDBSQL_META_DATA_TABLE) + 2);
435 	if (*filename == NULL)
436 		return SQLITE_NOMEM;
437 	strcpy(*filename, full_name);
438 	strcpy(*filename + strlen(full_name), PATH_SEPARATOR);
439 	strcpy(*filename + strlen(full_name) + 1, BDBSQL_META_DATA_TABLE);
440 	return SQLITE_OK;
441 }
442 #endif
443 
444 #ifndef BDBSQL_OMIT_LEAKCHECK
445 /*
446  * Wrap the sqlite malloc and realloc APIs before using them in Berkeley DB
447  * since they use different parameter types to the standard malloc and
448  * realloc.
449  * The signature of free matches, so we don't need to wrap it.
450  */
/*
 * malloc-shaped wrapper around sqlite3_malloc(), which takes an int size.
 * Returns NULL for sizes that cannot be represented as an int.
 */
static void *btreeMalloc(size_t size)
{
	const int nByte = (int)size;

	/* Only sizes that survive the round trip through int are usable. */
	return ((size_t)nByte == size) ? sqlite3_malloc(nByte) : NULL;
}
458 
/*
 * realloc-shaped wrapper around sqlite3_realloc(), which takes an int size.
 * Returns NULL (leaving buff untouched) for sizes that cannot be
 * represented as an int.
 */
static void *btreeRealloc(void * buff, size_t size)
{
	const int nByte = (int)size;

	/* Only sizes that survive the round trip through int are usable. */
	return ((size_t)nByte == size) ? sqlite3_realloc(buff, nByte) : NULL;
}
466 
/*
 * strdup replacement that allocates the copy through SQLite's allocator
 * (via sqlite3_mprintf), so it must be released with sqlite3_free().
 */
static char *btreeStrdup(const char *sq)
{
	char *copy;

	copy = sqlite3_mprintf("%s", sq);
	return copy;
}
471 #endif
472 
/*
 * Key comparison callback for integer-keyed tables.  Both keys must be
 * exactly sizeof(i64) bytes; they are compared as signed 64-bit integers in
 * host byte order.  Returns -1, 0 or 1.  The dbp argument is unused.
 */
static int btreeCompareIntKey(DB *dbp, const DBT *dbt1, const DBT *dbt2)
{
	i64 lhs, rhs;

	assert(dbt1->size == sizeof(i64));
	assert(dbt2->size == sizeof(i64));

	/* Copy out rather than cast: the key data may not be aligned. */
	memcpy(&lhs, dbt1->data, sizeof(lhs));
	memcpy(&rhs, dbt2->data, sizeof(rhs));

	if (lhs == rhs)
		return 0;
	return (lhs < rhs) ? -1 : 1;
}
485 
486 #ifdef BDBSQL_CONVERT_SQLITE
/*
 * Convert the SQLite database file pBt->full_name to Berkeley DB format in
 * place by running an external shell pipeline, then reset the LSNs of the
 * converted file using tmp_env.
 *
 * Returns 0 on success.  Returns EINVAL without running anything if the
 * file name cannot be embedded safely in the shell command; otherwise
 * returns the non-zero result of system() or of lsn_reset().
 */
static int btreeConvertSqlite(BtShared *pBt, DB_ENV *tmp_env)
{
	char convert_cmd[BT_MAX_PATH + 200];
	int ret;
#ifdef ANDROID
	const char* dbsql_shell = "sqlite3";
	const char* sqlite_shell = "sqlite3orig";
#else
	const char* dbsql_shell = "dbsql";
	const char* sqlite_shell = "sqlite3";
#endif

	log_msg(LOG_NORMAL, "Attempting to convert %s", pBt->full_name);

	/*
	 * The file name is spliced into a single-quoted shell word below.  A
	 * name containing a single quote would end the quoting and let the
	 * remainder of the name be interpreted by the shell, so refuse it.
	 */
	if (strchr(pBt->full_name, '\'') != NULL)
		return (EINVAL);

	/*
	 * We're going to attempt to convert a SQLite database to Berkeley DB.
	 * The main complication is that we may have already created an
	 * environment in the journal directory.  This will prevent SQLite from
	 * accessing the database with the same name.  Also, if we try to start
	 * a dbsql with that name to create the new file, that will destroy the
	 * environment we just created.
	 *
	 * So, the process is:
	 *   1. rename the file
	 *   2. dump / load to another name (in Berkeley DB format)
	 *   3. rename file 2 to the original name
	 *   4. if everything worked, remove file 1
	 *   5. if anything went wrong, rename file 1 back to
	 *      the original name.
	 *
	 * Use variables in the script to avoid sending in the filename
	 * lots of times.
	 */
	sqlite3_snprintf(sizeof(convert_cmd), convert_cmd,
	    "f='%s' ; t=\"$f-bdbtmp\" ; mv \"$f\" \"$t-1\" || exit $? "
	    "; ((echo PRAGMA txn_bulk=1';' PRAGMA user_version="
		"`%s \"$t-1\" 'pragma user_version'`';'"
	    "  ; %s \"$t-1\" .dump) | %s \"$t-2\""
	    " && mv \"$t-2\" \"$f\" && rm -r \"$t-2-journal\" && rm \"$t-1\")"
	    "|| mv \"$t-1\" \"$f\"",
	    pBt->full_name, sqlite_shell, sqlite_shell, dbsql_shell);

	/*
	 * sqlite3_snprintf() truncates silently.  A completely full buffer
	 * means the script may have lost its tail (including the step that
	 * restores the original file), so do not run it.
	 */
	if (strlen(convert_cmd) >= sizeof(convert_cmd) - 1)
		return (EINVAL);

	if ((ret = system(convert_cmd)) != 0)
		return (ret);

	/*
	 * If all of that worked, we need to reset LSNs before we can
	 * open that database file in our environment.  That has to be
	 * done in a temporary environment to avoid LSN checks...
	 */
	log_msg(LOG_NORMAL, "Resetting LSNs in %s", pBt->full_name);
	ret = tmp_env->lsn_reset(tmp_env, pBt->full_name, 0);

	return (ret);
}
542 #endif
543 
544 /*
545  * An internal function that opens the metadata database that is present for
546  * every SQLite Btree, and the special "tables" database maintained by Berkeley
547  * DB that lists all of the subdatabases in a file.
548  *
549  * This is split out into a separate function so that it will be easy to change
550  * the Btree layer to create Berkeley DB database handles per Btree object,
551  * rather than per BtShared object.
552  */
btreeOpenMetaTables(Btree * p,int * pCreating)553 int btreeOpenMetaTables(Btree *p, int *pCreating)
554 {
555 	BtShared *pBt;
556 	DBC *dbc;
557 	DBT key, data;
558 	DB_ENV *tmp_env;
559 	char *fileName;
560 	int i, idx, rc, ret, t_ret;
561 	u32 val;
562 #ifdef BDBSQL_FILE_PER_TABLE
563 	char **dirnames;
564 	int cnt;
565 #endif
566 
567 	pBt = p->pBt;
568 	rc = SQLITE_OK;
569 	ret = t_ret = 0;
570 
571 	if (pBt->lsn_reset != NO_LSN_RESET) {
572 		/*
573 		 * Reset the LSNs in the database, so that we can open the
574 		 * database in a new environment.
575 		 *
576 		 * This is the first time we try to open the database file, so
577 		 * an EINVAL error may indicate an attempt to open a SQLite
578 		 * database.
579 		 */
580 		ret = db_env_create(&tmp_env, 0);
581 		if (ret != 0)
582 			goto err;
583 		tmp_env->set_errcall(tmp_env, NULL);
584 		if (pBt->encrypted) {
585 			ret = tmp_env->set_encrypt(tmp_env,
586 			    pBt->encrypt_pwd, DB_ENCRYPT_AES);
587 			if (ret != 0)
588 				goto err;
589 		}
590 		ret = tmp_env->open(
591 		    tmp_env, NULL, DB_CREATE | DB_PRIVATE | DB_INIT_MPOOL, 0);
592 		while (ret == 0 && pBt->lsn_reset == LSN_RESET_FILE) {
593 			ret = tmp_env->lsn_reset(tmp_env, pBt->full_name, 0);
594 #ifdef BDBSQL_CONVERT_SQLITE
595 			if (ret == EINVAL &&
596 			    btreeConvertSqlite(pBt, tmp_env) == 0) {
597 				ret = 0;
598 				continue;
599 			}
600 #endif
601 			break;
602 		}
603 		if (ret == EINVAL)
604 			rc = SQLITE_NOTADB;
605 #ifdef BDBSQL_FILE_PER_TABLE
606 		__os_dirlist(NULL, pBt->full_name, 0, &dirnames, &cnt);
607 		for (i = 0; i < cnt; i++)
608 			(void)tmp_env->lsn_reset(tmp_env, dirnames[i], 0);
609 		__os_dirfree(NULL, dirnames, cnt);
610 #endif
611 		if ((t_ret = tmp_env->close(tmp_env, 0)) != 0 &&
612 		    ret == 0)
613 			ret = t_ret;
614 		if (ret != 0)
615 			goto err;
616 		pBt->lsn_reset = NO_LSN_RESET;
617 	}
618 
619 	if (pMetaDb != NULL) {
620 		*pCreating = 0;
621 		goto addmeta;
622 	}
623 
624 	/*
625 	 * We open the metadata and tables databases in auto-commit
626 	 * transactions.  These may deadlock or conflict, and should be safe to
627 	 * retry, but for safety we limit how many times we'll do that before
628 	 * returning the error.
629 	 */
630 	i = 0;
631 	do {
632 		if ((ret = db_create(&pMetaDb, pDbEnv, 0)) != 0)
633 			goto err;
634 
635 		if (pBt->encrypted &&
636 		    ((ret = pMetaDb->set_flags(pMetaDb, DB_ENCRYPT)) != 0))
637 				goto err;
638 
639 		if (!GET_DURABLE(pBt)) {
640 			/* Ensure that log records are not written to disk. */
641 			if ((ret =
642 			    pMetaDb->set_flags(pMetaDb, DB_TXN_NOT_DURABLE))
643 			    != 0)
644 				goto err;
645 		}
646 
647 		/*
648 		 * The metadata DB is the first one opened in the file, so it
649 		 * is sufficient to set the page size on it -- other databases
650 		 * in the same file will inherit the same pagesize.  We must
651 		 * open it before the table DB because this open call may be
652 		 * creating the file.
653 		 */
654 		if (pBt->pageSize != 0 &&
655 		    (ret = pMetaDb->set_pagesize(pMetaDb, pBt->pageSize)) != 0)
656 			goto err;
657 
658 		pBt->pageSizeFixed = 1;
659 
660 #ifdef BDBSQL_FILE_PER_TABLE
661 		fileName = BDBSQL_META_DATA_TABLE;
662 #else
663 		fileName = pBt->short_name;
664 #endif
665 		ret = pMetaDb->open(pMetaDb, NULL, fileName,
666 		    pBt->dbStorage == DB_STORE_NAMED ? "metadb" : NULL,
667 		    DB_BTREE,
668 		    pBt->db_oflags | GET_AUTO_COMMIT(pBt, NULL) |
669 		    GET_ENV_READONLY(pBt), 0);
670 
671 		if (ret == DB_LOCK_DEADLOCK || ret == DB_LOCK_NOTGRANTED) {
672 			(void)pMetaDb->close(pMetaDb, DB_NOSYNC);
673 			pMetaDb = NULL;
674 		}
675 	} while ((ret == DB_LOCK_DEADLOCK || ret == DB_LOCK_NOTGRANTED) &&
676 	    ++i < BUSY_RETRY_COUNT);
677 
678 	if (ret != 0) {
679 		if (ret == EACCES && IS_ENV_READONLY(pBt))
680 			rc = SQLITE_READONLY;
681 		else if (ret == EINVAL)
682 			rc = SQLITE_NOTADB;
683 		goto err;
684 	}
685 
686 	/* Set the default max_page_count */
687 	sqlite3BtreeMaxPageCount(p, pBt->pageCount);
688 
689 	if (pBt->dbStorage != DB_STORE_NAMED)
690 		goto addmeta;
691 
692 	i = 0;
693 	do {
694 		/* Named databases use a db to track new table names. */
695 		if ((ret = db_create(&pTablesDb, pDbEnv, 0)) != 0)
696 			goto err;
697 
698 		if (pBt->encrypted &&
699 		    ((ret = pTablesDb->set_flags(pTablesDb, DB_ENCRYPT)) != 0))
700 				goto err;
701 #ifdef BDBSQL_FILE_PER_TABLE
702 		/*
703 		 * When opening a file-per-table we need an additional table to
704 		 * track the names of tables within the database.
705 		 */
706 		ret = pTablesDb->open(pTablesDb, NULL, fileName,
707 		    "tables", DB_BTREE, (pBt->db_oflags) |
708 		     GET_AUTO_COMMIT(pBt, NULL), 0);
709 		/*
710 		 * Insert an entry for the metadata table, so the usage of
711 		 * this table matches the sub-db cursor in the non-split case.
712 		 */
713 		memset(&key, 0, sizeof(key));
714 		memset(&data, 0, sizeof(data));
715 		key.data = "metadb";
716 		key.size = 6;
717 		pTablesDb->put(pTablesDb, NULL, &key, &data, 0);
718 #else
719 		ret = pTablesDb->open(pTablesDb, NULL, fileName,
720 		    NULL, DB_BTREE, (pBt->db_oflags & ~DB_CREATE) |
721 		    DB_RDONLY | GET_AUTO_COMMIT(pBt, NULL), 0);
722 #endif
723 		if (ret == DB_LOCK_DEADLOCK || ret == DB_LOCK_NOTGRANTED) {
724 			(void)pTablesDb->close(pTablesDb, DB_NOSYNC);
725 			pTablesDb = NULL;
726 		}
727 	} while ((ret == DB_LOCK_DEADLOCK || ret == DB_LOCK_NOTGRANTED) &&
728 	    ++i < BUSY_RETRY_COUNT);
729 
730 	if (ret != 0)
731 		goto err;
732 
733 	/* Check whether we're creating the database */
734 	if ((ret = pTablesDb->cursor(pTablesDb, pFamilyTxn, &dbc, 0)) != 0)
735 		goto err;
736 
737 	memset(&key, 0, sizeof(key));
738 	memset(&data, 0, sizeof(data));
739 	data.flags = DB_DBT_PARTIAL | DB_DBT_USERMEM;
740 	ret = dbc->get(dbc, &key, &data, DB_LAST);
741 	if (ret == 0)
742 		*pCreating =
743 		    (strncmp((const char *)key.data, "metadb", key.size) == 0);
744 	if ((t_ret = dbc->close(dbc)) != 0 && ret == 0)
745 		ret = t_ret;
746 	if (ret != 0)
747 		goto err;
748 
749 addmeta:/*
750 	 * Populate the MetaDb with any values that were set prior to
751 	 * the sqlite3BtreeOpen that triggers this.
752 	 */
753 	for (idx = 0; idx < NUMMETA; idx++) {
754 		if (pBt->meta[idx].cached)
755 			val = pBt->meta[idx].value;
756 		else if (idx == BTREE_LARGEST_ROOT_PAGE && *pCreating)
757 			val = pBt->autoVacuum;
758 		else if (idx == BTREE_INCR_VACUUM && *pCreating)
759 			val = pBt->incrVacuum;
760 		else
761 			continue;
762 		if ((rc = sqlite3BtreeUpdateMeta(p, idx, val)) != SQLITE_OK)
763 			goto err;
764 	}
765 
766 	if (!*pCreating) {
767 		/* This matches SQLite, I don't understand the naming. */
768 		sqlite3BtreeGetMeta(p, BTREE_LARGEST_ROOT_PAGE, &val);
769 		if (p->db->errCode == SQLITE_BUSY) {
770 			rc = SQLITE_BUSY;
771 			goto err;
772 		}
773 		pBt->autoVacuum = (u8)val;
774 		sqlite3BtreeGetMeta(p, BTREE_INCR_VACUUM, &val);
775 		if (p->db->errCode == SQLITE_BUSY) {
776 			rc = SQLITE_BUSY;
777 			goto err;
778 		}
779 		pBt->incrVacuum = (u8)val;
780 	}
781 
782 err:	if (rc != SQLITE_OK || ret != 0) {
783 		if (pTablesDb != NULL)
784 			(void)pTablesDb->close(pTablesDb, DB_NOSYNC);
785 		if (pMetaDb != NULL)
786 			(void)pMetaDb->close(pMetaDb, DB_NOSYNC);
787 		pTablesDb = pMetaDb = NULL;
788 	}
789 
790 	return MAP_ERR(rc, ret, p);
791 }
792 
/*
 * Parse the numeric id out of a table name of the form "table<digits>".
 *
 * Berkeley DB doesn't NUL-terminate database names, so the conversion is
 * done manually to avoid making a copy just in order to call strtol.
 *
 * subdb -- the name; need not be NUL-terminated.
 * len   -- number of bytes in subdb; must be greater than 5.
 * pid   -- receives the id on success, and is left untouched on failure.
 *
 * Returns 0 on success, or EINVAL if a non-digit follows the "table" prefix
 * or the id does not fit in a signed 32-bit integer.
 */
int btreeTableNameToId(const char *subdb, int len, int *pid)
{
	const char *p;
	int digit, id;

	assert(len > 5);
	assert(strncmp(subdb, "table", 5) == 0);

	id = 0;
	for (p = subdb + 5; p < subdb + len; p++) {
		if (*p < '0' || *p > '9')
			return (EINVAL);
		digit = *p - '0';
		/*
		 * Signed overflow is undefined, so check that the next step
		 * stays in range before performing it.
		 */
		if (id > (0x7fffffff - digit) / 10)
			return (EINVAL);
		id = (id * 10) + digit;
	}
	*pid = id;
	return (0);
}
814 
815 #ifdef BDBSQL_PRELOAD_HANDLES
/*
 * Walk the tables database and open a data table handle for every entry
 * named "table<id>".
 *
 * Failures to open an individual table are ignored.  Returns 0 once the
 * whole list has been visited, or the Berkeley DB error that stopped the
 * scan (including EINVAL for a "table" entry whose id cannot be parsed).
 */
static int btreePreloadHandles(Btree *p)
{
	BtShared *pBt;
	CACHED_DB *cached_db;
	DBC *dbc;
	DBT key, data;
	int iTable, ret;

	pBt = p->pBt;
	dbc = NULL;

	if ((ret = pTablesDb->cursor(pTablesDb, NULL, &dbc, 0)) != 0)
		goto err;

	/* Only the keys are wanted: request zero bytes of data. */
	memset(&key, 0, sizeof(key));
	memset(&data, 0, sizeof(data));
	data.flags = DB_DBT_PARTIAL | DB_DBT_USERMEM;

	sqlite3_mutex_enter(pBt->mutex);
	while ((ret = dbc->get(dbc, &key, &data, DB_NEXT)) == 0) {
		/*
		 * Names are not NUL-terminated, so check the length before
		 * comparing the prefix.  A usable name needs at least one
		 * digit after "table".
		 */
		if (key.size <= 5 ||
		    strncmp((const char *)key.data, "table", 5) != 0)
			continue;
		if ((ret = btreeTableNameToId(
		    (const char *)key.data, (int)key.size, &iTable)) != 0)
			break;
		cached_db = NULL;
		(void)btreeCreateDataTable(p, iTable, &cached_db);
	}
	sqlite3_mutex_leave(pBt->mutex);

	/* Running off the end of the list is the normal way to finish. */
err:	if (ret == DB_NOTFOUND)
		ret = 0;
	if (dbc != NULL)
		(void)dbc->close(dbc);
	return (ret);
}
852 #endif /* BDBSQL_PRELOAD_HANDLES */
853 
854 /*
855 ** Free an allocated BtShared and any dependent allocated objects.
856 */
btreeFreeSharedBtree(BtShared * p,int clear_cache)857 static void btreeFreeSharedBtree(BtShared *p, int clear_cache)
858 {
859 	BtShared *tmp_bt;
860 
861 	if (p == NULL)
862 		return;
863 
864 #ifdef BDBSQL_SHARE_PRIVATE
865 	/* close the shared lockfile */
866 	if (p->lockfile.fd > 0)
867 		(void)close(p->lockfile.fd);
868 	if (p->lockfile.mutex != NULL)
869 		sqlite3_mutex_free(p->lockfile.mutex);
870 #endif
871 	if (clear_cache) {
872 		if (p == g_shared_btrees && p->pNextDb == NULL)
873 			g_shared_btrees = NULL;
874 		else if (p == g_shared_btrees) {
875 			g_shared_btrees = p->pNextDb;
876 			g_shared_btrees->pPrevDb = NULL;
877 		} else if (p->pNextDb == NULL)
878 			p->pPrevDb->pNextDb = NULL;
879 		else {
880 			tmp_bt = p->pPrevDb;
881 			p->pPrevDb->pNextDb = p->pNextDb;
882 			p->pNextDb->pPrevDb = tmp_bt;
883 		}
884 	}
885 	if (p->encrypt_pwd != NULL)
886 		CLEAR_PWD(p);
887 	if (p->mutex != NULL)
888 		sqlite3_mutex_free(p->mutex);
889 	if (p->dir_name != NULL)
890 		sqlite3_free(p->dir_name);
891 	if (p->full_name != NULL)
892 		sqlite3_free(p->full_name);
893 	if (p->orig_name != NULL)
894 		sqlite3_free(p->orig_name);
895 	if (p->err_file != NULL)
896 		sqlite3_free(p->err_file);
897 	if (p->err_msg != NULL)
898 		sqlite3_free(p->err_msg);
899 
900 	sqlite3_free(p);
901 }
902 
/*
 * Inspect the database file before the environment is opened (named
 * storage only).
 *
 * Records in pBt->database_existed whether pBt->full_name exists.
 *
 * If the file does not exist: returns SQLITE_READONLY for a read-only open,
 * SQLITE_CANTOPEN if creation was not requested, and SQLITE_OK otherwise.
 *
 * If the file exists: unless built with BDBSQL_FILE_PER_TABLE, the file is
 * probed by opening it read-write through the VFS, and pBt->readonly is set
 * if the VFS reports that it could only be opened read-only.  DB_CREATE is
 * added to pBt->env_oflags and pBt->need_open is set.  The result of the
 * probe open is returned.
 */
static int btreeCheckEnvPrepare(Btree *p)
{
	BtShared *pBt;
	int f_exists, f_isdir, rc;
#ifndef BDBSQL_FILE_PER_TABLE
	int attrs;
	sqlite3_file *fp;
#endif

	pBt = p->pBt;
	rc = SQLITE_OK;
	f_exists = f_isdir = 0;

	assert(pBt->dbStorage == DB_STORE_NAMED);
	assert(pBt->dir_name != NULL);
	f_exists = !__os_exists(NULL, pBt->full_name, &f_isdir);
	pBt->database_existed = f_exists;

	if (!f_exists) {
		if ((p->vfsFlags & SQLITE_OPEN_READONLY) != 0) {
			rc = SQLITE_READONLY;
			goto err;
		} else if (!(p->vfsFlags & SQLITE_OPEN_CREATE)) {
			rc = SQLITE_CANTOPEN;
			goto err;
		}
	} else {
#ifndef BDBSQL_FILE_PER_TABLE
		/*
		 * If we don't have write permission for a file,
		 * automatically open any databases read-only.
		 */
		fp = (sqlite3_file *)sqlite3_malloc(p->db->pVfs->szOsFile);
		if (fp == NULL) {
			rc = SQLITE_NOMEM;
			goto err;
		}
		memset(fp, 0, p->db->pVfs->szOsFile);
		/*
		 * Start from "no flags" so the test below is well defined
		 * even if the open fails without writing the output flags.
		 */
		attrs = 0;
		rc = sqlite3OsOpen(p->db->pVfs, pBt->full_name, fp,
		    SQLITE_OPEN_MAIN_DB | SQLITE_OPEN_READWRITE,
		    &attrs);
		if (attrs & SQLITE_OPEN_READONLY)
			pBt->readonly = 1;
		if (rc == SQLITE_OK)
			(void)sqlite3OsClose(fp);
		sqlite3_free(fp);
#endif
		/*
		 * Always open existing tables, even if the matching
		 * env does not exist (yet).
		 */
		pBt->env_oflags |= DB_CREATE;
		pBt->need_open = 1;
	}
err:	return rc;
}
964 
/*
 * Work out the DB_ENV->open flags for a named database.
 *
 * p          - Btree handle whose shared state (p->pBt) is updated.
 * createdDir - non-zero if the caller has just created the environment
 *              directory, in which case it is not treated as an existing
 *              environment.
 * replicate  - non-zero if replication should be initialized.
 *
 * Updates pBt->env_oflags and may set pBt->lsn_reset, pBt->transactional and
 * pBt->single_process.  Always returns SQLITE_OK.
 */
static int btreeCheckEnvOpen(Btree *p, int createdDir, u8 replicate)
{
	BtShared *pBt = p->pBt;
	int dbFileExists, envDirExists, isDir;

	assert(pBt->dbStorage == DB_STORE_NAMED);
	assert(pBt->dir_name != NULL);

	dbFileExists = pBt->database_existed;
	envDirExists = !__os_exists(NULL, pBt->dir_name, NULL) && !createdDir;

	if (envDirExists && !dbFileExists) {
		/*
		 * There may have been a race for database creation, so look
		 * for the database file once more.
		 */
		dbFileExists = !__os_exists(NULL, pBt->full_name, &isDir);
	}

	if (!envDirExists && IS_ENV_READONLY(pBt)) {
		/*
		 * Opening read-only without an existing environment: use a
		 * non-transactional private environment.  Otherwise we run
		 * into issues with mismatching LSNs.
		 */
		pBt->env_oflags |= DB_PRIVATE;
		pBt->transactional = 0;
	} else {
		/*
		 * A writable database file without an environment needs its
		 * LSNs reset.  (When the environment is missing, reaching this
		 * branch already implies the env is not read-only.)
		 */
		if (!envDirExists && dbFileExists)
			pBt->lsn_reset = LSN_RESET_FILE;

		pBt->env_oflags |= DB_INIT_LOG | DB_INIT_TXN;
		if (replicate)
			pBt->env_oflags |= DB_INIT_REP;
#ifndef BDBSQL_SINGLE_THREAD
		pBt->env_oflags |= DB_INIT_LOCK;
#endif
#ifdef BDBSQL_SINGLE_PROCESS
		/*
		 * With this build option single_process always takes effect,
		 * no matter what the pragma setting is.
		 */
		pBt->single_process = 1;
#endif
		if (pBt->single_process)
			pBt->env_oflags |= DB_PRIVATE | DB_CREATE;
		else if (!replicate && !pBt->repForceRecover) {
			/*
			 * FAILCHK_ISALIVE doesn't currently work with
			 * replication.  Also, replication can't use
			 * DB_REGISTER because it assumes actual recoveries
			 * between sessions.  Skip both flags when running
			 * with replication, or on the first open after
			 * replication was turned off (repForceRecover).
			 */
			pBt->env_oflags |= DB_FAILCHK_ISALIVE | DB_REGISTER;
		}
	}

	/*
	 * If we're prepared to create the environment, do that now.
	 * Otherwise, if the table is being created, SQLite will call
	 * sqlite3BtreeCursor and expect a "SQLITE_EMPTY" return, then
	 * call sqlite3BtreeCreateTable.  The result of this open is
	 * recorded in the Btree object passed in.
	 */
	pBt->env_oflags |= DB_CREATE;

	/* Transactional environments are always opened with recovery. */
	if ((pBt->env_oflags & DB_INIT_TXN) != 0)
		pBt->env_oflags |= DB_RECOVER;

	return SQLITE_OK;
}
1041 
1042 /*
1043  * Determine whether replication is configured and make all needed
1044  * replication calls prior to opening environment.
1045  */
btreeSetUpReplication(Btree * p,int master,u8 * replicate)1046 static int btreeSetUpReplication(Btree *p, int master, u8 *replicate)
1047 {
1048 	BtShared *pBt;
1049 	sqlite3 *db;
1050 	char *value, *value2;
1051 	DB_SITE *lsite, *rsite;
1052 	char *host, *msg;
1053 	u_int port = 0;
1054 	int rc, rc2, ret;
1055 
1056 	pBt = p->pBt;
1057 	db = p->db;
1058 	rc = SQLITE_OK;
1059 	*replicate = ret = 0;
1060 
1061 	value = NULL;
1062 	if ((rc = getPersistentPragma(p, "replication",
1063 	    &value, NULL)) == SQLITE_OK && value)
1064 		*replicate = atoi(value);
1065 	if (value)
1066 		sqlite3_free(value);
1067 
1068 	if (*replicate) {
1069 		value = NULL;
1070 		value2 = NULL;
1071 		if ((rc = getPersistentPragma(p, "replication_verbose_output",
1072 		    &value, NULL)) == SQLITE_OK && value && atoi(value)) {
1073 			if (pDbEnv->set_verbose(pDbEnv,
1074 			    DB_VERB_REPLICATION, 1) != 0) {
1075 				sqlite3Error(db, SQLITE_ERROR, "Error in "
1076 				    "replication set_verbose call");
1077 				rc = SQLITE_ERROR;
1078 			}
1079 			else if ((rc = getPersistentPragma(p,
1080 			    "replication_verbose_file",
1081 			    &value2, NULL)) == SQLITE_OK && value && value2) {
1082 				if ((rc = unsetRepVerboseFile(
1083 				    pBt, pDbEnv, &msg)) != SQLITE_OK)
1084 					sqlite3Error(db, rc, msg);
1085 				if (rc == SQLITE_OK && strlen(value2) > 0 &&
1086 				    (rc = setRepVerboseFile(
1087 				    pBt, pDbEnv, value2, msg)) != SQLITE_OK)
1088 					sqlite3Error(db, rc, msg);
1089 			}
1090 		}
1091 		if (value)
1092 			sqlite3_free(value);
1093 		if (value2)
1094 			sqlite3_free(value2);
1095 		if (rc != SQLITE_OK)
1096 			goto err;
1097 
1098 		/* There must be a local_site value. */
1099 		lsite = NULL;
1100 		value = NULL;
1101 		if ((rc = getPersistentPragma(p, "replication_local_site",
1102 		    &value, NULL)) == SQLITE_OK && value) {
1103 			/* Pragma code already syntax-checked the value.  */
1104 			rc2 = getHostPort(value, &host, &port);
1105 			if (pDbEnv->repmgr_site(pDbEnv,
1106 			    host, port, &lsite, 0) != 0) {
1107 				sqlite3Error(db, SQLITE_ERROR, "Error in "
1108 				    "replication call repmgr_site LOCAL");
1109 				rc = SQLITE_ERROR;
1110 			}
1111 			if (rc != SQLITE_ERROR &&
1112 			    lsite->set_config(lsite, DB_LOCAL_SITE, 1) != 0) {
1113 				sqlite3Error(db, SQLITE_ERROR, "Error in "
1114 				    "replication call site config LOCAL");
1115 				rc = SQLITE_ERROR;
1116 			}
1117 			if (rc != SQLITE_ERROR && master &&
1118 			    lsite->set_config(lsite,
1119 			    DB_GROUP_CREATOR, 1) != 0) {
1120 				sqlite3Error(db, SQLITE_ERROR, "Error in "
1121 				    "replication call site config CREATOR");
1122 				rc = SQLITE_ERROR;
1123 			}
1124 			if (lsite != NULL && lsite->close(lsite) != 0) {
1125 				sqlite3Error(db, SQLITE_ERROR, "Error in "
1126 				    "replication call site close LOCAL");
1127 				rc = SQLITE_ERROR;
1128 			}
1129 			if (rc2 == SQLITE_OK)
1130 				sqlite3_free(host);
1131 		} else {
1132 			sqlite3Error(db, SQLITE_ERROR, "Must specify local "
1133 			    "site before starting replication");
1134 			rc = SQLITE_ERROR;
1135 		}
1136 		if (value)
1137 			sqlite3_free(value);
1138 		if (rc != SQLITE_OK)
1139 			goto err;
1140 
1141 		/* It is optional to have a remote_site value. */
1142 		rsite = NULL;
1143 		value = NULL;
1144 		if (getPersistentPragma(p, "replication_remote_site",
1145 		    &value, NULL) == SQLITE_OK && value) {
1146 			/* Pragma code already syntax-checked the value.  */
1147 			rc2 = getHostPort(value, &host, &port);
1148 			if (pDbEnv->repmgr_site(pDbEnv,
1149 			    host, port, &rsite, 0) != 0) {
1150 				sqlite3Error(db, SQLITE_ERROR, "Error in "
1151 				    "replication call repmgr_site REMOTE");
1152 				rc = SQLITE_ERROR;
1153 			}
1154 			if (rc != SQLITE_ERROR &&
1155 			    rsite->set_config(rsite,
1156 			    DB_BOOTSTRAP_HELPER, 1) != 0)
1157 				sqlite3Error(db, SQLITE_ERROR, "Error in "
1158 				    "replication call site config HELPER");
1159 			if (rsite != NULL && rsite->close(rsite) != 0)
1160 				sqlite3Error(db, SQLITE_ERROR, "Error in "
1161 				    "replication call site close REMOTE");
1162 			if (rc2 == SQLITE_OK)
1163 				sqlite3_free(host);
1164 		}
1165 		if (value)
1166 			sqlite3_free(value);
1167 
1168 		/* Set 2SITE_STRICT to ensure data durability. */
1169 		if (pDbEnv->rep_set_config(pDbEnv,
1170 		    DB_REPMGR_CONF_2SITE_STRICT, 1) != 0) {
1171 			sqlite3Error(db, SQLITE_ERROR, "Error in "
1172 			    "replication call rep_set_config");
1173 			rc = SQLITE_ERROR;
1174 			goto err;
1175 		}
1176 
1177 		/*
1178 		 * Set up heartbeats to detect when client loses connection
1179 		 * to master and to enable rerequest processing.
1180 		 */
1181 		if (pDbEnv->rep_set_timeout(pDbEnv,
1182 		    DB_REP_HEARTBEAT_MONITOR, 7000000) != 0) {
1183 			sqlite3Error(db, SQLITE_ERROR, "Error in replication "
1184 			    "call rep_set_timeout heartbeat monitor");
1185 			rc = SQLITE_ERROR;
1186 			goto err;
1187 		}
1188 		if (pDbEnv->rep_set_timeout(pDbEnv,
1189 		    DB_REP_HEARTBEAT_SEND, 5000000) != 0) {
1190 			sqlite3Error(db, SQLITE_ERROR, "Error in replication "
1191 			    "call rep_set_timeout heartbeat send");
1192 			rc = SQLITE_ERROR;
1193 			goto err;
1194 		}
1195 	}
1196 
1197 err:
1198 	return rc;
1199 }
1200 
1201 /* See if environment is currently configured as a replication client. */
btreeRepIsClient(Btree * p)1202 static int btreeRepIsClient(Btree *p)
1203 {
1204 	DB_REP_STAT *rep_stat;
1205 	BtShared *pBt;
1206 	int is_client;
1207 
1208 	pBt = p->pBt;
1209 	is_client = 0;
1210 
1211 	if (!pBt->repStarted)
1212 		return (0);
1213 
1214 	if (pDbEnv->rep_stat(pDbEnv, &rep_stat, 0) != 0) {
1215 		sqlite3Error(p->db, SQLITE_ERROR,
1216 		    "Unable to determine if site is a replication client");
1217 		return (0);
1218 	}
1219 	if (rep_stat->st_status == DB_REP_CLIENT)
1220 		is_client = 1;
1221 	sqlite3_free(rep_stat);
1222 	return (is_client);
1223 }
1224 
1225 /*
1226  * See if replication startup is finished by polling replication statistics.
1227  * Returns 1 if replication startup is finished; 0 otherwise.  Note that
1228  * this function waits a finite amount of time for a replication election
1229  * to complete but it waits indefinitely for a replication client to
1230  * synchronize with the master after the election.
1231  */
btreeRepStartupFinished(Btree * p)1232 static int btreeRepStartupFinished(Btree *p)
1233 {
1234 	DB_REP_STAT *repStat;
1235 	BtShared *pBt;
1236 	sqlite3 *db;
1237 	u_int32_t electRetry, electTimeout, slept;
1238 	int clientSyncComplete, startupComplete;
1239 
1240 	pBt = p->pBt;
1241 	db = p->db;
1242 	clientSyncComplete = slept = startupComplete = 0;
1243 	electRetry = electTimeout = 0;
1244 
1245 	if (pDbEnv->rep_get_timeout(pDbEnv,
1246 	    DB_REP_ELECTION_RETRY, &electRetry) != 0) {
1247 		sqlite3Error(db, SQLITE_ERROR, "Error in "
1248 		    "replication call rep_get_timeout election retry");
1249 		goto err;
1250 	}
1251 	if (pDbEnv->rep_get_timeout(pDbEnv,
1252 	    DB_REP_ELECTION_TIMEOUT, &electTimeout) != 0) {
1253 		sqlite3Error(db, SQLITE_ERROR, "Error in "
1254 		    "replication call rep_get_timeout election timeout");
1255 		goto err;
1256 	}
1257 	electRetry = electRetry / US_PER_SEC;
1258 	electTimeout = electTimeout / US_PER_SEC;
1259 
1260 	/*
1261 	 * Wait to see if election and replication site startup finishes.
1262 	 * If this site has been elected master or if it is a client that
1263 	 * has finished its synchronization with the master, startup is
1264 	 * finished.  Wait long enough to allow time for many election
1265 	 * attempts.  Using default timeout values, the wait is 15 minutes.
1266 	 */
1267 	do {
1268 		__os_yield(pDbEnv->env, 1, 0);
1269 		if (pDbEnv->rep_stat(pDbEnv, &repStat, 0) != 0) {
1270 			sqlite3Error(db, SQLITE_ERROR, "Error in "
1271 			    "replication call rep_stat election");
1272 			goto err;
1273 		}
1274 		if (repStat->st_status == DB_REP_MASTER ||
1275 		    repStat->st_startup_complete)
1276 			startupComplete = 1;
1277 		sqlite3_free(repStat);
1278 	} while (!startupComplete &&
1279 	    ++slept < (electTimeout + electRetry) * 75);
1280 
1281 	/*
1282 	 * If startup isn't finished yet but this site is a client with
1283 	 * a known master, the client is still synchronizing with the master.
1284 	 * Wait indefinitely because this can take a very long time if a full
1285 	 * internal initialization is needed.
1286 	 */
1287 	if (!startupComplete && repStat->st_status == DB_REP_CLIENT &&
1288 	    repStat->st_master != DB_EID_INVALID)
1289 		do {
1290 			__os_yield(pDbEnv->env, 2, 0);
1291 			if (pDbEnv->rep_stat(pDbEnv, &repStat, 0) != 0) {
1292 				sqlite3Error(db, SQLITE_ERROR, "Error in "
1293 				    "replication call rep_stat client sync");
1294 				goto err;
1295 			}
1296 			if (repStat->st_startup_complete)
1297 				clientSyncComplete = 1;
1298 			sqlite3_free(repStat);
1299 		} while (!clientSyncComplete);
1300 
1301 err:	if (startupComplete || clientSyncComplete)
1302 		return (1);
1303 	else
1304 		return (0);
1305 }
1306 
1307 /*
1308  * This function finds, opens or creates the Berkeley DB environment associated
1309  * with a database opened using sqlite3BtreeOpen. There are a few different
1310  * cases:
1311  *  * Temporary and transient databases share a single environment. If the
1312  *    shared handle exists, return it, otherwise create a shared handle.
1313  *  * For named databases, attempt to open an existing environment, if one
1314  *    exists, otherwise create a new environment.
1315  */
btreePrepareEnvironment(Btree * p)1316 static int btreePrepareEnvironment(Btree *p)
1317 {
1318 	BtShared *pBt;
1319 #ifdef BDBSQL_FILE_PER_TABLE
1320 	char *dirPathName, dirPathBuf[BT_MAX_PATH];
1321 #endif
1322 	int rc, ret;
1323 
1324 	pBt = p->pBt;
1325 	ret = 0;
1326 	rc = SQLITE_OK;
1327 
1328 	pBt->env_oflags = DB_INIT_MPOOL |
1329 	    ((pBt->dbStorage == DB_STORE_NAMED) ? 0 : DB_PRIVATE)
1330 #ifndef BDBSQL_SINGLE_THREAD
1331 	    | DB_THREAD
1332 #endif
1333 		;
1334 
1335 	if (pBt->dbStorage == DB_STORE_NAMED) {
1336 		if ((rc = btreeCheckEnvPrepare(p)) != SQLITE_OK)
1337 			goto err;
1338 
1339 		if ((ret = db_env_create(&pDbEnv, 0)) != 0)
1340 			goto err;
1341 		pDbEnv->set_errpfx(pDbEnv, pBt->full_name);
1342 		pDbEnv->app_private = pBt;
1343 		pDbEnv->set_errcall(pDbEnv, btreeHandleDbError);
1344 #ifndef BDBSQL_SINGLE_THREAD
1345 #ifndef BDBSQL_CONCURRENT_CONNECTIONS
1346 		pDbEnv->set_flags(pDbEnv, DB_DATABASE_LOCKING, 1);
1347 #endif
1348 		pDbEnv->set_lk_detect(pDbEnv, DB_LOCK_DEFAULT);
1349 		pDbEnv->set_lk_tablesize(pDbEnv, 20000);
1350 		pDbEnv->set_memory_max(pDbEnv, 0, 16 * 1024 * 1024);
1351 #ifdef BDBSQL_TXN_SNAPSHOTS_DEFAULT
1352 		pBt->env_oflags |= DB_MULTIVERSION;
1353 		pBt->read_txn_flags |= DB_TXN_SNAPSHOT;
1354 #endif
1355 #endif
1356 		pDbEnv->set_lg_regionmax(pDbEnv, BDBSQL_LOG_REGIONMAX);
1357 #ifdef BDBSQL_MEMORY_MAX
1358 		pDbEnv->set_memory_max(pDbEnv, BDBSQL_MEMORY_MAX / GIGABYTE,
1359 				       BDBSQL_MEMORY_MAX % GIGABYTE);
1360 #endif
1361 #ifdef BDBSQL_LOCK_TABLESIZE
1362 		pDbEnv->set_lk_tablesize(pDbEnv, BDBSQL_LOCK_TABLESIZE);
1363 #endif
1364 #ifndef BDBSQL_OMIT_LEAKCHECK
1365 		pDbEnv->set_alloc(pDbEnv, btreeMalloc, btreeRealloc,
1366 		    sqlite3_free);
1367 #endif
1368 		if ((ret = pDbEnv->set_lg_max(pDbEnv, pBt->logFileSize)) != 0)
1369 			goto err;
1370 #ifndef BDBSQL_OMIT_LOG_REMOVE
1371 		if ((ret = pDbEnv->log_set_config(pDbEnv,
1372 		    DB_LOG_AUTO_REMOVE, 1)) != 0)
1373 			goto err;
1374 #endif
1375 		/*
1376 		 * Set the directory where the database file will be created
1377 		 * to the parent of the environment directory.
1378 		 */
1379 #ifdef BDBSQL_FILE_PER_TABLE
1380 		/* Reuse envDirNameBuf. */
1381 		dirPathName = dirPathBuf;
1382 		memset(dirPathName, 0, BT_MAX_PATH);
1383 		sqlite3_snprintf(sizeof(dirPathName), dirPathName,
1384 		    "../%s", pBt->short_name);
1385 		pDbEnv->set_data_dir(pDbEnv, dirPathName);
1386 		pDbEnv->set_create_dir(pDbEnv, dirPathName);
1387 #else
1388 		pDbEnv->set_data_dir(pDbEnv, "..");
1389 #endif
1390 #ifdef BDBSQL_SHARE_PRIVATE
1391 		/*
1392 		 * set mpool mutex count to 10/core.  This significantly
1393 		 * reduces the cost of environment open/close
1394 		 */
1395 		if (pBt->mp_mutex_count == 0)
1396 		  pBt->mp_mutex_count = 10 * __os_cpu_count();
1397 		pDbEnv->set_mp_mtxcount(pDbEnv, pBt->mp_mutex_count);
1398 #endif
1399 
1400 	} else if (g_tmp_env == NULL) {
1401 		/*
1402 		 * Creating environment shared by temp and transient tables.
1403 		 * We're just creating a handle here, so it doesn't matter if
1404 		 * we race with some other thread at this point, as long as
1405 		 * only one of the environment handles is opened.
1406 		 */
1407 		if ((ret = db_env_create(&pDbEnv, 0)) != 0)
1408 			goto err;
1409 		pDbEnv->set_errpfx(pDbEnv, "<temp>");
1410 		pDbEnv->app_private = pBt;
1411 		pDbEnv->set_errcall(pDbEnv, btreeHandleDbError);
1412 		pBt->env_oflags |= DB_CREATE | DB_INIT_TXN | DB_PRIVATE;
1413 
1414 		/*
1415 		 * Never create log files.  We mark all databases non-durable,
1416 		 * but BDB still occasionally writes log records (e.g., for
1417 		 * checkpoints).  This guarantees that those log records aren't
1418 		 * written to files.  A small buffer should be fine.
1419 		 */
1420 		pDbEnv->set_lg_bsize(pDbEnv, 64 * 1024);
1421 		pDbEnv->set_lg_max(pDbEnv, 32 * 1024);
1422 #ifndef BDBSQL_OMIT_LEAKCHECK
1423 		pDbEnv->set_alloc(pDbEnv, btreeMalloc, btreeRealloc,
1424 		    sqlite3_free);
1425 #endif
1426 		pDbEnv->log_set_config(pDbEnv, DB_LOG_IN_MEMORY, 1);
1427 	} else
1428 		rc = btreeOpenEnvironment(p, 0);
1429 
1430 err:	return MAP_ERR(rc, ret, p);
1431 }
1432 
1433 /*
1434  * The function finds an opened BtShared handle if one exists in the cache.
1435  * It assumes that the global SQLITE_MUTEX_STATIC_OPEN lock is held.
1436  */
btreeUpdateBtShared(Btree * p,int needLock)1437 int btreeUpdateBtShared(Btree *p, int needLock)
1438 {
1439 	BtShared *pBt, *next_bt;
1440 	sqlite3_mutex *mutexOpen;
1441 	u_int8_t new_fileid[DB_FILE_ID_LEN];
1442 	char *filename;
1443 	int rc, ret;
1444 
1445 	pBt = p->pBt;
1446 	rc = SQLITE_OK;
1447 	ret = 0;
1448 
1449 	if (pBt->dbStorage != DB_STORE_NAMED)
1450 		return SQLITE_OK;
1451 
1452 #ifdef BDBSQL_FILE_PER_TABLE
1453 	rc = getMetaDataFileName(pBt->full_name, &filename);
1454 	if (rc != SQLITE_OK)
1455 		return rc;
1456 #else
1457 	filename = pBt->full_name;
1458 #endif
1459 
1460 	if (needLock) {
1461 		mutexOpen = sqlite3MutexAlloc(OPEN_MUTEX(pBt->dbStorage));
1462 		sqlite3_mutex_enter(mutexOpen);
1463 #ifdef SQLITE_DEBUG
1464 	} else {
1465 		mutexOpen = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_OPEN);
1466 		assert(sqlite3_mutex_held(mutexOpen));
1467 		mutexOpen = NULL;
1468 #endif
1469 	}
1470 	/*
1471      * Check to see if a connection has been opened to the same database
1472 	 * using a different BtShared. If so, switch to using that BtShared.
1473 	 *
1474 	 * It's safe to do this shuffle, since it only ever happens for
1475 	 * named databases, and we are always holding the global
1476 	 * SQLITE_MUTEX_STATIC_OPEN mutex in that case.
1477 	 */
1478 	if (pBt->dbStorage == DB_STORE_NAMED && !pBt->env_opened &&
1479 	    !(ret = __os_exists(NULL, filename, NULL)) &&
1480 	    __os_fileid(NULL, filename, 0, new_fileid) == 0) {
1481 		for (next_bt = g_shared_btrees; next_bt != NULL;
1482 		    next_bt = next_bt->pNextDb) {
1483 			if (pBt != next_bt && memcmp(
1484 			    new_fileid, next_bt->fileid, DB_FILE_ID_LEN) == 0)
1485 				break;
1486 		}
1487 		if (next_bt != pBt && next_bt != NULL) {
1488 			/* Found a different BtShared to use. "upgrade" */
1489 			++next_bt->nRef;
1490 			if (--pBt->nRef == 0) {
1491 				(void)btreeFreeSharedBtree(pBt, 1);
1492 			}
1493 			p->pBt = next_bt;
1494 			pBt = next_bt;
1495 		}
1496 	} else {
1497 		if (ret != ENOENT && ret != 0)
1498 			rc = dberr2sqlite(ret, p);
1499 	}
1500 	if (needLock)
1501 		sqlite3_mutex_leave(mutexOpen);
1502 
1503 #ifdef BDBSQL_FILE_PER_TABLE
1504 	sqlite3_free(filename);
1505 #endif
1506 	return rc;
1507 }
1508 
1509 /*
1510  * Closes and re-opens a Berkeley DB environment handle.
1511  * Required when enabling or disabling replication on an existing database.
1512  * Assumes that the required open flags have been set in BtShared.
1513  */
btreeReopenEnvironment(Btree * p,int removingRep)1514 int btreeReopenEnvironment(Btree *p, int removingRep)
1515 {
1516 	int idx, rc, ret;
1517 	sqlite3_mutex *mutexOpen;
1518 	BtShared *pBt;
1519 
1520 	rc = SQLITE_OK;
1521 	ret = 0;
1522 	pBt = p->pBt;
1523 
1524 	if (pBt->transactional == 0 || pBt->first_cursor != NULL ||
1525 	    pMainTxn != NULL || pBt->dbStorage != DB_STORE_NAMED)
1526 		return SQLITE_ERROR;
1527 
1528 	/* commit family txn; it will be null when shutting down */
1529 	if (pFamilyTxn != NULL) {
1530 		ret = pFamilyTxn->commit(pFamilyTxn, 0);
1531 		pFamilyTxn = NULL;
1532 		/* p->inTrans = TRANS_NONE; don't change state of this */
1533 		if (ret != 0)
1534 			rc = dberr2sqlite(ret, p);
1535 		if (rc != SQLITE_OK)
1536 			return (rc);
1537 	}
1538 
1539 	/*
1540 	 * Acquire mutexOpen lock while closing down cached db handles.
1541 	 */
1542 	mutexOpen = sqlite3MutexAlloc(OPEN_MUTEX(pBt->dbStorage));
1543 	sqlite3_mutex_enter(mutexOpen);
1544 	/* Close open DB handles and clear related hash table */
1545 	if ((rc = btreeCleanupCachedHandles(p, CLEANUP_CLOSE)) != SQLITE_OK)
1546 		goto err;
1547 	sqlite3HashClear(&pBt->db_cache);
1548 	/* close tables and meta databases */
1549 	if (pTablesDb != NULL &&
1550 	    (ret = pTablesDb->close(pTablesDb, DB_NOSYNC)) != 0)
1551 		goto err;
1552 	if (pMetaDb != NULL &&
1553 	    (ret = pMetaDb->close(pMetaDb, DB_NOSYNC)) != 0)
1554 		goto err;
1555 	pTablesDb = pMetaDb = NULL;
1556 
1557 	/* Flush the cache of metadata values */
1558 	for (idx = 0; idx < NUMMETA; idx++)
1559 		pBt->meta[idx].cached = 0;
1560 	/*
1561 	 * Close environment, ignore DB_RUNRECOVERY errors.
1562 	 */
1563 	if ((ret = pDbEnv->close(pDbEnv, 0)) != 0 && ret != DB_RUNRECOVERY)
1564        		goto err;
1565 	pDbEnv = NULL;
1566 	pBt->env_opened = 0;
1567 	p->connected = 0;
1568 
1569 	/* Configure and open a new environment. */
1570 	if ((rc = btreePrepareEnvironment(p)) != 0)
1571 		goto err;
1572 	/*
1573 	 * Make thread count match the default value that env_open() sets
1574 	 * with FAILCHK so that the thread region is initialized correctly
1575 	 * for use with FAILCHK when reopening without replication.
1576 	 */
1577 	if (removingRep &&
1578 	    (ret = pDbEnv->set_thread_count(pDbEnv, 50)) != 0)
1579 		goto err;
1580 	rc = btreeOpenEnvironment(p, 0);
1581 
1582 	/* Release the lock now. */
1583 err:	sqlite3_mutex_leave(mutexOpen);
1584 	if (rc == SQLITE_OK && ret != 0)
1585 		rc = dberr2sqlite(ret, p);
1586 	return rc;
1587 }
1588 
1589 /*
1590  * Called from sqlite3BtreeCreateTable, if it the Berkeley DB environment
1591  * did not already exist when sqlite3BtreeOpen was called.
1592  */
btreeOpenEnvironment(Btree * p,int needLock)1593 int btreeOpenEnvironment(Btree *p, int needLock)
1594 {
1595 	BtShared *pBt;
1596 	sqlite3 *db;
1597 	CACHED_DB *cached_db;
1598 	int creating, iTable, newEnv, rc, ret, reuse_env, writeLock;
1599 	sqlite3_mutex *mutexOpen;
1600 	txn_mode_t txn_mode;
1601 	i64 cache_sz;
1602 	int createdDir = 0;
1603 #ifdef BDBSQL_SHARE_PRIVATE
1604 	int createdFile = 0;
1605 #endif
1606 	int i;
1607 	u8 replicate = 0;
1608 
1609 	newEnv = ret = reuse_env = 0;
1610 	rc = SQLITE_OK;
1611 	cached_db = NULL;
1612 	mutexOpen = NULL;
1613 	pBt = p->pBt;
1614 	db = p->db;
1615 
1616 	/*
1617 	 * The open (and setting pBt->env_opened) is protected by the open
1618 	 * mutex, to prevent concurrent threads trying to call DB_ENV->open
1619 	 * simultaneously.
1620 	 */
1621 	if (needLock) {
1622 		mutexOpen = sqlite3MutexAlloc(OPEN_MUTEX(pBt->dbStorage));
1623 		sqlite3_mutex_enter(mutexOpen);
1624 #ifdef SQLITE_DEBUG
1625 	} else if (pBt->dbStorage == DB_STORE_NAMED) {
1626 		mutexOpen = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_OPEN);
1627 		assert(sqlite3_mutex_held(mutexOpen));
1628 		mutexOpen = NULL;
1629 #endif
1630 	}
1631 
1632 	/*
1633 	 * If we already created a handle and someone has opened the global
1634 	 * handle in the meantime, close our handle to free the memory.
1635 	 */
1636 	if (pBt->dbStorage != DB_STORE_NAMED && g_tmp_env != NULL) {
1637 		assert(!pBt->env_opened);
1638 		assert(pDbEnv != g_tmp_env);
1639 		if (pDbEnv != NULL)
1640 			(void)pDbEnv->close(pDbEnv, 0);
1641 
1642 		pDbEnv = g_tmp_env;
1643 		pBt->env_opened = newEnv = reuse_env = 1;
1644 	}
1645 	/*
1646 	 * Check to see if the table has been opened to the same database
1647 	 * using a different name. If so, switch to using that BtShared.
1648 	 */
1649 	if ((rc = btreeUpdateBtShared(p, 0)) != SQLITE_OK)
1650 		goto err;
1651 	pBt = p->pBt;
1652 
1653 	if (!pBt->env_opened) {
1654 		cache_sz = (i64)pBt->cacheSize;
1655 		if (cache_sz < DB_MIN_CACHESIZE)
1656 			cache_sz = DB_MIN_CACHESIZE;
1657 		cache_sz *= (pBt->dbStorage == DB_STORE_NAMED &&
1658 		    pBt->pageSize > 0) ?
1659 		    pBt->pageSize : SQLITE_DEFAULT_PAGE_SIZE;
1660 		pDbEnv->set_cachesize(pDbEnv,
1661 		    (u_int32_t)(cache_sz / GIGABYTE),
1662 		    (u_int32_t)(cache_sz % GIGABYTE), 0);
1663 		if (pBt->pageSize != 0 &&
1664 		    (ret = pDbEnv->set_mp_pagesize(pDbEnv, pBt->pageSize)) != 0)
1665 			goto err;
1666 		pDbEnv->set_mp_mmapsize(pDbEnv, 0);
1667 		pDbEnv->set_errcall(pDbEnv, btreeHandleDbError);
1668 		if (pBt->dir_name != NULL) {
1669 			createdDir =
1670 			    (__os_mkdir(NULL, pBt->dir_name, 0777) == 0);
1671 #ifdef BDBSQL_FILE_PER_TABLE
1672 			createdDir =
1673 			    (__os_mkdir(NULL, pBt->full_name, 0777) == 0);
1674 #endif
1675 		}
1676 
1677 		if (pBt->dbStorage == DB_STORE_NAMED) {
1678 #ifdef BDBSQL_SHARE_PRIVATE
1679 			if ((ret = btreeSetupLockfile(p, &createdFile)) != 0)
1680 				goto err;
1681 			/*
1682 			 * if lock isn't held, take read lock for open,
1683 			 * but do not reopen env
1684 			 */
1685 			if (!createdFile) {
1686 				btreeScopedFileLock(p, 0, 1);
1687 				/*
1688 				 * don't checkpoint; it'd confuse
1689 				 * active writers
1690 				 */
1691 				pBt->env_oflags |= DB_NO_CHECKPOINT;
1692 			}
1693 #endif
1694 			if ((rc = btreeSetUpReplication(p, pBt->repStartMaster,
1695 			    &replicate)) != SQLITE_OK)
1696 				goto err;
1697 			if ((rc = btreeCheckEnvOpen(p,
1698 			    createdDir, replicate)) != SQLITE_OK)
1699 				goto err;
1700 		}
1701 		if ((ret = pDbEnv->open(
1702 		    pDbEnv, pBt->dir_name, pBt->env_oflags, 0)) != 0) {
1703 #ifdef BDBSQL_SHARE_PRIVATE
1704 			if (pBt->dbStorage == DB_STORE_NAMED)
1705 				btreeScopedFileUnlock(p, createdFile);
1706 #endif
1707 			if (ret == ENOENT && (pBt->env_oflags & DB_CREATE) == 0)
1708 				return SQLITE_OK;
1709 			goto err;
1710 		}
1711 		pBt->env_opened = newEnv = 1;
1712 		/*
1713 		 * repForceRecover is set when turning off replication and
1714 		 * used to set env open flags.  Clear it here after opening
1715 		 * the environment.
1716 		 */
1717 		pBt->repForceRecover = 0;
1718 		if (pBt->dbStorage != DB_STORE_NAMED) {
1719 			g_tmp_env = pDbEnv;
1720 			reuse_env = 1;
1721 		} else {
1722 #ifdef BDBSQL_SHARE_PRIVATE
1723 			btreeScopedFileUnlock(p, createdFile);
1724 #endif
1725 		}
1726 	}
1727 
1728 	assert(!p->connected);
1729 	p->connected = 1;
1730 
1731 	/*
1732 	 * If the environment was already open, drop the open mutex before
1733 	 * proceeding.  Some other thread may be holding a schema lock and
1734 	 * be waiting for the open mutex, which would lead to a latch deadlock.
1735 	 *
1736 	 * On the other hand, if we are creating the environment, this thread
1737 	 * is expecting to find the schema table empty, so we need to hold
1738 	 * onto the open mutex and get an exclusive schema lock, to prevent
1739 	 * some other thread getting in ahead of us.
1740 	 */
1741 	if (!newEnv && needLock) {
1742 		assert(sqlite3_mutex_held(mutexOpen));
1743 		sqlite3_mutex_leave(mutexOpen);
1744 		needLock = 0;
1745 	}
1746 
1747 	/*
1748 	 * Start replication.  If we are not starting as the initial master,
1749 	 * do not try to create SQL metadata because we will use a
1750 	 * replicated copy that should already exist or get sent to us
1751 	 * shortly during replication client synchronization.
1752 	 */
1753 	if (replicate) {
1754 		if ((ret = pDbEnv->repmgr_start(pDbEnv, 1,
1755 		    pBt->repStartMaster ?
1756 		    DB_REP_MASTER : DB_REP_ELECTION)) != 0) {
1757 			sqlite3Error(db, SQLITE_CANTOPEN, "Error in "
1758 			    "replication call repmgr_start");
1759 			rc = SQLITE_CANTOPEN;
1760 			goto err;
1761 		}
1762 		pBt->repStarted = 1;
1763 
1764 		if (!pBt->repStartMaster) {
1765 			/*
1766 			 * Allow time for replication client to hold an
1767 			 * election and synchronize with the master.
1768 			 */
1769 			if (!btreeRepStartupFinished(p)) {
1770 				sqlite3Error(db, SQLITE_CANTOPEN, "Error "
1771 				    "starting as replication client");
1772 				rc = SQLITE_CANTOPEN;
1773 				goto err;
1774 			}
1775 			creating = i = 0;
1776 			/*
1777 			 * There is a slight possibility that some of the
1778 			 * replicated SQL metadata may lag behind the end
1779 			 * of client synchronization, so retry opening the
1780 			 * SQL metadata a few times if there are errors.
1781 			 */
1782 			do {
1783 				rc = btreeOpenMetaTables(p, &creating);
1784 			} while ((rc != SQLITE_OK) && ++i < BUSY_RETRY_COUNT);
1785 			if (rc == SQLITE_OK)
1786 				goto aftercreatemeta;
1787 			else {
1788 				sqlite3Error(db, SQLITE_CANTOPEN, "Error "
1789 				    "opening replicated SQL metadata");
1790 				rc = SQLITE_CANTOPEN;
1791 				goto err;
1792 			}
1793 		}
1794 	}
1795 	pBt->repStartMaster = 0;
1796 
1797 	if ((!IS_ENV_READONLY(pBt) && p->vfsFlags & SQLITE_OPEN_CREATE) ||
1798 	    pBt->dbStorage == DB_STORE_INMEM)
1799 		pBt->db_oflags |= DB_CREATE;
1800 
1801 	creating = 1;
1802 	if (pBt->dbStorage == DB_STORE_NAMED &&
1803 	    (rc = btreeOpenMetaTables(p, &creating)) != SQLITE_OK)
1804 		goto err;
1805 	if (creating) {
1806 		/*
1807 	     * Update the fileid now that the file has been created.
1808 		 * Ignore error returns - the fileid isn't critical.
1809 		 */
1810 		if (pBt->dbStorage == DB_STORE_NAMED) {
1811 			char *filename;
1812 #ifdef BDBSQL_FILE_PER_TABLE
1813 			rc = getMetaDataFileName(pBt->full_name, &filename);
1814 			if (rc != SQLITE_OK)
1815 				goto err;
1816 #else
1817 			filename = pBt->full_name;
1818 #endif
1819 			(void)__os_fileid(NULL, filename, 0, pBt->fileid);
1820 #ifdef BDBSQL_FILE_PER_TABLE
1821 			if (filename != NULL)
1822 				sqlite3_free(filename);
1823 #endif
1824 		}
1825 
1826 		if ((rc = btreeCreateTable(p, &iTable,
1827 		    BTREE_INTKEY)) != SQLITE_OK)
1828 			goto err;
1829 
1830 		assert(iTable == MASTER_ROOT);
1831 	}
1832 aftercreatemeta:
1833 
1834 #ifdef BDBSQL_PRELOAD_HANDLES
1835 	if (newEnv && !creating && pBt->dbStorage == DB_STORE_NAMED)
1836 		(void)btreePreloadHandles(p);
1837 #endif
1838 
1839 	/*
1840 	 * If transactions were started before the environment was opened,
1841 	 * start them now.  Also, if creating a new environment, take a write
1842 	 * lock to prevent races setting up the metadata tables.  Always start
1843 	 * the ultimate parent by starting a read transaction.
1844 	 */
1845 	writeLock = (p->schemaLockMode == LOCKMODE_WRITE) ||
1846 	    (newEnv && !IS_BTREE_READONLY(p));
1847 
1848 	if (pBt->transactional) {
1849 		txn_mode = p->inTrans;
1850 		p->inTrans = TRANS_NONE;
1851 
1852 		if ((ret = pDbEnv->txn_begin(pDbEnv,
1853 		    NULL, &pFamilyTxn, DB_TXN_FAMILY)) != 0)
1854 			return dberr2sqlite(ret, p);
1855 #ifdef BDBSQL_SHARE_PRIVATE
1856 		pBt->lockfile.in_env_open = 1;
1857 #endif
1858 		if ((writeLock || txn_mode == TRANS_WRITE) &&
1859 		    !btreeRepIsClient(p) &&
1860 		    (rc = sqlite3BtreeBeginTrans(p,
1861 		    (writeLock || txn_mode == TRANS_WRITE))) != SQLITE_OK)
1862 			goto err;
1863 	}
1864 
1865 	if (p->schemaLockMode != LOCKMODE_NONE) {
1866 		p->schemaLockMode = LOCKMODE_NONE;
1867 		rc = sqlite3BtreeLockTable(p, MASTER_ROOT, writeLock);
1868 		if (rc != SQLITE_OK)
1869 			goto err;
1870 	}
1871 
1872 	/*
1873 	 * It is now okay for other threads to use this BtShared handle.
1874 	 */
1875 err:	if (rc != SQLITE_OK || ret != 0) {
1876 		pBt->panic = 1;
1877 		p->connected = 0;
1878 	}
1879 #ifdef BDBSQL_SHARE_PRIVATE
1880 	pBt->lockfile.in_env_open = 0;
1881 #endif
1882 	if (needLock) {
1883 		assert(sqlite3_mutex_held(mutexOpen));
1884 		sqlite3_mutex_leave(mutexOpen);
1885 	}
1886 	return MAP_ERR(rc, ret, p);
1887 }
1888 
/*
 * Search the list of shared btrees for one that can be reused.
 *
 * ppBt     - out: the matching BtShared with its reference count already
 *            incremented, or NULL if there is no match.
 * fileid   - file id to match; only compared when store is DB_STORE_NAMED.
 * db       - connection that is opening the btree.
 * store    - storage mode of the database being opened.
 * vfsFlags - open flags; SQLITE_OPEN_SHAREDCACHE enables the duplicate
 *            check described below.
 *
 * Returns SQLITE_OK whether or not a match was found, or SQLITE_CONSTRAINT
 * if the match is already attached to db in shared cache mode (*ppBt is
 * then left NULL).  In SQLITE_DEBUG builds this asserts that the
 * SQLITE_MUTEX_STATIC_OPEN mutex is held.
 */
static int btreeGetSharedBtree(
    BtShared **ppBt,
    u_int8_t *fileid,
    sqlite3 *db,
    storage_mode_t store,
    int vfsFlags)
{
	BtShared *cand;
	int idx, sameDb;

#ifdef SQLITE_DEBUG
	sqlite3_mutex *mutexOpen = sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_OPEN);
	assert(sqlite3_mutex_held(mutexOpen));
#endif

	/*
	 * SQLite only does this lookup when sharedCacheEnabled is set, but
	 * Berkeley DB always operates with a shared cache, so there is no
	 * such check here.
	 */

	*ppBt = NULL;
	for (cand = g_shared_btrees; cand != NULL; cand = cand->pNextDb) {
		assert(cand->nRef > 0);

		/*
		 * Named databases match on file id; any other storage mode
		 * matches an entry that has no file name.
		 */
		if (store == DB_STORE_NAMED)
			sameDb = (memcmp(fileid,
			    cand->fileid, DB_FILE_ID_LEN) == 0);
		else
			sameDb = (cand->full_name == NULL);
		if (!sameDb)
			continue;

		/*
		 * If the application thinks we are in shared cache
		 * mode, check that the btree handle being added does
		 * not already exist in the list of handles.
		 */
		if (vfsFlags & SQLITE_OPEN_SHAREDCACHE) {
			for (idx = db->nDb; idx-- > 0; ) {
				Btree *attached = db->aDb[idx].pBt;

				if (attached != NULL && attached->pBt == cand)
					return SQLITE_CONSTRAINT;
			}
		}

		*ppBt = cand;
		sqlite3_mutex_enter(cand->mutex);
		cand->nRef++;
		sqlite3_mutex_leave(cand->mutex);
		break;
	}

	return SQLITE_OK;
}
1943 
/*
 * Allocate and initialize a new BtShared and attach it to p (p->pBt).
 *
 * Parameters:
 *   p         - Btree handle that will own the first reference.
 *   zFilename - file name as given by the caller; only used when store is
 *               DB_STORE_NAMED.
 *   fileid    - DB_FILE_ID_LEN bytes copied into the new BtShared.
 *   db        - connection; its VFS is used to resolve the full path name.
 *   flags     - stored unchanged in the new BtShared.
 *   store     - storage mode.  DB_STORE_TMP yields a non-transactional
 *               BtShared with resultsBuffer set; every other mode yields a
 *               transactional one.
 *
 * Returns SQLITE_OK on success, SQLITE_NOMEM if any allocation fails, or
 * the error returned by sqlite3OsFullPathname() if the path cannot be
 * resolved.  On failure the partially built BtShared is released with
 * btreeFreeSharedBtree() and p->pBt is not modified.
 *
 * For DB_STORE_NAMED the caller must hold the SQLITE_MUTEX_STATIC_OPEN mutex
 * (asserted in debug builds).
 */
static int btreeCreateSharedBtree(
    Btree *p,
    const char *zFilename,
    u_int8_t *fileid,
    sqlite3 *db,
    int flags,
    storage_mode_t store)
{
	BtShared *new_bt;
	char *bslash, *dirPathName, dirPathBuf[BT_MAX_PATH];
	int rc;

#ifdef SQLITE_DEBUG
	if (store == DB_STORE_NAMED) {
		sqlite3_mutex *mutexOpen =
		    sqlite3MutexAlloc(SQLITE_MUTEX_STATIC_OPEN);
		assert(sqlite3_mutex_held(mutexOpen));
	}
#endif

	new_bt = NULL;
	if ((new_bt = (struct BtShared *)sqlite3_malloc(
	    sizeof(struct BtShared))) == NULL)
		return SQLITE_NOMEM;
	memset(new_bt, 0, sizeof(struct BtShared));
	new_bt->dbStorage = store;
	if (store == DB_STORE_TMP) {
		new_bt->transactional = 0;
		new_bt->resultsBuffer = 1;
	} else {
		new_bt->transactional = 1;
		new_bt->resultsBuffer = 0;
	}
#ifndef BDBSQL_AUTO_PAGE_SIZE
	new_bt->pageSize = SQLITE_DEFAULT_PAGE_SIZE;
#endif
	new_bt->flags = flags;
	new_bt->mutex = sqlite3MutexAlloc(SQLITE_MUTEX_FAST);
	/* A NULL mutex is only an error when core mutexes are enabled. */
	if (new_bt->mutex == NULL && sqlite3GlobalConfig.bCoreMutex)
		goto err_nomem;
	memcpy(new_bt->fileid, fileid, DB_FILE_ID_LEN);

	/*
	 * Always open database with read-uncommitted enabled
	 * since SQLite allows DB_READ_UNCOMMITTED cursors to
	 * be created on any table.
	 */
#ifndef BDBSQL_SINGLE_THREAD
	new_bt->db_oflags = DB_THREAD |
	    (new_bt->transactional ? DB_READ_UNCOMMITTED : 0);
#endif
	sqlite3HashInit(&new_bt->db_cache);
	if (store == DB_STORE_NAMED) {
		/*
		 * Store the full path of zFilename.  The result of the VFS
		 * call must be checked: if it fails the buffer does not hold
		 * a usable path and every name derived from it below would be
		 * wrong.
		 */
		dirPathName = dirPathBuf;
		if ((rc = sqlite3OsFullPathname(db->pVfs, zFilename,
		    sizeof(dirPathBuf), dirPathName)) != SQLITE_OK) {
			btreeFreeSharedBtree(new_bt, 0);
			return rc;
		}
		if ((new_bt->full_name = sqlite3_strdup(dirPathName)) == NULL)
			goto err_nomem;
		if ((new_bt->orig_name = sqlite3_strdup(zFilename)) == NULL)
			goto err_nomem;
		/* The environment directory is "<full path>-journal". */
		sqlite3_snprintf(sizeof(dirPathBuf), dirPathBuf,
		    "%s-journal", new_bt->full_name);
		if ((new_bt->dir_name = sqlite3_strdup(dirPathBuf)) == NULL)
			goto err_nomem;

		/*
		 * Extract just the file name component: whatever follows the
		 * last '/' or '\\', whichever comes later in orig_name.
		 */
		new_bt->short_name = strrchr(new_bt->orig_name, '/');
		bslash = strrchr(new_bt->orig_name, '\\');
		if (new_bt->short_name == NULL ||
		    new_bt->short_name < bslash)
			new_bt->short_name = bslash;
		if (new_bt->short_name == NULL)
			new_bt->short_name = new_bt->orig_name;
		else
			/* Move past actual path separator. */
			++new_bt->short_name;
	}

	new_bt->cacheSize = SQLITE_DEFAULT_CACHE_SIZE;
	new_bt->pageCount = SQLITE_MAX_PAGE_COUNT;
	new_bt->nRef = 1;
	new_bt->uid = g_uid_next++;
	new_bt->logFileSize = SQLITE_DEFAULT_JOURNAL_SIZE_LIMIT;
#ifdef SQLITE_SECURE_DELETE
	new_bt->secureDelete = 1;
#endif

	p->pBt = new_bt;

	return SQLITE_OK;

err_nomem:
	btreeFreeSharedBtree(new_bt, 0);
	return SQLITE_NOMEM;
}
2039 
2040 /*
2041 ** Open a new database.
2042 **
2043 ** zFilename is the name of the database file.  If zFilename is NULL a new
2044 ** database with a random name is created.  This randomly named database file
2045 ** will be deleted when sqlite3BtreeClose() is called.
2046 */
sqlite3BtreeOpen(const char * zFilename,sqlite3 * db,Btree ** ppBtree,int flags,int vfsFlags)2047 int sqlite3BtreeOpen(
2048     const char *zFilename,	/* Name of the file containing the database */
2049     sqlite3 *db,		/* Associated database connection */
2050     Btree **ppBtree,		/* Pointer to new Btree object written here */
2051     int flags,			/* Options */
2052     int vfsFlags)		/* Flags passed through to VFS open */
2053 {
2054 	Btree *p, *next_btree;
2055 	BtShared *pBt, *next_bt;
2056 	int rc;
2057 	sqlite3_mutex *mutexOpen;
2058 	storage_mode_t store;
2059 	u_int8_t fileid[DB_FILE_ID_LEN];
2060 	char *filename;
2061 
2062 	log_msg(LOG_VERBOSE, "sqlite3BtreeOpen(%s, %p, %p, %u, %u)", zFilename,
2063 	    db, ppBtree, flags, vfsFlags);
2064 
2065 	pBt = NULL;
2066 	rc = SQLITE_OK;
2067 	mutexOpen = NULL;
2068 	filename = NULL;
2069 
2070 	if ((p = (Btree *)sqlite3_malloc(sizeof(Btree))) == NULL)
2071 		return SQLITE_NOMEM;
2072 	memset(p, 0, sizeof(Btree));
2073 	memset(&fileid[0], 0, DB_FILE_ID_LEN);
2074 	p->db = db;
2075 	p->vfsFlags = vfsFlags;
2076 	p->pBt = NULL;
2077 	p->readonly = 0;
2078 	p->txn_bulk = BDBSQL_TXN_BULK_DEFAULT;
2079 	p->vacuumPages = BDBSQL_INCR_VACUUM_PAGES;
2080 	p->fillPercent = BDBSQL_VACUUM_FILLPERCENT;
2081 
2082 	if ((vfsFlags & SQLITE_OPEN_TRANSIENT_DB) != 0) {
2083 		log_msg(LOG_DEBUG, "sqlite3BtreeOpen creating temporary DB.");
2084 		store = DB_STORE_TMP;
2085 	} else if (zFilename == NULL ||
2086 	    (zFilename[0] == '\0' || strcmp(zFilename, ":memory:") == 0) ||
2087 	    (flags & BTREE_MEMORY) != 0) {
2088 		/*
2089 		 * Berkeley DB treats in-memory and temporary databases the
2090 		 * same way: if there is not enough space in cache, pages
2091 		 * overflow to temporary files.
2092 		 */
2093 		log_msg(LOG_DEBUG, "sqlite3BtreeOpen creating in-memory DB.");
2094 		store = DB_STORE_INMEM;
2095 	} else {
2096 		log_msg(LOG_DEBUG, "sqlite3BtreeOpen creating named DB.");
2097 		store = DB_STORE_NAMED;
2098 		/*
2099 		 * We always use the shared cache of handles, but SQLite
2100 		 * performs additional checks for conflicting table locks
2101 		 * when it is in shared cache mode, and aborts early.
2102 		 * We use the sharable flag to control that behavior.
2103 		 */
2104 		if (vfsFlags & SQLITE_OPEN_SHAREDCACHE)
2105 			p->sharable = 1;
2106 	}
2107 
2108 	mutexOpen = sqlite3MutexAlloc(OPEN_MUTEX(store));
2109 	sqlite3_mutex_enter(mutexOpen);
2110 
2111 #ifdef BDBSQL_FILE_PER_TABLE
2112 	if (store == DB_STORE_NAMED) {
2113 		rc = getMetaDataFileName(zFilename, &filename);
2114 		if (rc != SQLITE_OK)
2115 			goto err;
2116 	}
2117 #else
2118 	filename = (char *)zFilename;
2119 #endif
2120 
2121 	/* Non-named databases never share any content in BtShared. */
2122 	if (store == DB_STORE_NAMED &&
2123 	    !__os_exists(NULL, filename, NULL) &&
2124 	    __os_fileid(NULL, filename, 0, fileid) == 0) {
2125 		if ((rc = btreeGetSharedBtree(&pBt,
2126 		    fileid, db, store, vfsFlags)) != SQLITE_OK)
2127 			goto err;
2128 	}
2129 
2130 	if (pBt != NULL) {
2131 		p->pBt = pBt;
2132 		if ((rc = btreeOpenEnvironment(p, 0)) != SQLITE_OK) {
2133 			/*
2134 			 * clean up ref. from btreeGetSharedBtree() [#18767]
2135 			 */
2136 			assert(pBt->nRef > 1);
2137 			sqlite3_mutex_enter(pBt->mutex);
2138 			pBt->nRef--;
2139 			sqlite3_mutex_leave(pBt->mutex);
2140 			goto err;
2141 		}
2142 		/* The btreeOpenEnvironment call might have updated pBt. */
2143 		pBt = p->pBt;
2144 	} else {
2145 		if ((rc = btreeCreateSharedBtree(p,
2146 		    zFilename, fileid, db, flags, store)) != 0)
2147 			goto err;
2148 		pBt = p->pBt;
2149 		if (!pBt->resultsBuffer &&
2150 		    (rc = btreePrepareEnvironment(p)) != 0) {
2151 			btreeFreeSharedBtree(pBt, 0);
2152 			goto err;
2153 		}
2154 		/* Only named databases are in the shared btree cache. */
2155 		if (store == DB_STORE_NAMED) {
2156 			if (g_shared_btrees == NULL) {
2157 				pBt->pPrevDb = NULL;
2158 				g_shared_btrees = pBt;
2159 			} else {
2160 				for (next_bt = g_shared_btrees;
2161 				    next_bt->pNextDb != NULL;
2162 				    next_bt = next_bt->pNextDb) {}
2163 				next_bt->pNextDb = pBt;
2164 				pBt->pPrevDb = next_bt;
2165 			}
2166 		}
2167 	}
2168 
2169 	/* Add this Btree object to the list of Btrees seen by the BtShared */
2170 	for (next_btree = pBt->btrees; next_btree != NULL;
2171 	    next_btree = next_btree->pNext) {
2172 		if (next_btree == p)
2173 			break;
2174 	}
2175 	if (next_btree == NULL) {
2176 		if (pBt->btrees == NULL)
2177 			pBt->btrees = p;
2178 		else {
2179 			p->pNext = pBt->btrees;
2180 			pBt->btrees->pPrev = p;
2181 			pBt->btrees = p;
2182 		}
2183 	}
2184 	p->readonly = (p->vfsFlags & SQLITE_OPEN_READONLY) ? 1 : 0;
2185 	*ppBtree = p;
2186 
2187 err:	if (rc != SQLITE_OK)
2188 		sqlite3_free(p);
2189 	if (mutexOpen != NULL) {
2190 		assert(sqlite3_mutex_held(mutexOpen));
2191 		sqlite3_mutex_leave(mutexOpen);
2192 	}
2193 #ifdef BDBSQL_FILE_PER_TABLE
2194 	if (filename != NULL)
2195 		sqlite3_free(filename);
2196 #endif
2197 	return rc;
2198 }
2199 
2200 /* Close all cursors for the given transaction. */
btreeCloseAllCursors(Btree * p,DB_TXN * txn)2201 static int btreeCloseAllCursors(Btree *p, DB_TXN *txn)
2202 {
2203 	BtCursor *c, *nextc, *prevc, *free_cursors;
2204 	BtShared *pBt;
2205 	DB_TXN *db_txn, *dbc_txn;
2206 	int rc, ret, t_rc;
2207 
2208 	log_msg(LOG_VERBOSE, "btreeCloseAllCursors(%p, %p)", p, txn);
2209 
2210 	free_cursors = NULL;
2211 	pBt = p->pBt;
2212 	rc = SQLITE_OK;
2213 
2214 	sqlite3_mutex_enter(pBt->mutex);
2215 	for (c = pBt->first_cursor, prevc = NULL;
2216 	    c != NULL;
2217 	    prevc = c, c = nextc) {
2218 		nextc = c->next;
2219 		if (p != c->pBtree)
2220 			continue;
2221 		if (txn != NULL) {
2222 			if (c->dbc == NULL)
2223 				continue;
2224 			dbc_txn = c->dbc->txn;
2225 			db_txn = c->dbc->dbp->cur_txn;
2226 			while (dbc_txn != NULL && dbc_txn != txn)
2227 				dbc_txn = dbc_txn->parent;
2228 			while (db_txn != NULL && db_txn != txn)
2229 				db_txn = db_txn->parent;
2230 			if (dbc_txn != txn && db_txn != txn)
2231 				continue;
2232 		}
2233 
2234 		/*
2235 		 * Detach the cursor from the main list and add it to the free
2236 		 * list.
2237 		 */
2238 		if (prevc == NULL)
2239 			pBt->first_cursor = nextc;
2240 		else
2241 			prevc->next = nextc;
2242 
2243 		c->next = free_cursors;
2244 		free_cursors = c;
2245 		c = prevc;
2246 	}
2247 	sqlite3_mutex_leave(pBt->mutex);
2248 
2249 	for (c = free_cursors; c != NULL; c = c->next) {
2250 		t_rc = btreeCloseCursor(c, 0);
2251 		if (t_rc != SQLITE_OK && rc == SQLITE_OK)
2252 			rc = t_rc;
2253 	}
2254 
2255 	if (p->compact_cursor != NULL) {
2256 		if ((ret = p->compact_cursor->close(p->compact_cursor)) != 0 &&
2257 		    rc == SQLITE_OK)
2258 			rc = dberr2sqlite(ret, p);
2259 		p->compact_cursor = NULL;
2260 	}
2261 
2262 	if (p->schemaLock != NULL && txn != NULL) {
2263 		dbc_txn = p->schemaLock->txn;
2264 		while (dbc_txn != NULL && dbc_txn != txn)
2265 		    dbc_txn = dbc_txn->parent;
2266 		if (dbc_txn == txn &&
2267 		    (t_rc = btreeLockSchema(p, LOCKMODE_NONE)) != SQLITE_OK &&
2268 		    rc == SQLITE_OK)
2269 			rc = t_rc;
2270 	}
2271 
2272 	return rc;
2273 }
2274 
/*
 * Walk the BtShared handle cache (pBt->db_cache) and act on every entry
 * according to the cleanup mode:
 *
 *   CLEANUP_GET_LOCKS / CLEANUP_DROP_LOCKS
 *       Acquire or release the handle lock of every cached database handle
 *       (sequences, entries without a handle and the entry keyed "1" are
 *       skipped).  Nothing is closed.  These two modes run under the
 *       BtShared mutex and are a no-op while a backup is in progress
 *       (p->nBackup > 0).
 *
 *   CLEANUP_ABORT
 *       Close sequences, dropping the cache entry of any sequence whose
 *       record no longer exists in the metadata database, and close and
 *       drop database handles that are no longer marked as opened.
 *       Handles that are still open are left untouched.
 *
 *   CLEANUP_CLOSE
 *       Close every handle and free every cache entry.
 *
 * For modes other than the two lock modes, an entry that is neither removed
 * nor freed has its handle pointer cleared.
 *
 * Returns SQLITE_OK, or the SQLite translation of the first Berkeley DB
 * error reported while closing a handle.
 */
static int btreeCleanupCachedHandles(Btree *p, cleanup_mode_t cleanup)
{
	DB *dbp;
	DB_SEQUENCE *seq;
	DBT key;
	CACHED_DB *cached_db;
	BtShared *pBt;
	HashElem *e, *e_next;
	SEQ_COOKIE *sc;
	int lock_only, remove, ret, rc;

	log_msg(LOG_VERBOSE, "btreeCleanupCachedHandles(%p, %d)",
	    p, (int)cleanup);

	pBt = p->pBt;
	e = NULL;
	rc = SQLITE_OK;
	remove = 0;
	lock_only =
	    (cleanup == CLEANUP_GET_LOCKS || cleanup == CLEANUP_DROP_LOCKS);

	/* If a backup is in progress, we can't drop handle locks. */
	if (lock_only && p->nBackup > 0)
		return (SQLITE_OK);

	if (lock_only)
		sqlite3_mutex_enter(pBt->mutex);

	for (e = sqliteHashFirst(&pBt->db_cache); e != NULL;
	    e = e_next) {
		/*
		 * Grab the next value now rather than in the for loop so that
		 * it's possible to remove elements from the list inline.
		 */
		e_next = sqliteHashNext(e);
		cached_db = sqliteHashData(e);

		if (cached_db == NULL)
			continue;

		if (lock_only) {
			if (cached_db->is_sequence || cached_db->dbp == NULL ||
			    strcmp(cached_db->key, "1") == 0)
				continue;
			if (cleanup == CLEANUP_GET_LOCKS)
				btreeDbHandleLock(p, cached_db);
			else
				btreeDbHandleUnlock(p, cached_db);
			continue;
		}

		if (cached_db->is_sequence) {
			sc = (SEQ_COOKIE *)cached_db->cookie;
			if (cleanup == CLEANUP_ABORT && sc != NULL) {
				memset(&key, 0, sizeof(key));
				key.data = sc->name;
				key.size = key.ulen = sc->name_len;
				key.flags = DB_DBT_USERMEM;
				if (pMetaDb->exists(pMetaDb,
				    pFamilyTxn, &key, 0) == DB_NOTFOUND) {
					/*
					 * This abort removed a sequence -
					 * remove the matching cache entry.
					 */
					remove = 1;
				}
			}
			seq = (DB_SEQUENCE *)cached_db->dbp;
			if (seq != NULL && (ret = seq->close(seq, 0)) != 0 &&
			    rc == SQLITE_OK)
				rc = dberr2sqlite(ret, p);
		} else if ((dbp = cached_db->dbp) != NULL) {
			/*
			 * We have to clear the cache of any stale DB handles.
			 * If a transaction has been aborted, the handle will
			 * no longer be open.  We peek inside the handle at
			 * the flags to find out: otherwise, we would need to
			 * track all parent / child relationships when
			 * rolling back transactions.
			 */
			if (cleanup == CLEANUP_ABORT &&
			    (dbp->flags & DB_AM_OPEN_CALLED) != 0)
				continue;

#ifndef BDBSQL_SINGLE_THREAD
			if (dbp->app_private != NULL)
				sqlite3_free(dbp->app_private);
#endif
			/*
			 * Record a close failure (non-zero return) unless an
			 * earlier error has already been captured.
			 */
			if ((ret = closeDB(p, dbp, DB_NOSYNC)) != 0 &&
			    rc == SQLITE_OK)
				rc = dberr2sqlite(ret, p);
			remove = 1;
		}
		if (cleanup == CLEANUP_CLOSE || remove) {
			/* Inserting NULL data removes the hash entry. */
			if (remove)
				sqlite3HashInsert(&pBt->db_cache,
				    cached_db->key,
				    (int)strlen(cached_db->key), NULL);
			if (cached_db->cookie != NULL)
				sqlite3_free(cached_db->cookie);
			sqlite3_free(cached_db);
			remove = 0;
		} else
			cached_db->dbp = NULL;
	}

	if (lock_only)
		sqlite3_mutex_leave(pBt->mutex);

	return rc;
}
2387 
2388 /*
2389 ** Close an open database and invalidate all cursors.
2390 */
sqlite3BtreeClose(Btree * p)2391 int sqlite3BtreeClose(Btree *p)
2392 {
2393 	Btree *next_btree;
2394 	BtShared *pBt;
2395 	int ret, rc, t_rc, t_ret;
2396 	sqlite3_mutex *mutexOpen;
2397 #ifdef BDBSQL_SHARE_PRIVATE
2398 	int needsunlock = 0;
2399 #endif
2400 
2401 	log_msg(LOG_VERBOSE, "sqlite3BtreeClose(%p)", p);
2402 
2403 	ret = 0;
2404 	pBt = p->pBt;
2405 	rc = SQLITE_OK;
2406 
2407 	if (pBt == NULL)
2408 		goto done;
2409 #ifdef BDBSQL_SHARE_PRIVATE
2410 	/*
2411 	 * It is useful to checkpoint when closing but in the case of
2412 	 * BDBSQL_SHARE_PRIVATE the write lock is required to ensure
2413 	 * that the current data is written.  That must be acquired while
2414 	 * the environment is still intact in case of a re-open.
2415 	 */
2416 	if (pBt->dbStorage == DB_STORE_NAMED && pDbEnv) {
2417 		if (pBt->transactional && pBt->env_opened) {
2418 			btreeScopedFileLock(p, 1, 0);
2419 			needsunlock = 1;
2420 			/* checkpoint happens below */
2421 		}
2422 	}
2423 #endif
2424 
2425 	rc = btreeCloseAllCursors(p, NULL);
2426 
2427 #ifndef SQLITE_OMIT_AUTOVACUUM
2428 	/*
2429 	 * Btree might keep some incremental vacuum info with an internal
2430 	 * link list. Need to free the link when Btree is closed.
2431 	 */
2432 	btreeFreeVacuumInfo(p);
2433 #endif
2434 
2435 	if (pMainTxn != NULL &&
2436 	    (t_rc = sqlite3BtreeRollback(p)) != SQLITE_OK && rc == SQLITE_OK)
2437 		rc = t_rc;
2438 	assert(pMainTxn == NULL);
2439 
2440 	if (pFamilyTxn != NULL) {
2441 		ret = pFamilyTxn->commit(pFamilyTxn, 0);
2442 		pFamilyTxn = NULL;
2443 		p->inTrans = TRANS_NONE;
2444 		p->txn_excl = 0;
2445 		if (ret != 0 && rc == SQLITE_OK)
2446 			rc = dberr2sqlite(ret, p);
2447 	}
2448 
2449 	if (p->schema != NULL) {
2450 		if (p->free_schema != NULL)
2451 			p->free_schema(p->schema);
2452 		/* This needs to be a real call to sqlite3_free. */
2453 #ifdef BDBSQL_OMIT_LEAKCHECK
2454 #undef	sqlite3_free
2455 #endif
2456 		sqlite3_free(p->schema);
2457 #ifdef BDBSQL_OMIT_LEAKCHECK
2458 #define	sqlite3_free free
2459 #endif
2460 	}
2461 
2462 	/*
2463 	 * #18538 -- another thread may be attempting to open this BtShared at
2464 	 * the same time that we are closing it.
2465 	 *
2466 	 * To avoid a race, we need to hold the open mutex until the
2467 	 * environment is closed.  Otherwise, the opening thread might open its
2468 	 * handle before this one is completely closed, and DB_REGISTER doesn't
2469 	 * support that.
2470 	 */
2471 	mutexOpen = sqlite3MutexAlloc(OPEN_MUTEX(pBt->dbStorage));
2472 	sqlite3_mutex_enter(mutexOpen);
2473 
2474 	/* Remove this pBt from the BtShared list of btrees. */
2475 	for (next_btree = pBt->btrees; next_btree != NULL;
2476 	    next_btree = next_btree->pNext) {
2477 		if (next_btree == p) {
2478 			if (next_btree == pBt->btrees) {
2479 				pBt->btrees = next_btree->pNext;
2480 				if (pBt->btrees != NULL)
2481 					pBt->btrees->pPrev = NULL;
2482 			} else {
2483 				p->pPrev->pNext = p->pNext;
2484 				if (p->pNext != NULL)
2485 					p->pNext->pPrev = p->pPrev;
2486 			}
2487 		}
2488 	}
2489 
2490 	if (--pBt->nRef == 0) {
2491 		assert (pBt->btrees == NULL);
2492 		if (pBt->dbStorage == DB_STORE_NAMED) {
2493 			/* Remove it from the linked list of shared envs. */
2494 			assert(pBt == g_shared_btrees || pBt->pPrevDb != NULL);
2495 			if (pBt == g_shared_btrees)
2496 				g_shared_btrees = pBt->pNextDb;
2497 			else
2498 				pBt->pPrevDb->pNextDb = pBt->pNextDb;
2499 			if (pBt->pNextDb != NULL)
2500 				pBt->pNextDb->pPrevDb = pBt->pPrevDb;
2501 		}
2502 
2503 		/*
2504 		 * At this point, the BtShared has been removed from the shared
2505 		 * list, so it cannot be reused and it is safe to close any
2506 		 * handles.
2507 		 */
2508 		t_rc = btreeCleanupCachedHandles(p, CLEANUP_CLOSE);
2509 		if (t_rc != SQLITE_OK && rc == SQLITE_OK)
2510 			rc = t_rc;
2511 		sqlite3HashClear(&pBt->db_cache);
2512 
2513 		/* Delete any memory held by the pragma cache. */
2514 		cleanPragmaCache(p);
2515 
2516 		if (pTablesDb != NULL && (t_ret =
2517 		    pTablesDb->close(pTablesDb, DB_NOSYNC)) != 0 && ret == 0)
2518 			ret = t_ret;
2519 		if (pMetaDb != NULL && (t_ret =
2520 		    pMetaDb->close(pMetaDb, DB_NOSYNC)) != 0 && ret == 0)
2521 			ret = t_ret;
2522 		pTablesDb = pMetaDb = NULL;
2523 
2524 		/* We never close down the shared tmp environment. */
2525 		if (pBt->dbStorage == DB_STORE_NAMED && pDbEnv) {
2526 			/*
2527 			 * Checkpoint when closing.  This allows log file
2528 			 * auto-removal, which keeps the size of the
2529 			 * environment directory small and also
2530 			 * bounds the time we would have to spend in
2531 			 * recovery.
2532 			 */
2533 			if (pBt->transactional && pBt->env_opened) {
2534 				if ((t_ret = pDbEnv->txn_checkpoint(pDbEnv,
2535 				    0, 0, 0)) != 0 && ret == 0)
2536 					ret = t_ret;
2537 			}
2538 #ifdef BDBSQL_SHARE_PRIVATE
2539 			/* don't flush the cache; checkpoint has been done */
2540 			pDbEnv->set_errcall(pDbEnv, NULL);
2541 			pDbEnv->set_flags(pDbEnv, DB_NOFLUSH, 1);
2542 #endif
2543 			if ((t_ret = pDbEnv->close(pDbEnv, 0)) != 0 && ret == 0)
2544 				ret = t_ret;
2545 			pBt->repStarted = 0;
2546 		}
2547 #ifdef BDBSQL_SHARE_PRIVATE
2548 		/* this must happen before the pBt disappears */
2549 		if (needsunlock)
2550 			btreeScopedFileUnlock(p, 1);
2551 #endif
2552 		btreeFreeSharedBtree(pBt, 0);
2553 	}
2554 	sqlite3_mutex_leave(mutexOpen);
2555 
2556 done:	rc = (rc != SQLITE_OK) ?
2557 	    rc : (ret == 0) ? SQLITE_OK : dberr2sqlite(ret, p);
2558 	sqlite3_free(p);
2559 	return rc;
2560 }
2561 
2562 /*
2563 ** Change the limit on the number of pages allowed in the cache.
2564 **
2565 ** The maximum number of cache pages is set to the absolute value of mxPage.
2566 ** If mxPage is negative in SQLite, the pager will operate asynchronously - it
2567 ** will not stop to do fsync()s to insure data is written to the disk surface
2568 ** before continuing.
2569 **
2570 ** The Berkeley DB cache always operates in asynchronously (except when writing
2571 ** a checkpoint), but log writes are triggered to maintain write-ahead logging
2572 ** semantics.
2573 */
sqlite3BtreeSetCacheSize(Btree * p,int mxPage)2574 int sqlite3BtreeSetCacheSize(Btree *p, int mxPage)
2575 {
2576 	BtShared *pBt;
2577 	log_msg(LOG_VERBOSE, "sqlite3BtreeSetCacheSize(%p, %u)", p, mxPage);
2578 
2579 	pBt = p->pBt;
2580 	if (mxPage < 0)
2581 		mxPage = -mxPage;
2582 
2583 	if (!p->connected)
2584 		pBt->cacheSize = mxPage;
2585 	return SQLITE_OK;
2586 }
2587 
2588 /*
2589 ** Change the way data is synced to disk in order to increase or decrease how
2590 ** well the database resists damage due to OS crashes and power failures.
2591 ** Level 1 is the same as asynchronous (no syncs() occur and there is a high
2592 ** probability of damage)  Level 2 is the default.  There is a very low but
2593 ** non-zero probability of damage.  Level 3 reduces the probability of damage
2594 ** to near zero but with a write performance reduction.
2595 **
2596 ** Berkeley DB always does the equivalent of "fullSync".
2597 */
sqlite3BtreeSetSafetyLevel(Btree * p,int level,int fullSync,int ckptFullSync)2598 int sqlite3BtreeSetSafetyLevel(
2599     Btree *p,
2600     int level,
2601     int fullSync,
2602     int ckptFullSync)
2603 {
2604 	BtShared *pBt;
2605 	log_msg(LOG_VERBOSE,
2606 	    "sqlite3BtreeSetSafetyLevel(%p, %u, %u, %u)",
2607 	    p, level, fullSync, ckptFullSync);
2608 
2609 	pBt = p->pBt;
2610 
2611 	/* TODO: Ignore ckptFullSync for now - it corresponds to:
2612 	 * PRAGMA checkpoint_fullfsync
2613 	 * Berkeley DB doesn't allow you to disable that, so ignore the pragma.
2614 	 */
2615 	if (GET_DURABLE(p->pBt)) {
2616 		pDbEnv->set_flags(pDbEnv, DB_TXN_NOSYNC, (level == 1));
2617 		pDbEnv->set_flags(pDbEnv, DB_TXN_WRITE_NOSYNC, (level == 2));
2618 	}
2619 	return SQLITE_OK;
2620 }
2621 
/*
 * Bring the handle cache of p up to date.
 *
 * When schema_changed is non-zero the cache is first run through
 * btreeInvalidateHandleCache(); a non-zero result from that step is
 * returned immediately.  Afterwards (or straight away when schema_changed
 * is zero) handle locks are acquired via
 * btreeCleanupCachedHandles(p, CLEANUP_GET_LOCKS) and its result is
 * returned.
 */
int sqlite3BtreeHandleCacheUpdate(Btree *p, int schema_changed)
{
	int rc;

	if (schema_changed != 0) {
		rc = btreeInvalidateHandleCache(p);
		if (rc != 0)
			return rc;
	}
	return btreeCleanupCachedHandles(p, CLEANUP_GET_LOCKS);
}
2630 
2631 /*
2632  * If the schema version has changed since the last transaction we need to
2633  * close all handles in the handle cache that aren't holding a handle lock.
2634  * Ideally we could do this via the sqlite3ResetInternalSchema method
2635  * but there is no obvious hook there, and.. since we do the GET_LOCKS
2636  * call here, we need to close handles now or we can't tell if they need to be
2637  * closed.
2638  * TODO: We'll probably be best altering the sqlite code to make this work
2639  * more efficiently.
2640  */
btreeInvalidateHandleCache(Btree * p)2641 static int btreeInvalidateHandleCache(Btree *p) {
2642 	BtShared *pBt;
2643 	int cookie, i, rc, ret;
2644 	CACHED_DB *cached_db, **tables_to_close;
2645 	DB *dbp;
2646 	HashElem *e, *e_next;
2647 	u_int32_t flags;
2648 
2649 	rc = ret = 0;
2650 	pBt = p->pBt;
2651 
2652 	if (p->inTrans == TRANS_NONE && p->db != NULL && p->db->aDb != NULL) {
2653 		sqlite3BtreeGetMeta(p, BTREE_SCHEMA_VERSION, (u32 *)&cookie);
2654 		if (p->db->aDb[0].pSchema != NULL &&
2655 		    p->db->aDb[0].pSchema->schema_cookie != cookie) {
2656 			/*
2657 			 * TODO: Is it possible that this function is called
2658 			 * while already holding the mutex? Maybe from the
2659 			 * sequence code.
2660 			 */
2661 			sqlite3_mutex_enter(pBt->mutex);
2662 			/*
2663 			 * We can't call DB->close while holding the mutex, so
2664 			 * record which handles we want to close and do the
2665 			 * actual close after the mutex is released.
2666 			 */
2667 			for (e = sqliteHashFirst(&pBt->db_cache), i = 0;
2668 			    e != NULL; e = sqliteHashNext(e), i++) {}
2669 
2670 			if (i == 0) {
2671 				sqlite3_mutex_leave(pBt->mutex);
2672 				return (0);
2673 			}
2674 
2675 			tables_to_close =
2676 			     sqlite3_malloc(i * sizeof(CACHED_DB *));
2677 			if (tables_to_close == NULL) {
2678 				sqlite3_mutex_leave(pBt->mutex);
2679 				return SQLITE_NOMEM;
2680 			}
2681 			memset(tables_to_close, 0, i * sizeof(CACHED_DB *));
2682 			/*
2683 			 * Ideally we'd be able to find out if the Berkeley DB
2684 			 * fileid is still valid, but that's not currently
2685 			 * simple, so close all handles.
2686 			 */
2687 			for (e = sqliteHashFirst(&pBt->db_cache), i = 0;
2688 			    e != NULL; e = e_next) {
2689 				e_next = sqliteHashNext(e);
2690 				cached_db = sqliteHashData(e);
2691 
2692 				/* Skip table name db and in memory tables. */
2693 				if (cached_db == NULL ||
2694 				    strcmp(cached_db->key, "1") == 0 ||
2695 				    cached_db->dbp == NULL)
2696 					continue;
2697 				dbp = cached_db->dbp;
2698 				dbp->dbenv->get_open_flags(dbp->dbenv, &flags);
2699 				if (flags & DB_PRIVATE)
2700 					continue;
2701 				if (btreeDbHandleIsLocked(cached_db))
2702 					continue;
2703 				tables_to_close[i++] = cached_db;
2704 				sqlite3HashInsert(&pBt->db_cache,
2705 				    cached_db->key,
2706 				    (int)strlen(cached_db->key), NULL);
2707 			}
2708 			sqlite3_mutex_leave(pBt->mutex);
2709 			for (i = 0; tables_to_close[i] != NULL; i++) {
2710 				cached_db = tables_to_close[i];
2711 				dbp = cached_db->dbp;
2712 #ifndef BDBSQL_SINGLE_THREAD
2713 				if (dbp->app_private != NULL)
2714 					sqlite3_free(dbp->app_private);
2715 #endif
2716 				if ((ret = closeDB(p, dbp, DB_NOSYNC)) == 0 &&
2717 				    rc == SQLITE_OK)
2718 					rc = dberr2sqlite(ret, p);
2719 				if (cached_db->cookie != NULL)
2720 					sqlite3_free(cached_db->cookie);
2721 				sqlite3_free(cached_db);
2722 			}
2723 			sqlite3_free(tables_to_close);
2724 			if (rc != 0)
2725 				return (rc);
2726 		}
2727 	}
2728 	return (0);
2729 }
2730 
/*
 * Begin a transaction on behalf of internal callers.
 *
 * Handle locks are (re)acquired on the cached database handles first; the
 * result of that step is deliberately not examined.  The transaction is
 * then started with sqlite3BtreeBeginTrans(), whose result is returned.
 */
int btreeBeginTransInternal(Btree *p, int wrflag)
{
	int rc;

	(void)btreeCleanupCachedHandles(p, CLEANUP_GET_LOCKS);
	rc = sqlite3BtreeBeginTrans(p, wrflag);
	return rc;
}
2736 
2737 /*
2738 ** Attempt to start a new transaction. A write-transaction is started if the
2739 ** second argument is true, otherwise a read-transaction. No-op if a
2740 ** transaction is already in progress.
2741 **
2742 ** A write-transaction must be started before attempting any changes to the
2743 ** database.  None of the following routines will work unless a transaction
2744 ** is started first:
2745 **
2746 **      sqlite3BtreeCreateTable()
2747 **      sqlite3BtreeCreateIndex()
2748 **      sqlite3BtreeClearTable()
2749 **      sqlite3BtreeDropTable()
2750 **      sqlite3BtreeInsert()
2751 **      sqlite3BtreeDelete()
2752 **      sqlite3BtreeUpdateMeta()
2753 */
sqlite3BtreeBeginTrans(Btree * p,int wrflag)2754 int sqlite3BtreeBeginTrans(Btree *p, int wrflag)
2755 {
2756 	BtShared *pBt;
2757 	int rc;
2758 	u_int32_t txn_exclPriority;
2759 	u32 temp;
2760 
2761 	log_msg(LOG_VERBOSE,
2762 	    "sqlite3BtreeBeginTrans(%p, %u) -- writer %s",
2763 	    p, wrflag, pReadTxn ? "active" : "inactive");
2764 
2765 	/*
2766 	 * The BtShared is not in a usable state. Return NOMEM, since it
2767 	 * is the most consistently well handled error return from SQLite code.
2768 	 */
2769 	if (p->pBt->panic)
2770 		return SQLITE_NOMEM;
2771 
2772 	pBt = p->pBt;
2773 	rc = SQLITE_OK;
2774 	txn_exclPriority = -1;
2775 
2776 	/* A replication client should not start write transactions. */
2777 	if (wrflag && (IS_BTREE_READONLY(p) || btreeRepIsClient(p)))
2778 		return SQLITE_READONLY;
2779 
2780 	if (!p->connected) {
2781 		if (wrflag != 2) {
2782 			p->inTrans = (wrflag || p->inTrans == TRANS_WRITE) ?
2783 				TRANS_WRITE : TRANS_READ;
2784 			if (!pBt->need_open)
2785 				return SQLITE_OK;
2786 		}
2787 		if ((rc = btreeOpenEnvironment(p, 1)) != SQLITE_OK)
2788 			return rc;
2789 		/* The btreeOpenEnvironment call might have updated pBt. */
2790 		pBt = p->pBt;
2791 	}
2792 
2793 	if (wrflag == 2)
2794 		p->txn_excl = 1;
2795 	if (pBt->transactional) {
2796 		if (wrflag && p->inTrans != TRANS_WRITE)
2797 			p->inTrans = TRANS_WRITE;
2798 		else if (p->inTrans == TRANS_NONE)
2799 			p->inTrans = TRANS_READ;
2800 
2801 		if (pReadTxn == NULL || p->nSavepoint <= p->db->nSavepoint)
2802 			rc = sqlite3BtreeBeginStmt(p, p->db->nSavepoint);
2803 
2804 		/* Exclusive transaction. */
2805 		if (wrflag == 2 && rc == SQLITE_OK) {
2806 			pSavepointTxn->set_priority(pSavepointTxn,
2807 			    txn_exclPriority);
2808 			pReadTxn->set_priority(pReadTxn, txn_exclPriority);
2809 			pMainTxn->set_priority(pMainTxn, txn_exclPriority);
2810 			pFamilyTxn->set_priority(pFamilyTxn, txn_exclPriority);
2811 			sqlite3BtreeGetMeta(p, 1, &temp);
2812 		} else if (p->txn_priority != 0) {
2813 			pSavepointTxn->set_priority(pSavepointTxn,
2814 			    p->txn_priority);
2815 			pReadTxn->set_priority(pReadTxn, p->txn_priority);
2816 			pMainTxn->set_priority(pMainTxn, p->txn_priority);
2817 			pFamilyTxn->set_priority(pFamilyTxn, p->txn_priority);
2818 		}
2819 	}
2820 	return rc;
2821 }
2822 
2823 /***************************************************************************
2824 ** This routine does the first phase of a two-phase commit.  This routine
2825 ** causes a rollback journal to be created (if it does not already exist)
2826 ** and populated with enough information so that if a power loss occurs the
2827 ** database can be restored to its original state by playing back the journal.
2828 ** Then the contents of the journal are flushed out to the disk. After the
2829 ** journal is safely on oxide, the changes to the database are written into
2830 ** the database file and flushed to oxide. At the end of this call, the
2831 ** rollback journal still exists on the disk and we are still holding all
2832 ** locks, so the transaction has not committed. See sqlite3BtreeCommit() for
2833 ** the second phase of the commit process.
2834 **
2835 ** This call is a no-op if no write-transaction is currently active on pBt.
2836 **
2837 ** Otherwise, sync the database file for the engine pBt. zMaster points to
2838 ** the name of a master journal file that should be written into the
2839 ** individual journal file, or is NULL, indicating no master journal file
2840 ** (single database transaction).
2841 **
2842 ** When this is called, the master journal should already have been created,
2843 ** populated with this journal pointer and synced to disk.
2844 **
2845 ** Once this is routine has returned, the only thing required to commit the
2846 ** write-transaction for this database file is to delete the journal.
2847 */
int sqlite3BtreeCommitPhaseOne(Btree *p, const char *zMaster)
{
	/*
	 * This implementation performs no preparation work: it only logs the
	 * call and reports success.
	 *
	 * zMaster may legitimately be NULL (sqlite3BtreeCommit() passes NULL).
	 * Assuming log_msg formats its arguments printf-style, handing a NULL
	 * pointer to a "%s" conversion is undefined behavior, so substitute a
	 * printable placeholder.
	 */
	log_msg(LOG_VERBOSE,
	    "sqlite3BtreeCommitPhaseOne(%p, %s)", p,
	    zMaster != NULL ? zMaster : "(null)");
	return SQLITE_OK;
}
2854 
2855 /***************************************************************************
2856 ** Commit the transaction currently in progress.
2857 **
2858 ** This routine implements the second phase of a 2-phase commit.  The
2859 ** sqlite3BtreeCommitPhaseOne() routine does the first phase and should
2860 ** be invoked prior to calling this routine.  The sqlite3BtreeCommitPhaseOne()
2861 ** routine did all the work of writing information out to disk and flushing the
2862 ** contents so that they are written onto the disk platter.  All this
** routine has to do is delete or truncate or zero the header in
** the rollback journal (which causes the transaction to commit) and
2865 ** drop locks.
2866 **
2867 ** Normally, if an error occurs while the pager layer is attempting to
2868 ** finalize the underlying journal file, this function returns an error and
2869 ** the upper layer will attempt a rollback. However, if the second argument
2870 ** is non-zero then this b-tree transaction is part of a multi-file
2871 ** transaction. In this case, the transaction has already been committed
2872 ** (by deleting a master journal file) and the caller will ignore this
** function's return code. So, even if an error occurs in the pager layer,
** reset the b-tree object's internal state to indicate that the write
2875 ** transaction has been closed. This is quite safe, as the pager will have
2876 ** transitioned to the error state.
2877 **
2878 ** This will release the write lock on the database file.  If there
2879 ** are no active cursors, it also releases the read lock.
2880 **
2881 ** NOTE: It's OK for Berkeley DB to ignore the bCleanup flag - it is only used
2882 ** by SQLite when it is safe for it to ignore stray journal files. That's not
** a relevant consideration for Berkeley DB.
2884 */
int sqlite3BtreeCommitPhaseTwo(Btree *p, int bCleanup)
{
	Btree *next_btree;
	BtShared *pBt;
	DELETED_TABLE *dtable, *next;
	char *tableName, tableNameBuf[DBNAME_SIZE];
	char *oldTableName, oldTableNameBuf[DBNAME_SIZE], *fileName;
	int needVacuum, rc, ret, t_rc;
	int in_trans, removeFlags;
	u_int32_t defaultTxnPriority;
#ifdef BDBSQL_SHARE_PRIVATE
	int deleted = 0; /* indicates tables were deleted */
	int needsunlock = 0; /* set when the main txn is resolved below */
#endif
#ifdef BDBSQL_FILE_PER_TABLE
	DBT key;
#endif
	/* bCleanup is not referenced anywhere in this function. */
	log_msg(LOG_VERBOSE,
	    "sqlite3BtreeCommitPhaseTwo(%p) -- writer %s",
	    p, pReadTxn ? "active" : "inactive");

	pBt = p->pBt;
	rc = SQLITE_OK;
	defaultTxnPriority = 100;
	needVacuum = 0;
	removeFlags = DB_AUTO_COMMIT | DB_LOG_NO_DATA | DB_NOSYNC |
	    (GET_DURABLE(pBt) ? 0 : DB_TXN_NOT_DURABLE);

	/*
	 * rc always holds the FIRST error seen; later failures never
	 * overwrite it, but the cleanup steps still run.
	 */
	if (pMainTxn && p->db->activeVdbeCnt <= 1) {
#ifdef BDBSQL_SHARE_PRIVATE
		needsunlock = 1;
#endif
		/* Mark the end of an exclusive transaction. */
		p->txn_excl = 0;
		t_rc = btreeCloseAllCursors(p, pMainTxn);
		if (t_rc != SQLITE_OK && rc == SQLITE_OK)
			rc = t_rc;

		/*
		 * Even if we get an error, we can't use the
		 * transaction handle again, so we should keep going
		 * and clear out the Btree fields.
		 */
		ret = pMainTxn->commit(pMainTxn, 0);
		if (ret != 0 && rc == SQLITE_OK)
			rc = dberr2sqlite(ret, p);

		pMainTxn = pSavepointTxn = pReadTxn = NULL;
		p->nSavepoint = 0;

		/*
		 * Walk the list of tables recorded for deletion, remove the
		 * corresponding databases, and free each list entry.
		 */
		for (dtable = p->deleted_tables;
		    dtable != NULL;
		    dtable = next) {
#ifdef BDBSQL_SHARE_PRIVATE
			deleted = 1;
#endif
			tableName = tableNameBuf;
			GET_TABLENAME(tableName, sizeof(tableNameBuf),
			    dtable->iTable, "");
			FIX_TABLENAME(pBt, fileName, tableName);

			/*
			 * In memory db was not renamed. Just do a quick remove
			 * in this case.
			 */
			if (pBt->dbStorage == DB_STORE_INMEM) {
				ret = pDbEnv->dbremove(pDbEnv, NULL, fileName,
				    tableName, removeFlags);
				goto next;
			}
#ifndef BDBSQL_FILE_PER_TABLE
			/* Remove the database under its "old-" name. */
			oldTableName = oldTableNameBuf;
			GET_TABLENAME(oldTableName, sizeof(oldTableNameBuf),
			    dtable->iTable, "old-");

			ret = pDbEnv->dbremove(pDbEnv, NULL, fileName,
			    oldTableName, removeFlags);
#else
			if (dtable->flag == DTF_DELETE) {
				oldTableName = oldTableNameBuf;
				GET_TABLENAME(oldTableName,
				    sizeof(oldTableNameBuf),
				    dtable->iTable, "old-");

				ret = pDbEnv->dbremove(pDbEnv, NULL, fileName,
				    oldTableName, removeFlags);
			} else {
				/*
				 * Remove the whole file, then delete the
				 * table's key from the tables database.
				 */
				ret = pDbEnv->dbremove(pDbEnv, NULL, fileName,
				    NULL, removeFlags);
				if (ret != 0 && rc == SQLITE_OK)
					rc = dberr2sqlite(ret, p);

				memset(&key, 0, sizeof(key));
				key.flags = DB_DBT_USERMEM;
				key.data = tableName;
				key.size = strlen(tableName);
				ret = pTablesDb->del(pTablesDb, NULL, &key, 0);
			}
#endif
next:			if (ret != 0 && rc == SQLITE_OK)
				rc = dberr2sqlite(ret, p);

			/* Fetch the successor before freeing this entry. */
			next = dtable->next;
			sqlite3_free(dtable);
		}
		p->deleted_tables = NULL;

		/* Execute vacuum if auto-vacuum mode is FULL or incremental */
		needVacuum = (pBt->dbStorage == DB_STORE_NAMED &&
		    p->inTrans == TRANS_WRITE &&
		    (sqlite3BtreeGetAutoVacuum(p) == BTREE_AUTOVACUUM_FULL ||
		    p->needVacuum));
	} else if (p->inTrans == TRANS_WRITE)
		/*
		 * The main transaction is left open on this path (there is
		 * none, or more than one VDBE is active); only release
		 * savepoint 0.
		 */
		rc = sqlite3BtreeSavepoint(p, SAVEPOINT_RELEASE, 0);

#ifdef BDBSQL_SHARE_PRIVATE
	if (pBt->dbStorage == DB_STORE_NAMED && needsunlock) {
		/* need to checkpoint if databases were removed */
		if (deleted) {
			assert(btreeHasFileLock(p, 1)); /* write lock */
			/*
			 * Do not let a successful checkpoint hide an earlier
			 * commit or remove failure: only record the result
			 * when no error has been seen yet.
			 */
			ret = pDbEnv->txn_checkpoint(pDbEnv, 0, 0, 0);
			if (ret != 0 && rc == SQLITE_OK)
				rc = dberr2sqlite(ret, p);
		}
		btreeFileUnlock(p);
	}
#endif
	/* Reset the family transaction to the default priority (100). */
	if (pFamilyTxn)
		pFamilyTxn->set_priority(pFamilyTxn, defaultTxnPriority);

	if (p->db->activeVdbeCnt > 1)
		p->inTrans = TRANS_READ;
	else {
		p->inTrans = TRANS_NONE;
		if (p->schemaLockMode > LOCKMODE_NONE &&
		    (t_rc = btreeLockSchema(p, LOCKMODE_NONE)) != SQLITE_OK &&
		    rc == SQLITE_OK)
			rc = t_rc;

		/*
		 * Only release the handle locks if no transactions are active
		 * in any Btree.
		 */
		in_trans = 0;
		for (next_btree = pBt->btrees; next_btree != NULL;
		     next_btree = next_btree->pNext) {
			if (next_btree->inTrans != TRANS_NONE) {
				in_trans = 1;
				break;
			}
		}

		/* Drop any handle locks if this was the only active txn. */
		if (in_trans == 0)
			btreeCleanupCachedHandles(p, CLEANUP_DROP_LOCKS);
	}

	/* The vacuum is attempted only when everything above succeeded. */
	if (needVacuum && rc == SQLITE_OK)
		rc = btreeVacuum(p, &p->db->zErrMsg);

	return rc;
}
3046 
3047 /*
3048 ** Do both phases of the commit.
3049 */
sqlite3BtreeCommit(Btree * p)3050 int sqlite3BtreeCommit(Btree *p)
3051 {
3052 	BtShared *pBt;
3053 	int rc;
3054 
3055 	log_msg(LOG_VERBOSE, "sqlite3BtreeCommit(%p)", p);
3056 
3057 	pBt = p->pBt;
3058 	rc = sqlite3BtreeCommitPhaseOne(p, NULL);
3059 	if (rc == SQLITE_OK)
3060 		rc = sqlite3BtreeCommitPhaseTwo(p, 0);
3061 
3062 	return (rc);
3063 }
3064 
3065 /*
3066 ** Rollback the transaction in progress.  All cursors will be invalidated
3067 ** by this operation.  Any attempt to use a cursor that was open at the
3068 ** beginning of this operation will result in an error.
3069 **
3070 ** This will release the write lock on the database file.  If there are no
3071 ** active cursors, it also releases the read lock.
3072 */
sqlite3BtreeRollback(Btree * p)3073 int sqlite3BtreeRollback(Btree *p)
3074 {
3075 	BtShared *pBt;
3076 	int rc, t_rc;
3077 
3078 	log_msg(LOG_VERBOSE, "sqlite3BtreeRollback(%p)", p);
3079 
3080 	rc = SQLITE_OK;
3081 	pBt = p->pBt;
3082 	if (pMainTxn != NULL)
3083 		rc = sqlite3BtreeSavepoint(p, SAVEPOINT_ROLLBACK, -1);
3084 	if (p->schemaLockMode > LOCKMODE_NONE &&
3085 	    (t_rc = btreeLockSchema(p, LOCKMODE_NONE)) != SQLITE_OK &&
3086 	    rc == SQLITE_OK)
3087 		rc = t_rc;
3088 
3089 	/* Clear failure state if rollback is done successfully. */
3090 	if (rc == SQLITE_OK)
3091 		pBt->panic = 0;
3092 
3093 	return rc;
3094 }
3095 
3096 /*
3097 ** Start a statement subtransaction.  The subtransaction can be rolled back
3098 ** independently of the main transaction. You must start a transaction
3099 ** before starting a subtransaction. The subtransaction is ended automatically
3100 ** if the main transaction commits or rolls back.
3101 **
3102 ** Only one subtransaction may be active at a time.  It is an error to try
3103 ** to start a new subtransaction if another subtransaction is already active.
3104 **
3105 ** Statement subtransactions are used around individual SQL statements that
3106 ** are contained within a BEGIN...COMMIT block.  If a constraint error
3107 ** occurs within the statement, the effect of that one statement can be
3108 ** rolled back without having to rollback the entire transaction.
3109 */
int sqlite3BtreeBeginStmt(Btree *p, int iStatement)
{
	BtShared *pBt;
	int ret;

	log_msg(LOG_VERBOSE, "sqlite3BtreeBeginStmt(%p, %d)", p, iStatement);

	pBt = p->pBt;

	/*
	 * Nothing to begin unless the environment is transactional, this
	 * handle is inside a transaction, and a family transaction exists.
	 */
	if (!pBt->transactional || p->inTrans == TRANS_NONE ||
	    pFamilyTxn == NULL)
		return SQLITE_OK;

	/* Make sure there is a main transaction (child of the family txn). */
	if (pMainTxn == NULL) {
#ifdef BDBSQL_SHARE_PRIVATE
		/* btree{Read,Write}lock may reopen the environment */
		if (pBt->dbStorage == DB_STORE_NAMED)
			btreeFileLock(p);
#endif
		ret = pDbEnv->txn_begin(pDbEnv, pFamilyTxn, &pMainTxn,
		    p->txn_bulk ? DB_TXN_BULK : pBt->read_txn_flags);
		if (ret != 0) {
#ifdef BDBSQL_SHARE_PRIVATE
			if (pBt->dbStorage == DB_STORE_NAMED)
				btreeFileUnlock(p);
#endif
			return dberr2sqlite(ret, p);
		}
		/* Savepoints nest underneath the main transaction. */
		pSavepointTxn = pMainTxn;
	}

	/*
	 * Make sure there is a read transaction.  In bulk mode the main
	 * transaction doubles as the read transaction; otherwise a child of
	 * the main transaction is started.
	 */
	if (pReadTxn == NULL) {
		if (p->txn_bulk)
			pReadTxn = pMainTxn;
		else {
			ret = pDbEnv->txn_begin(pDbEnv, pMainTxn,
			    &pReadTxn, pBt->read_txn_flags);
			if (ret != 0)
				return dberr2sqlite(ret, p);
		}
	}

	/*
	 * Outside bulk mode, nest savepoint transactions until the savepoint
	 * count exceeds iStatement.  Each new transaction is a child of the
	 * previous innermost one.
	 */
	while (!p->txn_bulk && p->nSavepoint <= iStatement) {
		ret = pDbEnv->txn_begin(pDbEnv, pSavepointTxn,
		    &pSavepointTxn, 0);
		if (ret != 0)
			return dberr2sqlite(ret, p);
		p->nSavepoint++;
	}

	return SQLITE_OK;
}
3158 
/*
 * Compare two keys for a Berkeley DB btree using SQLite record ordering.
 *
 * If either DBT carries an unpacked key in app_data, that key is used
 * directly.  Otherwise dbt2 is unpacked with keyInfo; when keyInfo is NULL
 * (multi-threaded builds only) it is borrowed from a cursor of the current
 * thread that is open on the same table.
 *
 * Returns the value of sqlite3VdbeRecordCompare (negated when the unpacked
 * key came from dbt1), or 0 when both DBTs share a data pointer or the
 * record could not be unpacked.
 */
static int btreeCompare(
    DB *dbp,
    const DBT *dbt1,
    const DBT *dbt2,
    struct KeyInfo *keyInfo)
{
	BtShared *pBt;
	UnpackedRecord *pRec;
	char aSpace[40 * sizeof(void *)];
	int locked, res;
#ifndef BDBSQL_SINGLE_THREAD
	TableInfo *tableInfo;
	BtCursor *pCur;
	int iTable;
#endif

	log_msg(LOG_VERBOSE, "btreeCompare(%p, %p, %p)", dbp, dbt1, dbt2);

	/* Use the unpacked key from dbt1; the operands are swapped, so negate. */
	if (dbt1->app_data != NULL)
		return -sqlite3VdbeRecordCompare(dbt2->size, dbt2->data,
		    dbt1->app_data);

	/* Use the unpacked key from dbt2 */
	if (dbt2->app_data != NULL)
		return sqlite3VdbeRecordCompare(dbt1->size, dbt1->data,
		    dbt2->app_data);

	/*
	 * We don't have an unpacked key cached, generate one.
	 *
	 * This code should only execute if we are inside
	 * DB->sort_multiple, or some uncommon paths inside Berkeley
	 * DB, such as deferred delete of an item in a Btree.
	 */

	/* This case can happen when searching temporary tables. */
	if (dbt1->data == dbt2->data)
		return 0;

	pBt = NULL;
	locked = 0;

#ifndef BDBSQL_SINGLE_THREAD
	if (keyInfo == NULL) {
		/* Find a cursor for this table, and use its keyInfo. */
		tableInfo = dbp->app_private;
		iTable = tableInfo->iTable;
		pBt = tableInfo->pBt;

		/*
		 * We can end up in here while closing a cursor, but we
		 * take care not to be holding the BtShared mutex.
		 * Keep the mutex until we are done so that some other
		 * thread can't free the keyInfo from under us.
		 */
		if (!pBt->resultsBuffer) {
			sqlite3_mutex_enter(pBt->mutex);
			locked = 1;
		}

		pCur = pBt->first_cursor;
		while (pCur != NULL &&
		    !(pCur->tableIndex == iTable &&
		    isCurrentThread(pCur->threadID)))
			pCur = pCur->next;

		assert(pCur);
		keyInfo = pCur->keyInfo;
	}
#endif

	pRec = sqlite3VdbeRecordUnpack(keyInfo, dbt2->size, dbt2->data,
	    aSpace, sizeof(aSpace));

	/*
	 * XXX If we are out of memory, the call to unpack the record
	 * may have returned NULL.  The out-of-memory error has been
	 * noted and will be handled by the VM, but we really want to
	 * return that error to Berkeley DB.  There is no way to do
	 * that through the callback, so return zero.
	 *
	 * We choose zero because it makes loops terminate (e.g., if
	 * we're called as part of a sort).
	 */
	if (pRec == NULL)
		res = 0;
	else {
		res = sqlite3VdbeRecordCompare(dbt1->size, dbt1->data, pRec);
		sqlite3VdbeDeleteUnpackedRecord(pRec);
	}

	if (locked)
		sqlite3_mutex_leave(pBt->mutex);

	return res;
}
3249 
/*
 * Comparator for handles whose app_private field holds the KeyInfo itself:
 * forwards to btreeCompare with that KeyInfo.
 */
static int btreeCompareKeyInfo(DB *dbp, const DBT *dbt1, const DBT *dbt2)
{
	struct KeyInfo *keyInfo;

	keyInfo = (struct KeyInfo *)dbp->app_private;
	assert(keyInfo != NULL);

	return btreeCompare(dbp, dbt1, dbt2, keyInfo);
}
3256 
3257 #ifndef BDBSQL_SINGLE_THREAD
btreeCompareShared(DB * dbp,const DBT * dbt1,const DBT * dbt2)3258 static int btreeCompareShared(DB *dbp, const DBT *dbt1, const DBT *dbt2)
3259 {
3260 	/*
3261 	 * In some cases (e.g., vacuum), a KeyInfo may have been stashed
3262 	 * inside the TableInfo.  That's because we can't change the comparator
3263 	 * to btreeCompareKeyInfo on an open DB handle.  If so, use that in
3264 	 * preference to searching for one.
3265 	 */
3266 	return btreeCompare(dbp, dbt1, dbt2,
3267 	    ((TableInfo *)dbp->app_private)->pKeyInfo);
3268 }
3269 #endif
3270 
3271 /*
3272  * Configures a Berkeley DB database handle prior to calling open.
3273  */
btreeConfigureDbHandle(Btree * p,int iTable,DB ** dbpp)3274 static int btreeConfigureDbHandle(Btree *p, int iTable, DB **dbpp)
3275 {
3276 	BtShared *pBt;
3277 	DB *dbp;
3278 	DB_MPOOLFILE *pMpf;
3279 	int ret;
3280 	u_int32_t flags;
3281 #ifndef BDBSQL_SINGLE_THREAD
3282 	TableInfo *tableInfo;
3283 
3284 	tableInfo = NULL;
3285 #endif
3286 
3287 	pBt = p->pBt;
3288 	/* Odd-numbered tables have integer keys. */
3289 	flags = (iTable & 1) ? BTREE_INTKEY : 0;
3290 
3291 	if ((ret = db_create(&dbp, pDbEnv, 0)) != 0)
3292 		goto err;
3293 	if ((flags & BTREE_INTKEY) == 0) {
3294 #ifdef BDBSQL_SINGLE_THREAD
3295 		dbp->set_bt_compare(dbp, btreeCompareKeyInfo);
3296 #else
3297 		if ((tableInfo = sqlite3_malloc(sizeof(TableInfo))) == NULL) {
3298 			ret = ENOMEM;
3299 			goto err;
3300 		}
3301 		tableInfo->pBt = pBt;
3302 		tableInfo->pKeyInfo = NULL;
3303 		tableInfo->iTable = iTable;
3304 		dbp->app_private = tableInfo;
3305 		dbp->set_bt_compare(dbp, btreeCompareShared);
3306 #endif
3307 	} else
3308 		dbp->set_bt_compare(dbp, btreeCompareIntKey);
3309 
3310 	if (pBt->pageSize != 0 &&
3311 	    (ret = dbp->set_pagesize(dbp, pBt->pageSize)) != 0)
3312 		goto err;
3313 	if (pBt->dbStorage == DB_STORE_INMEM) {
3314 		/* Make sure the cache does not overflow to disk. */
3315 		pMpf = dbp->get_mpf(dbp);
3316 		pMpf->set_flags(pMpf, DB_MPOOL_NOFILE, 1);
3317 	}
3318 	if (!GET_DURABLE(pBt) &&
3319 	    (ret = dbp->set_flags(dbp, DB_TXN_NOT_DURABLE)) != 0)
3320 		goto err;
3321 	if (pBt->encrypted && (ret = dbp->set_flags(dbp, DB_ENCRYPT)) != 0)
3322 		goto err;
3323 err:	if (ret != 0) {
3324 #ifndef BDBSQL_SINGLE_THREAD
3325 		if (tableInfo != NULL)
3326 			sqlite3_free(tableInfo);
3327 #endif
3328 		if (dbp != NULL)
3329 			(void)closeDB(p, dbp, DB_NOSYNC);
3330 		*dbpp = NULL;
3331 	} else {
3332 		*dbpp = dbp;
3333 	}
3334 	return (ret);
3335 }
3336 
/*
 * Look up the cached database handle for a table, opening (and caching) a
 * handle when none is cached yet.
 *
 *	p		The btree.
 *	piTable		In: table number.  With BTREE_CREATE set in flags, a
 *			new table number is chosen and written back on
 *			success.
 *	ppCachedDb	Out: the cache entry for the table, on success.
 *	flags		BTREE_CREATE and/or BTREE_INTKEY.
 *
 * Returns SQLITE_OK or an SQLite error code.  The BtShared mutex is taken
 * and released internally; it is never held on return.
 */
int btreeFindOrCreateDataTable(
    Btree *p,			/* The btree */
    int *piTable,			/* Root page of table to create */
    CACHED_DB **ppCachedDb,
    int flags)
{
	BtShared *pBt;
	CACHED_DB *cached_db, *create_db;
	DB *dbp;
	char cached_db_key[CACHE_KEY_SIZE];
	int iTable, rc, ret;

	pBt = p->pBt;
	rc = SQLITE_OK;
	ret = 0;
	cached_db = *ppCachedDb;
	create_db = NULL;

	iTable = *piTable;
	sqlite3_mutex_enter(pBt->mutex);

	if (flags & BTREE_CREATE) {
		/* Pick the next table number of the right parity. */
		if (pBt->dbStorage != DB_STORE_NAMED)
			iTable = pBt->last_table;

		iTable++;

		/* Make sure (iTable & 1) iff BTREE_INTKEY is set */
		if ((flags & BTREE_INTKEY) != 0) {
			if ((iTable & 1) == 0)
				iTable += 1;
		} else if ((iTable & 1) == 1)
			iTable += 1;
		pBt->last_table = iTable;
	}

	/* Cache entries are keyed by the table number in hex. */
	sqlite3_snprintf(sizeof(cached_db_key), cached_db_key, "%x", iTable);
	cached_db = sqlite3HashFind(&pBt->db_cache,
	    cached_db_key, (int)strlen(cached_db_key));
	if ((flags & BTREE_CREATE) && cached_db != NULL) {
		/*
		 * If the table already exists in the cache, it's a
		 * hang-over from a table that was deleted in another
		 * process. Close the handle now.
		 */
		if ((dbp = cached_db->dbp) != NULL) {
#ifndef BDBSQL_SINGLE_THREAD
			if (dbp->app_private != NULL)
				sqlite3_free(dbp->app_private);
#endif
			ret = closeDB(p, dbp, DB_NOSYNC);
			cached_db->dbp = NULL;
			if (ret != 0) {
				/* The err path does not release the mutex. */
				sqlite3_mutex_leave(pBt->mutex);
				goto err;
			}
		}
		/* Inserting NULL removes the stale entry from the hash. */
		sqlite3HashInsert(&pBt->db_cache,
		    cached_db_key, (int)strlen(cached_db_key), NULL);
		sqlite3_free(cached_db);
		cached_db = NULL;
	}
	if (cached_db == NULL || cached_db->dbp == NULL) {
		/*
		 * Open the database without holding the mutex, then retake
		 * the mutex and re-check the cache before publishing the new
		 * handle, since the cache may have changed in the meantime.
		 */
		sqlite3_mutex_leave(pBt->mutex);
		if ((create_db = (CACHED_DB *)sqlite3_malloc(
		    sizeof(CACHED_DB))) == NULL)
		{
			ret = ENOMEM;
			goto err;
		}
		memset(create_db, 0, sizeof(CACHED_DB));
		/* Entry with NULL dbp: open the handle but do not cache it. */
		rc = btreeCreateDataTable(p, iTable, &create_db);
		if (rc != SQLITE_OK)
			goto err;
		sqlite3_mutex_enter(pBt->mutex);
		cached_db = sqlite3HashFind(&pBt->db_cache,
		    cached_db_key, (int)strlen(cached_db_key));
		/* if its not there, then insert it. */
		if (cached_db == NULL) {
			/* Entry with open dbp: put it in the cache. */
			rc = btreeCreateDataTable(p, iTable, &create_db);
			sqlite3_mutex_leave(pBt->mutex);
			cached_db = create_db;
			create_db = NULL;
		} else {
			if (cached_db->dbp == NULL) {
				cached_db->dbp = create_db->dbp;
				create_db->dbp = NULL;
			}
			sqlite3_mutex_leave(pBt->mutex);
			if (create_db->dbp != NULL) {
				/*
				 * The cached entry already has a handle, so
				 * ours is redundant.  Free its comparator
				 * state before closing it, as the other close
				 * sites in this file do.
				 */
#ifndef BDBSQL_SINGLE_THREAD
				if (create_db->dbp->app_private != NULL)
					sqlite3_free(
					    create_db->dbp->app_private);
#endif
				ret = create_db->dbp->close(
				     create_db->dbp, DB_NOSYNC);
				create_db->dbp = NULL;
			}
			if (ret != 0)
				goto err;
		}
		if (rc != SQLITE_OK)
			goto err;
	} else
		sqlite3_mutex_leave(pBt->mutex);

	*ppCachedDb = cached_db;
	*piTable = iTable;
err:
	/* Reached on success too; a Berkeley DB error overrides rc. */
	if (ret != 0)
		rc = dberr2sqlite(ret, p);
	if (create_db != NULL)
		sqlite3_free(create_db);
	return (rc);
}
3444 
3445 /*
3446  * A utility function to create the table containing the actual data.
3447  * There are 3 modes:
3448  *	1) *ppCacheDb == NULL -> create/open the db and put it in the cache.
3449  *	2) *ppCacheDb != NULL && (*ppCacheDb)->dbp == NULL ->
3450  *				create/open the db but don't cache.
3451  *	3) *ppCacheDb != NULL && (*ppCacheDb)->dbp != NULL ->
3452  *				Put the db in the cache.
3453  */
btreeCreateDataTable(Btree * p,int iTable,CACHED_DB ** ppCachedDb)3454 static int btreeCreateDataTable(
3455     Btree *p,			/* The btree */
3456     int iTable,			/* Root page of table to create */
3457     CACHED_DB **ppCachedDb)
3458 {
3459 	BtShared *pBt;
3460 	CACHED_DB *cached_db, *stale_db;
3461 	DB *dbp;
3462 #ifdef BDBSQL_FILE_PER_TABLE
3463 	DBT d, k;
3464 #endif
3465 	char *fileName, *tableName, tableNameBuf[DBNAME_SIZE];
3466 	int ret, t_ret;
3467 
3468 	log_msg(LOG_VERBOSE, "sqlite3BtreeCreateDataTable(%p, %u, %p)",
3469 	    p, iTable, ppCachedDb);
3470 
3471 	pBt = p->pBt;
3472 	assert(!pBt->resultsBuffer);
3473 
3474 	dbp = NULL;
3475 	assert(ppCachedDb != NULL);
3476 	cached_db = *ppCachedDb;
3477 
3478 	tableName = tableNameBuf;
3479 	GET_TABLENAME(tableName, sizeof(tableNameBuf), iTable, "");
3480 	log_msg(LOG_VERBOSE,
3481 	    "sqlite3BtreeCursor creating the actual DB: file name:"
3482 	    "%s, table name: %s type: %u.",
3483 	    pBt->full_name, tableName, pBt->dbStorage);
3484 
3485 	FIX_TABLENAME(pBt, fileName, tableName);
3486 	if (cached_db != NULL && cached_db->dbp != NULL) {
3487 		dbp = cached_db->dbp;
3488 		cached_db->dbp = NULL;
3489 		goto insert_db;
3490 	}
3491 
3492 	/*
3493 	 * First try without DB_CREATE, in auto-commit mode, so the
3494 	 * handle can be safely shared in the cache.  If we are really
3495 	 * creating the table, we should be holding the schema lock,
3496 	 * which will protect the handle in cache until we are done.
3497 	 */
3498 	if ((ret = btreeConfigureDbHandle(p, iTable, &dbp)) != 0)
3499 		goto err;
3500 	ret = ENOENT;
3501 	if (pBt->dbStorage == DB_STORE_NAMED &&
3502 	    (pBt->db_oflags & DB_CREATE) != 0) {
3503 		ret = dbp->open(dbp, pFamilyTxn, fileName, tableName, DB_BTREE,
3504 		    (pBt->db_oflags & ~DB_CREATE) | GET_ENV_READONLY(pBt) |
3505 		    GET_AUTO_COMMIT(pBt, pFamilyTxn), 0);
3506 		/* Close and re-configure handle. */
3507 		if (ret == ENOENT) {
3508 #ifndef BDBSQL_SINGLE_THREAD
3509 			if (dbp->app_private != NULL)
3510 				sqlite3_free(dbp->app_private);
3511 #endif
3512 			if ((t_ret = dbp->close(dbp, DB_NOSYNC)) != 0) {
3513 				ret = t_ret;
3514 				goto err;
3515 			}
3516 			if ((t_ret =
3517 			    btreeConfigureDbHandle(p, iTable, &dbp)) != 0) {
3518 				ret = t_ret;
3519 				goto err;
3520 			}
3521 		}
3522 	}
3523 	if (ret == ENOENT) {
3524 		/*
3525 		 * Indices in files should be configured with DB_DUPSORT.
3526 		 * Only do this once we are sure we are creating the database
3527 		 * so that we can open v5.0 database files without error.
3528 		 */
3529 		if (pBt->dbStorage == DB_STORE_NAMED && (iTable & 1) == 0)
3530 			dbp->set_flags(dbp, DB_DUPSORT);
3531 
3532 		ret = dbp->open(dbp, pSavepointTxn, fileName, tableName,
3533 		    DB_BTREE, pBt->db_oflags | GET_ENV_READONLY(pBt) |
3534 		    GET_AUTO_COMMIT(pBt, pSavepointTxn), 0);
3535 #ifdef BDBSQL_FILE_PER_TABLE
3536 		if (ret == 0 && pBt->dbStorage == DB_STORE_NAMED) {
3537 			memset(&k, 0, sizeof(k));
3538 			memset(&d, 0, sizeof(d));
3539 			k.data = fileName;
3540 			k.size = strlen(fileName);
3541 			if ((t_ret = pTablesDb->put(
3542 			    pTablesDb, pSavepointTxn, &k, &d, 0)) != 0)
3543 				ret = t_ret;
3544 		}
3545 #endif
3546 	}
3547 	if (ret != 0)
3548 		goto err;
3549 
3550 	if (cached_db == NULL) {
3551 		if ((cached_db = (CACHED_DB *)sqlite3_malloc(
3552 		    sizeof(CACHED_DB))) == NULL)
3553 		{
3554 			ret = ENOMEM;
3555 			goto err;
3556 		}
3557 		memset(cached_db, 0, sizeof(CACHED_DB));
3558 insert_db:
3559 		sqlite3_snprintf(sizeof(cached_db->key),
3560 		    cached_db->key, "%x", iTable);
3561 
3562 		assert(sqlite3_mutex_held(pBt->mutex));
3563 		stale_db = sqlite3HashInsert(&pBt->db_cache, cached_db->key,
3564 		    (int)strlen(cached_db->key), cached_db);
3565 		if (stale_db) {
3566 			sqlite3_free(stale_db);
3567 			/*
3568 			 * Hash table out of memory when returned pointer is
3569 			 * same as the original value pointer.
3570 			 */
3571 			if (stale_db == cached_db) {
3572 				ret = ENOMEM;
3573 				goto err;
3574 			}
3575 		}
3576 	}
3577 
3578 	assert(cached_db->dbp == NULL);
3579 	cached_db->dbp = dbp;
3580 	cached_db->created = 1;
3581 	*ppCachedDb = cached_db;
3582 	return SQLITE_OK;
3583 
3584 err:	if (dbp != NULL) {
3585 #ifndef BDBSQL_SINGLE_THREAD
3586 		if (dbp->app_private != NULL)
3587 			sqlite3_free(dbp->app_private);
3588 #endif
3589 		(void)dbp->close(dbp, DB_NOSYNC);
3590 		dbp = NULL;
3591 	}
3592 	return (ret == 0) ? SQLITE_OK : dberr2sqlite(ret, p);
3593 }
3594 
3595 /*
3596  * Only persisent uncollated indexes use the 1 key, duplicate
3597  * data structure, because the space saving is not worth the
3598  * overhead in temperary indexes, and collated (other than binary
3599  * collation) indexes lose data because different values can be
3600  * stored under the same key if the collation reads them as
3601  * identical.
3602  */
isDupIndex(int flags,int storage,KeyInfo * keyInfo,DB * db)3603 int isDupIndex(int flags, int storage, KeyInfo *keyInfo, DB *db)
3604 {
3605 	return (!(flags & BTREE_INTKEY) && (storage == DB_STORE_NAMED) &&
3606 	    !indexIsCollated(keyInfo) && supportsDuplicates(db));
3607 }
3608 
3609 /*
3610 ** Create a new cursor for the BTree whose root is on the page iTable. The act
3611 ** of acquiring a cursor gets a read lock on the database file.
3612 **
3613 ** If wrFlag==0, then the cursor can only be used for reading.
3614 ** If wrFlag==1, then the cursor can be used for reading or for writing if
3615 ** other conditions for writing are also met.  These are the conditions that
3616 ** must be met in order for writing to be allowed:
3617 **
3618 ** 1:  The cursor must have been opened with wrFlag==1
3619 **
3620 ** 2:  No other cursors may be open with wrFlag==0 on the same table
3621 **
3622 ** 3:  The database must be writable (not on read-only media)
3623 **
3624 ** 4:  There must be an active transaction.
3625 **
3626 ** Condition 2 warrants further discussion.  If any cursor is opened on a table
3627 ** with wrFlag==0, that prevents all other cursors from writing to that table.
3628 ** This is a kind of "read-lock".  When a cursor is opened with wrFlag==0
3629 ** it is guaranteed that the table will not change as long as the cursor
3630 ** is open.  This allows the cursor to do a sequential scan of the table
3631 ** without having to worry about entries being inserted or deleted during the
3632 ** scan.  Cursors should be opened with wrFlag==0 only if this read-lock
3633 ** property is needed. That is to say, cursors should be opened with
3634 ** wrFlag==0 only if they intend to use sqlite3BtreeNext() system call.
3635 ** All other cursors should be opened with wrFlag==1 even if they never really
3636 ** intend to write.
3637 **
3638 ** No checking is done to make sure that page iTable really is the root page
3639 ** of a b-tree.  If it is not, then the cursor acquired will not work
3640 ** correctly.
3641 **
3642 ** The comparison function must be logically the same for every cursor on a
3643 ** particular table.  Changing the comparison function will result in
3644 ** incorrect operations.  If the comparison function is NULL, a default
3645 ** comparison function is used.  The comparison function is always ignored
3646 ** for INTKEY tables.
3647 */
int sqlite3BtreeCursor(
    Btree *p,			/* The btree */
    int iTable,			/* Root page of table to open */
    int wrFlag,			/* 1 to write. 0 read-only */
    struct KeyInfo *keyInfo,	/* First argument to compare function */
    BtCursor *pCur)			/* Write new cursor here */
{
	BtShared *pBt;
	CACHED_DB *cached_db;
	int rc, ret;

	log_msg(LOG_VERBOSE, "sqlite3BtreeCursor(%p, %u, %u, %p, %p)",
	    p, iTable, wrFlag, keyInfo, pCur);

	/*
	 * Return value summary: SQLITE_EMPTY and SQLITE_READONLY are returned
	 * directly.  Every failure routed through the err label is stored in
	 * pCur->error with the cursor left in CURSOR_FAULT state, and the
	 * function itself still returns SQLITE_OK.
	 */
	pBt = p->pBt;
	rc = SQLITE_OK;
	ret = 0;
	cached_db = NULL;
	/* Cleared first so the err path knows whether there is an ID to free. */
	pCur->threadID = NULL;

	if (!p->connected) {
		if ((rc = btreeUpdateBtShared(p, 1)) != SQLITE_OK)
			goto err;
		/*
		 * NOTE(review): re-read presumably because btreeUpdateBtShared
		 * may replace p->pBt -- confirm.
		 */
		pBt = p->pBt;
		/*
		 * If the table is temporary, vdbe expects the table to be
		 * created automatically when the first cursor is opened.
		 * Otherwise, if the database does not exist yet, the caller
		 * expects a SQLITE_EMPTY return, vdbe will then call
		 * sqlite3BtreeCreateTable directly.
		 * If the code created the temporary environment the first time
		 * sqlite3BtreeOpen is called, it would not be possible to
		 * honor cache size setting pragmas.
		 */
		if (pBt->need_open &&
		    (rc = btreeOpenEnvironment(p, 1)) != SQLITE_OK)
			goto err;
		else if (pBt->dbStorage == DB_STORE_NAMED && !pBt->env_opened &&
		    !__os_exists(NULL, pBt->full_name, 0)) {
			/*
			 * The file didn't exist when sqlite3BtreeOpen was
			 * called, but has since been created. Open the
			 * existing database now.
			 * Don't fold the open into the if clause, since this
			 * situation can match following statements as well.
			 */
			if ((rc = btreeOpenEnvironment(p, 1)) != SQLITE_OK)
				goto err;
		} else if (pBt->dbStorage != DB_STORE_TMP &&
		    !wrFlag && !pBt->env_opened)
			/* Read-only cursor and no environment opened yet. */
			return SQLITE_EMPTY;
		else if (!pBt->resultsBuffer &&
		    (rc = btreeOpenEnvironment(p, 1)) != SQLITE_OK)
			goto err;
	}

	/* A write cursor cannot be opened on a read-only btree. */
	if (wrFlag && IS_BTREE_READONLY(p))
		return SQLITE_READONLY;

	assert(p->connected || pBt->resultsBuffer);
	assert(!pBt->transactional || p->inTrans != TRANS_NONE);

	/*
	 * A NULL thread ID is treated as an error only when the connection
	 * also reports a failed allocation.
	 */
	pCur->threadID = getThreadID(p->db);
	if (pCur->threadID == NULL && p->db->mallocFailed) {
		rc = SQLITE_NOMEM;
		goto err;
	}

	pCur->pBtree = p;
	pCur->tableIndex = iTable;

	/* SQLite should guarantee that an appropriate transaction is active. */
	assert(!pBt->transactional || pMainTxn != NULL);
	assert(!pBt->transactional || !wrFlag || pSavepointTxn != NULL);

	/*
	 * Always use the savepoint transaction for write cursors, or the
	 * top-level cursor for read-only cursors (to avoid tripping and
	 * re-opening the read cursor for updates within a select).
	 */
	pCur->txn = wrFlag ? pSavepointTxn : pReadTxn;

	/*
	 * With a results buffer the database handle lookup and the Berkeley
	 * DB cursor creation below are skipped entirely.
	 */
	if (pBt->resultsBuffer)
		goto setup_cursor;

	/* Retrieve the matching handle from the cache. */
	rc = btreeFindOrCreateDataTable(p, &iTable, &cached_db, 0);
	if (rc != SQLITE_OK)
		goto err;
	assert(cached_db != NULL && cached_db->dbp != NULL);

	pCur->cached_db = cached_db;

	/* DB_READ_COMMITTED is masked out of the isolation flags. */
	ret = pBDb->cursor(pBDb, pCur->txn, &pDbc,
	    GET_BTREE_ISOLATION(p) & ~DB_READ_COMMITTED);
	if (ret != 0) {
		rc = dberr2sqlite(ret, p);
		goto err;
	}

	if (!wrFlag) {
		/*
		 * The sqlite btree API doesn't care about the position of
		 * cursors on error.  Setting this flag avoids cursor
		 * duplication inside Berkeley DB.  We can only do it for
		 * read-only cursors, however: deletes don't complete until the
		 * cursor is closed.
		 */
		pDbc->flags |= DBC_TRANSIENT;
	}

setup_cursor:
	/* Odd table numbers are integer-key tables. */
	pCur->flags = (iTable & 1) ? BTREE_INTKEY : 0;
	pCur->keyInfo = keyInfo;
	pCur->skipMulti = 1;
	pCur->multiData.data = NULL;
	pCur->wrFlag = wrFlag;
	pCur->eState = CURSOR_INVALID;
	pCur->lastRes = 0;
	/* isDupIndex is only computed when a cached handle is attached. */
	if (pCur->cached_db)
		pCur->isDupIndex = isDupIndex(pCur->flags,
		    pCur->pBtree->pBt->dbStorage, pCur->keyInfo,
		    pCur->cached_db->dbp);

#ifdef BDBSQL_SINGLE_THREAD
	/* Single-threaded builds keep the KeyInfo directly on the handle. */
	if (cached_db != NULL)
		pBDb->app_private = keyInfo;
#endif

	/* Link the cursor at the head of the shared cursor list. */
	sqlite3_mutex_enter(pBt->mutex);
	assert(pCur != pBt->first_cursor);
	pCur->next = pBt->first_cursor;
	pBt->first_cursor = pCur;
	sqlite3_mutex_leave(pBt->mutex);
	return SQLITE_OK;

	/*
	 * Failure: release whatever was acquired, then record the error in
	 * the cursor.  Note that SQLITE_OK is returned regardless of rc.
	 */
err:	if (pDbc != NULL) {
		(void)pDbc->close(pDbc);
		pDbc = NULL;
	}
	if (pCur->threadID != NULL) {
		sqlite3DbFree(p->db, pCur->threadID);
		pCur->threadID = NULL;
	}
	pCur->eState = CURSOR_FAULT;
	pCur->error = rc;
	return SQLITE_OK;
}
3796 
3797 /*
3798 ** Return the size of a BtCursor object in bytes.
3799 **
3800 ** This interfaces is needed so that users of cursors can preallocate
3801 ** sufficient storage to hold a cursor.  The BtCursor object is opaque
3802 ** to users so they cannot do the sizeof() themselves - they must call
3803 ** this routine.
3804 */
sqlite3BtreeCursorSize(void)3805 int sqlite3BtreeCursorSize(void)
3806 {
3807 	return (sizeof(BtCursor));
3808 }
3809 
3810 /*
3811 ** Initialize memory that will be converted into a BtCursor object.
3812 **
3813 ** The simple approach here would be to memset() the entire object
3814 ** to zero.  But if there are large parts that can be skipped, do
3815 ** that here to save time.
3816 */
sqlite3BtreeCursorZero(BtCursor * pCur)3817 void sqlite3BtreeCursorZero(BtCursor *pCur)
3818 {
3819 	memset(pCur, 0, sizeof(BtCursor));
3820 	pCur->index.data = pCur->indexKeyBuf;
3821 	pCur->index.ulen = CURSOR_BUFSIZE;
3822 	pCur->index.flags = DB_DBT_USERMEM;
3823 }
3824 
/*
 * Release everything a cursor holds: its Berkeley DB cursor, the heap
 * buffers attached to it, a dedicated incrblob transaction and its thread ID.
 *
 * pCur       - cursor to close; pCur->pBtree must be non-NULL.
 * listRemove - when non-zero, also unlink pCur from the BtShared cursor list.
 *
 * Returns dberr2sqlite() of the first Berkeley DB failure seen (closing the
 * DBC, then committing the incrblob transaction), or of 0 if both succeed.
 */
static int btreeCloseCursor(BtCursor *pCur, int listRemove)
{
	BtCursor *c, *prev;
	Btree *p;
	BtShared *pBt;
	int ret, t_ret;

	assert(pCur->pBtree != NULL);
	p = pCur->pBtree;
	pBt = p->pBt;
	ret = 0;

	/*
	 * Change the cursor's state to invalid before closing it, and do
	 * so holding the BtShared mutex, so that no other thread will attempt
	 * to access this cursor while it is being closed.
	 */
	sqlite3_mutex_enter(pBt->mutex);
	pCur->eState = CURSOR_FAULT;
	pCur->error = SQLITE_ABORT;
	sqlite3_mutex_leave(pBt->mutex);

	/*
	 * Warning: it is important that we call DBC->close while the cursor
	 * is still on the list.  It is possible that closing a cursor will
	 * result in the comparison callback being called, which in turn
	 * may go looking on the list for a matching cursor, in order to find
	 * a KeyInfo pointer it can use.
	 */
	if (pDbc) {
		ret = pDbc->close(pDbc);
		pDbc = NULL;
	}

	if (listRemove) {
		sqlite3_mutex_enter(pBt->mutex);
		for (prev = NULL, c = pBt->first_cursor; c != NULL;
		    prev = c, c = c->next)
			if (c == pCur) {
				if (prev == NULL)
					pBt->first_cursor = c->next;
				else
					prev->next = c->next;
				break;
			}
		sqlite3_mutex_leave(pBt->mutex);
	}

	/* Free the buffers owned by the cursor, clearing each pointer. */
	if ((pCur->key.flags & DB_DBT_APPMALLOC) != 0) {
		sqlite3_free(pCur->key.data);
		pCur->key.data = NULL;
		pCur->key.flags &= ~DB_DBT_APPMALLOC;
	}
	if (pCur->multiData.data != NULL) {
		sqlite3_free(pCur->multiData.data);
		pCur->multiData.data = NULL;
	}
	/* indexKeyBuf is part of the cursor itself and must not be freed. */
	if (pCur->index.data != pCur->indexKeyBuf) {
		sqlite3_free(pCur->index.data);
		pCur->index.data = NULL;
	}

	/* Incrblob write cursors have their own dedicated transactions. */
	if (pCur->isIncrblobHandle && pCur->txn && pCur->wrFlag &&
	    pSavepointTxn != NULL && pCur->txn != pSavepointTxn) {
		/* Keep an earlier DBC->close failure; do not overwrite it. */
		t_ret = pCur->txn->commit(pCur->txn, DB_TXN_NOSYNC);
		if (ret == 0)
			ret = t_ret;
		pCur->txn = 0;
	}

	/*
	 * Clear the pointer after freeing it: the cursor object can be
	 * reopened after a close (btreeLoadBufferIntoTable does so), and the
	 * error path of the open routine frees any non-NULL threadID.
	 */
	sqlite3DbFree(p->db, pCur->threadID);
	pCur->threadID = NULL;

	ret = dberr2sqlite(ret, p);
	pCur->pBtree = NULL;
	return ret;
}
3900 
3901 /*
3902 ** Close a cursor.
3903 */
sqlite3BtreeCloseCursor(BtCursor * pCur)3904 int sqlite3BtreeCloseCursor(BtCursor *pCur)
3905 {
3906 	log_msg(LOG_VERBOSE, "sqlite3BtreeCloseCursor(%p)", pCur);
3907 
3908 	if (!pCur || !pCur->pBtree)
3909 		return SQLITE_OK;
3910 
3911 	return btreeCloseCursor(pCur, 1);
3912 }
3913 
/*
 * Report whether any field described by keyInfo has a collation whose type
 * is not SQLITE_COLL_BINARY.
 *
 * Returns 1 if such a field exists, 0 otherwise (including when keyInfo is
 * NULL or a field has no collation set).
 */
int indexIsCollated(KeyInfo *keyInfo)
{
	u32 field;

	if (keyInfo == NULL)
		return 0;

	for (field = 0; field < keyInfo->nField; field++) {
		if (keyInfo->aColl[field] == NULL)
			continue;
		if (keyInfo->aColl[field]->type != SQLITE_COLL_BINARY)
			return 1;
	}
	return 0;
}
3928 
3929 /* Indexes created before 5.1 do not support duplicates.*/
supportsDuplicates(DB * db)3930 int supportsDuplicates(DB *db)
3931 {
3932 	u_int32_t val;
3933 	db->get_flags(db, &val);
3934 	return (val & DB_DUPSORT);
3935 }
3936 
3937 /* Store the rowid in the index as data
3938  * instead of as part of the key, so rows
3939  * that have the same indexed value have only one
3940  * key in the index.
3941  * The original index key looks like:
3942  * hdrSize_column1Size_columnNSize_rowIdSize_column1Data_columnNData_rowid
3943  * The new index key looks like:
3944  * hdrSize_column1Size_columnNSize_column1Data_columnNData
3945  * With a data section that looks like:
3946  * rowIdSize_rowid
3947  */
splitIndexKey(BtCursor * pCur)3948 int splitIndexKey(BtCursor *pCur)
3949 {
3950 	u32 hdrSize, rowidType;
3951 	unsigned char *aKey = (unsigned char *)pCur->key.data;
3952 	assert(pCur->isDupIndex);
3953 	getVarint32(aKey, hdrSize);
3954 	getVarint32(&aKey[hdrSize-1], rowidType);
3955 	pCur->data.size = sqlite3VdbeSerialTypeLen(rowidType) + 1;
3956 	pCur->key.size = pCur->key.size - pCur->data.size;
3957 	memmove(&aKey[hdrSize-1], &aKey[hdrSize], pCur->key.size-(hdrSize-1));
3958 	putVarint32(&aKey[pCur->key.size], rowidType);
3959 	putVarint32(aKey, hdrSize-1);
3960 	pCur->data.data = &aKey[pCur->key.size];
3961 	return 0;
3962 }
3963 
3964 /* Move the cursor so that it points to an entry near pUnKey/nKey.
3965 ** Return a success code.
3966 **
3967 ** For INTKEY tables, only the nKey parameter is used.  pUnKey is ignored. For
3968 ** other tables, nKey is the number of bytes of data in nKey. The comparison
3969 ** function specified when the cursor was created is used to compare keys.
3970 **
3971 ** If an exact match is not found, then the cursor is always left pointing at
3972 ** a leaf page which would hold the entry if it were present. The cursor
3973 ** might point to an entry that comes before or after the key.
3974 **
3975 ** The result of comparing the key with the entry to which the cursor is
3976 ** written to *pRes if pRes!=NULL.  The meaning of this value is as follows:
3977 **
3978 **     *pRes<0      The cursor is left pointing at an entry that is smaller
3979 **                  than pUnKey or if the table is empty and the cursor is
3980 **                  therefore left point to nothing.
3981 **
3982 **     *pRes==0     The cursor is left pointing at an entry that exactly
3983 **                  matches pUnKey.
3984 **
3985 **     *pRes>0      The cursor is left pointing at an entry that is larger
3986 **                  than pUnKey.
3987 */
sqlite3BtreeMovetoUnpacked(BtCursor * pCur,UnpackedRecord * pUnKey,i64 nKey,int bias,int * pRes)3988 int sqlite3BtreeMovetoUnpacked(
3989     BtCursor *pCur, UnpackedRecord *pUnKey, i64 nKey, int bias, int *pRes)
3990 {
3991 	int rc, res, ret;
3992 	unsigned char buf[ROWIDMAXSIZE];
3993 
3994 	log_msg(LOG_VERBOSE, "sqlite3BtreeMovetoUnpacked(%p, %p, %u, %u, %p)",
3995 	    pCur, pUnKey, (int)nKey, bias, pRes);
3996 
3997 	res = -1;
3998 	ret = DB_NOTFOUND;
3999 
4000 	/* Invalidate current cursor state. */
4001 	if (pDbc == NULL &&
4002 	    (rc = btreeRestoreCursorPosition(pCur, 1)) != SQLITE_OK)
4003 		return rc;
4004 
4005 	if (pCur->eState == CURSOR_VALID &&
4006 	    pIntKey && pCur->savedIntKey == nKey) {
4007 		*pRes = 0;
4008 		return SQLITE_OK;
4009 	}
4010 
4011 	pCur->multiGetPtr = pCur->multiPutPtr = NULL;
4012 	pCur->isFirst = 0;
4013 	memset(&pCur->key, 0, sizeof(pCur->key));
4014 	memset(&pCur->data, 0, sizeof(pCur->data));
4015 	pCur->skipMulti = 1;
4016 
4017 	if (pIntKey) {
4018 		pCur->key.size = sizeof(i64);
4019 		pCur->nKey = nKey;
4020 		pCur->key.data = &(pCur->nKey);
4021 
4022 		if (pCur->lastKey != 0 && nKey > pCur->lastKey) {
4023 			pCur->eState = CURSOR_INVALID;
4024 			ret = 0;
4025 			goto done;
4026 		}
4027 	} else {
4028 		assert(pUnKey != NULL);
4029 		pCur->key.app_data = pUnKey;
4030 		/*
4031 		 * If looking for an entry in an index with duplicates then the
4032 		 * rowid part of the key needs to be put in the data DBT.
4033 		 */
4034 		if (pCur->isDupIndex &&
4035 		    (pUnKey->nField > pCur->keyInfo->nField)) {
4036 			u8 serial_type;
4037 			Mem *rowid = &pUnKey->aMem[pUnKey->nField - 1];
4038 			int file_format =
4039 			    pCur->pBtree->db->pVdbe->minWriteFileFormat;
4040 			serial_type = sqlite3VdbeSerialType(rowid, file_format);
4041 			pCur->data.size =
4042 			    sqlite3VdbeSerialTypeLen(serial_type) + 1;
4043 			assert(pCur->data.size < ROWIDMAXSIZE);
4044 			pCur->data.data = &buf;
4045 			putVarint32(buf, serial_type);
4046 			sqlite3VdbeSerialPut(&buf[1], ROWIDMAXSIZE - 1,
4047 			    rowid, file_format);
4048 			ret = pDbc->get(pDbc, &pCur->key, &pCur->data,
4049 			    DB_GET_BOTH_RANGE | RMW(pCur));
4050 		/*
4051 		 * If not looking for a specific key in the index (just
4052 		 * looking at the value part of the key) then do a
4053 		 * bulk get since the search likely wants all
4054 		 * entries that have that value.
4055 		 */
4056 		} else if (!pCur->isDupIndex ||
4057 		    (pUnKey->nField < pCur->keyInfo->nField))
4058 			pCur->skipMulti = 0;
4059 	}
4060 
4061 	if (ret == DB_NOTFOUND)
4062 		ret = pDbc->get(pDbc, &pCur->key, &pCur->data,
4063 		    DB_SET_RANGE | RMW(pCur));
4064 
4065 	if (ret == DB_NOTFOUND) {
4066 		ret = pDbc->get(pDbc,
4067 		    &pCur->key, &pCur->data, DB_LAST | RMW(pCur));
4068 
4069 		if (ret == 0 && pIntKey)
4070 			memcpy(&(pCur->lastKey), pCur->key.data, sizeof(i64));
4071 	}
4072 
4073 	if (ret == 0) {
4074 		pCur->eState = CURSOR_VALID;
4075 		/* Check whether we got an exact match. */
4076 		if (pIntKey) {
4077 			memcpy(&(pCur->savedIntKey), pCur->key.data,
4078 			    sizeof(i64));
4079 			res = (pCur->savedIntKey == nKey) ?
4080 			    0 : (pCur->savedIntKey < nKey) ? -1 : 1;
4081 		} else {
4082 			DBT target, index;
4083 			memset(&target, 0, sizeof(target));
4084 			memset(&index, 0, sizeof(index));
4085 			target.app_data = pUnKey;
4086 			/* paranoia */
4087 			pCur->key.app_data = NULL;
4088 			if (pCur->isDupIndex) {
4089 				btreeCreateIndexKey(pCur);
4090 				index = pCur->index;
4091 			} else
4092 				index = pCur->key;
4093 			if (index.data) {
4094 #ifdef BDBSQL_SINGLE_THREAD
4095 				res = btreeCompareKeyInfo(
4096 				    pBDb, &index, &target);
4097 #else
4098 				res = btreeCompareShared(pBDb, &index, &target);
4099 #endif
4100 			} else {
4101 				ret = ENOMEM;
4102 				pCur->eState = CURSOR_FAULT;
4103 				pCur->error = ret;
4104 			}
4105 		}
4106 	} else if (ret == DB_NOTFOUND) {
4107 		/* The table is empty. */
4108 		log_msg(LOG_VERBOSE, "sqlite3BtreeMoveto the table is empty.");
4109 		ret = 0;
4110 		pCur->eState = CURSOR_INVALID;
4111 		pCur->lastKey = -1;
4112 	} else {
4113 		pCur->eState = CURSOR_FAULT;
4114 		pCur->error = ret;
4115 	}
4116 
4117 done:	if (pRes != NULL)
4118 		*pRes = res;
4119 	HANDLE_INCRBLOB_DEADLOCK(ret, pCur)
4120 		return (ret == 0) ? SQLITE_OK : dberr2sqlitelocked(ret, pCur->pBtree);
4121 }
4122 
/*
 * Position pCur near the packed record pKey (nKey bytes).
 *
 * The record is unpacked once and handed to sqlite3BtreeMovetoUnpacked(),
 * which stores the comparison outcome in *pRes.  bias is passed through
 * unchanged.
 *
 * Returns SQLITE_NOMEM if the record cannot be unpacked, otherwise the
 * result of sqlite3BtreeMovetoUnpacked().
 */
int btreeMoveto(BtCursor *pCur, const void *pKey, i64 nKey, int bias, int *pRes)
{
	UnpackedRecord *p;
	char aSpace[150];
	int res;

	/*
	 * Cache an unpacked key in the DBT so we don't have to unpack
	 * it on every comparison.
	 */
	p = sqlite3VdbeRecordUnpack(pCur->keyInfo, (int)nKey, pKey, aSpace,
	    sizeof(aSpace));
	/*
	 * The search below dereferences the unpacked record, so a failed
	 * unpack must stop here.  (Presumably NULL means the unpacker could
	 * not obtain memory when aSpace was too small -- this mirrors the
	 * check in SQLite's own btreeMoveto.)
	 */
	if (p == NULL)
		return SQLITE_NOMEM;

	res = sqlite3BtreeMovetoUnpacked(pCur, p, nKey, bias, pRes);

	sqlite3VdbeDeleteUnpackedRecord(p);
	/* The unpacked record is gone; don't leave a pointer to it behind. */
	pCur->key.app_data = NULL;

	return res;
}
4143 
/*
 * "Trip" a cursor: close its Berkeley DB cursor and, if it was positioned,
 * mark it so the position is re-established later.
 *
 * pCur           - cursor to trip; the caller holds the BtShared mutex.
 * incrBlobUpdate - when zero, a valid incrblob cursor becomes CURSOR_INVALID
 *                  instead of CURSOR_REQUIRESEEK.
 *
 * Returns SQLITE_OK, SQLITE_NOMEM if the key copy cannot be allocated (the
 * cursor is then left untouched and still open), or the SQLite mapping of
 * the error from closing the Berkeley DB cursor.
 */
static int btreeTripCursor(BtCursor *pCur, int incrBlobUpdate)
{
	DBC *dbc;
	int ret;
	void *keyCopy;

	/*
	 * This is protected by the BtShared mutex so that other threads won't
	 * attempt to access the cursor in btreeTripWatchers while we are
	 * closing it.
	 */
	assert(sqlite3_mutex_held(pCur->pBtree->pBt->mutex));

	/*
	 * Need to close here to so that the update happens unambiguously in
	 * the primary cursor.  That means the memory holding our copy of the
	 * key will be freed, so take a copy here.
	 *
	 * The copy is made before the DBC is detached so that an allocation
	 * failure cannot leave an open DBC that nothing references.  An empty
	 * key is skipped: sqlite3_malloc(0) returns NULL, which would
	 * otherwise be misreported as out-of-memory.
	 */
	if (!pIntKey && !pCur->isDupIndex &&
	    pCur->key.data != NULL && pCur->key.size > 0) {
		if ((keyCopy = sqlite3_malloc(pCur->key.size)) == NULL)
			return SQLITE_NOMEM;
		memcpy(keyCopy, pCur->key.data, pCur->key.size);
		pCur->key.data = keyCopy;
		pCur->key.flags |= DB_DBT_APPMALLOC;
	}

	dbc = pDbc;
	pDbc = NULL;

	if (pCur->eState == CURSOR_VALID)
		pCur->eState = (pCur->isIncrblobHandle && !incrBlobUpdate) ?
		    CURSOR_INVALID : CURSOR_REQUIRESEEK;

	ret = dbc->close(dbc);
	/* Any bulk-get position belonged to the cursor just closed. */
	pCur->multiGetPtr = NULL;
	pCur->isFirst = 0;
	return (ret == 0) ? SQLITE_OK : dberr2sqlite(ret, pCur->pBtree);
}
4184 
/*
 * Trip every other valid cursor of the same Btree and table that either has
 * a bulk-get buffer in progress or is positioned on the same entry as pCur.
 *
 * The cursor list is walked under the BtShared mutex.  The walk stops at the
 * first failure from btreeTripCursor(), and that code is returned;
 * otherwise the result is SQLITE_OK.
 */
static int btreeTripWatchers(BtCursor *pCur, int incrBlobUpdate)
{
	BtShared *pBt;
	BtCursor *other;
	int cmpRes, rc;

	pBt = pCur->pBtree->pBt;
	rc = SQLITE_OK;

	sqlite3_mutex_enter(pBt->mutex);
	other = pBt->first_cursor;
	while (other != NULL && rc == SQLITE_OK) {
		if (other != pCur && other->pBtree == pCur->pBtree &&
		    other->tableIndex == pCur->tableIndex &&
		    other->eState == CURSOR_VALID) {
			/*
			 * A cursor with no bulk buffer is only affected when
			 * it sits on the same entry.  The call to ->cmp does
			 * not do any locking.
			 */
			if (other->multiGetPtr != NULL ||
			    (pDbc->cmp(pDbc, other->dbc, &cmpRes, 0) == 0 &&
			    cmpRes == 0))
				rc = btreeTripCursor(other, incrBlobUpdate);
		}
		other = other->next;
	}
	sqlite3_mutex_leave(pBt->mutex);

	return rc;
}
4213 
/*
 * Trip every cursor on table iTable that has an open Berkeley DB cursor.
 *
 * The caller must hold the BtShared mutex.  If such a cursor belongs to a
 * different Btree handle than p, SQLITE_LOCKED_SHAREDCACHE is returned at
 * once.  Otherwise the walk stops at the first failure from
 * btreeTripCursor() and returns that code, or SQLITE_OK.
 */
static int btreeTripAll(Btree *p, int iTable, int incrBlobUpdate)
{
	BtCursor *cur;
	int rc;

	assert(sqlite3_mutex_held(p->pBt->mutex));

	rc = SQLITE_OK;
	cur = p->pBt->first_cursor;
	while (cur != NULL && rc == SQLITE_OK) {
		if (cur->tableIndex == iTable && cur->dbc != NULL) {
			if (cur->pBtree != p)
				return SQLITE_LOCKED_SHAREDCACHE;
			rc = btreeTripCursor(cur, incrBlobUpdate);
		}
		cur = cur->next;
	}

	return rc;
}
4236 
/*
 * Reopen the Berkeley DB cursor behind pCur and, unless skipMoveto is set,
 * seek back to the saved position.
 *
 * pCur       - cursor whose DBC is currently closed.
 * skipMoveto - when non-zero, only reopen: any saved key copy is released
 *              and the cursor is left CURSOR_INVALID.
 *
 * Returns the stored error for a CURSOR_FAULT cursor; SQLITE_ABORT for a
 * cursor with no Btree, or an invalid cursor that was asked to reposition;
 * otherwise the result of the reopen or seek.
 */
static int btreeRestoreCursorPosition(BtCursor *pCur, int skipMoveto)
{
	Btree *p;
	BtShared *pBt;
	void *savedKey;
	int keyLen, rc, ret;

	if (pCur->eState == CURSOR_FAULT)
		return pCur->error;
	if (pCur->pBtree == NULL)
		return SQLITE_ABORT;
	if (pCur->eState == CURSOR_INVALID && !skipMoveto)
		return SQLITE_ABORT;

	p = pCur->pBtree;
	pBt = p->pBt;

	assert(pDbc == NULL);

	if (pIsBuffer) {
		/* Buffered results must be moved into a real table first. */
		if ((rc = btreeLoadBufferIntoTable(pCur)) != SQLITE_OK)
			return rc;
	} else {
		/*
		 * SQLite should guarantee that an appropriate transaction is
		 * active.
		 */
		assert(!pBt->transactional || pReadTxn != NULL);
		assert(!pBt->transactional || !pCur->wrFlag ||
		    pSavepointTxn != NULL);

		pCur->txn = pCur->wrFlag ? pSavepointTxn : pReadTxn;

		ret = pBDb->cursor(pBDb, pCur->txn, &pDbc,
		    GET_BTREE_ISOLATION(p) & ~DB_READ_COMMITTED);
		if (ret != 0)
			return dberr2sqlite(ret, p);
	}

	if (skipMoveto) {
		/* The caller will seek itself; drop the saved key copy. */
		if ((pCur->key.flags & DB_DBT_APPMALLOC) != 0) {
			sqlite3_free(pCur->key.data);
			pCur->key.data = NULL;
			pCur->key.flags &= ~DB_DBT_APPMALLOC;
		}
		pCur->eState = CURSOR_INVALID;
		return SQLITE_OK;
	}

	if (pIntKey)
		return sqlite3BtreeMovetoUnpacked(pCur, NULL,
		    pCur->savedIntKey, 0, &pCur->lastRes);

	/*
	 * The pointer in pCur->key.data will be overwritten when we
	 * reposition, so we need to take a copy.
	 */
	if (!pCur->isDupIndex) {
		/* Take ownership of the heap copy made when tripped. */
		assert((pCur->key.flags & DB_DBT_APPMALLOC) != 0);
		pCur->key.flags &= ~DB_DBT_APPMALLOC;
		savedKey = pCur->key.data;
		keyLen = pCur->key.size;
	} else {
		/*
		 * Build the combined key, then detach it from the cursor by
		 * clearing the index DBT.
		 */
		savedKey = btreeCreateIndexKey(pCur);
		keyLen = pCur->index.size;
		memset(&pCur->index, 0, sizeof(DBT));
		if (savedKey == NULL)
			return SQLITE_NOMEM;
	}

	rc = btreeMoveto(pCur, savedKey, keyLen, 0, &pCur->lastRes);

	/* indexKeyBuf is part of the cursor itself and must not be freed. */
	if (savedKey != pCur->indexKeyBuf)
		sqlite3_free(savedKey);
	return rc;
}
4311 
4312 /*
4313  * Create a temporary table and load the contents of the multi buffer into it.
4314  */
btreeLoadBufferIntoTable(BtCursor * pCur)4315 static int btreeLoadBufferIntoTable(BtCursor *pCur)
4316 {
4317 	Btree *p;
4318 	BtShared *pBt;
4319 	int rc, ret;
4320 	void *temp;
4321 	sqlite3_mutex *mutexOpen;
4322 
4323 	assert(pCur->cached_db == NULL);
4324 
4325 	p = pCur->pBtree;
4326 	pBt = p->pBt;
4327 	ret = 0;
4328 
4329 	UPDATE_DURING_BACKUP(p)
4330 
4331 	temp = pCur->multiData.data;
4332 	pCur->multiData.data = NULL;
4333 	assert(pIsBuffer);
4334 	pIsBuffer = 0;
4335 
4336 	if ((rc = btreeCloseCursor(pCur, 1)) != SQLITE_OK)
4337 		goto err;
4338 
4339 	if (pBt->dbenv == NULL) {
4340 		mutexOpen = sqlite3MutexAlloc(OPEN_MUTEX(pBt->dbStorage));
4341 		sqlite3_mutex_enter(mutexOpen);
4342 		rc = btreePrepareEnvironment(p);
4343 		sqlite3_mutex_leave(mutexOpen);
4344 		if (rc != SQLITE_OK)
4345 			goto err;
4346 	}
4347 	rc = sqlite3BtreeCursor(p, pCur->tableIndex, 1, pCur->keyInfo, pCur);
4348 	if (pCur->eState == CURSOR_FAULT)
4349 		rc = pCur->error;
4350 	if (rc != SQLITE_OK)
4351 		goto err;
4352 	assert(!pCur->isDupIndex);
4353 	pCur->multiData.data = temp;
4354 	temp = NULL;
4355 	if (pCur->multiData.data != NULL) {
4356 		if ((ret = pBDb->sort_multiple(pBDb, &pCur->multiData, NULL,
4357 		    DB_MULTIPLE_KEY)) != 0)
4358 			goto err;
4359 		if ((ret = pBDb->put(pBDb, pCur->txn, &pCur->multiData, NULL,
4360 		    DB_MULTIPLE_KEY)) != 0)
4361 			goto err;
4362 	}
4363 
4364 err:	/*
4365 	 * If we get to here and we haven't set up the newly-opened cursor
4366 	 * properly, free the buffer it was holding now.  SQLite may not close
4367 	 * the cursor explicitly, and it is no longer in the list of open
4368 	 * cursors for the environment, so it will not be cleaned up on close.
4369 	 */
4370 	if (temp != NULL) {
4371 		assert(rc != SQLITE_OK || ret != 0);
4372 		sqlite3_free(temp);
4373 	}
4374 	return MAP_ERR(rc, ret, p);
4375 }
4376 
4377 /*
4378 ** Set *pSize to the size of the buffer needed to hold the value of the key
4379 ** for the current entry.  If the cursor is not pointing to a valid entry,
4380 ** *pSize is set to 0.
4381 **
4382 ** For a table with the INTKEY flag set, this routine returns the key itself,
4383 ** not the number of bytes in the key.
4384 */
sqlite3BtreeKeySize(BtCursor * pCur,i64 * pSize)4385 int sqlite3BtreeKeySize(BtCursor *pCur, i64 *pSize)
4386 {
4387 	int rc;
4388 
4389 	log_msg(LOG_VERBOSE, "sqlite3BtreeKeySize(%p, %p)", pCur, pSize);
4390 
4391 	if (pCur->eState != CURSOR_VALID &&
4392 	    (rc = btreeRestoreCursorPosition(pCur, 0)) != SQLITE_OK)
4393 		return rc;
4394 
4395 	if (pIntKey)
4396 		*pSize = pCur->savedIntKey;
4397 	else {
4398 		if (pCur->isDupIndex)
4399 			*pSize = (pCur->eState == CURSOR_VALID) ?
4400 				pCur->index.size : 0;
4401 		else
4402 			*pSize = (pCur->eState == CURSOR_VALID) ?
4403 		pCur->key.size : 0;
4404 	}
4405 
4406 	return SQLITE_OK;
4407 }
4408 
4409 /*
4410 ** Set *pSize to the number of bytes of data in the entry the cursor currently
4411 ** points to.  Always return SQLITE_OK. Failure is not possible. If the cursor
4412 ** is not currently pointing to an entry (which can happen, for example, if
4413 ** the database is empty) then *pSize is set to 0.
4414 */
sqlite3BtreeDataSize(BtCursor * pCur,u32 * pSize)4415 int sqlite3BtreeDataSize(BtCursor *pCur, u32 *pSize)
4416 {
4417 	int rc;
4418 
4419 	log_msg(LOG_VERBOSE, "sqlite3BtreeDataSize(%p, %p)", pCur, pSize);
4420 
4421 	if (pCur->eState != CURSOR_VALID &&
4422 	    (rc = btreeRestoreCursorPosition(pCur, 0)) != SQLITE_OK)
4423 		return rc;
4424 
4425 	if (pCur->isDupIndex)
4426 		*pSize = 0;
4427 	else
4428 		*pSize = (pCur->eState == CURSOR_VALID) ? pCur->data.size : 0;
4429 	return SQLITE_OK;
4430 }
4431 
4432 /*
4433 ** Read part of the key associated with cursor pCur.  Exactly "amt" bytes will
4434 ** be transfered into pBuf[].  The transfer begins at "offset".
4435 **
4436 ** Return SQLITE_OK on success or an error code if anything goes wrong. An
4437 ** error is returned if "offset+amt" is larger than the available payload.
4438 */
sqlite3BtreeKey(BtCursor * pCur,u32 offset,u32 amt,void * pBuf)4439 int sqlite3BtreeKey(BtCursor *pCur, u32 offset, u32 amt, void *pBuf)
4440 {
4441 	int rc;
4442 
4443 	log_msg(LOG_VERBOSE, "sqlite3BtreeKey(%p, %u, %u, %p)",
4444 	    pCur, offset, amt, pBuf);
4445 
4446 	if (pCur->eState != CURSOR_VALID &&
4447 	    (rc = btreeRestoreCursorPosition(pCur, 0)) != SQLITE_OK)
4448 		return rc;
4449 
4450 	assert(pCur->eState == CURSOR_VALID);
4451 	/* The rowid part of the key in an index is stored in the
4452 	 * data part of the cursor.*/
4453 	if (pCur->isDupIndex)
4454 		memcpy(pBuf, (u_int8_t *)pCur->index.data + offset, amt);
4455 	else
4456 		memcpy(pBuf, (u_int8_t *)pCur->key.data + offset, amt);
4457 	return SQLITE_OK;
4458 }
4459 
4460 /*
4461 ** Read part of the data associated with cursor pCur.  Exactly "amt" bytes
4462 ** will be transfered into pBuf[].  The transfer begins at "offset".
4463 **
4464 ** Return SQLITE_OK on success or an error code if anything goes wrong. An
4465 ** error is returned if "offset+amt" is larger than the available payload.
4466 */
sqlite3BtreeData(BtCursor * pCur,u32 offset,u32 amt,void * pBuf)4467 int sqlite3BtreeData(BtCursor *pCur, u32 offset, u32 amt, void *pBuf)
4468 {
4469 	int rc;
4470 
4471 	log_msg(LOG_VERBOSE, "sqlite3BtreeData(%p, %u, %u, %p)",
4472 	    pCur, offset, amt, pBuf);
4473 
4474 	if (pCur->eState != CURSOR_VALID &&
4475 	    (rc = btreeRestoreCursorPosition(pCur, 0)) != SQLITE_OK)
4476 		return rc;
4477 
4478 	assert(pCur->eState == CURSOR_VALID);
4479 	memcpy(pBuf, (u_int8_t *)pCur->data.data + offset, amt);
4480 	return SQLITE_OK;
4481 }
4482 
/*
 * Make sure pCur->index can hold at least "amount" bytes.
 *
 * When the current buffer is too small it is replaced by a heap buffer of
 * twice the requested size; the previous contents are not preserved.  The
 * cursor's built-in indexKeyBuf is never freed.
 *
 * Returns pCur->index.data.  On allocation failure returns NULL after
 * putting the cursor into CURSOR_FAULT with error SQLITE_NOMEM, and leaves
 * index.data NULL with index.ulen 0.
 */
void *allocateCursorIndex(BtCursor *pCur, u_int32_t amount)
{
	u_int32_t newLen;

	if (pCur->index.ulen < amount) {
		/* Over-allocate to limit regrowth, but never let it wrap. */
		newLen = (amount > ((u_int32_t)-1) / 2) ? amount : amount * 2;
		if (pCur->index.data != pCur->indexKeyBuf)
			sqlite3_free(pCur->index.data);
		pCur->index.data = sqlite3_malloc(newLen);
		if (!pCur->index.data) {
			/*
			 * Record that there is no buffer at all, so a later
			 * call retries the allocation instead of trusting a
			 * capacity that was never obtained.
			 */
			pCur->index.ulen = 0;
			pCur->error = SQLITE_NOMEM;
			pCur->eState = CURSOR_FAULT;
			return NULL;
		}
		pCur->index.ulen = newLen;
	}
	return pCur->index.data;
}
4498 
4499 /* The rowid part of an index key is actually stored as data
4500  * in a Berkeley DB database, so it needs to be appended to the
4501  * key. */
btreeCreateIndexKey(BtCursor * pCur)4502 void *btreeCreateIndexKey(BtCursor *pCur)
4503 {
4504 	u32 hdrSize;
4505 	u_int32_t amount;
4506 	unsigned char *aKey = (unsigned char *)pCur->key.data;
4507 	unsigned char *data = (unsigned char *)pCur->data.data;
4508 	unsigned char *newKey;
4509 
4510 	amount = pCur->key.size + pCur->data.size;
4511 	if (!allocateCursorIndex(pCur, amount))
4512 		return NULL;
4513 	newKey = (unsigned char *)pCur->index.data;
4514 	getVarint32(aKey, hdrSize);
4515 	/*
4516 	 * The first byte contains the size of the record header,
4517 	 * which will change anyway so no need to copy it now.  We
4518 	 * are trying to minimize the number of times memcpy is called
4519 	 * in the common path.
4520 	 */
4521 	if ((hdrSize - 1) == 1)
4522 		newKey[1] = aKey[1];
4523 	else
4524 		memcpy(&newKey[1], &aKey[1], hdrSize - 1);
4525 	if (pCur->key.size != hdrSize) {
4526 		memcpy(&newKey[hdrSize+1], &aKey[hdrSize],
4527 		    pCur->key.size - hdrSize);
4528 	}
4529 	memcpy(&newKey[pCur->key.size+1], &data[1], pCur->data.size - 1);
4530 	newKey[hdrSize] = data[0];
4531 	putVarint32(newKey, hdrSize+1);
4532 	pCur->index.size = amount;
4533 	return newKey;
4534 }
4535 
4536 /*
4537 ** For the entry that cursor pCur is point to, return as many bytes of the
4538 ** key or data as are available on the local b-tree page. Write the number
4539 ** of available bytes into *pAmt.
4540 **
4541 ** The pointer returned is ephemeral.  The key/data may move or be destroyed
4542 ** on the next call to any Btree routine.
4543 **
4544 ** These routines is used to get quick access to key and data in the common
4545 ** case where no overflow pages are used.
4546 */
sqlite3BtreeKeyFetch(BtCursor * pCur,int * pAmt)4547 const void *sqlite3BtreeKeyFetch(BtCursor *pCur, int *pAmt)
4548 {
4549 	log_msg(LOG_VERBOSE, "sqlite3BtreeKeyFetch(%p, %p)", pCur, pAmt);
4550 
4551 	assert(pCur->eState == CURSOR_VALID);
4552 	if (pCur->isDupIndex) {
4553 		*pAmt = pCur->index.size;
4554 		return pCur->index.data;
4555 	}
4556 	*pAmt = pCur->key.size;
4557 	return pCur->key.data;
4558 }
4559 
/*
** Return a pointer to the data bytes of the entry the cursor is pointing to
** and write their count into *pAmt.  The cursor must be valid.  The pointer
** refers to the cursor's data DBT.
*/
const void *sqlite3BtreeDataFetch(BtCursor *pCur, int *pAmt)
{
	const void *payload;

	log_msg(LOG_VERBOSE, "sqlite3BtreeDataFetch(%p, %p)", pCur, pAmt);

	assert(pCur->eState == CURSOR_VALID);
	payload = pCur->data.data;
	*pAmt = pCur->data.size;
	return payload;
}
4568 
4569 /*
4570 ** Clear the current cursor position.
4571 */
sqlite3BtreeClearCursor(BtCursor * pCur)4572 void sqlite3BtreeClearCursor(BtCursor *pCur)
4573 {
4574 	log_msg(LOG_VERBOSE, "sqlite3BtreeClearCursor(%p)", pCur);
4575 
4576 	pCur->eState = CURSOR_INVALID;
4577 }
4578 
/*
 * Post-process the entry a get just left in pCur->key / pCur->data.
 *
 * For an integer-key table the key is copied into pCur->savedIntKey.  For a
 * duplicate index the combined key is rebuilt in pCur->index.  Other cursors
 * need no work.
 *
 * Returns SQLITE_OK, or SQLITE_NOMEM if the combined key cannot be built.
 */
static int decodeResults(BtCursor *pCur)
{
	if (pIntKey) {
		memcpy(&(pCur->savedIntKey), pCur->key.data, sizeof(i64));
		return SQLITE_OK;
	}
	if (!pCur->isDupIndex)
		return SQLITE_OK;
	return (btreeCreateIndexKey(pCur) == NULL) ? SQLITE_NOMEM : SQLITE_OK;
}
4587 
/*
 * Fetch an entry for pCur using Berkeley DB cursor operation "op".
 *
 * Entries are served from the cursor's bulk buffer when one is active;
 * otherwise a bulk get is attempted for DB_FIRST (and for DB_NEXT once
 * skipMulti has been cleared), falling back to a single get when the bulk
 * buffer is too small.
 *
 * pCur - cursor to move.
 * op   - DB_FIRST, DB_NEXT, DB_LAST or another DBC->get operation.
 * pRes - set to 0 when the cursor ends up on an entry, 1 when it does not.
 *
 * Returns SQLITE_OK, SQLITE_NOMEM, or the SQLite mapping of a Berkeley DB
 * error.  Running off the end (DB_NOTFOUND) is not an error: the cursor
 * becomes CURSOR_INVALID and *pRes is 1.
 */
static int cursorGet(BtCursor *pCur, int op, int *pRes)
{
	DBT oldkey;
	int ret, equal;

	log_msg(LOG_VERBOSE, "cursorGet(%p, %u, %p)", pCur, op, pRes);
	ret = 0;

	if (op == DB_NEXT && pCur->multiGetPtr != NULL) {
		/*
		 * Get the next record, skipping duplicates in buffered
		 * indices/transient table.  Note that when we store an
		 * index in a buffer, it is always configured with
		 * BTREE_ZERODATA and we don't configure transient indices
		 * with DB_DUPSORT.  So the data part will always be empty,
		 * and we don't need to check it.
		 */
		for (equal = 0, oldkey = pCur->key; equal == 0;
		    oldkey = pCur->key) {
			DB_MULTIPLE_KEY_NEXT(pCur->multiGetPtr,
			    &pCur->multiData, pCur->key.data, pCur->key.size,
			    pCur->data.data, pCur->data.size);
			if (!pIsBuffer || pCur->multiGetPtr == NULL ||
			    oldkey.size != pCur->key.size)
				break;
			if (pCur->keyInfo == NULL)
				equal = memcmp(pCur->key.data, oldkey.data,
				    oldkey.size);
			else
				equal = btreeCompare(NULL, &pCur->key,
				    &oldkey, pCur->keyInfo);
		}

		if (pCur->multiGetPtr != NULL) {
			*pRes = 0;
			return decodeResults(pCur);
		} else if (pIsBuffer)
			/* A results buffer has no table to read more from. */
			goto err;
	}

	if (pIsBuffer && op == DB_LAST) {
		DBT key, data;
		memset(&key, 0, sizeof(key));
		memset(&data, 0, sizeof(data));
		if (pCur->multiGetPtr == NULL)
			goto err;
		/* Walk the buffer to its end, keeping the last entry seen. */
		do {
			DB_MULTIPLE_KEY_NEXT(pCur->multiGetPtr,
			    &pCur->multiData, key.data, key.size,
			    data.data, data.size);
			if (pCur->multiGetPtr != NULL) {
				pCur->key = key;
				pCur->data = data;
			}
		} while (pCur->multiGetPtr != NULL);
		*pRes = 0;
		return decodeResults(pCur);
	}

	assert(!pIsBuffer);

	if (op == DB_FIRST || (op == DB_NEXT && !pCur->skipMulti)) {
		if (pCur->multiData.data == NULL) {
			pCur->multiData.data = sqlite3_malloc(MULTI_BUFSIZE);
			if (pCur->multiData.data == NULL)
				return SQLITE_NOMEM;
			pCur->multiData.flags = DB_DBT_USERMEM;
			pCur->multiData.ulen = MULTI_BUFSIZE;
		}

		/*
		 * We can't keep DBC_TRANSIENT set on a bulk get
		 * cursor: if the buffer turns out to be too small, we
		 * have no way to restore the position.
		 */
		pDbc->flags &= ~DBC_TRANSIENT;
		ret = pDbc->get(pDbc, &pCur->key, &pCur->multiData,
		    op | DB_MULTIPLE_KEY);
		if (!pCur->wrFlag)
			pDbc->flags |= DBC_TRANSIENT;

		if (ret == 0) {
			pCur->isFirst = (op == DB_FIRST);
			DB_MULTIPLE_INIT(pCur->multiGetPtr, &pCur->multiData);
			DB_MULTIPLE_KEY_NEXT(pCur->multiGetPtr,
			    &pCur->multiData, pCur->key.data, pCur->key.size,
			    pCur->data.data, pCur->data.size);
			pCur->eState = CURSOR_VALID;
			*pRes = 0;
			return decodeResults(pCur);
		} else if (ret != DB_BUFFER_SMALL)
			goto err;
		/*
		 * DB_BUFFER_SMALL: the entry does not fit in the bulk
		 * buffer, so fall through to a single get below.
		 */
	} else if (op == DB_NEXT)
		/* Allow a bulk get on the next DB_NEXT. */
		pCur->skipMulti = 0;

	pCur->lastRes = 0;
	pCur->isFirst = 0;

	ret = pDbc->get(pDbc, &pCur->key, &pCur->data, op | RMW(pCur));
	if (ret == 0) {
		pCur->eState = CURSOR_VALID;
		*pRes = 0;
		return decodeResults(pCur);
	} else {
err:		if (ret == DB_NOTFOUND)
			ret = 0;
		if (ret != 0 && ret != DB_LOCK_DEADLOCK)
			log_msg(LOG_NORMAL, "cursorGet get returned error: %s",
			    db_strerror(ret));
		pCur->key.size = pCur->data.size = 0;
		pCur->eState = CURSOR_INVALID;
		*pRes = 1;
	}
	return (ret == 0) ? SQLITE_OK : dberr2sqlitelocked(ret, pCur->pBtree);
}
4715 
4716 /* Move the cursor to the first entry in the table.  Return SQLITE_OK on
4717 ** success.  Set *pRes to 0 if the cursor actually points to something or set
4718 ** *pRes to 1 if the table is empty.
4719 */
sqlite3BtreeFirst(BtCursor * pCur,int * pRes)4720 int sqlite3BtreeFirst(BtCursor *pCur, int *pRes)
4721 {
4722 	DB *tmp_db;
4723 	u_int32_t get_flag;
4724 	int rc, ret;
4725 
4726 	log_msg(LOG_VERBOSE, "sqlite3BtreeFirst(%p, %p)", pCur, pRes);
4727 
4728 	get_flag = DB_FIRST;
4729 
4730 	if (pCur->eState == CURSOR_FAULT)
4731 		return pCur->error;
4732 
4733 	/*
4734 	 * We might be lucky, and be holding all of a table in the bulk buffer.
4735 	 */
4736 	if (pCur->multiData.data != NULL && (pIsBuffer || pCur->isFirst)) {
4737 		/*
4738 		 * If we've just finished constructing a transient table, sort
4739 		 * and retrieve.
4740 		 */
4741 		if (pCur->multiPutPtr != NULL) {
4742 			if (pCur->eState == CURSOR_FAULT)
4743 				return pCur->error;
4744 
4745 			if ((ret = db_create(&tmp_db,
4746 			    pCur->pBtree->pBt->dbenv, 0)) != 0)
4747 			    return dberr2sqlite(ret, pCur->pBtree);
4748 			tmp_db->app_private = pCur->keyInfo;
4749 			if (!pIntKey)
4750 				tmp_db->set_bt_compare(tmp_db,
4751 				    btreeCompareKeyInfo);
4752 			else
4753 				tmp_db->set_bt_compare(tmp_db,
4754 				    btreeCompareIntKey);
4755 			tmp_db->sort_multiple(tmp_db, &pCur->multiData,
4756 			    NULL, DB_MULTIPLE_KEY);
4757 			if ((ret = tmp_db->close(tmp_db, 0)) != 0)
4758 				return dberr2sqlite(ret, pCur->pBtree);
4759 			pCur->multiPutPtr = NULL;
4760 		}
4761 
4762 		DB_MULTIPLE_INIT(pCur->multiGetPtr, &pCur->multiData);
4763 		memset(&pCur->key, 0, sizeof(pCur->key));
4764 		pCur->isFirst = 1;
4765 		pCur->eState = CURSOR_VALID;
4766 		get_flag = DB_NEXT;
4767 	} else if (pIsBuffer) {
4768 		*pRes = 1;
4769 		return SQLITE_OK;
4770 	} else {
4771 		pCur->multiGetPtr = NULL;
4772 
4773 		if (pDbc == NULL &&
4774 		    (rc = btreeRestoreCursorPosition(pCur, 1)) != SQLITE_OK)
4775 			return rc;
4776 	}
4777 
4778 	return cursorGet(pCur, get_flag, pRes);
4779 }
4780 
4781 /*
4782 ** Move the cursor to the last entry in the table.  Return SQLITE_OK on
4783 ** success.  Set *pRes to 0 if the cursor actually points to something or set
4784 ** *pRes to 1 if the table is empty.
4785 */
sqlite3BtreeLast(BtCursor * pCur,int * pRes)4786 int sqlite3BtreeLast(BtCursor *pCur, int *pRes)
4787 {
4788 	DB *tmp_db;
4789 	int rc, ret;
4790 
4791 	log_msg(LOG_VERBOSE, "sqlite3BtreeLast(%p, %p)", pCur, pRes);
4792 
4793 	if (pCur->eState == CURSOR_FAULT)
4794 		return pCur->error;
4795 
4796 	if (pCur->multiData.data != NULL && pIsBuffer) {
4797 		if (pCur->multiPutPtr != NULL) {
4798 			if ((ret = db_create(&tmp_db,
4799 			    pCur->pBtree->pBt->dbenv, 0)) != 0)
4800 			    return dberr2sqlite(ret, pCur->pBtree);
4801 			tmp_db->app_private = pCur->keyInfo;
4802 			if (!pIntKey)
4803 				tmp_db->set_bt_compare(tmp_db,
4804 				    btreeCompareKeyInfo);
4805 			else
4806 				tmp_db->set_bt_compare(tmp_db,
4807 				    btreeCompareIntKey);
4808 			tmp_db->sort_multiple(tmp_db, &pCur->multiData,
4809 			    NULL, DB_MULTIPLE_KEY);
4810 			if ((ret = tmp_db->close(tmp_db, 0)) != 0)
4811 				return dberr2sqlite(ret, pCur->pBtree);
4812 			pCur->multiPutPtr = NULL;
4813 		}
4814 
4815 		DB_MULTIPLE_INIT(pCur->multiGetPtr, &pCur->multiData);
4816 		memset(&pCur->key, 0, sizeof(pCur->key));
4817 		pCur->eState = CURSOR_VALID;
4818 	} else if (pIsBuffer) {
4819 		*pRes = 1;
4820 		return SQLITE_OK;
4821 	} else {
4822 		if (pDbc == NULL &&
4823 		    (rc = btreeRestoreCursorPosition(pCur, 1)) != SQLITE_OK)
4824 			return rc;
4825 
4826 		pCur->multiGetPtr = NULL;
4827 	}
4828 
4829 	return cursorGet(pCur, DB_LAST, pRes);
4830 }
4831 
4832 /*
4833 ** Return TRUE if the cursor is not pointing at an entry of the table.
4834 **
4835 ** TRUE will be returned after a call to sqlite3BtreeNext() moves past the last
4836 ** entry in the table or sqlite3BtreePrev() moves past the first entry. TRUE
4837 ** is also returned if the table is empty.
4838 */
sqlite3BtreeEof(BtCursor * pCur)4839 int sqlite3BtreeEof(BtCursor *pCur)
4840 {
4841 	log_msg(LOG_VERBOSE, "sqlite3BtreeEof(%p)", pCur);
4842 
4843 	return pCur->eState == CURSOR_INVALID;
4844 }
4845 
4846 /*
4847 ** Advance the cursor to the next entry in the database.  If successful then
4848 ** set *pRes=0.  If the cursor was already pointing to the last entry in the
4849 ** database before this routine was called, then set *pRes=1.
4850 */
sqlite3BtreeNext(BtCursor * pCur,int * pRes)4851 int sqlite3BtreeNext(BtCursor *pCur, int *pRes)
4852 {
4853 	int rc;
4854 	log_msg(LOG_VERBOSE, "sqlite3BtreeNext(%p, %p)", pCur, pRes);
4855 
4856 	if (pCur->pBtree != NULL && pCur->eState == CURSOR_INVALID) {
4857 		*pRes = 1;
4858 		return SQLITE_OK;
4859 	}
4860 
4861 	if (pCur->eState != CURSOR_VALID &&
4862 	    (rc = btreeRestoreCursorPosition(pCur, 0)) != SQLITE_OK)
4863 		return rc;
4864 
4865 	if (pCur->lastRes > 0) {
4866 		pCur->lastRes = 0;
4867 		*pRes = 0;
4868 		return SQLITE_OK;
4869 	}
4870 
4871 	return cursorGet(pCur, DB_NEXT, pRes);
4872 }
4873 
4874 /*
4875 ** Step the cursor to the back to the previous entry in the database.  If
4876 ** successful then set *pRes=0.  If the cursor was already pointing to the
4877 ** first entry in the database before this routine was called, then set *pRes=1.
4878 */
sqlite3BtreePrevious(BtCursor * pCur,int * pRes)4879 int sqlite3BtreePrevious(BtCursor *pCur, int *pRes)
4880 {
4881 	int rc;
4882 	log_msg(LOG_VERBOSE, "sqlite3BtreePrevious(%p, %p)", pCur, pRes);
4883 
4884 	if (pCur->eState != CURSOR_VALID &&
4885 	    (rc = btreeRestoreCursorPosition(pCur, 0)) != SQLITE_OK)
4886 		return rc;
4887 
4888 	if (pCur->eState == CURSOR_INVALID) {
4889 		*pRes = 1;
4890 		return SQLITE_OK;
4891 	}
4892 
4893 	if (pCur->lastRes < 0) {
4894 		pCur->lastRes = 0;
4895 		*pRes = 0;
4896 		return SQLITE_OK;
4897 	}
4898 
4899 	return cursorGet(pCur, DB_PREV, pRes);
4900 }
4901 
/*
 * Write the cursor's current key/data pair through its Berkeley DB cursor.
 *
 * Duplicate-index cursors use DB_NODUPDATA, all others DB_KEYLAST.  When
 * nZero is positive and the first put succeeded, a single zero byte is then
 * written with a partial put at offset (nData + nZero - 1) of the record just
 * stored.  NOTE(review): this presumably extends the record so that it ends
 * in nZero zero bytes -- confirm against Berkeley DB partial-put semantics.
 *
 * Returns 0 on success or the Berkeley DB error from the failing put.
 */
static int insertData(BtCursor *pCur, int nZero, int nData)
{
	DBT tail;
	u_int32_t putFlag;
	u8 zeroByte;
	int ret;

	UPDATE_DURING_BACKUP(pCur->pBtree);

	putFlag = DB_KEYLAST;
	if (pCur->isDupIndex)
		putFlag = DB_NODUPDATA;

	ret = pDbc->put(pDbc, &pCur->key, &pCur->data, putFlag);
	if (ret != 0 || nZero <= 0)
		return ret;

	/* One-byte partial write at the final offset of the padded record. */
	zeroByte = 0;
	memset(&tail, 0, sizeof(tail));
	tail.data = &zeroByte;
	tail.ulen = 1;
	tail.dlen = 1;
	tail.size = 1;
	tail.doff = nData + nZero - 1;
	tail.flags = DB_DBT_PARTIAL | DB_DBT_USERMEM;

	return pDbc->put(pDbc, &pCur->key, &tail, DB_CURRENT);
}
4925 
/*
** Insert a new record into the BTree.  The key is given by (pKey,nKey) and
** the data is given by (pData,nData).  The cursor is used only to define
** what table the record should be inserted into.  The cursor is left
** pointing at a random location.
**
** For an INTKEY table, only the nKey value of the key is used.  pKey is
** ignored.  For a ZERODATA table, the pData and nData are both ignored.
**
** Returns SQLITE_READONLY if the cursor was not opened for writing;
** otherwise the result of the insert mapped to an SQLite error code.  In
** this implementation appendBias and seekResult are only logged.
*/
int sqlite3BtreeInsert(
    BtCursor *pCur,		/* Insert data into the table of this cursor */
    const void *pKey, i64 nKey,	/* The key of the new record */
    const void *pData, int nData,	/* The data of the new record */
    int nZero,			/* Number of extra 0 bytes */
    int appendBias,		/* True if this likely an append */
    int seekResult)		/* Result of prior sqlite3BtreeMoveto() call */
{
	int rc, ret;
	i64 encKey;		/* Storage for the integer key of INTKEY tables */
	UnpackedRecord *p;	/* Unpacked index key, if one was created */
	char aSpace[150];	/* Scratch space passed to the record unpacker */

	log_msg(LOG_VERBOSE,
	    "sqlite3BtreeInsert(%p, %p, %u, %p, %u, %u, %u, %u)",
	    pCur, pKey, (int)nKey, pData, nData, nZero, appendBias, seekResult);

	if (!pCur->wrFlag)
		return SQLITE_READONLY;

	p = NULL;
	rc = SQLITE_OK;

	/* Invalidate current cursor state. */
	pCur->multiGetPtr = NULL;
	pCur->isFirst = 0;
	pCur->lastKey = 0;
	memset(&pCur->key, 0, sizeof(pCur->key));
	memset(&pCur->data, 0, sizeof(pCur->data));

	/* Point the key DBT at the integer key or at the caller's blob key. */
	if (pIntKey) {
		pCur->key.size = sizeof(i64);
		encKey = nKey;
		pCur->key.data = &encKey;
	} else {
		pCur->key.data = (void *)pKey;
		pCur->key.size = (u_int32_t)nKey;
	}
	/* Duplicate indexes derive the data DBT from the key instead. */
	if (pCur->isDupIndex)
		splitIndexKey(pCur);
	else {
		pCur->data.data = (void *)pData;
		pCur->data.size = nData;
	}

	if (pIsBuffer) {
		ret = 0;
		if (nZero == 0) {
			/* First buffered insert: allocate the bulk buffer. */
			if (pCur->multiData.data == NULL) {
				if ((pCur->multiData.data =
				    sqlite3_malloc(MULTI_BUFSIZE)) == NULL) {
					ret = ENOMEM;
					goto err;
				}
				pCur->multiData.flags = DB_DBT_USERMEM;
				pCur->multiData.ulen = MULTI_BUFSIZE;
				DB_MULTIPLE_WRITE_INIT(pCur->multiPutPtr,
				    &pCur->multiData);
			}
			/*
			 * It is possible for temporary results to be written,
			 * read, then written again.  In that case just load
			 * the results into a table.
			 *
			 * NOTE(review): per the Berkeley DB bulk-buffer
			 * macros, the write below is expected to leave
			 * multiPutPtr NULL when the pair does not fit, which
			 * also triggers the table fallback -- confirm.
			 */
			if (pCur->multiPutPtr != NULL) {
				DB_MULTIPLE_KEY_WRITE_NEXT(pCur->multiPutPtr,
				    &pCur->multiData,
				    pCur->key.data, pCur->key.size,
				    pCur->data.data, pCur->data.size);
			}
		} else
			/* Records with zero padding are never buffered. */
			pCur->multiPutPtr = NULL;
		if (pCur->multiPutPtr == NULL) {
			/* Fall back to a real table, then insert into it. */
			rc = btreeLoadBufferIntoTable(pCur);
			if (rc != SQLITE_OK)
				return rc;
			ret = insertData(pCur, nZero, nData);
		}
		goto err;
	}
	if (!pIntKey && pKey != NULL) {
		/*
		 * Cache an unpacked key in the DBT so we don't have to unpack
		 * it on every comparison.
		 */
		pCur->key.app_data = p = sqlite3VdbeRecordUnpack(pCur->keyInfo,
		    (int)nKey, pKey, aSpace, sizeof(aSpace));
	}

	ret = insertData(pCur, nZero, nData);

	if (ret == 0) {
		/*
		 * We may have updated a record or inserted into a range that
		 * is cached by another cursor.
		 */
		if ((rc = btreeTripWatchers(pCur, 0)) != SQLITE_OK)
			goto err;
		pCur->skipMulti = 0;
	} else
		pCur->eState = CURSOR_INVALID;
	/* Common exit: release the unpacked key and drop the reference to it. */
err:	if (p != NULL)
		sqlite3VdbeDeleteUnpackedRecord(p);
	pCur->key.app_data = NULL;
	return MAP_ERR_LOCKED(rc, ret, pCur->pBtree);
}
5041 
5042 /*
5043 ** Delete the entry that the cursor is pointing to.  The cursor is left
5044 ** pointing at a random location.
5045 */
sqlite3BtreeDelete(BtCursor * pCur)5046 int sqlite3BtreeDelete(BtCursor *pCur)
5047 {
5048 	DBC *tmpc;
5049 	int rc, ret;
5050 
5051 	log_msg(LOG_VERBOSE, "sqlite3BtreeDelete(%p)", pCur);
5052 
5053 	ret = 0;
5054 	if (!pCur->wrFlag)
5055 		return SQLITE_READONLY;
5056 
5057 	if (pIsBuffer) {
5058 		int res;
5059 		rc = btreeMoveto(pCur, pCur->key.data, pCur->key.size, 0, &res);
5060 		if (rc != SQLITE_OK)
5061 			return rc;
5062 	}
5063 
5064 	assert(!pIsBuffer);
5065 
5066 	if (pCur->multiGetPtr != NULL) {
5067 		DBT dummy;
5068 		pCur->multiGetPtr = NULL;
5069 		pCur->isFirst = 0;
5070 		memset(&dummy, 0, sizeof(dummy));
5071 		dummy.flags = DB_DBT_USERMEM | DB_DBT_PARTIAL;
5072 		if ((ret = pDbc->get(pDbc,
5073 		    &pCur->key, &dummy, DB_SET | RMW(pCur))) != 0)
5074 		    return dberr2sqlitelocked(ret, pCur->pBtree);
5075 		pCur->eState = CURSOR_VALID;
5076 	}
5077 
5078 	if ((rc = btreeTripWatchers(pCur, 0)) != SQLITE_OK)
5079 		return rc;
5080 	ret = pDbc->del(pDbc, 0);
5081 
5082 	/*
5083 	 * We now de-position the cursor to ensure that the record is
5084 	 * really deleted. [#18667]
5085 	 *
5086 	 * Since we tripped all watchers before doing the delete, there can be
5087 	 * no other open cursors pointing to this record.  SQLite's record
5088 	 * comparator will behave incorrectly if it sees a record that is
5089 	 * marked for deletion (see the UNPACKED_PREFIX_SEARCH flag), so this
5090 	 * makes sure that never happens.
5091 	 */
5092 	if (ret == 0 && (ret = pDbc->dup(pDbc, &tmpc, 0)) == 0) {
5093 		ret = pDbc->close(pDbc);
5094 		pDbc = tmpc;
5095 	}
5096 	pCur->eState = CURSOR_INVALID;
5097 
5098 	return (ret == 0) ? SQLITE_OK : dberr2sqlitelocked(ret, pCur->pBtree);
5099 }
5100 
5101 /*
5102 ** Create a new BTree table.  Write into *piTable the page number for the root
5103 ** page of the new table.
5104 **
5105 ** The type of type is determined by the flags parameter.  Only the following
5106 ** values of flags are currently in use.  Other values for flags might not
5107 ** work:
5108 **
5109 **     BTREE_INTKEY		Used for SQL tables with rowid keys
5110 **     BTREE_BLOBKEY		Used for SQL indices
5111 */
btreeCreateTable(Btree * p,int * piTable,int flags)5112 static int btreeCreateTable(Btree *p, int *piTable, int flags)
5113 {
5114 	BtShared *pBt;
5115 	CACHED_DB *cached_db;
5116 	DBC *dbc;
5117 	DBT key, data;
5118 	int lastTable, rc, ret, t_ret;
5119 
5120 	cached_db = NULL;
5121 	pBt = p->pBt;
5122 	rc = SQLITE_OK;
5123 	lastTable = 0;
5124 	ret = 0;
5125 
5126 	dbc = NULL;
5127 	if (pBt->dbStorage == DB_STORE_NAMED) {
5128 		ret = pTablesDb->cursor(pTablesDb, pFamilyTxn, &dbc, 0);
5129 		if (ret != 0)
5130 			goto err;
5131 
5132 		memset(&key, 0, sizeof(key));
5133 		memset(&data, 0, sizeof(data));
5134 		data.flags = DB_DBT_PARTIAL | DB_DBT_USERMEM;
5135 
5136 		if ((ret = dbc->get(dbc, &key, &data, DB_LAST)) != 0)
5137 			goto err;
5138 
5139 		if (strncmp((const char *)key.data, "table", 5) == 0 &&
5140 		    (ret = btreeTableNameToId(
5141 		    (const char *)key.data, key.size, &lastTable)) != 0)
5142 			goto err;
5143 
5144 		ret = dbc->close(dbc);
5145 		dbc = NULL;
5146 		if (ret != 0)
5147 			goto err;
5148 	}
5149 
5150 	cached_db = NULL;
5151 	rc = btreeFindOrCreateDataTable(p,
5152 	    &lastTable, &cached_db, flags | BTREE_CREATE);
5153 	if (rc == SQLITE_OK)
5154 		*piTable = lastTable;
5155 
5156 err:	if (dbc != NULL)
5157 		if ((t_ret = dbc->close(dbc)) != 0 && ret == 0)
5158 			ret = t_ret;
5159 
5160 	return MAP_ERR(rc, ret, p);
5161 }
5162 
/*
 * Public entry point for table creation.  Results-buffer btrees never create
 * a real table and always report table number 2; any other btree is
 * connected to its environment if necessary and then handed to
 * btreeCreateTable().
 */
int sqlite3BtreeCreateTable(Btree *p, int *piTable, int flags)
{
	int rc;

	log_msg(LOG_VERBOSE, "sqlite3BtreeCreateTable(%p, %p, %u)",
	    p, piTable, flags);

	if (!p->pBt->resultsBuffer) {
		if (!p->connected) {
			rc = btreeOpenEnvironment(p, 1);
			if (rc != SQLITE_OK)
				return rc;
		}
		return btreeCreateTable(p, piTable, flags);
	}

	/*
	 * With ephemeral tables, there are at most two tables created: the
	 * initial master table, which is used for INTKEY tables, or, for
	 * indices, a second table is opened and the master table is unused.
	 */
	assert(!(flags & BTREE_INTKEY));
	*piTable = 2;
	return SQLITE_OK;
}
5190 
/*
** Delete all information from a single table in the database.  iTable is the
** page number of the root of the table.  After this routine returns, the root
** page is empty, but still exists.
**
** This routine will fail with SQLITE_LOCKED if there are any open read
** cursors on the table.  Open write cursors are moved to the root of the
** table.
**
** If pnChange is not NULL, then table iTable must be an intkey table. The
** integer value pointed to by pnChange is incremented by the number of
** entries in the table.
**
** Returns SQLITE_READONLY for a read-only btree, SQLITE_NOMEM if the
** deleted-table record cannot be allocated, and otherwise the combined
** SQLite/Berkeley DB status mapped by MAP_ERR.
*/
int sqlite3BtreeClearTable(Btree *p, int iTable, int *pnChange)
{
	BtShared *pBt;
	CACHED_DB *cached_db;
	DELETED_TABLE *dtable;
	char *tableName, tableNameBuf[DBNAME_SIZE];
	char *oldTableName, oldTableNameBuf[DBNAME_SIZE], *fileName;
	int need_truncate, rc, ret, tryfast;
	u_int32_t count;

	log_msg(LOG_VERBOSE, "sqlite3BtreeClearTable(%p, %u, %p)",
	    p, iTable, pnChange);

	pBt = p->pBt;
	count = 0;
	ret = tryfast = 0;
	rc = SQLITE_OK;
	/* Truncate in place unless the rename-based fast path succeeds. */
	need_truncate = 1;
	if (IS_BTREE_READONLY(p))
		return SQLITE_READONLY;

	/* Close any open cursors. */
	sqlite3_mutex_enter(pBt->mutex);

	/*
	 * SQLite expects all cursors apart from read-uncommitted cursors to be
	 * closed.  However, Berkeley DB cannot truncate unless *all* cursors
	 * are closed.  This call to btreeTripAll will fail if there are any
	 * cursors open on other connections with SQLITE_LOCKED_SHAREDCACHE,
	 * which makes tests shared2-1.[23] fail with "table locked" errors.
	 */
	if ((rc = btreeTripAll(p, iTable, 0)) != SQLITE_OK) {
		sqlite3_mutex_leave(pBt->mutex);
		return rc;
	}
	sqlite3_mutex_leave(pBt->mutex);

	/* Look up the handle for the table (no create flag is passed). */
	rc = btreeFindOrCreateDataTable(p, &iTable, &cached_db, 0);

	if (rc != SQLITE_OK)
		return rc;

	assert(cached_db != NULL && cached_db->dbp != NULL);

	/*
	 * The motivation here is that logging all of the contents of pages
	 * we want to clear is slow.  Instead, we can transactionally create
	 * a new, empty table, and rename the old one.  If this transaction
	 * goes on to commit, we can non-transactionally free the old pages
	 * at that point.
	 *
	 * Steps are:
	 *   1. do a transactional rename of the old table
	 *   2. do a transactional create of a new table with the same name
	 *   3. if/when this transaction commits, do a non-transactional
	 *      remove of the old table.
	 */
	if (pBt->dbStorage == DB_STORE_NAMED) {
		/* TODO: count the records */
		DB_BTREE_STAT *stat;

		if ((ret = cached_db->dbp->stat(cached_db->dbp,
		    pFamilyTxn, &stat, GET_BTREE_ISOLATION(p) &
		    ~DB_TXN_SNAPSHOT)) != 0)
			goto err;
		/* Record count reported to the caller on the fast path. */
		count = stat->bt_ndata;

		/*
		 * Try the fast path (minimal logging) approach to truncating
		 * for all but the smallest databases.
		 */
		tryfast =
		    (stat->bt_leaf_pg + stat->bt_dup_pg + stat->bt_over_pg) > 4;
		sqlite3_free(stat);
	}

	if (tryfast) {
		/* The handle is closed here, before the table is renamed. */
#ifndef BDBSQL_SINGLE_THREAD
		if (cached_db->dbp->app_private != NULL)
			sqlite3_free(cached_db->dbp->app_private);
#endif
		ret = cached_db->dbp->close(cached_db->dbp, DB_NOSYNC);
		cached_db->dbp = NULL;
		if (ret != 0)
			goto err;

		tableName = tableNameBuf;
		GET_TABLENAME(tableName, sizeof(tableNameBuf), iTable, "");
		oldTableName = oldTableNameBuf;
		GET_TABLENAME(oldTableName, sizeof(oldTableNameBuf), iTable,
		    "old-");

		FIX_TABLENAME(pBt, fileName, tableName);
		if ((ret = pDbEnv->dbrename(pDbEnv, pSavepointTxn,
		    fileName, tableName, oldTableName, DB_NOSYNC)) == 0) {
			/*
			 * Rename succeeded: no truncate is needed.  Queue the
			 * table id on p->deleted_tables with the current
			 * savepoint transaction.
			 */
			need_truncate = 0;
			dtable = (DELETED_TABLE *)sqlite3_malloc(
			    sizeof(DELETED_TABLE));
			if (dtable == NULL)
				return SQLITE_NOMEM;
			dtable->iTable = iTable;
			dtable->txn = pSavepointTxn;
#ifdef BDBSQL_FILE_PER_TABLE
			dtable->flag = DTF_DELETE;
#endif
			dtable->next = p->deleted_tables;
			p->deleted_tables = dtable;
		} else if (ret != EEXIST)
			goto err;
		/*
		 * On EEXIST the rename did not happen; fall through so the
		 * handle is re-established and the table truncated in place.
		 */

		sqlite3_mutex_enter(pBt->mutex);
		rc = btreeCreateDataTable(p, iTable, &cached_db);
		sqlite3_mutex_leave(pBt->mutex);
		if (rc != SQLITE_OK)
			goto err;
	}

	if (need_truncate) {
		assert(cached_db != NULL && cached_db->dbp != NULL);
		/* truncate() overwrites count with its own record count. */
		ret = cached_db->dbp->truncate(cached_db->dbp,
		    pSavepointTxn, &count, 0);
	}

	if (ret == 0 && pnChange != NULL)
		*pnChange += count;

err:	return MAP_ERR(rc, ret, p);
}
5332 
5333 /*
5334 ** Erase all information in a table and add the root of the table to the
5335 ** freelist.  Except, the root of the principle table (the one on page 1) is
5336 ** never added to the freelist.
5337 **
5338 ** This routine will fail with SQLITE_LOCKED if there are any open cursors on
5339 ** the table.
5340 */
sqlite3BtreeDropTable(Btree * p,int iTable,int * piMoved)5341 int sqlite3BtreeDropTable(Btree *p, int iTable, int *piMoved)
5342 {
5343 	char cached_db_key[CACHE_KEY_SIZE];
5344 	BtShared *pBt;
5345 	CACHED_DB *cached_db;
5346 	DB *dbp;
5347 	DELETED_TABLE *dtable;
5348 	char *fileName, *tableName, tableNameBuf[DBNAME_SIZE];
5349 	char *oldTableName, oldTableNameBuf[DBNAME_SIZE];
5350 	int need_remove, ret;
5351 	DBT key;
5352 	int skip_rename;
5353 
5354 	log_msg(LOG_VERBOSE, "sqlite3BtreeDropTable(%p, %u, %p)",
5355 	    p, iTable, piMoved);
5356 
5357 	skip_rename = 0;
5358 	pBt = p->pBt;
5359 	*piMoved = 0;
5360 	ret = 0;
5361 	need_remove = 1;
5362 
5363 	/* Close any cached handle */
5364 	sqlite3_snprintf(sizeof(cached_db_key), cached_db_key, "%x", iTable);
5365 	sqlite3_mutex_enter(pBt->mutex);
5366 	cached_db = sqlite3HashFind(&pBt->db_cache,
5367 	    cached_db_key, (int)strlen(cached_db_key));
5368 	if (cached_db != NULL && (dbp = cached_db->dbp) != NULL) {
5369 #ifndef BDBSQL_SINGLE_THREAD
5370 		if (dbp->app_private != NULL)
5371 			sqlite3_free(dbp->app_private);
5372 #endif
5373 		ret = dbp->close(dbp, DB_NOSYNC);
5374 		cached_db->dbp = NULL;
5375 		if (ret != 0)
5376 			goto err;
5377 	}
5378 	sqlite3HashInsert(
5379 	    &pBt->db_cache, cached_db_key, (int)strlen(cached_db_key), NULL);
5380 	sqlite3_mutex_leave(pBt->mutex);
5381 	sqlite3_free(cached_db);
5382 
5383 	if (pBt->dbStorage == DB_STORE_NAMED) {
5384 		tableName = tableNameBuf;
5385 		GET_TABLENAME(tableName, sizeof(tableNameBuf), iTable, "");
5386 		FIX_TABLENAME(pBt, fileName, tableName);
5387 
5388 		oldTableName = oldTableNameBuf;
5389 		GET_TABLENAME(oldTableName, sizeof(oldTableNameBuf), iTable,
5390 		    "old-");
5391 
5392 		memset(&key, 0, sizeof(key));
5393 		key.data = oldTableName;
5394 		key.size = (u_int32_t)strlen(oldTableName);
5395 		key.flags = DB_DBT_USERMEM;
5396 		/* If the renamed table already exists, we could be in one of
5397 		 * two possible situations:
5398 		 * 1) This is the second table within the same transaction
5399 		 *    that has the same table ID that has been dropped.
5400 		 * 2) There was a crash in the middle of
5401 		 *     sqlite3BtreeCommitPhaseTwo, meaning the dbrename was
5402 		 *     committed, but the dbremove was not completed.
5403 		 * In the first situation, we want the first table to be the
5404 		 * one that is in the deleted_tables list. In the second case,
5405 		 * it's safe to remove the old-* table before proceeding.
5406 		 *
5407 		 * TODO: If the error message Berkeley DB generates when
5408 		 *       renaming to a table that already exists is removed,
5409 		 *       We could remove this exists check, and move the logic
5410 		 *       below into an if (ret == EEXIST) clause.
5411 		 */
5412 		if (pTablesDb->exists(pTablesDb, pSavepointTxn, &key, 0) == 0) {
5413 			for (dtable = p->deleted_tables;
5414 			    dtable != NULL && iTable != dtable->iTable;
5415 			    dtable = dtable->next) {}
5416 			/* Case 2, remove the table. */
5417 			if (dtable == NULL) {
5418 				if ((ret = pDbEnv->dbremove(pDbEnv,
5419 				    pSavepointTxn, pBt->short_name,
5420 				    oldTableName, DB_NOSYNC)) != 0)
5421 					goto err;
5422 			} else
5423 				skip_rename = 1;
5424 		}
5425 
5426 		if (!skip_rename) {
5427 			ret = pDbEnv->dbrename(pDbEnv, pSavepointTxn, fileName,
5428 			    tableName, oldTableName, DB_NOSYNC);
5429 			if (ret != 0)
5430 				goto err;
5431 			need_remove = 0;
5432 			dtable = (DELETED_TABLE *)sqlite3_malloc(
5433 			    sizeof(DELETED_TABLE));
5434 			if (dtable == NULL)
5435 				return SQLITE_NOMEM;
5436 			dtable->iTable = iTable;
5437 			dtable->txn = pSavepointTxn;
5438 #ifdef BDBSQL_FILE_PER_TABLE
5439 			dtable->flag = DTF_DROP;
5440 #endif
5441 			dtable->next = p->deleted_tables;
5442 			p->deleted_tables = dtable;
5443 		}
5444 
5445 		if (need_remove) {
5446 			ret = pDbEnv->dbremove(pDbEnv, pSavepointTxn,
5447 			    fileName, tableName, DB_NOSYNC);
5448 			if (ret != 0)
5449 				goto err;
5450 #ifdef BDBSQL_FILE_PER_TABLE
5451 			memset(&key, 0, sizeof(key));
5452 			key.flags = DB_DBT_USERMEM;
5453 			key.data = tableName;
5454 			key.size = strlen(tableName);
5455 			ret = pTablesDb->del(pTablesDb, pSavepointTxn, &key, 0);
5456 #endif
5457 		}
5458 
5459 	} else if (pBt->dbStorage == DB_STORE_INMEM) {
5460 		/*
5461 		 * Add the in-memory tables into deleted_tables. Don't do the
5462 		 * remove now since the operation might be rollbacked.  The
5463 		 * deleted_tables will be removed when commit.
5464 		 *
5465 		 * We don't rename the in-memory db as above DB_STORE_NAMED
5466 		 * case because:
5467 		 * 1) In memory table names are always unique.
5468 		 * 2) Can not rename a in-memory db since dbrename can not
5469 		 *    accept DB_TXN_NOT_DURABLE.
5470 		 */
5471 		dtable = (DELETED_TABLE *)sqlite3_malloc(sizeof(DELETED_TABLE));
5472 		if (dtable == NULL)
5473 			return SQLITE_NOMEM;
5474 		dtable->iTable = iTable;
5475 		dtable->txn = pSavepointTxn;
5476 		dtable->next = p->deleted_tables;
5477 		p->deleted_tables = dtable;
5478 	}
5479 
5480 err:	return (ret == 0) ? SQLITE_OK : dberr2sqlitelocked(ret, p);
5481 }
5482 
5483 /*
5484 ** Read the meta-information out of a database file.  Meta[0] is the number
5485 ** of free pages currently in the database.  Meta[1] through meta[15] are
5486 ** available for use by higher layers.  Meta[0] is read-only, the others are
5487 ** read/write.
5488 **
5489 ** The schema layer numbers meta values differently.  At the schema layer (and
5490 ** the SetCookie and ReadCookie opcodes) the number of free pages is not
5491 ** visible.  So Cookie[0] is the same as Meta[1].
5492 */
sqlite3BtreeGetMeta(Btree * p,int idx,u32 * pMeta)5493 void sqlite3BtreeGetMeta(Btree *p, int idx, u32 *pMeta)
5494 {
5495 	BtShared *pBt;
5496 	int ret;
5497 	DBT key, data;
5498 	i64 metaKey, metaData;
5499 
5500 	log_msg(LOG_VERBOSE, "sqlite3BtreeGetMeta(%p, %u, %p)",
5501 	    p, idx, pMeta);
5502 
5503 	pBt = p->pBt;
5504 	assert(idx >= 0 && idx < NUMMETA);
5505 
5506 	/*
5507 	 * Under some (odd) circumstances SQLite expects a database to be
5508 	 * opened here: If it didn't exist when the connection was opened, but
5509 	 * was created by another connection since then. If we don't open the
5510 	 * table now, some virtual table operations fail - altermalloc.test
5511 	 * has such a scenario.
5512 	 */
5513 	if (!p->connected && pBt->dbStorage == DB_STORE_NAMED &&
5514 	    !pBt->database_existed && !__os_exists(NULL, pBt->full_name, 0)) {
5515 		btreeUpdateBtShared(p, 1);
5516 		pBt = p->pBt;
5517 		ret = btreeOpenEnvironment(p, 1);
5518 		/*
5519 		 * Ignore failures. There's not much else we can do. A failure
5520 		 * here will likely leave the connection in a bad state.
5521 		 * This path is tested by altermalloc.
5522 		 */
5523 	}
5524 	/* Once connected to a shared environment, don't trust the cache. */
5525 	if (idx > 0 && idx < NUMMETA && pBt->meta[idx].cached &&
5526 	    (!p->connected || pBt->dbStorage != DB_STORE_NAMED)) {
5527 		*pMeta = pBt->meta[idx].value;
5528 		return;
5529 	} else if (idx == 0 || !p->connected ||
5530 	    pBt->dbStorage != DB_STORE_NAMED) {
5531 		*pMeta = 0;
5532 		return;
5533 	}
5534 
5535 	assert(p->pBt->dbStorage == DB_STORE_NAMED);
5536 
5537 	memset(&key, 0, sizeof(key));
5538 	metaKey = idx;
5539 	key.data = &metaKey;
5540 	key.size = key.ulen = sizeof(metaKey);
5541 	key.flags = DB_DBT_USERMEM;
5542 	memset(&data, 0, sizeof(data));
5543 	data.data = &metaData;
5544 	data.size = data.ulen = sizeof(metaData);
5545 	data.flags = DB_DBT_USERMEM;
5546 
5547 	/*
5548 	 * Trigger a read-modify-write get from the metadata table to stop
5549 	 * other connections from being able to proceed while an exclusive
5550 	 * transaction is active.
5551 	 */
5552 	if ((ret = pMetaDb->get(pMetaDb, GET_META_TXN(p), &key, &data,
5553 	    GET_META_FLAGS(p))) == 0) {
5554 		assert(data.size == sizeof(i64));
5555 		*pMeta = (u32)(metaData);
5556 		if (idx < NUMMETA) {
5557 			pBt->meta[idx].value = *pMeta;
5558 			pBt->meta[idx].cached = 1;
5559 		}
5560 	} else if (ret == DB_NOTFOUND || ret == DB_KEYEMPTY) {
5561 		*pMeta = 0;
5562 		ret = 0;
5563 	} else if (ret == DB_LOCK_DEADLOCK || ret == DB_LOCK_NOTGRANTED) {
5564 		p->db->errCode = SQLITE_BUSY;
5565 		ret = 0;
5566 		*pMeta = 0;
5567 		sqlite3BtreeRollback(p);
5568        }
5569 
5570 	assert(ret == 0);
5571 }
5572 
5573 /*
5574 ** Write meta-information back into the database.  Meta[0] is read-only and
5575 ** may not be written.
5576 */
sqlite3BtreeUpdateMeta(Btree * p,int idx,u32 iMeta)5577 int sqlite3BtreeUpdateMeta(Btree *p, int idx, u32 iMeta)
5578 {
5579 	BtShared *pBt;
5580 	int rc, ret;
5581 	DBT key, data;
5582 	i64 metaKey, metaData;
5583 
5584 	log_msg(LOG_VERBOSE, "sqlite3BtreeUpdateMeta(%p, %u, %u)",
5585 	    p, idx, iMeta);
5586 
5587 	pBt = p->pBt;
5588 	if (IS_BTREE_READONLY(p))
5589 		return SQLITE_READONLY;
5590 
5591 	assert(idx > 0 && idx < NUMMETA);
5592 
5593 	sqlite3_mutex_enter(pBt->mutex);
5594 	pBt->meta[idx].value = iMeta;
5595 	pBt->meta[idx].cached = 1;
5596 
5597 #ifndef SQLITE_OMIT_AUTOVACUUM
5598 	if (idx == BTREE_INCR_VACUUM) {
5599 		assert(iMeta == 0 || iMeta == 1);
5600 		pBt->incrVacuum = (u8)iMeta;
5601 	}
5602 #endif
5603 	sqlite3_mutex_leave(pBt->mutex);
5604 
5605 	/* Skip the database update for private environments. */
5606 	if (pBt->dbStorage != DB_STORE_NAMED)
5607 		return SQLITE_OK;
5608 
5609 	if (!p->connected && (rc = btreeOpenEnvironment(p, 1)) != SQLITE_OK)
5610 		return rc;
5611 	/* OpenEnvironment might have changed the pBt, update it. */
5612 	pBt = p->pBt;
5613 
5614 	memset(&key, 0, sizeof(key));
5615 	metaKey = idx;
5616 	key.data = &metaKey;
5617 	key.size = key.ulen = sizeof(metaKey);
5618 	key.flags = DB_DBT_USERMEM;
5619 	memset(&data, 0, sizeof(data));
5620 	metaData = iMeta;
5621 	data.data = &metaData;
5622 	data.size = data.ulen = sizeof(metaData);
5623 	data.flags = DB_DBT_USERMEM;
5624 
5625 	ret = pMetaDb->put(pMetaDb, pSavepointTxn, &key, &data, 0);
5626 
5627 	return (ret == 0) ? SQLITE_OK : dberr2sqlite(ret, p);
5628 }
5629 
5630 #ifndef SQLITE_OMIT_BTREECOUNT
5631 /*
5632 ** The first argument, pCur, is a cursor opened on some b-tree. Count the
5633 ** number of entries in the b-tree and write the result to *pnEntry.
5634 **
5635 ** SQLITE_OK is returned if the operation is successfully executed.
5636 ** Otherwise, if an error is encountered (i.e. an IO error or database
5637 ** corruption) an SQLite error code is returned.
5638 */
sqlite3BtreeCount(BtCursor * pCur,i64 * pnEntry)5639 int sqlite3BtreeCount(BtCursor *pCur, i64 *pnEntry)
5640 {
5641 	Btree *p;
5642 	DB_BTREE_STAT *stat;
5643 	int ret;
5644 
5645 	if (pCur->eState == CURSOR_FAULT || pCur->cached_db->dbp == NULL)
5646 		return (pCur->error == 0 ? SQLITE_ERROR : pCur->error);
5647 
5648 	p = pCur->pBtree;
5649 
5650 	if ((ret = pBDb->stat(pBDb, pReadTxn ? pReadTxn : pFamilyTxn, &stat,
5651 	    GET_BTREE_ISOLATION(p) & ~DB_TXN_SNAPSHOT)) == 0) {
5652 		*pnEntry = stat->bt_ndata;
5653 		sqlite3_free(stat);
5654 	}
5655 
5656 	return (ret == 0) ? SQLITE_OK : dberr2sqlite(ret, p);
5657 }
5658 #endif
5659 
5660 /*
5661 ** This routine does a complete check of the given BTree file.  aRoot[] is
5662 ** an array of pages numbers were each page number is the root page of a table.
5663 ** nRoot is the number of entries in aRoot.
5664 **
5665 ** If everything checks out, this routine returns NULL.  If something is amiss,
5666 ** an error message is written into memory obtained from malloc() and a
5667 ** pointer to that error message is returned.  The calling function is
5668 ** responsible for freeing the error message when it is done.
5669 */
sqlite3BtreeIntegrityCheck(Btree * pBt,int * aRoot,int nRoot,int mxErr,int * pnErr)5670 char *sqlite3BtreeIntegrityCheck(
5671     Btree *pBt,	/* The btree to be checked */
5672     int *aRoot,	/* An array of root page numbers for individual trees */
5673     int nRoot,	/* Number of entries in aRoot[] */
5674     int mxErr,	/* Stop reporting errors after this many */
5675     int *pnErr)	/* Write number of errors seen to this variable */
5676 {
5677 	int ret;
5678 
5679 	log_msg(LOG_VERBOSE, "sqlite3BtreeIntegrityCheck(%p, %p, %u, %u, %p)",
5680 	    pBt, aRoot, nRoot, mxErr, pnErr);
5681 
5682 	ret = 0;
5683 	*pnErr = 0;
5684 #if 0
5685 	DB *db;
5686 	int i;
5687 	char *tableName, tableNameBuf[DBNAME_SIZE];
5688 	/*
5689 	 * XXX: Have to do this outside the environment, verify doesn't play
5690 	 * nice with locking.
5691 	 */
5692 	for (i = 0; i < nRoot && ret == 0; i++) {
5693 		tableName = tableNameBuf;
5694 		GET_TABLENAME(tableName, sizeof(tableNameBuf), aRoot[i], "");
5695 		if ((ret = db_create(&db, pDbEnv, 0)) == 0)
5696 			ret = db->verify(db, tableName,
5697 			    NULL, NULL, DB_NOORDERCHK);
5698 	}
5699 
5700 #endif
5701 	return (ret == 0) ? NULL : sqlite3_strdup(db_strerror(ret));
5702 }
5703 
5704 /*
5705 ** Return the full pathname of the underlying database file.
5706 */
sqlite3BtreeGetFilename(Btree * p)5707 const char *sqlite3BtreeGetFilename(Btree *p)
5708 {
5709 	log_msg(LOG_VERBOSE, "sqlite3BtreeGetFilename(%p) (%s)",
5710 	    p, p->pBt->full_name);
5711 
5712 	return (p->pBt->full_name != NULL) ? p->pBt->full_name : "";
5713 }
5714 
5715 /*
5716 ** Return non-zero if a transaction is active.
5717 */
sqlite3BtreeIsInTrans(Btree * p)5718 int sqlite3BtreeIsInTrans(Btree *p)
5719 {
5720 	return (p && p->inTrans == TRANS_WRITE);
5721 }
5722 
5723 /*
5724  * Berkeley DB always uses WAL, but the SQLite flag is disabled on Windows
5725  * Mobile (CE) because some of the SQLite WAL code doesn't build with the flag
5726  * enabled.
5727  */
5728 #ifndef SQLITE_OMIT_WAL
5729 /*
5730 ** Run a checkpoint on the Btree passed as the first argument.
5731 **
5732 ** Return SQLITE_LOCKED if this or any other connection has an open
5733 ** transaction on the shared-cache the argument Btree is connected to.
5734 **
5735 ** Parameter eMode is one of SQLITE_CHECKPOINT_PASSIVE, FULL or RESTART.
5736 */
sqlite3BtreeCheckpoint(Btree * p,int eMode,int * pnLog,int * pnCkpt)5737 int sqlite3BtreeCheckpoint(Btree *p, int eMode, int *pnLog, int *pnCkpt)
5738 {
5739 	BtShared *pBt;
5740 	int rc;
5741 
5742 	/*
5743 	 * TODO: Investigate eMode. In SQLite there are three possible modes
5744 	 * SQLITE_CHECKPOINT_PASSIVE - return instead of blocking on locks
5745 	 * SQLITE_CHECKPOINT_FULL - Wait to get an exclusive lock.
5746 	 * SQLITE_CHECKPOINT_RESTART - as for full, except force a new log file
5747 	 *
5748 	 * Berkeley DB checkpoints really work like FULL. It might be possible
5749 	 * to mimic PASSIVE (default in SQLite) with lock no-wait, but do we
5750 	 * care?
5751 	 */
5752 	rc = SQLITE_OK;
5753 	if (p != NULL) {
5754 		pBt = p->pBt;
5755 		if (p->inTrans != TRANS_NONE)
5756 			rc = SQLITE_LOCKED;
5757 		else
5758 			rc = sqlite3PagerCheckpoint((Pager *)p);
5759 	}
5760 	/*
5761 	 * The following two variables are used to return information via
5762 	 * the sqlite_wal_checkoint_v2 database. They don't map well onto
5763 	 * Berkeley DB, so return 0 for now.
5764 	 * pnLog: Size of WAL log in frames.
5765 	 * pnCkpt: Total number of frames checkpointed.
5766 	 */
5767 	if (pnLog != 0)
5768 		*pnLog = 0;
5769 	if (pnCkpt != 0)
5770 		*pnCkpt = 0;
5771 	return rc;
5772 }
5773 #endif
5774 
5775 /*
5776  * Determine whether or not a cursor has moved from the position it was last
5777  * placed at.
5778  */
sqlite3BtreeCursorHasMoved(BtCursor * pCur,int * pHasMoved)5779 int sqlite3BtreeCursorHasMoved(BtCursor *pCur, int *pHasMoved)
5780 {
5781 	int rc;
5782 
5783 	/* Set this here in case of error. */
5784 	*pHasMoved = 1;
5785 
5786 	/*
5787 	 * We only want to return an error if the cursor is faulted, not just
5788 	 * if it is not pointing at anything.
5789 	 */
5790 	if (pCur->eState != CURSOR_VALID && pCur->eState != CURSOR_INVALID &&
5791 	    (rc = btreeRestoreCursorPosition(pCur, 0)) != SQLITE_OK)
5792 		return rc;
5793 
5794 	if (pCur->eState == CURSOR_VALID && pCur->lastRes == 0)
5795 		*pHasMoved = 0;
5796 	return SQLITE_OK;
5797 }
5798 
5799 #ifndef NDEBUG
5800 /*
5801 ** Return true if the given BtCursor is valid.  A valid cursor is one that is
5802 ** currently pointing to a row in a (non-empty) table.
5803 **
5804 ** This is a verification routine, it is used only within assert() statements.
5805 */
sqlite3BtreeCursorIsValid(BtCursor * pCur)5806 int sqlite3BtreeCursorIsValid(BtCursor *pCur)
5807 {
5808 	return (pCur != NULL && pCur->eState == CURSOR_VALID);
5809 }
5810 #endif /* NDEBUG */
5811 
5812 /*****************************************************************
5813 ** Argument pCsr must be a cursor opened for writing on an INTKEY table
5814 ** currently pointing at a valid table entry. This function modifies the
5815 ** data stored as part of that entry. Only the data content may be modified,
5816 ** it is not possible to change the length of the data stored.
5817 */
sqlite3BtreePutData(BtCursor * pCur,u32 offset,u32 amt,void * z)5818 int sqlite3BtreePutData(BtCursor *pCur, u32 offset, u32 amt, void *z)
5819 {
5820 	DBT pdata;
5821 	int rc, ret;
5822 	log_msg(LOG_VERBOSE, "sqlite3BtreePutData(%p, %u, %u, %p)",
5823 	    pCur, offset, amt, z);
5824 
5825 	/*
5826 	 * Check that the cursor is open for writing and the cursor points at a
5827 	 * valid row of an intKey table.
5828 	 */
5829 	if (!pCur->wrFlag)
5830 		return SQLITE_READONLY;
5831 
5832 	UPDATE_DURING_BACKUP(pCur->pBtree)
5833 
5834 	if (pDbc == NULL &&
5835 	    (rc = btreeRestoreCursorPosition(pCur, 0)) != SQLITE_OK)
5836 		return rc;
5837 
5838 	if (pCur->eState != CURSOR_VALID)
5839 		return SQLITE_ABORT;
5840 
5841 	assert(!pCur->multiGetPtr);
5842 
5843 #ifndef SQLITE_OMIT_INCRBLOB
5844 	assert(pCur);
5845 	assert(pDbc);
5846 
5847 	rc = SQLITE_OK;
5848 	memcpy((u_int8_t *)pCur->data.data + offset, z, amt);
5849 
5850 	memset(&pdata, 0, sizeof(DBT));
5851 	pdata.data = (void *)z;
5852 	pdata.size = pdata.dlen = amt;
5853 	pdata.doff = offset;
5854 	pdata.flags |= DB_DBT_PARTIAL;
5855 
5856 	if ((rc = btreeTripWatchers(pCur, 1)) != SQLITE_OK)
5857 		return rc;
5858 
5859 	ret = pDbc->put(pDbc, &pCur->key, &pdata, DB_CURRENT);
5860 	if (ret != 0) {
5861 		HANDLE_INCRBLOB_DEADLOCK(ret, pCur)
5862 		rc = dberr2sqlitelocked(ret, pCur->pBtree);
5863 	}
5864 #endif
5865 	return rc;
5866 }
5867 
5868 /*****************************************************************
5869 ** Set a flag on this cursor to indicate that it is an incremental blob
5870 ** cursor.  Incrblob cursors are invalidated differently to ordinary cursors:
5871 ** if the value under an incrblob cursor is modified, attempts to access
5872 ** the cursor again will result in an error.
5873 */
sqlite3BtreeCacheOverflow(BtCursor * pCur)5874 void sqlite3BtreeCacheOverflow(BtCursor *pCur)
5875 {
5876 	Btree *p;
5877 
5878 	log_msg(LOG_VERBOSE, "sqlite3BtreeCacheOverflow(%p)", pCur);
5879 
5880 	pCur->isIncrblobHandle = 1;
5881 	p = pCur->pBtree;
5882 
5883 	/*
5884 	 * Give the transaction to the incrblob cursor, since it has to live
5885 	 * the lifetime of the cursor.
5886 	 */
5887 	if (p && p->connected && p->pBt->transactional && pCur->wrFlag) {
5888 		/* XXX error handling */
5889 		p->pBt->dbenv->txn_begin(p->pBt->dbenv, pSavepointTxn->parent,
5890 		    &pSavepointTxn, 0);
5891 	}
5892 }
5893 
5894 /*****************************************************************
5895 ** Return non-zero if a read (or write) transaction is active.
5896 */
sqlite3BtreeIsInReadTrans(Btree * p)5897 int sqlite3BtreeIsInReadTrans(Btree *p)
5898 {
5899 	log_msg(LOG_VERBOSE, "sqlite3BtreeIsInReadTrans(%p)", p);
5900 	return (p && p->inTrans != TRANS_NONE);
5901 }
5902 
5903 /***************************************************************************
5904 ** This routine sets the state to CURSOR_FAULT and the error code to errCode
5905 ** for every cursor on BtShared that pengine references.
5906 **
5907 ** Every cursor is tripped, including cursors that belong to other databases
5908 ** connections that happen to be sharing the cache with pengine.
5909 **
5910 ** This routine gets called when a rollback occurs. All cursors using the same
5911 ** cache must be tripped to prevent them from trying to use the engine after
5912 ** the rollback.  The rollback may have deleted tables or moved root pages, so
5913 ** it is not sufficient to save the state of the cursor. The cursor must be
5914 ** invalidated.
5915 */
sqlite3BtreeTripAllCursors(Btree * p,int errCode)5916 void sqlite3BtreeTripAllCursors(Btree*	p, int errCode)
5917 {
5918 	BtShared *pBt;
5919 	BtCursor *pCur;
5920 
5921 	log_msg(LOG_VERBOSE, "sqlite3BtreeTripAllCursors(%p, %u)", p, errCode);
5922 
5923 	pBt = p->pBt;
5924 
5925 	sqlite3_mutex_enter(pBt->mutex);
5926 	for (pCur = pBt->first_cursor; pCur != NULL; pCur = pCur->next) {
5927 		pCur->eState = CURSOR_FAULT;
5928 		pCur->error = errCode;
5929 	}
5930 	sqlite3_mutex_leave(pBt->mutex);
5931 }
5932 
/*
 * Acquire, change or drop this handle's schema lock.
 *
 * lockMode is the requested mode.  LOCKMODE_NONE releases any lock held;
 * otherwise a temporary cursor is opened on MASTER_ROOT (as a write cursor
 * only when lockMode is LOCKMODE_WRITE) and positioned on the last entry.
 * On success the temporary cursor's underlying handle is kept in
 * p->schemaLock and p->schemaLockMode records the mode; on failure the mode
 * is reset to LOCKMODE_NONE.  Any previously held p->schemaLock is closed in
 * either case.
 *
 * When the handle is not connected, no cursor is opened: the requested mode
 * is only recorded (if it is LOCKMODE_NONE or stronger than the current one)
 * and SQLITE_OK is returned.
 *
 * Returns SQLITE_OK or an SQLite error code.
 */
int btreeLockSchema(Btree *p, lock_mode_t lockMode)
{
	BtCursor *pCur, tmpCursor;
	BtShared *pBt;
	DBC *oldCur;
	int opened, rc, res, ret;

	pBt = p->pBt;
	/* pCur names the stack-allocated temporary cursor used below. */
	pCur = &tmpCursor;
	oldCur = NULL;
	opened = 0;
	rc = SQLITE_OK;

	/* Not connected: just remember what was asked for. */
	if (!p->connected) {
		if (lockMode == LOCKMODE_NONE || lockMode > p->schemaLockMode)
			p->schemaLockMode = lockMode;
		return SQLITE_OK;
	}

	/* Releasing: skip straight to closing the existing lock handle. */
	if (lockMode == LOCKMODE_NONE)
		goto done;

	sqlite3BtreeCursorZero(pCur);
	rc = sqlite3BtreeCursor(p, MASTER_ROOT,
	    lockMode == LOCKMODE_WRITE, NULL, pCur);
	/* Remember whether the cursor must be closed on the way out. */
	opened = (rc == SQLITE_OK);
	if (pCur->eState == CURSOR_FAULT)
		rc = pCur->error;

	/*
	 * Any repeatable operation would do: we get the last item just because
	 * it doesn't try to do a bulk get.
	 */
	if (rc == SQLITE_OK)
		rc = sqlite3BtreeLast(pCur, &res);

	/* Drop the old lock handle; a close error only matters if rc is OK. */
done:	if (p->schemaLock != NULL) {
		if ((ret = p->schemaLock->close(p->schemaLock)) != 0 &&
		    rc == SQLITE_OK)
			rc = dberr2sqlite(ret, p);
		p->schemaLock = NULL;
	}

	if (opened && rc == SQLITE_OK) {
		/*
		 * Take over the temporary cursor's handle and clear it on the
		 * cursor, so that closing the cursor below leaves it open.
		 * NOTE(review): pDbc is a macro defined elsewhere, presumably
		 * the Berkeley DB cursor of pCur -- confirm.
		 */
		p->schemaLockMode = lockMode;
		p->schemaLock = pDbc;
		pDbc = NULL;
	} else
		p->schemaLockMode = LOCKMODE_NONE;
	if (opened)
		(void)sqlite3BtreeCloseCursor(pCur);

	return rc;
}
5987 
5988 /*****************************************************************
5989 ** Obtain a lock on the table whose root page is iTab.  The lock is a write
5990 ** lock if isWritelock is true or a read lock if it is false.
5991 */
sqlite3BtreeLockTable(Btree * p,int iTable,u8 isWriteLock)5992 int sqlite3BtreeLockTable(Btree *p, int iTable, u8 isWriteLock)
5993 {
5994 	lock_mode_t lockMode;
5995 	int rc;
5996 
5997 	log_msg(LOG_VERBOSE, "sqlite3BtreeLockTable(%p, %u, %u)",
5998 	    p, iTable, isWriteLock);
5999 
6000 	lockMode = isWriteLock ? LOCKMODE_WRITE : LOCKMODE_READ;
6001 
6002 	if (iTable != MASTER_ROOT || !p->pBt->transactional ||
6003 	    p->schemaLockMode >= lockMode)
6004 		return SQLITE_OK;
6005 
6006 	rc = btreeLockSchema(p, lockMode);
6007 
6008 	if (!p->connected && rc != SQLITE_NOMEM) {
6009 		p->schemaLockMode = lockMode;
6010 		return SQLITE_OK;
6011 	}
6012 
6013 	if (rc == SQLITE_BUSY)
6014 		rc = SQLITE_LOCKED;
6015 
6016 	return rc;
6017 }
6018 
6019 /*****************************************************************
6020 ** Return true if another user of the same shared engine as the argument
6021 ** handle holds an exclusive lock on the sqlite_master table.
6022 */
sqlite3BtreeSchemaLocked(Btree * p)6023 int sqlite3BtreeSchemaLocked(Btree *p)
6024 {
6025 	BtCursor *pCur;
6026 	BtShared *pBt;
6027 
6028 	log_msg(LOG_VERBOSE, "sqlite3BtreeSchemaLocked(%p)", p);
6029 
6030 	pBt = p->pBt;
6031 
6032 	if (p->sharable) {
6033 		sqlite3_mutex_enter(pBt->mutex);
6034 		for (pCur = pBt->first_cursor;
6035 		    pCur != NULL;
6036 		    pCur = pCur->next) {
6037 			if (pCur->pBtree != p && pCur->pBtree->connected &&
6038 			    pCur->pBtree->schemaLockMode == LOCKMODE_WRITE) {
6039 				sqlite3_mutex_leave(pBt->mutex);
6040 				return SQLITE_LOCKED_SHAREDCACHE;
6041 			}
6042 		}
6043 		sqlite3_mutex_leave(pBt->mutex);
6044 	}
6045 
6046 	return SQLITE_OK;
6047 }
6048 
6049 /*****************************************************************
6050 ** No op.
6051 */
sqlite3BtreeSyncDisabled(Btree * p)6052 int sqlite3BtreeSyncDisabled(Btree *p)
6053 {
6054 	log_msg(LOG_VERBOSE, "sqlite3BtreeSyncDisabled(%p)", p);
6055 	return (0);
6056 }
6057 
6058 #if !defined(SQLITE_OMIT_PAGER_PRAGMAS) || !defined(SQLITE_OMIT_VACUUM)
6059 /*
6060 ** Change the default pages size and the number of reserved bytes per page.
6061 ** Or, if the page size has already been fixed, return SQLITE_READONLY
6062 ** without changing anything.
6063 **
6064 ** The page size must be a power of 2 between 512 and 65536.  If the page
6065 ** size supplied does not meet this constraint then the page size is not
6066 ** changed.
6067 **
6068 ** Page sizes are constrained to be a power of two so that the region of the
6069 ** database file used for locking (beginning at PENDING_BYTE, the first byte
6070 ** past the 1GB boundary, 0x40000000) needs to occur at the beginning of a page.
6071 **
6072 ** If parameter nReserve is less than zero, then the number of reserved bytes
6073 ** per page is left unchanged.
6074 **
6075 ** If the iFix!=0 then the pageSizeFixed flag is set so that the page size
6076 ** and autovacuum mode can no longer be changed.
6077 */
sqlite3BtreeSetPageSize(Btree * p,int pageSize,int nReserve,int iFix)6078 int sqlite3BtreeSetPageSize(Btree *p, int pageSize, int nReserve, int iFix)
6079 {
6080 	BtShared *pBt;
6081 
6082 	log_msg(LOG_VERBOSE, "sqlite3BtreeSetPageSize(%p, %u, %u)",
6083 	    p, pageSize, nReserve);
6084 
6085 	if (pageSize != 0 && (pageSize < 512 || pageSize > 65536 ||
6086 	    ((pageSize - 1) & pageSize) != 0))
6087 		return SQLITE_OK;
6088 
6089 	pBt = p->pBt;
6090 	if (pBt->pageSizeFixed)
6091 		return SQLITE_READONLY;
6092 
6093 	/* Can't set the page size once a table has been created. */
6094 	if (pMetaDb != NULL)
6095 		return SQLITE_OK;
6096 
6097 	pBt->pageSize = pageSize;
6098 	if (iFix)
6099 		pBt->pageSizeFixed = 1;
6100 
6101 	return SQLITE_OK;
6102 }
6103 
6104 /***************************************************************************
6105 ** Return the currently defined page size.
6106 */
sqlite3BtreeGetPageSize(Btree * p)6107 int sqlite3BtreeGetPageSize(Btree *p)
6108 {
6109 	BtShared *pBt;
6110 	u_int32_t pagesize;
6111 
6112 	log_msg(LOG_VERBOSE, "sqlite3BtreeGetPageSize(%p)", p);
6113 
6114 	pBt = p->pBt;
6115 	if (!p->connected && pBt->need_open)
6116 		btreeOpenEnvironment(p, 1);
6117 
6118 	if (pMetaDb != NULL &&
6119 	    pMetaDb->get_pagesize(pMetaDb, &pagesize) == 0)
6120 		return (int)pagesize;
6121 	if (pBt->pageSize == 0)
6122 		return SQLITE_DEFAULT_PAGE_SIZE;
6123 	return p->pBt->pageSize;
6124 }
6125 
6126 /***************************************************************************
6127 ** No op.
6128 */
sqlite3BtreeGetReserve(Btree * p)6129 int sqlite3BtreeGetReserve(Btree *p)
6130 {
6131 	log_msg(LOG_VERBOSE, "sqlite3BtreeGetReserve(%p)", p);
6132 	/* FIXME: Need to check how this is used by SQLite. */
6133 	return (0);
6134 }
6135 
sqlite3BtreeLastPage(Btree * p)6136 u32 sqlite3BtreeLastPage(Btree *p)
6137 {
6138 	log_msg(LOG_VERBOSE, "sqlite3BtreeLastPage(%p)", p);
6139 	/* FIXME: Is there a cheap way to do this? */
6140 	return (0);
6141 }
6142 
6143 /*
6144 ** Set both the "read version" (single byte at byte offset 18) and
6145 ** "write version" (single byte at byte offset 19) fields in the database
6146 ** header to iVersion.
6147 ** This function is only called by OP_JournalMode, when changing to or from
6148 ** WAL journaling. We are always WAL, so it's safe to return OK.
6149 */
sqlite3BtreeSetVersion(Btree * pBtree,int iVersion)6150 int sqlite3BtreeSetVersion(Btree *pBtree, int iVersion)
6151 {
6152 	pBtree = NULL;
6153 	iVersion = 0;
6154 	return (SQLITE_OK);
6155 }
6156 
6157 /***************************************************************************
6158 **
6159 ** Set the maximum page count for a database if mxPage is positive.
6160 ** No changes are made if mxPage is 0 or negative.
6161 ** Regardless of the value of mxPage, return the current maximum page count.
6162 **
6163 ** If mxPage <= minimum page count, set it to the minimum possible value.
6164 */
sqlite3BtreeMaxPageCount(Btree * p,int mxPage)6165 int sqlite3BtreeMaxPageCount(Btree *p, int mxPage)
6166 {
6167 	int defPgCnt, newPgCnt;
6168 	BtShared *pBt;
6169 	CACHED_DB *cached_db;
6170 	DB_MPOOLFILE *pMpf;
6171 	u_int32_t gBytes, bytes;
6172 	u_int32_t pgSize;
6173 	db_pgno_t minPgNo;
6174 	HashElem *e;
6175 
6176 	log_msg(LOG_VERBOSE, "sqlite3BtreeMaxPageCount(%p, %u)", p, mxPage);
6177 
6178 	pBt = p->pBt;
6179 	if (!pMetaDb) {
6180 		if (mxPage > 0)
6181 			pBt->pageCount = mxPage;
6182 		return pBt->pageCount;
6183 	}
6184 
6185 	pMpf = pMetaDb->get_mpf(pMetaDb);
6186 	assert(pMpf);
6187 	gBytes = bytes = pgSize = 0;
6188 
6189 	/* Get the current maximum page number. */
6190 	pMetaDb->get_pagesize(pMetaDb, &pgSize);
6191 	pMpf->get_maxsize(pMpf, &gBytes, &bytes);
6192 	defPgCnt = (int)(gBytes * (GIGABYTE / pgSize) + bytes / pgSize);
6193 
6194 	if (mxPage <= 0 || IS_BTREE_READONLY(p))
6195 		return defPgCnt;
6196 
6197 	/*
6198 	 * Retrieve the current last page number, so we can avoid setting a
6199 	 * value smaller than that.
6200 	 */
6201 	minPgNo = 0;
6202 	if (pMpf->get_last_pgno(pMpf, &minPgNo) != 0)
6203 		return defPgCnt;
6204 
6205 	/*
6206 	 * If sqlite3BtreeCreateTable has been called, but the table has not
6207 	 * yet been created, reserve an additional two pages for the table.
6208 	 * This is a bit of a hack, otherwise sqlite3BtreeCursor can return
6209 	 * SQLITE_FULL, which the VDBE code does not expect.
6210 	 */
6211 	for (e = sqliteHashFirst(&pBt->db_cache); e != NULL;
6212 	    e = sqliteHashNext(e)) {
6213 		cached_db = sqliteHashData(e);
6214 		if (cached_db == NULL)
6215 			continue;
6216 		if (cached_db->created == 0)
6217 			minPgNo += 2;
6218 	}
6219 	/*
6220 	 * If mxPage is less than the current last page, set the maximum
6221 	 * page number to the current last page number.
6222 	 */
6223 	newPgCnt = (mxPage < (int)minPgNo) ? (int)minPgNo : mxPage;
6224 
6225 	gBytes = (u_int32_t) (newPgCnt / (GIGABYTE / pgSize));
6226 	bytes = (u_int32_t) ((newPgCnt % (GIGABYTE / pgSize)) * pgSize);
6227 	if (pMpf->set_maxsize(pMpf, gBytes, bytes) != 0)
6228 		return defPgCnt;
6229 
6230 	return newPgCnt;
6231 }
6232 
6233 /*
6234 ** Set the secureDelete flag if newFlag is 0 or 1.  If newFlag is -1,
6235 ** then make no changes.  Always return the value of the secureDelete
6236 ** setting after the change.
6237 */
sqlite3BtreeSecureDelete(Btree * p,int newFlag)6238 int sqlite3BtreeSecureDelete(Btree *p, int newFlag)
6239 {
6240 	int oldFlag;
6241 
6242 	oldFlag = 0;
6243 	if (p != NULL) {
6244 		sqlite3_mutex_enter(p->pBt->mutex);
6245 		if (newFlag >= 0)
6246 			p->pBt->secureDelete = (newFlag != 0);
6247 		oldFlag = p->pBt->secureDelete;
6248 		sqlite3_mutex_leave(p->pBt->mutex);
6249 	}
6250 
6251 	return oldFlag;
6252 }
6253 #endif /* !defined(SQLITE_OMIT_PAGER_PRAGMAS) */
6254 
6255 /*****************************************************************
6256 ** Return the pathname of the journal file for this database. The return
6257 ** value of this routine is the same regardless of whether the journal file
6258 ** has been created or not.
6259 **
6260 ** The pager journal filename is invariant as long as the pager is open so
6261 ** it is safe to access without the BtShared mutex.
6262 */
sqlite3BtreeGetJournalname(Btree * p)6263 const char *sqlite3BtreeGetJournalname(Btree *p)
6264 {
6265 	BtShared *pBt;
6266 
6267 	log_msg(LOG_VERBOSE, "sqlite3BtreeGetJournalname(%p)", p);
6268 	pBt = p->pBt;
6269 	return (pBt->dir_name != 0 ? pBt->dir_name : "");
6270 }
6271 
6272 /*****************************************************************
6273 ** This function returns a pointer to a blob of memory associated with a
6274 ** single shared-engine. The memory is used by client code for its own
6275 ** purposes (for example, to store a high-level schema associated with the
6276 ** shared-engine). The engine layer manages reference counting issues.
6277 **
6278 ** The first time this is called on a shared-engine, nBytes bytes of memory
6279 ** are allocated, zeroed, and returned to the caller. For each subsequent call
6280 ** the nBytes parameter is ignored and a pointer to the same blob of memory
6281 ** returned.
6282 **
6283 ** Just before the shared-engine is closed, the function passed as the xFree
6284 ** argument when the memory allocation was made is invoked on the blob of
6285 ** allocated memory. This function should not call sqlite3_free() on the
6286 ** memory, the engine layer does that.
6287 */
sqlite3BtreeSchema(Btree * p,int nBytes,void (* xFree)(void *))6288 void *sqlite3BtreeSchema(Btree *p, int nBytes, void (*xFree)(void *))
6289 {
6290 	log_msg(LOG_VERBOSE, "sqlite3BtreeSchema(%p, %u, fn_ptr)", p, nBytes);
6291 	/* This was happening when an environment open failed in bigfile.
6292 	if (p == NULL || p->pBt == NULL)
6293 		return NULL;*/
6294 
6295 	if (p->schema == NULL && nBytes > 0) {
6296 		p->schema = sqlite3MallocZero(nBytes);
6297 		p->free_schema = xFree;
6298 	}
6299 	return (p->schema);
6300 }
6301 
btreeGetIndex(Btree * p,int iTable)6302 Index *btreeGetIndex(Btree *p, int iTable)
6303 {
6304 	sqlite3 *db = p->db;
6305 	HashElem *e;
6306 	Index *index;
6307 	Schema *pSchema;
6308 	int i;
6309 
6310 	index = NULL;
6311 
6312 	assert(sqlite3_mutex_held(db->mutex));
6313 	for (i = 0; i < db->nDb; i++) {
6314 		if (db->aDb[i].pBt != p)
6315 			continue;
6316 		pSchema = db->aDb[i].pSchema;
6317 		assert(pSchema);
6318 		for (e = sqliteHashFirst(&pSchema->idxHash); e != NULL;
6319 		    e = sqliteHashNext(e)) {
6320 			index = sqliteHashData(e);
6321 			if (index->tnum == iTable)
6322 				goto done;
6323 			index = NULL;
6324 		}
6325 	}
6326 done:	return index;
6327 }
6328 
/*
 * Build the KeyInfo describing the index whose root number is iTable.
 *
 * Only positive, even iTable values are treated as indexes; for any other
 * value *pKeyInfo is set to NULL and SQLITE_OK is returned.
 *
 * On success *pKeyInfo receives the structure returned by
 * sqlite3IndexKeyinfo(), with its encoding set to the encoding of the
 * connection.
 *
 * Returns SQLITE_OK on success, SQLITE_ERROR if no index with that root is
 * known to the connection, or SQLITE_NOMEM if the KeyInfo could not be
 * built.
 */
int btreeGetKeyInfo(Btree *p, int iTable, KeyInfo **pKeyInfo)
{
	Index *pIdx;
	Parse parse;

	*pKeyInfo = 0;

	/* Only indexes have a KeyInfo */
	if (iTable <= 0 || (iTable & 1) != 0)
		return SQLITE_OK;

	pIdx = btreeGetIndex(p, iTable);
	if (pIdx == NULL)
		return SQLITE_ERROR;

	/*
	 * Set up a dummy Parse structure.  sqlite3IndexKeyinfo could just
	 * take a sqlite3 struct instead of a Parse, but it is consistent with
	 * the other functions normally called during parsing.
	 *
	 * The whole structure is zeroed rather than relying on exactly which
	 * fields get read: an error reported through the Parse would
	 * otherwise operate on indeterminate fields.
	 * NOTE(review): sqlite3ErrorMsg appears to free and replace
	 * parse.zErrMsg -- confirm against the bundled SQLite version.
	 */
	memset(&parse, 0, sizeof(parse));
	parse.db = p->db;

	*pKeyInfo = sqlite3IndexKeyinfo(&parse, pIdx);

	/* Drop any message left on the dummy Parse; NULL is a no-op. */
	sqlite3DbFree(p->db, parse.zErrMsg);

	if (!*pKeyInfo)
		return SQLITE_NOMEM;
	(*pKeyInfo)->enc = ENC(p->db);

	return SQLITE_OK;
}
6358 
6359 #ifndef SQLITE_OMIT_AUTOVACUUM
/*
** Request an incremental vacuum.  The work itself is not done here: when
** auto-vacuum is enabled and the database uses named storage, the handle is
** only marked as needing a vacuum, which sqlite3BtreeCommitPhaseTwo then
** carries out.
**
** SQLITE_DONE is always returned so that OP_IncrVacuum ends immediately;
** the "N" of PRAGMA incremental_vacuum(N) is ignored.
*/
int sqlite3BtreeIncrVacuum(Btree *p)
{
	BtShared *pBt;

	assert(p && p->inTrans >= TRANS_READ);

	pBt = p->pBt;

	if (pBt->autoVacuum && pBt->dbStorage == DB_STORE_NAMED)
		p->needVacuum = 1;

	return SQLITE_DONE;
}
6379 #endif
6380 
/*
** Return the backup counter (nBackup) of the handle; callers treat a
** non-zero value as "a backup is in progress".
*/
int sqlite3BtreeIsInBackup(Btree *p)
{
	return (p->nBackup);
}
6385 
/*
** Return the auto-vacuum mode of the shared btree: BTREE_AUTOVACUUM_NONE,
** BTREE_AUTOVACUUM_FULL or BTREE_AUTOVACUUM_INCR.  When auto-vacuum support
** is compiled out the answer is always BTREE_AUTOVACUUM_NONE.
*/
int sqlite3BtreeGetAutoVacuum(Btree *p)
{
#ifdef SQLITE_OMIT_AUTOVACUUM
	return BTREE_AUTOVACUUM_NONE;
#else
	BtShared *pBt;
	int mode;

	pBt = p->pBt;

	/* Both flags are sampled together under the shared mutex. */
	sqlite3_mutex_enter(pBt->mutex);
	if (!pBt->autoVacuum)
		mode = BTREE_AUTOVACUUM_NONE;
	else if (pBt->incrVacuum)
		mode = BTREE_AUTOVACUUM_INCR;
	else
		mode = BTREE_AUTOVACUUM_FULL;
	sqlite3_mutex_leave(pBt->mutex);

	return mode;
#endif
}
6405 
/*
** Set the auto-vacuum mode of the shared btree.  Any non-zero autoVacuum
** enables auto-vacuum; the value 2 additionally selects incremental mode.
**
** If the incremental setting changed, the handle's incremental vacuum
** information is reset.  If the handle is not connected yet (and the shared
** btree has no results buffer), the environment is opened and the result of
** that open is returned.
**
** Returns SQLITE_OK or the error from btreeOpenEnvironment().  When
** auto-vacuum support is compiled out, SQLITE_READONLY is returned.
*/
int sqlite3BtreeSetAutoVacuum(Btree *p, int autoVacuum)
{
#ifdef SQLITE_OMIT_AUTOVACUUM
	return SQLITE_READONLY;
#else
	BtShared *pBt = p->pBt;
	int incrChanged;
	u8 savedIncrVacuum;

	/*
	 * Read the old setting and compare it with the new one while the
	 * mutex that guards both fields is held, so the comparison cannot
	 * observe another thread's update.
	 */
	sqlite3_mutex_enter(pBt->mutex);
	savedIncrVacuum = pBt->incrVacuum;
	/* Unlike SQLite, BDB allows setting vacuum at any time */
	pBt->autoVacuum = (autoVacuum != 0);
	pBt->incrVacuum = (autoVacuum == 2);
	incrChanged = (pBt->incrVacuum != savedIncrVacuum);
	sqlite3_mutex_leave(pBt->mutex);

	/* If setting is changed, we need to reset incrVacuum Info */
	if (incrChanged)
		btreeFreeVacuumInfo(p);

	if (!p->connected && !pBt->resultsBuffer)
		return btreeOpenEnvironment(p, 1);

	return SQLITE_OK;
#endif
}
6432 
sqlite3BtreeGetCachedRowid(BtCursor * pCur)6433 sqlite3_int64 sqlite3BtreeGetCachedRowid(BtCursor *pCur)
6434 {
6435 	return pCur->cachedRowid;
6436 }
6437 
/*
** Store iRowid as the cached rowid of every cursor on the shared btree that
** refers to the same cached database as pCur (pCur itself included).
*/
void sqlite3BtreeSetCachedRowid(BtCursor *pCur, sqlite3_int64 iRowid)
{
	BtShared *pBt;
	BtCursor *pC;

	pBt = pCur->pBtree->pBt;

	/* The cursor list is walked with the shared mutex held. */
	sqlite3_mutex_enter(pBt->mutex);
	pC = pBt->first_cursor;
	while (pC != NULL) {
		if (pC->cached_db == pCur->cached_db)
			pC->cachedRowid = iRowid;
		pC = pC->next;
	}
	sqlite3_mutex_leave(pBt->mutex);
}
6451 
/*
 * Release (commit) or roll back (abort) the savepoint numbered iSavepoint
 * and every savepoint nested inside it.
 *
 * op is SAVEPOINT_RELEASE or SAVEPOINT_ROLLBACK.  A negative iSavepoint
 * selects the main transaction, i.e. all savepoints.
 *
 * The Berkeley DB transaction matching the savepoint is located, the
 * handle's list of deleted tables is adjusted, the handle's transaction
 * bookkeeping is updated, all cursors under the transaction are closed and
 * the transaction is then committed (with DB_TXN_NOSYNC) or aborted.
 *
 * Returns SQLITE_OK when there is nothing to do or on success, SQLITE_ABORT
 * after rolling back a savepoint inside a bulk transaction, the error from
 * btreeCloseAllCursors() or btreeCleanupCachedHandles() if either fails, or
 * the translated Berkeley DB error if the commit/abort fails.
 */
int sqlite3BtreeSavepoint(Btree *p, int op, int iSavepoint)
{
	BtShared *pBt;
	DB_TXN *txn;
	DB_TXN *ttxn;
	DELETED_TABLE *dtable, *prev, *next;
#ifdef BDBSQL_SHARE_PRIVATE
	int isMain = 0;
#endif
	int rc, ret;

	log_msg(LOG_VERBOSE, "sqlite3BtreeSavepoint(%p,%d,%d)",
	    p, op, iSavepoint);

	/*
	 * If iSavepoint + 2 > p->nSavepoint and this is not a rollback,
	 * then the savepoint has been created, but sqlite3BtreeBeginStmt
	 * has not been called to create the actual child transaction. If
	 * this is a rollback and iSavepoint + 2 > p->nSavepoint, then
	 * the read transaction lost its locks due to deadlock in an
	 * update transaction and needs to be aborted.
	 */
	if (p && op == SAVEPOINT_ROLLBACK &&
	    (p->txn_bulk ||
	    (((iSavepoint + 2 > p->nSavepoint) || (p->inTrans == TRANS_READ)) &&
	    pReadTxn))) {
		/* Abort a read or bulk transaction, handled below. */
	} else if (!p ||
	    pSavepointTxn == NULL || iSavepoint + 2 > p->nSavepoint)
		return SQLITE_OK;

	pBt = p->pBt;

	/*
	 * Note that iSavepoint can be negative, meaning that all savepoints
	 * should be released or rolled back.
	 */
	if (iSavepoint < 0) {
		txn = pMainTxn;
#ifdef BDBSQL_SHARE_PRIVATE
		isMain = 1;
#endif
	} else if (op == SAVEPOINT_ROLLBACK &&
	    ((iSavepoint + 2 > p->nSavepoint) || p->inTrans == TRANS_READ)) {
		/* Rolling back the read transaction itself. */
		txn = pReadTxn;
		pReadTxn = NULL;
	} else {
		/*
		 * Walk up from the innermost savepoint transaction, dropping
		 * one savepoint level per step, until the requested level is
		 * reached or there is no parent left.
		 */
		txn = pSavepointTxn;
		while (--p->nSavepoint > iSavepoint + 1 && txn->parent != NULL)
			txn = txn->parent;
	}

	/*
	 * Adjust the deleted-table list for every transaction from the
	 * innermost savepoint up to and including txn.  On rollback, entries
	 * recorded against one of those transactions are freed and unlinked.
	 * On release, entries are handed to txn's parent.
	 * NOTE(review): on release the inner loop re-parents every remaining
	 * entry, not only those whose txn matches ttxn -- confirm that this
	 * is intended.
	 */
	if (p->deleted_tables != NULL && p->inTrans == TRANS_WRITE) {
		for (ttxn = pSavepointTxn;
		    ttxn != txn->parent;
		    ttxn = ttxn->parent) {
			prev = NULL;
			for (dtable = p->deleted_tables;
			    dtable != NULL;
			    dtable = next) {
				next = dtable->next;
				if (dtable->txn == ttxn &&
				    op == SAVEPOINT_ROLLBACK) {
					sqlite3_free(dtable);
					if (prev)
						prev->next = next;
					else
						p->deleted_tables = next;
				} else {
					prev = dtable;
					if (op == SAVEPOINT_RELEASE)
						dtable->txn = txn->parent;
				}
			}
		}
	}

	/*
	 * Update the handle before the transaction is resolved: a
	 * transaction without a parent ends the whole transaction state,
	 * otherwise the savepoint stack is popped to txn's parent.
	 */
	if (txn->parent == NULL) {
		assert(iSavepoint < 0 || p->txn_bulk);
		pMainTxn = pReadTxn = pSavepointTxn = NULL;
		p->nSavepoint = 0;
		p->inTrans = TRANS_NONE;
		p->txn_excl = 0;
	 /* pReadTxn is only NULL if the read txn is being aborted */
	} else if (p->inTrans == TRANS_WRITE && pReadTxn)
		pSavepointTxn = txn->parent;

	/*
	 * NOTE(review): returning here leaves txn neither committed nor
	 * aborted although the bookkeeping above has already been updated.
	 */
	rc = btreeCloseAllCursors(p, txn);
	if (rc != SQLITE_OK)
		return rc;

	ret = (op == SAVEPOINT_RELEASE) ?
	    txn->commit(txn, DB_TXN_NOSYNC) : txn->abort(txn);
#ifdef BDBSQL_SHARE_PRIVATE
	/* The file lock is dropped whether or not the commit/abort worked. */
	if (isMain && pBt->dbStorage == DB_STORE_NAMED)
		btreeFileUnlock(p);
#endif
	if (ret != 0)
		goto err;

	if (op == SAVEPOINT_ROLLBACK &&
	    (rc = btreeCleanupCachedHandles(p, CLEANUP_ABORT)) != SQLITE_OK)
		return rc;

	/* Rolling back a savepoint inside a bulk transaction is an abort. */
	if (op == SAVEPOINT_ROLLBACK && p->txn_bulk && iSavepoint >= 0)
		return SQLITE_ABORT;

err:	return (ret == 0) ? SQLITE_OK : dberr2sqlite(ret, p);
}
6561 
6562 /* Stub out enough to make sqlite3_file_control fail gracefully. */
sqlite3BtreePager(Btree * p)6563 Pager *sqlite3BtreePager(Btree *p)
6564 {
6565 	return (Pager *)p;
6566 }
6567 
6568 #ifndef SQLITE_OMIT_SHARED_CACHE
6569 /*
6570 ** Enable or disable the shared pager and schema features.
6571 **
6572 ** This routine has no effect on existing database connections.
6573 ** The shared cache setting effects only future calls to
6574 ** sqlite3_open(), sqlite3_open16(), or sqlite3_open_v2().
6575 */
sqlite3_enable_shared_cache(int enable)6576 int sqlite3_enable_shared_cache(int enable)
6577 {
6578 	sqlite3GlobalConfig.sharedCacheEnabled = enable;
6579 	return SQLITE_OK;
6580 }
6581 #endif
6582 
6583 /*
6584  * Returns the Berkeley DB* struct for the user created
6585  * table with the given iTable value.
6586  */
int btreeGetUserTable(Btree *p, DB_TXN *pTxn, DB **pDb, int iTable)
{
	char *fileName, *tableName, tableNameBuf[DBNAME_SIZE];
	int ret, rc;
	BtShared *pBt;
	DB *dbp;
	KeyInfo *keyInfo;
	void *app;

	rc = SQLITE_OK;
	ret = 0;
	pBt = p->pBt;
	/*
	 * A fresh handle is always created below, so the value the caller
	 * left in *pDb is never consulted.
	 */
	dbp = NULL;
	keyInfo = NULL;
	/* Is the metadata table: there is no user table to open. */
	if (iTable < 1) {
		*pDb = NULL;
		return SQLITE_OK;
	}

	/* Build the name of the table that belongs to iTable. */
	tableName = tableNameBuf;
	GET_TABLENAME(tableName, sizeof(tableNameBuf), iTable, "");
	FIX_TABLENAME(pBt, fileName, tableName);

	/* Open a DB handle on that table. */
	if ((ret = db_create(&dbp, pDbEnv, 0)) != 0)
		return dberr2sqlite(ret, p);

	if (!GET_DURABLE(pBt) &&
	    (ret = dbp->set_flags(dbp, DB_TXN_NOT_DURABLE)) != 0)
		goto err;
	if (pBt->encrypted && (ret = dbp->set_flags(dbp, DB_ENCRYPT)) != 0)
		goto err;

	if (!(iTable & 1)) {
		/* Even iTable: compare keys with the index's KeyInfo. */
		if ((rc = btreeGetKeyInfo(p, iTable, &keyInfo)) != SQLITE_OK)
			goto err;

		if (keyInfo) {
			/*
			 * The handle keeps the KeyInfo in app_private; it is
			 * released on the error path below.
			 */
			dbp->app_private = keyInfo;
			dbp->set_bt_compare(dbp, btreeCompareKeyInfo);
		}
	} else
		dbp->set_bt_compare(dbp, btreeCompareIntKey);

	tableName = tableNameBuf;
	FIX_TABLENAME(pBt, fileName, tableName);
	/*
	 * The auto-commit flag belongs in the flags argument of open().
	 * It used to be OR-ed into open()'s return value, which turned any
	 * non-zero GET_AUTO_COMMIT() into a bogus failure code.
	 */
	if ((ret = dbp->open(dbp, pTxn, fileName, tableName, DB_BTREE,
	    (pBt->db_oflags & ~DB_CREATE) | GET_ENV_READONLY(pBt) |
	    GET_AUTO_COMMIT(pBt, pTxn), 0)) != 0)
		goto err;

	*pDb = dbp;
	return rc;

err:	/* Detach the KeyInfo first so it can be freed after the close. */
	app = dbp->app_private;
	dbp->app_private = NULL;
	dbp->close(dbp, 0);
	if (app)
		sqlite3DbFree(p->db, app);
	return MAP_ERR(rc, ret, p);
}
6650 
/*
 * Gets a list of all the iTable values of the tables in the given database,
 * and allocates and sets that list into iTables.  The caller must free iTables
 * using sqlite3_free().
 * iTables - Contains the list of iTable values for all tables in the
 * database.  A value of -1 marks the end of the list.  The caller must use
 * sqlite3_free() to deallocate the list.
 */
int btreeGetTables(Btree *p, int **iTables, DB_TXN *txn)
{
	DB *dbp;
	DBC *dbc;
	DB_BTREE_STAT *stats;
	DBT key, data;
	Mem iTable;
	int current, entries, i, rc, ret;
	int *tables, *ptr;
	u32 hdrSize, type;
	unsigned char *endHdr, *record, *ptr2;

	memset(&key, 0, sizeof(key));
	memset(&data, 0, sizeof(data));
	ret = 0;
	dbp = NULL;
	dbc = NULL;
	tables = ptr = NULL;

	/* Get the sqlite master db handle and count the entries in it. */
	if ((rc = btreeGetUserTable(p, txn, &dbp, MASTER_ROOT)) != SQLITE_OK)
		goto err;
	assert(dbp != NULL);

	if ((ret = dbp->stat(dbp, txn, &stats, 0)) != 0)
		goto err;

	entries = stats->bt_nkeys;
#ifdef BDBSQL_OMIT_LEAKCHECK
	free(stats);
#else
	sqlite3_free(stats);
#endif

	/*
	 * Add room for the sqlite master and a value of -1 to
	 * mark the end of the table.  The sqlite master may include
	 * views, which will not be recorded in the tables entry.
	 */
	entries += 2;
	/* Size by element (int), not by the pointer type of "tables". */
	tables = sqlite3Malloc(entries * sizeof(*tables));
	if (!tables) {
		rc = SQLITE_NOMEM;
		goto err;
	}
	/* ptr keeps the start of the array; tables is the write position. */
	ptr = tables;
	/* Sqlite master table. */
	tables[0] = MASTER_ROOT;
	tables++;

	/* Read each iTable value from the sqlite master */
	if ((ret = dbp->cursor(dbp, txn, &dbc, 0)) != 0)
		goto err;
	current = 0;
	while ((ret = dbc->get(dbc, &key, &data, DB_NEXT)) == 0) {
		/* The iTable value is the 4th entry in the record. */
		memset(&iTable, 0, sizeof(iTable));
		record = (unsigned char *)data.data;
		/* Skip the header-size varint, whatever its length. */
		ptr2 = record + getVarint32(record, hdrSize);
		endHdr = record + hdrSize;
		record = endHdr;
		/* Step over the first three columns' types and payloads. */
		for (i = 0; i < 3; i++) {
			assert(ptr2 < endHdr);
			ptr2 += getVarint32(ptr2, type);
			record += sqlite3VdbeSerialTypeLen(type);
		}
		assert(ptr2 < endHdr);
		ptr2 += getVarint32(ptr2, type);
		sqlite3VdbeSerialGet(record, type, &iTable);
		assert(iTable.flags & MEM_Int);
		/* Do not count views and triggers. */
		if (iTable.u.i > 0) {
			/*
			 * Slot 0 holds the master root and one slot must stay
			 * free for the -1 terminator, so refuse to write past
			 * the space sized from the key count above.
			 */
			if (current + 2 >= entries) {
				rc = SQLITE_ERROR;
				goto err;
			}
			tables[0] = (int)iTable.u.i;
			tables++;
			current++;
		}
	}
	if (ret != DB_NOTFOUND)
		goto err;
	else
		ret = 0;

	/* Mark the end of the list. */
	tables[0] = -1;
	*iTables = ptr;

	/* The list is released here only when an error is being returned. */
err:	if ((ret != 0 || rc != SQLITE_OK) && ptr)
		sqlite3_free(ptr);
	if (dbc)
		dbc->close(dbc);
	if (dbp) {
		void *app = dbp->app_private;
		dbp->close(dbp, DB_NOSYNC);
		if (app)
			sqlite3DbFree(p->db, app);
	}
	return MAP_ERR(rc, ret, p);
}
6760 
/*
 * Gets the number of pages in all user tables in the database.
 * p - Btree of the database.
 * tables - A list of the iTable values of all tables in the database is
 *          allocated and returned in this variable, the caller must use
 *          sqlite3_free() to free the memory when done.
 * pageCount - Is set to the number of pages in the database.
 * txn - Transaction passed to btreeGetTables() and used as the parent of the
 *       child transaction in which the per-table statistics are read.
 */
int btreeGetPageCount(Btree *p, int **tables, u32 *pageCount, DB_TXN *txn)
{
	BtShared *pBt;
	DB *dbp;
	DB_BTREE_STAT *stats;
	DB_TXN *txnChild;
	int abortRet, idx, rc, ret;
	void *app;

	pBt = p->pBt;
	rc = SQLITE_OK;
	ret = abortRet = 0;
	dbp = NULL;
	txnChild = NULL;
	*pageCount = 0;

	/* Collect the iTable value of every table in the database. */
	rc = btreeGetTables(p, tables, txn);
	if (rc != SQLITE_OK)
		goto err;

	/*
	 * The table reads run in a child transaction so that the work can be
	 * discarded afterwards without committing or aborting txn itself.
	 */
	if ((ret = pDbEnv->txn_begin(pDbEnv,
	    txn, &txnChild, DB_TXN_NOSYNC)) != 0)
		goto err;

	/* Sum the fast-stat page count of each table; -1 ends the list. */
	for (idx = 0; (*tables)[idx] > -1; idx++) {
		rc = btreeGetUserTable(p, txnChild, &dbp, (*tables)[idx]);
		if (rc != SQLITE_OK)
			goto err;
		assert(dbp);

		if ((ret = dbp->stat(dbp,
		    txnChild, (void *)&stats, DB_FAST_STAT)) != 0)
			goto err;

		*pageCount += stats->bt_pagecnt;

		/* Close the handle, then free what app_private pointed at. */
		app = dbp->app_private;
		dbp->close(dbp, DB_NOSYNC);
		if (app)
			sqlite3DbFree(p->db, app);
		dbp = 0;
#ifdef BDBSQL_OMIT_LEAKCHECK
		free(stats);
#else
		sqlite3_free(stats);
#endif
	}

err:	/* A handle is still open here only when the loop bailed out. */
	if (dbp) {
		app = dbp->app_private;
		dbp->close(dbp, DB_NOSYNC);
		if (app)
			sqlite3DbFree(p->db, app);
	}

	/* Was only used for reading, so safe to abort. */
	if (txnChild) {
		abortRet = txnChild->abort(txnChild);
		if (abortRet != 0 && ret == 0)
			ret = abortRet;
	}

	return MAP_ERR(rc, ret, p);
}
6848 
6849 /*
6850  * This pair of functions manages the handle lock held by Berkeley DB for
6851  * database (DB) handles. Berkeley DB holds those locks so that a remove can't
6852  * succeed while a handle is still open. The SQL API needs that remove to
6853  * succeed if the handle is "just cached" - that is not actively in use.
6854  * Consequently we reach into the DB handle and unlock the handle_lock when the
6855  * handle is only being held cached.
6856  * We re-get the lock when the handle is accessed again. A handle shouldn't be
6857  * accessed after a remove, but we'll be a bit paranoid and do checks for that
6858  * situation anyway.
6859  */
static int btreeDbHandleLock(Btree *p, CACHED_DB *cached_db)
{
	BtShared *pBt;
	DB *dbp;
	DBT fileobj;
	DB_LOCK_ILOCK lock_desc;
	int ret;

	pBt = p->pBt;
	ret = 0;
	dbp = cached_db->dbp;

	/*
	 * Without a handle or a locker there is nothing to lock.  This test
	 * has to come before anything below reads through dbp.
	 */
	if (dbp == NULL || dbp->locker == NULL)
		return (0);

	if (btreeDbHandleIsLocked(cached_db))
		return (0);

	/* Ensure we're going to ask for a reasonable lock. */
	if (cached_db->lock_mode == DB_LOCK_NG)
		return (0);

	/* Describe the handle lock: file id, meta page and lock type. */
	memcpy(lock_desc.fileid, dbp->fileid, DB_FILE_ID_LEN);
	lock_desc.pgno = dbp->meta_pgno;
	lock_desc.type = DB_HANDLE_LOCK;

	memset(&fileobj, 0, sizeof(fileobj));
	fileobj.data = &lock_desc;
	fileobj.size = sizeof(lock_desc);

	ret = pDbEnv->lock_get(pDbEnv,
	    ((DB_SQL_LOCKER*)dbp->locker)->id, 0, &fileobj,
	    cached_db->lock_mode, &(dbp->handle_lock));
	/* Avoid getting the lock again, until it's been dropped. */
	cached_db->lock_mode = DB_LOCK_NG;

	return (ret);
}
6897 
/*
 * btreeDbHandleUnlock --
 *	Drop the handle lock of a cached DB handle if it currently holds one,
 *	saving the lock's mode in cached_db->lock_mode first.  Returns 0 when
 *	no lock was held, otherwise the result of lock_put().
 */
static int btreeDbHandleUnlock(Btree *p, CACHED_DB *cached_db)
{
	BtShared *pBt;
	DB *handle;
	int ret;

	pBt = p->pBt;
	handle = cached_db->dbp;
	ret = 0;

	if (btreeDbHandleIsLocked(cached_db)) {
		cached_db->lock_mode = handle->handle_lock.mode;
		ret = pDbEnv->lock_put(pDbEnv, &handle->handle_lock);
	}
	return (ret);
}
6909 
/*
 * btreeDbHandleIsLocked --
 *	Returns 1 when the handle_lock of the cached DB handle has an offset
 *	other than LOCK_INVALID, and 0 otherwise.
 */
static int btreeDbHandleIsLocked(CACHED_DB *cached_db)
{
#define	LOCK_INVALID 0
	DB *handle;

	handle = cached_db->dbp;
	if (handle->handle_lock.off == LOCK_INVALID)
		return (0);
	return (1);
}
6915 
6916 /*
6917  * Integer compression
6918  *
6919  *  First byte | Next | Maximum
6920  *  byte       | bytes| value
6921  * ------------+------+---------------------------------------------------------
6922  * [0 xxxxxxx] | 0    | 2^7 - 1
6923  * [10 xxxxxx] | 1    | 2^14 + 2^7 - 1
6924  * [110 xxxxx] | 2    | 2^21 + 2^14 + 2^7 - 1
6925  * [1110 xxxx] | 3    | 2^28 + 2^21 + 2^14 + 2^7 - 1
6926  * [11110 xxx] | 4    | 2^35 + 2^28 + 2^21 + 2^14 + 2^7 - 1
6927  * [11111 000] | 5    | 2^40 + 2^35 + 2^28 + 2^21 + 2^14 + 2^7 - 1
6928  * [11111 001] | 6    | 2^48 + 2^40 + 2^35 + 2^28 + 2^21 + 2^14 + 2^7 - 1
6929  * [11111 010] | 7    | 2^56 + 2^48 + 2^40 + 2^35 + 2^28 + 2^21 + 2^14 + 2^7 - 1
6930  * [11111 011] | 8    | 2^64 + 2^56 + 2^48 + 2^40 + 2^35 + 2^28 + 2^21 + 2^14 +
6931  *	       |      |	2^7 - 1
6932  *
6933  * NOTE: this compression algorithm depends
6934  * on big-endian order, so swap if necessary.
6935  */
/* Byte-order query; only declared here, the definition is not in view. */
extern int __db_isbigendian(void);

/* Largest value that fits in an encoding of the given total length. */
#define	CMP_INT_1BYTE_MAX 0x7F
#define	CMP_INT_2BYTE_MAX 0x407F
#define	CMP_INT_3BYTE_MAX 0x20407F
#define	CMP_INT_4BYTE_MAX 0x1020407F

/*
 * The 64-bit limits are spelled with the i64 suffix when _MSC_VER is defined
 * and below 1300, and with the LL suffix everywhere else.
 */
#if defined(_MSC_VER) && _MSC_VER < 1300
#define	CMP_INT_5BYTE_MAX 0x081020407Fi64
#define	CMP_INT_6BYTE_MAX 0x01081020407Fi64
#define	CMP_INT_7BYTE_MAX 0x0101081020407Fi64
#define	CMP_INT_8BYTE_MAX 0x010101081020407Fi64
#else
#define	CMP_INT_5BYTE_MAX 0x081020407FLL
#define	CMP_INT_6BYTE_MAX 0x01081020407FLL
#define	CMP_INT_7BYTE_MAX 0x0101081020407FLL
#define	CMP_INT_8BYTE_MAX 0x010101081020407FLL
#endif

/* Marker bits placed in the first byte of an encoding of each length. */
#define	CMP_INT_2BYTE_VAL 0x80
#define	CMP_INT_3BYTE_VAL 0xC0
#define	CMP_INT_4BYTE_VAL 0xE0
#define	CMP_INT_5BYTE_VAL 0xF0
#define	CMP_INT_6BYTE_VAL 0xF8
#define	CMP_INT_7BYTE_VAL 0xF9
#define	CMP_INT_8BYTE_VAL 0xFA
#define	CMP_INT_9BYTE_VAL 0xFB

/* Bits of the first byte that carry value data in 2- to 5-byte encodings. */
#define	CMP_INT_2BYTE_MASK 0x3F
#define	CMP_INT_3BYTE_MASK 0x1F
#define	CMP_INT_4BYTE_MASK 0x0F
#define	CMP_INT_5BYTE_MASK 0x07

/*
 * Total encoded length in bytes, indexed by the first byte of an encoding.
 * The last four entries (first bytes 0xFC-0xFF) hold 0xFF rather than a
 * length; none of the marker values above produces such a first byte.
 */
static const u_int8_t __dbsql_marshaled_int_size[] = {
	/* 0x00-0x7F: the byte is the whole value */
	0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
	0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
	0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
	0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
	0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
	0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
	0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
	0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
	0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
	0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
	0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
	0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
	0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
	0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
	0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,
	0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01, 0x01,

	/* 0x80-0xBF: two bytes */
	0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
	0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
	0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
	0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
	0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
	0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
	0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,
	0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02, 0x02,

	/* 0xC0-0xDF: three bytes */
	0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
	0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
	0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,
	0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03, 0x03,

	/* 0xE0-0xEF: four bytes */
	0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04,
	0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04, 0x04,

	/* 0xF0-0xF7: five bytes; 0xF8-0xFB: six to nine; 0xFC-0xFF: 0xFF */
	0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05, 0x05,
	0x06, 0x07, 0x08, 0x09, 0xFF, 0xFF, 0xFF, 0xFF
};
7007 
/*
 * btreeCompressInt --
 *	Compresses the integer into the buffer, returning the number of
 *	bytes occupied.
 *
 * Derived from __db_compress_int and produces the same encoding.
 */
btreeCompressInt(u_int8_t * buf,u_int64_t i)7015 static int btreeCompressInt(u_int8_t *buf, u_int64_t i)
7016 {
7017 	if (i <= CMP_INT_1BYTE_MAX) {
7018 		/* no swapping for one byte value */
7019 		buf[0] = (u_int8_t)i;
7020 		return 1;
7021 	} else {
7022 		u_int8_t *p = (u_int8_t*)&i;
7023 		if (i <= CMP_INT_2BYTE_MAX) {
7024 			i -= CMP_INT_1BYTE_MAX + 1;
7025 			if (__db_isbigendian() != 0) {
7026 				buf[0] = p[6] | CMP_INT_2BYTE_VAL;
7027 				buf[1] = p[7];
7028 			} else {
7029 				buf[0] = p[1] | CMP_INT_2BYTE_VAL;
7030 				buf[1] = p[0];
7031 			}
7032 			return 2;
7033 		} else if (i <= CMP_INT_3BYTE_MAX) {
7034 			i -= CMP_INT_2BYTE_MAX + 1;
7035 			if (__db_isbigendian() != 0) {
7036 				buf[0] = p[5] | CMP_INT_3BYTE_VAL;
7037 				buf[1] = p[6];
7038 				buf[2] = p[7];
7039 			} else {
7040 				buf[0] = p[2] | CMP_INT_3BYTE_VAL;
7041 				buf[1] = p[1];
7042 				buf[2] = p[0];
7043 			}
7044 			return 3;
7045 		} else if (i <= CMP_INT_4BYTE_MAX) {
7046 			i -= CMP_INT_3BYTE_MAX + 1;
7047 			if (__db_isbigendian() != 0) {
7048 				buf[0] = p[4] | CMP_INT_4BYTE_VAL;
7049 				buf[1] = p[5];
7050 				buf[2] = p[6];
7051 				buf[3] = p[7];
7052 			} else {
7053 				buf[0] = p[3] | CMP_INT_4BYTE_VAL;
7054 				buf[1] = p[2];
7055 				buf[2] = p[1];
7056 				buf[3] = p[0];
7057 			}
7058 			return 4;
7059 		} else if (i <= CMP_INT_5BYTE_MAX) {
7060 			i -= CMP_INT_4BYTE_MAX + 1;
7061 			if (__db_isbigendian() != 0) {
7062 				buf[0] = p[3] | CMP_INT_5BYTE_VAL;
7063 				buf[1] = p[4];
7064 				buf[2] = p[5];
7065 				buf[3] = p[6];
7066 				buf[4] = p[7];
7067 			} else {
7068 				buf[0] = p[4] | CMP_INT_5BYTE_VAL;
7069 				buf[1] = p[3];
7070 				buf[2] = p[2];
7071 				buf[3] = p[1];
7072 				buf[4] = p[0];
7073 			}
7074 			return 5;
7075 		} else if (i <= CMP_INT_6BYTE_MAX) {
7076 			i -= CMP_INT_5BYTE_MAX + 1;
7077 			if (__db_isbigendian() != 0) {
7078 				buf[0] = CMP_INT_6BYTE_VAL;
7079 				buf[1] = p[3];
7080 				buf[2] = p[4];
7081 				buf[3] = p[5];
7082 				buf[4] = p[6];
7083 				buf[5] = p[7];
7084 			} else {
7085 				buf[0] = CMP_INT_6BYTE_VAL;
7086 				buf[1] = p[4];
7087 				buf[2] = p[3];
7088 				buf[3] = p[2];
7089 				buf[4] = p[1];
7090 				buf[5] = p[0];
7091 			}
7092 			return 6;
7093 		} else if (i <= CMP_INT_7BYTE_MAX) {
7094 			i -= CMP_INT_6BYTE_MAX + 1;
7095 			if (__db_isbigendian() != 0) {
7096 				buf[0] = CMP_INT_7BYTE_VAL;
7097 				buf[1] = p[2];
7098 				buf[2] = p[3];
7099 				buf[3] = p[4];
7100 				buf[4] = p[5];
7101 				buf[5] = p[6];
7102 				buf[6] = p[7];
7103 			} else {
7104 				buf[0] = CMP_INT_7BYTE_VAL;
7105 				buf[1] = p[5];
7106 				buf[2] = p[4];
7107 				buf[3] = p[3];
7108 				buf[4] = p[2];
7109 				buf[5] = p[1];
7110 				buf[6] = p[0];
7111 			}
7112 			return 7;
7113 		} else if (i <= CMP_INT_8BYTE_MAX) {
7114 			i -= CMP_INT_7BYTE_MAX + 1;
7115 			if (__db_isbigendian() != 0) {
7116 				buf[0] = CMP_INT_8BYTE_VAL;
7117 				buf[1] = p[1];
7118 				buf[2] = p[2];
7119 				buf[3] = p[3];
7120 				buf[4] = p[4];
7121 				buf[5] = p[5];
7122 				buf[6] = p[6];
7123 				buf[7] = p[7];
7124 			} else {
7125 				buf[0] = CMP_INT_8BYTE_VAL;
7126 				buf[1] = p[6];
7127 				buf[2] = p[5];
7128 				buf[3] = p[4];
7129 				buf[4] = p[3];
7130 				buf[5] = p[2];
7131 				buf[6] = p[1];
7132 				buf[7] = p[0];
7133 			}
7134 			return 8;
7135 		} else {
7136 			i -= CMP_INT_8BYTE_MAX + 1;
7137 			if (__db_isbigendian() != 0) {
7138 				buf[0] = CMP_INT_9BYTE_VAL;
7139 				buf[1] = p[0];
7140 				buf[2] = p[1];
7141 				buf[3] = p[2];
7142 				buf[4] = p[3];
7143 				buf[5] = p[4];
7144 				buf[6] = p[5];
7145 				buf[7] = p[6];
7146 				buf[8] = p[7];
7147 			} else {
7148 				buf[0] = CMP_INT_9BYTE_VAL;
7149 				buf[1] = p[7];
7150 				buf[2] = p[6];
7151 				buf[3] = p[5];
7152 				buf[4] = p[4];
7153 				buf[5] = p[3];
7154 				buf[6] = p[2];
7155 				buf[7] = p[1];
7156 				buf[8] = p[0];
7157 			}
7158 			return 9;
7159 		}
7160 	}
7161 }
7162 
/*
 * btreeDecompressInt --
 *	Decompresses the compressed integer pointed to by buf into i,
 *	returning the number of bytes read.
 *
 * Derived from __db_decompress_int and decodes the same encoding.
 */
btreeDecompressInt(const u_int8_t * buf,u_int64_t * i)7170 static int btreeDecompressInt(const u_int8_t *buf, u_int64_t *i)
7171 {
7172 	int len;
7173 	u_int64_t tmp;
7174 	u_int8_t *p;
7175 	u_int8_t c;
7176 
7177 	tmp = 0;
7178 	p = (u_int8_t*)&tmp;
7179 	c = buf[0];
7180 	len = __dbsql_marshaled_int_size[c];
7181 
7182 	switch (len) {
7183 	case 1:
7184 		*i = c;
7185 		return 1;
7186 	case 2:
7187 		if (__db_isbigendian() != 0) {
7188 			p[6] = (c & CMP_INT_2BYTE_MASK);
7189 			p[7] = buf[1];
7190 		} else {
7191 			p[1] = (c & CMP_INT_2BYTE_MASK);
7192 			p[0] = buf[1];
7193 		}
7194 		tmp += CMP_INT_1BYTE_MAX + 1;
7195 		break;
7196 	case 3:
7197 		if (__db_isbigendian() != 0) {
7198 			p[5] = (c & CMP_INT_3BYTE_MASK);
7199 			p[6] = buf[1];
7200 			p[7] = buf[2];
7201 		} else {
7202 			p[2] = (c & CMP_INT_3BYTE_MASK);
7203 			p[1] = buf[1];
7204 			p[0] = buf[2];
7205 		}
7206 		tmp += CMP_INT_2BYTE_MAX + 1;
7207 		break;
7208 	case 4:
7209 		if (__db_isbigendian() != 0) {
7210 			p[4] = (c & CMP_INT_4BYTE_MASK);
7211 			p[5] = buf[1];
7212 			p[6] = buf[2];
7213 			p[7] = buf[3];
7214 		} else {
7215 			p[3] = (c & CMP_INT_4BYTE_MASK);
7216 			p[2] = buf[1];
7217 			p[1] = buf[2];
7218 			p[0] = buf[3];
7219 		}
7220 		tmp += CMP_INT_3BYTE_MAX + 1;
7221 		break;
7222 	case 5:
7223 		if (__db_isbigendian() != 0) {
7224 			p[3] = (c & CMP_INT_5BYTE_MASK);
7225 			p[4] = buf[1];
7226 			p[5] = buf[2];
7227 			p[6] = buf[3];
7228 			p[7] = buf[4];
7229 		} else {
7230 			p[4] = (c & CMP_INT_5BYTE_MASK);
7231 			p[3] = buf[1];
7232 			p[2] = buf[2];
7233 			p[1] = buf[3];
7234 			p[0] = buf[4];
7235 		}
7236 		tmp += CMP_INT_4BYTE_MAX + 1;
7237 		break;
7238 	case 6:
7239 		if (__db_isbigendian() != 0) {
7240 			p[3] = buf[1];
7241 			p[4] = buf[2];
7242 			p[5] = buf[3];
7243 			p[6] = buf[4];
7244 			p[7] = buf[5];
7245 		} else {
7246 			p[4] = buf[1];
7247 			p[3] = buf[2];
7248 			p[2] = buf[3];
7249 			p[1] = buf[4];
7250 			p[0] = buf[5];
7251 		}
7252 		tmp += CMP_INT_5BYTE_MAX + 1;
7253 		break;
7254 	case 7:
7255 		if (__db_isbigendian() != 0) {
7256 			p[2] = buf[1];
7257 			p[3] = buf[2];
7258 			p[4] = buf[3];
7259 			p[5] = buf[4];
7260 			p[6] = buf[5];
7261 			p[7] = buf[6];
7262 		} else {
7263 			p[5] = buf[1];
7264 			p[4] = buf[2];
7265 			p[3] = buf[3];
7266 			p[2] = buf[4];
7267 			p[1] = buf[5];
7268 			p[0] = buf[6];
7269 		}
7270 		tmp += CMP_INT_6BYTE_MAX + 1;
7271 		break;
7272 	case 8:
7273 		if (__db_isbigendian() != 0) {
7274 			p[1] = buf[1];
7275 			p[2] = buf[2];
7276 			p[3] = buf[3];
7277 			p[4] = buf[4];
7278 			p[5] = buf[5];
7279 			p[6] = buf[6];
7280 			p[7] = buf[7];
7281 		} else {
7282 			p[6] = buf[1];
7283 			p[5] = buf[2];
7284 			p[4] = buf[3];
7285 			p[3] = buf[4];
7286 			p[2] = buf[5];
7287 			p[1] = buf[6];
7288 			p[0] = buf[7];
7289 		}
7290 		tmp += CMP_INT_7BYTE_MAX + 1;
7291 		break;
7292 	case 9:
7293 		if (__db_isbigendian() != 0) {
7294 			p[0] = buf[1];
7295 			p[1] = buf[2];
7296 			p[2] = buf[3];
7297 			p[3] = buf[4];
7298 			p[4] = buf[5];
7299 			p[5] = buf[6];
7300 			p[6] = buf[7];
7301 			p[7] = buf[8];
7302 		} else {
7303 			p[7] = buf[1];
7304 			p[6] = buf[2];
7305 			p[5] = buf[3];
7306 			p[4] = buf[4];
7307 			p[3] = buf[5];
7308 			p[2] = buf[6];
7309 			p[1] = buf[7];
7310 			p[0] = buf[8];
7311 		}
7312 		tmp += CMP_INT_8BYTE_MAX + 1;
7313 		break;
7314 	default:
7315 		break;
7316 	}
7317 
7318 	*i = tmp;
7319 	return len;
7320 }
7321 
7322 #ifdef BDBSQL_OMIT_LEAKCHECK
7323 #undef sqlite3_malloc
7324 #undef sqlite3_free
7325 #undef sqlite3_strdup
7326 #endif
7327 
7328 #ifdef BDBSQL_SHARE_PRIVATE
7329 
7330 /*
7331  * Platform requirements:
7332  * -- must have mmap()
7333  * -- must have fcntl() for posix file locking
7334  * -- must support full posix open() semantics (e.g. VXWORKS does not)
7335  */
7336 
7337 /* this is a very stripped down version of btreeOpenEnvironment() */
static int openPrivateEnvironment(Btree *p, int startFamily)
{
	BtShared *pBt;
	int creating, newEnv, rc, ret;
	i64 cacheBytes;

	rc = SQLITE_OK;
	ret = 0;
	newEnv = 0;
	creating = 0;
	/*
	 * btreeOpenEnvironment() calls btreeUpdateBtShared(p, 0) at this
	 * point.  How multiple opens with different paths should interact
	 * with BDBSQL_SHARE_PRIVATE is still to be worked out.
	 */
	pBt = p->pBt;
	assert(pBt->dbStorage == DB_STORE_NAMED);

	/*
	 * The open mutex is held.  Cache size in bytes: the configured size,
	 * raised to DB_MIN_CACHESIZE if smaller, times the page size.
	 */
	cacheBytes = (i64)pBt->cacheSize;
	if (cacheBytes < DB_MIN_CACHESIZE)
		cacheBytes = DB_MIN_CACHESIZE;
	if (pBt->pageSize > 0)
		cacheBytes *= pBt->pageSize;
	else
		cacheBytes *= SQLITE_DEFAULT_PAGE_SIZE;
	pDbEnv->set_cachesize(pDbEnv,
	    (u_int32_t)(cacheBytes / GIGABYTE),
	    (u_int32_t)(cacheBytes % GIGABYTE), 0);

	if (pBt->pageSize != 0) {
		ret = pDbEnv->set_mp_pagesize(pDbEnv, pBt->pageSize);
		if (ret != 0)
			goto err;
	}
	pDbEnv->set_mp_mmapsize(pDbEnv, 0);
	pDbEnv->set_mp_mtxcount(pDbEnv, pBt->mp_mutex_count);
	pDbEnv->app_private = pBt;
	pDbEnv->set_errcall(pDbEnv, btreeHandleDbError);

	/* There is no acceptable failure for this reopen. */
	ret = pDbEnv->open(pDbEnv, pBt->dir_name, pBt->env_oflags, 0);
	if (ret != 0)
		goto err;

	newEnv = 1;
	pBt->env_opened = 1;
	assert(!p->connected);
	p->connected = 1;

	if (!IS_ENV_READONLY(pBt) &&
	    (p->vfsFlags & SQLITE_OPEN_CREATE) != 0)
		pBt->db_oflags |= DB_CREATE;

	rc = btreeOpenMetaTables(p, &creating);
	if (rc != SQLITE_OK)
		goto err;
	/* If this assertion trips, get code from btreeOpenEnvironment(). */
	assert(!creating); /* TBD */

#ifdef BDBSQL_PRELOAD_HANDLES
	if (newEnv && !creating)
		(void)btreePreloadHandles(p);
#endif
	/* Start the family transaction when the caller asks for one. */
	if (startFamily) {
		ret = pDbEnv->txn_begin(pDbEnv, NULL, &pFamilyTxn,
		    DB_TXN_FAMILY|(p->txn_bulk ? DB_TXN_BULK:0));
		if (ret != 0)
			return dberr2sqlite(ret, p);
	}

err:	/* Any failure that reaches this label clears the connected flag. */
	if (rc != SQLITE_OK || ret != 0)
		p->connected = 0;
	return MAP_ERR(rc, ret, p);
}
7407 
7408 /*
7409  * btreeReopenPrivateEnvironment()
7410  * For shared private environments this function does work from
7411  * both sqlite3BtreeClose() and btreePrepareEnvironment().
7412  * - close any open databases
7413  * - close the environment, but prevent cache flush
7414  * - set up opening the new environment.
7415  */
static int btreeReopenPrivateEnvironment(Btree *p)
{
	BtShared *pBt;
#ifdef BDBSQL_FILE_PER_TABLE
	char *dirPathName, dirPathBuf[BT_MAX_PATH];
#endif
	int ret, rc, t_rc, t_ret, startFamily, idx;
	sqlite3_mutex *mutexOpen;

	log_msg(LOG_VERBOSE, "btreeReopenPrivateEnvironment(%p)", p);

	ret = 0;
	pBt = p->pBt;
	rc = SQLITE_OK;

	/*
	 * do not reopen if pBt->nRef is 0.  That means the environment
	 * is being closed.
	 */
	if (pBt == NULL || pBt->nRef == 0)
		goto done;

	/* make some state assertions (TBD -- remove these eventually) */
	assert(pBt->transactional); /* must be transactional */
	assert(pBt->first_cursor == NULL); /* no active cursors */
	assert(pMainTxn == NULL); /* only at top-level txn */
	assert(pBt->dbStorage == DB_STORE_NAMED); /* not temp */

	/*
	 * commit family txn; it will be null when shutting down.
	 * startFamily records whether one has to be started again after
	 * the reopen.
	 */
	if (pFamilyTxn != NULL) {
		startFamily = 1;
		ret = pFamilyTxn->commit(pFamilyTxn, 0);
		pFamilyTxn = NULL;
		/* p->inTrans = TRANS_NONE; don't change state of this */
		if (ret != 0 && rc == SQLITE_OK)
			rc = dberr2sqlite(ret, p);
	} else
		startFamily = 0;

	/*
	 * acquire mutexOpen lock while closing down cached db handles.
	 * There is a case where the call could be from
	 * btreeOpenEnvironment() in which case the mutex is already
	 * held.  It's inefficient to close/reopen in that path but
	 * it should be infrequent and it's more consistent to do that
	 * than just return.
	 */
	mutexOpen = sqlite3MutexAlloc(OPEN_MUTEX(pBt->dbStorage));
	if (!pBt->lockfile.in_env_open)
		sqlite3_mutex_enter(mutexOpen);
	/* close open DB handles and clear related hash table */
	t_rc = btreeCleanupCachedHandles(p, CLEANUP_CLOSE);
	if (t_rc != SQLITE_OK && rc == SQLITE_OK)
		rc = t_rc;
	sqlite3HashClear(&pBt->db_cache);
	/* close tables and meta databases; the first error is kept */
	if (pTablesDb != NULL &&
	    (t_ret = pTablesDb->close(pTablesDb, DB_NOSYNC)) != 0 && ret == 0)
		ret = t_ret;
	if (pMetaDb != NULL &&
	    (t_ret = pMetaDb->close(pMetaDb, DB_NOSYNC)) != 0 && ret == 0)
		ret = t_ret;
	pTablesDb = pMetaDb = NULL;

	/* flush the cache of metadata values */
	for (idx = 0; idx < NUMMETA; idx++)
		pBt->meta[idx].cached = 0;
	/*
	 * close environment:
	 * - set the error call to nothing to quiet any errors
	 * - set DB_NOFLUSH to prevent the cache from flushing
	 * - ignore a DB_RUNRECOVERY error
	 */
	pDbEnv->set_errcall(pDbEnv, NULL);
	pDbEnv->set_flags(pDbEnv, DB_NOFLUSH, 1);
	if ((t_ret = pDbEnv->close(pDbEnv, 0)) != 0 && ret == 0) {
		if (t_ret != DB_RUNRECOVERY) /* ignore runrecovery */
			ret = t_ret;
	}

	/* hold onto openMutex until done with open */
	if (ret != 0)
		goto err;

	pBt->lsn_reset = NO_LSN_RESET;

	/* do some work from btreePrepareEnvironment */
	if ((ret = db_env_create(&pDbEnv, 0)) != 0)
		goto err;
	pDbEnv->set_errpfx(pDbEnv, pBt->full_name);
#ifndef BDBSQL_SINGLE_THREAD
	pDbEnv->set_flags(pDbEnv, DB_DATABASE_LOCKING, 1);
	pDbEnv->set_lk_detect(pDbEnv, DB_LOCK_DEFAULT);
#endif
	pDbEnv->set_lg_regionmax(pDbEnv, BDBSQL_LOG_REGIONMAX);
#ifndef BDBSQL_OMIT_LEAKCHECK
	pDbEnv->set_alloc(pDbEnv, btreeMalloc, btreeRealloc,
	    sqlite3_free);
#endif
	if ((ret = pDbEnv->set_lg_max(pDbEnv, pBt->logFileSize)) != 0)
		goto err;
#ifndef BDBSQL_OMIT_LOG_REMOVE
	if ((ret = pDbEnv->log_set_config(pDbEnv,
	    DB_LOG_AUTO_REMOVE, 1)) != 0)
		goto err;
#endif
#ifdef BDBSQL_FILE_PER_TABLE
	/* Reuse dirPathBuf. */
	dirPathName = dirPathBuf;
	memset(dirPathName, 0, BT_MAX_PATH);
	/*
	 * The limit must be the size of the buffer.  sizeof(dirPathName)
	 * is only the size of a pointer and would truncate the path.
	 */
	sqlite3_snprintf(sizeof(dirPathBuf), dirPathName,
	    "%s/..", pBt->full_name);
	pDbEnv->add_data_dir(pDbEnv, dirPathName);
	pDbEnv->set_create_dir(pDbEnv, dirPathName);
#else
	pDbEnv->add_data_dir(pDbEnv, "..");
#endif
	/*
	 * by definition this function is only called
	 * for DB_PRIVATE, transactional environments.
	 * If we hold the write lock it is OK to checkpoint
	 * during recovery; otherwise do not.
	 */
	pBt->env_oflags = DB_INIT_MPOOL | DB_INIT_LOG | DB_INIT_TXN |
	    DB_INIT_LOCK | DB_PRIVATE | DB_CREATE | DB_THREAD | DB_RECOVER;
	if (!btreeHasFileLock(p, 1))
		pBt->env_oflags |= DB_NO_CHECKPOINT;

	p->connected = 0;
	/* do the open */
	rc = openPrivateEnvironment(p, startFamily);
err:
	/* Leave the open mutex only if this function entered it above. */
	if (!pBt->lockfile.in_env_open)
		sqlite3_mutex_leave(mutexOpen);
done:
	return MAP_ERR(rc, ret, p);
}
7553 
/*
 * lockFile --
 *	Request a whole-file fcntl() lock on fd with F_SETLKW: a read lock
 *	when isread is non-zero, a write lock otherwise.  Returns 0 on
 *	success or the errno value left by fcntl() on failure.
 */
static int lockFile(int fd, int isread)
{
	struct flock request;
	int status;

	memset(&request, 0, sizeof(request));
	/* Offset 0 from the start with length 0 means the whole file. */
	request.l_whence = SEEK_SET;
	request.l_start = 0;
	request.l_len = 0;
	if (isread)
		request.l_type = F_RDLCK;
	else
		request.l_type = F_WRLCK;

	status = 0;
	/* TBD -- deal with error better */
	if (fcntl(fd, F_SETLKW, &request) < 0)
		status = errno;
	return status;
}
7568 
/*
 * unlockFile --
 *	Release the whole-file fcntl() lock on fd.  Returns 0 on success or
 *	the errno value left by fcntl() on failure.
 */
static int unlockFile(int fd)
{
	struct flock request;
	int status;

	memset(&request, 0, sizeof(request));
	request.l_type = F_UNLCK;
	/* Same whole-file range as lockFile(): offset 0, length 0. */
	request.l_whence = SEEK_SET;
	request.l_start = 0;
	request.l_len = 0;

	status = 0;
	/* TBD -- deal with error better */
	if (fcntl(fd, F_SETLKW, &request) < 0)
		status = errno;
	return status;
}
7583 
7584 /*
7585  * create/open the shared lock file, protected by openMutex
7586  * - open or create file
7587  * - initialize file if creating
7588  * - map the file
7589  * - allocate/initialize mutex for the LockFileInfo
7590  * - if the file was created, return with it locked to
7591  * synchronize environment creation as well
7592  */
static int btreeSetupLockfile(Btree *p, int *createdFile)
{
	BtShared *pBt;
	int fd, mutexAllocated, savedErrno;
	char fname[BT_MAX_PATH];
	char initial_bytes[30];
	int *ptr;
	size_t initLen;
	ssize_t written;
	void *addr;

	pBt = p->pBt;
	if (pBt->lockfile.fd != 0)
		return 0; /* already done */

	mutexAllocated = 0;
	*createdFile = 0;
	/* file is envdir/.lck */
	sqlite3_snprintf(sizeof(fname), fname,
	    "%s/.lck", pBt->dir_name);

	/* try a simple open for the common case -- the file exists */
	fd = open(fname, O_RDWR , 0);
	if (fd < 0) {
		/* handle file creation/initialization */
		if (errno != ENOENT)
			goto err;
		fd = open(fname, O_CREAT|O_RDWR, 0666);
		if (fd < 0)
			goto err;
		/*
		 * write lock the file to handle initialization race; the
		 * result is deliberately not checked, as before.
		 */
		(void)lockFile(fd, 0);

		/* if the file is non-zero we lost the race -- nothing to do */
		if (read(fd, initial_bytes, 4) != 4) {
			/* write some data to extend the file size */
			sqlite3_snprintf(sizeof(initial_bytes), initial_bytes,
			    "00000000dontwritehere");
			*createdFile = 1;
			initLen = strlen(initial_bytes);
			written = write(fd, initial_bytes, initLen);
			if (written < 0 || (size_t)written != initLen) {
				/* A short write leaves errno unset. */
				if (written >= 0)
					errno = EIO;
				goto err;
			}
		} else
			unlockFile(fd);
	}

	/* allocate mutex for the thread-shared structure */
	assert(pBt->lockfile.mutex == 0);
	pBt->lockfile.mutex = sqlite3MutexAlloc(SQLITE_MUTEX_FAST);
	if (pBt->lockfile.mutex == NULL && sqlite3GlobalConfig.bCoreMutex) {
		errno = ENOMEM;
		goto err;
	}
	mutexAllocated = 1;

	/* map the file; mmap() reports failure as MAP_FAILED, not NULL */
	addr = mmap(NULL, 4096, PROT_READ|PROT_WRITE, MAP_SHARED, fd, 0);
	if (addr == MAP_FAILED)
		goto err;
	pBt->lockfile.mapAddr = addr;

	ptr = (int *)(pBt->lockfile.mapAddr);
	if (*createdFile) {
		/* First int is the generation, second a debugging marker. */
		ptr[0] = 0;
		ptr[1] = 0xdeadbeef; /* for debugging */
		pBt->lockfile.writelock_count = 1;
		/* returning with lock held */
	} else {
		assert(ptr[1] == 0xdeadbeef);
	}

	pBt->lockfile.fd = fd;
	pBt->lockfile.generation = ptr[0];
	return 0;
err:
	/* Capture errno before the cleanup calls can overwrite it. */
	savedErrno = errno;
	if (mutexAllocated) {
		/* Undo the allocation so a later call can start over. */
		if (pBt->lockfile.mutex != NULL)
			sqlite3_mutex_free(pBt->lockfile.mutex);
		pBt->lockfile.mutex = NULL;
	}
	if (*createdFile)
		unlockFile(fd);
	if (fd >= 0)
		close(fd);
	return savedErrno;
}
7669 
/*
 * btreeReadlock --
 *	Count a reader in the lock file information of p's BtShared.  The
 *	first reader, when no writer is counted, also takes the read lock
 *	on the lock file and compares the generation number in the mapped
 *	file with the remembered one.  On a mismatch the private
 *	environment is reopened unless dontreopen is non-zero.
 *
 *	Returns 0 on success, otherwise the failure value of lockFile() or
 *	btreeReopenPrivateEnvironment().
 */
static int btreeReadlock(Btree *p, int dontreopen)
{
	int curGen, ret;
	LockFileInfo *linfo = &p->pBt->lockfile;

	assert(linfo->fd > 0);
	assert(p->pBt->dbStorage == DB_STORE_NAMED);

	/* Must be set here: the branch below is not always taken. */
	ret = 0;

	sqlite3_mutex_enter(linfo->mutex);
	++linfo->readlock_count;

	/*
	 * a waiting writer means writelock_count is non-zero, which
	 * means a free pass -- the readlock will have been locked
	 * by a previous reader.
	 */
	if (linfo->readlock_count == 1 && linfo->writelock_count == 0) {
		if ((ret = lockFile(linfo->fd, 1)) != 0)
			goto err;
		/* check generation number, reopen if mismatch */
		curGen = *((int *)(linfo->mapAddr));
		if (curGen != linfo->generation && dontreopen == 0) {
			/* hold the mutex to lock out racing threads */
			ret = btreeReopenPrivateEnvironment(p);
		}
		linfo->generation = curGen;
	}
err:
	sqlite3_mutex_leave(linfo->mutex);
	return ret;
}
7702 
/*
 * Register a writer on the shared lock file.
 *
 * writelock_count is bumped under the lockfile mutex.  The first writer
 * (or any writer arriving while write_waiting is set) takes the file lock
 * via lockFile(fd, 0).  If readers are active, the mutex is dropped while
 * waiting for the file lock so that they can unlock, and re-taken
 * afterwards.  Once the lock is held, the generation number at the start
 * of the mapped lock file is incremented; if the previous value differs
 * from the cached one and dontReopen is zero,
 * btreeReopenPrivateEnvironment() is called.
 *
 * Returns 0 on success (including the case where the lock is already
 * held by this process), otherwise the non-zero result of lockFile() or
 * btreeReopenPrivateEnvironment().
 *
 * NOTE(review): on lockFile() failure writelock_count and write_waiting
 * are left as modified; confirm that callers unlock on the error path.
 */
static int btreeWritelock(Btree *p, int dontReopen)
{
	int curGen;
	int ret = 0;	/* nothing to do when the lock is already held */
	int reacquire = 0;
	LockFileInfo *linfo = &p->pBt->lockfile;

	assert(linfo->fd > 0);
	assert(p->pBt->dbStorage == DB_STORE_NAMED);

	sqlite3_mutex_enter(linfo->mutex);
	++linfo->writelock_count;
	/* check write_waiting also, to serialize new write lock requests */
	if (linfo->writelock_count == 1 || linfo->write_waiting) {
		/*
		 * indicate that a writer *may* be waiting for a lock
		 * by setting write_waiting.  This will cause future
		 * writers to enter this clause as well.  They will
		 * back up on the lock if it's not yet been acquired.
		 */
		linfo->write_waiting = 1;

		/*
		 * release the mutex if there are active readers; this
		 * allows them to unlock.  Otherwise block future
		 * readers/writers on the mutex while waiting for the file lock
		 */
		if (linfo->readlock_count != 0) {
			reacquire = 1;
			sqlite3_mutex_leave(linfo->mutex);
		}

		/*
		 * The assignment must be parenthesized on its own so that
		 * ret receives lockFile()'s error code rather than the
		 * result of the comparison.
		 */
		if ((ret = lockFile(linfo->fd, 0)) != 0)
			goto err;

		if (reacquire) {
			reacquire = 0;
			sqlite3_mutex_enter(linfo->mutex);
		}
		/* clear this flag unconditionally, we have the lock */
		linfo->write_waiting = 0;

		/* get and increment current generation number */
		curGen = *((int *)(linfo->mapAddr));
		*((int *)(linfo->mapAddr)) = curGen+1;
		if (curGen != linfo->generation && dontReopen == 0) {
			/* hold the mutex to lock out racing threads */
			ret = btreeReopenPrivateEnvironment(p);
		}
		linfo->generation = curGen+1;
	}
err:
	/*
	 * reacquire is still set only when lockFile() failed after the
	 * mutex was dropped, in which case there is nothing to release.
	 */
	if (!reacquire)
		sqlite3_mutex_leave(linfo->mutex);
	return ret;
}
7759 
/*
 * Take the file lock in the requested mode: a non-zero iswrite selects
 * btreeWritelock(), zero selects btreeReadlock().  dontreopen is passed
 * through unchanged and the helper's result is returned as is.
 */
int btreeScopedFileLock(Btree *p, int iswrite, int dontreopen)
{
	if (iswrite)
		return btreeWritelock(p, dontreopen);
	return btreeReadlock(p, dontreopen);
}
7765 
/*
 * Take the file lock for the handle's main transaction.  The mode is
 * derived from p->inTrans and remembered in p->maintxn_is_write, which
 * btreeFileUnlock() reads back when releasing the lock.
 */
static int btreeFileLock(Btree *p)
{
	int wantWrite = (p->inTrans == TRANS_WRITE);

	p->maintxn_is_write = wantWrite;
	return btreeScopedFileLock(p, p->maintxn_is_write, 0);
}
7771 
/*
 * Drop one read or write registration on the shared lock file.
 *
 * The matching counter is decremented under the lockfile mutex.  While
 * any writer is still counted the file lock is left alone.  Otherwise it
 * is released when no readers remain, or re-taken in read mode
 * (lockFile(fd, 1)) when some do.
 *
 * Returns 0 when the file lock was not touched, otherwise the result of
 * unlockFile() or lockFile().
 */
int btreeScopedFileUnlock(Btree *p, int iswrite)
{
	LockFileInfo *lf = &p->pBt->lockfile;
	int rc = 0;

	assert(lf->fd > 0);
	assert(p->pBt->dbStorage == DB_STORE_NAMED);

	sqlite3_mutex_enter(lf->mutex);
	if (!iswrite) {
		assert(lf->readlock_count > 0);
		--lf->readlock_count;
	} else {
		assert(lf->writelock_count > 0);
		--lf->writelock_count;
	}

	/*
	 * A remaining (possibly waiting) writer keeps writelock_count
	 * non-zero, which suppresses both the unlock and the downgrade.
	 */
	if (lf->writelock_count == 0) {
		if (lf->readlock_count != 0)
			rc = lockFile(lf->fd, 1);	/* downgrade */
		else
			rc = unlockFile(lf->fd);
	}
	sqlite3_mutex_leave(lf->mutex);
	return rc;
}
7802 
/*
 * Release the file lock taken by btreeFileLock(), using the mode that
 * was recorded in p->maintxn_is_write.
 */
static int btreeFileUnlock(Btree *p)
{
	int wasWrite = (p->maintxn_is_write != 0);

	return btreeScopedFileUnlock(p, wasWrite);
}
7807 
7808 /*
7809  * method to check for some sort of lock.
7810  * do this without acquiring the mutex.  It can only be
7811  * called safely when it is known that the process has the
7812  * file lock (either read or write).
7813  */
int btreeHasFileLock(Btree *p, int iswrite)
{
	LockFileInfo *lf = &p->pBt->lockfile;

	/*
	 * The raw counter is returned, so any non-zero value means at
	 * least one registration of the requested kind.
	 */
	if (!iswrite)
		return (lf->readlock_count);
	return (lf->writelock_count);
}
7822 
7823 #endif /* BDBSQL_SHARE_PRIVATE */
7824 
7825 /*
7826  * Berkeley DB needs to be able to compare threads so that we can lookup
7827  * structures that are thread specific. The implementations are based on the
7828  * platform specific SQLite sqlite3_mutex_held implementations.
7829  */
7830 #ifdef SQLITE_MUTEX_OS2
7831 
/*
 * OS/2: return a copy of the calling thread's id (tib2_ultid from the
 * thread information block) in memory obtained from sqlite3DbMallocRaw().
 *
 * Returns NULL and sets db->mallocFailed when the allocation fails.
 * How the returned buffer is released is not visible here -- presumably
 * by the caller; confirm at the call sites.
 */
void *getThreadID(sqlite3 *db)
{
	TID *tid;
	PTIB ptib;	/* DosGetInfoBlocks() yields a thread info block */

	tid = (TID *)sqlite3DbMallocRaw(db, sizeof(TID));
	if (tid != NULL) {
		DosGetInfoBlocks(&ptib, NULL);
		memcpy(tid, &ptib->tib_ptib2->tib2_ultid, sizeof(TID));
	} else
		db->mallocFailed = 1;
	return tid;
}
7846 
isCurrentThread(void * tid)7847 int isCurrentThread(void *tid)
7848 {
7849 	TID threadid;
7850 	PTID ptib;
7851 
7852 	threadid = *((TID *)tid);
7853 	DosGetInfoBlocks(&ptib, NULL);
7854 	return threadid == ptib->tib_ptib2->tib2_ultid;
7855 }
7856 
7857 #elif defined(SQLITE_MUTEX_PTHREADS)
7858 
/*
 * pthreads: return a copy of pthread_self() in memory obtained from
 * sqlite3DbMallocRaw().  Returns NULL and sets db->mallocFailed when
 * the allocation fails.
 */
void *getThreadID(sqlite3 *db)
{
	pthread_t self;
	pthread_t *slot;

	slot = (pthread_t *)sqlite3DbMallocRaw(db, sizeof(pthread_t));
	if (slot == NULL) {
		db->mallocFailed = 1;
		return NULL;
	}

	self = pthread_self();
	memcpy(slot, &self, sizeof(pthread_t));
	return slot;
}
7872 
/*
 * pthreads: return the pthread_equal() result for the pthread_t that
 * tid points to and the calling thread.
 */
int isCurrentThread(void *tid)
{
	pthread_t *stored = (pthread_t *)tid;

	return pthread_equal(*stored, pthread_self());
}
7877 
7878 #elif defined(SQLITE_MUTEX_W32)
7879 
/*
 * Win32: return a copy of GetCurrentThreadId() in memory obtained from
 * sqlite3DbMallocRaw().  Returns NULL and sets db->mallocFailed when
 * the allocation fails.
 */
void *getThreadID(sqlite3 *db)
{
	DWORD self;
	DWORD *slot;

	slot = (DWORD *)sqlite3DbMallocRaw(db, sizeof(DWORD));
	if (slot == NULL) {
		db->mallocFailed = 1;
		return NULL;
	}

	self = GetCurrentThreadId();
	memcpy(slot, &self, sizeof(DWORD));
	return slot;
}
7893 
isCurrentThread(void * tid)7894 int isCurrentThread(void *tid)
7895 {
7896 	DWORD threadid;
7897 
7898 	threadid = *((DWORD *)tid);
7899 	return (threadid == GetCurrentThreadId());
7900 }
7901 
7902 #else
7903 
/*
 * Fallback when none of the mutex implementations tested above is
 * selected: no thread id is recorded, so NULL is always returned.
 */
void *getThreadID(sqlite3 *db)
{
	(void)db;	/* unused in this configuration */
	return NULL;
}
7908 
/*
 * Fallback when none of the mutex implementations tested above is
 * selected: every caller is reported as the current thread.
 */
int isCurrentThread(void *tid)
{
	(void)tid;	/* unused in this configuration */
	return 1;
}
7913 
7914 #endif
7915