1 /*-
2  * See the file LICENSE for redistribution information.
3  *
4  * Copyright (c) 2012, 2013 Oracle and/or its affiliates.  All rights reserved.
5  */
6 
7 #include <errno.h>
8 
9 #include "sqliteInt.h"
10 #include <db.h>
11 
#ifdef BDBSQL_SHARE_PRIVATE
/*
 * BDBSQL_SHARE_PRIVATE implies BDBSQL_SINGLE_PROCESS.  Only define it here
 * when the build has not already done so (for example with
 * -DBDBSQL_SINGLE_PROCESS); redefining it with a different replacement list
 * would be an incompatible macro redefinition.
 */
# ifndef BDBSQL_SINGLE_PROCESS
#  define BDBSQL_SINGLE_PROCESS
# endif
#endif
16 
/* Size of an encoded integer key.  We add 2 bytes to negatives. */
#define	INTKEY_BUFSIZE	(sizeof(i64) + 2)
/*
 * The expansion is parenthesized so that the macro behaves as a single
 * value inside larger expressions (e.g. "x / MULTI_BUFSIZE").
 */
#define	MULTI_BUFSIZE	(8 * SQLITE_DEFAULT_PAGE_SIZE)
#define	DBNAME_SIZE	20
/* Number of entries in the BtShared meta[] cache. */
#define	NUMMETA		16
/* Number of entries in the BtShared pragma[] cache. */
#define	NUM_DB_PRAGMA	30
#define	CURSOR_BUFSIZE	32 /* For holding index keys. */
/* This should match SQLite VFS.mxPathname */
#define	BT_MAX_PATH 512

/* Size of the name buffers used for sequences and cached database keys. */
#define BT_MAX_SEQ_NAME 128
27 
/*
 * Default size, in bytes, of the logging area of the Berkeley DB
 * environment.  A build may override it by pre-defining the macro.
 */
#if !defined(BDBSQL_LOG_REGIONMAX)
# define BDBSQL_LOG_REGIONMAX (300 * 1024)
#endif
35 
/*
 * Default policy for the transactional bulk insert optimization
 * (0 unless the build pre-defines another value).
 */
#if !defined(BDBSQL_TXN_BULK_DEFAULT)
# define BDBSQL_TXN_BULK_DEFAULT 0
#endif
43 
/*
 * Default number of pages for incremental vacuum.
 */
#if !defined(BDBSQL_INCR_VACUUM_PAGES)
# define BDBSQL_INCR_VACUUM_PAGES 128
#endif
50 
/*
 * Default fill percent used by vacuum.
 */
#if !defined(BDBSQL_VACUUM_FILLPERCENT)
# define BDBSQL_VACUUM_FILLPERCENT 85
#endif
57 
/*
 * Fallback definitions for numeric limits, used only when the platform
 * headers have not already supplied them.
 */
#if !defined(UINT32_MAX)
#define	UINT32_MAX	4294967295U	/* Maximum 32-bit unsigned. */
#endif
#if !defined(INT64_MAX)
#define	INT64_MAX	((((i64)0x7fffffff) << 32) | 0xffffffff)
#endif
#if !defined(GIGABYTE)
#define	GIGABYTE	1073741824
#endif
67 
/*
 * Combine an SQLite result code and a Berkeley DB return value into a single
 * SQLite result code:
 *   - a non-OK rc wins and is returned unchanged;
 *   - otherwise a zero ret yields SQLITE_OK;
 *   - otherwise ret is translated with dberr2sqlite(ret, p).
 *
 * Every argument is parenthesized so that compound expressions are handled
 * correctly.  Note that rc and ret may each be evaluated more than once, so
 * arguments should be free of side effects.
 */
#define	MAP_ERR(rc, ret, p)						\
	(((rc) != SQLITE_OK) ? (rc) : ((ret) == 0) ? SQLITE_OK :	\
	    dberr2sqlite((ret), (p)))
71 
/*
 * Same as MAP_ERR, except that a non-zero Berkeley DB return value is
 * translated with dberr2sqlitelocked(ret, p) instead of dberr2sqlite().
 *
 * Every argument is parenthesized so that compound expressions are handled
 * correctly.  rc and ret may each be evaluated more than once, so arguments
 * should be free of side effects.
 */
#define	MAP_ERR_LOCKED(rc, ret, p)					\
	(((rc) != SQLITE_OK) ? (rc) : ((ret) == 0) ? SQLITE_OK :	\
	    dberr2sqlitelocked((ret), (p)))
75 
76 /* Declare custom functions added by Berkeley DB to SQL. */
77 int add_sequence_functions(sqlite3 *db);
78 
/*
 * Pointer to a comparison callback.
 * NOTE(review): the argument layout looks like (context, length1, data1,
 * length2, data2) -- confirm against the callers.
 */
typedef int (*compareFunc)(void *, int, const void *, int, const void *);
80 
81 typedef struct {
82 	/*
83 	 * There are two types of tables stored in this cache:
84 	 * * Normal tables created by SQLite. These have 8 char names.
85 	 * * Tables used to handle sequences, which can have arbitrary names.
86 	 */
87 #define	CACHE_KEY_SIZE 9		/* 8 hex characters + NUL */
88 	char key[BT_MAX_SEQ_NAME];
89 	DB *dbp;
90 	int is_sequence;
91 	db_lockmode_t lock_mode;
92 	int created;
93 	void *cookie;
94 } CACHED_DB;
95 
96 typedef struct {
97 	int32_t cache;
98 	int64_t min_val;
99 	int64_t max_val;
100 	int64_t start_val;
101 	int32_t incr;
102 	u8	decrementing;
103 	u8	used;
104 	int64_t val; /* If not using a cache - this is the last value. */
105 	char    name[BT_MAX_SEQ_NAME];
106 	int32_t name_len;
107 	DB_SEQUENCE *handle; /* Never used directly from the DB cache key. */
108 } SEQ_COOKIE;
109 
110 typedef struct {
111 	u32 value;
112 	u8 cached;
113 } CACHED_META;
114 
115 typedef struct {
116 	char *value;
117 	u32 size;
118 	u32 offset;
119 } CACHED_PRAGMA;
120 
121 typedef struct DELETED_TABLE DELETED_TABLE;
122 struct DELETED_TABLE {
123 	int iTable;
124 	DB_TXN *txn;
125 #ifdef BDBSQL_FILE_PER_TABLE
126 	int flag;
127 #define	DTF_DELETE	0x00
128 #define	DTF_DROP	0x01
129 #endif
130 	DELETED_TABLE *next;
131 };
132 
133 #ifndef BDBSQL_SINGLE_THREAD
134 typedef struct {
135 	BtShared *pBt;
136 	KeyInfo *pKeyInfo;
137 	int iTable;
138 } TableInfo;
139 #endif
140 
#if defined(BDBSQL_SHARE_PRIVATE)
/*
 * State of the lock file used in BDBSQL_SHARE_PRIVATE builds: the file
 * descriptor and mapping address, a generation number, read/write lock
 * bookkeeping, and the mutex stored alongside them.
 */
typedef struct {
	int		 fd;
	void		*mapAddr;
	int		 generation;
	int		 readlock_count;
	int		 writelock_count;
	int		 write_waiting;
	int		 in_env_open;
	sqlite3_mutex	*mutex;
} LockFileInfo;
#endif
153 
/* Modes of a cleanup pass. */
typedef enum {
	CLEANUP_COMMIT,
	CLEANUP_ABORT,
	CLEANUP_CLOSE,
	CLEANUP_DROP_LOCKS,
	CLEANUP_GET_LOCKS
} cleanup_mode_t;

/* There are three possible table types in SQLite. */
typedef enum {
	DB_STORE_NAMED,
	DB_STORE_TMP,
	DB_STORE_INMEM
} storage_mode_t;

/* Transaction state of a Btree (see Btree.inTrans). */
typedef enum {
	TRANS_NONE,
	TRANS_READ,
	TRANS_WRITE
} txn_mode_t;

/* Schema lock modes (see Btree.schemaLockMode and btreeLockSchema()). */
typedef enum {
	LOCKMODE_NONE,
	LOCKMODE_READ,
	LOCKMODE_WRITE
} lock_mode_t;

/* LSN reset setting (see BtShared.lsn_reset). */
typedef enum {
	NO_LSN_RESET,
	LSN_RESET_FILE
} lsn_reset_t;
161 
162 /* Declarations for functions that are shared by adapter source files. */
163 int btreeBeginTransInternal(Btree *p, int wrflag);
164 void *btreeCreateIndexKey(BtCursor *pCur);
165 void btreeGetErrorFile(const BtShared *pBt, char *fname);
166 Index *btreeGetIndex(Btree *p, int iTable);
167 int btreeGetPageCount(Btree *p, int **tables, u32 *pageCount, DB_TXN *txn);
168 int btreeGetUserTable(Btree *p, DB_TXN *pTxn, DB **pDb, int iTable);
169 int btreeGetTables(Btree *, int **, DB_TXN *);
170 int btreeLockSchema(Btree *p, lock_mode_t lockMode);
171 int btreeOpenEnvironment(Btree *p, int needLock);
172 int btreeOpenMetaTables(Btree *p, int *pCreating);
173 int btreeReopenEnvironment(Btree *p, int removingRep);
174 int btreeUpdateBtShared(Btree *p, int needLock);
175 #ifndef SQLITE_OMIT_VACUUM
176 int btreeIncrVacuum(Btree *p, u_int32_t *truncatedPages);
177 int btreeVacuum(Btree *p, char **pzErrMsg);
178 void btreeFreeVacuumInfo(Btree *p);
179 #endif
180 int dberr2sqlite(int, Btree *p);
181 int closeDB(Btree *p, DB *dbp, u_int32_t flags);
182 void *allocateCursorIndex(BtCursor *pCur, u_int32_t amount);
183 int splitIndexKey(BtCursor *pCur);
184 int isDupIndex(int flags, int storage, KeyInfo *keyInfo, DB *db);
185 #ifdef BDBSQL_SHARE_PRIVATE
186 int btreeScopedFileLock(Btree *p, int iswrite, int dontreopen);
187 int btreeScopedFileUnlock(Btree *p, int iswrite);
188 int btreeHasFileLock(Btree *p, int iswrite);
189 #endif
190 #ifdef SQLITE_HAS_CODEC
191 int sqlite3CodecAttach(sqlite3*, int, const void*, int);
192 #endif
193 int getPersistentPragma(Btree *p, const char *pragma_name, char **value,
194     Parse *pParse);
195 int setPersistentPragma(Btree *p, const char *pragma_name, const char *value,
196     Parse *pParse);
197 int encodeI64(u_int8_t *buf, i64 num);
198 int cleanPragmaCache(Btree *p);
199 int getHostPort(const char *hpstr, char **host, u_int *port);
200 int setRepVerboseFile(BtShared *pBt, DB_ENV *dbenv, const char *fname,
201     char *msg);
202 int unsetRepVerboseFile(BtShared *pBt, DB_ENV *dbenv, char **msg);
203 /* Returns the thread id as a void *, which needs to be freed. */
204 void *getThreadID(sqlite3 *db);
205 /* Checks if the thread id item identifies the current thread. */
206 int isCurrentThread(void *tid);
207 
/*
 * Scrub and release the encryption password held by a BtShared, then reset
 * the pointer and length fields.
 *
 * - Nothing is written or freed when encrypt_pwd is NULL, so the macro is
 *   safe to use on a handle that never had a password set.
 * - The buffer is overwritten (with 0xff bytes) through a volatile pointer:
 *   a plain memset() immediately before free() is a dead store that the
 *   compiler is allowed to remove, leaving the password in freed memory.
 *
 * pBt is evaluated several times; pass an expression without side effects.
 */
#define	CLEAR_PWD(pBt)	do {						\
	if ((pBt)->encrypt_pwd != NULL) {				\
		volatile unsigned char *clear_pwd_ptr_ =		\
		    (volatile unsigned char *)((pBt)->encrypt_pwd);	\
		int clear_pwd_idx_;					\
		for (clear_pwd_idx_ = 0;				\
		    clear_pwd_idx_ < (pBt)->encrypt_pwd_len;		\
		    clear_pwd_idx_++)					\
			clear_pwd_ptr_[clear_pwd_idx_] = 0xff;		\
		free((pBt)->encrypt_pwd);				\
	}								\
	(pBt)->encrypt_pwd_len = 0;					\
	(pBt)->encrypt_pwd = NULL;					\
} while (0)
214 
/*
 * There is some subtlety about which mutex to use: for shared handles, we
 * update some structures that are protected by the open mutex.  In-memory
 * databases all share the same g_tmp_env handle, so we need to make sure they
 * get it single-threaded (so the initial open is done once).
 *
 * However, we can't use the open mutex to protect transient database opens and
 * closes: we might already be holding locks in a shared environment when we
 * try to open the temporary env, which would lead to a lock/mutex deadlock.
 * We take a different static mutex from SQLite, previously used in the pager.
 *
 * OPEN_MUTEX(store) yields SQLITE_MUTEX_STATIC_OPEN for DB_STORE_NAMED and
 * SQLITE_MUTEX_STATIC_LRU for every other storage mode.  The argument is
 * parenthesized so that compound expressions are compared as a whole.
 */
#define	OPEN_MUTEX(store)	(((store) == DB_STORE_NAMED) ?	\
	SQLITE_MUTEX_STATIC_OPEN : SQLITE_MUTEX_STATIC_LRU)
228 
#if defined(BDBSQL_FILE_PER_TABLE)
/* The metadata table's name in BDBSQL_FILE_PER_TABLE builds. */
#define	BDBSQL_META_DATA_TABLE "metadata"
int	getMetaDataFileName(const char *full_name, char **filename);
#endif
234 
235 struct BtShared {
236 	char *dir_name;
237 	char *full_name;
238 	char *short_name; /* A pointer into orig_name memory. */
239 	char *orig_name;
240 	char *err_file;
241 	char *err_msg;
242 	u_int8_t fileid[DB_FILE_ID_LEN];
243 	char *encrypt_pwd;
244 	lsn_reset_t lsn_reset;
245 	storage_mode_t dbStorage;
246 	u_int32_t env_oflags;
247 	DB_ENV *dbenv;
248 	int env_opened, encrypted, encrypt_pwd_len, last_table, need_open;
249 	/*
250 	 * Handles for the metadata DB, which holds the SQLite metadata for a
251 	 * file, and the tables DB, which is the Berkeley DB-internal database
252 	 * of sub-databases in a file.
253 	 */
254 	DB *metadb, *tablesdb;
255 	/* Caches persistent pragma values. */
256 	CACHED_PRAGMA pragma[NUM_DB_PRAGMA];
257 	sqlite3_mutex *pragma_cache_mutex;
258 	u8 cache_loaded;
259 	CACHED_META meta[NUMMETA];
260 	Hash db_cache;
261 #ifdef BDBSQL_SHARE_PRIVATE
262 	LockFileInfo lockfile;
263 	u_int32_t mp_mutex_count;
264 #endif
265 	/*
266 	 * A unique name is assigned to each in memory table. This value is
267 	 * used to ensure that each BtShared object gets a unique identifier.
268 	 * NOTE: For DB_STORE_INMEM tables, despite sharing the same environment
269 	 * handle, the internal table name is unique because it comprises of
270 	 * both the uid and iTable.
271 	 */
272 	u_int32_t uid;
273 	u_int32_t flags;
274 	u_int32_t panic; /* If the environment is not in a usable state. */
275 	u_int32_t db_oflags;
276 	u_int32_t transactional;
277 	u_int32_t pageSize;
278 	u_int32_t pageCount;
279 	u_int32_t pageSizeFixed;
280 	u_int32_t cacheSize;
281 	u_int32_t logFileSize; /* In bytes */
282 	u_int32_t database_existed; /* Did the database file exist on open. */
283 	u_int32_t read_txn_flags; /* Flags passed to the read transaction. */
284 	u8 autoVacuum; /* Is auto-vacuum enabled? */
285 	u8 incrVacuum; /* Is incremental vacuum enabled? */
286 	u8 resultsBuffer; /* Query results are stored in a in-memory buffer */
287 	u8 secureDelete; /* Overwrite deleted data */
288 	/* Non-recursive mutex required to access this struct */
289 	sqlite3_mutex *mutex;
290 	BtCursor *first_cursor;
291 
292 	/* Fields used to maintain the linked list of shared objects. */
293 	BtShared *pNextDb;
294 	BtShared *pPrevDb;
295 	Btree *btrees; /* A linked list of btrees that have been opened in this BtShared. */
296 	int nRef;
297 	int readonly;
298 	int repStartMaster; /* Start replication site as initial master? */
299 	FILE *repVerbFile; /* File for replication verbose output. */
300 	int repStarted; /* Replication is configured and started. */
301 	int repForceRecover; /* Force recovery on next open environment. */
302 	int single_process; /* If non-zero, keep all environment on the heap. */
303 };
304 
305 struct BtCursor {
306 	Btree *pBtree;
307 	int tableIndex;
308 	u_int32_t flags;
309 	u8 isDupIndex, isFirst, isIncrblobHandle, wrFlag;
310 	CACHED_DB *cached_db;
311 	DBC *dbc;
312 	DB_TXN *txn;
313 	struct KeyInfo *keyInfo;
314 	enum {
315 		CURSOR_INVALID, CURSOR_VALID, CURSOR_REQUIRESEEK, CURSOR_FAULT
316 	} eState;
317 	int error, lastRes;
318 	i64 cachedRowid, savedIntKey, lastKey;
319 	DBT key, data, index;
320 	i64 nKey;
321 	u8 indexKeyBuf[CURSOR_BUFSIZE];
322 	DBT multiData;
323 	void *multiGetPtr, *multiPutPtr;
324 	void *threadID;
325 	int skipMulti;
326 	BtCursor *next;
327 };
328 
329 struct Btree {
330 	struct BtShared *pBt;
331 	sqlite3 *db;
332 
333 	int connected;		/* Set up with an open environment */
334 	DB_TXN *family_txn;	/* Makes txns and cursors lock-compatible. */
335 	DB_TXN *main_txn;	/* Base transaction for read and savepoint. */
336 	DB_TXN *read_txn;
337 	DB_TXN *savepoint_txn;
338 	int nSavepoint;		/* The number of open savepoints. */
339 #ifdef BDBSQL_SHARE_PRIVATE
340 	int maintxn_is_write;
341 #endif
342 	int vfsFlags;
343 
344 	void* schema;		/* Opaque schema handle used by SQLite */
345 	void (*free_schema)(void*);	/* Destructor for schema */
346 
347 	DELETED_TABLE *deleted_tables;
348 
349 	struct VacuumInfo {
350 		DBT start;
351 		int iTable;
352 		struct VacuumInfo* next;
353 	} *vacuumInfo;       /* Keep incremental vacuum infomation */
354 	u8 inVacuum;	     /* True if vacuum is in progress */
355 	u8 needVacuum;	     /* True if the Btree needs vacuum in txn commit */
356 	u32 vacuumPages;     /* Num of pages for AutoVacuum/IncrVacuum */
357 	u32 fillPercent;     /* fillPercent for Vacuum */
358 	DBC *compact_cursor; /* Walks over table names during vacuum. */
359 
360 	txn_mode_t inTrans;
361 	lock_mode_t schemaLockMode;
362 	DBC *schemaLock;
363 	u8 sharable;	/* True if we can share pBt with another db */
364 	u8 locked;	/* True if db currently has pBt locked */
365 	u8 txn_excl;	/* True if in an exclusive transaction */
366 	u8 txn_bulk;	/* True to enable the bulk loading optimization */
367 	u32 txn_priority;	/* Transaction priority. */
368 	int wantToLock;	/* Number of nested calls to sqlite3BtreeEnter() */
369 	int nBackup;	/* Number of backup operations reading this btree */
370 	u32 updateDuringBackup; /* An update was performed during a backup. */
371 	int readonly;
372 	Btree *pNext;
373 	Btree *pPrev;
374 };
375 
/*
 * Logging support, shared by btree.c and btmutex.c.
 */
typedef enum {
	LOG_VERBOSE,
	LOG_DEBUG,
	LOG_NORMAL,
	LOG_RELEASE,
	LOG_NONE
} loglevel_t;

#define	CURRENT_LOG_LEVEL LOG_RELEASE

/*
 * log_msg() is a real function in debug builds; when NDEBUG is defined it
 * is a macro that expands to nothing, so calls compile away entirely.
 */
#ifndef NDEBUG
/* Utility functions. */
void log_msg(loglevel_t level, const char *fmt, ...);
#else
#define	log_msg(...)
#endif
389 
390 /*
391  * Common functions for internal DBSQL btree components (btree.c, vacuum.c, etc)
392  */
393 int btreeFindOrCreateDataTable(Btree *, int *, CACHED_DB **, int);
394 int btreeGetKeyInfo(Btree *p, int iTable, KeyInfo **pKeyInfo);
395 int btreeTableNameToId(const char *subdb, int len, int *pid);
396 
/*
 * Shorthand macros common to the internal DBSQL btree components
 * (btree.c, vacuum.c, etc).
 *
 * These expand to member accesses on variables that must already be in
 * scope at the point of use: "pBt" for the first group, "p" for the second.
 */
#define	pDbEnv			(pBt->dbenv)
#define	pMetaDb			(pBt->metadb)
#define	pTablesDb		(pBt->tablesdb)

#define	pFamilyTxn		(p->family_txn)
#define	pReadTxn		(p->read_txn)
#define	pMainTxn		(p->main_txn)
#define	pSavepointTxn		(p->savepoint_txn)
407 
/*
 * Choose the file name to use for a table and store it in fileName.
 *
 * - With BDBSQL_FILE_PER_TABLE, a DB_STORE_NAMED database uses tableName;
 *   every other storage mode uses the BtShared short_name.
 * - Without BDBSQL_FILE_PER_TABLE, short_name is always used and tableName
 *   is not evaluated.
 *
 * pBt is parenthesized so that any pointer expression (e.g. "&bt") can be
 * passed; fileName must be an assignable lvalue.
 */
#ifdef BDBSQL_FILE_PER_TABLE
#define	FIX_TABLENAME(pBt, fileName, tableName) do {		\
	if ((pBt)->dbStorage == DB_STORE_NAMED) {		\
		(fileName) = (tableName);			\
	} else							\
		(fileName) = (pBt)->short_name;			\
} while (0)
#else
#define	FIX_TABLENAME(pBt, fileName, tableName) do {		\
	(fileName) = (pBt)->short_name;				\
} while (0)
#endif
420 
/*
 * Yields DB_AUTO_COMMIT when the BtShared is transactional and txn is either
 * NULL or the family transaction; yields 0 otherwise.  It expands pFamilyTxn,
 * so a Btree pointer named "p" must be in scope at the point of use.
 */
#define	GET_AUTO_COMMIT(pBt, txn)				\
	((!(pBt)->transactional ||				\
	    ((txn) && (txn) != pFamilyTxn)) ? 0 : DB_AUTO_COMMIT)
423 
/*
 * If an update occurs while this Btree is also performing a backup then
 * increase the updateDuringBackup counter.  This value is checked before
 * and after each backup step, and if it has increased then the backup
 * process is reset.
 *
 * The expansion is a complete compound statement, so it works whether or not
 * the caller adds a trailing semicolon, and its internal "if" can no longer
 * capture an "else" that follows the macro call.  The argument is
 * parenthesized so any pointer expression may be passed; it is evaluated
 * up to twice.
 */
#define	UPDATE_DURING_BACKUP(p)  {		\
	if ((p)->nBackup > 0)			\
		(p)->updateDuringBackup++;	\
}
433 
434