1 /*-
2  * See the file LICENSE for redistribution information.
3  *
4  * Copyright (c) 1996, 2013 Oracle and/or its affiliates.  All rights reserved.
5  */
6 /*
7  * Copyright (c) 1990, 1993, 1994, 1995, 1996
8  *	Keith Bostic.  All rights reserved.
9  */
10 /*
11  * Copyright (c) 1990, 1993, 1994, 1995
12  *	The Regents of the University of California.  All rights reserved.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 3. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  * $Id$
39  */
40 
41 #include "db_config.h"
42 
43 #include "db_int.h"
44 #include "dbinc/db_page.h"
45 #include "dbinc/db_swap.h"
46 #include "dbinc/btree.h"
47 #include "dbinc/fop.h"
48 #include "dbinc/hash.h"
49 #include "dbinc/heap.h"
50 #include "dbinc/lock.h"
51 #include "dbinc/mp.h"
52 #include "dbinc/partition.h"
53 #include "dbinc/qam.h"
54 #include "dbinc/txn.h"
55 
56 static int __db_disassociate __P((DB *));
57 static int __db_disassociate_foreign __P ((DB *));
58 
59 #ifdef CONFIG_TEST
60 static int __db_makecopy __P((ENV *, const char *, const char *));
61 static int __qam_testdocopy __P((DB *, const char *));
62 #endif
63 
64 /*
65  * DB.C --
66  *	This file contains the utility functions for the DBP layer.
67  */
68 
69 /*
70  * __db_master_open --
71  *	Open up a handle on a master database.
72  *
73  * PUBLIC: int __db_master_open __P((DB *, DB_THREAD_INFO *,
74  * PUBLIC:     DB_TXN *, const char *, u_int32_t, int, DB **));
75  */
76 int
__db_master_open(subdbp,ip,txn,name,flags,mode,dbpp)77 __db_master_open(subdbp, ip, txn, name, flags, mode, dbpp)
78 	DB *subdbp;
79 	DB_THREAD_INFO *ip;
80 	DB_TXN *txn;
81 	const char *name;
82 	u_int32_t flags;
83 	int mode;
84 	DB **dbpp;
85 {
86 	DB *dbp;
87 	int ret;
88 
89 	*dbpp = NULL;
90 
91 	/* Open up a handle on the main database. */
92 	if ((ret = __db_create_internal(&dbp, subdbp->env, 0)) != 0)
93 		return (ret);
94 
95 	/* Set the creation directory. */
96 	dbp->dirname = subdbp->dirname;
97 
98 	/*
99 	 * It's always a btree.
100 	 * Run in the transaction we've created.
101 	 * Set the pagesize in case we're creating a new database.
102 	 * Flag that we're creating a database with subdatabases.
103 	 */
104 	dbp->pgsize = subdbp->pgsize;
105 	F_SET(dbp, DB_AM_SUBDB);
106 	F_SET(dbp, F_ISSET(subdbp,
107 	    DB_AM_RECOVER | DB_AM_SWAP |
108 	    DB_AM_ENCRYPT | DB_AM_CHKSUM | DB_AM_NOT_DURABLE));
109 
110 	/*
111 	 * If there was a subdb specified, then we only want to apply
112 	 * DB_EXCL to the subdb, not the actual file.  We only got here
113 	 * because there was a subdb specified.
114 	 */
115 	LF_CLR(DB_EXCL);
116 	LF_SET(DB_RDWRMASTER);
117 	if ((ret = __db_open(dbp, ip, txn,
118 	    name, NULL, DB_BTREE, flags, mode, PGNO_BASE_MD)) != 0)
119 		goto err;
120 
121 	/*
122 	 * The items in dbp are initialized from the master file's meta page.
123 	 * Other items such as checksum and encryption are checked when we
124 	 * read the meta-page, so we do not check those here.  However, if
125 	 * the meta-page caused checksumming to be turned on and it wasn't
126 	 * already, set it here.
127 	 */
128 	if (F_ISSET(dbp, DB_AM_CHKSUM))
129 		F_SET(subdbp, DB_AM_CHKSUM);
130 
131 	/*
132 	 * The user may have specified a page size for an existing file,
133 	 * which we want to ignore.
134 	 */
135 	subdbp->pgsize = dbp->pgsize;
136 	*dbpp = dbp;
137 
138 	if (0) {
139 err:		if (!F_ISSET(dbp, DB_AM_DISCARD))
140 			(void)__db_close(dbp, txn, DB_NOSYNC);
141 	}
142 
143 	return (ret);
144 }
145 
146 /*
147  * __db_master_update --
148  *	Add/Open/Remove a subdatabase from a master database.
149  *
150  * PUBLIC: int __db_master_update __P((DB *, DB *, DB_THREAD_INFO *, DB_TXN *,
151  * PUBLIC:      const char *, DBTYPE, mu_action, const char *, u_int32_t));
152  */
153 int
__db_master_update(mdbp,sdbp,ip,txn,subdb,type,action,newname,flags)154 __db_master_update(mdbp, sdbp, ip, txn, subdb, type, action, newname, flags)
155 	DB *mdbp, *sdbp;
156 	DB_TXN *txn;
157 	DB_THREAD_INFO *ip;
158 	const char *subdb;
159 	DBTYPE type;
160 	mu_action action;
161 	const char *newname;
162 	u_int32_t flags;
163 {
164 	DBC *dbc, *ndbc;
165 	DBT key, data, ndata;
166 	ENV *env;
167 	PAGE *p, *r;
168 	db_pgno_t t_pgno;
169 	int modify, ret, t_ret;
170 
171 	env = mdbp->env;
172 	dbc = ndbc = NULL;
173 	p = NULL;
174 
175 	/*
176 	 * Open up a cursor.  If this is CDB and we're creating the database,
177 	 * make it an update cursor.
178 	 *
179 	 * Might we modify the master database?  If so, we'll need to lock.
180 	 */
181 	modify = (!F_ISSET(mdbp, DB_AM_RDONLY) &&
182 	    (action != MU_OPEN || LF_ISSET(DB_CREATE))) ? 1 : 0;
183 
184 	if ((ret = __db_cursor(mdbp, ip, txn, &dbc,
185 	    (CDB_LOCKING(env) && modify) ? DB_WRITECURSOR : 0)) != 0)
186 		return (ret);
187 
188 	/*
189 	 * Point the cursor at the record.
190 	 *
191 	 * If we're removing or potentially creating an entry, lock the page
192 	 * with DB_RMW.
193 	 *
194 	 * We do multiple cursor operations with the cursor in some cases and
195 	 * subsequently access the data DBT information.  Set DB_DBT_MALLOC so
196 	 * we don't risk modification of the data between our uses of it.
197 	 *
198 	 * !!!
199 	 * We don't include the name's nul termination in the database.
200 	 */
201 	DB_INIT_DBT(key, subdb, strlen(subdb));
202 	memset(&data, 0, sizeof(data));
203 	F_SET(&data, DB_DBT_MALLOC);
204 
205 	ret = __dbc_get(dbc, &key, &data,
206 	    DB_SET | ((STD_LOCKING(dbc) && modify) ? DB_RMW : 0));
207 
208 	/*
209 	 * What we do next--whether or not we found a record for the
210 	 * specified subdatabase--depends on what the specified action is.
211 	 * Handle ret appropriately as the first statement of each case.
212 	 */
213 	switch (action) {
214 	case MU_REMOVE:
215 		/*
216 		 * We should have found something if we're removing it.  Note
217 		 * that in the common case where the DB we're asking to remove
218 		 * doesn't exist, we won't get this far;  __db_subdb_remove
219 		 * will already have returned an error from __db_open.
220 		 */
221 		if (ret != 0)
222 			goto err;
223 
224 		/*
225 		 * Delete the subdatabase entry first;  if this fails,
226 		 * we don't want to touch the actual subdb pages.
227 		 */
228 		if ((ret = __dbc_del(dbc, 0)) != 0)
229 			goto err;
230 
231 		/*
232 		 * We're handling actual data, not on-page meta-data,
233 		 * so it hasn't been converted to/from opposite
234 		 * endian architectures.  Do it explicitly, now.
235 		 */
236 		memcpy(&sdbp->meta_pgno, data.data, sizeof(db_pgno_t));
237 		DB_NTOHL_SWAP(env, &sdbp->meta_pgno);
238 		if ((ret = __memp_fget(mdbp->mpf, &sdbp->meta_pgno,
239 		    ip, dbc->txn, DB_MPOOL_DIRTY, &p)) != 0)
240 			goto err;
241 
242 		/* Free the root on the master db if it was created. */
243 		if (TYPE(p) == P_BTREEMETA &&
244 		    ((BTMETA *)p)->root != PGNO_INVALID) {
245 			if ((ret = __memp_fget(mdbp->mpf,
246 			    &((BTMETA *)p)->root, ip, dbc->txn,
247 			    DB_MPOOL_DIRTY, &r)) != 0)
248 				goto err;
249 
250 			/* Free and put the page. */
251 			if ((ret = __db_free(dbc, r, 0)) != 0) {
252 				r = NULL;
253 				goto err;
254 			}
255 		}
256 		/* Free and put the page. */
257 		if ((ret = __db_free(dbc, p, 0)) != 0) {
258 			p = NULL;
259 			goto err;
260 		}
261 		p = NULL;
262 		break;
263 	case MU_RENAME:
264 		/* We should have found something if we're renaming it. */
265 		if (ret != 0)
266 			goto err;
267 
268 		/*
269 		 * Before we rename, we need to make sure we're not
270 		 * overwriting another subdatabase, or else this operation
271 		 * won't be undoable.  Open a second cursor and check
272 		 * for the existence of newname;  it shouldn't appear under
273 		 * us since we hold the metadata lock.
274 		 */
275 		if ((ret = __db_cursor(mdbp, ip, txn, &ndbc,
276 		    CDB_LOCKING(env) ? DB_WRITECURSOR : 0)) != 0)
277 			goto err;
278 		DB_SET_DBT(key, newname, strlen(newname));
279 
280 		/*
281 		 * We don't actually care what the meta page of the potentially-
282 		 * overwritten DB is; we just care about existence.
283 		 */
284 		memset(&ndata, 0, sizeof(ndata));
285 		F_SET(&ndata, DB_DBT_USERMEM | DB_DBT_PARTIAL);
286 
287 		if ((ret = __dbc_get(ndbc, &key, &ndata, DB_SET)) == 0) {
288 			/* A subdb called newname exists.  Bail. */
289 			ret = EEXIST;
290 			__db_errx(env, DB_STR_A("0673",
291 			    "rename: database %s exists", "%s"), newname);
292 			goto err;
293 		} else if (ret != DB_NOTFOUND)
294 			goto err;
295 
296 		/*
297 		 * Now do the put first; we don't want to lose our only
298 		 * reference to the subdb.  Use the second cursor so the
299 		 * first one continues to point to the old record.
300 		 */
301 		if ((ret = __dbc_put(ndbc, &key, &data, DB_KEYFIRST)) != 0)
302 			goto err;
303 		if ((ret = __dbc_del(dbc, 0)) != 0) {
304 			/*
305 			 * If the delete fails, try to delete the record
306 			 * we just put, in case we're not txn-protected.
307 			 */
308 			(void)__dbc_del(ndbc, 0);
309 			goto err;
310 		}
311 
312 		break;
313 	case MU_OPEN:
314 		/*
315 		 * Get the subdatabase information.  If it already exists,
316 		 * copy out the page number and we're done.
317 		 */
318 		switch (ret) {
319 		case 0:
320 			if (LF_ISSET(DB_CREATE) && LF_ISSET(DB_EXCL)) {
321 				ret = EEXIST;
322 				goto err;
323 			}
324 			memcpy(&sdbp->meta_pgno, data.data, sizeof(db_pgno_t));
325 			DB_NTOHL_SWAP(env, &sdbp->meta_pgno);
326 			goto done;
327 		case DB_NOTFOUND:
328 			if (LF_ISSET(DB_CREATE))
329 				break;
330 			/*
331 			 * No db_err, it is reasonable to remove a
332 			 * nonexistent db.
333 			 */
334 			ret = ENOENT;
335 			goto err;
336 		default:
337 			goto err;
338 		}
339 
340 		/* Create a subdatabase. */
341 		if (F_ISSET(mdbp, DB_AM_RDONLY)) {
342 			ret = EBADF;
343 			goto err;
344 		}
345 		if ((ret = __db_new(dbc,
346 		    type == DB_HASH ? P_HASHMETA : P_BTREEMETA, NULL, &p)) != 0)
347 			goto err;
348 		sdbp->meta_pgno = PGNO(p);
349 
350 		/*
351 		 * XXX
352 		 * We're handling actual data, not on-page meta-data, so it
353 		 * hasn't been converted to/from opposite endian architectures.
354 		 * Do it explicitly, now.
355 		 */
356 		t_pgno = PGNO(p);
357 		DB_HTONL_SWAP(env, &t_pgno);
358 		memset(&ndata, 0, sizeof(ndata));
359 		ndata.data = &t_pgno;
360 		ndata.size = sizeof(db_pgno_t);
361 		if ((ret = __dbc_put(dbc, &key, &ndata, 0)) != 0)
362 			goto err;
363 		F_SET(sdbp, DB_AM_CREATED);
364 		break;
365 
366 	case MU_MOVE:
367 		/* We should have found something if we're moving it. */
368 		if (ret != 0)
369 			goto err;
370 		t_pgno = sdbp->meta_pgno;
371 		DB_HTONL_SWAP(env, &t_pgno);
372 		memset(&ndata, 0, sizeof(ndata));
373 		ndata.data = &t_pgno;
374 		ndata.size = sizeof(db_pgno_t);
375 		if ((ret = __dbc_put(dbc, &key, &ndata, 0)) != 0)
376 			goto err;
377 		mdbp->mpf->mfp->revision++;
378 	}
379 
380 err:
381 done:	/*
382 	 * If we allocated a page: if we're successful, mark the page dirty
383 	 * and return it to the cache, otherwise, discard/free it.
384 	 */
385 	if (p != NULL && (t_ret = __memp_fput(mdbp->mpf,
386 	     dbc->thread_info, p, dbc->priority)) != 0 && ret == 0)
387 		ret = t_ret;
388 
389 	/* Discard the cursor(s) and data. */
390 	if (data.data != NULL)
391 		__os_ufree(env, data.data);
392 	if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0)
393 		ret = t_ret;
394 	if (ndbc != NULL && (t_ret = __dbc_close(ndbc)) != 0 && ret == 0)
395 		ret = t_ret;
396 
397 	return (ret);
398 }
399 
400 /*
401  * __env_dbreg_setup --
402  *
403  * PUBLIC: int __env_dbreg_setup __P((DB *,
404  * PUBLIC:      DB_TXN *, const char *, const char *, u_int32_t));
405  */
406 int
__env_dbreg_setup(dbp,txn,fname,dname,id)407 __env_dbreg_setup(dbp, txn, fname, dname, id)
408 	DB *dbp;
409 	DB_TXN *txn;
410 	const char *fname, *dname;
411 	u_int32_t id;
412 {
413 	ENV *env;
414 	int ret;
415 
416 	env = dbp->env;
417 	if (dbp->log_filename == NULL
418 #if !defined(DEBUG_ROP) && !defined(DEBUG_WOP) && !defined(DIAGNOSTIC)
419 	    && (txn != NULL || F_ISSET(dbp, DB_AM_RECOVER))
420 #endif
421 #if !defined(DEBUG_ROP)
422 	    && !F_ISSET(dbp, DB_AM_RDONLY)
423 #endif
424 	    ) {
425 		if ((ret = __dbreg_setup(dbp,
426 		    F_ISSET(dbp, DB_AM_INMEM) ? dname: fname,
427 		    F_ISSET(dbp, DB_AM_INMEM) ? NULL : dname, id)) != 0)
428 			return (ret);
429 
430 		/*
431 		 * If we're actively logging and our caller isn't a
432 		 * recovery function that already did so, then assign
433 		 * this dbp a log fileid.
434 		 */
435 		if (DBENV_LOGGING(env) && !F_ISSET(dbp, DB_AM_RECOVER) &&
436 		    (ret = __dbreg_new_id(dbp, txn)) != 0)
437 			return (ret);
438 	}
439 	return (0);
440 }
441 
442 /*
443  * __env_setup --
444  *	Set up the underlying environment during a db_open.
445  *
446  * PUBLIC: int __env_setup __P((DB *,
447  * PUBLIC:     DB_TXN *, const char *, const char *, u_int32_t, u_int32_t));
448  */
449 int
__env_setup(dbp,txn,fname,dname,id,flags)450 __env_setup(dbp, txn, fname, dname, id, flags)
451 	DB *dbp;
452 	DB_TXN *txn;
453 	const char *fname, *dname;
454 	u_int32_t id, flags;
455 {
456 	DB *ldbp;
457 	DB_ENV *dbenv;
458 	ENV *env;
459 	u_int32_t maxid;
460 	int ret;
461 
462 	env = dbp->env;
463 	dbenv = env->dbenv;
464 
465 	/*
466 	 * When verifying an in-memory db, we need to pass dname to
467 	 * __env_mpool.  That is the only time fname will be used.
468 	 */
469 	if (F_ISSET(dbp, DB_AM_INMEM) && F_ISSET(dbp, DB_AM_VERIFYING))
470 		fname = dname;
471 
472 	/* If we don't yet have an environment, it's time to create it. */
473 	if (!F_ISSET(env, ENV_OPEN_CALLED)) {
474 #if defined(HAVE_MIXED_SIZE_ADDRESSING) && (SIZEOF_CHAR_P == 8)
475 		__db_errx(env, DB_STR("0701", "DB_PRIVATE is not supported by"
476 		    " 64-bit applications in mixed-size-addressing mode"));
477 	       return (EINVAL);
478 #endif
479 		/* Make sure we have at least DB_MINCACHE pages in our cache. */
480 		if (dbenv->mp_gbytes == 0 &&
481 		    dbenv->mp_bytes < dbp->pgsize * DB_MINPAGECACHE &&
482 		    (ret = __memp_set_cachesize(
483 		    dbenv, 0, dbp->pgsize * DB_MINPAGECACHE, 0)) != 0)
484 			return (ret);
485 
486 		if ((ret = __env_open(dbenv, NULL, DB_CREATE |
487 		    DB_INIT_MPOOL | DB_PRIVATE | LF_ISSET(DB_THREAD), 0)) != 0)
488 			return (ret);
489 	}
490 
491 	/* Join the underlying cache. */
492 	if ((!F_ISSET(dbp, DB_AM_INMEM) || F_ISSET(dbp, DB_AM_VERIFYING) ||
493 	    dname == NULL) && (ret = __env_mpool(dbp, fname, flags)) != 0)
494 		return (ret);
495 
496 	/* We may need a per-thread mutex. */
497 	if (LF_ISSET(DB_THREAD) && (ret = __mutex_alloc(
498 	    env, MTX_DB_HANDLE, DB_MUTEX_PROCESS_ONLY, &dbp->mutex)) != 0)
499 		return (ret);
500 
501 	/*
502 	 * Set up a bookkeeping entry for this database in the log region,
503 	 * if such a region exists.  Note that even if we're in recovery
504 	 * or a replication client, where we won't log registries, we'll
505 	 * still need an FNAME struct, so LOGGING_ON is the correct macro.
506 	 */
507 	if (LOGGING_ON(env) &&
508 	    (!F_ISSET(dbp, DB_AM_INMEM) || dname == NULL) &&
509 	    (ret = __env_dbreg_setup(dbp, txn, fname, dname, id)) != 0)
510 		return (ret);
511 
512 	/*
513 	 * Insert ourselves into the ENV's dblist.  We allocate a
514 	 * unique ID to each {fileid, meta page number} pair, and to
515 	 * each temporary file (since they all have a zero fileid).
516 	 * This ID gives us something to use to tell which DB handles
517 	 * go with which databases in all the cursor adjustment
518 	 * routines, where we don't want to do a lot of ugly and
519 	 * expensive memcmps.
520 	 */
521 	MUTEX_LOCK(env, env->mtx_dblist);
522 	maxid = 0;
523 	TAILQ_FOREACH(ldbp, &env->dblist, dblistlinks) {
524 		/*
525 		 * There are three cases: on-disk database (first clause),
526 		 * named in-memory database (second clause), temporary database
527 		 * (never matches; no clause).
528 		 */
529 		if (!F_ISSET(dbp, DB_AM_INMEM)) {
530 			if (memcmp(ldbp->fileid, dbp->fileid, DB_FILE_ID_LEN)
531 			    == 0 && ldbp->meta_pgno == dbp->meta_pgno)
532 				break;
533 		} else if (dname != NULL) {
534 			if (F_ISSET(ldbp, DB_AM_INMEM) &&
535 			    ldbp->dname != NULL &&
536 			    strcmp(ldbp->dname, dname) == 0)
537 				break;
538 		}
539 		if (ldbp->adj_fileid > maxid)
540 			maxid = ldbp->adj_fileid;
541 	}
542 
543 	/*
544 	 * If ldbp is NULL, we didn't find a match. Assign the dbp an
545 	 * adj_fileid one higher than the largest we found, and
546 	 * insert it at the head of the master dbp list.
547 	 *
548 	 * If ldbp is not NULL, it is a match for our dbp.  Give dbp
549 	 * the same ID that ldbp has, and add it after ldbp so they're
550 	 * together in the list.
551 	 */
552 	if (ldbp == NULL) {
553 		dbp->adj_fileid = maxid + 1;
554 		TAILQ_INSERT_HEAD(&env->dblist, dbp, dblistlinks);
555 	} else {
556 		dbp->adj_fileid = ldbp->adj_fileid;
557 		TAILQ_INSERT_AFTER(&env->dblist, ldbp, dbp, dblistlinks);
558 	}
559 	MUTEX_UNLOCK(env, env->mtx_dblist);
560 
561 	return (0);
562 }
563 
564 /*
565  * __env_mpool --
566  *	Set up the underlying environment cache during a db_open.
567  *
568  * PUBLIC: int __env_mpool __P((DB *, const char *, u_int32_t));
569  */
570 int
__env_mpool(dbp,fname,flags)571 __env_mpool(dbp, fname, flags)
572 	DB *dbp;
573 	const char *fname;
574 	u_int32_t flags;
575 {
576 	DBT pgcookie;
577 	DB_MPOOLFILE *mpf;
578 	DB_PGINFO pginfo;
579 	ENV *env;
580 	int fidset, ftype, ret;
581 	int32_t lsn_off;
582 	u_int8_t nullfid[DB_FILE_ID_LEN];
583 	u_int32_t clear_len;
584 
585 	env = dbp->env;
586 
587 	/* The LSN is the first entry on a DB page, byte offset 0. */
588 	lsn_off = F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LSN_OFF_NOTSET : 0;
589 
590 	/* It's possible that this database is already open. */
591 	if (F_ISSET(dbp, DB_AM_OPEN_CALLED))
592 		return (0);
593 
594 	/*
595 	 * If we need to pre- or post-process a file's pages on I/O, set the
596 	 * file type.  If it's a hash file, always call the pgin and pgout
597 	 * routines.  This means that hash files can never be mapped into
598 	 * process memory.  If it's a btree file and requires swapping, we
599 	 * need to page the file in and out.  This has to be right -- we can't
600 	 * mmap files that are being paged in and out.
601 	 */
602 	switch (dbp->type) {
603 	case DB_BTREE:
604 	case DB_HEAP:
605 	case DB_RECNO:
606 		ftype = F_ISSET(dbp, DB_AM_SWAP | DB_AM_ENCRYPT | DB_AM_CHKSUM)
607 		    ? DB_FTYPE_SET : DB_FTYPE_NOTSET;
608 		clear_len = CRYPTO_ON(env) ?
609 		    (dbp->pgsize != 0 ? dbp->pgsize : DB_CLEARLEN_NOTSET) :
610 		    DB_PAGE_DB_LEN;
611 		break;
612 	case DB_HASH:
613 		ftype = DB_FTYPE_SET;
614 		clear_len = CRYPTO_ON(env) ?
615 		    (dbp->pgsize != 0 ? dbp->pgsize : DB_CLEARLEN_NOTSET) :
616 		    DB_PAGE_DB_LEN;
617 		break;
618 	case DB_QUEUE:
619 		ftype = F_ISSET(dbp,
620 		    DB_AM_SWAP | DB_AM_ENCRYPT | DB_AM_CHKSUM) ?
621 		    DB_FTYPE_SET : DB_FTYPE_NOTSET;
622 
623 		/*
624 		 * If we came in here without a pagesize set, then we need
625 		 * to mark the in-memory handle as having clear_len not
626 		 * set, because we don't really know the clear length or
627 		 * the page size yet (since the file doesn't yet exist).
628 		 */
629 		clear_len = dbp->pgsize != 0 ? dbp->pgsize : DB_CLEARLEN_NOTSET;
630 		break;
631 	case DB_UNKNOWN:
632 		/*
633 		 * If we're running in the verifier, our database might
634 		 * be corrupt and we might not know its type--but we may
635 		 * still want to be able to verify and salvage.
636 		 *
637 		 * If we can't identify the type, it's not going to be safe
638 		 * to call __db_pgin--we pretty much have to give up all
639 		 * hope of salvaging cross-endianness.  Proceed anyway;
640 		 * at worst, the database will just appear more corrupt
641 		 * than it actually is, but at best, we may be able
642 		 * to salvage some data even with no metadata page.
643 		 */
644 		if (F_ISSET(dbp, DB_AM_VERIFYING)) {
645 			ftype = DB_FTYPE_NOTSET;
646 			clear_len = DB_PAGE_DB_LEN;
647 			break;
648 		}
649 
650 		/*
651 		 * This might be an in-memory file and we won't know its
652 		 * file type until after we open it and read the meta-data
653 		 * page.
654 		 */
655 		if (F_ISSET(dbp, DB_AM_INMEM)) {
656 			clear_len = DB_CLEARLEN_NOTSET;
657 			ftype = DB_FTYPE_NOTSET;
658 			lsn_off = DB_LSN_OFF_NOTSET;
659 			break;
660 		}
661 		/* FALLTHROUGH */
662 	default:
663 		return (__db_unknown_type(env, "DB->open", dbp->type));
664 	}
665 
666 	mpf = dbp->mpf;
667 
668 	memset(nullfid, 0, DB_FILE_ID_LEN);
669 	fidset = memcmp(nullfid, dbp->fileid, DB_FILE_ID_LEN);
670 	if (fidset)
671 		(void)__memp_set_fileid(mpf, dbp->fileid);
672 
673 	(void)__memp_set_clear_len(mpf, clear_len);
674 	(void)__memp_set_ftype(mpf, ftype);
675 	(void)__memp_set_lsn_offset(mpf, lsn_off);
676 
677 	pginfo.db_pagesize = dbp->pgsize;
678 	pginfo.flags =
679 	    F_ISSET(dbp, (DB_AM_CHKSUM | DB_AM_ENCRYPT | DB_AM_SWAP));
680 	pginfo.type = dbp->type;
681 	pgcookie.data = &pginfo;
682 	pgcookie.size = sizeof(DB_PGINFO);
683 	(void)__memp_set_pgcookie(mpf, &pgcookie);
684 
685 #ifndef DIAG_MVCC
686 	if (F_ISSET(env->dbenv, DB_ENV_MULTIVERSION))
687 #endif
688 		if (F_ISSET(dbp, DB_AM_TXN) &&
689 		    dbp->type != DB_QUEUE && dbp->type != DB_UNKNOWN)
690 			LF_SET(DB_MULTIVERSION);
691 
692 	if ((ret = __memp_fopen(mpf, NULL, fname, &dbp->dirname,
693 	    LF_ISSET(DB_CREATE | DB_DURABLE_UNKNOWN | DB_MULTIVERSION |
694 		DB_NOMMAP | DB_ODDFILESIZE | DB_RDONLY | DB_TRUNCATE) |
695 	    (F_ISSET(env->dbenv, DB_ENV_DIRECT_DB) ? DB_DIRECT : 0) |
696 	    (F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_TXN_NOT_DURABLE : 0),
697 	    0, dbp->pgsize)) != 0) {
698 		/*
699 		 * The open didn't work; we need to reset the mpf,
700 		 * retaining the in-memory semantics (if any).
701 		 */
702 		(void)__memp_fclose(dbp->mpf, 0);
703 		(void)__memp_fcreate(env, &dbp->mpf);
704 		if (F_ISSET(dbp, DB_AM_INMEM))
705 			MAKE_INMEM(dbp);
706 		return (ret);
707 	}
708 
709 	/*
710 	 * Set the open flag.  We use it to mean that the dbp has gone
711 	 * through mpf setup, including dbreg_register.  Also, below,
712 	 * the underlying access method open functions may want to do
713 	 * things like acquire cursors, so the open flag has to be set
714 	 * before calling them.
715 	 */
716 	F_SET(dbp, DB_AM_OPEN_CALLED);
717 	if (!fidset && fname != NULL) {
718 		(void)__memp_get_fileid(dbp->mpf, dbp->fileid);
719 		dbp->preserve_fid = 1;
720 	}
721 
722 	return (0);
723 }
724 
725 /*
726  * __db_close --
727  *	DB->close method.
728  *
729  * PUBLIC: int __db_close __P((DB *, DB_TXN *, u_int32_t));
730  */
731 int
__db_close(dbp,txn,flags)732 __db_close(dbp, txn, flags)
733 	DB *dbp;
734 	DB_TXN *txn;
735 	u_int32_t flags;
736 {
737 	ENV *env;
738 	int db_ref, deferred_close, ret, t_ret;
739 
740 	env = dbp->env;
741 	deferred_close = 0;
742 
743 	PERFMON4(env, db, close,
744 	    dbp->fname, dbp->dname, flags, &dbp->fileid[0]);
745 
746 	/* Refresh the structure and close any underlying resources. */
747 	ret = __db_refresh(dbp, txn, flags, &deferred_close, 0);
748 
749 	/*
750 	 * If we've deferred the close because the logging of the close failed,
751 	 * return our failure right away without destroying the handle.
752 	 */
753 	if (deferred_close)
754 		return (ret);
755 
756 	/* !!!
757 	 * This code has an apparent race between the moment we read and
758 	 * decrement env->db_ref and the moment we check whether it's 0.
759 	 * However, if the environment is DBLOCAL, the user shouldn't have a
760 	 * reference to the env handle anyway;  the only way we can get
761 	 * multiple dbps sharing a local env is if we open them internally
762 	 * during something like a subdatabase open.  If any such thing is
763 	 * going on while the user is closing the original dbp with a local
764 	 * env, someone's already badly screwed up, so there's no reason
765 	 * to bother engineering around this possibility.
766 	 */
767 	MUTEX_LOCK(env, env->mtx_dblist);
768 	db_ref = --env->db_ref;
769 	MUTEX_UNLOCK(env, env->mtx_dblist);
770 	if (F_ISSET(env, ENV_DBLOCAL) && db_ref == 0 &&
771 	    (t_ret = __env_close(env->dbenv, 0)) != 0 && ret == 0)
772 		ret = t_ret;
773 
774 	/* Free the database handle. */
775 	memset(dbp, CLEAR_BYTE, sizeof(*dbp));
776 	__os_free(env, dbp);
777 
778 	return (ret);
779 }
780 
781 /*
782  * __db_refresh --
783  *	Refresh the DB structure, releasing any allocated resources.
784  * This does most of the work of closing files now because refresh
785  * is what is used during abort processing (since we can't destroy
786  * the actual handle) and during abort processing, we may have a
787  * fully opened handle.
788  *
789  * PUBLIC: int __db_refresh __P((DB *, DB_TXN *, u_int32_t, int *, int));
790  */
791 int
__db_refresh(dbp,txn,flags,deferred_closep,reuse)792 __db_refresh(dbp, txn, flags, deferred_closep, reuse)
793 	DB *dbp;
794 	DB_TXN *txn;
795 	u_int32_t flags;
796 	int *deferred_closep, reuse;
797 {
798 	DB *sdbp;
799 	DBC *dbc;
800 	DB_FOREIGN_INFO *f_info, *tmp;
801 	DB_LOCKER *locker;
802 	DB_LOCKREQ lreq;
803 	ENV *env;
804 	REGENV *renv;
805 	REGINFO *infop;
806 	u_int32_t save_flags;
807 	int resync, ret, t_ret;
808 
809 	ret = 0;
810 
811 	env = dbp->env;
812 	infop = env->reginfo;
813 	if (infop != NULL)
814 		renv = infop->primary;
815 	else
816 		renv = NULL;
817 
818 	/*
819 	 * If this dbp is not completely open, avoid trapping by trying to
820 	 * sync without an mpool file.
821 	 */
822 	if (dbp->mpf == NULL)
823 		LF_SET(DB_NOSYNC);
824 
825 	/* If never opened, or not currently open, it's easy. */
826 	if (!F_ISSET(dbp, DB_AM_OPEN_CALLED))
827 		goto never_opened;
828 
829 	/*
830 	 * If we have any secondary indices, disassociate them from us.
831 	 * We don't bother with the mutex here;  it only protects some
832 	 * of the ops that will make us core-dump mid-close anyway, and
833 	 * if you're trying to do something with a secondary *while* you're
834 	 * closing the primary, you deserve what you get.  The disassociation
835 	 * is mostly done just so we can close primaries and secondaries in
836 	 * any order--but within one thread of control.
837 	 */
838 	LIST_FOREACH(sdbp, &dbp->s_secondaries, s_links) {
839 		LIST_REMOVE(sdbp, s_links);
840 		if ((t_ret = __db_disassociate(sdbp)) != 0 && ret == 0)
841 			ret = t_ret;
842 	}
843 	if (F_ISSET(dbp, DB_AM_SECONDARY))
844 		LIST_REMOVE(dbp, s_links);
845 
846 	/*
847 	 * Disassociate ourself from any databases using us as a foreign key
848 	 * database by clearing the referring db's pointer.  Reclaim memory.
849 	 */
850 	f_info = LIST_FIRST(&dbp->f_primaries);
851 	while (f_info != NULL) {
852 		tmp = LIST_NEXT(f_info, f_links);
853 		LIST_REMOVE(f_info, f_links);
854 		f_info->dbp->s_foreign = NULL;
855 		__os_free(env, f_info);
856 		f_info = tmp;
857 	}
858 
859 	if (dbp->s_foreign != NULL &&
860 	    (t_ret = __db_disassociate_foreign(dbp)) != 0 && ret == 0)
861 		ret = t_ret;
862 
863 	/*
864 	 * Sync the underlying access method.  Do before closing the cursors
865 	 * because DB->sync allocates cursors in order to write Recno backing
866 	 * source text files.
867 	 *
868 	 * Sync is slow on some systems, notably Solaris filesystems where the
869 	 * entire buffer cache is searched.  If we're in recovery, don't flush
870 	 * the file, it's not necessary.
871 	 */
872 	if (!LF_ISSET(DB_NOSYNC) &&
873 	    !F_ISSET(dbp, DB_AM_DISCARD | DB_AM_RECOVER) &&
874 	    (t_ret = __db_sync(dbp)) != 0 && ret == 0)
875 		ret = t_ret;
876 
877 	/*
878 	 * Go through the active cursors, unregister each cursor from its
879 	 * transaction if any, and call the cursor recycle routine,
880 	 * which resolves pending operations and moves the cursors onto the
881 	 * free list.  Then, walk the free list and call the cursor destroy
882 	 * routine.  Note that any failure on a close is considered "really
883 	 * bad" and we just break out of the loop and force forward.
884 	 */
885 	resync = TAILQ_FIRST(&dbp->active_queue) == NULL ? 0 : 1;
886 	while ((dbc = TAILQ_FIRST(&dbp->active_queue)) != NULL) {
887 		if (dbc->txn != NULL)
888 			TAILQ_REMOVE(&(dbc->txn->my_cursors), dbc, txn_cursors);
889 
890 		if ((t_ret = __dbc_close(dbc)) != 0) {
891 			if (ret == 0)
892 				ret = t_ret;
893 			break;
894 		}
895 	}
896 
897 	while ((dbc = TAILQ_FIRST(&dbp->free_queue)) != NULL)
898 		if ((t_ret = __dbc_destroy(dbc)) != 0) {
899 			if (ret == 0)
900 				ret = t_ret;
901 			break;
902 		}
903 
904 	/*
905 	 * Close any outstanding join cursors.  Join cursors destroy themselves
906 	 * on close and have no separate destroy routine.  We don't have to set
907 	 * the resync flag here, because join cursors aren't write cursors.
908 	 */
909 	while ((dbc = TAILQ_FIRST(&dbp->join_queue)) != NULL)
910 		if ((t_ret = __db_join_close(dbc)) != 0) {
911 			if (ret == 0)
912 				ret = t_ret;
913 			break;
914 		}
915 
916 	/*
917 	 * Sync the memory pool, even though we've already called DB->sync,
918 	 * because closing cursors can dirty pages by deleting items they
919 	 * referenced.
920 	 *
921 	 * Sync is slow on some systems, notably Solaris filesystems where the
922 	 * entire buffer cache is searched.  If we're in recovery, don't flush
923 	 * the file, it's not necessary.
924 	 */
925 	if (resync && !LF_ISSET(DB_NOSYNC) &&
926 	    !F_ISSET(dbp, DB_AM_DISCARD | DB_AM_RECOVER) &&
927 	    (t_ret = __memp_fsync(dbp->mpf)) != 0 && ret == 0)
928 		ret = t_ret;
929 
930 	/*
931 	 * If there is a file extension watermark associated with this
932 	 * database, we don't need it any more.
933 	 */
934 	__txn_remove_fe_watermark(txn, dbp);
935 
936 never_opened:
937 	MUTEX_LOCK(env, env->mtx_dblist);
938 	/*
939 	 * At this point, we haven't done anything to render the DB handle
940 	 * unusable, at least by a transaction abort.  Take the opportunity
941 	 * now to log the file close if we have initialized the logging
942 	 * information.  If this log fails and we're in a transaction,
943 	 * we have to bail out of the attempted close; we'll need a dbp in
944 	 * order to successfully abort the transaction, and we can't conjure
945 	 * a new one up because we haven't gotten out the dbreg_register
946 	 * record that represents the close.  In this case, we put off
947 	 * actually closing the dbp until we've performed the abort.
948 	 */
949 	if (!reuse && LOGGING_ON(dbp->env) && dbp->log_filename != NULL) {
950 		/*
951 		 * Discard the log file id, if any.  We want to log the close
952 		 * if and only if this is not a recovery dbp or a client dbp,
953 		 * or a dead dbp handle.
954 		 */
955 		DB_ASSERT(env, renv != NULL);
956 		if (F_ISSET(dbp, DB_AM_RECOVER) || IS_REP_CLIENT(env) ||
957 		    dbp->timestamp != renv->rep_timestamp) {
958 			if ((t_ret = __dbreg_revoke_id(dbp,
959 			    0, DB_LOGFILEID_INVALID)) == 0 && ret == 0)
960 				ret = t_ret;
961 			if ((t_ret = __dbreg_teardown(dbp)) != 0 && ret == 0)
962 				ret = t_ret;
963 		} else {
964 			if ((t_ret = __dbreg_close_id(dbp,
965 			    txn, DBREG_CLOSE)) != 0 && txn != NULL) {
966 				MUTEX_UNLOCK(env, env->mtx_dblist);
967 				/*
968 				 * We're in a txn and the attempt to log the
969 				 * close failed;  let the txn subsystem know
970 				 * that we need to destroy this dbp once we're
971 				 * done with the abort, then bail from the
972 				 * close.
973 				 *
974 				 * Note that if the attempt to put off the
975 				 * close -also- fails--which it won't unless
976 				 * we're out of heap memory--we're really
977 				 * screwed.  Panic.
978 				 */
979 				if ((ret =
980 				    __txn_closeevent(env, txn, dbp)) != 0)
981 					return (__env_panic(env, ret));
982 				if (deferred_closep != NULL)
983 					*deferred_closep = 1;
984 				return (t_ret);
985 			}
986 			/*
987 			 * If dbreg_close_id failed and we were not in a
988 			 * transaction, then we need to finish this close
989 			 * because the caller can't do anything with the
990 			 * handle after we return an error.  We rely on
991 			 * dbreg_close_id to mark the entry in some manner
992 			 * so that we do not do a clean shutdown of this
993 			 * environment.  If shutdown isn't clean, then the
994 			 * application *must* run recovery and that will
995 			 * generate the RCLOSE record.
996 			 */
997 		}
998 
999 	}
1000 
1001 	/* Close any handle we've been holding since the open.  */
1002 	if (dbp->saved_open_fhp != NULL &&
1003 	    (t_ret = __os_closehandle(env, dbp->saved_open_fhp)) != 0 &&
1004 	    ret == 0)
1005 		ret = t_ret;
1006 
1007 	/*
1008 	 * Remove this DB handle from the ENV's dblist, if it's been added.
1009 	 *
1010 	 * Close our reference to the underlying cache while locked, we don't
1011 	 * want to race with a thread searching for our underlying cache link
1012 	 * while opening a DB handle.
1013 	 *
1014 	 * The DB handle may not yet have been added to the ENV list, don't
1015 	 * blindly call the underlying TAILQ_REMOVE macro.  Explicitly reset
1016 	 * the field values to NULL so that we can't call TAILQ_REMOVE twice.
1017 	 */
1018 	if (!reuse &&
1019 	    (dbp->dblistlinks.tqe_next != NULL ||
1020 	    dbp->dblistlinks.tqe_prev != NULL)) {
1021 		TAILQ_REMOVE(&env->dblist, dbp, dblistlinks);
1022 		dbp->dblistlinks.tqe_next = NULL;
1023 		dbp->dblistlinks.tqe_prev = NULL;
1024 	}
1025 
1026 	/* Close the memory pool file handle. */
1027 	if (dbp->mpf != NULL) {
1028 		if ((t_ret = __memp_fclose(dbp->mpf,
1029 		    F_ISSET(dbp, DB_AM_DISCARD) ? DB_MPOOL_DISCARD : 0)) != 0 &&
1030 		    ret == 0)
1031 			ret = t_ret;
1032 		dbp->mpf = NULL;
1033 		if (reuse &&
1034 		    (t_ret = __memp_fcreate(env, &dbp->mpf)) != 0 &&
1035 		    ret == 0)
1036 			ret = t_ret;
1037 	}
1038 
1039 	MUTEX_UNLOCK(env, env->mtx_dblist);
1040 
1041 	/*
1042 	 * Call the access specific close function.
1043 	 *
1044 	 * We do this here rather than in __db_close as we need to do this when
1045 	 * aborting an open so that file descriptors are closed and abort of
1046 	 * renames can succeed on platforms that lock open files (such as
1047 	 * Windows).  In particular, we need to ensure that all the extents
1048 	 * associated with a queue are closed so that queue renames can be
1049 	 * aborted.
1050 	 *
1051 	 * It is also important that we do this before releasing the handle
1052 	 * lock, because dbremove and dbrename assume that once they have the
1053 	 * handle lock, it is safe to modify the underlying file(s).
1054 	 *
1055 	 * !!!
1056 	 * Because of where these functions are called in the DB handle close
1057 	 * process, these routines can't do anything that would dirty pages or
1058 	 * otherwise affect closing down the database.  Specifically, we can't
1059 	 * abort and recover any of the information they control.
1060 	 */
1061 #ifdef HAVE_PARTITION
1062 	if (dbp->p_internal != NULL &&
1063 	    (t_ret = __partition_close(dbp, txn, flags)) != 0 && ret == 0)
1064 		ret = t_ret;
1065 #endif
1066 	if ((t_ret = __bam_db_close(dbp)) != 0 && ret == 0)
1067 		ret = t_ret;
1068 	if ((t_ret = __ham_db_close(dbp)) != 0 && ret == 0)
1069 		ret = t_ret;
1070 	if ((t_ret = __heap_db_close(dbp)) != 0 && ret == 0)
1071 		ret = t_ret;
1072 	if ((t_ret = __qam_db_close(dbp, dbp->flags)) != 0 && ret == 0)
1073 		ret = t_ret;
1074 
1075 	/*
1076 	 * !!!
1077 	 * At this point, the access-method specific information has been
1078 	 * freed.  From now on, we can use the dbp, but not touch any
1079 	 * access-method specific data.
1080 	 */
1081 
1082 	if (!reuse && dbp->locker != NULL) {
1083 		/* We may have pending trade operations on this dbp. */
1084 		if (txn == NULL)
1085 			txn = dbp->cur_txn;
1086 		if (IS_REAL_TXN(txn))
1087 			__txn_remlock(env,
1088 			     txn, &dbp->handle_lock, dbp->locker);
1089 
1090 		/* We may be holding the handle lock; release it. */
1091 		lreq.op = DB_LOCK_PUT_ALL;
1092 		lreq.obj = NULL;
1093 		if ((t_ret = __lock_vec(env,
1094 		    dbp->locker, 0, &lreq, 1, NULL)) != 0 && ret == 0)
1095 			ret = t_ret;
1096 
1097 		if ((t_ret =
1098 		     __lock_id_free(env, dbp->locker)) != 0 && ret == 0)
1099 			ret = t_ret;
1100 		dbp->locker = NULL;
1101 		LOCK_INIT(dbp->handle_lock);
1102 	}
1103 
1104 	/*
1105 	 * If this is a temporary file (un-named in-memory file), then
1106 	 * discard the locker ID allocated as the fileid.
1107 	 */
1108 	if (LOCKING_ON(env) &&
1109 	    F_ISSET(dbp, DB_AM_INMEM) && !dbp->preserve_fid &&
1110 	    *(u_int32_t *)dbp->fileid != DB_LOCK_INVALIDID) {
1111 		if ((t_ret = __lock_getlocker(env->lk_handle,
1112 		     *(u_int32_t *)dbp->fileid, 0, &locker)) == 0)
1113 			t_ret = __lock_id_free(env, locker);
1114 		if (ret == 0)
1115 			ret = t_ret;
1116 	}
1117 
1118 	if (reuse) {
1119 		/*
1120 		 * If we are reusing this dbp, then we're done now. Re-init
1121 		 * the handle, preserving important flags, and then return.
1122 		 * This code is borrowed from __db_init, which does more
1123 		 * than we can do here.
1124 		 */
1125 		save_flags = F_ISSET(dbp, DB_AM_INMEM |
1126 		    DB_AM_RDONLY | DB_AM_TXN);
1127 
1128 		if ((ret = __bam_db_create(dbp)) != 0)
1129 			return (ret);
1130 		if ((ret = __ham_db_create(dbp)) != 0)
1131 			return (ret);
1132 		if ((ret = __heap_db_create(dbp)) != 0)
1133 			return (ret);
1134 		if ((ret = __qam_db_create(dbp)) != 0)
1135 			return (ret);
1136 
1137 		/* Restore flags */
1138 		dbp->flags = dbp->orig_flags | save_flags;
1139 
1140 		if (FLD_ISSET(save_flags, DB_AM_INMEM)) {
1141 			/*
1142 			 * If this is inmem, then it may have a fileid
1143 			 * even if it was never opened, and we need to
1144 			 * clear out that fileid.
1145 			 */
1146 			memset(dbp->fileid, 0, sizeof(dbp->fileid));
1147 			MAKE_INMEM(dbp);
1148 		}
1149 		return (ret);
1150 	}
1151 
1152 	dbp->type = DB_UNKNOWN;
1153 
1154 	/*
1155 	 * The thread mutex may have been invalidated in __dbreg_close_id if the
1156 	 * fname refcount did not go to 0. If not, discard the thread mutex.
1157 	 */
1158 	if ((t_ret = __mutex_free(env, &dbp->mutex)) != 0 && ret == 0)
1159 		ret = t_ret;
1160 
1161 	/* Discard any memory allocated for the file and database names. */
1162 	if (dbp->fname != NULL) {
1163 		__os_free(dbp->env, dbp->fname);
1164 		dbp->fname = NULL;
1165 	}
1166 	if (dbp->dname != NULL) {
1167 		__os_free(dbp->env, dbp->dname);
1168 		dbp->dname = NULL;
1169 	}
1170 
1171 	/* Discard any memory used to store returned data. */
1172 	if (dbp->my_rskey.data != NULL)
1173 		__os_free(dbp->env, dbp->my_rskey.data);
1174 	if (dbp->my_rkey.data != NULL)
1175 		__os_free(dbp->env, dbp->my_rkey.data);
1176 	if (dbp->my_rdata.data != NULL)
1177 		__os_free(dbp->env, dbp->my_rdata.data);
1178 
1179 	/* For safety's sake;  we may refresh twice. */
1180 	memset(&dbp->my_rskey, 0, sizeof(DBT));
1181 	memset(&dbp->my_rkey, 0, sizeof(DBT));
1182 	memset(&dbp->my_rdata, 0, sizeof(DBT));
1183 
1184 	/* Clear out fields that normally get set during open. */
1185 	memset(dbp->fileid, 0, sizeof(dbp->fileid));
1186 	dbp->adj_fileid = 0;
1187 	dbp->meta_pgno = 0;
1188 	dbp->cur_locker = NULL;
1189 	dbp->cur_txn = NULL;
1190 	dbp->associate_locker = NULL;
1191 	dbp->open_flags = 0;
1192 
1193 	/*
1194 	 * If we are being refreshed with a txn specified, then we need
1195 	 * to make sure that we clear out the lock handle field, because
1196 	 * releasing all the locks for this transaction will release this
1197 	 * lock and we don't want close to stumble upon this handle and
1198 	 * try to close it.
1199 	 */
1200 	if (txn != NULL)
1201 		LOCK_INIT(dbp->handle_lock);
1202 
1203 	/* Reset flags to whatever the user configured. */
1204 	dbp->flags = dbp->orig_flags;
1205 
1206 	return (ret);
1207 }
1208 
1209 /*
1210  * __db_disassociate --
1211  *	Destroy the association between a given secondary and its primary.
1212  */
1213 static int
__db_disassociate(sdbp)1214 __db_disassociate(sdbp)
1215 	DB *sdbp;
1216 {
1217 	DBC *dbc;
1218 	int ret, t_ret;
1219 
1220 	ret = 0;
1221 
1222 	sdbp->s_callback = NULL;
1223 	sdbp->s_primary = NULL;
1224 	sdbp->get = sdbp->stored_get;
1225 	sdbp->close = sdbp->stored_close;
1226 
1227 	/*
1228 	 * Complain, but proceed, if we have any active cursors.  (We're in
1229 	 * the middle of a close, so there's really no turning back.)
1230 	 */
1231 	if (sdbp->s_refcnt != 1 ||
1232 	    TAILQ_FIRST(&sdbp->active_queue) != NULL ||
1233 	    TAILQ_FIRST(&sdbp->join_queue) != NULL) {
1234 		__db_errx(sdbp->env, DB_STR("0674",
1235 "Closing a primary DB while a secondary DB has active cursors is unsafe"));
1236 		ret = EINVAL;
1237 	}
1238 	sdbp->s_refcnt = 0;
1239 
1240 	while ((dbc = TAILQ_FIRST(&sdbp->free_queue)) != NULL)
1241 		if ((t_ret = __dbc_destroy(dbc)) != 0 && ret == 0)
1242 			ret = t_ret;
1243 
1244 	F_CLR(sdbp, DB_AM_SECONDARY);
1245 	return (ret);
1246 }
1247 
1248 /*
1249  * __db_disassociate_foreign --
1250  *     Destroy the association between a given secondary and its foreign.
1251  */
1252 static int
__db_disassociate_foreign(sdbp)1253 __db_disassociate_foreign(sdbp)
1254 	DB *sdbp;
1255 {
1256 	DB *fdbp;
1257 	DB_FOREIGN_INFO *f_info, *tmp;
1258 	int ret;
1259 
1260 	if (sdbp->s_foreign == NULL)
1261 		return (0);
1262 	if ((ret = __os_malloc(sdbp->env, sizeof(DB_FOREIGN_INFO), &tmp)) != 0)
1263 		return (ret);
1264 
1265 	fdbp = sdbp->s_foreign;
1266 	ret = 0;
1267 	f_info = LIST_FIRST(&fdbp->f_primaries);
1268 	while (f_info != NULL) {
1269 		tmp = LIST_NEXT(f_info, f_links);
1270 		if (f_info ->dbp == sdbp) {
1271 			LIST_REMOVE(f_info, f_links);
1272 			__os_free(sdbp->env, f_info);
1273 		}
1274 		f_info = tmp;
1275 	}
1276 
1277 	return (ret);
1278 }
1279 
1280 /*
1281  * __db_log_page
1282  *	Log a meta-data or root page during a subdatabase create operation.
1283  *
1284  * PUBLIC: int __db_log_page __P((DB *, DB_TXN *, DB_LSN *, db_pgno_t, PAGE *));
1285  */
1286 int
__db_log_page(dbp,txn,lsn,pgno,page)1287 __db_log_page(dbp, txn, lsn, pgno, page)
1288 	DB *dbp;
1289 	DB_TXN *txn;
1290 	DB_LSN *lsn;
1291 	db_pgno_t pgno;
1292 	PAGE *page;
1293 {
1294 	DBT page_dbt;
1295 	DB_LSN new_lsn;
1296 	int ret;
1297 
1298 	if (!LOGGING_ON(dbp->env) || txn == NULL)
1299 		return (0);
1300 
1301 	memset(&page_dbt, 0, sizeof(page_dbt));
1302 	page_dbt.size = dbp->pgsize;
1303 	page_dbt.data = page;
1304 
1305 	ret = __crdel_metasub_log(dbp, txn, &new_lsn, F_ISSET(dbp,
1306 	    DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0, pgno, &page_dbt, lsn);
1307 
1308 	if (ret == 0)
1309 		page->lsn = new_lsn;
1310 	return (ret);
1311 }
1312 
1313 /*
1314  * __db_walk_cursors
1315  *	Walk all cursors for a database.
1316  *
1317  * PUBLIC: int __db_walk_cursors __P((DB *, DBC *,
1318  * PUBLIC:	int (*) __P((DBC *, DBC *,
1319  * PUBLIC:      u_int32_t *, db_pgno_t, u_int32_t, void *)),
1320  * PUBLIC:      u_int32_t *, db_pgno_t, u_int32_t, void *));
1321  */
1322  int
__db_walk_cursors(dbp,my_dbc,func,countp,pgno,indx,args)1323  __db_walk_cursors(dbp, my_dbc, func, countp, pgno, indx, args)
1324 	DB *dbp;
1325 	DBC *my_dbc;
1326 	int (*func)__P((DBC *, DBC *,
1327 	    u_int32_t *, db_pgno_t, u_int32_t, void *));
1328 	u_int32_t *countp;
1329 	db_pgno_t pgno;
1330 	u_int32_t indx;
1331 	void *args;
1332 {
1333 	ENV *env;
1334 	DB *ldbp;
1335 	DBC *dbc;
1336 	int ret;
1337 
1338 	env = dbp->env;
1339 	ret = 0;
1340 
1341 	MUTEX_LOCK(env, env->mtx_dblist);
1342 	FIND_FIRST_DB_MATCH(env, dbp, ldbp);
1343 	for (*countp = 0;
1344 	    ldbp != NULL && ldbp->adj_fileid == dbp->adj_fileid;
1345 	    ldbp = TAILQ_NEXT(ldbp, dblistlinks)) {
1346 loop:		MUTEX_LOCK(env, ldbp->mutex);
1347 		TAILQ_FOREACH(dbc, &ldbp->active_queue, links)
1348 			if ((ret = (func)(dbc, my_dbc,
1349 			    countp, pgno, indx, args)) != 0)
1350 				break;
1351 		/*
1352 		 * We use the error to communicate that function
1353 		 * dropped the mutex.
1354 		 */
1355 		if (ret == DB_LOCK_NOTGRANTED)
1356 			goto loop;
1357 		MUTEX_UNLOCK(env, ldbp->mutex);
1358 		if (ret != 0)
1359 			break;
1360 	}
1361 	MUTEX_UNLOCK(env, env->mtx_dblist);
1362 	return (ret);
1363 }
1364 
1365 /*
1366  * __db_backup_name
1367  *	Create the backup file name for a given file.
1368  *
1369  * PUBLIC: int __db_backup_name __P((ENV *,
1370  * PUBLIC:     const char *, DB_TXN *, char **));
1371  */
1372 #undef	BACKUP_PREFIX
1373 #define	BACKUP_PREFIX	"__db."
1374 
1375 #undef	MAX_INT_TO_HEX
1376 #define	MAX_INT_TO_HEX	8
1377 
1378 int
__db_backup_name(env,name,txn,backup)1379 __db_backup_name(env, name, txn, backup)
1380 	ENV *env;
1381 	const char *name;
1382 	DB_TXN *txn;
1383 	char **backup;
1384 {
1385 	u_int32_t id;
1386 	size_t len;
1387 	int ret;
1388 	char *p, *retp;
1389 
1390 	*backup = NULL;
1391 
1392 	/*
1393 	 * Part of the name may be a full path, so we need to make sure that
1394 	 * we allocate enough space for it, even in the case where we don't
1395 	 * use the entire filename for the backup name.
1396 	 */
1397 	len = strlen(name) + strlen(BACKUP_PREFIX) + 2 * MAX_INT_TO_HEX + 1;
1398 	if ((ret = __os_malloc(env, len, &retp)) != 0)
1399 		return (ret);
1400 
1401 	/*
1402 	 * Create the name.  Backup file names are in one of 2 forms: in a
1403 	 * transactional env "__db.TXNID.ID", where ID is a random number,
1404 	 * and in any other env "__db.FILENAME".
1405 	 *
1406 	 * In addition, the name passed may contain an env-relative path.
1407 	 * In that case, put the "__db." in the right place (in the last
1408 	 * component of the pathname).
1409 	 *
1410 	 * There are four cases here:
1411 	 *	1. simple path w/out transaction
1412 	 *	2. simple path + transaction
1413 	 *	3. multi-component path w/out transaction
1414 	 *	4. multi-component path + transaction
1415 	 */
1416 	p = __db_rpath(name);
1417 	if (IS_REAL_TXN(txn)) {
1418 		__os_unique_id(env, &id);
1419 		if (p == NULL)				/* Case 2. */
1420 			snprintf(retp, len, "%s%x.%x",
1421 			    BACKUP_PREFIX, txn->txnid, id);
1422 		else					/* Case 4. */
1423 			snprintf(retp, len, "%.*s%x.%x",
1424 			    (int)(p - name) + 1, name, txn->txnid, id);
1425 	} else {
1426 		if (p == NULL)				/* Case 1. */
1427 			snprintf(retp, len, "%s%s", BACKUP_PREFIX, name);
1428 		else					/* Case 3. */
1429 			snprintf(retp, len, "%.*s%s%s",
1430 			    (int)(p - name) + 1, name, BACKUP_PREFIX, p + 1);
1431 	}
1432 
1433 	*backup = retp;
1434 	return (0);
1435 }
1436 
1437 #ifdef CONFIG_TEST
1438 /*
1439  * __db_testcopy
1440  *	Create a copy of all backup files and our "main" DB.
1441  *
1442  * PUBLIC: #ifdef CONFIG_TEST
1443  * PUBLIC: int __db_testcopy __P((ENV *, DB *, const char *));
1444  * PUBLIC: #endif
1445  */
1446 int
__db_testcopy(env,dbp,name)1447 __db_testcopy(env, dbp, name)
1448 	ENV *env;
1449 	DB *dbp;
1450 	const char *name;
1451 {
1452 	DB_MPOOL *dbmp;
1453 	DB_MPOOLFILE *mpf;
1454 
1455 	DB_ASSERT(env, dbp != NULL || name != NULL);
1456 
1457 	if (name == NULL) {
1458 		dbmp = env->mp_handle;
1459 		mpf = dbp->mpf;
1460 		name = R_ADDR(dbmp->reginfo, mpf->mfp->path_off);
1461 	}
1462 
1463 	if (dbp != NULL && dbp->type == DB_QUEUE)
1464 		return (__qam_testdocopy(dbp, name));
1465 	else
1466 #ifdef HAVE_PARTITION
1467 	if (dbp != NULL && DB_IS_PARTITIONED(dbp))
1468 		return (__part_testdocopy(dbp, name));
1469 	else
1470 #endif
1471 		return (__db_testdocopy(env, name));
1472 }
1473 
1474 static int
__qam_testdocopy(dbp,name)1475 __qam_testdocopy(dbp, name)
1476 	DB *dbp;
1477 	const char *name;
1478 {
1479 	DB_THREAD_INFO *ip;
1480 	QUEUE_FILELIST *filelist, *fp;
1481 	int ret;
1482 	char buf[DB_MAXPATHLEN], *dir;
1483 
1484 	filelist = NULL;
1485 	if ((ret = __db_testdocopy(dbp->env, name)) != 0)
1486 		return (ret);
1487 
1488 	/* Call ENV_GET_THREAD_INFO to get a valid DB_THREAD_INFO */
1489 	ENV_GET_THREAD_INFO(dbp->env, ip);
1490 	if (dbp->mpf != NULL &&
1491 	    (ret = __qam_gen_filelist(dbp, ip, &filelist)) != 0)
1492 		goto done;
1493 
1494 	if (filelist == NULL)
1495 		return (0);
1496 	dir = ((QUEUE *)dbp->q_internal)->dir;
1497 	for (fp = filelist; fp->mpf != NULL; fp++) {
1498 		snprintf(buf, sizeof(buf),
1499 		    QUEUE_EXTENT, dir, PATH_SEPARATOR[0], name, fp->id);
1500 		if ((ret = __db_testdocopy(dbp->env, buf)) != 0)
1501 			return (ret);
1502 	}
1503 
1504 done:	__os_free(dbp->env, filelist);
1505 	return (0);
1506 }
1507 
1508 /*
1509  * __db_testdocopy
1510  *	Create a copy of all backup files and our "main" DB.
1511  * PUBLIC: int __db_testdocopy __P((ENV *, const char *));
1512  */
1513 int
__db_testdocopy(env,name)1514 __db_testdocopy(env, name)
1515 	ENV *env;
1516 	const char *name;
1517 {
1518 	size_t len;
1519 	int dircnt, i, ret;
1520 	char *copy, **namesp, *p, *real_name;
1521 
1522 	dircnt = 0;
1523 	copy = NULL;
1524 	namesp = NULL;
1525 
1526 	/* Create the real backing file name. */
1527 	if ((ret = __db_appname(env,
1528 	    DB_APP_DATA, name, NULL, &real_name)) != 0)
1529 		return (ret);
1530 
1531 	/*
1532 	 * !!!
1533 	 * There are tests that attempt to copy non-existent files.  I'd guess
1534 	 * it's a testing bug, but I don't have time to figure it out.  Block
1535 	 * the case here.
1536 	 */
1537 	if (__os_exists(env, real_name, NULL) != 0) {
1538 		__os_free(env, real_name);
1539 		return (0);
1540 	}
1541 
1542 	/*
1543 	 * Copy the file itself.
1544 	 *
1545 	 * Allocate space for the file name, including adding an ".afterop" and
1546 	 * trailing nul byte.
1547 	 */
1548 	len = strlen(real_name) + sizeof(".afterop");
1549 	if ((ret = __os_malloc(env, len, &copy)) != 0)
1550 		goto err;
1551 	snprintf(copy, len, "%s.afterop", real_name);
1552 	if ((ret = __db_makecopy(env, real_name, copy)) != 0)
1553 		goto err;
1554 
1555 	/*
1556 	 * Get the directory path to call __os_dirlist().
1557 	 */
1558 	if ((p = __db_rpath(real_name)) != NULL)
1559 		*p = '\0';
1560 	if ((ret = __os_dirlist(env, real_name, 0, &namesp, &dircnt)) != 0)
1561 		goto err;
1562 
1563 	/*
1564 	 * Walk the directory looking for backup files.  Backup file names in
1565 	 * transactional environments are of the form:
1566 	 *
1567 	 *	BACKUP_PREFIX.TXNID.ID
1568 	 */
1569 	for (i = 0; i < dircnt; i++) {
1570 		/* Check for a related backup file name. */
1571 		if (strncmp(
1572 		    namesp[i], BACKUP_PREFIX, sizeof(BACKUP_PREFIX) - 1) != 0)
1573 			continue;
1574 		p = namesp[i] + sizeof(BACKUP_PREFIX);
1575 		p += strspn(p, "0123456789ABCDEFabcdef");
1576 		if (*p != '.')
1577 			continue;
1578 		++p;
1579 		p += strspn(p, "0123456789ABCDEFabcdef");
1580 		if (*p != '\0')
1581 			continue;
1582 
1583 		/*
1584 		 * Copy the backup file.
1585 		 *
1586 		 * Allocate space for the file name, including adding a
1587 		 * ".afterop" and trailing nul byte.
1588 		 */
1589 		if (real_name != NULL) {
1590 			__os_free(env, real_name);
1591 			real_name = NULL;
1592 		}
1593 		if ((ret = __db_appname(env,
1594 		    DB_APP_DATA, namesp[i], NULL, &real_name)) != 0)
1595 			goto err;
1596 		if (copy != NULL) {
1597 			__os_free(env, copy);
1598 			copy = NULL;
1599 		}
1600 		len = strlen(real_name) + sizeof(".afterop");
1601 		if ((ret = __os_malloc(env, len, &copy)) != 0)
1602 			goto err;
1603 		snprintf(copy, len, "%s.afterop", real_name);
1604 		if ((ret = __db_makecopy(env, real_name, copy)) != 0)
1605 			goto err;
1606 	}
1607 
1608 err:	if (namesp != NULL)
1609 		__os_dirfree(env, namesp, dircnt);
1610 	if (copy != NULL)
1611 		__os_free(env, copy);
1612 	if (real_name != NULL)
1613 		__os_free(env, real_name);
1614 	return (ret);
1615 }
1616 
1617 static int
__db_makecopy(env,src,dest)1618 __db_makecopy(env, src, dest)
1619 	ENV *env;
1620 	const char *src, *dest;
1621 {
1622 	DB_FH *rfhp, *wfhp;
1623 	size_t rcnt, wcnt;
1624 	int ret;
1625 	char *buf;
1626 
1627 	rfhp = wfhp = NULL;
1628 
1629 	if ((ret = __os_malloc(env, 64 * 1024, &buf)) != 0)
1630 		goto err;
1631 
1632 	if ((ret = __os_open(env, src, 0,
1633 	    DB_OSO_RDONLY, DB_MODE_600, &rfhp)) != 0)
1634 		goto err;
1635 	if ((ret = __os_open(env, dest, 0,
1636 	    DB_OSO_CREATE | DB_OSO_TRUNC, DB_MODE_600, &wfhp)) != 0)
1637 		goto err;
1638 
1639 	for (;;) {
1640 		if ((ret =
1641 		    __os_read(env, rfhp, buf, sizeof(buf), &rcnt)) != 0)
1642 			goto err;
1643 		if (rcnt == 0)
1644 			break;
1645 		if ((ret =
1646 		    __os_write(env, wfhp, buf, sizeof(buf), &wcnt)) != 0)
1647 			goto err;
1648 	}
1649 
1650 	if (0) {
1651 err:		__db_err(env, ret, "__db_makecopy: %s -> %s", src, dest);
1652 	}
1653 
1654 	if (buf != NULL)
1655 		__os_free(env, buf);
1656 	if (rfhp != NULL)
1657 		(void)__os_closehandle(env, rfhp);
1658 	if (wfhp != NULL)
1659 		(void)__os_closehandle(env, wfhp);
1660 	return (ret);
1661 }
1662 #endif
1663