1 /*-
2  * See the file LICENSE for redistribution information.
3  *
4  * Copyright (c) 2001, 2013 Oracle and/or its affiliates.  All rights reserved.
5  *
6  * $Id$
7  */
8 
9 #include "db_config.h"
10 
11 #include "db_int.h"
12 #include "dbinc/db_page.h"
13 #include "dbinc/db_am.h"
14 #include "dbinc/hash.h"
15 #include "dbinc/fop.h"
16 #include "dbinc/lock.h"
17 #include "dbinc/mp.h"
18 #include "dbinc/txn.h"
19 
20 static int __fop_set_pgsize __P((DB *, DB_FH *, const char *));
21 static int __fop_inmem_create __P((DB *, const char *, DB_TXN *, u_int32_t));
22 static int __fop_inmem_dummy __P((DB *, DB_TXN *, const char *, u_int8_t *));
23 static int __fop_inmem_read_meta __P((DB *, DB_TXN *, const char *, u_int32_t,
24 	    u_int32_t));
25 static int __fop_inmem_swap __P((DB *, DB *, DB_TXN *,
26 	       const char *, const char *, const char *, DB_LOCKER *));
27 static int __fop_ondisk_dummy __P((DB *, DB_TXN *, const char *, u_int8_t *));
28 static int __fop_ondisk_swap __P((DB *, DB *, DB_TXN *,
29 	     const char *, const char *, const char *, DB_LOCKER *));
30 
/*
 * GET_ENVLOCK --
 *	Take the environment meta-data lock in write mode.  ENV is the
 *	environment, ID is the locker that will own the lock, and L points
 *	at the DB_LOCK that receives it.  Nothing happens if locking is not
 *	configured for the environment.
 *
 * The expansion assigns to a local "ret" and jumps to a local "err" label
 * when the lock request fails, so both must exist at the point of use.
 *
 * !!!
 * Locking is compiled out for Critical Path.  The application must do its
 * own synchronization of open/create; two threads creating and opening a
 * file at the same time may have unpredictable results.
 */
#ifdef CRITICALPATH_10266
#define	GET_ENVLOCK(ENV, ID, L) (0)
#else
#define	GET_ENVLOCK(ENV, ID, L) do {					\
	if (LOCKING_ON((ENV))) {					\
		DBT __envlock_obj;					\
		u_int32_t __envlock_val;				\
									\
		/* The lock object is the 4-byte value 1. */		\
		__envlock_val = 1;					\
		__envlock_obj.size = sizeof(__envlock_val);		\
		__envlock_obj.data = &__envlock_val;			\
		ret = __lock_get((ENV), (ID),				\
		    0, &__envlock_obj, DB_LOCK_WRITE, (L));		\
		if (ret != 0)						\
			goto err;					\
	}								\
} while (0)
#endif
58 
/*
 * RESET_MPF --
 *	Close the handle's current mpool file (passing F as the close flags
 *	and ignoring the result), mark the handle as not yet opened, and
 *	create a fresh mpool file handle in its place.
 *
 * The expansion assigns to a local "ret" and jumps to a local "err" label
 * if the new mpool file handle cannot be created.
 */
#define	RESET_MPF(D, F) do {						\
	(void)__memp_fclose((D)->mpf, (F));				\
	F_CLR((D), DB_AM_OPEN_CALLED);					\
	(D)->mpf = NULL;						\
	ret = __memp_fcreate((D)->env, &(D)->mpf);			\
	if (ret != 0)							\
		goto err;						\
} while (0)
66 
/*
 * CLOSE_HANDLE --
 *	Dispose of file handle F on behalf of DB handle D, then clear F.
 *
 * If we open a file handle and our caller is doing fcntl(2) locking,
 * we can't close the handle because that would discard the caller's
 * lock. Save it until we close or refresh the DB handle.
 *
 * The expansion relies on locals named "flags" (via LF_ISSET), "ret" and
 * "t_ret", and on an "err" label.  If the close fails, the first error is
 * kept in "ret" and control jumps to "err".
 *
 * F is cleared before that jump.  The error path itself invokes
 * CLOSE_HANDLE on the same variable, so leaving F set would close the
 * same handle again and, if the close keeps failing, loop forever.
 * NOTE(review): __os_closehandle is believed to release the handle
 * structure even when it reports an error -- confirm.
 */
#define	CLOSE_HANDLE(D, F) {						\
	if ((F) != NULL) {						\
		if (LF_ISSET(DB_FCNTL_LOCKING))				\
			(D)->saved_open_fhp = (F);			\
		else if ((t_ret =					\
		    __os_closehandle((D)->env, (F))) != 0) {		\
			(F) = NULL;					\
			if (ret == 0)					\
				ret = t_ret;				\
			goto err;					\
		}							\
		(F) = NULL;						\
	}								\
}
85 
86 /*
87  * __fop_lock_handle --
88  *
89  * Get the handle lock for a database.  If the envlock is specified, do this
90  * as a lock_vec call that releases the environment lock before acquiring the
91  * handle lock.
92  *
93  * PUBLIC: int __fop_lock_handle __P((ENV *,
94  * PUBLIC:     DB *, DB_LOCKER *, db_lockmode_t, DB_LOCK *, u_int32_t));
95  *
96  */
97 int
__fop_lock_handle(env,dbp,locker,mode,elockp,flags)98 __fop_lock_handle(env, dbp, locker, mode, elockp, flags)
99 	ENV *env;
100 	DB *dbp;
101 	DB_LOCKER *locker;
102 	db_lockmode_t mode;
103 	DB_LOCK *elockp;
104 	u_int32_t flags;
105 {
106 	DBT fileobj;
107 	DB_LOCKREQ reqs[2], *ereq;
108 	DB_LOCK_ILOCK lock_desc;
109 	int ret;
110 
111 	if (!LOCKING_ON(env) ||
112 	    F_ISSET(dbp, DB_AM_COMPENSATE | DB_AM_RECOVER))
113 		return (0);
114 
115 	/*
116 	 * If we are in recovery, the only locking we should be
117 	 * doing is on the global environment.  The one exception
118 	 * is if we are opening an exclusive database on a client
119 	 * syncing with the master.
120 	 */
121 	if (IS_RECOVERING(env) && !F2_ISSET(dbp, DB2_AM_INTEXCL))
122 		return (elockp == NULL ? 0 : __ENV_LPUT(env, *elockp));
123 
124 	memcpy(lock_desc.fileid, dbp->fileid, DB_FILE_ID_LEN);
125 	lock_desc.pgno = dbp->meta_pgno;
126 	lock_desc.type = DB_HANDLE_LOCK;
127 
128 	memset(&fileobj, 0, sizeof(fileobj));
129 	fileobj.data = &lock_desc;
130 	fileobj.size = sizeof(lock_desc);
131 	DB_TEST_SUBLOCKS(env, flags);
132 	if (F2_ISSET(dbp, DB2_AM_INTEXCL))
133 	    flags |= DB_LOCK_IGNORE_REC;
134 	if (elockp == NULL)
135 		ret = __lock_get(env, locker,
136 		    flags, &fileobj, mode, &dbp->handle_lock);
137 	else {
138 		reqs[0].op = DB_LOCK_PUT;
139 		reqs[0].lock = *elockp;
140 		reqs[1].op = DB_LOCK_GET;
141 		reqs[1].mode = mode;
142 		reqs[1].obj = &fileobj;
143 		reqs[1].timeout = 0;
144 		if ((ret = __lock_vec(env,
145 		    locker, flags, reqs, 2, &ereq)) == 0) {
146 			dbp->handle_lock = reqs[1].lock;
147 			if (elockp != &dbp->handle_lock)
148 				LOCK_INIT(*elockp);
149 		} else if (ereq != reqs)
150 			LOCK_INIT(*elockp);
151 	}
152 
153 	dbp->cur_locker = locker;
154 	return (ret);
155 }
156 
157 /*
158  * __fop_file_setup --
159  *
160  * Perform all the needed checking and locking to open up or create a
161  * file.
162  *
163  * There's a reason we don't push this code down into the buffer cache.
164  * The problem is that there's no information external to the file that
165  * we can use as a unique ID.  UNIX has dev/inode pairs, but they are
166  * not necessarily unique after reboot, if the file was mounted via NFS.
167  * Windows has similar problems, as the FAT filesystem doesn't maintain
168  * dev/inode numbers across reboot.  So, we must get something from the
169  * file we can use to ensure that, even after a reboot, the file we're
170  * joining in the cache is the right file for us to join.  The solution
171  * we use is to maintain a file ID that's stored in the database, and
172  * that's why we have to open and read the file before calling into the
173  * buffer cache or obtaining a lock (we use this unique fileid to lock
174  * as well as to identify like files in the cache).
175  *
176  * There are a couple of idiosyncrasies that this code must support, in
177  * particular, DB_TRUNCATE and DB_FCNTL_LOCKING.  First, we disallow
178  * DB_TRUNCATE in the presence of transactions, since opening a file with
179  * O_TRUNC will result in data being lost in an unrecoverable fashion.
180  * We also disallow DB_TRUNCATE if locking is enabled, because even in
181  * the presence of locking, we cannot avoid race conditions, so allowing
182  * DB_TRUNCATE with locking would be misleading.  See SR [#7345] for more
183  * details.
184  *
185  * However, if you are running with neither locking nor transactions, then
186  * you can specify DB_TRUNCATE, and if you do so, we will truncate the file
187  * regardless of its contents.
188  *
189  * FCNTL locking introduces another set of complications.  First, the only
190  * reason we support the DB_FCNTL_LOCKING flag is for historic compatibility
191  * with programs like Sendmail and Postfix.  In these cases, the caller may
192  * already have a lock on the file; we need to make sure that any file handles
193  * we open remain open, because if we were to close them, the lock held by the
194  * caller would go away.  Furthermore, Sendmail and/or Postfix need the ability
195  * to create databases in empty files.  So, when you're doing FCNTL locking,
196  * it's reasonable that you are trying to create a database into a 0-length
197  * file and we allow it, while under normal conditions, we do not create
198  * databases if the files already exist and are not Berkeley DB files.
199  *
200  * PUBLIC: int __fop_file_setup __P((DB *, DB_THREAD_INFO *ip,
201  * PUBLIC:     DB_TXN *, const char *, int, u_int32_t, u_int32_t *));
202  */
203 int
__fop_file_setup(dbp,ip,txn,name,mode,flags,retidp)204 __fop_file_setup(dbp, ip, txn, name, mode, flags, retidp)
205 	DB *dbp;
206 	DB_THREAD_INFO *ip;
207 	DB_TXN *txn;
208 	const char *name;
209 	int mode;
210 	u_int32_t flags, *retidp;
211 {
212 	DBTYPE save_type;
213 	DB_FH *fhp;
214 	DB_LOCK elock;
215 	DB_LOCKER *locker;
216 	DB_TXN *stxn;
217 	ENV *env;
218 	size_t len;
219 	APPNAME aflags;
220 	u_int32_t dflags, oflags;
221 	u_int8_t mbuf[DBMETASIZE];
222 	int created_locker, create_ok, ret, retries, t_ret, tmp_created;
223 	int truncating, was_inval;
224 	char *real_name, *real_tmpname, *tmpname;
225 	db_lockmode_t lockmode;
226 
227 	*retidp = TXN_INVALID;
228 
229 	env = dbp->env;
230 	fhp = NULL;
231 	LOCK_INIT(elock);
232 	stxn = NULL;
233 	created_locker = tmp_created = truncating = was_inval = 0;
234 	real_name = real_tmpname = tmpname = NULL;
235 	dflags = F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0;
236 	aflags = LF_ISSET(DB_INTERNAL_PERSISTENT_DB) ? DB_APP_META :
237 	    (LF_ISSET(DB_INTERNAL_TEMPORARY_DB) ? DB_APP_NONE : DB_APP_DATA);
238 	LF_CLR(DB_INTERNAL_PERSISTENT_DB | DB_INTERNAL_TEMPORARY_DB);
239 
240 	ret = 0;
241 	retries = 0;
242 	save_type = dbp->type;
243 	if (F2_ISSET(dbp, DB2_AM_EXCL))
244 		lockmode = DB_LOCK_WRITE;
245 	else
246 		lockmode = DB_LOCK_READ;
247 
248 	/*
249 	 * Get a lockerid for this handle.  There are paths through queue
250 	 * rename and remove where this dbp already has a locker, so make
251 	 * sure we don't clobber it and conflict.
252 	 */
253 	if (LOCKING_ON(env) &&
254 	    !F_ISSET(dbp, DB_AM_COMPENSATE) &&
255 	    !F_ISSET(dbp, DB_AM_RECOVER) &&
256 	    dbp->locker == DB_LOCK_INVALIDID) {
257 		if ((ret = __lock_id(env, NULL, &dbp->locker)) != 0)
258 			goto err;
259 		created_locker = 1;
260 	}
261 	LOCK_INIT(dbp->handle_lock);
262 
263 	if (txn != NULL && dbp->locker != NULL && F_ISSET(txn, TXN_INFAMILY)) {
264 		if ((ret = __lock_addfamilylocker(env,
265 		    txn->txnid, dbp->locker->id, 1)) != 0)
266 			goto err;
267 		txn = NULL;
268 	}
269 
270 	locker = txn == NULL ? dbp->locker : txn->locker;
271 
272 	oflags = 0;
273 	if (F_ISSET(dbp, DB_AM_INMEM))
274 		real_name = (char *)name;
275 	else {
276 		/* Get the real backing file name. */
277 		if ((ret = __db_appname(env,
278 		    aflags, name, &dbp->dirname, &real_name)) != 0)
279 			goto err;
280 
281 		/* Fill in the default file mode. */
282 		if (mode == 0)
283 			mode = DB_MODE_660;
284 
285 		if (LF_ISSET(DB_RDONLY))
286 			oflags |= DB_OSO_RDONLY;
287 		if (LF_ISSET(DB_TRUNCATE))
288 			oflags |= DB_OSO_TRUNC;
289 	}
290 
291 	retries = 0;
292 	create_ok = LF_ISSET(DB_CREATE);
293 	LF_CLR(DB_CREATE);
294 
295 retry:
296 	/*
297 	 * If we cannot create the file, only retry a few times.  We
298 	 * think we might be in a race with another create, but it could
299 	 * be that the backup filename exists (that is, is left over from
300 	 * a previous crash).  It is also possible to read the metadata
301 	 * page while it is being written and fail the checksum.
302 	 */
303 	if (++retries > DB_RETRY) {
304 		__db_errx(env, DB_STR_A("0002",
305 		    "__fop_file_setup:  Retry limit (%d) exceeded", "%d"),
306 		    DB_RETRY);
307 		goto err;
308 	}
309 	if (!F_ISSET(dbp, DB_AM_COMPENSATE) && !F_ISSET(dbp, DB_AM_RECOVER))
310 		GET_ENVLOCK(env, locker, &elock);
311 	if (name == NULL)
312 		ret = ENOENT;
313 	else if (F_ISSET(dbp, DB_AM_INMEM)) {
314 		ret = __env_mpool(dbp, name, flags);
315 		/*
316 		 * We are using __env_open as a check for existence.
317 		 * However, __env_mpool does an actual open and there
318 		 * are scenarios where the object exists, but cannot be
319 		 * opened, because our settings don't match those internally.
320 		 * We need to check for that explicitly.  We'll need the
321 		 * mpool open to read the meta-data page, so we're going to
322 		 * have to temporarily turn this dbp into an UNKNOWN one.
323 		 */
324 		if (ret == EINVAL) {
325 			was_inval = 1;
326 			save_type = dbp->type;
327 			dbp->type = DB_UNKNOWN;
328 			ret = __env_mpool(dbp, name, flags);
329 			dbp->type = save_type;
330 		}
331 	} else
332 		ret = __os_exists(env, real_name, NULL);
333 
334 	if (ret == 0) {
335 		/*
336 		 * If the file exists, there are 5 possible cases:
337 		 * 1. DB_EXCL was specified so this is an error, unless
338 		 *	this is a file left around after a rename and we
339 		 *	are in the same transaction.  This gets decomposed
340 		 *	into several subcases, because we check for various
341 		 *	errors before we know we're in rename.
342 		 * 2. We are truncating, and it doesn't matter what kind
343 		 *	of file it is, we should open/create it.
344 		 * 3. It is 0-length, we are not doing transactions (i.e.,
345 		 *      we are sendmail), we should open/create into it.
346 		 *	-- on-disk files only!
347 		 * 4. Is it a Berkeley DB file and we should simply open it.
348 		 * 5. It is not a BDB file and we should return an error.
349 		 */
350 
351 		/* Open file (if there is one). */
352 reopen:		if (!F_ISSET(dbp, DB_AM_INMEM) && (ret =
353 		    __os_open(env, real_name, 0, oflags, 0, &fhp)) != 0)
354 			goto err;
355 
356 		/* Case 2: DB_TRUNCATE: we must do the creation in place. */
357 		if (LF_ISSET(DB_TRUNCATE)) {
358 			if (LF_ISSET(DB_EXCL)) {
359 				/* Case 1a: DB_EXCL and DB_TRUNCATE. */
360 				ret = EEXIST;
361 				goto err;
362 			}
363 			tmpname = (char *)name;
364 			goto creat2;
365 		}
366 
367 		/* Cases 1,3-5: we need to read the meta-data page. */
368 		if (F_ISSET(dbp, DB_AM_INMEM)) {
369 			if (LOGGING_ON(env) && (ret = __env_dbreg_setup(dbp,
370 			    txn, NULL, name, TXN_INVALID)) != 0)
371 				return (ret);
372 			ret = __fop_inmem_read_meta(
373 			    dbp, txn, name, flags, DB_CHK_META|DB_CHK_ONLY);
374 		} else {
375 			ret = __fop_read_meta(env, real_name, mbuf,
376 			    sizeof(mbuf), fhp,
377 			    LF_ISSET(DB_NOERROR) ||
378 			    (LF_ISSET(DB_FCNTL_LOCKING) && txn == NULL) ? 1 : 0,
379 			    &len);
380 
381 			/* Case 3: 0-length, no txns. */
382 			if (ret != 0 && len == 0 && txn == NULL) {
383 				if (LF_ISSET(DB_EXCL)) {
384 					/*
385 					 * Case 1b: DB_EXCL and
386 					 * 0-length file exists.
387 					 */
388 					ret = EEXIST;
389 					goto err;
390 				}
391 				tmpname = (char *)name;
392 				if (create_ok)
393 					goto creat2;
394 				goto done;
395 			}
396 
397 			/*
398 			 * Case 4: This is a valid file.  Now check the
399 			 * checksum and decrypt the file so the file
400 			 * id can be obtained for the handle lock.  Note that
401 			 * the checksum can fail if the database is being
402 			 * written (possible because the handle lock has
403 			 * not been obtained yet).  So on checksum fail retry
404 			 * until the checksum succeeds or the number of
405 			 * retries is exhausted, then throw an error.
406 			 */
407 			if (ret == 0 && (ret = __db_chk_meta(env, dbp,
408 			    (DBMETA *)mbuf, DB_CHK_META)) == DB_CHKSUM_FAIL) {
409 				if ((t_ret = __ENV_LPUT(env, elock)) != 0) {
410 					ret = t_ret;
411 					goto err;
412 				}
413 				/*
414 				 * Retry unless the number of retries is
415 				 * exhausted.
416 				 */
417 				if (!(retries < DB_RETRY)) {
418 					__db_errx(env, DB_STR_A("0210",
419 			"%s: metadata page checksum error", "%s"), real_name);
420 					if (F_ISSET(dbp, DB_AM_RECOVER))
421 						ret = ENOENT;
422 					else
423 						ret = EINVAL;
424 					goto err;
425 				}
426 				CLOSE_HANDLE(dbp, fhp);
427 				goto retry;
428 			}
429 			/* Get the file id for the handle lock. */
430 			if (ret == 0)
431 				memcpy(dbp->fileid,
432 				((DBMETA *)mbuf)->uid, DB_FILE_ID_LEN);
433 		}
434 
435 		/* Case 5: Invalid file. */
436 		if (ret != 0)
437 			goto err;
438 
439 		/* Now, get our handle lock. */
440 		if ((ret = __fop_lock_handle(env,
441 		    dbp, locker, lockmode, NULL, DB_LOCK_NOWAIT)) == 0) {
442 			if ((ret = __ENV_LPUT(env, elock)) != 0)
443 				goto err;
444 		} else if (ret != DB_LOCK_NOTGRANTED ||
445 		    ((txn != NULL && (F_ISSET(txn, TXN_NOWAIT))) ||
446 		    F2_ISSET(dbp, DB2_AM_NOWAIT)))
447 			goto err;
448 		else {
449 			PERFMON3(env,
450 			    race, fop_file_setup, (char *) name, ret, flags);
451 			/*
452 			 * We were unable to acquire the handle lock without
453 			 * blocking.  The fact that we are blocking might mean
454 			 * that someone else is trying to delete the file.
455 			 * Since some platforms cannot delete files while they
456 			 * are open (Windows), we are going to have to close
457 			 * the file.  This would be a problem if we were doing
458 			 * FCNTL locking, because our closing the handle would
459 			 * release the FCNTL locks.  Fortunately, if we are
460 			 * doing FCNTL locking, then we should never fail to
461 			 * acquire our handle lock, so we should never get here.
462 			 * We assert it here to make sure we aren't destroying
463 			 * any application level FCNTL semantics.
464 			 */
465 			DB_ASSERT(env, !LF_ISSET(DB_FCNTL_LOCKING));
466 			if (!F_ISSET(dbp, DB_AM_INMEM))
467 				CLOSE_HANDLE(dbp, fhp);
468 			if ((ret = __fop_lock_handle(env,
469 			    dbp, locker, lockmode, &elock, 0)) != 0) {
470 				if (F_ISSET(dbp, DB_AM_INMEM))
471 					RESET_MPF(dbp, 0);
472 				goto err;
473 			}
474 
475 			/*
476 			 * If we had to wait, we might be waiting on a
477 			 * dummy file used in create/destroy of a database.
478 			 * To be sure we have the correct information we
479 			 * try again.
480 			 */
481 			if (F_ISSET(dbp, DB_AM_INMEM)) {
482 				RESET_MPF(dbp, 0);
483 				MAKE_INMEM(dbp);
484 			}
485 			if ((ret =
486 			    __ENV_LPUT(env, dbp->handle_lock)) != 0) {
487 				LOCK_INIT(dbp->handle_lock);
488 				goto err;
489 			}
490 			goto retry;
491 
492 		}
493 
494 		/*
495 		 * If we got here, then we have the handle lock, it is now
496 		 * safe to check the rest of the meta data, since the file
497 		 * will not be deleted out from under the handle.
498 		 */
499 		if (F_ISSET(dbp, DB_AM_INMEM)) {
500 			if ((ret = __fop_inmem_read_meta(
501 			    dbp, txn, name, flags, DB_SKIP_CHK)) != 0)
502 				goto err;
503 		} else {
504 			if ((ret = __db_meta_setup(env, dbp, real_name,
505 			    (DBMETA *)mbuf, flags, DB_SKIP_CHK)) != 0)
506 				goto err;
507 		}
508 
509 		/*
510 		 * Check for a file in the midst of a rename.  If we find that
511 		 * the file is in the midst of a rename, it must be the case
512 		 * that it is in our current transaction (else we would still
513 		 * be blocking), so we can continue along and create a new file
514 		 * with the same name.  In that case, we have to close the file
515 		 * handle because we reuse it below.  This is a case where
516 		 * a 'was_inval' above is OK.
517 		 */
518 		if (F_ISSET(dbp, DB_AM_IN_RENAME)) {
519 			was_inval = 0;
520 			if (create_ok) {
521 				if (F_ISSET(dbp, DB_AM_INMEM)) {
522 					RESET_MPF(dbp, DB_MPOOL_DISCARD);
523 				} else
524 					CLOSE_HANDLE(dbp, fhp);
525 				LF_SET(DB_CREATE);
526 				goto create;
527 			} else {
528 				ret = ENOENT;
529 				goto err;
530 			}
531 		}
532 
533 		/* If we get here, a was_inval is bad. */
534 		if (was_inval) {
535 			ret = EINVAL;
536 			goto err;
537 		}
538 
539 		/*
540 		 * Now, case 1: check for DB_EXCL, because the file that exists
541 		 * is not in the middle of a rename, so we have an error.  This
542 		 * is a weird case, but we need to make sure that we don't
543 		 * continue to hold the handle lock, since technically, we
544 		 * should not have been allowed to open it.
545 		 */
546 		if (LF_ISSET(DB_EXCL)) {
547 			ret = __ENV_LPUT(env, dbp->handle_lock);
548 			LOCK_INIT(dbp->handle_lock);
549 			if (ret == 0)
550 				ret = EEXIST;
551 			goto err;
552 		}
553 		goto done;
554 	}
555 
556 	/* File does not exist. */
557 #ifdef	HAVE_VXWORKS
558 	/*
559 	 * VxWorks can return file-system specific error codes if the
560 	 * file does not exist, not ENOENT.
561 	 */
562 	if (!create_ok)
563 #else
564 	if (!create_ok || ret != ENOENT)
565 #endif
566 		goto err;
567 	LF_SET(DB_CREATE);
568 	/*
569 	 * If we were trying to open a non-existent master database
570 	 * readonly clear that here.
571 	 */
572 	LF_CLR(DB_RDONLY);
573 	F_CLR(dbp, DB_AM_RDONLY);
574 	ret = 0;
575 
576 	/*
577 	 * We need to create file, which means that we need to set up the file,
578 	 * the fileid and the locks.  Then we need to call the appropriate
579 	 * routines to create meta-data pages.  For in-memory files, we retain
580 	 * the environment lock, while for on-disk files, we drop the env lock
581 	 * and create into a temporary.
582 	 */
583 	if (!F_ISSET(dbp, DB_AM_INMEM) &&
584 	    (ret = __ENV_LPUT(env, elock)) != 0)
585 		goto err;
586 
587 create:	if (txn != NULL && IS_REP_CLIENT(env) &&
588 	    !F_ISSET(dbp, DB_AM_NOT_DURABLE)) {
589 		__db_errx(env, DB_STR("0003",
590 		    "Transactional create on replication client disallowed"));
591 		ret = EINVAL;
592 		goto err;
593 	}
594 
595 	if (F_ISSET(dbp, DB_AM_INMEM)) {
596 		if (LOGGING_ON(env) && (ret =
597 		    __env_dbreg_setup(dbp, txn, NULL, name, TXN_INVALID)) != 0)
598 			return (ret);
599 		if ((ret = __fop_inmem_create(dbp, name, txn, flags)) != 0)
600 			return (ret);
601 	} else {
602 		if ((ret = __db_backup_name(env, name, txn, &tmpname)) != 0)
603 			goto err;
604 		if (TXN_ON(env) && txn != NULL &&
605 		    (ret = __txn_begin(env, NULL, txn, &stxn, 0)) != 0)
606 			goto err;
607 		if ((ret = __fop_create(env, stxn, &fhp,
608 		    tmpname, &dbp->dirname, aflags, mode, dflags)) != 0) {
609 			/*
610 			 * If no transactions, there is a race on creating the
611 			 * backup file, as the backup file name is the same for
612 			 * all processes.  Wait for the other process to finish
613 			 * with the name.
614 			 */
615 			if (!TXN_ON(env) && ret == EEXIST) {
616 				PERFMON3(env,
617 				    race, fop_file_setup, tmpname, ret, flags);
618 				__os_free(env, tmpname);
619 				tmpname = NULL;
620 				__os_yield(env, 1, 0);
621 				goto retry;
622 			}
623 			goto err;
624 		}
625 		tmp_created = 1;
626 	}
627 
628 creat2:	if (!F_ISSET(dbp, DB_AM_INMEM)) {
629 		if ((ret = __db_appname(env,
630 		    aflags, tmpname, &dbp->dirname, &real_tmpname)) != 0)
631 			goto err;
632 
633 		/* Set the pagesize if it isn't yet set. */
634 		if (dbp->pgsize == 0 &&
635 		    (ret = __fop_set_pgsize(dbp, fhp, real_tmpname)) != 0)
636 			goto errmsg;
637 
638 		/* Construct a file_id. */
639 		if ((ret =
640 		    __os_fileid(env, real_tmpname, 1, dbp->fileid)) != 0)
641 			goto errmsg;
642 	}
643 
644 	if ((ret = __db_new_file(dbp, ip,
645 	    F_ISSET(dbp, DB_AM_INMEM) ? txn : stxn, fhp, tmpname)) != 0)
646 		goto err;
647 
648 	/* Output the REOPEN record after we create. */
649 	if (F_ISSET(dbp, DB_AM_INMEM) && dbp->log_filename != NULL && (ret =
650 	    __dbreg_log_id(dbp, txn, dbp->log_filename->id, 0)) != 0)
651 		return (ret);
652 
653 	/*
654 	 * We need to close the handle here on platforms where remove and
655 	 * rename fail if a handle is open (including Windows).
656 	 */
657 	CLOSE_HANDLE(dbp, fhp);
658 
659 	/*
660 	 * Now move the file into place unless we are creating in place (because
661 	 * we created a database in a file that started out 0-length).  If
662 	 * this is an in-memory file, we may or may not hold the environment
663 	 * lock depending on how we got here.
664 	 */
665 	if (!F_ISSET(dbp, DB_AM_COMPENSATE) &&
666 	    !F_ISSET(dbp, DB_AM_RECOVER) && !LOCK_ISSET(elock))
667 		GET_ENVLOCK(env, locker, &elock);
668 
669 	if (F_ISSET(dbp, DB_AM_IN_RENAME)) {
670 		F_CLR(dbp, DB_AM_IN_RENAME);
671 		__txn_remrem(env, txn, real_name);
672 	} else if (name == tmpname) {
673 		/* We created it in place. */
674 	} else if (!F_ISSET(dbp, DB_AM_INMEM) &&
675 	    __os_exists(env, real_name, NULL) == 0) {
676 		/*
677 		 * Someone managed to create the file; remove our temp
678 		 * and try to open the file that now exists.
679 		 */
680 		(void)__fop_remove(env, NULL,
681 		    dbp->fileid, tmpname, &dbp->dirname, aflags, dflags);
682 		(void)__ENV_LPUT(env, dbp->handle_lock);
683 		LOCK_INIT(dbp->handle_lock);
684 
685 		if (stxn != NULL) {
686 			ret = __txn_abort(stxn);
687 			stxn = NULL;
688 		}
689 		if (ret != 0)
690 			goto err;
691 		goto reopen;
692 	}
693 
694 	if (name != NULL && (ret = __fop_lock_handle(env,
695 	    dbp, locker, DB_LOCK_WRITE, NULL, NOWAIT_FLAG(txn)|
696 	    (F2_ISSET(dbp,DB2_AM_NOWAIT) ? DB_LOCK_NOWAIT : 0))) != 0)
697 		goto err;
698 	if (tmpname != NULL &&
699 	    tmpname != name && (ret = __fop_rename(env, stxn, tmpname,
700 	    name, &dbp->dirname, dbp->fileid, aflags, 1, dflags)) != 0)
701 		goto err;
702 	if ((ret = __ENV_LPUT(env, elock)) != 0)
703 		goto err;
704 
705 	if (stxn != NULL) {
706 		*retidp = stxn->txnid;
707 		ret = __txn_commit(stxn, 0);
708 		stxn = NULL;
709 	} else
710 		*retidp = TXN_INVALID;
711 
712 	if (ret != 0)
713 		goto err;
714 
715 	F_SET(dbp, DB_AM_CREATED);
716 
717 	if (0) {
718 errmsg:		__db_err(env, ret, "%s", name);
719 
720 err:		CLOSE_HANDLE(dbp, fhp);
721 		if (stxn != NULL)
722 			(void)__txn_abort(stxn);
723 		if (tmp_created && txn == NULL)
724 			(void)__fop_remove(env,
725 			    NULL, NULL, tmpname, NULL, aflags, dflags);
726 		if (txn == NULL)
727 			(void)__ENV_LPUT(env, dbp->handle_lock);
728 		(void)__ENV_LPUT(env, elock);
729 		if (created_locker) {
730 			(void)__lock_id_free(env, dbp->locker);
731 			dbp->locker = NULL;
732 		}
733 	}
734 
735 done:	/*
736 	 * There are cases where real_name and tmpname take on the
737 	 * exact same string, so we need to make sure that we do not
738 	 * free twice.
739 	 */
740 	if (!truncating && tmpname != NULL && tmpname != name)
741 		__os_free(env, tmpname);
742 	if (real_name != name && real_name != NULL)
743 		__os_free(env, real_name);
744 	if (real_tmpname != NULL)
745 		__os_free(env, real_tmpname);
746 	CLOSE_HANDLE(dbp, fhp);
747 
748 	return (ret);
749 }
750 
751 /*
752  * __fop_set_pgsize --
753  *	Set the page size based on file information.
754  */
755 static int
__fop_set_pgsize(dbp,fhp,name)756 __fop_set_pgsize(dbp, fhp, name)
757 	DB *dbp;
758 	DB_FH *fhp;
759 	const char *name;
760 {
761 	ENV *env;
762 	u_int32_t iopsize;
763 	int ret;
764 
765 	env = dbp->env;
766 
767 	/*
768 	 * Use the filesystem's optimum I/O size as the pagesize if a pagesize
769 	 * not specified.  Some filesystems have 64K as their optimum I/O size,
770 	 * but as that results in fairly large default caches, we limit the
771 	 * default pagesize to 16K.
772 	 */
773 	if ((ret = __os_ioinfo(env, name, fhp, NULL, NULL, &iopsize)) != 0) {
774 		__db_err(env, ret, "%s", name);
775 		return (ret);
776 	}
777 	if (iopsize < 512)
778 		iopsize = 512;
779 	if (iopsize > 16 * 1024)
780 		iopsize = 16 * 1024;
781 
782 	/*
783 	 * Sheer paranoia, but we don't want anything that's not a power-of-2
784 	 * (we rely on that for alignment of various types on the pages), and
785 	 * we want a multiple of the sector size as well.  If the value
786 	 * we got out of __os_ioinfo looks bad, use a default instead.
787 	 */
788 	if (!IS_VALID_PAGESIZE(iopsize))
789 		iopsize = DB_DEF_IOSIZE;
790 
791 	dbp->pgsize = iopsize;
792 	F_SET(dbp, DB_AM_PGDEF);
793 
794 	return (0);
795 }
796 
/*
 * __fop_subdb_setup --
 *
 * Subdb setup is significantly simpler than file setup.  In terms of
 * locking, for the duration of the operation/transaction, the locks on
 * the meta-data page will suffice to protect us from simultaneous operations
 * on the sub-database.  Before we complete the operation though, we'll get a
 * handle lock on the subdatabase so that no one else can try to remove it
 * while we've got it open.  We use an object that looks like the meta-data
 * page lock with a different type (DB_HANDLE_LOCK) for the long-term handle
 * locks.
 *
 * The master database (mname) is opened first, read-only; if updating the
 * master fails with EBADF while it is read-only, the master is closed and
 * reopened read/write and the operation is retried.  The master handle is
 * always closed before returning once it has been opened.
 *
 * Returns 0 on success, or the first error encountered.
 *
 * PUBLIC: int __fop_subdb_setup __P((DB *, DB_THREAD_INFO *, DB_TXN *,
 * PUBLIC:     const char *, const char *, int, u_int32_t));
 */
int
__fop_subdb_setup(dbp, ip, txn, mname, name, mode, flags)
	DB *dbp;
	DB_THREAD_INFO *ip;
	DB_TXN *txn;
	const char *mname, *name;
	int mode;
	u_int32_t flags;
{
	DB *mdbp;
	ENV *env;
	db_lockmode_t lkmode;
	u_int32_t mflags;
	int ret, t_ret;

	mdbp = NULL;
	env = dbp->env;

	/* First attempt: open the master read-only. */
	mflags = flags | DB_RDONLY;
retry:	if ((ret = __db_master_open(dbp,
	    ip, txn, mname, mflags, mode, &mdbp)) != 0)
		return (ret);
	/*
	 * If we created this file, then we need to set the DISCARD flag so
	 * that if we fail in the middle of this routine, we discard from the
	 * mpool any pages that we just created.
	 */
	if (F_ISSET(mdbp, DB_AM_CREATED))
		F_SET(mdbp, DB_AM_DISCARD);

	/*
	 * We are going to close this instance of the master, so we can
	 * steal its handle instead of reopening a handle on the database.
	 */
	if (LF_ISSET(DB_FCNTL_LOCKING)) {
		dbp->saved_open_fhp = mdbp->saved_open_fhp;
		mdbp->saved_open_fhp = NULL;
	}

	/* Copy the pagesize and set the sub-database flag. */
	dbp->pgsize = mdbp->pgsize;
	F_SET(dbp, DB_AM_SUBDB);

	/* Look up (MU_OPEN) the subdatabase entry in the master. */
	if (name != NULL && (ret = __db_master_update(mdbp, dbp,
	    ip, txn, name, dbp->type, MU_OPEN, NULL, flags)) != 0) {
		if (ret == EBADF && F_ISSET(mdbp, DB_AM_RDONLY)) {
			/* We need to reopen the master R/W to do the create. */
			if ((ret = __db_close(mdbp, txn, 0)) != 0)
				goto err;
			FLD_CLR(mflags, DB_RDONLY);
			goto retry;
		}
		goto err;
	}

	/*
	 * Hijack the master's locker ID as well, so that our locks don't
	 * conflict with the master's.  Since we're closing the master,
	 * that locker would just have been freed anyway.  Once we've gotten
	 * the locker id, we need to acquire the handle lock for this
	 * subdatabase.
	 */
	dbp->locker = mdbp->locker;
	mdbp->locker = NULL;

	DB_TEST_RECOVERY(dbp, DB_TEST_POSTLOG, ret, mname);

	/*
	 * We copy our fileid from our master so that we all open
	 * the same file in mpool.  We'll use the meta-pgno to lock
	 * so that we end up with different handle locks.
	 */

	memcpy(dbp->fileid, mdbp->fileid, DB_FILE_ID_LEN);
	/*
	 * Take a write lock if we created the subdatabase, the caller asked
	 * for DB_WRITEOPEN, or the handle is exclusive; a read lock otherwise.
	 */
	lkmode = F_ISSET(dbp, DB_AM_CREATED) || LF_ISSET(DB_WRITEOPEN) ||
	    F2_ISSET(dbp, DB2_AM_EXCL) ? DB_LOCK_WRITE : DB_LOCK_READ;
	if ((ret = __fop_lock_handle(env, dbp,
	    txn == NULL ? dbp->locker : txn->locker, lkmode, NULL,
	    NOWAIT_FLAG(txn) |
	    (F2_ISSET(dbp, DB2_AM_NOWAIT) ? DB_LOCK_NOWAIT : 0))) != 0)
		goto err;

	if ((ret = __db_init_subdb(mdbp, dbp, name, ip, txn)) != 0) {
		/*
		 * If there was no transaction and we created this database,
		 * then we need to undo the update of the master database.
		 */
		if (F_ISSET(dbp, DB_AM_CREATED) && txn == NULL)
			(void)__db_master_update(mdbp, dbp,
			    ip, txn, name, dbp->type, MU_REMOVE, NULL, 0);
		F_CLR(dbp, DB_AM_CREATED);
		goto err;
	}

	/*
	 * XXX
	 * This should have been done at the top of this routine.  The problem
	 * is that __db_init_subdb() uses "standard" routines to process the
	 * meta-data page and set information in the DB handle based on it.
	 * Those routines have to deal with swapped pages and will normally set
	 * the DB_AM_SWAP flag.  However, we use the master's metadata page and
	 * that has already been swapped, so they get the is-swapped test wrong.
	 */
	F_CLR(dbp, DB_AM_SWAP);
	F_SET(dbp, F_ISSET(mdbp, DB_AM_SWAP));

	/*
	 * In the file create case, these happen in separate places so we have
	 * two different tests.  They end up in the same place for subdbs, but
	 * for compatibility with file testing, we put them both here anyway.
	 */
	DB_TEST_RECOVERY(dbp, DB_TEST_POSTLOGMETA, ret, mname);
	DB_TEST_RECOVERY(dbp, DB_TEST_POSTSYNC, ret, mname);

	/*
	 * File exists and we have the appropriate locks; we should now
	 * process a normal open.
	 */
	if (F_ISSET(mdbp, DB_AM_CREATED)) {
		F_SET(dbp, DB_AM_CREATED_MSTR);
		F_CLR(mdbp, DB_AM_DISCARD);
	}

	/*
	 * Error-only section: the success path skips it, while "goto err"
	 * (and the test-recovery label) land inside it.  Without a
	 * transaction, release the subdatabase handle lock here.
	 */
	if (0) {
err:
DB_TEST_RECOVERY_LABEL
		if (txn == NULL)
			(void)__ENV_LPUT(env, dbp->handle_lock);
	}

	/*
	 * The code from here on runs on both the success and error paths.
	 *
	 * The master's handle lock is under the control of the
	 * subdb (it acquired the master's locker).  We want to
	 * keep the master's handle lock so that no one can remove
	 * the file while the subdb is open.  If we register the
	 * trade event and then invalidate the copy of the lock
	 * in the master's handle, that will accomplish this.  However,
	 * before we register this event, we'd better remove any
	 * events that we've already registered for the master.
	 */
	if (!F_ISSET(dbp, DB_AM_RECOVER) && IS_REAL_TXN(txn)) {
		/* Unregister old master events. */
		 __txn_remlock(env,
		    txn, &mdbp->handle_lock, DB_LOCK_INVALIDID);

		/* Now register the new event. */
		if ((t_ret = __txn_lockevent(env, txn, dbp,
		    &mdbp->handle_lock, dbp->locker == NULL ?
		    mdbp->locker : dbp->locker)) != 0 && ret == 0)
			ret = t_ret;
	}
	/* Invalidate the master's copy of its handle lock before closing it. */
	LOCK_INIT(mdbp->handle_lock);

	/*
	 * If the master was created, we need to sync so that the metadata
	 * page is correct on disk for recovery, since it isn't read through
	 * mpool.  If we're opening a subdb in an existing file, we can skip
	 * the sync.
	 */
	if ((t_ret = __db_close(mdbp, txn,
	    F_ISSET(dbp, DB_AM_CREATED_MSTR) ? 0 : DB_NOSYNC)) != 0 && ret == 0)
		ret = t_ret;

	return (ret);
}
977 
/*
 * __fop_remove_setup --
 *	Open handle appropriately and lock for removal of a database file.
 *
 *	dbp	handle for the database that is about to be removed
 *	txn	enclosing transaction, or NULL
 *	name	name of the file (or of the in-memory database)
 *	flags	open flags (see the COMPQUIET note in the body)
 *
 *	The routine reads the meta-data page under the environment lock and
 * then acquires the handle lock in write mode.  Returns 0 on success,
 * ENOENT if the meta-data marks the file as DB_AM_IN_RENAME, or the error
 * from the open, read or lock call that failed.  On a 0 return for an
 * on-disk file DB_AM_DISCARD is set on the handle.
 *
 * PUBLIC: int __fop_remove_setup __P((DB *,
 * PUBLIC:      DB_TXN *, const char *, u_int32_t));
 */
int
__fop_remove_setup(dbp, txn, name, flags)
	DB *dbp;
	DB_TXN *txn;
	const char *name;
	u_int32_t flags;
{
	DB_FH *fhp;
	DB_LOCK elock;
	ENV *env;
	u_int8_t mbuf[DBMETASIZE];
	int ret;

	/*
	 * NOTE(review): flags is still read below (LF_ISSET, __env_mpool,
	 * __db_meta_setup).  If COMPQUIET assigns its second argument, flags
	 * is always 0 from here on -- confirm that this is intended.
	 */
	COMPQUIET(flags, 0);

	env = dbp->env;

	LOCK_INIT(elock);
	fhp = NULL;
	ret = 0;

	/*
	 * Create locker if necessary.  With a real transaction we lock on
	 * the transaction's locker; otherwise the handle gets its own locker,
	 * which is added to the transaction's family when TXN_INFAMILY is set.
	 */
retry:	if (LOCKING_ON(env)) {
		if (IS_REAL_TXN(txn))
			dbp->locker = txn->locker;
		else if (dbp->locker == DB_LOCK_INVALIDID) {
			if ((ret = __lock_id(env, NULL, &dbp->locker)) != 0)
				goto err;
			if (txn != NULL && F_ISSET(txn, TXN_INFAMILY) &&
			    (ret = __lock_addfamilylocker(env,
			    txn->txnid, dbp->locker->id, 1)) != 0)
				goto err;
		}
	}

	/*
	 * We are about to open a file handle and then possibly close it.
	 * We cannot close handles if we are doing FCNTL locking.  However,
	 * there is no way to pass the FCNTL flag into this routine via the
	 * user API.  The only way we can get in here and be doing FCNTL
	 * locking is if we are trying to clean up an open that was called
	 * with FCNTL locking.  In that case, the saved_open_fhp should
	 * already be set.  So, we use that field to tell us whether we need
	 * to make sure that we don't close the handle.
	 */
	fhp = dbp->saved_open_fhp;
	DB_ASSERT(env, LF_ISSET(DB_FCNTL_LOCKING) || fhp == NULL);

	/*
	 * Lock environment to protect file open.  That will enable us to
	 * read the meta-data page and get the fileid so that we can lock
	 * the handle.  (GET_ENVLOCK jumps to err on failure.)
	 */
	GET_ENVLOCK(env, dbp->locker, &elock);

	/*
	 * Open database.  In-memory databases are attached through mpool and
	 * remember their name in dbp->dname; on-disk files are opened
	 * read-only unless a saved file handle is already available.
	 */
	if (F_ISSET(dbp, DB_AM_INMEM)) {
		if ((ret = __env_mpool(dbp, name, flags)) == 0)
			ret = __os_strdup(env, name, &dbp->dname);
	} else if (fhp == NULL)
		ret = __os_open(env, name, 0, DB_OSO_RDONLY, 0, &fhp);
	if (ret != 0)
		goto err;

	/* Get meta-data and initialize the handle from it. */
	if (F_ISSET(dbp, DB_AM_INMEM))
		ret = __fop_inmem_read_meta(
		    dbp, txn, name, flags, DB_CHK_META);
	else if ((ret = __fop_read_meta(env,
	    name, mbuf, sizeof(mbuf), fhp, 0, NULL)) == 0)
		ret = __db_meta_setup(env, dbp,
		    name, (DBMETA *)mbuf, flags, DB_CHK_META | DB_CHK_NOLSN);
	if (ret != 0)
		goto err;

	/*
	 * Now, get the handle lock.  We first try with NOWAIT, because if
	 * we have to wait, we're going to have to close the file and reopen
	 * it, so that if there is someone else removing it, our open doesn't
	 * prevent that.
	 */
	if ((ret = __fop_lock_handle(env,
	    dbp, dbp->locker, DB_LOCK_WRITE, NULL, DB_LOCK_NOWAIT)) != 0) {
		/*
		 * Close the file, block on the lock, clean up the dbp, and
		 * then start all over again.
		 */
		if (!F_ISSET(dbp, DB_AM_INMEM) && !LF_ISSET(DB_FCNTL_LOCKING)) {
			(void)__os_closehandle(env, fhp);
			fhp = NULL;
		}
		/*
		 * Give up on any error other than "lock not granted", and
		 * also when the transaction asked not to wait for locks.
		 * Otherwise block for the lock, handing in the environment
		 * lock as well (presumably so it is released while we wait
		 * -- confirm against __fop_lock_handle).
		 */
		if (ret != DB_LOCK_NOTGRANTED ||
		    (txn != NULL && F_ISSET(txn, TXN_NOWAIT)))
			goto err;
		else if ((ret = __fop_lock_handle(env,
		    dbp, dbp->locker, DB_LOCK_WRITE, &elock, 0)) != 0)
			goto err;

		/*
		 * The state read before blocking may be stale: reset the
		 * handle and redo the open from the top.  The in-memory case
		 * drops the handle lock first; the on-disk transactional
		 * case clears the borrowed locker before the refresh.
		 */
		if (F_ISSET(dbp, DB_AM_INMEM)) {
			(void)__lock_put(env, &dbp->handle_lock);
			(void)__db_refresh(dbp, txn, DB_NOSYNC, NULL, 1);
		} else {
			if (txn != NULL)
				dbp->locker = NULL;
			(void)__db_refresh(dbp, txn, DB_NOSYNC, NULL, 0);
		}
		goto retry;
	} else if ((ret = __ENV_LPUT(env, elock)) != 0)
		goto err;
	else if (F_ISSET(dbp, DB_AM_IN_RENAME))
		/* The handle is flagged as being in a rename: report ENOENT. */
		ret = ENOENT;

	if (0) {
err:		(void)__ENV_LPUT(env, elock);
	}
	/* Close the file handle on every path, unless FCNTL locking is on. */
	if (fhp != NULL && !LF_ISSET(DB_FCNTL_LOCKING))
		(void)__os_closehandle(env, fhp);
	/*
	 * If this is a real file and we are going to proceed with the removal,
	 * then we need to make sure that we don't leave any pages around in the
	 * mpool since the file is closed and will be reopened again before
	 * access.  However, this might be an in-memory file, in which case
	 * we will handle the discard from the mpool later as it's the "real"
	 * removal of the database.
	 */
	if (ret == 0 && !F_ISSET(dbp, DB_AM_INMEM))
		F_SET(dbp, DB_AM_DISCARD);
	return (ret);
}
1114 
1115 /*
1116  * __fop_read_meta --
1117  *	Read the meta-data page from a file and return it in buf.
1118  *
1119  * PUBLIC: int __fop_read_meta __P((ENV *, const char *,
1120  * PUBLIC:     u_int8_t *, size_t, DB_FH *, int, size_t *));
1121  */
1122 int
__fop_read_meta(env,name,buf,size,fhp,errok,nbytesp)1123 __fop_read_meta(env, name, buf, size, fhp, errok, nbytesp)
1124 	ENV *env;
1125 	const char *name;
1126 	u_int8_t *buf;
1127 	size_t size;
1128 	DB_FH *fhp;
1129 	int errok;
1130 	size_t *nbytesp;
1131 {
1132 	size_t nr;
1133 	int ret;
1134 
1135 	/*
1136 	 * Our caller wants to know the number of bytes read, even if we
1137 	 * return an error.
1138 	 */
1139 	if (nbytesp != NULL)
1140 		*nbytesp = 0;
1141 
1142 	nr = 0;
1143 	ret = __os_read(env, fhp, buf, size, &nr);
1144 	if (nbytesp != NULL)
1145 		*nbytesp = nr;
1146 
1147 	if (ret != 0) {
1148 		if (!errok)
1149 			__db_err(env, ret, "%s", name);
1150 		goto err;
1151 	}
1152 
1153 	if (nr != size) {
1154 		if (!errok)
1155 			__db_errx(env, DB_STR_A("0004",
1156 			    "fop_read_meta: %s: unexpected file type or format",
1157 			    "%s"), name);
1158 		ret = EINVAL;
1159 	}
1160 
1161 err:
1162 	return (ret);
1163 }
1164 
1165 /*
1166  * __fop_dummy --
1167  *	This implements the creation and name swapping of dummy files that
1168  * we use for remove and rename (remove is simply a rename with a delayed
1169  * remove).
1170  *
1171  * PUBLIC: int __fop_dummy __P((DB *,
1172  * PUBLIC:     DB_TXN *, const char *, const char *));
1173  */
1174 int
__fop_dummy(dbp,txn,old,new)1175 __fop_dummy(dbp, txn, old, new)
1176 	DB *dbp;
1177 	DB_TXN *txn;
1178 	const char *old, *new;
1179 {
1180 	DB *tmpdbp;
1181 	DB_TXN *stxn;
1182 	ENV *env;
1183 	char *back;
1184 	int ret, t_ret;
1185 	u_int8_t mbuf[DBMETASIZE];
1186 
1187 	env = dbp->env;
1188 	back = NULL;
1189 	stxn = NULL;
1190 	tmpdbp = NULL;
1191 
1192 	DB_ASSERT(env, txn != NULL);
1193 
1194 	/*
1195 	 * Begin sub transaction to encapsulate the rename.  Note that we
1196 	 * expect the inmem_swap calls to complete the sub-transaction,
1197 	 * aborting on error and committing on success.
1198 	 */
1199 	if (TXN_ON(env) &&
1200 	    (ret = __txn_begin(env, NULL, txn, &stxn, 0)) != 0)
1201 		goto err;
1202 
1203 	/* We need to create a dummy file as a place holder. */
1204 	if ((ret = __db_backup_name(env, new, stxn, &back)) != 0)
1205 		goto err;
1206 	/* Create a dummy dbp handle. */
1207 	if ((ret = __db_create_internal(&tmpdbp, env, 0)) != 0)
1208 		goto err;
1209 	if (F_ISSET(dbp, DB_AM_NOT_DURABLE) &&
1210 		(ret = __db_set_flags(tmpdbp, DB_TXN_NOT_DURABLE)) != 0)
1211 		goto err;
1212 	memset(mbuf, 0, sizeof(mbuf));
1213 	ret = F_ISSET(dbp, DB_AM_INMEM) ?
1214 	    __fop_inmem_dummy(tmpdbp, stxn, back, mbuf) :
1215 	    __fop_ondisk_dummy(tmpdbp, stxn, back, mbuf);
1216 
1217 	if (ret != 0)
1218 		goto err;
1219 
1220 	ret = F_ISSET(dbp, DB_AM_INMEM) ?
1221 	    __fop_inmem_swap(dbp, tmpdbp, stxn, old, new, back, txn->locker) :
1222 	    __fop_ondisk_swap(dbp, tmpdbp, stxn, old, new, back, txn->locker);
1223 	stxn = NULL;
1224 	if (ret != 0)
1225 		goto err;
1226 
1227 err:	if (stxn != NULL)
1228 		(void)__txn_abort(stxn);
1229 	if (tmpdbp != NULL &&
1230 	    (t_ret = __db_close(tmpdbp, NULL, 0)) != 0 && ret == 0)
1231 		ret = t_ret;
1232 	if (back != NULL)
1233 		__os_free(env, back);
1234 	return (ret);
1235 }
1236 
1237 /*
1238  * __fop_dbrename --
1239  *	Do the appropriate file locking and file system operations
1240  * to effect a dbrename in the absence of transactions (__fop_dummy
1241  * and the subsequent calls in __db_rename do the work for the
1242  * transactional case).
1243  *
1244  * PUBLIC: int __fop_dbrename __P((DB *, const char *, const char *));
1245  */
1246 int
__fop_dbrename(dbp,old,new)1247 __fop_dbrename(dbp, old, new)
1248 	DB *dbp;
1249 	const char *old, *new;
1250 {
1251 	DB_LOCK elock;
1252 	ENV *env;
1253 	char *real_new, *real_old;
1254 	int ret, t_ret;
1255 
1256 	env = dbp->env;
1257 	real_new = NULL;
1258 	real_old = NULL;
1259 	LOCK_INIT(elock);
1260 
1261 	if (F_ISSET(dbp, DB_AM_INMEM)) {
1262 		real_new = (char *)new;
1263 		real_old = (char *)old;
1264 	} else {
1265 		/* Get full names. */
1266 		if ((ret = __db_appname(env,
1267 		    DB_APP_DATA, old, &dbp->dirname, &real_old)) != 0)
1268 			goto err;
1269 
1270 		if ((ret = __db_appname(env,
1271 		    DB_APP_DATA, new, &dbp->dirname, &real_new)) != 0)
1272 			goto err;
1273 	}
1274 
1275 	/*
1276 	 * It is an error to rename a file over one that already exists,
1277 	 * as that wouldn't be transaction-safe.  We check explicitly
1278 	 * for ondisk files, but it's done memp_nameop for in-memory ones.
1279 	 */
1280 	GET_ENVLOCK(env, dbp->locker, &elock);
1281 	ret = F_ISSET(dbp, DB_AM_INMEM) ? ENOENT :
1282 	    __os_exists(env, real_new, NULL);
1283 
1284 	if (ret == 0) {
1285 		ret = EEXIST;
1286 		__db_errx(env, DB_STR_A("0005",
1287 		    "rename: file %s exists", "%s"), real_new);
1288 		goto err;
1289 	}
1290 
1291 	ret = __memp_nameop(env,
1292 	    dbp->fileid, new, real_old, real_new, F_ISSET(dbp, DB_AM_INMEM));
1293 
1294 err:	if ((t_ret = __ENV_LPUT(env, elock)) != 0 && ret == 0)
1295 		ret = t_ret;
1296 	if (!F_ISSET(dbp, DB_AM_INMEM) && real_old != NULL)
1297 		__os_free(env, real_old);
1298 	if (!F_ISSET(dbp, DB_AM_INMEM) && real_new != NULL)
1299 		__os_free(env, real_new);
1300 	return (ret);
1301 }
1302 
1303 static int
__fop_inmem_create(dbp,name,txn,flags)1304 __fop_inmem_create(dbp, name, txn, flags)
1305 	DB *dbp;
1306 	const char *name;
1307 	DB_TXN *txn;
1308 	u_int32_t flags;
1309 {
1310 	DBT fid_dbt, name_dbt;
1311 	DB_LSN lsn;
1312 	ENV *env;
1313 	int ret;
1314 	int32_t lfid;
1315 	u_int32_t dflags, *p32;
1316 
1317 	env = dbp->env;
1318 	dflags = F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0;
1319 
1320 	MAKE_INMEM(dbp);
1321 
1322 	/* Set the pagesize if it isn't yet set. */
1323 	if (dbp->pgsize == 0)
1324 		dbp->pgsize = DB_DEF_IOSIZE;
1325 
1326 	/*
1327 	 * Construct a file_id.
1328 	 *
1329 	 * If this file has no name, then we only need a fileid for locking.
1330 	 * If this file has a name, we need the fileid both for locking and
1331 	 * matching in the memory pool.  So, with unnamed in-memory databases,
1332 	 * use a lock_id.  For named in-memory files, we need to find a value
1333 	 * that we can use to uniquely identify a name/fid pair.  We use a
1334 	 * combination of a unique id (__os_unique_id) and a hash of the
1335 	 * original name.
1336 	 */
1337 	if (name == NULL) {
1338 		if (LOCKING_ON(env) && (ret =
1339 		    __lock_id(env, (u_int32_t *)dbp->fileid, NULL)) != 0)
1340 			goto err;
1341 	}  else {
1342 		p32 = (u_int32_t *)(&dbp->fileid[0]);
1343 		__os_unique_id(env, p32);
1344 		p32++;
1345 		(void)strncpy(
1346 		    (char *)p32, name, DB_FILE_ID_LEN - sizeof(u_int32_t));
1347 		dbp->preserve_fid = 1;
1348 
1349 		if (DBENV_LOGGING(env) &&
1350 #if !defined(DEBUG_WOP) && !defined(DIAGNOSTIC)
1351 		    txn != NULL &&
1352 #endif
1353 		    dbp->log_filename != NULL)
1354 			memcpy(dbp->log_filename->ufid,
1355 			    dbp->fileid, DB_FILE_ID_LEN);
1356 	}
1357 
1358 	/* Now, set the fileid. */
1359 	if ((ret = __memp_set_fileid(dbp->mpf, dbp->fileid)) != 0)
1360 		goto err;
1361 
1362 	if ((ret = __env_mpool(dbp, name, flags)) != 0)
1363 		goto err;
1364 
1365 	if (DBENV_LOGGING(env) &&
1366 #if !defined(DEBUG_WOP)
1367 	    txn != NULL &&
1368 #endif
1369 	    name != NULL) {
1370 		DB_INIT_DBT(name_dbt, name, strlen(name) + 1);
1371 		memset(&fid_dbt, 0, sizeof(fid_dbt));
1372 		fid_dbt.data = dbp->fileid;
1373 		fid_dbt.size = DB_FILE_ID_LEN;
1374 		lfid = dbp->log_filename == NULL ?
1375 		    DB_LOGFILEID_INVALID : dbp->log_filename->id;
1376 		if ((ret = __crdel_inmem_create_log(env, txn,
1377 		    &lsn, dflags, lfid, &name_dbt, &fid_dbt, dbp->pgsize)) != 0)
1378 			goto err;
1379 	}
1380 
1381 	F_SET(dbp, DB_AM_CREATED);
1382 
1383 err:
1384 	return (ret);
1385 }
1386 
1387 static int
__fop_inmem_read_meta(dbp,txn,name,flags,chkflags)1388 __fop_inmem_read_meta(dbp, txn, name, flags, chkflags)
1389 	DB *dbp;
1390 	DB_TXN *txn;
1391 	const char *name;
1392 	u_int32_t flags;
1393 	u_int32_t chkflags;
1394 {
1395 	DBMETA *metap;
1396 	DB_THREAD_INFO *ip;
1397 	db_pgno_t pgno;
1398 	int ret, t_ret;
1399 
1400 	if (txn == NULL)
1401 		ENV_GET_THREAD_INFO(dbp->env, ip);
1402 	else
1403 		ip = txn->thread_info;
1404 
1405 	pgno  = PGNO_BASE_MD;
1406 	if ((ret = __memp_fget(dbp->mpf, &pgno, ip, txn, 0, &metap)) != 0)
1407 		return (ret);
1408 	if (FLD_ISSET(chkflags, DB_CHK_ONLY)) {
1409 		if ((ret = __db_chk_meta(dbp->env, dbp, metap, chkflags)) == 0)
1410 			memcpy(dbp->fileid,
1411 			    ((DBMETA *)metap)->uid, DB_FILE_ID_LEN);
1412 	} else
1413 		ret = __db_meta_setup(
1414 		    dbp->env, dbp, name, metap, flags, chkflags);
1415 
1416 	if ((t_ret =
1417 	    __memp_fput(dbp->mpf, ip, metap, dbp->priority)) && ret == 0)
1418 		ret = t_ret;
1419 
1420 	return (ret);
1421 }
1422 
1423 static int
__fop_ondisk_dummy(dbp,txn,name,mbuf)1424 __fop_ondisk_dummy(dbp, txn, name, mbuf)
1425 	DB *dbp;
1426 	DB_TXN *txn;
1427 	const char *name;
1428 	u_int8_t *mbuf;
1429 {
1430 	ENV *env;
1431 	int ret;
1432 	char *realname;
1433 	u_int32_t dflags;
1434 
1435 	realname = NULL;
1436 	env = dbp->env;
1437 	dflags = F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0;
1438 
1439 	if ((ret = __db_appname(env,
1440 	    DB_APP_DATA, name, &dbp->dirname, &realname)) != 0)
1441 		goto err;
1442 
1443 	if ((ret = __fop_create(env,
1444 	    txn, NULL, name, &dbp->dirname, DB_APP_DATA, 0, dflags)) != 0)
1445 		goto err;
1446 
1447 	if ((ret =
1448 	    __os_fileid(env, realname, 1, ((DBMETA *)mbuf)->uid)) != 0)
1449 		goto err;
1450 
1451 	((DBMETA *)mbuf)->magic = DB_RENAMEMAGIC;
1452 	if ((ret = __fop_write(env, txn, name, dbp->dirname,
1453 	    DB_APP_DATA, NULL, 0, 0, 0, mbuf, DBMETASIZE, 1, dflags)) != 0)
1454 		goto err;
1455 
1456 	memcpy(dbp->fileid, ((DBMETA *)mbuf)->uid, DB_FILE_ID_LEN);
1457 
1458 err:	if (realname != NULL)
1459 		__os_free(env, realname);
1460 
1461 	return (ret);
1462 }
1463 
1464 static int
__fop_inmem_dummy(dbp,txn,name,mbuf)1465 __fop_inmem_dummy(dbp, txn, name, mbuf)
1466 	DB *dbp;
1467 	DB_TXN *txn;
1468 	const char *name;
1469 	u_int8_t *mbuf;
1470 {
1471 	DBMETA *metap;
1472 	DB_THREAD_INFO *ip;
1473 	db_pgno_t pgno;
1474 	int ret, t_ret;
1475 
1476 	if ((ret = __fop_inmem_create(dbp, name, txn, DB_CREATE)) != 0)
1477 		return (ret);
1478 	if (txn == NULL)
1479 		ENV_GET_THREAD_INFO(dbp->env, ip);
1480 	else
1481 		ip = txn->thread_info;
1482 
1483 	pgno  = PGNO_BASE_MD;
1484 	if ((ret = __memp_fget(dbp->mpf, &pgno, ip, txn,
1485 	    DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &metap)) != 0)
1486 		return (ret);
1487 	/* Check file existed. */
1488 	if (metap->magic != 0)
1489 		ret = EEXIST;
1490 	else
1491 		metap->magic = DB_RENAMEMAGIC;
1492 
1493 	/* Copy the fileid onto the meta-data page. */
1494 	memcpy(metap->uid, dbp->fileid, DB_FILE_ID_LEN);
1495 
1496 	if ((t_ret = __memp_fput(dbp->mpf, ip, metap,
1497 	    ret == 0 ? dbp->priority : DB_PRIORITY_VERY_LOW)) != 0 && ret == 0)
1498 		ret = t_ret;
1499 
1500 	if (ret != 0)
1501 		goto err;
1502 
1503 	((DBMETA *)mbuf)->magic = DB_RENAMEMAGIC;
1504 
1505 err:	return (ret);
1506 }
1507 
1508 static int
__fop_ondisk_swap(dbp,tmpdbp,txn,old,new,back,locker)1509 __fop_ondisk_swap(dbp, tmpdbp, txn, old, new, back, locker)
1510 	DB *dbp, *tmpdbp;
1511 	DB_TXN *txn;
1512 	const char *old, *new, *back;
1513 	DB_LOCKER *locker;
1514 {
1515 	DBT fiddbt, namedbt, tmpdbt;
1516 	DB_FH *fhp;
1517 	DB_LOCK elock;
1518 	DB_LSN lsn;
1519 	DB_TXN *parent;
1520 	ENV *env;
1521 	u_int8_t mbuf[DBMETASIZE];
1522 	u_int32_t child_txnid, dflags;
1523 	int ret, t_ret;
1524 	char *realold, *realnew;
1525 
1526 	env = dbp->env;
1527 	DB_ASSERT(env, txn != NULL);
1528 	DB_ASSERT(env, old != NULL);
1529 
1530 	realold = realnew = NULL;
1531 	LOCK_INIT(elock);
1532 	fhp = NULL;
1533 	dflags = F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0;
1534 
1535 	if ((ret = __db_appname(env,
1536 	    DB_APP_DATA, new, &dbp->dirname, &realnew)) != 0)
1537 		goto err;
1538 
1539 	/* Now, lock the name space while we initialize this file. */
1540 retry:	GET_ENVLOCK(env, locker, &elock);
1541 	if (__os_exists(env, realnew, NULL) == 0) {
1542 		/*
1543 		 * It is possible that the only reason this file exists is
1544 		 * because we've done a previous rename of it and we have
1545 		 * left a placeholder here.  We need to check for that case
1546 		 * and allow this rename to succeed if that's the case.
1547 		 */
1548 		if ((ret = __os_open(env, realnew, 0, 0, 0, &fhp)) != 0)
1549 			goto err;
1550 		if ((ret = __fop_read_meta(env,
1551 		    realnew, mbuf, sizeof(mbuf), fhp, 0, NULL)) != 0 ||
1552 		    (ret = __db_meta_setup(env,
1553 		    tmpdbp, realnew, (DBMETA *)mbuf, 0, DB_CHK_META)) != 0) {
1554 			ret = EEXIST;
1555 			goto err;
1556 		}
1557 		ret = __os_closehandle(env, fhp);
1558 		fhp = NULL;
1559 		if (ret != 0)
1560 			goto err;
1561 
1562 		/*
1563 		 * Now, try to acquire the handle lock.  If the handle is locked
1564 		 * by our current, transaction, then we'll get it and life is
1565 		 * good.
1566 		 *
1567 		 * Alternately, it's not locked at all, we'll get the lock, but
1568 		 * we will realize it exists and consider this an error.
1569 		 *
1570 		 * However, if it's held by another transaction, then there
1571 		 * could be two different scenarios: 1) the file is in the
1572 		 * midst of being created or deleted and when that transaction
1573 		 * is over, we might be able to proceed. 2) the file is open
1574 		 * and exists and we should report an error. In order to
1575 		 * distinguish these two cases, we do the following. First, we
1576 		 * try to acquire a READLOCK.  If the handle is in the midst of
1577 		 * being created, then we'll block because a writelock is held.
1578 		 * In that case, we should request a blocking write, and when we
1579 		 * get the lock, we should then go back and check to see if the
1580 		 * object exists and start all over again.
1581 		 *
1582 		 * If we got the READLOCK, then either no one is holding the
1583 		 * lock or someone has an open handle and the fact that the file
1584 		 * exists is problematic.  So, in this case, we request the
1585 		 * WRITELOCK non-blocking -- if it succeeds, we're golden.  If
1586 		 * it fails, then the file exists and we return EEXIST.
1587 		 */
1588 		if ((ret = __fop_lock_handle(env,
1589 		    tmpdbp, locker, DB_LOCK_READ, NULL, DB_LOCK_NOWAIT)) != 0) {
1590 			/*
1591 			 * Someone holds a write-lock.  Wait for the write-lock
1592 			 * and after we get it, release it and start over.
1593 			 */
1594 			if ((ret = __fop_lock_handle(env, tmpdbp,
1595 			    locker, DB_LOCK_WRITE, &elock, 0)) != 0)
1596 				goto err;
1597 			if ((ret =
1598 			    __lock_put(env, &tmpdbp->handle_lock)) != 0)
1599 				goto err;
1600 			if ((ret = __db_refresh(tmpdbp, NULL, 0, NULL, 0)) != 0)
1601 				goto err;
1602 			goto retry;
1603 		}
1604 
1605 		/* We got the read lock; try to upgrade it. */
1606 		ret = __fop_lock_handle(env,
1607 		    tmpdbp, locker, DB_LOCK_WRITE,
1608 		    NULL, DB_LOCK_UPGRADE | DB_LOCK_NOWAIT);
1609 		if (ret != 0) {
1610 			/*
1611 			 * We did not get the writelock, so someone
1612 			 * has the handle open.  This is an error.
1613 			 */
1614 			(void)__lock_put(env, &tmpdbp->handle_lock);
1615 			ret = EEXIST;
1616 		} else  if (F_ISSET(tmpdbp, DB_AM_IN_RENAME))
1617 			/* We got the lock and are renaming it. */
1618 			ret = 0;
1619 		else { /* We got the lock, but the file exists. */
1620 			(void)__lock_put(env, &tmpdbp->handle_lock);
1621 			ret = EEXIST;
1622 		}
1623 		if (ret != 0)
1624 			goto err;
1625 	}
1626 
1627 	/*
1628 	 * While we have the namespace locked, do the renames and then
1629 	 * swap for the handle lock.
1630 	 */
1631 	if ((ret = __fop_rename(env, txn,
1632 	    old, new, &dbp->dirname, dbp->fileid, DB_APP_DATA, 1, dflags)) != 0)
1633 		goto err;
1634 	if ((ret = __fop_rename(env, txn, back, old,
1635 	    &dbp->dirname, tmpdbp->fileid, DB_APP_DATA, 0, dflags)) != 0)
1636 		goto err;
1637 	if ((ret = __fop_lock_handle(env,
1638 	    tmpdbp, locker, DB_LOCK_WRITE, &elock, NOWAIT_FLAG(txn))) != 0)
1639 		goto err;
1640 
1641 	/*
1642 	 * We just acquired a transactional lock on the tmp handle.
1643 	 * We need to null out the tmp handle's lock so that it
1644 	 * doesn't create problems for us in the close path.
1645 	 */
1646 	LOCK_INIT(tmpdbp->handle_lock);
1647 
1648 	/* Commit the child. */
1649 	child_txnid = txn->txnid;
1650 	parent = txn->parent;
1651 	ret = __txn_commit(txn, 0);
1652 	txn = NULL;
1653 
1654 	/*
1655 	 * If the new name is available because it was previously renamed
1656 	 * remove it from the remove list.
1657 	 */
1658 	if (F_ISSET(tmpdbp, DB_AM_IN_RENAME))
1659 		__txn_remrem(env, parent, realnew);
1660 
1661 	/* Now log the child information in the parent. */
1662 	memset(&fiddbt, 0, sizeof(fiddbt));
1663 	fiddbt.data = dbp->fileid;
1664 	fiddbt.size = DB_FILE_ID_LEN;
1665 	memset(&tmpdbt, 0, sizeof(fiddbt));
1666 	tmpdbt.data = tmpdbp->fileid;
1667 	tmpdbt.size = DB_FILE_ID_LEN;
1668 	DB_INIT_DBT(namedbt, old, strlen(old) + 1);
1669 	if ((t_ret = __fop_file_remove_log(env,
1670 	    parent, &lsn, dflags, &fiddbt, &tmpdbt, &namedbt,
1671 	    (u_int32_t)DB_APP_DATA, child_txnid)) != 0 && ret == 0)
1672 		ret = t_ret;
1673 
1674 	/* This is a delayed delete of the dummy file. */
1675 	if ((ret = __db_appname(env,
1676 	    DB_APP_DATA, old, &dbp->dirname, &realold)) != 0)
1677 		goto err;
1678 
1679 	if ((ret = __txn_remevent(env, parent, realold, NULL, 0)) != 0)
1680 		goto err;
1681 
1682 err:	if (txn != NULL)	/* Ret must already be set, so void abort. */
1683 		(void)__txn_abort(txn);
1684 
1685 	(void)__ENV_LPUT(env, elock);
1686 
1687 	if (fhp != NULL &&
1688 	    (t_ret = __os_closehandle(env, fhp)) != 0 && ret == 0)
1689 		ret = t_ret;
1690 
1691 	if (realnew != NULL)
1692 		__os_free(env, realnew);
1693 	if (realold != NULL)
1694 		__os_free(env, realold);
1695 	return (ret);
1696 }
1697 
1698 static int
__fop_inmem_swap(olddbp,backdbp,txn,old,new,back,locker)1699 __fop_inmem_swap(olddbp, backdbp, txn, old, new, back, locker)
1700 	DB *olddbp, *backdbp;
1701 	DB_TXN *txn;
1702 	const char *old, *new, *back;
1703 	DB_LOCKER *locker;
1704 {
1705 	DB *tmpdbp;
1706 	DBT fid_dbt, n1_dbt, n2_dbt;
1707 	DB_LOCK elock;
1708 	DB_LSN lsn;
1709 	DB_TXN *parent;
1710 	ENV *env;
1711 	int ret, t_ret;
1712 
1713 	env = olddbp->env;
1714 	parent = txn->parent;
1715 retry:	LOCK_INIT(elock);
1716 	if ((ret = __db_create_internal(&tmpdbp, env, 0)) != 0)
1717 		return (ret);
1718 	MAKE_INMEM(tmpdbp);
1719 
1720 	GET_ENVLOCK(env, locker, &elock);
1721 	if ((ret = __env_mpool(tmpdbp, new, 0)) == 0) {
1722 		/*
1723 		 * It is possible that the only reason this database exists is
1724 		 * because we've done a previous rename of it and we have
1725 		 * left a placeholder here.  We need to check for that case
1726 		 * and allow this rename to succeed if that's the case.
1727 		 */
1728 
1729 		if ((ret = __fop_inmem_read_meta(
1730 		    tmpdbp, txn, new, 0, DB_CHK_META)) != 0) {
1731 			ret = EEXIST;
1732 			goto err;
1733 		}
1734 
1735 		/*
1736 		 * Now, try to acquire the handle lock.  If it's from our txn,
1737 		 * then we'll get the lock.  If it's not, then someone else has
1738 		 * it locked.  See the comments in __fop_ondisk_swap for
1739 		 * details.
1740 		 */
1741 		if ((ret = __fop_lock_handle(env,
1742 		    tmpdbp, locker, DB_LOCK_READ, NULL, DB_LOCK_NOWAIT)) != 0) {
1743 			/*
1744 			 * Someone holds a writelock.  Try for the WRITELOCK
1745 			 * and after we get it, retry.
1746 			 */
1747 			if ((ret = __fop_lock_handle(env, tmpdbp,
1748 			    locker, DB_LOCK_WRITE, &elock, 0)) != 0)
1749 				goto err;
1750 
1751 			/* We have the write lock; release it and start over. */
1752 			(void)__lock_put(env, &tmpdbp->handle_lock);
1753 			(void)__db_close(tmpdbp, NULL, DB_NOSYNC);
1754 			(void)__ENV_LPUT(env, elock);
1755 			goto retry;
1756 		} else {
1757 			(void)__lock_put(env, &tmpdbp->handle_lock);
1758 			if (!F_ISSET(tmpdbp, DB_AM_IN_RENAME))
1759 				ret = EEXIST;
1760 		}
1761 		if (ret != 0)
1762 			goto err;
1763 	}
1764 
1765 	/* Log the renames. */
1766 	if (LOGGING_ON(env)
1767 #ifndef DEBUG_WOP
1768 	    && txn != NULL
1769 #endif
1770 	) {
1771 		/* Rename old to new. */
1772 		DB_INIT_DBT(fid_dbt, olddbp->fileid, DB_FILE_ID_LEN);
1773 		DB_INIT_DBT(n1_dbt, old, strlen(old) + 1);
1774 		DB_INIT_DBT(n2_dbt, new, strlen(new) + 1);
1775 		if ((ret = __crdel_inmem_rename_log(
1776 		    env, txn, &lsn, 0, &n1_dbt, &n2_dbt, &fid_dbt)) != 0)
1777 			goto err;
1778 
1779 		/* Rename back to old */
1780 		fid_dbt.data = backdbp->fileid;
1781 		DB_SET_DBT(n2_dbt, back, strlen(back) + 1);
1782 		if ((ret = __crdel_inmem_rename_log(
1783 		    env, txn, &lsn, 0, &n2_dbt, &n1_dbt, &fid_dbt)) != 0)
1784 			goto err;
1785 	}
1786 
1787 	/*
1788 	 * While we have the namespace locked, do the renames and then
1789 	 * swap for the handle lock.   If we ran into a file in the midst
1790 	 * of rename, then we need to delete it first, else nameop is
1791 	 * going to consider it an error.
1792 	 */
1793 	if (F_ISSET(tmpdbp, DB_AM_IN_RENAME)) {
1794 		if ((ret = __memp_nameop(env,
1795 		    tmpdbp->fileid, NULL, new, NULL, 1)) != 0)
1796 			goto err;
1797 		__txn_remrem(env, parent, new);
1798 	}
1799 
1800 	if ((ret = __memp_nameop(
1801 	    env, olddbp->fileid, new, old, new, 1)) != 0)
1802 		goto err;
1803 	if ((ret = __memp_nameop(
1804 	    env, backdbp->fileid, old, back, old, 1)) != 0)
1805 		goto err;
1806 
1807 	if ((ret = __fop_lock_handle(env,
1808 	    tmpdbp, locker, DB_LOCK_WRITE, &elock, 0)) != 0)
1809 		goto err;
1810 
1811 	/*
1812 	 * We just acquired a transactional lock on the tmp handle.
1813 	 * We need to null out the tmp handle's lock so that it
1814 	 * doesn't create problems for us in the close path.
1815 	 */
1816 	LOCK_INIT(tmpdbp->handle_lock);
1817 
1818 	DB_ASSERT(env, txn != NULL);
1819 
1820 	/* Commit the child. */
1821 	ret = __txn_commit(txn, 0);
1822 	txn = NULL;
1823 
1824 	if ((ret = __db_inmem_remove(backdbp, parent, old)) != 0)
1825 		goto err;
1826 
1827 err:	(void)__ENV_LPUT(env, elock);
1828 
1829 	if (txn != NULL)
1830 		(void)__txn_abort(txn);
1831 
1832 	if ((t_ret = __db_close(tmpdbp, NULL, 0)) != 0 && ret == 0)
1833 		ret = t_ret;
1834 
1835 	return (ret);
1836 }
1837