1 /*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 2001, 2013 Oracle and/or its affiliates. All rights reserved.
5 *
6 * $Id$
7 */
8
9 #include "db_config.h"
10
11 #include "db_int.h"
12 #include "dbinc/db_page.h"
13 #include "dbinc/db_am.h"
14 #include "dbinc/hash.h"
15 #include "dbinc/fop.h"
16 #include "dbinc/lock.h"
17 #include "dbinc/mp.h"
18 #include "dbinc/txn.h"
19
20 static int __fop_set_pgsize __P((DB *, DB_FH *, const char *));
21 static int __fop_inmem_create __P((DB *, const char *, DB_TXN *, u_int32_t));
22 static int __fop_inmem_dummy __P((DB *, DB_TXN *, const char *, u_int8_t *));
23 static int __fop_inmem_read_meta __P((DB *, DB_TXN *, const char *, u_int32_t,
24 u_int32_t));
25 static int __fop_inmem_swap __P((DB *, DB *, DB_TXN *,
26 const char *, const char *, const char *, DB_LOCKER *));
27 static int __fop_ondisk_dummy __P((DB *, DB_TXN *, const char *, u_int8_t *));
28 static int __fop_ondisk_swap __P((DB *, DB *, DB_TXN *,
29 const char *, const char *, const char *, DB_LOCKER *));
30
31 /*
32 * Acquire the environment meta-data lock. The parameters are the
33 * environment (ENV), the locker id to use in acquiring the lock (ID)
34 * and a pointer to a DB_LOCK.
35 *
36 * !!!
37 * Turn off locking for Critical Path. The application must do its own
38 * synchronization of open/create. Two threads creating and opening a
39 * file at the same time may have unpredictable results.
40 */
#ifdef CRITICALPATH_10266
#define	GET_ENVLOCK(ENV, ID, L) (0)
#else
/*
 * Expands in the caller's scope: it assigns the caller's "ret" and jumps
 * to the caller's "err" label if the lock request fails.  The lock object
 * is a DBT wrapping the constant 1; the DBT is zeroed first so that no
 * uninitialized fields are handed to __lock_get.
 */
#define	GET_ENVLOCK(ENV, ID, L) do {					\
	DBT __dbt;							\
	u_int32_t __lockval;						\
									\
	if (LOCKING_ON((ENV))) {					\
		__lockval = 1;						\
		memset(&__dbt, 0, sizeof(__dbt));			\
		__dbt.data = &__lockval;				\
		__dbt.size = sizeof(__lockval);				\
		if ((ret = __lock_get((ENV), (ID),			\
		    0, &__dbt, DB_LOCK_WRITE, (L))) != 0)		\
			goto err;					\
	}								\
} while (0)
#endif
58
/*
 * Throw away the handle's current mpool file (closing it with flags F) and
 * give the handle a freshly created, unopened one.  Expands in the caller's
 * scope: sets the caller's "ret" and jumps to the caller's "err" label if
 * the new mpool file handle cannot be created.
 */
#define	RESET_MPF(D, F) do {						\
	(void)__memp_fclose((D)->mpf, (F));				\
	(D)->mpf = NULL;						\
	F_CLR((D), DB_AM_OPEN_CALLED);					\
	ret = __memp_fcreate((D)->env, &(D)->mpf);			\
	if (ret != 0)							\
		goto err;						\
} while (0)
66
67 /*
68 * If we open a file handle and our caller is doing fcntl(2) locking,
69 * we can't close the handle because that would discard the caller's
70 * lock. Save it until we close or refresh the DB handle.
71 */
/*
 * Expands in the caller's scope and uses the caller's "flags" (through
 * LF_ISSET), "ret", "t_ret" and "err" label.
 *
 * The handle pointer is cleared before jumping to "err" on a close failure.
 * Error paths invoke CLOSE_HANDLE again, so a pointer left set would be
 * handed to __os_closehandle a second time.
 * NOTE(review): this assumes __os_closehandle disposes of the handle even
 * when it returns an error -- confirm against the os layer.
 */
#define	CLOSE_HANDLE(D, F) {						\
	if ((F) != NULL) {						\
		if (LF_ISSET(DB_FCNTL_LOCKING)) {			\
			(D)->saved_open_fhp = (F);			\
			(F) = NULL;					\
		} else {						\
			t_ret = __os_closehandle((D)->env, (F));	\
			(F) = NULL;					\
			if (t_ret != 0) {				\
				if (ret == 0)				\
					ret = t_ret;			\
				goto err;				\
			}						\
		}							\
	}								\
}
85
/*
 * __fop_lock_handle --
 *	Acquire the handle lock for a database handle.
 *
 * The lock object is built from the handle's file id and meta-data page
 * number and is tagged DB_HANDLE_LOCK.  If the caller supplies an
 * environment lock (elockp != NULL), a single __lock_vec call releases
 * that lock and then requests the handle lock; otherwise a plain
 * __lock_get is issued.  Returns 0 without locking when locking is off or
 * the handle is flagged DB_AM_COMPENSATE or DB_AM_RECOVER.
 *
 * PUBLIC: int __fop_lock_handle __P((ENV *,
 * PUBLIC:     DB *, DB_LOCKER *, db_lockmode_t, DB_LOCK *, u_int32_t));
 *
 */
int
__fop_lock_handle(env, dbp, locker, mode, elockp, flags)
	ENV *env;
	DB *dbp;
	DB_LOCKER *locker;
	db_lockmode_t mode;
	DB_LOCK *elockp;
	u_int32_t flags;
{
	DBT obj;
	DB_LOCKREQ vec[2], *failed;
	DB_LOCK_ILOCK ilock;
	int ret;

	/* Nothing to do without a lock manager. */
	if (!LOCKING_ON(env))
		return (0);

	/* Compensating and recovery handles never take handle locks. */
	if (F_ISSET(dbp, DB_AM_COMPENSATE | DB_AM_RECOVER))
		return (0);

	/*
	 * During recovery only the global environment lock is used, so
	 * just drop the caller's environment lock if there is one.  The
	 * exception is an exclusive database being opened on a client
	 * that is syncing with the master.
	 */
	if (IS_RECOVERING(env) && !F2_ISSET(dbp, DB2_AM_INTEXCL)) {
		if (elockp == NULL)
			return (0);
		return (__ENV_LPUT(env, *elockp));
	}

	/* Describe the object being locked. */
	memset(&obj, 0, sizeof(obj));
	ilock.pgno = dbp->meta_pgno;
	ilock.type = DB_HANDLE_LOCK;
	memcpy(ilock.fileid, dbp->fileid, DB_FILE_ID_LEN);
	obj.data = &ilock;
	obj.size = sizeof(ilock);

	DB_TEST_SUBLOCKS(env, flags);
	if (F2_ISSET(dbp, DB2_AM_INTEXCL))
		flags |= DB_LOCK_IGNORE_REC;

	if (elockp != NULL) {
		/* Request 0 drops the env lock, request 1 gets ours. */
		vec[0].op = DB_LOCK_PUT;
		vec[0].lock = *elockp;
		vec[1].op = DB_LOCK_GET;
		vec[1].mode = mode;
		vec[1].obj = &obj;
		vec[1].timeout = 0;

		ret = __lock_vec(env, locker, flags, vec, 2, &failed);
		if (ret == 0) {
			dbp->handle_lock = vec[1].lock;
			/*
			 * Don't wipe out the lock we just stored if the
			 * caller passed the handle lock as the env lock.
			 */
			if (elockp != &dbp->handle_lock)
				LOCK_INIT(*elockp);
		} else if (failed != vec)
			/*
			 * The failing request was not the first one (the
			 * PUT), so invalidate the caller's copy of the
			 * environment lock.
			 */
			LOCK_INIT(*elockp);
	} else
		ret = __lock_get(env, locker,
		    flags, &obj, mode, &dbp->handle_lock);

	/* Recorded whether or not the lock request succeeded. */
	dbp->cur_locker = locker;
	return (ret);
}
156
157 /*
158 * __fop_file_setup --
159 *
160 * Perform all the needed checking and locking to open up or create a
161 * file.
162 *
163 * There's a reason we don't push this code down into the buffer cache.
164 * The problem is that there's no information external to the file that
165 * we can use as a unique ID. UNIX has dev/inode pairs, but they are
166 * not necessarily unique after reboot, if the file was mounted via NFS.
167 * Windows has similar problems, as the FAT filesystem doesn't maintain
168 * dev/inode numbers across reboot. So, we must get something from the
169 * file we can use to ensure that, even after a reboot, the file we're
170 * joining in the cache is the right file for us to join. The solution
171 * we use is to maintain a file ID that's stored in the database, and
172 * that's why we have to open and read the file before calling into the
173 * buffer cache or obtaining a lock (we use this unique fileid to lock
174 * as well as to identify like files in the cache).
175 *
176 * There are a couple of idiosyncrasies that this code must support, in
177 * particular, DB_TRUNCATE and DB_FCNTL_LOCKING. First, we disallow
178 * DB_TRUNCATE in the presence of transactions, since opening a file with
179 * O_TRUNC will result in data being lost in an unrecoverable fashion.
180 * We also disallow DB_TRUNCATE if locking is enabled, because even in
181 * the presence of locking, we cannot avoid race conditions, so allowing
182 * DB_TRUNCATE with locking would be misleading. See SR [#7345] for more
183 * details.
184 *
185 * However, if you are running with neither locking nor transactions, then
186 * you can specify DB_TRUNCATE, and if you do so, we will truncate the file
187 * regardless of its contents.
188 *
189 * FCNTL locking introduces another set of complications. First, the only
190 * reason we support the DB_FCNTL_LOCKING flag is for historic compatibility
191 * with programs like Sendmail and Postfix. In these cases, the caller may
192 * already have a lock on the file; we need to make sure that any file handles
193 * we open remain open, because if we were to close them, the lock held by the
194 * caller would go away. Furthermore, Sendmail and/or Postfix need the ability
195 * to create databases in empty files. So, when you're doing FCNTL locking,
196 * it's reasonable that you are trying to create a database into a 0-length
197 * file and we allow it, while under normal conditions, we do not create
198 * databases if the files already exist and are not Berkeley DB files.
199 *
200 * PUBLIC: int __fop_file_setup __P((DB *, DB_THREAD_INFO *ip,
201 * PUBLIC: DB_TXN *, const char *, int, u_int32_t, u_int32_t *));
202 */
int
__fop_file_setup(dbp, ip, txn, name, mode, flags, retidp)
	DB *dbp;
	DB_THREAD_INFO *ip;
	DB_TXN *txn;
	const char *name;
	int mode;
	u_int32_t flags, *retidp;
{
	DBTYPE save_type;
	DB_FH *fhp;
	DB_LOCK elock;
	DB_LOCKER *locker;
	DB_TXN *stxn;
	ENV *env;
	size_t len;
	APPNAME aflags;
	u_int32_t dflags, oflags;
	u_int8_t mbuf[DBMETASIZE];
	int created_locker, create_ok, ret, retries, t_ret, tmp_created;
	int was_inval;
	char *real_name, *real_tmpname, *tmpname;
	db_lockmode_t lockmode;

	/*
	 * Error handling convention: every failure after this point leaves
	 * through the "err" label so that the environment lock, the handle
	 * lock (non-transactional case), any child transaction, any
	 * temporary file and any locker created here are all released.
	 * On success *retidp holds the id of the child transaction used for
	 * an on-disk create, or TXN_INVALID.
	 */
	*retidp = TXN_INVALID;

	env = dbp->env;
	fhp = NULL;
	LOCK_INIT(elock);
	stxn = NULL;
	created_locker = tmp_created = was_inval = 0;
	real_name = real_tmpname = tmpname = NULL;
	dflags = F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0;
	aflags = LF_ISSET(DB_INTERNAL_PERSISTENT_DB) ? DB_APP_META :
	    (LF_ISSET(DB_INTERNAL_TEMPORARY_DB) ? DB_APP_NONE : DB_APP_DATA);
	LF_CLR(DB_INTERNAL_PERSISTENT_DB | DB_INTERNAL_TEMPORARY_DB);

	ret = 0;
	retries = 0;
	save_type = dbp->type;
	if (F2_ISSET(dbp, DB2_AM_EXCL))
		lockmode = DB_LOCK_WRITE;
	else
		lockmode = DB_LOCK_READ;

	/*
	 * Get a lockerid for this handle.  There are paths through queue
	 * rename and remove where this dbp already has a locker, so make
	 * sure we don't clobber it and conflict.
	 */
	if (LOCKING_ON(env) &&
	    !F_ISSET(dbp, DB_AM_COMPENSATE) &&
	    !F_ISSET(dbp, DB_AM_RECOVER) &&
	    dbp->locker == DB_LOCK_INVALIDID) {
		if ((ret = __lock_id(env, NULL, &dbp->locker)) != 0)
			goto err;
		created_locker = 1;
	}
	LOCK_INIT(dbp->handle_lock);

	if (txn != NULL && dbp->locker != NULL && F_ISSET(txn, TXN_INFAMILY)) {
		if ((ret = __lock_addfamilylocker(env,
		    txn->txnid, dbp->locker->id, 1)) != 0)
			goto err;
		txn = NULL;
	}

	locker = txn == NULL ? dbp->locker : txn->locker;

	oflags = 0;
	if (F_ISSET(dbp, DB_AM_INMEM))
		real_name = (char *)name;
	else {
		/* Get the real backing file name. */
		if ((ret = __db_appname(env,
		    aflags, name, &dbp->dirname, &real_name)) != 0)
			goto err;

		/* Fill in the default file mode. */
		if (mode == 0)
			mode = DB_MODE_660;

		if (LF_ISSET(DB_RDONLY))
			oflags |= DB_OSO_RDONLY;
		if (LF_ISSET(DB_TRUNCATE))
			oflags |= DB_OSO_TRUNC;
	}

	create_ok = LF_ISSET(DB_CREATE);
	LF_CLR(DB_CREATE);

retry:
	/*
	 * If we cannot create the file, only retry a few times.  We
	 * think we might be in a race with another create, but it could
	 * be that the backup filename exists (that is, is left over from
	 * a previous crash).  It is also possible to read the metadata
	 * page while it is being written and fail the checksum.
	 */
	if (++retries > DB_RETRY) {
		__db_errx(env, DB_STR_A("0002",
		    "__fop_file_setup: Retry limit (%d) exceeded", "%d"),
		    DB_RETRY);
		/*
		 * We can get here with ret == 0 (the handle-lock wait path
		 * loops back after a successful lock put).  Never report
		 * success when we gave up.
		 */
		if (ret == 0)
			ret = EINVAL;
		goto err;
	}
	if (!F_ISSET(dbp, DB_AM_COMPENSATE) && !F_ISSET(dbp, DB_AM_RECOVER))
		GET_ENVLOCK(env, locker, &elock);
	if (name == NULL)
		ret = ENOENT;
	else if (F_ISSET(dbp, DB_AM_INMEM)) {
		ret = __env_mpool(dbp, name, flags);
		/*
		 * We are using __env_mpool as a check for existence.
		 * However, __env_mpool does an actual open and there
		 * are scenarios where the object exists, but cannot be
		 * opened, because our settings don't match those internally.
		 * We need to check for that explicitly.  We'll need the
		 * mpool open to read the meta-data page, so we're going to
		 * have to temporarily turn this dbp into an UNKNOWN one.
		 */
		if (ret == EINVAL) {
			was_inval = 1;
			save_type = dbp->type;
			dbp->type = DB_UNKNOWN;
			ret = __env_mpool(dbp, name, flags);
			dbp->type = save_type;
		}
	} else
		ret = __os_exists(env, real_name, NULL);

	if (ret == 0) {
		/*
		 * If the file exists, there are 5 possible cases:
		 * 1. DB_EXCL was specified so this is an error, unless
		 *	this is a file left around after a rename and we
		 *	are in the same transaction.  This gets decomposed
		 *	into several subcases, because we check for various
		 *	errors before we know we're in rename.
		 * 2. We are truncating, and it doesn't matter what kind
		 *	of file it is, we should open/create it.
		 * 3. It is 0-length, we are not doing transactions (i.e.,
		 *	we are sendmail), we should open/create into it.
		 *	-- on-disk files only!
		 * 4. Is it a Berkeley DB file and we should simply open it.
		 * 5. It is not a BDB file and we should return an error.
		 */

		/* Open file (if there is one). */
reopen:		if (!F_ISSET(dbp, DB_AM_INMEM) && (ret =
		    __os_open(env, real_name, 0, oflags, 0, &fhp)) != 0)
			goto err;

		/* Case 2: DB_TRUNCATE: we must do the creation in place. */
		if (LF_ISSET(DB_TRUNCATE)) {
			if (LF_ISSET(DB_EXCL)) {
				/* Case 1a: DB_EXCL and DB_TRUNCATE. */
				ret = EEXIST;
				goto err;
			}
			tmpname = (char *)name;
			goto creat2;
		}

		/* Cases 1,3-5: we need to read the meta-data page. */
		if (F_ISSET(dbp, DB_AM_INMEM)) {
			/*
			 * Leave through err, not a bare return: we hold
			 * the environment lock and may own a new locker.
			 */
			if (LOGGING_ON(env) && (ret = __env_dbreg_setup(dbp,
			    txn, NULL, name, TXN_INVALID)) != 0)
				goto err;
			ret = __fop_inmem_read_meta(
			    dbp, txn, name, flags, DB_CHK_META|DB_CHK_ONLY);
		} else {
			ret = __fop_read_meta(env, real_name, mbuf,
			    sizeof(mbuf), fhp,
			    LF_ISSET(DB_NOERROR) ||
			    (LF_ISSET(DB_FCNTL_LOCKING) && txn == NULL) ? 1 : 0,
			    &len);

			/* Case 3: 0-length, no txns. */
			if (ret != 0 && len == 0 && txn == NULL) {
				if (LF_ISSET(DB_EXCL)) {
					/*
					 * Case 1b: DB_EXCL and
					 * 0-length file exists.
					 */
					ret = EEXIST;
					goto err;
				}
				tmpname = (char *)name;
				if (create_ok)
					goto creat2;
				goto done;
			}

			/*
			 * Case 4: This is a valid file.  Now check the
			 * checksum and decrypt the file so the file
			 * id can be obtained for the handle lock.  Note that
			 * the checksum can fail if the database is being
			 * written (possible because the handle lock has
			 * not been obtained yet).  So on checksum fail retry
			 * until the checksum succeeds or the number of
			 * retries is exhausted, then throw an error.
			 */
			if (ret == 0 && (ret = __db_chk_meta(env, dbp,
			    (DBMETA *)mbuf, DB_CHK_META)) == DB_CHKSUM_FAIL) {
				if ((t_ret = __ENV_LPUT(env, elock)) != 0) {
					ret = t_ret;
					goto err;
				}
				/*
				 * Retry unless the number of retries is
				 * exhausted.
				 */
				if (!(retries < DB_RETRY)) {
					__db_errx(env, DB_STR_A("0210",
				    "%s: metadata page checksum error", "%s"),
					    real_name);
					if (F_ISSET(dbp, DB_AM_RECOVER))
						ret = ENOENT;
					else
						ret = EINVAL;
					goto err;
				}
				CLOSE_HANDLE(dbp, fhp);
				goto retry;
			}
			/* Get the file id for the handle lock. */
			if (ret == 0)
				memcpy(dbp->fileid,
				    ((DBMETA *)mbuf)->uid, DB_FILE_ID_LEN);
		}

		/* Case 5: Invalid file. */
		if (ret != 0)
			goto err;

		/* Now, get our handle lock. */
		if ((ret = __fop_lock_handle(env,
		    dbp, locker, lockmode, NULL, DB_LOCK_NOWAIT)) == 0) {
			if ((ret = __ENV_LPUT(env, elock)) != 0)
				goto err;
		} else if (ret != DB_LOCK_NOTGRANTED ||
		    ((txn != NULL && (F_ISSET(txn, TXN_NOWAIT))) ||
		    F2_ISSET(dbp, DB2_AM_NOWAIT)))
			goto err;
		else {
			PERFMON3(env,
			    race, fop_file_setup, (char *) name, ret, flags);
			/*
			 * We were unable to acquire the handle lock without
			 * blocking.  The fact that we are blocking might mean
			 * that someone else is trying to delete the file.
			 * Since some platforms cannot delete files while they
			 * are open (Windows), we are going to have to close
			 * the file.  This would be a problem if we were doing
			 * FCNTL locking, because our closing the handle would
			 * release the FCNTL locks.  Fortunately, if we are
			 * doing FCNTL locking, then we should never fail to
			 * acquire our handle lock, so we should never get here.
			 * We assert it here to make sure we aren't destroying
			 * any application level FCNTL semantics.
			 */
			DB_ASSERT(env, !LF_ISSET(DB_FCNTL_LOCKING));
			if (!F_ISSET(dbp, DB_AM_INMEM))
				CLOSE_HANDLE(dbp, fhp);
			if ((ret = __fop_lock_handle(env,
			    dbp, locker, lockmode, &elock, 0)) != 0) {
				if (F_ISSET(dbp, DB_AM_INMEM))
					RESET_MPF(dbp, 0);
				goto err;
			}

			/*
			 * If we had to wait, we might be waiting on a
			 * dummy file used in create/destroy of a database.
			 * To be sure we have the correct information we
			 * try again.
			 */
			if (F_ISSET(dbp, DB_AM_INMEM)) {
				RESET_MPF(dbp, 0);
				MAKE_INMEM(dbp);
			}
			if ((ret =
			    __ENV_LPUT(env, dbp->handle_lock)) != 0) {
				LOCK_INIT(dbp->handle_lock);
				goto err;
			}
			goto retry;

		}

		/*
		 * If we got here, then we have the handle lock, it is now
		 * safe to check the rest of the meta data, since the file
		 * will not be deleted out from under the handle.
		 */
		if (F_ISSET(dbp, DB_AM_INMEM)) {
			if ((ret = __fop_inmem_read_meta(
			    dbp, txn, name, flags, DB_SKIP_CHK)) != 0)
				goto err;
		} else {
			if ((ret = __db_meta_setup(env, dbp, real_name,
			    (DBMETA *)mbuf, flags, DB_SKIP_CHK)) != 0)
				goto err;
		}

		/*
		 * Check for a file in the midst of a rename.  If we find that
		 * the file is in the midst of a rename, it must be the case
		 * that it is in our current transaction (else we would still
		 * be blocking), so we can continue along and create a new file
		 * with the same name.  In that case, we have to close the file
		 * handle because we reuse it below.  This is a case where
		 * a 'was_inval' above is OK.
		 */
		if (F_ISSET(dbp, DB_AM_IN_RENAME)) {
			was_inval = 0;
			if (create_ok) {
				if (F_ISSET(dbp, DB_AM_INMEM)) {
					RESET_MPF(dbp, DB_MPOOL_DISCARD);
				} else
					CLOSE_HANDLE(dbp, fhp);
				LF_SET(DB_CREATE);
				goto create;
			} else {
				ret = ENOENT;
				goto err;
			}
		}

		/* If we get here, a was_inval is bad. */
		if (was_inval) {
			ret = EINVAL;
			goto err;
		}

		/*
		 * Now, case 1: check for DB_EXCL, because the file that exists
		 * is not in the middle of a rename, so we have an error.  This
		 * is a weird case, but we need to make sure that we don't
		 * continue to hold the handle lock, since technically, we
		 * should not have been allowed to open it.
		 */
		if (LF_ISSET(DB_EXCL)) {
			ret = __ENV_LPUT(env, dbp->handle_lock);
			LOCK_INIT(dbp->handle_lock);
			if (ret == 0)
				ret = EEXIST;
			goto err;
		}
		goto done;
	}

	/* File does not exist. */
#ifdef	HAVE_VXWORKS
	/*
	 * VxWorks can return file-system specific error codes if the
	 * file does not exist, not ENOENT.
	 */
	if (!create_ok)
#else
	if (!create_ok || ret != ENOENT)
#endif
		goto err;
	LF_SET(DB_CREATE);
	/*
	 * If we were trying to open a non-existent master database
	 * readonly clear that here.
	 */
	LF_CLR(DB_RDONLY);
	F_CLR(dbp, DB_AM_RDONLY);
	ret = 0;

	/*
	 * We need to create file, which means that we need to set up the file,
	 * the fileid and the locks.  Then we need to call the appropriate
	 * routines to create meta-data pages.  For in-memory files, we retain
	 * the environment lock, while for on-disk files, we drop the env lock
	 * and create into a temporary.
	 */
	if (!F_ISSET(dbp, DB_AM_INMEM) &&
	    (ret = __ENV_LPUT(env, elock)) != 0)
		goto err;

create:	if (txn != NULL && IS_REP_CLIENT(env) &&
	    !F_ISSET(dbp, DB_AM_NOT_DURABLE)) {
		__db_errx(env, DB_STR("0003",
		    "Transactional create on replication client disallowed"));
		ret = EINVAL;
		goto err;
	}

	if (F_ISSET(dbp, DB_AM_INMEM)) {
		/*
		 * The environment lock is still held for in-memory files,
		 * so failures must go through err to release it.
		 */
		if (LOGGING_ON(env) && (ret =
		    __env_dbreg_setup(dbp, txn, NULL, name, TXN_INVALID)) != 0)
			goto err;
		if ((ret = __fop_inmem_create(dbp, name, txn, flags)) != 0)
			goto err;
	} else {
		if ((ret = __db_backup_name(env, name, txn, &tmpname)) != 0)
			goto err;
		if (TXN_ON(env) && txn != NULL &&
		    (ret = __txn_begin(env, NULL, txn, &stxn, 0)) != 0)
			goto err;
		if ((ret = __fop_create(env, stxn, &fhp,
		    tmpname, &dbp->dirname, aflags, mode, dflags)) != 0) {
			/*
			 * If no transactions, there is a race on creating the
			 * backup file, as the backup file name is the same for
			 * all processes.  Wait for the other process to finish
			 * with the name.
			 */
			if (!TXN_ON(env) && ret == EEXIST) {
				PERFMON3(env,
				    race, fop_file_setup, tmpname, ret, flags);
				__os_free(env, tmpname);
				tmpname = NULL;
				__os_yield(env, 1, 0);
				goto retry;
			}
			goto err;
		}
		tmp_created = 1;
	}

creat2:	if (!F_ISSET(dbp, DB_AM_INMEM)) {
		if ((ret = __db_appname(env,
		    aflags, tmpname, &dbp->dirname, &real_tmpname)) != 0)
			goto err;

		/* Set the pagesize if it isn't yet set. */
		if (dbp->pgsize == 0 &&
		    (ret = __fop_set_pgsize(dbp, fhp, real_tmpname)) != 0)
			goto errmsg;

		/* Construct a file_id. */
		if ((ret =
		    __os_fileid(env, real_tmpname, 1, dbp->fileid)) != 0)
			goto errmsg;
	}

	if ((ret = __db_new_file(dbp, ip,
	    F_ISSET(dbp, DB_AM_INMEM) ? txn : stxn, fhp, tmpname)) != 0)
		goto err;

	/* Output the REOPEN record after we create. */
	if (F_ISSET(dbp, DB_AM_INMEM) && dbp->log_filename != NULL && (ret =
	    __dbreg_log_id(dbp, txn, dbp->log_filename->id, 0)) != 0)
		goto err;

	/*
	 * We need to close the handle here on platforms where remove and
	 * rename fail if a handle is open (including Windows).
	 */
	CLOSE_HANDLE(dbp, fhp);

	/*
	 * Now move the file into place unless we are creating in place (because
	 * we created a database in a file that started out 0-length).  If
	 * this is an in-memory file, we may or may not hold the environment
	 * lock depending on how we got here.
	 */
	if (!F_ISSET(dbp, DB_AM_COMPENSATE) &&
	    !F_ISSET(dbp, DB_AM_RECOVER) && !LOCK_ISSET(elock))
		GET_ENVLOCK(env, locker, &elock);

	if (F_ISSET(dbp, DB_AM_IN_RENAME)) {
		F_CLR(dbp, DB_AM_IN_RENAME);
		__txn_remrem(env, txn, real_name);
	} else if (name == tmpname) {
		/* We created it in place. */
	} else if (!F_ISSET(dbp, DB_AM_INMEM) &&
	    __os_exists(env, real_name, NULL) == 0) {
		/*
		 * Someone managed to create the file; remove our temp
		 * and try to open the file that now exists.
		 */
		(void)__fop_remove(env, NULL,
		    dbp->fileid, tmpname, &dbp->dirname, aflags, dflags);
		(void)__ENV_LPUT(env, dbp->handle_lock);
		LOCK_INIT(dbp->handle_lock);

		if (stxn != NULL) {
			ret = __txn_abort(stxn);
			stxn = NULL;
		}
		if (ret != 0)
			goto err;
		goto reopen;
	}

	if (name != NULL && (ret = __fop_lock_handle(env,
	    dbp, locker, DB_LOCK_WRITE, NULL, NOWAIT_FLAG(txn)|
	    (F2_ISSET(dbp,DB2_AM_NOWAIT) ? DB_LOCK_NOWAIT : 0))) != 0)
		goto err;
	if (tmpname != NULL &&
	    tmpname != name && (ret = __fop_rename(env, stxn, tmpname,
	    name, &dbp->dirname, dbp->fileid, aflags, 1, dflags)) != 0)
		goto err;
	if ((ret = __ENV_LPUT(env, elock)) != 0)
		goto err;

	if (stxn != NULL) {
		*retidp = stxn->txnid;
		ret = __txn_commit(stxn, 0);
		stxn = NULL;
	} else
		*retidp = TXN_INVALID;

	if (ret != 0)
		goto err;

	F_SET(dbp, DB_AM_CREATED);

	if (0) {
errmsg:		__db_err(env, ret, "%s", name);

err:		CLOSE_HANDLE(dbp, fhp);
		if (stxn != NULL)
			(void)__txn_abort(stxn);
		if (tmp_created && txn == NULL)
			(void)__fop_remove(env,
			    NULL, NULL, tmpname, NULL, aflags, dflags);
		if (txn == NULL)
			(void)__ENV_LPUT(env, dbp->handle_lock);
		(void)__ENV_LPUT(env, elock);
		if (created_locker) {
			(void)__lock_id_free(env, dbp->locker);
			dbp->locker = NULL;
		}
	}

done:	/*
	 * Deal with the file handle before releasing any names:
	 * CLOSE_HANDLE jumps to err on a close failure, and err falls
	 * through to here again, so the frees below must only be reached
	 * once the handle is out of the way.
	 */
	CLOSE_HANDLE(dbp, fhp);

	/*
	 * There are cases where real_name and tmpname take on the
	 * exact same string, so we need to make sure that we do not
	 * free twice.
	 */
	if (tmpname != NULL && tmpname != name)
		__os_free(env, tmpname);
	if (real_name != name && real_name != NULL)
		__os_free(env, real_name);
	if (real_tmpname != NULL)
		__os_free(env, real_tmpname);

	return (ret);
}
750
/*
 * __fop_set_pgsize --
 *	Choose a default page size for the handle from the I/O size that
 *	__os_ioinfo reports for the file, and mark the handle DB_AM_PGDEF.
 *	Returns 0, or the __os_ioinfo error after reporting it.
 */
static int
__fop_set_pgsize(dbp, fhp, name)
	DB *dbp;
	DB_FH *fhp;
	const char *name;
{
	ENV *env;
	u_int32_t pgsize;
	int ret;

	env = dbp->env;

	/*
	 * Start from the filesystem's optimum I/O size.  Some filesystems
	 * report 64K, which would make the default caches fairly large, so
	 * the value is clamped to the range [512, 16K].
	 */
	ret = __os_ioinfo(env, name, fhp, NULL, NULL, &pgsize);
	if (ret != 0) {
		__db_err(env, ret, "%s", name);
		return (ret);
	}
	if (pgsize < 512)
		pgsize = 512;
	else if (pgsize > 16 * 1024)
		pgsize = 16 * 1024;

	/*
	 * Sheer paranoia: page sizes must be a power-of-2 (alignment of
	 * various types on the pages relies on it) and a multiple of the
	 * sector size.  Fall back to the default if the clamped value
	 * still looks bad.
	 */
	if (!IS_VALID_PAGESIZE(pgsize))
		pgsize = DB_DEF_IOSIZE;

	dbp->pgsize = pgsize;
	F_SET(dbp, DB_AM_PGDEF);

	return (0);
}
796
/*
 * __fop_subdb_setup --
 *
 * Subdb setup is significantly simpler than file setup.  In terms of
 * locking, for the duration of the operation/transaction, the locks on
 * the meta-data page will suffice to protect us from simultaneous operations
 * on the sub-database.  Before we complete the operation though, we'll get a
 * handle lock on the subdatabase so that no one else can try to remove it
 * while we've got it open.  We use an object that looks like the meta-data
 * page lock with a different type (DB_HANDLE_LOCK) for the long-term handle
 * locks.
 *
 * The master database is opened read-only first and reopened read/write
 * only if updating it fails with EBADF.  The master handle is always closed
 * before returning.
 *
 * PUBLIC: int __fop_subdb_setup __P((DB *, DB_THREAD_INFO *, DB_TXN *,
 * PUBLIC:     const char *, const char *, int, u_int32_t));
 */
int
__fop_subdb_setup(dbp, ip, txn, mname, name, mode, flags)
	DB *dbp;
	DB_THREAD_INFO *ip;
	DB_TXN *txn;
	const char *mname, *name;
	int mode;
	u_int32_t flags;
{
	DB *mdbp;
	ENV *env;
	db_lockmode_t lkmode;
	u_int32_t mflags;
	int ret, t_ret;

	mdbp = NULL;
	env = dbp->env;

	mflags = flags | DB_RDONLY;
retry:	if ((ret = __db_master_open(dbp,
	    ip, txn, mname, mflags, mode, &mdbp)) != 0)
		return (ret);
	/*
	 * If we created this file, then we need to set the DISCARD flag so
	 * that if we fail in the middle of this routine, we discard from the
	 * mpool any pages that we just created.
	 */
	if (F_ISSET(mdbp, DB_AM_CREATED))
		F_SET(mdbp, DB_AM_DISCARD);

	/*
	 * We are going to close this instance of the master, so we can
	 * steal its handle instead of reopening a handle on the database.
	 */
	if (LF_ISSET(DB_FCNTL_LOCKING)) {
		dbp->saved_open_fhp = mdbp->saved_open_fhp;
		mdbp->saved_open_fhp = NULL;
	}

	/* Copy the pagesize and set the sub-database flag. */
	dbp->pgsize = mdbp->pgsize;
	F_SET(dbp, DB_AM_SUBDB);

	if (name != NULL && (ret = __db_master_update(mdbp, dbp,
	    ip, txn, name, dbp->type, MU_OPEN, NULL, flags)) != 0) {
		if (ret == EBADF && F_ISSET(mdbp, DB_AM_RDONLY)) {
			/*
			 * We need to reopen the master R/W to do the create.
			 *
			 * If the close fails we must not go through err:
			 * that path dereferences mdbp and closes it again.
			 * We hold no handle lock yet, so there is nothing
			 * else to undo.
			 * NOTE(review): relies on the handle being unusable
			 * after __db_close whatever it returns, as the
			 * DB->close documentation states -- confirm.
			 */
			if ((ret = __db_close(mdbp, txn, 0)) != 0)
				return (ret);
			FLD_CLR(mflags, DB_RDONLY);
			goto retry;
		}
		goto err;
	}

	/*
	 * Hijack the master's locker ID as well, so that our locks don't
	 * conflict with the master's.  Since we're closing the master,
	 * that locker would just have been freed anyway.  Once we've gotten
	 * the locker id, we need to acquire the handle lock for this
	 * subdatabase.
	 */
	dbp->locker = mdbp->locker;
	mdbp->locker = NULL;

	DB_TEST_RECOVERY(dbp, DB_TEST_POSTLOG, ret, mname);

	/*
	 * We copy our fileid from our master so that we all open
	 * the same file in mpool.  We'll use the meta-pgno to lock
	 * so that we end up with different handle locks.
	 */

	memcpy(dbp->fileid, mdbp->fileid, DB_FILE_ID_LEN);
	lkmode = F_ISSET(dbp, DB_AM_CREATED) || LF_ISSET(DB_WRITEOPEN) ||
	    F2_ISSET(dbp, DB2_AM_EXCL) ? DB_LOCK_WRITE : DB_LOCK_READ;
	if ((ret = __fop_lock_handle(env, dbp,
	    txn == NULL ? dbp->locker : txn->locker, lkmode, NULL,
	    NOWAIT_FLAG(txn) |
	    (F2_ISSET(dbp, DB2_AM_NOWAIT) ? DB_LOCK_NOWAIT : 0))) != 0)
		goto err;

	if ((ret = __db_init_subdb(mdbp, dbp, name, ip, txn)) != 0) {
		/*
		 * If there was no transaction and we created this database,
		 * then we need to undo the update of the master database.
		 */
		if (F_ISSET(dbp, DB_AM_CREATED) && txn == NULL)
			(void)__db_master_update(mdbp, dbp,
			    ip, txn, name, dbp->type, MU_REMOVE, NULL, 0);
		F_CLR(dbp, DB_AM_CREATED);
		goto err;
	}

	/*
	 * XXX
	 * This should have been done at the top of this routine.  The problem
	 * is that __db_init_subdb() uses "standard" routines to process the
	 * meta-data page and set information in the DB handle based on it.
	 * Those routines have to deal with swapped pages and will normally set
	 * the DB_AM_SWAP flag.  However, we use the master's metadata page and
	 * that has already been swapped, so they get the is-swapped test wrong.
	 */
	F_CLR(dbp, DB_AM_SWAP);
	F_SET(dbp, F_ISSET(mdbp, DB_AM_SWAP));

	/*
	 * In the file create case, these happen in separate places so we have
	 * two different tests.  They end up in the same place for subdbs, but
	 * for compatibility with file testing, we put them both here anyway.
	 */
	DB_TEST_RECOVERY(dbp, DB_TEST_POSTLOGMETA, ret, mname);
	DB_TEST_RECOVERY(dbp, DB_TEST_POSTSYNC, ret, mname);

	/*
	 * File exists and we have the appropriate locks; we should now
	 * process a normal open.
	 */
	if (F_ISSET(mdbp, DB_AM_CREATED)) {
		F_SET(dbp, DB_AM_CREATED_MSTR);
		F_CLR(mdbp, DB_AM_DISCARD);
	}

	if (0) {
err:
DB_TEST_RECOVERY_LABEL
		if (txn == NULL)
			(void)__ENV_LPUT(env, dbp->handle_lock);
	}

	/*
	 * The master's handle lock is under the control of the
	 * subdb (it acquired the master's locker).  We want to
	 * keep the master's handle lock so that no one can remove
	 * the file while the subdb is open.  If we register the
	 * trade event and then invalidate the copy of the lock
	 * in the master's handle, that will accomplish this.  However,
	 * before we register this event, we'd better remove any
	 * events that we've already registered for the master.
	 */
	if (!F_ISSET(dbp, DB_AM_RECOVER) && IS_REAL_TXN(txn)) {
		/* Unregister old master events. */
		__txn_remlock(env,
		    txn, &mdbp->handle_lock, DB_LOCK_INVALIDID);

		/* Now register the new event. */
		if ((t_ret = __txn_lockevent(env, txn, dbp,
		    &mdbp->handle_lock, dbp->locker == NULL ?
		    mdbp->locker : dbp->locker)) != 0 && ret == 0)
			ret = t_ret;
	}
	LOCK_INIT(mdbp->handle_lock);

	/*
	 * If the master was created, we need to sync so that the metadata
	 * page is correct on disk for recovery, since it isn't read through
	 * mpool.  If we're opening a subdb in an existing file, we can skip
	 * the sync.
	 */
	if ((t_ret = __db_close(mdbp, txn,
	    F_ISSET(dbp, DB_AM_CREATED_MSTR) ? 0 : DB_NOSYNC)) != 0 && ret == 0)
		ret = t_ret;

	return (ret);
}
977
978 /*
979 * __fop_remove_setup --
980 * Open handle appropriately and lock for removal of a database file.
981 *
982 * PUBLIC: int __fop_remove_setup __P((DB *,
983 * PUBLIC: DB_TXN *, const char *, u_int32_t));
984 */
985 int
__fop_remove_setup(dbp,txn,name,flags)986 __fop_remove_setup(dbp, txn, name, flags)
987 DB *dbp;
988 DB_TXN *txn;
989 const char *name;
990 u_int32_t flags;
991 {
992 DB_FH *fhp;
993 DB_LOCK elock;
994 ENV *env;
995 u_int8_t mbuf[DBMETASIZE];
996 int ret;
997
998 COMPQUIET(flags, 0);
999
1000 env = dbp->env;
1001
1002 LOCK_INIT(elock);
1003 fhp = NULL;
1004 ret = 0;
1005
1006 /* Create locker if necessary. */
1007 retry: if (LOCKING_ON(env)) {
1008 if (IS_REAL_TXN(txn))
1009 dbp->locker = txn->locker;
1010 else if (dbp->locker == DB_LOCK_INVALIDID) {
1011 if ((ret = __lock_id(env, NULL, &dbp->locker)) != 0)
1012 goto err;
1013 if (txn != NULL && F_ISSET(txn, TXN_INFAMILY) &&
1014 (ret = __lock_addfamilylocker(env,
1015 txn->txnid, dbp->locker->id, 1)) != 0)
1016 goto err;
1017 }
1018 }
1019
1020 /*
1021 * We are about to open a file handle and then possibly close it.
1022 * We cannot close handles if we are doing FCNTL locking. However,
1023 * there is no way to pass the FCNTL flag into this routine via the
1024 * user API. The only way we can get in here and be doing FCNTL
1025 * locking is if we are trying to clean up an open that was called
1026 * with FCNTL locking. In that case, the save_fhp should already be
1027 * set. So, we use that field to tell us if we need to make sure
1028 * that we shouldn't close the handle.
1029 */
1030 fhp = dbp->saved_open_fhp;
1031 DB_ASSERT(env, LF_ISSET(DB_FCNTL_LOCKING) || fhp == NULL);
1032
1033 /*
1034 * Lock environment to protect file open. That will enable us to
1035 * read the meta-data page and get the fileid so that we can lock
1036 * the handle.
1037 */
1038 GET_ENVLOCK(env, dbp->locker, &elock);
1039
1040 /* Open database. */
1041 if (F_ISSET(dbp, DB_AM_INMEM)) {
1042 if ((ret = __env_mpool(dbp, name, flags)) == 0)
1043 ret = __os_strdup(env, name, &dbp->dname);
1044 } else if (fhp == NULL)
1045 ret = __os_open(env, name, 0, DB_OSO_RDONLY, 0, &fhp);
1046 if (ret != 0)
1047 goto err;
1048
1049 /* Get meta-data */
1050 if (F_ISSET(dbp, DB_AM_INMEM))
1051 ret = __fop_inmem_read_meta(
1052 dbp, txn, name, flags, DB_CHK_META);
1053 else if ((ret = __fop_read_meta(env,
1054 name, mbuf, sizeof(mbuf), fhp, 0, NULL)) == 0)
1055 ret = __db_meta_setup(env, dbp,
1056 name, (DBMETA *)mbuf, flags, DB_CHK_META | DB_CHK_NOLSN);
1057 if (ret != 0)
1058 goto err;
1059
1060 /*
1061 * Now, get the handle lock. We first try with NOWAIT, because if
1062 * we have to wait, we're going to have to close the file and reopen
1063 * it, so that if there is someone else removing it, our open doesn't
1064 * prevent that.
1065 */
1066 if ((ret = __fop_lock_handle(env,
1067 dbp, dbp->locker, DB_LOCK_WRITE, NULL, DB_LOCK_NOWAIT)) != 0) {
1068 /*
1069 * Close the file, block on the lock, clean up the dbp, and
1070 * then start all over again.
1071 */
1072 if (!F_ISSET(dbp, DB_AM_INMEM) && !LF_ISSET(DB_FCNTL_LOCKING)) {
1073 (void)__os_closehandle(env, fhp);
1074 fhp = NULL;
1075 }
1076 if (ret != DB_LOCK_NOTGRANTED ||
1077 (txn != NULL && F_ISSET(txn, TXN_NOWAIT)))
1078 goto err;
1079 else if ((ret = __fop_lock_handle(env,
1080 dbp, dbp->locker, DB_LOCK_WRITE, &elock, 0)) != 0)
1081 goto err;
1082
1083 if (F_ISSET(dbp, DB_AM_INMEM)) {
1084 (void)__lock_put(env, &dbp->handle_lock);
1085 (void)__db_refresh(dbp, txn, DB_NOSYNC, NULL, 1);
1086 } else {
1087 if (txn != NULL)
1088 dbp->locker = NULL;
1089 (void)__db_refresh(dbp, txn, DB_NOSYNC, NULL, 0);
1090 }
1091 goto retry;
1092 } else if ((ret = __ENV_LPUT(env, elock)) != 0)
1093 goto err;
1094 else if (F_ISSET(dbp, DB_AM_IN_RENAME))
1095 ret = ENOENT;
1096
1097 if (0) {
1098 err: (void)__ENV_LPUT(env, elock);
1099 }
1100 if (fhp != NULL && !LF_ISSET(DB_FCNTL_LOCKING))
1101 (void)__os_closehandle(env, fhp);
1102 /*
1103 * If this is a real file and we are going to proceed with the removal,
1104 * then we need to make sure that we don't leave any pages around in the
1105 * mpool since the file is closed and will be reopened again before
1106 * access. However, this might be an in-memory file, in which case
1107 * we will handle the discard from the mpool later as it's the "real"
1108 * removal of the database.
1109 */
1110 if (ret == 0 && !F_ISSET(dbp, DB_AM_INMEM))
1111 F_SET(dbp, DB_AM_DISCARD);
1112 return (ret);
1113 }
1114
1115 /*
1116 * __fop_read_meta --
1117 * Read the meta-data page from a file and return it in buf.
1118 *
1119 * PUBLIC: int __fop_read_meta __P((ENV *, const char *,
1120 * PUBLIC: u_int8_t *, size_t, DB_FH *, int, size_t *));
1121 */
1122 int
__fop_read_meta(env,name,buf,size,fhp,errok,nbytesp)1123 __fop_read_meta(env, name, buf, size, fhp, errok, nbytesp)
1124 ENV *env;
1125 const char *name;
1126 u_int8_t *buf;
1127 size_t size;
1128 DB_FH *fhp;
1129 int errok;
1130 size_t *nbytesp;
1131 {
1132 size_t nr;
1133 int ret;
1134
1135 /*
1136 * Our caller wants to know the number of bytes read, even if we
1137 * return an error.
1138 */
1139 if (nbytesp != NULL)
1140 *nbytesp = 0;
1141
1142 nr = 0;
1143 ret = __os_read(env, fhp, buf, size, &nr);
1144 if (nbytesp != NULL)
1145 *nbytesp = nr;
1146
1147 if (ret != 0) {
1148 if (!errok)
1149 __db_err(env, ret, "%s", name);
1150 goto err;
1151 }
1152
1153 if (nr != size) {
1154 if (!errok)
1155 __db_errx(env, DB_STR_A("0004",
1156 "fop_read_meta: %s: unexpected file type or format",
1157 "%s"), name);
1158 ret = EINVAL;
1159 }
1160
1161 err:
1162 return (ret);
1163 }
1164
1165 /*
1166 * __fop_dummy --
1167 * This implements the creation and name swapping of dummy files that
1168 * we use for remove and rename (remove is simply a rename with a delayed
1169 * remove).
1170 *
1171 * PUBLIC: int __fop_dummy __P((DB *,
1172 * PUBLIC: DB_TXN *, const char *, const char *));
1173 */
1174 int
__fop_dummy(dbp,txn,old,new)1175 __fop_dummy(dbp, txn, old, new)
1176 DB *dbp;
1177 DB_TXN *txn;
1178 const char *old, *new;
1179 {
1180 DB *tmpdbp;
1181 DB_TXN *stxn;
1182 ENV *env;
1183 char *back;
1184 int ret, t_ret;
1185 u_int8_t mbuf[DBMETASIZE];
1186
1187 env = dbp->env;
1188 back = NULL;
1189 stxn = NULL;
1190 tmpdbp = NULL;
1191
1192 DB_ASSERT(env, txn != NULL);
1193
1194 /*
1195 * Begin sub transaction to encapsulate the rename. Note that we
1196 * expect the inmem_swap calls to complete the sub-transaction,
1197 * aborting on error and committing on success.
1198 */
1199 if (TXN_ON(env) &&
1200 (ret = __txn_begin(env, NULL, txn, &stxn, 0)) != 0)
1201 goto err;
1202
1203 /* We need to create a dummy file as a place holder. */
1204 if ((ret = __db_backup_name(env, new, stxn, &back)) != 0)
1205 goto err;
1206 /* Create a dummy dbp handle. */
1207 if ((ret = __db_create_internal(&tmpdbp, env, 0)) != 0)
1208 goto err;
1209 if (F_ISSET(dbp, DB_AM_NOT_DURABLE) &&
1210 (ret = __db_set_flags(tmpdbp, DB_TXN_NOT_DURABLE)) != 0)
1211 goto err;
1212 memset(mbuf, 0, sizeof(mbuf));
1213 ret = F_ISSET(dbp, DB_AM_INMEM) ?
1214 __fop_inmem_dummy(tmpdbp, stxn, back, mbuf) :
1215 __fop_ondisk_dummy(tmpdbp, stxn, back, mbuf);
1216
1217 if (ret != 0)
1218 goto err;
1219
1220 ret = F_ISSET(dbp, DB_AM_INMEM) ?
1221 __fop_inmem_swap(dbp, tmpdbp, stxn, old, new, back, txn->locker) :
1222 __fop_ondisk_swap(dbp, tmpdbp, stxn, old, new, back, txn->locker);
1223 stxn = NULL;
1224 if (ret != 0)
1225 goto err;
1226
1227 err: if (stxn != NULL)
1228 (void)__txn_abort(stxn);
1229 if (tmpdbp != NULL &&
1230 (t_ret = __db_close(tmpdbp, NULL, 0)) != 0 && ret == 0)
1231 ret = t_ret;
1232 if (back != NULL)
1233 __os_free(env, back);
1234 return (ret);
1235 }
1236
1237 /*
1238 * __fop_dbrename --
1239 * Do the appropriate file locking and file system operations
1240 * to effect a dbrename in the absence of transactions (__fop_dummy
1241 * and the subsequent calls in __db_rename do the work for the
1242 * transactional case).
1243 *
1244 * PUBLIC: int __fop_dbrename __P((DB *, const char *, const char *));
1245 */
1246 int
__fop_dbrename(dbp,old,new)1247 __fop_dbrename(dbp, old, new)
1248 DB *dbp;
1249 const char *old, *new;
1250 {
1251 DB_LOCK elock;
1252 ENV *env;
1253 char *real_new, *real_old;
1254 int ret, t_ret;
1255
1256 env = dbp->env;
1257 real_new = NULL;
1258 real_old = NULL;
1259 LOCK_INIT(elock);
1260
1261 if (F_ISSET(dbp, DB_AM_INMEM)) {
1262 real_new = (char *)new;
1263 real_old = (char *)old;
1264 } else {
1265 /* Get full names. */
1266 if ((ret = __db_appname(env,
1267 DB_APP_DATA, old, &dbp->dirname, &real_old)) != 0)
1268 goto err;
1269
1270 if ((ret = __db_appname(env,
1271 DB_APP_DATA, new, &dbp->dirname, &real_new)) != 0)
1272 goto err;
1273 }
1274
1275 /*
1276 * It is an error to rename a file over one that already exists,
1277 * as that wouldn't be transaction-safe. We check explicitly
1278 * for ondisk files, but it's done memp_nameop for in-memory ones.
1279 */
1280 GET_ENVLOCK(env, dbp->locker, &elock);
1281 ret = F_ISSET(dbp, DB_AM_INMEM) ? ENOENT :
1282 __os_exists(env, real_new, NULL);
1283
1284 if (ret == 0) {
1285 ret = EEXIST;
1286 __db_errx(env, DB_STR_A("0005",
1287 "rename: file %s exists", "%s"), real_new);
1288 goto err;
1289 }
1290
1291 ret = __memp_nameop(env,
1292 dbp->fileid, new, real_old, real_new, F_ISSET(dbp, DB_AM_INMEM));
1293
1294 err: if ((t_ret = __ENV_LPUT(env, elock)) != 0 && ret == 0)
1295 ret = t_ret;
1296 if (!F_ISSET(dbp, DB_AM_INMEM) && real_old != NULL)
1297 __os_free(env, real_old);
1298 if (!F_ISSET(dbp, DB_AM_INMEM) && real_new != NULL)
1299 __os_free(env, real_new);
1300 return (ret);
1301 }
1302
1303 static int
__fop_inmem_create(dbp,name,txn,flags)1304 __fop_inmem_create(dbp, name, txn, flags)
1305 DB *dbp;
1306 const char *name;
1307 DB_TXN *txn;
1308 u_int32_t flags;
1309 {
1310 DBT fid_dbt, name_dbt;
1311 DB_LSN lsn;
1312 ENV *env;
1313 int ret;
1314 int32_t lfid;
1315 u_int32_t dflags, *p32;
1316
1317 env = dbp->env;
1318 dflags = F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0;
1319
1320 MAKE_INMEM(dbp);
1321
1322 /* Set the pagesize if it isn't yet set. */
1323 if (dbp->pgsize == 0)
1324 dbp->pgsize = DB_DEF_IOSIZE;
1325
1326 /*
1327 * Construct a file_id.
1328 *
1329 * If this file has no name, then we only need a fileid for locking.
1330 * If this file has a name, we need the fileid both for locking and
1331 * matching in the memory pool. So, with unnamed in-memory databases,
1332 * use a lock_id. For named in-memory files, we need to find a value
1333 * that we can use to uniquely identify a name/fid pair. We use a
1334 * combination of a unique id (__os_unique_id) and a hash of the
1335 * original name.
1336 */
1337 if (name == NULL) {
1338 if (LOCKING_ON(env) && (ret =
1339 __lock_id(env, (u_int32_t *)dbp->fileid, NULL)) != 0)
1340 goto err;
1341 } else {
1342 p32 = (u_int32_t *)(&dbp->fileid[0]);
1343 __os_unique_id(env, p32);
1344 p32++;
1345 (void)strncpy(
1346 (char *)p32, name, DB_FILE_ID_LEN - sizeof(u_int32_t));
1347 dbp->preserve_fid = 1;
1348
1349 if (DBENV_LOGGING(env) &&
1350 #if !defined(DEBUG_WOP) && !defined(DIAGNOSTIC)
1351 txn != NULL &&
1352 #endif
1353 dbp->log_filename != NULL)
1354 memcpy(dbp->log_filename->ufid,
1355 dbp->fileid, DB_FILE_ID_LEN);
1356 }
1357
1358 /* Now, set the fileid. */
1359 if ((ret = __memp_set_fileid(dbp->mpf, dbp->fileid)) != 0)
1360 goto err;
1361
1362 if ((ret = __env_mpool(dbp, name, flags)) != 0)
1363 goto err;
1364
1365 if (DBENV_LOGGING(env) &&
1366 #if !defined(DEBUG_WOP)
1367 txn != NULL &&
1368 #endif
1369 name != NULL) {
1370 DB_INIT_DBT(name_dbt, name, strlen(name) + 1);
1371 memset(&fid_dbt, 0, sizeof(fid_dbt));
1372 fid_dbt.data = dbp->fileid;
1373 fid_dbt.size = DB_FILE_ID_LEN;
1374 lfid = dbp->log_filename == NULL ?
1375 DB_LOGFILEID_INVALID : dbp->log_filename->id;
1376 if ((ret = __crdel_inmem_create_log(env, txn,
1377 &lsn, dflags, lfid, &name_dbt, &fid_dbt, dbp->pgsize)) != 0)
1378 goto err;
1379 }
1380
1381 F_SET(dbp, DB_AM_CREATED);
1382
1383 err:
1384 return (ret);
1385 }
1386
1387 static int
__fop_inmem_read_meta(dbp,txn,name,flags,chkflags)1388 __fop_inmem_read_meta(dbp, txn, name, flags, chkflags)
1389 DB *dbp;
1390 DB_TXN *txn;
1391 const char *name;
1392 u_int32_t flags;
1393 u_int32_t chkflags;
1394 {
1395 DBMETA *metap;
1396 DB_THREAD_INFO *ip;
1397 db_pgno_t pgno;
1398 int ret, t_ret;
1399
1400 if (txn == NULL)
1401 ENV_GET_THREAD_INFO(dbp->env, ip);
1402 else
1403 ip = txn->thread_info;
1404
1405 pgno = PGNO_BASE_MD;
1406 if ((ret = __memp_fget(dbp->mpf, &pgno, ip, txn, 0, &metap)) != 0)
1407 return (ret);
1408 if (FLD_ISSET(chkflags, DB_CHK_ONLY)) {
1409 if ((ret = __db_chk_meta(dbp->env, dbp, metap, chkflags)) == 0)
1410 memcpy(dbp->fileid,
1411 ((DBMETA *)metap)->uid, DB_FILE_ID_LEN);
1412 } else
1413 ret = __db_meta_setup(
1414 dbp->env, dbp, name, metap, flags, chkflags);
1415
1416 if ((t_ret =
1417 __memp_fput(dbp->mpf, ip, metap, dbp->priority)) && ret == 0)
1418 ret = t_ret;
1419
1420 return (ret);
1421 }
1422
1423 static int
__fop_ondisk_dummy(dbp,txn,name,mbuf)1424 __fop_ondisk_dummy(dbp, txn, name, mbuf)
1425 DB *dbp;
1426 DB_TXN *txn;
1427 const char *name;
1428 u_int8_t *mbuf;
1429 {
1430 ENV *env;
1431 int ret;
1432 char *realname;
1433 u_int32_t dflags;
1434
1435 realname = NULL;
1436 env = dbp->env;
1437 dflags = F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0;
1438
1439 if ((ret = __db_appname(env,
1440 DB_APP_DATA, name, &dbp->dirname, &realname)) != 0)
1441 goto err;
1442
1443 if ((ret = __fop_create(env,
1444 txn, NULL, name, &dbp->dirname, DB_APP_DATA, 0, dflags)) != 0)
1445 goto err;
1446
1447 if ((ret =
1448 __os_fileid(env, realname, 1, ((DBMETA *)mbuf)->uid)) != 0)
1449 goto err;
1450
1451 ((DBMETA *)mbuf)->magic = DB_RENAMEMAGIC;
1452 if ((ret = __fop_write(env, txn, name, dbp->dirname,
1453 DB_APP_DATA, NULL, 0, 0, 0, mbuf, DBMETASIZE, 1, dflags)) != 0)
1454 goto err;
1455
1456 memcpy(dbp->fileid, ((DBMETA *)mbuf)->uid, DB_FILE_ID_LEN);
1457
1458 err: if (realname != NULL)
1459 __os_free(env, realname);
1460
1461 return (ret);
1462 }
1463
1464 static int
__fop_inmem_dummy(dbp,txn,name,mbuf)1465 __fop_inmem_dummy(dbp, txn, name, mbuf)
1466 DB *dbp;
1467 DB_TXN *txn;
1468 const char *name;
1469 u_int8_t *mbuf;
1470 {
1471 DBMETA *metap;
1472 DB_THREAD_INFO *ip;
1473 db_pgno_t pgno;
1474 int ret, t_ret;
1475
1476 if ((ret = __fop_inmem_create(dbp, name, txn, DB_CREATE)) != 0)
1477 return (ret);
1478 if (txn == NULL)
1479 ENV_GET_THREAD_INFO(dbp->env, ip);
1480 else
1481 ip = txn->thread_info;
1482
1483 pgno = PGNO_BASE_MD;
1484 if ((ret = __memp_fget(dbp->mpf, &pgno, ip, txn,
1485 DB_MPOOL_CREATE | DB_MPOOL_DIRTY, &metap)) != 0)
1486 return (ret);
1487 /* Check file existed. */
1488 if (metap->magic != 0)
1489 ret = EEXIST;
1490 else
1491 metap->magic = DB_RENAMEMAGIC;
1492
1493 /* Copy the fileid onto the meta-data page. */
1494 memcpy(metap->uid, dbp->fileid, DB_FILE_ID_LEN);
1495
1496 if ((t_ret = __memp_fput(dbp->mpf, ip, metap,
1497 ret == 0 ? dbp->priority : DB_PRIORITY_VERY_LOW)) != 0 && ret == 0)
1498 ret = t_ret;
1499
1500 if (ret != 0)
1501 goto err;
1502
1503 ((DBMETA *)mbuf)->magic = DB_RENAMEMAGIC;
1504
1505 err: return (ret);
1506 }
1507
1508 static int
__fop_ondisk_swap(dbp,tmpdbp,txn,old,new,back,locker)1509 __fop_ondisk_swap(dbp, tmpdbp, txn, old, new, back, locker)
1510 DB *dbp, *tmpdbp;
1511 DB_TXN *txn;
1512 const char *old, *new, *back;
1513 DB_LOCKER *locker;
1514 {
1515 DBT fiddbt, namedbt, tmpdbt;
1516 DB_FH *fhp;
1517 DB_LOCK elock;
1518 DB_LSN lsn;
1519 DB_TXN *parent;
1520 ENV *env;
1521 u_int8_t mbuf[DBMETASIZE];
1522 u_int32_t child_txnid, dflags;
1523 int ret, t_ret;
1524 char *realold, *realnew;
1525
1526 env = dbp->env;
1527 DB_ASSERT(env, txn != NULL);
1528 DB_ASSERT(env, old != NULL);
1529
1530 realold = realnew = NULL;
1531 LOCK_INIT(elock);
1532 fhp = NULL;
1533 dflags = F_ISSET(dbp, DB_AM_NOT_DURABLE) ? DB_LOG_NOT_DURABLE : 0;
1534
1535 if ((ret = __db_appname(env,
1536 DB_APP_DATA, new, &dbp->dirname, &realnew)) != 0)
1537 goto err;
1538
1539 /* Now, lock the name space while we initialize this file. */
1540 retry: GET_ENVLOCK(env, locker, &elock);
1541 if (__os_exists(env, realnew, NULL) == 0) {
1542 /*
1543 * It is possible that the only reason this file exists is
1544 * because we've done a previous rename of it and we have
1545 * left a placeholder here. We need to check for that case
1546 * and allow this rename to succeed if that's the case.
1547 */
1548 if ((ret = __os_open(env, realnew, 0, 0, 0, &fhp)) != 0)
1549 goto err;
1550 if ((ret = __fop_read_meta(env,
1551 realnew, mbuf, sizeof(mbuf), fhp, 0, NULL)) != 0 ||
1552 (ret = __db_meta_setup(env,
1553 tmpdbp, realnew, (DBMETA *)mbuf, 0, DB_CHK_META)) != 0) {
1554 ret = EEXIST;
1555 goto err;
1556 }
1557 ret = __os_closehandle(env, fhp);
1558 fhp = NULL;
1559 if (ret != 0)
1560 goto err;
1561
1562 /*
1563 * Now, try to acquire the handle lock. If the handle is locked
1564 * by our current, transaction, then we'll get it and life is
1565 * good.
1566 *
1567 * Alternately, it's not locked at all, we'll get the lock, but
1568 * we will realize it exists and consider this an error.
1569 *
1570 * However, if it's held by another transaction, then there
1571 * could be two different scenarios: 1) the file is in the
1572 * midst of being created or deleted and when that transaction
1573 * is over, we might be able to proceed. 2) the file is open
1574 * and exists and we should report an error. In order to
1575 * distinguish these two cases, we do the following. First, we
1576 * try to acquire a READLOCK. If the handle is in the midst of
1577 * being created, then we'll block because a writelock is held.
1578 * In that case, we should request a blocking write, and when we
1579 * get the lock, we should then go back and check to see if the
1580 * object exists and start all over again.
1581 *
1582 * If we got the READLOCK, then either no one is holding the
1583 * lock or someone has an open handle and the fact that the file
1584 * exists is problematic. So, in this case, we request the
1585 * WRITELOCK non-blocking -- if it succeeds, we're golden. If
1586 * it fails, then the file exists and we return EEXIST.
1587 */
1588 if ((ret = __fop_lock_handle(env,
1589 tmpdbp, locker, DB_LOCK_READ, NULL, DB_LOCK_NOWAIT)) != 0) {
1590 /*
1591 * Someone holds a write-lock. Wait for the write-lock
1592 * and after we get it, release it and start over.
1593 */
1594 if ((ret = __fop_lock_handle(env, tmpdbp,
1595 locker, DB_LOCK_WRITE, &elock, 0)) != 0)
1596 goto err;
1597 if ((ret =
1598 __lock_put(env, &tmpdbp->handle_lock)) != 0)
1599 goto err;
1600 if ((ret = __db_refresh(tmpdbp, NULL, 0, NULL, 0)) != 0)
1601 goto err;
1602 goto retry;
1603 }
1604
1605 /* We got the read lock; try to upgrade it. */
1606 ret = __fop_lock_handle(env,
1607 tmpdbp, locker, DB_LOCK_WRITE,
1608 NULL, DB_LOCK_UPGRADE | DB_LOCK_NOWAIT);
1609 if (ret != 0) {
1610 /*
1611 * We did not get the writelock, so someone
1612 * has the handle open. This is an error.
1613 */
1614 (void)__lock_put(env, &tmpdbp->handle_lock);
1615 ret = EEXIST;
1616 } else if (F_ISSET(tmpdbp, DB_AM_IN_RENAME))
1617 /* We got the lock and are renaming it. */
1618 ret = 0;
1619 else { /* We got the lock, but the file exists. */
1620 (void)__lock_put(env, &tmpdbp->handle_lock);
1621 ret = EEXIST;
1622 }
1623 if (ret != 0)
1624 goto err;
1625 }
1626
1627 /*
1628 * While we have the namespace locked, do the renames and then
1629 * swap for the handle lock.
1630 */
1631 if ((ret = __fop_rename(env, txn,
1632 old, new, &dbp->dirname, dbp->fileid, DB_APP_DATA, 1, dflags)) != 0)
1633 goto err;
1634 if ((ret = __fop_rename(env, txn, back, old,
1635 &dbp->dirname, tmpdbp->fileid, DB_APP_DATA, 0, dflags)) != 0)
1636 goto err;
1637 if ((ret = __fop_lock_handle(env,
1638 tmpdbp, locker, DB_LOCK_WRITE, &elock, NOWAIT_FLAG(txn))) != 0)
1639 goto err;
1640
1641 /*
1642 * We just acquired a transactional lock on the tmp handle.
1643 * We need to null out the tmp handle's lock so that it
1644 * doesn't create problems for us in the close path.
1645 */
1646 LOCK_INIT(tmpdbp->handle_lock);
1647
1648 /* Commit the child. */
1649 child_txnid = txn->txnid;
1650 parent = txn->parent;
1651 ret = __txn_commit(txn, 0);
1652 txn = NULL;
1653
1654 /*
1655 * If the new name is available because it was previously renamed
1656 * remove it from the remove list.
1657 */
1658 if (F_ISSET(tmpdbp, DB_AM_IN_RENAME))
1659 __txn_remrem(env, parent, realnew);
1660
1661 /* Now log the child information in the parent. */
1662 memset(&fiddbt, 0, sizeof(fiddbt));
1663 fiddbt.data = dbp->fileid;
1664 fiddbt.size = DB_FILE_ID_LEN;
1665 memset(&tmpdbt, 0, sizeof(fiddbt));
1666 tmpdbt.data = tmpdbp->fileid;
1667 tmpdbt.size = DB_FILE_ID_LEN;
1668 DB_INIT_DBT(namedbt, old, strlen(old) + 1);
1669 if ((t_ret = __fop_file_remove_log(env,
1670 parent, &lsn, dflags, &fiddbt, &tmpdbt, &namedbt,
1671 (u_int32_t)DB_APP_DATA, child_txnid)) != 0 && ret == 0)
1672 ret = t_ret;
1673
1674 /* This is a delayed delete of the dummy file. */
1675 if ((ret = __db_appname(env,
1676 DB_APP_DATA, old, &dbp->dirname, &realold)) != 0)
1677 goto err;
1678
1679 if ((ret = __txn_remevent(env, parent, realold, NULL, 0)) != 0)
1680 goto err;
1681
1682 err: if (txn != NULL) /* Ret must already be set, so void abort. */
1683 (void)__txn_abort(txn);
1684
1685 (void)__ENV_LPUT(env, elock);
1686
1687 if (fhp != NULL &&
1688 (t_ret = __os_closehandle(env, fhp)) != 0 && ret == 0)
1689 ret = t_ret;
1690
1691 if (realnew != NULL)
1692 __os_free(env, realnew);
1693 if (realold != NULL)
1694 __os_free(env, realold);
1695 return (ret);
1696 }
1697
1698 static int
__fop_inmem_swap(olddbp,backdbp,txn,old,new,back,locker)1699 __fop_inmem_swap(olddbp, backdbp, txn, old, new, back, locker)
1700 DB *olddbp, *backdbp;
1701 DB_TXN *txn;
1702 const char *old, *new, *back;
1703 DB_LOCKER *locker;
1704 {
1705 DB *tmpdbp;
1706 DBT fid_dbt, n1_dbt, n2_dbt;
1707 DB_LOCK elock;
1708 DB_LSN lsn;
1709 DB_TXN *parent;
1710 ENV *env;
1711 int ret, t_ret;
1712
1713 env = olddbp->env;
1714 parent = txn->parent;
1715 retry: LOCK_INIT(elock);
1716 if ((ret = __db_create_internal(&tmpdbp, env, 0)) != 0)
1717 return (ret);
1718 MAKE_INMEM(tmpdbp);
1719
1720 GET_ENVLOCK(env, locker, &elock);
1721 if ((ret = __env_mpool(tmpdbp, new, 0)) == 0) {
1722 /*
1723 * It is possible that the only reason this database exists is
1724 * because we've done a previous rename of it and we have
1725 * left a placeholder here. We need to check for that case
1726 * and allow this rename to succeed if that's the case.
1727 */
1728
1729 if ((ret = __fop_inmem_read_meta(
1730 tmpdbp, txn, new, 0, DB_CHK_META)) != 0) {
1731 ret = EEXIST;
1732 goto err;
1733 }
1734
1735 /*
1736 * Now, try to acquire the handle lock. If it's from our txn,
1737 * then we'll get the lock. If it's not, then someone else has
1738 * it locked. See the comments in __fop_ondisk_swap for
1739 * details.
1740 */
1741 if ((ret = __fop_lock_handle(env,
1742 tmpdbp, locker, DB_LOCK_READ, NULL, DB_LOCK_NOWAIT)) != 0) {
1743 /*
1744 * Someone holds a writelock. Try for the WRITELOCK
1745 * and after we get it, retry.
1746 */
1747 if ((ret = __fop_lock_handle(env, tmpdbp,
1748 locker, DB_LOCK_WRITE, &elock, 0)) != 0)
1749 goto err;
1750
1751 /* We have the write lock; release it and start over. */
1752 (void)__lock_put(env, &tmpdbp->handle_lock);
1753 (void)__db_close(tmpdbp, NULL, DB_NOSYNC);
1754 (void)__ENV_LPUT(env, elock);
1755 goto retry;
1756 } else {
1757 (void)__lock_put(env, &tmpdbp->handle_lock);
1758 if (!F_ISSET(tmpdbp, DB_AM_IN_RENAME))
1759 ret = EEXIST;
1760 }
1761 if (ret != 0)
1762 goto err;
1763 }
1764
1765 /* Log the renames. */
1766 if (LOGGING_ON(env)
1767 #ifndef DEBUG_WOP
1768 && txn != NULL
1769 #endif
1770 ) {
1771 /* Rename old to new. */
1772 DB_INIT_DBT(fid_dbt, olddbp->fileid, DB_FILE_ID_LEN);
1773 DB_INIT_DBT(n1_dbt, old, strlen(old) + 1);
1774 DB_INIT_DBT(n2_dbt, new, strlen(new) + 1);
1775 if ((ret = __crdel_inmem_rename_log(
1776 env, txn, &lsn, 0, &n1_dbt, &n2_dbt, &fid_dbt)) != 0)
1777 goto err;
1778
1779 /* Rename back to old */
1780 fid_dbt.data = backdbp->fileid;
1781 DB_SET_DBT(n2_dbt, back, strlen(back) + 1);
1782 if ((ret = __crdel_inmem_rename_log(
1783 env, txn, &lsn, 0, &n2_dbt, &n1_dbt, &fid_dbt)) != 0)
1784 goto err;
1785 }
1786
1787 /*
1788 * While we have the namespace locked, do the renames and then
1789 * swap for the handle lock. If we ran into a file in the midst
1790 * of rename, then we need to delete it first, else nameop is
1791 * going to consider it an error.
1792 */
1793 if (F_ISSET(tmpdbp, DB_AM_IN_RENAME)) {
1794 if ((ret = __memp_nameop(env,
1795 tmpdbp->fileid, NULL, new, NULL, 1)) != 0)
1796 goto err;
1797 __txn_remrem(env, parent, new);
1798 }
1799
1800 if ((ret = __memp_nameop(
1801 env, olddbp->fileid, new, old, new, 1)) != 0)
1802 goto err;
1803 if ((ret = __memp_nameop(
1804 env, backdbp->fileid, old, back, old, 1)) != 0)
1805 goto err;
1806
1807 if ((ret = __fop_lock_handle(env,
1808 tmpdbp, locker, DB_LOCK_WRITE, &elock, 0)) != 0)
1809 goto err;
1810
1811 /*
1812 * We just acquired a transactional lock on the tmp handle.
1813 * We need to null out the tmp handle's lock so that it
1814 * doesn't create problems for us in the close path.
1815 */
1816 LOCK_INIT(tmpdbp->handle_lock);
1817
1818 DB_ASSERT(env, txn != NULL);
1819
1820 /* Commit the child. */
1821 ret = __txn_commit(txn, 0);
1822 txn = NULL;
1823
1824 if ((ret = __db_inmem_remove(backdbp, parent, old)) != 0)
1825 goto err;
1826
1827 err: (void)__ENV_LPUT(env, elock);
1828
1829 if (txn != NULL)
1830 (void)__txn_abort(txn);
1831
1832 if ((t_ret = __db_close(tmpdbp, NULL, 0)) != 0 && ret == 0)
1833 ret = t_ret;
1834
1835 return (ret);
1836 }
1837