/*-
 * See the file LICENSE for redistribution information.
 *
 * Copyright (c) 1996, 1997, 1998, 1999
 *	Sleepycat Software.  All rights reserved.
 */

8 #include "db_config.h"
9
10 #ifndef lint
11 static const char sccsid[] = "@(#)env_region.c 11.7 (Sleepycat) 11/12/99";
12 #endif /* not lint */
13
14 #ifndef NO_SYSTEM_INCLUDES
15 #include <sys/types.h>
16
17 #include <ctype.h>
18 #include <errno.h>
19 #include <string.h>
20
21 #ifndef _MSC_VER /* _WIN32 */
22 #include <unistd.h>
23
24 #endif
25
26 #endif /* !NO_SYSTEM_INCLUDES */
27
28 #include "db_int.h"
29
30 static int CDB___db_des_destroy __P((DB_ENV *, REGION *));
31 static int CDB___db_des_get __P((DB_ENV *, REGINFO *, REGINFO *, REGION **));
32 static int CDB___db_e_remfile __P((DB_ENV *));
33 static int CDB___db_faultmem __P((void *, size_t, int));
34
35 /*
36 * CDB___db_e_attach
37 * Join/create the environment
38 *
39 * PUBLIC: int CDB___db_e_attach __P((DB_ENV *));
40 */
41 int
CDB___db_e_attach(dbenv)42 CDB___db_e_attach(dbenv)
43 DB_ENV *dbenv;
44 {
45 REGENV *renv;
46 REGENV_REF ref;
47 REGINFO *infop;
48 REGION *rp, tregion;
49 size_t size;
50 ssize_t nrw;
51 u_int32_t mbytes, bytes;
52 int retry_cnt, ret, segid;
53 char buf[sizeof(DB_REGION_FMT) + 20];
54
55 #if !defined(HAVE_MUTEX_THREADS)
56 /*
57 * !!!
58 * If we don't have spinlocks, we need a file descriptor for fcntl(2)
59 * locking. We use the file handle from the REGENV file for this
60 * purpose.
61 *
62 * Since we may be using shared memory regions, e.g., shmget(2), and
63 * not a mapped-in regular file, the backing file may be only a few
64 * bytes in length. So, this depends on the ability to call fcntl to
65 * lock file offsets much larger than the actual physical file. I
66 * think that's safe -- besides, very few systems actually need this
67 * kind of support, SunOS is the only one still in wide use of which
68 * I'm aware.
69 *
70 * The error case is if an application lacks spinlocks and wants to be
71 * threaded. That doesn't work because fcntl may lock the underlying
72 * process, including all its threads.
73 */
74 if (F_ISSET(dbenv, DB_ENV_THREAD)) {
75 CDB___db_err(dbenv,
76 "architecture lacks fast mutexes: applications cannot be threaded");
77 return (EINVAL);
78 }
79 #endif
80
81 /* Initialization */
82 retry_cnt = 0;
83
84 /* Repeated initialization. */
85 loop: renv = NULL;
86
87 /* Set up the DB_ENV's REG_INFO structure. */
88 if ((ret = CDB___os_calloc(1, sizeof(REGINFO), &infop)) != 0)
89 return (ret);
90 infop->id = REG_ID_ENV;
91 infop->mode = dbenv->db_mode;
92 if (F_ISSET(dbenv, DB_ENV_CREATE))
93 F_SET(infop, REGION_CREATE_OK);
94
95 /*
96 * We have to single-thread the creation of the REGENV region. Once
97 * it exists, we can do locking using locks in the region, but until
98 * then we have to be the only player in the game.
99 *
100 * If this is a private environment, we are only called once and there
101 * are no possible race conditions.
102 *
103 * If this is a public environment, we use the filesystem to ensure
104 * the creation of the environment file is single-threaded.
105 */
106 if (F_ISSET(dbenv, DB_ENV_PRIVATE))
107 goto creation;
108
109 /* Build the region name. */
110 (void)snprintf(buf, sizeof(buf), "%s", DB_REGION_ENV);
111 if ((ret = CDB___db_appname(dbenv,
112 DB_APP_NONE, NULL, buf, 0, NULL, &infop->name)) != 0)
113 goto err;
114
115 /*
116 * Try to create the file, if we have the authority. We have to ensure
117 * that multiple threads/processes attempting to simultaneously create
118 * the file are properly ordered. Open using the O_CREAT and O_EXCL
119 * flags so that multiple attempts to create the region will return
120 * failure in all but one. POSIX 1003.1 requires that EEXIST be the
121 * errno return value -- I sure hope they're right.
122 */
123 if (F_ISSET(dbenv, DB_ENV_CREATE)) {
124 if ((ret = CDB___os_open(infop->name, DB_OSO_CREATE | DB_OSO_EXCL,
125 dbenv->db_mode, dbenv->lockfhp)) == 0)
126 goto creation;
127 if (ret != EEXIST) {
128 CDB___db_err(dbenv,
129 "%s: %s", infop->name, CDB_db_strerror(ret));
130 goto err;
131 }
132 }
133
134 /*
135 * If we couldn't create the file, try and open it. (If that fails,
136 * we're done.)
137 */
138 if ((ret =
139 CDB___os_open(infop->name, 0, dbenv->db_mode, dbenv->lockfhp)) != 0)
140 goto err;
141
142 /*
143 * !!!
144 * The region may be in system memory not backed by the filesystem
145 * (more specifically, not backed by this file), and we're joining
146 * it. In that case, the process that created it will have written
147 * out a REGENV_REF structure as its only contents. We read that
148 * structure before we do anything further, e.g., we can't just map
149 * that file in and then figure out what's going on.
150 *
151 * All of this noise is because some systems don't have a coherent VM
152 * and buffer cache, and what's worse, when you mix operations on the
153 * VM and buffer cache, half the time you hang the system.
154 *
155 * If the file is the size of an REGENV_REF structure, then we know
156 * the real region is in some other memory. (The only way you get a
157 * file that size is to deliberately write it, as it's smaller than
158 * any possible disk sector created by writing a file or mapping the
159 * file into memory.) In which case, retrieve the structure from the
160 * file and use it to acquire the referenced memory.
161 *
162 * If the structure is larger than a REGENV_REF structure, then this
163 * file is backing the shared memory region, and we just map it into
164 * memory.
165 *
166 * And yes, this makes me want to take somebody and kill them. (I
167 * digress -- but you have no freakin' idea. This is unbelievably
168 * stupid and gross, and I've probably spent six months of my life,
169 * now, trying to make different versions of it work.)
170 */
171 if ((ret = CDB___os_ioinfo(infop->name,
172 dbenv->lockfhp, &mbytes, &bytes, NULL)) != 0) {
173 CDB___db_err(dbenv, "%s: %s", infop->name, CDB_db_strerror(ret));
174 goto err;
175 }
176
177 /*
178 * !!!
179 * A size_t is OK -- regions get mapped into memory, and so can't
180 * be larger than a size_t.
181 */
182 size = mbytes * MEGABYTE + bytes;
183
184 /*
185 * If the size is 0 or less than the size of a REGENV_REF structure,
186 * the region (or, possibly, the REGENV_REF structure) has not been
187 * fully written. Wait awhile and try again.
188 *
189 * Otherwise, if the size is the size of a REGENV_REF structure,
190 * read it into memory and use it as a reference to the real region.
191 */
192 segid = INVALID_REGION_SEGID;
193 if (size <= sizeof(ref)) {
194 if (size != sizeof(ref))
195 goto retry;
196
197 if ((ret = CDB___os_read(dbenv->lockfhp, &ref,
198 sizeof(ref), &nrw)) != 0 || nrw < (ssize_t)sizeof(ref)) {
199 if (ret == 0)
200 ret = EIO;
201 CDB___db_err(dbenv,
202 "%s: unable to read system-memory information from: %s",
203 infop->name, CDB_db_strerror(ret));
204 goto err;
205 }
206 size = ref.size;
207 segid = ref.segid;
208
209 F_SET(dbenv, DB_ENV_SYSTEM_MEM);
210 }
211
212 /*
213 * If not doing thread locking, we need to save the file handle for
214 * fcntl(2) locking. Otherwise, discard the handle, we no longer
215 * need it, and the less contact between the buffer cache and the VM,
216 * the better.
217 */
218 #ifdef HAVE_MUTEX_THREADS
219 CDB___os_closehandle(dbenv->lockfhp);
220 #endif
221
222 /* Call the region join routine to acquire the region. */
223 memset(&tregion, 0, sizeof(tregion));
224 tregion.size = size;
225 tregion.segid = segid;
226 if ((ret = CDB___os_r_attach(dbenv, infop, &tregion)) != 0)
227 goto err;
228
229 /*
230 * The environment's REGENV structure has to live at offset 0 instead
231 * of the usual shalloc information. Set the primary reference and
232 * correct the "addr" value to reference the shalloc region. Note,
233 * this means that all of our offsets (R_ADDR/R_OFFSET) get shifted
234 * as well, but that should be fine.
235 */
236 infop->primary = R_ADDR(infop, 0);
237 infop->addr = (u_int8_t *)infop->addr + sizeof(REGENV);
238
239 /*
240 * Check if the environment has had a catastrophic failure.
241 *
242 * Check the magic number to ensure the region is initialized. If the
243 * magic number isn't set, the lock may not have been initialized, and
244 * an attempt to use it could lead to random behavior.
245 *
246 * The panic and magic values aren't protected by any lock, so we never
247 * use them in any check that's more complex than set/not-set.
248 *
249 * !!!
250 * I'd rather play permissions games using the underlying file, but I
251 * can't because Windows/NT filesystems won't open files mode 0.
252 */
253 renv = infop->primary;
254 if (renv->panic) {
255 ret = CDB___db_panic_msg(dbenv);
256 goto err;
257 }
258 if (renv->magic != DB_REGION_MAGIC)
259 goto retry;
260
261 /* Lock the environment. */
262 MUTEX_LOCK(&renv->mutex, dbenv->lockfhp);
263
264 /*
265 * Finally! We own the environment now. Repeat the panic check, it's
266 * possible that it was set while we waited for the lock.
267 */
268 if (renv->panic) {
269 ret = CDB___db_panic_msg(dbenv);
270 goto err_unlock;
271 }
272
273 /*
274 * Get a reference to the underlying REGION information for this
275 * environment.
276 */
277 if ((ret = CDB___db_des_get(dbenv, infop, infop, &rp)) != 0)
278 goto err_unlock;
279 if (rp == NULL) {
280 CDB___db_err(dbenv,
281 "%s: unable to find environment REGION", infop->name);
282 ret = EINVAL;
283 goto err_unlock;
284 }
285 infop->rp = rp;
286
287 /*
288 * There's still a possibility for inconsistent data. When we acquired
289 * the size of the region and attached to it, it might have still been
290 * growing as part of its creation. We can detect this by checking the
291 * size we originally found against the region's current size. (The
292 * region's current size has to be final, the creator finished growing
293 * it before releasing the environment for us to lock.)
294 */
295 if (rp->size != size) {
296 err_unlock: MUTEX_UNLOCK(&renv->mutex);
297 goto retry;
298 }
299
300 /* Increment the reference count. */
301 ++renv->refcnt;
302
303 /* Discard our lock. */
304 MUTEX_UNLOCK(&renv->mutex);
305
306 /*
307 * Fault the pages into memory. Note, do this AFTER releasing the
308 * lock, because we're only reading the pages, not writing them.
309 */
310 (void)CDB___db_faultmem(infop->primary, rp->size, 0);
311
312 /* Everything looks good, we're done. */
313 dbenv->reginfo = infop;
314 return (0);
315
316 creation:
317 /* Create the environment region. */
318 F_SET(infop, REGION_CREATE);
319
320 /*
321 * Allocate room for 50 REGION structures plus overhead (we're going
322 * to use this space for last-ditch allocation requests), although we
323 * should never need anything close to that.
324 */
325 memset(&tregion, 0, sizeof(tregion));
326 tregion.size = 50 * sizeof(REGION) + 50 * sizeof(MUTEX) + 2048;
327 tregion.segid = INVALID_REGION_SEGID;
328 if ((ret = CDB___os_r_attach(dbenv, infop, &tregion)) != 0)
329 goto err;
330
331 /*
332 * Fault the pages into memory. Note, do this BEFORE we initialize
333 * anything, because we're writing the pages, not just reading them.
334 */
335 (void)CDB___db_faultmem(infop->addr, tregion.size, 1);
336
337 /*
338 * The first object in the region is the REGENV structure. This is
339 * different from the other regions, and, from everything else in
340 * this region, where all objects are allocated from the pool, i.e.,
341 * there aren't any fixed locations. The remaining space is made
342 * available for later allocation.
343 *
344 * The allocation space must be size_t aligned, because that's what
345 * the initialization routine is going to store there. To make sure
346 * that happens, the REGENV structure was padded with a final size_t.
347 * No other region needs to worry about it because all of them treat
348 * the entire region as allocation space.
349 *
350 * Set the primary reference and correct the "addr" value to reference
351 * the shalloc region. Note, this requires that we "uncorrect" it at
352 * region detach, and that all of our offsets (R_ADDR/R_OFFSET) will be
353 * shifted as well, but that should be fine.
354 */
355 infop->primary = R_ADDR(infop, 0);
356 infop->addr = (u_int8_t *)infop->addr + sizeof(REGENV);
357 CDB___db_shalloc_init(infop->addr, tregion.size - sizeof(REGENV));
358
359 /*
360 * Initialize the rest of the REGENV structure, except for the magic
361 * number which validates the file/environment.
362 */
363 renv = infop->primary;
364 renv->panic = 0;
365 CDB_db_version(&renv->majver, &renv->minver, &renv->patch);
366 SH_LIST_INIT(&renv->regionq);
367 renv->refcnt = 1;
368
369 /*
370 * Lock the environment.
371 *
372 * Check the lock call return. This is the first lock we initialize
373 * and acquire, and we have to know if it fails. (It CAN fail, e.g.,
374 * SunOS, when using fcntl(2) for locking and using an in-memory
375 * filesystem as the database home. But you knew that, I'm sure -- it
376 * probably wasn't even worth mentioning.)
377 */
378 if ((ret =
379 __db_mutex_init(dbenv, &renv->mutex, DB_FCNTL_OFF_GEN, 0)) != 0) {
380 CDB___db_err(dbenv, "%s: unable to initialize environment lock: %s",
381 infop->name, CDB_db_strerror(ret));
382 goto err;
383 }
384
385 if (!F_ISSET(&renv->mutex, MUTEX_IGNORE) &&
386 (ret = __db_mutex_lock(&renv->mutex, dbenv->lockfhp)) != 0) {
387 CDB___db_err(dbenv, "%s: unable to acquire environment lock: %s",
388 infop->name, CDB_db_strerror(ret));
389 goto err;
390 }
391
392 /*
393 * Get the underlying REGION structure for this environment. Note,
394 * we created the underlying OS region before we acquired the REGION
395 * structure, which is backwards from the normal procedure. Update
396 * the REGION structure.
397 */
398 if ((ret = CDB___db_des_get(dbenv, infop, infop, &rp)) != 0)
399 goto err;
400 infop->rp = rp;
401 rp->size = tregion.size;
402 rp->segid = tregion.segid;
403
404 /*
405 * !!!
406 * If we create an environment where regions are public and in system
407 * memory, we have to inform processes joining the environment how to
408 * attach to the shared memory segment. So, we write the shared memory
409 * identifier into the file, to be read by those other processes.
410 *
411 * XXX
412 * This is really OS-layer information, but I can't see any easy way
413 * to move it down there without passing down information that it has
414 * no right to know, e.g., that this is the one-and-only REGENV region
415 * and not some other random region.
416 */
417 if (tregion.segid != INVALID_REGION_SEGID) {
418 ref.size = tregion.size;
419 ref.segid = tregion.segid;
420 if ((ret = CDB___os_write(dbenv->lockfhp,
421 &ref, sizeof(ref), &nrw)) != 0 || nrw != sizeof(ref)) {
422 CDB___db_err(dbenv,
423 "%s: unable to write out public environment ID: %s",
424 infop->name, CDB_db_strerror(ret));
425 goto err;
426 }
427 }
428
429 /*
430 * If not doing thread locking, we need to save the file handle for
431 * fcntl(2) locking. Otherwise, discard the handle, we no longer
432 * need it, and the less contact between the buffer cache and the VM,
433 * the better.
434 */
435 #if defined(HAVE_MUTEX_THREADS)
436 if (F_ISSET(dbenv->lockfhp, DB_FH_VALID))
437 CDB___os_closehandle(dbenv->lockfhp);
438 #endif
439
440 /* Validate the file. */
441 renv->magic = DB_REGION_MAGIC;
442
443 /* Discard our lock. */
444 MUTEX_UNLOCK(&renv->mutex);
445
446 /* Everything looks good, we're done. */
447 dbenv->reginfo = infop;
448 return (0);
449
450 err:
451 retry: /* Close any open file handle. */
452 if (F_ISSET(dbenv->lockfhp, DB_FH_VALID))
453 (void)CDB___os_closehandle(dbenv->lockfhp);
454
455 /*
456 * If we joined or created the region, detach from it. If we created
457 * it, destroy it. Note, there's a path in the above code where we're
458 * using a temporary REGION structure because we haven't yet allocated
459 * the real one. In that case the region address (addr) will be filled
460 * in, but the REGION pointer (rp) won't. Fix it.
461 */
462 if (infop->addr != NULL) {
463 if (infop->rp == NULL)
464 infop->rp = &tregion;
465
466 /* Reset the addr value that we "corrected" above. */
467 infop->addr = infop->primary;
468 (void)CDB___os_r_detach(dbenv,
469 infop, F_ISSET(infop, REGION_CREATE));
470 }
471
472 /* Free the allocated name and/or REGINFO structure. */
473 if (infop->name != NULL)
474 CDB___os_freestr(infop->name);
475 CDB___os_free(infop, sizeof(REGINFO));
476
477 /* If we had a temporary error, wait awhile and try again. */
478 if (ret == 0) {
479 if (++retry_cnt > 3) {
480 CDB___db_err(dbenv, "unable to join the environment");
481 ret = EAGAIN;
482 } else {
483 CDB___os_sleep(retry_cnt * 3, 0);
484 goto loop;
485 }
486 }
487
488 return (ret);
489 }
490
491 /*
492 * CDB___db_e_detach --
493 * Detach from the environment.
494 *
495 * PUBLIC: int CDB___db_e_detach __P((DB_ENV *, int));
496 */
497 int
CDB___db_e_detach(dbenv,destroy)498 CDB___db_e_detach(dbenv, destroy)
499 DB_ENV *dbenv;
500 int destroy;
501 {
502 REGENV *renv;
503 REGINFO *infop;
504
505 infop = dbenv->reginfo;
506 renv = infop->primary;
507
508 /* Lock the environment. */
509 MUTEX_LOCK(&renv->mutex, dbenv->lockfhp);
510
511 /* Decrement the reference count. */
512 if (renv->refcnt == 0) {
513 CDB___db_err(dbenv,
514 "region %lu (environment): reference count went negative",
515 infop->rp->id);
516 } else
517 --renv->refcnt;
518
519 /* Release the lock. */
520 MUTEX_UNLOCK(&renv->mutex);
521
522 /* Close the locking file handle. */
523 if (F_ISSET(dbenv->lockfhp, DB_FH_VALID))
524 (void)CDB___os_closehandle(dbenv->lockfhp);
525
526 /* Reset the addr value that we "corrected" above. */
527 infop->addr = infop->primary;
528
529 /*
530 * Release the region, and kill our reference.
531 *
532 * We set the DBENV->reginfo field to NULL here and discard its memory.
533 * DBENV->remove calls CDB___dbenv_remove to do the region remove, and
534 * CDB___dbenv_remove attached and then detaches from the region. We don't
535 * want to return to DBENV->remove with a non-NULL DBENV->reginfo field
536 * because it will attempt to detach again as part of its cleanup.
537 */
538 (void)CDB___os_r_detach(dbenv, infop, destroy);
539
540 if (infop->name != NULL)
541 CDB___os_free(infop->name, 0);
542 CDB___os_free(dbenv->reginfo, sizeof(REGINFO));
543 dbenv->reginfo = NULL;
544
545 return (0);
546 }
547
548 /*
549 * CDB___db_e_remove --
550 * Discard an environment if it's not in use.
551 *
552 * PUBLIC: int CDB___db_e_remove __P((DB_ENV *, int));
553 */
554 int
CDB___db_e_remove(dbenv,force)555 CDB___db_e_remove(dbenv, force)
556 DB_ENV *dbenv;
557 int force;
558 {
559 REGENV *renv;
560 REGINFO *infop, reginfo;
561 REGION *rp;
562 int ret, saved_value;
563
564 /*
565 * This routine has to walk a nasty line between not looking into
566 * the environment (which may be corrupted after an app or system
567 * crash), and removing everything that needs removing. What we
568 * do is:
569 * 1. Connect to the environment (so it better be OK).
570 * 2. If the environment is in use (reference count is non-zero),
571 * return EBUSY.
572 * 3. Overwrite the magic number so that any threads of control
573 * attempting to connect will backoff and retry.
574 * 4. Walk the list of regions. Connect to each region and then
575 * disconnect with the destroy flag set. This shouldn't cause
576 * any problems, even if the region is corrupted, because we
577 * should never be looking inside the region.
578 * 5. Walk the list of files in the directory, unlinking any
579 * files that match a region name. Unlink the environment
580 * file last.
581 *
582 * If the force flag is set, we do not acquire any locks during this
583 * process.
584 */
585 saved_value = DB_GLOBAL(db_mutexlocks);
586 if (force)
587 DB_GLOBAL(db_mutexlocks) = 0;
588
589 /* Join the environment. */
590 if ((ret = CDB___db_e_attach(dbenv)) != 0) {
591 /*
592 * If we can't join it, we assume that's because it doesn't
593 * exist. It would be better to know why we failed, but it
594 * probably isn't important.
595 */
596 ret = 0;
597 if (force)
598 goto remfiles;
599 goto err;
600 }
601
602 infop = dbenv->reginfo;
603 renv = infop->primary;
604
605 /* Lock the environment. */
606 MUTEX_LOCK(&renv->mutex, dbenv->lockfhp);
607
608 /* If it's in use, we're done. */
609 if (renv->refcnt == 1 || force) {
610 /*
611 * Set the panic flag and overwrite the magic number.
612 *
613 * !!!
614 * From this point on, there's no going back, we pretty
615 * much ignore errors, and just whack on whatever we can.
616 */
617 renv->panic = 1;
618 renv->magic = 0;
619
620 /*
621 * Unlock the environment. We should no longer need the lock
622 * because we've poisoned the pool, but we can't continue to
623 * hold it either, because other routines may want it.
624 */
625 MUTEX_UNLOCK(&renv->mutex);
626
627 /*
628 * Attach to each sub-region and destroy it.
629 *
630 * !!!
631 * The REGION_CREATE_OK flag is set for Windows/95 -- regions
632 * are zero'd out when the last reference to the region goes
633 * away, in which case the underlying OS region code requires
634 * callers be prepared to create the region in order to join it.
635 */
636 memset(®info, 0, sizeof(reginfo));
637 restart: for (rp = SH_LIST_FIRST(&renv->regionq, __db_region);
638 rp != NULL; rp = SH_LIST_NEXT(rp, q, __db_region)) {
639 if (rp->id == REG_ID_ENV)
640 continue;
641
642 reginfo.id = rp->id;
643 reginfo.flags = REGION_CREATE_OK;
644 if (CDB___db_r_attach(dbenv, ®info, 0) == 0) {
645 R_UNLOCK(dbenv, ®info);
646 (void)CDB___db_r_detach(dbenv, ®info, 1);
647 }
648 goto restart;
649 }
650
651 /* Destroy the environment's region. */
652 (void)CDB___db_e_detach(dbenv, 1);
653
654 /* Discard the physical files. */
655 remfiles: (void)CDB___db_e_remfile(dbenv);
656 } else {
657 /* Unlock the environment. */
658 MUTEX_UNLOCK(&renv->mutex);
659
660 /* Discard the environment. */
661 (void)CDB___db_e_detach(dbenv, 0);
662
663 ret = EBUSY;
664 }
665
666 err: if (force)
667 DB_GLOBAL(db_mutexlocks) = saved_value;
668
669 return (ret);
670 }
671
672 /*
673 * CDB___db_e_remfile --
674 * Discard any region files in the filesystem.
675 */
676 static int
CDB___db_e_remfile(dbenv)677 CDB___db_e_remfile(dbenv)
678 DB_ENV *dbenv;
679 {
680 static char *old_region_names[] = {
681 "__db_lock.share",
682 "__db_log.share",
683 "__db_mpool.share",
684 "__db_txn.share",
685 NULL,
686 };
687 int cnt, fcnt, lastrm, ret;
688 u_int8_t saved_byte;
689 const char *dir;
690 char *p, **names, *path, buf[sizeof(DB_REGION_FMT) + 20];
691
692 /* Get the full path of a file in the environment. */
693 (void)snprintf(buf, sizeof(buf), "%s", DB_REGION_ENV);
694 if ((ret =
695 CDB___db_appname(dbenv, DB_APP_NONE, NULL, buf, 0, NULL, &path)) != 0)
696 return (ret);
697
698 /* Get the parent directory for the environment. */
699 if ((p = CDB___db_rpath(path)) == NULL) {
700 p = path;
701 saved_byte = *p;
702
703 dir = PATH_DOT;
704 } else {
705 saved_byte = *p;
706 *p = '\0';
707
708 dir = path;
709 }
710
711 /* Get the list of file names. */
712 ret = CDB___os_dirlist(dir, &names, &fcnt);
713
714 /* Restore the path, and free it. */
715 *p = saved_byte;
716 CDB___os_freestr(path);
717
718 if (ret != 0) {
719 CDB___db_err(dbenv, "%s: %s", dir, CDB_db_strerror(ret));
720 return (ret);
721 }
722
723 /*
724 * Search for valid region names, and remove them. We remove the
725 * environment region last, because it's the key to this whole mess.
726 */
727 for (lastrm = -1, cnt = fcnt; --cnt >= 0;) {
728 if (strlen(names[cnt]) != DB_REGION_NAME_LENGTH ||
729 memcmp(names[cnt], DB_REGION_FMT, DB_REGION_NAME_NUM) != 0)
730 continue;
731 if (strcmp(names[cnt], DB_REGION_ENV) == 0) {
732 lastrm = cnt;
733 continue;
734 }
735 for (p = names[cnt] + DB_REGION_NAME_NUM;
736 *p != '\0' && isdigit((int)*p); ++p)
737 ;
738 if (*p != '\0')
739 continue;
740
741 if (CDB___db_appname(dbenv,
742 DB_APP_NONE, NULL, names[cnt], 0, NULL, &path) == 0) {
743 (void)CDB___os_unlink(path);
744 CDB___os_freestr(path);
745 }
746 }
747
748 if (lastrm != -1)
749 if (CDB___db_appname(dbenv,
750 DB_APP_NONE, NULL, names[lastrm], 0, NULL, &path) == 0) {
751 (void)CDB___os_unlink(path);
752 CDB___os_freestr(path);
753 }
754 CDB___os_dirfree(names, fcnt);
755
756 /*
757 * !!!
758 * Backward compatibility -- remove region files from releases
759 * before 2.8.XX.
760 */
761 for (names = (char **)old_region_names; *names != NULL; ++names)
762 if (CDB___db_appname(dbenv,
763 DB_APP_NONE, NULL, *names, 0, NULL, &path) == 0) {
764 (void)CDB___os_unlink(path);
765 CDB___os_freestr(path);
766 }
767
768 return (0);
769 }
770
771 /*
772 * CDB___db_e_stat
773 * Statistics for the environment.
774 *
775 * PUBLIC: int CDB___db_e_stat __P((DB_ENV *, REGENV *, REGION *, int *));
776 */
777 int
CDB___db_e_stat(dbenv,arg_renv,arg_regions,arg_regions_cnt)778 CDB___db_e_stat(dbenv, arg_renv, arg_regions, arg_regions_cnt)
779 DB_ENV *dbenv;
780 REGENV *arg_renv;
781 REGION *arg_regions;
782 int *arg_regions_cnt;
783 {
784 REGENV *renv;
785 REGINFO *infop;
786 REGION *rp;
787 int n;
788
789 infop = dbenv->reginfo;
790 renv = infop->primary;
791 rp = infop->rp;
792
793 /* Lock the environment. */
794 MUTEX_LOCK(&rp->mutex, dbenv->lockfhp);
795
796 *arg_renv = *renv;
797
798 for (n = 0, rp = SH_LIST_FIRST(&renv->regionq, __db_region);
799 n < *arg_regions_cnt && rp != NULL;
800 ++n, rp = SH_LIST_NEXT(rp, q, __db_region))
801 arg_regions[n] = *rp;
802
803 /* Release the lock. */
804 rp = infop->rp;
805 MUTEX_UNLOCK(&rp->mutex);
806
807 *arg_regions_cnt = n == 0 ? n : n - 1;
808
809 return (0);
810 }
811
812 /*
813 * CDB___db_r_attach
814 * Join/create a region.
815 *
816 * PUBLIC: int CDB___db_r_attach __P((DB_ENV *, REGINFO *, size_t));
817 */
818 int
CDB___db_r_attach(dbenv,infop,size)819 CDB___db_r_attach(dbenv, infop, size)
820 DB_ENV *dbenv;
821 REGINFO *infop;
822 size_t size;
823 {
824 REGENV *renv;
825 REGION *rp;
826 int ret;
827 char buf[sizeof(DB_REGION_FMT) + 20];
828
829 renv = ((REGINFO *)dbenv->reginfo)->primary;
830 F_CLR(infop, REGION_CREATE);
831
832 /* Lock the environment. */
833 MUTEX_LOCK(&renv->mutex, dbenv->lockfhp);
834
835 /* Find or create a REGION structure for this region. */
836 if ((ret = CDB___db_des_get(dbenv, dbenv->reginfo, infop, &rp)) != 0) {
837 MUTEX_UNLOCK(&renv->mutex);
838 return (ret);
839 }
840 infop->rp = rp;
841 infop->id = rp->id;
842
843 /* If we're creating the region, set the desired size. */
844 if (F_ISSET(infop, REGION_CREATE))
845 rp->size = size;
846
847 /* Join/create the underlying region. */
848 (void)snprintf(buf, sizeof(buf), DB_REGION_FMT, infop->id);
849 if ((ret = CDB___db_appname(dbenv,
850 DB_APP_NONE, NULL, buf, 0, NULL, &infop->name)) != 0)
851 goto err;
852 if ((ret = CDB___os_r_attach(dbenv, infop, rp)) != 0)
853 goto err;
854
855 /*
856 * Fault the pages into memory. Note, do this BEFORE we initialize
857 * anything because we're writing pages in created regions, not just
858 * reading them.
859 */
860 (void)CDB___db_faultmem(infop->addr,
861 rp->size, F_ISSET(infop, REGION_CREATE));
862
863 /*
864 * !!!
865 * The underlying layer may have just decided that we are going
866 * to create the region. There are various system issues that
867 * can result in a useless region that requires re-initialization.
868 *
869 * If we created the region, initialize it for allocation.
870 */
871 if (F_ISSET(infop, REGION_CREATE)) {
872 ((REGION *)(infop->addr))->magic = DB_REGION_MAGIC;
873
874 (void)CDB___db_shalloc_init(infop->addr, rp->size);
875 }
876
877 /*
878 * If the underlying REGION isn't the environment, acquire a lock
879 * for it and release our lock on the environment.
880 */
881 if (infop->id != REG_ID_ENV) {
882 MUTEX_LOCK(&rp->mutex, dbenv->lockfhp);
883 MUTEX_UNLOCK(&renv->mutex);
884 }
885
886 return (0);
887
888 /* Discard the underlying region. */
889 err: if (infop->addr != NULL)
890 (void)CDB___os_r_detach(dbenv,
891 infop, F_ISSET(infop, REGION_CREATE));
892 infop->rp = NULL;
893 infop->id = REG_ID_INVALID;
894
895 /* Discard the REGION structure if we created it. */
896 if (F_ISSET(infop, REGION_CREATE))
897 (void)CDB___db_des_destroy(dbenv, rp);
898
899 /* Release the environment lock. */
900 MUTEX_UNLOCK(&renv->mutex);
901
902 return (ret);
903 }
904
905 /*
906 * CDB___db_r_detach --
907 * Detach from a region.
908 *
909 * PUBLIC: int CDB___db_r_detach __P((DB_ENV *, REGINFO *, int));
910 */
911 int
CDB___db_r_detach(dbenv,infop,destroy)912 CDB___db_r_detach(dbenv, infop, destroy)
913 DB_ENV *dbenv;
914 REGINFO *infop;
915 int destroy;
916 {
917 REGENV *renv;
918 REGION *rp;
919 int ret, t_ret;
920
921 renv = ((REGINFO *)dbenv->reginfo)->primary;
922 rp = infop->rp;
923
924 /* Lock the environment. */
925 MUTEX_LOCK(&renv->mutex, dbenv->lockfhp);
926
927 /* Acquire the lock for the REGION. */
928 MUTEX_LOCK(&rp->mutex, dbenv->lockfhp);
929
930 /* Detach from the underlying OS region. */
931 ret = CDB___os_r_detach(dbenv, infop, destroy);
932
933 /* Release the REGION lock. */
934 MUTEX_UNLOCK(&rp->mutex);
935
936 /* If we destroyed the region, discard the REGION structure. */
937 if (destroy &&
938 ((t_ret = CDB___db_des_destroy(dbenv, rp)) != 0) && ret == 0)
939 ret = t_ret;
940
941 /* Release the environment lock. */
942 MUTEX_UNLOCK(&renv->mutex);
943
944 /* Destroy the structure. */
945 if (infop->name != NULL)
946 CDB___os_freestr(infop->name);
947
948 return (ret);
949 }
950
951 /*
952 * CDB___db_des_get --
953 * Return a reference to the shared information for a REGION,
954 * optionally creating a new entry.
955 */
956 static int
CDB___db_des_get(dbenv,env_infop,infop,rpp)957 CDB___db_des_get(dbenv, env_infop, infop, rpp)
958 DB_ENV *dbenv;
959 REGINFO *env_infop, *infop;
960 REGION **rpp;
961 {
962 REGENV *renv;
963 REGION *rp;
964 int maxid, ret;
965
966 /*
967 * !!!
968 * Called with the environment already locked.
969 */
970 *rpp = NULL;
971 renv = env_infop->primary;
972
973 maxid = REG_ID_ASSIGN;
974 for (rp = SH_LIST_FIRST(&renv->regionq, __db_region);
975 rp != NULL; rp = SH_LIST_NEXT(rp, q, __db_region)) {
976 if (rp->id == infop->id)
977 break;
978 if (rp->id > maxid)
979 maxid = rp->id;
980 }
981
982 /*
983 * If we didn't find a region, or we found one needing initialization,
984 * and we can't create the region, fail.
985 */
986 if (!F_ISSET(infop, REGION_CREATE_OK) &&
987 (rp == NULL || F_ISSET(rp, REG_DEAD)))
988 return (ENOENT);
989
990 /*
991 * If we didn't find a region, create and initialize a REGION structure
992 * for the caller. If id was set, use that value, otherwise we use the
993 * next available ID.
994 */
995 if (rp == NULL) {
996 if ((ret = CDB___db_shalloc(env_infop->addr,
997 sizeof(REGION), MUTEX_ALIGN, &rp)) != 0)
998 return (ret);
999
1000 /* Initialize the region. */
1001 memset(rp, 0, sizeof(*rp));
1002 if ((ret = __db_mutex_init(dbenv, &rp->mutex,
1003 R_OFFSET(env_infop, &rp->mutex) + DB_FCNTL_OFF_GEN,
1004 0)) != 0) {
1005 CDB___db_shalloc_free(env_infop->addr, rp);
1006 return (ret);
1007 }
1008 rp->segid = INVALID_REGION_SEGID;
1009 rp->id = infop->id == REG_ID_INVALID ? maxid + 1 : infop->id;
1010
1011 SH_LIST_INSERT_HEAD(&renv->regionq, rp, q, __db_region);
1012 F_SET(infop, REGION_CREATE);
1013 } else {
1014 /*
1015 * There is one race -- a caller created a region, was trying
1016 * to initialize it for general use, and failed somehow. We
1017 * leave the region around and tell each new caller that they
1018 * are creating it, because that's easier than dealing with
1019 * the races involved in removing it.
1020 */
1021 if (F_ISSET(rp, REG_DEAD)) {
1022 rp->primary = INVALID_ROFF;
1023
1024 F_CLR(rp, REG_DEAD);
1025 F_SET(infop, REGION_CREATE);
1026 }
1027 }
1028
1029 *rpp = rp;
1030 return (0);
1031 }
1032
1033 /*
1034 * CDB___db_des_destroy --
1035 * Destroy a reference to a REGION.
1036 */
1037 static int
CDB___db_des_destroy(dbenv,rp)1038 CDB___db_des_destroy(dbenv, rp)
1039 DB_ENV *dbenv;
1040 REGION *rp;
1041 {
1042 REGINFO *infop;
1043
1044 /*
1045 * !!!
1046 * Called with the environment already locked.
1047 */
1048 infop = dbenv->reginfo;
1049
1050 SH_LIST_REMOVE(rp, q, __db_region);
1051 CDB___db_shalloc_free(infop->addr, rp);
1052
1053 return (0);
1054 }
1055
1056 /*
1057 * CDB___db_faultmem --
1058 * Fault the region into memory.
1059 */
1060 static int
CDB___db_faultmem(addr,size,created)1061 CDB___db_faultmem(addr, size, created)
1062 void *addr;
1063 size_t size;
1064 int created;
1065 {
1066 int ret;
1067 u_int8_t *p, *t;
1068
1069 /*
1070 * It's sometimes significantly faster to page-fault in all of the
1071 * region's pages before we run the application, as we see nasty
1072 * side-effects when we page-fault while holding various locks, i.e.,
1073 * the lock takes a long time to acquire because of the underlying
1074 * page fault, and the other threads convoy behind the lock holder.
1075 *
1076 * If we created the region, we write a non-zero value so that the
1077 * system can't cheat. If we're just joining the region, we can
1078 * only read the value and try to confuse the compiler sufficiently
1079 * that it doesn't figure out that we're never really using it.
1080 */
1081 ret = 0;
1082 if (DB_GLOBAL(db_region_init)) {
1083 if (created)
1084 for (p = addr, t = (u_int8_t *)addr + size;
1085 p < t; p += OS_VMPAGESIZE)
1086 p[0] = 0xdb;
1087 else
1088 for (p = addr, t = (u_int8_t *)addr + size;
1089 p < t; p += OS_VMPAGESIZE)
1090 ret |= p[0];
1091 }
1092
1093 return (ret);
1094 }
1095