1 /*-
2  * See the file LICENSE for redistribution information.
3  *
4  * Copyright (c) 1996, 1997, 1998, 1999
5  *	Sleepycat Software.  All rights reserved.
6  */
7 
8 #include "db_config.h"
9 
10 #ifndef lint
11 static const char sccsid[] = "@(#)env_region.c	11.7 (Sleepycat) 11/12/99";
12 #endif /* not lint */
13 
14 #ifndef NO_SYSTEM_INCLUDES
15 #include <sys/types.h>
16 
17 #include <ctype.h>
18 #include <errno.h>
19 #include <string.h>
20 
21 #ifndef _MSC_VER /* _WIN32 */
22 #include <unistd.h>
23 
24 #endif
25 
26 #endif /* !NO_SYSTEM_INCLUDES */
27 
28 #include "db_int.h"
29 
/*
 * Forward declarations for the file-local helpers defined later in this
 * file.  __P() wraps the parameter lists so the prototypes can be dropped
 * for compilers that do not accept them.
 */
static int CDB___db_des_destroy __P((DB_ENV *, REGION *));
static int CDB___db_des_get __P((DB_ENV *, REGINFO *, REGINFO *, REGION **));
static int CDB___db_e_remfile __P((DB_ENV *));
static int CDB___db_faultmem __P((void *, size_t, int));
34 
35 /*
36  * CDB___db_e_attach
37  *	Join/create the environment
38  *
39  * PUBLIC: int CDB___db_e_attach __P((DB_ENV *));
40  */
41 int
CDB___db_e_attach(dbenv)42 CDB___db_e_attach(dbenv)
43 	DB_ENV *dbenv;
44 {
45 	REGENV *renv;
46 	REGENV_REF ref;
47 	REGINFO *infop;
48 	REGION *rp, tregion;
49 	size_t size;
50 	ssize_t nrw;
51 	u_int32_t mbytes, bytes;
52 	int retry_cnt, ret, segid;
53 	char buf[sizeof(DB_REGION_FMT) + 20];
54 
55 #if !defined(HAVE_MUTEX_THREADS)
56 	/*
57 	 * !!!
58 	 * If we don't have spinlocks, we need a file descriptor for fcntl(2)
59 	 * locking.  We use the file handle from the REGENV file for this
60 	 * purpose.
61 	 *
62 	 * Since we may be using shared memory regions, e.g., shmget(2), and
63 	 * not a mapped-in regular file, the backing file may be only a few
64 	 * bytes in length.  So, this depends on the ability to call fcntl to
65 	 * lock file offsets much larger than the actual physical file.  I
66 	 * think that's safe -- besides, very few systems actually need this
67 	 * kind of support, SunOS is the only one still in wide use of which
68 	 * I'm aware.
69 	 *
70 	 * The error case is if an application lacks spinlocks and wants to be
71 	 * threaded.  That doesn't work because fcntl may lock the underlying
72 	 * process, including all its threads.
73 	 */
74 	if (F_ISSET(dbenv, DB_ENV_THREAD)) {
75 		CDB___db_err(dbenv,
76 "architecture lacks fast mutexes: applications cannot be threaded");
77 		return (EINVAL);
78 	}
79 #endif
80 
81 	/* Initialization */
82 	retry_cnt = 0;
83 
84 	/* Repeated initialization. */
85 loop:	renv = NULL;
86 
87 	/* Set up the DB_ENV's REG_INFO structure. */
88 	if ((ret = CDB___os_calloc(1, sizeof(REGINFO), &infop)) != 0)
89 		return (ret);
90 	infop->id = REG_ID_ENV;
91 	infop->mode = dbenv->db_mode;
92 	if (F_ISSET(dbenv, DB_ENV_CREATE))
93 		F_SET(infop, REGION_CREATE_OK);
94 
95 	/*
96 	 * We have to single-thread the creation of the REGENV region.  Once
97 	 * it exists, we can do locking using locks in the region, but until
98 	 * then we have to be the only player in the game.
99 	 *
100 	 * If this is a private environment, we are only called once and there
101 	 * are no possible race conditions.
102 	 *
103 	 * If this is a public environment, we use the filesystem to ensure
104 	 * the creation of the environment file is single-threaded.
105 	 */
106 	if (F_ISSET(dbenv, DB_ENV_PRIVATE))
107 		goto creation;
108 
109 	/* Build the region name. */
110 	(void)snprintf(buf, sizeof(buf), "%s", DB_REGION_ENV);
111 	if ((ret = CDB___db_appname(dbenv,
112 	    DB_APP_NONE, NULL, buf, 0, NULL, &infop->name)) != 0)
113 		goto err;
114 
115 	/*
116 	 * Try to create the file, if we have the authority.  We have to ensure
117 	 * that multiple threads/processes attempting to simultaneously create
118 	 * the file are properly ordered.  Open using the O_CREAT and O_EXCL
119 	 * flags so that multiple attempts to create the region will return
120 	 * failure in all but one.  POSIX 1003.1 requires that EEXIST be the
121 	 * errno return value -- I sure hope they're right.
122 	 */
123 	if (F_ISSET(dbenv, DB_ENV_CREATE)) {
124 		if ((ret = CDB___os_open(infop->name, DB_OSO_CREATE | DB_OSO_EXCL,
125 		    dbenv->db_mode, dbenv->lockfhp)) == 0)
126 			goto creation;
127 		if (ret != EEXIST) {
128 			CDB___db_err(dbenv,
129 			    "%s: %s", infop->name, CDB_db_strerror(ret));
130 			goto err;
131 		}
132 	}
133 
134 	/*
135 	 * If we couldn't create the file, try and open it.  (If that fails,
136 	 * we're done.)
137 	 */
138 	if ((ret =
139 	    CDB___os_open(infop->name, 0, dbenv->db_mode, dbenv->lockfhp)) != 0)
140 		goto err;
141 
142 	/*
143 	 * !!!
144 	 * The region may be in system memory not backed by the filesystem
145 	 * (more specifically, not backed by this file), and we're joining
146 	 * it.  In that case, the process that created it will have written
147 	 * out a REGENV_REF structure as its only contents.  We read that
148 	 * structure before we do anything further, e.g., we can't just map
149 	 * that file in and then figure out what's going on.
150 	 *
151 	 * All of this noise is because some systems don't have a coherent VM
152 	 * and buffer cache, and what's worse, when you mix operations on the
153 	 * VM and buffer cache, half the time you hang the system.
154 	 *
155 	 * If the file is the size of an REGENV_REF structure, then we know
156 	 * the real region is in some other memory.  (The only way you get a
157 	 * file that size is to deliberately write it, as it's smaller than
158 	 * any possible disk sector created by writing a file or mapping the
159 	 * file into memory.)  In which case, retrieve the structure from the
160 	 * file and use it to acquire the referenced memory.
161 	 *
162 	 * If the structure is larger than a REGENV_REF structure, then this
163 	 * file is backing the shared memory region, and we just map it into
164 	 * memory.
165 	 *
166 	 * And yes, this makes me want to take somebody and kill them.  (I
167 	 * digress -- but you have no freakin' idea.  This is unbelievably
168 	 * stupid and gross, and I've probably spent six months of my life,
169 	 * now, trying to make different versions of it work.)
170 	 */
171 	if ((ret = CDB___os_ioinfo(infop->name,
172 	    dbenv->lockfhp, &mbytes, &bytes, NULL)) != 0) {
173 		CDB___db_err(dbenv, "%s: %s", infop->name, CDB_db_strerror(ret));
174 		goto err;
175 	}
176 
177 	/*
178 	 * !!!
179 	 * A size_t is OK -- regions get mapped into memory, and so can't
180 	 * be larger than a size_t.
181 	 */
182 	size = mbytes * MEGABYTE + bytes;
183 
184 	/*
185 	 * If the size is 0 or less than the size of a REGENV_REF structure,
186 	 * the region (or, possibly, the REGENV_REF structure) has not been
187 	 * fully written.  Wait awhile and try again.
188 	 *
189 	 * Otherwise, if the size is the size of a REGENV_REF structure,
190 	 * read it into memory and use it as a reference to the real region.
191 	 */
192 	segid = INVALID_REGION_SEGID;
193 	if (size <= sizeof(ref)) {
194 		if (size != sizeof(ref))
195 			goto retry;
196 
197 		if ((ret = CDB___os_read(dbenv->lockfhp, &ref,
198 		    sizeof(ref), &nrw)) != 0 || nrw < (ssize_t)sizeof(ref)) {
199 			if (ret == 0)
200 				ret = EIO;
201 			CDB___db_err(dbenv,
202 		    "%s: unable to read system-memory information from: %s",
203 			    infop->name, CDB_db_strerror(ret));
204 			goto err;
205 		}
206 		size = ref.size;
207 		segid = ref.segid;
208 
209 		F_SET(dbenv, DB_ENV_SYSTEM_MEM);
210 	}
211 
212 	/*
213 	 * If not doing thread locking, we need to save the file handle for
214 	 * fcntl(2) locking.  Otherwise, discard the handle, we no longer
215 	 * need it, and the less contact between the buffer cache and the VM,
216 	 * the better.
217 	 */
218 #ifdef HAVE_MUTEX_THREADS
219 	 CDB___os_closehandle(dbenv->lockfhp);
220 #endif
221 
222 	/* Call the region join routine to acquire the region. */
223 	memset(&tregion, 0, sizeof(tregion));
224 	tregion.size = size;
225 	tregion.segid = segid;
226 	if ((ret = CDB___os_r_attach(dbenv, infop, &tregion)) != 0)
227 		goto err;
228 
229 	/*
230 	 * The environment's REGENV structure has to live at offset 0 instead
231 	 * of the usual shalloc information.  Set the primary reference and
232 	 * correct the "addr" value to reference the shalloc region.  Note,
233 	 * this means that all of our offsets (R_ADDR/R_OFFSET) get shifted
234 	 * as well, but that should be fine.
235 	 */
236 	infop->primary = R_ADDR(infop, 0);
237 	infop->addr = (u_int8_t *)infop->addr + sizeof(REGENV);
238 
239 	/*
240 	 * Check if the environment has had a catastrophic failure.
241 	 *
242 	 * Check the magic number to ensure the region is initialized.  If the
243 	 * magic number isn't set, the lock may not have been initialized, and
244 	 * an attempt to use it could lead to random behavior.
245 	 *
246 	 * The panic and magic values aren't protected by any lock, so we never
247 	 * use them in any check that's more complex than set/not-set.
248 	 *
249 	 * !!!
250 	 * I'd rather play permissions games using the underlying file, but I
251 	 * can't because Windows/NT filesystems won't open files mode 0.
252 	 */
253 	renv = infop->primary;
254 	if (renv->panic) {
255 		ret = CDB___db_panic_msg(dbenv);
256 		goto err;
257 	}
258 	if (renv->magic != DB_REGION_MAGIC)
259 		goto retry;
260 
261 	/* Lock the environment. */
262 	MUTEX_LOCK(&renv->mutex, dbenv->lockfhp);
263 
264 	/*
265 	 * Finally!  We own the environment now.  Repeat the panic check, it's
266 	 * possible that it was set while we waited for the lock.
267 	 */
268 	if (renv->panic) {
269 		ret = CDB___db_panic_msg(dbenv);
270 		goto err_unlock;
271 	}
272 
273 	/*
274 	 * Get a reference to the underlying REGION information for this
275 	 * environment.
276 	 */
277 	if ((ret = CDB___db_des_get(dbenv, infop, infop, &rp)) != 0)
278 		goto err_unlock;
279 	if (rp == NULL) {
280 		CDB___db_err(dbenv,
281 		    "%s: unable to find environment REGION", infop->name);
282 		ret = EINVAL;
283 		goto err_unlock;
284 	}
285 	infop->rp = rp;
286 
287 	/*
288 	 * There's still a possibility for inconsistent data.  When we acquired
289 	 * the size of the region and attached to it, it might have still been
290 	 * growing as part of its creation.  We can detect this by checking the
291 	 * size we originally found against the region's current size.  (The
292 	 * region's current size has to be final, the creator finished growing
293 	 * it before releasing the environment for us to lock.)
294 	 */
295 	if (rp->size != size) {
296 err_unlock:	MUTEX_UNLOCK(&renv->mutex);
297 		goto retry;
298 	}
299 
300 	/* Increment the reference count. */
301 	++renv->refcnt;
302 
303 	/* Discard our lock. */
304 	MUTEX_UNLOCK(&renv->mutex);
305 
306 	/*
307 	 * Fault the pages into memory.  Note, do this AFTER releasing the
308 	 * lock, because we're only reading the pages, not writing them.
309 	 */
310 	(void)CDB___db_faultmem(infop->primary, rp->size, 0);
311 
312 	/* Everything looks good, we're done. */
313 	dbenv->reginfo = infop;
314 	return (0);
315 
316 creation:
317 	/* Create the environment region. */
318 	F_SET(infop, REGION_CREATE);
319 
320 	/*
321 	 * Allocate room for 50 REGION structures plus overhead (we're going
322 	 * to use this space for last-ditch allocation requests), although we
323 	 * should never need anything close to that.
324 	 */
325 	memset(&tregion, 0, sizeof(tregion));
326 	tregion.size = 50 * sizeof(REGION) + 50 * sizeof(MUTEX) + 2048;
327 	tregion.segid = INVALID_REGION_SEGID;
328 	if ((ret = CDB___os_r_attach(dbenv, infop, &tregion)) != 0)
329 		goto err;
330 
331 	/*
332 	 * Fault the pages into memory.  Note, do this BEFORE we initialize
333 	 * anything, because we're writing the pages, not just reading them.
334 	 */
335 	(void)CDB___db_faultmem(infop->addr, tregion.size, 1);
336 
337 	/*
338 	 * The first object in the region is the REGENV structure.  This is
339 	 * different from the other regions, and, from everything else in
340 	 * this region, where all objects are allocated from the pool, i.e.,
341 	 * there aren't any fixed locations.  The remaining space is made
342 	 * available for later allocation.
343 	 *
344 	 * The allocation space must be size_t aligned, because that's what
345 	 * the initialization routine is going to store there.  To make sure
346 	 * that happens, the REGENV structure was padded with a final size_t.
347 	 * No other region needs to worry about it because all of them treat
348 	 * the entire region as allocation space.
349 	 *
350 	 * Set the primary reference and correct the "addr" value to reference
351 	 * the shalloc region.  Note, this requires that we "uncorrect" it at
352 	 * region detach, and that all of our offsets (R_ADDR/R_OFFSET) will be
353 	 * shifted as well, but that should be fine.
354 	 */
355 	infop->primary = R_ADDR(infop, 0);
356 	infop->addr = (u_int8_t *)infop->addr + sizeof(REGENV);
357 	CDB___db_shalloc_init(infop->addr, tregion.size - sizeof(REGENV));
358 
359 	/*
360 	 * Initialize the rest of the REGENV structure, except for the magic
361 	 * number which validates the file/environment.
362 	 */
363 	renv = infop->primary;
364 	renv->panic = 0;
365 	CDB_db_version(&renv->majver, &renv->minver, &renv->patch);
366 	SH_LIST_INIT(&renv->regionq);
367 	renv->refcnt = 1;
368 
369 	/*
370 	 * Lock the environment.
371 	 *
372 	 * Check the lock call return.  This is the first lock we initialize
373 	 * and acquire, and we have to know if it fails.  (It CAN fail, e.g.,
374 	 * SunOS, when using fcntl(2) for locking and using an in-memory
375 	 * filesystem as the database home.  But you knew that, I'm sure -- it
376 	 * probably wasn't even worth mentioning.)
377 	 */
378 	if ((ret =
379 	    __db_mutex_init(dbenv, &renv->mutex, DB_FCNTL_OFF_GEN, 0)) != 0) {
380 		CDB___db_err(dbenv, "%s: unable to initialize environment lock: %s",
381 		    infop->name, CDB_db_strerror(ret));
382 		goto err;
383 	}
384 
385 	if (!F_ISSET(&renv->mutex, MUTEX_IGNORE) &&
386 	    (ret = __db_mutex_lock(&renv->mutex, dbenv->lockfhp)) != 0) {
387 		CDB___db_err(dbenv, "%s: unable to acquire environment lock: %s",
388 		    infop->name, CDB_db_strerror(ret));
389 		goto err;
390 	}
391 
392 	/*
393 	 * Get the underlying REGION structure for this environment.  Note,
394 	 * we created the underlying OS region before we acquired the REGION
395 	 * structure, which is backwards from the normal procedure.  Update
396 	 * the REGION structure.
397 	 */
398 	if ((ret = CDB___db_des_get(dbenv, infop, infop, &rp)) != 0)
399 		goto err;
400 	infop->rp = rp;
401 	rp->size = tregion.size;
402 	rp->segid = tregion.segid;
403 
404 	/*
405 	 * !!!
406 	 * If we create an environment where regions are public and in system
407 	 * memory, we have to inform processes joining the environment how to
408 	 * attach to the shared memory segment.  So, we write the shared memory
409 	 * identifier into the file, to be read by those other processes.
410 	 *
411 	 * XXX
412 	 * This is really OS-layer information, but I can't see any easy way
413 	 * to move it down there without passing down information that it has
414 	 * no right to know, e.g., that this is the one-and-only REGENV region
415 	 * and not some other random region.
416 	 */
417 	if (tregion.segid != INVALID_REGION_SEGID) {
418 		ref.size = tregion.size;
419 		ref.segid = tregion.segid;
420 		if ((ret = CDB___os_write(dbenv->lockfhp,
421 		    &ref, sizeof(ref), &nrw)) != 0 || nrw != sizeof(ref)) {
422 			CDB___db_err(dbenv,
423 			    "%s: unable to write out public environment ID: %s",
424 			    infop->name, CDB_db_strerror(ret));
425 			goto err;
426 		}
427 	}
428 
429 	/*
430 	 * If not doing thread locking, we need to save the file handle for
431 	 * fcntl(2) locking.  Otherwise, discard the handle, we no longer
432 	 * need it, and the less contact between the buffer cache and the VM,
433 	 * the better.
434 	 */
435 #if defined(HAVE_MUTEX_THREADS)
436 	if (F_ISSET(dbenv->lockfhp, DB_FH_VALID))
437 		 CDB___os_closehandle(dbenv->lockfhp);
438 #endif
439 
440 	/* Validate the file. */
441 	renv->magic = DB_REGION_MAGIC;
442 
443 	/* Discard our lock. */
444 	MUTEX_UNLOCK(&renv->mutex);
445 
446 	/* Everything looks good, we're done. */
447 	dbenv->reginfo = infop;
448 	return (0);
449 
450 err:
451 retry:	/* Close any open file handle. */
452 	if (F_ISSET(dbenv->lockfhp, DB_FH_VALID))
453 		(void)CDB___os_closehandle(dbenv->lockfhp);
454 
455 	/*
456 	 * If we joined or created the region, detach from it.  If we created
457 	 * it, destroy it.  Note, there's a path in the above code where we're
458 	 * using a temporary REGION structure because we haven't yet allocated
459 	 * the real one.  In that case the region address (addr) will be filled
460 	 * in, but the REGION pointer (rp) won't.  Fix it.
461 	 */
462 	if (infop->addr != NULL) {
463 		if (infop->rp == NULL)
464 			infop->rp = &tregion;
465 
466 		/* Reset the addr value that we "corrected" above. */
467 		infop->addr = infop->primary;
468 		(void)CDB___os_r_detach(dbenv,
469 		    infop, F_ISSET(infop, REGION_CREATE));
470 	}
471 
472 	/* Free the allocated name and/or REGINFO structure. */
473 	if (infop->name != NULL)
474 		CDB___os_freestr(infop->name);
475 	CDB___os_free(infop, sizeof(REGINFO));
476 
477 	/* If we had a temporary error, wait awhile and try again. */
478 	if (ret == 0) {
479 		if (++retry_cnt > 3) {
480 			CDB___db_err(dbenv, "unable to join the environment");
481 			ret = EAGAIN;
482 		} else {
483 			CDB___os_sleep(retry_cnt * 3, 0);
484 			goto loop;
485 		}
486 	}
487 
488 	return (ret);
489 }
490 
491 /*
492  * CDB___db_e_detach --
493  *	Detach from the environment.
494  *
495  * PUBLIC: int CDB___db_e_detach __P((DB_ENV *, int));
496  */
497 int
CDB___db_e_detach(dbenv,destroy)498 CDB___db_e_detach(dbenv, destroy)
499 	DB_ENV *dbenv;
500 	int destroy;
501 {
502 	REGENV *renv;
503 	REGINFO *infop;
504 
505 	infop = dbenv->reginfo;
506 	renv = infop->primary;
507 
508 	/* Lock the environment. */
509 	MUTEX_LOCK(&renv->mutex, dbenv->lockfhp);
510 
511 	/* Decrement the reference count. */
512 	if (renv->refcnt == 0) {
513 		CDB___db_err(dbenv,
514 		    "region %lu (environment): reference count went negative",
515 		    infop->rp->id);
516 	} else
517 		--renv->refcnt;
518 
519 	/* Release the lock. */
520 	MUTEX_UNLOCK(&renv->mutex);
521 
522 	/* Close the locking file handle. */
523 	if (F_ISSET(dbenv->lockfhp, DB_FH_VALID))
524 		(void)CDB___os_closehandle(dbenv->lockfhp);
525 
526 	/* Reset the addr value that we "corrected" above. */
527 	infop->addr = infop->primary;
528 
529 	/*
530 	 * Release the region, and kill our reference.
531 	 *
532 	 * We set the DBENV->reginfo field to NULL here and discard its memory.
533 	 * DBENV->remove calls CDB___dbenv_remove to do the region remove, and
534 	 * CDB___dbenv_remove attached and then detaches from the region.  We don't
535 	 * want to return to DBENV->remove with a non-NULL DBENV->reginfo field
536 	 * because it will attempt to detach again as part of its cleanup.
537 	 */
538 	(void)CDB___os_r_detach(dbenv, infop, destroy);
539 
540 	if (infop->name != NULL)
541 		CDB___os_free(infop->name, 0);
542 	CDB___os_free(dbenv->reginfo, sizeof(REGINFO));
543 	dbenv->reginfo = NULL;
544 
545 	return (0);
546 }
547 
548 /*
549  * CDB___db_e_remove --
550  *	Discard an environment if it's not in use.
551  *
552  * PUBLIC: int CDB___db_e_remove __P((DB_ENV *, int));
553  */
554 int
CDB___db_e_remove(dbenv,force)555 CDB___db_e_remove(dbenv, force)
556 	DB_ENV *dbenv;
557 	int force;
558 {
559 	REGENV *renv;
560 	REGINFO *infop, reginfo;
561 	REGION *rp;
562 	int ret, saved_value;
563 
564 	/*
565 	 * This routine has to walk a nasty line between not looking into
566 	 * the environment (which may be corrupted after an app or system
567 	 * crash), and removing everything that needs removing.  What we
568 	 * do is:
569 	 *	1. Connect to the environment (so it better be OK).
570 	 *	2. If the environment is in use (reference count is non-zero),
571 	 *	   return EBUSY.
572 	 *	3. Overwrite the magic number so that any threads of control
573 	 *	   attempting to connect will backoff and retry.
574 	 *	4. Walk the list of regions.  Connect to each region and then
575 	 *	   disconnect with the destroy flag set.  This shouldn't cause
576 	 *	   any problems, even if the region is corrupted, because we
577 	 *	   should never be looking inside the region.
578 	 *	5. Walk the list of files in the directory, unlinking any
579 	 *	   files that match a region name.  Unlink the environment
580 	 *	   file last.
581 	 *
582 	 * If the force flag is set, we do not acquire any locks during this
583 	 * process.
584 	 */
585 	saved_value = DB_GLOBAL(db_mutexlocks);
586 	if (force)
587 		DB_GLOBAL(db_mutexlocks) = 0;
588 
589 	/* Join the environment. */
590 	if ((ret = CDB___db_e_attach(dbenv)) != 0) {
591 		/*
592 		 * If we can't join it, we assume that's because it doesn't
593 		 * exist.  It would be better to know why we failed, but it
594 		 * probably isn't important.
595 		 */
596 		ret = 0;
597 		if (force)
598 			goto remfiles;
599 		goto err;
600 	}
601 
602 	infop = dbenv->reginfo;
603 	renv = infop->primary;
604 
605 	/* Lock the environment. */
606 	MUTEX_LOCK(&renv->mutex, dbenv->lockfhp);
607 
608 	/* If it's in use, we're done. */
609 	if (renv->refcnt == 1 || force) {
610 		/*
611 		 * Set the panic flag and overwrite the magic number.
612 		 *
613 		 * !!!
614 		 * From this point on, there's no going back, we pretty
615 		 * much ignore errors, and just whack on whatever we can.
616 		 */
617 		renv->panic = 1;
618 		renv->magic = 0;
619 
620 		/*
621 		 * Unlock the environment.  We should no longer need the lock
622 		 * because we've poisoned the pool, but we can't continue to
623 		 * hold it either, because other routines may want it.
624 		 */
625 		MUTEX_UNLOCK(&renv->mutex);
626 
627 		/*
628 		 * Attach to each sub-region and destroy it.
629 		 *
630 		 * !!!
631 		 * The REGION_CREATE_OK flag is set for Windows/95 -- regions
632 		 * are zero'd out when the last reference to the region goes
633 		 * away, in which case the underlying OS region code requires
634 		 * callers be prepared to create the region in order to join it.
635 		 */
636 		memset(&reginfo, 0, sizeof(reginfo));
637 restart:	for (rp = SH_LIST_FIRST(&renv->regionq, __db_region);
638 		    rp != NULL; rp = SH_LIST_NEXT(rp, q, __db_region)) {
639 			if (rp->id == REG_ID_ENV)
640 				continue;
641 
642 			reginfo.id = rp->id;
643 			reginfo.flags = REGION_CREATE_OK;
644 			if (CDB___db_r_attach(dbenv, &reginfo, 0) == 0) {
645 				R_UNLOCK(dbenv, &reginfo);
646 				(void)CDB___db_r_detach(dbenv, &reginfo, 1);
647 			}
648 			goto restart;
649 		}
650 
651 		/* Destroy the environment's region. */
652 		(void)CDB___db_e_detach(dbenv, 1);
653 
654 		/* Discard the physical files. */
655 remfiles:	(void)CDB___db_e_remfile(dbenv);
656 	} else {
657 		/* Unlock the environment. */
658 		MUTEX_UNLOCK(&renv->mutex);
659 
660 		/* Discard the environment. */
661 		(void)CDB___db_e_detach(dbenv, 0);
662 
663 		ret = EBUSY;
664 	}
665 
666 err:	if (force)
667 		DB_GLOBAL(db_mutexlocks) = saved_value;
668 
669 	return (ret);
670 }
671 
672 /*
673  * CDB___db_e_remfile --
674  *	Discard any region files in the filesystem.
675  */
676 static int
CDB___db_e_remfile(dbenv)677 CDB___db_e_remfile(dbenv)
678 	DB_ENV *dbenv;
679 {
680 	static char *old_region_names[] = {
681 		"__db_lock.share",
682 		"__db_log.share",
683 		"__db_mpool.share",
684 		"__db_txn.share",
685 		NULL,
686 	};
687 	int cnt, fcnt, lastrm, ret;
688 	u_int8_t saved_byte;
689 	const char *dir;
690 	char *p, **names, *path, buf[sizeof(DB_REGION_FMT) + 20];
691 
692 	/* Get the full path of a file in the environment. */
693 	(void)snprintf(buf, sizeof(buf), "%s", DB_REGION_ENV);
694 	if ((ret =
695 	    CDB___db_appname(dbenv, DB_APP_NONE, NULL, buf, 0, NULL, &path)) != 0)
696 		return (ret);
697 
698 	/* Get the parent directory for the environment. */
699 	if ((p = CDB___db_rpath(path)) == NULL) {
700 		p = path;
701 		saved_byte = *p;
702 
703 		dir = PATH_DOT;
704 	} else {
705 		saved_byte = *p;
706 		*p = '\0';
707 
708 		dir = path;
709 	}
710 
711 	/* Get the list of file names. */
712 	ret = CDB___os_dirlist(dir, &names, &fcnt);
713 
714 	/* Restore the path, and free it. */
715 	*p = saved_byte;
716 	CDB___os_freestr(path);
717 
718 	if (ret != 0) {
719 		CDB___db_err(dbenv, "%s: %s", dir, CDB_db_strerror(ret));
720 		return (ret);
721 	}
722 
723 	/*
724 	 * Search for valid region names, and remove them.  We remove the
725 	 * environment region last, because it's the key to this whole mess.
726 	 */
727 	for (lastrm = -1, cnt = fcnt; --cnt >= 0;) {
728 		if (strlen(names[cnt]) != DB_REGION_NAME_LENGTH ||
729 		    memcmp(names[cnt], DB_REGION_FMT, DB_REGION_NAME_NUM) != 0)
730 			continue;
731 		if (strcmp(names[cnt], DB_REGION_ENV) == 0) {
732 			lastrm = cnt;
733 			continue;
734 		}
735 		for (p = names[cnt] + DB_REGION_NAME_NUM;
736 		    *p != '\0' && isdigit((int)*p); ++p)
737 			;
738 		if (*p != '\0')
739 			continue;
740 
741 		if (CDB___db_appname(dbenv,
742 		    DB_APP_NONE, NULL, names[cnt], 0, NULL, &path) == 0) {
743 			(void)CDB___os_unlink(path);
744 			CDB___os_freestr(path);
745 		}
746 	}
747 
748 	if (lastrm != -1)
749 		if (CDB___db_appname(dbenv,
750 		    DB_APP_NONE, NULL, names[lastrm], 0, NULL, &path) == 0) {
751 			(void)CDB___os_unlink(path);
752 			CDB___os_freestr(path);
753 		}
754 	CDB___os_dirfree(names, fcnt);
755 
756 	/*
757 	 * !!!
758 	 * Backward compatibility -- remove region files from releases
759 	 * before 2.8.XX.
760 	 */
761 	for (names = (char **)old_region_names; *names != NULL; ++names)
762 		if (CDB___db_appname(dbenv,
763 		    DB_APP_NONE, NULL, *names, 0, NULL, &path) == 0) {
764 			(void)CDB___os_unlink(path);
765 			CDB___os_freestr(path);
766 		}
767 
768 	return (0);
769 }
770 
771 /*
772  * CDB___db_e_stat
773  *	Statistics for the environment.
774  *
775  * PUBLIC: int CDB___db_e_stat __P((DB_ENV *, REGENV *, REGION *, int *));
776  */
777 int
CDB___db_e_stat(dbenv,arg_renv,arg_regions,arg_regions_cnt)778 CDB___db_e_stat(dbenv, arg_renv, arg_regions, arg_regions_cnt)
779 	DB_ENV *dbenv;
780 	REGENV *arg_renv;
781 	REGION *arg_regions;
782 	int *arg_regions_cnt;
783 {
784 	REGENV *renv;
785 	REGINFO *infop;
786 	REGION *rp;
787 	int n;
788 
789 	infop = dbenv->reginfo;
790 	renv = infop->primary;
791 	rp = infop->rp;
792 
793 	/* Lock the environment. */
794 	MUTEX_LOCK(&rp->mutex, dbenv->lockfhp);
795 
796 	*arg_renv = *renv;
797 
798 	for (n = 0, rp = SH_LIST_FIRST(&renv->regionq, __db_region);
799 	    n < *arg_regions_cnt && rp != NULL;
800 	    ++n, rp = SH_LIST_NEXT(rp, q, __db_region))
801 		arg_regions[n] = *rp;
802 
803 	/* Release the lock. */
804 	rp = infop->rp;
805 	MUTEX_UNLOCK(&rp->mutex);
806 
807 	*arg_regions_cnt = n == 0 ? n : n - 1;
808 
809 	return (0);
810 }
811 
812 /*
813  * CDB___db_r_attach
814  *	Join/create a region.
815  *
816  * PUBLIC: int CDB___db_r_attach __P((DB_ENV *, REGINFO *, size_t));
817  */
818 int
CDB___db_r_attach(dbenv,infop,size)819 CDB___db_r_attach(dbenv, infop, size)
820 	DB_ENV *dbenv;
821 	REGINFO *infop;
822 	size_t size;
823 {
824 	REGENV *renv;
825 	REGION *rp;
826 	int ret;
827 	char buf[sizeof(DB_REGION_FMT) + 20];
828 
829 	renv = ((REGINFO *)dbenv->reginfo)->primary;
830 	F_CLR(infop, REGION_CREATE);
831 
832 	/* Lock the environment. */
833 	MUTEX_LOCK(&renv->mutex, dbenv->lockfhp);
834 
835 	/* Find or create a REGION structure for this region. */
836 	if ((ret = CDB___db_des_get(dbenv, dbenv->reginfo, infop, &rp)) != 0) {
837 		MUTEX_UNLOCK(&renv->mutex);
838 		return (ret);
839 	}
840 	infop->rp = rp;
841 	infop->id = rp->id;
842 
843 	/* If we're creating the region, set the desired size. */
844 	if (F_ISSET(infop, REGION_CREATE))
845 		rp->size = size;
846 
847 	/* Join/create the underlying region. */
848 	(void)snprintf(buf, sizeof(buf), DB_REGION_FMT, infop->id);
849 	if ((ret = CDB___db_appname(dbenv,
850 	    DB_APP_NONE, NULL, buf, 0, NULL, &infop->name)) != 0)
851 		goto err;
852 	if ((ret = CDB___os_r_attach(dbenv, infop, rp)) != 0)
853 		goto err;
854 
855 	/*
856 	 * Fault the pages into memory.  Note, do this BEFORE we initialize
857 	 * anything because we're writing pages in created regions, not just
858 	 * reading them.
859 	 */
860 	(void)CDB___db_faultmem(infop->addr,
861 	    rp->size, F_ISSET(infop, REGION_CREATE));
862 
863 	/*
864 	 * !!!
865 	 * The underlying layer may have just decided that we are going
866 	 * to create the region.  There are various system issues that
867 	 * can result in a useless region that requires re-initialization.
868 	 *
869 	 * If we created the region, initialize it for allocation.
870 	 */
871 	if (F_ISSET(infop, REGION_CREATE)) {
872 		((REGION *)(infop->addr))->magic = DB_REGION_MAGIC;
873 
874 		(void)CDB___db_shalloc_init(infop->addr, rp->size);
875 	}
876 
877 	/*
878 	 * If the underlying REGION isn't the environment, acquire a lock
879 	 * for it and release our lock on the environment.
880 	 */
881 	if (infop->id != REG_ID_ENV) {
882 		MUTEX_LOCK(&rp->mutex, dbenv->lockfhp);
883 		MUTEX_UNLOCK(&renv->mutex);
884 	}
885 
886 	return (0);
887 
888 	/* Discard the underlying region. */
889 err:	if (infop->addr != NULL)
890 		(void)CDB___os_r_detach(dbenv,
891 		    infop, F_ISSET(infop, REGION_CREATE));
892 	infop->rp = NULL;
893 	infop->id = REG_ID_INVALID;
894 
895 	/* Discard the REGION structure if we created it. */
896 	if (F_ISSET(infop, REGION_CREATE))
897 		(void)CDB___db_des_destroy(dbenv, rp);
898 
899 	/* Release the environment lock. */
900 	MUTEX_UNLOCK(&renv->mutex);
901 
902 	return (ret);
903 }
904 
905 /*
906  * CDB___db_r_detach --
907  *	Detach from a region.
908  *
909  * PUBLIC: int CDB___db_r_detach __P((DB_ENV *, REGINFO *, int));
910  */
911 int
CDB___db_r_detach(dbenv,infop,destroy)912 CDB___db_r_detach(dbenv, infop, destroy)
913 	DB_ENV *dbenv;
914 	REGINFO *infop;
915 	int destroy;
916 {
917 	REGENV *renv;
918 	REGION *rp;
919 	int ret, t_ret;
920 
921 	renv = ((REGINFO *)dbenv->reginfo)->primary;
922 	rp = infop->rp;
923 
924 	/* Lock the environment. */
925 	MUTEX_LOCK(&renv->mutex, dbenv->lockfhp);
926 
927 	/* Acquire the lock for the REGION. */
928 	MUTEX_LOCK(&rp->mutex, dbenv->lockfhp);
929 
930 	/* Detach from the underlying OS region. */
931 	ret = CDB___os_r_detach(dbenv, infop, destroy);
932 
933 	/* Release the REGION lock. */
934 	MUTEX_UNLOCK(&rp->mutex);
935 
936 	/* If we destroyed the region, discard the REGION structure. */
937 	if (destroy &&
938 	    ((t_ret = CDB___db_des_destroy(dbenv, rp)) != 0) && ret == 0)
939 		ret = t_ret;
940 
941 	/* Release the environment lock. */
942 	MUTEX_UNLOCK(&renv->mutex);
943 
944 	/* Destroy the structure. */
945 	if (infop->name != NULL)
946 		CDB___os_freestr(infop->name);
947 
948 	return (ret);
949 }
950 
951 /*
952  * CDB___db_des_get --
953  *	Return a reference to the shared information for a REGION,
954  *	optionally creating a new entry.
955  */
956 static int
CDB___db_des_get(dbenv,env_infop,infop,rpp)957 CDB___db_des_get(dbenv, env_infop, infop, rpp)
958 	DB_ENV *dbenv;
959 	REGINFO *env_infop, *infop;
960 	REGION **rpp;
961 {
962 	REGENV *renv;
963 	REGION *rp;
964 	int maxid, ret;
965 
966 	/*
967 	 * !!!
968 	 * Called with the environment already locked.
969 	 */
970 	*rpp = NULL;
971 	renv = env_infop->primary;
972 
973 	maxid = REG_ID_ASSIGN;
974 	for (rp = SH_LIST_FIRST(&renv->regionq, __db_region);
975 	    rp != NULL; rp = SH_LIST_NEXT(rp, q, __db_region)) {
976 		if (rp->id == infop->id)
977 			break;
978 		if (rp->id > maxid)
979 			maxid = rp->id;
980 	}
981 
982 	/*
983 	 * If we didn't find a region, or we found one needing initialization,
984 	 * and we can't create the region, fail.
985 	 */
986 	if (!F_ISSET(infop, REGION_CREATE_OK) &&
987 	    (rp == NULL || F_ISSET(rp, REG_DEAD)))
988 		return (ENOENT);
989 
990 	/*
991 	 * If we didn't find a region, create and initialize a REGION structure
992 	 * for the caller.  If id was set, use that value, otherwise we use the
993 	 * next available ID.
994 	 */
995 	if (rp == NULL) {
996 		if ((ret = CDB___db_shalloc(env_infop->addr,
997 		    sizeof(REGION), MUTEX_ALIGN, &rp)) != 0)
998 			return (ret);
999 
1000 		/* Initialize the region. */
1001 		memset(rp, 0, sizeof(*rp));
1002 		if ((ret = __db_mutex_init(dbenv, &rp->mutex,
1003 		    R_OFFSET(env_infop, &rp->mutex) + DB_FCNTL_OFF_GEN,
1004 		    0)) != 0) {
1005 			CDB___db_shalloc_free(env_infop->addr, rp);
1006 			return (ret);
1007 		}
1008 		rp->segid = INVALID_REGION_SEGID;
1009 		rp->id = infop->id == REG_ID_INVALID ? maxid + 1 : infop->id;
1010 
1011 		SH_LIST_INSERT_HEAD(&renv->regionq, rp, q, __db_region);
1012 		F_SET(infop, REGION_CREATE);
1013 	} else {
1014 		/*
1015 		 * There is one race -- a caller created a region, was trying
1016 		 * to initialize it for general use, and failed somehow.  We
1017 		 * leave the region around and tell each new caller that they
1018 		 * are creating it, because that's easier than dealing with
1019 		 * the races involved in removing it.
1020 		 */
1021 		if (F_ISSET(rp, REG_DEAD)) {
1022 			rp->primary = INVALID_ROFF;
1023 
1024 			F_CLR(rp, REG_DEAD);
1025 			F_SET(infop, REGION_CREATE);
1026 		}
1027 	}
1028 
1029 	*rpp = rp;
1030 	return (0);
1031 }
1032 
1033 /*
1034  * CDB___db_des_destroy --
1035  *	Destroy a reference to a REGION.
1036  */
1037 static int
CDB___db_des_destroy(dbenv,rp)1038 CDB___db_des_destroy(dbenv, rp)
1039 	DB_ENV *dbenv;
1040 	REGION *rp;
1041 {
1042 	REGINFO *infop;
1043 
1044 	/*
1045 	 * !!!
1046 	 * Called with the environment already locked.
1047 	 */
1048 	infop = dbenv->reginfo;
1049 
1050 	SH_LIST_REMOVE(rp, q, __db_region);
1051 	CDB___db_shalloc_free(infop->addr, rp);
1052 
1053 	return (0);
1054 }
1055 
1056 /*
1057  * CDB___db_faultmem --
1058  *	Fault the region into memory.
1059  */
1060 static int
CDB___db_faultmem(addr,size,created)1061 CDB___db_faultmem(addr, size, created)
1062 	void *addr;
1063 	size_t size;
1064 	int created;
1065 {
1066 	int ret;
1067 	u_int8_t *p, *t;
1068 
1069 	/*
1070 	 * It's sometimes significantly faster to page-fault in all of the
1071 	 * region's pages before we run the application, as we see nasty
1072 	 * side-effects when we page-fault while holding various locks, i.e.,
1073 	 * the lock takes a long time to acquire because of the underlying
1074 	 * page fault, and the other threads convoy behind the lock holder.
1075 	 *
1076 	 * If we created the region, we write a non-zero value so that the
1077 	 * system can't cheat.  If we're just joining the region, we can
1078 	 * only read the value and try to confuse the compiler sufficiently
1079 	 * that it doesn't figure out that we're never really using it.
1080 	 */
1081 	ret = 0;
1082 	if (DB_GLOBAL(db_region_init)) {
1083 		if (created)
1084 			for (p = addr, t = (u_int8_t *)addr + size;
1085 			    p < t; p += OS_VMPAGESIZE)
1086 				p[0] = 0xdb;
1087 		else
1088 			for (p = addr, t = (u_int8_t *)addr + size;
1089 			    p < t; p += OS_VMPAGESIZE)
1090 				ret |= p[0];
1091 	}
1092 
1093 	return (ret);
1094 }
1095