1 /*-
2  * See the file LICENSE for redistribution information.
3  *
4  * Copyright (c) 1996, 2013 Oracle and/or its affiliates.  All rights reserved.
5  */
6 /*
7  * Copyright (c) 1990, 1993, 1994, 1995, 1996
8  *	Keith Bostic.  All rights reserved.
9  */
10 /*
11  * Copyright (c) 1990, 1993, 1994, 1995
12  *	The Regents of the University of California.  All rights reserved.
13  *
14  * This code is derived from software contributed to Berkeley by
15  * Mike Olson.
16  *
17  * Redistribution and use in source and binary forms, with or without
18  * modification, are permitted provided that the following conditions
19  * are met:
20  * 1. Redistributions of source code must retain the above copyright
21  *    notice, this list of conditions and the following disclaimer.
22  * 2. Redistributions in binary form must reproduce the above copyright
23  *    notice, this list of conditions and the following disclaimer in the
24  *    documentation and/or other materials provided with the distribution.
25  * 3. Neither the name of the University nor the names of its contributors
26  *    may be used to endorse or promote products derived from this software
27  *    without specific prior written permission.
28  *
29  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
30  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
31  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
32  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
33  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
34  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
35  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
36  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
37  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
38  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
39  * SUCH DAMAGE.
40  *
41  * $Id$
42  */
43 
44 #include "db_config.h"
45 
46 #include "db_int.h"
47 #include "dbinc/db_page.h"
48 #include "dbinc/lock.h"
49 #include "dbinc/log.h"
50 #include "dbinc/mp.h"
51 #include "dbinc/txn.h"
52 #include "dbinc/db_am.h"
53 #include "dbinc/hash.h"
54 
55 static void __db_init_meta __P((DB *, void *, db_pgno_t, u_int32_t));
56 #ifdef HAVE_FTRUNCATE
57 static int  __db_pglistcmp __P((const void *, const void *));
58 static int  __db_truncate_freelist __P((DBC *, DBMETA *,
59       PAGE *, db_pgno_t *, u_int32_t, u_int32_t));
60 #endif
61 
62 /*
63  * __db_init_meta --
64  *	Helper function for __db_new that initializes the important fields in
65  * a meta-data page (used instead of P_INIT).  We need to make sure that we
66  * retain the page number and LSN of the existing page.
67  */
68 static void
__db_init_meta(dbp,p,pgno,pgtype)69 __db_init_meta(dbp, p, pgno, pgtype)
70 	DB *dbp;
71 	void *p;
72 	db_pgno_t pgno;
73 	u_int32_t pgtype;
74 {
75 	DBMETA *meta;
76 	DB_LSN save_lsn;
77 
78 	meta = (DBMETA *)p;
79 	save_lsn = meta->lsn;
80 	memset(meta, 0, sizeof(DBMETA));
81 	meta->lsn = save_lsn;
82 	meta->pagesize = dbp->pgsize;
83 	if (F_ISSET(dbp, DB_AM_CHKSUM))
84 		FLD_SET(meta->metaflags, DBMETA_CHKSUM);
85 	meta->pgno = pgno;
86 	meta->type = (u_int8_t)pgtype;
87 }
88 
89 /*
90  * __db_new --
91  *	Get a new page, preferably from the freelist.
92  *
93  * PUBLIC: int __db_new __P((DBC *, u_int32_t, DB_LOCK *, PAGE **));
94  */
int
__db_new(dbc, type, lockp, pagepp)
	DBC *dbc;		/* Cursor supplying txn, locker and priority. */
	u_int32_t type;		/* Page type; may carry the P_DONTEXTEND flag. */
	DB_LOCK *lockp;		/* If non-NULL, receives a write lock on the page. */
	PAGE **pagepp;		/* Returns the allocated page, pinned dirty. */
{
	DB *dbp;
	DBMETA *meta;
	DB_LOCK metalock;
	DB_LSN lsn;
	DB_MPOOLFILE *mpf;
	ENV *env;
	PAGE *h;
	db_pgno_t last, *list, pgno, newnext;
	int extend, hash, ret;

	meta = NULL;
	dbp = dbc->dbp;
	env = dbp->env;
	mpf = dbp->mpf;
	h = NULL;
	newnext = PGNO_INVALID;
	if (lockp != NULL)
		LOCK_INIT(*lockp);

	hash = 0;
	ret = 0;
	LOCK_INIT(metalock);

#ifdef HAVE_HASH
	/*
	 * Hash may already have the meta page pinned by this cursor; reuse
	 * that reference instead of locking and fetching it a second time.
	 * When we do, 'hash' is set so we do not fput the page below.
	 */
	if (dbp->type == DB_HASH) {
		if ((ret = __ham_return_meta(dbc, DB_MPOOL_DIRTY, &meta)) != 0)
			goto err;
		if (meta != NULL)
			hash = 1;
	}
#endif
	if (meta == NULL) {
		pgno = PGNO_BASE_MD;
		if ((ret = __db_lget(dbc,
		    LCK_ALWAYS, pgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
			goto err;
		if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn,
		    DB_MPOOL_DIRTY, &meta)) != 0)
			goto err;
	}

	last = meta->last_pgno;
	if (meta->free == PGNO_INVALID) {
		/*
		 * The free list is empty: extending the file is the only
		 * option.  If the caller forbade that, return success with
		 * a NULL page (ret is still 0 on this path).
		 */
		if (FLD_ISSET(type, P_DONTEXTEND)) {
			*pagepp = NULL;
			goto err;
		}
		last = pgno = meta->last_pgno + 1;
		ZERO_LSN(lsn);
		extend = 1;
	} else {
		pgno = meta->free;
		/*
		 * Lock the new page.  Do this here because we must do it
		 * before getting the page and the caller may need the lock
		 * to keep readers from seeing the page before the transaction
		 * commits.  We can do this because no one will hold a free
		 * page locked.
		 */
		if (lockp != NULL && (ret =
		     __db_lget(dbc, 0, pgno, DB_LOCK_WRITE, 0, lockp)) != 0)
			goto err;
		if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn,
		    DB_MPOOL_DIRTY, &h)) != 0)
			goto err;

		/*
		 * We want to take the first page off the free list and
		 * then set meta->free to the that page's next_pgno, but
		 * we need to log the change first.
		 */
		newnext = h->next_pgno;
		lsn = h->lsn;
		extend = 0;
		DB_ASSERT(env, TYPE(h) == P_INVALID);

		/*
		 * A non-P_INVALID page on the free list means on-disk
		 * corruption; panic rather than hand the page out.
		 */
		if (TYPE(h) != P_INVALID) {
			__db_errx(env, DB_STR_A("0689",
			    "%s page %lu is on free list with type %lu",
			    "%s %lu %lu"), dbp->fname, (u_long)PGNO(h),
			    (u_long)TYPE(h));
			return (__env_panic(env, EINVAL));
		}

	}

	FLD_CLR(type, P_DONTEXTEND);

	/*
	 * Log the allocation before fetching the new page.  If we
	 * don't have room in the log then we don't want to tell
	 * mpool to extend the file.
	 */
	if (DBC_LOGGING(dbc)) {
		if ((ret = __db_pg_alloc_log(dbp, dbc->txn, &LSN(meta), 0,
		    &LSN(meta), PGNO_BASE_MD, &lsn,
		    pgno, (u_int32_t)type, newnext, meta->last_pgno)) != 0)
			goto err;
	} else
		LSN_NOT_LOGGED(LSN(meta));

	/* Unlink the page from the free list. */
	meta->free = newnext;

	if (extend == 1) {
		/* Lock before creating the page, as in the free-list case. */
		if (lockp != NULL && (ret =
		     __db_lget(dbc, 0, pgno, DB_LOCK_WRITE, 0, lockp)) != 0)
			goto err;
		if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn,
		    DB_MPOOL_NEW, &h)) != 0)
			goto err;
		DB_ASSERT(env, last == pgno);
		meta->last_pgno = pgno;
		ZERO_LSN(h->lsn);
		h->pgno = pgno;

		/*
		 * If the file was extended for the first time in this
		 * transaction, set the MPOOLFILE's file extension
		 * watermark.
		 */
		__txn_add_fe_watermark(dbc->txn, dbp, h->pgno);

	}
	LSN(h) = LSN(meta);

	/* Release the meta page (hash holds its own reference). */
	if (hash == 0 && (ret = __memp_fput(mpf,
	    dbc->thread_info, meta, dbc->priority)) != 0)
		goto err;
	meta = NULL;

	/* Initialize the new page for its destined type. */
	switch (type) {
		case P_BTREEMETA:
		case P_HASHMETA:
		case P_QAMMETA:
			__db_init_meta(dbp, h, h->pgno, type);
			break;
		default:
			P_INIT(h, dbp->pgsize,
			    h->pgno, PGNO_INVALID, PGNO_INVALID, 0, type);
			break;
	}

	/* Fix up the sorted free list if necessary. */
#ifdef HAVE_FTRUNCATE
	if (extend == 0) {
		u_int32_t nelems = 0;

		if ((ret = __memp_get_freelist(dbp->mpf, &nelems, &list)) != 0)
			goto err;
		if (nelems != 0) {
			/* We always hand out the smallest free page. */
			DB_ASSERT(env, h->pgno == list[0]);
			memmove(list, &list[1], (nelems - 1) * sizeof(*list));
			if ((ret = __memp_extend_freelist(
			    dbp->mpf, nelems - 1, &list)) != 0)
				goto err;
		}
	}
#else
	COMPQUIET(list, NULL);
#endif

	if ((ret = __TLPUT(dbc, metalock)) != 0)
		return (ret);
	*pagepp = h;
	PERFMON6(env, alloc, new, dbp->fname, dbp->dname, pgno, type, h, 0);
	return (0);

err:	if (h != NULL)
		(void)__memp_fput(mpf, dbc->thread_info, h, dbc->priority);
	if (meta != NULL && hash == 0)
		(void)__memp_fput(mpf, dbc->thread_info, meta, dbc->priority);
	(void)__TLPUT(dbc, metalock);
	if (lockp != NULL)
		(void)__LPUT(dbc, *lockp);
	/* Failure return - report 0 pgno, null page address. */
	PERFMON6(env, alloc, new, dbp->fname, dbp->dname, 0, type, NULL, ret);
	return (ret);
}
280 
281 /*
282  * __db_free --
283  *	Add a page to the head of the freelist.
284  *
285  * PUBLIC: int __db_free __P((DBC *, PAGE *, u_int32_t));
286  */
int
__db_free(dbc, h, flags)
	DBC *dbc;		/* Cursor supplying txn, locker and priority. */
	PAGE *h;		/* Page to free; this reference is consumed. */
	u_int32_t flags;	/* DB_LOG_NO_DATA: skip logging page data. */
{
	DB *dbp;
	DBMETA *meta;
	DBT ddbt, ldbt;
	DB_LOCK metalock;
	DB_LSN *lsnp;
	DB_MPOOLFILE *mpf;
	PAGE *prev;
	db_pgno_t last_pgno, next_pgno, pgno, prev_pgno;
	u_int32_t lflag;
	int hash, ret, t_ret;
#ifdef HAVE_FTRUNCATE
	db_pgno_t *list, *lp;
	u_int32_t nelem, position, start;
	int do_truncate;
#endif

	dbp = dbc->dbp;
	mpf = dbp->mpf;
	prev_pgno = PGNO_INVALID;
	meta = NULL;
	prev = NULL;
	LOCK_INIT(metalock);
#ifdef HAVE_FTRUNCATE
	lp = NULL;
	nelem = 0;
	do_truncate = 0;
#endif

	/*
	 * Retrieve the metadata page.  If we are not keeping a sorted
	 * free list put the page at the head of the free list.
	 * If we are keeping a sorted free list, for truncation,
	 * then figure out where this page belongs and either
	 * link it in or truncate the file as much as possible.
	 * If either the lock get or page get routines
	 * fail, then we need to put the page with which we were called
	 * back because our caller assumes we take care of it.
	 */
	hash = 0;

	pgno = PGNO_BASE_MD;
	if ((ret = __db_lget(dbc,
	    LCK_ALWAYS, pgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
		goto err;

#ifdef HAVE_HASH
	/*
	 * Hash may already have the meta page pinned; with truncation
	 * support we delay dirtying it until we know we will modify it.
	 */
	if (dbp->type == DB_HASH) {
		if ((ret = __ham_return_meta(dbc,
#ifdef HAVE_FTRUNCATE
		    0,
#else
		    DB_MPOOL_DIRTY,
#endif
		&meta)) != 0)
			goto err;
		if (meta != NULL)
			hash = 1;
	}
#endif
	if (meta == NULL) {
		/* If we support truncate, we might not dirty the meta page. */
		if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn,
#ifdef HAVE_FTRUNCATE
		    0,
#else
		    DB_MPOOL_DIRTY,
#endif
		    &meta)) != 0)
			goto err1;
	}

	last_pgno = meta->last_pgno;
	next_pgno = meta->free;
	/*
	 * Assign lsnp here so it always initialized when
	 * HAVE_FTRUNCATE is not defined.
	 */
	lsnp = &LSN(meta);

	/* Freeing a page already at the head of the list is a double free. */
	DB_ASSERT(dbp->env, h->pgno != next_pgno);

#ifdef HAVE_FTRUNCATE
	/*
	 * If we are maintaining a sorted free list see if we either have a
	 * new truncation point or the page goes somewhere in the middle of
	 * the list.  If it goes in the middle of the list, we will drop the
	 * meta page and get the previous page.
	 */
	COMPQUIET(position, 0);
	if ((ret = __memp_get_freelist(mpf, &nelem, &list)) != 0)
		goto err1;
	if (list == NULL)
		goto no_sort;

	if (h->pgno != last_pgno) {
		/*
		 * Put the page number in the sorted list.  Find its
		 * position and the previous page.  After logging we
		 * will extend the list, make room and insert the page in
		 * the list.
		 */
		position = 0;
		if (nelem != 0) {
			__db_freelist_pos(h->pgno, list, nelem, &position);

			DB_ASSERT(dbp->env, h->pgno != list[position]);

			/* Get the previous page if this is not the smallest. */
			if (position != 0 || h->pgno > list[0])
				prev_pgno = list[position];
		}

	} else if (nelem != 0) {
		/* Find the truncation point. */
		for (lp = &list[nelem - 1]; lp >= list; lp--)
			if (--last_pgno != *lp)
				break;
		if (lp < list || last_pgno < h->pgno - 1)
			do_truncate = 1;
		last_pgno = meta->last_pgno;
	}

no_sort:
	if (prev_pgno == PGNO_INVALID) {
		/* We will modify the meta page itself; dirty it now. */
#ifdef HAVE_HASH
		if (hash) {
			if ((ret =
			    __ham_return_meta(dbc, DB_MPOOL_DIRTY, &meta)) != 0)
				goto err1;
		} else
#endif
		if ((ret = __memp_dirty(mpf,
		    &meta, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
			goto err1;
		lsnp = &LSN(meta);
	} else {
		/* The page links into the middle: update the previous page. */
		pgno = prev_pgno;
		if ((ret = __memp_fget(mpf, &pgno,
		    dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &prev)) != 0)
			goto err1;
		next_pgno = NEXT_PGNO(prev);
		lsnp = &LSN(prev);
	}
#endif

	/*
	 * Log the change.
	 *	We are either logging an update to the metapage or to the
	 * previous page in the sorted list.
	 */
	if (DBC_LOGGING(dbc)) {
		memset(&ldbt, 0, sizeof(ldbt));
		ldbt.data = h;
		ldbt.size = P_OVERHEAD(dbp);
		/*
		 * If we are removing pages from the file, we need to make
		 * sure the logging happens before the truncation.  If we
		 * are truncating multiple pages we don't need to flush the
		 * log here as it will be flushed by __db_truncate_freelist.
		 */
		lflag = 0;

#ifdef HAVE_FTRUNCATE
		if (h->pgno == last_pgno && do_truncate == 0)
			lflag = DB_FLUSH;
#endif
		/* Decide how much of the page image must go into the log. */
		switch (h->type) {
		case P_HASH:
		case P_IBTREE:
		case P_IRECNO:
		case P_LBTREE:
		case P_LRECNO:
		case P_LDUP:
			if (h->entries > 0 && (h->pgno == last_pgno ||
			    !LF_ISSET(DB_LOG_NO_DATA))) {
				ldbt.size += h->entries * sizeof(db_indx_t);
				ddbt.data = (u_int8_t *)h + HOFFSET(h);
				ddbt.size = dbp->pgsize - HOFFSET(h);
				if ((ret = __db_pg_freedata_log(dbp, dbc->txn,
				     lsnp, lflag,
				     h->pgno, lsnp, pgno,
				     &ldbt, next_pgno, last_pgno, &ddbt)) != 0)
					goto err1;
				goto logged;
			}
			break;
		case P_HASHMETA:
			ldbt.size = sizeof(HMETA);
			break;
		case P_BTREEMETA:
			ldbt.size = sizeof(BTMETA);
			break;
		case P_OVERFLOW:
			ldbt.size += OV_LEN(h);
			break;
		default:
			DB_ASSERT(dbp->env, h->type != P_QAMDATA);
		}

		if ((ret = __db_pg_free_log(dbp,
		      dbc->txn, lsnp, lflag, h->pgno,
		      lsnp, pgno, &ldbt, next_pgno, last_pgno)) != 0)
			goto err1;
	} else
		LSN_NOT_LOGGED(*lsnp);

logged:
#ifdef HAVE_FTRUNCATE
	if (do_truncate) {
		/* Truncate the freed tail; __db_truncate_freelist puts h. */
		start = (u_int32_t) (lp - list) + 1;
		meta->last_pgno--;
		ret = __db_truncate_freelist(
		      dbc, meta, h, list, start, nelem);
		h = NULL;
	} else if (h->pgno == last_pgno) {
		/*
		 * We are going to throw this page away, but if we are
		 * using MVCC then this version may stick around and we
		 * might have to make a copy.
		 */
		if (atomic_read(&mpf->mfp->multiversion) &&
		    (ret = __memp_dirty(mpf,
		    &h, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
			goto err1;
		LSN(h) = *lsnp;
		P_INIT(h, dbp->pgsize,
		    h->pgno, PGNO_INVALID, next_pgno, 0, P_INVALID);
		if ((ret = __memp_fput(mpf,
		    dbc->thread_info, h, DB_PRIORITY_VERY_LOW)) != 0)
			goto err1;
		h = NULL;
		/* Give the page back to the OS. */
		if ((ret = __memp_ftruncate(mpf, dbc->txn, dbc->thread_info,
		    last_pgno, 0)) != 0)
			goto err1;
		DB_ASSERT(dbp->env, meta->pgno == PGNO_BASE_MD);
		meta->last_pgno--;
	} else {
		if (list != NULL) {
			/* Put the page number into the list. */
			if ((ret =
			    __memp_extend_freelist(mpf, nelem + 1, &list)) != 0)
				goto err1;
			if (prev_pgno != PGNO_INVALID)
				lp = &list[position + 1];
			else
				lp = list;
			if (nelem != 0 && position != nelem)
				memmove(lp + 1, lp, (size_t)
				    ((u_int8_t*)&list[nelem] - (u_int8_t*)lp));
			*lp = h->pgno;
		}
#else
	{
#endif
		/*
		 * If we are not truncating the page then we
		 * reinitialize it and put it at the head of
		 * the free list.
		 */
		if ((ret = __memp_dirty(mpf,
		    &h, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
			goto err1;
		LSN(h) = *lsnp;
		P_INIT(h, dbp->pgsize,
		    h->pgno, PGNO_INVALID, next_pgno, 0, P_INVALID);
#ifdef DIAGNOSTIC
		memset((u_int8_t *) h + P_OVERHEAD(dbp),
		    CLEAR_BYTE, dbp->pgsize - P_OVERHEAD(dbp));
#endif
		if (prev_pgno == PGNO_INVALID)
			meta->free = h->pgno;
		else
			NEXT_PGNO(prev) = h->pgno;
	}

	/* Discard the metadata or previous page. */
err1:	if (hash == 0 && meta != NULL && (t_ret = __memp_fput(mpf,
	    dbc->thread_info, (PAGE *)meta, dbc->priority)) != 0 && ret == 0)
		ret = t_ret;
	if ((t_ret = __TLPUT(dbc, metalock)) != 0 && ret == 0)
		ret = t_ret;
	if (prev != (PAGE*) meta && prev != NULL && (t_ret = __memp_fput(mpf,
	    dbc->thread_info, prev, dbc->priority)) != 0 && ret == 0)
		ret = t_ret;

	/* Discard the caller's page reference. */
err:	if (h != NULL && (t_ret = __memp_fput(mpf,
	    dbc->thread_info, h, dbc->priority)) != 0 && ret == 0)
		ret = t_ret;

	PERFMON4(dbp->env, alloc, free, dbp->fname, dbp->dname, pgno, ret);
	/*
	 * XXX
	 * We have to unlock the caller's page in the caller!
	 */
	return (ret);
}
591 
592 #ifdef HAVE_FTRUNCATE
593 /*
594  * __db_freelist_pos -- find the position of a page in the freelist.
595  *	The list is sorted, we do a binary search.
596  *
597  * PUBLIC: #ifdef HAVE_FTRUNCATE
598  * PUBLIC: void __db_freelist_pos __P((db_pgno_t,
599  * PUBLIC:       db_pgno_t *, u_int32_t, u_int32_t *));
600  * PUBLIC: #endif
601  */
602 void
__db_freelist_pos(pgno,list,nelem,posp)603 __db_freelist_pos(pgno, list, nelem, posp)
604 	db_pgno_t pgno;
605 	db_pgno_t *list;
606 	u_int32_t nelem;
607 	u_int32_t *posp;
608 {
609 	u_int32_t base, indx, lim;
610 
611 	indx = 0;
612 	for (base = 0, lim = nelem; lim != 0; lim >>= 1) {
613 		indx = base + (lim >> 1);
614 		if (pgno == list[indx]) {
615 			*posp = indx;
616 			return;
617 		}
618 		if (pgno > list[indx]) {
619 			base = indx + 1;
620 			--lim;
621 		}
622 	}
623 	if (base != 0)
624 		base--;
625 	*posp = base;
626 	return;
627 }
628 
629 static int
__db_pglistcmp(a,b)630 __db_pglistcmp(a, b)
631 	const void *a, *b;
632 {
633 	db_pglist_t *ap, *bp;
634 
635 	ap = (db_pglist_t *)a;
636 	bp = (db_pglist_t *)b;
637 
638 	return ((ap->pgno > bp->pgno) ? 1 : (ap->pgno < bp->pgno) ? -1: 0);
639 }
640 
641 /*
642  * __db_freelist_sort -- sort a list of free pages.
643  * PUBLIC: void __db_freelist_sort __P((db_pglist_t *, u_int32_t));
644  */
void
__db_freelist_sort(list, nelems)
	db_pglist_t *list;	/* Array of free-page records, sorted in place. */
	u_int32_t nelems;	/* Number of entries in the array. */
{
	/* Sort ascending by page number so truncation can scan the tail. */
	qsort(list, (size_t)nelems, sizeof(db_pglist_t), __db_pglistcmp);
}
652 
653 /*
654  * __db_pg_truncate -- find the truncation point in a sorted freelist.
655  *
656  * PUBLIC: #ifdef HAVE_FTRUNCATE
657  * PUBLIC: int __db_pg_truncate __P((DBC *, DB_TXN *,
658  * PUBLIC:    db_pglist_t *, DB_COMPACT *, u_int32_t *,
659  * PUBLIC:    db_pgno_t , db_pgno_t *, DB_LSN *, int));
660  * PUBLIC: #endif
661  */
int
__db_pg_truncate(dbc, txn,
    list, c_data, nelemp, free_pgno, last_pgno, lsnp, in_recovery)
	DBC *dbc;		/* Cursor supplying txn and thread info. */
	DB_TXN *txn;		/* Transaction for page gets (may differ from dbc->txn). */
	db_pglist_t *list;	/* Sorted free-page list. */
	DB_COMPACT *c_data;	/* Optional compaction statistics. */
	u_int32_t *nelemp;	/* In: list size; out: entries kept after truncation. */
	db_pgno_t free_pgno, *last_pgno;	/* Head of free list; in/out last page. */
	DB_LSN *lsnp;		/* LSN to apply to relinked pages. */
	int in_recovery;	/* Non-zero when replaying the log. */
{
	DB *dbp;
	DBT ddbt;
	DB_LSN null_lsn;
	DB_MPOOLFILE *mpf;
	PAGE *h;
	db_pglist_t *lp, *slp;
	db_pgno_t lpgno, pgno;
	u_int32_t elems, log_size, tpoint;
	int last, ret;

	ret = 0;
	h = NULL;

	dbp = dbc->dbp;
	mpf = dbp->mpf;
	elems = tpoint = *nelemp;

	/*
	 * Figure out what (if any) pages can be truncated immediately and
	 * record the place from which we can truncate, so we can do the
	 * memp_ftruncate below.  We also use this to avoid ever putting
	 * these pages on the freelist, which we are about to relink.
	 */
	pgno = *last_pgno;
	lp = &list[elems - 1];
	last = 1;
	/* Walk back while the tail of the list is contiguous with EOF. */
	while (tpoint != 0) {
		if (lp->pgno != pgno)
			break;
		pgno--;
		tpoint--;
		lp--;
	}

	lp = list;
	slp = &list[elems];
	/*
	 * Log the sorted list. We log the whole list so it can be rebuilt.
	 * Don't overflow the log file.
	 */
again:	if (DBC_LOGGING(dbc)) {
		last = 1;
		lpgno = *last_pgno;
		ddbt.size = elems * sizeof(*lp);
		ddbt.data = lp;
		log_size = ((LOG *)dbc->env->
		    lg_handle->reginfo.primary)->log_size;
		/* Cap each log record at half a log file; segment if bigger. */
		if (ddbt.size > log_size / 2) {
			elems = (log_size / 2) / sizeof(*lp);
			ddbt.size = elems * sizeof(*lp);
			last = 0;
			/*
			 * If we stopped after the truncation point
			 * then we need to truncate from here.
			 */
			if (lp + elems >= &list[tpoint])
				lpgno = lp[elems - 1].pgno;
		}
		/*
		 * If this is not the beginning of the list fetch the end
		 * of the previous segment.  This page becomes the last_free
		 * page and will link to this segment if it is not truncated.
		 */
		if (lp != list) {
			if ((ret = __memp_fget(mpf, &lp[-1].pgno,
			    dbc->thread_info, txn, 0, &h)) != 0)
				goto err;
		}

		slp = &lp[elems];

		ZERO_LSN(null_lsn);
		if ((ret = __db_pg_trunc_log(dbp, dbc->txn,
		     lsnp, last == 1 ? DB_FLUSH : 0, PGNO_BASE_MD,
		     lsnp, h != NULL ? PGNO(h) : PGNO_INVALID,
		     h != NULL ? &LSN(h) : &null_lsn,
		     free_pgno, lpgno, &ddbt)) != 0)
			goto err;
		if (h != NULL) {
			LSN(h) = *lsnp;
			if ((ret = __memp_fput(mpf,
			    dbc->thread_info, h, dbc->priority)) != 0)
				goto err;
		}
		h = NULL;
	} else if (!in_recovery)
		LSN_NOT_LOGGED(*lsnp);

	/* Relink the surviving pages in sorted order. */
	for (; lp < slp && lp < &list[tpoint]; lp++) {
		if ((ret = __memp_fget(mpf, &lp->pgno, dbc->thread_info,
		    txn, !in_recovery ? DB_MPOOL_DIRTY : 0, &h)) != 0) {
			/* Page may have been truncated later. */
			if (in_recovery && ret == DB_PAGE_NOTFOUND) {
				ret = 0;
				continue;
			}
			goto err;
		}
		if (in_recovery) {
			/* Only redo pages whose LSN matches the logged one. */
			if (LOG_COMPARE(&LSN(h), &lp->lsn) == 0) {
				if ((ret = __memp_dirty(mpf, &h,
				    dbc->thread_info,
				    txn, dbp->priority, 0)) != 0) {
					(void)__memp_fput(mpf,
					    dbc->thread_info, h, dbp->priority);
					goto err;
				}
			} else
				goto skip;
		}

		if (lp == &list[tpoint - 1])
			NEXT_PGNO(h) = PGNO_INVALID;
		else
			NEXT_PGNO(h) = lp[1].pgno;
		DB_ASSERT(mpf->env, NEXT_PGNO(h) < *last_pgno);

		LSN(h) = *lsnp;
skip:		if ((ret = __memp_fput(mpf,
		    dbc->thread_info, h, dbp->priority)) != 0)
			goto err;
		h = NULL;
	}

	/*
	 * If we did not log everything try again.  We start from slp and
	 * try to go to the end of the list.
	 */
	if (last == 0) {
		elems = (u_int32_t)(&list[*nelemp] - slp);
		lp = slp;
		goto again;
	}

	/*
	 * Truncate the file.  It's possible that the last page is the
	 * only one that got truncated and that's done in the caller.
	 */
	if (pgno != *last_pgno) {
		if (tpoint != *nelemp &&
		    (ret = __memp_ftruncate(mpf, dbc->txn, dbc->thread_info,
		    pgno + 1, in_recovery ? MP_TRUNC_RECOVER : 0)) != 0)
			goto err;
		if (c_data)
			c_data->compact_pages_truncated += *last_pgno - pgno;
		*last_pgno = pgno;
	}
	*nelemp = tpoint;

	if (0) {
err:		if (h != NULL)
			(void)__memp_fput(mpf,
			    dbc->thread_info, h, dbc->priority);
	}
	return (ret);
}
830 
831 /*
832  * __db_free_truncate --
833  *	  Build a sorted free list and truncate free pages at the end
834  *	  of the file.
835  *
836  * PUBLIC: #ifdef HAVE_FTRUNCATE
837  * PUBLIC: int __db_free_truncate __P((DB *, DB_THREAD_INFO *, DB_TXN *,
838  * PUBLIC:    u_int32_t, DB_COMPACT *, db_pglist_t **, u_int32_t *,
839  * PUBLIC:    db_pgno_t *));
840  * PUBLIC: #endif
841  */
int
__db_free_truncate(dbp, ip, txn, flags, c_data, listp, nelemp, last_pgnop)
	DB *dbp;
	DB_THREAD_INFO *ip;
	DB_TXN *txn;
	u_int32_t flags;	/* Currently unused. */
	DB_COMPACT *c_data;	/* Optional compaction statistics/controls. */
	db_pglist_t **listp;	/* If non-NULL, returns the sorted list (caller frees). */
	u_int32_t *nelemp;	/* Returns the list size; required when listp != NULL. */
	db_pgno_t *last_pgnop;	/* If non-NULL, returns the post-truncate last page. */
{
	DBC *dbc;
	DBMETA *meta;
	DB_LOCK metalock;
	DB_MPOOLFILE *mpf;
	ENV *env;
	PAGE *h;
	db_pglist_t *list, *lp;
	db_pgno_t pgno;
	u_int32_t nelems;
	int ret, t_ret;
	size_t size;

	COMPQUIET(flags, 0);
	list = NULL;
	meta = NULL;
	env = dbp->env;
	mpf = dbp->mpf;
	h = NULL;
	nelems = 0;
	if (listp != NULL) {
		*listp = NULL;
		DB_ASSERT(env, nelemp != NULL);
		*nelemp = 0;
	}

	if ((ret = __db_cursor(dbp, ip, txn, &dbc, DB_WRITELOCK)) != 0)
		return (ret);

	pgno = PGNO_BASE_MD;
	if ((ret = __db_lget(dbc,
	    LCK_ALWAYS, pgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
		goto err;
	if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn, 0,
	    &meta)) != 0)
		goto err;

	if (last_pgnop != NULL)
		*last_pgnop = meta->last_pgno;
	/* Nothing to do with an empty free list. */
	if ((pgno = meta->free) == PGNO_INVALID)
		goto done;

	/* Walk the on-disk free chain, collecting entries; grow as needed. */
	size = 128;
	if ((ret = __os_malloc(env, size * sizeof(*list), &list)) != 0)
		goto err;
	lp = list;

	do {
		if (lp == &list[size]) {
			size *= 2;
			if ((ret = __os_realloc(env,
			    size * sizeof(*list), &list)) != 0)
				goto err;
			lp = &list[size / 2];
		}
		if ((ret = __memp_fget(mpf, &pgno,
		     dbc->thread_info, dbc->txn, 0, &h)) != 0)
			goto err;

		lp->pgno = pgno;
		lp->next_pgno = NEXT_PGNO(h);
		lp->lsn = LSN(h);
		pgno = NEXT_PGNO(h);
		if ((ret = __memp_fput(mpf,
		    dbc->thread_info, h, dbc->priority)) != 0)
			goto err;
		lp++;
	} while (pgno != PGNO_INVALID);
	nelems = (u_int32_t)(lp - list);

	if ((ret = __memp_dirty(mpf,
	    &meta, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
		goto err;

	/* Sort the list */
	__db_freelist_sort(list, nelems);

	if ((ret = __db_pg_truncate(dbc, txn, list, c_data,
	    &nelems, meta->free, &meta->last_pgno, &LSN(meta), 0)) != 0)
		goto err;

	/* __db_pg_truncate re-sorted the chain; re-head the free list. */
	if (nelems == 0)
		meta->free = PGNO_INVALID;
	else
		meta->free = list[0].pgno;

done:	if (last_pgnop != NULL)
		*last_pgnop = meta->last_pgno;

	/*
	 * Set the truncation point which determines which pages may be
	 * relocated. Pages above are candidates to be swapped with a lower one
	 * from the freelist by __db_exchange_page(); pages before the truncate
	 * point are not relocated.
	 * The truncation point starts as N pages less than the last_pgno, where
	 * N is the size of the free list. This is reduced by 1/4 in the hope
	 * that partially full pages will be coalesced together, creating
	 * additional free pages during the compact.
	 */
	if (c_data) {
		c_data->compact_truncate = (u_int32_t)meta->last_pgno - nelems;
		if (c_data->compact_truncate > nelems >> 2)
			c_data->compact_truncate -= nelems >> 2;
	}

	/* Hand the list to the caller, transferring ownership. */
	if (nelems != 0 && listp != NULL) {
		*listp = list;
		*nelemp = nelems;
		list = NULL;
	}

err:	if (list != NULL)
		__os_free(env, list);
	if (meta != NULL && (t_ret = __memp_fput(mpf,
	     dbc->thread_info, (PAGE *)meta, dbc->priority)) != 0 && ret == 0)
		ret = t_ret;
	if ((t_ret = __TLPUT(dbc, metalock)) != 0 && ret == 0)
		ret = t_ret;
	if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
		ret = t_ret;
	return (ret);
}
974 
/*
 * __db_truncate_freelist --
 *	Truncate the file at list[start], logging the freed tail of the
 * sorted free list first, then shrink the in-memory list.  The caller's
 * page 'h' is consumed (put) on both success and failure paths.
 */
static int
__db_truncate_freelist(dbc, meta, h, list, start, nelem)
	DBC *dbc;
	DBMETA *meta;		/* Pinned, dirty meta page; last_pgno/free updated. */
	PAGE *h;		/* Page being freed; reference consumed here. */
	db_pgno_t *list;	/* Sorted free list. */
	u_int32_t start, nelem;	/* Truncate entries [start, nelem). */
{
	DB *dbp;
	DBT ddbt;
	DB_LSN null_lsn;
	DB_MPOOLFILE *mpf;
	PAGE *last_free, *pg;
	db_pgno_t *lp, free_pgno, lpgno;
	db_pglist_t *plist, *pp, *spp;
	u_int32_t elem, log_size;
	int last, ret;

	dbp = dbc->dbp;
	mpf = dbp->mpf;
	plist = NULL;
	last_free = NULL;
	pg = NULL;

	/* The surviving list tail must terminate at the last kept page. */
	if (start != 0 &&
	    (ret = __memp_fget(mpf, &list[start - 1],
	    dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &last_free)) != 0)
		goto err;

	if (DBC_LOGGING(dbc)) {
		/* Snapshot pgno/lsn/next for every page we will truncate. */
		if ((ret = __os_malloc(dbp->env,
		     (nelem - start) * sizeof(*pp), &plist)) != 0)
			goto err;

		pp = plist;
		for (lp = &list[start]; lp < &list[nelem]; lp++) {
			pp->pgno = *lp;
			if ((ret = __memp_fget(mpf, lp,
			     dbc->thread_info, dbc->txn, 0, &pg)) != 0)
				goto err;
			pp->lsn = LSN(pg);
			pp->next_pgno = NEXT_PGNO(pg);
			if ((ret = __memp_fput(mpf,
			    dbc->thread_info, pg, DB_PRIORITY_VERY_LOW)) != 0)
				goto err;
			pg = NULL;
			pp++;
		}
		ZERO_LSN(null_lsn);
		pp = plist;
		elem = nelem - start;
		log_size = ((LOG *)dbc->env->
		    lg_handle->reginfo.primary)->log_size;
		/* Log in segments so no record exceeds half a log file. */
again:		ddbt.data = spp = pp;
		free_pgno = pp->pgno;
		lpgno = meta->last_pgno;
		ddbt.size = elem * sizeof(*pp);
		if (ddbt.size > log_size / 2) {
			elem = (log_size / 2) / (u_int32_t)sizeof(*pp);
			ddbt.size = elem * sizeof(*pp);
			pp += elem;
			elem = (nelem - start) - (u_int32_t)(pp - plist);
			lpgno = pp[-1].pgno;
			last = 0;
		} else
			last = 1;
		/*
		 * Get the page which will link to this section if we abort.
		 * If this is the first segment then its last_free.
		 */
		if (spp == plist)
			pg = last_free;
		else if ((ret = __memp_fget(mpf, &spp[-1].pgno,
		     dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &pg)) != 0)
			goto err;

		/* Flush on the final segment so logging precedes truncation. */
		if ((ret = __db_pg_trunc_log(dbp, dbc->txn,
		     &LSN(meta), last == 1 ? DB_FLUSH : 0,
		     PGNO(meta), &LSN(meta),
		     pg != NULL ? PGNO(pg) : PGNO_INVALID,
		     pg != NULL ? &LSN(pg) : &null_lsn,
		     free_pgno, lpgno, &ddbt)) != 0)
			goto err;
		if (pg != NULL) {
			LSN(pg) = LSN(meta);
			if (pg != last_free && (ret = __memp_fput(mpf,
			    dbc->thread_info, pg, DB_PRIORITY_VERY_LOW)) != 0)
				goto err;
			pg = NULL;
		}
		if (last == 0)
			goto again;
	} else
		LSN_NOT_LOGGED(LSN(meta));

	/* Drop the freed page before shrinking the file beneath it. */
	if ((ret = __memp_fput(mpf,
	    dbc->thread_info, h, DB_PRIORITY_VERY_LOW)) != 0)
		goto err;
	h = NULL;
	if ((ret = __memp_ftruncate(mpf, dbc->txn, dbc->thread_info,
	    list[start], 0)) != 0)
		goto err;
	meta->last_pgno = list[start] - 1;

	if (start == 0)
		meta->free = PGNO_INVALID;
	else {
		/* Terminate the surviving chain at the last kept page. */
		NEXT_PGNO(last_free) = PGNO_INVALID;
		if ((ret = __memp_fput(mpf,
		    dbc->thread_info, last_free, dbc->priority)) != 0)
			goto err;
		last_free = NULL;
	}

	/* Shrink the number of elements in the list. */
	ret = __memp_extend_freelist(mpf, start, &list);

err:	if (plist != NULL)
		__os_free(dbp->env, plist);

	/* We need to put the page on error. */
	if (h != NULL)
		(void)__memp_fput(mpf, dbc->thread_info, h, dbc->priority);
	if (pg != NULL && pg != last_free)
		(void)__memp_fput(mpf, dbc->thread_info, pg, dbc->priority);
	if (last_free != NULL)
		(void)__memp_fput(mpf,
		    dbc->thread_info, last_free, dbc->priority);

	return (ret);
}
1106 #endif
1107 
1108 #ifdef DEBUG
1109 /*
1110  * __db_lprint --
1111  *	Print out the list of locks currently held by a cursor.
1112  *
1113  * PUBLIC: int __db_lprint __P((DBC *));
1114  */
1115 int
__db_lprint(dbc)1116 __db_lprint(dbc)
1117 	DBC *dbc;
1118 {
1119 	DB *dbp;
1120 	DB_LOCKREQ req;
1121 	ENV *env;
1122 
1123 	dbp = dbc->dbp;
1124 	env = dbp->env;
1125 
1126 	if (LOCKING_ON(env)) {
1127 		req.op = DB_LOCK_DUMP;
1128 		(void)__lock_vec(env, dbc->locker, 0, &req, 1, NULL);
1129 	}
1130 	return (0);
1131 }
1132 #endif
1133 
1134 /*
1135  * __db_lget --
1136  *	The standard lock get call.
1137  *
1138  * PUBLIC: int __db_lget __P((DBC *,
1139  * PUBLIC:     int, db_pgno_t, db_lockmode_t, u_int32_t, DB_LOCK *));
1140  */
1141 int
__db_lget(dbc,action,pgno,mode,lkflags,lockp)1142 __db_lget(dbc, action, pgno, mode, lkflags, lockp)
1143 	DBC *dbc;
1144 	int action;
1145 	db_pgno_t pgno;
1146 	db_lockmode_t mode;
1147 	u_int32_t lkflags;
1148 	DB_LOCK *lockp;
1149 {
1150 	DB *dbp;
1151 	DB_LOCKREQ couple[3], *reqp;
1152 	DB_TXN *txn;
1153 	ENV *env;
1154 	int has_timeout, i, ret;
1155 
1156 	dbp = dbc->dbp;
1157 	env = dbp->env;
1158 	txn = dbc->txn;
1159 
1160 	/*
1161 	 * We do not always check if we're configured for locking before
1162 	 * calling __db_lget to acquire the lock.
1163 	 */
1164 	if (CDB_LOCKING(env) || !LOCKING_ON(env) ||
1165 	    (MULTIVERSION(dbp) && mode == DB_LOCK_READ &&
1166 	    dbc->txn != NULL && F_ISSET(dbc->txn, TXN_SNAPSHOT)) ||
1167 	    F_ISSET(dbc, DBC_DONTLOCK) || (F_ISSET(dbc, DBC_RECOVER) &&
1168 	    (action != LCK_ROLLBACK || IS_REP_CLIENT(env))) ||
1169 	    (action != LCK_ALWAYS && F_ISSET(dbc, DBC_OPD))) {
1170 		LOCK_INIT(*lockp);
1171 		return (0);
1172 	}
1173 
1174 	/*
1175 	 * If the transaction enclosing this cursor has DB_LOCK_NOWAIT set,
1176 	 * pass that along to the lock call.
1177 	 */
1178 	if (DB_NONBLOCK(dbc))
1179 		lkflags |= DB_LOCK_NOWAIT;
1180 
1181 	/*
1182 	 * If we're trying to run in exclusive mode, attempt to get an
1183 	 * exclusive database lock.  If it is not available then wait
1184 	 * for the lock on the database and clear the exclusive bit.
1185 	 *
1186 	 * If we get an exclusive lock on the database, mark the cursor
1187 	 * with DBC_DONTLOCK to avoid any further locking.
1188 	 */
1189 	if (F_ISSET(dbp->mpf->mfp, MP_DATABASE_LOCKING)) {
1190 		dbc->lock.type = DB_DATABASE_LOCK;
1191 		dbc->lock.pgno = PGNO_BASE_MD;
1192 		if ((ret = __lock_get(env, dbc->locker, DB_LOCK_NOWAIT,
1193 		    &dbc->lock_dbt, F_ISSET(dbp, DB_AM_RDONLY) ?
1194 		    DB_LOCK_READ : DB_LOCK_WRITE, lockp)) == 0) {
1195 			if (F_ISSET(dbp->mpf->mfp, MP_DATABASE_LOCKING)) {
1196 				F_SET(dbc, DBC_DONTLOCK);
1197 				if (!IS_REAL_TXN(txn))
1198 					dbc->mylock = *lockp;
1199 				LOCK_INIT(*lockp);
1200 				return (0);
1201 			}
1202 		} else if (ret == DB_LOCK_NOTGRANTED &&
1203 		    (lkflags & DB_LOCK_NOWAIT) == 0) {
1204 			if ((ret = __lock_get(env, dbc->locker, 0,
1205 			    &dbc->lock_dbt, DB_LOCK_WRITE, lockp)) != 0)
1206 				return (ret);
1207 			F_CLR(dbp->mpf->mfp, MP_DATABASE_LOCKING);
1208 			if ((ret = __lock_put(env, lockp)) != 0)
1209 				return (ret);
1210 			LOCK_INIT(*lockp);
1211 		} else if (ret != 0)
1212 			return (ret);
1213 	}
1214 
1215 	dbc->lock.pgno = pgno;
1216 	if (lkflags & DB_LOCK_RECORD)
1217 		dbc->lock.type = DB_RECORD_LOCK;
1218 	else
1219 		dbc->lock.type = DB_PAGE_LOCK;
1220 	lkflags &= ~DB_LOCK_RECORD;
1221 
1222 	if (F_ISSET(dbc, DBC_READ_UNCOMMITTED) && mode == DB_LOCK_READ)
1223 		mode = DB_LOCK_READ_UNCOMMITTED;
1224 
1225 	has_timeout = F_ISSET(dbc, DBC_RECOVER) ||
1226 	    (txn != NULL && F_ISSET(txn, TXN_LOCKTIMEOUT));
1227 
1228 	/*
1229 	 * Transactional locking.
1230 	 * Hold on to the previous read lock only if we are in full isolation.
1231 	 * COUPLE_ALWAYS indicates we are holding an interior node which need
1232 	 *	not be isolated.
1233 	 * Downgrade write locks if we are supporting dirty readers and the
1234 	 * update did not have an error.
1235 	 */
1236 	if ((action != LCK_COUPLE && action != LCK_COUPLE_ALWAYS) ||
1237 	    !LOCK_ISSET(*lockp))
1238 		action = 0;
1239 	else if (dbc->txn == NULL || action == LCK_COUPLE_ALWAYS)
1240 		action = LCK_COUPLE;
1241 	else if (F_ISSET(dbc, DBC_READ_COMMITTED | DBC_WAS_READ_COMMITTED) &&
1242 	    lockp->mode == DB_LOCK_READ)
1243 		action = LCK_COUPLE;
1244 	else if (lockp->mode == DB_LOCK_READ_UNCOMMITTED)
1245 		action = LCK_COUPLE;
1246 	else if (F_ISSET(dbc->dbp, DB_AM_READ_UNCOMMITTED) &&
1247 	     !F_ISSET(dbc, DBC_ERROR) && lockp->mode == DB_LOCK_WRITE)
1248 		action = LCK_DOWNGRADE;
1249 	else
1250 		action = 0;
1251 
1252 	i = 0;
1253 	switch (action) {
1254 	default:
1255 		if (has_timeout)
1256 			goto do_couple;
1257 		ret = __lock_get(env,
1258 		    dbc->locker, lkflags, &dbc->lock_dbt, mode, lockp);
1259 		break;
1260 
1261 	case LCK_DOWNGRADE:
1262 		couple[0].op = DB_LOCK_GET;
1263 		couple[0].obj = NULL;
1264 		couple[0].lock = *lockp;
1265 		couple[0].mode = DB_LOCK_WWRITE;
1266 		UMRW_SET(couple[0].timeout);
1267 		i++;
1268 		/* FALLTHROUGH */
1269 	case LCK_COUPLE:
1270 do_couple:	couple[i].op = has_timeout? DB_LOCK_GET_TIMEOUT : DB_LOCK_GET;
1271 		couple[i].obj = &dbc->lock_dbt;
1272 		couple[i].mode = mode;
1273 		UMRW_SET(couple[i].timeout);
1274 		i++;
1275 		if (has_timeout)
1276 			couple[0].timeout =
1277 			     F_ISSET(dbc, DBC_RECOVER) ? 0 : txn->lock_timeout;
1278 		if (action == LCK_COUPLE || action == LCK_DOWNGRADE) {
1279 			couple[i].op = DB_LOCK_PUT;
1280 			couple[i].lock = *lockp;
1281 			i++;
1282 		}
1283 
1284 		ret = __lock_vec(env,
1285 		    dbc->locker, lkflags, couple, i, &reqp);
1286 		if (ret == 0 || reqp == &couple[i - 1])
1287 			*lockp = i == 1 ? couple[0].lock : couple[i - 2].lock;
1288 		break;
1289 	}
1290 
1291 	if (txn != NULL && ret == DB_LOCK_DEADLOCK)
1292 		F_SET(txn, TXN_DEADLOCK);
1293 	return ((ret == DB_LOCK_NOTGRANTED && !F_ISSET(env->dbenv,
1294 		 DB_ENV_TIME_NOTGRANTED)) ? DB_LOCK_DEADLOCK : ret);
1295 }
1296 
1297 #ifdef DIAGNOSTIC
1298 /*
1299  * __db_haslock --
1300  *	Determine if this locker holds a particular lock.
1301  *	Returns 0 if lock is held, non-zero otherwise.
1302  *
1303  * PUBLIC: #ifdef DIAGNOSTIC
1304  * PUBLIC: int __db_haslock __P((ENV *, DB_LOCKER *,
1305  * PUBLIC:     DB_MPOOLFILE *, db_pgno_t, db_lockmode_t, u_int32_t));
1306  * PUBLIC: #endif
1307  */
1308 int
__db_haslock(env,locker,dbmfp,pgno,mode,type)1309 __db_haslock(env, locker, dbmfp, pgno, mode, type)
1310 	ENV *env;
1311 	DB_LOCKER *locker;
1312 	DB_MPOOLFILE *dbmfp;
1313 	db_pgno_t pgno;
1314 	db_lockmode_t mode;
1315 	u_int32_t type;
1316 {
1317 	DBT lkdata;
1318 	DB_LOCK lock;
1319 	DB_LOCK_ILOCK ilock;
1320 
1321 	memset(&lkdata, 0, sizeof(lkdata));
1322 	lkdata.data = &ilock;
1323 	lkdata.size = sizeof(ilock);
1324 
1325 	memcpy(ilock.fileid, dbmfp->fileid, DB_FILE_ID_LEN);
1326 	ilock.pgno = pgno;
1327 	ilock.type = type;
1328 
1329 	return (__lock_get(env, locker, DB_LOCK_CHECK, &lkdata, mode, &lock));
1330 }
1331 /*
1332  * __db_has_pagelock --
1333  *	Determine if this locker holds a particular page lock.
1334  *	Returns 0 if lock is held, non-zero otherwise.
1335  *
1336  * PUBLIC: #ifdef DIAGNOSTIC
1337  * PUBLIC: int __db_has_pagelock __P((ENV *, DB_LOCKER *,
1338  * PUBLIC:     DB_MPOOLFILE *, PAGE *, db_lockmode_t));
1339  * PUBLIC: #endif
1340  */
1341 int
__db_has_pagelock(env,locker,dbmfp,pagep,mode)1342 __db_has_pagelock(env, locker, dbmfp, pagep, mode)
1343 	ENV *env;
1344 	DB_LOCKER *locker;
1345 	DB_MPOOLFILE *dbmfp;
1346 	PAGE *pagep;
1347 	db_lockmode_t mode;
1348 {
1349 	int ret;
1350 
1351 	switch (pagep->type) {
1352 	case P_OVERFLOW:
1353 	case P_INVALID:
1354 	case P_QAMDATA:
1355 	case P_QAMMETA:
1356 	case P_IHEAP:
1357 		return (0);
1358 	case P_HASH:
1359 		if (PREV_PGNO(pagep) != PGNO_INVALID)
1360 			return (0);
1361 		break;
1362 	default:
1363 		break;
1364 	}
1365 	if ((ret = __db_haslock(env,
1366 	    locker, dbmfp, pagep->pgno, mode, DB_PAGE_LOCK)) != 0)
1367 		ret = __db_haslock(env,
1368 		    locker, dbmfp, PGNO_BASE_MD, mode, DB_DATABASE_LOCK);
1369 	return (ret);
1370 }
1371 #endif
1372 
1373 /*
1374  * __db_lput --
1375  *	The standard lock put call.
1376  *
1377  * PUBLIC: int __db_lput __P((DBC *, DB_LOCK *));
1378  */
1379 int
__db_lput(dbc,lockp)1380 __db_lput(dbc, lockp)
1381 	DBC *dbc;
1382 	DB_LOCK *lockp;
1383 {
1384 	DB_LOCKREQ couple[2], *reqp;
1385 	ENV *env;
1386 	int action, ret;
1387 
1388 	/*
1389 	 * Transactional locking.
1390 	 * Hold on to the read locks only if we are in full isolation.
1391 	 * Downgrade write locks if we are supporting dirty readers unless
1392 	 * there was an error.
1393 	 */
1394 	if (F_ISSET(dbc->dbp, DB_AM_READ_UNCOMMITTED) &&
1395 	    !F_ISSET(dbc, DBC_ERROR) && lockp->mode == DB_LOCK_WRITE)
1396 		action = LCK_DOWNGRADE;
1397 	else if (dbc->txn == NULL)
1398 		action = LCK_COUPLE;
1399 	else if (F_ISSET(dbc, DBC_READ_COMMITTED | DBC_WAS_READ_COMMITTED) &&
1400 	    lockp->mode == DB_LOCK_READ)
1401 		action = LCK_COUPLE;
1402 	else if (lockp->mode == DB_LOCK_READ_UNCOMMITTED)
1403 		action = LCK_COUPLE;
1404 	else
1405 		action = 0;
1406 
1407 	env = dbc->env;
1408 	switch (action) {
1409 	case LCK_COUPLE:
1410 		ret = __lock_put(env, lockp);
1411 		break;
1412 	case LCK_DOWNGRADE:
1413 		couple[0].op = DB_LOCK_GET;
1414 		couple[0].obj = NULL;
1415 		couple[0].mode = DB_LOCK_WWRITE;
1416 		couple[0].lock = *lockp;
1417 		UMRW_SET(couple[0].timeout);
1418 		couple[1].op = DB_LOCK_PUT;
1419 		couple[1].lock = *lockp;
1420 		ret = __lock_vec(env, dbc->locker, 0, couple, 2, &reqp);
1421 		if (ret == 0 || reqp == &couple[1])
1422 			*lockp = couple[0].lock;
1423 		break;
1424 	default:
1425 		ret = 0;
1426 		break;
1427 	}
1428 
1429 	return (ret);
1430 }
1431