1 /*-
2  * Copyright (c) 1996, 2020 Oracle and/or its affiliates.  All rights reserved.
3  *
4  * See the file LICENSE for license information.
5  */
6 /*
7  * Copyright (c) 1990, 1993, 1994, 1995, 1996
8  *	Keith Bostic.  All rights reserved.
9  */
10 /*
11  * Copyright (c) 1990, 1993, 1994, 1995
12  *	The Regents of the University of California.  All rights reserved.
13  *
14  * This code is derived from software contributed to Berkeley by
15  * Mike Olson.
16  *
17  * Redistribution and use in source and binary forms, with or without
18  * modification, are permitted provided that the following conditions
19  * are met:
20  * 1. Redistributions of source code must retain the above copyright
21  *    notice, this list of conditions and the following disclaimer.
22  * 2. Redistributions in binary form must reproduce the above copyright
23  *    notice, this list of conditions and the following disclaimer in the
24  *    documentation and/or other materials provided with the distribution.
25  * 3. Neither the name of the University nor the names of its contributors
26  *    may be used to endorse or promote products derived from this software
27  *    without specific prior written permission.
28  *
29  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
30  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
31  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
32  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
33  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
34  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
35  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
36  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
37  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
38  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
39  * SUCH DAMAGE.
40  *
41  * $Id$
42  */
43 
44 #include "db_config.h"
45 
46 #include "db_int.h"
47 #include "dbinc/db_page.h"
48 #include "dbinc/lock.h"
49 #include "dbinc/log.h"
50 #include "dbinc/mp.h"
51 #include "dbinc/txn.h"
52 #include "dbinc/db_am.h"
53 #include "dbinc/hash.h"
54 
55 static void __db_init_meta __P((DB *, void *, db_pgno_t, u_int32_t));
56 #ifdef HAVE_FTRUNCATE
57 static int  __db_pglistcmp __P((const void *, const void *));
58 static int  __db_truncate_freelist __P((DBC *, DBMETA *,
59       PAGE *, db_pgno_t *, u_int32_t, u_int32_t));
60 #endif
61 
62 /*
63  * __db_init_meta --
64  *	Helper function for __db_new that initializes the important fields in
65  * a meta-data page (used instead of P_INIT).  We need to make sure that we
66  * retain the page number and LSN of the existing page.
67  */
68 static void
__db_init_meta(dbp,p,pgno,pgtype)69 __db_init_meta(dbp, p, pgno, pgtype)
70 	DB *dbp;
71 	void *p;
72 	db_pgno_t pgno;
73 	u_int32_t pgtype;
74 {
75 	DBMETA *meta;
76 	DB_LSN save_lsn;
77 
78 	meta = (DBMETA *)p;
79 	save_lsn = meta->lsn;
80 	memset(meta, 0, sizeof(DBMETA));
81 	meta->lsn = save_lsn;
82 	meta->pagesize = dbp->pgsize;
83 	if (F_ISSET(dbp, DB_AM_CHKSUM))
84 		FLD_SET(meta->metaflags, DBMETA_CHKSUM);
85 	meta->pgno = pgno;
86 	meta->type = (u_int8_t)pgtype;
87 }
88 
/*
 * __db_new --
 *	Get a new page, preferably from the freelist.
 *
 *	If the metadata free list is empty the file is extended by one page;
 * otherwise the page at the head of the free list is reused.  The
 * allocation is logged before the page is fetched so a full log cannot
 * leave the file extended without a matching log record.
 *
 * dbc:    cursor whose transaction/locker performs the allocation.
 * type:   page type for the new page; may carry P_DONTEXTEND, in which
 *         case an empty free list yields *pagepp == NULL and a 0 return.
 * lockp:  if non-NULL, receives a write lock on the allocated page.
 * pagepp: receives the initialized, pinned page.
 *
 * PUBLIC: int __db_new __P((DBC *, u_int32_t, DB_LOCK *, PAGE **));
 */
int
__db_new(dbc, type, lockp, pagepp)
	DBC *dbc;
	u_int32_t type;
	DB_LOCK *lockp;
	PAGE **pagepp;
{
	DB *dbp;
	DBMETA *meta;
	DB_LOCK metalock;
	DB_LSN lsn;
	DB_MPOOLFILE *mpf;
	ENV *env;
	PAGE *h;
	db_pgno_t last, *list, pgno, newnext;
	int extend, hash, ret;

	COMPQUIET(last, 0);

	meta = NULL;
	dbp = dbc->dbp;
	env = dbp->env;
	mpf = dbp->mpf;
	h = NULL;
	newnext = PGNO_INVALID;
	if (lockp != NULL)
		LOCK_INIT(*lockp);

	hash = 0;
	ret = 0;
	LOCK_INIT(metalock);

#ifdef HAVE_HASH
	/*
	 * A hash cursor may already have the metadata page pinned; reuse
	 * that reference instead of locking and fetching it again.
	 */
	if (dbp->type == DB_HASH) {
		if ((ret = __ham_return_meta(dbc, DB_MPOOL_DIRTY, &meta)) != 0)
			goto err;
		if (meta != NULL)
			hash = 1;
	}
#endif
	if (meta == NULL) {
		pgno = PGNO_BASE_MD;
		if ((ret = __db_lget(dbc,
		    LCK_ALWAYS, pgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
			goto err;
		if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn,
		    DB_MPOOL_DIRTY, &meta)) != 0)
			goto err;
	}

	last = meta->last_pgno;
	if (meta->free == PGNO_INVALID) {
		/* Free list is empty: extend the file unless told not to. */
		if (FLD_ISSET(type, P_DONTEXTEND)) {
			*pagepp = NULL;
			goto err;	/* ret is 0 here: not an error. */
		}
		last = pgno = meta->last_pgno + 1;
		ZERO_LSN(lsn);
		extend = 1;
	} else {
		pgno = meta->free;
		/*
		 * Lock the new page.  Do this here because we must do it
		 * before getting the page and the caller may need the lock
		 * to keep readers from seeing the page before the transaction
		 * commits.  We can do this because no one will hold a free
		 * page locked.
		 */
		if (lockp != NULL && (ret =
		     __db_lget(dbc, 0, pgno, DB_LOCK_WRITE, 0, lockp)) != 0)
			goto err;
		if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn,
		    DB_MPOOL_DIRTY, &h)) != 0)
			goto err;

		/*
		 * We want to take the first page off the free list and
		 * then set meta->free to the that page's next_pgno, but
		 * we need to log the change first.
		 */
		newnext = h->next_pgno;
		lsn = h->lsn;
		extend = 0;
		DB_ASSERT(env, TYPE(h) == P_INVALID);

		/*
		 * A page on the free list with a live page type indicates
		 * corruption: refuse to hand it out and panic.
		 */
		if (TYPE(h) != P_INVALID) {
			__db_errx(env, DB_STR_A("0689",
			    "%s page %lu is on free list with type %lu",
			    "%s %lu %lu"), dbp->fname, (u_long)PGNO(h),
			    (u_long)TYPE(h));
			return (__env_panic(env, EINVAL));
		}

	}

	FLD_CLR(type, P_DONTEXTEND);

	/*
	 * Log the allocation before fetching the new page.  If we
	 * don't have room in the log then we don't want to tell
	 * mpool to extend the file.
	 */
	if (DBC_LOGGING(dbc)) {
		if ((ret = __db_pg_alloc_log(dbp, dbc->txn, &LSN(meta), 0,
		    &LSN(meta), PGNO_BASE_MD, &lsn,
		    pgno, (u_int32_t)type, newnext, meta->last_pgno)) != 0)
			goto err;
	} else
		LSN_NOT_LOGGED(LSN(meta));

	meta->free = newnext;

	if (extend == 1) {
		/* Lock, then create, the page past the current end of file. */
		if (lockp != NULL && (ret =
		     __db_lget(dbc, 0, pgno, DB_LOCK_WRITE, 0, lockp)) != 0)
			goto err;
		if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn,
		    DB_MPOOL_NEW, &h)) != 0)
			goto err;
		DB_ASSERT(env, last == pgno);
		meta->last_pgno = pgno;
		ZERO_LSN(h->lsn);
		h->pgno = pgno;

		/*
		 * If the file was extended for the first time in this
		 * transaction, set the MPOOLFILE's file extension
		 * watermark.
		 */
		__txn_add_fe_watermark(dbc->txn, dbp, h->pgno);

	}
	LSN(h) = LSN(meta);

	/* Release the metadata page before initializing the new page. */
	if (hash == 0 && (ret = __memp_fput(mpf,
	    dbc->thread_info, meta, dbc->priority)) != 0)
		goto err;
	meta = NULL;

	switch (type) {
		case P_BTREEMETA:
		case P_HASHMETA:
		case P_QAMMETA:
			__db_init_meta(dbp, h, h->pgno, type);
			break;
		default:
			P_INIT(h, dbp->pgsize,
			    h->pgno, PGNO_INVALID, PGNO_INVALID, 0, type);
			break;
	}

	/* Fix up the sorted free list if necessary. */
#ifdef HAVE_FTRUNCATE
	if (extend == 0) {
		u_int32_t nelems = 0;

		if ((ret = __memp_get_freelist(dbp->mpf, &nelems, &list)) != 0)
			goto err;
		if (nelems != 0) {
			/* The page we allocated must be the list head. */
			DB_ASSERT(env, h->pgno == list[0]);
			memmove(list, &list[1], (nelems - 1) * sizeof(*list));
			if ((ret = __memp_extend_freelist(
			    dbp->mpf, nelems - 1, &list)) != 0)
				goto err;
		}
	}
#else
	COMPQUIET(list, NULL);
#endif

	if ((ret = __TLPUT(dbc, metalock)) != 0)
		return (ret);
	*pagepp = h;
	PERFMON6(env, alloc, new, dbp->fname, dbp->dname, pgno, type, h, 0);
	return (0);

err:	if (h != NULL)
		(void)__memp_fput(mpf, dbc->thread_info, h, dbc->priority);
	if (meta != NULL && hash == 0)
		(void)__memp_fput(mpf, dbc->thread_info, meta, dbc->priority);
	(void)__TLPUT(dbc, metalock);
	if (lockp != NULL)
		(void)__LPUT(dbc, *lockp);
	/* Failure return - report 0 pgno, null page address. */
	PERFMON6(env, alloc, new, dbp->fname, dbp->dname, 0, type, NULL, ret);
	return (ret);
}
282 
/*
 * __db_free --
 *	Add a page to the head of the freelist.
 *
 *	The caller's reference to 'h' is always consumed: the page is put
 * back to the cache here on both success and failure paths.  The caller
 * remains responsible for releasing any lock it holds on the page (see
 * the note at the bottom of this function).
 *
 * PUBLIC: int __db_free __P((DBC *, PAGE *, u_int32_t));
 */
int
__db_free(dbc, h, flags)
	DBC *dbc;
	PAGE *h;
	u_int32_t flags;
{
	DB *dbp;
	DBMETA *meta;
	DBT ddbt, ldbt;
	DB_LOCK metalock;
	DB_LSN *lsnp;
	DB_MPOOLFILE *mpf;
	PAGE *prev;
	db_pgno_t last_pgno, next_pgno, pgno, prev_pgno;
	u_int32_t lflag;
	int hash, ret, t_ret;
#ifdef HAVE_FTRUNCATE
	db_pgno_t *list, *lp;
	u_int32_t nelem, position, start;
	int do_truncate;
#endif

	dbp = dbc->dbp;
	mpf = dbp->mpf;
	prev_pgno = PGNO_INVALID;
	meta = NULL;
	prev = NULL;
	LOCK_INIT(metalock);
#ifdef HAVE_FTRUNCATE
	lp = NULL;
	nelem = 0;
	do_truncate = 0;
#endif

	/*
	 * Retrieve the metadata page.  If we are not keeping a sorted
	 * free list put the page at the head of the free list.
	 * If we are keeping a sorted free list, for truncation,
	 * then figure out where this page belongs and either
	 * link it in or truncate the file as much as possible.
	 * If either the lock get or page get routines
	 * fail, then we need to put the page with which we were called
	 * back because our caller assumes we take care of it.
	 */
	hash = 0;

	pgno = PGNO_BASE_MD;
	if ((ret = __db_lget(dbc,
	    LCK_ALWAYS, pgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
		goto err;

#ifdef HAVE_HASH
	/* A hash cursor may already hold the metadata page; reuse it. */
	if (dbp->type == DB_HASH) {
		if ((ret = __ham_return_meta(dbc,
#ifdef HAVE_FTRUNCATE
		    0,
#else
		    DB_MPOOL_DIRTY,
#endif
		&meta)) != 0)
			goto err;
		if (meta != NULL)
			hash = 1;
	}
#endif
	if (meta == NULL) {
		/* If we support truncate, we might not dirty the meta page. */
		if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn,
#ifdef HAVE_FTRUNCATE
		    0,
#else
		    DB_MPOOL_DIRTY,
#endif
		    &meta)) != 0)
			goto err1;
	}

	last_pgno = meta->last_pgno;
	next_pgno = meta->free;
	/*
	 * Assign lsnp here so it always initialized when
	 * HAVE_FTRUNCATE is not defined.
	 */
	lsnp = &LSN(meta);

	/* The page being freed must not already head the free list. */
	DB_ASSERT(dbp->env, h->pgno != next_pgno);

#ifdef HAVE_FTRUNCATE
	/*
	 * If we are maintaining a sorted free list see if we either have a
	 * new truncation point or the page goes somewhere in the middle of
	 * the list.  If it goes in the middle of the list, we will drop the
	 * meta page and get the previous page.
	 */
	COMPQUIET(position, 0);
	if ((ret = __memp_get_freelist(mpf, &nelem, &list)) != 0)
		goto err1;
	if (list == NULL)
		goto no_sort;

	if (h->pgno != last_pgno) {
		/*
		 * Put the page number in the sorted list.  Find its
		 * position and the previous page.  After logging we
		 * will extend the list, make room and insert the page in
		 * the list.
		 */
		position = 0;
		if (nelem != 0) {
			__db_freelist_pos(h->pgno, list, nelem, &position);

			DB_ASSERT(dbp->env, h->pgno != list[position]);

			/* Get the previous page if this is not the smallest. */
			if (position != 0 || h->pgno > list[0])
				prev_pgno = list[position];
		}

	} else if (nelem != 0) {
		/*
		 * Freeing the last page: walk back over the run of free
		 * pages at the end of the file to find the truncation point.
		 */
		for (lp = &list[nelem - 1]; lp >= list; lp--)
			if (--last_pgno != *lp)
				break;
		if (lp < list || last_pgno < h->pgno - 1)
			do_truncate = 1;
		last_pgno = meta->last_pgno;
	}

no_sort:
	if (prev_pgno == PGNO_INVALID) {
		/* Page goes at the head: the metadata page gets updated. */
#ifdef HAVE_HASH
		if (hash) {
			if ((ret =
			    __ham_return_meta(dbc, DB_MPOOL_DIRTY, &meta)) != 0)
				goto err1;
		} else
#endif
		if ((ret = __memp_dirty(mpf,
		    &meta, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
			goto err1;
		lsnp = &LSN(meta);
	} else {
		/* Page links in after prev_pgno: that page gets updated. */
		pgno = prev_pgno;
		if ((ret = __memp_fget(mpf, &pgno,
		    dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &prev)) != 0)
			goto err1;
		next_pgno = NEXT_PGNO(prev);
		lsnp = &LSN(prev);
	}
#endif

	/*
	 * Log the change.
	 *	We are either logging an update to the metapage or to the
	 * previous page in the sorted list.
	 */
	if (DBC_LOGGING(dbc)) {
		memset(&ldbt, 0, sizeof(ldbt));
		ldbt.data = h;
		ldbt.size = P_OVERHEAD(dbp);
		/*
		 * If we are removing pages from the file, we need to make
		 * sure the logging happens before the truncation.  If we
		 * are truncating multiple pages we don't need to flush the
		 * log here as it will be flushed by __db_truncate_freelist.
		 */
		lflag = 0;

#ifdef HAVE_FTRUNCATE
		if (h->pgno == last_pgno && do_truncate == 0)
			lflag = DB_FLUSH;
#endif
		switch (h->type) {
		case P_HASH:
		case P_IBTREE:
		case P_IRECNO:
		case P_LBTREE:
		case P_LRECNO:
		case P_LDUP:
			/*
			 * Log the page's data as well, unless the caller
			 * passed DB_LOG_NO_DATA and the page is not about
			 * to be truncated from the end of the file.
			 */
			if (h->entries > 0 && (h->pgno == last_pgno ||
			    !LF_ISSET(DB_LOG_NO_DATA))) {
				ldbt.size += h->entries * sizeof(db_indx_t);
				ddbt.data = (u_int8_t *)h + HOFFSET(h);
				ddbt.size = dbp->pgsize - HOFFSET(h);
				if ((ret = __db_pg_freedata_log(dbp, dbc->txn,
				     lsnp, lflag,
				     h->pgno, lsnp, pgno,
				     &ldbt, next_pgno, last_pgno, &ddbt)) != 0)
					goto err1;
				goto logged;
			}
			break;
		case P_HASHMETA:
			ldbt.size = sizeof(HMETA);
			break;
		case P_BTREEMETA:
			ldbt.size = sizeof(BTMETA);
			break;
		case P_OVERFLOW:
			ldbt.size += OV_LEN(h);
			break;
		default:
			DB_ASSERT(dbp->env, h->type != P_QAMDATA);
		}

		if ((ret = __db_pg_free_log(dbp,
		      dbc->txn, lsnp, lflag, h->pgno,
		      lsnp, pgno, &ldbt, next_pgno, last_pgno)) != 0)
			goto err1;
	} else
		LSN_NOT_LOGGED(*lsnp);

logged:
#ifdef HAVE_FTRUNCATE
	if (do_truncate) {
		/* Several trailing pages are free: shrink the file. */
		start = (u_int32_t) (lp - list) + 1;
		meta->last_pgno--;
		ret = __db_truncate_freelist(
		      dbc, meta, h, list, start, nelem);
		h = NULL;
	} else if (h->pgno == last_pgno) {
		/*
		 * We are going to throw this page away, but if we are
		 * using MVCC then this version may stick around and we
		 * might have to make a copy.
		 */
		if (atomic_read(&mpf->mfp->multiversion) &&
		    (ret = __memp_dirty(mpf,
		    &h, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
			goto err1;
		LSN(h) = *lsnp;
		P_INIT(h, dbp->pgsize,
		    h->pgno, PGNO_INVALID, next_pgno, 0, P_INVALID);
		if ((ret = __memp_fput(mpf,
		    dbc->thread_info, h, DB_PRIORITY_VERY_LOW)) != 0)
			goto err1;
		h = NULL;
		/* Give the page back to the OS. */
		if ((ret = __memp_ftruncate(mpf, dbc->txn, dbc->thread_info,
		    last_pgno, 0)) != 0)
			goto err1;
		DB_ASSERT(dbp->env, meta->pgno == PGNO_BASE_MD);
		meta->last_pgno--;
	} else {
		if (list != NULL) {
			/* Put the page number into the list. */
			if ((ret =
			    __memp_extend_freelist(mpf, nelem + 1, &list)) != 0)
				goto err1;
			if (prev_pgno != PGNO_INVALID)
				lp = &list[position + 1];
			else
				lp = list;
			if (nelem != 0 && position != nelem)
				memmove(lp + 1, lp, (size_t)
				    ((u_int8_t*)&list[nelem] - (u_int8_t*)lp));
			*lp = h->pgno;
		}
#else
	{
#endif
		/*
		 * If we are not truncating the page then we
		 * reinitialize it and put it at the head of
		 * the free list.
		 */
		if ((ret = __memp_dirty(mpf,
		    &h, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
			goto err1;
		LSN(h) = *lsnp;
		P_INIT(h, dbp->pgsize,
		    h->pgno, PGNO_INVALID, next_pgno, 0, P_INVALID);
#ifdef DIAGNOSTIC
		memset((u_int8_t *) h + P_OVERHEAD(dbp),
		    CLEAR_BYTE, dbp->pgsize - P_OVERHEAD(dbp));
#endif
		if (prev_pgno == PGNO_INVALID)
			meta->free = h->pgno;
		else
			NEXT_PGNO(prev) = h->pgno;
	}

	/* Discard the metadata or previous page. */
err1:	if (hash == 0 && meta != NULL && (t_ret = __memp_fput(mpf,
	    dbc->thread_info, (PAGE *)meta, dbc->priority)) != 0 && ret == 0)
		ret = t_ret;
	if ((t_ret = __TLPUT(dbc, metalock)) != 0 && ret == 0)
		ret = t_ret;
	if (prev != (PAGE*) meta && prev != NULL && (t_ret = __memp_fput(mpf,
	    dbc->thread_info, prev, dbc->priority)) != 0 && ret == 0)
		ret = t_ret;

	/* Discard the caller's page reference. */
err:	if (h != NULL && (t_ret = __memp_fput(mpf,
	    dbc->thread_info, h, dbc->priority)) != 0 && ret == 0)
		ret = t_ret;

	PERFMON4(dbp->env, alloc, free, dbp->fname, dbp->dname, pgno, ret);
	/*
	 * !!!
	 * We have to unlock the caller's page in the caller!
	 */
	return (ret);
}
593 
594 #ifdef HAVE_FTRUNCATE
595 /*
596  * __db_freelist_pos -- find the position of a page in the freelist.
597  *	The list is sorted, we do a binary search.
598  *
599  * PUBLIC: #ifdef HAVE_FTRUNCATE
600  * PUBLIC: void __db_freelist_pos __P((db_pgno_t,
601  * PUBLIC:       db_pgno_t *, u_int32_t, u_int32_t *));
602  * PUBLIC: #endif
603  */
604 void
__db_freelist_pos(pgno,list,nelem,posp)605 __db_freelist_pos(pgno, list, nelem, posp)
606 	db_pgno_t pgno;
607 	db_pgno_t *list;
608 	u_int32_t nelem;
609 	u_int32_t *posp;
610 {
611 	u_int32_t base, indx, lim;
612 
613 	indx = 0;
614 	for (base = 0, lim = nelem; lim != 0; lim >>= 1) {
615 		indx = base + (lim >> 1);
616 		if (pgno == list[indx]) {
617 			*posp = indx;
618 			return;
619 		}
620 		if (pgno > list[indx]) {
621 			base = indx + 1;
622 			--lim;
623 		}
624 	}
625 	if (base != 0)
626 		base--;
627 	*posp = base;
628 	return;
629 }
630 
631 static int
__db_pglistcmp(a,b)632 __db_pglistcmp(a, b)
633 	const void *a, *b;
634 {
635 	db_pglist_t *ap, *bp;
636 
637 	ap = (db_pglist_t *)a;
638 	bp = (db_pglist_t *)b;
639 
640 	return ((ap->pgno > bp->pgno) ? 1 : (ap->pgno < bp->pgno) ? -1: 0);
641 }
642 
643 /*
644  * __db_freelist_sort -- sort a list of free pages.
645  * PUBLIC: void __db_freelist_sort __P((db_pglist_t *, u_int32_t));
646  */
647 void
__db_freelist_sort(list,nelems)648 __db_freelist_sort(list, nelems)
649 	db_pglist_t *list;
650 	u_int32_t nelems;
651 {
652 	qsort(list, (size_t)nelems, sizeof(db_pglist_t), __db_pglistcmp);
653 }
654 
/*
 * __db_pg_truncate -- find the truncation point in a sorted freelist.
 *
 *	Identifies the contiguous run of free pages ending at *last_pgno,
 * logs the (possibly segmented) free list so it can be rebuilt, relinks
 * the surviving entries, and truncates the file.  On return, *nelemp is
 * the number of entries remaining on the free list and *last_pgno is the
 * new last page number.
 *
 * PUBLIC: #ifdef HAVE_FTRUNCATE
 * PUBLIC: int __db_pg_truncate __P((DBC *, DB_TXN *,
 * PUBLIC:    db_pglist_t *, DB_COMPACT *, u_int32_t *,
 * PUBLIC:    db_pgno_t , db_pgno_t *, DB_LSN *, int));
 * PUBLIC: #endif
 */
int
__db_pg_truncate(dbc, txn,
    list, c_data, nelemp, free_pgno, last_pgno, lsnp, in_recovery)
	DBC *dbc;
	DB_TXN *txn;
	db_pglist_t *list;
	DB_COMPACT *c_data;
	u_int32_t *nelemp;
	db_pgno_t free_pgno, *last_pgno;
	DB_LSN *lsnp;
	int in_recovery;
{
	DB *dbp;
	DBT ddbt;
	DB_LSN null_lsn;
	DB_MPOOLFILE *mpf;
	PAGE *h;
	db_pglist_t *lp, *slp;
	db_pgno_t lpgno, pgno;
	u_int32_t elems, log_size, tpoint;
	int last, ret;

	ret = 0;
	h = NULL;

	dbp = dbc->dbp;
	mpf = dbp->mpf;
	elems = tpoint = *nelemp;

	/*
	 * Figure out what (if any) pages can be truncated immediately and
	 * record the place from which we can truncate, so we can do the
	 * memp_ftruncate below.  We also use this to avoid ever putting
	 * these pages on the freelist, which we are about to relink.
	 */
	pgno = *last_pgno;
	lp = &list[elems - 1];
	last = 1;
	while (tpoint != 0) {
		if (lp->pgno != pgno)
			break;
		pgno--;
		tpoint--;
		lp--;
	}
	/* Entries [tpoint, elems) are now the trailing run to truncate. */

	lp = list;
	slp = &list[elems];
	/*
	 * Log the sorted list. We log the whole list so it can be rebuilt.
	 * Don't overflow the log file.
	 */
again:	if (DBC_LOGGING(dbc)) {
		last = 1;
		lpgno = *last_pgno;
		ddbt.size = elems * sizeof(*lp);
		ddbt.data = lp;
		log_size = ((LOG *)dbc->env->
		    lg_handle->reginfo.primary)->log_size;
		/* Cap each record at half a log file; segment if needed. */
		if (ddbt.size > log_size / 2) {
			elems = (log_size / 2) / sizeof(*lp);
			ddbt.size = elems * sizeof(*lp);
			last = 0;
			/*
			 * If we stopped after the truncation point
			 * then we need to truncate from here.
			 */
			if (lp + elems >= &list[tpoint])
				lpgno = lp[elems - 1].pgno;
		}
		/*
		 * If this is not the beginning of the list fetch the end
		 * of the previous segment.  This page becomes the last_free
		 * page and will link to this segment if it is not truncated.
		 */
		if (lp != list) {
			if ((ret = __memp_fget(mpf, &lp[-1].pgno,
			    dbc->thread_info, txn, 0, &h)) != 0)
				goto err;
		}

		slp = &lp[elems];

		ZERO_LSN(null_lsn);
		if ((ret = __db_pg_trunc_log(dbp, dbc->txn,
		     lsnp, last == 1 ? DB_FLUSH : 0, PGNO_BASE_MD,
		     lsnp, h != NULL ? PGNO(h) : PGNO_INVALID,
		     h != NULL ? &LSN(h) : &null_lsn,
		     free_pgno, lpgno, &ddbt)) != 0)
			goto err;
		if (h != NULL) {
			LSN(h) = *lsnp;
			if ((ret = __memp_fput(mpf,
			    dbc->thread_info, h, dbc->priority)) != 0)
				goto err;
		}
		h = NULL;
	} else if (!in_recovery)
		LSN_NOT_LOGGED(*lsnp);

	/* Relink the surviving entries of this segment of the list. */
	for (; lp < slp && lp < &list[tpoint]; lp++) {
		if ((ret = __memp_fget(mpf, &lp->pgno, dbc->thread_info,
		    txn, !in_recovery ? DB_MPOOL_DIRTY : 0, &h)) != 0) {
			/* Page may have been truncated later. */
			if (in_recovery && ret == DB_PAGE_NOTFOUND) {
				ret = 0;
				continue;
			}
			goto err;
		}
		if (in_recovery) {
			/* Only redo the relink if the page's LSN matches. */
			if (LOG_COMPARE(&LSN(h), &lp->lsn) == 0) {
				if ((ret = __memp_dirty(mpf, &h,
				    dbc->thread_info,
				    txn, dbp->priority, 0)) != 0) {
					(void)__memp_fput(mpf,
					    dbc->thread_info, h, dbp->priority);
					goto err;
				}
			} else
				goto skip;
		}

		if (lp == &list[tpoint - 1])
			NEXT_PGNO(h) = PGNO_INVALID;
		else
			NEXT_PGNO(h) = lp[1].pgno;
		DB_ASSERT(mpf->env, NEXT_PGNO(h) < *last_pgno);

		LSN(h) = *lsnp;
skip:		if ((ret = __memp_fput(mpf,
		    dbc->thread_info, h, dbp->priority)) != 0)
			goto err;
		h = NULL;
	}

	/*
	 * If we did not log everything try again.  We start from slp and
	 * try to go to the end of the list.
	 */
	if (last == 0) {
		elems = (u_int32_t)(&list[*nelemp] - slp);
		lp = slp;
		goto again;
	}

	/*
	 * Truncate the file.  Its possible that the last page is the
	 * only one that got truncated and that's done in the caller.
	 */
	if (pgno != *last_pgno) {
		if (tpoint != *nelemp &&
		    (ret = __memp_ftruncate(mpf, dbc->txn, dbc->thread_info,
		    pgno + 1, in_recovery ? MP_TRUNC_RECOVER : 0)) != 0)
			goto err;
		if (c_data)
			c_data->compact_pages_truncated += *last_pgno - pgno;
		*last_pgno = pgno;
	}
	*nelemp = tpoint;

	if (0) {
err:		if (h != NULL)
			(void)__memp_fput(mpf,
			    dbc->thread_info, h, dbc->priority);
	}
	return (ret);
}
832 
/*
 * __db_free_truncate --
 *	  Build a sorted free list and truncate free pages at the end
 *	  of the file.
 *
 *	  Walks the free-page chain from the metadata page, sorts it, and
 * calls __db_pg_truncate to shrink the file.  When listp is non-NULL the
 * allocated, sorted list and its length are handed back to the caller,
 * which then owns (and must free) the memory.
 *
 * PUBLIC: #ifdef HAVE_FTRUNCATE
 * PUBLIC: int __db_free_truncate __P((DB *, DB_THREAD_INFO *, DB_TXN *,
 * PUBLIC:    u_int32_t, DB_COMPACT *, db_pglist_t **, u_int32_t *,
 * PUBLIC:    db_pgno_t *));
 * PUBLIC: #endif
 */
int
__db_free_truncate(dbp, ip, txn, flags, c_data, listp, nelemp, last_pgnop)
	DB *dbp;
	DB_THREAD_INFO *ip;
	DB_TXN *txn;
	u_int32_t flags;
	DB_COMPACT *c_data;
	db_pglist_t **listp;
	u_int32_t *nelemp;
	db_pgno_t *last_pgnop;
{
	DBC *dbc;
	DBMETA *meta;
	DB_LOCK metalock;
	DB_MPOOLFILE *mpf;
	ENV *env;
	PAGE *h;
	db_pglist_t *list, *lp;
	db_pgno_t pgno;
	u_int32_t nelems;
	int ret, t_ret;
	size_t size;

	COMPQUIET(flags, 0);
	list = NULL;
	meta = NULL;
	env = dbp->env;
	mpf = dbp->mpf;
	h = NULL;
	nelems = 0;
	if (listp != NULL) {
		*listp = NULL;
		DB_ASSERT(env, nelemp != NULL);
		*nelemp = 0;
	}

	if ((ret = __db_cursor(dbp, ip, txn, &dbc, DB_WRITELOCK)) != 0)
		return (ret);

	pgno = PGNO_BASE_MD;
	if ((ret = __db_lget(dbc,
	    LCK_ALWAYS, pgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
		goto err;
	if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn, 0,
	    &meta)) != 0)
		goto err;

	if (last_pgnop != NULL)
		*last_pgnop = meta->last_pgno;
	/* An empty free list means there is nothing to truncate. */
	if ((pgno = meta->free) == PGNO_INVALID)
		goto done;

	/* Walk the chain, growing the array by doubling as needed. */
	size = 128;
	if ((ret = __os_malloc(env, size * sizeof(*list), &list)) != 0)
		goto err;
	lp = list;

	do {
		if (lp == &list[size]) {
			size *= 2;
			if ((ret = __os_realloc(env,
			    size * sizeof(*list), &list)) != 0)
				goto err;
			lp = &list[size / 2];
		}
		if ((ret = __memp_fget(mpf, &pgno,
		     dbc->thread_info, dbc->txn, 0, &h)) != 0)
			goto err;

		lp->pgno = pgno;
		lp->next_pgno = NEXT_PGNO(h);
		lp->lsn = LSN(h);
		pgno = NEXT_PGNO(h);
		if ((ret = __memp_fput(mpf,
		    dbc->thread_info, h, dbc->priority)) != 0)
			goto err;
		lp++;
	} while (pgno != PGNO_INVALID);
	nelems = (u_int32_t)(lp - list);

	if ((ret = __memp_dirty(mpf,
	    &meta, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
		goto err;

	/* Sort the list */
	__db_freelist_sort(list, nelems);

	if ((ret = __db_pg_truncate(dbc, txn, list, c_data,
	    &nelems, meta->free, &meta->last_pgno, &LSN(meta), 0)) != 0)
		goto err;

	if (nelems == 0)
		meta->free = PGNO_INVALID;
	else
		meta->free = list[0].pgno;

done:	if (last_pgnop != NULL)
		*last_pgnop = meta->last_pgno;

	/*
	 * Set the truncation point which determines which pages may be
	 * relocated. Pages above are candidates to be swapped with a lower one
	 * from the freelist by __db_exchange_page(); pages before the truncate
	 * point are not relocated.
	 * The truncation point starts as N pages less than the last_pgno, where
	 * N is the size of the free list. This is reduced by 1/4 in the hope
	 * that partially full pages will be coalesced together, creating
	 * additional free pages during the compact.
	 */
	if (c_data) {
		c_data->compact_truncate = (u_int32_t)meta->last_pgno - nelems;
		if (c_data->compact_truncate > nelems >> 2)
			c_data->compact_truncate -= nelems >> 2;
	}

	/* Hand the surviving list to the caller, which now owns it. */
	if (nelems != 0 && listp != NULL) {
		*listp = list;
		*nelemp = nelems;
		list = NULL;
	}

err:	if (list != NULL)
		__os_free(env, list);
	if (meta != NULL && (t_ret = __memp_fput(mpf,
	     dbc->thread_info, (PAGE *)meta, dbc->priority)) != 0 && ret == 0)
		ret = t_ret;
	if ((t_ret = __TLPUT(dbc, metalock)) != 0 && ret == 0)
		ret = t_ret;
	if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
		ret = t_ret;
	return (ret);
}
976 
/*
 * __db_truncate_freelist --
 *	Truncate the file at list[start], removing list[start..nelem-1]
 * from the free list.  The caller's page 'h' (the just-freed page that
 * triggered the truncation) is always consumed here.  If start is 0 the
 * free list becomes empty; otherwise list[start - 1] becomes the new
 * list tail.
 */
static int
__db_truncate_freelist(dbc, meta, h, list, start, nelem)
	DBC *dbc;
	DBMETA *meta;
	PAGE *h;
	db_pgno_t *list;
	u_int32_t start, nelem;
{
	DB *dbp;
	DBT ddbt;
	DB_LSN null_lsn;
	DB_MPOOLFILE *mpf;
	PAGE *last_free, *pg;
	db_pgno_t *lp, free_pgno, lpgno;
	db_pglist_t *plist, *pp, *spp;
	u_int32_t elem, log_size;
	int last, ret;

	dbp = dbc->dbp;
	mpf = dbp->mpf;
	plist = NULL;
	last_free = NULL;
	pg = NULL;

	/* The entry before the truncation point becomes the new tail. */
	if (start != 0 &&
	    (ret = __memp_fget(mpf, &list[start - 1],
	    dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &last_free)) != 0)
		goto err;

	if (DBC_LOGGING(dbc)) {
		/* Collect pgno/lsn/next for every page being truncated. */
		if ((ret = __os_malloc(dbp->env,
		     (nelem - start) * sizeof(*pp), &plist)) != 0)
			goto err;

		pp = plist;
		for (lp = &list[start]; lp < &list[nelem]; lp++) {
			pp->pgno = *lp;
			if ((ret = __memp_fget(mpf, lp,
			     dbc->thread_info, dbc->txn, 0, &pg)) != 0)
				goto err;
			pp->lsn = LSN(pg);
			pp->next_pgno = NEXT_PGNO(pg);
			if ((ret = __memp_fput(mpf,
			    dbc->thread_info, pg, DB_PRIORITY_VERY_LOW)) != 0)
				goto err;
			pg = NULL;
			pp++;
		}
		ZERO_LSN(null_lsn);
		pp = plist;
		elem = nelem - start;
		log_size = ((LOG *)dbc->env->
		    lg_handle->reginfo.primary)->log_size;
		/* Segment records that would exceed half a log file. */
again:		ddbt.data = spp = pp;
		free_pgno = pp->pgno;
		lpgno = meta->last_pgno;
		ddbt.size = elem * sizeof(*pp);
		if (ddbt.size > log_size / 2) {
			elem = (log_size / 2) / (u_int32_t)sizeof(*pp);
			ddbt.size = elem * sizeof(*pp);
			pp += elem;
			elem = (nelem - start) - (u_int32_t)(pp - plist);
			lpgno = pp[-1].pgno;
			last = 0;
		} else
			last = 1;
		/*
		 * Get the page which will link to this section if we abort.
		 * If this is the first segment then its last_free.
		 */
		if (spp == plist)
			pg = last_free;
		else if ((ret = __memp_fget(mpf, &spp[-1].pgno,
		     dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &pg)) != 0)
			goto err;

		if ((ret = __db_pg_trunc_log(dbp, dbc->txn,
		     &LSN(meta), last == 1 ? DB_FLUSH : 0,
		     PGNO(meta), &LSN(meta),
		     pg != NULL ? PGNO(pg) : PGNO_INVALID,
		     pg != NULL ? &LSN(pg) : &null_lsn,
		     free_pgno, lpgno, &ddbt)) != 0)
			goto err;
		if (pg != NULL) {
			LSN(pg) = LSN(meta);
			if (pg != last_free && (ret = __memp_fput(mpf,
			    dbc->thread_info, pg, DB_PRIORITY_VERY_LOW)) != 0)
				goto err;
			pg = NULL;
		}
		if (last == 0)
			goto again;
	} else
		LSN_NOT_LOGGED(LSN(meta));

	/* Discard the freed page, then shrink the file behind it. */
	if ((ret = __memp_fput(mpf,
	    dbc->thread_info, h, DB_PRIORITY_VERY_LOW)) != 0)
		goto err;
	h = NULL;
	if ((ret = __memp_ftruncate(mpf, dbc->txn, dbc->thread_info,
	    list[start], 0)) != 0)
		goto err;
	meta->last_pgno = list[start] - 1;

	if (start == 0)
		meta->free = PGNO_INVALID;
	else {
		/* Terminate the surviving portion of the free list. */
		NEXT_PGNO(last_free) = PGNO_INVALID;
		if ((ret = __memp_fput(mpf,
		    dbc->thread_info, last_free, dbc->priority)) != 0)
			goto err;
		last_free = NULL;
	}

	/* Shrink the number of elements in the list. */
	ret = __memp_extend_freelist(mpf, start, &list);

err:	if (plist != NULL)
		__os_free(dbp->env, plist);

	/* We need to put the page on error. */
	if (h != NULL)
		(void)__memp_fput(mpf, dbc->thread_info, h, dbc->priority);
	if (pg != NULL && pg != last_free)
		(void)__memp_fput(mpf, dbc->thread_info, pg, dbc->priority);
	if (last_free != NULL)
		(void)__memp_fput(mpf,
		    dbc->thread_info, last_free, dbc->priority);

	return (ret);
}
1108 #endif
1109 
1110 #ifdef DEBUG
1111 /*
1112  * __db_lprint --
1113  *	Print out the list of locks currently held by a cursor.
1114  *
1115  * PUBLIC: int __db_lprint __P((DBC *));
1116  */
1117 int
__db_lprint(dbc)1118 __db_lprint(dbc)
1119 	DBC *dbc;
1120 {
1121 	DB *dbp;
1122 	DB_LOCKREQ req;
1123 	ENV *env;
1124 
1125 	dbp = dbc->dbp;
1126 	env = dbp->env;
1127 
1128 	if (LOCKING_ON(env)) {
1129 		req.op = DB_LOCK_DUMP;
1130 		(void)__lock_vec(env, dbc->locker, 0, &req, 1, NULL);
1131 	}
1132 	return (0);
1133 }
1134 #endif
1135 
1136 /*
1137  * __db_lget --
1138  *	The standard lock get call.
1139  *
1140  * PUBLIC: int __db_lget __P((DBC *,
1141  * PUBLIC:     int, db_pgno_t, db_lockmode_t, u_int32_t, DB_LOCK *));
1142  */
int
__db_lget(dbc, action, pgno, mode, lkflags, lockp)
	DBC *dbc;
	int action;
	db_pgno_t pgno;
	db_lockmode_t mode;
	u_int32_t lkflags;
	DB_LOCK *lockp;
{
	DB *dbp;
	DB_LOCKREQ couple[3], *reqp;
	DB_TXN *txn;
	ENV *env;
	int has_timeout, i, ret;

	dbp = dbc->dbp;
	env = dbp->env;
	txn = dbc->txn;

	/*
	 * We do not always check if we're configured for locking before
	 * calling __db_lget to acquire the lock.
	 *
	 * Skip locking entirely (returning an initialized-but-unset lock)
	 * when: CDB locking is in effect, locking is off, this is a
	 * snapshot read in an MVCC database, the cursor is flagged to not
	 * lock, the cursor is running recovery (except for rollback on a
	 * non-replication-client), or this is an off-page-duplicate
	 * cursor and the caller did not pass LCK_ALWAYS.
	 */
	if (CDB_LOCKING(env) || !LOCKING_ON(env) ||
	    (MULTIVERSION(dbp) && mode == DB_LOCK_READ &&
	    dbc->txn != NULL && F_ISSET(dbc->txn, TXN_SNAPSHOT)) ||
	    F_ISSET(dbc, DBC_DONTLOCK) || (F_ISSET(dbc, DBC_RECOVER) &&
	    (action != LCK_ROLLBACK || IS_REP_CLIENT(env))) ||
	    (action != LCK_ALWAYS && F_ISSET(dbc, DBC_OPD))) {
		LOCK_INIT(*lockp);
		return (0);
	}

	/*
	 * If the transaction enclosing this cursor has DB_LOCK_NOWAIT set,
	 * pass that along to the lock call.
	 */
	if (DB_NONBLOCK(dbc))
		lkflags |= DB_LOCK_NOWAIT;

	/*
	 * If we're trying to run in exclusive mode, attempt to get an
	 * exclusive database lock.  If it is not available then wait
	 * for the lock on the database and clear the exclusive bit.
	 *
	 * If we get an exclusive lock on the database, mark the cursor
	 * with DBC_DONTLOCK to avoid any further locking.
	 */
	if (F_ISSET(dbp->mpf->mfp, MP_DATABASE_LOCKING)) {
		dbc->lock.type = DB_DATABASE_LOCK;
		dbc->lock.pgno = PGNO_BASE_MD;
		if ((ret = __lock_get(env, dbc->locker, DB_LOCK_NOWAIT,
		    &dbc->lock_dbt, F_ISSET(dbp, DB_AM_RDONLY) ?
		    DB_LOCK_READ : DB_LOCK_WRITE, lockp)) == 0) {
			/*
			 * Re-check the flag: it may have been cleared by a
			 * concurrent thread while we acquired the lock.
			 */
			if (F_ISSET(dbp->mpf->mfp, MP_DATABASE_LOCKING)) {
				F_SET(dbc, DBC_DONTLOCK);
				/*
				 * Outside a real transaction, stash the
				 * database lock on the cursor so it is
				 * released when the cursor closes.
				 */
				if (!IS_REAL_TXN(txn))
					dbc->mylock = *lockp;
				LOCK_INIT(*lockp);
				return (0);
			}
		} else if (ret == DB_LOCK_NOTGRANTED &&
		    (lkflags & DB_LOCK_NOWAIT) == 0) {
			/*
			 * Exclusive mode unavailable: block until we can get
			 * the database write lock, turn exclusive mode off,
			 * then drop the lock and fall through to normal
			 * page/record locking.
			 */
			if ((ret = __lock_get(env, dbc->locker, 0,
			    &dbc->lock_dbt, DB_LOCK_WRITE, lockp)) != 0)
				return (ret);
			F_CLR(dbp->mpf->mfp, MP_DATABASE_LOCKING);
			if ((ret = __lock_put(env, lockp)) != 0)
				return (ret);
			LOCK_INIT(*lockp);
		} else if (ret != 0)
			return (ret);
	}

	/* Describe the page (or record) we are about to lock. */
	dbc->lock.pgno = pgno;
	if (lkflags & DB_LOCK_RECORD)
		dbc->lock.type = DB_RECORD_LOCK;
	else
		dbc->lock.type = DB_PAGE_LOCK;
	lkflags &= ~DB_LOCK_RECORD;

	/* Degraded isolation: read locks become read-uncommitted locks. */
	if (F_ISSET(dbc, DBC_READ_UNCOMMITTED) && mode == DB_LOCK_READ)
		mode = DB_LOCK_READ_UNCOMMITTED;

	has_timeout = F_ISSET(dbc, DBC_RECOVER) ||
	    (txn != NULL && F_ISSET(txn, TXN_LOCKTIMEOUT));

	/*
	 * Transactional locking.
	 * Hold on to the previous read lock only if we are in full isolation.
	 * COUPLE_ALWAYS indicates we are holding an interior node which need
	 *	not be isolated.
	 * Downgrade write locks if we are supporting dirty readers and the
	 * update did not have an error.
	 */
	if ((action != LCK_COUPLE && action != LCK_COUPLE_ALWAYS) ||
	    !LOCK_ISSET(*lockp))
		action = 0;
	else if (dbc->txn == NULL || action == LCK_COUPLE_ALWAYS)
		action = LCK_COUPLE;
	else if (F_ISSET(dbc, DBC_READ_COMMITTED | DBC_WAS_READ_COMMITTED) &&
	    lockp->mode == DB_LOCK_READ)
		action = LCK_COUPLE;
	else if (lockp->mode == DB_LOCK_READ_UNCOMMITTED)
		action = LCK_COUPLE;
	else if (F_ISSET(dbc->dbp, DB_AM_READ_UNCOMMITTED) &&
	     !F_ISSET(dbc, DBC_ERROR) && lockp->mode == DB_LOCK_WRITE)
		action = LCK_DOWNGRADE;
	else
		action = 0;

	/* i counts the DB_LOCKREQ entries built in couple[]. */
	i = 0;
	switch (action) {
	default:
		/* Plain get; with a timeout we must go through __lock_vec. */
		if (has_timeout)
			goto do_couple;
		ret = __lock_get(env,
		    dbc->locker, lkflags, &dbc->lock_dbt, mode, lockp);
		break;

	case LCK_DOWNGRADE:
		/* First request: re-get the old lock as was-written mode. */
		couple[0].op = DB_LOCK_GET;
		couple[0].obj = NULL;
		couple[0].lock = *lockp;
		couple[0].mode = DB_LOCK_WWRITE;
		UMRW_SET(couple[0].timeout);
		i++;
		/* FALLTHROUGH */
	case LCK_COUPLE:
		/* Next request: get the new lock (possibly with timeout). */
do_couple:	couple[i].op = has_timeout? DB_LOCK_GET_TIMEOUT : DB_LOCK_GET;
		couple[i].obj = &dbc->lock_dbt;
		couple[i].mode = mode;
		UMRW_SET(couple[i].timeout);
		i++;
		/*
		 * NOTE(review): the timeout is stored in couple[0]; if we
		 * fell through from LCK_DOWNGRADE, couple[0] is the plain
		 * DB_LOCK_GET downgrade entry, not the DB_LOCK_GET_TIMEOUT
		 * entry built above -- confirm this combination cannot
		 * occur or that it is intentional.
		 */
		if (has_timeout)
			couple[0].timeout =
			     F_ISSET(dbc, DBC_RECOVER) ? 0 : txn->lock_timeout;
		/* Last request: release the previous lock (lock coupling). */
		if (action == LCK_COUPLE || action == LCK_DOWNGRADE) {
			couple[i].op = DB_LOCK_PUT;
			couple[i].lock = *lockp;
			i++;
		}

		ret = __lock_vec(env,
		    dbc->locker, lkflags, couple, i, &reqp);
		/*
		 * If everything succeeded, or only the final PUT failed,
		 * hand back the newly acquired lock: couple[0] in the
		 * single-request case, otherwise the entry just before
		 * the PUT.
		 */
		if (ret == 0 || reqp == &couple[i - 1])
			*lockp = i == 1 ? couple[0].lock : couple[i - 2].lock;
		break;
	}

	if (txn != NULL && ret == DB_LOCK_DEADLOCK)
		F_SET(txn, TXN_DEADLOCK);
	/*
	 * Unless the environment asked to distinguish timeouts
	 * (DB_ENV_TIME_NOTGRANTED), report a not-granted lock as a
	 * deadlock so callers have a single error to handle.
	 */
	return ((ret == DB_LOCK_NOTGRANTED && !F_ISSET(env->dbenv,
		 DB_ENV_TIME_NOTGRANTED)) ? DB_LOCK_DEADLOCK : ret);
}
1298 
1299 #ifdef DIAGNOSTIC
1300 /*
1301  * __db_haslock --
1302  *	Determine if this locker holds a particular lock.
1303  *	Returns 0 if lock is held, non-zero otherwise.
1304  *
1305  * PUBLIC: #ifdef DIAGNOSTIC
1306  * PUBLIC: int __db_haslock __P((ENV *, DB_LOCKER *,
1307  * PUBLIC:     DB_MPOOLFILE *, db_pgno_t, db_lockmode_t, u_int32_t));
1308  * PUBLIC: #endif
1309  */
1310 int
__db_haslock(env,locker,dbmfp,pgno,mode,type)1311 __db_haslock(env, locker, dbmfp, pgno, mode, type)
1312 	ENV *env;
1313 	DB_LOCKER *locker;
1314 	DB_MPOOLFILE *dbmfp;
1315 	db_pgno_t pgno;
1316 	db_lockmode_t mode;
1317 	u_int32_t type;
1318 {
1319 	DBT lkdata;
1320 	DB_LOCK lock;
1321 	DB_LOCK_ILOCK ilock;
1322 
1323 	memset(&lkdata, 0, sizeof(lkdata));
1324 	lkdata.data = &ilock;
1325 	lkdata.size = sizeof(ilock);
1326 
1327 	memcpy(ilock.fileid, dbmfp->fileid, DB_FILE_ID_LEN);
1328 	ilock.pgno = pgno;
1329 	ilock.type = type;
1330 
1331 	return (__lock_get(env, locker, DB_LOCK_CHECK, &lkdata, mode, &lock));
1332 }
1333 /*
1334  * __db_has_pagelock --
1335  *	Determine if this locker holds a particular page lock, and return an
1336  *	error if it is missing a page lock that it should have.
1337  *	Otherwise (TDS with the page locked, or DS or CDS) return 0.
1338  *
1339  * PUBLIC: #ifdef DIAGNOSTIC
1340  * PUBLIC: int __db_has_pagelock __P((ENV *, DB_LOCKER *,
1341  * PUBLIC:     DB_MPOOLFILE *, PAGE *, db_lockmode_t));
1342  * PUBLIC: #endif
1343  */
1344 int
__db_has_pagelock(env,locker,dbmfp,pagep,mode)1345 __db_has_pagelock(env, locker, dbmfp, pagep, mode)
1346 	ENV *env;
1347 	DB_LOCKER *locker;
1348 	DB_MPOOLFILE *dbmfp;
1349 	PAGE *pagep;
1350 	db_lockmode_t mode;
1351 {
1352 	int ret;
1353 
1354 	if (!FLD_ISSET(env->open_flags, DB_INIT_TXN))
1355 		return (0);
1356 
1357 	switch (pagep->type) {
1358 	case P_OVERFLOW:
1359 	case P_INVALID:
1360 	case P_QAMDATA:
1361 	case P_QAMMETA:
1362 	case P_IHEAP:
1363 		return (0);
1364 	case P_HASH:
1365 		if (PREV_PGNO(pagep) != PGNO_INVALID)
1366 			return (0);
1367 		break;
1368 	default:
1369 		break;
1370 	}
1371 	if ((ret = __db_haslock(env,
1372 	    locker, dbmfp, pagep->pgno, mode, DB_PAGE_LOCK)) != 0)
1373 		ret = __db_haslock(env,
1374 		    locker, dbmfp, PGNO_BASE_MD, mode, DB_DATABASE_LOCK);
1375 	return (ret);
1376 }
1377 #endif
1378 
1379 /*
1380  * __db_lput --
1381  *	The standard lock put call.
1382  *
1383  * PUBLIC: int __db_lput __P((DBC *, DB_LOCK *));
1384  */
1385 int
__db_lput(dbc,lockp)1386 __db_lput(dbc, lockp)
1387 	DBC *dbc;
1388 	DB_LOCK *lockp;
1389 {
1390 	DB_LOCKREQ couple[2], *reqp;
1391 	ENV *env;
1392 	int action, ret;
1393 
1394 	/*
1395 	 * Transactional locking.
1396 	 * Hold on to the read locks only if we are in full isolation.
1397 	 * Downgrade write locks if we are supporting dirty readers unless
1398 	 * there was an error.
1399 	 */
1400 	if (F_ISSET(dbc->dbp, DB_AM_READ_UNCOMMITTED) &&
1401 	    !F_ISSET(dbc, DBC_ERROR) && lockp->mode == DB_LOCK_WRITE)
1402 		action = LCK_DOWNGRADE;
1403 	else if (dbc->txn == NULL)
1404 		action = LCK_COUPLE;
1405 	else if (F_ISSET(dbc, DBC_READ_COMMITTED | DBC_WAS_READ_COMMITTED) &&
1406 	    lockp->mode == DB_LOCK_READ)
1407 		action = LCK_COUPLE;
1408 	else if (lockp->mode == DB_LOCK_READ_UNCOMMITTED)
1409 		action = LCK_COUPLE;
1410 	else
1411 		action = 0;
1412 
1413 	env = dbc->env;
1414 	switch (action) {
1415 	case LCK_COUPLE:
1416 		ret = __lock_put(env, lockp);
1417 		break;
1418 	case LCK_DOWNGRADE:
1419 		couple[0].op = DB_LOCK_GET;
1420 		couple[0].obj = NULL;
1421 		couple[0].mode = DB_LOCK_WWRITE;
1422 		couple[0].lock = *lockp;
1423 		UMRW_SET(couple[0].timeout);
1424 		couple[1].op = DB_LOCK_PUT;
1425 		couple[1].lock = *lockp;
1426 		ret = __lock_vec(env, dbc->locker, 0, couple, 2, &reqp);
1427 		if (ret == 0 || reqp == &couple[1])
1428 			*lockp = couple[0].lock;
1429 		break;
1430 	default:
1431 		ret = 0;
1432 		break;
1433 	}
1434 
1435 	return (ret);
1436 }
1437