1 /*-
2 * See the file LICENSE for redistribution information.
3 *
4 * Copyright (c) 1996, 2013 Oracle and/or its affiliates. All rights reserved.
5 */
6 /*
7 * Copyright (c) 1990, 1993, 1994, 1995, 1996
8 * Keith Bostic. All rights reserved.
9 */
10 /*
11 * Copyright (c) 1990, 1993, 1994, 1995
12 * The Regents of the University of California. All rights reserved.
13 *
14 * This code is derived from software contributed to Berkeley by
15 * Mike Olson.
16 *
17 * Redistribution and use in source and binary forms, with or without
18 * modification, are permitted provided that the following conditions
19 * are met:
20 * 1. Redistributions of source code must retain the above copyright
21 * notice, this list of conditions and the following disclaimer.
22 * 2. Redistributions in binary form must reproduce the above copyright
23 * notice, this list of conditions and the following disclaimer in the
24 * documentation and/or other materials provided with the distribution.
25 * 3. Neither the name of the University nor the names of its contributors
26 * may be used to endorse or promote products derived from this software
27 * without specific prior written permission.
28 *
29 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
30 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
31 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
32 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
33 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
34 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
35 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
36 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
37 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
38 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
39 * SUCH DAMAGE.
40 *
41 * $Id$
42 */
43
44 #include "db_config.h"
45
46 #include "db_int.h"
47 #include "dbinc/db_page.h"
48 #include "dbinc/lock.h"
49 #include "dbinc/log.h"
50 #include "dbinc/mp.h"
51 #include "dbinc/txn.h"
52 #include "dbinc/db_am.h"
53 #include "dbinc/hash.h"
54
55 static void __db_init_meta __P((DB *, void *, db_pgno_t, u_int32_t));
56 #ifdef HAVE_FTRUNCATE
57 static int __db_pglistcmp __P((const void *, const void *));
58 static int __db_truncate_freelist __P((DBC *, DBMETA *,
59 PAGE *, db_pgno_t *, u_int32_t, u_int32_t));
60 #endif
61
62 /*
63 * __db_init_meta --
64 * Helper function for __db_new that initializes the important fields in
65 * a meta-data page (used instead of P_INIT). We need to make sure that we
66 * retain the page number and LSN of the existing page.
67 */
68 static void
__db_init_meta(dbp,p,pgno,pgtype)69 __db_init_meta(dbp, p, pgno, pgtype)
70 DB *dbp;
71 void *p;
72 db_pgno_t pgno;
73 u_int32_t pgtype;
74 {
75 DBMETA *meta;
76 DB_LSN save_lsn;
77
78 meta = (DBMETA *)p;
79 save_lsn = meta->lsn;
80 memset(meta, 0, sizeof(DBMETA));
81 meta->lsn = save_lsn;
82 meta->pagesize = dbp->pgsize;
83 if (F_ISSET(dbp, DB_AM_CHKSUM))
84 FLD_SET(meta->metaflags, DBMETA_CHKSUM);
85 meta->pgno = pgno;
86 meta->type = (u_int8_t)pgtype;
87 }
88
89 /*
90 * __db_new --
91 * Get a new page, preferably from the freelist.
92 *
93 * PUBLIC: int __db_new __P((DBC *, u_int32_t, DB_LOCK *, PAGE **));
94 */
int
__db_new(dbc, type, lockp, pagepp)
	DBC *dbc;
	u_int32_t type;
	DB_LOCK *lockp;
	PAGE **pagepp;
{
	DB *dbp;
	DBMETA *meta;
	DB_LOCK metalock;
	DB_LSN lsn;
	DB_MPOOLFILE *mpf;
	ENV *env;
	PAGE *h;
	db_pgno_t last, *list, pgno, newnext;
	int extend, hash, ret;

	meta = NULL;
	dbp = dbc->dbp;
	env = dbp->env;
	mpf = dbp->mpf;
	h = NULL;
	newnext = PGNO_INVALID;
	if (lockp != NULL)
		LOCK_INIT(*lockp);

	hash = 0;
	ret = 0;
	LOCK_INIT(metalock);

#ifdef HAVE_HASH
	/*
	 * For hash databases the access method may already hold the meta
	 * page; if __ham_return_meta gives us one we must neither lock
	 * nor put it here (tracked by the "hash" flag).
	 */
	if (dbp->type == DB_HASH) {
		if ((ret = __ham_return_meta(dbc, DB_MPOOL_DIRTY, &meta)) != 0)
			goto err;
		if (meta != NULL)
			hash = 1;
	}
#endif
	if (meta == NULL) {
		/* Lock, then fetch, the base meta-data page dirty. */
		pgno = PGNO_BASE_MD;
		if ((ret = __db_lget(dbc,
		    LCK_ALWAYS, pgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
			goto err;
		if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn,
		    DB_MPOOL_DIRTY, &meta)) != 0)
			goto err;
	}

	last = meta->last_pgno;
	if (meta->free == PGNO_INVALID) {
		/*
		 * The free list is empty: we must extend the file, unless
		 * the caller explicitly forbade it with P_DONTEXTEND (the
		 * err path returns ret == 0 with *pagepp == NULL here).
		 */
		if (FLD_ISSET(type, P_DONTEXTEND)) {
			*pagepp = NULL;
			goto err;
		}
		last = pgno = meta->last_pgno + 1;
		ZERO_LSN(lsn);
		extend = 1;
	} else {
		pgno = meta->free;
		/*
		 * Lock the new page.  Do this here because we must do it
		 * before getting the page and the caller may need the lock
		 * to keep readers from seeing the page before the transaction
		 * commits.  We can do this because no one will hold a free
		 * page locked.
		 */
		if (lockp != NULL && (ret =
		    __db_lget(dbc, 0, pgno, DB_LOCK_WRITE, 0, lockp)) != 0)
			goto err;
		if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn,
		    DB_MPOOL_DIRTY, &h)) != 0)
			goto err;

		/*
		 * We want to take the first page off the free list and
		 * then set meta->free to the that page's next_pgno, but
		 * we need to log the change first.
		 */
		newnext = h->next_pgno;
		lsn = h->lsn;
		extend = 0;
		DB_ASSERT(env, TYPE(h) == P_INVALID);

		/*
		 * A non-free page linked on the free list means the file
		 * is corrupt: report it and panic the environment.
		 */
		if (TYPE(h) != P_INVALID) {
			__db_errx(env, DB_STR_A("0689",
			    "%s page %lu is on free list with type %lu",
			    "%s %lu %lu"), dbp->fname, (u_long)PGNO(h),
			    (u_long)TYPE(h));
			return (__env_panic(env, EINVAL));
		}

	}

	FLD_CLR(type, P_DONTEXTEND);

	/*
	 * Log the allocation before fetching the new page.  If we
	 * don't have room in the log then we don't want to tell
	 * mpool to extend the file.
	 */
	if (DBC_LOGGING(dbc)) {
		if ((ret = __db_pg_alloc_log(dbp, dbc->txn, &LSN(meta), 0,
		    &LSN(meta), PGNO_BASE_MD, &lsn,
		    pgno, (u_int32_t)type, newnext, meta->last_pgno)) != 0)
			goto err;
	} else
		LSN_NOT_LOGGED(LSN(meta));

	/* Unlink the page from the free list. */
	meta->free = newnext;

	if (extend == 1) {
		/* Lock and create the page just past the old end of file. */
		if (lockp != NULL && (ret =
		    __db_lget(dbc, 0, pgno, DB_LOCK_WRITE, 0, lockp)) != 0)
			goto err;
		if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn,
		    DB_MPOOL_NEW, &h)) != 0)
			goto err;
		DB_ASSERT(env, last == pgno);
		meta->last_pgno = pgno;
		ZERO_LSN(h->lsn);
		h->pgno = pgno;

		/*
		 * If the file was extended for the first time in this
		 * transaction, set the MPOOLFILE's file extension
		 * watermark.
		 */
		__txn_add_fe_watermark(dbc->txn, dbp, h->pgno);

	}
	LSN(h) = LSN(meta);

	/* Release the meta page unless the hash code owns the reference. */
	if (hash == 0 && (ret = __memp_fput(mpf,
	    dbc->thread_info, meta, dbc->priority)) != 0)
		goto err;
	meta = NULL;

	/* Meta-data page types keep their pgno/LSN; others get P_INIT. */
	switch (type) {
	case P_BTREEMETA:
	case P_HASHMETA:
	case P_QAMMETA:
		__db_init_meta(dbp, h, h->pgno, type);
		break;
	default:
		P_INIT(h, dbp->pgsize,
		    h->pgno, PGNO_INVALID, PGNO_INVALID, 0, type);
		break;
	}

	/* Fix up the sorted free list if necessary. */
#ifdef HAVE_FTRUNCATE
	if (extend == 0) {
		u_int32_t nelems = 0;

		if ((ret = __memp_get_freelist(dbp->mpf, &nelems, &list)) != 0)
			goto err;
		if (nelems != 0) {
			/* The allocated page must have been the list head. */
			DB_ASSERT(env, h->pgno == list[0]);
			/* Shift the remaining entries down and shrink. */
			memmove(list, &list[1], (nelems - 1) * sizeof(*list));
			if ((ret = __memp_extend_freelist(
			    dbp->mpf, nelems - 1, &list)) != 0)
				goto err;
		}
	}
#else
	COMPQUIET(list, NULL);
#endif

	if ((ret = __TLPUT(dbc, metalock)) != 0)
		return (ret);
	*pagepp = h;
	PERFMON6(env, alloc, new, dbp->fname, dbp->dname, pgno, type, h, 0);
	return (0);

err:	if (h != NULL)
		(void)__memp_fput(mpf, dbc->thread_info, h, dbc->priority);
	if (meta != NULL && hash == 0)
		(void)__memp_fput(mpf, dbc->thread_info, meta, dbc->priority);
	(void)__TLPUT(dbc, metalock);
	if (lockp != NULL)
		(void)__LPUT(dbc, *lockp);
	/* Failure return - report 0 pgno, null page address. */
	PERFMON6(env, alloc, new, dbp->fname, dbp->dname, 0, type, NULL, ret);
	return (ret);
}
280
281 /*
282 * __db_free --
283 * Add a page to the head of the freelist.
284 *
285 * PUBLIC: int __db_free __P((DBC *, PAGE *, u_int32_t));
286 */
int
__db_free(dbc, h, flags)
	DBC *dbc;
	PAGE *h;
	u_int32_t flags;
{
	DB *dbp;
	DBMETA *meta;
	DBT ddbt, ldbt;
	DB_LOCK metalock;
	DB_LSN *lsnp;
	DB_MPOOLFILE *mpf;
	PAGE *prev;
	db_pgno_t last_pgno, next_pgno, pgno, prev_pgno;
	u_int32_t lflag;
	int hash, ret, t_ret;
#ifdef HAVE_FTRUNCATE
	db_pgno_t *list, *lp;
	u_int32_t nelem, position, start;
	int do_truncate;
#endif

	dbp = dbc->dbp;
	mpf = dbp->mpf;
	prev_pgno = PGNO_INVALID;
	meta = NULL;
	prev = NULL;
	LOCK_INIT(metalock);
#ifdef HAVE_FTRUNCATE
	lp = NULL;
	nelem = 0;
	do_truncate = 0;
#endif

	/*
	 * Retrieve the metadata page.  If we are not keeping a sorted
	 * free list put the page at the head of the the free list.
	 * If we are keeping a sorted free list, for truncation,
	 * then figure out where this page belongs and either
	 * link it in or truncate the file as much as possible.
	 * If either the lock get or page get routines
	 * fail, then we need to put the page with which we were called
	 * back because our caller assumes we take care of it.
	 */
	hash = 0;

	pgno = PGNO_BASE_MD;
	if ((ret = __db_lget(dbc,
	    LCK_ALWAYS, pgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
		goto err;

#ifdef HAVE_HASH
	/* Hash may hand back its own meta reference; don't fput it below. */
	if (dbp->type == DB_HASH) {
		if ((ret = __ham_return_meta(dbc,
#ifdef HAVE_FTRUNCATE
		    0,
#else
		    DB_MPOOL_DIRTY,
#endif
		    &meta)) != 0)
			goto err;
		if (meta != NULL)
			hash = 1;
	}
#endif
	if (meta == NULL) {
		/* If we support truncate, we might not dirty the meta page. */
		if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn,
#ifdef HAVE_FTRUNCATE
		    0,
#else
		    DB_MPOOL_DIRTY,
#endif
		    &meta)) != 0)
			goto err1;
	}

	last_pgno = meta->last_pgno;
	next_pgno = meta->free;
	/*
	 * Assign lsnp here so it always initialized when
	 * HAVE_FTRUNCATE is not defined.
	 */
	lsnp = &LSN(meta);

	/* Freeing the current free-list head would corrupt the list. */
	DB_ASSERT(dbp->env, h->pgno != next_pgno);

#ifdef HAVE_FTRUNCATE
	/*
	 * If we are maintaining a sorted free list see if we either have a
	 * new truncation point or the page goes somewhere in the middle of
	 * the list.  If it goes in the middle of the list, we will drop the
	 * meta page and get the previous page.
	 */
	COMPQUIET(position, 0);
	if ((ret = __memp_get_freelist(mpf, &nelem, &list)) != 0)
		goto err1;
	if (list == NULL)
		goto no_sort;

	if (h->pgno != last_pgno) {
		/*
		 * Put the page number in the sorted list.  Find its
		 * position and the previous page.  After logging we
		 * will extend the list, make room and insert the page in
		 * the list.
		 */
		position = 0;
		if (nelem != 0) {
			__db_freelist_pos(h->pgno, list, nelem, &position);

			DB_ASSERT(dbp->env, h->pgno != list[position]);

			/* Get the previous page if this is not the smallest. */
			if (position != 0 || h->pgno > list[0])
				prev_pgno = list[position];
		}

	} else if (nelem != 0) {
		/* Find the truncation point. */
		for (lp = &list[nelem - 1]; lp >= list; lp--)
			if (--last_pgno != *lp)
				break;
		if (lp < list || last_pgno < h->pgno - 1)
			do_truncate = 1;
		last_pgno = meta->last_pgno;
	}

no_sort:
	if (prev_pgno == PGNO_INVALID) {
		/* We will update the meta page; make sure it is dirty. */
#ifdef HAVE_HASH
		if (hash) {
			if ((ret =
			    __ham_return_meta(dbc, DB_MPOOL_DIRTY, &meta)) != 0)
				goto err1;
		} else
#endif
		if ((ret = __memp_dirty(mpf,
		    &meta, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
			goto err1;
		lsnp = &LSN(meta);
	} else {
		/* Link after the previous free page instead of the meta. */
		pgno = prev_pgno;
		if ((ret = __memp_fget(mpf, &pgno,
		    dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &prev)) != 0)
			goto err1;
		next_pgno = NEXT_PGNO(prev);
		lsnp = &LSN(prev);
	}
#endif

	/*
	 * Log the change.
	 * We are either logging an update to the metapage or to the
	 * previous page in the sorted list.
	 */
	if (DBC_LOGGING(dbc)) {
		memset(&ldbt, 0, sizeof(ldbt));
		ldbt.data = h;
		ldbt.size = P_OVERHEAD(dbp);
		/*
		 * If we are removing pages from the file, we need to make
		 * sure the logging happens before the truncation.  If we
		 * are truncating multiple pages we don't need to flush the
		 * log here as it will be flushed by __db_truncate_freelist.
		 */
		lflag = 0;

#ifdef HAVE_FTRUNCATE
		if (h->pgno == last_pgno && do_truncate == 0)
			lflag = DB_FLUSH;
#endif
		switch (h->type) {
		case P_HASH:
		case P_IBTREE:
		case P_IRECNO:
		case P_LBTREE:
		case P_LRECNO:
		case P_LDUP:
			/*
			 * Log page contents as well (needed for undo),
			 * unless the caller allowed DB_LOG_NO_DATA and the
			 * page is not the last one in the file.
			 */
			if (h->entries > 0 && (h->pgno == last_pgno ||
			    !LF_ISSET(DB_LOG_NO_DATA))) {
				ldbt.size += h->entries * sizeof(db_indx_t);
				ddbt.data = (u_int8_t *)h + HOFFSET(h);
				ddbt.size = dbp->pgsize - HOFFSET(h);
				if ((ret = __db_pg_freedata_log(dbp, dbc->txn,
				    lsnp, lflag,
				    h->pgno, lsnp, pgno,
				    &ldbt, next_pgno, last_pgno, &ddbt)) != 0)
					goto err1;
				goto logged;
			}
			break;
		case P_HASHMETA:
			ldbt.size = sizeof(HMETA);
			break;
		case P_BTREEMETA:
			ldbt.size = sizeof(BTMETA);
			break;
		case P_OVERFLOW:
			ldbt.size += OV_LEN(h);
			break;
		default:
			DB_ASSERT(dbp->env, h->type != P_QAMDATA);
		}

		if ((ret = __db_pg_free_log(dbp,
		    dbc->txn, lsnp, lflag, h->pgno,
		    lsnp, pgno, &ldbt, next_pgno, last_pgno)) != 0)
			goto err1;
	} else
		LSN_NOT_LOGGED(*lsnp);

logged:
#ifdef HAVE_FTRUNCATE
	if (do_truncate) {
		/* Free this page plus the contiguous tail of the list. */
		start = (u_int32_t) (lp - list) + 1;
		meta->last_pgno--;
		ret = __db_truncate_freelist(
		    dbc, meta, h, list, start, nelem);
		h = NULL;
	} else if (h->pgno == last_pgno) {
		/*
		 * We are going to throw this page away, but if we are
		 * using MVCC then this version may stick around and we
		 * might have to make a copy.
		 */
		if (atomic_read(&mpf->mfp->multiversion) &&
		    (ret = __memp_dirty(mpf,
		    &h, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
			goto err1;
		LSN(h) = *lsnp;
		P_INIT(h, dbp->pgsize,
		    h->pgno, PGNO_INVALID, next_pgno, 0, P_INVALID);
		if ((ret = __memp_fput(mpf,
		    dbc->thread_info, h, DB_PRIORITY_VERY_LOW)) != 0)
			goto err1;
		h = NULL;
		/* Give the page back to the OS. */
		if ((ret = __memp_ftruncate(mpf, dbc->txn, dbc->thread_info,
		    last_pgno, 0)) != 0)
			goto err1;
		DB_ASSERT(dbp->env, meta->pgno == PGNO_BASE_MD);
		meta->last_pgno--;
	} else {
		if (list != NULL) {
			/* Put the page number into the list. */
			if ((ret =
			    __memp_extend_freelist(mpf, nelem + 1, &list)) != 0)
				goto err1;
			if (prev_pgno != PGNO_INVALID)
				lp = &list[position + 1];
			else
				lp = list;
			if (nelem != 0 && position != nelem)
				memmove(lp + 1, lp, (size_t)
				    ((u_int8_t*)&list[nelem] - (u_int8_t*)lp));
			*lp = h->pgno;
		}
#else
	{
#endif
		/*
		 * If we are not truncating the page then we
		 * reinitialize it and put it at the head of
		 * the free list.
		 */
		if ((ret = __memp_dirty(mpf,
		    &h, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
			goto err1;
		LSN(h) = *lsnp;
		P_INIT(h, dbp->pgsize,
		    h->pgno, PGNO_INVALID, next_pgno, 0, P_INVALID);
#ifdef DIAGNOSTIC
		memset((u_int8_t *) h + P_OVERHEAD(dbp),
		    CLEAR_BYTE, dbp->pgsize - P_OVERHEAD(dbp));
#endif
		if (prev_pgno == PGNO_INVALID)
			meta->free = h->pgno;
		else
			NEXT_PGNO(prev) = h->pgno;
	}

	/* Discard the metadata or previous page. */
err1:	if (hash == 0 && meta != NULL && (t_ret = __memp_fput(mpf,
	    dbc->thread_info, (PAGE *)meta, dbc->priority)) != 0 && ret == 0)
		ret = t_ret;
	if ((t_ret = __TLPUT(dbc, metalock)) != 0 && ret == 0)
		ret = t_ret;
	if (prev != (PAGE*) meta && prev != NULL && (t_ret = __memp_fput(mpf,
	    dbc->thread_info, prev, dbc->priority)) != 0 && ret == 0)
		ret = t_ret;

	/* Discard the caller's page reference. */
err:	if (h != NULL && (t_ret = __memp_fput(mpf,
	    dbc->thread_info, h, dbc->priority)) != 0 && ret == 0)
		ret = t_ret;

	PERFMON4(dbp->env, alloc, free, dbp->fname, dbp->dname, pgno, ret);
	/*
	 * XXX
	 * We have to unlock the caller's page in the caller!
	 */
	return (ret);
}
591
592 #ifdef HAVE_FTRUNCATE
593 /*
594 * __db_freelist_pos -- find the position of a page in the freelist.
595 * The list is sorted, we do a binary search.
596 *
597 * PUBLIC: #ifdef HAVE_FTRUNCATE
598 * PUBLIC: void __db_freelist_pos __P((db_pgno_t,
599 * PUBLIC: db_pgno_t *, u_int32_t, u_int32_t *));
600 * PUBLIC: #endif
601 */
602 void
__db_freelist_pos(pgno,list,nelem,posp)603 __db_freelist_pos(pgno, list, nelem, posp)
604 db_pgno_t pgno;
605 db_pgno_t *list;
606 u_int32_t nelem;
607 u_int32_t *posp;
608 {
609 u_int32_t base, indx, lim;
610
611 indx = 0;
612 for (base = 0, lim = nelem; lim != 0; lim >>= 1) {
613 indx = base + (lim >> 1);
614 if (pgno == list[indx]) {
615 *posp = indx;
616 return;
617 }
618 if (pgno > list[indx]) {
619 base = indx + 1;
620 --lim;
621 }
622 }
623 if (base != 0)
624 base--;
625 *posp = base;
626 return;
627 }
628
629 static int
__db_pglistcmp(a,b)630 __db_pglistcmp(a, b)
631 const void *a, *b;
632 {
633 db_pglist_t *ap, *bp;
634
635 ap = (db_pglist_t *)a;
636 bp = (db_pglist_t *)b;
637
638 return ((ap->pgno > bp->pgno) ? 1 : (ap->pgno < bp->pgno) ? -1: 0);
639 }
640
641 /*
642 * __db_freelist_sort -- sort a list of free pages.
643 * PUBLIC: void __db_freelist_sort __P((db_pglist_t *, u_int32_t));
644 */
645 void
__db_freelist_sort(list,nelems)646 __db_freelist_sort(list, nelems)
647 db_pglist_t *list;
648 u_int32_t nelems;
649 {
650 qsort(list, (size_t)nelems, sizeof(db_pglist_t), __db_pglistcmp);
651 }
652
653 /*
654 * __db_pg_truncate -- find the truncation point in a sorted freelist.
655 *
656 * PUBLIC: #ifdef HAVE_FTRUNCATE
657 * PUBLIC: int __db_pg_truncate __P((DBC *, DB_TXN *,
658 * PUBLIC: db_pglist_t *, DB_COMPACT *, u_int32_t *,
659 * PUBLIC: db_pgno_t , db_pgno_t *, DB_LSN *, int));
660 * PUBLIC: #endif
661 */
int
__db_pg_truncate(dbc, txn,
    list, c_data, nelemp, free_pgno, last_pgno, lsnp, in_recovery)
	DBC *dbc;
	DB_TXN *txn;
	db_pglist_t *list;
	DB_COMPACT *c_data;
	u_int32_t *nelemp;
	db_pgno_t free_pgno, *last_pgno;
	DB_LSN *lsnp;
	int in_recovery;
{
	DB *dbp;
	DBT ddbt;
	DB_LSN null_lsn;
	DB_MPOOLFILE *mpf;
	PAGE *h;
	db_pglist_t *lp, *slp;
	db_pgno_t lpgno, pgno;
	u_int32_t elems, log_size, tpoint;
	int last, ret;

	ret = 0;
	h = NULL;

	dbp = dbc->dbp;
	mpf = dbp->mpf;
	elems = tpoint = *nelemp;

	/*
	 * Figure out what (if any) pages can be truncated immediately and
	 * record the place from which we can truncate, so we can do the
	 * memp_ftruncate below.  We also use this to avoid ever putting
	 * these pages on the freelist, which we are about to relink.
	 */
	pgno = *last_pgno;
	lp = &list[elems - 1];
	last = 1;
	/*
	 * Walk backward from the end of the sorted list while entries are
	 * contiguous with the end of file; tpoint ends up as the count of
	 * entries that must remain on the free list.
	 */
	while (tpoint != 0) {
		if (lp->pgno != pgno)
			break;
		pgno--;
		tpoint--;
		lp--;
	}

	lp = list;
	slp = &list[elems];
	/*
	 * Log the sorted list.  We log the whole list so it can be rebuilt.
	 * Don't overflow the log file.
	 */
again:	if (DBC_LOGGING(dbc)) {
		last = 1;
		lpgno = *last_pgno;
		ddbt.size = elems * sizeof(*lp);
		ddbt.data = lp;
		log_size = ((LOG *)dbc->env->
		    lg_handle->reginfo.primary)->log_size;
		/* Split the record if it would exceed half a log file. */
		if (ddbt.size > log_size / 2) {
			elems = (log_size / 2) / sizeof(*lp);
			ddbt.size = elems * sizeof(*lp);
			last = 0;
			/*
			 * If we stopped after the truncation point
			 * then we need to truncate from here.
			 */
			if (lp + elems >= &list[tpoint])
				lpgno = lp[elems - 1].pgno;
		}
		/*
		 * If this is not the beginning of the list fetch the end
		 * of the previous segment.  This page becomes the last_free
		 * page and will link to this segment if it is not truncated.
		 */
		if (lp != list) {
			if ((ret = __memp_fget(mpf, &lp[-1].pgno,
			    dbc->thread_info, txn, 0, &h)) != 0)
				goto err;
		}

		slp = &lp[elems];

		ZERO_LSN(null_lsn);
		if ((ret = __db_pg_trunc_log(dbp, dbc->txn,
		    lsnp, last == 1 ? DB_FLUSH : 0, PGNO_BASE_MD,
		    lsnp, h != NULL ? PGNO(h) : PGNO_INVALID,
		    h != NULL ? &LSN(h) : &null_lsn,
		    free_pgno, lpgno, &ddbt)) != 0)
			goto err;
		if (h != NULL) {
			LSN(h) = *lsnp;
			if ((ret = __memp_fput(mpf,
			    dbc->thread_info, h, dbc->priority)) != 0)
				goto err;
		}
		h = NULL;
	} else if (!in_recovery)
		LSN_NOT_LOGGED(*lsnp);

	/* Relink the surviving free pages in sorted order. */
	for (; lp < slp && lp < &list[tpoint]; lp++) {
		if ((ret = __memp_fget(mpf, &lp->pgno, dbc->thread_info,
		    txn, !in_recovery ? DB_MPOOL_DIRTY : 0, &h)) != 0) {
			/* Page may have been truncated later. */
			if (in_recovery && ret == DB_PAGE_NOTFOUND) {
				ret = 0;
				continue;
			}
			goto err;
		}
		if (in_recovery) {
			/* Only redo the link if the page's LSN matches. */
			if (LOG_COMPARE(&LSN(h), &lp->lsn) == 0) {
				if ((ret = __memp_dirty(mpf, &h,
				    dbc->thread_info,
				    txn, dbp->priority, 0)) != 0) {
					(void)__memp_fput(mpf,
					    dbc->thread_info, h, dbp->priority);
					goto err;
				}
			} else
				goto skip;
		}

		if (lp == &list[tpoint - 1])
			NEXT_PGNO(h) = PGNO_INVALID;
		else
			NEXT_PGNO(h) = lp[1].pgno;
		DB_ASSERT(mpf->env, NEXT_PGNO(h) < *last_pgno);

		LSN(h) = *lsnp;
skip:		if ((ret = __memp_fput(mpf,
		    dbc->thread_info, h, dbp->priority)) != 0)
			goto err;
		h = NULL;
	}

	/*
	 * If we did not log everything try again.  We start from slp and
	 * try to go to the end of the list.
	 */
	if (last == 0) {
		elems = (u_int32_t)(&list[*nelemp] - slp);
		lp = slp;
		goto again;
	}

	/*
	 * Truncate the file.  Its possible that the last page is the
	 * only one that got truncated and that's done in the caller.
	 */
	if (pgno != *last_pgno) {
		if (tpoint != *nelemp &&
		    (ret = __memp_ftruncate(mpf, dbc->txn, dbc->thread_info,
		    pgno + 1, in_recovery ? MP_TRUNC_RECOVER : 0)) != 0)
			goto err;
		if (c_data)
			c_data->compact_pages_truncated += *last_pgno - pgno;
		*last_pgno = pgno;
	}
	*nelemp = tpoint;

	if (0) {
err:		if (h != NULL)
			(void)__memp_fput(mpf,
			    dbc->thread_info, h, dbc->priority);
	}
	return (ret);
}
830
831 /*
832 * __db_free_truncate --
833 * Build a sorted free list and truncate free pages at the end
834 * of the file.
835 *
836 * PUBLIC: #ifdef HAVE_FTRUNCATE
837 * PUBLIC: int __db_free_truncate __P((DB *, DB_THREAD_INFO *, DB_TXN *,
838 * PUBLIC: u_int32_t, DB_COMPACT *, db_pglist_t **, u_int32_t *,
839 * PUBLIC: db_pgno_t *));
840 * PUBLIC: #endif
841 */
int
__db_free_truncate(dbp, ip, txn, flags, c_data, listp, nelemp, last_pgnop)
	DB *dbp;
	DB_THREAD_INFO *ip;
	DB_TXN *txn;
	u_int32_t flags;
	DB_COMPACT *c_data;
	db_pglist_t **listp;
	u_int32_t *nelemp;
	db_pgno_t *last_pgnop;
{
	DBC *dbc;
	DBMETA *meta;
	DB_LOCK metalock;
	DB_MPOOLFILE *mpf;
	ENV *env;
	PAGE *h;
	db_pglist_t *list, *lp;
	db_pgno_t pgno;
	u_int32_t nelems;
	int ret, t_ret;
	size_t size;

	COMPQUIET(flags, 0);
	list = NULL;
	meta = NULL;
	env = dbp->env;
	mpf = dbp->mpf;
	h = NULL;
	nelems = 0;
	if (listp != NULL) {
		*listp = NULL;
		DB_ASSERT(env, nelemp != NULL);
		*nelemp = 0;
	}

	/* Use a write-locking cursor for the meta-page update. */
	if ((ret = __db_cursor(dbp, ip, txn, &dbc, DB_WRITELOCK)) != 0)
		return (ret);

	pgno = PGNO_BASE_MD;
	if ((ret = __db_lget(dbc,
	    LCK_ALWAYS, pgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
		goto err;
	if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn, 0,
	    &meta)) != 0)
		goto err;

	if (last_pgnop != NULL)
		*last_pgnop = meta->last_pgno;
	/* Nothing to do if the free list is empty. */
	if ((pgno = meta->free) == PGNO_INVALID)
		goto done;

	/* Walk the on-disk free list into an in-memory array, growing
	 * the array by doubling as needed. */
	size = 128;
	if ((ret = __os_malloc(env, size * sizeof(*list), &list)) != 0)
		goto err;
	lp = list;

	do {
		if (lp == &list[size]) {
			size *= 2;
			if ((ret = __os_realloc(env,
			    size * sizeof(*list), &list)) != 0)
				goto err;
			lp = &list[size / 2];
		}
		if ((ret = __memp_fget(mpf, &pgno,
		    dbc->thread_info, dbc->txn, 0, &h)) != 0)
			goto err;

		lp->pgno = pgno;
		lp->next_pgno = NEXT_PGNO(h);
		lp->lsn = LSN(h);
		pgno = NEXT_PGNO(h);
		if ((ret = __memp_fput(mpf,
		    dbc->thread_info, h, dbc->priority)) != 0)
			goto err;
		lp++;
	} while (pgno != PGNO_INVALID);
	nelems = (u_int32_t)(lp - list);

	/* We are about to rewrite meta->free/last_pgno; dirty the page. */
	if ((ret = __memp_dirty(mpf,
	    &meta, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
		goto err;

	/* Sort the list */
	__db_freelist_sort(list, nelems);

	/* Relink the list in sorted order and truncate the file tail. */
	if ((ret = __db_pg_truncate(dbc, txn, list, c_data,
	    &nelems, meta->free, &meta->last_pgno, &LSN(meta), 0)) != 0)
		goto err;

	if (nelems == 0)
		meta->free = PGNO_INVALID;
	else
		meta->free = list[0].pgno;

done:	if (last_pgnop != NULL)
		*last_pgnop = meta->last_pgno;

	/*
	 * Set the truncation point which determines which pages may be
	 * relocated.  Pages above are candidates to be swapped with a lower one
	 * from the freelist by __db_exchange_page(); pages before the truncate
	 * point are not relocated.
	 * The truncation point starts as N pages less than the last_pgno, where
	 * N is the size of the free list.  This is reduced by 1/4 in the hope
	 * that partially full pages will be coalesced together, creating
	 * additional free pages during the compact.
	 */
	if (c_data) {
		c_data->compact_truncate = (u_int32_t)meta->last_pgno - nelems;
		if (c_data->compact_truncate > nelems >> 2)
			c_data->compact_truncate -= nelems >> 2;
	}

	/* Hand the sorted list to the caller, transferring ownership. */
	if (nelems != 0 && listp != NULL) {
		*listp = list;
		*nelemp = nelems;
		list = NULL;
	}

err:	if (list != NULL)
		__os_free(env, list);
	if (meta != NULL && (t_ret = __memp_fput(mpf,
	    dbc->thread_info, (PAGE *)meta, dbc->priority)) != 0 && ret == 0)
		ret = t_ret;
	if ((t_ret = __TLPUT(dbc, metalock)) != 0 && ret == 0)
		ret = t_ret;
	if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
		ret = t_ret;
	return (ret);
}
974
/*
 * __db_truncate_freelist --
 *	Truncate the file from list[start] onward: log the affected tail of
 *	the sorted free list, ftruncate the file, and shrink the in-memory
 *	list.  "h" is the page being freed (consumed here on success or
 *	error); "meta" stays owned by the caller.
 */
static int
__db_truncate_freelist(dbc, meta, h, list, start, nelem)
	DBC *dbc;
	DBMETA *meta;
	PAGE *h;
	db_pgno_t *list;
	u_int32_t start, nelem;
{
	DB *dbp;
	DBT ddbt;
	DB_LSN null_lsn;
	DB_MPOOLFILE *mpf;
	PAGE *last_free, *pg;
	db_pgno_t *lp, free_pgno, lpgno;
	db_pglist_t *plist, *pp, *spp;
	u_int32_t elem, log_size;
	int last, ret;

	dbp = dbc->dbp;
	mpf = dbp->mpf;
	plist = NULL;
	last_free = NULL;
	pg = NULL;

	/* The entry before the truncation point becomes the new list tail. */
	if (start != 0 &&
	    (ret = __memp_fget(mpf, &list[start - 1],
	    dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &last_free)) != 0)
		goto err;

	if (DBC_LOGGING(dbc)) {
		/* Collect pgno/LSN/next for each page being truncated. */
		if ((ret = __os_malloc(dbp->env,
		    (nelem - start) * sizeof(*pp), &plist)) != 0)
			goto err;

		pp = plist;
		for (lp = &list[start]; lp < &list[nelem]; lp++) {
			pp->pgno = *lp;
			if ((ret = __memp_fget(mpf, lp,
			    dbc->thread_info, dbc->txn, 0, &pg)) != 0)
				goto err;
			pp->lsn = LSN(pg);
			pp->next_pgno = NEXT_PGNO(pg);
			if ((ret = __memp_fput(mpf,
			    dbc->thread_info, pg, DB_PRIORITY_VERY_LOW)) != 0)
				goto err;
			pg = NULL;
			pp++;
		}
		ZERO_LSN(null_lsn);
		pp = plist;
		elem = nelem - start;
		log_size = ((LOG *)dbc->env->
		    lg_handle->reginfo.primary)->log_size;
		/* Log in segments so no record exceeds half a log file. */
again:		ddbt.data = spp = pp;
		free_pgno = pp->pgno;
		lpgno = meta->last_pgno;
		ddbt.size = elem * sizeof(*pp);
		if (ddbt.size > log_size / 2) {
			elem = (log_size / 2) / (u_int32_t)sizeof(*pp);
			ddbt.size = elem * sizeof(*pp);
			pp += elem;
			elem = (nelem - start) - (u_int32_t)(pp - plist);
			lpgno = pp[-1].pgno;
			last = 0;
		} else
			last = 1;
		/*
		 * Get the page which will link to this section if we abort.
		 * If this is the first segment then its last_free.
		 */
		if (spp == plist)
			pg = last_free;
		else if ((ret = __memp_fget(mpf, &spp[-1].pgno,
		    dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &pg)) != 0)
			goto err;

		/* Flush only the final segment's log record. */
		if ((ret = __db_pg_trunc_log(dbp, dbc->txn,
		    &LSN(meta), last == 1 ? DB_FLUSH : 0,
		    PGNO(meta), &LSN(meta),
		    pg != NULL ? PGNO(pg) : PGNO_INVALID,
		    pg != NULL ? &LSN(pg) : &null_lsn,
		    free_pgno, lpgno, &ddbt)) != 0)
			goto err;
		if (pg != NULL) {
			LSN(pg) = LSN(meta);
			/* last_free is put later, after its NEXT_PGNO fix. */
			if (pg != last_free && (ret = __memp_fput(mpf,
			    dbc->thread_info, pg, DB_PRIORITY_VERY_LOW)) != 0)
				goto err;
			pg = NULL;
		}
		if (last == 0)
			goto again;
	} else
		LSN_NOT_LOGGED(LSN(meta));

	/* Discard the freed page before shrinking the file. */
	if ((ret = __memp_fput(mpf,
	    dbc->thread_info, h, DB_PRIORITY_VERY_LOW)) != 0)
		goto err;
	h = NULL;
	if ((ret = __memp_ftruncate(mpf, dbc->txn, dbc->thread_info,
	    list[start], 0)) != 0)
		goto err;
	meta->last_pgno = list[start] - 1;

	if (start == 0)
		meta->free = PGNO_INVALID;
	else {
		/* Terminate the surviving part of the free list. */
		NEXT_PGNO(last_free) = PGNO_INVALID;
		if ((ret = __memp_fput(mpf,
		    dbc->thread_info, last_free, dbc->priority)) != 0)
			goto err;
		last_free = NULL;
	}

	/* Shrink the number of elements in the list. */
	ret = __memp_extend_freelist(mpf, start, &list);

err:	if (plist != NULL)
		__os_free(dbp->env, plist);

	/* We need to put the page on error. */
	if (h != NULL)
		(void)__memp_fput(mpf, dbc->thread_info, h, dbc->priority);
	if (pg != NULL && pg != last_free)
		(void)__memp_fput(mpf, dbc->thread_info, pg, dbc->priority);
	if (last_free != NULL)
		(void)__memp_fput(mpf,
		    dbc->thread_info, last_free, dbc->priority);

	return (ret);
}
1106 #endif
1107
1108 #ifdef DEBUG
1109 /*
1110 * __db_lprint --
1111 * Print out the list of locks currently held by a cursor.
1112 *
1113 * PUBLIC: int __db_lprint __P((DBC *));
1114 */
1115 int
__db_lprint(dbc)1116 __db_lprint(dbc)
1117 DBC *dbc;
1118 {
1119 DB *dbp;
1120 DB_LOCKREQ req;
1121 ENV *env;
1122
1123 dbp = dbc->dbp;
1124 env = dbp->env;
1125
1126 if (LOCKING_ON(env)) {
1127 req.op = DB_LOCK_DUMP;
1128 (void)__lock_vec(env, dbc->locker, 0, &req, 1, NULL);
1129 }
1130 return (0);
1131 }
1132 #endif
1133
/*
 * __db_lget --
 *	The standard lock get call.
 *
 * PUBLIC: int __db_lget __P((DBC *,
 * PUBLIC:     int, db_pgno_t, db_lockmode_t, u_int32_t, DB_LOCK *));
 */
int
__db_lget(dbc, action, pgno, mode, lkflags, lockp)
	DBC *dbc;		/* Cursor on whose behalf the lock is taken. */
	int action;		/* LCK_* action (e.g. LCK_COUPLE, LCK_ALWAYS). */
	db_pgno_t pgno;		/* Page to lock. */
	db_lockmode_t mode;	/* Requested lock mode. */
	u_int32_t lkflags;	/* DB_LOCK_* flags (may include DB_LOCK_RECORD). */
	DB_LOCK *lockp;		/* In: lock to couple from; out: lock acquired. */
{
	DB *dbp;
	DB_LOCKREQ couple[3], *reqp;
	DB_TXN *txn;
	ENV *env;
	int has_timeout, i, ret;

	dbp = dbc->dbp;
	env = dbp->env;
	txn = dbc->txn;

	/*
	 * We do not always check if we're configured for locking before
	 * calling __db_lget to acquire the lock.
	 *
	 * Cases where no page lock is needed: CDB locking (per-database
	 * locks only), locking disabled, a snapshot read in an MVCC
	 * transaction, a cursor already marked DONTLOCK, recovery (except
	 * rollback on a master), and off-page-duplicate cursors (which are
	 * protected by the primary's locks) unless LCK_ALWAYS is specified.
	 * In all of these, hand back an initialized-but-unset lock.
	 */
	if (CDB_LOCKING(env) || !LOCKING_ON(env) ||
	    (MULTIVERSION(dbp) && mode == DB_LOCK_READ &&
	    dbc->txn != NULL && F_ISSET(dbc->txn, TXN_SNAPSHOT)) ||
	    F_ISSET(dbc, DBC_DONTLOCK) || (F_ISSET(dbc, DBC_RECOVER) &&
	    (action != LCK_ROLLBACK || IS_REP_CLIENT(env))) ||
	    (action != LCK_ALWAYS && F_ISSET(dbc, DBC_OPD))) {
		LOCK_INIT(*lockp);
		return (0);
	}

	/*
	 * If the transaction enclosing this cursor has DB_LOCK_NOWAIT set,
	 * pass that along to the lock call.
	 */
	if (DB_NONBLOCK(dbc))
		lkflags |= DB_LOCK_NOWAIT;

	/*
	 * If we're trying to run in exclusive mode, attempt to get an
	 * exclusive database lock.  If it is not available then wait
	 * for the lock on the database and clear the exclusive bit.
	 *
	 * If we get an exclusive lock on the database, mark the cursor
	 * with DBC_DONTLOCK to avoid any further locking.
	 */
	if (F_ISSET(dbp->mpf->mfp, MP_DATABASE_LOCKING)) {
		dbc->lock.type = DB_DATABASE_LOCK;
		dbc->lock.pgno = PGNO_BASE_MD;
		if ((ret = __lock_get(env, dbc->locker, DB_LOCK_NOWAIT,
		    &dbc->lock_dbt, F_ISSET(dbp, DB_AM_RDONLY) ?
		    DB_LOCK_READ : DB_LOCK_WRITE, lockp)) == 0) {
			/*
			 * Re-check the flag: another thread may have cleared
			 * it while we were acquiring the lock.
			 */
			if (F_ISSET(dbp->mpf->mfp, MP_DATABASE_LOCKING)) {
				F_SET(dbc, DBC_DONTLOCK);
				/*
				 * Stash the lock on the cursor for
				 * non-transactional use; callers see an
				 * initialized (empty) lock either way.
				 */
				if (!IS_REAL_TXN(txn))
					dbc->mylock = *lockp;
				LOCK_INIT(*lockp);
				return (0);
			}
		} else if (ret == DB_LOCK_NOTGRANTED &&
		    (lkflags & DB_LOCK_NOWAIT) == 0) {
			/*
			 * Someone else holds the database exclusively: block
			 * until they are done, then turn off exclusive mode
			 * and fall through to normal page locking.
			 */
			if ((ret = __lock_get(env, dbc->locker, 0,
			    &dbc->lock_dbt, DB_LOCK_WRITE, lockp)) != 0)
				return (ret);
			F_CLR(dbp->mpf->mfp, MP_DATABASE_LOCKING);
			if ((ret = __lock_put(env, lockp)) != 0)
				return (ret);
			LOCK_INIT(*lockp);
		} else if (ret != 0)
			return (ret);
	}

	/* Set up the lock object: record lock vs. page lock. */
	dbc->lock.pgno = pgno;
	if (lkflags & DB_LOCK_RECORD)
		dbc->lock.type = DB_RECORD_LOCK;
	else
		dbc->lock.type = DB_PAGE_LOCK;
	lkflags &= ~DB_LOCK_RECORD;

	/* Degraded-isolation cursors take dirty reads. */
	if (F_ISSET(dbc, DBC_READ_UNCOMMITTED) && mode == DB_LOCK_READ)
		mode = DB_LOCK_READ_UNCOMMITTED;

	has_timeout = F_ISSET(dbc, DBC_RECOVER) ||
	    (txn != NULL && F_ISSET(txn, TXN_LOCKTIMEOUT));

	/*
	 * Transactional locking.
	 * Hold on to the previous read lock only if we are in full isolation.
	 * COUPLE_ALWAYS indicates we are holding an interior node which need
	 * not be isolated.
	 * Downgrade write locks if we are supporting dirty readers and the
	 * update did not have an error.
	 */
	if ((action != LCK_COUPLE && action != LCK_COUPLE_ALWAYS) ||
	    !LOCK_ISSET(*lockp))
		action = 0;
	else if (dbc->txn == NULL || action == LCK_COUPLE_ALWAYS)
		action = LCK_COUPLE;
	else if (F_ISSET(dbc, DBC_READ_COMMITTED | DBC_WAS_READ_COMMITTED) &&
	    lockp->mode == DB_LOCK_READ)
		action = LCK_COUPLE;
	else if (lockp->mode == DB_LOCK_READ_UNCOMMITTED)
		action = LCK_COUPLE;
	else if (F_ISSET(dbc->dbp, DB_AM_READ_UNCOMMITTED) &&
	    !F_ISSET(dbc, DBC_ERROR) && lockp->mode == DB_LOCK_WRITE)
		action = LCK_DOWNGRADE;
	else
		action = 0;

	/*
	 * Build a request vector in couple[]:
	 *   LCK_DOWNGRADE: [WWRITE re-get of old lock][GET new][PUT old]
	 *   LCK_COUPLE:                              [GET new][PUT old]
	 *   otherwise:     a single __lock_get, unless a timeout forces
	 *                  the vector form (DB_LOCK_GET_TIMEOUT).
	 */
	i = 0;
	switch (action) {
	default:
		if (has_timeout)
			goto do_couple;
		ret = __lock_get(env,
		    dbc->locker, lkflags, &dbc->lock_dbt, mode, lockp);
		break;

	case LCK_DOWNGRADE:
		couple[0].op = DB_LOCK_GET;
		couple[0].obj = NULL;
		couple[0].lock = *lockp;
		couple[0].mode = DB_LOCK_WWRITE;
		UMRW_SET(couple[0].timeout);
		i++;
		/* FALLTHROUGH */
	case LCK_COUPLE:
do_couple:	couple[i].op = has_timeout? DB_LOCK_GET_TIMEOUT : DB_LOCK_GET;
		couple[i].obj = &dbc->lock_dbt;
		couple[i].mode = mode;
		UMRW_SET(couple[i].timeout);
		i++;
		/* Recovery gets an immediate timeout (0). */
		if (has_timeout)
			couple[0].timeout =
			     F_ISSET(dbc, DBC_RECOVER) ? 0 : txn->lock_timeout;
		if (action == LCK_COUPLE || action == LCK_DOWNGRADE) {
			couple[i].op = DB_LOCK_PUT;
			couple[i].lock = *lockp;
			i++;
		}

		ret = __lock_vec(env,
		    dbc->locker, lkflags, couple, i, &reqp);
		/*
		 * If everything succeeded, or only the trailing PUT of the
		 * old lock failed (reqp points at the failing request), the
		 * new lock was still acquired: return it.  With no PUT
		 * (i == 1) the new lock is couple[0], otherwise it is the
		 * request just before the PUT, couple[i - 2].
		 */
		if (ret == 0 || reqp == &couple[i - 1])
			*lockp = i == 1 ? couple[0].lock : couple[i - 2].lock;
		break;
	}

	if (txn != NULL && ret == DB_LOCK_DEADLOCK)
		F_SET(txn, TXN_DEADLOCK);
	/*
	 * Unless the application asked to distinguish timeouts
	 * (DB_ENV_TIME_NOTGRANTED), report NOTGRANTED as DEADLOCK.
	 */
	return ((ret == DB_LOCK_NOTGRANTED && !F_ISSET(env->dbenv,
	     DB_ENV_TIME_NOTGRANTED)) ? DB_LOCK_DEADLOCK : ret);
}
1296
1297 #ifdef DIAGNOSTIC
1298 /*
1299 * __db_haslock --
1300 * Determine if this locker holds a particular lock.
1301 * Returns 0 if lock is held, non-zero otherwise.
1302 *
1303 * PUBLIC: #ifdef DIAGNOSTIC
1304 * PUBLIC: int __db_haslock __P((ENV *, DB_LOCKER *,
1305 * PUBLIC: DB_MPOOLFILE *, db_pgno_t, db_lockmode_t, u_int32_t));
1306 * PUBLIC: #endif
1307 */
1308 int
__db_haslock(env,locker,dbmfp,pgno,mode,type)1309 __db_haslock(env, locker, dbmfp, pgno, mode, type)
1310 ENV *env;
1311 DB_LOCKER *locker;
1312 DB_MPOOLFILE *dbmfp;
1313 db_pgno_t pgno;
1314 db_lockmode_t mode;
1315 u_int32_t type;
1316 {
1317 DBT lkdata;
1318 DB_LOCK lock;
1319 DB_LOCK_ILOCK ilock;
1320
1321 memset(&lkdata, 0, sizeof(lkdata));
1322 lkdata.data = &ilock;
1323 lkdata.size = sizeof(ilock);
1324
1325 memcpy(ilock.fileid, dbmfp->fileid, DB_FILE_ID_LEN);
1326 ilock.pgno = pgno;
1327 ilock.type = type;
1328
1329 return (__lock_get(env, locker, DB_LOCK_CHECK, &lkdata, mode, &lock));
1330 }
1331 /*
1332 * __db_has_pagelock --
1333 * Determine if this locker holds a particular page lock.
1334 * Returns 0 if lock is held, non-zero otherwise.
1335 *
1336 * PUBLIC: #ifdef DIAGNOSTIC
1337 * PUBLIC: int __db_has_pagelock __P((ENV *, DB_LOCKER *,
1338 * PUBLIC: DB_MPOOLFILE *, PAGE *, db_lockmode_t));
1339 * PUBLIC: #endif
1340 */
1341 int
__db_has_pagelock(env,locker,dbmfp,pagep,mode)1342 __db_has_pagelock(env, locker, dbmfp, pagep, mode)
1343 ENV *env;
1344 DB_LOCKER *locker;
1345 DB_MPOOLFILE *dbmfp;
1346 PAGE *pagep;
1347 db_lockmode_t mode;
1348 {
1349 int ret;
1350
1351 switch (pagep->type) {
1352 case P_OVERFLOW:
1353 case P_INVALID:
1354 case P_QAMDATA:
1355 case P_QAMMETA:
1356 case P_IHEAP:
1357 return (0);
1358 case P_HASH:
1359 if (PREV_PGNO(pagep) != PGNO_INVALID)
1360 return (0);
1361 break;
1362 default:
1363 break;
1364 }
1365 if ((ret = __db_haslock(env,
1366 locker, dbmfp, pagep->pgno, mode, DB_PAGE_LOCK)) != 0)
1367 ret = __db_haslock(env,
1368 locker, dbmfp, PGNO_BASE_MD, mode, DB_DATABASE_LOCK);
1369 return (ret);
1370 }
1371 #endif
1372
1373 /*
1374 * __db_lput --
1375 * The standard lock put call.
1376 *
1377 * PUBLIC: int __db_lput __P((DBC *, DB_LOCK *));
1378 */
1379 int
__db_lput(dbc,lockp)1380 __db_lput(dbc, lockp)
1381 DBC *dbc;
1382 DB_LOCK *lockp;
1383 {
1384 DB_LOCKREQ couple[2], *reqp;
1385 ENV *env;
1386 int action, ret;
1387
1388 /*
1389 * Transactional locking.
1390 * Hold on to the read locks only if we are in full isolation.
1391 * Downgrade write locks if we are supporting dirty readers unless
1392 * there was an error.
1393 */
1394 if (F_ISSET(dbc->dbp, DB_AM_READ_UNCOMMITTED) &&
1395 !F_ISSET(dbc, DBC_ERROR) && lockp->mode == DB_LOCK_WRITE)
1396 action = LCK_DOWNGRADE;
1397 else if (dbc->txn == NULL)
1398 action = LCK_COUPLE;
1399 else if (F_ISSET(dbc, DBC_READ_COMMITTED | DBC_WAS_READ_COMMITTED) &&
1400 lockp->mode == DB_LOCK_READ)
1401 action = LCK_COUPLE;
1402 else if (lockp->mode == DB_LOCK_READ_UNCOMMITTED)
1403 action = LCK_COUPLE;
1404 else
1405 action = 0;
1406
1407 env = dbc->env;
1408 switch (action) {
1409 case LCK_COUPLE:
1410 ret = __lock_put(env, lockp);
1411 break;
1412 case LCK_DOWNGRADE:
1413 couple[0].op = DB_LOCK_GET;
1414 couple[0].obj = NULL;
1415 couple[0].mode = DB_LOCK_WWRITE;
1416 couple[0].lock = *lockp;
1417 UMRW_SET(couple[0].timeout);
1418 couple[1].op = DB_LOCK_PUT;
1419 couple[1].lock = *lockp;
1420 ret = __lock_vec(env, dbc->locker, 0, couple, 2, &reqp);
1421 if (ret == 0 || reqp == &couple[1])
1422 *lockp = couple[0].lock;
1423 break;
1424 default:
1425 ret = 0;
1426 break;
1427 }
1428
1429 return (ret);
1430 }
1431