1 /*-
2 * Copyright (c) 1999, 2020 Oracle and/or its affiliates. All rights reserved.
3 *
4 * See the file LICENSE for license information.
5 *
6 * $Id$
7 */
8
9 #include "db_config.h"
10
11 #include "db_int.h"
12 #include "dbinc/db_page.h"
13 #include "dbinc/btree.h"
14 #include "dbinc/hash.h"
15 #include "dbinc/lock.h"
16 #include "dbinc/mp.h"
17 #include "dbinc/txn.h"
18 #include "dbinc/fop.h"
19 #ifdef HAVE_FTRUNCATE
20 static int __db_free_freelist __P((DB *, DB_THREAD_INFO *, DB_TXN *));
21 static int __db_setup_freelist __P((DB *, db_pglist_t *, u_int32_t));
22 #endif
23
/*
 * SAVE_START --
 *	Snapshot the compaction statistics and copy the current start key
 *	into save_start, so that a deadlocked iteration can be rolled back
 *	with RESTORE_START.  Sets "ret"; the caller must check it.
 */
#define	SAVE_START							\
	do {								\
		save_data = *c_data;					\
		ret = __db_retcopy(env,					\
		    &save_start, current.data, current.size,		\
		    &save_start.data, &save_start.ulen);		\
	} while (0)
31
32 /*
33 * Only restore those things that are negated by aborting the
34 * transaction. We don't restore the number of deadlocks, for example.
35 */
36
/*
 * RESTORE_START --
 *	Undo the per-iteration statistics and reset the current start key
 *	from the values captured by SAVE_START.  Sets "ret"; the caller
 *	must check it.
 *	(Repaired: "&current" had been mangled to the "&curren;" HTML
 *	entity by a bad conversion.)
 */
#define	RESTORE_START							\
	do {								\
		c_data->compact_pages_free =				\
		    save_data.compact_pages_free;			\
		c_data->compact_levels = save_data.compact_levels;	\
		c_data->compact_truncate = save_data.compact_truncate;	\
		c_data->compact_empty_buckets =				\
		    save_data.compact_empty_buckets;			\
		ret = __db_retcopy(env, &current,			\
		    save_start.data, save_start.size,			\
		    &current.data, &current.ulen);			\
	} while (0)
49
50 /*
51 * __db_compact_int -- compact a database.
52 *
53 * PUBLIC: int __db_compact_int __P((DB *, DB_THREAD_INFO *, DB_TXN *,
54 * PUBLIC: DBT *, DBT *, DB_COMPACT *, u_int32_t, DBT *));
55 */
56 int
__db_compact_int(dbp,ip,txn,start,stop,c_data,flags,end)57 __db_compact_int(dbp, ip, txn, start, stop, c_data, flags, end)
58 DB *dbp;
59 DB_THREAD_INFO *ip;
60 DB_TXN *txn;
61 DBT *start, *stop;
62 DB_COMPACT *c_data;
63 u_int32_t flags;
64 DBT *end;
65 {
66 DBC *dbc;
67 DBT current, save_start;
68 DB_COMPACT save_data;
69 DB_TXN *txn_orig;
70 ENV *env;
71 u_int32_t factor, retry;
72 int deadlock, have_freelist, isdone, ret, span, t_ret, txn_local;
73
74 #ifdef HAVE_HASH
75 u_int32_t empty_buckets;
76 #endif
77
78 #ifdef HAVE_FTRUNCATE
79 db_pglist_t *list;
80 db_pgno_t last_pgno;
81 u_int32_t nelems, truncated;
82 #endif
83 env = dbp->env;
84
85 memset(¤t, 0, sizeof(current));
86 memset(&save_start, 0, sizeof(save_start));
87 dbc = NULL;
88 factor = 0;
89 have_freelist = deadlock = isdone = span = 0;
90 ret = retry = 0;
91 txn_orig = txn;
92
93 #ifdef HAVE_FTRUNCATE
94 list = NULL;
95 last_pgno = 0;
96 nelems = truncated = 0;
97 #endif
98
99 /*
100 * We pass "current" to the internal routine, indicating where that
101 * routine should begin its work and expecting that it will return to
102 * us the last key that it processed.
103 */
104 if (start != NULL && (ret = __db_retcopy(env,
105 ¤t, start->data, start->size,
106 ¤t.data, ¤t.ulen)) != 0)
107 return (ret);
108 #ifdef HAVE_HASH
109 empty_buckets = c_data->compact_empty_buckets;
110 #endif
111
112 if (IS_DB_AUTO_COMMIT(dbp, txn)) {
113 txn_local = 1;
114 LF_SET(DB_AUTO_COMMIT);
115 } else
116 txn_local = 0;
117 if (!LF_ISSET(DB_FREE_SPACE | DB_FREELIST_ONLY))
118 goto no_free;
119 if (LF_ISSET(DB_FREELIST_ONLY))
120 LF_SET(DB_FREE_SPACE);
121
122 #ifdef HAVE_FTRUNCATE
123 /* Sort the freelist and set up the in-memory list representation. */
124 if (txn_local && (ret = __txn_begin(env, ip, txn_orig, &txn, 0)) != 0)
125 goto err;
126
127 if ((ret = __db_free_truncate(dbp, ip,
128 txn, flags, c_data, &list, &nelems, &last_pgno)) != 0) {
129 LF_CLR(DB_FREE_SPACE);
130 goto terr;
131 }
132
133 /* If the freelist is empty and we are not filling, get out. */
134 if (nelems == 0 && LF_ISSET(DB_FREELIST_ONLY)) {
135 ret = 0;
136 LF_CLR(DB_FREE_SPACE);
137 goto terr;
138 }
139 if ((ret = __db_setup_freelist(dbp, list, nelems)) != 0) {
140 /* Someone else owns the free list. */
141 if (ret == EBUSY)
142 ret = 0;
143 }
144 if (ret == 0)
145 have_freelist = 1;
146
147 /* Commit the txn and release the meta page lock. */
148 terr: if (txn_local) {
149 if ((t_ret = __txn_commit(txn, DB_TXN_NOSYNC)) != 0 && ret == 0)
150 ret = t_ret;
151 txn = NULL;
152 }
153 if (ret != 0)
154 goto err;
155
156 /* Save the number truncated so far, we will add what we get below. */
157 truncated = c_data->compact_pages_truncated;
158 if (LF_ISSET(DB_FREELIST_ONLY))
159 goto done;
160 #endif
161
162 /*
163 * We want factor to be the target number of free bytes on each page,
164 * so we know when to stop adding items to a page. Make sure to
165 * subtract the page overhead when computing this target. This can
166 * result in a 1-2% error on the smallest page.
167 * First figure out how many bytes we should use:
168 */
169 no_free:
170 factor = dbp->pgsize - SIZEOF_PAGE;
171 if (c_data->compact_fillpercent != 0) {
172 factor *= c_data->compact_fillpercent;
173 factor /= 100;
174 }
175 /* Now convert to the number of free bytes to target. */
176 factor = (dbp->pgsize - SIZEOF_PAGE) - factor;
177
178 if (c_data->compact_pages == 0)
179 c_data->compact_pages = DB_MAX_PAGES;
180
181 do {
182 deadlock = 0;
183
184 SAVE_START;
185 if (ret != 0)
186 break;
187
188 if (txn_local) {
189 if ((ret =
190 __txn_begin(env, ip, txn_orig, &txn, 0)) != 0)
191 break;
192
193 if (c_data->compact_timeout != 0 &&
194 (ret = __txn_set_timeout(txn,
195 c_data->compact_timeout, DB_SET_LOCK_TIMEOUT)) != 0)
196 goto err;
197 }
198
199 if ((ret = __db_cursor(dbp, ip, txn, &dbc, 0)) != 0)
200 goto err;
201
202 #ifdef HAVE_HASH
203 if (dbp->type == DB_HASH)
204 ret = __ham_compact_int(dbc,
205 ¤t, stop, factor, c_data, &isdone, flags);
206 else
207 #endif
208 ret = __bam_compact_int(dbc, ¤t, stop, factor,
209 &span, c_data, &isdone);
210 if (ret == DB_LOCK_DEADLOCK && txn_local) {
211 /*
212 * We retry on deadlock. Cancel the statistics
213 * and reset the start point to before this
214 * iteration.
215 */
216 deadlock = 1;
217 c_data->compact_deadlock++;
218 RESTORE_START;
219 }
220 /*
221 * If we could not get a lock while holding an internal
222 * node latched, commit the current local transaction otherwise
223 * report a deadlock.
224 */
225 if (ret == DB_LOCK_NOTGRANTED) {
226 if (txn_local || retry++ < 5)
227 ret = 0;
228 else
229 ret = DB_LOCK_DEADLOCK;
230 } else
231 retry = 0;
232
233 if ((t_ret = __dbc_close(dbc)) != 0 && ret == 0)
234 ret = t_ret;
235
236 err: if (txn_local && txn != NULL) {
237 if (ret == 0 && deadlock == 0)
238 ret = __txn_commit(txn, DB_TXN_NOSYNC);
239 else if ((t_ret = __txn_abort(txn)) != 0 && ret == 0)
240 ret = t_ret;
241 txn = NULL;
242 }
243 DB_ASSERT(env, ip == NULL || ip->dbth_pincount == 0);
244 } while (ret == 0 && !isdone);
245
246 if (ret == 0 && end != NULL)
247 ret = __db_retcopy(env, end, current.data, current.size,
248 &end->data, &end->ulen);
249 if (current.data != NULL)
250 __os_free(env, current.data);
251 if (save_start.data != NULL)
252 __os_free(env, save_start.data);
253
254 #ifdef HAVE_FTRUNCATE
255 /*
256 * Finish up truncation work. If there are pages left in the free
257 * list we can try to move the internal structures around so that we
258 * can remove more pages from the file.
259 * For BTREE search the internal nodes of the tree as we may have
260 * missed some while walking the leaf nodes.
261 * For HASH we will compact the hash table itself, moving segments
262 * to lower number pages where possible.
263 * Then calculate how many pages we have truncated and release
264 * the in-memory free list.
265 */
266 done: if (LF_ISSET(DB_FREE_SPACE)) {
267 DBMETA *meta;
268 db_pgno_t pgno;
269 int pgs_done;
270
271 pgno = PGNO_BASE_MD;
272 isdone = 1;
273 pgs_done = 0;
274 if (ret == 0 && !LF_ISSET(DB_FREELIST_ONLY) &&
275 __memp_fget(dbp->mpf, &pgno, ip, txn, 0, &meta) == 0) {
276 isdone = meta->free == PGNO_INVALID;
277 ret = __memp_fput(dbp->mpf, ip, meta, dbp->priority);
278 }
279
280 #ifdef HAVE_HASH
281 if (dbp->type == DB_HASH) {
282 c_data->compact_empty_buckets -= empty_buckets;
283 if (!isdone || c_data->compact_empty_buckets != 0)
284 ret = __ham_compact_hash(dbp,
285 ip, txn_orig, c_data);
286 c_data->compact_empty_buckets += empty_buckets;
287 } else
288 #endif
289 if (!isdone)
290 ret = __bam_truncate_ipages(dbp,
291 ip, txn_orig, c_data, &pgs_done);
292
293 /* Clean up the free list. */
294 if (list != NULL)
295 __os_free(env, list);
296
297 if ((t_ret =
298 __memp_fget(dbp->mpf, &pgno, ip, txn, 0, &meta)) == 0) {
299 c_data->compact_pages_truncated =
300 truncated + last_pgno - meta->last_pgno;
301 if ((t_ret = __memp_fput(dbp->mpf, ip,
302 meta, dbp->priority)) != 0 && ret == 0)
303 ret = t_ret;
304 } else if (ret == 0)
305 ret = t_ret;
306
307 if (have_freelist && (t_ret =
308 __db_free_freelist(dbp, ip, txn_orig)) != 0 && ret == 0)
309 t_ret = ret;
310 }
311 #endif
312
313 return (ret);
314 }
315
316 #ifdef HAVE_FTRUNCATE
317 static int
__db_setup_freelist(dbp,list,nelems)318 __db_setup_freelist(dbp, list, nelems)
319 DB *dbp;
320 db_pglist_t *list;
321 u_int32_t nelems;
322 {
323 DB_MPOOLFILE *mpf;
324 db_pgno_t *plist;
325 int ret;
326
327 mpf = dbp->mpf;
328
329 if ((ret = __memp_alloc_freelist(mpf, nelems, &plist)) != 0)
330 return (ret);
331
332 while (nelems-- != 0)
333 *plist++ = list++->pgno;
334
335 return (0);
336 }
337
338 static int
__db_free_freelist(dbp,ip,txn)339 __db_free_freelist(dbp, ip, txn)
340 DB *dbp;
341 DB_THREAD_INFO *ip;
342 DB_TXN *txn;
343 {
344 DBC *dbc;
345 DB_LOCK lock;
346 int auto_commit, ret, t_ret;
347
348 LOCK_INIT(lock);
349 auto_commit = ret = 0;
350
351 /*
352 * If we are not in a transaction then we need to get
353 * a lock on the meta page, otherwise we should already
354 * have the lock.
355 */
356
357 dbc = NULL;
358 if (IS_DB_AUTO_COMMIT(dbp, txn)) {
359 /*
360 * We must not timeout the lock or we will not free the list.
361 * We ignore errors from txn_begin as there is little that
362 * the application can do with the error and we want to
363 * get the lock and free the list if at all possible.
364 */
365 if (__txn_begin(dbp->env, ip, txn, &txn, 0) == 0) {
366 (void)__lock_set_timeout(dbp->env,
367 txn->locker, 0, DB_SET_TXN_TIMEOUT);
368 (void)__lock_set_timeout(dbp->env,
369 txn->locker, 0, DB_SET_LOCK_TIMEOUT);
370 auto_commit = 1;
371 }
372 /* Get a cursor so we can call __db_lget. */
373 if ((ret = __db_cursor(dbp, ip, txn, &dbc, 0)) != 0)
374 return (ret);
375
376 if ((ret = __db_lget(dbc,
377 0, PGNO_BASE_MD, DB_LOCK_WRITE, 0, &lock)) != 0)
378 goto err;
379 }
380
381 ret = __memp_free_freelist(dbp->mpf);
382
383 err: if (dbc != NULL && (t_ret = __LPUT(dbc, lock)) != 0 && ret == 0)
384 ret = t_ret;
385
386 if (dbc != NULL && (t_ret = __dbc_close(dbc)) != 0 && ret == 0)
387 ret = t_ret;
388
389 if (auto_commit && (t_ret = __txn_abort(txn)) != 0 && ret == 0)
390 ret = t_ret;
391
392 return (ret);
393 }
394 #endif
395
396 /*
397 * __db_exchange_page -- try to move a page 'down', to earlier in the file.
398 *
399 * This tries to move a page to a lower location the file, by swapping it
400 * with an earlier free page. The free page comes either from the free list or
401 * the newpgno parameter (e.g., __ham_compact_hash()). If the new page turns
402 * out to be higher than the original one, the allocation is undone and
403 * the caller is left unchanged. After a successful swap, this routine can
404 * optionally free the old, higher numbered page.
405 * The cursor's stack includes at least the immediate parent of this page.
406 *
407 * PUBLIC: int __db_exchange_page
408 * PUBLIC: __P((DBC *, PAGE **, PAGE *, db_pgno_t, int, int *));
409 */
410 int
__db_exchange_page(dbc,pgp,opg,newpgno,flags,pgs_donep)411 __db_exchange_page(dbc, pgp, opg, newpgno, flags, pgs_donep)
412 DBC *dbc;
413 PAGE **pgp, *opg;
414 db_pgno_t newpgno;
415 int flags;
416 int *pgs_donep;
417 {
418 BTREE_CURSOR *cp;
419 DB *dbp;
420 DBT data, *dp, hdr;
421 DB_LSN lsn;
422 DB_LOCK lock;
423 EPG *epg;
424 PAGE *newpage;
425 db_pgno_t oldpgno, *pgnop;
426 int ret;
427
428 COMPQUIET(oldpgno, 0);
429
430 DB_ASSERT(NULL, dbc != NULL);
431 dbp = dbc->dbp;
432 LOCK_INIT(lock);
433
434 /*
435 * We want to free a page that lives in the part of the file that
436 * can be truncated, so we're going to move it onto a free page
437 * that is in the part of the file that need not be truncated.
438 * In the case of compacting hash table segments the caller already
439 * identified a contiguous set of pages to use. Otherwise
440 * since the freelist is ordered now, we can simply call __db_new
441 * which will grab the first element off the freelist; we know this
442 * is the lowest numbered free page.
443 */
444 if (newpgno != PGNO_INVALID) {
445 if ((ret = __memp_fget(dbp->mpf, &newpgno,
446 dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &newpage)) != 0)
447 return (ret);
448 } else if ((ret = __db_new(dbc, P_DONTEXTEND | TYPE(*pgp),
449 STD_LOCKING(dbc) && TYPE(*pgp) != P_OVERFLOW ? &lock : NULL,
450 &newpage)) != 0)
451 return (ret);
452
453 /*
454 * If newpage is null then __db_new would have had to allocate
455 * a new page from the filesystem, so there is no reason
456 * to continue this action.
457 */
458 if (newpage == NULL)
459 return (0);
460
461 /*
462 * It is possible that a higher page is allocated if other threads
463 * are allocating at the same time, if so, just put it back.
464 */
465 if (PGNO(newpage) > PGNO(*pgp)) {
466 /* It is unfortunate but you can't just free a new overflow. */
467 /* !!! Is the above comment still true? */
468 /* !!! Should __db_new(OVERFLOW) zero OV_LEN()? */
469 if (TYPE(newpage) == P_OVERFLOW)
470 OV_LEN(newpage) = 0;
471 if ((ret = __LPUT(dbc, lock)) != 0)
472 return (ret);
473 return (__db_free(dbc, newpage, 0));
474 }
475
476 /* Log if necessary. */
477 if (DBC_LOGGING(dbc)) {
478 memset(&hdr, 0, sizeof(hdr));
479 hdr.data = *pgp;
480 hdr.size = P_OVERHEAD(dbp);
481 memset(&data, 0, sizeof(data));
482 dp = &data;
483 switch (TYPE(*pgp)) {
484 case P_OVERFLOW:
485 data.data = (u_int8_t *)*pgp + P_OVERHEAD(dbp);
486 data.size = OV_LEN(*pgp);
487 break;
488 case P_BTREEMETA:
489 hdr.size = sizeof(BTMETA);
490 dp = NULL;
491 break;
492 case P_HASHMETA:
493 hdr.size = sizeof(HMETA);
494 dp = NULL;
495 break;
496 default:
497 data.data = (u_int8_t *)*pgp + HOFFSET(*pgp);
498 data.size = dbp->pgsize - HOFFSET(*pgp);
499 hdr.size += NUM_ENT(*pgp) * sizeof(db_indx_t);
500 }
501 if ((ret = __db_merge_log(dbp, dbc->txn,
502 &LSN(newpage), 0, PGNO(newpage), &LSN(newpage),
503 PGNO(*pgp), &LSN(*pgp), &hdr, dp, 1)) != 0)
504 goto err;
505 } else
506 LSN_NOT_LOGGED(LSN(newpage));
507
508 oldpgno = PGNO(*pgp);
509 newpgno = PGNO(newpage);
510 lsn = LSN(newpage);
511 memcpy(newpage, *pgp, dbp->pgsize);
512 PGNO(newpage) = newpgno;
513 LSN(newpage) = lsn;
514
515 /* Empty the old page. */
516 if ((ret = __memp_dirty(dbp->mpf,
517 pgp, dbc->thread_info, dbc->txn, dbc->priority, 0)) != 0)
518 goto err;
519 if (TYPE(*pgp) == P_OVERFLOW)
520 OV_LEN(*pgp) = 0;
521 else {
522 HOFFSET(*pgp) = dbp->pgsize;
523 NUM_ENT(*pgp) = 0;
524 }
525 LSN(*pgp) = lsn;
526
527 /* Update siblings. */
528 switch (TYPE(newpage)) {
529 case P_OVERFLOW:
530 case P_LBTREE:
531 case P_LRECNO:
532 case P_LDUP:
533 case P_HASH:
534 if (NEXT_PGNO(newpage) == PGNO_INVALID &&
535 PREV_PGNO(newpage) == PGNO_INVALID)
536 break;
537 if ((ret = __db_relink(dbc, *pgp, opg, PGNO(newpage))) != 0)
538 goto err;
539 break;
540 default:
541 break;
542 }
543
544 /*
545 * For HASH we may reuse the old page for an even higher numbered
546 * page. Otherwise we free the old page.
547 */
548 if (!LF_ISSET(DB_EXCH_FREE)) {
549 NEXT_PGNO(*pgp) = PREV_PGNO(*pgp) = PGNO_INVALID;
550 ret = __memp_fput(dbp->mpf,
551 dbc->thread_info, *pgp, dbc->priority);
552 } else
553 ret = __db_free(dbc, *pgp, 0);
554 *pgp = newpage;
555
556 if (ret != 0)
557 return (ret);
558
559 if (!LF_ISSET(DB_EXCH_PARENT))
560 goto done;
561
562 /* Update the parent. */
563 cp = (BTREE_CURSOR *)dbc->internal;
564 epg = &cp->csp[-1];
565
566 switch (TYPE(epg->page)) {
567 case P_IBTREE:
568 pgnop = &GET_BINTERNAL(dbp, epg->page, epg->indx)->pgno;
569 break;
570 case P_IRECNO:
571 pgnop = &GET_RINTERNAL(dbp, epg->page, epg->indx)->pgno;
572 break;
573 case P_LBTREE:
574 case P_LRECNO:
575 case P_LDUP:
576 pgnop = &GET_BOVERFLOW(dbp, epg->page, epg->indx)->pgno;
577 break;
578 default:
579 return (__db_pgfmt(dbp->env, PGNO(epg->page)));
580 }
581 DB_ASSERT(dbp->env, oldpgno == *pgnop);
582 if (DBC_LOGGING(dbc)) {
583 if ((ret = __db_pgno_log(dbp, dbc->txn, &LSN(epg->page),
584 0, PGNO(epg->page), &LSN(epg->page), (u_int32_t)epg->indx,
585 *pgnop, PGNO(newpage))) != 0)
586 return (ret);
587 } else
588 LSN_NOT_LOGGED(LSN(epg->page));
589
590 *pgnop = PGNO(newpage);
591 cp->csp->page = newpage;
592 if ((ret = __TLPUT(dbc, lock)) != 0)
593 return (ret);
594
595 done:
596 (*pgs_donep)++;
597 return (0);
598
599 err: (void)__memp_fput(dbp->mpf, dbc->thread_info, newpage, dbc->priority);
600 (void)__TLPUT(dbc, lock);
601 return (ret);
602 }
603
604 /*
605 * __db_truncate_overflow -- find overflow pages to truncate.
606 * Walk the pages of an overflow chain and swap out
607 * high numbered pages. We are passed the first page
608 * but only deal with the second and subsequent pages.
609 * PUBLIC: int __db_truncate_overflow __P((DBC *, db_pgno_t,
610 * PUBLIC: PAGE **, DB_COMPACT *, int *));
611 */
612 int
__db_truncate_overflow(dbc,pgno,ppg,c_data,pgs_donep)613 __db_truncate_overflow(dbc, pgno, ppg, c_data, pgs_donep)
614 DBC *dbc;
615 db_pgno_t pgno;
616 PAGE **ppg;
617 DB_COMPACT *c_data;
618 int *pgs_donep;
619 {
620 DB *dbp;
621 DB_LOCK lock;
622 PAGE *page;
623 db_pgno_t ppgno;
624 int have_lock, ret, t_ret;
625
626 dbp = dbc->dbp;
627 page = NULL;
628 LOCK_INIT(lock);
629 have_lock = ppg == NULL;
630
631 if ((ret = __memp_fget(dbp->mpf, &pgno,
632 dbc->thread_info, dbc->txn, 0, &page)) != 0)
633 return (ret);
634
635 while ((pgno = NEXT_PGNO(page)) != PGNO_INVALID) {
636 if ((ret = __memp_fput(dbp->mpf,
637 dbc->thread_info, page, dbc->priority)) != 0)
638 return (ret);
639 if ((ret = __memp_fget(dbp->mpf, &pgno,
640 dbc->thread_info, dbc->txn, 0, &page)) != 0)
641 return (ret);
642 if (pgno <= c_data->compact_truncate)
643 continue;
644 if (!have_lock) {
645 DB_ASSERT(dbp->env, ppg != NULL);
646 ppgno = PGNO(*ppg);
647 if ((ret = __memp_fput(dbp->mpf, dbc->thread_info,
648 *ppg, dbc->priority)) != 0)
649 goto err;
650 *ppg = NULL;
651 if ((ret = __db_lget(dbc, 0, ppgno,
652 DB_LOCK_WRITE, 0, &lock)) != 0)
653 goto err;
654 if ((ret = __memp_fget(dbp->mpf, &ppgno,
655 dbc->thread_info,
656 dbc->txn, DB_MPOOL_DIRTY, ppg)) != 0)
657 goto err;
658 have_lock = 1;
659 }
660 if ((ret = __db_exchange_page(dbc,
661 &page, NULL, PGNO_INVALID, DB_EXCH_FREE, pgs_donep)) != 0)
662 break;
663 }
664
665 err: if (page != NULL &&
666 (t_ret = __memp_fput(dbp->mpf,
667 dbc->thread_info, page, dbc->priority)) != 0 && ret == 0)
668 ret = t_ret;
669 if ((t_ret = __TLPUT(dbc, lock)) != 0 && ret == 0)
670 ret = t_ret;
671 return (ret);
672 }
673
674 /*
675 * __db_truncate_root -- swap a root page for a lower numbered page.
676 * PUBLIC: int __db_truncate_root __P((DBC *,
677 * PUBLIC: PAGE *, u_int32_t, db_pgno_t *, u_int32_t, int *));
678 */
679 int
__db_truncate_root(dbc,ppg,indx,pgnop,tlen,pgs_donep)680 __db_truncate_root(dbc, ppg, indx, pgnop, tlen, pgs_donep)
681 DBC *dbc;
682 PAGE *ppg;
683 u_int32_t indx;
684 db_pgno_t *pgnop;
685 u_int32_t tlen;
686 int *pgs_donep;
687 {
688 DB *dbp;
689 DBT orig;
690 PAGE *page;
691 int ret, t_ret;
692 db_pgno_t newpgno;
693
694 dbp = dbc->dbp;
695
696 DB_ASSERT(dbc->dbp->env, IS_DIRTY(ppg));
697 if ((ret = __memp_fget(dbp->mpf, pgnop,
698 dbc->thread_info, dbc->txn, 0, &page)) != 0)
699 goto err;
700
701 /*
702 * If this is a multiply reference overflow key, then we will just
703 * copy it and decrement the reference count. This is part of a
704 * fix to get rid of multiple references.
705 */
706 if (TYPE(page) == P_OVERFLOW && OV_REF(page) > 1) {
707 COMPQUIET(newpgno, 0);
708 if ((ret = __db_ovref(dbc, *pgnop)) != 0)
709 goto err;
710 memset(&orig, 0, sizeof(orig));
711 if ((ret = __db_goff(dbc, &orig, tlen, *pgnop,
712 &orig.data, &orig.size)) == 0)
713 ret = __db_poff(dbc, &orig, &newpgno);
714 if (orig.data != NULL)
715 __os_free(dbp->env, orig.data);
716 if (ret != 0)
717 goto err;
718 } else {
719 LOCK_CHECK_OFF(dbc->thread_info);
720 ret = __db_exchange_page(dbc,
721 &page, NULL, PGNO_INVALID, DB_EXCH_FREE, pgs_donep);
722 LOCK_CHECK_ON(dbc->thread_info);
723 if (ret != 0)
724 goto err;
725 newpgno = PGNO(page);
726 /* If we could not allocate from the free list, give up.*/
727 if (newpgno == *pgnop)
728 goto err;
729 }
730
731 /* Update the reference. */
732 if (DBC_LOGGING(dbc)) {
733 if ((ret = __db_pgno_log(dbp, dbc->txn, &LSN(ppg), 0, PGNO(ppg),
734 &LSN(ppg), (u_int32_t)indx, *pgnop, newpgno)) != 0)
735 goto err;
736 } else
737 LSN_NOT_LOGGED(LSN(ppg));
738
739 *pgnop = newpgno;
740
741 err: if (page != NULL && (t_ret =
742 __memp_fput(dbp->mpf, dbc->thread_info,
743 page, dbc->priority)) != 0 && ret == 0)
744 ret = t_ret;
745 return (ret);
746 }
747
748 #ifdef HAVE_FTRUNCATE
749 /*
750 * __db_find_free --
751 * Find a contiguous "size" range of free pages that are lower numbers
752 * than the pages starting at "bstart". We can also return a set of pages
753 * that overlaps with the pages at "bstart".
754 * PUBLIC: int __db_find_free __P((DBC *, u_int32_t,
755 * PUBLIC: u_int32_t, db_pgno_t, db_pgno_t *));
756 */
757 int
__db_find_free(dbc,type,size,bstart,freep)758 __db_find_free(dbc, type, size, bstart, freep)
759 DBC *dbc;
760 u_int32_t type;
761 u_int32_t size;
762 db_pgno_t bstart, *freep;
763 {
764 DB *dbp;
765 DBMETA *meta;
766 DBT listdbt;
767 DB_LOCK metalock;
768 DB_LSN lsn;
769 DB_MPOOLFILE *mpf;
770 PAGE *page, *freepg;
771 u_int32_t i, j, start, nelems;
772 db_pgno_t *list, next_free, pgno;
773 db_pglist_t *lp, *pglist;
774 int hash, ret, t_ret;
775
776 dbp = dbc->dbp;
777 mpf = dbp->mpf;
778 nelems = 0;
779 hash = 0;
780 page = NULL;
781 pglist = NULL;
782 meta = NULL;
783 LOCK_INIT(metalock);
784
785 #ifdef HAVE_HASH
786 if (dbp->type == DB_HASH) {
787 if ((ret = __ham_return_meta(dbc, DB_MPOOL_DIRTY, &meta)) != 0)
788 return (ret);
789 if (meta != NULL)
790 hash = 1;
791 }
792 #endif
793 if (meta == NULL) {
794 pgno = PGNO_BASE_MD;
795 if ((ret = __db_lget(dbc,
796 LCK_ALWAYS, pgno, DB_LOCK_WRITE, 0, &metalock)) != 0)
797 goto err;
798 if ((ret = __memp_fget(mpf, &pgno, dbc->thread_info, dbc->txn,
799 DB_MPOOL_DIRTY, &meta)) != 0)
800 goto err;
801 }
802
803 if ((ret = __memp_get_freelist(mpf, &nelems, &list)) != 0)
804 goto err;
805
806 if (nelems == 0) {
807 ret = DBC_ERR(dbc, DB_NOTFOUND);
808 goto err;
809 }
810
811 for (i = 0; i < nelems; i++) {
812 if (list[i] > bstart) {
813 ret = DBC_ERR(dbc, DB_NOTFOUND);
814 goto err;
815 }
816 start = i;
817 if (size == 1)
818 goto found;
819 while (i < nelems - 1 && list[i] + 1 == list[i + 1]) {
820 i++;
821 if (i - start == size - 1)
822 goto found;
823 }
824 if (i - start == size - 1)
825 goto found;
826 /*
827 * If the last set of contiguous free pages we found
828 * are contiguous to the chunk we are trying to move,
829 * then we can slide the allocated chunk back some number
830 * of pages -- figure out how many by calculating the
831 * number of pages before the allocated ones that we have
832 * found in the free list.
833 */
834 if (list[i] == bstart - 1) {
835 size = (i - start) + 1;
836 goto found;
837 }
838 }
839 ret = DBC_ERR(dbc, DB_NOTFOUND);
840 goto err;
841
842 found: /* We have size range of pages. Remove them. */
843 next_free = i == nelems - 1 ? PGNO_INVALID : list[i + 1];
844 *freep = list[start];
845 if (start == 0) {
846 page = (PAGE *)meta;
847 } else if ((ret = __memp_fget(mpf, &list[start - 1],
848 dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &page)) != 0)
849 return (ret);
850 if (DBC_LOGGING(dbc)) {
851 if ((ret = __os_malloc(dbp->env,
852 size * sizeof(db_pglist_t), &pglist)) != 0)
853 goto err;
854 lp = pglist;
855 for (j = start; j < start + size; j++, lp++) {
856 if ((ret = __memp_fget(mpf, &list[j],
857 dbc->thread_info, dbc->txn, 0, &freepg)) != 0)
858 goto err;
859 lp->pgno = PGNO(freepg);
860 lp->next_pgno = NEXT_PGNO(freepg);
861 lp->lsn = LSN(freepg);
862 if ((ret = __memp_fput(mpf,
863 dbc->thread_info, freepg, dbc->priority)) != 0)
864 goto err;
865 }
866 listdbt.size = size * sizeof(*pglist);
867 listdbt.data = pglist;
868 if ((ret = __db_realloc_log(dbp, dbc->txn, &lsn, 0,
869 PGNO(page), &LSN(page), next_free, type, &listdbt)) != 0)
870 goto err;
871 __os_free(dbp->env, pglist);
872 pglist = NULL;
873 } else
874 LSN_NOT_LOGGED(lsn);
875
876 LSN(page) = lsn;
877 if (start == 0)
878 meta->free = next_free;
879 else
880 NEXT_PGNO(page) = next_free;
881
882 if (page != (PAGE *)meta && (ret = __memp_fput(mpf,
883 dbc->thread_info, page, dbc->priority)) != 0)
884 goto err;
885
886 for (j = start; j < start + size; j++) {
887 if ((ret = __memp_fget(mpf,
888 &list[j], dbc->thread_info,
889 dbc->txn, DB_MPOOL_DIRTY, &freepg)) != 0)
890 goto err;
891 P_INIT(freepg, dbp->pgsize,
892 list[j], PGNO_INVALID, PGNO_INVALID, 0, type);
893 LSN(freepg) = lsn;
894 if ((ret = __memp_fput(mpf,
895 dbc->thread_info, freepg, dbc->priority)) != 0)
896 goto err;
897 }
898
899 if (++i != nelems)
900 memmove(&list[start], &list[i], (nelems - i) * sizeof(*list));
901 if ((ret = __memp_extend_freelist(mpf, nelems - size, &list)) != 0)
902 goto err;
903 if (hash == 0)
904 ret = __memp_fput(mpf, dbc->thread_info, meta, dbc->priority);
905 t_ret = __TLPUT(dbc, metalock);
906
907 return (ret == 0 ? t_ret : ret);
908
909 err: if (page != NULL && page != (PAGE *)meta)
910 (void)__memp_fput(mpf, dbc->thread_info, page, dbc->priority);
911 if (pglist != NULL)
912 __os_free(dbp->env, pglist);
913 if (meta != NULL && hash == 0)
914 (void)__memp_fput(mpf, dbc->thread_info, meta, dbc->priority);
915 (void)__TLPUT(dbc, metalock);
916 return (ret);
917 }
918 #endif
919
920 /*
921 * __db_relink --
922 * Relink around a deleted page.
923 *
924 * PUBLIC: int __db_relink __P((DBC *, PAGE *, PAGE *, db_pgno_t));
925 * Otherp can be either the previous or the next page to use if
926 * the caller already holds that page.
927 */
928 int
__db_relink(dbc,pagep,otherp,new_pgno)929 __db_relink(dbc, pagep, otherp, new_pgno)
930 DBC *dbc;
931 PAGE *pagep, *otherp;
932 db_pgno_t new_pgno;
933 {
934 DB *dbp;
935 DB_LOCK npl, ppl;
936 DB_LSN *nlsnp, *plsnp, ret_lsn;
937 DB_MPOOLFILE *mpf;
938 PAGE *np, *pp;
939 int ret, t_ret;
940
941 dbp = dbc->dbp;
942 np = pp = NULL;
943 LOCK_INIT(npl);
944 LOCK_INIT(ppl);
945 nlsnp = plsnp = NULL;
946 mpf = dbp->mpf;
947 ret = 0;
948
949 /*
950 * Retrieve the one/two pages. The caller must have them locked
951 * because the parent is latched. For a remove, we may need
952 * two pages (the before and after). For an add, we only need one
953 * because, the split took care of the prev.
954 */
955 if (pagep->next_pgno != PGNO_INVALID) {
956 if (((np = otherp) == NULL ||
957 PGNO(otherp) != pagep->next_pgno) &&
958 (ret = __memp_fget(mpf, &pagep->next_pgno,
959 dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &np)) != 0) {
960 ret = __db_pgerr(dbp, pagep->next_pgno, ret);
961 goto err;
962 }
963 nlsnp = &np->lsn;
964 }
965 if (pagep->prev_pgno != PGNO_INVALID) {
966 if (((pp = otherp) == NULL ||
967 PGNO(otherp) != pagep->prev_pgno) &&
968 (ret = __memp_fget(mpf, &pagep->prev_pgno,
969 dbc->thread_info, dbc->txn, DB_MPOOL_DIRTY, &pp)) != 0) {
970 ret = __db_pgerr(dbp, pagep->prev_pgno, ret);
971 goto err;
972 }
973 plsnp = &pp->lsn;
974 }
975
976 /* Log the change. */
977 if (DBC_LOGGING(dbc)) {
978 if ((ret = __db_relink_log(dbp, dbc->txn, &ret_lsn, 0,
979 pagep->pgno, new_pgno, pagep->prev_pgno, plsnp,
980 pagep->next_pgno, nlsnp)) != 0)
981 goto err;
982 } else
983 LSN_NOT_LOGGED(ret_lsn);
984 if (np != NULL)
985 np->lsn = ret_lsn;
986 if (pp != NULL)
987 pp->lsn = ret_lsn;
988
989 /*
990 * Modify and release the two pages.
991 */
992 if (np != NULL) {
993 if (new_pgno == PGNO_INVALID)
994 np->prev_pgno = pagep->prev_pgno;
995 else
996 np->prev_pgno = new_pgno;
997 if (np != otherp)
998 ret = __memp_fput(mpf,
999 dbc->thread_info, np, dbc->priority);
1000 if ((t_ret = __TLPUT(dbc, npl)) != 0 && ret == 0)
1001 ret = t_ret;
1002 if (ret != 0)
1003 goto err;
1004 }
1005
1006 if (pp != NULL) {
1007 if (new_pgno == PGNO_INVALID)
1008 pp->next_pgno = pagep->next_pgno;
1009 else
1010 pp->next_pgno = new_pgno;
1011 if (pp != otherp)
1012 ret = __memp_fput(mpf,
1013 dbc->thread_info, pp, dbc->priority);
1014 if ((t_ret = __TLPUT(dbc, ppl)) != 0 && ret == 0)
1015 ret = t_ret;
1016 if (ret != 0)
1017 goto err;
1018 }
1019 return (0);
1020
1021 err: if (np != NULL && np != otherp)
1022 (void)__memp_fput(mpf, dbc->thread_info, np, dbc->priority);
1023 if (pp != NULL && pp != otherp)
1024 (void)__memp_fput(mpf, dbc->thread_info, pp, dbc->priority);
1025 return (ret);
1026 }
1027
1028 /*
1029 * __db_move_metadata -- move a meta data page to a lower page number.
1030 * The meta data page must be exclusively latched on entry.
1031 *
1032 * PUBLIC: int __db_move_metadata
1033 * PUBLIC: __P((DBC *, DBMETA **, DB_COMPACT *, int *));
1034 */
1035 int
__db_move_metadata(dbc,metap,c_data,pgs_donep)1036 __db_move_metadata(dbc, metap, c_data, pgs_donep)
1037 DBC *dbc;
1038 DBMETA **metap;
1039 DB_COMPACT *c_data;
1040 int *pgs_donep;
1041 {
1042 BTREE *bt;
1043 DB *dbp, *mdbp;
1044 DB_LOCK handle_lock;
1045 HASH *ht;
1046 int ret, t_ret;
1047
1048 dbp = dbc->dbp;
1049
1050 c_data->compact_pages_examine++;
1051 if ((ret = __db_exchange_page(dbc,
1052 (PAGE **)metap, NULL, PGNO_INVALID, DB_EXCH_FREE, pgs_donep)) != 0)
1053 return (ret);
1054
1055 if (PGNO(*metap) == dbp->meta_pgno)
1056 return (0);
1057
1058 if ((ret = __db_master_open(dbp,
1059 dbc->thread_info, dbc->txn, dbp->fname, 0, 0, &mdbp)) != 0)
1060 return (ret);
1061
1062 dbp->meta_pgno = PGNO(*metap);
1063
1064 if ((ret = __db_master_update(mdbp, dbp, dbc->thread_info,
1065 dbc->txn, dbp->dname, dbp->type, MU_MOVE, NULL, 0)) != 0)
1066 goto err;
1067
1068 /*
1069 * The handle lock for subdb's depends on the metadata page number:
1070 * swap the old one for the new one.
1071 */
1072 if (STD_LOCKING(dbc)) {
1073 /*
1074 * If this dbp is still in an opening transaction we need to
1075 * change its lock in the event.
1076 */
1077 if (dbp->cur_txn != NULL)
1078 __txn_remlock(dbp->env,
1079 dbp->cur_txn, &dbp->handle_lock, dbp->locker);
1080
1081 handle_lock = dbp->handle_lock;
1082 if ((ret = __fop_lock_handle(dbp->env, dbp,
1083 dbp->cur_locker != NULL ? dbp->cur_locker : dbp->locker,
1084 dbp->cur_txn != NULL ? DB_LOCK_WRITE : DB_LOCK_READ,
1085 NULL, 0)) != 0)
1086 goto err;
1087
1088 /* Move all the other handles to the new lock. */
1089 if ((ret = __lock_change(dbp->env,
1090 &handle_lock, &dbp->handle_lock)) != 0)
1091 goto err;
1092
1093 /* Reregister the event. */
1094 if (dbp->cur_txn != NULL)
1095 ret = __txn_lockevent(dbp->env,
1096 dbp->cur_txn, dbp, &dbp->handle_lock, dbp->locker);
1097 }
1098 if (dbp->log_filename != NULL)
1099 dbp->log_filename->meta_pgno = dbp->meta_pgno;
1100 if (dbp->type == DB_HASH) {
1101 ht = dbp->h_internal;
1102 ht->meta_pgno = dbp->meta_pgno;
1103 ht->revision = ++dbp->mpf->mfp->revision;
1104 } else {
1105 bt = dbp->bt_internal;
1106 bt->bt_meta = dbp->meta_pgno;
1107 bt->revision = ++dbp->mpf->mfp->revision;
1108 }
1109
1110 err: if ((t_ret = __db_close(mdbp, dbc->txn, DB_NOSYNC)) != 0 && ret == 0)
1111 ret = t_ret;
1112 return (ret);
1113 }
1114